INCLUDE(FindPkgConfig)
-INSTALL(DIRECTORY nltk DESTINATION ${LIBDIR}/python3.7/site-packages)
-INSTALL(DIRECTORY langdetect DESTINATION ${LIBDIR}/python3.7/site-packages)
+INSTALL(DIRECTORY nltk DESTINATION ${LIBDIR}/python2.7/site-packages)
+INSTALL(DIRECTORY langdetect DESTINATION ${LIBDIR}/python2.7/site-packages)
INSTALL(DIRECTORY nltk_data DESTINATION /usr/local/lib/)
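Note on the hunk above: the destination pins the interpreter version (`python2.7`), so the install path silently goes stale whenever the system Python changes. A minimal sketch of an alternative, not part of this change: have the build ask the target interpreter for its own site-packages directory instead of hardcoding it. `sysconfig` is standard library; the CMake wiring named in the comment is hypothetical.

```python
# Minimal sketch, not part of this change: print the active interpreter's
# site-packages so CMake could capture it at configure time, e.g. via
#   execute_process(COMMAND ${PYTHON} -c "..." OUTPUT_VARIABLE SITE_PACKAGES)
# (the CMake variable names above are assumptions).
import sysconfig

print(sysconfig.get_path("purelib"))  # e.g. /usr/lib/python2.7/site-packages
```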
+++ /dev/null
-{"freq":{"D":9246,"E":2445,"F":2510,"G":3299,"A":6930,"B":3706,"C":2451,"L":2519,"M":3951,"N":3334,"O":2514,"H":3034,"I":2837,"J":2196,"K":3663,"U":687,"T":2336,"W":2258,"V":2714,"Q":182,"P":3097,"S":8234,"R":3039,"Y":252,"X":214,"Z":422,"f":13583,"g":42805,"d":77385,"Feb":207,"e":240974,"b":21626,"c":4896,"a":128566,"n":127153,"o":86673,"l":57433,"m":31352,"j":4048,"k":45378,"h":17527,"i":140621,"w":24930,"v":32618,"u":35166,"t":82606,"s":102389,"r":98861,"q":199,"p":23331,"z":1187,"y":11757,"x":1123,"ï":264,"ë":2903,"ê":1053,"é":765,"á":212,"ü":233,"ö":184,"ó":216,"Eur":318,"Eng":637," l":3565," m":7731," n":16000," o":12065," h":7358," i":23795," j":1325," k":6363," d":33601," e":13358," f":1200," g":11018,"р":242,"с":306," a":8747,"т":161," b":8379," c":434," u":1931," t":8537," w":13128," v":24617," p":4859," s":15482," r":3617," J":2155," K":3559," H":2961," I":2185," N":3120," O":2318," L":2396," M":3803," B":3554," C":2109," A":6365," F":2371," G":3138," D":8986," E":2271,"л":219,"к":266," Z":368," Y":241,"и":371,"о":333,"н":199," S":7708,"Ger":200," R":2881,"в":199," Q":162," P":2912,"а":481," W":2205," V":2322," U":571,"е":266," T":2130,"Fra":1006,"A ":345,"Da":804,"Co":478,"Ch":621,"Du":1025,"Do":201,"De":763,"Di":5828,"Fe":367,"Eu":354,"En":721,"El":212,"Ge":659,"Ga":319,"I ":452,"Fr":1217,"Fo":165,"Fi":216,"II ":246,"C ":278,"Au":486,"Ar":425,"At":187,"As":201,"D ":158,"Ba":648,"Af":2087,"Am":566,"An":491,"Ap":353,"Al":628,"Bu":243,"Br":778,"Ca":399,"Bi":180,"Be":880,"Bo":481,"Bl":161,"Kr":224,"Ko":657,"Le":490,"Li":504,"La":658,"Lu":245,"Lo":347,"Me":800,"Mi":548,"Ma":1360,"Mu":186,"Mo":627,"Ni":257,"Ne":763,"Na":666,"No":1092,"Ok":339,"Ol":206,"Her":157,"Gr":1326,"Go":356,"Ha":534,"He":680,"II":369,"Hi":301,"Ho":503,"Hu":294,"Hy":550,"In":919,"Is":158,"It":218,"Ja":713,"Je":157,"Jo":565,"Ju":623,"Ka":1489,"Ki":194,"Ke":447,"Un":253,"Tu":248,"Tr":236,"To":272,"Th":313,"Te":262,"Ta":276,"V ":280,"Sw":402,"Sy":292,"St":964,"Su":1701,"Wo":181,"Wi":534,"Wa":412,"We":720,"Vo":315,"Vr":251,"Vi":374,"Va":314,"Ve":689,"Pr":551,"S ":157,"Pe":310,"Pa":727,"Po":681,"Pi":230,"Os":236,"Oo":423,"Or":191,"Se":814,"Sc":197,"Si":387,"Sl":222,"Sk":201,"Sp":443,"So":680,"Ru":645,"Ry":194,"Sa":728,"Re":621,"Ri":222,"Ro":746,"SA":233,"Ra":223,"Gre":501,"Gri":383,"Gra":158,"b ":1179,"Gro":254,"a ":7054,"i ":2513,"gd":570,"ge":16432,"ga":1621,"gb":319,"fk":224,"fl":183,"fg":323,"ff":351,"fi":1111,"fh":169,"fs":1224,"fr":2334,"fu":174,"ft":300,"fo":725,"Int":180,"he":6229,"ha":2610,"gn":360,"gl":334,"gi":2135,"gh":921,"gg":418,"gu":592,"gt":1512,"gs":1974,"gr":3459,"go":1385,"dt":211,"du":998,"dw":506,"g ":10256,"ea":936,"eb":3497,"ec":406,"ed":5721,"de":18394,"dd":606,"dg":161,"di":29432,"dh":249,"dj":173,"dm":299,"do":2521,"ds":2062,"dr":1453,"ew":3034,"eu":3603,"ev":2016,"ey":309,"fa":570,"h ":864,"Ind":251,"fd":469,"fe":948,"eh":993,"eg":3187,"ef":995,"ee":12296,"el":15653,"ek":7920,"ei":5726,"ep":2393,"eo":692,"en":27638,"em":4686,"et":10282,"es":15156,"er":33393,"ca":479,"e ":78745,"by":1025,"br":1953,"bu":1057,"bo":2123,"bl":1117,"bi":1966,"bb":156,"be":8513,"db":222,"In ":319,"da":3617,"f ":4067,"ct":207,"co":446,"ck":502,"ci":340,"ch":1526,"ce":547,"c ":311,"az":190,"ay":279,"ba":2057,"d 
":15502,"at":11369,"as":9342,"ar":11432,"aw":597,"av":407,"au":883,"ak":2797,"al":9554,"ai":1291,"aj":155,"ap":2087,"am":3989,"an":36357,"ac":615,"ad":4564,"aa":18307,"ab":1064,"ag":2729,"ah":292,"ae":907,"af":1901,"nu":917,"nt":6760,"ns":9243,"nr":212,"no":2885,"nn":1621,"ny":191,"nw":666,"nv":455,"oe":6026,"of":3797,"oc":387,"od":1636,"oa":178,"ob":729,"om":5480,"on":10533,"ok":2525,"ol":5346,"oi":587,"og":2271,"oh":382,"ot":3827,"os":3306,"ov":1152,"ou":2993,"op":4558,"oo":12667,"or":14221,"r ":19504,"ow":1144,"pe":3683,"pg":229,"pa":2371,"pl":1195,"lê":351,"po":1932,"ph":223,"pi":1008,"lo":3369,"lm":315,"ll":2990,"ls":2634,"lp":392,"lw":311,"lv":239,"lu":1548,"lt":993,"ly":716,"o ":2083,"md":261,"ma":3853,"mb":2182,"mg":224,"me":9151,"mi":2940,"mm":802,"mp":1223,"mo":1485,"ië":1437,"mt":249,"ms":966,"mu":1085,"p ":4720,"na":6444,"nb":510,"nc":507,"nd":12581,"ne":5737,"nf":203,"ng":9804,"nh":460,"ni":6127,"nj":300,"nk":2057,"nl":616,"nm":203,"jo":532,"ki":2683,"kh":210,"kg":239,"ke":8584,"ka":6722,"m ":5913,"kw":457,"ky":282,"ks":2318,"kt":2084,"ku":1443,"ko":3908,"kr":2375,"kk":1579,"kl":2200,"km":469,"li":9515,"lh":279,"lk":1158,"lj":705,"le":10290,"ld":1944,"lg":1526,"lf":717,"la":8341,"lb":446,"n ":58065,"hr":313,"ht":702,"hu":1684,"hi":1067,"ho":3048,"dé":160,"id":5034,"ic":1058,"ib":451,"ia":2568,"ig":5540,"if":581,"ie":47836,"hy":348,"k ":9212,"ir":2359,"is":17403,"it":9361,"iu":405,"iv":1008,"iw":219,"ik":8953,"il":3774,"im":1386,"in":25004,"io":1984,"eë":1032,"ip":899,"je":609,"ji":572,"iz":156,"l ":8172,"ja":1960,"wy":994,"z ":242,"wi":1800,"wo":4179,"vy":166,"y ":4684,"wa":9856,"we":6959,"vl":1196,"vi":4040,"vu":178,"vr":662,"vo":4078,"uw":282,"uu":992,"ve":5906,"va":16173,"x ":845,"ui":7822,"uk":678,"ul":2052,"ue":905,"ug":1045,"ur":5410,"us":5098,"ut":907,"um":1711,"un":2596,"up":170,"ty":1434,"tu":2643,"tt":1277,"tw":1177,"tv":217,"ub":1182,"ua":728,"ud":950,"uc":160,"w ":232,"to":5433,"tm":201,"tl":667,"ts":3814,"tr":4026,"tg":532,"te":20430,"tk":279,"tj":177,"ti":5658,"th":1701,"tb":213,"ta":9118,"su":1177,"sv":424,"ss":2799,"st":17122,"sy":1309,"sw":531,"sl":1811,"sk":5006,"sn":242,"sm":693,"sp":2566,"oë":412,"so":3731,"sr":312,"sd":385,"sc":448,"sf":208,"se":15556,"sh":473,"sg":396,"sj":338,"si":8436,"u ":1834,"sa":2367,"sb":577,"rr":652,"rs":6262,"rt":4139,"ru":2543,"rv":1198,"rw":1199,"ry":2450,"rp":1265,"ro":8165,"rn":1586,"rm":2087,"rl":1734,"rk":2996,"ri":11752,"rh":614,"rg":2653,"rf":378,"re":10923,"rd":7372,"rc":234,"rb":955,"ra":7710,"t ":22731,"qu":168,"s ":35284,"px":614,"Hy ":529,"py":231,"pt":765,"pu":844,"pp":1058,"pr":3258,"ps":659,"wê":320,"zi":170,"ze":169,"za":209,"yg":162,"ye":406,"yf":643,"yd":927,"yw":439,"ys":1141,"yn":1041,"yl":288,"yk":1145,"Apr":247,"Aug":272,"Afr":2048,"Ame":464,"Ber":218,"Bel":171,"Bre":163,"Bra":191,"Bri":282,"Des":273,"Daa":460,"Chr":224,"Cha":171,"ër":307,"ël":325,"êr":697,"ë ":1979,"ê ":310,"é ":228,"Dit":1028,"Die":4537,"Dui":918,"Ned":417,"Nas":187,"Nov":238,"Noo":595,"Okt":256,"Oli":158,"Oos":361,"Par":313,"Pro":177,"Pre":186,"SA ":161,"Ita":207,"Jan":348,"Joh":290,"Jul":297,"Jun":245,"Kaa":543,"Kan":220,"Kat":191,"Kar":171,"Ker":270,"Kon":276,"Lat":181,"Lit":162,"Mei":281,"Mar":370,"Maa":286,"Mon":210,"Mid":157,"Wil":165,"Wes":439,"Vry":192,"Vol":161,"êre":674,"Swe":193,"Sy 
":252,"Sui":1515,"Sta":443,"Ste":208,"Sep":228,"Spa":253,"Rus":560,"Sch":162,"Rep":214,"Rom":176,"Ver":555,"Uni":236,"The":196,"Tur":159,"bin":400,"blo":205,"bli":525,"bla":215,"boe":246,"boo":276,"bor":587,"bou":330,"ban":283,"bal":289,"bai":191,"baa":372,"bas":270,"bar":272,"beh":366,"beg":372,"bee":325,"bed":285,"ber":1916,"bel":540,"bek":1148,"bew":349,"bev":630,"bes":1308,"bet":510,"bie":1052,"ce ":276,"bri":159,"bro":237,"bra":211,"bre":258,"bru":1062,"bur":584,"by ":693,"am ":1182,"ake":292,"al ":2759,"ain":204,"ak ":856,"aie":241,"agt":446,"anu":467,"ann":632,"ant":1705,"ans":3841,"ane":404,"ang":1856,"ani":742,"anj":191,"ank":961,"ap ":635,"ana":788,"anc":195,"and":5528,"amm":186,"amp":480,"ami":512,"ame":657,"amb":236,"ama":204,"alt":231,"als":160,"all":667,"alk":171,"alg":320,"ali":1276,"ald":217,"ale":2352,"alf":209,"ala":367,"an ":18298,"aks":261,"akt":740,"akl":166,"abe":229,"abi":201,"aby":216,"ae ":624,"aag":175,"aad":172,"aak":679,"aai":350,"aan":6190,"aal":1515,"aam":1083,"aas":579,"aar":5293,"aap":567,"aat":1563,"ad ":2565,"afg":266,"ai ":311,"age":184,"afd":268,"adm":206,"adi":436,"ade":539,"ag ":1304,"ads":176,"ach":166,"ada":249,"af ":494,"at ":6755,"arg":256,"are":965,"ard":1124,"ara":390,"aro":332,"arn":185,"arm":157,"arl":301,"ark":397,"ari":1177,"arv":249,"ars":463,"art":1494,"ary":171,"asi":1669,"ase":210,"aso":169,"ar ":3216,"apa":189,"app":418,"aps":269,"as ":5230,"awe":308,"awi":169,"ata":346,"ast":673,"ass":518,"ato":426,"ate":1382,"ati":871,"ats":404,"atu":409,"aty":167,"aus":156,"jaa":1087,"jar":470,"je ":175,"joe":306,"jin":161,"jie":306,"ito":170,"itt":191,"its":1623,"isk":182,"ism":266,"iss":374,"ist":1582,"ita":608,"ite":1331,"itg":386,"iti":469,"ius":176,"ium":203,"ivi":590,"ive":294,"is ":12546,"ion":1252,"eër":158,"ipa":265,"ir ":1648,"isi":1018,"ise":601,"isa":220,"ire":181,"it ":3772,"kil":644,"kie":536,"kin":914,"km ":266,"kgr":173,"kee":210,"kei":339,"kel":962,"ken":2090,"kep":166,"ker":1342,"ke ":3014,"kra":345,"kse":472,"kry":1085,"kri":662,"kou":249,"kor":369,"kop":214,"koo":391,"kon":866,"kom":903,"kol":246,"koe":157,"ks ":710,"kke":1272,"kki":178,"klu":430,"kle":511,"kla":387,"kli":749,"kat":157,"kar":183,"kas":204,"kap":818,"kan":1256,"kal":611,"kaa":1596,"ka ":1388," Ga":319," Ge":658," Fo":161," Fr":1217," Fi":213," Ha":534," He":680," Go":354," Gr":1318," Hy":549," Hu":294," Ho":502," II":202," Hi":301," Ja":710," Is":157," It":218," In":916,"han":779," Ka":1486,"hal":311," Ke":447,"haw":164," Ki":192,"har":356," Jo":563," Ju":622,"haa":238,"had":164," La":657," Le":488," Li":502," Ko":657," Kr":224," Ma":1348," Mi":547," Me":799,"he ":399," Lo":346," Lu":244," Ne":762," Na":662," Ni":257," Mo":624," Mu":186,"hel":273,"hei":994,"hee":465,"hed":169,"het":2911,"her":350,"hem":255," Ap":349," Am":563," An":491," Al":626," Af":2082," Ba":645," Au":486," At":187," As":200," Ar":422," Be":877,"hie":290," Bi":179," Bl":161," Bo":479," Br":777," Bu":243,"his":173," Ca":384," Ch":612," Co":473," Da":803," Di":5802," De":761," Do":196," Du":1024," El":212," En":720," Eu":354," Fe":367," Wo":179," Wi":530," We":720," Wa":412,"god":193,"gs ":887,"gor":522,"gro":2150,"gra":537,"gri":320,"gre":401," Os":236," Or":191," Oo":422," Po":674," Pi":229," Pe":309," Pa":725,"gst":406," No":1092," Ol":205," Ok":339,"gte":962,"gti":391," Ra":221," Ro":743," Re":620," Ri":222," Pr":547,"gus":284," Sy":292," Sw":400," Su":1700," St":953," Ta":273," Th":307," Te":261," Tr":236," To":270," Ry":194," Ru":645," Sa":724," Si":385," Sc":196," Se":811," So":678," 
Sp":441," Sk":201," Sl":222," Va":313," Ve":669," Vi":371," Vo":314," Vr":251," Tu":243," Un":253," ja":1102,"ial":357,"ian":256," in":12303,"iaa":736," is":11238," ka":1533," ki":531," ke":481,"id ":2425," ha":612," he":3438," gr":2075," go":365,"ia ":794," hy":292," hi":477," ho":1750," hu":727,"iet":320,"ieu":180,"iew":413," ni":722,"iel":277," ne":437,"ien":998," na":2339,"ier":2228,"ies":4471,"ied":1248,"ief":177,"iek":2103," mu":691,"ig ":1346," mo":667," om":1497," on":2106," of":1952,"ifi":218," no":1205," le":910," li":598," n ":10980," la":1290," ku":387,"ich":258,"ie ":34696," km":407," kl":879,"ica":209," kr":319," ko":1672," me":4100," mi":830,"ids":257," ma":1329," lu":186,"idi":291,"ide":993,"idd":457,"ida":156," lo":197," af":820," aa":2320," ad":269," am":322," an":759," ak":286," al":829," ar":263," at":229," as":2284," ba":599,"il ":459," bi":320," be":5430," bo":565," bl":263," by":612," bu":213," br":340,"ika":2950,"igd":381,"ige":1604,"igh":698,"igi":270,"igg":185,"igt":498,"igs":156,"ik ":2305," en":9738,"imp":231," ei":517," el":502,"ime":187," ek":223," ee":1730,"ind":1030,"ina":506," fa":191,"inn":302," fo":227,"int":638,"ins":1349,"ine":545,"ing":6095," fi":368,"ini":615,"ink":417," ge":8191," ga":169,"inw":455,"ikk":629," ch":185,"ike":1814,"ila":498," da":1923,"in ":12178,"iku":209,"iks":287," do":1111,"ilo":514,"ill":662," dr":523," de":3947,"ilj":228,"ili":684,"ild":294," di":25510,"imb":245,"eë ":693,"io ":196," du":309," wê":298,"hom":166,"hou":360,"hoo":1325,"hoe":410," wy":201,"hul":552,"hui":260,"hri":224,"ht ":578," ru":233," sa":888," se":2315," si":590," sl":329," sk":1250," sp":887," so":2211," ra":237," re":1576," ri":825," ro":614," pr":1589," s ":207," px":614,"hy ":302," ou":447,"hum":674," oo":2639," op":2809," or":325," pe":402," pa":556," pl":641," po":737," lê":242," wa":7840," we":1395," wo":2888," wi":454," va":14670," ve":4043," vo":2359," vr":575," vi":2068," vl":594," ty":439," tw":582," tu":692," ui":1746," ta":895," sw":227," sy":1183," st":4293," su":859," tr":387," to":1857," th":729," ti":190," te":2715,"ffe":165,"fer":157,"fel":155,"fha":158,"fge":290,"fam":176,"fde":429,"eta":359,"ete":1299,"eti":372,"esp":358,"eso":210,"est":2951,"ess":405,"eun":234,"eto":320,"etr":438,"ets":217,"ett":493,"eve":456,"eva":262,"evo":907,"evi":274,"eur":2292,"eus":242,"ewi":337,"ewe":1704,"ewo":449,"ey ":181,"ewa":222,"epe":254,"er ":10617,"epa":228,"eor":221,"es ":4626,"ept":277,"epu":400,"epr":184,"erk":2067,"erl":875,"eri":1765,"erg":1022,"erh":416,"ere":1861,"erf":286,"erd":1514,"era":1470,"erb":529,"et ":6083,"esk":1018,"esl":228,"esi":976,"ese":3607,"eu ":338,"erv":860,"erw":949,"err":349,"ert":1101,"ers":4583,"ern":1142,"erm":861,"erp":342,"ero":382,"ekg":155,"ekk":206,"eko":474,"eks":950,"ekt":701,"en ":13492,"ela":904,"eld":1199,"elf":322,"ele":2593,"eli":1906,"elj":427,"elg":226,"elk":209,"ell":778,"elo":234,"els":1983,"elt":333,"ely":255,"emb":839,"ema":484,"eme":1266,"emo":181,"emi":456,"ep ":699,"ene":1142,"enh":254,"eng":314,"enb":269,"ena":610,"end":3112,"eno":500,"enn":400,"enk":275,"eni":1151,"ens":2864,"ent":2318,"ego":497,"ege":690,"egi":516,"eha":370,"egr":238,"egs":217,"egt":193,"eho":266,"ehe":259,"ek ":1799,"eis":330,"eil":544,"ein":1010,"eie":633,"eid":1307,"el ":3516,"eit":680,"eke":2739,"eka":220,"em ":967,"gin":784,"gie":714,"ght":548,"gep":249,"gen":1564,"get":297,"ger":1248,"ges":2014,"gev":788,"gew":944,"gee":448,"ged":475,"geb":2499,"geh":356,"geg":181,"gem":756,"gel":1995,"gek":350,"gde":427,"ge 
":1916,"gaa":266,"gan":539,"ga ":157,"fst":852,"fri":2089,"for":371,"fie":369,"fil":208,"fin":174,"fis":177,"da ":327,"de ":6409,"daa":645,"dag":700,"dae":480,"dat":659,"dan":233,"dam":165,"dde":490,"ch ":316,"cha":160,"ck ":233,"che":490,"ed ":1090,"eba":159,"ebe":354,"ebi":752,"ebo":768,"ebr":1168,"ei ":821,"ega":168,"eek":631,"een":2520,"eel":2072,"eem":410,"eed":587,"ees":884,"eer":3295,"eeu":449,"eet":195,"edi":638,"ede":2561,"eda":161,"eg ":316,"eds":321,"edr":340,"ee ":892,"ef ":280,"dwe":310,"dus":171,"dor":875,"doo":416,"don":160,"dom":227,"ds ":353,"dmi":211,"doe":283,"dst":428,"dui":309,"dri":421,"dra":423,"dry":204,"dsk":181,"dse":527,"dia":294,"der":4829,"des":476,"deu":1676,"dee":1279,"del":1695,"dek":186,"den":1206,"do ":172,"din":875,"dio":177,"dis":425,"dit":656,"die":24964,"dig":1168,"dik":198,"rhe":301,"rga":496,"rgi":335,"rge":595,"ret":312,"res":944,"rg ":777,"rea":245,"ree":1091,"ref":257,"red":294,"rei":545,"reg":1039,"ren":1300,"rek":765,"rel":674,"rep":191,"rf ":180,"rdo":215,"rdi":841,"rde":1873,"re ":2607,"rd ":3667,"ras":532,"rat":587,"rbi":190,"rba":160,"rbe":287,"rag":291,"ran":2011,"ram":317,"ral":832,"rak":247,"raa":1046,"raf":284,"rad":331,"rs ":1922,"ros":273,"rot":330,"rom":305,"ron":1072,"roo":1778,"rop":575,"rou":212,"rov":708,"rod":199,"rol":315,"roe":1277,"rog":195,"rno":196,"rp ":728,"rna":508,"rne":469,"rmo":164,"rma":539,"rme":324,"rmi":175,"rlo":320,"rli":409,"rle":270,"rla":508,"rks":184,"rko":248,"rki":199,"rkl":203,"rke":440,"rka":271,"rm ":692,"rio":174,"rit":493,"ris":571,"riv":501,"rig":863,"ril":278,"rik":3384,"rin":1384,"ria":924,"ric":236,"rie":2029,"rk ":1040,"rwe":410,"rwy":498,"ryf":393,"rui":1143,"rug":256,"rum":244,"ruk":231,"rus":225,"rva":502,"rvl":353,"rvo":192,"rwa":171,"ry ":383,"rsk":872,"rsi":432,"rso":249,"rsp":591,"rsa":225,"rse":478,"rta":186,"rst":1083,"rtk":160,"rto":274,"rte":620,"rti":334,"rua":209,"rty":351,"rt ":1413,"rre":272,"saa":540,"sal":170,"sam":303,"san":408,"sas":204,"sa ":155,"ryw":338,"rys":282,"ryk":576,"sge":305,"sie":4039,"sid":185,"sia":299,"sit":436,"sis":296,"sip":279,"sin":541,"sio":799,"sil":194,"sim":173,"sik":231,"sif":160,"sig":289,"sbu":231,"se ":9840,"sch":268,"ser":501,"ses":400,"set":250,"seu":239,"sea":162,"see":618,"sed":264,"sen":1323,"sem":298,"sel":1093,"sek":186,"spo":405,"spr":756,"spe":934,"spa":260,"sow":508,"som":247,"son":545,"soo":954,"soe":195,"sok":377,"st ":267,"sli":202,"slu":297,"sky":183,"sla":1006,"sle":205,"ski":804,"sko":594,"skr":1152,"sku":244,"ska":1212,"ske":665,"sië":283,"sma":173,"sme":382,"sse":1275,"ssa":198,"ssi":922,"ste":6829,"sta":5065,"sto":805,"sti":1396,"stu":693,"str":1673,"sty":226,"sui":596,"sve":167,"sy ":1199,"swa":313,"tal":1301,"taa":2499,"tad":2323,"tau":165,"tat":456,"tas":164,"tan":1021,"te ":8469,"ta ":339,"pa ":202,"pe ":459,"par":608,"pas":176,"paa":333,"pal":324,"pan":428,"pge":207,"pen":295,"per":1379,"pes":438,"pee":201,"pel":568,"pla":660,"lê ":268,"pli":169,"ple":241,"pie":480,"por":394,"poo":160,"pos":197,"pol":518,"ppy":184,"ppe":636,"pst":229,"pub":435,"pte":575,"pra":251,"pri":484,"pre":726,"pro":1677,"pun":246,"px ":614,"py ":166,"ra ":424,"ngo":161,"ngr":289,"ngs":1292,"nge":2327,"nhe":276,"nel":314,"nen":189,"nem":225,"ner":1014,"net":468,"nes":533,"ng ":4906,"nee":762,"nce":206,"ne ":1530,"ndr":216,"nds":657,"ndo":326,"ndi":878,"nde":5081,"nda":453,"nal":790,"nam":291,"nad":316,"naf":372,"nab":229,"naa":1198,"nd ":4245,"nat":282,"nas":677,"na 
":1572,"nwo":542,"nus":209,"nua":266,"ntw":393,"nto":201,"nts":300,"ntr":543,"nti":571,"ntl":164,"nta":457,"nte":1815,"nst":787,"nse":3345,"nsi":1079,"nsl":207,"nsk":498,"nt ":1757,"ns ":2476,"nog":456,"noe":477,"noo":659,"nom":368,"nne":904,"nni":442,"nië":246,"nli":373,"nke":345,"nkl":391,"nks":179,"nkr":453,"nje":156,"nig":640,"nie":1831,"nk ":274,"niv":210,"nis":1512,"nin":804,"ogr":272,"ogi":423,"oi ":216,"oha":228,"oeë":178,"ok ":1432,"ol ":554,"ock":164,"oe ":303,"ode":551,"odi":176,"ods":177,"of ":2323,"oek":499,"oel":276,"oem":563,"oeg":231,"oei":336,"oer":752,"oes":295,"oet":302,"oen":602,"oep":714,"odu":188,"oed":477,"og ":895,"ofs":803,"oew":261,"od ":254,"obe":382,"oud":510,"oue":197,"ote":350,"ott":175,"ots":913,"oto":266,"ost":637,"osi":266,"ose":346,"oss":176,"oso":190,"owa":484,"owe":208,"ovi":678,"ove":370,"ous":302,"our":167,"out":306,"opo":205,"opp":449,"ope":438,"opg":213,"opa":195,"os ":1171,"oon":731,"ool":561,"oom":393,"ook":1376,"ooi":288,"oof":1146,"oog":389,"ood":288,"or ":1152,"oot":1351,"oos":958,"oor":4776,"oop":341,"ork":260,"orl":386,"orm":964,"orp":858,"ord":4583,"ore":773,"org":587,"ori":1212,"ou ":999,"ort":1219,"ors":871,"orw":195,"ot ":1528,"orb":186,"ora":235,"ola":171,"on ":1522,"oli":772,"oll":288,"olk":702,"ole":263,"olg":904,"ols":270,"olo":636,"om ":1870,"okk":553,"ona":980,"ond":1915,"one":1178,"ong":620,"oni":1012,"onl":220,"onk":232,"onn":184,"ono":391,"ons":511,"ont":1339,"oma":425,"ome":845,"omi":324,"omm":454,"omp":297,"oms":595,"op ":2264,"la ":334,"le ":3834,"lf ":175,"lde":601,"laa":982,"lad":180,"lag":434,"lak":490,"lan":4154,"lar":155,"lat":361,"las":433,"ld ":695,"kus":410,"kun":548,"kul":242,"kwe":204,"kwa":191,"kte":822,"kst":257,"ksi":463,"ktr":342,"ktu":210,"kti":247,"kto":369,"ls ":1008,"lon":293,"lom":430,"loo":382,"loe":423,"log":655,"los":274,"lië":349,"lti":157,"lub":411,"lug":221,"lst":643,"lte":252,"lse":623,"lge":754,"lew":250,"leu":193,"les":329,"let":347,"ler":415,"lem":358,"len":1056,"lek":605,"lei":1010,"leg":257,"lee":477,"led":218,"lg ":483,"lo ":169,"lla":325,"lle":1578,"lli":615,"lke":200,"lki":447,"ljo":223,"ll ":176,"lja":430,"lit":831,"lis":504,"leë":449,"lin":1208,"lim":201,"lid":165,"lia":364,"lik":2917,"lig":818,"lie":1618,"ma ":226,"mb ":655,"maa":1244,"mag":221,"mar":331,"mas":207,"mal":270,"man":726,"mat":394,"mba":172,"mbi":179,"mbe":814,"mbo":161,"me ":936,"mde":163,"med":223,"mee":1533,"met":2981,"mes":247,"mer":991,"mel":330,"men":1550,"lui":390,"lus":194,"lwe":213,"lyk":221,"lyn":187,"mpi":220,"mpe":208,"mpo":176,"mpt":267,"ms ":488,"moe":196,"mod":233,"mon":329,"mst":248,"mus":488,"mun":417,"ër ":180,"mge":191,"min":806,"mil":465,"mit":231,"mig":184,"mie":523,"mid":310,"ië ":1136,"mme":353,"wêr":319,"yst":183,"ys ":680,"ywe":370,"ye ":306,"yf ":380,"yde":281,"yds":165,"yd ":230,"yn ":461,"yns":175,"yk ":810,"wys":531,"wor":2620,"woo":760,"won":526,"we ":1260,"wes":799,"wer":1583,"wet":305,"wen":427,"wel":545,"weg":270,"wee":1257,"wis":166,"wit":342,"wie":194,"win":417,"wil":177,"wik":231,"wan":300,"wat":5174,"war":532,"was":2236,"waa":1031,"vry":194,"vro":313,"vir":1570,"vin":921,"vie":880,"vis":289,"vla":709,"vlo":280,"voe":444,"vol":1592,"voo":1083,"vor":625,"ver":4566,"ven":170,"vem":236,"vel":250,"vee":302,"val":319,"van":14723,"vat":155,"vaa":414,"uwe":229,"uur":863,"usl":180,"usi":606,"use":380,"ust":585,"uss":1129,"ute":176,"uto":171,"us ":1998,"ure":395,"urg":669,"uri":191,"urk":167,"uro":352,"urs":211,"urt":189,"ur ":2547,"umb":689,"ume":172,"unt":325,"uns":289,"uni":820,"und":530,"um 
":614,"ult":270,"ull":459,"uli":358,"un ":219,"uid":2285,"uik":850,"uim":162,"uis":508,"uk ":200,"uit":3378,"ul ":272,"ugb":161,"ugu":278,"ude":184,"udi":240,"ue ":322,"ug ":159,"ub ":406,"uar":522,"ubl":464,"ud ":181,"tyn":228,"ty ":384,"tur":232,"tus":988,"tuu":617,"tui":232,"tud":171,"tyd":628,"twi":269,"twe":751,"ts ":533,"tre":1022,"tra":1128,"tri":607,"tru":366,"tro":780,"tse":746,"tsk":298,"tsl":425,"tst":993,"tte":641,"tti":226,"to ":272,"tof":244,"toe":713,"tob":268,"tot":1108,"tom":182,"ton":586,"tol":317,"tor":808,"too":280,"til":187,"tik":334,"tie":1846,"tig":1053,"tis":241,"tin":826,"tio":267,"thu":695,"tkl":165,"tli":191,"tla":301,"tem":732,"ten":1059,"tei":844,"tek":528,"tel":2135,"tee":779,"teg":166,"ted":237,"th ":270,"teu":212,"tes":357,"ter":4231,"tge":442,"the":380},"n_words":[1541130,1808182,1328687],"name":"af"}
\ No newline at end of file
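The deleted files in this diff are langdetect language profiles: minified single-line JSON with a `freq` map of character n-gram counts, an `n_words` list of total counts per n-gram length, and a `name` language code (`af` above, `ar` next). A minimal sketch of how such a profile can be read and used to score text, assuming a local file path; this is an illustration, not langdetect's actual detector.

```python
import json
import math

# Minimal sketch, not langdetect's implementation: load one deleted
# profile (the path is an assumption) and score text by the summed
# log-frequency of its character n-grams; a higher score means a
# better match for that profile's language.
def load_profile(path):
    with open(path, encoding="utf-8") as f:
        profile = json.load(f)  # {"freq": {...}, "n_words": [...], "name": "af"}
    total = sum(profile["freq"].values())
    return profile["name"], {g: c / total for g, c in profile["freq"].items()}

def score(text, freq, n=2, floor=1e-9):
    grams = (text[i:i + n] for i in range(len(text) - n + 1))
    return sum(math.log(freq.get(g, floor)) for g in grams)
```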
+++ /dev/null
-{"freq":{"و":674395,"ى":83925,"ي":1050070,"ً":13534,"َ":5694,"ُ":4812,"ـ":6044,"ف":291773,"ق":234289,"ك":238048,"ل":1258387,"م":769173,"ن":600182,"ه":275861,"ّ":6098,"ِ":3286,"ْ":2044,"خ":81779,"د":374335,"ج":159763,"ح":207504,"ت":462068,"ث":62775,"ب":456733,"ة":436596,"ئ":43113,"ا":1630465,"ؤ":8533,"إ":87017,"آ":11409,"أ":206598,"ء":28935,"غ":62643,"ع":367154,"ظ":26879,"ط":114141,"ض":60872,"ص":101344,"ش":119185,"س":320648,"ز":83586,"ر":577132,"ذ":48426,"،":89553,"؛":1398," ،":20335," ن":38705," ه":63492," ل":88748," م":255388," ق":40529," ك":61242," ف":145932," ي":66494," و":214375," ص":12486," ش":28169," ط":11510," ض":8567," ر":24040," ذ":6579," س":49888," ز":6199," ع":110158," ظ":1667," غ":11703," إ":56701," ا":619492," آ":6461," أ":122164," ج":33981," ح":46703," خ":15646," د":26005," ب":145150," ت":96353," ث":8150,"ア":1366,"ا، ":7320,"ئة ":2129,"ؤسس":2077,"إن ":1277,"أيض":3023,"أور":2981,"أهم":1974,"أهل":1229,"أول":9618,"أنو":1568,"أنه":3955,"أمي":2439,"ألم":3407,"أما":1795,"ألف":1545,"أمر":6318,"أكب":2802,"أكت":1724,"أكث":2574,"أفر":1800,"أعل":1292,"أعم":1551,"أغس":1326,"أصل":1634,"اث ":2044,"ئي ":3037,"ب، ":1611,"ات ":57401,"إضا":1407,"اح ":3081,"اج ":4358,"إسب":3960,"ئل ":2536,"اة ":4792,"إسل":3234,"اب ":9098,"إسر":1395,"إدا":2059,"اء ":22733,"إحد":6124,"إذا":1265,"ئر ":2350,"ة، ":15452,"اي ":2212,"بت ":1435,"بب ":1489,"اه ":2688,"بة ":12759,"ان ":40858,"با ":4030,"بد ":6780,"بح ":1410,"اً ":10489,"اف ":3597,"إيط":2527,"إير":1277,"ام ":31875,"ال ":23428,"ئرة":2003,"اك ":2756,"اق ":4807,"اع ":5640,"إلي":1606,"إلى":18043,"إما":1946,"اض ":1731,"إقل":1614,"اط ":2017,"إنج":4004,"إنت":2754,"إنس":1518,"ار ":15484,"اد ":13875,"اص ":1586,"از ":3651,"اس ":4999,"بي ":11247,"ت، ":2665,"اضي":4506,"اصر":1618,"بو ":3109,"اصم":3245,"اعب":4589,"اعة":3428,"اعت":1731,"اعد":2039,"اعر":1432,"اطع":4959,"اطق":1642,"اطي":1824,"اعي":4248,"ادا":1408,"اخل":1728,"احي":1991,"اخت":2071,"احل":1721,"احت":2360,"احة":4653,"احد":2974,"ارب":3471,"ارة":7735,"ارا":5132,"ادي":11501,"ادة":5295,"بق ":1932,"ادر":1555,"ازي":2304,"است":9208,"اسة":1966,"اسا":1699,"اري":13333,"اره":1487,"ارو":1690,"ارك":3428,"ارع":1412,"ارس":4342,"بل ":7375,"ارد":1984,"ارت":2585,"ارج":1460,"اصة":1686,"اشم":2112,"به ":4466,"اسم":7414,"اشت":1547,"اسي":6201,"اسع":1320,"بن ":14722,"ئلة":1510,"بع ":5351,"ائد":1719,"ائر":5727,"ائز":1678,"ائم":1524,"ائل":4622,"ائي":8565,"ابا":4420,"ابت":1320,"ابة":2266,"ابر":1286,"ابع":5010,"ابي":3084,"ات،":1833,"ابل":1965,"ابق":3015,"ابن":2545,"اتح":2012,"اتب":1961,"ئيس":4458,"اتي":3444,"اته":3712,"ئية":4071,"اجت":1267,"ئيل":1637,"بر ":15051,"بط ":1447,"آخر":1604,"أن ":6708,"آن ":1439,"أس ":2019,"أبر":1929,"أبي":2141,"أبو":3598,"أحد":5818,"أحم":2690,"أخر":2780,"أحي":1348,"أرا":1757,"أرب":1317,"أرض":1852,"أرد":4510,"أست":1490,"أسا":2106,"أسس":2383,"أشه":1227,"أصب":1379,"أو ":17977,"أي ":2409,"جزي":2701,"جدي":2169,"جزء":1922,"حل ":2135,"جرا":1466,"جري":2220,"جزا":2275,"جتم":2280,"ثير":2870,"جات":1361,"ثنا":1548,"جار":2741,"جال":2447,"جان":2465,"جبا":1237,"جام":4767,"جبل":1832,"جلي":4317,"جما":2589,"جلس":2643,"حاف":7429,"حال":5921,"حاك":1299,"جهة":1426,"جنو":6846,"جها":2454,"جمي":2428,"حاد":3278,"حار":1507,"جنس":1263,"حاس":1289,"جمع":2692,"جمه":2396,"جمو":3506,"خط ":1642,"خر ":2126,"حي ":2379,"د، ":2106,"دث ":1521,"حصل":1369,"دة ":22913,"دت ":1894,"حسب":1473,"دا ":3896,"حسي":1367,"حسن":1628,"حرك":2813,"حرا":2117,"حرب":2616,"خل ":1874,"حزب":1786,"حري":2754,"دأ 
":1419,"حدث":1823,"حدة":6151,"حدا":1890,"حدي":3344,"حدو":1601,"حدى":6092,"حتى":2850,"حتو":1235,"جية":1498,"جيا":1555,"جيد":1284,"جين":1406,"جود":2059,"ختص":1274,"حول":2075,"حوض":3972,"حوا":3753,"خاص":2779,"خار":1757,"حمل":1584,"خاب":1378,"حلي":1951,"حمد":7071,"حما":1348,"حكو":1587,"حكم":2998,"حقي":1342,"دس ":1564,"در ":3643,"دد ":5353,"ذا ":6858,"دن ":3532,"ده ":2855,"دو ":1586,"دى ":7927,"خصي":1891,"ر، ":3918,"دي ":16032,"حيا":3483,"ختل":2576,"حيث":5743,"حية":2840,"حيو":1435,"خدا":1713,"خدم":4681,"خرا":1406,"دل ":1527,"خرج":2035,"خرى":2288,"دم ":12380,"خلي":2944,"خلا":4049,"دان":2670,"دام":2603,"دال":1357,"داي":1553,"داء":1232,"دائ":2836,"دار":5259,"دات":2179,"داخ":1695,"داد":3415,"ذي ":7019,"رج ":3503,"رة ":42156,"رت ":3799,"رد ":2816,"را ":5024,"رب ":12795,"ذه ":5431,"درس":2590,"درج":3597,"درا":3796,"دري":2750,"دول":7090,"دون":2547,"دوا":2452,"دود":1701,"دور":4858,"دها":2546,"دني":4787,"ذات":1939,"رف ":5073,"زء ":1646,"ديو":1641,"ديم":4297,"دين":17630,"ديا":3615,"دية":10224,"ديس":2111,"ديد":5637,"دير":2378,"ديث":1831,"رس ":4802,"رز ":1453,"رع ":2500,"دما":2598,"رض ":4034,"اقي":2703,"اقت":1926,"اقة":1778,"افي":2956,"اقع":1373,"الق":27005,"الف":20852,"الل":14610,"الك":19511,"الي":32249,"ان،":1862,"امت":1485,"امج":2075,"اما":2969,"الن":19094,"الم":106832,"امة":2785,"الو":15207,"اله":10510,"امر":2004,"امع":4584,"الأ":53765,"الآ":3087,"الج":24268,"اكي":1240,"الث":10222,"الت":44136,"الة":4500,"الب":27575,"الا":23279,"اكم":1385,"الإ":23108,"الس":25474,"الش":23985,"الر":18584,"الز":4628,"الد":22592,"الذ":9992,"الح":28530,"الخ":10845,"الع":45326,"الغ":7597,"الط":9353,"الص":10951,"الض":2239,"افة":3763,"افظ":6821,"اين":1399,"ايو":2112,"بتم":1591,"ايا":4981,"اية":7953,"اير":3864,"بدأ":2040,"، ":87650,"بحي":1333,"بحر":3662,"اهر":2224,"اني":27995,"انه":3927,"انو":4021,"بان":8783,"بال":24841,"باد":1374,"اند":2371,"بار":7325,"انس":1452,"باس":3805,"انا":3741,"باب":2616,"انب":1315,"انة":1432,"بات":4730,"انت":10216,"امي":6789,"باح":1289,"امل":2591,"بائ":1535,"اوي":2785,"اول":1939,"اهي":1800,"تا ":1767,"تب ":3208,"بشك":1596,"بدا":2243,"برا":7053,"برت":1316,"برل":1347,"برو":1928,"برن":1554,"بري":6364,"تر ":3568,"بطو":2790,"بعد":8590,"بعة":3652,"بغد":1205,"بعض":3329,"بني":2271,"بها":4540,"بنا":4249,"ئ ":1205,"ا ":148281,"بلي":1501,"بلا":3530,"ب ":70841,"بلغ":5588,"بلد":6651,"ء ":25878,"بقا":1527,"أ ":2761,"ح ":16042,"خ ":7746,"د ":111902,"ذ ":7044,"بين":10021,"ة ":417779,"بيل":2585,"بيض":1538,"بيع":2501,"ت ":98281,"بيا":2983,"بية":13845,"بير":5823,"بيت":1659,"بون":1363,"ث ":14734,"بول":3156,"بور":2404,"ج ":16315,"بوا":1540,"تي ":19035,"ثة ":2508,"تو ":1236,"تى ":3327,"ته ":7416,"تل ":1533,"تم ":5813,"ثر ":3741,"؛ ":1372,"تأس":2342,"تبر":4383,"تبع":2953,"تال":2664,"تان":2849,"تبا":1684,"تبة":1215,"تار":5301,"تاب":6518,"تاج":2425,"تبل":2737,"تجا":3016,"تخد":5000,"تخب":1329,"تخا":1370,"تحد":6290,"تحر":1572,"تحا":3110,"تحت":3198,"تدا":1332,"ثم ":2995,"تري":1455,"ترو":2478,"ثل ":4130,"ترا":5683,"ترة":2099,"ترك":2885,"تشا":2599,"تسم":2030,"تست":1924,"تصا":3295,"تشر":1532,"جة ":6594,"تشي":1720,"تضم":1238,"تطو":1569,"تعا":1531,"تعت":2311,"تعم":2414,"تعل":2476,"تعر":2059,"تعد":2614,"جد ":3856,"جر ":1340,"تفا":2566,"تقا":2028,"تقد":2088,"تقر":1287,"تقس":2440,"تقع":8338,"تقل":2844,"تهر":1315,"ثال":2031,"تها":6374,"ثان":4499,"تنظ":2616,"تمي":2924,"تنا":1728,"تمد":1486,"تمر":1589,"تلف":3439,"تلك":1400,"تما":2848,"تمب":1576,"تكو":3404,"تلا":1846,"تين":4590,"تية":1561,"تيا":2052,"تون":4528,"توي":1644,"توف":2858,"تول":1447,"تور":2920,"توس":1688,"توب":2148,"توا":1906,"تهم":1291,"جه ":1478,"جم ":2117,"جل ":2059,"جي ":1843,"حة 
":7424,"حت ":2989,"حر ":2424,"حد ":6903,"ثلا":2751,"ثما":1889,"ثقا":1374,"وي":38534,"ي،":8021,"وو":3027,"وى":2435,"ية":149201,"يب":15166,"يا":76978,"يئ":2097,"يض":5937,"يط":9332,"يع":16206,"يز":13642,"يس":24541,"يش":6638,"يص":2709,"يخ":8388,"يد":33231,"ير":50995,"يت":21919,"يث":8777,"يج":6934,"يح":6752,"يه":17510,"ين":82989,"ًا":1622,"يو":34779,"يك":18257,"يق":23953,"يم":33457,"يل":36829,"يف":12030,"يغ":2332,"يي":7762,"فع":3459,"فض":3349,"فص":2010,"فظ":7666,"فز":1405,"فر":20396,"فس":4079,"فة":10560,"فت":8107,"ق،":1414,"فا":19253,"فب":1563,"قف":1276,"قع":16814,"قط":5495,"قض":1727,"قص":4820,"قش":1443,"قس":4834,"قر":18930,"قد":24585,"فى":1203,"قت":5868,"في":137714,"فه":4197,"قب":10150,"قة":17223,"فو":7133,"فم":1876,"فن":5345,"قا":33043,"فك":2212,"فل":7982,"فق":4801,"لأ":58367,"لآ":3323,"كف":1496,"كس":5921,"قي":21786,"كث":5852,"ل،":2207,"كذ":1298,"كر":19872,"كز":7004,"كأ":1420,"قل":10832,"قم":3276,"كا":41128,"قن":3553,"كب":9528,"قه":2295,"كة":14080,"قو":10571,"كت":14016,"قى":1403,"لك":43877,"مؤ":5145,"لق":35996,"لف":31832,"لط":12463,"لض":2442,"لص":14130,"لش":25578,"لغ":17940,"لع":56838,"لظ":1228,"لخ":11729,"لد":39289,"لج":27745,"لح":34548,"لز":5989,"لس":38336,"لذ":10795,"لر":20100,"كم":13188,"لا":89050,"كن":10747,"كل":17531,"لإ":25479,"لت":54464,"م،":3972,"كي":22561,"لث":10985,"لب":37291,"كه":2654,"كو":22542,"لة":26342,"مع":31975,"مغ":4324,"مص":12239,"مض":1909,"مط":2885,"مف":2713,"مق":12733,"مك":8220,"مل":28998,"مت":25250,"لى":49564,"مة":21124,"لو":37077,"مج":15409,"ن،":5643,"لي":91040,"مث":7965,"لم":137485,"لل":37112,"مب":12595,"له":28033,"ما":84509,"لن":25730,"مز":2658,"مر":34445,"مش":6760,"مس":26092,"مخ":5223,"مح":22151,"مذ":1262,"مد":31562,"نغ":3178,"نظ":9512,"نع":2681,"نط":11265,"نل":1207,"نم":4711,"نق":5444,"نك":3963,"نف":6969,"نح":3012,"نج":9690,"مي":49166,"ه،":2071,"مى":3142,"نت":26741,"نة":26561,"مو":33352,"مه":13330,"نب":6671,"نا":51288,"من":110993,"مم":11520,"نص":4888,"نش":6378,"نس":20514,"نز":3094,"نر":1454,"نذ":4480,"ند":18475,"هـ":2446,"وأ":7843,"وإ":2408,"هل":3624,"هم":13508,"هن":5865,"وا":96512,"هب":2043,"نه":22166,"ها":63427,"هت":1254,"نى":2312,"نو":28243,"هة":2492,"هج":2434,"و،":1267,"ني":68053,"هذ":11164,"هد":6895,"هز":1306,"هر":13839,"وغ":3162,"وظ":1284,"وع":15945,"وق":15305,"وك":13977,"وف":17742,"ون":47917,"وه":18574,"ول":64949,"وم":36461,"هي":33444,"وث":2062,"وت":24045,"هو":37979,"وة":2112,"وب":25979,"ود":18341,"وخ":2267,"وح":8074,"وج":13010,"وس":25063,"وز":7609,"ور":46060,"وذ":2568,"وط":5229,"وض":6816,"وص":4846,"وش":3688,"دة":23918,"خو":3104,"دت":2861,"دا":34518,"دب":3013,"دخ":1822,"دث":2213,"خي":4889,"خل":11579,"خم":2109,"دأ":2148,"خط":4248,"خر":10258,"خد":6707,"خص":5211,"ده":6608,"دو":24057,"دى":7961,"دي":71508,"ر،":3948,"دف":3789,"دق":1886,"دك":1306,"دل":4421,"دم":20032,"ذا":11430,"دن":9851,"دع":2356,"دد":6837,"در":20942,"دس":4645,"حث":1327,"جي":13871,"حج":2300,"حة":7643,"جو":9984,"حت":11398,"جن":11985,"حا":28730,"حب":2668,"جه":7398,"جل":12168,"جم":17274,"جع":2397,"جس":2540,"جز":8412,"جر":7944,"جد":8657,"خت":6160,"حي":21851,"د،":2117,"حم":14977,"حن":1639,"خا":10036,"خب":2705,"حو":13676,"حف":3106,"حق":4632,"حك":5593,"حل":8199,"حض":1296,"حص":3737,"حز":2507,"حر":15992,"حس":5733,"حد":30065,"تغ":2721,"تف":6519,"تم":21550,"تل":11923,"تك":7075,"تق":23507,"تى":3340,"ثة":2642,"تو":27372,"ته":18459,"ثا":9771,"تن":10701,"تج":7596,"تح":21120,"تت":7159,"تر":25584,"تخ":11500,"تد":5897,"تش":11506,"تص":9178,"تز":2287,"تس":9386,"تع":17161,"تض":2140,"تط":3895,"ثق":2026,"ثل":9196,"ثن":2113,"جا":22121,"ثم":5275,"جة":6738,"ثو":2447,"جب":5048,"ثي":5910,"جت":3434,"تي":34720,"ثر":5818,"بغ":20
25,"بع":25200,"به":11285,"بن":26227,"بم":5940,"بل":28735,"بك":4252,"بق":7310,"بد":14809,"بج":2196,"اً":11293,"بح":10218,"بت":8320,"اي":29388,"ة،":15567,"بب":2982,"اه":11841,"او":11068,"بة":13305,"بط":8037,"بش":3786,"بص":1566,"بس":4381,"بر":41886,"تا":29084,"تب":19287,"تأ":5378,"بو":18997,"ت،":2681,"بي":60818,"ئد":1726,"ئر":5978,"ئز":1688,"إن":13110,"ئا":1293,"إم":3605,"ئة":2198,"ا،":7371,"إي":6297,"اء":25293,"ئل":4704,"ائ":31094,"ئم":1653,"از":9640,"ار":69575,"اذ":2153,"اد":41536,"اض":8516,"اص":11589,"اش":9130,"اس":39158,"ات":77501,"اة":4919,"ئه":1298,"اب":38588,"اخ":6254,"اح":20482,"اج":12583,"اث":6287,"ئي":14445,"ب،":1623,"اف":23866,"بأ":2984,"اق":16552,"ام":64051,"با":69767,"ان":106270,"اك":11934,"ال":726452,"بإ":1475,"اع":27489,"اغ":2741,"اط":14678,"ؤس":2217,"أو":36459,"أه":4128,"أي":8325,"أل":9391,"أك":9086,"أن":19914,"أم":14875,"إس":11365,"إر":1668,"إذ":1752,"إد":2895,"إخ":1619,"إح":7179,"إب":1901,"إل":22468,"إق":2002,"إع":1925,"إض":1479,"آخ":1667,"آل":2561,"أت":2542,"آن":1925,"أب":10779,"أح":11512,"أخ":6156,"أث":2807,"أج":3225,"أر":12567,"أد":4137,"أص":5345,"أش":3739,"أس":15627,"أغ":3162,"أع":5434,"أط":1989,"أق":3836,"أف":5069,"غي":8633,"غو":4024,"غن":2949,"غل":3357,"عي":17935,"غد":1582,"غر":13200,"غس":1789,"غز":1267,"عق":2845,"عل":49520,"عم":19388,"غا":8614,"عن":19968,"عه":6016,"غة":3737,"عو":7644,"ظي":3474,"عث":1594,"عت":10284,"عة":23777,"عد":27555,"عش":4458,"عس":1709,"عز":2669,"عر":25424,"عظ":2315,"عط":1205,"عض":5918,"عص":2498,"عا":50253,"ظه":2422,"عب":25388,"ظم":4301,"ظا":4120,"طن":4817,"طل":7662,"طق":11023,"طف":1950,"طع":6327,"ظر":2587,"طي":9898,"طو":11014,"ظة":6299,"ضم":8353,"ضل":2264,"ضع":1853,"طس":1527,"طر":9793,"ضي":7171,"طح":1428,"طا":16584,"طب":7274,"طة":4986,"ضو":3284,"صل":10145,"صف":5067,"صط":2224,"صص":1416,"صغ":2398,"ضر":2826,"ضة":1642,"صو":8178,"صم":4802,"صن":4515,"ضا":11319,"صي":8440,"شف":1822,"شك":5750,"شق":1763,"شع":5311,"شغ":1513,"صح":4260,"صد":4269,"صر":13415,"شم":11602,"صا":13513,"صب":5260,"شه":5994,"صة":3495,"شو":3361,"شي":12032,"سع":7475,"سط":12291,"سس":4815,"سف":3591,"شأ":1378,"سي":47561,"شت":5440,"سو":18175,"سه":3635,"شب":4456,"شا":16192,"سن":12898,"سم":24549,"سل":21272,"سك":11128,"شر":23735,"شد":1209,"شخ":3092,"زع":1204,"سب":17052,"زه":1918,"زن":1607,"سا":35091,"ست":30958,"سة":7853,"زو":4517,"زم":3202,"زل":2051,"سد":1627,"سر":8485,"سج":2382,"زي":20894,"رس":13875,"رش":2594,"رز":2717,"رط":1498,"رض":6362,"رع":5610,"رغ":2804,"زء":1924,"رل":3156,"رك":24686,"رق":13754,"رف":9661,"رو":32705,"زة":4009,"زب":2789,"ره":9008,"زا":10186,"رن":13036,"رم":7236,"س،":1548,"ري":92759,"رى":6117,"زر":4095,"ذر":1628,"ذك":2116,"رأ":2244,"رئ":5538,"ذل":5170,"رب":36257,"ذه":6871,"را":67798,"رت":12252,"رة":43634,"ذو":1454,"رج":14223,"ذي":9511,"رخ":1356,"رح":4450,"رد":11552,"ف ":28941,"ـ ":3661,"ع ":53161,"غ ":7968,"ص ":6276,"ض ":14917,"ط ":15243,"ظ ":1554,"ر ":119691,"ز ":20518,"س ":42343,"ش ":6531,"ً ":11009,"ي ":305236,"ه ":68493,"ن ":236663,"ى ":82765,"و ":86344,"ك ":22299,"ق ":30798,"م ":131428,"ل ":111126," ، ":18980," و ":12423," م ":5345," جن":4512," حا":7431," جه":1218," جم":4339," جي":2378," جو":3451," حت":2592," جد":1981," جز":3197," جر":1610," ثل":1523," ثم":3265," جا":6114," جب":2225," تي":1533," خل":4736," دا":4684," خط":2109," حق":1280," حك":2014," حل":1546," حي":8173," حم":2037," خا":3436," حو":8126," حر":3258," حس":2733," حد":2242," بك":1952," بق":1818," به":5365," بن":17676," بم":5621," بل":8899," بغ":1392," بع":9614," بس":2145," بر":8567," بط":3259," بش":2647," بت":3125," بد":4624," بج":1734," بح":3333," بأ":2828," ال":581886," بإ":1446," با":31729," ان":6245," 
ام":2207," اع":1488," ار":1569," اس":7954," اب":3029," ات":1745," اح":1418," اخ":1766," تو":6684," ثا":1511," تن":5058," تم":5949," تل":2911," تك":2690," تق":13534," تع":7393," تط":1471," تش":4009," تص":2525," تس":4420," تر":4248," تخ":1540," تد":1553," تج":1794," تح":6360," تت":4407," تا":4988," تب":4366," تأ":3604," بو":6771," بي":13768," أل":4956," أك":6683," أن":14293," أم":6327," أو":27007," أه":2795," أي":6316," آل":1841," أب":8228," أخ":2816," أح":8158," أج":2000," أث":1423," أر":3743," أد":2035," أص":3105," أس":5597," أش":2245," أع":3015," أغ":2285," أف":3376," أق":2564," إي":4306," إن":5007," إم":1413," إب":1312," إس":5794," إر":1268," إذ":1294," إد":1530," إح":6465," إل":21224," إق":1208," طو":2165," عش":2714," عر":3955," عد":6773," عا":21951," عب":8841," عي":1567," غر":3853," عل":33952," عم":7142," عن":15313," غا":2188," غي":2774," سع":2415," سي":7136," شب":1886," سو":5683," سم":1931," سن":7850," شا":3992," سك":3794," سل":4129," شر":6157," شخ":1437," شع":1651," شك":1235," شي":2028," شم":4082," صا":1583," شه":2018," صح":1413," ضم":5470," طا":1844," طب":1958," طر":3408," در":2995," دي":6094," دو":6652," ذا":1885," رئ":2370," ذل":2035," را":2673," رس":1784," ري":3022," رق":1403," رو":5273," زي":1372," سب":3177," سا":6895," ست":1885," لك":4551," مؤ":2595," لق":1950," لع":2571," لغ":1253," لص":2532," لج":1545," لح":1350," لد":1501," لب":2260," كو":5523," لت":3577," كي":3645," كل":6741," لإ":1374," كم":7185," كن":1321," لا":10830," مل":4355," مك":2868," مق":7894," مغ":1598," مع":14545," مص":6997," مس":11150," مش":2698," مر":11437," مد":14412," مح":14231," مخ":2723," لي":4897," مث":3231," مج":7634," لو":4068," مت":7729," لن":2133," ما":17956," مب":2224," له":5386," لل":21688," لم":6816," نف":1935," نق":1569," نظ":2491," نس":4267," نش":1472," مم":3100," نا":8059," من":85288," مه":1676," مو":12716," مي":4669," نج":1432," وأ":7664," هـ":2402," هن":1965," وا":50314," وإ":2381," هذ":9093," نو":6152," ها":3011," نه":2387," ني":2637," فر":6896," فب":1504," فا":3547," فت":2193," قص":1901," قط":1391," فق":1797," فل":2626," فن":1697," قا":5741," فو":2197," فه":1391," قب":5253," في":115948," قد":7793," قر":6492," لأ":2928," قل":1628," كت":3116," قو":2125," كب":2590," كا":17231," قي":2400," كث":1288," كر":7439," ود":1531," وخ":1367," وح":3870," وج":3378," هي":20610," وت":16290," هو":22238," وب":6445," وص":1974," وش":2083," وس":5388," وز":1947," ور":2782," وذ":1466," وغ":1624," وع":5254," ون":2492," وه":15877," ول":15982," وم":13412," وق":7666," وك":6757," وف":4804," وو":1985," وي":13320," يا":1484," يب":2139," يح":3014," يج":1805," يت":5470," ير":1696," يد":1787," يص":1234," يش":1764," يس":4063," يع":6186," يل":3428," يم":2984," يق":8446," يك":1977," يو":8148," ين":4302,"فس ":1508,"فر ":2554,"فع ":1454,"فا ":1471,"ق، ":1401,"فة ":10112,"قع ":14708,"فار":1789,"فات":2286,"فال":1824,"فاع":2545,"فان":1282,"فبر":1355,"فتر":2588,"قب ":1566,"قا ":1491,"قت ":1589,"قة ":16768,"في ":111805,"قد 
":7827,"عظم":1719,"عضو":1448,"عشر":3482,"عسك":1369,"عزي":1296,"عري":1226,"عرو":2224,"عرف":3788,"عرض":1945,"عرب":8060,"عرا":4113,"عدة":2689,"عدا":2204,"عدد":5430,"عدي":2807,"عتم":1554,"ظيم":2842,"ظهر":1451,"عبر":1705,"عبد":6249,"عبي":2224,"عتب":4439,"عات":3583,"عائ":1805,"عاب":1251,"عال":9007,"عام":18844,"عبا":2776,"عاص":3770,"عار":1586,"عاد":3689,"ظمة":1530,"غسط":1434,"غدا":1272,"غرا":1470,"غرب":8757,"عية":4635,"عيد":1672,"عين":3620,"عمل":7530,"عمو":1379,"عمر":2346,"عها":1818,"عني":1581,"غال":2120,"عند":3134,"عهد":2118,"عود":4342,"علا":3567,"علي":8023,"على":25792,"علو":2893,"عما":5586,"علم":6076,"غني":1336,"غير":5788,"شعر":1240,"شعب":2093,"شما":7339,"شكل":4349,"صال":3820,"شهر":2525,"صبح":1804,"صاد":2357,"صار":2022,"شمي":2083,"صدر":2012,"شير":1659,"شيخ":2626,"طة ":4836,"ضي ":2799,"صطل":1544,"صري":4147,"ضم ":1876,"طب ":1314,"طس ":1466,"صغي":1584,"صول":1823,"صور":2575,"ضاف":1670,"ضاء":2485,"صنا":1592,"صمة":2695,"طق ":1799,"صية":1437,"صين":1293,"طي ":1477,"ظة ":6257,"طان":3004,"طال":3847,"طاق":1625,"طار":1511,"طائ":1561,"ضمن":5751,"طبي":3127,"ضية":1599,"طري":4168,"ظم ":1728,"عب ":10724,"عة ":23265,"طعة":4116,"عد ":10757,"عر ":1910,"طلح":1521,"طلق":2117,"طلا":1304,"عض ":2738,"طقة":8653,"طول":4093,"طوي":1533,"طور":2637,"ظام":2445,"طني":2154,"طين":2907,"عل ":1213,"غة ":3658,"عن ":10475,"عه ":1714,"عي ":3208,"س، ":1536,"ري ":14515,"رن ":3048,"زب ":1762,"ره ":3780,"زة ":3835,"رو ":2228,"رى ":5751,"رق ":5370,"رك ":3561,"رجي":1396,"ردن":4580,"ردي":1386,"رتب":1460,"ربع":2964,"ربي":13069,"رجة":3401,"رجا":1686,"ذين":1312,"رتف":1376,"رته":1217,"رتي":1328,"ذلك":5018,"ران":5308,"ربا":2177,"راه":1718,"راي":2650,"رة،":1284,"راً":1437,"راط":1969,"راض":2661,"راع":2044,"راف":2275,"راك":1677,"راق":4338,"رام":2614,"رال":1755,"راب":2845,"رائ":2827,"راج":1725,"رات":7689,"رئي":4628,"رار":2322,"راد":1959,"راز":1251,"راس":3033,"راء":3735,"ذكر":1364,"رأس":1309,"سس ":1682,"سر ":1409,"زي ":2367,"سة ":7637,"ست ":2491,"رسة":1580,"رسا":1400,"سا ":2325,"رسو":1660,"رسم":1498,"رسي":1211,"سب ":2371,"سم ":8379,"زرا":1870,"سل ":2019,"ريو":1361,"ريم":1723,"رين":5069,"ريك":7429,"ريل":2022,"ريف":1889,"ريق":6975,"ريا":9368,"ريب":2334,"ريخ":4312,"ريد":2596,"ريت":1569,"رية":19287,"ريط":2009,"رير":1548,"ريس":2387,"روي":1263,"روف":2873,"رون":3533,"روم":2673,"روع":1294,"روس":3818,"روت":1414,"روا":3408,"روب":3276,"رها":3942,"رنس":4822,"زار":1503,"رنا":1951,"زائ":1892,"سع ":1228,"ركي":2730,"ركز":6519,"ركا":1845,"رقم":1561,"ركة":6574,"سط ":4117,"رقي":3306,"رفي":1260,"رقة":1238,"شر ":3663,"سن ":1880,"سي ":9223,"سري":1260,"سرا":2022,"صب ":1600,"ساع":1424,"سام":1889,"سال":1558,"سبب":1717,"سان":4418,"سبا":5421,"سبت":1759,"سبة":1761,"ساح":5795,"ساس":2185,"سائ":1433,"سات":2066,"ساب":4056,"ستخ":5235,"ستا":3297,"ستر":2386,"زوج":1305,"زيو":1415,"ستع":2199,"ستق":2335,"زيا":2152,"ستي":2146,"زية":3984,"ستو":3126,"زيز":1328,"زير":3447,"صر ":6441,"سلة":1569,"سكن":1586,"سلا":5866,"سكر":1892,"سلي":1241,"سمب":1534,"سمة":2358,"سلم":2588,"سما":2312,"سلط":1809,"سلس":3040,"سكا":4595,"سطس":1421,"سطي":2518,"صة ":3360,"سعو":3150,"شرك":4217,"شرق":6331,"شرو":1260,"شري":3368,"صل ":5101,"ضة ":1538,"ضا ":2191,"سوف":1368,"سون":1558,"شتا":1442,"سوي":1779,"سوا":1477,"سود":1542,"سور":3979,"شاع":1427,"سمى":2128,"سنة":6767,"سمه":2089,"شار":4714,"سمي":3653,"شخص":2650,"صف 
":1872,"سين":3825,"سيم":3305,"سيق":1533,"سيس":1471,"سير":1927,"سيد":1957,"سية":7660,"سيا":7442,"شته":1322,"يين":4613,"يقو":1288,"يقي":3960,"يقع":4212,"يكا":2890,"يكي":7123,"يلع":3028,"يلة":3508,"يكو":2936,"يلا":3946,"يلي":4907,"ين،":2158,"يلم":2210,"يما":3516,"يمة":2518,"يلو":2309,"يمت":1285,"ينا":6385,"يمن":1503,"يمك":1768,"ينت":2412,"يمي":5784,"ينة":14065,"ينو":1382,"يني":7771,"ينم":1347,"ينه":1443,"يها":5753,"يوس":2044,"يور":1429,"يوج":1408,"يوا":1512,"يون":8407,"يوي":1456,"يوم":3633,"يول":2410,"يقا":3504,"يقة":2474,"يفي":1837,"يفة":1491,"يضا":3644,"يسي":3184,"يسم":2811,"يزي":6603,"يست":3754,"يسا":1431,"يره":2025,"يرو":2802,"يري":3325,"يعي":2030,"يعر":1330,"يعة":1447,"يعت":2839,"يطا":4786,"يجي":2143,"يتا":1503,"يتو":1517,"يتي":2848,"يتم":2214,"يته":1496,"يدي":4565,"يرا":5132,"يرة":7852,"يحي":1325,"يدة":3171,"يدا":2277,"يال":1789,"يبا":1460,"يان":5624,"يام":1663,"ية،":7809,"ياً":2041,"يبل":2267,"يبي":2606,"يئة":1233,"يا،":2810,"ياس":4902,"يار":3005,"ياد":1919,"ياض":3099,"ياء":2370,"يات":14201,"ياب":2680,"يائ":1532,"وز ":1740,"ور ":10877,"ود ":7936,"وض ":4335,"نفس":2468,"وس ":4788,"هاد":1315,"هاج":1381,"هاز":1295,"هار":1592,"هاش":2115,"نما":2137,"ها،":1369,"وع ":4025,"نقل":1354,"وط ":1225,"نيس":1670,"نيا":8848,"نية":21320,"نوي":1623,"نون":2797,"نور":1366,"نوب":7417,"نهم":1521,"نوا":4198,"نوف":1932,"نوع":3104,"هاي":1376,"هام":1659,"نها":10954,"نهر":1868,"وف ":3389,"هذا":5637,"وق ":2861,"نين":2937,"نيو":3670,"وم ":10050,"ون ":24085,"هرة":1975,"هذه":5272,"هرب":1206,"وك ":1613,"ول ":15598,"وي ":5541,"ي، ":7950,"وى ":2336,"مغر":2269,"معي":2389,"معل":1517,"معه":1532,"معر":3470,"معت":1206,"معة":3514,"معا":4652,"هد ":3199,"هر ":6421,"مقا":7017,"مقر":1875,"مقد":1322,"ملة":1472,"ملا":2140,"مكن":2428,"مكا":1474,"مكت":1317,"منا":3879,"ممل":5220,"نائ":1835,"ناء":3405,"ناد":6151,"منذ":4141,"منت":3699,"نات":4769,"منظ":1521,"منط":9032,"ناط":2245,"ناص":1498,"منص":1619,"ناس":1610,"ناع":1893,"ملك":8686,"مما":1241,"ملي":3442,"ممث":1944,"مهو":2239,"موا":8484,"نبي":1323,"موج":1832,"مور":1727,"مود":1263,"موس":3401,"موع":3576,"موق":1778,"ناك":1264,"نام":2198,"نان":5318,"نبا":1540,"مها":3875,"منه":3913,"ناي":2308,"مني":1297,"نتق":1216,"نتش":1566,"نتخ":2464,"نتج":1837,"نتر":1240,"نتا":3081,"مون":2835,"موم":1267,"مول":1330,"ميل":3548,"ميع":1518,"ميد":1868,"ميز":2165,"مير":3381,"نتم":1247,"ميا":3833,"ميت":1549,"مية":10968,"نتي":2558,"نجل":4522,"مين":4370,"هـ ":2122,"ندا":2054,"ندم":1329,"ندو":1295,"ندر":1700,"ندس":1559,"ندي":3768,"هل ":1310,"هم ":9416,"نسا":3625,"نسب":2400,"وا ":3570,"نسم":2218,"نشا":1687,"نسي":6788,"وب ":7482,"هو ":28172,"وة ":2028,"وت ":2509,"هي ":25821,"نطق":9203,"نظا":2438,"نظر":1914,"نظم":2245,"نظي":2652,"ومن":4650,"ونا":4025,"ومي":3551,"ونس":2979,"وما":5525,"وله":2128,"ولي":9104,"ومة":1640,"ولو":3124,"ولى":4041,"ومت":1425,"ولك":1651,"يع ":3808,"ولة":5953,"وكي":1778,"ولا":11130,"ولد":6407,"وكا":4860,"يط ":1772,"وقا":1242,"وفم":1445,"وفي":6567,"يض ":1235,"وقد":4338,"وقع":2237,"وفا":1769,"يش ":2562,"يس ":7953,"يق ":5733,"يف ":4553,"وين":2166,"ويو":1325,"ويق":1310,"ويل":2767,"ويس":2024,"ويع":1988,"وية":5123,"ويت":3787,"وير":2162,"ويد":1280,"وهي":6413,"وهو":7384,"وني":7306,"يه ":8938,"يو ":7129,"يم ":12246,"ين ":40630,"ًا ":1445,"يك ":2151,"يل 
":12694,"وتو":2723,"وتق":1399,"هير":1949,"وتع":2163,"هول":1332,"وتر":1472,"وتت":1213,"وجه":1246,"وجي":2191,"وجو":2155,"وحد":1804,"هيم":1268,"وجد":2346,"واع":2005,"واس":3039,"وار":3337,"هند":2766,"واد":3067,"واح":3498,"واج":1496,"وات":4032,"واب":1388,"هنا":1458,"وائ":3044,"واء":1341,"هما":1800,"هور":3656,"وبي":4664,"وبل":1243,"وبر":3000,"واي":2060,"واق":2063,"وال":49239,"وان":5702,"وبا":3324,"وري":12364,"وسا":1602,"وزي":2144,"يا ":24320,"وسط":5112,"وسي":5438,"يب ":4787,"ية ":139658,"ودا":1655,"ودة":1389,"ودي":4370,"ورا":3830,"ورد":1319,"ورت":1259,"ورة":4176,"وزا":1263,"ورو":3091,"يد ":18420,"ير ":22585,"يز ":3857,"يث ":7060,"يت ":4897,"يج ":1671,"وطن":2615,"يح ":1259,"وعة":3516,"وعا":1992,"يخ ":5401,"وعي":1203,"لد ":7149,"لس ":3640,"لة ":25605,"كو ":1397,"لت ":2572,"لا ":8651,"كن ":4742,"قسم":1469,"لب ":3341,"قسي":2428,"لح ":4377,"لث ":1339,"م، ":3933,"كي ":5663,"لق ":2574,"لف ":3842,"كثي":2170,"له ":10668,"كسي":1295,"ما ":21645,"كرو":1414,"لم ":13383,"كري":3468,"لك ":9653,"كرا":1233,"كرة":8153,"كات":3795,"قنا":1376,"كار":2777,"لغ ":5250,"قلي":3507,"قيم":1916,"قيق":2120,"كثر":3184,"قية":4744,"كتو":3421,"قيا":3458,"كتب":2651,"كتا":3440,"قوم":2204,"قوا":2182,"كبي":3060,"كبر":3750,"كان":20675,"كام":2151,"كال":2865,"قني":1349,"كر ":2225,"كز ":6044,"كس ":1605,"قل ":2905,"فرا":2274,"قم ":1363,"فري":3990,"فرن":4775,"فرق":1262,"كا ":2472,"كة ":13804,"قى ":1356,"ل، ":2187,"قي ":5360,"فضل":1264,"فظة":6038,"فير":1961,"فيز":1412,"فية":2824,"فيد":1310,"فيل":3788,"فيه":4815,"فين":1921,"قتص":1591,"فيا":1360,"قدر":1289,"قدم":9632,"قدي":3437,"قرن":2599,"كم ":5038,"قرى":1679,"قري":5990,"كل ":7728,"قرا":2563,"قرب":1568,"فلس":2794,"فلا":1746,"قاط":5379,"قاف":1502,"قال":3148,"قائ":2080,"قاب":1505,"فنا":1210,"قات":2176,"قاد":1517,"قار":3777,"فمب":1413,"فور":1725,"قبل":4422,"قان":1522,"قام":2893,"لمع":7155,"لمغ":2396,"ماء":2699,"لمر":5941,"لمس":9884,"لمش":2559,"لمص":4216,"لمط":1236,"لمت":9057,"للو":1687,"لمة":1580,"لمج":4154,"للي":1280,"لمخ":2131,"لمح":5998,"لمد":5967,"لمؤ":2072,"للم":3559,"لمب":2156,"لله":4901,"لما":10778,"لنف":1218,"لنس":1662,"ماس":1244,"لنظ":1668,"ماع":3269,"لمي":7408,"مات":4917,"مار":7604,"ماد":2511,"لند":3683,"لمل":4323,"لمك":1995,"لمق":3111,"لمو":6905,"لمه":2311,"لنب":1670,"لنا":3789,"لمن":8707,"لمم":6914,"مائ":1529,"مؤس":2067,"لكر":4170,"لقي":1879,"لكب":1845,"لكا":2896,"لكت":3059,"لكة":5304,"لقو":2652,"للغ":3104,"للع":2547,"للح":1276,"لكي":2900,"للت":1711,"لكو":3675,"للب":1703,"لكه":1307,"للا":2747,"لكن":3420,"لكل":2395,"للأ":1511,"لفر":5137,"لفا":3058,"لفة":1423,"لفت":1915,"لقر":6914,"لقد":5675,"لقص":1290,"لقط":1312,"لفل":2420,"لقب":2460,"لقا":4848,"لفن":2512,"لفي":3665,"نس ":2245,"ند ":3871,"نذ ":4066,"لعب":7815,"لعا":12423,"لعد":3105,"لعش":1359,"لعز":1344,"لعر":10614,"لعص":1325,"لعل":5280,"لغا":2170,"لعم":4185,"لغة":3148,"لغر":4050,"لطب":2889,"لطا":3030,"لطر":1829,"ه، ":2068,"مي ":10289,"لطي":1270,"و، ":1265,"ني ":19921,"نى ":2242,"مصر":6502,"مصط":1846,"نو ":1687,"هة ":2433,"مصا":1239,"مسل":3430,"نه ":6502,"مسي":2261,"مشا":1795,"ها 
":46135,"مست":5627,"مسا":7688,"مرك":7198,"مري":7170,"مرو":1205,"مرا":3794,"مرة":1768,"مرب":1566,"مرت":1375,"مدي":14448,"مدر":2755,"مدن":2068,"مخت":2389,"مدا":1536,"محل":1510,"محم":5603,"ليو":6519,"ليه":5228,"مجا":2341,"لين":3186,"مجم":3939,"محا":8346,"مجل":3322,"ليا":10564,"ليب":2394,"لية":13788,"متو":1712,"ليز":4850,"ليس":2021,"ليد":6582,"ليم":5806,"ليل":2383,"مثل":6304,"ليف":2446,"لوي":1597,"لون":3102,"متا":1234,"لول":3949,"لوم":4251,"متر":3819,"متد":1293,"متح":5418,"متع":1688,"لهن":1850,"لوا":3317,"لهو":1322,"مبي":1451,"لوج":1868,"لوح":1460,"لور":2028,"لوس":2838,"لوط":2044,"لوك":1356,"مال":13395,"مام":2587,"لنق":1670,"لنو":1865,"مان":13056,"مبا":2715,"لها":8071,"ماي":3281,"لهج":1315,"مبر":6074,"لأص":1592,"لأس":4356,"لأر":8270,"لأد":1635,"لأع":1940,"لأخ":2355,"لأح":2438,"لأب":1984,"لأف":1387,"لأل":2944,"لأك":1560,"لأن":3516,"لأم":7460,"لأو":8534,"لأي":1221,"لإس":5236,"كلا":1532,"مر ":4953,"مس ":1733,"مد ":7895,"مة ":20092,"لو ":1775,"مت ":2546,"لى ":49394,"لي ":18230,"ن، ":5582,"مج ":2119,"لسف":1264,"لشا":2194,"لسن":1963,"لسك":1528,"لسل":6280,"لسي":5235,"لشب":1219,"لسو":4213,"نب ":1230,"مه ":3791,"لسع":2105,"لسط":2488,"لسا":4332,"من ":76224,"نا ":5184,"لصو":1503,"مى ":3080,"نت ":7260,"لصي":1707,"لشم":4027,"لصا":2983,"لشه":1224,"لصح":2065,"نة ":25901,"لشي":3442,"لشر":6885,"لشع":2442,"لدي":8697,"لدو":6858,"لدر":3930,"لخل":2460,"لدا":2727,"لدة":1345,"لحي":2699,"لري":2867,"لرو":3390,"لرس":1728,"مل ":8293,"لذي":7896,"لرئ":2291,"لرا":2519,"لته":1222,"كيا":1952,"لتن":1720,"لثا":5157,"كية":5479,"لتو":2940,"لتي":15294,"كيل":2229,"كيم":1808,"لجا":2810,"لتا":4876,"كون":7678,"كوم":2797,"لتح":2710,"لتج":1719,"كوي":2650,"لتر":3418,"لتص":1218,"لتش":1260,"لتع":2665,"لتل":1255,"لتق":2650,"لحد":2696,"لجو":1720,"لجي":2140,"لحر":4857,"لحس":1519,"لحم":1993,"لخا":2598,"لحق":1482,"لحك":2052,"لجد":1516,"لجب":1382,"لجز":3847,"لحا":4240,"لجن":5221,"لجه":1784,"لجم":3177,"لاث":2511,"لاج":1537,"لاح":3135,"لاد":4259,"كند":1265,"لار":1311,"لاب":1932,"لات":8075,"لاق":3425,"لاف":2118,"لاس":6196,"لاع":5890,"كلي":3267,"لإي":1693,"مع ":8709,"كلم":2454,"لإم":2003,"لإن":6523,"كما":4559,"كور":1677,"لبو":1874,"لبي":3816,"لبل":3617,"لبن":3363,"كول":1392,"لبر":5552,"كهر":1331,"لبح":3898,"لاي":9010,"كنه":1287,"لام":9049,"لان":6278,"لبا":3771,"لال":6356,"لبط":1368},"n_words":[11749565,13990834,9440598],"name":"ar"}
\ No newline at end of file
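Because these profiles ship inside the langdetect package installed by the CMake rules above, consumers normally go through its public API rather than parsing the JSON files directly. A usage sketch; the sample strings are illustrative.

```python
# Usage sketch with langdetect's public API; sample inputs are illustrative.
from langdetect import DetectorFactory, detect

DetectorFactory.seed = 0             # detection is probabilistic; pin the seed
print(detect("Dit is 'n kort sin"))  # expected: 'af' (Afrikaans profile above)
print(detect("هذا مثال قصير"))        # expected: 'ar' (Arabic profile above)
```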
+++ /dev/null
-{"freq":{"D":2636,"E":1936,"F":2232,"G":2334,"A":4351,"B":3121,"C":4592,"L":2480,"M":3950,"N":1781,"O":1368,"H":1934,"I":6368,"J":1261,"K":1010,"T":2986,"W":1604,"V":1965,"P":3771,"S":5211,"R":2316,"X":1029,"f":3146,"g":5865,"d":9193,"e":32549,"b":4459,"c":9844,"a":30637,"n":21831,"o":21963,"l":16413,"m":12336,"k":7480,"h":8702,"i":28615,"w":2350,"v":3143,"u":15394,"t":17966,"s":19762,"r":22456,"p":5894,"z":1439,"y":4095,"x":1681,"²":3527,"̀":1150,"μ":1054,"ν":2280,"ο":2756,"ι":1833,"κ":1014,"λ":1144,"ε":1298,"α":2356,"ί":816,"σ":1479,"ς":1823,"ρ":1221,"τ":1963,"ь":4950,"ю":16520," o":1042,"я":119927,"ш":24527,"щ":37947,"ъ":118638," k":4461," d":1024,"ф":35061,"х":26992," e":2141,"ц":57885,"ч":69969,"р":442208,"с":363493," a":954,"т":513431,"у":110117,"ѝ":1026," t":1519," p":1297," s":806,"Й":1921,"И":10722,"Л":9341,"К":22064,"Н":13530,"М":19622,"П":22329,"О":13337,"Б":18556,"А":21177,"Г":11598,"В":16284,"Е":7594,"Д":15403,"З":5627,"Ж":1534," J":1196," K":898,"Ш":3131," H":1769,"Щ":2903," I":3444," N":1367,"Ю":3236,"Я":1596," O":956," L":2204," M":3517," B":2694,"Т":16259,"У":4389," C":3884,"Р":16110," A":3726,"С":33637," F":2034,"Ц":2988," G":2045,"Ч":3270,"Ф":8267," D":2259," E":1564,"Х":7715,"л":282008,"к":273267,"й":56719,"и":734943,"п":177542,"о":637033,"н":589099,"м":161532,"г":129585," S":4264," R":2038,"в":294348,"б":87024," P":3303,"а":881417," W":1432,"з":132692,"ж":46599," V":1365,"е":647345,"д":212987," T":2518," А":17337," Б":18206," В":15997," Г":11364," Д":15029," Е":7181," Ж":1503," З":5448," И":10217," Й":1919," К":20624," Л":9135," М":19107," Н":12710," О":12505," П":21699,"Co":972,"I ":2499," б":25135," а":29615," г":41568," в":81473," е":87884," д":46722," з":26978," ж":5999," и":93394," л":8242," к":53137," н":137556," м":36472," п":102853," о":88041," Р":15097," С":31643," Т":15680," У":4153," Ф":7812," Х":7558," Ц":2895," Ч":3246," Ш":3096," Ю":3200," Я":1567," т":27331," у":9960," р":36272," с":111437," ц":6298," ч":13239," ф":16514," х":6966," ш":3296," щ":6810," ю":3698," я":2381,"Ca":930,"Ma":1469,"II":1888,"Th":962,"b ":891,"a ":4950,"i ":1873,"ge":970,"he":2112,"ha":1204,"g ":1075,"ea":1397,"ec":826,"ed":916,"de":1775,"di":1045,"do":824,"h ":1014,"el":2062,"en":3274,"et":1200,"es":2494,"er":5287,"ca":1035,"e ":9458,"da":1108,"f ":974,"co":1092,"ci":1241,"ch":1482,"ce":1332,"d ":2383,"at":2587,"as":1457,"ar":3743,"al":2739,"ai":928,"am":1115,"an":4726,"ac":1396,"ad":830,"ae":1227,"nu":977,"nt":2097,"ns":1121,"no":824,"of":927,"om":1215,"on":3740,"ol":1525,"m²":3381,"ot":815,"os":1234,"ou":906,"or":3049,"r ":3015,"pe":805,"lo":1036,"ll":2061,"o ":2327,"ma":1173,"mb":856,"me":1262,"mi":900,"na":1825,"nd":1832,"ne":1694,"ng":1442,"ni":2114,"m ":2526,"km":4286,"li":2736,"le":2314,"la":2281,"n ":4895,"hu":824,"hi":896,"id":1203,"ic":2468,"ia":2261,"ig":1082,"ie":1280,"k ":904,"ir":844,"is":2366,"it":1525,"iu":2316,"il":1887,"in":4071,"io":1821,"l ":2342,"y ":2050,"vi":1029,"ve":1027,"x ":1094,"ul":1045,"ur":1295,"us":5560,"um":1527,"un":918,"tu":1122,"to":1372,"tr":962,"te":2234,"ti":2705,"th":2095,"ta":1616,"ss":891,"st":2061,"se":1041,"si":1173,"rt":1054,"ro":1958,"rn":867,"ri":3533,"re":2281,"rd":988,"ra":2624,"t ":3334,"s ":10393,"² ":3527,"ς ":1819,"ν ":1021,"К ":979,"В 
":2290,"юг":1081,"юз":1052,"юж":980,"юл":1213,"юн":1123,"юр":1010,"ют":1002,"юц":3295,"юч":2002,"яд":1101,"яг":923,"яв":7999,"ян":5481,"ям":2556,"ял":2107,"як":3070,"ях":803,"яс":1766,"ят":23106,"яр":1490,"ящ":989,"щи":5926,"ще":7866,"ща":13149,"щт":2297,"що":3015,"щн":816,"ъв":5386,"ъг":8535,"ъд":2915,"ъе":2628,"ъж":1821,"ъз":5682,"ъб":2128,"ът":9515,"ъч":1319,"ъц":2116,"ъщ":3958,"ъл":18163,"ък":3158,"ън":6470,"ъм":3577,"ъп":2106,"ъо":998,"ъс":7274,"ър":28553,"ьо":4687,"хе":1611,"хи":3810,"хн":1728,"хо":7132,"хр":1667,"ху":1707,"ха":4667,"ци":32071,"цк":2149,"ца":7811,"це":9248,"чл":917,"чн":11748,"чо":1988,"чи":7731,"чк":2263,"чу":949,"чр":804,"цъ":803,"че":23255,"ча":14633,"чв":2213,"шн":3596,"шк":2046,"ши":6517,"ше":4003,"шв":1228,"ша":3103,"ск":74214,"см":3598,"сл":11817,"со":8891,"сн":7769,"ср":4324,"сп":8021,"св":7943,"се":41908,"си":23052,"рш":1570,"ръ":16028,"ря":3303,"са":17003,"рс":15501,"рт":9867,"ру":13758,"рх":2778,"рц":2298,"тн":10904,"тл":2176,"тк":3861,"тс":3598,"тр":27985,"то":91021,"те":76446,"тд":804,"тв":20249,"ти":47326,"сь":1346,"съ":18957,"ся":1659,"та":97852,"тб":3928,"су":2617,"ст":96170,"сц":1590,"ур":10561,"уп":6474,"ут":4765,"ус":9980,"ум":6401,"ул":6590,"ун":6478,"уи":1236,"уз":3299,"ук":5239,"уд":4037,"уг":3988,"уж":1616,"уе":1334,"уа":3124,"тя":2319,"уб":6162,"ув":3514,"тъ":10928,"тт":3628,"ту":8218,"фу":2995,"фс":1030,"фр":3456,"фо":5112,"фи":11161,"фе":4259,"фа":3044,"уч":6066,"уш":4842,"ух":1233,"уц":1227,"Щ ":2591," I ":819," II":1042," Ma":1457,"а ":358572,"С ":937," Ca":925," Co":962,"Ис":1007,"Им":977,"Ин":1158,"к ":21804,"Из":2286,"Ив":1372,"й ":18686,"Ле":1710,"Ли":1795,"Ла":1638,"Ку":1413,"Кл":1286,"Ко":5445,"м ":14788,"Кр":2175,"Ке":836,"Ки":1609,"Ка":6073,"л ":25621,"Йо":1544,"На":5278,"Не":1875,"Ни":2119,"Мо":2461,"о ":136170,"Ма":9260,"Ми":2960,"Ме":2505,"Ло":1497,"н ":69303,"Лу":1452,"Па":3246,"Пе":2744,"Пи":1272,"Пл":3225,"с ":29451,"По":4526,"Оп":1050,"р ":30654,"Ос":2133,"Ор":1247,"От":1087,"Об":2207,"Ок":2446,"Но":2049,"п ":2647,"в ":63211,"Ам":1040,"Ан":3455,"Ал":2956,"Ав":1300,"Ба":2769,"Ат":831,"Ар":1719,"б ":1391,"АЩ":2595,"Во":969,"д ":24331,"Ве":2399,"Ви":2290,"Въ":2366,"Га":1581,"Бо":2717,"г ":21217,"Бр":1925,"Бе":2785,"Би":1506,"Бъ":3741,"Ва":2968,"Бу":1134,"Ди":2316,"Дж":3197,"Де":2501,"Др":1035,"До":1935,"ж ":1730," Th":951,"Ед":825,"Ев":1551,"Ге":2579,"Гр":2290,"е ":164103,"Го":1786,"Гъ":1180,"Да":1394,"и ":202644,"За":3607,"Ел":966,"з ":12874,"Ша":824,"Юж":1162,"ю ":929," km":4266,"я ":66327,"Ст":4040,"Та":1626,"Съ":4599,"Ти":1166,"Те":2600,"ф ":1914,"То":5455,"Тр":1786,"Ту":994,"Тя":1180," e ":1798,"х ":2350,"Пр":4433,"Пъ":1179,"СА":2694,"Ра":3306,"Ре":3655,"Ри":2528,"т ":98842,"Ро":3022,"Ру":2132,"СС":822,"Са":2940,"Св":1625,"Си":2245,"Се":4020,"Сл":1252,"Ск":946,"Ср":881,"Сп":1322,"Со":2387,"у ":10195,"Це":1510,"ш ":1936,"Че":1438,"щ ":4203,"ц ":3623,"Фо":984,"Фр":2381,"Фе":819,"Фи":1127,"Фл":885,"Ха":2310,"Хр":971,"Хо":1081,"ч 
":2829,"Хе":1295,"лю":6411,"мб":1834,"ля":9658,"ма":34017,"мв":3536,"ме":34130,"ми":25035,"лм":3442,"лн":16796,"ло":33848,"лс":4866,"лт":3196,"лу":5370,"лъ":2856,"къ":5642,"лв":914,"лб":2941,"ла":34165,"лж":1120,"ле":40750,"лд":1625,"лг":12737,"лк":4727,"ли":64840,"лз":2343,"км":1179,"кн":2476,"кл":6420,"кр":18141,"кс":6288,"ко":52495,"кт":11901,"ку":5862,"кц":1433,"ка":68697,"ки":58354,"кв":3364,"ке":7674,"йн":7255,"йо":1892,"йк":1995,"йл":1354,"йм":901,"йс":13136,"йт":4652,"ия":73064,"ищ":2902,"иш":2457,"йд":1453,"йв":1615,"ио":15275,"ип":3934,"им":23532,"ин":58356,"ик":36656,"ил":29439,"ии":4086,"ий":12560,"иц":13361,"ич":23215,"иф":2446,"их":3360,"ит":63592,"ир":17875,"ис":34276,"ри":74729,"рк":5043,"рл":2965,"рм":7553,"рн":12220,"ро":46663,"рп":941,"ра":98873,"рб":2434,"рв":7403,"рг":7600,"рд":5085,"ре":68474,"рж":3084,"рз":1336,"пъ":7097,"пр":53247,"пт":2156,"пс":2576,"пу":5784,"пи":12418,"по":49161,"пл":7108,"оя":7068,"па":18259,"пе":14376,"ощ":6422,"ош":1648,"оч":6409,"оц":2874,"ос":31400,"ор":52184,"оп":12711,"оо":918,"ох":1705,"оф":5618,"оу":1691,"от":71852,"ок":20081,"ол":46187,"ом":18413,"он":40545,"ож":8402,"оз":8385,"ои":10610,"ой":13778,"ов":49786,"ог":14314,"од":35825,"ое":7078,"оа":1786,"ня":5762,"об":22311,"нъ":1330,"нц":6022,"нч":1314,"нт":20098,"нс":29338,"нф":1197,"ну":2379,"но":61747,"нн":7922,"нр":827,"нк":4229,"нз":932,"ни":106631,"не":37699,"нг":9593,"нд":12141,"мя":2035,"на":206302,"мъ":3046,"му":8961,"мс":4731,"мп":7095,"мо":14552,"мн":4561,"ге":6356,"ги":12185,"гн":1340,"го":24434,"гл":8226,"гр":23857,"гу":2534,"гъ":1359,"дв":5622,"да":30493,"вг":1251,"вд":1142,"ве":44922,"ви":31125,"вк":2396,"вл":5218,"вн":10484,"во":35579,"вр":11007,"вс":5682,"ву":1966,"вт":3431,"вш":985,"въ":12004,"га":25651,"вя":3485,"би":11046,"бе":7947,"бр":9346,"бн":1672,"бо":12574,"бл":10831,"бу":4434,"бс":2154,"бя":950,"ва":58204,"бъ":10139,"бщ":5858,"ад":29187,"ае":3072,"аж":2734,"аз":20177,"аб":6725,"ав":32263,"аг":6621,"ам":19866,"ан":92124,"ап":10532,"аи":1848,"ай":16768,"ак":18833,"ал":43351,"ах":2622,"аф":3137,"ач":7356,"ац":9711,"ас":30917,"ар":52528,"ау":3593,"ат":95702,"ая":1940,"ба":6691,"ащ":6812,"аш":2543,"зт":3682,"зс":1637,"зр":2676,"зп":8092,"зх":1144,"зу":1891,"зк":2737,"зи":16355,"зо":4852,"зн":6991,"зм":2937,"зл":3437,"ив":14438,"иг":8987,"иа":9057,"иб":2301,"иж":2121,"из":36345,"ид":10074,"ие":26794,"зъ":1412,"жо":1971,"жу":937,"жи":10091,"жк":1307,"жн":4179,"за":35424,"зб":1209,"зв":12292,"зг":1227,"зд":6172,"зе":4307,"еф":2109,"ет":45213,"ес":32066,"ер":49860,"еп":8513,"ео":5548,"ен":105327,"ем":23126,"ел":47225,"ек":18670,"ей":11615,"еи":962,"ез":20414,"еж":9844,"ее":2077,"жд":7425,"же":11581,"жа":5863,"ея":1281,"ещ":3642,"еч":4774,"еш":4262,"ех":2366,"ец":5302,"дс":7551,"др":9809,"ду":10303,"дн":14843,"дм":2251,"дп":805,"до":23013,"ди":35808,"дл":987,"дк":815,"де":31362,"дж":3808,"еб":3744,"ев":22477,"ег":6758,"ед":43311,"еа":2059,"дя":1235," th":1043,"дъ":7215," ар":2430," ас":835," ба":2316," ав":2928," ад":1436," ал":2238," ак":2281," ан":5186," ам":5117," ап":1113," бу":1045," ва":1142," бъ":8445," би":3599," бе":1826," бр":3218," бо":2578," бл":1708," вт":1082," въ":7192," ви":5408," ве":5179," во":6415," вс":1918," вр":3197," вл":1868," вк":1271," дв":3064," да":7239," го":11993," гл":1931," гр":14818," ге":2801," ев":904," ед":7019," дъ":3603," дн":1385," до":9466," др":4782," ду":4688," де":7755," ди":3107," же":950," еп":830," ел":1747," ек":959," ез":2706," зе":1022," за":22793," зв":852," жи":3991," зн":1109," иг":2469," ид":823," из":17689," ил":7883," 
ин":3652," им":7956," ит":1193," ис":3053," ка":12665," ки":1971," кр":5742," ко":21829," кн":1333," км":1031," кл":2272," ку":1841," ла":1472," къ":3355," ли":3157," ле":2135," ме":7169," ми":3625," ма":9333," мо":4895," мн":1906," му":6862," ни":1121," не":9125," на":119378," но":5035," ок":9717," оз":981," од":1340," об":13021," ня":1648," ощ":1554," оф":1095," от":45416," ор":4392," ос":5533," оп":3022," по":33250," пл":5058," пи":3055," пе":4542," па":4088," Ре":3648," Ра":3274," Ро":3019," Ри":2525," Пр":4415," СА":2616," Пъ":1174," Пе":2739," Па":3238," с ":12955," По":4518," Пл":3222," Пи":1267," От":1080," Ос":2127," Ор":1245," Оп":1048," Те":2595," Ти":1162," То":5392," Тр":1782," Ст":4014," Съ":4598," Та":1620," Св":1624," Си":2198," Се":4013," Сл":1251," Ск":944," Сп":1315," Ср":879," Со":2383," Ру":2131," Са":2935," Фр":2376," Фо":984," Фи":1126," Фл":885," Фе":819," Тя":1180," Ту":993," Це":1509," Хр":970," Хо":1079," Хе":1290," Ха":2307," Ша":822," Че":1435," Юж":1162," Ба":2734," Ат":830," Ар":1714," в ":45152," Ан":3448," Ам":1038," Ал":2947," Ав":1297," Ва":2965," Бъ":3733," Бу":1133," Бо":2714," г ":8207," Бр":1924," Бе":2781," Би":1492," а ":4115," Ед":825," Ев":1550," Ди":2315," Дж":3188," Де":2497," Др":1035," До":1933," Ел":962," Въ":2363," Га":1572," Ве":2397," Ви":2282," Во":966," Гъ":1180," Да":1391," Ге":2574," е ":70638," Го":1784," Гр":2287," Ис":1003," Ин":1152," Им":970," Йо":1543," Ки":1607," Ке":833," Ка":6065," и ":47012," За":3604," Ив":1368," Из":2283," Мо":2455," На":5243," Не":1867," Ни":2110," Но":2047," Об":2206," Ок":2446," Кл":1282," Ко":5424," Кр":2158," Ку":1412," Ла":1629," Ле":1707," Ли":1793," Ло":1496," н ":881," Лу":1447," Ма":9223," Ме":2502," Ми":2950," В ":2161,"Зап":1336,"Ива":1277,"II ":1339,"Кар":1717,"Кал":1241,"Кон":1216,"Кол":892," ра":14338," ре":11569," ри":1504," ро":5884," пр":44389," пс":1741," пу":1035," пъ":4915," св":5747," си":8451," се":28117," сл":4935," см":1742," ск":1388," сп":4517," ср":3911," со":1964," ру":1874," са":8475," ти":1471," те":8827," то":4709," тр":4417," сц":1244," ст":9556," су":971," та":2605," съ":15734," ус":1789," уп":1428," ун":1177," ту":1481," тя":1501," тъ":1070," фо":2263," фр":2272," фу":2793," фе":1675," фи":5383," фа":1634," уч":2779," хр":1022," хо":1370," ху":840," хи":1498," ха":1277," це":3524," чо":1060," чл":866," чи":1314," че":4520," ча":4234,"Мак":3324,"Мар":2452," ша":881," ща":6448," юж":900," юг":1007,"Южн":1054,"Нам":821,"Нас":1183,"Ник":1456,"ад ":8166,"ав ":839,"ам ":1105,"ан ":12026,"ак ":1047,"ал ":5489,"ай ":7880,"Окр":2063,"авт":1723,"ага":1670,"авя":1374,"агр":1076,"аго":1396,"ада":3119,"ади":3451,"аде":5585,"адо":1154,"адм":1400,"адн":2853,"аем":863,"би ":1015,"ажд":973,"Опе":874,"аба":833,"або":2518,"ава":11168,"авн":2861,"авл":2542,"аво":1409,"аве":2231,"ави":4818,"алс":906,"алн":8729,"ало":2628,"алб":1650,"ала":2998,"алк":1807,"али":10040,"але":5565,"амо":2012,"амп":965,"ама":2784,"ами":3947,"аме":7196,"анн":912,"ано":4066,"анс":13953,"ант":4813,"анц":1702,"ана":8635,"анд":5902,"анг":4584,"ани":19618,"ане":11702,"анк":1502,"азр":1036,"азп":4124,"азо":828,"азн":1081,"азл":1632,"ази":2620,"азв":2060,"аза":1484,"айс":963,"айк":1056,"айо":1501,"айн":2031,"акт":4848,"ако":1531,"акс":811,"аке":4784,"аки":1127,"ака":2207,"Пар":949,"ас ":2329,"ар ":3643,"ат ":10948,"Пло":2474,"ба ":1349,"ая ":1362,"ащ ":1410,"Пет":1186,"Пър":1058,"САЩ":2595,"Пол":1310,"При":1038,"Пре":1768,"Про":1122,"Рим":1524,"АЩ 
":2589,"Род":1271,"Раз":1500,"Реп":2237,"Але":1070,"Сан":921,"Ант":866,"Рус":1292,"Соф":1291,"Сев":1817,"Све":1167,"Бра":846,"Съе":2317,"Тя ":1123,"Ста":1292,"Сто":900,"Бел":1301,"Тов":901,"Той":2267,"Вел":1059,"Бъл":3319,"Гра":1071,"Вът":1318,"Фра":1531,"Гер":981,"Гео":1043,"Дим":1088,"Хри":815,"Гър":1058,"Джо":1453,"Цен":1121,"Евр":1146,"лбу":1567,"лам":805,"лан":5659,"лас":5904,"лат":3266,"лги":963,"ме ":3219,"лга":11038,"ма ":9888,"ля ":1353,"лав":4235,"лаг":1602,"лад":2109,"къс":900,"към":1928,"кус":995,"кул":1639,"кци":1405,"коя":2700,"кре":815,"кра":5743,"кри":2482,"кръ":8385,"кса":1220,"кси":1227,"кте":804,"кти":2426,"кто":3328,"ктр":1676,"кла":1490,"ло ":7485,"кло":1049,"клю":1993,"кни":998,"ког":1332,"ков":4810,"ком":4727,"кон":4363,"коп":1296,"кор":1875,"кос":1575,"кот":3860,"кое":2107,"кои":3390,"кой":3594,"кол":6405,"кин":1233,"кия":7501,"лм ":1747,"кит":4721,"ле ":963,"кед":4374,"ли ":17289,"ква":2572,"кат":20763,"кар":2222,"кан":8113,"как":1566,"кал":2533,"каз":1398,"ла ":6130,"йто":3630,"йск":8916,"йст":3543,"кт ":1337,"йна":2642,"йно":1113,"йни":1595,"кс ":1165,"йон":1165,"ко ":8475,"км ":856,"ки ":42277,"йво":1025,"ият":14767,"од ":4250,"нац":1700,"нау":1528,"нач":4474,"ог ":1478,"нан":1145,"нам":2383,"нал":6581,"мят":820,"нат":17022,"нас":8028,"нар":6159,"нап":1914,"над":1993,"наг":991,"най":5108,"наз":881,"нде":899,"нда":1986,"нгл":4272,"нга":993,"нем":1875,"нен":7192,"ои ":1291,"нер":3755,"нес":1561,"нет":4998,"нег":1689,"нев":1009,"нди":2598,"ндо":877,"ндс":1234,"ндр":1171,"нив":1213,"низ":3481,"ник":6697,"ниг":977,"ние":16641,"ок ":2212,"ой ":4351,"ня ":922,"ов ":8975,"нав":1281,"нт ":3764,"мпи":1340,"мпе":2702,"мпа":913,"мот":885,"мск":3992,"мун":847,"муз":1721,"мик":1065,"мил":1270,"мич":1772,"мин":5435,"мис":1421,"мир":3990,"мит":3385,"мия":2187,"но ":26769,"мна":831,"мно":2749,"мод":977,"мов":951,"мож":1224,"мон":1361,"мол":819,"мос":1285,"мор":2165,"нс ":993,"нд ":1174,"мац":839,"мал":2220,"мак":2180,"май":1485,"лят":839,"мат":5742,"мас":872,"мар":1751,"ляр":941,"нг ":1106,"ман":6016,"лян":866,"лям":2082,"люц":3280,"люч":1984,"ляв":2485,"маг":843,"мес":1879,"мет":5373,"мен":8396,"ни ":29732,"мер":7582,"мей":1790,"меж":2783,"мед":1422,"мвр":3060,"не ":9731,"на ":142528,"лощ":4100,"му ":4914,"лни":5607,"лно":5548,"лна":4870,"лог":4324,"лож":4131,"лор":940,"лос":1248,"лот":2604,"лом":832,"лон":1406,"лов":4045,"луч":1122,"лст":1405,"лск":2670,"лта":884,"лзв":2068,"лиа":1575,"лиг":889,"лив":1397,"лиз":3791,"лим":1095,"лий":4336,"лик":5518,"леж":946,"лев":1784,"лед":5662,"лер":878,"ми ":2984,"лен":16713,"лем":2731,"лек":3978,"лет":2054,"мо ":1651,"лищ":1562,"лиц":2347,"лич":3301,"лис":3220,"лит":6583,"лиф":806,"лин":3617,"лия":3676,"лко":1751,"лка":1568,"оят":3638,"пат":1787,"ояв":887,"пад":4016,"рг ":1045,"оян":1222,"пан":2587,"пар":3257,"рд ":1208,"ре ":1433,"ра ":12349,"пит":1282,"пис":5195,"пла":2637,"пле":1081,"ро ":1581,"пло":2643,"ри ":13979,"пер":6644,"пет":985,"пей":864,"пен":871,"пец":914,"рк ":1201,"ори":9957,"орд":1685,"оре":5253,"орг":4291,"орс":1471,"оро":2299,"орм":3070,"орн":2306,"опу":959,"ора":5790,"опе":1825,"опи":2343,"опо":1901,"опр":1742,"опа":1705,"оте":1711,"отк":1219,"отл":842,"оти":2542,"ото":15284,"отн":1748,"отр":1056,"отв":1153,"отб":1366,"ота":1480,"осе":1009,"оси":1734,"осл":2267,"осм":1054,"осн":3443,"осо":2118,"ост":14287,"орт":2090,"оръ":1155,"осв":865,"оми":2906,"оме":3032,"оля":2830,"ома":3616,"олю":3412,"олу":2069,"олс":1164,"олн":1271,"по 
":9531,"оло":11644,"олк":1549,"оле":5004,"оли":8455,"олз":2123,"ола":2641,"окр":6747,"окт":1051,"око":5161,"оня":990,"онс":3193,"онт":1752,"они":6360,"оно":4114,"онн":3311,"она":6297,"онд":803,"оне":3282,"омо":1980,"омп":2499,"оше":916,"очи":922,"очн":3093,"още":1644,"ощт":2295,"офе":923,"офи":2989,"оце":1219,"оци":1428,"няк":1962,"няв":1202,"ова":9654,"общ":5055,"обр":3456,"обо":1542,"обн":833,"обл":3221,"оби":2313,"обе":2463,"па ":3482,"оит":3156,"ойв":1029,"ойт":3479,"ойс":1115,"ойн":2834,"ока":2144,"ожн":1229,"ози":2118,"оза":2042,"озн":2520,"оиз":4705,"одн":3018,"оди":10451,"оду":932,"одр":1677,"одс":1049,"одо":2573,"оем":891,"оен":1806,"оет":2833,"оже":5313,"ове":9903,"ови":7840,"ово":5297,"овн":3952,"овс":1386,"ога":2254,"оги":3492,"ого":3019,"огр":2846,"ода":3875,"оде":5129,"от ":40884,"нот":4382,"нос":7389,"нош":880,"нор":1012,"ос ":1874,"ном":2595,"ное":844,"ног":2289,"нов":10487,"ор ":8928,"нно":1572,"нни":2936,"нна":3374,"нко":832,"он ":7439,"нка":817,"ом ":1452,"ния":18624,"нир":1151,"нис":3663,"нит":11345,"ним":1508,"нин":2416,"ол ":1852,"нич":2172,"ниц":3898,"нце":866,"нци":4501,"ощ ":1985,"нтъ":2337,"нуа":804,"нта":3339,"нте":1868,"нти":4317,"нто":1031,"нтр":2358,"нск":22095,"нст":3816,"сам":1827,"сан":2444,"сат":1871,"сва":1135,"сво":1889,"те ":30876,"све":3239,"свъ":838,"сев":2490,"сед":1178,"сел":9974,"сек":1165,"сеп":870,"ти ":11022,"сен":2491,"сем":2208,"сет":920,"сер":1205,"сия":1303,"сич":1328,"сис":1833,"сит":2033,"сих":1495,"син":1858,"сил":2220,"сим":1511,"ски":43899,"ска":20500,"сле":5451,"сла":2258,"ско":7743,"сми":843,"слу":1333,"то ":53757,"сло":1460,"сна":1239,"сни":1224,"соб":1154,"сов":1181,"сок":1395,"сно":4759,"спе":2031,"спа":1513,"спи":822,"соф":1092,"соц":1039,"сре":3295,"спо":2745,"роц":854,"рот":1758,"роф":1212,"роп":2362,"рос":2878,"ст ":15103,"роя":1120,"рта":1478,"рти":2728,"рск":11358,"рси":1583,"руг":2898,"руп":3391,"рус":2190,"рум":975,"рци":1577,"рхи":885,"ръц":1917,"ръс":815,"ръг":7752,"ръж":849,"ръб":829,"та ":68838,"рад":11688,"рае":904,"раж":1539,"раз":12192,"раб":3055,"рав":5786,"рам":2735,"ран":13900,"рай":2815,"рак":2465,"рал":6906,"раф":2143,"рац":1472,"рас":2637,"рат":10837,"рая":1251,"ращ":1083,"рби":1070,"рва":2274,"рди":900,"рдж":839,"реб":1455,"рев":6384,"рег":1970,"ред":14324,"рет":1976,"рес":2290,"реп":1732,"си ":5183,"рен":6538,"рем":3987,"рел":1209,"рек":2832,"рей":892,"рез":11468,"реж":2438,"ржа":2590,"рещ":1470,"реч":1092,"реш":2153,"се ":16807,"рво":979,"рве":1684,"рви":2226,"рга":3761,"рги":1489,"рда":805,"рия":9828,"рио":1833,"рим":2306,"рин":4333,"рик":7682,"рил":2477,"рий":2243,"рич":4087,"рит":6765,"рир":851,"рис":4939,"рка":815,"риа":2344,"риг":1450,"рив":950,"рие":1786,"рид":1515,"риз":1180,"рни":3773,"рна":4116,"рок":1893,"рол":1507,"ром":2389,"рон":2340,"роз":1160,"рои":4466,"рой":1470,"ров":7381,"рог":1151,"род":7855,"рое":938,"рно":2617,"рла":1189,"рми":2339,"рма":3665,"пра":5058,"при":9598,"пре":21619,"про":14907,"поп":1206,"пор":4301,"пос":3943,"пот":1094,"поч":1394,"рт ":2255,"пое":904,"под":4233,"пов":2082,"пон":1889,"пом":919,"пол":13153,"пок":1115,"поз":1787,"пуб":3689,"пул":1018,"пте":935,"пси":1481,"са ":8477,"пър":3460,"път":1769,"пъл":1566,"вар":1920,"ват":9299,"ващ":3095,"ван":13950,"вал":2194,"га 
":4136,"бщи":2449,"бъд":803,"бър":1419,"бъл":7641,"бща":1581,"бще":1011,"бум":1554,"бук":823,"бск":1045,"вто":2970,"вси":896,"вск":2447,"вст":1194,"вре":3504,"ври":3322,"вро":1874,"връ":1065,"вол":4504,"вой":3658,"вое":1398,"вод":4759,"вот":3890,"вор":2694,"вни":3836,"вна":3375,"вно":2879,"вля":1602,"вле":1219,"вли":1041,"вла":1152,"го ":2882,"вкл":1196,"виц":847,"вич":1491,"вия":2762,"виз":1513,"виж":1141,"вил":1146,"вин":3278,"вис":2864,"вит":3903,"вид":3578,"вие":1422,"веч":1347,"вес":4369,"вет":7499,"вер":5871,"вен":9985,"ги ":2787,"вел":1221,"век":3636,"веж":1619,"вед":2219,"вгу":856,"ва ":24595,"бан":1118,"ачи":1132,"аши":924,"аща":2195,"ащи":2202,"ащо":823,"афс":811,"афи":915,"ача":3148,"аче":1418,"аци":9279,"апр":1718,"апа":4649,"апо":1282,"апи":1535,"арх":1193,"арс":9548,"арт":4677,"аса":1041,"аре":2062,"ард":1683,"ара":4896,"арн":1173,"арм":1084,"аро":3343,"ари":12961,"арл":1170,"арк":1896,"аст":14401,"ася":976,"ата":47391,"аси":2434,"асе":6138,"асо":1029,"ату":1909,"ате":8423,"ати":9387,"ато":12657,"атр":1084,"бол":3321,"бор":2442,"бот":2082,"бро":1543,"бри":1900,"бре":1262,"бра":3810,"бла":3919,"бли":5668,"во ":9926,"ви ":4285,"бен":998,"бер":1182,"без":1026,"бед":1013,"бел":1391,"бек":854,"бит":1770,"бил":1742,"бик":912,"бив":1415,"ве ":3400,"дан":2923,"дар":1829,"дат":3453,"дви":1371,"два":2719,"две":833,"ед ":5333,"дал":1095,"дад":3266,"дав":2214,"ев ":3676,"дее":1168,"дек":992,"дей":1639,"дем":1659,"дел":3837,"ден":12663,"дер":1762,"джи":1277,"ей ":1537,"дес":1272,"дет":1508,"ез ":10672,"дст":4125,"дск":2604,"дро":826,"дру":2364,"дре":1806,"дри":2347,"дра":1827,"ет ":4265,"душ":3708,"ец ":2918,"ен ":33180,"дия":1914,"диц":1640,"ем ":1311,"див":1005,"дим":1128,"дин":15737,"дио":936,"ел ":8445,"дис":974,"дит":1804,"дие":842,"ек ":3487,"доб":1818,"дов":2389,"ес ":1992,"дос":1220,"дор":1353,"док":903,"дон":5500,"дна":5206,"дни":3855,"дне":986,"ер ":6034,"дно":4679,"дми":1636,"да ":13229,"вяв":1024,"гал":815,"вят":853,"гат":2134,"ган":4672,"гар":12015,"де ":1425,"and":809,"an ":963,"във":2376,"въз":2174,"вър":4758,"гол":3089,"гор":2288,"гов":3764,"год":7938,"гру":3152,"ду ":2227,"гръ":1949,"гра":16257,"гри":1115,"гус":909,"ген":2209,"гер":1710,"ди ":4057,"гио":1272,"гия":3508,"гич":1145,"ati":807,"гле":897,"гла":2323,"до ":6155,"гли":4051,"жан":924,"жав":2301,"за ":15694,"еще":814,"жит":2528,"жив":1957,"жис":1541,"жес":1221,"жду":3122,"зи ":3101,"жен":7044,"жда":3366,"жно":1097,"жни":1433,"жна":1550,"ея ":816,"жа ":1065,"дъл":1733,"дър":3663,"еец":1133,"ежи":2367,"ежд":5041,"едс":4153,"жи ":1063,"еза":1907,"езо":914,"ези":4043,"ева":1249,"еви":2758,"еве":5028,"его":2858,"едв":1395,"еда":2616,"еде":4183,"еди":11058,"едо":5545,"едн":6376,"евн":1170,"ево":4583,"же 
":1283,"евр":1969,"ега":870,"еги":1678,"ент":10211,"енс":4280,"енц":1053,"ени":27128,"ено":7268,"енн":2947,"ена":10075,"ене":4173,"енд":1760,"еор":1928,"епт":1100,"епу":3069,"епо":1006,"ерс":2306,"ерт":1057,"ерм":2802,"ерн":4552,"еро":3866,"ери":14051,"ерг":986,"ере":1937,"ера":7049,"ерв":1379,"ейн":1799,"ейс":4529,"еке":826,"еки":1118,"еко":1156,"ект":4178,"екс":3530,"ека":2557,"елн":4078,"ели":9768,"елс":1847,"ело":4047,"еле":11585,"елг":908,"ела":1732,"емо":966,"еми":3799,"емс":1747,"еме":6592,"еля":2163,"ема":3994,"емв":2337,"ехн":1011,"еци":1425,"ече":2969,"ешн":2173,"еща":1146,"еса":875,"есе":2292,"еси":1554,"еск":7153,"есн":1000,"ест":15331,"ета":4044,"ети":4225,"ете":3470,"етр":1876,"ето":20630,"етн":1476,"етс":1612,"етъ":1538,"иве":2646,"иви":1509,"ива":2727,"иал":3813,"иан":3297,"иен":1203,"иет":8077,"иже":812,"иев":1368,"игр":2806,"иго":824,"ида":1273,"иди":1105,"иде":2406,"иво":2091,"ивн":2071,"ивш":976,"ига":2136,"иги":1169,"икн":877,"ико":3994,"йн ":1098,"ики":1147,"ика":16650,"ийс":6452,"иит":965,"изъ":865,"изх":1032,"изс":1113,"изт":2743,"изп":3253,"изм":1954,"изл":1205,"изо":1731,"изн":1188,"изи":3858,"изк":1474,"изд":1669,"иза":5067,"изв":5752,"ион":10338,"инц":2399,"иня":874,"иод":1232,"ине":4216,"ини":7734,"ино":2511,"инс":5959,"инт":1527,"ина":17378,"инд":1369,"инг":2575,"ими":3098,"име":5126,"имс":2034,"имо":1892,"имп":3052,"имн":924,"има":3880,"или":12587,"иле":1355,"илм":2481,"илн":880,"ило":3073,"ила":2122,"иси":1489,"иса":3563,"ист":15576,"исп":938,"исо":1673,"исл":1753,"иск":1625,"ити":5468,"ите":38859,"ита":6132,"исъ":1087,"ись":1263,"иту":932,"ито":6371,"итн":995,"ира":10008,"ири":1911,"иро":1628,"ихо":1462,"ице":1072,"ица":5886,"ици":6007,"ифо":843,"ище":1958,"ичи":1220,"ичк":1339,"ичн":6798,"ича":3197,"иче":8842,"ка ":28086,"ив ":1131,"зав":1648,"зае":830,"ид ":2167,"зви":1188,"зве":5171,"зва":3840,"зац":2538,"зат":1310,"зар":1064,"зап":3505,"зан":2665,"защ":954,"зда":4154,"зво":872,"ие ":14021,"ий ":4437,"ии ":2933,"зем":1420,"зик":4455,"зир":1742,"ил ":4244,"ик ":10314,"ин ":8393,"им ":2180,"зия":1529,"зит":1037,"ип ":824,"зма":848,"ио ":855,"зли":2352,"зна":4787,"зни":1000,"зно":822,"ир ":1662,"зов":1006,"ис ":2255,"зон":1141,"зпо":5089,"ит ":1689,"зпр":1401,"зра":1726,"зпъ":815,"зсл":944,"зто":3205,"ич ":1609,"зхо":1090,"ия ":57202,"ius":2045,"is ":934,"ion":1052,"ьор":2769,"km ":885,"южн":872,"km²":3379,"ъцк":1859,"ътр":1736,"ъще":1730,"ъщо":1266,"he ":1146,"ъде":1407,"ъед":2623,"ъве":1335,"ъзд":2638,"ълн":1684,"ълг":11342,"ълж":1010,"ърн":1183,"ърк":1004,"ърж":2771,"ърз":1179,"ърх":1303,"ърт":1540,"ърс":856,"ърш":1020,"ърц":1189,"ъст":3892,"ъпр":1168,"ърв":4886,"ърд":852,"ia ":930,"er ":1669,"es ":1352,"яко":2322,"яло":937,"ява":6515,"юци":3278,"ючв":950,"ят ":9602,"яне":821,"яни":1186,"яма":1227,"ята":8869,"ято":2957,"уци":861,"уча":2759,"учи":956,"уче":1377,"уши":3537,"фес":938,"фер":834,"фан":1085,"фин":933,"физ":1170,"фил":3622,"фия":1796,"фиц":1324,"фут":2220,"фре":1952,"фор":3324,"фон":806,"ца ":4811,"ци 
":3879,"хан":805,"хар":1263,"хол":1095,"хор":974,"хов":933,"ход":1906,"хим":863,"стн":3691,"сто":10615,"стр":11666,"ств":16841,"сте":8055,"сти":11129,"ста":11874,"стъ":2246,"сту":903,"стт":2922,"сце":1326,"съд":995,"съв":1667,"съз":2361,"със":4442,"сък":852,"сън":863,"същ":2787,"сьо":1331,"тав":5484,"так":1485,"тал":3945,"тан":6731,"тай":879,"тат":3225,"тар":3238,"тбо":3512,"твъ":860,"тво":9227,"тви":1621,"тве":3786,"тва":3607,"тех":952,"тем":3541,"тел":16601,"тео":1008,"тен":6993,"тер":6816,"тет":1731,"тес":853,"тез":1494,"тек":1131,"тив":4139,"тие":1898,"тка":1702,"ум ":1519,"тич":5203,"тиц":909,"тия":3555,"тин":3775,"тик":4630,"тил":1307,"тир":2210,"тис":902,"тит":2981,"тла":1006,"тно":3117,"ток":1533,"тол":2748,"той":1268,"тов":5201,"тни":5121,"тна":2467,"тре":4236,"тра":10350,"три":4952,"тор":11467,"тот":821,"том":2089,"тон":2179,"ус ":1180,"топ":1014,"точ":3383,"тоя":1817,"тта":3294,"тро":5709,"тру":1929,"тск":2626,"тур":4751,"тър":5898,"тът":942,"тън":1062,"ува":2742,"уги":1696,"уга":1360,"уар":1716,"убл":3878,"узи":1937,"уди":922,"удо":1145,"уме":1194,"ума":1453,"улт":1584,"ули":1004,"ула":1122,"укт":810,"ука":1012,"упр":1211,"ура":2417,"ург":1080,"ури":1177,"упа":3307,"уна":1368,"уни":1783,"уст":2872,"утб":2290,"урс":982,"урн":1905,"уск":2598,"уси":1038,"що ":2387,"шни":1037,"шна":1940,"шин":804,"ще ":3484,"шен":1940,"щи ":1591,"ъв ":2817,"ът ":6352,"ъс ":1987,"ър ":7548,"ън ":3472,"ъм ":3109,"щит":1320,"щин":2379,"ъл ":2252,"ък ":1412,"щен":865,"щес":2508,"ъг ":7573,"щат":7215,"m² ":3381,"on ":1530,"щта":2294,"че ":2916,"цен":4575,"чи ":992,"цел":1258,"цес":947,"цер":828,"циа":2597,"ции":1846,"цио":6596,"цит":1550,"ция":11555,"ча ":1350,"цар":1123,"цат":1708,"цки":1353,"чев":1376,"чен":5933,"чес":8800,"чер":977,"ши ":3970,"чет":1965,"чле":872,"чки":1198,"чин":1845,"чит":1626,"ша ":1093,"чва":2149,"час":5618,"чал":1786,"чан":1529,"чав":2550,"ща ":4581,"чре":802,"чна":4150,"чов":1436,"чни":4370,"чно":3174,"us ":4874,"ter":901},"n_words":[7994134,9177756,6462334],"name":"bg"}
\ No newline at end of file
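
The files deleted in these hunks are langdetect-style language profiles: one JSON object per language, whose "freq" map carries raw counts of character 1-, 2- and 3-grams, and whose three "n_words" entries appear to be the matching total gram counts. A minimal sketch of reading one back under that assumption (load_profile and the profiles/ path are illustrative names, not part of langdetect's API):

    # Minimal sketch, assuming the profile layout shown above.
    import io
    import json

    def load_profile(path):
        """Parse one profile into (language name, n-gram probability map)."""
        with io.open(path, encoding="utf-8") as fh:
            profile = json.load(fh)
        n_words = profile["n_words"]  # assumed totals for 1-, 2- and 3-grams
        probs = {}
        for gram, count in profile["freq"].items():
            # Normalise each raw count by the total for its gram length.
            probs[gram] = float(count) / n_words[len(gram) - 1]
        return profile["name"], probs
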
+++ /dev/null
-{"freq":{"D":455,"E":376,"F":388,"G":399,"A":894,"B":684,"C":848,"L":430,"M":683,"N":429,"O":309,"H":360,"I":507,"J":241,"K":244,"T":598,"W":240,"V":199,"P":651,"S":947,"R":531,"f":1089,"g":1979,"d":2581,"e":8201,"b":1267,"c":2589,"a":8669,"n":6047,"o":5561,"l":3930,"m":2503,"j":208,"k":699,"h":3040,"i":6685,"w":672,"v":711,"u":2926,"t":5635,"s":4238,"r":5985,"p":1752,"z":245,"y":1310,"x":420,"এলা":514,"ক। ":614,"ঐতি":234," l":233," m":235," o":526," i":296," d":312," f":245," a":616," b":228," c":315," t":765," p":493," s":393," r":292," J":235," K":227," H":321," I":408," N":324," O":231," L":375," M":610," B":602," C":719," A":760," F":332," G":372," D":404," E":296," S":771," R":448," P":577," W":226," T":506,"ا":336,"Co":219,"উ":9536,"এ":27004,"ঃ":514,"ং":15654,"ঁ":2141,"ই":18078,"আ":10399,"অ":13627,"চ":16928,"ছ":9717,"ঘ":1893,"ঙ":3088,"ঞ":3794,"ট":30397,"জ":27267,"ঝ":790,"ও":9899,"ঐ":344,"খ":8927,"গ":20205,"ঔ":204,"ক":78264,"ন":86702,"ফ":6705,"প":46274,"ভ":18017,"ব":70605,"য":69717,"ম":48748,"ড":11071,"ঠ":3258,"ণ":10122,"ঢ":578,"থ":13519,"ত":69776,"ধ":12543,"দ":33053,"স":56165,"হ":25168,"়":39420,"া":175719,"ি":114763,"র":156970,"ল":52543,"Ma":241,"শ":24901,"ষ":17272,"ৎ":1686,"্":145506,"ৌ":1675,"ো":19195,"ৈ":1879,"ে":113569,"ৃ":4705,"ূ":5615,"ু":27604,"ী":22483,"৭":2295,"৬":2181,"৯":5541,"৮":2706,"৩":2127,"২":3895,"৫":2215,"৪":1978,"১":8858,"০":4887,"৷":254,"In":199,"।":25409,"Th":253,"b ":285,"a ":1054,"i ":350,"ge":332,"ga":252,"he":773,"ha":472,"gh":243,"g ":316,"ea":343,"ec":205,"ed":350,"de":506,"di":332,"এছা":284,"h ":358,"el":402,"en":818,"em":278,"et":317,"es":737,"er":1291,"ca":315,"e ":2258,"be":225,"da":210,"f ":445,"ct":266,"co":263,"ci":201,"ch":362,"ce":347,"c ":220,"এটি":2063,"d ":875,"at":923,"as":401,"ar":1026,"al":912,"ai":231,"am":360,"an":1536,"ac":322,"ad":251,"ag":317," ১":7243,"nt":573,"ns":267,"no":211," ৯":250," ৮":298," ৭":311," ৬":364," ৫":417," ৪":387," ৩":666," ২":2429,"of":407,"om":400,"on":1111,"ol":393,"ot":205,"os":230,"ou":252,"or":795,"r ":1035,"pe":220,"lo":359," ঢ":514," ড":1933,"ll":307," ঠ":230," ধ":2352," দ":9809," থ":3157," ত":9984," ফ":3863," প":28296," ন":11037," য":6674," ম":15133," ভ":9746," ব":28530," ল":4344," র":8416,"o ":382," শ":7392,"ma":464,"mb":342," স":27697,"me":396," হ":13181,"mi":232,"একট":5136,"একজ":1493,"na":524,"একক":204,"nc":255,"nd":698,"ne":429,"ng":599,"ni":414,"একা":295," ।":1901,"m ":412," এ":26382," উ":6430,"এখা":461," অ":13571," আ":10126," ই":6407,"li":503," জ":9765," ট":1754,"le":571," ঘ":968," ছ":3198," চ":4584," ক":22576,"la":589," ঔ":203," গ":7129," খ":2591," ঐ":343," ও":5820,"n ":1470,"ht":261,"hu":305,"hi":274,"ho":238,"id":213,"ic":689,"ia":484,"ig":326,"ie":307,"ir":212,"is":631,"it":565,"il":352,"in":1040,"io":639,"l ":743,"এবং":4187,"এপ্":233,"y ":803,"ve":326,"x ":253,"ul":217,"ur":460,"us":340,"ut":207,"um":443,"un":241,"to":352,"tr":334,"te":708,"ti":997,"th":909,"ta":488,"ss":271,"st":614,"se":325,"এমন":239,"si":361,"rt":257,"ry":236,"ro":438,"ri":872,"re":667,"rd":212,"ra":839,"t ":920,"s ":1577,"এদে":277,"এই ":3233," �":274,"এক ":911,"এর ":2530,"উনি":291,"উন্":360,"উদ্":622,"উপা":289,"উপর":460,"উপন":490,"উরো":279,"উল্":354,"উৎস":239,"উৎপ":278,"উৎ":563,"উর":543,"উপ":2213,"উস":347,"উল":530,"উক":210,"উজ":216,"উচ":390,"উট":503,"উত":973,"উন":827,"উদ":753,"ইন":1626,"ইব":205,"ইম":223,"ইয":591,"ইর":537,"ইল":683,"ইস":741,"উই":375,"এছ":287,"এখ":589,"এক":9005,"এম":465,"এব":4214,"এপ":275,"এন":320,"এদ":306,"এট":2154,"এই":3366,"ছর 
":262,"ংশ":1413,"ংস":1069,"ংল":2600,"ংর":2237,"ংব":422,"ংঘ":236,"ংগ":523,"ংখ":753,"ংক":757,"ঁর":727,"ৎ ":620,"ইত":723,"ইট":668,"ইড":241,"ইজ":233,"আস":454,"আল":1088,"ইক":452,"আয":323,"অ্":840,"আম":516,"আর":1419,"আফ":296,"আব":708,"আধ":289,"আদ":422,"আন":1004,"ইউ":806,"ইং":2317,"আছ":551,"অস":568,"আগ":507,"আক":520,"অল":207,"অর":1154,"অভ":838,"অব":2125,"অফ":360,"অপ":433,"অন":3326,"অধ":754,"আই":743,"অথ":198,"অঞ":597,"অত":316,"অক":381,"অঙ":287,"অং":592,"ঙা":410,"চট":225,"চত":213,"চন":791,"ঙ্":2470,"৩ ":950,"চর":577,"চল":2320,"চী":679,"চু":374,"চা":2514,"চি":4076,"চে":985,"চ্":2421,"ছব":324,"৪ ":965,"ছর":393,"জগ":223,"ঘট":513,"গা":2158,"গস":249,"গি":590,"গী":642,"গু":2497,"গো":907,"গ্":3546,"গে":1783,"১ ":1256,"চক":208,"২ ":1015,"ঝা":497,"গোল":259,"৭ ":899,"টক":320,"টপ":202,"টন":552,"ঞ্":2042,"টব":433,"ঞা":1608,"টা":3413,"৮ ":899,"টর":259,"জন":5856,"ছে":4099,"জধ":330,"ছি":2738,"ছু":520,"গ্র":2862,"ছা":810,"গ্য":399,"জর":278,"৫ ":1051,"জল":300,"ছো":231,"জম":242,"জয":281,"জো":267,"জে":2691,"জু":830,"জী":975,"জি":3650,"জা":4328,"৬ ":1035,"জ্":3683,"গের":766,"ওয":1866,"ক।":744,"ঐত":235,"এশ":223,"এল":685,"এর":2874,"এস":513,"ক্":11379,"খন":635,"কে":8760,"কো":2299,"কী":339,"কি":3416,"কা":12553,"কৃ":1298,"কু":1325,"কস":263,"গর":1257,"০ ":1923,"গল":479,"খে":1085,"গন":254,"গব":429,"খ্":2313,"গঠ":636,"খি":334,"খা":2637,"খু":253,"গড":267,"গত":723,"গণ":853,"কল":1752,"খক":249,"কভ":207,"কম":1122,"কয":216,"কর":9263,"কব":584,"কথ":578,"কদ":234,"কন":446,"কত":422,"কট":5691,"কজ":1571,"কক":371,"পক":777,"নো":1529,"নৈ":281,"ন্":18037,"পথ":251,"পত":1140,"পন":1278,"পদ":1221,"পড":271,"পট":230,"পঞ":305,"নর":378,"৷ ":217,"নন":368,"ধে":496,"নপ":524,"নব":764,"নভ":385,"ধ্":3005,"নম":449,"নয":630,"নু":2067,"নী":3272,"নে":8228,"নস":741,"নি":10383,"না":9095,"বচ":430,"বছ":386,"বঙ":528,"বক":640,"ফল":533,"ফর":691,"পৌ":262,"প্":14673,"পো":493,"বপ":409,"বন":1731,"ফে":647,"বদ":516,"বত":795,"ম।":432,"ফু":801,"বড":289,"ফি":694,"ফা":803,"পশ":1040,"পস":313,"পল":236,"পম":204,"পর":6275,"পে":1955,"পৃ":469,"পূ":1624,"বই":313,"পি":1741,"পা":6024,"পু":2560,"পী":458,"বং":4443,"ভর":238,"ব্":5813,"মক":698,"মগ":375,"ভি":3000,"ভা":10166,"মত":941,"র।":1267,"ভু":429,"ভূ":887,"মণ":247,"ভে":813,"মন":1866,"মদ":356,"মধ":1657,"মব":565,"ভো":225,"মপ":208,"বব":837,"ফো":381,"বয":356,"ফ্":917,"বর":3916,"বল":2788,"বশ":202,"বহ":1569,"বস":2742,"বা":13175,"বি":13020,"বী":1634,"বু":748,"য।":373,"বৃ":690,"বে":6807,"বৈ":512,"বো":1000,"ভব":322,"রগ":376,"রক":3370,"রও":363,"রট":549,"যা":13478,"য়":35819,"রজ":517,"রচ":1226,"যস":235,"রঙ":226,"রদ":1297,"রধ":882,"রত":4811,"রথ":1065,"ল।":866,"যু":2179,"রণ":2957,"যি":521,"রম":1030,"রয":914,"যো":1322,"রব":1734,"যৌ":231,"রভ":428,"রপ":736,"রন":1614,"যে":3967,"মল":265,"যক":743,"ময":969,"ভ্":389,"মর":648,"মহ":1061,"মস":593,"মূ":1527,"মৃ":459,"যত":845,"যদ":240,"ঙাল":366,"মা":10507,"মি":3934,"মী":801,"মু":1962,"মো":1026,"যব":1634,"মৌ":259,"যম":725,"ম্":4843,"যন":902,"মে":4592,"ড়":3388,"ডা":1047,"ডি":1739,"ঠি":877,"ঠা":1161,"৯ 
":835,"টে":2184,"ঠন":362,"ট্":2862,"টো":693,"টু":322,"টি":14501,"ণী":616,"ণি":777,"দ।":351,"ণা":1071,"তক":615,"ণত":463,"ঢা":467,"ড্":286,"ডো":228,"ডে":1416,"ত।":1976,"থা":4072,"থি":1849,"ন।":3725,"থন":275,"তে":6636,"তৈ":476,"থব":514,"তো":511,"ত্":11770,"থম":1047,"দক":1036,"তা":9664,"তি":11706,"তৃ":563,"তী":2327,"তু":1609,"তথ":334,"তত":465,"তন":1191,"ণে":1383,"ণ্":713,"তম":1948,"তব":730,"তল":215,"তর":2978,"নট":466,"ধা":3367,"নত":805,"নদ":832,"ধি":1235,"ধী":706,"ধু":525,"দো":593,"দ্":6813,"ধন":270,"দে":7678,"দৈ":238,"নক":807,"নগ":632,"ধর":1424,"নও":200,"দস":274,"দশ":433,"দূ":217,"ধত":234,"দৃ":215,"দী":1553,"দু":1592,"দা":3394,"দি":3189,"থে":3273,"দন":421,"ছে ":1536,"জন ":2036,"দল":773,"দর":900,"থ্":402,"হন":525,"সে":5003,"সী":651,"সু":1284,"সূ":479,"হণ":341,"হত":965,"সৃ":307,"সা":9422,"সি":3847,"সহ":471,"হচ":543,"হল":1575,"স্":12917,"হম":242,"হর":1722,"হয":5736,"সো":556,"হে":854,"হৃ":361,"়।":3879,"হু":683,"হি":3582,"হী":535,"হা":4314,"হ্":705,"হো":256,"়ক":477,"়ত":462,"়ন":1084,"া।":1709,"়ম":211,"়র":208,"ি।":908,"়ু":308,"়ী":503,"াং":3360,"ছু ":408,"াঁ":1482,"়ি":1178,"়া":8455,"়ো":436,"াই":3512,"়ে":7915,"াউ":581,"াও":962,"াক":5532,"়্":205,"াচ":1101,"াছ":570,"াজ":4489,"াখ":950,"াগ":2341,"াঙ":862,"াথ":1203,"াত":6156,"াণ":1382,"ী।":871,"াড":1549,"িং":1140,"াঠ":366,"াট":1770,"াঞ":296,"াভ":908,"াব":4643,"াফ":353,"াপ":2739,"ান":18665,"িউ":1255,"াধ":2334,"াদ":5045,"িক":11087,"িখ":922,"াল":10227,"িও":631,"ার":30756,"িএ":231,"াম":7273,"ায":8979,"িছ":516,"িজ":2552,"াস":5883,"িচ":1754,"াহ":2942,"িগ":546,"াশ":1723,"াষ":6041,"িট":1865,"িড":639,"িত":12239,"িণ":858,"িদ":2453,"িন":8978,"িধ":463,"িপ":1269,"িব":2698,"িফ":329,"িম":2790,"িভ":1825,"াৎ":271,"িয":6762,"ির":6023,"িল":5504,"ীক":817,"িশ":3794,"ীগ":209,"িষ":2506,"িস":4278,"িহ":941,"ুই":664,"ীদ":382,"ীত":1782,"ীপ":505,"ীন":1798,"ীয":4089,"িৎ":317,"ীম":347,"ীব":842,"ীল":338,"ীর":2436,"ুগ":541,"ুক":2294,"ুখ":360,"ুজ":362,"ুট":930,"র্":19499,"রো":2284,"রে":15121,"রূ":464,"রু":2561,"রী":3578,"রা":17630,"রি":11429,"রস":1549,"রহ":932,"রশ":407,"রল":454,"লয":931,"লম":504,"লব":520,"লন":1184,"লত":1046,"লট":257,"লচ":904,"লগ":240,"লক":1697,"ল্":3376,"লো":3053,"লে":9966,"লী":1486,"লু":671,"লা":9534,"লি":6096,"লস":278,"শক":649,"শব":592,"শন":1507,"ষক":239,"শর":329,"স।":308,"শত":558,"শট":242,"শে":3570,"ষম":327,"শ্":3936,"শো":350,"ষয":380,"সক":584,"শহ":1521,"শু":848,"শী":865,"সং":3293,"শি":3267,"শা":2252,"ষণ":732,"সন":1003,"ষে":1161,"সফ":273,"সব":1306,"সভ":427,"সম":3977,"ষ্":5176,"সর":1993,"সল":739,"হক":226,"সঙ":453,"ষা":5502,"ষি":1250,"সত":301,"সদ":563,"ৎস":482,"ৎপ":325,"্শ":719,"্স":2033,"্ষ":4401,"্ম":4357,"্ভ":1012,"্র":30653,"্য":25184,"্ল":2305,"্ত":14406,"্ণ":1498,"্ড":2937,"্ঠ":1626,"্ট":7869,"্ঞ":1750,"্ব":10276,"্প":3453,"্ন":2407,"্ধ":2336,"্দ":4738,"্থ":5602,"ৌল":231,"ৌর":302,"্ছ":784,"্জ":1742,"্চ":4084,"্গ":2822,"্ঘ":304,"্ক":3885,"োয":771,"োর":1773,"োল":1204,"োব":621,"োভ":229,"োম":910,"োষ":455,"োস":598,"োহ":395,"োঝ":365,"োজ":493,"োড":367,"োট":842,"োদ":303,"োত":491,"োপ":763,"োন":1801,"োধ":294,"োক":1031,"োচ":456,"োগ":1362,"গঠি":273,"খান":1047,"গঠন":344,"ৈর":614,"েশ":5180,"েল":4085,"ের":25063,"েয":1289,"েম":1903,"েভ":263,"েব":2130,"েপ":618,"েন":8060,"ৈত":343,"েহ":255,"েষ":1422,"েস":1309,"েও":994,"েখ":1729,"েক":4154,"েই":1429,"েউ":248,"েট":1435,"েড":786,"েত":1719,"েদ":505,"েগ":411,"েছ":3105,"েজ":2937,"ে।":4052,"ৃথ":340,"ৃত":2338,"ৃহ":465,"ৃষ":756,"ূত":331,"ূপ":414,"ূম":517,"ূর":2094,"ূল":1187,"ূহ":378,"ুত":1038,"ুড":349,"ুন":1462,"ুদ":1447,"ুব":643,"ুপ":599,"ুয":1183,"ুর":4508,"ুভ":198,"ুম":876,"ুল":3399,"ুস":1262,"ুষ":874,"ুশ":279,"কেট":226,"কেন":73
8,"কের":985,"কোন":942,"ক্ত":2888,"ক্ট":455,"ক্ষ":3987,"ক্ল":430,"ক্র":1781,"ক্য":806,"ক্স":679,"কিছ":467,"কাহ":230,"কিন":953,"কাব":216,"কায":382,"কার":4553,"কাল":1045,"কাশ":812,"কিল":233,"কিস":331,"কুল":267,"কুর":331,"কুম":261,"কৃত":1025,"কৃষ":259,"কে।":511,"কেই":224,"৮০":213,"৭১":230,"৯৯":610,"৯০":387,"৯৫":459,"৯৬":482,"৯৭":685,"৯৮":501,"৯১":321,"৯২":343,"৯৩":359,"৯৪":434,"৮৯":204,"৮৮":200,"গুর":367,"গুল":1495,"৩০":296,"২৪":220,"গীত":413,"৫০":212,"গান":542,"গার":291,"গায":230,"১৬":326,"১৫":332,"১৮":996,"১৭":415,"১২":292,"১১":238,"১৪":244,"১৩":256,"১৯":3860,"২০":1142,"০০":1455,"১০":452,"গস্":210,"্র।":452,"্যক":735,"্মা":901,"্মী":308,"্মি":327,"্যন":687,"্মে":324,"্যত":807,"্যম":709,"্যব":1629,"্রচ":617,"্যস":234,"্রক":1414,"্যু":488,"্যি":244,"্যা":10276,"্রজ":405,"গবে":354,"্বত":419,"্বন":491,"্বপ":359,"্বব":695,"্বর":1021,"্বা":1809,"্য।":373,"্বি":841,"্বী":559,"্বে":1025,"্বো":213,"্মগ":199,"্ভা":239,"্লা":616,"্লি":367,"গরে":240,"্লে":524,"্রব":578,"্যো":301,"্রম":577,"্রভ":304,"্রয":464,"্রত":2154,"্রণ":361,"্রদ":1113,"্রথ":1019,"্রন":758,"্যে":2625,"্রধ":869,"্রপ":383,"্রা":4952,"্রি":3967,"্রী":1344,"্রু":685,"খ্য":1786,"্রশ":293,"খ্র":509,"্রস":463,"্রহ":735,"্রো":780,"্রে":2630,"্ষণ":271,"গণি":276,"্ষি":1061,"্ষা":679,"্ষে":819,"্ষম":296,"গড়":257,"্শন":406,"খেল":564,"্সে":379,"্সি":438,"গত ":349,"খা ":665,"কজন":1518,"খে ":204,"। ":19077,"কলে":496,"কল্":282,"করত":378,"করণ":527,"করে":4422,"করা":2433,"কলক":407,"কর্":871,"কম্":406,"কাজ":532,"কাছ":278,"কাত":521,"কান":552,"গে ":596,"গর ":415,"কটি":5240,"ঃ ":340,"ং ":5055,"কবি":409,"ই ":6830,"উ ":312,"কথা":468,"ও ":6690,"এ ":847,"চ ":769,"খ ":354,"গ ":1629,"ক ":11586,"ত ":12607,"ণ ":3613,"ঠ ":296,"ড ":1706,"ট ":3372,"জ ":1903,"ব ":3202,"ফ ":611,"প ":1284,"ন ":20715,"ধ ":808,"দ ":2233,"থ ":1007,"কা ":1480,"ল ":8450,"ওয়":1861,"ভ ":587,"ম ":6723,"কি ":282,"য ":4809,"র ":54334,"হ ":1019,"় ":12855,"শ ":3191,"খন ":279,"কে ":5302,"ষ ":1259,"স ":4530,"া ":26160,"ী ":7142,"ু ":2897,"ি ":25805,"ো ":2835,"্ ":1910,"কল ":227,"ে ":37314,"তর ":887,"ডার":368,"ডিস":334,"ডিয":319,"তন ":462,"ণে ":593,"তম ":977,"ড়া":1089,"ড়ি":682,"ড়ে":451,"১৯০":231,"১৯১":267,"তু ":433,"১৯৪":366,"১৯৫":361,"১৯২":273,"১৯৩":285,"১৯৮":393,"১৯৯":498,"১৯৬":398,"১৯৭":613,"তী ":298,"তে ":4456,"ডের":406,"ডেন":243,"তি ":2372,"তা ":2207,"ঠান":465,"ঠিত":686,"২০০":842,"দ। ":304,"ণা ":499,"টাব":261,"টান":241,"টিক":566,"টার":1384,"টাল":237,"টিত":421,"টিন":246,"টিশ":274,"টির":990,"টাই":256,"ডে ":223,"ণত ":291,"টোব":204,"ট্র":2092,"ট্য":377,"ট্ট":258,"টেম":345,"টের":443,"টেল":237,"ঞান":1488,"০০০":220,"ত। ":1457,"ডি ":251,"ঞ্চ":1271,"ঞ্জ":764,"টবল":369,"ড় ":618,"টি ":10700,"জ্ঞ":1748,"টা ":567,"জ্য":1590,"টে ":252,"ঠন ":205,"ঝায":280,"ঠা ":199,"ছোট":215,"জয়":273,"টন ":232,"জান":513,"জাত":1408,"জার":1034,"জিক":202,"জিত":234,"জীব":714,"জুল":281,"জুন":236,"জেন":272,"জেল":1183,"জের":493,"চ্চ":1435,"চ্ছ":780,"ছবি":323,"ছিল":2594,"ছাড":436,"জধা":329,"ছেন":778,"জনপ":400,"জনী":266,"জন্":2129,"ছে।":1579,"জি ":2005,"৭১ ":202,"ঙ্গ":2125,"ঙ্ক":292,"জা ":285,"চনা":453,"চন্":198,"জে ":350,"চলি":385,"চলে":500,"চলচ":862,"চরি":276,"চিত":2401,"চাল":768,"চার":933,"চিম":924,"The":204,"চেয":454,"চীন":632,"পৃথ":340,"মক ":218,"পূর":1563,"পুর":1770,"পুত":228,"বংশ":207,"পের":502,"পেন":312,"পশ্":957,"পাশ":229,"পিত":257,"পাক":348,"পান":707,"পিউ":331,"পাত":414,"পাদ":424,"পাধ":205,"পার":1335,"পাল":505,"পায":279,"পাও":244,"পরে":327,"পর্":1509,"বী ":353,"পরি":2733,"য। ":281,"পরব":233,"বে ":2767,"পন্":745,"পদা":548,"পদ্":310,"বা ":2607,"বি ":661,"পঞ্":305,"বর ":1036,"পড়":268,"বল ":484,"পত্":595,"পতি":403,"বন 
":272,"ন্ত":3819,"ন্ড":1774,"ন্ট":1141,"ন্স":716,"ন্থ":421,"ন্দ":2863,"ন্ধ":523,"ন্ন":1373,"ন্ম":1138,"ন্য":3823,"ম। ":348,"নেক":580,"নেত":451,"নের":3172,"নৈত":249,"নোব":216,"নুয":400,"নুষ":645,"নুস":424,"নিস":305,"নীত":622,"নীয":704,"নীর":259,"নিত":220,"নিজ":306,"নির":1511,"নিয":1391,"নিব":223,"নিম":242,"নাথ":204,"নী।":283,"নাট":333,"নিক":1359,"নাল":376,"নার":884,"নায":377,"নাম":2830,"নিউ":358,"নান":283,"নাই":230,"না।":209,"পে ":392,"বং ":4190,"নয়":574,"ধ্য":2527,"পি ":200,"নভে":235,"ধ্ব":397,"নপ্":410,"নদী":606,"ধের":201,"নতা":305,"নটি":399,"al ":424,"পর ":878,"ধার":1388,"ধিক":529,"ধান":1581,"ধুন":236,"ধীন":509,"and":351,"an ":333,"নগর":339,"ধর্":622,"নকা":301,"দ্দ":285,"দ্র":1656,"দ্য":1705,"দ্ব":1242,"দ্ভ":351,"দ্ধ":1443,"ধরণ":208,"ধরন":300,"দোল":241,"ধতি":214,"দেখ":386,"দেশ":3340,"দেব":447,"দের":2400,"দেয":262,"দীর":527,"দুই":388,"দুর":229,"পক ":215,"ati":407,"দর্":464,"০০ ":448,"দলে":293,"নী ":1022,"দশক":219,"নে ":2592,"নো ":750,"দস্":252,"দায":342,"দান":683,"দিক":556,"দার":1068,"দিয":527,"দিন":460,"ন্ ":258,"থিত":1239,"থিব":279,"থান":907,"থার":216,"থাপ":342,"থের":228,"থেক":1914,"থ্য":318,"নি ":3145,"না ":2078,"ত্ত":2681,"ত্ব":1563,"ত্র":5371,"ত্য":1749,"ত্ম":200,"১০ ":255,"ধি ":218,"দক্":833,"ধে ":246,"থাক":949,"তাঁ":801,"তি।":212,"তাক":433,"তা।":273,"তিত":496,"তিন":2758,"তিব":469,"তিয":323,"তির":977,"তিহ":514,"তিষ":915,"তিস":234,"তাদ":379,"তান":831,"তাব":456,"তায":251,"তার":2316,"তাল":484,"তিক":1444,"তুল":199,"তুর":359,"তীর":252,"তীয":1406,"তৃত":307,"তের":1547,"তৈর":454,"থবি":319,"তর্":666,"দী ":536,"তরা":670,"তরে":281,"দু ":322,"ণ্ড":418,"দা ":406,"তমা":738,"দি ":552,"ণ্য":221,"তবে":327,"দে ":324,"ণিত":430,"দর ":218,"তন্":360,"ণের":721,"তথ্":200,"তত্":385,"থা ":838,"ন। ":2670,"থে ":916,"তকে":220,"ঢাক":444,"থম ":866,"রকা":1796,"লত ":375,"রচন":240,"রকে":273,"রক্":515,"লন ":417,"য়ক":403,"য়ন":1056,"য়ত":420,"রজা":344,"য়র":205,"য়ম":204,"য়ো":399,"য়ে":7441,"য়ু":262,"য়ী":440,"য়ি":464,"য়া":7334,"রচল":394,"রচি":290,"য়।":3790,"যুগ":221,"যুক":999,"যুদ":441,"যাক":626," । ":1068,"যাট":233,"যাত":1108,"যাদ":399,"যান":2143,"যাপ":621,"যাব":244,"যাম":399,"যাল":1187,"যার":1249,"যায":1288,"যিক":236,"যাস":933,"রটি":492,"যিন":243,"রথম":978,"রত্":227,"রতে":1478,"রতি":1816,"রতী":622,"রণে":668,"রণা":328,"রণত":263,"রপত":250,"ion":522,"রন্":434,"রনা":231,"রনে":444,"যেম":243,"যের":1196,"রধা":866,"রদে":667,"রদা":349,"রভা":262,"লি ":1224,"রবা":369,"রবি":326,"লা ":2174,"রবর":362,"যোগ":846,"রমা":242,"লী ":505,"রয়":679,"লে ":3437,"রহণ":319,"রস্":785,"লো ":880,"ল্ ":250,"রাই":307,"রাক":460,"রাচ":539,"রাজ":2647,"রাখ":212,"রাণ":570,"রাপ":301,"রাব":230,"রান":1015,"রিক":1905,"রার":655,"রাম":984,"রায":1265,"রিজ":258,"রাস":948,"রিচ":1336,"রাহ":202,"রাশ":201,"রাষ":1115,"রিত":670,"রিট":394,"রিম":358,"রিব":807,"রিন":213,"রিস":680,"রিয":1449,"রিল":310,"রীক":256,"রীয":402,"রীর":289,"রুয":235,"রুত":425,"রূপ":403,"রে।":1025,"রেন":1720,"রের":3004,"রেল":310,"রেশ":198,"রেছ":854,"রেজ":2190,"রেট":233,"রেক":207,"রেস":215,"রোম":235,"রোপ":366,"রোগ":263,"র্ণ":1236,"র্ত":1912,"র্ড":513,"র্ট":567,"র্ব":2473,"র্ধ":342,"র্ন":427,"র্থ":1669,"র্দ":480,"র্ল":198,"র্য":1584,"র্ম":2023,"র্ভ":416,"র্স":595,"র্ষ":411,"র্শ":716,"র্চ":340,"র্জ":782,"র্ক":1521,"র্ঘ":284,"র্গ":673,"লকা":488,"লচ্":855,"লক্":395," Ma":238,"he ":468,"লতে":397," Co":216,"লটি":222,"লনা":313," Th":233,"লয়":917,"লম্":237,"লাই":481,"লাক":628,"লান":256,"লাদ":1657,"লাস":227,"লিখ":198,"লিক":746,"লার":1528,"লাভ":487,"লায":309,"লাম":488,"লিপ":223,"লিন":254,"লিত":865," of":374," 
an":214,"লে।":338,"লেও":259,"লের":1832,"লেন":1829,"লেজ":413,"লেখ":829,"লেক":269,"লীন":213,"লিম":293,"লিয":577,"লির":379,"ing":263,"লীয":377,"in ":228,"ল্প":990,"ল্ড":206,"ল্য":834,"ল্ল":756,"স। ":246,"লোচ":236,"লোম":202,"লোর":312,"লোয":202,"লোক":706,"শন ":671,"hum":238," th":574,"মত ":200,"মন ":560,"ফরা":505,"র। ":1082,"প্ত":664,"প্য":216,"প্র":13060,"প্ল":304,"প্ট":290,"পৌর":203,"ভা ":303,"বনি":284,"er ":443,"বনে":288,"বন্":515,"es ":375,"ফেব":217,"বচে":366,"বঙ্":528,"ent":254,"বছর":385,"ফুল":203,"বড়":273,"ফুট":478,"ফার":283,"বলে":833,"মে ":1672,"বস্":2112,"বহু":351,"বহৃ":337,"বহা":619,"ফোর":229,"ববি":644,"বপূ":262,"মি ":517,"ফ্র":706,"মা ":428,"বয়":292,"মী ":371,"বলা":524,"বলত":367,"বর্":2305,"বৃহ":359,"বেক":203,"বেল":404,"বের":931,"বেশ":937,"বেষ":386,"বোঝ":365,"য় ":12199,"বাই":313,"বাং":2410,"বান":279,"বাদ":861,"বাধ":440,"বার":1699,"বাম":198,"বায":216,"বাঙ":403,"বাজ":214,"বাচ":199,"বিত":730,"বিদ":1525,"বিধ":272,"বিন":362,"বিপ":215,"বিব":278,"বিভ":1099,"বিয":248,"বির":389,"বাল":212,"বিখ":512,"বিক":671,"বাহ":872,"বিচ":203,"বাস":937,"বিজ":1409,"রও ":300,"বীর":327,"বীপ":377,"বিষ":640,"বিস":375,"বিশ":2344,"ভাই":208,"রম ":217,"মগ্":268,"রন ":224,"যে ":1841,"রত ":384,"রণ ":1177,"ল। ":672,"ব্র":1160,"ব্য":3166,"ব্দ":1107,"যা ":1724,"রা ":4296,"রি ":1578,"ভ্য":238,"মবঙ":362,"মন্":664,"ভেম":223,"মধ্":1555,"মতা":307,"ভূম":496,"ভুক":240,"ভাগ":718,"ভার":2311,"ভাব":1222,"ভাষ":4828,"ভিয":254,"ভিন":1250,"ভিত":372,"ed ":213,"মাই":303,"মিন":199,"মিত":518,"মিট":419,"মাস":327,"মিক":501,"মাল":710,"মার":1752,"মান":3429,"মাত":575,"মাধ":574,"মাণ":369,"মাজ":400,"রো ":310,"মহা":801,"মস্":221,"রে ":4008,"যক্":405,"রু ":494,"রী ":1730,"ময়":958,"মৌল":207,"ম্র":226,"ম্য":700,"ম্ভ":243,"ম্ম":363,"ম্ব":1398,"ম্প":1633,"যমে":422,"যন্":880,"যবস":455,"যবহ":926,"মেন":347,"মের":1210,"মিশ":280,"মিল":414,"মীয":198,"মুখ":293,"মুক":494,"মুদ":244,"মুস":261,"মূল":1049,"লক ":453,"মূহ":371,"মৃত":392,"যতম":572,"সলা":347,"re ":199,"হচ্":536,"সাং":239,"সাম":709,"সায":256,"সার":942,"সাল":2557,"সিক":500,"সাধ":708,"সান":242,"সাব":458,"সিত":294,"সাহ":727,"সাই":326,"সাথ":746,"সাগ":439,"সীম":247,"সিন":266,"সির":308,"়ক ":242,"সূর":236,"সত্":211,"সদস":228,"ষেত":374,"সন্":338,"ষের":340,"সবচ":368,"সবা":223,"ষ্ক":275,"ষ্ণ":260,"ষ্ট":2671,"সভা":316,"ষ্ঠ":1554,"সময":711,"ষ্য":236,"সমা":613,"সমূ":375,"সম্":1299,"সরক":840,"সর্":566,"়ন ":433,"হিন":860,"হিস":1250,"হাদ":208,"হান":202,"হাম":272,"হায":221,"হার":1136,"হাস":659,"হিত":775,"হাজ":205,"হৃত":340,"সেই":206,"সেম":294,"সেব":1065,"সেপ":226,"সেন":564,"হতে":419,"সৃষ":262,"হত্":332,"সের":1064,"হয়":5615,"া। ":1439,"স্ক":1611,"স্ট":2276,"স্ত":2428,"স্থ":3469,"স্প":563,"স্ব":1109,"স্য":561,"হলে":402,"হলো":289,"হরে":416,"াগ ":429,"াও ":315,"াক ":329,"়া ":2568,"়ি ":218,"হ্য":294,"হের":399,"াই ":889,"়ে ":3374,"়ী ":420,"ি। ":726,"াল ":1724,"িও ":348,"ার ":13003,"াম ":1773,"াভ ":401," ১০":339,"াব ":352," ৩০":204," ১২":242,"়নে":202," ১৩":220," ১৪":201," ১৫":282," ১৬":286," ১৭":371," ১৮":940," ১৯":3783,"াহ ":308," ২০":1046,"াস ":1335,"াশ ":330,"িক ":5573,"িং ":561,"ী। ":724,"াট ":241,"াজ ":536,"াপ ":211,"ান ":5410,"াদ ":431,"াণ ":268,"াত ":1005," এই":3270," উৎ":560,"ng ":235,"শে ":909," উদ":684," উন":258," উত":959," উচ":373," উল":389," উপ":2191," ইত":669," আস":453," আয":322," অ্":835," আম":509," আর":1370," আল":1083," আধ":289," আদ":422," আন":905," ইউ":775," আফ":294," আব":706," উই":304," ইস":476," ইল":209,"nd ":283," ইর":274," ইয":287," ইন":731," অঙ":286," অক":379," অত":316," অঞ":596," অপ":429," অফ":358," অব":2124," অথ":198," আই":665," অধ":751," অন":3315," অল":206," আক":517," অভ":831," অর":1146," 
আছ":551," আগ":504," অস":564," ইং":2299,"শি ":343," অং":591,"শী ":336,"ষণ ":226," জ্":458," জা":2134," জি":334," জী":551," জু":646," জে":1308," জো":204," জল":229," ছো":231," ছি":1929," ছা":399," জন":3064," ছব":300," চা":848," চি":755," চে":275," চল":1081," চর":388," ২ ":205," ১ ":302," গ্":1853," গো":546," গে":282," গু":745," গি":203," ঘট":372," গা":986," গব":363," খ্":684," খে":607," গণ":589," গঠ":432," খা":596," খু":226," কা":3277," কি":1369," কু":537," কৃ":271," কে":1060," কো":1627," ক্":2305," কন":254," কথ":522," কব":408," কর":8000," কম":804," কল":1072," ওয":520," এস":383," এশ":223," এর":2835," এল":645," ঐত":234," এছ":286," এখ":588," এক":8988," এম":399," এব":4212," এপ":273," এন":256," এদ":302," এট":2135," ফে":435," বন":336," ফা":463," বড":264," ফি":389," ফু":675," বছ":364," পো":225," পৌ":257," প্":11714," ফর":609," ফল":347," বে":1555," বো":669," বৈ":440," বি":7573," বা":7403," বৃ":550," বু":368," বস":531," বহ":441," ফ্":592," বল":1749," বর":1400," মধ":1533," মন":645," ভে":259," ভূ":442,"শক্":282," মত":465," ভা":7371," ভি":710,"শকে":227," ব্":3507," ম্":464," মৌ":257," মো":583," মে":1083," মৃ":318," মূ":802," মু":1346," মি":1118," মা":4397,"সন ":262,"ষে ":226," মহ":923," মর":212," ধা":649," নদ":623," দে":2058," দৈ":202," দ্":1551," ধর":1182," নগ":253," না":3576," নি":3394," নী":245," নে":783," ধ্":292," নভ":274,"of ":368," পঞ":266," পড":228," পদ":913," পত":409," ন্":272," নো":287,"সব ":353," পু":1605," পা":3594," পি":495," পে":490," বই":234," পূ":997," পৃ":428," পর":4825," পশ":1001," তব":328," তত":253," তথ":332," তি":2653," তা":3867," তু":325," দক":818," ত্":257," তৈ":474,"ষা ":1614," তে":303," থা":1028," দল":654," দর":261," থে":1958," দি":1421," দা":849," দু":810," দশ":293," টা":272," টি":345," টে":546," ট্":234," ডি":734," ডা":459," ডে":331," ঢা":461," হয":5669," স্":3980," সো":334," হল":1487," সি":1221," সা":6870," সহ":408," হচ":537," সে":1681," হত":623," সৃ":267," সূ":402," সু":981," সী":216," সব":807," সম":3235," সন":329," সর":1532," সঙ":383," সত":207," সদ":329," হো":212," হ্":276," হা":1267," হি":1859," হে":312," শক":251," শা":1199," শি":1188," সং":2886," শু":645," শহ":1500," সক":314," শে":357," শ্":581," শত":509," শব":479,"on ":532," রি":295," রা":3902," রে":556," রু":295," রূ":298," রো":452," রক":302," যা":2826," রচ":547," যু":1137," যি":261," রয":403," রব":219," যো":242," যৌ":226," যে":1328," লি":670," লা":967," লে":811," লো":608," লক":345,"স্ ":266," ও ":4589,"শাখ":311,"শাস":697,"শিত":339,"হর ":728,"শাল":206,"শিক":625,"le ":226,"শহর":1422,"সে ":658,"হন ":263,"সী ":240," এ ":753,"হণ ":261,"সি ":877,"শব্":493,"সা ":275,"শনে":210,"শনা":303,"শতক":230,"শতা":251,"শটি":231,"mb ":231,"ষাত":229,"ষিণ":767,"ষায":1936,"ষার":498,"ষাব":214," স ":339,"ষা।":354,"সঙ্":452," র ":316,"়। ":2782,"ষয়":378,"হী ":223,"শ্র":601,"শ্য":362,"শ্ব":1645,"ষমত":242,"শ্চ":1032,"শেষ":751,"শের":1660,"ষণা":351,"শুর":445,"সংব":214,"হল ":715,"সংস":804,"শিয":443,"শিল":531,"সংক":417,"সংগ":424,"সংখ":731,"সংঘ":234,"শিষ":385,"ুসা":325,"ুষ্":372,"ুসল":207,"ূমি":444,"ূলক":257,"ূলত":394,"ে। ":2956,"ূর্":1748,"ুক্":1812,"ুটি":275,"ুড়":269,"ুটব":355,"ুপ্":205,"ুনি":328,"ুদ্":1024,"ুত্":534,"ুলা":360,"ুলি":957,"ুলো":677,"ুরস":488,"ুরা":502,"ুরু":902,"ুরে":214,"ুর্":475,"ুয়":999,"ুমা":378,"ৃহত":230,"েও ":751,"েক ":677,"েজ ":442,"েট ":476,"েড ":233,"েত ":298,"েন ":2383,"ৃতি":667,"ৃথি":275,"ৃত্":530,"ৃষ্":660,"েই ":990,"াচী":479,"াঙা":392,"াঙ্":430,"াজে":291,"াজ্":1187,"াজা":637,"াজি":350,"াজধ":330,"াজন":429,"াছে":290,"াখা":510,"াগর":531,"ীন 
":996,"াকা":1674,"াকৃ":214,"াকি":417,"াক্":624,"াকে":1517,"াগা":228,"াগু":251,"াগে":340,"ীত ":407,"াওয":538,"ুই ":270,"াপে":239,"াপি":216,"াপা":332,"াপ্":438,"াবল":223,"াব্":798,"াবে":1282,"াবি":607,"াবা":550,"ামক":357,"ার।":391,"াথে":795,"াদা":503,"াদি":433,"াদী":233,"াদে":2459,"াদ্":228,"ানক":226,"াধি":292,"াধী":407,"াধা":767,"িউট":438,"াধ্":701,"ানম":216,"ানব":285,"ানা":935,"ানি":1277,"ানী":1247,"ানু":818,"ানে":2964,"ান্":3257,"ানো":483,"াপক":268,"াপন":257,"াণী":233,"াণি":288,"াতী":444,"াতি":1024,"াতা":904,"াতন":242,"াণে":256,"ান।":554,"াত্":1070,"াতে":889,"ীর ":1512,"াটি":687,"াঞ্":296,"াট্":216,"াড়":1116,"িন ":1620,"িদ ":390,"িত ":5217,"িণ ":578,"িজ ":216,"়ের":988,"়েল":232,"াইল":290,"িস ":391,"়েন":224,"়েত":314,"়েছ":1789,"িশ ":479,"াইক":267,"াইন":398,"াইট":347,"়িত":393,"াঁর":695,"াংশ":369,"িল ":1310,"াংল":2405,"়াড":214,"াৎ ":225,"়াল":346,"়াম":294,"ির ":3233,"়ার":2357,"়ান":720,"়াত":233,"িম ":623,"িহা":558,"ংশ ":608,"িস্":1568,"িসা":395,"িসি":199,"িসে":1302,"ূল ":317,"িরি":292,"িরা":288,"ির্":1412,"িলা":304,"িলি":409,"িলে":1534,"িল্":792,"িলো":314,"িশন":236,"িষ্":1946,"িশি":298,"িশে":548,"িষয":343,"িশ্":1581,"ীবন":365,"ীমা":216,"ীয়":4081,"ীর্":354,"ীরে":207,"ৃত ":857,"umb":265,"ীতি":604,"ীতে":555,"ীদে":242,"ীনত":266,"াহা":469,"িচা":733,"াহী":229,"িচি":688,"াহি":1315,"াস্":708,"াসে":596,"াসি":1223,"াসা":419,"াসী":261,"াষ্":1171,"াসন":338,"াষা":4713,"াশি":537,"ুর ":1172,"িটি":780,"িটা":629,"িজ্":1437,"িজে":324,"িছু":475,"ালক":364,"ালয":874,"ালন":396,"ারী":1146,"ারি":1557,"ারা":1659,"ার্":5067,"ারে":2345,"ায়":8671,"ারক":300,"ারন":347,"ারণ":1235,"ারত":1942,"ামি":443,"ামা":748,"াম্":609,"ামে":1488,"ামী":317,"াশন":211,"িখ্":507,"ুন ":561,"িকা":1858,"িকি":198,"িকে":1068,"িক্":1074,"ালী":661,"ালা":739,"ালি":1410,"ালো":249,"ালে":2817,"াল্":297,"িপি":219,"িপ্":331,"িনে":565,"িনা":408,"িনি":2917,"িনী":642,"িনয":241,"িন্":1908,"িমি":290,"িম্":258,"িমে":259,"িমা":636,"িল।":473,"িয়":6354,"িবা":741,"িবী":293,"িবি":361,"িবে":347,"িবর":417,"িভি":889,"িভা":580,"িমব":382,"tio":423,"thu":236,"িদ।":233,"ুল ":613,"িডি":222,"িত।":1272,"িদ্":1210,"িদে":199,"িনট":209,"িধা":259,"ter":251,"িতী":319,"িতি":339,"িতা":574,"িতে":1209,"িত্":2907,"the":401,"ঁর ":674,"্র ":2468,"্য ":4783,"োঝা":363,"্ম ":1055,"্ব ":1443,"্প ":322,"োগ্":438,"্ন ":1119,"্স ":725,"্ষ ":348,"ংশে":259,"ংসদ":201,"্ট ":1640,"্জ ":260,"্চ ":547,"ংলা":2409,"ংরে":2127,"্দ ":444,"োকে":215,"্ধ ":572,"্ত ":2310,"ংবা":245,"্থ ":613,"্ণ ":661,"্ঠ ":206,"্ড ":1005,"্ক ":455,"্গ ":569,"ংক্":310,"ংখ্":741,"্ঞা":1605,"্জা":489,"্জি":208,"্জে":211,"্জন":340,"্ছে":599,"্ত।":316,"্ডি":303,"্ডা":322,"্ডে":556,"্ঠা":754,"্ঠি":421,"্টি":1218,"্টা":1173,"্টো":370,"্ট্":1738,"্টে":710,"্দ্":1049,"আর ":224,"্দো":445,"্দা":237,"্দি":541,"্দী":307,"্দে":669,"্দু":407,"্ধত":232,"্দর":217,"্থা":2220,"্থি":1299,"্থে":279,"্থন":231,"্তে":276,"্থব":329,"্ত্":1963,"্তা":1329,"্তি":2212,"্তী":445,"্তু":740,"্তৃ":252,"্তন":456,"্তম":911,"্তর":2524,"্বক":355,"্প্":259,"্পে":443,"্পা":452,"্পী":215,"্পি":552,"্পর":340,"্পন":198,"অংশ":574,"্না":360,"্নি":236,"্ধে":424,"্ধি":256,"্ধা":305,"অক্":342,"ইন ":402,"অঙ্":286,"্কি":934,"্কা":944,"্কৃ":335,"্কে":282,"্গত":198,"্চা":486,"্চি":1888,"্চল":770,"্গে":523,"্গী":329,"্গা":318,"অফ 
":315,"ংস্":691,"োনো":215,"োনা":252,"োপা":229,"োমা":248,"োমি":242,"োয়":741,"োরি":210,"োর্":375,"োলা":267,"োলন":236,"েছি":696,"েছে":2368,"েজি":1969,"ইংর":2136,"েক্":594,"আমে":217,"েকে":2070,"েখক":248,"অ্য":839,"আয়":311,"েখা":876,"আবি":225,"েকট":292,"ইউন":276,"আধু":208,"ইউর":313,"�":1457,"আন্":621,"আলো":342,"আর্":422,"আরব":337,"ইটি":293,"অঞ্":595,"অবস":1435,"অভি":728,"আইন":296,"অধি":383,"অনু":1003,"অধ্":210,"অন্":1608,"অনে":547,"আকা":241,"েশ ":1172,"েস ":337,"অর্":1060,"েষ ":526,"েল ":879,"আছে":543,"ের ":22964,"অস্":383,"োর ":595,"োন ":788,"োট ":334,"ৈরী":206,"ৈরি":250,"উচ্":373,"ৈতি":259,"োগ ":311,"উটা":283,"োক ":321,"উত্":954,"েস্":373,"ইত্":216,"ইতি":273,"ইন্":608,"েরা":505,"েরি":683,"েলে":417,"েলি":409,"েলা":1478,"েলো":239,"ইয়":586,"েশন":465,"েশী":418,"েশি":582,"েষণ":370,"ইরা":327,"েশে":1749,"েষ্":229,"েন্":1439,"েনে":324,"েনি":277,"েনী":276,"েনা":409,"েপ্":238,"েবে":991,"েব্":240,"েয়":1277,"েম্":732,"েমি":222,"েমন":290,"ইসল":353,"েটি":246,"েডি":240,"েতা":376,"েত্":488,"েন।":2598},"n_words":[1969690,2210879,1502429],"name":"bn"}
\ No newline at end of file
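
With two such profiles loaded, a naive way to exercise the data (an illustration only; langdetect's actual detector samples n-grams with smoothing and normalises across languages) is to sum log-probabilities of a sample's grams under each profile and keep the highest scorer:

    # Illustrative scoring pass; score() and alpha are assumptions,
    # not langdetect internals.
    import math

    def score(text, probs, alpha=1e-6):
        """Sum log-probabilities of every 1-, 2- and 3-gram in text."""
        total = 0.0
        for n in (1, 2, 3):
            for i in range(len(text) - n + 1):
                total += math.log(probs.get(text[i:i + n], alpha))
        return total

    # Usage (paths assumed): pick the best-scoring profile for a sample.
    # candidates = [load_profile("profiles/bg"), load_profile("profiles/bn")]
    # name, _ = max(candidates, key=lambda np: score(u"това е пример", np[1]))
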
+++ /dev/null
-{"freq":{"A":167967,"B":105769,"C":184193,"D":56742,"E":163017,"F":65433,"G":73588,"H":42421,"I":60824,"J":44362,"K":25608,"L":169492,"M":139288,"N":48908,"O":35569,"P":128397,"Q":7502,"R":71953,"S":138553,"T":80712,"U":35329,"V":59949,"W":15160,"X":18166,"Y":7184,"Z":7346,"a":5069786,"b":518425,"c":1582005,"d":1999798,"e":4809590,"f":422965,"g":584353,"h":244304,"i":3245856,"j":85607,"k":66957,"l":2733912,"m":1163655,"n":2818805,"o":1980421,"p":975294,"q":256584,"r":2749326,"s":2849210,"t":2490288,"u":1690205,"v":411610,"w":27879,"x":144397,"y":182310,"z":79156,"²":1429,"·":30306,"À":7240,"Á":636,"É":12780,"Í":3623,"Ò":1001,"à":198286,"á":7019,"â":1478,"ã":604,"ä":1134,"ç":38339,"è":199825,"é":298805,"ê":577,"ë":644,"í":174684,"ï":18462,"ñ":3283,"ò":99853,"ó":228041,"ô":729,"ö":1733,"ø":541,"ú":37007,"ü":11730,"ā":2431,"č":430,"ī":1261,"ı":619,"ō":1024,"ş":566,"š":763,"ū":1145,"ʿ":732,"́":819,"Δ":527,"Κ":607,"Μ":435,"ά":1060,"έ":590,"ί":1375,"α":3941,"γ":904,"δ":940,"ε":1681,"η":1323,"θ":467,"ι":2936,"κ":1516,"λ":2022,"μ":1369,"ν":2904,"ο":4163,"π":1084,"ρ":2637,"ς":3296,"σ":1399,"τ":2098,"υ":1042,"φ":460,"ω":718,"ό":1021,"ύ":457," A":166692,"С":472," B":105179," C":176036," D":56284," E":162233," F":64928," G":73223," H":42126," I":60593," J":44254," K":25473," L":168992," M":138313," N":48671," O":35253," P":127947,"а":4416," Q":7462,"б":612," R":71632,"в":1837," S":137876,"г":573," T":73676,"д":937," U":35184,"е":2590," V":59769," W":15003," X":18048,"и":2992," Y":7170,"й":818," Z":7293,"к":1874,"л":1832,"м":805,"н":2553,"о":3273,"п":464,"р":2416," a":725198,"с":2063," b":68648,"т":1583," c":407309,"у":846," d":1356539," e":639012," f":270157," g":94272,"ч":604," h":91458," i":370987," j":31625," k":11008,"ы":477," l":598095,"ь":661," m":263831," n":110413," o":166206,"я":694," p":438961," q":158543," r":159534," s":351560," t":204462," u":330955," v":140137," w":3345," x":7050," y":2383," z":6624," À":7220," Á":633," É":12740," Í":3619," Ò":1001," à":11900," è":4562," é":191856," í":1138," ò":3052," ú":6203,"ו":509,"י":657,"ا":3045,"ب":1225,"ة":477,"ت":440,"ح":495,"د":933,"ر":1451,"س":713,"ع":505,"ل":1984,"م":1193,"ن":1380,"و":1076,"ي":1784," ʿ":526," Δ":517," Κ":595," Μ":434,"A ":9255," С":470,"B ":2075,"C ":13226,"Ab":6627,"Ac":4787,"Ad":3585,"Ae":827,"Af":1295,"Ag":3813,"Ah":707,"Ai":4269,"Aj":795,"Ak":629,"Al":39224,"Am":9397,"An":20902,"Ap":2822,"Aq":9084,"Ar":25400,"As":6532,"At":5083,"Au":9222,"Av":2488,"Ay":587,"Az":1176,"D ":2763,"Ba":39223,"Be":14720,"Bh":637,"Bi":6584,"Bl":4174,"Bo":13076,"Br":13262,"Bu":6221,"E ":3270,"Ca":68472,"Ce":8421,"Ch":13953,"Ci":7812,"Cl":6317,"Co":48282,"Cr":7825,"Cu":5445,"Cy":693,"F ":1643,"Da":7941,"De":14385,"Dh":531,"Di":12144,"Dj":766,"Do":7923,"Dr":2842,"Du":4335,"G ":1233,"Ea":693,"Eb":905,"Ec":954,"Ed":3150,"Eg":2041,"Ei":1183,"El":69098,"Em":4361,"En":13208,"Ep":1014,"Eq":1096,"Er":7104,"Es":39860,"Et":1325,"Eu":9201,"Ev":823,"Ex":2356,"H ":1588,"Fa":5893,"Bà":594,"Fe":9420,"Fi":7653,"Fl":3848,"Bè":916,"Bé":1704,"Fo":13036,"Fr":15905,"Fu":3564,"Bò":470,"I ":21603,"Ga":15801,"Cà":785,"Ge":10838,"Gh":1076,"Gi":6837,"Gl":1477,"Cè":637,"Go":6345,"Gr":13805,"Gu":12143,"Cò":1744,"J ":861,"Ha":12389,"He":8029,"Hi":6527,"Dé":1253,"Ho":7750,"Hu":3320,"Hy":937,"K ":914,"Ib":1566,"Ic":431,"Id":511,"Ie":544,"Ig":1125,"Il":3787,"Im":2928,"In":18160,"Ir":2918,"Is":4771,"It":3174,"Iv":698,"L ":45795,"Ja":9385,"Je":4192,"Ji":1098,"Jo":19183,"Ju":8013,"Fó":1755,"M 
":2715,"Ka":7690,"Gà":486,"Ke":2606,"Kh":3464,"Ki":2315,"Kl":511,"Ko":2713,"Kr":1035,"Ku":2506,"N ":2827,"La":56334,"Le":16809,"Li":11134,"Ll":15746,"Lo":13957,"Lu":6126,"Ly":567,"O ":2384,"Ma":59683,"Mc":443,"Me":15371,"Mi":17134,"Mo":23355,"Mu":11733,"My":826,"P ":2536,"Na":13517,"Ne":6267,"Ni":5864,"No":16913,"Nu":1421,"Ob":1520,"Oc":3465,"Od":657,"Oe":644,"Of":675,"Oi":1219,"Ol":5522,"Om":527,"On":1132,"Op":1151,"Or":10390,"Os":2218,"Ot":1074,"Ou":1055,"Ov":557," ا":1151," ب":472,"R ":1759,"Pa":41526,"Pe":16857,"Ph":2138,"Pi":12456,"Pl":6395,"Po":15956,"Lí":769,"Pr":19211,"Ps":950,"Pt":835,"Pu":5518,"S ":6346,"Qa":686,"Mà":1235,"Mè":2592,"Mé":951,"Qu":5784,"Mó":959,"T ":1872,"Mú":915,"Ra":9945,"Nà":738,"Re":23843,"Rh":761,"Ri":9307,"Ro":18718,"Ru":3720,"U ":929,"Sa":46292,"Sc":3755,"Se":22758,"Sh":4186,"Si":10935,"Sk":504,"Sl":456,"Sm":598,"So":13749,"Sp":2135,"St":6579,"Su":12982,"Sy":1208,"V ":6128,"Ta":12185,"Te":13493,"Th":7720,"Ti":4800,"Pè":909,"Pé":634,"To":18580,"Tr":10282,"Tu":4976,"Tx":977,"W ":470,"Uc":601,"Ul":841,"Un":23842,"Ur":3201,"Us":529,"Ut":952,"X ":4850,"Va":25736,"Ve":9006,"Vi":16767,"Vo":2501,"Rú":1355,"Wa":4037,"Sà":468,"We":2882,"Wh":595,"Wi":3723,"Sè":828,"Wo":1818,"Sí":750,"Só":1116,"Xa":1968,"Tà":555,"Xe":604,"Xi":3780,"Té":2128,"Tí":779,"Ya":1516,"Ye":462,"Yo":2907,"Yu":831,"Yv":453,"Za":2055,"Ze":1518,"Zh":746,"Zi":749,"Zo":625,"Ví":574,"Zu":662,"a ":1907139,"b ":69361,"aC":7284,"aM":438,"aT":1147,"c ":117335,"aa":2507,"ab":81857,"ac":168114,"ad":175744,"ae":19041,"af":16941,"ag":47208,"ah":9943,"ai":64967,"aj":11232,"ak":9722,"al":526126,"am":241207,"an":572272,"ao":5807,"ap":55491,"aq":20654,"ar":448864,"as":120225,"at":401723,"au":52031,"av":49622,"aw":4860,"ax":4714,"ay":11377,"az":7229,"d ":243396,"ba":77121,"bb":1707,"bc":1189,"bd":3624,"be":38173,"bf":959,"bg":430,"bh":519,"bi":65502,"bj":4126,"bl":57656,"bm":511,"bn":4600,"bo":28803,"br":91356,"bs":8723,"cT":763,"bt":3126,"bu":24674,"by":1055,"e ":1214321,"ca":278570,"cc":39000,"ce":91107,"ch":30090,"ci":397490,"ck":6861,"cl":40148,"cm":1337,"cn":3405,"co":253884,"cq":789,"cr":50505,"cs":28362,"ct":98830,"cu":58326,"cy":1648,"f ":11433,"da":170876,"db":629,"dd":2022,"de":1171054,"dg":1108,"dh":2156,"di":184633,"dj":2031,"dl":818,"dm":5146,"dn":868,"do":78971,"dq":613,"dr":40396,"ds":14629,"dt":962,"du":43475,"dv":2047,"dw":1267,"dy":1275,"g ":36541,"ea":46323,"eb":28701,"ec":98348,"ed":54037,"ee":9382,"ef":17938,"eg":148516,"eh":3371,"ei":74170,"ej":4770,"aç":6243,"ek":2937,"el":585626,"aè":664,"em":113998,"en":690324,"eo":20271,"ep":77495,"eq":8133,"aí":5793,"er":588374,"es":721405,"aï":5031,"et":119799,"eu":70196,"añ":1235,"ev":44586,"ew":2926,"aó":868,"ex":36681,"ey":6489,"ez":7065,"h ":20052,"fa":45069,"bà":4314,"fe":54681,"ff":2949,"fg":617,"fi":79892,"fl":9361,"bè":1987,"bé":22369,"fo":106827,"bí":1070,"fr":72660,"fs":1040,"ft":1561,"fu":22275,"bò":958,"bó":1179,"i ":486758,"bú":1399,"ga":75189,"gb":771,"gc":471,"cà":10275,"gd":4247,"ge":73143,"gg":1424,"gh":4754,"gi":106632,"gl":34226,"cè":55455,"cé":2934,"gm":2392,"gn":21080,"go":46155,"cí":5094,"gr":62392,"gs":3173,"gt":930,"gu":81931,"cò":5199,"có":1705,"gy":1048,"j ":1114,"cú":854,"ha":86053,"dà":6395,"he":31934,"hi":40755,"dè":10213,"hl":2013,"dé":2052,"hm":2373,"hn":2732,"ho":22152,"dí":3197,"hr":5173,"hs":1334,"ht":3853,"hu":11760,"dò":2833,"dó":2693,"hw":1082,"hy":3032,"k 
":14568,"dú":1183,"ia":328149,"ib":41073,"ic":335049,"id":100152,"eà":1569,"ie":89153,"if":32414,"ig":77952,"ih":1878,"ii":2809,"ij":2329,"ik":6237,"eç":1708,"il":121450,"im":89358,"in":289495,"io":108785,"ip":102606,"iq":16936,"ir":97698,"is":214334,"eï":2440,"it":310090,"iu":65412,"eñ":500,"iv":56195,"eò":2925,"eó":1719,"iw":641,"ix":62728,"iy":2272,"iz":5848,"l ":882089,"ja":27282,"fà":1429,"je":8661,"ji":2284,"fè":1129,"jo":14068,"jp":497,"fí":6560,"ju":27368,"fò":2736,"fó":925,"m ":104114,"ka":9761,"gà":2624,"ke":6167,"kh":4960,"ki":7623,"kk":730,"kl":1393,"gè":11938,"km":6156,"ko":4420,"gí":1320,"kr":1224,"ks":2046,"kt":721,"ku":2884,"gò":1533,"gó":2256,"ky":900,"n ":578690,"gú":437,"la":530288,"gü":5534,"lb":8378,"lc":8376,"ld":12030,"hà":2116,"le":272073,"lf":6593,"hâ":823,"lg":12357,"lh":3062,"li":214695,"lk":1865,"ll":191289,"hè":710,"lm":29585,"hé":819,"ln":1295,"lo":99596,"lp":5594,"hí":443,"lq":539,"lr":541,"ls":184444,"lt":65042,"lu":50394,"lv":8536,"lw":659,"ly":3656,"lz":1155,"o ":143542,"ma":183950,"mb":120660,"ià":20729,"me":272481,"mf":2067,"mi":98713,"iç":485,"iè":6430,"ml":501,"mm":10261,"ié":606,"mn":3648,"mo":72058,"mp":89678,"mr":646,"ms":9139,"nT":869,"mt":7221,"mu":85902,"iñ":548,"iò":4221,"ió":161911,"my":1339,"p ":39705,"na":375478,"nb":2542,"nc":178780,"nd":100189,"jà":1447,"ne":166059,"nf":15519,"ng":69078,"nh":3574,"ni":268135,"nj":8272,"nk":3882,"nl":3285,"nm":1655,"nn":16989,"no":122300,"np":434,"nq":5665,"nr":4421,"ns":164301,"nt":494268,"nu":21373,"nv":20735,"nw":559,"nx":2047,"l·":30104,"ny":118633,"nz":6004,"q ":1464,"oa":11667,"ob":72497,"oc":96788,"od":44343,"oe":16400,"of":21715,"og":30375,"oh":5627,"oi":19729,"oj":5019,"ok":3546,"ol":160835,"om":204290,"on":355923,"oo":6465,"op":56175,"oq":4318,"or":346034,"os":142907,"ot":69675,"m²":1392,"ou":85807,"ov":56462,"ow":4060,"ox":4774,"oy":3450,"oz":2678,"r ":408018,"pa":172744,"pc":3590,"là":25527,"pe":221857,"lá":716,"ph":7073,"pi":107129,"lç":1604,"pl":39811,"lè":17093,"lé":5762,"po":126025,"pp":3431,"lí":35679,"pr":140727,"ps":16209,"pt":25645,"pu":33640,"lò":13377,"ló":5245,"py":430,"s ":1473639,"lú":1530,"qa":697,"mà":21678,"mè":8997,"mé":27182,"mí":23527,"qu":250822,"mò":4162,"mó":5285,"t ":625683,"mú":5977,"ra":441040,"rb":17931,"rc":58344,"rd":67527,"nà":7041,"re":497571,"ná":739,"rf":7074,"rg":40352,"rh":1988,"ri":359092,"rj":900,"nç":18933,"rk":6010,"nè":12361,"rl":16633,"rm":73740,"né":4208,"rn":44429,"ro":178134,"rp":7200,"rq":17579,"ní":11412,"rr":85564,"rs":79726,"sT":1915,"rt":160127,"ru":44855,"nò":4723,"rv":15280,"nó":1643,"rw":832,"rx":4521,"ry":6748,"rz":2127,"u ":147534,"nú":2164,"sa":108844,"sb":5605,"sc":71599,"sd":3875,"oà":1370,"se":241029,"sf":4238,"sg":5660,"sh":11747,"si":198022,"sk":5043,"oç":438,"sl":7672,"sm":18866,"sn":2982,"so":88253,"sp":70884,"sq":6816,"sr":1520,"ss":87061,"tT":542,"oï":1001,"st":335577,"su":57136,"sv":1065,"sw":922,"sy":1227,"sz":489,"v ":3284,"ta":510626,"tb":6398,"tc":2726,"pà":3263,"te":302092,"tf":766,"tg":14220,"th":18383,"ti":231505,"tj":8240,"pç":566,"tk":428,"tl":7081,"pè":15203,"tm":4208,"tn":2119,"to":113740,"tp":2252,"pí":2210,"tr":213293,"ts":128142,"tt":11395,"tu":133104,"pò":3840,"pó":2030,"tw":750,"tx":5039,"ty":3045,"tz":30195,"w ":3595,"pú":7209,"ua":125142,"ub":32903,"uc":40135,"ud":47596,"ue":222808,"uf":3537,"ug":17601,"uh":2436,"ui":65611,"uj":2631,"uk":2801,"ul":82163,"um":35757,"un":458542,"uo":1697,"up":31594,"uq":1182,"ur":127636,"us":102797,"ut":89357,"uu":506,"uv":3604,"uw":547,"ux":5117,"uy":1209,"uz":2643,"x 
":40539,"va":160676,"qü":2554,"rà":23801,"ve":106866,"rá":563,"vi":85084,"rç":7866,"rè":14693,"ré":11337,"vo":29266,"rí":22061,"vr":1975,"vs":490,"vu":4528,"rò":15984,"ró":4652,"vy":458,"y ":80251,"rú":2239,"wa":9037,"sà":4294,"we":4065,"wi":3435,"sè":5971,"wl":546,"sé":2104,"wn":954,"wo":1249,"sí":4229,"ws":941,"sò":2669,"só":14955,"ww":1633,"z ":11513,"sú":925,"xa":17709,"xc":2574,"tà":33305,"xe":24074,"xf":784,"xi":24181,"tè":9568,"té":9288,"xo":5611,"xp":7246,"tí":21499,"xt":10062,"xu":1571,"tò":24328,"tó":12504,"tú":1377,"ya":40218,"yb":542,"yc":1775,"uà":2374,"yd":1562,"ye":9657,"yg":491,"yi":3467,"uè":8877,"yl":2725,"ym":2095,"ué":3485,"yn":2835,"yo":12462,"yp":1251,"uí":8337,"yr":2338,"ys":12606,"uï":9663,"yt":1373,"yu":1178,"uñ":433,"yy":1005,"za":33059,"zb":548,"và":1420,"ze":8191,"zh":878,"zi":6215,"vè":1754,"vé":3525,"zo":9277,"ví":10528,"zu":1263,"zy":458,"zz":1582,"xà":605,"xè":1753,"xí":3147,"xò":1895,"yà":1020,"zà":1087,"zá":498,"² ":1423,"·l":30185,"Àf":1855,"Àn":592,"Àr":795,"Às":1563,"Àu":1032,"És":11168,"Ín":3390,"Òl":650,"à ":79768,"á ":764,"àb":2832,"àc":12425,"àd":1403,"àf":4242,"àg":2016,"ài":670,"àl":10998,"àm":6161,"àn":28201,"àp":2170,"àq":1193,"àr":18957,"às":9461,"àt":12150,"àu":1000,"àv":961,"àx":2054,"ál":686,"án":2546,"ár":550,"ás":500,"ât":940,"ão":548,"ç ":8144,"è ":9699,"é ":40616,"ça":24720,"ço":2465,"çu":715,"èc":20010,"èd":3296,"èf":490,"èg":1496,"èi":2236,"èl":5777,"èm":3022,"èn":34822,"èp":2640,"èr":19285,"ès":83075,"èt":7978,"èu":817,"èv":1486,"èx":3115,"éc":527,"éd":596,"ée":1019,"ég":451,"éi":2349,"él":845,"ém":860,"én":4525,"ér":2460,"és":238308,"ét":1364,"éu":1948,"év":574,"í ":26502,"çà":803,"çó":1080,"ía":3855,"íb":1797,"íc":13966,"íd":3192,"íf":5625,"íg":2541,"íl":22042,"ím":15141,"ín":22592,"ío":4311,"íp":2416,"íq":1061,"ír":1956,"ís":23333,"ít":21665,"ív":1488,"ïc":443,"ïd":3760,"ïl":889,"ïn":3033,"ïs":4007,"ït":4826,"ò ":9454,"ó ":192574,"ña":1358,"ño":1045,"òb":1306,"òc":3252,"òd":1983,"òf":2506,"òg":6120,"òl":7780,"òm":7356,"òn":22930,"òp":6668,"òq":909,"òr":18203,"òs":7050,"òt":2457,"òv":580,"òx":778,"ón":23445,"óp":508,"ór":3217,"ós":6022,"ú ":4222,"úb":7182,"úl":2870,"úm":2316,"ún":4012,"úr":2758,"ús":11106,"út":602,"üe":5188,"üi":499,"ül":636,"ür":890,"üè":1316,"üí":1085,"ān":472,"ī ":511,"ō ":457,"あ":716,"ア":1421,"丁":1198,"三":1779,"丘":601,"並":683,"之":1430," 丁":432," 三":613," 之":527,"α ":985,"ν ":764,"ς ":3290,"ης":436,"ικ":453,"ιο":521,"ος":1578," A ":4408," B ":735," C ":1205," Ab":6311," Ac":4685," Ad":3550," Ae":816," Af":1280," Ag":3774," Ah":704," Ai":4234," Aj":780," Ak":625," Al":39006," Am":9331," An":20811," Ap":2783," Aq":9010," Ar":25239," As":6402," At":5045," Au":9186," Av":2457," Ay":583," Az":1157," D ":1164," Ba":38919," Be":14672," Bh":636," Bi":6545," Bl":4160," Bo":13028," Br":13227," Bu":6184," E ":775," Ca":68255," Ce":8404," Ch":13909," Ci":7767," Cl":6222," Co":48100," Cr":7791," Cu":5356," Cy":692," F ":454," Da":7891," De":14315," Dh":530," Di":12054," Dj":766," Do":7766," Dr":2838," Du":4321," Ea":682," Eb":904," Ec":948," Ed":3109," Eg":2035," Ei":1171," El":68877," Em":4324," En":13065," Ep":1006," Eq":1092," Er":7077," Es":39661," Et":1311," Eu":9149," Ev":812," Ex":2310," H ":700," Fa":5864," Bà":594," Fe":9397," Fi":7475," Fl":3835," Bè":916," Bé":1703," Fo":12863," Fr":15864," Fu":3553," Bò":470," I ":5983," Ga":15731," Cà":784," Ge":10789," Gh":1074," Gi":6808," Gl":1472," Cè":637," Go":6328," Gr":13726," Gu":12098," Cò":1742," J ":694," Ha":12355," He":7998," Hi":6493," Dé":1253," Ho":7677," 
Hu":3311," Hy":932," Ib":1560," Ic":427," Id":507," Ie":528," Ig":1119," Il":3765," Im":2905," In":18025," Ir":2911," Is":4748," It":3163," Iv":695," L ":43996," Ja":9364," Je":4175," Ji":1097," Jo":19131," Ju":8002," Fó":1755," M ":726," Ka":7658," Gà":486," Ke":2565," Kh":3462," Ki":2288," Kl":509," Ko":2706," Kr":1034," Ku":2506," N ":658," La":56149," Le":16748," Li":11048," Ll":15690," Lo":13931," Lu":6114," Ly":567," O ":760," Ma":59175," Mc":441," Me":15319," Mi":17084," Mo":23321," Mu":11454," My":824,"а ":1052," Na":13479," Ne":6202," Ni":5851," No":16843," Nu":1416," Ob":1491," Oc":3449," Od":648," Oe":640," Of":637," Oi":1216," Ol":5499," Om":522," On":1113," Op":1132," Or":10300," Os":2203," Ot":1069," Ou":1046," Ov":553," R ":654," Pa":41429," Pe":16814," Ph":2094," Pi":12404," Pl":6359," Po":15893," Lí":769," Pr":19157," Ps":927," Pt":835," Pu":5503," S ":3252," Qa":686," Mà":1235," Mè":2592," Mé":948," Qu":5748," Mó":958," Mú":914," Ra":9904," Nà":737," Re":23771," Rh":759," Ri":9299," Ro":18670," Ru":3714," Sa":46252," Sc":3705," Se":22671," Sh":4149," Si":10895," Sk":493," Sl":452," Sm":594," So":13713," Sp":2112," St":6344," Su":12961," Sy":1201," V ":1559," Ta":12137," Te":13393," Th":7441," Ti":4779," Pè":909," Pé":634," To":12436," Tr":10239," Tu":4923," Tx":972," Uc":597," Ul":830," Un":23813," Ur":3195," Us":518," Ut":947," X ":990," Va":25713," Ve":8965," Vi":16711," Vo":2488,"й ":603," Rú":1355," Wa":4019," Sà":468," We":2833," Wh":591," Wi":3680," Sè":828," Wo":1786," Sí":750," Só":1115," Xa":1951," Tà":551," Xe":597," Xi":3773," Té":2126," Tí":512," Ya":1513," Ye":461," Yo":2905," Yu":830," Yv":453," Za":2050," Ze":1516," Zh":744," Zi":744," Zo":622," Ví":573," Zu":634," a ":231352," aC":7206," c ":430," ab":12838," ac":25191," ad":11985," ae":1083," af":4594," ag":9996," ai":8072," aj":1499," al":160919," am":67850," an":89094," ap":11748," aq":16353," ar":28721," as":11985," at":6592," au":11616," av":4952," d ":162001," ba":23834," be":5533," bi":7646," bl":3360," bo":8059," br":9864," bu":2523," e ":1417," ca":98241," ce":19284," ch":1175," ci":34378," cl":10711," cm":668," co":200977," cr":18454," cu":13751," da":10184," de":1049103," di":85650," do":21151," dr":5214," du":18527," ec":3573," ed":10010," ef":1766," eg":916," ei":1300," el":207472," em":12650," en":176225," ep":1274," eq":4231," er":13002," es":169872," et":3945," eu":3020," ev":1852," ex":25956," fa":33351," bà":1738," fe":21683," fi":36564," fl":4625," bé":4485," fo":88068," fr":62189," fu":18369," i ":271040," ga":8265," cà":3140," ge":23086," gi":1550," cè":1581," gl":2065," go":8124," cí":770," gr":33196," gu":9031," cò":2765," ha":48412," he":5899," hi":20727," dè":1324," dé":790," ho":9258," ht":889," hu":4769," dó":1035," ib":4538," id":3775," ig":1019," il":7669," im":11832," in":61206," ir":1802," is":2015," it":4507," l ":149189," ja":7585," fà":748," je":550," jo":3465," fí":2217," ju":19392," fò":830," fó":669," m ":1255," ka":1313," kh":929," ki":720," gè":7765," km":5787," ku":747," n ":2490," la":304087," hà":645," le":69759," li":12244," ll":44363," lo":9050," lu":1492," o ":58017,"я ":506," ma":59149," me":33740," mi":24930," mo":39073," mu":67009," p ":665," na":17801," ne":9561," ni":4279," no":66588," nu":3940," ob":15205," oc":25051," oe":7522," of":8508," ol":2632," om":602," on":8518," op":3920," or":31833," os":1160," ot":673," ox":679,"ан":476," pa":61499," là":888," pe":161014," pi":11638," pl":16017," lè":787," po":63767," lí":4842," pr":107250," ps":1873," 
pu":9224," lò":503," s ":17098," mà":3346," mè":1643," mé":23474," mí":929," qu":157968," mò":673," mó":3234," mú":4524," ra":7726," re":118428," ri":10476," né":2548," ro":14342," ru":5495," nò":512," nú":1717," sa":13342," se":146872," sh":793," si":74594," so":35644," sp":523," st":706," su":40122," ta":28792," pà":1118," te":75775," th":3081," ti":9692," pè":733," to":18091," tr":50136," tu":4078," tx":561," pú":2386," ub":1521," uc":432," ul":1092," un":314756," ur":2881," us":3429," ut":5889," x ":546," va":79470," rà":1366," ve":23288," vi":23682," rè":848," vo":10050," vu":1584," y ":1832," wa":660," sà":457," we":1115," sè":3098," sí":1633," sò":1297," só":13852," ww":774," xa":1796," tà":547," xi":3734," tè":2248," té":5319," tí":4871,"ка":474," za":471," zo":5183," ví":900,"на":519,"ов":707,"ра":455,"ск":629,"ст":468," Àf":1850," Àn":588," Àr":787," Às":1561," Àu":1032," És":11156," Ín":3384," Òl":650," àc":959," àf":666," àl":1282," àm":1512," àn":818," àr":5183," àt":602," è ":614," èp":2053," èt":908," èx":653," és":191686," ín":757," òp":1627," òr":1127," úl":1460," ún":2954," ús":1545,"アアア":811,"가":632,"ة ":476,"ن ":754,"ي ":540,"ال":1136,"AC ":427,"BA ":592,"Abb":439,"Aba":835,"Abd":1681,"Abe":471,"Abi":451,"Abu":1629,"Aca":1206,"Acc":847,"Act":1576,"Ada":599,"Ade":460,"Adm":436,"Ado":605,"Adr":590,"Afr":441,"Aga":446,"Agr":922,"Agu":921,"Aig":490,"Ain":596,"Ais":995,"Air":926,"Aix":700,"Al ":2273,"Aju":580,"Alb":3276,"Ala":1970,"Alc":1635,"Alf":1418,"Ale":5749,"Alg":1324,"Ali":1700,"All":1516,"Alm":895,"Alp":2816,"Alt":9210,"Als":2588,"Alv":529,"Ama":1446,"Amb":1366,"Ame":1021,"Ami":781,"Amp":487,"Ana":1238,"And":4325,"Ang":3614,"Ani":441,"Ann":1359,"Ano":667,"Ant":6768,"Al·":437,"Apa":481,"App":480,"Apo":915,"Amè":2404,"Aqu":8976,"Arb":535,"Ara":3272,"Ard":1422,"Arc":1131,"Are":892,"Arg":2137,"Ari":1880,"Arn":611,"Arm":1606,"Arr":864,"Arq":5004,"Art":3155,"Arx":525,"Ass":2750,"Ast":1600,"Ate":1472,"Atl":2047,"Aub":950,"Aud":1123,"Aug":1230,"Aul":445,"Aus":2207,"Aur":757,"Aut":1669,"Ava":645,"Arà":478,"Ave":588,"Avi":816,"Bab":526,"Bad":1466,"Bac":609,"Bah":633,"Bag":1094,"Bai":4905,"Bal":3540,"Ban":2412,"Bar":16181,"Bat":1579,"Bas":2781,"Bav":652,"Bau":520,"Bay":640,"Bea":1289,"Bel":2629,"Ben":2746,"Ber":4569,"Bet":501,"Bes":801,"Bib":559,"Bil":822,"Bio":545,"Bir":867,"Bis":957,"Biz":433,"Bla":2977,"Blo":538,"Boi":680,"Bol":1542,"Bon":1514,"Bom":615,"Bor":2684,"Bot":563,"Bos":818,"Bou":1555,"Bra":3413,"Bre":3909,"Bri":2360,"Bro":1428,"Bru":1662,"Buc":460,"Bue":559,"Bul":739,"Bur":1549,"Bus":549,"Cab":1292,"Cae":457,"Cad":907,"Cai":1201,"Cam":7097,"Cal":8123,"Can":10091,"Cap":2678,"Cas":9757,"Car":8990,"Cau":681,"Cat":14977,"Cav":477,"Cel":753,"Cen":4020,"Cer":2044,"Cha":6699,"Che":1984,"Chi":1603,"Cho":571,"Chr":1309,"Chu":431,"Cic":741,"Cin":969,"Cir":1072,"Ciu":1559,"Cit":606,"Civ":835,"Cla":2546,"Châ":742,"Cle":767,"Cli":509,"Clo":595,"Clu":1412,"Ciè":520,"Cod":481,"Com":13254,"Col":5594,"Coo":476,"Con":12377,"Cop":1662,"Cos":4020,"Cor":6641,"Cou":1178,"Cra":772,"Cre":2168,"Cri":2170,"Cro":1381,"Cru":694,"Cub":578,"Cul":2134,"Cur":631,"EC ":776,"Dal":794,"Dan":1281,"Dam":954,"Dar":1085,"Dav":1039,"De ":1301,"Dec":620,"Def":481,"Del":1249,"Den":843,"Dem":2650,"Dep":1015,"Der":576,"Des":2834,"Dev":451,"Deu":783,"Dia":891,"Dic":847,"Die":793,"Din":2556,"Dip":625,"Dio":1016,"Dir":624,"Dis":1960,"Div":832,"Doc":553,"Dol":487,"Don":1274,"Dom":1970,"Dor":1286,"Dou":1102,"Dra":714,"Dre":955,"Dro":599,"Dub":501,"Duc":927,"Dur":1205,"FC 
":460,"Ebr":656,"Eco":537,"Edi":863,"Edu":769,"Egi":1637,"El ":53100,"En ":7629,"Ele":990,"Eli":1057,"Ell":470,"Els":12202,"Emi":805,"Emp":2669,"Enc":1212,"Eng":618,"Enr":1136,"Ent":1321,"Epi":667,"Es ":7626,"Equ":1084,"Era":4172,"Eri":725,"Ern":564,"Esc":3957,"Esg":1543,"Esl":432,"Esq":691,"Esp":7947,"Ess":459,"Est":16703,"Eti":592,"Eug":470,"Eus":540,"Eur":6546,"Exp":558,"Ext":492,"Exè":520,"Fab":658,"Fal":839,"Far":1147,"Fed":2158,"Fel":1485,"Fer":3479,"Fes":1048,"Fig":804,"Fil":2790,"Fin":1628,"Fir":475,"Fit":451,"Fis":548,"Fla":1182,"Bèl":724,"Flo":1914,"Bén":1073,"Fon":1799,"For":4330,"Fou":4771,"Fra":10976,"Fre":2000,"Fri":1067,"Fro":1171,"Fun":1040,"Fut":785,"Gab":852,"Gai":1316,"Gam":644,"Gal":3763,"Gan":792,"Gas":726,"Gar":5151,"Gau":545,"Gav":516,"Gel":633,"Geo":1768,"Gen":3908,"Ger":2784,"Gha":668,"Gil":834,"Gio":742,"Gin":447,"Gir":2867,"Gla":555,"Cès":490,"Gol":772,"Gon":924,"Gor":807,"Got":462,"Gov":915,"Gra":8029,"Gre":1705,"Gri":568,"Gro":823,"Gru":812,"Gua":2233,"Gue":3552,"Gui":3733,"Grè":1211,"Còr":1485,"Ha ":1586,"Hab":849,"Hai":525,"Hal":1026,"Han":1601,"Ham":1069,"Har":1914,"Has":482,"Hau":489,"Hei":662,"Hel":854,"Hen":1083,"Her":3308,"Hes":521,"II ":11551,"Hi ":667,"IL ":496,"Hil":810,"Him":457,"Hip":538,"His":1995,"IN ":545,"Déu":993,"Hol":1076,"Hon":1356,"Hom":721,"Hor":1537,"Hos":973,"Hou":427,"IV ":2284,"IX ":2178,"Hug":456,"Hum":647,"Ibè":454,"Igu":523,"Ill":2804,"Imp":2310,"Ind":3042,"Inc":1174,"Inf":899,"Int":5298,"Ins":1704,"Inv":4574,"Ira":1225,"Irl":972,"Isa":677,"Isl":998,"Isr":606,"Isè":610,"Ità":2371,"Jac":1249,"Jan":842,"Jam":1007,"Jap":1788,"Jar":512,"Jav":647,"Jau":1327,"Jea":1258,"Jer":1024,"Jes":979,"Joa":4658,"Joc":2904,"Joh":2200,"Jon":498,"Jor":1675,"Jos":5343,"Jov":701,"Jua":1133,"Jul":1845,"Jun":1207,"Jur":1022,"Jus":1158,"Fór":1750,"Kal":667,"Kan":1326,"Kas":646,"Kar":1823,"Kat":730,"Ken":1039,"Ker":512,"Kha":2066,"Kin":628,"Kir":437,"Kon":521,"Kur":796,"La ":44734,"Lab":732,"Lac":602,"Lag":454,"Lan":2502,"Lam":739,"Lar":952,"Lat":473,"Las":877,"Lau":991,"Le ":1701,"Lea":612,"Leg":479,"Lei":605,"Len":488,"Leo":1055,"Let":618,"Les":8179,"Lib":1275,"Lic":729,"Lie":545,"Lig":440,"Lin":1292,"Lim":1985,"Lit":1358,"Lis":662,"Liv":631,"Lla":1420,"Lle":5009,"Lli":3849,"Llo":2525,"Llu":2662,"Lo ":509,"Loc":935,"Loi":3900,"Lon":2521,"Lor":1079,"Los":1387,"Lou":1368,"Lud":436,"Luc":2154,"Lui":841,"Lux":520,"Mac":1977,"Mad":4850,"Mag":1918,"Mai":1400,"Mah":1236,"Maj":817,"Mal":5341,"Man":7072,"Map":489,"Mas":2898,"Mar":24205,"Mau":1281,"Mat":2199,"May":903,"Max":566,"Med":2321,"Meg":499,"Mem":552,"Mel":1235,"Men":2299,"Mes":1909,"Mer":2220,"Meu":791,"Met":1461,"Mic":1967,"Mig":3773,"Mik":435,"Mil":2264,"Min":2581,"Miq":1084,"Mis":977,"Mir":1383,"Mit":1306,"Mod":572,"Moi":445,"Mol":1866,"Mon":10135,"Mos":3258,"Mor":3228,"Mou":615,"Mot":612,"Mov":719,"Muh":1530,"Mul":514,"Mun":3699,"Mus":3011,"Mur":1125,"Nad":543,"Nac":3718,"Nag":474,"Nan":670,"Nam":469,"Nar":887,"Nat":1765,"Nas":943,"Nav":1541,"Neg":551,"Nep":465,"Neu":723,"New":1011,"Nic":1721,"Nig":501,"Nil":485,"Nin":583,"No ":949,"Nob":765,"Nog":546,"Nom":506,"Nor":7475,"Not":558,"Nov":3754,"Nou":976,"Obr":542,"Occ":1964,"Oce":633,"Oes":458,"Ois":1085,"Oli":1151,"Or 
":1144,"Ope":627,"Olí":2742,"Ord":1202,"Org":811,"Ori":3250,"Orl":432,"Orn":702,"Ort":537,"Osc":592,"Oso":563,"Pac":1255,"Pal":5917,"Pak":720,"Pan":1931,"Pam":450,"Pap":987,"Par":12424,"Pat":5920,"Pas":3293,"Pau":2314,"Ped":953,"Pel":908,"Pen":2445,"Per":8284,"Paí":2486,"Pet":1992,"Paï":1367,"Pha":593,"Phi":793,"Pic":997,"Pie":1705,"Pil":517,"Pin":1362,"Pir":5077,"Pit":507,"Pis":643,"Pla":4255,"Ple":573,"Pli":619,"Pob":891,"Pod":515,"Poi":546,"Pol":3110,"Pon":2233,"Pom":919,"Pop":1499,"Por":3188,"Pot":747,"Pos":637,"Pra":2096,"Pre":5132,"Pri":3663,"Pro":6882,"Psi":498,"Pto":509,"Pub":1052,"Pue":644,"Pui":963,"Prí":433,"Puè":630,"Màl":503," ال":969,"Mèx":2060,"Qua":1293,"Que":2037,"Qui":1884,"Món":857,"SA ":697,"Rad":711,"Raf":698,"Raj":585,"Rai":671,"Ral":542,"Ran":696,"Ram":2345,"Ras":506,"Nàp":561,"Rea":888,"Red":459,"Rec":1201,"Reg":5159,"Rei":2533,"Ren":1400,"Rep":4891,"Res":1790,"Rev":1903,"Reu":922,"Rib":1454,"Ric":1775,"Rie":437,"Rin":1512,"Rip":435,"Rio":773,"Riv":653,"Riu":648,"SO ":429,"Rob":1500,"Rod":1354,"Roc":1550,"Rog":460,"Roi":1301,"Ron":515,"Rom":4973,"Ros":3668,"Rou":702,"Roy":493,"SS ":576,"Rub":511,"Rus":893,"Sac":904,"Sab":1266,"Sad":437,"Sag":828,"Sai":6861,"Sam":1360,"Sal":4902,"Sao":1292,"San":18549,"Sar":3138,"Sau":1050,"Sat":669,"Sav":1601,"Sax":634,"Se ":1121,"Sca":430,"Sch":1869,"Sco":614,"Sec":816,"Seb":701,"Seg":4635,"Sei":460,"Sem":755,"Sel":1581,"Sen":4433,"Sep":474,"Ser":3924,"Set":597,"Sev":1041,"Sha":1754,"She":554,"Shi":788,"Sic":1112,"Sib":491,"Sie":912,"Sid":494,"Sig":466,"Sim":1293,"Sil":985,"Sin":1608,"Sis":732,"Sir":914,"Sit":741,"Soc":2681,"Sob":484,"Sof":504,"Som":1707,"Sol":2641,"Son":1522,"Sor":783,"Sou":790,"Sot":432,"Sov":997,"Spa":467,"Sta":2211,"Ste":1230,"Sto":592,"Str":1325,"Stu":700,"Sub":526,"Sud":4066,"Sum":592,"Sul":1193,"Sun":586,"Sup":1168,"Sur":766,"Suè":768,"Suï":870,"UA ":676,"Tai":859,"Tal":909,"Tan":1153,"Tam":2163,"Tar":2953,"Tau":512,"Tax":694,"Tea":1333,"Tel":1110,"Ten":1269,"Tem":675,"Teo":751,"Ter":5037,"Tes":748,"Tex":474,"Tha":531,"The":4710,"Thi":492,"Tho":1030,"Tib":574,"Tin":475,"Tim":819,"Tir":690,"Tit":744,"Pèr":751,"Pér":494,"Tol":1537,"Ton":536,"Tom":896,"Top":6603,"Tor":4551,"Tot":919,"Tos":566,"Tou":1405,"Tra":3043,"Tre":2743,"Tri":2234,"Tro":1124,"Tun":1064,"Tur":2311,"Txe":473,"Ucr":518,"VI ":1796,"Un ":5869,"Una":3414,"Uni":14119,"Urg":1121,"Uru":766,"Va ":8704,"Val":12827,"Van":1324,"Var":1091,"Veg":497,"Vel":1558,"Ven":2761,"Ver":2949,"Via":539,"Vid":677,"Vic":2194,"Vie":1915,"Vig":450,"Vil":5935,"Vin":1052,"Vir":789,"Vit":683,"Vis":651,"Viv":524,"Vol":1052,"Vos":641,"Rús":1325,"Wal":1101,"Was":531,"War":644,"Wei":623,"Wes":827,"XI ":700,"Wil":1831,"Win":668,"Wol":473,"Wor":825,"Sír":544,"XV ":828,"XX ":1330,"Són":982,"Xar":475,"Xil":666,"Xin":1974,"Té ":2031,"Tít":634,"Yon":610,"Yor":1648,"Zel":575,"丁 ":477,"三 ":646,"aC ":7161,"aTo":1030,"ab ":4227,"ac ":6540,"ad ":6641,"abb":488,"aba":10954,"abd":578,"abe":3580,"abi":32079,"abl":11206,"abo":3927,"abr":8859,"abs":2070,"acT":607,"abu":1194,"ae ":11093,"aca":8202,"acc":6873,"ace":6587,"aci":87894,"ach":4569,"ack":1414,"acl":1203,"aco":6238,"acq":477,"acs":2095,"acr":2622,"acu":2617,"act":26858,"af ":1940,"ada":73814,"add":677,"ade":24344,"adh":764,"adj":840,"adi":12618,"adm":4187,"ado":31930,"adr":7171,"adq":601,"adv":1601,"adu":4495,"ag ":1296,"ael":2215,"aen":718,"aes":793,"aer":1540,"aet":471,"ah ":2535,"afa":2091,"aff":697,"afe":1992,"afi":3879,"afl":795,"abè":473,"afo":1441,"afr":2220,"ai 
":7608,"aga":6999,"age":3272,"acà":746,"agd":566,"agi":3767,"agh":680,"agm":689,"acè":540,"agl":465,"ago":11990,"agn":4049,"ací":903,"agr":4590,"agu":4082,"aha":2319,"adà":1479,"ahi":1122,"adè":1215,"ahm":588,"aho":687,"ahr":545,"adí":1006,"ak ":1321,"aia":1969,"aic":955,"aid":1169,"aig":9622,"aim":583,"ail":1889,"aio":474,"ain":11965,"ais":4814,"air":4856,"aiu":570,"ait":690,"aix":15023,"al ":239788,"aja":1558,"aje":644,"ajo":5914,"aju":1645,"am ":6592,"aka":1683,"ake":1123,"aki":1448,"akh":1508,"agè":1166,"ako":697,"aku":618,"agó":1804,"an ":66807,"alb":1945,"ala":24734,"ald":4936,"alc":4171,"alf":1579,"ale":25255,"alh":840,"alg":6863,"ali":38592,"all":34050,"alk":512,"alm":23959,"alp":1373,"alo":6503,"alt":21186,"als":45950,"alv":3401,"alu":13202,"alz":450,"ao ":1801,"ama":13473,"amb":79002,"ame":83485,"amf":804,"ami":6845,"amm":2154,"amo":4971,"amn":458,"amp":13969,"ams":1153,"amu":1923,"ap ":11550,"ana":43743,"and":29111,"anc":71924,"ane":15291,"anh":850,"ang":20211,"anj":1451,"ani":23420,"anl":536,"ank":1770,"ann":5057,"anm":616,"ano":18530,"anr":592,"anq":2008,"ant":129567,"ans":27548,"anv":2326,"anu":5845,"anx":1525,"anz":1554,"al·":4737,"any":83908,"aq ":723,"aon":2010,"aor":467,"ar ":93910,"apa":10585,"alà":12459,"ape":4186,"aph":689,"api":8079,"apl":3094,"alè":3173,"alç":1033,"app":589,"apo":4636,"apr":3633,"alí":1466,"apt":2457,"aps":822,"alò":521,"apu":1311,"aló":481,"alú":527,"as ":20122,"amà":1509,"amè":932,"amí":21249,"aqu":19443,"amó":1134,"at ":205988,"arb":6120,"ara":30683,"ard":16741,"anà":1635,"arc":29113,"are":20898,"arh":634,"arg":7849,"ari":46452,"arl":9250,"anè":1527,"anç":11606,"ark":2032,"arn":6199,"arm":5748,"arp":1136,"aro":5216,"arr":31135,"arq":7113,"aní":596,"art":92161,"ars":9714,"arv":792,"anò":614,"aru":1141,"arx":3639,"arz":535,"ary":1313,"au ":10283,"asa":10565,"asc":12981,"ase":5138,"asi":8686,"ash":2159,"ask":721,"asm":919,"asl":572,"aso":1870,"asq":958,"asp":2366,"ass":25671,"asu":572,"ast":24258,"ata":40935,"ate":23772,"ath":2609,"atg":10074,"atj":940,"ati":22473,"atl":1905,"apç":564,"atm":559,"atp":518,"ato":7197,"atr":17757,"att":1995,"ats":36813,"atu":14371,"atx":730,"apó":1638,"atz":1039,"aw ":507,"auc":1243,"aud":3138,"aug":1634,"aum":2063,"aul":5438,"aun":1108,"aus":6631,"aur":6501,"aut":10121,"auv":582,"aux":1245,"ax ":1119,"ava":21112,"arà":3336,"ave":9872,"avi":9915,"arè":1555,"arç":4698,"avo":3437,"arí":6995,"avu":1833,"aró":868,"ay ":3903,"awa":2159,"az ":1002,"axa":436,"atà":637,"axi":822,"atè":1859,"axo":1134,"atí":4152,"atò":3244,"ató":484,"aya":1481,"aye":1242,"ayo":535,"ayn":542,"ays":433,"之 ":486,"aza":1604,"azi":1649,"avé":2003,"azo":649,"azz":706,"axò":585,"ba ":12235,"bab":2635,"bad":3682,"bac":2347,"bag":527,"bai":3872,"bal":9163,"ban":11239,"bam":526,"bar":12405,"bat":6461,"bas":9205,"bav":489,"bd ":1382,"be ":3383,"bcl":710,"bdi":878,"bea":692,"bec":772,"beg":545,"bei":604,"bel":4237,"ben":4316,"ber":17069,"bet":1760,"bes":2688,"bfa":896,"bi ":2973,"bib":722,"bia":4295,"bid":1305,"bic":2896,"bie":1762,"big":444,"bil":5829,"bin":3024,"bio":2560,"bir":1703,"bit":31890,"bis":3356,"biz":656,"bje":4022,"bn ":4422,"bla":13353,"ble":25765,"bli":16768,"blo":1020,"bo ":825,"boa":461,"boc":1058,"bol":8535,"bon":3318,"bom":502,"bor":5228,"bot":2150,"bos":3156,"bou":954,"bs ":2550,"bra":15655,"bre":57548,"bri":13610,"bro":2609,"bru":988,"bu ":3185,"bse":1373,"bso":827,"cTo":715,"bst":2632,"bte":1652,"bti":549,"bud":487,"buc":1582,"bue":692,"bui":1777,"bul":1896,"bun":1101,"bum":1003,"bur":5072,"but":2804,"bus":3278,"by 
":676,"buï":1006,"ca ":91703,"cac":9604,"cab":3071,"cad":13917,"cai":1639,"cam":11096,"cal":20680,"can":23151,"cap":21105,"cas":12109,"car":29335,"cau":2881,"cat":33246,"cav":2671,"ce ":4275,"cca":584,"cce":4115,"cci":32492,"cea":2192,"ceb":520,"ced":4416,"cei":1155,"caç":439,"cel":24876,"cen":17603,"cep":5727,"ces":14952,"cer":10622,"cet":1368,"ch ":5829,"ci ":10536,"ccé":569,"cha":4666,"che":6600,"chi":4008,"chl":697,"chn":490,"cho":1760,"chr":636,"cht":1042,"chs":468,"chu":1319,"chw":564,"chy":514,"ck ":3785,"cia":71294,"cic":3669,"cie":28020,"cid":8886,"ceà":1020,"cif":2343,"cim":1374,"cil":3459,"cio":42238,"cin":9175,"cip":70269,"cis":6308,"cir":3899,"ciu":18180,"cit":15001,"civ":1365,"cm ":663,"cke":1017,"cla":10996,"cle":7657,"cli":5636,"clo":9026,"clu":3084,"co ":5558,"cma":564,"cià":5744,"ciè":1687,"ció":93089,"cni":2204,"cno":1079,"coa":1145,"coc":1430,"cob":3849,"coe":611,"cod":1781,"cog":1054,"coi":889,"coh":497,"com":94267,"col":18694,"coo":976,"con":84469,"cop":3499,"cos":13573,"cor":15898,"cou":2376,"cot":1941,"cov":689,"clà":1541,"clò":1640,"cs ":28127,"cqu":601,"cra":3589,"cre":15748,"cri":20672,"cro":4644,"cru":742,"cta":11278,"cte":31381,"cti":13744,"cto":10968,"ctr":4907,"ctu":18407,"cua":740,"cuc":922,"cud":997,"cui":2876,"cum":3049,"cul":26044,"cun":1991,"cup":3866,"cus":3431,"cur":7121,"cut":6363,"crà":2375,"crí":1017,"crò":1272,"cy ":895,"ctà":449,"ctò":6507,"da ":106799,"dad":7741,"dac":2392,"dae":7362,"dag":1161,"dai":933,"dal":8337,"dan":7132,"dam":4466,"dap":1308,"dar":8439,"dat":8541,"das":1597,"dav":1950,"dau":578,"de ":731066,"ddi":693,"deb":713,"dea":1779,"ded":2420,"dec":4514,"def":4724,"dee":790,"deg":1083,"dei":4464,"del":223902,"den":29595,"dem":4208,"dep":44739,"deo":3384,"der":25618,"det":3669,"des":76584,"dev":3022,"deu":2190,"dex":508,"dez":729,"di ":11122,"dge":655,"dha":462,"dib":1304,"dia":26304,"did":2342,"dic":20315,"dif":12060,"die":5221,"dig":1614,"dil":1360,"din":14926,"dim":2318,"dip":2139,"dio":4697,"dir":15836,"diq":427,"dit":7813,"dis":34050,"div":14053,"diu":3152,"dja":992,"do ":6850,"dià":1566,"dmi":4472,"diò":459,"dob":2439,"doc":4823,"dol":3039,"don":12077,"dom":3069,"dop":1469,"dor":31450,"dot":1997,"dos":7697,"dov":779,"dou":475,"dox":667,"dow":451,"ds ":13269,"dqu":610,"dt ":624,"dra":5966,"dre":20197,"dri":8059,"dro":3178,"du ":1931,"dua":2037,"duc":9457,"due":6092,"dui":1768,"dul":1590,"dun":583,"dur":12174,"dut":440,"dus":2391,"drà":793,"dré":463,"dvo":1486,"drí":514,"dy ":834,"dwa":684,"dwi":444,"duï":2716,"ea ":11116,"eb ":1298,"ec ":10740,"eac":2447,"eae":1662,"ead":3245,"eag":611,"eal":7656,"ean":3211,"eas":681,"ear":4154,"eau":2217,"eat":7817,"ed ":3555,"eba":5649,"ebe":2525,"ebi":1137,"ebl":1185,"ebo":1010,"ebr":12552,"ebs":543,"ebu":1509,"ee ":1159,"eca":4354,"ecc":8576,"ece":4741,"eci":8492,"ech":1846,"eck":770,"ecl":2812,"eco":9587,"ecn":1045,"ecs":1149,"ecr":2148,"ecu":5052,"ect":34496,"ef ":453,"eda":7510,"ede":8641,"edi":20322,"edo":3166,"edr":3881,"edu":2561,"eg ":3638,"eed":448,"eei":505,"een":1291,"ees":2623,"eer":1377,"eet":440,"efa":810,"efe":8879,"efi":3643,"efl":540,"ebé":837,"efo":1656,"efr":499,"efu":502,"ei ":11603,"ega":18566,"ege":4080,"ecà":910,"egi":53966,"egl":11995,"ego":14924,"egn":5842,"ecí":1108,"egr":5884,"egu":26013,"ehi":1324,"edè":1034,"edò":1193,"aç ":1156,"ek ":1125,"eia":3285,"eic":714,"eie":670,"eid":2015,"eig":2063,"eim":1263,"eil":1187,"ein":4918,"eis":3439,"eir":2304,"eit":2121,"eix":36438,"el ":342397,"eja":3154,"ejo":757,"em ":5051,"aça":4285,"aço":547,"egí":471,"en 
":198166,"elb":478,"ela":12101,"egü":997,"eld":1138,"elf":710,"ele":21700,"elh":575,"elg":974,"eli":11221,"ell":53060,"aèl":445,"elm":1163,"elo":14812,"elt":1433,"els":102578,"elv":2210,"elu":847,"eo ":2000,"ema":20188,"emb":23594,"eme":18533,"emi":9954,"emm":981,"emo":6063,"emn":559,"emp":21619,"ems":837,"emu":739,"ep ":5633,"enb":1598,"ena":31909,"end":15928,"enc":25792,"enf":1661,"ene":32525,"enh":951,"eng":12438,"enj":1041,"eni":43865,"enl":1490,"enk":437,"enn":4327,"eno":9400,"enr":1452,"enq":679,"ent":250920,"ens":21295,"env":6112,"enu":1527,"enz":2223,"eny":10523,"el·":13441,"eoc":722,"eod":852,"eog":1412,"eoj":1466,"eom":1010,"eol":3046,"eon":1964,"eop":812,"eos":1173,"eor":4067,"eot":635,"er ":189445,"epa":44596,"epc":812,"epe":4667,"eph":1592,"epi":1711,"elè":1649,"epp":440,"epo":1017,"epr":5519,"elí":752,"ept":4227,"eps":459,"epu":1513,"es ":411788,"emà":2705,"aís":5446,"equ":6099,"emò":1396,"et ":25747,"erb":3837,"era":74760,"erd":5652,"enà":736,"erc":13759,"erf":4507,"ere":32246,"erg":6703,"eri":45981,"enè":1818,"erl":3041,"enç":6985,"erk":557,"ern":23953,"erm":22973,"erp":2962,"ero":10444,"err":35859,"erq":1670,"ení":2568,"ert":22614,"ers":42235,"esT":439,"erv":13334,"enò":458,"eru":2316,"erz":433,"ery":712,"eu ":35559,"esa":19441,"esc":32880,"esb":561,"ese":21436,"esd":2452,"esg":3019,"esf":926,"esi":12819,"esh":2249,"esm":4178,"esl":1048,"aïl":606,"eso":4807,"aïn":1010,"esn":735,"esq":2957,"esp":45950,"ess":23338,"aïs":2742,"esu":5094,"est":121230,"ev ":1270,"eta":26671,"etc":1311,"ete":15994,"eth":1185,"etg":1896,"eti":12720,"etl":540,"etn":724,"etm":769,"eto":4154,"etr":11973,"ett":2525,"ets":4387,"etu":846,"etx":432,"etz":908,"epú":4257,"ew ":1125,"aó ":858,"eua":1296,"aña":505,"euc":836,"eue":676,"eud":1880,"eug":783,"eui":585,"euj":610,"eum":762,"eul":512,"eun":1000,"eus":15496,"eur":6223,"eut":1400,"euv":478,"eux":1268,"ex ":2741,"eva":19578,"eqü":1946,"erà":2492,"eve":8878,"evi":8750,"erè":7517,"erç":777,"evo":5130,"erí":5636,"erò":8453,"eró":999,"erú":955,"ey ":4222,"ewa":613,"esè":852,"ez ":5223,"esú":770,"exa":3082,"età":1153,"exc":2327,"exf":551,"exe":4662,"exi":4985,"etè":606,"exp":6667,"ext":8657,"etò":822,"exu":806,"etó":2471,"exè":1087,"fa ":5715,"fab":2410,"fac":1990,"fae":522,"fal":2239,"fan":3597,"fam":21118,"far":2707,"fat":794,"fas":1245,"fav":856,"bà ":1286,"fe ":979,"ff ":607,"bàc":497,"bàn":471,"bàs":1680,"feb":4294,"fed":1684,"fec":3709,"feg":553,"fei":1130,"fel":956,"fen":4386,"fem":2144,"fer":21684,"fet":4235,"fes":7134,"feu":1380,"ffa":432,"ffe":606,"ffi":601,"fi ":1082,"fga":590,"fib":922,"fia":3829,"fid":612,"fic":28976,"fie":778,"fig":1714,"fil":14950,"fin":20246,"fir":1021,"fiq":1022,"fit":2051,"fis":1262,"fix":818,"bé ":21222,"fla":1357,"fle":1233,"fli":683,"flo":2541,"bèr":1217,"flu":3428,"bén":800,"bí ":554,"foc":841,"fol":1629,"fon":7008,"for":42476,"fot":1319,"fos":1763,"fou":50634,"fs ":758,"ft ":899,"fra":59098,"fre":3413,"fri":4659,"fro":3635,"fru":1196,"bó ":1017,"fug":662,"ful":1038,"fun":11935,"fur":649,"fut":4817,"fus":2525,"bú ":739,"ga ":19272,"gac":2467,"gad":8463,"gaf":564,"gai":1485,"gam":2445,"gal":6733,"gan":12273,"gas":3274,"gar":8028,"gau":743,"gat":6271,"gav":699,"cà ":4299,"gbi":427,"ge ":14223,"gda":596,"gdi":3267,"càl":452,"càn":1571,"càr":2496,"geg":571,"gei":1282,"gem":444,"gel":5071,"gaè":430,"geo":2538,"gen":27727,"ges":8582,"ger":8998,"geu":553,"get":1515,"gh ":1166,"gi 
":2290,"ggi":525,"gha":928,"ght":1061,"gia":10303,"gic":5745,"gie":1046,"gid":6384,"gim":1254,"gil":971,"gio":4566,"gin":10839,"giq":637,"gip":2151,"gis":5461,"gir":3350,"git":6862,"cè ":929,"gla":3610,"gle":14405,"gli":1448,"cèl":1465,"cèn":1040,"glo":1864,"cès":51222,"glu":480,"go ":4475,"gma":570,"gme":1573,"cés":2600,"gió":43533,"gna":5935,"gne":6704,"gni":3825,"gno":1760,"gny":1080,"goc":613,"gog":562,"gol":2625,"gon":15671,"gos":7535,"gor":5349,"got":978,"gov":6468,"glè":7577,"glé":4175,"gs ":2190,"cía":762,"cíc":646,"cíf":1693,"cíl":928,"gra":28539,"gre":13487,"gri":3099,"gnè":749,"gro":1831,"gru":9359,"gto":672,"có ":1079,"gua":16987,"gue":14100,"gud":7181,"gui":8806,"gum":532,"gul":3339,"gun":6066,"gus":2383,"gur":4581,"gut":13910,"grà":3970,"còc":650,"còl":466,"còn":1558,"còm":1025,"gré":941,"còp":598,"grí":718,"gy ":492,"gué":2153,"guè":693,"ha ":12835,"hab":29035,"had":849,"hac":947,"hae":966,"hah":490,"hag":1379,"hai":816,"hal":3284,"hak":573,"han":10626,"ham":3661,"hap":986,"har":8141,"hat":1256,"has":1344,"hav":4880,"hau":1739,"dà ":4550,"he ":9369,"dàr":741,"heb":937,"hea":599,"hec":618,"hei":1380,"hel":2633,"hen":2285,"hem":1268,"heo":584,"her":7245,"het":829,"hes":1479,"hi ":8328,"hib":968,"hia":1191,"hid":2139,"hic":1329,"hie":812,"hig":483,"hil":2191,"hik":471,"hin":3026,"him":1247,"hip":1365,"hio":574,"hir":1774,"hit":1226,"his":10997,"hiv":1397,"hn ":1390,"dèc":1241,"hle":576,"dèm":2035,"dèn":3518,"dès":2865,"ho ":1745,"hma":1072,"hme":479,"déu":772,"dí ":1171,"hod":538,"hoe":482,"hol":1392,"hon":3217,"hom":4461,"hop":470,"hor":4248,"hoq":607,"hot":684,"hos":1136,"hou":928,"dís":800,"ht ":1178,"hra":931,"hre":427,"hri":1581,"hro":903,"hry":475,"hu ":535,"htt":932,"dó ":1006,"hua":543,"hun":546,"hum":4995,"hur":1505,"hus":1570,"dòn":1756,"hy ":459,"hwa":576,"dón":1091,"dós":502,"hya":490,"hyl":446,"ia ":240700,"dús":667,"ib ":828,"ic ":66155,"iac":4872,"iab":1230,"iae":474,"iad":4471,"iag":1233,"iam":3348,"ial":34113,"ian":18657,"ias":1473,"iar":7010,"iat":8115,"eà ":1206,"id ":9736,"iba":5730,"ibe":5840,"ibi":2687,"ibl":5185,"ibn":4092,"ibo":751,"ibr":5119,"ibu":8678,"ie ":17464,"ica":99673,"icc":2791,"ice":5639,"ici":97310,"ich":5290,"ick":1279,"icl":5272,"ico":6687,"ics":16582,"icr":2098,"icu":3925,"ict":15934,"if ":436,"ida":28123,"ide":32628,"idg":508,"idi":6454,"ido":3414,"idr":1982,"ids":10903,"idu":2167,"ig ":10039,"ied":1379,"ieg":716,"iej":698,"iem":565,"iel":2750,"ien":17964,"ies":27602,"ier":8829,"ieu":1133,"iet":7033,"iev":1540,"ifa":1103,"ife":7155,"ifi":14910,"ibè":709,"ifo":4579,"ifr":661,"ifu":733,"ibú":572,"ii ":1495,"iga":11186,"ige":6095,"icà":3585,"igd":3266,"igi":17333,"igh":1474,"igm":515,"igl":1012,"igo":1893,"ign":8509,"icí":959,"igs":552,"igr":1725,"igu":11339,"icò":748,"iha":835,"idè":1574,"ik ":1476,"iid":759,"il ":18226,"ija":779,"ifò":1231,"im ":10274,"eça":920,"ika":1087,"ike":554,"iki":730,"ikh":444,"igè":690,"iko":916,"eçu":667,"in ":16856,"ilb":730,"ila":11336,"igü":807,"ild":1048,"ile":4850,"ilh":868,"ili":25409,"ill":37178,"ilm":879,"ilo":5127,"ilt":532,"ils":3166,"ilv":781,"ilu":637,"io ":7808,"ima":12326,"imb":1419,"ime":26750,"imf":554,"imi":7652,"imm":1734,"imo":8644,"imn":949,"imp":12072,"ims":3094,"imu":1172,"ip 
":3014,"ina":55244,"ind":13573,"inc":26929,"inf":7398,"ine":27327,"inh":911,"ing":18368,"ini":21752,"inl":745,"ink":797,"inn":1417,"ino":7464,"inq":1248,"int":40330,"ins":28968,"inv":3498,"inu":3882,"inz":527,"il·":2910,"iny":3648,"ioc":841,"iod":2266,"iog":1116,"iom":1458,"iol":8925,"ion":66529,"iop":546,"ios":3642,"ior":10193,"iot":1969,"iov":765,"ir ":38156,"ipa":13903,"ilà":1296,"ipc":1672,"ipe":1388,"ipi":56300,"ipl":2748,"ipp":1057,"ipo":1719,"ipr":1169,"ilí":1353,"ipt":7598,"ips":1386,"ilò":3165,"ipu":7068,"is ":48738,"imà":684,"iqu":16573,"it ":34854,"ira":12762,"inà":2459,"irc":3627,"ire":12121,"irg":707,"iri":14280,"irl":1148,"inè":2542,"irm":1510,"iro":5273,"irr":1094,"irt":776,"irs":803,"iru":1243,"inó":1010,"iu ":23871,"isa":3230,"isc":7357,"isb":2891,"ise":3763,"isd":800,"isf":544,"isi":12145,"ish":2116,"ism":11247,"isl":2580,"iso":2263,"eïn":1531,"isn":1192,"isq":801,"isp":8390,"iss":11198,"isu":888,"ist":91367,"ita":105273,"itc":439,"ite":18961,"ith":1434,"itg":494,"itj":6011,"iti":5407,"ipè":1346,"ito":13878,"itr":2127,"ipí":486,"itt":1867,"its":13586,"ipò":666,"itu":63810,"itx":1409,"itz":26586,"ity":1073,"eó ":1479,"iue":1082,"iud":581,"ium":2617,"ius":11241,"iur":6384,"iut":18717,"ix ":29909,"iva":16973,"irà":1120,"ive":21240,"ivi":15764,"eòl":734,"ivo":762,"eòr":820,"ivu":434,"iwa":449,"isò":490,"iz ":983,"ixa":7163,"ità":9523,"ixe":14602,"ixi":2409,"itè":770,"ixo":2617,"ixt":502,"itò":478,"iya":1218,"iyy":518,"iza":2122,"ize":674,"izi":565,"izo":617,"ixí":2870,"ixò":1238,"ja ":9811,"jab":521,"jad":1156,"jac":943,"jan":5594,"jam":597,"jap":2013,"jar":2907,"jat":1290,"jas":516,"jec":6374,"jer":640,"ji ":775,"fèr":741,"jo ":1478,"joc":3043,"jol":501,"jor":5738,"jos":562,"jov":1333,"jou":428,"fíc":3185,"fís":2284,"jud":1724,"jue":850,"jug":3096,"jul":4737,"jun":11838,"jur":2180,"jut":652,"jus":1031,"fòs":737,"fòr":1399,"fór":670,"ka ":3378,"kal":645,"kan":1255,"kar":1252,"kat":459,"ke ":1535,"gàn":1108,"gàr":525,"kel":464,"ken":917,"kes":438,"ker":1269,"ket":532,"kh ":635,"ki ":2286,"kha":2001,"kil":538,"kin":1440,"kis":1164,"km ":4428,"gèn":10476,"gèr":686,"ko ":893,"kon":459,"km²":1350,"kot":435,"kov":432,"ks ":761,"kra":491,"ku ":504,"gó ":1930,"kur":755,"ky ":488,"la ":366608,"lab":3695,"lad":10085,"lac":17768,"laf":1186,"güe":3838,"lae":632,"lah":1156,"lag":2894,"laj":467,"güi":457,"lai":5096,"lal":1564,"lan":35918,"lam":7327,"lap":841,"lar":27534,"lat":19752,"las":9334,"lav":5042,"lau":6114,"law":647,"lay":1456,"ld ":2920,"lba":2224,"lbe":2433,"lbi":585,"lbo":638,"lbu":1332,"le ":44349,"lca":2167,"lce":653,"lci":863,"lco":1607,"lcu":700,"lf ":1584,"lda":1491,"hàb":605,"lde":3739,"ldi":977,"hàm":864,"ldo":785,"ldr":572,"leb":5789,"lea":4091,"led":1685,"lec":11774,"lef":726,"leg":10347,"lej":477,"lei":5536,"lel":975,"laç":2967,"len":22888,"lem":19927,"lep":867,"leo":2123,"ler":12983,"güí":1062,"let":9344,"les":102946,"lev":3996,"leu":2726,"lex":3722,"lez":616,"ley":1273,"lfa":1785,"lfo":1309,"lfr":505,"hât":765,"li ":13660,"lga":2207,"lcà":614,"lge":809,"lgi":1101,"lgo":616,"lgr":600,"lgu":6098,"lha":1103,"lhe":555,"lho":784,"lib":7028,"lia":36207,"lid":6442,"lic":25988,"lif":3375,"lie":4063,"lig":7319,"lil":1149,"lik":909,"lin":14087,"lim":5858,"lip":2949,"lio":8726,"lir":2152,"liq":1148,"lit":39579,"lis":20036,"liv":1295,"leò":554,"liu":3960,"lix":612,"leó":1184,"liz":562,"ll ":39351,"lm ":1385,"lla":48510,"lle":42354,"lli":19175,"llo":15655,"lls":9306,"llu":4700,"lly":1261,"lo 
":5465,"lma":3091,"lià":4545,"lme":22342,"lmi":745,"lmo":863,"liò":1277,"lob":2604,"lod":773,"loc":12848,"loe":1134,"log":10645,"loi":800,"lol":567,"lon":19003,"lom":4084,"lop":2384,"lor":17064,"loq":527,"lot":4665,"los":9416,"lov":1246,"lou":3952,"lpa":437,"llà":2875,"lpi":844,"lph":454,"llé":435,"llè":1476,"lps":2560,"lló":3244,"ls ":175094,"lqu":497,"lt ":13361,"lu ":594,"lsa":2423,"lse":2148,"lsi":715,"lso":1025,"lst":785,"lta":12959,"lte":4554,"lti":6082,"lto":1778,"lts":3121,"ltr":12243,"ltu":9528,"lub":3271,"luc":4590,"lue":1692,"lug":599,"lui":2022,"lul":2008,"lun":11448,"lum":4460,"lup":4423,"lur":1356,"lut":2289,"lus":5945,"luv":436,"lux":666,"lva":4670,"lve":1388,"lvi":1149,"ly ":1812,"lwa":457,"lsà":1169,"ltà":475,"luè":981,"luí":1350,"lvè":457,"ma ":38293,"mb ":57615,"mac":7166,"mad":8615,"mag":4837,"mai":6164,"maj":4814,"mam":1722,"mal":9961,"man":37306,"map":527,"mas":7207,"mar":29309,"mau":502,"mat":23089,"mav":1042,"maz":573,"ià ":17352,"mba":4524,"mbd":669,"mbe":2096,"mbi":6355,"mbl":3299,"mbo":2954,"mbr":23851,"mbu":1912,"me ":21969,"iàc":438,"iàr":461,"iàt":728,"iàs":758,"mea":647,"mec":1479,"med":6233,"mei":2249,"mem":5597,"mel":3662,"men":163114,"mes":20454,"mer":28498,"meu":1125,"met":15043,"mex":577,"mfi":799,"mbé":16617,"mi ":4707,"mia":9327,"mic":14032,"mie":1315,"mid":3169,"mig":2506,"mil":12628,"mio":566,"min":22171,"miq":948,"mis":7181,"mir":3272,"mit":14514,"mm ":453,"ièn":2714,"ièr":1466,"ièt":1155,"ièv":447,"mo ":1578,"mma":3679,"mme":3188,"mmi":630,"mmo":1110,"mmu":872,"mp ":3415,"mna":954,"mne":950,"mni":680,"mno":552,"moc":3211,"mob":1740,"mod":6269,"mog":841,"mom":1280,"mol":10435,"mon":20326,"mos":4856,"mor":11845,"mou":1279,"mot":4140,"mov":2586,"mpa":7026,"mpe":12187,"mph":615,"mpi":8423,"mpl":13140,"mpo":20484,"mpr":9709,"mpt":6437,"mps":5011,"mpu":1876,"ms ":6835,"nTo":847,"mst":734,"mta":3202,"mte":3651,"ió ":158662,"mud":478,"mul":5494,"mun":70097,"mus":5896,"mur":1824,"mut":620,"iòd":522,"iòf":670,"iòp":1205,"my ":586,"ión":1227,"iós":1780,"na ":220961,"nc ":7524,"nab":616,"nad":17767,"nac":13835,"nae":935,"nag":1094,"nai":2426,"nal":38916,"nak":484,"nan":5493,"nam":5460,"nap":613,"nar":17101,"nat":30685,"nas":9688,"nav":3059,"nau":2292,"naz":561,"nay":822,"jà ":1387,"nd ":7816,"nbe":579,"nbu":1232,"ne ":22377,"nca":14539,"nce":16829,"nch":3145,"nci":64974,"ncl":9885,"nco":4144,"ncr":1949,"ncs":1443,"ncu":1505,"ncy":517,"nda":23869,"nde":16620,"ndi":19292,"ndo":6191,"nds":943,"ndr":11684,"ndu":3915,"ng ":11247,"neb":984,"nea":2648,"ned":2947,"nec":3698,"nef":504,"nee":1092,"neg":18439,"nei":6432,"nel":3597,"nen":12397,"nem":2849,"neo":1410,"ner":33907,"net":5899,"nes":34659,"nev":447,"neu":7424,"nex":705,"nez":789,"ney":938,"nfa":1307,"nfe":3228,"nfi":1231,"nfl":2773,"nfo":4729,"nfr":1766,"ni ":20144,"nga":4787,"nge":7038,"ngh":831,"ngi":4881,"ngl":12728,"ncè":50834,"ngo":3054,"ngr":3118,"ngt":766,"ngs":1374,"ngu":13250,"nha":1988,"nhe":494,"ndà":979,"ndé":554,"ndè":4617,"ndí":543,"nk ":1042,"ndú":819,"nib":664,"nia":53223,"nid":9692,"nic":86188,"nif":4650,"nie":3921,"nig":794,"nil":1584,"nik":481,"neç":754,"nin":4394,"nim":9359,"nip":464,"nio":2632,"nir":5941,"niq":1948,"nit":24686,"nis":21391,"niv":8065,"niu":1270,"nja":2333,"njo":868,"nju":4341,"nka":688,"nki":503,"nn ":1976,"ngü":3506,"nla":582,"nll":1177,"no 
":17934,"nma":725,"nià":880,"nme":466,"nió":3827,"nna":2687,"nne":6803,"nni":2838,"nno":1064,"nob":2525,"nod":871,"noc":1416,"nof":494,"nog":1211,"noi":1668,"nol":3970,"non":1881,"nom":44404,"nop":1021,"nor":21716,"not":4432,"nos":4014,"nov":10569,"nou":2451,"nlà":603,"ns ":93685,"nqu":5542,"nt ":218860,"nre":1636,"nri":1763,"nu ":885,"nsa":6315,"nsc":2889,"nsf":1150,"nse":13629,"nsh":524,"nsi":13645,"nsl":626,"nsk":598,"nsm":857,"nsp":2161,"nso":4459,"nst":17890,"nsu":4044,"nta":57284,"ntc":442,"ntb":438,"nte":45205,"ntg":483,"nti":34736,"nth":1782,"ntm":1132,"nto":12026,"ntp":552,"nts":52368,"ntr":44179,"ntu":4635,"nua":2471,"nuc":3125,"nue":1727,"nul":593,"nun":1667,"num":4207,"nur":444,"nut":1033,"nus":3661,"nva":693,"nve":10382,"nvi":4155,"nvo":5353,"ny ":54383,"nz ":509,"nxa":1082,"ntà":2307,"nté":1802,"ntè":783,"ntí":4716,"ntó":8247,"nya":32997,"nye":6811,"nyi":2590,"l·l":30097,"nyo":10078,"nys":9599,"nza":1097,"nze":1172,"nzi":1387,"nzo":563,"nyà":880,"oa ":1401,"ob ":526,"oc ":11621,"oac":453,"oad":649,"oal":1685,"oan":4480,"oaq":609,"oat":570,"od ":1121,"oba":13189,"obe":6752,"obj":3414,"obi":3723,"obl":18679,"obo":674,"obr":20373,"obt":2022,"obs":1595,"obu":764,"oe ":493,"oca":16830,"occ":10555,"oce":9769,"oci":14079,"och":2121,"ock":2140,"ocl":1591,"oco":3039,"ocs":5711,"ocr":3188,"ocu":5515,"oct":6615,"of ":4550,"oda":2665,"ode":17239,"odi":7067,"odo":3984,"odr":1784,"odu":8140,"og ":698,"oem":738,"oel":520,"oen":1465,"oes":8459,"oet":3024,"ofa":494,"off":580,"ofe":5675,"ofi":6735,"ofo":434,"ofr":707,"oft":679,"ofu":793,"oi ":1289,"oga":2429,"oge":2507,"ogi":7936,"océ":1787,"ocè":1333,"ogl":1025,"ogo":1358,"ogn":1130,"ogr":9571,"ogu":1863,"oha":1268,"ohe":725,"ohi":598,"ohn":1581,"oho":474,"ok ":646,"oia":2098,"oic":468,"oid":2341,"oig":744,"oin":2067,"ois":3399,"oir":4439,"oit":531,"oix":942,"ol ":30372,"oja":1081,"oje":1839,"ojo":1630,"om ":61645,"on ":37263,"ola":15384,"old":2685,"olc":987,"olf":1900,"ole":8236,"oli":14768,"oll":8132,"olk":744,"olm":858,"olo":16547,"olt":11680,"ols":5388,"olu":10825,"oly":514,"oma":23743,"omb":8927,"ome":25817,"omi":12024,"omm":3091,"omo":5351,"omp":30868,"oms":1766,"omu":8862,"omt":6651,"op ":5673,"ona":74755,"ond":10218,"onc":9281,"onf":5042,"one":36322,"ong":9559,"onj":5102,"oni":20927,"onn":3651,"ono":7822,"onr":882,"onq":1433,"ont":36749,"ons":68141,"onv":3868,"onu":4027,"onz":1423,"ol·":4740,"ony":2654,"ood":947,"ook":705,"ool":561,"oon":456,"oop":682,"oor":1244,"oot":626,"or ":70900,"opa":6157,"olà":508,"ope":9789,"oph":1988,"opi":6583,"olè":1083,"opl":744,"olç":548,"opo":12014,"opr":1132,"olí":15446,"opt":2194,"ops":1309,"olò":6420,"opu":4511,"olú":505,"os ":45596,"omà":6487,"omé":2932,"omè":1744,"oqu":4205,"omò":1267,"ot ":16595,"m² ":1388,"omú":1077,"orb":2892,"ora":25812,"ord":38310,"onà":1051,"orc":5879,"orf":1857,"ore":13865,"org":12794,"ori":40355,"onè":4899,"orl":959,"ork":2130,"orn":8044,"orm":37822,"orp":2115,"oro":4638,"orr":12139,"orq":2088,"oní":6153,"ort":34195,"ors":17718,"onò":2272,"oru":2747,"ory":799,"ou ":62701,"osa":16929,"osc":5383,"ose":10173,"osg":602,"osf":1074,"osi":11689,"osh":645,"osm":641,"osl":529,"oso":4882,"osp":1959,"oss":9976,"osu":464,"ost":28273,"ov ":962,"ota":15314,"ote":11949,"oth":1551,"otg":555,"oti":2918,"opè":600,"oto":6053,"otr":1076,"ott":1806,"ots":4720,"opò":2183,"otu":509,"otx":851,"otz":628,"ow ":986,"oua":574,"ouc":609,"oub":998,"oue":927,"oud":487,"oug":838,"oui":1328,"oul":1643,"oun":1441,"oup":535,"ous":2944,"our":6788,"out":1242,"ouv":831,"oux":592,"ox 
":683,"ova":10310,"orà":1216,"ove":22419,"ovi":10137,"orè":686,"orç":2260,"ovo":1045,"orí":1417,"oy ":974,"owe":632,"osé":1584,"own":771,"osí":520,"ows":544,"oz ":585,"oxa":515,"otà":1202,"oxi":2953,"otè":716,"otí":765,"oya":1044,"oza":526,"ovè":610,"ové":1028,"ozo":576,"oví":8946,"pa ":9739,"pad":1653,"pac":5079,"pag":1729,"pai":2828,"pal":13281,"pan":17440,"pam":2029,"pap":1804,"par":94459,"pat":8342,"pas":6344,"pav":979,"pau":650,"là ":15487,"pe ":1865,"pci":3487,"làb":466,"làm":772,"làn":3621,"làs":2212,"pea":2241,"ped":2950,"pec":9867,"pee":640,"pei":3947,"pel":25143,"paç":459,"pen":10267,"per":146616,"paí":2905,"pet":9548,"paï":1343,"pes":3904,"peu":2601,"pañ":529,"pez":544,"ph ":863,"pi ":50107,"pha":1431,"phe":861,"phi":1300,"pho":1016,"phy":655,"pia":4024,"pid":2120,"pic":8010,"pie":3141,"pig":652,"pil":2968,"peç":675,"pin":8423,"pio":3260,"pir":3302,"piq":784,"pit":8813,"pis":9463,"lé ":469,"lça":1180,"pla":15092,"lèc":1983,"ple":12246,"pli":6764,"plo":3236,"lèn":3270,"lès":10249,"plu":1347,"po ":810,"lés":4374,"pió":810,"lí ":2941,"pob":11436,"poa":620,"pod":6800,"poc":4020,"poe":3561,"pog":871,"pol":20332,"pon":13711,"pom":605,"pop":3253,"por":25781,"pot":6581,"pos":25283,"ppe":1145,"plà":837,"ppi":516,"ppo":488,"ps ":11507,"líd":807,"líc":6419,"lín":4234,"lím":5029,"líq":803,"lít":11654,"lís":1456,"lív":799,"pra":3353,"pre":34824,"pri":25834,"pro":63491,"poà":783,"pse":893,"psi":2653,"pta":8281,"pte":6450,"pti":2623,"pto":6121,"ptu":1270,"ló ":4588,"pub":5108,"pug":578,"pul":6310,"pun":3708,"pur":2968,"put":7580,"pus":5848,"prà":1039,"lòg":4388,"pré":7343,"lòm":2228,"prè":1483,"lòn":2075,"prí":1326,"lòs":3229,"prò":1685,"lúm":491,"lús":483,"mà ":8683,"màl":657,"màn":3002,"màr":618,"màq":700,"màt":5607,"màs":464,"màx":1377,"mèd":1307,"mèn":1100,"mèr":3763,"mèt":1992,"mès":692,"més":26164,"mí ":1247,"míf":815,"míl":19165,"mín":758,"mís":442,"mít":521,"mó ":601,"qua":27959,"que":179626,"qui":29966,"mòb":818,"mòc":660,"mòn":1162,"mòr":663,"món":3556,"mós":1051,"quà":681,"qué":942,"què":5556,"quí":5413,"mú ":1104,"ra ":117928,"múl":441,"mús":4126,"rb ":573,"rc ":4562,"rab":6001,"rad":25715,"rac":27295,"raf":5098,"rae":1650,"rah":1040,"rag":7813,"raj":1208,"rai":2449,"ral":36833,"rak":919,"ran":117404,"ram":13092,"rap":2196,"rao":812,"rar":11553,"raq":935,"rat":34383,"ras":9366,"rav":6126,"rau":4831,"rax":445,"raz":685,"ray":988,"rd ":28176,"nà ":1344,"rba":4175,"rbe":2172,"rbi":3110,"rbo":2571,"rbr":1395,"rbu":1357,"re ":127856,"rca":15228,"rce":16884,"rch":2208,"rci":9579,"rcl":628,"rco":1658,"rcs":525,"rcu":4447,"rf ":586,"rda":5771,"rde":7271,"rdi":7800,"nàm":532,"nàl":791,"rdo":4676,"nàs":563,"rds":1314,"rdr":6511,"nàr":2379,"rdu":1144,"rg ":8132,"reb":7840,"rea":19325,"red":6383,"rec":25629,"ref":7131,"ree":3093,"reh":637,"reg":59431,"rej":893,"rei":14923,"rel":14087,"raç":1275,"ren":38502,"nán":562,"rem":10413,"rep":9263,"reo":750,"rer":13581,"req":1335,"ret":28281,"res":90579,"raï":1274,"rev":5388,"reu":8404,"raó":605,"rez":1149,"rey":666,"rh ":437,"rfa":571,"rfe":986,"rbà":1165,"rfi":689,"rfo":609,"rbó":588,"ri ":38233,"rga":12320,"rge":7141,"rgi":4873,"rgo":1906,"rcí":934,"rgu":2380,"rha":441,"rdà":2437,"rdí":525,"rk ":3017,"nç ":1220,"rib":13766,"ria":54826,"rid":11934,"ric":41587,"rif":1883,"rie":19265,"rig":19048,"rii":559,"ril":7707,"rik":753,"rin":25425,"rim":24490,"rip":8142,"rio":15451,"rir":3990,"riq":1644,"rit":24233,"ris":22414,"riv":4321,"riu":12192,"rix":568,"riz":963,"rl ":840,"rja":462,"rfí":2906,"né ":642,"rm 
":573,"nça":14937,"rka":439,"rgà":1140,"rke":572,"rki":433,"rgè":459,"nço":1204,"rn ":10052,"rla":7357,"nèc":538,"rle":3694,"rld":647,"nèi":452,"rli":990,"rlo":1299,"nès":8870,"nèr":476,"nèt":1405,"ro ":9618,"rma":33523,"rià":2590,"rme":22120,"rmi":5557,"néi":2057,"riè":519,"rmo":1879,"nét":629,"nés":466,"riò":721,"rmu":2846,"rió":532,"rp ":429,"rna":16135,"rne":8134,"rni":4677,"rno":1290,"rns":1273,"rnu":487,"ní ":1501,"rob":14119,"roa":1438,"rod":10472,"nçà":569,"roc":11373,"rof":6472,"roe":762,"roh":627,"rog":6246,"roj":2089,"roi":2600,"rol":6688,"ron":19875,"rom":14247,"rop":23195,"roo":664,"ror":1191,"roq":1246,"rot":11015,"ros":8555,"rov":19098,"rou":2184,"rox":1912,"row":590,"nçó":919,"roy":445,"rpa":569,"rpe":904,"rpi":853,"rpo":1802,"rlí":799,"rpr":1467,"rs ":43553,"rmà":5015,"rmè":811,"nín":1769,"ním":5187,"nís":1535,"rqu":17402,"rt ":37622,"rra":26732,"rre":30205,"rná":562,"rri":15516,"rro":7545,"rní":499,"rru":892,"rry":820,"ru ":895,"rsa":4205,"rse":7640,"rsh":486,"rsi":8955,"rso":11182,"sTo":1591,"rst":783,"rsu":641,"rta":61936,"rte":6840,"rti":28399,"rth":2873,"rto":4054,"rts":5527,"rtr":804,"rtu":4238,"nó ":1101,"rub":519,"rud":719,"ruc":5530,"rue":1673,"rug":2246,"rui":2261,"rul":696,"run":2378,"rum":3191,"rup":10424,"rur":944,"rut":1660,"rus":7244,"ruz":521,"rva":5037,"rve":5065,"rrà":1253,"rvi":4488,"nòm":2605,"nòn":672,"rrò":1005,"ry ":4327,"rxa":1731,"rxe":465,"rtà":1217,"rxi":2115,"rtí":4136,"ruï":2409,"rza":620,"rze":469,"sa ":49499,"núm":1489,"sc ":5158,"sac":3907,"sab":2777,"sad":5098,"sag":1808,"sai":1232,"sam":2513,"sal":7035,"san":7869,"sap":1258,"sas":1691,"sar":9997,"sau":1965,"sat":8262,"sav":1021,"say":540,"sba":1107,"sbe":2109,"sbo":842,"sbu":1086,"se ":23004,"sca":10216,"sce":2797,"sci":2911,"sch":2851,"scl":1633,"sco":15125,"scr":15598,"scu":12352,"oàc":1284,"sde":2503,"sdi":860,"sea":449,"sec":4985,"see":524,"sed":628,"seg":26346,"sei":1192,"sem":9727,"sel":10402,"sen":30737,"seq":1206,"sep":5994,"ses":14361,"ser":50030,"seu":29396,"set":8719,"sev":20050,"sey":497,"sex":1181,"sh ":2994,"sfa":531,"sfe":1589,"sfo":1332,"si ":8535,"sge":646,"scà":439,"sgl":4161,"scó":611,"scò":901,"sha":1765,"she":777,"shi":2827,"sho":1034,"shu":449,"sk ":596,"sia":14705,"sic":17879,"sib":3043,"sie":1616,"sid":12721,"sig":8275,"sif":2038,"sim":6186,"sil":5083,"sio":8144,"sin":7331,"siq":657,"sis":13553,"sir":1327,"siu":1368,"sit":64187,"siv":1692,"ska":1514,"ske":504,"ski":1332,"sla":3540,"sle":675,"sli":610,"sll":456,"slo":1071,"so ":2581,"sma":1501,"sme":14571,"sià":1711,"smi":1069,"smo":829,"sió":16559,"sne":1493,"sni":622,"soc":7991,"sob":11796,"sod":622,"sof":3156,"som":1185,"sol":10431,"son":13880,"sop":1090,"sos":16303,"sor":9876,"sou":838,"sot":4932,"sov":2577,"spa":16778,"slà":1095,"spe":11857,"spi":4099,"spl":1010,"spo":10340,"spr":7848,"spu":5277,"ss ":1800,"squ":6763,"st ":39126,"sra":955,"su ":660,"ssa":20530,"sse":19728,"oïd":505,"ssi":25382,"sso":13917,"tTo":499,"ssu":1701,"sta":101622,"spà":903,"ste":42113,"sth":493,"sti":43697,"spè":11910,"sto":9599,"str":59818,"sts":2256,"stu":8027,"sua":2241,"suc":3183,"sub":8739,"sue":1081,"sud":11949,"suf":817,"sum":2161,"sul":6935,"sun":568,"sup":9176,"sus":1301,"sur":6523,"svi":496,"sy ":573,"swa":456,"ssà":1360,"ssí":918,"stà":12114,"stè":2045,"sté":688,"stí":1611,"stò":9473,"stó":429,"stú":635,"ta ":122424,"tc 
":1199,"tab":6420,"tad":11316,"tac":14485,"taf":1085,"tae":440,"tag":3020,"tai":1997,"tal":64634,"tak":604,"tan":69063,"tam":65119,"tap":1329,"tar":32566,"taq":429,"tat":102986,"tas":2466,"tav":5072,"tau":2716,"tax":624,"pà ":654,"tba":556,"tbo":5303,"te ":35517,"tch":930,"pàg":896,"pàn":607,"pàs":516,"teb":673,"tea":3306,"ted":1805,"tec":11537,"teg":9002,"tej":688,"tei":11170,"tel":17386,"ten":56121,"tem":24684,"tep":736,"teo":2790,"ter":78366,"tet":1273,"tes":39189,"tev":1479,"teu":1106,"tex":1880,"th ":3201,"ti ":4614,"tge":13266,"tha":2284,"the":5645,"thi":1651,"tho":1884,"thr":609,"thu":1372,"thy":461,"tib":1285,"tia":9331,"tid":5028,"tic":56853,"tif":3870,"tie":2340,"tig":8482,"til":13513,"tin":20966,"tim":7466,"tip":6726,"tio":6592,"tir":9312,"tiq":5351,"tit":28398,"tis":7207,"teï":834,"tiv":16038,"teò":848,"tiu":13608,"tja":6493,"pça":563,"tla":862,"pèc":11863,"tle":1839,"pèd":1199,"tli":456,"tll":1435,"pèl":878,"to ":6648,"tma":1238,"tià":1107,"tme":1776,"tmo":629,"tiò":607,"tió":1083,"tp ":925,"tna":665,"tjà":1175,"tni":849,"pí ":547,"tob":475,"tod":2373,"toc":2978,"tog":2089,"toi":886,"tol":9991,"ton":11361,"tom":4842,"top":2299,"tor":49989,"tot":12646,"tos":4059,"tov":523,"tou":627,"tpe":793,"tlà":1789,"ts ":123279,"pít":678,"tt ":1057,"tra":52849,"tre":76882,"tri":41278,"tro":24672,"tru":10105,"tu ":878,"tsa":465,"tse":1279,"tsi":484,"tsu":500,"tta":2267,"tte":2664,"tti":1561,"tto":1020,"ttp":937,"pó ":1893,"tub":5389,"tua":62128,"tud":8635,"tuc":2658,"tue":1514,"tug":2149,"tui":769,"tul":1604,"tun":4869,"tum":1676,"tur":31002,"tut":3095,"tus":3479,"tx ":864,"trà":2278,"pòn":1874,"trí":1078,"pòs":948,"tró":603,"trò":2082,"ty ":2018,"tz ":1159,"txa":1326,"txe":1510,"txi":632,"tuà":944,"tuï":1466,"tza":24199,"tze":2838,"ua ":10767,"púb":6544,"tzà":645,"ub ":3172,"uc ":2667,"uac":2232,"uad":11839,"uai":1953,"uam":682,"ual":28714,"uan":11822,"uas":561,"uar":7084,"uat":47989,"ud ":18502,"uba":1775,"ubd":594,"ubc":1131,"ubf":937,"ube":2183,"ubj":552,"ubi":3012,"ubl":6690,"ubm":494,"ubo":616,"ubr":5366,"ubt":888,"ubs":3721,"ubu":437,"ue ":126160,"uca":3754,"ucc":7080,"uce":759,"uci":9562,"uch":1395,"uck":584,"ucl":3075,"uco":864,"ucr":1695,"ucu":635,"uct":6590,"uf ":614,"uda":11060,"ude":3855,"udi":9915,"udo":1171,"uds":473,"uec":1219,"ueb":903,"ued":1823,"ueg":856,"uei":5164,"uel":7936,"ueo":992,"uen":7564,"ues":47764,"uer":15228,"ueu":754,"uet":3253,"uev":836,"uez":760,"uff":474,"ufi":449,"ufr":480,"ui ":5837,"uga":5261,"ugb":506,"uge":1945,"ugi":891,"ugh":677,"ugm":498,"ugo":826,"ugu":4941,"uha":903,"udò":503,"uk ":502,"uia":3917,"uic":515,"uie":813,"uid":2598,"uig":1442,"uim":1036,"uil":5429,"uio":761,"uin":7811,"uip":3162,"uis":4149,"uir":5469,"uit":17969,"uiv":1075,"uix":2053,"ul ":3717,"uja":1612,"um ":10025,"uka":443,"un ":192982,"ula":29038,"uhà":853,"ulc":505,"ulf":473,"ule":5627,"ulg":1229,"uli":10081,"ull":4486,"ulm":1227,"ulp":452,"ulo":1383,"ult":17665,"uls":2133,"ulu":974,"uma":4588,"umb":1839,"ume":9875,"umi":2916,"umo":867,"umn":844,"ump":546,"ums":1051,"umu":683,"up ":7583,"una":115810,"und":14739,"unc":5934,"une":6232,"ung":1717,"uni":75336,"unk":469,"unn":474,"uno":472,"unt":18187,"uns":8577,"uny":15412,"ul·":549,"ur ":10863,"upa":8358,"upe":8274,"upi":692,"upo":2679,"upr":678,"ulí":685,"ups":1372,"us ":57417,"umà":929,"uqu":973,"ut 
":22574,"urb":2707,"ura":45952,"urd":1647,"urc":2054,"ure":17552,"urg":6124,"uri":8027,"urk":531,"urn":1964,"uro":9811,"urr":1213,"urq":1132,"uní":880,"urt":4141,"urs":6415,"uru":1680,"ury":571,"usa":7168,"usc":1878,"use":5260,"usi":6699,"ush":829,"usk":567,"uso":806,"uss":4155,"usu":2167,"ust":14137,"utb":4938,"uta":29096,"ute":2409,"uth":1062,"uti":9123,"utl":486,"uto":7896,"utr":950,"utt":722,"uts":3313,"utu":1063,"utx":493,"ux ":3488,"uva":445,"urà":594,"uve":1383,"uvi":1205,"urí":1257,"uró":857,"uz ":716,"utà":1102,"uxe":777,"utò":2459,"uza":451,"va ":106023,"vad":4853,"vac":2535,"qüe":1179,"vag":486,"vai":1691,"val":13789,"van":14175,"vam":2174,"var":7823,"vat":3912,"vas":1634,"rà ":3933,"ve ":4089,"ràc":5104,"ràb":602,"ràd":560,"ràf":3237,"ràm":664,"ràl":1743,"ràn":2104,"ràp":855,"ràs":1116,"ràr":841,"ràt":2402,"ved":590,"vec":630,"veh":638,"veg":5362,"vei":3281,"vel":8498,"qüè":1315,"ven":21441,"vem":4525,"ver":38422,"vet":1227,"ves":14888,"veu":1565,"vi ":3211,"rç ":5332,"via":10597,"vid":9383,"vic":2175,"vie":3820,"vig":1092,"vil":8533,"vin":8553,"vim":3021,"vio":1545,"vir":3660,"vit":4613,"vis":14610,"veï":765,"viv":1644,"viu":4994,"ré ":1046,"rça":2105,"rèc":1336,"règ":503,"rèi":642,"rèn":5308,"rès":5304,"vo ":1039,"vià":560,"viè":1375,"rés":8781,"rí ":2419,"voc":3261,"voi":1301,"vol":16015,"von":1417,"vor":4142,"vot":1057,"ría":1669,"ríd":607,"ríc":752,"ríg":1189,"rín":2502,"río":3512,"rít":3404,"rís":5081,"vre":1338,"rò ":7556,"ró ":3651,"vui":3195,"vul":923,"ròn":2822,"ròq":622,"ròp":2137,"ròs":681,"rós":546,"rú ":1201,"wa ":1404,"rús":665,"wal":701,"wan":1387,"war":2529,"wat":497,"way":601,"sà ":1407,"we ":460,"sàc":1251,"sàn":515,"web":921,"wer":676,"wig":486,"win":579,"sè ":545,"sé ":1728,"wn ":649,"sèn":1110,"sèr":3720,"sí ":864,"ws ":526,"sím":1023,"síl":458,"sín":504,"sís":559,"só ":937,"ww ":778,"sòl":1221,"són":13869,"www":774,"xa ":8099,"sús":797,"xad":949,"xac":781,"xan":2894,"xam":598,"xar":2283,"xat":539,"xas":500,"tà ":13676,"xe ":713,"xce":1029,"xcl":664,"tàc":678,"tàl":3228,"tàn":10732,"tàr":2876,"tàt":701,"xeb":700,"xec":2213,"xel":999,"xen":5829,"xem":5052,"xer":5848,"xes":1826,"xfu":529,"xi ":673,"xia":768,"xid":1180,"xic":3203,"xig":486,"xil":1324,"xin":3485,"xim":3896,"xip":901,"xit":1472,"xis":3718,"xiu":786,"tè ":945,"té ":7771,"tèc":1878,"tèg":457,"tèl":670,"tèn":2186,"tès":574,"tèr":1721,"tèt":687,"tén":954,"xió":452,"tí ":7856,"xon":1279,"xos":2744,"xpa":461,"xpe":1742,"xpl":1855,"xpo":1220,"xpr":1563,"tíc":1597,"tíf":2180,"tín":1414,"típ":1255,"tít":4017,"tís":1777,"xt ":1229,"xte":2775,"xti":2038,"xto":453,"xtr":2818,"tó ":11716,"xua":820,"tòg":462,"tòm":658,"tòl":2436,"tòn":8389,"tòr":10789,"ya ":28447,"túr":886,"yad":2319,"yam":961,"yal":1710,"yan":1672,"yar":1667,"yat":1531,"uà ":578,"ye ":706,"uàn":829,"yen":2530,"yes":3123,"yer":2450,"yia":1497,"yin":895,"uè ":4716,"ué ":2510,"yn ":760,"yla":494,"uèc":777,"uèi":776,"yll":431,"uèn":1009,"uès":1293,"yo ":645,"ués":652,"uí ":2229,"yol":7324,"yon":647,"yos":583,"yor":2318,"ys ":10504,"uím":3096,"uín":428,"uís":1893,"yra":458,"uïd":2904,"uïs":1175,"uït":4233,"yst":691,"yya":579,"za ":5587,"zad":5095,"zac":6060,"zal":446,"zak":429,"zan":2462,"zam":487,"zar":4633,"zat":5982,"ze ":2627,"vàn":432,"zel":560,"zen":1838,"zem":672,"zer":1157,"zi ":986,"zia":485,"zil":1095,"zin":652,"zim":483,"vé ":1312,"vèn":473,"vèr":780,"zo ":1020,"vés":2039,"zon":6127,"víd":553,"vín":8895,"zz ":445,"xèr":1610,"xí ":2939,"xò ":1228,"xòn":572,"yà ":951,"zà ":697,"アア":1081,"ης ":436,"ος 
":1575,"·la":8064,"·le":5700,"·li":4957,"·lo":1202,"·lu":2994,"·lí":5910,"Àfr":1807,"Àsi":1439,"Àus":1017,"És ":10713,"Ésa":437,"Índ":3364,"Òlt":635,"àbi":1441,"àbr":1178,"àci":8619,"àct":3420,"àdi":1055,"àfi":3296,"àfr":638,"àgi":1334,"àlb":937,"àla":554,"àle":596,"àli":6485,"àmb":1406,"àme":878,"àmi":2273,"àmp":626,"àmm":855,"ànd":2746,"ànc":3868,"àni":16801,"ànt":2743,"àns":473,"àl·":933,"àpi":984,"àpo":807,"às ":2023,"àqu":1070,"àra":3012,"àre":2240,"àri":9009,"àrt":1073,"àrr":2164,"àsi":1729,"àsq":664,"àst":2034,"àss":2414,"àti":10227,"àto":823,"àvi":811,"àxi":1936,"án ":1347,"ánd":563,"âte":655,"ão ":540,"ça ":14258,"çad":1687,"çam":1930,"çal":758,"çan":2501,"çat":1213,"çar":1761,"çon":901,"ços":820,"çue":665,"èca":1286,"èci":14442,"ècn":2007,"ècu":627,"èct":1398,"èdi":3114,"èi ":602,"ègi":1097,"èix":1072,"èn ":1343,"èla":841,"èlg":684,"èle":454,"èli":1252,"èmi":2546,"ène":8766,"ènc":20961,"èni":2309,"ènt":776,"èl·":1637,"èpo":1872,"ès ":80199,"èra":876,"ère":1653,"èrc":1864,"èri":10984,"èrn":556,"èrt":486,"èrs":756,"èsa":561,"èsi":1364,"èst":608,"èti":5012,"èto":1124,"ètn":703,"ètr":730,"èvr":822,"èxi":2818,"ée ":805,"éix":2212,"én ":1549,"éns":1755,"és ":232119,"ét ":506,"éra":478,"ére":495,"éri":684,"éu ":1664,"ési":4228,"éss":1399,"çà ":741,"çó ":1068,"ía ":2053,"íac":872,"íad":461,"íbl":475,"íbi":489,"íci":5839,"ícl":678,"íco":662,"ícu":6130,"íct":496,"íde":1426,"ídi":1300,"ífe":1270,"ífi":4217,"íge":870,"ígi":569,"ígu":457,"ín ":2860,"íli":20926,"ímb":929,"íme":625,"ími":9376,"ímp":3941,"índ":1015,"ínc":10455,"íne":431,"íni":4688,"ínt":657,"íns":1631,"íod":3531,"ípi":1480,"ís ":12059,"íqu":1045,"íri":1722,"ísi":3436,"íst":7167,"íti":15581,"ítr":512,"íto":5091,"ívi":958,"ívo":458,"ïda":2603,"ïde":848,"ïll":554,"ïna":1371,"ïne":928,"ït ":3336,"ïso":2690,"ïss":1050,"ïta":563,"ïts":737,"ña ":779,"òbi":1083,"òci":1146,"òcr":1177,"òdi":1030,"òfi":1393,"òfo":870,"ògi":4468,"ògr":1140,"òl ":442,"òle":2548,"òli":3897,"òmb":968,"òme":2032,"òmi":3820,"òni":18460,"òno":2682,"òns":1135,"òpe":1178,"òpi":2718,"òpo":1001,"òpt":1029,"òps":647,"òs ":2570,"òqu":909,"òrg":1217,"òrd":960,"òri":12552,"òrn":1226,"òrs":1069,"òsi":753,"òso":1297,"òst":751,"òss":949,"òti":1705,"òvi":472,"òxi":771,"ón ":21967,"óna":1070,"ós ":5791,"órm":2361,"úbl":6781,"úli":431,"últ":1838,"úmb":502,"úme":1155,"úni":3359,"ús ":3526,"úri":1851,"úsc":672,"úsi":4110,"úst":870,"úss":1836,"üen":1610,"ües":3017,"üèn":1315,"üís":1030},"n_words":[42630400,51577449,41251739],"name":"ca"}
+++ /dev/null
-{"freq":{"D":14883,"E":9868,"F":10603,"G":9505,"A":22558,"B":22848,"C":16455,"L":15978,"M":26754,"N":16476,"O":10801,"H":14139,"I":10554,"J":20213,"K":19386,"U":5533,"T":19206,"W":5350,"V":19456,"P":31985,"S":35760,"R":13848,"Y":1260,"Z":7679,"f":46799,"g":61772,"d":331353,"e":908926,"b":175631,"c":304749,"a":764805,"n":728035,"o":890229,"l":442609,"m":328621,"j":233650,"k":440118,"h":259891,"i":521717,"w":9425,"v":451146,"u":342714,"t":548551,"s":514164,"r":514188,"q":1288,"p":289793,"z":213126,"y":191798,"x":11788,"í":321621,"é":145384,"á":236804,"ý":129650,"ú":13969,"ó":5647,"ě":138011,"ď":2560,"Č":7702,"č":98194,"ř":105343,"Ř":1405,"ň":6555,"Ž":1649,"ž":81570,"ť":3583,"Š":3572,"š":60015,"ů":48717," l":32594," m":62986," n":107131," o":79037," h":32176," i":15946," j":127655," k":89036," d":65217," e":11656," f":19925," g":5629,"р":1443," a":90604,"с":1165," b":47368," c":20381," z":85758," u":20743," t":59923," w":1272," v":146123," p":164476," s":159790," r":52674," J":20168," K":19243," H":14014," I":10456," N":16390," O":10658," L":15894," M":26594," B":22644," C":16189," A":22443," F":10496," G":9381," D":14695," E":9809," Z":7643," Y":1252,"и":1928,"о":2213,"н":1523," S":35443," R":13762,"в":1172," P":31807,"а":2293," W":5251," V":19363," U":5519,"е":1554," T":19078," ú":11862," č":34240," Č":7692," ž":9995," Ž":1647," Š":3567," š":7349," ř":11127," Ř":1403,"A ":3554,"Da":2148,"Co":3211,"Ce":1274,"Ch":3956,"Do":2726,"De":2027,"Di":1704,"Ev":1845,"Ge":1462,"Ga":1606,"I ":2501,"Fr":2934,"Fo":1409,"Fi":1671,"C ":2405,"Au":1413,"Ar":2392,"As":1426,"D ":1568,"Ba":3474,"Am":1706,"An":3282,"Al":3092,"By":2085,"Bu":2103,"Br":3950,"Ca":2582,"Bi":1362,"Be":3232,"Bo":3007,"Ku":1175,"Kl":1430,"Kr":2544,"Ko":4086,"Le":3365,"Li":3491,"La":2847,"Lu":1426,"Lo":2548,"Me":3379,"Mi":4015,"O ":1633,"Ma":8187,"Mo":4597,"Ni":1492,"Ne":3393,"Na":4223,"P ":1532,"No":2609,"Ob":1767,"Gr":1729,"Ha":2931,"He":2201,"Ho":2992,"Hr":1212,"In":2920,"Ja":4326,"L ":1226,"Ji":2201,"Je":8576,"Jo":1796,"Ju":1244,"Ka":5031,"M ":1302,"Tu":1161,"Tr":2534,"To":2275,"Th":2120,"Ti":1361,"Te":3239,"Ně":1339,"Ta":2435,"V ":4585,"St":6172,"Sv":1697,"Su":1422,"Wi":1615,"Wa":1205,"Vy":1686,"Vo":1294,"Vi":2012,"Va":1654,"Ve":3321,"Pr":7552,"S ":3112,"Pe":2524,"Pa":5499,"Pl":1516,"Po":7183,"Pi":1236,"Os":1168,"Or":1234,"R ":1716,"Se":3431,"Sc":1219,"Si":1856,"Sl":2584,"Sk":1212,"Sp":2686,"So":3053,"Ru":1888,"Sa":3516,"Re":2582,"Ná":1404,"Ro":3655,"Ra":2507,"b ":5436,"Zá":1325,"a ":217661,"Př":2063,"Za":1646,"Ze":1335,"i ":90037,"fy":1326,"ge":8367,"ga":7746,"bý":4615,"fi":13067,"ač":14478,"fr":4738,"fu":2046,"ft":1555,"fo":9138,"bí":5183,"j ":5728,"gy":1560,"dá":9259,"he":13421,"ha":16982,"bě":8276,"gn":2020,"gl":5355,"gi":10280,"gh":1578,"gu":3569,"gr":6781,"cí":27801,"go":4942,"du":15000,"dv":6535,"dy":13097,"g ":5595,"ea":6165,"eb":20255,"ec":32017,"ed":50406,"de":36807,"dd":1592,"di":24806,"dh":1175,"dk":3086,"dm":3275,"dl":11857,"do":40054,"dn":44007,"dp":2177,"ds":6618,"dr":15992,"ew":1387,"ex":5658,"eu":3247,"ev":22848,"ey":1497,"ez":19185,"fa":3278,"h ":68060,"fe":4502,"bá":2585,"eh":9007,"eg":5751,"ef":3385,"ee":1918,"el":51949,"ek":22014,"ej":27806,"ei":3682,"ep":11390,"eo":6058,"en":103592,"em":54636,"et":33365,"es":44385,"er":89316,"ca":4919,"e ":246825,"bv":2259,"by":26507,"bs":4261,"br":9964,"bu":11101,"bn":7303,"bo":26250,"bj":2483,"bl":11541,"bi":8921,"bc":3044,"bd":1995,"be":13547,"dc":2442,"db":1918,"da":22450,"f 
":2962,"cy":1677,"cu":2117,"ct":4541,"cr":1140,"co":10175,"cn":2818,"ck":45489,"ci":24104,"ch":106800,"ce":56738,"c ":9636,"az":14206,"ay":1595,"ba":13431,"d ":32945,"at":54683,"as":30808,"ar":43773,"ax":1424,"av":34552,"au":9271,"ak":35472,"al":52089,"ai":4202,"aj":15958,"ap":12841,"am":28407,"an":76603,"ac":26092,"ad":38615,"ab":9081,"ag":6085,"ah":10143,"ae":4121,"af":3809,"nu":14720,"nt":27386,"ns":25831,"ič":7118,"jí":23302,"no":57801,"nn":8311,"nz":2252,"ny":17200,"oe":1227,"ká":21030,"of":6319,"oc":21110,"od":69409,"oa":1996,"ob":47136,"ké":50289,"om":27583,"on":44356,"ok":26216,"ol":47354,"oi":2071,"oj":17875,"og":10937,"oh":13386,"ot":27460,"os":56160,"ov":112940,"ou":73196,"op":21879,"oo":2643,"or":53147,"r ":22705,"ox":1211,"ow":2081,"oz":27164,"lá":15416,"pe":15026,"pa":25836,"ký":35391,"pc":1302,"pl":12514,"lé":7239,"pn":3046,"po":81579,"ph":1918,"pi":14831,"eň":2211,"lo":61865,"ln":20483,"hé":2618,"lm":4934,"ll":7520,"ls":8702,"dů":3418,"lu":15908,"lt":4743,"lz":1355,"ly":9717,"o ":138337,"ma":25523,"eř":2784,"mb":4786,"hý":1516,"me":40325,"iá":5235,"ml":2209,"eš":4400,"mi":28342,"mn":8112,"mm":1350,"mp":4986,"mo":29298,"mr":1370,"ií":2470,"ms":3506,"mu":15591,"my":5631,"p ":4419,"na":97771,"nc":16128,"nd":14047,"ne":57176,"já":1158,"nf":2490,"ež":8504,"ng":13163,"ni":45831,"nk":8297,"jv":3866,"ju":1525,"eč":10324,"js":11164,"jn":5134,"jo":3496,"jm":7494,"ki":3600,"ke":11246,"kd":5064,"kc":3108,"ka":39832,"m ":95402,"ců":2079,"ky":30906,"kt":39344,"ku":33102,"kv":3857,"ko":71233,"kr":19435,"kl":18948,"km":3641,"kn":3161,"li":57563,"lk":10732,"le":77448,"há":9650,"ld":2521,"lf":1159,"la":65008,"dř":2617,"lc":1721,"lb":2945,"n ":43523,"hr":15902,"dí":8010,"hv":1306,"ht":2875,"hu":12412,"hi":7867,"hn":5204,"ho":73846,"hl":13059,"hm":1256,"dé":2586,"id":15111,"ic":65339,"dý":1471,"ib":5266,"ař":4683,"ia":10547,"ih":5555,"ig":6211,"if":3335,"eá":1247,"ie":19796,"hy":6927,"k ":33844,"ir":8411,"is":36532,"it":38607,"iu":2679,"iv":18333,"aš":1639,"ii":6209,"ij":4185,"ik":23953,"il":27036,"im":9173,"in":70373,"io":12386,"ip":4762,"je":121519,"až":7470,"ji":17967,"iz":8625,"l ":42283,"bř":2940,"ja":17600,"dě":13132,"xi":2953,"té":13825,"tí":14980,"pů":6675,"xt":1600,"z ":24530,"př":32632,"ož":13354,"tá":14232,"nž":1154,"oš":2365,"sé":1859,"sí":5664,"rč":2879,"nů":2947,"vy":26292,"vz":6628,"y ":96008,"rý":11309,"oř":9947,"wa":2470,"sá":2703,"we":1453,"vl":7295,"ré":10396,"vk":3052,"nš":1672,"vi":26093,"mž":1362,"vu":5429,"vr":8123,"vs":6209,"vn":24780,"vo":37739,"uz":6709,"mů":3291,"uv":3489,"ve":47740,"rá":30974,"vc":2056,"va":51428,"pě":4347,"x ":2742,"ui":1687,"uj":20016,"uk":7056,"ul":12653,"ue":2392,"ug":1937,"uh":10244,"ur":17880,"us":20839,"ut":14862,"um":13455,"un":12543,"up":13452,"ty":13294,"tz":1768,"tu":25069,"tt":3192,"lů":2536,"tv":17643,"ub":11697,"ua":2086,"ud":12808,"uc":6671,"w ":1552,"to":60100,"tn":18581,"tm":1809,"tl":6035,"ts":8087,"oč":8726,"tr":38784,"pí":3163,"te":85042,"pá":2981,"tk":8430,"ti":58581,"lš":2281,"th":6153,"v ":66402,"tb":1873,"tc":1257,"ta":58747,"ně":45741,"su":6657,"sv":12247,"ss":3706,"st":145711,"sy":7700,"ků":5193,"sl":22654,"sk":92692,"sn":8865,"sm":8745,"sp":20097,"so":35354,"sr":1708,"nč":1796,"sc":4718,"se":52884,"sh":2118,"si":15634,"rz":2926,"u 
":121101,"mě":22506,"sa":15498,"kř":3224,"sb":1371,"ný":29623,"rr":2082,"rs":10884,"rt":11161,"ru":27381,"rv":11951,"ry":11066,"ní":119912,"rp":2592,"ro":116857,"rn":21334,"rm":10574,"né":26518,"rl":3217,"rk":6663,"ri":39119,"jš":5173,"rh":1365,"rg":7113,"iž":5869,"ná":41081,"re":36192,"rd":6788,"rc":6965,"mý":1831,"rb":3000,"ra":81349,"t ":42900,"mí":11983,"mé":6077,"iš":4295,"má":9760,"lý":4646,"s ":40054,"py":1779,"pt":3558,"pu":7668,"hů":2019,"lí":8097,"pr":57083,"ps":5342,"zý":1387,"zá":15336,"už":12787,"vš":5164,"zí":7043,"vů":2483,"uš":4102,"tš":4235,"vě":25419,"tř":15220,"tů":3818,"rž":1454,"vý":29752,"zh":2175,"zi":13719,"rš":2140,"zb":2038,"zd":9382,"ze":35435,"vá":31782,"tě":14576,"za":24684,"yz":2224,"rů":8094,"zv":6153,"zy":3702,"zs":3415,"uč":6510,"zr":1709,"zu":4066,"zt":1618,"zo":8118,"zn":27029,"ví":13118,"zp":5782,"zk":6049,"zm":2157,"vé":19370,"zl":2273,"yh":1425,"yc":3822,"yd":3200,"tý":6426,"yb":3071,"yv":3371,"yu":1291,"yt":7847,"ys":11111,"yr":3273,"yp":5097,"yn":5498,"ym":4086,"yl":23502,"yk":5878,"zš":1160,"yř":1210,"yš":3131,"ěžn":1182,"ám":8120,"án":28607,"áp":5503,"áj":1550,"ák":7074,"ál":25430,"áh":2531,"áb":3286,"ác":7677,"ád":11483,"áz":13195,"áv":12832,"ár":9720,"át":14731,"ás":12931,"á ":61028,"íč":1699,"ód":1198,"ón":1579,"éž":3751,"ív":7836,"íz":3776,"ín":7431,"ím":27488,"íp":2283,"ír":5728,"ít":5219,"ís":9896,"íh":10920,"ík":10708,"íl":6160,"íj":1652,"íc":36122,"íd":6139,"íb":1336,"í ":168172,"áž":2247,"él":1388,"áš":2438,"ém":16118,"én":5964,"ét":2091,"ér":4128,"ář":4474,"éd":1164,"éh":25602,"é ":80173,"áč":1328,"úč":1787,"ýc":33402,"ýz":2981,"ýv":5852,"ýs":1457,"ýt":1801,"ýr":2669,"ým":16378,"úz":3790,"ý ":58367,"ús":1640,"íš":1631,"íř":2525,"íž":4061,"ě ":45418,"ěh":2633,"ěj":8834,"ěd":3131,"ěc":1612,"ěz":2258,"ěs":10757,"ět":15295,"ěv":1529,"ěr":3724,"ěk":7417,"ěl":9917,"ěm":5852,"ěn":11047,"ěž":3582,"ěř":2184,"ýš":1701,"Če":5023,"či":11688,"čk":4324,"čl":2919,"če":26348,"ča":7771,"ď ":1420,"č ":2311,"čá":8501,"čn":16418,"čo":3182,"čt":3695,"ču":2648,"čí":6991,"ň ":2419,"š ":1248,"ří":28277,"řá":2554,"řn":1352,"ři":14008,"řs":2960,"ře":44896,"řa":3989,"ší":18419,"ť ":1589,"še":8882,"ša":2373,"šn":3181,"šk":3546,"šl":1812,"ši":4496,"št":7997,"ňu":1474,"ř ":4182,"žs":2453,"žn":8213,"žo":1725,"že":16587,"žd":1803,"žk":1495,"ži":11885,"ž ":17370,"ží":12326,"žá":1346,"ů ":27572,"ům":2630,"ůl":1327,"ůs":3997,"ův":4803,"ůz":2205,"šš":1500,"ůž":1940,"čás":7287,"čí ":1292,"čít":2030,"čís":1405,"čtv":1158,"čuj":2526,"ční":7738,"čné":1325,"čný":1279,"čně":1729,"ěji":2384,"ěko":1330,"ěkd":1747,"ělo":1511,"ěle":2558,"ěkt":1458,"ěme":3785,"ěls":1244,"ětš":3901,"ěta":1179,"ěto":2030,"ěný":1390,"ěst":9587,"ějš":4291,"ění":3741," Ga":1588," Ge":1446," Fo":1382," Fr":2926," Fi":1650," Ha":2917," He":2194," Gr":1717," Hr":1206," Ho":2982," Ji":2194," Je":8552," Ja":4310," In":2868," Ka":5019," Jo":1792," Ju":1242," La":2820," Le":3350," Li":3464," Kl":1383," Ko":4081," Kr":2542," Ku":1171," Ma":8133," Mi":3991," Me":3365," Lo":2542," Lu":1423," Ne":3375," Na":4212," Ni":1486," Mo":4565," Am":1697," An":3275," Al":3081," Ba":3436," Au":1408," As":1420," Ar":2370," Be":3213," Bi":1350," Bo":2981," Br":3935," Bu":2094," By":2085," Ca":2514," Ce":1267," Ch":3924," Co":3154," Da":2135," Di":1684," De":2001," Do":2665," Ev":1844," Př":2056," Wi":1593," Wa":1184," Vy":1684," Ze":1330," Za":1643," a ":53430," Zá":1320," Os":1166," Or":1230," Po":7138," Pl":1502," Pi":1231," Pe":2516," Pa":5464," No":2595," Ob":1764," Ra":2495," Ro":3640," Re":2567," Ná":1395," Pr":7523," 
Sv":1696," Su":1408," St":6100," Ně":1337," Ta":2426," V ":3746," Th":2108," Ti":1351," Te":3210," Tr":2526," To":2242," Ru":1883," Sa":3503," Si":1839," Sc":1168," Se":3406," So":3031," Sp":2664," Sk":1209," Sl":2578," Va":1650," Ve":3306," Vi":1986," Vo":1286," Tu":1143," ja":14129," dě":2397," bř":1481," ji":8392," až":2786," je":93301," in":6001," it":1175," ka":8683," m ":1247," kd":3115," ke":1672," jm":2892," js":6707," ha":1600," bě":2145," he":2509," dá":1402," cí":2459," gr":1666," k ":5918," hi":2201," hl":5156," ho":6862," dí":1705," hr":6552," hu":3075," ni":2426," ne":33309," na":43193," my":1417," mu":2661," mo":9457," mn":2424," ok":4623," oc":1666," od":14999," of":1957," ob":22345," no":3710," le":12637," li":8468," la":3675," dř":1310," kv":1829," ku":2340," kt":24717," kn":2205," km":3399," kl":3809," kr":8199," ko":16866," me":11211," mi":4779," ml":1339," o ":7191," ma":9419," dů":1272," lo":3118," ab":1328," am":3804," an":6347," ap":1198," ak":1935," al":5289," au":3694," ar":3304," at":1332," as":2951," ba":3819," bi":2222," be":2412," bo":3752," by":19260," bu":3054," br":3187," en":1401," el":2125," fa":1772," ex":2023," fu":1558," fr":2890," fo":4715," fi":6203," ge":1763," bý":3017," i ":5273," co":2133," ce":6243," ch":5478," da":4034," do":19631," dn":2609," dl":1361," dr":8732," de":7028," di":4948," dv":4297," du":1944," vý":12688," zk":2698," ví":2124," zp":3908," zn":6476," rů":1942," zv":2770," tě":2639," za":18239," zd":2577," vá":3060," ze":8914," tý":1167," té":5230," pů":4589," z ":20036," př":29127," vš":2884," už":1794," zá":11557," tř":3845," vě":4981," ru":2498," ry":1961," u ":3351," sa":3585," mě":10287," kř":2709," se":38618," sc":1843," si":3884," sn":1263," sm":2713," sl":8492," sk":7519," sr":1447," sp":14637," so":12794," mí":3137," ra":2856," ná":11985," re":10229," ro":31079," ní":1455," pr":44618," ps":1456," s ":10437," má":2572," os":6522," ot":1283," ov":1214," op":3355," or":4357," oz":5841," lá":1294," pe":2741," pa":10148," pl":5963," po":56929," vy":18421," vz":6214," sí":2537," pě":1240," va":1643," ve":20353," uv":1309," mů":1354," vo":5885," vr":1606," vi":2111," vl":5324," ty":2521," tv":3467," tu":1727," us":1203," ur":2603," um":2673," un":1389," ta":9553," ně":9405," v ":54233," sy":4875," st":28273," sv":11047," su":1768," pí":1932," tr":6193," to":5957," th":1725," ti":1444," pá":1635," te":10751," Če":5018," čí":2219," čá":5494," čt":2423," če":12226," čl":2635," či":5961," ča":2613,"ňuj":1462," ús":1372," úz":3683," úč":1533," šk":1284," řa":1815," ře":4682," ří":3390," ži":4198," že":3745,"Evr":1575,"Fra":1982,"šší":1462,"ším":3926,"šíc":3060,"Byl":1846,"šíř":1269,"ší ":8873,"Cha":1224,"Nov":1143,"Par":1206,"Pro":2287,"Pra":2828,"Pod":1248,"Pol":1195,"ůzn":1758,"ůso":2743,"ůvo":3624,"Jed":1974,"Jeh":1143,"Je ":2998,"ům ":1924,"Kar":1595,"Mar":2760,"řez":1271,"řev":1736,"řet":1507,"řes":3291,"řen":6131,"řel":1284,"ři ":4648,"řed":13088,"řej":1316,"řek":2349,"řeb":1351,"řec":3115,"ře ":3029,"řad":2628,"ého":25401,"ém ":11396,"řsk":2573,"áře":1219,"ému":2422,"éna":1816,"éno":1417,"éri":1326,"Sta":2172,"Spo":1540,"Slo":1556,"Vel":1392,"The":1321,"šec":1390,"šen":2332,"še ":1844,"šak":1283,"šti":2097,"šní":1582,"ško":1848,"šin":1751,"ště":3036,"átu":1328,"bje":2435,"áte":1666,"átk":2625,"átn":2002,"áto":1624,"bit":1438,"ást":9035,"bil":2518,"bo 
":10795,"ávi":1211,"bli":4783,"ávn":2176,"bla":3565,"áva":2824,"bod":1239,"bol":1903,"boj":1202,"blí":1257,"ázv":1186,"ává":2557,"áze":5164,"bor":3758,"bov":1176,"bou":1297,"álk":1854,"áln":9542,"álo":2573,"álu":1211,"ákl":2414,"bal":2672,"áko":1419,"án ":4623,"ále":4254,"bar":2156,"áli":1317,"áns":1949,"bdo":1660,"áno":2096,"ány":1316,"ánu":1360,"ámo":1148,"bce":1301,"ána":2621,"áni":1332,"bec":2884,"ber":2768,"ben":2163,"bez":1903,"ápa":4357,"ámě":1142,"ání":9623,"át ":2061,"áro":4387,"árn":3371,"ámý":1162,"áda":1737,"ách":3663,"áce":1349,"ádá":1824,"ábě":1152,"ca ":1459,"ák ":1153,"ál ":2023,"ce ":37517,"ám ":1274,"bri":1805,"bro":1589,"bra":3506,"bu ":2220,"bní":3217,"bsk":1200,"bsa":1785,"bur":1260,"bud":1860,"bvy":1413,"by ":4307,"byl":18860,"byv":1815,"am ":2742,"ake":1229,"aji":2410,"al ":7387,"adě":2234,"aje":2039,"ak ":5398,"ahu":2375,"ahr":2506,"aha":1562,"ací":4252,"aké":5289,"anu":1370,"any":1635,"ano":3773,"ann":1847,"ant":5679,"ans":3967,"ají":6907,"ane":2662,"ang":5036,"ani":8977,"ank":1763,"ana":4560,"anc":5148,"and":5394,"amu":1194,"amo":2812,"amn":1598,"ami":3729,"ame":8001,"ama":1900,"aly":1179,"als":1998,"alo":7874,"all":1742,"ali":6637,"ale":6714,"ala":3987,"alb":1604,"an ":5076,"aku":1247,"akt":3867,"ako":12804,"abe":1141,"abs":1339,"ae ":1943,"ad ":4944,"afi":1777,"age":1161,"ael":1206,"ado":1550,"adl":1340,"adn":6465,"adi":3651,"ade":3282,"ady":1580,"adu":2552,"aco":1432,"aci":3453,"ach":6559,"ace":7532,"ada":3897,"arš":1293,"azu":1350,"aze":2771,"azy":2655,"até":1146,"apř":2101,"ba ":3672,"azý":1373,"avě":1439,"atř":2572,"at ":6248,"aná":3560,"are":2437,"ard":3288,"arc":2227,"ara":4391,"aro":4126,"arn":1320,"arm":1290,"ané":4536,"arl":1695,"ark":1851,"ází":4141,"ari":4139,"ars":2390,"art":3122,"asa":1162,"amě":1185,"asi":3609,"aný":5688,"ase":1167,"aso":1587,"asn":2528,"ask":1257,"ar ":2605,"alá":1216,"ape":1349,"alé":1624,"apo":2941,"as ":2058,"alý":1783,"ava":3516,"aut":3726,"avs":1166,"avo":2741,"avn":5290,"avi":3981,"ave":3850,"avy":1957,"avu":1943,"av ":1604,"ata":3060,"aně":2278,"ast":13117,"atn":1969,"atk":1885,"atr":1563,"ato":5121,"ate":11112,"ati":10351,"alš":1631,"atu":2078,"aur":1970,"řád":2065,"ří ":6262,"řív":1184,"říz":2145,"říc":1240,"říp":1616,"řís":1609,"řím":2008,"řík":1716,"říd":3286,"říž":1421,"až ":2790,"jeh":3911,"jej":4795,"jed":12881,"jek":1841,"jem":2659,"jen":5483,"jev":2523,"ji ":4802,"ažd":1260,"bře":2487,"děl":3685,"jak":10723,"děn":1255,"děj":2198,"jaz":2568,"je ":83650,"jme":2592,"jno":1257,"jov":2720,"jin":3546,"jih":2344,"jic":2414,"ito":2582,"itu":2506,"its":1828,"ity":1661,"isk":2395,"ism":2509,"isl":2097,"iso":2022,"ist":15778,"ině":2331,"ita":3897,"ite":5726,"iti":3568,"ivo":2672,"ivn":3856,"ium":1381,"iva":2483,"ivi":1448,"ive":2466,"is ":3839,"ion":6938,"iro":1733,"irm":1303,"ise":1488,"iný":1514,"iná":3960,"it ":3805,"dě ":4195,"itý":2115,"ití":1214,"ité":1984,"itá":1643,"izo":2349,"iza":2404,"km ":2378,"kdy":3252,"kej":1210,"kem":3800,"ket":1376,"ke ":1607,"kce":1649,"kde":1621,"kra":5687,"kre":3334,"kt ":1234,"ku ":19613,"kro":2033,"kov":11417,"kou":9659,"kos":4285,"kor":1310,"kop":1321,"kon":10573,"kom":4572,"kol":7494,"úze":3480,"klá":2870,"kni":1628,"klu":1477,"ko ":14736,"kle":2073,"kla":8470,"klo":1299,"jso":6864,"jsk":2959,"ečn":6772,"jmé":3893,"eče":1504,"již":2514,"kaz":1148,"kat":3393,"kar":1707,"kap":1616,"kan":2047,"kal":2034,"kam":1509,"kac":1503,"ka ":20272,"jvě":1879,"ha ":4741,"bě ":2724,"han":1905,"hal":1464,"har":2922,"běh":2126,"he ":2188,"dá 
":2060,"dál":1753,"dáv":1525,"her":2617,"dán":2244,"hem":3497,"his":2398,"běž":1211,"hla":5571,"hle":2243,"hlo":1435,"ho ":41844,"gli":3394,"cí ":18956,"cíc":3683,"gra":4857,"cím":1211,"ial":1377,"ian":1754,"ic ":2965,"ibl":1394,"ia ":4182,"ien":1656,"aří":2027,"ier":1153,"ifi":1720,"ict":1675,"ick":33133,"ici":3981,"ich":6375,"ice":11279,"ie ":13367,"ica":1513,"ido":1643,"idl":1498,"ide":3474,"ida":1748,"il ":4856,"ika":5332,"ii ":5869,"igi":1176,"icí":1825,"iho":2925,"ik ":2696,"imo":1396,"ime":1223,"inc":4325,"ind":2184,"ina":8223,"inn":2167,"ino":5250,"int":2630,"ins":4625,"inf":1536,"ine":3495,"ing":4248,"ini":3264,"ink":1380,"iká":1715,"inu":3551,"iny":6607,"iko":2695,"ikl":2333,"ila":2431,"in ":6095,"iky":3097,"ikt":1203,"iku":2133,"ilo":3489,"ill":1963,"iln":1971,"ilm":2591,"ili":3988,"ile":1526,"hok":1282,"hol":1704,"hot":1459,"hou":1879,"hov":5549,"hor":3514,"dí ":3801,"hod":8706,"hni":1267,"huj":1855,"hud":2458,"hu ":4702,"hro":1617,"hrn":1403,"hra":7196,"díl":2411,"hyb":1243,"hož":1183,"hy ":3116,"hum":1365,"hrá":2941,"evš":1525,"ext":1409,"etí":2247,"exi":1437,"ezn":2836,"eze":2667,"ezi":7262,"eně":1924,"eta":2522,"ete":4466,"eti":3348,"etn":2128,"esp":1404,"esn":2468,"eso":1292,"est":7801,"ev ":3656,"eto":3093,"etr":4039,"erá":7485,"eve":6187,"evn":2175,"eré":8410,"evi":2322,"erý":10219,"er ":7795,"eor":1444,"es ":7550,"epu":2593,"elé":1267,"epr":1430,"eri":9740,"erg":1674,"ere":2706,"ená":5438,"era":5702,"et ":4584,"emí":4275,"esk":11348,"esi":1360,"ený":6913,"ese":3215,"emě":2087,"erz":1749,"erv":3468,"eru":1909,"ení":16758,"ert":1878,"ers":3684,"ern":7190,"erm":2474,"ené":4823,"ero":6299,"ekl":1253,"eko":2098,"ekt":6413,"en ":12357,"ela":2337,"ele":12125,"eli":4792,"elm":1412,"eln":3331,"elk":5262,"ell":1825,"elo":2426,"els":2269,"ema":1842,"eme":2383,"emn":1840,"emo":2184,"emi":4395,"ene":4580,"ena":4918,"end":1571,"enc":3468,"eno":6243,"enn":2162,"eni":2669,"ens":7642,"ent":12967,"ejí":5227,"eny":2102,"egi":1835,"ej ":1214,"edí":1331,"eho":5425,"ek ":7971,"ein":1192,"el ":8637,"ejs":2194,"ejn":3303,"ejm":2020,"eji":2735,"em ":31739,"ejv":3622,"gin":1170,"gio":1330,"gie":2007,"gic":2475,"gii":1170,"gen":3213,"býv":3044,"být":1189,"gan":3061,"ga ":1144,"íže":1335,"íž ":1472,"fun":1501,"fra":2633,"ače":3487,"ačo":2232,"ačn":3017,"aču":2194,"for":4925,"fot":2015,"bí ":2241,"fic":2581,"fil":3337,"fik":2015,"fin":1574,"fir":1363,"da ":6390,"de ":6350,"dal":3374,"daj":1518,"dat":3084,"dan":2182,"dce":1208,"ctv":1837,"cně":1190,"cko":3237,"chá":6943,"cky":4430,"ciá":2350,"co ":1224,"cká":6180,"cké":15214,"cov":2906,"cou":2423,"cký":12856,"ch ":62063,"ces":2551,"cen":5710,"cem":1899,"cel":4240,"ci ":10328,"cha":4867,"chu":1777,"chy":1469,"cia":1607,"cie":2249,"che":3452,"chl":1932,"chi":2422,"cho":11778,"chn":2940,"cht":1572,"chr":1838,"ed ":3026,"ebn":2564,"ebo":12455,"ec ":4754,"edl":1223,"edm":1145,"edn":17139,"edi":5930,"ede":6652,"eda":1810,"edy":1655,"eds":1915,"edo":3095,"eck":8996,"ech":12275,"eci":1513,"ecn":1350,"dy ":11421,"drá":1267,"dvo":3395,"dva":1139,"dor":1289,"dop":2114,"dom":1869,"dol":1600,"dok":1496,"dov":6030,"dou":1896,"dos":2695,"dpo":1574,"dna":2226,"dne":2707,"dni":1534,"dno":8163,"dob":7531,"dst":2606,"íře":1323,"dně":4952,"duc":1342,"dné":1547,"dra":2227,"dná":2916,"du ":8252,"dro":2166,"dní":18004,"dru":7228,"dsk":2773,"dic":1916,"dia":1263,"der":3512,"des":2009,"dev":1882,"deb":1947,"dec":1456,"del":3171,"den":6683,"dem":3354,"di ":2989,"dle":4381,"dla":1946,"do 
":8212,"dlo":2844,"div":1579,"din":4766,"dio":1325,"dis":2494,"die":1484,"rga":2761,"rgi":1143,"ižn":3600,"ret":1327,"res":5841,"nás":2254,"náv":1475,"rez":1829,"náz":3803,"rač":1242,"iž ":1205,"rea":1647,"nác":1291,"rec":1925,"reg":1788,"nám":4897,"rem":3700,"ren":2252,"rek":1273,"nál":3115,"nár":3570,"rep":3246,"rdi":1273,"ná ":14630,"re ":2546,"rch":3978,"rce":1159,"raz":3775,"rd ":1372,"ras":1475,"rat":6479,"rav":9308,"raj":4559,"rah":1855,"ran":13598,"ram":4289,"ral":1943,"rak":3296,"rab":1667,"raf":2398,"rad":5238,"rac":4788,"ros":9033,"rot":4473,"rom":4735,"ron":3499,"rop":5057,"roz":10708,"rou":3738,"rov":17134,"rob":3794,"rod":11155,"roc":8820,"ní ":79844,"roj":4450,"rol":2078,"rok":8745,"rof":1452,"rog":2059,"rno":1887,"rnu":1168,"rna":1957,"rež":1163,"rni":1331,"ném":2468,"rmo":1370,"jší":4959,"ro ":11368,"rma":3339,"riá":1833,"néh":5048,"né ":18438,"raž":1181,"rit":3911,"ris":3258,"rig":1476,"rik":1662,"rin":2410,"ria":2007,"ric":8579,"rid":1291,"rie":3806,"rk ":1359,"ryc":1171,"ruh":6961,"rum":1222,"ruk":1449,"rus":3302,"rva":1915,"rve":2525,"rvn":4272,"ry ":6263,"rsk":6183,"rně":2168,"rst":1339,"rto":1577,"rti":1692,"rub":1147,"rtu":1152,"ním":14655,"ník":6579,"níh":7211,"rmá":1574,"níc":9560,"rt ":1561,"rné":1326,"ru ":6692,"rní":7275,"sah":2489,"měl":2079,"sam":2091,"nýc":8901,"ným":4766,"měn":2340,"san":1523,"sau":1548,"měs":9181,"měr":2410,"sa ":1633,"mě ":2287,"ruž":1312,"ný ":15629,"si ":3518,"měř":1748,"sin":1775,"sil":2216,"se ":32266,"sch":2059,"sev":4239,"ser":1773,"sed":1877,"kří":1234,"sen":1348,"sem":1612,"sel":1714,"spo":10422,"spr":1930,"spe":2260,"spi":1842,"ský":19060,"sou":17153,"sov":4194,"ské":26627,"son":1432,"ská":8637,"sof":1375,"soc":1565,"sob":5198,"su ":2847,"sní":1788,"sné":1153,"st ":11395,"slo":9919,"slu":2567,"sky":9407,"sla":4078,"sle":3342,"skl":1707,"sko":9797,"sku":8770,"ska":4807,"ske":1405,"sni":1352,"smu":2054,"stí":6180,"sté":3269,"stá":6877,"syn":1327,"sys":2806,"stě":3272,"své":1798,"smě":1301,"ste":9093,"sně":1417,"sta":23517,"stn":5308,"sto":14050,"sti":18925,"stl":1904,"stv":6645,"stu":6506,"str":16919,"sts":1291,"ků ":4430,"sty":1890,"sva":1646,"svo":1261,"sy ":1657,"něj":4079,"tak":7734,"něk":4566,"něl":1306,"tal":4758,"tac":1310,"tad":1311,"tba":1700,"tav":8149,"tat":3631,"tas":1264,"tar":5738,"tan":5175,"něn":3988,"něm":4105,"te ":2317,"svě":5020,"stř":6161,"éž ":3711,"ně ":25254,"ta ":12315,"ký ":20031,"ouž":6121,"ozá":1627,"pa ":1199,"ově":6255,"lá ":1917,"kýc":10186,"kým":4999,"par":3389,"pat":3635,"pad":7309,"pak":1199,"pal":1582,"pam":1204,"pan":2500,"pec":1647,"lád":4346,"lán":1802,"pen":1946,"per":4148,"lát":1414,"pla":3524,"ple":1738,"plo":2559,"lé ":2839,"pic":1267,"pin":4585,"pis":3803,"poz":3018,"por":4291,"pop":2679,"pov":4058,"pou":6174,"pot":2565,"pos":5385,"poj":5321,"poh":2575,"pom":2458,"pon":2428,"pok":1746,"pol":12866,"pob":1330,"poc":1426,"pod":13049,"láš":1492,"po ":3633,"lí ":2188,"psk":1538,"hů ":1499,"pub":3056,"poč":3912,"pra":9290,"prv":4970,"psa":1291,"pu ":1292,"pri":1503,"pre":2679,"pro":30809,"prá":4755,"poř":1288,"py ":1312,"prů":1838,"lý ":2387,"má ":3426,"mát":1458,"mán":1640,"išt":1515,"mén":4741,"mí ":3982,"mís":2771,"mír":1517,"mín":2069,"ra ":9350,"eži":1875,"ngl":4299,"ni ":2504,"nge":1442,"nej":9246,"nek":1332,"nen":1766,"nem":4884,"nep":1631,"ner":3084,"net":2605,"nes":3430,"ež ":1798,"ng ":3190,"neb":12487,"nec":2031,"ned":1356,"nfo":1282,"nač":7033,"nez":1422,"nco":2679,"nci":5166,"nce":5874,"ne 
":3607,"ndo":1521,"ndi":1883,"nde":1639,"nda":1497,"nak":1732,"nal":3376,"nam":6213,"nan":1425,"nap":3429,"nar":1416,"nac":3724,"nad":3749,"nd ":2404,"nav":1415,"nat":3143,"nas":1417,"naz":1632,"na ":53204,"mys":1461,"mož":1630,"nož":1329,"ny ":15609,"nuj":1460,"nut":2319,"nto":3206,"ntu":1650,"ntr":2728,"nti":4061,"nta":3294,"nte":4199,"nst":3785,"nsk":18177,"nu ":7672,"ičn":2337,"nné":1576,"nt ":3215,"jím":1621,"jíc":10934,"noh":2321,"nol":1273,"nom":2363,"not":3595,"nos":13323,"nor":2008,"nov":9782,"nou":8746,"než":1270,"nno":1485,"nič":2090,"jí ":9261,"neš":1391,"no ":7023,"nka":1355,"nko":1306,"eží":3799,"nih":1649,"nie":1760,"nic":12884,"niz":2240,"niv":1387,"nis":3526,"nit":3056,"nin":2133,"nik":6308,"ogr":3240,"ogi":4525,"odí":1145,"ohl":1659,"oho":4168,"oha":1909,"obě":2552,"oj ":1676,"ok ":1263,"ohy":1525,"ojo":1142,"ojm":1998,"oji":1192,"oje":7276,"odě":2068,"obř":1215,"ol ":1430,"oce":9242,"och":4480,"oci":2151,"ock":1389,"obs":1745,"obv":2042,"obu":1622,"oby":3426,"ká ":15983,"ode":4045,"odl":3777,"odi":2898,"odo":4607,"odp":1587,"odn":14263,"ods":1272,"odr":1258,"of ":1489,"oda":2197,"kán":1145,"ody":1671,"odv":1246,"odu":5226,"ofi":1587,"obí":2810,"oba":1485,"od ":15166,"obo":2639,"obr":2915,"obl":4657,"obn":3328,"obj":2067,"obi":2455,"obd":1788,"obc":2803,"obe":3958,"ový":12973,"orů":1170,"ové":14190,"ozn":6156,"ozl":1181,"ouč":4572,"ozo":1460,"ozd":2636,"oze":3842,"ová":16410,"ozi":1341,"oty":1225,"oud":1374,"oub":1598,"ouc":1601,"otk":1156,"oti":3017,"ote":2572,"oto":6726,"otn":1824,"ost":32198,"ota":1788,"otb":1150,"ov ":3056,"osi":1402,"osk":1491,"ose":1306,"osp":1323,"osm":1339,"osl":4606,"oso":3077,"ovy":1368,"ovi":10902,"ovn":5524,"ovo":6751,"ovs":3742,"ouz":3479,"ova":19936,"ove":6197,"ouh":2082,"oun":1766,"oup":1457,"ous":3223,"out":2402,"opo":2402,"opi":2216,"ope":2692,"opa":2307,"os ":1967,"opu":1150,"opr":3218,"olí":1626,"ops":1833,"or ":5513,"ork":1330,"orm":4795,"orn":2809,"oro":4330,"ord":1649,"ore":3272,"oná":1885,"org":3417,"ori":6242,"ou ":37147,"omě":1557,"osa":2963,"ort":3045,"ors":1818,"oru":3220,"ory":1722,"omá":2183,"ora":3762,"ívá":2944,"íze":1985,"ola":2087,"on ":7329,"oli":8828,"ole":10375,"ols":1995,"oln":2388,"olo":8650,"oly":1795,"olu":2611,"oka":1642,"om ":1785,"íst":3954,"ké ":30775,"ísl":1496,"oke":1629,"íta":1783,"okr":4661,"oko":4002,"íků":1200,"oku":8537,"ona":3352,"ond":1711,"onc":1948,"one":2668,"ong":1188,"oni":3548,"ono":3674,"ons":4146,"ont":3048,"onu":1779,"ony":1528,"oma":2266,"ome":3465,"omi":2447,"kéh":13733,"kém":5563,"omp":1767,"omo":5000,"omu":2175,"íva":3319,"la ":20221,"ími":2048,"íms":1206,"íns":1474,"ím ":20382,"kvě":1156,"ín ":1890,"íle":1385,"le ":15329,"íro":1439,"írk":1295,"íse":1140,"lac":1653,"lad":8838,"ípa":1629,"lan":4986,"lam":1157,"lat":5358,"las":7702,"lav":8047,"krá":4170,"kup":4964,"kum":1330,"kul":2572,"ky ":27061,"ích":18473,"kte":27582,"íce":2062,"ídl":1751,"ců ":1941,"ktr":2141,"ktu":2012,"kti":3026,"kto":1589,"kyt":2072,"ík ":3909,"ící":14415,"ího":9884,"lok":1275,"lon":1573,"lké":1191,"lom":1730,"lod":1650,"loh":1236,"log":5594,"lký":1221,"los":3409,"lou":5777,"lov":17182,"lni":1153,"lež":2967,"lič":1203,"lmi":1186,"ltu":1148,"dů ":1701,"lub":1957,"lsk":6596,"lně":2708,"lné":1553,"lní":11343,"lu ":4880,"lný":1549,"liš":1271,"li ":6590,"lez":2008,"ház":6195,"lev":1768,"les":3807,"let":9902,"ler":1221,"lem":5693,"len":8028,"lek":3907,"lej":1379,"led":7715,"lec":1964,"eň ":1751,"lo ":9285,"lla":1318,"lle":1365,"lli":1334,"lko":2458,"lky":2194,"lka":1525,"leč":4689,"hé ":1767,"lm ":1297,"ll 
":1176,"lit":5959,"lis":4420,"lin":6175,"liz":1555,"liv":3315,"lic":7730,"lid":2928,"lia":1350,"lik":5494,"lig":1241,"lie":1691,"ma ":3621,"mac":1223,"maj":1332,"mar":1328,"mal":2994,"man":3894,"mat":5599,"me ":1813,"med":1375,"mec":4748,"met":4900,"mer":5988,"mem":1269,"iál":4186,"men":11122,"mez":6394,"ly ":6413,"lož":5290,"moc":2506,"mob":1461,"mod":2374,"mon":2073,"mov":3077,"mor":1679,"mos":2024,"mot":2793,"mou":1328,"mní":1927,"mu ":8025,"msk":2845,"moř":3176,"my ":2400,"mus":1983,"mun":1943,"mi ":12425,"ešn":1222,"min":3830,"mil":1861,"mis":1525,"ešt":1436,"mic":2749,"mo ":1420,"ií ":2195,"mno":2859,"tří":4710,"tši":1374,"tší":2640,"ůže":1655,"Čes":3563,"vě ":4735,"věz":1289,"vět":11745,"tři":1228,"věk":2228,"tře":8474,"věd":2018,"výr":2025,"výs":1250,"vým":2640,"výz":2611,"výš":1486,"čas":5969,"zná":4232,"uča":1905,"zní":1327,"víc":1941,"čen":7616,"čel":3765,"čet":1680,"čes":6771,"čer":2858,"zný":1440,"zu ":1323,"zsk":2646,"učá":1855,"či ":4977,"rů ":3148,"zuj":1535,"čit":1616,"způ":1462,"čin":2747,"růz":1961,"čka":1735,"zyk":1998,"čle":1629,"čov":3019,"vý ":6798,"čno":3388,"výc":8997,"zi ":5006,"zač":1189,"zej":2847,"zev":2458,"zen":7446,"ván":11057,"zem":8432,"vál":2700,"zel":1300,"zer":1349,"vá ":12375,"ze ":9310,"zde":1205,"zab":1246,"zac":2566,"těj":1439,"zah":1842,"těn":1733,"zal":3135,"těl":1552,"zas":1298,"ví ":7249,"zor":1418,"zov":3172,"zpr":1480,"vém":1518,"rší":1390,"véh":3281,"zna":13601,"zni":4038,"vé ":13437,"zko":1259,"zkr":2046,"zdí":1212,"zař":1286,"zin":1828,"zit":1344,"zdě":2035,"těž":1441,"yrá":1302,"yva":1858,"ytu":1246,"ytv":1652,"yto":1723,"yst":4390,"yso":1389,"ysk":1190,"ysl":1771,"tě ":5527,"za ":5089,"ych":2524,"tým":1477,"tý ":2037,"týc":2116,"ykl":2523,"yla":6390,"ylo":2929,"yly":1314,"yl ":10326,"tím":1511,"tém":3602,"tí ":11245,"též":3726,"pův":2875,"půs":2701,"ože":4937,"ožn":1513,"oži":1506,"tán":1850,"pří":8711,"tát":4498,"ož ":2063,"tál":2187,"té ":3506,"xis":1170,"př ":1535,"tá ":1819,"při":7219,"pře":14856,"sér":1644,"síd":1352,"rče":1253,"oří":2027,"vzn":2563,"vzd":1455,"vyk":2012,"vyr":1739,"vyd":1992,"vys":3366,"vyt":1966,"vyv":1398,"rý ":8817,"vyš":2053,"rýc":1165,"oři":1304,"oře":3041,"vní":15429,"vro":2455,"vrc":1706,"vst":1754,"vsk":4082,"vu ":2977,"nů ":2566,"vně":2710,"vuj":1416,"vy ":4952,"voř":3595,"vil":1507,"vin":6325,"vic":2325,"vid":2543,"viz":1532,"vit":4611,"vis":2095,"ré ":7668,"vla":2806,"vo ":2147,"vna":1206,"vno":1207,"vni":1709,"vod":8774,"voj":3429,"vol":2921,"vor":1476,"vot":1529,"vov":1657,"vou":5924,"voz":2714,"vlá":2701,"vi ":1398,"ver":8833,"ves":2177,"rát":2382,"ráv":4812,"mž ":1199,"rán":2154,"ven":9163,"vem":1785,"rál":6073,"vel":5581,"ráb":1423,"ved":2640,"rác":2253,"ve ":12748,"rá ":6574,"val":7096,"van":12007,"var":2621,"vat":8883,"pěv":1148,"vaz":1671,"vac":2111,"vad":1154,"vaj":2879,"můž":1309,"va ":9872,"uze":1855,"uzs":2041,"utí":1162,"urč":2415,"usk":3978,"use":1494,"umě":1413,"ust":3573,"ute":1270,"mů ":1642,"uto":4678,"us ":7285,"ut ":1210,"ura":2035,"ure":1155,"uri":1389,"urn":1235,"uro":1540,"uru":1160,"ury":1381,"upi":4594,"upe":1451,"upn":1306,"umo":1169,"umb":1314,"ume":1179,"ují":7869,"unk":1600,"uni":2964,"uko":1228,"um ":4532,"ult":2459,"ulo":1228,"uhé":1788,"uli":1303,"ula":1664,"uje":11818,"ucí":1196,"uho":1206,"ude":2608,"udi":1376,"ubo":1407,"uce":1140,"uch":2624,"uh ":1827,"udo":2156,"ub ":1463,"ubl":3299,"ud ":1221,"trů":1395,"tví":6726,"typ":2401,"ty 
":7625,"tvr":1530,"očí":2254,"tvo":4249,"trá":2691,"tva":2695,"tur":4631,"tuj":2396,"tup":3639,"tud":1736,"pís":1810,"oče":1902,"tre":1691,"tra":10732,"tné":1153,"oči":1195,"tri":3505,"tru":3242,"tro":11520,"očn":1259,"tní":9445,"tu ":7164,"tný":1437,"tsk":6630,"tně":1562,"lů ":2152,"to ":14761,"lší":1771,"tna":1155,"tno":1779,"toh":1166,"tou":3172,"tov":8578,"tos":1217,"tom":3677,"ton":3319,"tok":1791,"tol":3953,"tor":9066,"top":2079,"til":1384,"tik":3154,"tit":2068,"tis":2205,"tin":7629,"tio":2588,"thu":1152,"tic":11897,"teč":1644,"tiv":5666,"tko":1227,"tka":3033,"tli":2690,"tky":1508,"tla":1310,"tem":4570,"ten":3204,"teo":1298,"tej":1972,"tek":2779,"tel":12611,"tec":6325,"ted":1925,"tev":1290,"ter":37331,"ti ":14561,"tač":2083,"the":1375,"ží ":4449,"žíc":1641,"žív":5443,"yšš":1364,"zýv":1380,"žil":1490,"živ":2602,"žit":2707,"žij":1345,"žov":1554,"že ":5417,"žel":1619,"žen":7485,"záv":1903,"záp":4090,"zák":3066,"uži":2255,"ýt ":1211,"ýro":1252,"ým ":11479,"ými":4161,"žní":2919,"ých":32937,"žsk":1623,"žně":2362,"vší":1568,"zí ":4964,"vša":1280,"vše":1958,"uží":6581,"tů ":3433,"ýzn":2182,"ývá":1959,"ýva":2863,"uše":1228},"n_words":[11333226,13010717,8780627],"name":"cs"}
\ No newline at end of file
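For reference, each profile file removed by this patch is a single JSON object with three fields, visible in the payloads above: "freq" (a map from 1- to 3-character n-grams to their corpus counts), "n_words" (the total number of 1-, 2- and 3-grams counted), and "name" (the language code, e.g. "ca" or "cs"). A minimal sketch of reading one such profile and computing a relative n-gram frequency follows; it assumes only the structure shown above, and the load_profile helper and file path are hypothetical, not part of this patch.

import json

def load_profile(path):
    # Each profile file is one JSON object:
    # {"freq": {...}, "n_words": [n1, n2, n3], "name": "..."}
    with open(path, encoding="utf-8") as fh:
        return json.load(fh)

profile = load_profile("profiles/cs")  # hypothetical path
# The relative frequency of an n-gram is its count divided by the total
# count of n-grams of the same length, i.e. n_words[len(ngram) - 1].
ngram = "st"  # appears in the "cs" profile above with count 145711
count = profile["freq"].get(ngram, 0)
total = profile["n_words"][len(ngram) - 1]
print(profile["name"], count / total)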
+++ /dev/null
-{"freq":{"D":338489,"E":166377,"F":175306,"G":453070,"A":371519,"B":275760,"C":751456,"L":302911,"M":508956,"N":226941,"O":101563,"H":125996,"I":97942,"J":76318,"K":27436,"U":63033,"T":165496,"W":129096,"V":18216,"Q":3136,"P":245280,"S":271355,"R":184371,"Y":305003,"X":1831,"Z":4074,"f":4291702,"g":4740469,"d":12541164,"e":10317499,"b":2060020,"c":3013168,"a":13050188,"n":11443262,"o":7624533,"l":7117558,"m":2908226,"j":9805,"k":97581,"h":4955284,"i":9078358,"w":5492011,"v":80439,"u":3661293,"t":3899111,"s":3554349,"r":9375348,"q":5143,"p":918555,"z":19700,"y":11865619,"x":17901,"\92":1460,"£":24959,"ï":24620,"î":12446,"í":2171,"ë":2404,"ê":12514,"é":9458,"è":1489,"â":204955,"á":11089,"û":2546,"ö":4198,"ô":116116,"ó":1726,"ŵ":34962,"ŷ":23825," l":444350," m":892336," n":991408," o":1241413," h":924635," i":1098661," j":1707," k":16142," d":1472270," e":1030930," f":928615," g":2001832," a":3061352," b":1038202," c":1432889," y":4250280," u":270784," t":410248," w":722625," v":2781," p":602564," s":805496," r":1506799," J":76301," K":27204," H":125793," I":97830," N":226785," O":101451," L":302168," M":506691," B":275536," C":750240," A":370037," F":175148," G":448916," D":334497," E":166168," Z":4022," Y":304965," X":1781,"о":1412," S":271032," R":184243," Q":3067,"а":1632," P":245017," W":128867," V":18126," U":62998," T":165103," â":171287," ô":68394," £":24955," ŵ":5995,"ا":1484,"A ":51136,"F ":1624,"Da":83430,"Cu":4761,"Cw":8435,"Cy":336581,"Cl":14084,"Co":52549,"Cr":41637,"Ce":66425,"Ch":48051,"Ci":4683,"G ":8214,"Ec":3834,"Ed":13126,"Ea":2252,"Eb":7093,"Dw":4392,"Du":9735,"Dy":52260,"Do":17846,"Dr":14255,"De":55959,"Dd":28273,"Di":51623,"Fe":52440,"H ":1793,"Fa":17497,"Eu":3442,"Ev":3997,"Ew":22302,"Er":29255,"Et":3900,"Es":7913,"En":10149,"Em":3160,"Ei":18716,"El":12213,"Ef":13230,"Eg":4955,"Ge":36048,"Ga":68330,"I ":18154,"Fy":5682,"Fw":2093,"Fu":2056,"Fr":13553,"Fo":36065,"Fl":3457,"Fi":6588,"Ff":31987,"B ":3900,"C ":51723,"Au":3464,"Aw":14409,"Ar":59958,"At":8101,"As":10798,"D ":3873,"Ba":34881,"Ae":32249,"Af":18144,"Ag":3289,"Ab":19319,"Ac":8551,"Ad":20323,"Am":33758,"An":24599,"Ap":2294,"Ai":10453,"Al":50222,"By":49106,"Bw":10815,"Bu":24953,"Br":56488,"Ca":121524,"E ":5353,"Bh":1396,"Bi":5859,"Be":36724,"Bo":26776,"Bl":19768,"Ko":1892,"Le":24072,"Li":11077,"N ":2781,"La":23873,"Lu":6554,"Ly":19349,"Ll":197561,"Lo":16574,"Me":55686,"Mh":9909,"Mi":19373,"O ":20786,"Ma":353446,"Mc":1615,"My":8802,"Mw":2121,"Mu":8098,"Mr":4592,"Mo":28206,"Nh":3558,"Ni":66554,"Ng":81458,"Ne":23468,"Na":15773,"P ":4199,"Q ":4850,"Nw":4046,"Ny":1586,"Nu":1470,"No":23777,"Ol":5539,"On":7737,"Og":3813,"Oh":2023,"Oe":13014,"Of":2109,"Ob":1524,"Gi":6350,"Gl":18922,"Gr":31398,"Go":63652,"Gu":3975,"Gy":66269,"Gw":140934,"J ":2021,"Ha":33467,"He":30826,"Hi":10111,"Ho":16382,"Hu":14940,"Hw":2512,"Hy":14748,"Ia":6486,"Id":2894,"If":1982,"Ie":12677,"Io":10592,"In":11178,"Iw":8666,"Is":11408,"Ir":8587,"Ja":20953,"L ":1847,"Je":8470,"Jo":39810,"Ju":3096,"Ka":5774,"M ":8246,"Ki":8305,"Ke":4703,"Ur":4373,"Un":36736,"Uc":8498,"W ":2665,"Ty":9645,"Tw":3412,"Tu":6322,"Tr":35826,"Ts":3701,"To":17417,"Th":31400,"Ti":9045,"Te":14091,"Ta":24608,"V ":3179,"Sw":30740,"Sy":9582,"St":29219,"Su":12733,"Wr":10595,"Wo":5648,"Wi":24699,"Wh":3162,"Wl":2195,"Wa":16981,"We":55163,"Y ":52072,"Vo":1516,"Vi":5015,"Va":5513,"Ve":2919,"Uw":3260,"Pw":24712,"Pu":5107,"Pr":53804,"S ":7880,"Py":1882,"Pe":60817,"Pa":41773,"Pl":14210,"Po":23538,"Pi":5405,"Ph":9698,"Os":15921,"Or":7777,"R 
":3429,"Gŵ":1552,"Ow":7608,"Se":32095,"Sc":4665,"Si":41090,"Sh":7678,"Sg":4461,"Sm":1606,"Sl":2027,"Sp":3276,"So":13929,"Ru":6905,"Ry":6829,"Rw":4999,"U ":13762,"Sa":60869,"Sb":4209,"Re":10571,"Ri":8224,"Rh":86312,"Ro":44014,"Qu":2233,"Mô":8098,"T ":3460,"Ra":12393,"Tŷ":1928,"b ":188407,"a ":1388065,"Yc":2873,"Yn":102548,"Ym":25982,"Yo":2068,"Yr":86694,"Ys":28919,"Wy":8865,"i ":2272558,"fw":93325,"fy":431027,"gd":5453,"ge":277096,"gf":18580,"câ":3046,"ga":724292,"gb":3402,"fl":180022,"fg":1481,"ff":496818,"fi":195556,"fh":2098,"fs":1452,"fr":272586,"fu":97232,"ft":30862,"fo":484371,"fn":177691,"gy":688780,"gw":585146,"dâ":1740,"hf":16701,"hg":20440,"hd":28466,"he":488866,"hb":5711,"ha":816562,"gn":8582,"gl":184207,"gi":178183,"gh":227509,"gg":2030,"gu":106896,"gt":3652,"gs":2779,"gr":229726,"go":520931,"du":203071,"dw":342258,"dy":517237,"g ":931036,"ea":96187,"eb":183710,"ec":186277,"ed":1610960,"de":623948,"dd":3129194,"dg":7414,"df":82859,"di":1292188,"dh":11741,"dm":2081,"dl":170195,"do":681355,"dn":43219,"ds":26351,"dr":451732,"ew":322523,"ex":7983,"eu":450210,"ev":8136,"ey":50899,"ez":2026,"fa":475846,"h ":1177936,"fd":7994,"fe":594002,"fb":7441,"eh":22151,"eg":311387,"ef":503193,"ee":16361,"el":858160,"ek":2704,"ei":1198797,"ep":14479,"eo":166913,"en":926477,"em":108663,"et":784865,"es":501129,"er":1107411,"ca":313521,"e ":860790,"bw":117830,"by":344044,"bs":2707,"br":185005,"bu":80984,"bo":359315,"bl":303416,"bh":1799,"bi":52710,"bb":4778,"be":239715,"db":3993,"da":1020476,"f ":718154,"cy":633555,"cw":48093,"cu":7350,"ct":62415,"cs":12784,"cr":139745,"co":161171,"cn":2016,"ck":25213,"cl":59380,"ci":33119,"ch":1057816,"ce":98894,"dG":3638,"cc":2543,"c ":343775,"az":3933,"ay":14673,"ba":169166,"d ":3835329,"at":471705,"as":344140,"ar":1444434,"ax":1835,"aw":461613,"av":36603,"au":1090598,"ak":7934,"al":543082,"ai":983141,"aj":1537,"ao":21869,"ap":37508,"am":461201,"an":1743050,"ac":458246,"ad":1129155,"aa":2019,"ab":77615,"ag":211373,"ah":44754,"ae":1532785,"af":522427,"nu":171108,"nt":532413,"ns":84700,"nr":100822,"no":616110,"nn":767548,"nz":1568,"ny":424305,"nw":268361,"oe":579869,"of":163174,"oc":86294,"od":1314157,"oa":8299,"ob":214212,"om":118946,"on":847751,"ok":5774,"ol":1239958,"oi":78983,"og":340671,"oh":65618,"ot":47787,"os":336191,"ov":8960,"ou":32901,"op":89124,"oo":15722,"or":610467,"gŵ":2290,"r ":3281337,"ow":30262,"oy":15398,"pe":161740,"lâ":1645,"pa":189940,"pl":48051,"po":104525,"ph":62097,"pi":31396,"lo":304547,"ln":2199,"lm":22755,"ll":1220502,"ls":17011,"lr":6616,"lp":14553,"lw":345911,"lv":2550,"lu":249977,"lt":77926,"ly":476053,"dŵ":6771,"o ":1345269,"mc":23468,"md":109708,"ma":574820,"mb":31654,"mg":56104,"mh":64042,"me":302440,"iâ":1734,"mf":4671,"ml":92596,"mi":149661,"mn":16796,"mm":6621,"mp":32734,"mo":141620,"mr":302752,"mt":1985,"ms":42608,"mu":106959,"mw":147169,"my":85808,"p ":50248,"na":672622,"nb":21604,"nc":88780,"oD":3640,"nd":345573,"ne":760383,"nf":73346,"ng":590640,"nh":99637,"ni":826694,"nj":1540,"nk":5916,"nl":101812,"nm":6203,"fô":1550,"ki":8663,"kh":2267,"gâ":2707,"ke":13073,"ka":5930,"m ":593528,"ky":1861,"ks":5032,"ko":2829,"gî":3874,"kl":1969,"km":14280,"gê":2506,"li":629190,"lh":7775,"lk":4818,"le":691990,"ld":82933,"lg":46948,"hâ":4704,"lf":83881,"la":750380,"lc":95871,"lb":46438,"n 
":4787804,"dî":1547,"hr":183684,"hs":3825,"hp":4147,"hw":371101,"ht":6621,"hu":119315,"hi":282969,"hn":55025,"ho":475196,"hl":49179,"hm":5269,"dé":1797,"id":664839,"ic":192410,"ib":56293,"ia":1263604,"ih":12172,"ig":415868,"if":410052,"ie":166715,"hy":794639,"dô":5622,"k ":32674,"ir":477062,"is":369024,"it":440625,"iu":8410,"iv":7571,"iw":257532,"ix":2024,"ii":2747,"ik":6081,"il":405265,"im":67678,"in":760539,"io":738104,"ip":16792,"ji":1668,"iz":3140,"iy":26973,"l ":1847662,"ja":2972,"tî":3276,"wy":1614822,"sô":8739,"z ":4324,"xa":1792,"tâ":7066,"tá":6012,"wg":42604,"wh":1564,"wi":219189,"wl":160359,"wm":73543,"wn":705438,"wo":131443,"wp":14865,"wr":373917,"sï":6838,"ws":65535,"wt":12136,"wu":13055,"rô":9205,"y ":1910993,"wb":38887,"wa":392191,"wd":79870,"wc":88228,"wf":14814,"we":891527,"ré":1461,"vi":40244,"rï":4655,"rê":1538,"vo":3059,"uw":28315,"ve":20282,"va":11557,"x ":9287,"ui":8429,"uk":2069,"ul":134694,"ue":20744,"uf":15081,"ug":26706,"ur":216040,"us":144900,"ut":48913,"um":24406,"un":441720,"uo":44570,"up":3923,"ty":121986,"tz":1790,"tu":130624,"tt":25031,"tw":37964,"ub":10188,"ua":60074,"ud":237653,"uc":38880,"w ":540641,"lŷ":9710,"to":153977,"tn":16677,"pê":3111,"tm":2048,"tl":20346,"uM":1938,"ts":16715,"tr":321747,"tg":40447,"tf":4449,"te":321285,"ti":203530,"th":1535557,"v ":2088,"tb":45909,"tc":3805,"ta":264272,"su":69725,"ss":22899,"st":428431,"sy":392839,"sw":79656,"sl":16153,"sk":4499,"sn":55412,"sm":8824,"sp":4409,"so":185188,"sr":5002,"sd":3729,"sc":7973,"sf":7557,"se":260799,"sh":30472,"sg":271543,"si":399846,"u ":2090991,"sa":248766,"sb":54079,"rr":55164,"rs":84760,"nï":7723,"rt":328516,"ru":282945,"rv":2570,"rw":329957,"ry":365148,"rp":55838,"ro":602514,"rn":152852,"rm":51319,"rl":78753,"rk":10340,"ri":836480,"rh":499517,"rg":80259,"rf":190186,"re":579529,"rd":310455,"rc":129419,"rb":107129,"ra":893957,"t ":587222,"iŵ":5613,"mô":5956,"qu":4251,"mâ":2049,"s ":948343,"hŷ":8102,"py":7720,"lö":2760,"pt":11534,"pu":26642,"pw":69677,"pp":4438,"pr":132094,"ps":6999,"tŷ":1784,"rŵ":7188,"zi":1900,"ze":2519,"za":3932,"zo":1636,"yg":171315,"yh":72170,"ye":5474,"yf":684944,"yc":141109,"yd":1928458,"ya":51106,"yb":73857,"pŵ":1489,"yw":532100,"yt":123198,"ys":573646,"yr":754273,"yp":2904,"yo":15126,"yn":3470264,"ym":959076,"yl":365116,"yi":3130,"£ ":24958,"áu":7736,"âl":3882,"ân":16650,"âd":3031,"âi":1773,"âu":2621,"âr":2850,"â ":172133,"ôr":11547,"ôl":78411,"ôn":22987,"ïo":4314,"ïa":19236,"îl":4095,"îm":5990,"ên":2267,"êl":5660,"êm":2845,"ép":1550,"é ":1708,"ûm":2087,"öy":2639,"ŷ ":5300,"ŵe":1954,"ŵn":1440,"ŵp":7052,"ŵr":14740,"ŵy":8132,"ŷd":5489,"ŷn":11318,"ŷr":1615," £ ":24954," Ga":68273," Ge":35950," Fy":5682," Fw":2093," I ":9807," Fo":36048," Fu":2054," Fr":13547," Fi":6558," Ff":31955," Fl":3435," Ha":33448," He":30815," Gw":140920," Gy":66257," J ":1675," Go":63609," Gr":31246," Gu":3913," Gi":6313," Gl":18894," If":1978," Ie":12669," Id":2890," Ia":6484," Hy":14744," Hw":2512," Hu":14872," Ho":16371," Hi":10107," Je":8469," Ja":20951," Iw":8664," Ir":8586," Is":11396," In":11117," Io":10590," M ":2051," Ka":5749," Ke":4616," Ki":8206," Jo":39808," Ju":3094," La":23801," Le":23844," Li":11024," Ko":1892," Mc":1615," Ma":353332," O ":18653," Mi":19348," Mh":9907," Me":55666," Lo":16544," Ll":196342," Ly":19337," Lu":6544," Ne":23415," Na":15689," Ng":81454," Nh":3558," Ni":66542," Mr":4592," Mo":28166," My":8789," Mu":8036," Mw":2121," A ":41477," B ":1694," C ":12751," Ap":2290," Am":33754," An":24588," Al":50117," Ai":10453," Ag":3289," Ae":32207," Af":18136," 
Ac":7374," Ad":20296," Ab":19309," Ba":34835," D ":1809," Aw":14409," Au":3460," At":8099," As":10796," Ar":59933," Be":36689," Bi":5853," Bh":1394," Bl":19761," Bo":26726," Br":56435," Bu":24947," Bw":10815," By":49104," Ca":120911," Ce":66408," Ci":4665," Ch":47285," Cl":13991," Cr":41618," Co":52114," Cu":4731," Cy":336462," Cw":8435," Da":83350," Di":51598," De":55946," Dd":24619," Dr":14253," Do":17642," Dy":52256," Du":9735," Dw":4392," Ea":2216," Eb":7093," Ec":3834," Ed":13121," El":12185," Ei":18714," Eg":4955," Ef":13229," Et":3898," Es":7909," Er":29251," En":10082," Em":3156," Ew":22287," Eu":3440," Ev":3993," Fe":52438," Fa":17460," Wy":8863," Wr":10593," Wo":5603," Wl":2195," Wi":24683," Wh":3073," We":55056," Wa":15114," Y ":51971," Ys":28919," Yr":86693," Yo":2066," Yn":102547," Ym":25969," Yc":2871," a ":830478," Tŷ":1928," R ":2099," Gŵ":1552," Ow":7608," Os":15855," Or":7771," Po":23478," Pl":14184," Pi":5403," Ph":9643," Pe":60810," Pa":41679," Nw":4046," Ny":1586," Nu":1468," No":23771," Ol":5537," On":7732," Oh":2023," Og":3807," Of":2088," Oe":13006," Ob":1522," Ra":12355," T ":1831," Mô":8084," Qu":2176," Ro":43958," Re":10555," Ri":8220," Rh":86232," Py":1882," S ":2106," Pr":53766," Pu":5103," Pw":24712," Sy":9580," Sw":30728," Su":12731," St":29121," Ta":24600," Th":31386," Ti":9036," Te":13995," Tr":35797," Ts":3677," To":17329," Rw":4999," Ry":6829," Ru":6903," Sb":4209," Sa":60443," Sg":4461," Sh":7649," Si":41046," Sc":4619," Se":32084," So":13916," Sp":3220," Sl":2025," Sm":1574," Uw":3258," Va":5509," Ve":2889," Vi":4990," Vo":1509," Tu":6235," Tw":3410," Ty":9645," Uc":8498," Un":36734," Ur":4373," im":11191," in":76482," il":4354," is":17036," m ":11207," gâ":2361," gy":546263," gw":478286," ha":96076," he":162837," gi":11285," gl":41252," gr":78474," go":196462," gu":1711," hy":365311," dô":5426," ia":51916," id":44170," ic":11451," if":15524," ie":33757," hi":37580," dé":1483," ho":95011," dî":1523," hu":28386," hw":135056," nh":11270," ni":121995," ng":46712," ne":182669," na":145170," my":52136," mw":82991," mu":7930," mo":53753," ol":23717," on":100399," og":16388," oh":48015," oc":8291," od":15636," oe":180132," of":38817," ob":2855," nw":4888," ny":5794," no":39883," le":65827," li":10695," n ":431744," la":42246," gê":2462," km":13885," me":211425," mh":11846," mi":65182," ml":11455," dŵ":4098," o ":734485," ma":374621," lu":4706," lw":8217," ly":12507," ll":290079," lo":8575," ae":23414," af":26254," ag":69810," ab":5223," ac":313769," ad":158208," am":308430," an":165543," ap":9179," ai":35372," al":81881," au":70585," aw":67212," ar":754053," at":122334," as":18494," d ":1572," ba":74237," bi":7903," be":114726," bo":279521," bl":71311," by":219304," bw":77585," bu":51690," br":137949," ca":253292," e ":6846," c ":1834," er":135688," et":35799," es":9572," en":103379," em":1894," ei":422771," el":25990," ef":56847," eh":7557," eg":29718," fe":207343," fa":114986," eu":135140," ew":2978," fu":28438," fr":21022," fo":146885," fl":40338," fi":19106," ff":161699," ge":127960," câ":2638," ga":511622," i ":828084," fy":138094," fw":49249," cl":32411," cn":1591," co":109371," cr":63415," ce":72686," ch":221519," ci":5186," da":218944," cu":2628," cw":37840," cy":626409," do":40629," dl":2183," dr":194410," de":169791," dd":436087," di":205363," ec":22162," ed":21781," ea":10130," dw":38607," du":15808," dy":134445," yn":1820343," ym":302114," yw":194442," yr":378078," ys":129891," pŵ":1449," yf":2310," yc":29594," yd":132913," tî":3180," wy":119723," 
sô":8613," tâ":4127," tŷ":1782," ry":22726," rw":2339," u ":39430," sa":81007," sb":3121," se":119945," si":112421," sg":18999," so":6324," mô":5938," ra":82908," re":37316," ri":5735," rh":384202," ro":65984," pw":59223," pu":5536," pr":117209," hŷ":2144," s ":4882," py":4098," os":35170," op":3331," or":31743," gŵ":2020," r ":896554," pe":127692," pa":91740," pl":41516," po":93931," pi":2900," ph":53142," wa":70353," we":305205," rô":8100," y ":1258970," wr":72987," wo":2162," wi":14944," wn":75369," wl":23728," uw":16002," uc":29790," w ":35333," ty":27053," tw":9262," tu":42804," ur":3073," un":179183," ta":58107," sw":41490," sy":328174," st":56328," su":17891," tr":125613," tl":5636," pê":2394," to":14311," th":49752," ti":12364," te":54434," â ":170861," ôl":68336," ŵy":5585,"BC ":3029,"AC ":1466,"AM ":3083,"AQ ":4611,"Fe ":14074,"Fel":30024,"Fen":1555,"Fer":1865,"Ffo":3170,"Ffl":4215,"Ffi":3556,"Ffe":3739,"Ffr":13232,"GA ":3651,"Faw":4451,"Fan":1616,"Far":2499,"Fai":1542,"Erb":3816,"Ess":3450,"Est":1737,"Eth":2283,"Ers":2768,"Ery":2581,"Esg":1785,"Eur":1788,"Eva":3195,"Ewr":21546,"Eid":3977,"Ein":2345,"Eis":4120,"Ele":2715,"Eli":4180,"Er ":16830,"Enw":3109,"Eni":2851,"Gel":9307,"Gem":2143,"Gei":4118,"Ger":7781,"Geo":3979,"Gen":6710,"Gla":4829,"Gib":2275,"Gil":1484,"Gan":11832,"Gal":17640,"Gar":7461,"Gae":18125,"Gad":5081,"Fro":1533,"Fyn":2428,"Fra":4838,"Fre":4327,"For":3219,"Fod":28712,"II ":4458,"Hil":2449,"Hin":1512,"IG ":5986,"Hef":1863,"Hel":5017,"Hed":3459,"Heb":2210,"Hea":1530,"Hen":9801,"Her":3491,"Hae":1946,"Haf":3014,"Hal":2040,"Han":6863,"Ham":2228,"Har":10025,"Haw":1645,"Gyr":1398,"Gym":42845,"Gyn":8266,"Gyd":5561,"Gyf":6063,"Gwr":3207,"Gwy":26844,"Gwe":57637,"Gwa":19610,"Gwo":1510,"Gwn":13119,"Gwl":15264,"Gwi":2629,"Gre":5983,"Gri":4784,"Gra":8936,"Gru":3564,"Gro":6225,"Gle":2868,"Gly":8806,"Gol":4717,"Gom":2792,"Gor":25978,"HS ":1490,"Gog":9938,"Gof":6071,"Goc":1412,"Gob":6546,"Ins":2483,"Ion":7787,"Ind":4805,"Idd":1600,"Iec":7293,"Ieu":3657,"Iai":2532,"Hyw":1599,"Hyd":9879,"Hut":3728,"Huw":2414,"HuM":1938,"Hug":2444,"Hon":1988,"Hol":2628,"Hof":4468,"Arg":4337,"Arf":2821,"Arc":4451,"Ard":3004,"Ara":5312,"Arb":2881,"Arm":1401,"Arl":2436,"Ari":2551,"Ath":3701,"Ast":2350,"Asi":4771,"Art":3397,"Arw":3824,"Aws":8386,"Awd":3332,"Bal":3216,"Ban":6574,"Bac":2192,"Bae":1627,"Bar":11218,"Bat":2986,"Bas":1933,"CC ":30118,"Aca":1583,"Abe":16172,"Act":3953,"Ach":1406,"Adr":4894,"Add":8121,"Ade":2895,"Aet":1896,"Ael":29221,"Afo":11935,"Aff":4708,"Aif":1390,"Ail":6673,"Am ":2367,"Ala":1841,"Alb":20867,"Ali":1390,"Ale":3047,"Alu":7537,"Alm":6621,"All":2960,"Amg":4897,"Ame":9269,"Amc":5055,"Ama":2394,"Amw":5293,"Ang":3572,"Ana":1441,"And":6722,"Ant":2809,"Ann":4580,"Ar ":18871,"But":1453,"Bus":2726,"Bur":3568,"Bu ":12249,"Bry":11840,"Brw":1809,"Byw":5055,"Byd":42228,"Bwy":3014,"Bwl":1803,"Bwr":4800,"DA ":1963,"Cab":3582,"Cad":8785,"Cae":18704,"Cal":3378,"Cam":3915,"Caf":13882,"Cai":8913,"Cas":11929,"Car":18471,"Cat":4229,"Can":20111,"Cap":2054,"Caw":2412,"Bea":1452,"Bet":6615,"Ber":4938,"Ben":8920,"Bel":4071,"Bei":4562,"Bed":2850,"Bla":15401,"Bre":7230,"Bra":4787,"Bro":7561,"Bri":21486,"Bod":1392,"Bon":2309,"Bor":1693,"Bot":9215,"Bou":3998,"Cyd":2800,"Cys":3130,"Cyr":1535,"Cyt":4308,"Cyn":107375,"Cym":169607,"Cyl":4078,"Cyf":35541,"Cyh":6033,"Cwm":4388,"Cwp":1455,"DdG":3636,"De 
":7240,"Dey":7530,"Der":4792,"Dew":2912,"Del":1804,"Dem":6525,"Den":3029,"Dea":2964,"Dec":4054,"Ded":4019,"Def":3522,"Ddy":1483,"Ddw":1635,"Ddu":1844,"Ddi":7938,"Dda":2772,"Dde":7404,"Dan":4200,"Dar":5106,"Dat":11996,"Dav":29514,"Daw":5636,"Daf":5736,"Dae":6708,"Dai":2416,"Dal":7568,"Cho":2344,"Chr":5432,"Che":5180,"Chi":3022,"Chw":12405,"Chy":6459,"Cle":1841,"Cla":3349,"Cei":26151,"Cef":5771,"Cel":5461,"Cen":16383,"Cer":9603,"Cha":10397,"Cri":3356,"Cra":2677,"Cre":11529,"DU ":12564,"Cry":10370,"Cru":2605,"Cro":10328,"Cly":2509,"Clw":2729,"Clu":1384,"Coc":1662,"Cof":5253,"Cod":3452,"Coe":3011,"Cor":9729,"Com":7711,"Col":7269,"Con":7331,"Dyw":10945,"Dyd":1433,"Dym":6530,"Dyn":13135,"Dys":3789,"Dyf":6316,"Dyl":8043,"Dwy":3623,"Egl":4375,"Efr":2334,"Efy":3102,"Ei ":5214,"Efa":5477,"Edr":1682,"Edw":8804,"Eco":3291,"Ebr":6340,"Dis":10684,"Dir":8301,"Dio":8272,"Din":7477,"Dim":3122,"Dil":1632,"Dig":1922,"Diw":4850,"Dur":1585,"Dro":1914,"Drw":2051,"Du ":1598,"Dre":3548,"Dra":2578,"Dr ":1404,"Dos":6032,"Dol":2547,"Don":1750,"Dor":1804,"Ne ":2808,"Nat":5045,"Nid":35510,"Nic":6041,"Ngh":73349,"Ni ":20845,"Ngw":2819,"Ngo":3145,"New":11314,"Myn":6475,"Nan":3487,"Nad":1763,"Nwy":4018,"Nor":10406,"Nof":5476,"Nod":3573,"Oes":12374,"Ogw":2061,"Ohe":1667,"Owa":3576,"Owe":3922,"Oly":2697,"Oni":2962,"Ond":3507,"Os ":13940,"Orl":2808,"Pla":11557,"Phi":3702,"Per":4523,"Pet":10052,"Pen":38815,"Pe ":2342,"Pat":1391,"Pas":1419,"Par":10602,"Pau":3598,"Pan":9885,"Pam":1963,"Pal":1960,"Gŵy":1393,"Pa ":5409,"Pwy":22571,"Pug":1781,"Pro":4264,"Pri":32126,"Pre":2572,"Pry":13518,"Pob":2584,"Pol":1666,"Pon":3230,"Pos":1652,"Por":4501,"Pow":6369,"Rad":1677,"Ran":3290,"Môr":3910,"Môn":4178,"Ise":2138,"Iri":5405,"Isl":2286,"Isr":2081,"Is ":1861,"Ira":2109,"Iwe":7768,"Jac":2440,"Jap":2066,"Jan":11107,"Jam":3046,"Jen":4444,"Jos":1675,"Jon":21672,"Joh":12108,"Joc":1759,"Kar":1967,"Ken":1525,"Kir":4162,"Kin":2076,"LWa":1851,"Lew":3371,"Lep":4312,"Leo":4883,"Lei":2167,"Lea":1755,"Law":3401,"Laf":7238,"Lan":4581,"Lli":2308,"Llo":49625,"Lla":35769,"Lle":9036,"Lly":83139,"MP ":1987,"Llw":5196,"Llu":10289,"Lin":2565,"Lit":1660,"Lun":3403,"Loe":2587,"Loc":3759,"Lor":1686,"Lon":1463,"Lyw":14731,"Lyn":2021,"Mei":2923,"Meh":6831,"Men":2309,"Mel":4571,"Mes":10425,"Mer":5529,"Mew":6590,"Met":2031,"Mea":2200,"Med":8411,"Man":6320,"Mal":3191,"Mar":18396,"Mas":2050,"Mag":1798,"Mad":2418,"Mae":282353,"Mai":8315,"Mac":3022,"Mab":1620,"Mat":8080,"Maw":10696,"Moe":1836,"Mon":4407,"Mos":1676,"Mor":14383,"Mik":1997,"Mid":1494,"Mic":6978,"Mil":4139,"Min":1665,"Mhe":1924,"Mha":1542,"Mhr":3233,"Mho":2381,"Mun":2391,"Mur":2093,"Mrw":1525,"Wyd":2054,"Wyn":4508,"Wrt":6093,"Wre":3327,"Woo":1886,"Wla":1533,"Whi":1964,"Wil":18384,"Win":1533,"Wed":6925,"Wei":36912,"Wel":2589,"Wer":1546,"Wes":3389,"Wen":1975,"War":1988,"Wat":1997,"Wal":6169,"Wa ":1629,"épa":1444,"ên ":2198,"êm ":2653,"êl ":5579,"Ysb":3159,"Ysg":20301,"Yst":5213,"Yr ":86644,"Ym ":5361,"Yn ":82879,"Ymd":5797,"Yme":3234,"Ymg":1462,"Ymh":3610,"Yma":1499,"Yng":7777,"Yny":9396,"Ych":2873,"Syl":2023,"Sym":1457,"Syr":3939,"Swy":26172,"Swe":1556,"Sut":3192,"Sue":3189,"Str":4479,"Sto":4500,"Sta":4281,"Ste":11318,"Tei":2519,"Tel":1449,"Tan":1640,"Tar":1679,"Tal":5919,"Taf":2854,"Tac":5838,"Shi":1462,"She":1962,"Sha":2447,"Sim":1534,"Sir":17352,"Sin":1925,"Sia":11038,"Ser":3627,"Sgi":1542,"Sen":8777,"Sel":1981,"Sei":5257,"Sef":7830,"St 
":2475,"SoD":3626,"Sou":1862,"Son":3211,"Rws":4266,"Ryd":2876,"Ryf":1897,"Rus":1456,"Ruf":3021,"Saf":4701,"Sai":8547,"Sam":1614,"Sal":2390,"Sac":1638,"Sae":19137,"Sch":1740,"Sar":1996,"San":14834,"Sba":3834,"SI ":1912,"Rho":14367,"Rhu":7735,"Rhi":4012,"Rha":27242,"Rhe":10007,"Ric":5194,"Rhy":21050,"Rhw":1595,"Rei":1456,"Roe":22967,"Rog":2273,"Rob":6038,"Ros":2400,"Val":3027,"Und":6300,"Une":8975,"Uni":1941,"Uno":6597,"Un ":11013,"Urd":3449,"Uwc":3250,"Twr":1583,"Tyn":1779,"Tyw":3140,"Uch":8478,"Ter":1904,"Tha":2583,"The":14207,"Tho":9841,"Thr":2820,"Tim":1714,"Tir":2224,"Top":2729,"Tor":4968,"Tom":2781,"Ton":1704,"Tou":1744,"Try":3291,"Tro":2928,"Tri":2467,"Tre":17818,"Tra":7618,"Tsi":2930,"Tud":2739,"bl ":108610,"biw":4303,"bis":1554,"bio":12210,"bil":7023,"bin":7207,"bo ":4437,"bly":64984,"blw":6354,"blo":22974,"ble":40629,"bli":7754,"blh":3186,"bla":43862,"bod":220071,"bob":76863,"bol":13954,"boe":2630,"br ":20666,"bon":13209,"bor":7226,"bot":2899,"bos":12396,"bou":1567,"bbo":2130,"be ":1980,"ban":30725,"bal":2700,"bai":11039,"bae":7107,"bac":11983,"baw":4752,"bau":11598,"bat":3598,"bas":7334,"bar":67699,"bi ":5133,"bei":27322,"beg":1762,"bed":17621,"bec":2601,"ber":53727,"ben":77191,"bel":21273,"bet":33514,"áu ":7630,"bia":9858,"byw":30592,"ca ":12587,"car":23845,"cas":8765,"cat":2777,"cau":3853,"can":83151,"cap":2655,"caw":3764,"cac":2094,"cae":84897,"cad":27764,"cam":16483,"cal":6269,"caf":14867,"cai":16996,"ce ":10178,"bri":40394,"bro":46496,"bra":12765,"bre":20136,"bry":38825,"bu ":33518,"brw":4007,"bur":5048,"bun":1597,"bum":2930,"bud":16965,"bus":15423,"by ":3295,"bwe":2992,"bwl":4609,"bwm":1750,"bwn":2420,"bwr":17316,"bwy":85191,"byd":138264,"byc":8029,"byn":105121,"byg":18794,"bys":7434,"byt":14453,"byr":14486,"am ":223910,"ake":2598,"al ":118723,"ail":88553,"ain":127939,"air":41299,"ais":49555,"ait":146887,"aig":9597,"aif":61819,"aid":215030,"aic":3098,"aho":2264,"agw":13994,"aha":34404,"agl":20938,"agf":7616,"agi":1866,"agr":2216,"agu":2103,"agn":2736,"ago":36762,"aol":8695,"aod":3349,"anw":35530,"anu":8926,"any":11333,"ano":100125,"ann":124935,"anm":2621,"ant":194347,"ans":34988,"anr":40810,"ane":76121,"anf":37821,"ang":138700,"anh":9277,"ani":73492,"ank":2431,"anl":26962,"ap ":8824,"ana":93564,"anb":13876,"anc":31617,"and":73259,"amw":8200,"amm":1546,"aml":38063,"amo":8733,"amp":12021,"ams":38011,"amr":14056,"amh":7193,"ami":4294,"amg":18088,"amd":22133,"ame":8313,"amb":16368,"amc":6627,"ama":26992,"aly":1799,"alw":28958,"alu":16444,"alt":2910,"als":1463,"alo":23556,"alm":1484,"all":223132,"ali":33870,"alc":16136,"ald":4188,"ale":30849,"alf":2478,"ahâ":2916,"ala":25578,"alb":3898,"an ":601606,"aba":9347,"abe":7222,"abi":6876,"abl":7168,"abo":19005,"abw":3992,"aby":9040,"ae ":500797,"aca":2574,"ad ":471484,"âr ":2718,"ânt":3622,"ac ":289950,"ab ":10475,"afn":10073,"afo":78808,"afr":10021,"aff":39961,"afe":2749,"afi":6417,"afl":25560,"ai ":234297,"aga":3364,"agd":2687,"age":10175,"afu":15963,"afw":7114,"afy":4556,"aeo":10243,"aen":99852,"aem":1827,"ael":201346,"aes":35220,"aer":50881,"aeg":25434,"aed":23890,"ah ":2434,"afa":19107,"afb":7029,"aew":2978,"aet":558993,"ado":61480,"adr":43817,"adl":44992,"adn":22991,"adi":13872,"âu ":2581,"add":96884,"adf":11029,"ade":56818,"aea":17754,"ag ":103800,"adw":77598,"adu":25844,"aco":1839,"ack":6984,"aci":3315,"ach":128622,"ace":4207,"ada":183664,"af ":282976,"act":12156,"acs":3551,"awy":14858,"atá":5909,"atâ":1572,"âi ":1763,"ân ":12449,"ba ":4022,"âl ":3716,"at 
":82516,"arh":23505,"arg":39851,"arf":54518,"are":26707,"ard":84561,"arc":45936,"arb":37300,"ara":94572,"arp":37146,"aro":35640,"arn":81808,"arm":3073,"arl":28705,"ark":5712,"ari":88773,"aru":32539,"arw":69199,"arr":14786,"ars":4259,"art":96475,"au ":1046405,"asa":68559,"ary":11581,"asg":34259,"asi":17231,"ash":2200,"ase":12910,"aso":39583,"asn":11022,"aon":5874,"ar ":523204,"apa":4730,"ape":4263,"api":3039,"apu":8213,"as ":105000,"avi":30525,"ave":3478,"awb":10045,"awe":56474,"awd":64555,"ay ":7450,"awa":7380,"awr":88658,"aws":26002,"awu":8536,"awn":86269,"awo":7616,"awl":31176,"awf":5005,"awg":8609,"awi":6112,"atb":44447,"ata":26686,"asu":5959,"ast":31776,"ass":3705,"asy":2064,"asw":2164,"atr":12767,"ato":14863,"ate":127341,"âd ":2953,"ati":23617,"atg":39033,"ath":60146,"aw ":36271,"att":2841,"ats":1593,"atu":14332,"atw":3787,"aty":5073,"aul":6336,"aun":2939,"aur":3535,"aus":4165,"Tŷ ":1917,"itl":3296,"itr":3898,"itt":3287,"ity":2118,"iw ":27157,"ism":1479,"iso":19060,"isn":3234,"iss":2386,"ist":40808,"isw":4736,"isy":1650,"ita":5607,"ite":5770,"ith":392067,"iti":13491,"iwe":61557,"iwc":5902,"iwa":2911,"iwl":7051,"iwm":5821,"iwi":1939,"iwg":2218,"ius":6220,"ium":1735,"iva":1668,"ive":4125,"ips":2042,"ipi":4074,"ipy":2214,"is ":122562,"ion":238019,"iop":3639,"ior":2183,"iog":25807,"iol":103945,"iom":3562,"ir ":294219,"irw":8004,"irs":4748,"iro":22294,"irp":10769,"irn":10886,"irl":2635,"iri":73365,"isi":88927,"ish":14530,"isg":40104,"ise":9048,"isc":1593,"isa":5639,"iry":7836,"irf":10853,"ire":9324,"irg":2216,"ira":3882,"ird":5855,"it ":3539,"iyn":24711,"iwy":51891,"iwn":59547,"iwt":3364,"iwr":22308,"isï":6130,"kin":4185,"km ":12530,"ki ":1558,"ker":2181,"gân":1978,"ke ":5013,"gîl":3846,"ks ":2185,"gêm":2399,"ka ":1865,"ûm ":2086,"ha ":7052,"ham":23057,"han":175332,"hao":9882,"hap":3998,"hai":110980,"hal":15683,"hau":183675,"haw":26413,"har":53834,"has":47801,"hat":7168,"haf":50664,"hae":27487,"hag":43957,"had":23600,"hac":2354,"hbl":3227,"hbe":1560,"he ":24732,"hdr":5934,"hda":4116,"hde":15547,"hel":62200,"hei":34542,"heg":5488,"hef":77520,"hec":4466,"hed":62198,"hea":5713,"heb":18613,"hew":2495,"heu":8975,"het":5133,"hes":32088,"her":56901,"heo":28941,"hen":50971,"hem":4532,"hfi":1753,"hfe":1827,"hfa":10039,"hi ":46339,"hfy":2014,"hga":10718,"hgy":5492,"hgr":2128,"hig":9778,"hif":7727,"hie":4406,"hic":1408,"hia":52147,"hio":82343,"hin":19845,"hil":13596,"hiw":5818,"his":2712,"hit":4169,"hir":27145,"hn ":11023,"hla":8657,"hle":8160,"hli":7323,"hlo":9522,"hlu":4297,"hlw":5128,"hly":5405,"ho ":2690,"hma":1566,"go ":9041,"glw":22832,"glu":20270,"gly":15650,"glo":12695,"gle":62910,"gli":17036,"gla":12914,"gog":27008,"gof":47564,"goe":5103,"god":27033,"goc":1814,"gob":13416,"gno":2138,"gni":1566,"gne":1625,"glö":2290,"ghŷ":5306,"gr ":45679,"goi":3061,"goh":1450,"gom":3052,"gol":150147,"gon":23871,"gop":6979,"gos":50988,"gor":146456,"got":1926,"gu ":82293,"gro":16743,"gry":25523,"grw":30224,"gru":1887,"gra":32649,"gri":46257,"gre":21573,"gto":3324,"glŷ":8055,"gue":3937,"gwm":8230,"gwl":41207,"gwo":1468,"gwn":53470,"gwi":23351,"gwe":153211,"gwa":134710,"gwb":10043,"gur":10624,"gus":3658,"gyb":7452,"gyc":1560,"gyd":108608,"gyf":227201,"gyh":26111,"gyl":38768,"gym":62238,"gyn":132840,"gwr":40683,"gwt":1815,"gwy":112861,"grŵ":6194,"gyw":6050,"gyt":10218,"gys":38715,"gyr":27902,"iai":68448,"iam":28216,"ial":8742,"ian":170221,"ias":4394,"iar":21162,"iau":160853,"iat":11761,"iaw":38916,"ic ":6466,"iac":3494,"iad":518957,"iae":147014,"iaf":29926,"ibl":17312,"ibi":7734,"ibr":1406,"id 
":289494,"iba":1702,"ibb":2227,"ibe":6370,"ia ":45885,"ib ":1986,"iet":3765,"ieu":2995,"iel":6330,"ien":9286,"ier":3745,"ies":20092,"ied":56622,"iei":12758,"ig ":263227,"iec":40455,"ifw":1504,"ify":12476,"ifo":38860,"ifr":12857,"iff":83949,"ife":69059,"ifl":2383,"ifi":33926,"ifd":6564,"ifa":33210,"icw":1931,"icr":58727,"ict":3075,"ico":4065,"ick":10871,"icl":2507,"ici":5011,"ich":67610,"ice":5312,"ie ":6756,"ica":24075,"iby":15215,"idy":13693,"idw":13164,"ids":2799,"idr":2377,"ido":84771,"idl":9715,"idi":42134,"idg":2023,"idf":1384,"ide":21066,"idd":167625,"ida":8532,"if ":111235,"il ":51770,"im ":30833,"ige":4001,"iga":12988,"ii ":1929,"igl":1926,"igh":5434,"igi":27928,"igu":9179,"igr":12369,"igo":33990,"ign":1744,"igy":2377,"igw":25763,"iha":10220,"imo":1905,"iml":10188,"ime":4271,"imi":12632,"ip ":2132,"inc":35034,"ind":13272,"ina":47117,"inb":3553,"inn":55876,"ino":32650,"int":53913,"ins":9108,"inf":1442,"ine":31855,"ing":30270,"ini":144723,"inl":1674,"iod":49289,"ioe":9726,"inw":4435,"iny":18916,"ike":2441,"ila":30156,"ilb":1450,"in ":265607,"ilo":8695,"ilr":2211,"ill":127954,"ilm":12277,"ilg":3984,"ili":76010,"ild":6621,"ilf":16113,"ile":18463,"ima":3325,"io ":289985,"ilw":10819,"ily":31080,"ils":2239,"ilt":1658,"hs ":2295,"hpw":3958,"hr ":7968,"how":2490,"hol":119579,"hom":12153,"hon":90998,"hog":8194,"hoi":31688,"hos":29540,"hou":1629,"hop":1896,"hor":31875,"hob":14250,"hof":10190,"hoe":76914,"hod":33780,"hoc":3799,"hni":2220,"hno":22182,"hmy":2484,"hna":15241,"hne":3112,"hug":1558,"huf":6768,"hud":4015,"hua":1767,"hub":3570,"hw ":4998,"hto":1904,"hu ":49728,"hry":11207,"hrw":2709,"dîm":1511,"hro":19232,"hre":66068,"hri":21764,"ht ":2488,"hra":53668,"hyf":52057,"hyh":2218,"hyg":5523,"hyb":8387,"hyd":135856,"hyc":4670,"hyn":321330,"hym":108203,"hyl":31096,"hwr":4551,"hwy":87925,"dôn":5372,"hy ":6235,"hwa":48372,"hwc":1503,"hwe":32713,"hwi":29986,"hwn":155040,"hwm":2940,"hum":3886,"hun":26480,"hus":5581,"hur":11397,"hyt":13007,"hys":19663,"hyr":32298,"hyw":53451,"ffu":31255,"fft":26915,"ffw":14834,"ffr":49438,"fi ":22805,"ffy":37874,"ffe":74696,"ffa":45191,"ffl":2596,"ffo":87611,"ffi":52404,"fet":3688,"fes":16994,"fer":169521,"fey":13730,"few":12993,"fec":3355,"fed":42340,"fen":64928,"fel":169733,"fei":77800,"fia":40768,"fha":2006,"fbw":7025,"faw":26795,"fau":8610,"fas":4389,"fat":49299,"far":64220,"fao":2062,"fam":3661,"fan":90668,"fal":61308,"fai":47413,"fag":3038,"faf":2234,"fae":34347,"fad":4691,"fac":8237,"fab":6835,"ff ":69555,"fdd":6405,"fe ":15854,"fa ":55971,"eyr":15759,"eys":6216,"eyd":12355,"exa":1465,"ewy":65140,"ews":1768,"ewr":3606,"eta":6194,"ete":11955,"eti":8354,"eth":694907,"etl":3716,"esn":18031,"eso":14039,"est":84304,"esu":36481,"ess":5687,"esy":14685,"esw":8363,"eua":14789,"eue":2639,"eud":123634,"eul":24078,"euo":18520,"eun":5305,"eto":15157,"etr":8086,"ets":4905,"ett":8442,"etw":1998,"ety":2010,"ew ":9767,"eve":3548,"evi":2084,"euw":2490,"eut":7771,"eur":3959,"eus":7690,"ex ":4617,"ewi":73442,"ewc":12139,"ewe":1562,"ewo":1850,"ewn":148037,"ey ":13114,"ewa":1725,"epi":5381,"eph":2880,"er ":422635,"eor":6531,"eol":111949,"eon":27594,"es ":210327,"erl":12508,"eri":91394,"erg":8320,"erh":2615,"ere":19871,"erf":46260,"erc":20516,"erd":46688,"era":89973,"erb":57683,"et ":16065,"esg":10116,"esi":21447,"esb":3967,"ese":18221,"eu ":234659,"esa":40315,"ery":19213,"eru":8702,"erw":39257,"err":9221,"ert":82663,"ers":45260,"ern":17311,"erm":24808,"erp":2000,"ero":29348,"en 
":226035,"ela":23185,"eld":45521,"elf":13664,"ele":31547,"eli":27149,"elg":3855,"ell":186956,"elo":57857,"elp":12214,"elu":10714,"els":4599,"elt":6398,"ely":21849,"elw":32426,"eo ":1957,"emb":1626,"ema":27011,"eme":5766,"emo":14994,"emi":5124,"emp":1703,"emy":1397,"enf":12468,"ene":63851,"enh":14016,"eng":36123,"enb":2204,"ena":18298,"end":42838,"enc":10021,"eno":47452,"enm":1999,"enn":152654,"enl":2876,"eni":58550,"enw":78572,"enu":5709,"ens":17384,"ent":99268,"enr":8307,"eny":15492,"eoe":12895,"eod":2681,"egl":13651,"ego":34964,"egn":1555,"ege":6483,"egi":20825,"eha":8653,"egr":51439,"egu":7476,"egw":11198,"egy":3765,"eho":1591,"ehe":10265,"ek ":1464,"eib":5369,"eic":44753,"eia":10726,"eis":78703,"eir":116822,"eim":11019,"eil":72219,"ein":209835,"eih":8539,"eid":87601,"eig":15428,"eif":19864,"el ":372965,"eit":227004,"em ":45721,"gl ":8528,"gis":14755,"gir":4745,"giw":1525,"gil":17145,"gip":1439,"gin":3700,"gio":39495,"gid":2251,"gie":1537,"gig":2186,"gia":52084,"ghy":121718,"ghw":2295,"ghr":23072,"ght":4029,"gho":27460,"ghl":1424,"ghe":16945,"gha":18379,"gfy":6247,"gi ":33571,"gfa":1868,"gfe":8886,"cân":1810,"gen":111143,"get":2018,"ger":33433,"ges":6556,"gh ":5400,"ged":20081,"gei":28902,"geg":3171,"gef":17104,"gem":3427,"gel":39073,"gdd":3282,"ge ":8402,"gbi":2600,"gac":4164,"gad":26323,"gae":134740,"gaf":12507,"gai":29586,"gas":4535,"gar":39437,"gau":19803,"gat":2266,"gaw":5037,"gam":9613,"gal":81263,"gan":349053,"ga ":3455,"fyw":14077,"fyt":1464,"fys":13441,"fwy":54963,"fwr":24223,"fyr":32389,"fyl":23029,"fyn":110487,"fyg":10929,"fyd":186182,"fyf":2810,"fur":37788,"fus":5753,"fwn":11199,"fy ":34929,"fta":2433,"fti":3720,"fud":15215,"fua":3656,"ful":1490,"fun":7356,"fug":1943,"ft ":22715,"fra":51653,"frg":3232,"frd":1806,"fre":60623,"fri":60159,"fu ":21619,"fro":44672,"fru":1678,"frw":12026,"fry":26333,"for":98563,"fos":2404,"fot":2236,"fon":44680,"fol":37771,"fr ":9501,"öyn":2600,"fna":5141,"fne":3267,"fnd":3118,"fnf":2107,"fnu":4897,"fni":17053,"fno":61820,"fod":269071,"foc":1640,"fog":4206,"foe":12090,"fny":50847,"fle":57797,"fn ":23285,"fla":32104,"fli":5176,"flu":2782,"flo":12807,"fo ":8741,"fly":15244,"flw":53054,"fid":2843,"fie":8072,"fig":12205,"fil":26125,"fin":25988,"fio":35785,"fir":3345,"fis":6813,"fit":3614,"fiw":3846,"da ":110490,"dd ":1477253,"de ":26329,"dbw":2382,"dad":34580,"dal":74018,"dai":102403,"dag":8330,"dae":63510,"daf":28823,"dat":79262,"das":16177,"dar":103592,"dan":79963,"dam":6340,"daw":14222,"dau":293256,"dda":258712,"dde":137692,"ddf":57032,"ddg":1492,"ddh":7085,"ddi":485523,"ddl":8742,"ddo":292072,"dds":12690,"ddr":9994,"ddu":67078,"ddw":123638,"ddy":160807,"df ":9080,"cul":1415,"ctr":2570,"cto":27970,"cti":6692,"cte":1554,"cta":12152,"cwl":2358,"cwm":14836,"cwr":4675,"cws":1941,"cwe":12279,"cwb":3579,"cwa":1956,"cus":1837,"cym":87796,"cyl":22094,"cyh":35906,"cyf":177359,"cyt":17926,"cys":21905,"cyr":11925,"cyn":218403,"cyd":30782,"cyc":4381,"cwy":3394,"cyw":3704,"cla":5831,"cle":10053,"clu":21575,"clw":4425,"cli":4923,"clo":4478,"cly":7408,"co ":4403,"coh":1780,"cod":19541,"coe":5573,"cof":9302,"coc":2985,"con":33774,"col":15359,"com":10785,"cor":15348,"cos":8474,"cop":28345,"cot":2350,"cr ":7971,"cs ":1718,"ct ":9647,"cre":25913,"cra":16391,"cri":3869,"crh":44596,"cru":1640,"cro":18038,"crw":6875,"cry":13564,"csa":3697,"cso":2895,"csi":3457,"ch ":387720,"cer":12762,"ces":1879,"cen":12324,"cel":7425,"cef":18982,"cei":27071,"ced":2754,"ci ":2300,"cha":83305,"chd":17277,"chw":118760,"chu":29104,"chy":130786,"cia":11444,"ck 
":16479,"chg":9358,"che":62787,"chf":6666,"chl":11299,"chi":60597,"cho":54885,"chm":3020,"chn":17832,"cht":1484,"chr":49582,"ciw":1511,"cil":2269,"cig":1657,"cis":2121,"cin":1623,"cio":4546,"cip":1810,"cke":1619,"ed ":214352,"eba":13413,"ebe":6425,"ôn ":16823,"ebi":9909,"ebl":2268,"ebo":8209,"ebr":1680,"ebu":15491,"ebw":2965,"eby":18028,"eac":1537,"eag":1964,"eaf":3149,"ead":8251,"eai":10078,"ean":17065,"eal":20923,"ear":16517,"eat":6423,"eau":2544,"ôl ":76461,"eb ":101834,"ea ":3365,"efi":22738,"efn":124494,"ôr ":11345,"efo":7601,"efa":25470,"efe":21832,"eff":45413,"efy":131474,"ei ":286301,"ega":31939,"efr":6364,"eft":2356,"een":4284,"ees":1420,"eet":1564,"edl":50591,"edi":359651,"edd":749953,"ede":12372,"edf":5588,"eda":55389,"eg ":123528,"edw":24074,"edy":23468,"ônt":5425,"edu":42143,"edo":40266,"edr":25455,"ech":101594,"eci":1577,"eca":2510,"ee ":3216,"ef ":110622,"ecy":2620,"ecw":1473,"ect":38237,"ecs":5066,"ecr":2112,"eco":26895,"dyg":15248,"dyh":1417,"dyf":26975,"dyl":46528,"dym":70188,"dyn":99844,"dys":59049,"dyr":6354,"dyd":76050,"dyc":34668,"dyb":3305,"dwy":146331,"dwi":8446,"dwe":48518,"dwc":13218,"dwr":24472,"dwm":2562,"dwn":30086,"dwl":10851,"dwa":30525,"dy ":46819,"duw":2993,"dur":48569,"dus":35809,"dyw":30318,"dor":33505,"dop":5293,"don":25967,"dom":7441,"dol":259182,"dos":14598,"dr ":13114,"ds ":5982,"dna":26847,"dne":5245,"dno":10393,"doc":9952,"dod":124322,"doe":27190,"dog":102118,"dso":15419,"dw ":23914,"dun":1669,"dul":10560,"dug":2604,"dud":2961,"dri":30946,"dra":161835,"dre":56442,"drw":45270,"dry":28175,"du ":91895,"dro":110745,"dru":4316,"dgy":2105,"dha":8406,"dge":2999,"dic":3369,"did":20310,"dia":209861,"dib":9407,"der":119203,"des":10401,"det":2296,"deu":21010,"dew":20869,"dey":3681,"dfa":22781,"deb":69554,"dea":13246,"ded":35299,"dec":28845,"def":58015,"deh":3382,"deg":29115,"dei":94188,"del":21705,"den":26263,"dem":15362,"deo":23118,"dfu":3372,"dfr":9809,"dfy":4264,"dfw":12337,"di ":343588,"dfe":12376,"dfo":7225,"dle":36107,"dla":42355,"do ":65572,"dly":4536,"dlw":6934,"dlu":21986,"dlo":19459,"dli":13158,"dl ":24621,"diw":121291,"dim":26389,"din":77731,"dio":147213,"dip":2426,"dir":76164,"dis":29794,"dit":1587,"die":4163,"dif":40002,"dig":144690,"dih":1770,"dil":26660,"rgy":18304,"rhe":70719,"rha":251632,"rhi":11342,"rho":38863,"rfy":33803,"rfu":1652,"rga":18985,"ri ":69199,"rgl":5099,"rgi":2080,"rge":21537,"rgr":6072,"rgo":1666,"ret":16937,"res":54799,"reu":26897,"rew":6998,"rey":2073,"rfa":15148,"rfe":25711,"rff":46741,"rfi":14319,"rfl":1729,"rfo":41751,"rdu":1543,"rds":3609,"rdr":2188,"rdy":20437,"rdw":1665,"rg ":3366,"reb":4258,"rea":12241,"ree":4927,"ref":133959,"rec":21009,"red":130547,"rei":60987,"reg":12687,"rem":4531,"ren":43953,"rel":5532,"reo":10464,"rf ":6728,"rda":40210,"rdo":36701,"rdi":12516,"rde":16337,"rdd":152990,"re ":20437,"rby":56369,"rbw":1488,"rbr":3649,"rci":3956,"rch":112992,"rce":1610,"raw":40774,"rd ":19513,"rap":2670,"ras":11491,"rat":38876,"rau":102545,"rbo":4197,"rbe":37571,"rc ":5087,"rai":131520,"rah":5389,"rag":11266,"ran":123131,"ram":16979,"ral":34431,"rab":4126,"raf":74615,"rae":164083,"rad":66122,"rac":33820,"rpr":10686,"rs ":33187,"rpe":3712,"rpa":38052,"rr 
":1410,"ror":7927,"ros":107066,"rot":6229,"rom":4057,"ron":48088,"roo":1626,"rop":23932,"rou":4505,"rov":1701,"row":5422,"rob":23818,"roa":2113,"rod":65494,"roc":6113,"roi":31757,"rol":80831,"rof":21090,"roe":88522,"rog":12801,"rno":22930,"rns":3524,"rnw":1636,"rnu":2159,"rny":10237,"rna":22378,"rng":2426,"rne":19996,"rni":12106,"rnh":11275,"rmo":4622,"rmw":7648,"rmy":2333,"ro ":54898,"rma":11870,"rme":2383,"rmi":13641,"rly":6595,"rlu":5557,"rlo":6770,"rll":39347,"rli":4565,"rle":7637,"rla":3212,"rn ":39795,"rks":1562,"rke":1486,"rm ":6483,"riw":9942,"rl ":2183,"rip":3073,"rio":118981,"rir":9993,"rit":13309,"ris":35757,"rig":15158,"ril":9399,"rin":61972,"rim":1705,"ria":206687,"rib":3285,"ric":21565,"rid":9651,"rie":47399,"rif":193163,"rhw":45472,"rhy":80288,"rk ":5015,"rwg":3630,"rwe":24948,"rwi":1687,"rwp":11620,"rwo":7066,"rwn":8458,"rwm":4787,"rws":4580,"rwr":4007,"rwy":222313,"ryb":4844,"ryc":37546,"ryd":130644,"ryf":17151,"rug":5479,"ruf":3376,"rud":2993,"ruc":1894,"rus":10019,"rut":3548,"rwa":12724,"rwc":4604,"rwd":3531,"ry ":20234,"rsi":9220,"rso":11498,"rsa":9355,"nïa":6814,"rse":4348,"rsy":4479,"rta":14492,"rst":6242,"rsw":1913,"rtn":12219,"rto":2978,"rte":6802,"rtf":2225,"rth":239362,"rti":7087,"rw ":14755,"rts":2811,"rtr":18873,"rt ":17422,"rro":2785,"rri":17884,"rre":7941,"rra":15237,"ru ":229203,"rry":5686,"rru":2402,"sac":1631,"sae":2799,"saf":62271,"sai":25064,"sal":3060,"sam":3928,"sba":15855,"sbe":7863,"sbi":2265,"sbl":1857,"san":70637,"sau":24838,"sar":3658,"saw":42580,"sa ":3554,"ryw":36415,"rys":31710,"ryt":1688,"ryr":1752,"ryl":4848,"rym":18611,"ryn":49541,"ryg":7741,"sha":2603,"sgu":19165,"sgw":18665,"sgy":14440,"she":2309,"shi":5072,"si ":25698,"sga":6215,"sgo":72408,"sgl":24788,"sgr":43324,"sge":4670,"sgi":15631,"siw":27410,"siy":17736,"sie":15940,"sid":3544,"sic":63165,"sib":19534,"sia":76060,"sit":3740,"sir":11832,"sis":3362,"sin":5101,"sio":63000,"sil":4613,"sif":2772,"sig":47581,"sbr":3897,"sbo":3621,"sbu":1443,"sby":15923,"se ":11827,"sch":1975,"sco":2461,"sex":4204,"ser":39757,"ses":23238,"set":6293,"sfa":3579,"sh ":16149,"sff":1898,"sg ":45629,"sei":15116,"seg":4502,"sef":70983,"sed":16516,"sec":20419,"seb":2546,"sep":1434,"sen":25624,"sem":2228,"sel":10605,"sol":35386,"som":9795,"son":34627,"sor":4687,"soe":25172,"sod":44247,"sog":8851,"soc":8028,"su ":11096,"srw":1495,"sra":2879,"siŵ":5591,"st ":33109,"ss ":5302,"sla":5549,"sle":6891,"sgî":3837,"sna":6964,"sni":3185,"sne":44663,"smo":2820,"smy":1876,"so ":9507,"sme":1611,"sws":1382,"swm":11357,"swl":5700,"swy":52041,"syd":111441,"syn":20130,"syt":1505,"sys":17787,"syr":1516,"syf":1428,"sym":29115,"syl":80321,"sse":8921,"ssa":2131,"sso":1595,"ssi":2920,"ste":53847,"sta":62106,"stn":2981,"sto":42022,"sti":50489,"stl":1998,"stu":19363,"stw":7828,"str":84065,"sty":65191,"sul":1526,"sut":16540,"sur":35899,"sy ":127712,"swa":1778,"tai":36622,"tal":43400,"tae":6290,"taf":52261,"tag":1975,"tab":7263,"tac":3832,"tad":18299,"tbl":44383,"taw":7828,"tau":14344,"tat":9098,"tas":2365,"tar":17514,"tan":20001,"tam":1918,"tch":3069,"te ":8864,"ta ":18244,"pa ":37111,"pe ":8693,"par":64997,"pat":3348,"pas":12403,"pau":2226,"paw":4951,"pai":2948,"pao":5682,"pap":3898,"pam":9971,"pan":38154,"phe":11012,"pha":10199,"phw":4112,"phr":10198,"pho":14668,"phl":6358,"phi":1655,"pi ":3150,"ph 
":1476,"pea":9878,"pec":2412,"ped":9480,"pen":51739,"per":42893,"pet":16989,"pei":7527,"pel":9957,"pla":28995,"ple":12058,"plw":1815,"pia":10014,"pid":5098,"pin":2030,"pio":4575,"por":7160,"pop":3413,"pot":2376,"pos":11018,"pon":2140,"pol":16902,"pob":55043,"poe":2616,"psi":3534,"pte":5358,"pto":3399,"pra":3655,"prw":10598,"pry":26375,"pu ":10555,"pri":36941,"pre":18820,"pro":34872,"pwr":4656,"pwl":2162,"pwn":2716,"pwy":52147,"pur":7632,"pus":2183,"pum":4086,"pwe":6830,"pyn":3167,"pys":1863,"löy":2600,"hŷd":5310,"hŷn":2140,"que":2021,"môr":5916,"iŵr":5597,"ra ":25900,"ngo":70240,"ngi":8856,"ngl":24492,"ngu":6735,"ngw":13906,"ngr":4027,"ngt":3458,"ngs":1836,"ni ":162627,"nfy":3382,"nge":66569,"ngh":139492,"nga":20430,"ngd":2154,"nho":3123,"nhy":17811,"nhw":13074,"nhr":2194,"nha":39183,"ngy":19113,"nhi":10953,"nhe":12166,"neg":58741,"nei":16297,"nel":23644,"nen":2828,"ner":32050,"net":12681,"nes":89293,"neu":177328,"ndw":3769,"ndy":7766,"ng ":205524,"nea":1708,"neb":36285,"nec":1684,"ned":179856,"nef":3629,"nfi":1656,"nfo":25474,"nfr":5836,"nfu":3122,"ney":2567,"new":81562,"nfa":14793,"nff":8709,"nfe":9475,"nct":2729,"nco":3605,"nci":8047,"ncl":19065,"nce":7558,"nch":2262,"nca":5548,"oDd":3626,"ne ":35524,"nby":4127,"ndu":2109,"ndr":13355,"nds":2198,"ndo":10475,"ndl":1413,"ndi":15446,"nde":57533,"ndd":30664,"nda":28690,"ncw":4231,"nal":27769,"nam":4027,"nan":12422,"nar":15181,"nac":16010,"nad":65361,"nae":85590,"naf":35036,"nag":46791,"nai":38578,"nc ":30927,"nab":30870,"nbe":2174,"nd ":167663,"nba":12055,"nau":88603,"nat":15889,"nas":60382,"naw":19356,"ïo ":1777,"na ":107935,"mys":6814,"myr":5193,"myn":55033,"myl":2278,"mwr":2209,"mwy":106236,"mwl":1400,"mwn":18260,"myd":4653,"myf":3779,"myg":3973,"nyf":10568,"nyd":167388,"nyc":8519,"nwy":109198,"nwi":8478,"nwl":5833,"nwn":2177,"nwo":7690,"nwr":5201,"ny ":138554,"nwg":3510,"nwe":18073,"nwc":5387,"nwa":19489,"nul":75697,"nus":11035,"nud":3500,"nty":9797,"nw ":76947,"nto":6210,"ntu":3630,"ïau":14997,"nts":1727,"ntr":49425,"nti":31515,"nth":5358,"nta":50854,"ïai":3322,"nte":23599,"nsw":5855,"nso":9921,"nst":3803,"nse":9354,"nsi":15567,"nsa":11986,"nu ":76710,"nry":14046,"nrw":4687,"nro":2398,"nri":35884,"nrh":37151,"nra":5874,"nt ":333571,"ns ":20490,"noc":4035,"nod":144646,"nog":50582,"noe":7622,"nof":5561,"nol":206820,"nom":37174,"non":7970,"nos":23694,"nor":25161,"nov":1574,"nne":21867,"nna":136888,"nno":91688,"nnh":1664,"nni":142373,"nnu":52353,"nnw":99293,"nny":211841,"nma":2508,"nmo":2415,"nll":57130,"nn ":8357,"nle":2650,"nly":39314,"ndŵ":1512,"no ":90645,"nig":123645,"nif":44965,"nie":2798,"nid":127033,"nic":4230,"nib":9026,"nia":189506,"nk ":2019,"niw":17294,"niu":1595,"nis":8567,"nit":5323,"nir":11106,"nio":79645,"nin":15456,"nil":16129,"ogr":5332,"ogw":9691,"ogi":49377,"ogl":36135,"ogo":10422,"oga":51638,"oge":17943,"ogf":7295,"ofy":23009,"oi ":70058,"ohi":1622,"oho":21669,"ohn":11771,"ohe":28958,"ogy":17848,"ois":1617,"oir":2176,"ok ":1582,"ol ":763917,"oce":3144,"och":46521,"oci":2367,"ock":4010,"oco":2527,"ocr":11686,"ocs":2087,"oe ":6131,"oca":3987,"ode":12839,"odf":2129,"odl":8794,"odi":83255,"odo":79662,"odr":97425,"ocy":1405,"of ":21772,"odd":343166,"oda":94030,"oel":5335,"oeg":51158,"oer":2610,"oes":86294,"oet":10844,"oen":4783,"ody":7861,"odu":5553,"odw":25659,"oed":409580,"og ":129097,"ofn":11560,"ofi":27518,"ofr":8483,"ofo":2473,"off":27270,"ofe":10926,"ofa":28384,"ob ":52175,"îl ":4003,"oc ":4927,"îm ":5804,"oad":3115,"oba":3465,"od 
":547384,"obr":5708,"obl":125783,"obi":2247,"obe":20665,"nyn":23338,"nym":18653,"nyl":5128,"nyr":7724,"nyt":1430,"nys":17788,"nyw":9139,"oyd":3974,"owy":9062,"owl":1588,"own":7478,"oyw":3671,"oyn":3623,"otw":9445,"ow ":4107,"oti":2331,"oth":5403,"ote":7444,"ott":3778,"oto":4245,"osy":2038,"ost":30889,"osw":1982,"ota":4367,"osi":48645,"ose":24991,"osg":13017,"osf":1818,"oss":2774,"oso":27758,"owe":3109,"ovi":2723,"orï":3839,"ove":3431,"oug":2750,"oul":1799,"oun":4204,"ous":4391,"our":8928,"out":3139,"opo":2593,"opi":3247,"ope":16444,"opa":35609,"os ":142644,"opt":5362,"ops":1860,"ool":1739,"ook":2746,"ood":5509,"or ":143087,"ork":1606,"orl":27515,"orm":20652,"orn":6094,"oro":25173,"orr":10396,"orc":13306,"ord":84993,"ore":21995,"orf":42746,"org":17624,"ori":60315,"ou ":2179,"osa":9919,"osb":22160,"ort":48554,"ors":13048,"oru":3461,"orw":15896,"ory":7256,"ot ":6872,"orb":3575,"ora":32977,"olb":14257,"ola":56819,"old":19385,"olc":15348,"on ":432682,"oli":132480,"oll":47544,"olk":2728,"olf":14473,"ole":35161,"olh":1780,"olg":1440,"olr":3706,"oln":1388,"olo":11175,"oly":45871,"olu":1783,"olw":23143,"om ":33744,"oke":1397,"ona":27650,"ond":105123,"onc":2117,"onf":10374,"one":36593,"ong":18051,"oni":37789,"onl":18014,"onn":27921,"ono":53618,"onr":3505,"ons":9110,"ont":18300,"onw":6728,"ony":18054,"oma":27472,"ome":15722,"omb":2548,"omi":28943,"omm":1820,"omp":1938,"omo":2196,"op ":17416,"la ":30161,"le ":102209,"lbw":12810,"lch":90844,"lco":2475,"lf ":2256,"ldd":2333,"lde":19232,"ldi":15321,"ldr":3031,"lab":2112,"lac":29323,"lad":98353,"laf":41321,"lae":128735,"lag":1692,"lai":117947,"lal":3217,"lan":125158,"lam":3538,"lar":5003,"lat":3638,"las":18341,"law":107485,"lau":29230,"lay":1468,"lba":26635,"ld ":37477,"lbe":1978,"lbo":2022,"ls ":7845,"lpu":10684,"lol":3460,"lon":31796,"lom":2651,"lor":7254,"lod":80426,"loc":6030,"lof":4907,"loe":72615,"log":46325,"loi":2389,"los":4346,"lot":6141,"low":2604,"loy":9925,"lmi":2989,"lma":7554,"lti":32430,"lto":2424,"ltw":2317,"ltu":11392,"lw ":31473,"lud":23429,"lsh":1772,"lso":2605,"lst":1753,"lta":4687,"lte":3257,"lu ":96206,"lrw":4768,"lt ":18470,"lhe":1975,"lha":4721,"lgy":4529,"lgo":31713,"lge":2937,"li ":53647,"lga":4421,"lfy":5165,"lfr":2302,"hân":4040,"lfo":3307,"lff":13753,"lfe":12308,"lfa":43443,"ley":7868,"lew":34228,"lex":1550,"leu":25002,"les":19828,"let":14970,"ler":17336,"leo":79925,"lem":37265,"len":73940,"lei":80945,"leg":28145,"lef":21974,"led":124640,"lec":11257,"lea":4274,"lls":6571,"llu":91103,"llt":63202,"llw":65886,"lly":80846,"lo ":18642,"lla":195874,"llb":4606,"lle":173411,"llf":21229,"llg":34044,"lli":226485,"llo":30437,"lm ":9681,"ll ":214894,"lit":13912,"lis":26649,"lir":45447,"lip":2581,"lio":84412,"lin":37120,"liw":34714,"lic":2671,"lid":52663,"lia":236926,"lk ":2487,"lig":7246,"lie":8920,"lif":13223,"dŵr":6394,"ma ":40909,"mac":2067,"mab":4216,"mai":55413,"mad":6604,"mae":247275,"mag":4598,"map":1407,"mar":23042,"mas":14537,"mal":5198,"mam":2142,"man":35032,"maw":18773,"mau":39313,"mat":69909,"mba":1504,"mbl":2148,"mbi":3707,"mbe":5450,"mbr":12581,"mbo":3263,"me ":3994,"mca":11618,"mbw":1444,"mch":11714,"mda":8727,"mde":41204,"mdd":39775,"mdr":17496,"mdo":1963,"med":29820,"meg":19523,"met":12974,"mew":128769,"mes":22759,"mer":46667,"mel":4668,"men":17413,"mei":8560,"mff":2556,"mey":3711,"luo":19901,"lun":80882,"lum":1634,"lus":14270,"lur":8303,"ly 
":53421,"lwa":19342,"lwb":3155,"lwc":19181,"lwe":30764,"lwg":19758,"lwi":8918,"lwr":11928,"lwn":21982,"lwm":3879,"lwy":170858,"lya":1979,"lyb":2209,"lyc":2140,"lyd":25341,"lyf":17803,"lyg":99540,"lyw":104640,"lym":14951,"lyn":124126,"lys":19250,"lyt":6538,"lyr":2418,"mpi":2296,"mpe":1411,"mpl":2550,"mpw":3470,"mps":1604,"mpt":2590,"ms ":10491,"moc":10530,"moe":5528,"mod":37018,"mon":12910,"mol":5204,"mor":51987,"mos":7848,"mot":1641,"mpa":10579,"mre":12280,"mro":3439,"mnï":5503,"mrw":13794,"mru":208863,"mry":37758,"mu ":8242,"mse":28496,"mra":25843,"mud":19804,"mwe":10675,"mwd":1948,"mwa":5221,"my ":1506,"mur":1395,"mus":2197,"mun":73350,"mho":8276,"mhr":2830,"mhl":11326,"mhw":5307,"mha":11193,"mhe":24401,"mgo":1890,"mgy":43295,"mgu":2786,"mga":3590,"mi ":29234,"mge":3739,"ml ":13005,"min":8059,"mio":5881,"mil":37316,"mis":33867,"miw":5015,"mit":2252,"mia":20012,"mig":2194,"mo ":4753,"mlw":14332,"mly":19237,"mlu":1776,"mlo":5063,"mli":3947,"mle":2810,"mla":30789,"mni":9880,"mp ":4419,"rŵp":7025,"yw ":270496,"yty":6748,"ytu":28907,"ytr":9374,"yti":2114,"yth":64543,"yta":7981,"ysw":7360,"ysy":36277,"yst":133768,"ysu":6119,"yso":21693,"ysl":6336,"ysm":1721,"ysg":129385,"ysf":2863,"ysi":59834,"ysb":16200,"yse":10011,"ysa":7689,"yri":51337,"yro":2402,"yrn":17748,"yrt":1495,"yrs":5675,"yrr":16707,"yrw":12691,"yrc":40688,"yrd":23223,"yra":30777,"yrh":2696,"yre":5663,"yrf":11768,"ys ":130128,"yr ":522252,"yon":1614,"yny":65358,"ynu":90277,"ynw":17551,"ywe":47689,"ywb":7853,"ywa":7712,"ywi":32582,"ywf":4430,"ywg":2254,"ywu":4318,"yws":4181,"ywo":97481,"ywy":47791,"yby":3450,"yca":1577,"ych":135201,"ybi":5778,"ybl":10563,"ybo":33530,"ybr":6887,"ybu":6366,"ybw":3785,"yf ":94965,"yda":108440,"ydb":2445,"yde":30766,"ydf":2528,"ydd":1031264,"ydi":30452,"ydg":1873,"yg ":23655,"yed":2287,"ydo":18475,"ydr":24295,"ydl":41447,"ydn":15963,"ydw":15590,"ydy":142006,"yfa":79734,"yfl":101205,"yfi":9333,"yfh":1890,"yfe":122684,"yff":49868,"ya ":2403,"yb ":1460,"yaf":39177,"yd ":448856,"yau":2199,"ym ":139299,"yn ":2046385,"yla":24866,"ylc":58295,"yld":3028,"yle":19438,"ylf":11493,"yli":25804,"yll":146688,"ylo":4658,"ylw":47190,"ylu":4087,"ymc":10975,"ymb":5107,"yma":78324,"yo ":9913,"ymh":40871,"ymi":12166,"ymf":2997,"ymg":31535,"ymd":80346,"yme":33885,"ymr":286136,"ymp":5265,"ymo":45199,"yml":28649,"ymy":21331,"ymu":93545,"ymw":38802,"yna":64317,"ynd":48575,"ync":3108,"yni":92899,"ynl":49958,"yne":71151,"ynf":3851,"yng":251524,"ynh":49363,"ynr":15963,"yns":1653,"ynt":143293,"ynn":358379,"yno":84035,"yfw":5609,"yfu":9902,"yfy":23708,"yfo":25624,"yfn":30446,"yfr":128185,"ygi":35049,"ygl":7602,"yga":7036,"ygb":2588,"ygw":4747,"ygy":7099,"ygo":16536,"ygr":9832,"ygu":53946,"yhe":1472,"yhy":1494,"yhu":1841,"yho":67043,"yie":1950,"yl ":17899,"tîm":3174,"tân":2674,"tâd":2075,"tâl":1439,"táu":5879,"wun":4321,"wy ":167584,"wso":5954,"wst":14295,"wsb":1703,"wsa":2381,"sïa":6640,"wu ":8197,"wsi":4835,"wse":1824,"wto":2882,"wti":2767,"wth":3593,"wyt":46841,"wys":164434,"wyr":156167,"wyw":2663,"wyl":86951,"wym":16265,"wyn":179235,"wyo":11729,"wyd":589155,"wye":2580,"wyb":42400,"wyc":3206,"wyi":2503,"wyf":88674,"wyg":7233,"wya":43059,"sôn":8723,"wlw":2007,"wmn":13780,"wmp":7509,"wmw":2030,"wp ":5257,"wna":35677,"wnf":1977,"wne":111452,"wnd":6615,"wnc":4706,"wni":16860,"wng":51730,"wnn":39302,"wno":2540,"wns":4025,"wnt":5833,"wm ":45639,"wlc":4133,"wla":51387,"wn ":404585,"wll":7707,"wli":8016,"wle":39435,"wlf":1463,"ws 
":27784,"wre":14290,"wrc":1643,"wrd":17997,"wri":42683,"wra":16401,"wrt":86482,"wrs":5673,"wry":3769,"wrw":3993,"wrn":10068,"wrp":4016,"wro":34924,"wod":95909,"wog":10667,"wob":4429,"woc":1528,"wnw":1599,"wor":2513,"wol":6338,"woo":2607,"won":6564,"wpa":2381,"wr ":124568,"wpi":6408,"wf ":5994,"wda":1936,"wdd":15429,"wcl":1588,"wch":84468,"we ":12027,"wbl":13799,"wfe":1476,"wfa":5328,"wes":19504,"wer":103660,"weu":29216,"wet":19199,"wen":21755,"wel":124321,"wei":166516,"wef":12569,"weg":1420,"wed":374291,"ŵyr":1598,"web":1716,"wec":2901,"wg ":25861,"ŵyl":1815,"wdw":2933,"wdu":37364,"wdr":2488,"wgr":12193,"wga":2076,"wi ":11108,"wir":53666,"wis":18022,"wit":3330,"wl ":41621,"wig":3412,"wie":1606,"wid":29273,"wic":2145,"wio":14312,"win":33765,"wil":30098,"wia":16107,"ŵr ":14597,"wb ":13655,"wa ":3359,"wan":26467,"ŵy ":4246,"wal":8910,"way":3018,"wau":4489,"war":68728,"was":82101,"wd ":12936,"wbe":7471,"wag":2121,"waf":7690,"wai":97239,"wah":33598,"wae":16922,"wad":34580,"rïa":3944,"ŵp ":6952,"rôl":8177,"vil":1778,"vin":1893,"vid":14072,"vie":15891,"vis":1891,"ŵer":1950,"ŷd ":5480,"ver":6993,"ven":3325,"vel":1505,"ve ":5223,"van":5418,"va ":2119,"uwy":3416,"uwc":17112,"uwi":1989,"usi":1968,"usg":1882,"use":4507,"usa":1473,"ust":11010,"uss":2899,"uso":3761,"usn":21216,"uth":9701,"ute":1704,"uw ":4094,"utt":4922,"utu":5650,"us ":87295,"ut ":21567,"ura":19210,"urd":37731,"urf":19995,"ure":3962,"urh":2086,"urg":2936,"uri":15646,"urn":7821,"uro":14115,"urr":2119,"urs":2043,"urt":1924,"urw":5461,"ury":3159,"uny":3961,"uog":7021,"uoe":8806,"uod":5536,"uon":6985,"uol":7386,"uos":4002,"ur ":71669,"ump":2645,"umb":3249,"ume":1431,"uo ":2462,"unt":3561,"unr":30860,"unw":12131,"unu":3801,"uni":81801,"uno":31125,"unn":2027,"und":29684,"una":20771,"ung":1445,"une":61156,"unf":2341,"um ":12570,"ulu":19906,"ull":87609,"uli":6287,"ula":3301,"un ":150712,"uis":1497,"ul ":9445,"ugh":7082,"ugi":2003,"uge":2130,"ugo":2448,"ugl":1647,"uga":3606,"uda":2384,"udd":41306,"ude":18823,"udi":17508,"ue ":5180,"uch":33289,"uck":1600,"uff":4314,"ufe":7387,"ufa":2438,"udu":1460,"udo":9949,"ug ":5003,"udw":2918,"ued":6122,"uen":4053,"ub ":3442,"ua ":27986,"uas":1433,"uar":1702,"uan":9132,"ubl":1462,"uba":1626,"ud ":136452,"lŷn":8979,"uag":5660,"uad":3772,"uae":5221,"tyw":5522,"tyf":2870,"tyl":1639,"tym":6439,"tyn":26295,"tyr":48030,"tys":4278,"ty ":18594,"tur":13689,"tus":3246,"tun":39289,"tuo":2638,"tua":31254,"tud":13123,"tue":1780,"tyd":3508,"tyb":2424,"twy":7578,"tws":4311,"twr":6786,"twn":3869,"twm":10367,"twf":1674,"ts ":6903,"tre":90655,"tt ":11110,"tra":97424,"tri":34368,"tru":3859,"tro":35460,"ŷr ":1449,"tu ":22540,"trw":23329,"try":15812,"tsw":3795,"tta":1471,"tte":3482,"tti":1910,"ttl":1680,"tto":2254,"to ":24556,"tne":12625,"tno":2988,"ŷn ":11249,"pêl":3054,"tod":27653,"toc":5503,"toi":3707,"tog":3800,"tow":1605,"tom":5214,"ton":26491,"tol":2761,"tor":43947,"top":1745,"tr ":17819,"til":1672,"tif":5355,"tie":9333,"tig":9818,"tir":27978,"tis":11461,"tin":9873,"tim":2470,"tio":31979,"thy":46160,"thu":45349,"thw":31457,"tia":52311,"tic":3675,"tid":2222,"tiy":6135,"tiw":14738,"tl ":2446,"tli":3793,"tlo":5586,"tla":2391,"tle":4793,"tem":19391,"ten":7716,"tep":1734,"tei":25035,"tel":15355,"tef":7947,"teg":31285,"teb":43243,"tec":5010,"ted":11202,"tff":2312,"th ":759637,"tey":2772,"teu":11405,"tes":9706,"ter":115193,"tgo":3276,"tge":1530,"ti 
":8846,"tga":31917,"thn":24561,"tho":105119,"thl":16268,"thr":80549,"ths":2911,"thp":4012,"thf":9853,"thg":10918,"thd":10799,"the":42149,"thi":135223,"thb":4302,"tha":183648,"tgy":3272,"tŷ ":1779},"n_words":[138024683,168015853,135168311],"name":"cy"}
\ No newline at end of file
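
(For reference: the file deleted above is one of the bundled langdetect language profiles, here Welsh per its trailing `"name":"cy"`. Each profile is a single JSON line with three fields: `freq`, occurrence counts of 1- to 3-character n-grams from the training corpus; `n_words`, the total n-gram counts for each of the three orders; and `name`, the language code. A minimal sketch of reading one such profile follows; the `langdetect/profiles/cy` path is an assumption about where the package keeps these files, not something this patch shows.)

    import json

    def load_profile(path):
        # A langdetect profile is one JSON object with three fields:
        #   "freq":    counts of 1- to 3-character n-grams
        #   "n_words": total 1-, 2- and 3-gram counts in the training corpus
        #   "name":    the language code ("cy", "da", "de", ...)
        with open(path, encoding='utf-8') as f:
            return json.load(f)

    profile = load_profile('langdetect/profiles/cy')  # assumed path
    # Relative trigram frequency, e.g. for the common Welsh trigram "eth"
    # (counted as 694907 in the profile deleted above):
    p_eth = profile['freq'].get('eth', 0) / profile['n_words'][2]
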
+++ /dev/null
-{"freq":{"D":31963,"E":16857,"F":19795,"G":13674,"A":25625,"B":25733,"C":16797,"L":17275,"M":22557,"N":16361,"O":9041,"H":24577,"I":12015,"J":9674,"K":25355,"U":7743,"T":19455,"W":7598,"V":12180,"P":17849,"S":49336,"R":17256,"Y":1840,"Z":1413,"f":221888,"g":349504,"d":493190,"e":1436845,"b":166372,"c":57573,"a":587527,"n":746797,"o":488340,"l":487393,"m":293320,"j":42561,"k":286164,"h":119659,"i":613022,"w":11095,"v":176622,"u":183003,"t":621215,"s":567338,"r":850344,"q":1336,"p":139855,"z":8890,"y":86732,"x":5939,"Å":1900,"Ø":2784,"é":2737,"æ":62497,"å":62645,"ä":1464,"ü":1583,"ø":71666,"ö":1579," l":35858," m":66057," n":25041," o":98877," h":49372," i":113380," j":6194," k":49552," d":112412," e":192889," f":104584," g":23827," a":95172," b":68676," c":8562," u":24358," t":58677," w":1082," v":42657," p":52152," s":123480," r":22226," J":9634," K":25242," H":24422," I":11943," N":16289," O":8943," L":17176," M":22363," B":25561," C":16566," A":25498," F":19659," G":13552," D":31783," E":16778," Z":1391," Y":1835," S":49095," R":17196,"а":1047," P":17741," W":7540," V":12125," U":7720," T":19345," å":3941," æ":1756," ø":5351," Å":1898," Ø":2782,"A ":4032,"F ":2501,"Da":6717,"Cl":971,"Co":3707,"Ce":1101,"Ch":3540,"Do":1568,"De":17413,"Di":2020,"Fe":1146,"Fa":2183,"Eu":1618,"Er":1008,"Et":1623,"En":5373,"El":1343,"Ge":1911,"Ga":1871,"I ":3326,"Fr":4782,"Fo":3851,"Fl":1651,"Fi":2355,"C ":1471,"Au":1064,"Ar":2625,"As":2462,"Ba":5317,"Aa":1418,"Am":3646,"An":3500,"Al":3794,"By":1617,"Bu":1524,"Br":3812,"Ca":3808,"Bi":1743,"Be":3864,"Bo":3085,"Bl":1436,"Kl":1826,"Kr":1747,"Ko":8878,"Le":2792,"Li":3203,"La":5071,"Lu":1291,"Lo":2732,"Me":3324,"Mi":3899,"Ma":7557,"Mu":1568,"Mo":3310,"Ni":2023,"Ne":2682,"Na":2422,"No":4840,"Gi":973,"Gr":3579,"Go":1190,"Gu":1554,"Ha":6824,"He":6629,"Hi":1514,"Ho":4306,"Hu":1589,"In":3339,"Is":1195,"Ja":2348,"L ":1217,"Je":1924,"Jo":2770,"Ju":1199,"Ka":3252,"M ":1238,"Ki":2897,"Ke":1081,"Un":2388,"Ty":1556,"Tr":2242,"To":2011,"Th":4228,"Ti":1419,"Te":2146,"Ta":2888,"Sy":2480,"St":10943,"Sv":1640,"Su":1509,"Wo":997,"Wi":1997,"Wa":1860,"We":1192,"Vo":988,"Vi":3450,"Va":1711,"Ve":4334,"Pr":4633,"S ":1850,"Pe":2483,"Pa":3757,"Po":2392,"Pi":1096,"Or":1531,"Kø":2561,"Se":2517,"Sc":1974,"Si":2298,"Sh":1188,"Sl":1960,"Sk":3103,"Sp":1977,"So":7973,"Ru":1537,"Sa":4145,"Re":4467,"Ri":2574,"Ro":3970,"Ra":2483,"b ":7025,"a ":44327,"Yo":1017,"Sø":1957,"bø":1832,"i ":102076,"fy":1735,"gd":2544,"ge":89823,"ga":13601,"gb":1002,"bæ":1473,"fj":1452,"fl":7833,"ff":4196,"fi":15203,"bå":1373,"fh":1152,"fs":1525,"fr":23015,"fu":3863,"ft":15592,"fo":54992,"j ":3361,"gy":1588,"he":22182,"ha":30491,"gn":17858,"gl":6422,"gi":15986,"gh":4618,"gg":16392,"gu":4973,"gt":15761,"gs":16863,"gr":16831,"go":3646,"dt":26827,"du":5081,"dv":6155,"dy":2631,"g ":111365,"ea":8665,"eb":9181,"ec":4319,"ed":75055,"de":218146,"dd":6228,"dg":5250,"df":2420,"di":20969,"dh":1605,"dk":4450,"dj":1187,"dm":1943,"dl":12465,"do":7994,"dn":3058,"ds":29024,"dr":12885,"ew":2085,"ex":1370,"eu":3377,"ev":22509,"ey":2555,"ez":1521,"fa":10884,"h ":6139,"fe":8693,"eh":3909,"eg":21663,"ef":12940,"ee":3853,"el":91687,"ek":16142,"ej":9330,"ei":7265,"ep":6194,"eo":4491,"en":269379,"em":29799,"et":141301,"es":67128,"er":363438,"ca":5613,"e ":256545,"by":15888,"bs":2188,"br":14152,"bu":7411,"bo":14081,"bj":2063,"bl":20954,"bi":8379,"bb":2090,"be":49808,"db":9039,"da":25268,"f ":48816,"cu":1080,"ct":2015,"co":4730,"ck":5340,"ci":7534,"ch":11084,"ce":13957,"c ":1829,"az":1010,"ay":3869,"ba":15017,"d 
":83491,"at":56481,"as":18978,"ar":71549,"av":20381,"au":6593,"ak":8835,"al":58276,"ai":4230,"aj":1819,"ap":5563,"am":30979,"an":131829,"ac":6807,"ad":19629,"aa":1902,"ab":11909,"ag":21569,"ah":1853,"ae":3053,"af":53343,"nu":5799,"nt":31384,"ns":60694,"nr":1599,"no":15192,"nn":14628,"jø":1567,"ny":4222,"nv":2608,"oe":2046,"of":9995,"oc":5289,"od":18682,"oa":1343,"ob":5028,"om":68251,"on":52834,"ok":6768,"ol":36629,"oi":1528,"kæ":2283,"og":81449,"oh":1849,"ot":9072,"os":11517,"ov":19468,"ou":6113,"op":19430,"oo":2822,"or":109199,"kø":2883,"r ":303465,"ow":2461,"pe":19212,"pf":1434,"pa":11174,"pl":7694,"po":10483,"ph":1997,"pi":13460,"lå":3180,"læ":7117,"lo":18381,"lm":9161,"ll":54007,"ls":28805,"lr":2728,"lp":1954,"lv":5002,"lu":8894,"lt":13794,"ly":5517,"hø":4198,"o ":13391,"ma":28771,"mb":6952,"mh":1531,"me":79970,"mf":2516,"mk":2192,"ml":2687,"mi":18259,"mn":1374,"mm":27337,"mp":7909,"mo":10889,"mr":3179,"mt":7052,"ms":5339,"mu":15391,"my":1487,"p ":7331,"na":26783,"nb":2591,"nc":5256,"nd":115468,"ne":83213,"nf":2882,"ng":65185,"nh":5629,"ni":34813,"jæ":2156,"nj":1002,"nk":9744,"nl":3415,"nm":3852,"jy":1603,"ju":2882,"js":1575,"jo":3376,"jl":1331,"fæ":2974,"ki":13129,"gå":3551,"ke":62501,"kb":1624,"ka":33369,"m ":58614,"fø":16135,"ky":2485,"ks":11787,"kt":15601,"ku":9611,"kv":2686,"ko":24231,"kr":19731,"kk":11850,"kl":9428,"km":2041,"kn":3763,"li":70659,"hå":1642,"lh":1724,"lk":6427,"hæ":2028,"le":98770,"ld":24706,"lg":4872,"lf":2530,"la":47170,"lb":6128,"gø":1910,"n ":242497,"hr":2287,"hv":9222,"ht":1466,"hu":6980,"dæ":994,"hj":3040,"hi":7688,"hn":1182,"ho":15697,"id":25624,"ic":10962,"ib":6095,"ia":12086,"ig":59928,"if":7649,"ie":27731,"hy":1292,"k ":53472,"dø":3867,"ir":14180,"is":62018,"it":24973,"iu":2399,"iv":18874,"ik":30570,"il":63431,"im":7762,"in":99762,"io":27519,"ip":4261,"je":15216,"jd":2875,"få":1004,"iz":977,"l ":57442,"ja":3097,"tæ":3691,"tå":3749,"z ":1958,"sø":4093,"wi":999,"så":7469,"sæ":4934,"y ":17470,"rø":5891,"wa":2989,"we":1786,"ræ":12201,"rå":5052,"vi":25451,"vs":4402,"vn":10855,"vo":8670,"ve":64168,"vd":1809,"va":25040,"x ":2713,"ui":2354,"uk":3985,"ul":11807,"ue":7203,"uf":1429,"ug":8656,"ur":17879,"us":20761,"ut":8713,"um":12141,"un":39154,"up":6927,"ty":12222,"tz":1053,"tu":11275,"tt":18340,"tv":2718,"ub":5076,"ua":3814,"ud":21845,"uc":3424,"w ":2415,"to":25039,"tn":4704,"tm":1160,"tl":7804,"ts":12139,"tr":31125,"tf":1908,"te":116420,"tj":2329,"pæ":1318,"på":21476,"ti":89903,"th":7738,"v ":18910,"tb":1612,"ta":40351,"su":4879,"sv":6120,"ss":17007,"st":117117,"sy":9417,"sl":10786,"sk":90052,"sn":3225,"sm":6568,"sp":18285,"so":39577,"sr":1826,"sd":1411,"sc":3973,"sf":3067,"se":58416,"sh":5513,"sg":1978,"sj":1417,"si":34046,"u ":4030,"sa":20739,"sb":7435,"rr":10396,"rs":33077,"rt":26331,"ru":25205,"rv":5600,"ry":5622,"rp":2368,"ro":34074,"rn":23969,"rm":13546,"rl":8366,"rk":22061,"næ":3458,"nå":1618,"ri":66660,"rh":5594,"rg":18288,"rf":5902,"re":122058,"rd":25935,"rc":2308,"rb":9120,"ra":54038,"t ":215608,"mø":1338,"qu":972,"mæ":2988,"må":5133,"lø":4835,"s ":87017,"pt":4539,"pu":4408,"pp":6074,"pr":20775,"ps":3450,"zi":1962,"ze":1041,"za":1046,"væ":10057,"yg":7090,"ye":6589,"yd":11050,"ya":1357,"yb":1013,"tø":6010,"yt":3483,"ys":11680,"yr":5643,"yp":2457,"yn":4794,"ym":2136,"yl":4209,"yk":3734,"År":1234,"å 
":31271,"Øs":1854,"æv":1666,"æs":6691,"ær":17270,"æt":3678,"æn":8602,"æk":5318,"æl":6220,"æg":5488,"æd":2878,"ån":3755,"ås":1047,"år":11909,"åe":2110,"ål":3106,"åb":1474,"åd":6195,"ør":20663,"øs":5971,"øv":2281,"øj":4875,"øk":983,"øn":5238,"øl":3077,"øm":1629,"øb":5904,"øg":2353,"øe":2140,"ød":12599,"ø ":2608," Ga":1860," Ge":1892," I ":2530," Fo":3830," Fr":4772," Fi":2339," Fl":1645," Ha":6811," He":6623," Go":1179," Gr":3557," Gu":1534," Hu":1582," Ho":4265," Hi":1510," Je":1919," Ja":2336," Is":1181," In":3319," Ka":3227," Ke":1072," Ki":2888," Jo":2762," Ju":1199," La":5040," Le":2776," Li":3188," Kl":1820," Ko":8866," Kr":1743," Ma":7451," Mi":3885," Me":3307," Lo":2724," Lu":1285," Ne":2661," Na":2409," Ni":2020," Mo":3297," Mu":1554," Am":3639," An":3489," Al":3774," Aa":1413," Ba":5300," Au":1061," As":1298," Ar":2597," Be":3849," Bi":1733," Bl":1427," Bo":3066," Br":3787," Bu":1519," By":1610," Ca":3749," Ce":1081," Ch":3522," Co":3667," Da":6687," Di":2014," De":17357," Do":1505," El":1337," Et":1617," Er":1002," En":5365," Eu":1616," Fe":1132," Fa":2149," Sø":1956," Wo":983," Wi":1981," We":1181," Wa":1857," Yo":1015," Kø":2558," Or":1513," Po":2365," Pi":1093," Pe":2468," Pa":3735," No":4834," Ra":2473," Ro":3958," Re":4459," Ri":2570," Pr":4618," Sy":2476," Sv":1637," Su":1503," St":10885," Ta":2881," Th":4217," Ti":1411," Te":2130," Tr":2230," To":1975," Ru":1535," Sa":4126," Sh":1168," Si":2284," Sc":1958," Se":2497," So":7960," Sp":1954," Sk":3100," Sl":1954," Va":1707," Ve":4322," Vi":3432," Vo":984," Ty":1554," Un":2381," ja":1664," få":1001," in":18970," ik":2700," is":1666," ka":10401," fø":12841," ki":4060," ke":5007," jo":1235," fæ":1545," ju":2042," ha":16862," he":5872," gi":1512," gr":8357," gu":1731," dø":1180," hi":2424," hj":2649," ho":5868," hu":1995," hv":8333," ne":2244," na":5975," mu":3617," mo":5717," om":10735," og":60338," of":4764," ny":1418," nu":1920," no":8819," le":3614," li":12024," hå":1306," la":9917," kv":1456," ku":4034," km":1736," kl":2396," kr":4395," ko":12549," me":34653," mi":5228," hø":2581," ma":10653," lu":1103," ly":1210," lo":1691," af":47341," ad":2058," am":5476," an":9937," ap":1120," ak":1320," al":5943," au":1423," ar":5256," at":12715," ba":5812," bi":2763," be":21652," bo":2959," bl":16483," by":7166," bu":1513," br":6861," ca":2244," er":79580," et":23500," en":65390," el":13704," ek":1980," ef":3944," eg":1315," fe":2371," fa":4879," fu":1922," fr":19782," fo":43206," fl":5239," fi":8516," bå":1058," ge":4611," ga":3502," bø":1211," i ":86016," fy":1194," co":1546," ce":1786," ci":1062," da":14483," do":1734," dr":1947," de":85238," di":4041," dy":1311," væ":5438," tæ":998," sø":1940," ru":2426," sa":10706," se":8834," si":12012," sm":1227," sl":3070," sk":10508," sp":8070," so":28866," ra":1828," re":10118," nå":1108," ri":1551," næ":2233," ro":3386," pr":10950," lø":1660," s ":1355," må":3141," ov":4934," op":11111," or":3538," kø":1518," pe":4046," pa":3870," pl":3433," po":4785," lå":2186," pi":1184," læ":1975," så":2369," sæ":2010," va":14642," ve":13203," vo":1548," vi":7015," ræ":1334," ud":14434," ty":5331," tv":1505," un":6550," ta":3468," sy":6433," st":20417," sv":2066," su":1555," tr":6706," to":3836," th":1962," på":21281," ti":28004," te":4720," År":1234," Øs":1852," år":3117," øs":2023,"Årh":1078,"Fil":1000,"Et ":1446,"Eur":1467,"En ":3727,"Fra":1859,"Fre":1891,"For":2210,"Hel":1134,"Her":3429,"Han":2509,"Ind":1212,"Øst":1835,"Hol":1719,"As 
":1183,"Bay":1295,"Amt":2376,"And":1120,"Bye":1086,"Car":1184,"Ber":1053,"EF ":1275,"De ":1771,"Det":5562,"Den":6802,"Dan":4528,"Chr":1315,"Cha":1198,"New":1218,"Nor":4041,"Køb":2308,"Par":1507,"Pro":2885,"SA ":2039,"SAs":1153,"Joh":1172,"Kir":1207,"Kon":1132,"Kom":6411,"Lan":2730,"Man":1218,"Mar":2635,"Søn":999,"Syd":1794,"Sve":1310,"Str":999,"Sti":2399,"Sto":1709,"Sta":3406,"Ste":1353,"Tab":1274,"Sog":4806,"Sch":1301,"San":1256,"Reg":1718,"Ros":1109,"åen":1002,"åde":4772,"ål ":1259,"ånd":1467,"åne":2057,"åle":1126,"åre":1034,"ård":2059,"år ":6325,"Ver":993,"Ves":1255,"Uni":1434,"æde":2099,"Tys":1072,"æge":1409,"ægt":2411,"æll":2133,"ækk":2758,"æld":2168,"æns":1603,"æng":3355,"ænd":2342,"ær ":2495,"æsk":1388,"æse":1314,"ært":1291,"ærk":2357,"ære":6027,"ærd":1139,"ætt":1489,"æst":1815,"ævn":987,"The":2590,"bje":1890,"bil":1901,"bin":2468,"blo":1344,"ble":11852,"bli":3619,"bla":2817,"bol":3467,"bog":1211,"bor":5893,"bbe":1641,"be ":1587,"ban":5529,"bal":1468,"bag":1151,"bas":2113,"bar":1770,"bej":2245,"beg":2119,"bef":974,"bed":1902,"ber":10275,"ben":6123,"bel":6286,"bez":990,"bev":1288,"bes":5570,"bet":8125,"ca ":2352,"ce ":2816,"bri":2216,"bro":1760,"bra":1230,"bre":2307,"bru":5207,"bur":1375,"bun":1225,"bum":1745,"by ":6807,"bye":2309,"byg":4928,"am ":2554,"al ":9518,"ain":1470,"aj ":1010,"agt":3576,"anv":1364,"anu":1902,"ann":5158,"anm":3219,"ant":6958,"ans":23197,"ane":4038,"ang":12562,"ani":4609,"ank":4263,"ana":2737,"anc":2097,"and":39458,"amt":2475,"amm":7240,"aml":2016,"amp":2062,"ami":2635,"ame":7259,"ama":1553,"alv":1423,"alt":4977,"als":1981,"alr":2167,"alm":1522,"all":6438,"alg":1439,"ali":5360,"ald":6029,"ale":8208,"ala":1794,"alb":2973,"an ":17044,"akt":3338,"abe":4887,"ad ":3260,"ab ":2197,"aft":1820,"afs":1107,"afh":1105,"afi":1171,"age":8240,"adm":1030,"adi":3138,"ade":6627,"ag ":3450,"ads":1953,"ack":1318,"ach":1746,"ace":1610,"af ":43757,"aye":1830,"at ":17462,"are":4543,"ard":3649,"arb":2537,"ara":2794,"arm":1244,"arl":1801,"ark":7252,"ari":4342,"arv":1066,"arr":1598,"ars":2045,"art":9040,"au ":1067,"asi":1827,"ase":2004,"ask":1161,"ar ":24485,"as ":3170,"avd":1374,"avn":8812,"avi":1508,"ave":5203,"ay ":1009,"av ":1155,"ata":1987,"ast":4061,"ass":3627,"ato":1943,"ate":7241,"ati":16074,"att":4875,"ats":1271,"atu":2072,"jer":5562,"jek":1076,"jem":1846,"jen":1901,"jet":1025,"jan":1060,"je ":1645,"jde":2262,"jor":2291,"fær":971,"fæl":1337,"itu":1780,"itt":1519,"isk":22022,"ism":1473,"iss":2416,"ist":15812,"iv ":2118,"ita":3145,"ite":4554,"iti":5826,"ium":1528,"iva":1011,"ivi":1481,"ive":12562,"ipt":1296,"is ":8187,"ion":21894,"irk":6542,"isi":1236,"ise":3850,"isa":1650,"ire":2379,"it ":2641,"kil":3875,"kib":1205,"kin":2032,"kir":1742,"går":2699,"kis":983,"km ":1307,"ked":1397,"kel":4377,"ken":10401,"kes":1300,"ker":9106,"ket":3222,"ke ":28879,"kra":3109,"kre":5594,"kt ":3134,"kse":1892,"kro":1365,"kri":8221,"kov":1285,"kor":3059,"kon":5299,"kom":8214,"kol":2985,"ks ":2322,"kni":2413,"kke":10909,"klu":1988,"kle":1997,"kla":3032,"kli":1244,"jyl":1350,"jul":1124,"kba":1487,"kat":1697,"kar":1404,"kas":971,"kan":11432,"kal":6243,"kam":1128,"kab":5579,"ka ":2541,"før":6664,"fød":7830,"føl":1523,"ham":1255,"han":5336,"hal":2007,"hav":7061,"har":10554,"he ":3841,"hel":2400,"hed":6692,"her":3250,"hen":2397,"hin":1447,"his":2209,"hje":2112,"gle":3411,"gn ":5722,"gla":1189,"gni":1875,"gne":9082,"gs ":1501,"gsb":1331,"gsk":1233,"gru":5085,"gra":4027,"gt ":9775,"gre":2783,"gst":2115,"gte":3224,"gti":1404,"grø":1000,"gså":4604,"gus":1029,"græ":2818,"ial":2412,"ian":3001,"ic 
":1068,"ibo":1076,"ølg":1529,"øn ":1089,"id ":2460,"ibe":2142,"ia ":3660,"øje":1642,"iet":2644,"iel":2523,"ien":9683,"ier":4656,"ig ":12921,"ift":4749,"ør ":3471,"ici":1501,"ich":2160,"ice":1664,"ie ":4513,"ica":1606,"ids":2132,"idt":1926,"idl":3701,"idi":1064,"ide":10399,"idd":1704,"ønd":1642,"øst":4212,"il ":24604,"im ":977,"ika":7541,"ige":19623,"iga":1060,"igh":3457,"igi":1414,"igg":9753,"igt":6078,"igs":1981,"ign":1642,"øre":4495,"ørs":6046,"ørr":1963,"ørt":1508,"ik ":5837,"ørn":1192,"ime":1521,"ind":23824,"ina":3988,"int":3500,"ins":7738,"ine":6155,"ing":36509,"ini":3271,"ink":1131,"iod":1188,"ikl":2029,"ikk":5095,"ike":2959,"ikb":1260,"in ":7601,"ikt":1780,"iks":1269,"ilo":3003,"ill":15072,"ilk":1504,"øve":1134,"ilm":4941,"ilh":993,"ili":3480,"ild":2323,"ile":1492,"io ":1215,"ils":1343,"hol":6737,"hov":3711,"hri":1405,"hvo":4960,"hum":1081,"hun":1070,"hus":3072,"hve":1568,"hvi":2196,"døs":1339,"død":1050,"ffe":2245,"ffi":1018,"fes":1566,"fer":1682,"fen":1036,"fas":1001,"fat":2864,"far":1600,"fam":1418,"fan":1048,"fal":1044,"ezi":1048,"evæ":1177,"eta":3145,"ete":8817,"eti":2099,"esp":2172,"est":19859,"ødt":7615,"ess":3934,"esv":1081,"ev ":11307,"etr":1299,"ets":3278,"ett":4721,"ety":1926,"øen":1132,"ew ":1416,"eve":5776,"eva":1631,"øge":1365,"evi":1173,"øj ":1036,"ey ":1638,"er ":229484,"epa":1085,"eor":1469,"eol":1033,"ød ":1370,"es ":24995,"øbe":3709,"ept":1210,"erk":1552,"erl":2388,"eri":19046,"erg":5102,"erh":1890,"ere":36721,"erf":2743,"erd":3362,"era":4483,"erb":2464,"et ":110723,"esk":4356,"esl":1065,"esi":2775,"øde":1837,"ese":1937,"erv":2764,"eru":2635,"err":4661,"ert":4521,"ers":15033,"ern":16903,"erm":1970,"ero":1448,"egå":1006,"ekr":1629,"eks":4054,"ekt":4817,"en ":174071,"ela":1844,"eld":1116,"ele":6394,"eli":11128,"ell":22146,"elv":2251,"els":21491,"elt":4815,"emb":3551,"ema":1766,"eme":3075,"emm":3388,"emo":1300,"emi":2222,"emt":1052,"emp":1313,"ems":1366,"enf":1295,"ene":9879,"enh":4546,"eng":3572,"enb":1247,"ena":1550,"end":24965,"enc":1233,"enn":5553,"enk":1120,"eni":2823,"ens":17443,"ent":14377,"enr":1262,"egn":6321,"ege":4459,"egi":3950,"eha":1147,"egr":1976,"eho":1258,"eis":2241,"ein":2075,"ejd":2400,"el ":14709,"ejs":1068,"ejl":1164,"eje":2612,"em ":8110,"gis":2194,"giv":4136,"gin":973,"gio":2383,"gie":1747,"ghe":2580,"gge":15510,"gi ":1776,"gen":19301,"geo":1146,"get":6054,"ger":26388,"ges":3499,"gem":1137,"gel":6514,"gde":1133,"ge ":22940,"gad":1042,"gar":1035,"gav":1126,"gam":1188,"gan":5170,"fte":8906,"fun":1949,"ft ":4358,"fra":16811,"fre":2876,"fri":2352,"for":48487,"fol":2294,"fod":2203,"fle":2554,"fla":1188,"flo":1580,"fly":1416,"båd":1117,"fic":1488,"fil":4788,"fik":1836,"fin":3119,"fir":1330,"fis":991,"da ":2439,"dbr":1439,"dbo":3204,"de ":51365,"dby":3214,"dal":1321,"dag":2881,"dat":1966,"dan":12624,"dam":1233,"dda":1566,"dde":3541,"com":1147,"ch ":2261,"cer":3808,"ces":1087,"cen":3123,"cha":1908,"cia":1876,"ck ":2373,"cie":1532,"che":2404,"cke":1007,"ed ":35220,"ebe":1273,"ebo":1021,"ebr":1930,"eal":1397,"eat":1418,"efi":1531,"efo":3934,"eft":4164,"een":1037,"edl":1944,"edi":2262,"ede":18829,"eda":1072,"edt":1636,"eds":6584,"edr":1723,"eci":1115,"ece":1005,"dyr":1288,"dvi":2440,"dve":2150,"don":1291,"dom":2065,"ds ":4746,"dmi":1103,"dni":1787,"dst":6032,"dsp":2071,"dti":2638,"dte":2421,"dtr":1052,"duk":1051,"duc":1998,"dri":1805,"dra":1347,"dt ":18171,"dre":7141,"dro":1133,"dsk":3064,"dsb":2127,"dsa":1334,"dse":2268,"dgi":2184,"dia":1005,"der":53839,"des":7277,"det":27574,"deb":1003,"deh":997,"del":20136,"den":48394,"dem":1809,"di 
":1057,"dga":1116,"dle":3278,"dla":1839,"dkr":1756,"dli":6294,"din":2827,"dio":1576,"dis":4268,"dit":1114,"die":2342,"dig":3302,"rhu":1754,"rho":1270,"rga":2811,"ri ":2041,"rgi":1032,"rge":3583,"ret":16711,"res":7617,"rev":3074,"rfa":1970,"rfo":1099,"rds":1442,"rdv":999,"rg ":8008,"reb":1759,"rea":2289,"ref":3055,"red":12454,"rei":2319,"reg":5210,"rem":3072,"ren":13272,"rek":1859,"rel":3506,"rer":8658,"rep":1174,"rda":1335,"rdl":1187,"rdi":2720,"rde":7501,"re ":32700,"rbu":1055,"rbr":1012,"rd ":6821,"ras":1016,"rat":6278,"rbi":1442,"rba":1156,"rbe":2916,"rag":1612,"ran":8553,"ram":2785,"ral":3419,"rak":1777,"raf":2970,"rad":2370,"rs ":4872,"ros":1298,"rot":1028,"rom":2663,"ron":3183,"rop":3118,"rov":4008,"rod":3399,"roc":1755,"rol":2335,"rof":1572,"rog":3109,"rna":2635,"rne":13393,"rni":1274,"ro ":1858,"rma":3375,"rme":5215,"rli":2882,"rla":2011,"rn ":3815,"rks":2467,"rko":1321,"rki":1210,"rke":7276,"rka":1044,"rm ":1940,"næs":1010,"nær":1381,"rip":1436,"rio":1666,"når":1017,"rit":3128,"ris":7929,"riv":2443,"rig":7130,"ril":1328,"rik":9410,"rin":12007,"rim":1705,"ria":2312,"rib":1119,"ric":1788,"rid":1336,"rie":7227,"rif":1151,"rk ":5973,"rug":4144,"rue":1823,"rup":5078,"run":5278,"rum":2612,"ruk":1154,"rus":1691,"rva":1035,"rvi":1176,"rve":2295,"ry ":1399,"rsk":6329,"rsl":985,"rsi":2088,"rso":2553,"rsa":1053,"rse":2094,"rta":1313,"rst":8534,"rte":6848,"rti":3566,"rts":2294,"rt ":7502,"rri":1525,"rre":6580,"sag":1459,"sal":1002,"sam":8006,"sbe":2183,"san":3521,"sat":3398,"ryk":1357,"sho":1591,"sie":1150,"sid":5798,"sk ":33968,"sit":3034,"sis":4442,"sin":4733,"sio":2930,"sik":3367,"sig":4968,"sby":2385,"se ":14360,"sch":1769,"ser":12739,"ses":2382,"set":2238,"sfo":1331,"sep":1174,"sen":14543,"sem":1108,"sel":4270,"sek":1218,"spo":1957,"spr":2788,"spe":2284,"slæ":1631,"spi":8255,"spa":1108,"sol":1008,"som":26634,"son":4747,"sor":1570,"sog":2156,"st ":15643,"sli":981,"slu":1369,"sky":1101,"sla":3115,"sle":2180,"ski":4168,"skl":1367,"sko":3585,"skr":6518,"sku":2545,"ska":8399,"ske":25508,"sni":1817,"sma":1205,"sme":2348,"stæ":1164,"stå":3319,"syd":3714,"stø":3565,"syn":1352,"sys":1619,"sse":5630,"ssa":1137,"sso":1208,"ssi":4603,"ste":30468,"sta":15172,"stn":1107,"sto":7688,"sti":13739,"stj":1076,"stl":3640,"stu":1392,"str":12753,"sty":2236,"sva":1612,"sve":1375,"svi":1847,"tal":8954,"tag":2922,"tab":1032,"tad":2960,"tat":8263,"tar":2918,"tan":5101,"tam":1561,"te ":23992,"ta ":2303,"pa ":1106,"køb":1295,"pe ":3064,"par":4298,"pan":2233,"lå ":2263,"pec":1027,"pen":2557,"per":8319,"pel":1218,"pla":4596,"ple":1353,"læg":3305,"læn":1040,"lær":1638,"pil":7538,"pis":1494,"por":2733,"pop":989,"pos":979,"pol":3456,"ppe":4792,"pst":1130,"pte":1387,"pti":1074,"pri":5464,"pre":2422,"pro":9553,"pun":1123,"præ":2064,"løb":1835,"løs":1373,"mæn":1058,"mær":1139,"mål":1925,"mån":1657,"ra ":16943,"ngi":1084,"ngl":1961,"ngs":7820,"ni ":1743,"nge":21530,"ngd":1418,"nha":2979,"nhe":1266,"nel":5371,"nek":1429,"nen":7128,"nem":2182,"ner":14377,"net":9752,"nes":8157,"ng ":25397,"ned":2644,"nfo":1407,"nce":2989,"ne ":28324,"ndt":10291,"ndr":5292,"nds":9100,"ndo":2080,"ndl":3623,"ndk":1968,"ndi":3782,"nde":48861,"ndb":3975,"nda":1742,"nal":7713,"nan":1050,"nar":1016,"nd ":19511,"nav":5158,"nat":4037,"nas":1015,"na ":3264,"nve":1870,"nus":1183,"nto":1311,"ntr":3209,"nti":2579,"ntl":1237,"nta":2683,"nte":11676,"nsp":982,"nst":8074,"nse":6294,"nsi":1207,"nsk":23035,"nsb":1130,"nt ":5652,"ns 
":13305,"nog":1664,"nom":1719,"nor":6469,"nov":1161,"nne":10196,"nni":1944,"nma":3344,"nli":1178,"nla":1269,"nke":3504,"nkt":1750,"jæl":1755,"nie":2401,"nk ":979,"niv":1911,"nis":7071,"nit":1685,"nio":1012,"nin":14093,"nik":1101,"ogs":5045,"ogr":2357,"ogi":3275,"ogl":1041,"ogn":7366,"oge":2091,"ol ":1201,"oci":1344,"ock":2154,"ode":4999,"ods":1493,"of ":2459,"odb":2119,"odu":2868,"og ":59080,"oft":2765,"off":2072,"ofe":1326,"od ":3992,"obe":1539,"nyt":1313,"ote":1650,"ott":1354,"oto":1594,"ost":2113,"ov ":1326,"osk":985,"ose":1440,"ovi":1957,"ovs":2306,"ove":11716,"oun":1213,"our":1852,"opl":974,"ope":1372,"opf":1235,"opa":1354,"os ":2264,"opr":3026,"opt":1105,"ops":1721,"or ":31324,"ork":2427,"orl":1427,"orm":8791,"orn":2079,"ord":14176,"ore":6583,"orf":2453,"org":9163,"orh":1310,"ori":4799,"ort":7693,"ors":6893,"orv":1018,"ot ":1242,"orb":3261,"ora":1624,"old":10067,"on ":17437,"oli":4334,"oll":3644,"olk":2982,"ole":3254,"ols":1497,"olm":1298,"olo":4008,"oka":1272,"om ":32517,"ona":4226,"ond":1810,"one":10149,"ong":2553,"oni":2635,"ono":1620,"ons":4853,"ont":2499,"oma":2195,"ome":4668,"omi":1702,"omh":1267,"omm":13262,"omk":1807,"omp":2560,"omr":2620,"oms":1669,"op ":2057,"la ":1857,"gør":1749,"le ":18274,"lde":9460,"ldt":2484,"lds":2126,"lac":1163,"lad":4353,"lag":4790,"lan":22791,"lam":1129,"lar":1164,"lat":2523,"las":2596,"lav":1728,"lba":1173,"ld ":5587,"lbo":1181,"lbu":1855,"kvi":1085,"kva":1167,"kun":3121,"kul":1926,"kte":2817,"kst":2005,"kso":1301,"kue":1399,"ktr":1143,"kti":4258,"kto":1463,"ktø":1090,"ls ":1795,"lok":1028,"lom":2968,"lod":2232,"log":4064,"los":1196,"lot":1245,"lov":1376,"lmi":1310,"lme":1724,"lti":975,"ltu":1430,"lub":2283,"lsk":4901,"lst":4539,"lta":1027,"lte":2079,"lse":13829,"lre":2112,"lt ":6296,"lge":2455,"li ":1269,"lev":14315,"les":5717,"let":5891,"ler":27840,"lem":6534,"len":6881,"lek":2168,"leg":1270,"led":4289,"lla":4918,"lle":37621,"lli":6167,"lke":3500,"hæn":1282,"lm ":3777,"ll ":2166,"lit":4876,"lis":3436,"lin":9392,"hån":1024,"liv":2785,"lia":1325,"lik":2160,"lil":1691,"lig":34196,"lie":4111,"ma ":1277,"mag":1391,"mar":6718,"mas":1505,"mal":1831,"man":9443,"mat":3311,"mbe":3163,"me ":4862,"med":21647,"meg":1181,"met":7243,"mes":4874,"mer":15160,"mel":6023,"men":16847,"lut":1751,"lys":1458,"høj":2362,"hør":1554,"mpe":2509,"ms ":982,"mod":4612,"mon":1609,"mor":1053,"mt ":4527,"mst":1872,"mrå":2725,"mus":3381,"mul":1155,"mun":9999,"mhe":1090,"min":6763,"mil":3215,"mis":2297,"mid":2156,"mle":1475,"mkr":1669,"mmu":9493,"mme":16141,"vær":6621,"zir":1025,"væg":1206,"ytt":1956,"yst":3197,"ysk":5330,"yre":2022,"yr ":1010,"yde":3637,"yer":2527,"yen":3113,"yd ":1599,"ykk":1287,"yll":1699,"ynd":1352,"ygg":4471,"tør":4175,"tær":1046,"tæn":1204,"tår":2373,"sæt":2008,"så ":5902,"sær":1369,"røn":1765,"rød":1365,"vst":2580,"vir":2222,"vik":2041,"vil":2648,"vin":4397,"vig":1873,"råd":3726,"vid":3491,"vis":5672,"ræs":2950,"ræn":2483,"ræk":2614,"vn ":4721,"vne":3693,"vns":1255,"vok":971,"vor":5368,"ver":19395,"ves":5050,"vet":6585,"vej":1947,"ven":7443,"vem":1052,"vel":2142,"ved":12182,"vde":1584,"ve ":6698,"val":2588,"van":4077,"var":14654,"vat":1120,"usk":1540,"usi":3385,"use":2336,"ust":3006,"uss":1778,"uti":1400,"ute":1658,"us ":6486,"ut ":1499,"ure":2206,"urg":1747,"uri":1026,"urn":1614,"uro":2454,"urt":1085,"ur ":2594,"upp":2843,"umm":1471,"umb":1127,"ume":1120,"uns":1753,"unk":1906,"uni":2675,"und":15503,"una":2326,"ung":3315,"une":7027,"up ":2829,"ukt":2014,"um ":5774,"ult":2011,"uli":1842,"ule":1205,"uld":1327,"un 
":2311,"uge":2701,"ugl":1024,"ugu":1009,"ugt":1589,"udb":1145,"udd":1881,"ude":3316,"udg":3260,"udi":978,"ue ":1121,"uce":1989,"uer":2229,"ues":1633,"udv":2056,"uds":2177,"udt":1177,"uel":1000,"ub ":1257,"uar":1965,"ubl":1141,"ubb":1133,"ud ":2370,"typ":1409,"tyr":2052,"tys":3953,"ty ":1687,"træ":3121,"tur":4777,"tut":1397,"tun":1107,"tud":1198,"tyd":2007,"ts ":4804,"tre":5166,"tra":6691,"tri":4615,"tru":5093,"tro":3770,"try":1478,"tv ":1126,"tte":14611,"to ":3441,"tni":3342,"tne":990,"tof":1468,"tod":1141,"tog":1011,"ton":2739,"tol":1446,"tor":9016,"top":1001,"til":27566,"tik":5022,"tif":3307,"tie":1611,"tig":3596,"tit":2754,"tis":8549,"tin":4128,"tio":14633,"thu":1021,"tia":1323,"tid":6831,"tiv":4885,"tje":1256,"tli":4586,"tla":1138,"tle":1766,"tem":5581,"ten":11717,"tek":2453,"tel":3444,"teg":5502,"ted":5198,"th ":1733,"tet":9121,"tes":3522,"ter":41030,"på ":20701,"ti ":3596,"the":2230},"n_words":[9674395,11309170,8090238],"name":"da"}
\ No newline at end of file
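
(The same format repeats for every deleted profile: the file above is Danish, `"name":"da"`, and the next one below is German. To show how these counts get consumed, here is a simplified scoring sketch. langdetect's real detector samples n-grams randomly and normalizes smoothed probabilities, so this additive-smoothed trigram scorer is only an illustration of the idea, and the profile paths are again assumptions.)

    import json, math

    def detect(text, profile_paths, alpha=0.5):
        # Score each language by the smoothed log-probability of the
        # text's trigrams under that profile, then pick the best one.
        # This simplification skips langdetect's random sampling step.
        trigrams = [text[i:i + 3] for i in range(len(text) - 2)]
        best_name, best_score = None, -math.inf
        for path in profile_paths:
            with open(path, encoding='utf-8') as f:
                prof = json.load(f)
            total = prof['n_words'][2]    # total trigrams in the corpus
            vocab = len(prof['freq'])     # crude smoothing denominator
            s = sum(math.log((prof['freq'].get(g, 0) + alpha)
                             / (total + alpha * vocab)) for g in trigrams)
            if s > best_score:
                best_name, best_score = prof['name'], s
        return best_name

    # Assumed install locations; with the Welsh and Danish profiles above,
    # a Welsh sentence should score highest under "cy":
    print(detect('mae hyn yn esiampl',
                 ['langdetect/profiles/cy', 'langdetect/profiles/da']))
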
+++ /dev/null
-{"freq":{"D":565432,"E":313466,"F":303463,"G":350968,"A":448295,"B":452862,"C":218833,"L":272371,"M":385363,"N":224760,"O":171336,"H":264836,"I":169730,"J":158568,"K":336559,"U":140118,"T":243403,"W":221377,"V":203138,"Q":15370,"P":333284,"S":807982,"R":282432,"Y":16379,"X":9646,"Z":96079,"f":1052401,"g":2011313,"d":3686275,"e":12779820,"b":1254948,"c":2286896,"a":5009484,"n":7859499,"o":2714321,"l":3150399,"m":2035145,"j":85201,"k":1042619,"h":3128015,"i":7539051,"w":833806,"v":645462,"u":3015383,"t":5309288,"s":5151894,"r":6424621,"q":20503,"p":756433,"z":844832,"y":283697,"x":72882,"Ü":10506,"ß":107796,"Ö":13881,"í":8823,"é":38695,"ä":358815,"á":12062,"ü":397123,"ö":249595," l":98174," m":186199," n":127225," o":112357," h":112970," i":1142912," j":27689," k":107425," d":1491422," e":826321," f":160920," g":218820," a":569210," b":294630," c":26306," z":200898," u":467824," t":56510," w":322251," v":371555," p":57089," s":347463," r":56139," J":156270," K":328020," H":253550," I":146651," N":212076," O":159226," L":260916," M":370185," B":437655," C":196607," A":413316," F":291285," G":334059," D":543793," E":298852," Z":93212," Y":15401," S":736164," R":264582," Q":14562," P":316134," W":214460," V":189915," U":132267," T":228797," ä":10185," ö":18488," ü":31572," Ö":12893," Ü":10208,"A ":22760,"Da":90985,"Cl":9563,"Co":64248,"Ch":45017,"G ":11776,"Du":11229,"Do":28882,"Dr":19902,"De":166635,"Di":188563,"Fe":38252,"Fa":46184,"Eu":13931,"Er":67899,"Es":30499,"En":32393,"Ei":81512,"El":17463,"Ge":151044,"Ga":33294,"I ":13640,"Fu":26023,"Fr":58870,"Fo":32048,"Fl":33310,"Fi":36767,"B ":10700,"C ":14623,"Au":73540,"Ar":58134,"At":9258,"As":17569,"D ":9913,"Ba":87454,"Ab":25174,"Ad":11158,"Am":21201,"An":55559,"Ap":14324,"Al":66179,"Bu":55758,"Br":45253,"Ca":33967,"Bi":31885,"Be":143843,"Bo":32121,"Bl":15582,"Ku":30221,"Kl":25047,"Kr":46024,"Ko":56995,"Le":49783,"Li":55021,"La":89578,"Lu":17469,"Hö":9378,"Lo":26047,"Me":64987,"Mi":68245,"Ma":116460,"Mu":29273,"Mo":46837,"Ni":23664,"Ne":44049,"Na":66351,"P ":9001,"No":53500,"Ok":13364,"Ol":9042,"Ob":22144,"Gi":11262,"Gl":11809,"Gr":74845,"Go":17306,"Gu":11007,"Ha":86398,"He":62608,"Hi":21572,"Dé":12333,"Ho":42207,"Hu":11017,"Im":11278,"In":78332,"Is":10695,"Ja":77663,"Je":11227,"Jo":24131,"Ju":32751,"Ka":74688,"Ki":37983,"Ke":14966,"Um":8765,"Un":54096,"Tu":10377,"US":39827,"Tr":32100,"To":24574,"Th":35305,"Ti":19953,"Te":52263,"Ta":27705,"V ":9454,"Sy":15026,"St":193472,"Su":18450,"Wo":21476,"Wi":47351,"Wa":44055,"We":77082,"Vo":36505,"Vi":22665,"Va":14430,"Ve":104056,"Mä":13878,"Pu":10703,"Pr":83211,"S ":48175,"Pe":31242,"Pf":21717,"Pa":56041,"Pl":15401,"Po":48441,"Pi":16961,"Ph":15188,"Os":17404,"Or":58036,"R ":11116,"Kö":16414,"Se":64611,"Sc":120209,"Si":90352,"Sh":10028,"Sp":54743,"So":39076,"Ru":19547,"Sa":63564,"Re":106782,"Ri":24028,"Rh":15491,"Ro":44332,"Qu":12167,"SA":9500,"Ra":38143,"Mü":11354,"b ":47610,"a ":292619,"Sü":22392,"Ze":31894,"Zi":10309,"Zu":19954,"Zw":11243,"i ":187041,"ge":787899,"ga":94286,"fl":45063,"fg":17796,"ff":81186,"fi":73360,"fs":18166,"fr":68277,"fu":19769,"ft":140067,"fo":55471,"he":941699,"hb":12648,"ha":293469,"gn":22363,"gl":78298,"gk":13841,"gi":130788,"gh":20146,"gg":10845,"gu":58531,"gt":76690,"gs":110120,"gr":125748,"go":29970,"dt":100722,"du":88105,"dw":22283,"g 
":403400,"ea":71177,"eb":165228,"ec":131420,"aß":17528,"ed":151386,"de":1831853,"dg":9266,"di":398882,"dh":11418,"dk":27018,"dl":45536,"do":69104,"dn":16795,"ds":46937,"dr":54467,"ew":66696,"ex":29126,"eu":212719,"ev":25427,"ey":30641,"ez":98518,"fa":86743,"h ":463041,"fe":136348,"eh":196157,"eg":232810,"ef":68699,"ee":58149,"el":553287,"ek":82123,"ei":1578594,"ep":49360,"eo":41207,"en":2309793,"em":396788,"et":380175,"es":772223,"er":2971302,"ca":38911,"bz":11720,"e ":2082290,"bs":31297,"br":69597,"bu":78689,"bt":20985,"bo":31335,"bl":45392,"bg":11719,"bi":136887,"be":567764,"db":9156,"da":176290,"f ":156375,"cu":10926,"ct":17144,"co":32151,"ck":133244,"ci":25902,"ch":1933159,"ce":45478,"c ":18586,"az":20071,"ay":30095,"ba":134727,"d ":722683,"at":393648,"as":323772,"ar":487358,"ax":8820,"aw":12659,"av":27657,"au":496496,"ak":51121,"al":571224,"ai":78558,"ap":48798,"am":283487,"an":921405,"ac":186135,"ad":167659,"aa":47275,"ab":98737,"ag":124899,"ah":148615,"ae":30527,"af":117969,"nu":74735,"nt":478161,"ns":309893,"nr":20689,"no":90394,"nn":190420,"nz":136146,"ny":13727,"nw":36447,"nv":14218,"oe":10022,"of":64141,"oc":78480,"od":135885,"oa":14277,"ob":51225,"om":176521,"on":667125,"ok":27868,"ol":220141,"oi":24320,"og":71416,"oh":66434,"ot":88470,"os":118609,"ov":54069,"ou":69753,"op":68308,"oo":24531,"or":433879,"r ":1993661,"ow":58643,"oz":19078,"kö":10263,"pe":109404,"pf":29697,"pa":81077,"kü":12292,"pl":26694,"po":71059,"ph":41272,"lä":41208,"pi":98645,"lo":124488,"ln":33301,"lm":52275,"ll":300411,"ls":178640,"lr":10222,"lp":20136,"lv":15360,"lu":97812,"lt":212969,"lz":28989,"hö":49290,"ly":20012,"o ":123676,"iß":9834,"ma":207608,"hü":11610,"mb":77222,"me":410943,"mf":12265,"ml":10335,"mi":238709,"mm":108620,"mp":57358,"mo":55705,"mt":36178,"ms":30817,"mu":48362,"p ":27276,"na":304264,"nb":61884,"nc":42156,"nd":1104110,"ne":802403,"nf":54695,"ng":588331,"nh":45861,"ni":342822,"nk":102273,"nl":39986,"nm":15381,"ki":44691,"ke":186644,"ka":155408,"fü":107647,"m ":660573,"ks":34677,"kt":124687,"ku":45967,"ko":74144,"kr":80423,"kl":44918,"km":33455,"li":562165,"hä":31781,"lh":12595,"lk":36060,"le":501436,"ld":86618,"lg":41891,"lf":32687,"la":312768,"lc":19357,"lb":59934,"n ":2940210,"hr":245096,"hs":72646,"hw":53275,"ht":189239,"hu":73471,"hk":8886,"hi":180846,"hn":163077,"ho":102479,"hl":130889,"hm":50687,"id":80042,"eß":9854,"ic":534870,"ib":46123,"ia":115654,"ih":35970,"ig":262184,"if":71985,"ie":1150659,"hy":13546,"k ":140095,"ir":170539,"is":1295546,"it":570135,"iu":24806,"iv":65304,"ik":165059,"il":298251,"im":289814,"in":1828758,"io":214368,"ip":33386,"je":30947,"fä":14922,"iz":46043,"l ":312169,"ja":19963,"tä":51288,"xi":15762,"z ":144384,"sü":19128,"sä":14628,"wi":176498,"wo":47417,"wu":73059,"rö":38038,"y ":99455,"wa":198781,"rü":63451,"we":246633,"rä":41054,"vi":76382,"vo":299196,"uz":17658,"ve":197477,"va":38019,"x ":21295,"ui":24830,"uk":24301,"ul":92478,"ue":71615,"uf":143869,"ug":74097,"uh":10714,"ur":375391,"us":360270,"ut":230981,"um":178633,"un":921062,"up":64261,"ty":34073,"tz":131414,"tu":172266,"tt":172596,"tw":53648,"ub":43047,"ua":44618,"ud":39409,"uc":128427,"w ":36904,"to":167744,"tn":11002,"tm":18841,"tl":110769,"ts":280176,"tr":215121,"tp":9922,"tg":32640,"tf":25417,"te":1203919,"tk":15341,"ti":420789,"pä":14388,"th":119533,"v 
":14300,"tb":22514,"ta":360262,"su":36834,"sv":12561,"ss":289880,"st":1222626,"sy":19371,"sz":17514,"sw":26360,"sl":40519,"sk":49450,"sm":28595,"sp":131491,"so":116426,"sr":21109,"sd":16337,"sc":827974,"oß":20955,"sf":20622,"se":456715,"sh":47160,"sg":56341,"si":322120,"nö":10879,"rz":76103,"u ":108662,"sa":109787,"sb":46727,"rr":85173,"rs":267473,"rt":385464,"ru":174667,"rv":18995,"rw":59814,"ry":19301,"rp":30886,"ro":256692,"rn":187691,"rm":96942,"rl":93135,"rk":131828,"ri":506618,"rh":58397,"rg":185502,"rf":74192,"re":658545,"rd":287337,"rc":86223,"rb":105570,"ra":423969,"t ":1627252,"mö":8912,"qu":17893,"mä":18127,"lü":10612,"s ":1183348,"pt":54028,"pu":30332,"pp":47091,"pr":99340,"ps":12738,"zä":9571,"zö":21005,"wä":21780,"zi":99549,"zb":8992,"ze":213386,"za":20320,"zw":56096,"zu":170081,"zt":32056,"zo":13315,"ye":15230,"uß":27824,"ya":11551,"tü":14407,"yt":8983,"ys":38183,"yr":10616,"yp":13299,"yn":15755,"ym":18550,"yl":11401,"äc":27469,"Üb":9926,"ßb":17940,"ße":45162,"ßt":14023,"ép":12471,"ät":44405,"äu":30285,"äl":29341,"än":69784,"äs":15220,"är":37279,"äd":10841,"äg":13766,"äf":13624,"äh":39944,"ün":62239,"üs":17438,"ür":122114,"üt":16867,"üb":39622,"üc":24866,"üg":9236,"üd":43250,"üh":43336,"öß":14937,"ör":67546,"ös":54393,"ön":22557,"öl":13380,"öm":9902,"öh":14808,"öf":13450," Ga":33168," Ge":150714," Fo":31900," Fu":25992," Fr":58787," Fi":36574," Fl":33229," Ha":86258," He":62512," Go":17225," Gr":74652," Gu":10914," Gi":11192," Gl":11774," Hu":10983," Ho":42078," Dé":12331," Hi":21530," Je":11187," Ja":77578," Is":10643," Im":11222," In":77993," Ka":74525," Ke":14835," Ki":37849," Jo":24054," Ju":32722," La":89327," Le":49575," Li":54819," Kl":25011," Ko":56929," Kr":45960," Ku":30157," Ma":116111," Mi":68090," Me":64810," Lo":25969," Hö":9372," Lu":17420," Ne":43845," Na":66181," Ni":23603," Mo":46674," Mu":29161," Ap":14296," Am":21144," An":55404," Al":65979," Ad":11104," Ab":25090," Ba":87247," Au":73430," At":9235," As":17394," Ar":57936," Be":143584," Bi":31731," Bl":15450," Bo":31991," Br":45134," Bu":55625," Ca":33546," Ch":44814," Cl":9441," Co":63874," Da":90740," Di":188080," De":166199," Dr":19868," Do":28619," Du":11174," El":17420," Ei":81331," Es":30480," Er":67840," En":32299," Eu":13899," Fe":38170," Fa":46079," Sü":22387," Wo":21330," Wi":47141," We":76915," Wa":43907," Zu":19747," Zw":11238," Ze":31828," Zi":10283," Kö":16410," Os":17332," Or":58009," Po":48260," Pl":15331," Pi":16916," Ph":15079," Pf":21701," Pe":31180," Pa":55779," No":53429," Ol":9032," Ok":13357," Ob":22103," Ra":37989," Mü":11350," Qu":12048," Ro":44140," Re":106629," Ri":23976," Rh":15474," Pr":83041," Pu":10674," Mä":13877," Sy":14940," Su":18414," St":192451," Ta":27626," Th":35151," Ti":19864," Te":52071," US":39409," Tr":31991," To":24396," Ru":19497," Sa":63396," Sh":9926," Si":90133," Sc":119896," Se":64226," So":38900," Sp":54582," Va":14382," Ve":103873," Vi":22523," Vo":36449," Tu":10321," Um":8742," Un":54031," je":12055," im":207684," in":455196," is":439990," it":10403," ka":23448," fü":85353," ha":41553," he":33774," gi":8908," gl":12278," gr":31732," ih":21981," hi":13984," ho":9452," ni":22730," ne":14510," na":48500," od":67197," of":20927," nu":10713," no":19195," le":19412," li":42027," la":25753," ku":12534," km":15800," kl":13510," ko":18379," me":26065," mi":107786," ma":27726," ab":26560," am":69685," an":87925," al":101826," au":256133," ba":8860," bi":45326," be":204994," br":14611," bz":9809," er":81246," et":21454," es":15402," en":55368," ei":604709," 
eh":21203," fr":36324," ge":155650," ch":11442," da":104895," dr":12440," de":1066350," di":258330," du":31949," zu":137283," zw":39753," ze":10132," sü":17881," ru":14611," se":68014," sc":24771," si":106825," sp":27646," so":41648," re":21422," pr":15398," po":16457," wa":91486," we":72625," wu":68805," wi":72114," ve":73784," vo":278143," vi":13609," um":38406," un":417759," st":39848," tr":12290," th":15725," te":8776," Üb":9908," ös":13916," üb":31186,"Fer":11772,"Fil":18131,"Fam":19152,"Es ":23376,"Eur":10652,"Ein":58080,"Eis":15092,"Er ":28197,"Ent":11856,"Gem":51515,"Geb":17190,"Ges":25022,"Ger":10345,"Gen":10761,"Gat":12348,"Fuß":15716,"Flu":13729,"Fra":28567,"Fri":10250,"Fre":14074,"For":20619,"Dép":12076,"Hei":15430,"Her":23305,"Hal":8751,"Han":15746,"Hau":30716,"Gre":10954,"Gra":17232,"Gru":19222,"Gro":14185,"Int":14682,"Ins":19291,"Ind":10322,"In ":14119,"Hoc":11500,"Art":17165,"Aut":13725,"Aus":25209,"Aug":12878,"Auf":14417,"Ban":13867,"Bad":10629,"Bar":8982,"Bas":9343,"Bau":14505,"Als":17057,"Alt":10907,"Ant":9903,"Bun":26145,"Bur":11846,"Car":9966,"Bez":18543,"Bes":11001,"Ber":42078,"Bei":10336,"Beg":14069,"Bre":9847,"Bra":12567,"Dez":9022,"Der":97305,"Deu":28251,"Das":56242,"Chr":9059,"Chi":9883,"Cha":13525,"Com":14437,"Cou":18879,"Die":157652,"Dre":10394,"Dor":10511,"Neu":14698,"Nat":19706,"Nie":12050,"New":10171,"Nam":20762,"Nac":12244,"Nov":10049,"Nor":35685,"Okt":8850,"Obe":15435,"Ort":33777,"Ost":13628,"Pla":10533,"Pfl":9335,"Pfa":9319,"Per":13707,"Par":21713,"Pro":54127,"Pri":9311,"Pol":21608,"Mär":10626,"Jan":11806,"Jah":47539,"Joh":10371,"Jul":9776,"Jun":10796,"Kan":19278,"Kar":13767,"Kir":12749,"Kil":10995,"Kla":9457,"Kon":17432,"Kom":17455,"Kre":22963,"Kri":9328,"Kul":11508,"Lei":11844,"Lan":56584,"Lin":9234,"Lis":12636,"Men":8765,"Met":15567,"Man":15071,"Mal":9073,"Mar":33951,"Mai":15148,"Mon":12046,"Mit":29518,"Min":10015,"Mus":19078,"Süd":20812,"Wil":11137,"Wie":10497,"Wei":14673,"Wel":14947,"Wer":11351,"Wes":22169,"Was":9091,"Wal":11499,"Vor":17501,"Vol":11987,"Zei":17372,"Str":23794,"Stu":11271,"Sta":109473,"Ste":21247,"Tei":19885,"Sit":13505,"Sie":53287,"Ser":9549,"Sep":9592,"Sei":11021,"See":8773,"Spi":16479,"Spr":13123,"Sch":114345,"San":12822,"Rhe":11094,"Rec":10719,"Rei":15453,"Reg":42336,"Rom":8900,"Ver":96243,"Uni":18259,"Unt":29085,"The":20642,"US ":31557,"Tra":12824,"bis":44646,"bil":24249,"bin":15696,"ble":9072,"bli":23577,"be ":15350,"ban":20200,"bal":24830,"bah":12251,"bac":14708,"bau":19849,"bar":18594,"bei":74577,"bef":14886,"bed":10076,"ber":182051,"ben":84728,"bel":16201,"bek":17699,"bez":50353,"bew":10065,"bes":47672,"bet":15480,"bie":27376,"bge":10311,"bzw":9957,"ca ":13669,"ce ":20992,"bt ":13539,"bri":17955,"bra":14742,"bre":12676,"bru":12222,"bst":12657,"bur":42376,"bun":13485,"am ":57496,"al ":67162,"ain":24754,"ais":13530,"ahm":10952,"ahn":21966,"ahl":22279,"ahr":74469,"anu":15776,"anz":50908,"ano":9997,"ann":95807,"ant":49762,"ans":31362,"ane":16424,"ang":74734,"ani":87171,"ank":29449,"ana":30762,"anc":17098,"and":264094,"amt":12217,"amm":42375,"amp":12259,"ami":43004,"ame":78887,"amb":11361,"ama":13307,"alz":12079,"alt":72681,"als":84009,"all":87642,"ali":76090,"ald":16999,"ale":62596,"ala":15461,"alb":20979,"an ":124982,"akt":19509,"abe":38207,"ae ":18008,"aat":34161,"ad ":15688,"ab ":9708,"aft":77933,"aff":9982,"ai ":14358,"age":54066,"adi":20643,"ade":24209,"ag ":17784,"adt":77282,"ack":9729,"ach":145520,"ace":11312,"aye":9954,"at 
":58813,"are":33081,"ard":28108,"arc":10050,"arb":19113,"ara":24649,"aro":9777,"arl":16356,"ark":32666,"ari":41626,"arr":11665,"ars":12931,"art":72580,"au ":29621,"asi":19652,"ase":11784,"ar ":124263,"as ":155043,"aut":27334,"ay ":9249,"ata":11051,"ast":27543,"ass":73183,"ato":14091,"ate":62804,"ati":113716,"ath":22536,"auc":75721,"att":42075,"ats":11481,"atu":18974,"atz":18097,"aum":14540,"aup":28995,"aus":149891,"aue":19995,"auf":98508,"itu":19936,"itt":50909,"its":25338,"itz":41656,"ism":9438,"iss":46444,"ist":567635,"ita":30952,"ite":83935,"itg":12750,"iti":55617,"ium":14795,"ivi":10241,"ive":36462,"is ":120788,"ion":169772,"irt":10794,"irk":24645,"isi":17517,"ish":14382,"ise":39335,"isc":434611,"isa":10445,"ire":14375,"ird":46859,"irc":21452,"it ":186965,"itä":20849,"izi":17119,"ize":13649,"kir":9307,"kis":9934,"km ":11509,"ki ":9579,"kei":25216,"kel":18319,"ken":38622,"ker":44613,"key":12420,"ke ":26956,"kra":15849,"kre":43910,"kt ":30020,"kri":11594,"kon":15402,"kom":22572,"ks ":9057,"kma":10480,"kle":18061,"kla":10851,"kat":15038,"für":75640,"kan":77012,"kal":10319,"füh":22557,"ka ":18181,"han":36578,"hal":38788,"hau":37347,"har":21603,"hat":27886,"haf":79189,"hab":9073,"he ":213315,"hel":11970,"hei":73898,"hec":9261,"heu":16839,"hes":34182,"her":185057,"heo":9547,"hen":324438,"hem":37228,"hie":35579,"hic":13745,"hin":28238,"hil":15183,"his":35328,"hl ":14867,"hn ":19219,"hla":28455,"hle":29146,"hli":16905,"hlo":9385,"hlu":9270,"gle":20084,"gli":40281,"gke":10880,"gs ":14145,"gsb":9179,"gsg":11045,"gro":10882,"gru":14203,"gra":25992,"gt ":53228,"gri":24893,"gre":15248,"gst":11232,"gss":10565,"gte":17882,"grö":13018,"grü":19508,"gus":11665,"gun":24014,"ial":21666,"ian":25164,"iat":10334,"ibt":8975,"id ":10846,"ibe":11815,"ia ":36994,"iet":30400,"iel":84385,"ien":123978,"ier":122946,"ies":42285,"ied":66086,"ieg":57690,"ief":8944,"ieh":10954,"ig ":34684,"iec":11989,"ieb":29946,"ift":20078,"iff":26227,"ick":25712,"ich":454249,"ie ":537544,"ica":15678,"ide":33142,"ida":13186,"ieß":9409,"il ":53074,"im ":225234,"ika":58499,"ige":123608,"iga":12204,"igk":10797,"igi":12120,"igu":13208,"igt":19283,"ign":9138,"ihe":9068,"ihr":18848,"ik ":39429,"imm":16766,"ime":10380,"ind":156874,"ina":40189,"inn":28145,"ino":11205,"int":30667,"ins":51098,"inf":10131,"ine":425715,"inh":13926,"ing":93214,"ini":56092,"inl":11619,"ink":12759,"inw":21089,"inz":40189,"ike":34841,"in ":793598,"ilo":17887,"ill":36002,"ilm":27378,"ili":52680,"ild":30027,"ile":21718,"ima":11905,"io ":15324,"hr ":43100,"hol":19495,"hor":9009,"hof":13254,"hoc":13003,"hni":15320,"hnu":17265,"hne":77095,"hme":30297,"hul":13709,"htu":10990,"hts":18003,"hti":8985,"hte":48599,"hst":18465,"hse":23032,"hrt":21145,"hre":87475,"hri":38046,"ht ":81034,"hwa":10659,"hwe":33406,"hum":9803,"hun":26557,"ffe":34346,"fes":13275,"fer":33138,"fen":43852,"fel":18220,"fge":10643,"fas":17434,"fan":11879,"fal":21835,"fah":14992,"ff ":20858,"fe ":14115,"ewä":9115,"eze":57686,"ezi":30267,"eta":11725,"ete":74739,"eti":14295,"eso":11977,"est":128414,"ess":60542,"eue":13801,"eug":13587,"etr":30198,"ett":26303,"etw":18129,"etz":35905,"ew ":11648,"eut":121638,"eur":19487,"ewi":9238,"ewe":18343,"ey ":16388,"erö":10701,"er ":1578023,"eor":17275,"es ":365853,"ept":13647,"epu":9001,"erk":45646,"erl":56900,"eri":118551,"erg":76890,"erh":33322,"ere":140461,"erf":36150,"erd":42731,"era":58724,"erb":63293,"et 
":137716,"esi":26333,"esc":45868,"ese":67043,"esa":12858,"erz":20677,"erv":14305,"eru":46595,"erw":47997,"err":51760,"ert":122552,"ers":190257,"ern":139910,"erm":29678,"erp":11457,"ero":15582,"ekt":37882,"en ":1453342,"elb":18844,"ela":19480,"eld":20851,"elc":11326,"elf":9116,"ele":71198,"eli":22926,"elm":9454,"eln":17751,"ell":119434,"els":30215,"elt":58131,"ehö":33094,"eiß":9569,"emb":35370,"ema":36279,"eme":116815,"emi":15129,"enf":15937,"ene":83278,"enh":15362,"eng":41769,"enb":41104,"ena":52256,"end":112707,"eno":10734,"enn":31346,"enk":30270,"enl":8977,"eni":28822,"ens":113000,"ent":181395,"enr":9992,"enz":28843,"ege":66150,"egi":51407,"egr":34898,"egt":35267,"ehm":19125,"ehr":37443,"ehe":47146,"eib":17640,"eic":136250,"eht":23060,"eis":113764,"eim":26661,"eil":79634,"ein":827667,"eih":11402,"eie":15075,"eid":22147,"eig":23796,"el ":101100,"eiz":17159,"eit":174763,"efü":9260,"eka":19118,"em ":162112,"gis":26824,"gin":15255,"gio":29407,"gie":31403,"gen":239581,"get":10602,"ger":96243,"ges":76492,"gew":20591,"geb":50080,"geh":39393,"geg":29832,"gef":16578,"gem":34840,"gel":53887,"ge ":89868,"gab":8826,"gar":15096,"gan":29626,"ga ":12696,"frü":9222,"fte":22800,"ftl":11215,"fun":10974,"fts":18001,"ft ":64901,"fra":26521,"fre":14454,"fri":12770,"for":35090,"fol":14926,"fla":12837,"flu":9809,"fil":11145,"fin":25920,"da ":17513,"de ":197184,"das":76737,"dar":15587,"dam":14425,"ckl":11099,"chä":10997,"ch ":425455,"cha":118322,"chw":51871,"chu":40552,"ck ":31790,"che":671441,"chl":70583,"chi":103178,"cho":20768,"chm":11901,"chn":86780,"chs":68005,"cht":155579,"chr":43254,"cke":51895,"ed ":20514,"eba":8988,"ebe":60498,"ebi":30463,"ebr":17022,"ebu":9376,"eat":10811,"efi":14121,"efe":11305,"ei ":84962,"ega":9107,"eer":10701,"edi":25198,"ede":66213,"eg ":9875,"eck":32288,"ech":81379,"aße":12922,"ee ":17886,"dwe":14218,"dur":31652,"dor":21874,"don":10905,"ds ":13347,"dun":25608,"dri":12382,"dt ":65936,"dre":16392,"dsc":11728,"der":838163,"des":213577,"det":50672,"deu":71719,"del":34612,"den":286768,"dem":104609,"dkr":22973,"dli":27133,"din":14965,"dis":37105,"die":268782,"dig":18151,"rhe":18402,"rha":21798,"rga":19288,"rgi":14417,"rge":50622,"ret":20023,"res":34677,"reu":13265,"rfa":14831,"rfo":9599,"rg ":68697,"rea":10702,"rec":32717,"raß":12628,"rei":196090,"reg":17320,"rem":14552,"ren":145744,"rer":33774,"rf ":19644,"rdn":11514,"rdl":8810,"rdi":14216,"rde":136404,"re ":80803,"rbr":9368,"rch":72681,"rd ":68822,"rap":8929,"rar":8924,"ras":13690,"rat":45378,"rau":34010,"rbi":13086,"rba":21515,"rbe":36419,"rai":10459,"rag":25026,"ran":87263,"ram":18048,"ral":31964,"rab":8938,"raf":18926,"rad":17310,"rac":30425,"rs ":37799,"ros":12129,"rot":11834,"rom":12509,"ron":32055,"rop":24374,"rov":26322,"rod":18103,"rol":12320,"rof":8772,"rog":9738,"rns":12992,"rna":25318,"rne":34331,"rni":11673,"ro ":10671,"rma":31109,"rme":19061,"rmi":10506,"rli":33691,"rle":11502,"rla":23240,"rn ":74871,"rks":11567,"rke":27812,"rm ":16219,"rit":36924,"ris":65792,"rig":20472,"ril":13027,"rik":52744,"rin":60346,"ria":21040,"ric":47382,"rie":98836,"rif":31298,"rk ":38902,"rwe":21555,"rz ":21038,"ruc":12341,"rup":19176,"run":74339,"rum":15222,"rus":14979,"rwa":28214,"ry ":12420,"rsi":21022,"rso":11863,"rsp":15246,"rsc":59984,"roß":19513,"rsa":9024,"rse":16931,"rta":10712,"rst":71108,"rte":100313,"rth":11159,"rti":23089,"rua":8770,"rts":42716,"rtr":18145,"rt 
":139444,"rro":10371,"rri":12758,"rre":38535,"rra":9354,"sam":29233,"sbe":20860,"san":12568,"sat":17995,"rze":22030,"sha":10198,"sho":11670,"sge":44768,"sie":44959,"sic":57522,"sit":30295,"sis":59876,"sin":51565,"sio":15881,"sik":20202,"se ":74178,"oße":10850,"sch":817718,"ser":52310,"ses":13066,"set":25883,"seu":12734,"sei":61074,"seh":13958,"see":12215,"sen":104567,"sem":11510,"sel":53325,"spo":10155,"spr":34328,"spe":13644,"spi":46629,"spa":12483,"sow":18519,"sol":10161,"son":33802,"sor":15536,"sre":10655,"st ":524603,"ss ":36271,"sla":20592,"ski":8870,"ska":11035,"so ":11408,"swe":11000,"stä":18451,"sse":110613,"ssa":12522,"sso":12151,"ssi":42403,"sst":37624,"ste":300373,"stf":9710,"sta":116653,"sto":29905,"sti":57375,"stl":38070,"stu":17599,"str":60741,"sun":13352,"sve":8840,"tal":50681,"tag":11693,"taa":33781,"tad":80208,"tbe":10504,"tau":10504,"tat":27116,"tar":23114,"tan":59689,"tam":12219,"te ":230506,"ta ":18923,"pe ":20233,"par":30140,"pan":20821,"läc":9211,"län":14161,"phi":11788,"pen":21299,"per":31265,"pel":10530,"pla":12193,"pie":61526,"por":21385,"pol":21212,"ppe":33120,"pub":8976,"pte":12552,"pts":12222,"pra":20767,"pri":21158,"pre":14559,"pro":24883,"ra ":23418,"ngi":9157,"ngl":32203,"ngs":81612,"ni ":15548,"nge":150664,"nga":14715,"nha":17909,"nhe":16114,"neh":18303,"nel":12298,"nen":124634,"nem":32650,"ner":149716,"net":57023,"nes":53346,"neu":10682,"ng ":256902,"nfo":9002,"nfa":11653,"nce":12344,"nch":15880,"ne ":310275,"nbu":15121,"ndt":9026,"ndu":21347,"ndr":11275,"nds":26772,"ndo":18073,"ndl":16133,"ndk":23064,"ndi":40257,"nde":333302,"nda":14652,"nal":58896,"nam":18687,"nan":39807,"nar":14659,"nac":41308,"nad":10470,"nah":12864,"nbe":18026,"nd ":563951,"nba":15883,"nau":11932,"nat":30766,"na ":34549,"nz ":37982,"nwo":19169,"nve":9948,"nun":33102,"nur":9530,"nua":9860,"nty":16624,"ntw":17792,"nto":23385,"nts":23619,"ntr":27593,"nti":33190,"nth":12699,"ntl":20923,"nta":24938,"nte":156191,"nsp":11166,"nst":73244,"nse":37942,"nsi":11408,"nsc":47879,"nsa":13614,"nsb":9715,"nt ":97967,"ns ":52604,"nom":14191,"nor":17522,"nne":52732,"nni":11264,"nnt":52262,"nns":10154,"nli":11747,"nn ":35057,"nla":20752,"no ":11518,"nke":16356,"nkm":14368,"nkt":16086,"nkr":10153,"nig":30567,"nie":44175,"nic":23391,"nia":9121,"niv":12854,"nis":144657,"nit":17226,"nin":9152,"nik":13041,"ogr":16058,"ogi":20348,"oge":19173,"ohl":8865,"ohn":34927,"och":36284,"ock":28973,"ode":93686,"of ":22612,"odu":15579,"off":17101,"nzö":20567,"obe":24203,"nze":40975,"nzi":11904,"owi":20813,"ozi":9366,"ow ":10053,"oti":8974,"oth":10089,"ote":15159,"ott":13896,"oto":13207,"ost":26924,"ose":16100,"oss":15828,"ovi":27674,"ove":15851,"oun":22912,"our":13749,"oph":11323,"opa":9142,"os ":20626,"or ":51491,"ork":11076,"orm":35190,"orn":17190,"ord":64827,"ore":22869,"orf":23363,"org":24171,"ori":35553,"ort":57075,"ors":22259,"ora":13071,"ola":9778,"old":10163,"on ":380926,"oli":42117,"oll":31645,"olk":11956,"ole":12997,"olg":16083,"olo":30821,"om ":33290,"ona":50054,"ond":30722,"one":41633,"ong":12616,"oni":29183,"onn":12427,"ono":12443,"ons":40673,"ont":24319,"oma":20160,"ome":26297,"omi":13337,"omm":35850,"omp":19252,"omo":11157,"la ":19811,"lb ":10692,"le ":103052,"lch":16014,"lde":27222,"lac":9475,"lag":29983,"lan":114878,"lar":12710,"lat":31357,"las":30573,"lau":19611,"lba":8767,"ld ":26676,"lbe":17468,"kur":12239,"kun":14388,"kte":20495,"ktr":9556,"ktu":9324,"kti":30020,"kto":15493,"ls 
":107647,"lon":9380,"lom":14967,"log":27324,"los":22267,"lti":11054,"ltu":37425,"lug":9766,"lsp":9315,"lst":14411,"lte":61749,"lsc":14894,"lt ":63451,"lge":22548,"li ":14630,"les":24231,"let":14648,"ler":100316,"lem":16242,"len":91180,"lek":12889,"lei":58951,"leg":20421,"leb":11626,"lls":25768,"llu":11230,"llt":17543,"lla":17394,"lle":120640,"lli":30333,"ln ":17395,"lm ":19332,"ll ":39906,"lit":40830,"lis":62203,"lin":51826,"lic":170755,"lia":13089,"lik":12998,"lig":41309,"lie":114815,"ma ":15588,"mar":19548,"mal":49935,"man":57586,"mat":30614,"mbe":37028,"me ":36368,"met":20898,"mes":11403,"mer":74569,"mel":10134,"men":133736,"mei":94533,"meh":11820,"mfa":9540,"lve":9179,"lun":37355,"lus":22607,"lz ":10714,"hör":33720,"mpf":9424,"ms ":14667,"mon":13165,"mt ":15320,"mte":10917,"mus":17368,"mun":19505,"min":21460,"mil":28731,"mis":23371,"mit":124264,"mig":9701,"mie":12587,"mmu":13317,"mmt":13008,"mme":52772,"zt ":19861,"zte":9874,"zu ":51921,"zw ":9786,"zug":11138,"zur":37734,"zum":31618,"zun":15443,"zus":11821,"zwe":22370,"zwi":20415,"zei":75965,"zeu":11441,"zes":9884,"zen":43803,"zem":9613,"zel":10023,"zer":23898,"ze ":15773,"zia":11658,"zie":29129,"zig":9649,"zir":18395,"yst":17491,"ußb":14873,"uße":8940,"yer":10735,"tän":11031,"tät":24189,"süd":17890,"wur":70546,"woh":26117,"wes":27777,"wer":58987,"wen":21225,"wel":20147,"wei":83410,"weg":12299,"wir":55249,"wis":33738,"wie":34766,"wic":21374,"wa ":19878,"röß":14321,"wan":13038,"rün":30393,"wal":31257,"war":103047,"rüh":12452,"rüc":12801,"vin":25318,"vie":18884,"von":226355,"vom":22538,"vor":35872,"räg":10007,"ver":144066,"ven":17663,"vem":8808,"ve ":14670,"van":10775,"usi":22265,"usg":18900,"use":25762,"usa":16989,"ust":38480,"uss":45929,"usp":9564,"uti":13537,"ute":38886,"utz":15915,"uts":91328,"uto":20186,"us ":143584,"ut ":20376,"urd":76437,"urc":36382,"ure":16506,"urg":50861,"uri":14126,"urn":9214,"uro":18224,"urs":13370,"urt":10114,"urz":14868,"ur ":74659,"upp":21507,"upt":28543,"ums":9212,"umb":12929,"ume":14114,"umf":9413,"unt":60343,"uns":11344,"unk":20377,"uni":19103,"und":450828,"ung":313251,"ukt":12384,"um ":104918,"ult":19564,"uli":15580,"ule":13595,"ula":10852,"uge":19918,"uft":12442,"ugu":12915,"uf ":70919,"ude":15376,"ue ":12656,"uch":109126,"uck":11441,"uer":24349,"ufe":13329,"ufg":14459,"uen":13989,"uel":10929,"uar":22497,"ubl":12721,"tzt":23155,"tzu":9903,"tze":24373,"ty ":26382,"twa":23393,"tur":43568,"tun":79448,"tum":9791,"tz ":51212,"twi":14763,"twe":10501,"ts ":32847,"tre":34672,"tt ":15987,"tra":75255,"tri":43316,"tru":19097,"tro":30104,"tsc":125606,"tsg":8757,"tsp":10708,"tst":44445,"tte":89035,"tti":9403,"ttu":15931,"to ":13838,"tob":10974,"tom":9404,"ton":34822,"tor":56677,"tik":35201,"tie":28029,"tig":44003,"tit":14488,"tis":66050,"tin":30024,"tim":13478,"tio":114009,"thu":10251,"tiv":26072,"tli":78058,"tla":9588,"tle":19198,"tem":46999,"ten":278529,"tei":81154,"tel":91292,"teh":26143,"tec":9644,"tfa":9048,"th ":15758,"tet":40557,"tes":39452,"ter":318918,"tgl":9160,"tge":13122,"tho":19217,"the":36369,"tha":11467,"zös":20559,"zäh":9158,"épa":12146,"ähl":11359,"ähr":16538,"äch":26798,"äng":21244,"änd":29875,"ält":11293,"ät ":14412,"ärz":9551,"äte":10008,"äuf":9858,"ßte":10219,"ßen":17047,"ßer":9786,"ßba":15077,"ße ":16400,"Übe":9600,"üdl":8991,"ühe":8949,"übe":35449,"ück":19105,"ünd":28744,"ür ":75451,"üns":8869,"ühr":24588,"ößt":9800,"öst":25495,"ösi":20607,"ört":26727,"örd":14857,"öni":10085,"öff":11553,"ürt":8780},"n_words":[87197534,99298261,71857404],"name":"de"}
\ No newline at end of file
+++ /dev/null
-{"freq":{"D":1964,"E":1941,"F":1941,"G":1852,"A":4125,"B":2624,"C":3923,"L":2060,"M":3387,"N":1723,"O":1910,"H":2233,"I":2360,"J":1156,"K":1166,"U":1129,"T":2696,"W":1311,"V":1075,"P":2832,"S":4250,"R":1938,"f":2440,"g":5100,"d":7728,"e":26057,"b":4017,"c":7473,"a":25408,"n":18610,"o":18497,"l":12659,"m":7083,"k":2907,"h":7717,"i":21525,"w":1649,"v":2733,"u":9674,"t":15175,"s":12992,"r":19022,"p":4711,"z":1248,"y":3535,"x":1425,"μ":190953,"ν":363085,"ξ":16299,"ο":514934,"θ":54925,"ι":405235,"κ":245170,"λ":175427,"δ":87843,"ε":333395,"ζ":19116,"η":223442,"α":554256,"β":36747,"γ":101330,"έ":90196,"ά":106544,"ί":162652,"ή":98446,"Ω":1026,"Υ":1630,"Τ":16684,"Χ":5356,"Φ":5011,"Ρ":4090,"Π":18130,"Σ":17983,"Ο":20140,"Μ":17575,"Ν":8294,"Κ":18111,"Λ":6273,"Θ":3309,"Ι":8695,"Ζ":1967,"Η":13434,"Δ":9926,"Ε":15220,"Β":10823,"Γ":10307,"ΐ":1269,"Α":27922,"Ό":1024,"Ή":969,"Έ":4042,"Ά":2718,"΄":1288,"ϊ":3454,"ω":82386,"ψ":4318,"ώ":40333,"ύ":61448,"ό":136039,"σ":232383,"ς":240747,"ρ":274072,"π":191347,"χ":59122,"φ":44210,"υ":205289,"τ":474096," o":780," d":1163,"р":660," a":888," t":1268," p":1168," s":849," r":827," J":1101," K":1080," H":2002," I":1555," N":1284," O":1504," L":1764," M":2988," B":2227," C":3233," A":3004," F":1657," G":1632," D":1595," E":1570,"и":935,"о":946," S":3325," R":1624," P":2318,"а":1103," W":1185," V":798," U":829,"е":697," T":2305," Ι":8408," Θ":3256," Λ":6140," Κ":17657," Ν":8118," Μ":17373," Ο":19727," Α":26876," Γ":10190," Β":10719," Ε":14504," Δ":9620," Η":13284," Ζ":1948," Έ":4041," Ή":969," Ό":1018," Ά":2711,"A ":657," υ":7977," τ":169076," χ":14827," φ":8389," ρ":2618," π":75432," σ":77620," ύ":1013," ό":12814," ω":4856," ψ":866," ζ":1711," η":13278," δ":29475," ε":62623," β":11610," γ":22099," α":74025," ξ":1157," ο":35203," μ":51074," ν":14449," κ":74116," λ":7381," θ":8090," ι":6475," Χ":5293," Φ":4909," Υ":1521," Τ":16381," Σ":17547," Ρ":3989," Π":17476," ί":1230," ή":17892," έ":20443," ά":5017," Ω":1004,"Co":724,"H ":665,"C ":687,"O ":936,"Ma":1040,"To":652,"b ":924,"a ":4343,"i ":1712,"ge":831,"he":1545,"ha":1115,"gh":801,"g ":1029,"ea":941,"ed":859,"de":1790,"di":1004,"do":657,"h ":975,"el":1708,"en":2808,"em":659,"et":1060,"es":2097,"er":4530,"ca":818,"e ":6418,"f ":722,"co":859,"ci":771,"ch":1350,"ce":966,"d ":2124,"at":2084,"as":1170,"ar":3025,"al":2276,"ai":668,"am":1144,"an":4262,"ac":959,"ad":799,"nt":1738,"ns":835,"of":654,"om":903,"on":3275,"ol":1227,"os":875,"ou":924,"or":2245,"r ":2839,"pe":702,"lo":869,"ll":1391,"o ":2523,"ma":1208,"mb":957,"me":1201,"na":1567,"nc":799,"nd":1677,"ne":1562,"ng":1277,"ni":1477,"m ":1364,"li":1620,"le":1892,"la":1782,"n ":4655,"ht":808,"hu":877,"hi":783,"ic":1794,"ia":1591,"ig":1086,"ie":1115,"k ":870,"ir":779,"is":1816,"it":1405,"il":1191,"in":3273,"io":1643,"l ":2336,"y ":1749,"vi":759,"ve":914,"x ":1036,"ul":682,"ur":1293,"us":1362,"um":1309,"un":754,"to":1131,"tr":915,"te":1980,"ti":2250,"th":1729,"ta":1485,"ss":802,"st":1761,"se":995,"si":952,"rs":663,"rt":997,"ro":1924,"ri":2929,"re":1965,"rd":715,"ra":2129,"t ":3000,"s ":5020,"ώρα":1560,"ώτο":1121,"ώτη":1214,"ώσσ":997,"ώσε":810,"όεδ":664,"όδο":943,"όγο":980,"όν ":1172,"ός ":18179,"ϊκή":860,"ώθη":1056,"ώνε":1144,"ώνα":1750,"ώνη":931,"ώνυ":1219,"ώμα":1362,"ών ":13524,"ύου":1130,"ύμφ":1059,"ύμε":885,"ύμα":648,"ύντ":1574,"ύνο":857,"ύσε":1378,"ύτε":2598,"ύστ":2139,"ώς ":2025,"ύπο":1064,"ύρι":1436,"ύθη":1068,"όλο":1532,"όλε":1360,"όλη":2622,"όμω":688,"όμα":718,"όμε":2708,"όμο":1617,"όντ":1036,"όνι":1308,"όνο":4918,"ύν ":1686,"όπω":1790,"όπο":2513,"ύο 
":1340,"όρο":2945,"όρη":995,"όρε":1336,"όρι":647,"όσμ":1304,"όστ":662,"όσο":1170,"ύς ":3624,"ότε":4438,"ότη":3717,"ότα":2254,"ότι":2195,"χώρ":2395,"χωρ":2665,"χρη":1672,"χρι":1320,"χρο":1146,"χρό":1127,"χολ":997,"χος":1298,"χου":1973,"χνη":657,"χιλ":816,"χικ":844,"χθη":1072,"χημ":1273,"χεδ":695,"χεί":1404,"χει":4759,"χετ":1198,"χαι":840,"χαί":1771,"χαρ":1871,"χαν":1478,"φόρ":1260,"χές":666,"χής":681,"χία":1887,"φων":1670,"φυσ":992,"φος":1513,"φορ":4132,"φου":862,"χε ":1257,"φικ":1262,"φιλ":1198,"φθη":729,"φερ":660,"φία":1007,"χή ":2299,"φαί":1221,"φαι":2017,"φαν":1114,"φαλ":712,"ωτο":763,"ωπα":870,"ωνσ":715,"ωνι":2150,"ωνα":1612,"ωνί":1735,"ωστ":3134,"ωτα":785,"ωτε":1561,"ωτι":1342,"ωση":2679,"ωρι":2402,"ωπο":844,"ωρί":1331,"ωρε":765,"ωμέ":1015,"ωμα":2019,"ωγή":1132,"ωγρ":700,"ως ":10658,"ων ":25196,"τά ":6317,"σήμ":967,"σία":5336,"σίλ":677,"τή ":2856,"σεω":1071,"σει":4874,"τε ":2677,"σημ":4910,"σης":5924,"σας":901,"σαν":2104,"σαλ":862,"τα ":16925,"σμι":908,"σμο":3140,"σμό":3646,"σου":1542,"σπα":974,"σιο":1662,"σικ":3297,"σιλ":2336,"σιμ":2018,"σκε":4485,"τη ":14833,"σια":1815,"σιά":738,"σμα":2223,"σμέ":1983,"τι ":2082,"σκο":2292,"σκη":760,"συσ":899,"σφα":2732,"συγ":2509,"συμ":1798,"συν":5781,"στρ":4375,"στό":2708,"σχε":1174,"σχο":662,"σπο":1081,"το ":35834,"στο":18068,"στι":9089,"στη":20031,"στε":4570,"στα":7955,"στί":782,"σσό":800,"στέ":1100,"στή":4608,"στά":1775,"σσε":911,"σσα":2647,"ρόε":683,"ρόσ":877,"ρότ":966,"ρόμ":822,"ρόν":1720,"ρόπ":819,"ρωτ":3312,"ρωσ":834,"ρωπ":1145,"ρωμ":989,"ρων":1731,"ρχα":2526,"ρχί":1672,"ρχε":1183,"ρχι":1336,"ρχη":823,"ρχο":1048,"ρυσ":751,"σα ":3872,"ρύθ":842,"ρώτ":2338,"ρώπ":1017,"ρών":1362,"σι ":958,"ση ":12367,"σε ":13475,"σο ":1455,"τότ":864,"τός":2183,"τύπ":651,"φή ":1774,"τών":2189,"Α ":1540,"υαρ":1339,"υβέ":734,"υγο":1001,"υγγ":1153,"Β ":722,"υγκ":1660,"Η ":10508,"υθο":2042,"υκλ":727,"υλί":1002,"υλι":722,"υλο":1136,"υμα":785,"υμβ":739,"υντ":1338,"υνθ":722,"υνο":1544,"υνα":2074,"υνε":870,"υνδ":710,"υνί":677,"υνή":706,"υνέ":720,"υμο":1114,"υμπ":1569,"υμμ":700,"υπη":741,"υπά":768,"υπή":1017,"υρο":983,"υρω":883,"υρί":1627,"υρά":671,"υπό":808,"υργ":2911,"υρα":699,"υρκ":776,"υρι":1000,"υπο":3053,"υτα":731,"υτι":3118,"υστ":2102,"υτή":1523,"υσα":1288,"υσί":1021,"υσμ":1067,"υσι":2818,"υση":1190,"΄ ":1237,"υτό":1999,"υτο":2715,"φέα":711,"φέρ":3057,"τάλ":737,"τάθ":737,"τάν":734,"τάσ":1170,"τέλ":1720,"τές":1644,"τέρ":1900,"σότ":899,"τέχ":815,"σωπ":940,"ταφ":747,"ταν":14808,"ταξ":1725,"ταρ":960,"τας":2802,"τασ":3623,"τατ":1252,"ταθ":1117,"ται":21198,"ταμ":1209,"ταλ":2606,"ταγ":1554,"τής":3962,"τό ":2629,"τήρ":1568,"τήμ":1493,"σύμ":1096,"σύν":1327,"τίσ":697,"τίτ":882,"τίο":1122,"τίν":835,"τία":2320,"τεί":2724,"τελ":4578,"τεμ":808,"τει":1045,"τες":1997,"τερ":11689,"τεχ":1125,"τεύ":1209,"τια":1887,"τιο":1754,"τιν":1951,"τισ":2348,"τις":8696,"τιμ":802,"τικ":22901,"της":36695,"τηρ":3076,"την":27288,"τημ":2854,"τηλ":655,"τηκ":1948,"τηγ":801,"τησ":1537,"τητ":3914,"τμή":1027,"τλο":835,"τρω":752,"τρό":832,"τρο":5304,"τρι":3975,"τρε":1070,"τρα":4343,"τρί":1148,"τού":2364,"του":50266,"τομ":1711,"τολ":2689,"τον":13580,"τοπ":1407,"τος":5084,"τορ":3232,"τογ":1175,"τοι":2194,"τοκ":1571,"τοί":1186,"υν ":3485,"τυπ":686,"υς 
":13808,"τσι":825,"των":12537,"νή":7879,"νέ":3576,"μό":8452,"μώ":1755,"νί":9927,"να":62303,"νδ":5696,"νγ":910,"νε":15475,"νθ":2946,"νη":14939,"νι":23024,"μμ":4065,"μν":1567,"μο":26826,"μπ":8431,"μυ":2965,"μφ":2313,"μω":1439,"νά":5541,"ξα":1561,"νώ":4161,"ξε":3656,"ξι":1797,"ξη":2494,"νο":34560,"νν":4891,"νσ":1980,"ντ":28913,"νυ":1752,"ξά":881,"νό":10195,"νω":9532,"οδ":7658,"ού ":14320,"ογ":13258,"οε":2833,"οί":10846,"οβ":2978,"οα":1081,"ομ":21099,"ολ":23701,"οξ":882,"ον":36237,"οθ":2077,"οκ":6497,"οι":21065,"οία":2635,"ξο":1094,"οίκ":1187,"οέ":928,"ξύ":1306,"οίο":3618,"πη":2710,"πε":16079,"πα":18678,"πο":60271,"πν":700,"πλ":7083,"πι":10919,"ου":129022,"οφ":4209,"οσ":11327,"οτ":9541,"ορ":21585,"ος":37589,"οπ":16190,"πή":2566,"ού":30889,"πί":4948,"πά":3903,"πέ":3142,"οχ":4794,"ια":39925,"ιβ":2565,"ιγ":2579,"ιδ":8147,"ιε":6521,"ιζ":1197,"ιη":883,"ιθ":2601,"θω":1118,"ιά":10102,"θό":672,"ιέ":2080,"ιή":692,"θώ":1191,"θρ":2531,"θυ":2592,"θι":1852,"θλ":2017,"θν":3266,"θμ":2061,"θο":6015,"κδ":1079,"κε":22387,"κι":6089,"κη":5418,"κά":15567,"ιό":6367,"κέ":4956,"ιω":2086,"κα":65846,"κή":24785,"κί":3700,"ιώ":5059,"ιτ":9070,"ισ":25486,"ις":13132,"ιρ":6633,"ιχ":2933,"ιφ":1609,"ιμ":6598,"ιλ":8668,"ικ":79288,"ιπ":2073,"ιο":29123,"ιν":16499,"λη":19148,"λι":20827,"λκ":1174,"λγ":684,"λε":18373,"λί":8413,"κώ":4252,"λα":15386,"λβ":741,"λά":7390,"λέ":5224,"κό":26772,"κύ":1477,"λή":4583,"κω":1706,"κυ":3709,"κτ":10513,"κο":23779,"κρ":10237,"κπ":708,"κλ":5937,"κκ":1326,"μι":14565,"μη":6074,"με":36995,"μβ":5397,"μα":35176,"λύ":3607,"μή":3752,"μί":4420,"λώ":2414,"μά":7858,"λό":3924,"μέ":15352,"λω":3053,"λφ":1355,"λυ":3470,"λτ":890,"λπ":663,"λο":25987,"λμ":1804,"λλ":20970,"δη":5995,"δε":9279,"δι":23594,"δο":11445,"δρ":8223,"δυ":2966,"εά":775,"δω":1451,"εί":44740,"δώ":1804,"δύ":1829,"δό":1968,"εδ":3726,"εγ":5939,"εβ":1575,"εα":1003,"εζ":688,"εθ":3485,"ει":35123,"εκ":9726,"ελ":15989,"εμ":6248,"εν":19361,"εξ":4958,"εο":2282,"επ":13811,"ερ":40179,"ες":14762,"εσ":7584,"ετ":27314,"εφ":1836,"ευ":8533,"εχ":2179,"εω":4500,"εό":1241,"εώ":702,"ζί":1027,"εύ":4761,"ζα":1710,"ος ":37511,"ζε":6065,"ζι":1237,"ζη":738,"ζο":2939,"ζό":1259,"ζω":1103,"ηγ":3510,"ου ":87770,"ηλ":4366,"ημ":19012,"ην":37308,"ηθ":2646,"ηκ":9969,"ησ":11936,"ητ":11258,"ηρ":6263,"ης":54552,"θέ":3061,"θά":1066,"θή":2250,"ηχ":968,"θε":10322,"θη":10944,"θα":2622,"αί":10379,"ώ ":1686,"ακ":15045,"αι":93478,"αμ":10092,"αλ":21090,"αξ":3125,"αν":58537,"απ":33497,"αο":903,"αβ":3590,"αδ":5509,"αγ":12802,"αζ":1934,"ον ":14807,"αε":1433,"αθ":7116,"αϊ":2364,"βά":3473,"βέ":1564,"αύ":1058,"βί":1262,"βα":6377,"αρ":29289,"ας":38338,"ασ":22621,"ατ":37260,"αυ":7302,"αφ":9354,"αχ":2612,"βρ":7265,"βο":4803,"βλ":1821,"βι":3507,"βη":749,"βε":2725,"γα":7366,"γί":5669,"γέ":2710,"βό":1236,"γή":2815,"γά":3661,"γο":9693,"γρ":8625,"γμ":2369,"γλ":3739,"γν":4085,"γη":1772,"γκ":7566,"γι":14677,"γγ":5019,"γε":9939,"δα":7905,"δά":972,"γό":3291,"δέ":1507,"γύ":735,"δή":3929,"γώ":1054,"δί":4576,"γχ":1165,"γω":3973,"γυ":1102,"ομώ":1007,"ονί":1750,"έτ":5024,"έσ":4850,"ονα":1353,"ές":9695,"έρ":12949,"έπ":1245,"έο":998,"έξ":1325,"έν":21823,"ονδ":706,"ονι":2179,"έω":1309,"έχ":6163,"έφ":948,"έγ":3052,"έδ":1961,"έα":3333,"ομο":3665,"έλ":6166,"έμ":2360,"έι":841,"έκ":2418,"έθ":1065,"ομό":796,"άρ":7787,"άπ":2212,"άσ":7087,"άς":2871,"άν":11519,"άμ":3476,"άο":721,"άξ":842,"άτ":7688,"άχ":1258,"άφ":3485,"άβ":865,"άγ":2736,"ονο":4835,"οντ":6287,"άθ":2400,"άι":984,"άκ":2900,"άλ":9559,"ονό":1460,"άδ":6624,"άζ":2851,"ίς":1393,"ύ 
":16375,"ίρ":2411,"ίπ":1897,"ίο":20214,"ίτ":8024,"ίσ":10180,"ίω":4466,"ίχ":2356,"αΐ":1011,"ία":42781,"ίβ":669,"ίε":2902,"ίζ":4719,"ίγ":1307,"ίδ":6660,"ίκ":4798,"ίθ":1008,"ίν":31172,"ίλ":2453,"ίμ":2180,"ήρ":4528,"ήσ":3271,"ής":18133,"ό ":46975,"ήτ":11312,"οπο":11630,"ήθ":4033,"ήκ":2438,"ήλ":1001,"ήμ":8522,"ήν":2806,"οιε":1122,"οιη":754,"οια":988,"φ ":702,"οιό":643,"οκα":975,"οιχ":1283,"οιν":2467,"οικ":4166,"οιο":1378,"ολι":5666,"ολλ":1265,"ολε":1313,"ολη":1127,"ολέ":884,"ω ":2903,"ολή":1110,"ολί":1066,"οκρ":2314,"ομι":2076,"ομη":811,"ομα":4494,"ολό":1136,"ομέ":1196,"ολύ":921,"ομά":4054,"ολυ":875,"ολο":6719,"οδι":937,"οδο":3299,"Υπ":1010,"οει":1399,"Συ":1600,"Στ":5008,"Τα":1846,"Σύ":1280,"Τζ":1148,"Τσ":726,"ς ":240346,"Τρ":1616,"Το":8405,"Χα":1289,"υ ":88180,"Χρ":999,"Φε":912,"Φι":704,"Φρ":703,"οθε":1042,"τ ":3704,"Πό":722,"Πρ":2848,"ξ ":655,"Ρο":774,"Πά":1152,"Ου":930,"ν ":112419,"Ορ":745,"Πο":2485,"Πε":2134,"Πα":4994,"πα ":685,"οβο":653,"Σα":1339,"π ":1661,"Σο":1132,"ογί":3068,"ρ ":3667,"ογέ":1210,"Σε":1940,"Σι":796,"Σκ":862,"ογρ":3223,"Ρω":976,"ογο":1295,"ογι":2141,"ο ":95107,"νός":3015,"νότ":2168,"Νό":654,"νόμ":1784,"λ ":3759,"Ντ":1194,"Νο":1706,"Ολ":1010,"Οκ":750,"Οι":2398,"μ ":2390,"Με":3726,"Μι":1150,"Μέ":678,"Μά":1161,"Μα":4471,"Λο":1279,"ι ":112286,"Νι":687,"νωσ":4336,"Νέ":832,"νων":3168,"Να":926,"Μο":1624,"κ ":2925,"Μπ":2700,"Κα":4969,"Ιω":836,"Κά":943,"Κι":702,"Κε":764,"Ιο":1485,"η ":59406,"Ιτ":907,"Ισ":1161,"Λα":1230,"Κό":910,"Κύ":1401,"Λι":661,"Λε":948,"Κρ":1403,"Κο":2565,"Κω":831,"Κυ":1027,"Θε":1962,"Ηλ":946,"ε ":49298,"Ηρ":654,"νυμ":1332,"Ια":1067,"ντί":2029,"ντα":8117,"Ερ":804,"Επ":1444,"ντε":1807,"Ελ":3849,"Εκ":838,"ντά":1383,"Εθ":920,"ντο":2386,"ντρ":2342,"ντι":5956,"Ευ":1907,"νστ":941,"α ":122436,"Γο":754,"Γε":2955,"Γι":1598,"Γκ":1275,"Δή":1392,"Δε":1218,"Δη":1620,"Δι":2406,"Εί":1505,"Αθ":2048,"νος":3551,"νου":4641,"νοτ":857,"Αγ":2492,"Απ":2382,"νού":1967,"Αν":3971,"Αμ":1723,"Αλ":1805,"Ακ":644,"Αι":969,"Αυ":2140,"Ασ":1614,"Αρ":2646,"ί ":9847,"Βα":2099,"Βι":900,"Βε":785,"Βο":1596,"Βρ":2238,"νοι":1353,"νομ":10454,"νον":1006,"Γα":1202,"νολ":1206,"νορ":724,"έ ":738,"ννή":1417,"ννη":1767,"ή ":39734,"ννο":742,"ά ":28070,"οι ":6175,"Χ ":729,"Ήτ":719,"Έν":837,"Έλ":1540,"Έχ":649,"Άγ":672,"Ο ":12037,"οί ":1669,"νών":1557,"ξύ ":1208,"πόλ":3996,"ρές":813,"ράκ":669,"ράγ":691,"ράσ":941,"ράτ":1685,"ράφ":2049,"ράς":805,"ράμ":655,"ράν":647,"ρίζ":2399,"ρίν":739,"ρία":4134,"ρίδ":772,"ρίω":1265,"ρίπ":1019,"ρίο":5068,"ρίτ":724,"ρίσ":4287,"ρό ":1004,"ρήσ":959,"ραβ":1005,"ραγ":2738,"ραμ":2193,"ραν":1181,"ρακ":3111,"ραφ":4046,"ρατ":4809,"ρασ":2311,"ρας":2953,"ργε":920,"ργι":803,"ργο":1794,"ργό":1216,"ργά":995,"ργί":829,"ργα":2610,"πτι":832,"πτε":965,"πων":724,"πως":2026,"ρμο":1033,"ρνη":961,"ρντ":654,"ρξε":1053,"ροέ":803,"ροκ":927,"ρολ":823,"ρομ":1223,"ρον":3303,"ροβ":944,"ρογ":664,"ροδ":662,"ροε":1475,"ρού":2500,"ροφ":1412,"ρου":6209,"ροτ":807,"ροσ":3339,"ρος":5922,"ροπ":1689,"ρτί":865,"ρτη":992,"ρτο":808,"ρεί":2767,"ρεύ":824,"ρετ":2773,"ρευ":659,"ρεσ":889,"ρες":2082,"ρει":4053,"ρησ":2941,"ρης":1045,"ριά":922,"ριγ":689,"ρια":3527,"ριό":2111,"ρκε":1056,"ριθ":994,"ρικ":7396,"ριλ":1434,"ριμ":904,"ριν":2336,"ριο":5568,"ρισ":7036,"ριφ":871,"ρμα":2357,"ρά ":3405,"πάν":921,"πάρ":1052,"παλ":1326,"παι":1177,"παν":2463,"παγ":893,"παϊ":680,"πατ":824,"παρ":8016,"παί":655,"πίσ":2277,"ούλ":1020,"ούμ":1108,"ούν":3424,"ούσ":2748,"ούς":3386,"πό 
":21634,"πήρ":1722,"ούρ":1307,"οτε":3981,"οτι":1757,"οστ":1538,"οσφ":2182,"οσω":886,"οτέ":721,"οτα":918,"οση":738,"οσι":846,"οσπ":775,"ορφ":1231,"ορυ":783,"ορτ":877,"ορι":2946,"ορε":3059,"οργ":1807,"ορο":1078,"οπτ":643,"πο ":1799,"ορα":1218,"ορί":3662,"ορέ":747,"οπό":828,"ορά":1572,"οχή":2609,"οφο":1172,"ουσ":4557,"ους":13764,"ουρ":5299,"ουν":4982,"ουμ":1209,"ουλ":3147,"ουδ":1420,"ουα":1700,"ουά":882,"όγ":2701,"όε":736,"όδ":1693,"όλ":8233,"όκ":1654,"όμ":7862,"όν":10515,"ός":18197,"όσ":5414,"όπ":5405,"όρ":8376,"όφ":825,"όχ":1299,"ότ":13665,"ρι ":2104,"πλα":1198,"ϊκ":1906,"πλε":822,"πλη":1610,"πλο":1278,"ωθ":777,"ωβ":791,"ωδ":719,"ωγ":2280,"ων":32748,"ωπ":2336,"ωμ":4528,"ωρ":6506,"ως":10668,"ωσ":8039,"ωτ":6473,"ρο ":6138,"προ":8229,"πρι":1380,"πρα":1142,"πρώ":2895,"πρό":2160,"πρω":2758,"ώμ":1777,"ών":20079,"ώθ":1069,"ώη":641,"ώδ":730,"ποκ":999,"ποι":5038,"πολ":7906,"πον":1356,"ποί":6780,"ώτ":3447,"ώσ":2701,"ώς":2026,"ώρ":3768,"ποδ":2435,"πογ":674,"ώπ":1387,"ύκ":1365,"ύλ":2704,"ποχ":850,"ύμ":4174,"πού":867,"ύε":938,"ύθ":1852,"πορ":2148,"πος":1087,"ύγ":1143,"ύδ":738,"που":20607,"ποσ":1036,"ποτ":4745,"ύφ":886,"ύχ":810,"ύς":3629,"ύρ":4445,"ύτ":4068,"ύσ":5147,"ύν":5639,"ύπ":2849,"ύο":2732,"τε":30296,"τζ":1331,"τη":93724,"τι":46176,"τλ":1268,"τμ":1495,"πεδ":722,"το":124490,"τρ":19405,"τσ":2719,"στ":76523,"σσ":6194,"σφ":3318,"συ":12181,"σχ":3635,"σω":3204,"τέ":8491,"σό":1919,"τά":12970,"σώ":1166,"τί":8764,"σύ":4398,"τή":11346,"τα":73796,"σε":22382,"σι":15384,"σκ":10283,"ση":23543,"σθ":1011,"σμ":12903,"σο":6917,"σπ":3490,"πει":1514,"πελ":849,"σή":1903,"σέ":940,"περ":10195,"σά":1139,"σα":9280,"σί":8405,"ρα ":9620,"ρφ":1653,"ρυ":3661,"ρχ":9524,"ρρ":1307,"ρτ":5071,"ρσ":1303,"ρώ":6736,"ρύ":1952,"ρω":9925,"ρό":9460,"ρη":8917,"ρθ":1230,"ρε":17142,"ργ":11016,"ρδ":1272,"ρα":38879,"ρβ":958,"ρο":41731,"ρν":3338,"ρξ":1376,"ρλ":780,"ρμ":5151,"ρι":40112,"ρκ":3719,"πυ":748,"πτ":4561,"πρ":20097,"ππ":796,"ρί":22933,"ρή":2864,"ρέ":3880,"πό":28476,"ρά":13738,"πω":3057,"πια":723,"ρη ":2996,"ψε":1003,"ψη":1336,"χω":3038,"χό":665,"χώ":2869,"χρ":6885,"χο":6422,"χτ":889,"χι":3395,"χθ":1552,"χη":2825,"χν":2772,"χα":7239,"χί":2595,"χε":9910,"πισ":2595,"πιτ":871,"φω":2505,"πιο":1239,"χέ":1462,"φό":1995,"χή":3366,"φύ":644,"πικ":1602,"χά":891,"φυ":2356,"φο":8165,"φρ":1732,"φη":1268,"φι":3786,"φθ":1094,"φε":2326,"φή":2402,"φί":1583,"ρε ":822,"φα":6317,"φά":1435,"φέ":4345,"υφ":1198,"υχ":1680,"υψ":1025,"υτ":12113,"υσ":11499,"υς":13842,"υρ":12723,"υπ":8507,"υν":14929,"υμ":7534,"υλ":5896,"υκ":2985,"υθ":4332,"υζ":915,"υγ":5531,"υδ":2390,"υα":2666,"πηρ":828,"υβ":1419,"τώ":3229,"τό":9167,"τύ":1372,"υά":1396,"τω":14910,"ττ":1165,"τυ":2752,"μης":911,"μηχ":778,"μητ":983,"νε ":1648,"μιο":2230,"μισ":919,"νη ":4892,"μια":4477,"μικ":5134,"να ":17430,"μβρ":2160,"μβο":1003,"μερ":4521,"μεσ":1074,"μετ":6749,"μει":655,"μελ":883,"μεν":3875,"μεγ":2940,"μεί":1099,"λώσ":1078,"νή ":1793,"λών":940,"μό ":3080,"λύτ":1526,"μία":2762,"μήμ":1075,"μαχ":648,"μβά":1095,"μαζ":923,"μαν":3924,"ματ":10484,"μασ":1431,"μαί":1647,"λων":1650,"μέτ":1129,"μέσ":1359,"νά ":992,"μέρ":1935,"μέχ":973,"μέλ":884,"μέν":7654,"μέγ":746,"λόγ":1758,"μάτ":1753,"μάδ":1518,"μάζ":1519,"λυμ":1032,"λυτ":817,"λος":3446,"λον":1147,"λογ":7404,"λοί":713,"μο ":3714,"λου":3295,"λού":2344,"λικ":6554,"λια":1551,"λιά":2019,"μη ":2038,"λιό":666,"λιο":1726,"λιτ":3281,"λισ":1155,"λλο":3357,"λλι":1915,"λλε":843,"λλη":7119,"λλα":1712,"λλά":3329,"λλί":726,"νισ":2798,"νιο":1219,"ξη ":1696,"νθρ":1131,"νικ":14169,"νια":1586,"νημ":1520,"νης":3311,"νησ":3074,"νητ":1569,"ξε 
":1387,"νει":4020,"νετ":1530,"νεπ":771,"νερ":896,"νες":2492,"νδι":887,"νδρ":1754,"νεί":692,"νδί":1002,"νγκ":862,"ναφ":1731,"νατ":2689,"ναυ":689,"νασ":1113,"νας":6220,"ναν":1504,"νακ":1377,"ναι":23633,"ναγ":847,"νώ ":1108,"νίδ":909,"μών":1634,"νίκ":973,"νίο":894,"νής":1328,"νό ":1732,"νήσ":732,"νία":4805,"νήθ":2044,"νήκ":1169,"μός":2661,"μόν":1144,"νάμ":808,"μων":716,"νω ":758,"μυθ":2305,"ντ ":1585,"μφω":1019,"μον":2292,"μοπ":1626,"μορ":1296,"μος":2596,"μοι":670,"μοκ":1030,"μπο":2312,"νο ":5296,"μπε":1085,"μπι":1180,"μού":4831,"μπα":642,"μου":4833,"μοτ":850,"μοσ":1084,"νι ":660,"μμα":1876,"λή ":1370,"ιών":3452,"κίν":1235,"κία":1185,"κής":8061,"κό ":13722,"καρ":1054,"κας":811,"κασ":794,"κατ":10049,"καθ":2426,"και":39436,"καλ":3288,"καν":3494,"λα ":2059," Ma":1020," O ":728,"κεν":791,"κετ":3212,"κευ":1338,"κεί":1744,"κει":2368,"κεκ":645,"ιφέ":650,"ιχε":1339," Co":711,"ιωτ":1177,"κέν":650,"ιός":767,"λά ":1854,"κές":3367,"ιότ":2001,"κάθ":781,"κάν":679,"κάπ":806,"κάτ":989,"ινο":2651,"ινη":1172,"ινε":1173,"ινι":1101,"ινή":1455,"ινί":997,"ινα":787,"ινό":2461,"ινω":851,"ιου":2729,"ιος":4127,"ιορ":1283,"ιον":1013,"ιολ":889,"ιοι":954,"ιοδ":1147,"ιογ":776,"κο ":960,"ιού":2201,"ιοχ":1758,"ιρι":1611,"ιρά":1156,"ιρα":723,"ιρε":766,"ισμ":7142,"ιστ":11061,"ισσ":1380,"ιση":1365,"ιτο":998,"ιτα":1283,"ιτι":2704,"ιτε":1108,"ιδρ":1035,"ιεθ":1171,"ιεί":1032,"ιδώ":1125,"ιες":926,"ιερ":1150,"κε ":8838,"ιθμ":885,"κη ":1482," To":646,"ικά":10371,"ικί":910,"ική":22548,"ικέ":3180,"ικα":3219,"ικη":1192,"ικε":1161,"ικι":1228,"ικο":9392,"κι ":739,"ικρ":1864,"ικώ":3521,"ιλί":1034,"ιλα":936,"ικό":19394,"ιλι":2810,"ιλε":728,"ιλο":1028,"ιμέ":1283,"ιμο":2191,"μα ":12255,"λαμ":1653,"λαν":1780,"λαι":1164,"λασ":1322,"λατ":899,"λαδ":919,"λαγ":813,"λαβ":1022,"λαί":667,"λητ":1184,"λην":6429,"λημ":897,"ληρ":1062,"λησ":1042,"λης":1640,"ληθ":1186,"με ":14402,"λεσ":1056,"λες":910,"λευ":2243,"λεκ":1059,"λεμ":1535,"λει":2415,"λεγ":742,"λεί":3987,"κων":656,"λίο":1804,"κών":3899,"μή ":801,"λής":941,"λία":3193,"κόρ":832,"λέξ":905,"λέμ":640,"κόμ":1163,"κόλ":651,"λέγ":681,"κότ":1771,"μά ":961,"λές":766,"κός":6008,"κόσ":1288,"λάδ":2237,"κρι":1532,"κρο":1257,"κρά":1666,"κρα":2747,"κρό":1004,"κολ":1260,"κοι":2432,"κοπ":1299,"κον":2063,"κοσ":762,"κος":1289,"κορ":683,"κογ":1034,"λο ":3553,"κου":2843,"κού":6899,"κυρ":1539,"κυβ":687,"κτω":740,"κτο":1065,"κτρ":885,"κτη":2180,"κτι":1278,"κτή":823,"κτα":758,"λη ":4716,"κιν":1980,"κισ":1099,"κκλ":769,"κην":749,"κητ":870,"κης":1489,"κλο":1123,"κλε":952,"κλη":1528," th":948,"ηχα":829,"ηρο":1562,"ηρι":1012,"ηρε":814,"ηρί":1339,"ησε":2338,"ηση":3739,"ησι":2711,"ησί":1393,"ητο":760,"ητε":799,"ητι":2642,"ητή":1996,"ητέ":684,"ητα":2938,"θήν":1152,"θήκ":739,"θαν":924,"ιά ":3103,"θέτ":764,"θέσ":1185,"ηθο":652,"θε ":885,"ηγό":671,"ηγο":953,"ηνα":1733,"ηνι":4662,"ηνο":892,"ηκα":1019,"ηκε":8651,"ηθυ":973,"ημα":7616,"ημέ":1558,"ημο":3252,"ημι":2310,"ημε":2120,"ηλα":1079,"ηλε":1560,"θυσ":990,"ις ":13118,"κα ":1888,"ιβλ":841,"ιδι":2412,"ιδή":920,"ιαδ":1030,"ιαί":809,"ιατ":1212,"ιαφ":946,"ιας":3279,"ιασ":1512,"ιαμ":1313,"ιαν":1844,"ιακ":4564,"ιό ":1664,"θώς":900,"κή ":16317,"ιάδ":1072,"ιάς":1340,"ιάρ":876,"ιάν":812,"ιάσ":850,"κά ":10264,"θηκ":7617,"θην":731,"θετ":1279,"θεσ":1224,"θερ":1269,"θεω":1121,"θεί":1506,"ια ":20899,"ιο ":9017,"θος":942,"θολ":2157,"ιν 
":1082,"θνή":736,"θμό":1010,"θνι":1551,"θλη":1496,"ενε":1043,"ενη":691,"ενα":1016,"εντ":2485,"ενν":2479,"ενο":2214,"ενι":1650,"εξα":953,"ενώ":1366,"ενό":1445,"εξά":752,"επι":5002,"επα":2136,"επί":3082,"επτ":1207,"επο":863,"ερα":4072,"εργ":1600,"ερά":780,"ερί":3339,"ερι":10543,"ερε":1426,"ερη":1880,"ερν":709,"ερμ":2853,"ερο":6623,"ερό":795,"εσί":785,"εσα":886,"εση":1176,"εσσ":1054,"εστ":833,"ετά":1901,"ετέ":733,"ετα":16396,"ετί":895,"ετε":731,"ετι":1741,"ετρ":2017,"ετο":896,"ευτ":1808,"ευσ":1149,"ευρ":1328,"ευκ":741,"ευθ":751,"εφα":751,"εχν":1145,"εωρ":1834,"εων":1309,"εύε":718,"είχ":1649,"είτ":3867,"είς":773,"είο":3068,"είν":22546,"δών":1462,"είμ":673,"δύο":1317,"είδ":966,"εία":3700,"εβρ":814,"εγά":1944,"εγα":1515,"εγκ":737,"εδρ":932,"εδο":1081,"εθο":675,"εθν":2241,"εκε":1191,"εκρ":827,"εκτ":2188,"ειτ":1270,"εκα":1164,"εκδ":780,"ειδ":2526,"ειο":1827,"ειν":862,"εισ":862,"εις":3437,"ειρ":2701,"εια":5347,"εμι":793,"εμπ":762,"εμο":808,"ελφ":670,"εμβ":1975,"ελλ":4425,"ελι":1033,"ελο":1356,"ελέ":801,"ελε":3847,"ζον":1351,"ζου":798,"ης ":54494,"εύο":1107,"εύτ":1065,"ζετ":3914,"ζει":1250,"ην ":28066,"δη ":882,"γεί":984,"γεθ":651,"γει":1005,"γεν":3283,"γερ":916,"γετ":873,"δα ":3858,"γγλ":2042,"γγε":798,"γγρ":1089,"γαλ":2463,"γαν":1627,"γασ":694,"γία":4119,"γής":750,"δή ":1030,"γάλ":2004,"γάν":752,"γέν":1475,"βόρ":789,"γωγ":1517,"γρα":5451,"δο ":1489,"γρά":2441,"γού":1541,"γον":1405,"γος":1565,"γορ":893,"γου":2071,"γνω":3224,"γμα":1795,"γλώ":959,"γλι":810,"γκό":699,"γκο":1093,"γκρ":957,"γκε":929,"γκα":1052,"γισ":983,"γιο":2940,"γιν":873,"γικ":1875,"για":6740,"δηλ":1052,"δημ":2586,"δης":760,"δια":7984,"διά":2910,"διο":3225,"δικ":4263,"διε":2036,"ει ":13053,"δεν":1174,"δεκ":905,"δει":815,"δελ":851,"δες":1552,"δεύ":817,"δία":1566,"δήμ":1730,"γών":991,"εί ":5299,"δας":1990,"γων":2003,"γός":1387,"δων":753,"ες ":14729,"δυν":920,"δυτ":1257,"δου":1522,"δοσ":2455,"δος":1488,"δον":679,"δομ":954,"ερ ":1474,"δρύ":879,"δρυ":713,"δρο":2284,"δρα":2109,"εν ":2162,"αίν":1135,"αία":2533,"αίτ":698,"αίρ":1304,"αίο":2174,"αγι":742,"αγγ":1827,"αγο":1254,"αγκ":1429,"αγμ":1117,"αγν":698,"αδι":1357,"αδε":988,"αγω":2920,"αδή":866,"αβε":1016,"ας ":38230,"αρί":2489,"από":22187,"αρά":1986,"απο":6560,"απα":958,"απε":757,"αξύ":1143,"αξι":781,"ανώ":968,"ανό":2218,"αντ":8056,"ανο":2939,"ανθ":906,"ανι":4409,"ανδ":1912,"ανε":2278,"ανα":6682,"ανά":2554,"ανέ":714,"ανή":1615,"ανί":2957,"ατό":1156,"ατε":1300,"ατι":5266,"ατη":1870,"ατρ":1884,"ατο":9284,"ασσ":883,"αστ":6688,"ατά":4672,"ατέ":1441,"ατί":1716,"ατα":6390,"αση":2971,"ασι":3609,"ασκ":1313,"ασμ":1014,"ασί":3461,"αρτ":1786,"αρχ":6414,"αρα":6192,"αρμ":849,"αρο":1166,"αρι":2077,"αρκ":848,"αθο":890,"αθη":1010,"αθμ":688,"αθλ":1208,"αζί":676,"αερ":700,"αλύ":2369,"αμέ":982,"αμβ":1197,"αμα":698,"αμε":1031,"αμμ":1433,"αμο":749,"αμπ":849,"ακό":2346,"αλά":833,"αλα":2736,"αλί":1668,"αλε":761,"αλι":1989,"αλλ":4951,"αλο":1928,"ακή":1031,"αιό":1091,"αιώ":1483,"ακά":695,"ακε":724,"ακα":1680,"ακο":1635,"ακρ":1212,"ακτ":3182,"αθώ":989,"αιδ":651,"αιο":1032,"αιν":1733,"αιρ":2790,"γή ":1222,"βαν":794,"βασ":2971,"γα ":679,"αυρ":642,"αυτ":5333,"αφί":785,"αφή":1070,"αφέ":2568,"αφε":651,"αφι":1185,"αφο":1697,"αϊκ":1749,"βάν":1252,"βάλ":748,"βέρ":1034,"γο ":1180,"βου":1328,"βολ":1321,"βρο":795,"βρί":4251,"βρα":834,"βιβ":715,"βικ":723,"βιο":702,"γκ ":753,"ήτη":852,"ήτα":9260,"ήσε":1365,"ήσο":649,"ίς ":1391,"ήρξ":1088,"ήρι":880,"ήρε":756,"ίο 
":4669,"ίες":2648,"ίζο":1090,"ίζε":3305,"ίδο":723,"ίδε":1013,"ίδη":788,"ίδι":1386,"ίδα":1853,"ίας":15390,"ίοδ":915,"ίος":1746,"ίου":12007,"ίπο":938,"ίνε":1855,"ίνα":24006,"ίνη":1713,"ίνο":1856,"ίλι":696,"ίκη":1461,"ίκο":1574,"ίχε":1140,"ίρι":793,"ίτλ":825,"ίτε":1304,"ίτη":995,"ίστ":2304,"ίτα":3434,"ίσκ":3610,"ίση":1856,"ίων":2775,"ίως":1118,"αι ":81620,"αν ":18884,"άνο":1282,"άνν":919,"άνι":1040,"άνε":1521,"άνδ":657,"άνα":777,"άνω":1396,"άντ":1482,"άμε":1147,"άλι":654,"άλη":999,"άλλ":3199,"άλα":1032,"άλο":1498,"άσε":1019,"άρι":1410,"άρκ":849,"άρτ":1022,"άρχ":1270,"άπο":1033,"άφο":2166,"άτω":1906,"άτι":758,"άτο":2114,"άτα":711,"άτη":871,"ές ":9688,"άστ":2257,"άση":1198,"έγε":1024,"έγι":781,"έδρ":1043,"έας":1468,"έλο":1016,"έλα":1017,"έλε":1428,"έλη":777,"έκτ":1194,"ένα":9814,"ένη":1988,"ένε":1716,"ένο":3647,"ένν":695,"έντ":1717,"ένω":1354,"έπε":703,"έρα":2072,"έργ":1441,"έρο":1656,"έρν":854,"έρε":2865,"έρι":1085,"έρχ":675,"έτο":668,"έτρ":1180,"έτη":1275,"έστ":768,"ής ":18113,"έση":981,"έχε":3074,"έχο":1069,"έχρ":1025,"έως":769,"ία ":26814,"ήθη":3124,"ήκε":1241,"ήμο":3064,"ήμε":1045,"ήμα":3234,"ήνα":1396,"άζε":1809,"άζο":714,"άδα":3349,"άδε":985,"άδο":1280,"άκη":718,"άθε":1012,"έα ":1626,"άς ":2864,"άν ":930,"Του":646," θρ":674," ιδ":2116," ηλ":1023," ημ":735," θέ":1277,"Τα ":1230," θά":727," θα":781," θε":3732," κρ":1753," κο":3790," κλ":1289," κυ":2799," λα":1212," κύ":1020," κό":2579," λέ":1007," λε":1503," ισ":2453," κα":52893," κέ":693," κά":2601," κι":1426," κε":1166," μυ":2558,"Σεπ":721," μο":3409," μπ":1646," νη":980," νε":1091," μό":948," να":5646," λο":894," με":23063," μη":1415," μι":5537," μά":792," λό":896," μέ":5756," μί":2039," μα":2448," ον":2891," ολ":648," ομ":3122," οι":5274," νό":1365," νο":3391," ξε":646," ίδ":940," γα":745," βό":838," βι":1203," βο":1444," βρ":3003," αφ":1044," αυ":4681," ατ":926," ασ":3167," αρ":6507," βα":3334," βά":792," αθ":840," αε":736," αγ":3185," αδ":879," απ":30213," αν":12857," αλ":2584," αμ":1021," αι":1819," ακ":2186," δύ":1562," εί":24110," εγ":909," δυ":1764," δο":1194," δρ":920," δε":3712," δη":3127," δι":13223," δή":1731," δί":798," γυ":721," γν":3012," γλ":1409," γρ":1532," γε":4322," γι":8013," ζω":871," ευ":1283," εφ":760," ετ":975," ερ":1138," επ":11178," εξ":2635," εν":5151," εμ":1144," ελ":4400," εκ":4866," ει":1529," εθ":1094," Χα":1287," Χρ":997,"Συν":780,"Στη":2306," άλ":1949," άν":742," έκ":1202," έλ":791," έδ":1173," έγ":922," έχ":2858," έν":9401," έρ":1085," έτ":734," ήτ":9263," Ολ":1008," Οκ":750," Οι":2387," Πε":2131," Πα":4958," Πο":2479," Ου":928," Ορ":742," Πά":1151," Ρο":774," Πρ":2843," Πό":722," ο ":9144," Ρω":975," Σο":1128," Σι":796," Σκ":861," Σε":1934," Σα":1339," π ":930," Τρ":1615," Τσ":726," Το":8380," Τζ":1148," Τα":1842," Σύ":1275," Στ":4987," Συ":1596," Υπ":1004,"Το ":7170," Φρ":703," Φι":702," Φε":912," Ισ":1154," Ιτ":907," η ":9904," Ιο":1482," Κι":702," Κε":764," Κα":4954," Κά":941," Ιω":835," Ια":1066," Ηρ":653," Ηλ":946," Θε":1961," Ντ":1192," Νο":1695," Νό":654," Μπ":2698," Μο":1622," Να":926," Νέ":832," Νι":686," Λο":1279," Μά":1156," Μέ":676," Μα":4467," Με":3720," Μι":1148," Κο":2560," Κρ":1401," Κυ":1025," Κω":831," Κό":910," Κύ":1399," Λα":1229," Λε":940," Λι":661," Βα":2096," Αυ":2137," Αρ":2641," Ασ":1614," Αν":3964," Απ":2366," Αι":968," Ακ":643," Αλ":1804," Αμ":1723," Αθ":2042," Αγ":2491," ή ":7834," Ευ":1907," Εκ":836," Ελ":3840," Εθ":914," Ερ":803," Επ":1439," Εί":1505," Δι":2396," Δη":1617," Δε":1217," Δή":1387," Γε":2953," Γκ":1274," Γι":1594," Γο":753," Γα":1202," Βε":785," Βι":896," 
Βρ":2238," Βο":1594," Έχ":649," Έλ":1540," Έν":836," Ήτ":719," Χ ":688," Άγ":671," Ο ":11895," Η ":10438," Β ":692," Α ":894,"Παρ":1018,"Παλ":664,"Παν":1277,"Πρω":677,"Πρό":666,"Προ":732,"Πολ":1465,"Περ":871,"Οκτ":727,"Οι ":2013," φω":763,"Μαρ":1413," χα":1979," χι":822," χρ":4665," υψ":758," φα":980," φι":921," φο":1544," φυ":1533," τω":10299," τό":929," υπ":5803," σχ":2338," συ":11696," στ":39224," τα":7256," σύ":3950," τί":845," τέ":1460," τι":4962," τη":55058," τε":2163," τρ":3378," το":79535," τμ":1047," σή":911," σα":762," σπ":865," σε":9337," ση":3591," σκ":2184," πό":3153," πρ":16846," πέ":917," πά":1081," ου":1649," ορ":3515," οπ":6947," πο":24345," πλ":4421," πι":1545," πε":9563," πα":9674," όρ":2611," όπ":2757," όν":3176," ότ":1627," όλ":952,"Μεσ":722," ως":4538," χώ":2083," χω":2280,"Βρί":1153,"Βασ":1048,"Αμε":923,"Ανα":1441,"Αντ":1044,"Αθή":1140,"Αυγ":651,"Αυτ":975,"Αστ":933,"Απο":822,"Αρχ":657,"Αγγ":839,"Αγί":679,"Δημ":1529,"Δεκ":668,"Δια":869,"Δήμ":1366,"Γερ":1165,"Γεν":1089,"Γαλ":1018,"Ελλ":2950,"Εθν":907,"Είν":1389,"Ευρ":1295,"Θεσ":739,"Ιαν":723,"Κων":707,"Με ":1185,"Ιτα":855,"Ιου":1221,"Καλ":850,"Καρ":959,"Κατ":1044,"al ":791,"and":770,"an ":997,"ati":806,"Έλλ":1427,"Ήτα":715,"ion":1164,"he ":670,"ia ":803,"igh":691,"ing":654,"ht ":652,"hum":677,"er ":1543,"es ":1146,"en ":662,"ght":666,"ng ":653,"on ":1503,"mb ":660,"us ":771,"umb":700,"tio":831,"thu":700,"ter":753},"n_words":[6375777,7261876,4927375],"name":"el"}
\ No newline at end of file
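Note: the language-profile files deleted in this patch (the German "de" and Greek "el" profiles above, the Spanish "es" profile below) all follow the same langdetect schema. Each file is a single JSON object in which "freq" maps 1- to 3-character n-grams (spaces included) to their occurrence counts in the training corpus, "n_words" holds three totals that appear to be the overall number of 1-, 2- and 3-grams seen, and "name" is the language code. The sketch below is a minimal, hypothetical illustration of how such a profile could be read and scored against a string; it is not the library's actual detector (langdetect runs repeated randomized naive-Bayes trials over n-gram probabilities), and the file paths, helper names, and smoothing constant are assumptions.

import json
import math

def load_profile(path):
    # One profile per language; path is assumed, e.g. "profiles/de".
    # Keys: "freq" (n-gram -> count), "n_words" ([total 1-grams,
    # total 2-grams, total 3-grams]), "name" (language code).
    with open(path, encoding="utf-8") as f:
        return json.load(f)

def score(text, profile, alpha=0.5):
    # Illustrative smoothed log-likelihood of the text's n-grams under
    # the profile; alpha is an additive-smoothing constant chosen for
    # this sketch, not a parameter taken from the library.
    freq, totals = profile["freq"], profile["n_words"]
    logp = 0.0
    for n in (1, 2, 3):
        for i in range(len(text) - n + 1):
            gram = text[i:i + n]
            logp += math.log((freq.get(gram, 0) + alpha) / (totals[n - 1] + alpha))
    return logp

# Usage sketch: the profile scoring the input highest wins.
# profiles = [load_profile(p) for p in ("profiles/de", "profiles/el", "profiles/es")]
# print(max(profiles, key=lambda p: score("ein kleines Haus", p))["name"])  # -> "de"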
+++ /dev/null
-{"freq":{"D":116547,"E":296654,"F":128129,"G":121338,"A":269964,"B":167407,"C":324676,"L":239740,"M":232523,"N":101570,"O":64162,"H":81475,"I":139475,"J":79180,"K":34991,"U":62793,"T":132122,"W":35590,"V":93781,"Q":9803,"P":223906,"S":275410,"R":140692,"Y":15953,"X":27052,"Z":15636,"f":602083,"g":868874,"d":3877179,"e":8874725,"b":812441,"c":2912236,"a":7916083,"n":5177793,"o":5444424,"l":3848407,"m":1698678,"j":201978,"k":118503,"h":478078,"i":4816050,"w":47097,"v":521828,"u":2624688,"t":3108332,"s":4177405,"r":4307485,"q":288923,"p":1539196,"z":257162,"y":639511,"x":118459,"²":9268,"Á":8848,"í":298098,"é":232623,"è":9587,"á":241717,"ú":87005,"ó":542725,"ñ":141698," l":925296," m":342591," n":169035," o":224265," h":153575," i":177440," j":57642," k":20882," d":2112466," e":1686012," f":346033," g":128207," a":615947," b":110799," c":841620," y":413167," z":8894," u":509709," t":263819," v":109427," q":191724," p":764411," s":537300," r":229895," J":77814," K":33034," H":77965," I":96403," N":91185," O":55452," L":232579," M":222195," B":158843," C":308303," A":248654," F":121663," G":115612," D":105557," E":280970," Z":14852," Y":15119," X":18791," S":256762," R":131715," Q":9140," P":211939," W":33301," V":79914," U":57665," T":122063," á":26258," é":11575," ú":11089," Á":8823,"A ":15731,"Da":13474,"Cu":13670,"Cl":11053,"Co":80802,"Cr":14045,"Ce":15444,"Ch":44811,"Ci":15164,"Ed":7168,"Du":7509,"Do":14187,"De":29603,"Di":25181,"Fe":19702,"Fa":9899,"Eu":12005,"Es":99956,"En":28786,"El":96353,"Ge":16269,"Ga":21659,"I ":25126,"Fu":23434,"Fr":30636,"Fo":10911,"Fi":10813,"C ":12178,"Au":18315,"Ar":39333,"At":7482,"As":13866,"D ":7805,"Ba":43405,"Ac":8941,"Am":14049,"An":29222,"Ai":9057,"Al":51515,"Bu":16123,"Br":22960,"Ca":88236,"E ":7609,"Bi":10052,"Be":24042,"Bo":25983,"Le":25199,"Li":24708,"La":111930,"Lu":14073,"Lo":44117,"Me":30598,"Mi":26023,"Ma":85855,"Mu":18246,"Mo":36242,"Ni":10392,"Ne":11654,"Na":26109,"Nu":10474,"No":24130,"Gi":8606,"Gr":21868,"Go":14614,"Gu":21771,"Ha":19115,"He":16228,"II":18780,"Hi":11870,"Ho":13033,"Hu":9391,"In":34662,"Is":11404,"Ja":18733,"Je":8830,"Jo":22070,"Ju":22045,"Ka":8249,"Un":40236,"VI":7056,"Tu":7329,"Tr":17446,"To":19269,"Th":18313,"Ti":12341,"Te":21087,"Ta":17232,"V ":8314,"St":15377,"Su":36610,"Wi":9415,"Wa":8415,"Vi":28361,"X ":8775,"Va":20820,"Ve":16615,"Mé":10810,"Pu":11178,"Pr":36130,"S ":9671,"Pe":30523,"Pa":58336,"Pl":10964,"Po":27786,"Pi":20810,"Or":15403,"Se":51296,"Sc":7275,"Si":21213,"So":22342,"Ru":9143,"Sa":70712,"Re":46909,"Ri":14206,"Ro":31238,"Qu":8096,"Ra":15098,"b ":16974,"a ":2807777,"i ":67723,"cá":7737,"ge":97443,"ga":124206,"fl":18249,"fi":107495,"fr":77696,"fu":123473,"fo":79998,"có":10267,"he":77025,"ha":138548,"gn":24260,"cé":16005,"gl":50338,"gi":128766,"gh":9410,"gu":117756,"gr":97493,"cí":12742,"go":114696,"du":66523,"g ":31294,"ea":126973,"eb":49402,"ec":261065,"ed":147913,"de":2140534,"di":323631,"dm":7851,"do":556174,"dr":51266,"ex":62245,"eu":25398,"añ":100385,"ev":65967,"ey":26516,"ez":45865,"fa":71295,"h ":31038,"fe":72710,"eg":169863,"ef":34480,"ee":18956,"el":821140,"ej":32037,"ei":38528,"ep":114508,"eo":63152,"en":1446857,"em":183658,"et":133793,"es":1262459,"er":848036,"eq":18834,"aí":17948,"ca":518269,"e ":2816707,"bs":7367,"br":179247,"bu":47229,"bo":65362,"bl":117456,"bi":135239,"be":67612,"da":466302,"f ":15176,"cu":158454,"ct":134054,"cr":79888,"co":691321,"ck":21843,"cl":46854,"ci":780154,"ch":115875,"ce":230782,"cc":44533,"c ":23067,"az":30061,"ay":59587,"ba":136047,"d 
":184141,"at":211115,"as":567489,"ar":691307,"aq":9531,"av":48388,"au":72787,"ak":11907,"al":659633,"ai":51423,"aj":50703,"ao":8102,"ap":77232,"am":297433,"an":805183,"ac":374463,"ad":631439,"ab":144055,"ag":85149,"ah":13118,"ae":43681,"af":21278,"nu":35537,"nt":779819,"ns":140899,"nq":11336,"no":368063,"nn":24107,"nz":31492,"ny":8769,"nv":18820,"oe":24400,"of":39013,"oc":180019,"od":93463,"oa":18159,"ob":123195,"om":334368,"on":689571,"ol":230551,"oi":24324,"oj":12159,"og":57063,"oh":10801,"m²":9191,"ot":93533,"os":699509,"ov":85328,"ou":52929,"op":77349,"oo":15677,"or":691238,"r ":505089,"ox":9374,"ow":10366,"oz":11222,"oy":19941,"lá":14567,"pe":253260,"pa":326975,"pl":67808,"lé":20848,"po":370364,"ph":12436,"pi":101456,"lo":395718,"lm":47465,"ll":193235,"ls":13390,"lp":10103,"lv":15446,"lu":77232,"lt":59369,"ly":9017,"o ":1816298,"ma":323604,"mb":142619,"me":329898,"mi":236057,"mm":8572,"ié":32355,"mp":132049,"mo":207044,"mu":126184,"ió":324878,"p ":13048,"na":683565,"nc":291584,"nd":277346,"ne":311456,"nf":32397,"ng":105777,"ni":309689,"nj":11352,"nk":8262,"nm":8232,"ju":57258,"fí":9213,"jo":44644,"ki":14501,"ke":14800,"ka":12118,"m ":52750,"gó":7060,"gí":14289,"gé":21878,"km":14607,"li":353994,"le":344098,"ld":22533,"lg":19778,"lf":10189,"la":1037960,"lc":15579,"lb":23157,"gú":7200,"n ":1645057,"hr":7436,"dí":25020,"ht":8163,"hu":24887,"hi":87814,"ho":64406,"dé":12258,"id":350652,"ic":527484,"ib":58759,"ia":458387,"ig":130847,"if":53944,"ie":349344,"k ":32633,"ir":119098,"is":363824,"it":317793,"iu":42028,"eñ":26834,"iv":114899,"eó":10608,"ij":12968,"ik":8874,"il":242265,"im":150543,"in":518205,"io":362002,"ip":90246,"je":41244,"iz":77284,"l ":1070460,"ja":41449,"xi":40771,"té":18357,"xp":12413,"tí":29392,"tó":61297,"xt":20556,"só":7077,"z ":55163,"tá":43968,"sé":9023,"sí":13754,"ró":22130,"y ":494983,"wa":13090,"ré":8377,"vi":184322,"rí":60498,"vo":65147,"uz":11349,"uy":29308,"ux":7768,"uv":14743,"rá":28876,"ve":131509,"va":112048,"x ":22885,"ui":101034,"uj":10219,"ul":130016,"ue":529325,"ug":37794,"ur":196055,"us":135539,"ut":75455,"um":62826,"un":688515,"uo":9672,"up":46545,"tu":178275,"tt":18266,"pó":8915,"ub":69875,"ua":157161,"ud":82055,"uc":86063,"w ":7730,"pú":12838,"to":492464,"tl":11756,"ts":11628,"tr":358077,"te":662982,"ti":404422,"th":33828,"tb":14576,"ta":636505,"su":163099,"ss":27514,"st":570360,"sl":23077,"sk":8256,"sm":35452,"sp":146933,"so":177522,"sd":21641,"sc":98040,"se":328771,"sh":16915,"si":316726,"rz":18465,"u ":97650,"nú":7450,"sa":193871,"rr":132506,"rs":76964,"rt":263306,"ru":82530,"rv":23971,"nó":12071,"ry":14114,"ní":12258,"rq":18301,"rp":16192,"ro":473902,"rn":71986,"né":9922,"rm":118903,"rl":28428,"rk":11610,"ri":592813,"rg":83834,"rf":13796,"ná":7351,"re":689451,"rd":86475,"rc":86488,"rb":27212,"ra":734384,"t ":94930,"mú":13777,"mó":11660,"qu":286748,"mí":7526,"mé":17124,"má":63634,"s ":1974557,"pt":30720,"pu":78951,"ló":19506,"lí":47975,"pr":228655,"ps":9053,"zó":8970,"zi":7657,"za":111482,"zu":8538,"zo":36191,"ye":23683,"ya":28514,"uí":10738,"yo":35768,"ué":9949,"² ":9260,"án":62336,"ál":20287,"áf":8040,"ác":15525,"ár":16431,"át":16282,"ás":49088,"á ":27321,"óg":11484,"ód":8472,"ór":13833,"ón":382883,"óm":11589,"ól":14360,"ó ":71778,"ña":56328,"ño":76416,"ín":33019,"ím":10418,"ío":16786,"ít":28434,"ís":34065,"íf":8702,"íc":18334,"íd":7650,"ía":105050,"í ":14487,"él":11392,"én":55163,"és":52145,"ét":12035,"ér":34634,"éx":11216,"éc":11450,"é ":16494,"ún":21741,"úl":7364,"út":8499,"ús":12464,"úb":12720,"ú ":7505,"一":7134," Ga":21582," Ge":16178," 
Fo":10836," Fu":23266," Fr":30592," Fi":10737," Ha":19064," He":16181," Go":14559," Gr":21740," Gu":21689," Gi":8540," Hu":9382," Ho":12969," Hi":11833," Je":8808," Ja":18701," Is":11343," In":34554," Ka":8187," Jo":22025," Ju":22012," La":111649," Le":25083," Li":24432," Ma":85482," Mi":25830," Me":30504," Lo":44008," Lu":14044," Ne":11540," Na":26019," Ni":10343," Mo":36161," Mu":18130," Am":14012," An":29155," Al":51409," Ai":9020," Ac":8911," Ba":43282," Au":18286," At":7457," As":13733," Ar":39211," Be":23973," Bi":9959," Bo":25824," Br":22869," Bu":16073," Ca":87586," Ce":15402," Ci":15105," Ch":44683," Cl":10942," Cr":13930," Co":80501," Cu":13492," Da":13336," Di":25027," De":29418," Do":13935," Du":7489," Ed":7143," El":96116," Es":98653," En":28605," Eu":11992," Fe":19655," Fa":9820," Wi":9336," Wa":8349," a ":151113," Or":15358," Po":27637," Pl":10897," Pi":20772," Pe":30452," Pa":58153," Nu":10454," No":24018," Ra":15024," Qu":7915," Ro":30970," Re":46794," Ri":14181," Pr":36030," Pu":11158," Mé":10807," Su":36537," St":14732," Ta":17163," Th":18237," Ti":12302," Te":20935," Tr":17351," To":19041," Ru":9128," Sa":70550," Si":21147," Sc":7137," Se":51093," So":22241," Va":20787," Ve":16562," Vi":28257," Tu":7181," Un":40136," ja":7912," im":16980," in":113728," is":9470," it":12073," ju":44169," ha":71347," he":17319," gr":47141," go":9321," gu":12156," id":8758," hi":30296," ho":20186," hu":9280," ni":9094," ne":10791," na":40693," mu":56375," mo":34821," oc":24698," of":15543," ob":20507," nu":10795," no":85354," le":29852," li":28752," la":626496," gé":18729," km":14107," me":61144," mi":48266," o ":71023," ma":85649," lu":16286," ll":24228," lo":186366," ag":19771," ab":21502," ac":45971," ad":18011," am":14490," an":45970," ap":24125," al":116261," au":26320," ar":40593," at":8359," as":24315," ba":52391," bi":11562," bo":13161," br":12518," ca":157850," e ":10258," er":14855," eq":8506," es":539820," en":614552," em":20859," ej":8137," el":384203," fe":21573," fa":54639," añ":28328," ex":38515," fu":116273," fr":61229," fo":35270," fl":10710," fi":33918," ge":20831," ga":13109," cl":15585," co":462664," cr":30029," ce":31950," ch":12983," ci":56484," da":13407," cu":60477," do":35749," de":1878742," di":146031," ed":14319," du":20922," té":10997," tí":7323," ru":8901," sa":21596," se":189308," si":104740," so":62800," qu":191524," mú":9473," ra":15504," re":170810," ro":18871," pu":38713," pr":177892," lí":9079," má":36795," ot":17424," or":52385," pe":108853," pa":137566," pl":26836," po":242173," pi":19178," y ":406137," va":23514," ve":32789," vo":9768," vi":36979," ub":13189," tu":8296," us":12351," ut":9609," un":466456," ta":42321," su":139495," tr":62181," to":30228," th":7488," ti":31564," te":60962," ár":9835," ál":11624,"Fer":7120,"Es ":20079,"Est":40499,"Esp":30688,"Eur":9266,"El ":89488,"En ":18934,"Gar":7699,"Fue":18788,"Fra":22586,"II ":12947,"Gue":7438,"Gra":11989,"Int":10608,"Ind":7667,"Arg":10516,"Bar":13123,"Ale":9873,"Alt":8713,"And":7070,"Ant":10595,"Cal":12289,"Cam":10802,"Cas":14393,"Car":17937,"Can":11389,"Ber":7187,"Chi":15815,"Cen":8021,"Cha":15858,"Cor":11229,"Com":17537,"Col":10868,"Con":24853,"Nac":11329,"Nue":8768,"Nor":14767,"Pla":8680,"Per":15205,"Par":20522,"Pro":17204,"Pri":7235,"Pre":7665,"Méx":8860,"Jos":9116,"Las":10322,"La ":78398,"Los":18518,"Med":8474,"Man":10902,"Mar":33709,"Mad":9566,"Mon":15534,"Mun":7799,"Su ":13104,"Sai":11675,"Sal":8578,"Se 
":18407,"San":32494,"Rep":8544,"Val":13044,"Vil":8238,"Uni":27767,"The":12538,"bit":22599,"bio":8582,"bil":7159,"bo ":7079,"blo":8255,"ble":28350,"bli":27042,"bla":52392,"bol":20729,"bié":25416,"bor":11023,"be ":11171,"ban":26993,"bal":10177,"baj":17768,"bas":16282,"bar":17411,"ber":30080,"bia":11473,"bic":15037,"bie":16653,"ca ":145181,"car":45351,"cas":41067,"cat":14539,"can":94395,"cap":14546,"cac":16670,"cab":10529,"cad":51771,"cam":20670,"cal":48767,"ce ":35472,"bri":27454,"bro":15368,"bra":26651,"bre":105758,"bur":7145,"bum":12078,"am ":8792,"ajo":16624,"al ":274116,"aja":11561,"aje":17588,"ain":19507,"ais":7210,"agu":12491,"ago":28675,"anu":10190,"anz":18126,"ano":79509,"ann":7583,"ant":195066,"ans":17426,"ane":17755,"ang":19562,"ani":47254,"ana":70957,"anc":101262,"and":90623,"amo":12426,"amp":25988,"ami":57872,"ame":88516,"amb":41071,"ama":44395,"alu":9523,"alt":15595,"alo":13492,"alm":37560,"all":38280,"alg":10767,"ali":96797,"alc":7683,"ald":9255,"ale":82063,"ala":31798,"an ":104955,"aba":35637,"abe":14437,"abi":26160,"abl":18696,"abo":13547,"abr":22582,"ae ":28175,"aca":16009,"ad ":111062,"aga":11365,"ado":272062,"adr":17031,"adi":20049,"ade":31132,"adu":7585,"aco":10755,"aci":224093,"ach":10349,"ace":30766,"acc":10027,"ada":154737,"act":41870,"aza":10801,"ayo":25160,"aya":7832,"ba ":19878,"aqu":8935,"at ":7824,"arg":22636,"are":34407,"ard":31172,"arc":32672,"ara":95441,"aro":19229,"arn":7316,"arm":9270,"arl":13962,"ari":83070,"arq":11629,"arr":48459,"ars":9729,"art":119072,"asa":20743,"arz":11183,"asi":22274,"asc":10580,"ase":16265,"aso":13076,"ar ":100623,"apa":19232,"ape":7495,"api":11786,"apo":11441,"apr":7236,"as ":405615,"ava":9975,"aut":18769,"arí":13656,"avi":12074,"ave":14502,"ay ":13782,"ata":37974,"ast":48943,"atr":18116,"ato":31490,"ate":31650,"ati":40488,"atu":16694,"aun":7152,"aur":9494,"aus":7903,"jer":7889,"je ":13092,"jo ":28253,"ito":98982,"itu":57305,"eña":13429,"iud":31065,"ism":26830,"isl":10534,"iso":8290,"isp":12009,"ist":177840,"ita":90631,"ite":23119,"iti":14600,"ivo":27807,"eño":12432,"iva":29645,"ivi":28311,"ive":26878,"ipo":17187,"ipi":23278,"is ":44742,"ion":119701,"ior":15897,"ios":44527,"ipa":22022,"ir ":23370,"iri":18435,"isi":29089,"ise":10419,"isc":18799,"ire":21390,"ira":17066,"ja ":14889,"iza":56748,"km²":9037,"gía":13787,"gén":19973,"jul":9258,"jun":20993,"jue":11292,"ha ":23020,"ham":8200,"han":11281,"har":13249,"has":16766,"hab":28769,"hac":13453,"he ":23960,"her":16542,"hin":11792,"hil":16082,"his":17036,"ho ":17368,"go ":53446,"glo":15485,"gle":10695,"gla":7868,"gob":7093,"gni":7865,"gió":54574,"cés":10035,"glé":12339,"gon":9593,"gos":23034,"gru":17556,"gra":45297,"gri":10321,"gre":9359,"gui":14307,"gua":27766,"gue":27456,"gur":7400,"gun":20675,"iam":7087,"ial":56085,"ian":52472,"ias":29422,"iar":10235,"ic ":7674,"iac":10109,"iad":14476,"ibl":8505,"ibi":8164,"ibr":11003,"ibu":11957,"id ":12492,"ibe":11766,"ia ":260890,"iet":7563,"iel":8991,"iem":41853,"ien":137287,"ier":54165,"ies":19531,"ied":10903,"ieg":9534,"ifo":15425,"ife":9766,"ifi":21950,"ict":9719,"ico":124072,"ici":106469,"ich":18974,"ice":11245,"ie ":46477,"ica":217183,"ido":115195,"idi":11705,"ide":56950,"ida":143468,"il ":29082,"ige":12181,"iga":13218,"igl":16846,"igi":27180,"igu":25326,"igo":7750,"ign":13613,"imo":21293,"imp":19247,"ime":39882,"imi":27512,"inc":70215,"ind":21266,"ina":105516,"ino":52448,"int":60589,"ins":15850,"inf":13445,"ine":33717,"ing":45586,"ini":36645,"iod":9277,"inv":7546,"ila":17117,"in 
":30692,"ilo":12791,"ill":61746,"ili":73439,"ile":24670,"ima":28669,"io ":149275,"día":18502,"hom":8228,"hos":7240,"hor":7815,"hum":8623,"fes":10458,"fer":25898,"fec":9155,"feb":8937,"fam":38352,"ext":18108,"ez ":27186,"exp":10760,"exi":15850,"eza":8196,"eta":35458,"ete":19089,"eti":13610,"esp":91539,"eso":20238,"est":182109,"aña":34948,"año":60064,"eto":13724,"etr":18301,"epú":7797,"eve":10842,"eva":20123,"evo":10163,"evi":21593,"erí":14511,"ey ":19031,"epe":9088,"er ":98611,"epa":47324,"eos":9539,"eor":7614,"eon":7647,"emá":15175,"es ":719197,"ept":16454,"epo":8010,"epr":9962,"elí":11498,"eri":85302,"erg":11010,"ere":38036,"erf":9607,"erc":31567,"erd":14302,"era":127455,"erb":7502,"et ":12616,"equ":18716,"aís":14636,"esi":46520,"esc":45995,"esd":20115,"ese":28957,"esa":79475,"erv":17764,"err":53481,"ert":62557,"ers":52410,"ern":44545,"erm":26731,"erp":8929,"ero":107826,"en ":609899,"ela":41071,"ele":43829,"eli":19783,"ell":40806,"elo":21079,"eo ":20655,"emb":41784,"ema":33787,"eme":18576,"emo":14924,"emi":17348,"emp":34242,"ene":94564,"eng":10179,"ena":41586,"end":51128,"enc":77537,"eno":37094,"eni":23612,"ens":46652,"ent":408861,"enz":9528,"ego":28111,"egi":66647,"egr":10013,"egu":24464,"ein":14332,"eja":7548,"el ":611266,"ejo":12599,"eje":7469,"gin":15815,"gio":12026,"gid":10945,"gic":10164,"gen":57105,"ger":8119,"ge ":11010,"gad":14992,"gas":7988,"gar":19699,"gal":9045,"gan":23906,"ga ":30329,"fue":90978,"fun":22112,"fra":50839,"fre":13616,"fri":7302,"for":59914,"flo":7804,"fic":57976,"fil":9492,"fin":19583,"da ":212873,"de ":1556339,"dad":141389,"dal":9998,"dae":14141,"das":36381,"dar":9501,"dan":12985,"dam":10048,"cul":40621,"cue":32934,"cua":33441,"ctu":33625,"ctr":9939,"cto":44819,"cti":20454,"cte":9828,"cta":10659,"cur":10605,"cuy":7974,"cla":14341,"clu":14576,"co ":135190,"ció":209489,"con":251381,"col":25189,"com":184452,"cor":25629,"cos":36158,"cre":24696,"cri":31616,"cro":8764,"cci":38750,"cea":16334,"ch ":11118,"cer":30048,"ces":71319,"cen":32729,"cep":7727,"cel":19017,"ced":7879,"cha":26138,"cia":152068,"ck ":13234,"cie":87706,"cid":70600,"che":19885,"chi":20475,"cho":21852,"cil":9620,"cim":9340,"cir":8507,"cis":9341,"cit":8942,"ciu":26750,"cin":17134,"cio":103441,"cip":45552,"ed ":10329,"ebr":19067,"eae":10702,"ead":15707,"eal":19761,"eas":12813,"eat":7412,"ea ":28070,"efi":9256,"efe":15631,"ega":22427,"edi":50600,"ede":34956,"eda":21258,"edo":13652,"edr":9154,"ech":18820,"eci":89707,"ece":32005,"ecc":12316,"eca":7852,"ecu":14868,"ect":45028,"eco":21596,"dur":19421,"dor":54124,"don":18997,"dou":11963,"dos":96466,"doc":7141,"duc":27199,"dri":13291,"dra":9547,"dre":11310,"dro":9953,"dic":47817,"did":11186,"dia":35235,"der":49978,"des":90830,"deb":8351,"dec":9801,"def":7698,"del":228490,"den":85908,"dem":11684,"dep":50829,"deo":9414,"do ":341916,"div":13995,"din":11969,"dio":35658,"dir":18800,"dis":80875,"dit":9088,"die":14888,"dif":14152,"rga":23318,"ri ":7113,"rge":24597,"rgo":16017,"ret":23611,"res":140547,"rev":10637,"rfi":8102,"rea":43224,"ref":13241,"rec":55408,"red":15261,"rei":7333,"reg":68764,"rem":14749,"ren":51706,"rel":19501,"rer":18242,"rep":14882,"rda":8172,"rdo":13679,"rdi":9872,"rde":29313,"re ":155218,"rci":19308,"rch":7367,"rce":15953,"rca":25800,"rd ":11954,"rar":15085,"ras":50104,"rat":31492,"rav":8075,"rag":12449,"ran":146320,"ram":21212,"ral":52901,"rab":19532,"raf":7329,"rad":60032,"rac":43442,"rs 
":11514,"ros":47929,"rot":15483,"rom":15707,"ron":42744,"rop":32071,"rov":41024,"rod":21257,"roc":21679,"rol":20667,"rof":11386,"rog":9763,"rno":13811,"rna":28042,"rne":12661,"rni":8562,"ro ":146408,"rma":60914,"rme":24156,"rmi":20846,"rla":8699,"riz":10435,"rio":82353,"rit":87431,"ris":26175,"rig":32503,"ril":18939,"rin":44691,"rim":37251,"ria":74022,"rib":16779,"ric":53993,"rid":24912,"rie":40300,"rtí":7814,"ruc":9429,"rup":19339,"rus":10637,"rva":8468,"rvi":10259,"ry ":9879,"rsi":16295,"rso":23313,"rse":13455,"rta":69414,"rto":22364,"rte":83807,"rti":44079,"rtu":12987,"nía":7088,"rt ":11603,"rqu":18224,"rro":28597,"rri":26111,"rre":31796,"rra":36840,"sad":15576,"sal":12684,"san":10863,"sas":11791,"sar":21220,"sa ":86942,"rzo":11621,"sie":10373,"sid":35818,"sic":33044,"sia":21182,"sit":47886,"señ":8377,"sis":26321,"sin":18695,"sio":18884,"sil":10556,"sim":10529,"sig":27477,"scr":25119,"scu":15948,"sde":20004,"se ":142516,"sca":14799,"sco":24677,"ser":42063,"ses":15571,"seg":17308,"sec":10078,"sep":14018,"sen":28636,"sem":7053,"sel":10295,"spu":8777,"spo":14448,"spe":51710,"spa":61850,"sol":12812,"son":47400,"sor":10955,"sos":12831,"soc":12868,"sob":16402,"su ":57230,"st ":10396,"sla":17876,"smo":24196,"sió":31343,"so ":44831,"stá":22167,"ste":88396,"sta":179440,"sto":49757,"sti":69501,"stu":17380,"str":118933,"sub":9390,"sul":7853,"sup":14503,"sus":22170,"sur":23147,"tal":73680,"tag":7515,"tab":14559,"tac":23593,"tad":77862,"tat":7964,"tas":38619,"tar":41661,"tan":67218,"tam":79203,"te ":284941,"tbo":12533,"ta ":179252,"pa ":14872,"pe ":7492,"par":158326,"pas":11530,"pac":12615,"pal":23552,"pan":11657,"pañ":59522,"pec":57888,"pen":14739,"peo":7168,"per":107782,"paí":9997,"pes":10236,"pel":18436,"pla":29458,"pli":11132,"ple":15398,"plo":9048,"pic":11535,"pie":8675,"pin":12467,"pio":24607,"pit":12508,"por":173351,"pop":7479,"pos":33566,"pon":17239,"pol":27348,"pob":43702,"pod":7343,"lés":13215,"po ":39204,"pub":10168,"pti":14042,"lít":16259,"líc":10958,"lín":8248,"pri":52537,"pre":58569,"pro":108338,"put":7419,"pul":11629,"pue":30894,"mát":8971,"más":38263,"mán":10709,"mér":9468,"que":227040,"qui":47867,"quí":8097,"ra ":231606,"mús":8775,"ncé":9514,"ngo":7965,"ngl":22080,"ngu":15031,"ni ":7789,"nge":13697,"nga":8189,"nen":11810,"neo":11720,"ner":65372,"net":8127,"nes":80792,"ng ":18654,"nea":12399,"nec":28654,"nfo":10116,"nez":7381,"nco":17791,"nci":146898,"ncl":10729,"nce":61430,"nch":9730,"nca":12577,"ne ":57285,"ndu":7292,"ndr":12605,"ndo":65513,"ndi":38911,"nde":54981,"nda":66093,"ncu":16386,"nal":70953,"nam":8218,"nan":12157,"nar":27443,"nac":46410,"nad":35305,"naj":8529,"nd ":14500,"nat":21175,"nas":35377,"na ":398062,"ión":292845,"ntó":37020,"nve":10579,"nue":9759,"nto":136463,"ntu":7064,"ntr":93850,"nti":67583,"nta":102723,"nte":292971,"nso":9665,"nst":30664,"nse":26710,"nsi":26727,"nsa":11053,"nt ":24387,"nqu":11277,"ns ":13747,"noc":38585,"nom":49236,"nos":42446,"nor":29422,"nov":19327,"nne":10554,"no ":158177,"nif":10693,"nie":9958,"nid":51044,"nic":69869,"nia":31660,"niz":14512,"niv":15892,"nis":28903,"nit":8537,"nio":20490,"nim":13518,"ogr":18966,"ogo":8779,"ol ":38964,"oce":17267,"och":8312,"oci":56829,"ock":8945,"oco":9047,"oca":41229,"ode":14611,"odi":11580,"odo":26635,"ocu":9100,"oct":12604,"of 
":7638,"oda":9018,"oes":12161,"odu":18410,"ofi":7512,"ofe":9434,"oba":8003,"obr":25648,"obl":50514,"obi":12794,"obe":8533,"nza":20160,"ote":14228,"otr":17397,"oto":14558,"ost":31662,"ota":19189,"osi":18739,"ose":15508,"oso":14720,"ovi":57483,"orí":9114,"ove":16922,"oun":14708,"our":10655,"opo":9237,"opi":14838,"ope":13312,"opa":10564,"os ":567590,"opu":9913,"olí":19344,"or ":251715,"orm":64912,"orn":13575,"oro":14041,"orr":19323,"ord":30127,"ore":48358,"org":21075,"ori":66357,"osa":21383,"ort":65622,"m² ":9185,"ora":46259,"ola":30987,"on ":190943,"oli":25682,"oll":17825,"ole":17479,"olo":38775,"olu":11781,"ogí":11606,"ona":104550,"ond":38242,"onc":23216,"onf":9258,"one":74442,"ong":11339,"onj":7098,"oni":32298,"ono":52521,"ons":50424,"ont":59611,"onv":7285,"oma":33534,"ome":21150,"omb":45255,"omi":25678,"omp":41070,"omo":82407,"omu":58045,"la ":610751,"gún":7107,"le ":58880,"lab":11220,"lac":64631,"lad":24486,"lag":7484,"lan":69592,"lam":24610,"lar":41436,"lat":24612,"las":124120,"lbu":12473,"lon":23409,"lom":10655,"lor":23128,"loc":26941,"log":23375,"los":171022,"lme":34946,"lti":12566,"lto":11513,"ltu":9464,"luc":12530,"lug":7366,"lta":15082,"lgu":7617,"lev":15617,"les":91476,"let":11653,"ler":15144,"lem":26257,"len":30658,"leg":14703,"lec":26734,"lea":8424,"lo ":93016,"lla":74669,"lle":47097,"lli":13816,"llo":31751,"ll ":10675,"lit":23261,"lis":29055,"lio":17662,"lin":23082,"lim":8340,"liz":32987,"lic":48449,"lid":38365,"lia":73957,"lib":11554,"lig":10555,"lie":8152,"ma ":72422,"mac":13622,"mad":34215,"mar":42452,"mas":23691,"mal":12192,"man":63359,"may":20759,"mat":18421,"mba":10176,"mbi":42599,"mbr":71287,"me ":12825,"med":24892,"met":16848,"mes":19527,"mer":60276,"men":167009,"mex":7078,"lva":8274,"lus":8634,"mpi":8494,"mpe":21376,"mpr":14573,"mpo":32948,"mpl":23586,"mpu":10144,"mod":10208,"mon":23296,"mor":11861,"mos":17488,"mpa":16447,"ió ":27428,"mus":8143,"mun":83842,"min":50965,"mil":54218,"mis":20947,"mit":22265,"mic":21219,"mie":39274,"mo ":116388,"ién":27517,"zad":34608,"zac":11303,"zan":7115,"zar":10021,"zon":9132,"zo ":17792,"yor":11730,"za ":34624,"ya ":16470,"yo ":17934,"ués":7665,"tín":7356,"tér":8916,"tón":43482,"tán":19529,"xim":7461,"xic":17547,"tá ":17299,"sí ":7618,"rís":7245,"río":10310,"ría":28707,"rón":9102,"via":12659,"vil":17697,"vin":37424,"vic":9152,"vid":23929,"vie":21693,"viv":7676,"vis":25393,"vo ":33305,"vol":10345,"vos":8624,"ver":42481,"ves":9401,"ven":28441,"vel":14981,"ve ":15016,"val":14785,"van":9128,"var":20642,"vas":9770,"vad":11386,"va ":31522,"uye":9933,"uy ":8325,"usi":15007,"use":7793,"usa":13025,"ust":26126,"uso":9699,"uti":16202,"uta":12332,"uto":18658,"us ":49172,"ura":67622,"ure":8908,"urg":11824,"uri":19845,"uro":19849,"ur ":25587,"upe":15682,"upo":17158,"uma":10015,"ume":13099,"unt":24294,"uni":64830,"uno":27366,"unc":10467,"und":48857,"una":267868,"une":8988,"um ":20862,"ult":21405,"ulo":17762,"uli":17880,"ula":47246,"un ":217532,"uid":13351,"uie":12890,"uil":7952,"uin":8672,"uip":8207,"uis":11000,"ueñ":8502,"uit":13101,"uga":15579,"ugu":12215,"uct":12213,"uda":37565,"udi":17314,"ubr":12417,"uca":7683,"ue ":294774,"uce":7415,"ucc":8218,"uci":25554,"uch":12624,"uev":15569,"uer":46622,"ues":36069,"ueg":19032,"ued":16010,"ueb":8677,"uen":38522,"uel":28446,"púb":11593,"ua ":14731,"uat":7912,"uar":13380,"ual":41530,"uan":21270,"ubi":18564,"ubl":12045,"ud ":8479,"uad":38503,"tur":45480,"tul":9339,"tub":10280,"tua":54339,"tud":17240,"tug":8561,"tre":58815,"tra":116327,"tri":81313,"tru":16821,"tro":72914,"to 
":261732,"tod":17241,"tos":55424,"tom":10133,"ton":20463,"tor":94895,"til":28629,"tie":38174,"tig":17405,"tir":9162,"tit":18235,"tis":12154,"tin":45124,"tim":14112,"tip":10749,"tio":15498,"tia":11282,"tic":87300,"tid":24642,"tiv":46603,"tem":32370,"ten":66323,"tel":23707,"teg":9394,"tea":9283,"tec":13760,"th ":7178,"tes":66100,"ter":127000,"the":10645,"éti":7504,"éxi":11167,"éne":20764,"én ":28809,"éri":13573,"érm":8428,"és ":45633,"áti":14202,"án ":27367,"álb":11283,"áni":16413,"ás ":40719,"úsi":8717,"útb":7936,"úbl":12344,"ún ":11275,"óni":12420,"ólo":7805,"ón ":361830,"ógi":7057,"ño ":26522,"ños":18182,"ñol":28121,"ña ":43408,"íti":20855,"íst":9984,"íne":7430,"ín ":16078,"ío ":10995,"ís ":12371,"ícu":14416,"ías":8590,"ía ":90214},"n_words":[70286890,82926999,60413548],"name":"es"}
\ No newline at end of file
+++ /dev/null
-{"freq":{"D":3604,"E":9114,"F":3428,"G":4270,"A":13995,"B":6049,"C":4433,"L":9649,"M":10132,"N":6533,"O":2986,"H":6531,"I":5403,"J":4779,"K":13384,"U":2720,"T":11888,"W":2107,"V":9682,"Q":266,"P":11422,"S":15988,"R":6826,"Y":596,"X":461,"Z":653,"f":12728,"g":77107,"d":141364,"e":425996,"b":40246,"c":10761,"a":496421,"n":259681,"o":223900,"l":256122,"m":147242,"j":63354,"k":181311,"h":68975,"i":428108,"w":2663,"v":88215,"u":213745,"t":258531,"s":344443,"r":180124,"q":542,"p":61630,"z":3134,"y":5816,"x":1328,"²":200,"Å":72,"Ä":128,"Á":51,"Ü":1682,"ß":76,"Õ":259,"Ö":167,"í":335,"ì":56,"ë":66,"é":703,"è":77,"ç":89,"æ":123,"å":190,"ä":42375,"ã":61,"á":537,"à":86,"ü":30512,"ú":88,"ø":326,"ö":8126,"ô":78,"õ":40539,"ó":356,"ð":63,"ñ":47,"ē":125,"ė":44,"Ā":44,"ā":654,"č":90,"ı":50,"ī":336,"ş":74,"ń":54,"ł":96,"ō":306,"Ž":57,"ž":1084,"Š":453,"š":2423,"ū":213,"ǎ":46,"ə":85,"́":52,"μ":94,"ν":197,"ο":232,"ι":145,"κ":95,"λ":130,"δ":59,"ε":108,"η":90,"α":274,"γ":53,"ά":69,"ί":80,"ω":53,"ό":79,"σ":105,"ς":232,"ρ":156,"π":59,"υ":62,"τ":124," l":23928,"ь":570," m":38337,"э":66,"ю":76," n":13986," o":51528,"я":590," h":8823,"ш":180," i":10696," j":29617," k":61170,"ы":415,"ф":166," d":4378," e":23050,"х":292,"ц":201," f":3289," g":1891,"ч":785,"р":2377," a":29616,"с":1744,"т":1384," b":2333," c":602,"у":739," y":132," x":127," z":161," u":3277," t":29638," w":134," v":33321,"і":130,"ё":86," q":53," p":29550," s":33963," r":13644,"И":192,"Л":141,"К":318,"Н":192,"М":293,"П":266,"О":115,"Б":203,"А":448,"Г":287,"В":339,"Е":66,"Д":138,"З":71," J":4764," K":12980," H":6494,"Ш":75," I":5378,"Ю":47," N":6511,"Я":49," O":2964," L":9614," M":10069,"Э":79," B":6003,"Т":143," C":4377,"У":66,"Р":154," A":13953,"С":395," F":3376," G":4219,"Ч":60,"Ф":101," D":3559," E":9045,"Х":61,"л":1746," Z":652,"к":1535," Y":594,"й":835," X":432,"и":3293,"п":426,"о":3362,"н":2533,"м":786,"г":616," S":15877," R":6777,"в":1990," Q":264,"б":447," P":11356,"а":3415," W":2074,"з":300," V":9645,"ж":145," U":2711,"е":2779," T":11830,"д":957," ä":1217," õ":2318," ö":381," ü":7159," Ā":44," Á":51," Å":72," Ä":128," Ö":164," Õ":255," Ü":1674," ž":64," Ž":57," Š":453," š":125,"ն":64,"ա":118,"ו":53,"י":74,"ר":53,"و":125,"ي":278,"ف":48,"ل":301,"م":185,"ن":161,"ه":60,"د":159,"ح":106,"ت":46,"ب":196,"ة":66,"ا":449,"ع":100,"ش":48,"س":100,"ر":186," А":444," Б":203," В":336," Г":277," Д":137," Е":66," З":71," И":192," К":315," Л":139," М":291," Н":191," О":113,"A ":1491," П":265,"F ":209,"Da":636,"Cu":143,"Cl":241,"Co":1022,"Cr":170,"Ce":161,"Ch":852,"Ci":177,"G ":207,"Ec":84,"Ed":303,"Ea":53,"Du":151,"Do":581,"Dr":153,"De":809,"Di":576,"Fe":322,"H ":271,"Fa":384,"Eu":1055,"Ev":105,"Ex":60,"Er":534,"Et":156,"Es":507,"En":458,"Em":268,"Ep":67,"Ei":131,"El":689,"Ek":108,"Ee":3423,"Eh":79,"Eg":183,"Ge":880,"Ga":592,"I ":1170,"Fu":158,"Fr":833,"Fo":411,"Fl":194,"Fj":45,"Fi":490," б":52," г":58," в":53," д":53," и":51," к":97," н":58," м":69," п":140,"B ":254," о":107," Р":152," С":394," Т":143," У":64," Ф":99," Х":61," Ч":60," Ш":75," Э":77," Ю":47," Я":48,"C ":532," с":79,"Av":172,"Au":837,"Ar":1361,"At":438,"As":1550,"D ":280,"Ba":1524,"Az":50,"Ae":128,"Af":163,"Ag":213,"Ah":170,"Aa":1076,"Ab":517,"Ac":135,"Ad":407,"Am":1404,"An":1589,"Ap":221,"Ai":278,"Aj":171,"Ak":327,"Al":2030,"Bu":590,"Br":896,"Ca":1008,"E ":421,"Bi":534,"Be":1094,"Bo":722,"Bl":189,"Kv":67,"Ku":1642,"Gö":61,"Ky":76,"Kn":47,"Kl":321,"Kr":1260,"Ko":2411,"Le":1403,"Hä":127,"Li":2750,"N ":331,"Gü":52,"La":1509,"Lu":547,"Ly":58,"Hõ":54,"Lo":1151,"Me":1361,"Dž":49,"Mi":1243,"O 
":457,"Ma":3890,"Hü":108,"Mc":62,"My":97,"Mu":881,"Mo":1371,"Jä":515,"Ni":728,"Ne":1120,"Na":1009,"P ":412,"Ny":45,"Jõ":269,"Nu":172,"No":1468,"Ok":125,"Ol":339,"Om":123,"On":147,"Oh":110,"Kä":196,"Oi":46,"Od":88,"Of":51,"Jü":111,"Ob":154,"Gi":310,"Gl":141,"Gr":855,"Go":370,"Gu":494,"Gy":49,"J ":73,"Ha":1920,"He":1336,"Hi":1244,"Ho":963,"Hu":297,"Hy":59,"K ":351,"Dü":69,"Id":463,"Ig":87,"Io":55,"Im":143,"In":1490,"Il":192,"Ii":222,"Iv":146,"Is":449,"It":383,"Ir":247,"Ja":1273,"L ":301,"Ji":117,"Je":477,"Jo":991,"Ju":835,"Fü":63,"Ka":3521,"Fö":134,"M ":228,"Kh":87,"Ki":1209,"Ke":1467,"Us":124,"Ut":61,"Ur":136,"Um":60,"Un":420,"Uk":201,"Ul":110,"Ud":58,"Pü":264,"W ":104,"Pö":45,"Ty":76,"Põ":1331,"Tv":68,"Tu":682,"Tr":728,"Ts":169,"To":953,"Pä":660,"Th":747,"Ti":538,"Te":1883,"Ta":4354,"V ":865,"Sy":110,"St":1249,"Sv":158,"Su":1472,"Wo":216,"Wi":599,"Wh":48,"Sä":63,"Rü":62,"Wa":532,"We":428,"Rõ":64,"Vo":452,"Vu":47,"Rä":65,"Vi":1892,"Vl":115,"X ":185,"Va":3312,"Ve":2352,"Uu":347,"Mä":340,"Lü":129,"Pt":77,"Pu":606," م":56,"Pr":1526,"Ps":58,"S ":641,"Lõ":671,"Py":58,"Pe":1263,"Pa":2287,"Kü":279,"Pl":289,"Po":1247," ع":51,"Pi":1126,"Ph":224,"Lä":1080,"Os":432,"Ot":174,"Ou":77," ا":173,"Oo":46,"Op":127,"Or":500,"R ":183," ب":58,"Kõ":416,"Kö":81,"Se":3048,"Sc":514,"Si":1380,"Sh":515,"Sm":85,"Sl":182,"Sk":219,"Sp":264,"So":1693,"Ru":426,"Nõ":704,"U ":154,"Sa":3549,"Nü":46,"Re":985,"Ri":1129,"Nä":146,"Rh":72,"Ro":1769,"Qu":132,"Mõ":229,"T ":213,"Ra":1769,"Mü":121,"Tš":407,"b ":14100,"a ":101956,"Tõ":185,"Tö":129,"Tü":236,"Ya":114,"Yo":251,"Yu":48,"Z ":57,"Sõ":312,"Sö":54,"Sü":198,"Xi":96,"Tä":315,"Tō":50,"Za":127,"Ze":122,"Zh":62,"Vä":634,"Zi":61,"Zo":49,"Zu":94,"Võ":414,"bö":50,"i ":67391,"gd":177,"ge":11091,"ga":17189,"bü":159,"fj":156,"fl":261,"fg":90,"ff":336,"fi":3834,"bä":233,"fs":80,"fr":799,"fu":505,"ft":358,"fo":2004,"j ":178,"gy":115,"he":15992,"ha":9696,"gn":1067,"gm":312,"gl":3003,"gk":631,"gj":169,"gi":15351,"gh":514,"gg":188,"gv":123,"gu":12657,"gt":248,"gs":906,"gr":3204,"gp":765,"go":1593,"dt":600,"du":15412,"dv":357,"dw":178,"dy":147,"dz":76,"g ":7339,"ea":10628,"eb":4711,"ec":855,"ed":12410,"de":25494,"dd":200,"dg":178,"di":20365,"dh":181,"dk":488,"dj":432,"eK":296,"dm":1746,"dl":1518,"do":2346,"dn":649,"dp":77,"ds":1055,"dr":3163,"ew":508,"ex":366,"eu":1294,"ev":10732,"ey":521,"aõ":117,"ez":242,"fa":820,"aü":201,"h ":1684,"fe":1177,"eh":9506,"eg":11045,"ef":1060,"ee":30805,"el":44868,"ek":15889,"ej":1876,"ei":11617,"ep":3591,"eo":4613,"en":30509,"em":17859,"et":21350,"es":49834,"er":30979,"eq":50,"ca":1187,"e ":97234,"by":175,"bs":372,"br":3541,"bu":2050,"bt":50,"bn":97,"bo":1301,"bj":462,"bk":63,"bl":1788,"bh":62,"bi":4858,"bb":142,"bd":109,"be":5461,"db":83,"da":21215,"f ":1128,"cz":59,"cy":123,"cu":367,"ct":474,"cs":86,"cr":248,"co":978,"cm":60,"ck":1210,"cl":163,"ci":834,"ch":3094,"ce":1021,"cc":199,"c ":475,"az":442,"ay":578,"ba":4886,"d ":44215,"at":32525,"as":52565,"ar":30297,"aq":50,"ax":179,"aw":265,"av":11952,"au":7185,"ak":18860,"al":53791,"ai":13608,"aj":8772,"ao":2265,"ap":5317,"am":18369,"an":38127,"ac":1403,"ad":21341,"aa":42535,"ab":9670,"ag":6434,"ah":11107,"ae":3739,"af":1986,"nu":11347,"nt":12596,"ns":4828,"nr":469,"np":151,"no":4692,"nn":17075,"q ":46,"nz":355,"jö":51,"ny":360,"jõ":2814,"nw":57,"nv":314,"oe":1975,"of":1882,"oc":974,"od":6416,"oa":902,"ob":2993,"om":10938,"on":61110,"ok":4533,"ol":25486,"oi":2595,"oj":1236,"og":7246,"kä":1444,"oh":4211,"ot":7307,"hō":45,"m²":200,"os":13528,"ov":3905,"ou":1255,"op":3770,"oo":36377,"or":18504,"r 
":11002,"ox":103,"ow":446,"kö":136,"oz":161,"oy":237,"kõ":3255,"pe":8582,"pf":53,"kü":2678,"pa":9082,"pl":1823,"pm":182,"lé":50,"pn":209,"po":8371,"lä":2964,"ph":615,"pi":8736,"pk":178,"lo":11129,"ln":1607,"lm":6318,"ll":19635,"ls":3471,"lr":311,"lp":708,"lv":1889,"lu":12030,"lt":11142,"lz":55,"ly":347,"hõ":638,"o ":5439,"hü":580,"ma":35693,"mb":5069,"mg":52,"dž":411,"mh":80,"me":23829,"mf":109,"hā":79,"mk":376,"ml":264,"mi":40706,"eš":58,"mj":183,"mn":780,"mm":2852,"mp":2268,"mo":4294,"mr":80,"mt":306,"ms":1112,"mv":97,"mu":8559,"iõ":260,"my":155,"p ":1639,"na":27821,"iü":173,"nb":439,"nc":950,"nd":30717,"ne":35835,"nf":748,"ež":190,"ng":16763,"nh":382,"jä":4035,"ni":36973,"nj":320,"nk":2416,"nl":419,"nm":235,"jt":102,"ju":8195,"jn":184,"jo":2230,"jm":158,"kj":201,"ki":13463,"kh":323,"kf":60,"ke":20124,"kd":44,"kb":60,"fü":589,"ka":27697,"m ":10376,"fö":145,"ky":200,"gõ":66,"gö":54,"ks":20790,"kt":6626,"ku":25416,"kv":1230,"ko":25420,"kp":298,"kr":3838,"kk":4692,"kl":3700,"km":2283,"kn":633,"li":57000,"hä":827,"lh":474,"lk":2832,"lj":4573,"le":37295,"ld":8105,"lg":4950,"lf":372,"gü":68,"la":31670,"lc":95,"lb":823,"n ":49802,"hr":621,"hs":195,"hv":2638,"hw":103,"ht":6604,"hu":4453,"hj":2882,"hk":5994,"hh":1052,"hi":9059,"hn":1001,"ho":2898,"hl":335,"hm":902,"dé":47,"id":25348,"ic":2524,"ib":3313,"dü":272,"ia":15748,"ih":2638,"ig":11090,"if":1034,"ie":3981,"hy":186,"dõ":82,"k ":16382,"iq":93,"ir":11061,"is":67525,"it":26582,"iu":2821,"iv":6458,"iw":71,"ix":158,"ii":25093,"aš":218,"ij":1535,"ik":35922,"il":29934,"im":19741,"in":53773,"io":9587,"ip":3170,"je":2855,"až":108,"ji":256,"fä":213,"iz":330,"iy":72,"eõ":59,"l ":31660,"eü":118,"ja":41991,"tä":3641,"xi":267,"té":49,"xt":76,"sö":241,"sõ":3307,"ww":59,"z ":542,"sü":7550,"xa":160,"xe":69,"sä":382,"oš":66,"wi":436,"sé":45,"wn":69,"wo":143,"ws":111,"vv":89,"rõ":329,"vy":59,"rö":278,"y ":2468,"rø":55,"wa":716,"rü":1335,"rā":49,"we":364,"vl":251,"vm":177,"ré":93,"vj":46,"vk":322,"rä":668,"vi":9510,"vg":104,"vt":88,"vu":4481,"vr":127,"vs":891,"vp":221,"rí":55,"vn":468,"vo":2748,"uz":144,"uy":48,"ux":121,"uv":4464,"uu":16971,"ve":10196,"vd":60,"va":34297,"x ":556,"ui":3667,"uj":2076,"uk":5480,"ul":16978,"ue":1272,"uf":259,"ug":4237,"uh":3669,"ur":13993,"us":50648,"ut":13872,"um":10739,"un":13735,"uo":494,"up":1948,"ty":350,"põ":3410,"tz":299,"pö":275,"tu":27878,"tt":4340,"tw":94,"tv":618,"ub":5783,"pü":734,"ua":3006,"ud":21571,"uc":397,"w ":514,"to":11780,"tn":1099,"tm":2505,"tl":5476,"ts":21273,"tr":8177,"tp":259,"tg":91,"tf":183,"te":48864,"tk":949,"tj":657,"ti":30264,"pä":3196,"th":1862,"v ":7055,"tb":129,"tc":91,"ta":41990,"su":20276,"sv":2315,"ss":8143,"st":67054,"sy":172,"sz":75,"sw":139,"sl":2942,"sk":10212,"sn":1521,"sm":4289,"sp":2885,"so":5912,"sr":794,"sd":174,"sc":930,"sf":485,"se":62785,"sh":1322,"sg":140,"sj":1063,"kš":54,"si":30745,"rz":159,"u ":17617,"sa":24031,"nü":262,"sb":374,"rr":3973,"rs":3109,"rt":7363,"ru":7114,"rv":5729,"rw":105,"ry":602,"nõ":791,"rp":648,"ro":11751,"rn":3712,"rm":4085,"né":46,"rl":1539,"rk":3920,"nç":57,"rj":4639,"ri":41739,"rh":803,"nä":2052,"iž":51,"rg":9073,"mā":44,"rf":347,"re":19761,"rd":6093,"rc":505,"rb":1857,"mü":731,"ra":27209,"t ":40235,"mõ":3716,"mö":208,"qu":338,"iš":133,"lā":51,"mä":4794,"lü":1913,"s 
":83808,"px":82,"lõ":2040,"py":45,"lö":179,"pt":1489,"pu":3682,"pp":2089,"pr":5736,"hū":50,"ps":1178,"yō":62,"uš":116,"tš":984,"vö":298,"zz":105,"vü":141,"vä":6112,"zh":68,"zi":492,"ze":455,"uü":74,"za":431,"uõ":62,"zy":50,"võ":10296,"zs":86,"zu":144,"zo":270,"zn":53,"zm":49,"yg":56,"ye":201,"yc":105,"yd":131,"tü":1113,"ya":457,"yb":52,"tö":1641,"tõ":1288,"yt":138,"ys":416,"yr":185,"yp":88,"yo":142,"yn":260,"ym":152,"yl":325,"yk":76,"yi":121,"რ":63,"ო":50,"ნ":45,"ი":145,"ლ":57,"ე":73,"ა":150,"² ":198,"án":134,"ä ":222,"ár":59,"äb":871,"Üh":852,"Ül":582,"á ":83,"Ü ":171,"Õi":103,"Õh":47,"Õp":48,"Ök":67,"アアア":314,"ö ":312,"õ ":118,"ón":116,"ín":74,"ía":57,"í ":51,"én":82,"és":48,"ér":80,"ää":6210,"é ":189,"ät":739,"äv":318,"äm":341,"äl":3418,"äo":127,"än":4791,"äp":377,"äs":977,"är":10885,"äe":2551,"äd":252,"äg":1326,"äi":4721,"äh":3974,"äk":203,"üü":2774,"ān":168,"ār":63,"õš":48,"ā ":58,"öö":3301,"ể":72,"üo":89,"ün":5015,"üm":1677,"ül":5410,"üs":2571,"ür":1208,"üp":727,"üv":138,"üt":1243,"üb":167,"üa":118,"üf":48,"üg":358,"üd":830,"ük":2636,"üh":5389,"øy":105,"õõ":933,"öv":135,"ør":74,"øn":53,"õr":3591,"õs":403,"õt":1896,"õm":351,"õn":2300,"õo":45,"õp":1548,"õi":12796,"õj":1645,"õk":165,"õl":2454,"õe":2011,"õg":861,"õh":4577,"õb":159,"õd":527,"öt":705,"ör":689,"ös":334,"öp":203,"ön":372,"öl":253,"öm":99,"ök":454,"öi":89,"ög":138,"öe":80,"öd":707,"öb":82,"ḩ":72,"õz":69,"õv":176,"õu":3856,"ī ":118,"īn":58,"š ":464,"še":571,"ša":283,"šo":109,"šk":120,"ši":439,"šu":129,"št":112,"Ša":47,"Šo":91,"Šv":165,"ōk":47,"ō ":113,"žu":64,"žo":59,"že":84,"ža":274,"ži":342,"ž ":81,"ū ":64,"ი ":59,"あ":71,"ア":442,"가가 ":46," ア":53,"乙":67,"之":249,"丹":47,"临":68,"並":138,"丘":83,"专":99,"三":373,"丁":204," 三":86," 丁":77,"倉":51," 之":80,"ああ":46,"ος":107,"ος ":107,"ς ":231,"α ":91,"アア":377,"ян":79,"ый":89,"ье":98,"ьс":48,"ьн":72,"ха":91,"ци":80,"че":89,"ск":479,"сл":56,"со":89,"се":96,"си":145,"рь":62,"са":149,"рс":87,"рт":59,"ру":71,"тр":186,"то":214,"те":148,"тв":71,"ти":150,"та":205,"сс":83,"ст":346,"ур":101,"ус":74,"ул":50,"ун":50,"уд":52,"уб":58,"ть":45,"фо":51," Ga":590," Ge":860," I ":313," Fo":396," Fu":156," Fr":828," Fi":477," Fl":192," Fj":45," Ha":1914," He":1326," Gy":49," Go":366," Gr":850," Gu":492," Gi":307," Gl":139," Ig":87," Id":463," Dü":69," Hy":59," Hu":297," Ho":962," Hi":1244," Ji":117," Je":475," L ":50," Ja":1269," Iv":146," Ir":247," Is":445," It":383," Im":143," In":1484," Io":55," Ii":222," Il":189," M ":95," Fö":134," Fü":63," Ka":3502," Ke":1459," Ki":1200," Kh":87," Jo":986," Ju":831," N ":212," Gü":52," La":1495," Le":1398," Hä":127," Li":2733," Kl":320," Kn":46," Ko":2402," Kr":947," Kv":67," Ku":1637," Gö":60," Ky":76," Mc":62," Ma":3863," Hü":108," O ":93," Mi":1232," Dž":49," Me":1354," Lo":1146," Ly":58," Hõ":54," Lu":546," Ne":1115," P ":83,"а ":653," Na":1003," Jä":515," Ni":722," Mo":1366," My":96," Mu":873," A ":257," B ":117," C ":205," Ap":221," Am":1400," An":1581," Ak":327," Al":2024," Ai":275," Aj":171," Ag":212," Ah":170," Ae":126," Af":163," Ac":133," Ad":404," Aa":1075," Ab":516," Ba":1515," D ":91," Az":50," Av":171," Au":835," At":438," As":1512," Ar":1360," Be":1090," Bi":531," Bl":186," Bo":717," Br":891," Bu":586," E ":253," Ca":996," Ce":158," Ci":166," Ch":848," Cl":225," Cr":170," Co":1011," Cu":137," F ":65," Da":635," Di":564," De":805," Dr":153," Do":564," Du":150," Ea":51," Ec":83," Ed":300," G ":69," El":689," Ek":107," Ei":131," Eh":78," Eg":183," Ee":3409," Et":156," Es":505," Er":526," Ep":67," En":453," Em":267," Ex":60," Eu":1052," Ev":105," Fe":322," 
Fa":376," H ":102," Xi":80," Tä":312," Sü":194,"к ":130," Sõ":311," Sö":54,"Ив":73," Wo":210," Wi":583," Wh":45," Sä":63," We":424," Rü":62," Wa":532,"й ":637," Rõ":64," Zu":94,"Ле":48," Võ":412," Zo":49,"Ку":52," Ze":122," Zh":62," Vä":632," Zi":58,"Ко":96,"м ":90," Za":127," Yu":48,"Ка":72," Yo":250," Tü":235," Ya":113," Tö":129," Tõ":184,"л ":110,"Ни":84,"Мо":47,"о ":255," Tš":404,"Ма":85,"Ми":98," Tō":50,"н ":434,"Па":47,"Пе":79,"По":48," a ":162,"с ":91,"р ":253," R ":62,"в ":424,"Ан":87," Kõ":416," Kö":81,"Ал":161," Ou":77," Os":431," Ot":172," Or":497," Oo":46,"Аб":44," Op":127," Po":1235," Pl":283," Pi":1123," Ph":212," Lä":1080," Pe":1250," Pa":2281," Kü":278," Ny":45," Jõ":269," Nu":172," No":1462," Ol":339," Ok":124," On":146," Om":122," Oh":110," Kä":196," Oi":46," Od":87," Of":45," Ob":154," Jü":111," Mõ":229,"Вл":51," Ra":1755," Mü":121," T ":66,"д ":86,"Ви":55," Qu":130," Ro":1766," Re":978," Ri":1126," Nä":145," Rh":72," Lõ":669," Py":57," S ":141,"Бо":74,"г ":58," Pr":1513," Ps":52," Pt":76," Pu":606,"Ва":94," Lü":129," Mä":338," Sy":110," Sv":156," Su":1464," St":1223," Ta":4345," V ":98," Pä":659," Th":744," Ti":535," Te":1863," Tr":722," Ts":168," To":951," Nõ":702," Ru":426,"Ге":81,"Гр":45," Sa":3543," Nü":45,"е ":239," Sh":511," Si":1350," Sc":495," Se":3040," So":1682," Sp":263," Sk":219," Sl":182," Sm":84," Uu":347," Va":3308," X ":95,"и ":220," Ve":2347," Rä":65," Vi":1878," Vl":115," Vo":452," Vu":47," Tu":679," Tv":68," Ty":76," Põ":1331," Pö":45," W ":53," Pü":263," Ud":58," Uk":200," Ul":110," Um":60," Un":417," Ur":135," Us":122," Ut":61," ja":20477," l ":68,"ь ":130," io":82," im":299," in":4488," il":865," ii":156," is":1859," it":246," fü":370," ka":13902," fö":77," m ":441," ki":4980," ke":11286," jn":167," jo":514," jm":152," jt":94," ju":2953," ha":2640," he":1467," gl":111," gr":524," go":49," k ":146,"ы ":90," ib":56," dü":205," id":1072," ig":443," hi":1143," ho":971," hu":967," jä":3184," ni":6713," nd":118," ne":1909," na":1142," p ":48," mu":3133," mo":1838," mm":67," ok":758," ol":7096," om":1854," on":34245," oh":172," kä":1164," oj":83," of":466," ob":937," jõ":1996," nu":369," nt":126," no":1262," nn":71," le":1944," lk":65," hä":549," li":6989," n ":104," la":5115," kv":246," ku":9586," km":1544," kl":989," kr":1633," ko":10755," me":3580," dž":65," mi":14942,"я ":380," hü":385," ma":7824," lu":810," hõ":577," lo":3309," ae":363," af":69," ag":306," ah":192," aa":4258,"Ст":49," ab":842," ac":46," ad":192," am":855," an":2331," ap":575," ai":1079," aj":1910," ak":483," al":5568," av":698," au":1797," ar":3514," at":197," as":4137," d ":153," ba":664," 가가":46," bi":566," be":162," bo":152," bl":78," bu":241," br":141," ca":117," e ":147,"х ":65," b ":66,"т ":156,"Ро":57,"Се":84,"Со":64," er":2077," et":1438," es":2246," en":2378," em":344," ep":75," ei":852," el":2672," ek":498," ef":88," ee":3011," eh":5575," eg":105," fe":136," fa":280," eu":119," ev":98," fu":315," fr":185," fo":525," fl":88," fj":44," fi":1214," bä":181," ge":720," ga":319," i ":993," cm":54," co":181," ch":69," da":137," do":440," dr":125," de":2173," eK":291," di":908,"ч ":585," ed":721," eb":183," du":101,"ль":338,"ма":140,"ме":79,"ми":161," vö":80,"лл":50,"ло":200," vü":80,"ла":291," zo":50," zu":50,"ле":285," võ":8473,"ли":217,"кс":169,"ко":414," vä":4759,"ка":276,"ки":323," tõ":919," tö":1012," tü":520,"йс":53,"ия":126," tä":3294,"им":129,"ин":324,"ик":203," sõ":2172,"ил":197," sö":81,"ии":45,"ий":374,"ич":632,"их":108,"ит":98,"ир":115,"ис":145," 
sü":5410,"ри":296,"рк":48,"рн":83,"ро":406,"ра":317,"рг":121,"рд":63,"ре":208,"пр":87,"по":93,"па":67,"пе":55,"ос":221,"ор":399,"оп":51,"от":68,"ок":58,"ол":330,"ом":142,"он":240,"ой":87,"ов":918,"ог":103,"од":117,"ое":100,"об":67,"ны":111,"нт":92,"нс":154,"но":345,"нн":108,"нк":74,"ни":279,"не":99,"нг":60,"нд":193,"на":391," tš":115,"мо":119,"ге":109," ru":423," nõ":449,"ги":57," u ":56,"го":156," sa":6645," nü":110," se":7005,"гу":45," sc":47," si":2589," sh":125," sl":105," sk":246," sp":570," so":1701,"да":103,"ве":150,"ви":646," mõ":2887," mö":122,"вн":105,"во":155," t ":61," mü":545," ra":5080,"вс":81," re":2103," ri":3272," nä":1534,"га":89," ro":1241,"бе":93," pu":1946," pr":4024," ps":246,"бо":55," s ":402," lö":96," px":82," lõ":1781," mä":2882," lü":990,"ва":236," os":2510,"ад":144," ot":471,"ае":61,"аз":52," oo":592," op":302,"аб":45,"ав":150," or":1834,"аг":45,"ам":81,"ан":559,"ай":127," kõ":2868,"ак":68,"ал":255," kö":73," pe":3640," kü":1892," pa":3723,"ас":154,"ар":332,"ат":153," pl":542," po":5154,"ая":206,"ба":89," pi":4298," lä":2715," rü":649," rõ":177," y ":67," rö":76,"ив":76,"иг":52,"ид":49,"ие":98," sä":314," x ":107," va":12420," ve":4280," uu":1024," vo":1317,"за":70," vu":102," rä":537," vi":1606," vk":47," vm":99,"ет":178," pü":607,"ес":139,"ер":394,"ео":53,"ен":381,"ем":79,"ел":192," pö":222," põ":3044,"ек":180,"ей":122," tu":3292,"ее":74," us":346," um":867," un":274," uk":117," ul":323," uj":60," ta":4816,"др":169,"до":115,"ди":121," sy":46," st":1273,"де":141," su":5118,"ев":407," tr":1088," ts":358,"ед":85," to":2603," pä":1847," th":387," ti":918," te":10216," Õp":48," Õh":47," Õi":101," Ök":64," Ü ":59," Ül":578," Üh":851," är":293," ää":835," öe":60," ök":219," õl":119," õi":938," õh":408," õp":648," õn":74," öö":76," ür":128," ül":2321," üm":508," ük":1493," üh":2633," 가":51,"가":148,"ōky":44," Šv":165," Šo":91," Ša":47," ža":50,"د ":87,"ة ":66,"ان":46,"ال":176,"ر ":45,"ي ":68,"ن ":97," アア":51,"AS ":102," ко":46,"BA ":64," по":46," Ро":57," Пе":79," Па":47," По":48,"가가":97," Ст":49," Се":84," Со":64,"AO ":53," Ан":87," Ал":161," Аб":44," Ва":94," Бо":74," Ви":55," Вл":50," Ге":81," Гр":45," Ка":71," Ив":73," Мо":47," Ни":84," Ко":95," Ку":52," Ле":47," Ма":83," Ми":98,"Fel":59,"Fer":121,"Fil":156,"Fin":145,"Fir":68,"Fan":92,"Fal":44,"Fai":50,"Era":62,"Eri":232,"Est":182,"Ern":93,"Esi":185,"Eur":956,"Eva":52,"Ehi":44,"Ele":225,"Ela":50,"Eks":56,"End":51,"Ena":97,"Eng":50,"Ene":66,"Emm":77,"Ema":72,"Eli":126,"Ell":63,"Ent":53,"Ger":95,"Geo":499,"Gen":130,"Gio":51,"Gil":46,"Ива":71,"Öko":66,"Gan":54,"Gal":104,"Gam":62,"Gar":101,"Gab":56,"Flo":75,"Fla":65,"Fra":377,"Fri":218,"Fre":186,"Foo":47,"Fon":48,"For":124,"IP ":48,"II ":579,"His":301,"Hii":583,"Hil":89,"Hin":100,"Hel":430,"Hei":240,"Hea":65,"Hen":145,"Her":270,"Haa":140,"Hab":68,"Hal":209,"Hai":81,"Han":256,"Ham":105,"Har":682,"Hau":60,"Gus":86,"Gua":87,"Gui":119,"Grö":69,"Gre":178,"Gri":80,"Gra":216,"Gru":170,"Gro":104,"Gol":71,"Got":58,"Gor":73,"Ing":355,"Inf":74,"Ini":62,"Int":306,"Ins":151,"Ill":72,"Ind":377,"Imp":71,"Iis":118,"Iir":94,"Ida":408,"Hum":47,"IV ":90,"Hor":121,"Hoo":71,"Hom":80,"Hon":86,"Hol":356,"Arg":115,"Arh":59,"Are":119,"Arc":52,"Ara":180,"Arm":142,"Arn":51,"Ark":65,"Ari":84,"App":49,"Apo":82,"Ate":50,"Atl":232,"Asu":1049,"Ast":103,"Ass":139,"Ase":89,"Art":168,"Arv":91,"Aru":68,"Ava":88,"Aut":116,"Aus":402,"Aug":149,"Bai":96,"Bak":55,"Bal":343,"Ban":110,"Bab":69,"Bad":75,"Bar":269,"Bat":57,"Bas":115,"Bau":47," 
пр":54,"Aar":73,"Aas":446,"Aaf":295,"Aad":67,"Abr":47,"Abe":61,"Aba":83,"Abd":79,"Abi":49,"Ada":90,"Adv":74,"Ado":56,"Ade":47,"Aer":49,"Aeg":46,"Age":54,"Afg":70,"Ain":50,"Air":92,"Al ":56,"Aja":140,"Aka":119,"Akt":63,"Ala":283,"Alb":201,"Alg":178,"Ali":85,"Ale":472,"Alf":50,"Alu":59,"Alt":115,"All":206,"Alp":102,"Ame":1051,"Amb":81,"Ama":69,"Ang":173,"Ani":70,"Ana":136,"And":351,"Ans":55,"Ant":450,"Ann":191,"Bus":55,"Bul":94,"Bur":160,"Bud":75,"Мих":58,"Bru":68,"Cal":198,"Cam":87,"Cas":93,"Car":284,"Cat":48,"Can":91,"Bea":78,"CH ":77,"Ber":414,"Ben":187,"Bel":194,"Bil":84,"Bis":73,"Bir":85,"Bio":136,"CO ":51,"Bla":86,"Bre":116,"Bra":266,"Bro":108,"Bri":271,"Ник":72,"Bol":76,"Bon":63,"Bor":154,"Bos":66,"Bou":71,"Õig":83,"EE ":44,"Det":56,"Des":76,"Dev":50,"Del":63,"Dem":159,"Den":66,"Dep":48,"ан ":122,"Dan":112,"Dar":75,"Dav":137,"ай ":47,"Chr":190,"Che":117,"Chi":112,"Cit":59,"Cla":86,"Cen":74,"Cha":324,"Cri":46,"DV ":45,"Clu":51,"Cor":172,"Com":152,"Col":177,"Con":247,"Cou":57,"FA ":50,"ади":61,"аев":60,"Ege":46,"Egi":118,"али":51,"аль":66,"ано":90,"Ees":3340,"анд":117,"ани":46,"Edu":80,"Edw":63,"Ede":52,"FC ":88,"Dia":62,"Dis":117,"Dio":47,"Dig":58,"Die":54,"Div":45,"ая ":201,"Dre":44,"Dra":54,"Пет":51,"Doy":59,"Don":122,"Dom":77,"Doo":55,"Dor":65,"Nee":123,"Nen":81,"Nel":76,"Nei":118,"Nev":47,"Neu":75,"Net":49,"Nat":157,"Nii":60,"Nig":95,"Nic":90,"Nim":92,"Nik":161,"Jär":285,"New":291,"Nap":53,"Nar":263,"Nan":47,"Nag":44,"Nad":109,"Jõe":111,"Jõg":96,"Jää":129,"OS ":46,"Nov":157,"Nor":978,"Noo":108,"Але":136,"Kär":84,"Obe":69,"Jür":85,"Ott":71,"Ote":47,"Kõi":90,"Kõr":209,"Oli":46,"Ole":65,"On ":49,"Oma":92,"Olü":53,"Ope":62,"Ora":58,"Osc":88,"Osa":64,"Ord":51,"Ori":67,"Org":113,"Ost":56,"Osl":81,"Oss":50,"Lää":780,"Ple":51,"Pla":187,"Pin":124,"Pil":73,"Pik":86,"Pii":306,"Pir":78,"Pih":114,"Pie":94,"Pho":46,"Phi":103,"Läh":64,"Lät":200,"Pea":138,"Ped":50,"Per":283,"Pet":350,"Pen":99,"Pel":65,"Pee":136,"Pat":127,"Pas":71,"Par":721,"Pav":48,"Pau":155,"Paa":107,"Pad":62,"Pan":241,"Pai":107,"Pal":313,"Kül":163,"Pak":101,"Lõu":592,"Pto":68,"Pun":149,"Pur":44,"Pue":49,"Puh":59,"Puu":79,"Pro":404,"Pri":219,"Pre":257,"Pra":604,"Pol":248,"Pom":45,"Poh":46,"Pot":67,"Pos":114,"Poo":305,"Por":187,"RO ":68,"Mär":85," ال":145,"Män":102,"Вла":51,"SA ":770,"Вас":49,"Raa":184,"Rad":71,"Rai":65,"Rah":536,"Ram":65,"Mün":49,"Ran":128,"Rak":98,"SD ":47,"Mõn":60,"Mõi":70,"Isa":72,"Ise":51,"Ita":340,"Isl":190,"Ira":139,"Iva":115,"Jac":79,"Jaa":509,"Jar":46,"Jan":141,"Jam":143,"Jal":74,"Jak":74,"Jen":64,"Jer":96,"Jea":76,"Jee":58,"KP ":62,"Jos":138,"Jor":57,"Joo":51,"Jon":103,"Joh":467,"Joa":63,"Jug":45,"Juh":63,"Juu":106,"Jur":60,"Jul":218,"Jum":146,"Föd":118,"Kaa":255,"Kad":123,"Kab":68,"Kai":254,"Kah":71,"Kag":66,"Kam":188,"Kal":368,"Kak":46,"Kap":63,"Kan":466,"Kau":239,"Kat":178,"Kas":307,"Kar":687,"Ker":134,"Kes":554,"Ket":49,"Ken":126,"Kel":51,"Kem":44,"Kei":251,"Keh":55,"Kee":118,"Kir":431,"Kit":75,"Kin":160,"Kiv":96,"Kil":83,"Kih":48,"Kii":162,"Klo":53,"Kli":49,"Kle":53,"Kla":98,"Klu":55,"Koo":227,"Kon":442,"Kom":339,"Kol":258,"Kos":162,"Kor":274,"Kop":81,"Kog":69,"Kod":97,"Kok":46,"Koi":60,"Koh":156,"Kr ":310,"Kot":63,"Kre":312,"Kra":186,"Kri":267,"Kro":87,"Kru":51,"Kui":148,"Kul":317,"Kun":277,"Kur":369,"Kuu":241,"Kva":53,"Lev":81,"Les":48,"Lep":73,"Leo":109,"Len":251,"Lem":45,"Lei":135,"Leh":50,"Lee":284,"Lea":56,"Lau":185,"Le ":49,"Lak":57,"Lai":145,"Lag":77,"Lah":103,"Lae":57,"Las":93,"Lar":49,"Lap":96,"Lam":60,"Lan":154,"Lad":56,"Laa":70,"La ":89,"ML 
":59,"Lib":76,"Lie":81,"Lih":74,"Lig":47,"Lii":1363,"Lil":58,"Lim":58,"Lin":573,"Lip":63,"Lis":111,"Lit":56,"Liv":63,"Lut":62,"Luu":51,"Luk":48,"Lui":44,"Lud":75,"Luc":57,"Lou":126,"Los":88,"Lot":49,"Loh":45,"Lor":56,"Loo":276,"Lon":195,"Lom":53,"Lok":47,"NA ":78,"Mei":81,"Meh":138,"Men":72,"Mel":110,"Mes":92,"Mer":322,"Met":201,"Med":80,"Mee":102,"Man":356,"Mal":230,"Mar":1023,"Mas":191,"Mag":212,"Hüd":45,"Mad":279,"Maj":106,"Mak":124,"Mah":61,"Mai":109,"Mac":68,"Maa":607,"Max":73,"Mau":82,"Mat":240,"Mod":67,"Moh":45,"Mol":99,"Mon":346,"Moo":103,"Mos":352,"Mor":136,"Mot":44,"Mih":126,"Mik":120,"Mic":227,"Mit":88,"Mis":98,"Mil":150,"Min":245,"Mul":70,"Muh":59,"Muu":174,"Mur":95,"Mus":281,"Tän":126,"Täh":115,"лав":45,"лад":53,"ль ":49,"TÜ ":47,"ääm":126,"ääl":395,"ään":2004,"ääk":111,"ääd":51,"ääg":344,"ääb":251,"äät":82,"ääv":158,"äär":2288,"ääs":242,"ää ":101,"Sõn":78,"кса":105,"Sõj":66,"Sõr":64,"ков":99,"кол":81,"кий":216,"Wor":93,"Wol":71,"Wil":257,"Win":131,"кая":100,"Wei":45,"Weh":50,"Wes":117,"Was":59,"War":74,"Wat":77,"Wal":182,"йск":48,"ко ":58,"Vor":103,"Voo":53,"Vol":205,"Vis":61,"Vit":59,"Vla":113,"ная":59,"ое ":83,"ндр":145,"ой ":73,"Väl":103,"Väi":301,"Väs":46,"Vär":64,"ов ":254,"мир":58,"мов":45,"ман":65,"Yor":178,"You":47,"Töö":118,"Tür":179,"льн":72,"на ":209,"льс":44,"三三":54,"Tõn":45,"лов":104,"лив":45,"лен":58,"лек":145,"ра ":49,"Sve":99,"Suu":805,"Sur":85,"Sul":92,"Sup":46,"Sun":86,"Sud":52,"Str":229,"Stu":106,"Sto":184,"Sta":395,"Ste":226,"Tee":206,"Teh":108,"Teg":101,"Tea":178,"Ted":50,"Ten":61,"Tem":174,"Teo":71,"Tei":326,"Tel":114,"Tam":134,"Tan":148,"Tat":66,"Tar":920,"Tav":135,"Tai":140,"Tal":1376,"Tag":78,"Taa":310,"Tab":46,"Tad":48,"Ta ":667,"ори":54,"оро":72,"Ska":93,"Shi":143,"She":124,"Sha":138,"Sim":113,"Sil":212,"Sii":184,"Sih":45,"Sig":69,"Sis":180,"Sir":66,"ост":84,"Sin":138,"Sie":46,"Sib":150,"оль":64,"Ser":245,"Sev":74,"оло":46,"оли":53,"ола":72,"Sep":56,"Sen":68,"Sel":786,"Sem":74,"Sei":87,"Sed":193,"See":1098,"Sea":119,"TV ":56,"äb ":221,"Spa":44,"Spe":59,"Spo":69,"Sof":45,"Soc":51,"Sot":162,"Sou":50,"Sol":103,"Som":77,"Son":96,"Soo":896,"TO ":49,"Slo":106,"äga":245,"äev":922,"ова":78,"äet":56,"äes":339,"äea":55,"äed":62,"äel":380,"äht":1060,"äid":319,"äib":55,"Nõu":525,"ähe":2126,"ähk":56,"ähi":637,"Nõm":138,"äge":296,"ägi":746,"äe ":449,"äbi":596,"ный":73,"SV ":478,"Rus":63,"äda":144,"Rum":92,"äde":48,"änn":112,"änu":150,"Sag":67,"Sai":131,"Sah":102,"Sak":1209,"Sam":213,"Sal":247,"Saa":614,"Sab":44,"Sad":48,"ämm":58,"äna":797,"äni":236,"äng":1228,"äne":1640,"änd":528,"äpp":49,"äps":237,"Sco":47,"Sch":374,"äol":71,"Sav":86,"Sau":94,"Sar":178,"San":281,"är ":134,"äit":1511,"äis":253,"äir":146,"äin":438,"äil":189,"äik":1090,"äij":45,"äig":275,"äie":224,"äiv":81,"ове":46,"älu":94,"ält":126,"ови":380,"äli":683,"älj":1874,"овн":48,"овс":48,"äme":148,"ого":52,"äki":100,"TA ":65,"älg":52,"äld":112,"äle":277,"ävi":138,"SI ":60,"Res":61,"äva":99,"Rev":69,"Näi":96,"нов":187,"ор ":52,"Ris":101,"Rii":508,"Rin":80,"Ric":153,"Rid":55,"ärg":1735,"äri":1473,"ärj":460,"ärk":656,"ärm":139,"ära":1650,"ärd":69,"äre":1128,"Ras":57,"Rau":107,"Rap":154,"äsi":369,"äsk":115,"äse":53,"нко":46,"он ":72,"ärv":1633,"ärn":371,"äru":76,"ärt":890,"ärs":325,"ärr":78,"ätm":48,"äti":288,"ätk":94,"äst":200,"äss":64,"äsu":88,"Rec":53,"Rei":185,"Reg":77,"Ren":63,"Rel":79,"Rep":45,"ätt":141,"äts":56,"Rea":67,"Roh":92,"Rob":131,"Roc":60,"Ros":150,"Rom":120,"Roo":904,"SS ":97,"SO 
":46,"нск":92,"сан":105,"Vab":795,"Vaa":45,"Vai":341,"Vah":167,"Vel":86,"Ven":1754,"Vee":94,"сил":46,"ски":218,"ска":112,"ско":106,"Vas":243,"Van":479,"Val":835,"Var":232,"Vih":45,"Vig":101,"Vii":222,"Vid":54,"Vic":79,"Vie":71,"Vir":448,"Vil":366,"Vik":195,"Vin":105,"сов":48,"Ver":155,"Ves":127,"Ukr":194,"Ung":127,"Uni":210,"VR ":44,"Uus":233,"Uur":54,"Uue":46,"Tve":67,"Põl":225,"Põh":1046,"Püh":181,"рий":44,"рис":45,"Pür":45,"ров":215,"VI ":61,"Ter":274,"Tes":65,"The":470,"Päi":111,"Tho":103,"Pär":428,"Tih":59,"Tii":133,"Tim":77,"Pää":51,"Too":201,"Tor":167,"Tol":64,"Tom":75,"Ton":64,"Toi":63,"Tru":76,"Tro":136,"Tri":130,"Tre":108,"Tra":219,"Tur":159,"Tuu":92,"Tul":130,"Tun":106,"ši ":155,"šet":55,"šev":66,"šel":50,"šer":71,"šee":52,"ван":86,"šeh":151,"ša ":68,"šat":45,"Šve":145,"вск":69,"вна":76,"вич":569,"šii":58,"ва ":83,"аси":52,"ато":47,"во ":47,"bje":380,"bja":66,"bis":218,"bit":234,"biv":187,"bio":537,"bip":80,"bir":67,"bik":204,"bil":578,"bim":157,"bin":301,"bii":222,"bij":70,"bo ":72,"blo":54,"ble":274,"bli":550,"bn ":58,"bla":847,"ев ":82,"bod":50,"bok":45,"bol":236,"ей ":78,"bon":137,"boo":67,"bor":247,"bot":147,"bos":56,"bou":65,"be ":205,"bam":130,"ban":365,"bak":284,"bal":373,"bai":111,"baj":47,"bah":73,"bac":66,"bad":310,"baa":286,"án ":48,"bav":70,"bat":316,"bas":493,"bar":1055,"bi ":1023,"bee":78,"bed":61,"bec":51,"ber":2586,"ben":167,"bem":51,"bel":642,"bek":156,"bes":968,"bet":314,"bia":533,"bib":89,"bid":126,"bie":117,"bha":48,"дро":45,"ет ":44,"ca ":308,"car":187,"cas":78,"cat":96,"can":142,"cal":161,"ce ":506,"bri":2225,"bro":86,"bra":389,"bre":198,"bu ":80,"bru":574,"bso":114,"bse":67,"bst":74,"дим":49,"bub":112,"bur":697,"bul":127,"bun":75,"bum":298,"bud":140,"but":54,"bus":333,"дор":45,"by ":122,"aka":1199,"am ":1037,"ake":1191,"aki":1056,"ajo":642,"aju":375,"al ":9437,"aja":7056,"aje":136,"aaž":50,"aik":874,"ail":2117,"aim":1277,"ain":2391,"aio":97,"air":125,"ais":922,"ait":1457,"aiu":220,"aiv":79,"ak ":324,"aig":556,"aie":299,"aid":1846,"aia":317,"ahn":114,"ahk":234,"ahl":63,"ahi":395,"ahj":153,"ahh":236,"ahu":752,"ahv":2279,"ahs":82,"aht":765,"ahr":54,"aho":103,"ahe":4825,"aha":671,"agi":522,"agr":165,"agu":1145,"agn":333,"ago":338,"akä":47,"aol":220,"aok":108,"anu":1617,"anz":132,"ajõ":234,"any":60,"ano":601,"ann":2713,"anm":52,"ant":3838,"ans":1284,"anr":93,"ane":2890,"ang":1938,"ajä":246,"ani":8218,"anj":107,"ank":687,"anl":132,"ap ":65,"ana":2831,"anc":369,"and":7769,"amu":491,"amt":68,"amm":1132,"aml":63,"amo":227,"amp":318,"ams":141,"amj":57,"ami":6164,"adž":93,"ame":2122,"amb":788,"ama":4891,"ao ":126,"alv":587,"alu":1954,"alt":2286,"als":1219,"alr":157,"alp":242,"alo":1054,"aln":583,"alm":1124,"all":6989,"alk":711,"alg":2752,"alh":218,"ahä":77,"ali":9008,"alj":729,"ald":4688,"ale":3211,"alf":70,"ala":5975,"alb":322,"an ":2385,"akv":79,"aks":8379,"akr":533,"aku":957,"akt":1317,"ako":2809,"akp":79,"akk":543,"akl":109,"aba":1901,"abe":658,"abi":1296,"abl":140,"abo":165,"abr":186,"abs":170,"abu":92,"ae ":265,"aca":68,"aab":988,"aaf":943,"aag":470,"aad":3122,"aae":152,"aaj":148,"aak":2014,"aai":1233,"aan":4143,"aal":7268,"aam":1949,"aas":6140,"aar":5151,"aap":765,"aav":886,"aau":47,"aat":3358,"ad ":6207,"ac ":51,"aa ":3615,"ab ":4871,"afo":83,"afr":355,"aft":134,"aff":76,"afe":44,"afi":983,"ai ":831,"aga":2664,"agd":52,"age":905,"aen":249,"ael":441,"aes":76,"aer":147,"aeg":1173,"aee":54,"aed":90,"aek":78,"aeh":45,"ah ":172,"afa":57,"aev":851,"aet":163,"ado":272,"adr":398,"adl":409,"adm":484,"adj":65,"adi":3217,"ade":4451,"ag 
":78,"ads":103,"adu":2361,"adv":107,"aco":80,"ack":227,"aci":112,"ach":400,"ace":151,"acc":48,"ada":2882,"af ":128,"act":96,"azo":46,"azi":122,"гор":67,"Šot":73,"avä":555,"aza":97,"др ":56,"avõ":220,"avö":76,"azz":52,"atä":55,"axi":69,"asõ":365,"asü":91,"atõ":66,"atö":93,"atü":106,"aya":58,"aye":71,"ba ":651,"atš":101,"at ":2216,"amõ":135,"arh":476,"anä":68,"arg":455,"arf":44,"are":3776,"ard":1446,"arc":202,"arb":519,"ara":2982,"amü":61,"arp":144,"aro":486,"arn":587,"arm":512,"arl":676,"ark":1022,"anç":55,"arj":940,"ari":5535,"aru":930,"arv":2193,"arr":543,"ars":766,"art":3541,"au ":240,"asa":1531,"anõ":84,"ary":130,"asi":2682,"ash":208,"asc":73,"ase":3414,"aso":395,"asn":174,"asp":282,"ask":962,"asj":302,"asm":178,"asl":169,"aot":356,"aor":55,"aos":1245,"ar ":2000,"akü":111,"apa":845,"ape":400,"api":685,"aph":50,"apn":98,"apl":208,"apo":517,"app":368,"apr":632,"aps":286,"apt":144,"apu":179,"alõ":58,"as ":16624,"alü":198,"amä":398,"ava":6484,"ax ":62,"aut":985,"avs":163,"avo":221,"avl":84,"avi":977,"ave":501,"ay ":224,"arü":99,"awa":117,"arö":60,"avy":49,"arõ":46,"avu":364,"av ":2085,"ata":5828,"asu":6462,"ast":15872,"ass":1978,"asr":56,"asv":597,"atm":137,"atn":57,"atk":174,"atl":435,"atr":525,"ato":1488,"ate":6661,"ati":3865,"ath":254,"apä":659,"aua":310,"aub":254,"att":452,"ats":3184,"atv":82,"atu":6581,"apõ":81,"aul":1069,"aum":65,"aun":225,"aup":177,"aur":292,"aus":715,"aud":1021,"aue":64,"auf":46,"aug":1111,"auh":299,"auk":196,"Tōk":46,"Tšu":55,"Tši":63,"Tše":232,"Ühi":101,"Ühe":728,"Võr":202,"Või":74,"еев":71,"Üli":294,"Üld":58,"Üle":201,"еви":205,"ени":90,"енн":63,"ерн":50,"ерг":55,"екс":141,"ель":84,"етр":46,"ико":106,"ина":63,"ими":56,"иль":70,"ист":52,"иха":59,"ка ":82,"ив ":46,"ие ":51,"ий ":339,"ин ":106,"ич ":575,"ия ":116,"jee":65,"jer":66,"jek":711,"jel":510,"jem":199,"jen":398,"jes":189,"jet":51,"jev":327,"ji ":46,"jad":794,"jaa":1553,"jab":52,"jat":1299,"jas":1811,"jav":659,"jap":328,"jao":620,"jar":182,"jal":3824,"eük":52,"jak":1121,"jan":2708,"jam":517,"jah":222,"jag":412,"jaj":121,"jai":178,"jaz":54,"je ":209,"jms":73,"jne":167,"jok":49,"joo":1404,"jon":315,"jor":223,"jia":91,"jm ":66,"fää":209,"itn":116,"itm":881,"itl":1066,"itr":232,"ito":1263,"itu":2865,"itt":840,"its":4841,"itz":125,"ity":156,"ipõ":57,"iud":60,"isk":1562,"isj":238,"ism":2164,"isl":320,"iso":452,"isn":380,"üdr":236,"isp":759,"iss":1297,"isr":458,"isu":1149,"üdn":80,"ist":16111,"üdi":214,"isv":570,"iv ":471,"ita":4226,"ite":4040,"ith":114,"ipä":225,"iti":2606,"itj":280,"ivs":528,"ivo":234,"ivn":346,"ivu":91,"ühe":2500,"irü":49,"iwa":47,"ühh":198,"ühi":1384,"üha":330,"iup":58,"ius":891,"ium":1351,"iul":72,"iut":86,"iva":1768,"ix ":96,"ügi":120,"ivi":1653,"ive":618,"üga":160,"ipr":172,"ipo":148,"ipp":426,"ipu":354,"ips":94,"ipt":204,"ipi":209,"ipl":275,"ilõ":45,"is ":17934,"ion":1904,"ioo":5170,"iop":63,"ior":225,"ios":453,"iot":121,"iog":121,"iok":89,"iol":466,"iom":85,"ipa":243,"ikü":95,"ipe":504,"iov":51,"ikõ":47,"ir ":371,"iru":697,"irv":50,"irs":78,"irt":114,"iro":264,"irm":315,"irn":410,"irk":973,"irl":76,"iri":2513,"irj":2489,"isi":2505,"ish":182,"ise":17961,"isc":260,"isa":2662,"üda":146,"iu ":159,"inõ":65,"iqu":72,"übi":94,"imä":330,"ire":897,"inä":160,"irg":342,"ira":660,"ird":488,"irc":67,"it ":2043,"imõ":379,"ünn":379,"üno":56,"ünk":61,"üng":61,"ünd":3848,"üp ":170,"üna":270,"ümp":310,"ümm":93,"ümn":303,"itš":421,"ümi":74,"üme":56,"ümf":54,"ümb":638,"ülr":51,"üpo":95,"üpp":83,"üpe":67,"üpi":189,"ünt":213,"ja ":25408,"ül 
":45,"itü":46,"itõ":50,"itö":136,"isü":229,"ühr":50,"isõ":144,"ühm":571,"üht":317,"üla":1693,"üle":1718,"üld":476,"ülg":296,"üli":561,"가가가":51,"ülj":132,"ülm":94,"üll":158,"ülo":65,"üks":2052,"ivõ":345,"ivä":146,"üki":160,"ize":56,"ükl":190,"ükk":136,"iza":126,"üm ":72,"kii":670,"kih":644,"kik":66,"kij":101,"kim":281,"kil":445,"kk ":305,"kia":138,"kib":147,"kie":46,"kid":1460,"kiv":697,"kin":1384,"kip":117,"kir":3701,"kis":700,"kit":731,"kiu":59,"kja":72,"km ":1345,"ki ":1992,"kha":79,"kj ":53,"kho":116,"kea":512,"ked":219,"kee":6776,"keh":762,"kei":247,"kek":94,"kej":56,"kem":268,"kel":1252,"ken":530,"kep":122,"kes":5621,"ker":962,"ket":387,"kev":86,"füü":427,"ke ":2101,"kra":1232,"kre":1046,"kt ":481,"ksa":2425,"kse":4827,"ku ":4118,"kro":695,"kru":92,"kri":683,"kpa":66,"kr ":45,"kov":236,"km²":193,"kot":103,"kos":501,"kor":2423,"kop":353,"koo":4137,"kon":8471,"kom":1331,"kol":1558,"kok":400,"koj":93,"koh":2395,"kog":2107,"koe":132,"kod":636,"ööp":158,"öör":516,"öös":287,"ööt":635,"ks ":9327,"ööv":121,"ööb":59,"ööd":419,"öög":123,"kpo":74,"kpi":89,"öök":163,"ööm":71,"ööl":141,"öön":269,"kme":409,"kmi":180,"koa":59,"kob":69,"kne":492,"öö ":232,"kku":1792,"kke":775,"kka":353,"kko":768,"kki":577,"klu":516,"ko ":325,"kma":69,"kle":259,"kla":1096,"klo":288,"kli":1510,"jut":1345,"jus":656,"juv":119,"juu":1477,"jul":461,"juk":80,"jun":529,"jum":545,"jur":167,"jub":134,"juh":1325,"jug":89,"jud":292,"ju ":890,"jt ":90,"kav":342,"kat":1523,"kau":1221,"kar":907,"füs":56,"kas":4782,"kap":253,"kan":1568,"kao":86,"kal":1965,"kam":275,"kaj":243,"kak":404,"kah":1019,"kai":1321,"kag":415,"kae":301,"kad":579,"kab":194,"kaa":1822,"ka ":8332,"föd":110,"ha ":1068,"ham":380,"han":1084,"hap":399,"hai":439,"haj":90,"hak":332,"hal":1865,"hau":155,"hav":82,"har":1689,"has":829,"hat":244,"haf":57,"hae":123,"hag":90,"hab":58,"haa":386,"had":171,"he ":2035,"hek":639,"hel":4104,"hei":379,"heg":134,"hee":149,"hed":836,"hea":267,"hev":124,"het":632,"hes":861,"her":737,"hep":112,"heo":122,"hen":4035,"hem":646,"hi ":803,"hhi":366,"hho":288,"hha":319,"hk ":5071,"hig":50,"hie":100,"hid":172,"hic":69,"hib":91,"hia":196,"hip":104,"hio":120,"hin":1818,"him":313,"hil":853,"hik":677,"hii":392,"hiv":50,"his":1677,"hit":1195,"hir":223,"hja":2408,"hje":45,"hju":407,"hka":48,"hm ":244,"hke":354,"hkl":54,"õõd":171,"hkr":47,"hku":160,"hkv":87,"hn ":212,"õõt":409,"õõr":156,"õõs":83,"hla":75,"hle":125,"hli":44,"ho ":78,"hma":426,"gma":96,"go ":269,"gme":81,"glu":61,"glo":135,"gle":108,"gli":2195,"gla":352,"gko":571,"gno":68,"gni":103,"gne":544,"gna":249,"gmi":89,"glü":44,"gs ":53,"gpa":728,"gol":120,"goo":292,"gon":121,"gos":166,"gor":359,"got":49,"gov":53,"ый ":88,"gu ":2469,"gse":607,"gro":161,"gru":407,"gra":1940,"gt ":50,"gri":373,"gre":257,"gto":66,"gug":104,"gui":112,"guk":582,"gum":403,"gul":521,"gua":174,"gub":77,"gue":138,"gud":1131,"gy ":51,"guv":167,"gut":1041,"guu":62,"gur":250,"gus":4478,"gup":144,"gun":555,"guo":121,"gvi":72,"iai":81,"iah":71,"iak":307,"iaj":141,"iam":367,"ial":852,"iao":48,"dün":222,"ian":848,"iap":68,"ias":2407,"iar":200,"iau":97,"iat":663,"iav":133,"ic ":272,"iaa":1098,"iab":61,"iac":55,"iad":147,"iae":81,"iag":258,"ibl":123,"ibi":192,"ibo":94,"ibn":54,"ibr":97,"ibu":187,"id ":9461,"iba":199,"ibe":483,"ia ":7687,"ib ":1767,"iet":453,"iev":106,"iel":576,"iem":193,"ien":617,"ier":409,"ies":329,"iee":79,"ied":249,"iek":65,"ig ":195,"iec":79,"ifu":44,"ifo":208,"ife":183,"ifi":363,"ifa":59,"icr":74,"ics":57,"ict":114,"icu":98,"ico":150,"ick":168,"ici":96,"ich":758,"ice":186,"ie 
":546,"ica":492,"idu":2141,"ids":89,"idr":54,"ido":139,"idm":73,"idn":61,"idl":140,"idi":1042,"idg":52,"ide":8097,"ida":3662,"iib":405,"iia":256,"iif":48,"iig":4040,"iid":2067,"iie":248,"iik":3861,"aši":75,"iin":2367,"iil":963,"iim":756,"iis":1521,"iir":3180,"iip":91,"iiv":1986,"iiu":178,"iit":2428,"il ":6055,"ija":1055,"ije":50,"ijo":86,"iju":112,"im ":1389,"ika":6374,"ige":1448,"iga":2716,"ii ":657,"igl":90,"igm":62,"igh":187,"igi":4211,"igu":1608,"igr":133,"igo":112,"ign":238,"ihe":679,"iha":299,"ihk":182,"ihm":65,"ihh":111,"ihi":253,"iht":687,"ihu":144,"iho":52,"ik ":7328,"imo":225,"imn":64,"imm":144,"iml":60,"ims":252,"imp":192,"idž":109,"ime":7025,"imk":210,"imi":4107,"ip ":128,"inc":157,"ind":2723,"ina":4896,"inb":48,"imt":70,"imu":1850,"inn":6824,"ino":606,"inr":85,"int":1835,"ins":667,"inf":422,"ine":12596,"ijä":97,"inh":82,"ing":10735,"inj":49,"ini":4649,"inl":74,"ink":241,"ioa":102,"inu":4443,"inv":62,"iny":57,"ьев":85,"iko":1851,"ikn":407,"ikm":345,"ikl":1543,"ikk":1983,"iki":2101,"ikh":52,"ike":2715,"ila":1146,"ilb":76,"in ":2156,"ikv":103,"ikt":211,"iku":8444,"ikr":276,"iks":1858,"ilp":103,"ilo":1161,"ill":5963,"ilk":54,"iln":65,"ilm":2925,"ilh":117,"ilj":833,"ili":8126,"ild":409,"ile":1595,"ima":3230,"imb":81,"io ":525,"ils":210,"ilt":420,"ilu":306,"ilv":181,"hol":538,"hom":193,"hon":119,"hoi":327,"hos":103,"hot":60,"hou":76,"hov":70,"hoo":630,"hop":48,"hor":259,"hob":86,"hof":75,"hoe":47,"hod":56,"hni":355,"hno":248,"hnu":57,"hna":53,"hiü":45,"hme":82,"hmi":96,"øya":60,"huk":196,"hul":1525,"hua":81,"htu":834,"htt":114,"hts":526,"htr":54,"htp":46,"htn":93,"htm":114,"htl":468,"hti":872,"htk":120,"hte":1012,"hta":927,"hv ":86,"hst":88,"hu ":482,"hrm":49,"hro":62,"hre":114,"hri":254,"ht ":1267,"hra":59,"hvu":1043,"hwa":51,"hum":318,"hun":86,"hus":694,"hut":421,"hur":174,"huu":56,"huv":197,"hva":963,"hve":131,"hvi":258,"hvk":76,"fi ":130,"ffe":80,"ffi":97,"fes":255,"fer":227,"fen":116,"fek":256,"fel":93,"fia":527,"bän":220,"fga":88,"far":57,"fan":78,"fak":63,"aül":78,"fal":91,"fai":103,"aüh":102,"fac":50,"faa":159,"ff ":69,"fe ":58,"etš":80,"fa ":46,"etü":153,"etö":63,"etõ":90,"eys":46,"esü":78,"exa":132,"ez ":79,"esõ":110,"etä":142,"exi":65,"evõ":561,"evä":227,"ezi":53,"eta":4185,"ete":3476,"eti":2019,"eth":183,"epä":212,"etn":244,"etl":824,"etk":122,"esp":119,"esn":50,"eso":307,"est":15172,"esu":447,"esr":46,"ess":2291,"esw":67,"ev ":1881,"eud":58,"euk":45,"eum":209,"eto":650,"etr":1191,"ets":1555,"ett":1369,"etu":2430,"etv":60,"epõ":75,"ew ":294,"eve":552,"eva":4542,"evo":358,"evk":55,"evi":1332,"eut":162,"eur":217,"eus":372,"ex ":72,"evu":1013,"ey ":327,"erü":78,"epe":305,"epi":674,"elä":86,"eph":149,"er ":4533,"ekü":242,"epa":593,"eot":431,"eos":1290,"eor":477,"eom":115,"eol":658,"eok":113,"eop":87,"eoo":298,"eon":217,"emä":170,"es ":19178,"ept":722,"eps":46,"epu":101,"epl":50,"epp":174,"epo":236,"epr":147,"erk":288,"erl":431,"eri":8620,"erj":357,"erg":1464,"enä":125,"erh":103,"ere":3347,"erf":89,"erc":92,"erd":390,"era":2994,"erb":550,"et ":2344,"emõ":84,"esk":3814,"esl":149,"esm":728,"esh":61,"esi":3721,"esc":105,"ese":2495,"esa":695,"erz":44,"enõ":60,"ery":69,"erv":749,"eru":688,"err":901,"ert":1083,"ers":969,"ern":1144,"erm":1039,"erp":165,"ero":598,"eki":846,"ekk":441,"ekl":132,"eko":1441,"ekr":148,"eks":6278,"ekt":2822,"eku":1116,"ekv":84,"en ":1782,"elb":55,"ela":2671,"eld":1386,"elf":47,"ele":10866,"eli":5612,"elj":466,"elg":374,"elm":278,"eln":482,"elk":721,"ell":3694,"elo":690,"elp":87,"elu":1130,"elv":392,"els":698,"elt":4591,"eo 
":141,"emb":1631,"ema":5853,"eme":1702,"emm":105,"emn":126,"emo":658,"emi":4057,"emj":89,"emk":49,"emu":471,"emp":531,"ems":73,"ep ":59,"enf":67,"ene":6080,"enh":138,"ejä":142,"eng":640,"enb":279,"ena":1669,"end":9301,"enc":218,"eno":318,"enn":1302,"enk":163,"enl":78,"eni":2414,"enj":57,"enu":916,"ens":876,"ent":3483,"enr":170,"enz":141,"ejõ":394,"eny":49,"eog":500,"eod":99,"eob":44,"egl":195,"ego":223,"egn":128,"ege":2106,"egi":861,"eha":1078,"egr":208,"egs":221,"egu":1523,"ehn":541,"ehm":64,"ehk":5017,"ehr":86,"eho":88,"ehe":590,"ehi":903,"ehh":383,"ek ":209,"eib":48,"eic":64,"eia":95,"eht":613,"eip":60,"eis":4050,"eir":116,"eim":514,"eil":387,"ein":1247,"eii":45,"eik":96,"eie":63,"eid":2255,"eig":71,"eja":948,"el ":10310,"eit":602,"eiu":148,"eiv":45,"ejo":57,"eje":77,"eke":406,"eka":1625,"em ":2077,"eju":213,"gjo":58,"öta":380,"ötl":187,"ötm":68,"gji":57,"gl ":95,"öst":232,"git":756,"gis":1905,"gir":78,"giv":225,"gil":1090,"gim":411,"gij":262,"gik":642,"gip":428,"gin":524,"gio":325,"gid":469,"gie":61,"gib":455,"gih":62,"gii":53,"gig":185,"örl":72,"gia":2005,"örs":50,"ght":152,"öra":67,"örd":139,"gha":121,"öri":170,"ös ":62,"gga":46,"gi ":5321,"öpm":55,"ör ":108,"gen":1194,"geo":730,"get":321,"geu":170,"ger":573,"ges":288,"gev":1367,"gh ":77,"gee":427,"ged":631,"geb":241,"gei":271,"gem":670,"gel":1762,"gek":61,"gej":286,"öni":115,"gda":70,"önd":159,"gde":51,"ömi":48,"ge ":1988,"gaz":58,"öli":74,"gab":187,"gad":306,"gai":51,"gaa":657,"gas":1340,"gar":467,"bür":50,"gat":540,"gav":357,"gak":106,"gaj":153,"gam":272,"gal":712,"gan":2019,"öko":300,"gap":72,"ga ":9681,"öid":47,"ögi":100,"öel":65,"fur":44,"fta":81,"fti":45,"fun":331,"ft ":139,"fra":194,"fre":107,"fri":412,"fro":53,"for":851,"fos":114,"fot":258,"fon":233,"foo":243,"fol":120,"fla":75,"fli":57,"fo ":79,"fid":70,"fic":72,"fie":44,"fii":98,"fil":1891,"fik":301,"fin":213,"fir":227,"fis":69,"fit":113,"fjo":141,"õla":152,"õle":240,"õlg":110,"õli":251,"õlk":134,"õll":298,"õlm":435,"õlt":294,"õlu":48,"õlv":413,"õju":446,"õkk":88,"õne":668,"õng":76,"õna":926,"õnd":49,"õnn":136,"õni":203,"õnk":86,"õnu":108,"da ":7682,"õmb":51,"õmm":206,"õpe":513,"õpi":263,"õpu":202,"de ":10486,"õpp":472,"dad":930,"daa":286,"dab":1585,"dak":424,"dal":2308,"dai":94,"daj":449,"dag":307,"dae":79,"dat":1521,"das":1349,"dar":435,"dap":141,"dao":265,"dan":602,"dam":1690,"dav":923,"õri":60,"õrj":81,"õrk":196,"dde":47,"õrg":1625,"õra":75,"õrb":100,"õrd":333,"õt ":51,"õrv":475,"õru":187,"õrr":267,"õrm":46,"õda":213,"õde":121,"cul":68,"ctu":66,"õe ":1023,"cto":118,"cti":143,"cy ":83,"õet":187,"ões":152,"õel":416,"õen":137,"õdu":141,"cus":131,"cur":49,"õhk":139,"õhe":49,"õhj":2576,"õhi":1112,"õge":171,"õgi":631,"õi ":5148,"õja":1187,"õiv":218,"õit":583,"õis":2033,"õim":1468,"õik":622,"õie":258,"õid":332,"õig":1596,"õib":513,"õhu":552,"cks":55,"cki":151,"ckh":122,"cla":52,"cle":64,"co ":192,"con":154,"col":137,"com":86,"cor":129,"cos":67,"cot":61,"cs ":57,"öd ":110,"ct ":57,"cra":52,"cro":105,"ödi":95,"öda":96,"öde":377,"õst":108,"cci":47,"õu ":118,"õsa":163,"õtu":46,"õtt":671,"cea":61,"õte":379,"õta":58,"õtm":273,"õtl":100,"õtj":101,"õus":427,"õut":47,"ch ":578,"ces":126,"õua":73,"cen":106,"õue":77,"õud":589,"õug":83,"cel":80,"õuk":759,"õul":55,"õun":1596,"ci ":105,"õva":82,"õve":54,"cha":386,"chw":86,"õtü":68,"chu":131,"cia":158,"ck ":487,"cie":98,"che":615,"chl":138,"chi":411,"cho":153,"chm":45,"chn":57,"chs":85,"cht":224,"chr":50,"cis":100,"cin":92,"cm ":52,"cke":185,"cka":54,"õzs":53,"ed 
":5646,"eba":352,"ebe":121,"ebi":253,"ebl":44,"ebo":77,"ebr":684,"ebu":73,"eab":210,"eaa":424,"eag":132,"eae":122,"ead":2747,"eak":483,"eaj":63,"eai":99,"eah":87,"ean":733,"eao":48,"eal":1914,"eam":934,"ear":190,"eas":399,"eap":99,"eav":225,"eat":919,"eau":103,"eb ":2987,"ea ":655,"efi":173,"efo":256,"efa":50,"efe":291,"eff":54,"ei ":1558,"ega":4991,"eft":49,"eej":166,"eek":1260,"eeh":72,"een":1833,"eel":7467,"eem":2642,"eeb":812,"eea":50,"eeg":511,"eed":1212,"ees":3817,"eer":4899,"eep":225,"eev":114,"eet":2444,"edi":900,"ede":2096,"eda":2449,"eg ":436,"edu":707,"edo":157,"edr":231,"eck":136,"ech":210,"eci":98,"ee ":3170,"ef ":57,"ect":151,"eco":81,"dwi":78,"dvu":88,"dwa":83,"dy ":101,"dve":99,"duv":409,"duu":64,"dur":309,"dut":208,"dus":9107,"dva":107,"dor":276,"doo":134,"don":412,"dom":146,"dol":150,"dok":340,"dow":80,"dov":88,"dot":70,"dos":91,"dr ":97,"ds ":128,"dmi":448,"dmu":204,"dne":328,"dni":181,"dnu":82,"dsu":120,"dso":79,"dte":403,"dun":212,"dum":683,"dul":361,"duk":455,"õbe":86,"dub":375,"dua":181,"dud":1109,"dri":1541,"drh":50,"dra":532,"dt ":79,"dre":399,"du ":1711,"dro":385,"dru":114,"dsi":69,"dsa":56,"dse":425,"dha":80,"dge":90,"dgl":50,"dic":84,"did":1413,"dia":1187,"der":1254,"des":3846,"det":1487,"dev":307,"deb":74,"dea":150,"ded":103,"def":181,"dee":433,"deg":743,"dei":268,"del":2624,"dek":762,"den":1386,"dem":499,"dep":286,"deo":437,"di ":4266,"dle":97,"dla":684,"dko":283,"dki":67,"dme":789,"dma":248,"do ":334,"dlu":270,"dli":419,"dja":309,"div":260,"diu":154,"dim":305,"din":5058,"dio":737,"dip":162,"dir":107,"dis":3167,"dit":1143,"die":158,"dif":137,"dig":325,"dii":121,"dik":678,"dil":754,"dka":57,"dju":74,"eKr":290,"rgu":1208,"rhe":141,"näd":70,"rha":144,"näg":90,"näh":356,"näi":1177,"rhi":386,"när":92,"rho":51,"näo":109,"rga":2008,"ri ":5455,"rgk":63,"rgl":133,"rgi":2791,"rgh":68,"rge":1513,"rgs":57,"rgr":60,"rgo":122,"rgm":113,"rgn":141,"ret":637,"res":3334,"rev":282,"reu":99,"müü":230,"rfa":51,"rfe":44,"rfi":65,"rfo":51,"rdu":636,"rds":305,"rdr":70,"rg ":613,"reb":76,"rea":973,"ree":2061,"ref":247,"rec":94,"red":566,"rei":666,"reg":889,"rem":1488,"ren":1721,"rek":855,"rel":1476,"rer":134,"reo":148,"rep":269,"rda":591,"rcu":58,"rdo":137,"rdn":203,"rdk":56,"rdm":90,"rdl":149,"rdi":1642,"rde":1014,"re ":3338,"rbu":230,"rbr":301,"rch":187,"rce":98,"rd ":981,"rao":70,"rap":222,"mür":81,"rar":113,"ras":2388,"müt":281,"rat":2331,"rau":610,"rav":669,"rbi":388,"rbl":48,"rbo":167,"rba":241,"rbe":348,"raj":969,"rai":575,"rah":2331,"rag":255,"ran":3582,"mün":66,"ram":1127,"ral":1912,"rak":1264,"rab":296,"raa":4053,"raf":126,"rae":588,"rad":970,"rac":160,"rpu":103,"rpr":48,"rpo":141,"rs ":316,"rpe":48,"rpa":108,"rr ":55,"rpi":60,"ror":118,"ros":627,"rot":969,"rom":675,"ron":932,"roo":2596,"rop":430,"rou":140,"rov":1089,"row":56,"rob":239,"roa":117,"rod":447,"roc":250,"roj":238,"roi":109,"rol":630,"rok":255,"rof":315,"roe":51,"roh":391,"rog":576,"rno":131,"rns":89,"rnu":569,"rna":702,"riü":55,"rež":141,"rne":1143,"rni":528,"rjä":49,"rmo":236,"rms":58,"rmu":175,"ro ":398,"rma":1727,"rme":627,"rmi":820,"rlo":160,"nää":74,"rli":332,"rld":67,"rle":265,"rla":365,"rn ":318,"rgõ":53,"rkv":211,"rku":265,"rkt":136,"rks":128,"rkm":44,"rko":1059,"rki":431,"rkk":66,"rke":333,"rka":216,"rm ":305,"rju":905,"rjo":50,"rja":2847,"rje":754,"riz":44,"rl ":207,"rip":347,"rio":550,"rir":293,"rit":4861,"ris":3546,"riv":668,"riu":963,"rih":61,"rig":747,"rij":404,"rii":5626,"ril":3640,"rik":3181,"rin":3123,"rim":2490,"ria":2118,"rib":484,"ric":609,"rid":1612,"rie":633,"rif":87,"rk 
":868,"rtü":56,"rug":66,"rue":78,"rud":172,"ruc":52,"rup":396,"run":460,"rum":712,"rul":170,"ruk":513,"ruu":777,"ruv":117,"rus":1487,"rut":261,"rva":1548,"rvi":795,"rve":1159,"rvp":144,"rvl":46,"rvu":1089,"rwa":50,"ry ":413,"rsk":243,"rsi":519,"rso":171,"rsc":53,"rsa":171,"rsh":62,"rse":561,"rta":274,"rv ":755,"rst":515,"rss":168,"rsu":190,"rtl":107,"rtm":58,"rtn":92,"rto":217,"rte":1543,"rth":281,"rti":1255,"rub":87,"rua":490,"rts":1053,"rtr":65,"rtu":1315,"rtt":57,"rt ":810,"rro":167,"rri":784,"rre":476,"rra":2127,"ru ":1080,"rry":114,"rru":164,"saa":4775,"sab":140,"sad":632,"sae":148,"sag":380,"sah":226,"sai":500,"saj":656,"sak":2002,"sal":1925,"sam":2292,"sbe":92,"sap":131,"óni":51,"san":1221,"sau":105,"sat":1244,"sas":2183,"sar":1496,"sav":213,"sa ":3636,"ón ":53,"rze":44,"nõu":458,"nõr":64,"nõl":71,"nõi":66,"rvü":53,"sha":261,"sho":138,"she":80,"shi":370,"si ":5360,"sgr":60,"sja":511,"siv":324,"sju":179,"sjo":145,"sfä":198,"sie":142,"sid":2303,"sic":71,"sib":74,"sia":2206,"sk ":538,"shu":57,"sit":1318,"siu":225,"sir":212,"sis":2918,"sip":251,"sin":1981,"sio":5046,"sil":1686,"sim":1606,"sij":144,"sik":2762,"sih":204,"sii":1362,"sif":138,"sig":349,"sda":47,"sdo":45,"sbo":51,"sbu":115,"se ":21232,"sca":191,"sci":49,"sch":453,"sco":130,"sev":717,"sey":55,"ser":1068,"ses":5796,"set":1978,"seu":254,"nüü":243,"sh ":174,"sfi":86,"sfo":68,"sea":1804,"sei":2222,"seh":77,"seg":1695,"see":2074,"sed":3928,"seb":254,"sep":903,"seo":645,"sen":2101,"sem":2170,"sel":10065,"sek":3272,"sej":332,"spu":121,"spo":772,"spr":203,"spe":493,"spi":353,"spa":687,"skü":66,"sot":518,"sou":51,"sov":164,"skõ":91,"sol":473,"som":95,"son":907,"soo":2060,"sop":124,"sor":585,"sos":135,"sog":196,"sof":91,"skä":80,"soa":50,"soc":67,"sob":104,"su ":776,"sri":388,"sre":70,"sra":250,"st ":19438,"smõ":57,"smä":672,"ss ":961,"sli":1453,"slo":247,"slu":141,"sla":864,"sle":179,"ski":1330,"skk":688,"skj":46,"skm":251,"skl":222,"sko":1611,"skn":98,"skp":86,"sks":70,"skr":240,"sku":1933,"skv":335,"sm ":509,"ska":1203,"ske":1189,"sno":114,"sjõ":49,"snu":95,"sna":125,"sni":235,"sjä":141,"sne":888,"smo":245,"smu":122,"so ":126,"sma":1250,"smi":1155,"sme":256,"swi":78,"ssü":155,"ssö":85,"ssõ":135,"stõ":73,"stö":240,"svä":103,"svõ":164,"svö":80,"sse":2400,"ssa":886,"sso":712,"ssk":48,"ssi":2279,"ssu":260,"sst":65,"ste":12441,"stf":123,"spä":99,"sta":12430,"stm":389,"stn":305,"sto":736,"sti":11558,"stj":94,"stk":190,"stl":1467,"stv":203,"stu":4098,"str":2584,"sts":374,"sua":230,"sud":110,"sub":2124,"suh":654,"sug":926,"sul":1089,"sum":385,"suk":274,"sup":112,"sun":473,"suu":3246,"sut":3768,"sus":4456,"sur":769,"suv":746,"sva":1352,"sve":77,"svi":156,"svo":72,"svu":249,"sy ":64,"tai":1177,"taj":1586,"tak":4223,"tal":2877,"tae":166,"taf":74,"tag":732,"tah":420,"taa":2158,"tab":1730,"tac":49,"tad":1273,"tba":65,"tav":3721,"tau":131,"tat":7322,"tas":2453,"tar":1636,"tap":159,"tao":157,"tan":2647,"tam":2975,"tch":74,"te ":12807,"ta ":4265,"сто":56,"стр":66,"ств":57,"pa ":804,"ста":51,"pe ":459,"par":1885,"küt":108,"pat":370,"küs":137,"pas":347,"pau":44,"pad":102,"paa":872,"pab":98,"pag":125,"pae":47,"pak":331,"kül":2089,"pal":1654,"pai":924,"paj":61,"pap":44,"küm":165,"pan":1223,"kün":49,"phe":92,"läb":569,"pha":122,"lät":92,"pho":86,"phi":121,"läh":1032,"pi ":489,"küü":67,"ph ":74,"lä 
":65,"pea":2994,"peb":54,"pec":50,"ped":165,"pen":309,"per":2019,"pet":866,"pes":209,"pee":735,"pej":56,"pei":56,"pel":221,"pek":193,"pla":985,"pli":241,"lää":1062,"ple":335,"plo":204,"pko":106,"тан":48,"phy":67,"pia":480,"pid":668,"pie":55,"pig":83,"pii":2460,"pik":984,"pil":982,"pim":106,"pin":1660,"pio":63,"pir":110,"pis":249,"pit":216,"por":922,"pop":279,"poo":3433,"pot":178,"pos":489,"poi":69,"poj":56,"pom":75,"pon":250,"pok":78,"pol":2011,"poe":314,"ps ":115,"ppu":86,"ppi":265,"ppl":73,"ppo":76,"ppa":134,"ppe":738,"тер":49,"pme":107,"po ":62,"pni":140,"pne":50,"pp ":583,"psu":187,"pta":45,"pse":289,"psi":131,"pso":44,"ptu":183,"pub":112,"pud":66,"pte":686,"pti":246,"pto":149,"pts":116,"pra":973,"pru":93,"psa":102,"pu ":143,"pri":927,"pre":785,"pro":2923,"psü":231,"lõh":137,"pur":257,"pus":270,"put":156,"pun":658,"pui":197,"pul":391,"puh":552,"px ":81,"тов":47,"puu":696,"тор":53,"lõp":469,"lõi":316,"lõu":988,"тро":97,"löö":149,"lüm":359,"lül":150,"lük":99,"lüh":844,"lüü":275,"mäe":703,"mäg":380,"män":1220,"mäl":265,"mär":1565,"mää":557,"qua":76,"que":158,"qui":88,"mõe":265,"mõi":1422,"mõj":443,"mõl":89,"mõn":454,"mõr":58,"mõt":400,"mõõ":559,"ra ":2421,"möö":180,"rb ":55,"ngo":283,"ngj":112,"eži":152,"ngi":2407,"ngl":2197,"ngk":485,"ngv":53,"ngu":1971,"ngr":466,"ngt":111,"ngs":158,"ni ":8348,"nge":1413,"ngh":131,"nga":1383,"ngd":44,"nho":71,"jäl":102,"jät":125,"jär":2920,"nha":193,"jäi":81,"nhe":51,"neg":216,"nei":559,"nel":854,"nek":392,"nen":1253,"nem":2691,"nep":231,"neo":254,"ner":1383,"net":1372,"nes":1547,"nev":1928,"neu":153,"ndv":49,"ng ":5293,"nea":209,"neb":1735,"ned":454,"nee":1283,"nfi":63,"nfo":399,"nfl":72,"ney":104,"nez":55,"nfe":71,"nco":82,"nci":160,"nce":327,"nch":174,"ne ":18973,"nbu":170,"ndt":47,"ndu":3781,"ndr":1767,"nds":245,"ndn":73,"ndo":567,"ndl":703,"ndm":736,"ndj":199,"ndk":76,"ndi":9574,"nde":2285,"nda":5981,"ncy":54,"nak":847,"nal":1444,"iül":54,"nam":1141,"nan":791,"nao":524,"nap":666,"nar":696,"nac":71,"nad":869,"nae":164,"naf":95,"nag":661,"nah":165,"iüh":71,"nai":421,"naj":121,"nab":249,"naa":1844,"nbe":157,"nd ":4383,"nba":54,"nav":684,"nau":173,"nat":1014,"nas":5608,"na ":9461,"iõp":167,"가 ":51,"nya":69,"jõe":1151,"nyi":68,"jõg":658,"nz ":90,"nsü":49,"ny ":159,"nux":47,"nve":213,"nuk":391,"nul":392,"num":581,"nun":66,"nui":73,"nuj":150,"nus":1407,"nut":211,"nuv":56,"nur":279,"nua":567,"nud":6307,"nto":594,"ntn":44,"ntu":1152,"nts":3190,"ntr":629,"nti":1551,"nth":107,"ntl":53,"nta":1333,"nte":1954,"nsu":194,"nsp":183,"nso":192,"nst":1654,"nss":87,"nse":418,"nsh":72,"nsi":398,"nsk":423,"nsa":519,"nsb":47,"nu ":581,"nri":214,"nra":73,"nt ":1663,"npr":44,"ns ":389,"nod":52,"nob":45,"nog":106,"nof":50,"nok":61,"nol":406,"noi":73,"noo":957,"nop":56,"nom":241,"non":176,"not":137,"nos":276,"nor":508,"nov":777,"nr ":71,"nne":1409,"nna":9091,"nnm":60,"nno":142,"nni":2211,"nnu":1256,"nns":51,"nme":49,"nma":125,"ndž":60,"nli":99,"jää":730,"nn ":2722,"nla":220,"no ":545,"nke":134,"nkl":53,"nki":267,"nka":177,"nku":165,"nko":182,"nks":69,"nkt":721,"nkr":123,"nji":51,"nja":120,"njo":51,"nij":103,"nii":935,"nih":114,"nig":327,"nif":70,"nie":199,"nid":1206,"nic":153,"nib":61,"nia":1970,"nk ":386,"nix":49,"niu":138,"niv":309,"nis":5167,"nit":1025,"nir":49,"nio":206,"nip":222,"nim":5797,"nin":5067,"nik":3655,"nil":1691,"ogs":64,"ogr":1310,"ogu":2080,"ogt":71,"ogi":2325,"ogl":51,"ogo":115,"ogn":110,"oga":201,"oge":329,"oi 
":80,"oht":1247,"kät":121,"käs":428,"kär":67,"ohv":181,"ohu":379,"ohk":157,"ohj":63,"käi":618,"ohi":105,"oho":123,"ohn":161,"oha":1453,"käe":74,"ohe":234,"ois":197,"oir":72,"oiu":71,"oit":264,"oin":102,"oik":52,"oim":998,"oil":59,"oid":535,"ok ":100,"oia":64,"oju":137,"oje":372,"oja":659,"ol ":1399,"oce":44,"och":80,"oci":105,"ock":501,"oco":61,"obs":62,"obu":202,"oe ":109,"ode":1123,"odk":64,"odi":1047,"odo":182,"odn":64,"ods":77,"odr":58,"of ":621,"oda":609,"oel":103,"oeg":287,"oer":126,"oes":47,"oet":688,"oen":399,"odu":2480,"oee":44,"og ":463,"ofi":557,"oft":95,"ofo":118,"off":59,"ofe":202,"oa ":93,"ob ":139,"oan":65,"oam":51,"oal":156,"oak":80,"oaj":111,"oba":165,"od ":481,"oar":70,"oas":64,"obo":116,"obr":390,"obl":782,"obj":309,"obi":400,"obe":361,"jõk":47,"jõu":952,"nza":48,"nze":88,"nzi":54,"kõi":997,"otü":52,"oya":45,"osü":192,"ows":73,"own":56,"kõr":1655,"kõv":81,"oyl":61,"kõl":116,"kõn":322,"otu":561,"ow ":114,"otl":94,"otk":48,"otj":93,"oti":581,"oth":115,"ote":630,"ott":288,"ots":2879,"otr":134,"oto":793,"otm":209,"ost":1657,"osu":161,"ota":435,"otb":45,"ov ":391,"osi":662,"osh":53,"osk":556,"ose":1045,"osf":200,"osp":69,"oss":687,"osm":201,"osl":192,"oso":739,"osn":672,"oy ":48,"owa":44,"owe":70,"ovi":1506,"ovg":80,"ovn":57,"ovo":225,"ovu":110,"ovs":142,"ox ":49,"ova":496,"ove":767,"oug":79,"oui":114,"oul":81,"oun":169,"oup":53,"ous":121,"our":234,"out":116,"opo":355,"opp":88,"opi":377,"opk":103,"opl":127,"ope":659,"oph":152,"opa":1002,"os ":1676,"olü":473,"opu":229,"opr":90,"opt":153,"ops":80,"oon":7646,"ool":6457,"oom":3730,"ooj":448,"ook":1060,"oof":591,"oog":2684,"ood":3903,"oob":808,"ooa":118,"or ":1504,"oov":321,"oot":2071,"oos":2409,"oor":2416,"oop":1217,"ork":293,"orl":106,"orm":1403,"orn":581,"oro":441,"orp":251,"orr":1904,"orc":46,"ord":2640,"ore":923,"orf":173,"org":2150,"ori":3030,"orj":99,"ou ":125,"osa":4585,"onü":106,"ort":1076,"ors":316,"orv":260,"oru":308,"ory":79,"omä":44,"ot ":233,"m² ":198,"omö":62,"orb":147,"ora":643,"ola":1318,"old":420,"on ":39161,"olj":64,"oli":8483,"oll":1578,"olk":349,"olf":126,"ole":2988,"kää":76,"olg":132,"ols":1007,"olt":1144,"olm":1122,"oln":368,"olo":2600,"olu":1563,"olv":65,"ofü":93,"oka":297,"om ":370,"okk":531,"oki":231,"oke":623,"okr":399,"oks":957,"oko":247,"okl":55,"okt":664,"oku":340,"ona":1303,"ond":3491,"onf":139,"one":1232,"ong":640,"oni":6425,"onl":72,"onk":273,"onn":4600,"ono":803,"onp":75,"onr":44,"ons":816,"ont":1242,"onu":172,"onv":154,"ony":92,"oma":3656,"oo ":462,"ome":2088,"omb":266,"omi":1618,"omm":613,"omp":670,"omn":106,"omo":342,"omt":108,"omu":606,"oms":246,"op ":270,"la ":5991,"le ":12620,"lf ":96,"ldg":52,"lde":484,"lda":2949,"ldo":95,"ldn":65,"ldm":61,"ldk":243,"ldj":62,"ldi":712,"ldu":1916,"ldt":53,"lds":102,"ldr":58,"laa":1373,"lab":404,"lac":145,"lad":1522,"lae":732,"lah":1640,"lag":388,"laj":204,"lai":1062,"lal":666,"lak":450,"lan":3578,"güm":58,"lam":1233,"lap":307,"lao":158,"lar":440,"kyō":50,"lat":2238,"las":6761,"lau":1302,"lav":840,"lay":76,"lba":164,"ld ":1107,"lbe":196,"lbi":58,"lbo":45,"lbu":264,"lbr":50,"kvi":101,"kve":117,"kva":933,"kuv":271,"kuu":2381,"kut":1032,"kus":4511,"kur":510,"kup":256,"kun":2636,"kum":1199,"kul":2990,"kuk":399,"kuj":1401,"ky ":44,"kta":184,"kte":370,"ksp":172,"ksu":1368,"kst":539,"ksk":77,"ksj":60,"ksi":1286,"ksh":44,"kso":186,"ksn":51,"ksm":52,"ksl":97,"kub":299,"kud":842,"kug":145,"kuh":220,"kui":2011,"kua":93,"ktr":1077,"kts":955,"ktu":547,"kti":1734,"kto":1139,"gõz":53,"ksü":131,"kyl":45,"lpo":71,"lpe":157,"lpi":167,"lph":47,"ls 
":120,"llü":71,"lpt":60,"lol":75,"lok":121,"lon":328,"lom":526,"lop":192,"loo":6341,"lor":264,"lod":57,"loc":108,"loe":826,"loh":62,"log":273,"loi":81,"los":787,"lot":130,"lov":353,"lkõ":156,"lni":82,"lne":1147,"lob":91,"lnu":266,"lmn":103,"lmi":1561,"lme":771,"lma":2491,"liõ":138,"lmu":534,"lms":87,"lti":513,"lto":103,"ltr":71,"lts":498,"ltu":933,"lud":201,"lub":1064,"lua":79,"lug":367,"lue":62,"lsi":290,"lsk":106,"lso":81,"lss":61,"lst":139,"lsu":210,"lv ":77,"lta":252,"lte":588,"lri":102,"lu ":1464,"lse":1684,"lsa":658,"ía ":44,"lt ":7949,"lra":58,"lhu":155,"häv":119,"lho":73,"här":83,"lhe":124,"häi":143,"lha":82,"lgu":794,"lgs":300,"lgn":56,"lgl":54,"lgr":91,"lgo":86,"lgp":735,"lge":1193,"lgi":247,"li ":8571,"lga":1157,"lfr":46,"lfi":78,"lfa":51,"ley":127,"lex":145,"leu":72,"lev":2102,"les":6911,"let":1229,"ler":467,"leo":191,"lep":522,"lem":2382,"len":1713,"lek":2924,"lel":939,"lei":662,"lej":650,"leh":741,"leg":615,"lef":123,"lee":784,"led":276,"lec":56,"leb":814,"lea":143,"lg ":112,"llp":58,"lls":97,"llu":829,"lly":101,"lo ":301,"lla":3994,"lle":7577,"hää":354,"lli":4723,"llk":46,"llo":330,"lko":770,"lku":106,"lka":667,"lke":205,"lki":380,"lkl":73,"lju":697,"ljo":186,"lm ":614,"lje":847,"ll ":1528,"lja":2809,"lit":3006,"lis":12972,"lir":127,"lip":448,"lio":191,"lin":10479,"lim":654,"liz":82,"liv":165,"liu":274,"lic":156,"lid":1460,"lia":1378,"lib":211,"lk ":259,"lik":8420,"lil":1193,"lii":5503,"lij":133,"lig":496,"lih":443,"lie":227,"lif":188,"hõõ":63,"ma ":4860,"mb ":129,"maa":11983,"mac":100,"mab":268,"mah":237,"чес":55,"mai":1139,"maj":980,"mak":919,"hüd":167,"mad":1582,"mae":114,"maf":51,"mag":417,"hüp":189,"map":146,"mar":885,"mas":3246,"mal":2280,"hüm":55,"mam":324,"man":2399,"mav":524,"mat":3086,"mba":251,"mbl":329,"mbi":269,"mbe":2044,"mbr":1530,"mbo":263,"me ":2925,"mbu":182,"med":1125,"mee":3676,"meg":594,"mea":100,"met":4426,"mev":67,"mep":72,"mes":2849,"mer":2169,"mem":157,"mel":1042,"men":2636,"mei":782,"meh":405,"mek":592,"hüü":97,"lva":713,"lve":678,"lvi":208,"lul":347,"luk":457,"lup":150,"luo":49,"lun":505,"lum":505,"lut":683,"lus":3490,"lur":311,"luv":1330,"luu":781,"ly ":163,"lrü":75,"lvk":62,"hõb":45,"hõi":206,"hõl":310,"mpi":452,"mph":51,"mpe":554,"mpr":66,"mpo":299,"mpl":310,"mpu":89,"mps":61,"ms ":217,"mog":59,"mob":84,"moe":58,"mod":191,"moo":1204,"mon":658,"mok":297,"mom":61,"mol":409,"mov":44,"mor":367,"mos":365,"mot":173,"mpa":252,"msa":75,"mu ":287,"mse":406,"mtu":96,"mud":268,"mub":235,"mst":77,"msu":90,"mso":47,"msi":100,"mte":76,"my ":109,"mur":342,"mus":2450,"mut":357,"muu":2318,"muv":182,"mui":212,"muj":55,"muk":66,"mul":772,"mum":89,"mun":762,"džu":46,"dža":146,"mi ":3313,"dži":112,"mjo":107,"min":5859,"mio":159,"mil":5861,"mim":121,"mir":250,"mis":17764,"mip":65,"miv":72,"mit":2152,"mic":74,"mia":602,"mig":164,"mie":74,"mid":2544,"mik":1090,"mii":356,"mo ":151,"mli":87,"mle":44,"mla":87,"mn ":55,"mko":173,"mka":66,"mm ":329,"mnu":50,"mni":67,"mna":105,"mne":450,"mmy":82,"mp ":57,"mmu":480,"mmi":519,"mmo":72,"mma":633,"mme":654,"tš ":386,"tše":222,"tši":133,"tša":111,"zst":65,"võt":836,"võs":96,"võr":892,"võn":91,"võl":153,"või":7974,"võe":178,"vöö":291,"võõ":64,"vür":134,"zi ":155,"väl":2397,"väh":435,"väi":1090,"väg":389,"väe":751,"zen":74,"zer":96,"ze ":109,"zab":50,"uüh":53,"zan":66,"zar":62,"zon":61,"zo ":48,"vää":478,"vär":503,"väs":44,"zia":44,"zin":86,"yst":121,"ysi":65,"ys ":119,"ylä":47,"za ":111,"tüü":554,"yer":56,"ya ":162,"töö":1589,"tüt":130,"tür":79,"yan":90,"tük":102,"tüh":106,"yn 
":51,"yle":91,"ylo":45,"yne":47,"yin":76,"tõe":238,"tõm":70,"tõl":200,"tõk":57,"tõu":195,"tõs":109,"tõt":177,"tõr":100,"tän":706,"täh":1822,"täi":727,"xi ":91,"süü":243,"xim":52,"tär":60,"täp":255,"söö":193,"süs":1324,"süt":71,"sük":585,"sül":70,"süm":231,"sün":4272,"xan":102,"süd":144,"süh":336,"süg":181,"sõl":377,"sõj":1120,"sõp":113,"sõn":939,"sõd":233,"sõi":351,"sää":83,"ws ":61,"wor":58,"rüü":199,"wer":86,"wen":48,"säi":178,"wit":44,"wig":128,"wic":50,"win":68,"rõh":178,"röö":217,"wa ":114,"wan":110,"rün":141,"rüo":64,"wal":97,"rük":165,"way":55,"rüt":71,"war":196,"rüh":567,"vri":47,"vsu":116,"vst":161,"vse":403,"vsk":184,"vu ":181,"vut":793,"vus":2887,"vud":63,"vum":73,"vuk":112,"vul":243,"vy ":55,"vib":72,"via":291,"vio":50,"vip":54,"vir":136,"vik":721,"vil":657,"vim":530,"vin":1205,"vig":116,"vih":89,"vii":1360,"vic":51,"vid":522,"vie":84,"viv":99,"vit":1084,"vis":990,"vka":64,"vko":169,"vkj":47,"vla":98,"rää":402,"vli":75,"vo ":249,"vms":66,"vne":367,"vna":66,"voj":65,"vol":333,"vok":90,"von":554,"voo":711,"vor":540,"vos":69,"vpa":151,"rän":173,"vi ":1273,"vgo":75,"veo":115,"ver":1595,"ves":902,"vet":333,"vei":261,"veg":47,"ven":2478,"vem":547,"vel":383,"vek":127,"vea":44,"vee":1876,"ved":504,"ve ":854,"val":8945,"vak":669,"van":2026,"vam":407,"vap":226,"var":1747,"vat":2727,"vas":3460,"vav":166,"vaa":1097,"vab":718,"vae":151,"vad":3860,"vai":1190,"vaj":399,"vag":98,"vah":2967,"va ":3296,"uvõ":268,"uvä":105,"usõ":228,"usü":545,"uuk":89,"uun":1174,"uul":3063,"uum":754,"uub":214,"uua":104,"uug":75,"uud":1301,"uue":262,"ux ":63,"uus":2149,"uur":5973,"uup":59,"uuv":44,"uut":1103,"uvi":326,"uvo":58,"uva":2014,"uve":246,"uvu":186,"usl":1435,"usm":668,"usj":225,"usk":1361,"ush":270,"usi":2532,"usf":104,"usg":54,"usd":49,"use":12729,"usc":44,"usa":1306,"uu ":547,"uv ":1217,"usv":989,"usu":604,"ust":10829,"uss":984,"usr":193,"usp":536,"uso":240,"usn":79,"utk":47,"utl":151,"utm":97,"utn":50,"uth":113,"upä":61,"uti":1297,"ute":1328,"uta":4946,"utz":51,"upõ":82,"utt":121,"uts":1366,"utv":93,"utu":2313,"uto":959,"utr":121,"us ":14520,"umä":125,"ut ":606,"urb":400,"ura":699,"urd":430,"ure":1850,"urg":757,"urj":163,"uri":3709,"url":67,"urk":267,"urm":383,"urn":372,"uro":1163,"urr":134,"urs":299,"urt":336,"uru":948,"urv":239,"uol":59,"uot":45,"uor":83,"uos":126,"ukü":84,"upa":311,"ur ":1500,"upi":356,"upe":239,"upo":146,"upp":283,"upr":65,"upl":50,"upu":118,"ump":59,"umu":239,"umi":3810,"umm":293,"umo":96,"uma":2384,"umb":1389,"ume":800,"unt":871,"uns":729,"unu":969,"unk":808,"uni":2914,"uno":48,"unn":618,"unc":54,"und":1856,"una":2903,"ung":491,"ujä":52,"une":1105,"up ":125,"uks":640,"ukr":117,"uku":265,"ukt":617,"uko":1820,"ukk":131,"ukl":81,"uki":384,"uke":268,"um ":1368,"uka":800,"uju":1494,"ulu":2649,"ult":2105,"uls":130,"ulp":70,"ulo":139,"uln":45,"ulm":184,"ull":465,"ulk":645,"ulj":361,"uli":2783,"ulg":972,"ule":2072,"uld":337,"ula":1652,"un ":198,"uid":849,"uie":51,"uig":91,"uil":85,"uim":112,"uin":284,"uis":365,"uht":1356,"uhu":769,"uk ":199,"uiv":119,"uit":151,"ul ":2180,"uja":426,"ugh":62,"ugi":162,"uge":590,"ugo":56,"ugl":46,"ui ":1419,"uga":910,"uhi":560,"uhe":222,"uho":49,"uhk":168,"ugu":2195,"ugr":50,"uha":362,"uct":46,"uda":566,"ude":2760,"udi":687,"udm":209,"ubu":163,"uca":46,"ue ":313,"uci":50,"uch":100,"uck":66,"uet":68,"uer":113,"ues":193,"püü":159,"ufo":48,"ufi":80,"udu":959,"uds":80,"udt":385,"udo":126,"ug ":76,"udw":51,"uee":109,"uen":148,"uel":155,"pöö":270,"ub ":3808,"põõ":56,"ua 
":229,"uat":112,"uas":207,"püs":236,"uar":1172,"pür":52,"uam":44,"ual":222,"uan":196,"ubi":508,"ubj":82,"ubl":189,"ube":338,"uba":529,"ud ":15469,"uak":53,"püh":249,"uai":64,"uad":62,"uaa":347,"tvõ":44,"tvä":73,"tzi":57,"tze":47,"põh":2641,"põi":53,"põl":539,"põr":58,"ty ":254,"trü":150,"tvu":92,"tvo":55,"tve":93,"tva":221,"tur":775,"tus":7900,"tut":545,"tuu":2287,"tuv":618,"tuj":83,"tui":121,"tul":1640,"tuk":346,"tun":1878,"tum":1084,"tup":139,"tub":376,"tua":183,"tud":7345,"tuh":72,"tug":512,"tsü":333,"tz ":121,"tsõ":45,"ts ":1169,"tre":529,"tt ":402,"tra":1815,"tri":3128,"tru":752,"tro":1685,"tu ":1887,"tsa":1214,"tse":5653,"tsc":70,"tsi":9068,"tsj":165,"tsm":159,"tsk":220,"tsl":83,"tso":358,"tsu":2329,"tst":212,"tta":440,"tte":2371,"tti":343,"ttl":46,"tto":143,"ttp":55,"tts":70,"ttu":276,"tme":1276,"tma":236,"to ":641,"tmu":55,"tmo":97,"tmi":756,"tni":437,"tne":300,"tp ":49,"tna":202,"tnu":77,"tno":51,"tof":48,"toe":213,"tod":315,"toc":161,"toi":1340,"toh":46,"tog":150,"toa":50,"tov":120,"tos":269,"tot":150,"tom":456,"ton":1159,"tok":224,"tol":1335,"tor":2218,"too":2528,"top":205,"tkü":52,"tij":48,"tii":1991,"til":2148,"tik":5067,"tif":152,"tie":142,"tih":290,"tig":255,"tir":123,"tit":780,"tis":3212,"tin":1198,"tim":859,"tip":427,"tio":925,"thu":237,"tia":704,"tic":223,"tid":1567,"tiu":100,"tiv":496,"tja":582,"tki":109,"tko":214,"tku":135,"tka":199,"tke":169,"tli":1813,"pää":190,"tlu":1138,"tla":898,"tle":1553,"tem":2042,"ten":994,"teo":974,"tep":235,"tei":2648,"tej":88,"tek":2590,"tel":4094,"tee":4577,"tef":53,"teg":3418,"teh":952,"tea":2198,"tec":57,"ted":509,"tfo":51,"tfa":78,"th ":373,"tev":754,"tet":709,"tes":4048,"ter":4969,"ti ":9417,"tho":135,"thr":60,"pär":1925,"päe":892,"the":496,"thi":183,"päi":92,"tha":245,"ān ":65,"üüp":380,"üür":188,"üüs":557,"üüt":421,"üüa":70,"üüb":92,"üüd":395,"üüg":68,"üüh":62,"üüm":176,"üül":234,"üün":56,"並 ":57,"žik":69,"žis":93,"三 ":165,"žii":61,"žan":88,"丁 ":49,"žaa":84,"ži ":55,"üve":65,"ürs":166,"ürt":60,"üro":72,"ürk":57,"ürg":241,"üri":378,"üre":55,"üra":58,"üs ":51,"ütt":66,"ütu":70,"üti":151,"ütl":183,"ütm":69,"üto":339,"üta":158,"üte":90,"üss":79,"üst":1343,"üsi":1015,"之 ":88},"n_words":[4341644,4941492,4175920],"name":"et"}
\ No newline at end of file
+++ /dev/null
-{"freq":{"٬":970,"پ":57795,"و":425936,"ي":735907,"ً":1698,"َ":4564,"ُ":2202,"ف":144242,"ق":86257,"ك":2176,"ل":186547,"م":383367,"ن":505254,"ه":501741,"ّ":1083,"ِ":3108,"ٔ":10105,"خ":65646,"د":475980,"ج":87595,"ح":43181,"ت":373990,"ث":6476,"ب":289206,"ة":899,"ئ":9436,"ا":1111531,"آ":65724,"أ":1456,"ء":1000,"غ":15714,"ع":91539,"ظ":7379,"ط":45348,"ض":9683,"ص":43813,"ش":310066,"س":370480,"ز":155766,"ر":687985,"ذ":11817,"،":43281,"؛":1040,"۲":32624,"۳":25683,"۰":28783,"۱":63091,"۶":17216,"۷":19045,"۴":17256,"۵":18080,"۸":24510,"۹":42744,"گ":80261,"ک":285107,"چ":25230,"ژ":11180," ،":7698," گ":30777," ک":148021," ۸":2786," ۹":2591," ۱":47632," ۲":17985," ۳":5368," ۴":3375," ۵":3167," ۶":2742," ۷":2925," ن":61607," ه":83469," ل":8629," م":155700," ق":22376," ك":1185," ف":28537," ي":42307," و":134423," ص":10068," ش":162851," ط":22340," ض":1200," ر":51088," ذ":1128," س":118308," ز":19079," ع":23850," غ":6362," ا":284499," آ":62929," ج":29359," ح":15097," خ":24724," د":158860," ب":152331," ت":57718," ژ":5441," چ":17369," پ":40145,"کا ":2349,"کت ":2257,"ژوئ":1909,"ژي ":877,"ژه ":1588,"ژان":2068,"گيت":1091,"گير":2604,"کلا":1036,"کلي":1793,"کوچ":1236,"کيل":3244,"کمي":2269,"کنا":990,"کند":3501,"کنو":1241,"کنن":2076,"کوم":883,"کوه":1843,"کرا":939,"کشف":46287,"کشو":4351,"کست":856,"کزي":3453,"کرد":5509,"کار":7408,"کات":897,"کان":3075,"کاي":1649,"کام":1480,"کال":1004,"کبي":947,"کتر":1843,"کتا":2868,"کتب":2985,"کم ":997,"کل ":1774,"که ":49534,"کن ":1159,"کي ":15027,"کز ":2384,"کس ":931,"گور":1227,"گون":2525,"گوي":1532,"گلي":2047,"گفت":2210,"گست":862,"گري":2038,"گرو":2216,"گزا":1135,"گرد":3447,"گرا":2354,"گرف":2543,"گذش":1619,"گذا":1619,"گار":1384,"گاه":8669,"گان":3798,"گي ":4154,"گر ":3440,"ا، ":2188,"پان":3222,"پاد":1299,"پار":1859,"۹۹ ":1855,"۹۸ ":1731,"۹۷ ":955,"۹۶ ":844,"۹۲ ":854,"۹۱ ":894,"ئي ":825,"ات ":10939,"اح ":1044,"اج ":937,"۹۰ ":940,"پرا":1102,"پرد":1609,"پرو":1607,"اب ":5407,"ئن ":919,"پاي":4002,"پتا":3629,"پنج":5682,"۸۹ ":864,"۸۸ ":992,"۸۳ ":1400,"۸۵ ":947,"۸۰ ":846,"۸۱ ":878,"اي ":55848,"بت ":1678,"اه ":14391,"او ":4256,"با ":13059,"ان ":92173,"اً ":1671,"اف ":1676,"پيو":992,"پيش":2404,"پير":873,"ام ":15467,"ال ":24240,"اق ":2509,"اع ":1667,"اط ":832,"ار ":39983,"اد ":13676,"اش ":1063,"از ":51085,"اس ":3550,"اسک":971,"بي ":8868,"ت، ":2631,"اضي":879,"ارگ":1672,"اصل":2729,"اسپ":971,"اعا":905,"اعت":1127,"اطل":2273,"اعي":1202,"۹۹۹":1558,"۹۹۸":1457,"ادا":1809,"احي":1808,"اخت":6534,"احم":966,"احت":888,"ارب":1131,"ارا":5657,"ادي":8285,"اده":8465,"ادب":902,"بق ":16057,"ادر":1545,"ادش":1271,"ازي":4465,"است":105076,"ازه":1433,"ازن":1098,"اسا":2986,"ازم":1894,"ازد":2915,"اري":13153,"اره":35608,"ازا":956,"ارن":1623,"ارو":2419,"ارم":2814,"ارز":893,"ارس":5553,"ارش":1311,"ارص":1726,"بل ":1849,"ارد":9154,"ارت":4139,"اشن":1730,"ارک":47474,"اشي":1130,"اشد":5423,"به ":41777,"اسم":943,"اسل":2665,"اشا":865,"اشت":3533,"اسي":5578,"بن ":2042,"اسر":1003,"بع ":2965,"ائي":1624,"ابا":1519,"ابت":1082,"ابر":3253,"ابع":2818,"ابي":1834,"ابل":1424,"ابق":1003,"ابو":1132,"اتر":1045,"اثر":1056,"اتي":2176,"اتو":1748,"اجر":1089,"ئيه":1089,"بر ":33260,"آب ":1128,"آثا":830,"آبا":1911,"آزا":908,"آذر":4073,"آن ":10516,"آما":970,"آلم":1164,"آمر":3443,"آمد":1740,"آمو":1076,"آنه":16519,"آور":2786,"آهن":945,"آغا":1416,"چه ":1398,"جزي":1617,"خت ":2202,"حل ":927,"جرا":1045,"جري":3087,"جار":1185,"جاد":1192,"جان":5597,"جام":2439,"جاه":2146,"جاي":1843,"حال":1317,"جنو":3614,"جها":2193,"جمي":1716,"جمع":1997,"جمه":2045,"جمو":2028,"خش ":8012,"حي ":1372,"د، ":4587,"خي ":1696,"دا ":1969,"خه 
":1063,"حسي":842,"حزب":975,"حرم":1392,"حده":1001,"حدو":1445,"جنگ":1615,"جوا":942,"جود":1945,"ختر":883,"خان":4015,"خار":1012,"خاب":828,"حمد":2876,"در ":87864,"دد ":1157,"دن ":19684,"تگا":1213,"خست":1155,"ده ":55594,"دو ":4399,"ر، ":2027,"دي ":13399,"خته":2886,"ختل":1697,"خدا":1090,"دل ":1012,"دم ":1329,"خلي":855,"دان":9861,"دام":1683,"دال":1294,"داي":1703,"دار":14265,"داز":1707,"داس":1080,"داش":2836,"داخ":1028,"داد":4687,"خوا":3065,"خود":3802,"خور":1687,"دبي":1292,"رج ":831,"درگ":1116,"رت ":5024,"رد ":14918,"را ":12544,"دست":3550,"دسا":1658,"رب ":1902,"دشا":1262,"درس":984,"درج":847,"درا":1451,"دري":2646,"درو":1489,"دول":1633,"دون":991,"دوم":3136,"دوي":2445,"دهم":3138,"دوا":1480,"دود":1919,"دور":3332,"دهٔ":2250,"دها":1941,"دهد":1019,"دني":1772,"دهس":4371,"دمي":947,"دند":2383,"ذار":1352,"رف ":16580,"ديو":1392,"ديم":1100,"دين":2614,"ديا":931,"ديد":3091,"دير":977,"رش ":1790,"رس ":3514,"رز ":943,"دما":1034,"دلب":839,"اقي":1204,"افي":974,"اقع":6290,"الل":1166,"ان،":3374,"الي":6506,"الن":1060,"اما":3065,"الم":2177,"اله":1801,"امب":7272,"امر":1231,"الت":2172,"الب":1010,"الا":3840,"الد":1099,"الع":862,"افز":1059,"افر":840,"افت":2627,"اين":37088,"ايل":1631,"ايي":11337,"ايه":1592,"ايس":842,"انگ":3856,"ايش":2420,"ايا":4829,"ايت":2340,"ايج":4814,"انک":896,"ايد":1943,"ايز":1142,"اير":12515,"، ":41769,"بخش":8437,"اپ ":1341,"اهر":1198,"اهش":1637,"اني":16598,"باي":4838,"انه":7584,"انو":5568,"اهد":836,"انق":1002,"اها":4950,"بان":6032,"انن":2230,"بال":3191,"اند":8054,"باد":2479,"باز":4359,"انز":1366,"بار":4430,"انش":5048,"باش":6504,"انس":4593,"باس":1834,"انص":1766,"امن":1026,"انا":2522,"امه":4047,"امو":1040,"انت":3295,"بات":1162,"امي":5640,"انج":1903,"امل":2517,"اوي":1440,"اول":3794,"اور":2685,"الک":1567,"اوت":2392,"اهي":3496,"اوا":1252,"امپ":1281,"تا ":5488,"بسي":1565,"برگ":2294,"ايگ":1002,"بدا":1231,"برا":9676,"برد":2157,"برخ":1084,"برو":884,"برن":1758,"بري":2753,"بزر":3175,"بست":2397,"تر ":6150,"بعد":1118,"بني":1199,"بنا":1507,"بند":1464,"ا ":87899,"ب ":20308,"ح ":3843,"خ ":4523,"د ":139796,"بين":3092,"بيم":902,"بيس":5590,"بيش":2092,"ت ":137024,"بيا":1448,"بير":1134,"ث ":1238,"بور":941,"ج ":5421,"بود":10929,"تي ":8134,"اک ":1290,"تن ":2136,"ته ":15461,"تم ":2196,"ثر ":1178,"؛ ":940,"تبر":18681,"تال":1624,"تبا":2109,"تان":31858,"تام":3922,"تاه":3675,"تاي":3199,"تاد":4918,"تاش":890,"تار":5154,"تاب":4019,"تخا":898,"تحد":1114,"تري":7869,"ترو":1889,"ترا":2202,"ترن":839,"جا ":15939,"تصا":1521,"تشر":878,"ترک":1945,"تصد":3250,"تشک":1423,"تعد":829,"تغي":867,"اژه":1048,"تفا":3357,"تقا":902,"تقو":993,"تهر":2967,"تها":877,"ثار":853,"تند":2837,"تمي":4044,"تلف":1376,"تما":2359,"تلا":1147,"تيم":1139,"تين":1737,"تيا":863,"تون":1309,"تول":2859,"تور":2332,"توس":3860,"توا":4394,"پس ":2219,"تيک":1005,"اکن":1376,"اکت":3167,"اکس":936,"جه ":2283,"جي ":1241,"حت ":897,"پت":4019,"پا":14295,"پس":3568,"پد":1266,"پر":7121,"گ ":9285,"پو":2856,"په":1230,"پن":7042,"پي":8594,"پل":1723,"ک ":69398,"مک":3060,"ي،":6981,"وي":29381,"وو":995,"يب":4572,"يا":96279,"يع":3352,"يز":10370,"نگ":14413,"يس":23115,"يش":9793,"يص":2081,"يخ":3819,"يد":17899,"ير":33847,"يت":13742,"يج":6291,"يح":1015,"نک":2007,"يه":14576,"ين":90729,"وچ":1800,"يو":11558,"يق":4160,"يم":13074,"يل":23315,"يف":3191,"وپ":2022,"يي":14011,"وژ":952,"وک":2822,"پ 
":2440,"وگ":1347,"يچ":1842,"يپ":878,"يژ":1013,"يک":36397,"يگ":5574,"فع":2000,"فض":915,"فز":1100,"فر":14718,"فس":1042,"فد":1178,"فت":17008,"فا":9831,"قع":6601,"قط":1598,"قش":1281,"قس":1226,"قر":7195,"قد":3410,"قت":1605,"في":8057,"قب":1853,"فه":2747,"فو":4734,"فن":1501,"قا":9232,"فل":2016,"فق":1179,"ل،":834,"قي":10959,"قل":3288,"قم":3612,"قه":3627,"قو":3406,"لق":1885,"لف":2767,"لط":1095,"لع":1660,"لد":3211,"لح":1516,"لز":963,"لس":4361,"لر":1340,"لا":28241,"لت":4427,"م،":1074,"لب":3816,"مع":25874,"مغ":887,"مص":1370,"مط":1157,"مف":852,"مق":2922,"مل":8979,"مت":12979,"لو":11064,"مج":4572,"ن،":5004,"لي":31085,"لم":9307,"لل":2270,"له":9135,"مب":9205,"لن":3401,"ما":76186,"مز":2088,"مر":23149,"مش":4307,"مس":6882,"مخ":2590,"مح":8770,"مد":9936,"نظ":3887,"نع":1089,"نط":1891,"نم":4839,"نق":3830,"نف":2887,"نج":25669,"ه،":2861,"مي":78888,"نت":9394,"مو":18704,"مه":13616,"نب":2742,"نا":34982,"من":11960,"نص":3159,"نش":8281,"نس":7940,"نز":3319,"نر":3727,"ند":45085,"نخ":1510,"مپ":1882,"هف":7528,"هل":3945,"هم":15355,"وئ":3203,"وا":39119,"هن":8401,"نه":35756,"هب":1561,"ها":75779,"نن":4681,"هت":1153,"نو":29290,"هج":3131,"ني":39040,"هد":4476,"هز":15706,"هر":26999,"هش":9098,"هس":7389,"هص":1630,"وع":5118,"وق":2152,"وف":3392,"ون":20803,"وه":7280,"ول":18645,"وم":15237,"هي":7368,"وت":7324,"مچ":1311,"هو":6697,"هه":1141,"وب":8255,"ود":37999,"وح":978,"لک":3276,"وج":4297,"وس":20717,"لگ":942,"وز":11168,"ور":41200,"وط":1483,"وض":931,"وص":985,"هٔ":10019,"وش":6987,"چ ":1702,"خو":10471,"دت":1296,"دا":45095,"خه":1201,"دب":2008,"دخ":1393,"خي":3813,"خل":2639,"خط":1338,"خر":3213,"خد":1570,"خص":2448,"خش":9472,"خس":1563,"تگ":2018,"ده":71326,"دو":21727,"ر،":2098,"دي":30900,"دف":1399,"دق":1055,"دل":3645,"دم":4441,"دن":24729,"ذا":2266,"دد":1619,"در":101792,"دش":2498,"دس":6715,"جي":2986,"جو":5371,"حت":1860,"حا":5566,"جن":6990,"جه":5500,"حب":830,"جل":2106,"جم":10111,"جس":1114,"جز":2725,"جر":5927,"جد":2691,"بک":2261,"خت":10589,"حي":3848,"د،":4720,"تک":1874,"حم":4596,"خا":9456,"حو":1605,"حق":1478,"حل":2638,"حص":1359,"حز":1102,"حر":3367,"حس":2640,"حد":3791,"اژ":1490,"تغ":1106,"تف":4099,"تم":9655,"تل":4375,"تق":4035,"تو":18858,"ته":20979,"تن":7644,"ثا":1467,"تج":1247,"تح":3912,"تر":24685,"تخ":2986,"تد":1435,"تش":4284,"تص":5949,"تس":1020,"تع":2873,"جا":32040,"جب":888,"جت":900,"تي":16351,"اک":9717,"ثر":1600,"اگ":2405,"بع":5160,"به":44819,"بن":6697,"بل":4961,"بق":17149,"بخ":9337,"بد":4503,"اً":1680,"بت":3170,"اي":143043,"اه":29729,"او":19244,"بط":1115,"بش":1384,"بز":4117,"بس":4538,"بر":58307,"اپ":4030,"تا":68031,"تب":22518,"بو":16880,"بي":27336,"ت،":2720,"ا،":2256,"ائ":3170,"از":65740,"ار":177821,"اد":40749,"اض":1760,"اص":6315,"اش":15541,"اس":126776,"ات":19285,"ئو":1601,"اب":23054,"ئن":1012,"اخ":9486,"اح":6972,"اج":4954,"ئي":4511,"اث":1612,"اف":9698,"اق":12353,"ام":46608,"ان":164721,"با":53255,"ال":53226,"اع":7941,"اغ":1903,"اط":5875,"اظ":989,"آب":4072,"آث":840,"ٔ 
":10096,"آذ":4112,"آر":1739,"آس":1327,"آز":1293,"آغ":1527,"آف":915,"آل":2867,"آم":7747,"آو":3278,"آن":29292,"آه":1034,"آي":2113,"غي":2298,"عي":7120,"غر":4096,"عل":6634,"عم":5201,"غا":3393,"عن":4842,"عه":5570,"عت":3100,"عد":3656,"عر":22149,"عض":1560,"عا":7561,"عب":3193,"ظا":1618,"طل":3586,"طق":2226,"شگ":3723,"ظر":2221,"شک":5793,"طي":2187,"طه":1305,"طو":2844,"طر":3523,"ضي":1604,"سک":2829,"طا":2296,"طب":17352,"ضو":1678,"سپ":5713,"صل":3684,"صف":2604,"ضر":876,"زگ":1068,"صو":4924,"صن":1205,"ضا":2719,"صي":2649,"شف":46579,"شص":3397,"شش":4939,"شع":1023,"رک":59526,"صد":15627,"صر":2425,"رگ":10959,"شم":40834,"شن":7520,"صا":3027,"شه":20211,"شو":16524,"صت":1929,"شي":10582,"سع":1222,"سط":4899,"دگ":4253,"سف":2299,"رپ":863,"سي":82351,"شت":20229,"رچ":966,"سو":9339,"شب":2714,"سه":5822,"شا":14113,"سن":5406,"سم":5040,"سل":6428,"شر":11318,"شد":71050,"شخ":1686,"زش":2575,"زه":4419,"سب":3293,"زن":6220,"سا":40397,"ست":157670,"زو":2068,"زم":7708,"سد":1583,"سر":10053,"زي":19577,"سخ":1157,"دک":1541,"رس":21469,"رش":6643,"رر":874,"رز":5465,"رص":2328,"رض":1657,"رع":995,"رل":1461,"رق":7164,"رف":22038,"رو":41463,"ره":48555,"زب":4854,"زا":26314,"رن":9102,"رم":13248,"ري":64671,"زر":4516,"زد":7554,"ذر":4784,"ذش":1625,"رآ":1627,"رب":15211,"را":79279,"رت":10552,"رج":4101,"ذي":851,"رخ":3587,"رح":1384,"حک":1290,"رد":33206,"ف ":71140,"ع ":15503,"غ ":1398,"ص ":2346,"ط ":7167,"ظ ":1176,"ر ":213685,"ز ":65089,"س ":20728,"ش ":24544,"ً ":1683,"ي ":278753,"ه ":289561,"ن ":207616,"و ":126016,"ق ":24581,"م ":45325,"ل ":55421,"۳۰":2261,"۳۱":1756,"۳۲":1475,"۲۷":1672,"۲۶":1845,"۲۹":1827,"۲۸":1810,"۲۳":1837,"۲۲":2072,"۲۵":2015,"۲۴":2310,"۴۲":932,"۴۳":828,"۴۰":1179,"۴۱":851,"۳۹":874,"۳۸":3317,"۳۷":1442,"۳۶":1469,"۳۵":1620,"۳۴":1319,"۳۳":1412,"۱۰":3498,"۰۱":1251,"۰۰":7262,"۰۳":960,"۰۲":955,"۰۵":991,"۰۴":954,"۰۷":971,"۰۶":1030,"۰۹":925,"۰۸":1126,"۲۰":6610,"۲۱":2252,"۱۴":2656,"۱۳":8246,"۱۲":3243,"۱۱":2760,"۱۸":4100,"۱۷":2716,"۱۶":2670,"۱۵":2740,"۱۹":21618,"۷۴":853,"۷۳":1099,"۷۶":1037,"۷۵":996,"۷۰":1130,"۷۲":902,"۷۱":971,"۶۸":828,"۶۹":864,"۸۷":1462,"۸۶":1335,"۸۵":1706,"۸۴":1157,"۸۳":1867,"۸۲":1238,"۸۱":1392,"۸۰":1426,"۷۹":1133,"۷۷":1205,"۷۸":1210,"۵۰":1455,"۵۲":836,"۵۱":867,"۵۴":849,"۵۳":863,"۴۴":839,"۴۵":921,"۴۸":900,"۴۹":832,"۶۱":861,"۶۰":1533,"۶۵":892,"۵۷":978,"۵۸":881,"۵۵":896,"۵۶":829,"۵۹":829,"۸۹":1660,"۸۸":1649,"۹۰":1972,"۹۳":2257,"۹۴":1978,"۹۱":1915,"۹۲":1938,"۹۷":3928,"۹۸":5762,"۹۵":1920,"۹۶":2485,"۹۹":9457,"۸ ":9234,"۷ ":8195,"۹ ":8815,"کي":21693,"کس":3863,"کش":52864,"کر":10838,"کز":5951,"کد":1167,"کت":10947,"کو":9889,"که":50638,"کن":11029,"کم":5441,"کل":6076,"کب":1527,"کا":21352,"گف":2232,"گل":4014,"گن":1256,"گه":1120,"گو":7699,"گي":9941,"گذ":3598,"گز":1726,"گر":18517,"گش":860,"گس":1433,"گا":16359,"۵ ":8279,"۶ ":7892,"۳ ":8356,"۴ ":8000,"۱ ":8359,"۲ ":7902,"۰ ":11742,"چي":2258,"چن":2891,"چه":10435,"چو":1224,"چک":1330,"چا":2573,"ژا":3316,"ژه":1974,"ژي":1330,"ژو":2750,"چين":1123,"چني":1160,"چها":6583,"چهل":2101,"چند":1391,"چار":831," ، ":7275," خ ":829," و ":107907," حا":2893," جن":6563," جه":2834," جل":883," جم":4408," جو":1700," جد":1121," جز":2226," جر":1081," جا":5217," تي":2383," اک":4067," خل":1477," خي":1014," خو":9064," دا":22315," خر":1716," خد":1173," خط":1264," حق":954," تک":1140," حم":1000," خا":5629," حو":858," حز":987," حر":1265," حس":1494," حد":1443," به":41159," بن":5266," بل":1919," بع":1479," بز":3261," بس":2873," بر":23110," اي":60506," اه":1491," او":12009," بخ":8934," بد":2040," اق":1653," اف":3495," ال":9180," با":32041," ان":13519," ام":5995," اط":2539," اع":2096," اد":2220," ار":5819," از":47219," اس":93321," 
اش":2136," اص":3471," اب":2944," ات":1821," اث":1151," اج":1670," اح":1812," اخ":1959," تو":11109," ته":3565," تن":2119," تم":1148," تل":1644," تق":2105," تع":1944," تش":1637," تص":1087," تر":7158," تخ":1066," تج":1001," تح":1857," تا":10536," تب":1758," بو":12627," بي":13492," آم":7369," آل":2819," آو":2892," آه":1013," آن":28846," آي":1817," آذ":4109," آر":1633," آث":831," آب":3531," آغ":1486," آس":1300," آز":1278," شک":1757," طو":1797," عض":849," عر":3450," عا":1962," عب":2630," غر":3490," عل":4955," عم":2914," عن":2426," غي":1131," سط":837," سف":999," سي":58158," شب":2526," سه":2789," سو":5501," سم":1003," سن":2452," شا":6389," سل":2036," شر":8538," شخ":981," شد":64662," شص":1903," شش":4604," شو":11010," شي":3635," شم":35388," شن":2904," شه":16707," صد":3342," سپ":4409," صف":1164," صو":2043," صن":981," طب":16842," طر":1891," دس":4487," در":93973," دي":6271," دو":15789," ده":8568," دن":1430," دل":1101," حک":1033," را":15933," رس":3186," رش":1541," ري":3051," رف":1064," رو":19239," زب":3421," زا":2852," رم":1079," زي":3601," سد":907," سر":6965," زم":4334," سب":1207," زن":2927," سا":27260," ست":1210," ۲۱":1529," ۲۰":5779," ۱۹":20860," ۱۵":1981," ۱۶":1865," ۱۷":1946," ۱۸":3370," ۱۱":2053," ۱۲":2491," ۱۳":7504," ۱۴":1848," ۳۰":1169," ۲۸":971," ۲۹":1004," ۲۶":1106," ۲۷":943," ۲۴":1600," ۲۵":1220," ۲۲":1386," ۲۳":1186," ۱۰":2514," ۷ ":1034," ۶ ":835," ۹ ":923," ۸ ":918," پل":1189," پن":6100," پو":1775," پي":6231," پا":11101," پر":5507," پد":1204," پس":3103," لا":2038," مل":2782," مق":2722," مع":22757," مط":1111," مص":1331," مس":5314," مش":3603," مر":12704," مد":3474," مح":8486," مخ":2396," لي":1375," مج":4247," لو":1161," مت":6834," ما":10891," مب":1057," نف":1957," نق":2575," نم":3573," نظ":2990," نخ":1392," نر":962," نز":1060," نس":1428," نش":1872," نص":1002," نا":13232," من":6720," مه":4441," مو":8861," مي":39390," هف":7486," هن":3049," وا":10473," هم":7875," هر":3177," هز":15471," هس":2372," هش":6627," نو":12914," ها":28631," نه":5384," ني":6690," هج":2983," فر":9595," فع":1613," فا":3850," فل":1560," قا":3036," فو":3446," قب":1076," في":4001," قد":1747," قر":6081," قم":3484," قل":1179," قو":1126," يک":22683," وج":1401," هي":1336," هو":2401," وس":1484," وز":1216," ور":1520," ول":1293," مک":1516," وي":5201," يا":14595," نگ":1352," يو":1770," ۱ ":999," ۵ ":990," ۴ ":1026," ۳ ":1037," ۲ ":1134," کت":2907," کر":7960," کش":52338," کل":3525," کن":7370," کم":2467," کو":5973," که":48270," کي":3060," کب":1107," کا":11187," گا":3551," گل":1221," گف":2153," گي":3912," گو":4533," گر":10421," گذ":2142," ژا":2692," ژو":2172," چا":2040," چي":1498," چن":1727," چه":9110,"۶۰ ":1024,"۲۰۰":3965,"۳۸۳":931,"۲۸ ":1132,"۲۹ ":1101,"۲۶ ":1164,"۲۷ ":991,"۲۴ ":1524,"۲۵ ":1126,"۰۰۰":1569,"۲۳ ":1040,"۲۲ ":1199,"۲۱ ":1215,"۲۰ ":1572,"۳۱ ":837,"۱۹۸":3784,"۱۹۹":7313,"۱۹۳":1178,"۱۹۲":829,"۱۹۵":879,"۱۹۴":946,"۱۹۷":2720,"۱۹۶":1373,"۱۳۸":2507,"۳۰ ":1283,"۰۰ ":2532,"۱۲ ":1158,"۱۱ ":1041,"۱۳ ":1130,"۱۴ ":1282,"۱۵ ":1270,"۱۶ ":1284,"۱۷ ":1225,"۱۸ ":1153,"۱۹ ":1122,"۱۰ ":1507,"فر ":1868,"فت ":4170,"قع ":6048,"فار":2458,"فاد":2141,"فاع":864,"فتا":2708,"فته":4673,"فتم":1892,"فتص":1637,"فه ":1283,"في ":1976,"عرو":1419,"عرف":15753,"عرب":2186,"عرا":897,"عدا":1093,"عبد":885,"عات":1421,"عال":2039,"عبا":1366,"عاد":1232,"غرب":3482,"عيت":1749,"عمل":1014,"عنا":1018,"عمو":1947,"عنو":2161,"غان":879,"عني":1446,"غاز":1437,"علا":887,"علي":2182,"علو":970,"عما":1036,"علم":1702,"غير":1279,"صي 
":884,"شما":36739,"شهر":16753,"شنا":4807,"صاد":1085,"شمي":1876,"رکت":2292,"رکز":5506,"رکي":1378,"شور":5763,"شود":8535,"شهو":994,"شون":1223,"شير":1302,"شيد":1499,"شين":1156,"ضي ":873,"رگا":1053,"رگذ":955,"رگر":1703,"رگز":904,"سپت":3558,"سپا":1035,"صول":885,"صور":2307,"ضاي":1046,"صلي":1295,"طي ":1075,"طه ":1047,"ظر ":999,"طبق":15902,"طرا":958,"عت ":1441,"عد ":843,"طلس":913,"طلا":2183,"طقه":1497,"طور":1362,"ظام":977,"شکي":1859,"شکل":1326,"شگا":3095,"عه ":4310,"عي ":3265,"زد ":1250,"ري ":28175,"رن ":1348,"ذشت":1620,"زب ":861,"ره ":40871,"رو ":1622,"ديگ":2478,"ديک":1038,"رق ":1211,"ذرب":3735,"رم ":3802,"رجه":899,"ردم":1119,"رده":3960,"ردن":2001,"ردي":2386,"ردا":4731,"رتب":927,"ربي":3967,"رتي":854,"راک":1311,"ران":23644,"ربا":5086,"راه":2985,"راو":1025,"راي":12001,"ربر":1171,"راف":1247,"رام":1664,"رال":1192,"راب":3402,"رائ":1115,"راح":1002,"رات":2813,"رار":4645,"راد":1759,"راز":1040,"راس":2348,"زش ":1033,"رفت":3769,"سر ":1698,"زي ":8767,"رصد":2148,"رشي":1128,"ست ":82115,"رسا":1284,"رست":10190,"رشت":1369,"رسم":1031,"رسي":3685,"زه ":3212,"رزش":844,"سم ":1194,"زدي":1099,"زده":4341,"ريک":4778,"روپ":963,"ريه":3010,"ريو":908,"رين":6859,"ريل":1861,"ريق":887,"ريا":4939,"ريخ":2752,"ريت":1617,"ريز":1529,"رنگ":1040,"ريس":1451,"روه":2711,"روي":3959,"ري،":1064,"روف":1550,"رون":2388,"روم":1326,"رور":969,"روز":5199,"رود":4045,"رهٔ":1083,"روس":7942,"روش":1626,"رهن":1422,"روا":2639,"زاي":1000,"زان":1010,"رها":4101,"زبا":3526,"زار":18512,"رند":2708,"زاد":3573,"رمي":2432,"رنا":1487,"رما":3636,"رمز":959,"سط ":3314,"رقي":4879,"شد ":26077,"شر ":1296,"شش ":1401,"سن ":1047,"سه ":5126,"زرگ":3314,"شت ":4426,"سي ":13405,"ستگ":1285,"شف ":46304,"زيک":1213,"سري":988,"سرو":889,"سرا":2106,"دگي":1827,"شه ":1221,"دگا":1978,"زما":4471,"سام":2387,"سال":16867,"سان":4639,"ساي":1790,"زند":3098,"ساخ":3059,"ساس":1619,"ساز":3943,"زنا":1261,"زمي":2413,"ساب":1251,"ستا":39209,"ستب":15399,"ستر":2167,"ست،":1386,"ستف":2194,"ستن":2498,"زيا":1067,"ستم":1666,"ستي":2717,"سته":2853,"ستو":1258,"زيس":843,"زير":3410,"رگ ":3861,"صر ":1320,"سلا":2748,"سمت":1146,"شي ":3088,"صت ":1755,"صد ":14363,"رک ":48005,"شرق":5570,"صل ":1169,"شده":27411,"شدن":16377,"شصت":1903,"شرک":1669,"شصد":1494,"ششم":1955,"ششص":1494,"شتا":2276,"سوم":2657,"سوي":1270,"شتر":2340,"شتص":1588,"شبه":953,"شاه":3881,"شان":2849,"شام":1210,"شار":1758,"سند":1687,"شاخ":862,"سمي":1232,"شخص":1423,"سيل":1493,"سين":2125,"سيم":1189,"سيق":1188,"سيو":851,"شتم":2005,"سيص":1797,"سنگ":854,"سيس":2418,"سير":939,"سيد":2026,"شتي":869,"سيا":50847,"شته":5322,"يکا":4258,"يکم":1999,"يکي":12431,"يگر":3114,"يگا":1094,"يچ ":1170,"وچک":1078,"يقي":1448,"يلا":5819,"يلي":2362,"يلم":2163,"يما":2897,"يله":1142,"يلو":2447,"يند":2629,"ينج":15434,"ينا":1201,"يمن":836,"ينت":984,"يمي":2048,"ينو":1227,"يني":2869,"ينه":1667,"يهٔ":843,"يوس":866,"يوا":1011,"يون":3682,"ينگ":934,"وپا":979,"يصد":1797,"يشه":906,"يشت":1412,"يسي":2944,"نگي":1776,"نگل":2493,"يسه":1037,"يسن":1399,"يزي":2379,"يست":11437,"يزه":870,"يسا":894,"نگا":2016,"يره":2645,"يرو":2532,"يري":2723,"يزد":1123,"يعي":880,"يجا":4729,"يتا":2607,"يتي":884,"يده":2751,"يدن":1012,"يدي":1302,"يرا":13163,"يرد":1088,"يخي":912,"يدا":2030,"يدل":878,"ياف":1602,"يال":2762,"يان":11075,"يبا":1219,"يام":955,"ياه":1109,"ياي":3214,"ياس":2249,"يار":48948,"ياز":1404,"ياد":1729,"يات":2044,"ياب":1358,"يک ":14180,"وز ":3949,"ور ":11858,"ود ":25900,"نقش":885,"هٔ ":10011,"نفر":1610,"وش ":1865,"وس ":2204,"نمو":979,"نند":4531,"هاس":15525,"هار":8046,"نما":2997,"ها،":830,"وع ":1666,"نقل":1074,"وط 
":936,"نيز":2529,"نيس":1397,"نير":1331,"نيا":4775,"ني،":840,"نوي":5351,"نون":2065,"نور":1647,"نود":1940,"نوب":3858,"نهم":1851,"نوا":6754,"نوع":1934,"نوش":1776,"نوز":853,"هاي":33269,"هان":4405,"نها":19728,"نهص":1628,"هدا":927,"وف ":1708,"نيک":1404,"نين":1804,"نيم":949,"نيو":887,"هجر":2343,"هست":7247,"وم ":4571,"هري":1954,"هشت":6878,"ون ":8984,"هرا":3739,"هرم":1056,"هره":1437,"هزا":15536,"هرس":8280,"ول ":5820,"وي ":9400,"ي، ":6814,"وه ":3934,"هشم":1388,"هصد":1629,"معي":1766,"معم":1600,"معن":1773,"معر":16933,"معا":1278,"هد ":1760,"هر ":8315,"مقا":1363,"منا":1581,"ناب":867,"مند":1285,"ناخ":1262,"منت":1332,"منط":1759,"ناس":3745,"نار":1067,"ملل":1010,"مله":1083,"ملي":2015,"مهم":1073,"مهو":1618,"موا":1351,"موج":939,"مور":2297,"موز":1395,"مود":1287,"موس":2124,"موع":2058,"نام":12986,"نان":3607,"ناي":2070,"نتش":1291,"نتر":1378,"مون":1614,"مول":1953,"ميل":5431,"ميد":1645,"مير":1559,"ميا":3140,"نتي":1863,"نجم":2428,"نجا":19529,"مين":23588,"ندا":3431,"ند،":1301,"نخس":1148,"ندو":1099,"نده":6796,"ندر":1455,"ندس":875,"ندي":2668,"هل ":2462,"نزد":2165,"هم ":4541,"نسا":1177,"نست":1454,"ندگ":2266,"نسو":981,"نشا":1366,"وب ":4521,"نصد":1688,"وت ":2984,"هي ":3587,"نطق":1746,"نظا":1022,"نظر":1892,"نشگ":2179,"ونا":1554,"ومي":5536,"وند":2535,"ولي":4939,"ومت":2798,"يع ":1026,"ولت":1350,"ولا":2437,"ولد":1454,"يش ":4417,"يس ":3929,"نگ ":4653,"يق ":1173,"ويژ":863,"يف ":1505,"ويچ":897,"مکا":1239,"ويي":925,"وين":2044,"ويه":2289,"ويم":1400,"ويز":853,"ويس":4678,"وير":1085,"وني":3417,"ونه":1782,"وها":1116,"يي ":11355,"يه ":12282,"يو ":1257,"يم ":5175,"ين ":61177,"يل ":7962,"هفت":6908,"هنگ":2686,"واژ":1094,"مچن":1005,"وتب":889,"وجو":2093,"لکت":1073,"هنر":1330,"وار":3450,"واز":2127,"هند":2551,"واد":2014,"همي":4888,"وئي":1673,"واب":2536,"وئن":956,"هما":980,"هور":2961,"وبي":1735,"همچ":1222,"هوا":1704,"واه":871,"واي":1826,"واق":6339,"وال":1232,"وان":9374,"وام":2251,"وري":9228,"وست":8309,"وزه":1349,"وزن":1058,"وزي":1584,"يا ":15452,"وسط":3323,"وسي":3688,"وشت":1878,"يب ":1599,"ود،":1062,"ودر":855,"ودن":1173,"ودي":1606,"وده":3280,"ورا":3355,"ورز":1216,"ورش":1426,"ورد":3566,"ورت":2597,"وره":2981,"ورو":1136,"يد ":7931,"ير ":7783,"يز ":4213,"يت ":7713,"يج ":845,"يخ ":2265,"وعي":1123,"وعه":1942,"لد ":1738,"لس ":1850,"لت ":2709,"لا ":1918,"لب ":937,"م، ":1039,"لف ":1009,"له ":6963,"ما ":2248,"لم ":2584,"قمر":3187,"قلا":1065,"قيق":1249,"قوي":1155,"فعا":1470,"فرد":869,"فرا":3846,"فرو":1305,"فري":1360,"فرم":1232,"فره":1581,"فزا":918,"قه ":2804,"قي ":7279,"فيز":835,"فيل":2814,"قدي":1049,"قرن":906,"قرا":4415,"قال":936,"قاب":1236,"فوت":1061,"فور":2032,"قان":1259,"لله":1097,"لما":2054,"ماع":870,"لمي":1688,"مات":1140,"مار":39512,"لند":1567,"ماد":1634,"لمل":950,"نس ":921,"نش ":1280,"ند ":23182,"ه، ":2793,"مي ":37976,"نج ":1914,"ني ":21211,"مشه":1314,"مرک":5626,"نو ":876,"نه ":11112,"مسي":1271,"ها ":10366,"مست":1149,"مسا":1677,"مري":7089,"مرو":1152,"مرد":2093,"مرا":2087,"مرب":1215,"مدي":1420,"مدر":931,"مخت":1330,"مدا":1385,"محر":1442,"محل":1270,"محم":2301,"ليو":834,"لين":2134,"مجم":2201,"مجل":977,"ليت":1597,"ليا":2311,"متو":1555,"ليس":3144,"ليد":1495,"ليل":1108,"لوي":1612,"لوم":2898,"متر":3763,"متح":1213,"مال":4589,"مام":1588,"مان":14800,"ماه":2044,"ماي":4595,"مبر":7239,"مد ":3240,"لو ":1092,"مت ":2632,"ن، ":4905,"لي ":12663,"مه ":7560,"نا ":1758,"من ":2003,"نت ":1363,"مل ":2783,"لاح":1096,"لاد":4785,"لار":867,"لاب":1067,"لات":2844,"لاق":1169,"لاف":888,"لاس":997,"لاع":939,"لبر":1058,"لاي":1580,"لام":3536,"لان":2170,"لال":908},"n_words":[8069793,10004435,6796528],"name":"fa"}
\ No newline at end of file
+++ /dev/null
-{"freq":{"D":15745,"E":23140,"F":13820,"G":13964,"A":40046,"B":22964,"C":22892,"L":28771,"M":37698,"N":19249,"O":13807,"H":28813,"I":21256,"J":16698,"K":43440,"U":7805,"T":36899,"W":9333,"V":24290,"P":38188,"S":77764,"R":24157,"Y":10043,"X":1821,"Z":1858,"f":29615,"g":68514,"d":158713,"e":1194104,"b":43435,"c":44492,"a":1814181,"n":1349748,"o":934203,"l":885783,"m":433706,"j":314669,"k":691662,"h":239330,"i":1579260,"w":11494,"v":295575,"u":751889,"t":1169051,"s":1099978,"r":475344,"q":1633,"p":248621,"z":7860,"y":250687,"x":8131,"é":2342,"ä":426790,"ö":56679,"š":2606," l":70812," m":89685," n":43615," o":180629," h":39470," i":21438," j":161666," k":172428," d":8045," e":69090," f":6572," g":3193," a":72510," b":5870," c":2712," y":39165," u":13744," t":117334," v":98965," p":89643," s":126171," r":34331," J":16375," K":41900," H":27293," I":17045," N":17648," O":11538," L":26471," M":33953," B":21331," C":19265," A":36093," F":12406," G":12952," D":13481," E":21275," Z":1697," Y":9517," S":71766," R":22636," P":35182," W":8721," V":22294," U":6908," T":34378," ä":2798,"A ":2604,"Da":2655,"Co":4892,"Ch":3205,"Do":1694,"De":2688,"Di":2497,"Fa":1585,"Eu":3024,"Et":2349,"Es":2046,"En":2950,"El":2967,"Ge":2029,"Ga":2319,"I ":2891,"Fr":2159,"Fo":2415,"Fi":2245,"C ":2932,"Au":2202,"Ar":3564,"As":1696,"D ":1819,"Ba":4270,"Am":2679,"An":4463,"Al":7577,"Bu":1605,"Br":4547,"Ca":3954,"Bi":1625,"Be":3488,"Bo":2874,"Ku":5528,"Ky":1627,"Kr":2024,"Ko":6559,"Le":5504,"Hä":4384,"Li":5491,"La":6972,"Lu":2444,"Lo":3355,"Me":4707,"Mi":5018,"Ma":12276,"Mu":3221,"Mo":4384,"Ni":3815,"Ne":4302,"Na":2575,"P ":1899,"No":3154,"Gr":2750,"Ha":6094,"He":6887,"II":1871,"Hi":2221,"Ho":2925,"In":4312,"Is":2133,"It":2586,"Ja":4546,"Jo":4525,"Ju":3085,"Ka":13363,"M ":1871,"Ki":5554,"Ke":4172,"Un":1530,"Tu":4391,"Tr":2204,"To":3873,"Th":5318,"Ti":2566,"Te":3907,"Ta":5984,"St":4721,"Su":13153,"Wi":2315,"Wa":2313,"Vu":2187,"Vi":4932,"Va":6002,"Ve":5654,"Pu":2910,"Pr":3317,"S ":3929,"Pe":5861,"Pa":6975,"Po":6867,"Pi":3820,"Se":20320,"Si":6786,"Sh":1709,"So":3993,"Ru":2993,"Sa":10443,"Re":3883,"Ri":2393,"Ro":4731,"Ra":6455,"b ":4269,"a ":521046,"Yh":4604,"Tä":2310,"i ":187878,"ge":9008,"ga":10572,"fi":6772,"fr":2666,"fo":3746,"hd":23028,"he":38383,"ha":33064,"gn":1824,"gl":5520,"gi":17723,"gh":2779,"gu":2320,"gr":3324,"go":5014,"du":5956,"dy":8630,"g ":5897,"ea":17917,"eb":2211,"ec":3419,"ed":14005,"de":60149,"di":25899,"do":15043,"ds":2160,"dr":3184,"ew":1828,"eu":16676,"ev":21342,"ey":5808,"fa":2058,"h ":5313,"fe":3904,"eh":18148,"eg":3976,"ef":1593,"ee":73047,"el":141223,"ek":30007,"ej":4585,"ei":64027,"ep":7092,"eo":9816,"en":311708,"em":29073,"et":103119,"es":102393,"er":103228,"ca":4693,"e ":102313,"br":3098,"bu":11145,"bo":3540,"bl":1718,"bi":4927,"be":5208,"da":16197,"f ":4183,"cu":1883,"ct":2366,"co":5044,"ck":6770,"ci":3249,"ch":7835,"ce":5416,"c ":2562,"az":1738,"ay":3699,"ba":5388,"d ":12256,"at":82974,"as":103098,"ar":90264,"av":32524,"au":43994,"ak":51088,"al":188447,"ai":159817,"aj":32823,"ao":2348,"ap":28311,"am":44148,"an":226249,"ac":6773,"ad":13018,"aa":139980,"ab":3846,"ag":5956,"ah":19189,"ae":6848,"af":2453,"nu":19006,"nt":79878,"ns":48167,"nr":2630,"np":4065,"no":34422,"nn":76172,"ny":10224,"nv":4829,"oe":6505,"of":6585,"oc":5124,"od":21394,"oa":12733,"ob":3163,"om":49202,"on":220535,"ok":63100,"ol":88347,"oi":101083,"oj":13195,"og":7134,"oh":26450,"kä":31248,"ot":48882,"os":73162,"ov":18707,"ou":20507,"op":25063,"oo":22586,"or":48012,"r 
":15286,"ow":2927,"kö":4127,"oy":1883,"pe":39610,"pa":50151,"pl":2749,"po":19855,"lä":49440,"ph":2309,"pi":40558,"lo":48127,"lm":34905,"ll":141072,"ls":4754,"lp":5969,"lv":6764,"lu":74348,"lt":40738,"ly":10157,"o ":46103,"ma":107323,"mb":5971,"iä":11662,"me":68275,"mi":93628,"mm":26269,"mp":15152,"mo":20189,"mu":39868,"iö":6377,"my":13711,"p ":3699,"na":88104,"nc":4077,"nd":15417,"ne":115628,"nf":1558,"ng":26517,"nh":5001,"jä":31523,"ni":80364,"nj":4418,"nk":37841,"nl":4563,"nm":4904,"ju":18610,"jo":83376,"ki":95289,"ke":65747,"ka":163517,"m ":8821,"ky":14212,"ks":65607,"kt":5970,"ku":90075,"ko":85957,"kr":5885,"kk":48142,"kl":2002,"kn":1816,"li":159675,"hä":6582,"lk":30410,"lj":7180,"le":81668,"ld":2991,"la":154382,"lb":9921,"n ":656910,"hr":2420,"ht":42173,"hu":10726,"hj":14036,"hk":3966,"dä":2655,"hi":25267,"hn":1754,"ho":13688,"hl":1555,"hm":8011,"id":26485,"ic":10236,"ib":2776,"ia":70345,"eä":2738,"ih":17748,"ig":6859,"if":2884,"ie":58968,"hy":6074,"k ":7419,"ir":40208,"is":262822,"it":142893,"iu":5141,"iv":30487,"ii":85125,"ij":23513,"ik":78594,"il":97254,"im":65870,"in":288307,"io":39520,"ip":11864,"je":25016,"ji":8870,"l ":13054,"ja":144917,"tä":77533,"sä":41776,"vy":5926,"y ":23865,"wa":2667,"rä":7609,"vi":49369,"vu":43029,"vo":19782,"uv":30695,"uu":71208,"ve":29647,"va":118391,"x ":5327,"ui":22148,"uj":5727,"uk":45990,"ul":59174,"ue":19283,"ug":2138,"uh":10868,"ur":38607,"us":87239,"ut":63413,"um":30283,"un":78305,"uo":96118,"up":18134,"ty":54901,"tu":95699,"tt":139450,"ub":2312,"ua":10701,"ud":16569,"uc":1658,"w ":2157,"to":89637,"tl":2100,"ts":24815,"tr":16390,"te":142275,"tk":13593,"ti":135644,"th":11168,"pä":16825,"ta":258052,"su":49746,"sv":13201,"ss":117492,"st":215124,"sy":12228,"sl":7481,"sk":39138,"sn":1670,"sm":5723,"sp":7200,"so":30621,"sr":2112,"sc":2341,"se":139811,"sh":5275,"sj":2476,"si":171906,"nö":1934,"u ":38729,"sa":155918,"rr":9979,"rs":10921,"rt":23316,"ru":25282,"rv":12255,"ry":7951,"rp":3591,"ro":37223,"rn":7182,"rm":9935,"rl":3938,"rk":30938,"rj":32055,"ri":107434,"rh":7195,"nä":20276,"rg":6568,"re":35490,"rd":7095,"rc":2086,"rb":1938,"ra":66068,"t ":86930,"mä":30120,"s ":74921,"px":2388,"py":4209,"lö":4854,"pt":2602,"pu":33575,"pp":18507,"pr":6312,"ps":3220,"yä":2288,"yö":17077,"vä":26642,"yy":14572,"yh":29899,"ye":9120,"yd":4538,"ya":1921,"tö":12963,"yv":6669,"yt":27603,"ys":23779,"yr":6697,"yp":4560,"yn":15218,"ym":10818,"yl":20580,"yk":18251,"yi":8456,"ä ":141477,"äa":1925,"ö ":8048,"ää":49135,"ät":13038,"äv":12353,"äy":16618,"äm":13914,"äl":20535,"äo":2938,"än":46392,"äp":3824,"äs":18046,"är":23375,"äe":2662,"äi":25110,"äh":12610,"äk":13701,"äj":7869,"öö":1835,"öy":2134,"öt":2960,"ör":2049,"ös":12022,"ön":8537,"öl":2694,"öm":1579,"ök":3180,"öh":1825,"öi":4386," Ga":2281," Ge":2007," Fo":2388," Fr":2153," Fi":2200," Ha":6075," He":6845," Gr":2717," Ho":2907," Hi":2219," Ja":4528," Is":2125," It":2580," In":4291," Ka":13310," Ke":4144," Ki":5524," Jo":4519," Ju":3077," La":6921," Le":5486," Hä":4383," Li":5091," Ko":6537," Kr":2018," Ku":5494," Ky":1626," Ma":12197," Mi":4965," Me":4668," Lo":3341," Lu":2432," Ne":4266," Na":2563," Ni":3790," Mo":4358," Mu":3189," Am":2670," An":4363," Al":7546," Ba":4234," Au":2195," As":1683," Ar":3519," Be":3463," Bi":1601," Bo":2840," Br":4522," Bu":1591," Ca":3824," Ch":3183," Co":4800," Da":2631," Di":2463," De":2676," Do":1656," El":2955," Et":2345," Es":2034," En":2918," Eu":3021," Fa":1558," Tä":2303," Wi":2296," Wa":2289," Yh":4598," Po":6838," Pi":3788," Pe":5834," Pa":6894," No":3136," Ra":6410," Ro":4711," 
Re":3856," Ri":2382," Pr":3297," Pu":2902," Su":13126," St":4422," Ta":5960," Th":5301," Ti":2528," Te":3868," Tr":2184," To":3832," Ru":2990," Sa":10406," Sh":1689," Si":6699," Se":20290," So":3959," Va":5984," Ve":5638," Vi":4890," Vu":2183," Tu":4371," ja":74622," in":3365," il":6233," is":1907," it":4582," ka":41758," ki":21829," ke":20426," jo":60523," ju":13868," ha":11651," he":10326," hy":3213," ih":2463," hi":4639," ho":2199," hu":2999," jä":12144," ni":12782," ne":5502," na":3130," my":11678," mu":27269," mo":4701," ol":30354," om":3788," on":110013," kä":15552," oh":5375," oi":2526," of":2700," ny":2745," nu":2351," no":7612," le":7943," hä":3832," li":11414," n ":4861," la":17994," ku":35062," ky":5412," kr":3070," ko":26681," me":12340," mi":8054," ma":22033," lu":14733," ly":2009," lo":4509," am":2608," an":4850," ai":10388," aj":3176," al":26047," av":3055," au":2714," ar":5107," as":8761," er":7980," et":8124," es":9200," en":13349," ei":4106," el":17426," fi":2104," de":3555," di":1711," ed":3633," vä":7263," ym":1774," yl":9242," yk":6097," yh":19431," tä":4808," ru":4569," ry":2522," sa":23518," se":22713," si":27049," so":8389," ra":14800," re":3082," ri":4303," nä":4180," ro":4580," pu":9566," pr":4112," s ":2297," px":2387," py":2739," mä":2284," os":9956," ot":1666," ov":7670," op":3466," or":1769," pe":20224," pa":14359," po":10381," pi":14437," lä":10166," sä":4588," va":34158," ve":7508," uu":2456," vo":6724," vu":30542," vi":12396," ty":5276," tu":19858," us":4512," ur":1664," ul":2520," ta":38855," sy":5905," st":3683," su":25086," to":18439," th":5224," pä":9746," ti":8862," te":13642," ää":2036,"Ete":1912,"Eur":2720,"For":1614,"Hel":2818,"Int":2022,"Alb":1684,"Bri":2185,"Nor":1531,"Per":1731,"Par":1783,"Poh":2941,"Ran":1941,"Kal":1586,"Kan":2320,"Kau":1621,"Kar":2566,"Kir":1567,"Kun":1618,"Hän":3486,"Mar":3439,"ään":18825,"ääk":4426,"äät":2724,"äär":4190,"ääs":2129,"ää ":10320,"Vuo":2097,"Yhd":3250,"Suo":9477,"Sta":1658,"Sen":4830,"äht":3405,"ähd":1555,"ähe":3536,"ähi":1621,"Ruo":1578,"änn":2773,"äns":2545,"änt":2999,"Sak":1930,"ämi":2674,"äni":2131,"äjä":6711,"äne":3700,"Se ":11672,"äs ":1575,"ämä":7782,"äos":2009,"äka":1964,"äis":10749,"äin":8551,"äiv":1941,"ält":2814,"äli":6173,"älk":2158,"äll":6310,"äks":2395,"äki":2767,"än ":27597,"äve":2052,"ävi":2773,"ärä":1957,"äyt":13595,"äri":6126,"ärj":7258,"ät ":5667,"äsi":4968,"äse":2119,"ärv":3216,"äst":3327,"äss":3767,"ätt":2038,"ävä":6336,"Ven":3558,"Val":1869,"The":3947,"Tur":1761,"ber":2683,"ce ":2305,"bri":1634,"bum":8758,"aka":6269,"ake":7201,"aki":5604,"aji":6912,"ajo":3036,"al ":4672,"aja":18366,"aje":2029,"aih":3713,"aik":14735,"ail":13159,"aim":1862,"ain":38398,"ais":49469,"ait":14001,"aiv":4010,"aid":2730,"ahm":1834,"aht":3966,"ahd":4670,"aha":3293,"anu":4220,"ano":5725,"ann":13433,"anm":2516,"ant":14494,"ans":18785,"ane":3551,"ang":3334,"anh":3197,"ani":13808,"anj":1817,"ank":6571,"ana":14302,"anc":1814,"and":6936,"amm":4533,"amo":2056,"amp":3132,"ami":8254,"ame":4479,"ama":17130,"alv":4153,"alu":15472,"alt":16363,"alo":7802,"alm":6123,"all":43431,"alk":10647,"ali":18156,"ale":9704,"ala":40225,"alb":7609,"an ":106763,"aks":12179,"aku":8003,"akt":1737,"ako":3232,"akk":5053,"ae ":1542,"aaj":5209,"aak":5208,"aai":4807,"aan":46070,"aal":13155,"aam":4542,"aas":4039,"aar":7457,"aav":4444,"aat":11110,"aa ":30258,"ai ":13611,"ael":1895,"adi":4196,"ack":1788,"ada":3189,"at 
":26536,"arh":2305,"are":4235,"ard":3175,"ara":7612,"arp":1607,"aro":2059,"arm":2145,"arl":1651,"ark":11086,"arj":11397,"ari":15453,"aru":3626,"arv":4834,"arr":3315,"ars":2606,"art":6849,"asa":4886,"asi":8287,"ase":5553,"aso":1854,"ask":4888,"ar ":2391,"apa":10451,"ape":2308,"api":2097,"app":6574,"apu":2355,"as ":6418,"ava":19468,"aut":8391,"avo":2634,"avi":5086,"avu":2827,"ata":7640,"asu":5324,"ast":26494,"ass":30398,"asv":4804,"atk":3390,"ato":4227,"ate":6056,"ati":12192,"att":13242,"ats":1884,"atu":3704,"aul":4866,"aup":10709,"aur":1674,"aus":6854,"aud":2731,"auh":2050,"auk":3794,"jel":4529,"jen":8431,"jes":7295,"jet":1556,"ji ":2297,"jaa":5054,"jat":4403,"jas":4641,"jal":7569,"jak":4713,"jan":11104,"jai":10124,"jou":4303,"joh":4795,"joe":2427,"jol":3356,"jok":24849,"joi":16729,"jon":9520,"jot":6559,"jos":6649,"jia":1786,"itk":2090,"ito":6246,"itu":9149,"itt":43813,"its":13426,"ity":9715,"isk":6128,"ism":2973,"isl":1855,"iso":4721,"isp":1964,"iss":30541,"isu":9410,"ist":90841,"isy":1690,"ita":17408,"ite":16460,"iti":5058,"ivo":1659,"ivu":2547,"isä":6664,"iva":11789,"ivi":6748,"ive":3576,"ipp":2283,"ipu":1889,"ilä":2764,"ilö":2966,"is ":8294,"ion":10934,"iop":2837,"ios":4401,"iot":1986,"ikä":1979,"ioi":3973,"iol":2452,"ipa":1808,"ipe":1630,"irt":2993,"iro":1572,"irk":4448,"iri":6484,"irj":10801,"isi":26753,"ise":59209,"isa":5742,"ire":2058,"inä":3666,"ira":5520,"it ":5916,"ja ":90836,"itä":10709,"ivä":3912,"kii":3407,"kik":3240,"kij":2090,"kim":2775,"kil":12288,"kia":4204,"kie":9439,"kiv":2180,"kin":14280,"kio":1639,"kir":13427,"kis":5163,"kit":6283,"ki ":12342,"kea":4096,"kee":4463,"keh":4191,"kei":5212,"kem":2757,"kel":5236,"ken":9401,"kes":10316,"ker":7027,"keu":3655,"ket":2834,"kev":1653,"ke ":2125,"kre":1700,"ksa":4741,"kse":20115,"ku ":3951,"kot":3952,"kou":3762,"kos":5488,"kor":5268,"koo":4082,"kon":11074,"kom":3392,"kol":7289,"kok":7259,"koj":1821,"koi":14746,"koh":3422,"koe":1980,"kku":3944,"kke":3841,"kka":15414,"kko":10268,"kki":11721,"ko ":7803,"jul":11521,"kat":4283,"kau":16540,"kar":4289,"kas":12031,"kap":6608,"kan":20757,"kal":9712,"kam":1710,"kak":4624,"kah":2946,"kai":21386,"kaa":14665,"ka ":40830,"ha ":1581,"han":4990,"hai":2275,"hal":6945,"har":4587,"hah":1631,"haa":1766,"he ":6389,"hdo":2167,"hdy":6548,"hde":8081,"hdi":4168,"hel":4032,"hei":7575,"hee":1552,"het":1965,"her":2750,"hen":6953,"hem":2103,"hin":8148,"his":2803,"hit":5065,"hja":5248,"hje":3366,"hjo":5005,"gle":2192,"gla":2321,"gra":1797,"ial":8370,"ian":13283,"ias":7499,"ic ":1670,"iaa":4991,"ia ":28344,"iet":9042,"iel":9758,"iem":2939,"ien":20960,"ier":2897,"ies":3794,"ied":2257,"ieh":1857,"iek":1604,"eä ":1619,"ich":1751,"ie ":2310,"ica":1971,"idi":2094,"ide":16311,"ida":3705,"iid":1915,"iik":12079,"iih":2036,"iin":34905,"iil":1871,"iim":1601,"iis":3357,"iir":7370,"iip":1650,"iiv":3159,"iit":11463,"ija":15115,"ijo":3212,"ika":17282,"ii ":2634,"igh":2049,"ihe":3769,"iha":1816,"ihm":2273,"ihi":4976,"iht":1902,"imo":4420,"imm":10403,"imp":1982,"ime":14110,"imi":21669,"ind":1904,"ina":17571,"imu":3688,"inn":9244,"ino":7309,"int":21117,"ins":3814,"ine":62207,"ijä":3525,"ing":10847,"ini":7961,"ink":6694,"ioa":2138,"inu":2993,"inv":2235,"iko":8699,"ikk":20680,"iki":6034,"ike":7607,"ila":13660,"in ":123439,"iku":7225,"iks":5910,"ilp":4235,"ilo":6040,"ill":27290,"ilm":11405,"ilj":1888,"ili":8485,"ile":2304,"ima":7267,"io 
":5938,"ilt":4359,"ilu":7728,"hmä":2450,"hol":1937,"hon":2485,"hoi":2109,"hmi":2965,"hmo":1850,"hty":9392,"htu":3085,"hto":2163,"hti":9794,"hte":8765,"hta":4518,"htä":1902,"huo":1796,"hum":3543,"hyv":1693,"etä":4989,"evä":1690,"eta":9667,"ete":9398,"eti":3164,"est":38413,"ess":23541,"eud":1841,"eto":4814,"etr":5809,"ets":2385,"ett":36313,"etu":5501,"ety":2077,"erä":4263,"eve":1697,"eva":7898,"evi":4106,"euv":2196,"eut":3244,"eur":5135,"eus":2033,"esä":2111,"ey ":1541,"evy":4434,"elä":9755,"er ":8358,"eor":2097,"eol":1627,"emä":1894,"es ":8151,"erk":7307,"eri":25262,"erg":2239,"erh":1625,"enä":7730,"ere":5494,"era":4710,"et ":16346,"esk":9110,"esi":14483,"ese":2664,"erv":1561,"eru":10609,"err":4242,"ert":7186,"ers":4983,"ern":2916,"erm":2674,"ero":5460,"eki":2975,"ekk":1967,"ekn":1637,"eko":1843,"eks":9087,"ekt":2330,"en ":216998,"ela":5301,"ele":9915,"eli":28185,"elj":2641,"elm":10985,"ell":36182,"elo":8595,"elu":6803,"els":2931,"elt":10887,"ely":1692,"ema":7235,"eme":2562,"emm":5417,"emo":1546,"emi":5347,"emp":2813,"ene":8165,"eng":2981,"ena":5955,"end":1729,"enn":9973,"enk":6752,"eni":7989,"ens":15471,"ent":18077,"ekä":5887,"ehd":2924,"ehi":4544,"eht":6057,"eis":18168,"eim":3841,"eil":7078,"ein":8482,"eik":4720,"eid":4335,"eja":2716,"el ":2656,"eit":8348,"öss":1852,"gis":3146,"gin":6890,"gia":4299,"ght":1865,"ös ":7736,"gen":2252,"ger":1575,"ön ":5342,"gas":2278,"gan":2134,"fri":1576,"for":2137,"fil":1743,"da ":1964,"de ":3246,"daa":2346,"dal":1725,"das":2350,"dan":3795,"ck ":3175,"ed ":1799,"ean":1991,"eal":1781,"eat":2428,"ea ":2811,"ei ":3830,"een":37770,"eel":9070,"ees":6176,"eet":6503,"edi":2493,"ede":4270,"edu":2200,"edo":1603,"ee ":9590,"dys":6375,"dus":2698,"don":2736,"dol":2034,"dos":3976,"dia":2516,"der":2863,"des":8657,"det":3714,"del":8280,"dek":2312,"den":24947,"dem":1603,"di ":1662,"din":5138,"dio":5021,"dis":6234,"rhe":3181,"rha":2376,"näj":3467,"näk":2091,"näi":1855,"ri ":14183,"rgi":2577,"ret":2238,"res":3703,"nä ":7166,"rea":2509,"ree":2885,"rei":4803,"ren":6049,"rel":2990,"rdi":1756,"re ":2848,"rd ":1882,"ras":4899,"rat":4668,"rau":3008,"raj":2533,"rai":2122,"ran":13051,"ral":4384,"rak":6377,"raa":6078,"rad":3325,"rs ":1794,"ros":3755,"rot":3070,"rom":2805,"ron":4039,"roo":4243,"rov":1686,"roc":2312,"roi":1996,"rna":1810,"rne":1656,"rni":2033,"ro ":2723,"rma":3198,"rme":1785,"riä":3343,"rmi":2119,"rko":8356,"rki":6693,"rkk":7003,"rke":4145,"rka":1913,"rjo":6361,"rja":16764,"rje":7814,"rio":2073,"rit":12816,"ris":12311,"rii":3347,"ril":5111,"rik":9617,"rin":19152,"rim":2324,"ria":8783,"ric":1741,"rie":2399,"näy":2474,"ruo":2205,"run":2801,"ruu":3504,"rus":12209,"rva":2385,"rvi":3707,"rve":3560,"rvo":2466,"ry ":2410,"rsi":3925,"rta":4520,"rto":3696,"rte":2990,"rti":3436,"rtt":1850,"rt ":1812,"rro":1797,"rre":2394,"rra":3831,"saa":11689,"sai":4605,"sak":3935,"sal":8568,"sam":4120,"san":10417,"sat":2386,"sas":5453,"sar":9940,"sav":3217,"sa ":88364,"ryh":2885,"si ":33316,"siv":2877,"sie":3768,"sia":14727,"sit":13812,"sis":15595,"sin":22898,"sio":5975,"sil":7537,"sim":10671,"sij":10631,"sik":6439,"sii":15427,"se ":7339,"sev":3309,"ser":1954,"ses":19353,"set":8009,"seu":4708,"sei":5713,"see":16044,"sen":44413,"sem":6469,"sel":8992,"sek":9571,"spa":2113,"sot":3315,"sol":2285,"son":3580,"sop":1591,"sos":3124,"sod":2739,"sof":1780,"soi":4463,"st 
":1959,"sli":1829,"sla":3001,"ski":7594,"sko":4863,"sku":10246,"ska":6718,"ske":7078,"sma":1538,"siä":2875,"smi":2339,"ssä":20775,"stä":20839,"stö":6003,"syn":3354,"syy":2751,"sse":2037,"ssa":81782,"sso":2391,"ssi":7227,"ste":34759,"sta":74007,"sto":18448,"sti":27582,"stu":19044,"str":3547,"sty":7941,"suk":6559,"suo":12490,"suu":15921,"sut":2418,"sva":8410,"svi":2330,"tai":23079,"taj":9294,"tak":4197,"tal":15729,"taa":33105,"tav":11283,"tau":2602,"tat":3910,"tas":5911,"tar":15686,"tap":4732,"tan":13725,"tam":13647,"te ":4200,"ta ":96137,"pa ":1685,"par":4524,"paa":2641,"pah":2670,"pak":1785,"pal":14678,"pai":9231,"pan":6844,"läp":1604,"län":3940,"läm":2523,"läi":5398,"läh":6121,"pi ":2782,"lä ":21592,"per":21853,"pet":1794,"pel":8616,"lää":3392,"pia":3590,"pid":1828,"pie":3232,"pii":5873,"pil":2184,"pin":5017,"pis":5110,"pit":5090,"por":1632,"poi":2560,"poh":4180,"pol":4180,"ppu":2417,"ppi":4768,"ppa":7267,"ppe":1921,"pro":3960,"pur":1783,"pus":1680,"pun":10901,"puo":7107,"pul":1657,"puh":2589,"px ":2372,"puu":2017,"mä ":7257,"mäi":6703,"män":3876,"mäs":1650,"mää":5076,"ra ":5712,"ngi":7939,"ngl":4558,"ni ":6341,"nge":2486,"nga":3203,"jän":3177,"jäl":4462,"jäs":2729,"jär":9906,"nha":2067,"nei":4215,"nel":5383,"nen":63104,"ner":2732,"net":11819,"nes":5079,"neu":1692,"ng ":3916,"nee":7398,"jä ":5912,"nce":1684,"ne ":6951,"ndo":1777,"ndi":2475,"nde":2171,"nda":1665,"nak":1691,"nal":6875,"nan":9972,"nai":6761,"naa":4824,"nd ":4059,"nat":3023,"nas":6250,"na ":39872,"myö":9305,"iö ":2528,"ntä":3484,"nsä":4958,"nva":1615,"num":1758,"nus":3629,"nut":8387,"nty":5978,"nto":10581,"ntu":4377,"ntt":4440,"nti":13349,"nta":21023,"nte":11336,"nso":2855,"nss":4946,"nse":1616,"nsi":11891,"nsk":3706,"nsa":12409,"nnä":1593,"nt ":1924,"ns ":1897,"nol":2730,"noi":8508,"nom":2361,"non":3636,"not":1919,"nos":4123,"nne":16039,"nna":31714,"nno":6361,"nni":12231,"nnu":4777,"nme":1667,"nma":1873,"jää":2989,"nla":2791,"no ":2124,"nke":2631,"nki":13461,"nka":13056,"nko":2285,"nja":2100,"nii":4930,"nie":2772,"nia":6492,"nis":11071,"nit":5273,"nim":13793,"nin":16562,"nik":3354,"nil":2835,"ogi":4118,"oi ":3679,"oht":5278,"käs":5110,"kär":1779,"ohj":13237,"oho":2246,"oiv":1619,"ois":23808,"oir":1655,"oit":25419,"oin":13137,"oik":5181,"oim":14211,"oil":4642,"oih":1702,"oid":5498,"käy":11169,"oje":6434,"oja":5307,"ock":3160,"ode":9371,"odi":1977,"odo":4669,"of ":2718,"oda":3354,"oel":2434,"oen":2647,"ofi":1682,"kä ":7989,"oa ":4765,"oal":3299,"nyk":2547,"nyt":4401,"nvä":1760,"otu":2486,"otk":3192,"oti":5562,"ote":5798,"ott":10457,"ots":2991,"oto":4571,"ost":14630,"ota":8496,"osi":11836,"osk":3907,"ose":3031,"oss":12304,"oso":3047,"ovi":3135,"ova":10956,"ove":2498,"ouk":4542,"oul":4682,"oun":1887,"ous":2951,"out":1762,"opp":4159,"opi":5727,"ope":4607,"opa":3651,"os ":5287,"opu":2089,"oon":7289,"ool":1601,"oom":1672,"or ":1961,"oot":2091,"oos":2166,"oop":3454,"ork":3647,"orm":3104,"orn":1628,"oro":2190,"ord":2513,"ore":3159,"org":2171,"ori":12794,"osa":15087,"ort":3782,"ot ":3256,"ora":3303,"ola":6203,"on ":146201,"oli":32153,"oll":16426,"ole":10977,"kää":1980,"olt":1744,"olm":4749,"olo":5842,"oly":2026,"olu":4176,"oka":27584,"okk":2554,"oki":3937,"oke":2183,"oks":6588,"oko":8846,"oku":9166,"ona":6281,"one":7607,"ong":1915,"oni":10795,"onk":9689,"onn":20892,"ono":2022,"ons":4191,"ont":4754,"oma":18393,"ome":15724,"omi":7452,"omu":1601,"la ":45674,"le 
":23672,"laa":8510,"lah":2594,"laj":7083,"lai":41589,"lal":1802,"lak":2624,"lan":15670,"lam":1915,"lat":4073,"las":9219,"lau":6037,"lbu":8824,"kuv":12915,"kuu":20315,"kut":5564,"kus":5968,"kup":2448,"kuo":2056,"kun":18716,"kul":5464,"kuk":1991,"ksi":34515,"kso":2000,"kue":1696,"kui":3945,"kti":3367,"kyi":1827,"kyl":5052,"llä":17871,"lok":9030,"lon":3666,"lom":4409,"lop":2359,"log":4285,"loi":6423,"lpa":4160,"los":4127,"lot":1540,"lou":2403,"ljä":2124,"lmi":8971,"lme":5378,"lma":14737,"lti":5458,"ltt":2378,"lue":13574,"lsi":2865,"lta":20774,"lu ":5034,"lmä":3813,"hän":3427,"li ":38792,"lev":10595,"les":3513,"let":2489,"ler":1630,"lem":3429,"len":7655,"lek":2031,"lel":2977,"lei":7201,"leh":3152,"lee":7823,"llu":3725,"lo ":3012,"lla":45545,"lle":23098,"lli":38165,"llo":8003,"lko":3906,"lku":3582,"lka":14232,"lke":3142,"lki":3385,"ljo":1577,"lje":1833,"ll ":2046,"lit":6604,"lis":35039,"lip":2086,"lio":3340,"lin":25643,"lim":2511,"liv":3460,"lia":6855,"lik":3266,"lil":3081,"lii":12989,"lij":5839,"lie":2208,"ma ":15016,"mb ":3008,"maa":25280,"mah":1595,"mai":6076,"mak":3049,"mar":3413,"mas":6492,"mal":15003,"man":17338,"mat":9985,"me ":2801,"mee":1671,"met":10390,"mes":9381,"mer":13701,"mel":4799,"men":18334,"mei":2490,"iä ":10723,"lve":3316,"lvi":1540,"luk":6577,"lui":2536,"luo":7753,"lun":5262,"lut":6301,"lus":6284,"luv":9141,"luu":4990,"ltä":6950,"lyh":1732,"lym":1752,"mpi":5319,"moo":2276,"mon":6257,"mpa":1715,"mmä":7757,"mpä":2564,"mua":1679,"mus":7282,"mut":3853,"muu":6734,"mui":3639,"muk":6397,"muo":5752,"mi ":13252,"min":18710,"mil":5082,"mis":25017,"mit":6945,"mia":4076,"mie":6732,"mik":3016,"mii":3025,"mo ":3241,"mmi":7415,"mma":5671,"mme":2557,"väl":6253,"vä ":5493,"vää":1811,"vän":2110,"vät":4019,"yvi":1744,"yty":2014,"ytt":9230,"yte":5446,"ysv":6099,"yst":4193,"ysi":1754,"yri":2968,"yt ":5088,"ymä":1567,"ys ":5219,"ylä":5863,"yny":2515,"yvä":4334,"yyl":1824,"yys":2076,"yyp":2141,"yyt":1690,"yy ":2993,"ye ":3730,"yde":2513,"yee":3728,"yks":11428,"yky":3192,"yn ":4172,"yle":4657,"yli":5834,"yll":2070,"ymi":2202,"ymp":3885,"yne":1918,"ynt":3210,"yi ":2699,"yhm":2893,"yhd":7827,"yht":15566,"yis":3850,"tön":2023,"tös":1710,"tö ":2585,"täm":6823,"tän":1885,"täh":1876,"täj":3398,"tä ":34022,"tää":13926,"täy":1596,"tär":1635,"täv":7438,"sää":2844,"sä ":25127,"säl":3359,"säk":2119,"säv":1927,"vuo":32425,"vun":2514,"vul":3123,"vy ":2051,"via":2642,"vio":1556,"vir":4221,"vil":2731,"vin":6040,"vii":6538,"vie":4226,"vit":3912,"vis":7340,"voi":10127,"von":1808,"vos":2863,"räi":2223,"vi ":2985,"ver":6645,"ves":2270,"ven":5385,"vel":7217,"ve ":1928,"val":26751,"vak":1862,"van":11048,"vap":1548,"var":8953,"vat":20575,"vas":8886,"vaa":7863,"vai":7839,"va ":18614,"uuk":1525,"uun":9740,"uul":10457,"uud":5759,"uus":11398,"uur":9061,"uut":11944,"uvi":3760,"uvo":2424,"uva":18498,"uvu":5385,"usl":2474,"usk":5912,"usi":8604,"use":5829,"usa":2048,"uu ":9094,"ust":26922,"uss":6964,"utk":3663,"uti":2698,"ute":5204,"uta":9377,"utt":14676,"uts":1830,"utu":7065,"uto":3590,"us ":19751,"ut ":13926,"ura":6710,"ure":2812,"urh":2266,"uri":10072,"uro":4160,"uru":1961,"uod":11474,"uon":20919,"uol":10441,"uom":19569,"uok":4826,"uot":13021,"uor":5597,"uos":6389,"upe":2549,"upu":10042,"umi":12508,"uma":5258,"umb":3567,"ume":3010,"unt":10189,"unu":3435,"unk":6506,"uni":5890,"unn":17239,"und":1617,"una":4707,"ung":5391,"une":2125,"uks":15716,"uku":7377,"uko":2518,"ukk":6046,"uki":2552,"uke":1610,"um ":2055,"uka":8994,"ulu":17346,"ult":3742,"ulo":2329,"ull":6128,"ulk":14188,"uli":3634,"ule":2174,"ula":6501,"un 
":17537,"uin":4870,"uis":6404,"uht":1985,"uhu":1855,"uje":1776,"uit":1991,"uja":2763,"ui ":2881,"uha":2389,"ude":10287,"udi":3267,"ue ":4574,"uet":2172,"uee":7376,"työ":3659,"ua ":4041,"uas":1855,"tyv":3209,"tyy":6164,"tye":7861,"tyi":5279,"tyk":3846,"tym":2979,"tyn":4512,"tys":6857,"ty ":6437,"tur":3509,"tus":10527,"tut":5144,"tuu":8595,"tuv":3012,"tuj":1880,"tui":4969,"tul":5059,"tuk":7469,"tun":13048,"tum":4274,"tuo":5296,"tua":2491,"tud":2510,"ttö":2890,"ttä":18426,"tra":3772,"tri":7061,"tro":3179,"tu ":15401,"tsa":2112,"tse":11636,"tsi":5021,"tsu":1775,"tta":37250,"tte":15326,"tti":29985,"tto":7420,"ttu":17456,"tty":9077,"to ":10851,"tiö":3363,"toj":3671,"toi":21414,"tkä":1786,"toa":3897,"tos":4171,"tot":2445,"tom":3430,"ton":10160,"tok":5162,"tol":5536,"tor":8049,"too":2936,"top":1727,"tii":20914,"til":10225,"tik":4482,"tie":14185,"tit":2814,"tis":9101,"tin":13829,"tim":2695,"tio":13178,"thu":3317,"tia":5511,"tiv":2672,"tki":4098,"tka":4817,"pää":9932,"tem":2906,"ten":19694,"teo":3855,"tei":13515,"tek":5492,"tel":25974,"tee":16061,"teh":4545,"th ":1573,"tet":23609,"tes":1638,"ter":13778,"ti ":27214,"pär":2295,"the":3039,"päi":2568,"yön":1725,"yös":8569,"yöh":1625,"yä ":1547},"n_words":[15184556,16912812,13033049],"name":"fi"}
\ No newline at end of file
+++ /dev/null
-{"freq":{"D":116102,"E":120408,"F":121384,"G":111406,"A":238769,"B":169565,"C":285959,"L":345504,"M":204991,"N":98243,"O":65813,"H":89479,"I":164982,"J":77783,"K":45111,"U":61602,"T":117987,"W":42348,"V":73826,"Q":14589,"P":196230,"S":238878,"R":128546,"Y":18365,"X":15982,"Z":13917,"f":625832,"g":841835,"d":2804209,"e":9206578,"b":583197,"c":1872040,"a":5160230,"n":5070939,"o":3525396,"l":3535844,"m":1717377,"j":148113,"k":150020,"h":692514,"i":4746975,"w":64787,"v":659313,"u":3519294,"t":4403208,"s":4479915,"r":4208721,"q":428703,"p":1557910,"z":87206,"y":309342,"x":204437,"É":44421,"ï":13787,"î":22540,"ê":38749,"é":1751958,"è":218696,"ç":101170,"â":19710,"à":277569,"û":11801,"ù":7682,"ô":30698,"œ":8733," l":1232220," m":320276," n":242807," o":228420," h":94558," i":171950," j":98827," k":16664," d":1997342," e":1227865," f":345437," g":129922," a":696872," b":124278," c":594010," y":7295," u":524523," t":219372," w":8382," v":133967," q":138426," p":728591," s":616330," r":286749," J":75893," K":42648," H":85423," I":133876," N":84973," O":54606," L":335414," M":194304," B":161567," C":266111," A":214898," F":109790," G":105282," D":105238," E":106937," Z":13187," Y":17633," X":12755," S":214333," R":117248," Q":13527," P":183259," W":40332," V":62074," U":57563," T":104796," à":274935," î":9932," é":209998," ê":8160," É":44088,"A ":15041,"Da":22440,"Cl":11868,"Co":65041,"Cr":12941,"Ce":29429,"Ch":51129,"Ci":6652,"Du":7894,"Do":13538,"De":21502,"Di":17162,"Fe":8715,"Fa":10235,"Eu":11691,"Es":11694,"En":18617,"El":32899,"Ge":14692,"Ga":20211,"I ":15646,"Fr":44421,"Fo":14809,"Fl":7090,"Fi":9942,"C ":21532,"Au":19631,"Ar":26711,"At":6889,"As":11086,"D ":7702,"Ba":35723,"Am":14264,"An":31376,"Al":37486,"Bu":10594,"Br":27743,"Ca":55063,"Bi":10547,"Be":29116,"Bo":31176,"Bl":7237,"Le":129356,"Li":24542,"La":91448,"Lu":9746,"Lo":35045,"Me":21247,"Mi":27542,"Ma":80328,"Mu":9199,"Mo":39904,"Ni":11148,"Ne":15700,"Na":15519,"No":31277,"On":6946,"Gi":8289,"Gr":22075,"Go":10803,"Gu":15739,"Ha":29460,"He":19781,"II":12760,"Hi":8034,"Ho":13597,"In":22404,"Il":64459,"Is":6832,"It":7525,"Ja":19690,"L ":40055,"Je":19939,"Jo":19144,"Ju":9216,"Ka":11065,"Un":43594,"Tr":15916,"To":17150,"Th":22226,"Ti":6718,"Te":13283,"Ta":12863,"V ":9181,"St":21014,"Su":20295,"Wi":11355,"Wa":11101,"We":6825,"Vi":21333,"Ré":14676,"Va":15441,"Ve":10793,"Pr":25219,"S ":10997,"Pe":15558,"Pa":62058,"Pl":7693,"Po":24431,"Pi":18433,"Ph":7701,"Ou":6784,"Or":12154,"Se":26414,"Sc":10127,"Si":15134,"Sh":8197,"Sp":6886,"So":23717,"Ru":6655,"Sa":56793,"Re":19635,"Ri":11472,"Ro":36071,"Qu":12001,"Ra":14786,"b ":23149,"a ":715548,"Yo":8948,"i ":246446,"ge":153192,"ga":79217,"bé":13640,"fl":14307,"ff":44067,"fi":98156,"fr":118819,"fu":33433,"fo":104257,"he":135815,"ha":144005,"gn":87721,"cé":45974,"cè":9053,"gl":38751,"gi":139568,"gh":16776,"gu":62416,"gr":93539,"go":42919,"du":266692,"g ":46447,"ea":84524,"eb":15026,"ec":142148,"ed":28114,"de":1288806,"di":214915,"dm":11618,"do":95110,"ds":13995,"dr":52545,"ew":13282,"ex":53989,"eu":311458,"ev":34640,"ey":24158,"ez":12097,"fa":75922,"h ":43257,"fe":44423,"eg":22246,"ef":23663,"ee":14949,"el":261273,"ei":65911,"ep":62933,"eo":13735,"en":1013332,"em":298038,"et":487438,"aï":8404,"es":1468583,"aî":9959,"er":561408,"ca":174843,"e ":4165476,"by":10945,"bs":8710,"br":109877,"bu":41947,"bo":48835,"bl":80107,"bi":56922,"be":65791,"da":250547,"f 
":44719,"cy":11950,"cu":58711,"ct":159489,"cs":7871,"cq":7986,"cr":84742,"co":398094,"ck":30588,"cl":49217,"ci":192973,"ch":228563,"ce":285004,"cc":27518,"c ":86602,"az":14616,"ay":48306,"ba":99079,"d ":356687,"at":402587,"as":164469,"ar":580946,"aq":13150,"ax":8225,"av":102326,"au":313806,"ak":16011,"al":437033,"ai":491839,"aj":9782,"ao":15479,"ap":112014,"am":169689,"an":984305,"ac":155600,"ad":94574,"ab":78438,"ag":129146,"ah":12966,"ae":24602,"af":19038,"nu":48492,"nt":781378,"ns":405957,"nr":20379,"nq":10241,"no":177623,"nn":194465,"nz":7383,"ny":16039,"nv":29268,"oe":7051,"of":34594,"oc":112798,"od":60253,"oa":9189,"ob":47107,"om":320717,"on":978231,"ok":8561,"ol":182032,"oi":182944,"og":68867,"oh":10520,"ot":93297,"os":119510,"ov":56208,"ou":495961,"op":94277,"oo":30133,"or":397296,"oq":6742,"r ":618020,"ox":6981,"ow":11940,"oy":26184,"pe":191299,"pa":336824,"lè":14938,"pl":106081,"lé":66665,"po":236682,"ph":72167,"pi":70246,"lo":174519,"hé":40769,"lm":23015,"hè":8383,"ll":348027,"ls":35163,"lp":13659,"lv":7890,"lu":105730,"lt":35681,"ly":20798,"o ":123736,"ma":245390,"mb":101583,"me":421982,"iè":67396,"mi":181182,"mm":149391,"ié":40043,"mp":121199,"mo":139179,"mt":11044,"ms":10324,"mu":103041,"my":7264,"p ":21047,"na":254840,"nc":212957,"nd":247872,"ne":645258,"nf":26140,"ng":123325,"ni":252369,"nk":9186,"ju":35235,"jo":42449,"fé":31064,"ki":18400,"ke":23430,"ka":17710,"m ":101180,"ko":10158,"gé":41926,"li":366432,"le":1033424,"ld":17364,"lg":18291,"lf":6672,"la":653743,"lb":19702,"n ":1301691,"hr":18002,"ht":13842,"hu":32067,"hi":111052,"hn":14386,"ho":82392,"hl":7215,"dé":209539,"id":89273,"ic":228813,"ib":37956,"ia":126735,"ig":111904,"if":68146,"ie":498651,"hy":15296,"k ":41221,"iq":189141,"ir":210609,"is":657338,"it":513778,"iu":11403,"iv":116504,"ix":21397,"ik":10626,"il":316521,"im":87035,"in":595401,"io":414857,"ip":60451,"je":33885,"iz":8646,"l ":543603,"ja":25992,"xi":22025,"tè":15940,"té":190035,"xp":12895,"xt":10929,"z ":20913,"xa":6853,"xe":16992,"wi":9300,"sé":88528,"rô":7192,"y ":99277,"wa":16319,"we":7882,"rè":35139,"ré":242093,"vi":188474,"vu":6788,"vr":43893,"rê":8320,"vo":57579,"ux":99749,"uv":71595,"ve":234012,"va":93360,"x ":120860,"ui":230155,"uj":8246,"ul":117513,"ue":407659,"uf":8137,"ug":28998,"ur":515467,"us":227928,"ut":191898,"um":73719,"un":620155,"up":64955,"ty":19621,"tu":167178,"tt":77237,"ub":50581,"ua":51699,"ud":59261,"uc":57765,"w ":13938,"to":173702,"pé":53976,"tl":8663,"pè":21910,"ts":102393,"tr":321129,"te":602785,"ti":612679,"th":96741,"v ":6801,"où":7384,"tb":13422,"tc":11543,"oû":9564,"ta":328001,"su":146804,"ss":210582,"st":753670,"sy":23798,"sl":12189,"sk":12848,"sn":9845,"sm":19680,"sp":78469,"so":222811,"sq":17522,"sc":60065,"se":403227,"sh":19364,"si":325826,"u ":509520,"sa":131590,"rr":83294,"rs":168244,"rt":289803,"ru":65582,"rv":27573,"ry":20802,"rq":9625,"rp":15297,"ro":341033,"rn":84154,"né":177701,"rm":89580,"rl":33141,"rk":16207,"nç":94772,"ri":487470,"rg":66626,"rf":12647,"re":763207,"rd":96680,"rc":70956,"rb":28741,"ra":505286,"t ":1634972,"qu":422607,"mê":10277,"mé":91911,"mè":10378,"s ":1913426,"pt":48826,"pu":65796,"pp":68088,"pr":230460,"ps":19103,"zi":10741,"ze":10497,"za":12596,"zo":10854,"vé":17663,"ye":16656,"yc":12861,"ya":27740,"yt":7956,"ys":37929,"yr":15599,"yp":14150,"yo":8853,"yn":14467,"ué":92385,"ym":16691,"yl":12792,"Ét":26242,"ât":11174,"à 
":276969,"éé":15388,"îl":9937,"êm":12225,"êt":20220,"él":51000,"éo":28234,"ép":86639,"ém":44293,"én":55491,"és":107336,"ét":140932,"éq":14455,"ér":173640,"év":51715,"éb":22743,"éa":37956,"éd":70000,"éc":110985,"éf":19719,"ée":230327,"ég":99948,"èm":27722,"èn":11542,"èr":67257,"ès":34878,"èt":16045,"èv":6684,"èg":9962,"èc":26515,"ço":8440,"ça":89329,"é ":372600,"ût":9917,"ù ":7557,"ôt":12300,"œu":7751,"一":9376," Ga":20103," Ge":14539," Fo":14685," Fr":44293," Fi":9743," Fl":7048," Ha":29378," He":19697," Go":10717," Gr":21904," Gu":15637," Gi":8161," Ho":13528," Hi":7850," Je":19786," L ":35977," Ja":19604," Is":6751," It":7499," In":22151," Il":64299," Ka":10962," Jo":19013," Ju":9181," La":90821," Le":128061," Li":24245," Ma":79831," Mi":27364," Me":21128," Lo":34906," Lu":9698," Ne":15538," Na":15371," Ni":11097," Mo":39732," Mu":9105," C ":11416," Am":14164," An":31164," Al":37253," Ba":35535," Au":19508," At":6802," As":10968," Ar":26331," Be":28995," Bi":10366," Bl":7174," Bo":30964," Br":27614," Bu":10535," Ca":54551," Ce":29219," Ch":50964," Cl":11738," Cr":12852," Co":64524," Da":22266," Di":16958," De":21321," Do":13217," Du":7843," El":32823," Es":11656," En":18307," Eu":11642," Fe":8673," Fa":10157," Wi":11260," We":6753," Wa":11001," Yo":8931," a ":54049," Ou":6671," Or":12055," Po":24205," Pl":7593," Pi":18345," Ph":7574," Pe":15479," Pa":61683," No":30970," On":6799," Ra":14631," Qu":11878," Ro":35894," Re":19473," Ri":11439," Pr":25007," Su":20225," St":20131," Ta":12776," Th":22049," Te":13049," Tr":15810," To":17022," Sa":56588," Sh":8106," Si":14799," Sc":9966," Se":26149," So":23336," Sp":6786," Va":15389," Ve":10579," Vi":21221," Ré":14605," Un":43425," ja":19099," l ":229130," je":16653," im":17160," in":90112," il":33523," is":6946," it":13099," jo":28748," fé":16770," ju":33224," ha":21033," gr":51361," go":9168," gu":9190," hi":17321," dé":147001," ho":26370," hu":11315," ne":11993," na":26573," mu":26866," mo":87695," on":19443," oc":20805," of":17687," ob":10074," no":93418," le":471758," li":50278," n ":10043," la":426807," gé":20899," me":31539," mi":31668," ma":94790," lu":10283," lo":32481," af":11272," ag":11352," ab":10657," ac":41475," ad":17435," am":34528," an":80768," ao":8597," ap":47388," ai":15942," al":42510," av":56190," au":168550," ar":49629," at":13572," as":18267," d ":197071," ba":44107," bi":13538," be":11213," bo":15457," bl":6713," bu":7813," br":18993," ca":62413," et":342046," es":474914," en":328723," em":8216," el":14313," fe":12823," fa":58915," eu":10657," ex":33671," fu":26386," fr":103021," fo":71702," fl":7457," fi":46439," ge":16870," ga":14369," cl":20052," co":290856," cr":32388," ce":54595," ch":72895," ci":26401," da":165596," cu":9192," cy":6653," do":48107," dr":11453," de":1120211," di":79077," du":223778," té":10776," ru":15648," sa":43475," se":95411," sc":20238," si":97963," sp":20322," so":138476," qu":138128," ra":22818," re":79730," ri":12843," né":88113," ro":35414," pu":26022," pr":170981," s ":23984," mê":10232," mé":23004," ou":87553," op":9623," or":44807," pe":62711," pa":204808," lé":7304," pl":62779," po":141674," pi":14858," ph":17219," sé":19742," va":14483," ve":21390," vo":19720," vi":65672," ré":111803," ut":13716," un":501687," ta":13187," où":7340," sy":15329," st":19632," su":114473," tr":64495," pé":8718," to":29268," th":29162," ti":14570," te":41930," Ét":26213," à ":274908," êt":8158," év":15420," éq":11336," ét":84332," ép":12565," él":18021," éd":13385," éc":33500," ég":11829," 
îl":9913,"Eur":7732,"En ":11566,"Ell":27452,"Fra":34835,"For":6793,"II ":8125,"Her":7505,"Hau":10362,"Gra":11197,"Ind":7362,"Il ":58031,"Bar":7002,"Bas":7999,"All":7697,"Ang":8461,"Cal":11943,"Car":9781,"Can":10563,"Ber":8508,"Bel":7820,"Bou":9492,"Dan":8952,"Chi":7925,"Cen":6692,"Cet":9727,"Cha":27535,"Cor":8959,"Com":17296,"Col":7940,"Con":15750,"Cou":6779,"New":7835,"Nor":17044,"Pie":8398,"Par":29868,"Pro":9444,"Ita":6863,"Jea":13129,"Les":29698,"Le ":85946,"Lan":7053,"La ":64716,"Lou":8888,"Loi":7362,"Man":8487,"Mar":33987,"Mon":17275,"Mic":7917,"Sud":6967,"Sta":7499,"Son":7414,"Sai":21598,"San":8752,"Val":6713,"Uni":28327,"Un ":8531,"The":11621,"bit":13396,"bil":9380,"ble":36471,"bli":30327,"bor":11093,"bou":14756,"be ":11661,"ban":14082,"bal":23571,"bat":8684,"bas":21924,"bar":8519,"bec":7102,"ber":20504,"bel":11445,"bie":9676,"ca ":7169,"car":22613,"cat":26927,"can":25116,"cap":8531,"cad":7391,"cal":21842,"cai":32564,"ce ":152975,"bri":16551,"bra":7577,"bre":74429,"bum":11288,"but":12601,"by ":8395,"am ":12290,"al ":83242,"ail":20993,"ain":126816,"air":72297,"ais":165056,"ait":69408,"aie":8332,"acé":7099,"agi":9624,"agn":32293,"ago":7536,"anv":10025,"ano":12803,"ann":37434,"ant":198270,"ans":198981,"ane":14041,"ang":47054,"ani":45704,"ana":30499,"anc":97402,"and":98722,"amm":17042,"amp":23230,"ami":37168,"ame":15452,"amb":9570,"ama":14537,"alt":7552,"alo":12342,"all":56591,"ali":100273,"ale":107350,"ala":20659,"alb":12381,"an ":76640,"abe":9364,"abi":15595,"abl":19961,"abo":9408,"abr":8483,"ae ":16557,"ac ":8751,"aff":8179,"ai ":16674,"aga":7441,"age":55470,"ado":7415,"adm":9823,"adi":21924,"ade":15733,"aci":10019,"ach":16602,"ace":28163,"acc":10028,"ada":13056,"act":43926,"até":6761,"ays":15243,"aya":8788,"aqu":12595,"at ":36684,"arg":12036,"are":21800,"ard":35646,"arc":26378,"arb":8126,"ara":32792,"aro":14440,"arn":8384,"arm":11551,"arl":15187,"anç":91515,"ari":62241,"aru":7211,"arq":6770,"arr":20556,"ars":16927,"art":134713,"au ":118568,"asi":7486,"ase":11574,"ar ":120208,"api":10314,"aph":15548,"apo":12040,"app":42097,"apr":9346,"as ":37163,"amé":27799,"ava":24401,"aux":51164,"aut":60224,"avr":9575,"avo":10413,"avi":13083,"ave":40354,"ay ":10427,"ata":13379,"aoû":8368,"ast":22288,"ass":51465,"atr":15439,"ato":12284,"ate":44406,"ati":192812,"ath":17738,"auc":6672,"att":16852,"ats":23409,"atu":15332,"aul":9810,"aum":7753,"aur":11354,"aus":19732,"aud":9105,"jet":8119,"jeu":19460,"jan":10693,"fév":8788,"fér":12432,"jou":35022,"itr":12118,"ito":10908,"itu":87676,"itt":10859,"its":10569,"ism":13926,"iso":19540,"isp":7420,"iss":50166,"ist":125656,"ita":62348,"ite":57973,"ith":6663,"iti":60924,"isé":36821,"iva":20746,"ix ":16528,"ivi":25932,"ive":55895,"is ":216454,"ion":363871,"iol":6761,"ipa":18735,"ipe":13929,"ir ":34075,"iro":12325,"iné":18867,"iri":8297,"isi":28415,"ise":101298,"isc":8611,"isa":28074,"iqu":188846,"ire":120253,"ira":8690,"irc":8347,"it ":122098,"ité":60635,"gén":18856,"jus":7076,"jui":19529,"ham":23355,"han":29050,"hau":9292,"har":26334,"hab":12359,"he ":52407,"hel":8715,"hef":7684,"hes":11848,"her":19826,"hie":13715,"hin":15050,"hil":13377,"his":19375,"hiq":6944,"dé ":18734,"déb":7151,"déc":35759,"go ":7689,"cée":7075,"céd":16748,"gle":10717,"gli":6716,"gla":15951,"gno":9326,"gni":10059,"gne":50271,"gna":8886,"gou":8451,"gro":28558,"gra":41927,"gre":10583,"gui":7723,"gue":37672,"ial":35445,"ian":23438,"iat":15622,"ic ":12032,"ibl":8749,"ibu":7144,"ia 
":21961,"ieu":43579,"iel":22526,"ien":138676,"ier":92906,"ies":16655,"iff":14012,"ifi":22387,"ict":17983,"icu":12669,"ico":11245,"ick":6729,"ici":37411,"ich":23588,"ice":26902,"ie ":169464,"ica":66266,"idi":10544,"ide":31414,"ida":13896,"if ":16206,"il ":58832,"ige":6657,"igh":9785,"igi":21897,"igu":11721,"ign":41717,"idé":15214,"imp":16150,"ime":18408,"imi":12406,"inc":41459,"ind":21942,"ina":36398,"ino":12991,"int":75915,"ins":44722,"inf":11189,"ine":105011,"ing":34736,"ini":40626,"ila":9244,"in ":120797,"ilo":10879,"ill":136561,"ilm":14297,"ili":45030,"ile":18930,"ima":15975,"io ":17855,"ils":11236,"hol":10188,"hom":18219,"hon":13352,"hor":8438,"dév":10462,"dée":10321,"déf":8940,"dém":9364,"dép":53945,"dér":20431,"dés":15740,"ht ":8492,"hum":15199,"ffe":7771,"ffi":11627,"fes":12569,"fer":7877,"fam":26250,"fai":22917,"fac":6862,"ext":9480,"ez ":9334,"exp":11699,"exi":10548,"eta":8136,"ete":7713,"eti":12647,"esp":33176,"est":489584,"ess":57266,"eul":7484,"ett":36757,"ew ":8505,"eve":10111,"eva":7495,"evi":7979,"euv":6803,"eut":10699,"eur":177418,"eus":14865,"eux":38960,"ey ":17748,"er ":177809,"es ":856843,"ept":19543,"epu":14012,"elé":11860,"epr":16261,"erl":8779,"eri":17387,"erg":13600,"ere":10113,"erc":19130,"era":14672,"erb":9918,"et ":392437,"esc":6799,"eu ":31850,"erv":19978,"err":46160,"ert":38461,"ers":78150,"ern":42947,"erm":29874,"ero":7440,"en ":352124,"ela":11282,"ele":8299,"eli":13751,"elg":9644,"ell":93604,"elo":21097,"els":10727,"emb":53506,"ema":21014,"eme":149235,"emi":31587,"emp":25298,"ene":6850,"ena":20675,"end":38663,"enc":42120,"enn":54421,"eni":9389,"enu":8876,"env":9265,"ens":48323,"ent":371396,"enr":18661,"eil":18380,"ein":23896,"eig":7921,"el ":58534,"gis":10364,"giq":10802,"gin":20723,"gio":57044,"gie":22112,"ght":8779,"gen":26739,"ger":14731,"ges":19429,"ge ":72294,"gar":9985,"gal":16426,"gan":17016,"ga ":6882,"fus":7249,"fut":21703,"fra":95189,"fri":7633,"for":45771,"fon":27020,"foo":11436,"foi":12370,"fic":24328,"fil":25221,"fin":20051,"ffé":6845,"da ":14864,"de ":945562,"dai":12767,"dae":8896,"dat":11688,"dan":177922,"cul":27372,"ctu":20091,"ctr":11700,"cto":18088,"cti":59878,"cte":30379,"cré":24660,"cla":14396,"cle":15005,"clu":9348,"ché":8767,"co ":9868,"cié":10273,"con":120854,"col":28490,"com":148564,"cor":20969,"cou":37141,"cs ":7503,"cqu":7392,"ct ":7934,"cra":8267,"cri":32184,"cro":10112,"cci":7074,"cea":6989,"ch ":14828,"cer":17030,"ces":36299,"cet":8987,"cen":24433,"cem":11283,"cel":14093,"cha":54442,"cia":30366,"ck ":14947,"cie":55297,"cid":7429,"che":76684,"chi":31813,"cho":10460,"chn":7132,"cir":7170,"cis":7169,"cit":9428,"cin":16999,"cip":23717,"cke":8035,"ed ":9999,"ec ":39615,"ean":16858,"eau":41781,"eff":6903,"ech":13667,"ef ":8454,"ect":52251,"eco":14578,"dur":7123,"don":36748,"dom":8813,"dou":8007,"ds ":11646,"dmi":10099,"doc":6954,"dui":12311,"duc":16031,"dri":6694,"dra":6827,"dre":21768,"du ":216503,"dro":13106,"dic":10492,"dia":18227,"der":20812,"des":229646,"deu":19332,"dev":7361,"del":7755,"den":24656,"dep":13089,"di ":9657,"do ":8222,"div":14424,"din":10138,"dio":11927,"dir":13444,"dis":29420,"dit":31296,"die":29137,"dif":14101,"rga":14359,"ri ":10992,"rgi":6778,"rge":19992,"ret":19172,"res":111968,"reu":15390,"rg ":10612,"rea":7335,"rec":29178,"reg":9663,"rem":41279,"ren":49926,"rel":19572,"rer":6909,"rep":18635,"rdi":12299,"rde":11104,"re ":401675,"rco":8273,"rch":25796,"rce":14227,"rd 
":49795,"rap":22951,"ras":10406,"rat":61280,"rav":14677,"rbe":7330,"rai":37514,"rag":12671,"ran":191224,"ram":14978,"ral":42593,"rab":8579,"rad":17757,"rac":18041,"rs ":108005,"ros":12152,"rot":13239,"rom":20207,"ron":45128,"rop":30358,"rou":56658,"rov":25371,"rod":19072,"roc":18144,"roi":33087,"rol":7836,"rof":11414,"rog":8618,"rna":26829,"rne":26551,"rni":14539,"riè":7237,"nér":15715,"ro ":12314,"rma":27965,"née":51678,"rme":33077,"rmi":8879,"rle":10727,"rla":9260,"nça":87637,"né ":83238,"rip":8171,"rio":12752,"riq":23356,"rit":49518,"ris":61412,"riv":19174,"rig":30515,"ril":14567,"rin":32694,"rim":9133,"ria":17879,"rib":8141,"ric":64397,"rid":9964,"rie":84165,"rk ":9714,"ruc":9646,"rus":10779,"rvi":9893,"rve":7324,"ry ":13884,"rsi":16639,"rso":15966,"rse":12239,"rta":21499,"rto":8213,"rte":72339,"rth":9356,"rti":82832,"rts":8395,"rtu":7199,"rmé":10475,"rt ":67054,"rqu":9541,"rro":9520,"rri":15769,"rre":35748,"rra":13162,"sac":6821,"sai":15869,"san":31853,"sat":19662,"sa ":20364,"si ":23508,"sie":23358,"sid":13927,"sic":8539,"sit":96579,"sis":11253,"siq":13014,"sin":20761,"sio":46892,"sil":9278,"sig":20886,"scr":8195,"se ":185840,"sca":7156,"sci":11470,"sco":9152,"ser":33436,"ses":32384,"seu":18614,"sea":8788,"sei":15724,"sec":11236,"sep":15495,"sen":31564,"sem":23920,"sel":11350,"spo":15398,"spe":6813,"spa":15911,"sou":38682,"sol":8864,"son":104745,"sor":19583,"soi":7516,"soc":22585,"st ":465410,"squ":17455,"sla":8458,"siè":10850,"sme":14217,"stè":10526,"sys":7499,"où ":7381,"sse":73979,"ssa":25709,"sso":20070,"ssi":65158,"ssu":9524,"ste":86262,"sta":32395,"spé":11204,"sto":25947,"sti":51286,"spè":18097,"str":64918,"sud":12096,"sui":13416,"sul":7775,"sup":9440,"sur":74845,"tai":73243,"tal":50318,"tag":11022,"tab":9930,"oût":9507,"tba":12570,"tat":51850,"tar":13094,"tan":65935,"tam":9043,"tch":9033,"te ":229351,"ta ":14870,"pe ":50543,"par":234438,"pat":9116,"pas":16097,"pay":9300,"pag":21493,"pal":18293,"pan":7834,"phe":9797,"pha":8008,"pho":13907,"phi":21732,"peu":15687,"pen":18365,"per":42656,"pet":8398,"pes":12928,"pel":20669,"pla":26028,"pli":9068,"ple":16401,"plo":8634,"plu":41224,"lé ":12711,"phy":6923,"pio":10675,"pir":7633,"pit":8267,"por":43095,"pop":8694,"pou":69236,"pos":40507,"poi":8423,"pon":18257,"pol":30878,"ps ":10703,"ppo":8721,"ppa":15829,"ppe":26475,"lév":6751,"lég":7457,"lée":13954,"pub":20867,"pte":18796,"pti":14976,"pri":39208,"pre":41906,"pro":84520,"pui":23064,"pul":10006,"prè":17131,"pré":40058,"mé ":9595,"mêm":9754,"mée":9397,"méd":12609,"mét":11009,"mér":37720,"qu ":18907,"qua":24258,"que":267169,"qui":99689,"qué":9551,"ra ":22507,"ngl":20995,"ngu":17741,"ni ":11818,"nge":18618,"nga":7486,"ndé":17148,"nel":18137,"nen":6872,"nem":17361,"ner":14652,"net":8307,"nes":51020,"neu":14088,"ng ":26468,"nfo":8493,"nct":9124,"nco":15938,"nci":43333,"nce":105515,"nch":14886,"ne ":496030,"ndu":10752,"ndr":20713,"ndo":10176,"ndi":34156,"nde":56096,"nda":37414,"nal":54750,"nan":26247,"nar":12696,"nad":12377,"nag":10703,"nai":30209,"nd ":51106,"nau":12991,"nat":48959,"na ":18193,"nté":16371,"ny ":7135,"nvi":18890,"nve":7658,"nue":12712,"nto":18893,"nts":41197,"ntr":72123,"nti":46425,"nta":41915,"nte":98733,"nsu":6773,"nso":9167,"nst":29806,"nse":30814,"nsi":22894,"nsc":6653,"nu ":15262,"nné":23464,"nre":13930,"nt ":427954,"nqu":6929,"ns ":274209,"nol":9967,"noi":11530,"nom":60135,"non":15485,"not":12329,"nor":18442,"nov":11258,"nou":9106,"nne":96393,"nna":28812,"nni":16118,"nnu":18917,"no 
":11795,"nif":7747,"nie":43637,"nic":15379,"niv":18250,"nis":60898,"nit":15799,"niq":25898,"nio":9717,"nim":8520,"nin":8256,"ogr":17742,"ogi":23895,"ogn":7673,"oi ":11549,"ois":67090,"oir":45748,"oit":23632,"oin":16797,"ol ":10751,"och":11107,"oci":26047,"ock":12402,"oca":13060,"occ":8851,"ode":18688,"oct":13097,"of ":10906,"odu":17793,"off":9016,"ofe":9519,"obr":10777,"obi":7081,"oye":7463,"oya":11498,"osé":9652,"oue":24956,"ouc":7361,"oti":7550,"ote":10769,"oto":12004,"opé":11013,"ost":16493,"ota":13987,"otb":12106,"osi":13992,"ose":15543,"oss":13431,"ovi":29641,"ouv":52919,"ove":15893,"oug":6908,"oui":10077,"oul":19390,"oup":35466,"ous":44108,"our":142367,"out":30934,"opo":10738,"opp":14092,"ope":8783,"oph":15160,"os ":20357,"opu":8920,"or ":16010,"oot":13483,"ork":6806,"orm":40453,"orn":11249,"orr":9505,"orc":7615,"ord":43495,"ore":17630,"org":20050,"ori":52286,"ou ":81997,"ort":98537,"ors":17665,"omé":6755,"ot ":11871,"ora":19855,"ola":10628,"on ":413868,"oli":37368,"oll":18132,"ole":20481,"olo":39631,"olu":16814,"om ":32719,"ona":46449,"ond":64776,"onc":25758,"onf":7947,"one":18885,"ong":21441,"oni":23679,"onn":93395,"ono":18517,"ons":103783,"ont":114426,"ony":7110,"oma":27935,"ome":15382,"omb":20959,"omi":18767,"omm":120540,"omp":47351,"omo":9810,"omt":10618,"la ":409873,"le ":650000,"lab":8668,"lac":18171,"lag":11480,"lai":40035,"lan":67550,"lam":6667,"lar":10935,"lat":30778,"las":17285,"ld ":8004,"lbu":11677,"llé":6984,"ls ":27190,"lon":33452,"lom":9149,"lop":13744,"lor":20550,"loc":9951,"log":34746,"loi":10734,"los":7562,"lié":10397,"héo":7867,"lti":6832,"lub":8118,"lue":8111,"li ":8120,"leu":30125,"les":193810,"let":24641,"ler":15063,"lem":54863,"len":12174,"lec":18621,"lo ":9843,"lla":30932,"lle":228673,"lli":25721,"llo":13913,"lm ":12927,"ll ":25434,"lit":56867,"lis":75238,"liq":19529,"lio":7237,"lin":24717,"lim":6784,"liv":7668,"lic":17589,"lia":14317,"lib":8142,"lig":13313,"lie":66363,"lif":7647,"ma ":11524,"mb ":9981,"mai":41522,"mag":14210,"mar":34078,"mas":7661,"mal":10704,"man":59224,"mat":42658,"mba":7370,"mbl":12331,"mbr":52251,"mbo":7187,"me ":138747,"met":15733,"mes":29770,"mer":17959,"mem":10190,"men":192406,"lui":7872,"lut":8740,"lus":46191,"mpi":17291,"mpr":7983,"mpo":28665,"mpl":18530,"mps":7384,"mpt":6961,"ms ":7519,"moi":10596,"mod":8999,"mon":39570,"mor":34638,"mot":9918,"mou":10824,"mpa":13773,"mmé":6819,"mus":17057,"mul":7053,"mun":72605,"mi ":7506,"min":39720,"mil":42847,"mis":18157,"miq":9951,"mit":10387,"mie":22944,"ièr":39842,"ièm":14714,"ié ":12315,"mmu":65417,"iét":13514,"miè":11922,"mma":8876,"mme":60859,"yst":10800,"ys ":16246,"yen":7179,"yan":8923,"ué ":23787,"uéb":10376,"uée":52636,"tér":21675,"tés":15841,"tél":10092,"tée":11073,"tèr":7376,"tèm":7268,"té ":120644,"sée":25892,"sér":11984,"sé ":33254,"réé":15299,"vri":19364,"vre":15576,"vra":7557,"vir":10748,"vil":38576,"vin":28833,"vic":9378,"vid":11495,"vie":25896,"vit":11987,"vis":24817,"ré ":18385,"rès":23870,"réc":11353,"rée":15149,"réf":7765,"réa":27396,"rén":7293,"rég":60534,"rét":7637,"rés":42404,"rép":7979,"voi":21769,"vol":18467,"ver":63394,"ves":12081,"ven":32893,"vem":17149,"vel":23577,"vea":6695,"vec":28438,"ve ":39545,"val":17132,"van":23469,"var":6981,"vai":21124,"uté":12504,"ux ":89405,"uve":48851,"uvr":11649,"usi":30125,"use":23041,"ust":23088,"uss":29122,"uti":33117,"ute":37011,"uto":14655,"utr":16881,"us ":97767,"ut ":58626,"ura":21007,"ure":61643,"urg":15629,"uri":18044,"urn":17785,"uro":15520,"urs":51288,"urt":11255,"ur 
":263487,"upe":34564,"uma":8198,"umb":11805,"ume":16466,"uni":34130,"una":10830,"une":273015,"um ":21626,"ult":20316,"uli":14760,"ule":23978,"ula":21824,"un ":279711,"uil":20013,"uin":15450,"uip":9137,"uis":42982,"uit":34639,"ul ":11106,"ui ":87556,"uct":19646,"ude":13798,"udi":11213,"ue ":252733,"uch":11064,"ueu":15091,"uer":16125,"ues":65033,"uen":9815,"uel":31495,"ub ":7072,"uat":10437,"uar":8894,"uan":14849,"ubl":25725,"ud ":22607,"tué":65263,"typ":6932,"ty ":7203,"tré":9963,"tur":39432,"tut":8273,"tud":13647,"tue":18585,"ts ":96064,"tre":117480,"tra":79361,"tri":46224,"tru":15862,"tro":41105,"tta":10904,"tte":35818,"ttr":6953,"pée":8023,"péc":10887,"to ":12714,"pér":17722,"tiè":7263,"toi":21756,"tob":11135,"tou":31577,"tom":9528,"ton":34164,"tor":25952,"til":20545,"tif":22436,"tie":47885,"tir":8477,"tiq":62118,"tit":40283,"tis":20099,"tin":33045,"tim":9651,"tio":235114,"thu":11875,"tia":8957,"tic":25452,"tiv":30333,"thé":16932,"pèc":18042,"tem":70682,"ten":37907,"tel":13125,"tec":11914,"th ":10711,"teu":75009,"tes":56728,"ter":82954,"ti ":26511,"tho":14369,"the":16191,"Éta":23397,"ège":6746,"èce":21913,"ère":67047,"ète":9916,"ène":11368,"ème":27688,"ès ":31596,"édé":24476,"éga":13682,"égi":62907,"écé":13741,"édi":27690,"éce":13512,"éci":16144,"ée ":173192,"écu":7487,"écr":22748,"éco":23770,"éfi":6764,"ées":42716,"éen":8156,"ébe":6903,"éal":20524,"éve":11864,"évi":9923,"éré":8391,"évo":13471,"évr":9038,"éta":51936,"éti":16412,"éte":7748,"étr":10893,"étu":6864,"été":39872,"éme":8284,"émi":12731,"émo":10213,"éna":8958,"éni":9260,"éli":8934,"éle":12690,"éma":11177,"éo ":6991,"équ":14445,"éra":46783,"ére":13799,"éri":83768,"éro":16442,"éné":26266,"ése":23781,"ési":28918,"épa":51263,"épu":10400,"épo":7454,"élé":15180,"és ":45990,"ême":12159,"êtr":10139,"çai":86644,"ût ":9033,"ôte":8179,"île":9715,"éé ":7606},"n_words":[66338594,78580813,56850284],"name":"fr"}
\ No newline at end of file
+++ /dev/null
-{"freq":{"ૈદિ":382,"g":235,"d":312,"e":960,"c":304,"a":1076,"n":720,"o":584,"l":382,"m":289,"h":369,"i":764,"u":324,"t":728,"s":517,"r":627,"ોટ ":345,"ેસા":764,"ોલ ":730,"ોર ":574,"ોદ ":1827,"ેત્":227,"ેતી":2222,"ેતમ":2186,"ેતપ":357,"ેડબ":253,"ેડા":1366,"ેડી":300,"ેગા":229,"ેગો":375,"ેઘર":242,"ૈકી":6300,"ેશમ":527,"ેશન":12436,"ેવી":831,"ેવા":710,"ઇડર":265,"ેરા":387,"ેરી":718,"આહવ":288,"ેલા":24917,"ેલી":519,"ેલુ":9935,"ેલો":381,"ોઇ ":458,"ેન્":278,"ેપુ":457,"આવે":34862,"ા":337683,"િ":47127,"સ":31472,"હ":20294,"શ":32541,"ષ":5409,"વ":91695,"લ":111041,"ળ":3931,"ર":102867,"ય":39143,"મ":113670,"ભ":35403,"બ":10569,"ફ":1198,"પ":49237,"ન":84304,"ધ":9131,"દ":38743,"થ":6321,"ત":89107,"ણ":9770,"ઢ":1233,"ડ":18443,"ઠ":3507,"જ":54268,"ઝ":1439,"ટ":6287,"ઘ":2525,"ચ":20557,"છ":25106,"ક":72592,"ખ":14557,"ગ":61691,"ઓ":8101,"એ":23599,"ઉ":5095,"ઈ":409,"અ":6168,"ઇ":1975,"આ":43598,"ં":82987,"૫":1391,"૪":875,"૩":2115,"૨":1146,"૯":1054,"૮":946,"૭":1034,"૬":461,"૧":5611,"૦":1269,"ૈ":7159,"ો":34921,"્":89060,"ૌ":562,"ુ":82336,"ી":42473,"ૃ":539,"ૂ":4236,"ે":108368,"આણં":435,"આદિ":1564,"ેશ ":483,"અને":1581,"અન્":344,"e ":271,"અમદ":630,"ેમ ":2812,"ેર ":1681,"ેલ ":1022," ૧":4255," ૩":409," ૨":679," ૫":978," ૪":492," ૭":700," ૯":551," ૮":625,"અગિ":1051," વ":14987," શ":3197," ર":16267," લ":5654," સ":11956," હ":3244," થ":1672," ત":31864," ધ":1718," દ":21808," ડ":1738," ઠ":222," ભ":34182," બ":4095," ય":383," મ":24848," ન":6795," ફ":765," પ":35455," છ":24245," ચ":2656," ઘ":628," ટ":479," ઝ":829," જ":21642," ઓ":682," ગ":30845," ખ":8068," ક":14981," ઉ":4757," એ":23366," આ":43205," ઇ":661," અ":6143,"આંગ":703,"્ચિ":11645,"્ટ્":549,"ોતર":377,"ોદર":1858,"ોનગ":236,"ોટા":473,"ોટી":225,"ોડા":794,"આઠ ":685,"ોની":800,"ોનો":2167,"ોરી":514,"ોળી":244,"ોલી":442,"ંવત":748,"ંબા":470,"ંબુ":281,"ંમત":254,"ંઠા":1406,"ંડવ":256,"ંદુ":455,"્ધ ":712,"ંદો":312,"ંધી":504,"ંતર":406,"ંચા":1418,"ંચમ":1337,"ંગા":221,"્ર ":966,"ોકો":3591,"્ય ":7092,"ંગણ":713,"ંખે":303,"ંગર":288,"્ષ ":789,"્વ ":2508,"એવા":6093,"્ષન":1137,"્ષિ":1509,"્વા":430,"્વે":772,"્વન":6820,"્વર":251,"્યન":12109,"્યત":720,"્યમ":432,"્યપ":428,"્યવ":2272,"્યા":2476,"્યુ":248,"્મદ":563,"્મા":375,"્લો":1068,"્લા":13052,"્રો":432,"્રે":602,"્રમ":649,"્રદ":871,"્રા":2175,"્રિ":476,"્રી":593,"્રહ":375,"્દ્":258,"્થા":288,"્તા":298,"્તી":856,"્તર":2535,"એક ":15869,"ઉદે":246,"ઉપલ":606,"ઉપર":392,"ઉત્":2557,"ઉમર":329,"િત":853,"િણ":1494,"વિજ":359,"ીં":343,"િમ":11976,"િપ":550,"િન":1570,"વાય":500,"િવ":4622,"વાર":525,"િશ":322,"ીક":534,"વામ":999,"િલ":14752,"ીઓ":697,"િય":2671,"િર":803,"વાસ":1799,"ીજ":386,"િહ":232,"વિક":492,"િસ":578,"વાલ":357,"ીત":406,"ીદ":281,"ું":22062,"વાદ":862,"વાન":404,"ીય":2619,"ીમ":860,"ીન":8731,"વાડ":2612,"ીપ":459,"ુક":18441,"ીવ":394,"ુખ":4304,"ીર":426,"ીલ":229,"વાગ":247,"વાઘ":254,"ીસ":510,"ુચ":222,"ુજ":13015,"ાં":48849,"ાઉ":298,"ાઇ":474,"ાક":921,"ાઓ":6449,"ાઘ":283,"ાખ":279,"ાગ":19994,"ાજ":13842,"ાચ":245,"ાટ":1064,"ાડ":4936,"િં":1255,"ાણ":2500,"ાથ":1359,"ાત":14028,"ાદ":2250,"ાન":14000,"ાપ":2091,"ાબ":1765,"ામ":34603,"ાય":4603,"ાર":20818,"ાલ":24953,"ાળ":1774,"વિર":286,"િક":2870,"ાવ":3657,"ાષ":849,"ાસ":4564,"ાહ":1263,"િજ":517,"વિસ":266,"હત":7478,"સી":2165,"સુ":1962,"સે":1528,"સા":8757,"સિ":770,"હવ":567,"સો":764,"હર":240,"સ્":3025,"સૌ":277,"હુ":529,"સા ":522,"હે":3680,"હા":2824,"હિ":2043,"હી":349,"હો":1278,"હ્":385,"શ્":12458,"ષન":1142,"સગ":988,"સર 
":276,"શહ":485,"શિ":374,"શા":1696,"શુ":2253,"શી":262,"સં":2001,"ષ્":864,"સમ":828,"સન":417,"સવ":569,"સર":997,"ષા":458,"સદ":328,"સત":251,"સણ":439,"ષિ":1579,"વદ":549,"વન":7118,"વર":2659,"વલ":1055,"શક":498,"વગ":256,"વસા":2722,"વડ":3280,"વત":1294,"વણ":314,"વે":36512,"શન":12531,"વૈ":520,"શમ":627,"વ્":2839,"વસ":7405,"વી":2081,"વિ":2302,"વા":17902,"લો":6663,"લ્":14395,"લે":641,"લા":40018,"લિ":942,"લી":2736,"લુ":28591,"લસ":736,"લવ":315,"વસ્":826,"વસે":1125,"ળી":539,"વં":274,"ળા":1514,"રો":1975,"ર્":7275,"રુ":979,"રી":7304,"રૂ":908,"રે":1966,"રસ":718,"વાં":660,"રહ":1839,"રા":36128,"રિ":1193,"રવ":1135,"લબ":632,"લપ":735,"લય":580,"લન":2427,"લક":304,"લગ":215,"રક":1797,"રગ":229,"રખ":218,"યવ":2310,"રજ":562,"યા":7096,"રડ":414,"યુ":510,"રણ":534,"રત":14232,"રથ":239,"રદ":1032,"શમા":566,"યે":847,"રન":646,"રપ":682,"યો":569,"રબ":221,"રમ":2425,"મર":703,"મમ":2823,"મલ":319,"મહ":10705,"રં":397,"મી":561,"યડ":242,"મુ":6197,"મા":44661,"મિ":1415,"યત":1490,"યપ":480,"યન":12830,"મે":837,"યમ":548,"મ્":803,"મો":2623,"બ્":1122,"ભર":899,"મજ":2296,"મગ":222,"મખ":281,"મણ":336,"મત":581,"મથ":816,"ભા":32622,"ભિ":388,"મપ":589,"ભો":320,"મદ":1417,"મધ":3415,"મન":2890,"બર":1834,"બહ":293,"મં":287,"બી":424,"બુ":465,"બા":2768,"બિ":244,"બો":581,"બે":417,"પો":705,"પ્":4053,"બન":257,"પલ":826,"પહ":227,"પશ":13852,"પર":1594,"પૂ":2675,"પૈ":6312,"પે":306,"પુ":5227,"પી":742,"પિ":245,"પા":6287,"ન્":1623,"નો":5861,"પણ":494,"પત":282,"પટ":291,"પડ":321,"પછ":431,"નવ":1459,"નર":658,"ધ્":3586,"ધો":283,"નપ":579,"નન":237,"ને":4269,"નુ":10812,"પં":2789,"ની":3990,"નિ":691,"ના":45180,"નસ":606,"ધા":1073,"ધુ":315,"ધી":641,"દે":14545,"ધન":837,"દ્":1191,"દો":459,"ધર":916,"સી ":1244,"નગ":2570,"દશ":215,"દસ":731,"દહ":218,"દા":3639,"દિ":5058,"દી":575,"દુ":1289,"દર":3345,"થવ":580,"વેલ":34942,"શના":12337,"થી":1041,"થા":882,"તો":531,"વૈદ":382,"તે":6014,"દક":1517,"થય":782,"ત્":12219,"થમ":1093,"થક":777,"તી":4432,"તુ":553,"તા":26724,"તિ":1123,"તન":712,"ણે":327,"તપ":511,"તર":3967,"તલ":356,"તમ":2718,"ણા":2366,"ણી":846,"ણવ":795,"તઘ":640,"ડો":2767,"ડુ":286,"ડે":1027,"ણં":571,"ડી":3535,"ડિ":410,"ડા":5615,"ડવ":598,"ડર":315,"ડભ":218,"ડબ":277,"ઠા":1831,"ટ્":673,"ટે":774,"વ્ય":2778,"ટિ":249,"સે ":1144,"ટી":761,"છે":23574,"જન":365,"છી":486,"ઝઘ":245,"જય":244,"છો":305,"જબ":1127,"જર":11973,"જે":2020,"જો":465,"જિ":13950,"જા":1399,"જુ":2555,"જી":794,"જ્":13119,"ઝર":271,"શુપ":2187,"ઝા":567,"સંવ":756,"ટક":225,"સંત":320,"સંખ":325,"ટા":840,"ટલ":348,"ગા":17980,"ગુ":12125,"ગિ":1160,"ઘડ":264,"ગી":427,"૯ ":627,"ગ્":891,"ગો":1143,"ઘર":948,"ઘો":548,"ચર":560,"ચમ":1383,"ચા":2321,"ચિ":12038,"ચી":259,"જં":215,"ચો":437,"ચ્":251,"જક":251,"શહે":470,"૫ ":1091,"કર":2446,"કમ":270,"કલ":513,"કપ":478,"ખં":251,"કડ":644,"ખલ":227,"ક્":3347,"કો":5466,"કે":1500,"૭ ":792,"કુ":6686,"કૃ":229,"કા":21625,"કી":6922,"કિ":350,"કહ":564,"કવ":583,"ગવ":1115,"ગલ":272,"ગર":2876,"ગમ":18397,"ખ્":4351,"૮ ":673,"ખે":6309,"ગન":575,"ગણ":1117,"ગઢ":779,"ખા":2071,"૧ ":1168,"શાળ":1055,"શાસ":252,"એવ":6158,"૨ ":320,"૩ ":1541,"૪ ":593,"ઓન":283,"એક":16184,"૦ ":810,"ઉદ":317,"ઉત":2595,"ઉપ":1216,"સગવ":904,"ઉમ":378,"આં":1102,"અગ":1187,"અં":436,"ઇડ":287,"ષના":751,"આહ":296,"ષનો":362,"આવ":35199,"આદ":1695,"આઠ":804,"આણ":440,"અમ":826,"અર":256,"અન":2071,"ંવ":912,"ંસ":486,"ંત":1429,"ંથ":310,"ંદ":2195,"ંધ":939,"ંબ":1038,"ંભ":281,"ંમ":287,"ંક":540,"શ્ચ":11655,"ંગ":2855,"ંખ":377,"ંચ":4016,"ંજ":464,"ંટ":564,"ંડ":982,"ંઠ":1433,"હે 
":1152,"શ્ર":320,"શ્વ":348,"૧૩":1467,"૧૧":1091,"૧૯":283,"૧૦":694,"વડો":2345,"ોટ":1277,"ોડ":1517,"ોજ":373,"વલી":267,"ોન":3584,"ોધ":322,"ોત":616,"ોદ":3809,"ોગ":284,"ોક":3826,"ોઇ":523,"્ટ":1147,"્ત":4385,"્ણ":325,"્દ":602,"્થ":651,"્ધ":1001,"્પ":336,"્બ":262,"વર્":2136,"્ક":609,"્ગ":267,"્ચ":11827,"ોમ":362,"ોલ":1612,"ોય":246,"ોર":2045,"ોવ":257,"વલસ":582,"ોળ":645,"્સ":302,"્ષ":3855,"્વ":11263,"્લ":14321,"્ર":8621,"્ય":26212,"્મ":1641,"ૂર":2771,"ુદ":699,"ુધ":815,"ુન":510,"ુણ":524,"ુત":213,"ુમ":369,"ુર":8763,"ુપ":2377,"ુવ":804,"ુસ":354,"ુલ":6351,"ૂચ":717,"વનો":376,"વનુ":6376,"ૃત":300,"ેક":249,"ેત":5179,"ેડ":2071,"ેટ":618,"ેઠ":219,"ેજ":326,"ેગ":653,"ેઘ":271,"ષા ":222,"વતા":228,"ેર":3345,"ેલ":37195,"ૈક":6315,"ેશ":13804,"ેવ":2024,"ેન":1094,"ેપ":540,"ેમ":3602,"ૈદ":389,"ેસ":990,"હિં":776,"હાલ":1485,"હાર":571,"હિન":919,"ઇ ":1018,"આ ":2702,"ાં ":41350,"ઓ ":7041,"હેર":525,"હેલ":239,"હેવ":536,"હેસ":730,"એ ":663,"ાઇ ":270,"હોદ":867,"હ્મ":326,"ાઓ ":6349,"ં ":62940,"ાગ ":214,"ાડ ":990,"ાદ ":1109,"ાણ ":231,"ાત ":12086,"ાન ":979,"ામ ":12051,"ાલ ":1653,"ાર ":2510,"ાય ":3046,"ાવ ":357,"િક ":1785,"ાસ ":1162,"ષિણ":1462,"ે ":34827,"ો ":11870,"સણા":264,"ષ ":917,"સ ":4627,"સમો":292,"વ ":3614,"શ ":598,"સરા":281,"ષ્ટ":602,"િ ":698,"ુ ":1292,"ી ":24520,"ા ":103799,"સવા":466,"સુર":1060,"દ ":4991,"થ ":242,"સીઓ":544,"ન ":4557,"સુદ":463,"ધ ":850,"સાડ":665,"સાત":801,"સાણ":973,"સાગ":213,"પ ":563,"સાય":2235,"સામ":236,"સાર":662,"સાવ":281,"સાબ":1406,"બ ":1292,"મ ":27791,"સોન":293,"ર ":16908,"ય ":11483,"લ ":10111,"હતા":268,"હત્":6808,"ળ ":723,"ક ":19636,"ગ ":1433,"સ્વ":268,"સ્થ":430,"ચ ":2010,"સ્ટ":235,"સ્ત":1316,"સ્ક":359,"જ ":4325,"ટ ":1263,"ડ ":2185,"ઠ ":958,"ઢ ":645,"હવે":238,"ણ ":3405,"હવા":327,"ત ":27700,"ૂચ ":710,"િત્":251,"ાસા":294,"ાસિ":270,"ાસી":1732,"ાહો":869,"ાષા":325,"ાસણ":337,"ુલ ":6018,"ાસર":237,"ાષ્":514,"ાલન":2232,"ાલપ":364,"ાલય":562,"ંગ ":904,"ારે":564,"ાર્":428,"ારો":350,"ારી":908,"ારા":1301,"ારત":13028,"ારમ":342,"ારડ":261,"ાયત":689,"ાયડ":229,"ામા":16128,"ુર ":3220,"ાવી":500,"ાવા":1445,"ાવલ":274,"િકે":222,"િક્":471,"ંચ ":1075,"ાવત":219,"ાળા":1219,"ાલો":667,"ાલુ":18139,"ાલી":427,"ાલિ":223,"ાલા":242,"ાનો":1081,"ંટ ":317,"ાનપ":429,"ાના":6337,"ાનુ":3493,"ાની":787,"ંજ ":235,"ાદર":601,"ામપ":347,"ંત ":433,"ામન":2336,"ામમ":2770,"ાબર":1409,"ાપ્":387,"ુદ ":457,"ાપી":523,"ાપુ":503,"ાપા":321,"ંદ ":612,"ીદા":229,"ીનગ":468,"ીના":6429,"ીને":1070,"ીની":270,"ીનો":264,"િસ્":315,"િલ્":13910,"િલો":366,"િવસ":2730,"િવા":1650,"િનો":226,"િના":989,"િયા":2403,"ાંટ":436,"ાંઠ":1422,"ાંડ":461,"ાંગ":1094,"ાંચ":1295,"ાંત":590,"ાંધ":547,"ાંદ":419,"ાંસ":263,"ીઓ ":502,"િમ ":11653,"િપ ":367,"િત ":360,"િણ ":1447,"ાણા":1231,"ાણી":404,"ાતી":550,"ાત્":264,"ાથમ":992,"ાતે":354,"ાટી":295,"ાટે":280,"િંમ":251,"િંદ":527,"ાડી":1596,"ાડા":2060,"ાજક":237,"ાજી":257,"ાજ્":12491,"ીય ":442,"ાકી":371,"ાગમ":18270,"ાગન":475,"ાઉદ":238,"ું ":21442,"ૂર્":2560,"ેટ ":236,"ુણા":437,"ુજબ":1107,"ુજર":11840,"ુપા":2208,"ુધન":587,"ીયન":389,"ીમા":379,"ીયા":1731,"ીમખ":262,"ીસમ":292,"ુકા":17641,"ુકો":529,"ુખ્":4233,"ુરી":2300,"ુરુ":440,"ુરા":1358,"ુરત":829,"ુવા":696,"તઘર":640,"ણવા":740,"દસ ":512,"ણાવ":406,"દા ":938,"તપુ":465,"તનગ":252,"દી ":294,"તના":216,"દુ ":479,"તમજ":2184,"તમા":321,"તરી":269,"તરા":350,"તો ":423,"થા ":316,"થી ":996,"નવ ":475,"થવા":574,"ના ":41544,"ને ":3899,"ની ":3744,"નો ":5537,"દરા":2352,"દરમ":423,"તું":277,"તાલ":18057,"તાર":306,"તાપ":566,"તાન":302,"ધા ":271,"તેમ":3056,"તેર":1418,"તેન":315,"દક્":1484,"ત્ત":2727,"થમિ":987,"ત્વ":7552,"ત્ય":281,"ત્ર":1452,"થયે":642,"નપુ":541,"પી 
":564,"ધીન":468,"ધાર":249,"ધાન":290,"નવસ":483,"નસવ":370,"ધ્ય":3481,"નર્":560,"દુધ":605,"દેપ":250,"દેશ":13431,"દેવ":535,"ધની":595,"દાવ":797,"દિક":394,"દાર":330,"દાદ":262,"દિવ":4303,"દાહ":863,"નગર":2214,"નગઢ":247,"પર ":286,"પણ ":420,"દોદ":293,"દ્વ":331,"દ્ર":524,"ધરા":484,"બા ":381,"પટે":233,"પાવ":384,"પાર":273,"પાલ":2355,"પાટ":403,"પાડ":735,"પાં":1274,"પશ્":11653,"પશુ":2197,"પલબ":596,"પરા":655,"પંચ":2575,"નું":10559,"નાં":923,"નાર":240,"નામ":306,"નાન":1138,"પછી":428,"ન્ય":559,"ન્દ":395,"બહુ":235,"બાક":373,"બાર":800,"બાય":217,"રજ ":269,"મા ":715,"મી ":253,"યડ ":217,"યન ":455,"બરક":1386,"મો ":1343,"પૂર":2616,"પુર":4959,"પૈક":6303," આ ":2656,"પોર":340," એ ":485,"પ્ર":3459,"પ્ય":389,"માં":41157,"માટ":294,"માન":402,"માણ":276,"માત":359,"માલ":325,"માર":263,"મિક":1063,"મહત":6802,"મહા":1848,"મહિ":850,"મહુ":233,"મહે":917,"યત્":701,"મેઘ":271,"મુખ":4261,"મુજ":1125,"મુવ":353,"યતઘ":640,"મપુ":552,"રે ":767,"મમા":2808,"મધ્":3365,"મદા":1324,"રી ":4562,"મના":2419,"રો ":232,"મજુ":2185,"મખે":264,"રા ":5544,"મતન":247,"મથક":765,"ભિલ":318,"ભાર":12956,"ભાગ":18607,"ભાષ":335,"બોર":287,"રત ":13401,"યા ":2563,"રમ ":485,"યો ":254,"ભરૂ":710,"બ્ર":421,"બ્ધ":597,"ળા ":1138,"રેગ":384,"ળી ":385,"રોત":365,"રાં":597,"રાય":238,"રામ":591,"રાવ":421,"રિક":325,"રાષ":510,"રાડ":230,"રાણ":287,"રાત":11870,"રાથ":991,"રાપ":541,"રાજ":13365,"રીય":783,"રીન":986,"રું":396,"રીક":260,"રિય":414,"રૂચ":710,"રવા":911,"રહવ":219,"રહે":1153,"રહ્":327,"રપુ":361,"રમ્":382,"રમા":804,"લો ":1417,"લા ":25318,"રના":235,"યેલ":764,"લી ":1812,"રદે":816,"રડી":227,"યાલ":588,"યાર":1934,"યાન":576,"યાપ":305,"રકા":1582,"લય ":553,"યવસ":2216,"યપૂ":410,"મોડ":217,"મોટ":454,"યનો":326,"યના":11717,"મ્ય":415,"યમા":407,"લન ":2219,"મ્બ":232,"લ્લ":14131,"વે ":1140,"લોલ":465,"લોડ":396,"લોદ":449,"લોક":3642,"વા ":7780,"વી ":1602,"લુક":18002,"લીમ":321,"લુણ":389,"લિય":312,"લું":9913,"વસ ":2535,"લાન":6932,"લિપ":369,"લાસ":286,"લાવ":223,"લાલ":307,"લામ":5764,"લસા":677,"શક ":371,"વર ":236,"લબ્":597,"વદ ":505,"લપુ":412,"વત ":759,"વડ ":417,"ર્ષ":1963,"ર્વ":2626,"ર્ય":411,"ર્મ":900,"કી ":503,"કા ":605,"કે ":834,"કો ":1354," ૧૦":680,"૧૦ ":535,"૧૧ ":1036,"૧૩ ":1412,"ગઢ ":538," ૧૩":1456," ૧૧":1079," ૧૯":274,"ગર ":1959,"કડી":283,"કડા":244,"કરવ":314,"કરી":1162,"કવા":445," હો":303," હિ":944," હા":540," સો":475," સૌ":273," સ્":714," સિ":322," સા":3562," સુ":1717," હત":654," સમ":395," સર":370," સત":226,"કાલ":238,"કામ":9196,"કાર":676,"કીન":6304,"કુલ":6004," લુ":420," લી":386," લિ":414," લા":253,"કહે":527," લો":3728,"કાં":1539," રહ":1203," રા":13805,"કાન":2517,"કાઓ":6055," સં":1919," શા":1268," શિ":264," શહ":475," સગ":922,"ઘર ":662," શ્":290," વા":1551," વિ":1924," વસ":1926," વ્":2581," વૈ":515," વે":236," વડ":2142," વર":2148," શક":474," વલ":624," વદ":498," પછ":431," પટ":261," પણ":391," નો":313," પા":2926," પુ":759," પૂ":2113," પૈ":6312," પર":651," પશ":13847," પહ":226," ધા":386," દ્":305," દે":13156," નગ":894," ધર":567," ના":1491," નિ":408," નસ":380," ને":386," પં":2780," ધો":248," નવ":1299," નર":628," મધ":3389," ભિ":335," ભા":32144," મથ":749," ભર":862," મે":521," મો":1041," મા":2519," મુ":5993," મહ":9472," પ્":3289," પો":348," બો":476," બે":295," મં":217," બી":247," બા":1559," બહ":292," ડા":372,"કોન":2841," ડે":860,"કોળ":215,"કોટ":356,"ક્ષ":1887,"ક્ર":890," ત્":349," થય":778," દક":1478," તે":5306," થવ":379," દર":551," દુ":722," દિ":2858," દા":1294,"કેટ":221," દસ":704," તર":348," તિ":233," તા":24717," ૯ ":497," ગો":553," ગ્":552," ગુ":12029,"ખેડ":1595,"ખેત":4414," ગા":16841,"ગના":399," ૮ ":570," ખે":5659," ખા":1675," ગણ":237," ચર":392," ઘો":266," છો":266,"ગણવ":693," ઝઘ":243," છે":23548," ચો":383," 
જં":214," ચા":714," ચિ":300," ઝા":412," જ્":480," જુ":259," જા":631," જિ":13843," જો":332," જે":1921,"ખાસ":931," એવ":6158,"ખાન":217," એક":16184,"ખાત":404," ૫ ":907," ૪ ":431," કહ":562," કવ":319," કુ":6308," કા":1279," કો":831," ૭ ":640," કે":929," ક્":383," કડ":504," કપ":373," કલ":240," કર":2121," ઇડ":271," આહ":296," આવ":35196," આદ":1582," ઉત":2589," ઉપ":1212," ઉમ":372,"ગવડ":906,"ગરહ":219,"ગમા":18349,"ખ્ય":4323," અં":432," અગ":1187," આં":1036," અન":2069," અર":252," અમ":825," આઠ":804," આણ":440,"ગાં":635,"ગામ":16798,"ગિય":1057,"ગુજ":11824,"ઘડી":254,"ગોર":435,"ગોધ":222,"ગ્ર":743,"ઘરજ":242,"છી ":438,"ઘોડ":335," જ ":2898,"છે ":23415,"જબ ":1105,"ચરો":362,"ચાર":560,"ચાય":582,"ચિમ":11652,"ચાં":471,"ચાગ":369,"જી ":285,"જા ":331,"ચમહ":1210,"જે ":397,"જકો":214,"ઝઘડ":245,"જિલ":13824,"જુર":2204,"છોટ":245,"જરા":11822,"ઝાલ":270,"ઠા ":1548,"ટી ":375,"ટા ":336,"જેવ":714,"જેત":374,"જ્ય":12917,"ટે ":225,"ડી ":2334,"ડા ":4375,"ડર ":279,"ટેલ":254,"ટાઉ":239,"ડો ":637,"ણી ":405,"ણા ":1630,"ટ્ર":578,"ડેર":603,"ડેડ":232,"તી ":3871,"ડોદ":1844,"તે ":714,"ડિય":272,"ડાસ":222,"ણંદ":569,"ડીય":740,"તિ ":370,"તા ":7106,"તર ":2912,"થક ":760,"ડાં":355,"ડબ્":254,"ણે ":229},"n_words":[2118540,2468202,1874859],"name":"gu"}
\ No newline at end of file
+++ /dev/null
-{"freq":{"D":3214,"E":2525,"F":2735,"G":2624,"A":5600,"B":4044,"C":5446,"L":2999,"M":4837,"N":2358,"O":1895,"H":2508,"I":3050,"J":1816,"K":1288,"U":823,"T":4063,"W":1765,"V":1278,"P":3992,"S":6051,"R":3127,"f":3967,"g":6300,"d":10934,"e":38551,"b":4292,"c":11175,"a":34025,"n":25648,"o":25621,"l":18280,"m":9009,"k":3209,"h":10406,"i":28660,"w":2312,"v":3224,"u":12118,"t":20514,"s":19158,"r":26616,"p":7557,"z":1550,"y":5684,"x":3018," o":1293," d":1423,"р":1134," a":1241,"с":1017," c":818," t":1122," p":2507," s":1080," J":1761," K":1182," H":2237," I":2087," N":1804," O":1286," L":2498," M":4152," B":3287," C":4445," A":4293," F":2284," G":2345," D":2725," E":1939,"л":851,"к":858,"и":1731,"о":1736,"н":1202," S":4832," R":2670,"в":941," P":3269,"а":1783," W":1618," V":962,"е":1219," T":3332,"ז":63772,"ח":143673,"ה":590284,"ו":795043,"ג":118718,"ד":182547,"א":355837,"ב":395285,"מ":381378,"ן":91462,"ל":422282,"ם":165990,"ך":21160,"כ":123771,"ט":120550,"י":838057,"ץ":10248,"פ":163464,"ק":174715,"צ":88276,"ס":156906,"נ":265593,"ף":12208,"ע":168037,"ש":294877,"ר":442678,"ת":390314,"ְ":2319,"ִ":1697,"ֵ":751,"ֶ":1684,"ַ":1974,"ָ":1859,"ֹ":1678,"ּ":3241,"ي":1156,"ل":1891,"م":1083,"ن":836,"ب":960,"ا":2457,"ر":948,"A ":1146,"Co":1139,"Ch":747,"Ca":914," ט":10158," י":46710," כ":41422," ל":87440," מ":140680," א":111429," ב":199095," ג":26594," ד":18598," ה":315869,"Ma":1436," ו":76633," ז":14443," ח":27536," ת":26310," ש":120088," ר":28151," ק":30023," צ":12233," פ":33679," ע":48214," ס":29027," נ":37369,"Th":1329,"St":822,"Pa":795," ا":1195,"a ":5141,"i ":1770,"ge":1176,"he":2979,"ha":1572,"g ":1420,"ea":1448,"ec":949,"ed":1174,"de":2377,"di":1239,"do":869,"h ":1424,"el":2456,"ei":783,"en":3848,"em":940,"et":1553,"es":3076,"er":6931,"ca":1297,"e ":9963,"be":993,"da":916,"f ":1405,"ct":774,"co":1157,"ci":1010,"ch":2176,"ce":1369,"c ":854,"d ":3184,"at":3174,"as":1657,"ar":4336,"au":801,"al":3282,"ai":824,"am":1581,"an":5294,"ac":1575,"ad":954,"nt":2386,"ns":1298,"no":898,"nn":849,"of":1227,"oc":763,"om":1498,"on":5076,"ol":1724,"ot":992,"os":1322,"ou":1121,"op":789,"or":3356,"r ":4009,"pe":961,"ph":746,"lo":1342,"ll":2162,"o ":2412,"ma":1706,"me":1669,"mi":920,"mo":751,"na":1919,"nc":982,"nd":2342,"ne":2187,"ng":1840,"ni":1853,"m ":1750,"li":2362,"le":2918,"la":2312,"n ":6713,"hi":1088,"ho":885,"id":1119,"ic":2833,"ia":1960,"ig":907,"ie":1690,"k ":1105,"ir":937,"is":2730,"it":1899,"il":1849,"in":4459,"io":2236,"l ":3308,"y ":3108,"vi":786,"ve":1332,"x ":2437,"ul":958,"ur":1480,"us":2545,"um":935,"un":962,"tu":790,"tt":809,"to":1696,"tr":1218,"te":3049,"ti":3191,"th":2015,"ta":1915,"ss":1070,"st":2602,"so":786,"se":1404,"si":1360,"rs":966,"rt":1408,"ry":760,"ro":2308,"rn":818,"ri":3441,"re":2927,"rd":1160,"ra":3300,"t ":3529,"s ":8103,"px":1778,"一":845," Ma":1422," Ca":896," Co":1129," Pa":776," St":815," Th":1324," of":1007," de":821," 
px":1777,"זר":6935,"זק":1127,"חב":10029,"חז":2980,"חו":22900,"חה":3687,"חד":8305,"זי":12406,"זל":838,"זם":1284,"זכ":3534,"זנ":886,"זמ":4439,"זע":2250,"חר":11087,"חק":7976,"חת":6995,"חש":6932,"טב":2858,"טא":3408,"טו":20425,"טה":4110,"טח":2065,"חי":19519,"חל":11067,"חם":835,"חמ":5926,"חנ":2382,"חס":4114,"חצ":1267,"הת":16129,"הש":20373,"הר":18303,"הק":13593,"הצ":7002,"הפ":12266,"וה":16855,"וד":26125,"וג":17753,"וב":35986,"וא":44658,"וּ":1323,"הכ":8371,"וֹ":1575,"הט":3770,"הי":51622,"הז":2998,"הח":11926,"הה":6606,"הו":49027,"הע":15156,"הנ":11770,"הס":11464,"המ":50070,"הן":3379,"הל":10568,"הם":6285,"ות":101531,"וש":17323,"וצ":14838,"וץ":2534,"ור":67093,"וק":18906,"זו":10251,"זה":6566,"זא":1585,"וי":25913,"וך":3767,"וכ":13432,"ול":56167,"וו":21707,"וז":11389,"וח":11917,"וט":7960,"וס":26165,"וע":21211,"וף":4260,"ופ":26005,"ום":16031,"ומ":32989,"ון":31711,"ונ":45275,"גת":2080,"דב":1971,"דג":1438,"דא":1968,"גע":907,"גר":12191,"גש":1319,"גם":5839,"גל":15851,"גנ":4603,"גן":1862,"גמ":1889,"גו":18230,"גה":2711,"גד":7511,"גי":15377,"גז":1369,"הא":24412,"הב":13059,"הג":10416,"הד":6369,"דפ":866,"דצ":1312,"דק":1275,"דר":19078,"דש":2624,"דת":5288,"דל":3528,"דמ":4712,"דם":3790,"דנ":1611,"דן":1253,"דע":6022,"דס":1191,"דה":10078,"דד":1385,"דו":30532,"די":39839,"בא":28974,"את":18469,"אר":23727,"אש":17310,"אצ":2077,"אק":3488,"אפ":6737,"אף":1142,"אנ":19673,"אס":5720,"אן":3277,"אמ":12397,"אם":2757,"אל":31779,"אכ":1022,"אך":1814,"אי":39448,"אט":3314,"אח":13816,"אז":5466,"או":54111,"אה":8465,"אד":6606,"אג":2909,"אב":7150,"גא":2681,"גב":4647,"בש":16669,"בת":15296,"בצ":5367,"בק":6122,"בר":44457,"בס":9775,"בע":20054,"בפ":5660,"במ":24479,"בנ":12394,"בן":3469,"בי":60852,"בל":12579,"בכ":5400,"בז":1800,"בו":37124,"בט":3788,"בח":6845,"בג":4476,"בב":4468,"בה":14562,"בד":9533,"ממ":12115,"מן":5860,"מנ":16156,"מס":13321,"מע":13284,"מפ":8474,"מצ":11869,"מק":11494,"מר":20980,"מש":30914,"מת":18345,"נד":8678,"נג":16204,"נב":2857,"נא":5195,"נח":5856,"נז":1832,"נו":50382,"נה":26549,"נכ":2770,"ני":73546,"נט":9488,"נן":997,"נמ":4014,"נם":1134,"נל":2549,"לנ":7233,"לס":4718,"לע":5121,"לל":8431,"לם":5118,"למ":14400,"לן":1558,"לר":5001,"לש":7046,"לת":11787,"לפ":7565,"לצ":2285,"לק":11094,"מו":54355,"מה":19324,"מד":15338,"מג":4215,"מב":11717,"מא":19955,"מל":12739,"מכ":7596,"מי":43286,"מט":8749,"מח":12385,"מז":4310,"כד":4207,"כה":6173,"כו":21072,"כז":3111,"כח":1672,"כי":15596,"כך":1583,"כא":3499,"כב":6283,"כג":1009,"כפ":2033,"כת":9752,"כש":2355,"כר":5552,"כמ":4368,"כם":777,"כל":14948,"ככ":754,"כס":2535,"כנ":8682,"כן":3171,"לז":1198,"לח":8146,"לה":26418,"לו":38179,"לך":2679,"לכ":7962,"לט":7328,"לי":67951,"לג":4986,"לד":10370,"לא":20135,"לב":10370,"טק":1116,"טס":846,"טנ":3561,"טע":1653,"טל":6264,"טן":1424,"טמ":2400,"טי":27400,"טכ":861,"יו":59458,"יז":7368,"יח":8678,"יט":18165,"יב":19301,"יג":6611,"יד":26912,"יה":49212,"יא":28186,"טר":15887,"טת":2335,"יר":32678,"יק":28848,"יצ":10768,"יץ":1467,"יפ":9697,"יף":1039,"יע":8301,"יס":16568,"ינ":36790,"ין":20736,"ימ":19188,"ים":101424,"יל":28185,"יכ":8588,"יך":2854,"יי":52949,"יש":24046,"ית":80424,"צמ":5084,"צע":4237,"צפ":3530,"צח":1679,"צי":17340,"צט":759,"צל":2956,"צא":6953,"צב":5471,"צג":1091,"צד":1322,"צה":5120,"צו":14859,"פצ":904,"פק":4972,"פת":9249,"פר":34424,"פש":3647,"רץ":4177,"רפ":7179,"רק":7125,"רצ":5822,"רס":9590,"רנ":6801,"רע":2837,"רם":1942,"רל":4398,"רן":1612,"רמ":9087,"רי":69906,"רט":11059,"רכ":12424,"רך":5453,"רה":18046,"רו":57537,"רז":1504,"רח":8185,"רא":29022,"רב":18351,"רג":11231,"רד":8475,"קר":20408,"קש":3566,"קת":4771,"קצ":3292,"קפ":1285,"קע":1184,"קס":4328,"קנ":4930,"קן":2101,"קמ":2
688,"קם":1371,"קל":7255,"קי":22013,"קט":8069,"קו":31546,"קד":4923,"קה":11443,"קב":8684,"קא":6360,"צת":2627,"צר":9786,"עד":7161,"עה":5494,"עו":21588,"עז":1654,"עט":1111,"עי":23608,"על":30126,"עם":5137,"עמ":4465,"ענ":3092,"עס":1118,"סע":1202,"סס":1542,"ספ":16242,"סף":1568,"סר":7306,"סק":6961,"סת":3711,"עב":7995,"סד":6581,"סה":2338,"סב":2454,"סג":2607,"סח":1327,"סט":14456,"סו":21766,"סכ":1718,"סל":4205,"סי":26035,"סנ":2332,"סם":1071,"סמ":4060,"נע":2683,"נס":7827,"ננ":1118,"נק":6861,"נצ":2907,"נפ":4308,"נת":16355,"נש":4892,"נר":4214,"סא":1865,"פט":6029,"פי":27041,"פה":6116,"פו":28843,"פח":3684,"פנ":5942,"פס":4846,"פע":6986,"פל":6474,"פן":2295,"פב":1568,"פא":3156,"פד":1152,"פג":1094,"עש":4774,"ער":15071,"עק":2148,"עצ":4716,"עת":7895,"שת":9019,"שש":1558,"תב":4462,"תא":5204,"תו":25811,"תה":9277,"תג":1436,"תי":25751,"תח":9940,"תל":3463,"תם":2303,"תכ":2023,"תנ":5503,"תמ":4850,"תן":2959,"תפ":5557,"תע":2232,"תר":13462,"תש":2068,"תק":7602,"רש":6651,"רר":1758,"רת":19384,"שא":5518,"שג":2036,"שב":15227,"שה":14417,"שד":1381,"שו":28172,"שט":3529,"שח":6900,"שי":32057,"שך":1218,"שכ":3883,"של":59496,"שם":6738,"שמ":16387,"שנ":26253,"שע":3886,"שפ":9870,"שק":2676,"שר":23791,"תת":1846,"מ ":2663,"ן ":91356,"ס ":23921,"ע ":17848,"ף ":12199,"ִי":1007,"פ ":1749,"ץ ":10234,"צ ":3355,"ק ":22523,"ר ":101491,"ש ":18495,"ת ":251160,"ב ":39174,"א ":61709,"ד ":40754,"ג ":17864,"ו ":72231,"ה ":204656,"ח ":14016,"ז ":8988,"י ":129045,"ט ":20042,"כ ":3096,"ך ":21128,"ם ":165828,"ל ":123422,"ال":1187," יי":2324," יכ":1029," יל":1306," ימ":1454," יע":851," יצ":2302," יר":1739," יש":8585," כג":1001," כב":1385," כא":2275," כו":3554," כד":2689," כך":799," כי":5212," כח":844," חל":3208," חמ":880," חי":4768," חס":883," חר":777," חש":1430," טו":2163," טב":768," טל":1002," טי":1699," טר":1404," יח":2213," יו":7458," יה":3689," יד":8356," מז":2002," מח":5968," מט":3237," מי":10487," מכ":3995," מל":5214," מא":7744," מב":4174," מג":2553," מד":6437," מה":5964," מו":15023," מש":13103," מר":5197," מק":5213," מצ":3521," מת":7226," מס":7731," מנ":4010," מן":1074," ממ":8626," מפ":3253," מע":6426," כר":1353," כת":1870," כש":1438," כפ":1233," כנ":1474," כל":6776," כמ":3311," לט":1128," לי":6245," לכ":2758," לה":9774," לו":4443," לז":853," לח":2914," לא":10716," לב":3660," לג":1530," לד":1145," לת":2908," לש":5145," לר":2977," לק":2667," לצ":1757," לפ":4810," לע":3788," לס":2540," לנ":1953," למ":9220," לל":1387," בפ":5193," בע":12834," בס":7728," בר":10133," בק":4194," בצ":3678," בת":9609," בש":14953," גב":1847," גא":1205," בד":5369," בה":6389," בב":3563," בג":3763," בח":3909," בט":1774," בו":5324," בז":1578," בכ":4321," בל":4682," בי":25678," בן":1774," בנ":6905," במ":21078," גר":3583," דב":749," גד":1588," גו":3424," גי":2015," גל":1445," גם":4780," אפ":2121," אק":1199," אצ":767," אס":1813," אנ":6385," בא":23971," אש":4957," אר":6566," את":12538," או":23084," אז":1939," אח":7629," אב":4176," אג":950," אד":3113," אל":10175," אם":984," אמ":5812," אי":11144," אך":1639," זה":3518," זו":3424," וק":1350," ור":2477," וש":2610," ות":1633," ונ":3060," ומ":9400," ופ":2170," וע":3597," וס":1644," וח":1862," וז":955," וו":1631," ול":4945," וכ":2933," וי":4333," חב":4303," חד":1040," חו":5710," זר":823," זמ":1557," זי":910," זכ":1452," הג":7501," הד":4857," הא":22400," הב":10880," דר":2719," דמ":1254," די":3533," דו":3757," דה":785," וב":8620," וג":1290," וד":892," וה":11812," וא":5274," הר":14413," הש":17912," הת":11571," הפ":9733," הצ":5867," הק":10138," הס":10053," הנ":10137," הע":13912," הם":2069," הל":5334," הן":954," המ":46981," הי":41072," הט":3324," 
הכ":7288," הו":33156," הה":5396," הח":10762," הז":2732," שה":7711," שב":7588," שח":2817," שט":1041," שו":5579," רש":1366," שא":2492," שפ":2381," שע":1892," שר":1669," שק":787," של":48394," שכ":1941," שי":8345," שנ":9947," שמ":8425," שם":3634," רא":3404," רב":4590," רו":6814," רח":840," קר":4233," קש":1007," רפ":810," רק":1211," רצ":860," רי":2280," רכ":802," תח":2475," תי":1934," תו":4939," תא":1279," שש":1373," שת":1185," תק":2224," תש":1190," תר":2157," תע":967," תפ":1385," תנ":1167," תכ":774," תל":1823," סי":4571," סל":908," סמ":1246," סג":795," סב":864," סד":1598," סו":5658," סט":1619," נש":1719," נת":1130," נע":1238," נפ":2034," נק":2006," נכ":1356," ני":4715," נמ":1957," נג":1220," נב":1056," נא":920," נח":2096," נו":11348," נה":1061," עת":798," עצ":1752," עק":770," ער":2088," עש":1568," ענ":811," עמ":1580," עם":3883," על":19873," עי":2972," עו":4351," עד":2831," עב":2405," סק":894," סר":2435," ספ":4534," פר":8948," פו":6232," פי":5929," פל":1804," פע":2438," פנ":1331," פס":1467," פא":1358," קל":1932," קט":1426," קי":3820," קצ":927," קנ":905," צר":1714," קו":8076," קא":953," קב":2948," צי":2521," צפ":808," צמ":781," צו":1425," צב":1575," ال":1017," ב ":10067," ג ":4498," ד ":1157," ה ":7037," א ":1385," י ":1129," כ ":2336," ל ":2779," ו ":2034," ז ":1154," מ ":2074," ס ":1112," צ ":1288," ר ":1507,"The":1046,"al ":1171,"and":1073,"an ":1270,"ati":1099,"מוז":5024,"מוד":3483,"מוג":886,"מוב":939,"מהל":1373,"מהמ":885,"מדר":908,"מהו":1225,"מדי":5777,"מדו":1516,"מדע":3235,"מגו":862,"מבר":4568,"מגד":820,"מבו":1915,"מבי":1059,"מבנ":1260,"מאת":1015,"מאר":916,"מאפ":1026,"מאנ":883,"מאל":2405,"מאי":3462,"מאז":946,"מאו":3114,"מאה":2706,"מלו":953,"מלא":1080,"מלכ":2095,"מלך":1227,"מלי":1534,"מלח":3450,"מכו":2726,"מכי":1235,"מים":6416,"מיל":2833,"מין":1783,"מינ":3538,"מיש":1371,"מית":4649,"מיק":1216,"מיר":984,"מטר":3264,"מיי":3085,"מיד":2520,"מיה":1218,"מיו":3528,"מחק":931,"מחל":1800,"מחי":970,"מטי":1746,"מחש":1462,"מחו":2697,"מחז":1008,"מזר":2302,"מוע":3080,"מוס":2260,"מונ":5841,"מון":1066,"מוצ":2037,"מופ":1468,"מוי":943,"מוח":898,"מול":1345,"מוכ":1297,"מזו":770,"מוש":3780,"מות":5842,"מוק":2142,"מור":3868,"מפי":1187,"מפל":1393,"נו ":5903,"מפו":1340,"מפר":1910,"מעו":2266,"מעל":1128,"מספ":3471,"נד ":1870,"מעב":969,"מער":4843,"מעש":819,"נה ":21979,"מנה":2231,"מנו":2766,"מנט":976,"מני":6289,"is ":1053,"ion":1503,"מסו":3392,"מסי":756,"מנת":816,"מסג":979,"נג ":1904,"ממל":1294,"ממו":4003,"ממש":2774,"נם ":1133,"מתא":1357,"מתי":1175,"מתח":1378,"מתו":2071,"מתמ":1624,"מתק":1096,"מרו":1617,"מרי":5941,"מרח":1042,"ני ":19093,"מרץ":1409,"מרכ":2955,"משו":2959,"משח":2455,"משך":1130,"משי":1615,"מרת":774,"משפ":4826,"משק":861,"משר":976,"משת":2362,"משל":2762,"משמ":3131,"משנ":2281,"מצו":2210,"מצב":1295,"נח ":1477,"מצא":2270,"מצע":1849,"מצי":1590,"מקו":5589,"מקב":868,"נט ":1605,"מצר":828,"מקר":1322,"נק ":1115,"נס ":1624,"נן ":997,"נר ":886,"נת ":12553,"סט ":2719,"סד ":1188,"סה ":2203,"סם ":1071,"נאי":1779,"סי ":2536,"סל ":766,"נהל":1094,"נהי":761,"נוד":953,"נוב":2639,"נוא":1590,"נהר":1042,"נדי":1685,"נדו":1190,"נהג":967,"נדר":1651,"נגל":9186,"נגד":1024,"סף ":1568,"נגר":1057,"סס ":885,"נבח":833,"נטי":3381,"נטר":1455,"ניו":3890,"ניה":4469,"ניב":1798,"נחש":1738,"נטו":1177,"סר ":1369,"he ":1611,"סק ":1452,"נוי":2975,"נול":6258,"נון":1701,"נונ":778,"נוס":3397,"נוע":4528,"נוצ":1379,"נור":1169,"נות":10949,"נוש":1445,"נלא":901,"נכת":1073,"נית":14496,"סת ":2182,"ניס":1942,"ניק":1685,"ניי":6008,"נימ":1196,"נים":13661,"נסו":805,"נסי":2174,"נמצ":1867,"עו ":882,"נפו":1293,"עד ":4359,"נסת":1066,"עה ":5479,"נער":986,"נקר":1955,"עי 
":4052,"נשי":2815,"נצי":1255,"נקו":1045,"נקצ":758,"עם ":5134,"נתי":767,"נתו":1033,"על ":22307,"סבי":1330,"סגר":950,"סגנ":827,"סוג":2713,"סוד":1173,"סוב":810,"סדר":3650,"ia ":834,"סוי":1607,"סוף":1248,"סול":1222,"סון":1399,"סור":2397,"סות":809,"סופ":3199,"סטי":3364,"סיד":979,"סיב":1007,"סיו":1804,"סיה":1411,"סטר":2818,"ער ":2948,"סטו":3180,"סיפ":1197,"סינ":1401,"סיס":1561,"סיל":886,"סים":1765,"סימ":1300,"סיט":1729,"סיי":2145,"סיכ":1356,"עת ":4300,"סית":2310,"סלו":858,"סמו":1244,"ing":1041,"in ":936,"ספט":1488,"ספי":2004,"ספו":1293,"ספר":9874,"סקו":1337,"סקי":1760,"סרט":3915,"סרי":1021,"עבר":4731,"עבו":2091,"עדו":1037,"עוב":1493,"פו ":941,"עסק":909,"פה ":6052,"עני":1193,"עמו":895,"עמי":1194,"עמד":1340,"עלי":4061,"עלה":1199,"עלו":1136,"עלת":1040,"עיק":3105,"עיר":4131,"עית":1602,"עיי":870,"עיל":2019,"עים":3131,"עין":910,"עיו":894,"עור":1459,"עות":4912,"עול":5515,"עונ":916,"עוס":1354,"עוד":1596,"עזר":762,"er ":2469,"es ":1643,"פן ":2293,"en ":849,"עתי":2916,"ent":903,"עשר":1306,"עשי":1214,"עשו":933,"עשה":950,"ערי":962,"ערכ":3489,"ערך":1379,"פי ":4902,"ערב":4397,"ערו":1170,"פט ":1621,"עקב":1399,"עצמ":2020,"עצה":792,"פר ":7389,"פת ":4142,"פונ":2515,"פון":2263,"פול":5205,"פוב":883,"פות":3286,"פור":5443,"פוצ":750,"פופ":985,"פוס":1005,"פוע":1359,"פטמ":1413,"פטי":1246,"פיה":1485,"פטר":1084,"פחה":834,"פחת":1765,"פבר":1306,"פוא":886,"פאר":769,"פעי":1681,"פעל":1386,"פעו":1775,"צה ":4704,"פרט":1640,"פרי":7391,"פרו":4724,"פרנ":1094,"פרס":3792,"פרד":2122,"צי ":1464,"de ":816,"פשי":825,"פרש":1302,"פרק":1005,"פקי":1813,"פלו":847,"פלג":1367,"פלי":1158,"פים":2239,"פינ":1223,"פיע":1348,"פיו":1127,"פיז":1279,"פיי":2109,"פיל":2559,"פיר":2169,"פיק":1363,"פית":1710,"צב ":1365,"פני":3690,"פנה":1015,"פסי":1405,"פסו":1262,"צא ":3406,"צת ":2388,"צר ":2282,"פשר":1288,"פתח":1344,"פתי":2367,"צע ":1320,"קו ":1976,"קט ":953,"קי ":3109,"קל ":917,"קב ":846,"קה ":10128,"צוע":1224,"צות":4135,"צור":3752,"צוו":876,"צוי":1042,"ציא":1461,"ציב":1399,"קר ":5102,"קת ":3999,"ציר":1681,"צים":1036,"ציי":1115,"ציו":2783,"ציה":2306,"צלי":914,"קם ":1371,"צאה":819,"קן ":2098,"צבי":1113,"צבא":1757,"קס ":1455,"צאת":990,"קדמ":1302,"קדו":1269,"קהי":992,"קונ":1555,"קומ":2802,"קופ":3480,"קור":4356,"קות":1519,"קוב":1891,"רץ ":4170,"קוו":1390,"קוד":2387,"קוט":813,"קול":4389,"קום":1590,"רן ":1611,"קאי":4278,"קבל":961,"קבו":5125,"קבי":757,"רס ":2957,"קלא":1049,"קלי":2877,"קלו":814,"קמה":931,"שא ":795,"רק ":2876,"קטי":1305,"קטו":2534,"רר ":986,"קיב":1075,"רש ":946,"קטר":975,"קים":4228,"קיס":929,"קיד":1911,"קיו":786,"קיי":2845,"קית":1359,"רת ":15678,"קיצ":864,"קיר":818,"רה ":17356,"רד ":3185,"צעו":1152,"צעי":1124,"צפו":2771,"רו ":3945,"רא ":2013,"צמב":1291,"צמח":843,"רג ":1850,"רב ":5168,"רל ":1007,"רם ":1940,"רט ":4971,"רח ":1986,"רך ":5443,"צרפ":2865,"רי ":10968,"צרי":2230,"רחש":787,"רחי":1692,"רחב":1963,"רחו":949,"ריא":2270,"רטי":3060,"שר ":8585,"רטו":1441,"רות":11282,"רוש":2596,"רור":832,"רוק":2070,"רופ":3572,"רוע":945,"רוס":3581,"רונ":2892,"רון":3292,"רומ":4126,"רום":2275,"רוי":1525,"רוח":842,"רוט":809,"רוו":1191,"רוז":748,"רוב":4518,"רוא":1983,"רדי":2706,"רבי":6491,"רבע":1063,"ראת":808,"ראש":9879,"רבו":2717,"רבה":870,"רגנ":814,"רגל":1905,"רגי":1770,"רגו":2907,"תב ":1936,"רנס":871,"רנט":865,"רני":2334,"רמו":1109,"רמנ":3395,"רמי":1311,"רכה":807,"רכז":2614,"רכו":1721,"רכי":2246,"רכת":2068,"רלי":1812,"ריו":3530,"ריט":3551,"ריה":4718,"ריד":1253,"רים":13911,"רינ":921,"ריי":3646,"ריל":1756,"ריכ":1158,"ריק":5899,"ריס":1584,"שת ":3670,"רית":8516,"רכב":1910,"קצי":1354,"שו ":1123,"שה ":6000,"שג ":788,"קסי":1168,"קני":2966,"שב 
":2846,"ראה":1168,"ראו":1207,"ראל":9123,"ראי":2021,"שם ":6732,"של ":44424,"קשר":1129,"שך ":1218,"קשו":1188,"קרא":3185,"שי ":4651,"קרב":1789,"קרו":2813,"קרי":3474,"שהי":1689,"שהו":2779,"שהת":1193,"שוו":1653,"שוי":825,"שוב":2695,"שות":2730,"שור":3173,"שומ":896,"שון":3078,"שול":846,"שונ":5449,"שחק":4451,"תר ":6554,"שטח":1349,"שיא":1478,"שיב":1320,"שיח":755,"שיו":1010,"שיט":1510,"שיי":2304,"שימ":3122,"שים":3750,"שינ":1048,"שיצ":765,"שית":2397,"שיש":1013,"שיר":3655,"תת ":1033,"שכו":948,"שלת":1194,"שלי":4040,"שלט":1378,"שלו":3513,"שלה":1787,"שמא":2250,"שמה":1020,"שמש":2616,"שמע":1452,"שמו":3471,"שמי":1858,"שנה":2722,"שנו":3666,"שנת":8163,"שני":8407,"תה ":7963,"רסי":2582,"רסם":846,"רעי":965,"רפת":3016,"רפי":1377,"רפו":1781,"תו ":7307,"רקי":1347,"תח ":1273,"רצו":3053,"רצי":1126,"רשו":916,"רשי":915,"תי ":6224,"תל ":1457,"רתו":1195,"רתי":1593,"רשת":1744,"px ":1770,"תם ":2303,"שאי":1679,"תן ":2959,"שבת":1116,"שבי":2213,"שבע":790,"שאר":864,"שבה":1333,"שבו":2486,"תחו":3127,"תחי":1668,"תחת":1243,"תוא":930,"תוך":1588,"תוח":1128,"תונ":1543,"תוכ":2531,"תול":1229,"תות":809,"תור":2674,"תוצ":851,"תופ":747,"תכו":755,"תלמ":974,"תים":1902,"תיי":3333,"תיו":2454,"תיה":1049,"תיב":1533,"תיא":1180,"תית":3825,"תיק":1981,"תנה":885,"תנו":2011,"תמש":815,"תמט":1174,"תמו":1033,"תפי":902,"תפת":844,"תפק":1672,"שפע":1069,"שפו":1080,"שפה":1050,"שפי":949,"שפח":2708,"שפט":1854,"שתמ":820,"שתי":1677,"שרת":856,"שרי":1851,"שרא":9189,"שרו":1075,"שרה":794,"שרד":795,"תאי":785,"תאו":1106,"תהל":1012,"תאר":1331,"ng ":993,"ne ":883,"nd ":899,"תקו":3171,"תקי":1081,"תרח":857,"תרו":1096,"תרג":858,"תרב":1108,"of ":1020,"on ":2345,"le ":936,"אב ":829,"אה ":7609,"או ":12752,"אז ":1578,"אי ":11190,"אך ":1814,"אל ":12893,"אם ":2755,"אן ":3273,"אחד":3813,"אזו":2699,"אות":6901,"אוק":2466,"אור":6849,"איט":1941,"איו":924,"אטר":826,"איס":807,"איר":3360,"איל":857,"איי":2254,"אינ":4275,"אין":766,"אימ":1541,"אים":3388,"בר ":12599,"אחר":6475,"אחת":1793,"אדו":1100,"אדם":2266,"בע ":2728,"אוד":916,"אוג":1935,"אוח":876,"אוט":953,"אוו":1522,"אול":3101,"און":955,"אונ":2600,"אומ":3134,"אופ":3417,"אוס":2423,"אמר":4875,"אמצ":1763,"אמנ":1370,"אמי":1391,"אמו":966,"גד ":977,"אסי":1374,"אסט":928,"אנש":1036,"אנר":800,"אנט":1124,"אני":2535,"אנג":9388,"אנד":988,"אנו":1034,"בת ":4691,"אית":2756,"איש":1805,"אלק":946,"אלג":769,"אלב":2498,"אלו":1949,"אלה":1036,"אלי":7303,"ארד":814,"ארג":2148,"גי ":1572,"ארב":1076,"ארו":1606,"ארץ":2338,"ארצ":2697,"ארק":837,"ארי":2497,"ארכ":746,"אשו":5612,"אשר":5633,"אשי":2353,"גה ":2608,"גו ":887,"אפי":1390,"אפר":2492,"אפש":1248,"באי":3225,"באמ":1587,"באל":995,"באז":1071,"באו":6670,"בבי":1380,"באר":2896,"באנ":6445,"באפ":1580,"בגו":776,"בדי":2083,"בדו":915,"בגר":1359,"גל ":2819,"אתר":938,"גם ":5839,"גן ":1860,"אף ":1141,"אס ":1146,"בא ":1578,"את ":15913,"אש ":2340,"אר ":5565,"בי ":5655,"בה ":7790,"בו ":4295,"בד ":1438,"אבו":828,"אבי":2570,"בן ":3463,"בל ":2984,"גנו":1314,"גני":1049,"הה ":812,"הו ":1555,"גור":2881,"גות":1342,"גוף":967,"דר ":2321,"גיי":929,"גים":1431,"גיל":1327,"גיע":1045,"דש ":1335,"גיה":3078,"דת ":3965,"גית":995,"גלו":929,"גלי":10256,"דבר":1224,"דוב":1184,"דול":3841,"דום":754,"דומ":1571,"דון":1523,"דוג":872,"דוד":964,"דות":4827,"דור":4722,"דוק":822,"דוע":2536,"גרו":886,"גרת":1011,"גרי":1714,"גרפ":891,"גרמ":3486,"הם ":6282,"הל ":908,"הן ":3375,"במח":1274,"במו":1362,"במי":1660,"במא":2636,"במד":1407,"במה":1632,"במב":1705,"במר":2370,"במק":1759,"במש":1939,"במע":1091,"במס":1456,"בלי":2952,"בלו":1708,"בלת":1123,"בעב":1312,"בעו":2020,"בעל":3414,"בעי":5046,"בער":1666,"בעת":1302,"דה ":9750,"בפב":1248,"דו 
":2707,"בנו":3853,"בנה":1480,"בני":4749,"בסו":1238,"בסד":891,"בסי":1809,"בספ":2568,"בוצ":4259,"בור":4953,"בות":6458,"בחי":1679,"בחו":1131,"בהם":920,"בהי":972,"בהו":872,"בדצ":1243,"בדר":2390,"בוע":1416,"בוס":1642,"בונ":896,"בום":1558,"בול":2422,"בוי":829,"בוה":789,"בוד":2124,"ביש":3045,"ביר":2459,"בית":7075,"גת ":1847,"בכל":1398,"בכי":1314,"בחר":970,"גר ":827,"בימ":897,"בין":7303,"ביל":1749,"בים":5704,"ביט":1549,"ביי":2219,"ביצ":869,"ביע":1697,"בינ":3973,"ביו":8219,"ביה":1061,"ביד":1090,"ביב":2499,"ביא":912,"גבי":1029,"גבו":1377,"גאו":1715,"דן ":1252,"גוב":1126,"גוד":1622,"גון":2245,"גוס":1728,"גול":1378,"גדר":1056,"גדו":3497,"גבר":964,"דע ":2354,"ברו":5256,"ברה":2480,"די ":14259,"ברא":1602,"בקר":1360,"בקי":1190,"בקו":797,"בצר":1026,"בצע":1020,"בצפ":918,"בצו":754,"בפר":1402,"בפו":753,"בפי":884,"דם ":3784,"דל ":1350,"בתי":1021,"בתק":1270,"בתח":2033,"בתו":2504,"בשי":1378,"בשמ":809,"בשם":1347,"בשל":1420,"בשנ":7415,"ברת":3153,"ברט":849,"ברי":11218,"ברס":1667,"הקב":1121,"הצפ":881,"הצל":875,"הצי":1591,"הרא":4672,"הרב":2013,"זי ":1210,"הקר":1616,"הקש":833,"הקת":1394,"הקל":776,"הקי":1474,"הקה":1071,"הקד":827,"הקו":2620,"השב":876,"הרפ":831,"הרש":823,"הרו":2886,"הרכ":769,"הרי":1168,"התא":1307,"התו":1514,"השת":1139,"השמ":1266,"השם":895,"השנ":4351,"השפ":1366,"השו":2146,"השח":757,"השי":2832,"השל":1958,"העת":967,"הער":1024,"העו":4093,"העב":1724,"העל":1178,"העי":2782,"זה ":4910,"הפע":812,"הפס":746,"הפר":2442,"זו ":2407,"הפו":2450,"הפי":1400,"הצב":868,"המח":2130,"המז":1081,"המי":3193,"המד":2330,"המו":7722,"המב":1301,"המא":3556,"המצ":1978,"המק":2404,"המר":1784,"המש":5759,"המע":1738,"המפ":1404,"הממ":1866,"המנ":1147,"המס":1855,"המכ":1676,"המל":2091,"המת":2867,"הנו":2020,"הנח":758,"הני":1012,"הנפ":766,"הנמ":908,"הסו":2019,"הסי":1362,"הסד":840,"הסר":1363,"הספ":2367,"היש":1505,"היר":1734,"ות ":80938,"הכו":1820,"הכל":1158,"הכי":931,"הכנ":1262,"הלא":944,"הלה":874,"הלו":1160,"הלי":2332,"הלכ":875,"הלך":1189,"ויל":1139,"וים":1245,"וימ":813,"וינ":891,"ויה":1089,"ויו":3873,"ויז":1668,"ויי":2636,"ויד":832,"וטו":1573,"וטי":1439,"זר ":1170,"וחר":794,"וכן":1385,"וכנ":2823,"וכל":1110,"וכי":1321,"וכו":764,"וכה":1041,"וכב":1408,"ויר":1363,"ויק":1124,"ווא":1655,"us ":1690,"ווה":2251,"ווי":6670,"וול":1093,"וון":1540,"והא":894,"והו":1891,"והי":2099,"והמ":1272,"ודע":1169,"ודר":1718,"ודת":964,"וחד":1468,"וחו":1384,"וזע":1797,"וזי":3484,"וונ":2693,"וות":1300,"וור":1036,"ואר":4871,"ואל":1667,"ובה":2620,"ובו":2126,"ובי":4910,"ובד":1206,"ובא":1004,"ואה":1373,"ואו":1083,"ואי":2839,"וגר":1479,"וגמ":793,"ודל":997,"ודי":6497,"ודה":2328,"ודו":2620,"ובר":4203,"ובת":944,"ובע":1673,"ובל":2998,"ובמ":2420,"וגי":5063,"וגו":2244,"וגד":821,"זם ":1283,"התח":1134,"התי":1325,"התמ":749,"התנ":1704,"התפ":1798,"התק":1851,"התר":1227,"וט ":1289,"דצמ":1291,"וי ":5737,"דרה":2116,"וז ":2245,"וח ":3705,"דעי":2475,"וד ":6343,"וה ":2998,"וב ":6671,"וג ":4167,"דמו":2336,"דמי":1260,"וא ":27697,"דלי":756,"דיר":1020,"דית":2578,"הר ":1518,"דיה":1129,"דיו":2800,"דינ":5116,"דין":785,"דים":4959,"דיי":1568,"היי":4423,"היל":1466,"היו":4371,"היח":928,"היס":2192,"הים":937,"הינ":2071,"היג":810,"וש ":3750,"היה":10243,"היד":1593,"היא":15411,"הטו":996,"החש":794,"ור ":18092,"החו":2245,"החי":1691,"החל":2789,"וק ":4196,"החב":1254,"הזמ":843,"הופ":1656,"הור":1218,"הוק":1750,"הוצ":992,"הונ":877,"הול":1454,"הוו":1959,"וץ ":2534,"הוד":5252,"הוא":27359,"ההי":939,"ההו":815,"ופ ":863,"הדר":1419,"הדי":781,"וף ":4256,"הדו":1789,"הגר":1312,"הגי":1488,"הגב":784,"וע ":5599,"הגו":1529,"הגד":2288,"הבר":3864,"האנ":1474,"האמ":2115,"האל":2120,"האר":2435,"tio":1159,"וס 
":5753,"הבי":3093,"הבו":902,"האד":1377,"האי":3937,"האו":3742,"האח":1546,"ון ":31676,"ום ":16021,"ול ":7891,"ter":1142,"דרו":3914,"the":876,"דרי":1886,"דרך":2231,"וך ":3767,"דרת":2145,"חבר":6406,"חבי":909,"חות":2707,"חור":1374,"חוק":2664,"חופ":953,"חומ":2243,"חדש":1272,"חוז":2017,"חום":1881,"חול":1611,"חוב":1301,"חוד":1401,"חיר":890,"טת ":2178,"חית":881,"טר ":2558,"חיו":928,"חיד":1965,"חיי":3491,"חיל":2274,"חים":2086,"חינ":1388,"חין":819,"חמה":852,"חמי":1161,"יא ":17223,"חמת":1678,"חלו":1698,"חלל":804,"חלק":3950,"זמר":1390,"זמן":1228,"טו ":1328,"טה ":3869,"זער":1824,"זרח":2887,"טי ":5165,"טח ":798,"טן ":1424,"חס ":1181,"זהו":873,"זור":3460,"חק ":2379,"חר ":3837,"זיה":1878,"זיק":3731,"זית":1113,"חת ":4976,"זכו":1572,"ולר":980,"ולק":1057,"ולת":1183,"ולפ":880,"ולנ":2842,"ולם":4221,"ולמ":1167,"ולל":2126,"ולי":9326,"ולט":1565,"ולו":7007,"ולה":3768,"ולד":6790,"ולא":1327,"וכר":1278,"ומת":1301,"ומש":1030,"ומר":2913,"ומי":8418,"ומנ":2251,"ומה":2366,"ומד":1168,"ומב":746,"ומט":1282,"ומח":813,"ומו":3468,"ומא":1433,"וסס":1023,"וסף":1300,"וספ":1419,"וסק":2400,"וסד":1576,"וסט":3543,"וסי":4415,"וסו":1601,"ונס":965,"ונת":1474,"ונק":1268,"חב ":1144,"ונג":1320,"ונא":1208,"וני":18082,"ונח":1810,"ונט":1231,"ונו":5615,"ונד":1458,"ונה":7200,"חה ":3681,"ועצ":1483,"ועת":846,"ועד":3169,"ועה":1693,"חד ":5298,"ועו":1416,"ועל":2118,"ועי":1952,"וצא":2429,"ופי":5387,"ופה":2721,"ופו":2815,"חו ":753,"ופק":800,"ופר":3436,"ופש":817,"ופת":1896,"ופן":1487,"ופס":1055,"ופע":1474,"וקם":1123,"וקס":776,"וקמ":1227,"וקו":1308,"וקי":1776,"וקט":2107,"חי ":1719,"ורב":1210,"ורג":4279,"ורא":1280,"וקר":2331,"וצי":1905,"וצה":1895,"וצו":1018,"וצת":1866,"וצר":3353,"ושי":1977,"ושה":1203,"ושו":851,"ושל":2823,"ותה":2054,"ורט":1801,"ורי":12062,"ורך":1570,"ורד":1610,"ורה":4165,"ורו":3649,"ורנ":1343,"ורס":1879,"ורכ":1436,"ורמ":1653,"ורר":1139,"ורק":1753,"ורת":4134,"ורש":1062,"ושב":1741,"ושא":1174,"ושג":868,"חם ":833,"ותו":3558,"ותח":851,"ותי":4897,"ותם":860,"ותר":5363,"חל ":1502,"יית":6408,"ייר":2338,"ייש":1691,"ייצ":1194,"ייק":972,"יינ":2765,"ייס":1621,"יכי":851,"יכו":4344,"יחס":2243,"יטי":4701,"יטל":2302,"יטו":2202,"יטה":1667,"יטר":974,"יטת":1710,"יטנ":845,"ייט":1193,"ייך":816,"יים":14831,"ייל":1292,"יין":3012,"יימ":2364,"ייב":892,"ייד":1305,"ייה":4423,"ייח":1458,"יוס":1553,"יונ":4734,"יון":4211,"יומ":966,"יום":4082,"יול":2364,"יות":18525,"יור":2161,"יוצ":1391,"יזי":2803,"יזם":1225,"יחד":770,"יחו":1331,"יחי":2092,"ידע":1040,"ידת":830,"יהו":5488,"יהם":1573,"יהן":839,"יוו":4083,"יוח":1326,"יעי":1794,"יעו":1404,"יעה":1184,"יסר":789,"כה ":5083,"ינס":944,"ינט":1703,"יני":8382,"ינל":1089,"ינא":757,"כב ":1369,"ינג":1937,"ינה":5280,"ינו":10255,"יסי":1267,"יסט":5187,"יסד":879,"יסה":750,"יסו":2347,"ינת":1901,"ימי":5064,"ימו":4842,"ימפ":1867,"ימה":1856,"ימת":1210,"ימש":796,"ילד":1523,"ילה":3661,"ילו":5745,"ילי":5359,"יכר":764,"ילת":1987,"טרי":3898,"טרה":1144,"טרו":3685,"טרת":804,"טרנ":1010,"ידו":5598,"ידה":2025,"ידי":10222,"יגה":763,"יגו":1454,"יבר":2757,"יבת":1107,"יבי":2538,"יבו":5867,"יבה":1458,"יאנ":1337,"יאל":1853,"יאט":1005,"יאה":813,"יאו":2501,"טון":1905,"טונ":1205,"טוס":1037,"טור":5386,"טות":1114,"יק ":3514,"יר ":9468,"טיו":1116,"טיב":1924,"יש ":4075,"טים":3865,"טינ":2155,"טיי":2059,"טבע":1372,"יע ":1962,"יף ":1039,"טוב":2684,"יץ ":1463,"טמב":1415,"טני":1443,"טית":3454,"ית ":68847,"טיס":800,"טיפ":954,"טיק":2311,"טכנ":772,"טלי":1241,"טלו":1568,"טלק":1324,"יז ":756,"יו ":8828,"יט ":1800,"חקי":1130,"חקן":1010,"חקנ":990,"חקר":1503,"יח ":1007,"חסי":1544,"יג ":1633,"יב ":3393,"יה ":40026,"יד ":4579,"ין ":20713,"יס ":3129,"חשו":1442,"יך 
":2851,"חשב":3253,"חרת":762,"חרי":2444,"חרו":2409,"יי ":1538,"ים ":101313,"יל ":6999,"כנו":1432,"כני":2511,"כנס":2070,"כפר":832,"לת ":6746,"כינ":1581,"כימ":930,"כיר":1145,"כיב":953,"כיו":2320,"כיה":761,"כים":1975,"כיל":789,"לר ":750,"כמו":1809,"כלכ":1033,"כלל":3229,"כלי":2030,"כלו":1791,"כבי":2214,"כבת":844,"כגו":871,"כדו":2961,"כאש":1171,"לס ":944,"כוכ":1222,"כול":4477,"כון":1365,"כונ":3439,"כזי":1062,"כות":3899,"לק ":2384,"כדי":968,"כוח":1334,"לי ":12633,"לך ":2677,"לט ":1236,"לן ":1554,"לם ":5107,"לל ":3884,"לא ":4586,"לב ":1394,"לה ":14910,"לד ":5523,"לו ":5117,"כר ":1517,"כת ":4160,"כן ":3170,"יפי":1104,"יפו":4212,"יפה":1175,"כו ":1279,"כז ":1400,"יצו":3383,"יצי":2337,"יצא":1573,"יקר":3694,"יקת":771,"יקנ":1394,"יקט":1251,"יקי":1290,"יקל":1432,"יקא":4207,"יקה":6636,"יקו":3145,"ירת":2005,"ירו":10154,"ירה":3648,"כי ":2345,"ירא":823,"ירי":3756,"ישר":9232,"ישו":3196,"ישה":1288,"כך ":1583,"ישי":4067,"כל ":4979,"יתה":3362,"יתו":3399,"יתי":1065,"יתן":1501,"כם ":777,"מת ":6084,"מר ":3374,"מש ":2759,"מס ":899,"מן ":5852,"לתו":964,"לתי":1345,"מל ":1062,"לקו":1461,"לקי":2709,"לקט":1068,"לשנ":1062,"לשי":1010,"לשו":879,"מי ":7001,"לרא":1055,"לרי":1005,"לרו":1023,"מה ":11039,"לעת":1312,"מד ":1904,"לפר":766,"לפנ":1939,"לפי":2283,"מו ":5183,"למע":1288,"למש":1364,"למו":2080,"למד":977,"למי":2394,"למנ":879,"לנד":1704,"לני":1139,"לנו":2241,"לכל":1891,"לכה":1199,"לכו":898,"ליק":1573,"ליצ":890,"ליש":1631,"לית":12132,"לין":1391,"לינ":1320,"לים":8070,"לימ":1810,"ליפ":1428,"ליס":1055,"לכת":1177,"ללי":1608,"לוש":1494,"לות":5393,"לומ":2089,"לום":1009,"לונ":1458,"לון":862,"לוי":1233,"לול":1458,"לור":772,"לוק":1039,"לוס":1978,"לחי":1661,"לטו":1868,"לטי":2006,"לחמ":2604,"ליא":1164,"ליד":1413,"ליה":3296,"ליג":1396,"ליח":784,"ליט":3150,"ליו":3604,"ליז":849,"ליל":2282,"ליי":2952,"ליך":906,"לגו":927,"לגב":764,"לבנ":1140,"לבי":1775,"לבו":2832,"לדו":1001,"לדה":1350,"לדי":1178,"לוג":4860,"לוב":1152,"לוח":1147,"לוו":2569,"להת":947,"להק":2345,"להי":1584,"לאו":4844,"לאח":2966,"לאי":1340,"לאס":784,"לאר":955,"כרי":914,"כרו":1009,"כתי":946,"כתב":2605,"כתו":1006},"n_words":[7414842,8743276,5934016],"name":"he"}
\ No newline at end of file
+++ /dev/null
-{"freq":{"ौद्":366,"ोली":389,"ोर्":465,"ोमी":388,"्तम":479,"्तन":384,"्तर":2639,"्त्":1391,"्थल":430,"्ता":2582,"्ती":679,"्ति":2706,"्तु":954,"्थि":2834,"्था":3750,"्द्":869,"्दी":1870,"्दू":492,"्दे":907,"्ट्":2750,"्टे":7603,"्टी":634,"्टि":711,"्तक":539,"्बन":468,"्मक":445,"्यक":2308,"्मा":2864,"्मि":1000,"्रं":360,"्मी":437,"्यत":638,"्यय":385,"्यम":558,"्रक":2642,"्यव":609,"्रच":370,"्रज":367,"्यु":778,"्या":6530,"्धा":551,"्धि":463,"्ना":572,"्पन":572,"्पा":591,"्प्":3747,"्गत":419,"्का":992,"्कृ":823,"्कर":450,"्जा":403,"्ञा":1890,"्टर":764,"्चि":1116,"ोंन":705,"ौर ":706,"ं":116003,"ः":743,"ँ":3754,"आ":15110,"इ":21960,"अ":27216,"ऊ":1116,"ई":8670,"उ":14369,"ए":27071,"ओ":3623,"ऑ":695,"ऐ":1181,"ग":41430,"ख":13380,"क":215389,"औ":11975,"छ":5467,"च":24607,"घ":4688,"ट":40532,"ञ":2576,"झ":1114,"ज":58287,"ठ":3871,"ड":21061,"ढ":1924,"ण":16159,"त":129370,"थ":26984,"द":62970,"ध":21789,"न":137720,"प":89801,"फ":9525,"्ग ":934,"ब":39694,"भ":28885,"म":108014,"य":96626,"र":228209,"ल":79901,"व":82288,"ष":22409,"श":41726,"ह":118206,"स":149246,"़":11159,"ि":139433,"ा":290518,"े":193119,"ू":22463,"ृ":6345,"ी":110466,"ु":44034,"ौ":4425,"्":228350,"ो":68898,"ै":59521,"ॉ":2831,"।":45019,"०":5718,"१":6322,"्क ":738,"६":1789,"७":1738,"८":1943,"९":4350,"२":3762,"३":1587,"४":1437,"५":1969,"्न ":1500,"ोजन":549," ख":3003," ग":13738," औ":11821," क":119739," ओ":938," ऐ":1154," ऑ":666," ट":5844," ज":31363," झ":690," च":7337," छ":3781," घ":3397," इ":19284," आ":12686," अ":26917," ए":21008," ऊ":806," उ":13706," ई":1344,"्फ ":515,"्म ":3545,"ोड़":535,"्य ":9507,"्र ":7477,"ोते":895,"ोती":1270,"ोता":2537," २":2506," ३":490," १":5155," ।":6381,"्व ":2678,"ोनो":409,"ोने":1142," प":54146," फ":4760," न":21169," म":54702," य":22634," ब":23290,"्ष ":1625," भ":19961," ड":2225," द":21719," ध":2884," त":12822," थ":8999," ह":72412," स":78945," ल":14663," र":25548," श":12231," व":27670,"्स ":1196,"्च ":578,"्ट ":1436,"्ञ ":468,"्ड ":1049,"्ठ ":1088,"्ण ":1354,"्थ ":1814,"ोगि":441,"ोगो":559,"्त ":4240,"्ध ":2352,"्द ":1588,"्सा":631,"्ष्":370,"्हे":874,"्हो":949,"्स्":3660,"्ली":1260,"्ला":820,"्ले":511,"्रद":1892,"्रथ":561,"्रत":2596,"्रण":709,"्यू":989,"्रप":384,"्रम":2375,"्रभ":518,"्यो":1594,"्रय":1925,"्रव":977,"्रश":494,"्रह":1076,"्रस":1694,"्रा":9061,"्रि":3453,"्री":4034,"्रै":386,"्रे":9372,"्रो":1860,"्षण":540,"्षे":3114,"्षि":1460,"्षा":1408,"्वत":875,"्वर":1044,"्वव":540,"्वप":609,"्वी":1298,"्शन":1985,"्वे":430,"्वि":632,"्वा":6601,"०० ":579,"२००":1298,"१९९":360,"१९६":420,"१९७":415,"ंत ":1187,"ंड ":1010,"ंग ":1786,"ंक ":375,"ंभ ":447,"ंद ":452,"ंश ":574,"ंस ":426,"ंह ":485,"आप ":923,"अंत":806,"अंग":1106,"अक्":469,"इन ":809,"ंचत":1845,"ंचा":2297,"ंग्":1940,"ंगा":789,"ंगी":464,"ं। ":4299,"ंको":451,"ंक्":1624,"ंख्":1085,"ंगल":563,"ंटे":1825,"ंडि":369,"ंजा":609,"ंने":716,"ंबं":450,"ंपा":1007,"ंत्":1705,"ंति":457,"ंतर":775,"ंद्":705,"ंदी":875,"ंदि":932,"ंदर":556,"ंबर":485,"ंयु":368,"ंस्":2087,"ंसा":366,"�":2362,"थे ":819,"दन ":568,"थी ":791,"था ":4899,"त् ":405,"थम ":523,"तंत":641,"तो ":1178,"ं ":59244,"ः ":691,"ताओ":415,"तान":1545,"ताब":607,"तार":557,"तिक":1335,"दो ":833,"तिज":375,"तिन":613,"ँ ":2207,"तिय":1006,"तिर":505,"तिह":787,"आ ":1631,"नई ":675,"धन ":595,"दी ":4353,"दू ":610,"तरी":448,"तरा":413,"तरह":445,"दा ":857,"तर्":854,"दि ":1483,"तमा":404,"तमि":373,"ण्ड":966,"णों":461,"दर ":576,"दल ":742,"तथा":2577,"तत्":438,"णाल":368,"थ ":4109,"द ":7269,"ध ":3357,"न ":42221,"ड ":6850,"था।":3059,"ठ ":1539,"थाप":1123,"ण ":9157,"थान":1787,"त ":29579,"थित":2566,"धी ":490,"ज ":2925,"दक्":984,"ञ ":468,"धा ":464,"नट ":1785,"ट 
":6235,"धि ":2431,"थवा":400,"च ":2310,"छ ":1066,"क ":31192,"तों":455,"ग ":7730,"ख ":1929,"त्स":696,"त्व":1830,"त्प":791,"त्र":11745,"त्य":2756,"त्म":761,"त्त":3587,"ए ":4565,"तीन":446,"तीय":3767,"तीस":449,"ई ":6870,"ै ":18248,"दार":977,"दान":855,"दाय":435,"दिल":1253,"दिश":630,"दिर":1086,"े ":101676,"दिय":1540,"दित":1005,"दिन":569,"न् ":786,"दुर":430,"ू ":2066,"दूर":671,"दूस":592,"ि ":14043,"नी ":5102,"ी ":75983,"ु ":3758,"ा ":97285,"़ ":2122,"दस्":366,"ह ":13867,"ने ":12971,"स ":15469,"ष ":2544,"श ":4366,"व ":7299,"ल ":18408,"नि ":406,"दर्":1109,"ना ":7450,"र ":61007,"थी।":911,"य ":22189,"म ":13770,"भ ":1015,"नव ":411,"ब ":2705,"फ ":1049,"थे।":1504,"प ":5685,"टना":394,"डल ":378,"डी ":506,"डा ":698,"टर्":362,"ड़ ":675," �":578,"टती":1811,"ञान":1620,"ट्र":7028,"ट्ट":617,"टेश":7359,"ढ़ ":636,"टीक":471,"णु ":439,"णी ":648,"णा ":355,"तक ":1940,"ति ":4868,"ता ":14635,"तु ":1219,"ती ":9089,"ते ":6248,"तन ":484,"ड़े":606,"ड़ी":1222,"ड़ा":1450,"डिय":1027,"तम ":378,"तर ":1819,"जे ":3791,"जा ":1910,"ज़ ":425,"जी ":1578,"चेन":368,"चीन":1126,"चुन":405,"जो ":3111,"चिक":478,"चित":1154,"चार":1698,"चाल":2122,"चिम":876,"जंक":1375,"च्च":632,"छोट":541,"जन्":1234,"टक ":496,"जधा":578,"जनस":468,"जनी":553,"छूट":1815,"जस्":392,"टन ":357,"जहा":441,"ज़ी":477,"जिस":3147,"जिल":1663,"जिन":942,"जित":402,"जार":493,"जाब":559,"जान":2165,"जात":5721,"जैस":743,"ज्य":2693,"ज्ञ":2457,"जीव":1088,"टर ":1798,"टे ":2183,"टि ":384,"टी ":1658,"टा ":740,"ंघ":459,"ंख":1287,"ंग":7039,"ंक":4013,"ंड":2775,"ंट":3074,"ंज":1716,"ंच":5039,"केन":435,"ंश":984,"ंस":3555,"ंह":717,"ंव":676,"् ":2686,"खने":369,"ंध":1776,"ंद":4729,"ंन":763,"ंथ":399,"ंत":5036,"ंय":471,"ंप":2156,"ंभ":782,"ंब":1984,"ो ":17876,"ँच":523,"कोई":460,"कों":1045,"आत":950,"इं":1275,"कृत":1818,"आज":527,"अस":694,"अव":2719,"आक":698,"अल":1444,"अर":2196,"अभ":1051,"अम":1289,"अब":415,"अप":3105,"आई":372,"अध":2198,"अन":4741,"अथ":432,"इत":740,"इट":387,"आस":422,"इक":606,"आव":1056,"आम":383,"आय":903,"आर":1257,"आप":1303,"कृष":566,"आद":1231,"आध":1054,"आन":511,"अं":2476,"आं":508,"अत":785,"अक":781,"अग":575,"ं।":6630,"उप":2592,"उस":2266,"इन":2901,"इल":681,"इस":13512,"आ।":435,"ईस":387,"उत":2684,"उन":3336,"उद":1042,"कोच":453,"कोड":3985,"एं":534,"कोल":359,"एँ":425,"कोश":370,"एक":15570,"क्ट":677,"क्त":3551,"क्य":529,"क्श":1460,"क्ष":8266,"क्र":2787,"क्स":4316,"ओर":445,"ओं":2444,"ऐस":714,"एव":3043,"एल":360,"एस":427,"गर":2860,"गय":3634,"गल":1055,"गव":557,"खे":1130,"खो":528,"गम":494,"ख्":2681,"गभ":616,"खा":2157,"गठ":414,"खि":516,"खी":461,"घं":1798,"गढ":741,"गण":767,"गत":1406,"गद":372,"गए":383,"क्":22746,"कै":618,"के":37285,"खन":943,"गई":554,"को":17366,"कॉ":440,"कि":12948,"की":20919,"गं":513,"का":35945,"कृ":2446,"कु":3003,"कू":402,"कस":766,"कव":956,"कह":3138,"कल":2049,"खक":372,"कम":1276,"कर":12591,"कप":515,"कथ":448,"कन":1065,"कड":382,"खं":419,"कत":2310,"कट":630,"और":11428,"कई":777,"कं":723,"चु":923,"ची":1816,"जं":1545,"चि":3214,"चा":5438,"चे":1055,"चौ":426,"च्":1117,"० ":1980,"जग":567,"गढ़":657,"चत":1993,"चन":1667,"चर":750,"चल":1479,"घा":478,"चं":451,"। ":33428,"गति":373,"चक":424,"गा":3720,"गह":412,"गी":1381,"गि":1175,"गु":1958,"गो":2483,"ग्":4970,"गे":620,"गै":364,"घर":423,"टन":928,"ञा":1890,"टत":1909,"टा":1764,"टल":396,"टर":2646,"४ ":778,"झा":416,"३ ":798,"टक":827,"जो":3936,"जै":1167,"जे":4339,"जू":402,"जी":3381,"जु":839,"जा":12605,"जि":7022,"ज़":2064,"घंट":1796,"२ ":974,"ज्":5651,"जन":5056,"जध":594,"छू":1841,"छा":395,"जह":523,"जस":483,"१ ":954,"जर":910,"जल":806,"जब":740,"छो":713,"जय":526,"जम":644,"ठा":568,"५ 
":1070,"टे":10515,"ठन":369,"ट्":8239,"टो":662,"टी":2663,"टि":1894,"ड़":5317,"डा":1474,"डि":2023,"डी":831,"डल":514,"६ ":930,"गभग":596,"तं":976,"तः":367,"ढ़":1322,"७ ":816,"ड्":669,"डो":499,"डे":781,"डु":358,"णि":851,"णी":758,"णु":577,"णा":1135,"तक":2674,"८ ":811,"णन":433,"९ ":917,"तव":480,"ति":11566,"ता":20016,"तु":2358,"ती":14469,"तत":547,"तथ":2628,"तप":364,"तन":1343,"ण्":1311,"तम":1364,"णो":495,"तल":566,"तर":5109,"थव":456,"था":11678,"थी":1994,"थि":3214,"ते":6961,"तो":1865,"त्":23294,"थम":587,"थल":512,"दक":1341,"दस":512,"दश":363,"दृ":463,"दू":2227,"दु":1528,"दी":5182,"दि":8815,"दा":4680,"थो":462,"थे":2431,"दन":776,"दल":1038,"दर":2412,"थ्":491,"धा":4715,"नट":1830,"नत":1382,"खें":445,"नद":975,"धी":935,"धि":5621,"धु":599,"दो":2067,"दौ":382,"द्":15486,"नई":683,"दे":6364,"धन":912,"नक":3879,"नग":1441,"धर":1283,"नर":642,"नल":477,"नव":1823,"नन":633,"नप":443,"ध्":2713,"नम":981,"नी":7183,"पं":1090,"नु":2801,"ने":15948,"नस":1797,"नह":1786,"ना":16910,"नि":11710,"पक":1243,"नो":2338,"न्":16749,"पत":2217,"पन":4849,"पद":2167,"पड":542,"पट":576,"पश":1179,"पह":3589,"पस":581,"पल":544,"पय":630,"पर":12235,"बई":388,"पे":1223,"पै":501,"पू":3275,"पृ":1255,"पा":8076,"पि":1966,"पी":1550,"बं":1558,"पु":5232,"फल":545,"फर":388,"प्":26390,"पौ":356,"पो":892,"बन":3146,"फे":662,"बद":869,"फि":861,"बड":1139,"फी":943,"फा":687,"बज":3798,"फ़":1678,"फो":368,"फ्":1074,"बर":2061,"बल":818,"गया":3134,"भग":1053,"बस":1999,"बह":1601,"बि":1756,"बा":6278,"बु":789,"बी":1868,"मं":2240,"बे":1048,"बै":523,"बो":1031,"भर":525,"ब्":3946,"मक":1369,"भव":561,"भी":5128,"यं":862,"भि":1870,"भा":13836,"मत":879,"मण":522,"भू":2019,"भु":462,"मन":2122,"मध":940,"मद":711,"भौ":389,"भो":416,"मल":693,"यक":2875,"भ्":381,"मय":1214,"मर":1260,"मश":384,"मस":725,"मह":3586,"मृ":555,"यत":1327,"मू":1671,"ख्य":2528,"यद":785,"मि":6854,"मा":15028,"मु":5029,"मी":3565,"रं":1975,"मो":1712,"यम":1479,"म्":5925,"मे":33130,"यन":1546,"यप":564,"मै":980,"रख":1463,"यव":775,"रग":385,"रक":5780,"यय":399,"यर":1126,"या":29338,"रज":1027,"यह":10437,"रच":1272,"रद":2279,"रथ":786,"रत":13386,"यू":1784,"रण":4290,"यु":3494,"यी":833,"यि":596,"रय":1959,"रम":3670,"रभ":636,"यो":8068,"रब":740,"रप":839,"ये":5448,"रन":4212,"लम":734,"लय":1888,"लब":457,"लन":1624,"लत":1279,"लग":2035,"लक":1497,"र्":27657,"रो":5583,"रै":502,"रॉ":358,"रे":18745,"गां":472,"रू":3986,"लं":804,"री":10641,"रु":2131,"रि":9664,"रा":30478,"रह":3970,"रस":3428,"रश":605,"रव":1845,"रल":964,"वं":3802,"ल्":5184,"लो":3980,"चन ":356,"लै":601,"ले":6664,"लु":426,"ली":6031,"लि":10617,"ला":9912,"लव":4226,"शब":1373,"वो":877,"वै":1045,"शन":10719,"वे":7461,"व्":3139,"शर":623,"वह":1716,"वव":621,"वश":922,"वस":1629,"वृ":595,"शत":728,"वा":18296,"वि":15882,"वी":3437,"वप":710,"वन":1882,"वध":1955,"शक":894,"वल":726,"वर":5872,"वय":393,"वज":391,"वक":455,"वत":1673,"षे":3151,"सन":2988,"सप":658,"सभ":1408,"सब":1580,"सम":9143,"ष्":6623,"सर":4398,"सल":818,"सव":526,"षा":3622,"सट":463,"षि":1967,"सत":766,"सद":867,"शै":363,"शे":1186,"श्":6992,"शो":892,"षय":360,"सक":10544,"शह":1374,"शी":1148,"सं":12512,"शु":1050,"शा":4600,"शि":3780,"गवा":390,"षण":1876,"है":50827,"हे":2158,"हु":7130,"ही":5060,"हि":5996,"हा":10508,"ह्":694,"हो":8889,"हन":827,"से":22788,"सु":1954,"सी":5308,"हत":2956,"सू":1760,"चल 
":435,"सि":5630,"सा":11391,"सह":931,"सस":498,"हव":548,"हस":476,"हल":1812,"हम":833,"स्":33294,"हर":3284,"सै":435,"सो":953,"़त":356,"ात":13410,"ाथ":2015,"ाण":2291,"ाठ":433,"िं":3492,"ाड":1607,"ाट":1520,"ाब":2717,"ाभ":451,"ाप":4193,"ाफ":948,"ान":23672,"ाद":7103,"गुर":518,"ाध":1672,"ाव":4478,"िख":1296,"िक":17099,"ाल":13682,"ार":38751,"ाय":6583,"िए":2749,"ाम":8252,"िज":2534,"ाह":4362,"िच":656,"ास":7258,"ाष":4485,"िग":560,"ाश":1899,"़ी":1936,"ां":6856,"़ा":2260,"़ि":839,"ाँ":2505,"़ो":448,"ाइ":2283,"ाई":2655,"़े":807,"ाउ":506,"ाओ":2028,"ाक":3759,"़्":460,"ाए":1076,"ाच":1779,"ाज":7904,"ाग":3365,"ाख":925,"ीड":730,"ुं":2936,"ा।":4894,"ुई":845,"ीत":2135,"ुआ":2021,"ीप":1050,"ीन":3683,"ुए":1050,"ीम":1042,"ीय":7229,"ीब":439,"ील":1370,"ुओ":472,"ीर":2043,"ुग":719,"ुख":2469,"ीव":1580,"गीत":642,"ुक":2798,"ुज":614,"ुछ":884,"ीस":1060,"ुट":385,"िट":1437,"ीं":3186,"गिक":498,"िण":1209,"गाल":415,"ित":17874,"िद":3772,"िध":2081,"िन":9694,"िप":1804,"िब":505,"िभ":1498,"िम":3142,"िर":6200,"िय":18344,"िल":8308,"ीक":2676,"िश":4603,"िव":3402,"िस":8212,"िष":1899,"ीच":800,"िह":1477,"ीज":592,"ीट":1012,"ेव":2738,"ेश":12536,"ैक":523,"ेल":9698,"ेय":725,"ेर":2606,"ेम":795,"ेब":355,"ेप":664,"ेन":6508,"ैद":560,"ैत":376,"ैज":408,"ेह":634,"ेस":4417,"ेष":1173,"ैर":490,"ैल":1189,"ैन":999,"े।":1956,"ॉन":366,"ैस":1223,"ें":31320,"ेक":2519,"ेख":1744,"गों":941,"ेट":1393,"ेड":787,"ैं":11503,"ेत":4873,"ेद":951,"ेग":371,"ेज":1606,"ृत":2832,"ृष":1881,"गोल":437,"ुत":2428,"ुण":594,"ुड":541,"ुन":2123,"ुध":368,"ुद":1873,"ुप":1093,"ुर":6463,"ुम":1318,"ग्ल":427,"ुल":2104,"ुष":645,"ुस":2362,"ुव":1118,"ग्र":3597,"ूच":523,"ूट":2497,"ूत":791,"ी।":1527,"ून":1187,"ूप":2981,"ूब":457,"ूम":650,"ूर":5314,"ूल":1317,"ूष":1076,"ूस":930,"ूह":483,"्व":16523,"्श":2596,"्ष":9849,"्स":6797,"्ह":2245,"्भ":823,"्म":10658,"्य":26066,"्र":57172,"्ल":4149,"ची ":462,"ौत":437,"ौद":469,"ोर":2026,"ोल":2394,"ोब":410,"ोम":1176,"ोस":741,"ोष":453,"ोह":815,"ोश":547,"ोव":518,"्ण":2398,"्त":17955,"्ड":2149,"्ट":15553,"्ठ":1556,"्ञ":2460,"्फ":981,"्ब":2118,"्प":6879,"्ध":4156,"्न":3593,"्थ":9797,"्द":7520,"ौर":1279,"्ज":1507,"्च":2621,"्ग":2050,"्क":4547,"ों":15790,"ै।":21739,"ोई":504,"ॉर":357,"ॉल":485,"ोज":1283,"ोड":4892,"ोट":1429,"ोद":429,"ोत":5502,"ोप":971,"ोध":664,"ोन":2390,"ोक":1863,"ोच":771,"ोग":4303,"००":2062,"१०":362,"१८":536,"१९":2833,"२०":1612,"९६":475,"९५":380,"९८":427,"९७":465,"९९":438,"जन ":1003,"जब ":495,"चती":1825,"चना":962,"गई ":460,"के ":35060,"का ":19876,"कि ":2326,"की ":19870,"कल ":368,"कम ":375,"कर ":3798,"और ":11340,"कित":562,"किन":799,"काफ":651,"काम":601,"कार":8097,"काल":1674,"काश":1174,"काव":390,"कास":610,"किप":365,"किय":4402,"किल":674,"किस":2621,"कीय":409,"कुछ":880,"कुल":589,"कुम":405,"कां":514,"कान":492,"कहत":1259,"कहल":373,"कहा":1242,"कला":836,"गी ":381,"कवि":783,"कम्":429,"कर्":878,"गा ":602,"करा":372,"करे":537,"करत":2060,"करण":990,"करन":3111,"कथा":387,"कता":1005,"कते":720,"गर ":1575,"खंड":407,"गत ":745,"खा ":869,"कंप":421,"को ":9463,"ओं ":2427,"एवं":2948,"एक्":3566,"कई ":776,"ओर ":406,"उसक":842,"उसे":365,"उपन":475,"उपय":550,"एँ ":417,"एक ":11574,"आदि":993,"आधा":542,"आर्":522,"आवश":646,"उन ":374,"इति":479,"अथव":383,"अधि":1401,"अनु":2084,"अध्":641,"अन्":1665,"अने":519,"अपन":2305,"इस ":3686,"अभि":793,"अमे":560,"अलग":451,"अर्":1665,"अवध":1855,"उन्":1250,"उद्":651,"उनक":1565,"उत्":2559,"उस ":606,"इनक":960,"इन्":786,"इसक":6193,"इसम":1168,"इसी":419,"इसे":1083,"इस्":411,"ेश्":531,"ेशो":391," ओर":436," कं":518," कई":773," और":11423," कन":365," कम":1021," कर":8774," कल":965," कव":757," कह":3125," की":14772," कि":9638," का":20292," कृ":765," कु":2268," के":32645," 
कै":557," गई":551," को":14690,"ेशन":8153," गए":382," क्":4800,"ेवा":915," एक":15549," एव":3020," ऐस":714,"ै। ":17713,"ोई ":492," चौ":410," जग":453," चु":690," ची":475," जं":1512," चि":1251," चा":1105," चे":563," जर":355," जल":669," जब":682," छो":700," जह":416," छू":1830," जन":2746," ज्":953," जि":5438," जा":9396," जु":639," जी":1466," जो":3609," जै":1100," गर":416," गय":3607," खे":428," गण":526," घं":1793," खा":713," गो":922," ग्":1394," गु":1570," गा":1413," । ":3304," चल":837," चर":408," अं":2469," अब":411," अप":3085," अन":4720," अध":2196," अथ":432," आक":690,"ेता":622," अल":1382," अर":2119," अम":1263," अभ":1046," आज":524," अस":670," अव":2716," आत":825,"ेत्":3123," इं":1185," अक":778," अग":573," आं":486," अत":780,"ेपा":393,"ेन्":1060,"ेना":477,"ेयर":374," इस":13352," इल":428," इन":2514,"ों ":14584," इत":729," आस":419," आर":1183," आम":378," आय":881," आव":1052," आन":491," आद":1214," आध":1052," आप":1294,"ेलव":3726," उस":2203," उप":2582," उन":3214," उद":990," उत":2673,"ेरि":755," वा":3866," वी":554," वि":11396," वृ":367," शत":617," वस":497," वह":1459," व्":2079," शर":525," वे":1801," वै":880," शब":1343," शु":774," सं":11345," शि":1818," शा":2156," शह":1270," सक":1827," श्":1605,"ैसे":672," शक":380," वर":2508," लग":1362," ला":1268," लि":5639," ले":1874," लो":1934," रख":1043," रज":470," या":6370," यह":10383," रच":735," यू":702," यु":969," यो":670," ये":2040," रा":8448," रि":478," रह":2139," रे":6398," रू":2374," रु":449," लं":500," रो":897,"ोर ":430," हो":7690," हि":3553," ही":2071," हा":1189," हु":3748," हे":505," है":50701," सम":6692," सभ":777," सब":1516," सन":1812," सर":2776," सट":433," सद":698," सत":571," हम":409," स्":16029," हर":704," सो":571," सा":6452," सि":2848," सह":892," से":17371," सी":1037," सु":1695," सू":1297," दर":820," थे":2324," दू":1295," दृ":400," दी":559," दु":776," दा":748," दि":4337," दक":983," त्":639," तो":1173," था":4674," थी":1702," तम":403," तर":977," तथ":2624," तत":379," ता":833," ति":580,"ोग ":2225," ती":840," तु":355," तक":1670," डा":569," डि":508," ट्":3950,"ोच ":425," मो":707," मे":29038," मै":859," मू":889," यद":611," मा":5157," मि":3653," मी":694," रं":422," मु":2959," मह":3455," मर":397," भौ":360," मन":1109," मध":857," भू":1742," भी":4132," भा":10893," भर":368," ब्":1174," बे":646," बै":464," बो":710," बा":3995," बि":1259," बी":1196," मं":1754," बु":605," भग":427," बह":1547," बस":477," फ्":403," बर":576," बल":422," बद":685," बन":2421," फे":481," फा":416," बज":3787," फ़":822," फि":728," बड":1101," प्":19145," पो":399," पि":711," पा":4190," पु":2921," पी":496," बं":729," पे":601," पै":416," पृ":1225," पू":2103,"ोड ":4034," पर":10662," पश":1088," पह":3539," पड":455," पट":420," पद":1827," पत":1117," न्":487," नह":1717," नि":5215," ना":4741," पं":1035," नी":536," ने":3413," ध्":387," नव":592," धा":800," नद":808," दे":3244," द्":5503," दो":1318," धर":1010," नग":840,"ेंट":533,"ेंद":417," ई ":361,"ेल ":4266,"ेर ":787,"ेस ":3757,"ेष ":525,"ेश ":2529,"े। ":1586,"ेकि":361,"ेक्":824,"ैंड":368,"ैं।":6253,"ेज़":466,"ेजी":411," व ":823,"ृष्":1735,"ें ":29625,"ृति":1045,"ृत्":636,"ेन ":4101,"ेद ":371,"ेज ":388,"ेट ":613,"ैं ":4429,"ेक ":779,"ुष्":405,"ुवा":512,"ुला":407,"ुरा":1009,"ुरस":482,"ुर्":818,"ुरु":791,"ुरू":359,"ुमा":600,"ुनि":743,"ुना":530,"ुद्":1350,"ुत्":627,"ूसर":611,"ूषण":1023,"ूर्":2859,"ूरी":482,"ूटत":1807,"ुस्":700,"ुसा":1149,"िकल":459,"ाला":1369,"ालि":2288,"ाली":1852,"ाले":1192,"ावि":362,"ावा":595,"ुत 
":1429,"िका":3179,"िकि":844,"िकी":876,"ावर":399,"िको":367,"िक्":1744,"ाषा":2056,"ासन":527,"ाष्":2240,"ाशि":573,"ासक":514,"ाहि":1577,"ासि":587,"ासा":469,"ासी":381,"ाहर":429,"ास्":1433,"िज्":1407,"ुर ":2020,"िता":1136,"िति":559,"ुल ":595,"ित्":3274,"िद्":3069,"िधि":924,"िधा":871,"िनट":1785,"िना":538,"िनि":795,"िन्":3490,"िपी":396,"िभि":546,"िभा":779,"ियन":355,"ियम":677,"िमी":551,"िमा":800,"ियो":3134,"िये":2350,"िया":10338,"िर्":2740,"िरा":423,"िलत":401,"िले":1007,"िल्":2138,"िलो":516,"िला":1775,"िसम":787,"िष्":1088,"ी। ":1157,"िशा":959,"िश्":1982,"िशे":677,"िसक":1052,"िवा":1432,"िवे":381,"ीका":588,"िवर":417,"ूप ":2670,"िहा":1270,"ून ":537,"िसे":557,"िसी":1508,"िस्":2756,"ीडि":468,"ुंब":372,"ुंच":1928,"ीटर":673,"ूल ":705,"ीति":750,"ूर ":759,"ुआ।":435,"ूह ":393,"ुओं":451,"ीर्":408,"ीमा":484,"ुक्":1845,"ीवन":532,"ुख्":1068,"ृत ":890,"ीं ":2774,"ित ":12024,"िण ":703,"िन ":1347,"िल ":1061,"ीक ":803,"ांग":686,"ांस":610,"िम ":1068,"ांत":1630,"िय ":463,"िर ":1319,"ीच ":562,"ाएँ":397,"िश ":406,"िस ":650,"ा। ":4040,"ुआ ":1392,"ीत ":751,"ागर":849,"ाक्":373,"ाकि":1058,"ाका":641,"ाकर":472,"ाओं":1931,"िंद":1176,"ाड़":911,"िंह":577,"ुए ":894,"िंग":982,"ाटक":402,"ाजा":802,"ाजि":402,"ाज्":2259,"ीप ":552,"ाजन":843,"ाजध":594,"ाजस":382,"ाची":795,"ीन ":2404,"ुई ":687,"ाने":2242,"ाना":1838,"ानि":2065,"ानी":2791,"ुख ":1161,"ानव":590,"ानस":622,"ाध्":419,"ापन":842,"ान्":1734,"ानो":532,"ादी":544,"ादा":388,"ादि":1157,"ानत":633,"ाधि":410,"ानक":524,"ाति":524,"ाता":4840,"ाती":1316,"ाते":918,"ात्":3549,"ील ":655,"ीय ":6964,"ीर ":855,"ारी":1527,"ारि":1016,"ारा":6234,"ारस":394,"ार्":5244,"ारो":922,"ारू":465,"ारे":707,"ालय":1677,"ामि":517,"ामा":1032,"ायक":414,"ाम्":377,"ायन":449,"ामी":544,"ाया":1613,"ायी":366,"ारक":420,"ारण":1386,"ारत":7763,"ाबा":770,"ाब्":555,"ामक":498,"ुछ ":876,"ापा":517,"ाप्":901,"ाफी":651,"ां ":1658,"़ी ":1669,"ाई ":2362,"़े ":628,"हों":935,"है।":21737,"होत":4643,"होन":1130,"ह्म":356,"़ा ":1290,"ाँ ":1565,"ाग ":1189,"ाथ ":1651,"ाद ":3112,"ाण ":964,"ात ":1397,"ान ":8758,"ाज ":1013,"ाश ":423,"ाव ":1068,"िक ":8100,"ाह ":929,"ास ":2767,"ाम ":3709,"िए ":2584,"ाब ":747,"ाल ":3360,"ार ":11273,"ाय ":1113,"समु":515,"समा":1211,"समू":429,"सरक":1452,"समे":1977,"सम्":2440,"सर्":919,"सबस":1305,"समय":1034,"ष्य":501,"ष्ण":694,"ष्ठ":1362,"सभी":533,"सभा":753,"ष्ट":3181,"षेत":2985,"सन्":672,"हत्":826,"हते":1441,"सेन":499,"सें":450,"सीम":423,"सिर":645,"सूर":646,"सूच":437,"सां":361,"साह":1353,"सिद":1352,"सित":526,"साध":457,"सार":1977,"साम":1236,"साय":536,"सिक":706,"साग":441,"सिं":774,"साथ":1234,"सहा":390,"ससे":419," १८":501," १९":2789," २०":1512,"हला":582,"हले":581,"हरा":369,"स्व":2405,"स्य":682,"स्म":485,"स्ल":386,"स्थ":6865,"स्प":4184,"स्ट":9331,"स्त":5132,"स्क":2303,"सेव":720,"हैं":10612,"हें":816,"हुत":957,"हुई":797,"हुए":951,"हुं":1933,"हुआ":1762,"हिन":1798,"हित":1670,"हीं":1893,"हास":1027,"हाव":428,"हार":1703,"हान":874,"हिं":1060,"हाँ":938,"हां":1185,"हे ":650,"है ":18074,"सकी":3365,"सका":2827,"सके":2127,"सत्":357,"षिण":921,"सटी":407,"षित":444,"हो ":1330,"शेष":769,"शों":440,"हा ":1528,"सकत":1521,"ही ":2790,"श्व":2006,"श्र":1917,"श्य":1102,"श्च":1235,"शहर":1243,"से ":20466,"सी ":3799,"हर ":1676,"संच":2031,"शिय":571,"संक":898,"संग":1492,"संख":1034,"शिव":510,"संघ":401,"संब":719,"संप":1355,"संय":444,"संस":2227,"शित":453,"शाह":383,"शास":1407,"शिक":1243,"शाल":480,"वेश":377,"वेद":482,"शता":469,"सर ":563,"सा ":1620,"व्य":2741,"शब्":1328,"वों":401,"वर्":3433,"षा ":2805,"वरी":565,"ववि":516,"शक्":405,"वश्":752,"वस्":1030,"सन 
":1657,"वाद":1110,"वान":862,"वाच":378,"विक":1766,"वाल":2753,"वास":1153,"वार":6241,"वाय":555,"वाम":390,"वित":981,"विद":1842,"विध":1382,"विज":1255,"वाह":694,"विच":389,"वीं":679,"विष":806,"विश":2580,"विस":375,"विव":625,"विभ":1112,"वीप":490,"वता":365,"वधि":1770,"वपू":515,"शा ":783,"षण ":1678,"शी ":479,"वंश":456,"शन ":10109,"वे ":5027,"वि ":440,"वा ":1780,"वी ":1441,"ल्प":398,"ल्म":761,"ल्य":463,"ल्ल":1775,"लोक":1046,"लोग":827,"लों":837,"वह ":1195,"लोम":397,"वल ":387,"लेख":932,"लेक":881,"लिय":3022,"वर ":730,"लाक":373,"लात":527,"लिए":2317,"लाल":465,"लिख":696,"लिक":529,"लित":2228,"लिप":379,"वन ":938,"लवे":3731,"लना":388,"लता":462,"लती":425,"वं ":2957,"लगभ":597,"लगा":459,"रेस":3665,"रोग":432,"रों":2025,"र्श":1019,"र्व":3266,"र्स":388,"र्ष":1579,"र्म":3443,"र्भ":362,"र्य":2627,"र्थ":2552,"र्द":1098,"र्ध":360,"र्न":513,"र्फ":531,"र्ब":360,"र्ट":763,"र्ड":507,"र्ण":1686,"र्त":1633,"र्ग":1726,"र्क":924,"र्ज":853,"र्च":479,"रीक":801,"रिव":843,"रिय":2208,"रीय":1696,"रीर":445,"रुप":452,"रूप":2871,"रें":640,"रेज":901,"रेल":6023,"रेन":3698,"रसा":555,"रसि":1094,"रहत":499,"रहा":649,"रहे":489,"रस्":963,"ले ":3374,"रां":1067,"रान":1776,"राप":737,"रात":590,"राण":691,"राज":5799,"राच":861,"राक":432,"रिट":356,"रित":976,"राष":2127,"रास":461,"राम":1378,"राय":1077,"रार":802,"रिक":2624,"राव":460,"ला ":4369,"रयो":985,"रयु":635,"रम्":388,"रमा":570,"रमु":1096,"रवा":666,"ली ":4841,"रने":2577,"रना":959,"रदे":1187,"रदा":576,"रभा":427,"योज":542,"योग":2669,"यों":3685,"या।":1162,"युक":1333,"युत":419,"युद":407,"यान":790,"याप":561,"यात":2560,"याद":888,"यास":655,"यिक":384,"याल":1423,"यार":471,"याय":535,"रता":1516,"रति":1618,"रती":4166,"रते":811,"रत्":447,"रथम":512,"लय ":1619,"यूट":437,"यून":379,"रणा":516,"रचन":643,"रक्":994,"रखे":433,"रजि":411,"यां":725,"याँ":525,"यहा":1528,"लन ":775,"रे ":1761,"मस्":459,"महा":2097,"महत":756,"यक्":1167,"रु ":369,"यका":377,"री ":6215,"मृत":432,"मूह":442,"मूल":727,"मुद":734,"मुख":2339,"रंभ":445,"मुं":391,"मीट":638,"रंग":680,"मिल":1800,"मित":732,"मिन":2223,"मार":2023,"माल":612,"मिक":732,"रो ":378,"माण":1011,"माध":371,"मात":697,"मान":5551,"माज":766,"मां":436,"मों":431,"लग ":468,"यदि":507,"में":27306,"मेर":842,"मेल":1950,"यता":597,"रका":3522,"म्र":418,"म्प":956,"म्ब":1357,"म्म":1860,"रत ":4510,"रण ":3158,"या ":17033,"यु ":436,"यी ":716,"भिन":962,"भाव":792,"भाष":2240,"भार":7799,"भाग":1261,"रम ":841,"यंत":450,"ये ":4867,"मध्":813,"भूम":411,"भूष":1010,"रल ":672,"रा ":9045,"मर्":364,"रह ":1087,"मा ":1358,"भगव":386,"मी ":1758,"मे ":1785,"यन ":960,"बहु":1203,"बसे":1366,"बाल":449,"बार":924,"बाद":1868,"बां":380,"यर ":634,"मंत":477,"मंद":933,"बिह":404,"बीच":558,"मंड":367,"यम ":869,"रक ":417,"यह ":8676,"बोल":432,"ब्र":1124,"ब्द":1896,"प्य":460,"प्र":23891,"प्त":1355,"भा ":824,"भी ":4811,"मन ":544,"फ़ि":394,"बजे":3647,"फिल":452,"मय ":1094,"बड़":987,"बदल":617,"यक ":1025,"फेर":414,"बना":1428,"बन्":526,"फ्र":464,"पहु":2050,"पहल":1024,"पश्":999,"बी ":558,"बा ":412,"पर्":1203,"परम":473,"पयो":528,"परि":1837,"परा":615,"मक ":839,"पृष":917,"पुस":378,"पूर":2834,"पुत":428,"पुर":3647,"पीड":441,"बंग":529,"बंध":817,"पास":574,"पित":787,"पाक":1033,"पान":519,"पात":387,"पाद":1538,"पार":1007,"पाल":950,"पक्":435,"फी ":766,"न्त":2330,"न्ध":640,"न्न":1957,"न्द":3701,"न्य":2792,"न्म":1190,"न्ह":2015,"नों":1406,"पद्":1215,"पदा":458,"भग ":605,"पनी":874,"पना":1055,"पन्":743,"पने":1546,"बर ":1142,"पत्":1249,"पति":612,"पड़":437,"नमे":432,"नवर":402,"नसं":440,"नदी":699,"ध्य":2164,"नुस":1344,"पंज":567,"नीत":724,"नेप":393,"नेत":472,"नेक":552,"बई 
":387,"नही":1669,"नसभ":406,"निव":567,"निर":2352,"निय":1801,"निध":617,"नित":1221,"नाव":392,"निक":2637,"नाय":738,"नाम":2777,"नार":680,"नान":715,"नात":423,"नाथ":458,"नाट":450,"नाड":359,"नाग":557,"द्द":569,"द्व":5706,"द्र":2473,"द्य":2403,"द्म":1046,"द्ध":2842,"धर्":969,"नका":1135,"नकी":794,"नके":1054,"देख":471,"देव":1125,"देश":3481,"दों":393,"दोन":372,"धित":713,"धिय":418,"धार":1754,"धिक":1763,"धान":1811,"पर ":7035,"नता":441,"नते":555,"नगर":1205},"n_words":[3436892,4107546,2722787],"name":"hi"}
\ No newline at end of file
+++ /dev/null
-{"freq":{"D":8690,"E":5169,"F":5109,"G":8706,"A":14720,"B":12556,"C":10193,"L":6822,"M":14153,"N":10511,"O":10729,"H":9113,"I":13451,"J":5359,"K":11989,"U":4743,"T":10711,"W":3179,"V":5850,"Q":585,"P":15109,"S":23054,"R":9319,"Y":1450,"X":701,"Z":4251,"f":16193,"g":88687,"d":148916,"e":436858,"b":60942,"c":64868,"a":586818,"n":342270,"o":415815,"l":177559,"m":139424,"j":258985,"k":204303,"h":49394,"i":501698,"w":7347,"v":140168,"u":209613,"t":201984,"s":238514,"r":271701,"q":1755,"p":126499,"z":85944,"y":9646,"x":2022,"í":886,"é":1412,"á":1674,"ó":546,"đ":8171,"ć":16773,"Č":895,"č":46294,"Ž":887,"ž":23555,"Š":1432,"š":26211," l":9617," m":20259," n":49295," o":45630," h":4791," i":61249," j":56434," k":46966," d":31478," e":6507," f":5286," g":21986,"р":912," a":19142,"с":742," b":17477,"т":629," c":5752," z":19710," u":45274," t":21468," w":820," v":15539," p":67380," s":79773," r":20001," J":5152," K":11465," H":8434," I":11848," N":9837," O":7822," L":5986," M":13514," B":11861," C":9435," A":13018," F":4631," G":8304," D":7573," E":4598,"л":648," Z":4104,"к":817," Y":1407," X":526,"и":1209,"о":1253,"н":943," S":19621," R":8667,"в":563," Q":537," P":14387,"а":1816," W":3048," V":5326," U":4477,"е":917," T":9972," č":5596," Č":887," ž":5028," Ž":838," Š":1403," š":4901,"A ":857,"Da":1321,"Co":1834,"Cr":1328,"Ce":588,"Ch":1843,"Du":764,"Do":1328,"Dr":1133,"De":1138,"Di":1002,"Fe":631,"H ":574,"Fa":563,"Eu":879,"Ge":716,"Ga":972,"I ":821,"Fr":1113,"Fo":779,"Fi":649,"Au":808,"Ar":1684,"At":729,"As":560,"D ":829,"Ba":2710,"Af":545,"Am":1341,"An":1467,"Al":1655,"Bu":1011,"Br":2042,"Ca":2160,"Bi":1321,"Be":1685,"Bo":2115,"Ku":844,"Kr":1748,"Ko":2488,"Le":881,"Li":1283,"La":1685,"Lu":593,"Lo":1036,"Me":1818,"Mi":1887,"O ":2633,"Ma":5853,"Mu":793,"Mo":1972,"Nj":820,"Ni":1096,"Ne":1404,"Na":3401,"No":2003,"Ob":956,"Gl":526,"Gr":2253,"Go":1536,"Gu":881,"Gv":612,"Ha":1340,"He":1374,"Hi":570,"Ho":905,"IS":2154,"Hr":2959,"Hu":536,"In":4619,"Is":978,"Ja":1450,"L ":681,"Iz":610,"Je":1026,"Jo":711,"Ju":1563,"Ka":3681,"Ki":952,"Tu":1072,"Tr":1314,"To":1609,"Th":1212,"Ti":993,"Te":1282,"Ta":1685,"St":1916,"Sv":940,"Su":1303,"Wi":722,"Wa":1107,"Vo":662,"Vi":1040,"Va":1022,"Ve":1522,"Pu":746,"Pr":3018,"S ":720,"Pe":1604,"Pa":2898,"Pl":816,"Po":3639,"Pi":867,"Os":790,"Ov":672,"Op":1011,"Or":735,"Se":1283,"Sj":776,"Si":1385,"Sh":659,"Sl":1203,"Sr":1393,"Sp":846,"So":1277,"Ru":1026,"U ":1401,"Sa":3386,"Re":2977,"Ri":1491,"Ro":1097,"SO":2074,"Ra":1418,"b ":1522,"a ":202747,"Za":2365,"Ze":593,"i ":135497,"ađ":2045,"gd":857,"ge":5429,"ga":10157,"ać":1902,"fi":4106,"fs":661,"fr":2013,"ač":10342,"fu":639,"fo":2404,"j ":16449,"he":4881,"ha":5859,"gn":997,"gl":6895,"gi":6788,"gh":714,"gu":4729,"gr":14473,"go":15835,"du":5516,"dv":2741,"g ":19299,"ea":3988,"eb":4045,"ec":3773,"ed":22606,"de":10224,"dg":624,"di":30316,"dj":3005,"dm":1005,"dl":806,"do":12624,"dn":15638,"ds":4759,"dr":13255,"ew":844,"eu":1755,"ev":10791,"ey":838,"ez":15435,"fa":1476,"h ":20168,"fe":2011,"eh":1279,"eg":7983,"ef":1181,"ee":1826,"el":24518,"ek":14461,"ej":2546,"ei":1534,"ep":5699,"eo":3348,"en":44674,"em":19226,"et":18394,"es":16599,"er":32266,"ca":14154,"e ":160018,"br":6048,"bu":5481,"bn":1412,"bo":6809,"bj":1695,"bl":6540,"bi":13442,"be":6031,"db":626,"da":22107,"f ":1430,"cu":2623,"ct":683,"cr":1908,"co":2679,"ck":2130,"ci":21232,"ch":4164,"ce":10558,"c ":2983,"az":15697,"ay":1948,"ba":10149,"d 
":22851,"at":29447,"as":22343,"ar":30709,"aw":1498,"av":28227,"au":3869,"ak":15591,"al":35013,"ai":3078,"aj":16895,"ao":6448,"ap":10100,"am":17849,"an":70440,"ac":13980,"ad":21936,"aa":799,"ab":3879,"ag":6276,"ah":2814,"ae":1668,"af":2077,"nu":8144,"nt":10450,"ns":14593,"ič":14290,"no":45797,"nn":1399,"nz":952,"ny":715,"oe":678,"of":2150,"oc":4342,"od":40790,"oa":2451,"ob":12953,"om":28810,"on":23346,"ok":14795,"ol":17826,"oi":3961,"oj":40426,"og":22833,"oh":1159,"ot":12114,"os":23349,"ov":31034,"ou":2922,"op":11191,"oo":1568,"or":33494,"jč":526,"r ":11028,"ow":1054,"oz":7848,"pe":6182,"pa":15403,"pl":8816,"pn":1287,"po":36246,"ph":606,"pi":11043,"pj":1476,"lo":18651,"ln":5989,"lm":1222,"ll":3111,"ls":2651,"lu":8623,"lt":2358,"o ":61519,"ma":31717,"mb":2862,"dž":894,"me":24452,"ml":1963,"eš":3371,"mi":10591,"mj":5104,"mn":1665,"mp":2261,"mo":11753,"mr":1000,"ms":2000,"mu":3945,"p ":2149,"na":87240,"nc":5972,"nd":11672,"ne":33576,"nf":891,"ež":1706,"ng":7899,"ni":54450,"nj":27833,"nk":2801,"ić":2802,"jv":1605,"eđ":3970,"ju":19590,"eč":2671,"js":8691,"jn":2724,"jo":4699,"eć":4631,"jk":855,"ki":28341,"ke":16969,"kc":1026,"ka":37663,"m ":37899,"ks":2734,"kt":4893,"ku":14523,"kv":1615,"ko":63668,"kr":10108,"kl":4333,"km":1244,"kn":1083,"li":41243,"lk":741,"lj":24378,"le":17764,"ld":889,"lg":643,"la":37196,"lb":2414,"n ":24345,"hr":2448,"hv":1089,"ht":807,"hu":1937,"hi":4652,"hn":862,"ho":4536,"id":4699,"ic":20781,"ib":2814,"ia":4852,"ih":18991,"ig":5433,"if":1923,"ie":2060,"k ":14010,"ir":12272,"is":21664,"it":16417,"iu":848,"iv":14019,"aš":3959,"ij":57037,"ik":23850,"il":22797,"im":27723,"in":47299,"io":10059,"ip":4978,"je":134763,"až":1877,"ji":26676,"iz":19026,"l ":6966,"ja":39549,"z ":7997,"ož":2534,"oš":2893,"wi":861,"rč":1709,"y ":3377,"wa":2878,"we":956,"vl":4881,"vj":3877,"vk":562,"vi":24026,"vu":3419,"vr":6622,"vs":1342,"vn":9959,"vo":21455,"uz":3112,"uv":1349,"ve":22017,"vc":593,"va":35406,"x ":785,"ui":1893,"uj":4079,"uk":4559,"ul":5933,"pć":2758,"ue":1973,"ug":8425,"uh":1544,"ur":9708,"us":10059,"ut":7404,"um":7672,"un":8708,"uo":923,"up":11863,"tu":9961,"tt":1393,"tv":7233,"ođ":1449,"ub":5211,"ua":3882,"ud":5733,"uc":1692,"w ":858,"to":28843,"tn":5773,"tm":575,"tl":1860,"oć":686,"ts":8790,"tr":16932,"oč":4671,"tp":896,"te":22658,"tk":2453,"tj":1700,"ti":34096,"th":3097,"v ":4865,"ta":38983,"su":13292,"sv":6451,"ss":1881,"st":55588,"sl":7897,"sk":52271,"sn":7159,"sm":3278,"sp":5046,"so":5247,"sr":3449,"sc":1522,"se":24429,"sh":2381,"sj":4697,"si":10381,"rz":1007,"u ":87089,"sa":16094,"rr":1162,"rs":8936,"rt":5687,"ru":17434,"rv":8435,"ry":930,"rp":1334,"ro":34834,"rn":10194,"rm":2849,"rl":1655,"rk":2924,"rj":894,"ri":44000,"rh":1312,"rg":3269,"iž":1025,"re":32485,"rd":3070,"rc":1885,"rb":2322,"ra":64303,"t ":14252,"qu":1390,"iš":6125,"s ":15910,"pt":1447,"pu":6628,"pp":658,"pr":27731,"ps":2712,"už":5556,"uš":2233,"rž":4670,"zg":1072,"rš":1722,"zi":22879,"zb":2089,"zd":2018,"ze":4698,"za":19270,"zv":4353,"uč":5083,"zr":2246,"zu":2322,"zo":3146,"zn":8371,"zm":2468,"zl":2200,"uć":2114,"ye":523,"ya":1738,"yo":520,"á ":773,"é ":601,"ć ":1732,"ći":5749,"ću":1057,"ća":4248,"će":3354,"či":7487,"čj":2822,"čk":13325,"čl":554,"če":5735,"ča":5202,"č ":1473,"đe":2942,"đa":1580,"čn":7182,"ču":1818,"đu":2924,"š ":1063,"šć":1355,"še":2480,"ša":1860,"šp":635,"šn":3132,"šk":2964,"šl":591,"ši":3525,"šu":567,"št":7272,"žu":2110,"žn":3428,"žb":852,"že":4351,"ža":6201,"ži":5092,"đen":1909,"đer":597,"đu ":1688,"đun":708,"čun":831," Ga":961," Ge":702," Fo":742," Fr":1109," Fi":643," Ha":1336," He":1372," 
Go":1534," Gr":2247," Gu":878," Gv":612," Gl":525," Hu":528," IS":2110," Hr":2946," Ho":895," Hi":565," Je":1023," Ja":1447," Iz":607," Is":977," In":4605," Ka":3666," Ki":914," Jo":707," Ju":1556," La":1670," Le":867," Li":1265," Ko":2481," Kr":1745," Ku":843," Ma":5817," Mi":1882," Me":1804," Lo":1032," Lu":587," Ne":1391,"а ":562," Na":3399," Nj":820," Ni":1084," Mo":1968," Mu":787," Am":1333," An":1450," Al":1637," Af":533," Ba":2689," Au":794," At":724," As":558," Ar":1651," Be":1676," Bi":1296," Bo":2110," Br":2033," Bu":1004," Ca":2140," Ce":585," Ch":1828," Cr":1326," Co":1819," Da":1296," Di":998," De":1131," Dr":1125," Do":1300," Du":759," Eu":878," Fe":623," Fa":554," Wi":714," Wa":1098," Ze":593," Za":2362," a ":4274," Ov":665," Os":789," Or":732," Op":1008," Po":3623," Pl":812," Pi":864," Pe":1599," Pa":2881," No":2002," Ob":956," Ra":1412," Ro":1089," Re":2970," Ri":1491," Pr":3007," Pu":743," Sv":937," Su":1301," St":1875," Ta":1677," Th":1189," Ti":990," Te":1277," Tr":1295," To":1604," Ru":1022," Sa":3371," U ":1282," Sh":655," Si":1373," Sj":776," Se":1273," So":1270," Sp":836," Sr":1393," Sl":1201," Va":1018," Ve":1517," Vi":1035," Vo":661," Tu":1066," ja":1334," iz":11704," je":51007," im":3911," in":3359," il":7148," is":4046," ka":7800," ki":1128," jo":810," ju":3163," ha":604," he":608," gl":3276," gr":8161," go":6544," ih":786," ig":1120," hi":648," ho":696," hr":1744," nj":3220," ni":1874," ne":6451," na":34503," mu":1067," mo":4787," mn":528," ok":4011," on":1068," od":15714," of":856," ob":6528," no":2407," le":827," lj":2291," li":2543," la":2172," ku":1972," kn":829," km":1048," kl":1410," kr":4664," ko":26854," me":3676," mi":2188," mj":1905," o ":1165," ma":4891," lo":1177," am":2645," an":1498," ak":848," al":3416," au":1442," ar":1418," at":615," ba":2490," bi":5965," be":1110," bo":2421," bl":1053," bu":691," br":3222," ca":552," et":559," en":2154," el":1166," ek":584," fa":621," fr":941," fo":883," fi":1669," ge":1007," gd":551," ga":1134," i ":27993," cr":1408," ce":1531," ci":1052," da":4574," do":6878," dr":6375," de":2927," dj":1233," di":5389," dv":2035," du":1555," zn":2259," zr":767," zv":702," za":12881," ze":1108," zb":633," už":620," ru":1327," u ":32404," sa":9931," se":16886," sj":3429," si":2544," sn":698," sm":1890," sl":3915," sk":4532," sr":2957," sp":2436," so":944," ra":7698," re":4140," ri":4141," ro":2280," pu":1716," pr":23703," s ":3500," os":3950," ot":2810," ov":1276," op":3149," or":1765," oz":1224," pe":1664," pa":2832," pl":5604," po":28804," pi":966," pj":1218," va":1250," ve":3149," uz":1739," vo":2912," vr":3315," vi":2766," vj":724," vl":1147," ud":648," tv":902," tu":1007," us":1218," ut":721," ur":562," up":1243," um":849," un":999," uk":1055," ul":803," ug":653," ta":2808," st":9162," sv":5803," su":10580," tr":4273," to":2817," th":929," ti":2177," te":5426," če":1938," čl":530," či":1960," št":1607," ši":1148," šp":620," ži":2289," že":880," žu":1586,"Eur":771,"Fra":831,"Her":776,"Gra":1106,"Gor":695,"Ind":3698,"Hrv":2864,"ISO":2034,"Ara":578,"šće":547,"Ame":663,"Car":617,"Bra":541,"Bri":609,"Bos":708,"Chi":621,"Cha":594,"Nal":560,"Nje":555,"Nov":1094,"Opć":766,"Per":568,"Par":726,"Pro":696,"Pri":721,"Pre":828,"Juž":532,"Kal":564,"Kan":524,"Kar":752,"Kra":872,"Man":821,"Mal":573,"Mar":1189,"Mad":576,"Zag":834,"Sta":754,"Sje":767,"Srb":562,"Sre":533,"Slo":726,"Rus":582,"San":791,"Rep":1934,"SO ":2049,"Vel":771,"The":928,"šen":553,"še 
":976,"što":1559,"šte":2571,"šti":879,"šta":884,"štv":1041,"ško":1181,"šnj":2421,"šin":692,"šir":1400,"ški":826,"bje":654,"bja":1027,"bit":2079,"biv":585,"bio":1738,"bil":3046,"bin":696,"bij":1580,"blj":1003,"bli":4296,"bla":688,"bol":1257,"boj":638,"bog":903,"bič":654,"bno":610,"bor":1232,"be ":1311,"ban":1537,"bal":1792,"bav":860,"bar":843,"bi ":1256,"ber":918,"ben":1811,"ca ":10450,"car":656,"can":1075,"ce ":5505,"bri":882,"bro":2066,"bra":1828,"bu ":732,"bum":1939,"buh":796,"aka":2848,"am ":2209,"ake":1450,"aki":683,"aji":1629,"ajn":1118,"ajs":690,"aju":3559,"ajv":1325,"al ":2419,"aja":1758,"aje":2556,"ain":623,"ak ":2449,"aj ":1426,"aha":673,"agr":1324,"agu":525,"ago":1080,"anu":1384,"ano":4034,"ant":2834,"ans":7065,"ane":2342,"ang":1485,"ani":9755,"anj":10858,"ank":1213,"ana":11419,"anc":2218,"and":3683,"amo":1853,"ami":1538,"ame":3781,"amb":801,"ama":5199,"ao ":5760,"alu":766,"alt":538,"als":661,"alo":1706,"aln":4903,"all":806,"ali":7309,"alj":2503,"ale":2883,"ala":6700,"alb":1948,"an ":10347,"aku":618,"akt":1211,"ako":4291,"aba":762,"abi":791,"ae ":719,"aca":3812,"ad ":3345,"ac ":1484,"afs":539,"afi":554,"ai ":709,"aga":937,"age":778,"ado":1169,"adr":1028,"adn":3155,"adi":2550,"ade":1153,"ads":935,"adu":1422,"aci":6004,"ach":727,"ada":5689,"azn":685,"azi":7453,"azl":1063,"azv":1013,"azu":592,"aze":804,"aza":1040,"azb":1010,"aya":527,"ba ":2218,"at ":2601,"arh":564,"are":1147,"ard":1409,"ara":6301,"aro":3575,"arn":1712,"ark":1048,"ari":4189,"aru":579,"ars":3100,"art":1393,"asa":826,"asi":1226,"ash":669,"ase":3354,"asn":1043,"asp":540,"ask":1064,"asl":534,"ar ":2545,"apa":4410,"api":1243,"apo":975,"apr":526,"aps":650,"apu":900,"as ":2262,"ava":8033,"aut":1108,"avo":1516,"avn":4358,"avl":3013,"avi":4083,"ave":3087,"ay ":531,"awa":719,"avu":1466,"av ":1100,"ata":3446,"ast":9058,"atn":1424,"atk":562,"atr":1028,"ato":2245,"ate":2791,"ati":6984,"ats":4995,"atu":1171,"aus":601,"jeg":2764,"jed":10627,"jec":1103,"jer":2418,"jek":4521,"jel":6299,"jem":5086,"jen":9322,"jez":10141,"jes":4061,"jet":3631,"jev":7451,"jač":750,"ji ":16045,"ažn":588,"jat":632,"jav":2585,"jal":1858,"jak":822,"jan":5965,"jam":1147,"je ":61320,"ješ":1621,"jni":1204,"joj":1250,"jom":1082,"jiv":803,"jim":3670,"jin":1845,"jih":2364,"ječ":1963,"jeć":1172,"eća":1346,"eće":940,"eći":1617,"ito":2406,"itu":706,"its":544,"isk":1391,"isl":604,"iso":539,"isn":807,"isp":569,"isu":779,"ist":10340,"iv ":1925,"ita":3250,"ite":2457,"iti":4001,"ivo":1642,"ivn":1941,"iva":3738,"ivi":1447,"ive":1512,"is ":1700,"ion":3221,"ipa":1930,"ir ":678,"iro":1457,"iri":1540,"isi":976,"ish":564,"ise":556,"isc":574,"isa":1555,"ire":1683,"ira":5142,"it ":890,"ja ":23119,"iz ":4867,"izu":877,"izv":2232,"izr":793,"izo":774,"izn":759,"izm":2019,"izl":668,"izi":2042,"izd":806,"iza":2198,"kih":5870,"kim":2628,"kin":753,"km ":900,"ki ":16671,"ked":578,"ke ":14258,"kci":941,"kra":4133,"kre":1063,"ku ":4982,"kro":1000,"kru":1711,"kri":1186,"kov":2238,"kot":547,"kor":2342,"kop":1083,"kon":3505,"kom":6854,"kol":2347,"koj":25320,"kog":6388,"kod":903,"knj":735,"ko ":9564,"kla":1495,"klo":701,"klj":755,"eđu":2748,"još":665,"jve":805,"eđe":812,"juj":863,"jug":1520,"jud":1914,"jsk":7836,"jst":574,"eči":628,"ju ":11452,"eče":573,"kaz":825,"kat":974,"kar":1304,"kas":686,"kan":1885,"kao":3422,"kal":1067,"kam":671,"kak":633,"kad":1513,"kac":594,"juž":1378,"ka ":22352,"juč":696,"juć":551,"ha ":913,"han":1101,"har":926,"he ":1979,"her":717,"hin":716,"hit":527,"go ":855,"gle":1361,"gla":3728,"god":2754,"gom":912,"gon":803,"gos":769,"gor":1668,"gov":5098,"gu 
":1902,"gru":1369,"gra":9421,"gre":1427,"grč":1302,"ian":1493,"ic ":604,"iba":598,"ibe":551,"ia ":1869,"ifi":824,"ih ":17225,"icu":773,"ici":6028,"ich":645,"ice":4697,"ie ":604,"ica":6876,"idi":521,"ide":1150,"ida":944,"ašn":1360,"ašt":576,"il ":644,"ija":16185,"ije":24598,"iji":7426,"ijo":787,"ijs":5406,"iju":2114,"im ":10462,"ika":7961,"ige":790,"iga":952,"igi":547,"igr":1302,"iho":1185,"ik ":6489,"imo":723,"imp":613,"ime":3556,"imi":856,"inc":1602,"ind":1666,"ina":9909,"ino":2039,"int":1177,"ins":3389,"ine":10127,"ing":2574,"inj":1865,"ini":6764,"inu":1508,"iko":2904,"iki":780,"ike":2426,"ila":3734,"in ":2960,"iku":1705,"ilo":2029,"ill":863,"ilm":540,"ilj":1386,"ili":10385,"ile":865,"ima":9207,"io ":4778,"ils":642,"ilu":562,"hov":1213,"hrv":1503,"hva":1061,"fer":566,"ez ":610,"ezu":527,"eza":1009,"ezn":818,"eze":1246,"ezi":9990,"eta":3704,"ete":898,"eti":2144,"etn":2390,"etl":530,"etk":539,"esn":1243,"est":6673,"eto":1100,"etr":912,"ets":1480,"etu":996,"etv":760,"eve":4353,"eva":1803,"evo":961,"evn":623,"evi":1701,"er ":4432,"epa":580,"es ":2168,"epu":2109,"epo":647,"epr":650,"eri":7215,"erg":698,"ere":1639,"erc":963,"era":4007,"et ":2066,"esk":1461,"esm":595,"esi":709,"ese":1073,"esa":706,"erv":605,"eru":1435,"ert":600,"ers":1367,"ern":3520,"erm":720,"ero":2614,"eki":802,"eko":2778,"eks":1589,"ekt":2568,"eku":789,"en ":4377,"ela":3182,"ele":2918,"eli":4944,"elj":5961,"ell":698,"elo":2967,"elu":1458,"ema":3624,"eme":6131,"eml":1258,"emo":840,"emi":1481,"ene":3561,"eng":1640,"ena":7266,"end":836,"enc":1218,"eno":6088,"eni":7665,"enj":3407,"enu":1213,"ens":1955,"ent":3912,"ego":1890,"egi":1521,"ek ":753,"el ":1052,"ejs":573,"eke":1782,"eka":2350,"em ":3991,"gl ":618,"gij":2556,"gin":523,"gi ":831,"gen":1325,"ger":827,"gdj":702,"ge ":1704,"gar":946,"gal":662,"gan":1740,"ga ":4609,"ađa":855,"ađe":658,"fra":927,"ača":1970,"ače":950,"ačk":3501,"fri":721,"ači":1814,"fsk":622,"aču":958,"for":1658,"ač ":583,"aće":585,"aća":1021,"fil":896,"fik":566,"fin":592,"da ":9164,"de ":2788,"dal":1347,"daj":785,"dat":824,"dar":1385,"dan":5150,"dam":626,"dav":567,"cus":960,"co ":549,"cu ":955,"cea":713,"ch ":519,"ces":1492,"cen":997,"ceg":685,"ci ":6216,"cha":750,"ck ":1290,"che":856,"chi":796,"cij":9280,"cim":1128,"cir":569,"cio":1059,"ed ":1254,"eba":923,"ebe":581,"ean":810,"ea ":773,"duž":619,"ega":1085,"edn":7588,"edi":4719,"ede":1425,"eda":3130,"eg ":2170,"eds":1513,"edo":1202,"eci":935,"eca":935,"dvi":741,"dvo":690,"dva":1103,"drž":4140,"don":1767,"dom":1052,"dol":970,"dok":659,"dov":1189,"dos":617,"diš":1956,"dna":2624,"dne":953,"dni":3564,"dnj":1612,"dno":6490,"dob":1518,"dst":1252,"dra":1195,"dre":1145,"du ":2280,"dru":5517,"dsk":2664,"dic":3889,"der":1371,"des":966,"den":1450,"di ":4014,"do ":2419,"dje":2862,"dim":541,"din":5519,"dio":1844,"dis":1126,"dij":7896,"rađ":876,"rga":1240,"ri ":5834,"rgi":678,"ret":1800,"res":1271,"rev":690,"rez":931,"rać":609,"rač":1555,"reb":1913,"rea":805,"ree":521,"red":6133,"reg":1745,"rem":2802,"ren":2797,"rek":1056,"rel":612,"rep":724,"rdi":692,"re ":4525,"rce":928,"raz":4813,"rd ":686,"rap":1294,"ras":1520,"rat":4060,"rav":4237,"rbi":882,"raj":3616,"rag":678,"ran":11269,"ram":1816,"ral":3163,"rak":1775,"raf":1147,"rad":6716,"rac":1934,"ros":2073,"rot":1439,"rom":2618,"ron":1652,"rop":1763,"roz":1266,"rov":2438,"rob":584,"roa":633,"rod":8265,"roc":1470,"roj":2300,"roi":1622,"rol":793,"rok":933,"rog":1195,"rno":2932,"rič":3609,"rna":1479,"rne":1371,"rnj":845,"rni":2710,"ro ":1317,"rma":1223,"rmi":655,"rlo":551,"rn 
":540,"rkv":521,"rka":695,"ređ":981,"reć":667,"raž":797,"rje":601,"riz":799,"rip":1837,"rio":889,"rir":879,"rit":2510,"ris":3174,"riv":1004,"rig":523,"rij":11045,"raš":597,"ril":1430,"rik":1383,"rin":1848,"rim":1849,"ria":651,"rib":805,"ric":1431,"rid":583,"rug":4029,"rup":1471,"run":520,"ruk":844,"rus":1028,"rva":5151,"rvi":1279,"rve":1037,"rvo":605,"ry ":556,"rsk":5201,"rta":726,"rst":2436,"rti":871,"rt ":714,"ru ":2471,"sad":738,"sam":1694,"san":1142,"sat":604,"sas":3039,"sav":1014,"sa ":5478,"ruž":889,"ruš":721,"ruč":2722,"rzi":592,"shi":607,"si ":1399,"sje":4603,"sis":550,"sin":1783,"sil":686,"sim":836,"sij":991,"se ":14671,"ser":1178,"set":665,"sh ":559,"seb":745,"sel":4338,"spo":2263,"spr":578,"spe":611,"spa":672,"sov":613,"son":650,"sob":1342,"su ":8627,"sre":2282,"st ":3926,"slj":632,"sli":1331,"slo":1832,"slu":1752,"sla":2093,"ski":17159,"skl":527,"sko":15307,"skr":553,"sku":5305,"ska":7019,"ske":5955,"sno":3094,"sna":922,"sni":2255,"sne":697,"smj":1004,"sma":1116,"ste":2346,"sta":16592,"sto":9996,"sti":9753,"stv":3654,"stu":2357,"str":6456,"sus":1535,"sva":788,"sve":1415,"svi":1441,"svj":804,"svo":1576,"taj":996,"tak":2161,"tal":4191,"tac":782,"tav":5990,"tat":1128,"tar":4163,"tan":6149,"tam":583,"te ":6089,"ta ":9339,"pa ":1847,"pe ":1308,"par":1595,"pat":647,"pad":4813,"pal":580,"pan":3273,"pi ":571,"per":1812,"pet":668,"pla":1804,"plj":816,"pli":1041,"ple":3986,"plo":1052,"pje":1454,"pij":889,"pin":4572,"pis":2255,"poz":2191,"por":5631,"pop":1191,"pov":2463,"pot":1881,"pos":3892,"poj":1457,"pog":612,"pom":933,"pon":975,"pok":1642,"pol":3220,"pod":6429,"po ":2082,"psk":1887,"pub":2165,"pti":586,"poč":712,"pra":3170,"prv":1850,"pri":7027,"pre":6249,"pro":8677,"put":1060,"pun":676,"pul":698,"iše":1062,"išn":1267,"išt":2503,"qui":570,"ra ":9271,"ngo":698,"ngl":1753,"ni ":16130,"nga":751,"nej":882,"nek":2622,"nen":652,"nep":734,"ner":1509,"net":1026,"nes":1293,"ng ":2367,"nač":2757,"nez":1162,"nci":2911,"nce":931,"ne ":19985,"ndu":534,"ndo":1052,"ndi":5037,"nde":837,"nda":1627,"ncu":940,"nak":2307,"nal":5099,"nam":1658,"nan":1732,"nap":971,"nar":3446,"nac":3781,"nad":1618,"nag":771,"naj":3818,"nd ":1313,"nav":631,"nat":3622,"nas":6912,"naz":2805,"na ":42477,"mož":877,"nut":1280,"nto":781,"ntr":1002,"nti":2109,"nta":2295,"nte":1843,"nst":1771,"nsk":10758,"nu ":5412,"ičn":3327,"ičk":8118,"iči":1329,"iča":967,"nt ":937,"niš":631,"ns ":584,"noa":520,"nog":6035,"noj":3137,"nom":6601,"nos":6166,"nor":535,"nov":4240,"nič":2517,"ića":608,"no ":15032,"nka":782,"nji":3978,"nje":12914,"nja":6205,"ić ":1370,"nju":1904,"njs":879,"njo":1594,"nij":6542,"naš":1014,"nih":6201,"nic":5054,"niz":1589,"nis":1359,"nit":879,"nir":714,"nim":4415,"nin":1303,"nik":4325,"ogr":2375,"ogu":995,"ogi":1476,"ogl":635,"ogo":1802,"oga":1651,"oj ":12832,"ois":1145,"oim":653,"ok ":1418,"oju":1294,"ojs":576,"ojo":759,"ojn":1052,"oji":10382,"oje":6734,"oja":6046,"ol ":664,"oiz":1012,"oce":973,"oci":1114,"ock":1115,"obu":893,"ode":1274,"odi":8132,"odo":921,"odn":4204,"ods":1138,"odr":4259,"of ":888,"oda":2801,"odu":910,"og ":13004,"oan":559,"oba":2782,"od ":14807,"obo":933,"obr":1117,"obl":1881,"obn":555,"obj":1316,"obi":2599,"oz ":529,"ozn":3312,"ozi":1205,"oza":1421,"oti":1557,"ote":985,"otr":921,"otp":614,"oto":4095,"ost":8908,"ota":912,"ov ":1004,"osi":1554,"ose":1141,"osl":1958,"oso":1519,"osn":2717,"ovj":940,"ovi":7022,"ovn":2887,"ovr":829,"ovo":6487,"ova":7007,"ove":3130,"opć":1972,"opo":690,"opi":1778,"opl":1169,"ope":858,"opa":775,"os ":1176,"opu":936,"opr":664,"ops":861,"or 
":2008,"orm":1284,"orn":2821,"oro":4229,"ord":815,"ore":2892,"org":1291,"ori":7402,"osa":1092,"ort":1833,"ors":1607,"oru":1306,"ot ":522,"ora":3600,"ola":1856,"on ":4329,"olj":2400,"oli":4446,"ole":1119,"ols":789,"olo":3261,"olu":1664,"oka":2684,"om ":17269,"oke":734,"okr":3164,"oko":3273,"oku":2145,"ona":3954,"ond":619,"one":1941,"ong":1053,"onj":777,"oni":2936,"ono":2047,"ons":1609,"ont":1171,"onu":585,"oma":2281,"ome":2911,"omi":1374,"omp":729,"omo":1785,"la ":9649,"le ":3757,"lac":1275,"lad":1494,"lag":644,"lak":731,"lan":3996,"lam":781,"lar":880,"lat":2578,"las":3015,"lav":4030,"laz":5016,"lbu":1947,"kva":701,"kup":5219,"kul":1164,"ksi":936,"ktr":973,"kođ":596,"ktu":654,"kti":1269,"kuć":703,"lok":712,"lon":791,"lom":1506,"lop":558,"log":2016,"lov":4118,"lno":1832,"lni":2255,"lne":712,"lob":538,"lič":1936,"lna":885,"ltu":737,"lub":608,"lsk":2111,"lu ":2550,"liš":638,"lj ":865,"li ":12760,"les":1936,"let":657,"lem":4158,"len":1436,"lek":2007,"led":614,"lo ":4167,"lla":657,"lle":663,"lli":529,"ljs":545,"lju":4071,"lje":10652,"ll ":619,"lja":4832,"lji":2209,"lit":2066,"lis":1339,"lin":3083,"lim":1508,"liz":1420,"lic":2147,"lia":609,"lik":6521,"lij":3213,"lig":541,"ma ":15645,"luž":1294,"mac":714,"maj":623,"mak":533,"mar":837,"mas":812,"mal":2079,"man":3666,"mat":2524,"mbi":905,"me ":5240,"med":673,"met":3414,"mer":3791,"mel":710,"men":6548,"mač":1334,"lum":944,"loš":641,"mpi":753,"mog":976,"mon":1050,"mor":2726,"mos":601,"mot":653,"mu ":1457,"msk":1673,"mun":787,"mi ":1004,"međ":2505,"mje":5038,"min":2090,"mil":697,"mir":1036,"mis":832,"ešt":1289,"mit":939,"ešk":669,"mij":1629,"mo ":1420,"mlj":1522,"mno":900,"rža":4071,"ča ":974,"čan":800,"čar":558,"čaj":693,"uča":545,"zra":1677,"če ":853,"učj":2470,"uči":687,"čav":1111,"čen":1159,"čet":1250,"čes":595,"zu ":768,"zva":612,"zvi":939,"či ":1484,"zvo":1982,"zum":625,"čij":837,"čic":566,"čit":844,"čin":2554,"čko":4001,"čka":1912,"čke":2028,"čki":5096,"čju":1626,"češ":556,"čla":542,"čni":1699,"čno":3306,"čna":751,"čne":1006,"zi ":3239,"zem":1449,"zer":1084,"ze ":973,"zbo":641,"zda":1154,"zac":771,"zbe":843,"zaj":1076,"zam":819,"zan":918,"zal":859,"zap":3007,"zav":531,"zas":565,"zon":794,"zme":1381,"zna":6328,"zno":753,"zič":837,"zni":942,"zla":762,"uća":616,"zli":1013,"ući":766,"zic":1241,"zij":2320,"rši":715,"zin":1079,"zil":1675,"zik":7193,"zir":891,"ziv":3073,"za ":7407,"ya ":644,"ože":1175,"ću ":879,"ćin":2919,"ći ":2008,"rčk":1144,"oš ":708,"ošk":698,"vrš":1066,"wa ":713,"viš":1245,"vrt":868,"vrs":1358,"vri":916,"vre":806,"vsk":989,"vu ":2689,"vir":547,"vil":1092,"vim":1483,"vin":3680,"vih":903,"vij":3997,"vic":888,"vid":903,"vit":817,"vis":1291,"već":1856,"vje":3831,"vla":1440,"vlj":3383,"vo ":3548,"vne":801,"vna":1303,"vno":3450,"vić":621,"vni":4024,"vod":2717,"vog":1164,"voj":3212,"vol":582,"vom":1226,"vor":5059,"vot":928,"voz":1030,"vi ":4716,"vač":1765,"vez":1580,"ver":4932,"vet":804,"vać":849,"ven":3207,"vel":1857,"ved":700,"ve ":5319,"val":1770,"vak":779,"van":5697,"var":1878,"vat":5563,"vac":751,"vaj":1521,"va ":12711,"uz ":1098,"usk":1818,"usi":743,"ust":3150,"uti":590,"ute":706,"uta":1448,"uto":1403,"us ":1654,"ut ":910,"ura":1400,"ure":1189,"uri":1158,"urn":947,"uro":1486,"urs":550,"uru":580,"upa":2662,"upi":4500,"upe":980,"upo":759,"upr":748,"upn":605,"umi":603,"umj":671,"uma":1379,"umb":688,"ume":724,"unu":749,"uni":993,"und":654,"una":2557,"up ":535,"uko":757,"ukl":657,"um 
":2138,"uka":702,"uju":1023,"ult":1215,"pći":2432,"uli":687,"ula":1753,"uhv":776,"uje":2393,"ugi":1126,"uge":731,"ugo":2301,"ugl":583,"uga":1630,"ugu":1137,"uda":924,"udi":2722,"ue ":562,"uci":556,"ug ":588,"ua ":834,"ual":550,"uan":840,"ubl":2337,"tvu":537,"tvr":1039,"tvo":2139,"tve":1133,"ođe":1121,"tva":2106,"tur":2517,"tup":876,"tud":1103,"tre":2014,"oče":731,"tra":6680,"će ":1908,"oči":564,"tri":3139,"tru":1417,"tro":2790,"očn":2463,"tu ":3006,"tsk":7605,"toč":2840,"ćen":895,"to ":5478,"tni":2342,"ća ":3122,"tna":778,"tič":3116,"tno":1901,"toc":579,"toj":1710,"toi":723,"tog":984,"tov":1949,"tom":1889,"ton":1847,"tok":3601,"tol":1732,"tor":3179,"top":785,"ćan":554,"tij":2254,"til":813,"tik":1009,"tih":692,"tir":1327,"tit":1406,"tis":916,"tin":3058,"tim":1630,"tio":1426,"tic":2096,"tiv":2430,"tje":1395,"tko":583,"tka":775,"tla":700,"tem":1590,"ten":2148,"tek":931,"tel":2393,"th ":723,"tet":818,"ter":4781,"ti ":9529,"the":1111,"živ":2628,"žni":894,"žno":1666,"že ":1842,"žbe":582,"žav":3922,"žan":794,"ži ":866,"žen":1273,"užb":678,"uže":864,"uži":889,"užn":2163,"žup":1446,"ušt":795},"n_words":[5153330,5928363,4281211],"name":"hr"}
\ No newline at end of file
+++ /dev/null
-{"freq":{"D":13198,"E":18562,"F":13797,"G":13466,"A":98368,"B":24663,"C":21970,"L":15870,"M":28130,"N":17752,"O":8481,"H":13530,"I":14875,"J":7703,"K":21077,"U":4237,"T":19473,"W":4434,"V":10406,"P":22295,"S":35593,"R":12093,"Y":1303,"X":1480,"Z":4534,"f":108322,"g":317259,"d":216557,"e":996693,"b":219927,"c":121096,"a":1016786,"n":579901,"o":467276,"l":670215,"m":298372,"j":133885,"k":454857,"h":131050,"i":506132,"w":6657,"v":203150,"u":147118,"t":700784,"s":637262,"r":536637,"q":1410,"p":129287,"z":429255,"y":242745,"x":7186,"É":1936,"Á":2646,"í":56628,"é":327830,"á":372527,"ü":54519,"ú":35451,"ö":107023,"ó":117740,"ő":81190,"ű":27405," l":36854," m":86086," n":55262," o":27009," h":46816," i":30318," j":25021," k":101750," d":20317," e":90097," f":58652," g":17434,"р":1511," a":216424,"с":1133," b":38681," c":32807," z":5071," u":7248," t":73839," v":67319," p":25823," s":75268," r":41479," J":7511," K":20282," H":12799," I":11800," N":16438," O":7214," L":14736," M":26568," B":23082," C":19809," A":94593," F":12742," G":12445," D":12015," E":17107,"л":1197," Z":3901,"к":1191," Y":1159,"и":1900,"о":2301,"н":1605," S":32225,"в":1255," R":11088,"а":2749," P":20591," W":4161," V":9030," U":3712,"е":1700," T":17702," á":18269," í":4682," é":57281," ö":8570," ó":3751," ü":2190," ú":3722," Á":2459," É":1858," ő":1879,"A ":56345,"Da":1481,"Cs":2770,"Co":4107,"Ce":1232,"Ch":2651,"Du":1186,"Do":1533,"De":2255,"Di":1628,"Fe":3066,"Fa":1242,"Ez":1912,"Eu":1996,"Er":1727,"El":1696,"Eg":3224,"Ge":1854,"Ga":2079,"I ":2334,"Fr":2249,"Bí":1300,"Fo":1548,"Fi":1170,"B ":1118,"C ":1489,"Au":1680,"Ar":2744,"Ba":3859,"Az":15235,"Ac":1577,"Am":2415,"An":3658,"Al":3816,"Bu":2584,"Br":2522,"Ca":4180,"Bi":1847,"Be":4063,"Bo":3301,"Kr":1187,"Ko":2867,"Le":3145,"Li":2545,"La":4095,"Lo":1996,"Me":3040,"Mi":3545,"Ma":10861,"Mu":1674,"Mo":3340,"Ne":3381,"Na":3799,"Ny":3322,"No":1352,"Ol":1338,"Gr":1713,"Go":1444,"Gy":1283,"Ha":3592,"He":2397,"II":1513,"Dé":1163,"Ho":2637,"In":4213,"Is":1406,"Ja":1924,"Je":1294,"Jo":1519,"Ka":3817,"Ki":3017,"Ke":1650,"Tu":1174,"Tr":2483,"To":1807,"Th":2533,"Ti":1466,"Te":2749,"Ta":2278,"V ":1123,"Sz":12584,"St":3173,"Wi":1325,"Vi":2150,"Va":2270,"Ve":1656,"Pr":2035,"S ":1580,"Pe":2500,"Pa":5475,"Po":2747,"Pi":1727,"Or":1697,"Kö":2908,"Se":1702,"Sc":1225,"Si":1430,"Sp":1130,"So":1614,"Sa":3389,"Re":2307,"Né":1630,"Ro":2833,"Ra":1803,"b ":16527,"a ":242203,"bö":1784,"i ":117842,"bó":6751,"ge":24009,"gf":2198,"ga":25428,"gb":5764,"bé":2495,"ff":1119,"fi":9218,"fr":3611,"fu":1929,"fo":23259,"j ":9703,"gy":101312,"gz":1512,"dá":7767,"he":27807,"ha":38767,"gn":5176,"gm":1339,"cé":2844,"gl":4237,"gk":2851,"gj":4145,"gi":17202,"gh":4135,"gg":3431,"gv":1711,"gu":3889,"gt":2295,"gs":3570,"gr":6943,"cí":4683,"go":14059,"dt":3427,"du":4251,"dv":2240,"dz":1317,"g ":45451,"ea":6887,"eb":8727,"ec":7541,"ed":20502,"de":31392,"dd":1273,"di":23518,"dj":21891,"do":18282,"dn":1552,"ds":5503,"dr":4675,"ex":2202,"eu":3957,"ev":16778,"ey":1469,"ez":42514,"fa":14955,"h ":4414,"bá":5025,"fe":22515,"eh":6429,"eg":88315,"ef":4014,"ee":2573,"el":138888,"ek":55065,"ej":8422,"ei":11205,"ep":13034,"eo":3448,"en":123802,"em":38648,"et":109362,"es":66857,"er":112461,"ca":6636,"e ":99285,"bs":1600,"br":4886,"bu":6371,"bo":9382,"bl":2460,"bi":10991,"bb":21421,"bd":1747,"be":56227,"db":1466,"da":33610,"f ":3042,"cu":2732,"ct":3944,"cs":39940,"cr":1412,"co":5597,"ck":3543,"ci":18130,"ch":10237,"ce":11684,"cc":1311,"c ":4126,"az":52261,"ay":1779,"ba":62023,"d 
":20001,"at":73551,"as":35129,"ar":73118,"av":6465,"au":6414,"ak":76786,"al":112498,"ai":30999,"aj":22921,"ao":1282,"ap":21474,"am":34363,"an":99045,"ac":9905,"ad":30543,"aa":1143,"ab":12613,"ag":47002,"ah":3611,"ae":13109,"af":4252,"nu":6592,"nt":48640,"ns":9377,"no":16464,"nn":10307,"nz":2533,"ny":61139,"gő":1259,"jó":1888,"ká":8866,"of":2805,"oc":6235,"od":12616,"ob":7903,"jú":2913,"ké":20553,"om":30097,"on":54876,"ok":33935,"ol":57719,"oi":1954,"og":16189,"oh":1599,"ot":26104,"os":55594,"ov":15603,"ou":3364,"op":11263,"oo":1729,"or":73234,"kí":3018,"kú":6878,"r ":40419,"ow":1527,"kó":3434,"kö":37773,"oz":36942,"lá":37200,"pe":16459,"pa":16699,"kü":4521,"pc":1541,"pl":5060,"lé":24839,"po":17893,"ph":4251,"pi":8966,"pj":2198,"pk":2115,"lo":28055,"ln":6206,"lm":14166,"hé":2687,"ll":38085,"ls":10986,"lr":2355,"hí":1890,"lv":12472,"lu":8529,"lt":43305,"lz":1116,"ly":54235,"o ":10039,"ma":45288,"mb":11466,"me":75090,"iá":11532,"ml":3581,"mi":28248,"mn":1344,"mm":4364,"mp":6217,"mo":16948,"ms":1960,"mu":7004,"fő":4516,"ió":9250,"mz":4734,"p ":5660,"na":50058,"nb":3564,"nc":11966,"nd":37316,"ne":56223,"já":30832,"nf":2395,"ng":19748,"nh":1725,"ni":26541,"nj":1322,"nk":8011,"nl":3257,"jé":12386,"jt":5016,"ju":2496,"jn":3362,"jo":4214,"jl":2908,"fé":13187,"ki":30779,"kh":2892,"ke":42527,"gá":10089,"kc":1158,"kb":6547,"fü":2239,"ka":34729,"m ":27269,"fö":2942,"jz":1414,"gó":4573,"dő":5431,"gö":3372,"ks":5542,"kt":6729,"ku":13047,"kv":1892,"ko":37181,"gí":1165,"kr":7237,"kk":6940,"kl":3302,"gé":11029,"km":3318,"kn":3693,"li":35208,"lh":5532,"lk":11727,"lj":5903,"le":93067,"há":9402,"ld":10308,"lg":5358,"lf":2782,"la":70668,"lc":4145,"lb":4771,"gú":1727,"n ":153664,"hr":1649,"dí":4512,"dó":5588,"ht":1620,"hu":2691,"hi":9568,"hn":1454,"ho":21034,"dé":9871,"id":22338,"ic":13315,"ib":5712,"ia":30517,"ih":1907,"ig":15930,"if":9201,"eá":1335,"ie":7180,"dö":1970,"hy":1869,"k ":156709,"ir":14921,"is":42797,"it":23842,"iu":6023,"iv":7173,"bő":3863,"ii":3341,"ij":1228,"ik":46279,"il":27833,"im":8040,"in":53934,"io":8559,"ip":5320,"fá":1392,"je":23937,"jd":2673,"iz":8720,"l ":91636,"ja":22570,"xi":1625,"té":34673,"tí":4348,"tó":18526,"sö":2411,"só":2652,"sú":4418,"z ":78459,"sü":3136,"tá":43166,"wi":1167,"sé":27679,"sí":6485,"ró":12762,"rö":8024,"y ":64628,"rú":3555,"rü":11432,"wa":1485,"sá":25703,"ré":19532,"vi":19889,"vt":2486,"rí":1306,"vo":15593,"mű":8899,"nő":6159,"ve":52514,"rá":28508,"va":38110,"x ":2815,"ui":1873,"uk":4884,"ul":23432,"ue":1945,"ug":7493,"ur":14404,"us":30910,"ut":12436,"um":11242,"un":8785,"up":2459,"ty":4352,"tu":13656,"tt":50460,"tv":7094,"ub":2532,"ua":1854,"pü":4993,"ud":8185,"uc":2378,"to":52152,"tn":2524,"pé":3054,"tm":2785,"tl":6382,"ts":8729,"tr":17609,"pí":3835,"tf":1412,"te":87104,"pá":7384,"tk":5205,"tj":7119,"ti":40828,"th":9478,"v ":9584,"tb":6187,"tc":1163,"ta":76171,"su":4875,"sv":2378,"ss":22768,"st":26226,"sz":174760,"lő":14903,"sl":2577,"sk":7704,"sn":2526,"sm":5506,"sp":4512,"so":26449,"sr":3213,"sd":1132,"sc":3372,"sf":1789,"se":34881,"sh":2900,"si":20642,"nö":2962,"rz":5827,"u ":5228,"sa":40529,"sb":5688,"rr":6334,"rs":21922,"rt":47601,"ru":7619,"rv":8829,"nó":1459,"kő":1313,"ry":3017,"rp":2572,"ro":47249,"rn":9437,"né":20022,"rm":20825,"rl":4279,"rk":7954,"rj":4223,"ri":45410,"rh":2668,"rg":7757,"rf":3837,"re":67985,"ná":12107,"rd":9230,"rc":7289,"rb":5639,"ra":55047,"t ":159290,"mú":1197,"mó":2093,"qu":1194,"mí":1861,"mé":15355,"má":27550,"lü":5880,"s 
":157503,"lö":4000,"pt":5892,"pu":3980,"ló":16644,"pp":2547,"lí":3237,"pr":7474,"ps":3440,"yő":1310,"zü":5181,"zú":1280,"zá":31274,"yú":1973,"yü":3112,"zé":19023,"zí":5603,"zö":10272,"zó":20498,"vű":1517,"yá":13547,"yí":1801,"yé":5272,"yó":2584,"vő":2313,"tű":4358,"vö":1632,"zz":3816,"sű":1526,"tő":14413,"zf":1314,"zg":2806,"zh":3151,"zi":25268,"zb":1632,"zd":4010,"ze":71368,"vá":29898,"za":30193,"yz":1903,"zv":1698,"rű":2676,"ső":8855,"zs":12765,"zu":3963,"zt":36837,"zo":19843,"zn":6608,"ví":2765,"zp":2246,"zk":4768,"vé":17783,"zm":2937,"zl":11760,"yg":1834,"yh":1922,"ye":40237,"uá":1400,"yf":2056,"yc":1116,"ya":26830,"tü":4027,"yb":3764,"tú":2750,"tö":12442,"rő":4773,"yv":4653,"yu":3843,"yt":3497,"ys":7991,"yr":1838,"yp":1398,"yo":12260,"yn":3914,"ym":1973,"yl":2832,"yk":2442,"yi":17109,"ző":8108,"yű":1984,"Ál":1439,"ám":8760,"án":50938,"áp":2119,"áj":7251,"ák":18338,"ál":59120,"ág":29984,"áh":4265,"áb":34134,"ác":4739,"ád":14308,"áz":7779,"áv":4411,"áu":1148,"ár":52483,"át":21141,"ás":47264,"á ":2111,"óz":2079,"ós":5892,"ót":4344,"óv":1296,"óa":1234,"ób":3397,"ój":4542,"ói":2619,"óg":4159,"óf":2044,"ód":5384,"óc":1867,"ór":2907,"óp":3032,"ón":2969,"óm":2752,"ól":15303,"ók":6328,"ó ":44110,"ív":3409,"íz":3248,"ín":4459,"ím":4915,"ír":8949,"ít":21446,"íg":1483,"ík":1149,"íl":1523,"íj":1882,"éz":4750,"ék":26204,"él":30309,"éj":1119,"ép":16482,"ém":6476,"én":36437,"és":84643,"ét":19103,"ér":22242,"év":16532,"éb":17184,"éd":3694,"éc":1321,"éh":4561,"ég":31689,"é ":4130,"úz":1141,"ün":2126,"ül":37857,"ür":1396,"üt":3250,"üz":1153,"üg":2645,"ük":3502,"úl":1994,"új":3185,"út":2986,"ús":1934,"úr":1723,"úa":6976,"úg":2023,"öv":8122,"öz":26938,"ú ":9189,"öt":9388,"ör":17835,"ös":7442,"ön":10766,"öl":8906,"öm":1336,"ök":5801,"ög":3454,"öd":2116,"öb":4016,"ő ":26112,"őv":2032,"őr":2463,"ős":10918,"őt":2983,"őz":2353,"őd":2691,"őb":2103,"őg":1179,"őe":1463,"ől":9494,"ők":4433,"őj":2432,"ői":2735,"őn":2715,"ű ":13555,"űk":1684,"űe":1599,"űs":1405,"űr":1219,"űv":2905,"一":1237," Ál":1439," Ga":2065," Ge":1837," Bí":1298," Fo":1535," Fr":2248," Fi":1161," Ha":3587," He":2393," Gy":1279," Go":1438," Gr":1700," Ho":2631," Dé":1163," Je":1288," Ja":1919," Is":1400," In":4206," Ka":3809," Ke":1641," Ki":3002," Jo":1515," La":4057," Le":3135," Li":2524," Ko":2859," Kr":1178," Ma":10822," Mi":3523," Me":3025," Lo":1988," Ne":3354," Na":3783," Mo":3325," Mu":1669," A ":54120," Am":2399," An":3650," Al":3798," Ac":1573," Ba":3855," Az":15203," Au":1680," Ar":2712," Be":4051," Bi":1842," Bo":3283," Br":2520," Bu":2577," Ca":4107," Ce":1230," Ch":2631," Co":4068," Cs":2760," Da":1472," Di":1622," De":2234," Do":1490," Du":1183," El":1692," Eg":3214," Er":1720," Ez":1906," Eu":1996," Fe":3060," Fa":1224," Wi":1310," a ":114242," Kö":2904," Or":1691," Po":2728," Pi":1722," Pe":2496," Pa":5457," Ny":3316," No":1346," Ol":1338," Ra":1786," Né":1627," Ro":2824," Re":2297," Pr":2030," Sz":12567," St":3106," Ta":2270," Th":2520," Ti":1460," Te":2728," Tr":2468," To":1786," Sa":3377," Si":1425," Sc":1193," Se":1692," So":1606," Sp":1107," Va":2262," Ve":1647," Vi":2142," Tu":1156," ja":1594," je":10035," in":4519," il":2853," is":10067," ir":2442," fü":1523," ka":7871," fö":1646," ki":17579," ke":13720," jo":1938," fé":4201," gy":6161," ha":19271," he":6504," cé":1611," cí":4405," gr":1159," id":3467," ig":3043," hi":3276," dé":2801," ho":8126," dí":1823," ne":18984," já":7979," na":5781," fő":4149," mu":2133," mo":4422," ok":1477," ol":6794," ké":11002," of":1186," ny":10296," no":1409," há":4031," le":17706," li":2353," 
la":7930," ku":2128," gö":2536," km":2432," kr":1168," ko":9227," me":30782," mi":10796," ma":18644," lo":1099," hí":1381," ad":3539," am":14965," an":8296," ak":3658," al":19507," au":2171," ar":2746," as":2187," ba":7443," az":39674," bi":2851," be":18442," bo":2566," bu":1302," er":6741," es":6369," en":1270," em":3989," el":22279," eg":37244," fe":16017," bá":1342," fa":10457," ez":6961," fu":1228," fr":2101," fo":11286," bé":1167," fi":3986," ge":1465," ga":1837," ci":1469," da":2485," cs":21597," do":1754," de":6050," di":2713," vé":3902," ví":1450," vá":10832," ze":2004," tö":9167," té":2457," tá":3920," nö":1405," sa":2016," se":2587," sp":2365," so":5755," ra":2479," re":19463," né":13996," ro":3461," pr":3595," má":8283," mé":4247," os":10567," kí":1660," or":4197," kö":27883," pe":2664," lá":2021," kü":3017," pa":5441," lé":4236," po":4686," pi":1682," ró":1121," rö":2141," va":20643," ve":9072," nő":1649," mű":3958," vo":11603," vi":8365," ré":10099," tu":3046," ut":3907," ur":1101," ta":23929," sz":51220," st":2457," su":1760," tr":1755," pé":1624," to":2214," ti":2324," te":19251," pá":3006," át":2658," ál":11522," év":4437," ér":4015," és":39564," én":2210," ép":2514," él":3293," ír":3245," ók":1246," ön":2076," ös":3774," ót":1104," új":1846,"ől ":7803,"ője":1778,"ők ":2201,"őne":2127,"ősz":2500,"ős ":1549,"ősí":1918,"ősö":1177,"őtt":1492,"őbb":1227,"ői ":1993,"Áll":1152,"Eur":1570,"Ez ":1152,"Fra":1443,"Bír":1270,"Int":2403,"Az ":14884,"Ame":1184,"Bud":1205,"Car":1146,"Cse":1140,"Egy":3037,"Nem":1251,"Nag":2455,"Nye":1809,"űvé":1367,"Köz":2207,"Pas":2299,"Par":1124,"űve":1432,"űkö":1244,"Lad":1325,"Mar":2612,"Mag":3698,"éhe":3730,"ék ":14028,"ége":6434,"égb":1296,"égi":4433,"égy":1459,"él ":2635,"éde":1275,"ég ":10442,"ébe":12999,"éba":2195,"éve":6732,"ésé":4210,"év ":4758,"ész":20296,"éte":4730,"élő":1204,"ést":1350,"ésr":1270,"épü":1343,"épí":1687,"étr":1526,"éze":2382,"érő":1195,"éme":4235,"ép ":2557,"éne":8073,"éni":1139,"ént":3323,"ény":11240,"ékh":1465,"éke":3632,"éko":2177,"égé":2685,"én ":10291,"éli":2101,"éle":4096,"élk":1173,"ély":3515,"élt":1344,"ére":3670,"érf":1845,"ét ":8716,"ért":4687,"érs":1117,"éri":1359,"ése":7910,"ési":1788,"épe":3380,"ér ":1942,"éps":1130,"élé":8715,"és ":42926,"ésű":1277,"Szl":3665,"Sza":1311,"Sze":2596,"The":1583,"ául":1145,"áts":1437,"áto":1839,"biz":1578,"áss":1306,"ást":1822,"ász":3797,"áró":1875,"ásá":7057,"árá":5593,"áva":2230,"bol":1519,"átó":1225,"áté":2725,"áz ":1260,"ázi":1122,"bor":3409,"áza":2446,"áll":11176,"bbi":2623,"ált":12559,"bba":2240,"ály":14827,"be ":8763,"áma":1434,"ban":38532,"bal":3567,"baj":2665,"áku":1940,"bad":1154,"án ":13154,"ála":2623,"bar":1493,"áli":4198,"álh":2859,"áno":1436,"ány":14546,"ánu":1287,"ámo":1867,"ána":12065,"bda":1545,"áni":2143,"ájá":1672,"áló":2208,"ás ":11271,"bi ":2519,"ár ":6296,"ber":6263,"ben":26680,"bel":7243,"bes":1771,"bet":1775,"ása":4529,"árt":3175,"árs":2861,"áso":5295,"ási":3907,"ásb":2951,"át ":7693,"ámí":1237,"áro":12322,"árm":2982,"ári":3231,"ára":5673,"ács":1314,"áci":2905,"ág ":10677,"ádj":10076,"ábó":2969,"ága":2054,"ágb":3476,"ca ":1909,"ágo":3161,"ági":2804,"ák ":11048,"áho":4041,"ája":2888,"ál ":2859,"ágá":1372,"áki":3435,"ce ":2292,"ám ":1135,"bri":1367,"bra":1211,"bur":1175,"bum":2164,"ád ":2121,"ábo":1867,"ább":3556,"ába":23979,"ajz":1349,"aka":2195,"am ":2190,"agá":2325,"aki":3799,"afé":1601,"ajn":3205,"ajt":2171,"al ":17585,"aja":1802,"ajd":2547,"ain":2508,"ait":1256,"ak ":42544,"adó":2898,"aj 
":6376,"agy":29473,"adá":3539,"agj":1385,"ago":2554,"anu":1841,"any":6283,"ano":1477,"ann":3140,"ant":4051,"ans":1439,"ane":1370,"ajá":1272,"ang":7307,"ani":3989,"ank":1265,"ana":3610,"anc":3826,"and":5977,"amm":1385,"amo":3002,"amp":1099,"ami":6863,"ame":12652,"amb":1439,"ama":2681,"alv":1163,"alu":2639,"alt":1806,"alr":1104,"alo":6119,"alm":6541,"all":3854,"alk":5861,"ali":4354,"alc":1755,"ale":2695,"ala":25236,"alb":2678,"an ":50569,"akr":1682,"aku":2116,"akt":1697,"ako":5426,"akk":1936,"aba":1856,"abb":3158,"abd":1557,"ae ":11230,"ad ":2344,"ai ":23587,"aga":3600,"ado":2879,"adi":3222,"ag ":2506,"adt":1359,"aci":1651,"ach":1191,"ace":1933,"ada":7719,"acs":1177,"azo":3899,"azi":1454,"azt":1117,"azg":1460,"aza":1771,"azd":1290,"azz":1187,"az ":35775,"asú":1719,"atá":8809,"ató":6586,"azá":1314,"ba ":10534,"bb ":12760,"at ":13186,"are":1125,"ard":2003,"arc":2552,"ara":12278,"aro":4590,"arm":1949,"arl":1467,"ark":2718,"ari":4156,"aru":1342,"arr":1237,"art":20941,"asa":1245,"aso":1347,"ar ":7389,"akö":2063,"apa":2996,"akú":5988,"alá":20616,"ape":1622,"apc":1186,"apj":1950,"api":1102,"apo":2602,"apt":1225,"aló":3345,"as ":5129,"ava":2849,"aut":1403,"avi":1292,"ará":1634,"asá":3004,"arú":1364,"asz":10871,"atb":1153,"ata":7891,"ast":1692,"ass":5488,"atl":1204,"apí":1933,"atr":1521,"ato":8994,"apá":1330,"ate":2686,"ati":9317,"ath":1161,"att":4210,"aur":1366,"jed":1383,"jel":12132,"jez":1371,"jes":2679,"jai":1887,"je ":3339,"fér":1780,"fél":10120,"job":1168,"jno":2503,"jog":1359,"jle":1493,"itr":1237,"ito":1407,"itt":1847,"isk":1278,"ism":3684,"iss":1661,"ist":5127,"isz":6268,"ita":2682,"ite":2144,"iti":1991,"ius":3051,"ium":2277,"inő":1958,"iva":2264,"ügg":1434,"irá":5579,"ive":1942,"is ":16119,"ion":3768,"ilá":4340,"iro":2666,"ise":2504,"ire":1148,"ira":1291,"it ":5468,"ült":4085,"ülé":2870,"izá":1108,"ja ":16271,"ül ":11962,"ből":3435,"ük ":2123,"itá":2293,"üle":10807,"izo":1300,"ize":1447,"kif":1538,"kik":1160,"kil":1182,"kia":2258,"kin":1795,"kir":2041,"kis":3777,"kit":1528,"úsz":1202,"km ":2029,"ki ":6708,"khe":1740,"ked":3722,"kel":6145,"gál":2697,"ken":2595,"gán":1121,"kes":2444,"gás":1179,"ker":9767,"gár":1661,"ket":5479,"kez":5201,"ke ":3579,"kra":2981,"koz":3084,"kot":2662,"kos":4432,"kor":12163,"kon":4605,"kom":1428,"kol":2622,"kok":2194,"kod":1381,"gés":1802,"kna":1132,"kiá":3411,"géb":1114,"gép":2262,"gén":2847,"kke":2328,"kka":1690,"kbe":2008,"kba":2863,"kat":6062,"kar":2888,"kas":1212,"kap":3160,"kan":1437,"kal":4211,"kai":6519,"füg":1411,"ka ":5099,"föl":2822,"gyé":3341,"gyá":1929,"gyü":2858,"ha ":1609,"han":3792,"haj":2261,"hal":5044,"har":4293,"has":5102,"hat":10778,"hag":1327,"had":1308,"gyű":1250,"he ":2263,"hel":6775,"heg":1841,"hez":5105,"het":6443,"dás":3281,"dár":1447,"her":2201,"hiv":1273,"gla":1955,"gna":2223,"gió":1365,"cél":1462,"úak":6906,"gok":1862,"gol":3486,"gon":2487,"gos":1601,"got":1210,"gra":2375,"gre":1699,"cím":4480,"gsz":2045,"gy ":37866,"gus":1307,"gya":12084,"gye":12610,"gyh":1320,"gyi":8702,"gyk":1565,"gym":1148,"gyo":4080,"úgó":1191,"gys":4096,"iai":5083,"ial":1462,"ian":2020,"iat":1462,"iad":1714,"id ":1248,"ibe":1135,"ia ":14894,"ig ":5385,"ifo":5710,"ife":1711,"ics":1889,"ici":1201,"ich":2336,"ice":1195,"ie ":1156,"ica":2195,"ide":4043,"ida":9550,"ika":8060,"ige":3208,"iga":2520,"ii ":1284,"idé":1418,"ik ":22104,"imp":1521,"inc":1595,"ind":5305,"ina":5669,"ino":2798,"int":11506,"ine":3645,"ing":3990,"ini":2901,"ink":1323,"iká":2172,"inu":1420,"iko":1515,"ike":2523,"ila":1887,"in 
":5995,"idő":1933,"iku":4841,"ilo":2015,"ill":7614,"ilm":1881,"ili":3110,"ile":2438,"hoz":7697,"hol":2205,"hon":1202,"hog":3581,"hos":1589,"hor":2048,"dék":1979,"dél":3343,"dés":3202,"dó ":3604,"dít":1872,"díj":1758,"fen":1212,"bán":1642,"fek":1857,"fel":10451,"fej":4036,"fia":1202,"ező":2970,"fal":3069,"faj":8821,"etű":3115,"ető":6086,"ezé":5371,"etü":1875,"esü":1885,"ez ":6988,"eté":6862,"erű":1796,"ezd":1515,"erő":1876,"evé":4056,"ezt":2772,"eze":13658,"ezh":1979,"ezi":1930,"etb":3254,"eta":1302,"ete":14739,"eti":8705,"etl":3489,"etk":2537,"est":5914,"ess":3758,"elő":7242,"esz":10684,"epü":3424,"etr":2609,"ets":2876,"ett":18422,"etv":2394,"erá":1228,"eve":8161,"eré":4519,"eur":1117,"esí":1394,"erü":9565,"epe":2041,"epi":1162,"er ":12986,"es ":29789,"elü":5526,"epl":1338,"elé":3820,"erk":1552,"eri":12722,"erj":2568,"erg":1768,"ere":20225,"erc":2570,"erd":1315,"era":2862,"erb":2020,"et ":26086,"emé":3302,"esi":1683,"ese":5978,"erz":2655,"erv":3920,"err":1968,"ert":6946,"ers":4641,"ern":2834,"erm":4064,"ené":1314,"ero":1595,"eki":1276,"egé":3370,"ekt":2496,"en ":52180,"ela":1439,"ele":27299,"eli":3526,"elj":2109,"elh":1530,"ehé":1275,"elm":4378,"eln":2146,"elk":2863,"ell":9606,"elo":1349,"elv":9038,"els":6778,"elt":4823,"ely":23022,"emb":4386,"ema":1264,"eme":6876,"eml":2503,"emi":1303,"emp":1725,"emz":4591,"ene":6376,"eng":4545,"end":20330,"enc":3429,"ejé":1183,"enn":3397,"enl":1598,"eni":2643,"ens":3074,"ent":14391,"eny":3762,"egk":1751,"egj":2290,"egn":2926,"ege":4339,"egf":1828,"egi":1596,"egh":1485,"egr":1473,"egs":1610,"egt":1312,"egy":47388,"edé":2069,"ehe":2302,"ek ":33389,"ein":1947,"el ":15987,"eit":1126,"ejt":1562,"ejl":2010,"eje":2098,"eke":7178,"ekb":1825,"em ":9006,"ött":6080,"gje":1874,"öss":4328,"gja":1441,"git":1317,"gia":2402,"ört":3402,"örz":1373,"öná":1325,"gha":1393,"ös ":1742,"gi ":5691,"ör ":1444,"gen":2834,"get":4617,"ger":3980,"ges":2578,"gel":2023,"öny":3378,"gek":1625,"önb":1254,"ge ":3243,"gbe":1509,"öld":3665,"gaz":3489,"gba":3483,"ölt":2040,"gas":2172,"gar":1902,"gat":6541,"gal":3993,"ga ":3541,"ök ":2555,"ög ":1983,"ból":6420,"fra":1911,"özö":5568,"özé":2985,"for":12637,"fon":1546,"fol":3847,"özi":2766,"öze":2220,"özs":4673,"özp":1522,"özl":1118,"övé":1547,"fog":4106,"özt":1797,"örü":1396,"fil":2271,"fin":1712,"övi":1986,"öve":4144,"örö":5177,"da ":3729,"de ":4718,"dal":6673,"dae":8117,"dat":1797,"das":1500,"dar":6693,"dap":1398,"cti":1331,"csá":1388,"ciá":1338,"ció":4016,"cs ":2427,"öbb":3612,"cse":2576,"csa":18273,"cso":5464,"csi":2762,"csk":1549,"cea":1428,"ch ":1376,"cer":1367,"cen":1114,"ci ":1364,"cha":1323,"cia":3115,"ck ":1275,"che":1523,"chi":1879,"ed ":1845,"ebb":4696,"ebe":1132,"eae":1455,"ea ":1332,"efo":1148,"ei ":4004,"ega":1438,"edi":4113,"ede":6411,"eg ":8600,"ech":1220,"ect":1158,"ecs":1616,"dor":1576,"don":3362,"dom":3952,"dol":1610,"dot":2319,"djá":9950,"djé":9456,"dsz":3860,"dul":1851,"dt ":1182,"dia":1805,"der":2519,"des":1303,"det":5300,"dez":2835,"del":4174,"den":3777,"di ":3532,"dja":1819,"din":2017,"dig":1984,"dik":7071,"rga":1547,"ri ":7426,"rgi":1214,"rge":1233,"ret":2640,"res":5836,"óta":1799,"rfi":1520,"red":5622,"ósz":1200,"reg":2954,"rem":1197,"ren":20766,"rek":3056,"rel":1449,"nál":7068,"rep":4197,"rde":1294,"ós ":2694,"re ":12803,"rci":1283,"rce":1814,"ópa":1701,"rd ":1700,"ras":2085,"rat":3432,"rba":1762,"rbe":1412,"raj":1914,"rai":1551,"rag":1242,"ran":7745,"ram":2669,"ral":2852,"rak":6278,"rab":2185,"rad":2745,"rac":1357,"rs 
":1186,"ror":2542,"ros":13308,"rot":1270,"rom":4552,"ron":3143,"rop":1359,"roz":4272,"rov":1366,"rod":2730,"roc":1647,"rol":2608,"rok":2835,"rog":1810,"rny":1765,"rna":1947,"rne":1337,"rni":1311,"nél":1390,"ném":3320,"nép":2342,"név":7964,"rma":4453,"rme":7442,"riá":1542,"nég":1166,"rla":1110,"rke":1521,"rje":2570,"rit":2732,"ris":3358,"ril":1221,"rik":5031,"rin":5644,"ria":3879,"ric":1848,"rid":1666,"rie":1106,"rif":2877,"rk ":1195,"rsé":1351,"rtá":1222,"rté":4693,"rul":1250,"rus":2059,"rva":1143,"rve":3364,"ry ":1179,"rsa":2647,"rse":2444,"rsz":11249,"rta":3812,"rtj":2461,"rto":16676,"rte":3860,"rti":1302,"óba":1208,"rmé":1800,"rmá":4220,"rt ":8024,"rre":1962,"rra":1492,"sab":1704,"sai":1775,"sak":2444,"sal":14426,"óma":1626,"sba":3589,"sap":1951,"san":2277,"ójá":1365,"sat":1742,"sas":1771,"sa ":7871,"óko":1362,"ók ":2577,"ól ":13016,"ója":2231,"rvá":1234,"rze":1233,"ógi":2210,"ói ":1956,"növ":1656,"rzs":1300,"rvé":1214,"si ":8670,"sid":1199,"sil":1988,"sik":1599,"rző":1316,"se ":7438,"ser":4397,"set":1567,"seb":2714,"sen":5233,"sem":1195,"sel":3187,"sek":2815,"spo":1141,"spa":1306,"sol":2274,"son":4142,"sop":2367,"sor":6863,"sod":2739,"sok":5478,"st ":5212,"sle":1305,"sko":1582,"ska":1477,"ske":1576,"sme":3505,"ssé":2608,"ssá":1740,"sz ":8870,"lőt":1770,"lős":2401,"stá":1316,"sza":14173,"svá":1185,"sze":35971,"szo":9131,"szt":27632,"szu":2442,"szi":9598,"szl":6370,"szk":3277,"szn":4368,"sse":3446,"ssa":2570,"ssz":9449,"ste":4883,"sta":3478,"sto":1355,"sti":2035,"str":2018,"sug":1404,"lő ":3629,"sré":1266,"tai":1258,"tak":4038,"tal":17363,"tag":2404,"tbe":3293,"tba":1669,"tat":5289,"tar":19006,"tan":4747,"te ":9032,"ta ":15756,"szá":22866,"szü":3008,"szé":7434,"szí":4720,"szö":3515,"szó":5026,"ozó":12847,"ozá":2383,"kúa":5586,"pa ":1509,"pcs":1390,"par":3913,"pat":2664,"kül":3938,"pai":1203,"pap":1164,"pan":2344,"pha":1287,"láb":2286,"ped":1503,"lád":12468,"lán":2463,"pen":1323,"per":4376,"lát":1961,"pes":3470,"lás":5210,"lág":4159,"pel":1327,"lál":4812,"plo":1196,"pia":1507,"por":4386,"pos":1197,"pon":4538,"pol":3552,"lét":2548,"lés":5210,"lén":1237,"lék":10125,"lég":1202,"pjá":1227,"psz":1291,"plő":1147,"ló ":6853,"pte":2465,"lít":2551,"pri":1456,"pre":1161,"pro":3235,"lós":1760,"lóg":2572,"pus":1929,"lód":1195,"lön":2431,"lül":5325,"már":3617,"más":7074,"mán":10674,"máj":1466,"még":1185,"mét":1869,"mér":2829,"més":1202,"mél":2519,"mén":4256,"mít":1327,"mód":1366,"ra ":14404,"ngo":3206,"ni ":7437,"nge":3949,"ngy":1650,"jáh":2978,"neg":1502,"nel":1635,"nek":17344,"ják":1953,"ján":4452,"nem":11176,"jár":7148,"ner":1420,"net":3267,"ját":5567,"nes":2093,"nev":9590,"ng ":3420,"jáb":8030,"ncs":2007,"nci":3666,"nce":2318,"ne ":3513,"ndr":1401,"nds":3385,"ndo":2353,"ndj":9787,"ndi":2220,"nde":6999,"nda":1524,"nak":21124,"nal":2777,"nap":2250,"nae":1236,"nag":5845,"nai":2208,"nd ":4041,"nba":1457,"nat":2233,"na ":6605,"mze":3907,"iós":1792,"iój":1245,"nyb":1400,"nya":6957,"nye":12400,"nyi":4978,"nté":5969,"ny ":9196,"nul":2608,"nus":1717,"nty":1140,"nto":3218,"nti":4462,"ntj":1553,"nta":2379,"nte":5991,"nsz":1210,"nse":1158,"nt ":15590,"ns ":1607,"nká":1105,"nok":4021,"nop":1219,"nos":3762,"nov":1101,"nne":2064,"nna":2413,"nni":1136,"nny":1741,"niá":1146,"jéb":6198,"jén":1719,"jéh":3083,"nle":1203,"no ":1255,"nka":1134,"nid":1544,"nic":1428,"nia":2714,"nk ":1863,"nis":2046,"nik":2308,"ogr":1697,"ogl":2068,"oga":3293,"ogy":4073,"ok ":16218,"ol ":3707,"ock":1291,"ode":1450,"odi":3096,"odo":1217,"of 
":1189,"oda":2951,"kál":1176,"kán":1252,"káb":1589,"nyí":1276,"obb":3582,"nyo":6087,"nyv":3444,"nyu":2805,"nyt":1370,"nys":1658,"ntő":1470,"jú ":1653,"nyá":1816,"nyé":1280,"oz ":6136,"osí":1185,"köv":1585,"köt":1840,"ozt":1266,"köz":23037,"kön":3656,"köl":1449,"kör":4099,"ozo":1583,"köd":1357,"ová":8369,"ozi":2582,"oza":5292,"ott":13732,"oto":1925,"osz":17179,"ost":2095,"ítő":1100,"oss":3713,"oso":1195,"orú":1625,"osá":2471,"ovi":1122,"ova":1834,"ove":1321,"orá":3688,"íté":2953,"opo":3410,"olá":1371,"ító":2222,"os ":17347,"oló":3495,"opt":1890,"or ":6588,"ítá":3043,"orm":10619,"orn":2184,"oro":8033,"orr":2134,"ord":2929,"oná":1361,"ore":1109,"org":1652,"ori":4752,"osa":4828,"ort":5009,"ors":11126,"orv":1840,"omá":8665,"ot ":4277,"orb":1551,"ora":2591,"ízi":1186,"íto":2971,"ola":3942,"old":2725,"olc":1133,"on ":14829,"oli":4956,"oll":1909,"olg":2986,"ols":1101,"olt":11581,"oln":1426,"olo":2069,"oly":9282,"olu":1943,"okb":1937,"oka":3243,"om ":4999,"okk":1292,"okr":1131,"oks":2384,"íte":4142,"oko":2728,"ív ":1366,"okt":1201,"író":2841,"ona":4624,"ond":2206,"one":1152,"ong":1637,"oni":3479,"ono":2799,"ons":1750,"ont":8687,"ony":4114,"oma":1728,"omb":2831,"omi":1564,"omm":1333,"kék":1183,"kép":5158,"kén":3827,"írá":2059,"omo":2114,"két":4231,"kés":3743,"ímű":2569,"la ":5399,"le ":4396,"lcs":2466,"írt":1125,"lda":1565,"ldi":1183,"lab":1923,"lad":1437,"lag":3296,"laj":1625,"lal":2932,"lak":15098,"lan":5737,"lam":7155,"lap":6224,"lat":10133,"las":4338,"ld ":2188,"lbu":2257,"dő ":2164,"kus":4984,"kul":4809,"ksá":2221,"ksz":1790,"gó ":1820,"gör":2727,"llí":1578,"llá":2364,"lló":2888,"lon":1455,"lom":5872,"lor":1462,"log":1806,"los":2605,"lot":1131,"lov":8327,"lne":1797,"ljá":1491,"lmi":2811,"lme":1597,"lma":4921,"lna":1780,"lto":3300,"lsz":1378,"lta":8440,"lte":3310,"lu ":1887,"lmé":1286,"lre":1816,"lt ":21002,"ldá":1177,"lha":4212,"li ":5160,"ház":2853,"lev":1937,"les":5249,"let":23881,"hár":2058,"lep":3720,"lem":6723,"len":17272,"lek":3977,"lel":1659,"leh":1502,"leg":14221,"háb":1477,"lla":10657,"lle":8630,"lli":2652,"llo":1404,"lko":4418,"lka":2131,"lke":2520,"lgá":2219,"lje":1805,"ll ":3110,"lja":2062,"lit":2107,"lis":6935,"lin":2766,"lim":1567,"lid":1319,"lia":3789,"lik":2243,"lyó":1710,"ma ":5513,"mai":2750,"maj":1342,"mad":7309,"mag":9145,"mar":2289,"mas":1406,"mal":2064,"man":2348,"maz":4614,"mat":4172,"mba":2545,"mbe":4662,"me ":1765,"meg":18261,"iáb":6342,"met":6641,"mes":7852,"mer":9128,"mel":20048,"men":4193,"mek":1223,"mez":3626,"lva":1622,"lve":3172,"lul":1401,"lus":1938,"ly ":11120,"lvt":1957,"ltá":1280,"lsó":1363,"lya":5925,"lyb":1133,"lyi":2296,"lye":12093,"lté":1277,"lvá":1133,"lyo":1466,"lyn":2379,"lys":1166,"lyt":1183,"lső":6342,"lyá":9265,"mpi":1413,"mpl":1595,"mon":2735,"mok":3026,"mos":2616,"mot":1595,"moz":1966,"ió ":2881,"mus":1737,"mut":1302,"mun":2084,"mi ":6215,"min":10577,"mil":1543,"mit":2000,"mia":1836,"mik":1748,"mma":1863,"tő ":4350,"től":2706,"tős":1505,"tőn":1924,"sű ":1297,"zná":3356,"zt ":3494,"víz":2022,"zte":5806,"zti":1839,"zta":2495,"ztr":2410,"zto":1911,"zsi":1572,"ső ":5513,"zul":1547,"ztá":11872,"zté":2691,"rű ":1401,"zsé":5276,"zza":1374,"zga":2049,"zi ":4058,"zhe":2186,"zet":19059,"zes":1480,"zen":10515,"ván":2981,"zem":3342,"zel":4337,"vál":6881,"zek":2284,"vák":7162,"vár":8676,"zer":19180,"ze 
":4791,"zda":1164,"zab":2585,"zad":1388,"zak":6902,"zal":1294,"zat":8108,"zot":2710,"zor":2657,"zom":1281,"zon":5431,"zok":2301,"zol":2432,"zpo":1664,"vén":5253,"véd":2112,"vég":2913,"vét":1848,"vés":2023,"zió":1200,"zke":1119,"zle":1674,"zlo":7817,"zig":3735,"zin":2369,"zil":1212,"zik":7225,"zis":1133,"yve":2842,"yug":2984,"ysz":2663,"yok":1279,"yom":2043,"yol":1936,"yos":2298,"yob":2167,"za ":3143,"ről":1883,"ysé":2805,"ye ":3860,"yei":1140,"yek":4530,"yed":1362,"yes":5416,"yer":1837,"yen":2702,"yel":9386,"yez":2468,"yet":5968,"ya ":3038,"yag":1568,"ybe":1927,"yba":1297,"yar":8873,"yan":3850,"tül":2293,"yal":1350,"yak":3842,"yai":1283,"yhá":1178,"yne":2282,"yi ":5728,"yik":7147,"tív":1267,"téz":2544,"tér":3350,"tét":2097,"tés":9077,"tén":6927,"tél":1509,"ték":6622,"töb":3419,"tör":6044,"tó ":7302,"tól":4069,"tán":3887,"tár":7602,"tás":11209,"táv":1125,"ták":3376,"tál":11084,"táj":1226,"sú ":1586,"sül":2209,"só ":1339,"sök":1579,"ség":20101,"sén":1506,"sér":1918,"sét":1189,"sít":5115,"sár":2700,"sát":1861,"sán":1607,"ság":11788,"sáb":4560,"rög":2754,"rök":1818,"röv":1876,"rúg":1325,"rül":11029,"ró ":3203,"vtu":1747,"ról":2801,"róp":2151,"vir":1492,"vil":4652,"vid":4003,"vis":3046,"réb":2703,"rég":2911,"rés":10041,"vol":10664,"von":3245,"vi ":1300,"vez":9822,"ver":7753,"rás":8381,"ves":3105,"vet":5802,"veg":1557,"rág":1665,"ven":5494,"rán":5049,"rál":4428,"vel":6375,"vek":3167,"ráb":2450,"ve ":7296,"val":9128,"van":3020,"var":2245,"vat":2280,"vas":2577,"vad":1432,"vag":11793,"va ":3234,"műv":2529,"műk":1274,"nős":2336,"utó":1685,"utá":2191,"női":1248,"nő ":1717,"mű ":3511,"uró":2193,"usz":5860,"ust":1128,"uta":3446,"uto":1342,"us ":17567,"ura":2400,"urg":1418,"uri":1391,"ulá":1430,"uma":1476,"umb":1113,"unk":2047,"uni":1375,"um ":4397,"ult":3281,"ula":2897,"uk ":1640,"ul ":10317,"uga":4002,"uda":1985,"udo":3577,"pül":4756,"tvá":1162,"trá":1500,"tve":3654,"tur":1775,"tus":2989,"tul":1430,"tum":1120,"tud":4114,"tté":1199,"ttá":1546,"tsé":3581,"pít":3633,"tre":3344,"tt ":30966,"tra":4155,"tri":3477,"tro":2690,"tsz":2410,"tta":4727,"tte":6235,"tti":1842,"to ":1211,"tjá":2955,"tos":4461,"tot":7575,"tkö":1448,"toz":15230,"tom":3113,"ton":3995,"tok":4029,"tol":3082,"tor":5700,"til":2521,"tik":5482,"tis":1923,"tin":5413,"tio":2102,"tiz":1109,"tja":3013,"tke":1731,"tla":1565,"tle":4022,"tem":4106,"pán":1462,"ten":6944,"tei":1486,"tek":7221,"pál":1428,"tel":10568,"teg":2075,"th ":1102,"tet":11629,"tes":7803,"ter":19863,"pár":2273,"ti ":14986,"the":2658,"tha":1854,"ző ":4019,"zöt":4004,"zöv":1986,"zör":1244,"zül":3298,"zás":4765,"zár":4252,"zám":6211,"zál":1821,"zág":9532,"záz":1557,"yüt":2463,"zó ":13810,"vű ":1466,"zít":1657,"zín":2823,"zép":2406,"zér":1246,"zés":6463,"zén":2337,"zél":1670,"zék":2328,"ülö":2566,"ütt":2514,"ülő":1827,"yáb":1633,"yán":7191,"yár":2315,"tű ":2810,"vő ":1785,"yó ":1102,"yéb":2439},"n_words":[10929783,12338513,8457220],"name":"hu"}
\ No newline at end of file
+++ /dev/null
-{"freq":{"D":29898,"E":10468,"F":12839,"G":16234,"A":46756,"B":45618,"C":22345,"L":19113,"M":41589,"N":19093,"O":9096,"H":16882,"I":48750,"J":24390,"K":57099,"U":11179,"T":39498,"W":10090,"V":6089,"Q":1321,"P":55685,"S":65231,"R":21435,"Y":7072,"X":1468,"Z":2501,"f":36925,"g":365165,"d":485064,"e":840847,"b":246328,"c":70473,"a":2100570,"n":996656,"o":305761,"l":430448,"m":362974,"j":64260,"k":378208,"h":280863,"i":864649,"w":50622,"v":31442,"u":508685,"t":522270,"s":482635,"r":589018,"q":2068,"p":245252,"z":10934,"y":153007,"x":3585,"é":2192," l":29162," m":98895," n":17327," o":22384," h":14754," i":38006," j":15556," k":88933," d":239066," e":8087," f":10185," g":12086," a":116958," b":88325," c":8643," y":62811," z":1713," u":21958," t":81597," w":11107," v":3008," p":104408," s":135289," r":13775," J":23985," K":55803," H":16124," I":42065," N":16611," O":7318," L":17821," M":38172," B":43424," C":19640," A":41294," F":11657," G":14895," D":28065," E":8736," Z":2373," Y":6861," S":61440," R":19599," Q":1212," P":52451," W":9492," V":4735," U":10169," T":36300,"ا":1474,"A ":3175,"Da":7353,"Co":3999,"Ce":1371,"Ch":3483,"Ci":2915,"Du":2494,"Do":1602,"De":6798,"Di":6852,"Fe":1983,"Fa":1382,"Er":1543,"Ge":3292,"Ga":2686,"I ":4993,"Fr":1829,"Fo":1312,"Fi":2806,"C ":1897,"Au":2055,"Ar":4506,"As":2577,"D ":1256,"Ba":19721,"Ag":2188,"Ab":1488,"Ac":1714,"Ad":1447,"Am":5332,"An":4093,"Ap":1629,"Al":4784,"Bu":4215,"Br":3076,"Ca":4296,"Bi":2492,"Be":8472,"Bo":3000,"Ku":3549,"Kr":1685,"Ko":9825,"Le":3207,"Li":3131,"La":6311,"Lu":1669,"Lo":2427,"Me":8886,"Mi":3917,"Ma":15663,"Mu":4166,"Mo":3510,"Ni":1382,"Ne":3573,"Na":5130,"P ":1286,"Nu":1142,"No":3476,"Ok":1174,"Gr":2085,"Go":1818,"Gu":2419,"Ha":5613,"He":2039,"II":2521,"Hi":2733,"Ho":2454,"Hu":2080,"Ib":1304,"Ia":3224,"In":24134,"Is":3276,"It":2124,"Ir":1207,"Ja":10872,"Je":6536,"Jo":2100,"Ju":3132,"Ka":22425,"M ":1852,"Ki":1891,"Ke":13666,"Ut":2746,"Un":3136,"Tu":2495,"Tr":1946,"To":2609,"Th":2978,"Ti":8403,"Te":9113,"Ta":6615,"V ":1165,"Sy":1338,"St":3734,"Su":10264,"Wo":1285,"Wi":2707,"Wa":3162,"We":1219,"Vi":1573,"Pu":4020,"Pr":10851,"S ":1994,"Pe":15767,"Pa":12458,"Po":3505,"Pi":2342,"Or":1349,"Se":18572,"Si":5355,"Sh":2006,"Sp":1564,"So":3044,"Ru":2528,"Sa":8331,"Re":4622,"Ri":1937,"SM":1268,"Ro":3385,"T ":1420,"Ra":5456,"b ":5402,"a ":312256,"Ya":1618,"Yo":2051,"Yu":2161,"i ":261146,"ge":17252,"ga":100134,"fi":10254,"fr":1949,"fu":1399,"ft":1686,"fo":4136,"he":8947,"ha":50757,"gn":2069,"gl":1807,"gk":15524,"gi":27056,"gh":4457,"gg":24987,"gu":19428,"gs":5762,"gr":8256,"go":7739,"du":22038,"dy":1437,"g ":127207,"ea":8168,"eb":53877,"ec":18556,"ed":14792,"de":45859,"dd":1120,"di":143405,"do":23309,"ds":1430,"dr":3663,"ew":5523,"eu":3102,"ev":3178,"ey":2387,"fa":4277,"h ":162608,"fe":4040,"eh":16895,"eg":14415,"ef":2165,"ee":3361,"el":59121,"ek":27553,"ej":8227,"ei":5982,"ep":21232,"eo":11093,"en":149316,"em":59371,"et":34760,"es":60149,"er":212475,"ca":27397,"e ":42795,"br":3158,"bu":59966,"bo":6681,"bl":3420,"bi":20522,"be":67748,"da":219838,"f ":6725,"cu":3577,"ct":1798,"co":4648,"ck":2674,"ci":8787,"ch":7724,"ce":8123,"c ":1919,"az":1822,"ay":22914,"ba":74854,"d 
":16186,"at":148965,"as":93999,"ar":156214,"aw":19383,"av":2545,"au":33011,"ak":85647,"al":170145,"ai":50563,"aj":9897,"ap":26307,"am":87049,"an":494871,"ac":7825,"ad":111448,"aa":15819,"ab":26392,"ag":30377,"ah":179811,"ae":6490,"af":4082,"nu":12849,"nt":61715,"ns":21631,"no":11352,"nn":7780,"nz":1307,"ny":39684,"oe":1902,"of":3903,"oc":2742,"od":8904,"oa":2254,"ob":5836,"om":19983,"on":61017,"ok":11631,"ol":35874,"oi":1595,"og":9163,"oh":4041,"ot":23779,"os":12530,"ov":13798,"ou":5529,"op":9093,"oo":2776,"or":39625,"r ":75373,"ow":2582,"oy":1610,"pe":68224,"pa":90070,"pl":2952,"po":12602,"ph":1712,"pi":13857,"lo":16018,"ln":1524,"lm":6776,"ll":7645,"ls":1781,"lu":24730,"lt":3200,"ly":1874,"o ":23320,"ma":94460,"mb":26120,"me":93044,"ml":1630,"mi":26777,"mn":2215,"mm":2111,"mp":24012,"mo":8478,"mu":28574,"p ":11473,"na":77075,"nc":11437,"nd":56574,"ne":39080,"nf":2387,"ng":238792,"ni":56068,"nj":13796,"nk":3871,"ju":15959,"jo":2256,"ki":23132,"kh":4590,"ke":56774,"ka":127350,"m ":50860,"ky":1397,"ks":9835,"kt":10395,"ku":22809,"ko":27878,"kr":3523,"kk":1592,"kl":3754,"km":2765,"kn":3265,"li":60829,"lk":2913,"le":44927,"ld":2217,"la":192799,"lb":1849,"n ":334938,"hr":1702,"ht":1424,"hu":19954,"hk":1872,"hi":19755,"hn":2129,"ho":5306,"hl":1668,"id":19549,"ic":8689,"ib":13081,"ia":81624,"ih":9019,"ig":10843,"if":6720,"ie":7742,"k ":77693,"ir":31102,"is":68271,"it":40678,"iu":5028,"iv":4491,"iw":2077,"ij":2865,"ik":61148,"il":48904,"im":30225,"in":117362,"io":16681,"ip":11977,"je":5678,"ji":4743,"iz":1367,"iy":1200,"l ":56560,"ja":34513,"z ":1416,"wi":10306,"wo":1229,"y ":11456,"wa":31269,"we":3398,"vi":15930,"vo":1681,"ve":8269,"va":3932,"x ":1658,"ui":4586,"uj":4080,"uk":44283,"ul":29290,"ue":2818,"uf":1241,"ug":8297,"uh":9753,"ur":39636,"us":36462,"ut":30979,"um":25849,"un":84334,"up":32836,"ty":2279,"tu":61107,"tt":3054,"ub":9153,"ua":56784,"ud":13238,"uc":2268,"w ":1917,"to":20692,"tn":2005,"tl":1402,"ts":2489,"tr":20074,"te":90779,"tk":2568,"ti":55141,"th":5635,"ta":176003,"su":25989,"ss":5417,"st":32797,"sy":2246,"sw":1219,"sl":3160,"sk":6158,"sn":2055,"sm":3761,"sp":3394,"so":6557,"sc":1962,"se":100046,"sh":4276,"si":103815,"u ":69102,"sa":100996,"rr":2177,"rs":13384,"rt":29454,"ru":41105,"rv":1161,"rw":1931,"ry":4130,"rp":3731,"ro":34136,"rn":12088,"rm":13211,"rl":14368,"rk":15615,"rj":5203,"ri":102805,"rh":3147,"rg":10415,"rf":1249,"re":30519,"rd":11856,"rc":2704,"rb":14366,"ra":143028,"t ":74952,"qu":1325,"s ":74416,"pt":3193,"pu":23123,"pr":13816,"ps":1371,"zi":2204,"za":3196,"ye":4611,"ya":121025,"yu":3182,"ys":1566,"yo":3138,"yi":2638,"一":3113," Ga":2665," Ge":3275," Fo":1303," Fr":1822," Fi":2802," Ha":5588," He":2029," Go":1803," Gr":2062," Gu":2405," Ib":1298," Ia":3220," Hu":2077," Ho":2444," II":1738," Hi":2728," Je":6511," Ja":10843," Ir":1206," Is":3262," It":2123," In":24109," Ka":22393," Ke":13629," Ki":1879," Jo":2086," Ju":3129," La":6277," Le":3183," Li":3097," Ko":9805," Kr":1679," Ku":3543," Ma":15603," Mi":3888," Me":8848," Lo":2414," Lu":1657," Ne":3562," Na":5110," Ni":1377," Mo":3486," Mu":4147," Ap":1628," Am":5319," An":4070," Al":4777," Ag":2185," Ac":1711," Ad":1437," Ab":1471," Ba":19679," Au":2049," As":2567," Ar":4490," Be":8434," Bi":2483," Bo":2979," Br":3065," Bu":4209," Ca":4213," Ce":1368," Ci":2897," Ch":3462," Co":3963," Da":7319," Di":6832," De":6775," Do":1546," Du":2487," Er":1536," Fe":1977," Fa":1366," Wo":1267," Wi":2689," We":1211," Wa":3147," Yu":2156," Yo":2043," Ya":1612," Or":1346," Po":3467," Pi":2336," Pe":15730," Pa":12421," 
Nu":1139," No":3465," Ok":1169," Ra":5409," SM":1160," Ro":3363," Re":4599," Ri":1930," Pr":10827," Pu":4013," Sy":1332," Su":10248," St":3681," Ta":6595," Th":2952," Ti":8388," Te":9090," Tr":1942," To":2586," Ru":2526," Sa":8312," Sh":1982," Si":5333," Se":18549," So":3026," Sp":1555," Vi":1551," Tu":2469," Un":3130," Ut":2746," ja":5269," je":2335," in":24841," il":1562," is":2175," it":1708," ka":17469," kh":1422," ki":3127," ke":41380," ju":6965," ha":6291," gr":1294," gu":1601," ib":1921," ia":3559," hi":4380," hu":2034," ne":5231," na":8812," mu":5887," mo":2739," ol":12623," of":1811," no":1685," le":5736," li":3289," la":14587," ku":3200," km":2201," kl":1852," ko":17290," me":68772," mi":4388," ma":16460," lu":3991," lo":1345," ag":1308," ab":1554," ad":60411," an":11749," ap":1720," ak":5080," al":4506," aw":1213," ar":2749," at":18641," as":3479," ba":29062," bi":8314," be":42050," bo":2101," bu":6066," ca":3227," en":1364," ek":1536," fo":1399," fi":5527," ge":4483," ga":3124," co":1499," ce":1281," da":93500," do":1270," de":28323," di":109723," du":4523," za":1414," ya":62479," ru":1822," sa":30188," se":78628," si":6482," sp":1184," so":1196," ra":4660," re":4944," ro":1171," pu":5843," pr":10257," or":4567," pe":52401," pa":26959," po":5413," pi":2294," wa":3549," wi":6744," tu":4514," ut":3268," um":2172," un":12469," ta":18998," st":3288," su":11745," tr":2305," to":2153," th":1170," ti":9886," te":42313,"Fil":1716,"Ger":1344,"II ":2070,"Han":1227,"Har":1306,"Ing":3276,"Int":1699,"Ind":17352,"Ia ":3146,"Ara":1730,"Aus":1437,"Bah":2306,"Bal":1493,"Ban":5008,"Bar":4958,"Bat":1605,"Agu":1534,"Ame":4085,"Ang":1356,"Ber":2116,"Ben":1538,"Bel":2649,"Bri":1173,"Des":2124,"Dal":2000,"Cha":1289,"Dia":1644,"Neg":1421,"Nov":1144,"Per":8357,"Pem":1634,"Pen":3260,"Pas":1289,"Par":2408,"Pad":2035,"Pan":1988,"Pul":1756,"Pro":7786,"Pre":1253,"Pol":1113,"Ita":1993,"Isl":1234,"Jan":1394,"Jak":1458,"Jep":2416,"Jer":2596,"Jaw":4820,"Jul":1161,"Kab":8059,"Kal":5302,"Kan":1250,"Kat":1399,"Kar":2399,"Ker":1724,"Kep":1438,"Kel":1127,"Kec":4407,"Kon":1421,"Kom":1142,"Kor":1136,"Kot":4453,"Lau":1252,"Men":2499,"Mer":1254,"Man":2093,"Mal":2386,"Mar":3752,"Mas":1454,"Mus":1132,"Yun":1506,"一一":1315,"Sur":1395,"Sum":1934,"Sul":1400,"Sun":2471,"Sta":2065,"Ten":4508,"Tan":2075,"Sin":1312,"Ser":4461,"Sep":1413,"Sel":5571,"Sem":1247,"Seb":1132,"Rus":1267,"Sam":1292,"San":2004,"Rep":1522,"Rom":1304,"Uni":2252,"Uta":2484,"Ter":1521,"The":1859,"Tim":5598,"bis":1615,"bit":1641,"bil":2430,"bin":3990,"bih":2863,"bli":2292,"bol":2158,"ban":15219,"bak":1477,"bal":2565,"bai":2045,"bag":18039,"bah":11866,"bad":2660,"baw":2021,"bat":5802,"bas":2119,"bar":6280,"beb":2207,"bed":1156,"ber":44372,"ben":5468,"bel":6123,"bek":1139,"bes":5623,"bia":3415,"bid":1129,"ca ":1339,"car":5372,"can":2310,"cam":11583,"ce ":2031,"bu ":2451,"bua":22699,"bup":9962,"bur":1967,"bul":1642,"buk":3601,"bun":4326,"bum":1823,"buh":2607,"but":6527,"aka":36392,"am ":27176,"aki":3783,"akh":2140,"al ":32136,"aja":7514,"aik":1989,"ain":9801,"air":2314,"ais":1591,"ait":2311,"ak ":26644,"ahk":1403,"ahi":5150,"ahu":13410,"aha":23711,"agi":8426,"agu":2010,"anu":3628,"any":11108,"ano":1219,"ann":4127,"ant":18067,"ans":3241,"ane":1770,"ang":133187,"ani":10218,"anj":5111,"ank":1863,"ap ":5145,"ana":17620,"anc":4861,"and":16448,"amu":2163,"amp":6554,"ami":2968,"ame":2230,"amb":4802,"ama":38387,"alu":3924,"alo":1291,"all":1612,"ali":20004,"ale":3013,"ala":101814,"an ":259764,"aks":3177,"aku":4198,"akt":4736,"aba":6181,"abi":1719,"abu":12017,"ae 
":1556,"aca":2365,"aan":12880,"aat":2166,"ad ":3820,"ab ":3122,"afi":1142,"ai ":28231,"aga":15818,"age":1445,"aer":3528,"ah ":131776,"adi":10057,"ade":2164,"ach":1161,"ace":1304,"ada":91074,"ayu":1260,"aya":17559,"ba ":1212,"at ":42706,"arg":2616,"are":4682,"ard":2694,"ara":50200,"aro":1477,"arn":2327,"arl":1195,"ark":4616,"ari":39797,"aru":4864,"ars":1240,"art":10017,"au ":21880,"asa":28749,"ary":2056,"asi":28272,"ase":1504,"ask":2058,"ar ":23543,"apa":12599,"api":3283,"apu":2004,"as ":18260,"aut":2942,"ay ":1252,"awa":16458,"awi":1328,"ata":52964,"asu":4212,"ast":4229,"ass":1429,"asy":1183,"atk":1321,"ato":2873,"ate":14253,"ati":9780,"aua":1165,"atu":20191,"aup":1527,"aus":1167,"jen":2753,"jad":5570,"jaa":2051,"jab":1150,"jar":5491,"jal":2507,"jak":2749,"jan":3425,"jo ":1155,"ito":1897,"itu":7300,"ism":1635,"isu":1798,"ist":11957,"ita":13383,"ite":4451,"iti":3900,"iwa":1676,"ium":1147,"iun":2068,"ivi":1396,"ive":2266,"ipu":1250,"ipi":1541,"is ":23018,"ion":11007,"ipa":2509,"ipe":2501,"ir ":10536,"irk":1299,"iri":9503,"isi":11065,"ise":5218,"isa":7947,"ire":1684,"ira":4533,"it ":5451,"ja ":5112,"kim":1495,"kil":1164,"kin":2100,"kir":1475,"kis":1425,"kit":4346,"km ":1129,"ki ":8986,"khi":1575,"keb":2854,"kec":8512,"ked":2012,"kek":1191,"kem":4126,"kel":7010,"ken":6683,"kep":3726,"kes":1631,"ker":5150,"ket":3912,"ke ":5970,"ksa":1691,"ku ":4682,"kot":9086,"kon":4219,"kom":5415,"kol":2400,"koh":1184,"ks ":1167,"kny":1410,"kka":1328,"ko ":1260,"kla":1828,"juk":1809,"jun":1767,"jum":1474,"jua":1961,"jug":4990,"kaw":1325,"kat":13174,"kar":11177,"kas":4851,"kap":2757,"kan":62967,"kal":5122,"kam":1705,"kai":2738,"kad":1254,"kab":2943,"ka ":12884,"ha ":2059,"ham":2420,"han":14832,"hak":1134,"hal":1973,"har":6037,"has":12725,"hat":1841,"haa":1690,"had":1765,"he ":2996,"her":1241,"hi ":1844,"hid":1896,"hin":3929,"hir":6378,"hka":1709,"go ":1252,"gku":1576,"gor":1284,"got":2192,"gsa":3115,"gu ":1706,"gra":2940,"gri":3408,"gur":1410,"gus":2168,"gun":9840,"iam":1372,"ial":6166,"ian":16893,"iap":1176,"ias":3614,"iat":1370,"ibi":1510,"ibu":4263,"id ":2050,"iba":3456,"ibe":2500,"ia ":44404,"ier":1178,"ies":1321,"ifi":1731,"ih ":5574,"ich":1544,"ie ":1390,"ica":1838,"idu":1993,"idi":3417,"ide":3199,"ida":6661,"if ":2797,"il ":7886,"ija":1503,"im ":3888,"ika":23689,"iga":3238,"igu":3825,"iha":2592,"ik ":17419,"imp":2775,"ime":1812,"imi":1994,"ind":4767,"ina":9693,"imu":7350,"inn":1767,"ino":1354,"int":9752,"ins":11171,"ine":2698,"ing":27890,"ini":22825,"ink":1199,"iny":3792,"iko":1474,"iki":7614,"ike":5441,"ila":14804,"in ":16335,"iku":2328,"ill":2052,"ilm":5734,"ili":13211,"ile":1187,"ima":9934,"io ":1907,"hny":1246,"hub":1262,"hun":11971,"hus":1117,"fat":1257,"eta":14851,"ete":4068,"eti":4646,"est":3135,"esu":1622,"ess":1214,"etu":1676,"evi":1298,"ey ":1761,"ewa":3433,"epe":3912,"er ":19709,"epa":9668,"eor":7717,"es ":8727,"ept":1302,"epu":3274,"erk":7438,"erl":11827,"eri":25792,"erj":4499,"erg":3829,"erh":2837,"ere":6493,"erc":1371,"erd":7020,"era":35580,"erb":12574,"et ":5262,"esi":21745,"ese":3800,"esa":15867,"eru":21470,"ert":16167,"ers":10880,"ern":7132,"erm":8938,"erp":2943,"ero":2183,"eki":2304,"ekn":1342,"eko":2677,"eks":2683,"ekt":2574,"eku":2083,"en ":21818,"ela":27699,"ele":4883,"eli":4636,"ell":1619,"elo":2885,"elu":8272,"emb":14950,"ema":7624,"eme":9102,"emo":1229,"emi":9424,"emu":4943,"emp":7479,"ene":6504,"eng":44211,"ena":14758,"end":12541,"enc":3844,"eni":5826,"enj":6735,"enu":4939,"ens":2961,"ent":14866,"eny":6205,"ege":2018,"egi":2598,"ehi":1472,"ek ":3218,"ein":1306,"eja":5845,"el 
":6722,"eke":1577,"eka":8278,"em ":3699,"gka":11656,"git":1200,"gin":2402,"gio":1293,"gia":7505,"gha":1865,"ggu":3578,"ggr":3634,"ggo":2263,"ggi":3715,"gga":11266,"gi ":10809,"gen":4341,"ger":4633,"gem":1548,"gel":2369,"ge ":1412,"gah":5405,"gai":13025,"gas":1879,"gar":10391,"gat":2080,"gam":3520,"gal":5628,"gan":35620,"gap":1320,"ga ":16084,"for":2821,"fil":3873,"fik":2255,"fis":1267,"da ":32556,"de ":5057,"dak":4551,"dal":74713,"dah":2729,"dae":2768,"dat":2016,"das":2265,"dar":32215,"dap":6287,"dan":52470,"dam":1256,"day":1876,"ch ":1540,"cer":1324,"cha":1339,"ck ":1406,"che":1327,"chi":1177,"cil":1660,"cis":2553,"ed ":1578,"eba":14092,"ebe":8142,"ebi":3337,"ebr":1132,"ebu":26049,"ea ":1655,"ei ":1901,"ega":7504,"eh ":13927,"edi":3728,"ede":1589,"eda":4173,"edu":2208,"eci":1667,"eca":14054,"dus":1247,"don":16622,"dok":1141,"dun":4005,"dup":1272,"dul":1434,"duk":5272,"dua":3278,"dud":2657,"dra":1669,"du ":1319,"did":3706,"dia":10670,"dib":6007,"der":4755,"des":7802,"del":1249,"dek":2240,"den":19332,"dem":1173,"di ":63731,"do ":1904,"dim":2715,"din":3800,"dio":1486,"dip":4666,"dir":8308,"dis":11623,"dit":5425,"dig":4149,"dik":8070,"dil":3331,"rha":2148,"rga":5059,"ri ":42938,"rge":1577,"ret":2887,"res":4698,"rg ":1147,"rea":2388,"rej":1691,"ren":4953,"rek":3031,"rda":4163,"rdi":3109,"re ":2800,"rbu":1584,"raw":1318,"rd ":2133,"rap":3154,"rar":1749,"ras":8094,"rat":11399,"rbi":1817,"rba":6850,"rbe":3317,"raj":2995,"rai":2568,"rah":9741,"rag":1442,"ran":36962,"ram":3611,"ral":3814,"rak":7284,"rab":2762,"raa":1221,"raf":1467,"rad":6359,"rs ":1283,"rpe":1250,"ros":2826,"rot":1395,"rom":1135,"ron":3571,"rop":2512,"rov":9832,"rod":2363,"rog":1358,"rny":1495,"rna":5736,"rmu":1260,"ro ":1992,"rma":9740,"rle":9267,"rla":2281,"rn ":1378,"rki":1226,"rke":4730,"rka":5975,"rja":3509,"rip":1275,"rio":1652,"rit":4490,"ris":9168,"ril":2736,"rik":14926,"rin":9510,"rim":1709,"ria":5936,"rib":1211,"ric":1356,"rid":1193,"rie":1621,"rk ":1641,"rya":1832,"ruh":2011,"rup":16158,"run":3050,"rum":2131,"ruk":1365,"rus":4704,"rut":3005,"ry ":1765,"rsi":3955,"rsa":2467,"rse":3931,"rta":11752,"rte":3710,"rti":7667,"rua":2111,"rtu":2198,"rt ":1478,"ru ":3013,"saa":3516,"sah":3766,"sai":1253,"sak":1265,"sal":14489,"sam":5305,"san":11536,"sat":14608,"sas":2791,"sar":11521,"saw":1247,"sa ":26535,"shi":1399,"si ":37996,"sid":1884,"sia":23497,"sit":3002,"siu":2036,"sir":1424,"sis":6271,"sin":5946,"sio":5312,"sil":3164,"sim":1558,"sik":5796,"sih":1154,"sif":1869,"se ":1750,"ser":7000,"ses":3161,"set":3675,"seh":1426,"sed":1550,"sec":2609,"seb":38986,"sep":5282,"seo":6619,"sen":4315,"sem":4262,"sel":6726,"sek":6032,"sej":4109,"spe":1482,"son":1660,"st ":1847,"ss ":1345,"sli":1139,"sla":1693,"ska":2806,"smi":1231,"sme":1407,"sya":1486,"ssa":1575,"ste":6115,"sta":5899,"sto":1411,"sti":6848,"stu":1610,"str":8471,"sua":5405,"sum":1276,"suk":5658,"sun":2892,"sut":1776,"sus":2249,"sur":1552,"tai":3534,"tak":12396,"tal":4630,"tah":16201,"tab":1684,"tau":16309,"tat":1881,"tas":11773,"tar":15390,"tap":2003,"tan":48487,"tam":7817,"te ":2719,"ta ":30047,"pa ":5126,"par":4662,"pat":19772,"pas":2458,"pad":19421,"pak":16990,"pal":3741,"pai":3130,"pan":11608,"pi ":3245,"pen":20640,"pem":8687,"per":28237,"pes":2547,"pel":2885,"pil":1386,"pin":3678,"pis":1169,"por":1652,"pop":1746,"pos":1337,"pon":1531,"pok":1819,"pol":2768,"pua":1330,"pub":1942,"pte":1271,"pri":2203,"pre":1413,"pro":9219,"pur":2238,"pus":1843,"put":3676,"pun":4895,"pul":5693,"ra ":33891,"ngo":1466,"ngi":5434,"ngk":15500,"ngu":4341,"ngs":5639,"ni 
":27725,"nge":5034,"ngg":24626,"ngh":3138,"nga":45848,"neg":5658,"nen":1469,"ner":3870,"net":1984,"nes":16997,"ng ":124434,"nci":3629,"nce":1815,"nca":3309,"ne ":3872,"ndu":6040,"ndr":1343,"ndo":17880,"ndi":8610,"nde":4365,"nda":13200,"ncu":1191,"nak":8053,"nal":12498,"nam":10027,"nan":12711,"nar":2327,"nah":2814,"nai":1146,"nd ":3909,"nat":2083,"nas":5218,"na ":14205,"nya":32061,"nye":2679,"nyi":2079,"nul":1194,"nun":2161,"nus":2178,"nur":1832,"nua":1507,"nto":2680,"ntu":18006,"ntr":1617,"nti":5215,"nta":22900,"nte":7428,"nst":1796,"nse":1390,"nsi":12613,"nt ":2342,"ns ":1443,"nol":1247,"nom":2318,"nny":5115,"no ":2934,"nka":1820,"nja":9105,"nju":2843,"nia":6702,"niv":1591,"nis":7959,"nit":2077,"nin":2190,"nik":2713,"ogr":1789,"ogi":3408,"ok ":3549,"ol ":2833,"ode":3528,"of ":1858,"odu":2149,"oh ":1516,"obe":1842,"ote":1332,"oto":2549,"ota":16038,"osi":2975,"ose":2201,"oso":1148,"ovi":10349,"ove":2391,"oun":1267,"ope":1909,"opa":1279,"os ":1902,"opu":1750,"or ":7039,"orm":2622,"oro":1416,"ord":1550,"ore":2472,"org":2231,"ori":3198,"ort":1972,"ora":11685,"ola":5593,"on ":12856,"oli":4593,"ole":13717,"olo":5388,"oka":1673,"om ":1677,"oko":1983,"ona":7292,"ond":2230,"one":17406,"ong":6397,"oni":2889,"ono":3029,"ons":2868,"ont":3055,"oma":3069,"ome":2094,"omb":1474,"omi":2626,"omp":4769,"omo":1445,"omu":1988,"la ":8248,"le ":2787,"lah":85384,"lag":1642,"lai":7338,"lal":2146,"lak":4119,"lan":18336,"lam":23788,"lap":1829,"lar":2452,"lat":9794,"las":5560,"law":2074,"lau":6152,"lay":9124,"kut":2035,"kus":1523,"kur":1949,"kup":1223,"kun":2321,"kum":1981,"kul":1651,"kuk":1717,"kte":1325,"ksi":4460,"kua":1980,"ktr":1281,"ktu":2650,"kti":1994,"kto":2651,"lok":1231,"lon":2362,"lom":2722,"log":4305,"lny":1353,"lmu":1446,"lua":4860,"lta":1301,"lu ":3267,"li ":7633,"lev":1141,"les":2673,"let":9284,"ler":1546,"lem":2195,"len":2677,"lek":2289,"leh":12733,"leb":3217,"lla":1957,"lle":1326,"lli":1227,"lka":1991,"lm ":4360,"ll ":1780,"lit":5426,"lis":7079,"lir":1515,"lip":1370,"lin":6938,"lim":6542,"lia":5362,"lik":11324,"lih":1781,"ma ":21495,"mah":2164,"mai":3887,"mak":3420,"mad":1850,"mar":3160,"mas":10730,"mal":1793,"man":22564,"mat":17173,"mba":10647,"mbi":2114,"mbe":6808,"me ":2693,"mbu":4891,"med":1539,"met":2294,"mes":1768,"mer":26378,"mem":15519,"mel":4982,"men":35186,"luk":2200,"lui":1193,"lun":1652,"lum":2455,"lus":1551,"lur":4662,"mpi":3325,"mpe":2325,"mpo":2760,"mpu":6558,"mod":1185,"mon":1666,"mor":1409,"mpa":7594,"mu ":1626,"mud":2039,"mua":1660,"mur":7325,"mus":3149,"muk":2006,"mul":2774,"mum":2269,"mun":4790,"mi ":4545,"min":4316,"mil":8646,"mis":1673,"mit":1227,"mia":1700,"mik":1662,"mla":1505,"mny":1911,"zam":1194,"yu ":1225,"ya ":33584,"yat":1924,"yar":2043,"yan":64564,"yak":4682,"yah":7474,"yai":2823,"yi ":1127,"wi ":1323,"wil":6167,"wa ":9713,"wan":5957,"wal":2130,"wak":2165,"wat":1765,"war":3553,"was":1762,"wah":1890,"vin":9845,"vis":1830,"ver":3522,"usi":7004,"use":1166,"usa":6560,"usu":2654,"ust":3983,"uti":2003,"ute":2083,"uta":7869,"utu":1851,"utr":2712,"us ":12050,"ut ":11892,"ura":9315,"uri":1909,"urk":1245,"uru":7051,"uny":2077,"upa":26147,"ur ":13684,"upu":1725,"ump":1843,"umu":2414,"umi":1414,"uml":1462,"umn":1617,"uma":4069,"umb":3461,"ume":1307,"unt":12867,"unu":2087,"uni":6942,"unc":1388,"und":1967,"una":10560,"ung":21765,"une":1335,"up ":3437,"uks":1606,"uku":6493,"uko":1586,"uki":1626,"um ":7458,"uka":7728,"uju":3230,"ulu":3880,"ult":1653,"uli":4656,"ula":12496,"un ":20290,"uk ":22268,"ul ":3263,"ui ":2114,"uga":6118,"uha":3495,"uda":4275,"udi":2630,"ubu":2471,"uh 
":4643,"udu":3551,"ua ":5771,"uat":8034,"uas":4449,"uar":6223,"ual":1917,"uan":9121,"ubl":2137,"uba":1945,"uah":19677,"ty ":1866,"tur":4927,"tus":3183,"tuj":1345,"tul":1791,"tuk":15174,"tun":2681,"tum":1823,"tua":3005,"ts ":1124,"tra":8844,"tri":6158,"tru":1229,"tro":2826,"tu ":22458,"to ":3427,"tny":1162,"tob":1155,"ton":2911,"tok":1669,"tol":2048,"tor":4842,"til":2355,"tik":7507,"tif":2658,"tig":1707,"tit":1315,"tis":2474,"tin":8673,"tim":3584,"tio":3164,"tia":2861,"tid":3087,"tiv":1124,"tka":2458,"tem":9574,"ten":16531,"tek":2322,"tel":6089,"th ":1540,"tet":1414,"tes":1167,"ter":45895,"ti ":10357,"the":1811},"n_words":[11077227,12709440,9643042],"name":"id"}
\ No newline at end of file
+++ /dev/null
-{"freq":{"D":94947,"E":67243,"F":92398,"G":104535,"A":191905,"B":119758,"C":221153,"L":203562,"M":160653,"N":84931,"O":53123,"H":49392,"I":191375,"J":30427,"K":33649,"U":59673,"T":99599,"W":33788,"V":71164,"Q":12325,"P":151391,"S":231227,"R":109065,"Y":11206,"X":16332,"Z":10278,"f":518611,"g":920085,"d":2363397,"e":6056669,"b":494596,"c":2150346,"a":6155041,"n":4141668,"o":4513622,"l":3766393,"m":1338197,"j":22860,"k":110896,"h":452585,"i":5976536,"w":49843,"v":585518,"u":1754682,"t":3725316,"s":2419415,"r":3264659,"q":101625,"p":1281861,"z":475996,"y":114701,"x":30592,"È":18809,"ì":9651,"é":22484,"è":321778,"à":93588,"ù":30198,"ò":27447,"ó":6194," l":332142," m":240472," n":389032," o":184207," h":31309," i":549255," k":13980," d":1574321," e":446281," f":277424," g":138336," a":703335," b":75718," c":722915," z":6698," u":458470," t":223758," v":125009," q":64842," p":570240," s":706865," r":244819," J":29715," K":31685," H":46792," I":159850," N":78119," O":44164," L":197779," M":152789," B":113710," C":207226," A":159909," F":86857," G":100079," D":87417," E":59714," Z":9584," Y":10426," X":11662," S":213852," R":103083," Q":11829," P":141839," W":31372," V":61312," U":56839," T":91377," è":311053," È":18792,"A ":27808,"Da":15987,"Cu":5620,"Cl":7833,"Co":63300,"Cr":10676,"Ce":10902,"Ch":29704,"Ci":11685,"Du":6024,"Do":13966,"De":15564,"Di":21598,"Fe":11756,"Fa":10362,"Eu":7860,"Es":8695,"En":5820,"El":6080,"Ge":15850,"Ga":14081,"I ":31564,"Fu":8170,"Fr":19816,"Fo":12748,"Fi":15192,"C ":10682,"Au":9085,"Ar":21848,"As":8912,"D ":5691,"Ba":27783,"Am":10014,"An":18633,"Al":34245,"Bu":8230,"Br":18440,"Ca":56504,"Bi":8237,"Be":19580,"Bo":19802,"Le":25820,"Li":19778,"La":88640,"Lu":9671,"Lo":25268,"Me":19948,"Mi":23733,"Ma":59219,"Mu":9761,"Mo":30287,"Ni":7514,"Ne":23857,"Na":17893,"No":19720,"Gi":22185,"Gl":5829,"Gr":18092,"Go":9531,"Gu":8537,"Ha":17031,"He":8149,"II":13036,"Ho":8979,"In":30727,"Il":66084,"Is":9021,"It":12730,"Ja":8255,"L ":27703,"Jo":9534,"Ka":7944,"Un":28071,"Tr":15672,"To":14627,"Th":17010,"Ti":6315,"Te":15797,"Ta":10754,"UA":13360,"V ":7192,"St":32757,"Su":12977,"Wi":7930,"Wa":7819,"Vo":7254,"Vi":20021,"Va":13663,"Ve":14211,"Pu":5612,"Pr":25455,"S ":9118,"Pe":16282,"Pa":38366,"Po":20730,"Pi":22494,"Or":10767,"Se":20797,"Sc":24538,"Si":21736,"Sh":7322,"Sp":9077,"So":16973,"Ru":7444,"Sa":46373,"Re":30166,"Ri":14281,"Ro":32671,"Qu":10556,"Ra":11841,"b ":11012,"a ":2254795,"i ":1483483,"ge":88624,"ga":67666,"fl":8575,"ff":35291,"fi":134642,"fr":78002,"fu":41990,"fo":66122,"he":195889,"ha":63814,"gn":70158,"gl":120822,"gi":234393,"gh":24919,"gg":70910,"gu":55229,"gr":73387,"go":71768,"du":52294,"g ":29202,"ea":91863,"eb":24191,"ec":137759,"ed":128701,"de":782039,"dd":10103,"di":854362,"do":146993,"ds":9108,"dr":32135,"ew":7817,"ex":7461,"eu":18301,"ev":52077,"ey":12656,"ez":26509,"fa":71893,"h ":27064,"fe":57833,"eg":172770,"ef":19354,"ee":16955,"el":899715,"ei":89786,"ep":27874,"eo":41068,"en":625515,"em":132418,"et":253878,"es":434221,"er":718824,"eq":6537,"ca":399283,"e ":2094441,"br":59095,"bu":40562,"bo":29867,"bl":42370,"bi":141440,"bb":48744,"be":43765,"da":306546,"f ":15395,"cu":62362,"ct":11024,"cq":7195,"cr":59094,"co":609419,"ck":21526,"cl":55423,"ci":319342,"ch":241963,"ce":223557,"cc":104089,"c ":16295,"az":131617,"ay":15144,"ba":66599,"d 
":145378,"at":664795,"as":217940,"ar":494405,"av":77540,"au":53901,"ak":11813,"al":679169,"ai":64110,"ap":84421,"am":172413,"an":700812,"ac":102427,"ad":97495,"ab":100158,"ag":138596,"ah":7533,"ae":26244,"af":30789,"nu":38735,"nt":634979,"ns":87660,"nq":7056,"no":374486,"nn":92022,"nz":76063,"ny":7751,"nv":12211,"oe":12640,"of":30817,"oc":106098,"od":71766,"oa":9353,"ob":27798,"om":291133,"on":807692,"ok":6829,"ol":302572,"oi":49498,"og":89760,"oh":6209,"ot":113582,"os":173480,"ov":109964,"ou":37230,"op":117448,"oo":15155,"or":452213,"r ":177153,"ow":12656,"oz":6935,"oy":5914,"pe":242170,"pa":234524,"pl":24851,"po":213425,"ph":7578,"pi":128997,"lo":210541,"lm":44463,"ll":700194,"ls":12139,"lp":10018,"lv":11056,"lu":64908,"lt":90822,"ly":7335,"o ":1645460,"ma":263312,"mb":52388,"me":350394,"mi":170562,"mm":42029,"mp":106005,"mo":153835,"mu":105465,"p ":14900,"iù":27078,"na":431794,"nc":198192,"nd":220346,"ne":784634,"nf":25150,"ng":109071,"ni":335162,"nk":7990,"ki":12045,"ke":15323,"ka":11574,"m ":76534,"ko":6777,"km":10394,"li":478088,"le":450199,"ld":17670,"lg":10033,"lf":7624,"la":677771,"lc":22009,"lb":26576,"n ":671873,"hr":6559,"ht":9126,"hu":11107,"hi":93831,"hn":5725,"ho":18964,"id":103779,"ic":450726,"ib":38087,"ia":461965,"ig":105629,"if":52751,"ie":183937,"k ":32183,"ir":101293,"is":299704,"it":487446,"iu":44781,"iv":129892,"ik":7413,"il":319428,"im":199167,"in":697560,"io":541230,"ip":107291,"iz":100179,"l ":908043,"ja":6741,"z ":9054,"tà":85596,"wi":6023,"vv":10000,"y ":65822,"wa":12380,"we":6401,"vi":166809,"vo":78376,"uz":20664,"ve":175153,"va":133125,"x ":19811,"ui":79181,"ul":72491,"ue":88946,"uf":10300,"ug":23193,"ur":134766,"us":100555,"ut":118876,"um":71437,"un":561799,"uo":58100,"up":47527,"ty":10365,"tu":164890,"tt":381279,"ub":48232,"ua":156844,"ud":43002,"uc":34894,"w ":10150,"to":743121,"tl":8532,"ts":9120,"tr":295588,"te":580573,"ti":577841,"th":33664,"ta":698330,"su":137066,"sv":16162,"ss":213977,"st":447357,"sl":9152,"sk":10657,"sm":16619,"sp":87152,"so":218207,"sc":157523,"sf":9364,"se":363584,"sh":16912,"si":401810,"rz":16932,"u ":59520,"sa":125213,"rr":72120,"rs":82412,"rt":207644,"ru":63955,"rv":21724,"ry":12041,"rp":15121,"ro":352219,"rn":64837,"rm":75961,"rl":22967,"rk":10623,"ri":614338,"rg":44544,"rf":10278,"re":622304,"rd":75640,"rc":66009,"rb":30426,"ra":583297,"t ":103375,"qu":99430,"s ":156468,"pu":55098,"pp":84071,"pr":255723,"ps":6191,"zz":76375,"zi":221598,"ze":20937,"za":99515,"zo":37143,"ya":7485,"È ":18782,"à ":92132,"ò ":26518,"ì ":8878,"é ":9885,"è ":314074,"ù ":29830," Ga":14003," Ge":15739," I ":16475," Fo":12451," Fu":8160," Fr":19775," Fi":15103," Ha":16981," He":8121," Go":9484," Gr":17959," Gu":8468," Gi":22095," Gl":5798," Ho":8931," L ":24925," Ja":8214," Is":8984," It":12700," In":30290," Il":65842," Ka":7887," Jo":9477," La":88189," Le":25663," Li":19618," Ma":58913," Mi":23636," Me":19838," Lo":25207," Lu":9633," Ne":23724," Na":17829," Ni":7466," Mo":30189," Mu":9681," A ":5979," Am":9971," An":18493," Al":34065," Ba":27690," Au":9040," As":8840," Ar":21718," Be":19500," Bi":8121," Bo":19624," Br":18373," Bu":8165," Ca":56136," Ce":10868," Ci":11600," Ch":29611," Cl":7713," Cr":10596," Co":62928," Da":15272," Di":21504," De":15426," Do":13718," Du":5966," El":6042," Es":8654," En":5749," Eu":7835," Fe":11726," Fa":10272," Wi":7870," Wa":7761," a ":110030," Or":10710," Po":20608," Pi":22451," Pe":16220," Pa":38161," No":19629," Ra":11758," Qu":10458," Ro":32569," Re":30042," Ri":14247," Pr":25353," Su":12949," St":32098," 
Ta":10678," UA":13306," Th":16944," Ti":6253," Te":15675," Tr":15562," To":14480," Ru":7431," Sa":46303," Sh":7238," Si":21606," Sc":24393," Se":20659," So":16887," Sp":9003," Va":13628," Ve":14047," Vi":19885," Vo":7231," Un":27982," l ":52713," im":22658," in":295683," il":151179," is":14605," it":18185," ha":23853," gi":32407," gl":14007," gr":40025," go":5702," gu":9712," id":6931," ne":275511," na":25351," mu":20050," mo":56127," ol":7856," om":7514," og":8277," oc":9930," of":10734," nu":14604," no":68303," le":52423," li":35936," la":143866," km":9911," me":51316," mi":29028," o ":42065," ma":81024," lu":16900," lo":28809," af":7222," ag":11338," ab":64448," ac":16193," ad":25196," am":20786," an":81988," ap":29511," ai":8950," al":183776," av":17618," au":20712," ar":36060," at":28661," as":31346," d ":20488," ba":32317," bi":8838," be":7889," bo":6391," br":13522," ca":114268," e ":219384," er":25454," et":12895," es":52932," en":13079," ep":5911," el":14730," fe":18710," fa":56978," fu":35844," fr":63957," fo":40785," fi":57936," ge":26299," ga":8284," i ":33110," cl":12326," co":356836," cr":19933," ce":24853," ch":106561," ci":55235," da":211997," cu":28570," do":30811," de":613533," di":670575," ec":32925," ed":39893," du":23486," ru":12186," sa":17824," se":114506," sc":42709," si":147221," sp":37866," so":80656," qu":64678," ra":31475," re":103663," ri":75848," ro":21093," pu":36089," pr":197319," os":7051," ot":10340," ov":5976," op":14803," or":49602," pe":114303," pa":91709," po":66694," pi":52800," va":18746," ve":35685," vo":17500," vi":51221," tu":14428," us":12554," ut":9880," un":408903," ul":5818," ta":13154," st":115114," sv":15397," su":116897," tr":76246," to":12684," th":12060," ti":18079," te":75278," È ":18766," è ":310990,"Eur":6532,"Gio":7152,"Fra":14090,"For":6200,"II ":9095,"Gra":9265,"Int":6283,"In ":9661,"Il ":64107,"Bas":6130,"Alt":6625,"Cal":8462,"Cam":7263,"Cas":9838,"Car":10034,"Can":7902,"Chi":9585,"Cen":5742,"Cha":12533,"Cor":7549,"Com":13539,"Col":6543,"Con":19924,"Dis":7107,"Nel":12164,"Nor":11147,"Per":7527,"Par":12134,"Pro":9921,"Pre":7483,"Que":6463,"Ita":11783,"Le ":10050,"La ":72068,"Man":6976,"Mar":24416,"Mon":12589,"Sta":17125,"UA ":13253,"Si ":7499,"Sai":6635,"Sco":15601,"San":19570,"Reg":7644,"Rom":11444,"Ven":6082,"Val":6880,"Uni":19062,"The":12364,"Tra":6212,"bit":77836,"bil":24167,"bli":33195,"bor":7965,"bbl":30259,"bbe":6273,"be ":7531,"bbr":5663,"ban":14724,"bal":6018,"bat":9698,"bas":13208,"bar":7268,"ber":18766,"bia":7881,"ca ":152253,"car":46789,"cas":12364,"cat":57544,"can":40911,"cap":15975,"caz":8224,"cam":17670,"cal":28832,"ce ":48205,"bri":12930,"bro":7660,"bra":15040,"bre":21272,"bum":18883,"am ":7918,"al ":120078,"ain":14219,"aio":8047,"agl":13391,"agg":47904,"agi":14545,"agn":23437,"ago":14318,"anz":27062,"ano":80778,"ann":47068,"ant":158501,"ans":9496,"ane":24020,"ang":15547,"ani":57213,"ana":44733,"anc":100934,"and":75292,"amm":16889,"amo":10418,"amp":23753,"ami":24192,"ame":55568,"amb":14366,"ama":14741,"alt":24272,"alo":9405,"alm":19827,"all":157126,"ali":107974,"alc":17413,"ald":5979,"ale":157930,"ala":17702,"alb":21166,"an ":45440,"abb":6484,"abi":71766,"abo":7042,"ae ":8196,"ad ":23773,"aff":9278,"afi":13627,"ai ":19697,"aga":10288,"age":5874,"aes":7300,"ado":12933,"adr":12192,"adi":16689,"ade":11907,"acq":6540,"aco":8070,"aci":10043,"ach":7116,"ace":12782,"acc":33357,"ada":10503,"acr":5776,"azi":114771,"azz":11554,"at 
":6931,"arg":6155,"are":73118,"ard":28986,"arc":21226,"ara":44876,"aro":14834,"arn":6502,"arm":9201,"arl":10615,"ari":92056,"arr":17226,"ars":9758,"art":120284,"asa":12008,"asi":18399,"asc":31970,"ase":8737,"ar ":13924,"api":8043,"apo":15567,"app":40715,"as ":11094,"ava":20339,"aut":20581,"avo":14059,"avi":12649,"ave":19075,"ay ":7096,"avv":6341,"ata":122912,"ast":41964,"ass":66552,"atr":11363,"ato":251278,"ate":43614,"ati":96839,"att":98894,"atu":21242,"aur":8293,"aus":7205,"ito":68066,"itu":73079,"itt":66676,"ism":9170,"iso":15645,"isp":27507,"iss":18851,"ist":119446,"ita":148002,"ite":23677,"iti":31286,"ivo":23510,"ius":6126,"ium":7439,"iun":5847,"iut":11741,"iva":34645,"ivi":28519,"ive":42061,"ipo":9185,"ipi":8806,"is ":24623,"ion":300543,"ior":37852,"ios":7307,"ipa":67260,"ipe":7498,"iov":6740,"iro":6686,"iri":10262,"isi":24222,"ise":9930,"isc":25942,"isa":8063,"ire":38092,"ira":11739,"irc":16765,"ità":54075,"izz":49660,"izi":46425,"km ":6935,"ha ":20767,"ham":6367,"han":10716,"har":10403,"he ":154969,"het":5812,"her":11167,"hi ":17347,"hie":13497,"hia":17408,"hin":6740,"hil":8553,"hit":6455,"go ":24875,"gle":14019,"gli":101433,"gno":22948,"gni":13103,"gne":7169,"gna":23696,"gol":13409,"gon":12443,"gru":16613,"gra":38555,"gre":12522,"gui":11363,"gua":15610,"gue":16931,"gur":5768,"iam":18301,"ial":41395,"ian":67167,"ias":18220,"iar":13087,"iat":27828,"ic ":7518,"iac":6819,"ibi":9969,"ibr":7455,"ibu":6636,"iaz":6940,"ibe":7052,"ia ":244503,"iet":16601,"iem":9287,"ien":42565,"ier":26963,"ies":15028,"iff":8035,"ife":10392,"ifi":25651,"ico":88289,"ici":66289,"ich":34469,"icc":12587,"ice":31036,"ie ":52789,"ica":189725,"ido":7371,"idi":19231,"ide":49954,"ida":12753,"il ":154014,"igl":35255,"igh":7328,"igi":24518,"igu":7702,"ign":12535,"imo":36242,"imm":6781,"imp":19629,"ime":69205,"imi":17819,"inc":72864,"ind":26998,"ina":84297,"ino":48220,"int":56511,"ins":16076,"inf":11205,"ine":59357,"ing":57070,"ini":52386,"ioc":18011,"inv":7003,"ila":12536,"in ":183514,"ilo":8196,"ill":25371,"ilm":20062,"ili":45534,"ile":34997,"ima":41217,"io ":144991,"ilu":7192,"ffe":9164,"ffi":11476,"fes":8200,"fer":25051,"fia":6078,"fas":16462,"fat":6791,"far":5672,"fam":16618,"fan":7946,"età":10656,"ezz":10468,"ezi":12836,"eta":25147,"ete":11093,"eti":19239,"esp":10424,"eso":7266,"est":82383,"ess":82490,"eto":8361,"etr":19381,"ett":136219,"eve":12042,"eva":15707,"evo":6501,"evi":15209,"eur":6939,"ey ":9468,"er ":129403,"eor":7000,"es ":37108,"epu":6598,"eri":107300,"erg":9054,"ere":60624,"erf":7449,"erc":19250,"era":90400,"et ":14734,"equ":6340,"esi":40808,"esc":25003,"ese":123684,"esa":17222,"erz":5997,"erv":15974,"err":35138,"ert":35208,"ers":54254,"ern":33474,"erm":28661,"erp":7756,"ero":53515,"en ":21820,"ela":13747,"ele":33688,"eli":13565,"ell":470580,"elo":7068,"eo ":15585,"emb":16989,"ema":19885,"eme":23252,"emo":10570,"emi":32972,"emp":19288,"ene":64829,"eng":5768,"ena":16730,"end":39915,"enc":6461,"eno":18323,"enn":25781,"eni":17944,"enu":8919,"ens":28995,"ent":321381,"enz":41200,"egl":30932,"ego":7039,"egn":18128,"egg":11766,"egi":67882,"egu":13973,"el ":344686,"giu":12377,"gis":8322,"gin":21943,"gio":125636,"gic":7947,"gia":27457,"ght":6164,"ghi":6769,"ghe":8042,"ggi":57223,"gge":11031,"gi ":14154,"gen":34367,"get":10150,"ger":10735,"ge ":13833,"gar":8686,"gat":8038,"gan":16471,"ga ":14336,"fra":61392,"fu ":20614,"for":37768,"fon":16839,"fic":48822,"fig":8947,"fil":22511,"fin":27532,"da ":136024,"de 
":72497,"dal":94565,"dai":6473,"dat":26742,"dar":7604,"dan":11831,"cun":8420,"cul":6650,"cui":21314,"cur":6089,"cla":11488,"cli":30380,"co ":105436,"cog":6308,"con":172605,"col":60224,"com":159257,"cor":35975,"cos":24601,"cop":26258,"cqu":7001,"cre":13257,"cra":6453,"cri":26431,"cro":12281,"cci":18510,"cch":12126,"cco":24015,"cca":13415,"cce":29193,"ch ":7853,"cer":13344,"ces":73387,"cen":46057,"cel":12841,"ced":6856,"ci ":29747,"cha":6310,"cia":75274,"ck ":12816,"cie":22050,"cid":13163,"che":152251,"chi":60026,"cil":6171,"cir":15709,"cis":9575,"cit":48155,"ciu":9832,"cin":18966,"cio":18635,"cip":33626,"ed ":34167,"ebb":8622,"ebr":5973,"ean":5957,"eal":11897,"eat":17904,"ea ":31185,"efi":7167,"ei ":71643,"ega":13314,"edi":40474,"ede":36599,"ecl":13991,"eci":22204,"ece":13292,"ecc":23316,"eca":6015,"ee ":7306,"eco":39931,"dur":10435,"dut":6584,"duz":6137,"dor":7110,"dop":9690,"don":12153,"dov":9865,"dot":18765,"ds ":7341,"due":12229,"dri":5744,"dra":9112,"dre":9661,"dro":6212,"dic":30670,"dia":36200,"der":31418,"des":28656,"det":11258,"dec":5661,"def":6138,"deg":19555,"dei":52282,"del":486810,"den":37113,"deo":6268,"di ":549365,"do ":64324,"div":20925,"diz":13866,"din":21333,"dio":28106,"dip":45197,"dir":19963,"dis":42122,"dit":11525,"die":7033,"dif":12635,"rga":12245,"ri ":84427,"rgi":7768,"rge":8891,"rgo":7492,"ret":51197,"res":79309,"rev":9081,"rfi":5771,"rds":5842,"rea":31808,"rec":19082,"red":9839,"reg":68923,"rem":13549,"ren":40494,"rel":11321,"rda":7163,"rdo":8984,"rdi":19945,"rde":7719,"re ":253820,"rco":11417,"rci":11839,"rch":14217,"rca":18883,"raz":24666,"rd ":22088,"rap":12296,"rar":11138,"ras":20760,"rat":94270,"rav":8469,"rbi":17348,"rai":8283,"rag":15270,"ran":117955,"ram":19333,"ral":33868,"raf":16450,"rad":21432,"rac":15856,"rpr":6371,"rs ":8921,"ros":16960,"rot":12347,"rom":18807,"ron":34271,"rop":27028,"rov":41355,"rod":22675,"roc":16930,"roi":16969,"rol":9837,"rof":8044,"rog":13121,"rno":15812,"rna":22371,"rne":10164,"rni":10544,"ro ":93036,"rma":38511,"rme":10556,"rmi":17565,"rla":7860,"riz":22621,"rio":45747,"rit":48367,"ris":50562,"riv":16323,"rig":24728,"ril":9910,"rin":43294,"rim":44423,"ria":61665,"rib":8788,"ric":75415,"rid":11323,"rie":41936,"rif":9551,"rk ":5699,"rup":18084,"rus":6787,"rut":6215,"rva":7812,"rvi":6676,"rve":5812,"ry ":9246,"rsi":24369,"rso":28714,"rsa":7860,"rse":7118,"rta":21350,"rto":33655,"rte":47947,"rti":80543,"rt ":10303,"rro":11867,"rri":17046,"rre":16669,"rra":22088,"sal":9289,"san":12698,"sat":15313,"sar":8168,"sa ":55092,"rzo":7427,"si ":104255,"siv":14099,"sie":10542,"sid":14077,"sic":27928,"sia":21469,"sit":73146,"sis":21720,"sin":20546,"sio":35066,"sil":9310,"sim":20855,"sig":13260,"scr":20145,"se ":150356,"sca":15235,"sce":19663,"sci":47092,"sch":12701,"sco":34425,"ser":44878,"ses":5630,"set":14434,"seg":23827,"sed":9878,"sec":24038,"sen":42117,"sem":26599,"spo":16816,"spe":38890,"spi":8159,"spa":13861,"sot":9626,"sol":26000,"son":52029,"sop":6399,"sor":13321,"soc":11137,"su ":15907,"st ":20759,"smo":8359,"so ":74484,"sse":52141,"ssa":30610,"sso":53075,"ssi":64743,"ssu":6194,"ste":66836,"sta":130887,"sto":49939,"sti":78547,"stu":12684,"str":82216,"sua":16483,"sud":8747,"suc":8338,"sul":27351,"sup":11429,"suo":21188,"sur":6713,"svi":7639,"svo":5978,"tal":71028,"tag":17728,"taz":13579,"tav":10674,"tat":99866,"tas":10040,"tar":29860,"tan":110605,"tam":12199,"te ":231097,"ta ":297475,"pa ":11956,"pe ":6955,"par":126674,"pat":12618,"pas":6581,"pag":17667,"pal":29532,"pan":6881,"pi 
":12873,"pec":11403,"pen":11283,"per":160544,"pet":24231,"pes":8448,"pli":9280,"ple":7659,"pia":16405,"pic":12371,"pie":7531,"pin":8248,"pio":15425,"pir":5858,"pit":11446,"por":30452,"pop":13161,"pot":6913,"pos":33968,"poi":5808,"pon":18443,"pol":43083,"poc":5679,"ppr":9310,"ppi":6671,"ppo":29526,"ppa":24732,"ppe":7033,"po ":47633,"più":26636,"pub":29314,"pra":8794,"pri":70446,"pre":86755,"pro":88068,"put":6255,"pun":7305,"qua":45208,"que":36643,"qui":15702,"ra ":147867,"ngo":19829,"ngl":15051,"ngu":15413,"ni ":109423,"nge":12593,"ngh":7552,"nga":7243,"neg":12898,"nei":14330,"nel":251833,"nen":19286,"nem":6968,"ner":29386,"net":10607,"nes":22568,"ng ":19777,"nea":10437,"nfi":6288,"nco":13364,"nci":58066,"ncl":16907,"nce":63829,"nch":32511,"nca":8638,"ne ":381891,"ndu":5794,"ndr":9579,"ndo":51716,"ndi":44494,"nde":40952,"nda":41143,"nal":55844,"nam":6348,"nan":9718,"nar":20198,"nag":9973,"nd ":21375,"nat":56733,"nas":7852,"naz":16138,"na ":222844,"iù ":27049,"nve":7397,"num":9332,"nut":9355,"nto":104437,"ntr":54646,"nti":140368,"nta":97179,"nte":203922,"nso":6392,"nse":23034,"nsi":31575,"nt ":19040,"nqu":6994,"ns ":8742,"nol":12956,"nom":37789,"non":21278,"not":13854,"nos":16869,"nor":18667,"nov":9880,"nne":24481,"nna":15245,"nno":16579,"nni":28556,"no ":229968,"nif":7757,"nie":7902,"nic":33089,"nia":32535,"niz":16719,"niv":11731,"nis":28026,"nit":37389,"nio":10735,"nim":18502,"ogr":18460,"ogi":20692,"ogo":10374,"ogn":10473,"oge":8216,"ogg":8490,"oi ":11644,"oir":5807,"oid":15471,"ol ":6484,"oce":16535,"och":7177,"oci":18254,"ock":8603,"oco":10221,"oca":18569,"occ":17275,"ode":8882,"odi":13909,"odo":23349,"of ":9685,"oda":9043,"odu":9647,"obi":7836,"nza":33286,"nze":8647,"nzi":18694,"nzo":13784,"oti":8184,"ote":14816,"ott":46439,"oto":17159,"ost":54505,"ota":15242,"osi":22379,"ose":9147,"oss":25102,"oso":10625,"ovi":33581,"ova":26672,"ove":33842,"oun":6402,"our":8721,"opo":34311,"opp":9241,"ope":36127,"os ":8709,"opr":18087,"or ":15387,"orm":33668,"orn":18923,"oro":17981,"orr":13295,"ord":34705,"ore":83573,"org":19011,"ori":84933,"osa":10641,"osc":18331,"ort":43269,"ors":13700,"orb":16181,"ora":29450,"ola":56821,"on ":116999,"oli":48219,"oll":21894,"ole":18360,"olt":34437,"olo":80006,"olu":13657,"ona":73743,"ond":62359,"onc":11700,"onf":10972,"one":239015,"ong":9187,"oni":83803,"onn":10167,"ono":78642,"ons":27379,"ont":70151,"oma":36812,"ome":71129,"omb":8981,"omi":23159,"omm":10870,"omp":38351,"omo":18571,"omu":77250,"la ":502615,"le ":302428,"lci":6000,"lcu":7598,"lab":6177,"lac":10059,"lan":31109,"lam":6780,"lar":23739,"lat":27051,"las":21662,"lav":9746,"laz":14283,"ld ":6147,"lbu":19083,"lpi":7148,"lon":12253,"lom":6131,"lor":18602,"loc":10691,"log":22625,"los":5990,"lme":20869,"lti":14409,"lto":12474,"ltr":18612,"lta":21939,"lte":12298,"li ":126320,"lev":10337,"les":31571,"let":25767,"ler":10315,"lem":6948,"len":14630,"leg":16217,"lo ":116614,"lla":366876,"lle":95889,"lli":26880,"llo":51317,"lm ":18058,"ll ":147686,"lit":48006,"lis":18952,"lio":27570,"lin":48858,"lim":8378,"liz":23775,"liv":6284,"lic":53902,"lia":69213,"lib":8948,"lig":7530,"lie":14089,"ma ":77830,"mag":30019,"mar":19705,"mas":10652,"mal":8482,"man":52234,"maz":7709,"mat":39139,"mba":7166,"mbi":12940,"mbr":16743,"me ":86094,"med":14225,"met":29612,"mes":11952,"mer":34722,"mem":5945,"men":156332,"lup":7071,"luo":8412,"lun":8513,"lus":8431,"mpi":23542,"mpe":12689,"mpr":10131,"mpo":28943,"mpl":10446,"mod":11161,"mon":36802,"mol":12121,"mor":12412,"mos":9186,"mot":7018,"mpa":14258,"mus":15113,"mun":79273,"mi 
":19370,"min":44712,"mil":12857,"mis":11697,"mit":12512,"mic":16950,"mia":19829,"mig":18385,"mo ":49530,"mmi":13753,"mma":13393,"mme":10561,"zzo":8684,"zza":55596,"zi ":8253,"ze ":11813,"zaz":5784,"zat":34965,"zon":11265,"zo ":22095,"zia":26147,"zie":6922,"zio":172644,"za ":45565,"tà ":85470,"vve":6371,"via":17063,"vil":14796,"vin":31306,"vic":6909,"vid":11446,"vie":12590,"viz":5900,"vit":13527,"vis":26147,"vo ":24368,"vol":30564,"vor":8888,"vi ":11655,"ver":60261,"ves":9627,"ven":39876,"vel":9674,"ve ":30393,"val":16987,"van":19808,"vam":6468,"var":11852,"vat":11790,"va ":54959,"uzi":17041,"usi":20123,"use":8740,"usc":8066,"usa":11990,"ust":15108,"uss":11959,"uti":17848,"ute":8898,"uta":15229,"utt":27857,"uto":37177,"us ":15875,"ura":49253,"ure":16806,"urg":6288,"uri":11168,"uro":17633,"uog":8121,"uol":7555,"uov":6581,"ur ":9581,"upe":12180,"upp":25887,"umb":6348,"ume":24760,"uo ":15893,"unt":14471,"uni":39506,"uno":18306,"una":118143,"ung":13147,"une":69547,"um ":26907,"ult":16466,"ull":18007,"ula":8409,"un ":268027,"uin":7055,"uis":8264,"uit":22856,"ul ":12226,"ui ":25799,"udi":18251,"ue ":29020,"ucc":13459,"uer":10140,"ues":18932,"uff":8520,"uen":11155,"uel":14803,"ua ":25910,"uat":61583,"uar":12747,"ual":28454,"uan":11940,"ubi":6952,"ubb":30175,"ud ":7866,"uad":8171,"ty ":9152,"tur":35100,"tut":19514,"tui":7703,"tun":9180,"tua":67153,"tud":14358,"ttà":16625,"tre":43645,"tra":110475,"tri":61377,"tru":14883,"tro":62451,"tta":55228,"tte":61167,"tti":63839,"tto":137927,"ttr":17103,"ttu":21997,"to ":551865,"tog":6135,"tos":6930,"tom":8273,"ton":24892,"tol":23856,"tor":90775,"til":16701,"tif":7476,"tie":11090,"tig":6347,"tir":6554,"tit":35734,"tis":12185,"tin":33317,"tim":54555,"tip":9392,"tio":18064,"tia":7439,"tic":92757,"tiz":5615,"tiv":40919,"tem":30137,"ten":60350,"tel":26672,"tea":13828,"tec":11962,"ted":10497,"th ":7445,"tes":26748,"ter":139440,"ti ":212534,"the":11405},"n_words":[55820958,65476626,49460182],"name":"it"}
\ No newline at end of file
+++ /dev/null
-{"freq":{"é":1545,"и":1279,"а":1241," 『":8564," 』":1624," 。":2126," 、":3623,"あ":3435630,"。":214195,"、":312995," ":2941,"々":3019,"』":21225,"『":21300,"」":29519,"「":29647,"〜":2758,"ア":2611969," あ":52272," ア":58543,"乱":13623,"九":1493,"乗":1644,"久":1407,"主":13646,"丼":69358,"中":24049,"両":2394,"並":1334,"丞":80623,"丕":5670,"世":14223,"丈":118136,"三":6461,"上":13976,"下":7225,"不":54571,"与":2254,"一":26873,"丁":71935,"万":1628,"任":2340,"以":6436,"令":1766,"代":19956,"他":3009,"付":3689,"人":34784,"交":21043,"京":8000,"五":1508,"井":2181,"争":3320,"予":2003,"事":34820,"二":5188,"使":59250,"住":2454,"位":7990,"作":23623,"何":10362,"体":13204,"佐":59544,"伊":1406,"企":2899,"会":27025,"伝":3991,"休":80474,"信":13921,"係":2046,"保":5439,"価":32733,"供":2235," 分":1406,"営":4802,"問":2693,"商":2606,"員":5415,"品":9370,"和":9812,"周":2042,"呼":9439,"命":2467,"味":3638,"含":3079,"名":22983,"同":12059,"吉":1498,"合":22154,"各":2831,"向":3485," 号":4482,"域":5560,"城":2545,"基":5538,"土":3313,"園":3175,"地":19745,"在":13117,"回":5170,"四":2207,"団":7524,"因":1258,"国":40277,"器":3147,"写":1418,"再":1714,"内":9686," 丞":3334," 世":5944," 丈":5457," 丁":2886,"処":1526,"優":2947,"共":7413,"具":1237,"入":4949,"全":10040,"八":1560,"公":10338,"児":1791,"党":2623,"元":7877,"光":3006,"先":1806,"催":4293,"倫":19778,"個":1623,"原":7882,"受":3764,"取":4325,"反":2686,"及":4976,"参":3018,"司":1601,"号":9329,"台":3682,"可":2330,"口":3431,"化":11592," 佐":1439,"区":8504,"医":2840,"南":6511,"協":4447,"博":1814,"単":3157,"千":2088,"十":1946,"半":2617,"劇":2223,"力":7128," 人":2934,"加":4595," 代":2394,"務":6021,"動":13267," 休":1652,"分":13117," 丼":3391,"初":7471,"別":4976,"利":4213,"制":7969,"則":1331,"前":8950,"創":2681,"崎":2237," 大":1534,"工":4508,"州":9754,"川":7871,"山":10018,"属":5936,"展":2319,"屋":2375,"局":4799,"少":2464,"小":30514,"導":2344,"将":1891,"専":2793,"島":7916,"岩":1342,"当":6576,"形":7101,"役":2410,"影":1589,"式":10389,"引":1979,"張":1336,"強":2073,"応":3170,"念":2270,"律":2571,"後":10708,"得":2357,"年":104829,"平":6377,"帝":2972,"布":1306,"常":2950,"師":2527,"建":3787,"店":2005,"庁":1327,"広":4783,"度":5771,"座":1332,"大":35199,"央":2180,"天":5593,"太":2851,"変":3794,"外":5603,"多":7893,"女":5796,"始":3033,"委":1623,"場":13501,"報":4192,"境":2223,"売":8204,"声":8316,"子":14064,"存":4270,"学":37448,"安":3733,"定":14058,"実":7367,"宗":1706,"宮":21279,"客":1436,"家":48873,"富":1327,"察":1679,"対":8681," 回":2984,"曲":6533,"書":6577,"曜":3792,"昭":3661,"映":4870,"星":2861,"時":14954,"果":2006,"査":2002,"木":3174,"本":36912,"朝":4495,"期":7726,"月":55829,"有":6525,"最":8063,"松":2561,"東":13912,"来":4168,"条":3471,"村":4202,"料":2840,"文":11450,"於":1575,"施":4063,"旅":1301,"族":3755,"旧":3190,"日":77379,"放":12785,"改":2906,"支":3198,"教":10820,"数":8554,"整":1308,"技":5374,"投":1259,"所":11171,"手":10767," 年":95958,"戦":13450,"戸":3149,"成":12592,"提":2781,"推":1511,"接":2250,"挙":1713,"持":4610,"指":8229,"情":3302,"急":1371,"感":1443,"愛":3173,"港":2061,"済":2489,"清":1263,"湾":1437,"源":1888,"演":3153,"気":4829,"民":7826,"水":5727,"江":2655,"決":2408,"河":1989,"治":7454,"波":1773,"派":2998,"活":6032,"流":4367,"浜":1603,"消":1291,"深":1258,"機":12320," 時":1629,"権":6128,"横":1498,"標":2070," 月":53298,"武":2753,"止":2016,"正":6725,"死":5205,"歌":3328,"次":4580,"欧":1451,"母":1932,"毎":2982,"比":1391,"殺":5325,"校":9373,"株":3423,"業":15026,"楽":6757,"植":1547,"検":1870,"構":5270," 
日":47110,"様":2976,"石":2879,"知":6637,"県":18156,"省":2816,"着":1339,"皇":3000,"的":19048,"目":10668,"直":2224,"白":1686,"発":20771,"登":4069,"病":1859,"症":1381,"町":8545,"画":10139,"田":7594,"由":2916,"用":19757,"産":5875,"生":16324,"番":8301,"略":5228,"界":7422,"環":2235,"理":11169,"球":4459,"現":13270,"王":7473,"独":3082,"状":2926,"物":13688,"特":7417,"照":1250,"然":1440,"無":2728,"点":4161,"火":1398,"置":8277,"美":2548,"群":2041,"義":5850,"習":1334,"素":3072,"約":3788,"紀":4911,"級":2880,"統":4477,"経":5810,"組":11006,"結":4672,"続":3548,"編":3445,"総":5306,"線":8126,"米":2827,"系":6844,"等":8344,"策":1400,"第":13434,"算":1686,"積":1366,"究":4684,"空":6782,"程":1685,"種":6006,"立":12201,"競":4491," 番":1554,"神":6975,"社":17675,"示":2628,"移":1880,"称":14999,"科":5836,"福":3372,"要":4830,"規":3792,"視":1426,"親":1689,"観":2376,"解":3594,"西":7073,"補":1253,"裁":1509,"製":5884,"衛":3099,"術":5314,"行":22080,"衆":5487,"表":10987,"警":1923,"議":4992,"護":2333,"調":2776,"読":1601,"説":4782,"語":21957,"認":2658,"論":4375,"設":10824,"記":10247,"計":5326,"言":6896,"話":3787,"評":1446,"路":5085,"超":1256,"足":1256,"起":2835,"賞":3546,"資":3066,"質":3136,"象":3259,"谷":1834,"近":4077,"農":1628,"載":2905,"転":2711,"車":9144,"身":6802,"自":10797,"者":18526,"聞":12087,"聖":2242,"聯":10502,"育":4279,"能":5243,"華":1394,"般":3860,"航":2825,"興":1400,"艦":4184,"色":1827,"英":9312,"信ああ":2064,"葉":3002,"著":2162,"風":1713,"食":1713,"領":3103,"項":2228,"類":3952,"馬":3385,"駅":2902,"館":2783,"高":11470,"連":10914,"造":5408,"進":3210,"送":12678,"通":11192,"速":1858,"遺":1795,"選":8114,"過":1669,"運":6072,"達":1651,"郡":4845,"部":17131,"都":8516,"郎":2090,"配":2845,"金":5425,"野":8402,"量":2343,"重":4451,"鉄":5783,"銀":1566,"録":3006,"関":12598,"間":11298,"開":13989,"門":4039,"降":1592,"限":2003,"院":3827,"陸":3351,"阪":2944,"防":2270,"離":1558,"電":7996,"隊":3468,"際":6340,"青":1618,"非":1816,"面":3571,"響":1740,"音":7019,"始ああ":1913,"殺ああ":2295,")":148109,"(":149030,":":12250,"=":3484,"~":3083,"交味あ":2597,"使ああ":9134,"使アア":1628," (":4296," )":9852,"価ああ":5083,"行ああ":10289,"組ああ":2099,"表ああ":2782,"一種あ":1572,"類ああ":1338,"手ああ":1245,"場合あ":2928,"世界大":1450,"大ああ":1950,"多あ。":1570,"多ああ":2150,"構成あ":1846,"倫ああ":2719,"成ああ":4644,"部ああ":1870,"戦ああ":1623,"続ああ":1414,"等学校":2276,"売ああ":5009,"声ああ":1329,"通ああ":1426,"送ああ":5822,"生ああ":4620,"ア語:":1902,"造ああ":1673,"ア連休":1985,"世紀あ":1284,"用アア":1501,"作品あ":2764,"ア選手":1921,"用ああ":9493,"佐売あ":1480,"。 ":13103,"、 ":31983,"』 ":1427,"不聯あ":1405,"あ ":66354,"地域あ":1988,"ア ":32799,"催ああ":3126,"あ連":2419,"あ通":3320,"あ選":1976,"あ運":3219,"あ都":1640,"あ金":1288,"あ重":1969,"ア語":8308,"あ電":1906,"あ際":1362,"あ音":2155,"あ関":6111,"あ開":8435,"あ間":2317,"あ認":1333,"あ記":3845,"あ設":5283,"ア系":1829,"あ製":2477,"あ表":4499,"あ行":10711,"あ規":1437,"あ言":4181,"あ解":1569,"あ西":1435,"あ近":1371,"あ起":2247," 『ア":2743,"ア連":2828,"あ高":3598,"ア郡":1249,"ア選":2094,"ア教":1868,"ア放":1489,"ア文":1270,"あ無":1446,"あ特":3612,"あ物":2014,"あ独":1271,"ア朝":1242,"、第":1727,"あ現":2680,"あ王":1453,"あ用":4828,"あ生":5118,"あ略":1610,"あ登":3137,"あ発":11033,"あ目":3554,"あ知":3796,"あ構":3048,"ア州":5899,"、特":1256,"あ機":2062,"あ毎":1746,"あ殺":2375,"あ死":2199,"あ正":1615,"。現":1855,"あ水":1343,"、現":2451,"あ活":3109,"あ流":1320,"ア番":2807,"あ自":4181,"あ聞":2418,"あ聯":1675,"あ者":1261,"))あ":1638,"あ著":1250,"ア社":1403,"あ移":1304,"あ称":1389,"あ神":1585,"あ社":1649," あ 
":1635,"あ第":4428,"、英":3619,"あ立":1390,"、自":1350,"あ続":1310,"あ総":2605,"あ結":2983,"あ経":2414,"あ組":1899,"あ統":1376,"ア王":2616,"あ置":2229,"あ国":6666,"あ基":3151,"あ地":4866,"あ呼":9152,"、小":1990,"あ含":2942,"あ同":5129,"あ名":6504,"あ合":3012,"あ各":1270,"あ加":1535,"あ務":1340,"あ動":1382,"あ初":2158,"あ分":4627,"あ制":2425,"あ利":1785,"あ前":1691,"あ創":1816,"あ原":2367,"あ参":1928,"あ取":2481,"あ受":2846,"あ単":1416,"あ南":1637,"あ倫":4845,"あ内":1663,"あ入":1803,"あ全":3374,"あ公":3342,"あ共":2253,"あ元":1846,"、大":2783,"あ乱":3962,"あ上":3029,"あ下":1378,"あ不":14639,"あ丈":32792,"あ三":1486,"あ丞":19819,"あ世":2331,"あ中":9465,"あ丼":18051,"あ主":4771,"あ他":2009,"あ付":1399,"あ代":2198,"あ企":1331,"あ事":5294,"あ交":5865,"あ人":7782,"あ作":6813,"あ何":3230,"あ佐":13423,"あ位":2504,"あ使":13500,"あ伝":1495,"あ会":1351,"、国":2207,"あ休":17591,"あ信":2085,"あ保":1720,"あ価":8783,"、同":2022,"あ一":16360,"あ丁":17347,"あ東":3287,"あ本":3519,"あ有":3083,"あ書":1761,"あ最":4488,"あ時":2327,"あ映":1487,"あ日":8737,"あ文":2466,"あ教":1976,"あ数":1984,"あ支":1653,"あ放":6620,"あ改":1615,"ア大":3211,"あ提":2042,"あ指":5951,"ア国":2219,"あ持":3443,"ア地":1445,"あ所":2803,"あ戦":3270,"あ成":1753,"あ手":1859,"。本":2143,"あ形":2406,"あ当":1618,"、本":1964,"ア合":4791,"、東":2694,"あ後":3036,"。日":2366,"、日":7073,"あ建":2122,"あ広":2188,"ア共":1242,"ア公":1517,"あ属":1897,"あ小":6851,"ア使":2442,"あ学":2960,"あ存":2964,"あ子":1693,"あ家":4670,"あ宮":4858,"あ定":2671,"あ実":3992,"あ対":5932,"ア事":1661,"ア人":2775,"ア休":2911,"ア佐":2436,"ア作":1731,"あ外":1294,"あ多":5332,"あ変":2304,"あ大":8249,"あ天":1336,"あ女":1848,"ア丁":1757,"ア丞":2294,"ア不":5250,"ア丈":4131,"ア丼":2746,"あ始":1317,"あ場":4272,"あ声":3268,"あア":156757,"ああ":1909463,"あ。":138896,"あ『":3993,"あ』":1518,"あ「":15609,"あ」":3632,"第二次":1255,"あ、":216446,"、あ":21490,"。「":2184,"、『":1828,"、「":4201,"。ア":19354,"、ア":64525,"々あ":2319,"。あ":13106,"『あ":1353,"」あ":19753,"「あ":2018,"」、":1525,"」。":1249,"『ア":7054,"』あ":7906,"「ア":7862,"、使":1676,"、価":1248,"、休":3597,"、佐":2726,"。休":1576,"、人":2001,"、丞":3869,"。丁":1388,"、不":3337,"、丁":3572,"、丈":7156,"、一":1931,"。丼":1300,"。丞":1247,"、主":1393,"、丼":4218,"、中":2406,"。丈":2918,"アア":2154092,"アあ":155866,"ア」":6771,"ア『":1584,"ア』":6210,"ア。":10635,"ア、":16488,"ア)":16388,"ア(":33336,"ア=":3214,"場ああ":3434,"入ああ":1917,"現在あ":4587,"、)":1834,"』(":8152,"」(":2340,"あ)":37153,"あ(":5782,"会社あ":2364," ああ":28860," あ、":7361," あア":3298," アア":57211,"次世界":1361,"日)あ":7258,"不身あ":2968,"日( ":2303,"一部あ":1349,"作曲家":1274,"基ああ":1599,"録ああ":1363,"対ああ":2786,"あ行あ":9249,"分ああ":1730,"設ああ":2171,"使究あ":1369,"記ああ":2072,"ア番組":2746,"位置あ":2308,"株式会":2868,"小ああ":3121,"言ああ":1584,"家ああ":5177,"あ置あ":2118,"あ総称":1606,"所属あ":1341,"主あ":3885,"丼ア":1464,"丼あ":19189,"使用あ":2638,"乱あ":3541,"与あ":1930,"不あ":8122,"丈ア":3173,"下あ":2803,"上あ":5913,"丈あ":27006,"丈、":1500,"丁ア":1890,"丁あ":18102,"一あ":5817,"丞あ":21257,"丕あ":1261,"世あ":2310,"不ア":1421,"中あ":5030,"丞ア":1844,"人あ":9122,"人。":1476,"人ア":1242,"他あ":1896,"付あ":2208,"代あ":6798,"代ア":1373,"争あ":1758,"事あ":9346,"交あ":4806,"不家":3233,"中国":2514,"作あ":7059,"何あ":2412,"体あ":6471,"位あ":2264,"佐あ":15860,"丼休":1592,"丼丞":2055,"丼丈":2151,"丼丁":1684,"丼丼":1418,"中使":3153,"会あ":6508,"丞佐":1535,"不使":1765,"休。":1523,"休あ":23904,"丞丈":2889,"丞丁":2038,"休ア":2507,"丞丞":1742,"不交":1422,"丈休":2694,"丈使":1243,"丈丈":5575,"丈丞":2374,"丈丼":3223,"丁休":1545,"一価":1575,"不不":1538,"丁丞":1826,"丁丈":3406,"丁丼":1794,"丈丁":3527,"信あ":4017,"交味":2982,"全 ":1304,"以下":1516,"、英語":1539,"係あ":1321,"供あ":1476,"価あ":9568,"使あ":19446,"中央":2121,"使ア":1968,"事交":2728,"丞家":1297,"一種":2197,"事業":2439,"二次":1389,"世界":5865,"佐丁":2480,"佐価":1425,"佐佐":1269,"倫あ":5771,"休丈":2771,"不治":2327,"休佐":1324,"休休":1921,"主義":2043,"人物":1891,"企業":2138,"一般":3547,"不聯":2198,"催あ":3689,"前 ":1371,"宮ああ":1909,"作家":1397,"作品":5289,"世紀":2728,"価使":2601,"佐売":2092,"あ目的":2533,"あ発表":1274,"あ第 ":2940,"内ああ":1292,"丞 ":1754,"丈 ":1322,"丁 ":1266,"丼 
":1638,"在あ":8533,"地あ":4409,"国あ":10614,"国ア":2658,"団あ":1724,"あ音楽":1319,"合衆":4737,"問宮":1556,"可能":1378,"名称":3513,"器あ":1492,"使(":1266,"動車":2200,"営あ":2139,"休(":1962,"和国":2518,"会(":2933,"分類":1336,"分野":1390,"多あ":4892,"大あ":3257,"あ開発":3310,"外あ":1908,"声あ":2287,"売あ":5710,"地区":1390,"地域":3263,"学ああ":2317,"国家":3119,"地不":3338,"在位":1282,"場あ":5556,"域あ":3413,"団体":2330,"あ開催":2692,"基あ":1982,"ア語あ":2144,"あ間あ":1859,"あ関あ":3802,"全国":1795,"加あ":2048,"力あ":3213,"共和":2791," 人あ":1389,"共同":1280,"化あ":4450,"和 ":3168,"動あ":4507,"務あ":2872,"前あ":3176,"制あ":1242,"定ああ":5359,"別あ":1751,"初あ":3667,"交通":1522,"代表":2575,"使用":3017,"使理":2476,"京都":3402,"佐県":1437," 世あ":1718,"会議":1333,"分あ":3773,"位置":2501,"ア語 ":2251,"使究":4609,"元あ":1489,"一部":2055,"不身":4258,"作曲":2528,"会社":6167,"内あ":4013,"語ああ":3397,"休画":2321,"入あ":2695,"全あ":1427,"品。":1238,"品あ":4556,"丈(":2884,"丁(":1789,"参加":1290,"員あ":2076,"丞(":2210,"丼(":1739,"含あ":3008,"協会":2160,"味あ":2957,"呼あ":8214,"化学":1687,"利用":1828," 世紀":2518,"制度":1248,"及あ":4790,"受あ":2030,"取あ":2208,"号あ":3953,"名あ":9335,"同あ":1953,"合あ":7017,"向あ":2543,"制作":1942,"区あ":2907,"家路":2125,"度あ":3326,"年ア":1913,"年あ":20990,"年、":1688,"広あ":1712,"帝国":1623,"形あ":1829,"年代":2422,"島県":1368,"専門":1642,"当あ":2190,"あ運営":1448,"小説":1987,"対象":1561,"式あ":3352,"川あ":1268,"州あ":2489,"州ア":2755,"常あ":1310,"小治":1760,"学者":3831," 年 ":49544,"成 ":1708,"当時":1631,"応あ":1648,"念あ":1283,"得あ":1713,"後あ":4649,"式会":2869,"平成":1905,"大学":6830,"あ設置":1507,"大戦":1559,"委員":1355,"女小":1372,"女子":1397,"子ア":1884,"子あ":3923,"あ起あ":1522,"場合":3557,"始あ":2180,"あ製造":1274,"子ああ":1318,"国際":4256,"子アア":1695,"あ設立":2139,"家 ":3003,"大会":3699,"物ああ":1862,"大阪":2112,"存在":3074,"家人":2908,"あ記休":1293,"年 ":51327,"島あ":1817,"あ表記":1710,"学校":6834,"家律":2208,"実施":1278,"天皇":1405,"学あ":6550,"定あ":7748,"宮あ":4230,"家ア":2140,"家、":2464,"家。":3423,"家あ":12300,"業ああ":1572,"あ言あ":2286,"小あ":7489,"対あ":3405,"属あ":3978,"局あ":2353,"東不":1238,"東京":5181,"本名":1618,"放送":10914,"映画":3085,"曲家":1330,"施設":1651,"曜日":2035,"教育":2974,"校あ":3206," 日 ":12276,"最初":1340,"楽あ":1408,"時間":1682,"業あ":4045," 月 ":44960," 日あ":15057,"株式":3061," 年(":5941," 年)":9287,"時あ":3451,"教会":1452,"教休":1250,"文使":1758,"文化":2253,"族あ":1560,"日あ":17278,"化ああ":2739,"昭和":3444,"時代":5531,"来あ":2732,"立ああ":3753,"あ放送":5748,"あ東京":1323,"果あ":1257,"日本":21674,"曲あ":2900,"あ日本":7358,"文学":1588,"月あ":5637,"有あ":1810,"最あ":1431,"書あ":2853,"選手権":2431,"本ア":2290,"本あ":10875,"期あ":4322,"日 ":12954,"手権":2431,"、東京":1480,"所属":2119,"月 ":45103,"数あ":3827,"あ戦あ":1312,"教あ":1272,"技術":2097,"務ああ":1643,"年(":6245,"年)":9425,"提供":1355,"指あ。":1899," 年あ":19179," 年、":1517," 年ア":1448,"家(":2915,"成あ":6012,"情報":2756,"戦あ":3394,"あ持あ":3321,"手あ":2666,"あ指あ":3829,"所あ":3296,"持ああ":1475,"ア合衆":4629,"戦争":2199," 年代":2242,"指あ":4364,"持あ":3953,"あ属あ":1826," 日)":9957," 日(":2924,"あ存在":2820,"点あ":3120,"ア不身":1594,"あ家律":1357,"江戸":1490,"あ広あ":1286,"活丞":1276,"活動":3023,"校(":1251,"動ああ":2193,"、日本":6502,"。日本":2210,"あ国家":1342,"あ地域":1419,"次世":1446,"構造":1383,"毎佐":1485,"日)":10122,"日(":3117,"加ああ":1583,"あ場合":3488,"派あ":1250,"流あ":1921,"正式":1441,"あ多あ":3964,"あ大あ":1447,"機能":1261,"機関":2979,"権あ":1708,"あ国際":1794,"機あ":2329,"共和国":2495," 月あ":5223,"あ対あ":3165,"殺あ":2872,"構成":2233,"止あ":1308,"称あ":9039,"称。":1304,"種あ":3083,"社会":2785,"発表":1469,"種ああ":1425,"第 
":9134,"目的":2889,"社あ":6402,"示あ":1874,"社ア":1383,"あ用あ":3710,"あ生あ":1801,"県家":2001,"発生":1334,"知あ":3526,"番組":5337,"発売":5037,"登場":2565,"略称":3022,"用語":1380,"目あ":4296,"あ知あ":3198,"あ登場":2375,"県あ":2733,"あ発売":3969,"界大":1455,"発あ":3815,"的あ":13077,"環境":1290,"理学":1713,"町あ":2460,"画あ":2581,"現在":6030,"用あ":12045,"用ア":1581,"自動車":2063,"界あ":1868,"独立":1445,"生あ":6030,"産あ":1671,"現ア":1355,"現あ":1628,"理あ":2327,"運営あ":1562,"特別":1260,"大学あ":1847,"衆国あ":2595,"ア放送":1421,"特あ":1759,"物あ":5383,"あ活丞":1241,"般あ":1349,"あ活動":1498,"艦あ":1387,"能あ":2381,"、現在":1773,"英 ":1379,"。現在":1490,"義あ":2621,"大会あ":2008,"総称":1629,"者。":2220,"者、":1458,"者あ":10349,"者ア":1305,"聯あ":3977,"聞あ":3281,"総合":1331,"置あ":6978,"経済":1991,"線あ":3253,"続あ":2112,"組佐":2309,"競馬":1452,"系使":1769,"等学":2286,"競技":1586,"組あ":3887,"組。":1569,"ア州あ":1713,"結あ":1657,"ア州ア":2558,"紀あ":1438,"系あ":2247,"素あ":1485,"あ殺あ":1580,"約 ":1619,"第二":1891,"立あ":5248,"究あ":1391,"空あ":1393,"等あ":3921,"あ構成":1941,"科学":1994,"象あ":2019,"設立":2500,"設置":1708,"賞あ":1603,"製造":1966,"計画":1610,"あ。 ":6469,"あ、 ":22576,"ああ ":39188,"論あ":1674,"記休":1374,"表記":2830,"身あ":4282,"車あ":2896,"路あ":1577,"象ああ":1255,"開ああ":1704,"起あ":1745,"設計":1269,"言語":1842,"質あ":1680,"郡あ":1437,"部あ":6774,"関ああ":4613,"。アア":19255,"日本ア":2092,"日本あ":9641,"、アア":64286,"路線":1359,"設立あ":2048,"送あ":7525,"通あ":2755,"造あ":2715,"連あ":1360,"進あ":1274,"『アア":7047,"近あ":1367,"』ああ":2158,"「アア":7790,"」ああ":7507,"間ああ":1397,"『ああ":1282,"載あ":2293,"称ああ":4464,"「ああ":1846,"通称":1887,"選手":4697,"、ああ":21181,"都事":2774,"。ああ":12750,"野あ":1575,"運営":1939,"運動":1644,"語:":3289,"連合":1965,"通信":1344,"連休":3247,"自治":1375,"(昭和":2128,"線(":1472,"航空":2161,"営ああ":1448,"自動":2405,"あ、)":1236,"ああ)":35187,"ああ(":4198,"英語":4067,"ああ自":1831,"ああ行":2651,"ああ表":1577,"ああ記":1483,"ああ言":2300,"ああ設":1654,"ああ開":2170,"あ、同":1707,"ああ丞":6562,"ああ一":3244,"ああ丁":5744,"ああ不":4855,"ああ丈":10564,"ああ丼":5871,"ああ主":1236,"ああ中":2062,"ああ価":2294,"ああ使":4600,"ああ作":2558,"ああ佐":5045,"ああ倫":1458,"ああ交":2092,"ああ人":3676,"ああ事":3171,"ああ休":6525,"あ、国":1738,"ああ他":1319,"あ、丁":2287,"あ、一":1711,"あ、不":2123,"あ、丈":4708,"あ、丞":2433,"あ、佐":1739,"あ、休":2616,"あ、人":1506,"あ、主":1290,"あ、丼":2884,"あ、中":1791,"あ。丈":1605,"ああ大":3222,"ああ多":2733,"ああ学":1481,"ああ家":2341,"ああ宮":1422,"ああ小":2540,"あ。日":1290,"あ、日":6150,"あ、東":1989,"ああ後":1790,"術あ":1484,"ああ分":1352,"ああ全":1444,"ああ公":1285,"あ、大":2131,"行あ":14000,"ああ名":2507,"ああ同":2111,"あ、小":1427,"ああ呼":3646,"ああ地":2280,"ああ国":2987,"ああ場":2933,"表あ":3761,"あ、現":2057,"ああ指":1536,"ああ戦":1238,"ああ最":1780,"ああ日":4356,"ああ放":1785,"あ、英":2463,"ああ第":1398,"ああ特":1792,"ああ現":1367,"あ、第":1316,"ああ目":1662,"ああ発":3602,"ああ知":2352,"ああ用":1755,"ああ生":1691,"設あ":3061,"記あ":3151,"言あ":3035,"製作":1647,"説あ":2100,"語あ":8268,"話あ":1899,"ああ。":128417,"ああ、":114147,"ああ『":1790,"ああ「":7669,"ああ」":2702,"行不":1963,"語 ":3303,"あ『ア":1268,"あ「ア":4056,"あ」あ":2473,"あ、ア":42684,"あ。ア":9708,"あ。あ":10112,"あ、あ":15664,"あ、「":2860,"あアア":151645,"あアあ":3890,"衆国":4735,"要あ":2289,"ああア":66491,"あああ":1201680,"時代あ":3504,"発表あ":1306,"高等":2629,"昭和 
":2986,"社アア":1256,"社ああ":2086,"丞(あ":1427,"本あア":1472,"本ああ":2442,"州アア":2688,"品ああ":1797,"本アア":2199,"分野あ":1258,"録あ":1741,"書ああ":1478,"鉄家":3515,"野球":2343,"設置あ":1442,"月ああ":2434,"間あ":6535,"開あ":1967,"関あ":5450,"有ああ":1305,"隊あ":1519,"関係":1758,"際あ":1785,"開催":3071,"面あ":1724,"項あ":1394,"開発":5246,"類あ":2486,"目的あ":2661,"電気":1300,"駅あ":1276,"音楽":3430,"曲ああ":1651,"高あ":1592,"属ああ":2285,"あ事あ":1426,"あ丼あ":5850,"あ丁あ":4319,"あ一あ":4453,"あ丈あ":8854,"あ上あ":1542,"あ不あ":2773,"あ中あ":2093,"あ丞あ":5276,"あ使あ":4014,"あ価あ":2254,"あ佐あ":3168,"あ作あ":1416,"利用あ":1328,"あ丈丁":1285,"あ丈丈":1864,"あ休あ":5479,"あ中使":2747,"あ他あ":1258,"あ丁丈":1300,"点ああ":1562,"あ利用":1457,"あ含あ":2902,"あ名あ":1668,"あ呼あ":8161,"記休あ":1285,"あ名称":2151,"高等学":2175,"丈(あ":1828,"存在あ":2940,"あ基あ":1594,"東京都":2018,"あ交味":2831,"呼ああ":6929,"あ不治":1330,"あ世界":2001,"あ一種":2121,"あ作品":1789,"あ価使":1553,"あ一般":1707,"あ人物":1374,"味ああ":1844,"あ一部":1521,"あ作曲":1773,"あ休画":1355,"あ代表":1400,"あ使用":2721,"あ位置":2381,"あ使究":1788,"あ務あ":1339,"表記あ":2011,"あ取あ":1899,"あ受あ":1927,"アあ作":1244,"アあ丼":1341,"アあ丁":1432,"アあ一":1999,"アあ不":1243,"アあ丈":2548,"アあ丞":1512,"アア、":16393,"アア。":10631,"アア」":6705,"アア』":6191,"アア『":1500,"アアア":1729078,"アアあ":151645,"アあア":19640,"アああ":55529,"アあ、":4837,"ア』あ":2640,"ア。ア":1791,"ア」あ":4452,"ア、ア":8332,"アあ開":1642,"アア語":8298,"アア系":1825,"アア社":1262,"アア番":2788,"受ああ":1416,"アア王":2579,"アア教":1744,"アア放":1421,"アア州":5894,"アア大":2784,"アア国":1719,"放送あ":6907,"アア合":4757,"アア公":1422,"アア休":2327,"アア事":1564,"アア人":2578,"アア作":1525,"アア佐":1706,"アア使":2109,"アア丼":1996,"アア丞":1556,"アア丈":2749,"アア不":4656,"アア ":31584,"ア。 ":1580,"アあ ":4215,"名ああ":2758,"最初あ":1276,"学校あ":2147,"番組。":1557,"番組あ":2493,"向ああ":1587,"知ああ":3322,"校ああ":1456,"含ああ":1792,"アア連":2243,"アア選":2079,"号ああ":1619,"ア』(":1980,"アア=":3088,"アア)":16080,"アア(":33305,"合ああ":3958,"=アア":3113,"年代あ":1428,"開発あ":3113,"登場あ":2088,"略称あ":1402,"発売あ":3963,")あ ":2592,"義ああ":1444,"機関あ":1580,"あ)あ":31163,"あ(あ":1574,"あ)、":1476,"あ)。":2164,")あア":7291,")ああ":30137,")あ、":47800,"(ああ":48329,")、ア":1251,"(アア":20979,"ア=ア":3175,"ア(ア":4269,"ア)あ":13245,"ア(あ":3617,"団体あ":1276,"ア( ":1748,"置ああ":4412,"合衆国":4735,"丁アア":1421,"上ああ":2005,"丈ああ":11397,"丈あア":1286,"丁ああ":8466,"一ああ":3370,"一あ。":1355,"平成 ":1642,"年) ":3918,"年( ":2247,"年)あ":3507,"式会社":2868,"国あア":1384,"国ああ":2054,"国アア":2374,"年(昭":1867,"載ああ":2044,"地ああ":1883,"者ああ":4317,"丼ああ":8612,"在ああ":4158,"中ああ":1478,"丞アア":1255,"目ああ":1258,"丞ああ":10116,"不ああ":3457,"車ああ":1306,"丈アア":2349,"与ああ":1754,"界大戦":1365,"名称あ":2696,"事ああ":3528,"起ああ":1658,"開催あ":2711,"年あア":3603,"年ああ":7456,"年アア":1814,"乱ああ":1859,"人ああ":3885,"活動あ":2178,"的ああ":3866,"発ああ":2687,"交ああ":1899,"組佐あ":1397,"会(あ":1757,"))":1846,"()":1332,"丈丁あ":1330,"丈丈あ":1676,"休アア":1927," )あ":8664,"会ああ":2436,"休ああ":11780,"(昭":2135,"代ああ":2253,"代アア":1273,"(現":2804,"(英":2015,"付ああ":1616,"=ア":3230,")ア":1792,"(ア":21128,")あ":114771,")」":1247,"(あ":48841,")。":7779,")、":5848,"対象あ":1241,"( ":12526,") ":6625,": ":1725,"~ ":2043,"作ああ":4059,"何ああ":1301,"佐ああ":8185,"体ああ":2330,"当ああ":1803,"日ああ":8218,"日あア":2005,"、)あ":1731,"』(あ":3555,"』(ア":2402,"中使あ":2391},"n_words":[10754229,8353071,5774482],"name":"ja"}
\ No newline at end of file
+++ /dev/null
-{"freq":{"ುರಾ":118,"ುರು":200,"ುರಿ":88,"ುರ್":91,"D":56,"E":62,"A":159,"B":71,"C":144,"M":84,"N":67,"O":52,"I":126,"ುವಂ":99,"T":67,"P":98,"S":150,"R":60,"ುಳ್":57,"f":152,"g":241,"d":295,"e":941,"b":240,"c":337,"a":826,"n":692,"o":625,"l":414,"m":349,"k":67,"h":439,"i":767,"w":97,"v":70,"u":387,"t":822,"ುಲೈ":89,"s":460,"r":646,"p":275,"y":151,"x":98,"ುಮಾ":253,"ುಹಿ":52,"ುಸ್":88,"ುವೆ":67,"ುವು":360,"ುವಿ":157,"ುವಾ":127,"ುವರ":100,"ೂಕಿ":129,"ೂಕು":69,"ುಷ್":56,"ುಟು":84,"ುಟ್":157,"ುಡಿ":58,"ುನಾ":71,"ುನಿ":57,"ುತಿ":69,"ುತ್":1508,"ುದು":441,"ುದಾ":117,"ುದರ":81,"ುದ್":264,"ುದೇ":91," o":90," i":63," a":111," c":81," t":207," p":123," s":73," r":80,"ೀರಿ":62,"ೀರ್":102," I":78," M":62," B":59," C":95," A":102," S":86," P":57,"ುಖ್":182,"ುಗಳ":475,"ೀವಿ":81,"ೀವನ":57,"ುಕೊ":61,"ುಕ್":140,"ುಂಬ":175,"ುಂದ":93,"ೀಡಿ":58,"ೀಟರ":52,"ಿಸಿ":1001,"ಿಸು":679,"ಿಸೆ":100,"ಿಸ್":325,"ಿಹಾ":108,"ೀನ್":58,"ೀತಿ":89,"ೀಡು":53,"ೃಷ್":171,"ೆಯ ":906,"ೃತಿ":133,"ೃತ್":89,"ೇಕ ":98,"b ":108,"a ":86,"ೂಲಧ":77,"ೇ":5578,"ೆ":15840,"ು":25709,"ೂಲಕ":96,"ೀ":3205,"ೃ":757,"ೂ":3992,"್":52148,"ೌ":422,"ೈ":1240,"ೋ":2829,"ೊ":2928,"ೂರು":339,"ೂರಿ":178,"he":101,"ೂರ್":274,"೧":1776,"೦":1187,"gh":67,"೯":1054,"೮":516,"೭":491,"೬":491,"೫":520,"೪":472,"೩":669,"೨":1050,"ಐ":161,"ಒ":2005,"ಓ":71,"ಕ":16773,"ಖ":1270,"ಗ":16945,"ಘ":299,"g ":53,"ಚ":3279,"ಛ":94,"ೂರದ":77,"ಜ":4576,"ಞ":287,"ೂರನ":53,"ಟ":6023,"ಃ":62,"ಂ":14035,"ಅ":4390,"ಇ":2785,"ಆ":1877,"ಉ":1108,"ಈ":873,"ಊ":90,"ಏ":267,"ಎ":1847,"ಲ":19400,"ಳ":7999,"ರ":33568,"ಶ":3961,"ೂಮಿ":116,"ಷ":3572,"ವ":18565,"ಸ":14830,"ಹ":6186,"ಾ":27485,"ಿ":36903,"ಣ":3561,"ಡ":6448,"ಠ":200,"ಧ":2658,"ದ":26224,"ಥ":2061,"ತ":20028,"ಫ":753,"ಪ":9255,"en":84,"ನ":22644,"ಯ":14816,"ಮ":11705,"es":68,"ಭ":2607,"er":150,"ಬ":6283,"e ":261,"f ":64,"co":54,"ce":55,"d ":123,"at":112,"ar":92,"al":103,"an":163,"nt":61,"of":64,"on":129,"or":85,"r ":107,"mb":124,"na":66,"nd":87,"ng":73,"li":80,"le":62,"n ":194,"ht":76,"hu":109,"ic":76,"ia":57,"ig":78,"is":75,"it":67,"ೆದ ":99,"in":127,"io":72,"l ":78,"y ":108,"x ":86,"um":122,"to":56,"te":89,"ti":95,"th":193,"ta":55,"st":72,"ri":121,"re":74,"ra":68,"t ":164,"ೂಡಿ":63,"s ":195,"px":81,"ೂನ್":106," ಇ":2754," ಆ":1860," ಅ":4360," ಏ":265,"ುಖ ":181," ಎ":1695," ಊ":90," ಉ":1101," ಈ":872," ಖ":225," ಗ":2051," ಕ":5524," ಒ":1995," ಓ":65," ಐ":147," ಟ":281," ಜ":2387," ಚ":1161," ಘ":128,"ೀಯ ":460," ನ":3476," ಪ":4927," ಫ":377," ಬ":2936," ಭ":1657," ಮ":5657," ಯ":687," ಡ":389," ತ":2162," ದ":2528," ಧ":353," ಹ":3705," ಸ":5985," ರ":2258," ಲ":607," ವ":3839," ಶ":1153," ೬":68," ೭":68," ೮":53," ೯":53," ೨":675," ೩":313," ೪":88," ೫":77," ೧":1407,"ಾಂಶ":59,"ಾಂತ":277,"ಾಂಕ":60,"ಾಂಗ":132,"ಾಂಡ":169,"ೀತ ":103,"ಿಯ ":653,"ಿಮ ":103,"ಿನ ":1775,"ಿಧ ":52,"ಿದ ":402,"ಿತ ":279,"ಿಣ ":191,"ಿರಾ":52,"ಿರಿ":113,"ಿರ್":396,"ಿರು":1115,"ಂದ ":957,"ಿಲಿ":76,"ಿಲ್":730,"ಿಳಿ":79,"ಿಳು":52,"ೀಕರ":77,"ೀಕ್":78,"ಿವರ":110,"ಿವೃ":57,"ಿವಿ":95,"ಿವಾ":109,"ಿವೆ":147,"ಿಶ್":268,"ಿಸಬ":56,"ಂಬ ":229,"ಿಷ್":256,"ಿಸಲ":564,"ೃತ 
":73,"ಿನಿ":232,"ಿನಾ":81,"ಿನಲ":259,"ಿನ್":254,"ಿಪ್":89,"ಿಭಾ":100,"ಿಯಂ":74,"ಿಮೆ":73,"ಿಯನ":554,"ಿಯಮ":111,"ಿಮಾ":95,"ಿಯು":246,"ಿಯೆ":150,"ಿಯೇ":59,"ಿಯೊ":67,"ಿಯರ":85,"ಿಯಲ":402,"ಿಯವ":134,"ಿಯಿ":76,"ಿಯಾ":494,"ಿಡಿ":78,"ಿಟ್":81,"ಿತ್":753,"ಿತರ":53,"ಿತವ":177,"ಿತಾ":52,"ಿತಿ":175,"ಿತು":240,"ಿದ್":1341,"ಿನಗ":88,"ಿಧಾ":82,"ಿನದ":58,"ಿದವ":67,"ಿದರ":307,"ಿದಾ":54,"ಿದೆ":1016,"ಿದು":62,"ಾಷ್":299,"ಾಸದ":54,"ಾಷೆ":295,"ಾಸನ":52,"ಾಷಾ":53,"ಿಗೆ":716,"ಿಗಾ":54,"ಾಶಿ":63,"ಾಹಿ":266,"ಾಸ್":461,"ಾಸಾ":72,"ಾಸಿ":149,"ಿಜ್":143,"ಿಟಿ":66,"ಾರವ":174,"ಾರಾ":172,"ಾರಿ":438,"ಾರು":321,"ಾರೆ":385,"ಾರ್":898,"ಾಲದ":109,"ಾಲಯ":138,"ಾಯಕ":132,"ಾಮಾ":287,"ಾಮಿ":92,"ಾಯನ":84,"ಾರಂ":66,"ಾಮ್":142,"ಾರಗ":62,"ಾರಕ":63,"ಾಯಿ":381,"ಾರದ":175,"ಾರತ":711,"ಾರಣ":117,"ಾಯು":52,"ಾರರ":107,"ಿಕಿ":78,"ಿಕಾ":233,"ಾವಣ":88,"ಾವಳ":55,"ಿಕೆ":452,"ಿಕೊ":215,"ಿಕ್":429,"ಾವು":111,"ಾವಿ":144,"ಿಗಳ":606,"ಾವ್":94,"ಾಲು":54,"ಾಲೂ":145,"ಾಲಿ":154,"ಾಲ್":298,"ಾಲೆ":294,"ಾಳಿ":56,"ಿಕವ":96,"ಿಕರ":56,"ಿಕದ":55,"ಾನಿ":272,"ಾನೆ":71,"ಾಧ್":78,"ಾನವ":177,"ಾನ್":541,"ಾಪಕ":91,"ಾದಿ":124,"ಾಧನ":72,"ಾದವ":55,"ಾದರ":121,"ಾನದ":188,"ಾದ್":141,"ಾನಗ":87,"ಾಮದ":98,"ಾಪು":93,"ಾಪಿ":108,"ುವ ":1824,"ಾಪ್":60,"ಿಂದ":1207,"ಿಂತ":88,"ಾಡಲ":88,"ಂಗ ":59,"ಾಡು":222,"ಾಡಿ":260,"ಾಟ್":67,"ಿಂಗ":466,"ಾಟಿ":63,"ಾಟಕ":338,"ಾತ್":294,"ುರ ":111,"ಾತನ":55,"ಾತಿ":122,"ಾತು":101,"ಾಣದ":83,"ಾಣಿ":182,"ಾಣು":83,"ಾಗ್":60,"ಾಗಿ":2495,"ಾಗು":715,"ಾಗೂ":350,"ಾಗರ":116,"ಾಗಲ":56,"ಾಗವ":78,"ಾಗದ":149,"ಾಕ್":170,"ಾಗಗ":52,"ಾಕಾ":66,"ಾಕಿ":70,"ಂತ ":350,"ಾಜ್":424,"ಾಜಿ":104,"ಾಜಧ":76,"ಾಜಕ":69,"ಾಚಾ":64,"ಂಡ ":134,"ಸರಾ":57,"ಸರಿ":110,"ಸರು":177,"ಸರ್":187,"ಸಮಾ":89,"ಸಮು":96,"ಷ್ಣ":158,"ಷ್ಟ":615,"ಷ್ಯ":148,"ಸಸ್":84,"ಸಲ್":204,"ಸಲು":129,"ಸಲಾ":407,"ಸೂರ":160,"ಸುಮ":185,"ಸುವ":474,"ಸುತ":345,"ಸಿನ":84,"ಸಿಯ":74,"ಸಿರ":109,"ಸಿಸ":62,"ಸಾರ":89,"ಸಾಯ":93,"ಸಾಮ":345,"ಸಿಕ":210,"ಸಾಲ":56,"ಸಾಧ":119,"ಸಿದ":904,"ಸಾಹ":178,"ಸಿಂ":68,"ಸಾಗ":104,"ಸಾಂ":87,"ಸೇವ":115,"ಸೇರ":192,"ಸೆಪ":83,"ಹದಿ":73,"ಸೆಂ":112,"ಹತ್":113,"ಶ್ಚ":149,"ಶ್ರ":321,"ಶ್ವ":281," ದ ":69,"ಶೇಷ":56," ನ ":221,"ಶೋಧ":52," ರ ":72,"ಷಿಣ":245,"ಷೇತ":113,"ಷೆಯ":163,"ಷೆಗ":96,"ಸತ್":52," ದೂ":111," ದಾ":111," ದಿ":826," ನಂ":173,"ಾಜ ":54," ತೆ":86," ದಕ":243," ತಾ":436," ತಿ":392," ತೀ":71," ತು":140," ತನ":105," ತಯ":72," ತಮ":160," ತರ":68," ತಂ":275," ಡಿ":173," ಡಾ":64," ಟ್":64," ಟೆ":75," ಮಲ":65," ಮರ":148," ಮಹ":275," ಮೂ":466," ರಂ":117," ಮೀ":91," ಮು":503," ಮಾ":930," ಮಿ":112," ಮ್":74," ಮೊ":287," ಮೋ":54," ಮೈ":149," ಮೆ":105," ಮೇ":303," ಬ್":257," ಭೂ":150," ಮತ":1392,"ಾದ ":915," ಭಾ":1269," ಭೌ":68," ಮಧ":122," ಮನ":105," ಫ್":115," ಬಳ":309," ಬಲ":53," ಬರ":237," ಬಹ":140," ಬಾ":244,"ಾಣ ":79," ಬಿ":202," ಬೀ":65," ಮಂ":165," ಬು":58," ಬೆ":486," ಬೇ":140," ಪೋ":52," ಪ್":2223," ಬಗ":131,"ಾತ ":80," ಬಣ":53," ಬದ":90," ಫೆ":69," ಪಶ":122," ಪರ":473," ಪೂ":178," ಪು":317," ಬಂ":203," ಪಿ":89," ಪಾ":318," ಪಕ":77," ನ್":129," ನೈ":64," ನೋ":69," ಪದ":311," ಪತ":98," ಪಟ":103," ಪಡ":148," ನಲ":117," ನವ":140," ನೀ":205," ಪಂ":148," ನೇ":153," ನೆ":157," ನಿ":719," ನಾ":379," ಧರ":127," ನಗ":232," ದೇ":439," ದೊ":195," ದ್":175," ನಡ":222," ನದ":175,"ಾನ ":293," ಧಾ":127," ನಟ":71," ವೇ":103," ವೈ":156," ವೆ":106," ವ್":384,"ಾಮ ":87," ವಸ":135," ವಿ":1423," ವಾ":355," ಶತ":99," ಶ್":256," ಶಿ":219," ಶಾ":196," ಸಂ":1268," ವರ":800,"ಾಲ ":93," ಲೋ":89," ಲ್":70," ಲೇ":112," ಲಿ":71," ಯು":227," ಯಾ":161," ರಚ":137,"ಾರ ":382," ರೋ":82," ರೇ":54," ರೀ":66," ರೂ":111," ರಾ":953," ರಿ":89," ರಸ":87," ರವ":64,"ಾಯ ":86," ರಲ":134,"ಿಕ ":878,"ಾಳ ":52," ಹು":167," ಹೆ":576," ಹೇ":108," ಹಾ":768," ಹಿ":369," ಹೊ":508," ಹೋ":156,"ಾಸ ":73," ಸದ":81," ಹಂ":55," ಸಣ":52," ಸರ":313," ಸಲ":64," ಸಮ":357," ಸೆ":143," ಸೇ":325," ಹದ":68," ಹಣ":89," ಸೂ":152," ಹತ":79," ಸು":425," ಸಾ":912," ಸಿ":289," ಸಸ":67," ಸಹ":107," ಹಳ":93," ಹಲ":156," ಹರ":112," ಸ್":892," ಸೌ":54," ಸೋ":68," ಸೈ":56," 
ಅಂ":408,"ಸ್ಥ":756,"ಸ್ಪ":98,"ಸ್ಯ":165,"ಸ್ಕ":258,"ಸ್ಟ":445,"ಸ್ತ":931," ಆಟ":96," ಇಂ":246," ಆಡ":86," ಆಚ":60," ಅವ":421," ಆಗ":261," ಅಸ":94," ಅಲ":154," ಆಕ":73," ಅಮ":216," ಅಭ":111,"ಸ್ವ":337,"ಸ್ಸ":91," ಅರ":279," ಅಪ":94," ಅದ":274," ಅಥ":645," ಅನ":330," ಅಧ":417," ಅತ":346," ಅಡ":73," ಆಂ":116," ಅಗ":62," ಅಕ":181,"ಹರಿ":90,"ಹಲವ":148,"ಹಳ್":97," ಊರ":63," ಎನ":126," ಎತ":68," ಎಂ":869," ಇಪ":84," ಇನ":85," ಇಲ":141," ಇರ":180," ಇವ":618," ಉಂ":56," ಆದ":131," ಆಧ":97," ಆಫ":103," ಆಯ":65," ಆರ":223," ಆಲ":68," ಆಳ":54," ಆವ":57," ಆಸ":93," ಇತ":184," ಇದ":1044," ಉಪ":269," ಉದ":185," ಉತ":320,"ಹಾಸ":180,"ಹಿತ":251,"ಹಿಡ":63,"ಹಾರ":180," ಒಳ":146,"ಹಾಕ":59,"ಹಾಗ":543,"ಹಿಂ":230," ಕಂ":258," ಕರ":597,"ಹುಟ":85," ಕಲ":170," ಕನ":397," ಕಥ":54,"ಹುದ":165," ಕಟ":67," ಕಣ":68," ಖಂ":56," ಕಡ":102," ಕ್":901,"ಹಿಸ":111," ಕೆ":308," ಕೇ":216," ಕೊ":245," ಕೋ":98," ಕಾ":889," ಗಂ":57," ಕಿ":200," ಕೂ":99," ಕು":307," ಕೃ":156," ಕವ":70," ಏಪ":62," ಎಲ":152," ಎರ":162," ಏಷ":55," ಒಂ":1369," ಒಬ":182," ಒಟ":63," ಚೆ":63," ಚಿ":486," ಚಾ":99,"ಹೆಸ":279," ಜಗ":78," ಜನ":678,"ಹೆಚ":200," ಜಲ":69," ಜೋ":71," ಜೊ":56," ಜಿ":489," ಜಾ":210," ಜು":99," ಜೀ":215," ಜೂ":94,"ಹೊಂ":267," ಜ್":80,"ಹೇಳ":76," ಗಣ":150,"ಹೋಗ":60,"ಹೊಸ":67,"ಾಗ ":150," ಖ್":54," ಗಳ":238," ಘಟ":60," ಗಾ":155," ಗು":365," ಗಿ":82,"ಹೊರ":79," ಗ್":584," ಗೋ":85," ಚಂ":64,"ಹ್ಯ":58," ಚಲ":126," ೨೦":198," ೧೯":643," ೧೨":59," ೧೧":53," ೧೪":61," ೧೬":69," ೧೫":80," ೧೮":164," ೧೭":110," ೩೦":63," ೨೫":63,"ಳ್ಳ":382,"ಳೆಯ":131," ೧೦":72,"ಶ್ ":57,"ಳೂರ":208,"ಳುವ":137,"ವಣೆ":61,"ಷ್ ":79,"ಷೆ ":91,"ಲ್ಯ":182,"ಲ್ಲ":5639,"ಲ್ಪ":335,"ಲ್ಕ":111,"ಲೋಹ":64,"ಲೊಂ":55,"ಳನ್":731,"ಲೆಂ":287,"ಲೇಖ":144,"ಲೆಕ":53,"ಲೆಗ":94,"ಲೆಯ":429,"ಲಿಯ":252,"ಲಿರ":303,"ಲಿಸ":109,"ವ್ ":88,"ಲೂಕ":200,"ಳಿಂ":159,"ಳಾಗ":68,"ಳಿನ":117,"ಳಿತ":74,"ಳಿದ":77,"ಳಿಕ":53,"ಳಿಗ":310,"ವಂತ":117,"ಳಿಸ":102,"ಳಿಯ":166,"ಷದ ":67,"ಳಸಲ":88,"ಳಲ್":724,"ಷಗಳ":63,"ವೇದ":77,"ಶದಲ":55,"ವ್ಯ":542,"ಸಂಘ":68,"ಶಿಷ":59,"ಶಿಸ":74,"ಸಂಖ":127,"ಶಿವ":89,"ಸಂಗ":227,"ಸಂಕ":77,"ಶಿಯ":58,"ಸಂದ":52,"ಸಂಸ":304,"ಸಂಯ":79,"ಸಂಪ":131,"ಸಂಬ":98,"ಷದಲ":459,"ಶಸ್":149,"ಶಾಸ":280,"ಶಾಲ":53,"ಶಿಕ":64,"ವರನ":53,"ವರೆ":141,"ವರಾ":53,"ವರು":761,"ವರಿ":316,"ವರ್":898,"ವಳಿ":93,"ಸಿ ":303,"ವಲ್":60,"ವತ್":64,"ವಧಿ":53,"ವದಲ":53,"ವನ್":720,"ವಾಗ":1342,"ವಾದ":543,"ವಾತ":70,"ವಾಣ":65,"ವಾಡ":79,"ವಾಮ":73,"ವಾಯ":90,"ವಾರ":197,"ವಿಗ":77,"ವಾಸ":169,"ವಾಲ":67,"ವಿಕ":218,"ವಾಹ":85,"ವಿಜ":170,"ವಿಧ":170,"ವಿದ":296,"ವಿನ":163,"ವಿತ":66,"ವಿಭ":117,"ವಿರ":130,"ವಿಮ":65,"ವಿಯ":106,"ವಿವ":124,"ವಿಲ":62,"ವಿಸ":121,"ವಿಶ":329,"ವಿಷ":94,"ವೀಪ":59,"ವುಗ":98,"ವುದ":425,"ಸ್ ":791,"ವೃತ":84,"ಶತಮ":88,"ವೆಂ":214,"ವವಿ":62,"ಶಕ್":84,"ಸು ":82,"ಶಗಳ":199,"ವಹಿ":89,"ವಸ್":273,"ಮೊದ":199,"ಯನಿ":63,"ಯನ್":821,"ಮೈಸ":89,"ಯಮ್":96,"ಮ್ಮ":313,"ಮ್ರ":69,"ಮ್ಯ":104,"ಲಿ ":3648,"ರಕಾ":170,"ಲಾ ":127,"ಯಲ್":813,"ಯಲಾ":117,"ಯರು":78,"ರಚನ":78,"ಲೂ ":94,"ಲು ":400,"ರಗಳ":381,"ಯವಾ":336,"ಯವು":63,"ಯವಸ":110,"ರಕ್":198,"ಯವರ":158,"ಯವನ":100,"ಮಹಾ":203,"ಲದ ":112,"ಮಾಜ":102,"ಮಾನ":565,"ಮಾಣ":151,"ಮಾಡ":381,"ಮಾತ":89,"ಮಾಹ":71,"ಮಾರ":439,"ಮಾಲ":61,"ಮಿಕ":75,"ಮಿಯ":113,"ಮಿತ":67,"ಮೂರ":118,"ಮೂಲ":335,"ರಂದ":193,"ರಂಭ":108,"ರಂಗ":139,"ಮಿಸ":75,"ಮಿಳ":54,"ಮುಂ":173,"ಮುದ":121,"ಮುಖ":451,"ಮೆರ":66,"ಮೇಲ":169,"ಮೇರ":140,"ಯತೆ":52,"ಲಯ ":63,"ಯದಲ":149,"ಯೋಗ":170,"ಯ್ಯ":64,"ರರಾ":68,"ರರಂ":62,"ರಮು":225,"ರಮಾ":162,"ರರು":63,"ಳಿ ":203,"ರಲ್":544,"ರಶಸ":112,"ಳು ":1195,"ರವು":117,"ರವಾ":329,"ರವಿ":59,"ರವರ":144,"ರವನ":89,"ರಸಿ":199,"ರಸಾ":67,"ರಸ್":141,"ಯಸ್":73,"ರಚಿ":80,"ಲೆ ":252,"ಲೇ ":70,"ರಜ್":65,"ಯಾಂ":162,"ಯಾಕ":83,"ಲೈ ":93,"ಯಾಗ":387,"ಯಾದ":236,"ಯಾತ":133,"ಯಿಂ":142,"ಯಾಟ":89,"ಯಾಪ":150,"ಯಾನ":165,"ಯಾವ":134,"ಯಾಯ":90,"ಯಾರ":155,"ಯಾಲ":340,"ಯಾಸ":109,"ಯಿತ":172,"ಯಿಸ":93,"ರಡನ":86,"ಯುಕ":79,"ರಡು":80,"ಯುತ":201,"ಯುದ":79,"ಯುರ":59,"ಯುವ":150,"ರತದ":423,"ರಣೆ":97,"ರಣಿ":54,"ಲ್ 
":636,"ರಣವ":59,"ರತ್":73,"ರತಿ":180,"ರತೀ":142,"ರದರ":55,"ರದಲ":281,"ರದೇ":250,"ಯೆಯ":95,"ಯೆಗ":53,"ರದಾ":69,"ರದಿ":71,"ರನ್":145,"ರಪಂ":68,"೨೦೦":138,"ರ್ಮ":390,"ರ್ಯ":406,"ರ್ವ":378,"ರ್ಶ":130,"ರ್ಷ":704,"ರ್ಸ":72,"ರ್ಟ":71,"ರ್ಡ":92,"ರ್ತ":167,"ರ್ಣ":195,"ರ್ದ":177,"ರ್ಥ":249,"ರ್ನ":341,"ರ್ಧ":61,"ರ್ಪ":57,"ರ್ಗ":212,"ರ್ಕ":216,"ರ್ಜ":96,"ರ್ಚ":113,"ವನ ":77,"ವದ ":73,"ಲಕ್":112,"ರೂಪ":165,"ರುವ":1148,"ರೀಯ":191,"ರೀತ":68,"ರೀಡ":60,"ರಿಸ":509,"ರಿವ":56,"ರೀಕ":90,"ರಿಲ":102,"ರಿಯ":766,"ರುದ":52,"ರುತ":402,"ರುಗ":103,"ರಿಂ":256,"ರಾಟ":71,"ರಾತ":76,"ರಾಣ":153,"ರಾನ":117,"ರಾದ":125,"ರಾಕ":64,"ರಾಗ":268,"ರಾಜ":696,"ರಾಚ":61,"ರಿಟ":72,"ರಿತ":118,"ರಿದ":175,"ರಿನ":273,"ರಾಯ":125,"ರಾಮ":296,"ರಾರ":74,"ರಾವ":128,"ರಿಕ":602,"ರಾಷ":292,"ರಿಗ":296,"ರಾಸ":60,"೧೯೭":59,"೧೯೬":77,"ಳೆ ":85,"೧೯೯":64,"೧೯೩":78,"೧೯೨":62,"೧೯೫":67,"೧೯೪":74,"ರಾಂ":137,"ರೋಗ":76,"ರೋಪ":59,"ರೆಗ":378,"ರೆದ":71,"ರೆಯ":331,"ವಿ ":195,"ವಾ ":561,"ವು ":807,"ಲವು":161,"ಲವಾ":103,"ಳಕೆ":83,"ಲಯದ":79,"ವೆ ":451,"ಲಾಯ":82,"ಲಿಗ":61,"ಲಿಕ":80,"ಲಾವ":62,"ಲಿನ":461,"ಲಿದ":161,"ಲಾಗ":564,"ಲಿಂ":75,"ಲಾದ":129,"ಳಗೊ":121,"ವೂ ":55,"ಶದ ":217,"ಲದಲ":69,"ವರ ":374,"ಲಧಾ":75,"ಲನಚ":87,"ಪ್ಟ":118,"ಪ್ರ":2498,"ಪ್ಯ":75,"ಪ್ಪ":318,"ಪೂರ":201,"ಪಿಸ":139,"೬ನೇ":54,"ಬಂಧ":106,"ಬಂದ":116,"ಪುಟ":63,"ಪುರ":302,"ಮದ ":92,"ಪಶ್":122,"ಪಿಯ":85,"ಪಾರ":115,"ಪಾಲ":72,"ಪಾತ":114,"ಪಾದ":107,"ಪರ್":164,"ಪಯೋ":105,"ಪರಮ":64,"ಪರಿ":273,"ಫ್ರ":149,"ಬರಿ":53,"ಬರು":198,"ಬರೆ":81,"ಬರ್":426,"ಬಳಕ":77,"ಮ್ ":312,"ಬಣ್":56,"ಬದಲ":69,"ಫೆಬ":56,"ಯದ ":342,"ಮೇ ":71,"ಯನ ":58,"ಮೆ ":97,"ಮಿ ":137,"ಬಗ್":95,"ರಾ ":66,"ರಿ ":689,"ಬ್ದ":64,"ಬ್ಬ":413,"ಬ್ಯ":92,"ಬ್ರ":245,"ಬೇಕ":58,"ಬೆಳ":196,"ಬೆಲ":56,"ಬೆಟ":70,"ಬೇರ":65,"ಬೆಂ":161,"ಬುದ":201,"ಮಂಡ":85,"ಮಂತ":82,"ಬಾಲ":61,"ಬಾರ":62,"ಯೇ ":65,"ಯೆ ":129,"ರದ ":417,"ಬಹು":249,"ರತ ":132,"ರಣ ":118,"ಯೂ ":79,"ಯು ":397,"ಯಿ ":133,"ಬಳಸ":168,"ಯಾ ":166,"ಬಳಿ":64,"ಬಲ್":56,"ಯಕರ":55,"ಯಕ್":233,"ಯಗಳ":256,"ಮರಾ":59,"ಮರ್":71,"ಮಧ್":119,"ಲಕ ":102,"ಮನ್":73,"ಮನೆ":57,"ರ್ ":1282,"ಭೂಮ":114,"ಮದಲ":87,"ಮತ್":1394,"ಭಾಷ":355,"ಭಾವ":93,"ಭಿನ":73,"ಭಾಗ":330,"ಭಾರ":739,"ಯಂತ":301,"ರೆ ":804,"ರು ":2581,"ಮಗಳ":92,"ಭವಾ":52,"ರೀ ":91,"ಮಕ್":73,"ರೂ ":136,"ಥೆಯ":123,"ದನೆ":56,"ದನ್":281,"ದರಿ":133,"ದರು":453,"ದರೆ":272,"ದರೂ":102,"ದರಲ":91,"ದರ್":155,"ದಲಾ":57,"ದಲು":62,"ದಲ್":2055,"ದವನ":56,"ದವು":75,"ದವರ":138,"ದಸ್":52,"ಪದ ":80,"ದಾಗ":201,"ದಿನ":874,"ದಿದ":212,"ದಿಸ":72,"ದಿರ":178,"ದಿಯ":160,"ದಾದ":115,"ದಿಂ":295,"ದಿಕ":52,"ದಿಗ":178,"ದಾರ":359,"ದಾಯ":84,"ದಾನ":52,"ದುರ":81,"ದುಕ":71,"ನಂತ":150,"ನಂದ":72,"ದೂರ":133,"ಧತಿ":57,"ದೇಶ":726,"ದೇವ":195,"ಧನೆ":64,"ದೊಡ":174,"ದ್ದ":1224,"ದ್ಧ":445,"ದ್ವ":154,"ದ್ಯ":488,"ದ್ರ":521,"ಪಿ ":59,"ಧರ್":147,"ನಕ್":67,"ನಗಳ":229,"ನಗರ":304,"ಪು ":78,"ನಚಿ":88,"ನಡೆ":173,"ನಡು":55,"ಧಿಸ":90,"ನಡದ":107,"ಧಿಯ":80,"ಧಾತ":90,"ಧಾನ":211,"ಧಾರ":211,"ಧಿಕ":334,"ನದಲ":85,"ಪ್ ":121,"ನನ್":63,"ನದಿ":214,"ನಪ್":62,"ಧ್ಯ":309,"ನಲ್":436,"ನವನ":120,"ನವರ":180,"ನವಾ":90,"ನವು":53,"ನವೆ":88,"ನುವ":99,"ನೀರ":61,"ನೀಡ":120,"ಪಂದ":55,"ನಿಸ":295,"ಪಂಚ":105,"ಫ್ ":93,"ನಿವ":80,"ನಿಲ":70,"ನಿಯ":276,"ನಿರ":415,"ನಿಗ":99,"ನಾದ":62,"ನಾಲ":99,"ನಿಕ":181,"ನಾಮ":59,"ನಾಯ":110,"ನಾರ":70,"ನಾಗ":147,"ನಾಥ":61,"ನಾಟ":338,"ನಿಂ":150,"ನಾಡ":89,"ನೆಗ":163,"ನೇಕ":77,"ನೆಲ":86,"ನೆಯ":329,"ಪಕರ":55,"ಬಿ ":56,"ನ್ಸ":171,"ನ್ಯ":464,"ನ್ಮ":57,"ನ್ನ":3456,"ಪಕ್":91,"ಪದವ":93,"ಪತ್":245,"ಪತಿ":53,"ಬ್ ":71,"ಪಡೆ":154,"ಪಡು":141,"ಪಡಿ":52,"ಪಟ್":240,"ಪನ್":57,"ಪನಿ":89,"ಪದ್":89,"ಡೆಸ":85,"ಡೆದ":156,"ಡೆಯ":184,"ಥವ ":102,"ಡ್ಡ":232,"ಡಿತ":54,"ಡಿನ":68,"ಡಿದ":293,"ಡಿರ":86,"ಡಿಯ":235,"ಡಿಸ":241,"ಡಿಕ":76,"ಡಿಗ":58,"ಡುತ":146,"ಡುವ":315,"ಡುಗ":71,"ತ್ ":173,"ಡೆಗ":53,"ಥೆ ":99,"ಣಗಳ":143,"ಣರಾ":65,"ತಂದ":116,"ತಂತ":159,"ತಂಡ":77,"ದಲ ":85,"ಣದಲ":85,"ದರ ":265,"ದೇ ":231,"ದೆ ":1948,"ಣಿಗ":82,"ಣಿಯ":81,"ಣಿಸ":84,"ಣಿತ":57,"ದಿ ":230,"ದೂ ":160,"ಣವಾ":87,"ತಗಳ":66,"ದು ":3600,"ತರಾ":116,"ತಮ್":117,"ತರದ":65,"ತಯಾ":73,"ಣ್ಯ":74,"ತಮಾ":89,"ಣ್ಣ":250,"ಣೆಯ":103,"ತದೆ":688,"ತನ್":101,"ದ್ 
":83,"ತದಲ":113,"ಣೆಗ":103,"ದಂತ":81,"ತುಗ":83,"ತೀಯ":160,"ತೀರ":77,"ತುವ":100,"ನನ ":106,"ತಿಂ":268,"ತಾನ":117,"ತಾದ":59,"ತಾರ":223,"ತಾಯ":91,"ತಿಗ":244,"ತಿಕ":121,"ತಾಲ":214,"ತಿನ":126,"ತಿತ":52,"ತಿದ":136,"ತಿರ":166,"ತಿಯ":448,"ತಿಸ":85,"ತಿಹ":93,"ತಿಳ":56,"ನದ ":230,"ಧಿ ":63,"ತವೆ":173,"ನಡ ":231,"ತವಾ":246,"ನಿ ":257,"ದಕ್":378,"ನಾ ":82,"ತ್ವ":161,"ತ್ಸ":90,"ತ್ಮ":97,"ತ್ಯ":494,"ತ್ರ":1536,"ತ್ಪ":98,"ತ್ನ":61,"ತ್ತ":3794,"ತೆಯ":160,"ತೆಗ":110,"ಥೆಗ":52,"ನ್ ":1307,"ಥಾನ":142,"ಥಾಪ":134,"ಥಿತ":69,"ನೆ ":323,"ನೇ ":743,"ನೂ ":58,"ಥವಾ":557,"ದಗಳ":58,"ನು ":2464,"ಟಿ ":147,"೧೦ ":57,"ಟು ":213,"ಟೆ ":76,"ಜಿನ":61,"ಜಿಲ":446,"ಜಾಲ":52,"ಜಾತ":92,"ಜುಲ":91,"ಜೀವ":240,"ಟ್ ":637,"ಜೂನ":90,"ೆನ":244,"ೆದ":362,"ೆಬ":114,"ೆಪ":117,"ೆರ":365,"ೆಯ":2599,"ೆಮ":88,"ೇಖ":200,"ೆವ":58,"ೇಕ":249,"ೆಳ":254,"ೆಲ":491,"ೆಸ":577,"ೆಹ":62,"ೇಗ":75,"ೇಜ":68,"ೇಟ":120,"ೇದ":127,"ೇತ":222,"ೆಕ":157,"ೆಗ":1392,"ೆಚ":219,"ೆಟ":300,"ೆಡ":111,"ೇಂ":180,"ೈನ":129,"ೈವ":57,"ೈಲ":128,"ೈಸ":184,"ೊಂ":945,"ೇಮ":53,"ೇನ":121,"ೈಕ":103,"ೇಳ":166,"ೇಶ":901,"ೇವ":379,"ೇರ":613,"ೇಯ":97,"ೇಲ":240,"ೈಟ":55,"ೇಷ":160,"ೇಸ":99,"ೈದ":86,"ೂಪ":194,"ೂರ":1041,"ೂಮ":136,"ೂತ":53,"ೂನ":169,"ೂಲ":387,"ion":67,"ುಪ":145,"ುಬ":131,"ುಭ":63,"ುಮ":379,"ುರ":798,"ುಡ":140,"ುಣ":155,"ುತ":1697,"ುದ":1164,"ುನ":256,"ೂಚ":55,"ುಹ":87,"ೂಟ":104,"ೂಡ":144,"ುಲ":232,"ೂಕ":272,"ುಳ":152,"ುವ":2954,"ುಸ":149,"ುಷ":80,"ೆಂ":1198,"ೃತ":380,"ೃದ":63,"ೃಷ":194,"ಜ್ಯ":449,"ಜ್ಞ":280,"್ರ":7384,"್ಯ":5185,"್ಮ":1098,"್ಭ":55,"್ಳ":385,"್ಲ":6122,"್ಸ":715,"್ಷ":1648,"್ಶ":149,"್ವ":1581,"ೋತ":79,"ೋದ":91,"ೋನ":101,"ೋಧ":91,"ೋಪ":132,"ೋಜ":72,"ೋಟ":142,"ೋಡ":153,"ೊಲ":98,"ೊಳ":303,"ೋಕ":95,"ೋಗ":340,"ೊಸ":100,"ೊಬ":113,"ೊಮ":81,"ೊರ":184,"ೊತ":77,"ೊನ":118,"ೊದ":217,"ೊಡ":288,"ೊಟ":69,"ೊಗ":56,"್ಪ":943,"್ಬ":531,"್ಫ":53,"್ದ":1537,"್ಥ":1015,"್ನ":4079,"್ಧ":516,"್ಡ":394,"್ತ":5341,"್ಣ":606,"್ಞ":286,"್ಠ":60,"್ಟ":2345,"್ಚ":591,"್ಜ":168,"್ಕ":1695,"್ಗ":432,"ೌರ":85,"ೌಲ":59,"ೌತ":60,"ೋಹ":109,"ೋಷ":61,"ೋಸ":106,"ೋವ":84,"ೋಶ":58,"ೋಳ":74,"ೋಲ":103,"ೋರ":455,"ೋಮ":87,"ೋಬ":137,"೦೦ ":84,"ಜಗತ":55,"ಟದ ":65,"೧೯":659,"೧೮":176,"೧೭":128,"೧೬":85,"೧೫":91,"೧೪":72,"೧೨":74,"೧೧":70,"೪ನ":53,"೨೦":221,"೨೧":53,"೦೦":269,"೧೦":100,"೦ರ":61,"೦ದ":80,"ಜಧಾ":74,"ಜನಪ":57,"ಜನನ":108,"ಜನರ":79,"ಜನಸ":53,"ಜನವ":106,"ಜನಿ":197,"೯೯":86,"೯೭":74,"೯೮":65,"೯೫":81,"೯೬":94,"೯೩":91,"೯೪":81,"೯೧":62,"೯೨":81,"೯೦":57,"೯ರ":54,"೮ರ":60,"೫೦":53,"೬ರ":52,"೬ನ":59,"೫ರ":52,"೫ನ":55,"೩೦":80,"೨೪":61,"೨೫":72,"೨೨":62,"೨೩":65,"೨೯":62,"೨೬":59,"೨೭":59,"ಏಷ":56,"ಒಂ":1369,"ಒಬ":183,"ಒಟ":63,"ಒಳ":146,"ಕಂ":287,"ಚರಿ":67,"ಕಪ":63,"ಕನ":521,"ಕಥ":62,"ಕದ":288,"ಕಳ":92,"ಖಕ":55,"ಕಲ":280,"ಕರ":1275,"ಕಮ":65,"ಕಗ":118,"ಕಕ":54,"ಕತ":77,"ಕಣ":99,"ಕಡ":111,"ಖಂ":114,"ಕಟ":188,"ಕೆ":1476,"ಕೇ":322,"ಖನ":70,"ಕೈ":57,"ಕೊ":554,"ಕೋ":259,"ಕ್":4088,"ಖರ":77,"ಕವ":314,"he ":68,"ಕಾ":1989,"ಗಂ":72,"ಕೀ":159,"ಕಿ":814,"ಕೂ":238,"ಕು":604,"ಕೃ":380,"ಗನ":75,"ಖ್":433,"ಗಮ":88,"ಚಲನ":114,"ಗಲ":148,"ಗರ":505,"ಗವ":163,"ಗಳ":4635,"ಗಗ":111,"ಗಡ":160,"ಖಾ":80,"ಗದ":413,"ಗತ":118,"ಗಣ":242,"ಗೆ":1418,"ಗು":1317,"ಗೂ":445,"ಗಿ":3142,"ಗೀ":235,"ಘಟ":107,"ಗಾ":676,"ಗಸ":140,"ಗ್":1370,"ಗೌ":65,"ಗೋ":407,"ಗೊ":342,"ಚಂ":92,"ಚಕ":52,"ಚನ":130,"ಚದ":87,"ಚಿಮ":122,"ಚಿನ":103,"ಚಲ":161,"ಚಿಸ":104,"ಚರ":168,"ಚಾರ":154,"ಚೆ":127,"ಚಾ":329,"ಚಿ":1078,"ಚೀ":83,"ಚು":203,"ಜಕ":77,"ಜಗ":96,"ಚಿತ":435,"ಚ್":491,"ಚಿಕ":171,"ಜನ":825,"ಜಧ":76,"ಜಲ":84,"ಜರ":102,"ಜಯ":76,"ಜು":167,"ಜೀ":273,"ಜೂ":99,"ಜಿ":789,"ಜಾ":390,"ಜೋ":82,"ಜೊ":58,"ಜೆ":97,"ಜ್":917,"ಟಕ":392,"ಟಗ":141,"ಞಾ":215,"ಟಣ":83,"ಟದ":127,"ಟನ":129,"ಟರ":243,"ಟವ":113,"ಟಾ":225,"೨ ":199,"೧ ":173,"೪ ":159,"ಂಸ":383,"ಂಶ":181,"ಂವ":61,"ಂಯ":79,"ಂಬ":1236,"ಂಭ":148,"ಂಪ":432,"ಂದ":5006,"ಂಧ":211,"ಂತ":1631,"ಂಥ":91,"ಂಟ":300,"ಂಡ":1562,"ಂಚ":262,"ಂಜ":146,"ಂಗ":1536,"ಂಖ":130,"೩ ":171,"ಂಘ":78,"ಂಕ":337,"ಅಗ":63,"ಅಕ":181,"೬ ":165,"ಅತ":348,"ಅಡ":73,"ಆಂ":116,"ಚೀನ":76,"೫ ":171,"ಅಂ":414,"ಆಸ":93,"೮ 
":158,"ಇದ":1048,"ಇತ":185,"ಆಫ":104,"ಆಧ":97,"ಆದ":131,"ಆವ":58,"ಆಳ":54,"ಆಲ":68,"ಆರ":228,"ಆಯ":65,"ಆಚ":60,"ಅವ":422,"ಆಗ":261,"೭ ":172,"ಅಸ":94,"ಆಟ":97,"ಇಂ":249,"ಆಡ":86,"ಅಪ":94,"ಅದ":275,"ಅಥ":647,"ಅನ":333,"ಅಧ":417,"ಅಲ":154,"ಆಕ":75,"ಅಮ":219,"ಅಭ":111,"ಅರ":281,"ಚ್ಚ":307,"ಉತ":320,"ಉದ":186,"ಇಲ":141,"ಇರ":180,"ಇವ":619,"ಇಪ":84,"ಇನ":88,"ಉಂ":56,"೯ ":167,"ಊರ":63,"ಉಪ":269,"ಟಕ ":183,"ಎಂ":888,"ಎಲ":159,"ಎರ":162,"ಎಸ":63,"ಏಪ":62,"ಎಎ":79,"ಎತ":68,"ಎನ":130,"ಲೂ":332,"ಳದ":98,"ಲೇ":336,"ಳನ":786,"ಲೆ":1225,"ಲೈ":134,"ಲೋ":199,"ಲೊ":116,"ಲ್":7197,"ಳಲ":778,"ಳಕ":140,"ಲವ":387,"ಳಗ":277,"ಲಸ":91,"ಲಾ":1226,"ಲಿ":5377,"ಲೀ":90,"ಲು":562,"ಳೆ":375,"ಳ್":432,"ಜಿ ":102,"ಳವ":134,"ಳಸ":184,"ಳಾ":158,"ಳು":1449,"ಳೂ":229,"ಳಿ":1334,"ವಂ":199,"ರೇ":335,"ರೆ":1812,"ರು":4505,"ರೀ":683,"ಲಂ":108,"ರೂ":345,"ರ್":5601,"ರೈ":117,"ರೋ":413,"ರೊ":96,"ರವ":930,"ರಶ":156,"ರರ":275,"ರಲ":578,"ರಳ":88,"ರಾ":3110,"ರಿ":4229,"ರಸ":502,"ರಹ":213,"ಲನ":192,"ಲಧ":79,"ಲದ":305,"ಲತ":56,"ಲರ":69,"ಲಯ":223,"ಲಗ":86,"ಲಕ":273,"ಷರ":95,"ಸಕ":90,"ಷವ":55,"ಸಗ":60,"ಷನ":59,"ಶೇ":96,"ಶೈ":56,"ಶೋ":71,"ಶ್":926,"ಶಿ":491,"ಶಾ":465,"ಶು":62,"ಶೀ":53,"ಸಂ":1411,"ಷತ":59,"ಷಣ":159,"ಷದ":539,"ಷಗ":72,"ಶಸ":149,"ಸಲ":766,"ಸರ":765,"ಹಗ":69,"ಹಕ":66,"ಸವ":133,"ಷೇ":140,"ಸನ":125," of":57,"ಸಮ":419,"ಷ್":1158,"ಸಬ":77,"ಸಣ":55,"ಹಂ":68,"ಷಿ":475,"ಷೆ":352,"ಸದ":188,"ಸತ":71,"ಷಾ":85,"ವಲ":178,"ವಳ":120,"ಶಕ":218,"ವಯ":77,"ವರ":2811,"ವದ":177,"ವಧ":55,"ವನ":1004,"ವತ":235,"ವಣ":147,"ವಜ":56,"ವಕ":121,"ವಗ":66,"ಷಕ":61,"ಶವ":121,"ಶರ":52,"ವ್":725,"ಶಬ":54,"ವೊ":56,"ವೇ":345,"ಶನ":143,"ವೈ":184,"ಶದ":301,"ವೆ":860,"ಶತ":100,"ವೃ":137,"ವೂ":60,"ವು":1384,"ವೀ":183,"ವಿ":2666,"ವಾ":3557,"ವಹ":172,"ವಸ":327,"ಶಗ":201,"ವವ":155,"ಸಾ":1235,"ಸಿ":2058,"ಸಸ":92,"ಸಹ":114,"ಸೆ":366,"ಹನ":99,"ಸೇ":360,"ಹದ":135,"igh":65,"ಸೂ":287,"ಹಣ":147,"ಹತ":133,"ಸೀ":80,"ಸು":1321,"ಹರ":198,"ಸ್":4080,"ಸೌ":55,"ಸೋ":90,"ಸೈ":69,"ಹಸ":55,"ಹವ":83,"ಹಳ":189,"ಹಲ":179,"ಹಾ":1242,"ಹಿ":907,"ಹೀ":57,"ಹೆ":605,"ಹೇ":123,"ಹು":431,"ಹೂ":52,"ಹ್":175,"ಹೊ":532,"ಹೋ":187,"ಿಪ":236,"ಿನ":2977,"ಿಧ":213,"ಿಮ":515,"ಿಭ":159,"ಿಬ":92,"ಿಡ":230,"ಿಟ":228,"ಿದ":3371,"ಿತ":1841,"ಿಣ":314,"ಿಹ":188,"ಿಷ":392,"ಿಸ":2818,"ೀಟ":93,"ಿಲ":986,"ಿಯ":3289,"ಿರ":1992,"ಿವ":655,"ಿಶ":439,"ಿಳ":170,"ೀಕ":215,"ೀನ":225,"ೀಪ":138,"ೀಯ":562,"ೀಮ":67,"ೀಡ":224,"ುಂ":466,"ೀತ":316,"ೀಸ":69,"ುಚ":64,"ುಟ":277,"ೀರ":351,"ೀಲ":114,"ುಕ":335,"ೀವ":282,"ುಖ":454,"ುಗ":730,"ಾಂ":1000,"ಾಜ":887,"ಾಚ":176,"ಾಖ":102,"ಾಗ":4356,"ಾಕ":469,"ಾಪ":496,"ಾಭ":74,"ಾಬ":132,"ಾಧ":258,"ಾದ":1706,"ಾನ":1975,"ಾಣ":559,"ಾಥ":81,"ಾತ":838,"ಾಟ":601,"ಾಡ":732,"ಿಂ":1911,"ಿಜ":269,"ಾಶ":207,"ಿಗ":1656,"ಾಷ":669,"ಾಸ":1090,"ಿಚ":104,"ಾಹ":451,"ಾಲ":1507,"ಾಳ":273,"ಿಕ":2705,"ಿಖ":52,"ಾವ":819,"ಾಮ":995,"ಾಯ":1047,"ಾರ":4215,"ತಗ":107,"ಣವ":210,"ತಕ":168,"ಣು":184,"ಣಿ":524,"ಣಾ":151,"ಣನ":52,"ಣದ":261,"ಣರ":86,"ಣಗ":184,"ಣಕ":131,"ತಂ":382,"ಡೆ":640,"ಡೇ":56,"ಡು":959,"ಡ್":778,"ಡಳ":71,"ಡಲ":204,"ಡವ":88,"ಡಿ":1466,"ಡಾ":222,"ಡನ":199,"ಡದ":306,"ಜ್ ":103,"ಡರ":323,"ಡಕ":55,"ಡಗ":107,"ಟಿ":799,"ಟೀ":72,"ಟು":403,"ಟೆ":354,"ಟೇ":63,"ಟೋ":167,"ಟ್":2145,"ನದ":576,"ನತ":67,"ಧಿ":651,"ನಡ":623,"ಧಾ":588,"ನಟ":88,"ನಚ":96,"ನಗ":604,"ಧವ":68,"ನಕ":147,"ಧರ":218,"ದ್":2979,"ದೊ":278,"ಧನ":157,"ದೇ":1188,"ದೆ":2081,"ದೂ":324,"ಧತ":62,"ದಾ":1139,"ದಿ":2445,"ದೀ":64,"ನಂ":266,"ದು":3947,"ದಸ":55,"ದಲ":2370,"ದವ":372,"ಥ್":61,"ದರ":1541,"ಥೆ":287,"ದದ":103,"ದನ":456,"ಥಿ":158,"ಥಾ":314,"ಥವ":694,"ದಗ":126,"ತ್":6565,"ಥಮ":57,"ದಕ":447,"ಥಳ":111,"ತೇ":59,"ತೆ":663,"ತೋ":73,"ತೊ":91,"ತಾ":1049,"ತಿ":2648,"ತೀ":321,"ದಂ":194,"ತು":2278,"ತವ":560,"ತಶ":56,"ಥಗ":54,"ht ":63,"ತಹ":98,"ತಯ":76,"ಣ್":346,"ತಮ":315,"ತರ":927,"ತಲ":96,"ತತ":62,"ಣೆ":345,"ತದ":1260,"ತನ":354,"ತಪ":55,"ಪೋ":69,"ಪ್":3265,"ಬಗ":149,"ಫಿ":80,"ಬಣ":56,"ಫೆ":85,"ಬದ":151,"ಪರ":638,"ಪಯ":118,"ಪಶ":129,"ಪವ":80,"ಪು":628,"ಬಂ":329,"ಪೀ":61,"ಪಿ":494,"ಪಾ":718," 
ri":59,"ಪೇ":72,"ಪೆ":86,"ಪೂ":243,"ನೈ":115,"ನೊ":113,"ನೋ":140,"ನ್":5580,"ಪಕ":225,"ಪಗ":72," px":81,"ಪಟ":267,"ಪಡ":351,"ಪತ":304,"ಪದ":446,"ಪನ":246,"ನಪ":122,"hum":105,"ನನ":201,"ಧ್":402,"ನಮ":102,"ನಲ":470,"ನರ":234,"ನವ":655,"ನಸ":130,"ನಿ":2138,"ನಾ":1363,"ನೂ":140,"ನು":2795,"ನೀ":276,"ಪಂ":242,"ನೇ":1015,"ನೆ":1144,"ರಜ":113,"ಯಾ":2655,"ಯಸ":104,"ರಚ":194,"ರಕ":548,"ರಗ":490,"ಯಶ":64,"ಯವ":894,"ಯರ":301,"ಯಯ":55,"ಯಲ":992,"ರಬ":144,"ಯೋ":318,"ರಭ":83,"ಯ್":184,"ರಮ":614,"ರಯ":73,"ಯೇ":137,"ರನ":308,"ರಪ":191,"ಯೊ":109,"ರತ":1050,"ರದ":1229,"ರಧ":52,"ಯೆ":362,"ಯಿ":713,"ರಡ":219,"ಯು":1152,"ಯೂ":247,"ರಣ":519,"ಮಹ":299,"ಮಸ":60,"ಯಗ":270,"ಮವ":111,"ಯಕ":439,"ಮಲ":102,"ಮರ":288,"ಮಯ":65,"ಭ್":65,"ಯಮ":248,"ಮ್":861,"ಮೊ":334,"ಮೋ":102,"ಮೈ":166,"ಮೆ":345,"ಯನ":1088,"ಮೇ":483,"ಯದ":586,"ಮೂ":556,"ಯತ":120,"ರಂ":561,"ಮೀ":223,"ಮು":926,"ಮಾ":2202,"ಮಿ":718,"ಮಗ":156,"ಭವ":125,"ಬ್":957,"ಮಕ":218,"ಮದ":290,"ಮಧ":132,"ಮನ":323,"ಭೌ":76,"ಮಟ":57,"ಭಾ":1580,"ಭಿ":188,"ಯಂ":366,"ಮಣ":82,"ಭೂ":208,"ಮತ":1458,"ಬಹ":299,"ಬಸ":64,"ಫ್":310,"ಫೋ":52,"ಬಳ":371,"ಬಲ":150,"ಬರ":893,"ಬೆ":594,"ಬೇ":205,"ಭದ":68,"ಬೈ":80,"ಚನೆ":74," th":169,"ಬಾ":388,"ಬಿ":415,"ಬೀ":91,"ಮಂ":262,"ಬು":293,"ತರ ":414,"ೂ ":1096,"ಡದಲ":57,"ಡನೆ":57,"ಡನೇ":52,"ಾ ":1624,"ಡನ್":62,"ು ":14575,"ೀ ":229,"ಿ ":8907,"ತನ ":68,"ಹ ":251,"ಶ ":222,"ವ ":2270,"ಸ ":186,"ಷ ":133,"ತು ":1867,"ಲ ":511,"ಳ ":1320,"ತೆ ":302,"er ":60,"ಮ ":601,"ಯ ":3177,"ರ ":2974,"ತಹ ":86,"ಪ ":166,"ಡರ್":274,"ಬ ":384,"ಡಲಾ":92,"ಥ ":133,"ದ ":5745,"ತಿ ":665,"ಧ ":252,"ಡಳಿ":71,"ತಾ ":62,"ನ ":3185,"೦ ":330,"ಟೆಂ":84,"ght":64,"ಣದ ":121,"ಟುಂ":86,"ಟಿಕ":121,"ಟಿದ":82,"ಟಿನ":67,"ಟಿಯ":94,"ಟಿಸ":83,"ಣು ":78,"ಣಿ ":100,"ಣೆ ":124,"ತದ ":400,"ಟ್ರ":464,"ಟ್ಟ":861,"್ ":7021,"ಟೋಬ":94,"ೊ ":102,"ೋ ":134,"ೈ ":170,"ೆ ":6915,"ೇ ":1293,"ಟನೆ":65,"ಡ್ ":396,"ಃ ":52,"ಂ ":134,"ಞಾನ":208,"ಡೆ ":91,"ಟವಾ":66,"ಟರ್":142,"ಟ ":308,"ಜ ":97,"ತ ":1468,"ಣ ":786,"ಠ ":61,"ಡ ":654,"ಗ ":398,"ಖ ":205,"ಡದ ":222,"ಕ ":1763,"ಡು ":260,"ಟಗಾ":53,"ಟಗಳ":71,"ಡಿ ":212,"ಡಾ ":70,"ಟಕದ":131,"ಆ ":74,"ಈ ":714,"ಕಗಳ":105,"ಗು ":199,"ಗೂ ":397,"ಗಿ ":1320,"ಗೆ ":1195,"ಖಂಡ":114,"ಕಟ್":111,"ಗ್ ":190,"ಕತೆ":52,"rig":61,"ಗರ ":186,"ಗಳ ":1047,"ಗದ ":146,"ಕಂಡ":123,"ಕಂಪ":101,"ಕ್ ":357,"ಕು ":163,"ಕಿ ":188,"ಒಳಗ":120,"ಕಾ ":103,"ಕೆ ":697,"ಕರ ":75,"ಒಟ್":62,"px ":80,"ಒಬ್":183,"ಕದ ":188,"ಒಂದ":1351,"ಚಂದ":71,"ಗ್ಗ":161,"ಗ್ಲ":159,"ಗ್ಯ":52,"ಗ್ರ":702,"ಗೊಳ":102,"ಗೋರ":246,"ಗೋಳ":55,"ಚ್ ":122,"ಗೆಯ":68,"of ":53,"ಗೊಂ":193,"ಗಾಗ":97,"ಗಾಂ":64,"ಗಿನ":109,"ಗಿರ":313,"ಗಿಸ":155,"ಗೀತ":186,"ಗುಂ":62,"ಗಾರ":173,"ಗಾಲ":60,"ಗಿತ":64,"ಗಿದ":943,"ಗುರ":169,"ಗುವ":283,"ಗುಣ":80,"ಗುತ":373,"ಗಳಲ":710,"ಗಳನ":727,"ಗಳೂ":224,"ಗಳು":1084,"ಗಳಾ":97,"ಗಳಿ":589,"ಚು ":103,"ಗವಾ":67,"ಗಸ್":126,"on ":65,"ಖ್ಯ":409,"ಗರದ":116,"ಗದಲ":154,"ಗತ್":77,"ಗಣಿ":86,"ಗಡಿ":73,"ಗಗಳ":101,"ಕ್ಷ":939,"ಕ್ರ":668,"ಕ್ಯ":360,"ಕ್ತ":356,"ಕ್ಟ":186,"ಕ್ಸ":118,"ಕೋಟ":60,"ಕ್ಕ":985,"ಕೊಂ":167,"ಕೊಳ":121,"ಕೊಡ":66,"ಕೆಲ":163,"ಕೃಷ":124,"mb 
":102,"ಕೃತ":250,"ಕೆಗ":100,"ಕೆಟ":113,"ಕೇಂ":134,"ಕೆಯ":206,"ಕಿಸ":52,"ಕಿರ":95,"ಕಾಸ":55,"ಕಾವ":68,"ಕಾಶ":87,"ಕಾಲ":240,"ಕಾಯ":81,"ಕಾರ":767,"ಕಿನ":176,"ಕಿತ":62,"ಕುರ":58,"ಕೂಟ":53,"ಕೂಡ":86,"ಕೀಯ":82,"ಕುಟ":89,"ಕಾಣ":65,"ಕಿಂ":74,"ಕಾನ":57,"ಕಾದ":114,"ಕಾಗ":98,"ಕಾಂ":80,"ಕಲಾ":56,"ಕಲ್":114,"ಕವಾ":159,"ಕವಿ":72,"ಕರಣ":88,"ಕರೆ":221,"ಕರಿ":97,"ಕರು":101,"ಕರಾ":121,"ಕರ್":467,"ಕದಲ":61,"ಕನ್":423,"ಉತ್":319,"ಉದ್":135,"ಉಪಯ":103,"ಎಸ್":57,"ಏಪ್":62,"್ಥೆ":228,"್ಥಾ":284,"್ಥಿ":119,"್ದವ":56,"್ದರ":183,"್ದೇ":157,"್ದು":363,"್ಧತ":61,"್ದಾ":350,"್ದಿ":75,"ಎಂಬ":368,"ಎಂದ":448,"್ಣು":59,"್ಣಿ":64,"್ತದ":721,"್ತರ":338,"್ತವ":225,"್ತನ":109,"್ತಿ":826,"್ತು":1699,"್ತೆ":63,"್ತಾ":369,"್ಥಳ":111,"್ತ್":311,"್ಪತ":108,"್ಪಡ":151,"್ಪಟ":116,"್ಪನ":87,"್ಪಾ":63,"್ಪಿ":65,"್ನಡ":386,"್ಧಿ":63,"್ಧಾ":55,"್ನಾ":402,"್ನಿ":122,"್ನೂ":64,"್ನು":2402,"್ನೆ":94,"್ನೊ":55,"್ಮಾ":125,"್ಮಿ":138,"್ಯಗ":175,"್ಯಕ":269,"್ಮದ":87,"್ಮನ":56,"್ಯಂ":192,"್ಯು":176,"್ಯಾ":1532,"್ರಜ":97,"್ರಗ":251,"್ಯವ":564,"್ರಕ":297,"್ಯರ":131,"್ಯಮ":116,"್ಯಯ":54,"್ಯನ":119,"್ಯದ":366,"್ಮೆ":64,"್ಯತ":58,"್ಬರ":145,"್ಮಕ":73,"್ಲೂ":97,"್ಲಾ":202,"್ಲಿ":4561,"್ಲದ":116,"್ಳು":109,"್ಳಿ":142,"್ಲೊ":95,"್ಲೇ":124,"್ಲೆ":484,"್ಲ್":60,"್ರವ":349,"್ರಶ":142,"್ರಸ":283,"್ರಹ":151,"್ರಾ":846,"್ರಿ":942,"್ರೀ":518,"್ರು":70,"್ರತ":249,"್ಯೂ":155,"್ರಣ":55,"್ರದ":625,"್ಯೆ":160,"್ರಪ":133,"್ರಮ":468,"್ರಭ":71,"್ರರ":86,"್ರಯ":66,"್ರೆ":357,"್ರೈ":58,"್ರೇ":163,"್ರ್":78,"್ರೋ":120,"್ಕ್":92,"್ಕರ":72,"್ಕೃ":119,"್ಕೂ":68,"್ಕು":73,"್ಕೆ":541,"್ಕಿ":142,"್ಕಾ":226,"್ಚಾ":65,"್ಚಿ":204,"್ಚು":144,"್ಗೆ":104,"್ಟರ":54,"್ಟಣ":72,"್ಟದ":57,"umb":108,"್ಞಾ":215,"್ಚ್":78,"್ತಕ":66,"್ಣವ":52,"್ಡ್":103,"ಎಎಎ":72,"್ಟೆ":135,"್ಟೋ":119,"್ಟ್":671,"್ಟವ":59,"್ಟಿ":399,"್ಟಾ":78,"್ಟು":217,"ಎತ್":68,"್ವವ":82,"್ವರ":147,"್ವಿ":108,"್ವಾ":248,"್ವಹ":80,"್ಶನ":54,"್ವೀ":99,"್ವದ":124,"್ವತ":116,"್ಷರ":65,"ಎಲ್":119,"್ಷಿ":380,"್ಷೇ":121,"್ವ್":52,"್ಷಗ":60,"್ಷಣ":115,"್ಷದ":530,"್ಸ್":339,"್ಷ್":53,"್ಸೆ":62,"್ಸಿ":119,"ಎನ್":124,"thu":103,"ಎರಡ":156,"the":64," ಆ ":74," ಈ ":713,"ೋಗಿ":116,"ೊಳಿ":52,"ೊಳ್":156,"ಂಸ್":307,"ೊಮ್":67,"ಂತೆ":162,"ಂತ್":364,"ಂತರ":298,"ೋಬರ":94,"ಂತಹ":96,"ಂತಿ":112,"ಂತಾ":62,"ಂದೆ":162,"ಂದೇ":82,"ಂದ್":319,"ಂಧಿ":104,"ಂದರ":313,"ಂದಿ":541,"ಂದಾ":143,"ಂದು":2122,"ಂದೂ":183,"ಂಡದ":109,"ಂಡರ":290,"ಂಟ್":69,"ಂಡ್":203,"ಂಡಿ":293,"ಂಡು":195,"ೋತ್":57,"ಂಭವ":62,"ಂಬು":192,"ಂಬಿ":59,"ಂಬಾ":53,"ಂಬರ":342,"ಂಯು":54,"ಂಪ್":93,"ಂಬಂ":87,"ಂಪು":57,"ಂಪಿ":53,"ೋಡಿ":64,"ಂಪನ":65,"ಅಂದ":78,"ಅಂತ":160,"ೋರಾ":52,"ೋರಿ":296,"ೋರ್":74,"ೈಸೂ":88,"ೈಸ್":52,"್ಕ ":65,"್ಗ ":60,"ೊಂಡ":342,"ೊಂದ":551,"್ಣ ":181,"್ಡ ":146,"ಂಕ್":62,"್ದ ":187,"ಂಗಳ":447,"ಂಖ್":129,"ಂಗದ":91,"ಂಗಡ":60,"್ಧ ":174,"್ತ ":281,"ಂಕರ":53,"್ಥ ":67,"ಂಚದ":64,"ಂಗೀ":161,"ಂಗಾ":120,"ಂಗ್":337,"್ಟ ":206,"್ಲ ":152,"ೊದಲ":197,"ೊಬ್":70,"್ಷ ":91,"್ವ ":146,"್ಪ ":91,"ೊಟ್":56,"್ನ ":174,"್ರ ":555,"್ಯ ":846,"್ಮ ":242,"್ಬ ":139,"ೊಡ್":169,"ಇದು":518,"ಇದೆ":55,"ೆಪ್":108,"ಇದರ":166,"ಇದನ":181,"ೆಬ್":91,"ಇನ್":84,"ೆದು":87,"ೆದಿ":70,"ೆನ್":134,"ಇತಿ":65,"ಇತರ":85,"ೆಲ್":157,"ೆಳೆ":86,"ೆಳಗ":64,"ೆಲವ":94,"ೆಲೆ":75,"ೇಖಕ":54,"ೇಖನ":56,"ೆಯಲ":526,"ೆಯು":360,"ೆಯಾ":224,"ೆಯಿ":119,"ೆಯನ":236,"ಆಸ್":78,"�":222,"ೆರೆ":114,"ೆರಿ":98,"ೆಸರ":280,"ಆರ್":99,"ೆಸ್":116,"ೆಸಿ":79,"ಆಫ್":99,"ೇತ್":125,"ೇಶವ":74,"ೇಶ್":84,"ೇಶಿ":67,"ೇಶಗ":141,"ೇಶಕ":79,"ಉಂಟ":54,"ೇವೆ":92,"ೇಶದ":265,"ೇವಾ":53,"ೇವಿ":64,"ೇಲಿ":68,"ೇಲೆ":114,"ೇರಿ":374,"ೇರೆ":53,"ೇರು":61,"ಇವರ":508,"ಇಲ್":125,"ಇರು":142,"ಇಪ್":83,"ಅತಿ":137,"ಅತ್":191,"ಅಥವ":638,"ೇಶ 
":126,"ಅಕ್":155,"ೆಂಗ":155,"ೆಂಬ":309,"ೆಂಟ":57,"ೆಂಡ":313,"ೆಂದ":240,"ಇಂಡ":64,"ಇಂದ":55,"ಇಂಗ":69,"ಆದರ":63,"ಅಸ್":52,"ಆಗಸ":106,"ಆಗಿ":88,"ೆಗಳ":621,"ೆಗಾ":62,"ೆಗೆ":326,"ೆಗೊ":53,"ೆಗೋ":241,"ಅರ್":136,"ಅಮೇ":112,"ಅಮೆ":58,"ೆಕ್":125,"ಅವರ":214,"ಅವು":57,"ಅಲ್":103,"ೆಚ್":218,"ೆಟ್":262,"ಅನು":103,"ಅಧ್":96,"ಅನೇ":75,"ಅನ್":71,"ೇಂದ":169,"ಅದರ":105,"ಅಧಿ":313,"ಅದು":55,"ಅಭಿ":89},"n_words":[514512,567770,395860],"name":"kn"}
\ No newline at end of file
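The file deleted above (and the one below) follows langdetect's profile shape: a "freq" map from character n-grams to corpus counts, an "n_words" array giving the total number of 1-, 2-, and 3-grams seen, and a "name" holding the language code ("kn" here, "ko" next). As a hedged illustration only, the sketch below shows how profiles of this shape can drive a simple language guesser with additive smoothing. It is not langdetect's actual detector (which adds random sampling and probability normalization), and the toy profiles and function names are invented for the example.

import json  # profiles like the ones deleted here are plain JSON on disk
import math


def ngrams(text, max_n=3):
    """Yield all 1..max_n character n-grams of `text`."""
    for n in range(1, max_n + 1):
        for i in range(len(text) - n + 1):
            yield text[i:i + n]


def score(text, profile):
    """Log-likelihood of `text` under one profile, with add-one smoothing.

    profile shape (matching the deleted files):
      {"freq": {ngram: count, ...}, "n_words": [n1, n2, n3], "name": "kn"}
    where n_words[k-1] is the total count of k-grams in the training corpus.
    """
    freq = profile["freq"]
    totals = profile["n_words"]
    logp = 0.0
    for gram in ngrams(text):
        total = totals[len(gram) - 1]
        logp += math.log((freq.get(gram, 0) + 1) / (total + len(freq)))
    return logp


def detect(text, profiles):
    """Return the `name` of the best-scoring profile."""
    return max(profiles, key=lambda p: score(text, p))["name"]


if __name__ == "__main__":
    # Toy profiles in the same JSON shape as the deleted kn/ko files.
    en = {"freq": {"t": 9, "th": 6, "the": 5, "h": 7, "e": 12},
          "n_words": [100, 80, 60], "name": "en"}
    af = {"freq": {"d": 9, "di": 7, "die": 6, "i": 10, "e": 14},
          "n_words": [100, 80, 60], "name": "af"}
    print(detect("the theory", [en, af]))  # expected: en

In real use the profiles would be loaded with json.load() from files like the ones this patch removes; the example inlines tiny hand-made profiles only so it runs standalone.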
+++ /dev/null
-{"freq":{"정리이":69,"바뀌었":102,"²":131,"·":6405,"×":65,"í":141,"é":560,"è":77,"ç":72,"젤레스":103,"ä":74,"á":169,"ü":183,"ö":136,"ó":126,"참여했":93,"ā":193,"ī":75,"참여하":278,"방글라":64,"ˈ":80,"́":127,"μ":101,"ν":231,"ο":310,"ι":218,"κ":112,"λ":169,"δ":75,"ε":146,"η":94,"α":368,"β":61,"γ":80,"ά":91,"ί":112,"ω":66,"ό":88,"σ":105,"ς":279,"ρ":207,"개발 ":675,"π":78,"υ":78,"τ":170,"ь":58,"я":64,"ы":64,"ч":103,"р":321,"с":201,"т":190,"у":110,"К":57,"С":60,"л":238,"к":231,"й":118,"и":464,"о":480,"н":337,"м":118,"г":82,"в":283,"а":488,"е":318,"д":121," ·":941,"장이었":82,"람이 ":326,"و":102,"ي":163,"ل":186,"م":122,"ن":119,"د":84,"ب":85,"ا":266,"س":58,"ر":100,"람의 ":253,"사상가":98,"산에 ":201,"았다 ":914,"미리 ":99," К":57,"포트 ":65,"놓고 ":101," С":60,"사와 ":395,"산업 ":379,"미를 ":234," ا":105,"람은 ":91,"래에 ":110,"람을 ":297,"물질을":114,"로그 ":136,"물질이":123,"물질의":91,"미르 ":149,"농구 ":70,"생명과":67,"장인물":96,"정리로":90,"장으로":586,"나에서":75,"야기 ":98,"전문적":89,"전송 ":81,"크기가":57,"이팅 ":102,"아들 ":170,"설로 ":135,"나오는":326,"인트 ":75,"· ":502,"의회 ":231,"장이다":475,"랑을 ":65,"사원 ":75,"라질 ":121,"롭게 ":172,"방검찰":64,"사용 ":140,"렸다 ":419,"생산 ":224,"안된 ":100,"é ":165,"란시스":115,"키고 ":206,"서대문":75,"가로서":123,"ể":108,"∼":121,"사이 ":125,"정서 ":66,"찰청 ":197,"사인 ":233,"사일 ":104,"프가니":112,"날에는":70,"따른 ":260,"바라 ":72," ∼":97,"사의 ":834,"가리킨":657,"가리키":646,"서만 ":126,"가마쿠":67,"종교적":110,"딸로 ":77,"사상의":73,"사상이":110,"사상을":91,"표에 ":65,"차이를":67,"야구 ":394,"러드는":78,"생명체":80,"서를 ":347,"성남시":83,"거는 ":139,"코프 ":86,"소가 ":256,"ああ ":309,"밀도는":453,"련된 ":456," 《":4309," 》":621," 〈":549," 『":98," 「":147,"㎞":62,"사적 ":294,"㎢":2033,"㎡":66,"초대 ":388,"가리아":156,"생물학":287,"키가 ":87,"애나 ":67,"ㆍ":388,"あ":1627,"》":5027,"《":5040,"〉":601,"〈":607,"』":119,"『":118,"」":186,"「":184,"가르침":80,"사전 ":487,"ア":1597," ㎞":62," ㎢":2029,"산의 ":346,"알고리":206,"장치 ":174,"아동 ":67,"산이 ":146,"전시 ":66,"크다 ":72," あ":325,"정상 ":60," ア":198,"산은 ":120,"않다 ":121,"백과사":636,"전반에":79,"드시 ":89,"사소송":106,"산을 ":359,"제르바":102,"크는 ":102,"전반적":63,"로구 ":380,"생물의":77,"사자 ":61,"처리하":131,"론과 ":129,"배경으":124,"사장 ":72,"세네갈":68,"남쪽 ":131,"않는 ":497,"바로 ":210,"민들의":77,"이토 ":77,"성된 ":636,"랑스에":91,"랑스어":228,"랑스와":71,"민들이":87,"나이며":210,"남아시":146,"들어 ":501,"악단 ":108,"》에서":140,"내지 ":84,"乱":136,"乩":112,"乞":60,"乙":3650,"乘":609,"乎":59,"么":100,"之":14530,"丹":2134,"丼":110,"临":105,"丫":150,"中":433,"両":683,"並":12874,"丙":894,"丘":5494,"丛":58,"丟":765,"丞":224,"丑":412,"专":172,"丕":293,"且":380,"丈":1178,"三":20984,"성동 ":77,"세대 ":267,"一":284,"丁":10402,"万":311,"仕":271,"仗":58,"亳":204,"人":513,"亥":108,"亨":114,"프는 ":76,"亞":4771,"亟":76,"亀":254,"亂":4287,"于":245,"성공회":174,"佇":187,"佛":194,"佐":64,"미디어":463,"조건을":75,"俘":194,"侶":558,"侃":89,"侏":196,"세계의":336,"자이자":420,"세계적":310,"자이언":74,"세계에":387,"나이다":1203,"남아메":109," 冲":247,"럭비 ":120,"이터 ":437," 兌":119,"아닌 ":431,"년대부":66,"것과 ":178," 僅":104,"咀":213," 傭":293,"개를 ":145,"吾":169,"圓":737,"國":1227,"在":61,"차원의":57,"国":85,"차이다":148,"조는 ":153,"冲":466," 丙":282," 丘":1641," 丟":262," 丞":90," 丑":173," 丕":104," 丈":484," 三":5820," 一":101," 丁":2901," 万":65,"남아프":161,"작용하":94,"兌":396,"傭":628,"傀":77,"무총리":119,"약간 ":80,"僅":349,"倣":128,"倉":3517,"倖":139,"드와 ":234," 侶":151,"차이나":64," 俘":77,"厥":67," 倉":1228," 佛":82," 佇":65,"정부 ":492,"라에서":173,"자유주":102," 侏":88," 人":94," 亳":67," 仕":88," 両":199," 並":3333," 中":269," 丹":1249," 之":4377," 乘":159," 乙":1383,"자전거":100," 亂":1372," 亀":76," 于":69," 亞":903,"유하는":80,"로나 ":78,"감독하":72," 大":668,"드에 ":180,"尙":86,"彦":59,"차이가":126,"자이며":181,"年":139,"랑스의":605,"남아있":64,"大":1261,"세기경":68," 吾":77," 咀":127,"쟁이 ":102,"세계화":76,"작이다":110,"유하고":203,"때로 ":69,"아는 ":196," 圓":305," 國":341,"발로 ":61,"세기부":84,"月":80,"인천광":247,"이트 ":656,"반대로":62,"이틀 
":64,"디렉터":60,"日":322,"敎":382,"拼":71,"바르 ":85,"바른 ":114,"셔널 ":194,"너먼트":71,"드어 ":66,"체되었":67,"표적 ":60,"센고쿠":141,"전문대":61,"바니아":130,"렸던 ":58,"년대에":224,"제로는":86,"거나 ":1548,"淸":143,"포함 ":105,"받는다":110,"나이티":86,"편하여":83,"나이트":65,"재연구":60,"상스 ":59,"나치 ":142," 敎":71," 日":116,"있지만":276,"사에 ":412,"ος":126,"眞":108,"的":166,"개되었":111,"사업 ":385,"노동 ":64,"키기 ":205,"상북도":406,"ς ":268,"전문가":188,"랑수아":66,"물체의":76,"서로 ":648,"α ":114,"爲":79,"의할 ":57," 淸":90,"전사 ":94,"임진왜":116,"의해 ":1904,"남으로":73,"아노 ":115,"크기의":78,"세기에":313,"의학 ":91,"의한 ":415,"장에서":426,"세기의":90,"유형문":67,"창시자":86,"밴드 ":173,"래의 ":272,"장애인":188,"자치 ":104," 眞":65,"자적인":76,"제부 ":75,"硏":108,"인텔 ":106,"의하고":60,"자치도":165,"자치단":144,"응하여":78,"가문 ":83,"ск":57,"람에게":86,"입한 ":113,"정부의":282,"개념이":228,"개념을":110,"개념으":199,"때는 ":184,"저우 ":125,"라이다":188,"라이더":86,"산식품":614,"표준 ":235,"라이나":257,"녀는 ":98," · ":456,"갖는다":62,"나의 ":717,"а ":76,"절에 ":78,"라이브":286,"й ":90,"의하는":116,"н ":64,"성되었":222,"성되어":523,"내부에":78,"в ":66,"내부의":59,"크로 ":125,"라이버":78,"상적 ":74,"어가 ":375,"남아 ":134,"가들의":84,"아를 ":247,"가들이":99,"ч ":81,"라이며":60,"아르 ":58,"ко":62,"ич":83,"세를 ":145,"ор":58,"ов":121,"но":66,"악단이":78,"ви":93,"ан":66,"첫번째":86,"반대하":127,"정부를":59,"색의 ":117,"재지이":107,"적에 ":98,"驿":62,"라이언":123,"라이온":80,"鬪":67," 靑":75,"라이스":87,"민법 ":58,"색을 ":142,"나인 ":170,"색은 ":70,"성모 ":93,"나이 ":96,"라이선":82,"아니라":536,"로는 ":3890,"정보통":139,"처에 ":89,"간되었":75,"전에 ":1190,"드리아":129,"청북도":246,"아니아":146,"표시하":91,"이터베":227,"최근 ":95,"년대 ":557,"이터를":186,"서부 ":369,"아니스":70,"鎭":93,"않는다":348,"송국 ":85,"초로 ":510,"인터내":76,"정보화":76,"인터넷":668,"난을 ":59,"정부에":116,"정부와":65,"임한 ":86,"靑":123,"자치구":154,"인터페":232,"감독을":67,"아누스":72,"감독의":83,"감독이":123,"타고 ":84,"아로 ":63,"적인 ":4808,"적이 ":256,"총괄하":84,"꾼":109,"꾸":368," 갈":698," 각":2309," 가":18345," 간":2103," 같":2403," 개":12530," 객":258," 값":267," 감":1783," 갑":208," 갖":887," 강":4617,"꿈":119,"꿀":77,"꼬":171,"꼭":124,"꼴":114,"꼽":179,"저장 ":80,"꽃":296,"꺼":75,"께":1579,"껍":85,"긴":797,"기":57951,"길":1653,"글":2723,"긍":81,"긋":58,"금":4364,"급":4129,"깥":84,"깨":247,"깔":123,"깝":68,"깃":99,"김":2072,"깊":200,"까":5090,"권":5693,"궁":939,"궐":79,"궤":302,"귀":1039,"전은 ":325,"규":2589,"균":820,"근":3205,"그":19235,"극":2043,"관":22357,"괄":288,"광":6130,"괴":686,"이트로":57,"교":18288,"구":31163,"국":45098,"굴":660,"굳":64,"군":9174,"굿":62,"굽":77,"겐":407,"겔":142,"겉":64,"게":11654,"것":8442,"검":1618,"겸":469,"겹":91,"겼":193,"경":17015,"결":4818,"격":2956,"겨":1106,"겪":88,"견":1448,"겠":90,"물학 ":86,"계":15438,"과":26837,"곽":179,"공":20325,"곰":87,"곱":284,"곳":1654,"골":1102,"곤":459,"곧":180,"곡":2975,"고":47107,"값":459,"갑":443,"감":2792,"갖":908,"강":6713,"갔":159,"같":2537,"객":963,"개":15756,"가":55413,"각":5239,"간":9024,"갈":1359,"갱":78,"갤":95,"걀":59,"건":6543,"거":8987,"걸":1105,"아니다":144,"라운드":96,"날의 ":126,"점에 ":197,"전을 ":820,"버그 ":126,"이트를":58,"반드시":88,"드리드":67,"램을 ":192,"이티드":93,"전의 ":495,"발되었":131,"작한 ":541,"라오스":74,"져서 ":63,"널리 ":525,"사연구":82,"전이 ":205,"전인 ":79,"의회는":86," 뛰":288,"콘텐츠":192,"몇":457,"명":15332,"몰":624,"몽":572,"몸":435,"못":710,"몬":786,"모":11378,"목":6926,"멘":418,"멜":262,"메":5400,"멕":373,"멍":84,"멀":284,"멸":484," 뚜":58,"며":20876,"면":9744,"멤":213,"먼":1045,"머":1633,"먹":424,"므":521," 러":1462," 럭":156," 런":332," 레":2242,"산업기":64," 렌":152,"발달하":67,"믹":261,"미":17448,"믿":187,"민":16451,"흑해 ":60,"뮌":153," 랭":89," 랩":96," 랴":57,"뮤":598,"뮬":208," 띠":142," 라":5313," 락":86,"의하면":109," 람":83," 란":706," 래":122," 랜":122," 랑":74," 뜨":65,"묘":914," 뜻":2081," 띄":61,"물":10422,"묻":74,"문":17304,"묶":127,"무":13084,"묵":131," 디":2243,"룰":113,"룽":97,"룹":837,"룸":158,"룡":457,"룬":460,"루":6886,"룩":203," 등":10121," 들":1303,"료":2691,"뢰":281,"조를 ":342," 득":75," 드":1545," 
듀":145," 뒷":121," 뒤":936,"롱":332,"롭":295,"롬":294,"롯":651,"롤":575,"론":3597,"로":96384,"록":4253," 둥":104,"례":1168," 둘":552,"렸":708,"령":3201," 두":3055,"렴":153," 둔":309,"렵":166,"련":2464,"렬":396,"력":5587,"려":5019,"렛":83,"렘":205,"렐":151,"렉":716,"레":10369,"렌":918,"렀":295," 될":193,"렇":142,"매":4904,"맥":997,"맡":729,"맨":507,"맷":142,"맵":77," 또":8172,"맺":226,"맹":992,"링":1187,"만":12443,"많":2444,"작품 ":175,"마":15987,"막":1783,"말":8244,"망":1651,"맞":936,"맛":168,"리":49017,"릭":1201,"린":3639,"릴":663,"릿":184,"립":9031,"림":3191," 떠":221,"름":4869,"릇":64," 떨":873,"릉":452,"륭":84,"륨":139,"률":1185,"를":36449,"른":4259,"반도체":99,"르":15981,"륵":60,"륜":185,"류":4914,"륙":763," 때":4328,"발매 ":81," 땅":251,"뤼":174,"자치시":91,"뤄":108," 딸":364," 따":3445," 딴":80," 몽":316," 못":580," 몸":407," 몰":372," 몬":267," 모":6818," 목":4311,"뻗":112," 묘":385,"블":1922,"븐":182," 먼":230," 머":458," 먹":353,"브":4958," 메":2219," 멕":332," 멜":173," 멘":88," 멀":265,"빗":71," 면":2578,"빛":405,"반도에":120," 멸":242,"빙":357,"빅":263," 멤":202,"비":15033,"빈":832,"빌":1411,"빵":82,"빼":105,"빠":443," 명":7874," 몇":341,"빨":181," 뮤":321,"전용 ":96,"뿌":188," 미":9777,"최고 ":196," 민":2788," 믿":184,"뿐":444,"뿔":93,"램의 ":106," 및":5809," 밀":1074," 받":1853," 반":3359," 밝":338," 발":8680," 밑":142," 밖":243," 박":1386," 바":4874,"뼈":195," 무":4302," 묵":81," 묶":123," 물":2886," 문":7777," 묻":68,"뽑":108," 뮌":99,"래스카":63,"벽":406,"벼":128,"벳":78,"벨":1138,"베":5169,"벡":239,"벤":610,"벗":142,"법":11136,"범":2511,"번":6501,"벌":1869,"버":5133,"볼":1272,"본":10004,"복":3536,"보":17044,"반도의":107,"병":2570," 루":1582,"별":5261,"변":3473,"배":4005,"백":2909,"밴":506,"밸":82," 로":5189," 록":326,"밥":125,"밤":208,"방":12878,"밭":81,"밑":146,"밖":309,"바":9518,"박":2316,"받":2627,"반":9065,"발":14073,"밝":341,"밀":2017,"및":5896,"밍":463," 롱":64,"뱅":123," 롤":179,"뱀":171," 론":58," 롯":72," 마":7115," 막":816," 많":2225," 만":6286," 링":189," 맛":124," 맞":858," 망":371," 말":7151,"불":5804,"붉":133," 리":3407,"부":33314,"북":7080,"분":9424,"붙":797," 린":107,"붓":60,"붕":269," 릴":78,"문학 ":248,"뷰":194," 맨":215," 매":2433," 맥":407," 맡":725," 맹":113,"코틀랜":227," 맺":219,"뷔":356," 류":171,"봄":175,"봇":228,"봉":1368," 뤼":59,"미로는":60," 르":285," 를":2001," 깨":169,"뉘":227,"뉜":158,"뉴":2112,"눌":108,"눈":377,"눅":106,"누":1558," 길":1008," 긴":363,"눔":57," 기":21288," 까":271," 깊":185," 김":1910," 깃":81,"니":10368,"닉":426,"닌":1054,"닐":137,"닛":171,"님":381,"닝":259,"살았던":73,"다":127946,"닥":218,"단":12729,"닫":65,"달":3219,"닭":75,"닮":58,"답":174,"담":2045,"닷":134,"당":8583,"닿":168,"늄":159,"는":119185,"느":753,"늘":901,"능":2952,"늦":65,"늬":121,"던":5053,"덜":472,"더":3069,"덕":1078,"델":977,"데":8452,"덱":63,"덴":904,"덩":135,"덮":120,"덧":84,"덤":174," 꼽":102,"댐":73,"자치주":85," 꼭":115," 꼬":116,"댄":184,"대":50380," 꽃":178,"돕":110,"돔":57,"동":26013,"도":42958,"독":6183,"돈":384,"돌":1249,"돼":177,"상시키":58," 꾸":91,"전세계":81," 꿈":78," 게":2860," 겉":64," 검":993," 것":7525,"끼":482," 겐":93,"끌":362,"끊":85,"끈":119,"끄":190,"끝":732,"뒤에 ":105," 걸":1004," 건":2744,"끔":84," 거":3157,"뀌":174,"사업에":67," 갤":90,"냥":141," 관":8286," 광":2185," 곧":169," 곤":198," 고":8430," 곡":823,"냈":722," 골":477,"냉":198," 공":10814," 곳":1375," 곱":95,"냐":427," 곰":60," 과":4291,"날":1685," 계":3522,"남":10366,"납":236,"났":694,"낭":223,"낮":351,"낱":236,"낳":117,"내":7694,"낸":788,"낼":133," 겪":88," 겨":197," 격":403," 견":241,"낌":58," 결":2608,"낚":60," 겹":79,"낙":450," 겸":396,"나":29545," 경":10290,"난":2693,"논":1278,"노":7620,"녹":472," 권":1787," 궁":394," 굴":151,"라우저":117,"녕":139," 군":3102,"념":1682," 굳":62," 국":11641," 구":9126,"년":50612,"녁":59,"녀":836," 교":6257,"넷":970,"넨":74,"넬":161,"넣":275,"넥":130,"네":5012,"넘":422,"널":1956,"넓":520," 괴":232,"너":1765,"넌":123," 글":778," 긍":64,"뇨":62," 금":1723," 
급":1151," 근":1629," 극":746," 그":13695," 균":198,"뇌":397," 규":1420," 귀":590,"의해서":178,"사업이":73,"놓":470,"높":1411,"사업으":67,"사업을":303,"놀":416," 궤":202,"놈":67,"사업의":104,"농":2894,"또":8222," 뉴":1506," 느":260," 는":22679," 늘":155," 늦":61,"적은 ":2122," 능":471,"뚜":90," 닌":150," 니":852," 닛":81,"자치체":176,"의하여":356," 단":4273," 닮":58," 달":1640," 닭":57," 다":9191," 닥":60," 당":2886," 닿":61," 담":1163," 대":30352," 댄":143,"뛰":296,"개로 ":124," 더":1674," 덕":284,"적을 ":427,"뜻":2127," 던":104,"뜨":161," 데":3003," 덴":317," 델":206,"띄":63," 덧":76," 덩":92," 덮":105,"라":32052,"락":940,"띠":194,"랜":2761,"랙":360,"래":5947,"랑":3926,"랐":156,"람":3445,"랍":361,"랄":216,"란":5893," 동":9304," 돕":101,"략":655,"랴":119," 돌":648,"랭":236,"랫":313," 돈":178,"램":1320,"랩":147," 도":8642," 독":3941,"드로이":60," 돼":72,"량":2327," 되":3560,"럿":63,"럽":1862," 된":1374,"럼":922,"런":983,"럴":253,"러":8043,"럭":336,"될":754," 끈":60,"됨":160," 끊":84," 끌":148," 끝":706,"됐":304,"되":20943,"된":15250,"둥":406," 난":587," 나":7418," 낙":348,"둔":452,"두":5194,"둑":155," 끼":191,"둘":641," 냉":157,"사에서":413,"뒤":1008,"반면 ":75,"뒷":122," 남":6428," 납":167," 낮":336," 낭":128," 날":631," 낸":132," 낳":116," 낱":228," 내":4309,"듬":206,"듭":58,"듯":149," 너":176,"들":14979,"든":2442," 넘":356,"등":12499," 널":543," 넓":459,"듈":72,"듀":325,"득":672,"드":16369,"따":3543,"딱":94,"딴":97,"딸":420," 년":48504,"딩":848,"디":7187,"딕":142," 넷":115,"딘":221,"딜":107," 넣":235," 넥":74," 네":2034," 높":1360," 놓":344," 놀":187," 농":2185," 논":982," 노":3844," 녹":420,"땅":275,"때":4902,"떻":123,"너무 ":59,"떨":893,"적의 ":167," 뇌":233,"떠":388,"떡":75,"떤":684," 눈":259," 누":647,"최":4895," 즉":844," 즐":195," 증":1529,"촌":871,"초":5803,"촉":534,"총":3611,"촬":196," 쥐":90," 짧":228,"취":1270,"책으로":129,"번길 ":62,"출":6229,"춘":568,"추":3536,"축":3493,"충":2231," 질":809,"춤":162," 진":3320," 직":1757," 지":18217," 짜":63," 징":255," 짐":132,"춰":72," 집":1925," 짓":61," 족":223," 조":9648,"챔":331," 존":1937,"품에 ":127," 졸":350,"채":1834,"책":2191," 좀":85," 좁":161," 종":5121," 좋":260," 좌":466," 젠":66,"찌":102,"찍":137,"찰":1432,"찾":410,"창":3609,"참":2285,"찬":711,"차":8220,"착":921," 주":18427," 죽":642,"쳐":1531," 중":15239," 준":801,"쳤":211," 줄":1147,"천":5793,"처":4102,"척":639,"첼":120,"첸":85,"체":12407,"청":5830,"첫":1006,"첩":200,"첨":248," 죄":242,"철":3109,"콤":255,"콥":103,"콩":581,"콰":92,"콕":119,"코":6706,"콜":1028,"가루 ":61,"콘":1127,"켰":242,"케":2788,"켄":203,"켈":258,"켓":371,"켜":465," 찍":69,"쿨":196,"쿤":100,"쿠":2769,"쿼":162,"쿄":510,"뮤직 ":109,"차지했":173,"쾌":99,"차지하":297,"차지한":93,"카":9618,"칼":897,"칸":851,"칠":510,"칭":3334,"침":1007,"칩":118,"친":1771,"칙":994,"치":17866,"층":1482,"츠":2145,"측":1172,"컷":121,"컴":2082,"컵":693,"컬":664,"컫":637,"컨":597,"커":1445,"하게 ":1996,"캡":77,"캠":218,"캘":291," 쪽":266,"캐":1331,"캔":130,"캄":196," 씨":235,"줄":1396,"준":5183,"중":19356," 알":4152," 안":3009," 않":2884,"상업 ":66," 아":13909," 악":593," 앞":452," 앙":302," 압":445," 암":698," 앨":831," 애":1646," 액":406," 앤":321,"남서부":201," 약":2469," 야":1729,"쥐":185," 앵":118," 얇":61," 양":2789,"즌":623,"즉":849,"즈":4475," 어":3828," 억":453,"즐":322," 얼":375,"증":2825," 얻":540," 언":2186,"즘":584," 업":1023," 엄":237," 없":1907," 엔":883," 에":9796," 엑":221," 엘":468,"직":4986,"지":53776," 엠":95,"진":12339,"질":3981," 역":3833," 여":5825,"짓":174," 연":8997,"짐":242,"집":3847," 열":2188,"징":1361," 염":317,"짜":260,"짝":66," 영":8578," 였":167,"짧":228," 옆":106," 예":3334,"째":2610," 옛":404,"젝":447,"제":29441,"정":32064,"접":3114,"점":4065,"절":1922,"젊":125,"전":32225,"저":3863,"적":24870,"져":2433,"젤":356,"젠":397,"졌":1373,"남은 ":81,"졸":392,"족":4176,"설립 ":259,"조":17636,"존":3442,"종":10038,"좁":164,"좀":88,"좌":857,"좋":275," 쓰":2084," 쓴":417," 쓸":74,"죄":910,"주":32946,"죽":799,"제목으":67,"때까지":243," 음":3514," 
읍":148," 은":18917," 을":2254," 의":13217," 응":652," 율":199," 유":10518," 육":1054," 윤":588," 으":1482," 융":126," 잎":227," 있":30440," 잉":461,"내어 ":68," 임":1668," 입":1910," 잇":347," 잃":123," 잘":752," 잔":258," 작":5027," 자":10494," 읽":214," 일":35680," 인":14925," 익":323," 이":43056,"가니스":126," 잡":554," 잠":484," 장":5226," 잭":118," 재":4073,"쯤":58," 전":16035," 적":1958," 저":2089," 절":733," 젊":120," 정":13855," 점":1342," 접":1777," 제":14773,"록된 ":160," 왜":181," 왕":3468," 왔":270," 와":1869," 완":998," 옹":146,"이하 ":136," 옵":60," 옷":75," 온":1154," 올":1715," 옮":203," 오":7708," 옥":336," 욕":102," 요":2831," 왼":84," 외":2265," 원":4962," 월":24581,"쪽":5565," 움":282," 웅":75," 워":458," 우":4532," 운":3277," 울":664,"이한 ":104," 용":2153,"킨다 ":728," 위":16296," 윌":374,"쟁에서":112," 윈":737," 웰":69,"쫓":57," 웹":637," 웨":474,"엘":1142," 쇼":552,"에":103036,"엑":271,"엔":1496,"었":13976," 쇠":160,"엇":117,"없":2196,"업":9251,"엄":815," 수":15278," 숙":319," 순":1149,"염":794,"엽":300,"연":14124,"열":3577,"역":14028,"여":21379,"엠":137,"인한 ":217,"양":9401,"얇":61,"얄":66," 송":804,"얀":299,"얼":821,"언":4279,"남의 ":128,"얻":540,"어":34822,"억":684," 셰":135,"앗":139,"압":954,"암":1495,"앙":2107," 셸":66,"았":1705,"앞":482,"아":34158,"악":3379,"않":2900,"안":8284,"알":4503," 손":892,"앵":203," 솔":307,"약":5189,"야":6719,"애":2683,"액":892,"앤":486,"앨":898," 소":11517," 속":3191," 성":6301," 센":729," 섹":60," 세":12291," 설":6591," 선":5904," 섬":1862," 섭":109,"살인 ":59," 셀":192,"씬":77,"씨":1169,"씩":216," 셋":131," 샤":410,"쓸":88," 섞":117," 서":10967," 석":713,"ال":101,"쓰":3096,"쓴":426," 삭":82," 사":27351," 삼":1735," 삽":115," 살":1329," 삶":242," 산":3915," 샌":184," 색":449," 새":1434," 상":6949," 생":4620," 샘":83,"·기":95,"쑤":93,"하고 ":9739,"천시 ":225,"내에 ":438,"쟁":2447,"잭":163,"재":12501,"잡":1143,"잠":605,"장":20415,"잘":775,"자":37700,"작":11270,"잔":750,"잉":715,"있":31718,"잎":326,"잃":128,"잇":387,"입":4066,"임":6953,"익":1489,"이":165423,"읽":225,"일":45548,"인":47192,"응":1476," 썼":110,"의":128021,"을":57826,"은":55316,"음":8005,"읍":818," 써":106,"융":702,"으":50795,"유":15783,"육":5384,"윤":825,"율":1139,"윗":76,"위":24705,"윈":893,"윌":390," 쌍":317,"웨":2585,"웠":203," 쌀":75,"웹":654," 쌓":108,"웰":170," 십":304,"워":2364," 심":1298,"웅":321," 실":3412,"움":1073," 신":6386,"웃":221," 싸":282,"월":25278,"원":21744," 싱":605,"용":15333,"울":4973," 시":17424," 식":1730,"우":13058,"욱":203,"운":8938," 슬":354," 스":7245,"상에 ":382," 승":1448,"요":6682,"욕":736," 습":160,"아나톨":89," 슈":770,"왼":87,"외":4313,"완":1509,"와":17969," 쉬":131,"왈":71,"왕":5728,"왔":670," 쉽":226,"왜":332,"오":14295,"옥":684,"온":2639,"옮":206,"올":2277,"옷":147,"옵":67,"옴":62,"옹":454," 술":278,"였":9350,"영":13611,"옆":120," 숨":146,"예":5314," 숫":155,"옌":106," 숭":142," 숲":99,"옛":409,"솔":474,"손":1670,"속":9079,"소":20178,"하계 ":200,"송":3868," 뷰":70,"쇄":396," 블":858,"아래 ":167,"쇼":889,"쇠":250," 브":1829,"섞":131,"석":3098,"서":56814,"센":1955,"섹":86,"세":16513,"성":22377,"섬":2116,"섭":330,"섯":386,"설":10723,"선":15146,"션":2286,"셜":299,"셔":528,"셋":215,"셈":160,"셉":104,"셀":498," 붙":702,"셸":100,"셰":320," 붕":186," 불":4668," 붉":127," 부":9201," 북":4515," 분":5145," 벤":222," 벡":158," 베":2451," 벨":486,"전부터":113," 벼":76," 벽":136," 별":990," 변":2214,"삭":178,"사":52637,"삶의 ":107,"삶":243,"살":2436,"산":15857," 병":912,"삼":2361,"삽":121,"상":21682,"색":2020,"새":1818,"샌":195," 보":8766," 복":1706," 본":2950,"샘":118," 볼":809,"생":10052," 봄":131,"샤":806," 봉":588,"샹":63," 방":5542," 밤":177," 밥":73," 밴":412," 백":1337," 배":2328," 뱀":75,"청사 ":99,"아랍 ":69," 번":5470," 벌":737,"은행이":86," 버":1308," 벗":132," 법":2859," 범":953," 뿐":232,"일컫는":617," 뿌":163,"선되었":89,"따로 ":85,"쏘":63," 뽑":105,"·관":58,"·공":88,"쌓":116,"쌍":397,"쌀":100,"법과 
":132,"썼":127,"·교":69,"써":1331,"스":47455,"슨":692,"슭":65,"슬":1694," 뻗":111,"슷":450,"책이다":168,"슴":118,"습":1034,"승":3768,"슈":1775,"슐":101,"ن ":67,"·경":73,"실":7842,"신":14225,"십":781,"심":4418," 뼈":65,"싱":1106,"싼":122,"싸":555,"삶을 ":67,"시":47707,"식":10614,"숫":164,"숨":182," 빈":391," 빌":529,"숭":331,"술":6046," 비":6448," 빅":231," 빛":315," 빙":124,"숲":113,"숙":641,"수":29032,"순":2152,"쉬":296,"입출력":62,"쉽":270," 빨":157," 빠":372," 빼":98," 빵":61,"이프 ":175,"·연":85,"남서쪽":252,"·유":61,"·일":61,"·의":63,"·이":73,"아날로":71,"·정":112,"·전":81,"따라 ":1673,"남자 ":113,"·조":86,"문화 ":681,"학과 ":472,"·중":79," 표":2914,"양군 ":70," 풋":113," 품":311," 풍":535," 풀":299," 푸":531,"·수":74," 프":8266," 플":1603," 퓨":70," 필":1793," 핀":310," 한":15848," 하":15057," 학":3148,"·시":65," 피":2290," 픽":92," 행":4241," 했":855," 핫":66," 합":1642," 함":2804," 항":2107," 할":1260," 핸":86," 해":6139," 핵":492," 헌":637," 허":1093," 헝":262," 향":878," 혁":595," 현":7804," 혈":203," 협":1285," 혐":108," 형":3039," 혜":164," 헤":908," 헨":184," 헬":298," 호":4483," 혹":1415," 혼":618," 환":1277," 활":2664," 화":2857," 확":1802," 홋":133," 홈":481," 홍":939," 홀":264," 황":1758," 획":175," 회":3401," 횡":97," 효":1076," 후":4257," 훈":284,"년까지":1214," 휘":398," 휴":496," 흉":74," 흘":204," 흙":65," 흐":416," 흑":305," 흔":541," 희":405," 흥":265," 흡":176," 흰":126," 힌":195," 히":1017," 힘":385," 힙":241," 힐":117,"퓨":1797," 탈":393," 탄":758," 타":2520," 탁":97," 택":119," 태":2959," 탕":57," 탑":249," 탐":420," 킬":163,"풍":1318,"품":4217,"풋":127," 키":961,"풀":607," 킹":119,"·비":57,"푸":1058," 크":2443," 큰":1228," 클":1230,"표":6870," 큐":81,"퐁":58," 퀴":87," 퀸":110,"헨":491,"헬":461,"헤":1267,"험":1933,"헝":266," 퇴":255,"허":1930,"헌":1331,"향":3265," 통":6217," 톰":97,"했":6623,"행":10972," 톤":103,"햄":124," 토":1698,"해":24304,"핵":714,"핸":99,"할":7046,"학교 ":1462,"합":7081,"함":6807,"핫":70,"항":4366,"핑":263,"하":77112,"학":20317,"한":70049,"핀":882,"필":2371," 텔":530," 텐":79,"피":5014,"픽":1356," 텍":216," 테":1319," 터":808,"례로 ":60," 털":121,"프":15322,"책임을":92,"플":2945,"픈":291,"훌":63,"훈":752,"후":8266,"·사":104," 티":587," 파":4031," 팀":912," 팝":144,"훼":98," 팔":592," 판":1566,"회":16026,"획":1122," 특":3708," 트":1409,"효":1679," 틀":69,"횡":121,"프라 ":59,"로드 ":165,"홀":471,"홍":1228,"홉":95,"홈":507,"홋":134,"화":22028,"확":2379,"활":4194,"환":3339," 튀":144,"황":3546," 튜":103,"혁":1204,"혀":399,"현":10322,"혈":413," 투":1343,"혐":136,"협":3198,"혔":85," 툴":85,"형":7186,"혜":431,"호":10715,"혹":1549,"혼":1062," 폴":656," 폰":261," 포":4550," 폭":878,"힙":246," 편":1366,"힘":444," 펼":268,"힐":206,"히":4751,"힌":432," 펠":137," 폐":695,"·소":58," 평":2091,"흔":563,"흐":924,"흑":351," 퍼":552,"흘":226,"흙":94,"·서":57,"흉":89,"흰":134," 펑":85," 펜":287," 페":1515,"흥":1435," 펄":70,"흡":255," 펀":62,"희":960," 펌":73,"휘":984," 팽":80," 팩":61," 패":1010," 팬":166,"나지 ":80,"휴":649,"키":6923,"킬":397,"킨":1373,"킷":112,"킴":109,"킹":435," 철":1617," 청":1846," 첨":172," 첫":1002," 체":2408,"탠":112," 초":3045," 촉":322," 촌":94,"탱":122," 총":2665,"타":10431,"탁":475,"탄":2414,"탈":2257,"탑":422,"탐":512,"탕":687,"택":1124,"태":6303," 책":837," 채":1344,"퀸":127," 챔":308,"퀴":217," 찬":190," 착":270," 차":4515," 창":2458," 찾":380," 참":2071,"퀘":136," 찰":178,"크":10562," 처":2216," 척":276,"큼":124," 천":2298,"큰":1271,"러스 ":211,"클":2167,"가능성":147,"큐":382,"통":12139,"톱":213,"톰":149,"톤":509,"토":6975,"톡":100,"톨":856," 춤":75," 충":1498,"퇴":541,"낮은 ":181," 취":867," 최":3844,"털":869,"턴":933,"터":15066,"턱":68," 촬":156," 춘":234," 출":3768," 추":2201," 축":1679,"·문":85,"템":1444,"텐":677,"텔":1214,"텍":454,"테":4250," 캠":159,"튀":186," 캡":60,"설되었":185,"튜":408,"특":6554,"트":19866,"튼":328," 커":696,"틀":1105," 컬":177," 컨":381," 컵":92," 컴":1913," 켄":86," 케":706," 
켈":106,"틴":890,"틱":267,"티":5222,"틸":257," 측":539," 층":841,"투":4120," 치":1335," 친":965,"툼":59,"툰":121,"툴":145," 칭":340," 칩":93," 침":590,"퉁":60," 칠":288," 칼":602," 칸":291," 카":3225," 캄":163," 캘":285," 캔":83," 캐":1108,"펑":115,"내외 ":104,"펙":119,"페":3911,"펜":502,"펀":122,"펄":94,"펌":98,"펴":73,"편":2910,"펼":268,"펠":315,"폐":1040," 쿼":94," 쿠":694," 쿨":64,"평":3725,"폴":1296,"폰":647,"폼":238,"강남구":284," 퀘":74,"포":9485,"폭":1205,"팜":57,"·보":96,"팝":166,"판":4057,"팔":900,"파":8561,"팀":1526,"팅":740," 콘":727," 콜":444,"팽":145," 코":2601,"팩":183,"패":1550,"팬":246,"팡":90," 콩":260," 콤":104,"퍼":1823," 쾌":61,"정부가":179,"쟁이다":79,"라이터":66,"쟁으로":65,"사실상":154,"창원시":139,"키는 ":1035,"정부기":114,"라이트":130,"정보를":393,"미상 ":70,"정신 ":68,"라이프":136,"건강 ":58,"양과 ":150,"체로서":85,"네덜란":393,"세로 ":103,"사실을":90,"태가 ":128,"이후 ":1284,"이탈리":1091,"정보의":92,"조로 ":180,"정보원":60,"간단한":90,"응하는":87,"한국 ":743,"간단히":170,"초기에":174,"·철":281,"내셔널":212,"정부는":58,"이타마":68,"크를 ":175,"가며 ":64,"상위 ":110,"사전》":334,"초기의":150,"드리히":131,"각류 ":61,")":98,"(":82,"-":78,",":74,":":84,"정보교":91,"�":268,"가능한":325,"가능하":380,"정보기":60,"천문학":192,"자체를":63,"품의 ":269,"가를 ":483,"하기 ":3613,"프로 ":559,"사진 ":92,"상은 ":241,"거가 ":72,"상을 ":1272,"등록되":68,"내의 ":242,"로도 ":664,"산스크":81,"배를 ":124,"가리 ":150,"인해 ":467,"품을 ":440,"았던 ":129,"품은 ":147,"암동 ":90,"상의 ":1371,"나아가":97,"재지는":153,"로그래":368,"로그램":1142,"일한 ":393,"성되는":97,"정식 ":393,"상이 ":314,"이크로":588,"성된다":104,"상인 ":81,"가대표":141,"람으로":164,"량에 ":62,"품이 ":87,"람이다":230,"서버 ":134,"흔히 ":416,"클럽 ":144,"거리 ":240,"들에게":459,"곡가 ":133,"장한 ":247,"크바 ":154,"亂三並":62,"亂三三":97,"개봉한":58,"가의 ":726,"들어가":153,"고가 ":108,"처를 ":60,"초가 ":61,"정적 ":84,"가상의":121,"주가 ":198,"주간 ":66,"력과 ":185,"《삼국":90,"전시키":66,"거를 ":112,"낱말이":57,"전직 ":79,"차원 ":181,"쿠오카":77,"애를 ":72,"었고 ":892,"런던 ":156,"디아 ":131,"드에서":276,"널드 ":60,"들어서":94,"제이 ":61,"제인 ":105,"간을 ":392,"제일 ":77," �":76,"일하게":128,"려고 ":142,"미술 ":75,"디오 ":872,"간의 ":1177,"제작 ":170,"가장 ":2202,"청남도":310,"남성 ":89,"간은 ":169,"아라비":118,"청나라":162,"타공공":108,"라서 ":346,"박람회":78,"드워드":102,"사람에":109,"사람으":156,"사람의":207,"사람을":267,"사람은":77,"에게 ":2107,"각이 ":58,"산부 ":97,"사람이":649,"제의 ":613,"디언 ":96,"디어 ":334,"쓰인 ":79,"각을 ":142,"차와 ":63,"가사키":57,"사령관":124,"》에 ":263,"가인 ":211,"각의 ":236,"거대한":110,"들었다":123,"이하의":127,"풀로 ":63,"미스 ":121,"제정 ":71,"강에 ":127,"남시 ":83,"제적 ":226,"주고 ":119,"애니메":619,"결과를":92,"밴드이":74,"이해하":86,"방법 ":120,"갈의 ":77,"따서 ":170,"결과로":66,"타나 ":60,"재한 ":154,"》은 ":258,"》을 ":87,"려가 ":67,"타낸 ":57,"간인 ":57," ال":89,"간이 ":309,"차에 ":82,"》의 ":292,"재해 ":65,"인하여":149,"주군 ":164,"라마이":104,"양경찰":118,"러가지":95,"처럼 ":457,"업과 ":165,"세기 ":1033,"없고 ":95,"때로는":64,"전에는":239,"각종 ":375,"랫동안":75,"가수이":107,"주교 ":119,"제조 ":126,"정식으":79,"정신적":68,"정신을":88,"믹스 ":77,"가족 ":81,"이후로":131,"키나파":72,"亂之 ":165,"섬과 ":107,"개발과":79,"경기 ":281,"페테르":78,"포유류":74,"사법 ":61,"전적 ":75,"거래 ":75,"채무자":67,"젊은 ":80,"並國三":63,"들이며":112,"나에 ":58,"남부에":295,"정에 ":426,"무함마":66,"레고리":71,"크라이":244,"이트에":84,"절을 ":58,"亂亞 ":92,"전쟁 ":655,"이트의":60,"이트이":90,"전자 ":409,"아들로":224,"작해 ":65,"들이다":337,"키나와":91,"야기를":124,"상당한":67,"상대적":107,"크로아":167,"들이었":59,"발매되":369,"들이자":64,"발매된":203,"드컵 ":186,"개발사":89,"크로소":373,"라북도":260,"크로스":85,"박사 ":73,"네그로":106,"등에서":329,"亂丁 ":125,"산물 ":91,"크래프":177,"렀다 ":170,"개발된":147,"개발되":172,"안드레":67,"亂三 ":194,"서기 ":77,"새로 ":127,"안드로":92,"아들인":70,"아들이":502,"亂丘 ":69,"亂並 ":148,"체계에":64,"석기 ":98,"의회에":71,"저수지":79,"폐지되":230,"체계이":67,"체계적":153,"의회의":78,"문학의":69,"라스 ":165,"문학자":106,"이풀로":78,"포츠 ":283,"전주 ":94,"성군 ":116,"체고비":106,"성구 ":108,"вич":79,"드웨어":181,"문학에":61,"세계 ":1954,"점성술":118,"점이 ":245,"문화교":73,"간에 ":460,"가와 ":443,"프가 ":107,"장치를":95,"문학상":112,"개발에":94,"점의 
":91,"개발원":63,"개발을":139,"거로 ":203,"가에 ":200,"제에 ":283,"점을 ":418,"들에서":74,"체계를":103,"점은 ":106,"들어진":480,"들어지":82,"드이다":163,"찰스 ":128,"제어 ":96,"들어졌":219,"개발자":101,"무형문":79,"타는 ":77,"사람들":653,"박물관":464,"물학적":58,"문화를":243,"정이 ":195,"체는 ":221,"내버스":159,"성과 ":408,"나오 ":65,"나온 ":159,"물학자":103,"체결된":76,"정의 ":333,"야구장":186,"발생 ":60,"정을 ":810,"발매하":71,"발매한":190,"야구의":96,"나와 ":213,"잡한 ":94,"문화방":94,"정상적":60,"장치이":90,"정은 ":136,"성경 ":87,"개발하":315,"개발한":476,"남부의":85,"제와 ":152,"세가 ":207,"변경 ":58,"ич ":79,"얀마 ":62,"책을 ":239,"조사 ":371,"정안전":64,"책의 ":157,"아서 ":221,"작하여":246,"작하였":243,"문화예":200,"설된 ":121,"사스 ":116,"문화에":102,"크리스":321,"문화와":78,"결국 ":113,"미아 ":71,"앨범 ":191,"문화사":76,"따라서":342,"얻는 ":71,"라에 ":165,"레드 ":122,"전자의":59,"들을 ":1666,"중국 ":1103,"태는 ":61,"중구 ":288,"라엘 ":114,"들은 ":1170,"어도 ":114,"작했으":59,"들의 ":1830,"저장하":84,"문화체":1127,"사실 ":77,"문화의":338,"문화유":176,"정에서":302,"미야 ":93,"남부 ":379,"남북 ":100,"전쟁에":156,"문화재":744,"문화적":145,"전쟁이":130,"전쟁의":144,"전쟁을":114,"조선 ":921,"조성 ":57,"포인트":69,"참조 ":61,"크리트":128,"정책 ":233,"양력 ":94,"체를 ":702,"역과 ":276,"타나는":152,"클리드":62,"등에 ":576,"끝에 ":190,"亞之三":65,"표면에":74,"크립트":86,"들이 ":2342,"버드 ":86,"미에 ":63,"어권에":64,"타난다":90,"들인 ":119,"디미르":96,"타낸다":124,"뜻으로":193,"것도 ":92,"타내는":318,"박스 ":90,"뮤지컬":91,"미와 ":58,"계가 ":265,"뜻이다":146,"재판소":104,"랑스 ":981,"날로그":71,"게는 ":216,"새로운":612,"亞三並":82,"점으로":208,"亞三三":101,"亞並三":64,"자회사":108,"책은 ":69,"제였다":63,"점이다":97,"클라우":75,"클라이":98,"라비아":197,"중기 ":60,"클래식":69,"경과 ":111,"클래스":59,"반민족":66,"성공적":59,"자협회":65,"제에서":125,"세는 ":97,"들에 ":340,"세계대":169,"종류로":101,"아메리":675,"아르헨":188,"세계사":60,"성공하":63,"포지션":88,"야를 ":102,"정의된":86,"정의되":69,"정으로":195,"세계를":63,"어느 ":223,"힙합 ":161,"어는 ":592,"크로프":81,"책에 ":96,"삼성 ":130,"체로 ":467,"초기 ":285,"족문화":157,"클럽이":60,"야마 ":336,"성당 ":102,"정이다":317,"드인 ":58,"미생물":76,"여객 ":65,"장하기":60,"아마추":89,"장하고":143,"령과 ":60,"드의 ":963,"종류의":195,"종류이":108,"장한다":129,"장하는":775,"산시 ":253,"창설되":77,"성경의":62,"미사일":222,"개선 ":77,"떻게 ":119,"쳤다 ":134,"성격을":73,"잔티움":169,"임했다":113,"라인 ":429,"미의 ":189,"등의 ":1883,"등이 ":948,"가상 ":104,"선거로":128,"민사소":63,"생명 ":64,"철도의":57,"이후에":218,"입하는":64,"이후의":91,"람에 ":59,"란을 ":119,"방송 ":364,"전에서":232,"야로 ":92,"런던에":68,"랍어 ":64,"년경 ":243,"적용된":77,"적용되":101,"갈리아":61,"건담 ":64,"전차 ":64,"란의 ":103,"사망하":111,"사망한":67,"사망했":57,"알렉산":236,"미술관":126,"사무관":181,"사상 ":967,"민을 ":66,"란이 ":83,"아름다":121,"가서 ":69,"어가는":73,"산림청":142,"입하여":105,"입하였":66,"품부 ":541,"일환으":83,"민의 ":363,"타로 ":84,"나머지":129,"아르메":80,"러를 ":61,"크스 ":85,"년간 ":196,"로가 ":240,"주기 ":97,"풋볼 ":85,"민이 ":110,"히틀러":64,"거구에":103,"내부 ":106,"임하였":204,"알려졌":143,"알려져":958,"거구제":58,"중간 ":77,"반부터":67,"러리 ":73,"알려진":400,"알려지":101,"亞亂 ":111,"선거구":247,"적이고":205,"철도역":58,"제시하":89,"제시한":63,"전철 ":57,"서는 ":3203,"강력한":114,"선거는":100,"나서 ":76,"亞亞 ":112,"라의 ":696,"라이 ":172,"초구 ":243,"샤를 ":93,"가문의":115,"철도공":82,"亞之 ":255,"사무소":268,"사무실":1870,"적으로":6876,"새롭게":76,"적이다":168,"선교사":68,"아랍어":113,"저작권":130,"전체 ":252,"아래에":87,"亞三 ":239,"작하는":103,"적용하":90,"어난 ":785,"었기 ":131,"전으로":231,"전자기":148,"어나 ":278,"라운 ":61,"亞丁 ":83,"산성 ":74,"적이며":61,"방식 ":87,"려는 ":293,"키며 ":125,"작품은":98,"작품을":152,"작품으":141,"작품의":71,"작품이":311,"가수 ":210,"어날 ":66,"작하고":60,"작품에":75,"평화 ":62,"亞並 ":263,"년과 ":135,"방법이":266,"방법으":220,"설가이":93,"방법을":165,"민족 ":187,"결과 ":148,"방법원":75,"선거인":76,"따르는":60,"차의 ":172,"작했다":157,"선거에":114,"설계되":58,"설계된":59,"처리 ":166,"전이다":158,"래시 ":66,"라와 ":158,"넘는 ":95,"래식 ":58,"사무를":174,"생물 ":118,"점차 ":76,"따르면":285,"종로구":365,"등을 ":1775,"서기관":301,"서도 ":661,"듯이 ":69,"あああ":724,"점에서":270,"생들이":69,"가스 ":119,"등은 ":87,"유일하":126,"작사 ":62,"유일한":211,"반의 ":260,"일본식":166,"남동부":196,"유전자":195,"라트비":70,"터가 ":163,"반이 
":63,"이비드":96,"루고 ":252,"과는 ":512,"의자 ":89,"보고 ":146,"뒤를 ":85,"일반화":65,"익보호":63,"이브러":106,"의의 ":313,"타르 ":110,"밖의 ":84,"타를 ":73,"가타 ":92,"래픽 ":199,"드니 ":63,"흥을 ":81,"반은 ":58,"반을 ":232,"체가 ":464,"래한 ":106,"내로 ":72,"라틴어":370,"게르만":68,"남동쪽":276,"했다 ":3529,"공과대":62,"변경되":110,"바의 ":87,"일반직":142,"로레슬":124,"드는 ":685,"하다는":80,"체계 ":105,"바일 ":130,"사람 ":176,"일스 ":70,"바이 ":61,"이벤트":59,"자리에":188,"하다고":79,"의원 ":442,"가지의":94,"흥원 ":58,"검사 ":107,"하다가":230,"흐스탄":57,"강점기":429,"일본군":83,"자리잡":146,"일반적":1125,"일반의":63,"공공기":200,"일반인":59,"검색 ":91,"갖춘 ":115,"하느님":102,"보가 ":128,"동차가":80,"하로 ":57,"하는데":508,"용하지":112,"삼국지":82,"민지 ":138,"플리케":102,"용하였":172,"삼동 ":85,"해도 ":118,"발에 ":103,"공간을":116,"계로 ":314,"롤라이":60,"용해서":66,"공간의":61,"공간이":89,"공간에":115,"임무를":62,"필름 ":57,"용하여":659,"응을 ":89,"한나라":81,"자리를":100,"나들목":286,"이베이":60,"하나의":423,"하나이":1414,"하나인":156,"돌프 ":87,"뀌었다":80,"건물이":107,"되었는":98,"강으로":96,"레슬링":130,"가지로":201,"해결하":97,"하나였":68,"상당 ":305,"원후 ":60,"벨기에":181,"레슬러":60,"보》 ":158,"표한 ":280,"의와 ":140,"삼국시":89,"용하며":100,"결되어":107,"되었다":5986,"강이다":118,"되었던":228,"도쿠가":148,"용했다":68,"반에 ":227,"원회 ":556,"이스 ":652,"레스타":99,"도한 ":102,"차로이":131,"음이 ":93,"되었고":629,"이베리":109,"육청 ":61,"공개되":85,"이바지":269,"차례의":95,"용한다":393,"가지는":177,"음의 ":177,"미지 ":78,"용하는":892,"가진다":98,"흥에 ":88,"하라 ":165,"건설 ":140,"민주 ":136,"이미지":182,"워크에":63,"밖에 ":144,"음을 ":323,"응용 ":195,"의에 ":179,"곡동 ":69,"음은 ":101,"강원도":332,"용하기":171,"동차의":79,"동차이":57,"력에 ":158,"나무 ":204,"가지가":83,"든다 ":84,"견되었":130,"용하고":267,"가지고":705,"임명되":96,"타마 ":101,"해군의":95,"받아 ":360,"방송되":109,"방송된":59,"인민공":988,"울특별":2233,"방이 ":101,"크에 ":114,"워크를":60,"려운 ":66,"배우 ":114,"고도 ":1655,"고고학":81,"나라의":375,"나라에":236,"태로 ":448,"자료를":144,"인물이":326,"나라와":67,"방의 ":527,"삼각형":57,"나라이":90,"창단되":64,"원형 ":65,"산된 ":75,"제국 ":423,"방송국":244,"방송공":57,"방은 ":58,"미이다":94,"채널이":57,"방을 ":182,"타디움":74,"민운동":60,"로리다":73,"들도 ":99,"학대학":68,"행된 ":137,"자로서":148,"학명 ":83,"인먼트":266,"원한 ":71,"고구려":194,"쟁력 ":98,"나로서":116,"가치 ":92,"되었을":105,"되었으":1444,"합뉴스":77,"방사성":62,"이므로":99,"방사선":62,"됐으며":61,"되었지":87,"인물로":179,"이센 ":70,"로마의":132,"하면 ":687,"보기 ":63,"되어있":155,"하며 ":3509,"핀란드":235,"제공 ":74,"로마에":76,"천구 ":60,"이션 ":833,"정규 ":302,"고대 ":811,"보급 ":119,"천군 ":115,"이묘로":166,"법률 ":166,"정권 ":76,"한다는":384,"산광역":497,"했던 ":888,"련을 ":63,"나라가":92,"음에 ":134,"일리아":483,"발전 ":299,"광고 ":99,"사도 ":73,"력이 ":338,"인사 ":61,"가이자":300,"인리히":70,"쓰고 ":113,"점기 ":106,"력의 ":179,"동한 ":119,"베네치":57,"위치 ":109,"히브리":114,"하루 ":70,"인류학":75,"력을 ":1046,"곡가이":164,"고는 ":197,"간이다":99,"발생하":361,"발생한":413,"이상 ":271,"려의 ":141,"원칙으":57,"간으로":125,"럽에서":169,"거리에":131,"력은 ":109,"정구 ":74,"원칙이":63,"미에서":158,"한다고":160,"산당 ":105,"본과 ":65,"려져 ":976,"나라를":93,"정과 ":186,"민에게":63,"발이 ":61,"변경하":100,"가이며":162,"산동 ":114,"자라고":70,"이면서":108,"이사 ":68,"나라로":69,"살던 ":66,"방에 ":408,"베네수":63,"자라는":57,"행동 ":67,"하마 ":73,"밝은 ":60,"하도록":282,"미어리":61,"발의 ":57,"발음 ":82,"생긴 ":128,"인류의":69,"발을 ":206,"재를 ":220,"이를테":61,"련이 ":125,"거리를":101,"가이다":1033,"나라는":63,"허가 ":79,"로서 ":2731,"장되어":72,"음악 ":533,"크와 ":130,"받을 ":101,"발생시":61,"받은 ":355,"미얀마":91,"련의 ":320,"관급 ":76,"제가 ":421,"하를 ":73,"킬로미":80,"제도 ":358,"간에서":92,"석과 ":60,"제국에":76,"나라 ":614,"령에 ":106,"있어 ":371,"방식을":150,"인인 ":98,"방식으":206,"인이 ":565,"이전 ":225,"방식의":98,"방식이":202,"이저 ":232,"민주당":259,"받았다":382,"산되었":75,"받아들":112,"일의 ":934,"산면 ":79,"음으로":564,"민족주":86,"선과 ":211,"일이 ":179,"서구 ":260,"려시대":128,"민족이":83,"민족의":109,"일은 ":171,"일을 ":555,"원회가":79,"제공한":140,"제공항":114,"해당되":71,"제공하":545,"제국이":104,"제국의":612,"바에서":60,"제국을":60,"동일한":145,"가에서":199,"도지사":58,"유통 ":76,"이사회":105,"이자 ":2852,"표하는":228,"정된 ":457,"산드로":63,"관광 ":81,"사들이":79,"태를 
":373,"천광역":247,"함되어":129,"설계 ":121,"점기에":78,"산드리":57,"장소 ":58,"체나 ":63,"재산 ":60,"정권을":71,"합되어":80,"둘러싸":178,"나뉜다":143,"점기의":205,"위한 ":2048,"인의 ":1094,"천동 ":70,"상대로":69,"해를 ":391,"원하여":159,"설과 ":84,"위해 ":2768,"위치한":3432,"제거하":73,"위치하":1705,"례에 ":90,"위치해":340,"나로 ":1168,"민족문":161,"상대방":151,"크의 ":348,"후쿠오":77,"가운데":1210,"인은 ":257,"히로시":66,"정교회":151,"인을 ":537,"임에 ":72,"이션의":58,"이션을":57,"나누어":113,"위하 ":73,"이션이":144,"사립 ":60,"로스 ":346,"이사장":71,"터널 ":62,"원하기":109,"경기장":259,"경기이":114,"원하고":76,"졌고 ":82,"베리 ":112,"사망 ":84,"로미터":82,"제는 ":260,"이선스":78,"익을 ":208,"학문 ":67,"제국과":91,"계된 ":80,"정동 ":164,"이의 ":892,"개신교":188,"방송통":78,"공기 ":138,"계기로":63,"처드 ":102,"되자 ":84,"가였다":74,"관계 ":273,"생기는":109,"하드웨":180,"정구역":186,"로바키":85,"원하는":219,"정도 ":616,"정기관":402,"원한다":72,"령을 ":243,"터넷 ":430,"령은 ":67,"백작 ":70,"려진 ":441,"관과 ":150,"이상의":447,"방송하":72,"철도 ":369,"동조합":172,"생된 ":67,"이상이":81,"낸다 ":189,"인용 ":127,"위치에":98,"들기 ":69,"경기를":148,"레이더":57,"객이 ":71,"레이드":84,"령의 ":138,"유지하":156,"택되었":69,"산맥 ":122,"레일리":489,"정당 ":75,"반직 ":94,"령이 ":148,"공군 ":73,"키백과":58,"음악을":113,"음악의":121,"선거 ":137,"자본주":105,"음악이":74,"제강점":66,"로써 ":1184,"레이블":111,"위치를":101,"일에 ":1924,"쓰는 ":227,"로버트":170,"음악에":71,"베르 ":73,"함된다":101,"방송사":59,"후한 ":94,"루는 ":422,"경기에":89,"레이션":267,"등과 ":260,"방송의":67,"음에는":128,"해로 ":77,"방송이":79,"레이스":190,"방송을":83,"받지 ":116,"래했다":68,"레이시":146,"설가 ":74,"이용 ":83,"서관 ":112,"일어 ":137,"바이잔":107,"레임워":62,"레이저":82,"바이올":96,"바이오":87,"생각하":93,"강제 ":60,"음악가":117,"로부스":65,"래픽스":66,"자발적":68,"익에 ":75,"나를 ":121,"필리프":61,"필리핀":358,"낙동강":59,"같이 ":509,"보는 ":244,"반이다":146,"바이에":102,"과가 ":110,"레이오":85,"공고 ":59,"미터 ":147,"전기의":97,"했는데":97,"레이어":194,"전기적":63,"너가 ":60,"래하였":76,"개인 ":152,"이온 ":68,"레이크":71,"상남도":478,"이오 ":93,"례의 ":123,"경기도":912,"사단법":1681,"전까지":207,"래프트":205,"레이트":129,"과거 ":210,"레이터":99,"했다고":109,"인어 ":69,"개의 ":1769,"표현한":83,"표현하":178,"자베스":57,"인에 ":164,"보다 ":794,"이완 ":109,"이와 ":277,"했다는":112,"경기는":58,"생대 ":149,"크스주":60,"로부터":1174,"바이러":161,"룬다 ":177,"저기압":67,"강이 ":84,"백질 ":70,"공격 ":71,"반으로":349,"등급 ":68,"서가 ":115,"보건복":107,"별로 ":156,"이언 ":67,"나며 ":96,"이어 ":549,"개월 ":84,"계는 ":211,"장르의":62,"이에 ":1223,"키보드":63,"사를 ":1011,"같은 ":1766,"흑인 ":59,"제나 ":71,"경계를":120,"값이 ":80,"전광역":266,"겼다 ":99,"일부이":124,"동쪽에":157,"동쪽은":73,"동쪽으":984,"적도 ":70,"경계로":137,"프의 ":110,"결된 ":163,"값을 ":97,"강의 ":289,"두번째":73,"운행하":134,"표하였":66,"되지 ":512,"강을 ":180,"해당하":270,"해당한":149,"가진 ":547,"가지 ":694,"탈리아":1095,"강은 ":65,"이안 ":61,"공개 ":76,"이아 ":80,"냈다 ":524,"로비치":78,"공간 ":182,"나뉘어":85,"민주화":70,"유주의":100,"율적인":84,"율적으":85,"일부터":916,"타리오":112,"일본제":58,"일부로":118,"일본어":214,"일본에":462,"방지 ":78,"일본의":1226,"일본이":57,"일본인":94,"음식을":58,"유전체":77,"견된 ":134,"일부를":110,"음식이":66,"학부 ":102,"터는 ":441,"받았으":82,"임스 ":235,"민주주":682,"하려고":72,"고기 ":102,"의적 ":151,"하려는":180,"건물 ":86,"나다의":116,"표현이":66,"강서구":79,"임시 ":75,"민족행":60,"유전학":62,"사로 ":539,"감을 ":118,"고급 ":121,"이에서":435,"보급을":72,"관리 ":537,"이어서":70,"전동차":102,"강화 ":81,"방으로":94,"인에게":123,"례를 ":88,"이에른":91,"레마이":67,"발전에":578,"게임 ":779,"법률에":63,"벨라루":58,"장에 ":303,"보교환":61,"과대학":155,"포함해":70,"포함한":517,"포함하":554,"발전시":125,"량이 ":162,"계산 ":77,"두었다":75,"별도의":74,"족과 ":94,"작성하":88,"법률의":65,"법률이":85,"이었던":250,"이었다":1368,"발전을":350,"량을 ":270,"이언트":66,"계속 ":137,"량은 ":105,"인에서":57,"항구 ":67,"탄소 ":65,"히어로":65,"래이다":109,"일에는":57,"조금 ":68,"아가 ":238,"표적인":377,"보급하":100,"상가 ":71,"이용되":115,"국과 ":645,"크에서":153,"개혁 ":71,"자주 ":141,"이오스":98,"키아 ":114,"버스 ":485,"량의 ":196,"일어난":375,"일어나":250,"일어났":108,"이언츠":61,"난다 ":199,"유클리":61,"발전하":100,"참가했":85,"참가한":160,"참가하":322,"국계 ":80,"이었지":81,"이었으":387,"국경 ":65,"등록 ":59,"꼽힌다":72,"이어지":79,"보드 ":159,"거에는":64,"있지 
":161,"포항시":66,"이올린":74,"자신들":66,"고려대":68,"라크 ":131,"개최하":103,"동하는":170,"가톨릭":535,"해가 ":130,"항공 ":158,"방에서":152,"작이 ":58,"발전과":240,"본거지":83,"의정부":87,"발원하":120,"사기 ":58,"작은 ":650,"작을 ":162,"키에 ":57,"자인 ":552,"작의 ":72,"본격적":88,"동했다":59,"프를 ":82,"들면 ":66,"보고서":76,"반응을":70,"공립 ":63,"포함된":182,"바이트":101,"포함되":256,"자의 ":1228,"살고 ":126,"포하는":115,"나다 ":436,"동하고":132,"백악기":102,"산군 ":87,"합과 ":58,"조가 ":151,"자유 ":190,"산구 ":126,"방영된":79,"방영되":154,"들목 ":196,"음주의":62,"본관은":510,"함과 ":91,"나는 ":720,"국가 ":668,"작전 ":71,"크이다":81,"동하였":160,"키와 ":57,"이었고":95,"발전소":109,"강한 ":187,"유한 ":186,"전달하":101,"표이다":71,"공동으":122,"임시정":57,"의적인":85,"한글 ":77,"반적으":847,"나가와":69,"반적인":257,"과를 ":382,"이야기":454,"아과 ":60,"공동체":220,"구가 ":427,"산과 ":144,"의원이":92,"의원을":113,"필드 ":75,"이징 ":127,"이진 ":64,"뜻한다":497,"이지 ":498,"끼리 ":67,"인슈타":59,"뜻하는":380,"검은 ":78,"하는 ":18164,"품안전":65,"보내는":62,"으킨 ":120,"사관 ":76,"전력 ":61,"이즈 ":185,"개최된":194,"개최되":218,"입이 ":60,"하거나":693,"이스하":64,"랑크 ":135,"사고 ":119,"라틴 ":103,"위탁집":59,"내는 ":646,"있을 ":239,"자와 ":463,"쓴다 ":92,"입자 ":81,"내각의":69,"내각이":64,"드라마":492,"것이 ":936,"키의 ":170,"휘하는":77,"라트 ":74,"일시적":63,"찬가지":135,"것의 ":67,"남도 ":1094,"관련 ":511,"잎은 ":109,"것을 ":2109,"것은 ":829,"차는 ":139,"백제의":59,"램으로":79,"차관급":64,"램이다":179,"사건 ":437,"의자들":60,"고등학":392,"자원 ":102,"인지 ":119,"과로 ":135,"의인민":505,"작용 ":98,"교관 ":61,"이스볼":109,"것에 ":138,"드라이":156,"러운 ":114,"났다 ":378,"표준화":112,"방자치":117,"피드 ":77,"사가 ":831,"럼비아":66,"터내셔":76,"채널 ":133,"인조 ":92,"바지하":107,"바지함":89,"원회는":112,"베를린":147,"각한 ":85,"건이 ":129,"이스라":226,"배우이":125,"건은 ":246,"이스를":85,"건을 ":482,"이슬란":101,"이슬람":294,"발족하":147,"가한 ":185,"건의 ":226,"법상 ":135,"의에서":83,"베르트":104,"이시아":128,"해군 ":184,"각형 ":66,"임의 ":182,"작업 ":64,"입을 ":113,"자연 ":134,"입은 ":73,"표준이":73,"교구 ":85,"레스 ":275,"임이 ":89,"이스테":112,"베르크":152,"이스트":150,"뜻하며":87,"의약품":99,"로로 ":225,"법무부":66,"베리아":142,"하나 ":294,"인종 ":68,"이스에":70,"일정 ":91,"원회에":81,"이스의":66,"이중 ":62,"일제 ":401,"갖추고":82,"원회의":88,"전략 ":114,"위키백":58,"임은 ":107,"함께 ":1469,"임을 ":405,"자에 ":270,"방정식":221,"한국인":156,"한국전":116,"일종의":150,"보로 ":117,"일종으":266,"가치를":121,"한국어":281,"한국에":288,"로마 ":867,"러의 ":97,"프레임":130,"로만 ":108,"거운 ":93,"등교육":72,"한국의":765,"텐도 ":71,"이지만":374,"베스 ":74,"자연적":68,"있으면":59,"있으며":2859,"한국시":112,"임의의":76,"작업을":98,"프로이":170,"고를 ":213,"프로젝":410,"플라스":132,"프로야":90,"풍으로":73,"발트 ":82,"자체 ":74,"학기술":709,"번에 ":61,"품의약":58,"한국철":84,"일정한":279,"일종이":264,"사고이":102,"교가 ":168,"차를 ":208,"프로세":418,"보다는":163,"프로스":78,"학교이":155,"학교의":215,"이크 ":192,"번역 ":68,"학교와":105,"고리 ":71,"프로그":1505,"학교에":231,"사거리":82,"프라이":147,"건설되":84,"사고는":93,"로를 ":314,"남구 ":380,"내각 ":119,"사는 ":696,"프랑스":2191,"거의 ":466,"프레드":80,"한국농":75,"자역학":77,"프랑수":66,"프레스":87,"쟁의 ":196,"프로레":124,"한국사":95,"작으로":431,"자원의":107,"자원을":67,"프랑크":230,"작용을":104,"이집트":409,"료가 ":69,"거인 ":70,"자이다":1129,"쟁은 ":76,"자유민":74,"한국불":125,"쟁을 ":247,"품이다":304,"프로듀":150,"사건이":570,"사건의":71,"했기 ":58,"품으로":223,"사단 ":91,"사각형":73,"자이너":75,"하기도":550,"한국방":93,"사건으":140,"사건을":260,"사건은":231,"키스탄":273,"한국문":122,"사고로":88,"자유를":60,"사건에":107,"자유롭":67,"자유로":59,"터로 ":147,"이클 ":128,"정보 ":449,"자에게":301,"고려 ":339,"키지 ":73,"택시 ":63,"하고있":61,"하고자":204,"하던 ":520,"제명 ":69,"인정받":89,"《한국":70,"거에 ":119,"로듀서":129,"러싸고":66,"학교는":93,"아나 ":116,"한국기":62,"전라북":260,"고로 ":237,"은하 ":58,"조나 ":59,"임으로":241,"프란시":116,"쟁에 ":89,"자연과":58,"학교로":108,"한다 ":13903,"일으킨":137,"일으키":165,"전부 ":107,"학교를":131,"한국과":105,"한국교":74,"독특한":83,"려서 ":64,"나기 ":67,"임위원":62,"건에 ":136,"있으나":300,"가치가":81,"잡지 ":125,"타운 ":91,"자였다":159,"임이다":401,"되지만":116,"타워 ":79,"은행 ":219,"했고 ":365,"록되었":91,"방지하":84,"록되어":194,"레비전":409,"학과를":67,"본래 ":178,"으키는":87,"행과 ":57,"인정하":121,"임워크":62,"하다 
":1598,"인접해":357,"로몬 ":72,"자에서":67,"학교가":70,"터를 ":399,"인으로":806,"정도의":111,"인이다":2750,"정도이":95,"레옹 ":96,"레오 ":64,"보면 ":81,"인이나":74,"이전까":77,"안과 ":124,"터리 ":112,"재위 ":590,"타이 ":170,"이익에":60,"제를 ":765,"하나다":64,"공기업":62,"해당 ":227,"하나님":89,"럽의 ":196,"타의 ":138,"드로 ":276,"나가는":87,"정동으":58,"정동이":148,"있었고":57,"인이며":112,"하나로":1278,"일요일":80,"임에서":89,"있었다":516,"정되었":453,"정되어":273,"버스는":71,"이익을":123,"반한 ":62,"타인 ":187,"장소에":66,"재산을":59,"타임 ":62,"크톱 ":64,"제도로":95,"않고 ":518,"테네그":93,"러진 ":169,"제도를":150,"있었던":279,"종과 ":71,"제리 ":77,"탄의 ":170,"힘을 ":86,"프린스":68,"일이다":180,"버스를":60,"한때 ":76,"일원으":71,"본명 ":192,"탄을 ":70,"위하여":1507,"재의 ":594,"위해서":284,"항공사":85,"이치 ":126,"있어서":333,"안구 ":63,"량이다":57,"안군 ":67,"인이었":103,"량으로":84,"잠수함":77,"벌인 ":97,"악기 ":166,"제도에":143,"들로 ":446,"이전의":127,"함된 ":80,"법원 ":127,"이전에":294,"종교·":271,"보물 ":70,"해는 ":66,"인정되":99,"인이자":288,"종교 ":464,"발한 ":578,"전라남":362,"있었으":214,"일제강":66,"탄생하":60,"도하는":89,"프리카":740,"합된 ":92,"내기 ":85,"러시아":1259,"제도의":156,"제도이":151,"사나 ":68,"과거에":114,"재에 ":70,"개인이":78,"드를 ":373,"관계가":119,"로봇 ":68,"플레잉":80,"차례 ":100,"전문 ":280,"보를 ":504,"플레이":676,"플랫폼":184,"경되었":98,"제로 ":572,"자신의":370,"프리드":163,"자신이":152,"공격하":60,"계를 ":823,"차로 ":128,"계대전":148,"개인용":83,"개인의":86,"드르 ":75,"루가 ":76,"이외에":93,"레이 ":338,"이유는":64,"레인 ":73,"항공모":59,"정당이":122,"이외의":104,"레잉 ":75,"관광객":71,"프리미":129,"일에서":154,"공기관":197,"이오프":76,"레의 ":65,"공동 ":225,"이유로":136,"재산권":68,"건복지":107,"졌다 ":803,"플라이":63,"이용자":68,"인천 ":74,"플래시":78,"강조하":64,"상과 ":312,"장은 ":771,"장을 ":940,"차량 ":86,"정도로":127,"로벌 ":412,"정리 ":264,"정도를":60,"이이다":59,"렉산드":154,"관계를":280,"내고 ":139,"태양 ":74,"삼국 ":84,"둘째 ":87,"장의 ":441,"렉산더":61,"하드 ":77,"장이 ":399,"정된다":136,"게서 ":71,"프로토":225,"정되는":135,"관계에":153,"법에 ":402,"번의 ":336,"았고 ":97,"관광부":1127,"장인 ":112,"태에 ":107,"항공기":266,"이용해":188,"이용한":219,"이용하":540,"이용할":86,"플로리":82,"쓰다이":179," 권 ":193,"약으로":104,"외전화":76,"노 ":946,"류는 ":92,"야이다":152,"논 ":119,"본부 ":174,"송의 ":131,"독을 ":96," 국 ":173," 구 ":910," 군 ":386,"념 ":104,"도의 ":1235,"아카데":151,"약이다":130,"치해 ":314," 교 ":197,"편되면":82,"넷 ":554,"도인 ":130,"독의 ":116,"울의 ":64,"녀 ":215,"우수성":57,"이나 ":3015,"서태평":66,"페라이":66,"년 ":40320,"있기 ":171,"나게 ":58,"너 ":308,"수도권":102,"분과 ":75,"널 ":639,"성한 ":265,"독일 ":770,"도자 ":82,"독이 ":70,"수단이":125,"술로 ":87,"수단으":59,"수단은":92,"네 ":732,"수도는":71,"올라 ":62,"도서관":240,"영화제":129,"차가 ":177,"수를 ":679,"영화이":249,"영화의":83,"왕가의":62,"냐 ":176," 곳 ":60," 공 ":72,"영화에":74," 곧 ":122," 과 ":1102,"내 ":913,"왔다 ":379,"어와 ":330,"남 ":651,"치한 ":2915,"수립 ":59," 곡 ":60," 고 ":93,"낼 ":127,"언어 ":291,"낸 ":543,"나고 ":122,"날 ":645," 계 ":88," 경 ":182,"난 ":1514," 겸 ":249,"나 ":11468,"코나미":58,"우승을":222," 것 ":328,"어야 ":159,"르기 ":103,"우수한":99,"도와 ":396,"우주 ":186,"여러해":77,"끈 ":67,"우스의":77,"수도원":123,"불교 ":309,"동부에":445,"수도이":101,"수도인":69,"베스트":178,"세한 ":63,"열렸다":111,"포드 ":79,"움을 ":181,"어에 ":175,"성화 ":93,"로크 ":62,"오늘날":414,"으로 ":33705,"자기 ":286," 개 ":1973,"트페테":64,"북구 ":168,"월부터":252,"루마니":186,"바티칸":62,"자군 ":57,"오페라":314," 강 ":510,"페르디":70," 각 ":779," 가 ":1852,"도어 ":73,"퍼스 ":60," 간 ":172,"속씨식":90,"외교 ":73,"의된 ":59,"수도회":61,"도에 ":499,"번역되":60,"영등포":159,"파크 ":92,"기》":151,"오랜 ":123,"아트 ":64,"오래 ":72,"야에서":263,"나급 ":176,"이너 ":95,"잡고 ":153,"속에서":155,"련한 ":84,"대회의":76,"대회이":310,"로켓 ":60,"코드 ":238,"법인 ":238,"월에 ":731,"법이 ":149,"대회에":121,"소에서":153,"판에서":87,"바키아":85,"번주 ":58,"이는 ":2155,"박테리":63,"양으로":132,"연속 ":101,"수들이":80,"바탕으":333,"버지 ":158,"대회로":103,"올린 ":87,"어져 ":580,"대회를":64,"우승팀":125,"되면 ":130,"얻은 ":80,"원에 ":338,"께 ":1488,"보디아":75,"대항하":93,"대회는":163,"되며 ":566,"법의 ":384,"버스터":121,"와라 ":58,"파이다":83,"법을 ":510,"법은 ":260,"관ㆍ":62,"법상의":86,"안한 ":113,"범을 ":60,"범은 
":63,"이거나":117,"동물의":147,"동물이":83,"음력 ":492,"별시 ":2065,"러피언":110,"이곳에":70,"이더 ":126,"방향 ":69,"원산지":59,"커뮤니":173,"특히 ":652,"길 ":492,"연맹이":57,"기·":85,"이던 ":135,"긴 ":534,"러해살":77,"범죄 ":90,"김 ":69,"유럽과":60,"원은 ":642,"인다 ":433,"장군 ":139,"대한제":216,"어인 ":140,"원을 ":808,"르가 ":94,"대학을":73,"인구가":178,"대학으":61,"대학의":118,"윌리엄":254,"대학이":119,"부가 ":501,"원의 ":690,"역사 ":258,"판으로":77,"속이다":111,"글 ":394,"아케이":78,"용어이":331,"업에 ":215,"금 ":663,"급 ":1684,"여서 ":467,"원이 ":411,"미합중":60,"원인 ":98,"위성 ":107,"극 ":392,"치체를":60,"그 ":5244,"여섯 ":109,"여성 ":225,"으며 ":7980,"근 ":449,"으면 ":156,"파이어":96,"대회가":57,"소이다":165,"인구는":2448,"원자 ":102,"언을 ":82,"센터는":143,"약칭 ":297,"기 ":12721,"양에서":166,"속으로":235,"이도 ":156,"∼ ":117,"베이 ":167,"파일을":68,"원장 ":135,"요가 ":84,"어사전":65,"대한민":7350,"재가 ":151,"소재로":63,"귀 ":64,"용어는":192,"미한다":599,"카자흐":73,"대한불":58,"열리는":184,"렉트로":73,"미하는":189,"오피스":122,"손으로":97,"석하는":74,"이것이":59,"두는 ":58,"장관 ":152,"이것을":92,"이것은":282,"인간이":102,"인간의":257,"르게 ":297,"장과 ":256,"오피아":86,"이다 ":44553,"균 ":216,"규 ":410,"법적 ":132,"오를 ":79,"스가 ":772,"용어로":228,"어의 ":700,"센터에":60,"왕국이":67,"대해서":237,"왕국의":244,"파이터":67,"궁 ":201,"대하여":393,"대학에":115,"로젝트":389,"나가 ":143,"굴 ":183,"번째 ":1634,"국 ":12549,"대학원":221,"데이터":858,"구 ":8696,"구·":91,"국·":107,"용어를":67,"미하며":79,"델이다":57,"군 ":3195,"력한 ":146,"대학생":58,"번지 ":352,"르고 ":392,"권 ":1513,"유래되":83,"유래된":87,"되도록":63,"용자들":92,"에서의":511,"록한 ":134,"대하는":92,"법인이":2000,"법인의":65,"법인으":81,"껍질":59,"때 ":2169,"원숭이":76,"웨어 ":581,"애플 ":68,"련하여":63,"법정동":157,"된다는":107,"번주이":218,"딸 ":107,"병원 ":128,"딴 ":82,"손자이":109,"대표하":220,"인기를":112,"양주시":57,"오랫동":75,"따 ":83,"끄는 ":125,"꼬리":79,"용이다":73,"딩 ":636,"유러피":111,"리고 ":1415,"데이비":123,"용으로":300,"인된 ":59,"대표팀":149,"오래된":193,"용과 ":88,"리게 ":92,"대표적":477,"운영되":116,"외교관":176,"소장은":80," 개국":359," 개교":77," 가나":171," 객관":69,"용자가":142," 개관":128," 강남":325,"에서부":139," 간단":314,"대표이":78," 가능":764,"떤 ":678,"소유하":90,"파트 ":89," 가는":110," 가공":207," 각국":128,"대학교":1503,"꾸는":62,"소재지":313," 각각":403,"발행하":79," 가격":195," 개가":76," 강과":184,"당하였":79," 갖고":277,"른다 ":763,"에의 ":102," 감각":84,"유래했":74," 가깝":67," 가까":264,"인구의":90,"기초":552,"까지":4684,"기체":184,"도시 ":600,"유럽의":184,"인도 ":427,"균형":133,"유래하":101,"유래한":138,"득 ":112,"르기도":285,"드 ":7173,"포로 ":81,"근하":60,"유럽에":197,"김천":58,"안토니":87,"들 ":1322,"든 ":2179,"역번호":77,"우가 ":328,"기준":2706,"코드를":100,"길을":64,"장기 ":92,"길이":553,"센트럴":124,"기존":438,"기종":131,"에서만":99,"기지":219,"김정":94,"까운":158,"김일":103,"원전 ":1040,"까이":70,"버지는":99,"깊은":85,"버지니":97,"스과 ":57,"자나 ":108,"이기도":812,"르기까":96,"왕후 ":114,"대표로":57,"법으로":406,"기하":531,"기한":151,"디 ":655,"딕 ":78,"인데 ":220,"기획":281,"발하였":89,"기회":128,"기화":84,"발하여":147,"딘 ":125,"기호":268,"당한다":198,"되는데":137,"범으로":68,"기후":219,"본선 ":60," 년 ":39188,"김해":119,"당하는":446,"영상 ":190,"범이다":175,"인구밀":163,"속적인":79,"소재하":88,"속적으":84,"급한":91,"급하":294,"기타":646,"에서는":2546,"당했다":95,"코드는":96,"등 ":2664,"에서도":483,"유롭게":61,"법원에":72,"우에는":147,"도스 ":66,"법원이":60,"창군 ":59," 네 ":279,"이동 ":119,"법이다":392,"있던 ":494,"방행정":150,"일까지":745,"아키텍":133,"근에":408,"그와":139,"국화":64,"구회":87,"국회":507,"둘 ":109,"그에":296,"배하는":77,"극에":58,"구현":195,"구협":65,"둔 ":252,"규정":490,"국해":88,"국하":80,"국한":79,"급시":77,"두 ":2443,"규제":83,"급수":72,"구할":69,"구한":114,"구하":801,"기부":230,"기병":61,"당하고":80,"금속":226,"기본":591,"기보":62,"기법":290,"연방의":172,"기반":832,"당하기":83,"운영하":421,"교황":497,"부는 ":372,"구팀":94,"기물":83,"교환":215,"교회":1206,"술부 ":260,"귀족":311,"방향으":165,"교향":157,"르는 ":1003,"교하":132,"국토":367,"기리":116,"기름":86,"기를":1419,"베이스":456,"기록":1098,"기로":698,"될 ":726,"기의":1771,"기이":523,"기인":192,"기자":720,"기장":468,"기재":80,"김영":83,"기적":272,"기전":79,"기점":130,"뒤 ":432,"기와":342,"근처":198," 내 ":198,"업인 ":92," 낸 
":84,"기원":1347,"병의 ":61,"김씨":66,"우이다":123,"없이 ":416,"기압":128,"기아":138,"수록된":81,"수록되":127,"기에":2090,"기여":1114,"기어":60,"기억":172,"기업":1321,"금지":206," 남 ":62,"었을 ":154," 난 ":119,"급제":92,"양자역":76,"규칙":225," 날 ":143,"급을":163,"급으":81,"금의":358,"급은":66,"기슭":65,"기스":70,"금이":141,"급의":120,"워싱턴":116,"방해양":72,"금을":242,"금은":170,"금으":71,"금융":485,"급이":114,"기시":64," 나 ":149,"글이":148,"글을":102,"본식 ":161,"기소":84,"권한":204,"극적":138,"역시 ":1264,"작가로":60,"기수":80,"기술":2933,"어진 ":1828,"다》":68,"글자":184,"그이":134,"기사":297,"용자의":81,"의료 ":61,"극을":92,"그의":1129,"둥 ":134,"금액":59,"급에":136,"기서":167,"워진 ":158,"근의":142,"극장":238,"금에":66,"극작":92,"기생":113,"기상":277,"극이":61,"국외":69,"법칙 ":74,"국왕":350,"예루살":108,"구원":461,"국역":58,"국영":89,"국에":2267,"기기":316,"구역":938,"기까":238,"군에":440,"구와":486,"구장":488,"업을 ":671,"국인":683,"국이":927,"구자":155,"구인":96,"구이":427,"국의":7823,"공화":2272,"군의":841,"국장":157,"과하":139,"과학":2295,"군은":187,"군을":273,"군으":160,"공했":63,"업은 ":103,"급되":106,"공해":80,"공항":327,"공학":446,"공한":214,"공하":712,"국은":420,"국을":477,"공헌":123,"동 ":5208,"료의 ":83,"국으":324,"구의":1030,"국유":104,"이때 ":93,"기경":136,"근대":297,"곡한":152,"곡하":125,"기계":386,"기고":122,"교적":271,"벤처 ":66,"교정":87,"도 ":19093,"기관":3227,"독 ":328,"교의":1054,"그대":267,"교이":272,"교인":98,"교장":78,"고한":57,"고학":98,"고하":102,"구약":91,"공포":158,"기금":119,"돌 ":128,"포를 ":110,"그들":327,"국어":699,"구에":986,"골프":78,"있다 ":14608,"기구":741,"돈 ":121,"성하고":147,"도·":76,"국악":57,"교와":220,"구시":81,"성하기":80,"군수":86,"구스":198,"교에":493,"교역":61,"국수":106,"그녀":205,"계획":601,"위스 ":122,"로프 ":73,"업의 ":668,"계화":120,"운영에":81,"기갑":60,"기가":643,"교육":2560,"기간":605,"아테네":86,"그는":921,"교원":80,"병을 ":93,"데에서":64,"공통":193,"국식":78,"국시":294,"구사":107,"구상":120,"국사":242,"국산":101,"베이니":80,"경향":176,"용인시":67,"경험":178,"번지에":231,"경하":130,"업이 ":195,"국소":69,"구속":114,"그나":76,"구소":547,"금까":92,"구성":2022,"군산":96,"국세":95,"계한":64,"계학":71,"계하":113,"군사":580,"관측":213,"극복":58,"근본":126,"교통":744,"구치":82,"교토":91,"괴하":71,"포니아":254,"오디오":134,"의로 ":155,"기라":71,"버지이":112,"교파":61,"된 ":12222,"있는 ":7557,"리가 ":556,"기둥":76,"근무":149,"기들":73,"국철":110,"구체":168,"군청":66,"구축":271,"도모하":352,"도모할":62,"기능":861,"광학":98,"기는":959,"글로":653,"규범":69,"그룹":828,"권위":161,"그마":69,"기대":121,"그림":313,"그리":2785,"기다":78,"그린":273,"궁전":122,"기니":172,"그를":132,"인근에":94,"권이":270,"권익":212,"이드 ":468,"권의":270,"권을":623,"권은":116,"권으":128,"긴다":73,"기되":97,"돼 ":80,"글리":111,"기동":137,"권자":142,"기도":3690,"기독":739,"잇는 ":257,"관한":1484,"관하":360,"국제":2470,"규모":583,"국정":128,"관해":84,"이들 ":222,"기나":120,"관할":563,"구적":61,"군인":461,"군이":624,"구제":159,"국적":195,"국전":133,"공회":208,"규명":89,"괄하":139,"교차":238,"르다 ":174,"그러":521,"그런":74,"그랑":78,"그라":318,"구조":960,"그램":1195,"그래":891,"군정":60,"그랜":101,"교체":59,"그로":269,"글라":119,"번째로":469,"그렇":75,"관현":159,"그레":393,"국주":73,"그려":165,"기념":619,"권에":285,"구지":95,"근로":136,"글러":60,"국지":159,"그루":101,"군주":362,"글랜":378,"계약":484,"여왕 ":58,"결정":751,"계열":387,"광범":88,"교들":65,"경우":1823,"계에":1064,"고시":148,"계와":142,"역사가":122,"국농":82,"고스":110,"겸임":57,"고슬":67,"육상 ":106,"고서":106,"격자":62,"倉丘 ":68,"견을":57,"격적":116,"교도":126,"것처":82,"광물":81,"곡선":132,"고성":119,"구나":78,"고속":845,"결의":140,"경에":408,"교동":80,"결을":132,"국내":616,"경영":484,"걸친":86,"권과":106,"결이":61,"경연":59,"이끌어":63,"계의":805,"구대":120,"계이":169,"과사":639,"국대":144,"계인":107,"계자":130,"군대":278,"언어학":160,"계적":562,"倉三 ":204,"경주":361,"국도":235,"구동":102,"경유":80,"겨지":139,"겨진":136,"경으":205,"경은":67,"경을":557,"경의":245,"공산":474,"경인":98,"공사":628,"경이":223,"광부":1133,"경쟁":408,"구는":2663,"계유":60,"倉丁 ":151,"경전":188,"경제":1825,"구단":288,"고와":65,"국들":75,"곡에":64,"교로":205,"고원":88,"관심":174,"율리우":65,"고용":223,"광산":147,"관습":89,"아프가":115,"고안":197,"군도":75,"구되":67,"관성":72,"관세":134,"료에 ":73,"구들":66,"외한 
":214,"고에":169,"경지":70,"고양":226,"공식":660,"공신":113,"더 ":1406,"공업":378,"공에":64,"덕 ":151,"경찰":561,"고정":166,"곳에":821,"고전":364,"던 ":4707,"교문":57,"공연":355,"구라":68,"고조":104,"공여":69,"권도":62,"공예":112,"공영":61,"고종":145,"고유":256,"교리":113,"곡은":123,"곡으":120,"고의":311,"교류":510,"고위":328,"교를":330,"거하":226,"곡을":163,"고인":82,"고이":182,"곡의":107,"고있":175,"곡이":253,"고자":344,"공적":170,"공전":79,"구르":74,"구를":602,"구름":70,"계층":171,"국립":960,"과이":162,"구매":106,"공정":235,"구리":185,"공제":58,"과의":611,"구마":87,"공익":127,"공인":185,"덤 ":58,"공작":278,"공자":115,"공의":311,"공이":148,"에스페":64,"교법":60,"공장":150,"곳을":106,"잉글랜":374,"곳은":97,"곳으":142,"고지":182,"곳이":283,"공으":88,"에스파":105,"공을":239,"관여":73,"관에":348,"과와":60,"곳의":66,"공유":261,"구려":197,"공원":601,"구로":546,"계청":60,"국령":72,"과에":332,"공용":107,"관악":84,"격파":66,"델 ":199,"고체":92,"관적":103,"에스토":68,"관점":111,"덴 ":238,"공직":57,"국문":157,"국무":264,"과정":1002,"과적":120,"데 ":4107,"공중":115,"격투":81,"벗어나":80,"과제":59,"궤도":196,"광양":81,"공주":233,"관장":263,"광역":1560,"번이다":102,"세포의":69,"과일":57,"관위":96,"관으":1430,"교부":78,"倉之 ":137,"관의":2174,"규격":148,"관을":436,"관은":726,"관인":90,"관이":1515,"구멍":69,"근거":264,"격하":210,"격한":92,"권리":429,"국불":143,"그것":304,"구분":457,"계통":173,"곤충":101,"교사":363,"국보":135,"우치 ":58,"그가":220,"군벌":62,"구별":243,"관직":120,"광장":110,"역에 ":645,"국방":338,"국민":1323,"구밀":163,"권력":210,"구미":92,"성한다":77,"급격":65,"고타":71,"달하는":148,"결혼":181,"페르시":179,"고쿠":186,"금강":116,"성하는":565,"과천":96,"관차":147,"관찰":160,"근교":78,"광진":58,"광지":58,"교수":610,"발표된":92,"발표되":107,"결합":346,"결하":438,"결한":100,"웨이 ":207,"倉並 ":110,"견해":92,"군부":82,"광주":542,"견한":104,"견하":126,"는 ":115688,"언어에":73,"관광":1592,"관구":95,"늘 ":65,"관급":89,"광객":71,"가축":73,"언어의":102,"고는":199,"언어이":170,"광고":267,"고대":997,"능 ":323,"가치":461,"고도":1752,"경로":111,"건설":619,"곡동":96,"경력":62,"의도동":78,"연안 ":64,"입된 ":104,"개척":110,"갖춘":117,"검색":157,"쟁과 ":63,"고등":647,"계로":351,"개체":157,"곡들":64,"검사":367,"객체":131,"거스":89,"갖추":158,"개최":745,"가타":160,"골든":61,"공단":148,"경마":75,"과는":512,"계를":823,"개칭":76,"공되":110,"과대":160,"공동":868,"가톨":538,"도메인":70,"닉 ":179,"계몽":70,"과도":95,"니 ":1434,"독립하":59,"게서":76,"닌 ":658,"고래":67,"고라":68,"고로":318,"거에":255,"고려":896,"관되":68,"가포":95,"건에":176,"경보":91,"게시":63,"골라":80,"고르":135,"고를":213,"작가이":229,"교가":179,"작곡가":402,"고리":445,"닝 ":121,"거의":471,"예술 ":247,"님 ":88,"거운":99,"도사 ":75,"경부":199,"경북":127,"건이":681,"가했":110,"거점":73,"것에":206,"교과":83,"거제":96,"교관":186,"곡면":66,"다 ":111349,"간하":67,"닥 ":58,"간한":58,"자는 ":1091,"번역하":66,"거인":110,"거이":70,"개통":175,"경비":85,"건으":192,"건은":246,"가하":646,"가한":227,"료와 ":63,"도모를":115,"건을":482,"검역":179,"건의":278,"각한":97,"가해":91,"각하":172,"간호":75,"달 ":337,"성하여":117,"성하였":153,"결성":349,"벌어진":238,"거주":408,"공룡":180,"과로":156,"검은":129,"단 ":1654,"각형":159,"교구":291,"건전":253,"콜라 ":66,"간행":103,"공로":98,"건조":153,"고문":120,"고무":61,"경상":955,"담 ":279,"검정":63,"강한":188,"강하":127,"관료":227,"관련":1378,"것을":2110,"아파트":117,"것의":67,"것으":1506,"게오":79,"것은":830,"개편":325,"공립":95,"예수 ":112,"여야 ":87,"것인":86,"것이":2686,"거지":151,"계선":74,"강화":343,"공무":742,"구경":84,"개한":63,"국경":557,"국계":88,"개하":180,"관리":2036,"고분":99,"공모":103,"언어를":125,"당 ":2124,"게임":2275,"게이":356,"계사":104,"계산":303,"경성":115,"구가":459,"구간":259,"과를":382,"결승":176,"국가":3195,"구개":146,"거창":63,"계수":70," ∼ ":97,"군과":252,"국교":130,"관목":62,"구권":57,"국군":183,"개화":66,"계속":320,"괴되":57,"고비":130,"개혁":328,"구관":145,"국고":62,"구광":218,"대 ":7820,"과목":73,"국과":776,"국공":136,"국관":64,"의를 ":445,"언어로":146,"계승":345,"국기":252,"거쳐":379,"구기":227,"격에":68,"공법":70,"관문":69,"구글":106,"루스 ":303,"거치":92,"격이":190,"격의":79,"고사":138,"건축":557,"격을":325,"검찰":214,"격으":151,"변에 
":94,"교단":121,"교는":208,"걸쳐":621,"결에":59,"공부":145,"이끄는":118,"겨울":101,"개미":141,"건너":96,"언어들":71,"개발":2877,"개방":130,"독립적":105,"거나":1557,"우주의":58,"본명은":189,"으로부":448,"개막":77,"각본":92," 길 ":123,"간부":65,"개명":89,"어에서":253,"강릉":107," 기 ":194,"개를":146,"가보":96,"것과":204," 긴 ":204,"감소":107,"게는":217,"자금을":57,"계가":271,"것도":92,"강서":89,"경계":516,"경과":169,"가수":461,"으로서":1111,"결과":527,"감사":119,"가스":350,"격기":81,"결국":131,"가슴":62,"가시":211,"올랐다":64,"울주군":82,"가설":126,"가서":87,"건담":81,"언어는":67,"간섭":83,"간선":71,"가속":96,"온라인":297,"거두":93,"갈색":63,"거는":139,"개별":97,"독립을":81,"개봉":161,"가사":265,"가산":81,"으로써":965,"우즈베":76,"거대":185,"가상":319,"독립운":225,"간을":392,"건립":125,"간의":1187,"감염":98,"간인":100,"동부 ":460,"간이":504,"가입":188,"경남":138,"가자":80,"각이":151,"가장":2280,"각자":67,"간으":125,"간은":170,"가을":69,"가의":732,"각을":142,"가인":226,"각의":239,"가이":1724,"거를":112,"고가":145,"곡가":419,"거리":759,"고객":146,"아티아":132,"가요":88,"가우":57,"가운":1240,"갈어":69,"발행되":71,"간에":601,"겠다":82,"개신":194,"가와":549,"개시":97,"가였":116,"가오":62,"각에":105,"거로":219,"계기":156,"계급":233,"가에":444,"왜란 ":94,"가야":126,"개수":58,"이란 ":1323,"거래":321,"경기":2213,"겨난":61,"이라 ":885,"것들":114,"감시":118,"계관":69,"개성":108,"계곡":114,"개선":303,"개설":104,"우주선":91,"느 ":258,"아티스":65,"이래 ":159,"공급":250,"관계":1290,"계된":88,"공기":688,"개조":143,"계되":101,"대통령":1050,"관과":175,"건복":108,"발표하":134,"발표한":276,"개장":97,"거부":132,"공국":116,"공군":148,"발표했":57,"으로만":64,"개정":217,"강점":455,"공고":79,"과가":110,"개의":1771,"계대":179,"공공":459,"공과":122,"강제":215,"개인":627,"강조":121,"경되":110,"과거":402,"객이":82,"오브 ":217,"개입":58,"변의 ":108,"늄 ":94,"감정":123,"개원":65,"개월":142,"간지":80,"공격":404,"계는":211,"같은":1786,"외국인":172,"같이":515,"게르":135,"공간":766,"겼다":112,"값이":110,"강의":343,"강이":240,"강은":66,"강을":182,"가지":2292,"가진":654,"공개":410,"강으":96,"각지":87,"고급":165,"감을":118,"결되":247,"결된":216,"값을":97,"수많은":217,"간주":126,"건물":382,"고기":355,"강원":385,"각종":393,"작된 ":349,"격되":57,"고구":210,"간접":88,"견된":179,"견되":279,"가져":156,"각적":77,"뉴 ":162,"가정":341,"가적":80,"고고":132,"고교":84,"가족":289,"간적":65,"갈의":77,"강에":164,"가졌":76,"각기":59,"가까":268,"가깝":67,"가기":142,"개가":139,"발하는":83,"강과":253,"갖고":277,"외국어":63,"가가":352,"각각":410,"유명하":406,"유명한":333,"가고":100,"유명해":61,"가공":315,"변을 ":61,"가게":78,"가격":229,"자동 ":86,"가경":58,"간고":62,"간과":147,"가구":64,"음반 ":194,"간격":58,"감각":102,"도마뱀":70,"각국":139,"작곡한":149,"작곡하":118,"가능":974,"가는":573,"가다":72,"가니":157,"강남":335,"가대":160,"영미법":57,"간단":335,"간다":177,"독립된":72,"으로는":2876,"우고 ":113,"뇌 ":72,"가도":58,"개관":130,"객관":70,"가나":261,"개교":78,"개국":380,"가노":67,"으로도":374,"강동":69,"강도":90,"가량":79,"개된":94,"》는 ":359,"가락":137,"개되":181,"가라":155," 급 ":610,"가들":383,"간되":110,"간된":75,"개념":759,"가된":598,"가되":103,"각되":72,"같다":132,"갈등":59,"감독":690," 그 ":3864,"갔다":90,"갖는":232,"발하고":100,"강력":160,"갈리":113,"누 ":64,"거구":248,"법에서":118,"건강":248,"가면":128,"가며":64,"감리":65,"가미":121,"개로":133,"개량":160,"가문":317,"수사 ":66,"눈 ":92,"건국":190,"갈래":67,"가루":139,"가로":847,"갈라":146,"객들":72,"거가":75,"가리":1920,"육성 ":90,"가마":120,"각류":82,"가르":365,"가를":483,"성으로":264,"역을 ":746,"역의 ":612,"성이다":196,"여자 ":201,"로스트":70,"역이 ":202,"사회학":136,"이런 ":124,"보에 ":94,"자기장":84,"여성의":88,"역인 ":105,"으로의":77,"포가 ":68,"이로 ":257,"역사학":74,"상품의":60,"상품을":66,"칭하는":246,"웨스턴":61,"세웠다":74,"연을 ":133,"동체 ":85,"이론 ":210,"연의 ":81,"웹사이":130,"분기점":81,"세청 ":72,"원에서":238,"유민주":67,"자들 ":77,"본어 ":107,"위에 ":582,"불구하":86,"섬이다":96,"의미 ":59,"약한 ":189,"보와 ":132,"어졌으":85,"소송 ":68,"업체 ":73,"본에 ":135,"위와 ":92,"렸으며":77,"특징이":235,"특징을":85,"칭한다":172,"설치 ":74,"파와 ":67,"웨스트":211,"월간 ":75,"령으로":158,"사회에":170,"령이다":76,"케스트":70,"연장 ":111,"판에 ":66,"사회의":194,"사회적":296,"로스앤":97,"사회주":304,"영어 ":539,"들고 ":146,"일러 ":99,"도쿄 ":329,"동이다":346,"열을 ":131,"동으로":429,"베타 ":78,"예비치":64,"열의 ":161,"되어 
":4088,"올림픽":756,"영에 ":111,"소수 ":73,"셋째 ":57,"콜롬비":70,"들과 ":470,"판이 ":87,"어이다":784,"상호간":218,"이니아":108,"상하이":159,"드가 ":238,"상하였":80,"역사는":67,"동아시":126,"판의 ":77,"본사는":86,"로소프":372,"도자이":65,"어있다":103,"독일어":182,"독일에":106,"독일의":688,"판을 ":143,"어있는":86,"성장하":60,"동안에":69,"어이며":75,"독자적":97,"아프리":752,"역삼동":82,"장관이":63,"원이다":371,"재는 ":561,"장관을":107,"론적 ":99,"성적을":87,"엔진 ":111,"일로 ":158,"동양사":141,"원으로":777,"업연구":87,"선진화":57,"서쪽으":883,"서쪽은":82,"벡터 ":62,"본사를":78,"동아일":75,"위원 ":99,"서쪽에":179,"역사를":147,"친환경":81,"수는 ":371,"재단 ":121,"상했다":89,"오리지":76,"보의 ":153,"파인 ":61,"러한 ":557,"소스 ":199,"도입하":78,"원자력":149,"론을 ":213,"론은 ":142,"원자로":90,"론이 ":58,"파의 ":209,"동에서":90,"보이 ":67,"역사상":171,"이를 ":1065,"론의 ":239,"이름 ":428,"복을 ":86,"려져있":78,"원이었":68,"역사적":299,"인도네":219,"판은 ":76,"동차 ":460,"속버스":59,"역사와":60,"위인 ":74,"원의원":81,"역사에":123,"으면서":126,"장군이":119,"세워졌":73,"역은 ":410,"세워진":147,"별자리":112,"본은 ":113,"인류 ":64,"파일 ":247,"어졌다":315,"위의 ":333,"업에서":66,"동영상":97,"역상으":142,"장된 ":102,"원인이":59,"치한다":628,"로의 ":620,"베이징":158,"록은 ":122,"본부는":115,"치하는":172,"업적을":72,"였을 ":64,"평균 ":104,"록을 ":144,"패션 ":61,"상황에":103,"원작으":116,"려졌다":93,"요한 ":1062,"원장은":99,"불가능":81,"상황을":61,"도이다":379,"영의 ":109,"특정한":270,"치하며":277,"일대에":115,"동쪽 ":179,"도입된":60,"도입되":74,"소비에":317,"독으로":61,"로세스":161,"도이기":66,"불가리":157,"여진 ":140,"로이 ":96,"영상을":69,"록의 ":66,"움직임":86,"움직이":119,"로세서":244,"영을 ":122,"올바른":82,"로서의":214,"원칙 ":76,"이며 ":7170,"부대 ":58,"도움을":80,"도자로":59,"도이며":276,"우체국":67,"수다 ":66,"로서는":102,"앙행정":163,"수단 ":74,"인도양":80,"치하여":88,"샌프란":83,"치하였":59,"영이 ":63,"인도에":68,"인도의":191,"로운 ":749,"독이다":59,"케인 ":80,"애플리":100,"웨어를":99,"이동하":97,"있는데":320,"상호작":138,"세인트":105,"술개발":68,"있다고":197,"으므로":160,"세츠 ":90,"우는 ":262,"료를 ":341,"양한 ":767,"요구하":87,"팀은 ":106,"북도 ":876,"없으며":74,"있다는":230,"원과 ":225,"었으나":628,"의료기":64,"론에 ":135,"어지고":78,"로와 ":177,"부기관":105,"업이다":206,"서초동":58,"동시에":375,"돌아가":62,"영역 ":58,"원주민":78,"업으로":181,"웨어이":75,"웨어의":74,"파에 ":57,"팀을 ":101,"열이 ":57,"티스트":94,"버트 ":241,"팀인 ":106,"었으며":1637,"팀이 ":193,"끌고 ":65,"이들의":125,"이들은":168,"이들을":80,"컵의 ":85,"이들이":88,"롬비아":69,"력으로":173,"있는지":77,"어진다":212,"일명 ":77,"어지는":358,"도에서":324,"레지스":65,"업인이":86,"부근에":142,"자력 ":84,"재까지":179,"로에 ":104,"인들에":65,"로어 ":69,"자로 ":938,"원군 ":162,"원구 ":57,"보안 ":111,"수도 ":546,"소비자":202,"인물 ":119,"치하고":1297,"세종특":81,"역시의":94,"성직자":104,"오스 ":205,"성질을":132,"불교의":418,"인들은":69,"인들을":66,"이듬해":68,"인들의":133,"인들이":149,"도체 ":99,"서초구":253,"불교사":82,"인디언":128,"록에 ":92,"이미 ":149,"료로 ":215,"론으로":87,"생하였":144,"로이센":147,"인민 ":73,"생하여":71,"이버 ":210,"입되었":102,"론이다":163,"입되어":66,"특한 ":81,"겐 ":159,"자니아":79,"이번 ":130,"게 ":7497,"걸 ":59,"건 ":812,"예수의":57,"검 ":84,"것 ":343,"소설가":209,"음성 ":63,"예술의":117,"어트 ":59,"업진흥":67,"케이드":87,"거 ":803,"》 ":2725,"《 ":66,"외하고":65,"〉 ":274,"소와 ":115,"자료 ":141,"곡 ":443,"고 ":30643,"곤 ":148,"되면서":411,"복지 ":67,"티칸 ":59,"운데 ":1085,"겸 ":297,"경 ":1590,"워크 ":327,"계 ":4339,"겨 ":191,"あ ":337,"격 ":421,"별지방":128,"있도록":441,"견 ":139,"결 ":234,"패의 ":75,"일반 ":275,"자리 ":281,"로이다":326,"광 ":234,"이라고":1978,"쿠니 ":89,"장남이":65,"관·":148,"로이드":134,"용해 ":222,"자를 ":1000,"관 ":2189,"과 ":18300,"곳 ":78,"록이다":99,"공 ":940,"웨이의":60,"곱 ":113,"분류 ":142,"골 ":227,"곧 ":133,"운동 ":385,"케이블":113,"교·":290,"용할 ":375,"선총독":94,"부를 ":654,"연안에":133,"케이션":201,"웨일스":98,"역에서":460,"용한 ":474,"소에 ":141,"교 ":3766,"유성구":90,"생하는":237,"괴 ":95,"쿠가와":148,"속에 ":300,"생한다":87,"생활을":151,"생활의":64,"이렇게":62,"소장 ":152,"이라면":58,"생활에":67,"동인 ":74,"설치된":235,"설치되":170,"동이 ":214,"소설이":157,"속이 ":83,"분된다":62,"용된 ":167,"소송법":99,"유사하":81,"유사한":151,"의사 ":123,"동생이":135,"소의 ":297,"소속된":73,"소속되":78,"육의 ":116,"속은 ":58,"동을 ":1013,"속을 ":100,"동은 ":194,"퍼레이":61,"육을 
":217,"이라는":1497,"속의 ":605,"코미디":122,"북동부":261,"유의 ":224,"센터 ":405,"선출하":127,"동의 ":429,"론에서":229,"부동산":93,"었지만":266,"갈 ":248,"이블 ":130,"가·":97,"간 ":2583,"이븐 ":57,"율을 ":155,"각 ":1502,"가 ":25131,"소설로":59,"이브 ":214,"개 ":2483,"객 ":182,"육성하":58,"와서 ":62,"강 ":1090,"코스 ":158,"감 ":193,"팀이다":183,"갑 ":72,"작되었":341,"값 ":73,"우도 ":215,"작되어":105,"팀으로":74,"이론과":70,"일본 ":2149,"일보 ":109,"ア ":202,"우구스":84,"소속기":439,"유적 ":73,"북동쪽":269,"션으로":78,"예수교":59,"의무를":57,"자동차":777,"예술가":65,"용될 ":69,"로아티":131,"로에서":97,"음반이":168,"션이다":124,"일랜드":482,"전과 ":393,"요리 ":109,"설한 ":86,"위에서":153,"이루고":127,"일부 ":510,"여졌다":78,"영어권":66,"보전 ":62,"의미로":213,"의미를":182,"위가 ":174,"설치하":82,"설치한":64,"도중 ":60,"우크라":193,"역이었":65,"의미가":67,"도시를":74,"이라크":126,"본이 ":103,"이러스":164,"평가를":70,"본을 ":178,"역이며":57,"운행 ":58,"의미는":59,"본의 ":1263,"도시로":1022,"역임하":186,"역임한":62,"역임했":134,"포구 ":295,"온의 ":60,"의미이":69,"의미의":91,"의미에":122,"이르기":105,"업체이":81,"베트 ":83,"소송을":60,"로야구":89,"외부 ":93,"틴어로":69,"재로 ":182,"이룬다":93,"소속의":361,"소속으":82,"이루는":173,"자들에":142,"보스니":124,"별자치":252,"이론에":160,"보존 ":138,"자들이":347,"레코드":185,"자들의":199,"자들은":141,"소속이":111,"자들을":122,"이론이":166,"용노동":84,"페라 ":140,"이론은":70,"이론을":75,"이러한":453,"율적 ":57,"이론의":70,"도시에":84,"오의 ":93,"오사카":137,"전국 ":139,"도시이":464,"도시의":101,"영어로":117,"이론적":57,"자바 ":87,"일보》":157,"의식 ":75,"송을 ":189,"서태지":87,"오와 ":60,"코리아":162,"쉽게 ":173,"세포 ":173,"의미한":603,"의미하":328,"전기 ":326,"위이다":181,"번호 ":230,"이른바":70,"일련의":181,"유지 ":127,"이른다":109,"이르는":307,"류가 ":152,"동안 ":746,"요로 ":75,"루를 ":71,"돈을 ":58,"이르렀":73,"이르러":58,"이루어":990,"우드 ":107,"수로 ":513,"수록 ":68,"요하다":73,"티오피":85,"범한 ":75,"도시는":118,"이름을":619,"이름은":905,"이름으":571,"이름의":95,"이름이":757,"이름인":68,"약품안":57,"위원회":1083,"이맥스":73,"평가하":59,"재단은":166,"역이다":379,"엔진이":58,"재단법":390,"역으로":485,"도시가":58,"엔진을":65,"센트 ":61,"선형 ":74,"위원장":186,"동에 ":602,"보스턴":85,"이름에":98,"점과 ":81,"여의도":98,"열차 ":88,"일렉트":97,"운드 ":160,"부로 ":338," 본 ":154,"생 ":830," 볼 ":404,"면의":154,"면으":111,"면은":81,"면을":202,"목사":132,"명에":192,"오스트":938,"샤 ":153,"면이":257,"없다 ":290,"새 ":176,"무공":61,"무관":282,"색 ":724,"아스 ":84,"선수가":86,"속되어":95,"속되었":68,"문과":177,"명시":68,"불렀다":77,"맨해":63,"무국":71,"물건":208,"물관":507,"물과":183," 또한":784,"무기":312,"물고":100,"문구":184,"면에":519,"쪽에 ":528,"면역":75,"문기":69,"선수권":182,"명제":72,"선이 ":276,"무대":221,"무덤":62,"명종":57,"대자동":69,"문대":131,"모아":148,"모양":499,"먼트":452,"모야":73,"면적":2044,"목소":65,"명예":174,"무네":57,"별한 ":132,"모스":327,"모습":277,"멸종":98," 라고":1124,"모시":69,"명을":535,"명은":636,"명으":376,"몬스":58,"명의":990,"명이":1234,"명인":142,"무는":91,"무늬":117,"선수단":172,"웠다 ":98,"메스":59,"목록":269,"모로":214,"쓰이고":88,"본주의":134,"마틴":68,"마티":65," 별 ":66,"었다 ":8595,"모를":206,"모르":159,"불렸다":153,"립허":405,"성에 ":315,"립했":99,"리히":298,"립한":356,"립하":482,"마포":172,"멕시":360,"림픽":766,"메시":170,"선수들":135,"마토":82,"리프":182,"몇몇":116,"립트":86,"명명":152,"마드 ":89,"만큼":116,"모라":68,"부르고":80,"마트":201,"모래":91,"복지부":89,"명문":66,"설이 ":126,"리눅스":103,"리핀":364,"리한":242,"리학":967,"리하":714,"매년 ":363,"머지":135,"삼 ":80,"명사":136,"만화":639,"산·":76,"모바":156,"용되고":286,"설을 ":221,"무거":57,"메인":168,"메이":1331,"메일":86,"매하":265,"매한":223,"무게":90,"용되기":81,"용도로":77,"문가":203,"설은 ":94,"망한":94,"망하":162,"상 ":4129,"망했":69,"명성":146,"무가":118,"대전광":266,"업한 ":64,"마하":117,"산한 ":76,"사 ":5233,"모리":571,"산하 ":318,"사·":132,"말하":599,"말한":3393,"말해":114,"알바니":106,"설의 ":132,"살 ":233,"만한":100,"모모":86,"면서":1898,"먼저":128,"산 ":2364,"연극 
":67,"림축":110,"마찬":135,"리코":146,"모노":103,"매주":81,"매장":77,"리카":1441,"리칸":149,"림청":142,"르프":78,"매일":93,"명된":68,"명되":255,"리치":164,"메모":272,"리케":256,"모나":69,"맹에":60,"르헨":201,"였으며":1303,"맡은":78,"매우":478,"명단":130,"망을":100,"리츠":81,"몽골":207,"메리":794,"메르":162,"멜로":63,"리처":147,"마주":117,"만족":167,"마지":479,"만주":184,"망원":114,"메로":64,"맥에":68,"리청":92,"리로 ":428,"마쿠":85,"몰도":68,"모듈":69,"모든":1119,"모드":98,"《글":317,"매체":185,"머스":152,"맞추":113,"더스 ":76,"마크":376,"몬드":71,"리포":321,"리투":109,"마케":253,"목동":57,"명령":365," 법 ":85,"《고":72,"멸망":116,"리튼":79,"리트":432,"멤버":200,"모두":692,"리티":185,"링크":113,"모니":182,"릭터":187,"리토":63,"마카":106,"선수를":64,"마치":202,"양하고":63,"쇼군 ":68,"모델":475,"리킨":658,"리키":700,"루지야":62,"선수로":140,"리크":106," 번 ":635,"리타":117,"맹이":84,"맺은":57,"모는":61,"마추":91,"리적":466,"린이":257,"르카":91,"립에":85,"린의":68,"리이":430,"매사":94,"리인":91,"리잡":147,"르침":80,"르치":74,"리자":271,"명과":183,"메달":133,"대이다":77,"리의":878,"리위":69,"메니":147,"리우":347,"간과 ":140,"리원":62,"연주자":75,"없는 ":605,"리올":65,"리와":436,"마시":117," 배 ":81,"명가":99,"린에":77,"동당 ":72,"리온":117,"리오":529,"류큐":59,"마스":330,"르체":115,"리엔":67,"리에":1417,"리엄":263,"리얼":106,"리어":346,"리언":69,"리야":146,"리아":4106,"리안":159,"면과":138,"선조 ":80,"만든 ":839,"워드 ":182,"부로부":57," 밤 ":74,"만들 ":66,"른쪽":57,"맡았":269,"アアア":910,"맡아":66,"말의":116,"말이":844,"메라":241,"르트":563,"르티":105,"만의":176,"류한":91,"류하":347,"류학":112,"르투":301,"많은":1276,"많으":66,"만이":234,"를테":61,"말은":132,"목과":78,"말을":157,"멀리":64,"많이":591,"마음":149,"만원":63,"마의":295,"막으":95,"막을":84,"막의":105,"마이":1302,"르토":132,"마일":89,"명나":114,"마인":99,"마자":90,"만으":117,"만을":299,"말에":148,"모가":149,"마을":459,"머리":313,"르키":132,"리지":401,"만에":262,"립적":128,"마운":61,"마우":104,"르타":206,"림이":95,"만약":57,"많아":58,"립을":177,"림의":63,"마에":203,"림을":114,"르크":924,"마와":68,"립자":112,"리즘":285,"맥스":130,"리즈":1369,"마오":77,"메디":104,"맞는 ":58,"립운":266,"르쿠":64,"릴적":114,"마쓰":345,"르케":65,"르코":77,"연구 ":736,"리조":116,"미디":615,"무총":163,"민들":272,"물체":288," 《 ":64,"대주교":92,"바니":158,"바다":409,"바닥":88,"바닷":63,"에게서":75,"받는":351,"덴마크":199,"반대":455," 》 ":203,"말기의":111,"발달":319,"바둑":111,"반도":781,"미래":237,"소를 ":405,"복잡한":88,"였고 ":681,"미로":269,"발되":197,"밀라":58,"니콜라":139,"발된":170,"바디":62," 러브":58,"반드":90,"미르":208,"에트 ":311," 럭비":135,"보장하":64,"였으나":406,"미를":235,"미리":145,"대적인":85,"북부 ":315,"원도 ":290,"바라":259,"바람":163,"말기에":91,"대적으":83," 뜻하":527," 뜻한":499,"쇼 ":167,"밀리":163,"반란":144,"묘호":73,"박람":78,"부분 ":310,"발라":85,"바로":295,"대전에":70,"발레":115,"발렌":61,"밴드":389," 로도":98,"바르":450,"바른":136,"발로":78,"에는 ":3701,"미법":59,"민과":66,"손 ":238,"씨의 ":120,"본적은":117,"본적으":80,"㎢ ":1457,"민국":7568,"본적인":119,"소 ":1996,"모험":72,"보컬 ":70,"속 ":1432,"민경":63,"모형":163,"미국":5159,"리대신":59,"미군":124,"민공":1009,"문에":1412,"르메니":81," 런던":288,"무와":74,"뮬레":141,"못하":264,"무용":185,"무엇":78,"무에":108,"석을 ":172,"무역":348,"속도로":609,"무의":163,"오스만":137,"송 ":796,"되고 ":1458,"무장":362," 라스":58,"무이":87,"무인":90,"무자":113,"못했":96,"못한":122,"물에":348,"무원":831,"무위":67,"물인":66,"물이":1091,"미나":192,"물의":738,"물을":636,"물은":155,"어나는":167," 레닌":99,"문제":1052,"받게":60,"서의 ":942,"물자":101,"문적":150,"문이":527,"문의":337,"문으":196,"문을":285,"문은":98,"번호는":213,"무제":84,"발간":137,"문장":149,"문자":743,"문인":134,"바깥":84,"발굴":181,"미노":144,"므로":460,"소리 ":122,"가고 ":59,"반구":82,"바그":84,"발과":99,"미널":258,"발견":763,"미네":81,"받고":279,"미니":258," 레드":100,"방과":160,"방공":70,"미는":124,"방검":64,"받기":77,"반기":87,"무지":87,"백과":766,"니케이":87,"방글":64," 라우":71,"쇄 ":106,"배경":222,"대중교":58,"방국":87,"바뀌":172,"민당":124," 라오":74,"바꾸":197,"데서 ":76,"물질":647,"미드":124,"바노":62,"미들":57,"배급":90,"밀도":557," 라인":139," 라이":756,"배구":85,"바나":90,"쪽의 ":187,"셔 ":93,"어떤 ":676,"션 ":1251,"면허":59,"문법":143,"더불어":242,"묘사":175,"무부":172,"각각 ":274," 분 ":172,"석이 
":101,"명하":918,"명한":593,"명해":88,"모터":117,"뮤니":177,"명했":79," 불 ":168,"모토":300,"무사":118,"리를 ":1377,"셜 ":158,"쪽은 ":318," 부 ":168,"모티":57,"쓰인다":281,"영역에":73,"무스":102,"무슬":67,"무성":57,"문사":91,"명확":97,"무선":162,"아버지":563,"몬테":144,"무상":72,"문서":608,"무소":302,"쓰이는":420,"보유하":119,"미가":155,"가가 ":278,"모함":111,"서인 ":81," 뜻은":75," 뜻으":193,"오전 ":100," 뜻을":185,"민간":303,"석의 ":89," 뜻의":103,"모하":434,"모할":70," 뜻이":249,"목포":121,"무실":1878,"무신":111,"목표":383,"속도를":65,"문신":350,"목을":213,"목으":232,"모의":222,"목은":112,"모음":164,"목이":209,"모임":172,"모이":107,"목의":117,"물들":160,"멀티":173,"메타":96,"메탈":156,"모에":58,"모어":73,"목에":288,"묘로":172,"모여":147,"무렵":74,"무로":175,"선 ":3009,"무력":69,"설 ":998,"멘트":73,"서 ":35405,"석 ":585," 라는":949,"무라":196,"명체":83,"선의 ":720,"메트":127,"영역을":65,"사할 ":58,"목적":2924,"물리":852,"세 ":2296,"었던 ":886,"사한 ":256,"물류":61," 라디":307,"성 ":4445," 뛰어":203,"대응하":136,"무리":134,"무르":110,"무를":833,"섯 ":281,"물로":548,"물론":141,"명칭":965,"코의 ":222,"섭 ":81,"섬 ":744,"선을 ":465,"섬에 ":188,"물러":57,"선은 ":236,"무료":103,"에게는":108,"베트남":406,"셀 ":135,"문명":266,"센 ":259,"성인 ":111," 때는":150,"성이 ":557,"류군":65,"운동에":85,"로즈":93,"른바 ":70,"로지":186," 디렉":63,"루를":71,"류가":157," 뒤에":143,"루마":207,"레프":79,"세운 ":205,"론의":239,"러한":558,"론을":213,"론은":142,"론으":87,"러피":111,"록이":179,"로자":119," 때까":219,"로이":916,"록의":66,"어났으":91,"록은":122,"록을":144,"로의":623,"렉터":64," 들면":62,"료를":341,"데뷔 ":105,"렉트":128,"론적":143,"로젝":410,"포시 ":85,"러해":78,"론이":279,"론인":135,"로어":89," 둘째":114,"에도 ":1237,"로에":225," 드림":64,"로야":96,"레코":236,"로아":218,"성장 ":64,"로운":756,"로우":91,"복음서":69," 등록":235,"레타":68,"로와":192,"론에":369,"본으로":94,"록에":122,"료로":247,"마는 ":64,"령을":243,"령으":158,"려진":458,"령은":67,"령의":138,"대에는":126,"로시":141,"렸으":99,"려지":128,"루이스":129," 드라":584," 드래":82,"령이":322,"루는":434," 드러":122,"로써":1210,"례의":129,"룬다":192,"력자":59,"련을":64,"련의":320,"련이":137,"로서":3082,"로세":432,"려져":1058,"령어":129,"려졌":161,"령에":131,"로소":400,"액션 ":92,"료되":58," 동해":190,"상한 ":61,"려주":69,"독교의":109,"례에":105," 동화":66,"로스":851,"도동 ":93,"운동으":60,"운동을":238,"운동은":65,"력에":199,"운동이":163,"운동의":127,"운동장":67,"렌즈":95,"레지":140,"려오":65,"롬비":71,"려운":69,"력이":427,"력의":188,"로사":75,"력은":109,"력을":1047,"려의":143,"력으":173,"였기 ":60," 두었":68,"레의":65,"래프":324,"레이":2571,"레인":196,"레일":572,"레임":136,"복음주":62,"루가":93,"로봇":217," 되지":86,"래하":136,"래한":155,"루고":261,"립된 ":2847,"래했":80,"람회":84,"레잉":82,"로부":1256,"래픽":327,"울대학":150," 뒤를":83,"로비":221,"보》":174,"랫폼":184,"레우":79,"로버":215,"로벌":433," 독특":101,"성전 ":62,"캠퍼스":89,"로베":142,"랑하":95,"성적 ":116,"로미":125,"레오":247,"럽이":82,"레온":77,"려시":129,"로바":151,"레옹":150,"어내는":57,"러진":185," 등급":161,"러지":66," 등기":66,"마다 ":357," 몇 ":197,"럽의":198,"료기":76,"수가 ":625,"어났다":277,"레아":111," 명 ":3052," 등과":227,"페이스":345,"로몬":89,"라프":68,"로망":63," 돼지":64,"려서":69,"락하":73,"럽에":218,"라하":106,"료가":69," 도쿠":173," 되어":644," 도쿄":425,"로를":314,"로만":123,"빙 ":74,"로마":1459,"러의":97," 되었":1574,"로리":196,"편집 ":64,"업기술":80,"송되었":59,"렌스":99,"롤러":79,"러운":114," 면 ":100,"예정이":232,"레시":93,"로로":266,"롤라":89,"레스":841,"레슬":216,"렉스":73,"빌 ":184,"러에":64,"라파":96,"로레":146,"로렌":92,"란트":70,"빈 ":332,"도공사":78,"랑크":259,"라틴":522,"라티":92,"라트":181,"로라":126,"비 ":1194," 두번":84,"란토":57," 동쪽":939,"렉산":237,"라톤":92,"라토":75,"라테":80,"서쪽 ":159,"세의 ":523,"례를":88," 도착":65,"랑카":81,"롯데":72,"량이":250,"런스":86,"량의":196,"라키":103,"량을":270,"량은":105,"량으":84,"러싸":178,"러시":1282,"라크":271,"로디":91,"러스":472,"예전에":64,"례로":68,"로드":367,"보이는":153,"레비":475,"량에":67,"록된":166,"록되":319," 동적":58," 동전":120,"로듀":151,"라쿠":58,"개가 ":134," 동원":67," 동위":64,"라카":115,"페인의":159," 둘러":256," 두루":60," 동의":119," 동인":124," 동이":75," 동일":295,"략적":66,"로도":729,"로동":69," 동작":222,"레바":60," 독점":68,"뜻하":537," 
동영":96,"로는":3896,"뜻한":519,"로닉":61," 도중":97,"로니":159,"럼비":69,"븐 ":128," 도입":355," 독일":2033,"로네":72,"였지만":104,"영국 ":794,"세이 ":81," 독재":77," 동안":690," 동아":274,"램의":106,"페이지":237,"블 ":323," 독자":160,"램으":79,"램을":192," 동양":311,"보인다":123,"램이":268," 도요":85," 돌아":168," 도와":123,"럭비":137,"케이팅":98," 도움":142,"브 ":1116,"로나":143,"러브":90," 동시":463,"웨덴 ":161,"레르":65,"강과 ":246,"랑이":109,"렸던":58,"래의":273,"레마":102,"래이":139,"갖고 ":272,"운동량":60," 되며":91," 되면":122,"람이":690,"람의":253,"람은":92,"람으":164,"래에":165,"람을":297,"로그":1803,"랑을":65,"라진":68,"당에서":58,"라질":223,"라지":147,"롭게":172,"렸다":446,"케이프":74,"라즈":65,"로교":66," 동생":173,"운데에":81,"뷰 ":75,"페인어":121,"련되":65,"련된":461," 동서":118,"론과":135," 동성":75,"로구":410,"막부":166,"름에":203,"우리 ":186,"르자":62,"리사":226,"르이":139,"매매":71,"망명":76,"대원군":58,"르의":306,"매를":90,"메가":77,"말부":57,"리소":182,"당이다":139," 떨어":835,"린스":117,"사회 ":554,"르지":156,"리시":222," 바 ":139,"당으로":105,"릭스":107,"리스":2842,"름인":68,"름이":849,"름의":136,"름을":721,"름으":597,"름은":938,"보여주":101," 및 ":5806,"르주":61,"마사":281,"마산":169,"림수":541,"머니":414,"먹는":147,"선수이":268,"용되어":68,"용되었":289," 반 ":121,"말로":462,"마를":97,"마르":609,"리보":100,"리본":58,"마리":562,"말레":213,"ああ":1092,"마루":87,"리버":108,"률에":74,"리베":90,"세에 ":126,"사후 ":103,"섬을 ":86,"말리":164,"리브":160,"마모":59,"률을":89,"률의":82,"유가 ":79,"류의":468,"류이":218,"루프":57,"륙의":69,"류인":86,"우를 ":179,"선시대":218,"본에서":359,"르에":154,"만명":119,"률이":149,"리비":231,"뿐 ":113,"르웨":217,"마법":87,"표기법":77,"아어 ":389,"마뱀":70,"르와":82," 미 ":80,"마디":57,"맞는":62,"마드":193,"리로":509,"맞닿":106,"맞대":170,"만든":919,"만드":289,"만들":1730,"리를":1378,"루터":61,"매니":102,"리마":98,"베키스":65,"리만":74,"어로 ":1139,"리모":80,"리면":59,"르세":70,"리며":220,"리메":102,"료하":61,"많다 ":288,"아에 ":243,"루트":166," 또는":7044,"매되":436,"마라":171,"매된":213,"운동가":315,"류에":196,"마련":128,"와이 ":67,"르시":280,"류와":152,"리바":71,"섬의 ":133,"리미":169,"르스":205,"말라":199,"마로":110,"륙에":126,"와의 ":292,"만나":155,"않아 ":81,"리들":59,"마노":70,"리드":424,"리듬":79,"립대":87,"만년":233,"르미":76,"마누":86,"르바":197," 딸이":114,"른바":71,"르베":87,"마는":64,"코어 ":106,"립되":628,"립된":2855," 때에":297,"마니":282,"마다":425,"막대":84,"르부":80,"동가이":124,"많다":296,"리랑":80,"리라":82,"르비":233,"용되지":64," 때의":151,"매년":365,"루지":119,"리눅":103," 디젤":62,"룹이":172,"룹의":106,"리노":143,"마그":94,"르렀":74," 디자":437,"르러":58,"많고":58,"만과":68,"마구":74,"월까지":163,"리나":257,"마나":106,"리되":201," 따위":129,"리된":63,"르면":329,"리도":58,"르모":65,"를린":149,"르몬":72," 디지":563," 따왔":76,"르메":125,"르며":194," 따온":72,"맡고":199," 디즈":98,"리더":98,"상품 ":73,"린다":701,"르마":127,"리대":73,"르만":187,"리니":123,"르를":106,"리다":169,"매개":71,"리단":71,"리는":1356,"말기":395,"를로":97,"로회":57,"루에":65,"르드":132,"름다":121,"루었":69,"리그":1570,"루어":1049," 때부":83,"리교":75,"롤플":72,"릭교":272,"루아":114,"리고":1473," 등의":1711,"록한":139,"록하":240," 등을":1621," 등은":71," 등으":490,"용된다":504," 등장":882,"르도":108,"리공":66," 등이":930," 디스":332,"로프":248,"료이":131,"료의":83,"르데":87,"로하":57,"리게":121,"로피":98,"른다":783,"루이":519,"용되는":630," 등지":163,"류를":301,"세와 ":160,"마가":184,"립국":88,"루의":66,"룡이":113,"립과":102,"립공":193,"롯한":246,"롯하":108,"롯해":85,"리기":520,"류로":226," 따서":167,"르디":177," 들어":754,"론토":100,"로트":65,"류되":96,"류된":67,"르나":166,"성을 ":1294,"로테":59,"アア":1184,"류는":92,"령하":64,"니티 ":64,"로토":241,"루살":113," 때로":79," 물 ":110," 떠나":66," 등에":874,"부등식":78," 들이":58,"리가":609,"르니":86,"르단":65,"르다":274,"부문 ":76,"성은 ":337,"대에서":249," 때문":1317,"르는":1030,"룹이다":116,"서식한":105,"서식하":112,"로파":60,"르네":183,"료와":64,"루시":73,"료에":110,"르노":173,"로페":81,"루스":522,"르가":186,"씨식물":94," 따로":85,"려하":124,"르게":347,"르고":464,"력하":175,"력한":156,"포스 ":94,"루미":72," 따라":2041,"표기하":113,"르기":616,"로크":88,"용되며":85," 디비":65,"로키":66,"름과":57,"로타":76,"뼈 ":57,"련하":119,"련한":87,"로켓":139," 딸로":65," 따르":395," 따른":279,"로코":155,"성의 
":571,"월드 ":285,"비즈":153," 면이":112,"술관 ":61,"서울대":188," 목사":116,"비주":93,"술과 ":112,"비전":604,"비정":124," 면역":63,"비잔":183,"비자":254," 면에":75,"비의":141," 무기":218,"비이":110,"비유":59," 물고":76,"비율":126," 물건":200,"비용":150,"어를 ":713," 색 ":76," 맨해":60,"선에서":160,"비오":69," 새 ":102,"비에":454,"비영":173," 모아":120,"포스트":82,"다음과":165," 무대":132," 명제":57,"빼앗":60," 명의":647," 명이":852," 모스":252," 모습":262," 멸종":93,"데미 ":77," 명으":136," 명을":174," 명에":64," 목소":63," 면적":1943," 명예":164,"빛을":75,"빛의":78,"빛이":59," 선 ":178," 메트":66,"빅토":130,"류와 ":147," 무렵":70," 무력":62,"상태에":175," 서 ":81,"비크":81," 목적":2802," 메타":86," 메탈":133,"쓰 ":219," 모음":140," 모임":130,"양부 ":86,"쓴 ":323," 모여":146," 모양":452," 멀티":170,"비치":319,"상태이":66,"비해":227,"비하":168,"씨 ":474,"비행":326,"립대학":73," 물리":729,"수권 ":164,"비평":114," 세 ":1630," 문명":202,"단어이":100," 무리":83," 성 ":737,"비판":182," 무료":100,"위는 ":290," 섬 ":655,"소련의":131," 명칭":907,"대신을":63," 물론":123,"비트":348," 메모":266,"왕의 ":270,"비로":129,"비롯":537,"빌라":66," 매장":60,"비례":72,"마르 ":85," 매일":60,"대중 ":103," 마찬":135,"부터":5834,"비를":209," 매주":80,"서울시":173,"마를 ":96,"서유럽":60," 만주":134," 망원":101," 리처":111," 만족":131,"빌딩":554," 마주":96,"단에서":85," 메리":77,"스》":261," 몽골":174," 메르":63," 맡은":78," 매우":478," 명단":126," 마지":456,"대수학":86," 멸망":115,"르부르":74," 모두":685," 멤버":192," 링크":71,"분포":314,"왕이 ":131,"영토 ":78,"분파":90,"브스":61," 리투":93,"비밀":157," 마케":230," 명령":322," 맞추":106,"부하":241,"부학":61,"부한":79," 마크":121,"북한":235,"말로 ":363,"빠르":149,"빠른":101,"부품":104," 몰도":67," 《글":315," 모드":84," 모든":1118," 모듈":58," 매체":153,"빌리":113,"부통":99,"다양하":99,"다양한":769,"솔로몬":76," 마치":120,"부패":65," 모델":414," 마카":90," 모니":75,"브웨":59," 마틴":63," 모래":84," 모로":109," 목록":249,"비상":121,"비사":81,"률에 ":68," 멕시":326," 마포":164,"비서":71," 메시":150,"블이":59," 모르":86,"아와 ":370,"분해":122,"분할":143,"분한":71,"분하":183," 만큼":59,"빌보":67,"부활":98," 명명":148,"부호":224," 명문":58,"분화":119," 몇몇":116,"부흥":81,"안에 ":523," 만화":581," 모바":147," 명성":112,"비아":1009," 상 ":162," 메이":433," 메인":91," 무거":57," 무게":59,"비어":60," 《대":64,"왕을 ":61," 모리":222," 》는":90," 마하":64,"비슷":450,"다양체":72,"비스":1259," 사 ":101,"비시":110," 먼저":127," 말하":595," 말한":3392,"마니아":199," 산 ":314,"립되어":101," 말해":112,"립되었":460," 바닷":63," 받는":232,"얼 ":291," 물체":225,"억 ":318," 바다":321," 바닥":59,"언 ":512,"어 ":13255," 미드":74," 미디":308,"뿐만":235," 반드":89,"요르단":59,"뿌리":157,"업 ":1759," 미래":216," 반도":513," 바둑":89,"엄 ":340,"오고 ":79," 반대":427," 발달":304,"않을 ":58,"류인 ":79,"않은 ":396,"외버스":131,"엘 ":358,"코에서":61,"엔 ":200," 미리":101,"왕위 ":75,"에 ":60715,"륙의 ":69,"역 ":2192,"여 ":11527," 발렌":60," 발레":96," 밴드":336," 바로":243," 반란":141,"연구기":185," 묘호":66,"㎢이며":85," 바라":115,"단위이":64," 바람":140," 수 ":4761,"연대 ":75,"안의 ":228," 문학":506,"연구관":133," 발매":1040," 반면":95," 무형":61,"염 ":118," 문헌":108,"엽 ":65,"류의 ":468," 박물":283,"다이아":64," 바르":206," 민법":129,"연 ":555,"연구가":59,"연구개":126,"열 ":422," 미분":130," 무함":67," 무한":103,"표되었":92," 반발":61,"예 ":316,"컨트롤":65,"상호 ":183," 방면":68," 반복":102," 미사":275," 백만":60," 문화":3374,"영 ":1172," 발명":156," 뮤직":150," 뮤지":132,"안을 ":183," 번길":63,"마리 ":98," 미술":327,"옛 ":293,"단이다":349,"도는 ":1874," 민속":126," 미스":154," 방법":913," 민사":90," 방문":137,"능한 ":349," 미생":74," 바빌":60,"다음으":96,"올 ":130," 박사":165,"연고로":70," 반사":111,"오 ":2121,"꾸는 ":60,"단으로":253," 미시":127,"온 ":1060," 벨기":181,"옥 ":109," 묘사":165,"씩 ":192," 문법":92,"씬 ":58,"분석 ":120,"뻗어":76,"비히":129,"빌헬":89,"서에서":124,"서양에":67," 모터":106,"안 ":2071,"다이묘":287,"부속 ":57,"아 ":8561,"여겨진":95,"여겨지":132,"악 ":699," 무선":135," 명확":90," 몬테":130,"알 ":228,"원래 ":383," 모토":78," 무사":83,"앙 ":331,"아인 ":67,"역대 ":70,"악의 ":190,"압 ":126,"세종 ":61,"암 ":229,"안산시":60,"아이 ":66," 문서":489,"단위로":101,"르비아":166," 문신":338,"애 ":160,"액 ":125," 
목표":359," 목포":112,"앨범이":199,"상태를":182," 무신":103,"아의 ":1952," 무슬":66," 민간":289,"다이라":201,"악을 ":149,"앤 ":156,"우디아":57,"오에서":59," 소 ":58," 속 ":64," 모험":70," 미국":5128," 미군":118," 모형":148,"펼친 ":88," 무엇":78,"연결된":112,"연결되":159," 무역":240,"상태로":92,"쿠데타":107,"육군 ":150," 못하":260," 무용":112," 물에":69,"야 ":1639," 못한":122," 못했":94,"약 ":1839," 송 ":74,"얀 ":94," 무인":69," 무장":257," 문인":57," 문자":616,"어린 ":76," 문장":142," 발간":125," 물을":74," 미나":115," 물이":74,"쪽에는":101," 받고":185," 미네":57," 발견":731," 문제":899," 발굴":168," 바깥":84,"양 ":1169," 바그":80,"단체 ":215," 미니":154," 바꾸":196," 물질":502," 바뀌":168,"대신하":67," 배경":215," 배구":64," 백과":71," 방글":64,"오가 ":92," 밀도":372," 배급":88,"㎢이다":412,"바타":69,"바키":98,"배출":125,"바탕":457,"박테":79,"케팅 ":75," 루마":188,"보드":262,"별명":122,"포의 ":126,"버스":1087,"보디":132,"바퀴":74,"법사":70,"외에 ":210,"방출":95,"《한":112,"법상":248,"버시":57,"여기에":152,"만나는":64,"대서양":169,"보도":136,"병력":63,"대인 ":123,"더라도":75,"미트":96,"습 ":140,"별로":160,"보는":250,"미터":286,"보니":57,"보다":1010,"앞에 ":58,"승 ":599,"미토":66,"아주 ":103,"미크":58,"백질":181,"서》":62,"슨 ":398,"봉기":75,"보내":172,"미코":72,"상트페":66,"보나":64,"션은 ":114,"백제":182,"업단지":59,"벨리":83,"방지":261,"션을 ":118,"미카":113,"미치":144,"백인":67,"션의 ":92,"방정":311,"베리":361,"연구하":376,"받지":126,"법무":134,"스 ":13453,"백작":129,"병대":98,"방의":550,"방이":189,"방인":58," 레코":213,"성에서":91,"アア ":190,"바지":303,"방을":182,"방은":68,"방으":94,"베르":797,"베를":164,"방전":97,"방자":121,"배우":453,"반직":145,"본부":489,"분과":114,"연구자":69,"배하":244,"밝히":70,"사하였":95,"부고":63,"사하여":75,"부과":108,"불가":326,"불리며":153,"배했":65,"범죄":410,"별시":2275,"방향":410,"부가":553,"보보":58,"방행":151,"연구원":419,"쪽에서":93,"범주":67,"베이":1145,"번지":621,"법정":263,"연구와":136,"대상이":87,"법적":239,"발효":94,"법전":58,"번째":2258,"법조":75,"방해":160," 르네":101,"대상으":305,"보병":102,"방하":123,"여기서":132,"법인":2460,"밝혀":97,"버지":661,"배포":144,"바흐":101,"병사":95,"베어":68,"분류하":120,"범은":64,"범으":68,"범을":60,"사학자":68,"반화":65,"법으":406,"법은":260,"법을":510,"변수":210,"범이":273,"법의":401,"범인":79,"발현":63,"번주":320,"법이":695,"본명":404,"발하":523,"싼 ":93,"발해":78,"법원":385,"복무":64,"보물":134,"발한":597,"범위":305,"발했":61,"발행":355,"버전":329,"보면":93,"볼리":68,"설이다":223,"반하":117,"반한":76,"사학위":59,"부산 ":108,"벌이":122,"벌인":102,"번이":145,"동과 ":314,"벗어":103,"별히 ":78,"발표":789,"번의":336,"법에":547,"범에":61,"싱 ":188,"복리":93,"보리":111,"보를":504,"보르":185,"사항을":108,"반포":61,"십 ":167,"심 ":452,"벌어":349,"쿠라 ":120,"보통 ":571,"번역":488," 로켓":129,"번에":76,"발트":112,"민회":89,"베스":344,"버에":60,"보로":179,"선정되":73,"본래":267,"실 ":537,"신 ":2121,"시·":101,"바티":117,"표를 ":238,"보라":58,"대의 ":1668,"바트":73,"아시리":63,"시 ":11102,"식 ":2547,"어떻게":123,"별법":78,"미합":62,"미하":416,"미한":620,"배치":148,"민스":65,"방부":88,"벨기":186,"연결하":272,"소셜 ":67,"었다가":169,"박사":214," 레오":139,"델로 ":62,"불리는":292,"반사":163," 로버":203,"변경":400,"미쓰":125," 로베":67,"불린다":546,"방문":160,"사회를":100,"민사":140,"었다고":153,"반부":115,"바빌":62,"미소":59,"콘서트":79,"사》":106,"대성당":95,"미술":456,"민속":159," 로마":1229,"안전 ":96,"미스":278,"어드벤":57,"방법":1110,"미시":173,"에너지":555,"믹스":156,"연구소":525,"술 ":1167,"문화":5237," 라트":76,"반민":78," 빈 ":109,"뮤지":139,"뮤직":175,"사회민":64,"번길":63,"동계 ":168," 로렌":78,"배를":124,"방면":99," 라틴":503,"법과":157,"반발":61,"반복":104," 레슬":75,"미상":126,"미생":82,"법규":59,"미사":365,"백만":74," 레스":71,"사회복":58,"윈도 ":523," 로드":134,"리드리":128,"민법":149,"바리":82,"섬으로":166,"물품":90,"무하":135,"무한":119,"무함":67,"미분":165,"소속 ":308," 러시":1253,"수나 ":68,"무형":102,"반면":95,"발매":1060,"업하고":81,"숙 ":91,"문하":118,"문학":996,"수 ":7565," 롯데":68,"버그":188,"발명":165,"박물":478,"었다는":78,"문헌":145,"물학":358,"순 ":180,"연구에":78,"슈 
":248,"받을":102,"보관":133,"받으":61,"받은":358,"발의":72,"발음":233,"본격":104,"방어":137,"발을":206,"방언":181,"박정":58,"보교":91,"발이":87,"방에":589,"밝은":61,"반정":76,"방영":378,"발자":127,"반적":1188,"보건":323,"밖의":84,"발원":228,"반응":325,"방안":91,"반으":349,"반은":59,"반을":232,"반이":289,"대용 ":70,"법령":101,"반의":263,"본거":86,"반인":88,"보고":314,"법론":58,"별도":116,"산하의":96,"보급":496,"배에":64,"보기":184,"배열":85,"별된":62,"벨라":135,"발족":210,"방위":167,"본과":71,"본관":553,"변동":70,"백악":117,"발적":105,"발전":2168,"법률":748,"본국":57,"본군":86,"반에":291,"리며 ":216,"받았":547,"민중":107,"바오":74,"받아":531,"연고지":92,"민주":1442,"벌레":89,"반영":96,"생활 ":136,"미지":217,"바에":127,"베드":57,"소설 ":296,"밀접":73,"미주":57,"밖에":180," 로서":72,"부르크":380,"버로":72,"버리":89,"부르키":72,"리되어":97,"보가":134," 로스":253,"바일":167,"바이":1137,"박이":57,"바의":87,"밀집":72,"연구를":213,"발에":122,"평양에":104,"바위":87,"민지":365,"민의":388," 레지":112,"민을":66,"민이":137,"배상":72,"방송":1655,"불리고":64,"병과":64,"미이":133,"연한 ":80,"미의":191,"민운":60,"베니":113,"방식":893,"밑에":68,"뮌헨":76,"버러":63,"민족":1125,"연합 ":418,"민정":107," 렌즈":75,"불리기":218,"생태계":74,"쉬 ":68,"버드":123,"미에":254," 로부":64,"미어":138," ㎢ ":1456,"미야":182,"베네":199,"단지 ":89," 레이":539,"미얀":97," 로봇":182,"미아":149,"발생":1139,"열대 ":90,"발사":358,"방사":203,"미우":66,"밀양":61,"었는데":175,"미와":61,"에티오":85,"바실":66,"민에":93,"박스":126,"앙에 ":58,"바스":196,"부시":91,"보편":89,"《동":63,"부스":118," 메가":61,"《대":78," 망명":76,"부속":120,"분석":483," 막부":162,"당시의":130,"보험":439,"비공":80,"복합":220,"보행":57,"복하":136,"북아":337,"보화":79,"부아":90,"보호":994," 먹는":127,"비교":324,"덜란드":394,"동기 ":74,"비가":127,"사하고":71,"보한":539,"보하":152," 리스":112,"별한":142,"별하":152," 마리":480,"변형":136,"분비":61,"변호":218,"콘스탄":176,"변화":514,"변환":239," 말리":67,"불법":161," ああ":304,"부분":1658,"부부":74,"북부":794," 말레":203,"보컬":139," 리버":60,"변하":108," 말로":338," 마르":422,"다에서":81,"부산":804,"서울특":2233,"부사":121,"별히":78,"부상":150,"보통":876,"북서":597,"부선":57,"부설":70,"본토":85,"분산":123,"부서":74,"였던 ":503,"단어는":61,"》는":360," 마법":66,"사하는":206,"《뉴":90,"어떠한":155," 리비":141,"쪽으로":3077," 뿐 ":75," 만명":106,"병하":65,"병합":73,"분리되":151,"소리를":144," 만에":155,"블루":171," 많아":58,"북쪽":976," 메디":66," 마오":62,"브리":671,"분류되":81,"비는":93,"분류된":61,"블로":244,"블록":95,"붙은":60,"붙이":74,"붙인":86,"브르":78,"아직 ":93,"분지":84,"브뤼":58,"브루":161,"부지":240," 마쓰":270,"블레":66," アア":171,"블랙":152," 말이":732," 말의":90," 맡아":66," 맡았":269," 멀리":60," 많이":589,"부총":58," 말은":95," 말을":102,"원대학":60," 많으":66,"비디":498," 많은":1059,"부처":108,"비드":137,"부천":98,"역학 ":84," 마일":63," 명나":109," 막의":74," 마이":939,"역한 ":65,"부착":63," 마음":145," 마을":343,"매된 ":209," 머리":222,"비되":67," 말에":106,"블리":126," 마우":75,"부위":94,"북위":64,"북유":57,"부의":976,"북으":70,"부이":295,"부인":395," 마스":139,"불어":280,"영국군":62,"부원":64,"봉한":65,"부였":63,"영국과":68,"불안":74,"부와":395,"분에":266,"분열":122,"당시에":146," 마산":139," 마사":186,"비극":79,"부에":2420,"보훈":113,"석으로":73,"분야":1297,"부여":260,"여할 ":326,"블라":186,"브로":228,"붙어":138,"북조":69,"브랜":239," 매사":94,"브러":125,"브레":203,"붙여":262,"여한 ":90,"붙었":64,"》를":132,"분쟁":215,"부정":220,"부제":59,"빨간":59,"부적":74,"분자":304,"브란":69,"부족":250,"브라":793,"부조":64,"불이":70,"붉은":121,"비나":140,"분으":162,"분은":160,"서이다":247,"부장":172," 류큐":57,"부자":61,"부작":96,"서울지":61,"분이":366,"분의":455,"표로 ":284,"분을":409,"법칙":261,"코스트":84,"보시":70,"벤처":112,"대전 ":499,"보스":329,"병원":361,"본식":170," 많고":58,"대적 ":85,"봉사":97,"병이":85,"르바이":113,"병의":61,"편으로":96,"병을":93,"부담":95,"별지":141," 마그":88,"편이다":60,"섬에서":72,"부는":373,"분류군":63," 리눅":99,"마로 ":78,"분당":74,"북대":84,"륙에 ":70,"북단":60,"부대":314,"부도":71,"부동":151,"북도":1000,"북동":589,"《아":100,"법학자":59," 매개":67,"보안":263,"보아":63," 말기":330,"버트":260,"보에":135,"》에":434," 리더":83,"부등":87,"보여":239,"오카 ":175,"분되":78,"분된":74," 맡고":199,"본에":513,"보와":133,"보였":72,"본어":215,"북구":223,"부근":226,"불과":67,"백화":66,"부기":151,"왕조 
":222,"북극":86,"불구":99,"불교":1054,"분기":161,"변에":132," 롤플":72,"《삼":93,"《사":79,"였다 ":5818," 리그":1028,"붕괴":176,"보상":91,"보살":73,"복사":130,"부르며":106,"변을":61,"변의":108,"본사":255,"변이":82,"보성":57,"복선":58,"산하는":104,"마누엘":65,"병역":76,"별이":83,"복소":98,"본선":122,"별자":389,"교·철":266,"변조":64,"《스":59,"보수":235," 루이":396,"복수":116," 만들":1714," 만든":918," 만드":275,"엔터테":273,"분리":365," 맞는":58,"아시아":1039,"아시안":64,"분류":756," 마드":70,"부리":81," 매년":364,"생한 ":427,"부문":240," 매니":71," 루터":58,"불리":906,"선이다":194,"불린":592,"부모":121," 맞닿":106," 맞대":170,"북면":63," 마라":96,"북방":63,"연구회":81," 루트":124,"북미":98," 리메":64,"동구 ":190,"부르기":284," 말라":128,"대장 ":64,"선으로":205," 마련":126,"변화 ":64,"니카 ":85,"위공무":227,"보유":232,"벡터":159,"》이":131,"》은":258,"복음":190,"》을":87," 리듬":66,"보의":154,"복을":86,"》의":292,"베타":109,"베키":68,"복원":94," 만나":155,"보원":72,"보정":59,"써 ":1207,"본인":182,"보전":215,"보조":194,"복제":80,"보장":241,"산하고":57,"보인":174,"보이":571,"보자":82," 만년":216,"보잉":66,"본의":1265,"위대 ":64,"본이":148,"본은":114,"본을":178,"복잡":213,"본으":94,"류에 ":144,"벤트":72,"번호":560,"부로":440," 많다":296,"베트":571,"보좌":71,"본적":361,"보존":422," 마다":65,"별칭":85,"본제":60," 막대":80,"불렸":246,"부르는":270,"본질":90,"부르":1396,"부를":656,"부른":614,"본주":135,"벨트":79,"보증":59,"범한":75,"범하":94,"복지":390,"법학":195,"불러":105,"불렀":138,"부른다":562," 기름":70,"사자의":62,"에른 ":64,"된다 ":2822,"레 ":617," 국토":265," 기리":96,"렌 ":169," 기록":1008,"높이":655," 될 ":180,"누리":83,"놓은":145,"에르 ":123,"육관광":1111,"지휘관":62," 교회":451,"렘 ":99," 교황":491,"노출":59," 교환":135,"노카":116,"는가":88,"영한 ":59," 교향":137,"사이자":85,"사이의":380,"되던 ":89,"뉴기":78," 귀족":241,"소니 ":75,"농촌":268,"렬 ":143,"노쿠":69," 구하":94," 두 ":1616," 규제":69," 규정":454," 기반":744,"뉜다":143,"려 ":970,"영국령":58," 금속":188,"사이에":1050," 기법":166,"력 ":1905," 기본":560,"련 ":727,"령 ":1058,"의가 ":145," 그와":88,"페인 ":307,"갔다 ":76,"례 ":297,"에피소":110,"느끼":75,"념하":189,"페이 ":59," 구현":181,"분에 ":187," 둔 ":199," 국회":467," 둘 ":80," 그에":142," 군주":331,"네치":59," 그로":78,"영국에":154,"량 ":637,"사이트":392," 기념":453," 그려":163," 관현":152," 그렇":75," 그레":286," 그러":510," 그런":74," 교차":215," 그랜":97," 그래":457,"농산":152," 그라":116," 구조":750," 관해":73," 관할":541," 관한":1386," 관하":68," 국제":2240," 규모":317," 국적":83," 구제":77,"두고 ":439," 군이":85,"념이":269," 군인":411,"념일":61,"노스":368," 공화":1170," 군의":89,"크게 ":346," 기동":58,"대해 ":649," 권익":198,"진행하":112,"노예":123," 기독":683," 기도":110,"사이타":67,"노에":90,"사이클":132," 권위":133," 그린":258," 그리":2688," 궁전":116," 기대":78,"불어 ":253," 그림":284," 기능":764," 광학":73," 기니":64," 그를":80," 글로":188," 그루":89," 근로":105,"감독 ":133,"동물 ":185," 그룹":609,"농수":73,"러 ":2371,"오토 ":64,"덴의 ":130,"놀이":209,"높아":61,"농어":139,"논쟁":109," 구축":257,"농업":544,"웨덴의":119,"노인":60,"논의":106,"녹음":73,"노이":170,"노의":75," 구체":160," 근무":128,"양성 ":103,"설립되":451,"설립된":2612," 된 ":602,"럼 ":613,"럽 ":802,"높은":480,"누르":181,"대학 ":618," 근본":125," 교통":460,"대한 ":2759,"놓여":60,"럴 ":180,"독부 ":84,"네트":660,"외의 ":195,"런 ":372,"네팔":64," 교토":88,"단기":81," 길이":462," 기존":437,"담고":149," 기종":88,"사이를":61," 뒤 ":412,"니는":215,"루이 ":211," 기준":2663," 김일":102,"의거 ":83,"사이며":98,"니다":213,"대개":179,"대가":323,"당과":113,"보통신":129,"대공":201," 김정":92,"대고":184,"당구":102," 기지":96,"대구":662,"대국":94,"대군":106,"대교":257," 깊은":74,"능력":467,"다나":107,"생이 ":83,"판타지":99,"대규":180,"유고슬":66,"애인 ":68,"대까":71,"대기":340,"사위원":70,"봉한 ":64,"액을 ":97,"다는":1787,"루의 ":66," 균형":110,"다니":145,"다다":78,"같다 ":114," 기체":135,"당나":184," 기초":518," 까지":200,"니라":554,"업무 ":115,"단독":80,"로토콜":205,"크고 ":90,"도로가":62,"단된":60,"단되":119,"다드":63," 김천":57,"뉘어":86,"달되":59,"뉴스":481,"담당":568," 드 ":245,"당대":81,"영국의":714,"생의 ":215,"사이드":109,"대는":204,"에만 ":119," 극작":78," 기생":98," 극장":162," 기사":214,"산이다":72," 그의":924," 기상":201,"소는 ":346,"노트":95,"대형 ":188,"도로로":74," 
글자":146," 글을":71," 권한":175," 기소":61,"노폴":62,"늘날":414,"누스":263,"눅스":105,"는다":1118,"노프":93,"느님":102," 금융":393,"니가":164," 기술":1634,"부산지":60,"론 ":1033,"산으로":200,"록 ":1898,"로 ":67654," 급제":82,"는데":1842,"니고":130," 금지":145,"눈에":57," 규칙":165,"롤 ":88,"누어":127,"누엘":65,"영화 ":632,"다가":955,"성부 ":74," 기억":140," 기업":903,"롱 ":111," 기여":1107,"롬 ":79," 기아":103,"가나가":58,"단계":343," 기원":1309," 근처":198,"농협":62,"생산한":69,"다고":1243,"생산하":159,"되기도":259,"사용해":88,"사용할":205," 기자":613,"단군":64," 기장":60," 기재":70,"사용했":107," 김영":81,"육과정":69,"사이버":92," 김씨":58,"다국":61,"단과":143,"영된 ":101,"사용하":1482,"사용한":491,"우루스":147,"대비":160,"니아":1877,"다수":306,"단속":75,"달성":181,"니어":177," 딸 ":100,"대부":1072,"다섯":195,"징하는":66,"대백":608,"능을":413,"대방":188," 따 ":65,"니시":132,"능이":137,"는지":249,"사우스":167,"대법":96," 딴 ":76,"대본":115,"대변":64," 꼬리":58,"변화에":59,"대문":243,"퍼시픽":98,"닉스":176,"니스":713,"다에":144,"대수":239,"당시":1347,"다양":1016,"니즘":123,"대시":62,"다와":77,"단어":494,"니지":109,"대신":635,"단에":340,"님이":60,"님의":97,"대승":104,"덜란":394,"보호 ":293,"업하였":82,"대서":179,"당수":72,"대성":249,"더라":88,"당선":166,"《글로":315,"대상":709,"니이":111,"대사":358,"니의":167,"니온":70,"다시":502,"니와":67," 때 ":1805,"다스":163,"니오":102,"단순":280,"니에":124,"니우":69,"당사":230,"단말":68,"도로와":66,"다른":2127,"다르":492,"다를":71,"능성":166,"아어로":59,"다리":327,"룡 ":75,"다마":68,"다만":82,"집합이":107,"도로이":159,"다룬":160,"도로의":196,"다루":444,"달러":146,"니버":155,"달려":57,"집합을":57,"뉴욕":478,"가까운":155,"달라":143,"가까이":69,"사이다":504,"대회 ":303,"지휘하":108,"대되":108,"다란":88," 등 ":2330," 기타":617,"니며":81,"니메":636,"대동":97,"료 ":646,"생을 ":121,"대도":73,"어로는":135,"당되":95,"요시 ":90,"대대":66,"대덕":70,"당된":101,"다비":69,"담배":65," 김해":111,"대만":114," 기후":202,"대립":98,"대리":167,"단법":2075,"대륙":468,"단백":191,"룹 ":318,"대를":566," 기회":105," 기획":174,"늘어":87," 기호":241,"룸 ":70,"뉴질":239,"사용자":630,"간된 ":72,"동방 ":109," 기하":177,"대로":1018,"대량":103,"대략":109,"룬 ":185,"생산자":98,"다목":60,"달린":72,"달리":502,"루 ":753,"倉三三":66,"다면":173,"동단":59,"동당":117,"동대":173,"대출":63,"다테":75,"대칭":171,"되거":99,"데서":86,"리 ":8468,"소년 ":186,"되고":1487,"닌텐":124,"도덕":82,"다카":195,"분야 ":184,"데뷔":278,"다케":114,"도동":111,"생산되":114,"생산된":72,"니티":109,"대책":66,"대체":365,"더스":149,"도리":106,"르네상":84,"링 ":477,"릿 ":94,"사업자":63,"도를":839,"도르":185,"립 ":1307,"더이":67,"대통":1071,"도마":97,"립·":64,"독립":1230,"단하":227,"막 ":535,"단한":192,"동력":113,"도모":729,"마 ":2884,"부에 ":2000,"도메":83,"동량":72,"달한":116,"달하":316,"만 ":5041,"리나 ":165,"표기 ":66,"파키스":153,"린 ":1716,"릭 ":421,"도라":117,"릴 ":239,"리·":102,"음과 ":230,"던에":74,"데시":125,"도록":1047,"도로":1920,"되기":565,"않았다":134,"데스":339,"월드컵":315,"단편":181,"더욱":101,"림 ":455,"다중":115,"어로서":101,"단조":57,"대역":76,"대에":1339,"담은":79,"률 ":308,"닿아":111,"도군":69,"도구":218,"독교":722,"도교":185,"데라":73,"대안":86,"도권":125,"대용":96,"륨 ":85,"대우":165,"당이":352,"델로":83,"단지":227,"대원":153,"사우루":135,"당으":106,"당은":139,"당을":126,"많고 ":57,"대외":71,"당의":341,"대와":275,"동가":358,"대왕":208,"동강":89,"도가":518,"산업의":264,"산업을":71,"다의":253,"다음":663,"댄스":109,"다운":233,"양시 ":144,"달에":108,"양식 ":64,"단이":638,"단일":159,"갖는 ":164,"단장":102,"컴퓨터":1613,"도공":91,"류 ":1253,"달을":122,"달의":59,"륙 ":150,"부와 ":386,"당에":141,"다이":1163,"에드워":105,"단위":503,"단으":253,"단을":267,"단은":441,"단의":275,"부산광":308,"國三三":57,"대지":106,"단체":1326,"니크":67,"데미":171,"데스크":83,"를 ":35910,"대중":546,"사용된":476,"사용되":1174,"동남":213,"아에서":342,"산에서":114,"도달":64,"릉 ":110,"도니":163,"만과 ":66,"컴퓨팅":102,"니터":63,"도는":1874,"능하":401,"능한":354,"름 ":818,"지휘자":123,"덴마":200,"니치":57,"대자":83,"동구":281,"동국":64,"더불":242,"동군":61,"동굴":86,"대장":219,"니카":153,"대의":1701,"대응":252,"동과":336,"대인":254,"대이":136,"사운드":156,"동계":192,"동경":61,"데르":128,"데리":57,"도네":236,"대주":148,"니코":105,"니콘":62,"어려운":65,"니콜":165,"른 ":3003,"더블":67,"도나":71,"유는 
":91,"동기":179,"대조":88,"대적":271,"대전":1323,"르 ":2357,"당주":83,"니케":131,"델리":75,"설명하":178," 가속":74," 간섭":72," 건담":75," 가설":110," 거대":183,"산주의":205," 개봉":160," 가상":296,"안양시":65," 가사":108,"또 ":312," 개별":96," 감소":102,"영토를":59," 강서":86," 경계":464," 것도":74,"꼽힌":72,"꼽히":75,"아우구":84,"나고":178,"날개":106," 감사":72," 가스":186," 결국":129,"여러 ":1385,"나게":61," 가시":92," 거두":92," 가수":406," 결과":470,"나간":63,"나가":628,"않았으":59," 개막":77," 각본":88," 개를":59," 것과":188,"류로 ":186,"리기 ":163," 강릉":100," 가미":74,"끝난":64,"끝나":75," 개방":107," 개발":2267,"개된 ":81," 개미":68," 건너":96,"끄는":125," 개명":89,"복합 ":62,"우승 ":162," 가르":250," 가마":103," 가리":1469,"선수 ":168,"데에 ":162," 가로":154," 갈래":64," 갈라":132,"야에 ":151," 개량":110," 가문":299," 건국":186," 개로":82,"동대문":97,"끌고":66," 건강":182," 가면":65," 강력":160," 갈리":98,"안에서":172,"우스 ":629," 갖는":232,"산지방":80," 감독":595,"약에 ":90," 개념":737,"위로 ":281," 가량":60," 가락":78," 강도":61," 같다":128," 강동":61,"언론인":132,"나머":129,"루아 ":64," 경마":60,"어리그":61,"나며":97,"나면":62," 개최":744," 개척":103,"나리":168," 개체":141," 고등":369,"나마":61," 검사":233," 갖추":158," 객체":126," 검색":137," 갖춘":117,"나무":576," 공동":773,"넓게":76," 가톨":515,"내려":193," 개칭":76,"대체하":94," 골든":60,"나모":58,"낭만":64," 계몽":59,"진흥원":106,"끌어":118,"끌었":93,"남면":62,"내로":74,"나미":178,"나바":64,"속된 ":167,"낱말":206,"내륙":89,"내리":117,"네갈":68," 고려":852,"진흥을":64,"세력을":62," 관구":68,"끼리":100,"내는":669,"남도":1277," 는 ":22667," 관광":330,"남동":614,"진흥에":83,"나라":1940," 가축":63,"낸다":205,"나로":1319," 가치":372," 고대":982," 광고":196,"너가":67,"나를":121,"나르":103,"날로":118,"왕이었":76,"보험 ":71," 건설":517," 경로":93,"냈다":532," 고도":101," 감염":75," 가정":260," 고고":115," 가져":155," 뉴 ":66," 강에":91,"나눈":68," 가졌":75,"나누":202," 가족":215,"여하기":269," 간접":70," 고구":194,"나뉘":181,"성립된":61," 각종":385,"나뉜":158,"변화를":114,"나는":746," 고급":147,"나다":684,"나님":90," 건물":348," 강원":365," 고기":109," 값을":58," 간주":115,"남녀":66," 값이":70,"난다":238," 공간":529," 가지":1815," 가진":647," 공개":359," 강을":88," 각지":78," 감정":108," 강이":197," 강의":193,"됐다 ":189," 공격":386," 같은":1691,"낙동":61," 개월":137," 개원":65," 강점":383,"드》":58,"도바 ":94," 같이":483," 게르":105," 개인":584," 개의":1532," 공과":57," 강제":192," 공공":336," 개장":93," 공국":74," 공군":133,"뀌었":102," 거부":106,"여하고":161," 강조":120," 과거":399,"나들":306,"남단":82,"남대":69,"났다":419," 개정":186," 관계":890," 개조":140," 공기":186," 공급":227,"상이다":365,"가로 ":579,"나급":179," 감시":80," 것들":82,"나기":90," 계곡":100," 개성":96,"변호사":207," 개설":79," 개선":238," 거래":171,"남겼":58,"집행형":59,"리그 ":853,"루어 ":63," 경기":2030,"남군":59,"내각":453,"남구":433," 계급":193," 가야":70," 개신":191," 가와":79," 개시":84," 간에":91,"남극":119,"소기업":149," 계기":111," 가운":1193,"내고":151," 가을":65," 고객":140," 거리":295,"나나":64,"상으로":821," 건립":120,"나노":70,"내기":127," 간의":259," 각자":58," 가장":2276," 경남":74," 가입":175," 가이":129," 공예":66,"립과 ":82," 고종":129," 공업":167,"되거나":99,"내지":117," 고조":84," 공여":67," 공연":289," 곳에":683," 고정":158," 고전":325,"냈으":120,"네바":84," 더 ":942,"남쪽":977," 경찰":296,"롯해 ":76," 《한":67," 곡이":150," 곡은":71," 곡을":68," 고위":306," 고유":247,"아이들":108," 교리":89," 관심":162,"아있는":62,"남지":88," 고원":69," 교류":254,"아이디":58,"육기관":94," 고용":157,"내전":134," 관습":82," 광산":108,"아일랜":478,"놓고":102,"농구":128,"내장":79,"내이":69,"날짜":67,"내의":242,"네마":59," 고양":209,"네르":65," 공식":585," 고안":196," 군도":61,"내용":752,"내외":188," 관세":105,"낮은":183,"나지":103," 국도":176,"농가":60,"났으":129,"남은":84,"내에":828,"남으":73,"내어":79,"남의":129," 군대":254,"남인":67,"남이":161," 경주":328,"남자":215,"남작":57," 구단":114," 경제":1255,"년도":117," 경전":153,"남원":74,"너무":67,"나중":141,"나주":65,"원래는":145,"널리":610,"날의":126," 경쟁":314,"날이":62," 고시":62," 경유":69,"남에":64,"난을":59,"너먼":74,"년대":962," 공사":132," 공산":365," 경인":69," 경우":1814,"남아":651," 결정":636,"남양":87," 겸임":57,"나인":191,"나일":67,"나이":2105," 
계열":315,"나의":720," 광범":84,"날에":128,"속기관":448,"동맹 ":104,"나우":78,"속도 ":68," 경영":338," 걸친":86," 국내":570,"기회를":64,"녀는":98," 계약":327,"널로":65,"나와":256,"나왔":84,"나오":543,"낙엽":63," 것처":82,"나온":207,"소규모":61," 결의":95," 고속":345,"나에":147," 거치":92,"나였":70," 광물":68," 고성":109," 곡선":103," 공부":118,"아일보":76," 걸쳐":620,"내셔":215," 겨울":100,"남시":112," 건축":501," 검찰":106,"류를 ":300,"나아":108," 교단":70," 관문":58,"어머니":289," 구글":105," 계승":325," 국기":175," 거쳐":379,"네덜":393," 계속":310,"설립하":135,"남서":500," 대 ":2047,"나시":87,"설립한":221,"나스":130,"남산":76,"마가 ":85," 국경":502," 거창":60,"설립허":405," 국군":59,"널드":63," 개화":61,"안으로":82,"약성경":63," 개혁":224,"대표 ":142,"년까":1214,"남성":246,"넘는":103," 당 ":135," 국가":2932," 구간":223,"년과":139," 경성":105," 결승":157," 강한":138," 강하":79," 공무":329," 강화":298," 고분":81," 관리":1066," 게임":2058," 계산":280," 게이":242," 게오":73," 것으":1465," 개편":301," 것은":475," 것을":1889,"년간":256,"내부":331," 관련":1271,"오는 ":541," 것인":82," 것이":2545," 공립":83,"나서":96,"아이슬":98,"아이스":118,"상청 ":78," 경상":930,"어릴적":114," 관료":212,"년경":335," 건전":249," 교구":126," 간행":78," 공로":79,"어린이":195,"내버":165," 단 ":185," 고문":87," 건조":136,"끝이":62," 거주":394," 공룡":99," 결성":335," 간호":57," 달 ":74," 검은":120,"당한 ":273," 개통":168,"내무":67," 거의":351,"끝에":210,"살이풀":102,"네그":122," 검역":63," 건의":89,"되는 ":3192," 교과":67," 거제":77," 것에":181," 거점":62,"남북":337,"남부":952," 다 ":148,"아인슈":58,"남미":85,"내면":60," 고르":57,"나비":119," 경북":70,"으나 ":1984," 경부":97,"육과학":285," 고리":78," 》은":86," 《아":76," 국왕":338,"노선":512,"년이":209,"년을":193,"념에":64,"년의":324,"년으":59," 구와":86,"녹색":141," 기기":139," 구역":567,"노사":86," 그들":324," 국어":100," 구에":103,"녀의":96,"념을":152,"비가 ":106," 과학":1159," 국장":81,"념으":242,"원래의":69,"평론가":63," 구이":74," 공헌":115," 동 ":488," 구의":119,"여를 ":144," 공항":153," 공학":187,"완전히":129,"략 ":236,"넷을":59," 기관":864," 도 ":370," 근대":279," 기계":313,"농민":122,"랜 ":157," 기갑":57," 그대":262," 기금":68," 공포":138," 구약":88," 기구":396,"단히 ":190,"램 ":396,"컴파일":63,"년에":3786," 골프":64,"랑 ":252,"네의":71,"네임":79,"네이":376,"기호는":57,"노벨":252,"랍 ":91,"람 ":490," 《사":76," 그녀":204," 계획":467," 《삼":92,"노보":62," 그는":841,"랙 ":101,"래 ":1320," 공통":185," 기간":439," 교육":1812,"누구":78,"사카 ":146,"세부 ":62,"노비":106,"사적으":114,"않으며":102,"노부":168,"사적인":86,"넷에":63,"넓은":250," 경향":146,"란 ":2827,"노무":59,"넓이":80," 경험":163,"부분에":132,"넣어":89,"남편":86,"라 ":7177,"널을":66,"널은":82,"락 ":201,"널이":171,"㎢이":539,"노바":67,"노미":136,"네오":95,"너지":586,"남한":59,"남해":125,"농림":676," 구속":101,"왕과 ":60,"랄 ":61,"논문":137,"부분은":122,"부분으":81,"부분을":239,"완전한":90," 군사":494," 관측":187,"부분의":331," 군산":83," 구성":1965,"부분이":237," 견해":86,"나폴":191,"왔으며":92,"네시":339," 광주":506," 근거":256,"네스":338," 근교":75,"노르":399," 교수":480,"소관의":1872," 결합":308,"널에":59,"아제르":103," 과천":67," 관찰":132," 금강":82,"노리":119,"북부에":293," 급격":65,"왕으로":142,"상에서":235," 결혼":171,"논리":281,"넘어":123," 국민":1106," 국방":227,"네상":85,"야의 ":314,"나톨":89,"나토":79," 권력":168," 구미":71," 광장":76," 구별":240," 관직":109,"네소":58,"년부":1645,"약은 ":85,"왕이다":132," 국보":88," 곤충":91," 교사":123,"네수":64,"나파":113,"노란":66," 그가":158,"노래":459,"나트":57," 군벌":59," 권리":410,"약을 ":246,"논란":123," 계통":136,"노력":131," 그것":304," 구분":433,"루살렘":108," 관위":91," 계층":133," 국립":899," 구매":95,"노드":76," 규격":131," 과정":755,"기하학":185,"너스":57," 공중":106," 데 ":719," 공주":130," 관장":217," 궤도":157," 광양":65,"나타":1198," 관점":110,"크가 ":117,"나키":64," 광역":123,"설립자":69," 국무":229,"념물":120," 고체":87,"개국이":87,"롯한 ":236,"약의 ":117,"뜻 ":134," 공용":96," 관악":81," 공원":222," 구로":190,"나카":127," 곳을":76," 곳으":130,"나치":255," 공유":240,"년마":62," 곳이":274," 관여":73," 공을":98," 공의":61," 공익":122," 공인":132,"데이 ":96," 공작":263,"노동":651," 공장":112," 공적":69," 공전":62," 구리":102," 공정":172," 구마":64," 과의":71,"북쪽 ":122,"룡이다":97," 높이":619," 
놓은":85," 누르":169," 높은":470,"연맹 ":267,"연합의":90,"유나이":86,"안전성":62," 뉴기":71,"이고 ":1432,"민 ":686," 노출":57,"미 ":1244,"동남아":110,"밀 ":162,"밍 ":299,"및 ":5874," 농촌":225,"떨어":849,"어서 ":645,"바 ":1223,"박 ":148,"루어졌":77,"《뉴스":59,"표된 ":94,"름다운":96,"루어지":209,"루어진":357,"발 ":1144,"반 ":1275,"안전부":66,"도를 ":838," 노트":65,"밤 ":79,"도르 ":69," 느끼":75,"서비스":980,"평양 ":255,"루어져":306,"방 ":1689,"배 ":369,"리그의":168,"리그이":116,"백 ":172,"오케스":66,"약자 ":66,"릭교회":264,"리그에":124," 로 ":1417,"왕국 ":201," 록 ":195,"세서 ":65,"온다 ":69,"붙인 ":79,"약성서":58," 단계":303,"약이 ":77,"인공 ":105," 대가":98," 대개":167," 담고":149,"북아메":178,"서부터":164,"떠한":155,"편의 ":84," 대기":219," 대규":162,"송된 ":57," 대구":602,"연합회":169," 능력":290,"보험자":61,"였다고":86," 대공":148,"였다가":68,"기하는":100," 단독":80,"벌 ":585,"번 ":888,"기한다":67,"르디난":61," 다니":91,"리기도":228," 다다":76,"보호를":80," 당나":182,"버 ":622,"베 ":197,"벤 ":68,"양에 ":120,"였는데":120," 담당":539," 뉴스":181,"법 ":1604,"범 ":366,"편이 ":71,"인과 ":273,"알제리":72,"벨 ":321," 대덕":65,"를린 ":83," 대대":58," 대도":60," 대동":68," 다리":217,"변 ":297,"독립 ":282," 다른":2061," 다르":415,"열린 ":403,"기하고":59,"용어 ":87,"벽 ":98,"문》":144,"별 ":439," 다만":62,"외에도":132," 달라":106," 뉴욕":473," 다루":441," 달러":131," 다룬":160,"병 ":351,"서부의":58,"편을 ":77," 뉴질":235," 단말":62,"서부에":508," 달린":64," 다목":59," 달리":473," 대략":109," 대량":101,"유네스":90,"폴리스":134," 늘어":84," 단백":169," 대륙":393,"르면 ":308," 대만":99,"용에 ":191," 대리":110,"르며 ":194," 대립":97,"볼 ":700," 대부":829,"도모 ":96," 다섯":189," 대본":114,"본 ":2748," 다수":252," 달성":171,"봇 ":74,"여했다":68,"쿠스 ":111," 대비":138,"단한 ":186,"보 ":1353," 대법":85,"가나 ":114," 대변":60,"복 ":311," 니시":68,"또한":786," 대백":573,"람과":66," 대성":143,"위를 ":1049,"라기":109," 대서":110," 대승":78," 단어":482,"람교":71," 다양":968,"래가":61," 당시":1308," 대수":150," 당사":200,"라고":4349,"이가 ":440," 다시":472," 다스":119," 단순":279,"았으며":218,"봉 ":181,"라가":293," 대상":627,"라그":59," 대사":173," 당선":156,"란과":57,"연합군":116,"라과":75," 도교":68,"상적인":119,"블랙 ":57," 도구":194," 대안":79,"라는":3492,"롯하여":102,"라니":63," 다중":111,"포르투":247,"라다":70," 담은":65,"라데":146," 대왕":88,"란다":72," 대외":65," 대우":121,"라도":290," 대원":80," 단지":69," 대신":337,"립공원":182," 다운":92," 댄스":86,"라남":362,"라나":128," 단위":404," 다이":676," 다음":656,"라노":165,"여하는":229," 단일":156," 를 ":1994,"량과":69,"았으나":73," 대주":65," 대중":469,"람들":653," 단체":752," 대지":68," 도달":62,"래된":291,"래되":137,"라라":69," 대응":233," 대의":90," 동계":163,"라드":158," 덴마":189,"뛰어":204," 대장":118," 동굴":66," 더불":242," 동구":140," 더블":62,"랐다":87,"란드":1305," 당주":81," 대전":1030,"라디":509,"래는":286,"상적으":63," 니콜":153," 대조":67," 동기":77," 대칭":110,"여함을":171,"불 ":296," 되고":141,"요일 ":146,"라미":123,"래로":202,"비는 ":92,"분 ":768,"오픈 ":155,"욕의 ":65,"북 ":391,"붙은 ":60,"부 ":7536,"위대한":96,"성립하":64,"라면":117," 동대":115,"라를":184,"송공사":57,"랫동":75,"러가":176,"라리":104,"라마":640," 대체":348,"달한 ":64,"요소를":93,"라르":90," 데뷔":275," 다케":102,"만년 ":186,"랜드":2035,"라루":63," 동남":210,"역학에":87," 도덕":66," 다카":170," 닌텐":122,"라로":150," 만 ":771," 동력":61," 도모":650,"표는 ":100,"사키 ":99,"뜻이":250,"뜻으":193,"뜻은":77,"뜻을":186,"래밍":301,"뜻의":103," 도메":76,"역할을":484," 대통":1037," 독립":1158," 도마":78,"보한다":522,"라비":303,"러나":429,"라브":117,"레고":91," 더욱":100," 단편":179,"라북":263,"래머":71," 데스":111,"렇게":90,"색으로":87," 도로":444," 되기":85,"라보":58,"래를":142,"어스 ":86,"럽과":65,"양사상":138,"인간 ":162,"아즈치":65," 리 ":87,"라바":138,"성리학":82,"럽게":67,"울시 ":92," 대형":216,"란스":62," 대화":129,"란시":124,"왕조의":225," 대회":1040," 동방":166,"선민주":511,"리드 ":197,"령과":63,"령관":126,"레니":104,"레닌":105," 데이":1061,"레네":63," 된다":760,"개국 ":83," 대학":1552," 대하":366," 대한":10310,"러드":105," 동물":531,"라스":508," 두고":421,"상임위":62," 대항":179," 대해":872,"대체로":110,"라시":210,"렀다":179," 맥 ":79,"예로 ":72,"려고":144,"라서":389,"란색":79," 대표":1228,"력과":186,"런던":299," 
동맹":199,"레나":119," 되는":598," 동명":90," 두개":57," 달하":98,"왕족으":63,"려가":88,"러다":58," 말 ":208," 데에":168,"융기관":72,"세르비":167," 도미":66,"라이":2502,"라인":656,"려대":69,"라의":696,"러리":161,"로가":256,"러를":61,"라위":80,"란이":133,"란의":103," 도심":76,"란을":119,"람에":161,"랍어":117," 도시":2717,"론가":102,"라자":84," 도스":70,"래시":96,"라와":192,"래식":78,"란에":60,"연합뉴":77,"라오":172,"래스":164,"려는":295,"라운":208,"라우":429,"랜스":139,"랑스":2207,"라엘":229,"라에":374,"레드":242," 도서":212,"라야":78,"뷔 ":108,"여하여":59,"여하였":106," 동북":136," 동부":560,"랑수":68," 또 ":308,"서울 ":354,"동북":139,"동부":1119,"도사":155,"도상":81,"맹 ":398,"도서":352,"독성":71," 나가":269,"도네시":218,"양의 ":498,"도스":130," 나고":67,"도시":3144," 날개":87,"도심":95,"동산":156,"동사":97,"데에":273,"말 ":452,"리는 ":1318,"편에 ":81,"담하":105,"도바":121,"됐다":193,"있게 ":311,"도미":81,"용자 ":266,"동류":60,"되는":3370,"단체가":70,"당하":866,"동맹":284,"대표":1493,"망 ":267,"유니온":66,"동명":97,"두개":63,"원시 ":244,"단히":190,"당했":130,"양자 ":62,"두가":58,"동면":71,"당한":473,"대한":10786,"대하":654,"대학":3157,"유닉스":93,"독부":116,"된다":2991,"설에 ":116,"되던":89,"대해":906,"대항":184,"덴의":130,"데일":69,"데이":1328,"도부":57,"양이 ":144,"붙어 ":106,"맥 ":265,"델이":112,"두고":457,"동물":811,"대행":62,"매 ":354,"되도":63,"록》":64,"대화":239,"임과 ":71,"동방":208,"도비":86,"두교":119,"업무를":406,"대형":256,"맨 ":157,"말기 ":110," 끝난":64," 끝나":75,"대회":1337,"론》":70,"도적":126," 나누":200,"동안":920,"동아":323,"독재":105,"않지만":68," 나눈":68,"독점":82,"머 ":157," 나뉘":181," 나뉜":158,"폴레옹":142,"도전":67,"동양":321,"패키지":72,"도인":180,"독의":116,"도이":817,"도의":1257,"독으":61,"독을":96,"독자":172,"돈을":58,"도입":373,"도자":409,"독이":170,"독일":2061,"도주":98," 남녀":64,"용이 ":226,"도중":100,"돌이":75,"동에":723," 나는":112,"먼 ":217,"동영":106,"동을":1013,"동으":429,"동은":199,"두루":66,"동의":526,"둘러":259,"동이":750,"동인":172,"붉은 ":71,"동일":302,"동작":235,"동자":198,"동장":81,"동원":102,"동위":88,"용의 ":212,"도지":99,"동조":206,"동전":186,"동적":145," 나들":286,"양을 ":259,"페스티":62," 나급":177,"동서":126,"동성":165,"두는":59,"동생":282,"펜실베":90," 남구":129," 내각":396,"운영 ":417,"되며":571,"되면":541," 남겼":58,"분석하":113,"도와":470,"돌아":199,"도에":874,"영향력":151,"도역":63,"도였":83,"도양":84,"도어":105," 남극":113,"가공의":109,"동시":546,"간다 ":126,"로프로":79,"영리 ":121,"유니버":126,"용을 ":583,"도원":152,"도요":90,"데타":127,"도움":144,"용은 ":157,"됐으":67,"돼지":84,"보편적":59," 나무":232,"서와 ":94,"들기":109,"사추세":98,"우의 ":106,"면 ":3592,"예를 ":240,"들고":156,"도쿠":187,"되어":4394,"들과":539,"도쿄":436,"가된 ":579,"되었":8669,"단체로":153," 나머":129,"등급":177," 내륙":67," 낱말":204,"등기":113," 내리":81,"독특":101,"명 ":4495,"등과":273," 내려":165,"등교":82,"몇 ":303,"되자":92,"두산":85," 낭만":59,"료이다":66,"울산광":189,"도청":90,"도체":168,"동진":82,"동지":68,"작가 ":277,"양성하":72,"동중":64," 남동":554,"능하게":61,"메 ":160,"영되었":153,"도착":72," 내는":102,"동차":918,"선에 ":199,"동천":62,"동청":79,"단체를":65,"동체":258,"며 ":20765,"두번":86," 나라":893,"동쪽":1485,"드가":250,"동화":155," 남서":495,"붙여 ":64," 남성":230," 년까":1209,"자가 ":1294,"몽 ":74,"영화를":78," 남산":66,"연방 ":286," 넘는":98,"동하":683,"동한":158,"동해":218,"동했":140,"못 ":69,"언십 ":93,"영화로":104," 년경":335," 년과":115,"몸 ":59,"룹의 ":106,"둥이":68,"드루":67,"들로":549,"소로 ":203,"양성을":74," 나아":105,"드로":556,"유를 ":151," 내셔":136,"드레":194,"있고 ":893,"드러":189," 네덜":389,"드라":851,"드래":93,"디나":115,"디난":62,"유대인":130," 남부":768," 남북":317,"뒤를":89,"드니":128,"드는":706," 끝에":201," 내무":66,"되지":653,"각국의":60," 남미":66,"와는 ":313,"두에":64,"두었":122," 내부":320,"두어":97," 년간":235,"들도":100,"도회":85,"의는 ":204,"도화":58,"세스 ":128,"몬 ":213,"영화는":78," 끝이":59,"돌프":99,"목 ":781,"모 ":847,"영향을":487,"독하":86,"든다":93,"도한":130,"도하":306,"영하고":172,"성북구":63,"때는":184,"데스 ":135," 남쪽":933,"인권 ":61," 내지":88," 낮은":182,"듀서":131,"각각의":117," 남자":211," 내용":717,"상징하":77,"디렉":63,"오후 ":120,"우에 ":227," 내의":129," 날짜":60,"단체이":367," 내전":128," 놓고":87,"단체의":76,"뒤에":153,"소련 ":87," 농구":95," 내장":76,"들면":80," 
남양":79," 년대":956,"들목":286,"때까":243," 널리":529,"여러가":94,"가는 ":557," 년도":79," 너무":67," 남원":73,"드벤":58,"분을 ":409,"일곱 ":76," 나중":141," 남은":57," 내에":577,"인구 ":628,"분은 ":160,"드를":373,"드르":92,"드림":70,"서에 ":154,"드리":485," 나온":189," 낙엽":63,"둘째":114," 나오":450,"북서쪽":240," 나왔":80," 나와":88,"일과 ":143,"상태 ":130," 남아":516,"등록":321," 나이":323,"디션":57,"들은":1170,"들을":1666,"등에":962,"떠나":66,"들의":1832,"폴란드":330," 나폴":174,"드이":229,"드인":87,"떻게":123,"록하였":99,"부의 ":953," 논리":244,"성시 ":68,"북서부":257," 넘어":117," 노리":66,"때문":1343,"들이":3059,"들인":129," 노르":359,"드와":268," 년부":1632,"유럽 ":636,"때로":134,"물 ":1678,"드에":481,"드어":129,"등식":95,"드의":964," 노력":128,"대중화":66," 논란":122,"드웨":238,"들여":122,"드워":117,"들었":219,"들에":883,"들어":2018," 노래":446," 노란":64,"롤플레":72," 노드":65,"디부":57,"맡고 ":193,"딸로":82,"능하다":188,"따른":285,"따르":402,"부인 ":190,"디비":83,"리》":59,"오키나":93,"문 ":1427," 나타":1132,"무 ":858,"원소 ":72,"양수산":143," 나치":207,"서양 ":162," 나카":79,"생으로":111,"묘 ":112," 뜻 ":130,"따라":2057,"생이다":99,"린다 ":670,"드시":125," 노동":422,"도록 ":1040,"되기 ":267,"도로 ":981,"따로":85," 년마":59,"용산구":85,"디미":126,"드스":58,"온타리":105,"따왔":76,"디지":599,"따위":130,"따온":72,"디즈":107," 년이":145,"우와 ":60," 노선":481," 년의":255," 년을":99," 녹색":130,"리다 ":103," 년에":3712,"드컵":316,"디젤":71,"디우":62," 누구":77,"디움":88," 농민":87,"디자":541," 노부":118,"디어":563,"디언":175,"디에":99,"유대교":67,"닌텐도":124,"디오":1235,"단편 ":96,"더욱 ":87," 노벨":241,"디아":393,"르만 ":85," 네이":195,"등지":173," 넣어":82," 논문":122,"비나 ":82,"도니아":125," 네오":57,"때부":94," 남한":57," 농림":654," 남해":111,"따서":170,"디스":467," 남편":86," ㎢이":537," 라 ":285,"등을":1777,"등으":538,"등은":87,"듯이":70,"등의":1888,"등이":1018," 넓은":226,"등장":884," 란 ":635," 넓이":75,"데시 ":86," 놓여":58,"분의 ":454,"등한":57,"등학":492,"또는":7080,"앤젤레":101," 네팔":63," 네트":572,"듬해":68," 높아":61," 놀이":123," 농업":387," 농어":113," 논쟁":99,"》를 ":132," 논의":77,"엘리자":71," 녹음":67,"등포":159,"분이 ":204,"르를 ":106,"득하":114," 노예":114,"영하는":250,"때의":171,"에선 ":72,"에서 ":25246,"때에":310,"북서태":59,"원수 ":69,"분자 ":68,"업발전":65,"딸이":133," 농산":100,"이기 ":244,"유로 ":235," 노스":140," 질서":67," 지수":62,"출신":1014,"출시":497,"치가":564," 지시":73," 지식":486,"활용하":92,"폭 ":101,"리미어":109,"치고":191,"포 ":705,"말라위":74,"회장 ":94,"치권":60,"폰 ":320,"치구":161,"협회 ":346,"치기":91,"축에":59,"폴 ":159,"추어":204,"친구":113,"최초":1069," 증진":256,"출연":224,"폼 ":65,"추원":63,"침공":92,"달린 ":60," 지어":146,"카가":116," 진압":74,"달리 ":397," 지었":64," 지역":3302,"축을":74," 직업":172,"축이":59,"축일":68,"카고":81,"추적":79,"총회":148,"추정":272,"출을":84,"축제":166," 지원":890," 직원":79,"출이":71,"출입":201," 지위":222,"출자":57,"출장":65," 지은":141," 집안":60,"출전":170,"홍콩 ":113," 지점":194," 지정":819," 지적":134," 진입":80,"트는 ":279,"추진":396," 직접":507," 직전":63,"치는":396," 지중":131,"츠를":86,"치단":151," 진주":131,"충류":73," 지명":221,"축물":164," 지리":258,"츠가":74,"신성 ":109," 주파":92," 직무":110,"최신":58,"등학교":448,"회적 ":239,"름이 ":380,"름의 ":136,"충무":57," 지방":2170," 지배":541,"출범":212,"단되었":80,"출발":166,"취득":125,"시뮬레":108," 진보":117,"편 ":523,"르시아":205,"총칭":263,"충분":86,"충북":60," 질병":153,"축산":260,"추상":117,"평 ":133,"최종":194," 즉위":118," 중학":70,"축소":72,"출생":339,"최적":97,"추세":127,"폐 ":75," 직선":63," 중화":785,"름은 ":937," 지속":294," 지상":157,"름을 ":720,"리시 ":78,"카마":74,"카리":164,"카를":321,"카르":370,"카로":94,"출판":563,"취지":64,"김일성":79,"말레이":208,"칼라":126,"출한":142,"출할":58,"호텔 ":84,"출하":502,"측에":78,"캐롤":76,"풀 ":129," 지하":383,"츠에":83,"축하":152,"카메":240,"시민들":57,"릭스 ":81," 지표":99,"칼리":211,"푸 ":69,"츠의":100," 진화":199,"풍 ":312,"캐릭":191,"식물의":97," 지휘":383," 직후":72," 질환":60," 집필":59,"회의 ":1094,"스스로":208,"치세":62," 집행":129,"대략 ":95," 집합":390,"츠와":84," 키 ":71,"카미":146," 지향":127," 지형":89," 진행":628,"품 ":726,"기준이":62,"실상 ":131,"치스":72,"치시":115,"비트 
":196,"층을":58,"층의":89,"기준으":386,"층이":72,"캘리":254,"측정":243,"층에":486,"리스 ":961,"흥과 ":58," 진흥":250,"름인 ":66," 킹 ":69,"캄보":77,"화이트":83," 집중":134,"치된":254,"치되":225,"카나":127,"취소":59,"치도":175,"충주":70,"측면":113," 집적":64," 지지":202," 지진":67," 지질":87,"친다":61,"류이다":187,"최하":230,"카데":164,"추측":97,"카니":70,"춘추":91,"추출":65,"획을 ":70," 짧은":155,"춘천":122,"칠레":106," 큰 ":1157,"치로":193,"치료":253,"치러":226,"캐논":62,"최후":86,"카드":366," 지칭":299,"충청":623,"캐나":550," 진출":298,"르지 ":83,"트》":61,"카라":247,"친목":162,"치면":57," 지키":86,"칠리":66,"표 ":478,"취임":66,"치를":548,"침략":84,"다리 ":68,"청소":381,"총독":192," 주민":210,"체성":73," 중립":59,"순히 ":106,"티 ":784,"최고":599," 주변":342,"틱 ":146,"틴 ":360,"차트":146," 정해":170," 정하":119," 정한":85," 제품":480," 좋은":135," 종전":57," 주를":95,"촌동":62," 종족":57," 중랑":60," 종종":129,"창출":92,"초등":129," 제한":242," 정확":274," 주목":103," 중력":134,"청사":253,"천시":356,"초명":74,"천이":116,"착한":88,"착하":100,"쳐서":143,"천의":92,"천을":85," 중복":60,"천으":103," 증가":212,"청에":126," 즐기":62,"최남":61,"철의":57," 중부":577,"파 ":1076," 증거":174,"시리즈":1281,"판 ":692,"천주":138," 주석":95,"다를 ":70,"청원":71," 증권":93,"팔 ":66,"체에":420,"초로":518,"체스":170," 조치":94,"형태이":106,"처에":159,"형태의":265,"천안":97,"최근":299," 조카":75,"천연":139,"등포구":133," 중반":138," 준비":90,"초를":88,"채택":233,"팀 ":383,"처의":89,"처음":1265,"팅 ":469,"창의":108,"쳤다":142,"차지":589,"채용":84,"창작":170,"처벌":87," 전투":1219,"참전":80," 주된":89,"참의":100,"초기":696,"체로":624,"책에":132,"창원":207," 준다":87,"천문":324," 주도":1756," 전통":766,"책의":157," 전파":198,"투스 ":105,"책을":239,"책은":69,"책으":129,"총괄":131," 조에":97,"처분":130,"마르크":179,"책임":299,"창조":274,"책이":259,"스의 ":2137,"참조":109," 조약":551,"체르":60,"체를":704,"천리":76,"천만":67,"참여":643,"찾아":159,"차적":58,"초고":65," 존스":65," 종사":119,"초과":70,"처리":574,"차의":173,"차이":655,"차장":60," 중남":73," 정치":2113," 주는":346," 제출":84,"창업":76,"차종":112,"찰이":62,"초구":256," 주니":71," 전화":182," 전환":128,"초대":489," 좁은":114," 전형":79," 전혀":90,"카이도":140,"마리아":248,"찰청":251,"다만 ":77,"청북":251," 조조":61," 조종":81," 전해":249,"초동":59," 좌우":58," 접해":75," 종이":322,"천사":64," 종의":95," 접하":505," 접한":376," 종을":67," 조지":462," 조직":785," 종으":82," 전후":94," 주로":2214,"튼 ":196," 졸업":314,"트 ":6929," 정통":81," 중단":103," 중대":70," 존재":1242," 조정":209," 조제":62," 전하":166," 전한":60," 저항":140," 조절":103," 적합":91," 조작":128,"틀 ":149," 중동":117,"첫번":86,"칭이다":298,"충돌":185,"초의":700," 집단":362," 주체":106,"초이":60,"초자":71,"출된":104," 지도":570,"체코":190," 지니":158," 지닌":190," 중점":61," 지대":176," 중종":67,"초연":81,"출되":164,"초에":140," 주지":67," 중이":151," 중인":117," 중의":484," 중장":65,"후속 ":76,"체하":137,"체학":60," 질량":159," 주택":112,"페 ":146,"체험":108,"최소":174," 지류":128,"촬영":165,"청하":95,"출력":193,"총재":61,"총장":195," 중추":105,"최상":86,"먹는 ":101,"추리":70,"천황":417,"취급":141,"형태로":346,"촉진":252,"체포":86,"처형":64,"총연":66,"추락":82," 주축":65,"천하":69,"퍼 ":383,"초점":102,"머니 ":100," 주최":206,"철학":918," 증명":166,"천체":227,"컬럼비":62," 지구":803,"추기":89," 조합":232," 종파":171,"챔피":311," 중소":133," 중생":144,"형태를":152,"슨의 ":72,"최되":220,"최된":194," 중서":84,"추구":133," 중성":88," 중세":272,"팬 ":60,"축구":1084," 주식":300,"슨이 ":59,"패 ":76,"청주":162,"출간":124,"체제":657,"다면 ":148,"체적":348,"추고":107,"최대":550,"체의":621,"축가":82,"시설 ":150,"체인":196,"체이":649,"팝 ":104,"천지":81,"창한":58,"청자":73,"청장":193,"체육":1359,"추가":282,"청으":59,"총리":508," 주소":166,"청은":176,"체와":186,"초반":121,"청이":131,"청의":110," 증기":82," 중요":799,"추는":61,"총선":155," 주주":79," 지낸":214," 지내":96," 줄이":91," 준정":80," 지냈":558," 주제":250,"초식":67,"쳐진":69," 지나":284," 지난":146," 중에":696," 중앙":1159," 주장":603," 죽음":141," 죽은":130," 주인":242," 주이":140,"기타 ":345," 주자":62," 주의":999," 집권":84,"률이다":60," 종합":466," 주위":126," 줄여":450," 좌표":69,"쳐져":57," 주요":741," 주와":621," 주연":98," 주었":87," 지금":536," 지급":139," 주에":722," 
주어":275,"충격":69," 중시":60," 조화":72," 중심":1728,"척추":104,"형 ":1793,"코토":79,"콘텐":200,"카에서":101,"협 ":123," 차별":95,"코트":155,"코틀":235," 창립":241," 체계":535,"코퍼":63," 체결":226,"쿠시":66,"쿠스":228," 청구":123,"크는":102," 채무":154,"크다":86,"키가":93," 차량":299,"징이다":166,"말리아":59,"쿠바":94,"크고":102,"크게":351,"황제로":69,"크가":121," 차례":291,"현 ":2291,"혁 ":104,"혀 ":208,"코타":86,"징으로":70,"크기":368,"대륙 ":89,"케팅":130,"컴퓨":1719,"험 ":357," 채널":298,"쿠르":111,"스와 ":613,"칭으로":286," 찾는":68," 창달":98," 창단":225," 창당":93,"대를 ":566,"컨트":120,"코어":137,"코에":118,"리비아":152," 채권":172," 차남":67,"지중해":127,"커피":96,"코이":72,"코의":222,"컴파":63,"허 ":115," 차단":59,"헌 ":139,"쿠라":165,"태는":61,"킬로":125,"대만 ":63," 첫번":85,"키르":123," 처분":74,"타디":124,"타로":108,"크스":277,"쿼크":68," 초대":479,"타라":76,"택되":100,"키며":125,"효 ":88,"타는":77," 차종":107," 창업":69,"크바":228," 초고":57," 찾아":157," 천만":60," 참여":607," 차이":388," 처리":445," 책이":152," 책임":221," 창조":192," 총괄":120," 책으":80," 참조":105," 채용":77,"회 ":3813," 창작":141," 처벌":80,"획 ":208," 창의":90," 참전":79," 차지":583," 창원":193," 참의":90," 초기":674,"타니":105," 천문":270,"크레":166,"크로":1143,"활 ":172,"클래":142,"클라":364,"크루":160,"타고":165,"타공":116,"지질학":58,"통칭은":57,"키나":206,"화 ":3719," 창설":237,"크래":193," 찰스":137,"크라":447,"화·":76," 청동":62,"환 ":439,"지지하":61,"킨다":751,"크메":76,"태계":75,"클리":189,"클린":94,"니메이":622,"타난":144,"타나":395," 차원":250,"타내":438,"타낸":182,"클럽":362,"클레":211," 창시":139,"클론":67,"클로":182,"키는":1068,"크를":175,"탐구":65,"태가":129,"크리":684,"황 ":673,"크림":80,"크립":110,"혼 ":107,"홀 ":76," 청나":165,"코프":157,"호 ":2171,"키고":232,"코하":57," 철도":572,"타가":60," 첨단":85,"쿠오":85," 청년":105,"키기":265," 차세":58,"홈 ":112,"시리아":214,"면과 ":135,"르에서":63,"순한 ":97,"플 ":168,"리아 ":2068,"코노":61,"까지를":69,"코네":66,"리안 ":121,"픈 ":178,"콩고":162,"커버":63,"프 ":1611,"코나":75,"측하":71,"담당한":59,"담당하":386,"대로 ":879,"켰다":167,"카지":59,"커뮤":174,"치체":194,"칭으":286,"치지":83,"칭을":179,"칭은":479,"카와":268,"리야 ":76,"카오":100,"침입":91,"캐스":57,"카운":74,"칭이":493,"카우":76,"컬러":63,"츠키":72,"컬럼":77,"카의":270,"코가":63,"카자":100,"카이":616,"카인":59,"신라의":85,"취하":134,"칙을":131,"칙으":83,"치의":242,"치이":165,"친위":80,"치인":905,"칙이":149,"케도":119,"친일":133,"치적":263,"카에":232,"지털 ":477,"카야":118,"치주":107,"치에":313,"카스":306,"칙에":97,"치와":131,"카시":93,"치원":65,"후부터":65,"케다":64,"커다":69,"카사":70,"리어 ":186,"컫는":618,"치안":73,"치아":129," 참고":71," 창건":80," 창간":97,"쿠데":107,"향 ":296,"리엄 ":215,"쿠니":115," 참가":722,"콘스":196,"코시":67,"리얼 ":67,"술한 ":81," 차관":105,"코스":429,"치인이":484,"치인으":223,"콘서":84,"케도니":118,"리에 ":892,"행 ":946,"황제이":83,"해 ":11775,"핵 ":98,"캠퍼":91,"쿠가":151,"케이":822,"케인":158,"항 ":503,"마스 ":131,"컵의":85,"코미":140,"코믹":87,"합 ":1352,"함 ":501,"현하는":126,"한 ":36846,"학·":96,"할 ":5085,"컵에":64,"칭호":106,"카프":65,"학 ":3682,"하 ":1115,"침해":108,"핑 ":85,"스어 ":438,"코리":211,"코르":181,"콜로":109,"칭한":198,"칭하":436,"콜롬":91,"필 ":124,"친환":81,"코로":88,"까지는":225,"리오 ":289,"스에 ":397,"콜라":224,"케스":103,"카페":78,"치하":2020,"치학":71,"치한":3549,"치해":358,"치했":87,"현하기":80,"콩고 ":134,"피 ":342,"픽 ":726,"카트":104,"핀 ":359,"까지도":76,"기초로":124,"기초를":61,"코드":673,"커스":115,"코딩":85,"카타":138," 언론":305,"중종":67," 알아":113,"진다":650,"진단":77," 안에":305," 않아":113,"황제 ":355,"지대":330," 않았":277," 안양":112," 에디":95,"중점":70," 어려":133,"중적":79,"지니":319,"지닌":190," 에드":185," 어렵":70," 예금":57,"지는":2060,"지능":93,"즈를":119,"중이":356,"중인":249,"중장":67,"중재":63,"주지":232,"중을":67,"중의":646," 에도":489," 액션":106,"주최":210," 안정":204," 안전":386," 얼마":78," 열대":176,"호작용":139," 아제":99,"즈미":73," 않을":60," 않으":216," 않은":394,"주체":150,"집단":449," 연대":172," 안의":57," 어머":279," 야생":70,"중지":63," 아이":1133," 역대":77," 아인":68," 아일":388," 어릴":143,"진동":81," 어린":266,"직된":74,"지되":322,"지정된":110,"지정되":305," 아우":255,"지도":837,"만들기":94,"지동":62," 아울":60,"지라":98," 
않지":68,"만들고":94," 에르":95," 양성":269," 아직":134," 였다":83," 양산":93,"증명":181,"다르게":104," 야스":61," 아즈":91,"즈베":96,"주축":66," 앞에":89," 아주":105," 옮겨":85," 열도":76," 알제":77,"지를":1015,"지르":60,"비주얼":58," 여러":1622,"지류":134,"질로":116," 양식":231,"진료":73,"지리":373,"때에는":117,"지름":79," 영등":165,"질량":179,"주택":185," 완공":67,"중추":123,"마드리":66,"지로":949," 엘리":185," 엄밀":69,"질랜":239," 업무":536," 옛날":72,"률이 ":74," 영동":66," 오다":74," 오늘":444," 여름":113,"주특":163,"리랑카":66," 아침":58," 약자":169," 아카":249," 왕국":656,"친위대":72,"직무":110,"주파":110,"비즈니":135,"치이다":129," 왕가":158,"진리":60,"지원하":329,"지막":450,"지마":92,"지만":2443,"쿄 ":369,"지명":245,"진이다":61," 연료":112,"신문》":123," 열렸":185,"지며":169,"지면":175," 예로":96,"진으로":71," 압축":113,"주한":124,"지부":157,"휘는 ":146,"주하":399,"진보":157,"률의 ":82," 액체":118," 오디":174,"지배":563,"지방":3019," 열리":279," 아케":110," 열린":474,"지바":88," 여명":78," 연맹":288," 온도":178," 여부":76,"쿨 ":70,"즈에":211,"준화":118,"째로":481," 외계":79," 에서":1916," 영문":107," 안토":103," 양주":57,"즈와":76," 외교":366," 아키":249," 예루":104," 양의":74,"질병":167," 양자":305,"률을 ":89,"칙이다":82," 연방":722," 양재":71,"쿠 ":497," 예를":204," 아테":125,"리브 ":93,"주화":87," 아토":61," 왔다":158," 올랐":110,"지상":185," 엑스":155," 언어":1410," 알파":121,"지선":61,"중화":873,"즌이":138," 양쪽":62," 얻어":102,"직선":85," 예방":102," 아폴":70,"다루는":196," 어원":87," 얻었":76,"지션":130," 오르":201," 오른":92,"지속":297," 억원":58," 오류":88," 영미":82," 오랫":76," 완도":63," 오래":309," 아트":82," 외국":347," 오랜":130," 온라":297,"중학":110," 아파":129," 아티":99," 어업":104,"칙으로":83,"즉위":118,"다룬다":88,"중해":135," 올라":148," 에스":395,"즈이":155,"지사":210,"즈의":362," 안티":91," 여성":671,"지스":206," 여섯":126," 없었":105," 연산":188," 역설":57,"즘의":66," 없어":111,"진수":64,"즘이":67," 약칭":414," 역삼":92," 아프":573," 역사":1519,"지수":116,"질서":165," 오리":271," 연쇄":64," 여수":102," 연속":241,"진시":95," 언제":60," 요구":281," 예비":61," 알프":80,"증을":98," 올림":799," 얼음":64," 올리":108," 차 ":1230,"지식":562,"지시":133,"능을 ":412," 얻은":80," 업적":116,"통적인":164," 없이":247," 여신":124," 역시":116," 없으":149," 에어":174,"지점에":130,"증진":435," 에이":242," 예산":87," 애플":267,"지어":207," 암호":144,"지아":300," 올바":113,"지야":80," 오버":65," 영상":317," 에우":58,"진영":77," 오브":259,"진에":166,"지와":387," 책 ":74,"직업":192,"직에":115,"지역":3831,"지연":62," 예선":234,"지었":74,"진압":79,"지에":1070,"지오":59," 채 ":79,"지였":87,"지인":168,"지이":710,"직의":93,"직을":212,"직으":106,"지의":930," 연애":57,"집안":61,"지은":150,"진은":66,"진으":71,"직자":136,"직임":89,"직이":307,"즈치":65,"직접적":93,"지자":110," 예수":291," 예술":486,"직원":124,"지원":1263,"진왜":117,"질에":101," 연안":287,"직위":91,"지위":252," 엔진":327,"질의":280,"질이":274,"징역":61,"켰다 ":152,"진정":61,"진을":324," 여왕":102,"진의":113,"집에":83,"투르크":97,"진이":164," 연예":76,"지적":171,"진입":88,"지점":270,"지정":864,"직전":64,"직접":511,"질은":90,"질을":426,"집을":67,"진주":148," 오세":84," 업체":175," 연재":197," 영양":101," 월간":68," 연장":152,"집이":101," 원격":57," 영어":930," 역을":64," 역은":61,"뉴욕의":60," 여의":113,"지주":81,"질적":241," 오사":151,"지중":143," 역임":393," 역이":108," 여자":305,"치체 ":79,"집적":68," 외래":64,"징이":263," 영역":323,"지질":112,"징을":98,"지지":438,"지진":90," 영업":143,"징으":70," 예언":71,"호이다":121," 여주":68,"니버스":58," 옆에":57,"집중":149," 왕복":62,"닉스 ":124," 연주":271," 오스":1097," 영웅":113,"황의 ":111,"큐 ":96,"쪽과":149,"수행 ":60," 용도":92," 월까":163,"짧은":155,"지청":89,"맞대고":167," 영조":60," 왕비":129,"황을 ":121," 예전":119," 예정":308," 영주":76," 운동":905," 외무":89," 영지":58," 완성":200,"수한 ":277," 오염":68,"진출":340,"지칭":300," 열차":200,"지컬":96,"크 ":3097,"니스 ":252,"통적으":123," 요리":491,"황이 ":66,"큰 ":1169," 요르":61," 엔터":268," 오이":81," 연출":58,"다르다":167,"클 ":181," 외부":231," 어항":59,"만든다":72,"만드는":271," 오전":103," 우라":85,"지키":149," 천 ":124,"큼 ":96," 에티":97,"까지의":267," 왕실":92,"지털":569,"수학 ":153,"통치 ":72," 
위기":77,"딸이다":74,"주광":202,"주관":327,"정주":71,"주국":154,"준공":69,"주군":206,"지정하":64,"지정한":88,"주교":442,"제조":448," 쓰는":224,"재한":349,"재학":69,"재하":725,"조부":69,"제정":539,"주공":78,"제적":455,"주고":191,"재했":212,"재해":152,"족보":60,"제의":631,"주가":248,"장했":136,"정전":83,"정적":273,"주간":164,"제일":134,"제임":217,"제작":1329,"제자":224,"주거":121,"정조":73,"종목":228,"제이":524,"제인":138,"전지":147,"전직":85,"정일":58,"정인":93,"맞닿아":98,"정자":73,"접전":71,"접적":132,"장해":59,"장하":1342,"장한":379,"장학":77,"현한 ":91,"제와":162,"제외":408,"적지":100,"잡한":94,"정으":195,"저지":133,"자흐":77,"정을":810,"정은":146,"정이":736,"정의":966,"제어":265,"점유":74,"점은":112,"제에":429,"점을":418,"점으":208,"칼 ":114,"정원":147,"제연":82,"점의":91,"전주":277,"종류":648,"제였":72,"점이":420,"자협":82,"전제":91,"칸 ":384,"제안":211,"장편":92,"르와 ":76,"조물":104,"재판":339,"족문":173,"종료":106,"제약":76,"자회":144,"전자":1283,"작해":97,"작했":259,"조명":72,"카 ":2034,"적절":91,"절을":58,"전쟁":1436,"정에":737,"절이":80,"전적":194,"젊은":111,"종로":421,"기한 ":75,"주년":107,"존속":102," 아가":58,"정체":133,"중기":285,"정청":74,"주나":75,"접촉":83,"조세":77,"조성":326,"정책":908,"정착":170,"조선":2884,"중구":359,"중국":2218,"져있":131,"조상":99,"중교":61,"조사":870,"점차":84,"중계":60,"줄기":211,"중간":227,"전철":111,"전체":673," 쪽 ":114,"절차":197,"전청":59," 쓴다":97,"전차":161,"주기":300,"정직":60,"정지":148,"제주":546,"주권":127,"재화":72,"재활":72,"조약":652,"조에":276,"족에":140,"조어":71,"조업":97,"전파":285,"중단":108," 아나":248," 아날":71,"조와":122,"존에":63,"전통":832,"주도":1871," 않기":58,"주된":119,"전투":1249,"만들었":181,"주되":65,"만들어":1188,"제출":94,"주는":689,"죄를":91," 알고":262,"주니":101,"주당":264," 악기":149,"중남":73,"준다":217,"제천":67,"졌으":236,"존스":69,"종사":195," 않고":534,"정치":2288,"접하":574,"접한":492,"토콜 ":80,"종의":527,"전후":101,"조지":513,"조직":975,"종으":458,"종은":82,"종을":142," 아돌":57," 아동":142,"정할":108,"제품":561,"정하":1154,"정한":1132,"정해":250,"컵 ":410,"종자":67,"좌우":61," 안동":112,"종이":791,"접해":436," 야구":716,"적화":77,"전혀":91,"절하":80,"절한":80,"르에 ":62," 약간":124,"젝트":419," 아니":980," 아닌":443,"조종":82,"제트":78,"전했":92,"조조":79," 않다":148,"주로":2465,"좁은":114,"전화":322,"전환":157,"컬 ":194,"주력":61," 않는":847,"족주":88,"전형":115,"존의":272,"저항":165,"적하":72,"헨티나":186,"조이":169,"족의":411,"조인":125,"족이":405,"조작":149,"족인":57,"전할":66,"전한":509,"전학":81,"전하":663,"종에":80,"전함":71,"전해":286,"적합":98,"조절":138,"조적":99,"조제":92,"조정":332,"존재":1248,"커 ":224,"정통":92,"졸업":318," 아내":168,"중대":89,"중도":73,"족을":151,"족은":109,"중동":130,"조의":562,"족으":212,"또는 ":7073,"주민":357,"제휴":62,"중립":69," 아라":177," 아랍":266," 아래":375," 양국":63,"제후":83,"조치":133,"주변":352,"즈가":129,"켓 ":162,"중랑":61,"종종":130," 안드":171,"제하":242,"제학":350,"종족":62,"정확":287,"제한":327,"정화":73," 아드":72,"주를":291,"전히":177,"종전":66,"좋은":148,"제프":122," 아들":897,"주문":90,"켄 ":72," 애니":636,"제회":58,"중력":144,"주목":108,"케 ":324,"중부":615," 아무":102,"즐기":62,"증거":195,"중복":64,"증가":223," 알리":168," 없고":94," 아버":489," 앙리":58,"증권":184,"치에서":60,"주선":105,"주석":120," 아미":102," 얼굴":67," 언급":141," 아바":62,"죄의":58," 쓰여":83," 쓰였":116,"주사":146," 아름":145," 아르":573,"켜 ":348," 알려":1676," 알렉":314," 알레":71,"조카":85," 앤드":80," 아메":328," 압력":91,"준비":133,"중반":143," 아마":238,"식별 ":61,"다루고":124," 아리":130,"즈는":180,"중서":88,"지개":84,"지가":490,"중성":112,"즈니":258,"지각":64,"중세":273,"중소":176,"종특":81,"지게":172,"조합":512,"조항":62,"조하":271,"조한":146,"족한":81,"족하":252,"종파":173,"지고":1085,"족행":61,"코 ":1075,"지관":81,"직공":120,"존하":237," 어느":236,"지구":953,"진공":59,"진과":118,"콘 ":142,"스부르":91," 여개":62," 여객":163,"조화":108,"중시":81," 야마":192,"중심":1840,"증기":111,"주소":193," 알바":142," 쓰이":700," 쓰인":357," 아부":59," 에게":145," 아브":60,"주시":655,"주식":354," 아비":83,"중생":157,"집이다":61,"준의":154,"준을":139,"준이":188,"준인":62," 연기":182,"중앙":1401," 암석":66,"주자":204,"주일":76," 암살":118," 아스":164," 어디":65,"주재":60,"준으":443," 아시":766,"주장":609," 
연극":159,"지나":378,"중에":1098,"지난":146,"지낸":215,"줄이":100,"지내":104," 에는":69," 어떤":681," 어떠":156,"주전":67,"주적":67," 어떻":123,"주제":335,"지널":67,"지냈":559,"준정":82,"즈로":101,"주주":795," 영국":2073,"중요":802,"지노":57,"주연":137,"지기":214," 아서":83,"주에":945," 여겨":280," 얻는":95,"콜 ":173,"주얼":72,"주었":136," 앨범":615,"지금":541,"지급":150,"질과":85,"주어":317,"진구":106,"좌파":64," 아사":106," 아산":69," 연계":84," 연결":746," 양력":106,"준에":93,"주와":720," 언덕":79," 안산":100,"주였":57,"줄여":450,"종하":70,"좌표":82," 에너":453," 연관":129,"증대":108," 연고":210,"주요":747,"질적인":82,"죽음":150,"죽을":61," 여기":344,"죽은":134," 없다":375,"주인":347,"주이":578," 어드":76,"주의":4317,"집권":101,"콩 ":169," 없는":546,"주위":137,"종합":594," 연구":2349,"통 ":1329," 이오":58,"찬가":157," 이온":105,"지에서":319," 자신":748," 이와":206," 이외":199," 정당":373,"차관":122," 일어":792," 이용":1530," 일에":1863," 이웃":64," 적도":79," 절대":86," 의존":77," 전도":66,"톰 ":68," 전동":120,"톱 ":100," 이야":437," 제네":68,"차가":214," 이어":589," 이에":430," 이었":171," 이전":899," 장수":124," 위해":3054," 위험":192,"창간":101," 있었":1191," 일으":392," 일을":310," 인재":83," 잠시":62,"짧은 ":150," 이정":57," 있어":761," 이제":64," 제도":983," 인접":485,"지역번":77," 인정":430," 이종":59," 일자":88,"참고":72,"급하는":96," 위협":73," 일이":219," 장식":99,"창건":81,"증진을":113," 정도":1045," 이유":297," 인용":64," 이익":269," 장소":242," 재산":268," 일요":83," 이자":142," 유통":251," 이의":67," 제대":57," 잠수":90," 위한":1962," 일원":183," 위하":1580,"참가":738," 재생":127," 이수":74," 전남":74,"다섯 ":151," 인쇄":158," 원형":97,"스미스":64," 일생":60," 저널":82," 재무":111," 이스":433," 이슬":305," 인수":172," 이승":76," 익스":106," 응용":360," 제기":143," 유체":74," 원활":65," 이성":104," 인사":173," 인상":58," 제고":88," 제공":975," 접근":267," 정권":212," 정기":77," 일상":122," 제국":1197," 정규":377,"증진에":72," 원하":87," 의장":114," 자생":67," 장비":172," 작사":165," 유치":74," 자살":101," 자산":102," 일시":89," 임시":200," 작성":314," 작센":73," 의정":119," 자세":74,"트남 ":220,"다목적":58," 전달":242," 인스":61," 위키":130," 위탁":113,"식물 ":196," 재배":113," 이시":95,"톤 ":252," 잠비":60,"토 ":1338," 인식":217," 의원":243," 유출":60," 종교":1217," 이치":73," 적분":68,"급하고":62," 장점":70,"회에 ":290,"채널":346," 재임":96,"기호 ":76," 전반":229," 일체":59," 재위":670," 정리":561," 인체":78," 인천":505," 장의":150," 정립":87," 정복":93," 재직":117," 정보":1417," 제반":57," 이탈":1113," 일컫":636," 일컬":82," 재즈":82," 자체":292," 입출":70," 은하":175," 제목":256,"찾는":69," 전부":159," 전북":64," 재정":153," 일치":69," 은행":189,"창당":94," 잡지":202,"창달":122,"창단":248,"기타공":108,"채권":223," 잡아":60," 작은":643," 전류":87,"차남":68," 잎은":103,"매를 ":89," 조각":203," 자유":815," 작위":102,"찾기":70," 자율":74," 인증":117," 작용":292," 인지":119," 자원":213,"비이다":74,"창군":88," 있으":3188," 전례":131," 있을":254," 전력":146," 입자":250," 이진":78," 이집":402," 입장":142," 있음":85," 이지":96," 자연":687," 이즈":59," 잭슨":67," 일종":701," 전략":271," 일제":542," 임의":128," 작업":288,"쪽과 ":149," 자에":59," 인조":152," 전라":664," 이중":112," 일정":464," 인종":139," 이주":143," 조기":59,"차단":60," 전문":804," 유효":82," 조금":94," 자주":201,"차는":139," 있지":409," 유형":175," 임진":126," 작전":169," 저명":67," 자전":132," 유행":117," 장애":251,"대부 ":69," 장안":62," 장악":76," 유한":96," 유학":142," 점령":153," 조건":224," 자이":125," 제사":70,"천국":78,"천구":94,"천군":186,"첨가":59," 이해":327," 이행":101,"화예술":214," 이하":307," 정수":130,"창립":248," 인해":475," 인한":143," 인하":140,"참모":66," 일하":99," 인형":68,"차별":151," 정신":352," 정식":444," 이후":1760," 전압":65," 적어":66,"청과":62,"체가":469,"청구":177," 제시":244," 전에":217," 일환":93,"체결":240," 전역":168,"체계":709," 입학":80,"체고":109," 적용":417," 조류":83," 작품":1113,"체국":70," 조리":61," 전용":143,"채무":169," 적은":107," 전원":79,"신들의":66," 저자":149," 저작":244," 적이":69," 저장":326," 전의":63," 점에":156," 정부":1251," 자치":673," 정비":99," 인텔":209," 인터":1102,"차량":357," 층 ":246,"막부 ":71," 전사":114," 전산":106," 장착":83," 저서":110," 의해":2065,"차로":318," 의한":371," 의하":461," 
의학":175,"차례":334,"시민 ":74," 전세":104," 전설":193,"름에 ":102,"차르":58," 저술":147,"차를":208," 의회":263,"착륙":65," 전송":205,"지역으":204,"지역을":380,"지역은":111,"카이 ":111,"지역의":387,"지역이":326,"지역인":66," 정사":64," 전승":126," 점성":173," 전신":162," 전시":258,"투 ":527," 장치":478," 정상":182,"천광":247," 접속":105,"스박스":60,"창설":240," 제주":484," 주기":199,"청동":74," 전차":91," 주권":84," 준공":68,"찰스":137,"체는":221," 정지":85," 재화":72,"화에서":184,"체되":102,"채소":57," 줄기":140," 중간":217," 전철":72,"지역에":787," 절차":120," 전체":567," 중구":353," 중국":2024,"체들":133,"차에":128,"르의 ":305," 정착":154," 조선":2644," 정책":576," 조세":64," 조성":287," 점차":84,"창시":143," 조사":590," 조상":79,"처럼":463,"차원":316," 존속":100," 주년":103,"처를":60,"초가":64,"카의 ":270,"차와":65," 접촉":74," 정체":105," 중기":271," 장편":91,"리메이":69," 칸 ":70," 제안":202," 전제":81," 젊은":107," 종로":405,"신문 ":168,"지어졌":61," 전쟁":1259," 적절":82," 전자":881," 저지":85,"차선":57," 제외":403," 점이":100,"청나":170," 종류":641,"청남":319," 전주":245," 정원":86," 칼 ":70," 점을":70," 제어":219," 점유":66," 종료":104," 재판":231," 제약":63," 자회":101," 제이":81," 종목":202,"찰서":86,"천동":110," 주가":75," 주간":132," 전직":83,"체나":63,"비에트":309,"차세":59," 정의":672," 접전":68,"청년":158," 제조":389," 주교":176," 주고":100," 제정":501," 주관":308," 재학":57,"처드":110,"첨단":94," 재해":66," 제임":214," 제일":119," 제작":1250," 제자":112," 주거":104,"철도":871,"탁 ":81,"타 ":1864," 용산":140," 오키":113," 영향":784,"탄 ":632," 유니":282," 유닉":93," 온타":105," 연호":83," 외에":213,"진흥":630,"징하":82," 유네":90," 요시":226,"지위를":75,"킹 ":248,"회사의":138,"지휘":406," 오케":84,"비잔티":171,"진화":280," 역할":582,"회사인":94," 여행":159,"회사이":255,"직후":76," 역학":81,"집행":234," 윈도":635," 연합":775," 유나":92," 요소":324,"질환":102,"집필":60,"집합":497,"집하":97,"진행":633,"코가 ":60,"지형":108,"킨 ":405," 운반":62,"진하":250,"진한":63,"진해":69,"비치 ":261,"는지 ":96," 왕조":583,"회사에":86," 왕족":171," 요새":71," 원리":213," 오카":66,"킬 ":155," 위대":99,"질학":58," 유기":176,"집트":409," 왕위":174,"직하":135,"직한":63,"직할":115,"지할":102," 왕으":89,"지함":108,"지하":1289,"지한":165,"키 ":1509," 왕자":123,"지향":165," 왕이":223," 영토":326," 왕의":63,"지했":204,"환으로":121," 완전":315," 육군":269,"지표":121," 원래":602," 우리":336," 유가":72," 예측":113," 와이":160," 에피":148," 유고":73,"리소 ":81," 월드":600," 우르":62," 첫 ":814,"루트비":99," 자국":94," 운전":103," 있기":213," 이나":121," 율리":81," 잉글":370," 의도":111," 유명":895," 우정":89," 작곡":845," 이끄":118," 이끌":202," 임기":78," 임금":138,"진왜란":116," 윤리":120," 자격":186," 우익":62," 총 ":839," 유리":168," 작가":611," 입구":69," 자가":81," 운용":154," 일까":724," 있고":843,"화인민":476," 있게":303," 원시":110," 용인":92," 워싱":115," 인기":260," 운영":1278," 인근":199," 위반":87," 우에":75," 오후":126," 이기":80," 유로":181," 원수":138,"화재로":72," 인구":3560," 일곱":88," 인권":151," 원소":289," 유럽":1117," 유러":115," 인공":238," 유래":578," 오호":60," 윌리":269," 용어":934," 이곳":157," 원산":126," 이고":250,"회원 ":168," 초 ":138," 일간":93," 이것":480," 인간":752," 오피":88," 오프":61," 오픈":259," 요인":73," 왼쪽":64," 오하":61,"회사를":67,"매되었":349," 위를":178,"능이 ":86,"택 ":158,"태 ":322," 우승":675," 오페":308,"회사로":118," 운송":76," 우수":237,"탕 ":82," 위로":69," 울산":328," 월부":254,"탑 ":86," 영화":1571," 우선":79," 유동":58,"컫는다":300," 유도":157,"화재단":118," 오토":147," 유대":297," 의거":134,"탈 ":268," 의견":96," 육성":259," 음반":525," 자동":798," 유성":164," 이래":213," 의무":187," 작동":93," 이라":1915," 이란":1010," 위에":358," 이러":460," 이런":125," 이렇":64," 우크":191," 인력":88," 의미":1870," 유신":72," 이로":113," 이론":891," 요크":70," 이때":111," 있다":14657,"칭이 ":130," 원주":160,"쪽은":318," 있는":7503,"활약한":126,"활약하":64,"쪽의":188,"화유산":156," 잇는":267," 이들":580,"쪽으":3081," 이듬":68," 있던":474,"터 ":8679," 인디":276," 의료":229,"턴 ":549," 있도":441," 자는":478,"털 ":563,"활약했":89," 육상":128," 장남":113," 웨일":79," 유산":69," 웨이":59," 유사":324," 장관":274," 이다":3516,"칭은 ":476," 위상":193," 장교":115," 음력":494," 장군":295,"칭을 
":179,"쪽에":725," 위성":197," 원인":186," 읽는":58," 원작":227," 원자":518," 이동":421," 음료":60," 움직":247," 일대":282," 장기":194," 유비":58," 인도":1080," 원제":62," 원정":83," 요코":84," 자녀":61," 원조":65," 유물":109," 우주":613," 이념":113," 요청":100," 자극":96," 자금":128," 웹사":127," 우즈":92," 으로":1449," 자기":443," 웨스":246,"지이며":179," 울주":85,"화이다":369," 월에":788,"화재가":70," 이는":558," 육지":73,"시되었":188," 전기":671," 재료":157," 유지":523," 전까":100,"회사가":85," 장르":269," 자발":83," 의식":186," 자바":143," 잘못":97,"지원을":145," 전국":513," 적극":94,"시되어":63," 이산":64," 이사":315," 이상":1002," 익산":65," 정교":161," 위치":5975," 제거":141," 자본":218," 장면":76,"회와 ":238," 음악":1252," 전개":206," 유입":69," 자매":61," 의사":412," 유인":76," 유일":468,"직으로":106," 임무":131," 일반":1861," 자리":455," 이복":68,"신도시":92," 자를":83," 자문":64," 입법":102," 일부":2106," 전공":90,"직이는":74," 장로":61," 운행":339," 운항":58,"커다란":64," 음식":283,"직이다":99," 유전":388," 유적":194," 일본":4755," 일명":82," 유엔":66," 인물":662," 인문":59," 유역":93," 자란":59," 자라":124," 인명":72," 자랑":66,"템 ":507," 음성":147,"기후 ":77," 임명":212," 인민":184," 유용":63," 이번":134," 이베":69," 이벤":57,"시드니":63," 이민":78," 우편":108," 이미":324," 이바":343," 워크":74," 자료":390," 이름":3638," 이른":205," 이르":661," 이를":700," 일로":82," 위의":84,"텐 ":76," 재단":468," 이뤄":69," 일렉":96," 이룬":137," 이루":1409," 위원":421,"테 ":299," 일련":190," 원칙":300," 이며":915,"지이다":426," 일리":90," 위임":72," 인류":278," 요한":197," 입력":181,"카와 ":236,"텔 ":308,"우가":342,"앙행":163,"용과":94,"암호":151,"올바":113," 수술":59," 수신":86," 순수":111,"仕 ":99,"에의":112,"창 ":189,"예상":58,"애플":275,"예산":117,"에이":482,"염색":75,"업이":475,"업인":321,"기의 ":1762,"업자":222,"없이":431,"여신":129,"었으":2299,"었을":161,"업으":181,"업을":673,"업은":134,"연수":157,"없으":154,"업의":671,"영사":61,"업적":191,"오버":82,"영상":486,"에우":80,"리아는":57,"어지":702,"역시":1578,"현존하":104," 수소":95,"어진":2055," 수송":124,"압하":71," 순서":114,"찬 ":123,"언제":60,"요구":295,"양체":81,"연세":57," 승격":131,"알프":91,"열사":59,"예비":145,"와라":117,"어족":218,"역소":73,"양천":58,"안하":96,"올림":844,"얼음":64,"착 ":90,"안한":115,"어졌":507,"올리":188,"올린":122,"어져":637,"찰 ":199,"에어":186,"연쇄":70," 수상":426," 수산":175," 수사":159,"여수":112,"에야":80,"단법인":2067,"연속":265,"연소":68,"기상청":109,"업용":63,"요건":96,"역삼":95,"역상":157,"어인":161,"어이":939,"아프":916,"어있":255,"글이다":95,"역사":1691,"어의":702,"오리":350,"얻은":80,"차 ":2442,"업연":100,"여성":713,"연산":224,"人 ":246,"없었":109,"여섯":133,"역설":57,"업에":293,"없어":112," 승객":70,"언이":107,"악하":95,"여서":479,"언을":82,"약칭":420,"아하":63,"역의":628," 수영":134,"오사":185,"역임":393,"역인":112,"여자":435,"역이":801,"스타 ":243,"연을":133," 싱가":97,"화에 ":525,"약하":110,"약한":190,"언츠":61,"연의":96,"오세":131," 수요":109,"연이":74,"업체":378,"업청":73,"연장":227," 수용":208,"월간":108,"영양":129,"연재":221,"약했":96,"영어":955,"열을":131,"스탄 ":326,"원격":59," 수원":209," 좀 ":59,"여졌":92,"열의":161,"영업":166,"원경":141,"용노":84,"영에":132," 손해":107,"연예":123," 소형":113," 존 ":356,"열에":85,"엔진":374,"약품":127," 심각":57," 수업":58,"역으":485,"여의":138,"역은":412,"역을":746," 수여":189," 시기":380," 소프":898,"예술":941,"예수":361,"여야":90,"연애":59,"연안":304," 소피":59," 시공":71," 속해":183," 송파":169," 소행":123,"역에":1144,"우기":66," 속하":1012," 시계":86," 속한":669," 신경":196,"여왕":122," 조 ":468,"어촌":137,"우고":127,"예선":249,"오브":295,"책 ":401,"채 ":187,"앙회":62,"용기":122,"왜란":118,"우구":91,"업지":61,"업진":68,"었지":266," 시가":100," 시각":159," 시간":720,"영리":199,"양으":132,"양을":259,"언스":130,"아키":326,"예루":109,"후로 ":162,"양의":505,"아타":75,"아크":78," 제 ":4289,"예로":129,"만이 ":196,"기자 ":326,"어스":174,"오디":190,"액체":118,"于 ":68," 수많":217," 정 ":183," 수립":238,"압축":117,"온도":198," 수를":81," 수리":128," 성향":66,"와나":75,"양에":292,"여명":94,"연맹":487," 점 ":68,"아케":128,"열린":478,"열리":280,"오도":95,"어선":74,"어서":782," 쉽게":163,"亀 ":88," 세포":385,"아침":62,"亂 ":1513,"왕국":706,"온다":87,"아카":282,"약자":201,"많은 ":1265,"약이":262," 
수록":335,"약을":246,"약은":85,"어사":104,"야이":202,"약의":117,"왕과":64,"스턴 ":192,"오다":102,"약으":104,"야의":315,"오니":86," 수렴":58,"여를":144," 선형":142,"여름":122," 센트":165,"오늘":448,"오는":554,"기장 ":96,"연료":154,"열렸":185,"스터 ":402," 전 ":1060,"연령":60,"왕가":164,"에밀":59," 손자":147,"에미":96,"어원":140,"亳 ":89,"오류":90,"올로":60,"억원":67," 숭배":60,"오르":366,"오른":100,"지하기":153,"오를":110,"요가":94,"예보":65,"어와":363,"언에":58,"왔다":418,"예방":135,"티드 ":85,"아폴":74," 수비":88,"양쪽":63,"알파":123,"얻어":102,"얻었":76,"올려":58,"어업":225,"어에":445,"만을 ":299,"오로":122,"안티":93,"올라":211,"에스":534,"엑스":201,"어오":76,"어온":57,"언어":1505,"올랐":113,"영미":83,"지하고":206,"오랫":76,"완도":69,"아티":246,"온라":299,"염병":62,"아파":201,"어야":172,"양지":59,"기인 ":130,"외곽":60,"양주":113,"외국":418,"오래":309,"아트":145,"리아를":70,"오랜":130,"외교":422,"오라":68," 수반":64,"亞 ":1949,"여부":101,"외계":90,"에서":29088,"안토":110,"영문":119,"에선":73,"아테":131,"아토":68,"양재":76,"역번":77,"연방":786,"언십":167,"양자":317,"만의 ":176,"양이":303,"예를":245,"와는":313,"어리":167,"어린":285,"어를":713,"모가 ":127,"오가":148,"어류":77," 소수":212," 셋째":71,"악원":61,"양부":117,"협의회":158,"야생":75,"안을":183,"안으":82,"아자":83,"악이":91,"아있":148," 소스":183,"아일":569,"아인":232,"악의":207,"역대":111,"아이":1321,"어링":63,"어릴":143,"악을":150,"아의":1952," 서태":96,"약성":129,"아제":124,"않은":399,"않으":216,"오고":97,"않을":60,"안이":117,"연대":266,"안의":235,"어머":290,"앙아":113," 수단":217," 선택":204,"연도":64,"역되":83,"안정":303,"안전":674,"여되":60,"에라":85," 수는":94,"앙에":77,"얼마":78,"열대":208,"어라":65,"권한을":66,"액션":116,"였기":64,"때의 ":171,"실시되":64,"실시된":74,"어려":133,"아어":558,"스크 ":410," 선출":244,"아야":80,"에디":101,"않았":277,"안양":115," 소셜":77,"아와":401,"알아":118,"아오":102,"예금":73,"어렵":70,"에드":190,"아에":642," 소설":770,"않아":117," 생활":496,"어로":1502,"악에":82,"佇 ":72,"화와 ":320," 설치":696,"아웃":101,"아울":62,"아우":283,"안에":740," 소송":210,"언론":352," 소속":1527,"영된":120,"영되":313,"애자":58,"업무":727,"액을":97," 속에":242,"애인":203,"글자를":64,"양수":197,"기술과":58,"엄밀":70," 선포":61,"업발":66,"오나":76,"엘리":216," 소유":309,"완공":68,"여래":59,"영등":168,"디자이":77,"디자인":463,"양시":233,"양식":325,"앤젤":102," 소장":152," 속이":74," 센터":199,"기술개":69,"여러":1623," 속의":125,"약에":115," 서해":142,"카에 ":110,"야에":433," 소재":587,"연되":60,"옮겨":85,"아주":160,"앞에":101,"열도":82,"알제":77,"오기":78,"에로":64,"였는":131,"엘라":90,"야스":153,"아즈":110," 수도":1216,"았으":301,"였다":6029,"야시":57,"아지":116,"아직":137,"양사":193,"양산":105," 속씨":88,"였던":503," 소아":74,"어버":79,"양상":57,"다시 ":463,"니와 ":61,"않지":68,"에르":318,"시즌 ":224,"에른":140,"에리":86,"앙정":74,"양성":389,"에만":120,"영동":84,"옛날":72," 산하":461,"씨식":95," 자 ":216," 소멸":81," 서적":93,"막을 ":84,"야로":143," 사항":165," 선의":80," 섬에":254," 사하":60," 상트":68,"아사":140," 섬이":161,"아산":86,"스키 ":291,"야마":570," 섬의":117,"기술로":69," 산화":93," 상표":91,"대륙의":64," 잘 ":546," 성우":86," 생태":163," 세에":95," 사후":156," 섬을":79," 섬으":146,"여개":65,"여객":267," 선종":67," 사회":2075,"야를":102," 섬유":63,"여가":61," 설정":173," 살해":171,"어는":597," 선조":133,"리아가":58,"단백질":180,"어느":238," 선전":63," 사형":76," 선정":248," 쇼군":110,"알베":57," 선양":102,"쓰인":360,"쓰이":716," 선언":226,"아부":73,"에게":2384,"안보":81,"양도":65," 상태":747," 일 ":16662,"알바":146,"르토 ":64,"양동":61," 서양":207,"었고":896,"애를":72," 소련":332,"막의 ":105," 석유":97," 소리":267," 소말":61,"어내":101,"아비":128,"어낸":69," 서유":66,"아브":61,"어나":672,"어날":73,"어난":835,"었기":138," 솔로":142,"어났":456,"업기":117," 서울":3544,"머리 ":75,"안시":66,"었던":889,"어떻":123,"투에서":133,"였고":684," 성직":123," 성질":259,"에는":3705,"었다":9042,"연금":63,"아시":1288,"금융기":75,"연극":189," 성주":57,"어떤":681,"어떠":156," 세종":300," 소비":642," 세조":70,"암석":66,"연기":258,"염기":67,"마을 ":150," 서초":356,"영국":2099,"에도":1265,"씨의":120,"영과":63,"에다":73,"어도":167," 세우":109," 세운":200,"언덕":81," 성인":182," 세워":312,"양력":120," 성장":260,"역과":296,"연계":123," 상하":185," 선진":141,"연결":758," 세웠":103,"연고":218,"안성":60," 
상품":255,"얻는":95," 소방":94,"여겨":282," 세와":139,"앨범":671,"아서":237," 성은":94,"얀마":118,"안산":110," 성의":77,"없는":620,"여금":57,"에노":70,"어드":104,"어들":198," 상호":668,"여기":352,"업단":115,"기적 ":104,"었는":201,"없다":384," 샌프":82,"암살":118,"아스":346,"어디":66," 상황":261," 수가":104,"마의 ":282,"연과":73," 성전":89,"연관":142," 성적":226,"에너":561," 장 ":243," 세의":435," 서쪽":905,"대륙에":112," 세이":184," 세인":106,"연구":3821," 사찰":82,"았던":129," 은 ":18128,"암동":115," 을 ":2159,"애니":672," 소녀":88," 상징":242,"양계":70," 소년":140,"양경":128,"양과":174,"양군":100,"양국":72,"양구":62,"비아의":150,"아들":1116,"야기":519," 서북":81," 서부":519,"아드":146,"때에 ":170," 상주":72,"아디":57,"았다":1000," 서브":58,"안드":183,"길이 ":175," 서비":842,"금으로":71," 서사":93,"실제 ":183,"俘 ":76," 생존":104,"아레":72," 석사":65," 사카":84," 세부":97,"아로":119,"알라":103,"앤드":86," 속도":241," 의 ":5102,"치인 ":144,"기술서":144," 성분":85," 읍 ":79," 성북":85,"디오에":59,"아라":308,"아랍":275," 소니":105,"아래":399,"압력":96," 서식":301,"리에서":329,"승용차":65," 선수":1292,"아마":286,"아목":89,"알리":180,"아몬":59," 사태":68,"아메":712,"알려":1678,"알레":75,"알렉":317,"아리":224," 소득":100,"아름":151,"어가":540,"어간":77," 서술":85,"아르":723,"아를":253,"직할시":78,"아바":91,"언급":141," 이 ":6364,"앙리":61,"쓰였":116,"얼굴":69,"아미":126,"지하였":117,"업과":191,"업관":61,"없고":95,"아버":569," 인 ":395," 생체":57,"업계":106," 센서":58,"양대":68,"지》":67,"르트 ":344,"어권":118,"니어 ":84,"아무":106,"아문":73," 성서":95,"기술부":292,"길을 ":64," 산타":90,"업가":98,"쓰여":83,"형적인":66," 세상":123," 위 ":236,"악기":342," 상수":101,"안구":75,"안군":96," 상속":67,"알고":264,"아기":73,"시스템":1332,"심이 ":111," 섬들":62,"안과":128,"아군":73,"니오 ":65," 성동":57," 세대":321,"실시간":101,"않고":536,"악구":59,"아과":121," 산스":84," 삼성":325," 성당":140," 세는":67," 웹 ":345,"악과":60," 사용":5046," 생산":1061," 사운":155," 사우":191,"기술적":76," 사원":114,"아날":73,"아나":434,"아내":271," 산업":792," 살아":158," 살았":134,"시스코":114,"았고":97," 사업":732," 산악":85,"지함을":85,"슈퍼 ":89," 서로":499,"진행된":77,"진행되":197," 상승":71,"않기":58,"심의 ":122," 소규":60," 생식":58,"아는":197,"기술연":67," 삶의":108," 삶을":67," 살인":124," 상에":106," 상업":212,"아누":93," 설립":4416," 서명":79," 사정":82," 사제":64," 소관":1915," 사적":136," 사전":183,"기술을":263," 산이":64,"기술의":232,"아노":202," 사장":102," 사자":61," 사이":2278," 소개":178,"기술이":179,"기술인":74," 사유":70," 생성":251,"기술자":91,"애국":68,"급으로":81," 손꼽":71,"안되":84,"안된":112,"기존 ":131," 선보":66,"즈》":67,"안동":130," 성모":102," 색을":60,"侶 ":207," 세르":235,"야구":985," 서버":233,"아돌":57," 상임":95," 선발":106,"심을 ":205," 선박":128,"아동":169,"앵글":61,"안데":62,"애나":90," 산지":91," 상의":104,"않다":149," 상을":58," 성립":280," 상이":114," 상인":70," 상위":92,"않는":850," 삽입":61," 사진":217," 상으":64," 성리":78,"지하철":230," 상원":72,"기술에":58,"악단":271," 세로":127," 소금":58," 세력":190,"아닌":450,"아니":1250," 설명":374," 상용":62,"약간":124,"아다":74,"말은 ":132," 사무":2313," 사물":92," 설계":434," 선교":139," 선구":63,"지한다":100," 서기":255," 새로":744," 새롭":78," 상륙":76," 상류":70,"리아어":250,"리아에":188," 섬과":100,"목과 ":70," 사법":159," 성격":204,"말을 ":157,"스주의":80," 성경":194," 세가":201,"통틀어":103,"트리 ":144," 성공":441," 성과":102," 세계":3678,"스코 ":228,"치적 ":191," 서남":105," 원 ":160," 센고":139,"지하는":209,"쓰는":237,"쓰다":206,"니아 ":1248," 월 ":22165,"많이 ":589," 세균":89,"쓴다":98," 세기":1685," 생리":73," 생명":334,"쪽 ":908," 샤를":130," 서대":93," 성남":123," 사상":928,"르체고":106,"캄보디":75," 생물":516," 산소":61," 선도":76," 산성":63,"집트의":91,"악가":125,"실이 ":60,"비영리":167,"아가":452," 세네":79,"화의 ":697," 사실":480," 성능":96," 사도":134," 외 ":86,"슈팅 ":70,"진하고":72," 생기":166," 생긴":156," 살던":66,"倉 ":985,"토해양":108," 상당":502," 상대":501,"캘리포":248,"말이 ":72,"회사 ":324,"지했다":169,"뉴질랜":239,"측정하":101,"메라 ":83," 사람":2374," 사랑":216," 사라":147,"쓰고":114,"화재 ":280," 사령":113," 사례":65,"기준 ":2089," 사망":418,"기술하":75," 용 ":77,"리아와":111," 사립":104," 사마":97," 사막":116," 사르":73,"확인 ":117," 선고":68," 서구":220,"리아의":735,"투아니":93," 
우 ":320," 선거":866,"쓰기":124," 산맥":264," 산림":264,"말의 ":116,"적이":892,"저작":265,"저자":156,"적인":4818,"亂並":388,"전위":69,"亂丘":135,"전으":231,"전은":327,"亂三":545,"저장":344,"적자":77,"전의":505,"정안":94,"亂丁":320,"점에":478,"전을":821,"적재":58,"자하":69,"전인":95,"전이":516,"져서":63,"작하":761,"작한":580,"작품":1198,"조를":342,"조르":59,"조리":96,"전용":189,"적은":2124,"적으":6892,"칭 ":682,"전원":97,"적의":174," 쓰기":94,"적을":427,"임했":179,"저우":183,"당시 ":1027,"침 ":97,"따왔다":64,"절에":116,"적용":431,"환의 ":69,"조류":112," 쓰고":102,"리우스":221,"전에":1678,"제시":329,"일환":97,"임한":108,"임하":316,"전역":201,"입한":122,"입학":90,"입하":329,"인형":87,"조로":212,"적에":131,"적어":75,"전압":72,"이후":1816,"잔티":172,"並國":285,"칠 ":70,"정식":676,"정신":481,"이화":58,"일한":420,"정시":83,"일하":307,"친 ":511,"정수":173,"乙倉":109,"이해":346,"인프":61,"이행":148,"인하":333,"칙 ":237,"인한":241,"인해":494,"치 ":2632,"신인 ":126,"이프":354,"제사":194,"이한":134,"이하":446,"三大":142,"신이 ":258,"접속":115,"이풀":105,"정성":106,"정서":149,"장치":620,"전신":167,"환을 ":105,"전시":500,"乙乙":117,"정상":286,"乙之":271,"之倉":297,"일파":102,"자키":58,"종대":66,"乙亞":103,"전승":143,"정사":185,"점성":174,"乙亂":105,"자크":65,"乘三":61,"이퍼":110,"乙並":232,"재청":67,"전술":58,"족들":119,"乙丁":307,"시절 ":80,"저스":73,"乙丘":178,"乙三":410,"三國":301,"인트":215,"전소":133,"이팅":121,"저수":94,"이파":70,"전송":216,"저술":150,"의회":639,"이티":216,"之亂":405,"명단 ":58,"之亞":451,"이틀":169,"이트":1360,"전세":108,"신장 ":61,"조된":63,"전성":138,"조되":83,"전설":222,"전선":133,"之丁":1040,"之三":2090,"之丈":89,"망을 ":100,"之両":59,"之並":1657,"之丘":553,"之丙":209,"之且":109,"장착":83,"전산":136,"之丹":143,"전사":228,"저서":111,"의해":2089,"之之":1269,"의한":468,"의학":321,"의하":736,"之乙":257,"의할":59,"제부":107,"자치":1144,"정비":150,"확장 ":96,"인텔":213,"이토":133,"인터":1169,"층 ":466,"丹倉":60,"조달":91,"응하":231,"조는":153,"이터":1189,"정부":1875,"정벌":59,"정법":122,"자체":347,"丹乙":84,"정변":61,"재즈":85,"丹之":209,"丹並":154,"제반":61,"정보":2086,"이크":941,"이클":251,"정복":111,"재직":118,"재지":325,"쟁으":65,"쟁은":76,"僅 ":60,"쟁을":248,"쟁의":208,"이타":167,"이탈":1138,"쟁이":241,"丹亞":73,"제법":87,"은하":190,"측 ":145,"제목":268,"이케":58,"츠 ":974,"리이다":317,"잡지":250,"화제 ":72,"조나":82,"쟁에":206,"일치":88,"은행":538,"이코":123,"리잡고":88,"丹三":207,"丹丘":102,"丹丁":217,"입출":70,"정받":96,"비스를":298,"並倉":230,"일컬":82,"일컫":636,"재의":598,"並人":80,"장점":74,"中三":72,"종과":75,"실을 ":195,"제리":117,"종교":1302,"장조":63,"이치":249,"재자":79,"並亂":444,"전보":64,"傭 ":180,"재이":59,"재임":98,"並亞":687,"재일":59,"적분":127,"정무":57,"이카":88,"제명":84,"재정":215,"이커":104,"전북":82,"전부":292,"中並":57,"中之":85,"장을":941,"인체":87,"장은":773,"장으":586,"정리":695,"両之":103,"인천":533,"졌다":865,"장인":245,"丟之":69,"両三":99,"장이":1191,"젤레":109,"장의":447,"정립":97,"両並":87,"並丹":89,"일체":116,"並並":1269,"並乙":240,"전반":248,"丘倉":118,"스케 ":67,"실은 ":1893,"並之":1226,"장자":107,"並丁":729,"제를":766,"並丘":363,"융합":89,"재위":678,"제르":165,"並両":95,"並三":1954,"並丈":68,"종결":57,"三倉":449,"丘之":421,"丘丘":278,"자주":224,"丙丁":90,"丘並":322,"丙三":80,"화적 ":138,"丘亂":138,"僅丁 ":58,"丘乙":185,"丙之":212,"있지":448,"재연":67,"제로":721,"조기":91,"丘亞":147,"실의 
":102,"잡이":62,"유효":82,"丟並":76,"조금":122,"이천":64,"재에":108,"丟丁":65,"전문":1050,"유해":57,"장애":383,"장악":78,"장안":65,"丁倉":187,"자전":175,"자적":159,"유행":121,"자재":65,"작자":143,"자자":71,"작이":256,"조건":316,"유하":391,"유학":156,"유한":279,"점령":156,"장에":759,"임진":131,"유형":215,"전면":62,"제라":64,"작전":209,"丘三":684,"丘丁":429,"임즈":102,"저명":67,"정렬":62,"은퇴":57,"三侶":71,"족과":104,"丈三":117,"잎은":109,"三丈":107,"丈並":101,"三万":69,"三丁":1471,"자원":529,"三三":4055,"三丘":745,"인지":345,"三丟":89,"三丙":67,"三並":2154,"三両":110,"인증":178,"丈之":100,"三丹":134,"작용":550,"三之":2117,"三乘":139,"三乙":351,"자인":870,"자이":2068,"작의":72,"三亂":531,"작으":448,"자의":1247,"작은":702,"잡아":77,"작을":166,"三亞":989,"전류":94,"조개":72,"자음":68,"三人":96,"자율":111,"작위":138,"조각":242,"조가":162,"자유":977,"자였":253,"임이":597,"자연":787,"리위원":62,"자열":60,"丁丘":425,"丁且":87,"임의":297,"작업":333,"입은":77,"자역":86,"입을":113,"작에":70,"丁丈":69,"임을":405,"자에":659,"丁丁":864,"제들":91,"丁三":1278,"전략":331,"丁丹":114,"임은":109,"임으":241,"자어":57,"丁並":785,"일제":561,"이중":126,"丁乙":272,"일정":485,"임위":64,"인종":158,"丁之":1054,"이주":163,"으키":165,"으킨":137,"이징":189,"丁亞":282,"이집":416,"이진":103,"이지":1231,"입장":148,"있음":88,"丁亂":313,"丁人":80,"전례":156,"있을":258,"있으":3281,"자와":521,"입자":336,"전력":195,"丈丁":89,"잭슨":68,"이즈":350,"임자":75,"입이":70,"일종":722,"정되":1028,"이잔":109,"정된":594,"식을 ":714,"재생":142,"이재":61,"이익":326,"이이":176,"재산":389,"이인":77,"장소":328,"황에 ":67,"유통":346,"일요":87,"이자":2950,"인의":1113,"위해":3093,"인이":4001,"인으":806,"인을":537,"임에":183,"인은":257,"일원":193,"위하":1705,"위한":2059,"식은 ":203,"장성":72,"이유":337,"인용":176,"졌고":82,"익을":208,"제대":96,"정동":479,"이의":934,"잠수":99,"이은":66,"제단":60,"정도":1147,"인원":74,"이종":60,"일자":116,"인접":487,"인정":511,"인적":116,"위협":78,"인조":157,"전라":669,"장시":86,"장식":115,"임워":63,"취 ":74,"잠시":62,"기지 ":76,"인재":122,"이제":109,"제도":1306,"있어":778,"이정":63,"인자":89,"이점":63,"장수":156,"이전":987,"통합하":160,"이저":329,"인인":120,"이족":65,"이조":77,"제되":75,"일인":105,"일이":470,"일의":940,"위험":212,"일을":556,"일은":174,"일으":424,"있었":1211,"자수":84,"이야":497,"제네":71,"전되":64,"절도":60,"이어":1308,"이언":277,"이었":2244,"이에":1970,"익에":79,"신설되":74,"의존":91,"전도":170,"이아":356,"전동":176,"이안":76,"캐릭터":185,"이외":210,"정당":438,"유키":92,"이용":1588,"이우":80,"일어":991,"장사":59,"시인 ":178,"일에":2143,"이웃":70,"제는":261,"이였":67,"인어":137,"이올":125,"이온":234,"이오":499,"식의 ":484,"인에":349,"이완":207,"자신":773,"유클":61,"이와":378,"자식":84,"의지":73,"일신":97,"일시":120,"일스":114,"잠비":104,"파는 ":76,"인시":91,"인식":243,"유출":66,"의원":853,"활을 ":172,"위키":154,"인스":118,"위탁":115,"재배":144,"인슈":60,"응을":89,"제나":96,"절대":92,"적도":104,"임시":210,"작센":85,"의제":61,"작성":335,"의정":233,"의적":274,"자세":81,"자성":62,"임스":290,"전당":60,"전대":71,"전달":281,"의장":161,"자생":114,"음주":74,"의자":305,"유치":96,"작사":221,"장비":214,"의인":546,"자산":166,"자살":104,"자사":95,"의의":352,"일상":129,"시작 ":73,"제국":1585,"일생":67,"시설이":123,"원한":148," 집 ":170,"원하":678,"쟁력":155,"식이 ":173,"이소":58," 신흥":72,"비스이":77,"정기":592,"제교":76,"을이":57,"이세":85,"제고":109,"이센":174,"인상":83,"제곱":73,"제공":1150,"정규":416,"이션":1407,"음에":319,"접근":281,"정권":313,"점기":433,"이선":105,"시설을":69,"인사":262,"兌 ":199,"이성":133,"익스":143,"활의 ":71,"입사":58,"이승":80,"원후":60,"이식":83,"이시":344,"의와":146,"이슈":60,"제기":237,"유체":83,"음의":177,"원활":70,"원회":1095,"임상":62,"이슬":422,"이스":2082,"仕三":62,"육청":92,"음이":197,"인수":210,"원형":177,"인쇄":185,"음으":564,"음은":101,"의에":273,"음을":324,"응용":379,"저널":109,"음원":65,"재무":115,"일성":102,"이수":87,"전남":92,"의약":116,"이순":57,"재로":240,"저기":73,"대백과":604,"잘못":97,"적극":95,"출 ":296,"치의 ":225,"점검":69,"장르":283," 시행":352,"육지":116,"재료":225,"유지":641,"시장 ":205,"전기":888," 신학":305," 시험":291," 지 ":95,"의식":267,"자바":152,"자발":97," 시호":361," 진 ":93,"점과":85,"人三":93,"전까":209,"을에":58,"자베":74,"장면":96,"자본":288,"제가":451,"제강":74,"음악":1495," 질 ":78," 신호":310,"정계":61,"정경":76," 실행":293,"칙을 
":131,"재를":220,"이사":565,"이산":85,"정과":206,"스카 ":184,"정관":93,"충 ":115," 신화":415," 실험":263,"정구":335," 실현":162,"이상":1108,"위치":6047,"제거":152,"익산":65,"정교":193,"일반":1994,"자리":1002,"익보":67,"이보":83,"이복":70,"유의":257,"임무":144,"일방":62,"유인":88,"之大":89,"유일":471,"입문":63,"자매":75,"의사":510,"육의":120,"마스터":68,"육을":218,"인보":77,"유자":93,"의상":66,"유입":79,"까운 ":156,"전개":214,"이부":78,"육자":126,"의성":63,"일보":361,"일본":4937,"유적":246," 실패":143,"유전":485,"전거":106,"추 ":144,"이븐":59,"율을":155,"이브":495,"음식":352,"축 ":318,"이블":337,"운항":73,"율이":70,"亞倉":61,"이비":217,"장로":122,"운행":351," 식품":226,"전과":411,"전공":113,"일부":2152,"전광":273,"우호":59,"춘 ":196,"신은 ":98,"자문":126,"입법":132," 심판":95,"작물":136,"율적":242,"유주":119,"전국":582,"위주":70,"심에 ":61,"신을 ":369,"之國":69,"之圓":61,"이묘":288,"자랑":69,"인명":89,"유역":120,"이므":100,"자로":1208,"인물":819,"인문":93,"유에":59,"유엔":67,"자력":167,"일명":83,"음성":162,"亞丁":182,"亞三":599,"음서":80,"亞丘":99,"자루":62,"워크":714,"이바":376,"유와":57,"亞並":511,"자료":546,"울특":2233,"이미":360,"亞之":492,"이민":84,"우편":143,"자를":1003,"용해":337,"자르":81,"용했":139,"육원":57,"용하":2610,"亞亂":216,"용할":380,"용한":877,"이베":186,"亞亞":176,"이벤":65," 시티":84,"유용":65,"임명":214,"인민":1273,"이버":362,"이번":163,"위원":1545,"亂乙":88,"의병":59,"일로":231,"재는":563,"亂之":408,"이뤄":69,"일렉":99,"亂亞":162,"亂亂":108,"일련":190,"장동":67,"이마":124,"위의":342,"신의 ":935,"위이":229,"이맥":82,"위인":79,"인류":317,"이름":3683,"이를":1128,"이른":205,"이르":700,"재단":793,"이리":104,"입력":182,"요하":229,"이머":65,"요한":1121,"이먼":60,"이메":59,"디지털":568,"이며":7197,"위임":83,"위자":61,"장된":112,"장되":154,"인리":94,"인먼":269,"비스타":71,"위조":60,"자라":247,"재되":117,"자란":69,"원칙":340,"은색":62,"이면":165,"이명":60,"亂倉":143,"일리":688," 신체":155,"외하":104,"외한":225," 스포":456," 신청":73,"매사추":92,"육상":138," 시초":66,"장남":121,"우치":86," 스펙":67," 스페":757,"웨일":116,"장난":59,"유산":310,"유사":363,"웨이":486,"자대":65," 시청":126,"쟁과":63," 丘 ":87,"있도":442,"자니":97,"입된":112,"입되":233,"마쓰다":198,"의를":445," 스파":150,"자는":1092,"일드":67," 스티":145,"원지":101,"있던":494," 스트":337,"인디":287," 스튜":250,"인들":571,"워진":166,"의료":334,"요크":80,"이때":111,"있다":15164,"원주":176,"위스":240,"재까":179,"이디":118,"있는":7991,"우체":68,"의로":182,"으므":160," 즉 ":662,"메리 ":61,"유신":98,"이루":1453,"일러":233,"이룬":139,"우크":207,"위와":98,"의미":1879,"일랜":486,"인력":158,"자들":1135,"위에":756,"이론":1040,"이로":415,"이렇":64," 스프":81," 시카":80," 스피":206," 스핀":57,"의무":269,"의문":59,"이런":131,"이러":633,"통하여":478,"이라":4666,"이란":1468,"이래":223,"작되":526,"작된":385," 並 ":184,"음반":546,"자동":1079,"최 ":64," 실천":136,"작동":101,"유성":172,"육성":326,"장경":65," 신조":74,"이는":2179," 슈팅":70,"장관":476,"이다":44715,"재개":63,"장과":268," 실제":432,"이니":195,"재가":173," 시조":152,"원예":81,"우지":75,"원에":608," 싱어":64,"장갑":60,"월에":803,"장거":57," 스코":288,"잡고":156,"울주":88,"으로":39730,"자기":738,"웨스":286," 신인":71,"움이":79," 신이":79,"움을":181,"이노":91,"요청":103," 신의":77," 신장":97,"비슷한":239,"유발":57," 스케":164,"비슷하":194," 시절":186,"우즈":100,"유민":79,"웹사":130," 신자":91," 시점":66,"이네":70," 시인":309," 시이":157,"유무":68,"이너":235,"통해서":112,"유물":138," 시의":190,"작권":131," 실용":67," 시장":413,"자극":108," 스카":114,"자금":194,"시상식":63,"우주":734," 시작":1908,"이념":153," 스톡":57," 스토":195,"투이다":152," 실질":132,"자녀":65,"인되":81,"원조":101,"인된":61,"웨어":1107,"이듬":68,"이들":778,"잇는":270,"이드":728,"움직":247,"워졌":78,"음료":81,"일대":304,"깊은 ":79,"자나":141,"왕후":204," 三 ":313,"원전":1057,"유비":60,"장기":260,"원정":112,"요코":91,"인도":1254,"원제":94," 스테":207,"원의":775,"용차":86,"원인":277,"위성":291," 스키":157,"원이":1024,"으면":282," 스타":592,"으며":8023,"읽는":62,"원자":600,"원작":230," 스탠":103,"원장":376,"이도":263," 싸움":60,"이동":495,"인데":229,"울지":66," 심의":80," 슈퍼":229," 丁 ":138," 신주":59,"위상":207,"장교":142,"재건":66," 시즌":547,"음력":497," 심장":68,"위생":71,"이던":150,"이더":241," 십자":179,"원으":778,"인다":456," 
스크":137,"장군":389,"원을":809,"원은":649,"이데":100,"육부":79,"형태 ":63,"일군":61,"원수":199,"유롭":69,"원숭":77,"三爲":60,"우에":529,"오후":131,"이기":1185,"유로":412,"이그":62,"일과":149,"인권":206,"용이":398,"용인":137,"임과":78,"운영":1427,"용의":233,"워싱":117,"용은":159,"용으":300,"용을":584,"인기":282,"인근":202,"위반":100,"우와":70,"유래":580,"인공":464,"용에":213,"인격":64,"용어":1083,"윌리":271,"초 ":560,"원생":66,"요제":75,"일간":105,"일가":59,"일곱":97,"인구":3663,"유럽":1343,"冲 ":93,"유러":115,"원소":347,"오호":62,"촌 ":246," 실시":523,"인과":324,"이끌":202,"자고":60,"우저":126,"이끄":118,"의도":237,"유명":908,"우정":123,"작곡":859," 썼다":61,"윤리":162," 시와":120,"임금":151,"자격":236,"옹호":75,"임기":171," 시외":158,"의된":97,"대성 ":58,"이내":59,"의되":83," 시위":129,"운전":120,"자국":131,"자군":113," 신용":132," 실업":107,"율리":85,"작과":77,"잉글":396,"울의":68,"이나":3192,"있기":222,"이남":57,"유를":151,"있고":900,"일까":746,"자간":58,"운용":176,"자가":1369,"우유":59,"울에":84,"위법":67,"원시":421,"용자":756,"있게":320,"유류":81,"의는":204,"일기":79,"우이":219,"우익":68,"우자":69," 시에":386," 신앙":144,"입구":92,"입국":115,"치에 ":239,"작가":798,"유리":178," 신약":73,"총 ":953,"우의":129,"월부":255," 스위":277,"우수":283,"운송":99," 시사":57," 중 ":2659," 시상":122," 시설":392," 신사":98,"운수":104,"시의 ":1040,"오페":337,"우스":1158,"리칸 ":115,"위로":336,"의거":140,"용소":61,"우선":90,"영화":1822,"활성화":218,"의견":100,"리카 ":773,"유동":91,"원부":65,"유도":196,"리적으":85,"용수":83,"외전":91,"울산":345," 스웨":362,"리적인":105,"으나":1988,"이가":479,"울시":173," 승용":68,"오픈":264,"요인":96,"욕의":65," 심사":67,"오프":166," 시스":1156,"요일":287,"이거":210,"인가":135,"인간":782,"이것":484,"오피":231,"이게":62,"오하":85,"왼쪽":64,"원사":88," 실수":85,"이고":1474,"원산":135,"이곳":157," 승인":135,"원상":70,"우승":695,"우시":59,"오폴":57," 신성":183," 신설":178," 신속":77,"위를":1049," 시민":365,"오카":257," 식민":417,"원리":261,"여하":917,"여한":139,"여할":326,"여함":202," 스스":212,"요새":77,"왕족":171,"왕조":659," 심리":249," 신문":265,"위대":223," 시뮬":113," 식물":383,"위는":290,"스이다":194,"융기":85,"윈도":643," 식별":121,"오케":99,"쳐 ":1031,"연합":1184,"요소":392,"유나":103,"연하":111,"연한":93,"여행":192,"요성":113,"여했":135,"역할":607,"역학":339,"역한":78,"역하":120," 亞 ":77," 신분":107," 주 ":4010,"매우 ":477,"통합되":76,"요시":396,"오클":62,"리이며":65," 수행":872," 수학":787,"연호":88,"메니아":86,"외에":406,"유네":90,"倉亂":100,"육대":57,"오토":172,"예프":66,"외의":195,"유대":321,"倉亞":80," 순환":84,"영향":799," 줄 ":89,"유닉":93,"용성":58,"유니":298,"의가":147,"온타":106," 신비":72,"倉丁":326,"영한":96,"영하":577,"유는":91,"倉並":271,"倉丘":124,"倉三":482," 수호":59," 준 ":71,"음과":236,"용산":154,"오타":75,"倉乙":106,"오키":160,"倉之":304,"월드":684,"첩 ":70," 식량":97," 신라":319,"우를":179,"우르":146,"첫 ":815,"업협":85,"유가":172,"우리":506,"웠다":116,"원들":183,"오지":91,"맡은 ":78,"울러":57,"업한":67,"업하":195,"대사 ":83,"우루":219," 시라":58," 亂 ":146,"웨덴":355,"유고":84,"육과":405,"육관":1166," 신뢰":95,"체 ":1944,"청 ":1596,"예측":116,"와의":292,"와이":323,"와인":57," 시로":87,"업화":69,"빛의 ":78,"카스 ":79,"에피":155," 시마":63,"회민주":65," 쇼핑":60,"원래":605,"빛을 ":75," 술탄":78,"위나":89,"유권":118,"육군":281," 시리":1297," 시를":88,"완전":346,"칙에 ":86,"유구":70,"영토":332,"왕이":422,"왕의":270,"스위스":188,"원로":84,"왕을":61,"왕으":142,"운반":68,"왕자":127,"리자베":66,"육기":102,"유기":217,"온천":62," 시모":72,"왔으":124,"니의 ":167,"왕위":174,"오염":119,"요로":94,"우디":104,"메르 ":60,"언트":67,"우드":176,"오에":131,"오와":86," 시도":132,"열차":260," 스몰":57,"운드":310," 승려":122,"디즈니":97,"요르":77," 乙 ":95,"위가":175,"왕성":75,"스이며":63,"侶三":70,"외버":131," 신도":90,"오이":168,"오일":61,"엔터":311," 스미":84,"요리":555," 시드":98,"연출":61,"대상 ":99,"오의":95,"위공":239,"오전":108,"왜성":61,"위계":74,"용량":87,"어항":124,"원대":91,"온의":63," 승리":259," 순천":93,"어하":99,"어학":198,"후반 ":84,"외부":238,"원도":356,"에트":351,"위구":57,"우라":154," 수출":183,"척 ":91,"처 ":416," 수치":83,"우려":58,"왕실":105,"에티":108,"원동":67,"천 ":830,"요미":57,"어휘":60," 죄 ":83,"우로":109,"위기":195,"철 ":398,"워드":249,"연적":84,"원고":58,"치와 
":115,"열이":89,"영역":375,"언컵":95,"외래":67,"원과":233," 종 ":214,"화시키":67," 수익":76," 수의":96,"실에 ":74,"원구":83,"원국":135,"여주":182," 수입":181,"옆에":62," 순위":121,"예언":76," 수장":78,"원군":252,"영원":61,"영웅":130,"오스":1498,"연주":311," 수정":210,"옥스":58,"몽골 ":105,"였으":1734,"오시":78," 丹 ":77,"왕복":96,"였을":67," 싱글":369,"여지":119,"여진":185," 시나":74,"영을":122,"영의":146," 시내":231,"영이":82,"영자":87," 숙종":91,"영장":73,"양한":780,"양학":62,"양하":218,"우는":273,"월까":163,"양항":61,"용도":148," 수준":219," 슬라":77,"예인":65," 실내":91,"예의":63,"용된":671,"용될":69,"영조":63,"왕비":137,"용되":1573,"예전":122," 숫자":146,"와서":75," 수집":187," 수직":64," 之 ":340,"운데":1232,"예정":321,"우도":237,"영주":89,"완성":213,"울대":190," 슬로":136," 스리":80,"운동":1815,"외무":92,"어트":94,"대문구":151," 스마":121," 시대":2063,"였지":104,"영지":83,"어파":69,"소시":75," 분쟁":188," 붉은":115,"소스":310," 부정":201," 분자":245,"세청":93," 부인":212,"셋째":71,"소수":273," 부작":91,"수다":87,"수단":483,"세츠":103,"시와 ":384," 브로":98,"티나 ":145," 붙여":234," 브레":111,"선택":248,"름에서":87,"서태":158," 붙어":133,"손실":63,"단의 ":266,"수는":372," 브라":642," 부족":243,"송사":95," 브랜":223,"션으":79," 불안":63,"션을":118,"션은":114,"소사":75," 분야":1053," 부여":249,"쟁 ":829,"선출":248,"썼다 ":65," 비극":78,"생했":81,"생하":591,"생한":515," 북아":293," 북위":59,"성체":67," 부위":68,"설치":714,"수나":99,"소속":1553,"소송":393,"신에 ":117,"명과 ":133,"속성":78,"소셜":82,"션의":93,"소설":947,"생활":791,"션이":221," 분열":80,"생화":64,"따온 ":72," 부착":61,"선포":63,"소위":58,"단은 ":440,"소와":122,"트라 ":199,"속인":60,"속이":290,"소자":103,"소장":374,"손으":98,"속작":76,"소재":625,"석하":208,"서해":154,"석학":150,"손이":173,"소유":337," 부천":92," 부처":87,"속으":235,"소의":301,"속은":58,"진화 ":77,"속을":100,"기에는":229,"소이":246,"속의":608,"소인":65," 비디":406,"센터":804,"수도":1289," 블로":143," 블록":76,"술대":62," 분지":73,"술단":60,"수동":96," 블랙":148," 블라":159,"시오 ":93," 브루":131," 부지":65,"단을 ":267,"속에":468,"수들":238,"소에":318,"소아":88,"속씨":93,"수되":71," 붙인":76,"서트":83," 블루":149," 북쪽":939," 브리":239,"성학":57,"성한":344,"성하":1173,"수를":679,"성했":93,"점 ":566,"컫는 ":311,"접 ":423,"술로":114," 비롯":537,"트남의":86,"수리":211,"성향":92," 비례":62,"수립":264,"정 ":3475,"수많":217,"수면":90,"성화":282,"정·":70,"세한":64," 부터":293,"수목":78,"제 ":8681,"수라":68," 빌딩":89,"송에":109,"저 ":691,"선하":86,"적 ":6488,"손자":185,"디스플":113,"속적":207,"선형":177,"센트":222,"적·":123,"화민국":208,"설하":120,"설한":95,"세트":80,"전 ":5853,"대부터":165,"수론":65,"수록":388,"수로":616,"절 ":333,"수렴":66,"송이":144,"송의":131,"송을":189,"설화":59,"세포":573,"선후":58,"쉽게":176,"형식의":79," 분파":90,"형식으":115," 비밀":142,"시에 ":1035,"다이 ":78,"스과":112,"져 ":1995," 분포":280," 빠르":144," 빠른":99,"수비":112," 부품":80,"술부":294,"숭배":68," 부하":57,"형식이":60,"젠 ":160,"젤 ":96," 부통":85,"디스크":171,"술문":89,"수반":73,"수법":94,"스웨덴":355,"단위 ":102,"식에 ":147,"스가":801," 본질":83,"다음 ":231,"의 ":113266,"기업의":151," 불렸":246,"기업이":229,"속도":891,"선생":91," 법학":135," 불러":104,"서서":57,"선사":107," 복지":133," 불렀":138,"르이며":70,"사카":280,"세부":108,"기업인":182,"석사":85,"행하고":204,"생존":118,"서사":106,"응 ":113," 번호":380,"소니":139,"소닉":71,"소는":346,"성분":130,"읍 ":503,"성북":87,"사추":98,"성부":135,"다의 ":251,"생자":65,"음 ":1500," 별칭":84," 베트":372," 보좌":58,"생이":317," 본적":119," 보존":342," 보조":161,"생의":222,"생을":123,"행하기":192," 복제":75,"사촌":66,"등장한":143,"생으":111,"등장하":542,"치아 ":63," 본인":73,"설비":81," 보전":117,"을 ":57193," 보장":178," 복잡":211,"사천":61,"사찰":89,"화방송":96," 복음":136," 보이":420," 보인":139,"소년":531," 보잉":63,"소녀":133,"상징":246," 보유":223," 벡터":143,"은 ":53620,"생에":69," 복원":82,"소나":91,"서비":984,"상주":112," 베타":95,"서브":58,"리사무":72,"성문":74,"인 ":17705,"서양":389,"성시":131,"이 ":37893,"성수":74,"성술":118,"센서":66,"생체":62,"익 ":246," 북미":97,"송된":69,"송되":126,"세상":147,"세서":262,"산타":97,"성서":193,"사토":62,"호에 ":574,"선스":81,"성사":84,"성산":69,"선시":239,"사태":109," 부문":169,"서스":92," 불리":901,"서식":307," 불린":584," 
부모":106,"행하는":526,"기업에":73,"사키":131,"르웨이":214,"서술":88,"사쿠":75,"소들":74,"소드":129,"소득":173," 분리":348,"선수":1423," 부르":974," 부른":608,"상청":115,"소되":63,"속되":249,"속된":176," 분류":697,"설정":185,"섬유":95,"사회":2727," 부상":120," 보통":720,"선종":75,"설적":60,"선조":162," 부산":765,"살해":175,"투자 ":68,"자·":93,"세아":63,"잔 ":168,"상품":332," 북서":591," 부설":61," 본토":83,"섬이":172," 분산":111," 부서":58,"성원":193,"상표":111,"산화":189,"성운":59,"섬의":133,"섬으":166,"생태":212,"성우":110,"세에":183,"사후":160,"섬을":86,"성요":67,"잘 ":553,"소멸":82,"서적":136,"성애":87,"사항":240,"선인":123,"사했":72,"선이":566,"사하":732,"사학":226,"선을":465,"섬에":271,"선은":242,"선의":737,"사할":112,"사한":280,"서장":59,"선으":205,"상트":78,"선전":130,"사형":99,"쇼군":122,"선정":259,"설이":415," 병합":69,"성어":114,"설을":222,"작 ":890,"산한":104,"산학":93,"산하":736,"설의":132,"성에":448,"자 ":9649,"설은":95,"솔로":157," 변형":122,"소를":405,"임 ":1301,"서울":3628,"입 ":284,"선왕":86,"서원":70,"설에":180,"도가 ":497,"서인":123,"석의":89,"서이":286,"석을":172,"소말":61,"석이":169," 변호":184,"서유":74," 변화":441,"잉 ":170," 변환":220,"서의":944,"석으":73,"소리":483,"석유":108,"서열":68,"소로":258,"석에":88,"세스":264,"서예":61,"출하였":98,"선양":152,"소련":350,"출하여":68," 불법":159," 부분":688,"일 ":19527,"상태":852,"인·":87," 북부":662,"서에":300,"기에서":318,"선언":288," 보컬":132,"서와":99,"선에":397," 변하":73," 복합":146,"술교":71,"송법":108,"수기":70," 보한":520,"션에":87,"서초":359," 비공":72," 비교":314," 보험":236,"선총":96," 보호":689,"등장인":96,"소비":693,"세조":81,"수교":103,"후기의":231,"리와 ":419,"수권":213,"세종":317,"성지":76,"성질":285,"성직":128,"술관":133,"술과":123,"장 ":5509,"세이":380,"세의":524,"서쪽":1399,"세자":103,"세인":151,"성종":71," 보편":88,"상호":734,"샌프":83,"상화":104,"상황":295,"술개":70,"재 ":2451,"성주":100,"상회":64,"술가":133,"세와":166,"캐롤라":68,"소방":114,"성으":264,"성은":337," 부속":114,"성을":1294," 분석":366,"성의":584,"술회관":61,"사히":61,"성인":258,"성이":936,"성자":131,"세운":225,"세우":158,"세워":323,"성장":335,"속버":59,"세웠":108,"선진":160,"상한":79,"상학":60,"상하":394,"대부분":788,"상해":61,"기여를":92,"수가":629,"소보":57,"상했":97,"성전":187,"성적":290,"석기":155,"기여하":360,"상류":71,"상륙":81,"새롭":78,"스에서":445," 미합":61,"서기":467,"새로":765,"기여함":193," 미하":64,"기여할":296,"산문":69,"산물":335,"호와 ":151," 시 ":1062,"성가":64,"사바":60,"선교":176,"설계":483,"사물":97,"사무":2780,"울 ":554,"설과":91,"선구":92," 배치":124,"성경":276,"성계":81,"웃 ":62,"성격":206,"성공":519,"시안 ":86,"성과":536,"웅 ":112," 본래":266,"사본":71,"세가":270,"사보":77,"시아 ":1643,"사법":297,"움 ":309," 신 ":151," 바티":91,"섬과":108," 번역":439,"세계":3906,"산부":140,"워 ":326," 벌어":349," 발트":81," 베스":190,"성구":134,"성군":196,"사부":61,"세균":108,"월 ":22349," 발표":766," 번의":314,"달에 ":101,"상법":81,"세기":1777,"서남":114,"성기":125,"센고":142," 보르":102,"카메라":190,"원 ":4524," 복리":66,"현청 ":126,"서는":3206," 법원":151," 보물":126,"상북":422,"상부":76," 본명":394,"근처에":117," 벌이":59," 벌인":98,"생리":85," 볼리":57,"진흥 ":62," 버전":294," 보면":65," 벗어":103," 번이":131,"선대":74,"생물":828,"산사":62,"사선":118,"사서":100," 버지":75," 배포":140," 법인":125," 밝혀":97," 발현":62,"사사":122," 법의":69," 법이":73,"사상":1612," 번주":318,"성남":125,"서대":130," 변수":131,"생명":423," 베어":58," 발행":329," 범위":193,"샤를":135,"산소":74," 범죄":367,"선되":118," 베이":535,"세네":84," 법조":68," 번째":1969," 방해":63,"산성":191," 방향":321,"스앤젤":97,"선도":157," 번지":619," 법정":244," 보병":96,"사소":200," 병사":60,"다운 ":138,"사성":112," 법적":136," 발효":85,"서도":675,"성단":63,"성당":248,"성대":78,"삼성":344,"산스":90,"사슬":60," 부과":61," 불가":312,"성능":133," 본부":184,"사실":538,"사시":168,"설되":275,"설된":124,"사스":150," 범주":66," 밝히":70," 북구":110,"산식":630,"산시":419,"리적 ":251,"웹 ":348,"세는":97,"성도":116," 북극":76," 불과":64," 부근":222,"상속":91," 불교":744,"성동":166,"세대":442,"섬들":63,"성된":741," 불구":96,"성되":956,"상수":192," 분기":121,"위 ":2320,"성들":103,"상스":90,"윈 ":68,"상승":79,"셔널":254,"상식":83,"상시":144,"상실":57,"린이 ":101," 붕괴":168," 보상":64," 복사":102,"산악":97,"사업":1551,"사에":878,"행한다":126,"통합 
":160,"서로":775,"사연":96," 본사":229,"사였":94,"사와":428,"산업":1568,"살아":160,"선로":99,"통해 ":1561,"집합 ":96,"살았":134,"산에":325," 병역":76," 복소":94," 본선":116,"사용":5124,"생산":1237,"때문이":153," 별자":112,"사우":425,"사운":168,"서류":82,"설로":162,"사원":206,"생생":58,"사위":127," 보수":175,"댄스 ":74,"소가":271,"사유":116,"생성":266," 복수":110,"서를":347,"사자":349,"사장":277,"사인":290,"사이":3510,"립에 ":79," 보스":263,"사일":246,"사의":845,"서만":132," 병원":123,"서리":63,"소개":184,"사정":140,"서명":92,"산자":217," 봉사":58,"산재":60,"소관":1922,"사제":90,"산인":85,"산이":328,"사절":60,"사전":1026,"서면":90,"사적":555,"산의":351,"산으":200," 법칙":215,"스어로":96,"산은":127,"산을":359,"윤 ":93,"카르타":63,"삶의":108,"살인":141,"사조":84,"세라":64,"살이":176,"삶을":67,"상업":234,"상에":642,"상어":65,"설립":4441,"때문에":1177,"육 ":627,"유 ":819,"소금":62,"율 ":154,"세력":252,"설명":378,"소규":61," 부대":178,"생식":87,"생시":83," 부담":83,"속기":527,"산주":225,"상위":159,"삽입":62," 부동":119,"상용":93,"기업청":60," 분당":67,"소기":175,"세로":166,"상원":88,"선민":529,"상이":873,"상인":149,"서버":242,"상자":197,"상임":117,"선발":112,"선박":143," 보안":191,"상은":243,"상으":822,"성리":100,"사진":268," 북동":561,"사지":81,"성립":287,"파가 ":58,"상을":1272,"상의":1384,"세를":145,"세리":64,"송공":59,"상적":276,"산지":293,"상장":64,"비의 ":135,"세르":275,"선보":70,"색이":139,"송국":244,"린의 ":67,"서부":1232,"서북":82," 보여":231,"색을":143,"성모":113,"색은":70," 부등":76,"색으":87,"융 ":140,"색의":117,"손꼽":71,"뿐이":69," 밀양":57,"사고":593," 방사":170,"사관":271,"사과":105,"와 ":15664,"옹 ":166," 바스":71,"사건":1923,"사거":86,"사격":73," 바실":65," 미야":70," 베네":192," 미얀":92,"사가":957,"사각":96,"통치하":66," 미쓰":88," 변경":370,"대법원":73," 발사":235,"출하기":61," 발생":1103,"삼고":63,"산기":92,"기이다":399,"대신 ":324," 방식":744," 밑에":65," 뮌헨":76,"사기":229,"수행한":79,"상가":172,"수행하":444,"산구":164," 방송":1132,"살고":132,"단에 ":234,"산권":82,"산군":133,"삼각":154,"기원전":1034,"산광":498,"완 ":175,"산과":197," 밀집":64,"니즘 ":74," 바위":64,"사냥":61,"사나":120,"상급":60," 반영":94,"상국":58,"출하는":157," 밖에":125," 받았":408," 민중":86," 받아":431," 민주":647,"상공":144,"상관":120,"상과":358,"왕 ":1238," 민족":808,"상경":57,"다와 ":69,"삼국":365," 밀접":73,"통한 ":448,"상계":62," 배열":80," 벨라":91," 발족":205," 보기":59,"사대":81," 별도":115,"생과":60,"사단":1910,"사다":97," 보급":405,"통칭하":65," 발전":1652," 법률":682,"생겨":142,"사는":701,"생겼":60,"현으로":89," 백악":117," 본관":550," 변동":57," 방영":369," 밝은":60," 발음":225," 본격":104," 방어":109," 방언":173,"생각":316," 보관":104," 받을":65," 받은":244," 본거":85," 보고":251," 법령":81," 발원":164," 반응":269," 방안":58," 보건":205," 밖의":74,"단어 ":74,"까지 ":3943," 바이":697,"산동":187," 방정":183,"살던":66," 받지":79," 법무":126,"산된":107,"산되":209," 미카":72," 미치":120," 백인":62,"사들":245," 백작":125,"기초 ":68,"상남":484," 방위":104,"행하였":97,"행하여":85,"산당":204,"산대":62,"사도":176," 배우":373,"산더":62,"외 ":526,"사동":76,"사되":110," 베르":258,"생긴":160,"사된":69,"생기":186," 베를":141,"상동":61,"상도":117,"등으로":538,"비전 ":429,"산드":170," 백제":157," 보내":142," 봉기":67,"상당":532,"상대":588,"상담":89,"니스트":163,"삼동":102,"는지를":90," 방지":187,"살라":75," 보도":103,"사례":68,"사령":195,"사로":662,"생대":183,"살렘":109,"사료":78," 보는":142,"사라":234," 미터":113,"사랑":297,"사람":2438,"술하는":60,"요 ":955,"욕 ":325," 미토":58,"리의 ":865," 보다":233,"니스탄":150," 승 ":122,"우 ":2451,"살리":87,"석과":66,"사모":61," 바탕":449,"서관":272,"님의 ":89,"선거":1124,"설가":221,"산맥":321,"산림":320,"운 ":2130,"선과":230,"서구":357,"선고":72,"산면":109,"상록":72,"생들":178," 배출":120,"욱 ":152," 별명":121,"서가":119,"사르":183,"사를":1012,"산리":59," 버스":279,"생되":69,"생된":72,"사망":432," 방출":92," 보드":68,"사리":79,"사마":121,"사막":121,"용 ":1999,"메달 ":69,"사립":113,"혹은 ":1399,"당은 ":139,"실베이":80,"호의 ":145," 온 ":216,"수학자":299,"수학의":99,"집트 ":224,"수학적":106,"째 ":1900,"니이다":60," 옛 ":292,"카를로":81,"당을 ":126," 영 ":87,"마에 ":93,"림을 ":114,"당의 ":340,"기원후":57,"집 ":514," 연 ":60,"짐 ":75,"트를 ":420,"당이 ":130,"림의 ":63,"징 ":234," 열 ":74,"수행을":57,"신흥":75,"립을 
":177,"확보하":72,"짜 ":61," 생각":309,"단순히":106," 생겼":60," 생겨":141," 사는":188,"현재는":456," 사단":1779," 왕 ":308,"출판사":141," 상관":89," 삼국":254," 살고":132,"당사자":199," 삼각":141,"릴적 ":113,"동가 ":129," 와 ":1259," 삼고":59,"현재까":179,"대와 ":262," 사기":64," 사가":107,"리스에":76,"리스어":391,"대왕 ":100,"수학에":150," 사고":438,"리스의":233," 사건":1669,"스하":127,"시칠":59,"시카":195,"스피":342,"스핀":66,"승팀":125,"즉 ":662,"즈 ":1704,"시코":429,"시켜":325,"시켰":170,"실천":166,"길이는":187,"스플":201,"스프":195,"시청":223,"스퍼":73,"스펙":70,"스페":888,"시초":70,"스포":707,"신청":95,"스폰":57,"신체":167,"스투":75,"심지":489,"스튜":256,"스트":3813,"스틴":60,"스틱":113,"스티":592," 양 ":88,"스파":404,"실질":136,"스톡":64,"스토":637,"스테":623,"스템":1334,"스텔":113,"스터":863,"스턴":325,"스톤":109,"시지":134," 약 ":1207,"스크":1207,"현재에":63,"십자":249,"심장":74,"시즘":60,"시즌":581,"시즈":65,"신주":76,"심이":158,"싸움":90,"스탠":109,"스탄":838,"스탈":89,"심적":69,"층의 ":88,"스타":1367,"스키":554,"슈티":64,"슈팅":76,"카리브":78,"시조":180,"실이":117,"신전":61,"신적":108,"신정":80,"스코":803,"슈트":99,"스쿨":82,"심으":693,"심을":206,"심의":204,"슈퍼":245,"싸여":64,"신조":82,"실적":80,"스쿠":71,"실제":435,"스칼":83,"신으":438,"스카":534,"스칸":67,"시장":721,"실용":73,"신의":975,"당선되":96,"식재":59,"스캐":63,"신은":100,"단순한":93,"심에":93,"신을":369,"시인":445,"시이":748,"식의":488,"식을":715,"식은":204,"식인":80,"식이":641,"시작":1913,"시자":111,"현재의":485,"스케":361,"실은":1894,"신장":127,"식적":310,"실의":105,"싱어":70,"실을":195,"신이":763,"신인":167,"시점":75,"신자":135,"시정":103,"시적":140,"시절":216,"름으로":597,"실현":165," 역 ":190," 여 ":231,"실행":302,"신화":440,"실험":314,"신호":329,"질 ":802,"식회":208,"신해":58,"실하":58,"시호":371,"진 ":4958,"시했":60,"시행":387,"등지에":113,"시한":333,"식하":203,"식한":134,"마쓰 ":58,"지 ":12819,"직 ":904,"신한":74,"시험":488,"신학":397,"신하":183,"심판":171,"달성하":68,"시하":616,"시픽":100,"식품":996,"실패":146,"다중 ":70,"름이다":364," 뿐만":102,"싱턴":122," 에 ":3968,"층을 ":58,"시판":68,"시티":207," 뿌리":139,"시트":81,"기장에":74,"치시 ":58,"리스도":198,"증 ":222,"승한":65,"기장을":58,"신탁":73,"슷한":239,"슷하":194,"즘 ":268,"승하":152,"시킬":102,"시키":987,"시킨":292,"시타":80,"담은 ":79," 어 ":61,"기장이":86," 억 ":226,"전》":368,"대에 ":921,"호선의":73,"즌 ":240,"시뮬":120,"신문":591,"심리":312," 빅토":126,"식민":428,"말에 ":88,"시바":74,"시민":455,"시미":57,"스스":259," 비판":173,"시베":62," 비트":267,"식별":124,"수함":92,"수한":297,"수학":984,"수하":240,"수행":909,"실베":102," 비평":80,"시부":72,"수필":57,"순한":104,"수형":93,"신부":105,"신분":132,"주 ":6421,"스앤":101,"신비":78,"머니이":60,"이》":70,"수호":86,"준 ":2588,"술하":154,"술한":115,"수화":58," 비해":209," 비행":271,"승을 ":297,"줄 ":149,"스어":694,"스에":917,"순화":57,"순환":168,"기존의":250,"시라":107,"신라":377,"식량":124,"시로":1329,"승부":68,"실록":97,"실로":87," 쓴 ":317,"신뢰":98,"시를":363,"시리":1561,"시마":348,"시립":76,"술탄":80,"쇼핑":78,"림수산":539,"신망":70,"시모":119,"실리":112,"스속":105,"층에 ":431,"식물":650,"식문":64,"독교 ":326,"승전":168," 알 ":152,"실시":535,"시아":3207,"시안":117,"다수의":139,"트로 ":289,"닿아 ":81,"시온":61,"시오":180,"썼다":75,"시와":405,"신에":201,"신약":75,"식에":220,"비아 ":592,"시어":65,"기적인":70,"시에":1534,"신앙":167,"기적으":76,"신용":161,"식으":607,"시의":1064,"시위":145,"시우":69," 앤 ":149,"실업":129,"실에":130,"시외":223,"슈타":211,"스위":294,"순히":106,"스웨":430,"스오":59,"스와":739,"심부":90,"술회":66,"슨의":72,"직한 ":59," 뻗어":76,"시설":711,"슨이":67,"신사":180,"스의":2139,"중 ":3559,"도권 ":73,"식사":62,"시상":135,"스이":308,"시사":106,"스인":78," 빌헬":87,"신속":78,"지할 ":97,"스주":85,"승에":60,"지한 ":61,"신설":186,"신선":60,"실상":172,"신성":225,"시시":71,"실수":92,"승을":297,"승인":162,"승자":99,"신시":65," 안 ":111,"승용":69,"습을":167,"습이":87,"시스":1777,"심사":156,"丙 ":228,"소하":122,"소한":93,"속하":1183,"시계":147,"丘 ":1912,"속한":740,"소행":127," 비스":147," 비슷":450,"시게":81,"소피":62,"지하 ":71,"족 ":952,"신경":249,"손해":112,"니아와":58,"송파":188,"만에 ":232,"속해":225,"丟 ":282,"식경":89,"시공":102,"丞 ":58,"튜디오":194,"스도":241,"식과":187,"식공":81,"조 ":2368,"丑 ":151,"스는":657,"달의 ":59,"지칭한":98,"지칭하":168,"호주 
":71,"송통":78,"형이다":110,"丕 ":102,"소프":1375,"且 ":206,"니아의":174,"습을 ":166,"시간":1006,"시각":172,"시가":359,"스니":152,"수요":138,"심리학":178,"신기":142,"수와":199,"싱가":102,"丫 ":62,"르크 ":449,"마와 ":59,"송하":166,"술연":115,"좀 ":61,"수원":294,"술에":154,"형으로":179,"수용":253,"신교":225,"両 ":166,"소화":114,"시권":58,"수아":82,"존 ":728,"시국":65,"니아에":65,"신과":128,"신고":75,"소형":129,"수영":176,"수였":72,"並 ":3956,"시기":468,"수여":200,"수엘":64,"수에":273,"수업":92,"리즘 ":90,"심각":57,"달을 ":122,"승격":131," 분화":88,"수사":246,"수산":1050,"수상":476,"소통":90,"트럴 ":117," 북한":208,"스기":80," 빌보":65," 분할":95," 분해":71,"승객":82," 부호":199," 부활":95,"수술":79,"수수":82," 비상":102,"丈 ":353,"순수":115,"스노":76,"三 ":5345,"출판되":65,"시스 ":159,"리지 ":202,"소포":62,"수시":75,"수신":110,"수성":148," 부흥":64,"수소":134,"스나":102,"丁 ":3345,"식민지":360,"万 ":91,"순서":137,"수송":152,"술사":85,"술서":149,"당에 ":75,"리스트":199,"스미":145,"스바":64,"스박":60,"리스티":104,"시드":120,"시된":222,"식되":65,"신도":198,"스몰":61,"승려":122,"乙 ":1163,"슬링":138,"슬림":73,"乘 ":269,"시되":344,"리스토":113,"시도":203," 빛을":61," 빛의":72,"식문화":60,"죄 ":281,"단이 ":202,"비에 ":63,"스부":93,"수치":89,"단일 ":104,"스볼":110,"乩 ":76,"수출":221,"스베":93," 빼앗":59,"진출하":105,"순천":97,"신들":162,"승리":273," 비율":114,"숙종":94,"스럽":73,"스러":103," 비잔":182,"술적":188,"스레":59,"수주":83,"수준":267,"좌 ":100," 비용":109,"시나":175,"술의":455,"싱글":400,"술을":417,"술은":76,"술인":130,"시내":295,"스라":342,"술이":263,"술자":98,"丹 ":433,"수장":102,"술원":71,"리즈 ":556,"수적":127,"싸고":92,"수정":259,"맥스 ":80,"종 ":2005,"수의":777," 비영":167,"수익":107,"수이":764,"순위":140,"수입":226,"수인":135,"수자":76,"스마":263,"시대":2951,"스만":162,"머니는":107,"스리":201,"시다":76," 비즈":134,"시는":274,"슬로":221,"때부터":94,"스를":979," 비주":67,"之 ":4501,"수직":77,"수지":162,"수집":212,"숭이":78,"숫자":148,"슬러":133,"파구 ":139,"슬레":59,"스로":529,"시노":86," 비전":65,"슬람":295," 비정":120,"슬란":124,"슬라":250,"실내":104,"빌보드":64,"그의 ":1128,"즘의 ":66,"수인 ":81,"마크 ":189,"극을 ":92,"는다 ":996,"기사 ":106,"수상하":96,"수상했":86,"술서기":141,"지방에":438,"교향악":94,"지방을":71,"지방으":60,"특수 ":78,"지방자":116,"지방이":59,"지방의":262,"국토해":108,"니고 ":121,"카미 ":64,"힌두교":117,"증을 ":98,"노쿠니":65,"만큼 ":89,"수적 ":58,"싸고 ":90,"파로 ":96,"지식 ":88,"는데 ":1814,"주하고":96,"술은 ":76,"분포하":147,"술을 ":417,"술의 ":445,"지배하":161,"싱글 ":166,"지방행":148,"지방해":72,"메리카":560,"메리칸":121,"근에 ":191,"리튼 ":69,"칠리아":60,"리트 ":176,"특별시":2267,"그와 ":129,"념하기":58,"술이 ":63,"모두 ":601,"특별법":73,"국토관":71,"지방과":88,"지방검":64,"숙종 ":62,"리티 ":77,"특별지":138,"주파수":89,"마지막":447,"특별자":254,"떠한 ":154,"술적 ":124,"파를 ":146,"누어 ":109,"모든 ":1110,"빠르게":93,"圓 ":266,"그에 ":214,"지방국":73,"國 ":227,"지방법":71,"기록하":192,"기록한":116,"파리 ":176,"비스 ":391,"특별히":77,"머스 ":117,"질서 ":65,"츠의 ":100,"특별한":118,"스로 ":388,"지막으":64,"중해 ":78,"르헨티":187,"신교 ":109,"급이 ":64,"림축산":107,"급을 ":162,"기반의":108,"기반을":124,"기반으":258,"급은 ":65,"금이 ":60,"급의 ":111,"기법이":65,"지리학":71,"즈의 ":361,"지사 ":62,"지면서":85,"리키는":517,"기술 ":470,"충청남":311,"리킨다":654,"지상 ":78,"판매 ":90,"수영 ":75,"트비아":70,"수에 ":166,"시기 ":87,"금은 ":161,"수와 ":188,"리키며":63,"금을 ":242,"주특별":162,"다가 ":862,"티브 ":148,"금의 ":358,"중화 ":71,"충청북":246,"금융 ":121,"國之":115,"國亞":57,"리카의":183,"國丁":85,"國三":298,"國並":202,"구팀이":62,"圓三":73,"리카에":152,"트비히":100,"립트 ":57,"명되었":154,"술에 ":87,"수산부":104,"수원 ":77,"수산물":73,"즈와 ":65,"마찬가":135,"교회의":291,"리프 ":97,"교회와":67,"교회에":135,"몇몇 ":106,"메모리":266,"리케인":119,"리케이":107,"지류인":60,"급에 ":117,"기서 ":119,"리한 ":169,"수산식":513,"리학 ":218,"치러진":143,"근의 ":141,"금에 ":58,"트어 ":82,"리핀 ":247,"극장 ":74,"수의 ":736,"글을 ":102,"트에 ":193,"최후의":62,"출할 ":58,"지리적":89,"패로 ":61,"다고 ":1194,"출한 ":110," 피부":85,"스는 ":657,"부터의":66," 행동":333," 허가":248," 하류":58," 했던":64," 하루":107," 했다":544,"리즈에":127," 표현":714," 필립":80," 필리":448," 필름":96,"부통령":87,"단과 ":106,"지로서":70," 해당":773," 한때":92," 합동":61,"리즈의":173,"리즈이":119," 
하드":309,"르크스":130,"冲之":64," 함대":59,"질랜드":239," 필라":63," 핀란":212," 할당":112,"시가 ":252," 한다":3550," 하다":597,"트로닉":61," 하늘":148," 하는":2411," 하느":107," 丹並":115," 丹丁":139," 한성":74," 丹三":149," 丹丘":70," 丹之":113," 현과":87," 피아":207,"르키나":72," 학생":355," 학살":107," 학사":67,"트라이":124," 並倉":88," 並之":464," 並乙":106," 필수":58," 丘倉":66," 항복":59," 해발":149," 해방":139," 並亂":164," 並亞":126," 並丁":313," 並丘":167," 합병":171," 並三":570," 並並":376,"단계 ":74," 丘乙":71," 三倉":199," 丘之":178," 丘並":131," 하부":75," 丘丘":138," 丘丁":182," 丘三":312," 행렬":75," 항만":73," 丁倉":83," 한반":214," 하버":79," 한민":80," 해리":88," 한번":60," 항목":84," 三丘":352," 합류":148," 三三":1459," 三並":676," 학문":432,"매주 ":79,"규정하":103," 丈三":77," 三丁":609," 三亞":219," 합리":91," 三之":750," 헝가":248," 三乙":140,"트랜스":130," 三亂":164," 하며":678," 하면":179," 丁丘":176," 丁三":472," 丁丁":355," 丁並":314," 학명":174," 할리":61," 丁亞":83," 丁亂":112," 丁乙":148," 丁之":391," 해에":93," 함으":81," 해양":438," 해안":258," 한정":80,"비롯한":246,"비롯하":108,"화하여":111,"비롯해":85," 해운":73," 향상":440," 해외":278," 하지":445," 합작":67," 해와":61," 합의":125," 하위":142," 학위":90," 행성":260," 하원":92,"화학자":72," 행사":339," 하우":120," 하와":84," 피지":78," 핵심":157,"트로이":97," 한자":374," 한일":97,"화학적":86," 현대":825," 프톨":67," 하인":90," 함양":74," 학자":250,"째로 ":479,"리크 ":66," 허리":151," 하이":338,"환하는":65," 해서":99," 해석":322," 하여":729," 하에":107," 하였":775," 하얀":72,"립적인":63," 학습":168," 행복":66," 학술":377," 피에":140,"즈에 ":111," 항성":196," 함수":391," 해산":87," 필요":634," 해상":198," 항상":88," 합성":230,"르크의":62,"시간 ":432,"중추원":62,"화합물":130," 확대":221," 피해":312," 회계":74,"트레이":130,"트레일":500," 협력":288," 황금":95," 혁명":479," 합쳐":178," 헤비":59," 혼동":64," 해체":141," 해주":60,"늘날에":73," 하천":129,"기부터":151," 행위":568,"부터는":174," 했으":125," 호는":364," 피터":73," 행정":2058," 협동":102," 헤르":204," 화가":164," 홈구":160,"노폴리":62," 헌법":409,"규정에":101,"늘날의":100," 헨리":126,"틀랜드":245," 倉 ":57," 해전":102," 허브":61," 헬리":60," 환경":782," 호남":99,"僅丁":77," 현실":117,"僅三":105,"캐나다":542," 형성":527," 확률":198," 항해":59,"속해 ":143," 혁신":98," 형상":83," 형사":105," 협상":60," 학회":76," 형식":446," 화물":115," 해협":157," 후계":64," 화면":108,"속한 ":330,"지부 ":95," 확립":183," 헤이":198," 한편":88,"구하여":62," 호로":200," 형법":118," 활동":1538," 학파":104," 한해":58," 현상":583,"화하는":131," 효과":357," 하프":69,"티노폴":57,"티누스":77," 황도":62,"傭三":85," 호르":78," 허용":108," 홀로":66," 확산":127," 화석":129," 화성":179,"림청 ":130,"확하게":91,"카를 ":215," 홍수":68,"카데미":148," 형이":88," 향하":60," 현지":86," 협정":151,"립운동":226," 화산":224," 형제":214," 현청":128," 확실":69,"화하기":61," 활성":221," 효력":75," 호에":514," 활발":97," 확보":146," 행해":125,"화하고":69," 행하":123," 현의":154," 협약":108," 혈액":72," 현에":71," 홍보":159," 획득":133," 후기":424," 혐의":64," 회로":165," 호수":187," 현존":112," 협의":101," 호스":85," 현재":2800," 현장":98," 호선":451," 호세":57," 확정":107,"최하는":107," 확장":327," 환자":90," 홈즈":59," 형태":1027," 후반":207,"구하는":390,"소형 ":88," 휘는":138," 황제":690," 후보":221,"리즈는":127,"는다는":74,"리코 ":60," 호칭":116," 혼인":73," 혹은":1388,"부호 ":101," 회복":73,"리즈로":71," 호주":106," 호조":68,"倉倉":133,"수아 ":63," 활약":340," 훈련":135," 화재":138," 확인":346," 활용":295,"신과 ":111," 회사":809," 화이":94,"기본적":189,"마에서":71,"리지널":66,"식과 ":176,"분할 ":73," 효율":259," 홈페":62," 혼합":144,"丘驿":62," 호환":88," 후에":392,"출판 ":83," 효소":84," 회원":584," 홋카":126," 회의":273," 휴대":208," 호텔":130,"주한 ":63,"담고 ":141," 홍콩":226," 회장":211," 협회":248," 후속":211," 후손":130,"구하고":183," 회전":155," 희곡":57," 후지":163," 흘러":172,"수소 ":62,"스나 ":83,"모델 ":116," 화합":132," 화학":486," 화폐":85," 후원":123," 倉丁":148,"주최하":113," 倉並":114," 倉三":242," 倉丘":63," 倉亂":57," 倉之":119," 힌두":143,"니다 ":190,"대개 ":139," 히데":85,"즈베키":65," 황후":99," 흐르":180," 황해":115," 흐름":133,"당과 ":104,"마치 ":141,"기원 ":60,"북한 ":74," 후쿠":159,"트리스":63,"지배 ":66," 히로":126," 倉倉":67," 
회화":73,"지방 ":781,"대가 ":240," 희망":65," 히브":111,"트리아":442," 흰색":64," 희생":96,"니는 ":214," 후한":109," 흡수":98,"부한 ":73," 흑인":105,"명령 ":61,"흐르는":105,"그이다":93,"리처드":110,"당구 ":75,"치를 ":548,"·연구":60," 히스":59,"치단체":145,"극작가":87," 힘을":80,"기와 ":326,"망원경":111,"트리트":107," 휘하":73,"맡았으":76,"집단이":82,"흘러드":76,"만으로":117," 흑해":77,"대교 ":83,"빠른 ":100," 흔히":425,"류하는":80,"류하다":77,"르투갈":235,"능력 ":85," 히트":60," 히틀":63,"기여 ":85,"명나라":109,"기에 ":1427," 힙합":219,"근에서":110,"류하고":65,"기업 ":244,"막으로":89,"대고 ":174,"기억 ":70,"마이다":107,"맡았다":163," 之三":810,"말이다":630," 之丁":474," 之丹":65," 之並":680," 之丘":236,"마이클":102," 之亂":157,"마이크":612," 之之":450," 之乙":108," 之亞":129,"술문화":87," 三國":102," 乙丁":159," 乙三":188," 乙丘":109," 乙並":99," 之倉":147," 乙之":125," 乙乙":65,"를테면":61," 乙倉":62," 並國":241,"치되어":106,"치되었":68," 亂丁":147," 亂丘":69," 亂三":239,"국회의":347,"수법인":62," 亂之":174," 亂並":156," 亂倉":78,"그에서":63,"구협회":61,"근에는":98,"마이오":85," 亞之":103," 亞亂":72," 亞三":197," 亞丁":62," 亞並":142,"대구 ":108,"구현하":65,"모는 ":61,"학자":2335,"한의":189,"현대":875,"함에":145,"한을":120,"학의":956,"하이":710,"학을":385,"학으":130,"하의":429,"학은":215,"하자":244,"학이":346,"함양":88,"하임":58,"하인":110,"하일":70,"프톨":69," 통상":134,"학적":904,"기념일":61,"할을":489," 터키":401,"한이":100,"합에":109,"한자":411,"한일":104,"행사":447,"하우":219,"하원":98,"피지":85,"핵심":159,"호간":219,"호가":153,"행성":432,"학위":158,"허리":158,"학원":440,"하위":156,"항을":212,"하지":1949,"항으":97,"해왔":62,"혔다":58,"합작":74,"해와":166,"그래픽":309,"합이":240,"합의":328,"함이":112,"한지":68,"협동":167," 통신":479,"해운":88,"그레이":201,"그래프":117,"무관 ":85,"향상":599,"합적":124,"항이":169,"헬레":69,"해외":283,"항의":92,"한제":218,"해안":395,"현동":109,"한정":88,"프트":1666,"호국":61,"항에":121,"합으":111,"합은":87,"합을":202,"함으":765,"함은":90,"함을":479,"해에":225,"해야":257,"해양":758,"현된":68," 태화":59,"퓨터":1617,"부천시":57,"할시":83,"항성":207,"피의":87,"해산":95,"함수":560,"필요":647,"해상":245,"해살":124,"피에":185,"행법":63,"픽에":161,"피오":64,"학습":221,"행복":77,"하시":85,"슬로바":89,"합성":289,"항상":88,"해수":92,"그려져":89,"하얀":74,"퓨팅":102,"항시":79,"하에":362,"헤드":71,"학연":123,"하였":7094,"학에":1132,"하여":10590,"한에":57,"하와":112,"해시":144,"할아":61,"해석":435,"해서":1485,"핀의":74,"해병":58,"그러한":104," 털이":65,"필수":60,"픽스":83,"항복":61,"해발":152,"해방":192,"피스":241,"합병":195," 토머":69,"합법":65," 토마":98,"확히 ":61,"지역과":116,"학술":418,"피어":124,"피언":426,"하수":77," 테이":155,"피아":520,"현과":109," 태풍":365,"한성":103," 탐험":69,"향력":154,"호하기":67,"한사":58,"한산":83,"학생":582,"해부":71," 태평":280,"학상":204,"학살":146," 테오":77,"학사":169,"프와":64,"헝가":249,"항만":156,"합류":148," 텍사":129,"문과 ":115,"프에":98,"한문":65,"함마":70,"합리":119,"해로":105,"하므":81,"학문":497,"하며":3518,"하면":1454," 타타":70,"호하고":60,"마포구":144,"할린":58,"하모":62,"학명":176,"할리":69,"권에서":136,"한불":58,"행렬":102,"피소":115,"하부":85," 텍스":81,"구지방":63,"학부":176,"스리랑":67,"해리":93,"한번":67,"해를":394," 테스":96,"합물":165," 토론":158,"항목":94,"프의":110,"한반":216,"한민":7437,"하버":84,"표하":378,"해당":812,"필리":478,"필립":81,"필름":113,"해도":129,"표했":61,"표한":304,"파르트":58,"신라 ":149,"표현":736,"하라":255,"해되":60,"하려":361," 토대":84,"했다":3819,"했는":102,"하로":70,"프스":170,"했던":889,"비디오":466,"하루":140,"하리":63,"판매되":66,"히는 ":130,"행동":388,"하마":145,"하르":159,"하류":64,"허가":742,"하를":74,"문가 ":75,"피부":86,"기념하":175,"행된":216,"행되":505,"하느":121,"하는":18789,"필드":112,"효율적":195," 태어":821," 테러":91,"하늘":174," 태양":417,"했고":365,"하다":2005,"행과":61," 터미":91,"학대":110," 타지":75," 탑재":107,"하던":522,"한다":14560,"하도":295," 텔레":470,"매한 ":208,"해남":60,"했기":61,"할당":116,"한데":57," 통계":268," 테르":57,"행기":130," 통과":189," 테마":80,"표팀":149,"합뉴":77,"기록 ":133,"핀란":235,"하드":336,"호하는":60,"함대":93,"피로":70,"필라":72,"기로 ":619," 태조":124,"파리에":66," 태종":83,"한때":92,"합동":105," 
토너":70,"티베트":129,"함된":182,"함되":256,"해는":67,"합된":94,"합되":136,"품종":97,"프를":84,"플로":271,"합과":70,"함과":93,"풍이":93,"플레":847,"플러":120,"합격":100,"함경":91,"풍으":73," 탑승":57,"한글":194,"플랫":190,"플랜":61,"플래":146,"해결":279,"합금":58,"항구":135,"플루":90,"항공":848,"프리":1512,"프린":151,"합군":122,"대기 ":87,"해가":156," 타이":816," 타인":112," 타자":72," 타임":92,"한나":105,"하나":4082,"품질":176,"해군":428,"함께":1510,"근로자":82," 타원":89,"플리":188," 탄자":80,"학년":63,"그리고":1096,"하노":64,"피드":122," 탄생":197,"표지":57,"표준":828,"푸에":75," 탐사":100,"품안":66," 탄소":90,"시로 ":1233,"하고":10067,"학계":119,"그룹의":105,"학과":608,"학관":65,"프라":535,"프란":258,"풀이":93,"한계":93,"글로벌":433,"하거":696,"하게":2062,"그룹이":170,"한강":205,"품에":167,"그루지":65,"하계":214,"프레":585,"한국":5662," 태생":107,"품을":440,"품은":147,"품으":223,"하기":4401,"프로":4269,"품의":327,"품이":502,"플라":427,"학기":729,"프랑":2551,"학교":2885,"하구":63," 테니":98,"프랜":72,"프랭":62,"후부":73,"호칭":116,"그랜드":67," 팔라":87," 팔레":125,"군인으":72,"효소":123," 티베":152," 투표":249,"현하":338,"현한":123,"군인이":191,"황이":118,"황의":113," 파란":59," 파라":163,"시대의":878,"휘는":146,"후보":250," 특수":377,"황제":796,"구조를":175,"형태":1105," 특성":245,"후반":218,"황을":121,"환의":69,"환이":100,"화적":211,"환자":123,"화점":66,"화제":218,"화정":132,"홈즈":59,"시를 ":363,"활을":172,"확정":119,"활의":75,"황에":133,"구조물":73,"무가 ":97,"화유":179,"확장":333,"괄하는":86,"환을":105,"환으":121,"화재":932,"화장":67,"활용":342,"확인":373,"화인":552,"회사":1398,"화이":520,"화의":717,"회주의":311,"훈련":219," 파동":83,"화와":331,"그레고":75,"떨어져":174,"화예":218,"그래머":64,"헨티":187,"화연":58,"후로":193,"화원":71,"활에":69,"파소 ":63,"활약":341," 특별":638,"호조":94,"호주":113,"화에":724," 판단":139,"황실":61,"그러나":356,"호작":139,"호의":161,"혹은":1401,"호이":192,"떨어지":62,"떨어진":577,"그래밍":301,"휘관":62,"혼인":75,"회복":162,"확실":95," 파나":61,"현청":128,"규모의":148,"관하여":88,"호와":168,"화시":180,"관하였":59,"호에":620,"회민":65,"국제적":222," 트리":201,"효력":77,"리핀의":59,"형질":59,"활성":267,"화소":62,"스마트":110,"교차로":174,"화석":141,"화성":207,"리학의":82,"홍수":94,"시마 ":242,"형적":101,"형제":253,"화산":351,"리학에":160,"화사":143,"리하여":91," 트로":120,"확산":137,"화상":154,"형인":57," 트랙":87," 트랜":212,"현지":87,"협정":206,"향하":118,"협조":69,"회를":455," 트레":142,"형의":233," 트라":148,"형이":378," 파괴":164,"형은":85,"형을":227," 판결":111,"형으":179,"협의":275,"호스":99," 파견":97,"혐의":73,"회로":482," 통화":141," 투쟁":64," 투자":238,"현종":57," 투입":65,"현존":113,"호수":201,"리학자":290," 통합":613," 통해":1669," 통하":497,"구조에":62,"메인 ":87," 통한":430,"형에":86,"현장":128,"현재":2814,"카드 ":151,"호세":67,"현을":152,"현의":222,"호선":457,"현이":155,"협약":154,"행형":63,"치로 ":163,"현으":89,"활발":97,"호사":248,"행해":164,"행했":102," 투어":114,"확보":154,"티아 ":137,"행할":139,"행한":306,"행하":1296,"트에서":297,"화방":117,"혈액":74,"홍보":199,"티안 ":64,"현에":128,"화민":221,"확립":194,"헤이":247,"합회":170,"해한":72,"해하":221,"후계":66,"판사 ":69,"화면":122,"해협":171,"형식":491,"화물":206,"리하고":119,"현악":203," 통틀":103,"획득":137,"후기":490,"시대를":133,"리하기":81,"험을":187,"학회":466,"험자":75,"험의":59,"화로":327,"험이":169,"함한":517,"함하":555,"혁신":147,"협상":84,"형사":139,"합한":173,"함해":71,"합하":407,"현실":137,"형상":96,"형성":570,"항해":93,"확률":208,"화를":1063,"회는":1632,"항하":173,"판매하":166,"회당":58,"회담":74,"호를":436,"황도":75,"홀로":69,"호르":135,"험에":93,"하프":78,"효과":428,"홀름":57," 통칭":251,"한하":72," 통치":299,"그램이":267,"학하":70,"그램의":101,"하학":186,"현상":661,"그램을":190,"그램으":79,"헌정":58,"해튼":63,"할하":144,"할한":336,"한해":67,"리하는":212,"화되":254,"화된":293,"화동":78,"시대부":79,"활동":1928,"학파":157,"필하":71,"형법":133,"호로":363,"리한다":63,"형벌":57,"허용":114,"한편":89,"기념물":120,"추진하":98,"글라데":64,"화는":234,"회가":354,"향을":646,"다는 ":1779,"향으":228,"향의":75,"하키":105,"향이":127,"피하":88,"피해":356,"회계":133,"형문":163,"화도":69,"회관":224,"화당":76,"확대":271,"관할한":325,"관할하":116,"황금":99,"혁명":621,"행중":61," 통일":365,"합쳐":178,"피트":60,"헤비":60," 투르":130,"화나":80,"향에":72,"협력":477,"합체":119," 토지":194,"향악":104,"해체":142," 
통제":86,"했지":130,"혼동":64,"행을":299,"행으":73,"해진":265,"해지":305,"행의":136,"행위":930,"헤미":57,"했을":154,"했으":992,"화기":123,"하천":174,"하철":238,"망한 ":84,"지지 ":191,"해주":158,"호대":71,"행정":2568,"징을 ":98,"호는":1131,"관현악":157,"피터":77,"실로 ":59," 통영":63,"치료 ":66,"행이":179,"행자":83,"시대에":456,"합중":92,"헨리":128," 테크":122,"해자":124,"해있":127,"해의":187,"화강":75,"헌법":468,"홈구":160,"화가":521," 토양":84,"헤르":222,"향신":67,"화권":68," 大丁":58," 大三":77," 大並":205,"화국":2106,"행에":142,"환경":1224,"글랜드":374,"호남":102,"화교":105,"허브":67,"해져":64,"했었":83,"헬리":65,"해졌":152,"해제":83,"화관":60,"해적":58,"해전":114," 토요":69,"헬름":103," 파티":73," 폭력":128," 파트":83,"황후 ":99," 포맷":119,"최초의":614," 포르":340," 패턴":71,"흥에":88," 펜실":91," 폴란":330," 페스":58," 포로":65,"술이다":157,"흑인":107,"질적 ":91," 폭발":156," 표기":452," 페어":66," 편성":112," 폴리":129," 퍼져":67,"틴어 ":259,"흥을":81,"흥원":108,"히스":77,"히어":69," 페이":330," 평생":57," 페리":59,"회화":87," 패의":57,"후쿠":167,"히로":187,"회학":141," 페루":67,"시내버":159,"집을 ":67," 페르":414,"평가 ":122,"휘자":123,"지식을":78," 평등":60," 특허":114,"브리어":86,"흐스":60," 포도":92," 편리":65," 특히":654,"트이다":286,"흰색":64,"니라 ":509," 파키":145," 퍼시":104," 파크":139,"후한":126," 평면":83," 판타":104,"히브":114,"희생":100,"흡수":105," 포드":63,"지스터":88,"후원":128,"호흡":62,"확한":140,"확하":150,"화합":170,"화한":180,"화학":716,"화하":463,"훈장":68,"후인":57,"후이":122," 파스":91,"후의":318," 평가":365,"환하":122,"화협":57,"활하":69,"흘러":172," 파악":100,"확히":75," 평균":159,"북쪽은":67," 패션":82,"북쪽으":560,"후지":177,"활화":61,"히데":106,"힌다":88," 팀이":267," 팀인":83," 팀은":59," 팀을":65,"부터 ":5567,"황해":119,"흐르":181,"흐름":139,"히는":131,"북쪽에":130,"황후":208," 파워":88,"비를 ":208," 파울":70,"브리지":101," 포괄":107,"힌두":145," 폭격":62," 파이":306," 파일":459," 파장":70,"희망":76,"멕시코":351,"면에 ":269," 판정":70," 파르":82," 파리":324,"호쿠":61," 판매":505,"획에":68,"화체":1138,"회에":827,"회와":251,"호크":61,"홋카":126," 패러":89,"티에 ":60,"퍼드 ":93,"회원":649,"호텔":166,"스볼 ":93,"흥과":58,"획을":70,"회의":1793,"휴대":208,"회이":521,"회장":306,"협회":1182,"획이":86,"홍콩":229,"흥군":61,"회전":178,"희곡":60,"회적":302,"후손":130,"후속":211," 특유":59," 특이":78,"회주":311,"수이자":73," 특정":747,"단계에":81,"효율":286," 특징":490,"히고":71,"홈페":65," 패배":83,"호하":246,"호협":62,"혼합":151," 파생":136,"호화":96,"호환":91,"화폐":97,"후에":722,"황태":64,"브리튼":77," 하기":275," 프로":3860,"스러운":81,"지적 ":62," 한국":5417," 프레":355,"지속적":117,"슬란드":102," 학교":399," 프랜":67," 프랑":2466," 프라":205," 프란":161," 한계":75," 하고":784," 하계":203," 하게":69," 한강":168," 하거":107,"부지방":64,"메시지":94,"무기 ":82,"뉴스 ":176," 표준":763," 푸에":67," 학년":60," 함께":1504," 해군":342," 한나":93,"지정 ":159,"출연하":62," 하나":3925," 품질":108," 항구":132,"모로코":75," 해결":238,"직접 ":361,"추정된":67," 품종":76,"추정되":104," 항공":685," 프리":592," 프린":124," 플루":58," 플러":58," 합격":78," 플레":499," 플로":151," 플라":322," 플래":123," 플랫":188," 한글":177," 함경":88,"림픽에":158,"질을 ":426,"슬라브":70," 평화":277,"슬라비":71," 포크":98," 표시":264," 포털":64,"질은 ":90," 포터":82," 풍부":90," 폭탄":71,"홈페이":65,"그리스":1462,"질의 ":262," 풋볼":90,"티의 ":76,"힙합":222," 포항":151,"출입국":71," 포함":1756," 피고":86,"수주의":71,"지식경":85,"블로그":77,"기능이":91,"수준의":104," 포트":118,"립하여":100,"질이 ":162,"립하였":147,"힌다 ":87,"기능을":346,"그림을":66,"흔히":426," 펼친":93,"싱글이":108,"트웨어":808,"기독교":722," 포지":92,"히토":70,"직의 ":93,"지인 ":141,"히코":61," 포유":87," 펼쳐":96,"흑해":78,"립하는":77," 폐지":323,"스라엘":216," 표면":218," 포인":78,"마트폰":75,"비로 ":77,"중화인":472," 평택":82,"립하고":77,"권으로":128," 푸른":60,"히틀":64,"히트":99,"트워크":561,"립했다":63,"티움 ":158," 편이":112," 편의":114," 편입":97,"뉘어 ":79," 평안":87,"힘을":86," 평양":179,"립허가":405,"진은 ":66,"문구 ":138,"물과 ":174,"진을 ":324,"지션은":60,"슬람교":71,"휘하":198,"권익보":58,"물관 ":160,"목록이":78,"후에는":149,"진의 ":108," 편찬":115,"기를 ":1415," 포스":152," 평원":63," 편집":205,"진이 ":64," 철학":533," 처형":61,"키타":74,"직을 ":211," 추락":80,"브라이":58,"브러리":89,"태지":103," 초점":101," 
체포":79,"탈출":64," 촉진":190,"타카":106," 취급":129,"흥 ":176,"토대":96," 천황":402,"키텍":134,"토니":210,"기반 ":179," 촬영":150,"타케":96," 총장":96,"희 ":216," 출력":112,"브라우":133,"브라운":81,"지의 ":915," 체험":72," 최소":174,"타크":164,"터스":138,"시기에":129," 최상":86,"터미":267,"후에 ":504,"탑재":111,"지은 ":141,"테레":67,"태양":433,"테러":113,"태어":822,"태에":207,"테르":345,"통계":363,"술연구":104,"통과":270,"테리":169," 초에":92,"태의":337,"테마":94,"타지":184,"탕으":335,"탁집":59,"텔레":541," 체코":189,"터베":257,"중화민":210,"테면":61,"브라질":214,"흐 ":135,"대교구":66,"태자":84,"태이":177," 초연":67,"토노":77,"태종":105,"태조":138,"토너":74," 충돌":170,"터이":208,"터장":63,"토리":469," 취득":122,"터의":386,"토마":125,"대는 ":203,"토머":72,"턴의":58,"통로":60," 총칭":263," 충분":75,"톨릭":545,"톨리":110,"토모":60,"모델이":91,"털이":89,"통령":1141," 축산":110," 추상":113,"태평":463," 최적":94,"토미":114,"테오":100," 출생":337," 최종":192,"회원국":118,"출시한":118," 축소":69,"투갈":235,"태풍":371,"탐험":74,"템에":113,"테일":72,"테인":310,"테이":563,"타타":84,"화체육":1124,"키프":58,"텍사":130,"통되":70,"틀어 ":87," 최신":57,"히 ":2784,"리투아":94," 출발":163,"즌이다":79,"힌 ":149,"터에":367,"탄티":148,"토로":115,"힐 ":59," 출범":202,"텍스":122,"토론":166,"테스":290,"터와":150,"톨레":83," 편 ":84,"힘 ":69,"토르":148,"토를":102,"터널":153," 처음":1260,"터내":78," 팀 ":177,"스》 ":220,"시된 ":202," 채택":232,"붙여졌":67,"회원의":67," 천연":128,"크이":135,"크의":348," 최근":299," 천안":91,"태를":373," 청원":63,"회이다":450,"지원 ":253,"터는":441,"치도 ":124," 판 ":94,"수엘라":63," 최남":61," 초명":73,"모니터":62,"키스":403,"질에 ":69,"터넷":668," 청사":77,"탈리":1166,"탈린":65,"후 ":3176,"타마":167," 초등":93," 창출":78,"타르":237,"타를":77,"타리":335,"터가":166,"키보":78,"키백":58,"금속 ":93," 차트":125,"태로":481," 최고":577,"타미":69,"크와":139,"타바":73,"구축하":85," 청소":291,"붙여진":80,"크어":57,"크에":279,"마케도":118,"모리 ":253,"훈 ":123," 총독":86,"터로":173,"타에":65,"타운":164,"타우":94,"홋카이":126,"택시":93,"키지":131,"탄에":62,"타와":67,"텐도":125," 척추":95,"휘 ":79,"탑승":57,"트와 ":207,"탄자":83,"토관":72,"탄의":170,"탄을":70," 총선":154,"터를":399,"토가":60,"터리":174,"타워":123,"타원":94,"타입":74,"크톱":69,"타임":197,"타자":78,"시도 ":60," 초식":67,"타의":139,"타일":144,"타인":408,"타이":1114,"키아":237," 추가":264," 체육":101,"테네":224,"탄소":127," 최대":527,"키에":115,"타스":72,"탐사":115," 초반":121," 천주":117,"블라디":122,"슬링 ":88," 총리":306,"모를 ":205,"탄생":209," 팝 ":93,"테나":69," 추기":61,"즈이다":127,"키의":171,"태생":110," 챔피":295," 천체":219," 청주":153," 출간":122," 체제":509,"키와":61,"모델로":73," 추구":126,"키우":65," 축구":957,"테니":137,"트는":279,"리히 ":247," 측정":234," 캘리":251," 층에":435,"통틀":103,"토해":114," 캄보":74,"트니":100,"토프":60,"진에 ":121,"퇴적":60,"대규모":179,"트남":408,"투스":187,"튜디":195," 캐릭":178,"빌딩 ":490," 칼리":148,"토크":67," 카메":218," 풀 ":69,"수이며":80,"통치":346,"통칭":251,"대구지":67,"트가":392," 카리":88,"지와 ":314," 카르":178," 카를":260,"교환 ":84,"토콜":205," 출판":450,"히고 ":66,"특성을":91," 침입":81,"능력이":74,"능력을":155," 카운":60,"국철도":85,"파나":80,"파냐":96," 카자":80," 카이":174,"티드":127,"귀족 ":88,"트럭":83,"트럴":128,"트럼":73," 친위":73,"트레":777,"수원시":104,"티노":156," 취하":75,"트롤":86,"트로":845,"싱가포":89," 친일":132,"틀란":68,"틀랜":270,"마추어":90,"티누":78,"트루":135,"틀러":85,"트르":93,"트를":421,"치된 ":243,"트릭":104,"트리":1178,"통행":63,"파가":72,"통하":582,"통한":472,"립한 ":329,"통해":1678,"통합":684,"투자":326,"투이":162," 카스":160,"투입":66,"통화":178,"파견":103,"분으로":162,"투쟁":105,"판관":57,"파구":159,"트랙":134,"트랜":257,"지역 ":878,"트라":796,"트란":76,"트랄":57,"니며 ":81,"판결":150,"파괴":182,"티나":260,"직에 ":77,"분이다":91,"수이다":549,"투아":129," 커다":65,"투어":153,"투에":168,"통신":860,"토양":96,"수여하":88," 추진":344,"텐츠":220,"트인 ":57,"터테":274," 출전":165,"통상":199," 출입":97,"투는":161,"브랜드":212," 추정":261," 총회":98,"토시":94,"토스":227,"터키":426," 축제":131,"텍처":131,"지에 ":707," 출연":182,"토성":62,"림픽 ":413," 침공":92," 최초":1056," 폴 ":107," 친구":98," 폰 ":187,"수용소":59,"통부":57,"트의 ":617,"회에서":510,"중학교":87,"투기":161,"템이":180,"템을":168,"템은":65,"템의":139,"술에서":64,"태화":59," 
출신":971," 출시":471,"지어 ":59,"통문":80,"택하":130,"근무하":84,"증진 ":102,"교황 ":314," 카라":126," 친목":158,"대구광":218," 침략":81,"토착":58," 취임":66,"통제":163," 최후":85," 카드":233," 캐논":60,"모로 ":86," 치료":200,"투명":71," 칠레":103,"토지":226," 캐나":542,"통은":64,"통을":190,"통일":475," 충청":610,"통의":266,"통이":170," 치러":226,"통적":314,"투르":163,"투를":79," 춘추":67," 추측":96,"교회 ":398,"송하는":91,"통용":58,"구체적":132,"투로":114,"大之":149," 춘천":113,"大亞":171,"통영":82," 추출":59,"통에":86,"大丁":104,"大丘":81,"大並":265,"토의":195,"大三":116,"터페":234,"토에":143,"토어":58,"토요":82,"테크":171,"뉴스》":133," 충주":65," 측면":110,"최초로":360,"판서":64,"파소":76,"티안":75,"티아":321,"판소":126," 항 ":96," 케이":278,"파수":93,"티야":82," 캠퍼":62,"퍼드":118,"티에":140,"파스":130,"티오":173," 해 ":409,"틴어":370,"파시":83,"평가":559,"스를 ":979,"확한 ":138,"티움":195,"티의":76,"팀에":70,"평군":70,"평구":64,"패션":100,"평균":190,"파악":100,"퍼런":57,"부여하":82,"팀은":106,"파에":127,"팀을":101,"팀이":414,"팀인":106,"패스":68,"폐기":68,"머지 ":99,"퍼레":61,"특수한":100,"팀으":74,"속하는":814,"판에":167," 콘서":79,"파와":71,"파워":96,"속한다":406,"파울":88,"파의":210,"파이":647,"파인":105,"파일":554," 코스":197,"포가":72,"시는 ":274,"판이":166," 콘스":180,"폭격":74,"판의":82,"판은":76,"판을":143,"기록되":116,"판으":77,"파장":76,"페란":60,"페라":351,"포괄":108,"즈에서":95,"편되":118,"지아 ":174,"포구":341,"판정":86,"송파구":154,"트페":72," 쿠데":103,"후속작":75,"페르":483," 컨트":101,"대까지":71,"트폰":76,"페루":70,"시간에":90,"시간이":97,"티칸":62,"특한":83,"시간을":87,"명명되":59,"페리":114," 코어":59,"평동":57,"패의":75," 컴파":63," 커피":78,"특허":119,"비드 ":116,"평등":101,"특히":654,"스니아":124,"편리":65," 컴퓨":1671,"출생하":68,"포니":304,"화학 ":222," 쿠르":73,"평론":89,"포되":112,"포도":114,"화한 ":163,"포동":68,"퍼시":110,"파크":221,"퍼스":194,"포드":131,"분에서":63,"평면":121,"판타":113,"파키":163,"특별":3191,"파는":76," 커뮤":155,"퇴하":73,"판단":150,"부이며":90,"판되":77,"기록을":88,"기록은":61,"기록이":62," 코나":66,"파도":65,"파동":100,"트비":181," 콩고":150,"수집 ":85,"파라":208,"파란":62,"티모":92,"틸리":74,"티미디":76,"만화 ":211,"특성":272,"트사":71,"팔레":130,"티미":122,"북으로":70,"파르":244,"파를":146,"티벌":59,"티베":160,"투표":309,"특수":392,"파로":128,"팔라":113,"특정 ":405,"트어":154,"패러":91,"티븐":77,"티브":220,"부이다":121," 카타":66,"판매":556," 코드":377,"파리":365," 친환":78,"특유":61,"트이":390,"트의":618,"특이":82," 카페":68,"트인":71,"트워":614,"트웨":825,"송통신":78,"트와":228," 카트":59,"틀어":112,"트에":514,"패로":64,"맞추어":57,"티시":93,"티스":235," 칭호":106,"다란 ":85," 코리":125," 코르":132," 칭하":80," 콜로":74,"트족":57," 콜롬":89,"틀을":58,"특정":766," 침해":88," 함 ":107,"판사":210,"슬람 ":141,"파생":140," 코미":128," 코믹":74,"특수법":62," 할 ":828,"틀을 ":58," 한 ":3539,"패배":83,"특징":490,"펙트":82,"페트":58,"펼친":93,"포지":121," 회 ":430,"푸르":142,"풀로":95,"다른 ":2094,"말해 ":85,"분야로":104,"포의":126,"펼쳐":96,"페테":81,"포유":91,"표를":238,"품들":89,"비교하":63,"포인":100,"포이":65,"포자":65,"표면":243,"평택":84,"휴대용":67,"프가":223,"포츠":546,"회의에":88,"푸른":64,"회의원":323," 킬로":101," 키르":89,"리포니":249,"포터":121,"포털":72,"풍부":92,"폭탄":88,"교통의":80,"포크":112,"표시":333,"기본 ":174,"스도의":64,"평화":337," 키보":71,"편하":134," 쿼크":66,"풋볼":91,"품부":596," 후 ":926,"협회는":600,"출시된":71,"출시되":154,"풍속":64,"표이":142,"치가로":67,"포하":234,"포함":1770,"포한":64," 터널":91,"프는":77,"표의":64,"표율":67,"뉴욕 ":303,"푸아":98,"비교적":115,"표적":496,"피고":92,"포항":188,"식경제":82,"표에":93,"포트":180,"폭포":71,"먼저 ":125,"푸스":101,"당나라":170,"추진 ":78," 크기":356,"후의 ":318,"포르":483,"패턴":74,"포를":110,"표가":57,"팔레스":104,"분야이":162,"파하":69,"분야의":248,"만한 ":99,"포맷":122,"치는 ":382,"패키":72,"분야에":384,"티스 ":63," 크고":91,"폴란":333,"페스":133,"펜실":91,"폴로":79,"소프트":1324,"폴레":162," 현 ":1849,"면서 ":1800,"파티":108,"다룬 ":72,"파트":241,"몰도바":68,"포로":194," 크게":351,"폭력":166,"포럼":115," 쿠바":79," 國三":150," 國並":101,"팽창":61,"스만 ":143,"패하":101,"후인 ":57,"북아프":61," 코트":97,"리트어":69,"평생":67,"츠를 ":86,"구할 ":67,"시대 ":1109,"페이":806," 코퍼":58,"페인":767,"소행성":117,"페어":82," 
콘텐":143,"마케팅":128,"표기":489,"마쿠라":73,"폭발":174,"퍼져":67,"폴리":363,"편성":139,"식공룡":68,"판하":95,"팩트":77," 형 ":90,"시다 ":61,"편적":65,"표된":100,"표되":128,"편입":100,"평야":69,"평양":609," 홈 ":107,"편으":97,"편을":77,"평안":88,"편의":170,"편이":180,"치가이":174,"표는":100,"품과":60,"편에":104," 호 ":622,"주하는":151,"보호하":179," 크다":72,"표로":306," 클리":72,"폐지":328,"편찬":136,"명령어":128,"포에":131," 클로":116," 탐구":62," 크리":494,"펑크":69," 크루":115," 클럽":310,"분야를":77," 클레":98," 클라":269," 크로":292," 클래":126,"출신의":366," 크레":128,"출신이":151,"편집":254,"투표를":67,"평의":64," 크라":154,"부에서":328,"출신으":183,"평원":75,"속하며":146,"포스":282,"포시":139," 타고":80,"폭스":71,"줄여 ":63,"테리아":58,"국시리":95,"최초 ":67,"탁집행":59,"현대자":68,"현대의":80,"주요 ":619,"한제국":216,"현대에":59,"프트가":61,"터베이":229,"국시대":174,"학자이":807,"학자인":136,"면으로":111,"람과 ":57,"면이다":122,"태이다":143,"테르부":65,"행정 ":844,"중부에":219,"죽을 ":57,"목이 ":67,"허리케":119,"죽은 ":131,"주의 ":2545,"구조 ":121,"목의 ":117,"교에서":366,"그녀의":76,"계획을":62,"계획이":79,"피터 ":60,"목소리":65,"통계학":58,"계획에":64,"추어 ":149,"호는 ":1131,"그램 ":342,"관한 ":1431,"학자로":168,"규모 ":208,"호간의":155,"라기 ":58,"국제 ":1037,"행성이":82,"종합 ":162,"념이다":176,"학자들":180,"행사하":73,"목은 ":112,"기나 ":97,"목을 ":213,"농업 ":98,"모의 ":211,"념으로":242,"통계청":58,"하인리":70,"준의 ":153,"년으로":59,"중앙 ":206,"교육자":122,"면적은":1793,"목적 ":106,"교육을":199,"교육의":108,"치구 ":58,"탕으로":335,"합이다":98,"준을 ":139,"함으로":764,"향에 ":59,"교이다":212,"해양수":137,"합으로":111,"그대로":248,"주인 ":62,"해안에":90,"해양부":103,"그룹 ":318,"화나 ":70,"중에 ":471,"군주 ":60,"지나 ":148,"지난 ":127,"혁명 ":188,"노이 ":65,"프트의":82,"프트웨":808,"노선을":63,"노선의":63,"노선이":141,"권에 ":140,"터스 ":74,"치고 ":176,"그로 ":130,"교육과":397,"출범하":69,"교육기":98,"해양경":119,"준인 ":60,"구스타":61,"준이 ":89,"텔레비":412,"무라 ":94,"학적으":169,"노의 ":74,"협력 ":130,"학적인":134,"지널 ":58,"태어났":310,"태어난":296,"태어나":175,"학에서":938,"敎之":110,"공항 ":89,"구성된":325,"구성되":572,"맨해튼":63,"하여야":74,"공한 ":83,"공학 ":155,"출이 ":60,"할아버":61,"라고 ":2752,"태양계":61,"구원 ":202,"지낸 ":207,"년에서":224,"카고 ":57,"물건을":96,"화국 ":933,"출을 ":84,"하에서":122,"최적화":72,"환경 ":270,"먼트 ":231,"하였는":106,"행에 ":116,"즈로 ":87,"터미널":258,"경하였":58,"학연구":99,"태에서":98,"국왕 ":169,"하였다":4731,"하였던":143,"추세츠":96,"물관이":96,"헬름 ":83,"구와 ":470,"군에 ":299,"모아 ":105,"해석학":116,"해석하":67,"하였고":606,"중생대":120,"중요 ":61,"중서부":70,"라가 ":204,"행을 ":299,"구성하":492,"주식회":205,"중을 ":67,"국적 ":77,"행의 ":135,"학으로":130,"공회 ":118,"군인 ":129,"그녀는":86,"묘로 ":171,"모여 ":116,"중의 ":536,"목에 ":210,"탄자니":77,"축을 ":74,"학이다":126,"군이 ":366,"국세청":71,"타이완":189,"행이 ":113,"프톨레":68,"타인의":82,"행위 ":176,"군의 ":818,"하우스":121,"했을 ":143,"축산식":105,"타이베":77,"금까지":91,"카가 ":78,"면에서":220,"과학 ":369,"중소기":146,"학위를":104,"국장 ":61,"군을 ":273,"해진 ":173,"군은 ":187,"지는 ":2020,"메탈 ":76,"국이 ":616,"구성요":67,"하와이":79,"국인 ":365,"군사적":69,"추원 ":57,"구성원":169,"구장 ":97,"타이틀":140,"즈를 ":119,"종특별":81,"국의 ":7793,"구인 ":59,"日三":105,"구소는":106,"하였으":1353,"중인 ":239,"즈니스":133,"총칭이":130,"구의 ":1000,"국을 ":477,"년에는":218,"국은 ":385,"중성자":58,"조하는":89,"하철 ":179,"토관리":71,"퓨터의":102,"향상 ":127,"항이 ":78,"퓨터에":131,"국왕이":80,"신하여":63,"해외 ":140,"항의 ":71,"출생 ":118,"해서는":187,"신학자":96,"시험이":69,"국으로":324,"명을 ":535,"명은 ":635,"공하여":84,"공하였":61,"구원은":154,"구이다":327,"필요로":66,"시행하":82,"합이 ":96,"구역은":158,"구역으":111,"타이거":62,"구역이":121,"명의 ":949,"주시 ":444,"기까지":238,"공하기":71,"합의 ":217,"공하고":126,"군에서":97,"합은 ":87,"명이 ":605,"명인 ":101,"합을 ":202,"신호를":88,"공하는":288,"敎 ":92,"최종 ":111,"항을 ":212,"식회사":205,"주변의":60,"하지 ":1309,"매하는":108,"해와 ":164,"무는 ":91,"주변에":81,"공한다":129,"합성어":91,"메이션":621,"퓨터를":66,"필요하":96,"필요한":311,"즈는 ":180,"헨리 ":107,"식품의":90,"농수산":67,"메이저":161,"시한다":59,"관하는":129,"시하는":199,"식하는":102,"구장이":90,"구장은":60,"구장으":111,"테면 ":61,"메이지":127,"지가 ":477,"즈니 ":71,"과학에":60,"태조 ":87,"과학원":110,"시행되":95,"과학적":147,"교토 
":66,"과학의":105,"과학자":114,"중세 ":186,"그라드":91,"국제명":67,"메이크":70,"교통 ":183,"공화정":72,"구치 ":63,"과학기":696,"화가 ":316,"국이다":140,"해살이":124,"국인들":59,"공화국":2091,"구축 ":59,"시하였":95,"함수의":80,"또한 ":786,"해의 ":186,"무관으":57,"물고기":88,"지게 ":159,"군이다":165,"공화당":68,"군으로":160,"헌법 ":117,"규모가":68,"시호는":324,"국제교":61,"국제공":130,"출발하":59,"합에 ":76,"테르 ":63,"한의 ":141,"현대 ":369,"조한 ":117,"곡하였":67,"해부학":57,"면서도":79,"족한 ":72,"한이 ":81,"조합 ":90,"통계 ":83,"향력을":65,"통과 ":82,"학적 ":572,"권의 ":269,"태생의":62,"말하는":177,"면은 ":81,"면을 ":202,"말한다":3380,"종파 ":71,"기관을":79,"기관으":991,"기관의":141,"기관이":951,"지고 ":1032,"한자 ":153,"기구로":81,"권은 ":114,"기관에":108,"권을 ":623,"할을 ":488,"태의 ":331,"만화가":103,"학은 ":215,"그를 ":132,"하의 ":427,"높은 ":477,"면이 ":103,"학상을":72,"말하면":59,"말하며":226,"기능 ":112,"하이 ":171,"학의 ":951,"교육청":88,"기는 ":947,"글로 ":110,"모모야":63,"학을 ":385,"면의 ":154,"학이 ":90,"피아노":101,"진과 ":113,"그림 ":81,"지구 ":303,"그린 ":169,"높이 ":126,"하자 ":138,"출신 ":245,"네치아":57,"함에 ":123,"한을 ":120,"피아니":63,"치가 ":290,"놓은 ":145,"학자 ":830,"기니 ":77,"면적 ":89,"신화에":220,"중심 ":210,"해야 ":228,"구에서":361,"신화의":72,"명에 ":142,"해양 ":65,"피언컵":95,"진구 ":92,"주얼 ":63,"그들의":150,"그들은":74,"질과 ":83,"함은 ":90,"함을 ":479,"해에 ":167,"타지 ":89,"구역상":152,"실행하":64,"국에서":1555,"주에 ":640,"지기 ":88,"기도 ":3458,"항에 ":88,"피에르":97,"권이 ":170,"기관차":146,"만화이":72,"농산물":130,"테니스":103,"프트 ":483,"구약성":69,"국에는":66,"국어로":84,"피언십":167,"모바일":151,"주와 ":707,"현동 ":88,"해안 ":100,"기구이":161,"준에 ":58,"증가하":70,"초점을":59,"뿐만 ":208,"퓨팅 ":60,"터에서":152,"중요성":72,"주주의":702,"관으로":1430,"관이다":925,"관이었":166,"탄티노":86,"국사 ":69,"과적으":61,"지냈으":100,"과정이":135,"과정을":292,"높이는":372,"과정에":270,"중앙행":163,"중앙회":62,"하여 ":10358,"학에 ":189,"누르고":122,"준정부":80,"하에 ":228,"관장하":182,"명칭으":130,"명칭은":335,"명칭을":111,"명칭이":208,"핵심 ":64,"군사 ":206,"중의원":106,"광역시":1449,"구성 ":124,"해시 ":75,"구밀도":162,"호가 ":138,"행성 ":102,"지니고":81,"투갈 ":67,"지로 ":816,"중요하":63,"중요한":537,"츠가 ":73,"국무총":119,"하위 ":107,"국민당":74,"텍사스":130,"학원 ":257,"네트워":601,"태풍 ":215,"구소 ":210,"주지방":89,"농어촌":104,"중이다":171,"학생들":151,"국민경":58,"지만 ":2429,"지마 ":65,"농업인":62,"지막 ":351,"한불교":58,"퓨터 ":987,"뉜다 ":141,"견한 ":100,"항상 ":79,"지리 ":83,"광주 ":85,"목으로":232,"공으로":88,"목이다":93,"피소드":110,"공원이":117,"질로 ":92,"지대에":91,"함수 ":148,"토미 ":72,"고지로":66,"교수 ":144,"지를 ":1014,"곳으로":142,"공용어":70,"촉진하":68,"결한 ":67,"구려의":58,"곳이다":155,"지니아":101,"해서 ":1198,"핀의 ":74,"철학자":224,"목적이":96,"목적으":2374,"목적은":73,"목적을":89,"관에서":127,"메트로":85,"타크래":128,"무부 ":116,"고쿠 ":157,"털이 ":79,"문》 ":124,"지되었":235,"관위는":85,"지면 ":61,"지며 ":169,"과이다":71,"톨릭 ":174,"천황의":64,"공유하":89,"지도자":328,"통령 ":409,"국립공":185,"턴의 ":58,"학명은":70,"모야마":65,"니가 ":122,"족행위":60,"진다 ":599,"출력 ":84,"물리 ":92,"결하는":245,"지도 ":237,"구분하":130,"지구에":121,"함마드":68,"근대 ":136,"노프 ":58,"곡한 ":151,"지금까":89,"기계 ":100,"토리 ":100,"기고 ":87,"교적 ":222,"현과 ":107,"교수로":84,"학문이":118,"학문적":57,"지관리":66,"기관 ":423,"지닌 ":149,"근거하":73,"존하는":140,"하므로":81,"한민국":7350,"피아 ":143,"터의 ":386,"문대학":65,"족하였":143,"직공무":113,"노카미":82,"누스 ":182,"합류하":76,"기구 ":245,"주어지":59,"주어진":139,"교수이":104,"지금의":259,"토니아":73,"늘날 ":214,"멀티미":78,"학술 ":98,"합리적":59,"지금은":121,"눅스 ":72,"헝가리":248,"토르 ":81,"픽에 ":123,"토를 ":102,"키텍처":131,"천황 ":220,"하버드":67,"한반도":215,"터와 ":146,"학습 ":59,"문명 ":78,"지기도":75,"구에 ":582,"국어 ":241,"·철학":275,"지구의":118,"모양을":68,"모양으":92,"테스 ":76,"중심부":67,"주었다":103,"모양이":89,"모양의":161,"주에서":249,"기기 ":100,"구역 ":205,"국에 ":586,"토대로":59,"목에서":71,"중심지":446,"직된 ":71,"촉진 ":60,"중심의":73,"중심으":672,"중심이":78,"한민족":78,"주의는":98,"줄여서":375,"계획 ":143,"주이다":406,"뉴기니":78,"무로 ":104,"모스크":232,"프스키":67,"주의를":168,"축물 ":62,"피스 ":81,"교에 ":115,"국방부":74,"터에 ":207,"종합적":78,"국민에":61,"권력을":67,"주이며":69,"허가된":536,"주요한":65,"토로 ":68,"국문화":83,"해발 ":137,"명으로":376,"해방 ":60,"집단 
":101,"명이다":422,"행되고":89,"국방송":84,"국민의":235,"행된다":79,"행되는":98,"명이며":67,"모습을":123,"교와 ":209,"주인공":208,"토너먼":71,"국불교":141,"중앙아":113,"무를 ":833,"중앙정":74,"하르트":89,"지냈다":412,"행동을":107,"권리를":141,"교사상":96,"행되었":179,"중에서":493,"주장한":106,"주장하":234,"구분된":68,"구분되":71,"기간 ":297,"주의에":98,"교육 ":543,"기가 ":609,"주의와":98,"준이다":67,"준으로":442,"통과하":60,"구별하":80,"주의자":203,"주의인":510,"그는 ":921,"주의적":180,"광주광":201,"하면서":761,"주의의":142,"물론 ":117,"물로 ":459,"무대신":61,"학살 ":67,"교의 ":1022,"명칭 ":96,"중에는":97,"그것은":65,"그것을":99,"주제로":77,"학생 ":143,"근거로":57,"丘乙 ":101,"실제로":191,"문을 ":285,"문은 ":97,"丙之 ":191,"현악단":131,"념물 ":77,"전한 ":445,"구를 ":601,"전할 ":64,"문서를":72,"三倉 ":159,"초에 ":110,"과사전":636,"챔피언":309,"丘之 ":172,"무장 ":114,"승팀 ":85,"경제자":60,"丘亞 ":97,"경제의":97,"경제적":182,"군대를":57,"三丁三":180,"三丁丁":115,"템이다":123,"무소는":58,"丘亂 ":62,"무의 ":132,"문서는":80,"화성 ":63,"스케이":155,"중기의":177,"조정 ":98,"젝트 ":155,"물을 ":636,"三並亞":103,"물은 ":154,"三並亂":75,"三並並":197,"화상 ":61,"三並之":157,"三並三":347,"三並丁":93,"확립하":58,"체코 ":66,"공작 ":150,"러나 ":375,"토에 ":57,"三三並":435,"三三之":337,"三三亂":104,"三三亞":275,"절한 ":64,"三三丘":91,"三三三":746,"三三丁":204,"三丁並":115,"三丁之":138,"정치가":329,"공이 ":75,"경제부":95,"三丘三":78,"추구하":83,"三三倉":73,"문서에":61,"무소장":62,"통신 ":286,"문이 ":99,"공을 ":239,"계이다":139,"스코틀":228,"문서이":58,"문자 ":261,"시코 ":197,"공의 ":248,"래되었":95,"三之並":215,"三之之":161,"문의 ":316,"三之亞":63,"공유 ":71,"三三國":62,"관에 ":201,"심으로":693,"시켜 ":262,"三之丁":129,"三之三":285,"공산주":198,"십자군":103,"화산 ":89,"졌으나":61,"과와 ":58,"공사의":74,"곳의 ":66,"못한 ":89,"노래 ":87,"조선민":527,"곳을 ":106,"곳은 ":97,"회를 ":455,"丁倉 ":81,"중국어":136,"스타리":76,"중국에":167,"시즌이":135,"형성하":139,"경쟁력":153,"명하였":122,"전》 ":217,"뜻의 ":103,"몬테네":93,"공원 ":236,"래밍 ":212,"국령 ":69,"협정 ":78,"구로 ":407,"해하는":84,"스타디":113,"뜻은 ":77,"뜻을 ":186,"스크바":214,"과에 ":283,"조사연":59,"무와 ":69,"스크리":94,"스크립":98,"硏之":94,"문에 ":1313,"결정하":131,"구려 ":72,"라루스":57,"丁並之":61,"정책을":139,"丁並並":65,"랜드의":362,"스토니":68,"정책의":92,"丁並三":108,"정책이":65,"정책에":70,"랜드에":119,"형의 ":232,"뮤니티":87,"존의 ":270,"스테르":112,"남쪽으":574,"남쪽은":77,"丁三亞":60,"丁三之":151,"경으로":205,"丁三並":143,"丁三三":197,"공산당":201,"丁三丁":74,"계열의":107,"스터미":113,"丁丁之":67,"丁丁三":77,"丁丁丁":59,"광범위":81,"합회는":84,"조선총":95,"스크톱":69,"스타일":126,"스타인":141,"형이 ":118,"스탄의":136,"냈으며":111,"남쪽에":112,"족의 ":403,"경우에":367,"뮤니케":86,"丘並 ":120,"조선시":218,"丘丘 ":120,"겨진다":96,"족을 ":151,"족은 ":109,"중국인":65,"중국의":313,"조의 ":556,"丘三 ":214,"丁之之":73,"중교통":58,"족이 ":130,"조사하":74,"형을 ":227,"丘丁 ":189,"조선의":232,"조선인":72,"丁之並":120,"丁之三":108,"출된 ":86,"조선왕":60,"조선에":59,"형은 ":84,"무원 ":354,"계에서":550,"물에 ":244,"나중에":137,"내에서":352,"정할 ":104,"스템에":108,"제품 ":138,"스테이":248,"형성된":99,"형성되":118,"並亞 ":397,"정한 ":1015,"태풍으":59,"태풍이":71,"공연 ":73,"경우를":91,"통상 ":58,"스템의":137,"스템이":178,"스템을":165,"스템은":65,"계약을":87,"정치학":71,"고종 ":91,"민국 ":4370,"터키 ":238,"중남부":58,"고슬라":66,"합하여":263,"럽과 ":63,"명하고":104,"시킨 ":207,"국내에":69,"시킬 ":95,"스타크":130,"곳에 ":630,"럽게 ":67,"종사하":78,"남이다":68,"丘三三":90,"주니어":77,"丘三之":57,"丘三並":57,"회로 ":295,"함하여":108,"경우는":71,"국내외":132,"스탄티":147,"접해 ":411,"경우도":213,"스토리":150,"三倉三":63,"종이 ":160,"슷한 ":237,"공업 ":91,"전투는":153,"심지는":190,"제프 ":99,"템에서":59,"경찰 ":105,"초의 ":683,"명하며":58,"호수 ":60,"경우가":270,"전투기":110,"고전 ":139,"스튜디":195,"결정되":69,"실질적":115,"났으며":113,"함한다":221,"고속버":59,"란드에":64,"란드어":81,"주도로":122,"전통문":78,"투는 ":161,"무에 ":90,"테이션":199,"명하다":316,"투갈어":68,"주도는":958,"명하는":145,"란드의":272,"주를 ":291,"래로 ":181,"전히 ":176,"테인먼":266,"함하는":169,"경에서":129,"테이블":95,"三丘驿":62,"좋은 ":139,"무역 ":89,"三亞三":128,"三亞並":101,"三亞之":123,"내용은":87,"내용을":238,"내용으":63,"스파냐":94,"내용의":62,"내용이":162,"곡을 ":163,"곡은 ":123,"좁은 ":114,"함하고":175,"고속도":557,"주로 ":2407,"고의 ":280,"철학 ":391,"고자 ":296,"곡이 ":62,"전화 ":176,"형에 ":66,"곡의 
":107,"심지이":98,"험이다":74,"람들이":226,"람들의":90,"검찰청":131,"람들은":61,"람들을":90,"고유 ":58,"민간 ":90,"람들에":76,"현재 ":1569,"스트로":176,"並丁 ":256,"스트를":69,"並三 ":685,"렇게 ":88,"것처럼":82,"모할 ":63,"스트리":632,"졌으며":172,"학회는":214,"스트라":255,"전혀 ":90,"스트레":553,"並並 ":551,"종의 ":525,"호세 ":58,"ος ":121,"교류 ":167,"접한 ":137,"공부하":58,"스티안":65,"並丘 ":162,"종을 ":142,"현이 ":70,"종은 ":81,"스파이":76,"호선 ":244,"정치적":249,"민과 ":61,"並之 ":482,"조지 ":208,"조직 ":160,"정치인":881,"교를 ":330,"미국 ":2327,"통문화":79,"현의 ":217,"효과적":73,"스티븐":77,"할한다":328,"현을 ":152,"할하는":82,"並乙 ":122,"스티벌":59,"스파르":60,"행형 ":62,"현상이":157,"래를 ":142,"현상을":242,"텐츠 ":92,"並亂 ":230,"스트이":111,"스트의":82,"협약 ":74,"실시하":164,"제작자":87,"권리 ":62,"문신 ":143,"제자유":65,"경찰청":120,"곳에서":166,"스템 ":494,"노동자":160,"스텔 ":62,"노동조":80,"주당 ":171,"천주교":125,"격한 ":87,"톨리아":98,"효과를":107,"라면 ":101,"관악구":58,"재하였":61,"널은 ":82,"스톤 ":59,"뛰어난":130,"널을 ":66,"호사 ":85,"재하지":105,"통령을":141,"통령의":87,"통령으":73,"통령이":192,"제적인":115,"제적으":88,"그가 ":220,"제작하":177,"제작한":315,"라데시":106,"준다 ":198,"라마 ":280,"주광역":200,"러가 ":71,"미가 ":133,"초명은":71,"경찰서":86,"거하여":108,"제주도":96,"교사 ":69,"제조업":81,"전통 ":182,"넓은 ":249,"행한 ":180,"주도 ":249,"행할 ":135,"정이었":81,"장하며":59,"시아의":577,"시아와":80,"시아어":195,"넣어 ":72,"장했다":79,"시아에":205,"的 ":81,"접전을":65,"나타나":260,"나타난":144,"접적으":61,"나타내":437,"나타낸":182,"곡이다":130,"호르몬":66,"시에는":136,"라디미":108,"곡으로":120,"고있다":120,"제이다":258,"스트 ":1046,"체육관":1157,"현에 ":78,"홍보 ":87,"라디오":334,"재하고":117,"제외한":225,"정의한":63,"정의하":162,"제작되":129,"제작된":176,"장하였":134,"제외하":103,"장하여":85,"고유의":69,"정치 ":296,"식에서":64,"스틱 ":69,"교류를":138,"정적인":117,"재하는":271,"태평양":418,"재한다":193,"시에서":270,"체에서":108,"주는 ":678,"토스 ":122,"활동하":431,"활동한":114,"재했던":158,"고이다":107,"관심을":87,"종목에":75,"너지 ":209,"총리대":58,"청으로":59,"총리를":116,"제정되":152,"제정된":104,"국민 ":136,"제임스":214,"활동했":136,"후기 ":150,"죄를 ":91,"무선 ":97,"해협 ":57,"물리적":166,"시외버":130,"족에 ":107,"三之 ":901,"고용노":84,"三乙 ":170,"三乘 ":99,"조에 ":202,"시청 ":67,"줄기는":57,"활동을":468,"형식 ":59,"화물 ":67,"활동에":80,"총선 ":59,"三三 ":1345,"투기 ":72,"네스 ":139,"라로 ":118,"템의 ":139,"三丁 ":581,"고위공":230,"활동이":95,"조와 ":117,"토론토":96,"물리학":446,"전체의":66,"三丘 ":326,"시외전":76,"템은 ":65,"템을 ":168,"문서 ":147,"슈타인":136,"톨레마":65,"무소 ":98,"최대의":181,"三並 ":896,"三人 ":61,"화되어":83,"래된 ":286,"형문화":146,"텍스트":85,"져있다":81,"화되었":81,"쳐진 ":60,"고양시":85,"공식적":231,"시이다":491,"테스트":133,"활동과":61,"식으로":605,"식이다":352,"三亂 ":228,"고양이":81,"시작된":195,"시작되":382,"터이다":170,"三亞 ":422,"체이다":555,"청장은":97,"시이며":121,"스카르":62,"명한 ":534,"신으로":438,"년마다":62,"丁丹 ":61,"피해자":88,"관의 ":2170,"丁並 ":336,"丁乙 ":118,"주관하":162,"체제의":76,"관인 ":84,"제정한":62,"관이 ":275,"丁之 ":446,"신이다":266,"체제에":91,"공주 ":71,"시인이":151,"추기경":60,"과정 ":135,"궤도 ":71,"토마스":59,"최되었":107,"시장에":144,"제주시":75,"전투 ":320,"토리아":108,"丁丘 ":202,"라를 ":184,"주된 ":92,"시작으":155,"丁丁 ":409,"신이며":66,"丁三 ":412,"노동당":62,"丁人 ":68,"조약 ":210,"확률 ":66,"최되는":68,"경제학":303,"화를 ":1060,"모토 ":158,"과의 ":595,"시장을":66,"시장이":77,"체제를":101,"스카이":59,"회는 ":1632,"랜드 ":1148,"톨릭교":272,"구대회":70,"국립 ":152,"청주시":71,"계적인":187,"계적으":250,"시작하":474,"시작한":212,"시작해":80,"시작했":235,"경주시":80,"축구단":96,"시적으":65,"체적인":105,"丁亂 ":154,"체적으":123,"노동부":61,"신이자":75,"국대학":78,"식적으":162,"전체를":82,"제주특":160,"丁亞 ":110,"논리 ":71,"관을 ":436,"식적인":111,"피해를":133,"관은 ":726,"토머스":70,"大之 ":76,"중부 ":251,"공룡이":97,"제하는":65,"정해진":101,"개편되":114,"국내 ":265,"트가 ":377,"관련된":425,"건전한":206,"권과 ":96,"걸친 ":83,"경영 ":97,"大丘 ":63,"之之三":178,"之之丁":97,"之之並":118,"之之之":104,"네스코":91,"네시아":285,"험의 ":58,"정하여":167,"정하였":105,"험을 ":187,"구나 ":75,"고속 ":76,"경에 ":256,"발된 ":162,"결을 ":132,"험이 ":59,"정하지":57,"미로 ":169,"제품이":89,"제품을":86,"화로 ":269,"之丁並":63,"之丁之":91,"之丁三":123,"之丁丁":66,"량과 ":57,"나폴레":142,"거주하":225,"결정 
":73,"결성한":67,"경상남":479,"것이며":93,"최고의":169,"大亞 ":63,"乙亞 ":60,"조직의":65,"조직으":61,"조직을":68,"조직이":140,"것이라":157,"계약 ":89,"최대 ":229,"체의 ":618,"之並之":139,"之並並":158,"之並三":216,"之並丁":69,"之並亞":199,"함한 ":296,"之丙之":157,"논란이":57,"발견한":90,"발견하":88,"체인 ":128,"게오르":73,"之三丘":62,"것이다":1291,"之三三":426,"之三丁":121,"之三之":190,"之三並":196,"합한 ":160,"것으로":1506,"之三亞":92,"래는 ":286,"정하고":231,"격이 ":107,"것에서":61,"체제 ":143,"乙之 ":116,"정하기":94,"乙丘 ":89,"정하는":363,"정한다":104,"격의 ":76,"조지아":231,"반도 ":306,"乙並 ":132,"건축 ":91,"접한다":355,"검역소":69,"종으로":458,"미널이":72,"가하였":118,"가하여":102,"격을 ":325,"乙乙 ":66,"종이다":450,"라남도":362,"之倉 ":121,"체적 ":96,"추고 ":107,"접하며":306,"해체되":61,"향으로":228,"접하고":189,"노르웨":212,"문제에":105,"처에서":63,"전형적":62,"결성되":75,"결성된":112,"축구 ":742,"최근에":153,"족주의":88,"학회 ":138,"조직된":71,"년에 ":3322,"乙丁 ":151,"乙三 ":190,"문제로":78,"최된 ":172,"받는 ":236,"넷을 ":59,"견을 ":57,"문제를":243,"발견되":256,"발견된":167,"교관이":68,"반대 ":73,"거이다":64,"네의 ":71,"가하고":96,"채택되":89,"라도 ":249,"협력을":97,"물이나":96,"네상스":84,"문자를":85,"계승 ":114,"바다 ":113,"주민들":79,"통신사":68,"물이다":498,"문제가":87,"노벨 ":164,"걸쳐 ":536,"효과 ":100,"고리즘":197,"가했다":70,"건이다":462,"교는 ":208,"현상 ":62,"大並三":80,"건으로":192,"주소 ":90,"통신망":69,"했지만":129,"빌헬름":85,"험에 ":74,"나톨리":87,"체스터":79,"大亞之":73,"문이다":346,"정확하":95,"정확한":116,"문으로":196,"가하는":151,"식품부":594,"고려의":79,"무장관":67,"무원이":83,"무원으":224,"행정부":79,"나파소":72,"처음에":146,"천으로":103,"라드 ":121,"네수엘":63,"채택하":69,"가포르":89,"증권 ":60,"시하고":125,"행정동":192,"물에서":66,"뮬레이":141,"년부터":1634,"군과 ":241,"고려시":129,"거에서":68,"행정기":403,"비행기":106,"국군 ":60,"노래이":97,"제하여":61,"터페이":229,"노래를":60,"호를 ":436,"제학자":107,"미디 ":96,"격에 ":63,"거쳐 ":338,"랐다 ":84,"란드 ":706,"죄의 ":57,"못하는":67,"향악단":90,"문에서":86,"행정안":64,"처음으":371,"구글 ":68,"한편 ":73,"라노 ":78,"터키의":77,"민공화":988,"화된 ":280,"민국의":2300,"기》 ":71,"민국에":562,"했으나":251,"스페인":683,"스페이":72,"해진다":88,"행위를":267,"통을 ":190,"교로 ":167,"통은 ":64,"해지는":107,"못하고":66,"네이버":142,"했으며":734,"행으로":73,"통이 ":84,"스펙트":61,"통의 ":259,"고에 ":101,"주민 ":65,"고와 ":63,"並亞三":75,"並亞並":57,"시티 ":116,"학파 ":61,"미국의":1693,"미국이":72,"미국인":172,"노스 ":128,"계승하":58,"농림수":538,"미국에":434,"활동 ":273,"행위에":64,"건축가":79,"스페란":59,"형법 ":64,"호로 ":311,"並三國":84,"건축물":157,"투로 ":104,"並之之":99,"격으로":151,"並之並":167,"並之三":173,"並之丁":59,"농림축":109,"행위이":57,"행정구":264,"並並之":99,"並並亞":75,"並並並":118,"並並三":168,"전투에":151,"並丁三":78,"並丁並":60,"並丁之":93,"계적 ":102,"전투이":155,"강화하":64,"並三丁":85,"초를 ":88,"並三三":352,"並三並":166,"구기관":173,"並三之":182,"확대 ":72,"並三亞":95,"丹之 ":65,"스피드":93,"주도하":91,"화교류":59,"시칠리":59,"식품 ":76,"처음 ":669,"처의 ":89,"丹丁 ":93,"민경제":59,"국과학":73,"라는 ":3491,"민당 ":69,"조약으":67,"경주 ":115,"조약이":138,"민국과":113,"국도 ":142,"라고도":1545,"환경에":149,"並倉 ":90,"해주는":74,"미니 ":62,"통일 ":78,"전통적":268,"전투를":62,"종종 ":128,"환경을":119,"전투로":106,"환경의":58,"비해 ":202,"스포츠":528,"국기는":74,"싱턴 ":94,"터테인":265,"화국에":206,"제학 ":83,"제한 ":74,"천을 ":85,"주도이":258,"회관 ":166,"공식 ":315,"천의 ":86,"공신 ":59,"스프레":81,"화국이":115,"화국의":592,"쳐서 ":141,"스플레":121,"투를 ":79,"민간인":66,"방과 ":134,"착한 ":79,"시카고":77,"물질 ":102,"미국과":130,"논리학":71,"시한 ":262,"三國 ":87,"향이 ":94,"모하고":162,"구는 ":2663,"관리를":117,"국가의":292,"국가이":133,"년을 ":193,"경쟁 ":61,"향의 ":74,"협동조":95,"합중국":79,"국가와":63,"전하였":96,"년이 ":104,"노선 ":97,"하키 ":77,"모하기":104,"래가 ":59,"향을 ":645,"년의 ":319,"식한 ":96,"존재했":183,"존재하":508,"존재한":228,"헤르체":107,"헌법에":75,"관리소":120,"홈구장":160,"화가이":84,"관리사":94,"게임은":64,"게임을":126,"게임으":157,"게임이":455,"게임의":112,"게임에":95,"미는 ":124,"계속되":62,"고비나":106,"시코의":88,"계의 ":803,"경제 ":436,"철의 ":57,"전해지":88,"청에 ":72,"구단 ":75,"목표로":175,"회가 ":352,"게이트":71,"공모함":59,"전했다":59,"之亂 ":185,"스피어":79,"관리자":123,"논리적":58,"널이다":102,"발굴 ":71,"시켰다":116,"화는 ":234,"공무원":718,"之亞 ":205,"관리청":88,"녀의 ":96,"之之 ":481,"화국과":90,"토의 
":187,"국경을":303,"해졌다":113,"스하키":63,"관리하":241,"시픽 ":95,"문신이":114,"구관으":65,"之乙 ":132,"구광역":218,"체에 ":291,"향상과":135,"받고 ":270,"관련이":94,"之丁 ":427,"경우 ":678,"주변 ":114,"계에 ":486,"之三 ":666,"시키는":366,"항으로":97,"시킨다":73,"슷하게":59,"하지만":557,"之丘 ":282,"之且 ":96,"것이었":89,"계열 ":117,"실행 ":66,"졸업하":144,"之並 ":569,"너지를":107,"발과 ":93,"고스 ":58,"국가가":96,"경상북":406,"국가기":78,"노미야":60,"신호 ":86,"전하는":152,"국가대":143,"항이다":64,"개편하":94,"계와 ":138,"초등학":97,"전하고":83,"전하게":65,"관련하":58,"관련한":66,"국가들":151,"무실은":1845,"므로 ":457,"즈가 ":125,"념을 ":152,"실험 ":62,"시험 ":132,"게임기":80,"총리 ":123,"물의 ":727,"경을 ":557,"국가로":150,"경은 ":67,"물이 ":315,"청은 ":173,"조이다":111,"국가를":123,"청소년":337,"체와 ":151,"구개발":124,"관료이":65,"경의 ":225,"국가보":77,"비히 ":100,"받게 ":57,"향상에":76,"공사 ":223,"경이 ":113,"청이 ":90,"통에 ":62,"총독부":113,"문제 ":176,"향상을":130,"무슬림":67,"시키기":187,"청의 ":110,"빅토리":101,"족으로":212,"결승전":133,"시키고":183,"미널 ":87,"구간이":72,"광부 ":1095,"족이다":137,"국가에":207,"향상시":75},"n_words":[4985886,5984351,3260836],"name":"ko"}
\ No newline at end of file
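For reference, each profile removed in this patch is a single-line JSON document of the form {"freq": {...}, "n_words": [...], "name": "<code>"}: the n-gram frequency table consumed by the bundled langdetect package ("ko" above, "lt" below). A minimal sketch of how such a profile can be inspected follows; the on-disk path and the reading of "n_words" as 1-, 2- and 3-gram totals are assumptions, not something this patch defines:

    import io
    import json

    def load_profile(path):
        # Each langdetect profile is one JSON object per file:
        #   "freq"    - n-gram string -> occurrence count
        #   "n_words" - three totals (assumed: 1-, 2- and 3-gram counts)
        #   "name"    - language code, e.g. "ko" or "lt"
        with io.open(path, encoding="utf-8") as fh:
            return json.load(fh)

    # Hypothetical usage; the actual location depends on where the
    # langdetect directory from this tree is installed.
    profile = load_profile("langdetect/profiles/ko")
    print(profile["name"], len(profile["freq"]), sum(profile["n_words"]))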
+++ /dev/null
-{"freq":{"ūdu":237,"ūdo":102,"ūdi":671,"ūda":331,"D":8684,"E":5566,"F":3594,"G":7812,"A":19441,"B":11003,"C":5201,"L":15923,"M":12432,"N":7515,"O":2898,"H":3427,"I":6943,"J":8588,"K":17851,"U":3005,"T":11425,"W":951,"V":11358,"Q":106,"P":20146,"S":16914,"R":10122,"Y":441,"X":1264,"Z":1079,"f":17928,"g":102928,"d":143421,"e":363311,"b":73275,"c":39025,"Fed":229,"a":700709,"n":341806,"o":414082,"l":214716,"m":192290,"j":140065,"k":242084,"h":15703,"i":803156,"w":1450,"v":137211,"Fer":122,"u":266546,"t":305542,"s":495974,"r":352720,"q":282,"p":146221,"z":20490,"y":92511,"x":937,"ūgn":67,"ūgi":136,"":175,"²":445,"Fil":257,"Fin":83,"í":134,"é":265,"ä":96,"á":197,"à":65,"ü":86,"ú":64,"ö":70,"ó":158,"ė":94567,"ę":9237,"ā":164,"ą":20880,"Č":1561,"č":23545,"ī":70,"Į":732,"į":24134,"ō":147,"ų":90375,"Ž":2451,"ž":44691,"Š":7077,"š":70424,"Ū":76,"ū":25506,"Fal":71,"Far":66,"ūbr":365,"Fab":73,"Eri":121,"Est":230,"Eti":120,"Eur":1489,"Ekv":92,"Ele":243,"Eko":63,"μ":79,"ν":128,"ο":198,"ι":119,"κ":65,"λ":109,"ε":95,"α":204,"Ent":76,"σ":88,"ς":168,"ρ":114,"π":65,"τ":94,"ь":90," l":18168," m":39544," n":26364,"я":147," o":7927," h":2325," i":50039," j":10300,"ы":100," k":69096," d":33767," e":11412,"ц":67," f":7201,"ч":71," g":24899,"р":400,"с":408," a":48847,"т":349," b":21159,"у":184," c":5176," y":9991," x":101," z":358," u":9812," t":37056," w":140," v":43820," p":74653," s":59667," r":27511," J":8568," K":17825," H":3382,"Gel":228," I":6930," N":7489," O":2834,"Geg":68," L":15878," M":12380," B":10949," C":5111,"Ged":147,"С":93," A":19385," F":3572," G":7762," D":8642," E":5548,"л":327,"к":381," Z":1072,"й":180," Y":439,"и":579," X":1253,"п":122,"о":721,"н":471,"м":156,"г":113," S":16766,"в":327,"Ger":157," R":10064,"б":111," Q":106,"а":757," P":20081,"з":104,"Geo":114," W":927,"Gen":295," V":11305,"е":519," U":2992,"д":172," T":11383," ą":118," č":811," Č":1561," ė":303,"Gla":64,"Gib":75,"Gin":94,"Gil":90,"Gir":131," ž":11466," Ž":2446," ų":74," Ū":76," ū":618," Š":7067," š":19822," Į":722," į":17189,"šų ":222,"Gan":127,"Gal":319,"Gam":189,"Gaj":89,"Gau":103,"Gar":244,"Gai":76,"Gab":77,"ي":99,"ل":106,"م":65,"ن":73,"ب":63,"ا":149,"ر":65,"šūn":177,"Flo":97,"Fra":186,"Fri":67,"Fre":105,"A ":1074,"For":525,"F ":250,"Da":2187,"Cu":139,"Cy":71,"Cl":158,"Co":831,"Cr":212,"Ce":857,"Ch":1267,"Ci":314,"G ":220,"Ed":144,"Dv":187,"Du":605,"Dz":157,"Dy":151,"Do":667,"Dn":85,"Dr":666,"De":900,"Di":2059,"Fe":543,"H ":197,"Fa":428,"Eu":1613,"Ev":98,"Ex":73,"Er":363,"Et":189,"Es":386,"En":323,"Em":143,"Ep":87,"Ei":125,"El":573,"Ek":234,"Eg":301,"Ge":1133,"Ga":1414,"I ":1579,"Fu":220,"Fr":454,"Fo":751,"Fl":218,"Fi":559,"B ":521," С":93,"II ":866,"C ":1065,"Av":215,"Au":2682,"Ar":1944,"At":910,"As":631,"D ":380,"Ba":3819,"Az":731,"Af":806,"Ag":282,"Ab":409,"Ac":198,"Ad":552,"Am":1804,"An":2621,"Ap":788,"Ai":520,"Aj":72,"Ak":583,"Al":1985,"Bu":1123,"Br":1448,"Ca":774,"E ":242,"Bi":838,"Be":1548,"Bo":1206,"Bl":293,"Him":100,"Hip":99,"Kv":191,"Ku":1466,"Ky":76,"Kn":86,"Kl":830,"Kr":1957,"Ko":2668,"Le":2735,"Li":7587,"N ":208,"La":2652,"Lu":489,"Ly":250,"Lo":724,"Dū":73,"Me":1902,"Dž":420,"Mi":1792,"O ":471,"Ma":4741,"My":151,"Mu":533,"Mo":1998,"Ni":720,"Ež":273,"Ne":1803,"Na":1866,"P ":639,"Hel":151,"Ny":218,"Nr":561,"Nu":901,"No":779,"Ok":221,"Ol":265,"Om":89,"On":141,"Ją":106,"Oc":104,"Od":117,"Of":81,"Hen":73,"Ob":214,"Her":326,"Gi":594,"Gl":239,"Gr":2248,"Go":422,"Gu":450,"Gv":487,"Gy":327,"J ":126,"Ha":879,"He":851,"Hi":517,"Ho":543,"Hu":227,"Hy":110,"K 
":521,"Ib":64,"Id":102,"ा":68,"Ig":304,"Im":227,"In":1584,"Ik":112,"Il":691,"Aš":111,"्":69,"Iv":75,"Is":624,"It":425,"Ir":401,"Ja":1783,"L ":366,"Dė":112,"Iz":149,"Ji":1102,"Je":435,"Jo":1434,"Bū":168,"Ju":1787,"Hal":105,"Ka":7199,"M ":415,"Han":93,"Ham":121,"Har":160,"Ki":1470,"Hav":88,"Ke":841,"Us":82,"Mū":111,"Ut":266,"Ur":330,"Uo":118,"Up":176,"Un":265,"Uk":462,"Ul":82,"Ug":88,"W ":71,"Ty":98,"Tv":72,"Gyv":237,"Tu":954,"Tr":1269,"To":1061,"Th":580,"Ti":954,"Te":1633,"Ta":3988,"V ":1768,"Sy":113,"St":1651,"Kū":349,"Sv":277,"Su":2454,"Wo":154,"Wi":274,"Wh":63,"Wa":211,"We":128,"Vy":520,"Vo":1251,"Vi":4324,"Vl":71,"X ":603,"Va":2935,"Ve":1368,"Uz":69,"Gva":257,"Gvi":230,"Iš":781,"Jį":73,"Pu":558,"Pr":4041,"Ps":108,"Kė":213,"S ":1115,"Pe":1585,"Gua":94,"ėžt":110,"Pa":7638,"Gud":68,"Pl":1104,"Po":1535,"Pi":2755,"Ph":112,"ėži":636,"Os":252,"Ot":63,"Op":146,"Or":562,"Jė":115,"R ":1502,"Se":1883,"Sc":300,"Si":1578,"Sh":180,"Sn":82,"Sm":286,"Sl":299,"Sk":1024,"Sr":63,"Sp":553,"So":1804,"Ru":2114,"Jū":181,"Ry":867,"Mė":156,"U ":260,"Jų":127,"Sa":3106,"Re":2289,"Ri":771,"Ro":1443,"Lė":71,"T ":426,"Ra":1932,"Gre":184,"Gri":532,"Gra":983,"Vė":216,"b ":435,"Už":349,"Gru":297,"Gro":162,"a ":85927,"Sė":67,"Yo":68,"Yr":183,"Z ":96,"Są":373,"Gol":68,"Sū":95,"Tė":65,"Za":514,"Ze":220,"Zi":68,"Zo":74,"Rū":230,"aė":112,"i ":72374,"fy":103,"gd":980,"ge":8770,"ga":24984,"Inf":65,"fl":318,"fg":67,"ff":102,"fi":4585,"fr":1236,"ač":2434,"fu":1782,"ft":398,"fo":4629,"Int":267,"Ins":77,"j ":778,"bę":413,"bė":6516,"gy":5384,"gz":596,"he":3549,"ha":2884,"gn":1795,"gm":391,"gl":2484,"bą":398,"gi":19462,"gh":277,"gv":1056,"gu":4898,"gt":1721,"gs":555,"gr":11904,"gp":278,"go":7511,"du":8502,"dv":2782,"dy":7581,"dz":385,"g ":2674,"ea":3623,"eb":2049,"ec":2366,"ed":9131,"de":15402,"dg":238,"Ilg":538,"di":31560,"dh":99,"dk":208,"dm":1790,"dl":210,"do":13968,"dn":150,"ds":320,"dr":6349,"ew":210,"ex":138,"eu":1022,"ev":4073,"ey":228,"ez":1754,"fa":1478,"h ":675,"Ind":876,"fe":2397,"eh":155,"eg":6969,"ef":1143,"ee":377,"el":20835,"ek":14743,"ej":2777,"ei":27949,"ep":4431,"Imp":87,"eo":2521,"en":55954,"em":13103,"et":36257,"es":24051,"er":37980,"ca":1365,"bz":182,"e ":76709,"by":382,"bs":381,"br":2910,"bu":7186,"bt":432,"bn":67,"bo":6948,"bj":1259,"bl":3193,"bi":8863,"bd":129,"be":9419,"da":31040,"f ":470,"cy":129,"cu":428,"ct":550,"cr":288,"co":1164,"cm":472,"ck":632,"cl":149,"ci":18836,"ch":6035,"ce":6724,"cc":125,"Iki":75,"c ":600,"az":2975,"ay":393,"ba":18685,"d ":4075,"at":26947,"as":91731,"ar":60917,"ax":94,"aw":113,"av":20318,"au":60068,"ak":21549,"al":70175,"ai":71518,"aj":9247,"ao":400,"ap":19804,"am":34880,"an":55907,"ac":11205,"ad":17624,"aa":279,"ab":6312,"ag":11672,"ah":654,"ae":3599,"af":1358,"nu":16862,"nt":40253,"ns":5838,"ič":2386,"nr":387,"np":87,"no":25328,"nn":299,"iė":81,"fų":79,"nz":353,"ny":7794,"nv":1496,"oe":521,"of":2043,"oc":3010,"od":8826,"oa":712,"ob":3847,"om":18799,"on":30734,"ok":15352,"ol":18423,"gš":230,"oi":681,"oj":37495,"ją":2263,"og":8954,"oh":445,"ot":12743,"m²":434,"os":91367,"gū":892,"ov":9611,"ou":825,"op":6292,"oo":622,"or":24858,"jė":931,"gų":1741,"ję":663,"r ":33992,"ox":93,"ow":356,"oz":1228,"oy":145,"pd":423,"pe":8966,"Ign":235,"gž":880,"pg":158,"pa":38606,"pc":152,"pl":7261,"pm":79,"pn":207,"po":11055,"ph":742,"pi":24691,"ką":1329,"pj":353,"pk":273,"lo":17284,"ln":7253,"lm":1772,"ll":1349,"ls":6461,"dū":239,"lp":828,"lv":2437,"lu":5702,"lt":6648,"dų":1603,"lz":63,"ly":8935,"gę":183,"gė":2321,"o 
":113444,"md":246,"ma":41541,"mb":3140,"mg":95,"dž":9497,"me":27128,"mf":234,"mk":64,"ml":224,"mi":27912,"eš":4905,"mn":423,"mm":415,"mp":5761,"mo":36514,"mt":1706,"ms":5086,"mu":8277,"gį":95,"mz":185,"my":1666,"hė":71,"p ":5826,"na":39063,"nb":286,"nc":4749,"nd":18751,"ne":20163,"nf":1054,"ež":4639,"ng":14841,"nh":168,"ni":75394,"ią":1524,"nj":255,"nk":20248,"nl":177,"nm":383,"jy":130,"dį":460,"ju":6480,"eč":2839,"bū":3463,"jo":44031,"ki":36060,"kh":180,"kg":90,"ke":8017,"kd":1358,"kc":1366,"ka":55987,"m ":17519,"bų":1576,"ky":6899,"ks":8217,"cū":948,"kt":10687,"ku":20625,"eį":147,"kv":1386,"ko":29827,"kr":12212,"kl":14764,"km":7611,"kn":695,"li":63975,"lh":105,"lk":4623,"lj":376,"gą":598,"le":15356,"ld":6869,"lg":3321,"lf":324,"la":33178,"lc":275,"lb":5099,"fė":82,"cų":73,"n ":4899,"hr":575,"hs":179,"ht":431,"hu":755,"hi":2459,"hn":698,"ho":2226,"hl":151,"hm":207,"id":19607,"ic":4825,"ib":3663,"ia":66700,"ih":166,"ig":7460,"if":2042,"ie":61163,"hy":403,"k ":2911,"cė":232,"ir":43336,"is":76128,"it":24630,"iu":21162,"iv":9558,"ix":156,"aš":6163,"ii":607,"dą":519,"ij":49065,"ik":38607,"il":20049,"im":33709,"in":120546,"io":45938,"ip":8649,"je":37765,"až":6015,"ji":8060,"iz":7107,"dė":5783,"dę":233,"l ":5566,"ja":28492,"są":2757,"xi":80,"pš":499,"sč":2453,"pū":264,"rį":1236,"ww":153,"rę":1878,"z ":658,"rė":10366,"ož":891,"šį ":141,"nž":291,"oš":750,"wi":168,"rą":1320,"wn":67,"wo":98,"rč":361,"ws":118,"vy":6021,"nų":7287,"vz":413,"y ":1490,"wa":302,"we":130,"vl":94,"vk":143,"nš":207,"vi":38809,"mž":705,"vu":2039,"pį":89,"vr":219,"vs":237,"vn":98,"vo":18294,"pė":6097,"mų":3539,"uz":1745,"ux":118,"uv":14825,"ve":15404,"va":45949,"pę":314,"x ":613,"mš":101,"ui":3816,"uj":5784,"pą":197,"uk":12269,"ul":12572,"ue":864,"uf":303,"ėšr":96,"ug":10980,"lž":345,"uh":117,"ur":34041,"pč":97,"us":51555,"mū":487,"ut":11996,"um":9287,"un":10381,"uo":35316,"up":8302,"ty":13326,"lų":4078,"tz":73,"nį":1610,"tu":28297,"tt":447,"tw":77,"tv":4512,"ub":4404,"ua":1611,"ud":10289,"uc":1769,"w ":260,"to":36142,"tn":718,"tm":1109,"tl":1570,"ts":3686,"lū":569,"tr":20341,"oč":849,"tp":243,"tg":277,"tf":256,"te":29101,"tk":518,"tj":104,"lš":275,"ti":64800,"th":1088,"v ":492,"nę":1600,"kų":5621,"nė":25365,"tb":1204,"tc":76,"ta":61331,"su":18269,"sv":2479,"kū":2551,"ss":694,"st":45024,"sy":1283,"sz":67,"sl":4978,"sk":13287,"sn":3503,"sm":4814,"sp":7479,"so":8499,"sr":2349,"nč":4435,"sd":318,"sc":661,"sf":460,"se":19656,"sh":403,"sj":177,"ną":1594,"kš":7202,"si":47811,"rz":154,"jų":7476,"mė":4474,"u ":18088,"mę":301,"sa":23030,"sb":164,"mč":71,"rr":385,"jū":2283,"rs":5538,"rt":15475,"lį":874,"ru":13317,"rv":2004,"ry":13171,"rp":4617,"ro":31644,"rn":3965,"rm":7673,"rl":1174,"rk":4790,"mą":3705,"rj":224,"ri":79105,"rh":115,"rg":5620,"iž":716,"rf":267,"re":18683,"rd":4525,"rc":1831,"rb":9032,"ra":68430,"t ":8619,"lę":789,"ių":36065,"lė":8350,"kį":293,"qu":225,"iū":2482,"lč":267,"lą":1161,"iš":22230,"kė":5197,"s ":278368,"kę":593,"px":100,"py":2379,"pt":3572,"pu":6072,"jį":548,"pv":639,"pp":160,"kč":96,"pr":22301,"ps":4103,"IX ":219,"zę":74,"zė":693,"už":4062,"yč":2372,"vū":491,"vų":1508,"uš":1079,"tž":178,"IV 
":137,"tš":139,"sų":1592,"vė":5454,"vę":713,"tų":12691,"tū":4346,"vą":321,"rž":1586,"rų":3959,"sū":452,"vč":186,"Hor":71,"zg":207,"zi":7116,"rš":2441,"zb":85,"zd":1810,"ze":985,"tę":1071,"za":3048,"Hom":79,"yz":303,"Hon":129,"tė":5769,"Hol":87,"zy":69,"rū":3551,"zr":101,"uč":735,"zu":1165,"tį":886,"zo":1897,"zn":153,"zk":67,"zm":1647,"zl":136,"pž":94,"yg":3318,"ye":94,"yc":258,"yd":2201,"ya":257,"yb":11044,"sę":282,"sė":3998,"pų":330,"yv":6788,"sį":230,"yt":11083,"ys":9392,"yr":12733,"yp":1504,"yo":203,"yn":5647,"ym":5531,"yl":2089,"yk":7720,"yj":6842,"tą":2285,"yi":98,"Arg":584,"Are":73,"Arc":130,"Ara":258,"Arm":184,"Ark":155,"Ari":111,"šči":2647,"Apo":92,"Atl":385,"Asu":66,"Ast":264,"Art":193,"Avi":81,"Aut":259,"Aus":646,"Auk":724,"Aug":852,"Ats":131,"yž":529,"Aze":82,"zū":96,"Azi":595,"zų":230,"yš":830,"šė ":158,"šės":100,"šėj":66,"Bag":85,"Bah":87,"Bai":88,"Bak":129,"Bal":1866,"Ban":272,"Bab":93,"Bad":64,"Bar":458,"Bat":90,"Bas":130,"Bau":99,"Abi":168,"Ada":67,"Ach":87,"Adm":203,"Adr":71,"Aga":104,"Afr":719,"šą ":73,"Air":332,"Aka":93,"Akm":194,"Ala":187,"Alb":193,"Alg":65,"Ali":208,"Ale":201,"Alt":143,"Aly":220,"Alk":99,"Alp":174,"Ame":1274,"Ama":241,"Ang":426,"Ank":145,"Ana":165,"And":549,"Any":154,"Ant":913,"Api":205,"Apa":86,"Buv":78,"But":90,"Bul":113,"Bur":231,"Bud":102,"Bue":112,"Bui":103,"Bru":71,"² ":443,"Cac":84,"Cad":64,"Cal":67,"Cam":66,"Car":189,"Can":84,"Cap":73,"Bet":72,"Ber":374,"Ben":312,"Bel":409,"Baž":77,"Bil":78,"Bir":299,"Bla":117,"Bre":103,"Bra":609,"Bro":169,"Bri":429,"Bol":312,"Bon":72,"Bor":174,"Bos":142,"Bot":70,"Dei":68,"Del":74,"Dem":131,"Dep":64,"Deg":63,"Dam":137,"Dan":260,"Dar":403,"Dau":478,"Dab":105,"Dai":135,"Dal":128,"Cho":100,"Chr":185,"Che":286,"Chi":117,"Chu":119,"Cit":67,"Cen":563,"Cer":118,"Cha":378,"Cro":65,"DP ":99,"Cor":127,"Com":218,"Col":154,"Con":133,"アアア":149,"FA ":185,"ón":71,"Dze":98,"Egi":189,"é ":79,"Dni":78,"Daž":276,"ą ":16360,"Dia":66,"Did":1104,"Dis":102,"Dir":98,"Din":76,"Die":275,"Dub":146,"Dun":110,"Dvi":80,"Dus":63,"Dva":68,"Dru":132,"Dri":158,"Dra":241,"Dod":83,"ES ":63,"Don":122,"Dom":99,"Nem":374,"Nev":213,"Neu":81,"Net":114,"ęs":3467,"Ner":231,"Nep":196,"ėz":158,"ę ":5692,"ėc":117,"ėd":1595,"ėl":4129,"ėk":704,"ėn":2911,"ėm":2033,"ėg":1127,"ėj":12119,"ėt":2531,"ės":37968,"ėv":406,"ėp":181,"ėr":671,"čų":69,"Nat":155,"Nau":859,"−":522,"Nig":114,"Niu":165,"ūšy":191,"Nik":109,"ūši":1752,"ėč":63,"Eže":263,"New":68,"ėž":954,"ėš":258,"Nar":128,"Nam":132,"Nag":68,"Nac":163,"Nuo":622,"ąl":250,"ąj":1470,"ąm":118,"ąn":83,"ąs":1224,"ąr":696,"ąv":409,"Nyd":143,"Či":814,"Če":428,"Ča":214,"či":21375,"čk":154,"če":803,"ča":249,"č ":409,"ąž":190,"čo":121,"ču":90,"ė ":26547,"ųjų":1526,"OS ":98,"Nr ":561,"Nov":113,"Nor":386,"Not":108,"čė":116,"įž":184,"Ją ":106,"Oke":81,"Och":90,"Obe":83,"Į ":110,"į ":15271,"Įe":85,"Įk":141,"įg":465,"įl":469,"įk":1035,"įm":672,"įd":140,"įe":122,"Oli":102,"Įs":221,"įr":905,"įp":182,"įv":1510,"įs":2207,"įt":836,"Ori":91,"Org":75,"Oro":67,"Ost":83," −":501,"Po ":75,"Plu":194,"š 
":7298,"Plo":267,"Pli":66,"Ple":71,"Pla":439,"Pin":159,"Pil":450,"Paš":81,"Pir":463,"Pie":1313,"Pic":67,"šį":142,"Per":807,"Pet":236,"Pen":132,"Pel":183,"šč":2693,"šą":98,"šė":408,"Pat":292,"Pas":767,"Par":1014,"Pav":466,"Pau":206,"Pag":484,"Pab":107,"Pad":139,"Pan":697,"Pam":122,"Pap":991,"Paj":94,"Pal":1658,"Pak":297,"šg":221,"še":5219,"šd":199,"ša":5331,"šb":74,"šo":2052,"šp":366,"šm":981,"šn":613,"šk":10936,"šl":1712,"ši":16792,"šv":1932,"šu":1367,"št":7877,"šs":1233,"šr":983,"šy":1194,"Še":341,"Ša":886,"Šl":99,"Šo":70,"Ši":3391,"Šk":111,"Jėz":95,"Šu":220,"Št":155,"Šv":1543,"Šr":75,"Prū":117,"Kėd":210,"ō ":76,"Pun":96,"Pus":118,"Pue":76,"Pro":479,"Pri":882,"Pre":234,"Jį ":73,"Pra":2269,"Pod":91,"Pol":415,"Pon":120,"Pot":100,"Pos":67,"Pop":119,"Por":237,"žr":232,"žs":417,"žt":693,"žu":2236,"žn":2332,"žo":2063,"žp":171,"žv":1312,"žy":1527,"žb":100,"že":6838,"žd":1624,"ža":3699,"žk":289,"žm":1670,"žl":158,"ži":16869,"Žm":91,"Žy":116,"Žu":122,"Žv":178,"Ža":482,"Ži":399,"Že":955,"RS ":294,"Išs":132,"Išt":197,"ž ":1080,"žų":376,"Iš ":185,"žį":113,"žą":94,"žė":568,"SA ":111,"ūči":238,"Rad":254,"Rai":78,"Raj":66,"Rag":90,"Ram":470,"Ran":143,"ū ":138,"šų":238,"šū":280,"ūg":553,"ūd":1733,"ūb":446,"ūz":1060,"ūs":1429,"ūt":1461,"ūv":180,"ūp":172,"ūr":9049,"ūk":1535,"ūl":283,"ūm":598,"ūn":4467,"ų ":88718,"ūč":245,"ųj":1568,"ūš":2032,"ūž":93,"Ita":399,"Isp":323,"Isl":135,"įpr":139,"įma":63,"įmo":598,"Ira":230,"įsi":807,"įsk":80,"įkū":191,"įst":1175,"įta":351,"įte":278,"įtr":77,"įtv":117,"įra":525,"įre":301,"įro":74,"Izr":81,"įga":117,"įgy":242,"Dėl":85,"įei":119,"įdu":68,"įku":691,"Jav":80,"Jau":108,"Jas":71,"įla":456,"Jap":569,"Jan":213,"Jam":221,"Jak":178,"Jel":102,"Jer":74,"Jis":502,"Jie":135,"Ji ":425,"あ":83,"Jo ":148,"Įst":114,"Įsi":88,"Jos":270,"Jon":566,"ア":227,"Joh":97,"Būd":72,"Įku":124,"Jug":66,"Jup":99,"Juo":530,"Jur":232,"Juk":63,"Jun":554,"Įei":85,"Kad":98,"Kab":68,"Kai":1116,"Kam":412,"Kal":1098,"Kap":211,"Kan":625,"Kau":985,"Kat":242,"Kas":171,"Kar":1509,"Kaz":340,"国 ":82,"Ker":168,"Ket":93,"Ken":88,"Kel":250,"Kir":183,"Kit":124,"Kin":537,"Kip":63,"Kie":93,"Kil":126,"Kli":79,"Kle":64,"Kla":523,"Klu":106,"Kon":597,"Kom":332,"Kol":450,"Kos":140,"Kor":624,"Kop":80,"Kov":100,"LR ":167,"Kre":575,"Kra":414,"Kri":437,"Kro":173,"Kru":166,"Kry":125,"Kub":74,"Kul":193,"Kun":174,"Kup":188,"Kur":371,"Kva":100,"Les":78,"Lep":69,"Leo":93,"Len":1971,"Lei":123,"Lau":223,"Laz":170,"MC ":504,"Lai":279,"Lat":571,"Lao":67,"Lap":196,"Lam":138,"Lan":215,"Lab":226,"Kuž":106,"La ":159,"Lib":176,"Lia":138,"Lie":6085,"Lig":81,"Lim":100,"Lin":276,"Lit":89,"Liu":198,"Liv":74,"Luk":93,"Lua":75,"Los":103,"Lot":96,"Lon":143,"ūgš":200,"ūs ":568,"ūpi":97,"ūra":1708,"ūse":158,"ūry":183,"ūru":361,"ūro":3121,"ūri":1981,"ūks":471,"ūka":130,"ūki":544,"ūkl":137,"ūma":254,"ūna":1137,"ūmi":118,"ūdž":157,"Lyg":63,"ūnu":353,"ūni":1251,"ūne":67,"ūno":452,"ūrė":579,"ūrę":259,"Mek":335,"Men":159,"Mel":164,"Mes":186,"Mer":401,"Met":158,"Med":222,"Mez":64,"ūzi":829,"Džo":137,"ūrų":466,"Dža":76,"Dži":72,"Dže":84,"ūkš":76,"ūsi":141,"ūną":68,"ūst":200,"ūnė":216,"ūta":74,"ūte":79,"Man":463,"Mal":406,"ūti":914,"Mar":1387,"Mas":288,"Mag":384,"Mad":147,"Maj":79,"Mak":226,"Mah":72,"Mai":142,"ūty":109,"Mac":86,"ūmų":111,"ūvi":114,"Mau":95,"Mat":219,"ūnų":861,"ūrą":276,"Mok":227,"Mol":370,"Mon":426,"Mos":65,"Mor":209,"Mot":313,"Moz":127,"ūzų":144,"NR ":79,"Mik":194,"Mie":142,"Mia":99,"Mic":168,"Mit":95,"Mir":79,"Mis":99,"Mil":131,"Min":395,"ūtų":160,"ūsų":232,"Maž":500,"Mur":145,"Mus":128,"Miš":117,"Sąj":308,"XX ":204,"XV 
":71,"кий":66,"Wor":95,"Wil":72,"Win":96,"XI ":73,"War":82,"Vyr":158,"Vyt":150,"Viž":91,"Vok":795,"Vol":265,"Viš":70,"Vis":391,"Vit":100,"čkų":69,"Zar":256,"Zel":141,"之":194,"並":122,"三":326,"丁":115,"Yra":183,"三三":72,"Sve":90,"Sva":109,"Sur":111,"Sus":251,"Suv":802,"Sum":113,"Suk":74,"Sup":79,"Suo":198,"Sun":92,"Sud":274,"Suc":101,"Sub":64,"Str":249,"Stu":148,"Sti":184,"Sto":157,"Sta":653,"Ste":219,"Ten":81,"Tei":320,"Tel":387,"Tek":128,"Tam":147,"Tan":110,"Tar":749,"Tau":535,"Tai":1790,"Tak":96,"Tal":105,"Tad":89,"Ski":160,"Skr":105,"Sku":184,"Ska":362,"Sim":140,"Sil":118,"Sir":117,"Sin":192,"Sid":156,"Sie":159,"Sib":270,"Sic":64,"Ser":315,"Sep":68,"Sen":642,"Sel":68,"Sem":92,"Sei":374,"TV ":98,"Kūn":234,"国":86,"Spa":162,"TS ":71,"Spi":75,"Spe":132,"Spo":74,"Sof":72,"Sok":963,"Soc":156,"Sol":99,"Son":84,"Sos":108,"Sky":69,"Slo":158,"Smi":70,"Sma":84," 三":86,"Ryt":687,"Ryg":105,"Jų ":127,"Jūr":171,"Rus":1573,"Rug":83,"Rud":101,"Rum":121,"Sak":119,"Sam":191,"Sal":563,"Sac":85,"Sco":107,"Sav":191,"Sat":80,"Sau":518,"Sar":175,"San":807,"Mėn":80,"Rač":79,"SI ":77,"Res":1390,"Rio":84,"Rin":77,"Rib":131,"Rie":98,"Ras":159,"Rau":279,"Rec":68,"Red":71,"Rei":156,"Reg":116,"Ren":82,"Rok":173,"Rob":64,"Rod":82,"SR ":568,"Ros":182,"Rom":573,"SS ":87,"čų ":69," 之":78,"SO ":120,"Vai":313,"Vad":65,"Vel":220,"Ven":367,"Vei":167,"ски":73,"Vas":102,"Van":212,"Val":695,"Vak":682,"Var":578,"Vaš":70,"Vid":634,"Vie":542,"Vir":184,"Vil":1774,"Vik":201,"Vin":109,"Ver":344,"Ves":109,"Ukr":322,"Ukm":131,"Uni":208,"Uru":89,"Ura":110,"Ute":210,"Upė":66,"VD ":82,"VI ":165,"Ter":267,"Tet":74,"Tes":113,"The":467,"Tib":113,"Tie":67,"Tik":149,"Til":74,"Tim":101,"Tin":67,"Tir":87,"Tiu":72,"Tor":126,"Tok":232,"Tol":111,"Tom":115,"Ton":98,"Tru":94,"Tro":116,"Tri":330,"Tre":156,"Tra":520,"Tur":537,"Tun":69,"šga":146,"ši ":168,"šel":123,"šer":281,"šei":4330,"ša ":145,"še ":72,"šas":283,"šar":241,"šau":451,"šac":68,"šai":175,"šak":893,"šal":2584,"šam":125,"šan":259,"Švč":176,"Šve":932,"Švi":88,"što":898,"štr":99,"šte":875,"šti":1747,"škų":474,"šta":1289,"šun":139,"šuo":95,"šul":170,"šum":70,"šus":224,"šuj":65,"šuk":112,"štu":1243,"šty":912,"švi":810,"šve":671,"šva":235,"šut":236,"šuv":83,"švy":117,"špa":109,"šką":187,"špl":90,"šon":165,"šor":222,"šos":190,"šiš":128,"škė":457,"šre":107,"šra":138,"šri":577,"šiū":137,"šių":982,"šsi":699,"šsp":63,"šsk":346,"šru":91,"šsa":81,"šmė":100,"šku":531,"šky":77,"ško":2173,"šle":715,"šli":309,"šla":532,"šme":157,"šeš":251,"šmi":172,"šo ":946,"šma":81,"šmo":366,"šni":181,"šią":67,"šne":70,"šna":94,"šny":138,"šno":80,"šoj":128,"šok":166,"šom":149,"šia":7811,"šie":268,"šin":2480,"šio":1156,"šil":344,"šim":653,"šik":111,"šiu":646,"šir":121,"šis":1523,"šdė":102,"ška":2905,"ški":3659,"ške":337,"štų":188,"štė":332,"štį":112,"šyt":666,"šys":236,"šym":121,"štą":94,"ęs ":3114,"ęsi":184,"ęst":139,"bje":1237,"baž":702,"bis":206,"bit":262,"biu":413,"bio":445,"bip":90,"bir":575,"bik":90,"bil":1165,"bim":181,"bin":2312,"bij":590,"bo ":831,"blo":199,"ble":254,"bli":2467,"bla":154,"bod":66,"bok":148,"bol":1634,"boj":747,"bež":91,"biš":132,"bon":209,"bom":266,"bor":294,"bot":239,"bos":2270,"be ":431,"bam":151,"ban":1542,"bak":168,"bal":1952,"bai":1573,"baj":127,"bac":144,"bad":71,"baz":180,"bau":383,"bat":206,"bas":1718,"bar":1644,"bda":77,"bi ":151,"bei":2412,"ber":929,"ben":2915,"bel":329,"bek":139,"bev":245,"bes":726,"bet":875,"bia":875,"bib":123,"ėdo":412,"bie":263,"ėdi":261,"ėda":433,"buč":70,"bzd":176,"− ":502,"ca ":297,"buž":82,"car":324,"cas":107,"cat":74,"can":137,"cac":140,"cal":88,"ce 
":305,"bių":787,"bri":867,"bro":454,"bra":835,"bre":106,"bry":134,"bu ":188,"bru":157,"bso":103,"bse":75,"bta":77,"bst":94,"bti":224,"btr":68,"buo":284,"bur":719,"bul":328,"buk":109,"bum":669,"bui":94,"bud":276,"buv":3564,"but":190,"bus":430,"brė":285,"byl":146,"bys":99,"abų":128,"aka":10783,"am ":1782,"ake":779,"akc":437,"aki":1290,"aji":120,"ėza":80,"ajo":4851,"aju":423,"aiz":904,"al ":2864,"adė":694,"aja":2169,"aje":126,"aik":6012,"ail":788,"aim":3610,"ain":2341,"aip":4816,"air":2272,"ais":9853,"ait":1788,"aiv":2833,"ėsč":100,"ak ":92,"aig":1833,"aid":2977,"aib":279,"ahi":72,"aho":88,"ėvi":102,"aj ":639,"abė":174,"agv":331,"agy":177,"ėva":79,"aha":210,"agl":74,"agm":87,"agi":657,"ėnų":1107,"agr":3158,"agu":296,"agn":741,"ago":977,"anu":591,"anz":118,"any":235,"ano":3129,"ann":125,"anm":118,"ant":17439,"ans":1524,"aič":1223,"anr":199,"ane":1374,"ang":3590,"ani":4762,"anj":123,"ank":4229,"ap ":76,"ana":4088,"anc":1441,"and":6903,"amu":630,"ėtų":311,"amt":459,"amy":484,"amz":131,"amm":86,"amo":3569,"amp":1185,"ams":2382,"ami":5086,"adž":530,"ame":5511,"amb":1223,"ama":10174,"ao ":130,"adų":80,"agė":263,"aly":3612,"alv":1945,"alu":1766,"alt":2672,"als":5664,"alp":263,"alo":3517,"aln":4883,"alm":300,"all":289,"alk":1267,"alg":429,"ali":17593,"alc":176,"ald":5080,"ale":3230,"Šan":75,"ala":5633,"alb":4521,"Šal":318,"Šak":157,"an ":1340,"Šau":91,"ėtį":118,"akv":76,"aky":230,"aks":329,"akr":1125,"Šar":99,"aku":724,"akt":1972,"ako":1771,"akn":187,"akm":413,"akl":356,"aba":2740,"abe":207,"abi":1011,"abl":158,"abo":330,"abr":414,"abs":143,"abu":396,"abz":181,"ae ":3025,"aca":72,"ad ":1165,"ac ":144,"ab ":182,"afo":102,"afr":63,"aft":132,"afi":672,"ai ":28957,"aga":3563,"age":494,"afy":94,"aen":65,"ael":136,"aei":134,"afa":115,"ado":1953,"adr":251,"adm":1262,"adi":7918,"ade":803,"ady":144,"adu":285,"adv":445,"ack":137,"aci":7737,"ach":900,"ace":1807,"ada":1767,"act":213,"azm":131,"azo":425,"azi":1049,"arš":379,"azl":78,"auč":212,"arū":95,"atė":313,"aze":155,"azg":73,"aza":391,"azd":314,"ėcė":111,"arų":1990,"arž":560,"apš":64,"asč":85,"arė":673,"arę":105,"atą":166,"ėgo":219,"ėgi":261,"apų":68,"asė":868,"ėga":161,"avų":296,"ėkt":249,"Šta":114,"ėn ":149,"Šv ":293,"ėla":197,"ėli":1870,"auž":306,"ba ":7831,"azė":203,"ėly":383,"ėgė":107,"Šud":88,"ėme":99,"ėdž":122,"ėmi":1028,"ėms":611,"atū":905,"ėja":1671,"ėl ":1089,"ėdė":151,"ėji":1633,"ėje":5476,"atš":113,"asų":302,"ėjo":1675,"avė":225,"avę":277,"ėju":616,"atž":107,"ėki":72,"auš":245,"atų":809,"ėkl":135,"ėkm":115,"alč":210,"alė":817,"at ":1471,"ėpa":71,"arg":513,"aiž":139,"are":1358,"ard":2315,"arc":907,"arb":6774,"ara":6684,"arp":3635,"aro":3645,"arn":1263,"ėgų":151,"arm":594,"arl":418,"ark":2565,"amą":414,"ėję":513,"arj":115,"ari":8416,"alį":375,"aru":2631,"arv":357,"Šie":85,"Šia":1607,"arr":83,"ars":1153,"ajū":107,"art":6116,"au ":2801,"asa":3470,"ary":2268,"ajų":380,"akš":168,"asi":8082,"ash":79,"ase":867,"asd":82,"aso":710,"asn":256,"asp":307,"ask":1323,"aną":115,"asm":1404,"ės ":35616,"asl":491,"ėnu":240,"ėno":201,"aos":78,"agū":453,"ajė":352,"ar ":4730,"ėni":505,"agų":187,"apa":1300,"ėne":311,"ape":335,"ėna":175,"apd":357,"aką":119,"api":6467,"apg":138,"apl":1752,"apk":227,"apo":1183,"apr":1671,"aps":2502,"apt":510,"apu":404,"apv":201,"apy":1793,"akė":76,"as ":68022,"Ši ":192,"aiš":684,"alą":244,"ėny":160,"ėto":359,"ava":4842,"apė":321,"amų":755,"auz":79,"ėty":82,"aut":4104,"Ško":96,"avo":3989,"ėta":441,"avi":8394,"amž":667,"ėti":999,"ave":644,"ay 
":151,"Šeš":127,"avy":790,"anų":720,"avu":579,"arč":154,"arą":182,"anž":85,"anė":190,"akų":454,"ata":3198,"asu":274,"ast":4236,"ass":235,"anč":3200,"asy":280,"asv":107,"Šil":407,"atm":389,"atn":97,"atk":211,"atl":931,"ėra":344,"Šip":64,"atr":904,"Šio":192,"ato":2724,"Šim":69,"atp":71,"ėlė":390,"ate":2024,"atf":194,"Šis":287,"Šir":132,"ėri":244,"ati":4429,"atg":129,"Šiu":193,"ath":131,"alų":2328,"ėsn":115,"aub":219,"att":75,"ats":3002,"alū":263,"atv":1480,"atu":978,"ėst":385,"aty":1953,"aul":3204,"aum":246,"aun":1743,"auo":93,"aup":243,"ėmė":214,"aur":9854,"aus":15793,"ėjų":416,"aud":4737,"ėsi":184,"aug":7354,"ėse":1427,"auj":2881,"auk":5802,"ος":93,"ος ":93,"Vėl":110,"ς ":168,"ėči":63,"Rūd":99,"α ":85,"Sūd":82,"アア":188,"ий ":71,"jei":143,"jer":295,"jek":1582,"jen":63,"jet":69,"jev":166,"ji ":3840,"aža":639,"ažd":237,"aže":494,"aži":1113,"ažn":2141,"ažo":516,"ažu":326,"ažy":107,"jad":82,"izė":124,"jas":2376,"jav":90,"jau":1901,"jap":205,"jar":157,"jak":120,"jan":2657,"jam":5082,"jag":120,"jai":1573,"je ":35197,"ažį":104,"ažų":112,"jog":127,"jok":177,"joj":10950,"jon":4897,"jom":502,"jot":90,"jos":20733,"jor":274,"ск":151,"jis":383,"jim":2557,"jin":736,"jie":392,"ст":86,"jo ":6159,"itn":63,"itm":290,"itk":77,"ioč":215,"itr":489,"ito":5645,"itv":127,"inį":1158,"itu":2883,"itt":74,"ity":2401,"ilų":166,"iub":80,"iuc":232,"ilž":195,"iud":174,"iuj":1926,"iuk":566,"iui":902,"isk":1115,"iną":194,"ism":716,"isl":291,"iso":747,"isp":344,"iss":63,"ikū":653,"inč":492,"isu":1429,"ist":10402,"isv":554,"isy":283,"ikų":1702,"inė":20322,"inę":1356,"ita":3101,"ite":2150,"ith":115,"iti":4567,"ivo":225,"inų":1093,"ivy":313,"ivu":89,"irą":66,"inž":95,"ius":6964,"iur":510,"ium":511,"iul":202,"iuo":4507,"iun":199,"imų":1088,"iuz":146,"iut":1006,"iuv":85,"iva":5713,"ipė":458,"ix ":136,"ivi":1601,"ive":1142,"ipr":458,"ipo":522,"ipu":252,"ips":676,"ipt":461,"ipi":440,"iką":480,"igž":771,"ipl":309,"ikę":172,"is ":52601,"ikė":859,"ion":5845,"iop":388,"ior":133,"ios":8346,"igū":296,"iot":445,"iog":457,"iją":1411,"ioj":3109,"iok":271,"iol":667,"iom":700,"iję":103,"ipa":679,"ipe":332,"iov":183,"igų":264,"ioz":68,"ir ":24988,"iru":470,"irv":355,"irs":602,"ijū":154,"irt":3182,"iro":972,"irp":210,"irm":1827,"irn":109,"irk":393,"iri":3779,"imą":1476,"ikš":1311,"isi":2798,"ish":114,"ise":370,"isd":71,"isc":198,"isa":1339,"iu ":2871,"imė":266,"ijų":2456,"iry":540,"ire":616,"irg":245,"irb":684,"ira":1224,"ird":228,"it ":169,"ilė":771,"ilę":379,"ašų":113,"iuš":73,"dėt":870,"dės":969,"itų":1031,"dėj":1386,"dėm":219,"dėn":263,"dėl":1132,"dę ":83,"ivė":295,"isų":290,"ivų":75,"dęs":150,"ja ":13854,"itą":217,"ipų":79,"isė":1814,"isę":115,"isą":156,"irę":81,"irė":261,"irž":446,"irų":76,"izu":821,"itį":88,"irū":86,"izo":451,"izn":83,"izm":1490,"izi":1251,"irš":1690,"izg":80,"ize":116,"izd":791,"iza":1796,"itę":494,"itė":412,"dė ":795,"kaš":89,"kik":107,"kij":2691,"kim":1590,"kil":1486,"kia":4293,"kie":2272,"kiv":63,"kin":5088,"kio":2376,"kip":128,"kir":3853,"kis":920,"kit":2704,"kiu":661,"kaž":63,"km ":5939,"ki ":3521,"kdy":223,"kg ":77,"kea":110,"ked":124,"kei":560,"kel":3590,"ken":372,"kep":138,"kes":439,"ker":661,"ket":958,"ke ":844,"kci":1331,"kda":895,"kdo":201,"kra":4663,"kre":1055,"klė":428,"kt ":364,"kių":2205,"kiš":1820,"klą":309,"ksa":796,"kse":134,"kry":1180,"kmė":162,"ku ":438,"kro":983,"kru":318,"kri":3232,"kov":891,"km²":423,"kot":816,"kos":8611,"kor":653,"kop":332,"koo":136,"kon":3138,"kom":3814,"kol":1241,"kok":423,"koj":2216,"koh":148,"kod":1418,"ks 
":271,"kmu":184,"kme":718,"kmi":91,"kny":405,"kni":155,"kią":80,"klu":945,"ko ":5671,"kly":141,"kle":472,"kla":7712,"klo":1474,"kli":2387,"bų ":1576,"jyb":83,"jus":1760,"jun":1716,"juo":1236,"jur":161,"juj":263,"jui":107,"jud":642,"būn":290,"būk":89,"būr":626,"būs":338,"būt":996,"eči":2678,"dį ":451,"ju ":392,"būd":1044,"kaz":236,"kav":389,"kat":1246,"kau":876,"kar":10769,"kas":6028,"kap":1244,"kan":2442,"kal":11413,"kam":1843,"kaj":116,"kak":332,"kai":11174,"kag":106,"kad":1450,"kac":439,"kab":221,"ka ":5403," Ga":1407,"bėt":104,"bės":4168,"bėn":81,"bėm":94," Ge":1126,"bėj":670," I ":269,"bėg":144,"bėc":110," Fo":747," Fu":218,"bę ":399," Fr":454," Fi":557," Fl":215," Ha":877," He":844," Gy":327," J ":93," Go":421," Gr":2243," Gu":448," Gv":487," Gi":589," Gl":237," Ig":304," Id":102," Ib":64,"guž":295," K ":135," Hy":110," Hu":227," Ho":536,"ha ":309," Hi":515," Ji":1102," Je":431," L ":82," Ja":1776," Iz":149," Dė":110," Iv":74," Ir":401," Is":623," It":425," Im":226," In":1579," Aš":111," Ik":112," Il":691,"ham":166," M ":181,"han":534,"hai":104," Ka":7194,"haj":70,"hal":255,"hau":98," Ke":839," Ki":1466,"har":550,"has":169,"hat":88," Jo":1429," Ju":1786," Bū":168,"hab":63,"had":70," La":2647," Le":2732," Li":7563," Kl":830," Kn":86," Ko":2662," Kr":1957," Kv":191," Ku":1464," Ky":76," Ma":4719," O ":103," Mi":1782," Dž":420," Me":1896," Dū":73,"he ":653," Lo":721," Ly":249," Lu":488," Ne":1796,"а ":158," P ":142," Na":1859," Ež":273," Ni":719," Mo":1991," My":150," Mu":530,"hel":177,"hei":83,"hev":142," A ":384,"het":95,"hes":72,"her":871,"heo":356,"hen":157,"hem":647,"hi ":89," B ":160," C ":148," Ap":776," Am":1804," An":2616," Ak":583," Al":1977," Ai":520," Aj":72," Ag":281," Af":804," Ac":198," Ad":550," Ab":408," Ba":3796," D ":97," Az":731," Av":213," Au":2682," At":910," As":629," Ar":1939," Be":1541,"hie":74,"hid":370," Bi":835,"hia":94,"hip":216,"hin":342," Bl":291,"him":102,"hil":142," Bo":1202,"hij":135," Br":1443," Bu":1122,"his":79,"hit":343,"hir":118," E ":80," Ca":760," Ce":856," Ci":314," Ch":1252," Cl":153," Cr":197," Co":818," Cu":130," Cy":71," Da":2178," Di":2044," De":898," Dr":663," Do":658," Dn":85," Dy":151," Dz":157," Du":604," Dv":187," Ed":144," G ":65," El":573," Ek":234," Ei":124," Eg":301," Et":188," Es":385,"hlo":65," Er":362," Ep":87," En":317," Em":142," Ex":72," Eu":1612," Ev":98," Fe":541,"ho ":213,"hma":111," Fa":424," H ":80,"gma":128,"go ":1512,"gme":171," Są":373,"glo":300,"gle":181,"gli":959,"gla":443," Wo":146," Wi":267," We":123," Wa":210,"й ":139," Vy":520," Rū":228,"gog":103," Zo":73," Ze":220,"gno":577," Zi":68," Tė":65,"gni":373," Za":514," Yr":183,"gne":245,"gna":449," Yo":68," Sė":67,"giš":189,"gpj":199," Vė":216,"о ":63,"gr ":135," Sū":95,"goj":625,"gom":579,"gol":244,"gon":466,"gos":3067,"gor":332,"got":295,"gov":127,"gu ":264," a ":1580,"gro":326," Už":349,"gry":429,"gru":2484,"gra":4017,"gių":416,"glė":83,"gri":3118,"gre":877," R ":90," Jė":115,"gty":102,"glų":187,"gto":154," Os":251," Or":560," Op":146," Po":1526,"gui":113," Pl":1099," Pi":2753,"gum":1158," Ph":102,"gul":484," Pe":1581,"gua":121,"gub":128," Pa":7618,"gue":64,"gst":140," Ny":218," Nr":561," Nu":899," No":770," Ol":265," Ok":221," On":138," Om":89," Ją":106,"gti":868," Od":117," Oc":104," Of":78," Ob":214,"gta":465," Lė":70," Ra":1919,"gvo":218," Ro":1436,"grą":89," Re":2283," Ri":769," Kė":213," S ":144,"guv":99,"gut":67,"gur":151," Pr":4036," Ps":108,"gus":1460," Jį":73," Pu":558,"gun":99,"guo":261," Iš":781,"gvi":98,"gva":685," Sy":113," Sv":277," 
Su":2443,"grį":216," St":1586," Kū":349," Ta":3980,"gsė":181," V ":128,"gyb":293,"gyd":154," Th":577," Ti":951," Te":1623," Tr":1262,"gyl":77,"gyn":197," To":1060," Ry":866," Jū":181," Ru":2112," Sa":3099," Jų":127,"е ":76," Mė":156," Sh":178," Si":1574," Sc":292," Se":1876," So":1791," Sp":549," Sr":63," Sk":1023," Sl":298," Sm":286," Sn":82,"grū":76," Uz":68," Va":2931," X ":111,"и ":92," Ve":1358," Vi":4303," Vl":71," Vo":1248," Tu":948,"gyv":4200," Tv":72," Ty":93,"gys":254,"bė ":1092,"gzi":495," Ug":88," Uk":462," Ul":82," Un":258," Uo":118," Up":172," Ur":330," Us":82," Mū":111," Ut":265," ja":1311,"iai":12600,"iak":2489,"iaj":841,"iam":6822," dė":1157,"ial":2617," iz":109,"ian":3835," ji":819,"iap":102,"ias":3810,"iar":592,"iau":21521," je":150,"iat":599,"iav":1428," im":762," in":5586," ik":3172," il":1403,"ic ":202," aš":299,"iab":113,"iac":1098," is":1471,"iad":246," it":186," ir":24988,"iag":1048,"ibl":199," ka":22789,"ibi":638,"ibo":741," m ":8180," kg":76,"ibr":360," ki":5307," ke":3545,"ibu":210," jo":2067,"id ":126,"iba":407," bū":2814,"ibe":354," ju":2553," gy":4128," bė":99," ha":363," he":768," gi":1781," gl":388," gr":5317," go":132,"ia ":6457," gu":260," gv":148," k ":116," id":308," ie":140," hi":591," ho":365," hu":138,"iet":17789,"iev":1199," ni":198," ež":2821,"iel":363,"iem":1768," ne":7202,"ien":11441,"iep":530," na":5031,"ier":804,"ies":9563,"ied":842,"ieg":235," my":64,"iek":3632,"iej":2087," mu":1417," mo":7083," mm":210,"ieb":372," ok":189," ol":297," ją":256," oj":106,"ifo":912," od":155," of":644," ob":1166,"ife":169," ny":106,"ifi":598," nu":12007," no":1368,"ifa":104," le":2689,"icr":108,"icu":77," li":4372,"ico":274,"ick":163," la":7451," kv":492," ku":12047,"ici":1931," kt":309,"ich":922," ky":193,"ice":275," kn":370,"ie ":6151," km":6372," kl":2310,"ica":672," kr":4858," ko":8974," me":9020,"idy":563," dž":138," eš":161,"idu":2683," mi":7229,"idv":63," ml":158," gė":342,"я ":102," o ":1212,"idr":494,"ido":937,"idm":185," ma":4864," lu":92,"idi":2189," ly":1967,"ide":3721,"ida":3570," lo":631," ag":258,"aša":470," ab":508,"iid":108," ac":88,"aše":79," ad":1540," am":912,"idą":63,"ašk":334," an":3700," ap":10764,"aši":756," ai":453,"ašo":756," ak":1627,"ašl":96,"ašm":98,"iim":317," al":1456," av":222," au":6951," ar":9760," at":6438,"ašu":283,"ašt":1920," as":2056," d ":1323,"ašy":781," ba":4469,"il ":133,"idė":1297,"ija":13519," bi":992,"ije":95," be":6829,"iaž":208,"iji":627," bo":424," bl":292,"ijo":29798,"ieč":1593," by":148,"ibū":258," bu":4118,"iju":808," br":918," ca":122,"ibų":249," e ":82,"im ":74,"ika":5325,"ige":301,"iga":1803,"ii ":82,"igl":111,"igm":118,"igh":167,"igi":1406,"igu":322,"igt":307,"igr":206,"igo":701,"ign":206,"dą ":495,"ibė":80,"igy":112,"ik ":1008,"imo":10288," er":623,"imn":310," et":431," es":1712," en":778,"ims":264," em":226," ep":239,"imp":910,"idž":3127," ei":683,"imf":152,"ime":1400," el":1508,"imd":91," ek":915," ef":150,"imi":2582,"ieš":2115," eg":645," fe":604,"ip ":3767,"inc":1582,"ind":4737,"ina":9499," fa":728,"imt":970,"imu":2073," eu":161," ev":173,"imy":291," fu":1594,"inm":71,"ino":4836," fr":209,"ašč":176," fo":1905,"int":7918,"ins":1597,"inf":613," fl":132,"ine":2681,"iež":504,"ing":5394," fi":1973,"ini":42857,"ink":9182," ge":4705," ga":6719,"ioc":68,"iod":332,"inu":851,"inv":260,"iny":2814,"ašė":95,"iko":8237," cm":466,"ikm":102,"ikl":4355," co":131,"iki":8010," ce":2733,"ike":332," ch":821," ci":727,"ila":1055,"in ":677," da":11917,"iky":490," cu":64,"ikt":1187,"iku":935,"ikr":1885,"iks":1325," 
do":606,"ilp":157,"ilo":1045,"ill":367," dr":1386,"ilk":807,"iln":1654,"ilm":942,"ilg":1557,"ilj":190," de":4787,"ili":7113,"ild":935," di":7861,"ile":510,"ima":10774,"imb":476," g ":911,"idų":320,"igė":179,"io ":24537," dv":1810,"ily":138," du":1631,"ils":105,"ilt":617,"ilu":734," dy":1060,"ilv":228," sū":347," zo":229," rū":2140,"hol":535," tė":174,"hom":130,"hon":117,"ко":78," tę":151,"hos":152,"hot":89,"hov":154,"hop":73,"hor":425," yp":378,"ка":91,"ки":93," yr":9482," sė":250,"hni":331,"hno":260," są":2199," pū":86,"ин":66," ww":64,"ий":86," ož":66,"ра":115,"hua":109,"htt":75," už":2934,"hst":134,"ол":71,"ов":122,"hry":109,"но":71,"hro":216,"ни":79,"hri":103," tū":466,"ht ":128,"на":78,"hra":72," vė":1295," ru":1285," jū":1646," ry":5467," mė":866," jų":977," sa":11017," sf":110," se":4482," sc":162," si":4889," sn":168," sm":866," sl":1116," sk":5183," sr":2054,"hyl":73," sp":3493," so":1538,"ве":64," lė":404," t ":238," ra":8074," re":7211," ri":2152," mą":77," ro":1078," pv":390," pu":2293," jį":257," pr":18161," ps":398," s ":108," px":100," lą":374," iš":11701," os":113,"hum":232," ov":65,"hun":68," op":537," or":3230," jė":239,"ан":71,"ал":63," pe":3678," pa":29839,"ар":72," pl":4167," po":4807," pi":10334," pj":71," vy":3074," y ":87," x ":92," va":20321," ve":4247," pė":140," vo":600,"cėl":111," vu":113," vi":14126,"ер":72,"ен":65," ty":567," tv":701," tu":2991," mū":318," ur":214," uo":741," up":3500," un":1322," ul":71," ug":910," ta":10594," nė":349,"hyt":188," st":5898," kū":1165," sv":1124," su":14551," tr":3585," lū":96," to":1975," th":302," ti":5451," te":9733,"fes":435,"fer":701,"fed":364,"fen":78,"fek":526,"fel":82," Ča":214," Či":814," Če":428,"fga":67,"faz":69,"fas":187,"far":115,"žų ":324,"fan":250,"fak":286,"fal":119,"fai":102,"ezė":113,"fe ":71,"etų":4160,"esų":111,"evė":459,"evų":124,"fa ":109,"esė":92,"erį":83,"etą":266,"erė":268,"erč":118," če":566,"esč":286," či":215,"epš":367,"esą":106,"ezu":126,"erų":398,"erž":348,"etė":1071,"eza":64,"esį":127,"ezo":319,"eze":135,"erš":197,"ezi":795,"eta":5454,"enę":98,"ete":1059,"elš":173,"eti":4554,"etn":281,"esp":1704,"esn":1467,"eso":815,"est":4994,"esu":549,"enč":626,"ess":157,"esy":202,"enė":1439,"ekų":115,"eud":63,"euk":102,"eto":2887,"etr":2354,"ett":92,"enį":206,"etu":10440,"etv":1625,"ety":134,"ew ":79,"eve":542,"eva":902,"evo":715,"evi":699,"eut":158,"eur":366,"eus":130,"epė":63,"emų":168,"erą":156,"evr":154,"ey ":158,"evy":281,"enų":1350,"epe":177,"epc":101,"epi":513,"eph":64,"er ":2219,"epa":1258,"eot":67,"eor":538,"eom":87,"eol":734,"eop":149,"eon":218,"eiš":1009,"ekė":269,"es ":8000,"ekę":92,"ept":436,"eps":96,"epu":165,"epo":412,"epr":642,"erk":587,"erl":418,"eri":10880,"emą":219,"erg":1284,"ere":1184,"erf":77,"erc":545,"erd":659,"era":5900,"erb":981," ėj":280,"elę":124,"elė":2221,"et ":1493,"ekį":64,"esj":92,"eną":643,"esk":317,"esm":259,"ekš":67,"esi":2143,"ese":290,"esa":2125,"ery":234,"emė":1363,"ejų":738,"erv":903,"elį":206,"eru":488,"err":173,"ert":1937,"ers":2707,"ern":1320,"erm":1142,"erp":330,"ero":2197,"eki":1989,"ekl":411,"ekm":65,"eko":1042,"ekr":192,"eks":1896,"ekt":4207,"eku":345,"ekv":315,"eky":288,"en ":710,"elb":215,"ela":746,"eld":365,"elf":88,"ele":3424,"eli":8265,"elj":72,"elg":477,"elm":261,"eln":456,"elk":671,"ell":316,"elo":518,"elu":87,"elv":84,"els":353,"elt":670,"ely":474,"eo 
":170,"egė":73,"edų":106,"emb":301,"ema":2008,"edž":1549,"eme":1016,"emd":90,"emo":2200,"emi":2274,"emt":191,"emu":703,"emp":946,"ems":853,"emy":348,"ene":2754,"eng":1816,"enb":111,"ena":6485,"end":4440,"enc":1042,"eno":4885,"enm":89,"enk":5159,"enl":92,"eni":6797,"enu":670,"env":1006,"ens":1444,"ent":11928,"enr":107,"eič":280,"enz":118,"eny":1624,"eog":245,"eod":89,"egl":182,"ego":396,"ege":449,"egi":3283,"egz":550,"ebė":292,"egr":181,"egu":769,"egy":91,"ek ":622,"eic":241,"eip":170,"eis":5132,"eir":124,"eim":4571,"eil":568,"ein":1861,"eik":5224,"eid":1499,"eig":1358,"eja":503,"edė":140,"el ":389,"eiz":76,"eit":977,"eiv":749,"ejo":153,"eji":683,"eje":65,"eke":125,"ekc":170,"eka":2352,"em ":150,"ejy":83,"eju":497," į ":8903,"gl ":220,"giu":292,"git":144,"gis":1794,"gir":399," Į ":86,"gil":204,"gim":1321,"gij":3190,"gik":123,"gip":251,"gin":4262,"gio":2880,"gie":143,"gia":3193,"ght":134,"bą ":394,"gi ":457,"gen":3944,"geo":416,"get":168,"ger":1023,"ges":352,"geb":197,"gei":69,"geg":248,"gem":66,"gel":1714,"gdo":74,"gdy":714,"gda":73,"ge ":442,"gab":134,"gac":108,"gad":87,"gai":1374,"gas":1974,"gar":1729,"gau":1337,"gat":335,"gav":268,"gam":2232,"gal":9014,"gan":3227,"ga ":2878," įv":1493," įt":810," įs":1931," įr":895," įp":146," Įs":221," įd":139," įe":113," įg":452," įk":1033," įl":469," įm":645,"fys":93," Įe":85," Įk":141,"fut":1079," įž":65,"fun":535,"fto":75,"ft ":191,"ača":64,"fra":263,"ačk":85,"ači":1928,"fri":745,"fro":133,"for":3388,"fos":117,"fot":172,"fon":490,"fol":145,"ač ":218,"fla":74,"fli":110,"fo ":188,"fic":473,"fig":118,"fij":466,"fil":952,"fik":605,"fin":1043,"fir":67,"fit":88,"fiz":526,"da ":3356,"de ":955,"dac":195,"dad":96,"dab":1199,"dak":155,"dal":5226,"dai":1928,"dag":146,"dae":1242,"dat":189,"das":3048,"dar":5510,"dan":2672,"dam":1100,"dav":1026,"dau":2555,"cul":112,"cto":80,"cti":127,"cta":124,"cus":68,"cur":66,"cko":72,"co ":242,"con":95,"col":102,"com":76,"cor":163,"cos":201,"cop":83,"ciš":133,"cro":177,"cea":1680,"ch ":208,"cer":417,"ces":658,"cet":70,"cen":3061,"cep":179,"cel":113,"ceg":78,"cha":955,"chu":89,"cia":2358,"ck ":279,"cie":95,"cid":198,"che":1211,"chl":85,"chi":1035,"cho":966,"chm":91,"chn":602,"chs":167,"cht":194,"chr":188,"civ":350,"cij":9068,"cik":433,"cil":163,"cif":247,"cis":123,"cit":220,"cin":3236,"cio":1497,"cip":355,"cm ":459,"ed ":292,"eba":313,"ebe":309,"ebi":367,"ebo":120,"ebr":205,"ebu":212,"eag":73,"eae":1640,"eak":217,"ean":215,"eal":365,"ear":134,"eap":89,"eat":373,"eau":64,"ea ":188,"efi":76,"efo":293,"efa":70,"efe":515,"ei ":3954,"ega":737,"eed":76,"edi":2343,"ede":757,"eda":2475,"edy":257,"edu":236,"edo":437,"edr":226,"eck":65,"ech":882,"eci":846,"ece":95,"eca":89,"ect":147,"eco":118,"dyg":104,"dyk":422,"dym":1573,"dyn":853,"dys":311,"dyt":634,"dyd":692,"dyb":2813,"drė":138,"drą":94,"dvy":133,"dvi":1481,"dve":133,"dvo":211,"duv":159,"dur":2035,"dut":428,"dus":534,"dva":622,"dvė":188,"dzi":233,"dor":1140,"don":1347,"dom":1366,"dol":160,"dok":455,"dow":94,"dov":1045,"dot":521,"dos":2310,"ds ":126,"diš":216,"deš":1558,"dmi":1463,"dni":66,"dob":88,"doe":78,"doj":2119,"dum":210,"duo":2102,"duj":255,"dui":167,"dul":493,"duk":700,"dug":90,"dub":205,"dua":83,"dri":926,"dra":1708,"dre":173,"dry":71,"du ":892,"dro":2362,"dru":787,"dge":113,"dic":606,"did":3151,"dia":454,"der":1524,"des":572,"det":124,"dev":201,"deb":268,"dea":123,"ded":1735,"deg":595,"dei":369,"del":1942,"dek":567,"den":3068,"dem":709,"dep":764,"deo":162,"di ":260,"dme":216,"do 
":2940,"div":255,"diu":103,"diz":157,"dim":1824,"din":11534,"dio":390,"dip":85,"dir":773,"dis":1529,"dit":234,"die":4136,"dif":150,"dij":4732,"dik":613,"dil":259,"dka":165,"daž":1194,"ižy":171,"rbė":133,"rgu":182,"mą ":3290,"rga":2638,"iža":147,"ri ":2109,"rgl":72,"iži":186,"rgi":812,"rbą":117,"rge":1017,"rgo":391,"ret":1342,"res":1263,"rev":180,"rez":564,"rfi":80,"rfo":87,"rač":122,"rdu":515,"rds":64,"rdv":219,"reb":105,"rea":608,"ree":109,"ref":432,"rec":183,"red":291,"rei":3237,"lėš":194,"rej":92,"reg":2937,"rem":808,"ren":2189,"rek":801,"rel":806,"rer":127,"reo":63,"rep":489,"rda":868,"rcu":85,"rdo":569,"rdi":1074,"rde":192,"re ":1146,"rby":63,"rbt":207,"rbu":464,"rco":114,"rci":353,"rch":1033,"rce":143,"raz":577,"rd ":418,"rao":69,"rap":372,"rar":206,"ras":10157,"rat":2588,"rau":2880,"rav":918,"rbi":1182,"rbl":168,"rbo":606,"rba":5655,"rbe":227,"raj":5239,"rai":3447,"rah":84,"rag":1079,"ran":8258,"ram":2837,"ral":3029,"rak":1364,"rab":639,"raf":875,"rae":172,"rad":3322,"rac":2843,"rpt":814,"rpu":483,"rpr":64,"rps":98,"rpo":225,"rkė":212,"rs ":981,"rpe":180,"rpa":257,"rką":115,"rpi":679,"rgž":70,"ror":154,"ros":5272,"rot":753,"rom":1205,"ron":1539,"rop":2336,"roz":106,"rgų":68,"rou":123,"rov":2454,"row":67,"rob":326,"roa":194,"rod":1480,"roc":897,"roj":2662,"roi":152,"rol":1077,"rok":757,"rof":604,"roe":102,"rog":1825,"rno":408,"rič":321,"rnu":106,"rny":471,"rp ":1398,"rna":1385,"rež":195,"rne":498,"rią":492,"rni":813,"rmo":1239,"rmu":712,"rdų":72,"ro ":7225,"rgė":150,"rma":2602,"rme":740,"rdž":190,"rmi":1316,"rly":64,"rlo":116,"rli":270,"rld":79,"rle":65,"rla":454,"rn ":72,"rky":100,"rkv":63,"rku":184,"rkt":264,"rks":74,"rko":615,"rki":550,"rkl":319,"rke":407,"rka":1300,"rbų":71,"mąs":197,"reč":569,"mąj":193,"raž":304,"rje":132,"riz":580,"rix":85,"rdė":104,"rip":333,"rio":6643,"rit":7580,"ris":4114,"riv":457,"riu":3503,"rig":767,"rij":7031,"rdą":70,"raš":3717,"rii":272,"ril":196,"rik":6018,"rin":15816,"rim":1853,"ria":7346,"rib":1061,"ric":629,"rid":769,"rie":6574,"rif":330,"rk ":142,"rož":108,"jūč":213,"rsė":70,"jų ":7448,"ryb":1588,"ryd":120,"rui":138,"rug":582,"rud":269,"rur":65,"rup":2436,"ruo":2554,"run":372,"rum":1117,"ruk":710,"ruz":216,"ruv":66,"rus":3390,"rut":405,"rva":500,"rmų":317,"rvi":416,"rve":467,"rvo":162,"rnų":71,"rvu":153,"ry ":248,"rsk":222,"rsl":371,"rkš":158,"rsi":1214,"rso":508,"rsm":221,"jūn":185,"rsa":416,"rse":236,"rkų":131,"rta":4621,"rst":803,"jūr":1765,"rsu":138,"rtm":86,"rtn":140,"rto":2230,"rte":485,"rth":95,"rti":4648,"rub":105,"rtr":75,"rtu":1225,"rty":412,"riš":698,"rt ":258,"rių":3254,"mči":71,"rmą":324,"rre":91,"riž":94,"rra":105,"lį ":872,"ru ":632,"rmė":253,"sac":151,"sad":119,"sag":124,"sai":543,"saj":93,"sak":1089,"sal":3421,"sam":580,"ryš":526,"sap":78,"san":3811,"sau":3166,"sat":201,"sas":1914,"sar":840,"sav":6083,"sa ":609,"ryč":1308,"rvų":140,"mėg":132,"rvė":96,"rsų":124,"mę ":264,"rtų":277,"mėt":142,"mės":1624,"mėl":387,"mėm":110,"mėj":427,"mėn":862,"rtė":217,"mė ":740,"rys":1185,"ryt":5807,"ryo":78,"ryp":869,"ryk":138,"ryl":68,"rym":161,"ryn":188,"ryj":340,"rtą":376,"rtį":75,"ną ":1490,"shi":127,"si ":5878," 가":71,"siv":352,"sjo":73,"nąj":98,"sie":2665,"sid":3080,"sic":542,"sib":166,"sia":6343,"kšt":3974,"sit":1385,"siu":976,"sir":1172,"sis":6798,"sip":235,"sin":3201,"kšn":209,"sio":4893,"kšo":166,"kšl":78,"sil":885,"kšm":294,"sim":839,"sij":3056,"sik":1642,"sii":91,"sif":333,"sig":288,"sda":106,"sdi":133,"se ":12437,"ryž":454,"sce":120,"sci":144,"sch":242,"ser":955,"ses":253,"set":159,"seu":67,"sez":169,"sh 
":92,"sfe":280,"sfo":80,"sei":720,"sed":68,"sep":203,"sen":3202,"sem":199,"sel":216,"sek":790,"spu":1602,"skė":68,"spy":159,"spo":957,"spr":582,"spe":1156,"spi":670,"sjė":92,"spa":2100,"sot":100,"sov":81,"sol":244,"som":660,"son":563,"sop":70,"sor":222,"sos":1164,"sod":242,"sof":371,"soj":156,"soc":1068,"su ":4472,"smė":297,"sru":176,"sro":178,"sri":1777,"nči":4255,"siž":207,"sra":172,"sių":1944,"slė":668,"st ":506,"siū":210,"siš":352,"ss ":140,"sli":978,"slo":811,"slu":296,"sky":900,"sla":1643,"sle":271,"ski":4050,"skl":344,"sko":1593,"skr":2283,"sku":545,"skv":165,"ska":2455,"ske":580,"kšč":2245,"sno":220,"sny":134,"snu":80,"sna":277,"sp ":78,"sni":2039,"sią":193,"sne":141,"smo":495,"smu":706,"so ":3418,"sma":829,"smi":538,"sme":1637,"syb":334,"kų ":5583,"syn":63,"syt":71,"sys":136,"syk":221,"stą":248,"stę":193,"syv":319,"stė":1387,"nė ":11072,"kūg":75,"sse":159,"ssa":96,"kūn":731,"sso":72,"ssi":140,"kūs":84,"kūr":1568,"snė":370,"skų":182,"ste":4835,"sta":11594,"sto":6918,"sti":7618,"snį":163,"stu":1797,"str":5446,"sua":240,"sty":3413,"slų":200,"sud":2168,"sue":135,"sub":489,"sui":78,"suf":148,"sug":205,"sul":523,"sum":815,"suj":169,"suk":1852,"sup":534,"sun":407,"suo":1175,"sut":1183,"sus":2952,"sur":378,"suv":210,"spė":63,"smų":124,"sva":1159,"sve":370,"svi":243,"svo":286,"svu":88,"svy":123,"tai":9707,"taj":152,"tak":2532,"tal":2568,"tag":213,"tab":401,"tac":699,"tad":469,"tav":1620,"tau":2680,"tat":2848,"tas":15287,"tar":6407,"tap":484,"tan":2917,"tam":3320,"te ":2806,"tbo":1115,"nę ":1530,"svė":126,"가":170,"stų":690,"nėm":645,"nėn":104,"nėl":246,"nėj":3435,"stū":94,"nės":9359,"nėt":145,"nėr":303,"ta ":7677,"suž":77,"nęs":68,"ovų":204,"jęs":545," št":72," šv":1206," šu":230," ši":9378,"pa ":504," šl":369," šo":352," ša":3536," še":4400," Šv":1543," Šu":220," Št":147," Šr":75," Šo":70," Šl":99," Šk":111,"otų":425," Ši":3390," Še":340,"jės":91," Ša":886,"jėg":559,"ovę":68,"ję ":118,"ovė":1571,"pdo":296,"pci":139,"pe ":192,"par":5103,"pat":3653,"pas":6180,"pav":2954,"pau":1209,"pac":165,"pad":1171,"paa":84,"pab":319,"pag":4995,"pak":1745,"pal":2163,"pai":635,"paj":498,"pap":2360,"pam":953,"pan":2096,"ozė":194,"pha":125,"ką ":1267,"pho":90,"phi":136,"gžd":845,"pač":638,"pec":747,"ped":487,"pen":679,"per":4439,"pet":193,"pes":135,"pei":362,"pel":1215,"pek":276,"pla":1821,"pli":2030,"ple":626,"plo":1805,"ply":260,"plu":196,"pkr":163,"paž":694,"phy":268,"pib":499,"pia":510,"pid":180,"pie":9064,"pig":92,"paš":357,"pij":468,"pik":263,"pil":4079,"pim":702,"pin":2920,"pio":1030,"pir":1825,"pis":533,"pit":515,"piu":812,"poz":347,"pr ":158,"por":1711,"pop":634,"pov":195,"pot":290,"pos":1432,"poj":445,"pog":134,"pom":76,"pon":986,"pok":315,"pol":1919,"pob":138,"poe":104,"poc":74,"pod":139,"po ":1517,"psu":123,"pst":87,"pta":1134,"pse":76,"psi":725,"psn":657,"psk":1854,"ptu":124,"pty":174,"pua":68,"pub":1670,"pte":200,"pti":1385,"pto":309,"plū":91,"pra":5043,"pių":716,"plė":383,"piš":241,"pjū":231,"pru":94,"psa":387,"pu ":180,"jį ":548,"kči":82,"pri":9679,"pre":1608,"pro":5447,"poš":212,"pož":275,"pyg":410,"pur":225,"pus":2091,"put":192,"pun":96,"puo":365,"pup":123,"puk":97,"pul":563,"px ":100,"pva":210,"pvz":390,"kę ":172,"kėj":853,"kėn":99,"kėl":172,"kės":2221,"ptū":85,"kėt":269,"puš":167,"pyn":242,"pyj":119,"pyl":1154,"pyk":93,"pyv":70,"pyr":87,"pyt":97,"kė ":1457,"prū":156,"puč":92,"kęs":421,"lą ":721,"iš ":6273,"ląs":439,"išd":131,"iše":155,"išg":193,"iša":318,"išm":207,"išl":1019,"išo":261,"išn":213,"iši":468,"išk":9640,"išt":673,"išv":411,"išp":207,"išs":1093,"išr":686," Ži":399," Že":954," 
Ža":482," Žy":115," Žu":120," Žv":178," Žm":91," ži":2342," žm":1523," ža":1939," že":2052," žy":592," žv":1024," žo":907," žu":959,"iū ":112," ųj":70,"kį ":292,"lči":255,"iūt":153,"iūr":499,"iūn":1343,"iūl":184,"que":92,"qui":80," šį":69,"ių ":35485,"lė ":2188," ūk":511,"iųj":519,"lėg":91,"lėj":539,"lėd":122,"lę ":354,"lėv":65,"lėt":585,"lės":3311,"lėn":507,"lėm":235,"lėl":106,"lėk":280,"ra ":13206,"lęs":405,"ngo":2538,"ngi":2530,"eži":1071,"ngl":1130,"ngv":440,"ngu":1035,"ežu":89,"ngr":450,"ežt":146,"ngt":1224,"ngs":123,"ni ":526,"eže":2899,"nge":541,"eža":160,"nga":2804,"ežd":87,"nha":74,"ią ":1321,"neg":409,"nei":1033,"nel":523,"nek":262,"nen":435,"nem":377,"nep":854,"neo":246,"ner":1909,"net":3370,"nes":2353,"nev":652,"neu":225,"ndy":107,"ng ":636,"nea":242,"neb":233,"nec":126,"ned":530,"nef":72,"nfo":543,"nfl":92,"nfr":63,"nez":420,"nfe":266,"nco":109,"nci":2521,"nce":603,"nch":181,"ne ":5137,"nbu":79,"ndu":1568,"ndr":3362,"nds":72,"ndo":1993,"ndi":4736,"nde":2551,"nda":1898,"nak":285,"nal":3006,"nam":4510,"nan":2181,"nap":314,"nar":1177,"nac":1442,"nad":530,"nae":88,"naf":82,"nag":711,"nai":2792,"naj":138,"nc ":104,"nab":100,"nbe":88,"nd ":453,"nav":974,"nau":2982,"nat":1500,"nas":9068,"naz":237,"na ":6125,"muš":82,"myr":132,"myn":393,"myl":81,"가 ":71,"mzd":138,"myk":132,"myb":757,"fų ":79,"nyb":586,"ntą":186,"nyj":775,"nyi":66,"nyg":399,"ny ":199,"nvi":1009,"nux":72,"nve":361,"nva":85,"nuk":354,"nul":255,"num":449,"nun":66,"nug":200,"nui":231,"nus":1958,"nut":401,"nuv":180,"nuo":11505,"nur":161,"nty":1460,"ntv":111,"nto":3213,"ntu":1006,"nts":156,"ntr":4642,"nti":13882,"nth":149,"ntg":70,"nta":6698,"nte":3240,"nsu":225,"nkų":1281,"nsp":335,"nso":235,"nst":1864,"nse":182,"nkš":139,"nsi":456,"nsl":149,"nsk":533,"nsa":277,"nu ":681,"nro":72,"iči":2192,"nri":70,"nra":165,"nių":11248,"nt ":3145,"niū":1065,"niš":1026,"nką":172,"ns ":1187,"nkė":2543,"nod":224,"nog":120,"nok":84,"nol":827,"noi":70,"noj":1649,"nop":168,"nom":2167,"non":256,"not":507,"nos":4995,"nor":1222,"nov":1035,"ngų":481,"noz":151,"nne":81,"než":127,"nni":71,"nme":211,"nma":125,"neš":331,"ndž":626,"ngą":233,"nla":91,"ndų":534,"ngė":262,"no ":11621,"ndū":86,"nke":587,"nkl":1886,"nki":5054,"nkc":507,"nka":2908,"nku":626,"neį":137,"nky":223,"nko":1384,"nks":891,"ncū":939,"nkt":1190,"nkr":206,"iąj":181,"naž":94,"nja":76,"ndė":404,"njo":112,"ndą":97,"nij":5825,"naš":624,"nig":783,"nif":163,"nie":779,"nid":254,"nic":409,"nia":7994,"nk ":388,"niz":2108,"niu":3647,"niv":780,"nis":15940,"nit":387,"nir":72,"nio":7526,"nip":69,"nim":4778,"nin":8141,"nik":1554,"nil":147,"ogs":78,"ogr":2210,"ogu":324,"ogi":3408,"ogl":119,"ogo":286,"ogn":64,"oga":839,"oge":235,"oho":167,"oha":70,"ohe":76,"obė":124,"oj ":68,"ją ":2223,"gšt":167,"odą":88,"oid":309,"ok ":122,"oju":555,"obū":144,"ojo":4241,"oji":3887,"oje":21105,"oja":6655,"odė":378,"ol ":243,"oce":779,"och":261,"oci":1478,"ock":185,"obs":81,"obu":232,"odg":104,"ode":1122,"odk":163,"odi":1127,"odo":1175,"odr":68,"of ":299,"oda":2306,"oel":84,"oet":66,"oeu":79,"ody":417,"odu":589,"og ":261,"ofi":842,"oft":141,"ofo":158,"ofe":379,"ofa":95,"oa ":103,"nyč":782,"oac":66,"oba":242,"od ":72,"oar":69,"oat":135,"obo":229,"obl":199,"obj":1006,"obi":1248,"obe":189,"nyn":1169,"nyk":427,"nyr":226,"nyt":99,"nys":2919,"ntė":672,"nzi":126,"nzo":87,"ntį":195,"nsų":145,"ntū":104,"ntų":1010,"gų ":1699,"osė":119,"orė":417,"ows":92,"orą":78,"orų":130,"orū":115,"ozo":156,"ozi":309,"jė ":190,"otė":267,"oza":345,"otą":235,"olų":162,"oty":482,"otu":369,"ow 
":66,"oti":3180,"ote":1275,"ott":72,"otr":264,"oto":2263,"otn":65,"okų":103,"ost":1854,"osu":63,"osv":137,"ota":3370,"onė":1445,"osi":4557,"okš":697,"osk":117,"oną":313,"ose":10057,"osf":255,"osp":155,"oss":88,"gūr":160,"osm":327,"osl":114,"oso":811,"osn":165,"gūn":307,"ovy":172,"onų":1034,"ovi":2936,"onš":70,"ovo":1368,"ovu":118,"ovs":154,"omų":264,"opė":75,"ox ":72,"ova":2312,"ove":473,"oun":122,"oup":69,"ous":97,"our":160,"out":83,"opo":1809,"opi":1192,"opl":208,"ope":871,"oph":333,"opa":456,"os ":71850,"okė":208,"opu":410,"opr":78,"opt":255,"ops":178,"ool":67,"ood":82,"or ":364,"ojė":85,"ogų":154,"oor":180,"ork":297,"orl":173,"orm":3710,"orn":376,"oro":1427,"orp":329,"orc":113,"ord":1011,"ore":636,"orf":147,"org":2280,"ori":8425,"omą":125,"omė":233,"ojų":757,"ou ":101,"osa":489,"gūb":364,"ort":1674,"ors":672,"orv":193,"oru":254,"ory":137,"olą":105,"m² ":432,"ot ":142,"olė":893,"orb":205,"ora":1549,"olę":89,"okį":63,"ola":1063,"old":223,"on ":1119,"oli":7420,"oll":172,"olk":163,"olf":87,"ole":1072,"olg":102,"olt":96,"olm":81,"oln":64,"olo":5241,"oly":304,"odų":160,"ogė":746,"olu":523,"oka":1176,"om ":163,"oki":2742,"oke":340,"okr":516,"oks":1989,"oko":845,"okl":512,"oky":4231,"okt":115,"oku":1484,"ona":5705,"ond":604,"onc":437,"onf":314,"one":3050,"ong":677,"oni":5683,"onk":450,"ono":5947,"ons":941,"ont":1350,"onu":771,"onv":131,"ony":229,"onz":95,"oma":4058,"ome":2921,"omb":322,"omi":4219,"odž":830,"omp":1868,"omo":2822,"omu":462,"oms":893,"omy":305,"op ":190,"la ":3891,"kyč":84,"ksų":107,"kvė":157,"ktų":516,"ktū":882,"le ":987,"lci":77,"lde":301,"lda":470,"ldo":1438,"ldi":380,"ldu":86,"lab":885,"lac":354,"lad":233,"lag":301,"laj":360,"lai":7095,"lal":283,"lak":544,"lan":3678,"lam":672,"lap":1033,"lar":290,"lat":1022,"las":4962,"lau":4662,"lav":1958,"lay":83,"laz":248,"lba":2195,"ld ":140,"lbe":120,"kyš":114,"lbi":260,"lbo":1044,"lbu":643,"kvi":414,"kve":103,"kva":547,"kuv":68,"kut":500,"kus":1457,"kur":11466,"kup":293,"kuo":1084,"kun":573,"kum":831,"kul":3442,"krą":130,"kvo":154,"kta":2002,"kte":286,"cūz":939,"ksp":237,"ksu":217,"kst":1638,"ksi":1031,"kso":492,"ksn":481,"ksm":470,"ksl":1822,"kub":70,"kui":142,"kty":770,"klų":780,"ktr":1113,"ktu":872,"kti":2012,"kto":1582,"kyt":491,"kyr":514,"kys":407,"ktė":87,"krū":445,"kuč":63,"krų":140,"cų ":73,"kyb":405,"kyd":213,"kyk":2742,"kyj":122,"ktą":149,"kym":1532,"kyl":177,"ksč":304,"lpo":78,"lpn":97,"lpi":224,"lkė":206,"ls ":120,"lpt":73,"lok":724,"lon":920,"lom":534,"lop":556,"lor":335,"lod":121,"loc":79,"log":3426,"loj":732,"loi":69,"lpa":143,"los":2610,"lot":1612,"lou":74,"lov":512,"lno":410,"lią":195,"lni":4011,"lež":588,"lne":64,"lob":276,"lny":212,"lnu":597,"lmo":217,"lmi":148,"lme":109,"ldž":460,"lma":561,"lna":1027,"lmu":116,"lti":1386,"lto":827,"ltr":83,"loč":162,"ltu":191,"lty":140,"lub":724,"lkš":220,"lsi":157,"lsk":87,"lso":116,"dūr":138,"lkū":97,"lst":3935,"lsv":208,"lnė":167,"lkų":772,"lta":1286,"lte":434,"lu ":327,"lmė":408,"lsa":135,"liš":786,"liū":227,"lių":4116,"lt ":76,"lbė":158,"gą ":583,"lgu":66,"lgy":105,"lgo":332,"lge":222,"lbą":82,"lgi":1731,"li ":2175,"lga":584,"lač":341,"lfi":69,"lfa":89,"lez":63,"leu":86,"lev":356,"les":733,"let":811,"ler":662,"leo":285,"lep":84,"lem":868,"len":3529,"lek":1990,"lel":445,"lei":2618,"leg":368,"lef":117,"led":360,"lec":77,"ldy":3239,"lls":89,"llu":67,"lo ":4298,"lla":282,"lle":203,"lli":227,"llo":104,"lko":1340,"lku":68,"lks":80,"ln ":170,"lka":889,"lke":67,"lki":640,"lkl":70,"lbų":491,"lje":178,"ll 
":250,"lja":125,"lit":2868,"lis":6123,"lip":320,"lio":6518,"lin":10945,"lim":1681,"ldė":95,"liz":1115,"liv":318,"liu":3368,"lic":390,"lid":423,"lia":9617,"lib":160,"lik":3451,"laš":117,"lij":2779,"lig":956,"lie":4974,"lif":303,"ma ":8966,"gęs":112,"mb ":104,"lvų":129,"mac":657,"mai":3495,"maj":238,"mak":310,"mad":111,"mag":648,"mar":995,"mas":16980,"mal":1077,"mam":360,"man":2648,"maz":236,"mav":409,"mau":158,"mat":2440,"mba":403,"mbl":299,"mbi":672,"mbe":160,"mbr":361,"mbo":656,"me ":4745,"mbu":328,"mdo":70,"mdi":69,"med":2275,"meg":289,"mec":166,"met":7146,"mes":916,"mer":2496,"mem":105,"mel":455,"men":7636,"mei":219,"mez":71,"mfo":93,"lmų":71,"lpė":80,"lva":657,"lve":135,"lvi":553,"luk":80,"lui":136,"lup":118,"luo":731,"lun":291,"lum":619,"lut":328,"lus":1933,"lur":73,"luv":66,"lnų":522,"ly ":90,"lvo":651,"lyb":79,"lyd":329,"dų ":1583,"ltą":68,"lyj":2459,"lyk":200,"lyg":1880,"lsč":1443,"gė ":262,"lyv":644,"lyp":129,"lym":88,"lyn":910,"lys":1218,"lyt":728,"gėl":263,"gėj":295,"gėg":66,"gę ":71,"lvė":163,"ltų":271,"gėr":187,"ltū":1732,"gės":1188,"mpi":2031,"mph":67,"mpe":889,"mpr":78,"mpo":545,"mpl":367,"mpu":163,"ms ":4671,"mog":880,"moc":111,"mob":923,"mod":511,"mon":3483,"mok":6158,"moj":1912,"mom":231,"mol":407,"mor":477,"mos":8683,"mot":735,"mpa":1170,"mu ":1142,"gį ":94,"miš":1246,"mių":389,"mt ":92,"mto":342,"mtm":156,"mti":479,"mso":66,"msi":145,"mta":393,"mur":124,"mus":1181,"mut":199,"mui":800,"mul":740,"mum":317,"mun":878,"muo":1470,"muz":1069,"mpė":146,"džo":105,"dža":265,"mga":68,"eš ":414,"mi ":2927,"dži":8825,"dže":173,"meč":426,"mbū":66,"maž":1405,"min":8866,"mio":695,"ešo":276,"mil":606,"mim":392,"ešm":321,"mir":512,"mis":3697,"ešp":103,"ešt":271,"mit":1456,"ešu":171,"miu":166,"mic":65,"mia":1046,"eša":283,"mig":185,"eše":206,"mie":3684,"mid":149,"mik":627,"ešk":262,"mij":1021,"eši":2201,"maš":150,"mo ":11787,"mln":124,"mm ":215,"mna":274,"meš":73,"vėž":564,"tša":98,"įve":191,"įva":943,"įvy":373,"tūr":3760,"tūn":64,"tūk":418,"sūn":301,"sūr":75,"sų ":1585,"vė ":1134,"Ček":141,"Čer":143,"vęs":499,"Čiu":96,"vėr":82,"vės":1539,"Čik":109,"vėd":103,"Čil":228,"vėm":70,"vėn":180,"vėp":105,"Čia":184,"vėj":505,"vėl":1093,"vę ":211,"rža":90,"rže":425,"rži":124,"ržo":186,"ržu":185,"ržy":332,"vą ":302,"ržų":128,"vč ":177,"zra":92,"zmą":73,"uči":685,"ršū":168,"čem":518,"rūd":121,"rūg":209,"tį ":848,"rūt":179,"rūs":316,"rūv":85,"rūk":126,"rūn":147,"rūm":473,"rūp":93,"zmų":92,"zuo":830,"zul":113,"čia":6305,"čiu":3945,"čin":247,"rų ":3880,"čio":4188,"rūš":1748,"rųj":78,"čią":252,"čių":6037,"čiū":111,"zga":86,"rš ":371,"zdu":307,"zdy":84,"zdo":187,"zeu":81,"zen":162,"zel":75,"zer":347,"ze ":133,"zda":224,"zdi":311,"zde":65,"zac":1511,"zai":89,"zam":101,"zan":208,"zal":69,"zar":93,"zau":206,"zav":191,"zas":215,"zos":160,"zot":82,"zon":766,"zol":127,"zo ":283,"zma":593,"zmo":623,"zme":95,"zdž":483,"zna":91,"zmu":127,"rša":115,"zia":212,"zie":245,"zid":291,"zic":125,"zij":2812,"rši":542,"zin":555,"zil":400,"zik":1298,"ršk":199,"zio":150,"zis":547,"ršt":245,"zit":198,"ršu":471,"yvu":368,"ynų":327,"yvy":146,"yvo":316,"yve":3277,"yvi":885,"yva":1082,"ymų":245,"ytu":3236,"yto":736,"yti":2477,"yta":2174,"ynė":548,"yst":1756,"yną":129,"ysk":104,"ysl":216,"ysi":1379,"ykš":318,"yse":638,"sį ":224,"ymė":134,"ymą":285,"yri":1443,"yro":395,"yru":239,"ylė":72,"yra":10061,"yre":153,"ys ":5011,"ykę":197,"ypt":748,"ygų":179,"ypa":451,"yop":72,"yny":165,"ynu":272,"tęs":608,"yvū":473,"yvų":140,"za ":200,"tėl":64,"tėn":94,"tėm":94,"tėj":863,"ytų":2089,"tės":2264,"tėv":185,"tę 
":460,"yzd":234,"yrų":212,"ytą":75,"tė ":2107,"ytė":146,"ysč":138,"yrė":91,"ybi":1941,"ybo":1219,"yda":247,"yde":256,"ydi":267,"ydo":238,"ydr":112,"ydy":152,"ya ":66,"sęs":72,"rįž":117,"ybe":375,"yba":1117,"ydį":315,"ybų":501,"yka":208,"ykd":1149,"yki":766,"ykl":3525,"yko":378,"yks":518,"yku":494,"yn ":82,"yla":244,"yli":1437,"ygą":95,"yll":79,"ylo":78,"yma":917,"ydų":119,"ymi":564,"ydž":370,"yme":67,"ymo":2861,"ymu":353,"yna":1411,"yni":762,"yne":693,"yno":1069,"ygi":962,"ygl":225,"ybą":107,"yga":678,"ybė":5361,"tą ":2180,"ygo":549,"ygu":328,"ybę":375,"yin":78,"tąj":82,"yje":6777,"sči":2451,"pūs":102,"rį ":999,"sė ":786,"sėd":208,"sėk":228,"sėj":581,"sės":2025,"sėt":101,"sę ":210,"rįs":115,"pų ":324,"są ":505,"ožy":124,"ože":207,"oža":97,"oži":226,"rėž":276,"sąv":390,"sąr":690,"sąs":356,"sąj":351,"sąm":110,"sąl":240,"sąn":78,"pši":366,"ręs":410,"rėg":75,"rėl":160,"rėj":1557,"rėn":386,"rėm":119,"rėt":168,"rės":6344,"rę ":1445,"rėd":146,"ww ":76,"rąž":93,"rė ":1022,"www":76,"ws ":91,"rči":327,"nžu":77,"rą ":1079,"nži":161,"rąj":82,"ošt":122,"oši":132,"oše":197,"oša":95,"vyl":67,"vyk":2515,"vyn":610,"vyr":1040,"vyd":92,"vyj":96,"vys":564,"vyt":65,"vyz":235,"nųj":77,"war":74,"viš":606,"vro":160,"vių":2220,"vsk":219,"vu ":109,"pį ":89,"vus":1133,"vuo":278,"vum":127,"vul":245,"vz ":411,"nų ":7207,"vyb":651,"vož":79,"via":688,"vio":508,"vir":3137,"vik":248,"vil":1104,"vim":2439,"vin":4831,"vig":137,"nši":88,"vij":1156,"vic":102,"vid":2656,"vie":10516,"viz":527,"viv":2783,"viu":186,"vit":264,"vis":3495,"važ":200,"vka":106,"vo ":7394,"vež":182,"vič":254,"voj":2810,"vol":423,"vok":960,"von":160,"vor":253,"vot":192,"vos":5796,"ąra":686,"vi ":818,"ąjį":197,"vač":96,"mži":691,"ąją":384,"ver":3346,"ves":601,"vet":199,"vej":358,"vei":3674,"veg":178,"ven":4988,"vel":635,"vek":95,"ved":568,"ąmo":118,"ąna":83," − ":501,"ve ":425,"ąly":225,"val":10598,"vak":4952,"van":3055,"vam":251,"vap":91,"var":5998,"vat":733,"vas":2439,"vav":423,"vau":1068,"vaz":104,"vab":435,"vac":68,"vad":6655,"vai":6631,"vaj":535,"ąju":613,"vag":166,"uvų":343,"ąja":230,"va ":1330,"utų":550,"pės":2048,"pėj":200,"pėm":214,"pėn":79,"uvę":282,"pėd":630,"usų":229,"pę ":262,"uvė":461,"urž":199,"mūš":172,"mųj":304,"urų":123,"urš":132,"uzi":1325,"usį":91,"uza":114,"uzd":74,"utė":482,"pė ":2903,"urį":786,"mų ":3232,"usę":91,"usė":442,"usą":115,"uož":251,"urė":7194,"urę":1318,"uoš":293,"umų":278,"upę":254,"ux ":100,"upė":4550,"uvi":2426,"uvk":94,"uvo":8973,"uva":1382,"uve":227,"uvy":103,"unų":111,"uvu":401,"usl":117,"usm":292,"usk":802,"ukš":4009,"usi":16156,"mūg":65,"usd":94,"use":242,"usa":1585,"unė":100,"ukų":215,"usy":348,"usv":157,"usu":325,"ust":2671,"uss":74,"mūs":116,"ukū":174,"mūr":92,"usr":197,"uso":1941,"uti":3065,"ute":1211,"uta":1298,"utb":1100,"ulų":95,"uty":105,"uts":108,"utu":465,"uto":2462,"utr":654,"uoč":243,"us ":25324,"ukė":207,"ut ":98,"ulė":940,"urb":304,"ura":1290,"urd":135,"ure":433,"urg":609,"umą":619,"uri":13437,"pči":95,"urk":677,"urm":174,"urn":819,"uro":2323,"urp":92,"urs":335,"urt":2267,"uru":270,"ulį":98,"urv":125,"ury":263,"ujų":216,"uog":115,"uod":1799,"uob":148,"uop":102,"uon":468,"uol":1850,"uom":2229,"uoj":2916,"ują":85,"uok":472,"uot":3875,"uos":9387,"upa":260,"ugų":171,"uoz":64,"ur ":478,"uką":82,"upi":1336,"upe":880,"upo":164,"upr":184,"upy":138,"upt":113,"upu":131,"ump":675,"umu":499,"umi":565,"umo":1874,"uma":2535,"umb":630,"ume":1015,"udž":589,"uly":692,"ugė":81,"uo ":11031,"ugę":74,"unt":286,"uns":125,"unu":78,"unk":1010,"uni":2717,"uno":1046,"unc":101,"und":514,"una":778,"ung":2769,"une":377,"up 
":104,"uks":488,"ukr":338,"uku":1424,"ukt":1044,"uko":768,"ukm":63,"ukl":276,"uki":835,"ukc":193,"uke":516,"um ":371,"uka":1363,"ubų":81,"uju":231,"ulv":76,"ulu":205,"ult":2376,"uls":117,"ulp":163,"ulo":319,"ulm":98,"ulk":1598,"uli":4261,"ulg":118,"ule":258,"ula":686,"un ":91,"uid":65,"uik":63,"uil":83,"uin":83,"uis":152,"uic":69,"ąvo":220,"uje":2229,"uji":238,"ujo":901,"ąve":118,"ąva":67,"uit":183,"uiz":104,"ul ":100,"udė":847,"uja":1817,"ugi":2043,"ąsi":447,"lži":190,"ugd":727,"uge":478,"ugn":281,"ugo":862,"ugp":256,"ugl":79,"ui ":2863,"uga":3449,"ugy":234,"ugv":81,"ugu":1147,"ugs":210,"ugr":96,"uha":63,"pą ":193,"ąst":536,"uda":2525,"ude":508,"udi":1843,"ubo":317,"ubt":123,"ubr":94,"ubu":247,"ue ":115,"uci":1337,"uch":229,"uer":154,"ufo":165,"udu":90,"udr":145,"udo":3114,"ug ":571,"udy":189,"udz":149,"uen":154,"uel":163,"uei":67,"tyč":85,"tuš":104,"ua ":181,"uau":101,"uar":229,"ual":389,"uan":260,"ubi":273,"ubj":203,"ubl":1778,"ube":224,"uba":739,"uac":101,"trų":261,"trū":89,"tyv":1223,"tyg":105,"tyj":2629,"tyk":592,"tyl":66,"tym":1125,"tyn":745,"tyr":732,"tys":1652,"tyt":1130,"tvė":152,"ty ":191,"tvy":70,"tve":795,"tvi":1569,"tva":1876,"tur":3389,"tus":4707,"tut":476,"tuv":8023,"tui":321,"tul":676,"tuk":242,"tun":200,"tum":1081,"tup":188,"tuo":4132,"tub":127,"tua":342,"tud":729,"tuc":1071,"tug":166,"tyb":2995,"lų ":4053,"trė":213,"trą":137,"ts ":284,"tiš":751,"tre":865,"tt ":65,"tra":6670,"tri":3774,"oči":767,"tru":2118,"tro":4397,"nį ":1595,"tu ":2200,"try":1680,"tsa":349,"lūd":70,"tsi":864,"lūn":117,"lūk":105,"tsk":627,"tsp":242,"tst":1087,"lūs":124,"tte":91,"ttp":75,"tme":480,"tma":182,"to ":9437,"tmo":148,"tmi":171,"tni":215,"tne":192,"tp ":76,"tna":162,"tno":94,"tod":622,"toc":126,"toj":3705,"tog":985,"tob":201,"tov":2012,"tos":4057,"tot":663,"toz":74,"tom":2117,"ton":1654,"tok":1091,"tol":2595,"tor":6343,"top":234,"tr ":100,"tpa":67,"tij":3726,"lši":179,"taš":256,"til":880,"tik":5456,"tif":144,"tie":4693,"tig":115,"tir":719,"tit":1932,"tis":9550,"tin":18689,"tim":2115,"tip":1038,"tio":733,"thu":146,"tia":180,"tib":66,"tic":399,"tid":346,"taž":90,"tiz":264,"tiu":112,"tiv":191,"tko":75,"tku":65,"tka":159,"tli":717,"tla":468,"tle":281,"tem":2567,"ten":1501,"teo":504,"tep":206,"tei":4906,"tek":2029,"tel":3046,"teg":305,"tea":212,"teb":306,"tec":578,"ted":149,"tfo":195,"th ":187,"tez":110,"tet":1080,"tes":640,"ter":7958,"ti ":12579,"tga":173,"tač":773,"tho":100,"the":357,"thi":68,"tha":124,"yži":382,"zūr":65,"žė ":79,"zų ":230,"žėj":93,"žės":302,"yšu":129,"yšk":285,"yši":281,"žįs":104,"AR ":460,"AT ":96,"AV ":1248,"zę ":74,"zės":255,"BA ":84,"AB ":129,"가가":99,"Žie":168,"Žmo":87,"Žal":202,"Žai":105,"Žem":792,"ža ":218,"vųj":76,"vų ":1432,"Žva":101,"žli":98,"žka":118,"žin":2776,"žim":518,"žik":242,"žir":69,"žio":3609,"žiu":1205,"žis":284,"ždė":292,"žia":4861,"žie":756,"三 ":108,"žpa":73,"žos":282,"zė ":373,"žny":789,"žoj":209,"žol":428,"žod":694,"žmo":1552,"žią":130,"žni":1016,"žna":456,"žo ":203,"ždž":217,"žde":109,"žda":511,"žas":486,"žba":64,"žai":1499,"žal":557,"žan":496,"žar":129,"ži ":73,"žer":3193,"žes":178,"žet":91,"žei":363,"žel":344,"žem":2279,"žen":275,"ždy":272,"ždu":70,"ždi":76,"už ":1010,"uža":101,"užd":391,"uže":68,"užs":382,"užt":281,"užr":186,"užu":95,"užk":126,"uži":622,"užp":129,"užn":124,"žys":63,"žym":728,"žyg":68,"žyd":166,"žyb":321,"žtv":87,"žud":129,"žuv":1113,"žut":66,"žur":382,"žuo":217,"žve":120,"žva":821,"žvi":225,"žvy":103,"žra":185,"žių":1603,"žiū":506,"žsi":337,"žta":167,"žte":70,"žti":234,"yči":2350,"vūn":458,"užė":257,"tųj":219,"tų ":12460,"ušė":81,"之 
":65,"ušt":63,"ušk":90,"uši":396,"uša":109,"tžv":91},"n_words":[6266541,7160065,6094403],"name":"lt"}
\ No newline at end of file
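
The blob deleted above (and the one deleted below) is a langdetect-style language profile: a "freq" map of character 1- to 3-gram counts, an "n_words" array giving the total gram count per order, and a "name" language code ("lt" above, "lv" below). A minimal sketch of loading such a profile and scoring a sample string against it, assuming only the structure visible in this diff; the file path and the summed-relative-frequency heuristic are illustrative assumptions, not langdetect's actual detection algorithm:

    import json

    def load_profile(path):
        # Profile layout as seen in this diff:
        #   {"freq": {ngram: count, ...}, "n_words": [n1, n2, n3], "name": "lt"}
        with open(path, encoding="utf-8") as fh:
            return json.load(fh)

    def char_ngrams(text, n):
        # All character n-grams of `text`.
        return (text[i:i + n] for i in range(len(text) - n + 1))

    def naive_score(text, profile):
        # Illustrative heuristic (not langdetect's algorithm): sum the relative
        # corpus frequency of every 1-, 2- and 3-gram found in `text`.
        freq, totals = profile["freq"], profile["n_words"]
        score = 0.0
        for n in (1, 2, 3):
            denom = totals[n - 1] or 1
            score += sum(freq.get(g, 0) for g in char_ngrams(text, n)) / denom
        return score

    if __name__ == "__main__":
        # Hypothetical path; the deleted profiles shipped inside the langdetect tree.
        lt = load_profile("langdetect/profiles/lt")
        print(lt["name"], naive_score("labas rytas", lt))

The real library runs a probabilistic (naive-Bayes-style) detection over these same counts; the sketch only illustrates the data layout being removed here.
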
+++ /dev/null
-{"freq":{"ūci":182,"ūdu":26,"ūdr":44,"ūdo":28,"ūde":716,"ūda":90,"D":3059,"E":2166,"F":1831,"G":1835,"A":5495,"B":3355,"C":2145,"L":4567,"M":3974,"N":1829,"O":1507,"H":1820,"I":2327,"J":1801,"K":4616,"U":852,"T":4509,"W":365,"V":3715,"Q":38,"P":4759,"S":5447,"R":3339,"Y":65,"X":234,"Z":1747,"f":8161,"g":29438,"d":58295,"e":131519,"b":30435,"Fed":124,"c":22500,"a":237698,"n":95937,"o":88352,"l":73126,"m":69285,"j":43124,"k":78380,"Fel":22,"h":5764,"Fen":23,"i":199506,"w":553,"v":49552,"u":101354,"Fer":33,"t":118682,"s":178123,"r":127678,"q":293,"p":52161,"z":35062,"y":1297,"x":396,"²":54,"Fil":64,"í":35,"Fin":30,"Fir":23,"é":81,"ç":26,"ä":65,"â":42,"á":40,"ü":91,"ö":83,"ó":42,"Fiz":28,"ē":27728,"Ē":143,"Ā":409,"ā":76597,"Č":300,"č":1638,"ı":70,"ķ":3799,"Ķ":255,"Eze":42,"Ļ":56,"ļ":8048,"Ģ":128,"ģ":3732,"Ī":133,"ī":34346,"Ģer":28,"Ģeo":39,"Ģen":28,"ş":30,"ņ":7202,"Ņ":124,"ō":55,"Ž":134,"ž":4793,"Š":1043,"š":16459,"Ū":37,"ū":7903,"Equ":185,"Eri":30,"Ern":25,"Eur":70,"Eir":595,"́":38,"Ele":79,"Eks":36,"Eko":28,"μ":50,"ν":83,"End":31,"ο":116,"ι":60,"κ":31,"λ":61,"δ":42,"ε":43,"η":40,"α":118,"β":23,"γ":49,"ί":37,"Emi":22,"Eli":48,"ω":27,"Ent":23,"ό":36,"σ":41,"ς":97,"ρ":66,"π":29,"φ":23,"υ":22,"τ":47," l":9777,"ь":65," m":8646," n":13651," o":3224,"я":84," h":1222," i":26578," j":3868," k":23420,"ы":62," d":13444," e":3216,"х":31,"ц":23," f":3413," g":7521,"ч":56,"р":268," a":22507,"с":262," b":8086,"т":170," c":4249,"у":116," y":33," x":65," z":3907," u":13570," t":13126," w":93," v":19041," q":33," p":25406," s":20595," r":6772,"HK ":23,"Ūde":29,"Л":28,"К":30,"Н":52,"М":37,"О":27,"Б":29,"А":26,"В":29," J":1795," K":4607," H":1796," I":2309," N":1813," O":1473," L":4536," M":3956," B":3324," C":2058,"Р":35," A":5469,"С":193," F":1822," G":1815,"Ф":23," D":3031," E":2156,"л":187," Z":1739,"к":221,"й":81," Y":64," X":228,"и":382,"п":57,"о":387,"н":271,"м":85," S":5368,"г":53,"Ger":28," R":3325,"в":222,"б":51," Q":37," P":4729,"а":390,"Geo":50," W":350,"з":76," V":3691,"Gen":23," U":847,"е":335," T":4452,"д":133,"šža":24," ā":501," Ā":409," č":467," Č":299," Ē":143," ē":282,"HL ":22," ž":311," Ž":134," Ū":37," ū":565," Š":1042," š":2083," Ļ":56," ļ":329," ķ":663," Ķ":254," Ī":131," ī":702," Ģ":128," ģ":1094," Ņ":120," ņ":41,"י":25,"Gan":49,"Gal":115,"Gam":34,"Gau":47,"Gar":70,"Gai":64,"Fut":176,"و":37,"ي":53,"ل":68,"م":47,"ن":33,"د":35,"ب":27,"ة":22,"ا":101,"س":25,"ر":50,"Frī":41,"šūn":142,"Flo":40," А":25," Б":28," В":29,"Fra":343," К":29,"Fri":44," Л":27," М":37," Н":52," О":26,"Fre":87,"A ":493,"For":93," α":37,"F ":117,"Da":634,"Cu":37,"Cl":54,"Co":292,"Cr":43,"Ce":347,"Ch":135,"Ci":209,"G ":59,"Ed":95,"Eb":22,"Du":106,"Dz":229,"Do":208,"Dr":103,"De":400,"Di":679,"IP ":41,"Fe":227,"H ":468,"Fa":110,"Ez":43,"Eu":85,"Ev":42,"Ex":29,"Er":98,"Eq":185,"Et":48,"Es":44,"En":123,"Em":90,"Ei":677,"El":193,"Ek":93,"Eg":26,"Ge":158,"Bā":56,"Ga":460,"I ":434,"Fu":230,"Fr":536,"Fo":212,"Fl":70,"Fi":194," о":23,"B ":133," Р":34," С":176," Ф":23,"II ":185,"C ":377," с":29,"Av":74,"Au":759,"Ar":519,"At":405,"As":243,"D ":170,"Ba":1010,"Az":34,"Af":50,"Ag":51,"Ah":37,"Ab":147,"Ac":82,"Ad":148,"Am":328,"An":739,"Ap":260,"Ai":200,"Ak":154,"Al":535,"Hit":49,"His":22,"Bu":240,"Br":526,"Ca":279,"E ":106,"Bi":333,"Be":447,"Bo":341,"Hil":23,"Bl":116,"Him":40,"Hip":27,"Kv":49,"Ku":433,"Kl":267,"Kr":1045,"Ko":785,"Gā":24,"Le":274,"Li":1187,"N ":114,"La":2106,"Lu":187,"Ly":26,"Lo":258,"Me":656,"Hā":28,"Dž":311,"Mi":487,"O ":256,"Ma":1352,"My":39,"Hē":35,"Mu":257,"Mo":497,"Ni":163,"Ne":339,"Na":331,"P 
":315,"Hel":44,"Hei":53,"Nu":37,"No":488,"Ok":85,"Ol":289,"On":42,"Og":53,"Oh":28,"Od":50,"Jā":93,"Hen":52,"Of":29,"Ob":41,"Her":113,"Gi":68,"Gl":88,"Gr":518,"Go":173,"IA ":51,"Gu":165,"Gv":31,"Bē":72,"J ":48,"Ha":442,"He":346,"Hi":235,"Ho":305,"Bī":48,"Hu":55,"Cē":86,"K ":99,"Ib":23,"Id":25,"Dā":72,"Ie":161,"Ig":193,"Im":77,"In":597,"Ik":26,"Il":62,"Dē":23,"Iv":35,"Is":56,"It":130,"Ir":159,"Ja":606,"L ":193,"Iz":255,"Je":180,"Jo":238,"Hab":30,"Ju":235,"Hal":59,"Hai":24,"Ka":1308,"Han":77,"M ":140,"Ham":27,"Har":76,"Ki":234,"Hav":40,"Ke":175,"Mū":173,"Ur":133,"Up":46,"Un":191,"Uk":57,"Ul":29,"Ug":37,"Pā":146,"W ":36,"Tu":367,"Nī":78,"Tr":348,"Lū":27,"To":528,"Th":183,"Ti":330,"Tj":24,"Te":412,"Ta":1094,"V ":507,"Bēr":58,"Grī":41,"Sy":41,"St":811,"Sv":222,"Su":204,"Wo":58,"Wi":99,"Wa":68,"Rā":30,"We":47,"Vo":206,"Pī":38,"Vu":40,"Vi":1349,"Vl":38,"X ":140,"Va":795,"Ve":528,"Pē":222,"Uz":97,"Lā":40,"Pu":149,"Pr":574,"Ps":29,"Gus":31,"S ":509,"Pe":330,"Kā":157,"Pa":1460,"Gui":25,"Pl":246,"Po":522,"Pi":543,"Ph":56,"Gul":51,"Os":199,"Ot":132," ا":37,"Ov":28,"Op":65,"Or":179,"R ":242,"šņu":59,"Jē":211,"Nā":75,"Se":449,"Sc":140,"Si":396,"Sh":52,"Sn":22,"Sm":71,"Sl":112,"Sk":318,"Mī":29,"Sp":308,"So":292,"Ru":198,"Jū":171,"Mē":139,"U ":122,"Sa":1320,"Grā":24,"Re":684,"Mā":133,"Ri":309,"Rh":25,"Ro":567,"Lī":215,"Qu":25,"Lē":28,"T ":118,"Ra":377,"Gre":140,"Gri":112,"Gra":91,"Vī":58,"b ":1431,"Gru":54,"Gro":39,"a ":45978,"Sē":77,"Tā":889,"Yo":26,"Tē":78,"Sī":40,"Z ":47,"Rē":60,"Sā":102,"Pū":23,"Rī":858,"Gol":28,"Got":33,"Vā":356,"Vē":122,"Za":126,"Ze":604,"Zi":554,"Tī":49,"Zo":72,"Zu":34,"God":26,"Rū":43,"Zv":222,"šķā":29,"i ":20844,"aē":38,"gd":52,"ge":675,"bā":1878,"ga":11955,"gb":23,"Ing":34,"Inf":36,"fl":161,"fg":38,"ff":54,"aā":32,"fi":2006,"šķē":67,"fs":184,"fr":742,"ač":195,"fu":915,"ft":143,"Int":119,"fo":1630,"Ins":32,"j ":431,"aķ":196,"gz":413,"bē":351,"Ir ":43,"cā":282,"he":981,"aļ":2174,"ha":893,"gn":346,"gm":103,"gl":1302,"gk":23,"aģ":126,"gi":1023,"gh":75,"gg":39,"gv":152,"gu":2538,"gt":345,"gs":1978,"gr":3115,"šķī":179,"aī":165,"go":1269,"dt":34,"du":3460,"dv":153,"dy":39,"dz":8558,"g ":378,"Ima":40,"ea":831,"eb":2145,"ec":2362,"ed":5035,"de":6326,"dd":61,"dg":22,"di":7508,"dh":33,"dk":50,"dm":437,"dl":81,"do":4137,"dn":387,"dp":40,"ds":1737,"dr":2632,"ew":72,"ex":66,"eu":138,"ev":3202,"ey":101,"ez":2104,"fa":698,"h ":326,"Inc":26,"Ind":290,"fe":924,"eh":574,"eg":1917,"ef":579,"ee":260,"el":8559,"ek":9435,"ej":1682,"ei":6080,"ep":1928,"eo":1018,"Imp":27,"en":17231,"em":12435,"et":9035,"es":13667,"er":13266,"cb":46,"ca":1325,"e ":9104,"bv":92,"by":23,"bs":478,"br":1873,"bu":2780,"bn":90,"bo":2478,"bp":134,"bj":321,"bk":204,"bl":923,"bi":5030,"bb":37,"bd":41,"be":3600,"da":13240,"f ":227,"cy":80,"cu":688,"ct":320,"cs":106,"cr":118,"co":534,"cp":28,"cm":65,"ck":210,"cl":78,"ci":9940,"ch":496,"ce":3798,"cc":58,"c ":1617,"Zī":28,"az":1867,"ay":138,"ba":7096,"d 
":1260,"at":15301,"as":46560,"ar":22107,"šķe":37,"ax":44,"aw":25,"av":4016,"au":11606,"šķa":58,"ak":5105,"al":13877,"ai":16755,"aj":4377,"ao":99,"ap":6109,"šķi":792,"am":7193,"an":13591,"ac":1971,"ad":7331,"aa":123,"šķu":61,"ab":3333,"ag":2079,"ah":438,"ae":799,"af":317,"nu":3766,"nt":6816,"ns":6634,"ič":143,"nr":301,"hī":43,"np":87,"no":13676,"nn":366,"nz":252,"ny":68,"iē":45,"nv":1299,"oe":196,"jā":6443,"of":937,"Dān":37,"oc":1480,"od":5344,"oa":231,"ob":1666,"Dār":22,"om":4560,"on":9439,"ok":3012,"ol":6437,"oi":175,"gš":291,"iģ":203,"oj":3579,"og":1772,"oh":231,"Iga":176,"ot":8233,"m²":53,"gū":417,"os":7017,"ov":1717,"ou":258,"op":3592,"oo":225,"or":8642,"iķ":475,"r ":23056,"ox":43,"jē":253,"ow":104,"oz":1358,"oy":26,"pd":241,"pe":2624,"kā":9200,"pg":761,"gž":90,"pa":12196,"pb":92,"iļ":351,"pc":59,"pl":2446,"iņ":2232,"pm":337,"pn":123,"po":2922,"ph":171,"pi":8931,"pj":212,"pk":534,"lo":5141,"ln":1238,"eņ":1120,"lm":668,"ll":864,"ls":6050,"dū":151,"lr":53,"fī":68,"lp":774,"lv":2188,"lu":3183,"lt":2348,"lz":417,"ly":68,"gē":101,"o ":13523,"md":123,"ma":11255,"mb":1027,"dž":490,"mg":152,"me":8257,"mf":62,"hā":197,"mk":81,"ml":84,"mi":7529,"eš":2372,"mj":254,"mn":294,"mm":568,"mp":1937,"mo":3385,"mt":1499,"ms":4163,"mv":67,"Iek":27,"mu":4579,"hē":85,"mz":26,"Ied":23,"Ier":30,"p ":1187,"Ies":23,"na":11100,"nb":173,"nc":1308,"gļ":378,"nd":3610,"ne":6367,"nf":496,"iā":1355,"ež":1461,"ng":2033,"nh":159,"ni":10637,"nj":103,"nk":1409,"nl":178,"nm":119,"jv":54,"jt":24,"ju":6450,"jr":40,"eč":136,"bū":646,"js":1745,"jp":49,"dī":2457,"jn":108,"jo":2718,"jl":29,"jk":33,"ki":2861,"eģ":460,"kh":87,"kg":105,"fā":86,"ke":1430,"kd":52,"kc":519,"kb":27,"ka":21595,"m ":14604,"fē":229,"ky":24,"cū":51,"ks":6047,"kt":4783,"ku":9917,"kv":498,"ko":8251,"kp":53,"eī":44,"kr":3493,"kk":49,"kl":2204,"dņ":25,"km":1006,"kn":291,"li":12111,"lh":104,"lk":789,"lj":234,"le":5914,"ld":1879,"lg":521,"lf":344,"gā":2261,"la":14259,"eļ":2084,"lc":198,"lb":519,"n ":12333,"eķ":397,"hr":242,"hs":176,"bī":818,"ht":159,"hu":177,"hi":1019,"hn":345,"ho":773,"hl":97,"aņ":799,"hm":55,"id":6988,"ic":1972,"ib":840,"ia":987,"ih":374,"ig":1651,"if":662,"dā":3118,"ie":43851,"hy":49,"cē":786,"k ":3032,"cī":1751,"iq":24,"ir":20819,"is":22114,"it":6916,"iu":210,"iv":1914,"dē":1715,"ix":47,"ii":76,"aš":1551,"ij":18002,"ik":8262,"il":7272,"im":5227,"in":12037,"io":3152,"ip":1361,"je":2277,"jd":65,"až":1131,"eā":550,"ji":2430,"iz":8191,"l ":908,"ja":18785,"xi":24,"pš":285,"rģ":265,"pū":149,"rī":5569,"xt":23,"rē":1505,"ww":72,"z ":3167,"ož":188,"sā":1658,"nž":71,"wi":65,"oš":2487,"šīs":96,"šīr":28,"wo":40,"šīn":152,"ws":45,"vv":135,"vz":34,"y ":470,"wa":100,"pļ":34,"rā":7122,"we":66,"vl":36,"vm":57,"oņ":431,"vj":161,"vk":55,"vi":15879,"nš":154,"vg":67,"vu":1511,"vr":110,"vs":665,"nū":52,"vp":135,"pī":487,"vn":277,"vo":2198,"šīb":168,"uz":3239,"ux":47,"pē":3898,"uv":1138,"uu":163,"ve":6316,"vd":128,"vc":24,"oļ":231,"va":13478,"oķ":34,"x ":250,"oģ":891,"ui":330,"mš":154,"uj":549,"uk":2577,"ul":3970,"ue":159,"pā":3136,"uf":42,"ug":2904,"lž":37,"uh":49,"oī":103,"ur":8826,"mū":945,"us":8406,"ut":3554,"um":10369,"un":12883,"uo":66,"up":1299,"ty":109,"tz":181,"tu":9581,"tt":1263,"tw":38,"tv":2682,"ub":1129,"ua":219,"ud":1801,"uc":777,"w ":104,"to":9558,"tn":1900,"tm":416,"tl":676,"lū":587,"ts":7290,"tr":9995,"oč":89,"nī":2204,"tp":327,"tg":235,"tf":75,"te":8645,"td":205,"tk":771,"tj":107,"ti":16287,"lš":164,"th":431,"v ":977,"šī 
":103,"tb":1541,"tc":58,"ta":17648,"su":2355,"sv":864,"ss":1501,"kū":96,"st":22568,"sy":29,"sz":152,"sw":23,"nē":1546,"sl":2038,"sk":10907,"sn":958,"sm":2964,"sp":4072,"so":1970,"sr":95,"nč":276,"mī":785,"sd":451,"sc":379,"sf":321,"nā":8240,"se":3495,"sh":197,"sg":86,"sj":78,"si":5921,"kš":1455,"rz":732,"u ":34678,"sa":11922,"sb":260,"rr":239,"rs":5613,"jū":1021,"rt":4290,"ru":7404,"rv":1310,"Cēs":56,"mē":2975,"ry":160,"lī":4107,"rp":1821,"ro":10142,"rn":1729,"kņ":114,"rm":3278,"rl":568,"rk":1664,"rj":126,"jš":27,"ri":15479,"rh":439,"rg":2074,"iž":275,"mā":4269,"rf":127,"re":6968,"rd":2322,"rc":590,"kļ":994,"rb":2090,"ra":18562,"t ":6209,"lē":2235,"qu":266,"kī":37,"iš":513,"lā":5128,"s ":99264,"kē":55,"pz":438,"pt":1346,"pu":3008,"pv":476,"pp":139,"jī":54,"pr":4678,"ps":1123,"zī":3820,"zā":1002,"IX ":48,"šģi":28,"zē":1137,"už":113,"uš":785,"IV ":42,"tš":221,"uļ":295,"uņ":333,"vī":926,"tū":1454,"vā":2847,"tļ":184,"uģ":166,"rž":79,"zz":61,"sū":160,"Bīb":25,"vē":3474,"tņ":104,"uā":341,"Hor":54,"zg":379,"zi":7929,"rš":1224,"zb":206,"Hou":27,"zc":577,"zd":606,"ze":4296,"za":1506,"Hom":27,"Hon":23,"Hok":34,"Hol":58,"zv":1734,"uē":34,"rū":541,"zs":1624,"uč":23,"zr":375,"zu":648,"zt":460,"zo":976,"zn":964,"tī":5903,"zp":657,"zk":332,"zj":64,"zm":1200,"zl":791,"yg":24,"ye":62,"tā":11917,"rļ":24,"yc":58,"yd":22,"ya":64,"rķ":279,"sē":2104,"tē":2602,"yt":35,"ys":126,"yr":30,"sī":1152,"yp":31,"yo":86,"yn":43,"ym":26,"rņ":171,"yl":66,"yi":22,"Arg":44,"Arh":40,"Are":22,"Ard":29,"Ara":25,"Arm":42,"Ari":31,"Šīs":37,"Apv":89,"Apo":23,"Atk":35,"Atl":83,"Atr":91,"Ato":22,"Ast":53,"Ata":28,"Asi":33,"Aso":76,"Art":56,"Avo":25,"Aut":94,"Aus":451,"Aug":149,"Alū":46,"Arā":51,"šēj":249,"zš":91,"zū":24,"zņ":392,"Šķi":29,"Atš":29,"Bak":26,"Bal":501,"Ban":57,"Bab":25,"Baz":45,"Bar":119,"Bas":49,"Bau":40,"Abr":31,"Act":23,"Ada":38,"šā ":313,"Adr":25,"šād":125,"šāk":184,"šām":60,"šās":183,"šāv":22,"Afg":26,"Aiz":104,"Aka":34,"Ala":26,"Alb":79,"Ale":93,"Alf":40,"Alt":55,"All":36,"Alp":29,"Ame":197,"Ama":49,"Šī ":93,"Ang":135,"Ana":95,"And":123,"Ant":228,"Ann":55,"Ar ":101,"ОН ":57,"Bul":41,"Bur":79,"Blū":22,"Bru":49,"Brī":92,"² ":53,"Brā":30,"Cal":37,"Cam":27,"Car":39,"Cau":23,"Can":86,"Cap":22,"CH ":201,"Bez":37,"Ber":134,"Ben":73,"Bel":45,"Bie":67,"Biz":31,"Šād":55,"Bil":27,"Bir":42,"Bio":33,"Blo":34,"CP ":24,"CO ":32,"Bla":25,"Bre":53,"Bra":125,"Bro":55,"Bri":92,"Bol":40,"Bon":23,"Bor":50,"Bos":71,"Bov":25,"Der":41,"Des":26,"Dei":40,"Del":44,"Dem":45,"Den":34,"Deg":23,"Dan":52,"Dar":42,"Dat":63,"Dau":261,"Dab":33,"Chr":32,"Cit":38,"Cir":22,"Cil":47,"Cel":26,"Cen":195,"Cet":27,"Cer":41,"Cha":50,"Cor":106,"Com":69,"Col":30,"Con":48,"FA ":41,"ģip":59,"ģin":756,"ģio":352,"ģit":52,"ģis":449,"ģim":77,"ģij":982,"Dze":94,"Dzi":93,"ģi ":28,"ģer":123,"ģel":28,"ģeo":161,"ģen":236,"Edv":25,"Edu":28,"ģa ":65,"Dzī":27,"āg":215,"Daž":59,"āf":379,"āc":3184,"ād":3136,"āb":862,"āp":472,"ām":4331,"ān":2529,"āk":5743,"āl":5581,"āj":3661,"āv":1981,"āt":4781,"ās":7161,"ār":6698,"āz":697,"Dib":24,"Dis":48,"Din":26,"Āb":22,"Dig":22,"Ād":23,"Āg":35,"Āf":113,"Die":362,"Āz":143,"Ār":50,"Div":37,"ā ":23455,"Dub":30,"ür":23,"ая ":29,"ģu ":60,"СН ":64,"Dra":31,"Dob":31,"ES ":40,"Don":29,"Dom":61,"OH ":75,"Nea":33,"ēģ":190,"Nei":26,"Net":40,"Ner":49,"Nep":30,"Neo":28,"Ēģ":41,"Nat":30,"−":111,"īdi":166,"īdr":32,"īds":186,"īdu":170,"īda":185,"īde":154,"Nin":27,"Nik":51,"ēž":23,"īci":383,"ūšu":22,"īca":276,"ūša":102,"īce":148,"ībn":58,"ēš":773,"ībi":22,"ību":1222,"ībv":37,"īd 
":59,"ībe":36,"ība":3544,"ēļ":376,"New":31,"ēķ":56,"ēņ":27,"īcā":40,"ībā":960,"īgi":496,"īga":1676,"īgu":405,"īgo":298,"īgs":407,"īdz":1603,"īla":22,"Nam":26,"īgā":813,"īle":67,"īli":124,"Nac":121,"īka":42,"īkl":241,"īko":106,"īks":151,"īja":305,"ģu":81,"īji":84,"īju":700,"īdī":105,"īm ":246,"ģi":2798,"ģe":601,"ģa":66,"īdā":112,"īcī":70,"Ģe":106,"āļ":238,"āņ":597,"āč":113,"Či":48,"Če":124,"Ča":35,"Ču":25,"či":150,"če":516,"Īri":58,"ča":247,"āš":567,"āž":93,"No ":69,"ē ":2668,"Īsl":26,"čo":33,"čs":42,"ču":562,"ēz":349,"ēt":4896,"ēs":1279,"ēv":258,"čū":27,"Čī":26,"OS ":28,"ēm":2242,"ēn":867,"ēk":1496,"ēl":2212,"ēr":3544,"ēp":82,"Nov":61,"ēc":1136,"ēd":833,"Nor":163,"ēj":4209,"ēg":174,"Nos":50,"Not":22,"ēb":25,"Nok":22,"Ēr":64,"Nob":33,"ķe":541,"ķa":229,"ķu":510,"ķs":30,"ķo":78,"Odi":27,"ķi":1644,"ļ ":195,"Ogr":36,"Jān":51,"ļa":2291,"ļi":696,"ļk":39,"ļj":32,"ļe":71,"ļd":32,"ķā":31,"Ķī":170,"ļr":180,"ļs":48,"ļl":36,"ļo":646,"ķē":171,"ļu":2107,"ļv":154,"Ļe":28,"Oke":30,"ķī":541,"ļķ":37,"ļī":23,"ļģ":54,"ļē":91,"PA ":36,"ļā":656,"ļš":133,"ļū":169,"ļļ":89,"ļņ":176,"ņv":39,"ņi":716,"ņk":42,"ņj":25,"ņo":260,"ņn":28,"ņu":2226,"ņs":60,"ņr":139,"Ņu":43,"ņa":1766,"ņb":29,"ņd":47,"ņe":432,"Otr":101,"Ovi":22,"ī ":3304,"ģē":120,"ģī":39,"Jēk":140,"Īs":48,"Īr":60,"īb":5893,"Jēz":37,"īc":960,"īd":2819,"īj":1202,"īg":4125,"īl":317,"īk":663,"īn":1786,"īm":2002,"īp":453,"īr":757,"īt":3369,"īs":1871,"īv":3050,"īz":500,"ģīt":31,"Oli":250,"ı ":35,"Jāņ":29,"īģ":38,"īč":97,"īņ":368,"PS ":25,"īļ":122,"īš":592,"Ope":52,"Org":41,"Ost":30," −":67,"Osk":35,"Osm":120,"Ķe":55,"š ":1382,"Ple":55,"Pla":145,"Pil":83,"Paš":73,"Pit":22,"Pir":196,"QL ":27,"Pie":153,"Pha":28,"ģēt":22,"ģēr":24,"Kār":70,"ģēl":37,"šī":572,"šģ":28,"Per":109,"šļ":41,"Pet":30,"Pen":59,"Pek":25,"Pel":45,"šķ":1261,"Šī":134,"Kā ":42,"šā":893,"šē":273,"Šķ":45,"Šā":65,"Paz":31,"Pat":54,"Pas":386,"Par":468,"Pav":27,"Pau":41,"Pad":103,"Pan":44,"Pap":26,"Pal":61,"Pak":46,"šg":29,"še":180,"šd":202,"ša":5315,"šo":571,"šp":217,"šm":110,"šn":89,"šk":157,"šl":112,"ši":1623,"šv":170,"šu":2014,"št":354,"šs":357,"šr":148,"šz":36,"Še":104,"Ša":199,"Šo":132,"Ši":163,"Št":29,"Šv":81,"ņš":324,"Ņū":40,"ņģ":77,"Prū":54,"ņē":348,"ņķ":245,"ņā":297,"Prā":22,"Pro":228,"Pri":121,"Pre":78,"Pra":58,"Pol":274,"Pos":25,"Pop":24,"Por":59,"žr":80,"žs":96,"žu":691,"žn":24,"žo":773,"že":338,"ža":902,"žk":75,"žm":27,"ži":613,"Ža":46,"Ži":23,"Že":35,"RS ":104," ال":23,"ž ":41,"žū":26,"žņ":118,"žī":67,"žģ":48,"žā":790,"SA ":43,"Rad":110,"Rai":35,"Rag":24,"Rak":35,"ū ":140,"šņ":106,"Ūd":31,"šž":26,"šū":152,"ūg":56,"ūd":971,"ūc":206,"ūz":629,"ūs":815,"ūt":957,"ūv":256,"ūp":130,"ūr":2415,"ūk":417,"ūl":106,"ūm":112,"ūn":373,"ūš":146,"ūž":54,"Isl":22,"Irā":62,"Iva":31,"Izv":72,"Izr":29,"Itā":112,"Jac":27,"Jav":26,"Jau":217,"Jap":129,"Jan":40,"Jam":45,"Jag":28,"Ja ":27,"Jel":80,"Jer":41,"Jor":27,"Jon":25,"ア":23,"Joh":102,"KS ":29,"Jug":32,"Jup":83,"Jur":42,"Jum":22,"Kad":24,"Kab":22,"Kai":30,"Kam":79,"Kal":190,"Kap":52,"Kan":182,"Kau":72,"Kat":99,"Kas":57,"Kar":357,"Kaz":79,"Ker":28,"Ken":34,"Kem":29,"Kei":25,"Kir":46,"Kin":77,"Kij":27,"Kli":34,"Kle":26,"Kla":44,"Klu":132,"Kon":195,"Kom":112,"Kol":59,"Kos":58,"Kor":100,"Kop":106,"Kod":29,"Kok":40,"Kre":35,"Kra":70,"Kri":714,"Kro":35,"Kru":113,"Krā":28,"Kul":67,"Kur":259,"Kva":27,"Let":23,"Leo":37,"Lej":26,"Led":38,"Lau":64,"Lak":51,"Lai":103,"Lag":34,"Lat":1556,"Lar":29,"Lap":25,"Lam":24,"Lan":56,"Lab":56,"ML ":46,"Lib":28,"Lie":695,"Lim":30,"Lin":97,"Lit":29,"Liv":157,"Lut":24,"Luk":24,"Lud":28,"Lug":27,"Lor":22,"Lon":64,"Lok":22,"ūs 
":24,"ūpn":62,"ūra":1491,"ūt ":294,"ūsd":328,"ūnā":49,"ūrs":28,"ūru":316,"ūrv":45,"ūrm":32,"ūrn":45,"ūrg":22,"ūri":123,"ūks":153,"ūku":78,"ūka":36,"ūko":44,"ūma":25,"ūli":71,"ūmu":36,"ūna":154,"ūns":22,"ūnu":55,"ūni":45,"Mek":48,"Mei":30,"Men":40,"Mel":144,"Mer":74,"Met":75,"ūtā":37,"Med":58,"Mez":22,"ūzi":561,"Džo":145,"ūzu":22,"ūtī":135,"Džu":25,"Dže":88,"ūsi":58,"ūsm":68,"ūsu":37,"ūst":237,"ūta":126,"ūte":28,"Man":129,"Mal":86,"ūti":73,"Mam":22,"Mar":393,"ūto":32,"Mas":145,"ūtn":56,"Mag":45,"ūts":52,"Mad":99,"Mak":70,"Mai":65,"ūtu":111,"Mac":23,"īzā":35,"īzē":29,"ūvi":42,"Maz":108,"Mat":72,"ūve":114,"ūrā":242,"īvā":296,"ītī":76,"īzi":177,"īze":130,"ītē":23,"Mod":95,"Mol":50,"Mon":122,"Mos":23,"Mor":85,"Mot":31,"Moz":28,"NS ":43,"īvī":139,"Mež":62,"īvē":41,"Mih":26,"Mik":53,"Mie":24,"īrā":42,"Mic":50,"īvn":200,"īvm":36,"īvp":28,"īvo":1046,"īvu":146,"Mir":23,"īvs":154,"Mil":64,"Min":133,"īva":467,"īve":148,"īvi":298,"ūvē":84,"ītā":552,"NO ":49,"īz ":81,"īsā":29,"īlī":22,"īrs":118,"īru":139,"īri":268,"īsa":30,"īnā":148,"īmē":659,"Mur":22,"Mus":124,"īra":120,"īt ":399,"ītu":205,"īnī":56,"ītr":41,"īts":565,"ītn":23,"īto":115,"īti":392,"īst":852,"īss":36,"īmī":130,"īsl":28,"īsk":119,"īsi":181,"īkš":28,"īte":151,"īta":808,"īne":41,"īni":507,"īno":33,"īns":285,"īnu":169,"īme":446,"īmi":343,"īmj":45,"īmo":31,"īmu":34,"īms":40,"īna":462,"īkā":26,"īs ":468,"ījā":106,"īpa":426,"Sāk":57,"Rēz":54,"XX ":24,"кий":32,"Wor":39,"Wil":32,"Win":29,"War":28,"Vul":29,"Vor":36,"Vol":134,"Viņ":206,"Viļ":33,"Vis":230,"Vit":48,"Vla":36,"ču ":517,"Zaļ":27,"Zie":391,"Zin":62,"Zil":29,"Zel":41,"Zem":525,"之":22,"Tēr":50,"三":24,"на ":32,"Tā ":634,"Tāl":53,"Tās":150,"Sēr":28,"Sēl":24,"Rīg":820,"Svē":102,"Sys":26,"Sve":45,"Spē":56,"Sul":33,"Spā":87,"Str":115,"Stu":53,"Sti":60,"Sto":87,"Sta":349,"Ste":80,"Teh":22,"Tec":22,"Tem":25,"Teo":25,"Tei":33,"Tel":43,"Tam":51,"Tan":40,"Tas":583,"Tar":26,"Tau":65,"Tai":63,"Taj":81,"Tak":27,"Tal":71,"Sko":119,"Sku":25,"Ska":110,"Sim":41,"Sil":58,"Sig":27,"Sir":32,"Sin":63,"Sid":28,"Sie":22,"Sib":28,"Nāc":29,"Ser":100,"Sen":122,"Sel":26,"Sem":34,"Sek":22,"TV ":22,"Spa":42,"TS ":22,"Spi":27,"Spe":35,"Spo":27,"Soc":53,"Sol":25,"Som":114,"Son":25,"Slo":56,"Smi":29,"TP ":38,"Mēr":47,"Mēn":64,"Jūr":108,"Jūl":32,"SV ":351,"Run":31,"Rum":56,"Rub":29,"Sai":29,"Sah":30,"Sak":57,"Sam":35,"Sal":189,"Sac":23,"Sab":29,"Sae":27,"Sad":27,"Sco":41,"Sci":36,"Sch":39,"Sav":262,"Sat":49,"Sau":200,"Sar":143,"Sas":60,"San":93,"ови":22,"Mār":46,"Māk":28,"SI ":51,"Res":30,"Rie":159,"Rau":22,"Rec":23,"Red":33,"Rei":58,"Ren":27,"Rep":321,"Rob":99,"Rod":49,"SR ":106,"Ros":76,"Ron":23,"Rom":176,"SS ":24,"Līg":32,"Līd":101,"čūs":27,"Reģ":30,"SO ":60,"Pēc":116,"Pēt":62,"Vai":65,"Vel":97,"Ven":119,"Vei":37,"Vec":107,"ски":39,"ска":25,"ско":22,"Vas":82,"Van":50,"Val":406,"Var":89,"Vaš":22,"Vid":291,"Vie":179,"Vir":65,"Vil":141,"Vik":70,"Vin":38,"Ver":44,"Ves":70,"Ukr":55,"Ung":47,"Uni":125,"Urs":73,"Mūz":68,"Mūs":71,"Ērg":30,"Uz ":28,"Čīl":23,"ēcī":76,"ēdā":27,"Trī":27,"ēj ":73,"ēgu":33,"ēgt":73,"ēkl":37,"ēko":23,"ēki":215,"ēku":376,"ēkt":35,"ēks":180,"ēm ":819,"ēka":504,"ēji":561,"ēju":643,"ējs":400,"ējr":35,"ējp":24,"ējo":477,"ēl ":109,"ēdē":151,"ējd":44,"ēja":1223,"ēmu":365,"ēma":558,"ēmi":221,"ēli":303,"ēlo":171,"ēln":72,"ēls":126,"ēlu":113,"ēla":172,"ēle":379,"ēc ":930,"ēci":41,"ēdu":26,"ēdz":296,"Pāv":29,"Pār":113,"ēda":98,"VI 
":40,"ēde":49,"ēdi":142,"Ter":125,"Tet":23,"ēvs":32,"ērē":61,"ērā":124,"ērķ":215,"ēsē":144,"The":138,"ērī":91,"ēsā":29,"ēze":148,"Tib":26,"Tie":134,"ēsī":28,"Tim":41,"ētā":792,"Tir":25,"ēzi":135,"ērš":71,"ētī":178,"ēzu":41,"ēvē":143,"To ":233,"ēne":214,"ēni":174,"ēna":102,"ēns":137,"ēnu":91,"Top":34,"ējā":633,"Tor":80,"Tom":55,"Tos":31,"ēkā":56,"ēr ":112,"ējī":33,"ēpo":30,"ēlā":185,"ēs ":404,"ēt ":252,"ēlē":492,"Nīd":49,"ēra":721,"ērb":56,"ērd":31,"ēre":37,"ēmā":142,"ēri":669,"ērk":23,"Tro":39,"ērn":117,"ērp":23,"ēro":293,"ēlī":53,"ērt":389,"Tri":83,"ērs":217,"ērv":101,"ēru":97,"ēmē":83,"Tre":69,"ērz":31,"Tra":108,"ēnā":38,"ēkš":30,"ēsl":23,"ēsu":45,"ēst":566,"ēnē":31,"ēte":130,"ēta":1832,"ētn":71,"ēto":79,"ēti":598,"ētk":66,"ētu":414,"ētr":34,"ēts":396,"Tur":208,"Tuv":48,"Tuk":33,"Tul":26,"šdz":152,"šde":24,"ši ":834,"šel":77,"ša ":338,"šas":321,"šau":228,"šah":27,"šai":236,"šaj":205,"šam":126,"šan":3757,"ēģi":168,"Šve":65,"šte":79,"šta":234,"šst":29,"šum":55,"šus":57,"švi":22,"šve":23,"šva":72,"špa":34,"škā":87,"špi":125,"šos":132,"šot":31,"Ēģi":41,"špu":29,"šs ":266,"šre":66,"šru":52,"šsa":28,"šu ":1883,"šla":74,"šme":77,"вич":26,"šo ":346,"šni":32,"šno":38,"šie":334,"šif":27,"šin":270,"šim":31,"šis":80,"ēļa":42,"ēļ ":153,"ēķi":49,"švā":48,"šze":25,"ēļu":170,"bju":45,"bje":249,"biz":35,"bis":145,"bit":116,"bio":155,"bir":59,"bik":26,"bil":410,"bin":496,"bij":2367,"bo ":36,"blo":48,"ble":47,"bli":610,"bla":73,"bku":154,"bok":22,"bol":1514,"boj":283,"bni":61,"bež":545,"bs ":212,"biļ":23,"bpi":124,"biņ":28,"bon":217,"bor":108,"bot":84,"bos":35,"bov":23,"be ":419,"bam":32,"ban":81,"bak":69,"bal":1602,"bai":173,"baj":47,"bag":46,"bac":26,"bad":23,"baz":174,"bau":36,"bat":100,"bas":2875,"bar":192,"azī":447,"bi ":165,"bei":323,"bed":22,"ber":482,"ben":195,"bel":118,"bek":85,"bez":361,"bes":245,"bet":713,"bib":56,"bie":860,"brū":72,"− ":42,"buļ":79,"−C":44,"ca ":396,"car":55,"cas":201,"cat":29,"cau":337,"can":76,"cab":38,"cam":74,"cal":52,"ce ":431,"cba":35,"blē":52,"bri":385,"bro":61,"bra":298,"bre":171,"bu ":1529,"blī":48,"bru":342,"bsk":33,"bso":54,"bse":25,"bst":77,"bur":350,"bul":250,"bun":25,"bum":299,"bud":74,"but":64,"bus":88,"boļ":39,"bva":41,"brā":110,"brī":410,"aka":880,"am ":2221,"ake":96,"akc":136,"aki":53,"aji":783,"ajo":724,"ajs":38,"adī":860,"aju":73,"adē":277,"aiz":832,"al ":300,"aja":469,"aje":41,"aij":61,"aik":1200,"ail":137,"aim":532,"ain":809,"aip":45,"acī":139,"air":752,"ais":4315,"ait":1125,"aiv":51,"aig":508,"adā":819,"aie":25,"aid":426,"aic":101,"aib":25,"ahi":35,"ahs":67,"aht":36,"abī":81,"ahr":22,"abē":108,"aha":93,"agl":88,"abā":487,"agi":39,"agr":218,"ags":92,"agu":84,"agn":169,"ago":152,"ajā":2224,"anv":34,"anu":718,"anz":65,"ano":394,"ann":140,"ant":1775,"ans":674,"anr":123,"ane":123,"ang":611,"anh":30,"ani":1185,"anj":22,"ank":275,"anl":23,"ap ":332,"ana":3035,"anc":401,"and":1589,"amu":130,"amm":408,"amo":175,"amp":185,"ams":603,"ami":532,"ahā":45,"adž":91,"ame":775,"amb":217,"ama":1351,"ao ":25,"alv":1226,"alu":519,"alt":791,"als":1977,"alp":208,"alo":1383,"aln":628,"alm":121,"all":272,"alk":231,"alg":97,"ali":1208,"alc":61,"ald":1013,"ale":584,"alf":96,"agā":163,"ala":1330,"alb":172,"Šaj":103,"an ":784,"aks":1610,"akr":121,"Šar":24,"aku":213,"akt":789,"ako":151,"akn":91,"akm":124,"akl":47,"aba":1096,"abe":95,"abi":630,"abl":44,"abo":176,"abp":131,"abr":109,"abs":116,"abv":34,"abu":136,"ae ":655,"aca":38,"aau":77,"ad ":480,"ac ":37,"afr":25,"aft":53,"afi":105,"ai ":5352,"aga":835,"age":107,"ael":27,"aei":29,"ah 
":32,"afa":26,"ado":609,"adr":151,"adl":29,"adn":49,"adm":304,"adi":802,"ade":142,"adz":161,"ads":379,"adu":517,"ack":35,"aci":939,"ach":47,"ace":507,"ada":1586,"act":88,"acu":34,"azn":207,"azm":31,"azo":137,"azi":230,"arš":263,"azu":55,"atī":1487,"azs":68,"atē":140,"aze":46,"aza":217,"azd":25,"atņ":23,"avē":54,"avā":267,"arž":51,"arī":2864,"arē":195,"az ":62,"asā":83,"asī":143,"asē":82,"Šob":33,"atā":330,"aye":23,"auž":79,"ba ":1526,"azā":276,"atū":468,"auņ":57,"avī":79,"atš":186,"auš":105,"at ":261,"alē":137,"arh":312,"aiž":27,"arg":217,"amā":271,"are":686,"ard":512,"arc":143,"akļ":72,"arb":1417,"ara":2574,"arp":1342,"aro":671,"arn":121,"arm":206,"arl":146,"ark":821,"arj":66,"ari":685,"aru":580,"Šie":27,"arv":25,"amē":73,"alī":910,"arr":34,"ars":430,"art":883,"au ":159,"asa":1301,"ary":36,"anā":994,"asf":22,"akš":415,"asi":550,"ash":28,"asc":35,"ase":553,"aso":90,"asn":297,"amī":170,"asp":375,"ask":709,"asm":50,"asl":59,"ar ":6199,"apb":88,"apa":727,"Šep":43,"akā":218,"ape":72,"apd":220,"aiļ":41,"apj":56,"api":222,"aph":30,"apg":734,"aiņ":241,"apm":273,"Šei":27,"apl":157,"apk":491,"apo":44,"app":36,"apr":520,"aps":483,"apt":349,"apu":118,"apv":358,"apz":415,"as ":38125,"alā":492,"aiš":99,"ava":1146,"ax ":25,"auz":73,"auv":33,"aut":1629,"avs":157,"apī":65,"avr":23,"avo":299,"avp":76,"anš":45,"avi":843,"ave":232,"ay ":49,"Šo ":72,"avv":82,"avu":357,"arā":580,"av ":331,"atb":314,"ata":845,"asu":217,"ast":3489,"ass":194,"anč":183,"anē":182,"asv":32,"atm":195,"atn":153,"atk":493,"atl":205,"anī":82,"atr":1790,"ato":1405,"atp":124,"ate":661,"atf":33,"Šis":89,"atc":32,"atd":107,"ati":895,"atj":54,"atg":171,"ath":38,"auc":511,"att":979,"ats":614,"alū":32,"atv":2153,"atu":973,"atz":132,"aul":1187,"aum":53,"aun":922,"aup":26,"aur":568,"aus":1232,"aud":1114,"apā":234,"aug":1739,"auj":444,"auk":1608,"Vēs":43,"Vēr":24,"ος":59,"ος ":59,"ς ":97,"ν ":22,"Zvi":157,"Zva":55,"α ":62,"Vār":66,"Vāc":268,"ēša":752,"Н ":147,"ий ":35,"ич ":26,"jeb":1453,"jer":106,"jek":399,"jel":23,"jen":64,"jet":57,"jev":88,"eān":337,"eāl":99,"eāt":94,"ji ":637,"aža":89,"aži":139,"ažk":37,"ažo":204,"ažr":77,"ažu":75,"ļģi":26,"jad":67,"izē":509,"jas":8640,"jau":557,"jap":53,"jar":34,"jak":65,"jan":85,"jam":931,"dēš":109,"jai":593,"izņ":119,"izš":65,"jda":46,"jni":88,"jol":22,"jon":751,"jom":89,"jot":574,"jos":350,"jor":71,"js ":1627,"dīb":475,"dīc":65,"jiņ":29,"jpu":34,"ск":94," zī":183,"jis":272,"jie":1464,"то":24,"ст":46,"jko":24,"jo ":387,"ažā":482,"itm":69,"itl":241,"itr":232,"inī":182,"itp":32,"ito":983,"itu":694,"itt":30,"its":171,"ity":36,"imš":47,"isk":6070,"ism":951,"isl":163,"iso":147,"isn":115,"imī":127,"isp":195,"iss":145,"inč":41,"isu":253,"ist":3658,"isv":78,"inē":330,"ita":561,"ite":695,"ith":30,"iti":844,"ilš":47,"inū":22,"ivs":26,"ivr":48,"ipī":32,"ivp":30,"ivo":171,"ivv":36,"ivu":117,"irā":703,"inž":48,"ius":102,"iur":38,"ium":46,"ipē":33,"iva":204,"ivd":29,"dē ":186,"ix ":38,"ivi":487,"inš":34,"ivj":45,"ivk":27,"ive":291,"ipr":208,"ipo":63,"ipu":70,"ips":96,"ipt":115,"ipi":127,"igž":89,"ipl":132,"cīb":793,"cīg":288,"ilā":148,"is ":8439,"ion":1944,"iop":23,"ašī":283,"ior":31,"igū":45,"ios":84,"iot":138,"ijā":2772,"iog":37,"iok":51,"iol":177,"iom":40,"ipa":229,"ikā":1450,"ipe":167,"iov":23,"ir ":14672,"imē":165,"iru":216,"irs":599,"irt":236,"ilī":94,"irr":24,"iro":905,"irm":929,"irn":124,"irk":171,"iri":558,"isi":459,"ikš":84,"ish":36,"inā":3097,"ise":117,"isc":158,"isb":151,"isa":605,"irz":314,"cīt":112,"cīn":190,"cīj":23,"cīk":97,"imā":119,"ire":99,"irg":402,"irb":90,"ira":259,"ird":185,"irc":39,"it 
":228,"ilē":113,"cīz":63,"itū":160,"ivī":107,"izā":483,"dēļ":202,"ja ":7647,"itā":1332,"irī":150,"dēs":22,"dēt":124,"dēv":144,"dēj":602,"dēn":29,"dēm":188,"cīņ":106,"dēl":80,"isā":208,"irē":31,"iz ":254,"ivā":236,"itļ":164,"izz":43,"izu":55,"izv":798,"itī":222,"izr":274,"izs":752,"izt":200,"izp":558,"izo":166,"izn":131,"izm":1077,"izl":692,"izk":139,"izj":45,"irš":41,"izi":513,"izg":245,"ize":159,"izd":376,"izc":301,"izb":41,"iza":170,"itē":104,"isī":84,"eģi":394,"kij":77,"kil":129,"kia":33,"kie":620,"eģe":52,"kin":233,"ļās":38,"kip":24,"kir":36,"kis":66,"ļām":48,"keā":336,"km ":504,"kga":63,"ki ":1566,"kgr":28,"kaļ":69,"kaķ":31,"kho":57,"kaņ":374,"kej":204,"kel":71,"ken":50,"kes":27,"ker":86,"ket":457,"ļā ":507,"fāt":30,"ke ":111,"kci":508,"kdi":30,"kra":920,"kre":163,"klē":111,"kt ":222,"eīn":30,"klā":396,"ksa":250,"kse":83,"kmē":88,"ku ":2253,"kro":296,"klī":65,"kru":265,"kri":985,"koz":38,"kov":64,"kot":513,"km²":49,"kos":513,"kor":259,"kop":1110,"koo":61,"kon":1216,"kom":1199,"kol":578,"kok":286,"koj":143,"koh":29,"kog":34,"koe":27,"kod":306,"ks ":1859,"kme":260,"kne":183,"klu":183,"kls":75,"ko ":1627,"kle":136,"kla":494,"klo":153,"kli":573,"ļēj":87,"dīš":155,"još":397,"jus":320,"jum":2601,"jur":89,"jvi":27,"jsi":45,"būv":217,"būs":22,"būt":380,"eču":47,"ju ":3050,"dīg":518,"dīj":452,"dīn":38,"dīt":668,"dīz":28,"jra":36,"eče":56,"kaz":48,"kav":162,"kat":1010,"kau":385,"kar":1806,"kas":6913,"kap":301,"kan":621,"kal":1136,"kam":395,"kaj":1054,"kak":37,"kai":2823,"kad":430,"kab":198,"ka ":3723,"juš":280," Ga":455," Bā":54," Ge":158," I ":103,"guļ":52," Fo":211," Fu":230," Fr":535,"ļš ":124," Fi":190," Fl":70," Ha":441,"aķu":37," He":345," Bē":72,"aķi":22,"guš":31," J ":40," Go":172,"aķe":127," Gr":516," Gu":162," Gv":31," Gi":68," Gl":86," Ig":193," Dā":72," Ie":156," Id":25," Ib":23," K ":22," Cē":86," Hu":54," Bī":48," Ho":304,"ha ":116," Hi":232," Je":179," Ja":604," Iz":255," Iv":35," Dē":23," Ir":159," Is":56," It":130," Im":77," In":594," Ik":26," Il":59,"ham":85," M ":45,"han":159,"hai":42," Ka":1303,"hal":100," Ke":174,"hau":35," Ki":234,"har":152,"has":51,"hat":24,"aļ ":30," Jo":237," Ju":233,"hae":23," N ":55," La":2100," Le":272," Gā":24," Li":1175," Kl":266," Ko":784," Kr":1044," Kv":49," Ku":433," Ma":1344," O ":55,"aļu":335," Mi":483,"aļv":40," Dž":308," Me":654," Hā":28,"he ":281," Lo":254,"aļa":796," Ly":24," Lu":187,"aļi":184," Ne":334," P ":63,"а ":100," Na":328," Ni":163,"cā ":43," Mo":496," My":39," Mu":257," Hē":35,"hek":42,"hel":84,"hei":58," A ":102,"С ":41,"het":54,"her":196,"heo":49,"hen":64,"hem":51,"cāk":136,"cām":35,"hi ":25," B ":49,"cās":52," C ":203," Ap":260," Am":327," An":736," Ak":149," Al":533," Ai":198," Ag":50," Ah":36," Af":50," Ac":82," Ad":148,"aļā":556," Ab":144," Ba":1003,"aļē":89," D ":68," Az":34," Av":74," Au":758," At":403," As":243," Ar":519," Be":444,"hie":34,"hid":182,"hic":23,"hib":51," Bi":327,"hip":128,"hin":98," Bl":114,"him":44,"hil":44," Bo":338,"hij":70," Br":524," Bu":238,"his":79,"hit":143,"aļķ":22," E ":56," Ca":270," Ce":344," Ci":203," Ch":131," Cl":40," Cr":39," Co":292," Cu":37," F ":24," Da":629," Di":673," De":398," Dr":102," Do":202," Dz":227," Du":106,"hn ":22," Eb":22," Ed":95," El":193," Ek":92," Ei":676," Eg":26," Et":48," Es":44," Er":98,"hlo":75," Eq":185," En":121," Em":90," Ez":43," Ex":28," Eu":85," Ev":42," Fe":226,"aņa":180," Fa":110," H ":140,"aņd":30,"gma":49,"go ":413,"gme":45," Rī":858," Pū":23,"glu":49,"glo":94," Sā":102,"gls":48," Z ":22,"gle":241," Rē":60,"gli":344,"gla":305," Wo":56," Wi":95," We":44," 
Rā":30," Wa":65,"й ":60," Rū":43,"gog":33," Zu":31,"god":68," Zv":222," Zo":72,"aģē":26," Tī":49," Ze":604,"gno":38," Zi":551,"gni":35,"bāš":52," Za":124," Sī":40," Tē":78,"gna":34," Yo":26," Tā":889," Sē":77," Vī":58,"gs ":1150,"glā":76,"о ":42," Vē":121,"н ":27,"gol":71,"gon":119,"gos":60,"gor":256," Vā":354,"got":83,"gov":43,"gu ":870,"gnā":78," a ":72,"р ":25,"glī":122,"gro":67,"gru":596,"gra":755,"gt ":56,"gri":552,"gre":213,"aīs":153,"gtu":35," R ":37," Jē":211," Ov":27," Os":199,"gto":46," Ot":132," Or":178,"gts":50," Op":65," Po":516," Pl":245,"gum":401," Pi":541," Ph":55,"gul":224," Kā":157,"gua":28," Pe":329,"gub":86," Pa":1447,"gud":66,"gst":733,"gnē":109,"gsn":33," Nu":37," No":486," Ol":289," Ok":85," On":42," Oh":28," Og":53," Od":50," Jā":93," Of":27," Ob":41,"gta":111," Ra":371,"д ":27," Lē":28," Qu":23,"goš":45," Lī":215," Ro":565," Re":681,"grā":671," Ri":308," Rh":25," Mā":133," S ":101,"guv":148,"gur":196," Pr":570," Ps":28,"gus":188," Pu":147,"gun":194,"gvi":84," Lā":40,"gva":31,"bēr":112," Sy":39,"grī":79,"bēt":95," Sv":220," Su":203," St":793," Ta":1092," V ":48," Tj":24," Th":177," Ti":329," Te":399," Tr":346," Lū":27,"ļūt":29,"ļūs":33," Nī":78," To":521," Mē":139," Jū":171," Ru":196,"ļūd":92,"grē":114," Sa":1313," U ":30,"е ":61," Sh":52," Si":395," Sc":134," Se":442," Nā":75," So":290," Sp":307," Mī":29,"bēj":25," Sk":316," Sl":112," Sm":70,"bēm":82," Sn":22," Pē":221,"grū":45," Uz":96," Va":790," X ":41,"и ":54," Ve":526," Vi":1347," Vl":38," Pī":38," Vo":205," Vu":40," Tu":361," W ":26," Pā":146," Ug":37," Uk":57," Ul":29,"gzd":28," Un":191," Up":44," Ur":133,"gzn":376," Mū":173," ja":682,"cēš":42,"iak":26,"ь ":24,"iam":59,"ial":119," iz":4946,"ian":220,"ias":27," je":1511,"iat":108," im":336," in":1344," ik":136," il":190,"ic ":183," dē":302," iv":22," is":92," it":111," cī":105,"iag":25," ir":14600,"ibl":76," ka":7945,"ibi":285,"ibo":30," m ":158,"ibr":57," ki":241," fā":22,"ibu":58," ke":72,"iaz":26," jo":407," dī":55,"id ":25,"iba":66," bū":407,"ibe":125," ju":119," bē":90," ha":132," he":218," aģ":61," gl":287," gr":1494," go":110," gu":155,"ia ":270," gs":71," cē":148," id":175," dā":101," ie":4455," ig":94," hi":297," hl":48," ho":304," bī":86," hr":108," ht":33," hu":35,"iet":4005,"iev":1792," ni":63,"iez":227,"iel":3446,"iem":8120," ne":2383,"ien":7676,"iep":380," na":986,"ier":1197," p ":50,"ies":2658,"iee":100,"ied":2745,"ieg":992,"iek":4508," mu":445,"iej":39,"iea":35," mo":718,"iec":1164," mm":35,"ieb":72," ok":512," ol":76," on":40,"dāv":49," og":135,"ifo":180,"dām":160," of":307," jā":140,"dān":30,"ifs":22,"dās":334,"dāt":198,"dār":218,"ifr":46," ob":243,"dāf":28,"ife":75,"dāl":53,"dāj":251,"ifi":245," nu":108," no":9749," gā":212," le":458,"icr":40,"ics":32,"ict":31," li":2852,"icu":63," n ":74,"ico":88,"ick":37," eļ":37,"icl":22," la":3298," kv":261," ku":4521,"ici":543," cū":37,"ich":68,"ice":118,"ie ":1698," km":554,"ica":238," kl":531," kr":1914," ko":4523," me":1393,"idz":312," dž":28,"idu":626," mi":1711,"ids":336,"я ":56,"idr":632,"ido":1694,"idp":27,"idm":53," ma":1802,"idn":29," lu":115,"idi":317," lv":27,"ide":638,"idd":33,"ida":1167," lo":437,"dā ":1681," af":23," ag":155,"aša":197,"idā":226," ab":209," ac":105," ad":392," am":414," an":722,"aši":302," ap":4517,"ašn":47," ai":846," ak":503,"ašl":70," al":593," av":171,"ašs":53," au":2262,"ašr":63,"icī":260," ar":6236,"ašv":86," at":4655,"ašu":167," as":496," d ":35," ba":1482,"idē":338,"il ":50,"ija":12639," bi":2895,"ije":54," be":1390,"iji":165," bo":133," 
bl":146,"idī":348,"ijs":425," bu":382,"iju":1715," br":766," ca":410," e ":62,"im ":312,"ika":3320,"ige":58,"iga":284,"ii ":23,"igl":56,"igm":40,"igh":40,"igi":68,"igu":73,"igs":27,"igr":58,"igo":33,"ign":102,"ij ":163,"igz":407," b ":53,"iha":58,"ihi":54,"ihs":70," Zī":28,"iho":104,"ibī":64,"ik ":182,"icē":211," c ":63," er":23,"imo":88,"imn":195," et":118," es":227,"ims":83," en":251," em":85," ep":55,"imp":593,"idž":50,"imf":36," ei":45," el":849,"ime":522," ek":494," ef":77,"imi":811,"ieš":1802," fe":288,"ip ":24,"inc":281,"ind":539,"ina":1252," fa":282," ez":655,"imt":1282,"imu":237," ev":61," fu":829,"inn":34,"ino":729," fr":351,"ļļu":39,"inr":24,"int":1230," fo":740,"ins":796,"inf":318,"ašā":181,"ine":633," fl":69,"inh":61,"ing":589,"iež":570,"inj":23," fi":803,"dāš":37,"ini":1119,"ink":89," ge":43,"cī ":27," bā":233,"ioa":45," ga":5014,"iod":270,"ļļa":40,"inu":278,"inv":33," i ":44,"iko":225,"ikn":25," cm":50,"ikm":90,"ikl":215," co":68,"iki":144,"ikg":62," ce":962," ch":25,"ike":59,"ikd":24," ci":2295,"ila":297,"ilb":52," da":4272,"in ":197,"ieķ":378,"ikv":30," cu":46,"ikt":677,"iku":1045,"ikr":195,"iks":380," do":494,"ilp":256,"ilo":443,"ill":168," dr":234,"ilk":121,"iln":265,"ilm":237,"ieņ":198,"ilh":58,"ilg":175," de":1205,"ilj":175,"ili":579,"ild":409,"ilc":81,"ieļ":23,"igā":206," di":3087,"ile":125,"ima":242,"imb":308,"ч ":32," g ":36,"io ":204," eb":89," du":128,"ilz":47,"idū":60," dz":3456,"ils":2044,"ilt":346,"ilu":127,"ilv":752,"ль":27,"hs ":112,"bīg":65," sū":56,"bīd":30," vē":1059,"ми":22,"bīb":430,"ло":25," vā":1227," tī":340," zo":142,"ла":30," zu":23,"ле":23," rū":113,"ли":29," zv":544,"hok":186,"hol":226,"hom":35,"hon":30," za":193,"ко":45,"hos":37," ze":740,"hot":32," zi":2010,"hop":25,"hor":75,"ка":54," tē":247,"ки":52," sī":84,"hni":146," sē":228," tā":1796,"hno":151,"aņu":241," pū":63," rī":133,"aņe":35,"hme":29,"ин":34,"aņi":61," ww":36,"ий":38," sā":685,"ич":34,"aņo":46,"ри":31,"ро":53,"ра":42,"ре":36,"htt":39," zā":53,"hst":50,"ос":22,"ор":37,"ол":32,"aņģ":36,"ов":87,"hu ":43,"ое":22,"aņē":59,"нс":22,"но":31,"hro":158,"aņā":78,"ни":29,"hri":23," tū":103," vī":192,"ht ":30,"на":44,"bīr":30,"bīt":96,"bīs":120,"bīn":33,"cēt":118," ru":206," jū":849," mē":678," u ":116," sa":6813," nā":219," sf":46," se":1483," sc":42," si":1753," sh":33," sn":112," sm":289," sl":596," sk":1357," mī":111," sp":2239," so":351,"ве":29," qu":23,"ви":41," lē":170," t ":61,"во":24," ra":2342," kļ":152," re":2229," mā":826," ri":726,"cēn":32," kņ":49,"cēm":41,"cēl":179," ro":913,"cēj":228," lī":2107," pu":1266," pr":3156," ps":120," s ":86," lā":230,"ļņu":56,"ва":29," os":114," gū":38," ot":330,"ļņi":27,"hum":50," op":221,"ав":30," or":1147,"ан":46,"ļņa":76," jē":154,"ал":24," pe":651,"cē ":120," kā":2571," pa":8411,"ар":38," pc":24," pl":1389,"ая":29," po":1148," pi":5908," rā":73," x ":60," va":7044," ve":2127," pē":1285," uz":2903," vo":159," pī":83," vu":52," vi":7158,"ес":31,"ер":46,"ен":52," tv":38," tu":790," mū":750," ut":55," ur":52," up":232," un":10091," ug":66," pā":1765," ta":1745," st":3025," kū":39," sv":412," su":879," nī":29," tr":1203," lū":49," to":1082," th":155," ti":3365," te":2144,"aāt":29,"fi ":28,"fes":176,"fer":133,"feo":25,"fed":168,"ņi ":361,"feb":35,"fen":84,"fek":185,"fel":37," Ča":35,"ņin":63," Či":48,"ņie":244," Če":124,"fga":33,"faz":23,"fas":28,"ezī":69,"far":40,"fan":93,"fak":97,"fal":45,"fai":36,"ņbr":28,"fab":102,"ņem":387,"ņda":33,"fe ":34,"ņos":47,"ņot":124,"ņoj":49," Čī":26,"evē":231,"evī":174," ēz":45," ēt":31,"ņra":135,"fa ":173," Ēr":64,"ņs 
":24," ēd":44," ēr":63," ēk":65,"esē":107,"esī":292,"erņ":62,"etā":683," Ču":25,"ez ":111,"erē":156,"erī":393," če":400,"ņji":23,"erģ":206," ča":23,"esā":195,"ezv":29,"ezu":188,"evā":40,"eza":38,"ezd":35,"etē":183,"ezm":33,"ezn":141,"ezo":141,"ezp":31,"etī":89,"erū":27,"ezt":29,"eze":817,"ezg":53,"erš":24,"ezi":201,"ezk":125,"etb":351,"eta":734,"ete":624,"eti":845,"elš":89,"eth":26,"etn":170,"etl":29,"esp":338,"emī":84,"esn":43,"eso":286,"est":937,"ņoš":22,"esu":212,"enč":28,"esr":33,"ess":340,"enē":213,"esv":35,"ev ":24,"epā":130,"emš":50,"eto":1378,"enī":969,"etr":698,"ets":222,"ett":58,"etu":1584,"etv":168,"ew ":38,"eve":83,"eva":376,"evo":146,"evn":29,"evi":1543,"enš":59,"eut":31,"eus":24,"ex ":26,"erā":709,"evu":397,"evr":24,"evs":91,"ey ":55,"epe":71,"ekā":290,"epi":258,"epj":76,"eph":37,"er ":1104,"epa":258,"eot":23,"egū":271,"eos":34,"eor":322,"ņu ":2105,"eom":53,"eol":176,"eop":39,"eon":61,"elā":1125,"es ":8988,"ept":197,"eps":32,"epu":444,"epl":47,"epo":71,"epr":190,"erk":66,"erl":157,"eri":2539,"erg":167,"ere":536,"emā":318,"erf":37,"ekļ":537,"erc":178,"erd":99,"era":1626,"erb":238,"et ":1093,"elē":112,"esk":278,"esl":110,"esm":388,"enā":1113,"ņus":89,"esi":374,"ekš":656,"esc":24,"ese":289," Ēģ":41,"esa":337,"erz":36,"ery":29,"erv":331,"eru":391,"emē":508,"ņve":32,"err":36,"elī":103,"ert":417,"ers":1648,"ern":467,"erm":855,"erp":67,"ero":595,"eki":366,"ekl":427,"ekm":150,"ekn":62,"eko":416,"ekr":305,"eks":1397,"ekt":1750,"eku":811,"ekv":109,"en ":402,"elb":111,"ela":1449,"eld":81,"egā":92,"elf":85,"ele":1367,"eli":980,"elj":22,"elg":122,"elm":98,"eln":210,"elk":50," Ģe":106,"ell":209,"elo":182,"elp":198,"elu":464,"elv":59,"els":567,"elt":409,"elz":306,"eo ":63,"emb":176,"ema":412,"edž":46,"emg":143,"ehā":103,"eme":2607,"emd":28," ģe":323,"emn":25,"emo":348," ģi":768,"emi":389,"emj":71,"emt":133,"emu":60,"emp":493,"ems":89,"ep ":41,"ene":1004,"eng":203,"enb":97,"ena":2039,"end":388,"enc":397,"eno":1358,"enp":40,"enm":45,"enn":52,"enk":331,"enl":88,"eni":1040,"enu":734,"env":1058,"ens":2464,"ent":2847,"ehī":26,"enr":79,"enz":110,"egš":47,"eog":110,"eof":26,"ejā":89,"eod":60,"egl":203,"ego":210,"ege":31,"egi":29,"eaģ":26,"egr":83,"egs":36,"egt":159,"egu":482,"egv":44,"ehn":276,"eho":29,"ecā":200,"ehi":58,"ek ":1542,"eib":23,"eic":317,"ecē":53,"ecī":715,"eis":229,"eir":180,"eim":142,"eil":42,"ein":253,"eih":26,"eik":735,"eie":54,"eid":2544,"eig":207,"edā":218,"eja":808,"el ":113,"edē":86,"eiz":614,"eit":215,"eiv":56,"ejs":177,"ebū":23,"edī":123,"ejo":81,"ejn":76,"eji":80,"eke":31,"ekc":120,"eka":402,"em ":6412,"eju":260,"ejv":23,"gaž":22,"git":41,"gis":49," īs":256,"gin":40,"aģe":63,"gie":240," īp":415,"aģi":37,"ght":32," Īr":60," Īs":47,"gaļ":93,"bās":254,"bāt":57,"bāz":178,"gi ":519,"bāj":44,"bāl":44,"bāk":295,"bān":43,"bām":208,"gen":244,"ger":118,"ges":34,"gel":54,"gej":38,"bā ":646,"ge ":89,"gab":646,"gad":2505,"gai":779,"gas":1682,"gar":786,"gau":305,"gat":362,"gav":387,"gaj":108,"gam":98,"gal":1339,"gan":1451,"ga ":1326,"frī":26," ķē":68,"aēl":30," ļo":206," Ķī":169,"frē":39," ļa":118,"žņu":81," ķī":292,"Ņuj":43,"fut":631," Ļe":28,"fta":36,"fun":189,"ft ":61,"fra":303,"fre":65,"fri":213," Ķe":55,"fu ":37," ķi":28,"fro":69,"aču":132," ķe":273,"for":1089,"fos":41,"fot":77,"fon":280,"fol":75,"ņak":24,"ņam":33,"ņas":543,"ņaz":78," ņe":38," Ņu":39,"fs ":175,"ņai":51,"ņa ":1018,"fle":24,"fla":25,"fli":29,"flo":42,"fic":294,"fig":49,"fij":235,"fil":435,"fik":181,"fin":189,"fir":37,"fis":214,"fiz":287,"da ":3140,"de 
":574,"dac":25,"dab":346,"dak":25,"dal":823,"dai":150,"dag":58,"dae":523,"dat":711,"das":2562,"dar":1471,"dap":69,"dan":60,"dam":271,"dau":771,"dda":35,"cul":28,"cum":72,"cty":34,"cto":75,"cti":74,"cte":48,"cus":46,"cyo":60,"ceļ":571,"cle":31,"co ":36,"ciā":742,"cog":76,"con":50,"col":50,"com":55,"cor":52,"cop":28,"cot":97,"ciņ":28,"cs ":85,"ct ":32,"cro":67,"cu ":436,"cea":28,"ch ":124,"cer":171,"ces":699,"cet":115,"cen":1090,"cep":69,"cek":125,"cem":35,"cel":374,"ced":32,"ci ":164,"cha":72,"cia":105,"ck ":59,"cie":767,"che":97,"chi":53,"cho":24,"chn":24,"civ":69,"cij":3939,"cik":179,"cil":946,"cim":32,"cif":53,"cir":29,"cis":561,"cit":901,"ciu":34,"cin":303,"cio":768,"cip":250,"cm ":47,"cke":32,"cka":44,"ed ":130,"eba":49,"ebe":168,"ebi":87,"ebk":175,"ebo":22,"ebr":247,"ebs":44,"eak":105,"ean":79,"eal":83,"ear":51,"eas":36,"eap":54,"dzī":2001,"eat":203,"eau":46,"eb ":1292,"dzā":72,"dzē":373,"ea ":66,"efi":131,"efo":131,"efa":30,"efe":198,"ei ":338,"ega":166,"efs":31,"eej":99,"een":27,"edi":335,"ede":1042,"eda":646,"edz":1195,"eds":41,"edu":254,"edo":120,"edr":685,"eck":45,"ech":44,"eci":633,"ece":209,"eca":89,"ee ":59,"ecu":116,"ect":70,"ecp":24,"eco":62,"drī":349,"dz ":1238,"doš":285,"drā":113,"dy ":22,"dvi":33,"dve":34,"doņ":93,"dur":107,"dus":742,"dva":54,"duš":31,"dzv":49,"dzs":170,"dzu":124,"dzo":52,"dzn":78,"duā":33,"dzi":2913,"dze":1097,"dza":294,"dor":118,"don":402,"dom":421,"dol":232,"dok":188,"dow":29,"dov":34,"dot":762,"dos":455,"diņ":89,"dpo":22,"ds ":1372,"dmi":319,"dne":44,"dni":257,"diā":75,"dob":64,"dod":87,"doe":23,"doj":533,"dnī":68,"dun":24,"dum":173,"dul":78,"duk":158,"dug":33,"dub":73,"dua":23,"duc":94,"dri":640,"diž":52,"dra":350,"dre":139,"du ":1832,"dro":640,"drs":87,"dru":292,"dsi":206,"dsk":23,"dsm":48,"daļ":1548,"dic":163,"dia":195,"dib":246,"der":1382,"des":875,"det":70,"dev":375,"deb":147,"dea":22,"ded":49,"dec":38,"def":128,"deh":29,"deg":170,"dej":161,"dei":89,"del":244,"dek":100,"den":1101,"dem":139,"dep":138,"deo":95,"di ":661,"deļ":51,"dko":25,"dma":88,"do ":341,"deņ":285," Āz":143,"div":810,"diu":34," Ār":50,"dim":118,"din":621,"dio":398,"dip":35,"dir":59,"dis":577,"dit":40,"die":2511," Āf":113,"dif":53," Āg":35,"dig":41," Ād":23," Āb":22,"dij":481,"dik":129,"dil":24," ār":250," āt":154," āp":26,"daž":632," ād":29,"ižu":32,"rgu":304,"raļ":69,"rhe":62,"raķ":99,"rha":55,"rhi":273,"rbī":529,"māz":34,"raē":35,"māt":506,"iža":92,"rga":943,"jš ":26,"ri ":1063,"ižk":30,"rgi":83,"iže":26,"rge":111,"rbā":31,"rgs":168,"rgo":78,"ret":778,"res":696,"rev":103,"reu":25,"rez":384,"rfa":28,"māc":507,"māj":191,"māk":627,"māl":154,"mām":145,"mān":158,"rfo":45,"mār":102,"mās":220,"rdu":246,"rds":349,"rg ":40,"rdz":219,"reb":22,"rea":246,"ree":37,"ref":132,"rec":144,"red":582,"rei":822,"rej":271,"reg":233,"rem":185,"ren":387,"rek":208,"rel":259,"rer":29,"reo":23,"rep":144,"mā ":1508,"rda":365,"kļu":259,"rct":35,"rdo":96,"rdn":63,"rdi":228,"rde":305,"re ":399,"ņām":78,"ņās":22,"rbu":231,"rbs":88,"rco":80,"kļo":52,"kļi":136,"rci":226,"rch":63,"rce":59,"rca":34,"kļa":421,"rax":25,"raz":76,"rd ":116,"rap":69,"rar":28,"ras":4121,"rat":1049,"rau":810,"rav":242,"rbi":268,"rbo":531,"rba":232,"rbe":87,"raj":717,"lēš":123,"ņā ":197,"rai":643,"rah":53,"rag":283,"ran":1071,"ram":1238,"ral":520,"rak":1567,"rab":128,"raf":152,"rad":1141,"rac":100,"rpt":521,"rpu":97,"rpr":64,"rpp":23,"rpo":72,"rs 
":2853,"rpe":30,"rpa":43,"riķ":45,"riņ":229,"rpl":24,"rpi":90,"rkā":93,"ror":63,"ros":500,"rot":765,"rom":435,"ron":900,"roo":23,"rop":1052,"roz":140,"rou":41,"rov":266,"rob":624,"roa":30,"rod":1744,"roc":512,"lī ":122,"roj":521,"roi":35,"riģ":56,"rol":216,"rok":423,"rof":236,"roe":58,"rog":545,"rno":46,"rns":56,"rnu":187,"rp ":573,"rgļ":35,"rna":307,"rež":135,"riā":400,"rne":333,"rni":347,"māš":32,"rmk":26,"māņ":51,"rmo":304,"rms":203,"kņu":24,"rmu":139,"ro ":308,"kņa":79,"rma":773,"rme":418,"rdž":57,"rmi":499,"reš":174,"rls":24,"rlo":32,"rli":126,"rgā":217,"ižā":28,"rld":35,"rle":24,"rla":201,"rn ":84,"rku":265,"rkt":251,"rks":248,"kļū":96,"rkn":35,"rko":61,"rki":73,"reģ":361,"rkl":58,"rke":63,"rka":377,"rbū":24,"reč":45,"rdī":30,"rja":44,"reā":83,"raž":305,"rje":48,"riz":128,"rdē":22,"rip":74,"rio":330,"rit":1293,"ris":2568,"riv":81,"riu":29,"rih":80,"rig":98,"rij":2366,"raš":66,"ril":99,"rik":885,"rin":683,"rim":290,"ria":161,"rib":107,"ric":188,"rid":231,"rie":3740,"rif":114,"rdā":161,"rk ":63,"roš":448,"mēd":92,"rož":23,"rsā":44,"mēj":294,"mēm":78,"mēl":71,"mēn":131,"mēs":84,"mēr":1231,"mēt":199,"rsē":59,"rtā":193,"rpā":40,"rud":44,"ruc":33,"rur":31,"roī":36,"rup":644,"run":357,"rum":1852,"ruk":350,"ruz":42,"rpē":106,"rus":878,"rut":84,"rva":432,"mē ":626,"rvi":327,"rve":210,"rvs":22,"līč":64,"roņ":62,"rvu":63,"ry ":107,"rsk":108,"rsl":46,"jūl":25,"rsi":451,"rkš":28,"rso":354,"rsp":142,"rsm":215,"rsn":57,"jūd":35,"rsd":23,"rsa":148,"rse":69,"rnā":130,"rta":1012,"jūt":39,"rst":685,"rss":51,"jūr":881,"rnē":53,"rsv":73,"rsu":128,"rtl":32,"rtn":59,"rto":248,"rnī":124,"rte":168,"rth":30,"rti":792,"rts":333,"rtr":97,"roč":76,"rtu":328,"līm":186,"līn":460,"riš":83,"līj":172,"līg":392,"līc":164,"līd":1562,"līb":213,"rt ":437,"līz":66,"līv":121,"līt":444,"rri":36,"rre":54,"rmā":762,"rra":96,"ru ":2726,"rmē":77,"rlī":45,"sab":330,"sac":442,"sad":310,"sag":154,"sai":857,"mēš":100,"saj":138,"sak":466,"sal":1082,"sam":248,"sbi":144,"rzī":41,"sap":162,"san":223,"sau":2277,"sat":285,"sas":1660,"sar":747,"sav":1100,"saz":48,"sa ":1042,"mēģ":49,"rsū":33,"rvē":141,"ruš":59,"ruņ":155,"rze":250,"ruā":39,"rza":50,"rtē":197,"līš":88,"rvā":75,"rzs":23,"rtī":278,"rzi":284,"saņ":59,"sho":31,"shi":22,"si ":826,"kš ":48,"sga":35,"nāv":198,"nāt":1807,"nās":1024,"sgr":33,"saī":148,"siz":24,"saž":44,"sie":635,"sid":107,"kšd":149,"sic":34,"sib":36,"kša":264,"sia":56,"sk ":23,"kšt":50,"sit":335,"kšu":53,"sir":69,"sis":1127,"kšs":52,"kšp":164,"sip":27,"sin":756,"kšn":32,"sio":148,"sil":229,"kšl":23,"kšm":75,"sim":594,"sij":423,"sik":99,"kšk":106,"sih":129,"sif":65,"sig":89,"nā ":1513,"sda":48,"sdi":356,"sbu":40,"se ":542,"sca":22,"sce":66,"sci":117,"sch":88,"sco":39,"sev":271,"ser":334,"ses":362,"set":54,"sfa":26,"sez":117,"sh ":43,"nāj":705,"nād":289,"nāc":166,"sfo":68,"nār":126,"nāl":1149,"nāk":422,"nām":427,"sdz":22,"sei":104,"seg":72,"sed":36,"sec":62,"sep":119,"sen":630,"sem":62,"sel":189,"sek":324,"sej":83,"spu":61,"spo":584,"siņ":49,"kšņ":49,"spr":198,"skā":2459,"spe":377,"spl":61,"spi":604,"kšķ":38,"spa":151,"sot":106,"sov":22,"sol":116,"som":91,"son":503,"sor":227,"sos":35,"sod":41,"sof":59,"kšģ":28,"soj":40,"soc":383,"su ":834,"smē":44,"nču":194,"sjū":46,"slī":114,"kšž":25,"smā":173,"nča":33,"sra":53,"st ":664,"slē":278,"mīt":118,"mīn":84,"mīk":32,"mīl":44,"mīg":202,"slā":344,"mīb":243,"mīd":29,"ss 
":875,"sli":399,"slo":148,"slu":68,"sfē":192,"sla":592,"sle":81,"ski":1153,"sko":1104,"sks":400,"skr":170,"sku":481,"ska":4650,"ske":410,"sno":53,"kšē":181,"sns":27,"sna":41,"nāš":340,"sni":516,"sng":29,"sne":180,"kšā":24,"smo":229,"shē":40,"sms":242,"smu":340,"so ":75,"sma":1140,"smi":429,"seš":62,"sme":291,"soš":166,"nēt":549,"nēs":72,"nēm":160,"nēj":281,"nēz":87,"stā":3123,"sze":116,"suā":44,"stē":996,"stī":1710,"svā":52,"sse":108,"ssa":247,"sso":61,"ssk":34,"ssi":62,"ssv":26,"kūv":24,"smī":51,"ssp":22,"snē":32,"ste":1133,"sta":3898,"stm":49,"stn":212,"sto":1125,"stp":85,"sti":2606,"stl":22,"stv":62,"stu":1969,"str":3208,"snī":38,"sts":1496,"sud":63,"sub":69,"spā":202,"sug":619,"sul":139,"sum":128,"sup":49,"sun":42,"sus":127,"sur":82,"spē":1745,"suv":25,"nē ":308,"sva":373,"sve":113,"svi":110,"soņ":53,"spī":75,"nēš":36,"tai":828,"taj":702,"tak":124,"tal":256,"taf":30,"tag":177,"tab":171,"tac":309,"tad":222,"tbi":216,"tba":91,"tav":303,"tau":931,"tat":537,"tas":3364,"tar":1920,"tap":52,"tan":395,"tam":934,"tce":25,"te ":724,"tbo":1157,"tbr":38,"tdi":61,"tda":95,"svē":166,"suņ":88,"stū":113,"svī":41,"ta ":6165,"ozā":24," št":206," ši":160,"pa ":730," šo":253," ša":314,"iķu":54," še":87,"iķi":331," Šv":81,"iķa":30," Št":29,"iķe":39," Šo":131," Ši":163," Še":104,"ovī":25," Ša":199,"ovē":187,"osū":27," Šķ":45,"pdr":26," šā":147,"iļu":60," Šī":134,"kā ":4239,"iļi":22,"pci":27," Šā":65,"iļa":27,"pe ":174,"par":3884,"ozī":528,"pat":548,"pas":2092,"pav":354,"pau":149,"paz":430,"pba":51,"pac":124,"pad":181,"paa":70,"pab":25,"pag":415,"pak":786,"pal":445,"pai":59,"pap":138,"pam":682,"pan":195,"pc ":24,"iļā":27,"paļ":30,"pha":38,"paņ":40,"phi":54,"pga":664,"kāz":22,"pi ":142,"pgr":35,"kād":424,"kāb":641,"kāc":170,"kāp":196,"kān":152,"kāt":32,"kās":1168,"kār":1333,"paā":29,"kām":189,"kāl":251,"kāk":39,"kāj":89,"pea":50,"pec":223,"ped":72,"pdz":192,"pen":158,"per":1152,"pet":33,"pes":264,"pei":23,"pel":224,"pek":146,"peļ":35,"pla":1300,"pli":106,"pgā":32,"ple":241,"plo":77,"plu":29,"pka":321,"pko":110,"pja":69,"iļņ":141,"pjo":49,"pju":63,"pie":4300,"pig":22,"paš":762,"pij":105,"pil":2275,"pin":133,"pio":305,"pir":966,"pis":387,"pit":151,"poz":106,"por":630,"pop":247,"pot":153,"pos":221,"poj":154,"pog":70,"pon":210,"pok":25,"pol":767,"pod":51,"ps ":241,"ppu":25,"ppl":37,"piņ":42,"pkā":95,"iņu":475,"iņs":42,"iņo":51,"kāņ":216,"iņi":157,"iņj":22,"pme":52,"pma":68,"iņa":746,"po ":65,"gžņ":89,"pni":63,"pne":24,"psu":31,"pst":387,"iņķ":170,"pta":564,"pse":42,"ņķo":49,"ņķi":80,"psi":158,"psk":78,"ņķa":92,"pso":28,"ptu":237,"ptv":61,"pub":494,"pte":121,"pti":177,"pto":59,"pnī":27,"plū":201,"plē":184,"pra":501,"jīg":32,"plā":202,"pmē":120,"pru":34,"psa":72,"pu ":321,"pmā":37,"iņā":150,"pri":1091,"pre":942,"pro":1797,"plī":101,"poš":65,"pož":46,"prē":65,"prī":60,"psē":31,"pur":145,"pmū":26,"pus":720,"put":341,"pum":233,"pun":290,"ņģē":37,"iņš":323,"pul":349,"poļ":35,"pva":35,"pvi":404,"prā":125," Ņū":40,"ptū":22,"pzi":58,"ptī":28,"prū":42,"ņģe":27,"pzī":361,"lā ":1304,"lār":401,"lās":590,"lāt":209,"lām":367,"lān":215,"lāp":55,"lāj":219,"lāk":1056,"lāg":85,"lād":144,"lāc":180,"lāv":61,"lāz":36,"iša":70,"iši":31,"išu":56," Ži":23," Že":35," Ža":46,"lāč":85,"lāš":22," ža":153," že":29," žu":96,"lāņ":65,"išķ":223,"išņ":36,"ņēm":347,"qua":27,"lē ":420,"quu":124,"que":29,"qui":79," šī":215," šķ":525,"lēd":76,"lēc":25,"lēm":141,"lēn":178,"lēk":87,"lēl":48,"lēj":178,"lēg":138,"lēs":295,"lēt":383,"lēr":42,"lēp":64," šū":130," Ūd":31," ūd":556,"ra 
":3874,"ežo":374,"ngo":130,"ngi":47,"eži":331,"ngl":303,"ngv":61,"ežu":144,"ngu":183,"ngr":212,"ngt":36,"ngs":108,"ni ":1261,"nge":139,"eža":263,"nga":229,"ncē":85,"nha":38,"nhi":24,"nhe":54,"neg":80,"nej":92,"nei":201,"nel":264,"nek":293,"nen":247,"nem":97,"nep":336,"neo":76,"ner":698,"net":418,"nes":1590,"nev":218,"ndz":31,"ng ":184,"nea":224,"neb":55,"nec":44,"ned":153,"nfi":27,"nfo":273,"iān":89,"iāl":1004,"nfl":40,"nfr":48,"nez":35,"iāc":169,"nfe":88,"nci":571,"gļi":38,"nce":476,"nch":29,"nca":32,"ne ":1014,"nbu":52,"ndu":273,"ndr":359,"nds":68,"ndo":248,"ndi":727,"nde":437,"nda":780,"gļu":239,"nak":85,"nal":235,"nam":315,"nan":170,"nar":136,"nac":383,"nad":66,"nae":90,"naf":23,"nag":57,"nai":923,"naj":193,"nab":23,"nbe":61,"nd ":206,"nav":456,"nau":117,"nat":306,"nas":3073,"naz":22,"na ":4384,"muļ":23,"muš":43,"hēm":45,"mtā":193,"ntā":395,"nsē":48,"noš":193,"ny ":39,"nvi":1034,"nux":29,"nve":133,"nva":62,"nuk":32,"nul":25,"num":139,"nus":295,"noī":26,"nto":950,"ntu":416,"nts":925,"ntr":875,"nti":950,"nth":28,"nta":799,"nte":768,"nsu":59,"nsv":36,"nsp":162,"nso":88,"nst":540,"nss":27,"nkū":25,"nsf":27,"nse":131,"nsg":24,"nsi":192,"nsl":44,"nsk":554,"nsa":85,"nu ":3157,"nmē":32,"iču":24,"ičs":33,"nrs":66,"iči":25,"nri":78,"iča":38,"nra":85,"nt ":98,"hīd":23,"nkā":234,"niņ":82,"ns ":4213,"noc":58,"nod":505,"noa":58,"nob":42,"nog":200,"nof":35,"nok":292,"nol":424,"noj":304,"nop":85,"nom":610,"non":115,"not":1518,"nos":1301,"nor":435,"nov":804,"noz":696,"npa":23,"niķ":23,"nne":83,"nna":118,"ngļ":211,"nno":31,"nni":41,"nme":24,"nma":29,"iāņ":39,"ežģ":48,"neš":108,"ndž":62,"ežī":26,"ežā":198,"ngā":136,"nn ":32,"nla":94,"neļ":30,"nle":25,"no ":5915,"nke":26,"nki":45,"nkc":180,"nka":173,"nku":121,"nko":53,"gļū":52,"nks":81,"nkt":272,"nkr":126,"nja":39,"ndī":35,"njo":28,"nij":1586,"nig":62,"nif":36,"ndā":228,"nie":3147,"nid":123,"nic":83,"nia":73,"nk ":29,"niz":488,"ndē":98,"niv":242,"nis":2463,"nit":151,"nir":27,"nio":28,"nip":37,"nim":128,"nin":84,"nik":329,"nil":57,"ogs":183,"ogr":713,"ogu":116,"ogi":122,"ogl":96,"ogo":65,"ogn":46,"oga":163,"obā":56,"ņš ":323,"obī":23,"ocē":27,"oho":29,"ohn":25,"oha":113,"odē":120,"ois":41,"oin":31,"ocī":54,"odā":459,"iģi":165,"iģe":27,"ok ":31,"gša":114,"oju":1293,"odī":31,"ojo":586,"oji":109,"oje":183,"oja":1186,"ol ":51,"oce":578,"och":26,"oci":581,"ock":58,"oco":46,"obs":66,"obu":91,"oca":32,"ode":548,"odi":313,"odo":337,"odn":41,"ods":132,"odr":212,"ocy":51,"jā ":4204,"of ":167,"oda":2367,"oel":45,"oef":31,"oei":29,"oen":27,"odz":39,"odu":575,"jām":1042,"jān":27,"ofi":331,"jāj":22,"jās":923,"ofs":60,"oft":50,"ofo":72,"ofe":168,"jād":51,"jāb":32,"ofa":27,"nzē":40,"oal":24,"oak":87,"oba":54,"od ":93,"obo":33,"obr":164,"obl":72,"obj":208,"obi":154,"obe":704,"nsī":306,"ntē":146,"ة ":22,"nza":50,"nzi":41,"nzo":53,"ntī":340,"nvā":46,"ntū":58,"otā":1010,"orķ":25,"orī":50,"opš":220,"jēj":29,"osā":36,"jēd":148,"orē":91,"ows":32,"ovā":78,"gūš":56,"ozu":30,"otī":57,"ozo":189,"oze":115,"ozi":143,"oza":288,"otē":115,"orņ":38,"otu":305,"oti":1556,"oth":36,"ote":602,"ott":59,"olū":125,"ots":723,"otr":369,"onī":135,"oto":694,"otn":222,"onē":180,"ost":506,"gūt":168,"osu":40,"ota":947,"osi":149,"osk":156,"ose":88,"osf":203,"onā":1526,"osp":73,"oss":48,"gūs":114,"gūr":45,"osm":456,"osl":225,"oso":109,"orā":211,"owe":22,"ovi":389,"ovg":31,"opī":120,"ovs":143,"opē":176,"ox ":29,"jē ":37,"ova":643,"ove":137,"opā":322,"oun":55,"ous":25,"our":37,"out":44,"opm":30,"opo":347,"opi":253,"opl":68,"ope":301,"okā":278,"oph":40,"opa":835,"os 
":3856,"opu":475,"opr":71,"opt":86,"ops":93,"ojā":198,"or ":99,"oot":22,"oor":77,"ork":107,"orl":52,"orm":1086,"orn":222,"oro":248,"orp":128,"olī":129,"orr":39,"orc":49,"okļ":89,"ord":492,"ore":247,"omā":452,"orf":64,"org":872,"orh":26,"ori":1689,"osa":935,"ort":723,"ors":691,"orv":185,"oru":295,"omē":208,"orz":33,"ory":23,"olā":244,"ot ":1449,"m² ":52,"orb":148,"ora":653,"olē":98,"olb":29,"ola":1498,"old":135,"on ":659,"oli":1155,"oll":82,"olk":110,"olf":57,"ogā":71,"ole":441,"olg":32,"ols":417,"olt":46,"olm":107,"oln":31,"olo":1146,"olu":413,"ogē":50,"oka":339,"om ":55,"oki":85,"okh":49,"oke":542,"okr":192,"oks":506,"oko":208,"okl":145,"okm":47,"okt":89,"oku":309,"ona":1194,"ond":224,"ogļ":73,"onc":136,"onf":111,"one":173,"ong":158,"oni":1530,"onk":159,"onn":56,"ono":626,"onr":30,"ons":1271,"ont":491,"onu":567,"onv":60,"gšē":65,"ony":39,"onz":36,"oma":1047,"ome":546,"omb":95,"omi":730,"omm":51,"omj":98,"omp":487,"omo":191,"omt":26,"omu":284,"oms":200,"op ":56,"la ":3657,"eķu":303,"kuļ":50,"ktū":291,"kuš":110,"eķi":82,"le ":615,"eļd":25,"lce":40,"eļe":30,"eļa":635,"lci":93,"eļi":128,"eļv":32,"eļu":421,"eļo":286,"eļr":165,"gā ":654,"lde":303,"lda":256,"ldo":68,"ldn":139,"ldi":101,"ldv":22,"ldu":71,"lds":71,"ldr":27,"lab":686,"lac":74,"lad":63,"lah":38,"lag":32,"laj":474,"lai":2379,"lak":285,"lan":766,"lam":314,"lap":246,"lar":131,"lat":1524,"las":2361,"lau":462,"lav":247,"lay":44,"laz":38,"lba":32,"ld ":81,"lbe":128,"lbu":171,"lbr":107,"kvi":68,"kve":97,"kva":316,"kuu":25,"kut":36,"kmū":49,"kus":576,"kur":4124,"kup":73,"kun":181,"kum":1594,"kul":627,"kuk":32,"kuj":25,"koš":92,"krā":588,"koņ":106,"kta":560,"kte":51,"knē":45,"kss":117,"kmī":26,"ksp":156,"ksu":120,"kst":1947,"ksk":31,"cūk":37,"ksi":433,"kso":59,"ksn":205,"ksm":118,"ksl":373,"ktr":699,"kts":634,"ktu":492,"kti":662,"kto":408,"ksī":103,"ktē":73,"kuģ":144,"krū":64,"ktī":473,"ksā":78,"krē":127,"ktā":172,"krī":67,"fēr":208,"ksē":49,"lpo":192,"liņ":107,"lps":165,"lkā":113,"lpe":42,"lpi":52,"dū ":66,"ls ":2358,"lpu":76,"lok":186,"lon":250,"lom":194,"lop":98,"lor":215,"lod":1255,"loc":172,"log":309,"loj":39,"liģ":139,"lpa":98,"los":267,"lot":233,"lov":125,"loz":171,"lno":58,"lnk":23,"lni":256,"lne":55,"lob":58,"lnv":34,"lnu":164,"lns":198,"eņo":30,"lmi":75,"eņi":138,"lme":73,"eņe":150,"eņb":28,"lma":269,"eņa":350,"gāļ":23,"lna":214,"lmu":99,"eņu":143,"eņr":134,"lms":33,"ltk":80,"lti":424,"ltn":68,"lto":53,"ltr":29,"lts":217,"lnī":100,"ltu":221,"lud":54,"luc":32,"lub":111,"lug":36,"lpā":54,"lsi":33,"lkš":22,"lsk":140,"lsm":135,"lsn":30,"lso":131,"lsp":34,"lss":53,"lst":1735,"lsu":49,"eņķ":69,"lv ":26,"lta":371,"lte":188,"lkņ":24,"lmā":69,"lu ":1807,"eņē":35,"lnā":79,"lse":64,"dūd":48,"lsa":85,"lt ":35,"lhe":59,"eļā":42,"lgu":44,"lgs":36,"lgt":25,"lgr":44,"lgo":65,"lbā":28,"lge":25,"li ":1126,"gāz":197,"lga":178,"gār":144,"lfr":22,"gās":198,"gāt":113,"gān":110,"gāj":129,"gāk":229,"gāl":68,"gām":215,"gāc":40,"gād":124,"lfa":193,"lez":112,"lev":95,"les":1626,"let":176,"ler":213,"leo":70,"lep":67,"lem":358,"len":329,"lek":1222,"lei":141,"lej":190,"leg":36,"lef":47,"led":254,"lec":71,"lea":23,"lfī":26,"lls":27,"ldū":50,"llu":78,"lo ":480,"lla":140,"lle":160,"lgā":68,"lli":182,"leņ":73,"llo":82,"lko":184,"lku":31,"lks":97,"lka":119,"lke":27,"lki":33,"eļš":95,"leģ":37,"lkl":39,"leč":51,"ljo":123,"ldī":542,"eļļ":42,"lje":30,"ll 
":123,"lja":61,"eļņ":24,"lit":895,"lis":1779,"lir":22,"lip":94,"lio":99,"lin":517,"lim":573,"liz":308,"ldē":33,"liv":35,"lic":208,"lid":326,"lia":94,"lib":49,"lik":929,"laš":379,"eļģ":25,"lij":964,"lig":100,"lie":3537,"lif":131,"ldā":34,"ma ":2997,"mac":23,"mai":882,"maj":174,"mak":194,"mad":68,"mag":255,"map":23,"lzī":23,"mar":276,"mas":1459,"mal":305,"mam":184,"man":1880,"maz":508,"mat":1537,"mba":149,"mbl":29,"mbi":138,"mbe":47,"mbr":158,"mbo":287,"me ":570,"mbu":166,"mda":33,"mde":27,"mdo":23,"med":310,"meg":22,"met":1247,"mes":1625,"mer":791,"mem":29,"mel":157,"men":1391,"mei":147,"meh":105,"mek":209,"mez":28,"hār":41,"mfo":25,"hān":110,"lva":502,"lve":648,"lvi":41,"luk":23,"loģ":845,"lup":35,"luo":34,"lum":401,"lut":87,"lus":425,"lur":30,"loī":24,"ly ":28,"loš":29,"loņ":26,"lvo":84,"lvu":57,"lož":83,"ltā":300,"gēn":73,"lsē":1122,"lzi":33,"lza":40,"lzc":242,"lsī":24,"lvā":83,"ltī":58,"lzs":43,"lvē":725,"ltū":225,"mpi":549,"mpe":254,"mpr":55,"mpo":156,"miņ":109,"mpl":189,"mpu":71,"mps":29,"mpt":22,"ms ":3789,"mog":37,"moc":27,"mob":84,"mod":286,"mon":399,"mop":26,"mok":98,"moj":43,"mom":24,"mol":217,"mor":220,"mos":1307,"mot":190,"mpa":147,"ešķ":123,"miķ":31,"mmā":27,"mmē":68,"msa":67,"mu ":2868,"mt ":26,"mtu":58,"mts":51,"mti":48,"mug":77,"mpā":152,"mss":33,"mst":25,"msu":24,"msk":121,"mte":23,"mta":1054,"mvi":24,"moš":35,"mur":78,"mus":558,"mut":78,"mui":109,"mul":121,"mum":318,"mun":151,"mpē":250,"muz":106,"maņ":208,"džu":100,"džs":35,"mga":146,"dža":84,"mi ":937,"maģ":28,"dži":70,"dže":114,"mju":201,"ml ":23,"mje":32,"min":1418,"mio":29,"ešo":30,"mil":392,"mir":718,"ešs":32,"mis":1428,"ešv":29,"mit":475,"ešu":1242,"miz":31,"mic":35,"eša":400,"mig":64,"mie":997,"ešd":35,"mid":33,"mik":313,"mij":472,"maš":145,"eši":317,"mo ":301,"meņ":339,"meļ":1113,"džā":37,"mko":36,"mm ":35,"mni":192,"mna":27,"ešā":114,"mež":103,"mmu":74,"mma":291,"mme":33,"vī ":36,"tšķ":209,"uņi":48,"uņo":84,"uņu":131,"tūt":53,"tūr":1186,"tūk":66,"tūc":93,"vīn":52,"vīt":70,"vīr":237,"vīg":77,"vīd":56,"vīb":202,"vīz":153,"vē ":134,"īču":22,"īča":67,"sūt":95,"vēl":461,"vēk":693,"vēj":244,"vēc":27,"Čeh":54,"vēs":501,"vēt":425,"vēr":790,"vēn":31,"vēģ":76,"Čik":22,"uļo":37,"uļv":73,"uļu":71,"uļi":36,"uļa":32,"vēš":46,"tļa":51,"tļu":59,"tļi":52,"tļo":22,"vā ":578,"vāj":43,"vāc":471,"vāl":70,"vāk":140,"vān":38,"vām":138,"vār":1021,"vāt":107,"vās":151,"vāv":76,"ržu":43,"uģa":52,"uģi":59,"uģu":36,"tņu":72,"ča ":145,"zra":210,"zru":92,"zjū":39,"tīd":188,"zlā":29,"tīb":1094,"zs ":189,"tīt":1190,"tīs":433,"tīv":934,"tīr":77,"tīk":237,"tīn":159,"tīm":268,"tīg":275,"tīj":350,"zte":106,"zti":66,"ztu":134,"čem":249,"ztv":96,"čet":144,"znī":263,"čer":57,"zsa":231,"rūd":56,"znā":215,"zmē":101,"zu ":183,"rūt":48,"zst":360,"rūs":59,"zsv":33,"znē":22,"zsl":25,"zsk":600,"rūk":50,"rūn":90,"rūm":55,"zsp":47,"rūp":88,"zva":673,"zvi":325,"zve":584,"či ":69,"zul":164,"zum":137,"zpē":51,"zus":63,"čie":24,"zsā":72,"tīņ":237,"zrā":47,"tīģ":32,"zzi":39,"zuā":39,"īģe":36,"tīš":166,"zvē":121,"rūš":31,"čs ":40,"zgl":134,"zga":142,"rš ":626,"zi ":317,"uār":73,"uāl":181,"zaļ":79,"zgu":59,"zef":34,"zej":232,"zdz":26,"zeb":60,"zdo":186,"uāc":50,"zes":430,"zen":285,"zem":1268,"zel":450,"zek":170,"zer":903,"ze ":308,"zce":422,"zbr":68,"zda":107,"zde":234,"zci":73,"zab":39,"īļu":88,"zai":49,"tēš":90,"zah":57,"zam":111,"zan":24,"zak":23,"zar":328,"zau":128,"zav":27,"zas":275,"zat":43,"zod":22,"zob":42,"tī 
":235,"zos":43,"zot":69,"zor":28,"zop":22,"zom":26,"zon":281,"zol":154,"zof":144,"ziļ":81,"ziķ":104,"zpa":78,"zpr":45,"ال":32,"zpl":262,"ziņ":261,"zkā":32,"zpi":173,"zo ":48,"zma":918,"zme":35,"zmi":84,"zna":38,"zmu":41,"zno":68,"zne":198,"ršā":79,"zni":156,"zbū":81,"zka":34,"zkl":48,"zkr":193,"zla":642,"zgā":25,"zli":78,"zeļ":55,"zeņ":47,"rša":176,"zie":1720,"zid":135,"zij":625,"rši":100,"zin":1174,"zim":1926,"zil":123,"zik":669,"ršo":29,"zio":48,"zcī":73,"ršr":51,"zir":324,"zis":229,"zit":58,"ršu":90,"ziv":58,"tē ":318,"yst":46,"rņā":31,"sīk":51,"sīj":64,"sīm":27,"sīn":31,"sīs":28,"sīt":105,"sīv":85,"sīb":502,"sīd":98,"sīg":56,"tāž":25,"yon":65,"sī ":32,"tēģ":40,"īņā":28,"za ":232,"āža":23,"āžu":50,"īņu":254,"īņa":76,"sīļ":23,"tēk":46,"tēl":335,"tēm":821,"tēn":44,"tēj":348,"tēt":204,"tēs":73,"tēv":63,"tēr":175,"tēz":37,"īša":499,"īšu":52,"tā ":2912,"tāc":148,"tād":527,"tāk":538,"tāl":1306,"tāj":1760,"rķu":27,"rķe":31,"rķi":179,"yla":29,"tāļ":48,"rņa":66,"rņu":41,"tāš":25,"tāv":1332,"tāz":22,"tāp":79,"tām":538,"tān":411,"tās":1054,"tāt":740,"tār":419,"rīz":162,"rīn":188,"rīj":40,"rīk":162,"rīl":26,"rīv":387,"rīs":243,"rīt":190,"rīg":665,"rīb":593,"rīd":236,"rīc":210,"pūt":23,"sāļ":43,"rī ":2361,"rīš":55,"pūš":30,"sē ":157,"sēj":316,"sēk":27,"sēm":89,"sēn":52,"sēd":44,"sēs":30,"sēr":135,"sēt":1211,"ožu":58,"oža":54,"sāt":137,"sās":67,"sār":68,"sāp":24,"sān":37,"pš ":215,"sāc":43,"sāj":28,"sāk":643,"sāl":103,"sām":92,"ožā":27,"pšu":22,"rģi":226,"pša":33,"rģe":23,"rēķ":47,"sā ":333,"rēš":122,"rē ":110,"ww ":36,"āļu":173,"āļi":45,"www":36,"rēt":449,"rēs":31,"rēm":93,"rēn":56,"rēj":380,"rēk":33,"rēl":48,"rēd":88,"rāļ":69,"rāņ":44,"ošā":198,"rāš":23,"ws ":30,"ošī":35,"wor":28,"rā ":1557,"pļa":22,"rāk":869,"rāl":881,"rām":402,"rān":164,"rāg":38,"rāj":67,"rāc":432,"rād":816,"rāf":309,"rāb":117,"wer":27,"nže":53,"rār":31,"rāp":24,"rāv":135,"rās":553,"rāt":492,"rāz":74,"ošs":181,"ošu":115,"oši":415,"ošo":147,"oša":1342,"vze":33,"vvē":32,"āņi":41,"āņu":468,"vuš":26,"āņa":65,"war":33,"pīt":24,"pīr":89,"pīn":23,"pīl":42,"viš":195,"pīg":88,"pīd":81,"pīb":56,"vs ":473,"vri":37,"vst":90,"vsk":63,"vu ":956,"vus":141,"vmū":35,"vum":201,"vuk":61,"vul":71,"vva":94,"voš":86,"pīļ":68,"āša":527,"via":49,"nša":32,"vio":33,"vir":905,"vik":41,"vil":371,"vin":313,"vig":52,"vij":2101,"vic":31,"vid":1832,"vie":6969,"viz":69,"nšu":53,"vit":157,"nšt":29,"vis":1620,"vji":56,"vju":71,"vo ":339,"oņa":59,"veš":46,"oņi":162,"oņu":145,"viā":31,"vna":31,"vni":238,"vič":61,"vod":34,"voj":423,"vol":130,"vok":149,"von":248,"vor":27,"vot":667,"vos":64,"vpa":23,"viļ":118,"vpi":79,"viņ":404,"vpr":25,"vaņ":38,"vaļ":151,"vi ":493,"vgo":31,"ver":992,"ves":531,"vet":45,"vej":30,"vei":2843,"ven":961,"vem":34,"vel":158,"vek":42,"ved":123,"vec":250,"oļu":172," − ":35,"vda":91,"uzņ":254,"ve ":176,"val":3215,"vak":43,"van":198,"vam":39,"var":1672,"vat":135,"vas":1579,"uzē":26,"vab":47,"vad":1354,"vai":3704,"vaj":267,"pēļ":144,"va ":982,"mūž":31,"uvē":31,"uvā":81,"uzv":164,"uzl":72,"uzk":39,"urš":530,"uzi":73,"uzg":36,"uze":128,"uzt":192,"uzs":479,"uzr":64,"utī":62,"uzm":28,"utē":47,"uzb":144,"uzc":22,"uzd":109,"utā":171,"usī":47,"pēt":433,"usē":169,"usā":96,"pēd":188,"pēc":941,"pēj":691,"urģ":36,"pēm":32,"pēl":736,"pēk":384,"pēr":307,"urī":204,"urē":196,"uz ":1335,"urā":908,"umš":52,"uum":27,"upā":104,"upē":50,"ux 
":33,"uus":124,"uvi":141,"uvo":31,"uva":357,"uve":337,"upī":23,"uvu":135,"usl":182,"usm":71,"usj":49,"usk":104,"mūk":26,"ukš":120,"usi":609,"unā":343,"usd":32,"use":127,"usa":206,"mūz":468,"usz":100,"usv":36,"usu":107,"ust":2158,"mūs":296,"uss":395,"mūr":69,"umī":24,"uso":49,"mūn":33,"utn":283,"uth":28,"uti":652,"ute":201,"uta":279,"utb":797,"utt":67,"uts":90,"utv":22,"utu":102,"uto":580,"utr":30,"unī":25,"us ":3775,"ulā":424,"oīd":99,"ulē":350,"ut ":97,"urb":65,"ura":1608,"urd":30,"urc":141,"umā":913,"ure":246,"urg":216,"uiž":110,"urj":22,"uri":851,"url":46,"urk":327,"urn":303,"uro":291,"urp":107,"ulī":38,"urs":344,"urt":245,"uru":1149,"urv":97,"umē":24,"urz":200,"unz":34,"ugš":195,"ujā":42,"uor":22,"upa":461,"ur ":573,"upj":23,"upi":127,"ukā":121,"upe":293,"upo":29,"upu":109,"ump":111,"ums":2443,"umu":2190,"umt":47,"umv":39,"umi":1048,"umk":31,"pāņ":98,"umm":30,"uml":29,"umo":1060,"umn":30,"uma":1511,"umb":191,"umd":66,"ume":317,"udž":74,"unt":48,"uns":163,"unr":34,"unv":30,"unu":158,"unl":25,"unk":486,"uni":663,"uno":67,"ugļ":37,"unc":25,"und":348,"una":264,"ung":150,"une":47,"up ":25,"uks":271,"uku":836,"ukt":791,"uko":36,"ukl":91,"uki":42,"ukc":69,"um ":175,"uka":140,"ulv":63,"ulu":136,"ult":666,"uls":122,"ulp":39,"ulo":29,"ulm":42,"ull":58,"ulk":243,"uli":203,"ulg":48,"ule":823,"ulf":35,"ugā":140,"ulc":29,"uld":110,"ula":397,"ulb":50,"un ":9898,"uid":83,"oģe":28,"udā":23,"oģi":861,"uil":24,"uis":38,"ucē":126,"ķa ":175,"mša":94,"uji":27,"ujo":73,"udē":94,"ul ":24,"uja":223,"ubā":54,"ugi":71,"ugo":30,"ugl":76,"pār":1984,"pās":27,"uga":887,"ugu":553,"ugs":799,"uha":22,"uj ":134,"uco":23,"pā ":453,"uda":124,"ude":104,"udi":282,"ķir":550,"ķis":433,"ķin":48,"ubs":106,"ķij":62,"ķim":24,"ubu":76,"ķie":106,"ķid":184,"uca":88,"ue ":35,"uce":66,"uci":96,"uer":36,"pāc":26,"pān":369,"pām":68,"pāj":65,"udu":86,"āču":68,"udr":142,"udo":32,"ug ":37,"udz":752,"uel":29,"ķet":39,"ķes":25,"ķer":270,"ķen":35,"ķel":35,"tuš":25,"ua ":31,"tzī":84,"uar":51,"ual":31,"uan":35,"ubi":59,"ubj":31,"ubl":522,"ube":111,"uba":77,"uag":22,"uc ":316,"ķi ":217,"tvā":37,"tzi":22,"tuā":134,"ttī":266,"trū":52,"ttē":198,"tza":38,"ttā":196,"nīš":44,"tyl":39,"tvē":108,"ķu ":470,"ty ":51,"toņ":39,"tve":395,"toļ":81,"tvi":1976,"tva":158,"tur":1909,"tus":646,"tuv":673,"ķo ":45,"tul":280,"tuk":47,"tun":72,"ķeš":66,"tum":1336,"tub":43,"tud":188,"tuc":22,"tug":34,"tpū":25,"trī":313,"trē":132,"two":22,"toš":435,"trā":1294,"nīc":430,"nīd":58,"nīb":728,"ts ":6404,"nīj":53,"nīg":331,"nīt":121,"nīs":25,"nīr":102,"nīn":27,"nīm":63,"tlē":35,"tre":229,"tt ":106,"oča":27,"tra":1972,"tri":1251,"oči":28,"trs":398,"tru":1883,"tro":2406,"tlī":43,"tu ":4075,"tmē":32,"tsa":92,"tse":174,"lūc":74,"lūd":44,"lūg":25,"lūk":137,"tsk":152,"tsl":24,"lūp":24,"tsp":162,"tsv":27,"tsu":24,"lūt":49,"tst":123,"lūs":138,"lūz":46,"tnē":293,"tta":34,"tte":67,"tti":291,"ttl":22,"tto":22,"tnī":27,"ttp":39,"ķus":35,"tma":80,"to ":1892,"tms":32,"tmo":123,"tml":23,"tmi":87,"tni":753,"tne":504,"tp ":39,"tna":35,"tns":112,"tnu":123,"tno":42,"tof":38,"tod":192,"toc":137,"toj":671,"tog":69,"nī 
":207,"tob":116,"tov":52,"tos":547,"tot":998,"toz":23,"tom":540,"ton":588,"tok":294,"tol":346,"tor":2083,"top":326,"tpe":28,"tkā":47,"tpi":87,"tiķ":219,"tpa":63,"tpl":32,"tiņ":164,"tpr":24,"tij":940,"til":601,"tik":1958,"tif":82,"tie":4403,"tig":30,"tir":195,"tit":273,"tis":2277,"tin":593,"tim":104,"tip":378,"tio":312,"tia":75,"lša":104,"tic":277,"tid":70,"teā":78,"tju":24,"tiz":110,"lšu":27,"tiv":158,"tja":44,"tki":59,"tkl":212,"tko":27,"tkr":96,"tku":37,"tka":280,"tli":341,"teņ":82,"tla":152,"tgā":22,"tle":76,"tem":480,"ten":587,"teo":285,"tep":65,"tei":672,"tej":47,"tek":587,"tel":542,"teg":123,"teh":268,"teb":48,"tec":92,"ted":53,"tfo":35,"th ":68,"tet":45,"tes":1146,"ter":2575,"tgr":47,"ti ":2942,"tga":132,"tač":123,"tho":40,"the":191,"thi":29,"taļ":38,"tha":41," α ":31,"āpa":64,"āpe":142,"ākā":884,"āpi":34,"ājē":32,"ār ":34,"ājā":201,"āno":72,"āns":365,"ānu":327,"āre":82,"āmā":78,"ārg":36,"āra":470,"ārb":33,"ākļ":103,"ārd":904,"ārm":72,"ārn":280,"āro":74,"ārp":114,"āri":461,"ārk":89,"ārl":118,"āt ":234,"ālā":1101,"āpu":44,"āps":37,"ās ":6616,"āld":44,"āgā":35,"āle":181,"āli":1326,"āla":1215,"āks":485,"ākt":61,"āku":495,"āko":642,"āki":268,"āka":1716,"ām ":3525,"ājs":680,"ābū":29,"ādī":293,"āju":1139,"āji":502,"ājo":89,"ājp":22,"āni":891,"āna":615,"ānd":34,"āmu":42,"āms":160,"āmi":68,"āma":396,"ālu":633,"āls":579,"ālo":317,"ārē":202,"āsā":40,"āpš":36,"ārī":93,"āvu":149,"žās":33,"ārā":289,"žād":455,"žāk":194,"žām":34,"ātē":54,"āze":265,"ārš":275,"āzi":216,"āzu":50,"zšķ":85,"ātī":67,"ātā":439,"ārņ":54,"āv ":562,"žā ":38,"āta":627,"āst":95,"āsu":62,"ātn":541,"ātr":318,"āto":42,"āte":469,"āti":825,"ārz":92,"āsa":252,"ālī":22,"ārr":75,"ārs":844,"ārt":1313,"āru":144,"ārv":488,"ānā":124,"āsi":27,"ākš":30,"āvd":67,"āva":113,"āpē":74,"āvs":30,"āvo":161,"āvj":71,"āvi":182,"āve":205,"āts":679,"ātu":408,"Āzi":142,"āfi":261,"āfr":63,"āfs":37,"āga":57,"āgs":41,"āgo":45,"āj ":38,"ābē":84,"ācē":37,"āk ":1029,"ādā":677,"ācī":199,"ādē":157,"āja":844,"āje":33,"ābi":54,"ābj":26,"ābe":514,"āba":24,"āca":27,"ābu":87,"āci":2513,"ācb":31,"āda":636,"ācu":337,"ādn":33,"ādo":104,"āde":246,"ādi":416,"āds":129,"ādu":390,"ļin":31,"ļie":252,"ļi ":260,"zņe":149,"ļda":25,"ļav":42,"ļau":478,"ļai":47,"ļas":391,"ļam":83,"ļa ":1215,"āzē":88,"zņē":241,"žīm":24,"žģī":37,"ātņ":41,"āvē":149,"āvā":189,"āvī":87,"AU ":30,"":23,"zēš":55,"BA ":99,"ķī ":22,"ļot":251,"ļos":305,"ļiņ":134,"ļoj":28,"ļri":165,"Ķīn":125,"ļu ":1814,"zīš":29,"ļsk":34,"ļve":81,"ļvi":28,"ļum":33,"ļus":125,"ļuv":123,"ķēr":58,"ķēn":27,"ķēd":39,"ļoš":38,"Žan":29,"zīs":345,"zīt":108,"zīv":1468,"zīl":49,"zīj":29,"zīm":1070,"zīd":65,"zīc":55,"zīb":324,"zīg":242,"AP ":26,"zāc":457,"zā ":57,"ža ":310,"zāk":210,"zāl":65,"zām":26,"zān":65,"zās":75,"žku":26,"zē ":172," С ":34,"žis":66,"ķīd":71,"ķīr":23,"ķīs":84,"žie":84,"ķīn":49,"ķīm":271,"žos":33,"žot":153,"žor":51,"žkā":37,"Āge":33,"žon":63,"žoj":249,"Āfr":113,"zēl":36,"zēj":212,"zēm":74,"zēt":500,"zēs":27,"zēr":47,"žni":23,"žo ":82,"žas":293,"žag":66,"žai":25,"žan":122,"žam":29,"ži ":383,"žer":29,"žes":28,"žet":61,"žei":43,"žel":23,"žen":74,"uža":37,"užu":32,"žus":34,"žur":90,"žoš":80," Н ":25,"žs ":70,"žre":77,"žu ":492,"ušā":109,"ušu":56,"ušo":68,"uši":230,"uša":268},"n_words":[2106271,2428959,2036825],"name":"lv"}
\ No newline at end of file
+++ /dev/null
-{"freq":{"D":603,"E":600,"F":432,"G":577,"A":1208,"B":839,"C":1276,"L":579,"M":1021,"N":552,"O":461,"H":549,"I":1773,"J":289,"K":455,"U":346,"T":883,"W":350,"V":551,"P":991,"S":1413,"R":594,"X":347,"f":1092,"g":1689,"d":2970,"e":9834,"b":1224,"c":2969,"a":10833,"n":6781,"o":7437,"l":4802,"m":3009,"j":528,"k":1469,"h":2724,"i":8516,"w":745,"v":996,"u":3216,"t":5815,"s":5501,"r":6845,"p":1803,"z":571,"y":1482,"x":320,"ν":292,"ο":532,"ι":356,"α":455,"ς":427,"ρ":335,"τ":303," m":470," o":776,"ш":16654," i":285," k":375," d":474,"ф":11418," e":367,"х":6666,"ц":21764,"ч":22883,"р":143160,"с":124190," a":559,"т":178546," c":357,"у":55891,"ќ":2840,"џ":1064,"љ":298,"ј":55736,"њ":6876,"ѕ":416," t":538," p":361,"ѓ":4991," s":442,"Ј":1691,"Џ":661,"И":3549,"Л":2399,"К":6325,"Н":3704,"М":7265,"П":6311,"О":4503,"Б":5118,"А":6164,"Г":3754,"В":4777,"Е":2673,"Д":3627,"З":1960,"Ж":390," K":424,"Ш":1184," H":467," I":966," N":433," O":314," L":498," M":897," B":722,"Т":4778," C":1036,"У":965,"Р":4369," A":1024,"С":9342,"Ц":1100," F":403,"Ч":767," G":506," D":520,"Ф":2111," E":476,"Х":1951,"л":91274,"к":111435,"и":261784,"п":63533,"о":278169,"н":193670,"м":56196," S":1135,"г":42718,"в":105270," R":489,"б":32290," P":814,"а":349769,"з":39653," W":317,"ж":11105," V":362,"е":224574," U":304,"д":90920," T":673,"ا":324,"Ист":523," А":5482," Б":5022," В":4654," Г":3616," Д":3069," Е":2489," Ж":385," З":1936," И":3361," К":5975," Л":2339," М":6938," Н":3453," О":4096," П":6133," Ј":1624," Џ":661,"I ":636," б":12052," а":9283," г":16454," в":30954," е":23200," д":18766," з":12782," ж":2284," и":33901," л":4546," к":23687," н":51827," м":12882," п":36928," о":29291," Р":3961," С":8974," Т":4639," У":893," Ф":1931," Х":1928," Ц":1068,"II ":332," Ч":758," Ш":1131," ј":6372," ќ":305," т":10678," у":3946," р":10396," с":42490," ц":3019," ч":2568," ф":5078," х":2102," ш":4226,"Ma":323,"II":479,"Кар":402,"a ":2176,"i ":786,"he":719,"ha":360,"g ":312,"ea":411,"ed":390,"de":624,"di":400,"h ":457,"el":544,"Кра":315,"en":923,"et":387,"es":758,"er":1600,"ca":400,"e ":2587,"da":295,"f ":324,"Кон":322,"co":397,"ci":314,"ch":448,"ce":390,"d ":868,"at":846,"as":511,"ar":1185,"al":1004,"am":363,"an":1614,"ac":442,"ae":314,"nt":684,"of":311,"om":450,"on":1271,"ol":392,"os":424,"ou":298,"op":290,"or":933,"r ":882,"lo":357,"ll":449,"o ":1254,"ma":512,"me":447,"na":717,"nd":628,"ne":472,"ng":429,"ni":597,"m ":843,"km":293,"li":761,"le":704,"la":806,"n ":1564,"hi":303,"id":295,"ic":719,"ia":681,"ie":315,"is":934,"it":584,"il":394,"in":1115,"io":670,"l ":690,"y ":690,"ve":297,"ur":355,"us":735,"to":540,"tr":283,"te":849,"ti":907,"th":673,"ta":699,"st":673,"se":302,"si":446,"rt":285,"ro":643,"ri":940,"re":789,"ra":861,"t ":818,"s ":2376," ја":4740," ју":1544," ќе":299," ра":3709," ре":3839," ри":619," ро":1485," пр":14360," св":1979," си":2281," се":16046," сл":2331," см":1011," ск":603," сп":1778," ср":1095," со":9088," ру":564," са":1270," ти":832," тв":438," те":3329," то":1808," тр":1830," ст":3760," су":653," та":1317," ус":533," уп":512," ун":383," ум":454," ту":815," фо":1103," фр":608," фу":590," фе":667," фи":1284," фа":744," уч":625," уш":328," хр":549," хо":304," хи":314," хе":609," цр":836," ци":311," це":1471," чо":400," чл":393," чи":499," че":954,"На ":392,"Мак":2308,"Мар":480," ше":287," шп":318," шт":2822,"Нов":305,"ад ":2199,"ав ":328,"аа ":1364,"ам ":781,"ан ":3695,"ак ":666,"ал 
":2115,"Ово":374,"Ова":565,"аву":1675,"авт":686,"ага":381,"агр":309,"аго":607,"ада":949,"ади":1566,"аде":2265,"адр":313,"адо":1542,"адм":285,"адн":1034,"Опш":342,"аед":445,"би ":515,"ај ":1373,"ажу":426,"аат":3002,"або":1052,"ава":2546,"авн":1461,"аво":532,"аве":1404,"ави":1807,"алс":704,"алн":2980,"ало":848,"алб":313,"ала":1561,"алк":385,"али":3092,"але":2411,"амо":926,"ама":619,"ами":1203,"аме":1650,"ано":1527,"ану":570,"анс":4432,"ант":1418,"анц":867,"ана":3750,"анд":1155,"анг":1211,"ани":6541,"ане":747,"анк":497,"азн":452,"азл":623,"ази":3687,"азб":417,"азв":373,"аза":432,"аин":488,"аку":331,"акт":1164,"ако":4144,"акв":313,"аке":3801,"ака":975,"ач ":549,"Пар":311,"ас ":519,"ар ":2828,"ат ":6463,"ба ":1244,"аш ":464,"Пет":286,"САД":331,"Пол":375,"При":340,"Пре":445,"Про":342,"Рим":340,"Се ":508,"Реп":976,"Алб":429,"Аме":307,"Рус":402,"Ско":700,"Срб":287,"Сев":333,"Све":487,"Сел":334,"Таа":318,"Во ":1246,"Бел":406,"Тур":381,"Тој":871,"Вел":340,"Буг":380,"Гра":387,"Грц":367,"Фил":384,"Фра":322,"Евр":489,"лам":296,"лар":497,"лан":1512,"лас":1834,"лат":1878,"лба":979,"ме ":1911,"ма ":2440,"лав":1747,"лаг":370,"лад":919,"кул":1012,"куп":340,"кув":856,"кци":783,"кој":5826,"кре":290,"кра":1871,"кри":723,"кру":671,"кро":310,"лу ":886,"кса":284,"кси":508,"кст":341,"кте":605,"кти":891,"кто":713,"ктр":386,"кту":473,"кла":474,"кло":480,"ло ":2497,"кле":364,"клу":723,"кни":318,"ког":686,"ков":1275,"ком":1394,"кон":2061,"коп":1132,"кор":1338,"кот":2936,"кое":689,"кои":2339,"кол":2256,"кин":368,"кио":3400,"кит":2518,"ле ":2377,"кви":345,"кед":3751,"ли ":5921,"кај":517,"ква":1424,"кат":5730,"кар":1133,"кан":1422,"как":3097,"кал":885,"каж":406,"каз":294,"кац":351,"кад":630,"ла ":4400,"кт ":520,"ку ":1035,"иња":474,"ко ":6033,"ија":20901,"ки ":14686,"од ":14751,"нај":2709,"нац":749,"нау":554,"нач":1308,"нао":2376,"ог ":396,"нан":329,"нам":682,"нал":1991,"нат":7755,"нас":2005,"нар":1995,"нап":588,"нае":357,"над":781,"нак":346,"нда":719,"нго":310,"нгл":1025,"нга":386,"ое ":669,"неш":642,"нек":1134,"нен":441,"ои ":2794,"нер":1150,"нес":1333,"нет":980,"нег":1025,"нез":398,"нди":607,"ндо":434,"нив":1190,"низ":1413,"ник":2233,"ние":1319,"ок ":1161,"неј":441,"оа ":1059,"ов ":1440,"нав":317,"нт ":615,"мпи":518,"мпе":650,"мот":1353,"мск":1350,"мун":373,"муз":537,"мил":526,"мин":1679,"мио":320,"мис":724,"мир":732,"мит":1061,"миј":737,"но ":8706,"мна":468,"мно":1052,"мод":313,"мож":942,"мон":609,"мор":627,"мај":590,"нд ":382,"мбо":288,"маш":335,"мал":1112,"мак":1628,"мат":1625,"мас":309,"мар":614,"нг ":468,"ман":2367,"маа":339,"меѓ":1150,"мес":950,"мет":2904,"мен":3380,"ни ":9820,"мер":1511,"мед":429,"мвр":974,"не ":1045,"на ":51854,"лош":669,"му ":324,"лни":1621,"лно":1301,"лна":1254,"лок":568,"лог":1288,"лож":284,"лос":414,"лот":1377,"лом":349,"лон":478,"лов":1561,"лоб":414,"луч":790,"луц":507,"лув":419,"луѓ":365,"лст":374,"лск":1346,"лту":424,"лта":406,"лиг":412,"лив":613,"лиз":933,"лим":882,"лик":2508,"лез":472,"лев":497,"лег":370,"лед":1625,"лес":486,"лер":345,"ми ":1067,"лен":3033,"лем":2373,"лек":1469,"лет":727,"лку":522,"мо ":748,"лиц":678,"лич":1268,"лис":1550,"лит":1817,"лиф":510,"лин":1340,"лио":355,"лип":313,"лиј":1726,"лка":359,"пат":1339,"пад":1381,"пак":283,"пан":956,"пар":1020,"паѓ":517,"ре ":437,"оѓа":2306,"ра ":3142,"рв ":449,"пин":286,"пис":1343,"пиш":657,"оја":4079,"ојн":740,"ојо":993,"оју":491,"пла":934,"пли":320,"пле":333,"ро ":695,"пло":389,"рз ":285,"ри 
":5094,"пер":1862,"пес":307,"пет":425,"ори":3831,"опј":624,"орд":482,"оре":2005,"орг":1108,"орс":410,"оро":1288,"орм":1326,"орн":663,"опш":1157,"опс":733,"ора":1582,"опе":396,"опи":565,"опо":646,"опр":349,"опа":631,"оте":1213,"отк":367,"оти":883,"ото":5629,"отн":448,"отр":639,"ота":373,"осе":641,"оси":467,"оск":407,"осл":1286,"осн":1425,"осо":623,"ост":6452,"ору":832,"орт":556,"осв":433,"омн":415,"оми":891,"оме":1683,"ома":1343,"олу":1982,"олс":527,"олн":609,"по ":1690,"оло":2164,"олк":576,"олж":334,"оле":2647,"оли":2456,"олг":289,"ола":807,"окр":1306,"окт":286,"око":2063,"онс":2772,"онт":604,"онц":377,"они":3271,"оно":1647,"она":1646,"онг":566,"оне":1032,"омо":661,"омп":789,"ошк":618,"очи":455,"оче":494,"очн":1426,"офи":716,"оце":494,"оци":931,"оцн":311,"ова":2348,"обр":811,"обо":536,"обл":1113,"оби":1201,"обе":598,"па ":1261,"оис":379,"оим":505,"ока":1091,"ој ":6820,"озд":514,"ози":514,"оза":639,"озо":309,"озн":2037,"оиз":367,"одн":2574,"оди":4273,"одр":1508,"одо":1251,"оед":304,"оем":310,"оен":405,"пи ":325,"оет":296,"оже":937,"ове":2546,"ови":5165,"обј":366,"ово":2355,"овн":742,"овр":984,"овс":795,"ога":1201,"оги":878,"огл":313,"ого":815,"огр":1093,"огу":753,"ода":1107,"одг":298,"одв":294,"оде":2517,"от ":21175,"нот":2355,"нос":3399,"ос ":575,"ном":868,"ное":302,"ног":889,"нов":2577,"ор ":1906,"он ":2167,"нкц":314,"нка":519,"нки":331,"мја":559,"ниј":3754,"мји":343,"нир":495,"нис":1095,"нит":3914,"ним":450,"нин":793,"ол ":609,"нио":3455,"нич":1161,"ниц":2089,"нци":1726,"нцу":394,"нув":1506,"нта":1389,"нте":740,"нти":1942,"нто":379,"нтр":628,"нск":8777,"нст":1138,"сам":861,"сан":508,"сат":511,"те ":14904,"сво":1166,"све":1266,"себ":334,"сев":971,"сед":864,"сел":2274,"сек":726,"сеп":284,"ти ":4039,"сен":517,"сем":349,"сет":758,"сер":521,"сис":839,"сит":795,"син":510,"сил":492,"сим":340,"ски":15155,"ска":7767,"сиј":608,"сли":753,"сле":1796,"сла":744,"ско":2753,"сме":854,"слу":574,"сло":1010,"то ":13336,"сна":721,"сни":929,"соб":473,"сов":600,"сод":296,"соз":452,"сок":602,"сно":1836,"спе":625,"спа":419,"сот":388,"сос":874,"сон":314,"соч":283,"соц":292,"сре":870,"спо":1324,"спр":420,"роц":373,"рот":1919,"роф":371,"роп":944,"рос":834,"ст ":3322,"рој":2600,"рст":683,"рти":754,"рск":2698,"руа":354,"рув":1033,"руг":1560,"руж":292,"руп":1220,"рус":732,"рум":335,"рци":855,"рхе":302,"рхи":519,"АД ":331,"рши":562,"рчк":744,"та ":23306,"рад":4624,"раз":2373,"раа":405,"раб":909,"рав":1584,"рам":1207,"ран":5737,"рап":368,"раи":514,"рак":1026,"рал":1971,"раф":696,"рач":346,"рац":462,"рас":1293,"рат":2616,"рби":405,"рањ":570,"рај":718,"рва":669,"рди":936,"реб":752,"рев":1305,"рег":1001,"ред":3985,"реа":398,"рет":3047,"рес":955,"реп":457,"си ":856,"рен":1613,"рем":1904,"рел":504,"рек":1638,"рез":448,"реж":432,"ржа":1664,"реч":605,"реш":461,"се ":11859,"рво":342,"рве":449,"рви":554,"рга":1142,"рда":316,"пје":616,"рио":1095,"рип":420,"рим":878,"рин":740,"рик":1477,"рил":774,"рии":353,"рич":442,"рит":2109,"рир":852,"рис":2774,"рка":561,"ркв":703,"риј":3270,"рза":354,"реј":340,"риб":299,"рив":398,"рие":488,"рид":648,"риз":552,"рзи":395,"рни":1032,"рна":1303,"рок":621,"рол":306,"ром":966,"рон":752,"роз":340,"рои":701,"ров":2202,"рог":438,"род":3344,"роб":326,"рно":847,"рми":912,"рма":1504,"со ":5615,"пра":1665,"прв":1141,"при":3441,"пре":5957,"про":4233,"поп":440,"пор":1847,"пос":2237,"пот":1782,"поч":860,"пој":398,"рт ":589,"пое":340,"под":1948,"пог":377,"пов":1628,"пон":487,"пом":1198,"пол":2038,"пок":698,"поз":1786,"пуб":1193,"пст":289,"пск":1161,"пшт":1492,"са 
":460,"вар":956,"ват":2229,"вач":868,"вај":507,"вањ":3265,"ваа":2386,"ван":1597,"вал":1237,"га ":1478,"бук":590,"вто":1107,"вск":1077,"вст":329,"врд":656,"вре":1804,"ври":1040,"врз":737,"вро":757,"гу ":683,"вру":340,"врш":792,"вој":2262,"вол":822,"вои":368,"воз":430,"вое":499,"вод":1487,"вот":2101,"вор":1284,"вос":388,"вни":1914,"вна":1234,"вно":1024,"вле":285,"вла":692,"го ":2513,"вкл":312,"виј":388,"вич":290,"бје":284,"виз":445,"виж":381,"вил":734,"вин":1260,"вио":811,"вис":1212,"вит":1405,"вид":1204,"вие":441,"веќ":667,"вес":376,"вет":2185,"вер":2282,"вен":3438,"ги ":1837,"вел":447,"век":1300,"вез":364,"вее":283,"вед":808,"ва ":8865,"бан":796,"бал":536,"ачи":1031,"ачк":640,"ачу":385,"ашк":290,"афс":366,"афи":298,"ача":344,"аче":347,"аци":3529,"апс":423,"апр":517,"аоѓ":2290,"апа":1524,"апо":665,"апи":499,"арх":678,"арс":1354,"арт":1039,"ару":298,"аса":450,"аре":1123,"ард":653,"ара":1867,"арн":612,"аро":1814,"ари":3185,"арк":629,"аст":2995,"ата":20566,"аси":680,"асе":1117,"асл":312,"аск":436,"асп":410,"асо":310,"асн":388,"ату":514,"ате":2125,"ати":3730,"атк":340,"атн":495,"атп":334,"ато":1865,"атс":493,"атр":330,"бол":467,"бод":337,"бор":1569,"бот":1139,"аќа":380,"бро":2687,"бри":330,"бре":488,"бра":1074,"ајќ":358,"бич":355,"аја":393,"ајг":681,"ајд":338,"биј":384,"ајм":316,"ајн":658,"ајп":372,"ајс":402,"ајч":379,"ања":532,"бла":981,"бли":2035,"ање":3967,"во ":22199,"беш":291,"ви ":3384,"бен":594,"бер":412,"без":326,"бед":404,"бел":545,"бир":443,"бит":461,"бил":3972,"бид":516,"ве ":532,"бат":375,"бар":494,"аѓа":856,"дба":505,"дан":444,"дар":782,"дат":1402,"дви":611,"два":530,"две":568,"ед ":1436,"дал":491,"дад":551,"дав":666,"ев ":387,"дек":1195,"дем":341,"дел":3411,"ден":5845,"дер":607,"деј":478,"дес":441,"дво":453,"ез ":342,"дст":314,"дск":660,"дро":328,"дру":1086,"држ":2045,"дре":612,"дра":666,"ет ":2199,"дув":889,"ец ":658,"ен ":10275,"диш":466,"диц":456,"ем ":1181,"диј":826,"дин":5382,"ел ":3388,"дис":491,"дит":597,"ек ":1095,"доб":755,"дов":988,"дод":356,"ес ":859,"дос":369,"дол":887,"док":331,"дон":4125,"дом":347,"доц":331,"дот":1786,"дна":2163,"дни":2521,"дне":428,"ер ":2117,"дно":2480,"дми":387,"al ":294,"да ":4123,"гал":357,"гаш":492,"гат":448,"ган":1283,"гар":1072,"де ":1153,"and":335,"an ":327,"вув":2038,"гол":2184,"гот":553,"гор":505,"гов":1806,"год":3338,"гру":1162,"грч":711,"гра":5732,"гри":662,"гре":320,"гус":349,"ген":767,"гер":583,"ди ":1894,"гео":440,"гит":546,"гио":772,"гиј":928,"ati":284,"гле":564,"гла":1406,"до ":1648,"гли":1163,"жан":284,"жат":386,"жав":1162,"за ":5828,"жит":423,"еја":386,"ејз":450,"ејс":859,"жив":1211,"жењ":316,"еѓу":1271,"жел":288,"зи ":329,"жен":928,"жно":527,"жни":489,"еќе":715,"жна":492,"ење":1370,"ς ":427,"жув":901,"еј ":342,"ежи":423,"еду":826,"жи ":501,"еза":382,"езн":323,"езд":396,"езе":477,"ези":298,"ева":629,"еви":727,"еве":1630,"еба":481,"ебе":337,"его":1463,"еда":979,"еде":2597,"еди":2433,"едо":4194,"едн":3441,"ево":1021,"же 
":578,"евр":732,"ега":454,"еги":839,"ент":3074,"ену":344,"енс":1151,"енц":490,"енк":321,"ени":4714,"емј":955,"ено":2370,"ена":3300,"ене":1764,"енд":500,"еол":430,"еор":360,"еог":380,"епт":359,"епу":1119,"епо":307,"ерс":707,"ерм":1219,"ерн":1475,"еро":1509,"ери":4408,"ерз":406,"ерк":320,"ере":664,"ера":2000,"еке":284,"еки":306,"екл":312,"еко":2603,"ект":1806,"екс":1174,"еку":1063,"ека":1814,"елн":396,"ели":2173,"елу":301,"елс":372,"ело":2613,"еле":3177,"ела":1614,"елб":498,"емо":621,"еми":2065,"еме":2170,"ема":1276,"емв":738,"еци":352,"ечк":442,"ечн":297,"ече":723,"ешт":455,"ешн":741,"ешк":443,"еше":584,"есе":1065,"еси":417,"еск":430,"есн":1000,"есо":341,"есу":502,"ест":3547,"ета":1622,"ети":1694,"ете":797,"етк":321,"етр":679,"ето":5033,"етн":1147,"етс":2563,"етх":473,"иве":1425,"иви":638,"ива":1008,"К ":292,"иен":500,"иер":283,"иет":479,"иже":402,"идо":449,"игр":971,"ида":352,"иди":356,"иде":991,"иво":1058,"ивн":1457,"ига":533,"иги":461,"ико":1235,"ики":550,"ика":4788,"иит":454,"изр":316,"изм":698,"изи":1398,"изд":435,"изг":666,"иза":1563,"изв":1007,"изб":296,"иј ":327,"ион":2538,"иот":8785,"инц":810,"иод":616,"Д ":485,"ине":862,"ини":3140,"ино":890,"инс":2166,"инт":558,"ину":385,"ина":9820,"инд":577,"инг":507,"ими":494,"име":2091,"имс":590,"имо":452,"имп":928,"има":2023,"или":5248,"иле":1067,"илм":478,"ило":1256,"ику":390,"ила":1963,"иси":462,"исе":361,"иса":487,"ист":8007,"исо":810,"исн":402,"исл":520,"иск":3494,"ити":1229,"ите":15877,"ита":1364,"иту":775,"ито":1415,"ипа":375,"ипи":348,"ира":3770,"ире":311,"ири":916,"иро":1138,"ица":1798,"ици":3395,"ифи":395,"ифо":430,"ичи":380,"ичк":3274,"ичн":2026,"ича":867,"иче":543,"ишу":288,"ишт":844,"иша":368,"ка ":12158,"ив ":663,"зав":514,"зае":409,"збу":314,"збо":1117,"ид ":907,"зви":372,"зве":330,"зац":679,"зат":324,"зар":363,"зап":1380,"зан":720,"зам":529,"зби":290,"згр":557,"зда":948,"зво":628,"ие ":1909,"зер":453,"ии ":1591,"зен":322,"зем":1002,"из ":417,"зик":2343,"зир":670,"ил ":3300,"зин":797,"ик ":3740,"ин ":1454,"им ":736,"зиј":706,"зиц":855,"зич":514,"зит":574,"змо":513,"зли":625,"зна":3413,"зни":663,"ир ":506,"ис ":533,"зон":283,"ит ":464,"зра":561,"зув":311,"хе":1090,"хи":1138,"хо":1598,"хр":886,"ха":892,"ци":13250,"цн":316,"цр":866,"цу":450,"ца":2399,"це":3069,"чл":414,"чн":4154,"чо":686,"чи":3265,"чк":5243,"чу":1362,"че":4695,"ча":1926,"шп":336,"шн":1414,"шк":1724,"ши":1346,"шт":6867,"шу":599,"ше":1638,"ша":1244,"ск":26715,"см":1616,"сл":4921,"со":11740,"сн":3623,"ср":1256,"сп":3213,"св":2617,"се":20196,"си":5941,"рш":1040,"са":3377,"рс":3892,"рт":2549,"ру":6871,"рх":1018,"рц":1077,"рч":928,"тн":2841,"тл":534,"тк":1435,"тс":3837,"тр":9143,"тп":601,"то":28365,"те":29796,"тв":4680,"ти":19086,"та":36712,"су":2000,"ст":32059,"сц":293,"ур":4376,"уп":2469,"ут":1165,"ус":3800,"ум":2729,"ул":3220,"ун":2432,"уз":1241,"ук":2337,"уд":1768,"уг":3250,"уж":1378,"уе":298,"уа":1187,"уб":2169,"ув":11713,"тт":407,"ту":4071,"тх":499,"фу":697,"фс":456,"фр":997,"фо":2242,"фи":3122,"фе":1624,"уѓ":407,"фа":1468,"уч":2178,"уш":1171,"уц":759,"џа":360,"is ":354,"ion":475,"јќ":588,"ње":5394,"ња":1280,"ќе":1037,"ќи":734,"ќа":686,"јк":379,"ји":475,"јп":396,"јо":1613,"јн":1588,"јм":402,"ју":2500,"јс":1508,"јч":415,"ја":31927,"јв":456,"јг":710,"јд":458,"је":1464,"јз":570,"ѓа":3178,"ѕв":310,"ѓу":1282,"ѓе":418,"Ју":674,"Ја":560," Ma":317,"he ":401,"а ":151197,"Ис":747,"Им":497,"Ин":571,"к ":7206,"Из":327,"Ле":628,"Ли":491,"Ла":558,"Ку":463,"Кл":294,"м ":4019,"Ко":1594,"Кр":735,"Ки":516,"Ка":1846,"л ":10222,"На":1551,"Не":719,"Ни":442,"Мо":930,"о 
":68362,"Ма":3832,"Ми":784,"Ме":823,"Ло":317,"н ":18349,"Лу":312,"Па":998,"Пе":965,"Пи":295,"По":1730,"с ":3798,"Оп":470,"р ":8070,"Ос":606,"Ов":1078,"п ":907,"Но":521,"в ":3431,"Ам":433,"Ан":848,"Ал":1075,"Ав":472,"Ба":928,"Ар":692,"б ":542,"АД":338,"д ":20357,"Во":1671,"Ве":686,"Ви":848,"Га":384,"г ":2028,"Бо":738,"Бр":705,"Бе":879,"Би":685,"Ва":687,"Бу":778,"Ди":461,"Де":856,"Др":341,"До":567,"Ег":326,"Ев":564,"Ге":525,"Гр":1204,"Гл":343,"Го":804,"е ":62888,"Да":437,"и ":77640,"За":893,"Зе":393,"з ":1486,"Ер":317,"ia ":314," km":286," o ":369,"Ст":846,"Та":770,"Ти":438,"Те":690,"То":1430,"ф ":405,"Тр":452,"Ту":524,"х ":407,"Пр":1600,"СА":368,"Ра":521,"Ре":1553,"Ри":680,"Ро":593,"т ":35867,"Ру":528,"Са":911,"Св":650,"Си":589,"Се":1780,"Сл":373,"Ск":807,"Ср":474,"Сп":404,"у ":4465,"Со":1077,"Це":313,"ш ":868,"Цр":465,"Че":319,"ц ":857,"Фр":445,"Фи":522,"Ха":496,"Хр":339,"ч ":920,"Хо":298,"Хе":392,"мб":886,"ма":12609,"мв":986,"ме":13316,"лј":325,"ми":7979,"лм":650,"лн":4295,"ло":11112,"лс":1887,"лт":1426,"лу":4631,"лб":1321,"ла":14718,"лж":375,"ле":15180,"лд":283,"лг":660,"лк":1166,"ли":21964,"кн":889,"кл":2271,"кр":4199,"кс":1892,"ко":29166,"кт":3820,"ку":4201,"кц":796,"ка":27714,"ки":22033,"кв":1916,"ке":4892,"иј":21371,"ињ":534,"иш":2058,"ио":12982,"ип":1917,"им":7924,"ин":21871,"ик":11472,"ил":14255,"ии":2086,"иц":5454,"ич":7399,"иф":1198,"их":496,"ит":22205,"иу":361,"ир":7402,"ис":15994,"ри":23710,"пј":828,"рк":2169,"рл":526,"рм":2952,"рн":3559,"ро":18691,"рп":592,"ра":32025,"рб":1278,"рв":2697,"рг":2004,"рд":2000,"ре":21468,"рж":2185,"рз":1370,"пш":1508,"пр":16649,"пт":674,"пс":1836,"пу":1996,"ој":14117,"пи":3835,"пн":348,"по":20419,"пл":2009,"па":7611,"пе":4276,"оѓ":2317,"ош":1554,"оч":2768,"оц":1836,"ос":13243,"ор":16926,"оп":6149,"оо":627,"ох":400,"оф":1458,"оу":339,"от":31750,"ок":6835,"ол":13527,"ом":7054,"он":15080,"ож":1857,"оз":4638,"ои":4773,"ов":16820,"ог":5746,"од":29777,"ое":2635,"оа":1652,"об":5296,"нц":2585,"нч":293,"нт":6041,"нс":10729,"нф":376,"ну":2258,"но":20747,"нл":307,"мј":1034,"нк":1682,"нз":408,"ни":34239,"не":10732,"нг":2911,"нд":3033,"нб":290,"на":77634,"му":1844,"мс":1507,"мр":516,"мп":2215,"мо":6592,"мн":1816,"ге":2417,"ги":5000,"гн":490,"го":11946,"гл":3601,"гр":9052,"гу":1829,"дг":373,"дв":2181,"дб":788,"да":9183,"вг":415,"ве":13770,"ви":13074,"бј":460,"вк":750,"вл":1274,"вн":4219,"во":32617,"вр":6858,"вс":1589,"ву":2258,"вт":1215,"га":5709,"би":7825,"бе":3314,"аѓ":883,"бр":4813,"бн":514,"аќ":462,"бо":4771,"бл":3356,"ањ":4559,"ај":7017,"бу":1986,"ва":22923,"ад":10986,"ае":1609,"аж":1096,"аз":7096,"аа":4444,"аб":2331,"ав":11512,"аг":2167,"ам":6375,"ан":27243,"ао":2597,"ап":4252,"аи":936,"ак":12466,"ал":15259,"ах":737,"аф":1127,"ач":3591,"ац":3769,"ас":7790,"ар":17103,"ау":1031,"ат":37636,"ба":4355,"аш":1946,"зр":646,"зу":791,"зи":7831,"зо":1382,"зн":4381,"зм":998,"зл":1008,"ив":6555,"иг":2802,"иа":334,"иб":811,"иж":639,"из":7830,"ид":3802,"ие":3605,"ј ":8902,"жу":939,"еј":2932,"жи":2889,"ењ":1526,"жн":1527,"еќ":923,"за":12607,"зб":1994,"зв":1627,"зг":962,"зд":1570,"зе":2250,"еф":889,"ет":17396,"ес":8709,"ер":16613,"еп":2846,"ео":1987,"ен":28920,"ем":9666,"ел":15327,"ек":11126,"еи":513,"ез":2727,"еж":1162,"ее":663,"еѓ":1275,"же":2514,"жа":2420,"еч":1979,"еш":2657,"ех":563,"ец":1329,"дс":1024,"др":5219,"ду":2009,"дн":7627,"дм":697,"до":12951,"ди":12328,"дл":415,"де":15216,"еб":1885,"ев":5864,"ег":3639,"ед":16863,"еа":1309," th":283,"er ":422,"es ":384,"ѓу ":904,"ѓа 
":2452,"уци":681,"учу":537,"учн":321,"учи":444,"уче":651,"ушт":599,"уѓе":382,"фев":335,"фер":358,"фиј":319,"фин":438,"фил":788,"фик":307,"фиц":459,"фун":322,"фра":515,"фот":446,"фор":1281,"фск":446,"ца ":1645,"ци ":2599,"хео":283,"хем":340,"хри":386,"ход":569,"хит":310,"сто":6090,"стр":4031,"ств":3177,"сте":2303,"сти":4708,"ста":7801,"сув":590,"сту":413,"тав":2466,"таа":321,"так":837,"тал":1456,"тан":2640,"тат":1512,"тап":396,"уг ":454,"тар":2183,"тац":321,"тву":302,"твр":556,"тво":2438,"тве":696,"тва":548,"тем":1441,"тел":2805,"тео":301,"тен":1940,"тер":3359,"тет":1279,"тес":476,"тек":1487,"тив":1778,"тка":575,"тиј":1079,"ум ":532,"тич":2394,"тин":2798,"тик":921,"тил":553,"тир":892,"тис":499,"тио":463,"тип":307,"тит":1897,"тко":341,"тки":324,"тно":744,"ток":893,"тол":870,"тои":602,"тов":635,"тоа":832,"тни":1408,"тна":541,"тпр":359,"тре":1295,"тра":3106,"три":1787,"тор":3285,"тот":2026,"том":830,"тон":944,"ус ":503,"топ":356,"тоц":289,"точ":1182,"тој":1245,"тст":1499,"тро":1844,"тру":758,"тсе":374,"тск":1871,"тхо":477,"туд":444,"тув":539,"тур":2009,"ува":11421,"уго":905,"уги":667,"уга":1084,"уда":547,"уар":643,"убл":1212,"узи":649,"ужн":805,"уди":328,"удб":322,"уме":785,"уми":352,"улт":955,"ули":697,"ула":701,"ука":732,"укв":382,"упо":434,"ура":1158,"ури":529,"уре":411,"упа":1105,"унк":323,"уна":414,"уни":908,"уст":803,"уте":298,"урц":319,"урс":618,"урн":333,"уск":1016,"уси":423,"шки":805,"шко":400,"шни":549,"шно":321,"шна":496,"шка":496,"шир":378,"шин":495,"шен":456,"шан":341,"што":3204,"шув":475,"ште":1404,"шти":1481,"шта":515,"on ":498,"че ":342,"цен":1177,"чи ":884,"цел":738,"циј":5710,"ции":969,"цио":1351,"цир":340,"цит":991,"цар":343,"цат":333,"цус":389,"црк":515,"чењ":285,"чев":319,"чен":1330,"чес":1120,"чет":714,"чле":406,"чко":656,"чки":2688,"чка":1890,"чин":910,"чит":363,"ше ":558,"чар":740,"чна":1312,"чов":406,"чни":1436,"чно":1236,"чув":1137,"ќи ":636,"us ":554,"tio":345,"ter":323,"the":294,"ѕве":297,"ѓаа":300," ар":1051," ас":329," ат":297," ба":813," ав":1128," ад":312," аз":335," ал":614," ак":630," ан":1462," ам":537," ап":418," бу":674," ва":516," би":4668," бе":1085," бр":3187," бо":1018," бл":584," вт":300," ви":1958," ве":1999," во":21906," вр":2219," вл":994," вк":456," дв":1419," да":2966," го":6416," гл":1761," гр":5370," ге":1159," ги":1230," ев":363," ед":3169," до":4472," др":2545," де":5656," ди":1319," же":492," ел":592," ек":551," зе":866," за":9018," зб":1071," жи":1512," зн":1014," иг":813," из":2798," ил":3716," ин":1410," им":3303," ит":411," ис":2928," ка":5502," ки":580," кр":1663," ко":13222," кн":382," кл":860," ку":901," ла":991," ли":1394," ле":768," лу":423," ло":957," ме":2789," ми":1057," ма":4441," мо":2407," мн":729," му":1023," ни":1307," не":4874," на":43543," но":1774," ол":358," ок":1879," оз":315," ов":1233," од":15767," об":2355," оф":386,"њет":1203," от":647," ор":1315," ос":2370," оп":2029," по":15733," пл":1137," пи":929," пе":1794," па":2332," Ре":1553," Ра":517," Ро":593," Ри":680," Пр":1592,"ќа ":348," СА":357," Пе":961," Па":996," По":1728," Пи":295," Ос":605," Оп":468," Те":689," Ти":438," То":1430," Тр":451," Ст":843," Та":770," Св":648," Си":588," Се":1778," Сл":373," Ск":806," Сп":403," Ср":473," Со":1077," Ру":528," Са":911," Фр":445," Фи":520,"ќе ":750," Ту":523," Цр":465," Це":313," Хр":338," Хо":298," Хе":392," Ха":496," Че":319," Ба":927," Ар":690," Ан":848," Ам":433," Ал":1073," Ав":472," Ва":686," Бу":778," Бо":736," Бр":703," Бе":877," Би":680," а ":1703,"јќи":588," Ег":326," Ев":563," Ди":459," Де":856," Др":341," 
До":561," Ер":317," Га":384," Ве":685," Ви":845," Во":1669," Да":437," Ге":523," Гл":341," Го":800," е ":17047," Гр":1204," Ис":742," Ин":570," Им":496," Ки":514," Ка":1843," и ":17952,"ње ":4162," За":893," Зе":393," Из":327," Мо":928," На":1547," Не":717," Ни":438," Но":521," Ов":1078," Кл":294," Ко":1590," Кр":732," Ку":462," Ла":557," Ле":628," Ли":490," Ло":316," Лу":312," Ма":3825," Ме":823," Ми":780," Ја":559," Ју":674,"јна":646,"јни":502,"јзи":419,"јек":289,"јго":612,"јал":1511,"јан":1997,"јат":4096,"јав":629,"јаз":2727,"јад":290,"ња ":993,"јче":337,"југ":496,"јуж":519,"јск":745,"јст":589,"јот":1205,"је ":862,"ја ":19199},"n_words":[2810075,3234074,2357835],"name":"mk"}
\ No newline at end of file
+++ /dev/null
-{"freq":{"D":213,"E":274,"F":167,"G":214,"A":497,"B":279,"C":430,"L":186,"M":384,"N":219,"O":182,"H":218,"I":415,"K":134,"T":348,"P":401,"S":493,"ിത ":164,"R":240,"f":703,"g":1106,"d":1581,"e":5009,"b":935,"c":1838,"a":5004,"n":3585,"o":3390,"l":2378,"m":1762,"k":455,"h":2313,"i":4303,"w":903,"v":415,"u":1770,"t":4207,"s":2723,"r":3549,"p":1477,"y":840,"x":345,"ില ":173,"ിയ ":1562," m":228," n":149," o":399," h":314," i":345," d":151," e":167," f":185," a":486," b":205," c":370," t":707," w":308," p":446," s":377," r":294," H":179," I":298," N":154," O":131," L":160," M":333," B":247," C":358," A":386," F":135," G":193," D":163," E":193," S":374," R":193," P":342," T":265,"ും ":8335,"ا":186,"ുക ":185,"ീയ ":378,"ാൻ ":875,"ാർ ":676,"ാൾ ":197,"ാൽ ":543,"ാഗമ":334,"ാഗത":387,"ാഗങ":167,"ാക്":1414,"ാകു":356,"ാങ്":227,"ാജാ":164,"ാജ്":625,"ാടി":272,"ാടക":206,"ा":135,"In":142,"ാട്":1178,"ാടു":187,"ിംഗ":155,"ാണു":455,"Th":138,"ാണി":500,"ാണപ":194,"ാതി":141,"ാതന":248,"ാണ്":9070,"ാത്":940,"ുഴ ":240,"ാനം":570,"ാദി":179,"ാനത":448,"ാധാ":281,"ാധി":232,"ാനങ":199,"ാദ്":187,"ാനു":295,"ാനി":583,"ാനാ":214,"ാനപ":155,"ാനമ":615,"ാപന":156,"ാന്":803,"ാപി":311,"ാപ്":231,"ാമം":286,"b ":294,"a ":674,"ാമത":404,"ാമപ":768,"ാമമ":268,"ാമാ":133,"ാമി":294,"ായക":210,"ിൻ ":265,"ാമ്":413,"ാരം":468,"ായത":1564,"ായി":3609,"ായാ":181,"ായ്":163,"ാരമ":247,"ാരന":265,"ാരത":490,"ാരണ":361,"ായു":463,"ാരാ":454,"ാരി":539,"ാലം":169,"ാരു":187,"ിൽ ":7420,"ാര്":247,"ാറു":256,"ാറി":215,"ാളം":134,"ാലക":339,"ാലയ":259,"ാറ്":497,"ാലത":252,"i ":213,"ാവ്":303,"ാവി":316,"ാവു":302,"ാവാ":229,"ാഴ്":134,"ിക്":6093,"he":539,"ha":244,"gh":231,"ികാ":247,"ികമ":181,"ികള":716,"g ":236,"ea":262,"ec":168,"ed":311,"de":246,"ാളി":252,"di":309,"h ":190,"ാലി":392,"ാലാ":156,"el":240,"ാളത":153,"ാലൂ":542,"ാലു":158,"ികം":150,"en":424,"em":159,"et":186,"es":424,"er":794,"ca":205,"e ":1341,"ിച്":3119,"ാഹി":294,"ിചെ":253,"da":131,"ിങ്":403,"ാസ്":1047,"f ":272,"ct":158,"co":290,"ാസി":297,"ci":170,"ch":206,"ce":229,"c ":161,"ാസമ":136,"ാഷ്":369,"ാഷയ":224,"d ":539,"at":600,"as":248,"ar":530,"al":614,"ാശി":190,"am":238,"an":747,"ac":198,"ad":149,"ാസം":149,"ികൾ":430,"nt":385,"ിഞ്":388,"ns":149,"no":135,"of":259,"om":259,"on":664,"ു് ":338,"ol":194,"ou":163,"op":135,"or":514,"r ":518,"pe":169,"ിടക":167,"lo":176,"ll":179,"o ":216,"ma":277,"mb":320,"me":225,"mi":137,"p ":313,"na":310,"nc":163,"nd":424,"ne":310,"ng":305,"ni":239,"്":186908,"ൊ":2828,"ോ":12821,"ൈ":2243,"െ":27110,"േ":13547,"ൂ":8137,"ൃ":1848,"ീ":8295,"ു":58406,"ൗ":469,"m ":405,"ൽ":11771,"li":349,"ർ":12870,"ൾ":4761,"le":352,"ൻ":6054,"ൺ":871,"la":410,"n ":864,"ഈ":1751,"ഉ":3287,"എ":6292,"ഏ":1399,"ht":515,"hu":298,"ം":26623,"hi":201,"ആ":4557,"ഇ":5473,"അ":8089,"id":145,"ച":18270,"ic":457,"ങ":13504,"ia":297,"ഘ":772,"ട":30378,"ഞ":4086,"ig":259,"ജ":7532,"ie":161,"ഓ":760,"ഒ":5000,"ഐ":333,"ഗ":10132,"ഖ":1936,"ക":65976,"ഔ":217,"ന":73400,"പ":39149,"is":392,"ഫ":2123,"it":348,"ബ":5674,"ഭ":5198,"മ":32874,"ിടെ":160,"യ":48687,"ഠ":364,"ഡ":3439,"ണ":22285,"il":179,"ത":67027,"ഥ":4613,"ിട്":804,"in":824,"ദ":11612,"io":368,"ധ":5371,"ഹ":4850,"സ":25595,"ി":86416,"ാ":63951,"റ":22910,"ര":47335,"ള":18695,"ല":30821,"വ":29294,"ഴ":3384,"l ":444,"ഷ":7585,"ശ":8287,"ww":338,"ിധാ":325,"y ":432,"ിദ്":640,"ിതി":938,"ിതാ":203,"ിതമ":146,"ve":190,"x ":258,"ul":150,"ur":239,"us":255,"um":390,"tt":296,"ിത്":1980,"w ":236,"to":263,"tr":153,"tp":241,"te":463,"ti":568,"th":747,"ta":278,"ss":146,"st":355,"se":201,"si":227,"ിപ്":1405,"rt":139,"ry":142,"ിനി":378,"ിനാ":453,"ro":259,"ri":601,"ിനെ":479,"re":445,"ിനു":1055,"ra":513,"t ":716,"ിന്":5173,"s 
":1099,"px":216,"ിനോ":212,"ിമാ":267,"ിയന":152,"ിയപ":914,"ിയമ":337,"ിയയ":135,"ിയത":291,"ിയി":1237,"ിയാ":1791,"ിയോ":318,"ിയു":1173,"ിയെ":180,"ിയേ":145,"ിയം":136,"ിഭാ":320,"ിലവ":146,"ിലാ":907,"ിലി":195,"ിലു":1762,"ിലൂ":238,"ിലെ":4614,"ിലേ":330,"ിലൊ":225,"ിലോ":400,"ില്":2038,"ിഴക":306,"ിളി":317,"ീകര":447,"ിറങ":160,"ിരാ":141,"ിരി":844,"ിരു":2143,"ിയൻ":295,"ിറ്":420," ല":2034," ര":2887," റ":942," ശ":1879," ഷ":238," വ":10435," സ":10862," ഹ":1263," ഡ":822," ധ":248," ദ":2030," ത":5869," ഫ":858," പ":15081," ന":8054," യ":1000," മ":9063," ഭ":2376," ബ":2531," ൽ":544,"ശ്ര":558,"ശ്യ":219,"ശ്ശ":386,"ശ്വ":377,"ശ്ച":178,"ശേഷ":320,"ഷത്":575,"ശൂർ":174," എ":6248," ഏ":1397," ഈ":1746," ഉ":3278," ആ":4494," ഇ":5444," അ":8029," ട":631," ജ":4225," ച":4988," ഘ":145," ഗ":2980," ഖ":305," ക":14030," ഔ":215," ഓ":757," ഒ":4983," ഐ":325,"ശേര":218,"സത്":172,"വർഷ":372,"ഷിണ":161,"സഞ്":150,"ഷിക":304,"വർഗ":216,"വർത":599,"ഷിയ":170,"സങ്":186,"സരി":133,"സമ്":174,"സമാ":200,"സമു":201,"സഭയ":152,"ഷ്ണ":218,"സഭാ":174,"ഷ്ട":557,"ഷ്ഠ":143,"ഷ്യ":741,"സന്":155,"ഷേത":398,"ഹത്":297,"സൂച":143,"സിസ":131,"സിയ":151,"സില":142,"സിന":374,"സിപ":159,"സാധ":303,"സാന":195,"സാമ":424,"സിക":356,"സാഹ":254,"സിദ":345,"സസ്":334,"ാം ":756,"സോഫ":142,"സ്യ":395,"സ്റ":846,"സ്ല":281,"സ്ഥ":2863,"സ്പ":221,"സ്ത":2267,"സ്ക":623,"സ്സ":397,"സ്വ":855,"ഹിത":291,"ഹാര":204,"ഹായ":132,"ഹിന":258,"ാന ":514,"ഹ്മ":132,"ിക ":723,"ാള ":284,"സർക":222,"ാല ":148,"ായ ":2610,"െക്":658,"െട്":1283,"െടു":2359,"െങ്":500,"േക്":437,"െള്":180,"േഖല":167,"െറ്":191,"െല്":138," ഈ ":1593,"െത്":201,"െന്":699,"െബ്":146,"െപ്":220,"െറി":225,"െറു":216,"െയ്":1600,"െയു":744,"െയാ":471,"െയി":161,"േശം":258,"േശത":140,"േഷം":140,"േശങ":199,"ൈക്":133,"േശ്":174,"േശി":194,"േശീ":282,"േഷ്":132,"േഹം":278,"േഹത":162,"േഷൻ":136,"േത്":452,"േതാ":159,"േണ്":180,"േന്":408,"േയു":430,"േരള":1271,"േരി":960,"േരു":255,"േറ്":263,"ീവി":408,"ുക്":824,"ുകൊ":132,"ുകി":165,"ുകാ":227,"ുകള":1051,"ുകയ":410,"ീറ്":411,"ീരത":159,"ീതി":299,"ുംബ":279,"ിസ്":1202,"ിശ്":421,"ിശേ":137,"ിവർ":154,"ിഷ്":341,"ിവയ":235,"ീക്":294,"ിഴ്":206,"ിവര":215,"ിവസ":160,"ിവാ":178,"ിവി":329,"ുവി":444,"ുവാ":341,"ുവന":251,"ുവര":355,"ൂക്":727,"ുഷ്":354,"ുറ്":170,"ുളം":204,"ുറി":322,"ുറത":245,"ുരു":345,"ുരാ":293,"ുള്":2528,"ുമ്":328,"ൂർ ":1389,"ുറം":199,"ുമാ":1413,"ുരം":260,"ുമു":219,"ുന്":10068,"ുനി":282,"ുപ്":479,"ുതു":171,"ുതി":335,"ുണ്":1051,"ുത്":1223,"ുതൽ":371,"ുദ്":545,"ുടെ":2766,"ുടേ":141,"ുട്":263,"ുടു":255,"ംഗീ":301,"ംഗ്":542,"ാർക":154,"ുടങ":281,"ാർത":251,"ാർഡ":136,"ുകൾ":669,"ീഷ്":363,"ീസ്":201,"ംഖ്":225,"ംഗല":153," ബ്":675," ഭര":361," മണ":230," ഭൂ":280," മത":278," ഭാ":1188," മദ":178," മന":430," മല":1061," മര":285," മറ":395," മഹ":339," മൂ":522," മു":1582," മീ":223," മി":342," മാ":1333," മോ":137," മേ":380," മെ":288," പൊ":392," പോ":463," പ്":4372," ബന":199," ഫ്":175," ബി":227," ബാ":363," ബു":137," ബോ":172," പക":257," ന്":326," നോ":233," പദ":316," പന":151," പണ":151," പത":547," പഞ":607," പട":612," പഴ":174," പല":234," പറ":573," പര":747," പെ":480," പേ":780," പൂ":332," പു":1278," പാ":1107," പി":660," നൽ":229," നക":244," നഗ":396," ദേ":508," ദ്":241," നദ":211," നട":599," നവ":174," നൂ":265," നീ":290," നെ":227," നേ":376," നാ":1197," നി":2688," തെ":531," ദക":157," ത്":183," ദി":368," തി":720," താ":982," തു":591," തീ":297," തൃ":268," തന":303," തമ":355," തര":169," തല":366," ഡി":290,"ൂറ്":264,"ംബത":191,"ൂരി":182,"ൂമി":187," ടെ":156,"ംബർ":289,"ൂണി":149," സർ":371," ഹൈ":162,"ൂന്":269," ഹാ":165," ഹി":383," സെ":318," സൂ":331," സു":414," സി":733," സാ":1185," സഹ":208," സസ":221," സ്":2807," സോ":248," സൈ":131," വർ":592," സഭ":212," സമ":812," ശേ":181," ശ്":451," ശാ":428," ശി":199," 
സം":1809," വൈ":304," വേ":548," വെ":601," വ്":771," വസ":228," വി":3041," വീ":201," വാ":826," വൃ":131," വന":287,"ുസ്":374," വല":423," വഴ":131," വള":462," വര":613," വയ":153," വക":168," വട":443,"ംവി":172," ലോ":543,"ൂട്":482," ലെ":167,"ിൽപ":165," ലി":203,"ൂടെ":257," ലാ":299,"ൂടു":168,"ൂടി":333," റോ":225," ലഭ":180,"ംസ്":701," രീ":168," രൂ":379," രാ":1161,"ിർമ":366," യു":294," രണ":452," യൂ":213," രച":157,"ിർത":148," ഉൾ":449,"ൃഷ്":193," എം":133,"ൃശ്":191," എന":4353," എട":138," ആധ":137," ആദ":541," ആന":214," ആയ":406," ആറ":135," ആര":244," ആല":221," ആവ":187," ആസ":220," ഇട":294," ഇത":1342," ഇദ":201," ഇന":1407," ഇല":190," ഇര":190," ഇവ":544," ഇസ":230," അർ":208," ഉദ":192," ഉണ":345," ഉത":324," ഉയ":159," ഉപ":1087," ഉള":175," ഇൻ":148," അം":186," അത":659," അണ":160," ആം":188," അട":472," അക":377," ആണ":724," ഇം":372," അവ":660," അസ":137," അറ":908," അല":394," ആക":232," അഭ":214," അമ":420," അയ":173," അര":179," അപ":189," അഥ":462," അദ":348," അധ":209," അന":672," ജന":950,"ൃതി":272," ചെ":1896," ചേ":382,"ൃത്":357," ചാ":241," ചി":892," ചു":295," ജോ":204," ജി":1596," ജീ":407," ജൂ":195," ജ്":148," ഗാ":220," കർ":226," ഗു":264," ഗ്":1743," ഗോ":177," ഗണ":178," ചര":212," ചല":414," ഓഫ":177," ക്":1197," കെ":231," കൈ":147," കേ":1543," കൊ":822," കോ":957," കാ":1855," കി":865," കീ":159," കു":1227," കൂ":693," കൃ":254," കവ":234," കര":347," കമ":372," കഴ":213," കള":233," കല":418," കന":144," കഥ":180," കട":337," കണ":816," ഏറ":601," എഴ":295," ഏക":259," എല":187," എറ":154," എസ":137," ഒന":341," ഒര":4105," ൽ ":540,"മി ":226,"ബർ ":418,"പ്ര":5359,"പ്യ":278,"പ്റ":162,"പ്പ":6916,"പോൾ":223,"രം ":1921,"ബത്":224,"ബന്":340,"ഫ്ര":256,"ഫ്റ":145,"പരമ":221,"പയോ":768,"പരി":493,"പരാ":131,"പറയ":460,"യം ":1215,"പള്":185,"പാട":326,"പാദ":161,"പാത":301,"പാല":480,"പിച":417,"നൽക":242,"പിക":595,"പിന":279,"പുക":146,"പാർ":143,"പുത":214,"പുര":764,"പുറ":547,"പുഴ":384,"പൂർ":310,"പെര":170,"പെട":3183,"പേര":707,"പൊത":227,"പോല":201,"രെ ":495,"രു ":4025,"ലം ":658,"ഭാഷ":528,"ഭാവ":201,"ര് ":145,"ഭിച":159,"ഭിന":146,"ഭാഗ":860,"ഭാര":363,"മങ്":199,"മന്":204,"മനു":281,"മപഞ":718,"മദ്":277,"മത്":656,"ഭൂമ":198,"മണ്":363,"ളം ":631,"മലപ":135,"മറ്":307,"മലയ":732,"റു ":169,"യർ ":285,"മരണ":133,"റി ":343,"ഭ്യ":155,"മമാ":310,"യൻ ":979,"രണ ":296,"ഭക്":141,"യി ":2287,"റം ":319,"യോ ":322,"യ് ":274,"യെ ":323,"രള ":241,"ബ്ര":460,"മൻ ":173,"ബ്ള":167,"ബ്ല":277,"ഭരണ":352,"രി ":963,"പി ":248,"ദ്ദ":733,"ധമാ":139,"ദ്വ":180,"ദ്യ":1112,"ദ്ര":848,"ദ്ധ":1337,"ദേഹ":453,"ദേശ":1461,"ദേവ":242,"ധീക":138,"ധിയ":166,"ധാര":358,"ധിക":430,"പ് ":643,"ധാന":997,"നടന":157,"നടത":237,"നത്":2604,"നതു":269,"നതി":539,"നതാ":252,"നങ്":656,"നഗര":403,"നക്":345,"ദത്":171,"ദിയ":235,"ദിവ":229,"ദിന":206,"ദായ":132,"ദിക":158,"പങ്":155,"പക്":254,"പകര":152,"മം ":430,"ബി ":170,"ന്ത":3077,"ന്ദ":1121,"ന്ഥ":210,"ന്ന":18547,"ന്ധ":586,"ന്റ":3777,"ന്മ":414,"ന്യ":476,"നോവ":149,"പനി":191,"പന്":169,"പതി":293,"പത്":765,"പട്":440,"പടി":267,"പഞ്":1398,"ബ് ":189,"നയി":167,"നറി":301,"നപ്":274,"നമാ":873,"ധ്യ":502,"നനം":232,"നദി":238,"നന്":309,"നൂർ":159,"നേത":186,"നേട":136,"നെയ":183,"നാൽ":247,"നുമ":332,"നുള":311,"നുസ":192,"നുഷ":303,"നുവ":212,"നിസ":191,"നും":1607,"നുക":143,"നിർ":607,"നിൽ":228,"നൂറ":277,"നിവ":495,"നിയ":759,"നിര":355,"നിറ":179,"നില":691,"നിമ":172,"നിന":1060,"ദർശ":140,"നിച":204,"നിക":437,"ഫ് ":323,"നാല":249,"നാമ":420,"നായ":1102,"നാണ":1039,"നാട":633,"വർ ":251,"ഹം ":356,"വകാ":131,"വക്":151,"വടക":415,"ഴിക":261,"ഷ് ":469,"ളിക":406,"ളായ":311,"ളാണ":384,"വംശ":155,"ളുട":947,"ളിയ":253,"ളില":1194,"ളും":648,"ളിൽ":1372,"ളെയ":186,"ളോക":168,"ള്ള":3418,"വൻ ":161,"ൾപ്":365,"ല്പ":290,"ല്ല":3043,"ല്യ":139,"ളരെ":204,"സം 
":495,"ഴക്":419,"ലായ":359,"ലാമ":187,"ലിക":373,"വ് ":420,"ലിന":217,"ലിപ":166,"ലാണ":736,"ലാത":169,"ലാം":153,"ലൂട":241,"ലുള":500,"ലൂക":546,"ലീഷ":334,"ലുക":166,"ലിയ":569,"ലും":1211,"ലേയ":136,"ലേക":222,"ലേജ":143,"ലെയ":139,"ലെങ":175,"ളത്":1319,"ലോമ":256,"ലോക":643,"ലൊന":171,"ൾക്":604,"ൽപ്":210,"ശസ്":398,"ശിയ":152,"സംഘ":223,"സംഖ":216,"സംഗ":292,"സംബ":143,"സംഭ":157,"ശീയ":288,"സംവ":186,"സംസ":658,"ശിക":160,"ശാല":131,"ശാസ":704,"വേദ":193,"വേണ":171,"വെള":233,"വെയ":137,"വൈദ":160,"വേഷ":140,"വൃത":136,"ർവ്":178,"വിൽ":193,"ൽക്":256,"വുമ":350,"ർഷത":185,"ശത്":307,"ശമാ":164,"ഷൻ ":245,"വ്വ":236,"വ്യ":986,"ർമ്":559,"ർപ്":202,"al ":246,"വസാ":210,"വഴി":162,"ർന്":390,"വളര":299,"വലി":422,"ർദ്":196,"and":176,"ർണ്":319,"വരി":311,"വരു":524,"വരെ":305,"ർത്":1697,"an ":167,"വിസ":175,"വിശ":604,"വിഷ":250,"വിവ":377,"വില":388,"വിള":304,"വിയ":254,"വിഭ":320,"ർഡ്":135,"വും":1320,"വാർ":151,"വാക":237,"വാത":177,"വാണ":177,"ർട്":250,"വാദ":195,"വിക":518,"സ് ":1944,"വാസ":359,"വായ":334,"വിത":325,"വിധ":550,"വിദ":293,"വിന":462,"വിച":163,"വിട":354,"വസ്":414,"ശങ്":338,"ർജ്":166,"ർച്":198,"വനന":179,"ൻസ്":171,"വത്":265,"ർഗ്":232,"ർക്":684,"ഴുത":366,"ഴിയ":159,"ൻറെ":142,"വയാ":176,"വയു":170,"സി ":306,"വയം":133,"ati":208,"വന്":355,"ളി ":445,"രയി":142,"റർ ":428,"രമ്":132,"രമു":245,"രമാ":1123,"വം ":264,"രളത":906,"രശസ":300,"രവു":183,"രവാ":158,"റക്":177,"രധാ":578,"യേക":178,"രദേ":627,"രനാ":222,"രന്":347,"രപ്":252,"യോഗ":971,"യ്ത":319,"യ്യ":1468,"യ്ക":431,"രഞ്":146,"യാപ":286,"യാന":412,"യാണ":2227,"യാക":148,"യിട":223,"യാസ":209,"യാള":680,"യിക":256,"യായ":924,"യും":2125,"യില":2744,"യിര":1399,"യുമ":248,"യുത":144,"യുന":1398,"യുട":1638,"രണം":295,"രണത":177,"രണമ":149,"യുള":358,"യിൽ":1778,"യൂട":261,"രണ്":518,"രത്":1578,"രതി":352,"രതീ":156,"രക്":477,"യവസ":141,"ലെ ":4757,"യസ്":188,"രങ്":806,"രജ്":131,"ലോ ":133,"ലി ":304,"യയു":269,"ലാ ":171,"യയി":607,"മ്ര":188,"മ്യ":161,"രൻ ":203,"യമാ":807,"മ്പ":1717,"മ്മ":1128,"രകാ":407,"മേര":234,"മെന":220,"മേഖ":168,"മുൻ":206,"യതി":155,"യത്":2606,"യപ്":1110,"യനാ":300,"യന്":328,"മാണ":3756,"മാത":372,"മാന":490,"മായ":3224,"മാക":397,"മിന":134,"മിയ":208,"മാര":358,"മാറ":260,"മാല":167,"മിക":467,"റ് ":1265,"മാസ":192,"മിച":154,"മൂല":253,"മൂന":234,"മുള":608,"മുസ":161,"മൂഹ":169,"മീറ":381,"രംഭ":134,"രംഗ":221,"മിഴ":237,"മില":152,"മുന":161,"മുണ":204,"മുത":397,"മുദ":221,"മാർ":420,"മുഖ":353,"മുക":147,"യക്":335,"യകാ":146,"യങ്":581,"മഹാ":304,"റെ ":3202,"ലസ്":209,"ലവി":164,"വി ":249,"ലയി":1550,"ലയാ":752,"ലയു":182,"വാ ":463,"ലമാ":175,"ഷം ":301,"ലപ്":395,"റ്റ":5132,"ലഭി":141,"റെയ":243,"റിൽ":141,"ലണ്":140,"ലത്":725,"റാണ":337,"റിന":263,"റിക":201,"ഴ് ":251,"റിച":232,"റും":178,"റിയ":1715,"റില":158,"റുക":338,"ലങ്":374,"ലച്":445,"ലക്":585,"റവു":574,"ലകള":209,"ശം ":392,"ഴി ":137,"റയു":379,"ര്യ":585,"റബി":132,"റപ്":145,"രോഗ":204,"രെയ":158,"റത്":391,"രുവ":555,"രിൽ":376,"രൂപ":574,"രീക":374,"രില":203,"രിയ":788,"രിസ":279,"രീത":277,"രും":137,"രീയ":292,"രുക":338,"രീര":138,"രുട":311,"രുന":2242,"രുത":325,"രുമ":189,"രാധ":160,"രാണ":212,"രാത":212,"രാഗ":175,"രാജ":994,"രിപ":260,"രിന":226,"രിട":191,"രിത":358,"രിച":602,"രാഷ":322,"രാശ":159,"രായ":317,"രാമ":1573,"റഞ്":136,"രാവ":195,"രിക":1867,"രാള":187,"റങ്":254,"രസ്":464,"രവർ":501,"ളെ ":475,"രസി":277,"ാ ":1411,"ി ":10063,"ീ ":761,"ു ":10222,"ഗീത":286,"െ ":14490,"കൾക":240,"േ ":611,"ഗിച":235,"ച് ":831,"ഗിക":587,"ഗാന":207,"ഘടന":192,"ൈ ":191,"ൂ ":136,"ഗത്":599,"മ ":394,"ര ":1278,"യ ":5708,"ion":318,"ഖ്യ":445,"വ ":552,"ഗമാ":450,"സ ":133,"ഷ ":141,"ചു ":503,"ല ":970,"റ ":307,"ഴ ":264,"ള ":2840,"ഗ്ഗ":220,"ഗ്ല":445,"് ":30096,"ഗ്ര":2123,"ോ ":1249,"ങൾ ":1356," In":137,"കല്":194,"കലാ":228,"കറ്":169,"കവി":299,"he 
":334,"കഴി":190,"കളെ":330,"കളു":895,"കളി":1388,"കളാ":321,"കിയ":350,"കിഴ":329,"കില":839,"കും":421,"കായ":240,"കാര":1466,"കാവ":233,"കാശ":316,"കാസ":134,"കാറ":154,"കാല":744,"കിട":299,"കുമ":280,"കുള":420,"കുറ":545,"കുവ":161,"കിൽ":695,"കൂട":756,"ം ":21178,"കുക":537,"കീഴ":158,"കുട":502,"കാർ":351,"കാൻ":202,"കുന":3755,"കുപ":182,"കാക":163,"കാണ":507,"കാന":330,"കാട":474,"കഥാ":145,"കന്":204,"കനു":217,"കനാ":221,"കണക":160,"കണ്":651,"കത്":528,"കമാ":395,"കമ്":385,"കയാ":132,"കയി":197,"കയു":359,"കരണ":329,"കരു":191,"കരി":544,"കപ്":708," in":230,"ച ":753,"ഗങ്":342," ht":204," of":235,"ട ":916," an":158,"ണ ":661,"ഗണി":159,"ത ":1303,"igh":217,"ധ ":268,"ing":193,"ങ് ":236,"ന ":6972," co":199,"in ":185,"ദ ":154,"കൊല":162,"കൊള":171,"കൊണ":402,"കൊട":175,"കേര":1281,"കേന":314,"ഈ ":1594," ww":169,"htt":234,"കൃത":542,"ht ":197,"കൃഷ":204," ri":189," px":215,"hum":268,"ക്ട":264,"ക്ത":515,"ക്യ":293,"ക്ഷ":2208,"ക്ര":1063,"ക്ല":147,"ഖ ":135,"ക്സ":418,"ക ":1483,"കോട":568,"കോഴ":224," th":549,"ക്ക":14730,"ജനന":254,"ജനു":148,"ജനി":202,"ജന്":153,"ങൾക":243,"er ":252,"es ":231,"ച്ച":4631,"ent":145,"ght":209,"ജ്യ":739,"ജ്ഞ":320,"ജ്ജ":187,"ജീവ":507,"ജില":1519,"ട് ":2684,"ടു ":337,"ടി ":838,"ടെ ":2958,"ചത്":271,"ജ് ":184,"ങിയ":431,"ചക്":148,"ങളെ":405,"ങളാ":313,"ങളി":1296,"ങളു":891,"ed ":198,"ചേർ":314,"ചെയ":1441,"ചെറ":399,"ചേര":187,"ചെട":173,"ചിട":172,"ചിത":936,"ചായ":1294,"ചാര":301,"ചിന":169,"ചിപ":139,"ചിര":413,"ചിറ":138,"ചില":197,"ചലച":426,"ടം ":259,"ചരി":346,"ങ്ങ":6013,"ങ്ക":1271,"ടാക":287,"ടുക":629,"ടും":382,"ടിൽ":160,"ടുള":178,"ടുവ":240,"ടുമ":147,"ടുപ":172,"ടുന":1925,"ടുണ":196,"ടുത":820,"ടിച":133,"ടിഞ":226,"ടായ":281,"ടിക":477,"ടാമ":153,"ടിസ":180,"ടിര":215,"ടില":180,"ടിന":182,"ടിയ":767,"rig":195,"ടേയ":132,"ടെയ":346,"ട്ര":721,"ട്ട":5142,"ണു ":234,"ഥം ":166,"ണി ":174,"ണ് ":9312,"തി ":1445,"തു ":412,"ദം ":215,"തെ ":1217,"ണം ":720,"ഡി ":135,"px ":213,"ടർ ":247,"ടങ്":577,"ടക്":856,"ഡ് ":679,"ഞാറ":228,"ടന്":213,"ടത്":622,"തം ":428,"ഞ്ഞ":800,"ഞ്ച":1999,"ടയി":180,"തപു":179,"തന്":625,"ോർ":445,"ോൾ":398,"തനാ":181,"്മ":2196,"്യ":11234,"്ര":16053,"്റ":10182,"്ല":4477,"്ള":3597,"ng ":163,"്വ":2298,"്ശ":389,"്ഷ":2226,"്സ":1373,"ോൺ":268,"തമാ":549,"തമി":248,"തരം":239,"ോസ":481,"ോഷ":199,"ോഹ":213,"ോഴ":323,"ോള":566,"ോവ":275,"ോര":171,"ോയ":198,"ോല":414,"ോറ":240,"ോബ":223,"ോഫ":177,"ോമ":554,"്ബ":173,"ണ്ഡ":544,"ണ്ട":3469,"്പ":9300,"്ധ":1930,"്ന":18960,"്ഥ":3459,"്ദ":2047,"്ണ":1196,"്ത":22771,"്ഡ":591,"്ട":9563,"്ഠ":162,"ണ്ണ":973,"്ഞ":1123,"്ജ":323,"്ച":6932,"്ങ":6013,"്ഗ":330,"്ക":17340,"ൊണ":430,"ൊത":283,"ൊന":285,"ൊട":319,"േർ":559,"ോദ":305,"ോണ":319,"ോത":321,"ോപ":434,"ോന":149,"ോജ":148,"ോഡ":324,"ോട":1348,"nd ":168,"ോക":1283,"ൊള":210,"ൊല":199,"തപ്":226,"ോഗ":1263,"ൊര":233,"േന":623,"േവ":391,"േശ":1652,"ൈക":178,"േറ":444,"േല":220,"േയ":868,"േര":2800,"േഹ":481,"േസ":147,"േഷ":809,"ൈദ":208,"ൈന":296,"ൈവ":271,"ൈറ":197,"െൻ":139,"െൽ":155,"െർ":168,"ൈസ":157,"ണിത":465,"െക":672,"െങ":501,"ണിയ":184,"െട":3862,"െത":287,"െന":785,"െപ":232,"െബ":161,"െമ":160,"െയ":3291,"െര":246,"െറ":681,"ണാട":139,"െല":319,"െള":276,"േക":651,"േഖ":487,"ദ് 
":330,"ണിക":268,"െസ":158,"േജ":272,"േട":244,"േത":832,"േണ":267,"േദ":234,"ൃത":879,"ുൽ":137,"ുൻ":211,"ുർ":254,"തനം":152,"ൃഷ":287,"ൃശ":256,"തത്":650,"ൂർ":1703,"ണൂർ":281,"ുപ":1108,"ുര":1360,"ുമ":2460,"ുഭ":196,"ുത":2652,"ുണ":1220,"ാൾ":251,"ുന":10596,"ുദ":692,"ൂച":155,"ുഹ":134,"ിൽ":7943,"ിൻ":555,"ൂട":1342,"ിർ":824,"ുള":2978,"ുഴ":501,"ൂക":744,"ുറ":1425,"ുല":285,"ുഷ":436,"ുസ":641,"ുവ":2148,"ുശ":133,"ൂപ":660,"ൂമ":233,"ു്":339,"ൂറ":506,"ൂര":435,"ൂണ":183,"ൂത":256,"ണു്":155,"ൂന":329,"ീർ":319,"ൂല":403,"ൃക":135,"ൂഹ":200,"താര":239,"തായ":301,"തിക":867,"താവ":406,"താല":576,"തിച":316,"താണ":917,"താന":186,"തിയ":1314,"തിര":1091,"തിറ":164,"തില":2687,"തിന":3912,"തിപ":216,"താം":142,"തലസ":164,"തമ്":155,"തോട":190,"ഥമാ":131,"ത്വ":330,"ത്സ":249,"ത്മ":171,"ത്ര":4088,"ത്യ":2047,"ത്ത":16443,"ത്ഥ":385,"of ":224,"നി ":376,"തുട":444,"തുന":334,"തുമ":302,"തുള":199,"തുവ":386,"തും":420,"തീയ":195,"തീര":189,"തുക":619,"തീർ":152,"തൃശ":187,"തിർ":138,"തിൽ":2321,"തെക":351,"തെയ":178,"ർമ":803,"ന് ":1964,"ർശ":261,"ർഷ":608,"ൽക":534,"ർവ":450,"ഥാപ":368,"ർണ":557,"ഥാന":1185,"ർഡ":239,"ർട":256,"ർജ":233,"ർപ":232,"ർന":493,"ർദ":256,"ർത":1751,"ൻറ":218,"ൻസ":353,"ർച":209,"ർക":724,"ർഗ":447,"ൻഡ":203,"ഥിത":865,"ൻപ":166,"ൾപ":377,"ൾക":618,"ൽപ":280,"പം ":271,"നീ ":509,"നു ":3729,"ദക്":161,"ഥവാ":438,"on ":337,"നെ ":631,"ഉള":177,"ഉയ":159,"ഉപ":1088,"ഇൻ":149,"ഇര":190,"ഇല":190,"ഇവ":544,"ഇന":1412,"ഇസ":231,"അർ":208,"ഉണ":345,"ഉത":324,"ഉദ":193,"le ":135,"എട":140,"എന":4361,"ൺ ":614,"എഴ":295,"ഏക":259,"എല":190,"എറ":155,"എസ":156,"ഉൾ":449,"എം":136,"ംഘ":229,"ംഖ":254,"ംഗ":1698,"ംക":276,"ംശ":210,"ംസ":820,"ംവ":207,"ംഭ":344,"ംബ":834,"ഡിയ":165,"ത് ":4999,"അസ":137,"അവ":661,"ആണ":726,"ഇം":373,"അപ":189,"അധ":210,"അന":675,"അഥ":462,"അദ":350,"ആക":235,"അറ":909,"അല":395,"അയ":178,"അര":181,"അഭ":217,"അമ":421,"ആസ":222,"ഇത":1342,"ഇദ":201,"ഇട":298,"ആദ":541,"ആധ":137,"ആന":248,"ആല":222,"ആവ":187,"ആയ":406,"ആര":244,"ആറ":138,"അം":186,"അക":377,"അട":473,"ആം":188,"അത":670,"അണ":160,"ചേ":581,"ചെ":2298,"ചു":1255,"ചി":2735,"ചാ":2146,"ച്":5556,"ചത":464,"ചന":269,"ങാ":172,"ങി":552,"ചല":494,"ങ്":7589,"ചയ":168,"ചര":413,"ങന":154,"ങള":3065,"ചക":269,"കൾ":1615,"ഗീ":392,"ഗി":1014,"കൽ":282,"ഗു":368,"കർ":533,"ഗാ":550,"ഘട":411,"കൻ":533,"ഗസ":163,"ണങ്":272,"ഗോ":498,"ഗ്":3272,"ഞാ":426,"ടണ":268,"ടത":795,"ടപ":165,"ടന":619,"ടമ":180,"ഞ്":2940,"ടല":163,"ടറ":140,"ടയ":443,"ടവ":181,"ടാ":1306,"ടം":274,"ടക":1208,"ടങ":583,"ജൂ":197,"ജീ":561,"ജു":186,"ജാ":371,"ജി":2005,"ജോ":210,"ജ്":1493,"ങൾ":1604,"ജന":1163,"ജല":137,"ണക്":248,"ജയ":163,"ഓഫ":177,"ൾ ":3523,"ഒര":4108,"ഒന":344,"ൽ ":10227,"ർ ":4755,"ൻ ":4719,"ഏറ":601,"ഗമ":519,"ഖ്":518,"ഗര":457,"ഗല":182,"ഗവ":220,"ഗങ":344,"ഗണ":317,"ഗത":782,"കൈ":174,"കേ":2116,"ഖന":135,"കെ":542,"കോ":1765,"കൊ":1243,"ഖര":135,"ക്":21779,"ഖല":214,"കസ":257,"കവ":506,"കി":3548,"ഗം":303,"കീ":288,"കാ":6054,"കൃ":756,"കു":7634,"കൂ":1037,"കപ":759,"കദ":149,"കഥ":303,"കന":747,"കല":863,"കറ":301,"കഴ":243,"കള":3150,"കമ":989,"കര":1697,"കയ":816,"കങ":196,"കക":230,"കണ":938,"കത":663,"കട":577,"കം":672,"നന":662,"നപ":369,"നയ":493,"നമ":1154,"ധ്":613,"നറ":327,"നല":133,"നവ":539,"നസ":256,"നാ":4964,"ദർ":205,"നി":6348,"പം":293,"നീ":1069,"നു":7416,"നൂ":483,"നെ":1274,"നേ":692,"നോ":770,"ന്":30257,"mb 
":256,"പക":675,"പങ":155,"പഞ":1399,"പട":805,"പത":1182,"പണ":230,"പന":675,"പദ":488,"പയ":870,"പമ":183,"പറ":902,"പര":1220,"പള":198,"പല":318,"പഴ":198,"പാ":2340,"നൽ":278,"പി":2507,"പീ":178,"പു":2667,"പെ":3528,"പേ":973,"പൂ":626,"ഫല":133,"പ്":13660,"പോ":1036,"പൊ":466,"ണപ്":245,"ഫി":226,"ബന":358,"ഫെ":137,"ബത":225,"ഭക":205,"ഫോ":209,"ഫ്":882,"ബൈ":150,"ബോ":333,"ബി":709,"ബാ":585,"പർ":146,"ബു":258,"മം":543,"മങ":200,"ഭവ":280,"ഭയ":191,"ഭര":449,"ബ്":1417,"മക":266,"ണത്":429,"മന":817,"മധ":161,"മദ":367,"മപ":831,"യം":1280,"ഭി":475,"ഭാ":2218,"മത":1028,"മണ":590,"ഭൂ":412,"യങ":585,"മസ":322,"മഹ":381,"മല":1243,"യക":899,"മമ":375,"ഭ്":194,"മയ":404,"മര":545,"മറ":466,"മോ":397,"മൊ":134,"മ്":3221,"യമ":1442,"യന":858,"മേ":980,"മെ":621,"യപ":1289,"മൈ":144,"യത":3000,"മൂ":870,"മി":2147,"ബർ":483,"മാ":10411,"മു":3097,"മീ":726,"രം":2337,"രഞ":148,"യാ":6131,"രജ":211,"രച":317,"യസ":303,"രങ":807,"യവ":498,"രക":1280,"യറ":218,"യല":155,"യയ":1033,"യര":304,"രയ":465,"രമ":2024,"യ്":2647,"യോ":2076,"രപ":409,"യേ":524,"രന":771,"രധ":593,"യെ":576,"രദ":892,"രത":2444,"രണ":1938,"യൂ":920,"യു":6606,"റം":434,"യി":9140,"ടു":5165,"ടീ":186,"ടി":3790,"ടെ":3674,"ടേ":223,"ട്":8736,"ടോ":357,"ഡല":261,"ഡാ":142,"ടർ":374,"ഡി":791,"ണം":729,"ണങ":272,"ണക":349,"തം":452,"ഡെ":175,"ഡ്":914,"ഡോ":225,"തങ":173,"തക":671,"ണവ":186,"ണി":1469,"ഥം":170,"ണു":765,"ണൂ":306,"ണാ":642,"തട":148,"ണപ":319,"ണത":458,"ണയ":165,"ണമ":522,"തി":15367,"താ":3666,"തൃ":441,"തൂ":222,"തു":3776,"തീ":783,"ദം":233,"തവ":316,"ണ്":14451,"തമ":1061,"തയ":396,"തല":712,"തര":903,"തത":746,"തപ":458,"തന":1444,"ഥാ":1753,"ഥി":968,"ദത":182,"ഥവ":455,"ത്":29067,"ഥമ":194,"ഥയ":152,"ഥല":260,"ദക":185,"തെ":2173,"തേ":368,"തൊ":210,"തോ":646,"ധത":175,"ദു":403,"നം":1457,"ദീ":159,"ദി":1290,"തൽ":420,"ദാ":495,"ദര":150,"ദമ":198,"നത":3826,"നദ":293,"ധീ":204,"ധി":1094,"ധു":167,"ധാ":1533,"നട":732,"നങ":658,"നക":609,"നഗ":483,"ധമ":209,"ദ്":4614,"ധന":255,"ദേ":2169,"ദൈ":141,"ഹി":1080,"ഹാ":862,"ഷൻ":274,"ഹൈ":165,"ഹ്":425,"ഹോ":192,"സി":2539,"സാ":1870,"സഹ":227,"സസ":365,"സേ":205,"സെ":370,"സു":655,"സീ":147,"ഹത":306,"സൂ":414,"സ്":11156,"ഹമ":212,"ഹര":199,"സൈ":220,"സോ":402,"സർ":564,"ണാക":136,"ാബ":224,"ാഭ":193,"ാപ":1119,"ാന":5014,"ാദ":754,"ാധ":756,"ാത":1928,"ാണ":10704,"ിം":358,"ാട":2078,"ിഞ":391,"ിജ":257,"ാഹ":589,"ിച":3441,"ിങ":411,"ാസ":2241,"ാഷ":985,"ിഗ":248,"ാശ":745,"ാവ":1901,"ിക":9608,"ാഴ":297,"ാള":1241,"ാല":2916,"ാറ":1288,"ാര":3963,"ായ":9472,"ാമ":3387,"ാം":1111,"ാച":293,"ാജ":1079,"ാഗ":1285,"ാഖ":155,"ാങ":229,"ാക":2271,"ീപ":245,"ീന":303,"ീയ":989,"ും":8705,"ീത":716,"ീസ":331,"ാൽ":616,"ാർ":1852,"ുട":4050,"ാൻ":1060,"ീല":186,"ീറ":467,"ീര":476,"ീഷ":499,"ുഗ":147,"ീവ":642,"ുഖ":381,"ുക":4108,"ീഴ":170,"ിധ":718,"ിന":8501,"ിപ":1879,"ിഭ":450,"ിമ":977,"ിട":1697,"ിഡ":255,"ിണ":277,"ിത":4094,"ിദ":770,"ിസ":1623,"ിഷ":679,"ിഹ":196,"ീട":157,"ിര":3940,"തങ്":171,"ിയ":9237,"ില":11613,"ിറ":959,"ീക":980,"ിഴ":585,"ിള":544,"ിശ":915,"ിവ":1776,"റെ":3559,"ലന":318,"റേ":484,"റു":1385,"ലണ":143,"ലത":808,"ലമ":351,"റ്":6646,"ലയ":2792,"ലപ":453,"റോ":534,"ലഭ":229,"റവ":704,"ലങ":375,"ററ":173,"ലക":1101,"റാ":1047,"യർ":416,"യൻ":1031,"റീ":135,"ളം":676,"റി":3562,"നം 
":1340,"ലച":462,"രെ":725,"രേ":448,"രൂ":693,"റണ":147,"റത":394,"രീ":1446,"ലം":722,"രു":8550,"റയ":625,"റമ":259,"ര്":755,"രോ":818,"റബ":175,"റപ":149,"രശ":457,"രവ":1227,"റക":237,"രള":1305,"രര":176,"രി":6901,"രാ":5614,"മർ":179,"മൻ":182,"റഞ":137,"രഹ":238,"റങ":256,"രസ":985,"ളെ":751,"ളേ":165,"ളോ":309,"ള്":3452,"ഴയ":224,"ഴക":439,"ളവ":247,"ളാ":873,"റർ":523,"വം":540,"ളി":4155,"ളു":2116,"ലൂ":979,"ളത":1504,"ലൈ":193,"ലെ":5253,"ലേ":789,"ല്":3649,"ളമ":142,"ലൊ":242,"ലോ":1419,"ളര":321,"ലവ":344,"ളക":192,"ലസ":248,"രൻ":214,"ലു":2355,"ലീ":583,"ലി":2464,"ലാ":2706,"ഷക":240,"ശയ":135,"ശമ":220,"വ്":1715,"ശര":163,"വോ":132,"വെ":830,"വൈ":359,"വേ":1138,"ശന":174,"വു":2019,"വൃ":226,"ശത":359,"വാ":3225,"വി":6428,"ഷം":305,"വീ":487,"വഹ":163,"വശ":269,"വസ":973,"ശങ":339,"ശക":161,"വഴ":173,"വള":556,"വല":693,"വര":1681,"ണമാ":331,"വയ":976,"വമ":167,"ഴ്":515,"വന":931,"വണ":138,"വത":706,"ശം":401,"ഴു":589,"വട":524,"ഴി":934,"വച":214,"വക":573,"സര":276,"സവ":164,"സന":236,"ഷേ":485,"സഭ":457,"ഷ്":2365,"സമ":1136,"ഷി":1059,"ഹം":374,"സത":207,"സങ":187,"വൻ":194,"സഞ":150,"ഷാ":253,"വർ":1764,"ഷര":160,"ഷയ":312,"ശേ":646,"ഷന":151,"ഷമ":135,"ശ്":1951,"സം":2540,"ശീ":345,"ശു":222,"ശാ":1071,"ശി":810,"ദി ":217,"തൽ ":385,"ഷണ":462,"ശൂ":214,"ഷത":605,"ശസ":408,"ഇസ്":226,"ഇന്":1364,"ഇദ്":200,"ഇവി":189,"ആസ്":173,"ഇത്":627,"ഇതി":553,"ആണ്":627,"ആദ്":407,"ആയി":224,"അവസ":151,"അല്":193,"്സ്":542,"്സി":291,"്ഷ്":138,"്ലെ":193,"്ലേ":201,"്ലോ":274,"്റർ":497,"്ളി":382,"്ളു":150,"്റേ":221,"്റെ":3192,"്ലയ":1398,"്റ്":2584,"്ലി":325,"്ലീ":428,"്ലാ":703,"്ളത":321,"്ലൂ":138,"്രേ":194,"്ററ":147,"്രോ":492,"്റവ":566,"്റു":446,"്യൻ":654,"്റാ":444,"്റി":828,"്രപ":199,"്രന":361,"്യേ":240,"്രധ":583,"്രദ":820,"്രത":1256,"്യൂ":574,"്രര":148,"്രയ":207,"്രമ":1061,"്യോ":256,"്രഹ":203,"്രസ":612,"്രവ":914,"്രശ":364,"്രു":156,"്രി":1522,"്രീ":856,"്രാ":2390,"്ഷര":156,"്ഷേ":452,"്ഷി":552,"്ഷത":297,"്ഷണ":214,"്ശേ":205,"്വര":172,"്വയ":160,"്വീ":179,"്വാ":520,"്ളോ":174,"്വത":242,"ഇംഗ":366,"അഥവ":448,"അതി":433,"അത്":146,"അനു":271,"അന്":287,"അധി":169,"അദ്":338,"അഭി":173,"അറബ":149,"അറി":618,"അമേ":219,"്ങന":152,"്കൽ":212,"്കൻ":384,"്ചേ":162,"്ചി":1394,"്ചാ":1705,"്ചു":944,"്ങ്":141,"്ങാ":169,"്ങി":542,"്ചത":372,"്ങള":3056,"്ങൾ":1597,"അക്":233,"്ച്":847,"്ടത":365,"്ടണ":266,"്ടറ":139,"്ടയ":169,"്ഞാ":395,"്ടം":148,"്കട":170,"്കം":167,"്കൊ":183,"്കോ":591,"്ക്":1983,"്കപ":570,"അടി":235,"്കയ":184,"്കള":257,"്കര":378,"്കറ":207,"്കി":2121,"്കാ":2470,"്കൃ":134,"്കൂ":155,"്കു":5270,"്കേ":378,"്കെ":229,"്നവ":135,"അംഗ":150,"്ധ്":347,"്നറ":289,"്നത":3020,"്ധി":284,"്ധാ":165,"്ന്":1214,"്നെ":265,"്നാ":1196,"്നൂ":143,"്നു":4523,"്നീ":591,"്നി":970,"്പറ":240,"്പര":163,"്പന":256,"്പത":250,"്പ്":921,"്പോ":426,"്പെ":2946,"്പൂ":178,"്പു":822,"്പി":1252,"്പാ":674,"്യങ":413,"്യക":572,"്മാ":586,"്രം":725,"്മി":445,"്മദ":157,"്യം":593,"്യസ":193,"്രങ":393,"്യവ":286,"്രക":715,"്യു":1251,"്റം":197,"്യാ":1524,"്രജ":175,"്യന":356,"്യത":846,"്യര":146,"്യമ":802,"്യയ":869,"്യപ":213,"്ടു":1302,"്ടി":1743,"്ടാ":1065,"്ട്":2062,"്ടോ":261,"്ടെ":178,"്ഡല":230,"്ടർ":257,"്തം":182,"്തക":337,"്ഥം":135,"്ണാ":141,"്ണൂ":295,"്തത":221,"്തപ":248,"്തന":539,"്തമ":327,"്തര":440,"്തവ":158,"്താ":1357,"്തി":9289,"്തീ":242,"്തു":1834,"്തൂ":145,"്തെ":1346,"്തേ":231,"്തോ":380,"്ഥമ":153,"്ത്":4115,"്ഥല":260,"്ഥാ":1534,"്ഥി":931,"്ദി":192,"്ദേ":658,"്ദു":211,"്ധത":135,"്ധമ":162,"്ദ്":530,"ോളി":145,"ോമീ":259,"ോഫ്":143,"ോസ്":247,"ോഴി":212,"ഉൾപ":337,"ww ":169,"www":169,"ൊരു":147,"ോൾ ":349,"ോൺ ":165,"ോഗി":791,"ോഗ്":169,"ൊല്":164,"ോകത":185,"ൊള്":135,"ോക്":696,"ോട്":908,"ോഡ്":151,"ോപ്":223,"ോത്":164,"ോദ്":147,"്ച ":729,"്ട ":817,"്ത ":630,"ഉള്":169,"്ന ":5898,"us 
":153,"്ര ":614,"്യ ":889,"ൊട്":150,"േർന":202,"ൊതു":218,"umb":269,"ൊണ്":419,"്ള ":2181,"്ല ":256,"ഉപയ":669,"ൊന്":268,"ൈദ്":158,"ഉണ്":335,"ഉത്":318,"ൈറ്":167,"ttp":234,"tp ":236,"tio":259,"thu":262,"്ക ":209,"ter":154,"the":302,"കം ":627,"എസ്":144,"ഏകദ":134,"എഴു":292,"ഏറ്":547,"എന്":4356,"എല്":166,"കൻ ":521,"�":641,"ഗ് ":228,"കങ്":196,"കൽ ":221,"കൾ ":1360,"ഓഫ്":164,"ക് ":1892,"ഒന്":342,"കര ":204,"കെ ":257,"കേ ":146,"കി ":452,"ഗം ":252,"ഒരു":3867,"കു ":269},"n_words":[1303092,1392078,987774],"name":"ml"}
\ No newline at end of file
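The file deleted above is a langdetect language profile: a single JSON object whose "freq" map holds 1- to 3-gram occurrence counts, whose "n_words" array appears to hold the total number of observed 1-, 2- and 3-grams, and whose "name" field is the ISO 639-1 code ("ml" here, Malayalam). A minimal parsing sketch, assuming only the schema visible in the deleted content (load_profile and its path argument are illustrative helpers, not part of the langdetect API):

import json

def load_profile(path):
    # Each profile is one JSON line: {"freq":{...},"n_words":[...],"name":"ml"}
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    # data["freq"]: n-gram -> count; data["n_words"]: totals per n-gram length
    return data["name"], data["freq"], data["n_words"]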
+++ /dev/null
-{"freq":{"णां":79,"णाच":69,"थे ":276,"तका":129,"था ":217,"ोल्":72,"ोलि":66,"ोला":81,"ोर्":136,"ोबर":168,"णता":197,"तंत":167,"ोव्":196,"तो ":552,"णजे":209,"्तर":330,"ं ":115,"्त्":568,"्ते":64,"्थळ":62,"तां":107,"्ता":389,"ः ":139,"्ती":202,"ताक":90,"्ति":145,"्तु":82,"ताच":344,"्दर":73,"तात":957,"तान":226,"्थि":115,"ताम":63,"्था":569,"तार":104,"ताल":160,"तिक":153,"्थे":76,"तीं":64,"्दा":73,"а":72,"्धत":82,"तिह":109,"तीच":131,"तिस":79,"्ट्":957,"्टो":150,"्टे":199,"्टी":161,"्टा":154,"्टि":89,"इ ":181,"तसे":151,"धन ":83,"्तक":79,"दी ":549,"्यं":164,"्मन":141,"्यक":342,"दू ":113,"्मा":304,"्मि":128,"्रं":90,"्यत":102,"्मे":88,"्यप":104,"्यम":126,"्रक":379,"्यव":153,"्रज":340,"्यु":195,"्या":9729,"्धा":165,"तया":65,"्ने":69,"तरे":84,"्ना":140,"तरा":162,"तरर":91,"्पन":111,"दा ":131,"्पर":85,"तले":94,"तला":76,"्पा":80,"ण्य":949,"ا":67,"तमि":228,"्का":190,"्कृ":130,"्कर":75,"दर ":72,"णून":182,"तदा":122,"्ञा":188,"दल ":81,"तत्":79,"णार":809,"णात":73,"्गा":89,"्चि":216,"्चा":100,"णुक":137,"्चन":73,"थ ":288,"द ":645,"ध ":495,"न ":5349,"ड ":671,"थील":100,"ठ ":317,"थाप":202,"ण ":1146,"थान":236,"त ":6358,"थित":69,"धी ":120,"ज ":339,"दक्":266,"ञ ":122,"धा ":98,"ट ":1322,"ं":13502,"ः":232,"ँ":308,"आ":6891,"इ":1323,"अ":4895,"ऊ":224,"ऋ":69,"ई":607,"उ":1473,"घ ":103,"ए":2374,"ओ":357,"ऑ":510,"ऐ":63,"ग":7424,"ख":2772,"क":20151,"औ":108,"छ":206,"च":12137,"घ":1111,"ट":6169,"ञ":320,"झ":981,"ज":8381,"ठ":2153,"ड":3743,"ढ":440,"ण":6906,"त":27784,"थ":2477,"द":9254,"ध":4679,"न":18404,"प":12607,"फ":1499,"ब":5130,"्ग ":233,"भ":4247,"म":15733,"य":22975,"र":35779,"ळ":2991,"ल":18817,"व":16505,"ष":4156,"श":6670,"ह":17826,"स":18689,"ऽ":116,"ि":18218,"ा":66714,"थवा":122,"े":31183,"ॅ":601,"ू":4430,"ृ":896,"ी":21357,"च ":843,"ु":7163,"ौ":422,"्":44808,"ो":8505,"ै":961,"ॉ":916,"०":1926,"१":3312,"्क ":91,"६":758,"७":829,"८":1020,"९":2445,"२":1611,"३":695,"४":690,"५":705,"क ":4288,"ग ":912,"ख ":378,"त्स":89,"त्व":365,"त्प":68,"त्र":2148,"त्य":1716,"त्म":72,"त्त":815,"त्न":66,"ம":77,"த":78,"ா":90,"ி":93,"ர":79,"तून":246,"க":89,"तुर":76,"ॉर्":122,"்":211,"तीत":68,"तीन":98,"तीय":438,"तील":2444,"तेल":85,"ई ":267,"ै ":165,"्न ":140," ख":774," ग":1923," औ":107,"ोजी":152," क":6366," ओ":265," ऑ":506," ट":487," ज":3863," झ":541," च":2075," छ":154," घ":462," इ":1199," आ":6783," अ":4797," ए":2269," ऊ":68," उ":1326," ई":99,"दार":306,"दिग":99,"दान":93,"दाच":70,"दिल":153,"दिव":143,"्फ ":77,"दिर":79,"े ":16291,"दीच":62,"नंत":140,"दुर":84,"दुस":164,"ोठे":238,"्म ":214,"ू ":729,"्य ":1111,"्र ":868,"ि ":987,"नी ":1364,"ोणा":90,"ी ":12189,"ोत्":125,"ोतो":104,"ोते":752,"ोती":123,"ोता":310,"ु ":263,"ा ":17571,"ोधन":72," ८":95," ९":93," ६":115," ७":115," ४":132," ५":115," २":1066," ३":209," ०":70," १":2735,"्व ":312," प":6615," फ":822," न":2997," म":6902," य":3998,"्ष ":149," ब":2263," भ":2543," ठ":228," ड":419,"ह ":265," द":3178," ध":461," त":3644," थ":112,"ोपा":104," ह":7628," स":7738," ल":2032," र":3046," श":2241," व":5406,"ने ":992,"्स ":322,"स ":1697,"ष ":215,"थ्व":73,"श ":822,"व ":2502,"्च ":156,"दरम":106,"ळ ":724,"्ट ":345,"ल ":4586,"्ञ ":122,"दर्":246,"ना ":828,"्ड ":88,"र ":6250,"य ":2749,"्ठ ":232,"म ":1312,"्ण ":136,"ोका":76,"ोकस":391,"भ ":104,"ब ":134,"्थ ":178,"फ ":180,"्त ":411,"्ध ":328,"प ":291,"्द ":119,"टना":76,"डू ":84,"डी ":137,"डा ":115,"ठे ":245,"ठी ":788,"ञान":146,"टोब":135,"ट्य":132,"ट्र":1051,"ट्ट":87,"टेड":67,"टेक":65,"तः ":64,"डे ":134,"टां":121,"टें":125,"टात":71,"टार":107,"टिक":111,"ठ्य":96,"णी ":301,"णि ":782,"णा 
":114,"्ह्":230,"्सि":71,"्हण":631,"्हा":235,"्हि":103,"्हे":192,"्स्":66,"ठिक":87,"डणु":118,"्लि":183,"्ली":63,"्ला":198,"्ले":93,"्रद":304,"्रथ":62,"्रत":188,"्रण":89,"्यू":155,"्रप":582,"्ये":1183,"्रम":557,"्यो":94,"्रल":83,"्रव":174,"्रश":146,"्रह":141,"्रस":352,"्रा":1606,"्रु":153,"्रि":802,"्लं":83,"्री":882,"्रे":549,"्रो":151,"्र्":109,"्षण":152,"्शि":77,"्षे":198,"्षि":311,"्षा":344,"्षी":88,"्वत":172,"्वज":70,"्वर":132,"्वी":309,"्वे":282,"्वि":88,"्वा":1006,"ड्य":63,"ता ":928,"ती ":831,"ते ":2330,"णे ":394,"डाच":85,"डात":62,"डिय":72,"डिस":128,"डील":75,"तर ":662,"डून":170,"जे ":259,"जा ":105,"जी ":439,"चीन":184,"चित":727,"चार":281,"चाल":111,"चिन":184,"चिम":207,"जगा":147,"च्य":2911,"च्च":223,"जन्":279,"टक ":89,"जनत":86,"जधा":220,"टन ":79,"जाग":68,"जां":62,"जिल":278,"जास":142,"जार":66,"जान":157,"जात":654,"जाण":139,"जरा":68,"जर्":134,"जवळ":124,"जोड":73,"ज्य":1261,"ज्ञ":320,"जीव":120,"जुन":111,"जुल":121,"जून":124,"टर ":126,"झाल":422,"टी ":174,"टा ":105,"ठा ":104,"ंघ":431,"ंख":180,"ंग":1699,"ंक":394,"ंड":876,"केत":187,"ंट":262,"ंज":187,"ंच":1704,"केच":132,"केट":130,"ँड":81,"ंश":217,"ंस":555,"ंव":461,"् ":692,"ंध":318,"ंद":1200,"ंन":966,"ंथ":109,"ंत":1644,"ंम":356,"केल":701,"ंप":555,"ंभ":92,"ंब":953,"ो ":900,"ँग":72,"आण":837,"आढ":90,"इं":419,"कृत":201,"अस":1528,"अश":181,"आग":71,"अव":109,"आक":127,"अल":143,"अर":282,"अभ":461,"अम":428,"अप":77,"अध":195,"अन":311,"अथ":144,"इत":287,"आश":86,"आह":3959,"आल":245,"आय":156,"आर":178,"आप":132,"आफ":117,"कृष":91,"आध":103,"अं":259,"आं":194,"कें":67,"अत":143,"अण":68,"अक":80,"उर":65,"उप":257,"उल":75,"ऊन":63,"ऊर":66,"इस":98,"उं":94,"खले":96,"उच":118,"उत":389,"उन":70,"उद":171,"कोण":88,"कोल":77,"एक":1893,"एख":65,"एप":100,"क्क":92,"क्ट":201,"क्त":320,"क्य":169,"क्ष":1291,"क्र":523,"क्स":174,"ऑस":91,"ऑफ":67,"ऑक":141,"ऑग":114,"गर":430,"गल":148,"गळ":139,"गव":115,"खे":276,"गप":74,"गम":74,"ख्":538,"गट":66,"खा":505,"खि":83,"गड":87,"खी":84,"गण":286,"गत":154,"खर":91,"क्":2945,"खल":175,"के":1566,"खन":94,"कॅ":149,"को":604,"कॉ":161,"कि":847,"की":630,"गं":74,"का":3922,"कृ":301,"कु":375,"कू":79,"कस":544,"कव":162,"कश":95,"कल":330,"खक":117,"कम":167,"कर":1579,"कप":84,"कथ":172,"कन":175,"कड":224,"खं":163,"कण":68,"कत":150,"कक":66,"कं":129,"ओळ":135,"ची":1602,"चि":1243,"चा":1991,"चे":2243,"चौ":88,"च्":3182,"० ":717,"जग":235,"जक":120,"चन":187,"चर":81,"चल":81,"चव":64,"घा":367,"घे":138,"चं":156,"घट":138,"गा":1549,"गस":133,"घड":69,"गी":328,"गि":172,"गू":80,"गु":294,"गो":373,"ग्":1223,"गे":325,"घर":90,"टन":228,"ञा":188,"टा":652,"टल":177,"टर":206,"खाद":93,"४ ":416,"झा":550,"झि":70,"टच":62,"३ ":377,"टक":258,"जो":179,"जे":499,"जू":189,"जी":702,"जु":275,"जा":1637,"जि":541,"२ ":421,"ज्":1646,"जन":601,"खान":75,"जध":220,"खाल":87,"जस":63,"जव":226,"१ ":522,"जर":265,"जल":91,"जय":81,"जम":98,"डच":80,"ठा":219,"ठि":103,"ठव":67,"५ ":430,"टे":445,"ट्":1360,"टो":293,"टी":396,"टि":297,"डा":413,"डि":310,"डी":307,"डल":139,"डळ":73,"डव":71,"६ ":435,"ठ्":106,"डम":84,"डर":65,"ठी":879,"डत":76,"डण":186,"डन":65,"ठे":307,"तं":195,"तः":94,"ढा":78,"णज":221,"णक":97,"७ ":492,"ढळ":87,"ड्":156,"डो":138,"डॉ":73,"डे":278,"डू":298,"डु":64,"णि":910,"णी":405,"णु":223,"णू":264,"णा":1365,"तक":277,"८ ":406,"णप":91,"णत":272,"९ 
":481,"तव":72,"तस":223,"ति":798,"ता":3296,"तू":362,"तु":325,"ती":4269,"दं":76,"तद":128,"तत":106,"तप":146,"तन":88,"णे":550,"तम":332,"ण्":982,"तय":70,"तल":255,"तळ":94,"तर":1232,"थव":138,"था":849,"थी":145,"थि":153,"ते":2652,"तो":665,"त्":5456,"खेळ":198,"थळ":77,"दक":326,"दृ":82,"धत":108,"दू":226,"दु":367,"नं":194,"दी":767,"दि":707,"दा":1004,"थे":411,"दन":81,"दव":72,"दल":156,"दर":524,"थ्":121,"धा":812,"नच":194,"नत":190,"नद":208,"धी":622,"नड":78,"धि":310,"धू":73,"धु":110,"दो":255,"द्":1977,"धन":165,"दे":1492,"नक":199,"नग":152,"धर":260,"धल":79,"नर":109,"नल":207,"नव":362,"धे":84,"नय":86,"नम":96,"ध्":1350,"नी":1612,"पं":155,"नु":263,"गळ्":76,"ने":2153,"नस":171,"ना":2993,"नि":1438,"पक":409,"नो":289,"नै":75,"न्":1414,"पत":355,"पण":229,"पन":409,"पद":339,"पड":106,"पट":647,"पश":225,"पह":161,"पस":114,"पल":219,"पर":957,"बई":129,"पे":273,"पै":233,"पॉ":77,"पू":594,"पृ":102,"पॅ":69,"पा":1577,"पि":254,"बं":239,"पी":379,"पु":752,"बच":72,"फळ":72,"प्":3134,"पो":211,"बन":164,"फे":175,"बद":156,"बत":64,"फु":80,"फि":95,"फा":147,"फो":62,"फ्":393,"बर":861,"बह":109,"बि":244,"बा":902,"बु":115,"मं":347,"बी":172,"बे":403,"बॉ":75,"बो":197,"भर":75,"ब्":696,"मक":100,"गरा":120,"गरी":78,"मग":69,"मज":82,"मच":80,"यं":250,"भि":399,"भा":2637,"मत":229,"भू":282,"मण":132,"भे":177,"मन":439,"मध":1532,"मद":153,"भौ":79,"भो":73,"मल":215,"यक":493,"भ्":94,"मर":604,"मह":1126,"यत":203,"मृ":108,"मू":271,"मॅ":66,"ख्य":500,"मि":1010,"यट":74,"मा":2886,"मु":1239,"रं":435,"मी":463,"मो":679,"यम":237,"म्":1246,"यन":341,"मे":875,"यप":140,"मै":72,"यव":231,"रख":99,"रग":70,"रक":670,"यल":98,"यर":147,"या":14036,"रज":391,"रच":375,"यस":75,"रध":68,"रद":356,"रथ":65,"रत":1594,"यू":189,"रण":1061,"यु":675,"यी":118,"यि":121,"रय":68,"रम":767,"रभ":78,"रब":97,"यो":389,"रफ":64,"रप":674,"ये":1927,"रन":85,"गाच":88,"लय":143,"लब":72,"लन":141,"लढ":74,"गात":319,"लच":92,"लग":102,"लक":174,"र्":5914,"रो":716,"रॉ":92,"रे":1413,"गां":194,"रू":387,"लं":299,"री":2531,"रु":575,"रि":1770,"रा":6808,"रह":201,"रस":797,"रश":254,"रव":382,"रल":306,"रर":139,"ळ्":150,"ळे":264,"ळा":565,"वं":149,"ळी":258,"ळू":62,"ळव":128,"ल्":2173,"लो":767,"चन 
":87,"लू":62,"ळण":107,"ळत":82,"लै":137,"ले":3399,"ळन":66,"लु":220,"ली":1530,"लि":1122,"ला":2983,"लव":72,"ळख":135,"ळक":78,"वो":76,"शब":124,"वै":95,"वे":911,"शन":181,"शव":74,"षक":96,"व्":1218,"शर":79,"वश":88,"गस्":118,"वस":571,"वू":99,"वृ":105,"शत":95,"वा":3822,"वि":1941,"वी":773,"वन":263,"शक":187,"वळ":226,"वल":308,"वर":1642,"वय":91,"वज":113,"वच":65,"वक":74,"वण":225,"वत":410,"वड":312,"वट":66,"सन":187,"षे":500,"सप":188,"सभ":348,"ष्":1679,"सम":806,"सर":1179,"सल":632,"सव":108,"सच":99,"षा":600,"षि":375,"षी":127,"सण":108,"सत":526,"सद":113,"सध":64,"शे":245,"श्":798,"शो":167,"षय":121,"सक":135,"शह":705,"सं":1892,"शी":422,"शु":72,"शा":1405,"शि":914,"षण":199,"हे":6136,"हॅ":73,"हु":162,"ही":1380,"हि":1219,"हा":3206,"ह्":871,"हो":1733,"हन":73,"से":980,"सु":643,"सी":265,"हत":249,"सू":619,"हण":691,"सि":714,"सा":2701,"सह":169,"हव":64,"हस":71,"हम":90,"स्":3801,"हय":143,"हर":898,"सै":65,"सो":298,"ात":6409,"ाथ":69,"ाढ":83,"ाण":917,"ाठ":965,"िं":1222,"ाड":364,"ाट":442,"ाब":339,"ाभ":170,"ाप":1160,"ान":3426,"ाद":925,"ाध":343,"गुर":68,"ाव":2259,"िख":96,"िक":2722,"ाळ":594,"ाल":2412,"ार":6697,"ाय":1081,"ाम":1731,"िज":229,"ाह":1051,"िच":184,"ास":2359,"ाष":1328,"ाश":369,"िग":160,"ां":4665,"ाँ":113,"ाऊ":66,"ाइ":80,"ाई":239,"ाउ":96,"ाक":602,"ाच":4155,"ाझ":79,"ाज":2078,"ाग":1046,"ाख":206,"ीण":65,"ीठ":87,"ुं":330,"गिर":64,"ीत":755,"ीप":214,"ीन":664,"ीम":234,"ीय":1104,"ील":3494,"ीर":265,"ुग":140,"ुख":527,"ीव":305,"ुक":528,"गीत":150,"ुज":89,"ीस":155,"ुट":125,"िट":204,"गाव":277,"ीं":192,"िड":79,"गाल":127,"िण":383,"ित":1967,"गाय":92,"िद":477,"िध":193,"िन":1260,"िप":347,"िब":126,"घटन":111,"िभ":130,"िम":634,"िर":694,"िय":1422,"िल":1282,"ीक":319,"िळ":425,"िश":552,"िव":770,"िस":721,"िष":365,"िह":250,"ीच":727,"ीज":85,"ेव":793,"ेश":1384,"ेळ":324,"ैक":225,"ेल":2762,"ेय":96,"ेर":672,"ेम":206,"ेब":175,"ेप":128,"ेन":599,"ैद":75,"ॉं":62,"ेह":98,"ेस":492,"ेष":313,"ॉक":64,"गेल":207,"ैव":63,"ैन":90,"ॉन":105,"ॉट":71,"ॅर":127,"ॅन":107,"ें":768,"ेक":722,"ेख":448,"ेट":491,"ेड":164,"ेत":2226,"ेण":224,"ेद":177,"ेथ":380,"ेग":137,"ेच":757,"ेज":75,"ृथ":73,"ृत":438,"ृष":221,"गोल":138,"ुत":238,"ुण":210,"ुढ":64,"ुड":67,"ूं":81,"ुन":719,"ुध":64,"ुद":440,"ुप":192,"ुर":1052,"ुम":183,"ुळ":257,"ग्ल":179,"ूक":85,"ुल":420,"ुष":125,"ुस":491,"ुव":264,"ग्र":670,"ूच":81,"ग्द":71,"ग्न":84,"ूत":126,"ून":1612,"ूप":102,"ूम":143,"ूर":779,"ूल":112,"ूळ":67,"ूह":94,"चा ":1450,"्व":2745,"्श":274,"्ष":1475,"्स":788,"्ह":1660,"्म":1315,"्य":13761,"्र":9096,"्ल":826,"ची ":1393,"ोर":443,"ोय":88,"ोल":499,"ोब":307,"ोम":148,"ोस":168,"ोष":97,"ोह":121,"ोळ":97,"ोश":68,"ोव":325,"्ण":384,"्त":2563,"्ड":225,"्ट":2277,"्ठ":301,"्झ":104,"्ञ":320,"्फ":142,"्ब":125,"्प":569,"्ध":714,"्न":549,"्थ":1118,"्द":543,"ौर":111,"्ज":211,"्च":600,"्ग":488,"्क":801,"ों":135,"ॉर":159,"ॉल":165,"ोज":285,"ोड":211,"ोट":232,"ोठ":384,"ोद":77,"ोण":200,"ोत":1519,"ोप":280,"ोध":220,"ोन":394,"ोक":697,"ोच":112,"ोग":276,"चे ":2125,"घात":147,"घाच":71,"चंद":96,"घेत":69,"०६":62,"०७":100,"०४":65,"००":610,"०१":76,"०८":75,"०९":89,"१०":143,"१५":108,"१६":128,"१७":139,"१८":321,"११":87,"१२":105,"१३":94,"१४":109,"१९":1554,"२१":65,"२०":603,"८०":89,"८६":64,"९०":106,"९६":205,"९५":154,"९८":288,"९७":177,"९२":135,"९१":161,"९४":175,"९३":129,"८८":69,"८९":125,"९९":420,"३०":86,"२३":65,"२२":68,"२५":80,"२७":76,"२६":63,"२८":64,"५०":65,"६०":76,"जन ":67,"चना":79,"के ":109,"का ":542,"ओळख":133,"की ":396,"खक ":110,"कर ":232,"कन 
":100,"काह":112,"किन":92,"काम":326,"कार":1020,"काय":62,"काल":221,"काळ":222,"काश":148,"कास":81,"किम":109,"किल":76,"कीय":113,"कां":335,"काद":84,"कात":158,"काण":89,"किं":328,"काच":171,"कवी":78,"कसं":90,"कशा":76,"कसभ":296,"कला":81,"कल्":115,"गी ":87,"कर्":219,"गा ":89,"करा":112,"करू":89,"करी":67,"三":73,"करत":181,"करण":522,"कथा":145,"कडे":62,"कडू":62,"गर ":134,"खंड":160,"कंप":101,"एप्":100,"एका":149,"एखा":65,"ऑफ ":63,"ऑस्":91,"ऑगस":114,"ऑक्":139,"०९ ":83,"०८ ":72,"०७ ":92,"ऊर्":63,"११ ":69,"१२ ":81,"१३ ":67,"१० ":113,"१६ ":75,"१७ ":70,"१४ ":80,"१५ ":76,"०० ":116,"०४ ":63,"एक ":1511,"आणि":780,"आपल":113,"आफ्":114,"२००":459,"आले":120,"आर्":98,"आहे":3954,"इति":65,"इतर":127,"अथव":122,"अति":63,"१९१":93,"१९८":221,"१९९":347,"१९६":128,"१९७":149,"१९४":143,"१९५":124,"१९२":109,"१९३":101,"अधि":146,"अने":154,"अभि":344,"अभ्":70,"अमे":278,"अमि":68,"अर्":160,"आका":68,"असल":335,"असत":304,"असण":73,"अशा":75,"अशी":64,"असे":343,"असू":247,"असा":90,"इंड":62,"इंग":258,"आढळ":87,"००९":79,"००७":86,"் ":95,"उच्":113,"उद्":130,"३० ":73,"उत्":376,"१९ ":73,"२७ ":71,"ऊन ":62,"२८ ":62,"२५ ":73,"इस्":65,"२० ":116,"ेशा":514,"ेशि":132," ओळ":134," कं":103,"ेष्":213," खं":102," कथ":124," कम":84," कर":956,"ैकी":208," कल":112," कव":97," कि":683," का":1385," कृ":78,"ेवर":160," कु":249," के":933," कॅ":146," को":333," कॉ":122," क्":525,"ेवा":214," एक":1893," एख":65," एप":100," ऑक":140," ऑग":114," ऑफ":67,"ेस्":75," ऑस":91," चौ":88," च्":195," जग":208," ची":128," चि":913," चा":258," चे":120," जर":148," जम":72," १ ":124," जन":432," ज्":461," २ ":80," जि":352," जा":1062," जू":107," जु":247," जी":146," जे":116," जो":163," ३ ":78," झा":447," खे":215," गण":130," गट":64," खा":206," घर":66," गो":192," ग्":328," गे":212," गु":208," गा":404," चं":92," घे":131," घा":63,"ंट ":79," अं":254,"ेथे":251,"ेथी":97," अप":73,"ेते":395," अन":300," अध":194," अथ":143," आक":124,"ेता":130," अल":139,"ेती":391," अर":236," अम":426," अभ":454," अस":1524," अव":109," अश":180," आग":68,"ेत्":247," आढ":90," आण":837," इं":409,"ेणा":90," अक":80," आं":187,"ेण्":83," अण":68," अत":140,"ेतल":69,"६० ":62,"ंत ":465,"ंड ":204,"ेने":94,"ेन्":114,"ेनि":72,"ंग ":304,"ेब्":104,"ेळा":116," इस":89,"ेल्":528,"ेलि":98,"ेली":343," इत":279,"ेला":618," आह":3959,"ेले":852," आश":86," आर":162," आय":154," आल":243,"ंच ":100," आध":97," आप":129," आफ":117," उल":66," उप":255," उद":168," उत":384,"ेरि":321,"ेरी":87," उच":118,"ंघ ":80," वा":841,"ोत ":71," वि":1279," शत":75," वस":293," व्":476," शर":73," वे":251," वै":84," शब":119," सं":1488," शि":338," शा":307," शह":692," शे":119," श्":176," शो":63," शक":86," वर":325," वन":73,"ोन ":196," ला":353," लि":318," ले":317," लो":622," या":2852," रच":72," यु":309," यो":83," ये":604," रश":94," रा":1893," रि":79," रस":83," रे":153," रु":64," रो":269,"ंद ":67,"ोर ":89," हो":1650," ह्":509," हि":526," ही":801," हा":1542," हे":1892,"ोल ":64," सम":640," सप":137," सर":815," सद":67," सध":64," सत":74," स्":1248," हर":99," हय":132," सो":189," सा":1287," सि":219," सह":142," से":175," सी":112," सु":443," सू":90," दर":173," दृ":63," दु":235," दा":117,"ोक ":70," दि":467," दक":264," त्":1085," तो":143," ते":655," तय":65," तम":233," तर":237," तत":72," ता":327," ति":182,"ोग ":64," ती":172," तु":116," तस":163," ७ ":73," ६ ":68," डि":141," टे":67," ट्":135," टो":93," ५ ":63," ठि":88," ४ ":80," टा":72," मो":576," म्":694," मे":240," मै":64," मू":114," मृ":78," मा":1083," मि":294," रं":86," मु":622," मह":1114," मल":145," मर":508," भौ":66," मन":77," मध":589," मत":139," भू":208," भा":1980," ब्":283," बे":267," बो":108," बा":493," बि":111," मं":217," बह":106," फ्":212," बर":139," 
बद":64," बन":145," फे":128," फा":98," फु":65," फि":73," बच":70," प्":2295," पो":147," पि":143," पा":818," पु":600," बं":158," पे":127," पॉ":75," पृ":96," पू":277," पर":492," पश":206," पह":159," पड":69," पट":81," पद":228," पत":76," न्":105," नो":178," पक":276," नि":731," ना":905," पं":133," ने":304," नव":130," धा":110," नद":157," दे":1033," द्":193," दो":206," धर":205,"ॅरि":66,"ेंट":68,"ेंद":165,"ेंब":371,"८९ ":68," द ":78," इ ":175,"ेल ":174,"८० ":71,"ेर ":66,"ेस ":273,"ेश ":484,"ेव ":85," स ":136,"ेच्":192,"ेचे":150,"ेचा":92,"ेची":66,"ेकड":79,"ेखन":78,"ेक्":200,"ेखक":117,"ेटा":76,"ॉन ":63,"९८ ":66,"९९ ":71,"९६ ":77,"९१ ":71," व ":1440,"ृष्":213,"ृथ्":72,"ृती":81,"ृत्":188,"अंत":115,"ेत ":845,"ेन ":118,"ेट ":229,"ेड ":81,"९९६":63,"ेक ":319,"ेख ":111,"ेच ":240,"आंत":108,"ुष्":79,"ुसर":130,"ुवा":185,"ुळे":148,"ुला":65,"ुलै":113,"ुरा":109,"ुरस":95,"ुरो":111,"ुर्":256,"ुरु":156,"ुरू":78,"ुमा":140,"ुप्":76,"ुनि":101,"ुनी":63,"ुना":87,"ुद्":358,"ुत्":100,"ंच्":531,"ंचा":365,"ंची":306,"ंचे":348,"ंग्":429,"ंगा":332,"ंघट":78,"ंगी":166,"ंघा":242,"ंका":93,"ंख्":149,"ंगल":89,"ंगण":71,"ंडा":180,"ंडळ":71,"ंना":305,"ंनी":632,"ंपै":165,"ंबई":124,"ूर्":429,"ंपा":63,"ंपर":71,"ंपन":86,"ंत्":351,"ंता":140,"ंती":82,"ंतर":401,"ंद्":289,"ंदी":266,"ंदू":116,"ंदा":71,"ंदि":113,"ंदर":105,"ंबर":448,"ंमध":305,"ंस्":364,"ंसा":99,"ुस्":136,"ंवर":89,"ुसा":139,"ंवा":298,"ंशो":74,"ाळी":71,"ाळा":189,"िकन":68,"ाला":601,"ालि":128,"ाली":458,"ालु":126,"ाल्":170,"ाले":279,"ावि":118,"ावी":70,"ावा":374,"ाव्":94,"ावे":148,"ावण":64,"िका":572,"ावर":588,"ावल":82,"िको":101,"िक्":215,"िके":418,"ाषा":187,"ासत":84,"ाषे":285,"ाष्":810,"ाशि":80,"ाशी":94,"िग्":108,"ासक":92,"ाही":306,"ाहि":257,"िचा":69,"ाह्":62,"ुन ":350,"ासि":69,"ासा":570,"ासु":84,"ाहत":75,"ासू":218,"ास्":488,"िजे":63,"ींच":65,"िणे":98,"िता":173,"िती":211,"ित्":984,"िद्":408,"िना":177,"िनि":68,"िनी":230,"िने":334,"िन्":135,"िपी":179,"िभा":112,"ियन":201,"िमे":82,"ियम":83,"िमी":103,"िमा":152,"ियो":66,"िया":833,"िर्":245,"िरी":73,"िरा":70,"िले":194,"िल्":507,"िला":170,"िली":83,"िसर":109,"िष्":172,"िषय":111,"िश्":136,"िशे":76,"िवस":72,"िवा":161,"िवि":81,"ीका":68,"िवड":238,"ीचे":213,"ीची":91,"िहि":83,"िहा":133,"ीचा":113,"ून ":1516,"ीच्":262,"िसे":127,"िस्":228,"ुंब":177,"ीती":162,"ूर ":255,"ीने":102,"ीरा":80,"ीवर":113,"ुक्":227,"ुका":154,"ुख्":276,"ृत ":101,"ीला":83,"ित ":454,"िण ":169,"िन ":167,"िध ":64,"िल ":211,"िळ ":165,"ीक ":118,"ांड":163,"ांच":1386,"ांक":146,"ांग":206,"ांस":157,"ांव":124,"िम ":197,"ांम":292,"ांब":110,"ांप":219,"ांन":910,"ांद":82,"ांध":110,"ांत":540,"िय ":84,"िर ":71,"िश ":190,"िस ":90,"ागा":289,"ीत ":405,"ागर":198,"ाखा":79,"ाक्":63,"ाका":132,"ाकर":80,"िंद":422,"ाडू":108,"िंव":279,"ाठी":839,"िंग":264,"ाटक":140,"ाजा":172,"ाजी":169,"ाज्":867,"ाजध":220,"ाजव":101,"ाचा":713,"ाची":807,"ाचे":1121,"ाच्":1245,"ाजक":91,"ीन ":336,"ाने":765,"ाना":350,"ानि":81,"ानी":337,"ानु":71,"ानल":114,"ानव":85,"ुख ":209,"ाध्":84,"ापन":141,"ान्":247,"ादी":203,"ानं":83,"ानत":66,"ाधि":74,"ाधा":87,"ानच":62,"ानक":91,"ाद्":145,"ाता":198,"ाती":1964,"ातू":202,"ाणे":107,"ाण्":123,"ातल":106,"ाते":315,"ात्":194,"ातो":153,"ील ":3327,"ीय ":1038,"ातं":85,"ीर ":82,"ाणी":153,"ाणा":214,"ारी":709,"ारि":99,"ारा":1018,"ारस":180,"ार्":1216,"ारे":318,"ालय":129,"ामा":334,"ायक":105,"ाम्":126,"ायन":99,"ामी":72,"ारं":64,"ामु":161,"ाया":101,"ारच":84,"ारख":74,"ारक":98,"ारण":233,"ारत":1080,"ाबा":143,"ामध":252,"ामन":110,"ापर":215,"ापू":104,"ापी":79,"ापा":183,"ाप्":78,"ीस ":70,"ाई ":106,"होत":1397,"होण":97,"ह्म":91,"ह्य":743,"ाक ":79,"ाग ":209,"ाद ":178,"ाण ":173,"ात ":2992,"ान ":859,"ाज ":109,"ाच ":165,"ाट 
":100,"ाव ":432,"िक ":1026,"ाळ ":124,"ास ":473,"ाम ":369,"ाल ":196,"ार ":1261,"ाय ":102,"सले":457,"सल्":84,"समु":164,"समा":227,"समू":87,"सम्":80,"सरा":98,"सर्":792,"सप्":140,"सभे":108,"ष्य":98,"ष्ण":139,"ष्ठ":291,"सभा":231,"ष्ट":1050,"सध्":62,"षेत":366,"हत्":160,"सेच":158,"सेन":81,"सें":161,"सीम":75,"सुन":150,"हणू":177,"सुर":133,"सुम":66,"हणत":183,"सून":460,"हणज":210,"सां":148,"साह":175,"सिद":219,"साध":112,"सार":316,"साम":328,"साय":147,"सिक":101,"साव":80,"साल":226,"साग":141,"सिं":119,"साठ":416,"सात":110," १२":89," ११":78," १४":90," १३":81," १६":113," १५":99," १८":311," १७":121," १९":1529," २०":568,"हरा":269,"हर्":68,"स्व":512,"हया":139,"स्य":64,"स्ल":87,"स्थ":759,"स्प":226,"स्ट":559,"स्त":1031,"स्क":382,"सेव":68," १०":100,"हेत":649,"हें":134,"हिल":252,"हिन":68,"हित":204,"हास":245,"हाव":84,"हाय":65,"हार":516,"हाम":139,"हान":148,"हिं":378,"हात":102,"हे ":5167,"षां":107,"सणा":80,"सतो":69,"सते":145,"सता":120,"सत्":153,"षाच":186,"षिण":264,"शेष":68,"शोध":130,"हा ":1574,"ही ":1273,"श्व":169,"श्र":226,"श्य":73,"श्च":243,"शहर":683,"से ":365,"सी ":115,"हर ":423,"संत":74,"शिय":317,"संक":81,"संग":244,"शिव":108,"संख":143,"संघ":417,"संब":106,"संप":91,"संस":355,"संश":79,"शात":171,"शाच":270,"शास":375,"शिक":213,"शां":163,"वेळ":85,"वेल":78,"वेश":67,"वेग":99,"वेद":75,"शतक":63,"वृत":90,"सा ":185,"व्ह":547,"व्य":626,"शब्":122,"वर्":390,"षा ":197,"वरा":67,"वरू":76,"वरी":175,"षी ":90,"वले":122,"वसा":93,"वसल":181,"वस्":189,"सन ":83,"वां":120,"वात":562,"वाद":207,"वान":200,"वाच":234,"वाज":104,"विक":171,"वाल":80,"वास":154,"वाप":201,"वार":452,"वाय":134,"वाम":62,"वित":108,"विन":71,"विद":197,"विध":115,"विज":139,"वाह":173,"विच":82,"विष":207,"विश":185,"वीच":70,"विल":73,"विव":106,"विम":91,"विभ":103,"वीप":110,"वडू":94,"वडण":125,"वणा":82,"वण्":85,"वती":67,"वता":101,"वना":93,"शा ":95,"षण ":107,"शी ":306,"वंश":67,"ळात":171,"ळाड":66,"षक ":73,"ळ्य":142,"शन ":88,"वे ":211,"वा ":653,"वी ":367,"ल्प":162,"ल्य":1022,"ल्ल":251,"ल्व":86,"ल्स":80,"ल्ह":320,"ळना":64,"लेश":64,"लोक":579,"वळ ":98,"लेल":1062,"लेख":355,"लिह":91,"लिश":121,"लिय":136,"लील":65,"लुक":126,"वर ":789,"लां":197,"लाग":85,"लाच":70,"लिं":80,"लाप":66,"लाव":113,"लिक":135,"लास":65,"लिन":79,"लिप":197,"वत ":73,"ळखल":119,"ळे ":200,"ळा ":98,"ळी ":148,"रेस":151,"रोज":164,"रोप":113,"र्श":238,"र्व":1106,"र्स":68,"र्ष":184,"र्म":630,"र्य":552,"र्ल":103,"र्थ":343,"र्द":88,"र्ध":121,"र्न":169,"र्फ":104,"र्ट":116,"र्ड":113,"र्ण":231,"र्त":217,"र्ग":412,"र्क":189,"र्ज":148,"र्च":115,"रीक":109,"रिय":267,"रिल":109,"रिस":109,"रीत":101,"लंड":195,"रीय":372,"रील":210,"रुन":72,"रुप":68,"रुव":148,"रून":145,"रूप":76,"रें":77,"रेक":125,"रेट":62,"रेल":183,"रसा":80,"रसि":236,"रस्":202,"ले ":1654,"लै ":119,"रां":490,"रान":176,"राठ":475,"राट":76,"रात":566,"राण":185,"राज":1474,"राच":427,"रिट":93,"रित":141,"राष":801,"रास":112,"राह":156,"राम":258,"राय":119,"रिक":669,"राव":320,"रलि":71,"ला ":1877,"ररा":122,"रम्":122,"रमा":263,"रमु":208,"रसं":149,"रशि":112,"रशा":97,"रवा":143,"रले":92,"ली ":1229,"रपट":518,"येष":193,"येण":75,"येथ":336,"येत":173,"रदे":228,"योग":175,"युक":84,"युर":149,"युन":102,"युद":90,"याच":1413,"याम":178,"यान":560,"याप":224,"यात":1507,"याद":157,"यास":414,"याव":251,"यिक":66,"याल":203,"यार":164,"रता":731,"रति":74,"रती":443,"रते":65,"रत्":99,"लय ":91,"रणा":308,"रणे":72,"रण्":354,"रचन":71,"यवस":94,"रक्":111,"रजा":126,"रजी":94,"यां":1609,"रज्":107,"रच्":105,"लन ":62,"रे ":511,"महा":885,"महत":141,"यक्":261,"रू ":76,"मले":80,"री ":1362,"मृत":96,"मूह":88,"मुळ":150,"मुल":69,"मुद":185,"र् 
":478,"मुख":511,"रंप":65,"रंथ":84,"मुं":140,"रंग":175,"मिळ":390,"मित":196,"मिन":63,"मार":537,"माल":161,"मिक":114,"माव":85,"माह":78,"माण":213,"मात":178,"मान":677,"माच":73,"माज":179,"मां":206,"मोठ":381,"में":100,"मेर":300,"मेल":94,"यत्":103,"रका":358,"म्र":173,"म्य":205,"म्ह":682,"म्म":64,"रत ":129,"यू ":94,"रण ":190,"या ":6887,"भिन":289,"भाव":89,"भाष":512,"भार":1098,"भाग":467,"रम ":63,"यंत":200,"ये ":856,"मधी":410,"मध्":993,"भेव":86,"मनी":88,"भूत":65,"भूम":113,"मतद":118,"मरा":500,"रा ":574,"भ्य":77,"मा ":144,"बर्":157,"मी ":268,"यन ":212,"मे ":125,"बहु":71,"बिय":93,"बाज":80,"बाब":72,"बार":87,"बाद":98,"बां":115,"मंत":114,"मंद":83,"मंड":86,"यम ":64,"रक ":67,"बेर":65,"बेट":143,"बोध":67,"ब्र":411,"ब्द":139,"प्र":2788,"प्त":110,"प्ट":127,"भा ":239,"मण ":66,"मन ":141,"बच्":64,"यक ":87,"बद्":68,"बनव":83,"फेब":101,"फ्र":309,"पहि":119,"पश्":206,"पल्":82,"बी ":93,"बा ":68,"पर्":325,"परि":145,"परा":78,"परं":68,"�":133,"पैक":208,"पृथ":73,"पुस":66,"पूर":541,"पुण":116,"पुत":63,"पुर":350,"पीठ":86,"बंग":106,"बंध":84,"पास":320,"पाच":82,"पाण":105,"पान":76,"पात":141,"पाद":66,"पार":155,"पाल":72,"पाय":64,"पां":102,"पक्":286,"न्न":159,"न्य":422,"न्म":293,"न्ह":93,"न्स":215,"नोव":141,"पद्":103,"पदा":135,"पनी":76,"पना":113,"पणे":87,"बर ":537,"पत्":186,"पती":143,"पटा":141,"नले":90,"नवी":100,"पी ":206,"नदी":146,"ध्य":1211,"नुस":148,"नीच":95,"निस":85,"नेत":475,"नेश":67,"नेव":132,"नेक":163,"बई ":90,"निव":276,"निर":218,"निय":224,"नास":64,"नाह":82,"नाव":453,"निक":308,"नाय":79,"नाम":119,"नार":159,"नात":134,"नाट":176,"नाड":71,"नाच":211,"नाग":125,"नां":144,"द्द":122,"द्व":204,"द्र":575,"द्य":488,"द्ध":540,"धर्":201,"पट ":338,"देव":241,"देश":972,"देण":76,"दोन":169,"धात":66,"धार":222,"धिक":210,"धान":301,"धील":412,"पर ":71,"नता":99,"नगर":128,"पण ":79,"नच्":79},"n_words":[573395,652939,442285],"name":"mr"}
\ No newline at end of file
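This second deleted profile ("mr", Marathi) has the same shape, so a "freq" map like the ones above can be rebuilt from raw text with a plain n-gram counter. A simplified sketch (langdetect's real extractor also normalizes characters and space-pads word boundaries, which is omitted here):

from collections import Counter

def ngram_counts(text, max_n=3):
    # Count every substring of length 1..max_n, as stored under "freq" above.
    counts = Counter()
    for n in range(1, max_n + 1):
        for i in range(len(text) - n + 1):
            counts[text[i:i + n]] += 1
    return counts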
+++ /dev/null
-{"freq":{"ेश्":122,"ेष्":121,"दन ":955," कम":272," कर":171," कल":142," कि":313,"ेशक":101," का":2467," कृ":130," कु":353," के":400," को":746," क्":668,"ेवा":114,"ेवी":117," एक":2232," एम":84," एस":66," एव":82,"ेही":103," चौ":73," चु":87," ची":65," चि":272," चा":174," छन":436,"थी ":98," जर":83,"तको":395," छो":111," जस":287," १ ":94,"तका":126," छा":110," जन":834," ज्":137," २ ":106," जि":1031," जा":297," जु":184," जी":93," जे":102,"था ":777," छ।":1176," जो":201," ३ ":81," झा":69," गर":1726," खो":93," खे":113," गण":154," गत":96," खा":152," घर":283," गौ":72," गो":262," ग्":152,"ैति":183," गु":176," गा":503," । ":1729,"दछ ":91," चल":134,"णमा":89," चर":93," चन":91,"ोल्":69," अं":111,"ोलि":69,"ोला":99,"ोलन":95," अप":72," अन":738," अध":277," अथ":96," आक":83,"ेता":128," अल":77," अर":427," अम":148," अभ":65,"ोर्":161," अस":323," अव":627,"ेत्":416," अक":100," अग":100," अत":71," अञ":333,"ेपा":2443,"ेन्":518,"ोबा":73,"थल ":101,"ेना":77,"ंग ":129,"ेमा":93," एउ":305,"त् ":79,"थम ":65," इल":78," इन":145,"ेली":90,"ेला":163,"तो ":127," आर":92," आय":68," आन":91," आद":140," आध":94," आफ":173," उह":329,"ेर्":71," उप":369," उन":300," उद":118," उत":328,"ेरि":117," उच":89,"ेरै":136,"ंघ ":125," वा":585," वी":67," वि":1830," वु":93,"्तर":540," व्":234," शर":146,"ं ":520,"ैशा":79," वै":119," शब":115," शु":75,"्तो":97," सं":1338,"्त्":507," शि":206," शा":289,"्तै":88," शह":107,"्थल":136,"्ता":424," सक":211,"्ती":153," श्":338,"ताक":144,"्ति":514,"्तु":122,"्थ्":92,"तान":131,"्दछ":254,"्थि":236,"ताम":82,"्था":656,"तार":119,"ताल":259,"तिक":679,"्थी":106,"ताह":141,"्दो":111,"्द्":640," वट":348,"तिन":112,"तिम":220,"ँ ":497,"तिब":72,"्दा":387,"तिल":64," वर":390,"्दी":92,"्दि":109,"तिर":95,"्दू":82,"तीक":85,"तिह":119," वन":80,"्दै":77,"्दे":123,"्ट्":320," लग":144,"्टी":291,"्टा":74," लल":75,"्डल":215," लु":106," ला":569," लि":251," ले":262,"इ ":252," लो":97,"्डक":81,"्णक":65," या":166," यस":2003," यह":103," यि":70,"धन ":65," यु":243," यो":2991,"्तक":221," रा":1721," रह":505," रे":111," रू":232,"दै ":146," रु":253," रो":152,"्बन":84,"दी ":721,"्बि":65,"्मक":109,"्मन":119,"्यक":669,"्मा":313,"्मि":89,"्मी":132,"्यत":93,"्मे":80,"्यम":169,"्रक":2903,"्यव":169,"्यस":223,"्यह":70,"्रज":90,"्यु":227,"्या":1225," हो":3798,"्धा":79,"्धि":77,"्धु":64,"तया":66," हि":324," हा":314," हु":1822," हे":105,"्ने":717,"्नु":381,"तरी":81,"्ना":84,"तरा":100,"्नो":104," सम":2258," सभ":305," सब":222,"्पन":76," सन":367,"्पत":195," सर":306,"दा ":445," सद":429," सत":75,"तर्":239," स्":1058," हर":182,"्फब":94," सो":117,"्पा":1027," सा":1310," सि":468," सह":215," से":264,"्प्":73," सु":497," सू":97,"दि ":71," दल":430," दर":95," दु":249," दा":223,"ोक ":69," दि":387," दक":141," त्":468," ते":64," था":199," थि":1391," तर":320," तथ":624," ता":221,"्क्":76," ति":156,"ोग ":226,"तमा":143,"्को":75," ती":93,"्का":162,"्कृ":122,"्की":66,"ण्ड":575,"ैनि":66," ठु":72," ठू":98," डा":80,"ँको":90," टे":70,"ैभन":80,"तपु":159," ५ ":87," ठा":328,"तन्":137," ४ ":90,"७१ ":81," मो":133," मे":182,"्जा":70," मा":2199," मि":226," मी":71," मु":393,"्जी":68,"्छ।":1182," मह":623," भो":79," मन":295," भे":81," मध":395,"ैमा":131," भु":80," मण":175," भू":134," भि":128," भा":1679," मज":70," भर":94," ब्":200," भद":64," बे":149," भन":724," बै":74," बो":103,"्ञा":169," बा":593," बि":392," मं":65," बी":67," बु":143," भग":64," बह":81," बस":196,"्टर":85," फ्":93," भए":2250," बर":162,"दल ":217," बन":332," फा":77," बज":106,"तथा":612," बढ":88," प्":5303," पो":110," फल":67," पि":113," पा":1007," पु":600," पे":81," पृ":77," पू":312,"ौं ":109," पर":802," पश":191," पह":651," पछ":122," पन":637," 
पद":103,"णाल":87," पत":3652,"ौँ ":65," न्":72," नै":168," नि":657," ना":582," पं":74,"्चल":413," ने":2763," धे":138,"्छन":169," नय":92," नर":78,"्चि":231," नव":75,"्चा":256," धा":133,"्। ":184," नद":186," दै":64," दे":732," धन":78," द्":298," दो":108," धर":161," नग":197,"थ ":223,"दछ।":89,"द ":562,"ध ":304,"दछन":85,"न ":5506,"ड ":307," छ ":575,"ठ ":146,"थाल":88,"थिए":221,"थाप":321,"ण ":876,"थान":166,"थिय":1145,"त ":3231,"थित":211,"धी ":65,"ज ":416,"दक्":143,"ट ":1241,"धि ":82,"ं":3288,"ः":73,"ँ":2200,"आ":1341,"इ":1404,"अ":4126,"ऋ":113,"ई":1594,"उ":3478,"घ ":163,"ए":6874,"ओ":539,"ऐ":69,"ग":8111,"ख":2732,"क":41084,"औ":111,"छ":5495,"च":3790,"ङ":845,"घ":968,"ट":4637,"ञ":822,"झ":319,"ज":7684,"ठ":1551,"ङ ":292,"ड":2764,"ढ":364,"ण":2637,"त":20032,"थ":4532,"द":12121,"ध":3905,"न":29922,"प":22723,"फ":1251,"्ग ":136,"ब":5465,"भ":7912,"म":22298,"य":17139,"र":42105,"ल":18318,"व":12132,"ष":4104,"श":6654,"ह":13763,"स":20352,"ि":29268,"ा":66742,"े":14995,"थवा":92,"ू":2666,"ृ":815,"ी":11554,"च ":186,"ु":12537,"ौ":1228,"्":49626,"ो":28329,"ै":2713,"।":9235,"०":2064,"१":1785,"्क ":72,"छ ":1403,"६":623,"७":619,"८":678,"९":972,"२":2170,"३":565,"४":638,"५":791,"क ":5557,"ेल ":167,"ग ":763,"ख ":276,"ेर ":327,"त्स":67,"त्व":197,"त्प":68,"त्र":5347,"त्य":758,"त्त":449,"ए ":239,"तीय":69,"द् ":189,"ेस ":70,"उ ":287,"ई ":1278,"ेश ":318,"ै ":1366,"्न ":467,"नै ":335," ख":612," ग":3519," औ":90," क":6153," ओ":97," ऐ":65," ट":271," ज":3597," झ":164," च":1230," छ":2605," घ":502," इ":461," आ":1284," अ":3988," ए":2988," ऋ":77," उ":1784," ई":92,"दार":162,"नो ":211,"दान":85,"दीक":207,"दिर":84,"े ":3604,"दिन":243,"न् ":626,"दुई":79,"ौँम":309,"दुर":326,"ेकप":97,"्म ":374,"ू ":585,"ेका":344,"ेको":828,"्य ":1998,"ेखि":464,"्र ":942,"ेखा":65,"ि ":3525,"नी ":612,"ेटि":110,"ी ":7016,"नु ":137,"ोत्":73,"ु ":1173,"ा ":19102,"ँउ ":227," ८":76," ९":73," ६":100," ७":169," ४":169," ५":165," २":1308," ३":186," ०":277," १":1083," ।":2222,"्व ":234,"ेजी":71,"दस्":318," प":14289," फ":558," न":5247," म":5367," य":5733,"्ष ":268," ब":2996," भ":5836," ढ":70," ठ":526," ड":306,"ह ":271," द":2964," ध":656," त":2303," थ":1702," ह":6806," स":10170," ल":1764," र":4971," श":1531," व":3977,"ने ":1547,"्स ":116,"स ":1459,"ष ":373,"थ्य":109,"श ":446,"्छ ":694,"व ":550,"्च ":74,"दरम":107,"्ट ":232,"ल ":2508,"दलह":69,"नि ":671,"दर्":91,"दलक":97,"ना ":698,"ँग ":83,"र ":6666,"्ड ":120,"य ":3205,"्ठ ":89,"म ":1712,"्ण ":171,"ोखर":85," र ":1581,"ब ":130,"्थ ":94,"फ ":86,"्त ":382,"्ध ":250,"प ":228,"ो। ":2741,"्द ":129,"ृष्":136,"डल ":72,"डी ":201,"डा ":124,"ञ्ज":95,"ञ्च":499,"ृति":122,"ृत्":80,"ञान":153,"ेत ":110,"ठमा":517,"ट्र":339,"ट्ट":87,"ढी ":85,"ेक ":82,"अक्":91,"टिन":116,"अञ्":333,"टीक":104,"ुवा":235,"ूको":117,"ुला":154,"ुलो":69,"ुल्":89,"ुले":184,"ुरा":226,"ुरो":69,"ुर्":237,"ुरी":67,"ुरु":159,"ुरम":86,"ुम्":151,"ुरक":79,"ुमा":298,"णी ":110,"ँदै":80,"ुभय":124,"णा ":67,"डलक":101,"ुभए":420,"ुप्":75,"ुपा":81,"ुपम":101,"ुन्":1483,"ुनै":129,"ुने":151,"ुनु":515,"ुनि":176,"ुना":77,"ुद्":231,"्सा":87,"्से":64,"्ष्":65,"ँमा":361,"्स्":64,"ठाउ":291,"ठूल":104,"्ला":1132,"्लो":92,"्ले":68,"ंचा":97,"्रद":218,"्रथ":82,"्रत":719,"्रण":90,"्रप":72,"्रन":77,"्ये":172,"्रध":142,"्रम":518,"्यो":226,"ंग्":118,"्रय":179,"्रव":165,"्रश":109,"्रह":151,"्रस":389,"्रा":928,"ंगा":83,"्रि":4096,"्री":584,"ठुल":72,"्रै":117,"्रे":422,"्रो":266,"्षक":74,"ीहर":197,"्षर":74,"ंगठ":68,"्षे":409,"्षि":335,"्षा":151,"्षी":92,"्वक":82,"ुग्":67,"्वत":106,"्वर":184,"्वव":84,"्वम":76,"्वय":164,"्वी":175,"्वा":1398,"डौँ":365,"डौं":74,"ूला":68,"ूलो":82,"ति ":1256,"ता ":557,"णको":70,"तु 
":113,"ूमि":65,"ूमा":80,"ती ":337,"ूर्":396,"ूपम":142,"तै ":115,"ते ":88,"ूद्":136,"ंमा":238,"ंस्":240,"तर ":294,"ुस्":101,"ुसा":127,"ुहु":425,"ंवि":168,"ाला":279,"ालि":277,"ाली":1573,"ाल्":175,"ाले":372,"छ। ":2152,"ावि":101,"ाशन":833,"िकृ":90,"िका":4480,"ाशक":884,"िको":207,"िक्":271,"ाषा":1077,"ासन":94,"ासद":191,"ाष्":296,"ाशि":795,"ासक":88,"ाही":67,"ाहि":278,"ाहा":83,"ुन ":257,"ासि":356,"ासी":103,"ाहर":380,"ास्":236,"जा ":203,"चलक":256,"िज्":95,"चर्":65,"िजय":135,"जी ":92,"ुर ":568,"ङ्ग":239,"ितप":65,"छन्":559,"िता":146,"िति":411,"ित्":493,"िद्":368,"िधि":130,"िधा":249,"िना":260,"िनि":160,"िनी":149,"िने":242,"िनु":78,"िन्":1022,"चीन":87,"िभि":96,"िभा":91,"ियन":73,"िम्":132,"ियम":114,"िमा":914,"ियो":1315,"िया":360,"िर्":332,"िरा":176,"िले":173,"िल्":1069,"िलो":573,"चित":254,"िला":204,"चाय":66,"चार":351,"चाल":100,"चिव":79,"चिन":100,"चिम":183,"िष्":174,"िश्":356,"िशे":71,"िवा":232,"ीका":189,"ीको":490,"िहा":169,"िस्":142,"ूल ":110,"ीति":294,"च्च":109,"जमा":145,"ीद्":108,"छोर":78,"जन्":230,"जनक":98,"जनत":72,"जधा":122,"जनव":107,"जनै":185,"जनी":264,"जना":168,"जदु":67,"ीमा":357,"ीया":65,"ुक्":205,"ुको":176,"ुका":171,"ुख्":130,"ृत ":152,"ीले":86,"ीला":83,"जस्":157,"जसल":64,"ित ":1545,"िण ":71,"जिल":993,"जिक":146,"जार":186,"जान":79,"जात":246,"जर्":77,"जयी":89,"ाँउ":242,"ाँच":67,"ाँक":137,"िन ":363,"जवा":81,"ाइन":178,"िल ":92,"जोड":89,"ाउँ":218,"ाउं":237,"ाउन":366,"ीक ":78,"ज्य":351,"ाँस":67,"ांग":75,"ज्ञ":223,"ाँल":66,"िम ":169,"िय ":243,"िर ":148,"जील":69,"जीव":121,"ाएक":100,"जुन":124,"टर ":108,"िव ":68,"िस ":86,"ाग्":194,"ागु":64,"ागि":189,"ीत ":128,"ागर":78,"ागम":84,"ाको":3606,"ाक्":157,"ाका":303,"ाओव":397,"ाडी":141,"िंह":72,"ाठम":478,"ाटन":82,"टी ":233,"टा ":741,"ाजा":131,"ाजि":146,"ाज्":244,"ाजन":456,"ाजध":126,"ाजव":83,"ीन ":153,"ाचन":103,"ाङ्":108,"ुई ":73,"ाने":97,"ाना":169,"ानि":283,"ानी":322,"ानु":103,"ानव":65,"ुख ":149,"ानस":161,"ानम":146,"ाध्":103,"ापन":228,"ान्":702,"ानो":110,"ादे":93,"ादी":738,"ादु":180,"ादन":914,"ानक":94,"ाद्":158,"ाति":178,"ाता":140,"ाती":68,"ाण्":134,"ात्":408,"ादक":75,"ीय ":427,"ाडौ":440,"ारी":407,"ारि":163,"ारा":1342,"ार्":2184,"ारे":69,"ालक":765,"ालम":256,"ालय":324,"ालद":78,"ामा":889,"ायक":72,"ाम्":197,"ायण":151,"ायत":156,"ाया":71,"ारक":244,"ारम":182,"ारण":184,"ारत":376,"ाबा":106,"िएक":373,"ामक":80,"ामय":74,"ामम":66,"ापा":186,"ाप्":232,"ुङ ":64,"ंघ":232,"ँस":83,"ौ ":65,"ंख":88,"ंग":634,"ंक":193,"ँल":78,"ंच":134,"केन":240,"ँद":154,"ँड":80,"ँम":363,"ंस":389,"ंह":86,"ंव":201,"् ":1181,"ंत":94,"ंम":241,"केह":83,"ँक":178,"ो ":20357,"ँग":166,"ँच":76,"ँउ":243,"कृत":264,"अस":341,"अव":627,"आक":84,"अल":82,"अर":461,"अभ":65,"आए":70,"अम":151,"घर ":251,"अप":74,"अध":281,"अन":771,"अथ":96,"इत":80,"ाइ ":156,"इए":81,"आय":68,"आर":95,"आफ":174,"कृष":151,"आद":141,"आध":94,"आन":112,"ाई ":1080,"अं":111,"अञ":333,"अत":71,"अक":101,"अग":107,"उम":66,"उप":377,"उह":331,"ऋत":72,"इन":366,"इर":70,"इल":137,"इस":102,"उँ":251,"उं":238,"उक":79,"उच":90,"उट":311,"उत":340,"हो।":2612,"उन":717,"उद":141,"कोट":86,"एउ":305,"एक":5494,"एम":113,"एप":66,"क्क":84,"क्त":425,"क्य":129,"क्न":67,"क्ष":1267,"क्र":426,"क्स":110,"ाँ ":310,"ए।":67,"एव":82,"एर":147,"एस":80,"गर":2258,"गल":135,"गव":76,"खे":215,"गन":66,"खो":112,"गम":263,"ख्":345,"खा":431,"गठ":95,"खि":580,"खी":82,"खु":123,"ाग ":123,"गढ":68,"गण":181,"गत":326,"गको":91,"खर":150,"क्":2722,"खम":75,"गक":127,"कै":214,"के":654,"को":14452,"कि":691,"की":474,"का":12832,"कृ":425,"कु":618,"कू":91,"कस":66,"कव":78,"कह":90,"कल":273,"कम":415,"कर":463,"कप":247,"कन":76,"कत":154,"ाङ ":105,"कक":112,"ओव":398,"चु":124,"ची":172,"चि":731,"चा":795,"छन":717,"चौ":90,"च्":216,"चो":89,"० 
":345,"जक":89,"चन":324,"ङ्":397,"चर":129,"चल":599,"घा":108,"गण्":66,"ङम":69,"गते":76,"। ":7045,"गा":955,"गी":218,"गि":310,"गु":302,"गो":351,"गौ":84,"ग्":809,"गे":161,"गै":75,"घर":335,"टन":160,"ञ्":596,"ञा":169,"टा":992,"टर":235,"४ ":302,"झा":105,"३ ":263,"टक":135,"जो":225,"जे":208,"छ।":2479,"जी":387,"जु":351,"जा":939,"जि":1330,"२ ":297,"ज्":670,"जन":1499,"खान":74,"जद":90,"जध":126,"छि":255,"छा":205,"जस":334,"जव":93,"१ ":351,"जर":120,"जल":76,"गठन":79,"छो":118,"जय":160,"जम":205,"ठा":393,"डक":133,"ाथ ":102,"५ ":351,"ठन":83,"टे":203,"ठम":522,"ट्":498,"टो":148,"टी":408,"टि":258,"डा":364,"डि":270,"डी":265,"डल":256,"६ ":313,"ाद ":282,"ठु":73,"ठू":104,"ढी":114,"ढा":72,"ाण ":70,"णक":119,"७ ":328,"ड्":123,"डौ":474,"डो":86,"डे":171,"णि":132,"णी":170,"णा":196,"ात ":197,"तक":594,"८ ":352,"णम":93,"तव":80,"९ ":264,"तह":110,"ति":2878,"ता":1722,"तु":292,"ती":680,"तथ":624,"तप":208,"तन":233,"तम":226,"ण्":631,"तय":73,"तल":91,"तर":935,"थव":101,"दछ":271,"था":1584,"थी":140,"थि":1694,"ते":197,"तै":125,"तो":181,"थम":91,"त्":7154,"थल":165,"खेल":103,"दक":289,"दस":347,"दू":145,"दु":584,"दी":1175,"दि":689,"दा":996,"दन":1011,"दव":74,"दल":480,"दर":324,"दम":71,"थ्":174,"नज":74,"धा":907,"नत":94,"नद":266,"धी":98,"धि":828,"ान ":852,"धु":156,"दो":255,"दौ":70,"द्":2831,"दे":1366,"धन":183,"दै":256,"नक":650,"नग":337,"धर":218,"नर":137,"नल":223,"नव":289,"ाज ":178,"धे":169,"नन":100,"नप":96,"नब":64,"नय":150,"नम":477,"ध्":787,"पं":75,"नी":1179,"नु":1548,"ने":4740,"नस":329,"नह":97,"ना":2286,"नि":2559,"पक":159,"नो":316,"नै":540,"न्":7416,"पत":4116,"पन":1101,"पद":120,"न।":118,"पछ":277,"पट":80,"पश":213,"पह":665,"पस":130,"पल":132,"पम":272,"पर":1026,"पे":166,"पू":427,"पृ":79,"पा":5213,"पि":272,"पी":141,"पु":1182,"फल":131,"फर":70,"प्":6004,"फब":94,"पो":196,"बन":449,"फे":70,"बत":86,"फु":76,"बढ":88,"फू":67,"फा":131,"बज":120,"फ्":316,"भए":2692,"बर":285,"बल":88,"भक":78,"भग":82,"बस":232,"बह":195,"बि":521,"बा":1458,"बु":218,"मं":75,"बी":142,"भद":79,"भन":878,"बे":211,"गरम":92,"बै":272,"बो":141,"गरप":94,"भय":191,"भर":147,"गरे":321,"ब्":486,"मक":339,"गरी":134,"गरि":502,"मग":77,"मज":101,"भि":375,"भा":2323,"मत":163,"मण":318,"भू":158,"भु":153,"गर्":821,"मन":572,"भे":108,"मध":481,"मद":72,"भो":89,"मप":65,"मल":187,"यक":819,"मम":151,"भ्":133,"मय":212,"मर":128,"मस":126,"मह":673,"मृ":68,"यत":299,"मू":114,"यण":158,"ख्य":245,"यद":91,"मि":1113,"मा":10729,"ाट ":785,"मु":840,"मी":348,"रं":73,"मो":273,"यम":527,"म्":2421,"मे":530,"यन":231,"यप":89,"मै":88,"यव":183,"रख":100,"रग":116,"रक":3696,"यल":80,"यर":101,"या":2168,"रज":106,"यह":189,"रच":84,"गमा":170,"यस":2268,"रध":173,"रद":310,"रथ":102,"रत":1236,"रण":521,"यु":657,"यी":175,"यि":191,"रय":191,"रम":1296,"रभ":77,"रब":131,"यो":5061,"रप":226,"रन":174,"ये":226,"लम":462,"लय":351,"लब":81,"लद":123,"लन":243,"गाउ":130,"लच":65,"लग":235,"लक":1487,"र्":6868,"रो":703,"रै":297,"रे":1245,"गाँ":224,"रू":1254,"री":1735,"रु":1451,"रि":5597,"रा":5311,"रह":868,"रस":574,"रश":114,"रव":321,"रल":193,"वं":88,"िक ":1742,"ल्":1854,"लो":1078,"चन ":69,"ले":2083,"लु":247,"ली":1940,"लि":1013,"ला":3708,"लल":168,"लह":138,"लस":69,"शब":122,"वै":233,"शन":968,"वे":271,"षक":102,"शम":116,"व्":325,"शर":176,"वह":113,"वव":90,"वस":368,"वु":106,"वा":3817,"वि":2520,"वी":434,"वप":72,"वन":350,"वध":419,"शक":1037,"वल":144,"वर":826,"वय":169,"वम":117,"वक":170,"वत":227,"वट":383,"ाह ":84,"सन":601,"षे":417,"सप":72,"सभ":377,"सब":248,"ष्":905,"सम":2673,"सर":429,"सल":312,"सव":65,"हक":127,"सच":90,"षा":1291,"षि":415,"षी":120,"िङ ":81,"ास 
":619,"सत":83,"सद":673,"शे":152,"श्":1168,"षर":77,"सग":64,"सक":2028,"शह":143,"सं":1482,"शी":150,"शु":136,"शा":630,"शि":1111,"सँ":111,"षण":89,"हे":701,"हु":2366,"ही":279,"हि":1570,"हा":1720,"िए ":165,"ाम ":560,"ह्":87,"हो":3850,"से":478,"हन":162,"सु":586,"सी":345,"हत":108,"सू":105,"सि":1173,"चल ":150,"सा":2163,"सह":275,"हज":68,"हल":85,"स्":3576,"हर":1967,"सै":149,"सो":307,"ात":1181,"ाथ":320,"ाण":404,"ाठ":517,"िं":132,"ाड":760,"ाल ":1016,"ाट":1019,"ाब":231,"ाभ":93,"ाप":949,"ान":3357,"ाद":2589,"ाध":265,"गुर":71,"िख":71,"ाव":435,"िक":7298,"ाल":5422,"ार":6852,"ाय":812,"ाम":2329,"िए":681,"िज":372,"ाह":1078,"िच":143,"िङ":184,"ास":1934,"ाष":1391,"ाश":2631,"िग":115,"ां":267,"ाँ":1104,"ाइ":698,"ाई":1298,"ाउ":964,"ाओ":403,"ाक":4316,"ाए":185,"गुन":74,"ाच":265,"ाज":1659,"ाग":1013,"ाख":288,"ाङ":244,"ाघ":70,"ार ":1305,"ुँ":124,"ीद":163,"गिर":76,"ुई":88,"ीत":508,"ीप":148,"ीन":313,"ीम":461,"ीय":520,"ीब":79,"ील":304,"ीर":227,"ुग":188,"ुख":347,"ीव":147,"गीत":100,"ुक":691,"ीह":209,"ीस":78,"ुङ":145,"ुट":214,"िट":172,"ाय ":70,"िण":163,"ित":2827,"िद":484,"गाय":88,"िध":410,"िन":2481,"िप":245,"िब":181,"िभ":260,"िम":1517,"िर":1038,"िय":2231,"िल":2254,"ीक":872,"िश":533,"िव":468,"िस":542,"िष":301,"िह":253,"ेव":427,"ेश":820,"ेल":681,"सला":171,"ेर":844,"ेम":230,"ेब":84,"ेप":2592,"सले":93,"ेन":806,"ैत":257,"ैज":79,"ेह":196,"ेस":274,"ेष":206,"ैश":81,"ैर":64,"ैल":110,"ैभ":84,"ैम":134,"ैन":181,"ेक":1516,"ेख":723,"ेट":234,"ेड":98,"ेत":801,"ेद":151,"ेग":72,"ेज":215,"समु":85,"समि":387,"समा":459,"ृथ":65,"ृत":389,"ृष":194,"गोर":127,"सरक":167,"गोल":71,"समे":69,"सम्":1386,"ुत":130,"ुण":70,"ुन":2984,"ुद":383,"ुब":76,"ुप":397,"ुर":1722,"ुम":556,"ुभ":589,"ूक":155,"ग्ल":85,"ुल":730,"ुष":88,"ुस":363,"ुव":409,"ग्र":367,"ुश":84,"ुह":538,"सर्":105,"ग्न":160,"ूद":147,"ून":67,"ूप":214,"ूम":171,"ूर":453,"ूल":388,"ूह":74,"सबै":187,"्व":2939,"्श":82,"्ष":1678,"्स":579,"्ह":81,"्भ":100,"्म":1441,"्य":5693,"्र":13669,"्ल":1547,"्।":480,"समय":115,"ष्म":69,"ष्ण":122,"ष्ठ":171,"ष्ट":483,"सभा":365,"सद्":160,"ौत":91,"ोर":513,"ोल":515,"सदस":314,"ोब":119,"ोम":148,"ोस":122,"ोह":153,"सदर":107,"ोश":104,"ोव":69,"्ण":427,"्त":3287,"्ड":711,"्ट":1245,"्ठ":192,"्ञ":223,"्फ":210,"्ब":412,"्प":1700,"्ध":702,"्न":1912,"षेत":396,"्थ":1433,"ो।":3741,"्द":2288,"ौल":71,"ौर":128,"्ज":453,"्छ":2152,"्च":1134,"्ग":579,"्ख":113,"्क":866,"सन्":376,"ोज":238,"ौं":159,"ोड":185,"ोट":228,"ौँ":401,"ोद":66,"ोत":169,"ोप":186,"ोध":78,"ोन":98,"ोख":109,"ोक":287,"ोग":453,"हत्":91,"सेन":118,"छन ":99,"सुन":123,"सिर":66,"सिम":95,"सिन":119,"सुर":172,"साह":147,"सिद":138,"सान":163,"साद":218,"साप":131,"सार":311,"साम":357,"सिक":403,"साल":247,"सिं":91,"साथ":94,"सहर":108," १८":152," १९":516," २०":1012,"०६":103,"०४":114,"०५":217,"०२":500,"०३":93,"००":262,"०१":255,"१०":116,"१५":81,"१७":82,"१८":226,"१२":66,"१४":88,"१९":555,"२०":1126," ७१":72,"हरू":762,"हरु":794,"हरि":77,"ङमा":65,"७१":90,"स्व":373,"९०":64,"छि ":189,"९६":77,"९५":70,"९८":66,"९१":112,"स्र":74,"स्य":402,"स्न":77,"स्थ":1019,"स्प":226,"स्ट":105,"स्त":727,"९९":85,"स्क":351,"२३":69,"२२":66,"२५":83,"२७":122,"२६":80,"२९":70,"२८":115,"सोज":81,"सेव":93," १०":76," ०५":134,"हेक":471,"हेन":82,"जन ":98,"हुन":2172,"हुँ":68,"हिम":124,"हिल":691,"हिन":285,"हित":161,"हास":142,"हिक":125,"हाल":183,"हार":140,"हान":72,"हाद":225,"हाड":113,"हाँ":430,"चना":64,"चन्":135,"सचि":83,"सकि":79,"सका":115,"सको":1596,"सक्":148,"कै ":184,"का ":4257,"कि ":84,"षाक":932,"की ":394,"षिण":127,"हो ":1122,"षिक":187,"ओवा":396,"शेष":71,"हा ":81,"ही ":218,"श्व":419,"श्र":376,"श्य":86,"श्च":215,"शहर":129,"सी ":177,"हर 
":124,"संग":215,"संख":67,"संघ":224,"संव":197,"संस":296,"शाख":89,"शित":796,"शाह":92,"शास":187,"सँग":107,"शिक":120,"सो ":69,"शार":66,"वैश":69,"वेश":81,"वुल":80,"शर्":111,"सा ":76,"व्य":273,"शब्":114,"वर्":460,"षा ":211,"किन":120,"काम":260,"कार":2072,"काल":377,"किक":104,"काश":2575,"कास":405,"किस":96,"कुन":166,"कुर":99,"कुम":159,"ववि":79,"शको":75,"कान":97,"काठ":489,"काक":1844,"वस्":272,"वहा":72,"सन ":103,"गि ":141,"वाद":803,"वान":150,"वाच":132,"विक":476,"वाल":112,"वास":237,"कला":91,"वार":1366,"वाम":64,"वित":78,"विन":80,"विद":224,"विध":297,"विज":192,"वाह":64,"गी ":84,"विष":89,"विश":380,"विस":90,"विम":83,"विर":102,"विभ":130,"वीर":70,"कम्":158,"कमा":129,"कर्":161,"गा ":93,"करण":105,"कपु":78,"कपा":103,"वधि":404,"कता":70,"वमा":91,"गर ":180,"वयम":124,"सं ":192,"शी ":67,"गत ":162,"खि ":378,"वटा":364,"खी ":68,"खा ":132,"को ":13998,"शन ":893,"वि ":102,"एमा":79,"वा ":501,"वी ":188,"ल्न":81,"ल्प":107,"ल्य":159,"ल्ल":1131,"एवं":67,"एकी":88,"एका":367,"एकि":98,"एको":2830,"लोक":92,"लेख":158,"शक ":887,"लेक":101,"लेट":101,"लिम":70,"लुम":71,"वर ":134,"लाई":880,"लाइ":94,"लाग":378,"लाक":490,"लाम":331,"लाल":114,"लिक":155,"लाह":236,"लिङ":98,"लित":141,"लिन":92,"वन ":146,"लहर":130,"एउट":304,"ललि":72,"लमा":361,"लद्":81,"लका":156,"वं ":67,"लगा":129,"लको":1159,"लक्":65,"रैम":78,"रेस":80,"रेष":101,"रोप":82,"रोग":68,"र्श":68,"र्व":558,"र्स":92,"र्ष":409,"र्म":544,"र्य":1351,"र्थ":286,"र्द":429,"र्न":750,"र्फ":154,"र्ट":316,"र्ण":301,"र्त":237,"र्ग":285,"र्ख":68,"र्क":192,"र्ज":198,"र्छ":112,"र्च":149,"रीक":121,"रिव":148,"रिय":408,"रीम":87,"रीय":210,"रुक":171,"रुम":97,"रुप":191,"रुल":94,"रूक":141,"रूद":136,"रूप":207,"रूम":82,"रूल":87,"रेक":307,"रेज":78,"रेल":67,"रेर":97,"रेन":80,"रसा":265,"रसि":109,"रहे":467,"रहर":171,"रस्":71,"ले ":1440,"राई":127,"रान":448,"राप":77,"रात":83,"राण":115,"राख":96,"राज":1244,"राक":104,"रिन":282,"रित":70,"राष":293,"राह":72,"राम":287,"रिए":176,"राय":214,"राल":136,"रिक":3842,"लो ":852,"रला":76,"लि ":69,"ला ":807,"रयो":160,"रम्":75,"रमा":715,"रमु":241,"रवा":142,"रले":64,"ली ":1594,"रधा":161,"येक":79,"रदे":155,"रमण":71,"यो।":1126,"रबा":70,"योग":340,"रपा":108,"युक":77,"युर":75,"युन":112,"युद":113,"याङ":69,"याक":103,"याम":150,"यान":128,"याप":111,"यात":120,"याद":68,"यास":84,"यिक":111,"याल":253,"यार":124,"याय":70,"यिन":65,"रति":595,"रत्":114,"रथम":70,"लय ":258,"युव":76,"रतक":192,"रणा":86,"यसक":1478,"यसै":79,"यसल":145,"रगत":68,"यवस":100,"रक्":100,"रको":493,"रजा":74,"याँ":127,"यहा":98,"यस्":93,"लन ":125,"रे ":127,"महे":70,"महा":221,"महि":223,"महत":77,"यको":226,"यक्":275,"रू ":452,"रु ":592,"यका":195,"मले":67,"री ":1056,"मुद":90,"मुख":298,"मुक":206,"मिल":98,"मित":526,"मिन":83,"मार":349,"माल":263,"मिक":100,"रो ":251,"मास":338,"मिट":74,"माण":135,"माड":449,"माध":70,"माथ":111,"मात":198,"मान":717,"माओ":399,"माक":73,"माज":262,"रै ":187,"मोर":76,"यन्":64,"यद्":67,"मेत":64,"मेर":107,"मेल":71,"रका":2994,"यमि":88,"ऋतु":72,"म्र":98,"म्य":162,"यमा":180,"म्प":1201,"म्ब":311,"म्म":335,"रत ":106,"मको":161,"रण ":320,"या ":441,"उहा":325,"यी ":158,"भिन":116,"भित":82,"भाव":75,"यो ":3433,"भास":199,"भाष":1089,"भार":394,"भाग":184,"उपत":120,"रम ":92,"मजद":66,"ये ":122,"मध्":444,"मना":80,"मन्":256,"भूम":78,"मण्":216,"मती":69,"मयि":71,"रा ":1447,"०० ":67,"रि ":116,"ममा":144,"मा ":6826,"बर्":135,"यत ":118,"भक्":69,"मी ":185,"यन ":118,"बस्":105,"बहा":133,"बाल":77,"बाह":65,"बास":89,"बिन":70,"बाट":745,"बाग":79,"बार":117,"बिह":66,"यम ":171,"बेल":114,"यस ":334,"भनि":212,"भने":219,"भन्":420,"बैभ":80,"भयो":166,"ब्र":111,"ब्य":100,"ब्द":140,"एर ":132,"प्य":67,"प्र":5545,"प्त":252,"भा ":133,"मन ":82,"बजा":98,"मय ":66,"यक 
":69,"बने":91,"बना":154,"बन्":175,"एक ":2017,"भएक":2580,"फ्न":128,"फ्र":99,"आदि":87,"पहि":540,"पहा":109,"पश्":183,"आन्":94,"पर्":460,"आफ्":129,"इएक":72,"२०२":474,"परि":275,"२०१":229,"परा":119,"२००":176,"पमा":255,"�":311,"फबा":94,"पोख":90,"उन ":91,"पृथ":65,"पुस":117,"पूर":347,"पुग":79,"पुर":744,"पित":80,"पाक":119,"पान":108,"पात":131,"पाद":966,"पार":420,"पाल":2675,"पाइ":114,"बै ":101,"अथव":91,"१९१":90,"१९९":68,"१९६":68,"अधि":111,"अनि":120,"अनु":226,"अध्":166,"अन्":365,"न्च":76,"न्छ":1928,"न्त":1150,"न्ट":89,"न्ध":228,"न्न":383,"न्थ":109,"न्द":1479,"न्य":282,"न्म":233,"न्स":147,"न्।":473,"नैत":179,"अमे":66,"अरू":140,"अर्":230,"अवस":167,"अवध":415,"पनी":71,"पना":230,"पनि":614,"पन्":133,"असो":81,"अस्":166,"पत्":3819,"पति":105,"पता":130,"पछि":199,"पा ":229,"नले":102,"नया":92,"नवा":108,"नन्":74,"०२७":93,"०२८":90,"नदी":180,"धेर":139,"नमा":333,"ध्य":728,"नुह":426,"उटा":296,"नुप":72,"नुभ":561,"नुस":135,"निस":101,"निष":82,"नीत":296,"नेप":2459,"नेत":159,"नेक":221,"फल ":80,"उनी":101,"उनु":99,"उने":183,"उनल":77,"उद्":70,"उनक":91,"नस्":108,"निर":312,"निय":235,"निम":64,"निन":391,"उत्":329,"निध":76,"नाल":106,"निक":384,"नाम":395,"नार":232,"नाथ":91,"नाक":103,"नाउ":100,"नाइ":96,"द्द":65,"द्व":1093,"द्र":692,"द्य":285,"द्ध":407,"धर्":124,"नकप":71,"इने":87,"इन्":241,"नका":130,"नको":327,"इला":81,"देख":530,"देव":249,"देश":451,"दैन":81,"१८ ":75,"दोल":117,"२७ ":96,"धित":72,"धिम":418,"धार":244,"धिक":138,"धान":515,"२८ ":93,"नता":76,"नगर":305,"न। ":91,"उंम":222},"n_words":[614665,704688,490631],"name":"ne"}
\ No newline at end of file
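Note on the removed files: each is a single minified language-profile object in which "freq" maps 1- to 3-character grams to their corpus counts, "n_words" appears to hold the total gram count per order (unigram, bigram, trigram), and "name" is the language code ("ne" above; "nl" and "no" follow). As a minimal sketch of how such a profile could be consumed, assuming that reading of "n_words" and using hypothetical helper names (load_profile, score) rather than any real detector's API, a naive-Bayes-style scorer looks like this:

import json
import math
from collections import Counter

def ngrams(text, n):
    # All overlapping character n-grams of length n.
    return (text[i:i + n] for i in range(len(text) - n + 1))

def load_profile(path):
    # Hypothetical loader: one profile file is a single minified JSON
    # object of the form {"freq": {...}, "n_words": [...], "name": "..."}.
    with open(path, encoding="utf-8") as fh:
        return json.load(fh)

def score(text, profile, smoothing=0.5):
    # Sum of log-likelihoods of the text's 1- to 3-grams under one profile,
    # assuming n_words[n - 1] is the total number of n-grams in the corpus;
    # add-constant smoothing covers grams the profile never saw.
    total = 0.0
    for n in (1, 2, 3):
        denom = profile["n_words"][n - 1] + smoothing
        for gram, count in Counter(ngrams(text, n)).items():
            p = (profile["freq"].get(gram, 0) + smoothing) / denom
            total += count * math.log(p)
    return total

# Usage sketch: the profile with the highest score wins.
# profiles = [load_profile(p) for p in ("ne", "nl", "no")]
# best = max(profiles, key=lambda prof: score(sample_text, prof))["name"]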
+++ /dev/null
-{"freq":{"D":295391,"E":101171,"F":127232,"G":105530,"A":221658,"B":178586,"C":193400,"L":124657,"M":162943,"N":137196,"O":83395,"H":199830,"I":105198,"J":61744,"K":84088,"U":22509,"T":111990,"W":76478,"V":96433,"P":145276,"S":218863,"R":114676,"Y":11382,"Z":56681,"f":358313,"g":1325585,"d":2549536,"e":9521882,"b":675678,"c":885504,"a":4428747,"n":5174765,"o":3044475,"l":2174413,"m":1301830,"j":487381,"k":957853,"h":1157193,"i":4146613,"w":618312,"v":1136367,"u":1226136,"t":3691802,"s":3058116,"r":3420928,"q":14380,"p":897097,"z":331215,"y":211813,"x":48411,"²":22665,"ï":7308,"í":7570,"ë":58538,"é":54151,"è":10784,"ä":6605,"á":12476,"ü":12035,"ö":12215,"ô":9257,"ó":9442," l":108039," m":271385," n":113728," o":359305," h":429647," i":863484," j":55299," k":151715," d":1022256," e":865604," f":60480," g":362770,"р":6363," a":271750," b":268848," c":85027," z":129440," u":151127," t":287237," w":287907," v":736662," p":253369," s":272741," r":136806," J":58839," K":80686," H":197094," I":88140," N":131673," O":75874," L":120300," M":156963," B":171895," C":177462," A":209281," F":122253," G":98641," D":289943," E":96010," Z":55679," Y":11046,"и":8389,"о":8805,"н":6323," S":205129," R":108525," P":136936,"а":10448," W":74232," V":87489," U":20421,"е":6745," T":105528," é":7785,"A ":9463,"Da":14497,"Cu":5824,"Cl":7287,"Co":57452,"Cr":6279,"Ce":10546,"Ch":35435,"Ci":5986,"Du":33850,"Do":13572,"Dr":5649,"De":198026,"Di":16644,"Fe":6923,"Fa":6211,"Eu":11420,"Er":8327,"En":17423,"El":7917,"Ee":19326,"Ge":21620,"Ga":13267,"I ":14576,"Fr":73223,"Fo":9924,"Fi":11740,"C ":12680,"Au":12206,"Ar":23539,"Ba":28977,"Am":55987,"An":22709,"Al":28901,"Bu":10850,"Br":34564,"Ca":38690,"Bi":11873,"Be":44722,"Bo":26862,"Bl":5807,"Kr":9527,"Ko":15418,"Le":21847,"Li":19176,"La":31976,"Lu":12150,"Lo":28450,"Me":24563,"Mi":26609,"Ma":59234,"Mu":7133,"Mo":28002,"Ni":14152,"Ne":59633,"Na":17889,"No":27858,"Ol":9544,"Gi":5965,"Gr":28322,"Go":11925,"Gu":7667,"Ha":30031,"He":95269,"II":10036,"Hi":30742,"Ho":27037,"Hu":6810,"In":31628,"Is":5913,"It":15844,"Ja":19607,"Je":8912,"Jo":19365,"Ju":6270,"Ka":21546,"Ki":7967,"Ke":10121,"Un":6926,"Tu":6976,"Tr":10760,"Ts":9146,"To":14361,"Th":19966,"Ti":10118,"Te":13194,"Ta":10688,"V ":6513,"St":35202,"Su":9985,"Wo":6733,"Wi":21345,"Wa":15248,"We":22896,"Vo":12732,"Vi":16383,"Vl":8595,"Va":15790,"Ve":22438,"Pr":15997,"S ":6945,"Pe":17479,"Pa":35566,"Po":30054,"Pi":14013,"Oo":13070,"Op":6344,"Or":9483,"Se":15433,"Sc":16133,"Si":16384,"Sh":6081,"Sl":6946,"Sp":19561,"So":16434,"Ru":16114,"Sa":42083,"Re":17106,"Ri":17272,"Rh":6524,"Ro":32707,"Ra":12538,"b ":17772,"a ":223995,"Yo":6270,"Ze":12835,"Zi":6830,"Zu":10609,"Zw":13009,"i ":117137,"gd":23906,"ge":622849,"ga":53344,"fl":6356,"fg":6573,"ff":17069,"fi":41042,"fs":14314,"fr":11929,"fu":6506,"ft":32454,"fo":24009,"j ":85100,"he":517783,"ha":140501,"gn":22691,"gl":11647,"gi":125274,"gh":17920,"gg":9680,"gu":24589,"gt":35908,"gs":36659,"gr":72480,"go":35093,"dt":42741,"du":30903,"dw":10868,"g ":209248,"ea":48028,"eb":81350,"ec":86712,"ed":198243,"de":1265409,"dd":20989,"dg":6934,"di":281697,"dh":8510,"dk":6604,"do":126396,"ds":99321,"dr":53877,"ew":31588,"ex":17157,"eu":69138,"ev":102331,"ey":15758,"ez":66338,"fa":30179,"h ":79224,"fd":18426,"fe":38002,"eh":28491,"eg":168269,"ef":50847,"ee":952339,"el":605553,"ek":125240,"ei":156611,"ep":103855,"eo":27881,"en":2016971,"em":320296,"et":590121,"es":318427,"er":1223846,"ca":46321,"e 
":2100607,"bs":5841,"br":58403,"bu":44669,"bo":58794,"bl":25851,"bi":87840,"bb":9494,"be":276525,"da":122637,"f ":90198,"cu":23421,"ct":84409,"cr":11311,"co":70369,"ck":28358,"cl":14976,"ci":82756,"ch":396490,"ce":87968,"c ":16305,"az":12546,"ay":18941,"ba":76157,"d ":427738,"at":401880,"as":187038,"ar":458993,"aw":6737,"av":29958,"au":65384,"ak":138064,"al":362702,"ai":53889,"aj":7035,"ap":71362,"am":172784,"an":1131186,"ac":99694,"ad":102907,"aa":662152,"ab":30531,"ag":85506,"ah":10878,"ae":23635,"af":47867,"nu":31030,"nt":435391,"ns":284103,"nr":14733,"no":95599,"nn":67675,"nz":11234,"ny":9083,"nw":88548,"nv":13925,"oe":163750,"of":83411,"oc":46361,"od":56125,"oa":14423,"ob":29747,"om":159575,"on":522696,"ok":52068,"ol":157894,"oi":33928,"og":70046,"oh":14048,"ot":108603,"m²":22643,"os":81239,"ov":88551,"ou":131520,"op":166634,"oo":370075,"or":483257,"r ":577062,"ow":27555,"oz":6517,"oy":5912,"pe":152921,"pg":12730,"pa":116725,"pl":132966,"po":60048,"ph":12783,"pi":48220,"lo":99408,"lm":24809,"ll":128927,"ls":119057,"lp":13038,"lv":15042,"lu":37402,"lt":126691,"ly":17100,"o ":155926,"md":19425,"ma":206926,"mb":60844,"me":492457,"mi":91034,"mm":32963,"mp":51654,"ië":52728,"mo":56705,"mt":19679,"ms":35581,"mu":25483,"my":6479,"p ":141470,"na":201890,"nb":21559,"nc":69092,"nd":473575,"ne":270200,"nf":9979,"ng":259192,"nh":15116,"ni":168878,"nk":43137,"nl":16899,"nm":6930,"jv":11516,"ju":22953,"js":16655,"jn":70736,"jo":13034,"jk":115620,"ki":53710,"kh":5862,"ke":182363,"ka":118671,"m ":155936,"kw":7713,"ks":46927,"kt":131352,"ku":22530,"ko":60158,"kr":23259,"kk":20620,"kl":29172,"km":27830,"kn":6593,"li":338485,"lh":5934,"lk":34257,"le":302236,"ld":95337,"lg":36498,"lf":20122,"la":398196,"lc":8500,"lb":22338,"n ":2543334,"hr":30422,"ht":107458,"hu":34327,"hi":99792,"hn":9649,"ho":105347,"id":125660,"ic":175476,"ib":14735,"ia":95834,"ig":142126,"if":16130,"ie":477304,"k ":193827,"ir":54494,"is":722822,"it":333187,"iu":13498,"iv":35131,"iw":8456,"ix":5720,"ij":364327,"ik":93516,"il":169468,"im":40230,"in":879621,"io":144997,"ip":26974,"je":41083,"jd":31240,"jf":10910,"iz":17029,"l ":299819,"ja":41826,"xi":7194,"z ":13752,"wi":58960,"wn":7387,"wo":161887,"ws":8197,"y ":100628,"wa":155603,"wd":6545,"we":181101,"vl":34784,"ré":7748,"vi":112804,"vr":17131,"vo":180923,"uz":15014,"ux":8237,"uw":44437,"uv":7894,"uu":52115,"ve":243007,"va":525240,"x ":20927,"ui":271759,"uk":10265,"ul":55545,"ue":25713,"ug":30034,"ur":165500,"us":143221,"ut":46184,"um":51805,"un":100952,"up":10064,"ty":46646,"tz":8930,"tu":100531,"tt":45242,"tw":33429,"tv":9249,"ub":30475,"ua":33926,"ud":45500,"uc":29841,"w ":28312,"to":174226,"tm":7826,"tl":14300,"ts":211260,"tr":144811,"tg":20068,"te":770637,"tk":6386,"tj":6074,"ti":257739,"th":76245,"v ":8380,"tb":23716,"ta":261366,"su":28639,"sv":9064,"ss":138251,"st":562933,"sy":14252,"sz":5928,"sw":6865,"sl":43368,"sk":33909,"sn":9526,"sm":15035,"sp":74738,"so":59040,"sr":6378,"sd":20593,"sc":205927,"se":425364,"sh":26990,"sg":6293,"sj":16338,"si":98258,"rz":20638,"u ":30645,"sa":44187,"sb":15979,"rr":71892,"rs":228779,"rt":187734,"ru":73560,"rv":44462,"rw":18490,"ry":14720,"rp":27084,"ro":286166,"rn":60686,"né":6017,"rm":70366,"rl":117376,"rk":74650,"ri":419079,"rh":16037,"rg":84498,"rf":11818,"re":392586,"rd":257870,"rc":25722,"rb":33548,"ra":285112,"t ":1445388,"qu":12987,"s 
":1176543,"pt":24055,"pu":20157,"pp":39364,"pr":99751,"ps":14478,"zi":116688,"ze":78298,"za":31109,"zu":15798,"zo":49094,"ye":6127,"yc":6275,"ya":8170,"ys":18578,"yr":9415,"yp":6946,"yn":9021,"ym":12848,"yl":8352,"² ":22661,"éé":7636,"én":13925,"ë ":43502,"é ":8856,"一":6916," Ga":13216," Ge":21536," Fo":9883," Fr":73181," Fi":11695," Ha":29997," He":95105," Go":11879," Gr":28204," Gu":7630," Gi":5928," Hu":6795," Ho":26988," II":6497," Hi":30725," Je":8879," Ja":19556," Is":5895," It":15839," In":31514," Ka":21414," Ke":10042," Ki":7899," Jo":19317," Ju":6264," La":31862," Le":21646," Li":18941," Ko":15401," Kr":9517," Ma":59034," Mi":26529," Me":24479," Lo":28411," Lu":12123," Ne":59527," Na":17815," Ni":14127," Mo":27929," Mu":7091," Am":55975," An":22679," Al":28811," Ba":28861," Au":12111," Ar":23451," Be":44588," Bi":11811," Bl":5783," Bo":26752," Br":34502," Bu":10803," Ca":38284," Ce":10523," Ci":5893," Ch":35359," Cl":7183," Cr":6209," Co":57213," Cu":5704," Da":14480," Di":16596," De":197691," Do":13329," Du":33818," El":7895," Ee":19260," Er":8309," En":17358," Eu":11409," Fe":6899," Fa":6133," Wo":6660," Wi":21285," We":22829," Wa":15180," Zu":10598," Zw":13007," Ze":12821," Zi":6778," Yo":6266," Or":9466," Oo":13036," Op":6330," Po":29977," Pi":14002," Pe":16009," Pa":35403," No":27810," Ol":9537," Ra":12376," Ro":32636," Re":17067," Ri":17238," Rh":6521," Pr":15952," Su":9973," St":34839," Ta":10649," Th":19903," Ti":10088," Te":13125," Tr":10698," Ts":9134," To":14233," Ru":16099," Sa":42032," Sh":6028," Si":16291," Sc":16056," Se":15382," So":16378," Sp":19500," Sl":6935," Va":15750," Ve":22303," Vi":16318," Vl":8585," Vo":12506," Tu":6926," Un":6865," ja":26885," in":468125," is":385652," ka":34673," ki":8773," ke":15418," ju":21208," ha":22481," he":342958," gr":38760," go":7687," hi":19625," ho":31679," hu":11200," ni":13445," ne":8341," na":55514," mu":11657," mo":19517," ok":9997," om":21439," on":84624," of":50488," nu":5724," no":28282," le":24338," li":33990," la":36262," ku":10447," km":25898," kl":14578," kr":8283," ko":24113," me":105531," mi":19475," ma":109683," lo":6645," af":19870," aa":54537," ac":15329," an":19082," ap":11762," al":65830," au":17140," ar":47665," ba":14963," bi":41311," be":166033," bo":19486," bl":6588," bu":9669," br":10013," ca":7354," e ":8358," er":11018," et":7087," en":320059," ei":19716," el":9897," ee":469095," fe":12855," fa":15977," fo":5946," fi":16445," ge":303157," ga":6353," co":32521," ce":11923," ch":6021," ci":15694," da":51374," do":72769," dr":11539," de":740739," di":128156," du":9386," zo":26839," zu":11699," za":9390," ze":21870," zi":55419," ru":5958," sa":9420," se":22896," sc":26028," si":12277," sl":6445," sp":31376," so":18785," ra":8521," re":91305," ri":13940," ro":16553," pr":57908," s ":10712," ou":7125," ov":18421," oo":33765," op":114061," or":11383," pe":20368," pa":24621," pl":113070," po":21982," wa":118659," we":90321," wo":59080," wi":19128," va":481040," ve":76969," vo":131543," vr":12217," vi":22015," vl":9586," tw":15968," tu":14723," ui":144553," ta":9836," sy":6615," st":111138," su":9964," tr":11805," to":58623," th":14839," ti":18789," te":138599," éé":7588,"Eur":9786,"Eng":13833,"Fra":63542,"II ":7320,"Hij":21364,"Het":68660,"Her":7011,"Gri":6704,"Gra":6848,"Gro":9812,"Ind":7590,"In ":14751,"Hon":6999,"Hol":6098,"Bar":6393,"Alp":5725,"Ame":44932,"Ams":6535,"Ant":6487,"Cal":8797,"Car":7946,"Ber":10327,"Bel":16135,"Bra":7962,"Bri":11166,"Bou":5646,"De 
":168413,"Dez":5967,"Dee":6161,"Chi":8065,"Cen":7181,"Cha":14656,"Cor":5822,"Com":7136,"Con":6541,"Cou":26493,"Een":16908,"Dui":29012,"Ned":43974,"Nat":6561,"New":6417,"Nor":10502,"Noo":11547,"Oly":6017,"Oos":10737,"Per":5867,"Par":11966,"Poo":9123,"Ita":15181,"Joh":7769,"Lan":8583,"Man":6806,"Mar":24247,"Mon":9837,"Mid":7429,"Wil":9135,"Wer":5760,"Wes":9343,"Vla":7545,"Ze ":6085,"Sta":16762,"Sin":5858,"Spa":11230,"Rus":11905,"Sai":9817,"Sch":12674,"San":8826,"Rij":7380,"Rom":5676,"Ver":12912,"Uni":5818,"The":11210,"Tsj":8380,"bis":7734,"bin":13224,"bij":36202,"bli":10301,"bla":6463,"boe":8360,"bon":5829,"bor":10260,"bou":11481,"bbe":6915,"ban":15026,"bal":21186,"baa":7632,"bas":9398,"bar":8019,"beh":12808,"beg":7886,"bee":12385,"bed":18175,"ber":72483,"ben":9807,"bel":14825,"bek":19147,"bev":17462,"bes":52050,"bet":13589,"bie":17101,"ca ":12312,"car":7029,"cat":5698,"ce ":27218,"bri":6013,"bro":7591,"bra":10008,"bre":7181,"bru":26538,"bur":21107,"bum":8070,"am ":47277,"ake":12657,"al ":74743,"ail":5969,"ain":18706,"air":10562,"ais":6111,"ak ":11574,"agt":10140,"agn":7735,"anu":16559,"ano":8039,"ann":16248,"ant":55265,"ans":143853,"ane":9267,"ang":46635,"ani":27973,"ank":15316,"ap ":25134,"ana":20565,"anc":16651,"and":178626,"amm":8429,"amp":20053,"ams":9681,"ami":28047,"ame":31772,"amb":6788,"ama":9230,"alv":6355,"alt":35976,"als":49851,"all":30537,"ali":53705,"alc":5747,"ald":14462,"ale":38623,"ala":12213,"alb":11148,"an ":555423,"aks":6554,"akt":91851,"abe":6263,"abi":6431,"ae ":14725,"aaf":9902,"aag":19367,"aad":5831,"aak":81870,"aan":153725,"aal":41110,"aam":28835,"aas":6793,"aar":125046,"aat":183677,"ad ":41911,"afs":8757,"age":22392,"adi":11143,"ade":19021,"ag ":18180,"ach":45489,"ace":13447,"ada":5737,"af ":10774,"act":17739,"at ":119635,"are":27820,"ard":36005,"arc":10579,"arb":9145,"ara":15993,"aro":16161,"arn":8226,"arm":7787,"arl":11777,"ark":11271,"ari":63150,"arr":46599,"ars":14646,"art":89778,"au ":9748,"asi":6239,"ase":7343,"ar ":70080,"apa":7428,"app":11428,"apr":10161,"as ":104628,"aut":12900,"avi":8222,"ave":10618,"ay ":8013,"ata":7505,"ast":27312,"ass":19856,"ato":8717,"ate":41676,"ati":71342,"ath":9399,"att":6300,"ats":113866,"atu":9691,"aug":10264,"Zwe":6862,"Zui":9741,"jec":10400,"jk ":66346,"jaa":10547,"jar":7927,"jan":13106,"jd ":7717,"je ":16212,"jde":16692,"jns":7518,"js ":7214,"jn ":50205,"jks":11628,"jke":29064,"itt":7963,"its":47572,"ity":14389,"iss":57247,"ist":78828,"ita":16110,"ite":28930,"itg":14085,"iti":26080,"ium":6105,"ivi":15930,"ive":12826,"is ":431822,"ion":45782,"ir ":7395,"isi":11995,"ish":6473,"ise":13845,"isc":89426,"isa":6882,"ire":19679,"it ":148319,"iwo":7073,"ize":7008,"kin":23405,"ki ":12463,"kee":5719,"kel":22383,"ken":67439,"ker":29057,"ke ":43164,"kt ":89985,"kse":13533,"kri":8432,"km²":22590,"kor":8329,"kon":7670,"kom":21374,"ks ":13043,"kke":14372,"kle":13907,"kla":8538,"jul":9669,"jun":10013,"jve":8311,"jst":5685,"kan":26234,"kam":11248,"kaa":47537,"ka ":11569,"ham":9052,"han":20214,"hap":31501,"hal":11321,"har":17692,"haa":16281,"had":6694,"he ":78504,"hel":11001,"hei":31175,"hee":28005,"het":305470,"her":18043,"hen":9035,"hem":8872,"hie":14941,"hin":11368,"hil":13952,"hij":15137,"his":12860,"gne":9405,"gna":8227,"gon":7132,"gsd":8056,"gro":30942,"gra":24600,"gt ":25727,"gri":8059,"gre":7606,"gst":8398,"gus":11645,"ial":9076,"ian":13694,"iat":6807,"iaa":21077,"id ":39722,"ia ":31791,"iet":21274,"ieu":11845,"iev":7556,"iel":12150,"iem":5824,"ien":50882,"ier":48258,"ies":24371,"ied":23758,"ief":9097,"iek":35699,"ig 
":26250,"ict":38543,"icu":11018,"ico":6343,"ici":9264,"ich":59319,"ice":15194,"ie ":201388,"ica":19828,"idi":13237,"ide":28170,"idd":13665,"ida":12788,"il ":19641,"ijd":30969,"ije":8022,"ijf":10866,"ijk":115115,"ijn":70104,"ijs":16040,"ijv":11367,"im ":7764,"ika":50912,"igd":11510,"ige":49060,"igh":9499,"igi":9202,"igt":11964,"ign":9284,"ij ":80452,"ik ":10619,"ime":7803,"inc":33586,"ind":41153,"ina":22287,"inn":14337,"ino":8487,"int":30127,"ins":19772,"ine":33694,"ing":150853,"ini":14846,"ink":10381,"ioe":9616,"inw":77340,"ikk":7357,"ike":6423,"ila":17614,"in ":405294,"ikt":10786,"ilo":7646,"ill":44186,"ilm":12944,"ili":39663,"ild":8065,"ile":5842,"ima":5906,"io ":71745,"hol":9679,"hou":16715,"hoo":29440,"hor":9181,"hoe":5979,"hui":10622,"hts":6548,"hth":9745,"hti":8381,"hte":21726,"hre":7796,"hri":17051,"ht ":49854,"hum":5764,"ffe":6190,"ffi":6818,"feb":9777,"fen":6207,"fam":16628,"fde":6413,"eze":22604,"ezi":32218,"etb":16830,"eta":11379,"ete":33498,"eti":8945,"eth":6048,"esp":11543,"est":111009,"ess":14653,"etr":9667,"ets":6968,"ett":13669,"ew ":6999,"eve":54896,"eva":7494,"evo":21109,"evi":14484,"euw":17939,"eur":21284,"ewe":9375,"ewo":5810,"ey ":9860,"epe":6552,"er ":333095,"epa":50447,"eor":9695,"es ":93024,"ept":12657,"erk":43542,"erl":61155,"eri":98353,"erg":28574,"erh":11977,"ere":86052,"erd":117197,"era":27762,"erb":15858,"et ":468269,"esl":15451,"esi":12100,"esc":18281,"ese":16953,"erz":13202,"erv":34773,"erw":12350,"err":17024,"ert":36463,"ers":189637,"ern":29282,"erm":20928,"erp":10525,"ero":16315,"eks":13762,"ekt":9522,"en ":1373884,"ela":20546,"eld":60315,"elf":9763,"ele":68456,"eli":59882,"elg":17311,"elk":7255,"ell":29565,"elo":7411,"els":48151,"elt":76311,"emb":35339,"ema":13115,"eme":213429,"emd":11991,"emi":9436,"ep ":11271,"ene":28524,"enh":7989,"eng":11580,"enb":15906,"ena":20359,"end":77370,"enc":8130,"eno":20895,"enn":18048,"enk":9463,"enl":7298,"eni":29460,"enw":7353,"ens":81592,"ent":262064,"enr":11987,"ego":6580,"ege":50512,"egi":74199,"eho":15902,"ek ":28609,"eis":7935,"eil":21113,"ein":30892,"eid":39967,"eig":6260,"el ":169543,"eiz":7508,"eit":10871,"eke":48960,"em ":17335,"gis":18136,"gin":20458,"gio":63608,"gie":9618,"gge":6516,"gep":6253,"gen":119618,"geo":6626,"get":5657,"ger":43727,"ges":41787,"gev":26798,"gew":8731,"gez":31827,"gee":9960,"ged":10354,"geb":49406,"geh":10722,"gem":117467,"gel":56245,"gek":5735,"gde":10792,"ge ":59115,"gd ":7871,"gaa":11487,"gan":14683,"ft ":25649,"for":15464,"fic":10500,"fie":5840,"fil":14418,"da ":11415,"de ":684651,"daa":13978,"dag":8773,"dae":7793,"dat":38507,"dan":11068,"dam":12288,"dde":16386,"cti":18928,"cte":10141,"cus":11312,"clu":9316,"co ":6846,"con":15186,"com":22591,"ct ":40340,"cea":6367,"ch ":51967,"ces":6110,"cen":17525,"cem":10124,"cha":49925,"cia":10705,"ck ":10471,"cie":34119,"che":90373,"chi":44310,"cho":18169,"cht":101737,"chr":21384,"cit":10702,"ed ":30096,"eba":5684,"ebe":6342,"ebi":14653,"ebo":12775,"ebr":30709,"eau":7092,"ei ":13562,"eft":21131,"eek":11355,"een":586471,"eel":149087,"eem":7130,"eef":23903,"eed":19633,"ees":22270,"eer":87604,"eeu":10006,"eet":5732,"edi":16531,"ede":93412,"eda":9229,"eg ":11274,"eds":11135,"edo":6698,"edr":18455,"ech":31918,"eci":6720,"ece":11419,"ee ":19989,"ef ":10385,"ect":16877,"eco":6380,"dwe":6219,"dor":15099,"doo":63254,"don":8072,"dom":7397,"ds ":31717,"dië":7942,"doc":6254,"doe":8162,"dst":15322,"dui":6443,"duc":8131,"dri":17764,"dra":19458,"dt 
":39781,"dro":6043,"dsc":11185,"dse":26450,"dic":13497,"dia":8442,"der":192741,"des":13390,"dez":10434,"dec":12054,"dee":111320,"del":33701,"den":135340,"dep":43953,"di ":6337,"do ":6228,"din":21044,"dio":7262,"dis":79203,"dit":10793,"die":85548,"dig":22881,"rha":6318,"rga":14305,"ri ":27658,"rgi":6243,"rge":20743,"ret":8087,"res":26172,"rev":8098,"rdt":29462,"rg ":25954,"rea":7679,"ree":20672,"rec":16838,"red":11064,"rei":15261,"reg":76191,"ren":91873,"rek":7399,"rel":18716,"rda":12870,"rdo":6848,"rdi":16416,"rde":80930,"re ":58121,"rch":10326,"rd ":94954,"ras":7988,"rat":17581,"rbi":10377,"rbe":8163,"rag":7326,"ran":87841,"ram":11943,"ral":17891,"raa":43729,"rad":10061,"rac":17496,"rs ":114967,"rpe":6605,"ros":6652,"rot":12676,"rom":9424,"ron":74395,"roo":21289,"rop":13868,"rou":11473,"rov":34172,"rod":10405,"roc":11802,"rol":11209,"roe":27621,"rog":7078,"rno":7457,"rp ":10130,"rna":17955,"rne":14489,"rni":6410,"rmo":6164,"ro ":8067,"rma":25601,"rme":14051,"rlo":10187,"rli":43352,"rle":10232,"rla":47955,"rn ":7785,"rko":8091,"rke":15492,"rm ":8899,"rip":6607,"rio":7162,"rit":17759,"ris":26171,"riv":7770,"rig":10543,"rij":60217,"ril":12296,"rik":53920,"rin":39022,"ria":15453,"ric":60714,"rid":8288,"rie":42534,"rk ":23012,"rwe":7689,"rwi":6024,"rui":23623,"rug":6639,"rum":7291,"rus":7171,"rva":8120,"rve":6455,"rvl":18128,"rvo":6709,"ry ":9896,"rsi":7125,"rso":10142,"rsp":13174,"rsc":16580,"rse":16920,"rta":7455,"rst":29072,"rto":9036,"rte":64198,"rth":8072,"rti":18904,"rua":9937,"rts":6565,"rt ":52830,"rro":41654,"rri":6371,"rre":14498,"sam":7637,"sat":5699,"shi":8681,"sje":10417,"sie":18702,"sit":8909,"sis":17171,"sin":18433,"sig":6448,"sdi":9330,"se ":245903,"sch":188901,"sco":5845,"ser":23257,"sh ":6367,"sei":5692,"see":12427,"sep":12185,"sen":46205,"sem":40984,"sel":17905,"spo":12356,"spr":14426,"spe":34342,"spi":6360,"son":16537,"soo":14382,"st ":68174,"sla":28873,"ski":9665,"ske":6724,"sme":5826,"sse":83731,"ssa":7327,"sso":6523,"ssi":25931,"ste":153650,"sta":134037,"sto":24680,"sti":36773,"stu":49771,"str":76216,"sus":9037,"tai":6439,"tal":42563,"taa":79421,"tad":33209,"tba":18995,"tat":25594,"tar":9576,"tan":24920,"te ":245946,"ta ":14225,"pe ":6304,"par":65094,"paa":13416,"pan":11481,"pge":11800,"pec":7014,"pen":27399,"per":54393,"pes":8975,"pee":13839,"pel":26450,"pla":118636,"ple":6523,"pij":6479,"pio":11106,"pis":8712,"por":10136,"poo":7190,"pon":6968,"pol":16543,"ppe":30477,"pub":6405,"pte":13585,"pri":21984,"pre":14246,"pro":59321,"que":5834,"ra ":17662,"ngr":7754,"ngt":6149,"ngs":24028,"ni ":14308,"nge":85205,"nga":9441,"nha":5738,"nel":9354,"nen":38308,"nem":7134,"ner":92669,"net":10080,"nes":17246,"ng ":100478,"nee":12320,"nci":34445,"nce":13701,"nch":8196,"ne ":58920,"nbu":5898,"ndr":9897,"nds":53318,"ndo":10563,"ndi":65506,"nde":170234,"nda":12836,"nal":19401,"nam":19197,"nan":5995,"nad":6211,"naa":50268,"nbe":7893,"nd ":125333,"nat":22785,"na ":37268,"nwo":81256,"ny ":6026,"num":6116,"nua":11671,"nty":25239,"ntw":10449,"nto":19506,"nts":8743,"ntr":16443,"nti":22690,"nta":19603,"nte":152583,"nsu":9180,"nst":31383,"nse":133099,"nsc":14189,"nri":8599,"nt ":140657,"ns ":66627,"noe":11549,"noo":19104,"nom":9568,"nov":12875,"nne":42144,"nna":5969,"nni":9362,"nië":12014,"nla":8754,"no ":10798,"nke":14949,"nkr":6212,"nig":15471,"nie":28378,"nic":7027,"nia":7786,"nk ":5831,"niv":5902,"nis":35078,"nin":21782,"ogr":8544,"ogi":8951,"oge":15686,"oiw":7018,"ois":5765,"oir":6590,"ok ":26496,"ol ":12318,"och":15236,"ock":8061,"ode":16327,"ods":9122,"of 
":51317,"oek":14333,"oel":8698,"oem":15572,"oeg":10025,"oer":17651,"oet":21912,"oen":24849,"oep":14118,"odu":7865,"oed":12901,"og ":18782,"off":7132,"ofd":11478,"oal":5868,"od ":7190,"obe":14031,"own":6969,"oud":23948,"oth":6889,"ote":16152,"ott":8284,"ots":8865,"oto":6753,"ost":29293,"ota":7234,"ose":6693,"ovi":34676,"ouw":19904,"ove":41666,"oun":29232,"ous":5891,"our":16621,"out":8877,"opp":20304,"ope":17398,"opg":12331,"os ":12938,"oon":17563,"ool":15370,"oom":10898,"ook":25159,"ooi":6071,"oof":13207,"oog":15636,"ood":7845,"or ":123656,"oot":15244,"oos":12524,"oor":222103,"oop":6797,"ork":12244,"orl":12130,"orm":37127,"orn":13425,"orp":13450,"ord":94351,"ore":21033,"org":20784,"ori":18821,"ou ":6777,"ort":47654,"ors":16087,"ot ":44568,"m² ":22639,"ora":14333,"ola":5947,"old":6867,"on ":90823,"oli":23091,"oll":13695,"olk":19645,"ole":13281,"olg":11475,"ols":13292,"olo":16700,"om ":28845,"okt":10174,"ona":29547,"ond":144852,"one":97393,"ong":27845,"oni":32623,"onn":9985,"ono":10092,"ons":14560,"ont":39227,"oma":10713,"ome":31512,"omb":6304,"omi":12146,"omm":12297,"omp":15805,"omt":12605,"oms":11302,"op ":79508,"la ":16552,"le ":71663,"lf ":5640,"lde":29685,"laa":121868,"lac":19544,"lad":5854,"lag":14890,"lai":7568,"lak":20347,"lan":133760,"lar":5948,"lat":13164,"las":13782,"ld ":39055,"lbu":9341,"kun":16374,"kwa":5842,"kte":24985,"kst":6852,"kto":10642,"lpe":6305,"ls ":61450,"lon":9738,"lom":7994,"loo":10759,"lor":5860,"loe":7778,"log":16741,"los":5784,"lië":7455,"lub":9599,"lst":25217,"lte":6994,"lse":20318,"lt ":100197,"lge":14115,"lgi":13560,"li ":13675,"lev":15162,"les":15192,"let":7945,"ler":20832,"lem":10140,"len":59840,"lei":20020,"leg":19493,"lee":17156,"led":8570,"lec":6605,"lo ":6322,"lla":18593,"lle":62567,"lli":22272,"llo":6485,"lks":5894,"lki":11362,"lm ":10134,"ll ":9099,"lit":19124,"lis":19817,"lip":6344,"lin":41169,"lic":12334,"lia":22468,"lij":95486,"lig":34482,"lie":40870,"ma ":13279,"maa":102890,"mar":12201,"mal":14307,"man":28303,"mat":12974,"md ":13491,"mbe":34190,"me ":20464,"med":7669,"mee":138208,"met":69732,"mes":6603,"mer":72670,"mel":11918,"men":146532,"mei":13406,"lve":5791,"lym":7416,"mpi":18269,"mpe":6753,"mpo":7375,"ms ":12121,"moe":5824,"mon":14109,"mt ":13658,"mst":14316,"muz":8520,"min":20688,"mil":24349,"mis":8333,"mit":6248,"mig":6106,"mie":5852,"mid":8678,"ië ":43280,"mma":6643,"mme":17999,"zui":10334,"zee":6519,"zet":5935,"zen":14435,"zel":8377,"ze ":24809,"zan":8216,"zoo":6798,"zon":12147,"zoe":7835,"zie":38234,"zic":12949,"zij":41913,"yst":6749,"ys ":5784,"ymp":7552,"wn ":6464,"wod":7070,"wor":46676,"woo":12376,"won":83488,"woi":6978,"wes":10367,"wer":76690,"wet":5976,"wen":7541,"wel":18559,"weg":14286,"wee":29661,"wit":7604,"win":7741,"wij":18720,"wat":9640,"war":13183,"was":80249,"waa":29854,"vro":6819,"vil":13878,"vin":41497,"vie":14100,"vis":16850,"vla":21113,"vli":6207,"voe":25269,"vol":30369,"von":6589,"voo":92340,"vor":15339,"ver":131519,"ven":47801,"vem":9979,"vel":12213,"vee":14817,"ve ":13676,"val":33641,"van":455616,"vat":5892,"vaa":8390,"uzi":9073,"uwe":13657,"uwd":6413,"uur":48711,"usi":5945,"use":7901,"ust":24814,"uss":30621,"ute":13524,"uw ":14870,"uto":6696,"us ":63461,"ut ":6738,"ure":14305,"urg":25394,"uri":10844,"url":29792,"uro":10683,"urt":8342,"ur ":31687,"umb":6654,"unt":32613,"uns":8285,"uni":16955,"und":14286,"um ":25713,"ult":7652,"uli":14819,"ule":6125,"ula":6204,"un ":7353,"uid":29632,"uik":15052,"uis":15533,"uit":185570,"ugu":11498,"ude":20442,"udi":6485,"ue ":5767,"uch":9124,"ub ":8370,"uar":23535,"ubl":7698,"ty 
":41728,"tur":6970,"tus":27504,"tuu":41013,"tud":6157,"twi":6118,"twe":21595,"ts ":120548,"tre":20553,"tra":31398,"tri":58554,"tru":9236,"tro":22667,"tse":38669,"tsc":8839,"tst":17796,"tte":25599,"to ":11397,"toe":15554,"tob":10290,"tot":36145,"tow":6284,"ton":34234,"tor":20776,"tij":29417,"tie":72965,"tig":17350,"tit":9886,"tis":23784,"tin":21180,"tio":32273,"thu":8037,"tic":17261,"tle":5983,"tem":61681,"ten":116036,"tei":13819,"tek":11255,"tel":112278,"tee":16372,"teg":10988,"ted":9251,"th ":11037,"teu":8413,"tes":8744,"ter":133625,"tge":17358,"tho":11376,"the":28515,"tha":9247,"én ":8357,"één":7621},"n_words":[56157687,65372177,47614417],"name":"nl"}
\ No newline at end of file
+++ /dev/null
-{"freq":{"D":67298,"E":31132,"F":46085,"G":30699,"A":51244,"B":53160,"C":43132,"L":37752,"M":50816,"N":44173,"O":29040,"H":46630,"I":28485,"J":19280,"K":42898,"U":16316,"T":46770,"W":15084,"V":29381,"Q":2727,"P":33476,"S":98250,"R":33889,"Y":5384,"X":3613,"Z":4254,"f":354544,"g":680290,"d":751947,"e":2892960,"b":291896,"c":77603,"a":1296256,"n":1640496,"o":1106000,"l":1037936,"m":608425,"j":148253,"k":716879,"h":226424,"i":1317183,"w":19861,"v":434834,"u":407647,"t":1430364,"s":1268259,"r":1753294,"q":3203,"p":350711,"z":18142,"y":203712,"x":10440,"²":2235,"Å":2838,"Ø":5129,"í":2375,"é":5770,"æ":27321,"å":157055,"ä":7866,"á":3631,"à":2467,"ü":2355,"ø":174400,"ö":5464,"ó":2058," l":79574," m":127350," n":66294," o":215607," h":91447," i":286253," j":14786," k":126841," d":163994," e":389283," f":217023," g":55928,"р":2932," a":164625,"с":2244," b":141242," c":8452," u":55940," t":125598," v":100006," p":120765," s":286186," r":49569," J":19242," K":42735," H":46471," I":28413," N":43960," O":28901," L":37540," M":50591," B":52872," C":42604," A":51010," F":45901," G":30503," D":67077," E":31023,"л":2294," Z":4215,"к":2313," Y":5370," X":3570,"и":3772,"о":4246,"н":2997," S":97781,"в":2674," R":33770," Q":2713,"а":4944," P":33276," W":14969," V":29296,"е":3204," U":16274," T":46489," å":24460," ø":14695," Å":2834," Ø":5122,"A ":7964,"Da":7415,"Co":15888,"Ch":7594,"Do":3375,"Dr":2174,"De":44977,"Di":3865,"Fe":2733,"Fa":3197,"Eu":2611,"Et":2524,"En":8728,"El":2986,"Ge":3683,"Ga":4272,"I ":9052,"Fy":2932,"Fr":8731,"Fo":9701,"Fl":2786,"Fj":2610,"Fi":5958,"C ":3097,"Au":3599,"Ar":6717,"As":3221,"Ba":9331,"Am":2967,"An":7679,"Al":7567,"By":3184,"Bu":3866,"Br":7415,"Ca":9043,"Bi":3113,"Be":11885,"Bo":6452,"Bl":2411,"Ku":2113,"Kr":5137,"Ko":7893,"Le":5322,"Li":8435,"La":9531,"Lu":3278,"Lo":6840,"Me":7383,"Mi":7116,"Ma":15908,"Mu":2812,"Mo":8218,"Ni":3496,"Ne":6662,"Na":6162,"No":20842,"Ol":2369,"Gr":7570,"Go":2542,"Gu":4480,"Ha":15522,"He":10062,"Hi":2665,"Ho":7455,"Hu":4099,"In":7421,"Is":2952,"Ir":2049,"Ja":4954,"L ":8381,"Je":2911,"Jo":5435,"Ju":2288,"Ka":7574,"M ":2289,"Ki":8846,"Ke":2144,"Un":4807,"W ":3224,"Ty":3546,"Tr":8937,"To":6250,"Th":6517,"Ti":3470,"Te":5634,"Ta":4262,"V ":2848,"Sy":2174,"St":18668,"Sv":4781,"Su":4159,"Wi":3769,"Wa":4239,"We":2978,"Vi":7414,"Va":4893,"Ve":9972,"Pr":5274,"S ":4165,"Pe":5166,"Pa":8226,"Po":5247,"Pi":2410,"Os":6058,"Op":2483,"Or":3270,"Se":7511,"Sc":2844,"Si":5113,"Sh":4054,"Sl":2066,"Sk":6894,"Sp":4048,"So":11528,"Ru":4355,"Sa":11798,"Re":6477,"Ri":4763,"Ro":9292,"Ra":4607,"b ":4101,"a ":140249,"Yo":2095,"Sø":4628,"Xi":2231,"bø":2634,"i ":283647,"fy":8788,"gd":9301,"ge":149531,"ga":32513,"fj":12532,"fl":14388,"ff":8428,"bå":3627,"fi":25866,"fr":47803,"fu":8120,"ft":23386,"fo":101580,"j ":2796,"gy":3318,"he":44416,"ha":66151,"gn":19968,"gl":10219,"gj":12645,"gi":33050,"gh":8862,"gg":35921,"gv":2904,"gu":9872,"gt":8325,"gs":33529,"gr":39232,"go":8838,"dt":33464,"du":13333,"dv":5679,"dy":4144,"g ":242136,"ea":17194,"eb":16045,"ec":4860,"ed":108322,"de":299418,"dd":13784,"df":2274,"di":40822,"dh":3225,"dk":3004,"dm":5071,"dl":19375,"do":17360,"dn":4555,"ds":31560,"dr":31419,"ew":4011,"ex":3115,"eu":5509,"ev":23237,"ey":5833,"fa":23302,"h ":12462,"fe":26099,"eh":7528,"eg":49132,"ef":18320,"ee":8701,"el":199057,"ek":50272,"ei":44782,"ep":20746,"eo":9834,"en":582197,"em":46141,"et":320684,"es":146652,"er":695279,"ca":10711,"e ":500246,"by":34947,"br":29081,"bu":14609,"bo":17004,"bl":54931,"bi":19197,"bb":5548,"be":63706,"db":3584,"da":47371,"f 
":9303,"cu":2103,"ct":3488,"co":8095,"ck":10053,"ci":6056,"ch":18251,"ce":8996,"c ":3649,"az":2995,"ay":5287,"ba":35656,"d ":150657,"at":91048,"as":69325,"ar":172213,"av":121888,"au":14940,"ak":30193,"al":135078,"ai":12693,"aj":2677,"ao":2477,"ap":29140,"am":63029,"an":267240,"ac":9777,"ad":37337,"aa":2792,"ab":12532,"ag":40280,"ah":4661,"ae":7154,"af":16520,"nu":7442,"nt":92694,"ns":124075,"nr":3026,"no":52873,"nn":98330,"jø":13856,"ny":13664,"nv":3459,"oe":7618,"of":17045,"oc":8436,"od":26153,"oa":3647,"ob":11128,"om":188777,"on":123342,"ok":27219,"ol":75500,"oi":3014,"og":163889,"oh":4581,"ot":32564,"m²":2226,"os":28900,"ov":41178,"ou":20519,"op":34634,"oo":5024,"or":234371,"r ":604948,"ow":4407,"oy":2125,"pe":55903,"pa":27179,"pl":13813,"pn":2118,"po":25761,"ph":3721,"lä":2860,"pi":29184,"lå":4732,"læ":3598,"lo":47607,"lm":11687,"ll":125327,"ls":54920,"lp":4961,"lv":17299,"lu":16394,"lt":42371,"ly":13905,"hø":10888,"o ":38269,"ma":52812,"mb":14541,"me":153343,"mf":5780,"mk":2245,"ml":5863,"mi":35740,"mn":2923,"mm":59754,"mp":15073,"mo":24316,"mr":8156,"mt":8238,"ms":13233,"mu":35413,"my":4118,"p ":18651,"na":63523,"nb":10179,"nc":6842,"nd":155382,"ne":210727,"nf":8666,"ng":145804,"nh":6700,"ni":70736,"nj":3359,"nk":18937,"nl":14559,"nm":3689,"ju":8303,"jo":49289,"kj":21128,"gå":9251,"ki":30043,"kh":3599,"ke":149522,"ka":75120,"m ":148254,"fø":39544,"ky":6546,"ks":31928,"kt":45246,"ku":18940,"kv":7398,"ko":71373,"kr":37123,"kk":42089,"kl":19159,"km":10210,"kn":9496,"li":148110,"lh":5181,"lk":27090,"lj":4011,"le":206454,"ld":27006,"lg":10977,"lf":6606,"la":117768,"lb":9872,"n ":508629,"hr":3491,"hv":8983,"ht":2318,"hu":13547,"hj":3760,"hi":16514,"hn":2241,"ho":28328,"id":53829,"ic":16512,"ib":6764,"ia":42013,"ig":91825,"if":12391,"ie":49167,"hy":2076,"k ":129587,"dø":14496,"ir":32357,"is":145060,"it":66902,"iu":3847,"iv":28029,"ii":2629,"ij":2484,"ik":79231,"il":114997,"im":14324,"in":224355,"io":25793,"ip":8856,"je":54644,"ji":3380,"l ":114552,"ja":10997,"xi":3017,"tå":6348,"z ":3331,"sø":16304,"så":13214,"wi":2306,"y ":43971,"rø":14616,"wa":4985,"we":2854,"vh":2078,"rå":14698,"vi":56505,"vt":2733,"vs":5579,"vn":14394,"vo":12525,"ve":142305,"vd":3441,"va":71132,"x ":3406,"ui":4966,"uk":21664,"ul":26959,"ue":11327,"uf":3039,"ug":10093,"ur":41090,"us":50014,"ut":47498,"um":19336,"un":102304,"up":9818,"ty":27626,"tu":28284,"tt":93499,"tv":14967,"ub":13117,"ua":11212,"ud":11995,"uc":2853,"w ":4541,"to":63371,"tn":11784,"tm":2957,"tl":16216,"ts":42441,"tr":65044,"tg":7470,"tf":6385,"te":283104,"td":2594,"tk":2587,"tj":4066,"ti":143857,"på":54079,"th":14355,"v ":99425,"tb":9067,"ta":95087,"su":13370,"sv":19448,"ss":49933,"st":245546,"sy":11979,"sl":28878,"sk":191864,"sn":6441,"sm":11084,"sp":37365,"so":108796,"sr":4235,"sd":5831,"sc":5435,"sf":8108,"se":148118,"sh":13228,"sg":2307,"sj":47719,"si":74309,"nø":2380,"u ":10252,"sa":44310,"sb":10119,"rr":19009,"rs":78883,"rt":82606,"ru":50990,"rv":11610,"ry":11156,"rp":5076,"ro":69758,"rn":31836,"rm":21775,"rl":20062,"rk":47716,"næ":4051,"ri":132605,"nå":4464,"rh":6919,"rg":35522,"rf":12509,"re":256059,"rd":66008,"rc":4262,"rb":18116,"ra":128776,"t ":465642,"mø":2258,"qu":2140,"må":7691,"lø":8427,"s ":146195,"pt":7260,"pu":11071,"pp":34390,"pr":51296,"ps":7773,"vå":3732,"za":2150,"væ":7493,"yg":16794,"ye":16616,"yd":10423,"ya":7947,"tø":14974,"yt":13580,"ys":21183,"yr":13910,"yp":7213,"yn":9715,"ym":5956,"yl":14093,"yk":8767,"yi":4209,"² ":2226,"å 
":86331,"Øs":2809,"ær":24440,"åp":3244,"ån":3530,"åt":7047,"ås":2737,"år":24111,"åe":2573,"ål":6435,"åk":4798,"åd":11397,"än":3083,"øy":21642,"ør":54354,"øs":14419,"øp":5028,"øv":7903,"øt":3062,"øk":5634,"øn":8821,"øl":4965,"øm":4740,"øe":2478,"ød":34288,"ø ":2765,"之":2379,"专":2738,"三":3026," Ga":4247," Ge":3655," Fy":2931," I ":6572," Fo":9668," Fr":8721," Fi":5928," Fl":2776," Fj":2609," Ha":15503," He":10039," Go":2524," Gr":7534," Gu":4458," Hu":4096," Ho":7447," Hi":2663," Je":2895," Ja":4943," Is":2884," In":7378," Ka":7550," Ke":2113," Ki":8813," Jo":5426," Ju":2284," La":9489," Le":5267," Li":8383," Ko":7880," Kr":5128," Ku":2108," Ma":15819," Mi":7093," Me":7355," Lo":6813," Lu":3268," Ne":6630," Na":6059," Ni":3491," Mo":8183," Mu":2787," Am":2962," An":7665," Al":7546," Ba":9272," Au":3596," As":2828," Ar":6669," Be":11852," Bi":3097," Bl":2409," Bo":6415," Br":7392," Bu":3840," By":3181," Ca":8810," Ch":7563," Co":15800," Da":7402," Di":3846," De":44915," Dr":2166," Do":3289," El":2979," Et":2521," En":8708," Eu":2608," Fe":2724," Fa":3178," Xi":2228," Sø":4627," Wi":3744," We":2962," Wa":4211," Yo":2085," Os":5937," Or":3262," Op":2481," Po":5205," Pi":2406," Pe":5104," Pa":8187," No":20722," Ol":2367," Ra":4586," Ro":9267," Re":6462," Ri":4754," Pr":5244," Sy":2169," Sv":4778," Su":4150," St":18500," Ta":4246," Th":6493," Ti":3455," Te":5588," Tr":8905," To":6181," Ru":4349," Sa":11773," Sh":4026," Si":5095," Sc":2823," Se":7484," So":11504," Sp":4034," Sk":6885," Sl":2061," Va":4882," Ve":9942," Vi":7397," Ty":3541," Un":4794," ja":4722," in":34065," ik":4460," is":2212," ka":20700," fø":33238," kj":12239," gå":5771," ki":10103," jo":2540," ju":4702," ha":40329," he":11962," gi":4943," gj":7272," gr":19932," gu":2109," dø":7855," hi":3888," hj":2877," ho":10427," hu":4249," hv":8245," ne":7120," na":12532," my":2654," mu":6899," mo":13920," ok":2382," ol":3868," om":17822," og":141787," of":9223," ob":2162," ny":3021," no":33974," le":11218," li":27765," la":25410," kv":4363," ku":8567," km":9490," kl":4277," kr":10076," ko":42531," me":70036," mi":10095," hø":6629," ma":17890," lu":2098," lo":3520," ad":3981," am":9313," an":19228," ap":2222," ak":3076," al":9858," av":89135," au":3693," ar":13091," at":7530," ba":13278," bi":6342," be":29509," bo":6968," bl":40680," by":21015," bu":2338," br":16487," ca":3932," er":153435," et":63825," en":123670," ei":4836," el":30282," ek":4165," eg":3293," fe":8817," fa":9591," fu":3699," fr":43218," fo":78205," fl":9728," fj":6013," bå":2550," fi":13808," ge":3067," ga":9278," i ":237285," fy":7547," da":14064," do":2621," dr":4255," de":122624," di":7830," dy":2072," væ":4635," sø":12596," ru":9067," sa":19419," se":29156," sj":3216," si":17713," sl":7113," sk":21745," sp":19430," so":87507," ra":3039," re":25505," ri":2962," nå":3726," næ":2896," ro":6266," pu":2169," pr":27989," lø":2099," må":3587," ov":11878," op":17267," or":7497," pe":8091," pa":7750," pl":5274," po":9080," pi":4926," lä":2565," så":2458," va":41741," ve":33600," vo":3210," vi":14710," ty":6900," tu":3382," ut":27318," un":21502," ta":8589," sy":5687," st":43167," sv":7377," su":2425," tr":14296," to":9228," th":2455," på":53815," ti":62638," te":14306," Øs":2807," å ":15026," år":6691," øs":6280," øy":4235,"Fin":2794,"Eur":2192,"En ":4024,"Eng":3095,"Fyl":2572,"Fra":3815,"Fre":2185,"Fol":3808,"For":3758,"Hel":2604,"Han":5694,"Har":2577,"Ind":2121,"Øst":2801,"Bar":2295,"And":2152,"Car":2172,"Ber":4934,"De 
":3390,"Det":18734,"Den":19203,"Dan":2408,"Cha":2366,"Cou":8345,"OL ":7076,"New":2594,"Nor":18501,"Osl":4630,"Par":3554,"Pro":2168,"SA ":4753,"Joh":2414,"Kar":2117,"Kin":6314,"Kon":2546,"Kom":2130,"Kri":2217,"Lan":3221,"MW ":2903,"Lon":2680,"Man":2484,"Mar":5164,"Sør":3934,"Sve":3734,"Str":2300,"Sto":5079,"Sta":5059,"Ste":2680,"Sko":2364,"Som":4932,"Rus":2959,"San":3826,"än ":2530,"Rom":2726,"åde":9646,"ål ":2493,"Vei":2618,"åle":2191,"Vin":2983,"åre":3629,"ård":2572,"år ":14295,"Ves":3148,"ått":3275,"Uni":3264,"Tys":2479,"ær ":4620,"ært":3809,"ære":8422,"The":4012,"Tro":3352,"bis":3511,"bil":4902,"bin":5018,"blo":2551,"ble":31774,"bli":12888,"bla":6396,"bok":3234,"bor":4144,"bbe":3093,"ban":9995,"bak":3210,"bal":7414,"bas":4309,"bar":6636,"bei":5432,"beg":3804,"ber":17512,"ben":4646,"bel":3914,"bev":2185,"bes":8730,"bet":9507,"ca ":4720,"ce ":3774,"bri":5623,"bra":2313,"bre":4128,"bru":13492,"bur":2443,"bun":2536,"bum":3444,"by ":8318,"byd":2738,"bye":5451,"byg":13374,"byp":2127,"am ":5629,"ake":6088,"al ":22413,"ain":3128,"ak ":2688,"agt":4288,"anu":3347,"ano":2720,"ann":21260,"ant":16537,"ans":34997,"ane":10156,"ang":34028,"ani":10694,"ank":5395,"anl":5003,"ap ":5392,"ana":7215,"anc":2796,"and":71633,"amt":3120,"amm":13306,"aml":4533,"amp":3404,"ami":6970,"ame":12312,"amb":2328,"ama":4584,"alv":2901,"alt":13338,"als":4480,"all":28689,"alg":3345,"ali":12960,"ald":4724,"ale":19089,"ala":6862,"alb":4116,"an ":35027,"aks":3242,"akt":7856,"akk":3187,"abl":2978,"ae ":2689,"ad ":7446,"aft":9172,"afi":2584,"ai ":3427,"aga":2271,"age":11424,"adm":2981,"adi":6411,"add":5083,"ade":7303,"ag ":10292,"ach":2606,"ada":2399,"at ":12379,"are":11421,"ard":7601,"arb":5329,"ara":7138,"arn":4304,"arm":4027,"arl":3706,"ark":12605,"ari":9703,"arr":4082,"ars":6026,"art":19222,"asi":5334,"ase":4616,"ask":2682,"asj":20288,"ar ":65150,"apa":4754,"ape":8916,"app":2255,"as ":8910,"ava":2465,"aut":2096,"avn":11841,"avi":3695,"ave":7468,"ay ":2288,"av ":85297,"ata":4831,"ast":9209,"ass":12223,"ato":4866,"ate":21584,"ati":14442,"att":15952,"ats":2979,"atu":5294,"aug":3319,"jer":9769,"jek":2468,"jel":9659,"jem":3205,"jen":18638,"jan":3394,"je ":3919,"jon":34428,"jor":11663,"itu":2594,"itt":16474,"its":2537,"ity":2756,"isk":51560,"isj":4819,"ism":2335,"iss":5696,"ist":35845,"iv ":3591,"ita":7242,"ite":15976,"iti":9684,"iva":2856,"ivi":3570,"ive":13599,"ipp":2048,"is ":12915,"ion":14718,"ir ":6919,"irk":9296,"isi":4522,"ish":2413,"ise":12425,"isa":4298,"ire":7035,"it ":2143,"kil":3815,"kk ":10113,"kin":8354,"kip":2377,"kir":5305,"går":7228,"kis":2484,"kje":16281,"km ":7242,"kel":6677,"ken":15267,"kes":10195,"ker":29257,"ket":9398,"ke ":65844,"kra":11143,"kre":8431,"kt ":12793,"kse":4519,"kro":2450,"kri":13226,"kot":2092,"km²":2186,"kor":5874,"kon":13827,"kom":31756,"kol":9129,"ks ":3032,"kny":2137,"kjø":3238,"kni":5015,"kke":25288,"klu":3574,"kle":4386,"kla":6195,"kli":3140,"jul":2456,"kat":3942,"kar":4133,"kas":2638,"kap":17702,"kan":20091,"kal":14105,"kam":2092,"ka ":6242,"før":14702,"fød":22348,"føl":2302,"ha ":2473,"ham":3082,"han":12460,"hal":3654,"hav":6160,"har":25645,"had":4958,"he ":6327,"hel":5939,"hei":3009,"het":10209,"her":5545,"hen":7004,"hin":2784,"his":4361,"hje":2935,"gle":2190,"gn ":3125,"gla":3596,"gjø":2529,"gni":2494,"gne":11554,"gs ":3264,"gru":12215,"gra":9273,"gt ":5657,"gre":14854,"gst":4516,"gså":9706,"gus":2418,"ial":4669,"ian":8957,"øke":2125,"ølg":2438,"id ":6821,"ibe":2226,"ia ":20599,"iet":4979,"iel":5273,"ien":17663,"ier":7794,"ies":2351,"ig ":24484,"ift":6890,"ør 
":13261,"øpe":2597,"ømm":2930,"ich":3831,"ie ":7138,"ica":2979,"ids":3533,"idr":3478,"ønn":3150,"idl":8890,"ide":18687,"idd":2770,"ida":3822,"ønd":2352,"øst":12196,"il ":45404,"im ":2951,"ika":14540,"ige":27059,"iga":2145,"igh":5558,"igi":2075,"igg":17522,"igs":3095,"ign":3006,"øre":10636,"ørs":12878,"ørt":2899,"ik ":6168,"ime":2903,"ind":10332,"ina":14021,"inn":35697,"ino":2582,"int":10160,"ins":19112,"ine":16471,"ing":71844,"ini":7560,"ink":2146,"iod":2244,"iny":3773,"ikl":3957,"ikk":22590,"ike":13916,"in ":20575,"ikt":9421,"iks":3390,"ilo":2911,"ill":23109,"ilk":2217,"øve":6392,"ilm":4862,"ilh":3403,"ili":8732,"ild":3649,"ile":4123,"io ":2961,"ils":3090,"ilt":4137,"ørø":2134,"øy ":5016,"hol":9462,"hov":6717,"øye":5083,"øya":3991,"hvo":3807,"hun":2960,"hus":4616,"hve":2835,"døs":2243,"død":8337,"ffe":3959,"ffi":2131,"fes":3648,"fer":4921,"fem":2188,"fen":2692,"fek":3989,"fel":4217,"fat":5769,"far":3333,"fam":3942,"fal":2369,"eta":7185,"ete":19743,"eti":2646,"etn":3543,"esp":3357,"eso":2351,"est":46619,"ødt":21901,"ess":8652,"esv":2859,"etr":2759,"ets":9941,"ett":37893,"ety":3237,"ew ":2555,"eve":11454,"eva":2230,"evi":2446,"ey ":3642,"er ":431766,"epa":2788,"eor":2578,"eol":2061,"ød ":7105,"es ":40484,"ept":2696,"epu":5041,"epr":2644,"erk":15694,"erl":6414,"eri":32585,"erg":11340,"erh":2771,"ere":60500,"erf":5751,"erd":9568,"era":10673,"erb":3057,"et ":226950,"esk":8307,"esi":10382,"øde":3134,"ese":10631,"erv":4618,"eru":2639,"err":6355,"ert":31938,"ers":27995,"ern":17742,"erm":4003,"ero":2745,"ekk":7221,"ekn":2204,"ekr":3313,"eks":10659,"ekt":15683,"en ":368973,"ela":6680,"eld":5215,"ele":18548,"eli":14582,"elg":2392,"ell":56973,"elo":2842,"elv":10426,"els":37467,"elt":17365,"emb":7048,"ema":3872,"eme":6179,"emm":4042,"emo":2614,"emi":3720,"emp":3266,"ems":2210,"enf":4002,"ene":43923,"enh":3807,"eng":13869,"ena":2893,"end":24201,"enn":18704,"enk":3130,"eni":5336,"ens":38448,"ent":42518,"egn":9280,"ege":7883,"egg":4893,"egi":7310,"eha":2568,"egr":3736,"eis":4556,"eim":2591,"ein":6378,"eie":6921,"eid":6829,"el ":17607,"eke":3082,"em ":6695,"gje":8413,"git":6162,"gis":5345,"gin":2581,"gio":4769,"ghe":4290,"gge":29980,"gi ":3980,"gen":31946,"get":10089,"ger":52848,"ges":4527,"gg ":2739,"gel":10922,"gde":3615,"ge ":33172,"gas":3030,"gar":3339,"gat":3035,"gam":2397,"gal":2375,"gan":11800,"ga ":3023,"fyl":7035,"fte":8757,"fun":3666,"ftv":7006,"ft ":3864,"fra":35021,"fre":5013,"fri":6048,"for":85448,"fot":5733,"fol":6844,"fle":5300,"fly":4535,"fil":5351,"fik":2963,"fin":6087,"fir":2657,"fis":4937,"fje":5241,"fjo":6945,"da ":8120,"de ":60427,"dal":11372,"dag":6176,"dat":3189,"dar":2553,"dan":8876,"dde":9644,"co ":2205,"ch ":3122,"cha":2322,"ck ":3691,"che":4052,"chi":2081,"cke":2693,"ed ":55809,"eba":2196,"ebe":2240,"ebr":3393,"eal":3627,"eat":2589,"efi":2051,"efo":4434,"efe":4659,"ei ":7806,"een":2210,"edl":3176,"edi":4356,"ede":17856,"eda":3620,"eg ":6746,"eds":6107,"edr":4047,"dve":3394,"dus":5910,"don":4573,"dom":4754,"ds ":5042,"dmi":3335,"dni":2776,"dst":4471,"dte":2377,"duk":4468,"dri":5473,"dra":3627,"dt ":28714,"dre":18932,"dsk":5623,"dia":3579,"der":53613,"des":6374,"det":47571,"del":41297,"dek":2450,"den":73325,"dem":4267,"di ":2586,"dle":5037,"dla":2367,"dli":11521,"din":4634,"dio":3152,"dis":11321,"die":2789,"dig":4896,"rga":6400,"ri ":3980,"rgi":2280,"rge":14583,"ret":27902,"res":20361,"rev":6468,"rfa":4418,"rds":3327,"rdv":2179,"rg 
":6581,"rea":5698,"ree":2115,"ref":5778,"red":9285,"rei":3228,"reg":12569,"rem":5839,"ren":35981,"rek":8172,"rel":4778,"rer":20318,"rep":8966,"rda":5684,"rdl":3552,"rdi":4958,"rde":19629,"re ":73979,"rbu":2376,"rd ":17240,"ras":7043,"rat":9184,"rav":3015,"rbi":3863,"rbe":5664,"rag":2711,"ran":19588,"ram":7456,"ral":8750,"rak":4191,"raf":12560,"rad":5981,"rs ":8675,"ros":5462,"rom":6997,"ron":6959,"rop":5525,"rov":9073,"rod":9193,"rol":4308,"rof":2729,"rog":4064,"rna":5941,"rne":13070,"ro ":3052,"rma":5722,"rme":6882,"rli":7449,"rle":2859,"rla":5982,"rn ":5061,"rks":2379,"rko":2612,"rki":3108,"rke":16934,"rka":2079,"rm ":3363,"nær":3936,"rio":3486,"rit":8311,"ris":17487,"riv":5194,"rig":8523,"ril":2995,"rik":24390,"rin":21593,"rim":2649,"ria":5352,"ric":2697,"rid":3352,"rie":12327,"rif":3440,"rdø":2401,"rk ":15343,"rup":6154,"run":12301,"rum":4066,"ruk":11189,"rus":6418,"rva":2898,"rvi":2442,"rve":4784,"ry ":3420,"rsk":24627,"rsi":6764,"rso":5206,"rse":3955,"rta":2228,"rst":17535,"rte":20285,"rti":8810,"rua":2277,"rts":5729,"rt ":35053,"rri":3179,"rre":7643,"rra":4011,"sak":3680,"sal":2138,"sam":15315,"sbe":2623,"san":6993,"sat":4359,"sas":3809,"sa ":2445,"ryk":2399,"sha":2088,"shi":3201,"sje":5058,"sjo":34525,"sie":4330,"sid":8350,"sia":4578,"sk ":75137,"sit":9775,"sis":16636,"sin":9850,"sik":9289,"sda":2876,"sby":2166,"se ":19672,"sch":2395,"ser":35054,"ses":4849,"set":9508,"sfo":2877,"seg":6362,"sep":3173,"sen":41736,"sem":4309,"sel":10690,"sek":3507,"spo":8496,"spr":5617,"spe":5862,"spi":12660,"spa":2392,"sol":2063,"som":85971,"son":9679,"sor":2986,"sjø":3410,"st ":39863,"ss ":6097,"sli":3561,"slo":6037,"slu":2332,"sky":2874,"sla":9825,"sle":4538,"ski":8246,"skj":4074,"skl":2812,"sko":10936,"skr":10481,"sku":3651,"ska":22269,"ske":46720,"sma":2688,"sme":3758,"stå":5531,"stø":6651,"syn":2627,"sys":2439,"syk":3044,"sse":15330,"ssa":2824,"sso":3173,"ssl":2943,"ssk":3059,"ssi":8369,"sst":2689,"ste":66334,"stf":2230,"sta":38189,"stn":2344,"sto":16688,"sti":20167,"stl":6400,"stu":3784,"str":25344,"sty":4151,"sun":2773,"sut":4111,"sva":4458,"sve":9604,"svi":2603,"tak":2812,"tal":21730,"tab":3588,"tad":7590,"tba":6294,"tav":2813,"tat":16792,"tas":5433,"tar":7233,"tan":11218,"tam":2411,"te ":58351,"ta ":8154,"pa ":2052,"pe ":5528,"par":10024,"pas":4090,"pan":5764,"län":2721,"pen":10042,"per":17920,"pet":8954,"pes":3294,"pel":3770,"pla":9218,"lær":3470,"pil":11406,"pin":7171,"pis":4864,"por":10569,"pol":7076,"ppr":5964,"ppl":2678,"ppe":9671,"pp ":3799,"pub":5466,"pte":2999,"pri":10360,"pre":12096,"pro":23686,"prå":2961,"løp":3837,"mål":4336,"ra ":37023,"ngi":2137,"ngl":4160,"ngr":2861,"ngt":2215,"ngs":15959,"ni ":4401,"nge":45140,"nga":2868,"ngd":4050,"nhe":2871,"nel":12450,"nen":27435,"ner":33385,"net":21165,"nes":25627,"ng ":54251,"ned":4860,"nfo":4282,"nce":2258,"ne ":72733,"nby":5798,"ndt":4870,"ndr":12413,"nds":12659,"ndo":5458,"ndl":2779,"ndi":6968,"nde":56337,"nda":5748,"nal":12130,"nan":2828,"nar":2470,"nad":2448,"nd ":38901,"nav":9346,"nat":5154,"nas":7061,"na ":15576,"nyi":3711,"nua":2135,"nty":8509,"nto":3515,"ntr":8273,"nti":5299,"ntl":2516,"nta":6663,"nte":28676,"nsp":2257,"nst":14726,"nss":3014,"nse":21534,"nsj":2053,"nsi":3034,"nsl":2182,"nsk":39260,"nsa":2994,"nt ":24653,"ns ":20153,"noe":3540,"nom":6845,"nor":29131,"nov":2911,"nne":42931,"nna":2105,"nnb":5914,"nnl":4919,"nno":3889,"nni":5129,"nnk":2501,"nns":7691,"nma":2510,"nli":4969,"nn ":14981,"nla":6112,"nle":2679,"no 
":2847,"nke":3676,"nkr":4191,"nia":6220,"niv":5311,"nis":15475,"nit":2397,"nin":25664,"nik":2124,"ogs":10014,"ogr":5237,"ogi":5246,"ogn":3334,"oge":2102,"ok ":7509,"ol ":2411,"ock":4853,"ode":7882,"of ":4012,"oen":3787,"odu":8565,"og ":134463,"oft":3759,"off":4398,"ofe":2299,"od ":2196,"obl":2263,"obe":3133,"nyt":4280,"jøe":2169,"jør":5841,"ote":3613,"ott":4146,"ots":2202,"oto":3139,"ost":4414,"otb":5531,"osi":2968,"ose":3901,"oss":3129,"ovi":8837,"ove":25154,"oun":9480,"our":3311,"opp":20548,"ope":3654,"opa":2135,"os ":5236,"or ":56677,"ork":4146,"orl":2445,"orm":10647,"orn":3102,"orr":3175,"ord":42679,"ore":13522,"orf":4509,"org":16810,"ori":9530,"ort":20720,"ors":26414,"m² ":2219,"ot ":7999,"orb":6069,"ora":3709,"ola":2381,"old":10833,"on ":35130,"oli":8827,"oll":6799,"olk":9387,"ole":9725,"ols":3475,"olm":2556,"olo":8072,"oly":3133,"oka":3954,"om ":107366,"okk":3118,"oks":3517,"okt":2231,"ona":9739,"ond":6224,"one":22668,"ong":10465,"oni":5288,"ono":3591,"ons":14426,"ont":6196,"oma":5119,"ome":6397,"omb":2444,"omi":3410,"omf":2921,"omm":37033,"omp":3178,"omr":6915,"oms":5919,"la ":5793,"le ":57070,"lde":10166,"lds":2282,"lad":2570,"lag":17793,"lan":58009,"lar":3088,"lat":7640,"las":9906,"lav":2641,"lba":2444,"ld ":6657,"lbu":3945,"kvi":3664,"kva":2672,"kur":2362,"kun":5248,"kul":5655,"kte":10988,"kst":5101,"ksj":6623,"ksi":2580,"ktu":4360,"kti":6575,"kto":3659,"kys":2223,"ls ":2559,"lok":3826,"lon":2509,"lom":14627,"log":6983,"los":2313,"lov":2717,"lme":3805,"lto":5869,"ltu":2394,"lub":3807,"lsk":13894,"lst":8591,"lv ":4083,"lta":2646,"lte":5587,"lse":19815,"lt ":21046,"lge":4495,"li ":4167,"lev":4182,"les":12321,"let":13065,"ler":55284,"lem":5548,"len":25387,"lek":9666,"lei":2319,"leg":6026,"led":5805,"lls":3758,"lo ":5941,"lhø":2797,"lla":5878,"lle":65120,"lli":10408,"llk":3343,"llo":12212,"lke":16528,"lkn":2938,"lm ":3452,"lje":2230,"ll ":17353,"lit":14639,"lis":9353,"lir":3649,"lin":18549,"liv":2248,"lia":5498,"lik":12256,"lig":60844,"lie":7273,"ma ":3762,"mai":2118,"mar":11919,"mas":4336,"mal":3128,"man":16587,"mat":5209,"mbe":7011,"me ":8290,"med":37095,"met":13813,"mes":7734,"mer":33081,"mel":15249,"men":33002,"mfa":2204,"lva":3696,"lve":4358,"lut":2852,"ly ":2856,"lym":3245,"lys":2102,"høy":5492,"hør":3562,"mpi":3576,"mpe":4413,"ms ":2196,"mod":2416,"mon":3257,"mor":2125,"mot":10268,"mt ":3487,"mst":3459,"mrå":7270,"mus":6700,"mun":25496,"min":11788,"mil":8071,"mis":4562,"mid":4108,"mle":3222,"mmu":24025,"mme":31877,"vær":6865,"ytt":5943,"yte":2764,"yst":6626,"ysk":7598,"yrk":2373,"yre":3443,"ypr":2466,"yr ":3697,"ype":2571,"ye ":2367,"yde":4954,"yer":2533,"yen":9116,"ya ":4898,"ykk":3634,"ylk":9083,"ymp":3267,"ygg":10777,"ygd":2788,"yin":3851,"tør":7504,"tøv":4228,"tår":3969,"sør":11014,"så ":11194,"røs":2166,"røn":4125,"røy":2397,"vir":2566,"råk":3246,"vik":8483,"vil":4773,"vin":16505,"vid":3526,"råd":8908,"vit":4787,"vis":10758,"vn ":5466,"vne":6818,"vol":2228,"vok":2535,"vor":4361,"ver":56905,"ves":12440,"vet":7953,"vei":7599,"veg":2114,"ven":15144,"vem":2207,"vel":5384,"ved":23018,"ve ":6401,"val":6677,"van":11534,"var":40025,"vat":3521,"va ":5156,"utø":4075,"usi":6124,"use":8793,"ust":8108,"uss":8811,"utg":6539,"ute":5272,"utt":5236,"uts":3096,"utv":4169,"us ":10711,"ut ":5919,"ure":6307,"urg":2892,"uri":2750,"urn":3039,"uro":3768,"urr":2556,"ur ":5709,"upp":6292,"ume":4177,"unt":9250,"uns":3775,"unk":3135,"uni":6920,"unn":14136,"und":31193,"ung":4670,"une":22585,"uks":4735,"ukt":5470,"uke":5765,"um ":8855,"ult":4324,"ull":5554,"uli":4543,"ule":2555,"un 
":3708,"ugl":2152,"ugu":2387,"ude":2985,"udi":2221,"ue ":2256,"ues":2288,"uar":4525,"uan":2334,"ubl":5816,"ubb":3409,"ud ":2293,"typ":2478,"tyr":5657,"tys":4599,"ty ":11596,"tve":8575,"tvi":4656,"tur":14995,"tus":2293,"tun":2106,"tud":2813,"ts ":7521,"tre":18842,"tt ":26597,"tra":17913,"tri":10126,"tru":4763,"tro":7380,"try":2883,"tse":4787,"tsk":2860,"tsu":3903,"tst":7808,"tta":2344,"tte":44536,"tti":2906,"tts":8940,"to ":8546,"tni":6514,"tne":2955,"tob":2085,"ton":7030,"tok":6614,"tol":4307,"tor":21379,"til":54443,"tik":6372,"tif":3581,"tie":3348,"tig":5426,"tit":4010,"tis":16494,"tin":10509,"tio":5365,"tia":3220,"tid":14688,"tiv":7883,"tje":3021,"tli":7658,"tla":4715,"tle":2120,"tem":9786,"ten":44932,"tei":3878,"tek":5962,"tel":9339,"teg":7791,"ted":11925,"tfo":3043,"th ":3609,"tet":33199,"tes":6700,"ter":82933,"tgi":4175,"på ":52716,"ti ":4056,"the":4578},"n_words":[20399254,23799460,17069273],"name":"no"}
\ No newline at end of file
+++ /dev/null
-{"freq":{"ਾਜ਼":17,"ਾਜਧ":12,"ਾਜਾ":13,"ੀਲ ":18,"D":18,"E":20,"F":29,"G":23,"A":52,"B":35,"C":59,"L":38,"M":47,"N":37,"O":37,"H":31,"I":60,"U":14,"T":45,"W":18,"P":47,"S":55,"R":24,"f":103,"g":125,"d":221,"e":655,"b":81,"c":194,"ੀਰ ":33,"a":561,"n":492,"o":398,"l":256,"m":260,"j":16,"k":37,"h":252,"i":537,"w":71,"v":42,"u":236,"t":467,"s":328,"r":400,"p":124,"z":13,"y":106,"x":19,"ਿਆ।":22,"ਾਤਰ":17,"ਾਤਾ":27,"ਾਤੀ":12,"ਿਆਣ":31,"ਿਆਦ":20,"ਿਆਨ":47,"ਿਆਲ":13,"ਿਆਰ":28,"ਿਆਵ":14,"ਿਆਸ":12,"ਾਣਕ":12,"ਾਣੂ":104,"ਿਆਂ":76,"ਾਣੀ":40,"ਾਣਿ":27,"ਾਣਾ":22,"ਾਨੇ":13,"ਾਨਾ":31,"ਿਊਟ":14,"ਾਨੀ":48,"੩੧ ":18," । ":128,"੩੦ ":23,"ਾਦੀ":22,"ਾਦਾ":19,"ਾਨਕ":26,"ੁਝ ":21,"।ਇਸ":14,"ium":27,"is ":22,"ion":36,"ੀਤ ":15,"਼੍ਰ":22,"੦੦੮":24,"ੀਪ ":375,"ੀਨ ":57,"ੀਮ ":51,"ਟੀ ":57,"ਜੋਂ":19,"ਾਗਰ":20,"ਾਕੀ":370,"ਾਕਿ":34,"ਟਾ ":22,"੨੩ ":13," m":24," o":57," h":17," i":57," d":21," e":15," f":30," a":133," b":21," c":34," t":120," w":40," p":41," s":54,"੨੨ ":14," r":16,"੨੧ ":14,"੨੫ ":14,"਼ਸੀ":29," H":25," I":54," N":32," O":31," L":21," M":43,"੨੪ ":13," B":32," C":54," A":46," F":17," G":21," D":15," E":17,"ਿਰ ":73," S":47," R":22,"਼ਹਿ":50," P":44," W":15," T":36,"ਜੀਵ":20,"੨੭ ":13,"੨੬ ":13,"ਿਲ ":25,"ਜੁਲ":54,"ਜਿੰ":30,"੨੯ ":16,"ਜੂਨ":35,"਼ਾਂ":36,"਼ਾਹ":32,"੨੮ ":15,"਼ਿਆ":14,"ੀਕ ":34,"ਜ਼ਮ":12,"਼ਿਲ":75,"ਜ਼ਾ":33,"ਜ਼ਿ":88,"ਾਂਦ":190,"ਜਾਂ":231,"ਜ਼ੀ":72,"ਾਂਤ":20,"ਾਂਸ":16,"ਜਾਬ":125,"਼ੁਰ":27,"ਜਾਤ":13,"ਜਾਣ":55,"ਜਾਦ":21,"ਜਿਮ":22,"ਜਿਲ":12,"ਜੀਅ":19,"ਿਸ ":72,"ਾਅਦ":30,"ਜਿਸ":70,"ਜਿਹ":19,"ਾਇਆ":59,"ਾਇਣ":62,"੨੦ ":14,"ਾਇਲ":17,"ਟਰ ":67,"ਾਈਟ":14,"ਾਈਡ":27,"ਾਉਂ":14,"ਾਈਨ":29,"ਾਉਣ":23,"਼ਬਦ":26,"ੀਂ ":52,"ਿਤ ":82,"ਟਨ ":17,"ਚੰਦ":21,"੧੮ ":12,"ਛੋਟ":13,"੧੯ ":14,"ੀਆ ":53,"ਿਨ ":1101,"਼ਰਵ":39,"ਜਨਮ":32,"ਜਨਵ":39,"ਜਨਸ":13,"ਜਦੋ":15,"ਜਧਾ":12,"ਿਬ ":69,"ੀਤਾ":41,"Co":14,"ੀਤੀ":42,"ੁਆਰ":46," In":28," Ma":13,"he ":69,"ਾੜੀ":12,"ੀਟਰ":38,"Ma":13,"Ol":12," Co":14,"In":29,"L ":13,"।":1840,"ੀਕਲ":22,"ੂਪ ":25,"ੀਕਨ":13,"ਿਲੀ":21,"ਿਲਾ":83,"Th":15,"ਿਲ੍":13,"ਿਲੋ":46,"ਿਲੇ":38,"ੂਨ ":35,"ਿਲਦ":12,"ਵ":3164,"ਲ":4004,"ਰ":5772,"ਿ":5178,"ਾ":10004,"਼":1111,"ਹ":4455,"ਸ":5034,"ਦ":5926,"ਧ":297,"ਤ":3377,"ਥ":271,"ਢ":42,"ਣ":777,"ਠ":69,"ਡ":853,"ਮ":2207,"ਯ":188,"ਬ":2031,"ਭ":430,"ਪ":1926,"ਫ":303,"ਨ":4410,"ਕ":3798,"ਗ":2047,"ਖ":660,"ਐ":62,"ਓ":66,"ਝ":61,"ਜ":2041,"ਟ":625,"ਘ":134,"ਛ":70,"ਚ":1522,"ਅ":1321,"ਆ":1270,"ਇ":2053,"ਂ":2880,"ਏ":233,"ਈ":490,"ਉ":422,"ਊ":48,"ੱ":1924,"ੰ":2555," a ":18,"੦":200,"੧":273,"੪":62,"ਿਮਨ":20,"੫":77,"੨":239,"੩":93,"੮":92,"੯":96,"੬":69,"੭":58,"੨੦੦":38,"ੜ":191,"ੀ":4636,"ੁ":1825,"ੂ":1115,"ੇ":3761,"ੈ":2081,"ੋ":1676,"ੌ":206,"੍":1046,"ੂਲ ":22,"ੀਜਿ":21," Ol":12,"ੂਰ ":25,"b ":18,"ਿਹਾ":91,"a ":87,"ਿਸੇ":29,"ਿਸ਼":57,"ਿਸਾ":23," Th":15,"ਿਸਤ":34,"ਚੌਂ":12,"ਚੋਂ":38,"ਿਵੇ":63,"ਿਵਾ":13,"ੀਕਾ":45,"i ":27,"ਿਟੀ":38,"ge":13," in":40,"ic ":12,"fi":14," is":12,"fo":16,"ਚਾਈ":23,"he":95,"ha":27,"gh":19,"go":13,"g ":34,"ea":30,"ਚਾਰ":34,"ec":17,"ਚਾਲ":16,"ed":45,"de":37,"di":36,"ia ":20,"ev":13,"h ":33,"Ind":16,"ee":14,"el":41,"ei":12,"en":68,"em":14,"et":19,"ੀਆਂ":178,"es":53,"er":114,"ੀਅਤ":34,"ca":15,"e ":169,"ਚੀਨ":33,"be":16,"da":20,"f ":44," of":43,"ct":18,"cs":14,"co":29,"ck":12,"ch":22,"ce":32,"c ":16,"ics":14,"d ":98,"at":58,"as":29,"ar":52,"al":54,"ai":16,"am":68,"an":97,"ac":18,"ad":16,"ab":12,"ਿਨਾ":26,"nt":47,"ns":26," am":51," an":24,"ਿਨੇ":13,"ੈ। ":755,"of":44,"om":42,"on":91,"ol":28," ਅ":1094,"os":19," ਇ":1749," ਆ":274,"ou":18,"or":54," ਏ":119,"r ":72," ਉ":313,"ow":13," ਈ":30," ਕ":1460,"pe":18," ਖ":182," ਗ":918,"ਿਨ੍":12," ਐ":59," ਓ":47," ਜ":1208," ਝ":16," ਟ":112,"po":12," ਘ":42,"pi":19," ਚ":259," ਛ":39,"lo":16,"ਜੋ ":82,"ll":20,"igh":12,"ly":15,"o 
":26,"ma":17,"mb":17,"me":42,"mi":19,"mp":27,"mu":48,"na":37,"nc":21,"nd":68,"ne":28,"ng":50,"ni":32,"ਿਤਾ":77,"ਿਤੀ":12," ।":190,"ੀਅਮ":40,"m ":58,"ੀਅਨ":27,"ਿਥਿ":27,"ine":16,"ing":33,"li":35,"le":33,"ld":21,"la":31,"n ":137," co":22,"ht":18,"hu":17,"hi":29,"ho":16,"id":13,"ic":60,"ia":36,"ig":20,"in ":38,"ie":22,"k ":17,"ir":14,"is":39,"it":46,"iu":27,"il":21,"in":129,"io":41,"l ":48,"ਾਰਕ":16,"ਾਰਚ":41,"ਾਰਤ":101,"ਾਰਨ":27,"wo":13,"ਾਮਿ":15,"y ":61,"wa":12,"ve":29,"ਾਰੀ":54,"ur":21,"us":14,"ਾਰਾ":51,"ut":16,"um":56,"un":15,"ty":14,"ਾਰੇ":39,"ua":16,"to":25,"ts":12,"tr":30,"te":65,"ti":67,"ਾਬਕ":377,"th":116,"ta":28,"st":52,"se":26,"sh":12,"si":30," ੨":195," ੩":56," ੪":26," ੫":29,"u ":55,"ਚਰਲ":20," ੬":17," ੭":14," ੮":15," ੯":19,"rs":22,"rt":24,"ry":21," ੧":210,"ro":42,"ri":64,"ਚਲਾ":12,"re":47,"rd":17,"ੁਤ ":26,"ra":24,"t ":72," ਹ":2872," ਸ":2699," ਵ":2408,"ਾਬਲ":23,"ht ":12," ਰ":498," ਲ":715," ਬ":868," ਭ":339,"s ":155,"ੁਣ ":18," ਮ":1134," ਯ":133," ਨ":1023," ਪ":1056," ਫ":184," ਤ":951," ਥ":21,"ਜੇ ":27," ਦ":4245," ਧ":86," ਡ":55,"ਾਬੀ":48," ਢ":16,"pr":12,"ਾਬਾ":23,"ਿਗਿ":35,"ੁਰ ":13," s ":16,"ਾਹੀ":40,"ਾਹਿ":88,"ਿਚਾ":17,"hum":14,"ਾਸਿ":34,"ਾਸ਼":82,"ਾਸੇ":16,"ਜਾ ":63,"ਚਨਾ":19,"ਾਲੇ":56,"ਾਲਾ":46,"ਾਲੀ":54,"ਜ਼ ":52,"ਜੀ ":155,"ਿਖੇ":23,"ਾਵਾ":47,"ਿਖਾ":19,"ਾਵਲ":13,"ਿਕਾ":29," th":90,"ym":13,"ਿਕਸ":14,"ਹਾਂ":94,"ਹੀਰ":14,"ਹੀਨ":13,"ਹੁਣ":12,"ਹੁਤ":24,"ਹਿਰ":60,"ਹਿਲ":50,"ਹਿਨ":24,"ਹਿਬ":75,"ਹਿਸ":19,"ਹਾਲ":13,"ਹਾਰ":25,"ਹਾਨ":18,"ਹਿਤ":27,"ਹੀਂ":44,"ਹਾਸ":57,"ਹਾਈ":26,"ਜਨ ":22,"਼ਨ ":23,"ਹਰਿ":36,"ਹਨਾ":25,"er ":48,"es ":28,"ਸ੍ਰ":19,"ers":16,"en ":20,"ਸਿੱ":77,"ਸਿੰ":78,"ਸੂਰ":32,"ਸੂਬ":12,"ਸਿਧ":13,"ਸਿਰ":19,"ਹਨ।":438,"ent":21,"ght":14,"ਸੀ।":71,"ਸ਼ਨ":37,"ਸ਼ਤ":17,"ਸ਼ਰ":12,"ਸ਼ਬ":26,"ਸ਼ਹ":65,"ਸ਼ੁ":34,"ਸ਼ੀ":30,"ਸਾਂ":13,"ਸ਼ਿ":19,"ਸ਼ਾ":149,"ਸ਼ਖ":29,"ਸ਼ਟ":21,"ਸਾਨ":18,"ਸਾਰ":46,"ਸਿਕ":33,"ਸਿਖ":14,"ਸਾਲ":1144,"ਸਾਹ":90,"ਸਿਟ":36,"ਸੀਅ":29,"ਸਾਇ":73,"ਸ਼ੇ":16,"ਸ਼੍":23,"ਸਿਆ":13,"ਸਰਕ":13,"ਸਮੇ":23,"ਸਮਾ":24,"ਗੜ੍":19,"ਸਰੀ":12,"ਸਲਾ":21,"ਸਤੰ":39,"ਸਦੇ":13,"ਸਦਾ":17,"ਸਦੀ":15,"ਸਨੂ":15,"ਗੁਰ":167,"ਗੋਬ":26,"for":13,"ਸਟਾ":14,"ਗ੍ਰ":502,"ਸਨ।":37,"ਚਾ ":12,"ਸਤਾ":54,"ਸਥਾ":37,"ਸਥਿ":13,"ਗਰੇ":16,"ਗਰਾ":30,"ਗਰੀ":415,"ਿਚ ":162,"ਾਹ ":17,"ਾਸ ":54,"਼ਟਰ":18,"ਿਕ ":134,"ਕੰਪ":19,"ਾਲ ":1369,"੧੬ ":14,"cti":12,"ਗਸਤ":63,"੧੭ ":15,"ਾਰ ":238,"੧੪ ":14,"੧੫ ":13,"੧੧ ":12,"ਾਮ ":62,"ਗਿਆ":104,"੧੨ ":16,"ਗਾਂ":14,"੧੩ ":12,"਼ਖ਼":29,"ਾਬ ":99,"੧੦ ":14,"ਹੱਦ":12,"ਖਾਂ":21,"ਖ਼ਸ":29,"ਖਿਆ":40,"ਾਨ ":155,"com":13,"ਗਣਿ":17,"ਿਆ ":232,"ਾਦ ":29,"੦੮ ":24,"cs ":14,"ਾਣ ":28,"ਾਤ ":23,"ਖੇਡ":37,"ਖੇਤ":37,"ਾਜ ":52,"ਖੋਂ":14,"ed ":32,"ਕੌਮ":15,"ਾਗ ":23,"ਕ੍ਰ":37,"ਸੰਸ":37,"੦੦ ":21,"ਘਰ ":13,"ਾਈ ":117,"ਸੰਖ":18,"ਕਾਂ":22,"ਸੰਬ":52,"ਸੰਤ":12,"ਕਿਤ":57,"ਾਂ ":1353,"਼ੀ ":81,"ਹੋਏ":48,"ਹੋਇ":70,"ਹੋਈ":20,"ਕਾਲ":30,"ਕਾਸ":23,"ਕਾਬ":25,"ਕਾਰ":103,"ਕੀਤ":87,"ਹੋਰ":25,"ਹੋਣ":23,"ਕਿਲ":58,"ਕਿਸ":74,"ਕਿਹ":19,"ਕੁਝ":20,"ਕੁੱ":13,"dia":15,"਼ਾ ":51,"ਹਿੰ":31,"ਹੈ।":922,"ਕੇਸ":12,"ਹੁੰ":468,"ਕੋਈ":24," ਅਪ":47," ਅਨ":20," ਆਉ":20," ਅਧ":17,"ੱਖੀ":14,"ੱਖਾ":16," ਅਮ":64," ਅਰ":39," ਅਸ":17,"ੱਖਰ":18,"ਕਦੀ":15," ਅਜ":33," ਅਗ":67," ਅਕ":54," ਅਤ":411,"ੰਸਥ":15,"ਗਾ ":22,"ਕਨੀ":13,"ੰਸਾ":14,"re ":16,"ੱਚੋ":29,"ੱਛਮ":12,"ਕਤੀ":13,"ਕਤੂ":34,"ੱਜੋ":16,"rs ":13," ਉੱ":56," ਉਹ":51," ਉਸ":69," ਇੱ":255," ਇੰ":25," ਉਪ":25," ਉਨ":48," ਉਤ":19," ਏ।":18," ਇਹ":533," ਇਸ":723," ਇਲ":20," ਅੰ":217," ਅੱ":34,"ਾ। ":18," ਇਨ":39," ਇਤ":23," ਇਥ":21,"rig":16,"ਗਰ ":22," ਆਮ":22,"ੱਖਣ":15," ਆਰ":19," ਇਕ":94," ਆਦ":14," ਆਪ":76," ਆਬ":12,"ਸਟ ":13," ਕਰ":183," ੨ ":19," ਕਲ":399," ਕਹ":19," ਕਾ":86," ਕੀ":109," ਕਿ":260," ਕੁ":68," ਗਈ":32," ਕੇ":73," ਕੈ":22," ਕੋ":64," ਕੌ":24," ੩ ":14," ਕ੍":23," ਗਏ":21,"ਕਸ਼":26," ਓਲ":31," ਕਈ":21," ੧ ":21,"ry ":17,"ਸਤ ":69,"ਕਲਾ":14,"ਵਰਗ":12,"ਕਲੰ":379," ਏਫ":13," ਏਨ":15," ਏਸ":15,"ਵਰਸ":36," ਐਗ":22," ਏਲ":14,"ਵਰਤ":32,"ਵਰੀ":76," 
ਜ਼":109," ਜਿ":159," ਜਾ":364," ਜੁ":64," ਜੀ":148," ਜੂ":37," ਜੇ":15," ਜੋ":96," ਜੰ":39,"ਸਨ ":38,"ਕਰਕ":17," ੮ ":14,"ਕਰਦ":41,"ੱਡਾ":23,"ਕਰਨ":55," ਚਿ":16," ਚਾ":51," ਚੁ":16," ਚੀ":38," ਚੰ":37," ਚੱ":12," ਜਰ":13," ਛੋ":14,"ਕਰੀ":18," ੯ ":15,"ਕਰੋ":16," ਜਨ":89," ਜਦ":19,"ਕਲਚ":23," ੬ ":12,"ੱਤਰ":42," ਚਲ":15," ੭ ":13,"ਗੀ ":17,"ਸਭ ":25," ਗਰ":23," ਕੰ":44," ੪ ":15," ਖੇ":84," ਗਣ":16," ਖਾ":28," ਖਿ":13,"ੱਥੇ":12," ਖ਼":15,"ੱਤਾ":19," ਘਰ":13," ਗ੍":436," ੫ ":15," ਗੋ":35," ਗੁ":188," ਗਿ":86,"ੱਤੇ":24," ਤਰ":31,"ੰਖਿ":16," ਤਾ":41," ਤਿ":24,"ੰਗਾ":15,"ੰਗਰ":20," ਤਕ":27," ਤੱ":66," ਦਰ":56," ਦੂ":37," ਦਾ":950," ਦਿ":1171," ਦੀ":419," ਦੁ":51," ਦਸ":54," ਤੌ":38," ਤੋ":497," ਤੇ":160," ਟੀ":56," ਟਰ":13,"ੰਗ੍":60," ਮਈ":37," ਬੇ":15," ਬੋ":36," ਬਾ":492," ਬਿ":25," ਬੀ":32," ਬੁ":18," ਭਗ":25," ਬਹ":34," ਫ੍":14," ਪੱ":30," ਪੰ":163," ਬਲ":14," ਬਰ":25," ਬਨ":14," ਫ਼":66," ਪੜ":16," ਫਾ":12," ਫਿ":18," ਫੁ":20," ਬਣ":71," ਪੋ":18," ਪ੍":133," ਮੌ":17," ਮੋ":24," ਮੈ":37," ਮੀ":19," ਮੁ":481," ਮਾ":131," ਮਿ":97," ਮਹ":95," ਮਸ":25," ਬੰ":15,"st ":12," ਭੌ":12," ਮਨ":31," ਮਤ":12," ਭਾ":230," ਭੀ":16," ਬ੍":13," ਭਰ":19," ਨਹ":38," ਨਿ":106," ਨਾ":302," ਨੂ":313," ਨੇ":127," ਦੱ":14," ਨਵ":51,"ਗਏ ":15," ਦੇ":1425," ਦੋ":31," ਧਰ":56," ਪੁ":69," ਪੀ":18," ਪਿ":76," ਪਾ":125," ਪੈ":40," ਪੇ":13," ਪੂ":34," ਪਰ":200," ਪਹ":57," ਪਟ":13,"ਕੋ ":15," ਲੱ":19,"ਈ ":351,"ੰਦੇ":65," ਵਖ":19,"ੰਦੀ":31,"ੰਦਾ":409," ਵਰ":63," ਵਧ":14," ਵੀ":103," ਵਿ":1204," ਵਾ":793,"ੰਦਰ":40," ਵਸ":13," ਵੇ":12," ਵੈ":16," ਵੰ":18," ਵੱ":108," ਸਕ":45,"ੰਬਰ":125," ਯਾ":23," ਰਚ":14," ਯੂ":72,"ਖੇ ":23," ਮੰ":56," ਰਾ":142," ਰਿ":33," ਰਸ":75," ਰਹ":56," ਰੇ":12," ਲਈ":76," ਰੁ":23,"str":14,"ੰਪਿ":40," ਰੂ":37," ਰੋ":38," ਲਗ":21," ਰੱ":19," ਲਾ":29," ਲਿ":55," ਲੀ":380,"ਏ ":114," ਲੇ":15," ਲੈ":20," ਲੋ":37,"Oly":12,"ਗਤ ":30,"ਲੰਪ":29,"ਲੰਡ":379,"ਲੱਗ":14,"ਂ ":2409,"ੰਜੀ":16,"ੰਜਾ":127," ਸਭ":32," ਸਬ":19," ਸਮ":69," ਸਨ":75," ਸਪ":16," ਸਰ":55," ਸਟ":13," ਸਤ":61," ਸਥ":20," ਸਦ":12,"ੰਤਰ":46," ਹਰ":55," ਸ੍":21," ਸੋ":27,"ਅ ":12," ਸਾ":1296," ਸਿ":210," ਸ਼":257,"ਖੀ ":31," ਹਨ":519," ਸੇ":33," ਸੂ":56," ਸੀ":135," ਸੁ":40," ਹੋ":233,"ੰਡਲ":14,"ੰਡੀ":26," ਸੱ":24,"ਆ ":416," ਸੰ":107,"ੰਡਾ":15," ਹਾ":41," ਹਿ":46," ਹੀ":66," ਹੁ":491," ਹੇ":15," ਹੈ":1352,"ੰਡਰ":381,"ਟ ":103,"ਝ ":29," ਏ ":29,"ਜ ":145,"ਕਸ ":17,"ਚ ":1074,"ਘ ":72,"ਹਿ ":31,"ਗ ":162," ਚ ":20,"ਹਾ ":59,"ਖ ":140,"ਕ ":1139,"ਹੀ ":112,"ਓ ":12,"ੀ। ":55,"pic":12,"ਕਰ ":44,"ਵੰਡ":19,"ਰ ":1488,"ਵੱਖ":25,"ਹੇ ":27,"ਵੰਬ":36,"ਵੱਜ":17,"ਵੱਡ":34,"ਭ ":31,"ਹੈ ":423,"ਵੱਲ":20,"ਸਕਦ":19,"ਗਈ ":25,"ਮ ":350,"ਫ ":31,"ਕੇ ":100,"ਬ ":211,"ਓਲੰ":29,"ਪ ":461,"ਧ ":64,"ਨ ":1907,"ਥ ":48,"ਕਾ ":68,"ਦ ":171,"ਕਿ ":66,"ਣ ":217,"ਕੀ ":396,"ਹੋ ":30,"ਤ ":537,"ਠ ":22,"ਡ ":132,"ਖਣ ":24,"ਾ ":2802,"ਵਿਸ":21,"ਵਾਰ":49,"ਵਿਕ":13,"ੁ ":14,"ਵਾਲ":96,"ਵਾਸ":13,"ਵਿਗ":34,"ਵਿਖ":22,"ਵਿਚ":200,"ੀ ":2902,"ਵਿਦ":16,"ਿ ":127,"ਵਾਨ":14,"ਵਾਂ":742,"ਹ ":633,"ਸਰ ":20,"਼ ":152,"ਵ ":71,"ਸਾ ":23,"ਵੇਦ":48,"ਸ ":997,"ਵੇਂ":22,"ਸ਼ ":84,"ਲ ":1735,"ਸਸ ":13,"ਵਿੱ":870,"ਸੇ ":56,"ਹਨ ":79,"ਂ।":15,"ਕਨ ":15,"ੋ ":213,"ਸੀ ":89,"੍ ":22,"ਹਰ ":16,"ੇ ":2940,"ੈ ":444,"ੂ ":303,"ਕਟ ":14,"ਰਸ਼":34,"ਰਸਾ":69,"ਰਸਿ":50,"ਰਹਿ":56,"ng ":24,"ਰਹੇ":17,"nce":16,"ne ":14,"ndi":15,"ਰਾਂ":87,"ਰਾਜ":89,"ਰਾਬ":13,"ਰਾਨ":31,"ਰਿਆ":72,"ਰਾਣ":19,"ਰਾਸ":25,"ਰਾਹ":24,"ਰਿਕ":21,"ਰਾਵ":14,"ਰਾਮ":35,"nd ":32,"ਰਿਤ":27,"ਰੀਆ":20,"ਰਮਾ":139,"ਮੰਡ":22,"ਮੰਨ":14,"ਲੇ ":121,"ਰਵਾ":29,"ਰਵਰ":37,"ਰੈਲ":39,"ਰੋਜ":15,"ਰੋਮ":30,"ਰੋੜ":12,"nte":17,"ੰਕ ":58,"ns ":12,"ਰੀਸ":12,"ਰੀਬ":18,"ਰੀਕ":94,"ਰਿਸ":18,"ਰਿਹ":15,"ੰਗ ":69,"। ":1125,"ਕਈ ":21,"ਰੂਪ":26,"ੰਘ ":71,"ਰੈਗ":367,"ਰੈਕ":12,"ਰੇਗ":15,"ਰੇਜ":79,"ੰਜ ":22,"ਲਮ ":13,"ਰਕੇ":20,"of ":41,"ਲਣ ":22,"ਰਕਾ":23,"ਐਗਰ":22,"ਰਨਾ":16,"ਲਾ ":163,"ਰਦੁ":28,"ਰਦੀ":16,"ਰਦਾ":42,"ਰਦੇ":29,"ਰਮਨ":15," ਈ ":16,"ਰਬੀ":20,"ਲੀ ":116,"ਯੋਗ":18," ਆ ":21,"on 
":41,"ਰਤਾ":15,"ਰਤਿ":15,"ਰਤੀ":63,"ona":13,"ons":12,"ਯੂਨ":34,"ਯੂਰ":14,"ਯੂਲ":19,"ਲੀਪ":371,"ੱਤ ":62,"ਲੀਵ":25,"ੱਦ ":13,"ਲਾਂ":56,"ਲਾਈ":52,"ਵਲ ":18,"ਲਿਆ":33,"ਲੀਅ":22,"ld ":18,"ਲੀਆ":17,"ਲਾਵ":16,"ਲਿਖ":25,"ੱਧ ":29,"ਵੇ ":16,"ਲ੍ਹ":16,"ੱਲ ":23,"ਵਾ ":26,"ਲੋਮ":22,"ਲੋਂ":20,"ਲੋਕ":25,"ਲੋਗ":23,"ਵੀ ":125,"ਲੇਖ":15,"ਲੈਂ":19,"ੰਤ ":18,"mb ":14,"ਵਖ ":12,"ੰਡ ":46,"ਲਚਰ":23,"mer":15,"ਲਗਾ":13,"ੰਨ ":21,"lym":12,"ੰਧ ":16,"।ਇ":25,"ੰਦ ":43,"ੰਥ ":14,"mpi":14,"ਰੱਖ":17,"ੱਕ ":255,"ਰੰਥ":15,"ਵਨ ":15,"ੱਖ ":91,"mu ":47,"ੱਚ ":827,"ਆਉਂ":16,"ਅਨੁ":13,"੍ਹਾ":96,"ਇਸ ":693,"ਰਾ ":103,"ਮਨੇ":20,"ਇਹ ":527,"ਮਨੁ":16,"ਮਨੀ":13,"ਅਤੇ":407,"ਰਲ ":24,"ਭਾਈ":15,"ਭਾਗ":17,"ਭਾਰ":147,"ਭਾਵ":14,"ਭਾਸ":44,"ਅਜਿ":15,"ਇਲ ":16,"ਰਨ ":81,"ਰਮ ":38,"ਅਰਥ":18,"ਅਮਰ":51,"ਯਾ ":16,"ਬੋਲ":29,"ਰਡ ":23,"ਅਪ੍":39,"ਰਤ ":82,"ਬ੍ਰ":15,"੍ਰੋ":27,"੍ਰੇ":90,"੍ਰੈ":416,"ਰਣ ":20,"੍ਰਮ":14,"੍ਰਦ":16,"੍ਰੀ":65,"੍ਰਿ":60,"੍ਰਾ":30,"੍ਰਸ":16,"੍ਰਹ":30,"ਰਥ ":20,"੍ਰੰ":13,"ਮੇਂ":24,"ਆਨੀ":21,"ਏ। ":23,"ਆਪਣ":51,"ਆਦਿ":12,"ਆਦਾ":18,"ਆਣਾ":27,"ਮੈਨ":17,"ਮਾਨ":46,"ਮਾਤ":26,"ਮਾਣ":116,"ਮਿਕ":21,"ਮਾਰ":62,"ਮਾਲ":18,"ਮਿਥ":28,"ਮਾਂ":39,"ਇਆ।":31,"ਮੁੱ":38,"ਮਿਲ":41,"ਮਿਸ":19,"ਮੀਟ":34,"ਮੁਖ":14,"ਮੁਕ":29,"ਮੁਤ":380,"ਰੋ ":14,"ਮਹਾ":43,"ਮਹਿ":32,"ਮਹੀ":13,"ਆਵਾ":13,"ਰੂ ":145,"ਭੌਤ":12,"ਆਰਾ":46,"ਰੀ ":651,"ਬੰਧ":25,"ਰੇ ":75,"ਲਈ ":78,"ਉਣ ":23,"ਮਰੀ":43,"ਇਥੇ":16,"ਬਣਾ":43,"ਇਨ੍":18,"ਪੜ੍":12,"ਫੁਟ":13,"ਫ਼ਰ":40,"ਇਣਕ":56,"ਉਸ ":51,"ਉਹ ":40,"ਈਆਂ":13,"ਇਤਿ":19,"ਮਨ ":35,"ਇਸਦ":16,"ਪ੍ਰ":176,"ਪੈਦ":15,"ਅੰਤ":31,"ਅੰਦ":15,"ਅੰਗ":91,"ਅੰਕ":65,"ਭੀ ":15,"ਇਲਾ":16,"ਈਨਾ":22,"ਰਜ ":32,"ਰਚ ":44,"ਬੀਜ":26,"ਰਗ ":14,"ਬਿੰ":25,"ਬਾਦ":22,"ਬਾਰ":32,"ਬਾਬ":12,"ਰਕ ":18,"ਬਾਕ":372,"ਬਾਲ":27,"ymp":12,"ਈਡਰ":15,"ਬਾਅ":27,"ਬਾਈ":14,"ਉਂਦ":22,"ਬਹੁ":25,"ਭਗਤ":25,"ਮੇ ":12,"ਬਲਾ":20,"ਪੱਛ":12,"ਪੰਜ":152,"ਬਰਾ":15,"ਫ੍ਰ":14,"ਮੀ ":18,"ਮਾ ":32,"ਸਮ":81,"ਸਭ":32,"ਸਬ":22,"ਸਫ":15,"ਸਪ":32,"ਸਨ":101,"ਸਵ":19,"ਸਲ":51,"ਸਰ":94,"ਾ।":27,"ਸਟ":58,"ਸਦ":49,"ਸਤ":207,"ਸਥ":50,"ਸਕ":73,"ਵੱ":108,"ਵੰ":62,"ਵੈ":22,"ਵੇ":119,"ਉਨ੍":43,"ਵਸ":16,"ਵਿ":1234,"ਵੀ":154,"ਵਾ":1015,"ਵਨ":21,"ਵਧ":14,"ਵਰ":201,"ਵਲ":39,"ਵਖ":19,"ਲੰ":416,"ਲੱ":23,"ਲੜ":14,"ੌਰ ":34,"ਲੋ":133,"ਲ੍":24,"ਲੇ":163,"ਲੈ":43,"ਲਿ":106,"ਲਾ":368,"ਲੁ":15,"ਲੀ":580,"ਲਹ":12,"ਰੱ":21,"ਰੰ":40,"ਲਵ":25,"ਲਬ":13,"ਲਮ":25,"ਲਤ":16,"ਲਣ":25,"ਲਦ":24,"ਲਚ":23,"ਲਕ":25,"ਲਗ":28,"ਰੈ":428,"ਰੋ":128,"ਰ੍":22,"ਰੀ":839,"ਰੁ":36,"ਰੂ":205,"ਲਈ":79,"ਰੇ":213,"ਰਹ":94,"ਰਸ":179,"ਰਿ":221,"ਰਾ":502,"ਮੱ":15,"ਮੰ":60,"ਰਲ":37,"ਰਵ":92,"ੀਤ":114,"ੁਆ":59,"ੀਦ":20,"ੀਬ":34,"ੀਮ":70,"ੀਨ":92,"ੀਪ":396,"ੁਖ":20,"ੀਵ":67,"ੁਕ":56,"ੀਰ":67,"ੀਲ":39,"ੇ।":14,"ੁਝ":24,"ੁਟ":25,"ੀਸ":29,"ੁਜ":22,"ੀਆ":241,"ਿਥ":45,"ਿਦ":23,"ਿਣ":20,"ੀਅ":117,"ਿਤ":204,"ੀਂ":64,"ਿਡ":14,"ਿਟ":58,"ਿਮ":51,"ਿਬ":77,"ਬਰ ":175,"ਿਪ":16,"ਿਧ":20,"ਿਨ":1162,"ੀਗ":18,"ਿਵ":98,"ੀਕ":136,"ਿਲ":265,"ਿਰ":130,"ੀਟ":47,"ਾੜ":27,"ੀਜ":44,"ਿਹ":111,"ਿਸ":263,"ਾਡ":16,"ਇੰਟ":13,"ਾਣ":249,"ਿਅ":43,"ਾਤ":101,"ਿਆ":517,"ਾਦ":96,"ਾਧ":15,"ਿਉ":16,"ਾਨ":318,"ਿਊ":33,"ਾਪ":44,"ਾਬ":594,"ਾਮ":104,"ਾਰ":660,"ਾਲ":1569,"ਾਵ":109,"ਿਖ":72,"ਿਕ":215,"ਿਗ":45,"ਾਹ":169,"ਿਚ":202,"ਾਸ":245,"ਿਜ":25,"਼ਾ":203,"਼ਿ":116,"ਾਅ":32,"਼ੀ":109,"ਾਂ":1630,"਼ੁ":43,"ਾਈ":221,"ਾਉ":53,"਼ੇ":19,"ਾਇ":185,"਼ੋ":16,"਼ੈ":21,"ਬਲ 
":16,"਼੍":25,"ਾਕ":443,"ਾਗ":63,"ਾਖ":18,"ਾਜ":126,"ਾਚ":13,"਼ਨ":40,"਼ਤ":27,"਼ਟ":24,"਼ਸ":37,"਼ਹ":70,"਼ਵ":12,"਼ਰ":65,"਼ਬ":33,"਼ਮ":21,"਼ਖ":29,"ਹੱ":25,"ਹੰ":15,"ਹੂ":17,"ਹੁ":520,"ਹੈ":1355,"ਹੇ":48,"ਹਾ":329,"ੀ।":87,"ਹੀ":203,"ਹਿ":360,"ਸੰ":167,"ਸੱ":26,"ਹੋ":237,"ਹੌ":13,"ਸੂ":66,"ਸੁ":43,"ਸੀ":222,"ਸੇ":97,"ਹਨ":550,"ਸਹ":12,"ਸਸ":15,"ਸਿ":313,"ਸਾ":1477,"ਸ਼":623,"ਹਲ":14,"ਸੋ":30,"ਹਰ":67,"ਹਮ":13,"ਸ੍":26,"ਦਸ":57,"ਦਿ":1225,"ਦਾ":1748,"ਦੁ":90,"ਦੀ":615,"ਦੂ":48,"ਥੇ":53,"ਦਨ":53,"ਥੋ":13,"ਦਰ":129,"ਤੰ":44,"ਦਲ":21,"ਤੱ":67,"ਧਾ":77,"ਨਜ":12,"ਨਡ":15,"ਧੀ":16,"ਧਿ":28,"ਨਦ":13,"ਨਤ":17,"ਦੇ":1613,"ਦੋ":54,"ਨੁਸ":12,"ਧਰ":65,"ਨਕ":44,"ਤਸ":17,"ਤਵ":19,"ਤੀ":231,"ਤੂ":46,"ਤਿ":96,"ਤਾ":728,"ਣੇ":53,"ਤਨ":20,"ਤਪ":13,"ਤਤ":12,"ਤਰ":186,"ਤਲ":37,"ਤਮ":27,"ਉਹਨ":12,"ਥੀ":15,"ਥਿ":46,"ਥਾ":70,"ਤੋ":518,"ਤੇ":631,"ਨੁੱ":16,"ਤੌ":43,"ਇੱਕ":245,"ਣਕ":72,"ਨੂੰ":329,"ਡੇ":30,"ਣਾ":137,"ਣੂ":108,"ਣਿ":62,"ਣੀ":85,"ਤਕ":48,"ਨੇਂ":14,"ਟ੍":17,"ਟੇ":26,"ਨੇਜ":20,"ਟੀ":130,"ਟਿ":29,"ਡਿ":29,"ਡੀ":68,"ਨ।":476,"ਡਾ":86,"ਡਲ":20,"ਡਰ":410,"ਡਦ":12,"ਮਰ":69,"ਮਲ":18,"ਬੰ":44,"ਮਸ":29,"ਮਹ":96,"ਮੁ":499,"ਮੀ":82,"ਮਿ":154,"ਮਾ":391,"ਮੂ":19,"ਮੈ":49,"ਮੇ":59,"ਮ੍":15,"ਮੌ":19,"ਮੋ":28,"ਰਕ":76,"ਰਖ":17,"ਰਗ":45,"ਰਚ":72,"ਰਜ":81,"ਰਟ":18,"ਯਾ":37,"ਰਡ":28,"ਯੂ":73,"ਰਣ":36,"ਯੁ":14,"ਰਥ":37,"ਰਤ":198,"ਰਦ":131,"ਰਨ":116,"ਰਫ":22,"ਰਪ":20,"ਯੋ":27,"ਰਬ":46,"ਰਮ":236,"ਪੰ":164,"ਬਲ":53,"ਪੱ":37,"ਬਰ":215,"ਫ੍":15,"ਬਹ":34,"ਭਗ":29,"ਬੁ":20,"ਬੀ":120,"ਬਿ":55,"ਬਾ":578,"ਬੋ":46,"ਬੈ":14,"ਬੇ":32,"ਮਈ":41,"ਮਕ":17,"ਬ੍":15,"ਭਰ":22,"ਮਜ":12,"ਮਤ":30,"ਭਾ":256,"ਭੀ":18,"ਭਿ":14,"ਭੌ":12,"ਮਦ":18,"ਮਨ":98,"ਪਲ":21,"ਨੰ":12,"ਪਹ":57,"ਪਸ":12,"ਪਰ":222,"ਪੂ":52,"ਪੈ":44,"ਪੇ":23,"ਪੀ":37,"ਪੁ":90,"ਪਾ":168,"ਪਿ":130,"ਬਕ":380,"ਪੋ":22,"ਪ੍":176,"ਨ੍ਹ":75,"ਫਰ":17,"ਫਲ":16,"ਬਦ":46,"ਬਨ":25,"ਫ਼":106,"ਪੜ":21,"ਫਾ":21,"ਫਿ":21,"ਫੁ":23,"ਬਣ":71,"ਨਵ":102,"ਦੱ":14,"ਨਲ":14,"ਨਰ":14,"ਨਮ":39,"ਨੇ":196,"ਨੂ":341,"ਨੀ":200,"ਨੁ":33,"ਨਾ":539,"ਨਿ":163,"ਨਸ":36,"ਨਹ":40,"ਨ੍":89,"ਨੈ":12,"ਨੋ":19,"ਪਨ":15,"ਪਣ":56,"ਪਤ":28,"ਪਟ":24,"ਬਦ ":20,"ਕਰ":232,"ਕਮ":20,"੨ ":59,"ਕਲ":439,"ਕਨ":33,"ਕਦ":33,"ਕਟ":31,"ਕਤ":65,"੧ ":77,"ਕਈ":21,"ਕੱ":19,"ਕੰ":48,"ਗਲ":25,"ਗਰ":511,"੪ ":44,"ਖੋ":29,"ਖੇ":111,"ਗਦ":22,"ਗਣ":22,"ਗਤ":41,"ਖੀ":35,"ਖਾ":81,"ਖਿ":54,"ਖ਼":60,"ਕੜ":15,"ਕ੍":37,"ਗਏ":21,"ਖਰ":32,"ਕੌ":24,"੩ ":43,"ਕੋ":104,"ਗਈ":32,"ਖਦ":15,"ਕੈ":22,"ਕੇ":135,"ਕੁ":80,"ਕੂ":15,"ਖਣ":34,"ਖਤ":12,"ਕਾ":323,"ਕਿ":322,"ਕੀ":517,"ਕਹ":19,"ਕਵ":12,"ਕਸ":64,"ਏਲ":14,"ਐਗ":22,"ਪਹਿ":45,"ਏਸ":17,"ਓਲ":31,"੦ ":87,"ਜੇ":38,"ਜੈ":24,"ਜੋ":120,"ਜ਼":304,"ਜਾ":547,"ਜਿ":213,"ਜੀ":234,"ਜੁ":64,"ਜੂ":46,"ਪਿਤ":20,"ੰ ":328,"ਪਾਸ":19,"ਪਿਕ":29,"ਜਦ":23,"ਪਾਰ":18,"ਜਧ":12,"ਜਨ":116,"ਪਿਊ":14,"ਪਾਣ":18,"੯ ":52,"ਛੋ":17,"ਚੱ":12,"ਪਾਕ":32,"ਚੰ":38,"ਜਲ":16,"ਜਰ":27,"ਟਬ":13,"ਟਨ":25,"ਪਾਈ":14,"ਟਾ":78,"ਟਰ":121,"ਜੰ":41,"ੋਗਰ":27,"ਗੜ":29,"੬ ":50,"ਚਕ":12,"ਗਸ":65,"ਗਾ":75,"ਗੂ":16,"ਗੁ":197,"ਗੀ":24,"ਗਿ":132,"ਪੂਰ":40,"ਗੇ":24,"ਗ੍":505,"ਗੋ":52,"੫ ":50,"ਘਰ":19,"ਚਿ":28,"ਚਾ":99,"ਚੁ":16,"ਚੀ":48,"ਚੇ":17,"ਪਿੰ":41,"ਪੁਰ":63,"ਚੋ":42,"ਚੌ":23,"੮ ":73,"ਛਮ":12,"ਜਗ":13,"ਚਨ":19,"ਚਰ":32,"੭ ":49,"ਚਲ":24,"ਅਤ":454,"ਆਂ":297,"ਅਜ":34,"ਅਗ":69,"ਅਕ":61,"ਆਣ":33,"ਇਆ":135,"ਆਖ":12,"ਅਸ":23,"ਅਮ":113,"ਅਰ":63,"ਅਲ":18,"ਅਦ":42,"ਆਇ":12,"ਅਧ":17,"ਆਉ":22,"ਅਨ":55,"ਅਪ":47,"ਪਣੇ":34,"ਈਆ":13,"ਇਥ":21,"ਇਤ":25,"ਇਣ":62,"ਆਸ":22,"ਪਣੀ":15,"ਆਰ":95,"ਆਮ":24,"ਇਕ":101,"ਆਵ":17,"ਆਲ":16,"ਆਨ":54,"ਆਦ":35,"ਆਬ":13,"ਆਪ":78,"ਆ।":54,"ਬਾ ":22,"ਂਕ":18,"ਂਗ":31,"ਈ।":12,"ਂਟ":25,"ਂਡ":36,"ਂਸ":30,"ਂਤ":24,"ਂਦ":244,"ਉੱ":56,"ੋੜ ":16,"ਬੀ ":78,"ਏਨ":15,"ਏਫ":13,"ਇਨ":48,"ਅੱ":34,"ਇਲ":37,"ਅੰ":220,"ਈਟ":14,"ਇਹ":542,"ਮਈ ":41,"ਇਸ":741,"ਉਂ":46,"ਈਡ":27,"ਈਨ":32,"ਏ।":33,"ਬੇ ":13,"ਉਦ":21,"ਉਨ":51,"ੋਇਆ":71,"ਉਣ":28,"ਉਤ":19,"ਉਪ":26,"ੜ ":44,"ਇੱ":255,"ਇੰ":32,"ਉਸ":76,"ਉਹ":56,"ਊਟ":14,"ਪਰਮ":113,"ਨਕਸ":15,"ਪਤ ":12,"ਧਰਮ":31,"ਧਰਤ":22,"ਪਰ 
":49,"ਧਾਂ":14,"ਉੱਤ":40,"ੋਬਿ":25,"ਦੁਆ":48,"ਦਿੱ":27,"ਦੁਨ":24,"ੋਮੀ":24,"ਦੂਸ":12,"ਦੂਜ":16,"ੋਮਨ":25,"੦੦":66,"੦੮":25,"੧੦":19,"੧੩":12,"੧੪":15,"੧੧":14,"੧੨":18,"੧੭":17,"੧੮":24,"੧੫":17,"੧੬":17,"੧੯":42,"੨੧":14,"੨੦":53,"ਦਾਂ":14," ੧੮":24," ੧੭":16,"ਦਾਨ":17," ੧੬":17," ੧੫":16," ੧੪":15," ੧੩":12," ੧੨":16," ੧੧":13,"ੋਲੀ":19,"ਦਾਰ":26," ੧੯":41," ੨੦":53," ੨੧":14,"ਦਿਆ":31," ੨੭":12," ੨੬":12," ੨੯":17," ੨੮":14," ੨੩":13," ੨੨":14," ੨੫":13," ੨੪":14,"ਦਾਸ":19," ੩੦":25," ੩੧":16,"ਦੀਆ":75,"ਦਿਨ":1097,"umb":16,"੍ਹ ":21," ੧੦":19,"ਦੋਂ":17,"੩੧":18,"੩੦":26,"um ":31,"੨੯":17,"੨੮":15,"੨੭":13,"੨੬":13,"੨੫":14,"੨੪":14,"੨੩":13,"੨੨":14,"ੱਤ":172,"ੱਢ":13,"ੱਡ":45,"ੱਧ":49,"ੱਦ":25,"ੱਥ":25,"ੱਗ":23,"ੱਕ":286,"ੱਖ":185,"ੱਛ":18,"ੱਜ":30,"ੰਸ":52,"ੱਚ":879,"ੱਟ":28,"ੱਠ":21,"ੱਲ":52,"ੱਸ":22,"ੰਥ":18,"ੰਤ":85,"ੰਧ":36,"ੰਦ":618,"ੰਡ":502,"ੰਬ":155,"ੰਮ":32,"ੰਨ":45,"ੰਪ":49,"ੰਕ":75,"ੰਖ":19,"ੰਗ":218,"ਦੇਸ":44,"ਦੇਵ":25,"ੰਜ":177,"ੰਟ":22,"ੰਘ":85,"ਨਵਰ":42,"ਨੀਆ":32,"ਨੀਅ":13,"ਨਿਵ":62,"ਨੀਕ":15,"ਨਿਆ":24,"ਬਕ ":378,"ਨਾਨ":32,"ਨਾਲ":179,"ਨਾਵ":16,"ਨਿਕ":18,"ਨਾਮ":38,"ੜ੍":38,"ਨਾਂ":88,"ੜੀ":29,"ੜਾ":29,"ੜੇ":20,"ty ":13,"ਨਸੰ":13,"ਨਹੀ":36,"ਏਫ ":13,"ਨਵੰ":36,"ੁਦ":13,"ੂਆ":13,"ੁਨ":42,"ੁਤ":412,"ੁਣ":26,"ੁਮ":18,"ੁਰ":279,"ੁਸ":47,"ਿੰ":258,"ੁਲ":97,"ਿੱ":1026,"ੈ।":922,"ੂਜ":17,"ੂਨ":78,"ੂਦ":14,"ੂਰ":124,"ਏਲ ":13,"ੂਬ":50,"ੂਪ":31,"ੂਸ":23,"ੂਲ":53,"ੁੰ":490,"ੁੱ":117,"ੂੰ":335,"ੇਂ":94,"ੇਕ":12,"ੇਖ":36,"ੇਦ":53,"ੇਤ":52,"ੇਡ":47,"ੈਂ":49,"ੇਟ":14,"ੇਜ":108,"ੇਗ":24,"ੇਰ":47,"ੇਲ":44,"ੈਕ":39,"ੇਵ":52,"ੈਗ":377,"ਧਾਰ":30,"ੇਨ":27,"ਧਾਨ":15,"ਧਿਆ":15,"ੈਣ":13,"ੈਦ":24,"ੇਸ":109,"ੈਟ":17,"ੈਲ":73,"ੈਰ":17,"ੈਨ":38,"ੈਸ":23,"ੋਂ":617,"ੋਇ":77,"ੋਈ":46,"tio":34,"thu":15,"ੋਟ":27,"ੌਂ":47,"ੋਡ":13,"ੋਜ":32,"ੋਧ":14,"ੋਨ":32,"ੋਪ":22,"ੋਣ":30,"ੋਤ":20,"ੋਏ":51,"ੋਗ":55,"ੋਚ":14,"ੋਕ":33,"ੌਜ":14,"ੌਤ":23,"ੋਮ":61,"ੋਬ":36,"ੋਲ":71,"ੋਰ":92,"ੋਵ":23,"ੋਹ":24,"ੋਸ":21,"ੋੜ":31,"ੌਮ":17,"ੌਰ":50,"ted":14,"੍ਹ":146,"੍ਰ":834,"ter":25,"the":71,"ਆ। ":38,"ਤੂਬ":34,"ੇਲ ":13,"ੇਰ ":12,"ਤੀਆ":19,"ੇਸ ":12,"ਤਿਹ":21,"ਤਿਆ":30,"ਤਾਨ":49,"ਤਿਕ":18,"ਤਾਬ":383,"ਨਕ ":14,"ਤਾਰ":18,"ਤਾਂ":61,"ੇਵ ":19,"ਨਮ ":32,"ਤੌਂ":25,"ਤੋਂ":485,"ੈਨ ":18,"ਤੌਰ":15,"ੈਲ ":41,"ਨਾ ":113,"ਥਾਂ":13,"ਥਿਤ":12,"ਥਿਹ":26,"ਥਾਨ":18,"ਦਸੰ":39,"ਨੀ ":107,"ੇਗਰ":13,"ਦਰਿ":26,"ਨੇ ":143,"ਤੰਬ":41,"ਤੱਕ":12,"ਤੱਤ":53,"ਂਟ ":12,"ੇਜੀ":27,"ੇਜ਼":73,"ੇਡਾ":24,"ਂਡ ":22,"ੈਂਡ":21,"ੇਡਦ":12,"ਂਗ ":18,"ੇਦਨ":48,"ੇਤੀ":16,"ੇਤਰ":23,"ੋਂ ":614,"ਤੋ ":20,"ਣਕਾ":14,"ਤਕਨ":13,"ਥੇ ":48,"ਦਨ ":52,"ੈਕਟ":16,"ੋਈ ":39,"ce ":16,"ੈਗਰ":366,"ੋਏ ":45,"ਥਾ ":23,"ੇਵਾ":19,"ੇਸ਼":80,"am ":12,"ੋਕ ":20,"al ":27,"ਣਿਤ":17,"ਣਿਆ":40,"and":32,"amu":47,"an ":23,"ੈਦਾ":18,"ਦਰ ":34,"ਣਾਇ":20,"ੌਂ ":41,"ੋਣ ":17,"ਦੋ ":25,"ਤਰਾ":24,"ਤਰੀ":36,"ੋਪ ":15,"ਦੇ ":1528,"at ":19,"ਦੀ ":510,"as ":15,"ੋਰ ":44,"ੋਲ ":12,"ਦਾ ":1619,"ati":17,"ਦਿ ":15,"ੜਾ ":14,"ੁਟਬ":13,"ੀਸਟ":12,"ੀਵਨ":13,"ੀਵਰ":32,"ੁਕਾ":27,"ਤਕ ":19,"ੜੀ ":26,"ੀਮਾ":12,"�":48,"ੁਰਦ":34,"ਡਦੀ":12,"ੜੇ ":18,"ੁਨਿ":12,"ੁਨੀ":20,"ਣੀ ":69,"ੁਤਾ":380,"ਣਾ ":86,"ਡਰਾ":13,"ਣੂ ":105,"ਣੇ ":51,"ਆਂ ":277,"ੁਸ਼":14,"ਂਦਰ":14,"ਂਦੀ":34,"ਂਦਾ":168,"ਂਦੇ":20,"ੈ।ਇ":14,"ੂੰ ":327,"ਤਰ ":83,"ਅਨ ":34,"ਡਾਂ":31,"ੁਰੂ":142,"ੁਰਾ":23,"ੁਰਸ":21,"ਿੰਦ":69,"ਿੰਡ":45,"ਿੰਘ":68,"ਿੰਗ":49,"ਡੀਅ":12,"ਿੱਤ":44,"ਿੱਧ":18,"ਿੱਖ":61,"ਅਤ ":35,"ਿੱਚ":863,"ੁਲਾ":52,"ੁਲੀ":15,"ਅਦ ":27,"ੂਰਜ":24,"ੂਰਬ":14,"ੂਬਰ":34,"ਤਾ ":184,"ਤੀ ":175,"ੂਨਿ":14,"ੂਨੀ":19,"ਤੇ ":613,"ਜੰਤ":20,"ਜੰਗ":15,"ਡਰ ":388,"ਆਨ ":22,"ੂਲੀ":19,"ਇਆ ":100,"ਆਪ ":24,"ਅਮ ":45,"ੁੱਖ":54,"ਅਰ ":17,"ੁੰਦ":474,"ਡਲ ":18,"ਟਬਾ":13,"ਡੀ ":25,"ਡਾ ":35,"ਨ। ":77,"ੇਂ ":52,"ਡੇ ":21,"ੜ੍ਹ":37,"ਟਰੀ":23,"ਟਾਂ":13,"ਟਾਇ":14,"ਅਕਤ":41,"ਅਕਾ":15,"ਅਗਸ":63,"ਆਮ ":19,"ਆਰ ":21,"ਣਕ ":58,"ਟਿਆ":13,"ਇਕ ":86,"ਟੀਮ":54,"ਟ੍ਰ":17},"n_words":[112478,136533,89577],"name":"pa"}
\ No newline at end of file
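
For context on the deletions in this hunk: each removed file is a langdetect language profile, a single-line JSON object whose schema is visible in the diff bodies themselves. "freq" maps 1- to 3-character n-grams to their counts in the training corpus, "n_words" holds the total n-gram count per length (unigrams, bigrams, trigrams), and "name" is the language code ("pa" in the file above, "pl" in the file below). The sketch that follows shows one way to read such a profile; the file path and the normalization step are illustrative assumptions, not code taken from langdetect itself.

import json

def load_profile(path):
    """Parse a langdetect profile file and return relative n-gram frequencies."""
    with open(path, encoding="utf-8") as fh:
        profile = json.load(fh)
    counts, totals = profile["freq"], profile["n_words"]
    # Normalize each count by the corpus total for its n-gram length,
    # turning raw counts into P(ngram | language) estimates.
    return {
        ngram: count / totals[len(ngram) - 1]
        for ngram, count in counts.items()
    }

# Example: relative frequency of the Polish bigram "ów" (hypothetical path).
probs = load_profile("langdetect/profiles/pl")
print(probs["ów"])
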
+++ /dev/null
-{"freq":{"D":76546,"E":48349,"F":59499,"G":97533,"A":113161,"B":112599,"C":110974,"L":74803,"M":131493,"N":86269,"O":63924,"H":50582,"I":84977,"J":54373,"K":109538,"U":33818,"T":71466,"W":122172,"V":29111,"P":225853,"S":204600,"R":87715,"Y":5329,"X":17841,"Z":57656,"f":154643,"g":760584,"d":1304112,"e":3541226,"b":467514,"c":1792078,"a":3833729,"n":2529442,"o":3537464,"l":1140017,"m":1150421,"j":965827,"k":1483844,"h":558914,"i":3732766,"w":2322567,"v":42704,"u":1069834,"t":1619824,"s":1819258,"r":2204575,"p":1082020,"z":1828688,"y":1372315,"x":13069,"é":11522,"á":5335,"ü":5350,"ö":4590,"ó":396123,"ę":271422,"ć":60605,"ą":370153,"ś":279181,"Ś":15543,"ń":123462,"ł":591480,"Ł":9157,"ż":255164,"Ż":6256,"ź":20372," l":139321," m":204189," n":260999," o":286725," h":33638," i":177645," j":136928," k":213385," d":243257," e":37159," f":50076," g":203060,"р":5358," a":148427," b":95754," c":133679," z":321847," u":97002," t":172994," w":969288," p":661390," s":339763," r":209751," J":52915," K":104698," H":47495," I":52640," N":80062," O":56243," L":69035," M":123296," B":105665," C":84648," A":95983," F":55238," G":73890," D":70098," E":42554," Z":55304,"к":5011," X":11746,"и":6265,"о":7740,"н":5938," S":182443," R":76715," P":214389,"а":9102," W":115231," V":20154," U":30651,"е":5223," T":64394," ż":19337," Ż":6158,"ęśc":13317," ś":51887," Ś":15346," Ł":9064," ł":17467,"A ":12056,"Da":8795,"Cz":10287,"Co":11705,"Ce":7894,"Ch":21362,"Ci":4668,"G ":10044,"Du":4576,"Do":17200,"Dr":5145,"De":9899,"Di":5692,"GC":12708,"Fe":4746,"Eu":8774,"Ge":6043,"Ga":8896,"I ":22055,"Fr":25795,"Fo":4852,"Fi":6166,"B ":5264,"C ":19133,"Au":8116,"Ar":11839,"Ba":23485,"Am":7571,"An":13752,"Al":15057,"Bu":9080,"Br":16403,"Ca":12445,"Bi":14211,"Be":12406,"Bo":15278,"Ku":5947,"Gó":8233,"Kr":17850,"Ko":31811,"Le":13922,"Li":15420,"La":13916,"Lu":9458,"Lo":10236,"Me":12645,"NG":5663,"Mi":29200,"O ":4778,"Ma":41474,"Mu":5320,"Mo":18522,"Ni":19737,"Ne":7072,"Na":24418,"P ":7412,"No":17077,"Ol":5489,"Od":7399,"PG":5183,"Ob":6870,"Gm":6336,"Gr":14758,"Go":7919,"Ha":11794,"He":11337,"II":18254,"Hi":5668,"Ho":9461,"In":12447,"Ja":16186,"L ":4866,"Je":18291,"Jo":8139,"Ka":25307,"M ":4990,"Ki":5813,"Un":7211,"VI":4920,"W ":15791,"Tu":5606,"Tr":10180,"To":9177,"Th":7956,"Te":9915,"Ta":10132,"V ":6364,"Sz":17267,"Sy":6237,"St":34755,"Su":6926,"Wo":12341,"Ws":4692,"Wi":28290,"Wa":16039,"We":8452,"Vi":5052,"X ":6678,"Pu":5975,"Pr":25703,"S ":11205,"Pe":7756,"Pa":34023,"Po":99382,"Pi":15484,"Os":6465,"Or":5855,"Se":13365,"Sc":7388,"Si":11311,"Sk":5337,"Sp":7038,"So":10398,"Ru":5061,"Rz":5267,"Sa":22530,"Re":17869,"Ro":19242,"Ra":13392,"b ":46793,"a ":1141880,"Wy":14854,"Za":17995,"Sł":20200,"i ":569208,"bó":4844,"gd":5266,"ge":34328,"ga":71454,"ać":8134,"fi":50271,"fr":12808,"fu":7503,"fo":23920,"j ":229252,"bę":5925,"có":12082,"gw":10035,"he":43730,"ha":50953,"gn":13137,"gm":84581,"gl":15240,"bą":7905,"gi":77017,"gh":5124,"gu":37274,"gr":81461,"go":201815,"du":56828,"dw":18508,"dy":56165,"dz":249442,"g ":46202,"ea":22981,"eb":20189,"ec":208980,"ed":135395,"de":106238,"dd":7144,"di":38677,"dk":20908,"dm":14590,"dl":41406,"do":162506,"dn":125390,"dp":5891,"ds":17240,"dr":43512,"ew":111327,"eu":15671,"ev":5608,"ey":6513,"ez":85170,"fa":15386,"h ":260015,"fe":16025,"eg":232853,"ef":12705,"ee":8458,"el":150281,"ek":109837,"ej":291522,"ei":21799,"ep":45340,"eo":16834,"en":264211,"em":150465,"et":92062,"es":172911,"er":319668,"ca":105725,"e 
":876959,"by":41730,"bs":19604,"br":47012,"bu":41705,"bn":8407,"bo":39861,"bl":23444,"bi":69531,"be":48450,"dc":10689,"db":7638,"da":129820,"f ":12810,"cz":335815,"cy":107837,"cu":14533,"ct":15111,"co":51690,"cn":22110,"ck":80920,"ci":275500,"cj":115414,"ch":414464,"ce":179393,"c ":34764,"az":93181,"ay":6838,"ba":43868,"d ":139962,"at":193861,"as":161357,"ar":314898,"aw":119971,"av":7852,"au":36829,"ak":99776,"al":185695,"ai":24623,"aj":148717,"ap":33608,"am":123297,"an":453452,"ac":219676,"ad":149992,"ab":34080,"ag":38473,"ah":5887,"ae":10185,"af":22452,"nu":23194,"nt":96712,"ns":36134,"nr":4954,"no":152706,"nn":38516,"ny":249928,"oe":4808,"of":20662,"oc":101234,"od":293823,"oa":8273,"ob":82692,"ię":123546,"om":108472,"on":318937,"ok":137945,"ol":232069,"oi":39360,"ją":114675,"oj":102542,"og":59938,"oh":7680,"ot":70221,"os":163642,"ov":6489,"ou":18871,"op":86592,"oo":8474,"or":254276,"ję":19551,"r ":119247,"ow":537395,"kó":27850,"oz":62053,"pe":42396,"pa":107114,"pc":8415,"pl":33619,"pn":11150,"po":424408,"ph":5594,"ił":21358,"pi":86460,"ką":16568,"iń":19637,"lo":81679,"ln":79071,"lm":9683,"ll":32626,"ls":113022,"hó":9384,"lu":76951,"lt":13727,"o ":461084,"dź":5330,"eś":75573,"mc":10115,"ma":121066,"mb":17385,"me":89660,"mi":333776,"mn":11655,"mm":6313,"mp":24445,"mo":86555,"ms":11588,"mu":41658,"ió":8421,"my":18150,"p ":18753,"na":529472,"nb":5437,"nc":75955,"nd":63112,"ne":249253,"eż":40953,"nf":7374,"ng":54651,"gł":47489,"ni":692138,"ią":60315,"nk":45573,"jw":6916,"ju":20103,"js":79998,"jn":43747,"jo":26898,"jm":12943,"dł":15854,"ki":412466,"ke":10550,"kc":19574,"ka":229229,"m ":343638,"kw":12402,"gó":15509,"ks":37081,"kt":82034,"ku":103176,"ko":241820,"kr":97482,"kl":22672,"km":9933,"eł":15052,"li":209435,"eń":19329,"lk":31238,"le":182455,"ld":11857,"lg":4525,"la":173548,"lc":6468,"lb":16636,"n ":138864,"hr":17682,"dó":7283,"ht":5694,"hu":10241,"cą":8049,"ań":46393,"ał":115023,"hi":35646,"hn":15999,"ho":81700,"id":59864,"ic":201937,"aś":6694,"ib":17573,"ia":315115,"ig":21890,"if":5037,"ie":1136068,"hy":6565,"k ":111343,"ir":31982,"is":139600,"it":68750,"iu":39428,"iv":6169,"iw":23202,"ii":58678,"ij":14562,"dą":5425,"ik":74901,"il":54561,"im":169852,"in":280897,"io":133204,"ip":17035,"aź":5844,"jc":7619,"aż":13250,"je":192771,"jd":20070,"ji":76024,"iz":35250,"l ":52041,"ja":94283,"są":9617,"tó":52700,"wz":9495,"wy":175884,"só":9957,"rę":14692,"z ":208589,"oż":86148,"wi":428233,"pł":15785,"rą":23256,"wk":11864,"wn":80692,"wo":213117,"wr":13531,"ws":100565,"wu":6452,"ró":58140,"y ":457874,"wa":271461,"oś":108251,"wc":28885,"we":125766,"oń":26624,"oł":121062,"vi":10565,"uz":15301,"uw":4835,"ve":14040,"va":7846,"x ":7519,"ui":5984,"uj":65669,"uk":36207,"ul":35319,"ue":10441,"ug":25222,"ur":117881,"us":71220,"ut":47463,"um":45758,"un":53367,"up":30095,"ty":157784,"tz":6072,"tu":85903,"tt":12443,"pó":31075,"tw":127984,"ub":63557,"ua":10045,"ud":56663,"uc":35683,"w ":734288,"to":228890,"tn":41685,"tl":10672,"ts":8380,"tr":134088,"te":190777,"kż":7374,"tk":33051,"ti":40445,"th":19993,"nę":6037,"ta":271329,"su":26228,"ss":16223,"st":440956,"sy":40789,"sz":185716,"sw":7100,"sl":4597,"sk":362869,"sn":10482,"sm":8241,"sp":72745,"so":57024,"sc":108490,"se":50637,"sh":8501,"ną":21831,"sj":10343,"si":154595,"rz":330244,"u 
":285894,"mę":6084,"sa":75170,"rr":8977,"rs":83617,"rt":71318,"ru":79284,"nó":11750,"rw":35761,"ry":128901,"rp":13436,"ro":332963,"rn":56890,"rm":40806,"rl":14366,"rk":29749,"ri":79907,"kł":24507,"rg":38504,"rf":5604,"re":197175,"iż":7681,"rd":33902,"rc":41834,"jś":4511,"rb":12273,"ra":338168,"t ":152714,"lę":5780,"mó":8836,"lą":12242,"s ":109715,"kę":4736,"py":18332,"pt":9892,"pu":38310,"pr":207712,"ps":10578,"ył":33312,"yń":8163,"zą":41404,"zę":35066,"uż":24054,"wł":15632,"zó":6325,"wę":7097,"uł":11203,"wą":11216,"wó":72388,"zg":11821,"zi":142936,"sł":33435,"zb":26540,"zc":34117,"zd":18869,"ze":326759,"tę":19569,"za":220009,"yz":12669,"zw":54934,"zy":216261,"zr":6357,"zu":31373,"zt":75735,"zo":93735,"zn":149848,"zp":13878,"zk":54785,"zj":15781,"zm":18594,"zl":6622,"yg":10175,"yf":6219,"yc":219665,"yd":32633,"yb":20679,"yw":52492,"yt":47612,"ys":92766,"yr":14041,"yp":20913,"yn":66941,"ym":103256,"yl":20573,"yk":74193,"yj":45803,"tą":4964,"yż":9849,"yś":4897,"zł":13341,"ów":161116,"ób":6381,"ój":5872,"ód":78413,"óc":5492,"ór":65730,"ól":16369,"ć ":59245,"ąd":23612,"ąc":135745,"ą ":117382,"óż":8608,"ół":34199,"ęb":8256,"ęc":19527,"ęd":32895,"ęg":11685,"ęk":11873,"ęp":16514,"ęt":22906,"ęs":9306,"ę ":96973,"ęz":11858,"ęś":18258,"ęż":7758,"ąg":22092,"ąs":12009,"ąt":12097,"ąz":17763,"ąż":21246,"ł ":59292,"łu":50120,"łt":4875,"ły":40224,"łk":15386,"łn":20167,"ło":189649,"łe":22223,"łc":7515,"ń ":16610,"ła":119873,"ńs":75332,"ńc":31072,"łó":33659,"śl":22153,"śn":19502,"śr":30296,"św":24072,"śc":92180,"Św":8984,"ś ":45778,"ść":40444,"łę":5560,"łą":10057,"źn":6441,"źd":5042,"ż ":20942,"żu":6315,"ży":40208,"żo":72896,"żn":19632,"żs":5910,"że":31935,"ża":27767,"żą":12050,"ąża":15191,"ęci":10430,"ędz":16930,"ędu":4976,"ęzy":9477,"ępu":8551,"ęks":6674,"ęst":6004,"łec":8332,"ła ":38802,"łan":5477,"ład":27517,"łac":9469,"ław":16966,"łas":4900,"łoż":68106,"łoś":19952,"ły ":21321,"ływ":5534,"łu ":9627,"łoń":16090,"łud":14450,"ług":11020,"łow":21447,"łos":9246,"łon":8256,"łno":15191,"łod":6015,"łka":8122,"ło ":21533," Ga":8827," Ge":5971," I ":4633," Fo":4786," Fr":25744," Fi":5918," Ha":11737," He":11280," Go":7898," Gr":14678,"ńcó":5655," Gm":6328," Ho":9424," II":9642," Hi":5632," Je":18244," Ja":16067," In":12303," Ka":25219," Ki":5713," Jo":8117," La":13845," Le":13833," Li":15290," Ko":31759," Kr":17816," Ku":5884," Gó":8232," Ma":41192," Mi":29080," Me":12575," NG":5574," Lo":10195," Lu":9428," Ne":6981," Na":24184," Ni":19681," Mo":18451," Mu":5268," Am":7370," An":13699," Al":15025," Ba":23126," Au":8090," Ar":11761," Be":12343," Bi":14133," Bo":15194," Br":16331," Bu":9050," Ca":12220," Ce":7873,"ńcz":4853," Ci":4561," Ch":21239," Co":11557," Cz":10255,"ńce":15534," Da":8706," Di":5639," De":9843," Dr":5105," Do":16986," Du":4568," Eu":8764," Fe":4580," Wy":14786," Ws":4674," Wo":12135," Wi":28155," We":8391," Wa":15970," Sł":20157," Za":17851," a ":33564," Os":6455," Or":5834," Po":99137," Pi":15394," Pe":7708," Pa":33893," No":17025," Ol":5468," Od":7369," PG":4658," Ob":6845," Ra":13288," Ro":19091," Re":17774," Pr":25548," Pu":5949," Sz":17219," Sy":6223," Su":6907," St":33766," Ta":10078," Th":7821," Te":9806," Tr":10135,"ńst":9625," To":9099,"ńsk":65260," Ru":5052," Sa":22468," Rz":5260," Si":11239," Sc":7241," Se":13251," So":10324," Sp":6969," Sk":5318," Vi":4980," Tu":5482," W ":13679," Un":6885," ja":24161," je":81709," im":8378," in":25684," is":7255," ka":27845," ki":9981," gw":7366," j ":15130," ha":5671," he":5463," gm":81906," gr":42165," go":8008," hi":10476," hr":4991," 
gł":25015," ni":54044," na":181345," mu":9539," mo":21735," ok":44273," on":4611," og":5234," od":78508," of":7838," ob":36134," no":11764," le":15807," li":34396," la":36395," ku":10205," kt":33550," ks":6999," kw":7071," km":9336," kl":10638," kr":29471," ko":64410," me":15598," mi":86248," o ":18463," ma":52176," lu":42219," lo":7327," ad":8368," am":10571," an":21235," ak":7205," al":15945," au":9619," ar":12965," as":16623," ba":14299," bi":14788," be":7689," bo":7340," by":20334," bu":7038," br":13094,"łuż":7054," ca":5242," el":9782," fa":6215," fu":5153," fr":6875," fo":9292," fi":17828," ge":6292," ga":16786," i ":123913," co":7528," ce":14033," ch":18489," ci":24028," da":17665," cy":4833," cz":55270," do":83523," dn":7531," dl":8990," dr":18598," de":38500," di":6649," dw":10226," du":6950," dz":27524," dy":8440," zm":8262," zo":12577," zn":26389," zw":23230," za":89284," zd":5616," ze":19904," zb":9872," zi":4717," sł":12543," są":8630," z ":107756," wy":79384," wz":8136," wł":13150," uż":7103," ru":4802," ry":6135," rz":19343," sa":14213," se":14117," si":84234," sk":26716," sp":31645," so":11139," ra":15542," re":46718," ro":89060," pu":7297," pr":169964," os":21891," ot":4775," op":16859," or":30563," r ":7257," ję":8026," oz":4712," pe":8239," pa":50520," pl":22703," po":329900," pi":36353," wa":17275," we":29944," wc":10065," ró":17486," wr":10056," wo":81271," ws":27715," wi":85470," pł":9284," w ":591873," ty":20926," tw":5981," pó":15190," tu":7399," us":6046," ut":8073," ur":32556," uk":7357," ul":5488," ta":20050," sw":5466," sz":26144," sy":14406," st":72132," su":7093," tr":22707," to":40581," th":4944," te":42334,"łów":28362," ła":6906,"łąc":7474," Św":8979," śr":24528," św":19258," że":5562," ży":7572,"GC ":12479,"Eur":7453,"Fra":21145,"II ":13664,"Her":4920,"Gra":4795,"Gmi":6178,"Bar":4693,"Bra":4520,"Cha":6684,"Dol":5831,"Nie":14925,"Now":7712,"Nor":6043,"PGC":4549,"Pie":5284,"Par":14525,"Poł":4889,"Pro":6911,"Prz":9834,"Pod":4669,"Pol":63410,"Pow":4846,"Rad":5296,"Jan":4677,"Jes":5251,"Kar":5182,"Kon":5190,"Kra":7526,"Koś":5328,"Gór":8184,"NGC":5540,"Mar":15306,"Mon":4650,"Mie":5310,"Mis":5057,"Wys":5223,"Wie":13213,"War":8050,"Sło":18142,"Str":5084,"Sta":19220,"Sie":4994,"Sai":4713,"Sch":5152,"Ros":4617,"Uni":6296,"The":5470,"bio":14193,"bli":16339,"bor":8274,"bow":6707,"bar":9707,"bec":8656,"ber":14109,"bel":6901,"bez":6239,"bia":7547,"bie":21077,"ca ":84053,"cac":4624,"był":19691,"ce ":116313,"bro":10887,"bra":11568,"bry":5366,"brz":9909,"bsz":5466,"bst":6710,"bur":11294,"bum":8181,"bud":9663,"by ":10708,"aka":5857,"am ":8178,"akc":4640,"aki":8717,"ajo":5413,"ajm":4898,"ajw":6681,"aju":12289,"al ":15001,"aja":6999,"ajd":19542,"aje":5093,"ain":11213,"ak ":12839,"aj ":9008,"agr":6880,"ago":6833,"ają":56423,"anu":7044,"any":57351,"ano":19191,"ann":5995,"ant":15544,"ans":10106,"ane":52267,"ang":21057,"ani":105534,"ank":9718,"ana":52659,"anc":30280,"and":28542,"amo":13297,"amp":4556,"ami":37349,"ame":32849,"ama":9031,"alo":8151,"aln":38047,"all":6281,"ali":35299,"ale":28164,"ala":17506,"alb":10365,"an ":25897,"akt":18470,"ako":21085,"abi":5944,"abs":7005,"ae ":4965,"ad ":24951,"ac ":5802,"afi":15590,"aga":5833,"ado":9262,"adm":7603,"adi":7361,"ade":12127,"ady":8896,"adz":20389,"ack":10335,"acj":42190,"aci":10932,"ach":91172,"ada":31955,"acz":24570,"acy":17521,"azo":11989,"azu":9191,"azw":14194,"aza":4626,"azd":9097,"azy":7482,"az ":24099,"ba ":9607,"at 
":32096,"are":9332,"ard":17168,"arc":24556,"ara":29532,"aro":21478,"arn":16904,"arm":8931,"arl":5792,"ark":13034,"ari":18485,"aru":4729,"anó":4796,"ars":25559,"art":41017,"asa":22529,"ary":12822,"arz":32133,"asi":8550,"aso":5103,"ask":9308,"ar ":9731,"api":8704,"apo":6161,"as ":15649,"aut":8273,"awa":20666,"aws":12931,"awn":15933,"awo":12547,"awi":30734,"asz":13814,"ata":29924,"ść ":40186,"ast":57235,"asy":6947,"atk":6138,"atr":8063,"ato":27910,"ate":18972,"akż":7374,"ati":9455,"aw ":7482,"atu":20800,"aty":21623,"auk":5346,"Świ":8845,"ści":91962,"śni":16799,"śli":5074,"śro":7633,"śre":20628,"ślą":5357,"świ":21339,"jeg":6331,"jej":4751,"jed":34575,"jen":6642,"jew":58578,"jes":38109,"ji ":74801,"jal":7148,"jak":18567,"jaw":9057,"aźd":4510,"je ":30364,"jdu":17294,"jmu":7356,"jna":5939,"jny":19293,"jne":11196,"jow":13446,"jon":8504,"ito":6446,"ity":14838,"isk":17265,"ist":59047,"isz":10261,"ita":8740,"ite":12873,"iwe":4858,"iwi":4661,"ius":4598,"ium":6118,"is ":13674,"ion":48352,"iop":4645,"ior":18228,"ios":6430,"iot":6025,"ipc":5025,"ikó":5855,"iow":26535,"isi":4518,"isa":8456,"iu ":24955,"ire":4546,"ira":6326,"ja ":38334,"izy":4891,"izo":6649,"izm":4986,"iza":9287,"kim":128411,"kic":24931,"kie":126958,"dłu":9193,"km ":8293,"ki ":116429,"kań":16086,"kcj":11696,"kra":24036,"kre":18070,"kry":8710,"krz":7284,"ku ":72509,"kro":6037,"kow":64383,"kos":4831,"kor":8298,"kop":10012,"kon":28370,"kom":18077,"kol":18048,"kok":4910,"klu":5529,"ko ":39683,"kle":4640,"kla":7237,"jsk":40065,"jsz":8170,"ju ":15477,"jsc":28565,"kaz":5339,"kat":10533,"kar":21380,"kan":15572,"kal":9168,"kam":6459,"kad":5909,"kac":12564,"ka ":110422,"ha ":5601,"han":8170,"har":13309,"he ":11732,"her":6722,"ał ":24279,"cą ":7445,"ań ":4989,"ałe":8027,"ała":29091,"ało":22494,"his":8897,"ały":11839,"ańc":7994,"ańs":33252,"go ":160963,"god":4635,"gni":4919,"gmi":81864,"gos":5577,"gor":5388,"gow":8194,"gu ":24037,"gro":9171,"grz":5388,"gry":6845,"gru":20718,"gra":31694,"gwi":7275,"ców":11188,"iaj":4940,"iam":4548,"ial":8076,"ian":22005,"ias":25407,"iar":9378,"iat":20282,"ic ":6481,"iac":9488,"iad":9916,"iaz":7546,"id ":16421,"ia ":151774,"iet":14221,"iew":12649,"iel":59211,"iem":53337,"ien":34053,"ier":72550,"ies":27602,"ied":28632,"ieg":53169,"iek":15875,"iej":135012,"iec":108952,"icy":13868,"ict":6284,"icj":4509,"ick":18455,"ici":5695,"ich":42480,"ice":16432,"ie ":425948,"ica":18194,"ide":5884,"ida":21189,"icz":59703,"ijs":6419,"im ":137877,"ika":22144,"ii ":57658,"ibą":4979,"iał":33757,"ik ":16857,"imp":4698,"imi":11049,"inc":8772,"ind":7280,"ina":57100,"inn":9746,"ino":8623,"int":11381,"ins":5688,"ine":11157,"ież":14661,"ing":19340,"ini":80940,"iny":19314,"iko":7564,"iki":9122,"in ":24060,"ilo":5237,"ill":10795,"ień":7511,"ilm":5161,"ili":7909,"ieś":51901,"io ":7198,"how":11191,"hol":4719,"hor":6372,"hod":42292,"hni":9648,"hra":5522,"dów":6742,"fia":9588,"ewó":56727,"ez ":38609,"ews":7573,"eze":8964,"ezi":7185,"ezj":6360,"eta":10069,"etn":9079,"esp":11621,"eso":4594,"est":61386,"esz":24722,"eto":22583,"etr":10559,"ety":8345,"ew ":5885,"ewi":12221,"ewo":9270,"ewn":8260,"ewa":5038,"er ":34400,"epa":15408,"eją":6188,"es ":24102,"epu":5035,"epr":8058,"eri":17872,"erg":8537,"ere":16329,"erc":5702,"era":33404,"erb":5967,"et ":10917,"esk":6857,"esi":11275,"esa":4705,"erz":21989,"ery":23381,"eru":5751,"erw":27139,"ert":7815,"ers":24326,"ern":18633,"erm":6952,"erp":6669,"ero":30451,"eki":4817,"eko":5980,"eks":10430,"ekt":18859,"eku":6941,"en 
":29550,"ela":11200,"ele":25422,"eli":15142,"eln":10771,"elk":19485,"ell":8172,"elo":10644,"elu":7577,"els":11724,"emc":9224,"ema":7081,"eme":7011,"emo":5689,"emi":25674,"ene":12261,"ena":8388,"end":7106,"enc":29197,"eno":4952,"enn":11782,"enk":5319,"eni":82488,"egł":18236,"ens":6693,"ent":47936,"ego":161252,"egi":24719,"ej ":196844,"egu":4666,"ek ":37209,"ein":6239,"el ":16434,"ejs":51321,"ejo":11481,"ejm":7156,"eje":7376,"eka":10766,"em ":73512,"gio":21748,"gie":12191,"gic":6767,"gii":7756,"gia":4982,"bą ":6293,"gi ":9182,"gen":12957,"gel":5153,"gar":6319,"gat":9377,"gaj":4881,"gal":8113,"gan":14815,"ga ":14947,"fun":4925,"fra":5853,"for":16996,"fic":8573,"fil":7378,"fik":6148,"ać ":7866,"czą":12269,"da ":43439,"czł":5966,"de ":15481,"dby":5063,"czę":22437,"dal":5635,"daj":6034,"dar":8045,"dan":24834,"daw":15145,"dcz":6407,"ctw":8655,"cy ":54622,"cus":4654,"cym":5360,"cyj":21911,"cyc":12366,"cz ":10143,"czy":47263,"czk":8277,"czn":94844,"czo":18714,"cza":38835,"cze":61835,"cki":56501,"chó":9144,"co ":6133,"cni":7419,"cią":23459,"cne":4800,"cny":4546,"cow":30428,"cję":4623,"cez":4969,"ch ":250098,"cer":7119,"ces":6923,"cen":13171,"cej":5254,"cel":7686,"ceg":6128,"ci ":63750,"cha":21398,"cia":18501,"cie":119729,"che":16574,"chi":12341,"cho":63783,"chn":12122,"chr":6017,"ciw":5743,"cja":35582,"ciu":5042,"cin":7038,"cio":9246,"cka":9993,"cji":61836,"ed ":7038,"ebi":5142,"ec ":12319,"dzą":8566,"ega":9608,"edl":5574,"edn":51333,"ede":12015,"eda":5582,"edz":17029,"edy":9486,"eds":6095,"ecj":5624,"eck":33379,"ech":31282,"eci":73659,"ece":7263,"ecz":26497,"ect":5183,"ecn":7076,"dyn":9974,"dys":5848,"dyc":5708,"dy ":20176,"dzy":15488,"dzt":56173,"dzo":7609,"dzk":13570,"dzi":114188,"dze":17343,"dza":11866,"dor":6162,"dom":7183,"dol":8096,"dok":4665,"doz":5829,"dow":41632,"dos":5363,"dmi":12552,"dna":7598,"dne":6153,"dni":85396,"dno":17791,"dny":5925,"dob":7583,"dst":8461,"duj":19180,"duk":7030,"dra":4842,"drz":4777,"du ":16409,"dro":12893,"dru":8195,"dia":9387,"der":13407,"del":4853,"dek":8141,"den":21584,"dem":8020,"dep":15043,"dle":22301,"dla":14281,"dko":7298,"do ":56353,"dio":7032,"die":6315,"rga":13673,"rgi":6819,"ret":5375,"res":16392,"rez":11929,"rg ":8389,"rdz":8372,"rea":6346,"rec":6681,"red":24223,"rej":11845,"reg":27968,"rem":8867,"ren":18517,"rek":5432,"rep":5124,"rcz":7138,"re ":17858,"rci":7329,"rch":9881,"rce":4547,"rca":6614,"raw":21807,"raz":29604,"rd ":6953,"ras":7862,"rat":14245,"raj":22570,"rai":5724,"ran":52894,"ram":15553,"ral":18585,"rak":14728,"rab":9860,"raf":17050,"rad":12311,"rac":33149,"rpn":4743,"ros":15610,"rot":7718,"rom":11621,"ron":23271,"rop":15650,"roz":22873,"row":46481,"rob":7374,"rod":59324,"roc":14309,"roj":6342,"roi":15518,"rol":7056,"rok":41872,"rog":11103,"rno":5279,"rny":6200,"rna":10748,"rne":9659,"rni":18640,"ro ":7330,"rma":13967,"reś":7847,"rmi":12636,"rla":5043,"rki":6506,"rka":6181,"riu":5278,"rii":14143,"rin":4886,"ria":19003,"kła":22407,"ric":4906,"rie":5586,"rwo":5268,"rws":10740,"nów":11177,"rz ":14438,"ryb":4752,"ryc":13818,"rug":6274,"rud":6357,"ruc":5354,"rup":16289,"run":6993,"rum":5890,"ruk":5290,"rus":7391,"rwa":7707,"rwc":4554,"ry ":30832,"rsk":40769,"rsz":11669,"rta":19705,"rst":9077,"rto":10205,"rte":6255,"rti":6031,"rtu":4870,"rty":7882,"rt ":8136,"ru ":8305,"rzę":9880,"sad":8969,"sam":12218,"san":8261,"sar":4951,"sa ":26422,"rzą":15714,"rze":166268,"rza":14480,"rzc":6541,"ryw":7519,"rys":7800,"ryt":14172,"ryk":17536,"rym":6312,"ryn":5422,"rzy":76075,"rzo":14189,"ną ":19254,"si ":6951,"sie":37561,"sia":7216,"sk ":6051,"sin":6607,"se 
":4707,"sce":55140,"sch":21250,"sco":24974,"ser":11407,"sen":9224,"spo":28928,"spr":6284,"spe":6485,"spi":8532,"sow":15205,"son":7183,"sok":5885,"się":66198,"sob":7483,"su ":8612,"skł":11877,"st ":49786,"ski":241685,"sko":33991,"skr":5762,"sku":9283,"ska":44791,"sią":6562,"sz ":5749,"syn":5333,"sys":6083,"sza":29086,"stę":16122,"sze":30322,"szc":22167,"szp":5112,"szo":8648,"szt":14601,"szk":19152,"szy":32568,"ste":40839,"sta":110434,"stn":10874,"sto":55229,"sti":7248,"stk":11925,"stu":7827,"spó":13799,"stw":35135,"str":55283,"sty":30683,"sy ":5848,"tak":16147,"tal":13415,"tac":30341,"tad":4771,"taw":14561,"tat":11340,"tar":27950,"tan":40782,"tam":18213,"te ":12262,"ta ":47407,"jęz":8303,"pa ":7030,"ową":6832,"ję ":5820,"pca":4745,"par":31402,"pas":19500,"pac":4930,"pad":8610,"pal":4646,"pan":7407,"ką ":14644,"pań":7582,"pec":4830,"per":13425,"pej":4544,"pla":21906,"ple":4628,"ińs":18739,"pie":29091,"iłk":7468,"pio":5728,"pir":4743,"pis":14610,"poz":13437,"pow":94565,"por":15922,"pop":9036,"pot":7235,"pos":20600,"poj":4594,"pom":22714,"pon":6930,"pok":5372,"pol":56274,"poc":15903,"pod":44935,"pił":8661,"po ":9878,"pni":8916,"pub":6917,"pra":25179,"prz":116916,"pu ":5270,"pre":13822,"pro":44748,"put":4642,"puj":8407,"poł":82723,"py ":16720,"ląs":7826,"mów":5303,"ra ":38610,"ngi":4912,"ni ":23063,"nge":9484,"ią ":10996,"neg":51936,"nej":47963,"nek":13069,"nem":5441,"ner":9906,"net":21971,"nes":4536,"ndy":5147,"ng ":20546,"eży":7778,"nci":16716,"ncj":34860,"nce":9629,"ne ":77120,"eż ":13347,"ndr":6665,"ndo":5824,"ndi":8916,"nde":9089,"nda":6965,"nak":5404,"nal":20391,"nam":4778,"nan":15192,"nap":5117,"nar":14632,"nac":26022,"nad":18221,"nag":7300,"naj":37748,"nd ":10560,"nau":6824,"nat":12135,"nas":8986,"naz":14802,"naw":6217,"na ":315412,"moż":6857,"nyc":66044,"ny ":143123,"noś":21837,"nty":9827,"nto":7930,"ntu":6194,"ntr":10458,"nta":17677,"nte":16419,"nst":10393,"nu ":12191,"nt ":17005,"ns ":4556,"noc":23507,"nom":6299,"nos":13706,"nor":4839,"now":33236,"nne":10038,"nna":4663,"nni":6637,"nię":6603,"nny":11159,"głó":21438,"no ":23520,"nki":7495,"nkc":4543,"nka":8221,"nku":6543,"nko":6939,"eżą":8181,"iąg":20115,"iąz":16848,"nii":15486,"nie":303754,"nic":58316,"nia":115515,"niz":10946,"niu":18651,"nis":21291,"nio":33347,"gło":21325,"nim":8023,"nin":5902,"nik":44866,"ogr":14773,"ogi":15434,"ogo":4557,"oga":7462,"ją ":20069,"oid":31094,"ok ":8415,"ojs":6215,"ojn":8775,"oje":68149,"jąc":92228,"oce":7209,"och":19137,"oci":8093,"ock":7255,"ocn":14524,"obs":6969,"oby":5467,"ode":8229,"odk":11049,"odl":22987,"odo":21990,"odp":5300,"odn":35244,"ods":5690,"odr":4605,"ocz":25360,"of ":6324,"odc":6738,"odb":6509,"oda":13640,"odz":61637,"ody":9005,"odu":13447,"ofi":6001,"ięk":9725,"ięc":10576,"ięd":13216,"ięt":12440,"od ":47389,"obo":8696,"obr":12863,"obl":4516,"obn":4740,"obi":10703,"obe":11907,"nym":35914,"ię ":65182,"owy":75588,"osó":4510,"ków":24138,"ows":40460,"own":11322,"owo":53873,"owi":138750,"ozy":6621,"ozw":5341,"ozn":9119,"osł":6618,"ozb":6448,"oty":9500,"ote":7310,"otr":5518,"oto":11888,"otn":8681,"osz":15296,"ost":51897,"ota":6378,"osi":14132,"osk":7450,"ose":9237,"osp":5848,"oso":13740,"oró":5598,"owc":6201,"owa":124010,"owe":69144,"opo":27225,"opi":9321,"ope":10070,"opa":11132,"os ":8863,"opu":5185,"opr":8595,"or ":15758,"ork":4605,"orm":16889,"orn":6157,"oro":17951,"orc":4621,"ord":8495,"ore":9264,"org":11803,"ori":12364,"osa":8380,"ort":15205,"ors":22187,"oru":7711,"orz":34348,"ory":13310,"ora":31522,"ola":10017,"on 
":32944,"oli":42609,"ole":26115,"ols":92465,"oln":14866,"olo":22365,"olu":5605,"oka":12522,"om ":7335,"oki":5566,"okr":39604,"oko":19282,"ogó":4810,"oku":40113,"ona":86386,"ond":4877,"one":29526,"oni":52759,"onk":6206,"onn":4626,"ono":16680,"ons":8848,"ont":10876,"onu":6969,"ony":38958,"oma":14351,"ome":9149,"omi":22956,"omp":9870,"omo":23578,"omu":5636,"la ":29248,"le ":27634,"lac":9698,"lak":7036,"lan":38327,"lam":5773,"lar":10967,"lat":37023,"las":14320,"ld ":4729,"lbu":8442,"koł":12944,"kul":5213,"kuj":7417,"kwi":7243,"krą":16281,"koś":9426,"kró":6396,"kte":5141,"ksz":11351,"ksi":5778,"kty":11231,"ktr":5362,"ktu":7021,"kto":8566,"krę":7083,"gól":4564,"gór":7545,"któ":37445,"lok":4711,"lon":16670,"log":15244,"lot":5895,"low":11925,"lno":13692,"lni":16273,"leż":15711,"lne":15681,"lny":18798,"lna":11882,"lud":5047,"lub":38824,"lsk":52175,"lu ":15294,"lsc":50391,"li ":18475,"lew":7878,"les":10216,"let":6569,"ler":6678,"lem":11032,"len":14544,"lek":14961,"lej":10577,"leg":25572,"lec":7985,"lla":5033,"lle":8878,"lli":5642,"lko":12509,"eńs":10006,"lka":5102,"lki":9010,"ll ":4740,"lit":23937,"lis":17192,"lip":7716,"lin":28801,"lim":6337,"liz":9545,"liw":7216,"lic":38891,"lia":7515,"eń ":8305,"lik":8729,"lii":5868,"ma ":13583,"mac":9833,"eś ":42823,"maj":8752,"mar":13792,"mas":6086,"mal":6141,"man":12874,"maz":13085,"mat":15538,"me ":5269,"eśc":9065,"eśn":12795,"eśl":8074,"mcz":8811,"met":12557,"mer":18450,"men":33777,"lut":5364,"hód":6854,"mpi":5561,"miń":6040,"moc":8677,"mod":4841,"mon":7419,"mow":13205,"mor":22691,"mu ":12663,"msk":7954,"my ":8132,"muj":8914,"mun":5245,"muz":6245,"mał":6431,"mi ":36519,"min":109517,"mis":7482,"mit":5081,"mic":8508,"mia":35513,"mie":85656,"mię":17160,"mni":5567,"wą ":9465,"wód":59175,"wór":6020,"źdz":4969,"zta":5990,"ztw":56537,"zu ":8060,"zuj":5309,"zur":6895,"zy ":52536,"zwa":16748,"zwi":19757,"zwy":6194,"zyw":7464,"zys":27664,"zym":15419,"zyn":18487,"zyk":17972,"zyl":5653,"zyj":5225,"zyc":26077,"zyd":4698,"zył":4808,"zi ":13438,"zał":9317,"zgr":4705,"zec":34126,"zed":16305,"zeg":13280,"zej":12005,"zeb":5093,"zdo":9053,"zes":26033,"zez":38687,"zew":14699,"zen":40191,"zem":9711,"zel":6475,"zek":13631,"zer":16763,"ze ":43444,"zch":5824,"zbi":11943,"zcz":25210,"zac":28997,"zaw":17852,"zaj":18917,"zam":9131,"zan":18287,"zak":6986,"zal":7485,"zar":15092,"zap":6541,"zas":21035,"zny":34365,"zos":17618,"zon":31868,"zow":26765,"zpo":4827,"ześ":12448,"zmi":5909,"zna":56249,"zno":5266,"zne":31393,"zni":17047,"zm ":5171,"zka":12896,"zko":15029,"zki":16987,"zeń":5484,"zib":8100,"zia":21275,"sła":11498,"zie":58529,"zin":14757,"sło":8768,"zio":8015,"słu":9392,"zja":5084,"zji":8382,"yty":7484,"ytu":9487,"yto":6663,"źni":4961,"yte":4862,"yta":5297,"ysz":7085,"yst":44231,"yso":5982,"ysp":7012,"ysk":11600,"tęp":15379,"za ":37664,"yzn":5243,"ywa":26508,"ywi":7128,"ywn":5067,"yce":6751,"ych":133947,"yci":10363,"ycz":50193,"yda":15003,"żaj":15742,"yck":5423,"ycj":6652,"że ":16017,"yjs":11565,"yka":23821,"ym ":69059,"yki":7812,"ykl":4610,"yko":9261,"yn ":8183,"yli":7761,"ymi":14584,"yms":4916,"yna":17425,"yni":13954,"yno":4822,"żen":5078,"yk ":13655,"yjn":27661,"tów":16736,"tór":35547,"są ":7387,"ożo":65228,"oży":4895,"oże":6845,"wy ":42246,"wsp":8605,"wsz":18473,"wst":10921,"wsc":10075,"wsk":48535,"rąż":16283,"wys":19278,"wym":21711,"wyk":10398,"wyn":5788,"wyd":14033,"wyb":6431,"wyc":28770,"sów":5741,"woś":26886,"wo 
":32774,"wna":7537,"wne":21421,"wni":31749,"wią":20802,"wno":4837,"wka":4779,"wrz":6674,"wod":23078,"wię":20120,"wny":9160,"wow":8334,"wor":17209,"wol":5953,"woj":77026,"wcz":7095,"ośl":6671,"wch":7307,"we ":41958,"ośc":57589,"wca":6919,"wer":9502,"wej":33041,"weg":17678,"wed":4775,"wał":10230,"ość":33539,"wi ":5607,"pły":9060,"wis":10022,"wiz":5342,"wie":266248,"wid":6688,"wic":20177,"win":12633,"wia":38637,"wa ":75032,"wan":92067,"wal":11772,"waj":5843,"wat":12396,"war":32065,"wac":6719,"róż":7313,"wad":9110,"rów":28327,"ród":6783,"ról":5670,"oła":8333,"oło":72193,"ołe":8524,"ołu":20038,"ońc":21153,"ver":4892,"uzy":7699,"usk":8960,"usz":14475,"ust":14884,"ute":10402,"utw":6376,"uto":11126,"us ":13666,"ura":9619,"ure":5261,"urg":10533,"uro":16796,"urs":7537,"ury":7673,"urz":9377,"ują":40031,"upa":5160,"ur ":29893,"upy":9651,"umi":4996,"ume":5036,"unk":16732,"uni":8824,"und":6279,"une":6969,"uko":7594,"um ":20351,"uka":4715,"ult":6004,"uli":7259,"ula":8673,"uje":18646,"uja":4953,"ugi":6634,"ucz":7047,"udn":21514,"uch":15770,"udo":10709,"udz":6914,"ub ":30727,"pół":22236,"ubl":8802,"ube":7781,"tyw":7322,"tyj":4614,"tyk":20395,"tyl":6759,"tym":7939,"tyn":8462,"typ":6891,"tys":5969,"tyt":6677,"twó":5271,"ty ":25638,"twa":27679,"tur":19565,"tun":9184,"tyc":48848,"two":26167,"pól":4883,"twi":61091,"tre":5469,"tra":36373,"tri":6420,"tru":11588,"tro":23044,"trz":32044,"tu ":25361,"try":9123,"to ":54716,"tni":28308,"toc":7538,"toi":15874,"tos":10191,"tow":38410,"tom":6513,"ton":10929,"tok":8683,"tol":14627,"tor":34275,"top":9511,"tin":4571,"tio":8819,"tki":10465,"tko":7816,"tka":9207,"tle":5081,"tem":19837,"ten":10208,"tej":7771,"tek":9091,"tel":10074,"kże":7372,"teg":14724,"tec":7568,"ter":68917,"the":6053,"tał":27215,"żąc":10258,"zło":10911,"ył ":11114,"ży ":9032,"zęś":17553,"życ":6860,"yła":8647,"żyn":4577,"żyw":6989,"yły":6907,"yńs":6994,"żni":7269,"żon":68291,"zęd":7177,"zęs":4784,"ższ":5421,"ząd":13275,"ząc":15081,"zą ":5842,"ząt":4518,"uży":12473,"zów":5144,"wła":9785,"óżn":6877,"ół ":8476,"ółn":14744,"ów ":112668,"óra":6553,"óre":15215,"órn":8319,"óry":15904,"ór ":5776,"ówk":5666,"ówn":37744,"ódz":62547,"ód ":10612,"ób ":4595,"óln":9187,"ągu":15768,"ądz":6409,"ącz":8602,"ący":47223,"ące":25280,"ąca":47593,"ązk":9275,"ąza":4528,"ąsk":8772,"ątk":5004},"n_words":[44927968,50956492,36530760],"name":"pl"}
\ No newline at end of file
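
On the consuming side: detection with profiles like these is conventionally described as a naive-Bayes-style comparison of a text's character n-grams against each language's frequencies. The toy scorer below illustrates that idea under this assumption; the inline probabilities are made-up placeholders standing in for load_profile() output, and langdetect's real implementation samples and smooths differently.

import math

def ngrams(text, max_n=3):
    """Yield every 1..max_n character substring of the text."""
    for n in range(1, max_n + 1):
        for i in range(len(text) - n + 1):
            yield text[i:i + n]

def best_language(text, profiles, floor=1e-7):
    """Return the profile name with the highest log-likelihood for the text."""
    def loglik(probs):
        # `floor` is additive smoothing so unseen n-grams don't zero a language.
        return sum(math.log(probs.get(g, 0.0) + floor) for g in ngrams(text))
    return max(profiles, key=lambda name: loglik(profiles[name]))

# Placeholder frequencies for two languages (illustrative values only).
toy = {
    "pl": {"ó": 0.004, "wó": 0.003, "ódz": 0.002},
    "pt": {"ã": 0.005, "çã": 0.002, "de ": 0.012},
}
print(best_language("województwo", toy))  # -> "pl"
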
+++ /dev/null
-{"freq":{"D":84998,"E":149956,"F":100967,"G":92106,"A":261572,"B":132089,"C":229572,"L":100908,"M":162595,"N":83569,"O":115743,"H":53006,"I":86169,"J":66643,"K":27233,"U":54108,"T":95611,"W":30243,"V":63770,"Q":7457,"P":198289,"S":200312,"R":102796,"Y":7968,"X":13707,"Z":9504,"f":420930,"g":646820,"d":3123972,"e":5421795,"b":522239,"c":1736812,"a":5855900,"n":2824771,"o":4680584,"l":1547279,"m":2105049,"j":94292,"k":164391,"h":490405,"i":3472398,"w":44026,"v":454134,"u":1899338,"t":2255026,"s":2945228,"r":3035316,"q":191465,"p":1075380,"z":175309,"y":94919,"x":113550,"²":81071,"É":13693,"Á":9519,"í":209507,"ê":67610,"é":451892,"è":5136,"ç":214936,"ã":317848,"â":37738,"á":187832,"à":26465,"ü":5538,"ú":40758,"ô":23605,"õ":33734,"ó":129413," l":122890," m":228220," n":368310," o":307851," h":135081," i":114948," j":44060," k":90787," d":1636882," e":668130," f":265709," g":86172," a":553690," b":90195," c":638137," u":471782," t":166023," v":73530," q":119308," p":552163," s":358148," r":175170," J":65167," K":25787," H":50638," I":64806," N":77349," O":109788," L":96197," M":155484," B":122895," C":211790," A":232527," F":95595," G":81500," D":78496," E":140343," Z":9058," Y":7429," X":9721," S":182999," R":96218," Q":6965," P":187689," W":28455," V":54881," U":50226," T":85637," á":66191," à":25954," é":287636," ú":5989," Á":9484," É":13626,"A ":78627,"Da":9542,"Cu":6462,"Cl":8427,"Co":62457,"Cr":10302,"Ce":12150,"Ch":23975,"Ci":8714,"Ed":5016,"Do":10375,"De":23490,"Di":17511,"GC":5835,"Fe":14358,"Fa":9224,"Eu":7410,"Es":76180,"En":6707,"Em":8917,"El":11312,"Ge":12507,"Ga":11449,"I ":12898,"Fu":6893,"Fr":18796,"Fo":24543,"Fl":6042,"Fi":9055,"C ":14112,"Au":8700,"Ar":20685,"At":7248,"As":11784,"D ":5749,"Ba":33758,"Ag":5894,"Ab":6380,"Ac":5079,"Am":13589,"An":22486,"Al":38766,"Bu":7329,"Br":31392,"Ca":66526,"E ":5464,"Bi":7032,"Be":17271,"Bo":16310,"Le":17360,"Li":18877,"La":20252,"Lu":9478,"Lo":21075,"Me":18166,"NG":5897,"Mi":22675,"O ":59245,"Ma":62167,"Mu":14513,"Mo":23160,"Ni":6791,"Ne":10091,"Na":16341,"P ":5903,"No":30608,"Ol":8785,"Gi":5566,"Gr":19098,"Go":9769,"Gu":13341,"Ha":12782,"He":10653,"II":8816,"Hi":6333,"Ho":11804,"In":21460,"Ja":20297,"Je":5208,"Jo":23876,"Ju":10807,"Ka":6332,"Um":5798,"Un":17690,"Tu":6480,"Tr":12997,"To":12084,"Th":10714,"Ti":5247,"Te":19234,"Ta":10759,"UA":15545,"V ":6036,"St":11159,"Su":24311,"Wi":7399,"Sã":16301,"Wa":7041,"Vi":19753,"Va":11369,"Ve":14368,"Pr":23453,"S ":9133,"Pe":21497,"Pa":48708,"Pl":5522,"Po":57527,"Pi":15167,"Os":8721,"Or":9792,"Se":27899,"Sc":6387,"Si":12578,"Sh":5677,"So":14955,"Sa":39239,"Re":33199,"Ri":19155,"Ro":19693,"Qu":6140,"T ":5229,"Ra":8949,"b ":50847,"a ":2181662,"i ":168232,"ge":70306,"ga":87148,"bé":16913,"fl":8986,"fi":74478,"fr":59032,"fu":27785,"fo":117850,"dá":5256,"he":73854,"ha":210111,"gn":21302,"cê":5480,"gl":18191,"cç":8441,"gi":116568,"gh":7414,"gu":120239,"gr":61974,"cí":25911,"go":74169,"du":48733,"g ":26553,"ea":109055,"eb":27192,"ec":114155,"ed":73900,"de":1384296,"di":203085,"dm":36748,"do":671969,"ds":6146,"dr":29810,"ew":5596,"ex":56960,"eu":52678,"ev":46416,"ey":9665,"ez":25288,"fa":47754,"h ":22234,"fe":50703,"eg":161615,"ef":22289,"ee":12045,"el":215678,"aç":106998,"ej":12312,"ei":173784,"ep":64753,"eo":27810,"en":615409,"em":301673,"et":118849,"es":604827,"er":508227,"eq":14954,"aí":13564,"ca":322979,"e ":2017914,"bs":7191,"br":100433,"bu":28485,"bo":46844,"bl":22439,"bi":100249,"be":49470,"da":652111,"f 
":9921,"cu":52185,"ct":31468,"cr":50241,"co":536282,"ck":15123,"cl":33604,"ci":328384,"ch":65018,"ce":215952,"c ":14270,"az":18902,"ay":13866,"ba":74690,"d ":43293,"at":195898,"as":441529,"ar":395911,"aq":7417,"av":49201,"au":56223,"ak":8531,"al":413966,"ai":134662,"aj":6551,"ao":31258,"ap":60289,"am":235425,"an":557322,"ac":100712,"ad":561485,"ab":137795,"ag":58484,"ah":7769,"ae":25110,"af":14102,"nu":24993,"nt":522402,"ns":188107,"no":344725,"nn":15103,"nz":5976,"ny":7562,"nv":17155,"oe":16285,"of":27077,"oc":103353,"od":80574,"oa":21115,"ob":40172,"om":360608,"on":341990,"ol":141927,"oi":109737,"oj":7341,"og":52000,"oh":5263,"ot":49005,"m²":81019,"os":507875,"ov":77341,"ou":126561,"op":64022,"oo":11470,"or":489375,"r ":303972,"ox":5797,"ow":11301,"oz":5717,"pe":192004,"lá":12510,"pa":207724,"pl":31072,"lé":9270,"lê":6741,"po":262183,"ph":9247,"pi":69522,"lo":175844,"lm":34639,"ll":48668,"ls":10434,"lp":6052,"lv":20781,"lu":42856,"lt":38629,"ly":6623,"o ":2000634,"ma":495019,"mb":74173,"iã":65284,"me":284803,"iá":6420,"iç":26049,"mi":135455,"mm":5698,"mp":92721,"mo":132550,"mu":111484,"p ":8620,"na":437221,"nc":193122,"nd":299787,"ne":119195,"nf":17343,"ng":76996,"nh":95890,"ni":224974,"nj":6755,"nk":6105,"ju":23595,"fí":5753,"jo":21637,"ki":12050,"ke":10814,"ka":8676,"m ":610401,"ko":5815,"gé":5229,"km":88160,"gê":12280,"li":257137,"lh":65430,"le":173629,"ld":17597,"lg":14230,"lf":5772,"la":234272,"lc":7172,"lb":14242,"n ":112737,"ht":6380,"hu":13446,"hi":44146,"dê":5692,"ho":87126,"dé":6382,"id":307392,"ic":326922,"ib":24713,"ia":390572,"ig":76460,"if":27051,"ie":76108,"k ":20205,"ir":178791,"is":322059,"it":260738,"iu":12350,"iv":114919,"ix":17582,"ik":5904,"eç":8830,"il":161901,"im":114838,"in":395415,"io":223313,"ip":55696,"je":12713,"iz":80563,"l ":281164,"ja":26393,"xi":25832,"tê":5442,"xo":8355,"té":18873,"xp":7291,"tí":13967,"tó":26791,"xt":10508,"z ":28371,"xc":16696,"xa":14045,"tã":10541,"tâ":11963,"tá":22795,"xe":8953,"sã":39079,"wi":7047,"sé":18935,"sí":5599,"ró":40309,"y ":44219,"wa":10938,"sá":5639,"ré":9925,"rç":11146,"vi":94338,"rã":8523,"vr":9835,"rí":29707,"rê":8332,"vo":58913,"uz":14799,"ux":5541,"uv":5858,"ve":130840,"rá":18536,"va":112071,"x ":15240,"ui":98859,"uj":5584,"ul":126725,"ue":171782,"ug":35554,"ur":125836,"us":87304,"ut":96190,"um":493013,"un":209874,"up":26206,"ty":5162,"tu":132302,"tt":15069,"pó":5919,"ub":41087,"ua":116112,"ud":26863,"uc":23327,"w ":7530,"pú":7028,"to":326748,"pé":22989,"tl":9288,"ts":8324,"tr":260066,"te":557844,"ti":259190,"th":25055,"ta":456660,"su":99840,"ss":133040,"st":452789,"sl":7670,"sk":8628,"sm":23244,"sp":74972,"so":134893,"sq":6495,"sd":6659,"sc":72024,"se":292294,"sh":13868,"si":227032,"u ":130375,"sa":133204,"sb":7374,"rr":74864,"rs":43158,"rt":193480,"ru":48286,"rv":18129,"ry":9625,"ní":7223,"rq":17132,"rp":10793,"ro":344915,"rn":53960,"rm":73617,"né":7825,"rl":15891,"nç":36309,"rk":7248,"ri":409856,"nã":10618,"rg":47427,"rf":6634,"ná":10363,"re":480208,"rd":53701,"rc":51264,"rb":27665,"ra":596837,"t ":61702,"mú":6929,"qu":189373,"mí":22028,"mé":15139,"má":8216,"mã":9155,"s ":1150995,"pt":9571,"pu":45299,"ló":14756,"lí":26351,"pr":175784,"ps":6678,"zi":14725,"ze":25298,"vá":5473,"za":80538,"zo":10261,"ví":24973,"ya":6552,"ys":5731,"yr":7114,"uí":16711,"uê":10113,"uç":8683,"² ":81055,"É ":12059,"ã ":5975,"ál":15193,"ác":6980,"ád":5021,"áv":6927,"áx":6998,"ár":84913,"át":10444,"ás":9114,"âm":6223,"ân":30468,"ão":308442,"à ":22252,"á 
":25281,"ós":7667,"ôm":5559,"ôn":14217,"ói":30866,"óg":6061,"ód":6423,"ór":23808,"óp":8429,"ón":16555,"ól":8310,"ív":5979,"ín":37401,"ím":8935,"íp":23888,"ío":18396,"ít":18007,"ís":21982,"íf":5629,"íl":22187,"íc":11953,"íd":15269,"çõ":21574,"çã":131394,"ên":31853,"êm":5074,"ês":26627,"él":7493,"ém":24476,"én":6312,"és":8228,"ét":9564,"ér":30738,"éd":7542,"éc":31275,"ço":21084,"ça":38992,"é ":308148,"ún":5650,"ús":10148,"úb":7903,"õe":33653," Ga":11344," Ge":12440," Fo":24420," Fu":6872," Fr":18748," Fi":9007," Fl":5988," Ha":12738," He":10614," Go":9722," Gr":18935," Gu":13279," Gi":5513," Ho":11760," Hi":6304," Je":5193," Ja":20267," In":21384," Ka":6274," Jo":23796," Ju":10788," La":20143," Le":17264," Li":18789," Ma":61882," O ":56830," Mi":22595," Me":18100," NG":5699," Lo":21011," Lu":9458," Ne":9995," Na":16245," Ni":6760," Mo":23061," Mu":14430," A ":56821," Am":13551," An":22425," Al":38669," Ag":5878," Ac":5031," Ab":6362," Ba":33333," Au":8681," At":7218," As":11678," Ar":20591," Be":17207," Bi":6949," Bo":16224," Br":31284," Bu":7302," Ca":66026," Ce":12123," Ci":8645," Ch":23864," Cl":8347," Cr":10239," Co":62169," Cu":6320," Da":9487," Di":17388," De":23395," Do":10149," El":11281," Es":76102," En":6655," Em":8884," Eu":7403," Fe":14330," Fa":9149," Wi":7332," Sã":16295," Wa":6979," a ":159392," Os":8676," Or":9772," Po":57372," Pl":5480," Pi":15147," Pe":21311," Pa":48469," No":30533," Ol":8779," Ra":8887," Qu":6094," Ro":19582," Re":33096," Ri":19127," Pr":23377," Su":24284," St":10640," Ta":10684," UA":14219," Th":10656," Ti":5222," Te":19124," Tr":12898," To":12004," Sa":39159," Sh":5584," Si":12511," Sc":6287," Se":27797," So":14864," Va":11341," Ve":14300," Vi":19683," Tu":6372," Um":5787," Un":17669," ja":9410," im":11343," in":69292," il":7613," it":11439," jo":14428," ju":16927," ha":94775," gr":27016," go":7301," gu":5619," hi":11250," ho":15720," ne":9802," na":144146," mu":41919," mo":24736," on":8262," oc":7515," of":9658," ob":9893," nu":6152," no":190063," le":13833," li":23905," la":21762," gê":9642," km":87834," me":40602," mi":24825," o ":110021," ma":77227," lu":6557," lo":47174," ag":9251," ab":11690," ac":15052," ad":42206," am":26285," an":52016," ao":28506," ap":22430," al":29345," av":5103," au":17149," ar":22234," at":33443," as":63845," ba":35433," bi":6595," be":6338," bo":7718," br":28760," ca":66098," e ":241281," er":11510," et":6896," es":129714," en":44317," em":145368," el":17511," fe":16956," fa":35767," ex":43118," fu":24478," fr":49866," fo":99597," fi":29419," ge":13813," ga":14076," cl":11956," co":383123," cr":20621," ce":57678," ch":14448," ci":62716," da":258975," cu":12445," do":238864," de":1021034," di":92269," ed":7807," du":15330," sa":10534," se":193144," si":34152," so":24812," qu":119214," mú":6854," ra":8844," re":136757," nã":9894," ri":7562," ro":14358," pu":7134," pr":135007," lí":6039," os":51421," ou":60851," op":5907," or":37748," pe":115763," pa":93579," pl":9433," po":171127," pi":7518," sã":13196," sé":13507," va":8521," ve":22883," vo":11121," vi":22819," us":9297," ut":7175," um":444668," un":7084," ta":23369," su":56894," tr":33259," to":19262," th":6383," ti":9760," te":60106," É ":12023," à ":21932," ár":53832," ál":7915," é ":284550,"GC ":5727,"Est":57555,"Esp":12609,"Eur":5804,"Ele":5431,"Em ":5626,"Ger":5645,"Fra":12238,"Foi":10640,"For":9044,"II 
":6274,"Gra":10045,"Int":5948,"Amé":5088,"Bai":6138,"Bar":6969,"Ale":12779,"Alt":5502,"Ant":8344,"Cal":7739,"Cam":11986,"Cas":10156,"Car":12659,"Cat":5648,"Can":7596,"Bra":20750,"Den":4991,"Chi":5974,"Cen":6997,"Cha":10337,"Cor":7280,"Com":13375,"Col":5164,"Con":24999,"Dis":5047,"Nov":9350,"Nor":13657,"Os ":6443,"Per":7552,"Par":15095,"Pau":10573,"Pal":5074,"Pro":8377,"Pol":9021,"Pos":21873,"Por":13937,"Jan":9313,"Jos":5715,"Jog":5307,"Lan":5276,"NGC":5668,"Man":7960,"Mar":25108,"Mon":7943,"Min":6224,"Mun":8484,"São":16298,"Sul":8673,"UA ":15445,"Sai":5389,"San":15530,"Rio":11763,"Val":5795,"Vil":5627,"Ver":7558,"Uni":16626,"Ter":5697,"The":7418,"Tra":5593,"bit":68890,"bil":5849,"bo ":5900,"bli":15859,"bol":16054,"bor":6814,"be ":8829,"ban":15903,"bal":9064,"bai":9744,"bas":9817,"bar":9261,"ber":18628,"bel":6221,"ca ":99422,"car":23995,"cas":22552,"cat":6662,"can":45210,"cap":11240,"cad":24057,"cam":12174,"cal":52109,"ce ":17573,"bri":17515,"bro":31772,"bra":35217,"bre":13206,"bur":6170,"bum":7625,"am ":33009,"aix":9798,"al ":180975,"ain":14382,"aio":19228,"air":8502,"ais":60324,"aia":5333,"ago":14405,"anu":5896,"ano":74765,"ant":129336,"ans":13455,"ane":21736,"ang":13849,"anh":31372,"ani":21090,"ana":43001,"anc":58051,"and":76029,"amo":8042,"amp":20451,"ami":6934,"ame":89963,"amb":25048,"ama":26496,"ao ":25066,"alt":10265,"alo":6981,"alm":23414,"all":8441,"alg":7598,"alh":11152,"ali":86609,"ald":5965,"ale":20569,"ala":17691,"an ":23333,"aba":10377,"abe":9561,"abi":55871,"abo":5404,"abr":10253,"ae ":16585,"aca":7889,"ab ":40485,"ai ":5437,"aga":6957,"age":17901,"ado":217915,"adr":8357,"adm":35408,"adi":9559,"ade":147619,"adu":9382,"aco":11338,"aci":33260,"ach":9385,"ace":11134,"ada":120249,"act":10015,"até":8607,"ató":5142,"ba ":7163,"aqu":7206,"amí":18994,"arg":7520,"are":23525,"ard":21302,"arc":12276,"ara":72529,"aro":8382,"arn":4993,"arm":6518,"arl":7205,"anç":24866,"ari":33288,"arq":9519,"arr":16718,"art":76275,"asa":6049,"asi":39800,"asc":20114,"ase":9902,"ar ":59582,"apa":10937,"alá":8272,"ape":8629,"api":9537,"apo":6995,"apr":7853,"as ":272706,"ava":17923,"aut":16096,"arç":7034,"avi":9973,"ave":10994,"ata":23226,"ast":51032,"ass":26073,"atr":16524,"ato":21654,"ate":18241,"ati":65753,"atu":18602,"aul":12887,"aus":5992,"jet":6359,"jan":5629,"jog":9592,"ito":67486,"itu":23834,"ism":9783,"isp":8481,"iss":14904,"ist":137228,"ita":115808,"ite":14296,"iti":9264,"ivr":6273,"ivo":19750,"isã":7956,"iva":52169,"ivi":13782,"ive":20602,"ipo":7067,"is ":98587,"ion":54065,"ior":20141,"ios":21003,"ipa":29796,"ipe":6360,"ir ":14429,"irr":7079,"iro":63701,"iri":6848,"ise":5201,"isc":10067,"isa":6981,"iu ":6072,"ire":22818,"ira":48023,"ja ":10608,"ixa":7209,"itâ":5590,"iz ":5440,"iza":68411,"km ":6889,"ki ":6235,"km²":80918,"gên":12105,"jul":5237,"jun":11429,"ha ":54934,"ham":13772,"han":7220,"har":10472,"has":8814,"hab":92261,"he ":16106,"hei":5849,"hec":22728,"her":9140,"hin":8200,"his":10200,"ho ":40610,"go ":31208,"cçã":8036,"gna":9579,"giã":60677,"gos":20347,"gov":5137,"gru":8390,"gra":31116,"gre":13275,"cíp":21053,"gui":6993,"gua":14762,"gue":29368,"gun":46422,"guê":7234,"iai":5206,"iam":5743,"ial":29154,"ian":36430,"ias":36377,"iad":18351,"ibu":6231,"ibe":5599,"ia ":236755,"ien":14065,"ier":7590,"ies":5878,"ied":5977,"iaç":6604,"ife":6300,"ifi":12350,"icu":5216,"ico":76077,"ici":43382,"ich":7968,"ice":7761,"ie ":27537,"ica":143651,"ido":58535,"idi":8244,"ide":55475,"ida":173600,"il ":28740,"im 
":15779,"ige":5893,"iga":14060,"igi":13843,"igu":5678,"icí":20778,"igo":8521,"ign":14724,"imo":12687,"imp":12501,"ime":38876,"imi":7471,"inc":33960,"ind":23589,"ina":69514,"ino":22963,"int":49765,"ins":15328,"inf":7949,"ine":18603,"inh":19122,"ing":32761,"ini":53005,"inu":8866,"ila":11890,"in ":16212,"ilo":7283,"ill":18435,"ilm":5403,"ilh":18906,"ili":24989,"ile":27199,"ima":19294,"io ":111339,"hom":5231,"hos":8051,"hor":13996,"hum":7243,"fes":5911,"fer":17361,"fei":6807,"fam":20697,"ext":9301,"ez ":8287,"exp":6312,"exi":7134,"exc":16402,"eze":10386,"eta":21527,"ete":19661,"eti":12522,"esp":45070,"est":117047,"ess":36172,"eto":18111,"etr":18837,"eve":21643,"eva":5679,"evi":13265,"eus":10550,"eró":29181,"erí":19798,"ey ":6913,"er ":64588,"epa":35649,"açõ":12626,"eon":5303,"es ":232451,"epr":6051,"enç":6954,"eri":53238,"erg":9605,"ere":26129,"erc":25527,"erd":8470,"era":58345,"et ":10811,"equ":14020,"aís":9202,"esm":7907,"esi":25700,"esc":27105,"esd":5456,"ese":28703,"eu ":29870,"esa":56958,"erv":11462,"err":29563,"ert":32707,"ers":31036,"ern":27425,"erm":19525,"ero":27972,"en ":19630,"ela":57757,"ele":31916,"eli":10608,"elh":19021,"ell":12189,"elo":35477,"eo ":6077,"emb":29421,"ema":25633,"eme":11201,"emo":8721,"emi":9916,"emp":20050,"ene":11086,"enh":10752,"ena":27141,"end":70599,"enc":33249,"eno":17065,"eni":7335,"env":8603,"ens":111144,"ent":263509,"açã":84894,"ecç":6791,"ego":8230,"egi":67782,"egr":7999,"egu":58530,"eia":6927,"eis":10113,"eir":97869,"eio":5873,"ein":12110,"eja":8929,"el ":27319,"eit":17574,"em ":185457,"gin":9613,"gio":6344,"gic":7282,"gia":12807,"gen":19808,"ger":11186,"gem":16786,"ge ":8655,"gad":9856,"gas":7737,"gar":10290,"gal":15478,"gan":11614,"ga ":19137,"fut":9060,"fun":15140,"fra":39155,"fre":11644,"for":36958,"foi":67238,"bém":15726,"fic":31195,"fil":13806,"fin":10276,"fis":5455,"da ":386105,"de ":1113463,"dad":172007,"dal":5052,"dae":8716,"das":50851,"dan":7654,"dam":5121,"cul":25961,"cto":6036,"cti":5064,"cta":7890,"cur":7265,"cla":7926,"clu":9168,"cli":8238,"co ":77432,"con":114590,"col":16761,"com":260529,"cor":23539,"cos":23065,"cre":8780,"cri":27018,"cro":6255,"cea":7114,"ch ":7362,"cer":21587,"ces":47284,"ceu":5073,"cen":80958,"caç":9087,"cel":14050,"cei":7700,"cha":18824,"cia":84536,"ck ":7938,"cie":24606,"cid":84373,"che":14945,"chi":8861,"cim":6171,"cis":5750,"cin":23953,"cio":45369,"cip":32198,"ebo":13090,"ead":6607,"ean":5576,"eal":11859,"eat":4993,"ea ":57325,"efe":11937,"ei ":8171,"ega":11879,"edi":19796,"ede":22533,"eda":8333,"edo":8760,"ecl":7415,"eci":35040,"ece":14294,"ecu":5255,"ect":14984,"eco":11351,"dur":9714,"duz":6967,"dor":36541,"don":5597,"dos":101787,"diç":6127,"dmi":36030,"dua":7331,"dri":7071,"dra":5496,"dre":6857,"dro":6768,"dic":13696,"did":7827,"dia":53753,"der":26564,"des":61760,"dez":6564,"dec":9473,"def":5074,"dei":8639,"del":8957,"den":70173,"dem":10237,"dep":38994,"do ":499886,"div":11366,"din":7506,"dio":15896,"dir":15370,"dis":34873,"dit":6840,"dif":6362,"rga":9198,"ri ":5474,"rgi":6642,"rge":9033,"não":10106,"rgo":7927,"ret":18367,"res":88172,"rev":9016,"rg ":6617,"rea":66834,"ref":8795,"rec":27943,"red":8922,"rei":27335,"reg":80521,"rem":9993,"ren":24469,"raç":16220,"rel":14080,"nár":5684,"rep":8548,"rda":6649,"rdo":6927,"rdi":10382,"rde":15782,"re ":57889,"rci":7180,"rce":9738,"rca":20138,"rd ":8761,"rar":6504,"ras":65057,"rat":52626,"rav":11402,"rbi":16469,"rba":5129,"rai":12729,"rag":7162,"ran":101236,"ram":28519,"ral":30318,"rab":8082,"raf":5059,"rad":42430,"rac":11489,"rs 
":6575,"ros":32986,"rot":8757,"rom":13204,"ron":14637,"rop":15939,"rou":5772,"rov":31049,"rod":14281,"roc":11812,"rol":6958,"rof":9288,"nçã":8381,"rog":7781,"rno":11572,"rna":23454,"rne":8579,"rmo":8053,"ro ":148425,"rma":34000,"rme":10601,"rmi":14095,"nça":24469,"riz":8006,"rio":44890,"rit":46897,"ris":18600,"rig":17891,"ril":10251,"rin":40225,"rim":24339,"ria":73821,"rib":7744,"ric":60975,"rid":13442,"rie":17826,"rup":9891,"rus":5606,"rva":5441,"rvi":6787,"ry ":6483,"rsi":7243,"rso":14017,"rta":49041,"rto":14307,"rte":57600,"rti":26907,"rtu":25752,"rt ":7638,"rqu":17086,"rro":14834,"rri":9266,"rre":21970,"rra":23654,"sad":9694,"san":5546,"sas":9730,"sar":5392,"sa ":76681,"sid":59438,"sic":18066,"sia":15495,"sit":17633,"sis":13892,"sin":13902,"sio":8799,"sil":41142,"sim":8594,"sig":13420,"scr":14392,"scu":5033,"sde":5601,"se ":99012,"sca":8201,"sce":12058,"sci":8690,"sco":17645,"ser":25139,"ses":9941,"set":9939,"seu":20458,"seg":45509,"sed":9373,"sen":34160,"sem":9716,"sel":5712,"spo":14510,"spe":12903,"spi":7965,"spa":15990,"sol":6214,"son":13806,"sor":8549,"sos":34321,"soa":5579,"soc":9148,"sob":10181,"st ":5646,"squ":6464,"smo":15213,"so ":33266,"ssã":5427,"stá":10797,"stã":5171,"stó":7858,"sse":16316,"ssa":18526,"sso":30852,"ssi":23013,"ssu":30134,"ste":119936,"sta":117494,"spé":16790,"sto":24915,"sti":39339,"stu":7367,"str":101454,"sua":22858,"sub":11326,"sui":27030,"sul":9138,"sup":6250,"sur":5128,"tai":5662,"tal":56545,"tad":72109,"tat":5635,"tas":21431,"tar":22207,"tan":74838,"tam":58144,"te ":182044,"ta ":108579,"pa ":8506,"pe ":6181,"par":108733,"pas":6728,"pac":5622,"pal":27847,"pan":19668,"láx":5857,"pec":9282,"pen":12180,"per":70169,"paí":6811,"pet":6411,"pes":15522,"pel":52058,"pla":10972,"ple":7590,"plo":5321,"pic":9020,"pin":7594,"pio":22913,"pir":7271,"pit":9245,"por":117200,"pop":23863,"pos":31387,"pon":14947,"pol":21044,"pod":12218,"po ":20718,"lês":5033,"pub":5596,"lít":10135,"pri":47853,"pre":42479,"pro":71854,"put":8461,"pul":26066,"mão":6273,"mér":6337,"míl":18580,"qua":26640,"que":121041,"qui":34023,"ra ":178784,"mús":6480,"ngo":5638,"ngl":12535,"ngu":12950,"ni ":5382,"nge":10906,"nga":5764,"nho":23066,"nha":43874,"nhe":24382,"nei":17925,"naç":10350,"nen":5777,"ner":21401,"net":5963,"nes":11864,"ng ":14902,"nco":14783,"nci":80668,"ncl":5010,"nce":69648,"nch":7394,"nca":5561,"ne ":24199,"ndr":9286,"ndo":94739,"ndi":27820,"nde":77208,"nda":61455,"nal":50464,"nam":7322,"nan":6772,"nar":8444,"nac":9251,"nad":24850,"nag":7446,"nai":6712,"nd ":10811,"nat":14635,"nas":35071,"na ":240257,"ny ":5694,"nsã":7776,"nvo":7994,"nve":5937,"num":5422,"nut":6804,"nto":104039,"ntu":19077,"ntr":64145,"nti":32354,"nta":52915,"nte":218759,"nso":37106,"nst":25404,"nse":18037,"nsi":55540,"nsa":5243,"nt ":11783,"ns ":20234,"nom":29572,"not":5470,"nos":46255,"nor":21961,"nov":11058,"nne":6439,"no ":209889,"nid":27726,"nic":53412,"nia":29100,"niz":6361,"niv":8244,"nis":48078,"nio":9439,"nim":8021,"nin":5656,"ogr":11168,"ogi":9366,"ogo":16032,"oga":7825,"oi ":78403,"ois":10929,"oje":5415,"ol ":15035,"oce":8227,"oci":14692,"oco":7976,"oca":52787,"ode":20265,"odi":6628,"odo":29483,"of ":5154,"oda":5699,"oes":5612,"odu":11939,"ofi":9565,"oa ":8425,"obr":14956,"oví":22195,"ote":8672,"oto":8649,"ost":25890,"ota":9466,"osi":9200,"oss":38213,"oso":8522,"ovi":11009,"ovo":6748,"ova":12269,"ove":21270,"ous":5680,"our":9672,"out":17508,"opo":7367,"ope":8960,"opa":6108,"os ":396479,"opu":24150,"oló":8455,"olí":12107,"or 
":155288,"orm":37181,"orn":14580,"oro":7726,"orr":17850,"ord":17760,"ore":31129,"org":12371,"ori":29601,"ou ":76190,"osa":9113,"ort":72366,"m² ":81009,"orb":15491,"ora":44778,"ola":17798,"on ":35303,"oli":16249,"ole":10370,"olo":20061,"olu":6559,"olv":8428,"om ":129315,"ona":61961,"ond":42834,"onc":16183,"one":12259,"onh":24723,"ong":8421,"oni":11825,"ono":11037,"ons":37243,"ont":48875,"oma":22403,"ome":35533,"omb":7493,"omi":12319,"omp":29869,"omo":47819,"omu":62085,"la ":68801,"le ":30025,"lac":17518,"lad":10810,"lag":5216,"lan":32402,"lam":5387,"lar":20429,"lat":13280,"las":19631,"ld ":5145,"lbu":8512,"lon":10273,"lor":10947,"loc":44637,"log":14014,"los":17433,"lme":28363,"lti":6810,"lto":6807,"ltu":6502,"lub":6858,"lta":10527,"lho":31407,"lhe":5407,"lha":24850,"lgu":5605,"lev":8296,"les":20832,"let":10999,"ler":5749,"lem":18432,"len":11746,"laç":21723,"lei":31609,"leg":6064,"lec":6739,"lo ":60493,"lla":8917,"lle":13121,"lli":8289,"ll ":7390,"lit":15034,"lis":21665,"lio":5219,"lin":27495,"lim":6007,"liz":58462,"liv":7687,"lic":27450,"lid":13334,"lia":47125,"lig":7684,"ma ":312319,"mai":41100,"mad":18812,"mar":24207,"mas":19571,"mal":6120,"man":38877,"mat":11364,"mba":7627,"mbi":6786,"mbr":26443,"mbo":5222,"me ":30188,"med":7143,"met":15815,"mes":16176,"mer":33993,"mem":6344,"mel":7946,"men":135706,"mei":21546,"maç":6372,"mbé":15876,"lva":5641,"lve":5109,"lvi":6861,"mpi":7951,"mpe":15384,"mpr":13673,"mpo":26049,"mpl":10714,"içã":16989,"mod":7097,"mon":12314,"mor":10927,"mos":10632,"mpa":8900,"mui":6353,"mul":5806,"mun":87543,"ião":65139,"min":67980,"mil":11247,"mis":7225,"mit":9135,"mic":17167,"mia":5205,"mo ":71790,"vín":22165,"zem":7471,"zaç":7006,"zad":51320,"zon":5106,"uíd":7919,"uçã":7240,"za ":11487,"uês":7863,"tón":8038,"tór":13405,"tão":9310,"tân":11804,"tár":6234,"té ":9213,"xim":6106,"xia":6380,"xa ":5938,"xce":15412,"tá ":7267,"séc":5490,"sér":7647,"são":39006,"río":17761,"rói":28842,"róp":5175,"via":10682,"vil":9868,"vim":5528,"vid":18086,"vis":17869,"rço":7357,"vo ":18229,"vol":13908,"vos":6610,"rão":7259,"vez":5961,"ver":46986,"ves":7125,"vei":6022,"ven":17920,"vem":8996,"vel":15436,"ve ":14933,"val":11487,"van":5589,"var":6549,"vas":5897,"vad":9906,"va ":59685,"uzi":7134,"utó":6849,"usi":7047,"use":5760,"usa":11181,"ust":14662,"uss":6347,"uti":10782,"ute":14945,"uta":12601,"utu":10263,"uto":22443,"utr":11202,"us ":31309,"ura":55930,"ure":6495,"urg":9235,"uri":7874,"uro":13305,"ur ":5714,"upe":7143,"upo":9984,"uma":271540,"ume":7805,"unt":10008,"uni":47036,"und":70729,"una":50258,"unh":9065,"um ":201899,"ult":13394,"ulo":24134,"uli":6278,"ulh":9149,"ula":44299,"uil":6385,"uin":8363,"uip":5474,"uis":7425,"uia":5524,"uit":14697,"ul ":16765,"ui ":29672,"uga":10215,"ugu":19601,"uda":6128,"ude":5928,"ubr":7378,"uca":5380,"ue ":100430,"uer":13174,"ues":23125,"udo":7316,"uen":9436,"uel":7358,"púb":6723,"ua ":33662,"uas":11459,"uar":8119,"ual":23558,"uan":11328,"ubl":7183,"ube":7379,"uai":5297,"uad":12055,"tur":41731,"tus":5101,"tui":5138,"tul":5524,"tub":7669,"tua":23176,"tud":7868,"tug":23030,"tre":31460,"tra":95106,"tri":60532,"tru":9663,"tro":48136,"péc":16451,"to ":188185,"tod":9530,"tou":5354,"tos":40605,"tom":5946,"ton":10767,"tor":47860,"til":14634,"tig":9433,"tir":6653,"tit":12090,"tis":7919,"tin":26516,"tim":13089,"tip":5927,"tio":9200,"tia":5519,"tic":53927,"tid":12974,"tiv":62127,"tem":39133,"ten":72655,"tei":7527,"taç":9386,"tel":22078,"teg":5010,"teb":12305,"tec":8043,"th ":5190,"tes":82988,"ter":103003,"the":7042,"ço ":13183,"ém 
":22415,"édi":7117,"éci":19647,"écu":6135,"éti":5315,"éri":21433,"ênc":16544,"êne":10481,"ês ":26475,"ção":131304,"ão ":306157,"ça ":19591,"çad":10194,"áti":8887,"áve":5112,"áxi":6942,"álb":7607,"áli":5630,"ári":27812,"áre":51471,"âni":17684,"úsi":6572,"úbl":7260,"ões":32837,"ôni":11441,"óri":16535,"óno":6648,"óni":7908,"óid":28287,"íti":12961,"íst":6738,"ínc":23886,"íng":5418,"íli":21725,"íod":17716,"ípi":21652,"ís ":6687,"íci":9019,"íde":5475,"çõe":21567},"n_words":[49778514,58587553,42469388],"name":"pt"}
\ No newline at end of file
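Both deleted files use the same langdetect profile schema: a single line of minified JSON holding a "freq" map of character 1- to 3-gram counts, an "n_words" array of three totals, and a "name" language code ("pt" above). A minimal loading sketch in Python, assuming the three "n_words" values are the total numbers of 1-, 2- and 3-gram occurrences (an inference from the data, not stated in it) and that a profile file lives at a path like langdetect/profiles/pt; load_profile is an illustrative helper, not part of the library:

import json
from collections import defaultdict

def load_profile(path):
    # One profile file = one minified JSON object, e.g. langdetect/profiles/pt.
    with open(path, encoding="utf-8") as fh:
        profile = json.load(fh)
    totals = profile["n_words"]  # assumed: totals of 1-, 2-, 3-gram occurrences
    prob = defaultdict(float)
    for gram, count in profile["freq"].items():
        n = len(gram)
        if 1 <= n <= 3 and totals[n - 1] > 0:
            # Relative frequency P(ngram | language), bucketed by n-gram length.
            prob[gram] = count / totals[n - 1]
    return profile["name"], prob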
+++ /dev/null
-{"freq":{"D":14206,"E":15258,"F":12709,"G":12710,"A":32029,"B":21807,"C":31096,"L":13478,"M":26711,"N":11195,"O":8313,"H":8411,"I":15485,"J":5893,"K":4820,"U":8262,"T":15341,"W":4878,"V":8350,"Q":589,"P":22438,"S":30214,"R":23700,"Y":1325,"X":1901,"Z":1899,"f":89747,"g":84289,"d":270437,"e":933940,"b":76172,"c":339926,"a":811577,"n":568157,"o":427370,"l":434670,"m":224465,"j":16485,"k":15291,"h":47000,"i":839847,"w":7236,"v":72644,"u":459551,"t":525861,"s":321476,"r":567673,"q":978,"p":199529,"z":55715,"y":12885,"x":14642,"²":280,"Î":4242,"É":135,"ß":93,"î":69716,"í":464,"é":1581,"è":290,"ç":171,"ä":380,"ã":94,"â":33080,"á":1243,"à":152,"ü":921,"ú":132,"ö":583,"ó":581,"ñ":107,"Ă":92,"ă":161614,"ā":266,"ć":89,"ı":136,"ī":130,"ş":69795,"ł":112,"ō":143,"š":140,"ţ":70418,"ū":99,"ǎ":351,"Ș":1233,"Ț":844,"ə":124,"ˈ":89,"́":280,"μ":153,"ν":326,"ο":443,"ι":274,"κ":132,"λ":209,"δ":99,"ε":222,"η":124,"α":406,"γ":123,"ά":91,"ί":118,"ό":118,"σ":174,"ς":360,"ρ":245,"π":122,"υ":100,"τ":203," l":54431,"ь":237," m":47832," n":26804," o":49358,"я":249," h":3267," i":28305," j":7583," k":2023,"ы":136," d":156105," e":67780,"х":112," f":51072,"ц":117," g":13468,"ч":249,"р":882," a":134125,"с":828," b":11159,"т":574," c":116614,"у":311," y":145," x":142," z":3353," u":43415," t":26873," w":1170," v":14303,"і":84," q":90," p":90611," s":84706," r":35629,"К":133,"Н":83,"М":137,"П":110,"Б":103,"А":114,"В":101," J":5857," K":4744," H":8304," I":15348," N":11099," O":8211," L":13327," M":26422," B":21644," C":30661,"Р":101," A":31745,"С":176," F":12566," G":12513," D":13925," E":15142,"л":649," Z":1885,"к":766," Y":1319,"й":327," X":1885,"и":1244,"п":176,"о":1327,"н":924,"м":284," S":29902,"г":187," R":23579,"в":707," Q":580,"б":205," P":22131,"а":1480," W":4821,"з":146," V":8268," U":8214,"е":1081," T":15157,"д":332," î":68405," Ă":90," É":134," Î":4219," ţ":2046," ş":40504,"ה":105,"ו":138,"א":92,"ל":89,"י":166," Ș":1227," Ț":843,"ר":104,"و":163,"ي":302,"ل":318,"م":220,"ن":216,"د":146,"ح":87,"ب":195,"ا":469,"ع":89,"س":117,"ر":219," А":114," Б":103," В":101," К":131," М":137,"A ":4154," П":110,"F ":568,"Da":2015,"Cu":1954,"Cy":113,"Cl":1608,"Co":8574,"Cr":1776,"Ce":2628,"Ch":2680,"Ci":1438,"G ":509,"Ec":471,"Ed":733,"Ea":729,"Du":1500,"Do":1908,"Dr":850,"De":3817,"Di":2658,"Fe":1913,"H ":435,"Fa":1302,"Eu":3402,"Ev":451,"Ex":957,"Er":513,"Et":184,"Es":2728,"En":827,"Em":497,"Ep":280,"Ei":240,"El":1974,"Ef":84,"Eg":290,"Ge":3177,"Cá":254,"Câ":416,"Ga":1765,"I ":3107,"Fu":513,"Fr":2787,"Fo":1889,"Fl":1292,"Fi":1643,"Bâ":129,"B ":735," Р":100," С":176,"C ":1390,"Av":423,"Au":2210,"Aw":103,"Ar":4794,"At":636,"As":1391,"D ":881,"Ba":4270,"Az":208,"Ae":279,"Af":611,"Ag":465,"Ah":119,"Ab":553,"Ac":3223,"Ad":1097,"Am":1924,"An":3647,"Ap":1490,"Ai":410,"Ak":87,"Al":4305,"Bu":3694,"Br":3092,"Ca":7562,"E ":767,"Bi":3068,"Be":2955,"Bo":2204,"Bl":574,"Ku":328,"Ky":124,"Kn":99,"Kl":149,"Kr":348,"Ko":662,"Le":1965,"Gă":138,"Li":3218,"N ":854,"La":3044,"Lu":1753,"Ly":104,"Ll":318,"Lo":1983,"Me":2961,"Mi":4169,"O ":1394,"Ma":9710,"Mc":201,"My":160,"Mu":2543,"Mo":4971,"Ni":1761,"Ne":2385,"Na":2454,"P ":736,"Q ":153,"Nu":750,"No":2851,"Ol":1045,"Om":198,"On":376,"Oh":91,"Oc":1033,"Od":206,"Of":171,"Oa":113,"Ob":305,"Gi":853,"Bă":725,"Gh":757,"Gl":444,"Gr":2243,"Go":1170,"Gu":1179,"Gy":91,"J ":354,"Ha":2364,"Dâ":128,"He":1745,"Că":581,"Hi":659,"Ho":1571,"Hr":249,"Hu":882,"Hy":112,"K ":358,"Ib":100,"Ia":1254,"Id":117,"Ie":183,"Ig":92,"Io":1307,"Im":1022,"In":3678,"Il":431,"Iu":1265,"Iv":162,"Is":1017,"It":790,"Ir":549,"Ja":1638,"L 
":645,"Iz":203,"Ji":337,"Je":630,"Jo":1867,"Ju":935,"Ka":1128,"M ":833,"Kh":125,"Fă":159,"Ki":625,"Ke":541,"Ut":114,"Ur":420,"Up":83,"Um":91,"Un":5673,"Ul":182,"Pă":635,"Uc":368,"W ":229,"Ty":98,"Tu":1266,"Tr":2949,"To":1782,"Th":2212,"Ti":1500,"Te":2836,"Pâ":232,"Ta":1163,"V ":1269,"Sw":112,"Sz":144,"Sy":285,"St":5254,"Su":2579,"Wr":229,"Wo":691,"Wi":1273,"Ră":813,"Wh":234,"Wa":1039,"Sâ":220,"We":859,"Y ":147,"Vo":713,"Vr":133,"Vu":111,"Râ":933,"Vi":1974,"Vl":297,"X ":1079,"Va":2283,"Ve":1567,"Lă":421,"Pu":736,"Pr":4511,"S ":1482,"Pe":2677,"Pa":6072,"Pl":931,"Po":3427,"Pi":1539,"Ph":445,"Os":331,"Ot":294,"Ou":154," ا":202,"Ov":120,"Op":423,"Or":1772,"R ":901,"Kö":111,"Sf":926,"Se":3730,"Sc":1718,"Si":2479,"Sh":841,"Nă":243,"Sm":167,"Sl":471,"Sk":113,"Sp":3075,"So":2382,"Ru":1542,"U ":435,"Sa":3082,"Re":6657,"Ri":1089,"Mă":769,"Rh":203,"Ro":9754,"Qu":329,"T ":645,"Mü":187,"Ra":1639,"Wü":172,"b ":5072,"a ":212743,"Ye":128,"Tă":194,"Ya":168,"Yo":679,"Yu":120,"Z ":124,"Tâ":356,"Să":464,"Vă":154,"Za":383,"Ze":392,"Zi":317,"Vâ":376,"Zo":278,"i ":179645,"gd":205,"ge":13414,"câ":4391,"ga":11234,"gb":146,"fl":4423,"ff":711,"bâ":136,"fi":21259,"fr":4539,"fu":3903,"ft":996,"fo":28249,"j ":2040,"gy":251,"dâ":214,"he":10237,"ha":6256,"gn":1318,"gm":382,"gl":4043,"gi":18379,"gh":4326,"bă":2234,"gg":253,"gv":296,"gu":7182,"gt":188,"gs":381,"gr":9341,"cî":121,"go":3415,"dt":248,"du":15963,"dv":200,"dw":264,"dy":391,"dz":86,"g ":5818,"ea":59379,"eb":4646,"ec":30872,"ed":13772,"de":117725,"dd":311,"dg":263,"di":66800,"dh":111,"dj":215,"dm":1691,"eM":86,"dl":291,"do":13587,"dn":210,"ds":959,"dr":6694,"ew":1340,"ex":7454,"eu":5496,"ev":8185,"ey":1419,"ez":15187,"fa":8662,"h ":3714,"fe":10587,"eh":1815,"eg":14417,"ef":4744,"ee":3471,"el":50297,"ek":775,"ej":687,"ei":30112,"ep":15349,"eo":6294,"en":63988,"em":24435,"et":20416,"es":79718,"er":87843,"ca":58257,"e ":363120,"by":328,"bs":1372,"br":14880,"bu":7988,"bt":195,"bn":117,"bo":5650,"bl":6057,"bm":100,"bi":12351,"bb":308,"bc":108,"bd":241,"be":6318,"db":261,"da":18159,"f ":3092,"cz":119,"cy":195,"cv":646,"cu":45953,"ct":20001,"cs":291,"cq":100,"cr":11320,"co":42083,"cm":243,"cn":187,"ck":2747,"cl":5298,"ci":32813,"ch":13925,"ce":51893,"cc":2745,"c ":18619,"aP":84,"aC":95,"az":9879,"ay":1659,"ba":11641,"d ":22818,"at":93723,"as":21874,"ar":105841,"aq":90,"ax":1099,"aw":499,"av":8565,"au":21651,"ak":1416,"al":87116,"ai":18198,"aj":3933,"ao":374,"ap":15113,"am":19806,"an":87690,"ac":26532,"ad":16915,"aa":641,"ab":6836,"ag":7060,"ah":1692,"ae":3048,"af":7326,"nu":32036,"nt":67674,"ns":20163,"nr":1134,"np":113,"no":17936,"nn":2298,"q ":85,"nz":1824,"ny":836,"nw":156,"nv":2089,"oe":1765,"of":5818,"oc":21548,"od":10761,"oa":21531,"ob":5951,"om":43493,"on":56179,"ok":801,"ol":30865,"oi":8296,"oj":309,"og":8118,"oh":1090,"m²":273,"ot":11364,"os":27341,"ov":12192,"ou":7338,"op":17725,"oo":1964,"or":85929,"r ":49086,"ox":1908,"ow":1237,"oz":3227,"oy":555,"pe":42586,"lâ":532,"pa":25762,"pc":98,"pl":7066,"lé":112,"po":22942,"ph":1089,"pi":11751,"lo":40730,"ln":822,"lm":3343,"ll":6217,"ls":1221,"lp":822,"lv":2229,"lu":48097,"lt":11451,"lz":261,"ly":862,"o ":37058,"mc":90,"ma":44395,"mb":17891,"eş":9887,"hă":248,"me":40474,"mf":257,"eţ":7364,"ml":160,"iè":112,"mi":28493,"mn":4278,"mm":955,"mp":17238,"mo":11698,"mt":225,"ms":601,"mu":21624,"my":299,"p 
":5018,"na":37617,"nb":568,"nc":20662,"nd":32264,"ne":45632,"nf":4807,"ng":14388,"nh":479,"ni":65371,"nj":736,"nk":1221,"nl":1126,"nm":330,"ju":7523,"jo":2219,"jl":509,"bţ":531,"ki":1735,"kh":269,"fă":1862,"cş":118,"gâ":388,"kf":113,"ke":1642,"ka":1464,"m ":15258,"ky":316,"ks":661,"kt":260,"ku":385,"ko":995,"kr":318,"kk":161,"cţ":4949,"kl":292,"km":1457,"li":54376,"lh":237,"gă":2958,"lk":383,"lj":135,"le":70613,"ld":4148,"lg":2038,"lf":1236,"hâ":94,"la":55152,"lc":2339,"lb":4110,"n ":164001,"hr":1429,"hs":252,"hw":257,"ht":1909,"hu":2313,"că":22340,"hi":13680,"hn":1591,"ho":3277,"hl":404,"hm":291,"id":10613,"ic":65212,"ib":4937,"ia":63950,"dă":2289,"ih":1392,"ig":9132,"aş":9011,"if":6185,"ie":62856,"hy":354,"k ":4658,"iq":165,"eî":135,"ir":13340,"is":36911,"it":62439,"iu":24448,"iv":15138,"iw":97,"ix":744,"ii":33158,"ij":1160,"aţ":19894,"ik":1190,"il":42983,"im":26881,"in":122257,"io":22518,"ip":8638,"je":1168,"fâ":980,"ji":658,"iz":12018,"iy":146,"l ":109782,"ja":1735,"pţ":503,"să":10006,"xi":3113,"xo":371,"té":92,"tî":216,"xp":1513,"xt":2105,"xu":441,"ww":338,"z ":3980,"xc":491,"xa":1180,"tâ":2269,"xe":1609,"oş":1503,"ră":15639,"wi":755,"oţ":1099,"sé":93,"wn":249,"wo":518,"sî":86,"wr":185,"ws":439,"vy":161,"y ":7000,"wa":1979,"sâ":398,"we":941,"vl":85,"ré":180,"nţ":14138,"vi":18551,"râ":2794,"nş":236,"vu":1822,"vr":1272,"rî":85,"vs":260,"rí":114,"vn":113,"vo":6165,"uz":4516,"uy":132,"ux":811,"uw":93,"uv":2854,"uu":147,"ve":20356,"va":13083,"x ":3188,"ui":36731,"uj":940,"mţ":245,"uk":412,"ul":105505,"ue":3558,"uf":779,"ug":3958,"pă":8327,"uh":313,"ur":41535,"us":16414,"ut":20794,"um":22214,"un":77098,"uo":207,"up":12202,"ty":1026,"tz":478,"tu":41090,"tt":2205,"tw":494,"tv":129,"ub":9499,"ua":12620,"ud":10993,"uc":12122,"w ":1362,"to":36489,"tn":678,"tm":622,"tl":1983,"ts":1249,"tr":54687,"tp":270,"pâ":2492,"tf":804,"te":135412,"td":104,"lţ":1071,"tk":89,"ti":57663,"th":5025,"v ":5358,"tb":1584,"tc":809,"ta":57242,"su":20142,"sv":111,"ss":3065,"st":121541,"sy":354,"sz":231,"sw":164,"sl":2146,"sk":1369,"sn":714,"sm":3046,"sp":10131,"so":10359,"sr":365,"sd":200,"sc":21675,"sf":2108,"se":33580,"sh":1776,"nă":16784,"sg":163,"si":25637,"rz":902,"u ":55858,"sa":21555,"sb":549,"rr":1909,"rs":9764,"rt":23612,"ru":36664,"rv":2469,"rw":210,"ry":1487,"rp":2476,"ro":43127,"rn":8536,"rm":17803,"né":122,"rl":4592,"nç":103,"rk":1642,"rj":246,"ri":114051,"mă":7183,"rh":1830,"rg":8792,"rf":1291,"nâ":643,"re":125851,"rd":9049,"rc":6829,"rb":4129,"ra":67426,"t ":87746,"qu":761,"mé":97,"iţ":6799,"mâ":11991,"lă":11508,"iş":3714,"s ":29143,"px":1165,"py":91,"pt":7489,"pu":17959,"pp":664,"pr":42710,"ps":1182,"ză":8661,"zâ":331,"uţ":3113,"xă":383,"uş":1691,"vă":2660,"zz":266,"sţ":297,"zf":124,"vâ":2188,"rş":635,"zg":128,"uă":2010,"zi":14356,"zb":1622,"zd":236,"ze":7824,"vá":88,"za":10358,"zv":1283,"zy":102,"zu":1714,"zt":122,"zo":3689,"zn":131,"rţ":3519,"zm":105,"zl":93,"tă":34729,"ye":514,"yc":205,"yd":240,"ya":763,"yb":116,"yw":120,"yu":112,"yt":228,"ys":611,"yr":295,"yp":159,"yo":376,"yn":435,"ym":285,"yl":584,"yi":202,"² ":275,"Î ":96,"În":3944,"Îm":147,"án":283,"ác":265,"ár":117,"ás":83,"âl":535,"âm":971,"ân":23884,"âi":324,"âu":2278,"ât":1719,"âr":2512,"à ":112,"á ":120,"アアア":92,"ón":162,"ó ":105,"în":66365,"ín":93,"îi":280,"îl":220,"îm":1778,"ía":115,"î ":282,"âş":625,"él":95,"én":106,"és":86,"ér":215,"ée":141,"èr":89,"é ":392,"ăc":1712,"ăd":1353,"ăi":1593,"ăj":113,"ăg":390,"ăm":1728,"ăn":1902,"ăl":2480,"ăr":15029,"ăp":813,"ăv":254,"ău":2670,"ăt":8950,"ăs":5402,"ăz":2233,"ăb":253,"Ă ":96,"ă 
":108634,"ün":238,"ür":317,"ör":105,"ön":99,"öl":118,"îş":473,"ăţ":4694,"ăş":1107,"şo":1467,"şt":10708,"şu":3626,"şi":42994,"şn":265,"şm":86,"şa":1934,"şc":1152,"şe":2787,"ş ":4403,"şă":140,"ţe":5578,"ţi":45859,"ţu":4697,"ţa":7179,"ţ ":1101,"ţă":5758,"ア":157,"ǎ ":253,"Ți":110,"Ța":292,"Ș ":94,"Și":132,"Șo":113,"Șt":390,"Ț ":94,"Șc":133,"Șa":135,"Șe":144,"Ță":266,"之":98,"三":194,"丁":102,"ος":175,"ος ":175,"ς ":359,"ν ":94,"α ":133,"アア":124,"ск":290,"та":92,"ст":158," Ga":1748," Câ":415," Ge":3154," Cá":254," I ":808," Fo":1868," Fu":504," Fr":2770," Fi":1622," Bâ":129," Fl":1278," Ha":2353," He":1731," Dâ":128," Gy":84," J ":235," Go":1158," Gr":2202," Gu":1163," Bă":725," Gh":755," Gi":841," Gl":431," Ig":92," Ie":183," Id":117," Ib":97," Ia":1248," K ":217," Hy":109," Hu":880," Hr":248," Ho":1557," Că":576," Hi":652," Ji":335," Je":625," L ":250," Ja":1632," Iz":203," Iu":1263," Iv":159," Ir":548," Is":1004," It":785," Im":1017," In":3630," Io":1287," Il":430," M ":262," Ka":1125," Ke":531," Ki":614," Kh":124," Fă":159," Jo":1852," Ju":930," N ":206," La":3007," Le":1942," Gă":138," Li":3175," Kl":144," Kn":98," Ko":661," Kr":346," Ku":321," Ky":124," Mc":200," Ma":9571," O ":853," Mi":4107," Me":2921," Lo":1967," Ll":318," Ly":104," Lu":1743," Ne":2351," P ":197,"а ":320," Na":2442," Ni":1749," Mo":4944," My":159," Mu":2527," A ":2659," B ":442," C ":439," Ap":1483," Am":1916," An":3623," Ak":86," Al":4275," Ai":406," Ag":455," Ah":117," Ae":277," Af":571," Ac":3199," Ad":1079," Ab":530," Ba":4239," D ":276," Az":205," Aw":102," Av":410," Au":2198," At":631," As":1386," Ar":4774," Be":2935," Bi":3051," Bl":566," Bo":2181," Br":3074," Bu":3675," E ":267," Ca":7457," Ce":2586," Ci":1423," Ch":2660," Cl":1578," Cr":1755," Co":8451," Cu":1909," Cy":110," F ":199," Da":1990," Di":2603," De":3729," Dr":841," Do":1853," Du":1487," Ea":726," Ec":468," Ed":725," G ":189," El":1961," Ei":240," Eg":286," Ef":83," Et":180," Es":2724," Er":511," Ep":280," En":810," Em":492," Ex":936," Eu":3393," Ev":441," Fe":1900," Fa":1274," H ":241," Să":460," Tâ":354,"к ":95," Wr":229," Wo":669," Wi":1265," Ră":813," Wh":231," We":849," Sâ":220," Wa":1026," Y ":114,"й ":231," Zo":278," Ze":389," Vâ":375," Zi":312," Za":381," Yu":120," Yo":676," Ya":164," Tă":194," Ye":128," Wü":172,"о ":126,"н ":131," Vă":153," a ":36875," R ":300,"в ":116," Kö":111," Ou":147," Ov":103," Os":331," Ot":292," Or":1764," Op":422," Po":3392," Pl":908," Pi":1531," Ph":433," Pe":2652," Pa":6041," Q ":118," Nu":741," No":2833," Ol":1044," On":368," Om":193," Oh":91," Od":204," Oc":1030," Of":168," Ob":298," Oa":113," Mü":186," Ra":1619," T ":190," Qu":324," Ro":9725," Re":6614," Ri":1081," Mă":767," Rh":203," S ":442," Pr":4358," Pu":723," Lă":359," Sz":142," Sy":283," Sw":111," Su":2564," St":5193," Ta":1158," V ":299," Th":2181," Ti":1481," Pâ":232," Te":2810," Tr":2917," To":1756," Ru":1539," Sa":3068," U ":212,"е ":128," Sh":834," Nă":243," Si":2443," Sc":1686," Se":3700," Sf":922," So":2361," Sp":3054," Sk":111," Sl":463," Sm":160," Va":2273," X ":295,"и ":98," Ve":1551," Vi":1939," Râ":933," Vl":295," Vo":709," Vu":110," Vr":132," Tu":1248," Ty":98," W ":133," Uc":367," Pă":634," Ul":182," Um":91," Un":5644," Ur":414," Ut":114," ja":470," l ":435," iz":399," io":137," ip":103," im":2637," in":15177," il":210," iu":2496," is":1936," it":606," ir":289," ka":124," m ":620," fă":1009," ki":195," gâ":148," jo":1049," ju":5955," ha":692," he":341," gi":362," gh":227," bă":568," gl":415," gr":4643," cî":96," go":322," gu":1027," 
ia":2677," id":624," ie":296," dă":97," aş":533," că":5014," hi":694," ho":634," hr":508," ht":170," ni":1017," ne":2985," na":3334," p ":138," mu":6108," mo":5925," ol":415," om":2225," on":472," oc":2425," od":198," of":2241," oa":721," ob":2230," nr":213," nu":8582," no":5141," le":5248," gă":633," li":8299," n ":4015," la":23505," km":1385," me":7797," eş":107," mi":5999," o ":24882,"я ":186," ma":19097," lu":7950," lo":8243," ae":559," af":3722," ag":621," ab":871," ac":11612," ad":3606," am":3470," an":10041," ap":7602," ai":935," aj":609," al":27010," av":3789," au":7831," ax":133," ar":7993," at":2213," as":3512," d ":818," ba":2886," az":461," bi":2225," be":668," bo":1156," bl":302," bu":1418," br":1785," ca":30202," e ":424," c ":92," er":1837," et":1185," es":44667," en":2470," em":716," ep":805," ei":565," el":3115," ef":535," eg":319," fe":3014," fa":5201," eu":1087," ev":1448," ex":4580," fu":2227," fr":3215," fo":24062," fl":777," fi":11367," ge":4653," câ":3570," ga":964," i ":494," cl":2039," cm":100," co":29681," cr":3733," cc":163," ce":15081," ch":1829," ci":2979," f ":104," da":4946," cu":21872," do":6118," dr":1888," de":93131," di":45684,"ч ":147," ec":2186," ed":1091," ea":295," eb":276," du":3328,"ль":104," vă":281," zo":1012,"ла":120,"ли":124,"ко":168," ze":739," zb":153," zi":1040," zf":120," vâ":719,"ка":188,"ки":145," tă":177," să":3897,"ин":138,"ик":85," ww":167,"ий":142," tâ":352,"ич":160,"ри":89,"ро":151,"ра":156,"ре":83,"ос":104,"ор":119,"ол":94,"ов":307," uş":254,"но":129,"ни":117,"на":162," ru":1689," sa":12280," sf":678," se":17401," sc":5833," si":9687," sh":138," nă":1457," sl":456," sp":5202," so":3426,"ви":183,"во":101," ra":2561," re":17790," ri":1713," mă":1822," ro":8367," pu":4423," pr":28214," ps":399," s ":2092," px":1165," lă":254," mâ":207," os":197," ot":110," op":1754," or":11013,"ан":170," ox":160,"ал":98," pe":25650," lâ":421," pa":9591,"ар":142," pl":2462," po":12520,"ая":98," pi":2615," wa":449," sâ":329," we":213," wr":124," wi":111," ră":1828," x ":110," va":2539," ve":4691," uz":162," vo":2209," vr":384," vu":171," râ":1545," vi":3164," uc":310,"ес":86,"ер":137,"ен":133," tu":1391," us":154," ut":956," ur":2048," um":561," un":37329," ul":1504," pă":1596," ta":986," st":9754," su":13217,"ев":109," tr":8292," to":2637," th":2187," ti":4195," te":6492," pâ":1627,"Țăr":258," Î ":96," Îm":146," În":3919," în":65579," îl":200," îm":1722," îi":236," î ":223," îş":424," Ă ":90," ţa":471," şt":950," şo":136," şi":37976," şc":219," şe":338," şa":764," ţi":391," ţe":161," ţă":1008," Ță":265," Ța":292," Ți":110," Șe":142," Șc":133," Șa":134," Ț ":92," Șt":387," Șo":113," Și":132," Ș ":91,"ال":185,"ي ":110,"ن ":132,"AS ":90,"BC ":103,"Feb":468,"Fed":536,"Fel":141,"Fer":333,"Fes":98,"Bâr":109,"Fil":558,"Fin":381,"Fir":91,"Fie":114,"Ext":451,"Fam":192,"Fan":103,"Fal":98,"Far":165,"Fac":313,"Fab":84,"Era":97,"Eri":131,"Est":2527,"Eur":3119,"Eva":149,"Eve":110,"Eug":130,"Exp":186,"Exi":117,"Exc":103,"Evu":86,"El ":710,"Ele":546,"Enc":124,"Eng":174,"Ene":132,"Emi":260,"Elv":237,"Eli":210,"Epi":189,"Ent":184,"Câm":221,"Cân":103,"Các":221,"Ger":1707,"Geo":684,"Gen":482,"Gla":99,"Ghe":506,"Băl":198,"Ghi":176,"Băt":252,"Gil":105,"Gir":238,"Giu":159,"Gaz":102,"Gal":559,"Gam":141,"Gav":159,"Gar":269,"Gab":147,"Fun":252,"Fru":86,"Fro":141,"Flo":463,"Fla":640,"Fra":1957,"Fri":249,"Fre":285,"Fon":202,"Fot":362,"For":809,"Fox":86,"II 
":1544,"Dâm":116,"Căl":223,"His":116,"Hil":108,"Hel":236,"Hei":175,"Hea":139,"Hen":243,"Hes":101,"Her":541,"Hal":221,"Hai":98,"Han":325,"Ham":243,"Har":682,"Hau":104,"Gur":143,"Guv":353,"Gua":120,"Gui":207,"Gre":662,"Gri":261,"Gra":547,"Gru":280,"Gro":360,"ţă ":4334,"Glo":227,"Goo":112,"Gol":285,"Got":84,"ţăt":101,"Gor":250,"ţăr":1089,"ţăm":207,"Inv":138,"Ioa":445,"Inf":292,"Ini":100,"Int":1173,"Ins":803,"Ion":531,"Ios":98,"Ior":163,"Ili":165,"Ill":92,"Inc":150,"Ind":606,"Imp":850,"In ":158,"Iaş":404,"Ier":125,"Ian":535,"Ial":83,"Hun":383,"Hum":90,"IX ":412,"Hug":91,"IV ":391,"IT ":137,"Hor":215,"Hou":138,"Hot":160,"Hom":127,"Hon":117,"Hol":365,"Hr ":133,"Arg":385,"Arh":350,"Are":1696,"Arc":194,"Ard":100,"Ara":584,"Arm":402,"Ari":241,"Apo":195,"Apr":477,"Ate":96,"Atl":247,"Ast":296,"Ass":154,"Asi":398,"Aso":236,"Art":517,"Au ":88,"Avi":111,"Ave":127,"Aut":413,"Aus":786,"Aur":237,"Apă":85,"Aug":551,"Bai":170,"Bal":441,"Ban":574,"Bab":153,"Bac":358,"Bad":439,"Baz":111,"Bay":92,"Bar":873,"Bat":155,"Bas":347,"Bav":161,"CD ":118,"Abr":91,"Aca":558,"Act":175,"Ada":136,"Ace":1958,"Acc":188,"Adu":103,"Adm":250,"Ado":166,"Adr":161,"Ade":100,"Afa":103,"Aer":222,"Age":153,"Afr":381,"Agr":117,"Air":208,"Al ":217,"Ala":198,"Alb":1104,"Alg":107,"Ali":309,"Alc":89,"Ale":1016,"Alf":175,"Alt":180,"Alm":145,"All":248,"Alp":203,"Ame":1318,"Amb":85,"Ama":153,"Ang":951,"Ani":130,"Ana":326,"And":938,"Ant":732,"Ann":166,"Apa":378,"But":87,"Bus":115,"Buz":155,"Bul":406,"Bun":176,"Bur":449,"Buc":1806,"Bud":195,"Bru":284,"Bră":148,"Ca ":119,"Cab":93,"Cal":738,"Cam":777,"Cas":1102,"Car":1723,"Cau":101,"Cat":1040,"Can":883,"Cap":564,"Bea":150,"Bet":104,"Ber":735,"Ben":285,"Bel":1128,"Bib":207,"Bil":178,"Bih":202,"Bis":1833,"Bir":155,"Bio":98,"Blo":128,"CN ":94,"CO ":85,"Bla":274,"Bre":357,"Bra":1081,"Bro":311,"Bri":819,"Bog":147,"Boe":111,"Bol":190,"Bon":168,"Boo":93,"Bor":292,"Bos":170,"Bot":287,"Bou":149,"Cuv":164,"Cur":363,"Cup":386,"Cul":405,"De ":453,"Dez":104,"Der":87,"Det":101,"Des":342,"Dev":138,"Deu":92,"Del":231,"Dem":343,"Den":254,"Dep":258,"Dea":199,"Dec":652,"Dam":105,"Dan":549,"Dar":174,"Dat":151,"Dav":338,"Dac":187,"Dal":97,"Chr":305,"Che":298,"Chi":1153,"Cip":91,"Cin":138,"Cio":191,"Cit":251,"Ciu":252,"Civ":93,"DN ":145,"Cle":88,"Cla":363,"Cea":181,"Ceh":224,"Cel":451,"Cen":839,"Cet":187,"Cer":452,"Cha":723,"Cri":558,"Cra":341,"Cre":321,"Cu ":179,"Cru":182,"Cro":263,"Cli":120,"Clo":93,"şă ":93,"Clu":841,"Coc":83,"Coa":119,"Cod":224,"Cop":211,"Cos":228,"Cor":1033,"Com":2355,"Col":727,"Coo":87,"Con":2677,"Cou":282,"Cot":116,"Cov":111,"Ea ":436,"FA ":238,"Egi":189,"FI ":85,"Edu":88,"Edi":420,"Eco":139,"Ech":197,"ţur":116,"Eas":194,"ţui":121,"ţul":4343,"FC ":245,"ţar":732,"ţat":1470,"Deş":155,"ţe ":1287,"ţa ":4852,"Dia":147,"Dic":236,"Dis":609,"Dir":129,"Dio":102,"Din":623,"Dim":132,"Die":129,"Div":260,"ţit":365,"ţiu":2393,"ţir":210,"Duc":160,"ţin":4093,"ţio":5262,"Dup":205,"ţil":2341,"ţim":464,"Dun":357,"Dum":429,"ţii":7297,"ţif":426,"EX ":88,"Dur":97,"ая ":95,"ţia":6859,"ţie":10551,"ţei":1511,"ţen":180,"ţel":1678,"ţes":190,"Dre":250,"Dra":255,"ţi ":5299,"Doi":184,"Dob":158,"Dou":102,"Dol":170,"Don":271,"Dom":358,"Dor":224,"ţea":461,"Nea":360,"Neg":223,"Nev":114,"Neu":159,"Net":137,"Nep":85,"Nas":96,"Nat":327,"Nav":140,"Nig":112,"Nic":784,"Nis":298,"Nin":99,"Nik":102,"Naţ":910,"New":711,"Nap":289,"Nam":92,"Num":477,"OS ":93,"Nou":391,"Nov":172,"Nor":1220,"Not":118,"Noi":520,"Nob":138,"Înc":117,"Oct":512,"Înt":212,"Ode":88,"PC ":107,"Oce":295,"Împ":136,"Obe":86,"În 
":3310,"Oto":155,"Olt":287,"Oli":317,"Ola":252,"Ono":115,"One":96,"Ope":278,"Ora":462,"Ort":215,"Osc":136,"Ord":181,"Ori":268,"Org":211,"Peş":117,"Plo":152,"Ple":87,"Pla":572,"Pin":170,"Pit":107,"Pir":96,"Pie":423,"Pic":155,"Pia":306,"Pho":97,"Phi":218,"Ped":83,"Per":522,"Pet":738,"Pen":513,"Pel":85,"Pe ":267,"Pat":391,"Pas":174,"Par":3240,"Pav":117,"Pau":327,"Pac":294,"Pan":274,"Pap":271,"Pal":567,"Pub":157,"Pur":83,"Put":143,"Pro":1436,"Pri":1498,"Pre":1075,"Pru":161,"Pra":285,"Pod":203,"Poa":126,"Pol":793,"Pom":101,"Pon":142,"Poi":169,"Pot":185,"Pos":133,"Pop":623,"Por":507," ال":165,"Lăc":323,"SA ":143,"Rac":113,"Rad":382,"Rai":238,"Ram":106,"Mün":150,"Ran":107,"SD ":117,"Que":155,"Isa":101,"Irl":266,"Ita":687,"Isl":175,"Isr":173,"Ist":357,"Ira":132,"Iug":109,"Iva":127,"Iul":580,"Iun":435,"Izv":134,"Jae":83,"Jac":383,"Jap":315,"Jan":217,"Jam":257,"Jer":140,"Jea":190,"Jim":122,"Jos":531,"Jon":151,"Joh":545,"Joc":288,"Jud":261,"Jus":113,"Jur":102,"Jul":125,"Jun":115,"Kal":121,"Kan":96,"Kat":124,"Kar":336,"Ken":149,"Kir":86,"Kin":185,"Kie":90,"Kon":94,"Kos":163,"Kre":87,"Kra":118,"Lew":83,"Lev":90,"Let":83,"Les":120,"Leo":277,"Len":176,"Lei":93,"Leg":332,"Lee":87,"Lea":120,"Lau":147,"Laz":87,"Le ":90,"Las":86,"Lat":109,"Lar":154,"Lam":95,"Lan":338,"Lac":259,"Lab":85,"La ":1181,"Lle":258,"Lib":362,"Lic":139,"Lie":89,"Lig":414,"Lim":410,"Lin":461,"Lis":431,"Lit":301,"Liv":198,"MI ":96,"Lux":121,"Lup":107,"Lum":137,"Lun":558,"Lud":146,"Luc":356,"Lou":179,"Lov":137,"Los":244,"Lot":89,"Loc":483,"Lor":132,"Lon":361,"Meh":106,"Men":147,"Mem":162,"Mel":209,"Mes":271,"Mer":299,"Met":461,"Mec":135,"Med":595,"Mex":216,"Man":1131,"Mal":342,"Mar":4668,"Mas":368,"Mag":383,"Mad":271,"Maj":121,"Mai":903,"Mac":356,"May":100,"Max":155,"Mau":132,"Mat":338,"Mod":176,"Mol":1838,"Mon":1277,"Mos":318,"Mor":392,"Mou":115,"Mot":179,"Mih":609,"Mik":106,"Mij":92,"Mid":157,"Mig":86,"Mic":936,"Mit":232,"Mir":321,"Mis":279,"Mil":373,"Min":651,"Muz":367,"Mun":1060,"Mul":94,"Mur":508,"Mus":300,"NU ":88,"Săl":172,"Săr":113,"Târ":305,"XX ":206,"XV ":95,"Wre":180,"Wor":386,"Wol":146,"Whi":111,"Răd":101,"Răz":528,"Răs":89,"Wil":518,"Win":333,"Wie":86,"Wit":97,"ère":85,"Web":84,"Wes":374,"Sân":132,"Was":121,"War":230,"Wat":92,"Wal":290,"Vra":93,"ée ":112,"Vol":202,"Voi":185,"Vis":124,"Vit":135,"Vla":243,"Ziu":91,"Zon":137,"Zee":92,"Vâl":191,"Vâr":128,"âşt":551,"Yor":388,"You":200,"на ":94,"Stă":112,"Sys":110,"Stî":142,"Sur":132,"Sus":204,"Sul":83,"Sup":346,"Sun":219,"Sue":270,"Sud":589,"Suc":243,"Sub":164,"Str":664,"Stu":292,"Sti":105,"Sto":394,"Sta":2644,"Ste":728,"Teh":119,"Tea":170,"Tec":113,"Ten":121,"Tem":164,"Teo":264,"Tel":395,"Tan":144,"Tat":111,"Tar":189,"Tai":83,"Tal":112,"UA ":470,"Sfâ":498,"Shi":108,"She":197,"Năs":219,"Sho":130,"Sha":300,"Sim":365,"Sil":252,"Sig":138,"Sit":102,"Sis":180,"Sir":233,"Sin":438,"Sie":127,"Sib":370,"Sfi":241,"Sez":141,"Ser":760,"Sev":325,"Sf ":135,"Scr":179,"Sep":495,"Sen":217,"Sel":98,"Sem":99,"Sec":315,"Sea":111,"TV ":299,"Spa":2255,"Spi":211,"Spe":223,"Spr":121,"Spo":178,"Sof":165,"Soa":134,"Soc":483,"Sou":289,"Sov":352,"Sol":260,"Som":154,"Son":175,"Sor":138,"Sla":128,"Slo":264,"Roş":199,"Rus":998,"Sai":239,"Sam":225,"Sal":380,"Sab":86,"Se ":674,"Sco":358,"Sci":132,"Sch":680,"Sca":185,"Sax":130,"Sav":134,"Sat":330,"Sau":85,"Sar":201,"San":758,"ови":108,"TA 
":103,"Rez":129,"Res":168,"Ret":90,"Rev":440,"Măn":337,"Măr":265,"Rhe":108,"Riv":111,"Rin":182,"Ric":330,"Rap":96,"Ref":116,"Rec":405,"Red":97,"Rei":117,"Reg":1929,"Rem":121,"Ren":298,"Rel":102,"Rep":1978,"Rea":273,"Rol":118,"Rob":343,"Roc":197,"Rod":127,"Roy":87,"Rot":120,"Ros":294,"Rom":7794,"SS ":239,"SO ":102,"Reş":139,"Vel":106,"Ven":242,"Vec":214,"ски":101,"Vas":382,"Van":141,"Val":1157,"Var":247,"Vic":330,"Vie":288,"Vir":168,"Vil":251,"Vin":215,"Râu":810,"Ver":423,"Ves":364,"Păm":274,"Păd":123,"Ung":582,"Uni":3993,"Un ":792,"VD ":93,"Ucr":344,"VI ":204,"Tex":109,"Ter":953,"Tes":175,"Pân":88,"Pâr":129,"Tha":108,"The":1600,"Thi":130,"Tho":227,"Tib":83,"Tim":723,"Tin":109,"Tit":147,"Tir":112,"Top":132,"Tor":259,"Tok":94,"Tol":279,"Tom":280,"Ton":126,"Tot":116,"Tou":121,"Tru":137,"Tro":183,"Tri":361,"Tre":328,"Tra":1888,"Tur":781,"Tul":107,"Tun":93,"Tud":91,"вич":136,"biz":191,"bis":684,"bit":942,"biu":329,"bio":569,"bir":494,"baţ":118,"bil":3719,"bin":1111,"bii":385,"beş":168,"bo ":147,"blu":442,"şa ":427,"blo":266,"ble":593,"bli":4345,"bla":309,"boa":326,"bol":798,"boi":1231,"bog":143,"biş":155,"biţ":97,"şe ":206,"şca":505,"şal":128,"şan":384,"şap":297,"bon":331,"bom":116,"bor":1299,"bot":233,"bos":94,"bov":171,"şar":126,"şas":296,"bou":197,"box":118,"şat":99,"be ":463,"ban":1063,"bal":2277,"bai":156,"baj":238,"bac":607,"bab":295,"án ":151,"baz":1371,"bat":704,"bas":635,"bar":766,"bdi":107,"bea":119,"bi ":576,"bei":94,"bee":100,"bec":120,"ber":2623,"ben":568,"bel":997,"bes":190,"bet":600,"bia":703,"bib":169,"bic":703,"bie":1278,"áce":223,"şnu":136,"buţ":387,"buş":122,"şoa":686,"şor":235,"şov":403,"ca ":12197,"car":20537,"cas":995,"cat":4867,"cau":568,"can":4982,"cap":1522,"caz":602,"cav":140,"cac":106,"cab":250,"cad":1721,"cam":769,"cal":7164,"caf":88,"cai":123,"şu ":299,"ce ":13744,"bri":8393,"bro":479,"şco":241,"bra":1280,"bre":732,"bu ":154,"şea":100,"bru":3685,"şed":992,"bso":227,"bse":314,"şez":265,"şev":87,"bst":592,"şel":294,"şef":250,"şer":161,"şen":171,"şi ":40695,"bur":1502,"bul":939,"bun":893,"bum":1812,"bui":746,"buc":290,"but":536,"bus":305,"buz":102,"şcă":238,"şie":311,"şii":223,"by ":257,"şia":83,"bră":230,"şit":599,"şir":239,"şin":563,"şil":101,"aka":186,"am ":1398,"ake":307,"aki":175,"ajo":811,"aju":1101,"al ":30968,"aja":297,"aje":549,"adă":644,"ail":713,"aim":241,"ain":2042,"aio":622,"air":263,"ais":301,"ait":135,"aiu":141,"ak ":275,"aie":333,"aid":284,"aic":780,"aib":127,"aia":581,"ahn":102,"ahi":154,"acă":1581,"ahu":103,"aho":221,"aj ":993,"adâ":150,"ârâ":145,"aha":317,"agl":117,"agm":128,"agh":501,"abă":348,"agi":1223,"agr":589,"agu":482,"agn":543,"ago":775,"anu":8793,"anz":399,"any":226,"ano":1227,"ann":872,"ant":7062,"ans":5187,"ane":4573,"ang":1746,"anh":83,"ani":17950,"anj":319,"ank":595,"ap ":209,"ana":3541,"anc":4078,"and":7634,"amu":1451,"amm":250,"amo":500,"amn":539,"amp":1577,"ams":162,"ami":2968,"ame":7488,"amb":1018,"ama":1598,"alz":98,"alv":263,"alu":3357,"alt":3527,"als":199,"alp":187,"alo":2252,"aln":118,"alm":409,"all":1429,"alk":125,"alg":190,"agă":206,"ali":13360,"alc":1323,"ald":503,"ale":15313,"alf":367,"ala":3376,"alb":2405,"an ":14955,"aku":88,"ako":128,"acţ":1206,"ârş":461,"aba":554,"abe":900,"abi":2311,"abl":317,"abo":655,"abr":755,"abs":332,"abu":239,"ae ":1225,"aca":773,"aaa":86,"aal":84,"aar":122,"ad ":1372,"ânt":3632,"şur":726,"ânu":94,"ânz":207,"şul":2492,"âns":100,"ac ":1040,"ştr":244,"âmt":99,"şti":6783,"şte":3540,"âmp":537,"âng":896,"âne":1748,"şta":102,"ând":5547,"âni":6072,"ab 
":204,"ânc":472,"âna":293,"afr":149,"aft":132,"aff":130,"afe":369,"afi":1982,"afl":2509,"ai ":11311,"aga":899,"age":1103,"afu":111,"âur":92,"âul":1913,"aen":112,"ael":563,"aes":133,"aer":621,"ah ":371,"âte":342,"afa":1595,"ado":765,"ârs":182,"adr":1167,"ârt":98,"adm":1348,"adj":104,"adi":2476,"ârz":238,"âu ":266,"ade":2813,"ag ":301,"ână":2880,"adt":92,"adu":1730,"aco":1240,"acl":100,"ack":689,"aci":1819,"ach":1150,"ace":7711,"ât ":1298,"acc":1560,"ârb":268,"ada":3740,"ârf":276,"ârg":288,"af ":209,"acv":115,"ârn":129,"act":5256,"acu":1523,"ârl":126,"acr":543,"azo":188,"azi":1570,"arţ":1475,"azu":425,"aze":593,"avâ":822,"aza":1518,"azd":83,"azz":139,"avă":214,"asă":1423,"axi":362,"axo":262,"az ":338,"axa":101,"atâ":459,"ată":12359,"âi ":95,"ays":88,"aya":143,"aye":215,"ân ":1771,"âlc":198,"ba ":2953,"âmb":183,"ază":4808,"âln":280,"âin":172,"at ":24564,"amă":335,"arh":1164,"arg":1153,"arf":116,"are":42152,"ard":2497,"arc":2307,"arb":848,"ara":6502,"arp":413,"aro":1047,"arn":629,"arm":1225,"arl":2466,"anç":90,"ark":583,"ari":10760,"aru":1463,"arv":129,"arr":585,"ars":566,"art":13731,"au ":13208,"asa":2050,"ary":262,"arz":166,"asi":2001,"ană":4433,"ash":375,"asc":1871,"ase":2773,"aso":520,"asn":213,"asp":360,"ask":118,"asm":175,"asl":121,"ar ":8918,"apa":2757,"ape":1246,"api":1462,"aph":142,"apl":456,"apo":1977,"app":137,"apr":2890,"aps":139,"apt":1565,"apu":270,"as ":1819,"ală":6884,"ava":1432,"ax ":193,"auz":385,"aux":116,"aut":2879,"avr":217,"avo":325,"anţ":3393,"avi":1654,"anş":96,"ave":2326,"ay ":775,"awa":162,"avy":131,"avu":1095,"ară":4267,"av ":216,"ata":4096,"asu":901,"ast":6214,"ass":610,"asy":113,"atm":162,"alţ":185,"atl":201,"atr":2501,"ato":5035,"ate":24962,"atf":138,"atc":92,"ati":8684,"ath":466,"aw ":115,"aua":249,"auc":144,"att":328,"ats":134,"atu":9240,"aul":521,"aum":109,"aun":296,"aur":1060,"aus":649,"aud":358,"aug":1299,"apă":1796,"amţ":177,"Wür":170,"ка ":90,"ий ":128,"ич ":137,"jec":116,"jel":228,"jen":194,"fâr":376,"fân":580,"ji ":99,"jat":278,"jap":309,"jar":183,"jan":170,"jaz":89,"je ":321,"joa":246,"joc":694,"joz":167,"jos":164,"jor":690,"jit":146,"jin":221,"jaţ":96,"bţi":531,"jo ":103,"jlo":496,"itm":199,"itl":818,"itr":663,"ito":6294,"itu":8009,"itt":280,"its":137,"itz":191,"ity":420,"iub":117,"iuc":160,"iua":221,"iud":230,"ipă":404,"isk":89,"ism":2124,"isl":772,"iso":596,"isn":143,"isp":881,"iss":507,"isr":101,"isu":227,"ist":19214,"iv ":3796,"ita":17021,"itc":116,"ite":7821,"ith":307,"iti":4614,"ivo":234,"ivu":281,"ius":647,"iur":519,"ium":322,"iul":7954,"iun":9474,"iva":1811,"ix ":362,"ivi":3232,"inţ":4376,"ive":4336,"ipr":212,"ipo":338,"ipp":144,"ipu":386,"ips":239,"ipt":541,"ipi":1233,"aţă":1525,"ipl":436,"is ":3857,"ion":10900,"iop":116,"ior":1475,"ios":402,"iot":422,"iou":159,"iog":188,"iol":1112,"ipa":2480,"ipe":948,"iov":274,"ir ":799,"iru":345,"irs":100,"irt":172,"iro":730,"irm":483,"eîn":134,"irk":94,"irl":333,"iri":2227,"isi":1374,"ish":315,"ină":2514,"isf":103,"ise":3028,"isc":2092,"isa":444,"iu ":4493,"iqu":132,"ilă":788,"inâ":388,"ire":5707,"imă":577,"irg":179,"ira":895,"irc":947,"it ":8220,"ünc":112,"iză":370,"ja ":337,"ită":7145,"ixt":86,"isă":796,"ixe":89,"iz ":119,"ivă":1340,"izu":147,"izv":208,"izo":952,"izi":2792,"ize":859,"iza":6312,"kil":158,"kin":314,"kir":111,"kis":140,"km ":1110,"ki ":642,"făc":542,"făr":519,"kel":127,"ken":233,"kes":118,"ker":360,"ket":144,"fă ":122,"gân":291,"kfu":88,"ke ":452,"kra":120,"kre":109,"kt ":111,"kov":198,"km²":262,"kol":93,"ks ":224,"făş":535,"cţi":4945,"ko 
":243,"jut":351,"jus":113,"jul":508,"jun":382,"jum":207,"jur":1124,"jud":4033,"juc":683,"kar":128,"kan":145,"kai":124,"kad":96,"ka ":524,"ha ":718,"ham":492,"han":1154,"hai":779,"hak":92,"hal":402,"hau":169,"har":1141,"has":93,"hat":192,"hae":287,"hag":86,"hab":90,"he ":3331,"hel":915,"hei":851,"hee":120,"hed":164,"hea":469,"hez":295,"hev":85,"het":520,"hes":378,"her":1057,"heo":670,"hen":622,"hem":280,"că ":14688,"hi ":1098,"dân":197,"căi":96,"căl":499,"căd":116,"căz":100,"căp":117,"căs":168,"căr":2073,"cău":418,"căt":3820,"hie":477,"hid":863,"hic":510,"hib":86,"hia":1153,"hip":1598,"hio":209,"hin":1334,"him":1433,"hil":698,"hii":152,"hiu":429,"hiv":233,"his":738,"hit":1266,"hir":636,"hiz":150,"hn ":414,"hle":150,"ho ":172,"gma":139,"go ":432,"gme":179,"glo":451,"gle":2336,"gli":771,"gn ":157,"gla":327,"gog":138,"goa":126,"gnu":85,"gno":150,"gni":185,"câş":533,"gne":487,"gna":170,"geş":236,"gs ":160,"gol":308,"gon":427,"gos":413,"gor":814,"got":99,"gov":160,"gu ":263,"gro":318,"gru":1821,"gra":4239,"gri":471,"gre":2106,"gto":159,"gui":168,"gum":179,"gul":1451,"gua":163,"gue":323,"gy ":135,"gră":290,"guv":925,"gur":1708,"gus":1778,"gvi":266,"iam":458,"ial":8383,"ian":7496,"ias":341,"iar":3383,"iau":106,"iat":2458,"ic ":11072,"iab":156,"iac":516,"iad":260,"iag":132,"ibl":375,"ibi":1287,"ibo":95,"ibr":438,"ibu":1167,"iaz":367,"id ":1405,"iba":208,"ibe":1115,"ia ":37192,"iet":2227,"ieu":92,"iev":591,"iew":117,"iez":102,"iel":766,"iem":1503,"ien":2907,"iep":154,"ier":4481,"ies":1526,"ied":283,"ief":128,"iei":9680,"aş ":2678,"ig ":320,"iec":2094,"ifu":325,"ifo":500,"ifr":208,"iff":103,"ife":1805,"ifi":2918,"dă ":1783,"ifa":105,"icr":452,"ics":163,"ict":2757,"icu":1908,"ico":2006,"ick":421,"icl":554,"ici":10582,"ich":1701,"ice":8554,"ie ":34669,"ica":13932,"idu":1244,"idr":458,"ido":239,"idi":1327,"idg":110,"ide":3884,"ida":1312,"if ":119,"iic":157,"iaş":207,"iaţ":1579,"idă":222,"iin":4917,"iil":3254,"iit":1059,"il ":2791,"ija":114,"iji":265,"ijl":490,"ijo":87,"im ":1225,"ika":140,"aţa":1077,"aşe":616,"ige":485,"aşc":90,"aşa":330,"iga":1279,"ii ":23496,"igm":125,"igh":1464,"igi":3078,"aşi":1717,"icâ":83,"igu":863,"aşu":2407,"aşt":704,"igr":211,"aşo":348,"igo":379,"ign":421,"dăc":87,"iha":624,"ică":10472,"ihi":105,"dău":132,"iho":423,"dăr":156,"ik ":233,"imo":864,"imn":359,"imm":132,"imp":4726,"ieş":578,"imf":123,"ime":4318,"ieţ":356,"imi":3490,"ip ":1099,"inc":6719,"ind":7549,"ina":7598,"imu":2173,"inn":213,"inm":129,"ino":1626,"int":13234,"ins":3523,"inf":1447,"ine":10869,"inh":83,"ing":4928,"ini":7437,"inl":374,"ink":229,"ioa":4790,"ioc":312,"iod":317,"inu":2953,"inv":735,"iny":91,"inz":228,"iko":237,"icţ":367,"aţi":16926,"iki":186,"ike":152,"aţe":270,"ila":1467,"ilb":98,"in ":44693,"ilo":9977,"ill":1766,"ilm":2143,"igă":285,"ilh":99,"ili":9285,"ild":180,"ile":12078,"ima":4046,"imb":4580,"io ":1569,"ily":129,"ilt":143,"ilu":581,"ilv":1166,"hiş":314,"how":106,"hol":376,"hom":292,"hon":245,"hos":162,"hot":319,"hou":159,"hov":191,"hoo":89,"hop":132,"hor":542,"hoe":88,"hoc":86,"hni":581,"hno":368,"hne":86,"heţ":86,"hul":158,"hua":109,"htt":211,"hte":132,"hro":140,"hre":117,"hri":377,"ht ":1261,"hra":577,"hiţ":84,"hy ":109,"hwa":155,"hum":1416,"hus":146,"hur":185,"fi ":1715,"ffe":164,"ffi":135,"feu":99,"fet":117,"fes":1040,"fer":4398,"fec":1006,"fed":383,"feb":993,"fem":538,"fen":538,"fel":943,"fib":88,"fia":458,"faz":136,"fas":102,"fat":182,"far":402,"fap":490,"fam":1470,"fan":745,"fal":367,"fai":144,"fac":2220,"fab":663,"ff ":157,"fe ":258,"euş":129,"eză":2753,"fa 
":130,"exu":301,"ext":1548,"etă":1447,"exa":920,"ez ":2054,"ews":115,"exp":1323,"epţ":305,"esă":354,"exi":1412,"exc":376,"exe":917,"ezv":899,"ezu":584,"evă":275,"eza":830,"ezo":739,"eze":4121,"ezi":2997,"erţ":230,"eta":3771,"epâ":360,"ete":2601,"etc":489,"eti":2973,"eth":248,"etn":363,"esp":1890,"eso":846,"est":55654,"esu":952,"ess":1010,"ev ":251,"euc":86,"eud":222,"epă":213,"eum":115,"eul":878,"eun":627,"eto":805,"etr":2728,"ets":89,"ett":511,"etu":1024,"etw":102,"ew ":746,"eve":1810,"eva":1191,"evo":1396,"enţ":4104,"evi":2580,"eut":437,"eur":1309,"eus":264,"ex ":508,"ewi":114,"eră":1931,"evr":439,"ey ":1079,"ewa":103,"epe":1003,"epi":751,"eph":288,"er ":7653,"epa":1109,"eot":142,"eos":392,"eor":2351,"eom":197,"eol":739,"eop":176,"eon":471,"elă":165,"eiţ":151,"es ":5970,"ept":3637,"epu":3456,"epl":448,"epp":122,"epo":532,"epr":2782,"erk":178,"erl":756,"eri":23643,"erg":1582,"emă":409,"erh":102,"ere":9729,"erf":436,"erc":1961,"erd":628,"era":10049,"erb":1083,"et ":2968,"esk":95,"esl":106,"esf":637,"enă":298,"esh":139,"esi":1831,"esb":88,"esc":5445,"ese":2325,"eu ":1098,"esa":2035,"erz":182,"ery":156,"erv":1894,"eru":1701,"erw":96,"err":761,"ert":2020,"ers":6219,"ern":5307,"erm":6392,"erp":946,"ero":1890,"eki":83,"ecţ":1050,"en ":4825,"elb":109,"ela":2761,"eld":314,"elf":84,"ele":22448,"eli":2942,"elg":1037,"egă":470,"elm":166,"eln":87,"ell":1629,"elo":7272,"elu":2902,"elv":133,"els":286,"elt":287,"eo ":496,"emb":7566,"ema":2057,"eme":3406,"emn":2329,"emo":1281,"ehă":102,"emi":3276,"emu":1154,"emp":1201,"ems":94,"ep ":186,"ene":5641,"enh":144,"eng":1760,"enb":358,"ena":2023,"end":1865,"enc":997,"eno":950,"enn":450,"enk":132,"eni":7113,"enu":3506,"ens":2167,"ent":26508,"enr":218,"eoa":176,"enz":462,"eog":396,"eod":283,"eoc":172,"egl":156,"ego":650,"egn":92,"ege":2159,"ecâ":351,"egi":6402,"egh":185,"egr":796,"egu":603,"ehn":787,"eho":111,"ehe":163,"ehi":357,"ecă":398,"ek ":382,"eic":203,"eia":920,"eis":644,"eir":131,"eim":251,"eil":863,"ein":887,"eii":140,"edă":119,"eaţ":502,"eie":409,"eid":483,"eig":145,"eaş":214,"eja":336,"el ":7309,"eit":209,"eiu":167,"eke":85,"em ":1443,"eju":102,"giz":650,"giu":3192,"git":384,"gis":1283,"gir":91,"gil":280,"gim":999,"gaţ":227,"gip":292,"gin":3668,"gio":683,"gid":91,"gie":1712,"gic":1515,"gii":481,"gia":2426,"bău":105,"băr":238,"ght":1266,"băt":475,"băn":99,"ghi":1143,"ghe":1448,"gha":167,"cât":1186,"gi ":458,"câi":94,"cân":2264,"câm":225,"gen":4213,"geo":702,"get":450,"ger":3349,"ges":353,"gh ":216,"bă ":1169,"gea":351,"geb":118,"gem":253,"gel":950,"gda":132,"ge ":2096,"gaz":511,"gby":87,"gas":177,"gar":1662,"gau":115,"gat":2233,"gaj":154,"gam":133,"gal":1275,"gan":2759,"ga ":1522,"îşi":423,"fuz":427,"fur":551,"fus":129,"ful":658,"fun":1914,"ftw":294,"ft ":458,"fra":2347,"fre":553,"fri":632,"fiţ":85,"fro":290,"fru":551,"for":8285,"fos":13009,"fot":1551,"fon":2006,"fol":2314,"flă":1101,"fiş":196,"feţ":166,"foc":121,"foa":663,"fle":282,"fla":1128,"fli":157,"flu":1381,"flo":329,"fic":5576,"fie":1608,"fig":239,"fii":4052,"fil":2800,"faţ":1332,"fin":1931,"fir":582,"fis":97,"fit":276,"fiu":394,"fix":216,"fiz":803,"cuţ":360,"da ":5431,"dba":199,"de ":81669,"dac":568,"dad":169,"dal":1004,"dai":112,"daj":204,"dag":131,"dae":325,"dat":5418,"dar":2392,"dap":257,"dan":788,"dam":518,"day":88,"dav":93,"dau":120,"cup":2017,"cun":3394,"cul":7448,"cum":2248,"cui":2182,"cuf":114,"cub":97,"cuc":191,"cua":310,"ctu":3820,"ctr":1989,"cto":4340,"cti":3912,"cte":2267,"cta":977,"coţ":184,"cră":288,"cy 
":117,"cve":360,"cvi":112,"cva":166,"cus":289,"cur":6209,"cut":5650,"cuv":696,"ctă":260,"Șco":105,"cks":236,"ckh":87,"cla":1563,"cle":639,"clu":1691,"cli":610,"clo":419,"ceş":351,"cmi":98,"co ":1054,"cni":95,"cod":494,"coe":86,"cof":296,"cog":235,"coa":845,"cob":156,"coc":134,"con":13122,"coo":319,"col":4664,"com":13771,"cor":2665,"cos":422,"cop":2252,"cov":597,"cot":422,"cou":153,"cs ":186,"clă":363,"cqu":87,"ct ":2355,"cre":3245,"cra":1687,"cri":4361,"cru":523,"cro":1167,"cu ":14527,"cci":346,"cca":249,"cce":1997,"cea":5425,"cez":1350,"ch ":1051,"cev":105,"cer":3600,"ces":6831,"cet":999,"ceu":226,"cen":3570,"cep":2469,"cem":1612,"cel":8680,"cei":1228,"ceh":124,"cee":645,"ced":641,"ci ":3845,"Ști":135,"cha":1202,"Ște":246,"chw":220,"chu":202,"cia":8398,"ck ":1749,"cie":2502,"cid":710,"cic":613,"che":3039,"chl":127,"chi":6835,"cho":240,"chn":170,"chs":144,"cht":297,"ciz":320,"civ":411,"caţ":977,"cil":1200,"cim":273,"cif":735,"caş":448,"cii":3633,"cir":703,"cis":679,"cit":1218,"ciu":1361,"cin":2022,"cio":753,"cip":3172,"cm ":96,"cke":198,"ed ":964,"eba":172,"ebe":338,"ebi":493,"ebo":152,"ebr":2253,"ebu":793,"ec ":965,"eac":1155,"eag":683,"eae":329,"ead":219,"eak":166,"ean":4116,"eal":2295,"eam":780,"ear":779,"eas":2985,"eap":381,"eav":360,"eat":1944,"eau":812,"eaz":4296,"eb ":255,"ea ":37128,"efi":943,"efl":126,"efo":468,"efa":467,"efe":1778,"eff":109,"ei ":24037,"ega":2463,"eft":125,"efu":229,"eek":131,"een":318,"eel":266,"eea":861,"eed":149,"eer":336,"eep":102,"eet":278,"edi":6693,"ede":2885,"eda":988,"eg ":239,"edu":500,"edo":913,"edr":363,"ecl":455,"eck":180,"ech":2954,"eci":4375,"ece":4160,"eca":1262,"ee ":711,"ef ":336,"ecv":378,"ecu":2928,"ect":7556,"ecr":556,"eco":3176,"dwi":90,"dwa":128,"dy ":306,"dve":124,"dur":1540,"dut":183,"dus":2699,"duş":94,"dor":982,"dop":268,"don":1156,"dom":1511,"dol":361,"dox":527,"dow":256,"dov":2135,"dou":2054,"dos":139,"ds ":634,"diţ":1218,"diş":157,"deţ":4616,"dmi":1594,"dne":111,"doa":843,"dob":218,"doc":617,"dof":143,"doi":1173,"Țar":280,"dun":278,"dum":103,"dup":2223,"dui":145,"dul":3772,"dub":195,"dua":160,"duc":3456,"dri":740,"dra":1452,"dt ":159,"dre":2042,"du ":925,"dro":713,"dru":1503,"dge":174,"dic":3233,"did":167,"dia":4181,"der":5158,"des":4508,"det":654,"deu":131,"dev":1104,"dez":1907,"deb":451,"dea":1065,"ded":225,"dec":2801,"def":691,"dee":144,"deg":147,"dej":85,"dei":744,"del":2089,"den":4447,"dem":1462,"dep":2099,"deo":975,"di ":446,"dle":126,"dla":95,"deş":387,"do ":923,"dja":87,"div":1213,"diu":1854,"diz":147,"dim":595,"din":41384,"dio":1557,"dip":176,"dir":1178,"dis":3885,"dit":1572,"die":1235,"dif":1731,"dig":295,"dii":366,"daţ":232,"dil":110,"rgu":556,"rhe":302,"rj ":109,"rha":273,"rcă":369,"rhi":1066,"măr":2035,"măs":677,"măt":499,"măn":620,"rfu":326,"rga":2411,"ri ":15694,"rgi":1524,"rgh":746,"rbă":345,"rge":1425,"rgo":215,"ret":2582,"res":5915,"rev":1477,"reu":1156,"rew":92,"rez":5275,"rey":127,"mă ":3118,"rfa":103,"rfe":202,"rfi":117,"nân":632,"rfo":213,"rdu":767,"rds":382,"rg ":1476,"reb":516,"rea":18674,"ree":777,"ref":1708,"rec":5448,"red":1424,"rei":3494,"rej":104,"reg":5973,"rem":2573,"ren":2941,"rek":143,"rel":3576,"rer":308,"reo":342,"rep":4378,"rf ":235,"rda":852,"rcu":1210,"rct":172,"rdo":623,"rdi":1685,"rde":964,"re ":51071,"rbu":436,"rco":232,"rci":1263,"rch":609,"rce":1470,"rca":1029,"ray":96,"raz":629,"rd ":3327,"rap":865,"rar":2929,"ras":1358,"rat":9747,"rau":625,"rav":765,"rbi":1165,"rbo":575,"rba":839,"rbe":545,"rc 
":287,"raj":371,"rai":1767,"rah":296,"rag":920,"ran":9556,"ram":2869,"ral":5890,"rak":183,"rab":1135,"raf":2847,"rae":350,"rad":3282,"rac":2468,"rpu":387,"rpr":755,"rpo":278,"rs ":1565,"rpe":182,"rpa":307,"rpi":136,"ror":724,"ros":1102,"rot":1489,"rom":6745,"ron":3428,"roo":164,"rop":6403,"roz":338,"rou":752,"rov":4132,"row":176,"rox":732,"rob":828,"roa":1307,"rod":3362,"roc":2541,"roi":1094,"rol":1987,"rof":1423,"roe":237,"roh":90,"rog":1247,"rno":327,"rnu":701,"rp ":237,"rna":2969,"rne":1758,"rni":1303,"reţ":1190,"rmo":436,"rmu":522,"ro ":2108,"rma":9885,"rme":3062,"reş":4337,"rmi":1348,"rls":94,"rlo":266,"rgă":207,"rli":413,"rld":353,"rle":420,"rla":2658,"rn ":698,"rks":134,"rke":254,"rm ":799,"riz":1053,"rl ":211,"rip":540,"rio":5018,"rir":448,"rit":7787,"ris":4701,"riv":1528,"riu":2574,"rdă":172,"rig":3530,"raş":5092,"rij":321,"rii":7673,"ril":8587,"rik":126,"raţ":2453,"rin":9619,"rim":5400,"ria":7770,"rib":1255,"ric":11723,"rid":1224,"rie":15408,"rif":356,"rk ":829,"roş":363,"rsă":312,"rui":1637,"rug":366,"rud":206,"ruc":1749,"rur":434,"rup":3037,"run":754,"rum":1184,"rul":9603,"ruz":121,"rux":123,"rus":1703,"rut":1017,"rva":737,"rvi":837,"rve":659,"rvo":119,"ry ":1178,"rsk":120,"rsi":1656,"rso":2085,"rsc":90,"rsa":860,"rnă":515,"rsh":97,"rse":1189,"rta":3149,"rst":426,"rsu":1104,"rto":1146,"rte":6836,"rth":550,"rti":6445,"rua":1449,"rts":182,"rtr":166,"rtu":1485,"rtt":156,"riţ":1112,"riş":356,"rmâ":178,"rt ":2203,"rro":293,"rmă":1457,"rri":340,"rre":470,"rra":339,"ru ":12849,"rry":345,"sc ":2232,"sab":243,"sac":340,"sad":91,"sag":137,"sai":83,"saj":124,"sal":1207,"sam":459,"sbe":121,"san":693,"sau":8354,"sat":4809,"sas":155,"sar":1304,"sa ":3104,"ón ":120,"ruş":110,"rze":109,"rtă":914,"rzi":434,"sha":218,"sho":144,"năr":653,"năs":2089,"năt":775,"she":136,"scă":1232,"shi":636,"năl":262,"si ":1523,"sfâ":443,"siv":865,"sie":984,"sid":975,"sic":666,"sib":419,"sia":1164,"sk ":250,"nău":325,"sit":6826,"siu":1567,"sir":294,"sis":2018,"sip":93,"sin":2491,"sio":987,"sil":2030,"sim":1237,"sih":314,"sii":129,"sif":302,"sig":587,"scr":3431,"scu":7764,"sbu":216,"se ":12289,"sca":1105,"sce":729,"sci":701,"sch":1873,"sco":2491,"sex":343,"sey":122,"ser":5132,"ses":902,"set":368,"seu":253,"sfa":100,"sez":381,"sh ":403,"nă ":12507,"sfi":168,"sfe":495,"sfo":294,"sea":1239,"sei":529,"see":104,"sed":775,"sec":2976,"seb":398,"sep":1669,"seo":135,"sen":1576,"sem":2294,"sel":1265,"spu":788,"spo":1513,"spr":1609,"spe":3508,"spi":782,"spa":1327,"sou":172,"sov":534,"sol":1405,"som":99,"son":2390,"sop":133,"sor":985,"sos":101,"sod":147,"sof":666,"soa":1403,"soc":1613,"su ":139,"sra":283,"st ":20718,"ss ":769,"sli":132,"slo":207,"slu":183,"sla":1454,"sle":117,"ski":424,"sfă":558,"sko":122,"sm ":736,"ska":303,"sna":139,"sni":276,"sne":177,"smo":155,"seţ":118,"smu":996,"so ":251,"sma":281,"seş":442,"smi":416,"sme":397,"soţ":294,"stâ":421,"stă":4284,"sse":604,"ssa":443,"sso":336,"ssi":591,"ssu":98,"ste":53928,"spâ":306,"stf":531,"sta":11097,"stm":85,"sto":3769,"sti":8907,"stl":380,"stu":4079,"str":12855,"sty":123,"sud":1668,"sue":207,"sub":3715,"suc":517,"spă":179,"suf":275,"sul":3772,"sum":370,"sup":2843,"sun":3645,"sut":208,"sus":705,"sur":1681,"suv":86,"sy ":155,"tai":479,"taj":164,"tal":7011,"taf":116,"tag":288,"tab":988,"tac":696,"tad":477,"tc ":460,"tba":1494,"tax":122,"tav":190,"tau":320,"tat":21973,"tas":470,"tar":6168,"tap":132,"tan":6974,"tam":968,"tch":206,"te ":79600,"suş":110,"ta ":8238,"oză":144,"pa ":2141,"oxă":260,"pe 
":11904,"par":10822,"pat":2004,"pas":403,"pay":83,"pac":752,"pad":108,"pab":102,"pag":502,"pal":1642,"pai":83,"pap":190,"pam":183,"pan":5506,"phe":183,"pha":169,"pho":127,"phi":253,"pi ":238,"ph ":186,"lân":506,"pea":2043,"pec":3297,"ped":445,"pen":9052,"per":11036,"pet":1065,"pes":1059,"pei":1300,"pel":911,"pla":2428,"pli":1624,"ple":1254,"plo":453,"plu":926,"phy":102,"pia":1002,"pid":288,"pic":1543,"pie":1703,"pii":550,"paţ":844,"pil":610,"pin":728,"pio":918,"pir":800,"pis":848,"pit":1452,"piu":839,"poz":1416,"por":4692,"pop":3746,"pov":414,"pot":1327,"pos":1095,"poi":429,"pog":111,"pom":116,"pon":1731,"pol":4578,"poa":1462,"poe":647,"poc":495,"pod":314,"ps ":147,"ppe":285,"peş":178,"po ":105,"pta":1006,"pse":287,"psi":410,"pso":85,"ptu":1231,"pub":3609,"pte":2615,"pti":445,"pto":319,"pra":2308,"pt ":1178,"plă":316,"pru":251,"psa":94,"pri":13510,"pre":11982,"pro":14449,"pră":140,"ptă":561,"pur":1199,"pus":1351,"put":2590,"pun":2005,"pul":6398,"px ":1151,"puş":180,"puţ":344,"mân":11918,"lă ":9449,"iş ":380,"lăc":209,"lăd":384,"lăr":280,"lău":157,"lăt":576,"işa":158,"işc":590,"işe":100,"işi":457,"işo":462,"işn":143,"işu":248,"işt":1090,"iţe":420,"iţi":4507,"iţa":1141,"lăţ":94,"iţă":635,"qua":125,"que":368,"qui":218,"ra ":10628,"rb ":100,"ngo":269,"ngi":1205,"ngl":3065,"ngv":280,"ngu":1282,"ngr":328,"ngt":167,"ngs":258,"ni ":7085,"nge":1701,"ncâ":95,"ngh":727,"nga":1437,"nha":227,"ncă":762,"neg":502,"nei":3106,"nel":2471,"nen":1433,"nem":658,"nep":189,"neo":674,"ner":4256,"net":2067,"nes":1905,"nev":324,"neu":474,"ng ":2660,"nea":6161,"nec":713,"ned":727,"nee":362,"nef":188,"nfi":1313,"nfo":1463,"nfl":766,"nfr":253,"nfu":123,"ney":389,"nez":1743,"nex":158,"nfa":94,"nfe":701,"ncr":242,"nct":978,"nco":1033,"nci":6536,"ncl":974,"nce":5421,"nch":1298,"nca":697,"ne ":15507,"nbu":225,"ndu":3412,"ndr":2268,"nds":282,"ndo":1104,"ndi":3514,"nde":6309,"ndb":189,"nda":4140,"ncy":92,"ncu":853,"nal":8677,"nam":764,"nan":953,"nap":143,"nar":3126,"nac":272,"nad":606,"nag":295,"nah":123,"nai":648,"naj":750,"nc ":203,"nbe":172,"nd ":10185,"nav":597,"nau":373,"nat":5278,"nas":876,"naz":192,"na ":10566,"muş":108,"moţ":117,"nyi":101,"ntă":3762,"nz ":336,"ntâ":712,"nsă":705,"noţ":101,"ny ":515,"noş":120,"nvi":271,"nux":134,"nve":1249,"nva":170,"nul":11142,"num":9716,"nun":659,"nui":2359,"nus":463,"nut":1282,"nuu":86,"nuv":87,"nur":524,"nua":2364,"nue":133,"nuc":341,"nty":205,"nto":1327,"ntu":5345,"nts":104,"ntr":20429,"nti":5120,"nth":232,"nta":5926,"nte":13468,"nsu":2110,"nsn":130,"nsm":304,"nsp":737,"nso":628,"nst":5554,"nsf":385,"nse":1387,"nsh":116,"nsi":3286,"nsl":126,"nsk":220,"nsc":408,"nsa":2214,"nu ":2393,"nru":101,"nri":188,"nre":465,"nt ":10809,"niţ":1334,"niş":600,"ns ":1634,"noc":153,"nod":134,"noa":802,"nob":170,"nog":217,"nol":681,"noi":1449,"nop":228,"nom":2958,"non":580,"not":496,"nos":3264,"nor":3599,"nov":557,"nou":744,"noz":170,"nr ":214,"nne":796,"nna":357,"nno":163,"nni":320,"nny":106,"nme":156,"nma":86,"neţ":165,"neş":1107,"nli":309,"ngă":685,"nn ":394,"nla":407,"no ":1211,"nlo":244,"nkf":88,"nke":129,"ncţ":1362,"nki":120,"nka":125,"nkt":91,"înţ":344,"nje":175,"nja":194,"nju":195,"nii":3788,"ndă":489,"naş":348,"nig":141,"nif":691,"nie":6578,"nid":183,"nic":6607,"nib":123,"nia":13403,"nk ":409,"niz":1868,"nix":112,"niu":1843,"niv":2033,"nis":4310,"nit":7595,"nir":583,"nio":766,"nim":2048,"nin":1227,"naţ":2629,"nik":97,"nil":2118,"ogr":2357,"ogu":334,"ogi":3138,"ogl":113,"ogo":258,"ogn":118,"oga":345,"ogd":87,"oge":523,"oi 
":1939,"ohi":99,"oho":87,"ohn":440,"oha":219,"ocă":128,"ohe":86,"ois":381,"oir":138,"oiu":600,"oit":118,"oin":312,"oaţ":87,"oil":1036,"oaş":230,"odă":270,"înă":265,"oii":90,"oic":184,"oid":249,"oie":2586,"ok ":171,"oia":306,"obţ":455,"ol ":1950,"oiz":87,"oce":1371,"och":606,"oci":2584,"ock":1201,"ocl":282,"ocm":112,"oco":593,"împ":1524,"ocr":595,"obs":247,"obu":209,"oe ":171,"oca":5500,"occ":161,"îmb":234,"ode":1683,"odi":1388,"înv":645,"odo":1005,"înt":5727,"înr":506,"îns":1196,"odr":168,"ocu":4799,"înl":252,"oct":1467,"îng":438,"înf":1221,"of ":1300,"înd":673,"înc":2936,"îna":811,"oda":803,"oel":100,"oem":170,"oes":95,"oet":442,"oen":174,"ody":97,"odu":3682,"og ":550,"ofi":1691,"ofu":93,"oft":569,"ofo":587,"oez":194,"off":117,"ofe":1241,"ofa":91,"nzâ":121,"ob ":212,"îl ":203,"nză":237,"oc ":1888,"oap":522,"oan":2626,"oam":565,"oal":517,"oai":251,"oad":3000,"oac":410,"în ":51095,"oba":769,"od ":1364,"oar":8811,"oas":2028,"oat":2201,"obo":275,"obr":179,"obl":493,"obi":2216,"obe":608,"nza":270,"nze":236,"nuă":93,"nzi":410,"nzo":124,"nvă":302,"îi ":276,"otă":208,"oya":123,"oxi":1058,"oxe":145,"oz ":239,"ows":299,"own":215,"oră":461,"owi":83,"ovă":83,"orţ":934,"ozo":459,"oze":129,"ouă":1877,"ozi":1701,"oza":446,"otu":494,"oua":772,"ow ":278,"olţ":86,"oti":981,"oth":247,"ote":1814,"ott":429,"ots":89,"otr":617,"oto":1723,"ost":14678,"osu":176,"ota":1624,"otb":1480,"ov ":924,"osi":2418,"osh":105,"onă":692,"ose":1383,"osf":245,"osp":184,"oss":270,"osm":217,"osl":251,"oso":685,"osn":103,"oy ":170,"owa":92,"owe":145,"ovi":5242,"onţ":708,"ovo":380,"ovs":162,"ouv":83,"ox ":338,"ova":2953,"ove":2093,"orâ":88,"oug":124,"oui":176,"oul":710,"oun":598,"oup":188,"ous":347,"our":791,"out":476,"opo":1954,"opp":88,"opi":1812,"opl":160,"ope":4845,"oph":214,"opa":1115,"os ":2802,"opu":4445,"opr":1065,"opt":660,"ops":95,"oon":154,"ool":264,"ook":218,"ood":367,"or ":26373,"oot":145,"oor":339,"oop":114,"ork":590,"orl":449,"orm":7578,"orn":1037,"oro":968,"orp":872,"orr":316,"orc":286,"ord":5185,"ore":2827,"orf":352,"org":3181,"omă":1147,"ori":16103,"orj":132,"ou ":829,"osa":293,"osc":3302,"ort":5002,"ors":359,"orv":285,"oru":3894,"ory":244,"olă":311,"omâ":10245,"m² ":271,"ot ":1395,"orb":953,"ora":7147,"ola":2095,"old":2473,"olc":184,"on ":8021,"olj":96,"oli":6258,"oll":628,"olk":148,"olf":429,"ole":2350,"olg":83,"ols":165,"olt":1104,"olm":146,"oln":127,"olo":7169,"oly":117,"olu":4417,"olv":270,"om ":2396,"oki":92,"oke":139,"ona":10266,"ond":4478,"onc":1725,"onf":1659,"one":3635,"ong":702,"onj":211,"oni":6032,"onl":252,"onn":355,"ono":3183,"ons":6444,"ont":4103,"onu":2421,"onv":453,"ony":208,"onz":212,"oma":4275,"oo ":114,"ome":3698,"omb":2289,"omi":3631,"omm":249,"omp":6241,"omn":706,"omo":1504,"omu":6796,"op ":1017,"la ":23354,"lb ":181,"le ":40374,"lce":389,"lca":429,"lcl":134,"lch":83,"lci":164,"lcu":522,"lco":212,"lf ":227,"lde":317,"lda":182,"ldo":2003,"ldi":182,"ldu":132,"lab":636,"lac":1028,"lad":507,"lae":470,"lah":161,"lag":391,"laj":309,"lai":336,"lal":366,"lan":7785,"lam":2475,"lap":209,"lar":3822,"lat":4828,"las":2223,"lax":136,"lau":569,"lav":737,"lay":251,"lba":949,"ld ":1067,"lbe":538,"lbi":174,"lbo":115,"lbu":1909,"ky ":197,"kso":178,"lpi":329,"lph":128,"ls 
":385,"lpt":166,"lon":2782,"lom":608,"lop":364,"loo":88,"lor":17380,"lod":381,"loc":8139,"log":3938,"loi":220,"los":2548,"lot":440,"lou":173,"lov":661,"low":107,"loz":395,"lni":416,"lne":136,"loa":1027,"lob":416,"lmo":117,"lmi":101,"leţ":90,"lme":740,"leş":291,"lma":385,"lna":86,"lmu":506,"lth":83,"lti":1438,"lto":515,"ltr":114,"ltu":1401,"lud":310,"luc":1053,"lub":644,"lua":671,"lug":175,"lue":1286,"lsi":131,"lso":99,"lst":136,"lta":1833,"lte":3356,"lu ":1358,"lse":84,"lsa":93,"liţ":378,"ía ":107,"liş":264,"lt ":1778,"găt":711,"gău":109,"găr":308,"găs":508,"lhe":118,"lcă":318,"lj ":83,"lgo":101,"lge":272,"lgi":975,"lbă":158,"li ":1453,"lga":583,"lfu":196,"lfo":121,"gă ":1042,"lfa":415,"lez":1735,"ley":333,"lex":1098,"leu":326,"lev":1072,"les":2557,"let":1046,"ler":1357,"leo":330,"lep":98,"lem":2259,"len":1650,"lek":93,"lel":1961,"lei":1502,"leg":2868,"lef":302,"lee":148,"led":336,"lec":2987,"leb":331,"lea":5194,"lls":187,"llu":109,"lly":238,"lo ":746,"lla":922,"lle":1509,"lli":1027,"llo":392,"ln ":118,"lm ":1367,"ll ":1537,"lit":10843,"lis":3477,"lir":181,"lip":665,"lio":847,"lin":3950,"lim":4821,"liz":3596,"liv":266,"liu":800,"lic":6840,"lid":584,"lia":5811,"lib":815,"lk ":118,"lik":113,"laţ":3598,"lil":379,"lii":609,"lig":1016,"laş":630,"lie":5757,"lif":704,"ma ":3957,"mb ":1486,"mac":332,"mai":9505,"maj":528,"mad":521,"mae":84,"mag":1419,"mar":6545,"mas":1035,"mal":1356,"mam":301,"man":9186,"max":174,"mat":6955,"mba":2864,"mbl":374,"mbi":1641,"mbe":588,"mbr":9100,"mbo":699,"me ":5049,"mbu":511,"med":2490,"meg":129,"mea":1453,"mec":632,"eş ":976,"met":2646,"mes":1043,"mer":5757,"mem":2715,"mel":4206,"men":12391,"mei":953,"hă ":127,"mez":105,"mex":145,"mfo":85,"luz":516,"lva":1356,"lve":558,"lvi":203,"lul":6901,"luj":787,"lui":26427,"lup":481,"lun":1806,"lum":1944,"lut":376,"lus":1014,"lur":909,"lux":157,"luv":232,"ly ":423,"ltă":720,"luţ":840,"mpi":1771,"mpe":1922,"mpr":847,"mpo":2882,"mpl":2315,"mpu":2400,"mps":120,"mpt":100,"ms ":246,"mog":102,"moc":469,"mob":459,"mod":2074,"mon":3526,"mom":281,"mol":693,"mov":314,"mor":1244,"mos":450,"mot":668,"mou":110,"mpa":3076,"mnâ":163,"miş":960,"miţ":248,"mto":98,"mst":145,"mnă":376,"moş":151,"my ":245,"mur":1183,"mus":438,"mut":273,"mpă":755,"mul":9511,"mun":7969,"muz":1842,"eşt":6641,"eşu":215,"eşa":195,"mi ":621,"eşi":561,"mbă":561,"eşe":1204,"min":5735,"mio":138,"mil":3851,"mir":1594,"mis":1523,"mit":6241,"miu":551,"miz":90,"mix":115,"eţ ":436,"mic":3196,"mia":589,"mig":156,"maş":321,"mif":173,"mie":1057,"mid":167,"maţ":1711,"mij":400,"mii":918,"mo ":311,"mn ":1067,"eţu":4047,"eţi":1636,"eţe":1089,"eţa":97,"mm ":112,"moa":417,"mnu":348,"mni":697,"mna":983,"mne":568,"mmy":134,"mp ":917,"mmo":100,"mma":189,"meş":407,"mme":257,"ăţă":263,"vă ":1622,"văr":375,"văz":128,"sţi":293,"văţ":340,"ziţ":770,"zvo":1221,"zua":152,"zur":207,"zul":782,"zut":213,"zz ":105,"rşi":489,"zi ":1628,"vâr":386,"vân":1791,"zec":677,"zei":499,"zea":690,"zdu":96,"uă ":1773,"zeu":674,"zes":98,"zen":2310,"zel":386,"zer":601,"ze ":1470,"zbo":1342,"zbu":165,"zf ":120,"zac":138,"zah":127,"zam":95,"zan":432,"zar":1712,"zau":120,"zat":4549,"zoa":213,"zot":117,"zor":484,"zom":97,"zon":1721,"zol":318,"zof":297,"rţă":100,"zo ":122,"rţe":443,"rţa":169,"rţi":2629,"rţu":83,"rţ ":87,"uăz":94,"zib":96,"zia":1295,"zie":586,"zid":234,"zic":2773,"zii":433,"zin":2192,"zim":92,"zil":806,"zaţ":846,"zio":247,"zir":91,"zis":507,"zit":1014,"ziu":1413,"yst":189,"ysi":92,"ys ":152,"tăţ":3951,"yon":90,"za ":1974,"ywo":86,"ye ":94,"tă ":27261,"yer":209,"ya ":253,"yan":149,"yn ":128,"yle":187,"yo 
":86,"yne":94,"tăl":523,"tăn":89,"tăm":184,"tăz":367,"tăt":383,"tăr":1670,"tăp":108,"yin":96,"tîn":184,"xt ":219,"xtr":935,"xtu":126,"xte":487,"xti":276,"xpr":511,"xpo":137,"xpl":405,"xpe":325,"xon":265,"ăţi":4022,"ăţe":252,"ăşe":185,"ăşi":118,"ăşo":168,"ăşu":379,"ăşt":194,"xul":217,"xua":180,"săm":195,"săn":94,"săi":149,"tât":368,"târ":286,"tân":1123,"să ":6878,"tâi":116,"tâl":280,"tâm":92,"xem":547,"xer":167,"xec":276,"xel":184,"pţi":497,"xis":976,"xil":150,"xim":851,"xid":122,"xic":439,"xig":85,"săr":830,"săs":204,"săp":211,"săt":291,"său":989,"xcl":154,"xce":187,"xe ":275,"xat":91,"xan":620,"xac":149,"ww ":168,"www":168,"oţi":926,"oţe":90,"wn ":181,"ws ":286,"wre":154,"wor":208,"woo":193,"răş":270,"răţ":90,"sân":187,"ră ":10046,"wer":171,"wel":112,"wei":114,"web":177,"oş ":108,"răb":152,"răd":292,"răc":168,"răg":121,"răj":88,"răi":753,"răn":163,"răm":369,"oşt":362,"oşu":230,"oşa":206,"oşi":435,"sâr":193,"wis":119,"wig":94,"wic":83,"win":151,"rău":112,"răs":796,"răt":330,"răr":765,"răz":877,"wa ":123,"wan":150,"wal":599,"way":113,"war":714,"viţ":222,"viş":91,"vro":102,"vri":167,"vre":765,"vra":124,"vsk":216,"vut":1014,"vul":660,"vy ":150,"via":1245,"nţ ":202,"vio":597,"vir":425,"vaţ":256,"vil":1004,"vin":4712,"vig":218,"vii":288,"vic":1489,"vid":1022,"vie":1344,"viz":1818,"viu":343,"vit":1797,"vis":1422,"nţa":4036,"nţe":2750,"nţi":4753,"nţu":270,"vo ":269,"veţ":303,"veş":233,"voa":145,"voc":768,"vod":247,"voi":344,"vol":2372,"von":307,"vor":1265,"vot":169,"vos":108,"nţă":2104,"vi ":279,"rât":86,"râu":1467,"râr":88,"vez":158,"ver":5755,"ves":2284,"vet":135,"râm":188,"rân":915,"vei":383,"veh":194,"veg":396,"ven":3368,"vel":1246,"vea":1522,"ved":733,"vec":1443,"ve ":2034,"val":1848,"van":2141,"var":1789,"vat":1229,"vas":355,"vaz":94,"vac":381,"vad":175,"vai":89,"uză":207,"uzâ":97,"va ":4378,"usţ":288,"uzu":128,"uzi":2633,"uvâ":532,"uze":512,"uza":616,"ută":2040,"uxe":267,"usă":1693,"uz ":167,"ură":2750,"upă":3161,"ux ":398,"uvi":580,"unţ":1150,"uve":1576,"urâ":83,"ună":2252,"ush":108,"usi":1228,"use":1795,"usc":466,"usa":280,"uu ":89,"usu":380,"ust":4305,"uss":282,"uso":94,"ulţ":391,"uth":338,"uti":1787,"ute":2406,"uta":1302,"utt":99,"uts":136,"utu":1616,"uto":3287,"utr":237,"us ":5190,"ulă":561,"ut ":7246,"urb":404,"ura":6173,"urd":294,"urc":802,"ure":4226,"umă":1226,"urg":1586,"uri":11434,"urk":100,"urm":1472,"urn":1161,"uro":4600,"urp":138,"urr":101,"urs":2050,"urt":1427,"uru":770,"ury":127,"urz":103,"unz":236,"upa":1044,"ur ":1460,"păş":90,"upi":117,"upe":1889,"upo":171,"upr":3078,"upl":134,"upt":549,"upu":1021,"ump":261,"umu":1472,"umi":4694,"umo":274,"umn":200,"uma":1213,"umb":1802,"ume":7127,"unt":4936,"uns":365,"unu":4147,"unk":173,"uni":10730,"uno":4024,"unn":89,"unc":3006,"und":2612,"una":3959,"ung":1996,"une":10686,"up ":833,"ucţ":871,"um ":3644,"ulu":21791,"ult":5763,"uls":125,"ulp":273,"ulo":572,"ulm":181,"ull":195,"uli":2417,"ugă":238,"ulg":558,"ule":2260,"ulc":332,"ula":6764,"ulb":106,"un ":26513,"uid":145,"uie":791,"uaţ":284,"uil":227,"uin":534,"uir":346,"uis":305,"păt":304,"păs":368,"uk ":108,"mţ ":178,"uia":763,"uit":3995,"ul ":62901,"ucâ":92,"ugh":328,"ugi":103,"ugb":125,"uge":384,"ugo":240,"ufu":127,"ui ":29161,"uga":410,"păi":109,"ucă":1628,"păd":164,"păr":2478,"păm":252,"ugu":1856,"uha":84,"uj ":675,"uco":175,"ucr":1114,"uct":1068,"ucu":1790,"uda":563,"ude":4922,"udi":2446,"ubo":209,"ubm":96,"ubs":626,"ubt":149,"ubu":425,"uca":1046,"ue ":469,"uce":1968,"ucc":519,"uci":711,"uch":237,"ucl":385,"uck":119,"uer":254,"ues":246,"pă 
":4354,"uff":102,"ufe":113,"ufi":109,"ufl":99,"udu":394,"udo":402,"ug ":96,"ued":469,"uea":236,"uen":1166,"uel":339,"ub ":2463,"tuţ":713,"ua ":1456,"uat":4356,"uar":3767,"ual":1937,"uan":471,"ubi":491,"ubl":4031,"ube":197,"uba":209,"ubc":93,"ubd":102,"ud ":2013,"uad":93,"uc ":293,"tze":90,"tyl":114,"tuş":101,"ty ":780,"twa":283,"trâ":424,"tur":9303,"tus":270,"tut":920,"tui":1608,"tul":18833,"tun":827,"tum":175,"tua":5218,"tud":1979,"tue":106,"tug":400,"tz ":249,"two":138,"toţ":88,"tră":1759,"toş":149,"ts ":552,"tiţ":743,"tiş":169,"tre":13329,"tt ":332,"tra":10048,"tri":7435,"tru":15106,"tro":4167,"tu ":332,"tsc":140,"tsu":91,"tst":102,"tta":206,"tte":678,"tti":236,"ttl":111,"tto":150,"ttp":214,"tts":95,"teş":321,"tme":92,"tma":135,"to ":1187,"tmo":189,"tni":304,"tne":121,"tp ":211,"tna":149,"tno":89,"tof":98,"tod":1039,"toc":696,"toi":238,"tog":562,"tob":142,"toa":4019,"tou":153,"tov":167,"tos":511,"tot":1088,"tox":87,"tom":2691,"ton":3163,"tol":1579,"tor":17808,"top":634,"tr ":2261,"tii":1559,"til":2704,"taţ":1027,"tif":641,"tie":3100,"taş":87,"tig":849,"tir":1365,"tit":3982,"tis":1483,"tin":5976,"tim":3231,"tip":1320,"tio":2187,"thu":1487,"tia":853,"tib":219,"tic":12965,"tid":1357,"tiz":272,"tiu":313,"tiv":7387,"lţi":875,"tli":252,"tlu":762,"tla":333,"tle":501,"tem":5382,"ten":2986,"teo":833,"tep":232,"tei":2254,"tej":251,"tel":8378,"tee":83,"tef":321,"teg":901,"teh":698,"tea":10690,"teb":131,"tec":1715,"ted":479,"pân":2359,"tfo":164,"tfe":489,"tfa":104,"th ":1080,"tez":721,"tex":467,"tev":339,"teu":83,"tet":387,"tes":961,"ter":17000,"ti ":4750,"pâr":90,"tho":249,"thr":172,"the":1255,"thi":201,"tha":314,"ăpi":117,"ăr ":639,"ărg":110,"ări":6448,"ărc":171,"ăre":1669,"ărb":346,"ăra":1104,"ăpu":119,"ăpt":200,"ălă":333,"ămâ":1238,"ăsp":468,"ăsu":674,"ăst":933,"ăta":498,"ăte":138,"ăpâ":120,"ăti":182,"ălţ":396,"ăto":3911,"ăro":211,"ăru":1791,"ărt":197,"ărs":101,"ău ":1645,"ăsc":1578,"ăse":603,"ănă":998,"ăsi":241,"ăut":251,"ărâ":207,"ătu":1360,"ătr":2034,"ăud":174,"ăul":112,"ăsă":746,"ără":1000,"ăzu":203,"ărţ":804,"ăzi":480,"ăze":118,"ăzd":88,"ăzb":1237,"ătă":689,"ăuţ":165,"ăce":168,"ăca":435,"ăci":357,"ăcu":469,"ăde":218,"ădi":504,"ădu":392,"ăi ":268,"ăia":213,"ăie":323,"ăcă":141,"ăl ":131,"ăin":267,"ăit":151,"ădă":164,"ăil":231,"ăgă":140,"ăle":134,"ălc":90,"ăld":111,"ăla":258,"ălb":84,"ăma":126,"ămi":144,"ăli":530,"ălu":247,"ăne":410,"ănc":84,"ăni":157,"ză ":7802,"zăr":410,"zăt":197,"zău":117,"zân":327,"ürt":162,"uş ":157,"xă ":377,"uţi":2814,"uşu":97,"uşt":92,"uşo":225,"uşe":140,"uşc":117,"uşi":662},"n_words":[8213281,9706468,7977386],"name":"ro"}
\ No newline at end of file
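Note on the file removed above: each deleted profile is a single JSON object whose "freq" map appears to hold counts of 1- to 3-character n-grams, whose "n_words" array appears to hold the total n-gram counts per order, and whose "name" field is the language code ("ro" here). The following is a minimal, illustrative sketch of how text can be scored against a profile of this shape; it is a simplified naive-Bayes-style scorer, not the langdetect library's actual detection algorithm, and all function and variable names are hypothetical.

import json
import math

def load_profile(path):
    # Load one profile file of the shape shown above (illustrative helper).
    with open(path, encoding="utf-8") as f:
        return json.load(f)

def score(text, profile, smoothing=0.5):
    # Sum log-probabilities of the text's 1-3 character n-grams under the
    # profile. Additive smoothing keeps unseen n-grams from zeroing the score.
    freq, totals = profile["freq"], profile["n_words"]
    logp = 0.0
    for n in (1, 2, 3):
        total = totals[n - 1]  # assumed: per-order n-gram total
        for i in range(len(text) - n + 1):
            gram = text[i:i + n]
            logp += math.log((freq.get(gram, 0) + smoothing) / (total + smoothing))
    return logp

Usage would be to load several profiles and pick the language whose profile gives the highest score, e.g. max(profiles, key=lambda pr: score(text, pr))["name"].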
+++ /dev/null
-{"freq":{"D":12949,"E":10306,"F":8444,"G":9635,"A":18320,"B":11812,"C":18729,"L":10133,"M":18025,"N":9350,"O":8480,"H":7852,"I":36250,"K":4555,"U":4649,"T":15138,"W":7160,"V":11078,"P":15071,"S":24620,"R":10933,"X":14985,"f":12270,"g":18709,"d":30289,"e":107154,"b":13058,"c":32324,"a":88793,"n":72221,"o":77565,"l":46717,"m":27155,"k":11440,"h":29689,"i":78874,"w":9055,"v":9950,"u":33414,"t":65000,"s":53078,"r":72396,"p":19491,"z":3846,"y":15658,"x":6168,"́":88446,"ь":418052,"э":80439,"ю":190059," o":3966,"я":756306,"ш":164289,"щ":122233,"ъ":10879,"ы":569718,"ф":163421,"х":364215,"ц":246632,"ч":362951,"р":2010938," a":3919,"с":1933070,"т":1865754,"у":741481," t":5450,"ё":73990,"И":44043,"Л":43881,"К":96587,"Н":57501,"М":84561,"П":106733,"О":53561,"Б":68591,"А":88041,"Г":58489,"В":87514,"Е":15516,"Д":50036,"З":22723,"Ж":9219,"Ш":21342," H":6358," I":14743,"Ю":9600," N":6330,"Я":13672," O":4736," L":7278,"Э":22580," M":13492,"Т":45273," B":8977,"У":23856," C":13678,"Р":75819," A":12906,"С":148731,"Ц":10320," F":6714,"Ч":15862," G":7609,"Ф":36383," D":9169,"Х":20421," E":6845,"л":1373509,"к":1271742,"й":669125,"и":2990916," X":9849,"п":840034,"о":3520831,"н":2386826,"м":997958,"г":667312," S":16699,"в":1520663," R":8106,"б":419069,"а":2917550," P":10356," W":6218,"з":510343,"ж":242497," V":5002,"е":2737376,"д":981077," T":11223," А":74197," Б":66425," В":84271," Г":55440," Д":46951," Е":13561," Ж":8620," З":21445," И":40121," К":90300," Л":41939," М":81442," Н":51890," О":47190," П":101697,"Co":3768,"I ":12785," б":96560," а":114180," г":188817," в":459004," е":32581," д":177939," з":81625," ж":31277," й":4814," и":369580," л":56771," к":233167," н":240854," м":177553," п":455990," о":265653," Р":63171," С":125419," Т":41894," У":21149," Ф":32286," Х":19430," Ц":9599," Ч":15364," Ш":14654," Э":20859," Ю":9415," Я":13422,"C ":3880," т":131031," у":81588," р":185096," с":401251," ц":26307," ч":72261," ф":71734," х":30938," ш":31141," ю":9238," я":35568," э":51707,"Ma":3809,"II":10412,"VI":3879,"Th":4668,"X ":5588,"S ":3829,"a ":12175,"i ":4290,"he":9109,"g ":4678,"ea":4162,"ec":3745,"de":5482,"h ":4087,"el":5609,"en":10589,"et":4586,"es":8452,"er":17575,"e ":29556,"f ":4058,"ch":4543,"ce":3891,"c ":3917,"d ":10245,"at":8929,"as":4202,"ar":11277,"al":8675,"am":4333,"an":13622,"ac":3987,"nt":7777,"ns":3721,"of":4484,"om":4497,"on":14150,"ol":4531,"os":3740,"ou":4174,"or":10679,"r ":11184,"ll":5038,"o ":6694,"ma":4022,"me":5234,"na":4836,"nd":7645,"ne":5797,"ng":6066,"ni":4386,"m ":5088,"li":5421,"le":7129,"la":5868,"n ":18576,"ic":8588,"ia":4454,"ie":3873,"k ":4036,"is":6529,"it":5343,"il":4505,"in":14086,"io":6997,"l ":9526,"y ":8801,"ve":4522,"x ":4062,"ur":4265,"us":4229,"to":4590,"te":9298,"ti":9123,"th":7132,"ta":6068,"st":7180,"ro":7763,"ri":8232,"re":8155,"ra":7850,"t ":13376,"s ":22614,"́в":5893,"́д":3913,"́й":4535,"́м":3791,"́л":9850,"́н":15831,"́р":9052,"́с":4186,"́т":6032,"К ":4112,"О ":4223,"А ":10572,"В 
":18440,"ья":15305,"ью":22763,"эл":12007,"эк":10864,"эр":3980,"эн":8177,"эт":23791,"юб":4435,"юг":4684,"юд":5402,"юз":4895,"юл":6476,"юн":7127,"юр":4992,"юс":3863,"ют":24842,"юч":8444,"ющ":38941,"яж":4621,"яе":26767,"яд":10334,"яв":18798,"яб":17686,"ян":29130,"ям":15306,"ял":9628,"яз":22046,"ях":9639,"яс":6449,"ят":41978,"яр":7070,"яю":12156,"ящ":14702,"щи":42463,"ще":53089,"ща":18897,"ъе":8051,"ый":115257,"ык":21356,"ые":44083,"ыд":3688,"ыв":20957,"ыш":9845,"ыч":6916,"ых":88893,"ыт":10971,"ыс":15933,"ыр":7427,"ып":8198,"ын":9614,"ым":40979,"ыл":22285,"ьк":10517,"ьд":4034,"ье":17860,"ьз":12362,"ьб":9420,"ьш":16573,"ьс":28658,"ьт":14115,"ьн":93902,"ьм":17794,"хе":5192,"хи":15842,"хн":9628,"хо":65229,"хр":8875,"хс":5190,"ху":8026,"фф":4256,"ха":22339,"ци":120621,"цк":12640,"цо":4729,"цу":6546,"ца":29752,"це":46729,"чл":3954,"чн":40293,"чи":44063,"чк":5394,"чт":9754,"цы":8412,"че":142550,"ча":74585,"шо":6953,"шн":6580,"шк":7840,"шл":7340,"ши":40508,"шт":17550,"ше":40253,"чё":6388,"ша":19003,"ск":366421,"см":15604,"сл":64229,"со":139920,"сн":41282,"ср":13223,"сп":78471,"св":31144,"рё":5020,"сб":4953,"се":102280,"си":96102,"рш":7321,"ры":45781,"рь":12146,"са":63371,"ря":46850,"рр":14529,"рс":52200,"рт":57700,"ру":105041,"рх":14747,"рц":4365,"тн":65472,"тм":3983,"тл":9354,"тк":17989,"тс":84335,"тр":155968,"то":273813,"те":235202,"тд":4232,"тв":124455,"сё":6893,"ти":199676,"сы":9941,"сь":18264,"та":266195,"ся":89398,"тб":4549,"су":38119,"сф":3977,"сс":88379,"ст":540751,"сч":5733,"сш":4240,"сх":8473,"сц":3788,"ур":58652,"уп":39068,"ут":30026,"ус":56106,"ум":19318,"ул":47107,"ун":41314,"уи":4250,"уз":23986,"ук":30175,"уд":42687,"уг":33231,"уж":19840,"уе":15435,"уа":10851,"тя":20201,"уб":26698,"ув":5212,"ты":37440,"ть":74835,"тч":3850,"тт":6006,"ту":57964,"фу":8016,"фр":15690,"фо":29756,"фи":45878,"фе":25068,"фа":16306,"ую":32657,"ущ":17182,"уч":38493,"уш":8215,"ух":10059,"ёт":8989,"ён":22517,"ёр":12412,"ёл":4881,"ём":5538," II":3728," Ma":3710,"а ":616120,"Р ":10559,"С ":6434,"Ис":5682,"Ин":6739,"к ":120218,"Из":4203,"Ив":3823,"й ":507309,"Ле":11105,"Ли":8362,"Ла":6540,"Ку":8515,"Ко":21705,"м ":237652,"Кр":10087,"Ки":8491,"Ка":27548,"л ":64722,"На":20616,"Не":6593,"Ни":9075,"Му":3926,"Мо":21246,"о ":404148,"Ма":25177,"Ми":12922,"Ме":9781,"Ло":5700,"н ":135483,"Лу":3901,"Па":15703,"Пе":17521,"Пи":4313,"Пл":3876,"с ":91974,"По":29660,"р ":85184,"Ос":6849,"Ор":3751,"От":4582,"Об":5237,"Он":3901,"Ол":8342,"Но":9337,"п ":8470,"в ":366455,"Ам":4416,"Ан":11755,"Ал":14756,"Ав":5862,"Ба":12391,"Ар":8812,"б ":7229,"д ":64292,"Во":11807,"Ве":14286,"Ви":9847,"Га":7333,"Бо":12192,"г ":35799,"Бр":14227,"Бе":12321,"Би":3781,"Ва":10241,"Бу":4848,"Ди":3742,"Дж":8980,"Де":6705,"ж ":6087,"До":7659," Th":4595,"Ев":5669,"Ге":11433,"Гр":11587,"Го":10198,"е ":450788,"Да":6449,"и ":574901,"За":11852,"з ":53551,"ь ":133456,"е́":15696,"ы ":130752,"и́":16589,"ё ":6265,"Яв":5487,"ю ":63842,"я ":490850,"Эт":4232,"Ст":11217,"Су":5933,"Та":9444,"Ти":3758,"Те":7001,"ф ":6051,"То":6501,"Тр":6028,"Ту":4280,"Ук":4529,"х ":211491,"Пр":23361,"Ра":11292,"Ре":10364,"Ри":7652,"СР":8553,"т ":206463,"Ро":21953,"Ру":5315,"СС":13685,"СШ":6418,"Са":18244,"Св":4560,"Си":8236,"Се":17477,"у ":114144,"Со":23127,"Це":5862,"ш ":6132,"а́":24040,"Че":6673,"ША":6418,"ц ":11865,"Фр":9086,"Фе":6645,"Фи":4961,"Ха":6486,"ч 
":21859,"Хо":3899,"ль":255358,"лы":10808,"лю":22731,"мб":8994,"ля":104684,"ма":130982,"лё":9664,"ме":172608,"ми":140084,"мл":6471,"лл":23806,"лн":13959,"ло":163517,"лс":10825,"лт":4507,"лу":35325,"ла":158459,"лж":4211,"ле":205849,"лд":3683,"лг":5390,"лк":10879,"ли":255185,"км":7931,"кн":9321,"кк":4284,"кл":31050,"кр":60959,"кс":31731,"ко":383857,"кт":66101,"ку":41628,"кц":9152,"ка":219076,"кж":14683,"ки":210538,"кв":18587,"ке":35320,"йн":21970,"йо":19765,"йр":3918,"йк":4778,"йл":4612,"йш":7189,"йс":69884,"йт":6056,"ия":163126,"ию":29435,"ищ":4465,"иш":6262,"у́":5838,"йд":4105,"ио":61311,"ип":31810,"им":121890,"ин":226639,"ик":130372,"ил":122490,"ии":133366,"ий":180537,"иц":62824,"ич":108247,"иф":12791,"их":84356,"ит":178471,"иу":4714,"ир":73443,"ис":176435,"ри":235653,"рк":23089,"рл":11046,"рм":37955,"рн":63599,"ро":363111,"рп":4622,"ра":429505,"рб":11729,"рв":24770,"рг":34028,"рд":18157,"ре":279331,"рж":10477,"пь":4901,"пы":11436,"пр":223152,"пп":19192,"пт":6272,"пс":5990,"пу":32229,"пи":56651,"пн":7871,"по":247631,"пл":31561,"ою":7698,"оя":26385,"па":81900,"пе":95232,"ощ":8932,"ош":11972,"оч":34323,"оц":13939,"оэ":5861,"ос":271102,"ор":307758,"оп":60188,"оо":12934,"ох":15431,"оф":19467,"оу":5082,"от":162519,"ок":93705,"ол":228863,"ом":207667,"он":199509,"ож":50393,"оз":54318,"ои":31770,"ой":205541,"ов":359690,"нё":5120,"ог":196172,"од":289964,"ое":90767,"ня":35079,"оа":5488,"об":119246,"нь":18690,"ны":243593,"нц":33951,"нч":4962,"нт":90634,"нс":97833,"нф":7203,"ну":26904,"но":402371,"нн":163382,"нр":4657,"нк":27682,"нз":4096,"ни":399245,"не":163042,"нг":32092,"нд":56240,"мё":3976,"нв":7556,"на":409072,"мя":17291,"мь":4683,"мы":32208,"му":54274,"мс":10476,"мп":42062,"мо":86128,"мн":19897,"мм":15488,"ге":39235,"гд":7818,"ги":64206,"гн":5587,"го":307537,"гл":27873,"гм":4198,"гр":76619,"гу":22493,"дв":21897,"да":163117,"вг":7437,"ве":186015,"ви":112054,"вк":13130,"вл":60895,"вм":3919,"вн":71300,"во":212143,"вп":4179,"вр":34804,"вс":46803,"ву":21309,"вт":16905,"вх":16990,"вш":21902,"вы":73241,"вь":4164,"га":61976,"вя":19117,"вё":4031,"би":27123,"бе":43659,"бр":63607,"бн":9059,"бо":89921,"бл":44500,"бу":21528,"бс":7153,"ва":210954,"бы":35755,"бъ":8402,"бщ":13191,"ад":83035,"ае":38603,"аж":22477,"аз":106308,"аб":41193,"ав":144996,"аг":30571,"ам":106564,"ан":372615,"ап":45088,"аи":20853,"ай":49922,"ак":101386,"ал":220430,"ах":49626,"аф":15259,"ач":35451,"ац":44113,"ас":161556,"ар":181597,"ау":19997,"ат":182329,"ба":30240,"ая":142316,"аю":29293,"ащ":9761,"аш":12824,"зс":6594,"зр":12483,"зу":17726,"зк":4107,"зи":46141,"зо":48202,"зн":42107,"зм":17172,"зл":7624,"ив":68972,"иг":39987,"зя":3693,"иа":42640,"иб":20393,"иж":14048,"из":124756,"ид":43626,"ие":128645,"зы":33503,"жо":5749,"жс":4009,"жу":6473,"жи":33668,"жн":29430,"за":99672,"зв":49033,"жё":3790,"зг":3804,"зд":32492,"зе":22277,"еф":7165,"еу":4109,"ет":190550,"ес":211743,"ер":322951,"еп":23032,"ео":22288,"ен":392846,"ем":131477,"ел":205217,"ек":97566,"ей":99541,"еи":5939,"ез":47927,"еж":36786,"ее":34825,"жд":33436,"же":89890,"жа":21003,"ея":12292,"ещ":8165,"еч":24475,"еш":11291,"ех":17181,"ец":24531,"дс":40426,"др":39150,"ду":60618,"дн":71944,"дм":11329,"дп":4848,"до":102671,"ди":135662,"дл":31466,"дк":7688,"де":155382,"дд":3909,"о́":17497,"дж":10265,"еб":21454,"ев":101298,"дё":3920,"ег":62011,"ед":133360,"дь":6956,"еа":12623,"дя":7893,"дш":3897,"ды":16920," Яв":5485," ар":13568," ба":10710," ав":18221," ад":9137," ал":10374," ак":9085," ан":12535," ам":9443," ап":7635," бу":7250," ва":6199," бы":23597," би":6156," бе":13045," бр":8596," 
бо":19871," бл":6242," вт":4810," вх":16859," га":5609," вы":32582," ви":13232," ве":27020," во":59783," вс":13316," вр":12829," вл":4862," вк":4169," вн":5530," дв":15611," да":12181," гу":4609," го":108387," гл":8578," гр":29774," ге":11208," ги":5523," ед":6587," ег":8307," до":35767," др":17969," ду":6602," де":43890," ди":14485," дл":25835," же":14087," за":57830," зв":5213," жи":10062," зн":7274," иг":13915," из":63621," ил":26819," ин":23235," им":21022," ис":39365," их":4602," ию":10395," ка":46349," ки":10920," кр":19920," ко":101579," кн":6962," км":7101," кл":11050," ку":8839," ла":5400," ли":22008," ле":16122," ло":3818," ме":49176," ми":21788," лю":4902," ма":35222," мо":29145," мн":7010," му":26862," ни":8598," не":49337," на":155501," но":20718," ок":24600," он":5812," од":28184," об":63971," от":54560," ор":15811," ос":31704," оп":15294," по":164771," пл":14871," пи":11060," пе":48196," па":21414," Ре":10285," Ра":11205," Ро":21862," Ри":7649," Пр":23192," Пе":17464," Па":15586," с ":51024," По":29407," Пл":3863," Пи":4292," От":4556," Ос":6825," Ор":3729," Те":6933," Ти":3719," То":6441," Тр":5998," Ст":11107," Су":5919," Та":9379," Св":4548," Си":8202," Се":17430," у ":4211," Со":23029," Ру":5277," СС":6857," СШ":6404," Са":18190," Фр":9072," Фи":4937," Фе":6613," Ук":4507," Ту":4270," Це":5856," Хо":3885," Ха":6470," Че":6653," Эт":4213," Ба":12348," Ар":8769," в ":244550," Ан":11697," Ам":4402," Ал":14708," Ав":5843," Ва":10204," Бу":4830," Бо":12018," г ":7009," Бр":14207," Бе":12299," Би":3719," а ":10629," Ев":5656," Ди":3719," Дж":8956," Де":6645," До":7584," Га":7281," Ве":14245," Ви":9808," Во":11759," Да":6384," Ге":11389," Го":10155," Гр":11539," Ис":5607," Ин":6668," к ":15541," Ки":8443," Ка":27418," и ":157390," За":11752," й ":4397," Ив":3814," Из":4180," Му":3903," Мо":21203," о ":7496," На":20476," Не":6542," Ни":9034," Но":9287," Об":5188," Он":3896," Ол":8330," Ко":21565," Кр":10028," Ку":8476," Ла":6504," Ле":11080," Ли":8319," Ло":5687," Лу":3887," Ма":25091," Ме":9720," Ми":12850," В ":17470,"II ":7232," ра":87254," ре":41392," ро":33130," пр":177960," пс":3748," пу":8711," св":25177," си":22645," се":47025," сл":16395," см":6218," ск":7720," сп":19254," ср":11080," сн":4707," со":89660," ру":13053," са":14863," ти":6431," те":38937," то":20252," тр":22707," ст":53938," су":12938," та":27006," ук":4204," ус":11044," уп":6557," ур":3864," ун":4890," ул":9872," ту":5700," фо":12379," фр":10534," фу":6744," фе":9085," фи":20989," фа":9016," уч":17593," хр":4873," хо":7233," ху":6092," ха":4633," ци":3784," це":18264," чт":6942," чи":6042," че":25547," ча":26885," ша":3808," шт":13545," эк":9725," эл":8825," эт":18390," юг":4276," ян":5667," яз":10931," яв":12558,"аз ":5770,"ад ":7406,"Явл":5341,"ав ":5373,"ам ":14841,"ан ":31178,"ак ":26116,"ал ":21311,"ай 
":4239,"авш":9597,"авт":9645,"ага":8115,"аго":7572,"ада":10767,"ади":12630,"аде":10895,"адм":7561,"адн":6641,"адь":3777,"аем":7453,"ает":25830,"аже":6271,"ажд":5556,"аба":3999,"або":17739,"абр":6755,"ава":11200,"авн":21035,"авл":26690,"авс":4043,"аво":12671,"аве":7803,"авг":5378,"ави":21691,"ало":13331,"алл":5996,"ала":22307,"али":48595,"але":13975,"амм":7674,"амо":8686,"амы":4469,"аля":5952,"ама":7226,"аль":73407,"ами":28699,"аме":19983,"анн":37419,"ано":18426,"анс":39188,"ант":20680,"анц":15483,"аны":7448,"ань":4229,"ана":29768,"анд":25091,"анг":9722,"ани":99562,"ане":11955,"анк":9278,"азр":7025,"азо":13307,"азн":9789,"азл":4533,"ази":10605,"азе":3818,"азд":5347,"азв":17619,"аза":10657,"аин":6236,"аиб":4114,"азы":8101,"айс":4889,"айо":18909,"айн":4793,"акт":16378,"ако":12308,"акж":14674,"аки":5796,"ака":8072,"ах ":31165,"ас ":6451,"ар ":4822,"ат ":18392,"ая ":136821,"ба ":4571,"Пол":8430,"Пор":6259,"При":5525,"Пре":6670,"Про":7157,"Рос":14485,"Рас":4325,"Рес":3751,"Сан":8587,"ССС":5587,"ССР":7399,"США":6399,"Сос":6600,"Сов":3947,"Сев":6002,"Сер":4458,"Ста":4121,"Укр":4080,"Фед":3907,"Фра":5981,"ША ":6384,"а́н":5689,"лам":6144,"лан":15129,"лас":29897,"лат":7877,"ме ":7744,"ля ":50911,"ма ":24448,"лав":17972,"лаг":7649,"лад":11258,"лы ":5725,"ль ":38018,"кую":3788,"кус":4925,"кул":8752,"ктя":6372,"кци":8948,"кре":6974,"кра":17934,"кри":5557,"кру":16231,"кро":7021,"лу ":5744,"кры":6225,"кса":6759,"кси":5075,"кта":5223,"кте":4216,"кти":10807,"кто":14126,"ктр":8094,"кту":4971,"кла":9863,"кло":3891,"ло ":19382,"клю":8248,"кни":4141,"ког":53446,"ков":40843,"ком":58598,"кон":29711,"коп":5499,"кор":15168,"кос":5742,"кот":33255,"кое":17818,"кой":65146,"кол":25577,"кие":12731,"кин":10160,"ким":14411,"кий":81796,"ких":35222,"ле ":19103,"кже":14673,"ли ":56435,"ква":7601,"ках":4702,"кат":7467,"кар":8017,"кам":8824,"кан":23037,"как":20319,"кал":11552,"каз":10972,"кая":36891,"каб":6283,"кад":5211,"ла ":40418,"йши":4432,"йся":5983,"йск":44468,"йст":14738,"кт ":8556,"ку ":9345,"йны":8079,"кс ":4079,"йон":18007,"ко ":19496,"км ":4431,"ки ":43486,"ке ":21167,"иям":6120,"иях":4963,"нва":6008,"од ":33146,"ная":57610,"нах":9504,"нац":5517,"нау":8200,"нач":22853,"ог ":6440,"нан":6213,"нам":8570,"нал":25338,"нат":9581,"нас":17157,"нар":16443,"нап":11298,"над":9721,"нак":5181,"наи":4066,"наз":20814,"нде":5878,"нда":8693,"нгл":9288,"нге":3994,"нга":3873,"ое ":58821,"ней":12822,"нек":4897,"нем":9855,"нен":19222,"нер":13236,"нес":9300,"нет":5791,"нег":6018,"нев":4773,"нее":5864,"нди":11172,"ндо":5836,"ндр":8570,"нив":5586,"нии":26244,"низ":11864,"ник":36850,"ний":18207,"ниг":4932,"ние":74066,"ок ":21671,"ой ":187929,"ны ":35382,"нь ":8902,"ня ":11152,"мый":5262,"мых":5149,"мыш":3735,"ов ":92702,"нт ":11498,"мпо":5172,"мпи":13008,"мпе":6698,"мпа":7653,"мот":4724,"ну ":6285,"мпь":3765,"мск":6474,"мун":14544,"мул":3800,"муз":9742,"мик":7960,"мил":4443,"мии":4851,"мич":7046,"мин":28165,"мир":17908,"мит":4140,"но ":62002,"мму":3810,"мно":11450,"мод":4772,"мог":5035,"мов":6153,"мой":4768,"мож":6250,"мон":9329,"мом":4520,"мол":6893,"мос":8228,"мор":8527,"нд ":4096,"мац":4061,"ляю":8990,"мая":9013,"лял":3785,"мал":9300,"ляе":22475,"мат":21959,"мас":5866,"ляр":4924,"мар":12011,"ман":23543,"люч":8195,"мец":4888,"мес":13130,"мет":17123,"мен":50820,"ни ":19557,"мер":29417,"мез":5962,"меж":12542,"мед":8168,"не ":38359,"лён":4990,"мы ":9798,"льн":93765,"на ":153332,"мя ":11127,"льм":14104,"льк":7925,"льз":12358,"льд":3831,"льб":7821,"лья":5362,"льш":14923,"льт":14050,"льс":23881,"лощ":4920,"му 
":16323,"лок":5500,"лог":21990,"лод":5038,"лож":18221,"лор":4516,"лос":11300,"лот":8157,"лом":9527,"лон":6111,"лов":35245,"луч":9778,"луж":6113,"лся":9183,"лиг":4087,"лив":6013,"лиз":10430,"лии":15024,"лим":9347,"лий":6287,"лик":17837,"лез":6815,"лей":13214,"лев":9323,"лег":4328,"лед":13548,"лее":8350,"лес":4757,"лер":5420,"ми ":51806,"лен":60261,"лем":10309,"лек":22756,"лет":13846,"лли":5515,"лла":3879,"лле":7201,"лиц":18482,"лич":17052,"лис":14036,"лит":29052,"лин":18545,"лия":7551,"лко":3705,"пас":4125,"оящ":5066,"ояб":5483,"пад":12246,"пал":13951,"рг ":4862,"оян":3986,"пан":12047,"пар":16176,"ре ":17661,"ра ":44965,"оюз":3745,"пий":7004,"пио":5573,"пис":24111,"пла":9596,"пле":8010,"ро ":9844,"пло":9025,"пед":4071,"ри ":19004,"пер":55359,"пес":4491,"печ":4195,"пец":6528,"ори":37323,"орд":5846,"оре":20523,"орг":13852,"орс":6547,"оро":76961,"орм":17625,"орн":11312,"опу":5280,"ора":20394,"опе":10688,"опи":6454,"опо":11431,"опр":11046,"оор":3720,"опа":4525,"отд":3834,"оте":8329,"отк":7498,"отл":4401,"оти":8154,"ото":45196,"отн":11373,"отр":8447,"осу":15201,"отв":5315,"ота":11881,"осе":5320,"оси":7543,"оск":15979,"осл":18494,"осн":17062,"осо":11923,"осп":4485,"осс":25551,"ост":122012,"ору":9047,"орт":19622,"оры":19028,"оря":6038,"осв":3854,"омм":4672,"оми":12071,"оме":14471,"ома":19204,"оля":5909,"оль":52719,"олу":10916,"олн":13122,"по ":36706,"оло":54184,"олл":5951,"олж":3930,"оле":21412,"оли":27404,"ола":9666,"окр":16259,"окт":7805,"оку":5419,"око":16003,"ооб":4126,"онс":12501,"онт":8231,"онц":5168,"они":16817,"онк":3985,"оно":19353,"онн":16949,"она":44812,"онд":5190,"оне":18296,"омо":11288,"омп":18935,"ому":11330,"оше":5000,"пы ":9126,"очи":4013,"оче":6235,"очн":15161,"оща":4802,"офе":5906,"офи":7745,"оты":3797,"оце":7042,"оци":5856,"охо":7896,"нят":6570,"няе":3761,"ова":79059,"обы":7576,"общ":12061,"объ":7061,"обр":22953,"обо":16065,"обн":4315,"обл":17532,"оби":5216,"обе":11035,"ных":63297,"ные":28042,"ным":27407,"ный":86741,"па ":9132,"оит":3855,"оис":6658,"оим":4265,"ойс":6061,"ойн":7471,"оке":4198,"ока":10728,"ожи":4007,"ожн":11252,"озд":12188,"озв":5021,"ози":6006,"озн":9808,"оиз":10355,"одн":39144,"оди":52480,"оду":22578,"одр":3912,"одс":13394,"одо":20011,"оды":5756,"оед":4040,"оев":9413,"одя":6688,"оен":8911,"оек":3819,"оже":24113,"ожд":8010,"ове":40857,"овк":4823,"овл":5356,"ови":24912,"ово":42247,"овн":14676,"овр":5254,"овс":17603,"овы":17385,"ога":5667,"овя":4334,"огд":4881,"оги":20707,"огл":3937,"ого":132906,"огр":15611,"ода":58550,"оде":17121,"от ":31995,"ноя":6122,"нос":44637,"нор":3684,"нол":4077,"ном":37955,"ной":71982,"ное":29866,"ног":57728,"нод":4260,"нов":54046,"нны":69394,"нно":59505,"ор ":24212,"нни":6472,"нна":20166,"нко":5353,"он ":29089,"нкт":5008,"нка":4499,"ом ":104456,"ния":84752,"нию":8268,"нир":5623,"нис":17636,"нит":8771,"ним":17208,"нин":5963,"нич":11022,"них":9088,"ниц":23421,"нце":5996,"нци":17698,"нцу":5080,"ную":9327,"нфо":3773,"нтя":5255,"нут":4116,"нта":17028,"нте":10866,"нти":10987,"нто":10091,"нтр":15601,"нск":63751,"нст":16529,"сам":12116,"сан":12409,"ряд":7767,"сат":6375,"сво":14816,"те ":15248,"све":4519,"свя":9460,"сев":8467,"сел":21166,"ти ":46330,"сен":10794,"сем":7371,"сет":3970,"сер":16452,"сис":11359,"сит":9045,"сий":14008,"сии":9496,"син":6230,"сил":7978,"сим":6904,"скв":8290,"ски":131938,"ска":46869,"сли":3844,"сле":23336,"сла":10537,"ску":8391,"ско":157451,"сме":6005,"слу":9361,"сло":15249,"то ":29238,"сня":3968,"соб":15821,"сов":27452,"соз":9281,"сок":8059,"сом":3851,"сно":26195,"тр 
":8572,"сны":3885,"спе":13580,"спа":4761,"спи":4683,"сос":21656,"сор":6482,"соо":5100,"сон":7291,"соц":4226,"сре":10218,"ту ":8116,"спу":6484,"спо":39170,"спр":6952,"рри":9506,"роц":5562,"рош":3692,"рот":12263,"роф":6754,"рох":5095,"роп":12392,"рос":28947,"ст ":16255,"рта":13460,"рст":15956,"рто":4691,"рти":13560,"рск":18805,"рсо":4243,"рси":7173,"рую":4208,"рту":7614,"рук":7417,"руг":20792,"руд":5213,"руж":6808,"руп":20038,"рус":13228,"рхи":6150,"сь ":14955,"ся ":83299,"та ":65491,"рыт":4303,"рых":5096,"рый":5524,"рые":4885,"тв ":5853,"рад":12120,"раж":10828,"раз":53144,"раб":18768,"рав":32412,"рам":17118,"ран":56502,"раи":8567,"рай":21830,"рак":10369,"рал":23925,"рах":8851,"раф":10748,"рац":10644,"рас":32059,"рат":31627,"рая":5022,"ращ":3710,"рбу":3964,"рва":4083,"пью":3789,"реб":6936,"рев":19639,"рег":20011,"ред":53953,"реа":5135,"рет":10005,"рес":14958,"реп":5228,"си ":5105,"рен":15519,"рем":25502,"рел":11664,"рек":12317,"рей":7082,"рез":11794,"реж":11533,"ржа":3754,"реч":6532,"рво":6073,"се ":6838,"рвы":9033,"рга":14493,"рге":4800,"рги":4021,"рия":13567,"рию":4048,"рио":6627,"рим":11681,"рин":20707,"рик":20481,"рил":3781,"рии":19852,"рий":6871,"рич":8657,"рит":22893,"рир":3686,"рис":16618,"рка":4003,"риа":11930,"риг":4581,"рив":4645,"рид":5404,"риз":6655,"ск ":4696,"рни":9740,"рна":12681,"рок":11103,"рол":12179,"ром":23332,"рон":16716,"рож":7867,"роз":4092,"рои":18841,"рой":14224,"ров":68815,"рог":16486,"род":57888,"рое":10921,"рны":14512,"рно":18416,"рла":4760,"рко":4371,"рми":8750,"рма":15602,"со ":3903,"ппы":6617,"пра":26585,"при":52484,"пре":53035,"ру ":6954,"про":87029,"поп":4906,"пор":17269,"пос":28755,"пот":5403,"поэ":3864,"ппа":6927,"рт ":5103,"The":3957,"под":23464,"пов":12924,"пон":8324,"пом":6183,"пол":68859,"пок":4701,"поз":9391,"пуб":8028,"пус":5784,"пут":5691,"пул":3714,"ры ":15775,"са ":16206,"ря ":29812,"рь ":4987,"вар":15705,"ват":16606,"вая":7821,"ваю":8055,"ге ":6711,"вае":10913,"вав":4587,"ван":65485,"вал":18282,"быт":4123,"быч":3846,"был":18105,"га ":14808,"бъе":7043,"бще":10673,"вы ":7062,"бур":7473,"бря":21845,"вто":16228,"все":9508,"вск":27242,"вст":7859,"вра":8893,"вре":17466,"вро":6581,"вол":13485,"вок":4441,"вой":25086,"вои":3759,"воз":12251,"вое":18754,"вод":28316,"вог":6893,"вов":10300,"вны":14251,"вор":11524,"вос":15224,"воп":3740,"вом":8183,"вни":4616,"вне":8106,"вна":13669,"вно":25418,"вля":30877,"вле":21719,"вла":3874,"го ":153532,"вкл":4066,"вка":4191,"вич":20022,"вия":4450,"виж":4984,"вил":7784,"вин":10044,"вис":5933,"вит":12981,"вид":14202,"вес":15630,"вет":22334,"вер":37523,"вен":36297,"ги ":6381,"вел":6076,"век":17069,"вед":16797,"вгу":5781,"ва ":46587,"ающ":19084,"ают":9616,"ачи":5855,"бы ":3828,"ащи":4099,"аще":3705,"ауч":3787,"аук":4642,"афи":6466,"ахо":7975,"ача":13504,"аче":11447,"аци":41912,"апр":13134,"апа":11764,"апо":3746,"апи":8487,"арх":7888,"арс":18625,"арт":22698,"арь":4529,"аря":9072,"аре":7851,"ард":6979,"ара":21925,"арн":5879,"арм":4715,"аро":18219,"ари":19835,"арл":4156,"арк":7904,"асс":17733,"аст":73387,"ась":5439,"ата":17783,"аси":4710,"асе":9811,"асл":4318,"асп":18234,"асн":7236,"ату":10612,"аты":6297,"ать":13519,"ате":41110,"ати":28209,"атн":5152,"ато":21641,"атр":7249,"бол":22933,"бом":6832,"бой":5049,"бор":14402,"бот":13748,"бно":3859,"бро":3695,"бри":6979,"бре":4187,"бра":24036,"бла":17775,"бли":15168,"бле":4490,"во ":31067,"ви ":4378,"бес":5023,"бер":15432,"бел":4899,"бит":4733,"бил":3862,"ве ":17457,"даю":4004,"дах":4951,"дан":21423,"дам":3898,"дар":18766,"дат":8984,"дви":7106,"дал":8909,"дав":5333,"ев 
":8761,"дек":7455,"дей":9763,"дем":5618,"дел":26493,"ден":32485,"дер":21083,"ей ":52679,"дес":4085,"дет":5015,"дея":8136,"дву":4656,"дво":4445,"ее ":25157,"ез ":4626,"ды ":12967,"дь ":4234,"ех ":3705,"дст":24032,"дск":13686,"дро":6400,"дру":10418,"дре":7577,"дра":7639,"ет ":49624,"дун":5024,"ец ":6315,"ен ":23878,"дия":4777,"диц":4594,"ем ":32646,"диа":3739,"див":3812,"дим":5230,"дин":31699,"ел ":6867,"дио":3858,"дис":4374,"дит":31866,"дии":5066,"дил":7726,"ек ":8976,"дны":11957,"дож":6200,"дов":21587,"дос":6949,"дор":10512,"дол":8011,"док":5562,"дон":5574,"дом":8881,"дна":11456,"дни":9495,"дне":7494,"ер ":23656,"дно":27634,"для":23991,"дми":9130,"вып":7258,"вый":9785,"вым":6060,"вых":10632,"выс":8860,"al ":4210,"да ":70397,"вяз":5698,"гал":7699,"вят":8003,"ган":16467,"гар":4062,"де ":21659,"гда":5300,"вую":5927,"вхо":16916,"вше":5430,"вша":3781,"вши":12432,"вые":6905,"гон":3740,"гол":6702,"гос":13811,"гот":3744,"гор":32983,"гов":9981,"год":69990,"гру":17285,"ду ":34904,"гро":3890,"гра":40520,"гре":6189,"гус":6354,"ген":9872,"гер":7302,"ди ":8816,"гии":7384,"гио":11815,"гич":7565,"гих":4581,"гла":12247,"до ":13248,"гли":6452,"жан":5670,"еят":7722,"за ":17832,"еще":5057,"жит":5272,"жив":7938,"жис":4312,"жес":7568,"жет":3940,"жду":12980,"жел":6716,"жен":38841,"жде":11274,"жда":4735,"жск":3945,"жно":12596,"жни":4413,"жны":6194,"жур":3704,"ежи":6716,"ежд":16924,"еду":5404,"едс":18751,"еза":4036,"езн":5938,"езо":10562,"езд":5637,"ези":9501,"ева":10267,"еви":11883,"еве":16643,"еат":3835,"дящ":4359,"его":29841,"еда":12960,"еде":22810,"еди":27267,"едо":10112,"едн":12312,"евн":10381,"же ":22970,"ево":16350,"евр":9678,"евс":6183,"евы":4574,"еге":5203,"еги":14109,"ент":47594,"енс":9645,"енц":4019,"ени":132856,"ено":11907,"енн":84711,"ена":18893,"емя":7341,"ене":16352,"енд":7186,"емь":4324,"емы":10998,"еор":5073,"ены":6940,"ень":8052,"епо":3992,"ерх":5155,"ерр":10216,"ерс":17748,"ерт":9897,"ерл":3718,"ерм":10872,"ерн":25326,"еро":19985,"ери":40656,"ерк":7071,"ерг":7482,"ерж":8654,"ере":44413,"ера":37126,"ерв":20819,"ерб":7899,"ейн":5541,"ейс":18646,"еки":4636,"еко":11088,"ект":27081,"екс":15558,"ейш":5926,"ека":19196,"ели":21048,"ело":19740,"еле":41611,"ела":9361,"емл":3751,"емо":8767,"еми":9433,"ему":3888,"емп":6664,"ель":74971,"еме":23861,"еля":16614,"ема":12283,"елё":3840,"ехн":5852,"ецк":9009,"еци":7333,"еча":3853,"ечн":3687,"ече":11632,"еше":3757,"есе":5474,"еск":73489,"есн":5953,"есп":9644,"есс":15061,"ест":79162,"еся":5372,"ета":21699,"ети":10490,"ете":10421,"етр":11442,"ето":9221,"етн":10174,"етс":48640,"еты":4087,"ибо":7839,"иве":7414,"иви":3809,"ива":16306,"иал":20951,"иан":8939,"иже":6481,"идо":4070,"иев":3803,"ием":14066,"ией":9608,"игр":16265,"иго":4097,"ида":4859,"иде":14015,"иво":7871,"ивн":17940,"ивш":6456,"ига":5856,"иги":5053,"икл":4956,"икр":3804,"ико":26837,"ике":6203,"ики":16651,"ика":34115,"ийс":37107,"изм":10707,"изо":8828,"изн":6897,"изи":10371,"изд":4964,"иза":12843,"изв":18761,"ион":41500,"инц":6456,"ины":10703,"иня":5979,"иод":4236,"ине":14798,"ини":31286,"инн":3826,"ино":20897,"инс":24611,"инт":7204,"инф":4063,"ина":38691,"инд":6333,"инг":12127,"ими":15581,"име":21870,"имс":3708,"имо":9905,"имп":12615,"има":14708,"иль":23825,"или":46988,"иле":5170,"илс":4907,"илл":5632,"ило":11583,"ила":9561,"иси":5473,"иса":15121,"исх":5676,"ист":74674,"исс":10637,"исп":16317,"исо":5087,"исл":10745,"иск":11427,"ити":13816,"ите":56212,"ита":24816,"ись":6230,"иту":6461,"ито":18876,"итс":8689,"ипа":14414,"ипе":3791,"ира":11140,"ире":4647,"иру":8486,"ири":4818,"иро":27497,"иха":3800,"ихо":4447,
"ице":7291,"ица":16979,"ици":25302,"ицы":6049,"ить":5325,"ифи":3877,"ичи":6000,"ичн":11017,"ича":4726,"иче":66707,"июн":5092,"июл":5316,"ка ":55202,"ив ":4402,"зав":10564,"ид ":4294,"зви":4153,"зве":16359,"зва":15033,"зац":9007,"зат":4555,"зап":11760,"зан":11015,"зам":3927,"зак":6826,"зде":5994,"зда":15915,"ие ":94285,"зво":7967,"ий ":139271,"зер":5619,"ии ":131775,"зем":4097,"из ":38626,"зид":4005,"зил":7018,"ил ":8940,"ик ":27882,"ин ":25833,"им ":24359,"зия":4934,"зит":4573,"зме":6834,"зли":4448,"зна":20529,"зни":4774,"зно":8729,"ир ":5392,"зны":4336,"зов":19642,"ис ":4567,"зон":5974,"зор":7083,"ит ":29545,"зра":8237,"зск":6535,"их ":71178,"зуе":5324,"ич ":17567,"зыв":8397,"ию ":18909,"зык":19420,"ия ":146910,"ьшо":3872,"ьши":3819,"ьше":5810,"ьян":5268,"ьют":3946,"ьма":3914,"ьна":7658,"ьни":4302,"ьно":44690,"ьны":33869,"ько":6149,"ion":5234,"ьзу":6190,"ьзо":5415,"ьту":4384,"ьст":6382,"ьск":16866,"ям ":5239,"ют ":10384,"эко":4504,"эле":8958,"это":14311,"ых ":86372,"he ":5907,"ыва":15848,"ье ":4950,"ые ":43888,"ыл ":8892,"ым ":27422,"ый ":115051,"ычн":4405,"ья ":5879,"ью ":17235,"ьбо":5724,"ьев":4427,"ьер":4030,"ьм ":7924,"ыка":9496,"ыла":5382,"ыми":11026,"ыпу":5527,"ысо":4693,"ённ":14954,"er ":6599,"es ":4533,"яза":3917,"яет":23887,"язы":12445,"явл":14387,"ябр":17050,"ях ":9267,"ютс":8148,"юте":4109,"юща":7138,"ющи":22555,"юще":8725,"юча":3971,"юля":5121,"ят ":7009,"юня":4749,"яют":4520,"яющ":7517,"яще":6980,"ящи":4896,"янс":6220,"янв":5611,"яни":5071,"ями":7921,"ярн":3999,"ято":4282,"яти":7069,"яте":9306,"уще":14055,"уча":15033,"учн":4074,"учи":4977,"уче":7765,"фес":5738,"фев":5319,"фер":4649,"ующ":10346,"фин":4428,"физ":4024,"фил":15705,"фик":3811,"фиц":4922,"фре":4266,"фра":6501,"фор":17745,"фон":4486,"ца ":19779,"це ":5162,"хан":4526,"хар":5112,"хра":4915,"хно":4200,"хни":3959,"хож":3747,"хов":4677,"ход":41602,"цы ":7957,"худ":5932,"сск":14412,"сси":32096,"ссо":7973,"сса":7005,"ссе":5970,"стн":20518,"сто":67851,"стр":70127,"ств":108223,"сте":31686,"сти":71515,"ста":85694,"сст":5885,"суд":15247,"сть":44809,"сты":4236,"сту":9860,"сущ":7215,"схо":7227,"ты ":16675,"ть ":55254,"тав":37473,"так":23239,"тал":22250,"там":8524,"тан":36504,"тай":3709,"тат":27244,"уг ":6960,"тар":13509,"тбо":4051,"тву":7862,"сёр":3801,"тво":31741,"тви":10153,"тве":41916,"тва":21790,"тех":6332,"тем":17566,"тел":76816,"тео":3852,"тен":11539,"тер":49417,"теп":3783,"тет":16644,"тек":8246,"тей":6713,"тив":26421,"тие":8972,"ук ":4128,"тка":5169,"тич":24503,"тия":7560,"тии":4630,"тий":5054,"тин":12258,"тик":14625,"тил":7099,"тир":5823,"тис":5149,"тип":5281,"тит":7029,"тки":3943,"тно":26382,"ток":11951,"тол":12417,"той":7374,"тны":17802,"тов":29921,"тог":9531,"тни":13284,"тна":5384,"ут ":4381,"тре":17053,"тра":54933,"три":18286,"тор":95430,"том":20153,"тон":9780,"ус ":4688,"точ":11868,"тоя":12927,"тст":5913,"тся":53866,"тро":41481,"тру":10653,"тск":19176,"туг":4888,"туп":6027,"тур":21906,"тью":9623,"тый":5760,"ую 
":18777,"уго":4888,"уги":7037,"уга":8925,"уда":16459,"тяб":11568,"убл":8850,"убе":4682,"узы":8785,"узс":4917,"уже":5996,"ует":9543,"уем":3815,"уди":4265,"удо":10125,"уме":6894,"уль":15447,"уля":5536,"ули":11772,"укт":4529,"уко":4826,"ука":4047,"упп":14974,"упн":6779,"упр":5417,"ура":6947,"ург":8767,"уре":4620,"унк":5527,"уна":6858,"уни":18074,"уст":16092,"усс":16109,"ута":4019,"уры":4919,"урн":11401,"уро":5529,"уск":4624,"ших":7365,"шир":4576,"шин":5562,"ший":10857,"шен":11201,"шая":6983,"шта":14038,"щих":9037,"щие":5309,"щий":14604,"щин":3869,"щее":7976,"щей":4091,"щег":3871,"щен":12451,"щес":17297,"щая":10075,"щад":4691,"on ":6654,"цен":15360,"чи ":4183,"цел":4906,"цев":3998,"цес":4907,"цер":5569,"циа":13419,"ций":4974,"ции":29185,"цие":3800,"цип":12774,"цио":18812,"ция":20642,"ча ":3694,"цар":3955,"цуз":4865,"цко":4797,"цки":5412,"чем":5556,"чен":25493,"чел":10826,"чес":78218,"чер":6254,"чет":6065,"чле":3904,"чин":7894,"чив":3802,"чис":7524,"чит":8014,"ше ":5808,"чаю":4561,"час":36891,"чат":3981,"чал":9929,"чае":6846,"чны":14169,"чно":19736,"что":8092,"tio":4098,"Кар":6020,"Кра":3888,"Кор":4309,"Кон":5008,"Мар":7555,"Мин":4281,"Мос":11770,"Нас":4395,"Ник":4718,"Нов":5069,"Оли":6672,"Пар":6298,"СР ":8410,"Пет":6304,"Пер":7246,"Але":6898,"Бра":9561,"Бол":4163,"Бел":4463,"Вел":5787,"Гра":3978,"Гер":5952,"Джо":3835,"Евр":4076},"n_words":[36763344,40893832,29165701],"name":"ru"}
\ No newline at end of file
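The "ru" profile deleted above has the same shape, only with Cyrillic n-grams. For completeness, a profile of this form can be reproduced from a corpus with a plain counting loop; the sketch below is not the langdetect training code (which also filters rare n-grams and normalizes characters), just the counting implied by the format, with illustrative names throughout.

import json
from collections import Counter

def build_profile(corpus_lines, lang_code):
    # Count every 1-, 2-, and 3-character n-gram in the corpus and record
    # per-order totals, matching the {"freq", "n_words", "name"} layout above.
    freq = Counter()
    totals = [0, 0, 0]
    for line in corpus_lines:
        for n in (1, 2, 3):
            for i in range(len(line) - n + 1):
                freq[line[i:i + n]] += 1
                totals[n - 1] += 1
    return {"freq": dict(freq), "n_words": totals, "name": lang_code}

# Example (hypothetical corpus file):
# with open("corpus_ru.txt", encoding="utf-8") as src, \
#         open("ru", "w", encoding="utf-8") as dst:
#     json.dump(build_profile(src, "ru"), dst, ensure_ascii=False)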
+++ /dev/null
-{"freq":{"D":6323,"E":4459,"F":4516,"G":4726,"A":10071,"B":10822,"C":7078,"L":6128,"M":12887,"N":7988,"O":4702,"H":6933,"I":5526,"J":8422,"K":8631,"U":2767,"T":8739,"W":2331,"V":8838,"P":14691,"S":17764,"R":6826,"Z":3853,"f":24135,"g":31408,"d":146397,"e":441234,"b":79948,"c":129399,"a":438168,"n":323513,"o":480147,"l":181843,"m":157226,"j":124158,"k":216062,"h":115571,"i":290888,"w":4336,"v":227972,"u":141151,"t":244072,"s":230794,"r":287939,"p":133920,"z":92877,"y":75446,"x":5807,"²":567,"í":50255,"é":45424,"ä":4507,"á":98447,"ý":65120,"ú":38043,"ô":6119,"ó":12207,"ď":2808,"Č":2146,"č":43571,"ĺ":1392,"ľ":19160,"ň":4992,"Ž":835,"ž":30532,"ť":13944,"Š":1929,"š":30141,"Eur":843," l":8693," m":31843," n":43761," o":37444," h":15481," i":8376," j":50814," k":41715," d":25382," e":6213," f":10527," g":4634," a":59730," b":16567," c":8497," z":33097," u":8583," t":26154," v":69047," p":76912," s":70586," r":25827," J":8409," K":8577," H":6876," I":5508," N":7962," O":4629," L":6098," M":12833," B":10746," C":6977," A":10024," F":4466," G":4666," D":6268," E":4439," Z":3841,"и":615,"о":715," S":17653," R":6787,"а":792," P":14620," W":2297," V":8804,"е":558," U":2760," T":8683," ú":5607," ď":577," č":11970," Č":2146," ž":4854," Ž":833," Š":1929," š":8376," ľ":2475,"Fra":832,"A ":1884,"Da":900,"Co":1488,"Ce":608,"Ch":1551,"Do":1311,"De":1168,"Di":711,"Fa":561,"Eu":967,"Ge":866,"Ga":734,"I ":1477,"Fr":1190,"Fi":588,"II ":663,"C ":1204,"Au":625,"Ar":1181,"D ":632,"Ba":1863,"Am":652,"An":1498,"Al":1539,"Bu":812,"Br":2094,"Ca":1126,"Bi":824,"Be":1438,"Bo":1866,"Kr":1442,"Ko":2065,"Le":1477,"Li":1427,"La":1055,"Lu":551,"Lo":964,"Me":2050,"Mi":1818,"O ":889,"Ma":4427,"Mo":1742,"Ni":925,"Ne":1824,"Na":2123,"P ":647,"No":1174,"Ob":931,"Gr":1156,"Ha":1121,"He":1131,"Ho":1814,"Hr":608,"In":1551,"Ja":1230,"Je":4572,"Jo":995,"Ju":970,"Ka":2174,"Tu":851,"Tr":1334,"To":1259,"Th":927,"Ti":635,"Te":1474,"Ta":1216,"V ":2449,"St":2440,"Sv":647,"Su":614,"Wi":788,"Vy":658,"Vo":610,"Vi":1001,"Va":862,"Ve":1362,"Má":761,"Pr":3288,"S ":1036,"Pe":1293,"Pa":2196,"Pl":567,"Po":4755,"Pi":686,"Or":779,"R ":612,"Se":1430,"Sc":541,"Si":786,"Sl":3359,"Sp":1454,"So":1120,"Ru":1051,"Sa":1654,"Re":1050,"Ná":658,"Ri":640,"Ro":1422,"Ra":1481,"b ":2037,"Zá":884,"a ":144799,"Za":604,"Ze":702,"i ":37987,"fy":881,"ge":3887,"ga":3383,"bý":960,"fi":7159,"fr":2471,"ač":5846,"fu":1355,"aď":870,"ft":581,"fo":3584,"bí":670,"j ":28903,"cú":1023,"gy":705,"aľ":716,"dá":2384,"he":5122,"ha":6720,"gn":1011,"gm":704,"gl":1862,"gi":5874,"gh":727,"gu":2013,"gr":4226,"cí":1219,"go":2083,"du":5115,"dv":2815,"dy":3861,"dz":7413,"g ":2452,"ea":2540,"eb":13761,"ec":10112,"ed":25726,"de":19219,"di":12919,"dk":1536,"dm":1805,"dl":3720,"do":19760,"dn":17745,"dp":1210,"ds":3840,"dr":7033,"ew":678,"ex":2935,"eu":1904,"ev":7688,"ey":664,"ez":6532,"fa":1707,"h ":31639,"fe":2493,"bá":980,"eh":6797,"eg":2721,"ef":1680,"ee":903,"el":18740,"ek":12213,"ej":29432,"ei":1554,"ep":6024,"eo":3559,"en":52359,"em":14359,"et":17096,"es":24526,"er":36633,"ca":6242,"e ":118531,"by":4846,"bs":2133,"br":6137,"bu":4959,"bn":3071,"bo":22917,"bj":2626,"bl":4818,"bi":5066,"bc":1208,"bd":823,"be":9106,"dc":900,"db":1133,"da":10163,"f ":1519,"cu":1353,"ct":1197,"co":4858,"cn":1009,"ck":22292,"ci":21044,"ch":49042,"ce":12657,"c ":5075,"az":7180,"ay":741,"ba":5147,"d 
":13711,"at":24996,"as":19402,"ar":21528,"ax":910,"av":19342,"au":4063,"ak":14537,"al":32605,"ai":1980,"aj":18129,"ao":743,"ap":5241,"am":14227,"an":46975,"ac":10837,"ad":20535,"ab":3624,"ag":2783,"ah":4710,"ae":1120,"af":1683,"nu":6191,"nt":13652,"ns":14480,"ič":3422,"jí":607,"no":31919,"nn":5955,"nz":946,"ny":10740,"hľ":938,"oe":798,"ká":9527,"of":3569,"oc":9865,"od":30890,"oa":684,"ob":22771,"jú":8749,"om":35081,"ké":8895,"on":18042,"ok":25444,"ol":27522,"oi":991,"oj":9152,"og":4545,"oh":6762,"ot":13213,"m²":561,"os":25341,"ov":73503,"ou":15818,"op":7814,"oo":910,"jč":568,"or":43666,"kú":2339,"r ":14190,"kô":554,"kó":650,"ow":1410,"oz":12370,"pe":7850,"lá":5471,"pa":10171,"ký":14124,"pl":7248,"lé":1049,"pn":1091,"po":39723,"ph":797,"pi":7377,"eň":1574,"lo":31336,"ln":9477,"hé":650,"lm":1360,"ll":2885,"ls":1845,"lu":4726,"lt":2514,"ly":3287,"o ":72538,"ma":12807,"mb":3559,"hý":981,"me":31227,"iá":2685,"ml":855,"eš":1638,"mi":19191,"mn":3217,"mm":559,"mp":2180,"mo":13618,"ií":1342,"ms":1427,"mu":6601,"ió":1393,"my":3831,"p ":1992,"na":48626,"nc":6297,"nd":6014,"ne":40950,"nf":1138,"ež":4352,"ng":5209,"ni":32826,"nk":4794,"jv":2323,"eď":654,"ju":2887,"eč":2428,"js":2102,"jn":2178,"jo":2645,"jm":1649,"jk":595,"ki":2025,"dľ":1507,"ke":14025,"gá":620,"kc":1532,"dĺ":1132,"ka":21392,"m ":45126,"ky":12637,"kt":22417,"ku":19211,"kv":1473,"ko":41039,"kr":12909,"kl":6886,"km":4376,"kn":913,"li":23938,"lh":1065,"lk":2208,"há":3987,"le":30292,"eľ":8588,"ld":1271,"la":30632,"lb":1616,"n ":17290,"hr":6312,"dí":2091,"hv":2290,"ht":1214,"hu":6079,"hi":3695,"hn":2117,"ho":33938,"hl":4308,"hm":1007,"id":6006,"ic":28125,"ib":2309,"dý":547,"ia":31792,"ih":1011,"ig":2598,"if":1907,"eá":715,"ie":44145,"hy":2860,"dô":739,"k ":16704,"ir":3821,"is":15561,"it":16315,"iu":3712,"iv":6007,"eó":677,"aš":705,"ii":2337,"ij":1891,"ik":11705,"il":11847,"im":4228,"ať":2167,"in":30878,"io":5948,"ip":2091,"je":54049,"až":3807,"ji":2876,"iz":4693,"l ":16112,"ja":8703,"xi":1668,"té":3643,"tí":5214,"tó":2119,"xt":831,"sú":8148,"z ":10294,"ož":4892,"tá":7149,"oš":1310,"wi":924,"sé":733,"sí":1587,"rč":1607,"ró":1912,"vy":11079,"rô":641,"vz":3443,"y ":38794,"rú":2323,"rý":5792,"wa":909,"we":537,"sá":670,"vl":2565,"ré":5591,"vk":1421,"nš":1355,"vi":16426,"vu":2054,"vr":2972,"vs":3111,"rí":8465,"vn":10339,"vo":27546,"uz":979,"uv":941,"oľ":2603,"ve":29704,"rá":15217,"vc":1194,"va":30715,"x ":1187,"ui":653,"uj":8555,"uk":2909,"ul":5494,"ue":827,"ug":1127,"uh":4698,"ur":8100,"us":8894,"ut":6390,"um":6737,"un":5022,"up":5983,"ty":4889,"pô":2385,"tz":568,"tu":8639,"tt":1333,"tv":9012,"ub":3487,"ua":850,"ud":5636,"uc":2067,"w ":706,"to":52129,"tn":7740,"tm":1030,"tl":2323,"ts":3310,"tr":24800,"oč":7254,"pí":1372,"pá":1327,"te":28885,"tk":5618,"ti":28975,"th":2145,"v ":48812,"ta":24342,"su":4004,"sv":4018,"ss":1524,"st":58751,"sy":3565,"sl":11132,"sk":44161,"sn":3377,"sm":2199,"sp":10573,"so":10148,"nč":1255,"sc":2214,"sf":608,"se":10443,"sh":654,"si":7301,"rz":1417,"u ":50841,"nú":2094,"sa":16525,"ný":22118,"rr":843,"rs":6494,"rt":4523,"ru":10434,"rv":5009,"nó":730,"ry":3916,"ní":8389,"rp":1158,"ro":47621,"rn":10561,"rm":4664,"né":16353,"rl":1127,"rk":2824,"ri":34707,"jš":2481,"rh":695,"iž":1338,"rg":3525,"rf":643,"ná":21179,"re":40029,"rd":2875,"rc":3598,"rb":1553,"ra":41482,"t ":11585,"mô":731,"mí":2439,"mé":722,"iť":1158,"iš":1309,"má":4691,"mä":1078,"lý":1452,"s 
":19518,"lú":1099,"py":785,"pt":2178,"pu":2861,"ló":2004,"lí":2945,"pr":41771,"ps":1762,"zý":682,"zá":7603,"zé":647,"už":5423,"vš":2161,"yč":534,"zí":915,"uš":1188,"vý":14470,"sť":7157,"Hor":544,"zh":1017,"vä":2642,"zi":7079,"rš":1164,"zb":854,"zd":5109,"ze":5890,"vá":7870,"za":12274,"yz":1332,"zv":3082,"zy":2080,"zs":1637,"uč":1098,"zr":1156,"zu":1736,"zo":9017,"zn":10586,"ví":1466,"zp":940,"zk":1321,"vé":4595,"zm":4191,"zl":2181,"yh":649,"uá":1391,"yc":3925,"yd":1380,"tý":3031,"yb":1209,"tú":3411,"yv":2400,"yt":2758,"ys":5625,"yr":1431,"yp":2058,"yn":1959,"ym":2723,"yl":992,"yk":2457,"yj":557,"zť":610,"yš":1686,"ším":570,"šír":654,"šíc":1062,"ťou":1264,"² ":565,"ťah":745,"ťaž":646,"ší ":1009,"Bra":1002,"Bol":718,"ám":3217,"án":4821,"áp":2703,"áj":1169,"ák":2947,"ál":8570,"áh":1148,"áb":1278,"ác":7329,"ád":5449,"ä ":797,"áz":2921,"áv":6149,"ár":8087,"át":6184,"ás":2422,"á ":30384,"ôs":1461,"ôr":598,"óg":1873,"ód":1271,"ór":1755,"óp":1200,"ón":3191,"óm":1121,"ív":4551,"íz":1142,"ín":4699,"ím":4401,"íp":1003,"ír":2051,"ít":2918,"ís":3116,"ík":3632,"íl":827,"íc":3944,"íd":1198,"í ":13762,"áž":617,"áš":773,"ém":3465,"én":937,"ét":1878,"ér":2157,"äč":1548,"éd":556,"éc":848,"éh":11280,"áľ":830,"é ":23043,"áč":817,"ät":901,"äz":636,"úč":2819,"ýk":611,"ôž":785,"ýc":17590,"ýz":1464,"ýv":2338,"ýs":969,"ýr":1274,"ým":8307,"úz":3206,"ý ":29957,"úp":696,"ún":866,"úl":805,"úv":565,"út":1527,"ús":2474,"úr":2901,"úh":1142,"úd":1600,"úb":634,"úc":5635,"ú ":11007,"íš":617,"ôz":595,"ôv":1382,"íž":736,"Nem":547,"Nac":597,"ýš":544,"Če":1109,"či":6991,"čk":1668,"čl":1517,"če":6035,"ča":9218,"ď ":922,"č ":1105,"čn":7371,"čo":2826,"čt":541,"ču":1013,"ďa":1413,"čš":1550,"čí":3126,"Nov":633,"ľ ":1691,"ľa":5647,"ľk":2350,"ľm":577,"ľn":1481,"ľo":3506,"ľs":1492,"ľu":1677,"ĺž":1136,"ň ":1274,"š ":574,"ší":3431,"ťo":1542,"Par":649,"ťa":2465,"ť ":9577,"še":3604,"ša":1343,"šo":1393,"šp":1178,"šn":1118,"šk":1726,"ši":5749,"št":7288,"Št":611,"ňa":1527,"ňo":1237,"ňu":845,"Poľ":669,"Pro":592,"Pre":892,"Pod":721,"žs":1011,"žu":918,"žn":3570,"žo":977,"že":5249,"žd":912,"ža":964,"žk":1609,"ži":6988,"ž ":2893,"ží":3206,"Má ":570,"SA ":809,"Rak":660,"šš":674,"Je ":3173,"Kar":601,"Mes":567,"Mal":724,"Mar":1239,"Mic":549,"áľo":612,"ého":11278,"čít":911,"čís":890,"ém ":887,"éck":749,"Veľ":631,"čuj":916,"čná":722,"čné":1259,"čný":1760,"étk":856,"äčš":1545,"ému":984,"éri":720,"Zem":545,"ďal":723,"Str":817,"Sta":811,"Spo":812,"Slo":2603,"Rus":613,"čši":967,"The":619,"Tur":593,"šet":1217,"šen":585,"štr":907,"šti":599,"šta":1049,"šov":613,"ško":920,"šia":856,"šie":2695,"štá":2185,"átu":553,"bje":988,"áte":733,"bja":1624,"átk":775,"átn":563,"áto":1594,"ást":705,"bil":932,"bo ":9920,"ávi":600,"bli":1892,"ávn":1077,"bla":1662,"áva":2985,"bod":650,"bol":6856,"boj":571,"ázv":643,"ázo":1232,"bor":1466,"bov":774,"áln":5252,"álo":634,"be ":819,"áme":834,"ákl":1321,"ban":633,"bal":748,"áko":904,"án ":994,"áns":810,"bdo":679,"áno":783,"ámy":668,"ár ":1209,"bec":1805,"ber":2830,"ben":812,"bez":962,"ápa":2237,"árs":742,"bia":583,"bie":1078,"át ":1099,"áro":2670,"árn":1647,"ára":896,"ách":1879,"áci":4745,"ádz":3180,"ca ":4873,"ál ":697,"ce ":4495,"bri":844,"bro":596,"bra":2640,"bre":863,"bu ":703,"bný":856,"bsa":802,"bur":546,"bum":1025,"bud":728,"by ":1911,"ábo":552,"byv":1443,"aka":544,"am ":1578,"ake":675,"aji":1734,"ajm":1012,"ajn":753,"ajs":1164,"ajv":1703,"al ":3190,"ak ":1606,"ahu":1045,"ahr":640,"aho":724,"aj 
":3272,"adá":567,"ajú":4186,"anu":914,"any":631,"ano":1953,"ann":2369,"ant":2498,"ans":3130,"ane":1786,"ang":1885,"ani":9481,"ank":901,"ana":1968,"anc":2108,"and":2251,"amo":1677,"amn":620,"ami":3340,"ame":3786,"ama":959,"alo":3047,"ali":4506,"ale":12669,"ala":2255,"alb":970,"an ":2683,"akt":2087,"ako":5403,"abs":539,"ae ":616,"aca":545,"ad ":2628,"ac ":550,"afi":807,"ado":1644,"adr":867,"adl":645,"adn":3195,"adi":1835,"ade":3266,"adu":813,"aco":855,"aci":1802,"ach":4201,"ace":1242,"ada":1861,"azo":539,"arš":738,"aze":872,"asť":2479,"azy":1600,"axi":588,"atí":1130,"az ":763,"azé":596,"ba ":1862,"azý":599,"at ":1606,"are":1302,"aná":2114,"ard":1508,"arc":1318,"ara":2212,"aro":2009,"ané":3292,"ark":750,"ari":2403,"aní":1813,"ars":1457,"art":1675,"asi":1254,"aný":3599,"aso":888,"asn":1265,"ar ":1338,"akú":915,"alá":536,"apo":817,"apr":1545,"as ":1458,"alý":759,"ava":2084,"aut":1518,"avs":622,"avo":3114,"avn":2304,"avi":3514,"ave":2415,"avy":1063,"avu":898,"av ":719,"ata":1116,"ast":8626,"atn":905,"atk":1336,"atr":1680,"ato":2184,"ate":5705,"ati":5001,"aur":606,"až ":1038,"jeh":1159,"jej":1018,"jed":5235,"jek":1206,"jem":636,"jen":1683,"ji ":665,"ažd":650,"jad":617,"jav":2393,"jan":692,"jaz":1897,"je ":42288,"jne":544,"jov":1599,"jom":624,"jin":1578,"itn":565,"ito":1341,"itu":927,"its":591,"ity":888,"isk":1618,"isl":1645,"iso":851,"ist":5989,"ita":1760,"ite":3279,"iti":1519,"ivo":1530,"ius":597,"ium":775,"iva":615,"ivi":709,"ive":1126,"is ":1825,"ion":1878,"iou":1025,"iov":556,"iro":841,"iné":635,"isc":586,"iný":615,"iu ":2243,"iná":2028,"it ":625,"izá":1011,"ja ":1196,"itý":663,"ité":878,"itá":857,"eór":603,"izo":825,"izm":1796,"ivá":562,"úry":632,"úsk":784,"úst":1144,"dĺž":1131,"úto":587,"km ":3082,"ki ":721,"úra":899,"dľa":1494,"ked":587,"keh":993,"kej":7958,"ke ":2633,"kci":1471,"kra":3230,"kre":4381,"kt ":843,"ku ":12244,"kro":679,"kri":597,"koz":582,"kov":7957,"km²":548,"kou":1936,"kos":1132,"kor":769,"kop":788,"kon":5151,"kom":7319,"kol":2625,"úzs":928,"úze":2017,"koc":987,"kni":537,"ko ":9276,"kla":4718,"jvo":598,"juh":1174,"jsk":1191,"jst":719,"ečn":971,"ju ":598,"jmä":776,"kat":1398,"kar":683,"kan":1054,"kal":813,"kam":835,"juž":746,"ka ":13070,"jvä":876," Ga":726," Ge":865," Fr":1186," Fi":580," Ha":1115," He":1127," Gr":1147," Hr":606," Ho":1810,"ha ":1403," Je":4569," Ja":1224," In":1544,"cúz":960,"han":1109," Ka":2166,"hal":560,"har":1373," Jo":992," Ju":970," La":1046," Le":1473," Li":1420," Ko":2060," Kr":1440," Ma":4398," Mi":1809," Me":2047,"he ":1247,"dá ":670," Lo":960," Lu":549," Ne":1808," Na":2120," Ni":923," Mo":1735,"dáv":586,"her":779,"hem":744," Am":651," An":1497," Al":1528," Ba":1854," Au":624," Ar":1172," Be":1433," Bi":821," Bo":1848," Br":2086," Bu":806,"his":874,"hit":630," Ca":1107," Ce":603," Ch":1547," Co":1476," Da":898," Di":709," De":1164," Do":1293,"hla":2518," Eu":965,"ho ":17722," Fa":559,"gli":887," Wi":780," Vy":655," Ze":700," Za":604,"gió":559,"gov":534,"údi":539," a ":25661," Zá":880,"úci":2715,"úca":1146,"gra":2234,"úce":1226," Or":774," Po":4736," Pl":565," Pi":684," Pe":1292," Pa":2173," No":1171," Ob":929," Ra":1479," Ro":1415," Re":1042," Ná":657," Ri":640," Pr":3280,"gus":545,"gré":723," Má":761," Sv":647," Su":611," St":2410," Ta":1213," V ":1733," Th":921," Ti":628," Te":1468," Tr":1332," To":1247," Ru":1047,"úhv":785," Sa":1649," Si":774," Sc":533," Se":1428," So":1114," Sp":1448," Sl":3357," Va":861," Ve":1359," Vi":989," Vo":610," Tu":845," ja":3535,"iam":1045,"ial":965,"ian":2212," až":955,"iar":1167," je":43951,"iat":1943," 
in":4275,"iac":3458,"iad":1663,"ibl":710," dĺ":994," ka":3354," m ":975,"ĺžk":983," ke":682," ju":2279," ha":599," he":772," gr":1776,"ia ":16846," dô":541," k ":1516," id":930," ic":916," hi":1068," hl":2110," hm":698," ho":3959," hr":2464," hu":1376," hv":1179,"iet":1590,"iev":722," ni":2558,"iez":1705,"iel":2995," ne":6825,"iem":1362,"ien":1741," na":26193,"ier":2554,"ies":2613,"ied":1764," my":693,"ieh":899,"iek":2897," mu":967," mo":3923," mn":871," ok":5647," oc":805," od":6282," of":678," jú":829," ob":11824,"ifi":1055," no":2104," le":2787," li":2051,"ick":16621," la":1441," kv":711," ku":1362,"ici":1650," kt":14067,"ich":3629,"ice":1897," kn":595,"ie ":19237," km":3854," kl":1191,"ica":2319," kr":5176," ko":7181," me":12179," mi":3009," o ":2049," ma":5130,"idl":708,"ide":1943,"ida":634," lo":1116," ab":764," am":1765," an":2592," ap":672," aj":2712," ak":5587," al":12145," au":1810," ar":1452," at":917," as":1460," ba":944,"il ":2232," bi":1273," be":1234," bo":7806," bl":568,"ieč":555," by":978," bu":1277," br":1360,"ať ":1963,"im ":540,"ika":2006,"ii ":2231,"ik ":1390,"imo":583," en":822,"ime":618," el":1140," ek":582,"imi":618," fe":960,"inc":1687,"ind":914,"ina":4033," fa":975," ex":1042," fu":1061,"inn":1036," fr":1747,"ino":2279," fo":1624,"int":1331,"ins":1181,"inf":730,"ine":3297,"iež":1149,"ing":1758," fi":2971,"ini":1596,"ink":652," ge":918," ga":535," bý":593,"iká":1538,"inu":1322,"iny":3026," fy":712,"iko":1475,"ikl":600," ce":3110,"ike":783," ch":2700," ci":1436,"ila":1287," da":904,"in ":1617,"iky":1322,"iku":1353," do":7552," dn":677,"ilo":2707," dl":624,"ill":858," dr":3425,"iln":537,"ilm":766," de":4244,"ili":1886," di":2885,"ieľ":575," dv":1786," du":557," vý":6398," zm":1324," zl":1104," zo":2693," zn":2605," zr":719," zv":1435,"hok":654,"hol":1107,"hom":770," za":7013," zd":916,"hos":535," ze":560,"hov":3246," vä":714,"hor":2878,"dí ":1492,"hod":4341," tý":674,"hnu":694," z ":7975," sú":8023,"huj":767,"hud":1122," vš":1583," už":616," zá":4976,"hu ":2863,"hro":776,"hra":2720," ru":986," sa":11939," se":4266," sc":711,"hyb":690," si":1947," sm":780," sl":4314," sk":5136," sp":8017," so":2515," mô":637," ra":2110," re":5038," ná":5061," ri":1768," ro":13226," pr":34757," ps":641," s ":5627,"hy ":1193," má":2179," os":2878," ot":721," op":1633," or":1845," oz":1525," pe":1662," lá":612," pa":3109,"hrá":1069,"hve":833,"hvi":1289," pl":3456," po":28142," pi":764," vy":7755," rô":570," vz":3164," sí":744," va":859," ve":5421," rá":581," vn":537," vo":6181," rí":775," vr":1254," vi":1930," vl":2079," ty":876," pô":1710," tv":1722," tu":1188," us":808," ur":1335," um":1379," un":575," ta":2281," v ":29073," sy":2275," st":8244," sv":3511," su":1035," pí":654," tr":3662," to":6094," th":647," ti":1601," te":5857," Če":1108,"far":539," čí":1229," čo":669,"ext":753,"ez ":733," če":1631," čl":1364," či":2100,"exi":778," ča":4766,"ezd":2253,"evá":647,"eze":538," ďa":573,"eta":1992,"ete":2049,"eti":2038,"etn":619,"etk":1257,"esp":670,"esn":932,"eso":1170,"est":8782,"esu":1559,"eto":2278,"etr":1705,"ets":539,"erá":1903,"eve":2813,"eva":786,"evo":552,"evn":668,"evi":600,"eur":694,"er ":5579,"epa":610,"eol":809,"es ":3178,"ept":842,"epu":784,"epl":609,"epo":608,"epr":993,"eri":5783,"ejš":2097,"erg":805,"ená":3253,"ere":1934,"era":3225,"et ":1437,"emí":1248,"esk":2173,"esi":987,"ený":3926,"ese":2018,"esa":609,"erz":959,"erv":1342,"eru":692,"ení":1974,"ert":777,"ers":1662,"ern":3749,"ené":2633,"erm":1031,"ero":3934,"eko":1708,"ekt":4312,"eku":666,"en 
":3989,"ela":1378,"ele":4527,"eli":2168,"eln":1067,"elk":666,"ell":742,"elo":1876,"els":755,"emb":1776,"ema":956,"eme":4087,"emo":708,"emi":2282,"ene":3296,"ena":1245,"end":639,"enc":1378,"eno":3633,"enn":1593,"enk":694,"eni":7601,"ens":6528,"ent":6824,"eká":616,"eob":637,"egi":891,"ej ":24245,"eho":5071,"ek ":1552,"eja":658,"el ":2590,"ejo":597,"eke":577,"eka":784,"em ":1276,"git":547,"gie":653,"gic":1619,"gia":694,"gen":1449,"geo":632,"býv":889,"gal":592,"gan":1363,"fyz":775,"fun":734,"fra":1611,"ače":897,"ači":550,"ačo":893,"ačn":1506,"aču":636,"for":2552,"fic":1396," ľa":1263,"fil":2159,"fik":1059,"fin":689," ľu":1198,"da ":3499,"de ":4522,"dal":718,"daj":777,"dat":690,"dan":1747,"ňov":688,"ňuj":774,"cko":2082,"chá":3113,"cky":1826,"ciá":1302,"cií":762,"cká":2980,"cké":4043,"com":638,"cov":2204,"cou":843,"cký":6183,"ch ":28988,"cer":678,"ces":1333,"cen":1906,"cel":1592,"ci ":4101,"cha":2180,"chu":807,"chy":636,"cia":4155,"cie":3755,"cic":1003,"che":1676,"chl":672,"chi":1393,"cho":6193,"chn":823,"chr":824,"cii":806,"ciu":1018,"cio":1122,"cke":2891,"ed ":1409,"ebe":670,"ebo":10260,"ebr":822,"ec ":2815,"edk":836,"edm":627,"edn":6191,"edi":1893,"ede":2996,"eda":1823,"edz":3180,"edy":1130,"eds":867,"edo":1996,"eck":2662,"ech":1905,"eci":648,"ece":694,"ecn":583,"dy ":3344,"drá":560,"dvo":1617,"dzi":3041,"dze":584,"dza":3109,"dor":670,"dop":717,"dom":1779,"dol":1125,"dok":1204,"dov":3715,"dos":1422,"dpo":810,"dmi":940,"dna":890,"dne":4630,"dni":563,"dno":4328,"ôzn":583,"dob":3335,"dst":1631," ús":674," úz":1992,"duk":644,"duc":748,"dné":1797,"dra":864,"dná":1325,"dre":621,"du ":2083,"dro":1109,"dní":530,"dru":2904,"dsk":1498,"dný":2826,"ôso":1218," úč":553,"dic":713,"dia":1795,"der":1566,"dec":966,"dej":622,"del":1924,"den":5230,"dep":553,"ňa ":821,"dla":715,"do ":3088,"dlo":1153,"dlh":673,"dli":698,"div":859,"din":1669,"dio":707,"dis":1227,"die":2214,"ôvo":1350,"rga":1225,"ri ":3554,"rgi":554,"rge":623,"ižn":609,"ret":2145,"res":5921,"nás":953,"rev":1170,"náv":764,"náz":1745,"rez":1012,"reb":974,"rea":838,"nác":533,"rec":1162,"red":6538,"rej":1959,"reg":962,"reh":593,"nám":1836,"rem":1118,"ren":3464,"rek":894,"rel":975,"nál":1168,"nár":2386,"rep":1499,"óri":1260,"ná ":9951,"re ":5558,"rch":2256,"raz":2538,"rd ":682,"ras":1183,"rat":3414,"rav":4611,"óps":667,"môž":675,"raj":2984,"ran":7562,"ram":1814,"ral":930,"rak":1723,"raf":1089,"rad":3121,"rac":1364,"rs ":542,"ros":1960,"rot":1344,"rom":3824,"ron":1914,"rop":884,"roz":4727,"rou":580,"rov":10282,"rob":1537,"rod":4794,"roc":1978,"ní ":2500,"roj":1677,"rol":693,"rok":6707,"rof":585,"rog":1008,"rno":1434,"rny":1014,"rna":1377,"rež":722,"rne":2865,"rni":769,"rmo":565,"jší":951,"nét":1209,"ro ":773,"rma":1073,"riá":846,"néh":4120,"rka":749,"né ":10505,"rio":767,"rit":1644,"ris":1567,"rig":582,"jši":1295,"ril":574,"rik":952,"rin":1063,"rim":561,"ria":5593,"rib":817,"ric":4203,"rid":872,"rie":6455,"rk ":572,"nóm":573,"ruh":2733,"rum":558,"ruk":768,"rus":1271,"rva":761,"rvk":581,"rve":789,"rvo":671,"ry ":2619,"rsk":3778,"rný":683,"rst":1039,"rto":675,"rti":742,"roč":900,"ním":1698,"ník":2268,"rmá":1093,"níc":1168,"rt ":585,"rné":631,"ru ":1693,"sad":597,"sah":1086,"sam":778,"nýc":6239,"óno":796,"ným":3498,"san":671,"sa ":11522,"ruž":608,"ný ":12353,"ón ":764,"nú ":1355,"rvá":821,"ógi":1295,"rvý":709,"rzi":638,"si ":1248,"sie":988,"sia":1333,"sil":824,"se 
":2313,"sch":1140,"sev":1936,"ser":859,"sed":553,"sep":711,"sen":856,"sel":864,"spo":4892,"spr":1629,"spe":1357,"spi":721,"skú":704,"ský":5535,"sov":2280,"ské":3823,"son":682,"ská":3010,"sok":687,"soc":946,"sob":2176,"su ":2117,"st ":1149,"slo":4792,"slu":716,"sky":4413,"sla":2054,"sle":1956,"ski":627,"skl":733,"sko":6875,"skr":1088,"sku":6994,"ska":3470,"ske":5984,"sno":764,"sne":932,"so ":1147,"sme":1025,"stí":1337,"sté":1586,"stá":1256,"syn":543,"sys":1453,"ste":3353,"sta":10107,"stn":2428,"sto":7431,"sti":10473,"stv":3596,"stu":2162,"spô":638,"str":11165,"sts":614,"sve":2116,"svo":1166,"tak":1277,"tal":2000,"tad":583,"tav":3617,"tat":1770,"tas":556,"tar":2723,"tan":2795,"te ":3202,"ta ":5658,"ký ":7646,"kús":915,"ouž":1757,"ozá":729," št":4899,"pa ":715," šk":769," šp":1040," Št":611,"kú ":741,"osť":3922,"pe ":579,"lá ":591,"kýc":4322,"kým":2061,"par":1586,"pat":1317,"pad":2850,"pal":635,"pan":1020,"pev":859,"láv":675,"lác":551,"pec":635,"lád":794,"pen":751,"lán":627,"lár":543,"per":2191,"lát":631,"pel":545,"pla":2526,"ple":978,"pln":576,"plo":927,"ply":661,"pie":636,"pin":2362,"pis":1414,"pit":610,"poz":1867,"pr ":604,"por":2069,"pop":662,"pov":2089,"pou":1725,"pot":3320,"pos":2178,"poj":1973,"poh":1271,"pom":1667,"pon":991,"pok":841,"pol":5900,"poc":543,"pod":5807,"po ":2974,"psk":735,"pub":965,"pte":802,"poč":2131,"pra":4417,"lín":914,"prv":2306,"pri":8032,"pre":12441,"pro":5879,"lóg":1712,"poľ":636,"prá":2352,"prí":5030,"lý ":669,"má ":1849,"mä ":776,"mál":594,"mác":652,"iť ":1130," ži":2499," že":1412,"mí ":997,"ra ":5741,"eži":1187,"ngl":1479,"ni ":1003,"nge":617,"ncú":959,"neh":1387,"nej":10418,"nen":2040,"nem":1523,"nep":832,"ner":1759,"net":1021,"nes":1971,"nev":549,"ež ":1107,"ng ":1176,"nec":565,"nfo":594,"nač":2104,"nez":594,"nco":652,"nci":2449,"nce":985,"ne ":14236,"ndr":573,"ndo":698,"ndi":836,"nde":718,"nda":660,"nak":996,"nal":1254,"nam":2854,"nan":1218,"nap":1468,"nar":576,"nac":1381,"nad":1643,"naj":5055,"nd ":984,"nav":578,"nat":1148,"nas":648,"naz":679,"na ":25237,"mys":1196,"ión":1128,"mož":865,"nyc":1783,"ntá":825,"nož":749,"ny ":7794,"nut":1486,"nto":1924,"ntu":602,"ntr":1488,"nti":2074,"nta":1252,"nte":2106,"nst":911,"nsk":12019,"nný":1960,"nu ":2533,"ičn":889,"nt ":1523,"noh":590,"nom":6261,"not":1866,"nos":7269,"nor":620,"nov":6617,"nou":2761,"nne":1300,"nno":871,"nič":860,"no ":2111,"nka":1319,"nky":647,"nko":836,"eží":971,"nie":10761,"nic":3941,"nia":4773,"niz":1417,"niu":577,"niv":740,"nis":1117,"nit":919,"nin":802,"nik":3242,"ogr":1539,"ogi":1853,"ohr":721,"ohu":747,"oho":1731,"oha":677,"oj ":736,"ok ":6372,"ohy":806,"ojv":611,"ojo":1042,"ojn":974,"oji":587,"oje":2846,"oja":533,"ol ":4900,"oce":1370,"och":5397,"oci":1231,"ock":745,"obs":891,"oby":2302,"ká ":7010,"ode":2380,"odi":1507,"odo":2687,"odp":783,"odn":6582,"ods":1082,"odr":641,"of ":750,"oda":688,"kál":578,"ody":570,"odv":550,"odu":1909,"kác":693,"hľa":877,"ofi":1501,"júc":4607,"oba":663,"od ":5680,"obo":1131,"obr":1755,"obl":2050,"obn":2229,"obj":2245,"obi":1583,"obd":699,"obc":1130,"obe":3213,"nym":950,"jú ":3234,"owi":541,"ový":6197,"ozm":884,"ové":3486,"ozn":2417,"ozl":939,"ozo":2454,"ozd":725,"ová":4125,"otv":654,"otk":570,"oti":1114,"ote":984,"otr":702,"oto":4368,"otn":953,"ost":12819,"ota":944,"ov ":17465,"osi":598,"osk":702,"osp":667,"osl":1521,"oso":997,"orú":1676,"orý":4753,"ovi":5011,"ovn":3441,"oré":3735,"orí":947,"ovo":4172,"ovs":2190,"ova":12599,"ovc":817,"orá":2973,"ove":10952,"opo":1188,"opi":803,"ope":1090,"opa":913,"os ":1011,"oló":1594,"opr":1001,"olí":768,"or 
":2600,"ork":577,"orm":2535,"orn":2464,"oro":4190,"ord":568,"ore":3404,"oná":934,"org":1589,"ori":3545,"ou ":11758,"osa":791,"ort":1007,"ors":1188,"oru":1061,"onó":542,"ory":689,"omá":669,"m² ":561,"ora":1870,"ízk":541,"íto":1670,"ola":3491,"on ":2680,"oli":3155,"olk":708,"ole":1539,"oln":999,"olo":6182,"oly":771,"olu":1129,"oka":1003,"om ":22923,"ké ":4451,"ísl":936,"oke":803,"odľ":1432,"okr":4784,"íta":936,"oko":5187,"okt":536,"oku":4912,"ona":1883,"ond":761,"onc":866,"one":891,"ívn":1999,"oni":1442,"onk":548,"ono":1770,"ons":933,"ont":1301,"oma":1179,"ome":2634,"omi":1538,"kéh":4205,"omp":910,"omn":567,"omo":2048,"omu":756,"íva":2087,"la ":7993,"íms":589,"ína":690,"íns":810,"ím ":3028,"íka":561,"íko":789,"ín ":1967,"le ":3820,"eľ ":1241,"íro":535,"ísa":578,"lad":4725,"ípa":543,"laj":540,"lan":3456,"lam":627,"lat":2521,"las":3716,"lav":3631,"lbu":1038,"krá":1832,"kva":670,"kut":584,"kup":2670,"kum":749,"kul":1450,"kuj":582,"ky ":10576,"íck":596,"ích":1516,"íci":652,"ídl":750,"ktr":1255,"kti":779,"kto":15825,"ík ":1507,"ktú":719,"kyc":690,"ktí":857,"lok":542,"lon":552,"lom":1766,"loh":1318,"log":1964,"los":1486,"lov":10431,"loz":1479,"lno":1179,"lež":1031,"lne":3312,"lny":1606,"lič":614,"lna":768,"loč":1880,"lsk":1225,"íc ":720,"lné":534,"liž":568,"lu ":1579,"lný":723,"li ":2968,"lez":532,"lex":642,"eľs":559,"les":1366,"let":1210,"ler":562,"eľo":2345,"eľm":560,"len":3929,"eľn":732,"eľk":1782,"lek":1883,"lej":738,"led":1215,"hád":2792,"lec":545,"leb":10035,"eľa":1087,"lo ":4534,"eň ":963,"lle":613,"lli":641,"lko":992,"lit":2952,"lis":1429,"lin":2535,"liz":1155,"liv":1148,"lic":2198,"lia":1970,"lik":1405,"lie":1755,"ma ":1882,"hý ":549,"maj":736,"mag":530,"mar":970,"mal":1423,"man":1923,"mat":2750,"mbe":991,"mbr":959,"me ":2180,"med":3229,"mec":1667,"met":2730,"mes":5087,"mer":6360,"iál":1808,"mel":722,"men":7980,"ly ":1425,"lož":1837,"ltú":839,"mpl":621,"moc":890,"mod":1086,"mon":943,"mom":711,"mov":2255,"mor":1939,"mos":887,"mot":1139,"mu ":3156,"msk":1141,"my ":1946,"mus":1219,"mun":908,"mi ":7759,"min":2367,"mil":710,"mit":579,"mic":1764,"mia":681,"mie":3132,"ií ":1338,"mno":1169,"Čes":624,"výr":1121,"výs":904,"výv":564,"vým":1096,"výz":1213,"sť ":5294,"sťo":1256,"čas":7534,"zná":1803,"čen":2971,"čes":715,"zsk":1303,"či ":954,"zvy":552,"zuj":707,"čia":1892,"čit":761,"čin":1952,"čka":799,"zyk":1455,"člo":565,"čo ":704,"čle":748,"čeľ":579,"čov":1446,"vý ":3640,"čno":1723,"čne":1110,"výc":4086,"zi ":1861,"zač":716,"vác":643,"zdr":695,"zen":1178,"zem":2639,"vár":636,"zer":667,"vá ":4679,"zdi":837,"zde":597,"zaj":992,"zan":554,"zal":1158,"zar":737,"zos":869,"zor":857,"zof":1473,"zov":2409,"zo ":1433,"väč":1491,"zme":835,"véh":1297,"zna":5129,"zmu":1473,"zmy":578,"zne":607,"zni":1787,"vé ":2772,"zko":574,"zlo":1388,"zdí":688,"väz":626,"zin":1015,"zik":680,"zit":695,"yva":1437,"ytv":779,"yst":2333,"yso":679,"ysl":1049,"za ":3910,"yzi":807,"ych":3717,"tým":683,"yda":835,"uár":767,"túd":552,"tý ":1101,"túr":2064,"týc":709,"ym ":918,"yko":908,"tín":557,"tív":2085,"tém":1500,"tí ":1453,"ože":1844,"ožn":749,"oži":948,"tár":638,"tát":2299,"táv":759,"tác":584,"tál":1075,"súč":1671,"té ":1097,"xis":565,"súh":1058,"sús":702,"sú ":2895,"tá ":794,"sér":579,"síd":652,"rče":541,"rči":735,"vzť":595,"vzn":1177,"vzd":855,"vyr":543,"vyd":936,"vys":1291,"vyt":825,"vyv":562,"rú ":1621,"rý ":3747,"vyš":1099,"rýc":1417,"rís":711,"rít":1609,"ríp":667,"rím":628,"rík":630,"vní":633,"vné":1199,"vrc":1038,"vst":887,"vsk":2197,"vu ":969,"vný":1142,"vuj":550,"vy 
":2098,"voľ":584,"róp":1084,"rôz":582,"via":1774,"vil":1662,"vať":944,"vin":2530,"vic":742,"vid":1528,"vie":3018,"nšt":824,"vit":1615,"vis":1230,"važ":892,"ré ":3238,"vla":1346,"ráľ":818,"veľ":1567,"vo ":5058,"réc":806,"réh":740,"vne":2233,"vna":1235,"vno":1530,"vny":872,"rí ":1490,"voc":735,"vod":4147,"voj":3988,"vol":995,"vom":1769,"vor":3219,"vot":802,"vos":2079,"vov":1480,"vou":852,"vlá":818,"vi ":826,"vez":833,"ver":4560,"ves":693,"oľs":926,"rát":1564,"vet":2834,"ráv":2279,"vej":3167,"ven":6559,"rán":812,"oľn":609,"vel":598,"rál":1811,"vek":1439,"ved":2502,"vec":641,"rác":1499,"rá ":3488,"ve ":3005,"val":2956,"van":10444,"var":1669,"vat":3033,"vac":763,"vaj":1282,"va ":7123,"urč":1357,"uró":1104,"usk":1487,"ust":1684,"uti":837,"ute":592,"uto":2154,"us ":3503,"uri":613,"uro":616,"ujú":3483,"upi":2474,"ulá":681,"upe":584,"upn":539,"umb":576,"ume":1559,"unk":1091,"uni":1311,"ukt":726,"um ":2635,"ult":1375,"ulo":557,"uli":689,"uhu":583,"uje":4632,"uho":1552,"ugu":540,"ude":572,"uch":1187,"uh ":722,"udo":1777,"ubl":1069,"tvá":863,"typ":873,"ty ":3084,"očí":1245,"trí":632,"tvo":4048,"trá":1603,"tve":851,"tva":2149,"tur":912,"tuj":829,"tup":1550,"pôv":1057,"pôs":1202,"pís":996,"tná":587,"oče":1143,"tre":4383,"tra":5412,"oča":775,"tné":1110,"oči":806,"tri":3765,"tru":1300,"tro":6109,"očn":2672,"tu ":3234,"tný":1463,"tsk":2708,"toč":963,"to ":9348,"tne":1495,"tno":1220,"toc":600,"tou":572,"tov":6017,"tos":575,"tom":2788,"ton":1192,"tok":6341,"tol":1656,"tor":19018,"top":560,"tký":805,"tik":2033,"tif":575,"tie":2901,"tit":878,"tis":1709,"tin":2110,"tio":893,"tia":1542,"tic":5672,"tiv":708,"tko":851,"tka":2298,"tli":832,"tky":1006,"tla":755,"teľ":4587,"tem":1660,"ten":2822,"tep":676,"tej":1310,"tek":1233,"tel":2263,"tec":925,"ted":826,"tex":545,"ter":5642,"ti ":7610,"tač":896,"ží ":1158,"žív":1856,"zťa":609,"yšš":666,"úča":1833,"zýv":664,"ľko":705,"ľom":604,"ľký":597,"ľov":2803,"žin":599,"žil":543,"živ":1147,"žit":1013,"žia":1434,"žij":593,"žov":702,"žno":718,"žne":1488,"že ":2141,"ľav":960,"ľad":1602,"žen":2262,"ľa ":2178,"zác":1029,"záv":683,"záp":1954,"zák":1752,"uži":784,"ýra":532,"užn":1127,"ýro":660,"ým ":5866,"ými":2162,"ôže":563,"ých":17568,"žsk":553,"ľsk":1094,"ľud":1147,"zém":590,"vše":1630,"uží":1835,"ýzn":1190,"ýva":1701},"n_words":[5274251,6043345,4170145],"name":"sk"}
\ No newline at end of file
+++ /dev/null
-{"freq":{"D":7697,"E":4326,"F":7483,"G":7482,"A":12478,"B":11289,"C":8095,"L":10635,"M":14480,"N":8171,"O":6564,"H":9046,"I":7126,"J":7268,"K":11757,"U":2722,"T":7824,"W":1905,"V":10234,"Q":288,"P":16672,"S":26591,"R":9751,"Y":433,"X":356,"Z":6077,"f":21271,"g":83131,"d":174445,"e":575804,"b":82172,"Fed":58,"c":56405,"a":599039,"n":410604,"o":476628,"l":247955,"m":151651,"j":244223,"k":242702,"Fel":92,"h":59332,"i":522865,"w":2524,"v":222867,"Fer":370,"u":129396,"t":241091,"s":268170,"r":314054,"q":489,"p":164089,"z":94676,"y":6418,"x":1284,"²":123,"Î":185,"É":129,"Á":62,"Fil":230,"í":2013,"Fin":151,"ë":81,"ê":309,"é":3055,"Fir":82,"è":584,"ç":174,"ä":365,"â":114,"á":3560,"à":133,"ü":557,"ú":870,"ø":79,"ö":488,"ô":419,"ò":234,"ó":1670,"ē":62,"đ":109,"Đ":87,"ā":111,"ć":1063,"Č":1470,"č":56247,"ŕ":73,"ő":245,"ł":69,"ō":86,"Ž":2008,"ž":30614,"Š":2528,"š":51762,"Fak":387,"Fal":70,"Far":111,"Eri":80,"Est":69,"Eti":62,"Ern":82,"Eur":142,"Eva":78,"Evr":794,"Ein":58,"́":345,"Ele":144,"Eko":86,"μ":124,"ν":240,"Ena":60,"ο":342,"ι":214,"κ":105,"λ":169,"δ":61,"ε":131,"η":95,"α":330,"β":59,"γ":116,"ά":79,"ί":88,"Emi":63,"Eli":70,"ό":99,"σ":119,"ς":264,"ρ":222,"π":98,"φ":61,"υ":78,"τ":148," l":19333,"ь":113," m":28795," n":57198,"я":125," o":41952," h":4815,"ш":89," i":55210," j":67383," k":54205,"ы":63," d":42036,"ф":78,"х":75," e":9710,"ц":79," f":9447," g":14912,"ч":253,"р":642,"с":417," a":19069," b":18884,"т":428," c":4312,"у":369," y":127," x":151," z":31775," u":19039," t":27202," w":136," v":56888," p":90919," s":76633," r":20918,"HK ":93,"И":69,"К":110,"Н":65,"М":132,"П":82,"Б":103,"А":111,"Г":80,"В":98,"Д":60," J":7259," K":11717," H":8994," I":7097," N":8139," O":6489," L":10542," M":14428," B":11242," C":8027,"С":106," A":12432," F":7459," G":7430," D":7664," E":4306,"л":503,"к":526," Z":6067,"й":182," Y":432," X":347,"и":1084,"п":111,"о":912,"н":714,"м":224,"г":177," S":26492," R":9719,"Ger":142,"в":657," Q":287,"б":103," P":16617,"а":1195,"з":72," W":1888,"Geo":282," V":10198,"Gen":320," U":2704,"е":796,"д":264," T":7802," č":5716," Č":1467," Đ":87,"Gla":255,"Gia":71,"HL ":80," Á":62,"Gio":131," É":127," Î":185,"Gir":61,"Giu":113," ž":7506," Ž":2008," Š":2526," š":7903,"ա":58,"Gan":63,"Gal":184,"Gam":76,"Gar":174,"Gab":108,"و":76,"ي":148,"ل":189,"م":127,"ن":117,"Fun":63,"د":71,"ب":101,"ا":258,"ر":124,"Flo":89,"Fla":62," А":111," Б":103," В":98," Г":71,"Fra":1714," Д":60," И":69," К":110,"Fri":169," М":132," Н":65,"A ":1521," П":82,"Fre":167,"For":1812," α":63,"F ":287,"Da":1043,"Cu":190,"Cv":64,"Cy":61,"Cl":275,"Co":1643,"Cr":347,"Ce":1243,"Ch":1233,"Ci":465,"G ":285,"Ed":331,"Ea":63,"Dv":143,"Du":856,"Do":1622,"Dr":1054,"De":1300,"Di":918,"Fe":729,"H ":307,"Fa":834,"Eu":245,"Ev":1000,"Er":320,"Et":115,"Es":220,"En":413,"Em":164,"Ep":63,"Ei":124,"El":403,"Ek":172,"Eg":130,"Ge":887,"Ga":902,"I ":771,"Fu":219,"Fr":2141,"Fo":2047,"Fl":225,"Fi":681,"B ":327," С":104,"II ":321,"C ":846,"Av":1043,"Au":489,"Ar":1294,"At":316,"As":365,"D ":417,"Ba":1844,"Az":412,"Af":338,"Ag":178,"Ah":87,"Ab":563,"Ac":172,"Ad":372,"Am":987,"An":1820,"Ap":398,"Ai":256,"Aj":81,"Ak":264,"Al":2250,"Hit":92,"Bu":797,"Br":2247,"Ca":1535,"E ":345,"Bi":1004,"Hid":63,"Be":2195,"Bo":1910,"Hil":81,"Bl":438,"Bj":272,"Hip":72,"Kv":109,"Ku":509,"Kn":259,"IE ":65,"Kl":445,"Kr":2275,"Ko":3816,"Le":3189,"Lj":1723,"Li":1298,"N ":264,"La":1624,"Lu":757,"Ly":76,"Lo":1512,"Me":2219,"Dž":96,"Mi":1869,"Ml":211,"O ":680,"Ma":6025,"Mc":133,"Mu":580,"Mr":101,"Mo":2460,"Nj":399,"Ni":828,"Já":138,"Ne":1598,"Na":2980,"P 
":622,"Hel":136,"Ny":85,"Hei":136,"Nu":95,"No":1554,"Ok":258,"Ol":321,"Om":113,"On":187,"Og":200,"Oh":58,"Oc":80,"Od":1568,"Hen":142,"Her":380,"Ob":922,"Gi":553,"Gl":510,"Gr":1909,"Go":1659,"Gu":367,"Gv":83,"Gy":129,"Cô":91,"J ":297,"Ha":1563,"He":1085,"Hi":543,"Ho":701,"Hr":4064,"Hu":331,"K ":438,"Ib":78,"Id":108,"Ig":180,"Im":513,"In":1624,"Il":230,"Iv":415,"Is":1036,"It":703,"Ir":277,"Ja":1804,"L ":443,"Iz":883,"Ji":68,"Je":1541,"Jo":1290,"Hab":58,"Ju":2001,"Hal":126,"Haj":64,"Ka":2508,"Han":157,"M ":389,"Ham":119,"Har":293,"Ki":622,"Győ":92,"Hau":402,"Ke":544,"Us":272,"Ur":262,"Up":188,"Um":80,"Un":907,"Uk":147,"Ul":103,"W ":109,"Ty":77,"Tu":672,"Tr":1582,"To":1468,"Th":691,"Ti":704,"Te":1011,"Ta":1070,"V ":1619,"Côt":91,"Sw":66,"Sz":701,"Sy":147,"St":3308,"Sv":2528,"Su":652,"Wo":195,"Wi":724,"Wh":64,"Wa":405,"We":272,"Vz":150,"Vo":999,"Vr":488,"Vs":193,"Vu":139,"Vi":1620,"Vl":200,"X ":215,"Va":913,"Ve":3902,"Má":81,"Pt":177,"Pu":321,"Pr":3992,"S ":1529,"Pe":1571,"Pa":2197,"Gui":73,"Lé":59,"Pl":789,"Po":5526,"Pi":1027,"Ph":198,"Os":738,"Ot":263," ا":112,"Op":322,"Or":646,"R ":294,"Oz":78,"Se":2093,"Sc":592,"Si":1170,"Sh":254,"Sn":82,"Sm":234,"Sl":6573,"Sk":484,"Sr":839,"Sp":1374,"So":1617,"Ru":971,"Grč":81,"U ":245,"Sa":2421,"Re":3735,"Rd":107,"Ri":922,"Rh":128,"Ro":1660,"Qu":211,"T ":330,"Ra":1534,"Mü":85,"Gre":332,"Gri":94,"Gra":810,"Grb":113,"b ":3960,"Gru":126,"Gro":222,"a ":211856,"Yo":229,"Gle":73,"Z ":240,"Glo":95,"Gol":160,"Gor":907,"Gos":120,"Za":1749,"Zd":593,"Ze":881,"Zi":330,"Zg":337,"Vé":82,"Zm":60,"Zl":152,"Zo":102,"Zn":141,"Zu":99,"God":61,"Zr":80,"Zv":306,"i ":134220,"gd":128,"ge":5872,"ga":24650,"fj":134,"Inf":332,"fl":264,"ff":331,"fi":4842,"fs":419,"fr":3897,"ač":5802,"fu":678,"ft":369,"fo":2045,"Int":382,"fn":91,"j ":12717,"gy":387,"dá":73,"he":3525,"ha":5343,"gn":1368,"gm":166,"gl":6996,"gi":8855,"gh":503,"gg":192,"gv":88,"gu":2562,"gt":124,"gs":203,"bč":5121,"gr":10356,"cí":78,"go":15666,"dt":312,"du":5066,"dv":5016,"dw":173,"dy":144,"dz":369,"g ":4531,"Ima":116,"ea":2500,"eb":7309,"ec":6168,"ed":29381,"de":25427,"Ili":79,"dd":738,"dg":429,"di":24057,"dh":181,"dk":1177,"dj":1456,"dm":1512,"dl":1112,"do":16087,"dn":14568,"dp":1050,"ds":3632,"dr":13867,"ew":457,"ex":237,"eu":1040,"ev":19283,"ey":856,"ez":12153,"fa":2439,"h ":28964,"aú":301,"Ind":547,"fe":2538,"bá":91,"eh":4032,"eg":21346,"ef":1414,"ee":651,"Ime":316,"el":42176,"ek":16159,"ej":9407,"ei":1868,"ep":9382,"eo":3145,"en":61076,"em":38275,"et":32921,"es":22879,"er":40015,"ca":8624,"bz":65,"e ":176361,"bv":197,"by":89,"bs":2013,"br":6519,"bu":2540,"bt":104,"bn":3084,"bo":9119,"bj":937,"bk":143,"cL":77,"bl":11842,"bm":547,"bh":211,"bi":17028,"bb":158,"bd":603,"be":10505,"dc":181,"db":1176,"da":28148,"f ":2764,"cz":61,"cy":123,"cv":189,"cu":951,"ct":549,"cs":478,"cq":69,"cr":248,"co":5124,"cm":121,"cn":95,"ck":1329,"cl":245,"ci":15112,"ch":3183,"ce":12343,"cc":353,"c ":6745,"az":10124,"ay":618,"ba":6779,"d 
":25792,"at":30196,"as":24786,"ar":39381,"ax":151,"aw":283,"av":30027,"au":2021,"ak":12407,"al":45711,"ai":1701,"aj":23033,"ao":367,"ap":6069,"am":13386,"an":71011,"ac":6592,"ad":26926,"aa":226,"ab":7824,"ag":7152,"ah":7624,"ae":1031,"af":1621,"nu":5151,"nt":10999,"ns":25512,"ič":13659,"nr":295,"np":329,"no":51231,"nn":1495,"nz":984,"ny":526,"nw":82,"jó":58,"nv":255,"oe":569,"ká":135,"of":3946,"oc":3247,"od":39028,"oa":584,"ob":20785,"om":19239,"on":23036,"ok":14837,"ol":27351,"oi":2548,"oj":11831,"og":11726,"oh":1523,"m²":117,"ot":18168,"os":30813,"ov":55859,"ou":2158,"op":10262,"oo":1164,"or":35498,"jč":131,"r ":20683,"ox":97,"kó":141,"ow":586,"kö":75,"oz":8175,"oy":187,"pd":65,"lá":269,"pe":10198,"pa":26706,"Igr":74,"pc":146,"pl":6471,"lé":474,"pn":2144,"po":53400,"ph":657,"pi":10229,"pj":64,"pk":185,"lo":38721,"ln":9297,"lm":1635,"hé":133,"ll":3272,"ls":3383,"lp":1148,"lv":458,"lu":4829,"lt":2975,"lz":195,"ly":422,"hô":58,"Idr":67,"o ":117173,"mc":91,"md":104,"ma":23441,"mb":4609,"dž":1661,"iá":58,"me":39489,"mf":152,"mk":917,"iè":113,"ml":1947,"mi":12739,"eš":4276,"mj":85,"mn":2345,"mm":463,"mp":2668,"mo":14761,"hč":66,"mr":915,"mt":95,"ms":4458,"mv":69,"mu":5539,"my":115,"p ":1343,"na":91417,"nb":277,"nc":8502,"nd":6315,"ne":43250,"já":118,"nf":942,"ež":5003,"ng":4878,"nh":345,"ni":72174,"nj":24738,"nk":3254,"ić":821,"nl":278,"nm":267,"jv":1409,"jt":127,"ju":14494,"eč":5728,"js":10383,"jp":458,"jn":4767,"jo":15551,"jl":168,"jm":397,"jk":398,"kj":752,"ki":56467,"kh":121,"ke":31686,"kd":477,"kc":1346,"ka":42561,"fü":98,"m ":32866,"jz":162,"ky":130,"ks":2035,"kt":5285,"ku":8273,"kv":2313,"ko":52366,"gí":90,"kr":10395,"kk":78,"kl":5176,"km":1749,"gé":62,"kn":1425,"dš":398,"li":49096,"lh":245,"lk":2648,"lj":32952,"le":34235,"há":163,"ld":976,"lg":1256,"lf":539,"hâ":70,"la":37335,"lc":2469,"lb":823,"n ":53330,"hr":1878,"hs":96,"dí":74,"hw":70,"ht":692,"hu":981,"hj":61,"hk":1523,"hi":3365,"hn":1375,"ho":9636,"hl":379,"dè":58,"dé":139,"hm":293,"id":5268,"ic":16189,"ib":3404,"ia":4880,"ih":24332,"ig":5375,"if":1125,"ie":2994,"hy":222,"dú":72,"k ":19129,"iq":110,"ir":15616,"dč":73,"is":20587,"it":17935,"iu":664,"iv":10504,"ix":194,"aš":9037,"ii":2454,"ij":44668,"ik":30544,"il":26890,"im":25029,"in":69797,"io":5253,"ip":4264,"jc":271,"jb":886,"je":114329,"jd":485,"až":1346,"bš":153,"ji":18381,"jh":438,"iz":20523,"l ":16581,"ja":44343,"tä":161,"pš":106,"xi":105,"té":252,"tí":147,"tó":164,"ww":69,"z ":12173,"xa":126,"ož":4663,"tá":178,"nž":333,"wi":424,"oš":3587,"sé":150,"wn":164,"wo":138,"sí":94,"rč":573,"ws":163,"ró":226,"rô":65,"vz":2935,"y ":2958,"wa":490,"sá":124,"we":348,"rè":76,"vl":5625,"vm":123,"ré":314,"vj":1191,"vk":1293,"vh":61,"nš":866,"vi":26446,"vg":688,"vt":1201,"vu":1200,"vr":6944,"vs":7139,"vp":572,"rí":254,"vn":18643,"vo":20029,"uz":834,"ux":225,"uv":1705,"uu":61,"ve":45929,"rá":407,"vd":234,"vc":816,"vb":122,"va":33280,"x ":755,"mš":2061,"ui":868,"uj":5428,"uk":2181,"ul":7777,"ue":1036,"uf":172,"ug":6200,"lž":478,"uh":1992,"ur":10254,"pč":78,"us":9210,"ut":2587,"um":3595,"un":5703,"uo":144,"up":17758,"ty":284,"tz":304,"tu":11745,"tt":1249,"tw":125,"pó":70,"tv":10431,"ub":7050,"ua":3183,"ud":8270,"uc":1029,"w ":442,"to":33340,"tn":10733,"pé":68,"tm":2491,"tl":2152,"ts":2917,"oč":7724,"tr":18852,"tp":111,"tg":72,"tf":115,"te":38423,"pá":79,"td":278,"tk":3275,"tj":2887,"lš":176,"ti":34585,"th":1742,"v 
":46495,"tc":114,"ta":42943,"su":2438,"sv":4904,"ss":1521,"st":64846,"sy":100,"sz":642,"sw":95,"sl":11373,"sk":60513,"sn":5408,"sm":2123,"sp":12768,"so":15794,"nč":1620,"sr":3075,"sd":114,"sc":1363,"sf":304,"se":40074,"sh":762,"sj":389,"kš":233,"si":9098,"rz":2218,"u ":23884,"sa":10107,"sb":1404,"mč":427,"rr":991,"rs":12425,"rt":8497,"ru":11754,"rv":10562,"rw":89,"nó":68,"ry":877,"ní":142,"rp":731,"ro":31529,"rn":11113,"né":172,"rm":4944,"rl":1460,"rk":7297,"nç":122,"rj":7769,"jš":3373,"ri":44436,"rh":1257,"rg":4613,"iž":2554,"rf":351,"ná":202,"re":43763,"rd":5068,"rc":1442,"rb":2537,"ra":68896,"t ":20636,"qu":409,"mí":90,"lč":258,"mé":211,"iš":12414,"má":431,"s ":18299,"px":59,"pt":1806,"pu":4524,"ló":186,"pp":444,"lí":193,"pr":43317,"ps":1569,"Hum":81,"yő":97,"vž":60,"zá":92,"už":4277,"vš":416,"uš":1692,"Hrv":3876,"yí":60,"Hra":77,"rž":2418,"zz":215,"vč":549,"zg":2406,"Hor":93,"zh":2141,"zi":10015,"rš":3989,"zb":1418,"zc":70,"zd":3868,"ze":5649,"vá":315,"za":21344,"Hon":64,"Hok":73,"Hol":102,"zv":5632,"zs":530,"zr":2612,"uč":1996,"zu":2176,"zt":382,"zo":6065,"zn":9166,"ví":186,"zp":931,"zk":568,"zj":291,"zm":3200,"vé":151,"zl":2570,"ye":263,"yc":118,"yd":123,"ya":395,"yt":90,"ys":299,"Hoc":85,"yr":276,"yp":87,"yo":155,"yn":206,"ym":124,"yl":250,"yk":172,"yj":189,"yi":151,"Arg":159,"Arh":66,"Are":64,"Ard":164,"šču":315,"Ara":107,"Arm":203,"ščo":81,"ščn":64,"Ari":101,"šči":1771,"Apo":252,"šče":2731,"šča":1225,"Atl":156,"Ast":131,"Ass":80,"Art":149,"Avt":105,"Avs":709,"Ave":81,"Auv":73,"Aut":64,"Aug":99,"zš":796,"Azi":191,"Azu":144,"Bak":78,"Bal":229,"Ban":193,"Bab":79,"Bad":69,"Bar":430,"Bat":109,"Bas":111,"Bav":61,"Aba":278,"Ada":80,"Ado":61,"Afr":271,"Air":92,"Al ":78,"šće":104,"Aka":59,"Akv":95,"Ala":127,"Alb":257,"Ali":77,"Ale":389,"Alf":117,"Alt":76,"All":122,"Alo":87,"Alp":662,"Ame":656,"Ama":87,"Ang":426,"Ana":133,"And":409,"šč ":144,"Ant":584,"Ann":63,"Buz":68,"Buk":74,"Bur":210,"Bud":88,"Bru":110,"Bož":77,"² ":120,"DA ":787,"DD ":66,"Cal":180,"Cam":157,"Cas":187,"Car":397,"Cat":66,"Can":211,"Cap":85,"Bea":66,"Bes":153,"Ber":630,"Beo":125,"Ben":301,"Bel":559,"Biz":86,"Bje":262,"Bil":214,"Bis":245,"Bit":75,"Bio":71,"Blo":64,"Ble":72,"Bla":204,"Bre":545,"Bra":506,"Bro":313,"Bri":606,"Boh":133,"Bog":157,"Bol":225,"Boj":60,"Bon":91,"Bor":510,"Bos":176,"Bou":105,"Îl":184,"Der":74,"Des":135,"Dev":75,"Dek":70,"Del":334,"Dem":64,"Den":114,"Deb":67,"Dam":75,"Dan":339,"Dar":97,"Dav":176,"Dal":121,"Chr":147,"Che":154,"Chi":125,"ám":76,"án":655,"Cit":86,"áj":126,"Cir":117,"ák":172,"ál":309,"ác":115,"ád":136,"áz":116,"áv":123,"ár":751,"át":278,"ás":315,"ât":67,"Châ":62,"Cla":118,"Cel":355,"Cen":243,"Cer":450,"Ces":88,"à ":58,"á ":69,"Cha":633,"Cre":137,"Cor":603,"Com":262,"Col":181,"Con":211,"Cou":99,"ós":120,"ót":110,"óv":101,"ôm":59,"ôn":89,"Duš":60,"ód":99,"ór":148,"ón":273,"óm":66,"ól":124,"ók":78,"ó ":118,"Drž":130,"ív":88,"íz":94,"ín":324,"ír":134,"ít":126,"ís":96,"ík":80,"íl":246,"íj":224,"íd":88,"ía":72,"Egi":80,"ên":58,"êr":99,"éz":94,"ék":134,"él":241,"éj":87,"ém":138,"én":587,"és":167,"ét":261,"ér":344,"év":100,"éd":144,"ée":75,"Edw":93,"èn":67,"èr":126,"ço":94,"é ":306,"ät":162,"Do ":87,"ć 
":449,"Dia":61,"Dic":151,"Dis":100,"Dir":87,"Dio":72,"Din":68,"Die":63,"Div":106,"Dub":388,"Dun":175,"ün":108,"ür":209,"Dvo":99,"Dru":220,"ún":88,"új":321,"úr":86,"Dre":102,"Dra":452,"íš":61,"Dob":200,"ôt":129,"ör":97,"Dou":72,"Dol":339,"Don":353,"Dom":230,"Dor":129,"Ned":79,"Nea":69,"Nem":513,"Nek":87,"Nev":71,"Neu":84,"Jás":105,"Nep":74,"Nas":351,"Nat":199,"Nav":148,"Nic":123,"Niz":179,"Nik":208,"OJ ":106,"New":248,"Nap":119,"Nar":283,"Nam":213,"Nan":68,"Nag":201,"Nah":254,"Naj":294,"Nad":104,"Na ":449,"ći":324,"OV ":124,"ća":95,"će":161,"Či":127,"Če":527,"Ča":315,"Ču":69,"Čr":316,"či":13203,"čj":3069,"čk":2033,"čl":1507,"če":9878,"ča":7103,"čb":207,"č ":3958,"đe":65,"Đu":65,"čn":11436,"čo":315,"čr":1069,"ču":2190,"čv":95,"Nji":74,"Nje":320,"Nov":818,"Nor":440,"Not":61,"Odv":1174,"Ogr":131,"Obč":247,"PL ":89,"Okr":139,"Nyí":60,"Obs":170,"Obi":89,"Obr":75,"Obo":60,"Od ":141,"Île":184,"Oto":151,"Oli":186,"Ont":118,"Ope":134,"Ore":118,"Org":93,"Ost":65,"Osj":214,"Osm":58,"Osn":93,"Po ":423,"š ":428,"Pli":67,"Ple":143,"Pla":466,"Pin":72,"Pik":79,"Pit":66,"Pis":66,"Pir":250,"Pie":165,"Phi":96,"Ped":76,"Per":309,"Pes":215,"Pet":518,"Pen":134,"Pel":66,"šč":6357,"šć":151,"Pat":125,"Pas":151,"Par":669,"Pav":183,"Pau":156,"Pad":61,"Pac":78,"Pan":157,"Pap":82,"Pal":206,"Pak":100,"še":3798,"ša":2845,"šo":760,"šp":1066,"šn":1922,"šk":22840,"šl":713,"ši":4197,"šj":684,"šv":183,"šu":178,"št":5493,"Še":283,"Ša":280,"Šm":149,"Šo":105,"Ši":182,"Šk":346,"Šu":75,"Št":406,"Šv":259,"Šp":321,"Pož":327,"Ptu":112,"Pro":707,"Pri":1370,"Pre":1114,"Prv":307,"Pru":73,"őr":96,"Pra":329,"Pod":622,"Pok":159,"Pol":1336,"Pom":102,"Pon":195,"Pog":199,"Poi":167,"Poj":93,"Pot":652,"Pos":334,"Pov":181,"Pop":91,"Por":385,"Poz":75,"žr":68,"žu":4511,"žn":3757,"žo":181,"žc":62,"žb":1014,"že":6604,"žd":91,"ža":4974,"žk":205,"žj":1185,"žl":145,"žg":130,"ži":6432,"Žu":1147,"Ža":131,"Ži":199,"Že":372,"RS ":268," ال":88,"ž ":1046,"Rac":68,"Rad":357,"Ram":58,"Mün":75,"Ran":79,"Rak":65,"Que":120,"Irs":68,"Ita":679,"Isl":158,"Ist":728,"Ira":123,"Inš":104,"Iva":381,"Izv":187,"Izr":254,"Izd":68,"Jac":161,"Jad":111,"Jav":92,"Jar":108,"Jap":208,"Jan":529,"Jam":191,"Jak":171,"Jel":130,"Jer":198,"Jes":136,"Jez":169,"Jea":145,"Izš":73,"Je ":506,"Jos":354,"Jor":75,"Jon":77,"Joh":406,"Jug":908,"Jud":70,"Jup":160,"Jur":234,"Jul":171,"Jož":199,"LA ":59,"Juž":280,"Kam":278,"Kal":271,"Kap":166,"Kan":372,"Kat":241,"Kas":83,"Kar":544,"Kaz":101,"Kav":68,"Ker":149,"Ken":111,"Kis":102,"Kir":77,"Kit":180,"Kin":85,"Klo":59,"Kli":73,"Kle":103,"Kla":125,"Kon":517,"Kom":582,"Kol":204,"Kos":201,"Kor":867,"Kop":552,"Kov":80,"Kot":149,"Koz":113,"Knj":131,"Kob":74,"Koc":70,"Kre":107,"Kra":1273,"Kri":388,"Krk":87,"Kro":118,"Krš":94,"Koš":77,"Koč":102,"Kul":64,"Kun":76,"Kur":69,"Kva":82,"Lev":100,"Let":1929,"Les":99,"Leo":137,"Len":143,"Lau":76,"Law":92,"Le ":107,"Lag":58,"Lah":83,"Las":91,"Lat":85,"Lar":113,"Lam":67,"Lan":280,"Lab":97,"La ":179,"Lju":1716,"Lib":144,"Lig":77,"Lim":128,"Lin":177,"Lip":125,"Lit":222,"Luk":154,"Lui":81,"Lun":67,"Lud":105,"Luc":114,"Lou":148,"Lov":122,"Los":65,"Lot":115,"MS 
":61,"Loi":123,"Log":104,"Lor":123,"Lon":188,"Lok":174,"Lič":69,"Lež":206,"Meh":106,"Men":114,"Mel":123,"Mes":284,"Mer":173,"Met":220,"Med":836,"Mač":73,"Mez":77,"Man":399,"Mal":462,"Mar":2349,"Mas":191,"Mag":152,"Mad":1011,"Maj":111,"Mak":235,"Mai":69,"Mac":176,"McL":66,"Max":58,"Mau":68,"Mat":387,"Mla":152,"Mod":111,"Moh":80,"Moj":64,"Mol":94,"Mon":601,"Mos":493,"Mor":297,"Mou":64,"Mot":314,"Mih":227,"Mik":175,"Mic":218,"Mit":62,"Mir":154,"Mis":119,"Mil":386,"Min":343,"NK ":115,"Mur":270,"Mus":87,"Moš":81,"çoi":67,"Wor":66,"Wol":82,"Wik":71,"Wil":329,"Win":160,"ère":90,"Wes":77,"War":73,"Wal":129,"Vzh":106,"Vse":94,"Vrb":98,"Vra":69,"Vrh":141,"Vol":145,"Voj":511,"Vod":126,"Viš":166,"Več":124,"Vis":126,"Vit":109,"Vla":188,"Zla":131,"čuj":826,"čun":696,"Vél":58,"čut":108,"črk":423,"čre":113,"črt":288,"éte":58,"črn":193,"ču ":419,"Zna":123,"Zdr":568,"ény":95,"Zap":71,"Zar":126,"Zas":101,"Zav":104,"Zag":314,"Zah":111,"Zak":109,"Zal":123,"ékt":72,"én ":269,"éli":106,"Zgr":67,"Zgo":249,"éra":62,"Zim":226,"Zel":179,"Zem":564,"ов ":64,"Zač":142,"之":72,"三":107,"Zad":119,"Za ":213,"Yor":161,"на ":82,"Szo":98,"Sza":480,"Sys":60,"Sve":2328,"Sup":72,"Sud":142,"Str":416,"Stu":150,"Sti":77,"Sto":225,"Sta":2102,"Ste":244,"Teh":61,"Ten":108,"Tem":109,"Teo":88,"Tel":99,"Tek":83,"Tam":78,"Tan":81,"Tar":117,"Tak":138,"Tal":65,"Ta ":195,"Sko":62,"Skr":66,"Sku":204,"Ska":62,"Sha":82,"Sim":142,"Sil":93,"Sis":205,"Sir":188,"Sin":157,"Sib":59,"Sez":419,"Ses":154,"Ser":176,"Sev":340,"Sen":145,"Sel":241,"Sem":91,"Sei":131,"Sed":75,"Srb":301,"Sre":466,"TV ":116,"Sv ":98,"Spa":211,"Spl":481,"Spi":65,"Spe":164,"Spr":96,"Spo":313,"Sod":63,"Sok":63,"Soc":117,"Sob":124,"Sou":80,"Sov":270,"Sol":119,"Som":85,"Son":199,"Sop":149,"Sor":59,"Sla":201,"TO ":84,"Slo":6226,"Sli":73,"So ":69,"Rož":72,"Rus":577,"Rud":148,"Sai":290,"Sam":204,"Sal":223,"Sad":71,"Sco":63,"Sch":394,"Sav":417,"Sat":60,"Sau":74,"Sar":260,"San":495,"ови":138,"Rač":75,"SI ":76,"Res":82,"Rev":59,"нов":79,"Rim":348,"Rib":92,"Ric":128,"ät ":150,"Ras":66,"Rav":91,"Raz":271,"Rde":102,"SG ":79,"Rec":63,"Red":122,"Rei":91,"Reg":444,"Ren":132,"Rek":120,"Rep":2306,"Rog":117,"Rob":196,"Roc":78,"Rod":85,"Rou":102,"Ros":172,"Ron":234,"Rom":237,"SS ":448,"SO ":89,"Vel":2994,"Ven":157,"ски":72,"Vas":75,"Van":123,"Val":340,"Var":221,"Vid":128,"Vic":103,"Vie":59,"Vir":205,"Vil":211,"Vik":75,"Vin":206,"Ver":276,"Ves":184,"Ukr":102,"Uni":842,"Ura":80,"Ust":217,"Upo":141,"Trž":77,"Ter":179,"The":362,"Tho":193,"Tih":67,"Tim":73,"Tis":131,"Tir":83,"To ":334,"Top":167,"Tor":193,"Tok":64,"Tol":157,"Tom":179,"Ton":82,"Tou":87,"Tru":58,"Trs":127,"Tro":210,"Trn":82,"Tri":282,"Trg":65,"Tre":271,"Tra":244,"Tur":330,"Tuk":62,"ši ":887,"šev":379,"šem":193,"šel":270,"šen":350,"šes":555,"šer":80,"šeg":118,"šek":215,"ša ":1145,"še ":1531,"šar":175,"šav":187,"šah":216,"šaj":99,"šal":222,"šan":614,"Šve":109,"Švi":128,"što":95,"štr":65,"šte":2122,"šti":888,"šta":268,"šuj":72,"štv":1659,"štu":262,"švi":77,"šve":96,"špa":358,"šov":61,"špo":639,"št ":64,"šu ":58,"ško":7029,"šlj":265,"šla":312,"вич":166,"šo ":156,"šić":67,"šni":492,"šnj":513,"šne":308,"šna":327,"šič":190,"šno":281,"šol":499,"šic":246,"šib":61,"šin":843,"šil":171,"šim":108,"šik":69,"ših":534,"šit":122,"šir":718,"šje":233,"šja":121,"šji":274,"ška":2789,"ški":6172,"ške":6766,"́н":79,"cLa":71,"bju":136,"bje":482,"bja":288,"áto":72,"biz":60,"bis":206,"bit":738,"biv":1271,"bio":487,"bir":767,"ász":140,"bil":9355,"bim":90,"bin":672,"bij":500,"bo ":1111,"blj":4814,"blo":131,"ble":384,"bli":5531,"bn 
":78,"bla":906,"bod":537,"bok":127,"bol":2141,"boj":794,"bog":423,"boh":73,"bič":489,"bno":809,"bna":528,"bni":923,"bne":712,"bmo":523,"biš":76,"bon":136,"bom":159,"bor":1964,"áza":65,"bot":328,"bos":204,"bov":492,"bou":88,"áln":82,"be ":2097,"bam":109,"ban":872,"bak":214,"bal":951,"baj":107,"áko":72,"bah":107,"bac":170,"án ":124,"baz":156,"bav":207,"bat":235,"bas":207,"bar":1233,"ánt":85,"bdo":308,"áno":61,"ány":67,"bde":133,"azš":326,"ánd":61,"bda":111,"bi ":1760,"bej":129,"beh":88,"ár ":304,"bec":216,"ber":3398,"ben":1839,"bel":842,"bez":109,"bes":1297,"bet":155,"baú":258,"bho":160,"bia":67,"bib":116,"bic":125,"áro":114,"ári":73,"áci":80,"buš":62,"ca ":7059,"car":308,"cas":90,"cat":115,"can":153,"cam":193,"cal":177,"cah":276,"ce ":3980,"bri":1142,"bro":951,"brn":171,"bra":2116,"bre":1080,"bu ":321,"brs":91,"bru":767,"bsk":752,"bso":165,"bse":534,"bst":433,"boč":145,"bur":549,"bul":151,"bum":224,"buj":619,"bud":138,"buc":60,"bus":181,"bve":153,"by ":66,"bož":173,"aka":2233,"am ":1501,"ake":764,"akc":307,"aki":595,"ajk":170,"ajl":126,"aji":853,"ajo":3508,"ajp":438,"ajm":203,"ajn":1032,"ajs":1806,"ajt":95,"aju":1775,"ajv":1227,"al ":5680,"ajb":861,"aja":4631,"ajd":410,"ajc":106,"aje":1321,"ajh":414,"ail":145,"ain":656,"air":120,"ais":193,"ak ":1483,"ahk":1261,"ahl":62,"ahi":99,"ahu":67,"aht":170,"aho":1733,"aj ":3209,"agy":182,"aha":1141,"agl":148,"agm":59,"agi":325,"agr":2630,"agu":186,"agn":486,"ago":1560,"anu":2488,"anz":168,"ano":4496,"ann":617,"ant":2576,"ans":8949,"ane":3760,"ang":2192,"anh":58,"ani":12168,"anj":11003,"ank":1463,"ap ":82,"ana":5327,"anc":5066,"and":2328,"amu":229,"amm":99,"amo":2039,"amn":424,"amp":456,"ams":594,"amk":68,"ami":1934,"adž":1259,"ame":3673,"amb":620,"ama":1428,"ao ":196,"alv":164,"alu":459,"alt":357,"als":1671,"alp":219,"alo":3988,"aln":5663,"alm":581,"all":563,"alk":590,"alg":235,"ali":12519,"adš":352,"alj":2204,"alc":1638,"ald":278,"ale":2748,"alf":59,"Šam":81,"ala":5384,"alb":313,"an ":7080,"aks":331,"akr":499,"aku":1530,"akt":954,"ako":3028,"akn":93,"akl":239,"aba":528,"abe":1164,"abi":922,"abl":2808,"abn":285,"abo":950,"abr":232,"abs":510,"abu":83,"ae ":446,"aca":87,"ad ":1592,"ac ":892,"ab ":93,"afo":67,"afr":130,"aft":161,"afs":248,"aff":75,"afe":70,"afi":418,"ai ":215,"aga":832,"age":434,"ael":296,"ah ":2832,"afa":85,"ado":1208,"adr":500,"adl":177,"adk":221,"adn":2260,"adm":341,"adg":67,"adj":251,"adi":2931,"add":208,"adc":70,"ade":1675,"ag ":119,"adz":178,"ads":309,"adu":492,"aco":154,"ack":237,"aci":3927,"ach":405,"ace":463,"acc":103,"ada":12281,"adb":350,"af ":280,"act":73,"azn":554,"azm":458,"azp":368,"azo":464,"arš":186,"azi":1682,"azl":1367,"azk":65,"azv":987,"azu":362,"azr":436,"azt":209,"azs":228,"aze":313,"azg":172,"aza":617,"Špa":241,"azb":60,"azd":489,"avč":119,"azz":92,"az ":748,"ayl":61,"aye":106,"Šta":81,"Šte":144,"ba ":1670,"Štu":63,"at ":1858,"arh":471,"arg":219,"are":1974,"ard":2757,"arc":477,"arb":245,"ara":3352,"arp":61,"aro":2967,"arn":2439,"arm":683,"arl":640,"anç":98,"ark":1199,"arj":3449,"ajš":619,"ari":4347,"aru":188,"arv":516,"arr":314,"ars":3622,"art":3803,"au ":230,"asa":521,"ary":261,"akš":197,"asi":1326,"ash":118,"asc":90,"asb":1150,"ase":7736,"aso":654,"asn":756,"asp":358,"ask":223,"asm":118,"asl":1005,"ar ":4861,"apa":403,"Šen":181,"ape":643,"api":1573,"aph":75,"apn":71,"apl":264,"apo":1357,"app":73,"apr":1152,"aps":75,"apt":76,"apu":142,"as ":2234,"avc":158,"avb":92,"ava":3257,"ax 
":66,"aux":63,"aut":447,"avs":1220,"avt":891,"avr":171,"Ško":256,"avo":1996,"avn":8963,"avk":259,"avl":3158,"avi":5083,"anš":152,"avj":154,"avg":550,"ave":2196,"Šma":126,"ay ":236,"awa":58,"avz":193,"avu":60,"arč":77,"awn":81,"anž":60,"av ":1266,"ata":1793,"asu":583,"ast":7399,"ass":309,"anč":577,"atm":280,"atn":621,"atk":2238,"atl":115,"atr":682,"ato":4303,"ate":7456,"alš":59,"ati":8392,"atj":90,"ath":285,"att":232,"ats":603,"atu":979,"aul":252,"aum":70,"aun":59,"aur":182,"aus":178,"aud":100,"auk":85,"ος":138,"ος ":138,"ς ":264,"ν ":77,"Zve":255,"α ":118,"еви":63,"ий ":84,"ич ":167,"až ":103,"jeg":1394,"jej":1047,"jed":501,"jec":86,"jep":94,"jer":828,"jek":653,"jel":447,"jem":3606,"jen":7756,"jez":1544,"jes":258,"jet":1512,"jev":3484,"jač":189,"ji ":9921,"aža":200,"ažd":84,"aže":590,"ažj":94,"aži":70,"ažn":142,"jhe":109,"jhn":247,"jad":100,"jat":493,"jas":144,"jav":1972,"jap":253,"jar":116,"jal":4024,"jak":772,"jan":7670,"jam":569,"jah":261,"jaj":1388,"jaz":69,"jbo":832,"jce":82,"je ":90645,"izš":397,"jci":81,"jde":159,"jda":105,"jna":637,"ješ":262,"jmo":79,"jni":1155,"jne":1049,"jič":83,"jno":1777,"eč ":1193,"jol":65,"jon":462,"jos":83,"jor":101,"jpo":317,"jpr":102,"ск":123,"jiv":675,"jit":167,"jis":201,"jim":790,"jin":1168,"bšk":61,"jik":186,"jil":336,"jaš":1441,"jij":72,"jig":518,"jih":2539,"jic":888,"те":99,"ječ":100,"ст":65,"ул":59,"jn ":65,"jko":128,"jka":118,"jo ":14224,"jma":181,"jlo":62,"itn":496,"itm":98,"itl":62,"itk":153,"itr":810,"ito":1788,"itv":1047,"itu":620,"itt":146,"its":553,"itz":130,"ity":105,"isk":1191,"ism":148,"isl":637,"iso":1260,"isn":596,"isp":122,"iss":283,"inč":105,"isu":398,"ist":8943,"isz":119,"iv ":665,"ita":4085,"itd":78,"ite":3294,"ith":151,"iti":3232,"itj":163,"ivo":647,"ivn":1608,"ivu":289,"inž":250,"ius":336,"ium":162,"iva":2833,"ix ":146,"inš":354,"ivi":1461,"ivj":69,"ivk":187,"ivl":505,"ive":1813,"ipr":269,"ipo":673,"ipp":99,"ipu":59,"ips":60,"ipt":181,"ipi":261,"ipl":628,"is ":1968,"ion":2321,"iop":95,"ior":98,"ios":95,"iot":214,"iog":97,"iok":133,"iol":587,"iom":79,"ipa":1086,"ipe":338,"iov":126,"ir ":1030,"iru":579,"irs":381,"irt":62,"iro":1658,"irn":1433,"irk":3666,"iri":1414,"irj":549,"isi":572,"ish":146,"ise":765,"isc":429,"isa":2843,"iu ":69,"iqu":102,"ilč":111,"ire":909,"irg":67,"ira":3405,"irc":132,"it ":737,"ünc":71,"ivč":108,"ür ":68,"ivš":143,"ja ":24669,"iz ":4501,"izu":488,"izv":2556,"izr":1033,"izs":209,"izp":299,"izo":951,"izn":244,"izm":1841,"izl":440,"izk":366,"izj":155,"irš":113,"izi":2488,"izh":387,"izg":268,"ize":500,"izd":905,"izb":341,"iza":2030,"kaš":110,"kih":6752,"kij":103,"kim":1029,"kil":157,"kie":91,"kiv":97,"kin":454,"kip":232,"kir":103,"kis":399,"kit":420,"kaž":123,"kje":723,"km ":698,"ki ":46414,"ked":200,"keg":3700,"kej":985,"kem":8923,"kel":218,"ken":160,"kes":145,"ker":562,"ket":264,"kev":340,"key":87,"kač":293,"ke ":15947,"kci":1307,"kda":435,"kra":4811,"krb":350,"kre":318,"kt ":433,"ksa":263,"kse":141,"ku ":1833,"kro":1818,"krv":182,"kri":2242,"koz":528,"kov":6012,"km²":98,"kot":4172,"kos":954,"kor":1260,"kop":980,"koo":149,"kon":3042,"kom":2571,"kol":3036,"kok":1914,"koj":101,"koh":100,"kog":105,"kof":1624,"kod":500,"ks ":201,"kmu":212,"kme":268,"kmo":345,"koc":91,"kob":135,"kne":116,"kni":115,"knj":1121,"klu":447,"ko ":24162,"kma":99,"kle":646,"kla":1462,"klo":710,"kli":1093,"klj":712,"jvo":146,"jut":62,"jus":126,"jul":555,"jun":698,"jur":257,"jve":859,"jvi":346,"joč":433,"jub":2164,"juj":406,"jug":1198,"jud":1091,"jsk":8627,"jst":1435,"ečj":1199,"ečk":404,"eči":887,"ečn":355,"ečo":63,"eču":68,"ju 
":6445,"jse":265,"jiš":131,"eča":390,"eče":1036,"již":650,"kaz":528,"kav":355,"kat":5637,"für":83,"kar":3117,"kas":410,"kap":270,"kan":1808,"kal":3239,"kam":632,"kaj":559,"kak":434,"kah":366,"kai":81,"kad":433,"kac":404,"juž":772,"ka ":23634,"juč":601,"jze":62," Ga":900," Ge":881," I ":207," Fo":2042," Fu":219," Fr":2139," Fi":677," Fl":224," Ha":1561," He":1083," Cô":91," Gy":129," J ":84," Go":1655," Gr":1898," Gu":360," Gv":83," Gi":550," Gl":510," Ig":179," Id":108," Ib":77," K ":116," Hu":329," Hr":4060," Ho":699,"ha ":480," Hi":543," Ji":67," Je":1541," L ":110," Ja":1800," Iz":882," Iv":415," Ir":276," Is":1032," It":703," Im":512," In":1618," Il":228,"ham":322,"han":759," M ":155,"hai":79," Ka":2499,"haj":1477,"hal":392," Ke":540,"hau":83," Ki":619,"har":950,"has":60,"hat":80," Jo":1288," Ju":1997,"haf":95,"hae":137,"hab":68,"had":64," N ":83," La":1551," Le":3180," Li":1291," Lj":1723," Kl":443," Kn":255," Ko":3810," Kr":2271," Kv":109," Ku":508," Mc":133," Ma":6006," O ":269," Ml":210," Mi":1860," Dž":96," Me":2209,"he ":889," Lo":1511," Ly":76," Lu":757," Já":138," Ne":1590,"а ":247," P ":279," Na":2970," Nj":398," Ni":827," Mr":101," Mo":2454," Mu":575,"hek":94,"hel":368,"hei":125,"heb":110," A ":314,"het":73,"hes":152,"her":719,"heo":145,"hen":330,"hem":187,"hi ":171," B ":212," C ":442," Ap":398," Am":984," An":1817," Ak":264," Al":2235," Ai":256," Aj":81," Ag":178," Ah":85," Af":338," Ac":171," Ad":366," Ab":561," Ba":1832," D ":134," Az":412," Av":1041," Au":488," At":316," As":363," Ar":1291," Be":2192,"hie":110,"hid":239,"hic":86," Bi":1002,"hia":98,"hip":210,"hio":70," Bj":272,"hin":407,"him":147," Bl":437," Bo":1900,"hil":201,"hik":78,"hij":109," Br":2242," Bu":794,"his":178,"hit":605,"hir":188," E ":92," Ca":1521," Ce":1243," Ci":464," Ch":1220," Cl":266," Cr":346," Co":1629," Cu":189," Cv":64," Cy":61," F ":123," Da":1039," Di":915," De":1297," Dr":1054,"hkr":181," Do":1606,"hko":1232," Du":855," Dv":142," Ea":63,"hn ":214," Ed":330," G ":65,"hla":193," El":398," Ek":172," Ei":122," Eg":129," Et":115," Es":219," Er":320," Ep":63," En":408," Em":164," Eu":245," Ev":1000," Fe":726,"ho ":80,"hma":181," Fa":830," H ":159,"gma":95,"go ":1697,"glo":304," Z ":182,"gle":2479,"gli":810,"glj":228,"gla":2978," Wo":190," Wi":722," Wh":62," We":269," Wa":403,"й ":140," Vz":150,"gog":353," Zr":80," Zu":99,"god":1736," Zv":306,"gob":81," Vé":82," Zm":60," Zl":152," Zo":102," Zn":141," Zd":593," Ze":880," Zg":337,"gič":79,"gno":136," Zi":327,"gni":103,"gnj":147," Za":1745,"gne":563,"gna":300," Yo":229,"gs ":62,"о ":66,"goz":506,"goj":282,"н ":123,"gom":421,"gol":311,"gon":470,"gos":3146,"gor":2864,"got":348,"gov":2780,"gu ":806," a ":458,"р ":62,"gro":564,"grm":63,"gru":125,"bču":59,"grs":221,"gra":6878,"grb":61,"bči":4969,"gri":272,"gre":942," R ":122,"в ":115," Oz":78," Os":737,"gto":98," Ot":263," Or":643,"goč":401," Op":320," Po":5507," Lé":59," Pl":785," Pi":1026,"gul":122," Ph":190,"gua":77," Pe":1565,"gub":149," Pa":2188,"gue":188," Ny":85," Nu":95," No":1550," Ol":321," Ok":257," On":182," Om":107," Oh":58," Og":200," Od":1566," Oc":80," Ob":907," Ra":1525," Mü":85," T ":114," Qu":210,"új ":254,"goš":89," Ro":1658," Re":3734," Rd":107," Ri":916," Rh":128," S ":293," Pr":3986,"gur":145,"gus":676," Pt":177," Pu":321,"gun":145," Má":81," Sz":701," Sy":146," Sw":66," Sv":2526," Su":652," St":3290," Ta":1068," V ":1176,"gya":99," Th":689," Ti":702," Te":1006," Tr":1582,"gyk":115," To":1461," Ru":970," Sa":2415," U ":88,"е ":69," Sh":246," Si":1155," Sc":584," Se":2090," So":1610," 
Sp":1368," Sr":838," Sk":482," Sl":6566," Sm":233," Sn":82," Va":911,"и ":110," X ":144," Ve":3898," Vi":1616," Vl":200," Vo":997," Vu":138," Vr":488," Vs":192," Tu":669," Ty":77,"grš":1077," Uk":147," Ul":103," Um":80," Un":907," Up":188," Ur":251," Us":272," ja":3029," l ":65,"iam":240,"ial":1334," iz":12947,"ian":961," ji":1172,"ias":76,"iar":121," je":57692,"iat":484," io":104," ip":64," im":7503," in":30104," il":218,"ic ":1048,"iac":130," is":755," it":1465,"iag":159," ir":117,"ibl":600," fü":65,"ibi":274," ka":7604,"ibo":889," m ":545,"ibn":134,"ibr":170," kj":700," ki":23502,"ibu":174," ke":816," jo":1895,"id ":690,"iba":621,"ibe":314," ju":3473," ha":275," he":621," gi":540," gl":3153," gr":2915," go":2718,"ia ":1100," gu":114," k ":178," ib":60," id":224," ig":1379,"ib ":92," hi":1051," hk":181," hl":97," ho":1169," hr":955," hu":193,"iet":196,"ieu":64," nj":2131," ni":1612,"iel":266," ne":7311,"ien":545," na":42601,"ier":585,"ies":172,"ied":168,"ieg":71,"и́":65," mu":575," mr":220,"ig ":209," mo":4823," mn":787," mm":80," ok":4185," ol":657,"ifu":62," om":674," on":135," og":566," oh":232,"ifo":187," oc":331," od":6669," of":580," ob":12756,"ifr":67,"ife":161,"ifi":406,"ih ":22740," nu":342," no":2512," np":247,"ifa":63," le":11324," lj":1128,"icr":71,"ics":75,"ict":217," li":1970,"icu":137,"icn":80," n ":309,"ico":974,"ick":188," la":3283," kv":554," ku":716,"ici":3252,"ich":665,"ice":3027," kn":1159,"ie ":718," km":1016,"ica":6322," kl":1067," kr":5141," ko":11511," me":12878," dž":64,"idu":106," mi":1837,"ids":75," ml":614,"я ":69,"idr":346," o ":887,"ido":732," ma":6202,"idn":485," lu":270,"idi":434,"idg":71,"ide":1356,"ida":711," lo":1132," af":94," ag":154,"aša":593," ab":327," ac":59," ad":284,"aše":251," am":1331,"ašk":6766," an":2354,"aši":200," ap":760,"ašn":368,"iin":107," ak":715,"iim":2227," al":6858," av":1858," au":60," ar":1254," at":457,"ašt":146," as":1847," d ":320," ba":1756," az":58,"il ":5175,"ija":12085," bi":10040,"ije":12707," be":1503,"iji":5965," bo":2432," bl":915,"ijo":6465,"ijs":4363," bu":186,"iju":165," br":1850," ca":173," e ":85,"im ":3856,"ika":6461,"ige":666,"iga":1104,"aš ":78,"igl":183,"igh":214,"igi":601,"igu":193,"igr":1537,"igo":249,"ign":280,"ij ":2784,"т ":70," b ":94,"ihe":100,"iha":530,"ihi":87,"iho":729,"ik ":9136,"у ":103," c ":70," er":65,"imo":1733,"imn":223," et":360," es":185," en":4762,"ims":2333," em":109," ep":251,"imp":926,"imf":91," el":1413,"ime":9448," ek":723,"imk":790,"imi":2451,"ip ":337," fe":1007,"inc":748,"ind":925,"ina":10483," fa":1370,"imu":325,"а́":109," ev":586," fu":480,"inn":113," fr":3146,"ino":5575,"ašč":462," fo":718,"int":1271,"ins":4445,"inf":316," fl":126,"ine":5190,"inh":91,"ing":1310,"inj":1193," fi":2355,"ini":4151,"ink":465," ge":1321," ga":4003,"iod":295,"inu":433," i ":115,"inv":62,"inz":59," cl":64,"iko":4804," cm":100,"ikl":979," co":254,"iki":3287," ce":2735," ch":72,"ike":4355," ci":552,"ila":7057,"ilb":66," f ":61,"in ":31978," da":5215,"ikv":58," cv":150,"ikt":162,"iku":799,"ikr":245,"iks":94," do":7621," dn":241,"ilo":4202,"ill":1121,"ilk":457," dr":7206,"iln":1875,"ilm":587,"ilh":107," de":11720,"ilj":766,"ili":2960,"ild":76,"ilc":146," di":5516,"ile":1135,"ima":2334,"imb":293,"ч ":172," ed":1001,"io ":829," dv":2054," du":1905,"ils":366,"ilt":85,"ilu":372,"ilv":125,"ль":58," vč":201," zm":655," zl":487,"ла":90," zo":151," zn":3542," zu":280,"ле":83," uč":541," zr":511,"ли":66," zv":1609,"hok":781,"hol":497,"hom":250,"hon":167," za":14763,"ко":128,"hos":66," zd":1135,"hot":761," ze":976," 
zb":804,"hov":2393,"ку":58,"hop":61," zi":186,"hor":342," zg":1780,"ка":91,"ки":115,"hod":3942,"hni":523,"hno":289,"hnu":58,"hna":106,"hne":92," z ":4835,"ин":98,"ик":81,"ий":90," ož":121,"ич":191,"ри":87,"ро":86,"ра":120,"ре":58,"htt":71,"hto":63,"htn":72,"hte":153," už":81,"ор":104,"ол":82,"ов":288,"hu ":190,"hrv":430,"но":113,"hro":110,"hre":73,"ни":87,"hri":399,"ht ":188,"на":128,"hra":653,"hiš":212," ru":1131," u ":188," sa":1902," se":18649," sc":148," si":3326," sh":137," sn":739," sm":996," sl":6094," sk":4932," sr":1993," sp":10659," so":10507,"ви":220," t ":95," ra":7736," re":7014," rd":178," ri":2298," ro":2318," pt":209," pu":582," pr":31964," ps":336," s ":4586," px":59,"hy ":96,"ва":103,"ад":75," os":4746," ot":1470,"hum":246," ov":185,"hun":84,"hus":107," op":2076," or":2176,"hur":142,"ан":143,"ак":68," oz":3235," pe":3830," pa":8196,"ар":96," pl":2669," po":40217," pi":2589," y ":101," vz":1551," x ":129," va":2039," ve":6450," uv":1301," vn":129," vo":6456," vp":464," vr":2767," vs":3256," vi":2299," vk":283," vl":673," vm":85," ud":201,"ет":79,"ер":77,"ен":63," tv":236," tu":5865," us":2479," ut":152," ur":2875," up":8796," um":682," un":617," uk":539," ul":238," ug":179," ta":4305," v ":29906," st":7117," sv":3870,"о́":61," su":765,"ев":143," oč":210," tr":3796," tl":193," to":2461," th":371," ti":1200," tk":120," te":8405,"fi ":65,"ffe":78,"ffi":69,"fes":342,"fer":516,"fed":154,"feb":634,"fen":145,"fek":403,"fel":89," Ča":315," Či":126," Če":525,"faz":112,"fat":64,"far":143,"fan":669,"fak":920,"fal":122,"ff ":62,"fe ":105," Đu":65,"fa ":189,"aúj":298," ču":115," čr":759,"eyr":71," Ču":69,"exa":83,"ez ":873," Čr":316," če":1273," čl":1304," či":437," ča":1729,"ezu":507,"eza":1493,"ezd":639,"ezn":2303,"ezo":3022,"eví":88,"euč":161,"eze":1111,"erš":61,"ezi":1770,"eta":9837,"ete":2260,"etd":101,"etj":1541,"eti":2887,"elš":67,"eth":104,"etn":3058,"etl":581,"etk":435,"esp":164,"esn":1865,"eso":719,"est":9497,"esu":218,"enč":254,"esr":69,"ess":341,"ev ":4515,"emš":2022,"eto":5019,"etr":1803,"ets":629,"ett":335,"etu":861,"etv":352,"ew ":247,"eve":3581,"evd":110,"evc":189,"eva":3659,"evo":978,"evn":851,"evl":113,"evk":344,"evj":120,"enš":165,"evi":3431,"euv":65,"eut":92,"eur":130,"eus":174,"ex ":78,"evu":106,"evr":568,"evs":223,"evt":159,"ey ":565,"evz":91,"epe":218,"epi":485,"eph":184,"er ":9909,"epa":2146,"eos":79,"eor":822,"eom":284,"eol":485,"eop":91,"eon":196,"es ":3592,"ept":897,"epu":2717,"epl":115,"epn":163,"elé":65,"epp":110,"epo":984,"epr":1096,"erk":1130,"erl":288,"ejš":1389,"eri":6828,"erj":1569,"erg":1075,"erh":74,"ere":2880,"erf":64,"erc":387,"erd":237,"era":3623,"erb":290,"et ":2922,"esj":60,"esk":362,"esl":322,"esm":246,"esi":510,"esc":198,"ese":3032,"eu ":71,"esa":1428,"erz":1265,"ery":65,"erv":377,"eru":570,"emč":403,"err":420,"ert":841,"ers":1249,"ern":2752,"erm":750,"erp":166,"ero":2923,"eki":619,"ekl":439,"ekm":687,"eko":1561,"ekr":138,"eks":912,"ekt":2647,"eku":579,"ekv":164,"en ":7706,"elb":111,"ela":2552,"eld":165,"elc":327,"elf":63,"ele":5343,"eli":7355,"elj":12080,"elg":274,"ehé":68,"elm":113,"eln":478,"elk":390,"ell":946,"elo":6099,"elu":1481,"els":386,"elt":135,"eo ":194,"emb":2938,"ema":2409,"edž":154,"eme":2514,"emd":94,"eml":1047,"emn":341,"emo":1461,"emi":2775,"emu":847,"emp":655,"ems":675,"ep 
":100,"ene":4626,"enh":70,"eng":267,"enb":171,"ena":6583,"end":853,"enc":1426,"eno":7193,"enn":350,"enk":486,"enl":165,"eni":11655,"enj":3527,"enu":1052,"ens":9125,"ent":4611,"enr":131,"enz":343,"eog":377,"eod":241,"eob":101,"egl":515,"ego":1324,"egn":116,"ege":392,"egi":3464,"ej ":1349,"eha":369,"egr":207,"egu":231,"egy":59,"ehn":551,"ehr":149,"eho":984,"ehi":163,"ek ":3893,"eic":68,"eis":179,"eir":92,"eim":143,"eil":148,"ein":591,"eid":153,"eja":1145,"el ":3464,"eiz":90,"eit":89,"ejs":725,"ejo":1467,"ejn":247,"ebš":60,"eji":700,"eje":1829,"ekd":397,"eke":552,"ekc":78,"eka":3433,"em ":19835,"eju":371,"gl ":122,"git":143,"gis":130,"gir":72,"gim":692,"gij":4610,"gik":62,"gip":136,"gin":441,"gio":182,"gie":59,"gib":460,"gih":484,"gia":72,"ght":197,"gha":74,"ggi":66,"gač":142,"gi ":1057,"gen":1655,"geo":621,"get":148,"ger":535,"ges":132,"gh ":98,"geb":124,"geg":139,"gem":133,"gel":572,"gej":96,"gda":62,"ge ":1547,"gac":68,"gad":460,"gah":82,"gas":134,"gar":537,"gat":408,"gaj":272,"gam":198,"gal":701,"gan":2184,"ga ":19235,"fur":60,"fte":66,"fun":414,"ft ":152,"ačb":149,"ača":451,"fra":3096,"fre":206,"ače":1435,"ačj":77,"ačk":163,"fri":439,"ači":1402,"fsk":383,"fro":120,"ačn":463,"ačr":174,"aču":1108,"fov":120,"for":1149,"fos":82,"fot":167,"fon":272,"fol":119,"ač ":265,"fič":180,"fla":71,"fic":219,"fie":63,"fig":93,"fij":889,"fil":1101,"fik":288,"fin":542,"fit":105,"fiz":1064,"fja":62,"db ":97,"da ":14922,"dbe":281,"dba":322,"dbi":164,"dbo":232,"de ":3505,"dac":99,"dal":2088,"daj":1290,"dag":370,"dah":122,"dae":198,"dat":2652,"dar":1727,"dan":3799,"dam":251,"dav":343,"dda":258,"dde":307,"dce":69,"cul":97,"cto":99,"cti":246," Îl":184,"cy ":79,"cve":165,"cus":130,"cur":63,"cks":59,"cko":94,"cla":60,"cle":108,"co ":1155,"con":195,"col":193,"com":102,"cor":143,"cos":2856,"cot":60,"cou":95,"cs ":326,"cqu":61,"cro":134,"cu ":453,"cci":147,"cca":61,"cea":385,"ch ":526,"cev":1776,"cer":935,"ces":1318,"cet":77,"cen":988,"cep":196,"cej":81,"cem":898,"cel":1340,"ceg":119,"ced":151,"ci ":2670,"cha":428,"chw":61,"chu":146,"cia":974,"ck ":555,"cie":148,"cid":94,"che":837,"chl":72,"chi":339,"cho":135,"chm":179,"chn":116,"cht":110,"civ":109,"cij":6609,"cik":850,"cil":290,"cim":128,"cif":150,"cih":145,"cir":298,"cis":476,"cit":587,"cin":475,"cio":727,"cip":225,"cm ":87,"cke":249,"cka":102,"ed ":7632,"eba":410,"ebe":1030,"ebi":1286,"ebl":102,"ebn":705,"ebo":328,"ebr":1109,"ebu":659,"ec ":3986,"eac":70,"eag":60,"eae":122,"ead":77,"eak":245,"ean":473,"eal":308,"ear":308,"eas":76,"eap":68,"eat":181,"eau":166,"eb ":1452,"ea ":232,"efi":249,"efo":206,"efa":145,"efe":483,"ei ":95,"ega":14282,"een":162,"eh ":1490,"eer":68,"eev":58,"edk":372,"edl":222,"edm":691,"edn":4723,"edh":90,"edi":2962,"edj":165,"ede":4374,"ône":77,"eda":2590,"edb":168,"eg ":627,"edt":128,"eds":1765,"edv":775,"edu":552,"edp":144,"edo":1120,"edr":537,"eck":137,"ech":121,"eci":430,"ece":981,"eca":75,"ee ":129,"ef ":166,"ecu":64,"ect":136,"eco":75,"dož":69,"dvs":488,"dwa":102,"dy ":102,"dvi":1792,"dve":1124,"dvo":628,"dur":65,"dus":252,"dva":885,"duš":167,"drž":2000,"dzo":143,"dzi":74,"dze":62,"dor":425,"dop":136,"don":666,"dom":1025,"dol":2617,"dok":396,"doz":167,"dow":104,"dov":2766,"dot":187,"dos":834,"dr ":66,"dpi":182,"dpr":535,"dpo":228,"ds ":150,"diš":1126,"dmi":400,"dmo":356,"dna":1989,"dne":2475,"dni":3893,"dež":1743,"dnj":2661,"dno":3481,"dič":320,"dob":2688,"doc":83,"dod":289,"dog":505,"dst":1636,"dso":71,"dte":147,"dun":61,"duj":151,"dul":104,"duk":196,"duh":1475,"duc":137,"dri":615,"dra":2046,"dt ":61,"dre":1648,"du 
":2258,"dro":1558,"drs":225,"dru":5537,"dsk":1257,"dse":412,"dge":82,"dgo":243,"dic":712,"did":62,"dia":709,"dho":97,"ôte":98,"der":1069,"des":1368,"det":180,"dev":559,"dez":147,"deb":135,"dea":152,"ded":131,"dec":783,"def":191,"dej":655,"del":8200,"dek":748,"den":2287,"dem":894,"dep":1917,"deo":172,"di ":7874,"dle":116,"dla":418,"dko":292,"dkr":290,"dki":137,"dme":496,"dma":184,"do ":2879,"dlo":208,"dlj":134,"dli":196,"dja":348,"dje":967,"div":552,"diu":58,"diz":86,"dim":344,"din":3211,"dio":320,"dip":467,"dir":3392,"dis":608,"dit":557,"die":123,"dif":117,"dig":180,"dih":219,"dij":1635,"dik":285,"dil":1003,"dka":143,"dke":226,"dju":78,"deč":365,"rgy":70,"rgu":403,"rhe":114,"rha":142,"rhi":372,"rhu":89,"rhn":68,"rho":225,"iža":226,"rga":1747,"ri ":6205,"rgl":58,"iži":369,"rgi":606,"ižj":188,"iže":597,"rge":583,"rgo":351,"ižn":1012,"rgn":87,"ret":1927,"res":1802,"rev":1974,"reu":328,"rez":1267,"rh ":216,"rfi":77,"rfo":61,"rač":1077,"rdu":102,"rds":107,"rg ":550,"iž ":81,"reb":2051,"rea":620,"ree":132,"ref":492,"rec":915,"red":9699,"rei":333,"rej":1975,"reg":4478,"reh":767,"rem":2494,"ren":2516,"rek":2213,"rel":1062,"rer":139,"reo":180,"rep":1325,"rda":399,"rcu":159,"rdo":364,"rdn":359,"rdi":2193,"rde":564,"re ":2956,"rbu":279,"rbs":360,"rci":266,"rch":173,"rce":392,"rca":244,"raz":7036,"rd ":765,"rap":609,"rar":714,"ras":1404,"rat":4465,"rau":89,"rav":10605,"rbi":646,"rbo":438,"rba":318,"rbe":206,"rc ":65,"raj":3231,"rai":122,"rah":639,"rag":448,"ran":10813,"ram":1883,"ral":3846,"rak":1045,"rab":4754,"raf":1035,"rae":188,"rad":7153,"rac":1106,"rpu":173,"rpo":96,"rs ":406,"rpe":109,"rpa":103,"ror":143,"ros":2112,"rot":1454,"rom":2391,"ron":2361,"roo":117,"rop":1736,"roz":633,"rou":216,"rov":3453,"row":90,"rob":744,"roa":69,"rod":3371,"roc":719,"roj":1187,"roi":1061,"rol":591,"rok":1027,"rof":772,"roe":143,"rog":1831,"rno":2826,"jšč":126,"rič":1125,"rns":80,"rnu":76,"rna":2170,"rež":741,"rne":1804,"rnj":672,"rni":3001,"rmo":552,"rmu":1839,"ro ":1744,"rma":1564,"rme":357,"rmi":362,"reš":389,"rlo":204,"rlj":61,"rli":272,"rle":264,"rla":328,"rn ":268,"rkv":475,"rku":166,"rkt":89,"rks":59,"rkn":66,"nço":87,"rko":671,"rki":446,"rke":698,"rka":3906,"rm ":91,"reč":850,"rju":560,"rji":433,"rja":3706,"raž":658,"rje":3010,"riz":1024,"rl ":197,"rip":1323,"jšo":114,"rio":714,"rir":527,"rit":2886,"ris":2096,"riv":1041,"riu":103,"rih":1145,"rig":966,"rij":4080,"raš":557,"jši":1298,"rii":2351,"ril":2003,"rik":1886,"jšn":99,"rin":1694,"rim":3965,"jša":896,"ria":1305,"rib":1637,"ric":1600,"rid":1029,"rie":837,"jše":763,"rif":96,"rk ":603,"roš":477,"rož":1407,"ruj":117,"ruh":77,"rug":3066,"rud":146,"ruc":91,"rup":276,"run":235,"rum":444,"rul":77,"ruk":405,"ruz":173,"rus":1264,"rut":93,"rva":5491,"rvi":1217,"rve":3036,"rvo":519,"rvn":190,"ry ":643,"rsk":7463,"rsi":153,"rso":356,"rsa":151,"rse":525,"rta":677,"rst":3047,"rtm":1893,"rtn":868,"rto":742,"rte":567,"rth":222,"rti":1649,"rub":82,"rua":643,"rts":92,"roč":1581,"rtu":402,"rtv":101,"riš":1946,"rt ":931,"rro":107,"mči":401,"rri":145,"rre":246,"riž":633,"rra":237,"ru ":1442,"rry":141,"sab":104,"sac":80,"sad":165,"saj":415,"sak":747,"sal":910,"sam":1606,"sba":85,"sbe":912,"sbi":124,"san":1213,"sat":1223,"sas":75,"sar":861,"sav":254,"sa ":1996,"ruž":2445,"ón ":126,"ruš":505,"rze":647,"rza":241,"ryj":60,"rzo":76,"rzi":1047,"sha":61,"sho":59,"shr":72,"she":68,"shi":263,"si ":1301,"sje":270,"siv":155,"seč":117,"sid":220,"sic":368,"sia":89,"sk 
":194,"sit":181,"sir":202,"sis":1373,"sip":188,"sin":800,"kšn":168,"sio":244,"sil":1296,"sim":507,"sij":849,"sik":175,"sih":502,"saš":156,"sif":95,"sig":135,"sbo":112,"sbu":112,"se ":9988,"sca":143,"sce":180,"sci":213,"sch":525,"sco":197,"sev":1928,"ser":946,"ses":2057,"set":1089,"sez":2688,"sh ":159,"sfe":159,"sfo":66,"sei":64,"seh":402,"seg":907,"sed":3642,"sec":285,"seb":3164,"sep":883,"sen":803,"sem":2072,"sel":7746,"sek":379,"sej":258,"spu":64,"spo":2044,"spr":1492,"spe":1038,"spl":748,"spi":220,"spa":7101,"sot":293,"sou":74,"sov":1364,"sol":487,"som":341,"son":987,"sop":193,"sor":556,"sos":106,"sod":1527,"sof":102,"sok":589,"soj":102,"soc":318,"sob":183,"su ":1066,"nčn":750,"nči":273,"nče":232,"sre":2496,"srb":412,"nča":248,"st ":5603,"ss ":196,"sli":984,"slo":6036,"slu":603,"sla":2779,"sle":851,"ski":17344,"skl":1236,"sko":15385,"skr":668,"sku":2764,"skv":191,"ska":8402,"ske":14199,"sič":233,"sno":1962,"sna":435,"sni":1800,"snj":102,"sež":377,"sne":1079,"smo":133,"smr":229,"smu":290,"so ":8342,"sma":396,"smi":413,"sme":609,"sz ":97,"sza":136,"sze":92,"szt":71,"sse":338,"ssa":253,"sso":262,"ssi":329,"ste":4516,"sta":14712,"std":76,"stm":122,"stn":2769,"sto":9875,"sti":10401,"stj":760,"stk":173,"stl":589,"stv":6827,"stu":585,"soč":125,"str":7508,"sub":140,"suh":69,"sul":130,"sum":64,"suj":230,"sup":155,"sun":61,"sur":243,"sve":2955,"svi":91,"svo":1687,"tai":89,"taj":1653,"tak":1296,"tal":5739,"taf":62,"tag":120,"tah":112,"tab":357,"tac":360,"tad":115,"td ":92,"taz":72,"tav":4146,"tat":2093,"tas":324,"tar":2532,"tap":65,"tan":6147,"tam":415,"tch":88,"te ":3456,"tde":172,"ta ":16983,"ovš":201," št":2696," šv":175," ši":477,"pa ":6188," šk":1366," šo":471," šp":977," ša":246," še":1284," Šv":259," Šu":75," Št":405," Šp":321," Šo":105," Šm":149," Šk":346," Ši":182," Še":283," Ša":280,"ovč":87," šč":159,"pci":101,"pe ":794,"par":3565,"pat":325,"pas":565,"pav":120,"paz":284,"pac":108,"pad":8267,"pah":117,"pak":187,"pal":492,"paj":410,"pap":279,"pan":5531,"phe":97,"pha":91,"pho":59,"phi":118,"pi ":445,"ph ":153,"pev":467,"pač":83,"pea":111,"pec":334,"ped":485,"pen":956,"pep":58,"per":2309,"pet":1059,"lás":63,"pes":1184,"peh":707,"pel":589,"pek":319,"pla":1692,"plj":349,"pli":1265,"ple":1082,"plo":1730,"piz":105,"peč":83,"phy":92,"pia":104,"pid":74,"pic":81,"pih":119,"pij":931,"pik":108,"pil":764,"pin":2417,"pio":83,"pir":404,"pis":3588,"pit":455,"poz":877,"pr ":382,"por":6507,"pop":763,"pov":2801,"pou":108,"pot":2623,"pos":4743,"poi":280,"poj":1082,"pog":1928,"pom":2891,"pon":1163,"pok":980,"pol":5783,"pob":187,"poe":84,"pod":14532,"ps ":70,"plé":235,"ppo":61,"ppe":190,"lén":278,"peš":381,"po ":5444,"pič":147,"pno":567,"pnj":141,"pež":186,"pni":1125,"pne":206,"pna":93,"pse":98,"psi":273,"psk":942,"pso":79,"ptu":77,"pub":2962,"pte":795,"pti":568,"pto":218,"poč":106,"pra":8108,"pt ":65,"piš":190,"prv":4499,"prs":213,"prt":260,"pru":88,"pu ":357,"pri":11604,"pre":12368,"pro":5665,"poš":266,"pož":71,"pur":71,"pus":357,"put":68,"pun":78,"pul":238,"px ":59,"puš":308,"már":283,"iš ":113,"iše":281,"iša":220,"išl":273,"išn":372,"iši":543,"išk":6388,"išj":568,"išt":1077," Ži":199," Že":372," Ža":131," Žu":1147,"išć":99,"išč":2416," ži":1623," žl":73," ža":202," že":1263," žu":4184,"mén":60,"lčn":96,"qua":66,"que":223,"qui":88,"ra ":5234,"rb ":124,"ežn":508,"ngo":291,"ežj":186,"ngi":146,"eži":1395,"ngl":1754,"ežk":163,"ngu":231,"ngr":157,"ngt":104,"ngs":96,"ni ":21439,"eže":1628,"nge":731,"ngh":66,"nga":200,"eža":393,"nha":154,"nj 
":974,"nhe":59,"neh":90,"neg":6715,"nej":1610,"nei":60,"nel":292,"nek":1786,"nen":595,"nem":5832,"nep":765,"neo":239,"ner":2095,"net":1642,"nes":1470,"nev":638,"neu":164,"ndv":192,"ež ":616,"ng ":946,"nea":167,"neb":386,"nec":499,"ned":247,"nfo":286,"nfr":73,"nač":1875,"ney":179,"nez":634,"nfa":314,"nfe":121,"nco":3080,"nci":1963,"nck":74,"nce":1597,"nch":251,"nca":769,"ne ":16673,"nbu":104,"ndu":365,"ndr":735,"nds":176,"ndo":673,"ndi":1187,"nde":738,"nda":1053,"ncu":165,"nak":2105,"nal":3824,"nam":2274,"nan":3844,"nap":1555,"nar":4271,"nac":538,"nad":2224,"nag":2320,"nah":1010,"nai":103,"naj":4748,"nc ":424,"nab":409,"nbe":101,"nd ":921,"nav":1486,"nau":169,"nat":1633,"nas":9591,"naz":549,"na ":45810,"muč":308,"mož":442,"nyi":95,"nz ":124,"nož":611,"ny ":285,"nve":127,"nuk":69,"num":110,"nun":197,"nuj":664,"nus":215,"nut":274,"nua":1914,"nud":60,"ntv":66,"nto":1165,"ntn":394,"ntu":151,"nts":282,"noč":122,"ntr":823,"nti":1732,"nth":129,"ntj":84,"nta":1740,"nte":2274,"nsp":149,"nso":136,"nst":4879,"nsf":59,"nse":185,"nsi":206,"nsk":18646,"nsc":84,"nsa":368,"nu ":1386,"iču":99,"ičn":8484,"ičk":636,"njš":744,"iči":660,"nri":146,"niž":206,"iče":649,"iča":1089,"nt ":1816,"niš":3134,"npr":251,"ns ":498,"noc":81,"nod":149,"noa":98,"nob":153,"nog":856,"nof":95,"nok":237,"nol":557,"noi":86,"noj":94,"noo":307,"nop":217,"nom":2492,"non":367,"not":1702,"nos":5345,"nor":656,"nov":7837,"noz":157,"ič ":1910,"nne":487,"než":185,"nna":175,"nić":78,"nno":127,"nni":311,"nič":1334,"nma":70,"neš":91,"ići":278,"nlj":152,"nn ":230,"no ":28820,"nke":357,"nki":201,"nm ":145,"nkc":434,"nka":820,"nku":97,"nko":712,"nkt":84,"nkr":119,"nji":3942,"njk":62,"nje":10740,"nja":5163,"ić ":428,"nju":1491,"neč":59,"njs":1007,"njo":572,"nij":12636,"naš":856,"nih":7675,"nig":117,"nif":78,"nie":143,"nid":118,"nic":4360,"nia":191,"nk ":305,"niz":1781,"niv":1064,"nis":1438,"nit":923,"nir":658,"nio":254,"nim":2568,"nin":1324,"nik":9988,"nil":489,"obč":4860,"ogr":2005,"ogu":149,"ogi":1571,"ogl":820,"ogo":3723,"ogn":253,"oga":915,"oge":466,"ohr":268,"ohl":58,"ohi":154,"oho":271,"ohn":289,"oha":244,"ohe":84,"oj ":853,"ois":217,"oir":158,"oit":129,"oin":114,"oim":560,"oid":870,"ok ":1437,"ojz":107,"ojv":145,"oju":183,"ojs":1259,"ojo":151,"ojn":3019,"ojm":105,"oji":1325,"oje":1842,"oja":2676,"ol ":641,"oiz":309,"oce":920,"och":145,"oci":1214,"ock":526,"oco":76,"obs":723,"obv":173,"obu":288,"oca":63,"odg":241,"ode":2090,"odk":490,"odl":499,"odi":2311,"odj":811,"odo":2908,"odp":861,"odm":304,"odn":5885,"ods":542,"odt":73,"odr":2425,"of ":1863,"odd":348,"odc":60,"odb":552,"oda":3554,"oel":114,"oen":93,"odz":124,"odv":669,"odu":1390,"og ":1603,"ofi":975,"ofj":119,"ofs":134,"oft":92,"ofo":143,"oh ":61,"oev":59,"off":75,"ofe":287,"ofa":151,"oa ":64,"ob ":1309,"oc ":81,"oam":62,"oak":62,"oba":1124,"od ":12681,"oar":69,"obo":1557,"obr":1689,"obl":2408,"obn":1165,"obm":522,"obh":168,"obj":732,"obi":1733,"obd":521,"obe":1617,"nza":121,"nze":101,"nzi":167,"nzo":136,"nzu":292,"oz ":588,"ows":139,"own":81,"ozv":174,"ozm":90,"ozn":2233,"ozl":77,"ouč":124,"ozo":660,"ozd":363,"oze":956,"ozj":60,"orš":73,"ozi":1844,"oza":922,"otu":189,"oud":99,"ouc":89,"ow ":91,"otl":190,"otj":79,"oti":1413,"oth":99,"ote":2905,"ott":253,"ots":334,"otr":1161,"oto":3788,"otn":1790,"ost":12934,"osu":137,"osv":805,"ota":1246,"ov 
":9875,"osi":707,"osk":3606,"ose":3687,"osf":145,"osp":608,"oss":155,"onč":622,"osr":908,"osm":266,"osl":2901,"oso":665,"osn":920,"ovz":1051,"owe":104,"ovj":805,"ovi":6438,"ovn":6797,"ovl":1128,"ovk":184,"ovr":972,"ovp":60,"ovo":2629,"ovs":1169,"ova":8912,"ovc":416,"ove":14862,"olž":454,"oug":85,"oui":145,"oul":129,"oun":202,"ous":295,"our":376,"out":124,"opn":423,"opo":1059,"opi":1707,"opk":100,"opl":563,"ope":1736,"oph":167,"opa":1127,"os ":1104,"opu":601,"opr":1200,"opt":284,"ops":761,"ook":176,"ood":98,"or ":2915,"oot":78,"oos":261,"oor":234,"ork":374,"orl":98,"orm":2841,"orn":2246,"oro":2344,"orp":318,"orr":124,"orc":198,"ord":1013,"ore":2187,"orf":214,"org":1762,"ori":3690,"orj":1749,"ou ":161,"osa":1004,"osc":65,"ort":1525,"ors":2919,"orv":191,"oru":646,"orz":678,"ory":80,"m² ":116,"ot ":4531,"orb":248,"ora":6836,"olč":72,"ola":1198,"old":256,"olc":287,"on ":4034,"olj":3299,"oli":8521,"oll":233,"olk":1067,"olf":200,"ole":2539,"olh":61,"olg":662,"ols":682,"olt":165,"olm":216,"oln":1077,"olo":4662,"olp":199,"olz":68,"olu":478,"okc":353,"oka":3564,"om ":4672,"oki":527,"oke":1295,"okr":2091,"oks":347,"oko":2939,"okl":420,"okv":386,"okt":717,"oku":681,"ona":3944,"ond":566,"onc":719,"onf":152,"one":1109,"ong":354,"onj":581,"oni":3533,"onk":153,"onn":242,"ono":2372,"ons":2332,"ont":1027,"onu":380,"onv":124,"ony":108,"onz":388,"oma":3776,"ome":4659,"omb":385,"omi":1226,"omm":146,"oml":116,"omp":471,"omn":379,"omo":1779,"omt":60,"omu":612,"omr":152,"oms":494,"op ":353,"la ":14406,"kuž":64,"ína":99,"ín ":66,"ílo":74,"kuš":168,"le ":7709,"lce":1290,"lca":262,"lci":518,"lcs":234,"lf ":143,"őr ":86,"lde":127,"lda":104,"ldo":98,"ldi":83,"lab":207,"lac":514,"lad":2741,"lah":1475,"lag":668,"laj":371,"lai":176,"lal":158,"lak":556,"lan":4164,"lam":493,"lap":158,"lao":145,"lar":855,"lat":1921,"las":3314,"lau":130,"lav":3720,"lay":77,"laz":215,"lba":139,"ld ":368,"lbe":231,"lbi":66,"lbo":79,"lbu":240,"kvi":678,"kve":490,"kva":1108,"kus":268,"kur":144,"kup":2763,"kun":201,"kum":211,"kul":2114,"kuj":269,"koš":266,"ky ":76,"kta":268,"kte":244,"ksp":155,"kst":183,"ksi":550,"kso":225,"ksn":135,"kuh":70,"ktr":1174,"koč":387,"ktu":785,"kti":841,"ktn":114,"kto":1394,"krš":290,"íja":164,"kož":132,"lpo":59,"lps":155,"lpe":443,"lpi":97,"lph":69,"ls ":116,"lol":70,"lok":331,"lon":766,"lom":1330,"lop":499,"lor":376,"lod":293,"loc":74,"loh":68,"log":2821,"loj":149,"lpa":140,"los":423,"lot":1148,"lou":85,"lov":16172,"loz":578,"lno":1801,"lić":59,"lnj":113,"lni":3993,"lež":843,"lne":1558,"lob":711,"lič":1905,"lmo":111,"lmi":113,"leš":1366,"lme":146,"lma":647,"lp ":83,"lna":1726,"lmu":66,"hér":83,"lms":222,"lti":179,"lto":145,"ltr":80,"loč":1559,"lts":67,"ltu":477,"luc":178,"lub":512,"lug":80,"lue":65,"lsk":2568,"lso":81,"lst":393,"lta":305,"lte":1274,"ljš":529,"liž":879,"lu ":1284,"liš":2924,"ía ":66,"lt ":201,"lhe":70,"lj ":3087,"lha":78,"lgo":251,"lge":200,"lgi":319,"li ":11443,"lga":376,"lfr":59,"lač":361,"hât":62,"lfo":59,"lfi":89,"lfa":90,"ház":65,"lez":1147,"ley":245,"lex":102,"lev":543,"les":1759,"let":8204,"ler":672,"leo":160,"lep":918,"lem":1328,"len":1784,"lek":2164,"lel":63,"lei":115,"lej":216,"leh":60,"leg":676,"lef":95,"led":2297,"lec":1220,"leb":105,"lea":97,"lg ":68,"lls":71,"llu":110,"lly":120,"lo ":8977,"lla":550,"lle":786,"lli":672,"llo":334,"lko":692,"lku":60,"ln ":62,"lka":768,"lke":292,"lki":170,"ljs":1060,"leč":258,"lju":2873,"ljo":339,"ljn":358,"lm ":297,"lje":13663,"ll 
":501,"lja":8525,"ljk":59,"laž":224,"lji":2324,"lit":3130,"lis":1861,"lir":252,"lip":265,"lio":237,"lin":3453,"lim":1141,"liz":841,"liv":1015,"liu":90,"lic":2658,"lid":143,"lia":493,"lib":162,"lk ":580,"lik":9667,"dšk":353,"lil":332,"laš":191,"lij":4137,"lig":667,"lih":765,"lie":274,"lif":220,"ma ":6156,"luž":388,"mb ":135,"mac":507,"mah":62,"maj":3291,"mak":294,"mad":706,"mag":926,"mar":1529,"mas":537,"mal":1048,"mam":116,"man":2972,"maz":61,"mat":4492,"mba":322,"mbi":346,"mbe":1994,"mbr":581,"mbo":391,"mbn":601,"me ":3191,"mbu":131,"mde":94,"med":6987,"meg":105,"mec":75,"met":4065,"mev":70,"mes":5031,"mer":4601,"mem":950,"mel":2712,"meo":80,"men":7879,"meh":365,"mek":1496,"mej":987,"mez":436,"mač":311,"mfo":74,"luz":65,"lva":240,"lve":110,"lvi":73,"luk":109,"luj":862,"lun":128,"lum":243,"lut":187,"lus":353,"ly ":206,"loš":1311,"ltä":132,"lož":762,"lza":60,"luč":58,"luš":63,"mpi":770,"mpe":599,"mpo":176,"mpl":480,"mpu":59,"mpt":60,"ms ":130,"mog":546,"mob":319,"mod":647,"mon":864,"mok":221,"moj":62,"mom":240,"mol":590,"mov":1124,"mor":2946,"mos":831,"mot":853,"mou":135,"mpa":312,"moz":59,"mre":315,"mrl":143,"mrt":287,"mu ":1595,"miš":516,"moč":1268,"mso":95,"msk":4107,"moš":595,"my ":71,"mur":404,"mus":157,"mut":85,"mul":1968,"mun":583,"muz":183,"dža":1295,"mi ":4700,"dži":218,"meč":117,"maž":60,"min":1835,"ešn":268,"mio":71,"mil":655,"mim":87,"mir":615,"mis":564,"mit":679,"ešt":114,"miz":89,"mic":185,"eša":172,"eše":255,"mie":124,"mid":113,"ešk":2738,"mik":628,"mij":927,"maš":150,"eši":154,"mih":245,"mo ":2844,"mlj":1176,"mle":124,"mla":591,"mki":710,"mka":116,"mm ":88,"ešč":404,"mič":519,"mni":612,"mnm":116,"mno":883,"mna":379,"mne":300,"meš":231,"mma":103,"mme":132,"Če ":73,"Čep":77,"Češ":144,"Črn":256,"rža":2068,"rže":110,"rži":130,"ča ":1709,"čal":131,"čam":203,"čan":1056,"včn":97,"čar":720,"čas":2119,"čaj":758,"čak":128,"vče":97,"vča":256,"zre":618,"uče":431,"uča":350,"zra":1527,"če ":2812,"uču":301,"zro":341,"učn":253,"uči":543,"čat":86,"čav":96,"čba":88,"čbe":68,"víl":96,"ziš":265,"zte":155,"čeg":102,"čen":2815,"čem":362,"čel":601,"ček":421,"čev":1119,"čet":926,"čes":137,"čer":105,"zto":106,"čep":104,"zse":225,"zu ":354,"zst":173,"zva":406,"zvi":2202,"zve":1702,"či ":940,"zvr":346,"zvo":918,"zuj":323,"čez":81,"zur":194,"zul":378,"zum":402,"zun":254,"zus":115,"čij":767,"čih":292,"čic":410,"čk ":142,"čit":716,"čis":78,"čin":8235,"čil":961,"čim":249,"čko":555,"čkr":195,"čka":504,"zzo":59,"čke":298,"čki":315,"zza":68,"čjo":263,"čju":736,"čja":509,"čje":920,"čji":637,"češ":144,"člo":587,"čo ":156,"čle":236,"čla":635,"čob":73,"čič":263,"čić":71,"čni":3672,"čno":2056,"čna":1890,"čne":3797,"čiš":160,"zgl":160,"zi ":788,"zač":773,"zha":357,"zgu":93,"zgr":400,"zgo":1653,"zej":178,"zdr":1230,"zdj":83,"zdo":198,"zdn":338,"zet":178,"zen":780,"ván":80,"zem":1646,"zel":858,"vár":138,"zer":530,"ze ":1270,"zbo":448,"zbi":532,"zbu":82,"zbr":206,"zda":703,"zdi":107,"zde":1030,"zab":392,"zad":779,"zac":850,"zaz":93,"zd ":136,"zbe":87,"zai":81,"zaj":329,"zag":420,"zah":1609,"zam":289,"zan":1549,"zak":553,"zal":882,"zar":791,"zap":901,"zav":812,"zas":719,"zat":657,"zod":127,"zob":347,"zor":519,"zom":118,"zon":2846,"zol":213,"zof":467,"zpe":152,"zpa":113,"zoz":300,"zov":527,"zpr":114,"zpo":448,"ال":104,"zo 
":291,"zma":791,"zmn":81,"zmo":266,"zme":1501,"zmi":301,"zna":6251,"zmu":231,"zno":574,"zič":152,"ršč":1404,"zne":577,"zni":1693,"zka":85,"zko":152,"zkl":76,"zki":60,"zku":97,"zla":344,"zli":1716,"zle":63,"zlo":375,"zho":1743,"rša":147,"zia":58,"rše":205,"zid":236,"zic":70,"zij":1942,"ršj":95,"zaš":153,"rši":594,"ršn":88,"zin":229,"zim":197,"zil":483,"zik":2477,"ršk":1367,"zio":176,"zir":1419,"zis":483,"zit":356,"ziv":530,"zja":105,"zje":158,"yst":102,"ysi":60,"yro":69,"yon":68,"za ":8558,"ye ":58,"yer":80,"ya ":130,"yar":104,"yku":71,"yle":59,"yi ":114,"yje":66,"yja":58,"ن ":69,"ožu":71,"ože":1171,"oža":441,"ožb":309,"ožn":777,"oži":853,"ožj":690,"ožg":76,"ož ":115,"té ":62,"tät":153,"ći ":280,"xan":70,"ošč":370,"wn ":129,"ws ":103,"rče":97,"rči":287,"wor":59,"wer":110,"wel":68,"nže":273,"oš ":92,"wis":86,"ošt":692,"oši":152,"oše":119,"wic":65,"ošo":62,"ošn":419,"win":68,"ošk":1384,"oša":203,"vzo":115,"vzn":60,"vzr":280,"vzp":183,"vzg":93,"vze":217,"vzd":233,"vrš":1631,"vzh":1577,"vza":88,"wal":60,"war":181,"viš":902,"vrt":231,"vrs":1856,"vrn":62,"vro":1339,"vri":65,"vrh":387,"vre":731,"vra":482,"vso":118,"vst":1541,"vse":2779,"vsk":1809,"vsi":97,"vu ":881,"vsa":763,"vto":922,"vtr":73,"voč":112,"vts":62,"vul":228,"via":94,"vk ":203,"vio":100,"vir":2583,"vik":86,"vil":3700,"vim":485,"vin":3336,"vig":158,"vih":947,"vaš":4759,"vij":2768,"vic":1147,"vid":829,"vie":118,"nše":75,"vja":192,"viz":807,"nšt":314,"vit":1128,"vis":1138,"več":3002,"vje":763,"vka":371,"vju":170,"vko":123,"vke":182,"vkl":325,"vla":601,"vle":130,"vlo":204,"vlj":4644,"vo ":5275,"vme":74,"rén":67,"veš":647,"vež":91,"vne":3166,"vna":1676,"vno":7416,"vić":274,"vnj":103,"vni":6241,"nšč":326,"vič":1157,"vob":458,"vod":2329,"vog":94,"voj":6252,"vol":768,"vok":302,"von":637,"vom":445,"vor":1714,"vot":255,"vos":442,"vov":212,"voz":481,"vpi":58,"vpl":277,"vpr":143,"vgu":581,"vi ":4563,"vač":184,"vey":59,"vez":2827,"ver":4490,"ves":828,"vet":5958,"vdo":109,"vej":290,"veh":446,"veg":583,"rán":91,"ven":15616,"vem":1214,"vel":2593,"vek":749,"ved":1600,"vec":1132,"vcu":59,"vca":96,"ve ":3712,"vci":431,"vce":194,"val":4949,"vak":400,"van":6929,"vam":248,"var":2538,"vat":726,"vas":1162,"vaz":61,"vac":503,"vad":923,"vai":77,"vaj":1562,"vah":328,"va ":7766,"uzi":197,"urš":301,"uze":286,"uza":111,"urč":177,"ux ":175,"uva":93,"uve":379,"uvr":1131,"usl":98,"usm":167,"usk":1090,"usi":676,"use":332,"usa":242,"usu":60,"ust":3507,"uss":205,"usp":414,"uso":223,"usn":197,"utl":65,"utn":316,"uth":139,"uti":197,"ute":579,"uta":264,"utt":69,"uts":102,"uto":194,"utr":115,"us ":1853,"ut ":342,"urb":124,"ura":3309,"urd":58,"ure":1237,"urg":652,"urj":220,"uri":925,"url":67,"urk":122,"urm":63,"urn":892,"uro":433,"urs":478,"urt":136,"uru":97,"ury":61,"upa":4858,"ur ":618,"upi":1896,"upe":339,"upo":4232,"upr":4636,"upl":66,"upn":1389,"umr":152,"umu":107,"umi":296,"umo":158,"uma":367,"umb":237,"ume":1232,"unt":159,"uns":232,"unk":535,"unj":73,"uni":1649,"uno":147,"unc":267,"und":490,"una":1305,"ung":160,"une":190,"up ":120,"uks":65,"ukr":140,"uku":79,"ukt":476,"uko":214,"ukn":60,"ukl":92,"uki":118,"ukc":123,"uke":60,"um ":819,"uka":277,"ulu":100,"ult":2100,"uls":66,"ulo":296,"ull":127,"ulk":70,"ulj":355,"uli":1131,"ule":1791,"ulf":70,"uld":69,"ula":1090,"un ":304,"ukv":294,"uig":66,"mšk":1972,"uil":118,"uin":78,"uir":59,"uis":202,"uk ":155,"uje":4627,"uji":154,"ujo":137,"ujs":75,"uit":156,"ul ":297,"uja":181,"ugh":94,"ugi":1041,"lži":298,"uge":696,"lžn":64,"ugo":2841,"ugl":86,"uga":776,"uhi":80,"uho":1525,"ugu":295,"uha":183,"uj 
":60,"uda":313,"ude":487,"udj":82,"udi":5813,"udn":106,"ubo":133,"ubn":112,"ubs":149,"ubr":361,"uca":72,"ue ":195,"uce":114,"uci":418,"uch":129,"uck":88,"uer":142,"ues":152,"uh ":100,"uds":601,"udo":411,"ug ":215,"ued":59,"uen":108,"uel":148,"ub ":374,"uar":2656,"ual":170,"uan":113,"ubi":368,"ubl":4817,"ube":341,"uba":207,"ud ":197,"trž":74,"ty ":183,"tvu":608,"tvo":3047,"tve":2705,"tvi":733,"tva":3299,"tur":2565,"tus":332,"tut":264,"tuj":422,"tul":174,"tun":135,"tum":104,"tub":74,"tua":118,"tud":5466,"tuc":67,"tug":184,"tz ":159,"toš":66,"ts ":250,"tiš":235,"tmá":241,"trd":395,"tre":2501,"oče":1653,"tt ":133,"oča":1047,"tra":4316,"trj":88,"očj":1644,"očk":616,"očl":103,"će ":102,"trm":72,"trg":328,"tri":4216,"oči":1373,"trs":733,"oču":60,"tru":1087,"trt":467,"tro":4048,"trn":84,"očn":740,"tu ":1700,"try":92,"tsc":117,"tsk":2341,"tta":143,"tte":299,"tti":218,"tto":129,"ttp":71,"tts":81,"toč":930,"tma":1947,"to ":6420,"tmo":86,"tmi":151,"teš":130,"tež":380,"tni":5182,"tne":1620,"ća ":63,"tp ":71,"tna":1556,"tič":5076,"tno":2333,"tod":350,"toc":762,"toj":799,"toi":353,"toh":82,"tog":270,"tob":807,"tou":124,"tov":5531,"tos":1124,"tot":329,"toz":100,"tom":1530,"ton":1758,"tok":2514,"tol":3650,"tor":3549,"top":2051,"oč ":359,"tij":686,"til":753,"tik":2804,"tif":115,"tie":119,"tih":1408,"tig":120,"tir":1113,"tit":785,"tis":2436,"tin":2891,"tim":668,"tip":613,"tio":1015,"thu":192,"tia":170,"tib":79,"tic":1448,"tid":147,"tji":207,"tju":277,"teč":443,"tjo":443,"tiz":729,"tiv":1588,"tje":1033,"tja":867,"tki":1874,"tko":496,"tku":163,"tka":483,"tke":192,"tlj":122,"tli":831,"tlo":345,"tla":447,"tle":327,"tem":4827,"ten":1364,"teo":775,"tep":103,"tei":263,"tej":504,"tek":2989,"tel":4105,"tef":109,"teg":1181,"teh":652,"tea":89,"teb":91,"tec":139,"ted":342,"th ":436,"tez":323,"tev":3243,"tet":1626,"tes":590,"ter":10610,"ti ":9411,"tho":148,"the":481,"thi":96,"tha":151,"zšl":321,"zši":325,"zše":149,"Živ":99,"yőr":92,"Žel":286,"ža ":604,"Žup":1073,"žko":65,"žle":58,"žlj":64,"žju":249,"žke":66,"žin":1764,"žim":160,"žil":222,"žir":92,"živ":1723,"žit":230,"žis":155,"žja":222,"žje":479,"žji":178,"žic":387,"žig":82,"žij":171,"žič":188,"žnj":265,"žni":1372,"žno":1191,"žna":269,"žne":657,"žo ":93,"žeš":277,"že ":817,"žbi":268,"žbe":388,"žbo":86,"žav":2001,"žba":234,"žaj":196,"žal":97,"žan":339,"žar":1500,"žga":114,"ži ":1033,"žev":1038,"žej":69,"žek":75,"žel":1057,"žem":871,"žen":2078,"žef":104,"ždi":82,"žuž":106,"užb":674,"uže":844,"užu":116,"uži":1380,"užn":1159,"žuj":195,"žup":4080,"žiš":141,"žu ":73,"všč":108,"všk":96,"vše":78,"vši":66,"yír":60,"ušč":304,"ušt":314,"ušn":118,"ušk":172,"uši":260,"uše":213,"uša":205},"n_words":[5788075,6773679,5606921],"name":"sl"}
\ No newline at end of file
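The file removed above, like the three that follow, is a single JSON object with a "freq" map of raw counts for 1- to 3-character n-grams, an "n_words" triple of per-order totals, and a "name" language code ("sl" here, then "so", "sq", "sv"). A minimal loader sketch under that observed structure; load_profile is a hypothetical helper for illustration, not part of this patch or of the langdetect API:

import json

def load_profile(path):
    # Each deleted file is one JSON object:
    # {"freq": {...}, "n_words": [n1, n2, n3], "name": "sl"}
    with open(path, encoding="utf-8") as f:
        profile = json.load(f)
    # Sanity checks matching the structure visible in the diff.
    assert set(profile) >= {"freq", "n_words", "name"}
    assert len(profile["n_words"]) == 3   # per-order n-gram totals
    assert all(1 <= len(g) <= 3 for g in profile["freq"])
    return profile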
+++ /dev/null
-{"freq":{"YO ":13,"jec":34,"jee":32,"D":313,"E":183,"F":66,"G":214,"A":673,"B":249,"C":240,"L":152,"M":367,"N":163,"O":122,"H":180,"I":236,"J":129,"K":173,"U":82,"T":107,"W":226,"V":11,"Q":76,"P":22,"S":486,"R":114,"Y":96,"X":120,"Z":10,"f":458,"g":2154,"d":5233,"e":4497,"b":2102,"c":900,"a":24510,"n":3878,"o":5982,"l":3786,"m":2460,"j":397,"k":2897,"h":3132,"i":6615,"w":2306,"v":27,"u":3829,"t":1545,"s":2871,"r":2895,"q":718,"p":77,"z":23,"y":3607,"x":1698,"jaa":13,"jab":16,"jar":10,"jam":12,"Xam":11,"joo":14,"Xas":10,"jis":14,"jir":95,"jii":13,"jid":17,"jo ":15,"Far":12,"isk":69,"ism":12,"isl":25,"iso":22,"isu":42,"ist":67,"ita":17,"is ":71,"ion":20,"ir ":84,"irs":56,"irt":28,"iro":22,"irk":32,"iri":56,"isi":32,"ish":96,"ise":18,"isb":17,"Wux":23,"isa":134,"ire":16,"ira":131,"iyi":10,"iyo":394,"iya":423,"iye":65,"ixi":16," l":598," m":880,"kii":161," n":189," o":537," h":365," i":795," j":267," k":1328," d":1214," e":328," f":95," g":401," a":1317," b":593," c":361," y":296," x":283," u":599," t":376," w":1834," q":291," p":20," s":807," r":112,"km ":14," J":125," K":142," H":119," I":161," N":93," O":34," L":81," M":322," B":217,"khd":24," C":229,"kha":11," A":275," F":59," G":169," D":236," E":41," Z":10," Y":40," X":90," S":438," R":66," Q":69," P":18," W":211," U":33," T":83,"kee":20,"key":11,"kh ":38,"Web":10,"Waa":56,"ku ":434,"kor":15,"Wax":40,"koo":94,"War":17,"XEE":11,"مد":16,"Gal":22,"و":25,"ي":76,"ف":13,"ق":12,"ل":77,"م":62,"ن":31,"ه":13,"د":46,"ح":26,"ب":37,"ة":21,"ا":98,"أ":11,"ع":29,"ش":21,"س":23,"ر":49,"kar":49,"kas":30,"kan":49,"kal":143,"kam":32,"kad":48,"kac":14,"kab":10,"kaa":81,"ka ":1268,"A ":83," Ga":53," Ge":18,"Da":59,"DU":11,"Cu":18,"Co":13,"DE":11," Fi":17,"Ce":13,"DH":15,"Ci":23," Ha":35,"Du":13,"EY":13," Go":61," Gu":12,"EG":11,"De":45,"EE":45,"EL":14,"Di":29,"Dh":36,"H ":16,"GA":19,"Fa":23," IY":15,"Er":12," Ho":29,"ha ":334," Hi":37,"Ge":18," Ji":25,"Ga":53,"حم":18,"HA":35,"I ":13," Ja":63," KA":16," Is":32," It":29,"GM":12," In":35,"Fi":17,"ham":43,"han":102," Ka":28,"hal":48,"haw":17,"hax":44,"haq":58," Ki":19,"har":45,"has":76," Kh":10," Ju":19,"hah":12,"hab":77,"haa":189,"had":144,"hac":36,"AS":15,"AR":23," MA":17,"AX":27," La":22,"AY":15,"BA":11," Li":11,"C ":10,"AD":43,"AA":51,"AB":14,"AG":11," Ko":23,"AH":23,"hay":333,"AL":37," Ku":26,"AM":13,"AN":35," Ma":180,"Ax":18,"Ar":12,"D ":22,"بن":10," Mi":27,"Ba":101,"CA":15,"Af":65,"بد":10,"he ":25,"Aa":22,"Ab":33,"Ad":10,"Am":17," Lu":25,"Al":38," Ne":14,"Bu":30," Na":32,"Ca":127,"DA":43,"E ":30,"Bi":19,"Be":25,"hda":27,"Bo":30,"Hin":18," Mu":78,"hel":22,"Ku":26,"hee":112,"Ko":23,"hey":26,"hex":72,"Li":11,"N ":26,"her":11,"MA":41,"La":22,"Lu":25,"hi ":27,"Mi":27,"NK":10,"ال":51,"O ":34,"NA":12,"Ma":180,"Mu":79,"Ne":14,"Na":32," Am":16," Al":38,"Nu":16," Af":65,"No":12,"OO":18," Ad":10," Aa":22," Ab":33," Ba":101," CA":12," Ax":18," Ar":12,"hig":23," Be":25,"hid":12," Bi":19,"hin":40,"Go":61,"him":17,"Gu":12," Bo":30,"hii":170," Bu":30,"his":24,"hir":31,"Ha":35," Ca":127,"Hi":37," Ce":13," DE":10," Ci":23,"IN":12,"Ho":29," DH":13,"IS":10," Co":12," Cu":18,"IY":20," Da":59," Di":29," Dh":36,"In":36," De":45,"Is":32,"It":30,"Ja":63,"KA":33," Du":13,"Ji":25," Er":12,"Ju":19,"LA":35,"Ka":28,"Kh":10,"ho ":53,"Har":14,"Ki":19,"LE":16," Fa":23,"gma":64,"go ":32," Xi":13," Xa":51,"UU":11,"yuu":26," Wu":23,"To":11,"Th":10," Wi":15," We":12,"Ta":37," Wa":133,"St":13,"Su":23,"Wu":23,"gob":97,"Wi":16,"Wa":133,"XA":19,"We":12,"XE":12,"Y ":18,"yst":29," Yu":14,"yso":15," Ya":10,"WA":26,"gmo":41,"ysa":93,"Qa":26,"Qo":17," 
م":12,"RA":10,"S ":18," ع":21," ا":48,"goo":52,"R ":20," ب":13,"gsa":14,"gu ":229,"Si":17,"Sh":86,"gsi":12,"So":180,"Ru":12,"U ":11,"Sa":70,"TA":13,"Re":13,"SH":11,"Ro":11,"yoo":24,"Qu":16,"SA":16,"Ra":20,"gud":22," Nu":16," No":12,"gta":43," Ra":20," Qu":16,"b ":130," Ro":11," Re":13,"guu":20,"gun":12,"a ":5909," Qo":17," Qa":26,"شي":10," Su":23," St":13," Ta":37,"Ya":10," Th":10,"Yu":14," To":11," Ru":12," Sa":70,"Xa":51,"YO":15," Sh":86," Si":17,"Xi":13," So":180," WA":20,"ري":12,"Gob":48," ja":60,"i ":853,"ye ":36,"ian":11," iy":365," ji":127,"ge":93," je":47,"ga":1135,"fk":16,"Ing":16," im":15," in":148," il":54," ii":23,"ic ":14,"fi":49,"fr":45,"fu":47,"ft":29,"fo":18," is":155," ka":688," kh":13,"hd":44,"he":286," ki":46," ke":11,"ha":1580,"gn":11,"gm":108," jo":14,"gl":15,"gi":72,"id ":171,"gu":305,"iba":32,"gt":52,"gs":27,"gr":15," ju":17,"go":196,"du":188,"dw":36,"dy":13,"g ":83," ha":190,"ea":16,"eb":72,"yee":61,"ec":51," he":28,"ed":360,"de":252,"dd":113,"di":494,"dh":632,"dk":189,"dl":33," go":117,"do":234,"dn":22," gu":55,"ia ":36,"ex":102,"ey":554,"fa":110,"h ":441," id":15,"fe":17,"eh":54,"ib ":32,"eg":202," hi":20,"ee":1263,"el":242,"ek":35," ho":120,"ei":12,"yey":26,"en":172,"em":31,"et":26,"es":93,"er":287,"ya ":266,"ca":427," ni":37,"e ":881," ne":15,"bs":21," na":54,"br":36,"bu":104,"bt":55,"bn":18,"bo":234,"bk":30,"bl":13," mu":48,"ig ":10,"bi":355,"bb":15,"bd":41,"be":201,"db":11,"da":2087," og":18,"f ":98,"cy":18," of":16,"cu":41,"ct":11,"cs":27,"co":62,"cm":24,"cn":13,"cl":19,"ci":73," nu":10,"ch":33," no":73,"ce":64,"cd":20,"yad":111,"yag":10," le":91,"c ":51,"yaa":287," la":334,"icm":22," ku":465,"ici":14," km":14,"ica":25," ko":88," me":49,"az":10,"ay":1458,"idu":13," mi":187,"ba":817,"d ":893,"at":134,"as":580,"yd ":29,"ido":43,"ar":1307,"aq":237," ma":590,"ax":1066,"aw":157,"idk":12,"yay":52," lu":25,"ak":76,"al":1647,"idi":35,"yaw":11,"idh":19,"ai":29,"aj":59,"yar":45,"am":590,"an":1951,"yaq":50,"yan":13,"ac":260,"ida":140,"ad":2243,"aa":4171," lo":138,"ab":630,"ag":664,"ah":1152,"yah":134,"af":128,"iib":15,"nu":38,"iic":11,"nt":263," af":45,"ns":59," ah":473," aa":208,"iig":13," ab":31,"iid":50,"no":160,"nn":18," ad":49,"q ":34," am":103," an":18,"iik":48,"iin":164,"ny":57,"yka":17,"iil":93," al":21,"iim":26,"iis":199,"iir":65,"of":78,"iiq":14,"oc":29," ax":10,"od":156," ar":26,"ob":291," aq":21," as":29,"om":340,"on":186," ba":344,"ok":16,"ol":273," ay":246,"og":129,"il ":80,"ot":41,"os":90," bi":107,"op":10,"oo":1738," be":63,"or":236,"oq":49,"yn ":105," bo":34,"r ":475,"ox":10,"ow":125,"oy":128," bu":35,"pa":14," ca":238,"im ":21,"ika":50,"lo":386,"ige":10,"lm":39,"ll":110,"ls":27,"iga":247,"ii ":339,"lw":14,"lu":48,"igi":31,"yo ":488,"ly":56,"igu":13,"igt":12,"o ":2012,"ma":1465,"mb":52,"mh":21,"me":199,"mk":39,"mi":333,"mp":19,"mo":102,"yna":98,"mu":85,"ihi":82,"yni":14,"na":851,"nb":30,"yne":30,"nc":10,"nd":137,"ne":107,"nf":30,"ng":58,"ynt":29,"ni":213,"nk":312,"nl":21,"imo":20,"ju":17,"jo":31," ee":295,"imi":21,"ki":203,"kh":95,"ke":48,"ind":29,"ina":80," fa":48,"yga":15,"ka":1778,"yi ":19,"m ":103," fu":10,"ino":13,"kt":20," fo":12,"ku":558,"int":102,"ins":10,"ko":130,"ine":14,"ing":16," fi":17,"ini":10,"km":16,"ink":82," ge":36,"li":577,"lk":332,"le":352," ga":186,"ld":23,"lg":22,"inu":15,"la":1306,"lb":52,"iny":13,"n ":1478," co":22,"ht":11,"hu":92,"ikh":54," ce":15,"hi":387,"hn":16,"ho":217," ci":36,"ila":160,"id":471,"ic":103,"yin":59,"ib":108,"ia":61,"ih":88,"in ":262,"ig":350," da":424,"if":21,"yih":49,"yig":21," cu":34,"hy":12,"k 
":24,"iq":21," do":45,"ilo":13,"ir":438,"is":630,"it":49,"ill":18,"ilk":32,"ix":28,"ilm":12,"ii":1062,"ij":21,"ik":134," de":120,"ili":51,"il":385,"im":170,"in":663,"io":30," di":70,"yir":13," dh":511,"ima":76,"je":69,"ji":178,"iy":896," du":39,"l ":398,"ja":82,"xi":123,"xo":56,"xm":34,"xw":27,"xu":185,"xb":18,"xa":850,"xe":161,"xd":67,"wg":11,"wi":81,"how":15,"wl":60,"wo":26,"wu":102,"hog":13,"y ":1137,"wa":1722,"wd":13,"hoo":55,"we":185,"hor":60," yi":55," yu":13,"uy":12,"ux":164,"uw":34,"uu":720," ye":13,"ve":10," ya":211,"x ":140," xo":33,"uj":15,"uk":28,"ul":200,"uf":20," xi":90,"ug":210,"uh":16,"uq":90,"ur":259,"hna":12," xu":39,"us":114,"ut":54,"um":90,"un":214,"tu":47,"ub":104,"ua":11,"ud":145,"uc":17," xe":16,"w ":59," xa":103,"to":175,"hul":37,"tr":25,"te":120,"ti":246,"th":37,"ta":784,"su":111,"ss":19,"st":173,"sw":12,"sl":47,"sk":106,"sm":25,"so":371,"sr":10,"sc":17,"se":101,"sh":456,"ي ":20,"xme":19,"si":404,"xma":13,"u ":1296,"sa":722,"sb":21,"rr":20,"rs":115,"rt":160,"ru":77,"rw":11,"rx":11,"ry":27,"ro":144,"rn":40,"rm":32,"rl":22,"rk":200,"ri":397,"hu ":11,"rg":35,"re":258,"rd":49,"rc":12,"rb":25,"ra":754,"t ":51,"qu":35,"qs":10,"xoo":44,"qo":163,"IYO":15,"qi":33,"qe":23,"qa":334,"qd":61,"s ":240,"pu":15,"pr":14," ru":12," u ":194," sa":221," se":17," si":157," sh":112," so":259," qu":21,"xya":13," ra":48," re":33,"ن ":17," ro":11," qe":14," qa":168," qo":69," qi":18," oo":464," or":10,"huu":29," wa":1582," we":88," wo":12," wu":102," wi":39," uu":195,"xud":12,"xuu":133,"Hoo":12," tu":36," us":16," ur":10,"م ":11," um":12," un":11," ug":131,"yg":19," ta":231,"ye":133,"yd":48,"ya":998,"yb":27,"xwe":21,"xy":17," su":25,"yu":34,"ys":166," to":18," th":15," ti":62,"yo":522,"yn":280," te":11,"yk":19,"yi":189,"fee":11,"xey":58,"xee":54,"far":32,"fad":21,"faa":24,"Suu":12,"Axm":14,"xir":17,"xis":13,"xil":26,"xii":17,"xid":14,"xig":24,"Sta":10,"xa ":169,"eyb":17,"eya":63,"eys":74,"Tal":11,"eyn":163,"eyo":14,"eyk":10,"xda":51,"eyd":16,"eye":14,"exa":10,"exd":12,"exe":51,"xe ":46,"xar":38,"Ban":18,"Baa":14,"Bad":22,"xam":54,"xan":16,"Bar":23,"xay":166,"xba":16,"xaa":341,"xad":27,"xag":13,"wux":100,"Aas":11,"Shi":22,"She":12,"Sha":50,"ex ":21,"Af ":19,"ey ":159,"er ":103,"es ":21,"eri":33,"ere":30,"era":49,"Afr":32,"esh":28,"esa":10,"ers":11,"ern":14,"ekh":16,"en ":89,"ela":47,"ele":26,"eli":17,"ell":42,"elo":15,"emb":19,"ena":28,"wla":53,"eny":12,"egm":90,"ego":14,"egt":11,"Som":32,"Soo":136,"woq":10,"el ":65,"wda":13,"Buu":11,"Bur":11,"we ":12,"gir":17,"gii":26,"wey":124,"wee":27,"gey":15,"gee":44,"wi ":14,"wis":10,"wii":22,"Sal":11,"gab":12,"gac":45,"gad":26,"DA ":20,"gaa":436,"gar":35,"gay":21,"gal":70,"gan":69,"ga ":388,"San":27,"wa ":22,"Cab":27,"waq":26,"wan":30,"wal":39,"wax":715,"way":45,"Cal":18,"war":52,"was":18,"Car":40,"waa":581,"wad":168,"Bel":10,"fur":37,"Bis":12,"fri":39,"fii":15,"Boo":10,"fka":13,"da ":918,"de ":22,"dad":131,"daa":159,"dab":19,"dal":113,"WAX":16,"dag":65,"dah":101,"dar":51,"dan":291,"dam":39,"day":61,"dax":79,"daw":32,"Cum":10,"dda":74,"dde":11,"ddi":17,"cun":14,"EEY":13,"EEL":14,"EGM":11,"Deg":30,"cyo":15,"uxu":126,"Daa":22,"Dag":10,"Dal":10,"uxa":15,"uun":88,"uul":63,"uum":13,"uug":15,"uud":50,"uux":10,"ux ":12,"uus":29,"uur":74,"uuq":18,"uut":24,"uwa":28,"co ":26,"cma":23,"ush":13,"usi":11,"use":13,"uu ":316,"usu":26,"uso":11,"uti":16,"uta":19,"cod":10,"com":11,"uqa":33,"uqd":36,"ura":37,"ure":10,"uri":31,"urk":17,"urt":32,"uru":37,"ur 
":39,"csi":14,"uma":56,"unt":32,"unk":27,"uni":11,"una":85,"cel":30,"uka":13,"cee":17,"uls":10,"ulo":20,"ull":14,"ulk":27,"uli":14,"ule":16,"ula":26,"un ":29,"che":12,"ul ":36,"ciy":12,"cii":28,"uga":40,"ugu":128,"ugs":11,"ed ":184,"ebi":20,"uf ":13,"uda":33,"udi":12,"eb ":12,"udu":37,"ug ":18,"ega":53,"ub ":32,"eek":25,"een":99,"eel":138,"eem":18,"eeb":23,"eeg":65,"eed":229,"eey":113,"eh ":42,"ees":56,"eer":157,"edk":18,"edi":12,"ede":22,"eda":72,"uba":39,"ubb":11,"edu":15,"ud ":36,"edo":11,"ecl":12,"ece":25,"ee ":319,"dwe":25,"dwa":11,"duu":57,"tuu":22,"doo":96,"dow":37,"tri":10,"The":10,"dna":12,"to ":75,"Dhe":14,"Dhu":12,"dun":12,"dul":20,"dug":23,"too":69,"du ":45,"tii":59,"tig":10,"tir":66,"dha":335,"tio":16,"tic":26,"dhu":33,"dib":25,"dhi":112,"dhe":122,"dho":21,"der":19,"dex":18,"dey":16,"dee":48,"deg":96,"den":15,"di ":38,"dle":11,"dla":17,"tee":36,"dku":14,"dki":33,"do ":77,"ter":36,"diy":39,"din":26,"ti ":29,"dir":60,"dis":51,"dig":42,"dii":165,"dil":12,"dka":134,"the":16,"rga":14,"ri ":48,"rge":14,"rey":42,"ree":110,"rda":15,"rdh":16,"re ":77,"rco":10,"rax":25,"ray":99,"rar":15,"ras":44,"rat":10,"rba":11,"rah":41,"ran":54,"ram":17,"rak":12,"rab":82,"raa":165,"rad":87,"rs ":11,"roo":48,"rna":16,"rne":11,"rni":10,"ro ":63,"rma":23,"Nab":15,"rla":13,"rku":10,"rko":10,"rki":41,"rke":18,"rka":117,"riy":58,"ris":28,"rig":31,"rii":110,"rik":46,"rin":21,"ric":16,"rya":13,"rur":10,"run":18,"ruu":10,"ry ":11,"rsi":16,"rsa":63,"rsh":15,"rta":110,"rto":18,"rte":11,"rti":11,"rub":12,"saa":120,"sab":11,"sad":52,"sag":23,"sah":11,"sal":49,"sam":47,"sbi":14,"san":191,"sas":14,"sar":33,"say":43,"sa ":99,"sha":242,"sho":46,"she":41,"shi":83,"si ":68,"siy":42,"sid":91,"shu":10,"sil":13,"sim":38,"sii":82,"sig":32,"se ":61,"sh ":17,"see":14,"sow":16,"som":59,"soo":214,"soc":14,"su ":25,"sla":30,"sku":37,"ska":59,"so ":55,"sma":15,"حمد":15,"ste":15,"sta":66,"sto":28,"sti":41,"sub":11,"suf":12,"sug":13,"sul":11,"suu":22,"tal":42,"tag":10,"tah":87,"taa":194,"tad":13,"tay":60,"tar":33,"tan":31,"tam":13,"te ":13,"ta ":272,"bka":23,"biy":71,"bis":28,"bir":12,"bil":48,"bin":31,"big":38,"bii":37,"bo ":47,"bol":129,"bna":15,"boo":24,"bba":12,"be ":19,"ban":61,"bal":43,"bah":27,"bad":232,"baa":96,"bab":12,"bay":35,"bax":34,"bas":10,"bar":156,"bdi":25,"bdu":11,"bi ":69,"bee":145,"ber":11,"bey":12,"ca ":55,"car":35,"cas":13,"can":24,"cay":13,"cab":20,"cad":53,"caa":145,"cal":33,"cag":16,"bri":13,"bra":15,"bsa":11,"bta":33,"bti":13,"bur":20,"bul":12,"buu":52,"aka":19,"am ":40,"aki":23,"aji":27,"ajo":16,"qa ":12,"al ":136,"ahi":41,"qar":20,"qay":16,"aho":10,"qad":44,"qab":47,"qaa":149,"ahd":20,"qan":14,"qal":17,"ahe":26,"aha":697,"agm":13,"agt":24,"agu":76,"ago":29,"aq ":22,"qdi":38,"qda":17,"any":23,"ano":51,"ann":10,"ant":70,"ans":32,"ane":21,"ang":10," ال":46,"ani":87,"ank":185,"ana":385,"anb":26,"and":92,"amu":23,"amo":10,"amk":32,"amh":19,"ami":82,"ame":93,"amb":16,"ama":257,"aly":20,"qey":14,"alo":160,"alm":17,"all":22,"alk":165,"alg":17,"ali":424,"ald":14,"ale":110,"ala":480,"alb":42,"an ":924,"aba":194,"abd":37,"abe":56,"abi":146,"abk":18,"abo":40,"abt":38,"abu":36,"aca":130,"aab":114,"aac":13,"aaa":15,"aaf":38,"aag":64,"aad":398,"aaj":28,"aak":21,"aah":75,"aan":742,"aal":743,"aam":113,"aas":211,"aar":259,"aaq":41,"aaw":32,"aat":37,"aay":89,"aax":19,"ad ":334,"qiy":15,"ac ":19,"aa ":1110,"qii":10,"ab ":33,"afr":11,"aft":15,"afi":18,"aga":458,"age":12,"ah ":325,"afa":38,"ado":85,"adl":23,"adk":153,"adn":12,"adh":26,"adi":223,"add":96,"ade":66,"ag 
":29,"adw":22,"adu":44,"aci":16,"ace":10,"Qar":12,"acd":15,"ada":1138,"af ":19,"acy":15,"acs":19,"qor":48,"qoo":60,"qof":24,"axi":13,"axm":15,"axo":15,"axu":15,"axa":702,"axb":16,"axd":50,"axe":90,"ayi":11,"ayo":52,"ayn":115,"ays":84,"ayu":13,"axy":16,"axw":26,"ayb":10,"aya":151,"ayg":11,"ayd":32,"aye":26,"ba ":84,"qur":24,"at ":11,"arg":25,"are":96,"ard":30,"arb":14,"ara":357,"aro":72,"arn":19,"arm":17,"arl":10,"ark":135,"ari":153,"aru":20,"ars":39,"art":72,"asa":99,"ary":14,"asi":106,"ash":156,"ase":12,"aso":31,"ask":17,"ar ":198,"as ":80,"aqa":111,"aqi":13,"aqo":51,"ax ":98,"awe":20,"ay ":932,"awa":46,"awl":31,"awi":33,"ata":37,"asu":12,"ast":33,"ato":18,"ate":17,"ra ":58,"ati":34,"ngi":20,"ni ":47,"Isl":11,"neh":11,"ng ":11,"nee":16,"nfu":25,"ney":14,"ne ":43,"ndh":18,"ndi":22,"nan":17,"nac":45,"nad":83,"nah":41,"nab":18,"naa":131,"Ito":28,"nbe":15,"nd ":69,"AXE":10,"AY ":10,"nba":11,"AXA":12,"nay":47,"nax":11,"na ":412,"Jab":13,"Jan":13,"Jam":22,"KA ":11,"KAL":10,"nya":38,"AAL":13,"ADA":25,"nuu":21,"nto":13,"nti":37,"nta":176,"nte":24,"nsi":15,"nsa":22,"AHA":14,"noo":67,"noq":18,"nna":11,"ALA":17,"nle":12,"no ":59,"nki":22,"nka":271,"AN ":16,"nii":13,"nih":11,"nig":39,"niy":10,"nis":15,"nim":17,"nin":39,"ogu":24,"oga":60,"Jub":11,"ol ":60,"oco":11,"odi":15,"of ":38,"oda":43,"ofe":10,"LA ":12,"د ":29,"oba":86,"od ":60,"obo":134,"obi":38,"ة ":21,"oyi":94,"oya":10,"owl":29,"ow ":45,"ost":14,"ota":10,"ose":28,"os ":15,"oon":114,"ool":98,"oom":198,"oof":13,"oog":60,"ood":123,"oob":124,"or ":39,"ooy":111,"oow":16,"oot":14,"oos":65,"oor":31,"Koo":13,"ore":44,"ori":14,"osa":11,"ort":21,"oqo":37,"oqd":11,"ora":61,"ola":52,"on ":52,"olk":99,"ole":20,"olo":14,"oly":10,"ona":28,"onf":25,"oni":16,"onk":11,"ons":12,"ont":14,"oma":298,"oo ":749,"omp":12,"la ":241,"le ":159,"laa":281,"lab":61,"lac":11,"lad":232,"laf":10,"lah":96,"lag":116,"lal":23,"lan":88,"lam":27,"las":21,"lay":70,"lba":15,"lbe":31,"kuw":22,"kuu":18,"kun":22,"kul":14,"kto":17,"MAD":13,"lom":11,"loo":176,"lmo":12,"lmi":13,"lma":10,"lsh":13,"Luu":11,"li ":92,"lga":16,"ley":29,"leh":35,"lee":98,"lo ":165,"lla":49,"lle":32,"lka":311,"lki":14,"lis":19,"lin":48,"lim":15,"liy":204,"lid":28,"lia":24,"lib":24,"lil":40,"lii":17,"lig":30,"ma ":133,"maa":361,"mac":36,"mah":24,"mad":229,"mag":226,"mar":193,"mas":14,"mal":133,"man":32,"may":23,"max":25,"mba":26,"mbe":10,"me ":19,"med":68,"mee":72,"mey":24,"luq":12,"luu":17,"مد ":15,"lya":33,"lyo":10,"Mar":22,"Mas":10,"Mag":51,"Mad":20,"Maa":17,"Max":25,"moo":35,"muq":17,"muu":16,"mul":10,"Mux":13,"mhu":20,"Muq":24,"Mud":14,"mi ":19,"min":17,"mil":14,"mis":11,"miy":27,"mig":18,"mid":170,"mij":10,"mii":25,"mo ":60,"mka":33},"n_words":[94077,109135,83288],"name":"so"}
\ No newline at end of file
+++ /dev/null
-{"freq":{"D":2951,"E":2593,"F":3211,"G":3160,"A":5867,"B":4180,"C":1638,"L":2905,"M":4987,"N":3399,"O":1229,"H":1921,"I":3066,"J":1447,"K":6663,"U":1057,"T":3497,"W":518,"V":1916,"Q":766,"P":4792,"S":7666,"R":2957,"Y":251,"X":569,"Z":898,"f":14736,"g":26110,"d":50009,"e":167179,"b":16462,"c":10479,"a":127255,"n":113931,"o":70925,"l":52723,"m":56721,"j":47732,"k":46937,"h":74009,"i":153617,"w":760,"v":25346,"u":51367,"t":143796,"s":94347,"r":123043,"q":14863,"p":40169,"z":11328,"y":8479,"x":1327,"Ë":504,"Ç":673,"Fil":198,"ë":138684,"ç":2147,"Fja":191,"Evr":427,"ο":279,"α":259," l":7602," m":19599," n":36778," o":2746," h":2298," i":13440," j":4180," k":16392," d":18386," e":22497," f":7928," g":6344," a":8088," b":5231," c":3040," z":2178," u":2660," t":29267," v":9770," q":6524," p":21985," s":22399," r":6234," J":1246," K":6396," H":1660," I":2018," N":2979," O":928," L":2729," M":4746," B":3697," C":1374," A":5029," F":3007," G":2987," D":2696," E":2074," Z":866," Y":222," X":455," S":7145," R":2508," Q":712," P":4598," W":483," V":1621," U":882," T":3065,"Gje":385," ç":814,"Gji":334," ë":7724,"Gju":218," Ç":648," Ë":309,"Gja":201,"Fra":312,"A ":555,"For":238,"Da":377,"Co":247,"Ch":193,"Du":229,"Do":262,"Dr":303,"De":624,"Di":445,"Dh":185,"Fe":366,"Fa":366,"Ev":475,"El":231,"Ga":322,"I ":544,"Fr":441,"Fo":342,"Fl":194,"Fj":196,"Fi":584,"BA":277,"C ":177,"Au":418,"Ar":625,"At":240,"Ba":1072,"Af":186,"Am":331,"An":632,"Ai":327,"Aj":203,"Al":611,"Bu":398,"Br":503,"Ca":335,"E ":302,"Bi":247,"Be":668,"Bo":499,"Ku":625,"Ky":228,"Kr":523,"Ko":2407,"Le":440,"Li":756,"La":432,"Lu":465,"Lo":267,"Me":757,"Mi":743,"Ma":1783,"Mu":398,"Mo":476,"Nj":248,"Ni":232,"Nd":207,"Ne":392,"Na":320,"No":435,"Gj":1228,"Gr":581,"Go":208,"Gu":206,"Ha":528,"He":303,"II":206,"Hi":261,"Ho":278,"In":443,"Is":481,"It":205,"Ja":385,"Je":182,"Jo":291,"Ju":322,"Ka":1334,"Kj":278,"Ki":240,"Un":312,"Tu":198,"Tr":315,"To":375,"Pë":415,"Th":341,"Ti":609,"Te":638,"Ta":357,"St":538,"Su":265,"Wi":200,"Vi":295,"Va":291,"Ve":412,"Qe":273,"Pu":185,"Pr":997,"Pe":613,"Pa":1058,"Po":615,"Pi":181,"Kë":300,"Se":537,"Si":523,"Sh":3256,"Sk":218,"Sp":262,"So":375,"Ru":267,"Rr":230,"Sa":579,"Re":810,"Ri":264,"Në":821,"Ro":386,"Ra":365,"Gre":313,"b ":407,"a ":28448,"Xh":218,"i ":33796,"ge":656,"ga":7640,"fj":347,"fl":382,"fi":3904,"fs":1052,"fr":816,"fu":1400,"ft":1033,"fo":1696,"bë":2042,"j ":4723,"hf":238,"he":19006,"ha":4203,"gl":638,"gj":7884,"gi":492,"gu":1606,"gr":2543,"cë":728,"go":1368,"du":2130,"dy":756,"g ":1263,"ea":970,"eb":315,"ec":617,"ed":3466,"de":5126,"di":8348,"dh":17111,"dj":1045,"dm":281,"do":4139,"ds":204,"dr":2528,"ew":200,"ex":233,"eu":592,"ev":3916,"ez":1764,"fa":1842,"h ":2770,"fe":1185,"eh":617,"eg":2448,"ef":474,"el":4768,"ek":4482,"ej":2627,"ei":482,"ep":1848,"eo":855,"en":13277,"em":3767,"et":19558,"es":10616,"er":15019,"eq":1060,"ca":1358,"e ":72317,"br":1628,"bu":1107,"bo":1748,"bj":242,"bl":1253,"bi":2183,"be":1978,"da":2581,"f ":429,"co":479,"ck":313,"ci":4788,"ch":654,"ce":1081,"c ":318,"az":1629,"ay":236,"ba":3411,"d ":2270,"at":11806,"as":8047,"ar":20865,"aq":1562,"av":2909,"au":845,"ak":4327,"al":8585,"ai":935,"aj":4288,"ap":2172,"am":3780,"an":16760,"ac":1088,"ad":3850,"ab":980,"ag":1063,"ah":884,"ae":194,"af":1439,"nu":1486,"nt":6015,"ns":1390,"jë":8682,"no":2581,"nn":299,"q 
":809,"ny":268,"nx":351,"oe":253,"of":894,"oc":773,"od":2400,"oa":235,"ob":841,"om":4147,"on":11462,"ok":1665,"ol":4317,"oi":918,"oj":2100,"og":2207,"oh":3303,"ot":2941,"os":4887,"ov":2272,"ou":371,"op":2168,"oo":285,"kë":4598,"or":15900,"oq":456,"r ":22499,"oz":751,"pe":3763,"pa":6781,"pl":799,"lë":3066,"po":4293,"pi":2577,"pj":1679,"lo":4348,"hë":4530,"lm":750,"ll":8126,"ls":381,"lu":2680,"lt":1030,"ly":235,"o ":4990,"ma":6825,"mb":3911,"me":13242,"iç":252,"mi":11206,"mj":687,"mp":1128,"mo":2299,"mr":960,"mt":825,"ms":189,"mu":2606,"p ":820,"na":4803,"nc":1647,"nd":14065,"ne":6011,"nf":360,"ng":8085,"ni":9080,"nj":10024,"nk":623,"jy":374,"jv":200,"jt":2182,"ju":2055,"jr":203,"js":193,"jn":989,"fë":409,"jo":2391,"jm":201,"kj":232,"ki":2183,"ke":4651,"ka":7856,"m ":4929,"ky":261,"ks":1435,"kt":3378,"ku":6445,"ko":7127,"gë":1191,"kr":3597,"kl":551,"km":406,"kn":219,"li":13297,"lk":295,"lj":608,"le":5962,"ld":260,"lg":209,"la":6555,"lb":400,"n ":21363,"hr":608,"hs":268,"hp":932,"hq":3267,"hv":498,"ht":18411,"hu":3735,"hj":1397,"hk":4243,"hi":5167,"hn":261,"ho":1827,"dë":2976,"hm":1740,"id":1902,"ic":1580,"ib":803,"ia":5265,"ih":920,"ig":1500,"if":611,"ie":1546,"hy":320,"k ":3634,"iq":788,"ir":3308,"is":14282,"it":23622,"iu":865,"iv":2165,"ij":2183,"eç":454,"ik":8974,"il":7041,"im":10832,"in":18975,"io":3678,"ip":5685,"je":13383,"ji":4180,"iz":2757,"l ":3714,"ja":7433,"xh":647,"të":37539,"z ":666,"së":6850,"y ":1757,"wa":196,"vl":237,"vj":627,"vi":5977,"vr":700,"rë":7665,"vo":819,"uz":609,"uv":188,"ve":12217,"vd":565,"va":1908,"x ":256,"ui":282,"uj":950,"uk":1866,"ul":3717,"ue":2301,"uf":993,"ug":1106,"uh":1399,"uq":312,"ur":9836,"us":3067,"ut":2692,"um":2889,"un":3916,"që":4381,"up":1314,"ty":1124,"tu":7405,"tt":246,"ub":1079,"ua":6884,"ud":1227,"uc":369,"w ":197,"pë":9674,"to":7414,"tm":513,"ts":310,"tr":5518,"te":16922,"tj":1903,"ti":17731,"th":4122,"v ":372,"tb":272,"ta":11255,"su":1269,"sv":254,"ss":508,"st":7650,"sy":189,"sl":488,"sk":843,"sn":215,"sm":700,"sp":784,"so":3394,"sq":194,"sc":195,"se":5448,"sh":35309,"sj":384,"si":11685,"u ":4144,"sa":3279,"rr":4311,"rs":2938,"rt":4337,"ru":3266,"rv":415,"ry":2015,"rq":210,"rp":488,"ro":7859,"në":24491,"rn":818,"rm":3136,"rl":428,"rk":1734,"rj":983,"ri":20960,"rh":205,"rg":2062,"rf":941,"re":16088,"rd":2048,"rc":625,"rb":1536,"ra":14927,"t ":30601,"qy":921,"qu":769,"më":7228,"qj":229,"qi":4293,"qe":2941,"qa":297,"s ":14264,"pt":2183,"pu":2168,"pr":4557,"ps":322,"zë":930,"zg":391,"zh":1019,"zi":2576,"zb":204,"ze":1031,"za":1391,"zy":222,"zu":773,"zo":1184,"vë":1622,"zj":202,"zm":366,"ye":1323,"yt":1297,"ys":927,"yr":1681,"yp":187,"yn":208,"ym":190,"yl":202,"Art":179,"Aut":217,"Bas":348,"Ai ":287,"Ame":229,"Ber":210,"Bot":188,"Ës":295,"Çm":287,"アアア":185,"ër":16534,"çë":180,"ëp":625,"ëm":1771,"ën":7250,"ël":747,"ëz":648,"ëv":1509,"ës":19082,"ët":3696,"ëh":461,"ëj":261,"ëd":282,"ë ":85317,"çm":262,"çi":247,"çe":269,"ça":442,"ç ":230,"Nob":257,"Per":268,"Pas":194,"Par":336,"Pro":243,"Pri":459,"Pre":232,"Ish":253,"Ita":196,"ア":259,"Jug":202,"Ka ":178,"Kal":215,"Kar":206,"Kjo":277,"Kon":248,"Kom":428,"Kos":1014,"Kor":248,"Ky ":207,"Lig":189,"Mal":292,"Mar":396,"Maq":235,"Mad":210,"Min":180,"Ësh":295,"çmi":226,"ëhe":440,"ëm ":968,"ël ":264,"ëll":356,"ën ":3829,"ënt":362,"ëng":554,"ënd":1225,"ëna":195,"ëmi":309,"ëmb":216,"ëdh":247,"Sta":228,"Shk":524,"Shq":1268,"Sht":446,"Ser":197,"Rep":364,"Në ":708,"Uni":285,"The":241,"Tir":480,"Për":406,"çan":218,"bje":222,"bis":202,"bim":300,"bin":241,"ble":244,"bli":768,"bol":404,"bot":888,"be 
":208,"ban":768,"baj":338,"baz":296,"bas":738,"bar":627,"bi ":668,"ber":314,"bel":434,"bet":408,"ca ":491,"cav":177,"cak":266,"ce ":197,"bri":501,"bra":276,"bre":593,"bur":304,"bul":254,"aka":205,"am ":292,"ake":288,"aki":184,"afë":206,"ajo":282,"ajt":819,"al ":1170,"aja":239,"ak ":563,"ahi":215,"aj ":1938,"agj":197,"ago":189,"ano":603,"ant":917,"ans":360,"ane":1253,"ang":643,"ani":2814,"anj":226,"ana":956,"anc":511,"and":1227,"amu":220,"amp":346,"ami":877,"ame":814,"amb":181,"ama":345,"alt":205,"alo":479,"all":1341,"ali":1967,"ale":1756,"ala":461,"an ":2721,"aks":181,"aku":485,"akt":1546,"ako":591,"aba":207,"abe":271,"aft":183,"afi":534,"ai ":361,"adm":223,"adh":1978,"adi":505,"ade":333,"aci":658,"ada":241,"azi":334,"aze":285,"azh":332,"atë":1708,"azë":204,"at ":2869,"arg":323,"are":2127,"ard":768,"ara":2483,"aro":307,"anë":3784,"arm":193,"arl":201,"ark":631,"arj":231,"ari":2009,"aru":212,"arr":782,"ars":590,"art":1488,"asa":351,"asi":536,"ash":2587,"ar ":6112,"apa":246,"alë":446,"apo":1058,"as ":2094,"aqe":844,"aqi":214,"aqj":186,"amë":284,"ava":276,"aut":366,"arë":1850,"avi":245,"ave":2116,"ata":707,"asu":211,"ast":1173,"atr":379,"ato":1090,"apë":205,"ate":982,"ati":2851,"atu":362,"aty":276,"アア":222,"jed":481,"jeo":227,"jer":1714,"jek":498,"jel":290,"jen":1698,"jes":2300,"jet":2586,"jev":418,"ji ":371,"jat":998,"jas":292,"jar":261,"jal":704,"jak":272,"jan":2402,"je ":2467,"joh":964,"jon":213,"fër":208,"jit":1409,"jis":586,"jim":228,"jin":428,"jik":319,"jih":193,"jo ":859,"ito":557,"ipë":1386,"itu":2464,"iud":180,"iso":211,"ist":3511,"iv ":257,"ita":1245,"ite":1604,"ith":1252,"iti":3696,"itj":291,"irë":662,"iut":194,"iva":248,"ivi":385,"ive":1169,"ilë":642,"ipt":1558,"ipi":226,"is ":537,"ion":2522,"ikë":1269,"ior":219,"ipa":1258,"ipe":572,"ir ":232,"inë":1756,"iro":194,"iri":583,"isi":327,"ish":5381,"ise":279,"isa":571,"iu ":213,"imë":180,"ire":227,"ira":796,"it ":11180,"ja ":2116,"itë":896,"isë":2854,"izu":355,"izo":367,"izm":294,"izi":684,"iza":492,"kim":794,"kin":237,"kip":221,"kis":422,"km ":346,"ken":688,"ket":291,"ke ":3158,"kra":414,"kre":377,"kt ":208,"kry":877,"ku ":990,"kro":261,"kru":297,"kri":1336,"kov":192,"gët":303,"kos":230,"kor":588,"kon":2239,"kom":1483,"gël":243,"kol":575,"koh":623,"kod":386,"gë ":240,"ko ":239,"kla":195,"jtë":442,"jut":199,"jtj":180,"jtu":499,"jua":282,"juh":823,"jug":449,"jta":181,"jti":303,"jnë":775,"jt ":307,"kat":718,"kar":404,"kas":189,"kan":1846,"kal":946,"kam":249,"kak":195,"ka ":2506,"jyr":194," Ga":321," Fo":338," Fr":441," Fi":583," Fl":194," Fj":187," Ha":528," He":302," Go":206," Gr":579," Gu":204," Gj":1228," Ho":278,"ha ":1068," Hi":261," Je":182," Ja":384," Is":478," It":205," In":442,"ham":267,"han":292,"hap":354," Ka":1328," Ki":237,"har":380," Kj":278,"has":365,"hat":909," Jo":291," Ju":322," La":426," Le":440," Li":748," Ko":2406," Kr":523," Ku":622," Ky":226," Ma":1780," Mi":739," Me":757,"he ":11876," Lo":267," Lu":464," Nd":206," Ne":390," Na":320," Nj":246," Ni":229," Mo":474," Mu":395,"hek":391,"hel":179,"hej":224,"het":2896,"hes":233,"her":715,"heq":210,"hen":1147,"hem":522,"hfa":178,"hi ":531," Am":329," An":631," Al":605," Ai":327," Aj":202," Af":185," Ba":1069," Au":417," At":239," Ar":624," Be":668," Bi":243,"hin":747,"him":565,"hil":181," Bo":497," Br":498," Bu":397,"his":436,"hit":1517,"hir":335," Ca":327,"hje":1227," Ch":191," Co":245,"hka":488," Da":377," Di":443,"hke":445," Dh":185," De":621,"hki":188," Dr":303,"hkr":988," Do":261,"hko":663,"hku":970," Du":229," El":231," Ev":475," Fe":366,"dë ":206," Fa":359,"cë 
":308," Xh":218,"gli":277," Wi":198,"gjë":270,"gon":186,"cës":254,"gos":180,"gor":539," a ":180,"gru":420,"gra":1045,"gri":269,"gre":682," Kë":300," Po":615," Pi":181,"gul":267," Pe":612," Pa":1054," No":435," Ra":364," Në":821," Ro":384," Re":809," Ri":264," Pr":996,"gur":313,"gus":310," Pu":185," Qe":273," Su":263," St":520," Ta":356," Th":340," Ti":608," Te":634," Tr":311," To":373," Pë":415," Rr":230," Ru":266," Sa":578," Sh":3248," Si":519," Se":536," So":373," Sp":261," Sk":217," Va":291," Ve":411," Vi":293," Tu":194," Un":311," ja":2600,"ial":691,"ian":1176," je":756," in":1156," is":1788," it":245," ka":3973,"ibr":249," ki":698," ke":222," jo":218," ju":582," ha":507," he":538," gj":3833," gr":1301,"ia ":2790," gu":290," hi":627," dë":296," ho":209," hu":257," nj":8261," ni":252," ng":5742," nd":3078,"iel":198," ne":1135,"ien":214," na":398,"ier":413," mu":1092," mo":751," of":214,"ifi":230," nu":833," no":407," le":585," li":3581," la":1142," ku":2373,"ici":344,"ich":192," ky":232," km":370,"ie ":357," kl":275,"ica":291," kr":2025," ko":4104," me":6491," mi":1023," mj":371," ma":2979," mb":1927," lu":944,"idi":290,"idh":520,"ide":545," ll":324," lo":338," af":314," ad":323," am":508," an":1204," ap":925," ai":237," aj":220," ak":811," al":253," au":397," ar":1194," at":742," as":478," ba":1617,"il ":230,"ija":200," bi":487,"ije":259," be":279," bo":1013," bu":403,"iju":269," br":668," ca":214," e ":17735,"im ":2164,"eça":298,"ika":1647,"iga":202,"igj":563,"igu":244,"icë":252,"ij ":910,"ihe":650,"ik ":1689,"imo":407," et":343," es":291," en":262," em":840," el":514,"ime":1522," ek":723,"imi":5114,"ip ":230," fe":444,"ind":2986,"ina":1004," fa":908,"imt":634," fu":1154," fs":638,"ino":317,"ijë":223," fr":355,"int":466," fo":1164," bë":626,"ins":260,"inf":213,"ine":1441," fl":312,"ing":422," fj":334," fi":2335,"inj":211,"ini":1146,"iq ":474," ga":620,"inu":257," i ":9701,"iko":537,"iki":386,"ike":2366," ci":2530,"ila":1202," da":481,"in ":7905,"ikt":220,"iku":640," do":799,"ilo":291,"ill":1867," dr":678,"ilm":536," de":1917,"ilj":319,"ili":1491," di":2033," dh":10408,"ile":229,"ima":293," ed":1307,"io ":303," du":817," dy":651,"hpr":194," vë":300," zo":209,"hpe":278," zy":211," za":245,"dës":713," zb":179,"hkë":454," zh":535,"hoq":360,"hor":539,"dër":1831," zg":360," të":19747,"hme":1293,"hul":218,"hua":438,"hty":181,"htu":748,"htr":565,"htm":182,"hti":611,"htj":309,"hte":3639,"hta":810,"hsh":200,"hro":441,"ht ":2187,"hqi":3061," ru":213," rr":1833," u ":1417," sa":775," se":1591," si":4496," sh":10305," sk":290," sp":362," so":432," qu":416," qy":665," t ":287," ra":1563," re":1537," ri":358,"htë":9010," në":16417," ro":441," pu":708," pr":3527," qe":1420," më":4730," os":1270,"hum":1047," kë":1907," or":779,"hur":1522," pe":1738," pa":4231,"hvi":461," pl":373," lë":638," po":2196," pi":398," pj":1282," së":2396," va":567," ve":3244," vd":524," vo":419," rë":271," vi":3777," vj":545," vl":199," ty":371," tu":272," us":358," që":3732," uj":303," ta":476," st":879," su":385," tr":1582," to":411," pë":7156," th":1109," ti":1065," tj":633," te":3252,"fes":283,"fer":357,"faq":704,"fam":375,"fak":216,"ezë":244,"ez ":343,"etë":2083,"ezu":183,"eza":184,"ezo":208,"eze":303,"ezi":227,"eta":1185,"ete":1425,"etj":331,"eti":2581,"eth":892,"eso":355,"est":534,"esv":178,"eto":1111,"etr":567,"etu":501,"eve":3023,"erë":1553,"evi":375,"esë":1317,"epe":182,"er ":1621,"eor":217,"ekë":226,"eqi":478,"es ":2822,"epu":459,"elë":206,"epr":500,"eri":4116,"erg":266,"ere":548,"era":1561,"erb":418,"et 
":8412,"emë":345,"esm":180,"esh":1754,"esi":1453,"ese":746,"eu ":239,"esa":635,"err":624,"ert":290,"ers":1252,"ern":432,"erm":871,"enë":512,"ero":390,"eki":248,"ekn":209,"eko":404,"egë":220,"eks":618,"ekt":1016,"eku":676,"en ":3312,"ela":426,"ele":1238,"eli":682,"ell":702,"elu":266,"emb":225,"ema":440,"eme":707,"emo":232,"emi":771,"emr":507,"ene":497,"ena":430,"end":3296,"enc":779,"eno":271,"eni":647,"enj":342,"ens":203,"ent":2477,"eog":217,"egj":763,"ego":445,"ej ":1063,"egu":355,"ek ":360,"eja":234,"el ":689,"ejt":732,"eka":189,"em ":322,"gju":704,"gjy":338,"gje":2061,"gji":3096,"gja":1196,"gim":207,"gaz":249,"gar":602,"gat":279,"gan":669,"ga ":5443,"ftë":453,"fus":270,"fut":207,"fun":698,"fra":187,"fri":223,"fsh":1041,"bër":688,"for":1268,"bët":491,"bëh":244,"fil":1245,"fik":505,"fin":217,"fis":186,"fit":614,"fiz":441,"fja":339,"da ":488," Çm":287,"de ":479,"dal":409,"dat":201,"dar":567,"dan":203," Ës":295,"ces":191,"ci ":188,"cia":487,"cil":2535,"cio":782,"ean":206,"eat":241,"ega":205,"edh":1894,"edi":471,"ede":284,"eda":183,"edo":295,"eci":308," çm":240,"dyt":203,"dy ":376,"dur":609," ës":7645,"dor":1271,"don":593,"dom":180,"dok":214,"dos":301,"dmi":238,"dod":448,"duk":778,"dua":191,"dri":384,"dra":292,"dre":608,"dry":602,"dro":361,"dha":308,"dhu":653,"dia":304,"dhj":875,"dhi":1463,"dhe":12079,"der":1468,"des":314,"det":819,"dh ":431,"deg":199,"del":210,"dek":243,"den":513,"dem":246,"di ":1814,"do ":585,"dhë":1093,"dje":856,"dim":1054,"din":518,"dio":362,"diq":424,"dis":945,"dit":1503,"dik":388,"rga":634,"ri ":4362,"rgj":700,"ret":2035,"res":1012,"rev":322,"rez":402,"rfa":431,"rbë":484,"rfs":234,"rg ":266,"rea":282,"rej":1398,"reg":1272,"reh":210,"rem":230,"ren":1060,"rek":369,"req":525,"rdo":695,"rdi":283,"rdh":548,"re ":5892,"rca":229,"rd ":234,"rap":224,"raq":331,"ras":547,"rat":1584,"rav":877,"rbi":290,"rba":177,"rbe":317,"raj":258,"rah":340,"ran":2020,"ram":546,"ral":554,"rak":530,"rab":220,"raf":580,"rad":1584,"rs ":444,"ror":1129,"ros":215,"nës":866,"nët":322,"rot":314,"rom":357,"ron":1148,"nën":960,"rop":794,"rov":207,"rod":677,"roc":243,"roj":419,"roi":191,"rol":305,"rok":233,"rof":239,"roh":382,"rog":307,"rne":248,"rmo":182,"rmu":349,"ro ":340,"në ":21951,"rmb":202,"rma":1098,"rme":349,"rmi":261,"rku":392,"rko":294,"rki":195,"rke":238,"rja":228,"rje":742,"riz":365,"rip":239,"rio":403,"rit":2857,"ris":2678,"riv":179,"riu":462,"rih":296,"rig":343,"rij":533,"ril":452,"rik":1578,"rin":1760,"rim":1881,"ria":1506,"rie":292,"rk ":231,"rtë":586,"rye":967,"rue":227,"rur":358,"rup":598,"rus":297,"rve":228,"rrë":463,"rsi":760,"rso":384,"rsa":288,"rsh":704,"rse":194,"rta":523,"rto":212,"rte":551,"rth":191,"rti":1091,"rua":870,"rtu":307,"rt ":743,"rmë":369,"rri":1024,"rrj":335,"rre":1446,"rra":326,"rru":264,"saj":699,"san":214,"sat":207,"sa ":1426,"rys":615,"sha":1190,"shm":1425,"sho":753,"shp":846,"shq":1997,"shr":422,"sht":17551,"she":1082,"shf":232,"shi":1551,"shk":3709,"si ":3797,"sje":269,"sid":277,"sia":412,"shu":1255,"sit":1207,"sir":214,"sis":1576,"sip":1484,"sin":587,"sio":712,"sim":613,"sik":262,"se ":3292,"ser":504,"set":189,"sh ":1239,"sen":315,"spo":232,"spe":251,"sot":202,"sov":1143,"son":618,"sor":814,"st ":714,"shë":1713,"sla":211,"ske":209,"sma":252,"sme":323,"stë":596,"sse":192,"ste":1041,"sta":1312,"sto":730,"sti":1254,"stu":543,"str":1367,"sua":229,"sue":178,"sur":369,"sve":238,"taj":547,"tal":816,"tav":260,"tat":899,"tas":209,"tar":4736,"tan":685,"te ":5516,"tbo":238,"ta ":2326,"pa ":427,"pe ":427,"par":1922,"pat":286,"pas":2622,"pak":267,"pan":389,"pi 
":351,"per":1941,"pet":184,"pes":328,"pla":231,"plo":258,"pje":1425,"pju":181,"pia":517,"pik":379,"pin":244,"pio":290,"pit":346,"poz":244,"lër":206,"por":810,"pop":489,"lëv":248,"lët":262,"lës":678,"pos":377,"lën":745,"pon":243,"pol":674,"lë ":644,"po ":983,"pta":1447,"pub":598,"pti":317,"pto":261,"pra":544,"pri":797,"pre":1386,"pro":1746,"pun":526,"pul":588,"qar":179,"qe ":553,"qet":282,"qev":185,"qer":357,"qen":825,"qed":256,"qi ":290,"qit":232,"qip":2962,"qis":408,"më ":5423,"mës":389,"mër":628,"mën":311,"qua":358,"quh":192,"qyt":799,"ra ":3887,"ncë":386,"ngj":549,"ngl":339,"ngu":279,"ngr":213,"ni ":1941,"nge":207,"nga":5420,"ndë":2245,"nen":1014,"ner":340,"net":653,"nes":253,"nev":297,"ng ":438,"nez":219,"nci":280,"nce":340,"nca":359,"ne ":2694,"ndu":806,"ndr":1255,"ndo":1605,"ndj":868,"ndi":3173,"nde":1242,"nda":1141,"nal":768,"nar":589,"nd ":1287,"nav":195,"nat":905,"nas":219,"na ":1164,"ntë":218,"nxh":177,"nuk":387,"num":496,"nua":315,"nto":514,"ntr":261,"nti":1877,"nta":460,"nte":1912,"nst":368,"nsi":197,"nt ":562,"nom":633,"non":199,"jës":1000,"jër":345,"nor":452,"nov":244,"një":7204,"jë ":6765,"no ":251,"ngë":383,"nji":251,"nje":980,"nja":288,"njo":1016,"nie":218,"nic":209,"nia":391,"niz":506,"niv":445,"nis":1563,"nit":1162,"nim":624,"nin":724,"nik":917,"ogr":753,"ogj":624,"oi ":722,"ohu":860,"ohe":1613,"oj ":252,"ojn":599,"oje":249,"oja":193,"ol ":185,"oci":316,"odh":931,"ode":218,"odi":416,"of ":206,"odu":227,"ofe":213,"obe":363,"otë":804,"ovë":1008,"ozi":268,"otu":316,"oti":443,"ote":245,"oto":318,"opë":263,"ost":299,"ota":201,"osh":522,"ose":1363,"oso":1152,"ovi":292,"orë":1117,"ova":484,"ove":273,"oqë":260,"opo":219,"opi":438,"ope":206,"os ":591,"opu":565,"kën":763,"okë":307,"or ":3887,"kët":1025,"kës":1506,"kër":189,"orm":1202,"onë":412,"orr":382,"orc":230,"ord":255,"ore":3570,"org":551,"ori":2679,"ort":629,"oru":235,"ot ":268,"ora":318,"ola":251,"on ":2800,"oli":1087,"oll":1071,"ole":276,"olo":874,"ohë":394,"oka":403,"ogë":224,"oku":205,"ona":1138,"ond":187,"one":812,"onj":684,"oni":2539,"ojë":384,"ono":475,"ons":340,"ont":1418,"oma":638,"kë ":735,"ome":460,"omb":682,"omi":782,"omp":538,"omo":183,"omu":542,"la ":1588,"le ":1504,"lan":809,"lam":463,"lar":790,"lat":987,"las":448,"lav":342,"lba":191,"kut":329,"kus":201,"kur":1434,"kup":473,"kun":397,"kul":958,"ky ":230,"kth":196,"kte":440,"ksi":671,"kuf":336,"kua":615,"ktr":411,"ktu":532,"kti":608,"kto":809,"llë":931,"lon":395,"hën":1076,"hëm":639,"lor":960,"hër":431,"log":828,"loj":558,"loi":197,"hës":946,"lot":202,"lmi":262,"ltu":283,"lua":941,"luf":367,"lue":177,"lsi":217,"li ":2003,"lez":179,"lev":346,"les":310,"let":699,"ler":367,"lem":300,"len":674,"lek":623,"lls":213,"llu":548,"hë ":1014,"lla":1048,"lle":383,"lli":2212,"llo":1314,"lm ":208,"lje":416,"ll ":881,"lja":178,"lit":1752,"lis":1010,"lir":324,"lio":179,"lin":3413,"lim":1223,"liz":412,"lic":188,"lid":353,"lia":455,"lib":286,"lik":872,"lig":270,"ma ":556,"maj":406,"mak":224,"mad":655,"mar":852,"mas":295,"mal":684,"man":1367,"mat":1185,"mba":779,"mbl":257,"mbi":849,"mbe":401,"mbr":294,"me ":7511,"mbu":225,"med":203,"met":1333,"mev":393,"mes":776,"mer":1044,"mel":355,"men":1333,"mbë":676,"lum":373,"mpj":180,"mpi":353,"ëtë":434,"mpo":182,"mon":405,"mor":732,"mos":301,"mri":623,"mra":263,"mua":310,"mta":664,"mur":234,"mul":232,"mun":1237,"muz":308,"ës ":6600,"ëpi":229,"ër ":5836,"mi 
":2352,"ëse":300,"ësa":205,"ërt":763,"ërs":761,"ërp":333,"ënë":345,"ëro":630,"ërr":212,"mje":546,"ërk":679,"ërm":698,"ërg":545,"ëri":1847,"min":2761,"ërf":795,"mil":488,"mim":764,"ërd":700,"mir":472,"ëra":457,"mis":535,"ërb":763,"ët ":1589,"mit":2427,"mik":541,"mij":213,"ëti":364,"ëto":188,"ëpë":198,"ëta":971,"ësh":8836,"ësi":2416,"ëso":446,"ësu":202,"ërë":456,"ëve":1277,"zua":506,"zyr":215,"zgj":372,"zi ":247,"zet":248,"ze ":310,"zan":199,"zak":278,"zat":320,"vës":631,"zon":428,"zoh":205,"vë ":592,"zmi":236,"zhv":458,"zim":386,"zik":591,"zis":248,"zit":331,"yrë":392,"yte":900,"ysh":673,"yrt":226,"yra":227,"yre":469,"za ":311,"ytë":280,"yes":421,"yer":268,"yeq":202,"të ":32538,"tëp":239,"tër":1782,"tët":229,"tës":1610,"tëv":188,"tën":440,"tëm":288,"xha":211,"xhi":228,"Çmi":287,"së ":5936,"sëm":198,"sën":372,"vro":528,"veç":250,"vil":586,"vin":191,"vic":177,"viz":481,"vit":3504,"vis":321,"vje":622,"rë ":3679,"vog":268,"rën":1295,"rët":460,"rës":1368,"rëv":444,"rëz":253,"vep":456,"ver":1289,"vet":682,"ven":1374,"vel":341,"vdi":416,"ve ":7283,"val":324,"var":488,"va ":428,"uzi":400,"urë":434,"ush":1610,"usi":271,"ust":434,"uti":207,"ute":336,"utb":206,"uto":619,"us ":390,"umë":772,"ut ":820,"ura":1239,"ure":247,"urg":207,"uri":1038,"uro":455,"unë":434,"urr":247,"urt":615,"qër":295,"qës":313,"ur ":4520,"upi":325,"upt":364,"umr":331,"umi":458,"uma":183,"umb":309,"ume":412,"që ":3460,"uni":521,"ujë":192,"und":1633,"una":206,"up ":226,"uku":221,"ukt":232,"uke":519,"ult":579,"uhë":641,"ull":1873,"uli":199,"ula":307,"un ":316,"uk ":422,"ujt":177,"ugo":283,"uft":506,"uhe":480,"uha":223,"udh":352,"udi":562,"ues":1871,"ufi":398,"ug ":248,"ua ":660,"uar":4550,"ual":287,"uan":427,"ubl":681,"uaj":694,"tyr":784,"tur":3146,"tul":217,"tua":1367,"tud":541,"tue":838,"tre":747,"tra":1541,"tri":1729,"tru":612,"tro":732,"tu ":717,"tme":342,"to ":593,"pë ":222,"toj":273,"toh":424,"pës":324,"tom":222,"ton":869,"tok":323,"tol":198,"për":8954,"tor":3712,"tij":916,"til":200,"tik":1967,"tit":3342,"tis":395,"tin":4094,"tim":1479,"tio":325,"thu":272,"tia":238,"tiv":962,"tje":1286,"tja":349,"thë":407,"tem":772,"ten":475,"tek":589,"tel":426,"th ":586,"tev":1136,"tet":4737,"tes":527,"ter":1775,"ti ":2991,"tho":197,"ths":192,"the":828,"thi":467,"tj ":257,"tha":760,"BA ":261,"zë ":417},"n_words":[1760559,2076420,1518161],"name":"sq"}
\ No newline at end of file
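For reference, each deleted profile pairs short character n-grams (one to three characters long) with raw occurrence counts under "freq", and "n_words" carries three totals, presumably one per n-gram length. Below is a minimal sketch of how such a frequency map could be assembled from text, assuming plain sliding-window counting; the function name is illustrative, and the normalisation the langdetect package itself applies is not reproduced here.

from collections import Counter

def ngram_counts(text, max_n=3):
    """Count character n-grams of length 1..max_n with a sliding window."""
    counts = Counter()      # maps n-gram -> occurrence count, like "freq"
    totals = [0] * max_n    # per-length totals, like "n_words" (assumed meaning)
    for n in range(1, max_n + 1):
        for i in range(len(text) - n + 1):
            counts[text[i:i + n]] += 1
            totals[n - 1] += 1
    return counts, totals

# e.g. counts["dhe"] after feeding enough Albanian text would correspond to
# the "dhe":12079 entry visible in the "sq" profile above.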
+++ /dev/null
-{"freq":{"D":81614,"E":57703,"F":65307,"G":58568,"A":101184,"B":88354,"C":71198,"L":68342,"M":88836,"N":55651,"O":30870,"H":85820,"I":53417,"J":41709,"K":67803,"U":32439,"T":73345,"W":29206,"V":48264,"P":64540,"S":183432,"R":54519,"Y":8819,"X":3563,"Z":5647,"f":573497,"g":796472,"d":1393951,"e":3122256,"b":426497,"c":439610,"a":2769748,"n":2605268,"o":1513455,"l":1576016,"m":1015154,"j":171593,"k":981769,"h":549646,"i":1975038,"w":38244,"v":702277,"u":641405,"t":2130744,"s":2008119,"r":2700083,"q":5367,"p":544314,"z":28076,"y":232177,"x":48501,"Å":5245,"Ö":9894,"é":11827,"å":315734,"ä":562602,"ü":3814,"ö":429328," l":92862," m":210837," n":78082," o":280066," h":123124," i":363557," j":42422," k":154387," d":271388," e":324754," f":359134," g":79790," a":307677," b":150256," c":22076," y":5622," u":95544," t":189989," v":181674," p":161981," s":494735," r":78305," J":40952," K":63008," H":83187," I":43524," N":52283," O":27251," L":64682," M":82331," B":82669," C":63320," A":84049," F":59275," G":56022," D":77148," E":54653," Z":5389," Y":8451,"и":3560,"о":3458," S":163518," R":51071," P":59385,"а":4143," W":28309," V":40400," U":30253," T":68207," å":25155," ä":214016," ö":25267," Å":5217," Ö":9837,"A ":15096,"F ":4011,"Da":13315,"Cl":3705,"Co":16319,"Ce":3786,"Ch":12336,"Ed":3462,"Do":5286,"De":40359,"Di":6082,"Fe":3321,"Fa":6453,"Eu":5433,"Er":5959,"En":12408,"El":5544,"Ge":7613,"Ga":6615,"I ":11291,"Fr":15828,"Fo":6429,"Fl":4338,"Fi":8854,"B ":3976,"C ":5688,"Au":4696,"Ar":9978,"As":4625,"D ":3210,"Ba":14125,"Ad":3948,"Am":3905,"An":15253,"Al":15279,"Bu":5535,"Br":13255,"Ca":15038,"Bi":5634,"Be":16801,"Bo":13961,"Bl":4691,"Ku":4607,"Gö":6526,"Kr":7039,"Ko":8264,"Le":10243,"Li":14030,"La":13793,"Lu":7481,"Lo":9958,"Me":11631,"Mi":11197,"Ma":32867,"Mu":4647,"Mo":11964,"Ni":7405,"Ne":10761,"Na":8237,"P ":3714,"Ny":3435,"No":15326,"Ol":4960,"Gr":10737,"Go":5035,"Gu":8716,"Ha":34281,"He":15981,"II":4483,"Hi":4225,"Ho":12929,"Hu":4943,"K ":4256,"In":13186,"Is":3759,"Ja":10130,"Je":5266,"Jo":15004,"Ju":4068,"Ka":19254,"Fö":6773,"M ":4536,"Ki":6491,"Ke":4271,"Up":4420,"Un":6136,"Ty":4282,"Tu":3610,"US":9839,"Tr":7442,"To":8743,"Th":13440,"Ti":6372,"Te":8278,"Ta":5863,"V ":6654,"Sy":4319,"St":38584,"Sv":20759,"TV":4696,"Su":6305,"Wo":3225,"Wi":9262,"Wa":7328,"We":4981,"Vi":10913,"Va":8115,"Ve":4995,"Pr":8839,"S ":6544,"Pe":11351,"Pa":14568,"Po":7593,"Pi":4124,"Or":4779,"Se":8850,"Sc":5830,"Si":8063,"Sh":4489,"Sk":9057,"Sp":7173,"So":11179,"Ru":3598,"Sa":15777,"Re":10230,"Ri":6924,"Ro":13031,"SA":10320,"Ra":8269,"b ":9647,"a ":475159,"Yo":5029,"Sö":3933,"Vä":9609,"bö":8648,"i ":355245,"fy":6327,"gd":12439,"ge":171880,"gf":3700,"ga":96208,"gb":3970,"fj":3473,"fl":21495,"ff":12688,"bå":4938,"fi":46296,"bä":6295,"fr":73976,"fu":7874,"ft":39725,"fo":44582,"j ":14814,"gy":3273,"he":67749,"ha":96551,"gn":17191,"gl":17131,"gj":3942,"gi":45303,"gh":13879,"gg":27749,"gu":21320,"gt":19887,"gs":51929,"gr":63119,"go":20646,"dt":3864,"du":15217,"dv":9506,"dy":5840,"g ":162227,"ea":25581,"eb":31218,"ec":33642,"ed":130488,"de":463243,"dd":77303,"dg":4171,"df":3528,"di":70271,"dh":3483,"dk":3943,"dj":7587,"dm":5988,"dl":16308,"do":35107,"dn":14977,"ds":66886,"dr":50264,"ew":8421,"ex":20287,"eu":8550,"ev":26002,"ey":12032,"fa":41395,"h ":210556,"fe":29350,"eh":9823,"eg":38958,"ef":28657,"ee":11526,"el":240548,"ek":41076,"ej":4317,"ei":17866,"ep":29857,"eo":14539,"en":790036,"em":80183,"et":324009,"es":162582,"er":632238,"ca":14258,"e 
":392431,"by":19363,"br":42376,"bu":29326,"bo":47200,"bl":35898,"bi":33086,"bb":11237,"be":121628,"db":6093,"da":116179,"f ":20176,"cu":4292,"ct":5938,"co":13763,"ck":84782,"ci":26831,"ch":224513,"ce":44300,"c ":6962,"az":4871,"ay":9160,"ba":47465,"d ":342945,"at":219537,"as":105952,"ar":493878,"ax":4752,"aw":3514,"av":157525,"au":25162,"ak":38031,"al":200396,"ai":14768,"aj":13462,"ap":42925,"am":154793,"an":490452,"ac":25353,"ad":157516,"aa":3218,"ab":17368,"ag":71089,"ah":8954,"ae":7341,"af":19685,"nu":26246,"nt":115132,"ns":230163,"nr":8077,"no":73752,"nn":76927,"jö":13694,"ny":12036,"nv":25830,"oe":5372,"of":28837,"oc":231970,"od":38865,"oa":7955,"ob":24414,"om":258987,"on":206933,"ok":32662,"ol":124858,"oi":5452,"kå":10126,"oj":3321,"og":35630,"kä":17405,"oh":11968,"ot":63544,"os":44373,"ov":30611,"ou":23986,"op":33718,"oo":9810,"or":224835,"r ":824959,"ow":7736,"kö":11089,"pe":96760,"pg":3698,"pa":50445,"pl":23211,"pn":3697,"po":41753,"ph":7115,"lä":54929,"pi":19978,"lå":14272,"lo":57125,"ln":12525,"lm":43916,"ll":251645,"ls":70931,"lp":6851,"lv":21024,"lu":27907,"lt":40778,"hö":19038,"ly":18029,"o ":47987,"md":3318,"ma":128458,"mb":42798,"mg":3435,"mh":5236,"me":195253,"mf":8701,"mk":4460,"ml":20883,"mi":57579,"mn":28583,"mm":63704,"mp":23791,"mo":40002,"mr":10561,"mt":14449,"ms":22455,"mu":35770,"my":8276,"p ":30112,"na":190258,"nb":8963,"nc":14129,"nd":274187,"ne":121340,"nf":12519,"ng":233628,"jä":20365,"nh":9404,"ni":141019,"nj":5464,"nk":26276,"nl":21248,"nm":4189,"ju":37966,"jo":14819,"gå":19638,"ki":37850,"kh":15721,"ke":98699,"ka":235170,"m ":249455,"fö":204508,"ky":14497,"gö":7789,"ks":31609,"kt":90094,"ku":21062,"kv":10463,"ko":98732,"kr":52042,"kl":32687,"km":8676,"kn":24158,"li":194145,"hå":8698,"lh":8848,"hä":12770,"lk":26056,"lj":22122,"le":171301,"ld":48550,"lg":7505,"lf":10995,"la":251597,"lb":23488,"n ":932062,"hr":6279,"ht":5130,"hu":27638,"hj":3586,"dä":13148,"då":7895,"hi":28029,"hn":5105,"ho":40609,"hl":4048,"id":81847,"ic":45251,"ib":12524,"ia":49113,"ig":122539,"if":22472,"ie":80770,"dö":42849,"hy":3755,"k ":153527,"ir":29485,"is":203197,"it":107910,"iu":7188,"iv":58134,"ix":3284,"ii":3814,"ik":122112,"il":185111,"im":21187,"in":352330,"io":82024,"ip":11363,"je":24984,"få":5419,"fä":10915,"iz":3965,"l ":174554,"ja":31220,"tä":25279,"xi":4781,"tå":13823,"xt":11670,"sö":12206,"z ":7095,"xa":4078,"xe":8556,"sä":22008,"wi":3916,"så":23262,"rö":18726,"y ":50286,"wa":8836,"we":6979,"vl":8639,"rä":38698,"rå":61767,"vi":89468,"vt":4439,"vu":11778,"vr":3344,"vs":16546,"vn":4095,"vo":9064,"uv":14989,"ve":165060,"vd":3502,"va":154942,"x ":11441,"ui":6645,"uk":16033,"ul":41627,"ue":11748,"ug":17372,"ur":68778,"us":77613,"ut":71499,"um":42001,"un":129135,"up":38948,"ty":33055,"tu":36176,"tt":213949,"tv":19150,"ub":16763,"ua":23575,"ud":29487,"uc":9950,"w ":8904,"to":108377,"tn":19214,"tm":5831,"tl":17707,"ts":67257,"tr":105466,"tg":11122,"tf":8472,"te":298793,"tk":5397,"tj":8416,"ti":267169,"på":66737,"th":27907,"v ":145502,"tb":14691,"ta":250228,"su":13835,"sv":52306,"ss":78482,"st":354837,"sy":19796,"sl":47405,"sk":296865,"sn":10814,"sm":23482,"sp":69363,"so":182040,"sr":7516,"sd":11605,"sc":12968,"sf":11899,"se":125434,"sh":21233,"sg":4215,"sj":15800,"si":100666,"nö":3871,"u ":12792,"sa":104819,"sb":15311,"rr":35803,"rs":138379,"rt":88500,"ru":68780,"rv":15260,"ry":24021,"rp":8643,"ro":113836,"rn":86552,"rm":33875,"rl":42446,"rk":70077,"rj":9549,"ri":267243,"nå":6703,"nä":22292,"rh":9656,"rg":57899,"rf":20073,"re":276377,"rd":70096,"rc":6855,"rb":28710,"ra":275879,"t 
":541739,"mö":6674,"qu":3576,"må":14078,"mä":17850,"s ":365755,"lö":8782,"pt":22670,"pu":12367,"pp":63825,"pr":69684,"ps":16974,"vä":52142,"zi":3697,"vå":20606,"za":3995,"yg":18262,"yf":3366,"yc":9359,"yd":14899,"ya":7829,"tö":15637,"yt":14189,"ys":34208,"yr":23125,"yp":6499,"yn":12472,"ym":10053,"yl":9018,"yk":5316,"å ":92466,"äc":6371,"Ös":5286,"ö ":8541,"én":3279,"åv":3406,"ån":81191,"åt":17902,"ås":6794,"år":43331,"åg":13520,"åe":3379,"ål":17121,"åk":8868,"åd":24022,"ät":25342,"äv":21618,"äx":8215,"äm":18322,"äl":34618,"än":82390,"äp":6744,"äs":34864,"är":272645,"äd":8120,"äg":24196,"äk":15670,"öv":17529,"öt":11908,"ör":194602,"ös":15048,"öp":9470,"ön":13721,"öl":7954,"öm":7511,"öj":4231,"ök":5353,"ög":10789,"öd":115283,"一":3570," Ga":6575," Ge":7551," I ":6565," Fo":6382," Fr":15807," Fi":8815," Fl":4322," Ha":34254," He":15951," Go":5001," Gr":10670," Gu":8675," Hu":4927," Ho":12906," Hi":4166," Je":5254," Ja":10108," Is":3740," In":13126," Fö":6755," Ka":19203," Ke":4195," Ki":6438," Jo":14976," Ju":4056," La":13680," Le":10167," Li":13966," Ko":8252," Kr":7023," Ku":4598," Gö":6518," Ma":32732," Mi":11127," Me":11596," Lo":9918," Lu":7453," Ne":10698," Na":8152," Ni":7391," Mo":11908," Mu":4615," Am":3890," An":15211," Al":15206," Ad":3934," Ba":14044," Au":4688," As":4526," Ar":9902," Be":16738," Bi":5612," Bl":4680," Bo":13888," Br":13201," Bu":5515," Ca":14547," Ce":3768," Ch":12294," Cl":3646," Co":16148," Da":13258," Di":6045," De":40274," Do":5127," Ed":3453," El":5523," Er":5935," En":12351," Eu":5425," Fe":3308," Fa":6390," Sö":3929," Wi":9211," We":4953," Wa":7296," Vä":9600," Yo":5022," Or":4762," Po":7530," Pi":4113," Pe":11321," Pa":14488," Ny":3423," No":15289," Ol":4952," Ra":8225," Ro":12950," Re":10191," Ri":6904," Pr":8806," Sy":4310," Sv":20662," TV":4398," Su":6295," St":38280," Ta":5840," Th":13403," Ti":6347," Te":8213," US":9725," Tr":7389," To":8642," Ru":3585," Sa":15732," Sh":4450," Si":8029," Sc":5776," Se":8798," So":11126," Sp":7111," Sk":9040," Va":8077," Ve":4954," Vi":10865," Tu":3571," Ty":4268," Un":6118," Up":4407," ja":13806," få":4608," fä":3840," in":71154," is":5201," ka":33444," fö":177054," gå":6683," ki":4333," ke":3269," jo":4366," ju":20755," ha":54753," he":14640," gi":5926," gr":27602," gu":3989," dö":38312," id":4924," dä":12563," då":6671," hi":6661," ho":7645," hu":13035," ne":4922," na":17184," my":5673," mu":11845," mo":16924," ok":9806," ol":7701," om":30114," kä":11926," oc":192923," of":14077," ny":4446," nu":5991," no":24830," le":9968," hä":8278," li":23241," la":18204," kv":5049," ku":9406," ky":7530," km":5693," kl":6892," kr":12841," ko":48113," me":100026," mi":15594," ma":40584," hö":10368," lo":3574," ad":5086," am":17761," an":46161," ap":9654," ak":3619," al":18398," av":131499," au":10184," ar":19850," at":35597," ba":19765," bi":15445," be":42377," bo":12025," bl":19693," by":8036," br":16393," e ":3326," et":57787," en":197679," el":30240," ef":10304," eg":6206," fe":13408," fa":16157," ex":7709," fu":4697," fr":64090," fo":25076," fl":15697," bå":3785," fi":26001," ge":16400," ga":6755," i ":270663," bö":6965," fy":5095," ce":5878," ci":5961," da":13590," do":7611," dr":6511," de":166176," di":13002," vä":22816," yt":3626," tä":6373," sö":6783," ru":4704," ry":5559," sa":28037," se":37040," sj":9848," si":31916," sm":3812," sl":17774," sk":41581," sp":27636," so":132908," mö":3254," ra":6370," re":30692," ri":11458," nå":5590," nä":11487," ro":9439," pu":3945," pr":32472," ps":3376," s ":6480," mä":6492," må":7724," 
or":14441," kö":4010," pe":10275," pa":12607," pl":6656," po":22401," lå":10106," lä":20032," rö":4188," så":14446," sä":12510," va":97104," ve":11661," rä":4287," vi":43226," ty":10671," tv":8751," tu":4474," ut":38370," ur":7073," up":19177," un":29257," ta":17457," sy":11768," st":66122," sv":36306," su":3592," tr":20781," to":9251," th":7205," på":64716," ti":85531," te":14430," Ös":5280," år":16994," åt":5457," än":4390," äl":3304," är":187240," äv":12511," ös":4997," öv":12751,"Fin":4121,"Eri":4081,"Eur":4760,"En ":5663,"Eng":3595,"Öst":5201,"Fra":6710,"Fre":3915,"Hel":4484,"Her":3363,"Han":20213,"Har":3270,"Gra":3275,"Ind":4121,"Hon":4948,"Alb":3593,"And":5658,"Car":7211,"Ber":6424,"De ":4186,"Det":11763,"Den":16516,"Dan":4412,"Cha":4718,"Cou":3952,"New":5646,"Nor":12355,"Per":3613,"Pet":3811,"Par":5960,"Pro":3506,"SA ":9788,"Joh":8659,"För":6368,"Kal":4090,"Kar":6529,"Göt":5300,"Lin":4919,"Man":3676,"Mal":3953,"Mar":11224,"Söd":3244,"Wil":4251,"Väs":5464,"Yor":3342,"Sve":19151,"Str":3346,"Sto":17434,"Sta":8398,"Ste":4189,"TV ":4653,"äga":3677,"äge":8468,"ägg":3687,"äck":6350,"änn":6181,"äns":11234,"ämn":7168,"äms":3895,"äng":10799,"änd":34964,"äpp":6344,"Sch":3849,"San":4701,"är ":211888,"älv":4078,"äll":14085,"äkt":7912,"äkn":3856,"äld":3913,"än ":11102,"äve":13254,"ävl":6172,"äxt":6157,"ärk":4071,"ärl":8853,"ärm":4235,"ära":8245,"ärd":6093,"äre":3225,"ärn":6279,"ärs":4830,"äst":24789,"ätt":18556,"åde":18255,"ågo":4233,"åna":9220,"ång":25948,"åll":6899,"ån ":40117,"åre":4622,"åt ":4155,"ård":7005,"år ":24938,"åte":4737,"ått":4014,"Upp":4062,"Tys":3259,"The":8834,"USA":9498,"bis":3915,"bil":16029,"bin":3462,"ble":7136,"bli":9670,"bla":13299,"bok":5630,"bol":12338,"bor":13470,"bbe":3314,"ban":15887,"bas":6749,"bar":10200,"beg":3230,"ber":59590,"ben":7083,"bel":10187,"bes":11143,"bet":17916,"ca ":4920,"ce ":7249,"bri":10671,"bro":7219,"bra":3701,"bre":3338,"bru":13260,"bur":4739,"bun":4581,"bum":12276,"by ":6960,"byg":8032,"aka":4483,"am ":15544,"ake":4072,"al ":27066,"ain":4963,"aj ":9347,"ags":6901,"agn":5084,"anv":11075,"anu":13080,"ano":4964,"ann":22378,"ant":22398,"ans":71523,"ane":7517,"anf":3613,"ang":11657,"ani":15707,"ank":9321,"anl":8379,"ap ":4055,"ana":13418,"anc":5238,"and":127973,"amt":9744,"amm":21193,"aml":16952,"amo":3480,"amn":18156,"amp":5088,"amh":4267,"ami":11713,"amf":3558,"ame":25638,"amb":4074,"ama":6369,"alv":3571,"alt":10698,"als":5130,"alo":3365,"alm":9277,"all":37991,"alk":5184,"ali":21293,"ald":6403,"ale":25866,"ala":19113,"alb":12779,"an ":132556,"akt":15324,"ad ":44388,"aft":3394,"aff":3221,"afi":4365,"aga":8161,"age":18047,"adm":3257,"adi":7918,"ade":78504,"ag ":17369,"ads":8986,"ack":8031,"ach":4677,"ace":4160,"ada":3703,"af ":3707,"at ":35834,"are":94208,"ard":11975,"arb":9854,"ara":27724,"aro":4452,"arn":18125,"arm":4137,"arl":13544,"ark":16487,"ari":37821,"arr":8387,"ars":21054,"art":35107,"asi":5423,"ase":5556,"ask":3632,"ar ":171511,"apa":8845,"ape":7645,"app":4146,"apr":9022,"as ":45411,"ava":3381,"avs":7927,"avi":4830,"ave":5436,"ay ":3818,"av ":125744,"ata":7543,"ast":21648,"ass":11560,"ato":8615,"ate":21603,"ati":48093,"att":63033,"ats":14038,"atu":6856,"aur":3255,"aug":8539,"jer":3396,"jen":6427,"fäl":3203,"fär":6347,"jan":14542,"je ":5239,"jor":7802,"itu":3337,"itt":20776,"ity":3491,"isk":74955,"ism":4171,"iss":12117,"ist":54624,"iv ":6421,"ita":12733,"ite":17827,"iti":20829,"ius":3596,"iva":11600,"ivi":7901,"ive":20061,"is ":19720,"ion":60198,"ir ":3453,"irk":5584,"isi":5287,"ish":4776,"ise":7902,"isa":8098,"ire":6030,"it ":10727,"ja 
":4918,"kil":7942,"kiv":4415,"kin":6276,"gån":6463,"går":10145,"kis":6751,"kho":12716,"kel":5730,"ken":19496,"kes":3520,"ker":29895,"ket":13180,"key":3901,"ke ":13529,"kra":8741,"kre":6291,"kt ":28682,"ksa":5304,"ksd":4444,"kro":5638,"kri":26131,"kot":3776,"kor":12173,"kon":20182,"kom":35824,"kol":9820,"ks ":3388,"kna":9166,"kni":12636,"klu":5282,"kla":15409,"kli":6794,"jul":9684,"jun":10738,"jur":5620,"kat":7694,"kar":20696,"kas":4078,"kap":15909,"kan":45226,"kal":21132,"kam":3425,"kad":6591,"ka ":105354,"för":137814,"föd":63483,"ha ":4594,"ham":7622,"han":27562,"hal":5140,"hav":3856,"har":31426,"had":5793,"he ":12764,"hel":8034,"het":17433,"her":10299,"hen":4180,"hem":4216,"då ":5689,"där":12388,"hin":3839,"his":5950,"gli":6286,"gla":6605,"gni":3551,"gna":6982,"gs ":13333,"gon":4582,"gor":4044,"gsk":4034,"gru":20067,"gra":18317,"gt ":15114,"gre":14703,"gst":6826,"gsm":3229,"gus":10096,"grä":4757,"ial":7807,"ian":12040,"ic ":4032,"ibl":3325,"id ":28142,"ibe":3733,"ia ":17620,"iet":8507,"iel":7300,"ien":33298,"ier":11523,"ies":3414,"ig ":23812,"ift":11636,"ick":11163,"ici":4587,"ich":7049,"ice":6107,"ie ":9770,"ica":5181,"ids":4482,"idr":4877,"idn":3610,"idi":9977,"ide":12497,"ida":10207,"il ":15253,"ika":38088,"ige":28072,"iga":25749,"igh":9261,"igi":4197,"igg":9635,"igt":12741,"ik ":16104,"ime":3935,"ind":16350,"ina":18896,"inn":24862,"ino":16003,"int":17796,"ins":22420,"inf":4654,"ine":12860,"ing":135978,"ini":12277,"inl":6319,"ink":4368,"inv":10239,"ike":30688,"ila":3896,"in ":36372,"ikt":12966,"iks":9950,"ilo":5583,"ill":91188,"ilk":6078,"ilm":10451,"ilj":12364,"ili":8683,"ild":15424,"ile":4047,"io ":6788,"ils":3516,"hol":17007,"hon":3609,"hri":3262,"hum":3706,"hus":7589,"huv":8614,"död":37434,"fes":6198,"fer":3302,"feb":8379,"fat":13524,"far":5867,"fam":7188,"fal":3610,"ext":4721,"exe":4418,"eta":15891,"ete":27600,"eti":4494,"esp":9596,"est":29943,"ess":19229,"ev ":6530,"etr":3912,"ets":17582,"ett":72343,"ety":4212,"ew ":5717,"eve":6119,"eva":3607,"evi":3537,"ey ":7588,"elä":5847,"er ":284777,"eor":5154,"es ":68641,"ept":9611,"epp":4218,"epr":4795,"erk":19481,"erl":7951,"eri":65274,"erg":20456,"erh":4470,"enä":3549,"ere":6711,"erf":4436,"era":63522,"erb":6250,"et ":167149,"esk":4221,"esi":7482,"ese":7369,"erv":5509,"err":9816,"ert":13225,"ers":49047,"ern":40094,"erm":7787,"ero":5248,"eki":3784,"eko":7026,"ekt":14604,"en ":561421,"ela":36480,"ele":19021,"eli":7809,"eln":6244,"ell":67565,"els":32415,"elt":6933,"emb":26969,"ema":5588,"eme":8121,"emm":3946,"emo":6706,"emi":7410,"emp":5979,"ene":8729,"enh":4318,"eng":9041,"enb":3805,"ena":11883,"end":16477,"eno":10317,"enn":10256,"eni":7932,"ens":78815,"ent":44100,"enr":3651,"ege":11795,"egi":9515,"egr":5273,"eis":3601,"ein":4280,"el ":33296,"em ":8949,"öte":5512,"gjo":3575,"öst":10660,"git":4157,"gis":8031,"giv":6533,"gin":4809,"gio":4718,"gic":3318,"gif":3672,"örs":34131,"öra":7478,"örb":6194,"örd":7646,"ghe":6517,"öre":25188,"örf":10182,"örj":5505,"örk":3758,"ggn":3889,"gge":11686,"gga":4221,"gi ":4509,"öpi":3748,"ör ":72432,"gen":70552,"get":15734,"ger":32986,"ges":7614,"gel":15568,"gde":4040,"ge ":21010,"ön ":5859,"öm ":3264,"gas":5062,"gar":34878,"gat":4458,"gan":12485,"ga ":26894,"ögs":4071,"ödr":4070,"bör":6225,"frå":39311,"frä":4616,"fte":17736,"fta":8409,"fun":3567,"ft ":9311,"fra":14521,"fri":9705,"for":23834,"fot":7918,"fol":8477,"fle":7024,"flo":3656,"fly":6056,"fic":5656,"fil":11844,"fik":3556,"fin":14513,"fis":3388,"öve":12758,"da ":24533,"dd ":64224,"de 
":113906,"dad":11525,"dal":5199,"dag":13716,"dat":8203,"das":4305,"dar":13228,"dan":22463,"dam":6624,"dda":3257,"dde":6028,"cks":6976,"ckh":12838,"ckn":5609,"ckl":6273,"öd ":37381,"öde":6673,"ödd":63162,"ch ":194706,"cer":9280,"cen":9841,"cem":8851,"cha":5427,"cia":6069,"ck ":16616,"cie":3837,"che":9375,"chi":3577,"cir":4840,"cke":18899,"cka":6286,"ed ":60806,"ebo":7090,"ebr":11024,"ean":3203,"eat":4659,"ea ":3260,"efo":3657,"eft":11252,"edl":4748,"edi":6332,"edd":3460,"ede":13539,"eda":22012,"edr":4279,"eck":13530,"eci":3498,"ece":9407,"dvä":3688,"dor":6126,"don":6243,"dom":6715,"ds ":21182,"dmi":3679,"dni":12167,"dst":6539,"duc":4844,"dri":7974,"dra":24350,"dre":6617,"dro":7925,"dsk":11908,"dia":4497,"der":69482,"des":48453,"det":49713,"dec":9237,"del":42235,"den":110439,"dem":7457,"dle":5138,"dla":5392,"dli":4421,"din":7458,"dio":6335,"dis":13125,"dit":3226,"die":4951,"dig":14250,"dju":3500,"näm":4493,"när":11567,"näs":3945,"rga":9150,"ri ":24498,"rgi":4148,"rge":9457,"rgs":6893,"ret":18000,"res":20396,"rev":5759,"rfa":11243,"rds":5163,"rg ":18322,"rea":6436,"red":13615,"reg":14934,"rem":4053,"ren":33831,"rek":11220,"rel":8037,"rer":8988,"rep":9910,"rda":6312,"rdn":3815,"rdi":6203,"rde":19138,"re ":105451,"rbu":4396,"rd ":16211,"rar":16692,"ras":12995,"rat":27144,"rav":4107,"rbe":10058,"rag":4677,"ran":40758,"ram":18971,"ral":14704,"rak":6953,"rab":3374,"raf":9108,"rad":32041,"rs ":30266,"rr ":4068,"rlä":3700,"ror":5868,"ros":5853,"rot":10704,"rom":7155,"ron":11508,"rop":10007,"rov":7683,"rod":8744,"roc":6434,"rol":8031,"rof":6006,"rog":7715,"rna":45582,"rne":7715,"rni":4580,"ro ":5673,"rma":11324,"rme":6990,"rli":7045,"rld":8530,"rle":4342,"rla":6287,"rn ":13803,"rks":6561,"rko":6401,"rki":4271,"rke":8992,"rka":21148,"rm ":4966,"rja":6006,"rl ":5718,"rio":4391,"rit":18193,"ris":26661,"riv":11699,"rig":24904,"någ":4585,"ril":11007,"rik":49805,"rin":29008,"rim":3215,"ria":11330,"ric":6800,"rid":7589,"rie":22621,"rif":3813,"rk ":11840,"rup":12524,"run":19578,"rum":7564,"ruk":6602,"rus":3564,"rva":5265,"rvi":3304,"ry ":6473,"rsk":18149,"rsi":7847,"rso":8438,"rsp":7925,"rsa":13567,"rse":4696,"rta":8444,"rst":26007,"rss":3568,"rte":14318,"rth":3501,"rti":15874,"rua":8438,"rts":3730,"rt ":25088,"rri":6493,"rre":7456,"rra":8312,"sak":4471,"sal":8723,"sam":44053,"san":6209,"sat":10892,"sar":8127,"sa ":10458,"rys":4892,"sho":4590,"shi":4912,"sju":4204,"sie":4171,"sid":6241,"sk ":90513,"sit":12645,"sis":10405,"sin":20425,"sio":10464,"sik":13865,"sig":10296,"sda":5839,"sde":3304,"se ":11017,"sch":5874,"ser":32108,"ses":3294,"set":6663,"sed":10233,"sep":9935,"sen":30980,"sel":3990,"spo":4913,"spr":13840,"slä":13781,"spe":39400,"spa":4154,"som":124363,"son":32242,"sor":7813,"skå":7923,"soc":6710,"st ":46043,"ss ":9668,"sli":3748,"slo":3319,"slu":6612,"sky":3403,"sla":13504,"sle":3801,"ski":13633,"skl":5724,"sko":16207,"skr":17282,"sku":3455,"skt":14119,"sfö":5604,"ska":106537,"ske":8526,"sjö":6673,"sni":4681,"sjä":3770,"sma":7643,"sme":3479,"stå":11548,"stä":10967,"syd":3945,"stö":8124,"sys":5192,"svä":4118,"sse":11502,"ssa":10371,"sso":16553,"ssl":3516,"ssi":9151,"sst":3551,"ssp":3681,"ste":53976,"sta":88293,"stn":5297,"sto":24174,"sti":35352,"stu":7051,"str":43893,"sty":3941,"sva":8262,"sve":36524,"tal":35934,"tag":10048,"tad":34944,"tav":3738,"tat":22284,"tas":6110,"tar":38387,"tan":25694,"tam":3270,"te ":29236,"tbo":6832,"ta ":57380,"pa 
":4951,"par":19834,"pas":4154,"pan":9819,"läg":9136,"lär":4866,"läp":5670,"län":20491,"läk":8254,"pen":15225,"per":22100,"pet":5946,"pel":41073,"pla":14119,"pin":6560,"lån":4645,"pis":3811,"låt":5580,"por":8039,"pop":3841,"pos":4574,"pol":17194,"pps":6631,"ppt":7044,"ppl":4401,"ppa":4036,"ppe":13206,"pp ":11986,"pub":4680,"pte":14474,"pru":4841,"psa":5456,"pri":16288,"pre":11182,"pro":27839,"prå":5570,"män":8000,"mäs":4442,"mål":5318,"mån":4648,"ra ":70040,"ngl":9266,"ngr":3616,"ngt":3834,"ngs":25507,"ni ":11042,"nge":55986,"nga":25508,"ngd":5985,"jäl":6437,"jär":7781,"nhe":3391,"neh":3788,"nel":8830,"nen":21817,"ner":30394,"net":14471,"nes":10240,"ng ":83505,"neb":3747,"ned":3616,"nce":6161,"ne ":14860,"ndr":19487,"nds":29703,"ndo":7016,"ndl":5881,"ndi":11752,"nde":95443,"nda":25018,"nal":16599,"nam":15523,"nan":10186,"nar":23024,"nad":14500,"nd ":63041,"nat":21915,"nas":10608,"na ":68634,"ny ":3572,"num":3857,"nus":4249,"nua":9868,"nty":4644,"nto":5785,"ntr":13553,"nti":10623,"nta":12228,"nte":31914,"nsp":4485,"nst":23393,"nss":4316,"nse":13545,"nsi":5055,"nsl":4609,"nsk":92387,"nsa":6209,"nri":3216,"nt ":21290,"ns ":54594,"nom":26142,"nor":19884,"nov":9987,"nne":21868,"nna":22185,"nno":3272,"nni":10650,"nns":9117,"nli":10002,"nn ":5037,"nla":5932,"no ":3529,"nke":3380,"ngå":3281,"nkt":4581,"nkr":3354,"nfö":4263,"nie":10939,"nia":3660,"niv":6000,"nis":23117,"nit":5715,"nio":3817,"nin":62497,"nik":4407,"ogr":10374,"ogi":6891,"ohn":3565,"kän":11729,"oha":5651,"kåd":7525,"ok ":3568,"ol ":3524,"och":189597,"oci":4767,"ock":32660,"ode":11255,"of ":8033,"odu":7243,"og ":7992,"oft":6611,"off":4118,"ofe":5148,"od ":6498,"obe":13311,"nvä":12582,"nvå":8371,"jör":3329,"köp":4602,"ote":5241,"ott":18376,"ots":4197,"oto":5671,"ost":8837,"ota":4879,"otb":6812,"osi":3701,"ose":3675,"oss":3221,"ovi":7790,"ove":12742,"oun":6352,"our":5919,"opp":7590,"ope":5756,"opa":4338,"os ":9800,"or ":33039,"ork":5444,"orm":13532,"orn":10771,"orr":11303,"ord":31398,"ore":8252,"org":22560,"ori":16030,"osa":3703,"ort":28485,"ors":16477,"ot ":11261,"ora":9653,"ola":11528,"on ":79327,"oli":25700,"oll":18015,"olk":10298,"ole":5201,"ols":3442,"olm":16608,"olo":12711,"oly":3210,"oka":3360,"om ":168644,"oke":3223,"okr":4246,"okt":9964,"ona":12877,"ond":7866,"one":32789,"ong":8629,"oni":7172,"ono":6812,"ons":23539,"ont":10940,"oma":9422,"ome":9673,"omb":3421,"omi":4970,"omm":30299,"omk":3355,"omp":5523,"omr":7897,"oms":4749,"op ":3482,"la ":37263,"le ":15487,"lde":11008,"lda":11000,"lds":6297,"ldr":3289,"lac":3318,"lad":21513,"lag":21635,"lan":79933,"lam":3972,"lar":35655,"lat":17536,"las":17797,"lba":3463,"ld ":9319,"lbu":12650,"kvi":3264,"kva":4810,"kus":3478,"kun":6039,"kul":5875,"kså":3945,"kta":4658,"kte":14929,"kti":15289,"kto":12389,"kyr":9176,"gör":5454,"ls ":8684,"lom":6295,"lor":5534,"lod":5577,"log":11812,"los":3900,"lot":3451,"lni":4907,"lme":6188,"lma":4184,"lms":5047,"lti":5165,"lub":5423,"lsk":12154,"lss":5166,"lst":11990,"lta":4364,"lte":4556,"lse":12155,"lsa":4733,"lt ":15274,"häl":3201,"här":4070,"li ":11521,"lev":10258,"les":8492,"let":19531,"ler":49986,"lem":6956,"len":25248,"lek":6522,"led":11323,"lls":18041,"llt":8125,"llv":5128,"lhö":4242,"lla":46043,"lle":53525,"llh":4880,"lli":11251,"lln":3548,"lkr":4019,"ln ":4441,"lke":7097,"lm ":18320,"lje":9323,"ll ":78820,"lja":3349,"lit":24473,"lis":16271,"lin":35763,"liv":5593,"lic":4558,"lia":6013,"lik":14533,"hål":7111,"lig":48482,"lie":9952,"ma ":10795,"maj":9292,"mar":31731,"mas":5427,"mal":5574,"man":40392,"mat":14610,"mbe":28393,"me 
":3701,"med":67946,"met":18659,"mes":8746,"mer":42686,"mel":15875,"men":33082,"lva":3742,"lve":7060,"lun":3206,"lut":8771,"lyg":4277,"hög":7787,"hör":6448,"mpi":3263,"mpe":6539,"mpo":3301,"ms ":6348,"mod":5091,"mon":5448,"mok":4020,"mor":4951,"mot":12947,"mt ":8137,"mst":8336,"mrå":8280,"mus":13621,"mun":18113,"mfö":3853,"min":18579,"mil":15426,"mis":6696,"mit":4808,"mli":14006,"mla":4634,"mn ":9157,"mni":3641,"mne":9027,"mmu":16790,"mma":26822,"mme":13993,"vå ":7032,"väg":7022,"vän":13490,"vär":10896,"väs":8080,"väx":7295,"vån":8847,"ytt":4640,"yta":3770,"yst":7068,"ysk":15364,"yrk":11273,"yra":3515,"yde":3835,"yck":8174,"ya ":3709,"ygg":9192,"xte":5064,"tör":12969,"täl":6340,"xem":4510,"tår":5717,"tär":4618,"täv":5108,"söd":5258,"så ":8428,"sån":9241,"sät":7762,"röm":5410,"rör":3335,"vs ":5015,"vud":9343,"rät":6983,"råk":6071,"vik":5560,"vil":12065,"rån":39221,"vin":14334,"råd":11141,"vid":23789,"vit":5788,"vis":16981,"vli":3664,"rän":8989,"räk":4195,"räm":4496,"räd":4698,"ver":61891,"vet":10570,"ven":64039,"vem":8723,"vec":5468,"ve ":6856,"val":12123,"van":15891,"var":99905,"vat":6662,"va ":9694,"uvu":9284,"usi":13675,"use":6334,"ust":23739,"utg":8418,"uti":3815,"ute":6086,"uta":8659,"utt":3405,"uts":5227,"utv":5038,"us ":21399,"ut ":10089,"ura":3453,"ure":6299,"urg":5379,"uri":5728,"urn":5209,"uro":7154,"urs":7342,"ur ":12946,"upp":32257,"umb":4012,"ume":9374,"unt":7357,"unk":5142,"uni":16744,"unn":5336,"und":51074,"ung":19464,"une":3661,"ukt":5611,"um ":17518,"ult":6645,"ull":6732,"uli":10267,"un ":12274,"ugu":9934,"ude":4661,"udi":4906,"uce":4932,"uds":4702,"udo":3784,"uar":18907,"ubl":5383,"ubb":6201,"två":7001,"typ":3540,"tyr":3920,"tys":8073,"ty ":8321,"trö":5020,"tve":6253,"trä":9790,"tur":14907,"tun":3943,"tud":5778,"tyd":4220,"ts ":26900,"tre":12611,"tt ":116227,"tra":36819,"tri":14508,"tru":8774,"tro":12216,"try":3989,"tse":5043,"tsk":5040,"tst":3437,"tta":33856,"tte":27085,"tti":9635,"ttn":5259,"tts":5893,"ttr":3221,"to ":4895,"tni":11444,"tjä":4558,"tna":3229,"tod":3739,"toc":13127,"tog":4722,"tob":9412,"tom":4310,"ton":13637,"tol":5901,"tor":39112,"til":75428,"tik":17763,"tif":5408,"tie":6576,"tig":8660,"tit":6965,"tis":30207,"tin":14073,"tio":41843,"thu":4634,"tia":4024,"tic":3207,"tid":19952,"tiv":14860,"tli":6668,"tla":6760,"tem":18447,"ten":56126,"tek":5853,"tel":9950,"teb":4880,"tec":5494,"th ":6931,"tex":3539,"tet":22695,"tes":9480,"ter":115723,"tgi":4603,"på ":62169,"ti ":13119,"the":7679},"n_words":[31862602,36956776,26222440],"name":"sv"}
\ No newline at end of file
+++ /dev/null
-{"freq":{"jer":348,"jen":305,"ji ":6234,"D":1805,"E":874,"F":1081,"G":1202,"A":4461,"B":2717,"C":2251,"L":1530,"M":12761,"N":2782,"O":860,"H":1677,"I":2605,"J":2641,"K":12188,"U":3120,"T":5185,"W":4730,"V":1116,"P":2090,"S":3343,"R":1632,"Y":517,"Z":395,"f":11048,"g":13829,"d":15034,"e":46694,"Feb":214,"b":19688,"c":9784,"a":289584,"n":90468,"o":57043,"l":42025,"m":53651,"j":21456,"k":76835,"h":32492,"i":164978,"w":60984,"v":3863,"u":57506,"t":40551,"s":35298,"r":27443,"p":13501,"z":18893,"y":38832,"x":501,"jar":185,"jan":137,"jaw":201,"é":167,"jim":1500,"jin":4267,"jil":163,"jij":492,"jia":221,"jib":3854,"ito":288,"itu":317,"itw":269,"isp":140,"ist":592,"ita":1061,"ite":213,"iti":334,"ivy":133,"iwa":2430,"ius":183,"ipo":224,"ipi":265,"is ":521,"ion":720,"iop":279,"ipa":165,"ipe":219,"iro":173,"iri":997,"isi":902,"ish":5756,"isa":694,"ire":164,"ira":314,"ja ":1529,"iyo":4644,"iye":227,"izo":242,"izi":413,"iza":568," l":8602,"kif":518," m":27935," n":19872," o":327,"kik":333," h":7652," i":9059,"kij":166,"kim":258," j":5212,"kil":389," k":27977," d":1010," e":802," f":914,"kia":390," g":257," a":6533," b":1252," c":2191,"kiw":279," y":17767," z":2257,"kin":442," u":4361,"kio":148," t":2402,"kip":379," w":34366," v":1482,"kis":520," p":2154,"kit":315," s":6097," r":837,"ki ":2193," J":2627," K":12017," H":1638," I":2128," N":2678," O":803," L":1487," M":12665," B":2646," C":2112," A":4277," F":1046," G":1172," D":1740," E":782," Z":375," Y":513,"и":142," S":3229," R":1588,"а":137," P":2015," W":4707," V":1031," U":3052," T":5117,"kea":156,"kem":150,"ke ":1988,"ku ":187,"kri":520,"kon":141,"koa":3734,"ko ":1214,"ل":165,"ا":240,"juu":155,"jul":257,"jum":177,"kaz":5045,"kaw":137,"kat":14149,"kar":374,"kas":316,"kan":2795,"kao":197,"kal":354,"kam":1048,"kad":160,"kab":375,"ka ":19783," Ga":196,"Da":365," Ge":229,"Co":364," Fr":177,"Ch":770," Ha":622," He":218," Go":142,"Do":469," Gr":177," Gu":142,"De":497,"Di":169,"Fe":311," Id":148,"Fa":160," Hu":173," Ho":177," II":154,"ha ":2668," Hi":392,"Ge":229," Ji":535,"Ga":198," Je":286,"I ":397," Ja":792,"Fr":177," Ir":284," Is":141," It":181," In":316," Ik":143," Il":224,"ham":522,"han":444,"hap":154," Ka":2225,"hai":238,"haj":163,"hak":611,"hal":314," Ke":708," Ki":3568,"har":1714,"has":255,"hat":148," Jo":255,"II ":207," Ju":691,"hag":267,"hab":181,"had":740," La":231," Le":207," Li":441," Ko":414," Ku":695," Kw":4009,"Au":181," Ma":4258," Mb":461,"Ar":475,"As":222," Mk":3388,"Ba":771," Mi":685," Mj":478," Me":615,"Af":445,"he ":544,"Ag":372," Lo":213,"Am":241,"An":463,"Ap":290," Lu":315,"Al":840," Ne":518,"Bu":429,"Br":278," Na":464,"Ca":592," Ni":435,"Bi":308," Mt":420,"Be":362," Mp":146," Mo":643,"Bo":282," Mu":471," Mw":545,"Ku":695,"Kw":4009,"Ko":415,"hez":299,"Le":210,"Li":441,"hes":336,"her":275,"hen":226,"hem":395,"La":231,"Lu":315,"Lo":213,"Me":621,"hi ":3880,"Mi":690,"Mj":478,"Mk":3388,"Ma":4263,"Mb":461,"Mw":546,"Mu":475,"Mt":420,"Mp":146,"Mo":643,"Ni":437,"Ne":518,"Na":466," Ap":290," Am":240," An":463," Al":833,"Ny":247," Ag":372," Af":443,"No":466," Ba":766,"Ok":277," Au":181," As":222," Ar":474," Be":362," Bi":308,"hio":2603,"Gr":177,"Go":143,"hin":1991,"him":244,"hil":432,"Gu":142," Bo":282,"hii":230," Br":278," Bu":429,"his":266,"hir":394,"Ha":622," Ca":582,"hiy":239,"He":219,"II":286,"Hi":393," Ch":768,"Ho":179,"Hu":173," Co":362,"K ":152,"Id":148," Da":365," Di":167,"In":317," De":495,"Ik":143,"Il":226,"Is":141,"It":181," Do":469,"Ir":284,"Ja":792,"Ji":536,"Je":286,"Jo":255,"Ju":691,"Ka":2234,"Has":225,"ho ":334," 
Fe":311,"Ki":3577," Fa":159,"Ke":708,"Us":172,"Ut":325,"Ur":181,"go ":920,"Un":355,"Uk":150,"Ul":189,"Ui":244,"Uj":249,"Uh":170,"Uf":251,"Uc":175,"Tu":237,"To":205,"Th":275,"Te":258," Wi":3377,"Ta":3841," We":188," Wa":1003,"St":260,"Su":178,"Wi":3380,"Wa":1003,"We":189," Zi":141," Za":152,"Vi":670," Yo":250,"Pr":150,"Pe":270,"goz":233,"Pa":858,"Po":195,"Pi":163,"gom":190,"gon":205,"gos":279,"gor":306,"Se":532,"gu ":424,"Si":424,"Sh":518,"So":239,"Ru":370,"Sa":668,"Re":188,"Ri":138,"Ro":385,"Ra":354," Po":195,"guj":253," Pi":163," Pe":270," Pa":857," Ny":247," No":466," Ok":277," Ra":354,"b ":211," Ro":385,"gwe":166," Re":188," Ri":138,"gwa":280,"guz":429," Pr":150,"a ":143240," Su":178," St":248," Ta":3838," Th":274,"Yo":250," Te":257," To":205," Ru":370," Sa":668," Sh":517," Si":421," Se":528," So":239," Vi":666," Tu":231,"Za":152,"Zi":141," Uc":175," Uf":251," Uh":170," Ui":243," Uj":249," Uk":150," Ul":189," Un":355," Ur":181," Us":172," Ut":325," ja":134,"iak":142,"i ":52347,"ian":874," ji":4522,"ias":364,"ge":1928,"iar":235," je":226,"ga":2900," im":145," in":3363," ik":274," il":4878,"fi":1075,"fr":504,"fu":1927,"fo":752,"ibl":142,"ibi":603," ka":16147,"gw":483," ki":3027,"he":2541,"ibu":4111,"ha":8898,"gl":145,"gi":1836,"gh":1233,"gu":1858,"iba":566," ju":300,"go":2336,"du":838,"dw":136,"g ":607," ha":1606,"ea":1091,"eb":539," he":144,"ec":251,"ed":686,"de":1841,"di":4816,"dh":617,"do":1639,"ia ":9119,"dr":203,"ew":912,"ex":163,"eu":261,"ev":332,"ey":739,"ez":1828,"fa":6104,"h ":704," id":219,"fe":174,"eh":737," hi":990,"eg":644,"ef":303,"ee":307,"el":2120,"ek":2577,"ej":155," ho":139,"ei":650,"ep":643,"eo":1165,"en":9965,"em":2423,"et":1296," hu":4749,"es":2258,"er":4147," nj":147,"ca":364," ni":9330,"e ":10467," ng":147," nd":690,"bw":843," nc":2455," na":6269,"br":408,"bu":5373,"bo":2905," mw":6857,"bl":321," mu":4335," mt":648," ms":331,"bi":2134," mp":280," mo":680," mn":1501,"be":1280," mm":157,"ifu":393,"da":3239,"f ":246,"ifo":606," of":164,"co":390," ny":523,"ck":301,"ci":283,"ch":7388,"ce":365,"ifa":585," le":184,"c ":192," li":859," la":7153," ku":5668,"ich":830," kw":2736," km":140,"ica":140," ko":150," me":184," mf":368,"az":6015,"ay":5308," mi":1257,"ba":6016," mj":5191," mk":1617,"d ":1205,"at":22079,"as":4908,"ar":9773," ma":3449," mb":469,"aw":1490," mc":155,"av":414,"au":1667," lu":341,"ak":14678,"al":8458,"idi":551,"ai":5267,"aj":1998,"ao":6210,"ap":5739,"ide":157,"am":9111,"an":29556,"ac":1224,"ad":3126,"ida":813,"aa":1773,"ab":2568,"ag":1596,"ah":1414,"ae":682,"af":1092,"nu":591,"nt":1270,"ns":4895,"no":1318,"nn":478," am":1335," an":488,"nz":5093," ai":153,"iin":242,"ny":7307," aj":134," ak":183," al":2589,"of":4380," au":941,"oc":308,"od":678,"oa":4118,"ob":631," at":195," as":220,"om":1846,"on":3853,"ok":2328," ba":679,"ol":1930,"oi":1488,"oj":1425,"og":855,"oh":360,"ija":140,"ot":1280," bi":222,"os":1066,"ov":580,"ou":534,"ije":137,"op":845,"oo":318,"or":2938,"iji":1232,"r ":1622,"ow":244,"oz":397,"oy":154,"pe":836,"pa":6921,"po":1264,"ph":151,"pi":2193,"ika":13864,"lo":1408,"lm":337,"Ida":135,"ll":791,"ls":182,"iga":224,"ii ":525,"lu":868,"lt":178,"igh":170,"igi":384,"ly":147,"o ":24303,"mc":173,"igo":169,"ma":8274,"mb":6660,"mh":261,"me":2630,"mf":564,"mk":1733,"ml":210,"mi":3477,"mj":5199,"mn":1546,"mm":321,"mp":578,"ihe":138,"mo":6079,"mr":140,"mt":753,"ms":447,"mu":6394,"mw":6988,"ihi":187,"p ":352,"na":23279,"nc":2788,"nd":5575,"ne":2353,"ng":6858,"ni":24361,"nj":567,"nk":135,"imo":196," es":141," en":369,"ju":713,"imf":161,"ime":354," 
el":223,"jo":133,"imi":180,"ki":6922,"kh":154,"ind":834,"ke":2748,"ina":8001," fa":353,"ka":45110,"imu":392,"m ":727," fu":177,"kw":3124,"ino":181,"ks":210,"kt":463,"ku":10532,"ins":133,"ko":5804,"ine":479,"ing":1959,"kr":669," fi":274,"ini":4598,"km":156,"li":17984,"le":2997,"ld":221,"lf":159,"la":14880,"lb":250,"iny":275,"n ":3144,"iko":612,"hw":492,"ht":198,"hu":6825,"iki":2488,"hi":11111," ch":2090,"hn":150,"ho":1180,"ila":4379,"id":1813,"ic":1403,"ib":5595,"ia":11251,"ih":490,"in ":378,"ig":1252," da":146,"if":1790,"ie":672,"iku":2496,"k ":628,"ilo":373,"ir":1982,"is":9376,"it":2904,"ill":288,"iu":466,"iv":385,"iw":2556,"ii":989,"ij":1580,"ik":19966," de":224,"ili":8251,"il":13887,"im":4832,"in":17333,"io":4395,"ile":321,"ip":1169,"ima":914,"je":934,"imb":2471,"io ":2960,"ji":17145,"iz":1362,"iy":4997," du":302,"l ":1018,"ja":2368,"z ":191,"wi":1773,"wo":202,"vy":671," za":1702,"y ":1239,"wa":56175," zi":456,"we":2203,"vi":1632,"vu":418,"vo":138,"uz":1451,"uw":2877,"uv":252,"uu":3068," ye":258,"ve":578," ya":17428,"va":328,"x ":213,"ui":563,"uj":4429,"uk":1643,"ul":2575,"ue":357,"uf":741,"ug":901,"uh":626,"ur":1919,"us":3274,"ut":2784,"um":5397,"un":5099,"uo":368,"up":1077,"ty":166,"tu":2287,"tt":391,"tw":473,"ub":1112,"ua":2111,"ud":534,"uc":476,"w ":435,"to":4407,"huk":345,"hul":146,"tl":220,"ts":343,"tr":455,"te":2280,"ti":12092,"th":999,"ta":14867,"su":644,"ss":500,"st":1842,"sw":308,"sl":142,"sk":865,"sm":139,"sp":289,"so":683,"sc":179,"se":5649,"sh":8151,"si":4764,"u ":13704,"sa":7736,"rr":220,"rs":467,"rt":620,"ru":2279,"ry":287,"ro":1786,"rn":619,"rm":257,"rl":223,"rk":320,"ri":8157,"rg":403,"re":3855,"rd":556,"rc":143,"rb":136,"ra":5018,"t ":1231,"s ":3025,"pt":348,"pu":357,"pw":193,"pr":381," sa":589," se":4480," si":369," sh":318," ra":432," ri":188,"hwa":473,"huo":175,"hum":2789,"hun":282,"hus":506,"hur":418,"huu":1333," pe":176," pa":632," pi":931," wa":33135," we":275," vy":396," wi":862," vi":1013," uc":144,"zi":8597,"ze":368,"za":8043," tu":189,"zw":257," us":165," ut":249," up":502," um":247,"zu":272," un":1571," uk":210,"zo":952," ul":573," uh":139," ta":1410,"ye":2395,"ya":24129,"yu":306," to":170," th":289,"yo":5888," te":201,"yi":4283,"Apr":266,"Asi":146,"Aru":195,"far":316,"fam":283,"fan":4203,"fal":292,"fa ":488,"eya":259,"Bah":237,"Bar":140,"eza":1136,"ezo":172,"ezi":237,"eta":229,"ete":154,"eti":253,"est":247,"ett":212,"ew ":355,"evi":165,"ewe":148,"ey ":361,"ewa":358,"er ":615,"epa":149,"es ":640,"ept":299,"eri":650,"ere":660,"era":456,"Afr":406,"esh":359,"ese":306,"esa":279,"eru":498,"Ago":254,"ert":152,"ers":339,"eku":184,"en ":297,"ela":204,"ele":786,"eli":360,"ell":177,"eo ":852,"emb":1055,"ema":157,"eme":314,"emi":276,"emu":365,"ene":704,"eng":671,"ena":283,"end":498,"eno":221,"eni":486,"ens":4087,"ent":441,"eny":1803,"Ali":478,"ege":351,"Ame":158,"ehe":647,"Ana":176,"el ":260,"eke":267,"eka":1754,"giz":193,"gir":232,"gin":349,"gid":165,"ght":136,"gha":925,"gi ":572,"gen":204,"ger":781,"ge ":611,"gaz":140,"gar":155,"gan":693,"ga ":1334,"Cal":307,"fup":194,"Bib":137,"fua":317,"fum":143,"fun":167,"fri":445,"fu ":810,"for":356,"fo ":342,"fil":269,"fik":168,"fiz":146,"da ":1525,"de ":752,"dad":386,"dae":220,"dar":151,"dan":305,"dam":173,"Des":272,"Dar":167,"Chi":216,"Chu":136,"Cha":300,"ch ":165,"cha":2430,"chu":596,"ck ":143,"che":571,"chi":3152,"cho":370,"ed ":154,"ebr":313,"ea ":663,"ei ":346,"efu":197,"edi":297,"ee ":156,"don":150,"dom":308,"dol":151,"dog":335,"dun":335,"dha":302,"dia":330,"dhi":240,"der":146,"deg":261,"del":152,"di ":2661,"do 
":429,"Dod":240,"diy":201,"din":291,"dis":387,"dik":302,"ri ":2373,"rez":420,"rea":148,"ref":154,"reh":266,"ren":163,"rek":1672,"re ":305,"rd ":213,"ras":256,"rat":173,"Ni ":218,"New":381,"rai":160,"ran":867,"ram":226,"rab":297,"rad":150,"ron":135,"rog":253,"rne":169,"rni":283,"ro ":593,"riw":166,"ris":508,"ril":300,"rik":1688,"rin":373,"ria":769,"rib":1011,"ric":160,"rk ":191,"ruf":262,"rum":452,"ruk":315,"rus":423,"ry ":194,"rse":228,"Nya":144,"rua":234,"rt ":160,"ru ":273,"sab":458,"sac":139,"san":482,"sas":180,"sa ":5643,"Nov":242,"sha":1745,"sho":271,"she":240,"shi":5099,"si ":1365,"siw":355,"sia":608,"shw":458,"shu":187,"sis":157,"sin":881,"sil":283,"sim":158,"sik":319,"sey":212,"ser":175,"set":147,"Okt":259,"seh":319,"sen":4083,"sem":335,"spa":151,"son":242,"su ":198,"st ":167,"sko":136,"ska":599,"so ":134,"ssa":198,"ste":192,"sta":295,"sto":444,"sti":401,"str":197,"swa":181,"tai":280,"taj":233,"tak":462,"tal":339,"taa":220,"tab":242,"taw":344,"tat":292,"tar":668,"tao":3872,"tan":641,"tam":288,"te ":507,"ta ":6480,"pa ":765,"pat":4120,"pak":235,"pap":248,"pam":300,"pan":895,"pi ":233,"ped":156,"Pap":368,"pia":789,"pil":189,"pin":267,"pis":162,"pit":144,"po ":743,"pte":287,"pri":298,"pwa":189,"Rai":176,"ra ":1932,"ngo":958,"ngi":1065,"ngu":1084,"ngw":363,"ni ":18823,"Iri":209,"nge":937,"nga":1742,"Ita":147,"neo":505,"nes":161,"ng ":405,"nch":2504,"ne ":911,"ndu":263,"ndo":574,"ndi":1835,"nde":1085,"nda":1162,"nak":251,"nal":257,"nam":1855,"nan":221,"nao":1457,"nap":185,"nac":183,"nad":288,"naf":402,"nai":158,"naj":196,"nd ":409,"nat":353,"nas":439,"nay":454,"na ":15738,"Jan":271,"Jam":281,"nya":1379,"Jer":215,"nye":1338,"nyi":4239,"nus":133,"nua":282,"Jim":174,"Jin":277,"nti":403,"nta":151,"nte":177,"nsi":211,"nsa":4269,"nt ":232,"ns ":140,"nne":236,"no ":948,"nji":138,"nja":269,"Joh":134,"nia":4199,"nis":530,"ogo":593,"ois":1291,"oji":173,"oja":1149,"Jul":285,"Jun":259,"odo":288,"of ":150,"ofu":134,"ofa":3991,"oa ":3810,"oan":188,"oba":375,"nza":3817,"nzi":1111,"Kai":144,"Kag":175,"Kal":167,"Kan":354,"Kat":474,"Kas":372,"Kar":232,"Ken":632,"ozi":165,"Kis":329,"Kir":165,"Kit":204,"Kin":148,"Kib":138,"Kia":309,"ote":378,"Kik":287,"Kil":453,"Kim":202,"oto":331,"Kig":295,"Kii":249,"ost":309,"ota":195,"ove":320,"opo":325,"os ":178,"or ":161,"Kon":197,"orn":300,"oro":673,"ore":188,"ori":369,"ort":147,"ora":378,"ola":427,"on ":838,"oli":431,"ole":357,"olo":331,"oka":1580,"oke":163,"oko":236,"oku":141,"ona":230,"ond":383,"one":151,"ong":860,"oni":784,"oma":766,"omb":303,"omi":249,"omo":182,"op ":143,"la ":8089,"le ":1011,"Kwa":3975,"laa":157,"lai":293,"lak":564,"lan":660,"lam":497,"lat":186,"lay":3727,"Kus":393,"lba":165,"kuz":236,"kuw":2713,"kuu":1305,"kut":1795,"kus":492,"kur":190,"kup":186,"kun":409,"kum":210,"kul":297,"kuj":187,"kwe":591,"kwa":2512,"kub":762,"kuf":233,"kuh":134,"kua":620,"kto":308,"lom":136,"loj":136,"lme":241,"Lin":225,"lug":350,"lu ":155,"li ":2787,"lez":192,"lew":193,"lev":140,"les":155,"leo":178,"lem":198,"len":254,"lek":133,"lo ":347,"lla":138,"lle":153,"lli":198,"ll ":147,"lit":241,"lis":337,"lip":257,"lio":738,"lin":627,"lim":922,"liz":411,"liy":4415,"liw":979,"lic":340,"lia":1497,"lik":2742,"lil":529,"lih":179,"lif":397,"ma ":2611,"mb ":139,"maa":449,"maj":397,"mak":522,"mad":206,"mae":140,"mag":342,"mar":439,"mas":613,"mal":159,"mam":161,"man":1055,"mat":406,"mba":3047,"mbi":361,"mbe":389,"mbo":2343,"me 
":516,"mbu":267,"mch":170,"met":211,"mer":252,"men":492,"mfa":152,"mez":387,"mfu":373,"Mei":250,"Man":216,"Mar":1940,"Mas":472,"Mag":282,"Mak":206,"Mac":287,"Mbe":273,"mpi":142,"mon":163,"moj":1127,"mpa":160,"Mor":279,"mu ":1602,"mtu":175,"mto":226,"Mic":182,"Mis":147,"msh":144,"mta":228,"mwe":383,"mwi":345,"Mko":3178,"mwa":6205,"Mku":138,"Mji":464,"muj":3839,"muz":374,"mhu":232,"Mtw":147,"mi ":359,"mji":5175,"min":192,"mil":749,"Mwa":460,"mit":295,"mia":630,"mik":321,"mo ":4413,"mku":1038,"mko":539,"mna":1501,"mmo":145,"Wik":149,"Wil":3077,"Wan":148,"zwa":252,"zi ":5785,"zai":249,"zaj":254,"zam":177,"zan":3194,"zal":783,"zar":173,"zo ":612,"zia":533,"zin":815,"zil":197,"zik":548,"zis":240,"一":303,"yof":3874,"yot":286,"za ":2981,"ye ":1320,"yen":237,"ya ":21762,"yar":252,"yan":567,"yao":167,"yam":250,"yak":657,"yo ":973,"yin":213,"yik":3954,"一一":144,"Tan":3407,"Tab":164,"Shi":315,"Sin":201,"Sep":283,"we ":401,"wez":265,"wen":1037,"wim":286,"wil":741,"Sal":197,"vyo":257,"wa ":33121,"wap":4111,"wan":3901,"wal":617,"wam":169,"wak":9923,"way":141,"wat":368,"war":238,"was":172,"wai":2667,"wah":176,"vu ":165,"vya":351,"vil":200,"vin":183,"vit":187,"vis":284,"Rom":180,"vem":244,"Vij":328,"uzi":743,"uza":470,"Uje":235,"uwa":2760,"uvu":174,"ush":417,"usi":1319,"use":183,"usa":176,"uu ":2892,"usu":216,"ust":207,"uso":141,"uti":211,"ute":137,"uta":560,"Uin":218,"utu":215,"uto":1436,"us ":536,"Ung":252,"ura":183,"ure":140,"uri":491,"uru":630,"unz":137,"Ula":150,"upa":554,"upi":311,"umu":162,"umi":484,"umo":2705,"uma":686,"umb":661,"ume":297,"uo ":238,"uni":940,"und":747,"una":1741,"ung":1193,"uku":302,"uko":457,"uki":429,"uka":247,"ulu":258,"uli":1405,"ule":192,"ula":478,"ukw":139,"uhu":267,"uji":4010,"uja":302,"Utu":261,"ugh":514,"ufu":352,"uhi":136,"ugu":137,"udi":174,"ubw":695,"uch":343,"ufa":176,"ufi":189,"ua ":369,"uat":317,"uar":494,"uan":690,"uba":185,"Uch":175,"ty ":146,"twa":450,"tur":369,"tun":270,"tum":424,"Ufa":219,"ts ":214,"tu ":896,"The":164,"tts":142,"to ":986,"tob":268,"tom":167,"ton":281,"tok":1553,"tol":482,"tor":246,"tik":8147,"tis":158,"tin":351,"tio":199,"thu":171,"tia":156,"tem":384,"ten":273,"tel":171,"th ":160,"ter":432,"ti ":2389,"the":225,"thi":213,"biw":209,"bis":191,"bil":315,"bin":256,"bo ":2326,"bli":173,"bor":262,"be ":229,"bam":230,"ban":516,"bal":619,"bah":147,"baa":227,"bab":179,"bay":333,"bar":432,"bao":277,"bi ":662,"ber":216,"bel":151,"bey":251,"bia":222,"ce ":176,"bu ":4649,"bru":221,"bur":149,"bun":177,"bwa":786,"aka":10583,"am ":337,"ake":1982,"aki":644,"aji":1355,"aju":170,"al ":304,"aja":293,"ain":393,"air":222,"ais":2933,"aif":267,"aid":437,"ahi":308,"aha":751,"agh":475,"agu":395,"aoi":1233,"anu":344,"anz":4756,"any":4453,"ano":638,"ann":141,"ant":323,"ans":490,"ane":261,"ang":1660,"ani":7747,"anj":260,"ana":4702,"anc":133,"and":2300,"amu":1047,"amo":1890,"amp":179,"amh":222,"ami":838,"ame":637,"amb":1658,"ama":1868,"ao ":4649,"alo":269,"alm":262,"all":133,"ali":5324,"ale":476,"ala":1026,"alb":152,"an ":1167,"akr":376,"aku":502,"ako":215,"aba":751,"abe":140,"abi":660,"abo":208,"abu":582,"ae ":291,"aad":302,"aan":389,"aal":140,"aam":185,"aar":236,"aa ":361,"afi":303,"ai ":477,"aga":223,"age":227,"afu":225,"aen":162,"ael":172,"afa":411,"ado":269,"adh":288,"adi":1538,"ach":840,"ada":637,"azo":205,"azi":5401,"aza":186,"ayo":638,"aya":4140,"aye":284,"ba ":2178,"are":1998,"ard":317,"ara":2057,"aro":249,"ari":3153,"aru":316,"art":243,"au ":993,"asa":1084,"asi":1169,"ash":895,"ask":665,"ar ":568,"apa":4869,"api":162,"apo":406,"as 
":271,"aut":148,"awa":1126,"awi":190,"ata":10070,"ast":167,"ass":197,"ato":634,"ate":225,"ati":9962,"ath":135,"atu":749},"n_words":[1316698,1560317,1165243],"name":"sw"}
\ No newline at end of file
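Each file removed here is a single-line JSON object with the three fields visible above: "freq", "n_words", and "name" (the ISO 639-1 language code, e.g. "sv" or "sw"). A minimal sketch for inspecting one of these files, assuming only the standard library; the path is hypothetical and this is not langdetect's own loading API.

import json

def load_profile(path):
    """Read one profile file; return its language code, freq map, and totals."""
    with open(path, encoding="utf-8") as fh:
        profile = json.load(fh)
    return profile["name"], profile["freq"], profile["n_words"]

# name, freq, n_words = load_profile("profiles/sw")  # hypothetical path
# print(name, len(freq), n_words)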
+++ /dev/null
-{"freq":{"D":612,"E":464,"F":392,"G":381,"A":1033,"B":541,"C":1226,"L":485,"M":882,"N":489,"O":332,"H":425,"I":870,"K":359,"T":782,"W":287,"V":311,"P":878,"S":1046,"R":508,"f":1073,"g":1998,"d":2759,"e":8290,"b":1281,"c":2926,"a":10092,"n":6488,"o":5986,"l":4436,"m":3185,"k":952,"h":3181,"i":7588,"w":661,"v":857,"u":3410,"t":6374,"s":4316,"r":6588,"p":2229,"y":1574,"x":422,"ித ":544,"ில ":698,"ிய ":7880," m":283," o":562," d":292," a":454," c":438," t":733," p":559," s":429," r":341," K":344," H":326," I":509," N":361," L":347," M":788," B":464," C":741," A":832," F":341," G":361," D":414," E":381," S":843," R":426," P":727," T":657,"ாங்":593,"ாசி":571,"ாகி":1226,"ாகு":5727,"ாகா":403,"ாக்":2294,"ாகக":726,"ாகப":554,"ாகம":1267,"ாகவ":971,"ாகங":337,"ாகத":594,"ானி":1010,"ாந்":804,"ாநி":2007,"ானத":779,"ாப்":821,"ான்":1998,"ாடு":3023,"ாட்":5816,"ாடல":810,"ாடி":363,"ாடக":761,"ாத்":727,"ாண்":1485,"ாதம":306,"ாது":396,"ாதா":507,"ாதி":637,"ாணப":442,"ாணத":365,"ாஸ்":323,"ிங்":711,"ிச்":643,"ிசை":617,"ிடி":332,"ிடப":478,"ிடம":356,"ிடத":463,"ய":68757,"ம":127919,"ன":81690,"ந":46905,"ப":128120,"த":166688,"ண":25132,"ாரா":410,"ாரி":765,"ா":101548,"ி":184542,"ஸ":4761,"ஹ":1196,"ார்":5370,"ழ":21652,"வ":79830,"ஷ":1166,"ர":109358,"ற":58813,"ல":91486,"ள":62504,"ஏ":2039,"எ":16082,"ஊ":889,"உ":10751,"ஈ":845,"இ":32538,"ஆ":19361,"அ":23958,"ஃ":562,"ாறு":598,"ஞ":2219,"ட":101059,"ாலத":613,"ஜ":3516,"ச":50507,"ங":15929,"ாலம":447,"ாற்":1321,"க":197565,"ஒ":11957,"ஓ":1288,"ஐ":1009,"ாயக":290,"ாமல":357,"ாமி":434,"b ":301,"ாம்":1576,"ோ":14983,"ொ":17674,"ை":61878,"்":437238,"ௌ":348,"ூ":11561,"ாயி":304,"ு":185888,"ீ":8590,"ே":20308,"ெ":21142,"ாரண":513,"a ":854,"ாரத":304,"ாய்":1383,"ாரம":364,"ிகா":518,"ாவட":2380,"ாவத":750,"ாவர":299,"ிகை":370,"ிக்":6769,"ாழ்":1243,"ிகோ":462,"ாவி":3762,"ாவா":383,"ாலு":670,"ாலி":445,"ாளர":1347,"ால்":3272,"ாலை":675,"ாளா":324,"ாளி":348,"ிகள":3325,"ிகழ":507,"ாள்":696,"ிகப":373,"i ":647,"ினை":666,"ge":282,"ினி":620,"ga":286,"ினா":801,"ினர":628,"ினம":287,"ிந்":854,"he":687,"ha":472,"g ":377,"ea":314,"ec":300,"ed":390,"de":481,"di":495,"ிப்":4537,"h ":377,"el":404,"en":840,"em":293,"et":386,"es":700,"er":1464,"ின்":17928,"ca":348,"e ":1989,"da":327,"f ":407,"ct":322,"co":377,"ch":435,"ce":352,"ியே":373,"ியூ":340,"ிரத":942,"ியு":993,"ிரம":574,"ியோ":394,"ியை":838,"ிரப":448,"ியவ":789,"ியல":3188,"ியர":810,"ியி":4273,"ியா":6620,"ியத":1203,"ிமு":497,"d ":748,"at":1135,"as":419,"ியம":1215,"ar":1055,"ியன":1057,"ிமை":522,"ியப":743,"ியக":532,"al":1246,"ai":378,"ap":295,"am":828,"an":1637,"ac":346,"ad":343,"ிமா":411,"nt":654,"ns":302,"of":412,"om":409,"on":1256,"ol":463,"ிடை":404,"ou":276,"or":868,"ிட்":1455,"ிடு":635,"r ":1101,"pa":374,"lo":330,"ll":406,"o ":326,"ma":480,"mb":415,"me":406,"mi":333,"na":737,"nd":662,"ne":446,"ng":642,"ni":455,"ித்":3054,"m ":787,"ிணை":304,"ிண்":482,"li":572,"le":592,"ிதி":409,"la":866,"n ":1450,"ht":310,"hu":372,"hi":390,"ic":779,"ia":502,"ig":354,"is":617,"it":600,"il":395,"in":1071,"io":687,"l ":875,"ிஸ்":689,"y ":837,"ve":327,"x ":278,"ur":632,"us":340,"um":519,"un":283,"tt":320,"to":401,"tr":385,"te":756,"ti":1167,"th":985,"ta":522,"st":661,"se":294,"si":396,"rt":301,"ro":534,"ri":933,"re":684,"ra":1084,"t ":1022,"ீடு":326,"ீட்":724,"s ":1609,"ிரி":3610,"ிரா":1282,"ிர்":1294,"ிறப":627,"ிரை":1621,"ிரே":484,"ிறந":629,"ிறத":2122,"ிரு":4047,"ிறா":398,"ிறி":463,"ிறு":1338,"ிலங":466,"ிலம":1354,"ிற்":1776,"ிலத":1747,"ிலா":649,"ிலு":1997,"ிலி":1089,"ில்":23900,"ிலை":1645,"ிலே":416," த":26595,"ிழக":839," ய":1289," ம":33424," ப":40065," ந":18582,"ிழம":471,"ிழர":540," வ":24461," ல":1016," 
ர":1685,"ிளை":610," ஸ":454," ஹ":795," ஈ":841," உ":10726," ஊ":885," எ":15892," ஏ":2016," அ":23901,"ிழி":439," ஆ":19313," இ":32484," ச":22093,"ிவந":792," ஜ":2159," ட":1284," ஐ":986," ஓ":1277," ஒ":11946," க":32960,"ிவர":337,"ிழ்":3205,"ிவம":324,"ிவி":1334,"ிவா":521,"ிவு":1106,"ீதி":457,"ீர்":719,"ீன்":337,"ுச்":849,"ுங்":895,"ீவு":404,"ீழ்":360,"ுக்":10659,"ுகழ":319,"ுகி":3737,"ுகா":398,"ுகள":4206,"ுட்":1213,"ுடை":558,"ுடி":1579,"ுடு":653,"ுடன":1030,"ுநா":474,"ுனி":297,"ுபட":358,"ுந்":3187,"ுன்":993,"ுனை":280,"ுப்":4254,"ுது":799,"ுதி":3266,"ுண்":554,"ுதல":2039,"ுதப":476,"ுதன":363,"ுத்":6198,"ுரி":516,"ுரு":740,"ுரை":511,"ுறி":2453,"ுறு":308,"ுறை":2894,"ுற்":571,"ுளா":391,"ுளி":404,"ுள்":6921,"ுமை":377,"ுமு":342,"ுமா":937,"ும்":36442,"ுரம":392,"ுவை":304,"ுவி":1320,"ுவா":2204,"ுவர":1344,"ுவம":286,"ுவன":802,"ுவத":1697,"ுழு":744,"ூடி":578,"ூட்":652,"ூன்":565,"ூரா":577,"ூரி":517,"ூர்":1687,"ூறு":472,"ூலம":428,"ூற்":526,"ூல்":531,"ூலி":289,"அக்":419,"அதி":733,"அதன":471,"அணி":287,"அணு":326,"அடி":812,"அப்":365,"அன்":299,"அனை":343,"ஆகி":979,"ஆகு":4632,"ஆக்":285,"அழை":700,"அவர":558,"ென்":1743,"அல்":2972,"அளவ":598,"ெப்":657,"அரு":474,"அறி":1258,"அமெ":613,"அமை":3884,"அரச":1350,"அம்":318,"ெண்":620,"ஆங்":1422,"ெட்":778,"ெடு":671,"ஆசி":448,"ஆண்":2706,"ஆட்":480,"ஆம்":1544,"ைய ":1236,"ஆய்":339,"ஆரம":608,"ஆறு":307,"ஆற்":386,"இக்":380,"ஆவத":1054,"ஆவா":643,"ேயே":307,"ேட்":292,"இங்":400,"இசை":860,"ேண்":284,"இடத":337,"ேதி":335,"இடம":403,"இடை":433,"இத்":877,"இது":3256,"ேசி":564,"ேசு":307,"ேசப":275,"இணை":776,"இதன":761,"இந்":4776,"ெயர":1260,"ெயற":334,"ெயல":776,"இப்":496,"இன்":581,"ெர்":351,"ெறு":383,"ெய்":2021,"ெரி":1478,"ெரு":1277,"இயற":542,"இயல":277,"ெல்":969,"இம்":396,"இரண":675,"ெளி":2069,"இயக":775,"ெற்":1979,"இயங":307,"இலக":714,"இலங":1277,"இரு":3631,"இரா":2331,"இவ்":582,"இவை":339,"இவர":1304,"ைப்":5756,"இல்":712,"ையை":1039,"ையே":430,"ையத":341,"ையா":3283,"ையி":4758,"ையு":2157,"ையம":618,"ைமை":414,"ைத்":2427,"ைந்":3238,"ைநக":408,"ைச்":1529,"ேளம":371,"ேளக":469,"ேற்":1275,"ேலி":402,"ேலு":410,"ேர்":1922,"ேறு":768,"ேரூ":552,"ேரி":362,"ைக்":5944,"ேவை":525,"ைகள":3265,"்க ":1447,"ைவு":308,"ைவி":438,"ைவர":377,"ொன்":519,"்ப ":344,"ோக்":753,"ொள்":838,"ொழு":386,"ொழி":2875,"ொல்":850,"ொலை":513,"ொற்":349,"்ள ":4240,"ொரு":2529,"்ற ":3387,"்ட ":3672,"ொகு":770,"ொண்":2337,"ொது":899,"்த 
":5466,"ொடு":300,"ொடங":359,"ொடர":1238,"ோப்":332,"ோன்":1334,"ோயி":659,"ோர்":868,"ோரி":632,"ோரா":279,"ோவி":367,"ோட்":1097,"ோடு":420,"ோது":603,"்கை":2757,"்கோ":938,"்கொ":643,"்க்":1264,"்கூ":637,"்கு":14044,"்கெ":383,"்கே":554,"்கா":4646,"்கி":8271,"்கள":15575,"்கல":1350,"்கர":1200,"்கவ":322,"்கப":3910,"்கம":1458,"்கத":1249,"்கண":471,"்கட":610,"்கங":350,"்சா":888,"்சி":4832,"்சு":884,"்சை":360,"எல்":437,"எழு":1211,"எடு":356,"எண்":595,"எதி":426,"எனப":826,"என்":8881,"்தா":3028,"்து":12325,"்தி":19739,"்த்":1974,"்தோ":339,"்தொ":367,"்தை":2288,"்தத":1008,"்ணி":349,"்தம":644,"்தப":1133,"்தன":629,"்தவ":959,"்தர":950,"்தல":971,"்பை":508,"்பே":287,"்பெ":966,"்போ":717,"்பொ":819,"்பி":4248,"்பா":4739,"்பூ":376,"்பு":5441,"்ப்":1248,"்பந":489,"்னை":343,"்பன":396,"்னி":1023,"்னா":524,"்பட":14770,"்னு":1025,"்பத":6302,"்பவ":714,"்பம":709,"்பர":1317,"்நி":304,"்நா":1754,"்பக":746,"்னர":478,"்ந்":2122,"்டத":3858,"்டன":373,"்டப":375,"்டம":1689,"்டர":578,"்டங":415,"்ச்":1088,"்தக":679,"்ட்":1036,"்டோ":535,"்டை":1115,"்டு":8476,"்டி":5785,"்டா":1951,"்டவ":480,"்வை":282,"்வத":502,"்வர":423,"்வி":886,"்வா":1151,"்வு":876,"்வே":640,"்ஸ்":867,"்மா":564,"்மன":338,"்யு":285,"்மை":1050,"்மு":410,"்யப":368,"்ளத":1941,"்லு":367,"்லூ":277,"்லி":796,"்லா":1037,"்றை":756,"்ளா":277,"்ளி":589,"்ளன":998,"்லை":579,"்றா":2083,"்றி":2742,"்று":6173,"்லத":2780,"்றழ":440,"்றவ":489,"்றம":405,"்றல":286,"்றத":1488,"்றன":1215,"ஐக்":380,"ஏற்":808,"ஊர்":281,"உரி":362,"உரு":1122,"உறு":409,"உலக":903,"உயி":570,"உயர":401,"உள்":3206,"உதவ":285,"உண்":440,"என ":651,"மக":2002,"மங":313,"மத":1505,"மண":816,"மட":614,"மன":2412,"யல":4462,"யற":1277,"யர":2780,"யவ":955,"ரக":668,"ரச":2387,"ரங":785,"ரட":422,"யா":11572,"யு":3982,"யூ":642,"ரண":1624,"யி":11562,"யீ":287,"ரத":2928,"ரப":1866,"யே":1284,"ரன":414,"யை":1942,"ரம":3652,"ய்":4254,"யோ":852,"மய":595,"மம":514,"மற":2418,"மர":1631,"மல":1338,"யக":1931,"யங":868,"மா":15034,"மி":8348,"மீ":1286,"மு":9054,"மூ":1942,"யத":2022,"மெ":1524,"மே":3645,"மை":8679,"யன":3058,"யப":1368,"மொ":2460,"மோ":314,"ம்":59753,"யம":2603,"பங":437,"னவ":812,"பக":2737,"னல":278,"னர":2255,"னம":1302,"ந்":20330,"நோ":770,"னப":1121,"பன":834,"னை":2914,"னே":371,"பந":774,"பத":7767,"பண":1265,"னு":2330,"னி":5915,"னா":2676,"பட":17323,"னக":412,"நவ":322,"நல":298,"நே":639,"நெ":1117,"நு":803,"நீ":1405,"னத":1823,"நூ":1307,"நி":7179,"னட":305,"நா":7474,"னங":575,"ப்":34661,"பொ":3990,"போ":3721,"பல":2780,"பள":417,"பழ":657,"பவ":1066,"னோ":401,"பம":782,"ன்":48370,"பய":2032,"பர":3306,"பற":996,"பூ":994,"பே":2341,"பெ":6101,"பை":873,"பி":11022,"பா":9941,"பு":9453,"பீ":476,"தை":3979,"தோ":1044,"தொ":4546,"தே":2805,"தெ":1540,"த்":34813,"தழ":321,"தவ":1763,"தூ":638,"தீ":1160,"து":39998,"தா":7230,"தி":33129,"தப":1845,"ணை":1673,"தன":4207,"தந":348,"தத":1637,"தள":614,"தல":4546,"தற":1613,"தர":2410,"தய":384,"ண்":13361,"தம":6359,"நட":1795,"நக":2290,"ட்":26642,"டோ":698,"டே":309,"டை":5277,"டெ":379,"டு":23773,"டி":13483,"தட":432,"ணா":409,"ணி":3340,"ணு":927,"தக":1336,"ணவ":477,"தங":477,"ணம":1051,"ணர":362,"ணத":619,"ணப":596,"ணக":587,"ணங":275,"ாக":19861,"ாங":594,"ாச":1598,"ாஜ":510,"ாட":11489,"ாத":4020,"ாண":3622,"ான":10191,"ாந":3098,"ாப":1499,"ார":8693,"ாற":2311,"ாம":3725,"ாய":2982,"ிக":14337,"ாழ":1765,"ாவ":8717,"ால":7588,"ாள":3674,"ாஸ":333,"ிங":712,"ிச":2403,"ிஞ":334,"ித":5697,"ிண":920,"ிட":5029,"ிம":2804,"ிந":1298,"ிப":5786,"ின":22037,"ிள":1636,"ிழ":6094,"ீக":398,"ிவ":5662,"ிய":32313,"ிர":16244,"ிற":8462,"ில":35051,"ீட":1354,"ிஸ":713,"ீச":331,"ீத":858,"ீன":904,"ுக":21093,"ீழ":591,"ீவ":685,"ீர":1646,"ுட":5430,"ுங":895,"ுச":1183,"ஸ்":4460,"ஹா":281,"வந":1310,"வன":1956,"ழை":1756,"வப":343,"வர"
:10633,"வம":1198,"ழ்":5992,"வள":1206,"வழ":1620,"வற":1169,"வல":1808,"வக":1767,"வச":597,"வங":465,"ழி":4553,"வட":4450,"ழா":328,"வத":5184,"வண":714,"ழு":3446,"வெ":3428,"வே":3105,"வை":3927,"வோ":475,"வ்":1486,"ங் ":289,"வா":8141,"வீ":991,"வி":15999,"வு":6267,"ஷ்":653,"ரோ":1002,"றப":1173,"ரை":4461,"றன":1468,"ர்":31916,"றம":626,"றத":3830,"ரூ":812,"ரு":27373,"ரீ":451,"ரே":1017,"றந":763,"ரெ":389,"றங":282,"ரா":8822,"ரி":12841,"ரல":936,"ரள":279,"ரர":859,"றக":313,"ரவ":1332,"லப":708,"லய":334,"லம":3186,"ற்":17148,"லர":750,"று":11549,"லத":5640,"றை":4508,"றா":2833,"றி":8018,"றல":363,"றழ":451,"லக":2902,"றவ":909,"லங":2178,"ளர":1895,"ல்":45274,"ளம":1268,"லோ":710,"ளப":432,"ளன":1049,"லை":8052,"லே":726,"ளத":2348,"லூ":633,"லு":4434,"லா":5224,"லி":6034,"ளங":419,"ளக":843,"லவ":809,"ழர":599,"ழல":340,"ள்":22645,"ழம":740,"ளை":6098,"ளி":11162,"ளு":3887,"ளா":3042,"ழங":730,"ழக":1857,"ளவ":1419,"எழ":1223,"எல":532,"கமா":1059,"கம்":3148,"என":11165,"எட":532,"எண":597,"எத":529,"கரத":596,"கரம":753,"கரா":588,"கரு":1925,"கரி":702,"கரை":390,"கர்":1591,"ஊர":477,"கற்":356,"கலை":1083,"உய":994,"உர":1677,"உற":823,"உல":1062,"உள":3281,"கலா":280,"கழக":436,"கல்":1081,"களை":4493,"கள்":9772,"களி":6714,"களா":1687,"ஈழ":284,"களு":3253,"உட":908,"உத":494,"உண":848,"இம":448,"கழ்":778,"இந":4780,"இன":1053,"இப":504,"இவ":2571,"இய":2152,"இர":7002,"இற":478,"இல":3110,"கவி":404,"கவல":360,"இட":1530,"இண":833,"இத":5637,"இங":400,"இச":1168,"ஆய":571,"ஆம":1554,"ஆற":784,"ஆர":1083,"ஆவ":1949,"இக":392,"கவு":1099,"ஆன":368,"ஆப":334,"ஆட":604,"ஆண":2834,"ஆங":1423,"அவ":1452,"ஆச":513,"அர":2426,"அம":5128,"ஆக":6177,"அழ":1041,"அள":802,"அல":3618,"அற":1466,"அந":302,"அப":488,"அன":910,"அண":903,"அத":1978,"அட":1678,"அச":379,"அக":783,"ஃப":444,"காண":1037,"காத":305,"காட":1197,"காக":926,"கிப":294,"கிர":1793,"கிற":2778,"கிய":4454,"கிழ":1101,"கில":2584,"கிள":303,"டங":1551,"டக":2338,"ஞர":411,"ஞ்":1568,"கான":927,"டம":3786,"காப":476,"டப":1246,"டன":1702,"காம":341,"கார":1102,"கால":2029,"டத":5604,"கிக":280,"காவ":696,"டா":3183,"கிட":434,"டவ":894,"டற":287,"டல":2022,"கின":3040,"டர":1996,"குள":472,"குர":499,"குற":3053,"ஜன":765,"கும":15556,"குழ":660,"குவ":564,"கூட":1198,"ஜெ":307,"ஜி":368,"ஜா":311,"குக":975,"கீழ":409,"ஜூ":479,"குட":1461,"குப":912,"குத":3019,"சட":699,"சந":447,"சன":838,"சத":750,"ங்":15912,"சம":2542,"சர":1035,"சப":743,"சல":387,"சீ":884,"சு":4739,"சா":3736,"சி":12172,"சை":2278,"செ":5006,"சே":2116,"கூற":724,"சூ":935,"ச்":6667,"சோ":703,"சொ":1085,"கெட":300,"சக":830,"சங":473,"க்":40970,"கைப":322,"கோ":3746,"கொ":4347,"கே":1313,"கை":5279,"கைய":1982,"கெ":595,"கைக":801,"கூ":2267,"கு":33794,"கீ":677,"கி":19030,"கா":10953,"கவ":2505,"கொல":275,"கொள":837,"கொண":2252,"கொட":399,"கம":4718,"கர":7242,"கற":456,"கல":3250,"கள":26914,"கழ":1431,"கன":1126,"கப":5340,"கட":3472,"கத":3423,"கண":2578,"கக":1180,"கச":448,"கங":1149,"ஒன":2135,"க்க":31923,"ஒர":8154,"கோர":492,"கோய":755,"கோவ":374,"ஓர":763,"கோட":700,"ஒல":300,"ஒள":283,"கோண":286,"ஏற":1035,"ஐக":383,"க்ட":361,"சக்":631,"்ட":31792,"்த":54197,"்ண":1523,"am ":356,"்ந":4822,"்ப":46615,"்ன":4757,"்க":63255,"்ச":10386,"al ":518,"ோம":420,"ோய":1038,"ோர":2106,"ோற":374,"ோல":725,"ோவ":577,"ோட":1840,"ோப":835,"ோன":1596,"ோத":1196,"ோண":311,"ொர":2700,"ொற":791,"ோச":380,"ொழ":3265,"ோக":1274,"ொல":1580,"ொள":880,"ொட":2489,"ொன":573,"ொண":2347,"ொத":1247,"ொக":1140,"and":321,"an ":334,"்வ":6141,"்ஸ":1010,"்ல":6654,"்ற":20507,"்ள":9314,"்ம":4103,"்ர":1658,"்ய":1630,"சு 
":1200,"ூன":663,"ூத":353,"ூர":3051,"ூற":1305,"ூல":1999,"ூழ":431,"ுந":4175,"ுண":1116,"ுத":14095,"ுய":369,"ுர":3253,"ும":39489,"ுன":1831,"ுப":5592,"ுவ":9858,"ூக":634,"ுழ":1268,"ுள":8293,"ுல":1485,"ுற":6692,"ூட":1699,"ைவ":1932,"ைந":3816,"ைப":6286,"ைய":15390,"ைம":945,"ேல":1655,"ேற":2141,"ேர":3972,"ேய":740,"ேவ":930,"ைக":9533,"ேள":1091,"ேன":443,"ேம":327,"ைத":2725,"ைச":1726,"ெள":2310,"ெல":1418,"ேக":639,"ெய":4586,"ெற":2754,"ெர":3220,"ெப":689,"ென":2012,"ேண":375,"ேத":757,"ேட":601,"சி ":2531,"ேச":2248,"ெக":289,"ெண":621,"ெட":1667,"சை ":644,"ச் ":3259,"ati":457,"ஒளி":281,"ஒலி":275,"ஒரு":7842,"ஒன்":2133,"கே ":463,"கை ":1667,"கா ":505,"ச ":506,"க ":8647,"கு ":5427,"கி ":1073,"கச்":348,"ா ":4799,"ி ":16257,"கங்":1149,"ீ ":453,"ு ":58822,"கக்":1119,"ே ":3658,"ை ":18418,"ோ ":1470,"் ":167861,"கன்":508,"கப்":5197,"கணி":938,"கண்":880,"கத்":2578,"கதை":393,"கணக":371,"கடல":480,"கட்":1973,"ட ":4634,"ண ":792,"த ":6809,"ஓர்":624,"ன ":8248,"ப ":465,"க் ":7275,"ர ":1232,"ய ":9777,"ம ":541,"ழ ":296,"ள ":4773,"ல ":2350,"ற ":3775,"வ ":559,"ஜன்":468,"ion":604,"டி ":2152,"டா ":281,"டு ":6809,"ஜூல":309,"டை ":967,"ட் ":1325,"சங்":473,"he ":350,"சத்":482,"சட்":614,"சமய":437,"ங்க":15531,"சம்":940,"சமூ":398,"சன்":386,"சப்":396,"சந்":419,"சர்":489,"சிவ":528,"சில":849,"சிற":1550,"சிர":415,"சிய":2735,"சிப":459,"சின":560,"சித":585,"சிங":311,"சிக":977,"சால":592,"சார":993,"சாத":575," of":373,"சென":474,"சேவ":338,"சைக":291,"சேர":1297,"செல":774,"செய":2741,"சும":401,"சுற":312,"சுர":362,"சுவ":666,"சீன":355,"ing":312,"சுக":389,"சுத":324,"சூழ":384,"ச்ச":3340,"சைய":843,"சொல":606,"சொற":288,"டக ":523," th":509,"ட்ப":853,"ட்ட":19814,"ட்ச":2725,"ட்க":1235,"டைய":2172,"டைப":576,"டைக":531,"ணி ":660,"er ":398,"es ":343,"ண் ":586,"து ":27911,"தை ":1771,"தே ":341,"தி ":2552,"தா ":680,"டங்":1549,"டக்":1238,"ஞர்":315,"ஞ்ச":1519,"டமா":725,"டமை":297,"டம்":2072,"டர்":1679,"டன்":1240,"டப்":1121,"டத்":4115,"டது":1165,"டிய":2991,"டிப":984,"டின":1240,"டித":607,"டிவ":1222,"டில":1216,"டிர":537,"டாண":290,"டிட":382,"டிச":401,"டிக":1256,"டாவ":366,"டார":303,"டுப":1408,"டும":4650,"டுத":2954,"டுவ":1115,"டுள":785,"டுக":5293,"டாக":695,"டல்":1207,"தாக":1315,"தான":1137,"தாள":370,"தால":691,"தாவ":511,"திக":2302,"திச":403,"தாய":342,"தார":1089,"தின":3651,"திப":796,"திட":432,"திவ":299,"திம":291,"தில":7958,"திற":783,"திர":5993,"திய":6938,"துக":2030,"தீவ":583,"துண":389,"துட":443,"துப":404,"துள":2935,"துற":954,"துர":579,"தும":1383,"துவ":2062,"தூர":370,"தேச":1067,"தெற":289,"தென":697,"தைக":641,"தேவ":512,"தைச":349,"தேர":391,"தைய":745,"தொட":1925,"தொக":1037,"தொழ":704,"தொல":527,"தப்":1587,"ண்ம":630,"ண்ப":578,"தமி":4632,"தமா":339,"ண்க":445,"ண்ண":1196,"ண்ட":9721,"தம்":1071,"தயா":275,"தற்":1515,"தலா":427,"தலி":372,"தல்":1648,"தலை":1494,"தர்":937,"தவர":662,"தவி":419,"த்த":29791,"ணங்":275,"ணத்":603,"த் ":4416,"ணக்":539,"ணுக":317,"ணிக":735,"ணிய":592,"ணின":373,"ணித":474,"ணைய":630,"தந்":298,"தனி":760,"தன்":2053,"தனை":537,"ணைக":298,"தது":557,"தத்":779,"ணம்":660,"ணப்":571,"தங்":477,"தகவ":361,"தக்":543,"ணர்":292,"ந்ந":363,"ந்த":19873,"ng ":303,"னம்":921,"னர்":1811,"பகு":1877,"னது":1015,"னத்":650,"நேர":433,"நெட":597,"னப்":1023,"நோக":445,"நிக":520,"நாள":908,"நாய":319,"நான":310,"நாத":286,"நாட":4702,"நிய":299,"நிர":619,"நிற":1236,"நில":3876,"நீர":548,"நீத":282,"நுட":372,"நூற":484,"நூல":812,"னக்":283,"னங்":574,"பற்":788,"பலர":285,"பர்":1261,"பரி":581,"பரப":390,"பயன":1636,"பம்":365,"of 
":362,"பவர":561,"பல்":1018,"பன்":334,"னைத":388,"பந்":594,"னைய":458,"னைவ":278,"னைக":358,"ன்ம":1125,"ன்ற":10571,"ன்ப":7799,"ன்ன":4301,"பமா":350,"ன்ட":278,"ன்க":958,"ன்ச":382,"னும":1622,"னார":414,"படம":801,"னிக":390,"னால":860,"படத":483,"படை":1050,"னின":559,"பட்":5520,"னிய":1758,"னில":287,"படி":920,"படு":7705,"னித":927,"பத்":1191,"பது":4499,"பதா":323,"பதி":845,"பண்":714,"பதற":308,"பணி":371,"பங்":436,"பக்":415,"னவர":288,"on ":583,"பல ":913,"ன் ":22050,"னை ":1086,"பட ":307,"னா ":289,"னி ":918,"ப் ":6813,"பை ":319,"நடி":497,"பு ":2795,"நடை":406,"பி ":577,"நகர":2167,"மணி":297,"ரே ":289,"மது":349,"மதி":359,"மத்":609,"ரு ":7941,"மட்":449,"ர் ":16957,"ரை ":1204,"றன ":856,"மன்":1232,"மனி":790,"மல்":385,"மலை":666,"யக்":1396,"மம்":421,"மரப":361,"மரு":359,"லக ":292,"மற்":2141,"றி ":716,"மாந":2194,"மான":2448,"மாத":610,"மாக":4202,"மிய":352,"மின":800,"று ":3977,"மாவ":2559,"மிக":1315,"மாற":977,"மார":743,"யங்":868,"யா ":1169,"ய் ":704,"யை ":1008,"யே ":849,"மகா":275,"ரா ":278,"ரி ":1486,"மங்":313,"மக்":1291,"மி ":296,"மே ":505,"மை ":1120,"ம் ":53178,"பின":1559,"பிய":742,"பிர":3751,"பிற":1054,"பில":819,"பிக":486,"பிட":1127,"பாத":646,"பான":683,"பாய":372,"பால":940,"பாள":428,"பார":981,"பாக":1005,"பாட":3059,"பாண":798,"பூர":411,"புத":1109,"புவ":296,"புல":660,"புற":357,"புர":1163,"பீட":289,"புக":1683,"பேர":1352,"பெண":307,"பெய":1296,"பெர":1861,"பெற":2100,"பேச":556,"போட":316,"பொற":395,"பொர":2127,"பொத":938,"போல":281,"போர":737,"போன":1011,"போத":864,"ப்ட":297,"ப்ப":26657,"ப்ர":488,"ளகர":469,"ளக்":350,"லம்":2464,"லமா":457,"லர்":460,"லப்":639,"ழ் ":1697,"ற்க":4226,"ற்ச":313,"ற்ப":2340,"ற்ற":9892,"றைக":742,"றைய":1266,"வே ":437,"ளத்":348,"ளது":1904,"லைக":1350,"லேய":338,"வை ":1638,"லும":2793,"லுக":305,"லிர":748,"லிய":1493,"லில":989,"லூர":479,"லுள":616,"வி ":735,"லிக":541,"லாற":423,"லாள":499,"லாம":957,"லிப":277,"லின":516,"வு ":1989,"லாக":721,"லான":565,"லாந":354,"ளங்":417,"லகி":465,"றம்":427,"ர்ச":956,"ர்க":5162,"ர்வ":1496,"ர்ம":805,"ர்ப":1555,"ர்ந":1909,"ர்த":1839,"ர்ட":360,"றப்":1150,"ள் ":12275,"ரைய":957,"றந்":718,"ரைப":1450,"றனர":342,"ளை ":2755,"ரைக":424,"றது":3133,"றத்":625,"லத்":2727,"லது":2796,"றுவ":1225,"றும":3125,"றுப":986,"றுத":532,"றுக":1202,"றில":535,"றிவ":770,"றிய":1917,"றின":604,"றிப":1027,"ழு ":279,"றித":312,"றார":526,"றிக":1316,"றாண":394,"றாக":1080,"ழி ":982,"லங்":2176,"லக்":1207,"றழை":438,"ரபு":329,"ரப்":990,"ல் ":34411,"ய்க":394,"ய்த":601,"ய்ய":1010,"ய்வ":739,"ரம்":2579,"ரமா":606,"ரர்":568,"ரலா":483,"யுள":362,"ரண்":815,"ரதே":555,"ரத்":1609,"ரது":278,"ளன ":701,"லை ":2437,"ரன்":291,"ளி ":635,"ராவ":406,"ரிக":2239,"ரால":435,"ராய":388,"ராம":869,"ரிச":541,"ரித":580,"ரிம":685,"ரின":1028,"ரிப":456,"ராக":2778,"ராச":370,"ராஜ":438,"ராட":1211,"ரான":615,"ருங":473,"ருட":752,"ருக":4077,"ருப":877,"ரும":2991,"ருத":1851,"ருந":3294,"ரிய":3104,"ரில":735,"ரிவ":1163,"ருவ":2848,"ருள":1674,"ரூர":620,"umb":290,"ரல்":362,"றங்":282,"மொழ":2149,"யப்":1295,"மைந":2631,"மைய":1873,"மைப":1460,"யன்":2673,"யமை":293,"யம்":1435,"ம்ச":279,"ம்ம":840,"ம்ப":4940,"யமா":620,"மூல":729,"மூன":356,"முழ":336,"மூக":412,"மீட":317,"மிழ":4631,"முற":1812,"மும":549,"முன":1115,"முத":1979,"முட":981,"முக":1304,"மென":349,"மெர":642,"றை ":1441,"மேல":805,"மேற":920,"மைக":943,"மேள":882,"யத்":1278,"யது":423,"லி ":729,"லா 
":378,"ரங்":785,"ரசி":802,"ரசு":618,"யும":2862,"யிர":1042,"யில":6388,"யார":713,"யாவ":2647,"யாள":533,"யாழ":607,"யிட":437,"யின":2554,"யிய":701,"யாக":2905,"யாட":446,"யான":1415,"tio":513,"thu":314,"யல்":3022,"யலா":554,"யலி":566,"யற்":1115,"யர்":1747,"ter":301,"the":291,"யவர":291,"யவற":284,"வும":1702,"வாக":2334,"வாத":306,"வாச":280,"விக":832,"வாழ":922,"வால":316,"வான":889,"வாய":566,"வார":1236,"வித":708,"விண":306,"வின":2946,"விச":326,"விட":1092,"வில":3171,"விள":1040,"விய":1955,"விர":904,"விற":467,"வுக":1344,"வீர":321,"ஸ் ":1708,"வ்வ":1164,"வேற":846,"வேத":466,"வெள":2232,"வைய":661,"வைத":379,"வைக":768,"வேல":442,"வற்":1078,"வரி":937,"வரு":1641,"வரா":523,"வரை":1143,"வர்":4658,"வலை":345,"வல்":686,"வளர":473,"வழங":567,"ழ்ந":1868,"ழ்த":442,"ழ்வ":520,"ழ்ப":759,"ழ்க":412,"வரத":404,"வரல":407,"வம்":690,"வழி":728,"வடக":309,"வடி":817,"ழிப":303,"ழிக":564,"ழுவ":343,"ழும":521,"வட்":2653,"ழிய":1078,"ழில":887,"ழுத":1586,"வதற":541,"வதா":350,"வது":3201,"வத்":327,"வப்":301,"ழைக":1109,"வந்":1239,"வனம":324,"ழைய":304,"வன்":676,"ள்ள":9236,"ள்க":555,"ழர்":543,"ழமை":586,"வகை":1129,"வங்":465,"ழங்":729,"ளாக":1168,"ளிப":395,"ளின":2302,"ளால":742,"ளிக":491,"ளில":4426,"ளிய":1106,"ளிவ":966,"ளுட":327,"ளுக":1636,"ளும":1077,"ளுள":663,"ளைக":897,"ளைப":388,"ளைய":1472,"லைவ":579,"லைய":1591,"லைம":331,"ளனர":279,"லைப":623,"லைந":485,"லைத":348,"ளப்":366,"ல்ல":5939,"ல்வ":1233,"ல்ப":618,"ல்ந":316,"ல்க":1857,"ளம்":661,"ளமா":505,"ளர்":1471,"ழகத":295,"ளவை":282,"ழக்":933,"ளவு":424,"ளவி":419," நட":1718," நக":1501," நா":4504," நி":4279," நெ":933," நே":568," நூ":1176," நீ":1214," நு":519," நவ":316," பட":1904," பண":1105," பத":1073," பன":342," நோ":691," பக":1834," பங":296," பு":2928," பா":4041," பி":5649," பூ":498," பெ":4588," பே":1964," பற":718," பர":1247," பய":1942," பழ":608," பல":2289," பொ":3056," போ":2704," மன":1180," மட":504," மண":418," மத":890,"ிக ":435," மக":1685," மை":493," மே":3012," மெ":772," மொ":2003," மு":6572," மீ":747," மி":1990," மா":7055," மூ":1353," மர":989," மற":2304," மல":556,"ால ":279," டி":434,"ார ":281," தட":289," தக":507," தர":734," தற":353," தல":1342," தம":4737," தய":281," தன":1258," து":1451," தீ":745," தூ":310," தி":4210," தா":1353," தெ":1340," தே":1470," தொ":4079," தோ":559,"�":1175," ஸ்":417,"ான ":5124,"ாண ":290,"ாத ":411," ரா":553," யா":744," வட":1713," வண":356," வத":741," வக":1085," வச":354," வர":2716," வல":690," வழ":1368," வள":796," வந":384," வி":5451," வீ":639," வா":2177," வை":533," வே":1627," வெ":2854," உண":846," உத":491," உட":908," ஈழ":284," உள":3279," உல":1058," உற":820," உர":1675," உய":988," இண":833," இத":5634," இட":1528," இங":400," இச":1164," ஆவ":1943," இக":391," ஆய":570," ஆம":1553," ஆற":781," ஆர":1069," ஆப":332," ஆன":365," இவ":2571," இல":3087," இற":478," இர":6995," இய":2150," இம":448," இப":504," இன":1051," இந":4772," எட":530," எத":528," எண":593," என":11142," ஊர":477,"ாக ":5165," அக":780," அச":377," அட":1678," அத":1971," அண":901," அந":301," அன":909," அப":485," அம":5119," அர":2421," அற":1466," அல":3607," அள":796," ஆக":6168," அழ":1038," அவ":1449," ஆங":1422," ஆச":512," ஆட":602," ஆண":2832," சர":400," சம":1684," சந":326," சட":638," சங":283," சக":625," ஜூ":466," ஜெ":287," ஜன":633," சோ":491," சொ":885," சா":1798," சி":3884," சீ":724," சு":1865," சூ":738," செ":4613," சே":1783," ஒர":8153," ஒன":2134," ஏற":1035," ஐக":383," எழ":1219," எல":516," கவ":401," கு":5595," கூ":1517," கா":3988," கி":3235," கீ":461," கொ":3490," கோ":2006," கை":284," கே":530," கண":2002," கத":414," கட":2692," கன":550," கழ":306," கள":362," கல":1562," கர":2268," ஒல":300," ஒள":281," 
ஓர":762,"ஸ்க":347,"ஸ்ட":597,"ஸ்த":763},"n_words":[2733895,2995227,2314467],"name":"ta"}
\ No newline at end of file
+++ /dev/null
-{"freq":{"ొరి":395,"D":266,"E":177,"F":114,"G":196,"A":519,"B":275,"C":368,"L":194,"M":322,"N":270,"O":138,"H":192,"I":350,"J":111,"K":142,"U":147,"T":316,"V":136,"P":450,"S":521,"R":276,"f":670,"g":908,"d":1461,"e":4276,"b":801,"c":1375,"a":5389,"n":3366,"o":3005,"l":2162,"m":1421,"j":112,"k":391,"h":2159,"i":3899,"w":466,"v":490,"u":1533,"t":3619,"s":2456,"r":3191,"p":1106,"్వ ":163,"y":766,"x":252,"్ల ":671,"్ర ":1330,"ొని":112,"ొన్":374,"్య ":1153,"్మ ":263,"ొదల":145,"ొదట":235,"ొత్":201,"్ప ":113,"్న ":986,"్ద ":312,"ొట్":128,"్ధ ":159,"్త ":383,"్థ ":302,"్ణ ":129,"ొక్":438," l":107," m":164," o":442," h":116," i":265," e":97," f":148," a":447," b":182," c":254,"్క ":414," t":666," w":134," p":388," s":269," r":221," J":106," K":133," H":173," I":300," N":209," O":104," L":173," M":286," B":241," C":316," A":428," F":98," G":181," D":227," E":138," S":425," R":253," P":381," V":124," U":129," T":274,"ొంద":344,"ొండ":319,"ل":112,"ఉద్":166,"ا":171,"ఉన్":1055,"ఉత్":337,"ఉపయ":174,"్కా":196,"్కి":134,"్గం":289,"్కు":347,"్కృ":177,"్కర":105,"్కల":130,"్కడ":197,"Ma":108,"Na":100,"र":112,"ा":126,"In":167,"Th":137,"Ra":122,"b ":223,"a ":692,"i ":282,"ge":141,"ga":105,"he":507,"ha":394,"gh":206,"ోబర":105,"ోయి":127,"g ":144,"ea":167,"ec":109,"ed":292,"de":174,"di":203,"h ":262,"ోమీ":223,"el":221,"en":398,"em":129,"et":241,"es":437,"er":625,"ca":177,"e ":1121,"ౌ":562,"్":81970,"ై":3997,"ొ":4527,"ోర్":149,"ో":18200,"ె":10608,"ే":14569,"ీ":10360,"ు":57347,"ూ":6654,"be":157,"ృ":1348,"da":157,"f ":323,"ct":100,"co":201,"ci":133,"ch":165,"ce":190,"ఘ":447,"ఛ":149,"చ":15206,"c ":186,"జ":11057,"ట":14914,"ఞ":164,"ఐ":184,"ఓ":160,"ఒ":2543,"క":36325,"గ":19947,"ఖ":1770,"ఈ":2196,"ఉ":3268,"ఊ":158,"ఎ":1173,"ఏ":730,"ం":43411,"అ":7490,"ఆ":3748,"ఇ":3357,"హ":4682,"ోలు":99,"స":22838,"ay":103,"ి":69802,"ా":61461,"d ":570,"at":690,"ళ":2227,"as":298,"ల":47610,"ar":638,"ర":61617,"ష":5311,"శ":7224,"వ":26890,"ప":26129,"ఫ":1064,"al":617,"న":46878,"ai":134,"మ":31451,"య":19501,"ap":114,"బ":7288,"am":273,"భ":4603,"an":764,"ac":150,"ణ":4171,"ad":159,"ఠ":238,"డ":16340,"ab":131,"ద":24907,"ag":119,"ధ":5601,"త":28253,"థ":2033,"nt":373,"ns":115,"of":309,"ోత్":140,"om":163,"on":653,"ol":158,"ot":144,"os":117,"ou":169,"op":107,"or":408,"r ":477,"pe":126,"pa":178,"ph":109,"lo":168,"ోదా":327,"ll":190,"o ":201,"ma":274,"mb":260,"me":185,"mi":151,"p ":107,"na":453,"nc":108,"nd":442,"ne":251,"ng":261,"ni":232,"ోనే":164,"ోనూ":131,"ోని":1569,"ka":99,"m ":247,"li":262,"le":271,"la":311,"n ":759,"ht":245,"hu":238,"hi":188,"ho":127,"id":115,"ic":417,"ia":224,"ig":225,"ie":147,"ir":142,"is":387,"it":280,"iv":131,"il":208,"in":615,"io":345,"ోపా":108,"l ":472,"ోజక":467,"ోజు":1239,"y ":444,"vi":97,"ve":202,"va":103,"x ":181,"ul":111,"ur":199,"us":218,"um":308,"un":137,"tt":129,"w ":99,"to":206,"tr":177,"te":479,"ti":559,"th":697,"ta":313,"ss":111,"st":281,"se":152,"sh":199,"si":231,"u ":109,"sa":101,"rs":111,"rt":125,"ry":99,"ro":238,"rn":149,"ri":529,"re":343,"ra":524,"t ":648,"s ":985,"px":158," అ":7472," ఇ":3349," ఆ":3719," ఉ":3240," ఈ":2188," ఊ":158," ఏ":725," ఎ":1157," ఐ":178," ఒ":2539," ఓ":155,"ొలి":167," క":10463," ఖ":358," గ":5736," ఘ":134," చ":6063," జ":5157," ట":497,"ొల్":112,"ోకి":106,"ోక్":172,"ోగి":184,"ya":98,"అసె":136,"అల్":163,"అరబ":119," డ":594," త":4637,"అర్":220," ధ":398," ద":3929," న":6927," ఫ":510," ప":14617," భ":2487," బ":2399," య":1063," మ":10243," ర":6209," ల":4976," వ":10214," శ":2160," ష":216," స":10032," 
హ":1355,"అమె":140,"అయి":199,"అభి":134,"అప్":103,"ఆధా":126,"్స్":527,"్సు":127,"్సి":119,"్సవ":107,"్ష్":173,"్సర":1414,"ఇంట":99,"ఇండ":128,"ఇంద":101,"ఇంక":425,"అక్":257,"్మక":121,"్మం":100,"్రం":1049,"్మీ":100,"్మి":779,"్మా":335,"్యక":335,"్యం":700,"్రజ":278,"్యా":2178,"్యు":431,"్యల":131,"్యవ":431,"్రక":927,"్యప":108,"్యమ":477,"్యత":143,"్యన":109,"్ధి":325,"్ధా":136,"్నద":482,"్ని":1493,"్నూ":99,"్నా":798,"్పట":140,"్పడ":119,"్పా":169,"్పి":132,"్పు":480,"్వత":158,"్శక":248,"్వవ":145,"్వర":328,"్వన":132,"్వే":159,"్వి":140,"్వా":901,"్వహ":104,"్షణ":127,"్షి":442,"్యే":161,"్రధ":337,"్రప":534,"్రత":526,"్యూ":276,"్రద":990,"్రయ":138,"్రల":146,"్యో":108,"్రమ":1464,"్రభ":230,"్రస":657,"్రహ":283,"్రవ":528,"్రి":1462,"్లం":396,"్రీ":1126,"్రు":244,"్రా":3971,"్రే":209,"్రె":472,"్రో":201,"్లా":2320,"్లి":708,"్లీ":237,"్లు":538,"్లూ":142,"్లె":187,"్లో":427,"్వం":342,"్ళు":159,"అధి":174,"అనగ":184,"్జా":141,"అదే":167,"్టమ":140,"్టణ":417,"అన్":250,"్ఞా":122,"అనే":708,"అని":495,"అను":265,"్గొ":101,"్గా":358,"్గర":100,"్గమ":101,"్చు":270,"్చి":657,"్చే":115,"అతి":194,"అత్":202,"్తు":1013,"్తూ":214,"్తి":885,"్తా":732,"్థల":171,"్త్":578,"్ణా":297,"్థం":104,"్ణు":99,"్తర":228,"ఆంధ":768,"్దు":149,"్ది":135,"్నం":164,"్దా":110,"్ధం":145,"్థా":493,"్థి":196,"ఆంగ":412,"్టి":519,"్టా":168,"్టు":592,"్టీ":169,"్టె":145,"్టే":107,"్టో":130,"్ట్":1367,"్తం":168,"్తక":101,"్డు":210,"్డి":275,"్డా":104,"ఇస్":134,"ఇవి":157,"ఈయన":143,"ఉంద":273,"ఉండ":260,"ఉంట":317,"ఇక్":152,"ఆస్":106,"ఆయన":107,"ఆర్":219,"ఇది":937,"ఇతన":164,"ఇతడ":125,"ంగు":105,"ంగా":1620,"ంగీ":190,"ంగ్":842,"ుబ్":104,"ంచం":115,"ంచడ":196,"ంచబ":269,"ురం":244,"ుమా":305,"ంచి":2098,"ంచా":900,"ంచు":217,"ంచే":428,"ునర":115,"ును":308,"ుని":447,"ంకట":194,"ున్":877,"ంకా":441,"ంగం":127,"ంకు":100,"ంక్":110,"ుపు":220,"ంకే":111,"ుప్":149,"ంగల":110,"ంట్":162,"ుతా":153,"ుతు":406,"ంటూ":289,"ంటు":309,"ంటి":346,"ంటా":630,"ంటే":263,"ండవ":146,"ండర":435,"ండల":1528,"ండా":227,"ుత్":335,"ుతో":165,"ండు":332,"ండి":934,"ుదల":235,"ండే":174,"ండ్":373,"ంతం":178,"ునక":539,"ుద్":398,"ుట్":223,"ుటు":199,"ుడి":231,"ుడై":99,"ుడు":1669,"ూడా":424,"ెం ":180,"ుస్":362,"ువు":534,"ువా":295,"ులన":225,"ులక":309,"ుర్":295,"ురి":204,"ురు":297,"ురా":315,"ులో":1709,"ుల్":297,"ులు":2048,"ులల":295,"ీపు":398,"ీని":626,"ీన్":101,"ుంట":481,"ుంచ":300,"ీటి":186,"ీడా":155,"ుంద":899,"ుండ":917,"ుంబ":248,"ీటర":270,"ీసు":107,"ీస్":140,"ుగు":1331,"ుగా":707,"ుఖ్":339,"ీవి":185,"ీవు":115,"ుక్":186,"ుకొ":218,"ుకో":172,"ుకు":711,"ీలు":116,"ీరు":184,"ీర్":177,"ీరా":161,"ీరి":154,"ేక ":252,"ృతి":120,"అంద":240,"ృత్":158,"అంత":284,"అంట":498,"ృద్":113,"ేట ":153,"ృష్":430,"ూబ్":222,"ంవత":1405,"ూర్":913,"ూరు":920,"ూరి":178,"ూమి":99,"ంస్":461,"ూరం":247,"ూడి":137,"ూన్":122,"ంనగ":106,"ంధి":102,"ంద్":655,"ందు":615,"ందీ":121,"ందూ":127,"ంది":3900,"ందా":120,"ందర":242,"ంత్":325,"ంతో":223,"ంతి":125,"ంతా":223,"ంతు":154,"ంతమ":531,"ంతర":294,"ంప్":193,"ంబం":197,"ంధ్":823,"ంబ్":147,"ంబర":346,"ంబా":123,"ంలో":2499,"ెక్":232,"ెగొ":368,"ెట్":682,"ేంద":485,"ెద్":322,"ెను":111,"ెన్":295,"ెడ్":327,"ైన ":1371,"ెంక":174,"ెండ":749,"ెంట":159,"ెంద":1703,"ెంబ":376,"ేయబ":121,"ేర్":242,"ేరు":810,"ేశమ":156,"ేవా":233,"ేవి":124,"ేవు":99,"ేశం":473,"ేశ్":948,"ేషన":164,"ేశా":391,"ేసు":136,"ేసే":128,"ేస్":215,"ేసి":385,"ైదర":183,"ైనా":98,"ెప్":281,"ెర్":117,"ంత ":201,"ెరి":197,"ెరు":117,"ండ ":260,"ెళ్":227,"ెల్":334,"ెలి":127,"ెలు":1031,"ెస్":269,"ంచ ":118,"ేట్":100,"ేదా":763,"ేది":324,"ేత్":212,"ేపల":115,"ేని":152,"చక్":98,"జీ ":270,"జు ":869,"చడం":113,"చడా":97,"జ్ ":143,"చబడ":285,"చయి":169,"చర్":162,"చరి":182,"టం 
":155,"చిమ":219,"చిన":1351,"చిత":839,"చాయ":132,"చార":671,"చాల":242,"చాడ":421,"చిం":529,"చుట":109,"చుక":184,"చెర":102,"చేత":111,"చేశ":113,"చేస":711,"చేయ":279,"చేర":138,"చెం":1703,"చెప":157,"టక ":161,"చ్చ":739,"జకవ":368,"జట్":132,"జయం":110,"జయన":97,"జనా":117,"జనవ":101,"జన్":546,"జరి":198,"జరు":168,"జర్":124,"జిల":1917,"జాత":476,"జాబ":132,"టు ":571,"డం ":328,"టీ ":163,"టి ":1355,"ట్ ":976,"జూన":102,"జుల":578,"టే ":302,"జీవ":381,"జ్య":270,"జ్ఞ":163,"కలవ":99,"కలద":130,"కల్":174,"కలు":183,"కలి":375,"కళా":130,"కవర":379,"కవి":191,"కమై":100,"కము":112,"కరణ":169,"కరి":149,"కరీ":108,"కర్":483,"కృష":374,"కృత":281,"కూర":117,"కేం":355,"కెట":368,"కేత":124,"కొల":164,"కొన":532,"కొత":121,"కొం":355,"కాక":237,"కాన":220,"కిం":115,"కాం":219,"కిల":278,"కుం":288,"కాల":753,"కావ":146,"కాశ":150,"కార":1413,"కువ":215,"కుల":554,"కుమ":181,"కూడ":482,"గంల":249,"కున":307,"కుడ":387,"కుట":214,"క్త":493,"క్ట":234,"క్ర":1156,"క్ష":1241,"క్య":503,"క్స":190,"కోట":132,"కోవ":118,"కోస":136,"క్క":1351,"గణి":99,"చు ":184,"గాం":158,"గిం":251,"చి ":1132,"గల్":120,"ఖ్య":557,"గరం":165,"గము":239,"గర్":467,"గోద":304,"జక ":102,"గ్గ":278,"గ్ల":521,"గ్ర":2803,"గొర":375,"గొం":139,"గుం":380,"గీత":230,"గిన":273,"గిర":115,"గిల":409,"గాన":344,"గాల":538,"గార":322,"గుర":382,"గుల":283,"చే ":527,"గుత":171,"చంద":164,"The":111,"ఒకట":320,"ఈ ":1802,"కడ ":129,"ఆ ":194,"ం ":8664,"జ ":124,"ట ":666,"చ ":142,"క ":4039,"ఖ ":417,"గ ":279,"కా ":636,"కి ":2878,"కీ ":159,"గం ":342,"కు ":3007,"కే ":115,"కల ":176,"ో ":9069,"ొ ":154,"్ ":12057,"ూ ":882,"ే ":3256,"ె ":384,"కంగ":154,"ై ":627,"కంప":118,"క్ ":597,"al ":302,"కటి":342,"ప ":310,"కడప":102,"గి ":200,"ర ":2230,"య ":2100,"and":212,"గు ":978,"మ ":914,"an ":162,"భ ":236,"త ":2433,"ణ ":915,"గ్ ":310,"కని":99,"డ ":765,"కన్":185,"న ":8370,"గే ":100,"ధ ":351,"ద ":698,"కత్":143,"థ ":364,"గల ":283,"ి ":25169,"ు ":28831,"ీ ":2596,"ా ":10070,"గర ":98,"ళ ":256,"ara":113,"గా ":3484,"as ":101,"ల ":3843,"ష ":236,"స ":172,"వ ":1710,"ate":158,"ati":274,"శ ":151,"ఎక్":197,"ఒక ":1730,"ఎన్":170,"ఏర్":182,"ఏప్":108,"కం ":201,"పతి":175,"పత్":270,"పని":184,"పదా":153,"బ్ ":398,"పద్":166,"పబడ":136,"పర్":323,"పరి":615,"పయో":173,"is ":130,"ion":314,"మం ":434,"బీ ":112,"పడి":153,"పట్":664,"పటి":143,"నుల":119,"నుమ":118,"నుం":937,"పండ":161,"పంచ":381,"నిష":106,"నిస":137,"నుక":147,"నేద":234,"నేక":128,"నెల":200,"ఫ్ ":137,"న్స":247,"న్న":4348,"న్య":363,"న్మ":479,"నర్":180,"నల్":184,"నవల":101,"నవర":120,"నవి":505,"నవా":119,"నాన":204,"నాల":394,"నిక":2247,"నామ":117,"నాయ":625,"నార":407,"నాగ":146,"నాట":222,"నాడ":330,"నిం":101,"నిమ":603,"నిప":174,"నిన":273,"నివ":153,"నిల":173,"నియ":723,"నిర":621,"నిజ":109,"నిచ":107," In":167,"పై ":288,"ధుల":101," Ma":108,"he ":336,"నటి":194,"నటు":148,"ధాన":583," Na":99,"ధిక":249,"ధాల":110,"ధార":372,"ధిం":266,"ధ్ర":813,"ధ్య":539,"ధ్వ":128,"నము":174,"నప్":98,"నది":1069,"ప్ ":131,"నకు":648,"ధర్":117,"ద్ద":912,"ద్ధ":740,"ద్య":705,"ద్ర":1024,"ద్వ":366," Ra":122,"పు ":1089," Th":135,"పి ":170,"నగర":694,"నగా":212,"దుర":131,"దుల":359," in":168,"ic ":167,"దుక":199,"నంల":98,"దీన":642,"నంద":215,"నంత":121,"దూర":349," of":288,"నంగ":97,"దిన":1883,"దాల":166,"దార":401,"దిగ":97,"దావ":310,"దిక":108,"దాన":215,"దాయ":205,"దిం":248," an":154,"igh":175,"ing":133," co":127,"in ":172,"దేశ":1819,"దేవ":424,"దరా":213,"దరి":108,"దరు":98,"దర్":467,"దలై":227,"దము":121,"ht ":153," ri":133," px":158,"hum":206," th":488,"యక్":252,"మల్":108,"యకు":188,"ళం 
":125,"మహా":326,"మహబ":214,"మస్":105,"భ్య":159,"మర్":164,"మము":1248,"మరి":1583,"్య":7206,"్ర":17902,"్భ":147,"్మ":2496,"్ళ":500,"్ల":6272,"్ష":1432,"్స":2574,"్వ":3316,"్శ":403,"భూమ":101,"ొత":206,"ొన":623,"ొద":448,"er ":201,"ొడ":148,"ొట":141,"ొక":526,"ోత":253,"ోద":489,"ోధ":191,"ోన":2053,"ోప":279,"ోజ":1789,"es ":247,"ోట":326,"మన్":141,"ోడ":255,"ొల":420,"ోక":416,"ోగ":536,"ొప":97,"ొమ":110,"ొర":534,"మధ్":230,"ోహ":103,"ోస":258,"ోష":120,"ోవ":239,"ోళ":97,"ోల":348,"ర్ ":2488,"ోర":356,"ోయ":204,"ోమ":362,"ోబ":166,"్ప":1717,"్బ":390,"్థ":1504,"్ద":1267,"్ధ":1039,"్న":5055,"్డ":878,"్ణ":861,"్త":4966,"మద్":165,"్ఞ":163,"్ట":4282,"్చ":1364,"్జ":379,"్క":2197,"్గ":1283,"ౌర":127,"ెక":273,"ెగ":454,"ెట":731,"ేం":571,"ెడ":401,"ెన":560,"ెద":446,"ెబ":128,"ెప":323,"ెర":576,"ెయ":109,"ేక":542,"ెల":1742,"ెళ":238,"ెస":311,"ేజ":151,"ేడ":137,"ేట":442,"ేద":1312,"ేత":579,"ేమ":144,"ేన":288,"ేప":266,"ైక":118,"ేవ":789,"ేశ":2277,"ేయ":473,"ేర":1324,"రు ":4219,"ేల":302,"ైట":119,"ేస":935,"ేష":349,"ైద":293,"ైప":117,"ైన":1789,"ైల":172,"ent":124,"ైర":115,"రీ ":399,"లం ":787,"ొం":865,"ుప":753,"ుబ":195,"ుమ":706,"ుర":1675,"ుడ":2134,"ుత":1344,"ుణ":146,"ుద":870,"ున":2892,"ూచ":102,"ూట":134,"ూడ":838,"ుల":6256,"ుళ":166,"ూక":114,"ువ":1504,"భుత":182,"ుష":187,"ుస":535,"ూబ":239,"ూప":393,"ూర":2460,"ూమ":147,"ూత":151,"యంగ":158,"భివ":99,"ూన":263,"యంల":180,"ూల":418,"యంత":161,"భావ":218,"ృత":512,"ృద":127,"భాష":346,"ృష":487,"భాగ":404,"ెం":3543,"భార":1164,"ght":162,"రా ":340,"రి ":1816,"బ్ద":180,"బ్య":153,"బ్ర":421,"బ్బ":268,"బ్ల":214,"మంల":137,"మంత":183,"మంద":199,"మండ":1520,"మంచ":160,"బూబ":215,"యి ":1015,"యా ":446,"యు ":1545,"బహు":111,"రత ":484,"రణ ":257,"బాల":182,"బాద":326,"చం":331,"చక":136,"గూ":124,"గు":2649,"గీ":290,"గి":1482,"గా":5233,"గ్":4119,"గో":680,"గొ":668,"గే":237,"చూ":162,"చే":2135,"చె":2255,"చి":4513,"చా":1586,"చు":758,"జం":238,"చీ":143,"జక":593,"చ్":872,"చన":284,"చడ":226,"చల":202,"చబ":291,"చర":425,"చయ":212,"జీ":767,"జు":1617,"జూ":225,"జా":1179,"జి":2298,"బర్":394,"జె":108,"జే":138,"జ్":689,"జట":138,"జన":1034,"జల":247,"జర":556,"జయ":406,"జమ":127,"ఞా":122,"టణ":417,"టన":141,"టమ":207,"టర":627,"టల":211,"టా":1086,"టం":232,"టక":349,"బరు":119,"ఏర":193,"ఒక":2332,"కప":131,"కన":498,"కథ":162,"కళ":218,"కల":1454,"కర":1217,"కమ":420,"కత":246,"కడ":370,"ఖం":105,"కట":682,"కం":721,"గన":109,"గమ":341,"ఖ్":606,"గల":557,"గర":1021,"గవ":148,"గడ":113,"ఖా":134,"గణ":180,"గత":148,"కె":553,"కే":816,"కొ":1436,"కో":857,"క్":5977,"కవ":683,"మ్ ":245,"కా":4315,"కి":3602,"గం":786,"కీ":401,"కు":5487,"కూ":725,"కృ":665,"ఇవ":231,"ఉం":885,"ఇస":145,"ఈయ":144,"ఉత":340,"ఉద":238,"ఉన":1071,"ఉప":428,"ఊర":103,"యన ":254,"మే ":227,"బడు":128,"బడి":519,"బడ్":106,"మె ":112,"ఎం":154,"ఎక":206,"ఫిబ":101,"ఎన":226,"ఎల":121,"ఏప":110,"ంట":2314,"ండ":4890,"ంచ":4715,"ంజ":258,"ంఖ":105,"ంగ":3503,"ంఘ":151,"ంక":1284,"ము ":3877,"ంస":547,"ంహ":127,"ంశ":194,"ంవ":1428,"ంల":2578,"ంభ":203,"మీ ":169,"రం ":1690,"ంబ":1040,"ంప":925,"ంన":133,"ంధ":1232,"ంద":6277,"ంత":2617,"అం":1210,"మి ":320,"అక":311,"మా ":484,"అత":486,"ఆం":1221,"అడ":112,"ఆగ":134,"అవ":290,"అస":267,"ఇం":883,"అప":148,"అద":293,"అధ":264,"అన":2095,"అల":293,"ఆక":150,"అభ":173,"అమ":349,"అయ":276,"అర":448,"ఆస":172,"ఇద":1021,"ఇత":411,"ఆద":158,"ఆధ":194,"ఇక":159,"ఆల":169,"ఆర":366,"ఆమ":98,"ఆయ":142,"ed 
":203,"హా":889,"హీ":131,"హి":1070,"హై":216,"హు":207,"హ్":323,"సి":2638,"సా":1884,"సహ":150,"సే":373,"హన":103,"సె":515,"హద":248,"సూ":343,"సు":1296,"సీ":269,"హర":312,"హమ":119,"స్":7213,"సో":169,"హబ":217,"సై":168,"ీన":965,"ీప":582,"ీమ":216,"ీయ":835,"ుం":2995,"ీడ":294,"ీత":443,"ీద":226,"ప్ట":148,"ుచ":224,"ీస":356,"ుజ":115,"ుట":570,"ీర":995,"ీల":543,"ుఖ":811,"ీవ":537,"ుక":1639,"ుగ":2452,"ిప":1339,"ిధ":481,"ప్ర":6213,"ిన":7334,"ిభ":310,"ిమ":1311,"ిబ":193,"ీం":175,"ప్ప":767,"ిడ":512,"ిట":428,"ిద":1195,"ిణ":298,"ప్త":145,"ిత":2784,"ిహ":142,"ిస":1526,"ిష":595,"ీట":504,"ిల":4318,"ిర":1588,"ియ":3643,"ిశ":675,"ివ":1160,"ీక":548,"ిళ":268,"ాప":991,"ాబ":791,"ాభ":232,"ాద":1312,"ాధ":720,"ాన":5606,"ాణ":587,"ాత":1866,"ాథ":104,"ాట":1172,"ిం":5711,"ాడ":1873,"ిజ":737,"ాష":1417,"ిగ":1743,"ాశ":424,"ిచ":498,"ాహ":545,"ాస":1648,"ాళ":307,"ాల":7697,"ావ":1502,"ిక":5536,"ాయ":2657,"ామ":3420,"ార":9423,"ాం":2393,"ాజ":1333,"ాచ":406,"ాగ":1102,"ాఖ":177,"ాక":1433,"ళ్":660,"ళా":226,"ళు":204,"వం":1022,"ళి":199,"లూ":372,"లె":853,"లే":1247,"లై":507,"లొ":182,"లో":10420,"ల్":5828,"లల":1531,"లవ":368,"పొం":272,"లస":257,"లి":3093,"లా":4226,"లు":7313,"లీ":875,"లన":1389,"లద":177,"లత":291,"లర":153,"లమ":664,"లయ":457,"లభ":102,"లప":338,"లగ":164,"లక":1319,"ళం":168,"రె":1156,"రే":571,"లం":1397,"రీ":1644,"రు":6712,"రూ":370,"ర్":10720,"రై":236,"రొ":149,"రో":1788,"రవ":803,"రర":106,"రల":270,"రి":8236,"రా":9645,"రహ":470,"రస":1070,"పోయ":117,"సల":125,"సర":1986,"సవ":160,"సన":295,"షే":125,"సమ":856,"ష్":2463,"సభ":337,"షు":200,"షి":679,"హం":127,"సత":144,"షా":261,"షల":199,"షన":300,"శే":116,"శో":101,"శ్":2342,"శా":1493,"శి":616,"శీ":111,"సం":3277,"శు":134,"షణ":177,"శర":141,"శమ":213,"వ్":1485,"శప":105,"వై":429,"వే":1068,"వె":798,"పెర":121,"వృ":262,"శత":192,"పెద":371,"వు":1823,"వి":5005,"వీ":727,"వా":3654,"పెట":114,"వహ":409,"వస":762,"వవ":165,"వల":675,"శక":362,"వమ":143,"వర":2599,"వయ":126,"వబ":126,"పేర":775,"వద":165,"వన":469,"శం":714,"వడ":233,"వత":1750,"వచ":500,"పేట":181,"వక":188,"వగ":122,"పర":1387,"పయ":194,"పబ":173,"పుడ":183,"పుట":123,"పశ":239,"పవ":130,"పల":696,"పీ":121,"బం":418,"పు":3004,"పున":212,"పా":2599,"పి":1196,"పుల":197,"పై":426,"పుర":635,"పె":871,"పే":1148,"పూ":679,"పో":542,"పొ":493,"ప్":7624,"ఫి":208,"బంధ":198,"బడ":901,"ఫా":98,"పుక":147,"నప":317,"నన":175,"ధ్":1531,"నమ":463,"నల":465,"నర":396,"నవ":1165,"నస":228,"నా":3446,"ని":11511,"నూ":449,"పం":874,"నీ":780,"ను":3710,"నె":404,"నే":1616,"నై":122,"నో":217,"న్":7783,"పక":194,"పట":848,"పడ":347,"పత":560,"పద":604,"పూర":466,"పన":354,"మస":145,"మహ":733,"యక":627,"మవ":108,"మల":476,"మర":2137,"మమ":1293,"భ్":184,"మయ":233,"మ్":1298,"యమ":802,"మో":201,"యబ":173,"మొ":695,"యప":181,"మై":1143,"యన":1189,"మే":512,"మె":642,"యత":237,"మూ":561,"యణ":150,"ము":8249,"యడ":124,"రం":3512,"మీ":939,"మి":2427,"మా":3556,"రజ":343,"యా":3425,"యస":173,"రచ":538,"రక":1588,"యవ":582,"రగ":211,"యర":125,"యల":345,"రభ":250,"రబ":182,"యో":1070,"రయ":209,"రమ":3193,"య్":536,"యే":292,"రన":144,"యొ":325,"రప":785,"రత":1738,"రధ":361,"రద":1093,"యి":2020,"రణ":774,"యూ":387,"యు":2564,"భజ":115,"బహ":139,"ఫ్":329,"బల":111,"బర":571,"బె":226,"పిల":285,"బో":143,"బి":436,"బా":1142,"బూ":263,"బు":283,"మం":2740,"బీ":198,"భవ":117,"పిం":315,"పాడ":180,"పాట":306,"బ్":1702,"పాక":97,"మక":263,"పార":352,"మధ":257,"మద":259,"పాల":798,"మన":607,"భా":2350,"యం":1483,"భి":337,"మణ":175,"పాత":213,"భూ":233,"భు":226,"పాద":161,"మత":260,"తం":1159,"డె":175,"డై":152,"డే":590,"డూ":98,"డు":4437,"డ్":1447,"తగ":160,"తక":320,"ణు":288,"ణి":460,"తడ":168,"థం":125,"ణా":811,"పశ్":219,"ణమ":366,"డక":111,"డం":469,"టీ":380,"టి":2635,"టూ":417,"టు":1542,"టె":412,"టే":482,"టో":219,"ట్":4408,"డల":1645,"డవ":326,"డి":31
30,"ణం":589,"డీ":118,"డా":1416,"డన":117,"డప":181,"యం ":918,"డర":467,"డమ":180,"దూ":569,"ది":7459,"దా":2559,"దు":1821,"నం":1132,"దీ":937,"దశ":127,"దల":498,"దవ":129,"దమ":252,"దర":1036,"దన":184,"నద":1194,"నత":159,"ధు":301,"నడ":153,"ధి":1062,"ధా":1185,"నట":452,"నగ":969,"నక":819,"ధర":205,"ధమ":185,"ద్":4147,"దో":106,"దై":97,"పల్":507,"ధన":195,"దే":2544,"తి":3113,"తా":2188,"తు":2234,"తీ":1022,"దం":398,"తూ":504,"తవ":124,"ణ్":199,"తమ":1218,"తయ":143,"తర":1102,"తల":412,"తత":106,"తద":425,"తన":727,"తప":172,"ధం":257,"థి":222,"దట":256,"థా":607,"దగ":155,"త్":7311,"దక":372,"థల":206,"తె":1297,"తే":283,"తొ":248,"తో":1099,"తన ":222,"re ":103,"తమ ":119,"తర ":195,"rna":99,"డర్":430,"rig":147,"డలం":220,"డలా":772,"డలమ":349,"తి ":1458,"తా ":207,"తు ":133,"దం ":185,"తీ ":111,"డాక":183,"తూ ":177,"ణంగ":137,"డిస":137,"డియ":171,"డిప":137,"డిన":531,"డిగ":126,"డాన":284,"డిం":217,"డున":104,"డుత":225,"డుద":220,"తే ":137,"డుగ":154,"టుం":409,"టుడ":112,"టుక":107,"టుల":128,"టాన":112,"టిం":317,"టార":521,"టాయ":145,"టిక":258,"టిన":213,"టిల":128,"టెం":107,"టూర":324,"టెస":126,"ట్ట":1520,"ట్న":97,"ట్ల":382,"ట్ర":1237,"టోబ":103,"థం ":106,"ణి ":141,"ణా ":186,"డే ":362,"టణం":170,"టణమ":136,"ఞాన":102,"డ్ ":409,"టర్":527,"తం ":582,"px ":156,"డప ":101,"డవ ":125,"డల ":98,"డు ":3436,"ణం ":367,"డి ":1391,"డా ":647,"దక్":263,"ng ":118,"దగ్":111,"nal":119,"ని ":5553,"nd ":184,"నా ":456,"nat":122,"తొల":175,"త్స":1562,"త్వ":496,"త్ప":99,"త్య":731,"త్ర":2659,"త్మ":140,"త్త":1214,"న్ ":2086,"nte":106,"నీ ":393,"ను ":1891,"నూ ":221,"థాన":257,"థాప":165,"దటి":194,"నే ":783,"తలు":105,"తర్":233,"తరా":100,"తరు":178,"తయా":124,"of ":287,"తని":130,"తను":162,"తదే":370,"ద్ ":290,"ణ్య":125,"తము":568,"తరం":98,"తమి":191,"తమై":176,"తూర":292,"తీస":125,"తీర":136,"తీయ":499,"తుం":576,"తువ":193,"తుల":549,"తున":363,"తుడ":101,"తుత":113,"తెల":1095,"or ":106,"ధి ":298,"on ":261,"తిం":103,"తాన":260,"తాబ":131,"తాయ":183,"తార":646,"తాల":289,"తిక":313,"తిప":135,"తిన":244,"తిల":128,"తిర":258,"ona":120,"ణము":230,"ణాట":124,"ణాన":134,"ణాల":199,"దే ":212,"తడు":135,"నం ":570,"దీ ":144,"దు ":735,"దూ ":152,"దా ":781,"ది ":4694,"mb ":192,"డ్ర":225,"డ్డ":567,"డైన":112,"త్ ":276,"తో ":862,"తంత":146,"తంల":160,"తంగ":162,"ధం ":174,"హైద":184,"హ్మ":154,"హరి":153,"హిం":546,"హాస":146,"హిత":221,"హార":226,"ాం ":154,"ాల ":1055,"ార ":189,"ాయ ":112,"ామ ":130,"ాష ":111,"ిక ":816,"ాడ ":161,"ాన ":423,"ాణ ":106,"ాత ":349,"షిణ":213,"షియ":100,"సత్":128,"ష్ణ":466,"ష్ట":1410,"ష్మ":151,"ష్య":127,"శ్వ":481,"శ్ర":732,"శ్చ":253,"సూర":115,"సుమ":114,"సుప":118,"సుల":177,"సుక":241,"సిన":1016,"సెప":103,"హదా":155,"సెం":276,"సేవ":130,"స్వ":634,"స్స":166,"స్ట":634,"స్త":2396,"స్క":441,"స్య":121,"స్ల":199,"స్థ":1195,"స్ప":97,"హబూ":215,"సర్":165,"సరా":545,"సరి":163,"సరం":158,"సము":220,"సమా":187,"సరమ":805,"సిం":294,"సాగ":118,"సార":233,"సాయ":137,"సామ":225,"సాల":116,"సాధ":336,"సిద":483,"సాహ":220,"సాం":202,"ాషల":143,"ిగా":515,"ిగి":822,"ాశి":102,"ాష్":1035,"ాసన":172,"ాస్":763,"ాసి":162,"ాసు":140,"ాహ్":114,"ాహి":212,"ిచే":186,"ాల్":490,"ాలె":525,"ాలో":374,"ాలి":340,"ాలా":319,"ాలు":1799,"ికం":141,"ాళ్":125,"ికల":174,"ావర":347,"ిక్":462,"ికె":356,"ికి":2410,"ికా":407,"ాశం":101,"ికీ":129,"ావు":393,"ావి":174,"ావా":119,"ాయక":252,"ామా":422,"ామి":301,"ువ ":165,"ాయణ":146,"ాము":207,"ామీ":105,"ారం":644,"ాయన":97,"ామ్":178,"ాయల":128,"ాయా":112,"ాయి":1156,"ారణ":247,"ాయు":174,"ారత":1096,"ారమ":516,"ాలం":253,"ారు":2695,"ారా":731,"ారి":987,"ారె":142,"ార్":1699,"ాలక":349,"ాలన":602,"ాలల":704,"ాలయ":271,"ాపా":110,"ాపి":155,"ాపు":243,"ుల 
":664,"ాప్":131,"ాబ్":203,"ామం":407,"ాబా":337,"ాబి":105,"ాభా":138,"ామమ":1222,"ాయం":147,"ాధి":274,"ాధా":195,"ాద్":200,"ాది":262,"ాదా":163,"ాదు":323,"ానం":337,"ాన్":1330,"ాని":2256,"ానా":239,"ాను":249,"ానీ":138,"ానమ":140,"ానవ":144,"ాణి":118,"ాణా":137,"ాతం":140,"ాత్":473,"ాతి":287,"ాతీ":294,"ాటి":344,"ాటక":227,"ింద":1127,"ింప":204,"ింహ":109,"ున ":292,"ాణం":105,"ాడు":1280,"ాటు":186,"ాట్":155,"ించ":3485,"ింగ":388,"ాచా":147,"ాజక":97,"ాజ్":274,"ాజు":235,"ాజీ":183,"ిస్":1152,"ిసె":112,"ిసి":108,"ిశ్":326,"ిష్":330,"ివర":212,"ీకా":119,"ివి":228,"ివా":253,"ివృ":121,"ిలు":223,"ిలి":585,"ిల్":2371,"ిలో":795,"ీకర":173,"ిరి":250,"ిరా":106,"ిర్":662,"ిరు":214,"ిమా":662,"ిమి":102,"ియన":542,"ియా":466,"ియు":1543,"ియో":582,"ిభజ":97,"ిబ్":136,"ిభా":151,"ిప్":171,"ిపో":97,"ిపా":281,"ిపి":237,"ిన్":485,"ినే":113,"ినా":300,"ినీ":170,"ిని":1388,"ినవ":476,"ిధ్":101,"ినద":304,"ిద్":918,"ిది":107,"ితో":97,"ిత్":1390,"ితు":127,"ితా":191,"ితి":106,"ితమ":107,"ితం":125,"ీంన":105,"ిడి":106,"ిడు":224,"ిట్":149,"ిజ్":98,"ిజయ":327,"ిమ ":204,"ిన ":3712,"ిధ ":97,"us ":116,"umb":214,"ిత ":340,"ిణ ":131,"ాగా":294,"ాగం":166,"ాకు":548,"ాకా":261,"ాక్":268,"ాగమ":122,"ీయ ":614,"ుఖ ":363,"ాంప":98,"ాంత":945,"ాంచ":129,"ాంక":177,"ాంగ":129,"ాండ":272,"ాంట":124,"tio":240,"thu":204,"tic":109,"ీద ":119,"ter":183,"ీత ":149,"the":294,"రెగ":378," టె":198,"రెడ":266,"రెం":338," డి":256," తర":402," తల":122," తయ":126," తమ":296," తన":208,"రుల":346,"రువ":337," తీ":259,"రూప":260," తు":102," తూ":204," తా":347," తి":295," దర":275,"రిస":226,"రీక":244,"లంక":104,"రిశ":167,"లంగ":98,"రియ":2151,"రిల":238," దా":338," ది":263," నం":121," దీ":636," దు":100," దూ":346,"రీడ":197," త్":129," దక":226,"రుగ":346," తే":107," తె":1239,"రుక":143,"లంల":221," తో":176,"రీల":108," తొ":234,"రీర":165,"రుప":202,"రుద":98," దగ":103,"రుత":145,"రుడ":402," ని":1694," నా":986," నే":224," నె":256," ను":1002," నీ":159," పం":410," ధ్":128," నవ":245," నల":140," నర":118," నట":342," నద":241," ద్":355," దే":830," నగ":454," ధర":125," పు":828," బం":140," పి":453," పా":1265,"ర్జ":254,"ర్చ":276," పూ":375,"ర్గ":822," పై":166," పే":871,"ర్క":210," పె":701," పర":985," పల":194," పశ":232,"ర్ల":676," పట":506,"ర్భ":97,"ర్మ":671,"ర్య":611,"ర్ర":164," పత":150,"ర్వ":752,"ర్శ":394," పద":483,"ర్ష":165," పన":214,"ర్స":153,"ర్డ":192,"ర్త":647,"ర్ణ":379," న్":102,"ర్ట":234,"ర్ప":512,"ర్ద":134,"ర్థ":298,"ర్న":217,"ర్ధ":247," బె":166," మం":1840," బు":104," బా":514," బి":196," బహ":138," ఫి":158,"రోజ":1261," పో":317," పొ":389," ప్":5580," మె":212," మే":243," మొ":630," మీ":304," రం":324," ము":950," మా":1189," మి":579," మూ":307," మహ":682," మర":1780," మల":118," మద":144," మధ":241," మన":339," భా":1788," భూ":198," మత":120," బ్":391," రక":185," రచ":380," యా":130," యు":226," యొ":307," రహ":158," రా":2479," రి":152," రూ":154," రే":110," రె":413," రో":1305," లక":229," లా":183," లి":124," లీ":414,"లక్":278," లే":936," లో":2471," వం":288,"లకు":713," వచ":271," వర":687," వల":223," వన":123," శత":174," వృ":111," వు":163," వీ":487," వి":2741," వా":1113," వహ":154," వస":211," శర":115," వ్":920," వే":517," వై":295," వె":707," శి":276," శా":559," సం":2964," శ్":619," సభ":227," సమ":725," సర":319," సత":113," సో":126," సై":131," స్":1289," సహ":136," సా":1231," సి":984," సూ":295," సీ":113," సు":551," సె":214," సే":170," హా":151," హి":411," హై":215,"శం ":350,"లదు":145,"లనా":127,"లను":769," అద":293," అన":2095," అధ":264," అప":148," అమ":347," అభ":171," అర":447," అయ":275," అల":292," ఆక":148," అవ":290," ఆగ":133," అస":267," ఇం":883," అక":309," అత":484," ఆం":1219," అడ":112,"లతో":261," అం":1209,"లము":453," ఉప":425," ఉత":340," ఉద":238," ఉన":1059," 
ఈయ":143," ఉం":878," ఇస":144," ఇవ":231," ఇత":411," ఇద":1021," ఆస":171," ఆల":168," ఇక":159," ఆమ":98," ఆయ":141," ఆర":349," ఆద":158," ఆధ":194," ఎన":222,"లపా":106," ఎక":205," ఎం":151," ఊర":103,"లయం":174,"లసి":100,"వి ":1063," ఒక":2330," ఎల":121," ఏప":110,"వు ":683," ఏర":193," కడ":121," కన":324," కథ":133," కళ":192," కల":950," కర":453," కమ":146," కవ":208," కూ":537," కు":935," కృ":305,"లలో":1368," కా":1477," కీ":118," కి":608," కొ":916," కో":463," కె":104," కే":473," క్":1409," కం":234,"లవు":100,"లున":100,"లుప":124,"లుగ":1377," చం":138,"లీప":400," చర":227,"లిస":141,"లియ":176," గల":204," గణ":98,"లూర":257," గొ":123," గ్":2511," గో":479," గు":835,"లుస":156,"లువ":181," గా":766,"లాం":234," జీ":342," జూ":200," జి":1930," జా":544,"లిప":160,"లిన":457,"లాల":533," చి":1236," చా":356," జం":150," చూ":133,"లిచ":103,"లిక":169," చే":1234," చె":2126,"లిగ":302,"లిం":322," జర":424,"లాన":792," జట":132,"లాక":349," జన":782,"మాన":550,"మిం":658,"మాణ":180,"మాత":176,"మాజ":244,"మిత":118,"మిన":106,"మాల":362,"మార":804,"మిగ":392,"మిక":194,"మూర":145,"మూల":144,"ముస":98,"మూడ":128,"మీద":160,"రంభ":126,"రంగ":724,"మిళ":192,"మీట":267,"ముం":165,"మున":805,"ముద":263,"ముల":1642,"ముగ":175,"ముఖ":794,"రంల":639,"లన ":189,"మెర":163,"మొద":382,"యబడ":162,"మైన":961,"యన్":580,"యమై":151,"యము":343,"మ్య":156,"మ్మ":696,"రకు":296,"రకా":757,"యలు":117,"లి ":884,"లా ":1657,"రక్":177,"యవా":98,"యవస":290,"యాం":168,"రజల":129,"రజా":121,"లు ":5068,"రచయ":170,"లీ ":280,"రచి":149,"యాత":175,"యిం":162,"యాన":300,"యాప":194,"యిత":265,"యిన":317,"యాల":977,"యాయ":147,"యార":226,"యాస":145,"లె ":114,"యున":120,"యుద":115,"యుడ":244,"రణం":198,"లై ":139,"యుల":185,"రతద":370,"రత్":271,"లొ ":129,"రతి":294,"రతీ":189,"లో ":7792,"రదా":133,"రదే":797,"యేక":110,"ల్ ":1097,"రధా":328,"రపం":261,"యొక":313,"యోగ":322,"రప్":212,"యోజ":484,"రబ్":122,"రభు":164," న ":334,"య్య":475,"రమా":132,"రము":2455,"రమై":178," వ ":948,"రలో":108,"రవే":152,"రవర":163,"రసి":474,"రసా":125,"రహద":155,"రహ్":108,"ళు ":149,"రస్":344,"వం ":351,"రాం":782,"రాష":1004,"రిగ":429,"రాశ":107,"రిచ":101,"రాస":176,"రాల":911,"రావ":479,"రిక":1063,"రాయ":398,"రామ":2348,"రార":157,"రిమ":109,"రిన":146,"రిప":316,"రిత":332,"రీం":118,"రాజ":925,"రాచ":155,"రాక":131,"రాబ":238,"రాన":501,"రాత":218,"రాణ":222,"రిం":589,"సి ":308," ఈ ":1799,"వవి":98,"శకు":169,"వల్":164,"వలన":113,"వర్":961,"వరి":638,"వరా":124,"వరం":218,"వరక":228,"వబడ":120,"వని":123,"�":414," ఆ ":193,"వేశ":97,"వెన":102,"వేత":144,"వేద":126,"వేర":137,"వెల":135,"వెళ":168,"స్ ":1102,"శతా":121,"వెం":227,"వుల":548,"వృద":108,"వృత":103,"విభ":250,"విశ":387,"విష":171,"విస":184,"విల":124,"వివ":223,"వీట":109,"వీర":311,"వుడ":102,"సే ":113,"వున":206,"వాట":147,"వాడ":389,"విం":186,"వాత":351,"వాద":217,"వాన":154,"వాయ":104,"వార":1074,"వామ":220,"విక":160,"వాల":287,"వాస":216,"విజ":433,"విడ":296,"విత":252,"విన":171,"విధ":335,"విద":344,"వస్":529,"వహి":267,"సు ":228,"శము":175,"వ్య":1048,"వ్ర":129,"వ్వ":183,"షన్":215,"సంక":132,"శివ":165,"సంగ":237,"సంఘ":104,"సంవ":1416,"సంస":438,"సంబ":140,"సంప":159,"శాఖ":118,"శాన":154,"శాల":347,"శార":99,"శాస":530,"ల్ప":168,"ల్ల":3746,"ల్గ":136,"లేద":781,"వ్ ":114,"లేక":128,"లెం":508,"లోమ":231,"లోన":1869,"లోక":284,"లైన":312,"వంత":146,"వంట":147,"వంశ":116,"సం ":188,"వగా":101,"శ్ ":738,"ళ్ళ":500,"ళ్ల":102,"శంల":250,"వత్":1433,"సభ ":189,"ష్ ":119,"షి ":144,"వచ్":413},"n_words":[958347,1064684,731588],"name":"te"}
\ No newline at end of file
+++ /dev/null
-{"freq":{"D":2767,"E":2382,"F":1744,"G":2229,"A":4566,"B":2837,"C":4944,"L":2537,"M":3661,"N":2375,"O":1879,"H":2029,"I":2684,"J":1202,"K":1239,"U":982,"T":3940,"W":1428,"V":1153,"P":3623,"S":5025,"R":2458,"f":4548,"g":7700,"d":10407,"e":36793,"b":4586,"c":13011,"a":35754,"n":27098,"o":27360,"l":17507,"m":9775,"k":3233,"h":11517,"i":30062,"w":2927,"v":3191,"u":11836,"t":23508,"s":19258,"r":25409,"p":8180,"z":1147,"y":6708,"x":1329,"ี่ก":1219,"ี่ค":798,"ี่จ":1624,"ี่แ":1639,"ี้ย":1300,"ี่เ":7102,"ี่ไ":1804,"ี่ใ":2548,"ี่ท":1342,"ี่น":1525,"ี่ต":2048,"ี่ม":5943,"ี่ย":5902,"ี่ป":3449,"ี่พ":1219,"ี้จ":758,"ี่ส":5079,"ี่ร":1809,"ี่ห":1170,"ี่อ":2340,"ู่ ":1041,"ี้เ":1017," l":799," m":1059,"ึกษ":2877," o":2241," h":734," i":844," d":1129," e":750," f":963," a":1997," b":936," c":1950," t":2090," p":1738," s":2098," J":1111," K":1075," H":1671," I":1700," N":1646," O":1116," L":1847," M":2970," B":2345," C":3940," A":3318," F":1429," G":1940," D":1996," E":1607," S":3963," R":1952," P":2925," W":1239," V":824," T":2982,"ีปร":905,"ีนา":987,"ีวิ":1802,"ิ่ง":2648,"ีลั":1263,"ีรถ":1177,"ิโล":1164,"ีย์":1013,"ียบ":1082,"ียน":6852,"ียร":1356,"ียม":1021,"ียว":3305,"ียก":3983,"ิเม":917,"ียง":6904,"ิเว":1991,"ีฬา":1237,"ิ่น":1155,"ิ่ม":2587,"ิ้น":1155,"Co":1009,"Ch":760,"ือ ":11318,"ิทย":5393,"ิทธ":1425,"ินส":994,"ินแ":951,"ินเ":2159,"ินท":2179,"ิธี":1094,"ุด ":981,"Ma":1030,"ำแห":1099,"ิยม":1454,"ิมพ":1364,"ำเภ":2344,"ำเน":1700,"ีขน":839,"ีกา":2681,"ิภา":937,"ำให":1486,"ิร์":1529,"ิริ":860," ศ ":10355,"ีคว":2479,"ิย์":865,"ิยา":1638,"ิลป":1713,"ิวเ":1224,"Th":1304,"ิศา":1224,"ิสต":1802,"ิษั":1436,"ีชื":1806,"ิหา":1129," พ ":6930,"a ":4409,"ีที":1523,"i ":1629,"ge":1151,"he":3360,"ha":1753,"g ":1914,"ea":1472,"ec":1050,"ำตั":743,"ed":1616,"de":1807,"di":1292," ค ":3588,"h ":1481,"ำดั":807,"el":1781,"en":3747,"em":1263,"et":1470,"es":2908,"er":5983,"ca":1761,"e ":9681,"be":757,"da":1163,"f ":1782,"ct":1070,"co":1453,"ci":974,"ch":1872,"ce":1660,"ำนั":1183,"ำนา":1182,"c ":1262,"ำนว":1889,"ำบล":1639,"d ":3285,"at":4029,"as":1743,"ar":4079,"al":3985,"ai":1077,"ap":790,"am":1525,"an":5743,"ac":1436,"ad":868,"ag":1036,"ae":972,"nt":2832,"ns":1221,"no":1065,"ิกส":1152,"of":1717,"oc":793,"od":802,"om":1831,"on":5797,"ol":1972,"ot":991,"os":1228,"ou":1259,"op":973,"or":3612,"r ":3528,"pe":1355,"pa":819,"po":838,"ph":815,"lo":1494,"ll":1602,"o ":1832,"ma":1651,"me":1665,"mi":1301,"p ":991,"na":2372,"nc":1198,"nd":2063,"ne":2324,"ng":2747,"ni":2058,"าใน":1710,"m ":1949,"li":2285,"le":2488,"la":2669,"n ":6518,"ht":776,"hi":1241,"าแล":1356,"ho":994,"id":1548,"ic":3537,"ia":2041,"ig":855,"ie":1170,"k ":958,"ir":922,"is":2584,"it":2279,"il":1527,"าเร":879,"in":5078,"าเล":858,"io":2597,"าเป":2477,"ิกา":4876,"l ":3592,"ิงเ":1081,"ำว่":911,"y ":3290,"ve":1463,"ul":919,"ur":1530,"us":1967,"ut":915,"um":918,"un":1108,"ty":767,"tu":815,"tt":965,"ิงห":822,"to":1767,"ำหร":2147,"tr":1441,"te":3369,"ำหน":2147,"ti":3875,"th":2569,"ta":2328,"ss":1290,"st":2642,"so":847,"se":1374,"si":1814,"่":261149,"้":168823,"๊":2314,"๋":1031,"์":89548,"เ":290692,"rs":890,"แ":101902,"rt":1263,"โ":60108,"ใ":72132,"ไ":49049,"ๆ":5416,"ry":1040,"็":73338,"ro":2585,"ู":59743,"ุ":70364,"ื":81927,"ri":3400,"ึ":35498,"ี":202861,"ิ":162696,"re":2827,"ำ":45160,"า":458974,"rd":813,"ั":226972,"ะ":131072,"ฮ":6759,"ra":3541,"ฯ":856,"t 
":3621,"ฬ":1901,"ิชา":1303,"อ":306842,"ส":153652,"ห":123525,"ศ":53898,"ษ":33643,"ว":181464,"ฤ":7018,"ล":179449,"ย":192630,"ร":390285,"ภ":30986,"ม":229909,"ฝ":4270,"ผ":20687,"ฟ":16906,"พ":93163,"น":423152,"ธ":26604,"ป":143207,"บ":116540,"ต":148116,"ด":152617,"ท":165794,"ถ":31675,"ฑ":2084,"ฐ":9273,"ณ":25387,"ฒ":2670,"ญ":19027,"s ":7041,"ฎ":2682,"ฏ":3414,"จ":90844,"ฉ":5998,"ช":80847,"ซ":33709,"ค":113272,"ฆ":1276,"ง":265772,"ก":280773,"ข":76134,"ิดา":785,"ิดห":1109,"ิติ":1042,"ิตา":823,"ิตร":1253,"ิดเ":1452,"ิตย":978,"ิดต":796,"ิดจ":807,"ิดข":1153,"ิดก":963,"าพั":805,"ามก":1469,"ามค":978,"ำกั":1274,"ามป":1047,"ายค":1118,"ามร":1484,"ายก":2414,"ายข":1115,"ามห":1771,"ายช":1084,"ามส":2270,"ายท":1440,"ายถ":2265,"ามี":784,"ายต":803,"ามา":3948,"ายน":2971,"ารจ":1307,"ายว":810,"ามแ":1031,"ารค":1595,"ารก":1847,"ามเ":3279,"ารข":1167,"ายอ":907,"ายส":1187,"ายห":1214,"าบั":892,"าปร":879," ผ":2939," ฟ":1382," พ":16963," น":7737," ธ":1538," ป":9774," บ":5180," ต":11154," ด":4226," ท":12249," ถ":2578," ณ":758," จ":10643," ช":5003," ซ":8285," ค":16867," ก":16644," ข":5596,"าพร":989,"าพย":2192,"ิ์ ":838,"าที":3260,"านก":1639,"านข":1418,"านค":1839,"าทส":973,"านน":858,"านท":1970,"าธิ":1501,"านต":791,"าธา":1105,"านใ":891,"านเ":2380,"านแ":1122,"านห":886,"านอ":1049,"านส":931,"านี":3469,"านา":1107,"านั":986,"าณา":1725,"าดเ":977,"าตุ":736,"าติ":3417,"าตั":740,"าตร":1001,"าดใ":893,"ี้ ":1390,"ี่ ":11834,"าหล":992,"าหา":1172,"าอย":985,"าอา":792,"าอั":1363," ล":3504," ว":6837,"ั่ง":3130," ม":16388," ภ":4698," ร":8182," ย":2545," อ":16304,"าศา":1885," ฮ":1180," ศ":11904," ห":17474," ส":15924,"าศั":834,"าวเ":1521," ๆ":1601," ไ":6086," ใ":10228," โ":13479," แ":27651," เ":59355,"าวิ":2918,"าวอ":979,"าสา":971,"ั้น":6620,"าสน":1688,"ั้ม":1016,"ั่ว":2312,"าษา":8207,"าสต":5817,"ั่น":1240,"ั้ง":16337,"าร์":6532,"ารแ":3430,"ารใ":1567,"ารเ":5283,"ารี":821,"ำคั":1775,"ารา":3089,"ารอ":1573,"ารส":2796,"ารศ":1222,"ารว":1014,"ายใ":2848,"ารร":1717,"ายแ":1390,"ารย":1381,"ายเ":3128,"ารพ":1130,"ารป":2397,"ารน":1040,"ารบ":1445,"ารท":2651,"ารต":1493,"ารถ":2935,"ายา":887,"ารณ":2161,"ารด":1027,"าลี":979,"าลั":2863,"ากค":746,"ากก":2663,"ากน":1203,"ากท":1139,"ากฏ":908,"ากอ":816,"ากส":915,"ะเจ":2079,"ากล":1523,"ากร":2608,"ากม":872,"ะเก":848,"ับอ":925,"ับส":1520,"ับเ":2350,"ับแ":746,"ันด":1864,"ันต":2569,"ันท":6775,"ันน":1059,"ันธ":3528,"ับก":2418,"ับค":1139,"ันย":781,"ันว":1287,"ันส":772,"ันอ":2900,"ับท":1243,"ับป":1271,"ันใ":1276,"ับร":1129,"ันแ":786,"ันเ":2354,"ันไ":773,"ันก":1116,"ัตร":1718,"ัดเ":1458,"ัตถ":1084,"ัดอ":823,"ัดส":1175,"ัติ":3172,"ัตว":1677,"ัดก":1357,"ัณฑ":1093," ๆ ":1525,"ัฐอ":1242,"ัฒน":2389,"ึง ":1975,"ีย ":3548,"าชน":1229,"าชว":1452,"าชอ":929,"าชิ":1510,"าชา":1285,"าชก":821,"ัวเ":1510,"ัศน":1087,"าจั":1830,"าจา":3547,"างๆ":1330,"ัวอ":898,"างเ":3767,"างไ":829,"ัวล":1090,"างใ":893,"างแ":1340,"างอ":972,"างส":1811,"างห":1367,"างว":1756,"างม":850,"างร":1234,"างป":1308,"างต":1708,"างท":1838,"างด":961,"างช":741,"างจ":1536,"างค":2031,"างข":1895,"างก":3719,"ัลบ":846,"ะได":777,"าคา":932,"าคม":4556,"าขา":1162,"ะเล":1717,"ากเ":1543,"ะเภ":1470,"าขอ":2080,"ีน ":1380,"ะเท":8450,"ากั":1026,"ากา":3774,"ัมพ":821,"ะเป":3470,"ะทร":812,"ะทา":1011,"ะที":1426,"ะทำ":752," เก":2484," เข":1752," เค":1359," เจ":1771," เช":2450," เซ":916," เด":1861," เท":1287," เน":1184," เป":26794," เพ":2641,"ะบบ":2022,"ะนา":1109,"ะปร":1082,"ะบา":1151,"ิต ":1048,"ิด ":1226,"ิง ":1118,"ะดั":1976,"ะดู":960,"ีก ":1051,"ะวั":4431," อย":1707,"ังเ":1838," อำ":1631," อิ":974," อั":2991," อา":2413,"ะสา":867," ออ":1734,"ิม ":803," วั":2877," 
วิ":1583,"ะอง":1516,"ัญญ":1369,"ะหว":2921,"ัญช":920," สถ":1382," สม":1526," หน":1504," สุ":760," สำ":1162," สา":2037," สิ":1143," สั":1226," ส่":1561," หร":10471," หล":1146," หม":2524,"ักด":1151," รว":890,"ักษ":6831," ระ":1251," รั":1312,"ักร":4702," รา":1609,"ะพุ":1098,"ะพั":826,"ักจ":822,"ักก":1221,"ะมา":2026,"ะมี":2632,"ะยั":766,"ะยะ":834,"ิน ":3149,"ักเ":1448,"ะยา":744," ภา":4274,"ะรา":2815," มี":8842," มั":898," มา":1778," มิ":1110,"ัจจ":2448,"ังส":1789,"ังห":4456," มห":1285,"ังก":5613,"ังค":1611,"ังจ":853,"アアア":748," พร":4751," พฤ":1063," ผู":1667," ปล":779," ปร":4492," ปี":1463," ปั":1211," บร":1264," บา":1052," นิ":1151," นั":2273," นา":1380," ทร":1302," ที":5982," ทำ":956," ทั":838," ทา":1100," ต่":814,"าล ":975," ถน":741," ตา":1753," ตั":4204," ตำ":1120," ตร":740,"าร ":2963,"าว ":2585,"ิก ":1504,"ะจั":762,"ะจำ":1148," ชา":799," ชื":1329," ซึ":6040," จำ":738," จา":1386," จั":3956," จะ":808,"ะชา":1877," คื":4533," คำ":751,"ะกอ":2553," คว":1268," ขอ":3435," ก่":1078," คร":1482," คน":1274,"ะคร":1889," กิ":1156," กั":1618," กา":4296," กล":1472," กร":3165,"ะกู":837,"ะกา":2005,"เคร":3765,"เคย":899,"เข้":3046,"เคล":946,"เกี":2538,"เกา":1880,"เกิ":4857,"เขต":2529,"เกล":1212,"เกม":1559,"เกร":925,"เขี":2819,"เขา":2400,"เก็":770,"ใน ":1513,"เซี":1618,"เจ้":6297,"เฉี":1063,"เชื":1779,"เชิ":917,"เชี":2100,"เช่":2859,"เซล":810,"เงิ":900,"เฉพ":1150,"ู้เ":1092,"ู่ห":1111,"ู้จ":1153,"ู่ท":2435,"ู่บ":1069,"ู่ใ":3858,"ู่เ":814,"ู้ท":742,"ア":1141,"ุ่น":2796,"ุ่ม":4128,"ุ่ง":863,"ูมิ":1515,"ูปแ":1059,"ูนย":864,"ื่อ":28338,"ุษย":1008,"ื่น":2592,"ื้อ":3123,"ื้น":2966,"ุรี":1349,"ุริ":781,"ุทร":864,"ุทธ":2795,"ุบั":2386,"ือเ":2326,"ือท":952,"ือน":2586,"ือส":1106,"ืออ":845,"ือว":1045,"ุดท":873,"ุดใ":860,"ุตร":863,"ุงเ":1324,"ึ่ง":15334,"ึ้น":5621,"ือด":804,"ือง":6868,"ือข":1238,"ือก":2818,"ุคค":864,"้า ":2891,"็กซ":748,"็จพ":2426,"็นเ":5516,"็นแ":1460,"็นโ":1337,"็นก":3439,"็นช":1716,"็นค":1866,"็นจ":1055,"็นผ":1694,"็นบ":1033,"็นป":1559,"็นภ":2128,"็นม":771,"็นพ":2607,"็นต":2314,"็นน":1867,"็นท":2718,"็นอ":2614,"็นห":2383,"็นส":4810,"็นร":2205,"็นว":1944,"่ขอ":746,"่งช":1246,"่งข":3362,"่งก":1222,"่งป":1000,"่งท":1841,"่งต":1001,"่งแ":1232,"่งเ":5701,"่งใ":2695,"่จะ":1049,"่งส":1203,"่งห":867,"่งอ":1487,"่งม":1795,"่งร":800,"็อก":867,"่ตั":1043,"่ที":2658,"่ทา":851,"่นด":850,"่นเ":1175,"่น้":1389,"่ปร":1522,"่ปุ":1738,"ไทย":5121,"ไม้":1526,"ไม่":4922,"่น ":4848,"ไฟฟ":846,"一":1792,"ไว้":1146,"่อ ":2927,"้ง ":1525,"่า ":8977,"้น ":3595,"้ว ":1138,"โบร":910,"โปร":1570,"ใกล":918,"โดย":14241,"โทร":1350,"ใช้":8306,"โลเ":1118,"โลก":2916,"โรม":847,"โรง":1846,"็ก ":901,"็น ":2078,"ใต้":3035,"ในช":1757,"ในต":1018,"ในค":959,"ในก":4474,"ในส":3229,"ในห":971,"ในอ":1785,"ในว":1980,"ในป":5976,"ในบ":1100,"ในท":1717,"ในร":2523,"ในพ":1416,"ในภ":1762,"ในแ":1213,"ในเ":3488,"ให้":6947,"่ง ":3045,"ได้":12204,"ใหม":1526,"ใหญ":3818,"เป็":52874,"เบิ":951,"เบี":820,"เปิ":978,"เปล":1464,"เท่":1160,"เธอ":1279,"เนิ":1467,"เนื":3043,"เนี":1468,"เต็":792,"เทค":856,"เทอ":765,"เทศ":9046,"เที":1082,"เทพ":2509,"เดอ":1340,"เดี":3292,"เดิ":2706,"เดื":969,"เตอ":2325,"เด็":3070,"เอ็":1297,"เอเ":1135,"เอง":1028,"เห็":854,"เอก":1715,"เหล":3579,"เส้":1396,"เหม":1208,"เสี":3555,"เหต":864,"เหน":2585,"เสร":829,"เศษ":814,"เศส":1255,"เวี":1017,"เวล":1640,"เล็":2263,"เล่":1978,"เวณ":1738,"เลี":1467,"เลื":1233,"เร็":978,"ไป 
":836,"เลข":1395,"เริ":2046,"เรี":7952,"เรื":4783,"เยอ":752,"แข่":1582,"เมร":2010,"แก่":1962,"แก้":737,"เมื":12490,"แกร":866,"เภอ":2333,"เมต":1938,"เภท":1385,"เพล":3180,"เพร":748,"เพี":982,"เพื":3833,"แพท":735,"แผ่":1298,"แนว":1987,"แบบ":4625,"แปล":1818,"แบ่":978,"แดน":987,"แตก":800,"แต่":6839,"แทน":1132,"แดง":995,"แอน":801,"แสด":2090,"แห่":6124,"แหล":799,"แหน":1000,"และ":28762,"แลน":1347,"แล้":2088,"แยก":1197,"แรก":3318,"โคร":1607,"แม่":2248,"แรง":831," แล":16653," แม":1055," โค":742," แอ":790," แห":1041," โด":7452," เว":860," เล":918," เร":2527," เม":4395," เอ":1986," เส":1106," แต":2750," ได":1910," ไม":1165," โร":1135," ใช":876," ใน":8170,"์ใน":1573,"์เน":739,"์เป":837,"์เร":776,"์แล":1851,"์ขอ":1461,"์กา":933,"้ใน":1845,"์ที":2950,"์ตู":1145,"์ปร":767,"้นม":919,"้นส":1040,"้นต":741,"้นท":2678,"้นก":769,"้ที":841,"้ปร":770,"้นไ":757,"้นใ":1235,"้นเ":2203,"่าเ":2240,"アア":942,"่าน":3867,"่าว":1950,"่าย":3313,"่าก":964,"่าง":12952,"่สุ":2074,"่สา":868,"่อก":1359,"่อข":925,"่อม":2720,"่อย":2839,"่อท":791,"่อต":2073,"่อป":1191,"่อน":4061,"่อง":9204,"่อเ":3452,"่อใ":1119,"่ออ":896,"่อว":3681,"่อส":1953,"่วน":5177,"้งอ":3416,"้งส":845,"้งห":1146,"่วง":2102,"้จะ":789,"้จั":1225,"่วม":1638,"้งเ":1096,"่วย":1812,"้งแ":3344,"่วไ":1216,"่ยว":2420,"่ยม":1269,"่รู":752,"้งข":959,"้งท":1116,"่ละ":738,"้กั":1036,"่มา":795,"่ยน":1164,"่มี":6169,"้เป":2320,"้แก":1074,"้เก":745,"้าอ":1154,"้าส":795,"้าห":1044,"้าแ":781,"้าเ":1817,"้าท":1477,"้าร":977,"้าย":2779,"้าม":1703,"้าว":1169,"้าน":6510,"้าง":6437,"้อน":1455,"้อย":2449,"้อม":2269,"้อง":6049,"้ว่":734,"้วย":5521,"่ให":733,"่ได":1564,"่ไม":926,"่ใช":1862,"่ใน":4343,"้รั":2515,"่เก":1904,"่เข":763,"่เร":861,"่เป":2235,"้มี":1570,"่เห":1028," Ma":1010," Ch":742," Co":992," Th":1261," of":1521," co":752," th":1128,"�":5212,"กา ":1384,"กร ":1188,"ก่ ":760,"一一":774,"ของ":31894,"ก่อ":3663,"ขัน":1726,"ขาย":803,"ขาว":1067,"กี่":1664,"กีย":867,"กิโ":1154,"ค์ ":769,"กีฬ":1239,"กิน":769,"กิด":3855,"กิจ":1573,"กำห":759,"กูล":885,"กุล":1306,"กุม":773,"ขนา":2496,"กใน":2081,"กแล":866,"กแบ":824,"กเร":871,"กเม":773,"กเป":1534,"กเฉ":1039,"กเข":1098,"ข่ง":1611,"ข่า":854,"ข้อ":2417,"ข้า":4839,"คณะ":1457,"ขีย":2776,"ขึ้":5628,"คคล":827,"กจะ":954,"กจา":1248,"กคร":1721,"กกา":2113,"กกั":1063,"กกว":874,"คน ":1108,"กขอ":2098,"กซ์":975,"คร ":1077,"คม 
":4347,"กล้":2502,"กล่":1300,"กษร":1944,"กว้":773,"กว่":3702,"กษณ":3331,"กษั":896,"กษา":3668,"กรม":2005,"กรณ":1173,"กรา":2022,"กรี":1057,"กรุ":2471,"กระ":5410,"กรร":3527,"กรว":929,"กฤษ":3489,"กลุ":3431,"กลั":865,"กลา":3035,"กัด":1452,"กัน":7783,"กับ":9982,"กาศ":2249,"กาล":1237,"การ":39240,"กาย":2168,"กาะ":1419,"กษ์":1004,"กอง":1215,"กอบ":2433,"กอา":822,"กที":2316,"กต่":822,"กติ":1015,"The":934,"กตั":798,"กดิ":842,"กมา":1372,"กรก":1149,"กภา":827,"กปร":1145,"งค์":4532,"งงา":824,"งจั":1172,"งจา":3155,"งชา":2378,"งกร":1669,"งกฤ":3370,"งกล":1847,"งกา":5753,"งกั":3274,"งขึ":1653,"งขั":1578,"งขอ":4624,"งข้":745,"งคว":1083,"งคล":878,"งคร":2365,"งต่":740,"งตั":1510,"งตะ":916,"งนั":904,"งที":5394,"งทา":994,"งทั":1042,"งปร":4913,"งนี":1331,"งผู":776,"งปี":968,"งด้":1068,"งสม":789,"งศ์":3006,"จจุ":2329,"งวั":2363,"งอย":4387,"งออ":847,"งอั":937,"งอา":1391,"งหน":1700,"งสุ":772,"งสื":1073,"งสิ":892,"งหม":1633,"งสร":855,"งสั":912,"งสา":1650,"งหล":2008,"งหว":4124,"งหา":876,"งภา":1562,"งมี":2826,"งมา":1681,"งพร":2199,"งรา":1862,"งรั":1304,"งระ":937,"จพร":2446,"งาน":5529,"งิน":899,"คลอ":868,"คลื":949,"คล้":1302,"ควา":10480,"ครง":1144,"ครอ":2563,"ครั":3903,"ครา":1915,"คริ":2019,"ครี":810,"ครื":2934,"คุณ":1069,"คือ":7513,"คาร":1456,"คัญ":1773,"คิด":898,"คำว":930,"คอม":1178,"งๆ ":1126,"al ":2103,"ค่า":1374,"and":1056,"an ":1298,"คู่":872,"ค์เ":768,"ค์ก":947,"ati":1494,"ค้า":1139,"ชวง":936,"ชอา":798,"ชาว":3493,"ชาย":1746,"ชาต":3190,"ญ่ ":976,"ชาช":786,"ชัย":783,"ชาก":1050,"ชัน":900,"ชิง":1090,"ชิก":963,"ชั้":1496,"ชั่":737,"ชีย":1882,"ชีว":1797,"ชุด":1385,"ชุม":779,"ชื่":9015,"ช่ว":2205,"ช่อ":1132,"ช่น":2842,"ซอร":818,"ช้ใ":1180,"ช้เ":1405,"ซีย":1624,"ซึ่":8007,"งไม":762,"งไป":735,"งได":1184,"งให":1499,"งเห":1953,"งเส":869,"งแต":2125,"งเพ":1132,"งเป":5244,"งเศ":1374,"งเล":753,"งเร":2306,"งเม":1299,"งเท":2214,"งเด":1083,"งเก":2093,"งเข":1000,"งเค":853,"จริ":1485,"งใต":746,"งใน":4860,"งโด":1508,"งแร":2003,"งแล":1588,"จอม":844,"จัก":4523,"จะม":1426,"จัง":4135,"จึง":1351,"ซ์ ":949,"จีน":2020,"จุบ":2333,"จุด":1489,"จัน":795,"จาก":13356,"จะเ":1881,"จัด":3836,"จาร":1224,"จำน":1462,"จิต":1255,"ฉพา":1147,"ชกา":919,"จ้า":6405,"ชนิ":2672,"ชนะ":817,"ฉีย":1060,"is ":767,"ion":2025,"ญิง":1134,"ญี่":1736,"ญญา":1273,"he ":1580,"ฐอเ":1010,"ดย ":1795,"ic ":1027,"ia ":801,"ica":989,"ine":780,"ing":1208,"in ":877,"ต้ ":1080,"ติ ":2277,"er ":1893,"es ":1392,"ด้ ":1252,"ด์ ":1553,"ent":1216,"ฒนา":1571,"ตร ":2238,"ดา ":815,"ดี ":1229,"ฐาน":2388,"ณ์ ":911,"ดีต":1119,"ดำเ":759,"ดิม":1868,"ตถุ":806,"ดิ์":821,"ท์ ":857,"ดีย":3255,"ดูก":1062,"ดือ":962,"ดับ":3761,"ดัง":1352,"ดาร":853,"ดาว":2014,"ดิน":2900,"ดหน":1239,"ดยก":793,"ดยเ":1721,"ดยม":1638,"ดยส":889,"ดยอ":756,"ดยท":955,"ดนต":1392,"ดที":1456,"ดตั":945,"ed ":825,"ทศ ":1067,"ณาจ":1453,"ณะเ":897,"ทย ":2718,"ดจา":942,"ดขึ":1374,"ดขอ":1317,"ดกั":890,"ดกา":1277,"ต์ ":1632,"ุ ":1109,"นี ":1305,"นา ":1502,"ี ":16016,"ิ ":4510,"ำ ":2043,"re ":749,"า ":25017,"ต้น":2996,"ต้อ":1614,"ต่ง":1112,"ต่อ":4486,"ต่า":3505,"ต่ล":740,"ู ":1781,"นๆ ":762,"่ ":16723,"้ ":5526,"ต์ศ":958,"ๆ ":4266,"ry ":820,"ถาป":753,"ถาน":3944,"ทธศ":829,"ถือ":1350,"ทธิ":2039,"ถูก":2600,"ถึง":6398,"น์ ":1653,"ถิ่":814,"์ ":28985,"ดเม":981,"ดเล":1139,"ดเป":846,"ตรง":1229,"ตรก":784,"ดใน":1781,"ตรี":2898,"ตริ":1006,"ตรา":1575,"ตระ":1105,"ตร์":7354,"ดให":1327,"ตวร":760,"ด็จ":2802,"ตว์":1614,"ด้แ":1081,"ด้เ":1187,"ด้า":3056,"ด้ว":5447,"ด้ร":2493,"ตอร":2348,"ตอน":1918,"ตัว":7601,"ตะว":2896,"ติก":1508,"ติน":757,"ติด":1536,"ตั้":8579,"ตาล":1104,"ตาร":1322,"ตาม":4567,"ตำบ":1639,"ธ์ 
":976,"ติเ":954,"ติแ":866,"ติศ":1101,"ตำแ":988,"ถนน":2048,"ตุล":748,"นะ ":754,"ตูน":1178,"นขอ":3384,"นข้":932,"นคว":826,"ธรร":2826,"นคร":3912,"นกร":1617,"นกล":2158,"นกา":7632,"นกั":1355,"นฐา":769,"นช่":1290,"ท้อ":1115,"นชื":1549,"นชา":864,"ท่า":2380,"ท้า":1066,"นจา":1033,"นจั":1141,"นจะ":747,"ข ":896,"นธร":779,"นธุ":1127,"นทร":2489,"นทั":791,"นทา":2750,"นที":13368,"นต้":1225,"นถึ":809,"นต์":1445,"ธิ์":953,"นด้":1065,"ก ":14487,"นตร":4843,"นติ":835,"นตั":2147,"นตา":888,"นด์":1948,"ธิก":846,"นตก":1337,"นดั":1314,"นดิ":1223,"ธาร":1184,"ง ":27727,"ค ":5823,"ธาน":978,"นภา":4234,"บกา":2494,"ช ":2112,"นพื":1006,"จ ":1697,"นพร":3282,"นผู":1561,"นปี":3086,"นปร":4565,"นปั":802,"นบุ":863,"นบร":1204,"ธุ์":901,"นนิ":1044,"นนี":1085,"นนา":1321,"นนั":1485,"ญ ":928,"นธ์":2163,"ทรง":2910,"ทยา":5565,"ทรท":914,"ทรา":963,"ณ ":3143,"ทร์":996,"ถไฟ":2170,"ทพม":811,"ต ":4123,"ด ":8336,"ท ":2444,"น ":40656,"ทสม":792,"ทหา":866,"ป ":2071,"ทอง":1146,"บ ":5246,"ทอร":781,"ทศอ":828,"ฟ ":941,"ทวี":828,"พ ":8598,"ทศไ":1737,"ทศเ":766,"ม ":15299,"ทิน":1259,"ทั่":2123,"ทั้":3993,"ย ":16564,"ร ":10052,"ทำใ":1466,"ทิศ":1076,"ล ":7227,"ว ":7449,"ทัศ":1092,"ทาง":10997,"ทะเ":1496,"ทัพ":825,"ทาน":903,"ศ ":12299,"ษ ":3178,"ส ":5552,"ปี ":4823,"ทุก":1239,"อ ":17206,"ที่":56152,"ะ ":8344,"ปฏิ":1862,"บอล":946,"ng ":1471,"บิน":950,"ne ":979,"บาล":1707,"บั้":776,"บาง":2680,"บัน":3828,"บาท":1409,"nd ":964,"บัต":1362,"บัญ":812,"บวน":840,"นไป":997,"นไม":765,"นให":1732,"นได":1498,"บรา":1706,"บริ":5195,"นใน":4356,"น้ำ":5368,"น้า":3676,"น้อ":1479,"น่า":1206,"น่ว":1000,"น่ง":989,"ปรา":2156,"ประ":25303,"ปริ":937,"nt ":807,"บุค":937,"บีย":817,"บุร":1542,"นหม":998,"นสั":1468,"นสา":1759,"นสำ":796,"นสี":845,"นหน":3532,"นสุ":1261,"นหล":1601,"นส่":1409,"นสถ":1319,"นสม":1639,"นอย":1426,"นอั":1719,"นอา":1493,"นออ":2019,"นอก":1327,"นอง":1083,"นอิ":1016,"of ":1513,"นับ":1179,"นาง":1305,"นาค":1491,"นาด":2642,"นัง":1602,"นัก":6595,"บคว":838,"นยา":1119,"นยุ":813,"นมา":2249,"นมี":1207,"บขอ":813,"นรู":1331,"นรา":1843,"นระ":2096,"นรั":1125,"นย์":888,"นวน":2013,"นวง":1415,"นว่":1138,"นวา":814,"นวิ":1095,"นวั":1960,"บปร":1255,"บบเ":871,"ปกค":1297,"นเค":1189,"นเก":1640,"นเข":1425,"นเจ":957,"นเด":1933,"นเป":2635,"นเพ":1597,"นเท":2022,"นเร":1822,"นเม":2536,"นเส":1392,"นเว":905,"นเอ":1413,"นเห":886,"นแบ":783,"นแร":793,"นแล":1965,"นแห":1112,"นโด":2101,"บรม":1009,"บรร":1513,"นโล":1115,"นั้":3921,"นาม":2497,"นาย":2144,"นาน":1674,"นิน":1076,"นิด":3216,"นิก":1334,"นิว":770,"บด้":1147,"นิย":2396,"นีย":1820,"นีร":1196,"on ":2708,"บที":1359,"นี้":6260,"นือ":2427,"นึ่":6892,"นื้":1785,"นื่":1254,"นุษ":996,"le ":881,"ป่า":900,"ป็น":52765,"ผ่น":1156,"ฝรั":1300,"ผู้":7889,"ปิด":1097,"ปาก":1194,"ปัจ":2459,"บ้า":2101,"ปลา":2984,"ปลี":1161,"บ่ง":1079,"ผลง":1088,"ผลิ":1382,"ปแบ":1026,"ปุ่":1831,"พรร":1714,"พยา":839,"พยน":2010,"พมห":762,"ยน ":4029,"มี ":1149,"มา ":896,"ฝ่า":781,"ยม ":948,"พลั":814,"พวก":739,"พลง":2750,"พระ":17707,"พรา":866,"ยง ":1443,"ผ่า":1411,"ฟฟ้":828,"รม ":1362,"พัฒ":1494,"ม่ ":849,"พัน":3666,"พาะ":1326,"ยว ":1115,"พื่":3826,"พูด":1126,"พื้":2671,"ยา ":1376,"พิธ":838,"พิม":1132,"พิว":738,"พีย":958,"พุท":2067,"มขอ":1433,"มกร":823,"มกั":1433,"มกา":1276,"ฟ้า":1943,"มชา":782,"ลก ":1247,"ภอเ":800,"ลง ":792,"ย์ ":2750,"ระ ":742,"รี ":2785,"รา ":766,"ยขอ":1509,"มมา":901,"มริ":2047,"มรา":849,"ยงเ":962,"ยงใ":1176,"ยชน":801,"มหม":966,"มอง":738,"มหา":5731,"มาช":1046,"มาจ":2680,"มัย":2145,"มาก":4211,"มัน":2325,"มัก":1644,"ภาค":3301,"ฤษ ":2433,"ภาพ":5887,"ภาย":1934,"ภาษ":8339,"มตร":1922,"ร์ 
":8527,"มทั":835,"มที":1983,"มนต":905,"มนุ":1036,"ภูม":1417,"มปร":865,"มพิ":882,"มพร":946,"มพ์":1080,"ยกา":2466,"มภา":858,"ยกั":787,"ยกว":1726,"ยกร":1291,"ยาน":1202,"ยาล":2584,"ยาว":2212,"ยาม":807,"ยาย":2092,"รณร":750,"ยาศ":1408,"รณ์":2303,"ยัง":2948,"ยาก":955,"วง ":1110,"ยอด":1124,"ยอร":940,"ยอย":754,"ม่เ":922,"ม่อ":742,"ยสา":1028,"รจั":932,"ม่น":1356,"ม่ม":1196,"ยวก":2166,"รงเ":2099,"มใน":892,"รงก":1367,"รกเ":741,"มเห":813,"รขอ":1042,"ยมี":1590,"รกร":1251,"มเก":770,"มเด":3008,"มเป":1110,"ลี ":1049,"ยปร":756,"มู่":1176,"ลา ":1161,"มูล":1423,"มื่":6537,"ละ ":3400,"มุท":857,"มือ":8088,"ยนต":2593,"ยที":1815,"ยทั":824,"ยถึ":2220,"มีเ":1932,"มีอ":1345,"มีล":1533,"มีห":1060,"มีส":1890,"มีผ":1122,"มีพ":1283,"มีร":1326,"มีท":746,"มีน":1210,"มีป":899,"มีค":3125,"มีช":2478,"มีข":1119,"มีก":2644,"มาเ":1480,"มาร":4462,"มาย":5367,"มาต":1157,"มาณ":1914,"ราบ":751,"ราน":740,"ราณ":897,"ราว":1444,"รั่":1334,"รั้":3560,"ราม":1910,"ราย":2763,"ราะ":1374,"ริญ":758,"ริก":3434,"ริง":1138,"ริน":744,"ริษ":1438,"ริส":1925,"ริม":1124,"ริย":2233,"ริเ":2100,"รีส":866,"ริ่":2037,"รีย":9363,"ระก":4332,"ระจ":2540,"ระช":2359,"ระด":3552,"ระน":1422,"ระบ":4735,"ระท":2215,"ระธ":815,"ระย":1763,"ระม":3043,"ระร":2751,"ระพ":1841,"รัก":1669,"ระส":2582,"รัช":1049,"ระห":3447,"ระว":1743,"รัฐ":5336,"ระอ":2225,"รับ":6837,"ระโ":884,"ราค":879,"ระเ":13435,"ราก":1663,"รัส":1059,"ราช":9495,"ราจ":779,"ราง":1497,"ย่า":4021,"ย่อ":1669,"วย ":1162,"รอง":3048,"รอบ":1542,"รอน":750,"รวม":2782,"รวง":825,"รษท":797,"รศึ":1157,"้ใ":2847,"้ไ":1284,"้แ":2652,"้เ":7277,"ยแล":953,"รรค":1151,"้ำ":5903,"้า":37643,"รรด":1433,"้อ":17505,"รรณ":1404,"้ห":1048,"้ส":2215,"์จ":1166,"์ช":1393,"วน ":1214,"์ค":1626,"์ก":3825,"์ข":1876,"รมั":970,"์ว":1155,"์ล":906,"์ส":3129,"์ห":969,"์ศ":1201,"์พ":863,"์ร":1348,"์ม":2202,"์ท":4622,"์ต":2998,"์ด":2081,"์ป":1557,"์บ":1254,"์น":1942,"์ไ":1599,"รรษ":1352,"รย์":876,"รรม":6124,"์เ":8295,"์แ":4502,"์โ":1808,"์ใ":2084,"ยใน":2770,"์อ":1673,"รพร":845,"ยเป":1383,"แอ":2776,"แส":3385,"โซ":1292,"แห":8308,"แล":33169,"แว":982,"โจ":1094,"แย":1484,"แม":4364,"โค":3872,"แร":4857,"โพ":1125,"ยุโ":733,"ใก":924,"โม":2045,"รปก":742,"โบ":1679,"โป":2206,"โท":2397,"โน":2242,"โด":16368,"โต":2472,"โอ":4042,"ใช":8587,"ใจ":1545,"โล":5940,"โร":5880,"โย":1454,"ไข":762,"ไก":791,"ใน":43294,"ใบ":873,"ใต":3040,"ใด":738,"ให":12321,"ไซ":1010,"ไฟ":3494,"ไม":7091,"ไร":1255,"ได":12875,"ไต":1235,"ไท":6054,"ไป":6850,"ไอ":1157,"ไว":1681,"ไล":879,"ยู่":11314,"ไห":1089,"รปร":1489,"็ก":4152,"็ง":850,"็จ":3701,"็ต":1012,"็ด":1111,"็บ":1365,"รดิ":1521,"็น":54742,"่ง":35546,"่ค":1361,"็ม":1660,"่ข":1323,"่ก":2880,"่จ":2216,"่ช":990,"็อ":1898,"่ด":1050,"่ต":3007,"่ถ":906,"่ท":5172,"่น":17313,"รที":1499,"่ป":4265,"่บ":1847,"่ผ":743,"่พ":1708,"่ม":15356,"้ข":1096,"้ก":2888,"่ร":2826,"้ค":1674,"่ย":6368,"่ล":1168,"ยุค":1712,"้จ":2773,"่ว":14132,"้ง":18710,"่ส":6307,"่ห":2814,"รถไ":2194,"้ช":1217,"่อ":46408,"่า":42021,"้ด":1042,"รทำ":739,"้ท":1936,"้ต":1075,"รทั":1036,"้ป":1226,"้บ":1138,"้น":24081,"๊ก":920,"้พ":1364,"่ใ":7390,"้ร":3437,"่โ":798,"่แ":2495,"้ย":1840,"่เ":10182,"้ม":4709,"้ว":10098,"่ไ":2895,"้ล":760,"เอ":9583,"เฮ":1007,"ศส 
":746,"เศ":2656,"เส":8693,"เห":10183,"เล":12934,"เว":8237,"แก":4707,"เภ":3987,"เม":21131,"แข":3088,"เย":2348,"เร":18471,"แค":2187,"แพ":2260,"โก":2046,"แบ":6575,"แน":2938,"แผ":2002,"แป":2594,"แถ":810,"แต":8070,"แท":1899,"แด":2231,"เช":8753,"เซ":6021,"เจ":10004,"เฉ":2866,"เง":1146,"เค":8385,"เข":11528,"เก":16975,"เฟ":1048,"เพ":11355,"เผ":802,"เป":58033,"เบ":4728,"เน":8994,"เธ":1416,"เท":18401,"เต":6088,"เด":13846,"ลที":1395,"ูเ":2044,"ู่":14654,"ู้":10588,"ูป":4480,"ูน":2921,"ูร":1466,"ุโ":1437,"ูม":1757,"ุเ":849,"ูล":3073,"ุ่":7901,"ุ้":931,"ุ์":914,"ูต":1281,"ร่า":1627,"ูด":1928,"ร่ว":1692,"ื้":6123,"ุส":1052,"ื่":31115,"ุษ":1254,"ูง":2041,"ุล":3040,"ุร":4653,"ูก":5828,"ุม":4112,"ุบ":2913,"ุป":1128,"ุธ":955,"ุน":3561,"ร้า":4429,"ุต":2413,"ุด":8113,"ุท":4856,"ุณ":1621,"ุญ":872,"ือ":41365,"ร้อ":2730,"ึ้":5783,"ึ่":15365,"ุค":2816,"ุง":2994,"ุก":3888,"ุข":1229,"ืน":1145,"ี่":66161,"ี้":8515,"ีเ":7027,"ีใ":1123,"ีแ":2432,"ีโ":1682,"ีอ":2953,"ิ์":1865,"ึง":8316,"ีล":2237,"ีว":3304,"ีส":4174,"ิ้":2507,"ิ่":6446,"ีฬ":1240,"ีห":1469,"ีพ":2697,"ึก":5012,"ิเ":7222,"ีม":1954,"ิแ":1214,"ีย":34404,"ิโ":3549,"ีร":4629,"ีต":2652,"ีท":2505,"ีน":5775,"ีบ":2030,"ีป":2156,"ีผ":1219,"ิส":6204,"ิห":1497,"ิศ":3177,"ิษ":2523,"ีช":3206,"ีจ":1149,"ิว":4515,"ร์ส":1173,"ำใ":1865,"ิล":5788,"ำไ":839,"ีด":1673,"ิอ":948,"ิป":1641,"ร์เ":3538,"ิบ":2835,"ิน":18424,"ิธ":2036,"ิท":9171,"ิต":10282,"ิด":13442,"ิร":3582,"ีค":3922,"ำแ":2282,"ิย":5047,"ีข":2272,"ำเ":6458,"ิม":7019,"ีก":7455,"ิภ":1128,"ิพ":1917,"ร์แ":1733,"ร์โ":778,"ำว":1118,"ิจ":3243,"ิช":2649,"ำส":738,"ิค":1757,"าโ":3139,"ำร":1745,"าใ":3181,"ำล":1253,"าไ":2775,"ิง":7259,"ำห":4498,"ิญ":1127,"ำท":973,"ำบ":1971,"ำน":4890,"ำด":1025,"ำต":1664,"ิก":12850,"าแ":4055,"าเ":12583,"ำม":1274,"ำป":747,"ำพ":760,"าล":10057,"าย":37411,"ำค":2190,"าร":63897,"าศ":5622,"าษ":8889,"ั่":6714,"าว":19634,"ำจ":874,"าอ":4975,"าส":13760,"ั้":24167,"าห":5552,"าะ":4602,"าต":8268,"าด":6631,"าณ":5371,"าน":35220,"าธ":2890,"าท":8396,"าถ":784,"าผ":775,"ร์ก":1498,"าป":2833,"าบ":4733,"าม":30146,"ำก":2269,"าภ":1081,"าฟ":1063,"าพ":8053,"ะเ":24505,"าข":3951,"ัม":2974,"ะแ":2571,"ัย":8749,"าค":10848,"ะโ":2150,"ะใ":1602,"ะไ":1960,"ัล":3755,"าง":41529,"ัว":11006,"าจ":10067,"ัศ":1321,"าช":13729,"ัส":4036,"าซ":1724,"ร์ด":1551,"าญ":933,"ร์ต":2154,"ร์ท":1160,"ัฒ":2408,"ัด":14375,"ัณ":1273,"ัต":9709,"ัท":2203,"ร์ม":749,"ับ":25368,"ัน":39721,"ัพ":2281,"าก":31704,"ฮั":845,"ฮิ":902,"ฮา":872,"ะช":3044,"ะจ":3682,"ะข":1220,"ะก":7907,"ะค":3630,"ะธ":897,"ะน":3483,"ะบ":5762,"ะป":1992,"ะพ":3693,"ะด":4349,"ะต":2884,"ะถ":831,"ะท":5514,"ัช":1701,"รเล":875,"ะห":5596,"ะส":5803,"ะอ":4641,"ัญ":5583,"ัฐ":5386,"ะม":6798,"ัก":25499,"ะภ":863,"ะร":4376,"ัค":775,"ะย":3056,"รเม":755,"ะล":1194,"รแข":1253,"ัจ":2657,"ะว":5659,"ัง":23502,"อป":2561,"รเป":876,"อน":18187,"อบ":7490,"อท":2655,"อธ":1085,"อต":3868,"อถ":749,"ฬา":1653,"อด":5925,"ห์":1891,"อห":2050,"อส":6091,"อว":5778,"อล":4526,"อร":15873,"อย":21178,"อม":9516,"อภ":865,"อฟ":1564,"อพ":1610,"อี":4000,"อื":1874,"อุ":3605,"อั":11299,"อา":13455,"อำ":3262,"อิ":6605,"ออ":10729,"อะ":2163,"ฮอ":754,"อ่":1463,"อ็":1408,"อโ":1451,"อแ":1805,"อไ":1254,"อใ":1838,"อเ":12919,"ษ์":1229,"สอ":3466,"หญ":5078,"us 
":1226,"สห":2331,"ลงา":1081,"สั":9793,"สะ":1590,"สี":9218,"หต":869,"สิ":7137,"สำ":6109,"สา":15975,"สู":4642,"หน":20409,"สุ":8301,"สื":2761,"หม":12564,"สเ":3613,"สแ":834,"หย":1301,"หล":16331,"สโ":1107,"หร":18933,"ส่":6557,"ส้":1769,"หว":8188,"ส์":4529,"หอ":742,"หา":12502,"หั":3338,"หิ":1200,"อก":17318,"อข":2506,"อค":2179,"อง":70500,"อจ":1268,"ห็":854,"อช":1235,"ห่":6955,"ห้":8276,"ษฐ":1153,"ศอ":967,"ษฎ":870,"ว์":2314,"ว้":3043,"ศส":1760,"ศษ":814,"ว่":16781,"ษท":1007,"ศึ":3055,"ศิ":2300,"ศา":8110,"ษณ":3808,"ศั":2984,"สก":2168,"ศู":867,"ศไ":1794,"สง":2785,"ษร":2008,"ศเ":1312,"ษย":1193,"ศ์":3007,"สถ":6089,"สน":4818,"ษา":12813,"รใน":1139,"ษั":2349,"สต":9741,"สด":3309,"สภ":1290,"สม":10925,"สว":2702,"สร":6497,"สล":1065,"รใช":756,"วย":9723,"วม":5477,"วร":6029,"วล":3904,"วท":1967,"วบ":1401,"วน":11532,"วป":795,"รูป":4257,"วณ":1942,"วด":2173,"วต":1183,"วง":11814,"วจ":1546,"ล็":3380,"รุ่":859,"วช":1103,"ล่":6051,"ล้":7758,"ล์":1887,"ศร":2364,"วโ":1470,"วแ":1429,"วเ":5629,"ศว":1021,"วไ":2301,"วใ":947,"ศน":1526,"วั":23102,"วา":16144,"วะ":947,"วี":3571,"ศท":825,"ศต":1496,"วิ":15211,"วส":1841,"ศจ":1019,"วอ":3830,"วห":1387,"รื่":5863,"ลย":1821,"ลม":2014,"ลร":753,"ลบ":1840,"ลน":3276,"ลป":2282,"ลต":1315,"ลท":2213,"ลด":1361,"รือ":16002,"ร์":31431,"ร็":1969,"ฤษ":4446,"ร่":5273,"ร้":7400,"รใ":2364,"รไ":1004,"ลง":7031,"รุง":2477,"ลจ":865,"ลใ":839,"ลโ":860,"วค":1204,"ลแ":1254,"ลเ":4918,"วข":1721,"วก":5218,"ลู":2435,"ลุ":4907,"ลื":3984,"ลี":7428,"ลำ":2409,"ลิ":7229,"ลั":14699,"ลา":17258,"ละ":32443,"ลอ":5473,"ลห":915,"ลส":1856,"ลว":3836,"ลล":2565,"ยา":16472,"รณ":5716,"รด":4057,"ยิ":1263,"รต":2974,"รถ":6175,"ยี":1186,"รท":5128,"ยื":1006,"รน":2627,"ยุ":5509,"ยู":12722,"รบ":2528,"รป":3771,"รพ":2742,"ยเ":7850,"รม":12636,"รย":2421,"ยแ":3243,"รค":3841,"มโ":1965,"ยร":3391,"มแ":3080,"ยล":1160,"มไ":1232,"มใ":1839,"รจ":2340,"ยว":7799,"รง":8918,"ม่":10097,"รช":1118,"ยศ":1015,"ยห":1973,"ม้":2616,"ยส":4483,"ยอ":5464,"ยั":4025,"ยะ":1489,"รี":20028,"ริ":20674,"รู":7587,"รื":21925,"รุ":4970,"รแ":4843,"รโ":1568,"ลค":797,"ลก":4719,"รเ":8258,"ลข":2218,"รล":1190,"ยไ":1866,"ยใ":4743,"รร":16031,"ยโ":1499,"รส":5411,"ย่":5792,"รษ":2069,"รศ":2322,"รว":7099,"รอ":8553,"ย์":8004,"รห":1918,"รา":29845,"รั":23940,"ระ":59662,"มต":4461,"มถ":925,"มท":4160,"ภั":1399,"มณ":1076,"ภา":21089,"มด":2959,"มพ":4702,"ยก":11626,"มภ":1402,"มน":5654,"ภู":2200,"มบ":2018,"มป":2831,"มผ":871,"มจ":1684,"มง":1139,"มข":2191,"มก":5523,"มค":2300,"ภอ":2334,"มช":2595,"ฟ้":2049,"มื":14762,"ยน":14895,"มุ":2792,"มี":30856,"ยถ":2445,"ยท":4014,"ยด":1555,"ยต":2395,"มิ":5697,"มั":8313,"มา":29552,"ยภ":836,"รก":8900,"มเ":11478,"ยม":7097,"รข":1777,"ยพ":1762,"ยบ":2166,"มู":3200,"ยป":1662,"มศ":1140,"มว":3354,"ยจ":1218,"ยง":8717,"มล":1770,"ยค":2918,"รู้":1907,"มร":6136,"มย":1297,"ยข":2004,"มม":2832,"มะ":1294,"มอ":4804,"มห":8473,"มส":4913,"ยช":2472,"ลาย":5187,"ฝร":1302,"ผ่":2887,"ฝั":851,"ลาด":921,"พท":1912,"พน":931,"ลาน":856,"พบ":1789,"ปแ":1689,"ปเ":1359,"ผล":4309,"ป้":1280,"ป็":53080,"ป่":1217,"ลำด":791,"ลำต":811,"ผิ":1067,"ผู":7995,"ฟร":1492,"ฟล":850,"ฟฟ":1043,"พเ":1196,"ลิม":935,"ฟอ":994,"พ์":1176,"ฟิ":1569,"ภท":1390,"ลิต":1534,"ฟุ":809,"พฤ":1487,"พล":6364,"พย":3936,"พร":23104,"ลีย":1437,"พม":1303,"พอ":1240,"ฝ่":810,"พว":940,"พี":2676,"พิ":6438,"พา":3798,"พั":6166,"พู":1636,"พุ":2493,"พื":7226,"นย":4074,"ลี่":2431,"บข":1492,"นม":6695,"บค":3089,"นร":8383,"นล":2573,"นศ":1853,"บจ":1007,"นว":11694,"นบ":4831,"ธุ":1517,"นน":9487,"นผ":3027,"นป":10367,"นพ":7135,"นภ":4756,"บก":4476,"นฟ":756,"นั":15362,"นา":19798,"บด":2422,"นำ":2990,"บต":1487,"นิ":13549,"นี":13301,"นึ":7032,"บท":4278,"นื":5497
,"บน":4184,"นุ":3598,"นส":15743,"นห":8286,"ธ์":2434,"นอ":14760,"นะ":2415,"ธร":3542,"นค":9380,"ลือ":2681,"ธย":895,"ทเ":1085,"นข":6066,"นก":17019,"นจ":5539,"นง":887,"ทุ":2331,"ธา":3053,"นด":8599,"ธั":762,"นท":23586,"นธ":5365,"นต":15763,"ธิ":5695,"นถ":1559,"ธี":1300,"ท้":2426,"นซ":2369,"ธศ":832,"นช":5687,"ท่":3112,"นฐ":772,"ท์":2028,"ธอ":1358,"บ้":2221,"ปส":1027,"บ่":1386,"บใ":1319,"ปร":33042,"บโ":1424,"บไ":1667,"ปล":7500,"ลื่":1081,"บแ":1614,"บเ":5338,"ลูก":1704,"ปุ":2101,"ผน":738,"ปิ":2900,"ปี":8407,"ปะ":1033,"ปั":4469,"ปา":3048,"ปอ":1507,"น่":3725,"น็":766,"นๆ":1075,"บว":2339,"นไ":5452,"บล":3184,"นใ":7373,"นโ":6749,"บร":11830,"นแ":9829,"นเ":26787,"บม":1153,"บภ":734,"ปก":3047,"บพ":1205,"ลุ่":3574,"บป":2025,"บบ":7643,"ปน":1077,"บุ":4548,"บู":1144,"ปท":1302,"บิ":3104,"ปต":759,"บี":2069,"บา":8404,"ปด":805,"บั":9157,"ปฏ":1873,"บอ":4696,"น์":3647,"บส":3398,"น้":10964,"บห":1553,"ด็":3505,"ตว":2762,"ตล":1318,"ดไ":896,"ตส":1012,"ด้":20496,"ด่":844,"ตะ":4134,"ตอ":5825,"ด์":2932,"ติ":15034,"ตำ":3584,"ตา":9651,"ตั":18129,"ดื":1116,"ดิ":9252,"ตต":1078,"ดี":8816,"ตถ":1332,"ดุ":891,"ตน":2338,"ตบ":1002,"ดู":2853,"ตร":23291,"ดโ":963,"ดใ":3358,"ดเ":6934,"ดแ":2124,"ตย":2013,"ดล":1605,"ดว":1797,"ดห":2122,"ดส":2822,"ดอ":4967,"ณ์":3582,"ดั":6569,"ดำ":2067,"ดา":6669,"ดด":953,"ณิ":917,"ดต":2322,"ณี":962,"ดท":2962,"ดน":4803,"ดป":1230,"ดพ":988,"ตก":3537,"ดม":1612,"ดย":15171,"ดร":2790,"ทอ":3103,"ทห":1219,"ทส":1351,"ทศ":9520,"ทว":1599,"ที":59203,"ทิ":4236,"ทำ":5708,"ทา":13110,"ทั":9949,"ทะ":1722,"ทพ":2598,"ทน":2186,"ถุ":1404,"ทบ":1025,"ถู":2630,"ถึ":6411,"ทธ":4848,"ถื":1366,"ถไ":2196,"ลอง":1534,"ธง":1259,"ทร":10558,"ทย":11986,"ต้":8814,"ต์":5183,"ต่":15088,"ต็":806,"ลอด":966,"ถา":5861,"ถิ":1257,"ตู":1933,"ตี":1997,"ตุ":3328,"ถน":2338,"ตแ":903,"ทค":1361,"ตเ":2007,"ฐา":2537,"ฐอ":1245,"ฐม":814,"ละค":2575,"ฏิ":1907,"ละก":1772,"ละน":936,"ณา":2378,"ณะ":4781,"ละท":1069,"ละต":1127,"ดช":1164,"ดจ":1760,"ดง":3356,"ดค":1484,"ณร":978,"ละส":1767,"ดข":3245,"ดก":3268,"ลัง":3957,"ละร":817,"ละม":1718,"ลัก":5507,"ละพ":787,"ละป":964,"ณฑ":1856,"ลับ":937,"ละอ":1448,"ละไ":841,"ลาง":2453,"ลาก":914,"ละเ":4488,"ละแ":849,"ฒน":2389,"ลัย":2733,"ซั":988,"ซา":2350,"ซอ":2165,"ซู":753,"ซี":4385,"ซึ":8304,"ซิ":3377,"ญช":978,"ญญ":1807,"ซ์":1764,"ญิ":1154,"ญา":2270,"ญี":1753,"ญ่":3821,"งิ":983,"งา":6266,"งอ":10609,"งห":11834,"งส":12858,"งศ":4623,"จจ":3207,"งว":4781,"งๆ":1493,"งใ":8125,"ลล์":930,"งไ":4482,"งแ":10959,"งโ":5091,"จร":2901,"จม":901,"งเ":26671,"จพ":2589,"ลวง":2170,"จน":3023,"จี":2856,"จิ":3604,"จึ":1361,"จั":14501,"จะ":8562,"จำ":4839,"จา":15882,"จอ":2342,"ฉล":792,"จเ":946,"ฉพ":1153,"ชก":1650,"จุ":5283,"ฉา":1061,"ชน":7089,"ฉี":1150,"จ้":6665,"ชย":735,"ชร":794,"ชว":2061,"ชบ":758,"ชม":820,"ชา":13533,"ชี":5071,"ชิ":4498,"ชื":9652,"ซน":1061,"ชุ":2369,"ชส":1160,"ชอ":1761,"ชั":4493,"ซล":1003,"ช่":6714,"ช้":8872,"คน":5647,"คต":1444,"คม":6449,"คร":21353,"คย":1040,"tio":1636,"ข่":2813,"ข้":7740,"คค":1249,"ลบั":843,"คณ":2195,"คู":1293,"คุ":2351,"คื":7785,"คี":1217,"ลนด":1156,"คิ":2002,"คโ":1144,"คเ":1342,"คว":12568,"คล":7051,"คา":4157,"คำ":3659,"คั":3422,"คอ":3384,"งข":9913,"งก":19903,"งค":13884,"ค่":1995,"ค้":2249,"ค์":5026,"งบ":3074,"งน":4757,"งท":10771,"งพ":4931,"งผ":1609,"งป":7671,"งย":2006,"งม":7023,"งภ":1825,"จก":1261,"งล":2323,"งร":6456,"งช":5175,"งง":1006,"งจ":7033,"งซ":1220,"งต":6752,"งถ":1603,"งด":3806,"กภ":1049,"กพ":1569,"กป":1954,"กน":2935,"กบ":1539,"กท":4244,"กต":4649,"กด":2721,"กฏ":948,"กฎ":1349,"กซ":2148,"กช":1615,"กจ":3262,"กง":1224,"กค":3642,"กข":3351,"ter":1286,"กก":5870,"กู":1343,"กเ":10000,"กั":20495,"กา":51462,"กำ":3149,"กิ":9995,"ข
ต":2558,"กี":4840,"ขน":4066,"กุ":3321,"กส":5176,"กษ":11940,"กห":1741,"กอ":8499,"กะ":1354,"กย":1271,"กม":4662,"กฤ":4112,"กร":27767,"กล":15020,"กศ":807,"กว":6893,"ขี":3132,"ขา":5865,"the":1233,"ขุ":925,"ขึ":5668,"ก์":773,"ขอ":32858,"ก๊":859,"ก้":1794,"ขั":2842,"กไ":1756,"กใ":2974,"กโ":1476,"กแ":3798,"ก่":6422,"ก็":3535,"ขว":1297,"วกั":2290,"วขอ":818,"ลเม":1412,"ษา ":1133,"ล่น":1147,"ล่า":2925,"ล้ว":2260,"ล้อ":942,"ล้า":3112,"วงก":746,"วงศ":3050,"ล็ก":2525,"วนใ":973,"วนห":850,"วนก":966,"วที":983,"วยเ":735,"วรร":3044,"วร์":805,"วยก":1046,"วมก":896,"วอร":1058,"วลา":1487,"วละ":990,"ส์ ":2631,"วีย":1077,"ศตว":752,"วิน":754,"วิต":1559,"วิท":5574,"วิช":1485,"วาม":10406,"วาง":1213,"วัล":909,"วัน":10402,"วัต":2453,"วัด":5975,"วัฒ":910,"วัง":868,"ศน์":1136,"วเต":851,"ศรี":1384,"วไป":1324,"อก ":1672,"ว่า":16578,"ว้น":750,"ว้า":1054,"อง ":7392,"ศัย":746,"ศัก":1311,"ษณะ":2477,"ศาส":7153,"ศึก":3054,"ษณ์":883,"ศิล":1821,"ษที":956,"ศูน":857,"ษย์":1021,"ศไท":1672,"สกุ":748,"อน ":3168,"สงค":1388,"อม ":778,"อย ":1240,"ษาก":925,"ษัต":893,"ษัท":1443,"สตร":5931,"ษาอ":1852,"ษาเ":921,"สดง":2076,"สนา":1982,"สต์":1797,"สถา":5432,"สมเ":2557,"สมุ":1016,"สมา":1528,"สมั":2283,"สภา":1234,"สัญ":1359,"สัง":2218,"สัต":1750,"สัน":1365,"สาข":1007,"สัม":975,"สำน":1154,"สิง":944,"สาธ":1138,"สาย":2246,"สำค":1783,"สาร":3032,"สาม":4593,"สาว":795,"สอง":2064,"หญิ":1133,"สหร":1602,"หญ่":3811,"สร้":3626,"หมา":5219,"หมื":980,"หมู":1311,"หม่":2082,"หรื":14127,"หรั":3559,"สเต":1342,"หมด":1024,"สุร":1490,"สูง":1909,"สือ":1352,"หนด":759,"สุด":3697,"หน่":2681,"หน้":3646,"หนั":1888,"สูต":893,"หนื":2453,"หนึ":6865,"สู่":776,"สำเ":741,"สิท":1099,"สิน":1011,"สำห":2054,"สิ่":1600,"สีย":3370,"หตุ":849,"สี่":1091,"สีเ":803,"สุข":808,"หิน":734,"หาว":2346,"หัว":2009,"หาร":3693,"หาน":1005,"หาก":917,"ส้น":1351,"ส่ว":5073,"ส่ง":1119,"หว่":2958,"หวั":4202,"หล่":1190,"หลั":4230,"หลื":1267,"หลา":3254,"หลี":1577,"หลว":2304,"องจ":2432,"องค":5587,"องข":1511,"องก":4220,"องน":1770,"องท":3322,"องด":1078,"องต":1671,"องช":1455,"องว":1091,"องร":2706,"องพ":2170,"องม":1890,"องภ":735,"องป":3285,"องบ":1250,"องส":4135,"องห":2070,"องอ":2544,"อกเ":2315,"อกแ":903,"อขอ":1542,"อกจ":1222,"อกอ":862,"อกั":828,"อกา":1632,"อตั":1635,"อดี":1193,"อนก":910,"อที":1522,"ห่ง":6109,"ัก ":1014,"องโ":2181,"องใ":1090,"องไ":1059,"องเ":6096,"องแ":2259,"ห็น":791,"ห้เ":1731,"ัง ":1312,"ัน ":5497,"อมา":1234,"อมพ":870,"ับ ":2373,"อมเ":1000,"อมู":1073,"อย่":3805,"อยู":11352,"อร์":11720,"อวั":2718,"อวิ":799,"าก ":2111,"อว่":1274,"อนด":815,"อธิ":902,"อนท":1025,"อนุ":1065,"อบด":1116,"อนไ":741,"อนเ":1384,"ัด ":1246,"อิส":796,"อิน":2322,"อำเ":2340,"อีก":2134,"อำน":838,"อาห":1406,"ัส ":1055,"อาศ":740,"อาร":1724,"อาจ":1975,"อัล":1451,"อาณ":1739,"าช ":1056,"อื่":1772,"ออก":6565,"ัย ":1671,"อสา":832,"อัง":3552,"อาก":1660,"อัน":2487,"ัว ":1389,"อัก":2043,"าง ":4454,"อ่า":789,"าม ":1873,"าย ":3426,"าณ ":1847,"อเม":2930,"อเป":1207,"าน ":3701,"อเส":824,"อเร":1957,"อเช":951},"n_words":[7308152,7320273,4252865],"name":"th"}
\ No newline at end of file
+++ /dev/null
-{"freq":{"D":3787,"E":3422,"F":1488,"G":3274,"A":19564,"B":7360,"C":4965,"L":4772,"M":7066,"N":4375,"O":1801,"H":3508,"I":8185,"J":1743,"K":5752,"U":1449,"T":5401,"W":1008,"V":1273,"Q":420,"P":11919,"S":10977,"R":3177,"Y":471,"X":334,"Z":463,"f":2669,"g":178562,"d":29955,"e":64572,"b":36938,"c":9968,"a":433329,"n":268000,"o":112013,"l":93919,"m":59846,"j":824,"k":54159,"h":28813,"i":161924,"w":15439,"v":3263,"u":56864,"t":84874,"s":113569,"r":58943,"q":542,"p":56795,"z":2158,"y":61992,"x":703,"Fil":231,"í":225,"é":238,"á":213,"ü":252,"ā":268,"Est":484,"Eng":311,"Ene":272," l":8609," m":26008," n":63523," o":8411," h":4923," i":23816," k":20018," d":6208," e":1886," f":514," g":3652," a":41230," b":12249," c":1285," y":708," u":3275," t":9124," w":1107," v":245," p":25524," s":37805," r":2505," J":1713," K":5573," H":3377," I":7706," N":3834," O":1616," L":4623," M":6813," B":6692," C":4389," A":18851," F":1372," G":3134," D":3532," E":3171," Z":430," Y":427," X":260," S":10261," R":2927," Q":408," P":11641," W":933," V":1041," U":1379," T":5149,"ا":230,"A ":369,"Da":1003,"Co":1457,"Ce":320,"Ch":481,"Do":225,"De":356,"Di":1582,"Fe":231,"Eu":252,"Es":754,"En":687,"Em":287,"Ge":222,"Ga":499,"I ":336,"Fr":271,"Fi":327,"C ":400,"Au":300,"Ar":695,"As":627,"Ba":3221,"Ay":1525,"Ag":601,"BC":219,"Ab":447,"Ad":339,"Am":736,"BN":257,"An":10961,"Ap":381,"Ak":282,"Al":957,"Bu":508,"Br":412,"Ca":1287,"Bi":744,"Be":555,"Bo":628,"Hil":266,"Ku":269,"Kr":232,"Ko":739,"Le":701,"Li":502,"N ":411,"La":1279,"Lu":1605,"Lo":364,"Me":527,"Mi":786,"Ma":4128,"Mu":386,"Mo":519,"Ni":284,"Ne":669,"Na":1605,"No":859,"Ok":212,"Ol":246,"Gi":363,"Gr":1131,"Go":284,"Gu":257,"Ha":926,"He":607,"Hi":663,"Ho":302,"Hu":700,"Im":288,"In":3099,"Ik":390,"Il":402,"Is":900,"It":1478,"Ir":258,"Ja":583,"Jo":504,"Ju":288,"Ka":3627,"Hap":403,"Ki":336,"Un":1045,"Tu":257,"Tr":725,"Ts":305,"To":291,"Th":568,"Ti":1025,"Te":370,"Ta":1438,"St":601,"Su":844,"Wi":302,"Wa":226,"Vi":399,"Va":226,"Pu":319,"Pr":784,"S ":239,"Pe":753,"Pa":3232,"Po":515,"Pi":5199,"Ph":519,"Or":329,"Se":751,"Sc":225,"Si":3949,"Sh":296,"So":479,"Sa":2369,"Re":1018,"Ri":307,"Ro":908,"Qu":349,"Ra":347,"Gre":522,"Gri":303,"b ":919,"a ":94136,"Sü":216,"Za":253,"i ":12212,"gd":846,"ge":1642,"ga":29192,"gb":804,"Ing":2213,"fi":306,"fo":309,"gy":549,"he":2166,"ha":12797,"gn":612,"gm":735,"gl":3792,"gk":3515,"gi":5586,"gh":1271,"gg":2300,"gu":3141,"gt":1399,"gs":3014,"gr":1126,"gp":2172,"go":3120,"dt":243,"du":1243,"dy":626,"g ":113439,"ea":1899,"eb":1161,"ec":685,"ed":1334,"de":3129,"di":5171,"do":3993,"Ilo":317,"dr":607,"ew":354,"ex":243,"eu":223,"ev":449,"ey":804,"ez":509,"fa":228,"h ":1453,"Ind":341,"fe":234,"eh":1900,"eg":2092,"ee":503,"el":4423,"ek":1455,"ei":449,"ep":1032,"eo":1328,"Imp":228,"en":9846,"em":2892,"et":2497,"es":7771,"er":9807,"ca":1369,"Ika":371,"e ":10545,"by":545,"br":2325,"bu":3483,"bo":2508,"bl":1265,"bi":7420,"be":1655,"da":7330,"f ":892,"cu":348,"ct":832,"cr":267,"co":1527,"ck":419,"ci":1114,"ch":1408,"ce":1134,"cc":244,"c ":679,"az":285,"ay":33855,"ba":16262,"d ":6470,"at":29146,"as":20781,"ar":16773,"aw":10001,"av":702,"au":1712,"ak":11317,"al":29791,"ai":3152,"ao":3807,"ap":8378,"am":13495,"an":101085,"ac":1440,"ad":5449,"aa":5389,"ab":8765,"ag":22898,"ah":9461,"ae":824,"nu":2551,"nt":6347,"ns":5167,"no":9894,"nn":491,"ny":2397,"of":922,"oc":711,"od":3388,"oa":467,"ob":1692,"om":3049,"on":28949,"ok":1666,"ol":5417,"oi":423,"og":2491,"oh":829,"ot":1896,"os":4877,"ov":562,"ou":1121,"op":3555,"oo":5633,"or":6366,"r 
":5313,"ow":489,"oy":1082,"pe":2826,"pa":25955,"pl":758,"po":5042,"ph":416,"pi":11096,"lo":7059,"lm":376,"ll":2040,"ls":232,"lp":216,"lu":4808,"lt":766,"ly":1884,"o ":35938,"ma":23630,"mb":3361,"mg":9391,"me":3130,"mi":5214,"mm":511,"mp":3371,"mo":2915,"mu":4995,"p ":2357,"na":50921,"nc":944,"nd":5861,"ne":3717,"ng":124226,"ni":11322,"nl":1083,"ki":5108,"kh":660,"ke":653,"kb":240,"ka":28062,"m ":2592,"ky":234,"ks":1102,"kt":1232,"ku":3832,"ko":5455,"kr":397,"kl":2445,"li":16771,"le":7298,"ld":536,"la":41710,"lb":521,"n ":41545,"hr":229,"hu":1485,"hi":7425,"hn":275,"ho":2024,"id":2936,"ic":2306,"ib":3763,"ia":4281,"ih":1209,"ig":7316,"if":239,"ie":1122,"hy":251,"k ":4018,"ir":2572,"is":21654,"it":15297,"iu":510,"iv":697,"iw":1139,"ii":644,"ik":10689,"il":18574,"im":4535,"in":32448,"io":2262,"ip":8303,"iz":356,"iy":6303,"l ":8749,"ja":305,"z ":468,"wi":4192,"wo":282,"ws":238,"y ":26266,"wa":7331,"we":663,"vi":929,"vo":293,"uz":222,"uy":540,"uw":938,"uu":278,"ve":1065,"va":717,"x ":270,"ui":867,"uk":2223,"ul":10829,"ue":1048,"ug":2281,"uh":1222,"ur":4663,"us":3997,"ut":2718,"um":4571,"un":11124,"uo":841,"up":1559,"ty":936,"tu":5349,"tt":423,"tw":245,"ub":2276,"ua":2314,"ud":744,"uc":353,"w ":2307,"to":12113,"tn":494,"tl":670,"ts":639,"tr":2947,"te":5598,"ti":12280,"th":1499,"ta":23206,"su":2916,"ss":697,"st":7061,"sy":6204,"sl":357,"sk":592,"sm":643,"sp":948,"so":5456,"sc":415,"se":6243,"sh":1082,"si":9719,"u ":1011,"sa":50153,"rr":466,"rs":1459,"rt":2397,"ru":1218,"ry":2421,"rp":226,"ro":6522,"rn":1128,"rm":690,"rl":534,"rk":708,"ri":11065,"rg":616,"re":7270,"rd":981,"rc":435,"rb":496,"ra":14328,"t ":17794,"qu":482,"s ":20143,"py":238,"pt":257,"pu":5533,"pp":386,"pr":1443,"Hul":231,"za":466,"zo":453,"ye":2458,"ya":16851,"yb":251,"yu":966,"yt":264,"ys":635,"yr":507,"yo":11759,"yn":980,"Ara":274,"Apr":218,"Asy":352,"Ayo":1506,"Bag":274,"Ban":328,"Bay":1488,"Bat":317,"Abr":300,"Adi":220,"Ago":307,"BN ":257,"Ale":251,"Alt":229,"Ame":579,"Ang":10467,"Car":262,"Bib":239,"Com":233,"Col":584,"Dis":592,"üd":216,"Nat":309,"New":234,"Nag":476,"Nor":372,"Nob":295,"Pin":243,"Pil":4628,"Phi":298,"Per":237,"Pas":236,"Par":288,"Pag":236,"Pan":1029,"Pam":480,"Pal":436,"Pro":250,"Pra":302,"Que":246,"Isa":506,"Ita":502,"Ito":837,"Jam":280,"Jos":222,"Kab":349,"Kag":252,"Kal":869,"Kan":214,"Kat":442,"Kas":598,"Kar":312,"Kon":276,"Leo":280,"Lat":224,"Lun":1069,"Man":377,"Mal":287,"Mar":952,"May":1105,"Mat":220,"Min":268,"Süd":214,"Zam":212,"一":621,"一一":311,"Sur":412,"Sta":256,"Tag":668,"Siy":420,"Sil":411,"Set":224,"Si ":2472,"Sam":277,"Sal":320,"San":795,"Sa ":539,"Rey":251,"Rep":257,"Rom":461,"Uni":749,"The":357,"Tim":465,"Tin":234,"Tre":501,"Tsi":277,"bis":856,"bit":236,"bil":2673,"bin":1170,"big":1028,"bo ":562,"bli":613,"bla":330,"bol":310,"bon":590,"ban":3506,"bak":379,"bal":915,"bag":1029,"bah":2518,"bae":247,"bab":1067,"bay":2496,"baw":340,"bat":1046,"bas":870,"bar":318,"bi ":266,"ber":655,"bel":279,"bib":346,"bid":220,"ca ":245,"can":254,"ce ":518,"bri":332,"bra":317,"bre":1519,"buo":641,"bul":259,"bun":360,"bum":222,"buh":438,"but":324,"bye":332,"aka":4840,"am ":867,"aki":2053,"akh":385,"al ":4667,"ail":347,"ain":942,"ais":377,"ak ":1818,"aig":364,"ahi":1712,"ahu":567,"aho":893,"aha":5905,"agk":1800,"agl":815,"agm":409,"agg":261,"agh":599,"agi":2389,"ags":1175,"agt":890,"agu":765,"ago":873,"agp":2086,"anu":622,"any":1534,"ano":2033,"ant":1925,"ans":2023,"ane":370,"ang":54220,"ani":2704,"anl":847,"ap 
":1248,"ana":5631,"anc":342,"and":2377,"amu":411,"amo":399,"amp":1054,"ami":2241,"ame":660,"amb":1287,"ama":6244,"ao ":2560,"aly":681,"alu":885,"alo":1422,"alm":256,"all":403,"ali":4268,"ale":940,"ala":15201,"alb":295,"an ":25875,"aku":295,"akt":378,"ako":347,"akl":349,"aba":5223,"abe":285,"abi":1848,"abo":410,"abu":558,"ae ":378,"aca":217,"aaa":370,"aan":2340,"aal":228,"aas":476,"aar":1310,"ad ":1547,"aga":6180,"agb":737,"agd":421,"ado":1685,"adi":344,"ade":215,"ag ":2962,"ada":1065,"ayo":1284,"ayn":676,"ays":426,"ayr":437,"ayu":244,"ayb":228,"aya":7316,"ba ":1007,"at ":11450,"are":434,"ard":406,"arc":218,"ara":6094,"aro":928,"arl":281,"ark":258,"ari":2740,"ars":347,"art":1318,"asa":4127,"ary":1252,"asi":1053,"ase":1923,"aso":401,"aon":1107,"ar ":1507,"apa":4168,"api":926,"apo":968,"apu":525,"as ":8041,"avi":292,"ay ":22870,"awa":4711,"awi":3261,"ata":9859,"asu":540,"ast":1119,"asy":2814,"atl":420,"ato":908,"ate":772,"ati":3674,"ath":277,"aw ":1870,"atu":1238,"aun":478,"aug":319,"itn":327,"ito":5794,"itu":365,"ity":376,"üdt":214,"ism":492,"ist":2727,"isy":1436,"ita":2453,"ite":522,"iti":1834,"iwa":1084,"ius":321,"ive":465,"ipo":219,"ipp":279,"ipi":5953,"is ":1468,"ion":1384,"ipa":894,"iro":325,"iri":318,"isi":1179,"ish":487,"ise":486,"isa":12350,"ire":294,"ira":1126,"it ":3154,"iyo":1973,"iya":3692,"iye":538,"kik":335,"kil":1497,"kin":1435,"kip":244,"kit":510,"ki ":400,"kha":550,"koy":367,"kop":294,"kon":1105,"kom":504,"kol":972,"ko ":1504,"kla":2111,"kay":687,"kat":4082,"kau":481,"kar":1361,"kas":2326,"kap":1524,"kan":3294,"kal":2502,"kam":987,"kak":1114,"kah":780,"kai":697,"kag":306,"kad":618,"kab":2686,"kaa":306,"ka ":4085," Ga":496," Ge":219," Fr":268," Fi":324," Ha":926," He":600," Go":283," Gr":1130," Gu":254," Gi":361," Hu":696," Ho":300,"ha ":509," Hi":657," Ja":583," Ir":258," Is":900," It":1471," Im":284," In":3092," Ik":389," Il":402,"ham":439,"han":3970," Ka":3625,"hal":1546," Ki":335,"har":726,"has":418,"hat":613," Jo":502," Ju":282,"hah":296,"hag":932,"hab":327," La":1273," Le":698," Li":486," Ko":738," Kr":232,"hay":2401," Ku":269," Ma":4113," Mi":784," Me":525,"he ":726," Lo":362," Lu":1602," Ne":660," Na":1601," Ni":283," Mo":516," Mu":380,"her":391,"hen":301,"hi ":256," Ap":381," Am":732," An":10945," Ak":280," Al":957," Ag":601," Ad":338," Ab":444," Ba":3213," Ay":1524," Au":300," As":625," Ar":684,"hig":262," Be":553," Bi":731,"hip":233,"hin":2102,"him":317," Bo":624,"hil":1124," Br":411," Bu":508,"hit":252,"hiy":1822," Ca":1275," Ce":318," Ch":477," Co":1435," Da":1001," Di":1559," De":354," Do":218," Es":753," En":683," Em":287," Eu":251," Fe":229,"gma":404,"go ":1131," Sü":216,"gle":2258,"gli":561,"gla":763," Wi":299,"gko":362," Wa":224," Za":253,"gna":429,"gmu":288,"gpu":1231,"gpa":814,"gon":726,"gos":419,"gor":456,"gsa":661,"gsi":395,"gra":396,"gre":308," Or":329," Po":510,"gui":242," Pi":5194,"gum":355," Ph":516,"gul":619," Pe":751," Pa":3220,"gsu":217,"gso":1676," No":856," Ol":245," Ok":212,"gta":876," Ra":342," Qu":347," Ro":902," Re":1013," Ri":307," Pr":781,"gus":319," Pu":319,"gun":828," Su":840," St":567," Ta":1427,"gya":341," Th":565," Ti":1025," Te":369," Tr":725," Ts":304," To":289," Sa":2358," Sh":292," Si":3946," Sc":224," Se":748," So":478," Va":226," Vi":396," Tu":256," Un":1043,"ial":214,"ian":1841," ip":821," im":295," in":1205," ik":2724," il":559,"ic ":391," is":12317," it":4118,"ibl":298,"ibi":852," ka":12843,"ibo":539," ki":1772,"id ":624,"iba":1373,"ibe":241," ha":2342," he":266," gi":1221," gr":273,"ia ":1668," gu":642," ib":1270," 
hi":1628," hu":502," ni":3203," ng":29501," ne":259,"ien":313," na":26725," mu":2405,"ig ":1084," mo":343," of":748," no":3658," le":482,"ict":361," li":1169," la":4719," ku":1926,"ich":298," kl":1518,"ica":454," ko":1618," me":432," mg":9378," mi":835," o ":6532,"ido":754," ma":12562," lu":1621,"ide":429,"ida":693," lo":567," ag":287," aa":479," an":11487," ap":351," ak":588," al":688," aw":224," ar":1782," at":8491,"iit":294," as":396," ba":6398," ay":15608,"il ":1019," bi":3286," be":279," bo":263," bl":303," bu":1574," ca":215,"im ":631,"ika":5845,"igd":340,"ige":351,"iga":3450,"igm":226,"igi":608,"iha":656,"ihi":488,"ik ":530,"imo":757," es":477," em":253,"imp":821," el":218,"ime":284," ek":212,"imi":396,"ip ":377,"ind":1253,"ina":12544,"imu":413,"ino":1637,"int":735,"ins":494,"ine":925,"ing":7215,"ini":1991," ga":1201,"inu":1256,"iko":1955," co":595,"iki":665,"ila":8566,"in ":3710," da":2205,"iku":803,"iks":322,"ilo":675,"ill":466," de":1318,"ili":6551," di":1938,"ima":551,"imb":583,"io ":525,"ily":587," du":359,"hol":219,"hon":897," ye":394,"hul":550," sa":28976," se":2207," si":4160," so":365," t ":331," re":1237," ri":835," pu":1239," pr":891," s ":272,"hum":213," op":300," or":499," pe":850," pa":17344," pl":235," po":2264," pi":2508," wa":403," wi":495," tu":1581," ur":1186," up":417," um":236," un":787," ta":3807," su":1227," tr":542," th":440," ti":1513," te":623,"eyn":227,"eta":377,"eti":227,"esp":220,"eso":216,"est":873,"ess":243,"esy":238,"eto":217,"etr":240,"ety":252,"ey ":259,"er ":1701,"es ":4679,"epu":270,"eri":1261,"ere":336,"era":1743,"erb":250,"et ":365,"esi":397,"ery":584,"ert":284,"ers":877,"ern":685,"erm":213,"ero":994,"eks":315,"ekt":395,"en ":605,"ela":487,"ele":614,"eli":929,"ell":447,"eo ":676,"emb":1031,"ema":617,"eme":244,"emo":250,"emi":251,"emp":247,"ene":540,"eng":2031,"ena":338,"end":702,"enc":265,"ens":1983,"ent":2381,"ego":798,"ege":495,"ehi":1394,"el ":1262,"eka":275,"gka":2797,"git":1186,"gis":283,"gil":253,"gin":2170,"gha":696,"ggi":233,"gga":1607,"gi ":930,"gen":305,"gda":265,"gdi":448,"ge ":872,"gbi":301,"gba":311,"gag":358,"gah":212,"gas":721,"gar":389,"gat":1029,"gaw":951,"gay":1184,"gam":1160,"gal":2016,"gan":8911,"gap":698,"ga ":10893,"da ":1215,"de ":630,"dad":487,"daa":229,"dal":1278,"dai":337,"dag":294,"dah":789,"dat":526,"dar":615,"dan":716,"dam":364,"cti":321,"co ":247,"com":405,"ch ":248,"cha":297,"cia":252,"ck ":232,"che":214,"ed ":364,"ebr":372,"ean":218,"ear":435,"eap":318,"ea ":269,"ega":369,"edi":305,"dya":287,"dor":701,"don":511,"dos":801,"dti":217,"dul":227,"duk":249,"dia":320,"der":377,"des":307,"del":555,"dek":245,"den":404,"di ":862,"do ":1342,"diy":230,"din":1025,"dis":689,"dit":243,"dig":691,"rga":250,"ri ":1883,"res":909,"rea":358,"reg":609,"reh":760,"ren":823,"rel":229,"rer":338,"re ":1785,"raw":1292,"rd ":279,"rap":411,"ras":562,"rat":708,"rag":342,"ran":3019,"ram":928,"ral":1710,"rab":295,"raa":290,"rad":734,"rs ":234,"ros":487,"rot":255,"ron":875,"roo":656,"rop":486,"rod":249,"rol":389,"rna":412,"rne":221,"ro ":2000,"rma":307,"riy":449,"rit":895,"ris":641,"rig":228,"ril":614,"rik":932,"rin":2120,"ria":1493,"ric":296,"rie":322,"rk ":265,"rya":408,"rup":247,"rus":239,"ry ":568,"rsi":370,"rso":362,"rte":436,"rti":1012,"saa":543,"sab":432,"sag":340,"sah":246,"sak":534,"sal":1402,"sam":846,"sap":373,"san":13859,"sas":644,"sar":588,"say":842,"sa ":29066,"ryo":1185,"shi":267,"si ":909,"siy":1352,"sid":252,"sia":279,"sit":454,"sis":871,"sip":588,"sin":2020,"sil":886,"sim":530,"sik":667,"sig":229,"se ":703,"ser":748,"ses":480,"sh 
":511,"sen":3264,"spe":263,"spa":238,"son":453,"sod":1704,"st ":395,"ss ":216,"sla":213,"smo":455,"so ":2182,"sye":492,"sya":1319,"syo":3972,"syu":310,"ste":723,"sta":2233,"sto":847,"sti":1338,"str":1197,"sub":226,"sul":377,"sum":409,"suk":222,"sun":606,"sus":423,"tak":378,"tal":1825,"tag":2356,"taa":516,"tab":313,"tad":729,"tay":910,"taw":1462,"tat":2169,"tas":1294,"tar":426,"tap":454,"tao":2928,"tan":3973,"tam":283,"te ":1421,"ta ":2620,"pa ":793,"par":2056,"pat":1511,"pas":473,"pay":244,"paa":836,"pab":282,"pag":5944,"pah":588,"pak":582,"pal":1553,"pap":964,"pam":2183,"pan":7375,"pi ":308,"per":1160,"pel":619,"pla":312,"pik":487,"pil":381,"pin":8127,"pis":517,"pit":481,"por":477,"pop":1613,"pos":639,"pon":910,"pol":471,"ppi":245,"po ":494,"pua":1158,"pub":334,"pri":362,"pre":283,"pro":666,"put":222,"pun":775,"pul":2278,"ra ":2777,"ngo":322,"ngi":702,"ngl":2579,"ngk":1509,"ngu":1141,"ngr":246,"ngs":1763,"ni ":1903,"nge":218,"ngg":1999,"ngh":357,"nga":5141,"nel":268,"ner":635,"net":328,"nes":773,"ng ":107416,"nce":344,"ne ":832,"ndu":336,"ndo":855,"ndi":1104,"nde":357,"nda":2122,"nak":3214,"nal":1987,"nam":1069,"nan":5284,"nao":281,"nap":1249,"nar":667,"nad":394,"nag":3410,"nah":1201,"nai":343,"nab":490,"nd ":696,"nau":287,"nat":2025,"nas":5521,"nay":628,"naw":416,"na ":22155,"nya":1473,"nul":380,"num":251,"nun":621,"nus":223,"nut":274,"nub":254,"nto":1170,"ntu":218,"ntr":369,"nti":1113,"nta":1619,"nte":1041,"nsy":292,"nso":1561,"nst":288,"nse":338,"nsi":398,"nsa":1639,"nt ":493,"ns ":315,"nod":444,"noo":3554,"nom":260,"non":1292,"nla":384,"no ":3206,"nlu":471,"nid":488,"nib":262,"nia":263,"niy":508,"niw":515,"niv":230,"nis":952,"nit":1670,"nim":327,"nin":947,"nik":402,"nil":1831,"ogr":242,"ohi":487,"ok ":961,"ol ":1113,"oby":351,"ode":251,"of ":743,"og ":1738,"ob ":490,"od ":2512,"obe":281,"nyo":656,"oto":243,"ost":571,"ota":234,"osi":296,"ose":518,"oso":256,"oy ":861,"oun":298,"opo":245,"opi":377,"ope":331,"os ":2497,"opu":1605,"oon":4273,"ook":406,"oob":454,"or ":1386,"ork":236,"orm":244,"oro":293,"ord":364,"ore":565,"org":297,"ori":875,"osa":225,"ort":603,"ory":303,"ot ":787,"ora":483,"ola":327,"on ":10511,"oli":798,"oll":560,"ole":828,"olo":1158,"ona":1108,"ond":258,"one":582,"ong":14203,"oni":381,"ono":505,"ons":429,"ont":363,"ony":253,"oma":662,"ome":348,"omi":484,"omm":343,"omp":399,"omo":218,"op ":492,"la ":7051,"le ":657,"laa":472,"lab":1035,"lad":618,"lah":600,"lag":1451,"lal":5124,"lak":2054,"lan":8966,"lam":1407,"lap":417,"lar":1484,"lat":1650,"las":3731,"law":4043,"lay":1014,"lba":235,"ld ":260,"kuy":222,"kun":1182,"kum":367,"kul":1301,"ksy":373,"ksi":218,"ktu":302,"kto":418,"lon":1346,"loo":506,"lor":222,"loh":474,"log":979,"los":411,"lto":257,"lug":603,"li ":620,"les":2603,"lem":360,"len":771,"leh":645,"leg":553,"lea":415,"lo ":1738,"lla":492,"lle":760,"ll ":320,"lit":1575,"lis":938,"lip":5499,"lin":1820,"lim":971,"liy":225,"lic":224,"lid":294,"lia":495,"lib":291,"lik":1738,"lil":346,"lii":293,"lig":604,"lih":227,"ma ":1406,"maa":656,"mab":485,"mah":1161,"mai":353,"mak":856,"mad":287,"mag":1774,"map":283,"mar":866,"mas":904,"mal":1856,"mam":1144,"man":4514,"may":4380,"mat":2454,"mba":1424,"mbr":990,"mbo":357,"me ":332,"med":215,"met":267,"mes":455,"mer":827,"men":704,"luk":293,"lup":252,"lun":1075,"lum":601,"lut":234,"lus":323,"lur":429,"lya":1073,"lyo":437,"mpi":405,"mpe":729,"mpo":376,"mpu":307,"mog":636,"mon":703,"mot":220,"mpa":1165,"mus":340,"mut":297,"mul":2510,"mun":975,"mga":9380,"min":1128,"mil":657,"mis":408,"mit":1305,"mik":544,"mo 
":604,"mmu":273,"zon":372,"yun":445,"ysa":370,"yro":432,"yos":430,"yon":8581,"yea":322,"yeg":287,"yen":248,"yem":924,"ya ":5481,"yag":346,"yar":349,"yan":8889,"yal":500,"yo ":2106,"yna":235,"yni":658,"wit":490,"wig":2597,"wik":476,"wa ":1340,"wan":2646,"wal":851,"wak":321,"wat":242,"war":240,"wag":936,"ver":481,"ve ":253,"va ":222,"uya":331,"uwe":288,"uwa":445,"usi":309,"usa":537,"usy":243,"usu":363,"ust":456,"uti":335,"ute":297,"uta":417,"utu":829,"uto":373,"us ":1413,"ura":1051,"ure":277,"uri":1726,"uro":457,"uny":224,"uon":264,"upa":766,"ur ":441,"upo":374,"ump":364,"umu":925,"umi":469,"uma":1769,"umb":268,"uly":233,"uo ":439,"unt":520,"unu":378,"uni":882,"uno":1281,"und":796,"una":2530,"ung":3950,"uku":387,"uko":610,"um ":503,"uka":453,"ulu":1033,"ult":291,"ulo":1229,"uli":710,"ula":6857,"uin":299,"ugn":288,"uga":1124,"uha":936,"ubo":309,"ubr":217,"ubu":535,"ue ":245,"uez":255,"uan":1897,"ubi":307,"ubl":385,"uba":263,"tye":228,"ty ":596,"tur":814,"tut":349,"tul":834,"tuk":407,"tun":595,"tum":594,"tub":614,"tra":868,"tri":838,"tro":879,"to ":7587,"tna":387,"tom":241,"ton":2028,"tol":467,"tor":922,"til":777,"tik":1397,"tig":242,"tir":721,"tit":772,"tis":888,"tin":3062,"tim":775,"tip":230,"tio":1023,"tib":304,"tid":407,"tiy":331,"tlo":414,"tem":511,"ten":352,"tel":392,"th ":280,"tes":226,"ter":1925,"ti ":555,"the":612,"tha":237},"n_words":[2110634,2489828,1864789],"name":"tl"}
\ No newline at end of file
+++ /dev/null
-{"freq":{"D":20468,"E":16965,"F":15212,"G":16978,"A":46399,"B":33447,"C":16597,"L":11614,"M":29609,"N":10463,"O":12233,"H":17902,"I":6437,"J":5964,"K":30894,"U":5006,"T":27864,"W":5022,"V":7330,"P":19836,"S":32251,"R":13212,"Y":12032,"Z":2430,"f":59766,"g":101779,"d":361700,"e":798707,"b":176889,"c":86462,"a":1029574,"n":718304,"o":261031,"l":623439,"m":275808,"j":10372,"k":334154,"h":85819,"i":785261,"w":7611,"v":106213,"u":275921,"t":334795,"s":315971,"r":646028,"q":1181,"p":80800,"z":105754,"y":255623,"x":3410,"²":1600,"Ç":4715,"Ü":2583,"Ö":3499,"î":1753,"é":1705,"ç":69086,"â":3605,"ü":157542,"ö":58180,"ğ":69432,"ı":399538,"İ":20804,"ş":111613,"Ş":5518," l":7220," m":26771," n":32386," o":55007," h":21337," i":59190," k":78855," d":82641," e":36355," f":17358," g":42591,"р":1499," a":67898,"с":1190," b":117978,"т":957," c":5691," y":76657," z":4398," u":11497," t":53835," v":55379," p":15531," s":57822," r":8817," J":5925," K":30750," H":17802," I":6388," N":10386," O":12150," L":11515," M":29458," B":33275," C":16419," A":46229," F":15133," G":16827," D":20328," E":16897,"л":1101," Z":2402,"к":1097," Y":11995,"и":2029,"о":2304,"н":1554," S":31973,"в":1191," R":13120," P":19696,"а":2550," W":4958," V":7287,"е":1678," U":4989," T":27682," ç":18143," ö":14326," ü":15280," Ç":4706," Ö":3487," Ü":2581," ı":6625," İ":20776," ş":10933," Ş":5509,"ي":1130,"ل":1185,"ن":965,"ا":2068,"A ":2665,"Da":3349,"Cu":2398,"Co":2920,"Ce":1839,"Ch":2248,"Do":2641,"De":6139,"Di":2765,"Fe":2199,"Fa":2297,"Ey":1528,"Er":2288,"Es":1409,"En":2354,"El":1347,"Ek":1512,"Ağ":1411,"Ge":3246,"Ga":2711,"Bü":1152,"I ":2251,"Bö":1239,"Fr":4109,"Fo":1223,"Fi":2407,"B ":1016,"C ":1464,"Av":3034,"Ar":5311,"At":1678,"As":1760,"D ":3341,"Ba":8245,"Ay":1472,"Af":1012,"Ab":1087,"Ad":2089,"Am":3467,"An":5110,"Ak":1709,"Al":8286,"Bu":5544,"Br":2172,"Ca":3556,"Bi":5816,"Be":4617,"Bo":2818,"Ku":5039,"Gö":1216,"Kr":1520,"Ko":3871,"Le":1961,"Li":3051,"Gü":3195,"La":2872,"Lo":1867,"Me":5154,"Mi":3881,"Ma":11016,"Mu":2388,"Mo":2807,"Ni":2456,"Ne":2578,"Na":1799,"P ":982,"No":1828,"Ok":1001,"Ol":1014,"Oc":1091,"Gi":1074,"Gr":2260,"Go":1145,"Ha":7669,"He":2472,"Hi":1980,"Ho":2868,"Dü":2155,"In":1300,"Ja":2318,"Jo":1650,"Ka":10073,"M ":1217,"Ki":1749,"Ke":1920,"Ul":1131,"Tu":1634,"Tr":1523,"To":2482,"Th":2500,"Ti":1452,"Te":4160,"Ta":4376,"V ":1033,"St":3068,"Su":2048,"Wi":1541,"Wa":1156,"Vi":2393,"Va":1979,"Ve":1434,"Pr":2222,"S ":1244,"Pe":2037,"Pa":5059,"Kü":1314,"Po":6260,"Pi":1406,"Os":2243,"Or":2331,"Kö":1204,"Se":4094,"Sc":1963,"Si":3179,"Sh":999,"Sp":940,"So":3019,"Ru":3002,"Sa":7772,"Re":2740,"Ri":1066,"Ro":3278,"Kı":2617,"Ra":1699,"Mü":1802,"b ":1498,"a ":176902,"Ye":1685,"Tü":7528,"Ya":4483,"Yo":1238,"Yu":2386,"bö":7750,"i ":134793,"ge":29308,"ağ":18792,"bü":7597,"ga":7799,"fl":1604,"ff":1053,"fi":9181,"fr":3323,"fu":6151,"ft":2020,"fo":3898,"he":11849,"ha":24248,"cü":3165,"gl":1385,"gi":16689,"gh":1246,"gu":5867,"gr":6009,"go":2702,"du":14570,"dy":2265,"g ":6921,"ea":4032,"eb":5387,"ec":6696,"ed":29273,"de":95875,"dd":2946,"di":58969,"dl":2907,"do":12268,"dr":3254,"ew":1275,"eu":1087,"ev":14605,"ey":30087,"ez":8326,"fa":8497,"h ":5851,"fe":6028,"eh":5944,"eg":3973,"ef":3639,"ee":1898,"el":56887,"ek":41993,"aç":5873,"ei":4450,"ep":3432,"eo":3151,"en":100462,"em":27716,"et":47279,"es":52024,"er":131153,"ca":18661,"e ":189222,"br":3182,"bu":23594,"bo":7205,"bl":1863,"bi":65897,"be":19027,"da":97587,"f ":5395,"cu":6717,"ct":1404,"cr":1803,"co":3001,"ck":3008,"ci":12969,"ch":6435,"ce":16707,"c 
":1516,"az":19018,"ay":42685,"ba":34644,"d ":11781,"at":38962,"as":49850,"ar":157561,"av":10941,"au":2292,"ak":57066,"al":73836,"ai":6322,"aj":1917,"ap":20439,"am":37584,"an":184545,"ac":10260,"ad":37380,"aa":2080,"ab":13369,"ag":3250,"ah":16357,"ae":2002,"af":13702,"nu":18358,"nt":17557,"ns":13849,"nr":3598,"no":7521,"nn":3019,"nz":1833,"ny":12611,"hı":1036,"iğ":11558,"of":3409,"oc":3225,"od":4927,"ob":3075,"om":13542,"on":43168,"ok":10977,"kç":3190,"ol":59486,"oi":1195,"oj":3578,"og":3689,"oh":1112,"m²":1589,"ot":7864,"os":10639,"ov":4695,"ou":3530,"op":8235,"oo":1893,"or":32678,"r ":170339,"ow":1477,"oz":2422,"kö":4303,"oy":8048,"pe":6240,"kü":6018,"pa":15264,"pl":7151,"pm":1729,"po":6918,"ph":1672,"pi":5799,"lç":5891,"lo":10260,"lm":31640,"ll":28962,"ls":3083,"fı":9951,"lu":32826,"lt":9362,"ly":5619,"o ":14289,"mc":2028,"md":5246,"hü":1497,"ma":67090,"mb":3028,"eş":11455,"mh":2240,"me":45214,"iç":12229,"ml":15462,"mi":33126,"mm":2739,"mp":4822,"mo":6599,"ms":3576,"mu":11366,"gı":2023,"my":1356,"p ":11912,"na":47407,"nb":3039,"nc":18538,"nd":102802,"ne":51401,"nf":1257,"ng":12672,"ni":53546,"nk":5100,"nl":28972,"nm":10641,"dı":37931,"ki":41309,"eğ":6350,"ke":25138,"ka":45970,"m ":32916,"ky":1403,"gö":9935,"ks":8093,"kt":21304,"ku":22990,"ko":13147,"kr":3294,"kk":1610,"kl":25669,"km":5012,"kn":1431,"li":71923,"lk":10655,"le":122046,"ld":15695,"lg":12154,"lf":964,"la":144587,"gü":8845,"lc":1649,"lb":3731,"n ":243619,"hr":4105,"bı":1392,"ht":2266,"hu":4538,"hi":16646,"hn":1197,"ho":2852,"hl":3714,"hm":1584,"id":21397,"ic":9431,"ib":5859,"dü":13708,"ia":8374,"ih":7127,"ig":4782,"aş":24105,"if":4811,"ie":4567,"dö":3928,"k ":83031,"ir":107452,"is":40125,"it":22698,"iu":1112,"iv":5224,"cı":8286,"ii":1103,"ij":1085,"ik":40360,"eç":4865,"il":86504,"im":27783,"in":138518,"io":5300,"ip":8100,"je":1713,"ji":4659,"iz":21969,"iy":28989,"l ":42232,"ja":1035,"tç":1871,"ğlu":963,"rı":43249,"sö":2312,"z ":18621,"sü":4852,"oş":947,"wi":1830,"sç":1475,"pı":8243,"vv":1012,"vy":1154,"y ":15418,"wa":1412,"rü":8173,"ğla":2983,"vl":4067,"rç":3762,"vi":9343,"nş":1036,"vu":2607,"vr":5677,"vo":1424,"uz":10570,"uy":4807,"uv":2249,"ve":59460,"va":15490,"x ":1999,"ui":1273,"uk":6582,"ul":36937,"ue":3947,"oğ":10206,"ug":2004,"ğit":1160,"uh":1861,"ur":35478,"ğin":3622,"us":21592,"ut":7622,"um":12243,"un":39859,"up":6378,"ty":1262,"tu":11035,"tt":7164,"nı":44489,"ub":5081,"ua":1900,"ud":5831,"uc":4903,"w ":1459,"to":17485,"tm":6406,"tl":13327,"ts":2378,"tr":10162,"te":52587,"pç":1206,"tk":2785,"ti":53158,"th":4577,"v ":2143,"tb":2499,"ta":68248,"su":11906,"sv":930,"ss":3728,"st":34655,"sy":6610,"mı":21024,"sw":1258,"sl":8558,"sk":7111,"sn":1024,"sm":5546,"sp":5003,"so":11037,"sc":1810,"se":25089,"sh":2001,"si":63532,"rz":1185,"u ":42818,"sa":39618,"nü":10410,"rr":2777,"rs":8400,"rt":19992,"ru":21036,"rv":1617,"lı":53226,"ry":3722,"rp":1215,"ro":18824,"rn":5785,"rm":14124,"rl":25348,"rk":23839,"nç":1698,"rj":994,"ri":89903,"rh":1272,"rg":7962,"rf":1575,"re":45975,"rd":23811,"rc":3479,"rb":3537,"mü":12609,"ği ":7346,"ra":89176,"t ":36013,"kı":16796,"ğer":2592,"iş":22142,"lü":12025,"s ":31408,"pt":3720,"pu":1218,"pr":5018,"ps":1980,"ğun":2143,"ğus":1809,"ğum":954,"zı":6859,"zü":1410,"yı":27954,"yü":11063,"ğu 
":5997,"zö":2165,"ğlı":7152,"ğre":1586,"uş":11784,"yâ":1423,"yö":4435,"ğiş":1548,"uğ":6358,"zg":2173,"rş":2653,"zi":13389,"zc":5363,"zd":2307,"ze":21253,"za":12673,"tı":26503,"zy":1958,"zu":2578,"zo":2918,"zm":3788,"zl":5972,"yg":2621,"ye":34397,"yd":5178,"tü":11302,"ya":90167,"yb":1899,"tö":1628,"sı":43368,"yv":967,"yu":8007,"ys":1270,"yr":3168,"yo":15057,"yn":5330,"ym":1117,"yl":13689,"yk":1018,"uç":1551,"yi":7908,"² ":1587,"ğı ":9316,"ğın":3141,"Çi":1350,"Ça":1454,"âl":1634,"Ün":1471,"Öz":1299,"çı":7148,"çü":4634,"î ":1515,"ço":5321,"çm":1158,"çl":3470,"çi":13394,"çe":14606,"ça":9969,"ç ":5469,"üş":3641,"üğ":1561,"üç":3404,"ün":27564,"üm":12601,"ül":12799,"üs":5069,"ür":27019,"üp":974,"üt":2907,"üz":15474,"üy":5418,"üc":2767,"üf":4960,"üd":3476,"öğ":1623,"ük":10837,"ü ":14372,"öy":3363,"öz":7289,"ör":11792,"ös":1882,"ön":12083,"öl":13407,"ök":2015,"ğ ":1364,"ğu":12250,"ğr":3621,"ğe":3325,"ğd":979,"ğa":3324,"ğl":11320,"ği":14714,"ğı":15723,"ğü":1540,"ığ":10531,"ış":19918,"ı ":102139,"İn":4248,"İm":1089,"İl":1863,"ın":99943,"ım":16215,"ıp":1333,"ıl":42504,"ık":17416,"ıf":1477,"ıb":1069,"ıd":8898,"ıc":3354,"İt":1484,"İr":962,"İs":7391,"ıy":7698,"ız":6701,"ıs":11942,"ıt":2777,"ır":43940,"şt":19651,"şu":2786,"şi":12692,"şl":8667,"şk":4781,"şm":7053,"şa":13302,"şe":7623,"Şu":1166,"Şa":2212,"Şe":1159,"ş ":18890,"şı":10920,"şü":1419," Ga":2694," Bü":1149," Ağ":1410," Ge":3227," Bö":1239," Fo":1207," Fr":4097," Fi":2393," Ha":7643," He":2466," Go":1137," Gr":2225," Gi":1067," Dü":2153," Ho":2861," Hi":1971," Ja":2310," In":1293," Ka":10047," Ke":1904," Ki":1728," Jo":1635," Gü":3188," La":2844," Le":1944," Li":3034," Ko":3863," Kr":1518," Ku":5027," Gö":1215," Ma":10963," Mi":3862," Me":5127," Lo":1852," Ne":2557," Na":1779," Ni":2442," Mo":2790," Mu":2369," Am":3459," An":5094," Ak":1699," Al":8247," Af":1009," Ad":2080," Ab":1075," Ba":8219," Ay":1468," Av":3029," At":1662," As":1754," Ar":5292," Be":4599," Bi":5804," Bo":2788," Br":2154," Bu":5532," Ca":3515," Ce":1828," Ch":2228," Co":2886," Cu":2385," Da":3328," Di":2745," De":6126," Do":2600," El":1340," Ek":1512," Es":1407," Er":2282," En":2338," Ey":1526," Fe":2195," Fa":2283," Wi":1521," Wa":1142," Yu":2383," Yo":1225," Tü":7497," Ya":4478," Ye":1681," a ":1377," Kö":1204," Os":2235," Or":2329," Po":6216," Pi":1404," Pe":2029," Pa":5030," Kü":1311," No":1821," Ol":1012," Ok":999," Oc":1088," Ra":1684," Mü":1801," Kı":2610," Ro":3268," Re":2723," Ri":1059," Pr":2202," Su":2042," St":2970," Ta":4359," Th":2488," Ti":1443," Te":4145," Tr":1513," To":2461," Ru":2999," Sa":7750," Sh":981," Si":3171," Sc":1940," Se":4078," So":2983," Va":1973," Ve":1421," Vi":2383," Tu":1612," Ul":1129," im":933," in":10201," ik":4076," il":20054," is":5383," ka":23639," ki":5592," ke":6355," eğ":1244," dı":1534," ha":11824," he":4045," gi":4830," gr":3277," dö":3823," dü":5597," id":1007," aş":930," hi":2120," ni":6427," nd":4080," ne":3117," na":1507," mu":1078," mo":2617," ok":1391," ol":35321," on":2426," of":1496," nu":1915," no":1087," le":1165," li":3624," gü":6331," la":1347," ku":18460," gö":9671," km":3282," ko":8297," me":8098," mi":3110," iç":9390," hü":1380," ma":6917," ad":10374," am":2886," an":8617," ai":1874," ak":2362," al":14807," ar":9911," at":1881," as":2441," d ":1776," ba":23010," ay":4744," bi":52706," be":11424," bo":2633," bu":15855," ca":1538," e ":1478," er":1285," et":4327," es":3101," en":6931," el":3328," ek":1681," aç":1971," fe":1277," fa":4483," ey":3388," ev":1229," fu":1090," fr":1901," fo":1846," fi":5962," ağ":1494," 
ge":16462," bü":4020," ga":1355," i ":957," bö":7468," ci":1625," da":23947," do":7108," de":26605," di":9385," ed":6003," du":2416," za":2588," yo":2330," sı":5340," ye":11522," tü":4969," ya":37365," sö":2004," sü":3398," yı":13517," yü":5685," yö":4365," nü":5050," sa":17392," se":8759," si":6463," sp":996," so":7754," kı":5201," ra":1785," mü":4235," re":3088," ro":2639," pr":3082," iş":3101," ot":2226," or":5403," oy":4366," kö":3581," pe":1612," kü":2525," pa":4625," pl":965," po":2333," pi":1501," va":2885," ve":49979," uz":2734," uy":2267," vi":955," nı":8640," tu":1641," un":2824," ul":1616," ta":27334," st":1812," su":2833," to":3720," th":1268," ti":1727," te":11491," İt":1484," İs":7380," İr":961," ın":4768," İn":4243," İl":1857," İm":1089," Çi":1348," Ça":1452," Öz":1296," Ün":1468," çe":4156," ça":3855," ço":4379," çi":1440," çı":3500," öl":1632," ön":4253," ör":1000," öz":4179," ür":2793," üs":1211," ün":2173," ül":2545," üz":3783," öğ":1347," üç":1563," şi":1684," şe":4952," şa":3567," Şa":2210," Şe":1156," Şu":1166,"İst":2249,"İta":1353,"İsp":2587,"İng":3329,"İmp":936,"ıca":1274,"ıda":1227,"ılı":15802,"ısa":1923,"ıra":2874,"ırl":2487,"ırm":2227,"ını":15301,"ımı":5151,"ıya":1118,"ırı":3492,"ız ":2539,"ıl ":1920,"ıdı":7570,"ık ":8525,"ıcı":2074,"ıla":12183,"ın ":32401,"ıld":3344,"ılm":5978,"ıll":2939,"ım ":3853,"ıka":2439,"ıkl":3108,"ıkt":1193,"ınl":2820,"ıp ":1063,"ınd":38554,"ına":8579,"ınm":927,"ıma":1331,"ımc":938,"ıml":3642,"ıs ":2159,"ır ":29899,"ıyı":1283,"ızı":1437,"ısı":6837,"ıyl":4411,"ızl":1239,"Çin":1025,"ığı":10132,"ış ":6594,"ışt":5588,"ışa":989,"ışm":2843,"ışı":2617,"Fil":1047,"Eyl":1137,"Eki":1120,"End":927,"Ağu":1069,"Gen":1382,"Gal":926,"Böl":1142,"Fra":3208,"II ":1197,"Haz":1331,"Her":938,"Hal":1040,"Har":940,"Dün":1753,"Hol":1623,"Ara":2840,"Avr":1832,"Bar":1214,"Bat":956,"BD ":2271,"Ada":1186,"Alm":3781,"Ame":2710,"Ana":1277,"Ant":1215,"Bu ":2451,"şıl":1169,"şık":1556,"şın":2204,"şı ":3074,"Bel":1045,"Bil":1105,"Baş":989,"Bir":3328,"Cum":2030,"Dev":1876,"Den":1321,"Cha":1022,"Doğ":1141,"Nis":1117,"Oca":1015,"Ort":943,"Osm":1787,"Par":1928,"Pro":985,"Por":4336,"Kan":1554,"Kas":1322,"Kar":2603,"Kon":1226,"Kra":1061,"Kur":1360,"Kuz":1311,"Gün":1851,"Mer":1131,"Man":1052,"Mar":3200,"May":1678,"Mil":1269,"çok":3674,"çeş":1324,"çla":1690,"çim":1273,"çin":7152,"Yun":1817,"Tür":7269,"Yar":1131,"Sta":1148,"Tem":1343,"Rus":2328,"Sch":1519,"Sav":1634,"San":1732,"Rom":1453,"Ulu":930,"The":1923,"ça ":3264,"çal":2740,"çe ":2241,"çes":2357,"çer":2328,"çev":1541,"çek":2799,"çi ":1341,"biy":1472,"bit":1248,"bir":44645,"bil":11436,"bin":1380,"baş":7621,"şa ":1752,"bol":2761,"şam":1786,"şan":2545,"şar":3761,"boy":1344,"ban":1510,"bak":1768,"bal":1557,"baz":1091,"bay":1165,"azı":4805,"bat":3330,"bas":2581,"bar":2153,"Şub":979,"bi ":3364,"ber":3678,"ben":1194,"bel":7483,"bes":1722,"bağ":8489,"şla":4267,"şle":3059,"şma":4426,"şme":1364,"ca ":7674,"car":1245,"can":1925,"cak":4366,"ce ":9532,"bra":943,"bu ":5377,"şek":1911,"şeh":2781,"şi ":1181,"bur":1706,"bul":11239,"bun":1530,"buc":1673,"şid":1048,"şit":1478,"şir":1286,"şin":976,"şim":2323,"şil":1127,"şik":1949,"şki":1113,"şke":1220,"şka":1851,"aka":3723,"am ":5523,"aki":6742,"adı":13553,"al ":11243,"ail":1471,"air":932,"ait":1247,"acı":3398,"ak ":21809,"abı":1039,"ahi":5926,"aha":4061,"anu":1353,"any":6219,"ano":1022,"ann":1052,"anm":4730,"ant":4742,"ans":6131,"ane":2508,"ang":2190,"ani":4880,"ank":1605,"anl":12840,"ap 
":1282,"ana":13099,"anb":2176,"anc":4224,"and":10443,"aml":2039,"amp":1789,"ami":2705,"ame":1745,"ama":13353,"aly":2496,"afı":8781,"alt":4529,"alm":2361,"all":3746,"alk":2064,"ali":6616,"ald":2079,"ale":5493,"ala":14214,"alb":2602,"an ":84156,"aks":970,"akt":8868,"akk":924,"akl":5036,"aba":3739,"abe":1525,"abi":3498,"abu":1168,"ae ":1012,"aca":2693,"ad ":1275,"ştu":4101,"şti":6965,"afi":1276,"ştı":7023,"ah ":1276,"ado":1356,"adl":2298,"adi":1602,"add":1987,"ade":4140,"ady":944,"ada":8970,"azi":2552,"azl":1262,"atı":8461,"aze":1261,"arş":2391,"aza":4710,"az ":2571,"ayn":2937,"ayl":1883,"ayr":2190,"ası":27693,"arı":35962,"aya":12842,"ayd":1055,"aye":1092,"âle":1449,"ba ":1088,"ayı":12386,"akı":5066,"at ":6328,"are":4111,"ard":9709,"arc":1066,"ara":45240,"arm":1435,"arl":5899,"ark":6902,"ari":9836,"alı":12304,"ars":1642,"art":5112,"asa":4734,"ary":1104,"asi":2326,"ask":1950,"ar ":19629,"apa":2097,"apm":1588,"apl":1156,"apo":1295,"apt":1792,"as ":3211,"ava":5024,"avr":943,"arç":1233,"avi":1237,"ay ":3414,"avu":1433,"apı":7957,"ata":3117,"ast":3088,"asy":2788,"amı":5927,"atm":1041,"apç":991,"atl":2228,"atr":1379,"ato":2437,"ate":2841,"ati":5438,"att":1422,"anı":18491,"Üni":1310,"Şar":1029,"ji ":1449,"jis":950,"itl":1849,"öğr":1364,"ito":956,"cı ":3493,"ism":1455,"ist":11668,"ita":3780,"ite":3995,"iti":3699,"cıl":1667,"üfu":4682,"iva":1157,"ive":2871,"ilç":2129,"is ":3578,"ion":3161,"ir ":71671,"irm":3093,"irk":1523,"irl":6644,"iri":12312,"isi":13092,"ise":2824,"isa":3166,"ire":3266,"ira":2633,"ird":1851,"it ":3783,"ünl":1704,"üni":930,"ünc":1623,"ünd":2862,"üne":6398,"üml":1023,"üme":1310,"ült":1479,"ür ":5331,"üny":3730,"iyi":1021,"ül ":1714,"iyl":2890,"iyo":3774,"iya":6327,"iye":14035,"üdü":2475,"ük ":5512,"cıy":983,"iz ":6631,"üle":2031,"ülk":2638,"üll":944,"üks":1988,"ün ":4631,"izm":2299,"izl":1160,"izi":3733,"izc":3709,"ükl":1155,"iza":1126,"üm ":2100,"kim":2642,"kil":4381,"kiy":3334,"kiz":4406,"kin":3419,"kis":1066,"kit":1608,"km ":1709,"ki ":15464,"eğe":1107,"eği":4944,"kel":2350,"ken":7944,"kes":1806,"ker":2097,"ket":3347,"kez":3142,"ke ":1164,"kra":1708,"kiş":2390,"kse":2982,"klı":3118,"km²":1573,"kor":1133,"kon":4434,"kom":1557,"kol":1873,"kle":9482,"kla":8235,"kli":3448,"dız":949,"dıy":1257,"dır":19814,"dın":2440,"dı ":5738,"kaz":1358,"kay":2732,"kat":3255,"kar":8976,"kas":2234,"kap":1914,"kan":7174,"kal":4961,"kam":982,"kad":3577,"kab":1637,"dış":1185,"ka ":4513,"dığ":4219,"ha ":3071,"cü ":1507,"ham":1081,"han":2727,"hak":1219,"hal":3743,"hav":968,"har":3475,"has":1125,"hat":1036,"haz":1057,"hay":1743,"he ":3168,"her":2799,"hen":1336,"hem":1060,"hi ":1444,"hip":4144,"hin":2793,"hil":1264,"hir":2302,"hle":2209,"ağı":5375,"gru":2866,"gra":2358,"gul":1221,"gue":2075,"dül":2427,"ian":1361,"dün":2104,"dür":3900,"ibi":3243,"düz":2369,"id ":1061,"iba":969,"ia ":4696,"aş ":1969,"ig ":1399,"ici":2058,"ich":1442,"ice":1186,"ie ":1088,"ica":1674,"idi":13477,"ide":3373,"ida":1813,"if ":1449,"düş":1411,"il ":5350,"im ":8509,"ika":5895,"aşa":4663,"aşl":3782,"aşm":1272,"aşk":2892,"aşt":1881,"ihl":1333,"ihi":3044,"ik ":16892,"iml":3173,"imp":924,"ime":2281,"imd":1567,"imi":5215,"ip ":4274,"inc":5308,"ind":30574,"ina":3996,"aşı":5939,"ino":995,"int":1436,"ins":2898,"ine":16354,"ing":3240,"ini":20786,"inl":1777,"iko":990,"ikl":5637,"iki":4685,"eçi":1614,"eçe":1013,"ila":2643,"in ":47078,"ikt":2801,"ilo":1062,"ill":5588,"ilk":4218,"ilm":10420,"ilg":3153,"ili":20573,"ild":2510,"ile":24870,"ima":2176,"io 
":955,"ily":1842,"hri":2688,"hur":2425,"dör":1115,"dön":2425,"fes":1014,"fer":1326,"far":1401,"eyâ":1363,"fa ":1026,"eyb":1488,"eya":8242,"eyl":966,"eyi":4871,"eyd":2007,"eye":3542,"ez ":1852,"ezo":1005,"ezi":2727,"eta":1696,"ete":2836,"eti":16248,"etm":3594,"etl":4019,"etk":1732,"est":3861,"ess":1296,"esw":1087,"ev ":1032,"etr":1981,"ett":2719,"eve":1133,"eva":954,"evl":3752,"erç":1693,"evi":3588,"evr":2587,"ey ":4964,"er ":28073,"es ":5122,"erk":5212,"erl":5910,"eri":48739,"erg":2108,"ere":9775,"erc":963,"erd":6317,"era":4204,"erb":1053,"et ":9016,"açı":2094,"esk":1828,"esl":1542,"esm":989,"esi":30715,"ese":2043,"esa":1128,"ert":1494,"ers":4580,"ern":2620,"erm":2340,"eki":11678,"ekl":5425,"açl":1635,"ekn":1222,"eko":1029,"eks":1790,"ekt":6067,"en ":46837,"ela":1161,"eld":2163,"ele":20479,"eli":10508,"elm":1130,"ell":5734,"els":966,"ema":2676,"eme":6814,"eml":3228,"emm":1228,"emi":6693,"ene":8543,"eng":1269,"ena":1544,"end":6506,"enc":1546,"enm":2094,"enk":1143,"enl":4600,"eni":13441,"ens":1925,"ent":5851,"enz":1180,"egu":1647,"ehr":2303,"ehi":2131,"ek ":9883,"aç ":1092,"ein":1910,"el ":11201,"eke":1777,"eka":935,"em ":2604,"öst":1805,"gis":1807,"gir":1360,"gil":5370,"önü":972,"geç":2375,"gin":1021,"gib":2269,"ört":1016,"öre":4482,"ölü":2891,"ölç":2720,"gi ":2386,"ör ":949,"gen":4928,"ger":3542,"ges":4599,"ağa":1186,"gel":7731,"ağl":9979,"önc":1623,"öne":7107,"ge ":2812,"ağ ":1140,"gaz":1147,"ölg":6551,"gar":1277,"büy":3008,"büm":2701,"gan":1892,"böl":7461,"fus":4728,"fut":963,"öyü":1007,"fre":2029,"for":2226,"öze":3441,"özl":926,"örü":2439,"fil":4052,"da ":52350,"de ":46250,"dak":5123,"dal":2767,"dah":2619,"das":1177,"dar":3775,"dan":21526,"dam":1680,"day":1218,"dde":1533,"cul":955,"cus":1172,"cre":977,"cu ":1846,"ch ":1055,"ces":998,"cek":1367,"cel":1540,"ci ":4802,"ck ":1696,"che":1266,"chl":1164,"cil":1965,"cis":1558,"cin":1363,"ed ":1577,"ebe":1387,"ebi":2611,"efe":1034,"edi":16774,"ede":8053,"eda":1066,"eci":1467,"ece":3057,"dyo":1078,"dur":5671,"duğ":3327,"don":972,"dol":1859,"dok":977,"diğ":3682,"doğ":5145,"dra":1187,"dlı":1288,"du ":2273,"dağ":1454,"der":6369,"des":1911,"dev":2820,"ded":1153,"del":2710,"dek":4272,"den":20219,"dem":1663,"di ":6071,"dla":1190,"do ":1033,"diz":1879,"diy":5465,"din":3013,"dir":24487,"dis":2432,"dik":1433,"dil":6616,"değ":2872,"rga":1818,"ri ":19287,"rgi":1489,"rge":959,"ret":5334,"res":4290,"rev":1391,"rdu":2054,"rg ":1524,"rec":1008,"red":1084,"reg":1836,"ren":6966,"rek":4543,"rel":1942,"rda":4941,"rdi":3615,"rde":5880,"re ":11153,"ray":2290,"müz":3407,"rd ":1472,"rap":1757,"rar":2927,"ras":9820,"rat":4222,"rbi":1042,"rba":1205,"mün":3661,"ran":10600,"ram":4044,"ral":5875,"rak":15812,"rab":1433,"raf":10463,"rad":2346,"rac":1216,"rs ":1268,"ros":1512,"rot":967,"rom":2121,"ron":1936,"rol":2443,"rkç":1463,"rog":1323,"rna":1248,"rne":1926,"ro ":1843,"rma":6313,"rme":4031,"rmi":1407,"rlu":1118,"rli":4427,"rle":9208,"rla":6927,"rki":3194,"rkl":1461,"rke":5713,"rka":1543,"rdı":3751,"riy":3959,"rit":1940,"ris":5765,"rih":5101,"müş":1050,"raş":1079,"ril":8019,"rik":4914,"rin":23896,"rim":3407,"ria":1426,"rdü":1150,"ric":1498,"rid":4314,"rk ":5291,"lıl":999,"lık":8208,"lın":9443,"lım":1113,"lır":2906,"rya":1228,"rup":3018,"run":1284,"rum":2945,"rul":5547,"ry ":1364,"rsi":2903,"rsa":1025,"rta":4786,"rte":4662,"rti":1858,"lı ":21277,"rub":2412,"rt ":3885,"rkı":3548,"ru ":1610,"rlı":1952,"sab":1070,"sad":1069,"nüf":4498,"sah":5074,"sal":4864,"nüm":1062,"san":6337,"nün":1717,"sat":1316,"sar":2328,"say":5210,"sav":1790,"sa ":4002,"nü 
":1409,"lış":2586,"lığ":4204,"si ":18237,"sağ":1738,"siz":1136,"siy":3676,"sid":3408,"sia":1857,"sit":3495,"sis":2770,"sin":18799,"sil":2083,"sim":2963,"sik":1925,"se ":2953,"ser":3758,"ses":2131,"sen":2086,"sem":1040,"sel":4229,"sek":2392,"spo":1131,"spa":2896,"son":6601,"su ":5315,"st ":3445,"slu":1935,"sla":3121,"sle":1769,"ski":2726,"ske":1671,"sma":2231,"smi":1955,"mın":4103,"swi":1113,"stü":1161,"sya":2595,"syo":3350,"ste":9865,"sta":9233,"sto":2278,"sti":3825,"stl":934,"str":1911,"mı ":4565,"sun":2855,"tak":3619,"tal":5550,"tab":2400,"tad":4726,"tay":1623,"tat":1503,"tas":2835,"tar":17331,"tap":1242,"tan":13930,"tam":2113,"te ":6912,"tbo":1793,"ta ":6535,"mış":10404,"pa ":2055,"par":4696,"kül":1832,"küm":1078,"pan":3919,"per":2301,"küç":956,"pla":3634,"ple":961,"plu":1414,"lçe":2541,"piy":2262,"por":1907,"pon":1081,"pol":1613,"lçü":2547,"pti":1099,"pra":1013,"pro":2802,"ptı":1375,"lü ":3559,"lüm":2580,"lül":1275,"lük":1015,"iş ":5374,"işi":4832,"işk":1088,"işl":2771,"işt":5462,"kı ":2158,"kıl":1124,"kım":2874,"kıs":2851,"kın":2434,"kıy":1259,"ra ":9857,"mü ":1475,"ngi":4542,"ni ":12017,"nge":1612,"ncü":1344,"nel":4863,"nek":1315,"nen":2962,"nem":4576,"ner":2342,"net":4444,"nes":2204,"ng ":2974,"ned":1504,"ney":4102,"nci":4377,"nce":5236,"nca":4009,"ne ":19615,"nbu":2438,"ndu":1472,"ndr":1044,"ndo":1041,"ndi":6541,"nde":35967,"nda":48013,"ncu":1481,"nak":1647,"nal":3192,"nam":1052,"nan":13136,"nar":2141,"nad":2145,"nd ":2524,"nat":3318,"nas":1441,"nay":1104,"na ":14803,"muş":4297,"nya":10222,"nun":5903,"nus":1510,"nuc":968,"nto":981,"ntr":1143,"nti":4165,"nta":1967,"nte":3585,"nmı":2809,"nst":987,"nse":1596,"nsi":1251,"nsa":3394,"nu ":4463,"nlı":4882,"nra":2468,"nt ":2440,"niş":970,"ns ":1946,"nlü":1339,"nom":2114,"nne":1221,"nme":1944,"nma":4498,"nmi":973,"nli":2503,"nla":14247,"nle":4649,"nlu":1296,"nka":1369,"ndı":4155,"nic":987,"ndü":1181,"niy":1052,"niz":3806,"ncı":1170,"niv":2157,"nis":1976,"nir":2294,"nim":1125,"nin":20306,"nik":2175,"nil":944,"ogr":1717,"ok ":4093,"oji":2722,"ol ":4011,"ock":1051,"ode":1635,"of ":1645,"iği":9467,"iğe":1929,"obi":987,"nsı":2063,"köy":1994,"oyu":4442,"oyn":1402,"oto":3517,"osy":1120,"ost":1210,"osu":1204,"ovi":1205,"ova":1326,"opl":2685,"os ":3304,"çıl":1025,"çık":4300,"or ":3792,"ork":1035,"orl":1782,"orm":2268,"ord":2039,"ore":1088,"org":1822,"ori":2004,"ort":8149,"oru":2784,"m² ":1578,"ora":1605,"ola":24963,"old":3573,"olc":925,"on ":12904,"oli":2380,"oll":2123,"ole":1383,"ols":1253,"olm":3895,"olo":4194,"olu":8661,"om ":1843,"kçe":1715,"okt":974,"oku":2095,"ona":2977,"ond":1625,"one":1566,"oni":1789,"onl":1840,"ono":2239,"onr":2464,"ons":1619,"ont":1468,"onu":7635,"ony":1563,"oma":3785,"ome":1724,"omi":1625,"omo":1197,"la ":11386,"le ":19263,"lde":3058,"ldi":2187,"ldu":3454,"lab":1412,"lac":926,"lad":1565,"lah":1109,"lak":1159,"gün":5583,"lan":43556,"lam":9121,"lar":57894,"lat":2773,"las":2417,"lay":4711,"ld ":958,"kuz":3055,"kur":7540,"kul":7562,"kta":6976,"kte":6402,"ksi":2396,"ktr":1368,"kti":2217,"gös":1720,"gör":5828,"ktı":2504,"lon":1166,"liğ":4787,"loj":2682,"lmi":5563,"lme":4109,"leş":5292,"lma":11588,"lmu":3175,"lst":1073,"lmı":3998,"lta":1114,"lte":1773,"lu ":7438,"llı":1332,"liş":3210,"lt ":1120,"lge":7089,"lgi":3114,"li ":16772,"lbü":2715,"lga":999,"ley":2404,"lev":1310,"les":3725,"let":9791,"ler":45886,"lem":4975,"len":16099,"lek":2724,"led":5291,"lec":1154,"lo ":939,"lla":11633,"lle":6955,"lli":5594,"lke":3179,"lm ":1714,"ldı":4073,"ll 
":1428,"lit":1562,"lis":4164,"lir":4755,"lin":9226,"lim":4866,"liz":3501,"liy":1446,"lid":1846,"lia":946,"lk ":5135,"lik":9172,"lil":1002,"laş":3620,"ma ":9869,"mac":2119,"mak":8947,"mad":3045,"mar":2763,"mas":9043,"mal":4481,"mam":1405,"man":15375,"may":3231,"mat":2471,"me ":6043,"mda":1303,"mde":1460,"mdi":1706,"med":2057,"eş ":1039,"met":4942,"mes":5862,"mer":6138,"mel":3135,"men":5922,"mek":5999,"maç":1261,"mey":2238,"çüm":2197,"çük":1139,"luk":1924,"lup":1601,"lun":8327,"lum":1008,"lus":2340,"fın":8889,"lya":3819,"luğ":2080,"ltı":2446,"luş":5045,"mpi":1862,"mod":1329,"mon":1220,"mpa":1588,"mu ":1170,"miş":7571,"mun":1116,"muz":1291,"mhu":2129,"eşm":1123,"eşt":1717,"mi ":6405,"eşi":5221,"min":6521,"mil":2577,"mir":1395,"mis":1028,"mcı":1046,"mit":1204,"mid":2070,"mik":1507,"mlu":936,"mli":2428,"mle":5174,"mla":5844,"içi":8066,"içe":2201,"mmu":1223,"uğu":5772,"tı ":4071,"zun":1539,"tıl":3307,"tın":2882,"tır":10360,"tıs":1266,"zyo":962,"tığ":1565,"zi ":3100,"zet":1053,"zey":4798,"zen":2488,"zel":3272,"zer":6678,"ze ":1245,"zce":3650,"zde":1081,"zam":2429,"zan":2260,"zak":1080,"zar":3022,"zon":1273,"zme":978,"rşı":2081,"zla":2044,"zgü":956,"zle":1804,"zin":1617,"zik":2532,"zir":1408,"zis":1290,"yum":1152,"yun":4454,"sı ":14744,"ynı":1581,"ylü":1204,"yol":2904,"yor":1353,"yon":7554,"yrı":1788,"sıd":1928,"sıl":1373,"sım":1355,"sır":2328,"sın":16353,"sız":2708,"sıy":986,"ye ":8402,"yda":1138,"yed":1753,"yes":4583,"yer":7744,"yen":3683,"yel":1262,"yet":5659,"ya ":30890,"rış":2333,"yba":1101,"yaz":5444,"yay":4498,"yat":3352,"yar":6010,"tür":5785,"yas":4408,"yap":11061,"tün":1161,"yan":10579,"yal":5013,"tüm":1022,"yak":2748,"ydı":961,"yla":6744,"yle":4770,"yo ":1109,"yna":2824,"yi ":2354,"ygu":1552,"yin":3381,"yaş":2680,"tör":1578,"rı ":14655,"rım":1734,"rın":16497,"rıl":3423,"rıs":1452,"sür":2901,"söz":1499,"sça":1363,"wig":1137,"rü ":1492,"rün":2219,"rül":1328,"vru":1839,"vri":1244,"vre":1449,"vra":964,"pıl":3787,"pım":2266,"vil":989,"vaş":2523,"viz":960,"vis":1122,"rça":1249,"rçe":1701,"vle":3463,"vi ":1409,"vey":5886,"ver":8628,"vet":1190,"ven":1213,"ve ":40015,"val":1203,"van":2336,"var":3116,"va ":2329,"uzu":1436,"uze":4115,"uyu":1005,"uza":1667,"uyg":1403,"uya":1017,"uz ":2525,"usç":954,"uva":1017,"usl":3168,"usa":1561,"usu":5699,"ust":2940,"utb":1431,"us ":4706,"ut ":1302,"ura":2627,"urd":1061,"urg":1888,"uri":2897,"urm":935,"uro":982,"urt":1354,"uru":9050,"upa":2517,"ur ":10047,"umu":1897,"umh":2127,"uml":1645,"uma":1385,"unu":6201,"unl":2635,"unm":1881,"unc":2040,"und":5507,"una":8329,"up ":2721,"ukl":1910,"um ":2957,"ulu":12477,"ult":959,"ulm":2979,"ull":6976,"ula":6334,"un ":10772,"uk ":2310,"ul ":3821,"uha":1114,"ucu":1955,"udi":996,"ubu":2444,"uca":2065,"oğu":5104,"oğr":1470,"ues":1785,"udu":3053,"oğa":1679,"oğl":957,"uba":1499,"tur":6494,"nır":2256,"nıl":5540,"nın":19519,"nım":1872,"nıf":1036,"tre":1553,"tra":2671,"tri":1839,"tro":3501,"tte":1278,"tti":2653,"nı ":11130,"tme":3621,"tma":1695,"to ":1507,"tiğ":1447,"tos":1656,"tom":1446,"ton":2840,"tol":991,"tor":3257,"top":3422,"til":2825,"tik":5330,"tif":1158,"taş":2105,"tir":10735,"tis":1905,"tin":8295,"tim":3897,"tio":1845,"tic":1549,"tid":1024,"tiy":1388,"tki":2141,"pça":1027,"tli":1970,"tla":3863,"tle":6323,"tem":6048,"ten":3191,"tei":1620,"tek":7857,"tel":3048,"ted":3509,"th ":1012,"tes":4105,"ter":10441,"ti ":6632,"the":1663,"üşü":1357,"üğü":1444,"zı ":1890,"zıl":2487,"üç ":948,"zöl":2079,"üçü":1824,"yı 
":2418,"yım":2005,"yıl":14867,"yın":3653,"yıs":2728,"yük":5408,"yüz":3683,"yön":3982,"üzö":2079,"üyü":3989,"üzi":2596,"üze":7514,"üye":1004,"ütü":1249,"üsü":1036,"ürü":4447,"ürk":7688,"ürl":1211,"üre":5286,"ümü":6554,"ülü":2192,"üs ":1086,"üst":1769,"ünü":3990,"uş ":2901,"uşu":1743,"uşt":4122,"uşa":1668,"yâl":1370},"n_words":[9192208,10449574,7620193],"name":"tr"}
\ No newline at end of file
+++ /dev/null
-{"freq":{"D":2686,"E":45335,"F":1931,"G":2251,"A":4545,"B":2792,"C":4259,"L":2592,"M":3964,"N":23611,"O":1971,"H":1743,"I":29162,"T":3063,"V":2399,"P":3688,"S":26730,"R":2282,"X":2695,"f":3088,"g":5064,"d":7920,"e":26938,"b":4218,"c":9825,"a":26142,"n":18042,"o":19695,"l":12837,"m":7697,"k":2703,"h":7896,"i":22805,"w":2062,"v":2426,"u":10520,"t":16747,"s":15016,"r":19222,"p":6493,"y":4268,"x":1842,"́":45988,"ь":314859,"ю":121712,"я":259056,"ш":55425,"щ":34352,"ф":70318,"х":152145,"ц":190380,"ч":157844,"р":767814,"с":616542,"т":784384,"у":465611," t":1584,"є":83027,"ї":138148,"і":1047876," p":1613,"Є":3706,"І":14787,"Й":2127,"Л":22219,"К":34440,"Н":39528,"М":28217,"П":47409,"О":15488,"Б":23123,"А":35851,"Г":15610,"В":31676,"Е":8630,"Д":40073,"З":11772,"Ж":3520,"Ш":12613," I":24900,"Ю":2003,"Я":3891," L":1894," M":2910," B":2299,"Т":14847," C":3270,"У":18037,"Р":27850," A":3247,"С":43546," F":1599,"Ц":5942,"Ч":5807," G":1797,"Ф":32198," D":1850,"Х":6705,"л":484150,"к":541520,"й":174507," X":1787,"и":768429,"п":339521,"о":1248992,"н":1274404,"м":381536,"г":226618," S":3616," R":1762,"в":579991,"б":162019," P":2630,"а":1251055,"з":257293,"ж":95383,"е":806087," T":2138,"д":454853,"Ґ":1990,"ґ":4623," А":31736," Б":22701," В":30948," Г":15174," Д":39397," Е":7917," Ж":3460," З":11194," Й":2051," К":32754," Л":21825," М":27530," Н":36830," О":14175," П":46014," Є":3619," І":13525,"EE":21655,"I ":2684," б":36681," а":43891," г":34855," в":154625," е":34325," д":117491," з":113779," ж":8768," й":6819," л":26339," к":61923," н":123337," м":87814," п":148797," о":74913," Р":21137," С":39028," Т":13848," У":16956," Ф":31501," Х":6271," Ц":5651," Ч":5720," Ш":10618," Ю":1964," Я":3806," ї":5196," і":83499," є":13641," т":92032," у":89129," р":101219," с":120858," ц":13483," ч":24594," ф":40198," х":11362," ш":8917," щ":18497," ю":2119," я":36739," Ґ":1984,"E ":21977,"NS":21624,"II":2127,"IN":21640,"SE":21638,"a ":4767,"i ":1589,"he":1740,"el":1571,"en":2535,"es":2308,"er":4659,"e ":6591,"d ":1983,"at":2458,"ar":2758,"al":2232,"an":3512,"ac":1574,"nt":1885,"on":3299,"or":2694,"r ":2633,"o ":1648,"na":1574,"ni":1555,"li":1877,"le":1913,"la":1826,"n ":4271,"ic":2243,"ia":1621,"is":1891,"in":3535,"io":1638,"l ":1929,"y ":1830,"us":2293,"um":1631,"te":2376,"ti":2183,"th":1945,"ta":1627,"st":1645,"ro":2012,"ri":2866,"re":1930,"ra":2584,"t ":3059,"s ":6868,"́ ":2290,"́в":3741,"́д":2078,"́к":2065,"́й":3031,"́м":2120,"́л":4305,"́н":8997,"́р":4428,"́с":2026,"́т":2528,"А ":2905,"В 
":2961,"ьє":2885,"юв":4387,"юд":3425,"юр":3284,"ют":14456,"юч":4396,"ює":2722,"яд":6152,"яг":3108,"яв":2990,"юю":1542,"ян":10771,"ям":9107,"ял":2194,"як":26414,"яз":4329,"ях":3443,"ят":9738,"яр":2263,"яч":4477,"яє":2354,"ші":2899,"щи":3380,"ще":5720,"ща":2636,"що":19501,"щі":1589,"ьк":115778,"і́":5805,"ьб":1841,"ьв":2256,"ьш":4727,"ьс":24456,"ьт":4512,"ьн":50183,"ьм":4831,"ьп":5070,"ьо":12701,"фі":13912,"хи":2735,"хн":7629,"хо":17158,"хр":2067,"ху":3902,"ха":7837,"ци":27077,"хі":11714,"цт":3698,"цу":23008,"це":16672,"чл":2502,"чн":59778,"чо":6493,"ці":93529,"чи":16012,"чк":3842,"чу":3146,"ць":14520,"че":19603,"ця":6177,"ча":25750,"цю":1804,"шо":7889,"шн":3282,"шк":3613,"шл":2107,"ши":11930,"шт":3498,"шу":1879,"чч":1777,"чі":4713,"ше":8200,"ша":4745,"ск":15952,"см":3119,"сл":40211,"со":21185,"сн":22933,"сп":20278,"св":10260,"се":44728,"си":21636,"рі":55323,"рш":6088,"рю":2898,"са":14767,"ря":9090,"рр":2512,"рс":15180,"рт":37524,"ру":28810,"рх":8486,"рц":1712,"тн":17002,"тл":3183,"тк":8278,"тс":3967,"тр":48477,"то":75584,"те":71719,"тв":21013,"ти":129473,"сі":43933,"сь":82109,"та":153881,"ся":27435,"сю":2514,"су":15433,"сс":3485,"ст":199868,"сх":2897,"сц":3318,"ур":24405,"уп":9084,"ут":35010,"ус":11964,"ум":8171,"ул":17666,"ун":34351,"ті":36808,"уз":30153,"ук":20646,"уд":13417,"уг":6510,"уж":3934,"тю":2714,"уа":9949,"тя":7369,"уб":6901,"ув":18706,"ть":44694,"тт":6769,"ту":74115,"фу":3164,"фт":2346,"фр":25365,"фо":9781,"фе":6303,"ує":8184,"фа":4406,"ую":4341,"уч":6062,"уш":1880,"ух":4146,"іш":7237,"іц":26077,"іч":43837,"іє":28690,"їв":4260,"ію":3757,"ія":30142,"ін":70416,"ім":15479,"іл":34864,"ік":38962,"ій":44252,"із":21994,"іж":6237,"іх":3168,"іф":1617,"іт":52153,"іс":42072,"ір":18028,"іп":25092,"іо":58208,"ів":71042,"іг":5632,"ід":79007,"іа":11156,"іб":26830,"її":1916,"ії":55045,"їн":18326,"їх":2476,"єю":25186,"єм":3400,"єн":2727,"єт":11907,"єр":2623,"єд":3075,"єк":1686,"єв":4497," IN":21609,"а ":272631,"Р ":3804,"У ":2439,"Єв":1986,"Ів":2523,"Ін":2522,"Іл":1807,"к ":38380,"й ":109949,"Ле":2968,"Ла":4470,"Ку":2193,"Ко":9293,"м ":60876,"Кр":3269,"Ки":4604,"Ка":8275,"л ":10034,"Йо":1971,"На":28658,"Не":1779,"Мі":3387,"Мо":5475,"о ":171358,"Ма":9118,"Ль":1829,"Ми":3400,"Лі":2822,"Ме":3642,"н ":48334,"Ло":1601,"Лу":5897,"Па":5129,"Пе":6590,"По":6892,"с ":17750,"р ":43282,"Ос":1646,"Ор":1949,"Ні":1710,"Ол":3009,"Но":2027,"п ":4596,"в ":98488,"Ам":1689,"Ан":4049,"Ак":1666,"Ал":7021,"Ав":2330,"Ба":3984,"Ар":5755,"б ":28913,"Во":3994,"д ":30432,"Ве":7201,"Ви":2444,"Бі":2125,"Га":3476,"Бо":3837,"г ":7123,"Бр":2821,"Бе":3301,"Ва":4390,"Бу":4720,"Дж":1933,"Де":3400,"До":4198,"ж ":10290,"Ві":6492,"Ге":2702,"Гр":3253,"Го":2499,"е ":51777,"Да":23258,"и ":147695,"За":6302,"з ":38125,"Ен":2243,"ь ":77560,"е́":6375,"Ша":6268,"Ше":1818,"и́":5895,"ю ":78537,"я ":164366,"Ст":3877,"Су":1648,"Та":2989,"Сі":1751,"Те":3497,"То":1720,"ф ":2068,"Тр":1849,"Ук":9250,"х ":93221,"РС":2535,"Пр":9085,"Пу":2178,"Ра":2736,"Ре":2944,"Пі":12505,"Ри":1682,"СР":4182,"т ":74217,"Ро":9385,"Ру":1597,"СШ":1868,"Са":5497,"Св":1748,"Си":1634,"Се":8345,"Со":4417,"у ":182563,"Це":4350,"ш ":3438,"а́":12498,"Че":2490,"ША":1865,"Фр":25493,"Фе":1857,"Ха":2266,"ч 
":13144,"ль":97373,"лю":13039,"мб":2910,"ма":43992,"ля":27148,"ме":56645,"лі":100876,"ми":33489,"мл":2674,"мк":1772,"лл":4063,"ло":47531,"лу":11894,"ла":56466,"ле":64714,"лк":3429,"ли":41628,"кі":53446,"км":1768,"кн":2648,"кл":18117,"кр":33069,"кс":9526,"ко":160847,"кт":21418,"ку":30802,"кц":4113,"ка":63693,"ки":83769,"кв":7446,"ке":9654,"йн":15208,"йо":12518,"йк":1565,"йл":1986,"йм":3040,"йс":17098,"ия":1553,"ищ":6034,"иш":3403,"иї":2680,"у́":2267,"иє":2892,"йд":1750,"йб":2838,"ип":31662,"им":32760,"ин":52951,"ик":67385,"ил":14249,"ий":79035,"иц":14181,"ич":30391,"иф":2474,"их":75898,"ит":52472,"ир":15232,"ис":66975,"ри":69723,"рк":9097,"рл":2627,"рм":12337,"рн":24183,"ро":132547,"рп":3819,"ра":159212,"рб":3520,"рв":5932,"рг":12811,"рд":11789,"ре":97753,"рж":5968,"пі":30461,"пр":61701,"пт":2856,"пс":1790,"пу":8363,"ої":42339,"пи":15610,"пн":6007,"по":70389,"пл":11153,"ою":42263,"оя":1743,"па":69692,"оє":2366,"пе":54812,"ощ":2597,"ош":4991,"оч":9700,"оц":9082,"ос":101143,"ор":94904,"оп":23531,"оо":1738,"ох":8570,"оф":6129,"от":23405,"ок":38039,"ол":59373,"ом":93606,"он":134604,"ож":13014,"ні":182081,"оз":26801,"ой":3869,"ов":119233,"ог":97492,"од":63564,"ое":3185,"ню":3932,"ня":95444,"об":38077,"нь":43392,"нц":54211,"нш":5309,"нч":1583,"нт":57484,"нс":58539,"нф":2536,"ну":21092,"но":153491,"нн":89867,"нк":13365,"мі":70237,"ни":149376,"не":44107,"нг":6365,"нд":19006,"на":216248,"му":49313,"мс":3566,"мп":12768,"мо":32397,"мн":5261,"мм":1713,"ге":9243,"ві":92696,"ги":4209,"гн":2639,"го":97928,"гл":6748,"гр":20635,"гу":9031,"дв":4924,"дб":1844,"да":33332,"вд":7688,"ве":48786,"вж":2823,"бі":16889,"ви":75994,"вк":5403,"вл":11314,"вн":39402,"є ":23248,"во":52805,"вп":2102,"вр":5405,"вс":21071,"ву":11922,"вт":7129,"вч":4263,"вц":1693,"га":24943,"вя":2583,"би":5176,"аї":20255,"ає":15656,"бе":15560,"бр":8733,"бн":4172,"бо":24369,"бл":17974,"бк":1672,"бу":17942,"бс":1904,"ва":79944,"ад":39043,"аж":6697,"аз":20777,"аб":18055,"ав":68705,"аг":13743,"ам":54378,"ан":186015,"ап":12553,"ай":22198,"ак":29485,"ал":112843,"ах":20860,"аф":6570,"ач":13762,"ац":41300,"ас":69626,"ар":92013,"ау":6764,"ат":76559,"ба":13923,"аю":6775,"аш":6416,"зт":3423,"зр":3339,"зп":3833,"зу":8061,"зк":2540,"зи":11355,"жі":2914,"зо":12242,"зн":22505,"зм":7721,"ив":25953,"иг":5812,"иб":4671,"иж":2800,"зі":9568,"из":10688,"ид":11644,"зь":26870,"жо":4282,"жу":4654,"еї":6545,"жи":9686,"жн":10415,"за":70302,"зб":4513,"зв":12953,"зг":1786,"зд":4686,"зе":9240,"еф":2808,"ет":48754,"ес":21933,"ер":123318,"еп":28664,"ї ":106019,"ео":7564,"ен":200818,"ем":25862,"ел":52238,"ек":44785,"ей":12736,"ез":15415,"ді":60214,"еж":8691,"же":36749,"ея":2396,"жа":10671,"еч":5677,"еш":1971,"ех":4631,"ец":12416,"дс":7505,"др":11609,"ду":13511,"дн":33993,"дм":3635,"дп":3899,"і ":217878,"до":70187,"ди":35057,"дл":9508,"дк":6162,"де":83166,"дз":2175,"гі":41117,"о́":9048,"дж":30353,"еб":4669,"ев":15708,"ег":29992,"ед":62088,"дя":4582,"еа":4570," ар":4635," ба":6422," аб":10662," ав":3504," ад":2033," ал":3311," ак":3721," ан":4873," ам":2325," бу":8984," ва":2979," бе":6626," бр":1983," бо":3438," бл":2347," ву":2932," га":4235," бі":6025," ви":31633," ве":7581," во":6259," є ":4843," вс":3250," вл":2695," дв":2986," да":3768," гу":2423," го":6549," гр":11679," ге":4251," ві":52058," до":45102," і ":33511," др":3031," де":37135," гі":3754," ди":4078," дл":8069," ел":3252," ек":24873," ді":8202," зд":2257," зе":1996," за":56665," зв":4345," зб":3336," жо":2604," жи":2736," зм":2726," зо":3274," зн":5228," зі":1585," йо":3069," ка":7109," кв":3523," кр":6792," 
ко":25840," кн":1684," кл":3559," ку":3737," ла":4131," кі":5040," ли":5653," ле":3016," ме":10350," лі":7010," ми":2991," лю":4680," ма":12887," мо":13377," му":26214," ни":3719," мі":18386," не":11146," на":101285," но":3271," ок":3237," оз":1668," ні":2363," од":9316," об":14076," от":1642," ор":5923," ос":30485," оп":3801," по":44438," пл":4775," пи":3225," пе":16844," па":8207," Ре":2937," Пі":12498," Ра":2732," Ро":9378," Ри":1682," Пу":2177," Пр":9060," Пе":6585," Па":5105," По":6869," Ос":1642," Ор":1944," р ":1902," Те":3490," Сі":1746," То":1716," Тр":1847," Ст":3864," Су":1643," Та":2975," Св":1745," Си":1629," Се":8339," Со":4411," у ":67844," Ру":1597," СШ":1865," Са":5492," Фр":25493," Фе":1855," Ук":9235," Це":4345," Ха":2264," Ше":1818," Ша":6265," Че":2486," я ":1678," Ба":3980," Ар":5747," в ":34140," Ан":4045," Ам":1689," Ал":7017," Ак":1665," Ав":2326," Ва":4387," Бу":4716," Бо":3831," Бр":2819," Бе":3297," а ":3713," Дж":1930," Де":3390," До":4191," Ен":2238," з ":28463," Га":3472," Ве":7185," Ви":2437," Бі":2122," Во":3987," Да":23253," Ге":2696," Ві":6479," Го":2496," Гр":3249," Йо":1970," Ки":4602," Ка":8259," За":6236," й ":3420," Мо":5471," На":28633," Не":1775," Мі":3382," Но":2025," Ол":3009," Ні":1705," Ко":9278," м ":2294," Кр":3260," Ку":2189," Ла":4467," Ле":2966," Ло":1600," Лу":5896," Ль":1829," Ма":9106," Лі":2816," Ме":3636," Ми":3397," У ":1855," Єв":1986," Ів":2522," Іл":1806," Ін":2499," В ":2749,"Шам":2267,"Шар":2655,"INS":21596,"Кар":1691,"Кор":2456,"Киї":2310," єк":1604," єд":2480," їх":2188," іс":5151," із":3768," ім":4088," ін":33850," її":1897,"Луа":4691," ра":11678," ре":33133," пі":17870," ри":2122," ро":38150," пр":49254," св":7646," рі":9410," си":9133," се":11547," сл":3167," см":1572," ск":6896," сп":12079," со":4130," ру":3107," са":3132," сі":4244," ти":4115," тв":2718," те":13347," то":7539," тр":8770," сх":2630," ст":41843," су":7442," сю":1727," та":50935," ук":7367," ті":1840," ус":2008," ут":2182," ун":1655," фо":4647," фр":23569," фу":2570," фа":2336," уч":1717," хр":1639," хо":1776," ху":1552," фі":5255," ха":2328," ци":1770," це":7308," чо":1959," чл":2302," чи":4976," ці":2356," че":5747," ча":8926,"Льв":1624,"Мар":4041," шт":1850," що":17557," ян":1885," яз":3165," як":23816," ят":2201,"Мик":1636,"Мон":1603,"Нас":21926,"ад ":3964,"ав ":5473,"EE ":21630,"ам ":4361,"ан ":7774,"ак ":2945,"ал ":2859,"ай ":1706,"Оле":2078,"авч":1579,"авт":2891,"ага":5972,"аві":2505,"аго":2958,"ада":8317,"ади":3697,"аде":2606,"аду":2146,"адс":2241,"адо":2036,"адм":1600,"адя":2411,"аді":3733,"би ":2028,"ажа":1619,"або":10777,"абе":1768,"ава":4168,"авн":9017,"авл":3400,"авс":2814,"ає ":8442,"аво":4040,"аве":24071,"ави":3333,"бо ":10197,"ало":3744,"алу":3259,"ала":5763,"али":6052,"акі":1706,"але":7492,"амо":2377,"амп":2753,"ама":2782,"аль":45514,"ами":10086,"аме":25683,"алі":34159,"анн":20969,"ано":8600,"ану":2414,"анс":11789,"ант":9369,"анц":47104,"ань":4811,"ана":9630,"анд":6858,"анг":2311,"ани":9657,"амі":2404,"ане":2623,"анк":2826,"ані":36761,"азу":2305,"азо":3692,"ази":2579,"азв":4425,"аза":2057,"азі":1724,"айс":1694,"айо":6278,"айн":1941,"айб":2377,"акт":6740,"ако":7371,"ака":2878,"ах ":10031,"Пар":1681,"ас ":3290,"ар ":5874,"ат ":4127,"СР ":2617,"ба 
":1557,"Пет":1744,"Пер":2277,"Пол":2322,"РСР":2369,"При":2113,"Про":3790,"Пуа":1541,"Пік":2365,"Пір":3646,"Рос":2415,"Роз":1581,"Рон":3384,"Пів":4853,"Аль":5551,"Сан":1563,"США":1862,"Сер":1775,"Сен":4479,"Ста":1577,"Ард":2815,"Тер":1748,"Вол":2219,"Вер":4116,"Вел":1833,"Укр":9121,"Бур":2431,"Фра":24714,"Дан":22031,"ША ":1853,"а́н":3396,"Цен":2677,"Чер":1615,"NSE":21595,"лам":1827,"лан":5122,"лас":12503,"лат":2370,"ма ":7369,"ля ":13589,"лав":3656,"лад":10782,"кці":3993,"ль ":10569,"кул":3808,"кур":1627,"кою":25572,"кої":15644,"кре":3214,"кра":21169,"кри":4236,"кро":1703,"лу ":3819,"кса":1966,"кте":2165,"кти":3833,"кто":5082,"ктр":2798,"кту":3015,"кла":11761,"ло ":6223,"клю":1692,"клі":1698,"ког":16518,"ков":13165,"ком":15826,"кон":32464,"коп":2254,"кор":9858,"кос":2132,"кож":4742,"кол":8207,"ким":4246,"кий":27960,"ких":8957,"кві":3598,"ле ":3371,"кі ":7394,"ли ":7619,"кер":1599,"ква":2483,"ках":2083,"кат":2360,"кар":5682,"кам":2944,"кан":6220,"кал":2575,"каз":1777,"кад":1810,"ла ":13040,"Іва":2158,"йсь":13547,"кт ":1727,"ку ":18769,"йна":1852,"йно":3774,"йни":5036,"йов":2120,"йог":2770,"йон":6057,"ко ":7248,"иїв":2473,"ки ":39616,"ке ":4049,"йбі":2012,"од ":2452,"нах":3061,"нац":23841,"нау":3625,"наф":1792,"нач":7238,"ог ":2185,"нан":4567,"нам":2788,"нал":31101,"нат":2955,"нас":4427,"нар":6454,"нап":3620,"над":4116,"нак":1904,"най":5486,"наз":4802,"нде":1705,"нда":2557,"ож ":4256,"нгл":1926,"неї":3826,"нен":4632,"нер":5081,"нес":1886,"нет":1977,"нец":1734,"нев":2650,"нез":1549,"нді":4444,"ні ":100559,"ндр":3680,"мії":2239,"ник":15713,"ний":35532,"ок ":8024,"мір":2620,"міс":12191,"мік":1956,"мін":9879,"міч":24763,"між":4232,"нь ":34487,"ня ":85321,"ню ":1561,"ов ":5346,"нав":26133,"об ":3066,"мпо":1641,"нт ":27692,"мпе":2663,"мпа":3905,"ну ":12904,"мпі":1622,"мсь":1988,"мун":23434,"муз":3552,"ліз":3946,"лій":2290,"лік":3856,"лід":25282,"лів":5148,"літ":32189,"ліс":3353,"лін":5016,"лії":2011,"мис":2927,"мир":2021,"но ":17011,"мно":2222,"мод":1683,"мог":2124,"мов":8204,"мож":2842,"мон":3152,"мол":2071,"мор":4169,"нс ":2855,"має":2263,"ляє":1872,"мац":1743,"мал":3075,"мад":2394,"ляд":1770,"мат":7712,"мас":1664,"ляр":1752,"мар":2087,"лян":1603,"ман":6730,"люч":1579,"маг":1762,"люд":2454,"лют":2538,"мет":6826,"мен":33354,"ни ":22900,"мер":6133,"меж":1784,"мі ":2838,"не ":13395,"льп":5070,"льн":50159,"льо":2192,"на ":73406,"льм":2015,"льк":3474,"льш":4684,"льт":4502,"льс":6409,"му ":17492,"лок":1590,"лог":8013,"лод":3460,"лор":1601,"лос":2987,"лот":1979,"лом":2920,"лон":2251,"лов":10255,"луж":1851,"ків":10057,"кій":3619,"кіл":3020,"кіп":21719,"кін":3986,"кіс":1867,"лив":4838,"лиз":2163,"лик":4841,"лі ":6988,"леж":2754,"ми ":23542,"лен":36965,"лем":3035,"лек":7457,"лиц":3452,"лиш":2141,"лис":4376,"лин":3727,"лип":2133,"пат":1536,"пад":4424,"пал":24506,"пан":5073,"пар":26316,"ре ":2016,"ра ":10978,"пис":6483,"пла":3785,"пле":2508,"пло":1933,"ро ":4356,"пед":22902,"ри ":9926,"пер":22098,"печ":1592,"ори":10852,"опі":2491,"орд":2209,"оре":6973,"орг":5783,"орс":3875,"оро":12661,"орм":6382,"орн":2519,"опу":1571,"ора":4398,"опе":4273,"опи":2385,"опо":5348,"опа":3672,"осі":28049,"оте":2368,"оти":4264,"ото":4402,"отр":2511,"ота":2076,"орі":12506,"оси":1994,"оск":2565,"осл":27847,"осн":4625,"осо":4627,"осп":1773,"ост":18846,"ору":3825,"орт":4236,"оря":2411,"оми":3494,"олі":11097,"оме":3420,"ома":8391,"оля":2505,"олю":1788,"оль":6830,"олу":1961,"по 
":3091,"оло":19189,"оле":3051,"оли":6076,"окі":1858,"ола":2363,"окр":3962,"оку":9695,"око":4015,"оні":30331,"онс":3562,"онт":3739,"ону":5669,"они":2616,"омі":28058,"оно":28802,"онн":3725,"она":36879,"онд":1886,"оне":2765,"омо":4256,"омп":5253,"ому":14955,"оча":2902,"очи":1547,"оці":5589,"офе":1770,"оце":2765,"офі":2407,"охо":3822,"оба":1591,"нят":1850,"ням":4193,"ова":23572,"обу":2657,"обр":3128,"обо":4431,"обн":1797,"обл":9987,"оби":2069,"ою ":40879,"ньо":6069,"па ":1808,"оки":2050,"оке":1657,"ока":3381,"ожн":3109,"озв":2698,"нів":6803,"ніз":4328,"ози":2146,"оза":1905,"озт":3392,"ніт":1771,"ніс":9671,"ніц":22879,"ніш":2881,"ніч":5481,"нік":1930,"ній":8198,"озм":1625,"нім":3177,"озн":3472,"озр":1991,"озп":1689,"нії":3375,"нія":2921,"одн":11088,"оди":15822,"одж":2752,"огі":6272,"оду":4615,"одо":6110,"ої ":40186,"пи ":6419,"оді":8902,"оже":2141,"обі":3439,"ове":4729,"овл":2741,"ови":27036,"ово":14293,"овн":9596,"овт":2617,"ову":5401,"овс":3796,"ога":1966,"ові":15249,"ого":77853,"огр":5122,"ода":5921,"оде":2118,"ної":16682,"ною":5373,"нос":8914,"ноп":1707,"ном":33988,"нок":2048,"нні":5141,"ног":41554,"нов":14623,"ння":64956,"нно":4474,"ор ":9067,"нни":8535,"нна":3011,"SEE":21595,"нко":3013,"он ":7906,"нку":1909,"нка":2370,"ом ":22352,"ним":10232,"нин":2727,"нич":2031,"них":49028,"ниц":6236,"нши":2462,"нці":27792,"нцу":22511,"нув":2948,"нті":3002,"нсь":25045,"нта":5969,"нте":3274,"нти":4476,"нту":1658,"нто":2721,"ох ":2654,"нтр":7071,"нст":25118,"сам":2553,"рям":1886,"сан":2886,"ряд":3770,"сво":2566,"свя":1631,"сві":5195,"сі ":3512,"сел":26979,"ти ":13731,"сен":1784,"сер":9261,"рів":9362,"рід":3016,"ріа":3973,"рій":3132,"різ":4766,"ріо":2440,"ріш":1652,"річ":3119,"рія":3137,"сис":3972,"сит":2195,"рії":5126,"син":2791,"сил":2927,"ска":1987,"сли":1774,"сла":3451,"ско":3258,"скл":4964,"слі":25511,"сля":1724,"слу":2674,"сло":3957,"то ":6783,"сни":4268,"сня":2333,"соб":4324,"сов":3112,"сні":1707,"сок":1638,"сно":9584,"тр ":4211,"сну":2111,"спе":2794,"сор":1743,"сон":2424,"соц":1647,"ту ":33552,"спі":4109,"спу":1543,"спо":6481,"спр":2669,"су ":3870,"роц":5357,"рот":5051,"роф":2524,"роп":4979,"рос":8219,"ст ":6153,"рпн":1909,"рсь":7310,"рта":23604,"рст":1652,"рти":2701,"рси":2327,"рух":1575,"рту":2197,"рті":2581,"рук":2570,"руг":2655,"руд":3764,"руп":2957,"рус":1928,"рхі":2138,"рхн":4291,"рши":1950,"сь ":1944,"та ":53895,"ся ":24676,"рад":5772,"раж":1608,"раз":3483,"рав":11695,"рам":5248,"ран":55851,"рай":6264,"рак":4056,"рал":6105,"рах":2738,"раф":3312,"рац":4988,"рас":1585,"рат":9434,"раї":18701,"рі ":5250,"рде":3344,"ргі":2098,"реб":1606,"рев":3520,"рег":26086,"ред":9949,"реа":2012,"рет":3016,"рес":6263,"си ":1816,"рен":12198,"рем":5175,"рел":2059,"рек":2766,"рез":6938,"рді":3024,"реж":3159,"ржа":4954,"реч":1677,"рец":2506,"рвн":2312,"рга":5371,"ргу":2442,"рим":4841,"рин":4051,"рик":6384,"рил":1738,"рий":1712,"рич":3064,"рит":7476,"рир":1816,"рис":11068,"рка":1676,"пів":7244,"під":9422,"риб":1872,"риг":1977,"рив":2681,"риз":3342,"піл":3701,"піс":2425,"рмі":3148,"рни":5921,"рне":1575,"рна":4926,"рок":13067,"рол":3802,"ром":8841,"рон":6202,"роз":14973,"рні":3617,"ров":13779,"рог":3625,"род":18027,"роб":8445,"рно":5153,"рко":1653,"рма":4259,"пра":9765,"при":17085,"пре":5034,"про":26547,"ру ":4801,"поп":1576,"пор":8542,"пос":5211,"пот":2142,"пох":2319,"поч":2731,"пош":1607,"рт ":1813,"под":5278,"пов":10451,"пня":3871,"пон":3944,"пом":2310,"пол":10620,"пок":1979,"поз":3035,"пуб":2176,"пус":1543,"пря":1882,"са 
":2954,"вар":4866,"ват":3699,"вач":1754,"ває":1673,"вав":2297,"ван":27620,"вал":6746,"важ":2723,"га ":2428,"бут":2028,"бул":3726,"бур":1929,"буд":3912,"був":3407,"́н ":1990,"вся":2929,"вто":3926,"втн":2139,"вст":3544,"всь":11822,"гу ":1846,"вро":2652,"вою":2829,"вої":4164,"вол":4005,"вні":6649,"вод":5394,"вог":4666,"вов":1600,"вня":4991,"вор":9030,"вос":3392,"вом":2439,"вон":1910,"вни":11918,"вне":1755,"вна":3546,"вно":8046,"влі":1867,"вля":2045,"вле":3353,"вла":2838,"го ":74789,"вка":1932,"вищ":4417,"вич":10270,"виз":2384,"вий":5063,"вил":1963,"вик":6641,"вин":5363,"вим":3289,"вип":2153,"вис":3214,"вир":3300,"вит":2461,"вих":6912,"вив":1786,"виг":1941,"вид":7090,"біл":7304,"вец":2207,"вер":12033,"ги ":1955,"вел":4069,"вед":23850,"ві ":8320,"вде":5262,"ва ":20184,"ают":4986,"баг":2186,"аці":38441,"ашо":3325,"аук":3811,"аті":2815,"афт":2014,"ахо":3481,"афі":2408,"ача":3308,"аче":5139,"ахі":3064,"апр":2675,"апа":1903,"апо":2274,"апи":1583,"арх":2803,"арс":2742,"арт":27902,"аре":1829,"ард":4062,"ара":11795,"арн":4172,"аро":9064,"ари":4536,"арк":3550,"аст":19420,"ата":3063,"аси":3444,"арі":4603,"асе":23473,"асл":1823,"асо":2485,"асн":7019,"ату":6561,"ать":1702,"ате":4441,"асі":1565,"ати":34490,"атк":2377,"атн":2400,"́ль":1996,"ато":8833,"атр":2250,"бол":1926,"бор":3014,"бни":2582,"бро":2133,"ву ":2725,"бра":3585,"блі":3564,"бла":6820,"бли":3503,"бле":1613,"во ":6738,"ви ":5236,"аєт":5513,"бер":8329,"без":3385,"аїн":17837,"ве ":2027,"дає":2865,"дач":1562,"дан":4800,"дар":3127,"дат":2935,"дал":2067,"дав":3568,"дем":2009,"ден":35742,"дер":7440,"деп":22255,"дже":27277,"ей ":5110,"дво":2066,"ез ":2842,"ді ":6167,"дсь":3872,"дст":2708,"дрі":2262,"дро":2499,"дру":2324,"дра":1728,"дпо":2072,"ет ":25079,"дко":1659,"ен ":10176,"ем ":4029,"див":2382,"гіч":3315,"дин":11940,"дит":3892,"гії":2091,"о́в":1799,"гіо":23749,"гір":2225,"гід":1618,"дня":2562,"доб":2151,"дні":5130,"дов":8472,"дос":25281,"дор":2173,"док":2841,"дон":2228,"дом":6507,"дна":4835,"дмі":2603,"дни":7067,"дне":1547,"дно":10257,"ер ":7534,"для":8246,"да ":6361,"газ":2730,"гал":4240,"гат":2475,"вят":1631,"ган":6922,"де ":6060,"вул":1831,"вує":1869,"вча":1606,"вче":1615,"гол":4270,"гос":1776,"гор":5348,"гов":2480,"год":1736,"гру":5182,"ду ":6755,"гро":3158,"гра":8609,"гре":2241,"гун":2442,"ген":3541,"ди ":6794,"гео":1703,"вів":2194,"вік":22506,"віл":2297,"вій":5760,"від":28365,"віт":10410,"вір":2465,"віс":2374,"він":3115,"гля":1806,"до ":11544,"жав":4567,"за ":31382,"жит":2833,"жив":2319,"жин":1953,"жі ":1618,"жен":31144,"зу ":2376,"жно":2243,"жни":2399,"жна":2353,"жні":1682,"жов":2516,"ежа":1636,"ежн":1674,"ежи":2240,"едс":1966,"еї ":5804,"еді":22755,"дія":9605,"діє":22354,"дії":2193,"діл":4733,"езн":2478,"езп":2124,"дів":4952,"дій":3323,"ева":2033,"еви":2106,"еат":1536,"дян":2535,"еда":2858,"еде":24252,"еди":1957,"егі":24559,"едо":1597,"едн":2110,"евн":1571,"же 
":2405,"ево":2649,"еві":2180,"ент":38060,"енс":2476,"енц":1664,"енк":2033,"ени":9106,"ено":4866,"енн":55011,"ена":6246,"емі":4010,"ене":8942,"енд":1893,"еор":1718,"ені":27559,"ень":28255,"епа":22831,"ерш":4377,"ерх":5234,"ерп":2312,"ерс":4919,"ерт":3088,"ерм":2838,"ерн":8048,"еро":3883,"ери":8614,"ерк":2529,"ерд":2064,"ерг":2332,"ерж":5295,"ере":27737,"ера":11283,"ерв":4795,"ерб":1797,"ейс":2515,"еко":24801,"ект":7509,"екс":5740,"ели":5579,"ело":1860,"еле":29499,"ела":1663,"емл":1627,"емо":2244,"емн":1709,"еми":2190,"елі":3500,"ель":6771,"еме":2758,"ема":4374,"ехн":2533,"ець":7827,"еці":2099,"ері":9985,"есн":2797,"есп":2017,"есо":1640,"ест":4336,"ета":3510,"есі":2297,"ети":3023,"ете":1577,"етр":4098,"ето":3229,"ету":1803,"ива":5433,"иді":1970,"иго":1963,"ида":2853,"ивс":1559,"иво":3293,"ивн":5798,"иві":2780,"икл":3021,"ико":10700,"ики":26994,"ика":9412,"изь":1617,"изн":4148,"ині":5661,"имі":1780,"ини":11405,"инн":2229,"ино":4309,"инс":2389,"ину":2117,"ина":9917,"ими":8406,"илі":2149,"имо":2293,"има":1943,"иль":2368,"икі":3590,"или":2166,"ило":1638,"ила":2574,"исе":1748,"иса":1892,"ист":43720,"исо":2640,"исл":3795,"иск":1974,"ити":3061,"ите":2660,"ита":3840,"ися":2391,"ись":3153,"иту":23904,"итт":1716,"ито":5821,"ипа":23599,"ипн":2085,"ире":1798,"иро":6079,"ихо":1721,"ицт":1948,"иць":3344,"ить":5744,"ище":2694,"иці":4564,"ичи":1676,"ичн":15682,"иця":3075,"ича":2349,"ка ":31484,"ив ":2824,"зав":2331,"заб":1710,"заг":2313,"ид ":1647,"зви":3864,"зва":3561,"зац":3958,"зах":3661,"зас":5087,"зап":2437,"зан":2419,"зал":3333,"зак":3350,"ий ":76977,"зер":2249,"зем":2575,"зді":2313,"зі ":4174,"зик":1993,"ик ":11149,"ин ":8446,"им ":13202,"зич":2000,"зна":12042,"зни":3492,"змі":3801,"зно":1802,"зня":3181,"зов":3243,"зон":2606,"зпе":1731,"зпо":1609,"зро":1734,"зта":3312,"их ":71895,"ич ":8041,"зьк":25702,"ьме":1976,"ьна":3031,"ьни":9629,"ьно":31998,"ька":11941,"ьке":2474,"ьки":32360,"ько":60206,"ькі":5969,"ьту":1942,"ься":17960,"ьсь":5340,"ьог":3861,"ьов":1687,"ьні":3450,"ьпи":4442,"як ":5568,"ям ":4621,"ює ":1572,"юва":4241,"юр ":1687,"er ":1536,"яка":3149,"яки":6457,"яко":4501,"які":4711,"яч ":2564,"юто":2149,"ях ":2449,"ють":10792,"ючи":2094,"янс":3724,"ями":2547,"ять":2425,"яти":2263,"уча":3150,"уєт":3972,"фес":1886,"фер":1827,"уют":3197,"фун":1616,"фра":23347,"фор":6458,"фік":1922,"філ":3022,"фіз":1799,"це ":3714,"хан":1700,"хар":2307,"хні":3188,"хов":2787,"ход":8360,"хня":2286,"сто":20885,"стр":14612,"ств":9052,"сте":10268,"сти":58155,"ста":42984,"сті":17422,"стя":1612,"стю":2000,"сть":9349,"сту":4755,"сце":2414,"ть ":23970,"тю ":2076,"ськ":76644,"тя ":4033,"сьм":2215,"сюр":1594,"ув ":2334,"тав":4291,"так":7239,"тал":7016,"там":24076,"тан":13432,"тат":26660,"тах":1715,"тар":4879,"таш":3389,"тво":10485,"тва":6033,"тех":2451,"тец":1853,"тем":5996,"тел":3217,"тен":2843,"тер":16300,"теп":1791,"тет":24922,"тек":2381,"тей":1760,"ті ":19387,"січ":2428,"тив":7452,"сій":4342,"ук ":2255,"сів":2280,"сіб":22500,"тка":1831,"тич":9159,"сії":1716,"тий":2070,"тин":8122,"тик":26054,"тил":2041,"тир":1541,"тис":25636,"тип":1715,"тит":24505,"тку":2518,"тко":2210,"тла":1548,"ур ":1549,"тно":2899,"тні":2826,"тод":1678,"ток":3298,"тол":4612,"тов":10097,"тог":4926,"тня":4250,"тни":4639,"тре":1761,"тра":13299,"три":6247,"тор":19453,"тос":2194,"том":6059,"тон":3478,"топ":3418,"тою":1721,"тсь":3003,"тро":10751,"тру":4586,"трі":6027,"тув":1637,"туп":2434,"тур":8289,"ття":4038,"тут":22979,"тьс":17965,"ує 
":4149,"ува":15273,"уго":1668,"уар":5114,"уат":1738,"убл":2373,"узь":23065,"узе":1844,"тій":2025,"узи":2433,"тіл":1694,"тів":6962,"уді":1635,"удо":3223,"удн":2817,"уме":1763,"уль":5593,"уля":1849,"ули":2646,"уло":1791,"ула":2199,"укт":2271,"укр":7476,"уко":2812,"упн":1546,"ура":2553,"ург":3415,"ури":2733,"упа":1808,"унк":2435,"уні":25084,"умо":1836,"унд":2657,"уту":22317,"уст":2645,"утв":1947,"урн":4525,"уро":1539,"що ":17655,"шов":4224,"ших":3225,"шир":2574,"ший":2090,"ще ":2338,"шен":2816,"ші ":1719,"шта":1833,"щин":2002,"щен":2787,"цен":4287,"чи ":3745,"цев":1886,"цес":2327,"цер":1781,"ці ":10080,"хід":5700,"цип":23018,"ць ":4743,"ця ":4768,"ча ":2185,"цуз":22494,"цтв":3628,"ців":2368,"ціа":3468,"ціо":25893,"цій":6921,"ція":7173,"чен":10697,"чер":4797,"чі ":2791,"чле":2375,"чка":1597,"чин":4805,"ціє":1783,"ції":31741,"чис":2289,"цьк":8704,"ша ":1587,"ше ":3944,"чає":2804,"час":12463,"чат":2049,"чай":1572,"ща ":1848,"чні":4741,"чна":4519,"чня":2280,"чни":34595,"чно":11306,"us ":1758,"ію ":3147,"їв ":1974,"іал":5781,"іан":1803,"ія ":24064,"іде":1874,"ідж":22706,"ідк":2690,"ідн":11513,"ії ":54256,"ідр":2425,"ідп":3775,"ідо":5503,"івд":5855,"іве":2535,"ібн":1843,"івс":5463,"івн":8627,"ійн":9751,"ійс":12117,"іжн":2207,"ізо":1615,"ізн":5205,"ізм":1906,"ізи":1715,"іза":4100,"інд":1760,"іне":2360,"іна":3489,"інн":4758,"іно":3712,"імі":1726,"інф":1596,"інц":2809,"інс":24263,"інт":1810,"іль":17941,"іме":3447,"іля":2402,"ілі":1831,"імп":2489,"іле":1779,"ікі":22043,"іло":3466,"іка":6429,"іки":2076,"іко":1702,"ісц":2429,"іст":27606,"ісл":1997,"існ":3890,"іре":3719,"ірн":2678,"іпе":22100,"іні":5468,"іод":1626,"інш":3831,"інь":1631,"іон":51386,"ітн":4596,"іто":2264,"їх ":2010,"ітт":1981,"іту":1943,"іта":3402,"іте":26865,"іти":4695,"ішн":1712,"іше":2070,"ічн":39482,"іці":2111,"іци":22860,"іб ":22595,"ів ":41252,"ід ":15090,"із ":3976,"іж ":3000,"ій ":20369,"ік ":2836,"ім ":3593,"ін ":5333,"ір ":2760,"іх ":2419,"єю ":25174,"єдн":2196,"єкт":1631,"єть":11566,"її ":1912,"ією":24943,"іяч":2557,"ївс":1781,"їнс":8303,"їни":6169},"n_words":[15331232,17151725,12469252],"name":"uk"}
\ No newline at end of file
+++ /dev/null
-{"freq":{"ٹ":7565,"پ":19909,"ٰ":348,"و":91887,"ي":164757,"ً":561,"َ":343,"ُ":600,"ـ":316,"ف":15396,"ق":19382,"ك":314,"ل":61015,"م":85213,"ن":73500,"ه":210,"ّ":218,"ِ":1208,"خ":10520,"د":42999,"ج":29799,"ح":15399,"ت":63424,"ث":2772,"ب":46160,"ئ":12622,"ا":210246,"ؤ":741,"آ":6835,"ء":3457,"غ":3607,"ع":26594,"ظ":4173,"ط":9560,"ض":4853,"ص":11646,"ش":17043,"س":59068,"ز":13283,"ر":100269,"ذ":2393,"،":8584,"؛":310,"ہ":81948,"ۃ":194,"ۂ":442,"ے":72723,"ۓ":1226,"۔":21285,"گ":13942,"ک":96247,"ھ":18217,"ں":33709,"چ":6431,"ڈ":3486,"ڑ":2986,"ژ":212," ،":2410," ۔":1318," ہ":33487," ھ":395," گ":5518," ک":67994," ن":12353," ل":8072," م":43729," ق":5408," ك":206," ف":4429," ي":9396," و":11189," ص":3954," ش":8375," ط":3503," ض":1387," ر":7523," ذ":1057," س":22509," ز":3628," ع":9286," ظ":270," غ":946," ا":57235," ء":2548," آ":6281," ج":19940," ح":6228," خ":5235," د":11120," ب":20334," ت":17083," ث":374," چ":3475," ڈ":1133," ٹ":932," پ":14925,"کا ":9732,"کت ":188," ، ":2340," ء ":2420," و ":1367," بھ":3806," جن":2286," حا":1179," جل":205," بہ":901," جم":727," جي":597," جو":3440," جد":255," جس":2699," جز":358," جر":340," اے":201," اہ":629," جا":6670," جب":836," تي":855," اک":704," بڑ":974," اگ":491," خل":943," تہ":199," تھ":4319," خي":280," خو":966," دا":1316," خص":169," خر":184," خد":283," خط":402," حق":234," حي":633," تک":1112," حم":243," خا":1564," حر":509," حس":437," حد":214," حض":717," حص":647," بے":210," بن":2795," بل":1071," بغ":190," بع":1385," اُ":270," بر":1803," اي":8027," او":11360," بد":254," بح":524," اق":467," اف":961," ال":4248," با":2739," ان":6816," ام":1333," اط":314," اع":876," اد":879," ار":1330," از":241," اس":10716," اش":424," اص":842," اض":219," اب":1381," ات":313," اث":175," اج":297," اح":507," اخ":805," تو":1190," بچ":211," تن":378," تم":482," تق":681," تف":194," تع":1588," تش":261," تص":444," تر":1553," تخ":375," تج":367," تح":884," تا":1192," تب":411," اپ":1467," اٹ":312," بو":485," بي":1555," آپ":775," آل":254," آن":292," آي":279," آخ":168," آر":246," آت":368," آج":212," آئ":402," آب":1319," آف":180," آس":249," آز":331," عہ":219," سے":9513," شک":419," طو":1200," ظا":186," عظ":314," عر":1083," عد":298," عث":218," عا":1056," عب":533," شہ":1939," عي":293," عل":2998," عم":1153," عن":249," عو":213," غي":270," سع":194," سط":261," دھ":223," سف":224," سي":1366," سو":980," سم":787," دہ":248," شا":2109," سن":832," سل":1069," شر":734," شخ":260," شع":372," شي":299," شم":1497," صا":255," صر":324," رک":1032," صح":380," صد":878," صل":252," صف":254," صو":1376," رہ":1328," ضر":190," ضل":1004," طا":301," طب":638," سک":803," طر":1051," دس":404," در":2025," دي":2436," دو":2534," دن":789," دل":217," ذر":344," جگ":269," حک":746," ذي":205," جہ":593," را":903," جھ":293," رس":334," ري":940," زر":261," رق":405," رو":1485," زب":1015," زا":201," رن":190," زي":935," سر":1466," زم":612," سب":960," سا":2953," زن":311," ست":266,"گھر":200,"گہ ":210," ہے":16945,"گوں":240,"گيا":1207," ٹا":182," پڑ":329," پن":536," پو":662," پي":2142," پا":3033," پت":168," پر":5135," پش":170," پس":187," ٹي":317," لف":878,"ں، ":508," لغ":168," لح":265," لئ":237," لا":1034," مل":1468," مق":1520," مف":313," مغ":611," مع":1708," مط":921," مض":238," مص":513," مس":1572," مش":1902," مر":2119," مز":309," مد":642," مذ":261," مح":1355," مخ":914," لي":2681," مث":406," مج":498," لو":614," مت":1246," ما":1837," نف":215," نق":288," نم":568," نظ":923," نس":373," نش":178," مم":601," من":1456," نا":2714," نب":179," مو":1833," مي":17546," نج":183," وا":3957," لڑ":168," نو":730," ني":646," فض":168," فر":1011," فا":763," فت":168," قس":291," قص":318," 
فل":520," قا":1019," فن":193," فو":398," قب":691," في":493," قد":579," قر":855," فٹ":171," قل":240," قو":565," قي":355," وہ":1582," يہ":4254," لک":550," وج":661," لگ":411," وس":430," وز":353," ور":376," وغ":293," ول":235," وق":563," وف":272," مک":558," وي":428," لے":206," مگ":362," مہ":365," يا":2941," نک":275," نگ":248," يع":665," نہ":1220," يو":1100," نے":2696," گھ":362," ہو":7614," ہم":409," ہن":745," ہي":5912," ہز":243," ہر":384," ہا":430," ہج":240," کت":582," کر":5496," کس":1214," کش":187," کل":1235," کن":341," کم":719," کو":6421," کي":15836," کا":11009," کئ":348," کہ":5408," کھ":713," گا":490," گئ":863," کے":17634," کچ":321," گن":214," گل":202," گي":1388," گو":519," گر":807," ۔ ":1018," پہ":1353," پھ":659," چت":240," چا":498," چل":252," چي":492," چو":467," چن":303," ڈا":359," ڈي":297," چھ":653," چک":241,"کٹر":175,"کيے":174,"کي۔":282,"کلا":210,"کلو":610,"کلي":168,"کيا":2343,"کيو":230,"کيم":299,"کين":186,"کيل":348,"کمي":186,"کمل":193,"کنا":202,"کوئ":495,"کور":176,"کوم":779,"کرا":507,"کتے":245,"کسي":1209,"کست":1507,"کزي":242,"کري":262,"کرت":1247,"کرد":421,"کرن":1152,"کرک":265,"کار":1107,"کائ":219,"کئي":281,"کان":309,"کام":558,"کال":425,"کتي":199,"کثر":209,"کتا":937,"کم ":404,"کل ":646,"کن ":715,"کي ":12997,"کو ":5510,"کر ":2045,"کز ":207,"کس ":278,"گور":172,"گري":1479,"گرو":225,"گرد":220,"گرا":170,"گاہ":224,"گئے":444,"گار":222,"گئي":576,"گي ":628,"کہل":354,"کہت":572,"کہا":1480,"کھا":567,"کھت":425,"کھو":307,"کھن":402,"کھي":483,"گر ":1024,"گا ":209,"کے ":18223,"کچھ":310,"کھ ":350,"کہ ":4209,"ا، ":388,"ھا۔":1135,"ھائ":250,"ں۔ ":2401,"ھان":195,"ھار":432,"عہد":221,"ھتے":283,"پان":548,"پار":326,"پاس":169,"پائ":259,"ہ، ":321,"پي ":246,"ئي ":3847,"ات ":5026,"اح ":512,"اج ":434,"ئم ":385,"پرو":236,"پري":221,"ئل ":289,"اب ":1615,"اء ":706,"اؤ ":172,"پاک":1528,"پنج":401,"پنا":177,"ئع ":214,"پور":484,"پني":552,"ئش ":369,"ھي ":3674,"ئر ":279,"بت ":362,"فر ":258,"ان ":9264,"با ":254,"فظ ":915,"ہم ":759,"اً ":534,"ہو ":977,"پنے":725,"اف ":395,"پيش":361,"پيد":1076,"ام ":5824,"ال ":3321,"ہي ":1279,"اق ":313,"اظ ":454,"اع ":255,"ہائ":399,"ہات":210,"ہار":307,"ار ":4602,"اخ ":174,"فت ":347,"اد ":2459,"ہاں":875,"اص ":271,"از ":1002,"اس ":6443,"ہان":212,"ارے":550,"اطا":177,"ھوا":216,"اسک":1125,"ت، ":322,"بي ":1989,"اضي":270,"بو ":188,"اصط":318,"اصل":1371,"ارہ":558,"اسے":795,"اعت":501,"اعد":311,"اعر":317,"ھوٹ":341,"ھيل":511,"اطي":187,"ھنے":355,"اعظ":290,"ھوں":204,"اعل":259,"قع ":1257,"فار":420,"ھيں":343,"فات":329,"فاظ":273,"ادا":654,"اخل":199,"اتھ":922,"ھي۔":577,"احي":174,"اخت":714,"احم":281,"احت":209,"ئي۔":457,"ہا ":1654,"ارا":821,"ادي":1677,"ئيں":363,"ادل":266,"ادب":182,"ادت":200,"بق ":617,"ادر":214,"اتے":523,"ازي":292,"است":1978,"اري":1870,"ارن":318,"ارو":652,"ہت ":499,"ارف":181,"ارس":382,"بل ":520,"ارد":993,"ارت":911,"ارک":342,"اشي":236,"اسم":259,"ادہ":769,"اسل":804,"اشا":172,"اسي":947,"ارٹ":196,"بن ":873,"بع ":355,"ائد":205,"ائر":395,"ائش":451,"ائع":219,"ائن":582,"ائم":404,"ائل":355,"ائي":3099,"ئنس":292,"ابت":325,"ابر":254,"اؤں":202,"ہد ":227,"ابي":488,"ابل":415,"ابق":678,"ابو":358,"ابن":262,"اتح":175,"اتا":3152,"ہر ":2189,"اثر":224,"ؤں ":253,"اتي":1903,"قت ":631,"ائے":1094,"اجا":187,"بر ":849,"في ":604,"عظي":276,"عظم":356,"پڑھ":180,"عري":274,"عرو":255,"عرب":766,"عرا":186,"عدا":424,"عدد":310,"عثم":220,"ظيم":431,"عبد":345,"عات":242,"شہو":678,"شہر":1734,"عال":536,"عام":681,"عاش":251,"عار":177,"صے ":201,"پہن":208,"پہل":895,"غرب":546,"طہ 
":206,"عيا":204,"عيس":213,"پھر":309,"پھي":228,"عمل":531,"عمو":341,"عمي":283,"عمر":256,"غان":180,"عني":881,"غاز":188,"علا":1570,"علق":602,"علي":1229,"علو":368,"عما":1154,"علم":738,"آتا":178,"آئي":238,"آبا":1188,"آخر":190,"آزا":306,"ھر ":618,"آن ":225,"ھا ":1233,"غير":796,"آيا":202,"عے ":290,"آپ ":665,"عہ ":760,"جسم":317,"جزي":258,"خت ":294,"جرا":223,"جري":262,"شعب":197,"جزا":198,"جرم":182,"بکہ":367,"ثيت":188,"ذہب":185,"شما":1408,"جبک":363,"اہم":638,"جائ":411,"اہل":181,"جات":4030,"اہو":304,"جاب":370,"جاس":338,"اہي":375,"جار":388,"جان":1341,"جام":298,"رے ":1358,"بہت":604,"رکز":477,"بہا":219,"صبہ":185,"جما":288,"رکا":241,"صحا":176,"رکي":303,"صدر":477,"حال":384,"جنو":753,"ہے۔":9056,"صدي":342,"حاد":180,"حاص":632,"حاظ":206,"جمع":225,"جمو":257,"شيا":452,"خط ":169,"بھي":3076,"بھا":585,"ضي ":277,"صطل":323,"جسے":332,"رکھ":862,"صرف":317,"جسک":274,"زہ ":268,"طب ":280,"تک ":956,"خي ":172,"حرک":215,"حضر":635,"حصي":251,"دت ":251,"حدہ":191,"سٹي":260,"حسا":181,"دا ":1143,"حسن":171,"سٹر":203,"حرا":239,"حري":437,"طح ":229,"تہ ":472,"تھ ":814,"صوص":325,"ہے،":923,"صول":176,"حدي":195,"صور":542,"صوب":1118,"رہي":215,"رہن":216,"جمہ":270,"رہا":394,"جنگ":528,"جيس":335,"رہت":193,"صلا":189,"جود":775,"ضرت":619,"ضرو":182,"خان":852,"خاص":306,"خار":176,"حمد":801,"حقي":267,"تے ":3295,"رہے":389,"صيل":310,"دس ":176,"در ":1119,"دد ":492,"حصہ":359,"دن ":286,"ٹا ":239,"دو ":1561,"ظر ":265,"خصو":323,"ر، ":449,"دي ":2904,"حير":216,"ضلع":1009,"حيا":329,"ختل":505,"حيث":191,"ختي":341,"سے ":11230,"طان":608,"طال":493,"طاب":536,"طبي":334,"دل ":339,"دم ":215,"خلي":574,"سکت":863,"تہا":216,"سکا":323,"سکي":300,"خلا":443,"سکو":402,"دان":840,"دال":423,"تھے":1383,"دائ":580,"دار":1737,"داز":181,"داد":564,"طرح":391,"طرز":173,"طرف":300,"خوا":291,"خود":282,"طري":194,"ظم ":393,"شہ ":199,"سکے":358,"خيا":180,"عت ":511,"تھي":1189,"عد ":1126,"تھا":1925,"رج ":392,"ٹي ":724,"رت ":2032,"رد ":384,"طلب":258,"طلا":420,"عض ":255,"رح ":408,"طور":1195,"ظام":601,"طنت":356,"شکل":258,"را ":761,"دست":238,"رب ":643,"ظاہ":223,"درس":185,"درج":475,"درا":374,"دري":711,"جہ ":949,"درم":359,"درو":190,"دون":230,"ظري":241,"صہ ":469,"دور":724,"دوس":950,"دني":624,"دوں":249,"رف ":993,"ديو":334,"ديم":368,"دين":825,"ديل":228,"ديا":819,"ديت":278,"ديد":212,"ٹر ":1167,"رس ":290,"عي ":320,"دما":182,"جے ":273,"اقو":437,"اقي":177,"اقت":219,"اقا":176,"افي":307,"اقع":1323,"الق":182,"الف":464,"الل":695,"الي":1802,"ان،":219,"امت":248,"اما":469,"الن":229,"الم":986,"الو":429,"امر":725,"الج":351,"الت":351,"الب":611,"الا":1303,"الس":179,"الر":180,"الد":643,"الح":530,"الع":407,"ري ":3579,"جگہ":221,"افر":440,"افت":417,"ٹري":255,"انے":1223,"ايم":221,"انہ":1162,"ايو":177,"ايس":1069,"انگ":1569,"ايش":229,"امہ":269,"ايا":902,"ايت":316,"انک":290,"اير":272,"، ":8315,"بحي":221,"اوہ":305,"رو ":291,"بحر":243,"اقے":283,"ديگ":275,"اني":2788,"انو":1426,"ديک":357,"رق ":406,"باً":193,"اقہ":415,"بان":1448,"بال":507,"اند":1077,"باد":1658,"باز":180,"بار":1042,"انس":905,"امن":202,"انا":720,"انب":309,"امو":288,"انت":736,"بات":454,"امي":1316,"انج":263,"امل":979,"بائ":435,"الے":826,"بتد":219,"اوي":243,"انڈ":193,"انچ":276,"الہ":394,"اون":198,"اول":497,"اوق":178,"اور":10590,"الک":535,"رم ":316,"ذري":244,"اوا":211,"تا ":6076,"حکو":764,"حکم":277,"رحد":298,"ردو":947,"ردي":295,"ردا":370,"رتے":586,"رتا":498,"ايک":5551,"بدا":306,"راچ":302,"ربع":318,"بدي":235,"برا":685,"ربي":855,"ذيل":186,"راہ":513,"برط":276,"برق":294,"بري":212,"رتي":676,"راک":232,"جہا":530,"تر 
":480,"ران":1556,"ربا":206,"راع":230,"راف":173,"رام":330,"رال":643,"راب":283,"رائ":562,"راج":355,"رات":605,"راث":218,"رار":318,"راد":611,"راص":190,"راس":245,"ہيں":6226,"بعد":1053,"تح ":170,"ہو۔":237,"جھي":195,"بعض":255,"رآن":223,"بني":578,"بنا":831,"ہوگ":285,"ہوں":722,"بند":385,"ا ":36261,"بلن":294,"ہنچ":177,"بلو":291,"بلي":191,"ہلے":442,"سر ":306,"بلا":185,"ب ":6862,"ء ":3203,"ہوئ":1384,"ہور":1098,"ہوت":2264,"ہوا":926,"ہون":793,"ہلي":355,"ہند":711,"ہمي":171,"ؤ ":175,"زي ":1743,"رطا":394,"ح ":1752,"ھے ":736,"ہلا":565,"خ ":788,"رسٹ":177,"د ":10666,"بوں":222,"بيٹ":246,"ست ":854,"ذ ":232,"بين":440,"بيل":268,"سا ":510,"رست":341,"بيع":188,"ت ":16193,"بيا":321,"بير":253,"رسي":336,"سب ":865,"رجہ":199,"ث ":408,"بول":436,"ج ":1790,"بلک":176,"سم ":677,"دہ ":2099,"تي ":4132,"سل ":210,"اک ":371,"ريہ":436,"تو ":819,"ٹلي":171,"ريک":876,"دھ ":232,"روں":847,"رپ ":187,"ريع":323,"رين":690,"ريل":294,"ريف":299,"ريق":376,"رنے":958,"تم ":196,"ريا":1581,"ريب":696,"ريخ":486,"ريت":171,"ريز":1341,"ريش":173,"رنگ":273,"روف":378,"رون":314,"روم":315,"روع":324,"رور":309,"روز":221,"روس":205,"روا":481,"اپن":1352,"ہزا":285,"ہري":293,"زبا":959,"زار":465,"رند":247,"زاد":421,"ہرا":229,"رمي":582,"رمن":177,"رنا":383,"زائ":318,"رما":229,"اٹل":178,"ہتے":651,"ثر ":284,"؛ ":306,"سط ":176,"رقي":620,"رقب":303,"ہجر":225,"ئے ":2320,"تبد":215,"تان":2396,"تبا":381,"تار":774,"تاب":521,"اچي":318,"دے ":517,"سن ":294,"ہے ":7219,"اں ":1697,"سو ":217,"تحر":360,"شت ":170,"تحا":210,"تحص":253,"سي ":3401,"تدا":329,"اہ ":1042,"تري":474,"ترا":469,"جا ":352,"تصا":242,"جب ":430,"تصو":215,"ترک":411,"اڑي":213,"ستہ":193,"تظا":196,"سري":335,"تعا":188,"سرح":298,"سرا":421,"تعم":1291,"تعل":965,"تعد":362,"دگي":389,"جد ":384,"شن ":295,"زما":353,"سام":409,"سال":748,"سان":757,"زند":314,"جس ":1740,"سائ":617,"زمي":343,"سات":868,"ساب":328,"ستا":2494,"تقا":197,"سجد":245,"تقر":347,"تقس":240,"ستع":1050,"زيا":694,"ستي":192,"ستو":208,"زيد":178,"اے ":199,"زير":703,"تمل":297,"ا۔ ":2384,"صر ":373,"تلف":484,"تما":488,"اۓ ":304,"سلا":945,"تين":302,"سمج":250,"سلي":236,"سمب":188,"سلم":718,"سما":225,"تيس":201,"تيا":780,"سلط":553,"سلس":275,"توا":248,"شي ":269,"سطح":252,"تيں":193,"پر ":4411,"اکي":196,"اکا":303,"اکس":1424,"سرے":373,"توں":404,"صد ":229,"ٹے ":294,"اگر":397,"جن ":581,"بڑا":364,"رک ":436,"بہ ":1879,"اکھ":210,"رہ ":1976,"جي ":406,"شرق":569,"شرو":345,"شري":335,"صل ":1226,"جو ":2871,"بڑي":256,"حت ":262,"ئے۔":600,"بڑے":246,"حد ":400,"ھے۔":835,"سوي":178,"شتر":182,"سوا":232,"سور":248,"شاع":346,"شام":808,"سمن":228,"شائ":191,"سند":323,"شاخ":186,"سمي":191,"ثلا":193,"شخص":263,"ثما":224,"ذکر":233,"اہر":591,"اہد":216,"سين":345,"سيم":308,"شاہ":679,"شتم":303,"سيد":218,"بے ":441,"سيا":755,"ڑا ":500,"ٹھ":533,"پت":249,"پا":3394,"پس":366,"پش":179,"پر":5467,"ٹے":304,"گ ":1702,"ٹک":243,"پڑ":366,"پو":978,"پن":2069,"پي":2662,"پل":260,"ٹي":1294,"ٹو":369,"ٹن":221,"ٹل":297,"ٹر":1769,"ٹا":643,"ک ":9401,"لے":2164,"يں،":396,"مگ":395,"مک":804,"نڈ":840,"وي":2835,"ي،":686,"نچ":605,"وو":294,"يب":1474,"يا":18357,"مہ":1477,"يئ":237,"يع":1465,"يز":2169,"يس":3006,"نگ":3438,"يش":1424,"يص":226,"يخ":630,"يد":2958,"ير":4679,"يت":2515,"يث":342,"يج":682,"نک":1176,"يح":396,"ين":5944,"يو":3744,"وچ":363,"نھ":192,"يق":966,"يم":3458,"نہ":3293,"يل":3575,"يف":992,"وپ":411,"وٹ":898,"وڑ":349,"نے":8036,"وڈ":316,"يٹ":1722,"وک":763,"پ 
":1225,"يٰ":198,"وگ":857,"وہ":2577,"يچ":216,"يڈ":589,"وں":7039,"يپ":229,"و۔":276,"يک":8185,"يگ":581,"يہ":6841,"يں":24637,"ي۔":1652,"يۓ":606,"يے":1374,"فع":234,"فض":206,"فظ":975,"فر":2018,"فس":304,"فت":956,"فا":1934,"يہا":454,"قع":1462,"قط":251,"قص":525,"قس":674,"قر":1456,"قد":958,"قت":918,"في":1664,"قب":1358,"فو":716,"قا":2876,"فن":259,"فل":626,"فق":170,"فٹ":222,"قي":2110,"ل،":228,"قل":641,"قو":1321,"لق":994,"لف":1951,"لط":721,"يں۔":3302,"لغ":238,"لع":1546,"لد":859,"لج":430,"لح":953,"لز":212,"لس":945,"لر":265,"لئ":340,"لا":9180,"لت":997,"لب":1083,"مع":2282,"مغ":639,"مص":529,"مض":257,"مط":1001,"مف":327,"مق":1607,"مل":3799,"مت":2835,"لو":3381,"مج":828,"لي":9899,"ن،":378,"مث":441,"لم":3135,"لل":728,"مب":937,"لن":899,"ما":9656,"مز":517,"مر":3546,"مش":2018,"مس":1772,"مخ":947,"مح":1397,"مذ":278,"مد":1699,"نظ":1248,"نع":313,"نل":169,"قہ":970,"نم":856,"نق":463,"نف":515,"ے، ":1325,"نج":1093,"مي":23019,"نت":1838,"مو":3519,"نب":733,"نا":7139,"من":2588,"فہ":638,"مم":631,"نص":409,"نش":418,"نس":2278,"نز":337,"نر":275,"ند":4576,"قے":375,"مپ":203,"وئ":2114,"وا":9079,"نن":295,"نو":3979,"ني":8247,"و،":187,"ٹ ":1461,"لڑ":176,"وغ":354,"نٹ":588,"وع":906,"وق":1147,"وف":901,"ون":4062,"ول":2928,"وم":3387,"لہ":2193,"وت":3089,"وب":2645,"ود":2057,"لک":2192,"وح":206,"وج":2206,"لگ":620,"وس":2774,"وز":895,"ور":18253,"وط":241,"وض":250,"وص":445,"وش":613,"يکھ":326,"ڑ ":321,"يکن":476,"يکي":320,"يگر":286,"چ ":481,"يکہ":301,"ڈ ":843,"خو":1180,"دت":297,"دا":6455,"دب":338,"خي":646,"خل":1333,"خم":176,"تہ":894,"تھ":5631,"خط":526,"خر":474,"خد":295,"خص":603,"دو":4578,"ر،":467,"دي":7480,"دف":241,"دل":705,"دم":634,"ذا":327,"دن":1041,"تے":3362,"دع":168,"دد":552,"در":3958,"دش":266,"دس":695,"جي":1296,"جو":4658,"حت":458,"جن":2503,"حا":2227,"حب":298,"جل":349,"بہ":2825,"جم":1355,"بھ":4084,"ا۔":3031,"اۓ":335,"اے":213,"جس":2866,"جز":533,"جر":1045,"جد":723,"بک":601,"خت":1742,"حي":1318,"تک":1171,"حم":1392,"خا":1967,"خب":324,"حو":294,"حق":534,"حل":497,"حض":792,"حص":974,"بے":458,"حر":1330,"حس":621,"حد":976,"تف":294,"تم":1316,"تل":864,"تق":1058,"تو":2257,"بچ":213,"ثا":431,"تن":759,"تج":398,"تح":1528,"تر":2642,"تخ":631,"تد":422,"اڑ":396,"تش":356,"تص":699,"تس":246,"تظ":210,"تع":3085,"اں":1731,"ثل":246,"جا":8260,"ثم":243,"اہ":3738,"جب":952,"ثي":412,"تي":6367,"اک":3503,"ثر":502,"اگ":766,"بڑ":994,"ئے":3028,"ؤں":259,"بغ":203,"بع":1847,"بن":3272,"بم":183,"بل":2052,"بق":773,"بد":925,"اً":548,"بج":198,"بح":566,"بت":774,"اي":9647,"او":12882,"بط":281,"بص":181,"بز":199,"بس":224,"اُ":281,"بر":3415,"اپ":1892,"اٹ":526,"تا":10999,"اچ":510,"تب":1015,"بو":1546,"ت،":328,"بي":4832,"ئد":211,"ئر":536,"ئش":454,"ا،":402,"اء":804,"اؤ":588,"ئل":407,"ائ":7576,"ئم":430,"ئع":221,"از":2184,"ار":13604,"اد":7175,"اض":766,"اص":2408,"اش":1157,"اس":13319,"ات":12391,"اب":5157,"ئن":668,"اخ":1510,"اح":1946,"اج":1405,"اث":517,"ئي":5602,"اف":2468,"اق":3776,"ام":10852,"با":7594,"ان":23472,"ال":15347,"اع":2513,"اغ":318,"اط":844,"اظ":619,"آپ":794,"آئ":413,"آب":1377,"آت":374,"آج":214,"آخ":195,"آر":259,"آس":262,"آز":333,"آف":182,"آل":275,"آم":177,"آن":518,"آي":281,"عے":301,"ٰ 
":195,"عہ":1012,"طہ":234,"غي":910,"غل":246,"عي":1223,"غر":737,"صے":202,"عق":239,"عل":4649,"صہ":487,"عم":2601,"غا":645,"عن":1246,"عو":588,"عث":370,"ظي":492,"عت":818,"عد":2145,"عز":175,"عر":1942,"عظ":634,"عض":358,"عا":2408,"عب":841,"ظم":515,"شہ":2846,"ظا":851,"طن":541,"سہ":287,"طل":743,"سے":11317,"ظر":549,"شک":686,"طي":706,"طو":1493,"ضم":182,"زہ":344,"ضل":1190,"رے":1402,"طر":1375,"ضي":496,"سک":2597,"طح":262,"طا":2021,"طب":814,"ضو":288,"سپ":375,"صل":1848,"صف":401,"صط":357,"سٹ":824,"ضر":887,"صو":2504,"رہ":3575,"صن":434,"ضا":582,"صي":766,"دے":538,"شع":464,"رک":3328,"صح":404,"صد":1147,"صر":962,"رگ":411,"شم":1789,"ذہ":258,"صا":794,"شن":774,"صب":345,"شو":358,"شي":1319,"سع":252,"سط":727,"دگ":457,"سف":448,"دھ":774,"رپ":455,"رٹ":425,"رڈ":213,"سي":6156,"شت":955,"رچ":305,"سو":1823,"شا":3056,"سن":1275,"دہ":2389,"سم":1996,"سل":3485,"شر":1691,"شد":243,"شخ":304,"ذک":274,"سب":1244,"سا":5309,"زن":603,"ست":5770,"زو":460,"زم":950,"زل":234,"سر":2705,"سج":282,"زي":3808,"دک":188,"رس":1755,"رش":298,"رر":201,"رز":482,"جے":273,"رط":461,"رص":252,"رض":357,"رل":427,"رق":1634,"رف":1286,"رو":5304,"زب":1100,"رن":2342,"زا":1886,"رم":1656,"ري":12753,"زر":534,"ذر":554,"جگ":285,"رآ":320,"جھ":599,"جہ":1619,"رب":2493,"را":9267,"رت":4343,"رج":1190,"ذي":412,"رخ":287,"رح":1014,"حک":1119,"رد":2466,"ف ":3475,"ع ":4219,"ڑي ":543,"غ ":230,"ص ":761,"ض ":697,"ط ":839,"ظ ":1429,"ر ":38086,"ز ":2573,"س ":11232,"ش ":1550,"ِ ":847,"ً ":542,"ي ":57056,"ن ":18794,"و ":15262,"ق ":2735,"م ":14225,"ل ":13265,"ينہ":361,"ينے":248,"يوں":1103,"وہ ":2016,"يقي":265,"يلا":274,"يلي":664,"يما":422,"يلو":184,"ينا":221,"نہي":1033,"يمي":391,"نہو":438,"يٹ ":181,"يني":579,"يقہ":243,"وچس":192,"يور":512,"يوا":288,"يون":715,"ينڈ":257,"وں ":6861,"ہے":17450,"يٰ ":189,"وٹي":197,"يعے":209,"وگ ":226,"يرہ":738,"يشي":298,"يشن":306,"يسي":670,"نگي":216,"نگل":243,"يسو":203,"يسر":205,"ے،":1342,"نگر":1405,"يزي":1326,"نگا":269,"يسا":524,"يرو":314,"يري":407,"يعن":649,"يسے":586,"۔ا":238,"يثي":187,"يا۔":1172,"نکا":269,"ے۔":10861,"يتي":187,"ياں":353,"يرا":565,"نکہ":250,"يدا":1270,"نے ":7909,"يال":283,"يان":856,"يبا":352,"يام":192,"ياس":748,"يار":881,"ياد":1222,"ياض":232,"ياء":241,"يات":2312,"ياب":173,"يائ":508,"کچ":345,"کٹ":398,"کي":17718,"ي۔ ":1247,"کس":3186,"کش":375,"کر":6688,"کز":505,"کث":328,"کت":1905,"کو":8220,"کن":1311,"کم":1336,"کل":1933,"کئ":359,"کب":357,"کا":13105,"يۓ ":585,"يے ":1282,"گہ":320,"گھ":468,"گل":520,"گن":309,"گو":1092,"گي":2325,"گز":223,"گر":3536,"گئ":1042,"گا":1385,"کے":18417,"کھ":2829,"کہ":6908,"ں،":524,"گے":188,"يہ ":6011,"يڈي":227,"ھے":1744,"ہز":291,"ہر":3373,"ہج":298,"ہت":1501,"ہد":644,"ہا":4675,"ہب":255,"ہي":8193,"ہل":1738,"ہم":1207,"ہن":1510,"ہو":9694,"ں۔":3376,"ھم":206,"ھل":258,"ھي":5556,"ھو":1393,"ھن":692,"ہ،":328,"ھر":965,"يں ":20810,"ھا":3850,"ھت":572,"ں ":29650,"ڈر":227,"چي":975,"چن":411,"چو":627,"وہا":172,"ڈا":533,"چک":371,"چہ":230,"چھ":1116,"ڈو":318,"ڈي":856,"پھ":684,"پہ":1407,"چس":237,"چا":849,"چت":269,"چل":282,"ڑا":619,"ۃ ":184,"ۂ ":441,"ڑے ":367,"يٹر":982,"ڑي":670,"يک ":5968,"ھ ":2275,"چے":173,"ہ ":29676,"وگو":199,"ڑک":206,"ڑھ":467,"ڑے":384,"ے ":59880,"ۓ ":1150,"۔ ":15851,"و۔ ":200,"وز ":174,"ور ":14043,"ود ":1161,"ڈي ":274,"لگ ":206,"وس ":423,"چين":228,"نما":532,"وع ":503,"نيا":1482,"نوي":380,"نون":235,"نور":347,"نوب":664,"نوا":373,"نٹ ":234,"وف ":378,"نيہ":422,"نيو":458,"نوں":1111,"وم ":918,"لہ ":1961,"ون ":1017,"چند":174,"ول ":1088,"نڈ ":285,"وي ":1068,"ي، ":669,"نچ ":177,"مغر":519,"معل":267,"معن":336,"معر":258,"مشہ":684,"معا":596,"چست":201,"مقا":854,"مقد":219,"چان":177,"ملت":253,"ملا":357,"چتر":233,"قے 
":368,"منا":208,"نائ":404,"مند":339,"نات":285,"منت":213,"نار":304,"مما":299,"ملي":227,"موا":174,"ملک":619,"موج":660,"مور":225,"موس":304,"موع":240,"نام":2044,"نان":513,"ناي":217,"نتق":217,"نتظ":204,"نتخ":209,"مون":289,"موم":337,"مول":250,"ميل":269,"ميد":233,"مير":641,"ميا":812,"ميت":203,"نتي":206,"نجا":562,"مين":762,"ميٹ":952,"موں":292,"ندا":677,"ميں":16302,"ندو":448,"ندر":713,"ندي":569,"چار":237,"نسا":448,"چي ":376,"وا ":736,"ندگ":273,"ندہ":249,"نسي":397,"نسل":289,"ندھ":311,"وب ":577,"وت ":377,"نظا":416,"نظر":493,"وج ":244,"نظي":172,"لک ":878,"ونا":319,"ومي":882,"ونس":207,"وما":454,"ولي":616,"ولو":168,"ومت":782,"يع ":214,"ولا":396,"وقت":472,"وفا":282,"يش ":423,"يس ":618,"نگ ":881,"ويں":364,"يق ":261,"يف ":611,"مکم":188,"ونے":813,"نڈي":216,"چہ ":203,"ونک":263,"چھ ":368,"وني":567,"وٹ ":170,"ونو":241,"وڈ ":203,"يو ":296,"نہ ":1535,"يم ":2131,"ين ":3424,"مگر":341,"يل ":1806,"لے ":2120,"وئے":873,"واں":205,"وتي":759,"وتا":1184,"وبہ":880,"وجي":169,"وجو":858,"وا۔":228,"وار":699,"واز":279,"واد":234,"واج":227,"وئي":1076,"واب":205,"وائ":356,"وبي":420,"واي":171,"واق":1340,"وال":2546,"وان":598,"وبا":263,"وام":369,"وري":875,"مہ ":911,"وست":275,"لگا":240,"وزي":317,"يا ":8493,"وسر":713,"وسط":295,"ورپ":286,"ودہ":176,"وسي":366,"يب ":717,"ورہ":195,"لکي":197,"وتے":500,"ودي":279,"ورا":750,"وجہ":615,"ورس":204,"ورت":506,"لکہ":196,"ورن":218,"لکھ":453,"وغي":292,"يد ":1164,"ير ":2092,"يز ":500,"يت ":1838,"يج ":176,"يح ":219,"يخ ":423,"لد ":289,"قصب":216,"لت ":358,"لا ":1251,"قسم":285,"لب ":395,"قسي":239,"�":423,"لق ":599,"لف ":570,"چھو":396,"قوں":260,"ما ":498,"لم ":1252,"لع ":962,"قيق":283,"قيا":359,"قوم":305,"قوا":311,"قل ":209,"فرا":633,"فرو":181,"فري":361,"قي ":950,"ل، ":225,"فيص":180,"قبہ":284,"قدر":190,"قدي":366,"قري":696,"قرآ":222,"قرا":218,"قال":197,"قائ":507,"قاب":316,"قات":245,"فلم":218,"فوج":255,"قبو":176,"قبل":223,"قان":205,"قبا":188,"قام":703,"فٹ ":202,"لما":635,"ماع":315,"لمي":331,"مات":656,"مار":1346,"ماد":258,"لند":373,"مائ":379,"لفظ":904,"لفا":303,"نس ":426,"لعہ":239,"ند ":917,"لطا":204,"مي ":1920,"لطن":355,"مطل":234,"ني ":4359,"و، ":185,"مطا":719,"مصن":197,"مصر":194,"مرک":583,"نو ":281,"مذہ":183,"مشر":609,"مسل":679,"مشت":387,"مست":254,"مسج":242,"مسا":197,"قہ ":928,"مري":646,"مجھ":254,"مرا":743,"مرب":331,"مرت":186,"مدي":169,"ليے":1014,"ليۓ":470,"مدد":185,"ليک":538,"ليہ":571,"مخت":598,"لوگ":423,"محم":603,"لوں":448,"لوچ":226,"لين":425,"مجم":256,"لنے":191,"ليت":222,"متي":175,"ليا":836,"ماہ":260,"ليم":656,"مثل":227,"لوي":317,"لوم":894,"للہ":672,"متح":211,"متع":399,"مال":2414,"مام":560,"مان":1742,"مبا":175,"ماي":231,"مبر":417,"مر ":315,"مد ":873,"لو ":198,"ے۔ ":8070,"مت ":1234,"لي ":3595,"ن، ":362,"لسل":386,"نب ":260,"نا ":1656,"من ":398,"فہ ":313,"نت ":572,"لدي":343,"لحک":303,"مل ":1925,"لتي":176,"لاک":279,"لئے":321,"لاہ":228,"لتا":182,"لحا":247,"لاح":555,"لاد":170,"لاز":192,"لائ":397,"لات":751,"لاق":1241,"لاف":367,"مع ":244,"لاو":376,"لام":1294,"لان":427,"لبا":312},"n_words":[1602570,1999510,1324903],"name":"ur"}
\ No newline at end of file
+++ /dev/null
-{"freq":{"D":18934,"E":10094,"F":9985,"G":23207,"A":42579,"B":48257,"C":72224,"L":45665,"M":47574,"N":70917,"O":10000,"H":57302,"I":15743,"J":5371,"K":22108,"U":5400,"T":104616,"W":8071,"V":30021,"Q":14744,"P":61692,"S":43824,"R":17604,"Y":3509,"X":6207,"Z":3364,"f":32992,"g":525146,"d":205922,"e":428748,"b":182777,"c":671453,"a":692878,"n":1382200,"o":437498,"l":442210,"m":420259,"j":4762,"k":100510,"h":937660,"i":759263,"w":22119,"v":178461,"u":410719,"t":952446,"s":286409,"r":444790,"q":33677,"p":174320,"z":16996,"y":192811,"x":29082,"²":689,"Î":330,"É":501,"Á":3556,"Â":2373,"Ý":1288,"ß":648,"Ú":1274,"Ô":1751,"í":49339,"ì":32480,"ê":66073,"é":38397,"è":3862,"ç":233,"ä":808,"ã":19808,"â":112923,"á":116051,"à":337783,"ü":3187,"ý":5197,"ú":10002,"ù":30808,"ö":2319,"ô":98885,"õ":1117,"ò":13544,"ó":78081,"ñ":390,"đ":198318,"Đ":41649,"ă":37793,"ā":557,"ĩ":9562,"ī":249,"ō":772,"ũ":5403,"ū":650,"ư":147894,"ơ":25607,"́":204,"ο":236,"ι":146,"λ":139,"α":217,"ς":203,"ρ":170," l":242575,"ь":159," m":173509," n":244497,"я":194," o":12009," h":134731," i":13655," j":644,"ы":142," k":76050," d":93354," e":7007," f":8319," g":49741,"ч":190,"р":638,"с":522," a":22488," b":142456,"т":441,"у":252," c":262807," y":1958," x":17955," z":365," u":1259," t":566783," w":4693," v":156401," q":28586," p":59115," s":94989," r":14869," J":5319," K":22031," H":57162," I":15633," N":70771," O":9935," L":45508," M":47408," B":48077," C":71972," A":42454,"С":156," F":9904," G":23091," D":18825," E":10050,"л":506,"к":551," Z":3238,"й":211," Y":3492,"и":885," X":6141,"о":889,"н":659,"м":235,"г":168," S":43565,"в":508," R":17493," Q":14713,"б":149,"а":1192," P":61533," W":8016," V":29925," U":5377,"е":792,"д":263," T":104253," á":1499," â":3662," í":541," ô":2220," ý":503," ă":627," Đ":41607," đ":198156," Â":2373," Á":3556," É":500," Î":330," Ô":1748," Ú":1274," Ý":1277," ư":1144,"ي":271,"ل":264,"م":187,"ن":142,"ا":398,"ر":178,"A ":2159,"F ":494,"Da":2569,"Cu":1591,"Cy":963,"Cl":1245,"Co":10944,"Cr":1746,"Ce":2576,"Ch":19039,"Ci":1005,"G ":637,"Ec":908,"Ed":361,"Ea":530,"Du":1267,"Dy":152,"Do":2470,"Dr":1247,"De":2217,"Di":2894,"Bà":792,"Bá":858,"Fe":902,"H ":455,"Fa":1831,"Eu":2126,"Ex":464,"Er":617,"Et":367,"Es":894,"En":771,"Em":460,"Ep":314,"Ei":335,"El":872,"Cá":2732,"Ge":2583,"Câ":290,"Cà":157,"Ga":3052,"I ":2174,"Fu":749,"Fr":2327,"Bí":387,"Bì":1773,"Fo":1217,"Bé":191,"Fl":1104,"Fi":1024,"B ":642," С":156,"C ":1227,"Av":428,"Au":2997,"Ar":5387,"Aq":1658,"At":1067,"As":1434,"D ":745,"Ba":13610,"Az":1190,"Ay":139,"Ae":265,"Af":962,"Ag":750,"Ah":192,"Ab":577,"Ac":1193,"Ad":611,"Am":2029,"An":10483,"Ap":626,"Ai":1117,"Ak":164,"Al":6955,"By":154,"Bu":3220,"Br":4012,"Ca":14082,"E ":3496,"Bh":176,"Bi":2488,"Be":3517,"Bo":4388,"Bl":876,"Gò":247,"Ku":658,"Gö":233,"Ky":391,"Kn":144,"Kl":231,"Kr":781,"Ko":952,"Hã":340,"Hà":3871,"Há":6912,"Le":5532,"Li":5716,"N ":1502,"La":11255,"Lu":1591,"Hó":455,"Hò":671,"Ly":614,"Bư":341,"Hé":321,"Lo":11096,"Hì":160,"Me":5622,"Mi":7578,"Cơ":380,"O ":859,"Ma":14564,"Cư":440,"My":979,"Mu":2726,"Mo":6001,"Nh":15252,"Ni":3602,"Ng":8870,"Ne":5246,"Na":14732,"P ":1136,"Dư":2196,"ưu ":1732,"Ny":327,"Nu":290,"No":7260,"Ok":174,"Ol":595,"Om":201,"On":483,"Oh":203,"Oi":748,"Oc":485,"Od":183,"Oe":270,"Oa":270,"Ob":773,"Gi":6635,"Gh":326,"Gl":496,"Gr":4171,"Go":1400,"Gu":2462,"Gy":286,"Cô":3480,"Có":402,"Cú":325,"J ":219,"Ha":7544,"Dâ":1283,"He":2483,"Hi":2709,"Ho":13976,"Hu":4398,"Hy":1927,"Dô":340,"K 
":324,"Ib":265,"Id":176,"Ic":220,"Im":190,"In":3623,"Il":428,"Is":1691,"It":2092,"Ir":1003,"Ja":1797,"L ":642,"Ji":230,"Je":710,"Jo":1104,"Ju":1037,"Ka":2594,"M ":626,"Kh":4970,"ưng":2590,"Ki":2990,"Ke":1180,"Ut":638,"Ur":404,"Um":166,"Un":919,"Uk":408,"Ul":140,"Ug":456,"W ":203,"Ty":201,"Tw":155,"Tu":2972,"Tr":21381,"To":2690,"Th":42787,"Ti":3875,"Te":2952,"Ta":3822,"V ":758,"Sw":579,"Sy":613,"St":3687,"Su":1921,"Wo":921,"Wi":1647,"Wh":297,"Sé":358,"Sè":329,"Wa":2024,"Sâ":534,"Sá":354,"We":2017,"Sà":247,"Y ":322,"Lư":806,"Vo":1915,"Vu":318,"Vi":16219,"Ré":208,"X ":401,"Va":2214,"Ve":2267,"Uy":394,"Lă":215,"Mã":1030,"Má":214,"Lý":664,"Lü":151,"Mé":245,"Pt":637,"Pu":1475,"Pr":3920,"Ps":557,"S ":1842,"Py":3299,"Pe":2603,"Là":800,"Lã":279,"Pf":1720,"Lâ":833,"Pa":7810,"Lé":142,"Pl":1384,"Po":3931,"Lê":927,"Pi":2275,"Ph":31098,"Os":655,"Ot":301,"Ou":271,"Ov":238," ا":171,"Op":394,"Or":2938,"R ":429,"Kô":237,"Se":3444,"Sc":3122,"Si":2745,"Nă":720,"Sh":1345,"Sm":208,"Sl":522,"Sk":307,"Sr":367,"Sp":2000,"So":3317,"Ru":1170,"Nô":330,"Ry":155,"Nó":11015,"U ":337,"Hư":909,"Sa":11442,"Re":2085,"Ri":2194,"Rh":4441,"Ro":3835,"Qu":14208,"Mô":839,"T ":649,"Mù":252,"Ra":1904,"Mü":183,"Sơ":2030,"Sư":226,"Yê":502,"Xã":2079,"Wü":541,"Xô":549,"ưa ":1620,"Xí":179,"Vĩ":456,"b ":1444,"a ":226759,"Tư":1027,"Vũ":589,"Tù":160,"Tú":283,"Tô":962,"Xy":153,"Nư":140,"Ye":349,"Ya":435,"Yp":239,"Yo":874,"Sĩ":507,"Yv":187,"Yu":292,"Mư":225,"Só":240,"Sô":413,"Tâ":9624,"Tà":542,"Xe":229,"Tá":302,"Sü":308,"Xa":216,"Tê":676,"Xi":420,"Tò":164,"Xu":1093,"Tí":259,"Vù":202,"Vă":1172,"Za":1272,"Và":236,"Ze":739,"Zh":185,"Zi":458,"Vâ":484,"Tĩ":188,"Zu":199,"Võ":191,"i ":212782,"bó":2113,"cà":270,"gd":295,"cá":22579,"ge":11280,"câ":2231,"bú":210,"ga":12467,"bé":224,"fl":1616,"bã":190,"fg":215,"ff":1242,"fi":2497,"fs":160,"fr":3324,"fu":1164,"ft":684,"fo":4267,"bê":1228,"bí":976,"bì":10910,"j ":155,"cù":1871,"cú":386,"gy":681,"có":47444,"gw":161,"cô":5046,"dâ":28874,"dã":502,"dà":2537,"he":41577,"hb":191,"ha":43156,"gn":5754,"gm":577,"gl":1738,"gk":293,"gi":33763,"bă":263,"gh":11779,"gg":570,"cò":3292,"gu":10662,"gt":510,"gs":1016,"gr":2654,"go":8645,"dt":805,"du":4489,"dw":715,"dy":507,"g ":364697,"ea":15752,"eb":3649,"ec":8564,"ed":11541,"de":24490,"dd":556,"dg":298,"df":167,"di":32181,"dh":710,"dk":145,"dm":293,"dl":1043,"do":14691,"dn":264,"ds":1755,"dr":3533,"ew":3323,"ex":3581,"eu":6762,"ev":2827,"ey":3123,"ez":1216,"fa":5205,"h ":178273,"bà":2398,"bá":2351,"fe":2871,"eh":679,"eg":4460,"ef":1217,"ee":4546,"el":24442,"ek":799,"ej":311,"ei":13045,"ep":4148,"eo":14079,"en":49087,"em":10013,"et":12592,"es":40569,"er":51401,"eq":167,"ca":30360,"Xư":246,"e ":134279,"bw":303,"by":1138,"bs":628,"br":3660,"bu":6071,"bt":170,"bn":144,"bo":4357,"bl":1961,"bf":332,"bh":146,"bi":46026,"bb":920,"be":11159,"db":293,"da":38163,"f ":9188,"cy":1022,"cu":9388,"ct":7017,"cs":417,"cq":226,"cr":3098,"co":14606,"cm":248,"cn":200,"ck":4238,"cl":1934,"ci":14195,"ch":129179,"ce":13915,"cc":1731,"Vư":1101,"c ":282171,"az":1913,"ay":20261,"ba":27162,"d ":28581,"at":21123,"as":21308,"ar":46226,"aq":534,"ax":1123,"aw":1934,"av":3839,"au":17184,"ak":3208,"al":35399,"ai":30021,"aj":1274,"ao":21790,"ap":5292,"am":43172,"an":108096,"ac":18572,"ad":10374,"aa":985,"ab":4752,"ag":8446,"ah":2616,"ae":34413,"af":1148,"nu":5357,"nt":26825,"ns":10879,"nr":434,"nq":250,"np":246,"no":9747,"hĩ":3597,"nn":7993,"q 
":277,"nz":1727,"dư":2566,"ny":2595,"nx":314,"nw":217,"nv":655,"oe":1614,"of":8008,"oc":9998,"od":5406,"oa":17234,"ob":3593,"ké":466,"om":14342,"kê":1778,"on":100425,"ok":1733,"kè":139,"ol":16267,"oi":9431,"oj":178,"og":4447,"oh":1218,"ot":8950,"m²":659,"hō":191,"os":13276,"ov":4515,"ou":16281,"kì":215,"op":11878,"oo":2896,"or":34485,"oq":166,"gũ":395,"kí":1225,"r ":23753,"ox":666,"ow":4522,"oz":1526,"oy":925,"là":155485,"lá":810,"pe":13642,"pf":220,"lâ":707,"lã":1014,"pa":8707,"ký":795,"lè":430,"pl":2303,"lé":393,"lê":819,"po":6064,"ph":59995,"pi":7555,"lo":48870,"ln":332,"hê":913,"lm":1499,"hé":1262,"hè":446,"ll":28312,"ls":5922,"hí":22956,"lp":3661,"hì":4519,"hó":4714,"lw":160,"hò":5943,"lv":1420,"lu":8343,"lt":3135,"bư":9431,"lz":2455,"hö":169,"ly":7272,"hô":9522,"hú":4659,"hù":1344,"o ":85116,"iß":177,"ià":688,"hü":788,"ma":19960,"mb":6117,"mg":163,"hă":499,"me":15863,"iá":6621,"cơ":2439,"iè":809,"ml":141,"mi":19185,"mn":579,"iê":19338,"mm":3914,"ié":147,"mp":6181,"mo":8059,"mt":201,"ms":1171,"mu":3841,"iô":221,"ió":194,"cư":776,"my":849,"p ":57766,"iú":305,"na":36440,"nb":2672,"nc":9194,"nd":29108,"ne":33625,"nf":985,"ng":422545,"nh":179739,"ni":21672,"nj":602,"nk":2141,"nl":3093,"nm":685,"ju":396,"jo":698,"ki":21387,"kh":39807,"gã":188,"gâ":871,"ke":3413,"gá":300,"gà":10665,"ka":4069,"m ":167316,"gó":565,"gô":2554,"ky":750,"ks":1201,"kt":384,"ku":945,"ko":1602,"cũ":2937,"kr":861,"kk":293,"kl":1148,"km":4687,"kn":974,"li":29575,"lh":510,"lk":1201,"le":28117,"há":50648,"hà":29720,"ld":3791,"hã":1199,"lg":997,"hâ":38191,"lf":811,"la":39723,"lc":719,"lb":2760,"n ":440934,"hr":3391,"hs":2273,"dò":649,"hw":1075,"ht":2057,"hu":96505,"hk":147,"hh":235,"că":337,"hi":59002,"hn":1134,"ho":33760,"hl":2514,"hm":909,"dé":1520,"id":34385,"ic":24617,"ib":3478,"ia":42159,"ih":435,"ig":7165,"if":3529,"ie":18418,"hy":2981,"dù":2108,"k ":5941,"iq":1214,"ir":10741,"is":30398,"it":19216,"iu":2880,"iv":3060,"iw":152,"eó":2057,"ix":1054,"ii":6395,"ij":559,"ik":1883,"il":38718,"im":11936,"in":70002,"io":10270,"ip":6214,"je":426,"ji":826,"iz":1625,"iy":348,"l ":17786,"ja":1818,"nơ":1994,"să":488,"xi":2236,"tê":7401,"xo":824,"té":201,"tì":8961,"xp":151,"tí":18614,"tò":332,"xt":572,"xu":3759,"mư":601,"sô":2011,"ww":348,"só":466,"z ":4461,"xc":234,"xa":2141,"tâ":11580,"tá":3122,"xe":1418,"tà":3376,"wh":1250,"ră":636,"wi":4079,"sè":470,"wl":199,"sé":203,"wn":1808,"sê":269,"wo":1048,"ws":516,"wt":183,"rò":801,"rõ":266,"rô":669,"lư":3756,"rö":667,"rù":551,"y ":123996,"rú":838,"rü":313,"wa":4838,"sâ":1032,"sá":4087,"we":3258,"rè":166,"ré":3552,"vi":14749,"râ":232,"rã":310,"vu":13359,"vr":721,"rì":2865,"rí":1219,"rê":17007,"vo":1414,"uz":955,"uy":40068,"ux":1932,"uw":188,"uv":1489,"ve":8349,"rá":1135,"rà":746,"va":5946,"x ":5085,"ui":7198,"uj":271,"uk":987,"ul":11719,"ue":8283,"uf":658,"ug":2145,"uh":313,"mũ":152,"ur":22852,"us":22737,"ut":11022,"um":8525,"un":37448,"uo":379,"up":2448,"ty":3384,"tz":1632,"tu":10835,"tt":6267,"tw":506,"tv":273,"ub":3534,"ua":12317,"ud":3186,"uc":4195,"w ":3929,"to":19941,"tn":373,"tm":750,"tl":2439,"ts":3488,"tr":168490,"lũ":299,"tp":612,"tg":165,"tf":340,"te":34125,"ti":45992,"th":244395,"v ":848,"tb":456,"tc":539,"ta":22518,"su":6738,"sv":185,"ss":11481,"st":25304,"sy":900,"sw":1422,"sl":2460,"sk":1966,"sn":3570,"sm":2007,"sp":6570,"so":7204,"sr":482,"sq":267,"oà":40273,"sd":636,"oß":224,"sc":6190,"sf":434,"se":16078,"oá":2469,"nă":29159,"sh":6976,"sg":779,"oã":161,"si":19668,"hư":18279,"rz":821,"u 
":92839,"nú":1257,"sa":12158,"sb":1450,"rr":9518,"rs":7765,"rt":11056,"ru":33603,"rv":1083,"rw":648,"nó":3472,"nô":498,"ry":4815,"rq":206,"rp":2369,"lĩ":607,"ro":79309,"nê":766,"rn":10149,"né":3611,"rm":6127,"rl":2540,"nç":154,"rk":3126,"hơ":3447,"ri":45411,"rh":1015,"rg":11132,"nâ":296,"rf":2041,"re":31859,"rd":9664,"nà":32104,"า":167,"rc":4893,"rb":3024,"ra":48559,"mù":496,"t ":240212,"gư":23033,"mó":270,"mô":1896,"qu":33189,"ร":145,"mé":23563,"mì":570,"má":2261,"mã":976,"lý":2656,"mà":2647,"s ":83922,"lú":457,"lô":11642,"px":1137,"py":386,"pt":4331,"pu":3257,"lò":168,"pp":2831,"lí":1363,"pr":2803,"ps":2406,"hū":297,"vư":1017,"zè":171,"vũ":680,"tư":4765,"sơ":283,"yê":5321,"sư":822,"xá":866,"xâ":942,"xã":6617,"rư":7067,"xé":140,"vĩ":1312,"xí":212,"vă":2376,"vù":22603,"zz":263,"vâ":231,"zh":389,"zi":1581,"uý":467,"zb":261,"và":46943,"ze":2177,"za":3062,"yz":169,"vò":489,"võ":378,"vô":1031,"zu":1605,"zo":1512,"zn":191,"ví":191,"vì":1173,"zl":195,"yg":495,"yh":272,"tă":382,"ye":2354,"uá":1249,"uâ":5412,"yf":154,"yc":1911,"yd":753,"ya":3754,"yb":311,"tú":167,"tù":418,"nư":35853,"tô":1034,"xy":294,"yx":252,"uô":12078,"yu":633,"yt":872,"ys":3871,"yr":5371,"yp":2222,"sĩ":1785,"yo":1215,"yn":1357,"uê":505,"ym":1622,"ué":319,"yl":3067,"yi":713,"xư":530,"yū":142,"² ":668,"Á ":1673,"Áv":245,"Áo":1420,"Âu":2129,"Âm":150,"Îl":330,"àn":34510,"ào":9571,"àm":2558,"ài":40656,"ã ":15139,"ày":41288,"àu":3190,"ám":1101,"án":28484,"áo":4300,"áp":27542,"ái":6121,"ác":31049,"áy":2255,"áu":552,"át":6111,"âm":6839,"ân":71359,"âu":6947,"ât":496,"ây":27219,"ãn":2862,"ão":485,"ãi":672,"ãy":559,"à ":205928,"á ":8250,"ße":294,"Úc":1243,"Ý ":1285,"Ôn":1610,"アアア":304,"ôi":2574,"ôm":11781,"ôn":48478,"óa":2709,"õ ":900,"ói":993,"óc":731,"óp":293,"ón":6670,"óm":1531,"ô ":34032,"òa":5612,"òm":318,"òn":6337,"ó ":64834,"ña":207,"ò ":1069,"ín":13049,"ít":636,"ìn":20626,"ìm":8528,"íc":17864,"ía":12239,"í ":5056,"ên":48870,"êm":9168,"êu":3630,"éz":154,"ì ":3075,"él":169,"éo":764,"ép":2326,"ém":434,"én":3441,"és":341,"ét":23592,"ér":825,"év":213,"éb":181,"éd":152,"éc":499,"ée":3394,"ég":181,"èn":257,"èo":280,"èr":1433,"ès":603,"èv":444,"ê ":4050,"é ":1396,"Đưể":826,"è ":371,"är":168,"ăm":28997,"ăn":8563,"ăk":142,"đưể":28774,"ān":253,"ý ":5155,"ể":1508247,"ün":249,"ür":1564,"üt":177,"üc":207,"üd":322,"ùn":27807,"ùi":235,"ùa":1067,"úp":776,"ún":1686,"úy":219,"út":637,"úa":606,"ùy":248,"úi":1359,"úc":3127,"ù ":1286,"ú ":1439,"ôt":1632,"öt":180,"ör":366,"ös":634,"ön":312,"öl":155,"đĩ":597,"đó":3828,"đô":27703,"đú":182,"đă":220,"Đư":848," ể":80643,"đư":29695,"đơ":1908,"tưể":2647,"Đa":443,"Đo":196,"Đi":1420,"đe":478,"đa":1708,"đo":2470,"đi":17430,"Đ ":295,"Đì":322,"Đà":1638,"Đá":165,"Đâ":3555,"đí":392,"đì":460,"đê":8064,"đã":4348,"Đă":188,"đá":3730,"đâ":1604,"đà":1035,"Đô":12692,"đu":489,"Cể":8674,"Dể":759,"Bể":9876,"Hể":9168,"Gể":170,"Lể":3634,"Kể":5925,"ĩ ":4602,"ĩa":3526,"ĩn":1420,"ĩnh":1417,"dể":12219,"ĩa ":3523,"cể":67050,"bể":45726,"mể":131131,"ũ ":2155,"lể":23218,"kể":7585,"iể":166241,"hể":264262,"gể":17545,"Sể":1192,"Tể":7402,"Rể":525,"ō ":339,"Mể":5140,"vưể":475,"Nể":1896,"Xể":142,"Vể":1294,"rể":50434,"sể":54959,"tể":83275,"uể":92128,"nể":21381,"ū ":399,"oể":14719,"ũi":200,"ũn":2955,"vể":53705,"专专 ":169,"xể":2191,"yể":31245,"ơi":3100,"ơm":181,"ơn":18072,"ơ ":4182,"あ":270,"ア":496,"가가 ":224,"ươ":12314,"ư ":8494,"ưa":1622,"ưn":2592,"ưu":1733,"Để":19679,"để":91689,"Đan":304,"đa ":614,"ơm ":179,"ơn ":5934,"đai":310,"đan":664,"ơi ":3096,"đi ":743,"ơng":12128,"đen":294,"đo 
":419,"đoà":1003,"đua":287,"乙":857,"乘":179,"之":2791,"丹":642,"临":665,"中":142,"並":1114,"丙":309,"丘":935,"丛":252,"专":2269,"且":143,"丈":196,"三":4224,"丁":2034,"万":989,"亞":563,"亂":327,"侏":215,"ểy ":15070,"ểu ":43780,"ểt ":179200,"Đây":3553,"Đào":412,"Đài":765,"Đà ":277,"Đôn":3441,"圓":161,"Đô ":9162,"Đìn":322,"冲":171,"đêm":8016," 丘":335,"đíc":372," 专":413," 三":1027," 丁":795,"đìn":459,"đây":1563,"đá ":2316,"倉":569,"đán":1131,"đã ":4325,"đào":392,"đài":257,"đàn":318,"đún":140," 倉":244,"ưể":121112," 侏":158,"đón":976,"đôi":461,"đôn":8728," 並":342," 临":320," 丹":351," 之":799," 乙":411,"đó ":2805,"đô ":18495," 亂":152,"大":197,"ểa ":61642,"ểc ":245982,"ểch":9163,"ểi ":112695,"ểk ":174,"ển ":203101,"ểm ":59200,"ểp ":25010,"ểng":124564,"ểnh":41635,"ểo ":9528,"Điể":1233,"ああ":156,"điể":16552,"đoể":920,"đăn":218,"đĩa":596,"ς ":202,"đơn":1904," ểc":15788," ển":5390," ểm":264," ểy":799,"アア":391," ể ":57997,"đươ":308,"đưa":612,"ск":150," vư":1017," xư":529," Áo":1420," Áv":245," Á ":1673," Ga":3043," Câ":289," Cá":2724," Ge":2570," Cà":157," I ":489," Bì":1773," Bí":387," Fo":1209," Fu":749," Fr":2319," Fi":1014," Fl":1075," Bé":190," Ha":7534," He":2474," Dâ":1282," Cô":3477," Có":398," Gy":285," Cú":325," J ":162," Go":1392," Gr":4156," Gu":2454," Gh":322," Gi":6622," Gl":488," Id":176," Ic":219," Ib":265," Hy":1924," Dô":340," Hu":4380," Ho":13959," Hi":2699," Ji":227," Je":694," L ":288," Ja":1791," Ir":1002," Is":1688," It":2091," Im":186," In":3565," Il":425," M ":229," Ka":2587," Ke":1169," Ki":2975," Kh":4951," Jo":1090," Ju":1029," N ":147," La":11244," Hà":3868," Há":6905," Le":5519," Hã":339," Li":5628," Kl":231," Kn":141," Ko":946," Kr":777," Gò":247," Ku":656," Gö":233," Ky":391," Ma":14506," O ":139," Cơ":380," Mi":7555," Me":5610," Hì":157," Lo":11086," Hé":321," Bư":341," Ly":614," Hó":455," Hò":670," Lu":1590," Ne":5220,"а ":296," P ":285," Na":14703," Ng":8844," Nh":15231," Ni":3593," Mo":5991," Cư":440," My":978," Mu":2713," A ":801," B ":314," C ":460," Ap":624," Am":2023," An":10467," Ak":163," Al":6944," Ai":1106," Ag":749," Ah":189," Ae":262," Af":957," Ac":1189," Ad":608," Ab":573," Ba":13586," D ":281," Az":1190," Ay":139," Av":424," Au":2992," At":1064," As":1419," Ar":5374," Aq":1657," Be":3504," Bi":2420," Bh":176," Bl":874," Bo":4376," Br":4006," Bu":3207," By":153," E ":175," Ca":14032," Ce":2572," Ci":1003," Ch":19001," Cl":1229," Cr":1732," Co":10896," Cu":1575," Cy":959," F ":217," Da":2548," Di":2876," De":2206," Dr":1245," Do":2436," Dy":152," Du":1262," Ea":529," Ec":908," Ed":358," G ":173," El":867," Ei":335," Et":365," Es":889," Er":614," Ep":314," En":763," Em":454," Ex":456," Eu":2122," Bà":785," Bá":846," Fe":894," Fa":1822," H ":178," Xu":1090," Tò":163," Tí":259," Tê":651," Xi":413," Tà":542," Xe":204," Tá":302," Tâ":9623," Sü":308," Xa":216," Só":240," Sô":411," Mư":225," Wo":905," Sé":358," Sè":329," Wi":1632," Wh":294," Sá":350," We":2007," Sâ":533," Sà":247," Wa":2017,"й ":151," Y ":281," Lư":805," Võ":190," Tĩ":188," Và":232," Ze":737," Vâ":484," Zh":184," Zi":450," Za":1260," Yv":187," Yu":290," Yp":238," Yo":868," Sĩ":506," Ya":432," Ye":349," Nư":140," Tô":962," Xy":153," Tú":282," Tù":160," Xí":179," Vĩ":455," Xô":549," Wü":541," Xã":2079," Vù":202," Vă":1148," Vũ":587," Tư":1025," a ":4962," Yê":501," Sư":226," Sơ":2029," R ":199," Kô":237," Ou":268," Ov":236," Os":655," Ot":299," Or":2938," Op":391," Po":3911," Lê":922," Lé":142," Pl":1371," Pi":2271," Ph":31029," Lã":279," Pf":1720," Lâ":831," Pe":2592," Là":800," Pa":7774," Dư":2196," Ny":327," Nu":290," 
No":7248," Ol":594," Ok":173," On":479," Om":200," Oh":202," Oi":748," Od":181," Oc":483," Oe":269," Ob":773," Oa":270," Ra":1894," Mü":182," T ":156," Mù":252," Mô":838," Qu":14182," Ro":3827," Re":2068," Ri":2185," Rh":4439," Py":3297," S ":230," Pr":3910," Ps":554," Pt":637," Pu":1471," Mé":244," Lý":663," Lü":151," Má":214," Lă":214," Mã":1026," Sy":607," Sw":578," Su":1915," St":3635," Ta":3810," V ":169," Th":42680," Ti":3850," Te":2931," Tr":21258," To":2625," Nó":11014," Ry":154," Nô":330," Ru":1162," Sa":11420," Hư":907," U ":154," Nă":720," Sh":1334," Si":2727," Sc":3070," Se":3434," So":3300," Sp":1984," Sr":367," Sk":306," Sl":517," Sm":206," Uy":393," Va":2209," X ":200," Ve":2261," Vi":16180," Ré":207," Vo":1912," Vu":317," Tu":2947," Tw":154," Ty":200," Ug":456," Uk":408," Ul":140," Um":165," Un":913," Ur":403," Ut":638," ja":224," l ":250," im":322," in":6639," is":5408," it":773," ka":329," m ":439," kh":38847," ki":18267," gâ":429," ke":373," gá":282," ju":151," cô":5020," có":47410," cú":382," cù":1871," ha":11491," dã":502," dâ":28872," he":621," dà":2524," gi":28288," bă":261," gh":729," gl":236," gr":899," go":250," gu":378," cò":3288," hy":276," dù":2107," că":333," hi":6669," dé":1144," ho":9851," ht":218," hu":16965," dò":646," nh":45517," ni":1927," ng":51853," ne":507," na":15500," cư":775," mu":1888," mo":1972," ké":437," on":1221," kê":1776," oc":375," of":6994," ob":204," dư":2565," nu":602," no":1173," hã":1017," hà":8670," há":923," le":1986," li":4644," la":5104," gó":504," kn":913," km":4476," cũ":2936," ko":246," me":1022," mi":9905," cơ":2426," hù":342,"я ":139," hú":203," ma":3748," lu":1700," hó":2026," hò":4796," ly":382," hô":283," bư":9431," hè":160," lo":35801," hì":2834," ag":160," ab":359," ac":560," ad":220,"Hểu":669," am":614,"Hểp":191," an":7056," ap":254," ai":278,"Hển":822," al":2427," au":545,"Hểi":3718," ar":2712,"Hểc":428," aq":275," at":395," as":1145," d ":2111," ba":20628," 가가":227," bi":39922," be":1810," bo":1086," bl":299," by":542," bu":1037," br":705," ca":18336," Xư":246,"Hể ":3004," Vư":1101," er":165," et":2265," es":168," en":1522," em":442," el":437," bà":2390," fe":289," bá":2320," fa":2087," eu":165," ex":489," fu":610," fr":2001," bí":964," bì":10908," fo":1760," bê":1224," bé":146," fl":771," fi":679," bã":190," ge":1821," cá":22553," câ":2231," cà":270," ga":2783," bú":206," bó":2103," cl":550," cm":154," co":7147," cr":532," ce":1064," ch":72993," ci":299," da":4945," cu":4863," cy":157," do":4301," dr":248," de":8416," di":19653," ec":238," ed":190," ea":304," du":2623," 三三":209," vù":22601," vă":2376," ví":173," vì":1170," võ":378," vô":1027," vò":489," và":46932," vâ":230," sĩ":1785,"ка":178," tù":416," tú":155," tô":909," nư":35851," tă":379," xo":207," tê":7377," nơ":1991," să":488," tò":331," xu":3665," tí":18540," tì":8960," mư":601," ww":167," só":461," sô":2005," tâ":11572," tà":3363," tá":3073," xe":994," xa":907," tư":4765," vũ":680," 三之":172," sư":822,"ов":213," yê":400," sơ":282," rư":159," vĩ":1312," xã":6617," xâ":941," xá":864," ru":574," nô":442," nó":3467," hư":1543," sa":4809," nú":1256," se":1352," sc":583," si":4586," sh":865," nă":29145," sn":2777," sm":592," sl":244," sp":4116," so":2264," qu":28552," mó":260,"ви":141," mô":1890," mù":492," ra":5117," re":1311," nà":32102," nâ":291," ri":1195," hơ":1747," né":300," ro":791," lĩ":607," nê":751," lò":161," pu":610," pr":1132," lí":1307," lú":446," s ":1052," lô":926," px":1128," py":148," mã":975," má":2233," mà":2646," lý":2654," mì":566," 
mé":12707," ot":153," kì":215," op":485," kí":1225," or":1862," lá":722," pe":891," là":155391," lã":1013," lâ":684," pa":1485," ký":784," lè":332," pl":606," po":832," lê":796," pi":462," ph":51377," wa":721," sâ":1031," sá":4080," we":797," rõ":266," y ":288," lư":3756," wo":408," wi":1339," wh":1197," sê":264," va":1100," ve":759," uy":141," vo":357," vu":12917," rã":285," vi":8649," ty":993," tu":4206," us":140," mũ":152," up":179," un":373," ta":1679," v ":243," sy":262," st":916," su":3809," lũ":299," tr":156052," to":6595," th":230858," ti":21975," te":899," đu":484," đa":1700," đe":474," đi":17419," đo":2466," Đá":164," Đà":1637," Đâ":3555," Đì":322," Đo":196," Đi":1416," Đa":443," đơ":1908," đư":29691," Đư":847," đí":392," đê":8032," đì":460," đá":3730," đâ":1604," đã":4347," Đă":188," đà":1035," Đô":12690," đă":220," đú":182," đô":27698," đó":3827," đĩ":596,"Lể ":569,"Lểi":178,"Lển":215,"Lểp":1280,"Lểc":1143," Bể":9869," Kể":5918," Hể":9143,"Kể ":5821," Gể":165," Cể":8663," Dể":757," Âm":150," Âu":2129," Îl":330," Ôn":1608,"Nể ":198,"Nển":165,"Nểi":1235," Úc":1243," Ý ":1274," áp":355," án":814," âm":3643," ít":431," ô ":353," ôn":1711,"Mể ":2527," ý ":503,"Mểc":807,"Mểt":1153,"Mểu":174,"Mển":285,"Mểi":146," ăn":626," 가":331," ươ":433," ưu":158,"가":830," hể":63957," gể":12621," lể":23214," kể":7585," cể":67038," dể":12218," bể":45724," Vể":1292," Tể":7366,"Rểp":306," Xể":142," Nể":1895," Lể":3615," Mể":5122," Rể":522," Sể":1093," vể":53702," xể":2191," yể":935," mể":131122," nể":21375," rể":4289," sể":54954," tể":83225,"Tểt":172,"Tểp":225,"Tểc":152,"Tểa":1075,"Tển":2741,"Tểi":245,"Tể ":2617,"Sể ":900,"Vểt":162,"Vển":442,"Vểi":163,"Vể ":500,"ưểt":382,"ưểu":183,"ưển":18802,"ưểm":9501,"ưểi":24657,"ưểc":67509," Để":19647," để":91683," ưể":501,"Ávi":245,"ال":152,"Áo ":1417,"三三 ":318,"三万 ":202,"Âu ":2118,"三专 ":318,"Âm ":149,"ươn":12134,"ươi":146,"가가":499,"Bể ":2169,"AO ":231,"AN ":149,"Bểt":140,"Bểc":4036,"Bển":2788,"Bểo":450,"Cể ":946,"Cểu":671,"Cểp":339,"Cểm":232,"Cển":5510,"Cểc":787,"Dể ":256,"Dểc":190,"Bà ":315,"Bá ":150,"Bài":310,"Bàn":152,"Bác":305,"Fel":162,"Fen":155,"Fer":312,"Fis":282,"Ext":290,"Fas":526,"Fal":176,"Far":151,"Fab":275,"Eri":170,"Ess":158,"Est":392,"Eth":271,"Eup":437,"Eur":1040,"El ":188,"Ele":184,"Eng":198,"Epi":248,"Ent":180,"Các":1801,"Ger":929,"Cát":277,"Geo":862,"Gen":359,"Gla":140,"Gha":245,"Gia":2324,"Gil":159,"Gir":424,"oể ":155,"oểt":2492,"Cá ":259,"oểc":3719,"Gan":147,"Gal":295,"Gam":240,"Gau":147,"Gar":1513,"oển":3941,"oểi":4411,"Gab":195,"Fus":252,"Fro":378,"Flo":588,"Fla":166,"Fle":174,"ũi ":200,"Fra":932,"Fri":349,"ũng":2953,"Fre":553,"Bín":197,"Bìn":1748,"Fon":217,"For":497,"Fou":186,"Dân":1220,"nểu":216,"II ":1176,"Hil":320,"Him":177,"Hin":145,"Hip":145,"nể ":1462,"Hel":390,"nểi":2679,"Hei":256,"nểm":15373,"nển":1144,"Hem":163,"Hen":196,"Hes":405,"nểa":304,"Her":528,"Cúp":236,"Hal":332,"Hai":401,"Han":467,"Ham":344,"Has":179,"Har":748,"Haw":533,"Hau":3745,"Côt":1585,"Guy":153,"Cô ":190,"Gua":732,"Có ":275,"Gui":1036,"Côn":1681,"Gre":1012,"Gri":151,"Gra":2331,"Gro":483,"Glo":169,"Giá":926,"Gon":156,"Goo":205,"Gol":188,"Gom":158,"Úc ":1233,"Inn":161,"Int":561,"Ins":197,"Ill":204,"Ind":2106,"mểc":13680,"mểm":11638,"mểi":2779,"mểu":420,"mểt":99606,"mển":1814,"mể ":1110,"Ibe":205,"亞 ":440,"Hyp":280,"Hyd":396,"Dôm":338,"Hy ":979,"Hun":464,"Huy":2731,"Hue":217,"IV 
":160,"Hoà":1902,"Hor":294,"Hou":153,"Hom":153,"Hon":555,"Hok":148,"Hol":1270,"Hoa":8475,"Arg":562,"Are":201,"Arc":363,"Ard":1593,"Ara":960,"Arm":257,"Ari":480,"Aqu":1651,"Apo":203,"Ath":149,"Atl":649,"Ast":281,"Ass":392,"Asi":256,"Arr":175,"Art":235,"Ave":196,"Auv":549,"Aus":949,"Aur":157,"Aud":253,"Aug":204,"Aub":473,"lể ":2800,"Azu":1003,"lểp":5146,"lểt":155,"lểi":2389,"lển":5223,"lểa":430,"lểc":6489,"Ba ":889,"lểy":462,"Bai":215,"Bal":574,"Ban":6419,"Bac":146,"Bad":1148,"Bay":966,"Bar":929,"Bat":270,"Bas":1136,"Bau":155,"Abr":165,"Aca":233,"Acr":321,"Ach":200,"Ade":163,"Ai ":341,"Aga":200,"Afr":687,"Afg":186,"Air":488,"Ala":309,"Alb":681,"An ":1388,"Alg":185,"Ali":170,"Ale":299,"Alv":153,"Als":515,"Alt":493,"Alm":187,"All":366,"Alp":2942,"Ame":689,"Amb":210,"Ama":449,"Amp":158,"Anh":5927,"Ang":642,"Ana":331,"And":666,"Ant":595,"Ano":159,"Ann":219,"Bus":168,"Bul":625,"Bur":1284,"Buc":492,"Bru":412,"kể ":5372,"Cab":144,"kểt":1768,"Cae":179,"Cal":3283,"Cam":1757,"Cai":141,"Cas":3059,"Car":1359,"Cau":236,"Cat":533,"Cao":644,"Can":1671,"kểc":374,"Cap":594,"Bea":394,"Ber":1074,"Ben":559,"Bel":748,"Bin":143,"Bil":278,"Bis":150,"Bit":253,"Bir":234,"Blu":260,"CP ":196,"Biê":192,"CN ":900,"Bla":380,"Bre":721,"Bra":1413,"Bro":439,"Bri":863,"Bol":471,"Boi":232,"Bon":251,"Bor":635,"Bos":277,"Bot":225,"Bou":1517,"Cyp":409,"Cur":167,"Cub":268,"Cun":184,"Cup":164,"EE ":154,"Des":235,"Deu":424,"Del":231,"Dem":185,"Den":364,"Dam":166,"Dan":842,"Dar":270,"Dav":172,"Dal":154,"Cho":474,"Chr":351,"Che":539,"Chi":4142,"Cic":417,"Chu":975,"Cit":157,"Châ":2351,"Cle":204,"Cla":633,"iểt":19441,"iểu":22084,"iểy":197,"Cel":188,"iểm":11357,"iểp":3887,"iển":94986,"Cen":1295,"Cer":828,"iểc":3158,"iểi":7620,"iểa":1763,"Cha":5749,"Cri":167,"Cra":378,"Cre":395,"Chư":286,"Cro":546,"Chù":202,"Chú":672,"Chí":1230,"Coc":220,"Coe":184,"Cop":262,"Cos":1089,"Cor":1423,"Com":752,"Col":1893,"Con":3653,"Cou":867,"FA ":242,"Drô":335,"iể ":1731,"Edw":139,"Ông":1552,"Ect":200,"Ecu":469,"Eas":364,"Do ":273,"Diê":175,"Dic":144,"之三三":139,"Dit":152,"Dis":351,"hểo":739,"hểp":5189,"hểm":3873,"Dip":407,"hển":23819,"Dio":154,"hểy":11164,"hểu":6003,"Die":258,"hểt":14285,"hểa":1704,"hểi":18265,"hểc":23691,"Di ":179,"hể ":155404,"Dun":239,"Duy":294,"Du ":216,"Dri":386,"Dre":146,"Dra":192,"Dou":158,"Don":392,"Dom":257,"Dor":723,"Cươ":311,"Nev":160,"Neu":541,"Net":191,"Nep":895,"Neo":280,"Nas":316,"Nat":595,"Nav":156,"Nig":396,"Nie":1241,"Nic":337,"Nin":1068,"Nhi":250,"Nha":5926,"Nga":1678,"Ngh":1164,"Ngu":2389,"Ngo":273,"New":2499,"Myr":349,"xể ":903,"Mya":329,"Nak":165,"Nam":11941,"Nan":196,"Nag":208,"Na ":357,"xểp":906,"xểy":267,"Như":191,"Nym":236,"Nhó":149,"Nhà":676,"Nhâ":3285,"Ngô":642,"Ngà":567,"Ngâ":218,"Ngư":724,"Nhĩ":654,"Nov":205,"Ngũ":256,"Nor":4126,"Not":381,"Nob":171,"Noc":1779,"Oec":143,"Dươ":1846,"PG ":211,"Ois":715,"Ohi":151,"Oah":152,"Occ":238,"Obe":662,"Île":330,"Ott":171,"Ovu":166,"Kôn":235,"Oly":147,"Oli":201,"Giể":2035,"Ont":199,"Or ":550,"Opo":183,"Ora":158,"Ore":369,"Orc":304,"Ori":453,"Orn":566,"Ost":243,"Phú":926,"Phù":169,"Phò":192,"Phó":180,"Phá":21296,"Ple":538,"Phâ":188,"Pla":641,"Hiể":1343,"Lê ":891,"Pin":370,"Pit":142,"Phy":669,"Pie":506,"Pic":810,"Pho":672,"Phi":3036,"Pha":566,"Lãn":143,"vểy":569,"Lâm":707,"vển":2537,"vểt":17681,"Pfa":1704,"vểa":164,"vểc":13644,"vểi":7805,"Per":956,"Pet":409,"Pen":544,"Pel":140,"Lào":383,"vể ":11302,"Pay":1416,"Là 
":298,"Pat":164,"Pas":888,"Par":1717,"Pau":274,"Pac":219,"Pan":548,"Pap":721,"Pal":1091,"Pak":192,"Pyr":3193,"Huể":468,"Pte":513,"Pun":221,"Pup":166,"Pue":171,"Puy":394,"Pro":2087,"Pri":425,"Pre":247,"Phư":678,"Pse":388,"Hoể":173,"Pra":808,"Pol":756,"Pom":148,"Pon":298,"Poi":1472,"Pot":227,"Por":309,"uểc":63038,"uểi":2825,"Lăn":196,"uểt":6647,"uển":17332,"Mã ":868,"uể ":2223,"Lý ":659,"Mùa":206,"SA ":174,"Ram":194,"Ran":617,"Quá":225,"Quâ":725,"Môn":650,"Quý":277,"Qua":706,"Qui":159,"Que":829,"Quy":457,"Ita":500,"Isl":594,"Isr":263,"It ":1456,"Ira":481,"Ire":363,"Isè":447,"tểm":507,"tển":38505,"tểi":9573,"tểc":3283,"tểa":923,"tể ":24294,"Jac":218,"Jav":319,"Jan":148,"Jam":357,"tểo":2243,"tểp":1747,"tểt":2043,"Jer":195,"Jea":217,"Biể":504,"Jos":191,"Jor":166,"Joh":334,"Jul":480,"sể ":37621,"sển":14954,"sểm":218,"sểc":1290,"sểa":158,"Kai":155,"Kam":181,"Kal":268,"Kan":358,"Kau":180,"Kat":153,"Kas":184,"Kar":496,"Kaz":203,"sểt":255,"sểp":188,"Ken":722,"Kir":203,"Kit":143,"Kin":907,"Kim":609,"Kho":491,"Khu":1438,"Kha":464,"Khi":230,"Chể":1285,"Khê":158,"Khá":594,"Khô":380,"Kon":253,"Kor":151,"Kre":398,"Gòn":201,"Cuể":281,"Kus":145,"Kur":182,"Hàn":1241,"Leu":183,"Les":367,"Lep":631,"Leo":339,"Len":152,"Hán":6856,"Lei":182,"rểc":1530,"Lea":285,"rển":39207,"rểm":291,"rểi":1137,"Hà ":2463,"Lau":510,"rể ":6793,"Le ":619,"Lak":163,"Lai":390,"Las":248,"Lat":742,"Lar":215,"Lao":140,"Lam":368,"Lan":4696,"Lac":165,"Lab":180,"倉 ":180,"La ":2717,"Liê":1950,"Hér":279,"Diể":335,"Lib":574,"Lie":155,"Lig":183,"Lim":511,"Lin":692,"Lio":171,"Lis":148,"Lit":507,"Liv":154,"Leó":2050,"Hãn":333,"rểt":1287,"Hòa":572,"Lud":184,"Luc":212,"Loà":1669,"Hìn":160,"Loz":163,"Lou":367,"Los":193,"Lot":580,"MS ":483,"Loi":3201,"Lor":2131,"Lon":1378,"Lom":271,"Loa":517,"Ma ":267,"Hóa":437,"Luâ":253,"Lyc":258,"Mei":187,"Men":179,"Mel":544,"Mes":227,"Mer":503,"Meu":1122,"Met":367,"Mec":579,"Meg":242,"Med":321,"Mex":918,"Man":2248,"Mal":1668,"Mar":4888,"Mas":469,"Mag":297,"Mad":710,"Mah":444,"Mai":1077,"Mac":532,"NE ":2847,"May":519,"Mau":531,"Mat":259,"Miê":150,"Mol":400,"Mon":1606,"Mos":1419,"Mor":654,"Mou":470,"Mot":424,"Moz":414,"Mid":2002,"Mic":1239,"Cơ ":351,"Mit":747,"Mir":161,"Mis":522,"Mil":319,"Min":1719,"Mun":171,"Mur":1848,"Mus":259,"Xià":174,"Phể":1906,"Tây":8477,"Tân":1027,"Tào":168,"Tàu":164,"Süd":288,"Sôn":404,"Sóc":214,"Wor":462,"Wol":216,"Séc":256,"Sèv":318,"Whi":209,"èvr":358,"Wik":174,"Wil":393,"Win":429,"Wie":149,"Wit":233,"ère":1422,"Sài":216,"Web":172,"Wei":433,"Lươ":358,"Wes":961,"Sân":446,"Was":264,"War":327,"Wal":818,"ès ":590,"Lưu":290,"Dưể":277,"èn ":152,"èo ":278,"ém ":263,"之丁":166,"QĐ ":168,"之三":478,"之万":178,"ée ":472,"之专":315,"之之":217,"ées":2904,"Vos":497,"Vor":516,"Vol":589,"éc ":283,"Nhể":3766,"Ngể":811,"Viê":165,"évi":139,"Tĩn":188,"ép ":1113,"Zea":402,"Zar":282,"Zam":723,"én ":239,"éo ":609,"éra":396,"ét ":23407,"éri":203,"éné":2942,"Zim":303,"Zel":153,"épa":1146,"Vân":484,"ên ":47944,"êm ":9120,"Yps":167,"Quể":10633,"Yve":176,"並三":153,"Sĩ ":505,"Yor":442,"You":143,"Yon":155,"ênh":268,"êng":589,"专专":219,"专三":225,"êu ":3625,"Yel":146,"三万":220,"三丁":213,"三三":777,"三专":394,"Tô ":277,"Xuâ":498,"三之":347,"Tôn":652,"Túc":179,"丁专":144,"丁丁":142,"Tên":660,"丁三":258,"Tín":160,"丁之":215,"Xuy":470,"ãi ":669,"Syn":143,"Syr":151,"Swi":186,"Swa":238,"Sur":243,"Sum":294,"Sul":327,"Sun":218,"Sud":207,"Str":573,"Stu":169,"Sti":380,"Sto":266,"Sta":800,"Ste":1346,"Ten":237,"Tel":150,"ãnh":975,"ãng":1319,"ão ":484,"Tam":802,"Tan":929,"Tas":235,"Tar":601,"ãn ":565,"Tai":151,"Tak":148,"Ski":144,"ãy 
":559,"Khể":614,"Shi":359,"She":144,"Năm":661,"Sho":220,"Sha":353,"Sim":166,"Sil":271,"Sin":736,"Sie":417,"Sib":258,"Sic":157,"Ses":155,"Ser":590,"Sen":317,"Sel":193,"Hươ":208,"Sei":684,"Seg":435,"Sri":344,"TV ":172,"Spa":202,"Spi":140,"Sph":1036,"Spe":218,"Spr":184,"Sou":1403,"Sol":431,"Som":283,"Son":366,"Sor":294,"Kiể":703,"Slo":396,"Nôn":230,"Rus":514,"Nó ":10964,"Sai":3614,"Sam":307,"Sal":1166,"Saa":431,"Sac":786,"Sab":139,"Sco":422,"Sci":154,"Sch":2083,"Sca":289,"Sax":162,"Sav":348,"Sat":196,"Sau":596,"Sar":1013,"Sap":212,"San":1415,"Sao":173,"Hưn":444,"Sa ":225,"TA ":219,"Res":158,"Rhi":678,"Rhe":1850,"Riv":248,"Ris":546,"Rie":160,"Ric":612,"Red":234,"Rei":173,"Reg":216,"Ren":178,"Rep":253,"Rob":306,"Roc":356,"Rou":1296,"Rot":151,"Ros":295,"Rom":626,"SS ":368,"Rhô":1640,"Ven":831,"Vau":301,"Van":289,"Val":1014,"Var":300,"Vic":424,"Vie":766,"Vir":376,"Vil":850,"Vin":383,"Ver":694,"Vex":266,"Ukr":405,"Uni":646,"Miể":206,"Uy ":300,"Utt":447,"Tră":223,"Luể":194,"Trá":373,"Trà":189,"Trâ":155,"Trì":245,"Bưể":300,"Trư":1573,"Uga":441,"Tex":717,"Ter":1194,"Tha":1705,"The":9028,"Thi":2418,"Tho":351,"Thu":1030,"Til":178,"Tim":168,"Tin":432,"Thà":2636,"Thá":3215,"Liể":150,"Thü":723,"Thô":412,"Tiê":963,"Tor":743,"Tok":152,"Tol":222,"Tom":166,"Tou":295,"Thư":1102,"Tru":10246,"Tro":2116,"Tri":2061,"Tre":371,"Tra":816,"Toà":178,"Tuy":629,"Tur":1442,"Tun":141,"Mưể":195,"ày ":41264,"Tươ":169,"àu ":3188,"gểa":154,"gểc":1587,"gểi":5837,"gểp":336,"gểm":3968,"gển":2170,"gểy":212,"gểt":591,"ành":21823,"àng":7133,"gể ":2467,"ào ":9552,"àn ":5519,"àm ":2556,"ài ":40596,"dể ":1928,"bis":208,"bit":429,"biu":210,"bio":195,"biq":379,"bir":196,"bil":353,"bin":1156,"bii":441,"dểa":1072,"dểc":2450,"bo ":249,"dểi":171,"dểm":153,"dển":5451,"dểu":482,"dểy":237,"áy ":2251,"blo":143,"ble":788,"bli":464,"bla":343,"boa":544,"bol":192,"biê":756,"bon":625,"bom":252,"bop":200,"bor":571,"bot":181,"bos":169,"bou":707,"bbe":548,"be ":978,"áo ":4269,"ban":14532,"bal":489,"bai":195,"bac":1588,"bad":244,"bab":362,"án ":11049,"bay":2258,"bat":528,"bas":507,"bar":886,"bao":2943,"bea":250,"áp ":27496,"ánh":4002,"áng":13329,"bi ":208,"bei":269,"bee":755,"bed":546,"bec":391,"ber":5184,"ben":538,"bel":1263,"bek":171,"bes":289,"bet":259,"bfa":299,"áu ":550,"bia":2262,"bic":185,"bid":364,"át ":6081,"ách":6363,"áce":165,"ái ":6094,"ca ":5266,"car":2682,"cas":523,"cat":809,"cau":258,"can":2819,"cao":13635,"cap":271,"cac":552,"cae":343,"cad":152,"cam":352,"cal":1785,"cai":449,"ce ":3602,"ám ":1096,"bri":1032,"bro":469,"bra":914,"bre":597,"bru":440,"bsi":207,"bur":2940,"bul":333,"bun":198,"bum":722,"but":706,"bus":353,"by ":708,"bwe":288,"ác ":24475,"aka":675,"am ":27480,"ake":553,"aki":591,"akh":416,"aji":229,"ajo":231,"al ":6772,"aja":635,"aii":453,"ail":3696,"ain":10996,"air":827,"ais":1435,"ait":246,"ak ":354,"aig":201,"aid":285,"aic":180,"aia":148,"ây ":27201,"ahi":144,"ahu":249,"ahr":141,"aho":292,"aha":852,"agi":293,"agr":420,"agu":637,"agn":2270,"ago":1655,"aq ":141,"anu":911,"anz":946,"any":498,"ano":1266,"ann":1274,"anm":363,"ant":5267,"ans":1570,"ane":1647,"ang":23334,"anh":11504,"ani":4224,"anj":313,"ank":983,"ap ":200,"ana":4781,"anc":3852,"and":17225,"amu":239,"amm":581,"amo":823,"amp":3495,"ams":266,"ami":3273,"ame":2454,"amb":1921,"ama":1996,"ao ":21236,"alz":1868,"aly":679,"alv":524,"alu":519,"alt":1192,"als":583,"alp":280,"alo":1138,"alm":374,"all":4541,"alk":430,"alg":162,"ali":4610,"alc":275,"ald":946,"ale":3407,"alf":143,"ala":5532,"alb":1038,"an 
":27435,"aku":180,"ako":211,"aba":1197,"abe":439,"abi":699,"abl":307,"abo":447,"abr":553,"abu":216,"abw":286,"ae ":31054,"aca":592,"aal":305,"aar":275,"ad ":1523,"ac ":1152,"âng":210,"ab ":257,"aft":152,"aff":222,"ai ":10857,"aga":1564,"age":954,"aeo":180,"aen":693,"ael":524,"aes":144,"aer":162,"Vươ":859,"aei":212,"ah ":482,"âte":406,"ado":1640,"adr":334,"adi":656,"âu ":6939,"ade":2368,"aea":378,"aec":197,"ag ":159,"ady":140,"adt":318,"adu":484,"aco":486,"ack":866,"aci":2053,"ach":4974,"ace":5973,"acc":246,"ada":2077,"af ":167,"act":895,"acu":424,"acr":438,"azo":203,"azi":604,"aze":208,"aza":466,"axi":240,"axo":326,"az ":215,"ayo":158,"ays":2306,"aya":1069,"aye":1477,"ân ":71107,"ba ":1967,"âm ":6828,"aqu":368,"at ":2191,"arh":179,"arg":1494,"are":4513,"ard":3817,"arc":1390,"arb":768,"ara":4118,"arp":777,"aro":1967,"arn":2152,"arm":452,"arl":768,"ark":1469,"ari":8943,"aru":469,"arv":290,"arr":2293,"ars":863,"art":4067,"au ":6244,"asa":466,"ary":947,"arz":380,"asi":1358,"ash":1142,"asc":1250,"ase":497,"aso":213,"asp":226,"ask":259,"asm":354,"aon":161,"ar ":3587,"apa":481,"ape":779,"api":791,"aph":710,"apl":205,"apo":627,"app":349,"apt":205,"apu":632,"as ":6065,"ava":1308,"ax ":315,"aux":509,"auv":229,"aut":4199,"avo":316,"avi":854,"ave":987,"awe":151,"ay ":14622,"awa":1004,"awi":403,"ata":3801,"asu":225,"ast":6128,"ass":2666,"atr":716,"ato":1406,"ate":3193,"atc":165,"ati":4840,"ath":1382,"aua":167,"auc":575,"aub":164,"att":609,"ats":390,"atu":1696,"aty":238,"aul":975,"aum":254,"aun":342,"aur":1043,"aus":1014,"aud":522,"aue":262,"aug":153,"aui":145,"bể ":9115,"Wür":527,"bểy":259,"bểt":2371,"bểu":602,"bểi":3447,"bểo":909,"Xã ":2077,"bển":18106,"bểc":10801,"Thể":19271,"Tiể":1514,"Trể":2534,"Vĩn":400,"Xíc":159,"Võ ":184,"cểu":3163,"cểt":394,"cểp":4267,"cểc":593,"cểa":46999,"cển":3734,"cểm":741,"cểi":1406,"Hưể":237,"Vùn":202,"cể ":5697,"Văn":1172,"Sư ":192,"Viể":12501,"Vũ ":518,"Tư ":617,"Tuể":240,"Xô ":542,"Yên":481,"Sơn":1964,"ji ":324,"jar":408,"jan":213,"biể":38109,"jo ":152,"itr":907,"ito":2297,"itu":456,"itt":1509,"its":449,"itz":828,"ity":598,"ism":206,"isl":191,"iso":476,"isp":337,"iss":3448,"ist":3624,"ita":3373,"itc":146,"ite":2019,"ith":2136,"iti":2423,"ivo":162,"ius":1319,"ium":1206,"iva":362,"ix ":837,"ivi":819,"ive":1549,"ipo":143,"ipp":1792,"ipu":171,"ipt":450,"ipi":254,"iph":349,"ipl":287,"ilô":10662,"is ":16454,"ion":4473,"iop":856,"ior":202,"ios":885,"iot":545,"iou":177,"ioi":173,"iol":823,"ipa":1057,"ipe":657,"ir ":1809,"iru":298,"irs":241,"ück":162,"iro":773,"irk":203,"irl":235,"iri":455,"isi":750,"ish":1324,"ise":1480,"isc":1025,"isa":361,"iqu":1196,"ire":3687,"irg":448,"ira":1107,"irc":652,"it ":1527,"ja ":441,"iya":187,"iz ":244,"eón":2056,"가가가":272,"izo":396,"ize":388,"iza":272,"kim":1056,"kil":10715,"kia":494,"kin":3453,"kip":325,"kir":466,"kis":481,"km ":3828,"chể":21772,"ki ":1183,"khi":3505,"út ":627,"khe":165,"kha":834,"khu":15081,"kho":6571,"gày":9377,"gái":216,"kel":215,"ken":731,"kes":174,"ker":740,"ket":226,"key":304,"gân":416,"gây":413,"ke ":634,"úp ":776,"gàn":962,"úng":1614,"kra":492,"kre":255,"kt ":229,"cũn":2434,"ểa":61690,"ku ":366,"km²":641,"kot":155,"kor":174,"kom":159,"kok":195,"ks ":691,"ể ":376292,"úy ":216,"cũ ":497,"kno":896,"kka":183,"khô":4285,"khó":360,"khí":730,"khú":467,"ko ":151,"khá":4090,"kle":735,"kla":228,"buể":254,"ểo":9548,"ểp":25031,"ểk":175,"ểm":59218,"ển":369560,"ểi":112828,"ểc":255331,"ểy":15079,"ểt":179295,"ểu":43805,"kaz":157,"gà ":270,"kat":204,"kar":238,"kas":219,"kan":739,"kal":182,"kam":185,"kai":366,"ka 
":1184,"cùn":1840,"cúp":216,"ha ":8300,"ùng":27727,"ham":3795,"han":4769,"hao":550,"hap":378,"hai":4690,"hal":2163,"hau":2057,"hav":347,"har":4673,"has":721,"hat":786,"hae":501,"hag":376,"hab":337,"had":177,"hac":331,"hay":7652,"he ":18327,"dàn":837,"dài":1594,"hel":1664,"hei":3775,"hec":208,"hed":210,"hea":373,"hey":228,"hev":147,"het":420,"hes":679,"her":2719,"heo":10006,"hen":1600,"hem":538,"hi ":13142,"dây":222,"dân":28557,"dãy":398,"căn":327,"hig":332,"hie":278,"hid":1412,"hic":705,"hia":1555,"hip":346,"hio":999,"hin":3454,"him":2986,"ùy ":240,"hil":2244,"hik":232,"hii":300,"hiu":173,"his":1013,"hit":572,"hir":444,"hn ":286,"hla":242,"hle":1267,"hli":520,"hlo":240,"ho ":7256,"hma":350,"gma":174,"go ":1433,"giá":5603,"gme":307,"già":408,"glo":256,"gle":536,"gli":276,"gla":529,"gko":162,"gog":725,"gny":420,"ghĩ":2919,"gno":258,"gni":256,"gne":3863,"giú":291,"gna":708,"úa ":604,"giô":172,"gs ":297,"goz":262,"úc ":3114,"gom":139,"gol":384,"gon":1688,"gos":596,"gor":601,"gov":334,"gu ":148,"goà":839,"gro":663,"gra":1031,"gri":527,"gre":280,"gto":303,"gui":371,"gum":182,"gul":499,"có ":47334,"gua":512,"gue":1969,"gy ":222,"cô ":368,"guy":4577,"gur":185,"gus":453,"gun":159,"còn":3275,"úi ":1356,"côn":4644,"gyr":164,"iai":626,"iam":328,"ial":1014,"iao":1227,"ian":5609,"ias":397,"iar":265,"iat":696,"ic ":3037,"iac":1407,"iae":280,"ibl":173,"ibi":314,"ibo":164,"ibu":345,"id ":1469,"iba":306,"ibb":546,"ibe":1206,"ia ":29704,"iet":540,"ieu":473,"iel":1226,"ien":2075,"ier":2747,"ies":4767,"ied":1831,"ieg":187,"ig ":1226,"ifo":1413,"iff":148,"ife":614,"ifl":166,"ifi":693,"icr":1054,"ics":295,"ict":897,"icu":1741,"ico":2316,"ick":622,"ici":3591,"ich":3932,"ice":765,"ie ":3741,"ica":6048,"idu":239,"ids":175,"ido":567,"idi":2598,"ide":2455,"ida":26239,"iid":4750,"il ":3121,"ija":174,"iji":157,"im ":5476,"ika":371,"ige":770,"iga":742,"ii ":1370,"igm":416,"igh":923,"igi":710,"igu":318,"igs":187,"igr":220,"igo":300,"ign":1169,"iha":223,"ik ":169,"imo":742,"imm":158,"imp":567,"ime":1906,"imi":643,"ip ":810,"inc":1322,"ind":1704,"ina":4704,"inb":264,"imu":305,"inn":513,"ino":1469,"int":4526,"ins":1803,"inf":237,"ine":10837,"inh":13778,"ing":8141,"ini":3074,"inl":2046,"ink":351,"ioc":292,"iod":167,"inu":932,"inv":156,"inx":181,"iny":208,"iko":368,"iki":420,"ike":257,"ila":1297,"ilb":181,"in ":13262,"ilo":550,"ill":9682,"ilh":181,"ili":3385,"ild":355,"ile":3779,"ima":1310,"imb":551,"io ":1175,"ily":2239,"ils":2324,"ilu":323,"how":204,"hol":800,"hom":456,"hon":2401,"hoi":164,"hos":411,"hot":229,"hou":479,"hoo":159,"hop":241,"hor":2606,"hoa":5214,"hof":203,"hoe":206,"hod":441,"hoc":207,"hni":161,"hne":266,"dée":277,"hme":233,"hmi":203,"hiê":5040,"dép":1137,"hua":252,"htt":326,"htr":324,"hth":197,"hte":186,"hst":227,"hse":1600,"hoá":860,"hoà":1969,"hu ":18061,"hry":423,"hro":1154,"hre":321,"hri":577,"ùa ":1067,"ht ":687,"hra":468,"hya":190,"huê":158,"hyl":1269,"dòn":604,"hy ":171,"hwa":665,"hwe":313,"hum":443,"hun":1586,"hus":902,"hut":182,"hur":289,"huy":20578,"Vưể":237,"dùn":1738,"dù ":368,"hyt":176,"hys":223,"hyr":202,"huô":174,"ùi ":234,"ffe":313,"ffi":237,"fer":812,"báo":721,"bác":152,"fen":465,"bán":1218,"fel":1035,"fgh":188,"bà ":200,"fas":191,"far":156,"fam":2105,"fal":2041,"bày":158,"bàn":328,"bào":422,"bài":1285,"ff ":218,"eya":142,"ext":206,"eyr":325,"eyh":211,"eye":179,"exa":986,"ez ":316,"exi":1325,"exc":181,"ezu":350,"ezi":175,"eta":842,"ete":743,"eti":1014,"eth":666,"etn":165,"etl":616,"esp":684,"esn":283,"eso":340,"est":3808,"ess":1434,"esw":957,"ev 
":202,"euc":337,"eud":540,"eui":263,"eum":201,"eto":465,"etr":999,"ets":459,"ett":1071,"etu":203,"etw":156,"etz":312,"ew ":2560,"eve":622,"eva":566,"evi":1144,"euv":222,"eut":564,"eur":1318,"eus":1354,"ex ":606,"euz":190,"eux":671,"ey ":1664,"ewa":232,"erö":586,"epe":234,"epi":495,"eph":665,"er ":9592,"epa":418,"eot":213,"eor":527,"eom":471,"eol":262,"eop":530,"eon":709,"es ":26910,"ept":1404,"epu":235,"epo":154,"erk":202,"erl":752,"eri":6361,"erg":3169,"erh":199,"ere":2445,"erf":557,"erc":1171,"erd":573,"era":3401,"erb":858,"et ":4591,"equ":162,"esl":202,"esh":1269,"esi":2021,"esc":975,"ese":753,"eu ":303,"esa":259,"erz":260,"ery":556,"erv":528,"eru":1012,"erw":278,"err":2985,"ert":1566,"ers":4623,"ern":4630,"erm":2107,"erp":282,"ero":2376,"eki":196,"en ":12018,"elb":206,"ela":2654,"eld":1258,"elf":146,"ele":1267,"eli":1798,"elg":160,"elm":262,"elk":404,"ell":9746,"elo":933,"elu":208,"els":642,"elt":280,"ely":341,"eo ":10593,"eiß":164,"emb":1047,"ema":1158,"eme":2947,"emm":256,"emo":539,"emi":1585,"emp":238,"ems":155,"emy":139,"enf":291,"ene":1945,"enh":484,"eng":730,"enb":2082,"ena":1558,"end":2473,"enc":2127,"eno":956,"enn":3389,"enk":343,"enl":192,"eni":2073,"enu":1664,"ens":4503,"ent":10489,"enr":274,"enz":420,"eny":515,"eoc":157,"ego":1228,"ege":478,"egg":139,"egi":670,"egr":229,"egu":327,"ek ":217,"eic":752,"eis":833,"eir":787,"eim":1035,"eil":952,"ein":4903,"eie":545,"eid":730,"eig":203,"eif":191,"el ":3452,"eiz":186,"eit":258,"em ":1658,"öst":587,"giu":194,"gis":185,"gil":310,"gin":1363,"gio":553,"gid":1547,"gic":199,"gia":8820,"ght":794,"băn":187,"gho":161,"ghi":3850,"ghe":189,"gha":488,"ggi":146,"gge":165,"câu":470,"cây":1629,"gi ":387,"gen":5062,"cán":798,"cáo":198,"ger":1519,"ges":1085,"gh ":607,"các":18946,"geb":333,"cái":701,"gem":243,"gel":544,"cá ":1776,"ge ":1808,"gae":147,"gai":1612,"gas":1474,"gar":1812,"gau":202,"gat":452,"gay":317,"gam":376,"gal":707,"gan":1692,"gap":295,"ga ":2819,"bút":167,"Tưể":239,"fur":264,"fus":239,"bón":2056,"ful":164,"fun":375,"ft ":439,"fra":309,"fre":507,"fri":815,"bín":794,"fro":1604,"fou":879,"for":2385,"fon":152,"fol":504,"bìn":10852,"bên":1143,"bí ":160,"fle":146,"fla":250,"fli":289,"flo":441,"fly":418,"fic":547,"fie":487,"fil":195,"fin":314,"fis":489,"da ":3697,"de ":8611,"dac":762,"dad":140,"dal":972,"dai":189,"dag":446,"dae":24708,"dat":480,"dar":398,"dap":183,"dan":4922,"dam":377,"cun":718,"cul":2298,"cum":328,"cua":546,"cty":284,"ctu":2164,"ctr":238,"cto":873,"cti":1458,"cte":647,"cta":670,"cy ":251,"cus":673,"cur":399,"cut":255,"cyc":140,"cks":263,"cki":166,"ckl":786,"cla":486,"chá":201,"cle":346,"châ":14860,"cky":262,"chí":8076,"chò":225,"clu":420,"chó":180,"cli":182,"ché":314,"chì":193,"clo":464,"chù":312,"co ":1548,"chú":1406,"coi":717,"cod":256,"coa":287,"cob":146,"coc":214,"con":3655,"col":1035,"com":2135,"cor":1035,"cos":565,"cop":801,"cot":443,"cou":1134,"coz":183,"cs ":366,"ct ":454,"cre":241,"cra":438,"chơ":592,"cri":479,"cro":1725,"chư":1623,"ccu":244,"cci":838,"cco":265,"cca":156,"cea":4824,"ch ":37737,"cer":1071,"ces":531,"cet":142,"cen":1512,"cep":386,"Xươ":220,"cel":1002,"ced":395,"cha":3033,"chw":543,"chu":6009,"chy":360,"cia":1452,"ck ":1564,"cie":3416,"cid":2524,"che":4294,"chl":1776,"chi":13490,"cho":7644,"chm":217,"chn":259,"chs":1959,"cht":448,"chr":711,"cil":2248,"cif":277,"cis":311,"cit":448,"cin":1085,"cio":750,"cip":1009,"cm ":153,"cke":753,"ed ":4935,"eba":285,"ebe":884,"ôn ":3722,"ebi":183,"ebo":301,"ebr":934,"ebs":216,"ebu":294,"ec ":362,"eac":334,"ôm 
":271,"eag":170,"eaf":201,"eae":4646,"ead":460,"ean":1098,"eal":621,"ear":1037,"eas":698,"eat":882,"eau":1120,"eb ":303,"ea ":3947,"efo":223,"efe":249,"ei ":970,"ega":773,"een":1179,"eel":145,"eed":257,"ees":218,"eer":175,"eep":145,"eet":728,"edi":1118,"ede":1955,"ône":1757,"ông":42975,"eda":404,"eg ":341,"edt":284,"edo":1988,"edr":212,"eck":1046,"ech":639,"eci":3537,"ece":167,"eca":304,"ee ":1150,"ôme":730,"ecu":225,"ect":1210,"eco":767,"dwi":249,"dwe":183,"dwa":258,"dy ":278,"dur":468,"dus":275,"duy":1143,"ôi ":2569,"dor":2168,"dop":306,"don":2791,"dom":306,"dol":520,"dow":395,"dov":304,"dou":172,"dos":360,"ds ":1070,"doa":646,"doc":1372,"dog":488,"dun":458,"dul":398,"duc":230,"dri":731,"dra":523,"dt ":665,"dre":808,"du ":1084,"dro":986,"dha":326,"dge":229,"dic":469,"did":312,"dia":1872,"ôte":1594,"der":3477,"des":3199,"ômé":10659,"dea":701,"ded":323,"dec":231,"del":1465,"den":4157,"dem":911,"deo":220,"di ":3427,"dle":656,"dla":192,"do ":4210,"diu":260,"din":1244,"dio":353,"dis":2108,"dit":379,"die":2847,"dil":249,"rgy":150,"rgu":287,"rhe":221,"rha":208,"rho":196,"rga":532,"ri ":2301,"rgi":1202,"rgh":152,"rge":1773,"rgo":1290,"rgn":567,"ret":1055,"res":3724,"rev":398,"reu":912,"rex":297,"rey":288,"rfa":190,"rfl":255,"nân":179,"rdu":164,"rds":270,"rdr":159,"này":31504,"rg ":4949,"reb":939,"rea":2006,"ree":1006,"ref":247,"rec":523,"red":781,"rei":1635,"reg":1203,"rem":776,"ren":4076,"rel":1428,"rer":300,"reo":199,"rep":265,"rf ":1207,"rda":570,"rcu":715,"rct":554,"rdo":799,"nào":548,"rdi":1755,"rde":2543,"re ":9446,"rbu":335,"rco":417,"rci":274,"rch":1874,"rce":427,"rca":280,"ray":496,"raz":574,"rd ":2860,"rao":655,"rap":552,"raq":160,"rar":494,"ras":1804,"rat":1838,"rau":875,"rav":215,"rbi":1026,"rbo":389,"rba":580,"rbe":418,"rai":3535,"rah":190,"rag":1656,"ran":8266,"ram":1112,"ral":2485,"rak":217,"rab":584,"raf":217,"rae":749,"rad":1615,"rac":3183,"rpu":270,"rpo":754,"rs ":2330,"rpe":165,"rpa":404,"rpi":173,"rph":340,"ror":201,"ros":1675,"rot":1221,"rom":2423,"ron":57984,"lĩn":607,"roo":669,"rop":3656,"roy":149,"rou":1200,"rov":1781,"row":593,"rob":744,"roa":358,"rod":644,"roc":1935,"roi":481,"rol":951,"rof":216,"rog":648,"nên":713,"rno":358,"rns":159,"rna":1180,"rne":2695,"rni":1419,"riè":172,"ném":189,"riê":408,"rmo":497,"rms":184,"ro ":929,"rma":3617,"née":2905,"rme":709,"rmi":637,"rly":155,"rlo":157,"rli":457,"rld":399,"rle":351,"rla":627,"rn ":3576,"hơn":1695,"hơi":801,"rki":204,"rke":393,"rka":202,"né ":169,"rm ":302,"riz":418,"rix":394,"rl ":161,"rip":365,"rio":1072,"rit":3591,"ris":2983,"riv":318,"riu":456,"rig":1218,"rii":3008,"ril":1020,"rin":4090,"rim":450,"ria":5391,"rib":1175,"ric":5324,"rid":4461,"rie":2155,"rif":320,"rk ":1670,"hơ ":851,"rwe":155,"nói":685,"rz ":208,"hư ":5084,"nôn":377,"rya":216,"ryc":442,"rug":231,"rue":707,"ruc":377,"rup":171,"run":25178,"rum":705,"rul":242,"ruy":2675,"ruz":213,"rus":1414,"rva":295,"rvi":473,"rve":224,"rwa":347,"ry ":2417,"rsi":891,"rso":252,"roß":201,"rsc":361,"rsd":187,"rsa":1123,"rsb":257,"rsh":319,"rse":899,"rta":737,"óc ":726,"rst":546,"rto":415,"rtb":145,"rte":1749,"rth":2805,"rti":1215,"nó ":2643,"rub":265,"rts":371,"rtr":430,"rtu":180,"rtt":513,"rt ":1957,"óa ":2701,"rqu":185,"rro":1564,"rrh":197,"rri":1788,"rre":2216,"rra":3074,"ru ":477,"rry":300,"rru":147,"sab":140,"sac":1864,"sai":281,"sak":172,"sal":439,"sam":384,"sba":297,"sbe":425,"sao":776,"óng":4090,"san":1255,"sau":2887,"sat":228,"sas":328,"sar":660,"oà ":337,"óp ":288,"sa ":2015,"núi":1191,"óm ":1521,"ón 
":2564,"rze":166,"hưa":582,"rys":387,"ryo":172,"ryp":167,"ryl":162,"ói ":993,"ryn":172,"hưn":1523,"sha":619,"năm":27971,"năn":1182,"sho":293,"shr":233,"sht":381,"she":876,"shi":1264,"si ":634,"oãn":159,"sge":580,"sie":368,"sid":739,"sic":687,"sia":3700,"sk ":591,"shw":300,"shu":294,"sit":709,"sis":3021,"sip":265,"sin":5091,"sio":465,"sil":1895,"sim":553,"sii":258,"sif":151,"sig":220,"scr":307,"scu":281,"òng":2223,"oài":34230,"oàn":5706,"sdo":465,"sbu":460,"se ":4575,"oá ":380,"sca":1351,"sce":236,"sci":1194,"sch":1792,"sco":719,"sey":198,"ser":1028,"ses":272,"set":418,"oát":273,"seu":551,"sh ":1939,"sfe":321,"sea":715,"sei":161,"see":284,"sed":245,"sec":229,"sep":202,"sen":3196,"oán":1638,"sem":1371,"sel":2101,"hươ":4318,"spo":375,"shū":247,"spr":259,"sph":157,"spe":3907,"spi":1229,"spa":351,"sot":292,"sou":945,"sol":467,"som":376,"son":1789,"sop":308,"sor":191,"sof":194,"soi":556,"soc":217,"su ":286,"sra":294,"st ":2960,"squ":256,"ss ":872,"sli":218,"sky":167,"kiể":2684,"sla":1569,"sle":323,"ski":288,"sks":212,"khể":3237,"ska":369,"sna":2980,"sni":207,"sne":250,"smo":583,"siê":264,"so ":947,"sma":1028,"smi":179,"swi":1048,"syn":165,"syl":224,"sse":3231,"soá":206,"ssa":1676,"sso":1413,"ssi":3339,"ssu":544,"ste":5951,"stf":168,"sth":161,"sta":3335,"sto":2068,"stp":174,"sti":4685,"stl":165,"stu":445,"str":4349,"sts":262,"sty":144,"sub":863,"sul":490,"sum":304,"sup":191,"sun":266,"sus":998,"sur":1961,"suy":191,"òa ":5602,"sy ":172,"swa":243,"tai":3180,"tak":236,"tal":2442,"tae":263,"tag":338,"tah":173,"tab":252,"tac":777,"tad":502,"tay":436,"tax":151,"tau":185,"tat":1298,"tas":231,"tar":1441,"tap":149,"tan":2995,"tam":442,"tch":446,"òm ":317,"te ":8300,"tbu":170,"òn ":4112,"ta ":6742,"ký ":793,"ozè":153,"pa ":420,"làm":1846,"làn":610,"lá ":624,"pe ":1111,"par":2272,"pat":396,"pas":155,"là ":152991,"pac":448,"pag":1801,"pal":1430,"pan":1032,"phe":584,"pha":2087,"phu":325,"phr":290,"pho":3352,"phn":208,"phi":5842,"pi ":273,"lãn":895,"ph ":150,"lâu":392,"lâm":192,"pea":417,"pec":3297,"ped":468,"pen":678,"per":2665,"pet":646,"pes":3121,"pel":638,"pla":748,"hiể":23255,"pli":245,"phâ":2752,"phá":7538,"ple":773,"lès":368,"phí":12064,"phê":328,"plo":417,"phé":481,"phò":631,"phó":453,"phy":787,"pia":524,"pid":686,"pic":879,"pie":144,"pil":547,"pin":2627,"pio":231,"pir":455,"pis":429,"pit":358,"por":1000,"pop":359,"pot":426,"pos":699,"pom":618,"pon":447,"pol":658,"poc":162,"pod":1012,"ps ":851,"hū ":248,"ppi":1358,"ppo":224,"ppe":777,"phú":224,"po ":176,"lí ":1217,"lên":761,"pta":227,"pse":181,"psi":853,"pso":273,"ptu":208,"pua":385,"pub":253,"puc":309,"pte":1527,"pti":1324,"pto":542,"ptr":151,"pra":304,"hoể":8808,"pt ":251,"phư":1981,"pri":732,"pre":749,"pro":835,"huể":53256,"lôn":185,"lôm":10657,"pur":817,"pus":345,"pun":212,"pul":573,"lô ":685,"px ":1130,"pyr":166,"lý ":2650,"lúc":371,"mà ":1433,"màn":167,"máy":1955,"mã ":867,"màu":877,"mét":23053,"méo":221,"mìn":517,"qua":5837,"mô ":773,"quy":3085,"que":2033,"qui":2180,"món":229,"môn":561,"môi":520,"quâ":3680,"quá":966,"quê":311,"quý":155,"mùa":374,"ra ":16536,"ngo":2830,"ngi":1741,"ngl":604,"ngk":252,"ngu":5422,"ngr":255,"ngt":407,"ngs":604,"ni ":1381,"nge":3237,"ngh":7783,"nga":2179,"nho":251,"ndé":306,"nhu":514,"nha":2433,"nhi":5815,"nhe":263,"neg":239,"nei":512,"nel":1599,"nen":680,"nem":262,"neo":282,"ner":1201,"net":904,"nes":4373,"nev":164,"neu":463,"ng ":357107,"nea":1462,"neb":278,"nec":300,"ned":346,"nee":182,"nfo":163,"ney":270,"nez":428,"nh ":132657,"nfe":333,"nct":449,"nco":534,"nci":1276,"ncl":257,"nce":3207,"nch":2383,"nca":570,"ne 
":19277,"nbu":1513,"ndu":416,"ndr":1581,"nds":990,"ndo":2398,"ndl":619,"ndh":176,"ndi":4609,"nde":3318,"nda":1999,"ncy":170,"nal":1151,"nam":13111,"nan":913,"nar":774,"nac":1244,"nad":927,"nae":1024,"nag":697,"nai":3249,"nbo":145,"nbe":462,"nd ":12230,"nba":380,"nav":164,"nau":516,"nat":2078,"nas":580,"nay":2459,"na ":6835,"iúp":291,"가 ":328,"cư ":415,"iôn":176,"myc":163,"nya":523,"nyi":193,"nz ":267,"ny ":1333,"nvi":479,"nx ":222,"nul":298,"num":443,"nus":3075,"nut":506,"nty":550,"nto":2015,"ntu":367,"nts":334,"ntr":2166,"nti":3110,"nth":1365,"ntl":140,"nta":3026,"nte":4904,"nsu":730,"nsy":214,"nso":325,"nst":981,"nsf":180,"nse":669,"nsh":461,"nsi":2204,"nsl":567,"nsk":139,"nsc":152,"nsa":421,"nsb":373,"như":5336,"nt ":8221,"ngư":22262,"nqu":229,"ns ":3001,"noc":411,"nod":303,"hĩa":2885,"nob":365,"nol":370,"noi":432,"nop":786,"nom":551,"non":799,"not":862,"nos":776,"nor":939,"now":1063,"nov":236,"nou":281,"nne":5126,"nna":750,"nno":345,"nni":716,"nns":289,"nma":453,"niê":790,"nhâ":2593,"nn ":413,"nla":2159,"nhá":489,"nhà":6343,"nhó":1299,"nly":637,"no ":1110,"hĩ ":710,"nhì":354,"ngâ":223,"nke":271,"nki":377,"ngà":10009,"nka":598,"ngô":1896,"nkt":202,"nja":290,"nii":438,"nig":289,"nif":242,"nie":473,"nid":3745,"nic":2037,"nia":5524,"nk ":242,"nix":164,"niu":181,"niv":273,"nis":1435,"nit":1158,"nio":409,"nim":283,"nin":1178,"nik":149,"nil":357,"ogr":267,"ogu":214,"ogi":228,"ogl":349,"ogo":319,"ogn":1408,"oga":317,"oge":346,"oi ":1139,"ohn":303,"oha":229,"ohe":168,"ogy":195,"ois":1103,"oir":3370,"oit":1455,"oin":300,"oil":149,"oid":1389,"oie":174,"ok ":359,"ol ":657,"oce":820,"och":1882,"oci":330,"ock":741,"ocl":146,"oco":568,"ocr":154,"obu":172,"oe ":211,"oca":1110,"occ":493,"ode":1045,"odi":450,"odo":1089,"odr":210,"oct":1918,"ocy":317,"of ":6954,"oda":691,"dươ":848,"oen":367,"odu":285,"oed":204,"og ":551,"oft":282,"off":252,"ofe":161,"oa ":13629,"oc ":1250,"oan":1921,"oad":230,"oba":818,"od ":1168,"oar":504,"oas":324,"oat":240,"obo":175,"obr":145,"obl":325,"obi":968,"obe":655,"nym":211,"nza":905,"nze":176,"oya":255,"oxy":147,"oxi":202,"oz ":147,"guể":1293,"ows":383,"owl":142,"own":1769,"owi":183,"ozo":255,"oza":724,"otu":288,"oud":174,"oub":187,"ouc":419,"oua":154,"ow ":1038,"oti":1060,"oth":1496,"ote":697,"ott":720,"ots":282,"otr":764,"oto":966,"ost":2982,"osu":324,"ota":1031,"ov ":258,"osi":491,"osh":227,"ose":2073,"osg":493,"osp":425,"oss":826,"osm":547,"oso":413,"osn":140,"oy ":273,"owe":547,"ovi":954,"ovo":159,"ouv":263,"oux":242,"ova":705,"ove":2204,"oug":763,"oui":391,"oul":679,"oun":2537,"oup":387,"ous":2368,"our":3149,"out":2276,"opo":1519,"opp":188,"opi":1120,"opl":220,"ope":1422,"oph":3191,"opa":411,"os ":2470,"opu":314,"opt":1403,"ops":1255,"oon":440,"ool":189,"oom":247,"ook":283,"oog":197,"ood":513,"or ":3376,"oot":347,"oor":232,"ork":631,"orl":473,"orm":2975,"orn":1953,"oro":870,"orp":1047,"orr":2602,"orc":598,"ord":3003,"ore":2045,"orf":1239,"org":708,"ori":3584,"ou ":1707,"osa":1094,"osc":366,"ort":2762,"ors":1266,"oru":667,"ory":1225,"kín":427,"kíc":629,"ot ":1197,"goể":939,"m² ":639,"orb":599,"ora":2404,"oqu":150,"ola":2194,"old":517,"kê ":1566,"giể":12764,"on ":15999,"oli":2444,"oll":1993,"olf":295,"ole":1581,"ols":1043,"olt":149,"olm":148,"olo":2220,"oly":684,"olz":312,"olu":1314,"olv":159,"oka":251,"ghể":2336,"om ":2129,"okk":143,"oki":163,"oke":162,"oku":288,"ona":2358,"ond":2416,"onc":461,"onf":214,"one":2805,"ong":59905,"oni":4577,"onl":675,"onn":2036,"kên":176,"ono":1589,"ons":1629,"ont":3656,"onu":697,"onv":312,"ony":549,"gũ ":341,"kí ":158,"oma":3160,"oo 
":152,"ome":1667,"omb":1058,"omi":1212,"omm":2408,"omp":760,"omo":798,"kéo":312,"omu":461,"omy":255,"op ":502,"kì ":200,"la ":10761,"ính":11878,"ín ":1003,"há ":535,"le ":10516,"lca":204,"ít ":628,"lch":163,"lf ":224,"lde":761,"ldb":159,"lda":161,"hào":150,"ldo":198,"hàn":21780,"hàm":435,"hài":213,"ldi":250,"lab":417,"lac":1406,"lad":738,"lae":401,"lah":182,"lag":424,"laj":292,"lai":1546,"lal":167,"lan":10384,"lam":1066,"lap":209,"lao":259,"lar":4422,"lat":2293,"las":1229,"law":585,"lau":512,"lav":645,"lay":1413,"lba":423,"hà ":7105,"ld ":1873,"lbe":635,"lbi":189,"lbo":527,"lbu":797,"gô ":876,"góc":144,"ky ":446,"ích":17834,"cuể":3904,"gôi":662,"góp":269,"gôn":1009,"hìn":3185,"llé":141,"hìm":147,"lpe":2886,"lpi":216,"lph":225,"ls ":3004,"híc":534,"hía":11942,"lok":158,"lon":3179,"lom":1001,"lop":1213,"lor":1602,"lod":250,"loc":750,"loe":167,"log":693,"loi":242,"los":940,"lot":496,"lou":372,"lov":598,"low":793,"hêm":276,"lob":302,"hí ":2215,"liê":1520,"hép":885,"lmo":317,"lme":324,"lma":282,"hì ":1109,"lti":486,"lto":187,"hó ":498,"lud":253,"luc":198,"lue":351,"lso":351,"lst":1066,"lta":363,"lte":462,"lu ":183,"lse":196,"loà":31500,"lsa":619,"ía ":12221,"hín":8169,"lt ":1144,"lhe":150,"lha":193,"hãn":1016,"lge":195,"li ":845,"lga":417,"hât":447,"hâu":5143,"hân":32338,"hâm":245,"lfe":172,"ley":541,"lex":375,"leu":725,"lev":279,"les":4804,"hát":4309,"let":890,"ler":1633,"leo":683,"háo":303,"lep":286,"háp":24791,"hám":208,"lem":456,"len":2814,"hán":13040,"lel":163,"lei":443,"hái":3807,"leg":371,"lef":140,"led":786,"hác":3511,"lec":514,"leb":324,"lea":958,"lls":526,"llu":1871,"lly":728,"lo ":703,"lla":8252,"llb":208,"lle":6069,"lli":4980,"llo":3130,"lks":330,"hê ":529,"diể":17680,"lka":390,"lm ":305,"ll ":2030,"hè ":211,"lit":1006,"lis":1811,"lip":1960,"lio":1480,"lin":3515,"lim":803,"liz":196,"liv":616,"liu":661,"lic":1528,"lid":3931,"lia":3554,"lk ":142,"lik":204,"lii":571,"lig":457,"lie":1623,"lif":1017,"ma ":3578,"húa":466,"húc":1589,"hún":1289,"hút":209,"húy":143,"mb ":269,"ìm ":8521,"mac":644,"mai":885,"maj":203,"mad":498,"mae":190,"ìn ":392,"mag":301,"hür":727,"mar":2810,"mas":488,"mal":1566,"man":6200,"maz":173,"mat":1698,"mba":981,"mbl":196,"mbi":1974,"mbe":1329,"mbr":351,"mbo":504,"me ":3769,"iá ":976,"iße":139,"mbu":270,"ình":20204,"iàn":475,"med":457,"meg":343,"mea":185,"iác":256,"mec":144,"met":1158,"iáp":2001,"mes":1080,"mer":2891,"iám":415,"mem":171,"mel":1053,"iáo":2698,"ián":260,"men":4045,"mei":235,"luy":242,"hòa":4573,"lva":958,"hô ":347,"lve":227,"lvi":160,"lul":235,"lun":153,"lum":1803,"lut":702,"lus":2401,"ly ":5413,"hóa":2227,"hòm":296,"hòn":1026,"lz ":1932,"hôi":152,"hón":422,"hóm":1446,"hôn":8959,"luô":144,"lyp":567,"lym":264,"lyn":185,"hù ":377,"hú ":960,"hùn":400,"hùa":429,"mpi":548,"mph":909,"mpe":699,"mpr":166,"mpo":339,"mpl":212,"mpu":526,"mps":257,"ms ":540,"moc":217,"mod":141,"mon":2835,"mop":418,"mol":919,"mor":1191,"mos":484,"mot":349,"mou":592,"mpa":2137,"mu ":149,"mua":252,"mst":192,"my ":346,"mur":466,"mus":478,"mul":387,"mun":1488,"hăm":190,"hăn":308,"mi ":443,"min":2275,"mil":2845,"mir":352,"mis":870,"mit":1161,"cơ ":2355,"mic":1181,"mib":195,"mia":637,"mie":192,"mid":343,"mo ":242,"ièr":623,"mm ":173,"iêu":3120,"mni":139,"iêm":793,"iên":15316,"mno":232,"mmu":465,"mmi":508,"miê":868,"mmo":992,"mma":550,"mme":1127,"xâm":158,"xây":774,"xã ":6613,"xác":753,"thể":105642,"tiể":16334,"vĩ ":1285,"Để ":4107,"suể":581,"Đểt":393,"Đểu":259,"Đểa":425,"Đểc":8666,"Đểi":2655,"Đển":2345,"Đểo":612,"văn":2376,"soể":246,"vô ":966,"vòn":456,"zue":365,"zur":1019,"ruể":225,"võ 
":369,"hưể":6739,"vùn":22597,"vây":180,"vào":6478,"vàn":747,"zen":390,"zel":179,"zer":396,"ze ":541,"vài":326,"và ":39376,"zam":416,"zan":947,"zak":194,"zar":156,"ví ":164,"zon":909,"zo ":189,"vì ":1170,"zna":154,"riể":3937,"zia":217,"zie":194,"zin":147,"zil":497,"yré":2878,"yx ":207,"yth":348,"yst":455,"yso":284,"ysi":882,"yri":846,"yro":276,"yra":354,"yrg":171,"yre":315,"ys ":1775,"yph":446,"ypt":526,"ypr":406,"ypo":300,"ype":314,"yon":322,"uý ":440,"za ":604,"gưể":22834,"uôn":11800,"uôi":272,"quể":14720,"ye ":286,"uá ":683,"yca":261,"yce":225,"ych":303,"ycl":287,"yco":153,"yct":483,"ydr":465,"yer":937,"uán":332,"yen":509,"ya ":1624,"yat":304,"yan":957,"yal":151,"uê ":487,"yla":353,"yle":207,"yli":383,"yll":1078,"ylo":482,"ylv":243,"ylu":187,"yma":185,"sĩ ":1783,"yo ":184,"yme":163,"ymp":602,"ymn":164,"yna":220,"yne":184,"yno":257,"uân":5407,"yi ":308,"tăn":371,"yho":211,"yin":196,"tín":2825,"để ":24120,"tíc":15647,"đểy":569,"xtr":322,"đểu":7023,"đểt":4542,"đểa":6133,"đểi":9089,"đểc":6272,"đển":28910,"đểo":4093,"đểp":378,"đểm":552,"tên":7356,"tìn":624,"tìm":8334,"xon":351,"xoa":253,"tù ":245,"tô ":345,"tòa":324,"xuy":162,"xun":238,"tôn":573,"săn":488,"xi ":160,"tây":9977,"tâm":1507,"tán":178,"xem":772,"tác":2521,"tái":142,"tàn":309,"tàu":1978,"nơi":1970,"phể":19501,"xil":335,"xin":153,"xic":980,"xa ":336,"tài":1059,"xce":150,"xe ":304,"xas":712,"xan":855,"ww ":171,"www":171,"són":349,"sôn":1977,"wo ":144,"yểu":888,"yểt":2656,"yển":27686,"sên":250,"wn ":1674,"sèr":449,"ws ":411,"wor":456,"woo":295,"we ":324,"sân":742,"wes":568,"wer":566,"sáu":219,"sát":605,"wen":170,"sán":1258,"wel":193,"wei":504,"wed":169,"wee":268,"web":356,"sác":1906,"whe":719,"whi":453,"răn":535,"sâu":259,"wi ":419,"wit":1057,"wig":1125,"wid":265,"wic":172,"win":477,"wil":183,"rös":588,"lưu":884,"vuô":11318,"dưể":1608,"rùn":419,"wa ":340,"rúc":638,"wan":646,"wal":651,"way":264,"wat":547,"war":864,"was":494,"wai":552,"rüc":145,"ría":161,"vre":519,"rò ":634,"vua":1478,"vul":314,"rõ ":263,"ròn":162,"rôm":369,"via":1024,"vir":262,"vil":2132,"vin":903,"vig":163,"vic":424,"vid":453,"vie":338,"ngể":4112,"vit":212,"vis":359,"nhể":20937,"ré ":197,"niể":631,"rén":2896,"viê":2261,"rì ":216,"rên":16855,"rí ":838,"voi":343,"vol":291,"von":231,"vor":176,"rìn":2591,"vi ":814,"râu":162,"rãi":238,"ver":2511,"ves":443,"vet":169,"ràn":171,"rào":384,"rái":592,"rán":159,"ven":2308,"vel":746,"rác":292,"ve ":1535,"val":496,"vak":211,"van":1011,"var":683,"vat":444,"rà ":170,"vad":360,"vai":638,"va ":1421,"uyê":4853,"cưể":274,"uze":169,"uzn":158,"uya":181,"uxe":167,"uxo":172,"muể":253,"ux ":1397,"uvi":315,"uve":890,"uy ":4510,"usk":685,"ush":495,"usi":1086,"use":2256,"usc":486,"usa":422,"usu":188,"ust":2067,"uss":2125,"utm":305,"uth":2069,"uti":986,"ute":4039,"uta":488,"utt":513,"uts":241,"utu":168,"uto":356,"us ":12540,"ut ":1468,"urb":527,"ura":1745,"urc":370,"ure":2141,"urg":4458,"uri":3154,"urk":298,"urn":505,"uro":993,"urp":172,"urr":1188,"urs":436,"urt":1515,"uru":494,"ury":261,"ur ":3979,"uph":337,"upi":208,"upe":461,"upl":256,"umi":350,"umo":202,"uma":655,"umb":1047,"ume":427,"unt":1093,"uns":220,"uni":1404,"unn":257,"unc":850,"und":2117,"una":410,"ung":28650,"une":861,"up ":561,"uki":214,"um ":5338,"ulu":810,"ult":827,"ulo":594,"ulm":200,"ull":475,"uli":2981,"ulg":343,"ule":468,"ulc":170,"ula":2994,"ulb":224,"miể":7160,"un ":876,"uid":1911,"uil":817,"uin":1133,"uis":537,"uit":1805,"ul ":704,"ugh":635,"uge":202,"ugl":142,"ui 
":422,"uga":249,"ugu":343,"uco":341,"uct":140,"ucu":155,"uda":464,"ude":797,"udi":503,"ubs":175,"ubr":196,"uca":423,"ue ":2123,"uce":268,"ucc":527,"uci":266,"uch":1204,"ucl":244,"uck":465,"uet":161,"uev":189,"uer":585,"ues":940,"uff":186,"udo":607,"udw":163,"uee":563,"ued":1142,"ueb":187,"uen":891,"uel":938,"ub ":179,"ua ":4516,"uay":450,"uat":643,"uar":339,"ual":262,"uan":4521,"ubi":473,"ubl":385,"ube":682,"ubf":310,"uba":515,"ud ":256,"uai":209,"uad":968,"tze":378,"tyl":438,"typ":380,"bưể":9335,"trư":5334,"trù":418,"ty ":2056,"trú":686,"trò":743,"trí":887,"trì":2564,"trê":16809,"trá":625,"trà":444,"tvi":145,"tuy":1965,"tur":1038,"tus":1579,"tut":187,"tui":1819,"tul":991,"tun":226,"tum":525,"tud":242,"tuc":215,"luể":1046,"tz ":904,"two":235,"tră":326,"ts ":2029,"lũn":205,"tre":2316,"loể":3367,"tt ":262,"tra":13075,"thơ":816,"tri":6787,"tru":17464,"tro":55540,"thư":5419,"tu ":263,"try":204,"toá":1052,"tsc":237,"toà":1057,"tsu":330,"tsw":157,"tta":873,"tte":2605,"tti":475,"ttl":380,"tto":705,"ttp":325,"tts":284,"thă":146,"tme":444,"tma":160,"thú":506,"thù":145,"to ":4755,"thô":2398,"tiê":3084,"tp ":325,"tna":209,"toe":205,"tod":450,"toc":558,"toi":213,"tog":204,"tob":196,"tou":1449,"tos":429,"tot":153,"tow":612,"tom":1725,"ton":3013,"tol":487,"tor":2310,"top":602,"tr ":327,"tii":370,"til":3476,"tif":586,"tie":501,"tig":741,"tir":204,"tiq":562,"tit":511,"tis":1648,"tin":5106,"tim":1567,"tip":239,"tio":2405,"thy":410,"thu":53072,"thw":183,"tia":1585,"tic":3507,"tid":1768,"tiu":237,"tiv":602,"tli":260,"thê":286,"thé":143,"thí":558,"thì":784,"liể":1722,"tla":967,"thâ":11890,"thà":11154,"tle":1005,"thá":12170,"tem":2257,"ten":2299,"tep":155,"tei":2098,"tel":2040,"tee":338,"teg":403,"tea":605,"tec":172,"ted":1492,"tfa":159,"th ":4032,"tev":154,"teu":168,"tet":184,"tes":3544,"ter":9430,"ti ":720,"tho":1935,"thm":205,"thr":908,"the":21046,"thi":5961,"tha":4235,"之三 ":187,"之万 ":161,"rưể":6336,"之专 ":252,"ăk ":141,"ăm ":28988,"ăn ":5275,"ăng":3284,"xưa":183,"vươ":541,"nưể":35806,"丘 ":240,"mưể":240,"专 ":1368,"xuể":3053,"並 ":377,"tươ":996,"lưể":2704,"三 ":1277,"丁 ":542,"万 ":771,"uyể":30286,"乙 ":247,"sư ":673,"tư ":1117,"zèr":162,"vũ ":670,"ürt":532,"üri":758,"viể":4820,"rươ":293,"rưn":427,"tuể":1284,"trể":43603,"yêu":353,"之 ":589,"yên":4958,"sơ ":163},"n_words":[13809827,17315344,13396979],"name":"vi"}
\ No newline at end of file
+++ /dev/null
-{"freq":{"·":11798,"é":695,"и":659,"о":642,"а":705," 《":2860," 。":4044," 、":2042,"あ":2229,"。":93258,"、":80590,"》":12787,"《":12801,"ア":2133,"乱":17692,"书":4419,"习":841,"乡":1808,"九":2519,"也":8266,"乘":28571,"乐":5294,"乌":1217,"义":4779,"之":17358,"久":747,"主":15497,"为":43069,"举":3076,"丽":1041,"丼":1241,"丰":817,"临":983,"个":19411,"中":47239,"两":4794,"严":592,"丛":1367,"业":8072,"东":12690,"丞":9778,"专":3303,"丕":15196,"世":9256,"丈":146015,"三":7927,"上":15305,"下":7962,"不":69712,"与":14330,"一":51232,"丁":117148,"七":1693,"万":2760,"价":1006,"份":14596,"任":5559,"以":21926,"令":1161,"代":10675,"们":2426,"他":7464,"仙":673,"仅":1129,"仍":888,"从":3822,"今":2840,"亲":1398,"人":38353,"亡":667,"产":5405,"亦":2742,"交":20817,"京":3333,"云":2725,"五":3431,"亚":46408,"些":2757,"了":8577,"争":2347,"予":591,"事":19248,"二":6378,"于":42781,"使":59599,"低":1173,"住":1250,"位":13424,"但":4207,"作":14815,"何":21331,"体":8040,"伯":2492,"传":4668,"伦":1894,"伊":2356,"企":1149,"会":16875,"优":845,"休":48239,"众":1421,"信":7840,"俄":2213,"保":3387,"侧":721,"供":2204," 分":744," 公":1819,"商":3626,"哥":1688,"品":4389,"响":1404,"和":23304,"周":2010,"呼":663,"命":2331,"员":7153,"启":817,"含":1431,"吴":771,"名":20347,"同":8476,"后":12165,"吉":2174,"合":7745,"各":3613,"向":3047," 号":1564," 台":812,"域":2609,"城":5315,"培":3476,"基":6619," 名":675,"址":719,"坦":1302,"坡":2693,"土":2149,"圆":692,"场":5846,"地":26817,"在":35727,"圣":2664,"回":1392,"四":5911,"团":4373,"因":6183,"园":2775,"围":1767,"图":2777,"国":50445,"器":3296,"况":701,"冰":646,"决":1729,"冲":743,"农":1200,"军":7136,"写":2568,"册":628,"再":1221,"内":8287,"击":1569," 丕":929," 世":1671," 丈":5741," 不":1283," 一":646," 丁":3946," 万":1431,"兰":5551,"共":7134,"关":4776,"兴":1795,"其":12214,"具":2485,"兹":634,"养":660,"入":4429,"全":7377,"八":1932,"六":2119,"公":17034,"党":2412,"元":4144,"克":7641,"光":2911,"先":2098,"免":623,"停":661,"原":6718,"压":796,"厂":831,"历":4017,"厅":672,"去":1176,"县":5347,"厦":614,"变":2770,"受":2511,"取":2479,"发":12789,"双":1530,"反":2487,"及":17913,"友":732,"又":5333,"参":2872,"司":7147,"号":5641,"台":13293,"可":7699,"只":2126,"口":4422,"化":7453,"包":4823," 何":732,"区":18524,"医":1928," 位":590,"南":15302,"单":3024," 使":1450,"博":1783,"升":971,"千":1226,"十":5342,"协":2640,"华":6868,"半":2037,"卷":760,"印":3243,"即":2844,"卡":3152,"卫":1639,"力":4944," 人":2100,"办":2818,"动":10155,"助":1407,"加":7203,"务":5246,"势":871,"劳":804,"勞":1639," 休":920,"分":15267," 个":2399," 中":1862,"刘":669,"则":3203,"创":4231,"初":2398," 之":647,"别":3560,"利":8681,"到":6365,"制":7035," 乘":758,"前":10878," 乱":770,"剧":3129,"副":860," 亚":733," 天":581," 大":1004,"工":9317,"已":3013,"巴":5253,"州":7529,"川":3028,"山":12651,"属":10137,"展":3788,"屋":1097,"届":1515,"局":2483,"层":1590,"居":1947,"尼":5208,"就":2894,"尚":3502,"少":2443,"小":25275,"将":4225," 多":650,"岸":1431,"岭":590,"岩":988,"岛":4996,"岁":688,"录":1700,"归":696,"当":5254,"形":4076,"役":939,"影":4410,"式":6888,"异":925,"开":7255,"引":4831,"张":2153,"弹":947,"强":1613," 小":894,"念":1709,"往":1560,"律":1732,"得":4936," 家":600,"德":8327,"广":7680,"并":7400,"年":47488,"干":1556,"平":5989,"帝":2672,"帕":667,"师":2671,"希":2317,"布":7549,"常":6171,"席":1225,"带":2534,"延":769,"建":8867,"库":1470,"底":1106,"应":3484,"店":1431,"庙":584,"庆":953,"康":1164,"度":6769,"座":2603,"大":34268,"央":1321,"天":7035,"夫":2315,"太":3274,"头":2350,"失":939,"备":1259,"处":4116,"复":1700,"外":5857,"多":11036,"奥":3388,"女":3642,"好":1250,"奖":2274,"如":3936,"始":3351,"委":2437," 和":850,"增":971,"境":2254,"声":4998," 在":1140,"子":9623,"存":2538,"学":28915,"安":5174,"它":4574,"宁":1619,"定":6314,"实":4032,"宝":999,"宗":2085,"宫":1054,"客":2272,"宣":1095,"宾":780,"家":33525,"富":1265,"密":1755,"察":855,"导":2989,"对":7380,"威":2129,"媒":887," 
国":713,"区,一":689,"更":2085,"曲":2749,"曾":3299,"是":61140,"映":608,"春":892,"显":972,"星":4368,"易":1898,"普":2707,"智":818,"晚":728,"架":926,"林":7036,"果":2196,"极":1459,"构":3349,"查":1580,"机":7949,"未":4221,"木":1876,"术":4302,"本":13564,"望":1007,"朝":3840,"期":7068,"朗":953,"月":15728,"有":24992,"服":2639,"最":10271,"松":1118,"杰":812,"杨":679,"来":9550,"条":3385,"村":1739,"李":1577,"杀":947,"杂":1265,"权":2775,"播":2492,"·丁":954,"·丈":771,"摄":716,"料":1677,"文":14966,"施":1293,"斯":13962,"断":823,"旁":959,"旅":1334,"族":3701," 或":968,"时":14785,"无":3416,"旧":1079,"日":21151,"早":2138,"放":2159,"改":3194,"收":1975,"支":2892,"教":9551,"故":1906,"数":7124,"整":1497,"括":3402,"拥":1168,"拉":7109,"拔":3550,"报":2620,"护":1665,"技":2723,"抗":915,"投":1684,"执":1062,"扩":636,"承":1002,"批":885,"所":11521,"手":3277," 年":38848," 平":1187,"打":1400,"户":1297,"房":1033,"戏":2673,"我":858,"成":16718,"或":10994,"战":6655,"提":4221,"推":2158,"控":1211,"接":3776,"排":1222,"换":868,"据":3024,"持":2131,"指":5889,"情":1949,"息":1214,"态":1462,"总":5550,"感":998,"游":3622,"温":1133,"港":10900,"清":2634,"湾":8336,"湖":3858,"源":3031,"满":827,"演":3628,"区,多":1252,"澳":2170,"气":1964,"民":10950,"水":5383,"江":5857,"汉":2630,"求":1033,"汇":1020,"沟":938,"沙":2368,"河":5488,"油":1041,"治":5001,"没":1471,"泽":750,"波":2738,"派":1913,"活":2831,"洲":5289,"测":1385,"济":2241,"流":4509,"消":1003,"深":1422,"横":631," 是":2568," 月":13702,"区,常":666,"武":2208,"此":5466,"止":1051,"正":4102,"死":6678,"歌":2416,"次":4906,"欧":2758,"款":923,"母":1743,"比":4805,"毕":833,"毒":762,"案":1633,"桥":1371,"树":1023,"标":3335,"样":1395,"栽":2997,"根":2240,"核":1380,"校":3605,"楼":1695,"植":5185," 日":10844,"概":864,"石":3358,"知":2149,"省":5247,"着":1639,"皇":2205,"的":145656,"盖":717,"监":811,"盘":598,"目":9318,"直":3003,"白":2529,"百":1846,"登":1073,"病":1344,"町":605,"画":2451,"甸":900,"电":10706,"田":1994,"由":17497,"用":13911,"生":19204,"甘":1358,"略":871,"留":781,"界":6597,"理":9553,"球":6996,"班":1750,"玛":789,"王":5331,"现":8430,"环":2438,"独":1820,"状":1446,"片":2527,"牌":1136,"物":13958,"特":10095,"爱":2167,"照":1002,"然":2677,"热":1483,"点":3443,"火":1816,"灵":903,"缅":631,"缘":1067,"编":2591,"缩":977,"罗":6436,"网":4079,"置":1765,"署":901,"美":10457,"群":2218,"纳":2534,"纽":906,"线":7551,"红":1720,"约":5717,"级":4065,"纪":4115,"结":3008,"统":6645,"绝":650,"络":1469,"给":1157,"细":1259,"织":2183,"组":5795,"经":8789,"终":1135,"维":3625,"综":661,"绿":800,"继":1038,"续":1408,"索":1233,"素":1524,"类":5149,"米":9738,"系":8292,"等":12322,"策":957,"笔":614,"第":11099,"篇":721,"简":4298,"算":1987,"究":2923,"空":3846," 的":2781,"程":4573,"竞":659,"站":6147,"立":9651,"童":907,"突":730,"票":886,"神":3463,"社":4420,"示":1536,"积":2267,"移":1075,"称":17001,"私":738,"科":10282,"离":1553,"福":3324,"确":1094,"础":692,"破":649,"码":1376,"要":8323,"视":4552,"规":2234,"观":2303,"见":2171,"览":667,"角":3015,"解":2300,"西":17559,"被":7905,"街":2230,"行":14959,"表":5479,"认":2753,"计":5438,"训":622,"讯":1488,"议":3045,"让":773,"记":2357,"设":6237,"许":1740,"论":3372,"该":4765,"说":3972,"诸":678,"诺":1644,"读":946,"证":1510,"识":1123,"评":1093,"词":2012,"译":3205,"诗":967,"试":1000,"话":1875,"警":737,"言":2970,"路":9754,"越":2100,"超":1907,"足":2749,"资":4130,"赛":5990,"起":4293,"负":1207,"贝":1346,"财":832,"责":1208,"败":623,"货":901,"质":2689,"购":664,"贵":1643,"费":1407,"象":1888,"调":1492,"谷":1826,"过":6354,"进":5216,"还":1606,"这":6952,"连":2790,"远":1319,"运":6417,"近":3105,"辽":711,"达":4214,"边":2785,"较":2013,"辖":1462,"输":1210,"辑":1374,"车":6585,"轨":655,"转":1925,"轮":727,"软":1659,"轻":756,"载":1404,"身":2869,"脑":1283,"脉":651,"腊":1076,"致":1313,"至":9855,"自":8792,"而":9496,"者":7418,"职":2157,"联":6161," 
米":6968,"肃":1126,"股":1262,"育":2953,"胜":1031,"能":5645,"药":977,"荣":690,"草":3994,"获":2004,"莱":1327,"菲":926,"般":2569,"航":2316,"舰":942,"艺":2269,"艾":805,"色":3575,"花":3115,"节":2402,"英":9686,"苏":3117,"著":2998,"营":2531,"萨":1866,"落":1487,"蓝":701,"蒂":676,"虽":721," 英":812,"频":899,"题":1997,"预":942,"领":2926,"食":1377,"飞":1436,"风":2500,"顿":1362,"顺":631,"项":2377,"顶":716,"页":780,"馆":2371,"香":9519,"验":1104,"高":8988," 阿":664,"马":7083,"鲜":1464,"鲁":1516,"鱼":955," 香":692,"黑":1900,"黄":2293,"龙":3881,"造":2929,"选":3181,"送":619,"适":644,"通":7604,"速":2166,"遗":895,"邻":1194,"郡":912,"部":14531,"都":5243,"配":1056,"金":5677,"野":1099,"量":3834,"里":8179,"重":5583,"释":634,"银":1422,"铁":4586,"针":594,"钟":734,"长":12428,"镜":635,"镇":2371,"锡":594,"锦":662,"销":827,"闻":1352,"间":8313,"问":1446,"门":4685,"际":4272,"陆":5534,"陈":1215,"降":626,"限":2379,"陕":1187,"院":5166,"除":1569,"险":710,"队":3990,"阶":1080,"阴":725,"防":1166,"阳":2041,"阿":4439,"随":1152,"难":627,"隶":660,"青":2398,"非":3368,"面":6531,"需":1195,"音":4816,"韩":1018,"가":589,")":53342,"(":53630,"-":1381,",":211761,":":15751,";":5849," (":2857," )":2917," ,":8290,"国的特":945,"植物。":2891,"植物,":1199,"基丁教":662,"等地,":3010,"民共和":1261,"。 ":4085,"、 ":2429,"》 ":659,"地区,":3536,"。这":1769,"、陕":800," 、 ":1160,"、福":680,"。现":602,"、甘":833,"。由":929,"、西":1056,"、贵":865,"。该":1506,"四川、":1228,"、广":1717,"、山":2033,"国大陆":3832,"、安":622,"。它":1480,"、四":1328,"。在":1915,"、台":689,"。分":2961,"、印":861,"。其":1173,"、湖":1562,"、河":1285,"、江":1104,"。此":760,"》是":802,"、日":634,"ああ":1707,"、《":1198,"》、":1065,"》中":595,"《丈":900,"、休":601,"。他":1369,"、云":1236,"。乘":742,"、人":909,"。丁":940,"、不":1336,"、丁":2633,"、丈":4209,"、乘":1367,"、中":959,"。丈":1034,"アア":1643,"地区的":687,"》(":2220,"》,":1819,"在中国":1180,"栽培。":2912,"、贵州":848,"是香港":1721,"立于 ":1061,"从 ":609,"人 ":623,"以 ":631,"于 ":10544,"了 ":641,"亚·":946,"亚 ":1478,"休 ":1064,"会 ":652,"使 ":1090,"不、":2097,"不。":742,"丈》":600,"丈、":2959,"丈。":2119,"丁、":2210,"丁。":2121,"一。":1805,"东、":1223,"中、":800,"人。":1182,"事。":826,"亚、":1600,"亚。":687,"交。":780,"丈山":758,"丈属":948,"不家":1629,"之后":1285,"丈小":1097,"丈家":1182,"丁属":1251,"中国":13581,"丈子":859,"丈学":961,"一家":830,"乱丈":1142,"为台":739,"丈大":637,"举办":651,"丁大":616,"何。":612,"乘丁":814,"乘一":737,"乘丈":920,"中华":3016,"乘丛":864,"丈地":713,"之一":4710,"丁国":1036,"丁地":630,"东南":852,"主义":1789,"为主":1114,"为中":1501,"为丈":1939,"为一":949,"为丁":953,"丈和":636,"为了":902,"不同":2049,"中使":2667,"不区":1235,"中一":647,"不军":620,"个人":1067,"丈克":1016,"丈其":691,"不公":924,"两个":961,"不使":1931,"休。":1293,"休、":1110,"丁克":614,"东不":1592,"丛中":711,"丁军":992,"丁兰":618,"不交":3736,"不京":1465,"不事":949,"丈休":1776,"丈使":1087,"不休":658,"丕丈":949,"丈不":1018,"丈丈":10643,"丈丕":925,"丈丞":767,"丈中":605,"丁休":2172,"丈为":629,"丈之":694,"上不":1488,"一位":1048,"丈乘":1009,"丈乱":1342,"丁作":779,"丈事":642,"丁使":3178,"丈交":632,"不丈":1417,"不丁":1361,"丈亚":1470,"不不":1140,"与丈":673,"一丈":695,"丁丈":5202,"丁丁":3673,"丁不":1113,"一个":9742,"丁乘":875,"一亚":4760,"一些":1051,"丁乱":789,"丁亚":2029,"份、":719,"丈一":1128,"丈丁":6021,"主教":1120,"任何":656,"于山":605,"交大":1013,"企业":965,"为是":626,"中文":1113,"人口":1818,"一次":948,"丈林":809,"他们":991,"产品":768,"事场":701,"以丈":590,"人使":892,"丁是":769,"丈是":785,"一条":1013,"不拔":3372,"二十":706,"人乘":690,"云南":1737,"人丁":945,"人丈":706,"主席":689,"丁教":806,"于台":1035,"元 
":583,"丁斯":907,"丁文":660,"一所":719,"亚使":601,"交休":767,"事务":628,"乘大":1199,"亚丈":1056,"不式":1110,"亚丁":910,"中学":1267,"亚亚":814,"交不":722,"使。":1718,"使、":1407,"丁店":632,"一座":888,"中央":1281,"于丁":1020,"于不":3550,"于丈":1169,"事交":1241,"于中":1731,"中的":2873,"休如":834,"丈科":1509,"亚栽":2911,"东省":644,"份年":663,"使休":784,"使代":599,"使中":607,"二次":594,"使丈":1433,"使上":903,"使丁":935,"下的":912,"世界":3851,"上的":1685,"何丁":623,"丈的":3617,"于日":763,"乘江":1087,"位于":6910,"丁的":3074,"不生":996,"作为":1517,"不的":1824,"也有":715,"不现":1191,"份围":818,"人工":3099,"丈球":620,"休丈":1574,"休丁":1348,"以及":5720,"不治":1654,"也是":2441,"休休":1177,"休使":948,"交的":754,"何成":650,"但是":741,"人的":1301,"事的":883,"人物":960,"丁西":582,"产生":952,"亚的":1008,"一般":2436,"人民":2356,"丈至":636,"丈草":591,"使团":1466,"前 ":886,"作家":983,"作品":1438,"亚洲":979,"使兰":933,"世纪":2136,"乘的":934,"到 ":1169,"丈 ":2639,"丈·":660,"丁 ":1901,"丁·":957,"为 ":5249,"地、":838,"国、":813,"又译":658,"发行":1130,"可能":964,"南部":1207,"名称":1365,"后的":584,"名的":1485,"同的":919,"商业":792,"使(":777,"使,":2980,"又称":2280,"台湾":6649,"位,":691,"发现":998,"体,":607,"何,":1182,"作,":878,"发生":1318,"后来":990,"只有":591,"南等":606,"休(":899,"休,":2609,"和国":2085,"会,":853,"会(":981,"同时":1360,"在 ":4342,"命名":845,"份,":752,"国 ":738,"区的":1567,"和丈":1029,"和丁":641,"前身":709,"员会":1240,"交,":1266,"制造":737,"反应":687,"人,":2362,"化的":604,"事,":1473,"亚,":1453,"亚(":751,"亚:":3961,"华民":1238,"发展":2345,"多生长":627,"国的":3079,"国王":741,"城事":1744,"因此":1473,"国民":1000,"基丁":1219,"分,":694,"四川":1896,"地区":6955,"国大":4205,"在台":652," 公里":1220,"国家":4191,"在不":766,"在丈":1143,"在丁":899,"地不":1517,"在中":1500,"内,":588,"国国":1280,"因为":1284,"团体":692,"国丁":732,"国丈":695,"国不":1345,"国人":990,"培。":2913,"公园":1063,"全国":1074,"共和":2100,"共同":711,"军事":699,"公司":6085,"和 ":1051,"克斯":708,"分丁":674,"分为":837," 丈丈":703,"及 ":757,"内信":885,"其他":1807,"内丁":823,"公共":779,"其中":1933,"俄罗":1205,"交通":1010,"代表":1920,"使用":3270,"使理":2624,"使的":2300,"何的":1007,"之间":1601,"作用":691,"何用":717,"传统":899,"作的":824,"何能":734,"九龙":820,"于香":923,"会议":766,"人闻":930,"体育":809,"使究":2862,"不部":1509,"代的":607,"人类":940,"使本":654,"丁近":677,"举行":1293,"主要":4270,"于美":614,"一部":1591,"丈车":720,"他的":941,"专辑":636,"信息":699,"保护":898,"亚言":1720,"休画":658,"休的":1686,"会的":752,"不面":1014,"及丈":729,"及中":2186,"下,":946,"上,":1461,"丈(":2586,"丈,":5165,"参与":744,"半岛":715,"不,":1460,"一,":2377,"丁)":1039,"丁,":4237,"丁(":1706,"包括":3325,"参加":688,"动画":653,"中,":3371,"及其":661,"业,":586,"原名":724,"动物":1359,"名为":1425,"乘,":704,"各亚":724,"又名":874,"印度":2180,"可以":2687,"台不":1436,"乱,":730,"名使":755,"合作":595,"包含":732,"单位":907,"员。":605,"华人":1471,"协会":1039,"南不":934,"公路":1194,"区域":737," 世纪":1447,"化学":839,"分类":693,"利用":693,"医学":647,"历使":2363,"创立":811,"公里":2086,"前尚":2725,"内的":698,"关系":1044,"名。":910,"创作":725,"分别":959,"利亚":1729,"共有":696,"制作":1175,"创办":644,"具有":1055,"区、":855,"区。":806," 中国":719,"南、":2847,"分子":582,"分布":3839,"全球":752,"加乘":908,"州事":599,"工业":864,"工作":1348,"实际":605,"巴乘":819,"属的":3366,"学院":1919,"度、":646,"帝国":1098,"工引":2908,"平不":1296,"山谷":602,"并丁":847,"年代":1820,"广东":1877,"广乘":652,"布在":1655,"山西":700,"小都":652,"布于":1877,"建、":593,"巴使":874,"小说":1496,"、陕西":793,"场,":659,"尚未":2839,"定的":781,"州、":1204,"地,":4149,"川、":1254,"它的":633,"山坡":1678,"学的":914,"国,":704,"学生":932,"小的":1375,"德·":778,"就是":947,"家的":1013,"应用":955," 年 ":13492,"形式":842,"形成":784,"德国":1532,"或 ":819,"影响":1080,"德亚":680,"希腊":1007,"年的":1358,"广西":1228,"建立":1106,"当时":1404,"年至":809,"拔 
":3155,"外,":1036,"广场":674,"广州":765,"建于":690,"建丁":1787,"年在":775,"处,":614,"开始":2055,"工程":1106,"已经":808,"常生":724,"引亚":2921,"广播":596,"开发":1428,"大学":4751,"名:":4034,"后,":2099,"名,":804,"大战":639,"号,":798,"司,":904,"司(":706,"太平":714,"员,":1310,"委员":1628,"多生":1267,"处理":904,"国立":626,"动,":770,"基本":664,"境内":659,"声任":875,"区,":4940,"基础":690,"国际":3114,"大利":1318,"大使":1012,"家 ":943,"天休":592,"天主":779,"大丈":869,"学使":616,"大陆":4400,"它们":581,"安丁":901,"学名":3799,"定义":596,"存在":833,"岛、":590,"家亚":645,"家丈":713,"家丁":849,"学家":1872,"对于":704,"年 ":13993,"小丈":990,"家国":1673,"尼亚":1342,"学校":1549,"它是":712,"家律":640,"属于":2022,"宗教":606,"山东":748,"大的":1819,"品,":659,"未由人":2713,"学、":1095,"媒体":593,"家、":1079,"家。":1319,"学中":593,"大部":622,"是由":1948,"最大":1527,"成,":1007,"最早":864,"是美":1276,"日至":621,"林中":937,"林下":866,"林丈":747,"是日":749,"最丈":683,"教育":1765,"斯特":879,"是指":1893,"最后":916,"服务":2010,"时的":619," 日 ":1067,"有一":1032,"有丈":634,"有关":606,"栽培":2968,"曾经":583," 月 ":10460,"期的":814,"未由":2714,"有的":742,"来的":817,"是香":1732," 年,":2603," 年)":2400,"机构":1217,"有植":939,"有时":593,"时间":1469,"标何":1254,"故事":840,"教丁":619,"教会":604,"教休":765,"文使":804,"、湖不":767,"斯丁":803,"文化":2111,"、江西":640,"式,":710,"是台":693,"是在":1454,"早期":592,"时期":1763,"日本":5547,"数学":761,"、湖南":720,"名:)":3257,"时代":1179,"斯坦":867,"文学":1074,"数据":868,"是位":626,"是以":583,"是中":2851,"日在":675,"是丁":1437,"是一":8458,"是不":724,"是丈":2026,"是 ":2352,"推不":777," 年的":914,"或称":642,"成立":2637,"技术":1644,"成的":1249,"日 ":1181,"斯·":1031,"所有":1061,"拉丁":1110,"委员会":1211,"投资":588,"月 ":10522,"有 ":2350,"年(":1135,"年)":2514,"年,":3020," 年至":781,"提不":749,"提供":1571,"控制":687,"或者":862,"拥有":1075,"家(":883,"家,":2992,"学(":696,"学,":1273,"子,":932,"成。":674,"年间":605," 年在":695," 年代":1447," 平不":1027,"所以":844,"成员":1194,"属(":1286,"成为":2403,"战争":1282,"成何":673,"车站,":720,"总统":750,"小,":1060,"流行":630," 日)":775," 日,":1367,"没有":1287,"活动":1238,"比赛":1111,"江苏":807,"江西":972,"湖不":1117,"湖南":1294,"源于":655,"游戏":1924,"江、":1010,"斯(":605,"文:":2387,"毕业":667,"时,":1279,"死关":792,"日,":1548,"日)":813," 日至":616,"、广西":861,"民国":1569,"民主":899,"欧洲":1589,"民共":1266,"正式":1275,"河南":853,"月,":734,"民族":1066,"期,":613,"河不":810,"称为 ":1204,"根据":1261,"来自":905,"朝鲜":1221,"最高":965,"期间":1013,"有限":1519,"植物":4924," 是一":701,"概念":628," 日在":655,"社会":2250,"的西":629,"的重":753,"第 ":1806,"的第":1619,"物,":2113,"目的":1041,"的是":1254,"的最":739,"的植":2910,"称 ":1447,"积 ":789,"生长":4500,"直接":612,"的特":1539,"的电":996,"的小":1525,"的家":638,"电脑":1011,"的大":1230,"的国":1201,"的基":581,"的地":4246,"电视":2674,"目前":4402,"的丁":3401,"的一":9462,"用的":1236,"生的":842,"球队":888,"生物":1143,"甘肃":1053,"的名":795,"的发":604,"的休":1722,"的使":1923,"的何":678,"的作":618,"的主":1363,"的中":1197,"的丈":5079,"的不":2462,"的乱":628,"的乘":866,"的人":1919,"的交":1075,"的亚":598,"的份":596,"用来":773,"电影":2044,"理论":1071,"生活":843,"生在":746,"电子":1067,"的。":902,"的《":764,"由人":3013,"由于":1683,"用于":1226,"生于":2477,"生产":971,"理学":1070,"王朝":657,"独立":1054,"现在":1080,"环境":892,"的 ":3036,"用。":732,"现代":899,"班丁":801,"物理":701,"特有":1087,"由 ":647,"特别":1034,"澳门":921,"、福建":587,"物。":3636,"而成":666,"网络":1319,"等,":840,"。由于":637,"结构":811,"称,":606,"统治":615,"美国":6262,"有限公":1342,"经济":1788,"线的":727,"统的":590,"群岛":661,"罗斯":1403,"组织":2012,"联休":1568,"联合":1420,"经营":801,"美洲":847,"站,":1417,"站(":605,"立,":948,"约丈":612,"系统":2897,"、甘肃":804,"结丁":584,"纪念":701,"缩写":612,"至 ":5691,"肃、":800,"自 ":728,"组成":1458,"米的":3087,"米至":2745,"空间":581,"简称":3399,"目,":621,"系使":1617,"等地":3350,"的,":1232,"算机":592,"约 
":1747,"天主教":743,"立的":1250,"立于":1279,"生,":616,"用,":772,"科技":622,"第三":1132,"第一":3441,"第二":2122,"科丈":1055,"等。":1256,"科学":1716,"福建":1133,"称为":4237,"站。":627,"角色":608,"认为":1515,"西部":624,"计份":1242,"译为":652,"西班":773,"要的":1104,"视台":583,"被称":1048,"设计":1764,"越南":953,"贵州":1074,"计算":1077,"设立":673,"赛事":631,"过 ":590,"达 ":635,"许多":1084,"这亚":795,"这些":921,"这个":1485,"运动":2493,"选举":733,"超过":649,"软交":1060,"路线":871,"负责":846,"资讯":628,"足球":1709,"行,":1162,"资料":726,"都是":704,"说,":693,"进行":2271,"华民国":1215,"过程":791,"部分":2341,"部份":779,"车站":2410,"通常":1655,"连接":661,"系,":584,"英亚":2018,"自治":726,"自然":791,"英国":2374,"艺术":1217,"自由":876,"般生":684,"航空":1141,"草地":757," 米的":3008,"罗马":1148,"自丁":786," 米至":2742,"米,":928,"联赛":872,"者,":759,"节目":923,"英文":2834,"苏联":584,"获得":1131,"线,":677,"著名":1735,"是台湾":602,"虽然":581,"行。":771,"规份":679,"西南":748,"行的":1255,"西份":1008,"西亚":1199,"西不":1211,"行为":642,"行不":2204,"西、":2735,"中,目":813,"马来":712,"香港":8632,"高丁":607,"间,":1121,"队,":590,"高速":865," 香港":632,"及中国":2044,"公里,":1108,"是一个":2236,"是一亚":2252,"是位于":587,"丈(学":778,"是中国":2142,"部的":1067,"通过":1270,"赛,":689,"长 ":666,"路,":847,"重要":1745,"的第一":601,"铁家":726,"重要的":693,"长于":3283,"长在":1116,"部,":1121,"除了":587,"限公":1343,"阿亚":611,"间的":1117,"阿拉":760,"铁路":1902,"银行":879,"陆的":3096,"里,":1518,"陕西":1134,"音乐":1761,"面积":1528,"问题":913,"非洲":689,"领域":706,"需要":625,"项目":737,"有植物":938,"的特有":970,"华人民":1156,"是美国":1183,"生长在":1097,"的植物":2905,"生长于":3254,"日至 ":606,"前尚未":2719,"最大的":950,"加乘大":716,"是日本":698,"目前尚":2723,"的地区":3123,"甘肃、":797,"的一部":774,"的主要":669,"年),":737,"的一亚":1477,"的一个":3092,"限公司":1342,"年( ":710,"长于不":2913," )是":697," ,是":1072,"南等地":586,"),":14139,"()":3758,"(,":1176,",)":1561,":,":733,":)":4083,"由人工":2914,",有":1486,",最":1122,",曾":1063,",是":13170,")是":10848,",此":660,",总":746,",当":886,",并":4017,",常":1051,",故":644,",指":584,",所":997,",或":1328,",成":1293,")的":2201,",目":3648,",第":777,",简":2029,",现":1341,",由":3911,",用":638,",生":3542,",西":833,",被":862,",该":1018,",经":598,",美":598,",而":3223,",英":974,"(英":2411,",香":721,",这":1581,",通":884,",《":690,")。":3713,")、":2880,"(丁":582,"(学":3658,",它":1417,",属":841,",小":789,",因":2416,",在":4524,",多":1573,",大":803,",如":965,",前":662,",分":865,",南":723,",即":1029,",包":1287,")和":952,",后":1255,",同":895,",台":750,",可":1101,",又":2947,",原":1270,",与":1258,",不":2776,",东":861,",中":1697,",主":1658,",为":3616,"(今":666,",乘":1464,",也":3256,",乱":593,"(丈":745,")为":2108,",丈":5138,",丁":4254,",一":2351,",休":1379,",但":3353,",位":2756,",使":1942,",交":917,",亦":1195,",于":2786,",人":883,",以":3498,",他":1341,",从":880,",份":844,",共":756,",其":3552,",全":1258,"( ":4029,") ":1194,", ":8207,"- ":821,": ":1177,"于台湾":746,"云南、":1087,"丁属(":793,"中国的":1325,"中国大":3854,"之一,":2267,"不拔 ":3152,"于不拔":2931,"于中国":1363,"丈属的":710,"人工引":2908," ),":840,"乘江、":593,"特有植":935,"位于香":588,"于日本":718,"交大利":915,"以及中":2006,"一般生":684,"英文:":1509,"亚栽培":2910,"陕西、":802,"人民共":1261,"广西、":800,":)为":1432,":)是":1905,"),又":980,"),是":2411,"()是":1113,",),":1007,"著名的":779,"(),":1527,"年至 ":713,"面积 ":773,"于香港":916,"之间的":629,"内丁使":615,"成立于":917,"英亚:":1340,"俄罗斯":1205,",香港":708,"物。分":2814,"公司(":690,"公司,":865,"行不区":1022,"共和国":1949,"分布于":1823,"分布在":1639,"》、《":946,")是丈":621,")是一":2670,",是一":2180,",是中":1286,"湖南、":689,"あああ":1321,"尚未由":2713,"地,生":2896,",并丁":652,",常生":669,",所以":699,",又称":1386,",台湾":584,",因此":1152,",多生":1256,"(学名":3651,"。分布":2922,"、印度":793,"、乘江":625,"、云南":1182,"被称为":866,"、山坡":669,"、广东":743,"、四川":1251,"大陆的":3080,"アアア":1275,"学名:":3527,"(英文":1335,"(英亚":1020,",目前":3438,",生长":2942,"西班丁":773,",简称":1929,"工引亚":2908,"平不公":791,"贵州、":836,"广东、":761,"引亚栽":2908,",包括":1127,"米的地":2908,",又名":585,",在 
":658,",其中":943,",以及":1003,",也是":1477,",位于":2555,",一般":1217,",主要":1207,",为中":658,",中国":911,"属的植":2824,",于 ":2118,"丁使、":641,"丈丈丁":631,"计算机":592,"江西、":618,"湖不、":763,"不同的":722,"米至 ":2743,"之一。":1753,"中华民":1237,"中华人":1156,"为中国":948,"不公里":767,"丈丈,":642},"n_words":[4792118,1709982,314544],"name":"zh-cn"}
\ No newline at end of file
+++ /dev/null
-{"freq":{"·":11773,"é":695,"區,常":664,"и":659,"о":642,"а":705," 《":2780," 。":1867," 、":1053,"あ":2229,"。":93215,"、":80530,"》":12775,"《":12789,"」":16776,"「":16978,"ア":2133,"九":2518,"也":8240,"乘":27365,"之":17356,"久":750,"主":15490,"丼":1230,"中":47219,"並":6623,"丞":85837,"丕":96789,"世":9230,"丈":143774,"三":7943,"上":15307,"下":7965,"不":69580,"一":51222,"丁":112603,"七":1691,"份":8990,"任":5556,"以":21889,"令":1163,"代":10588,"他":7458,"仙":669,"仍":884,"今":2840,"人":38053,"亡":666,"亦":2741,"交":18627,"京":3334,"五":3430,"些":2759,"亞":10406,"了":8549,"予":591,"事":19252,"二":6375,"使":59515,"低":1174,"住":1250,"位":13512,"但":4206,"作":14816,"何":16276,"佐":33991,"伯":2485,"伊":2316,"企":1149,"休":48236,"信":7776,"俄":2093,"係":1210,"保":3393,"來":9528,"供":2206," 分":709,"單":3071,"問":1448," 公":2117,"商":3624,"員":7098,"哥":1699,"品":4392,"和":23261,"周":1446,"呼":664,"命":2330,"含":1432,"名":20323,"同":8443,"吉":2089,"合":7724," 倫":1516,"各":3613,"向":3031,"域":2612,"城":5349,"執":1078,"培":3458,"基":6608," 名":633,"址":721,"坦":1100,"坡":2688,"團":4364,"土":2148,"園":2775,"圖":2763,"國":50470,"地":26714,"在":35708,"回":1107,"四":5909,"因":6124,"嚴":592," 勞":1513,"器":3287,"冰":646,"再":1221," 丞":2854," 丕":3070," 世":1592," 丈":4439," 不":917," 丁":3186,"優":845,"共":7054,"其":12215,"具":2485,"入":4421,"內":8167,"兩":4796,"全":7445,"八":1932,"六":2118,"公":17482,"兒":1569,"元":4231,"克":7487,"光":2846,"先":2097,"免":625,"傳":4645,"價":1006,"倫":51626,"個":19430,"們":2429,"停":661,"原":6791,"去":1179,"受":2509,"取":2465,"反":2477,"及":17959,"友":732,"又":5332,"參":2854,"司":7145,"台":11564,"可":7754,"只":1856," 個":2397,"口":4174,"化":7454,"包":4825," 佐":611," 位":594,"南":15370,"協":2646," 使":1198,"博":1769,"升":830,"千":1228,"十":5341,"區":18522,"半":2039,"卷":655,"印":3241,"即":2842,"卡":3151,"劃":2056,"劇":3028,"力":4955," 人":1795,"助":1406,"加":7145,"勞":44204,"務":5038,"動":10173," 休":691,"分":15261," 中":1418,"初":2396,"別":3548,"利":8617,"到":6362,"制":3819," 乘":601,"則":3204,"前":10854,"副":861,"創":4236," 大":752,"工":9304,"已":3034,"巴":5247,"州":7520,"川":3027,"山":12644,"屬":10111,"展":3787,"屋":1099,"局":2380,"居":1946,"尼":4905,"就":2895,"尚":3522,"少":2442,"對":12131,"小":25320,"導":2988,"專":3315,"將":4225,"島":4996," 多":609,"岸":1457,"岩":955,"彈":1036,"形":4081,"役":939,"影":4543,"式":7142,"引":4806,"張":2155,"強":1613," 小":712,"念":1686,"往":1561,"律":1728,"後":11603,"得":4825,"從":3819,"德":8443,"幹":610,"年":47473,"平":5943,"帝":2671,"帕":627,"希":2358,"布":6774,"常":6175,"帶":2450,"師":2700,"席":1237,"延":769,"建":8865,"廣":7680,"廠":829,"底":1070,"店":1431,"康":1164,"度":6845,"座":2605,"大":34205,"央":1335,"天":6785,"夫":2310,"太":3614,"失":939,"外":5866,"多":11013,"奧":3348,"女":3636,"好":1255,"如":3939,"始":3424,"委":2437,"場":5840,"報":2620," 和":619,"增":971,"境":2254," 在":1006,"子":9582,"存":2418,"學":28894,"安":5183,"它":4320,"定":6314,"宗":2121,"宮":11665,"客":2255,"宣":1095,"家":33259,"富":1263,"密":1751,"察":902,"寫":2568,"實":4042,"威":2163,"媒":887,"更":2088,"曲":2693,"曾":3297,"書":4421,"是":61135,"映":606,"春":891,"星":4383,"易":1898,"普":2726,"智":821,"時":14789,"晚":728,"架":926,"林":7015,"果":2193,"查":1544,"未":4199,"木":1871,"本":13516,"望":1009,"朝":3760,"期":7068,"朗":950,"月":15722,"有":25006,"服":2638,"最":10258,"會":16872,"松":993,"東":12692,"村":1739,"李":1578,"播":2488,"擊":1563,"據":2801,"·丁":953,"·丕":684,"·丈":742,"料":1908,"文":14926,"於":85512,"施":1293,"斯":13761,"旁":958,"旅":1333,"族":3698," 或":592,"日":21067,"早":2137,"放":2161,"改":3194,"收":1971,"支":2879,"教":9547,"故":1906,"數":7013,"整":1506,"括":3403,"拉":7094,"拔":3534,"技":2726,"抗":915,"投":1689,"承":1001,"批":886,"所":11497,"手":3280," 年":38814," 
平":1137,"打":1356,"戰":6668,"戲":2673,"房":1031,"我":858,"成":16686,"或":10999,"提":4217,"推":2160,"控":1213,"接":3506,"排":1256,"持":2131,"指":5892,"情":1951,"息":1210,"應":3488,"感":1000,"愛":2219,"游":601,"測":1417,"港":10903,"清":2633,"湖":3859,"源":2955,"滿":823,"漢":2629,"演":3758,"澳":2170,"濟":2246,"民":10942,"水":5379,"氣":1960,"江":5858,"求":1019,"決":1728,"沒":1469,"沙":2360,"河":5480,"油":1041,"治":5045,"波":2725,"派":1934,"活":2827,"洲":5278,"流":4541,"消":1003,"深":1422,"機":7988,"樓":1698,"標":3283,"樂":5292," 是":2270," 月":13683,"武":2208,"此":5462,"止":1051,"正":4102,"歷":3537,"歲":679,"死":6673,"歌":2412,"歐":2776,"次":4908,"款":922,"權":2775,"母":1810,"比":4792,"毒":761,"殺":3118,"案":1653,"栽":2979,"根":2237,"核":1380,"校":3603,"條":3385,"楊":679,"業":8348,"植":5169,"構":3354," 日":10789,"概":795,"石":3331,"知":2143,"省":5241,"眾":1402,"皇":2166,"的":145617,"目":9288,"直":3002,"發":12643,"白":2538,"百":1845,"登":1101,"病":1342,"町":605,"甸":897,"田":1992,"由":17477,"用":13939,"產":5389,"生":19162,"甘":1400,"當":5199,"畫":2821,"略":871,"留":781,"界":6587,"環":2430,"理":9565,"球":7020,"區,一":683,"現":8423,"班":1762,"王":5328,"獎":2259,"獲":1989,"片":2524,"牌":1136,"物":13950,"特":9797,"爭":2331,"爾":10487,"營":2531,"照":1001,"然":2724,"無":3430,"灣":8336,"火":1815,"置":1766,"署":979,"羅":6361,"美":10443,"群":2244,"義":5695,"習":841,"總":5592,"縣":5350,"繼":1038,"續":1407,"索":1368,"素":1525,"納":2465,"約":5697,"紀":4111,"級":4056,"統":6648,"組":5814,"結":3076,"綠":801,"維":3676,"網":4429,"經":8789,"綜":642,"編":2471,"線":7038,"簡":4320,"米":9228,"系":6773,"等":12258,"策":935,"第":11129,"篇":721,"節":2370,"算":1983,"積":2283,"究":2927,"空":4143," 的":1486,"程":4595,"稱":17006,"種":13270,"站":6147,"立":9659,"競":659,"童":907,"突":731,"票":889,"神":3459,"區,多":1245,"社":4421,"示":1588,"移":1020,"私":736,"科":10232,"福":3334,"破":649,"要":8322,"規":2224,"視":4488,"親":1395,"觀":2300,"角":3003,"解":2370,"西":17796,"被":7902,"製":3213," 萬":1397,"衛":1641,"街":2230,"術":4294,"行":14995,"表":5436,"變":2770,"譯":3201,"警":737,"議":3043,"護":1664,"證":1490,"調":1479,"說":3940,"語":12000,"認":2740,"論":3370," 號":1554,"設":6262,"記":2497,"計":5475,"訊":1564,"言":2973,"該":4761,"話":1875,"評":1093,"路":11367,"越":2102,"超":1905,"足":2739,"起":4291,"賓":786,"資":4396,"賽":6003,"質":2688,"費":1408,"貨":906,"責":1207,"象":1864,"谷":1752,"近":3107,"辦":2818,"農":1197,"轉":1925,"較":2015,"載":1405,"車":6562,"身":2869,"致":1208,"至":9837,"自":8751,"而":9484,"者":7426,"聞":13745,"聖":2663," 米":6617,"聯":14082,"聲":1069,"肅":1122,"股":1263,"育":2954,"能":5599,"草":3974,"菲":888,"華":6888,"般":2564,"航":2033,"興":1795,"舉":3075,"與":14328,"艦":940,"艾":758,"色":3577,"花":3113,"英":9681,"藝":2269,"藥":973,"葉":1953,"著":4637,"萬":2757,"落":1486,"蒂":673,"處":4121,"號":5640,"蘭":5557,"蘇":3088," 英":759,"風":2494,"食":1389,"飛":1667,"領":2927,"預":955,"項":2370,"類":5147,"馬":6983,"香":9518,"館":2367,"體":9444,"高":8889," 阿":586," 香":621,"點":3460,"黑":1831,"黃":2289,"黨":2390,"龍":3877,"連":2837,"造":2926,"進":5235,"送":620,"這":6954,"通":7591,"速":2179,"遺":895,"選":3181,"過":6349,"運":6466,"遊":3024,"達":4169,"還":1604,"邊":2782,"郡":912,"部":14529,"都":5256,"配":1052,"醫":1928,"金":5699,"野":1096,"量":3732,"里":6615,"重":5591,"銀":1423,"錄":1696,"鎮":2372,"鐵":4592,"間":8271,"開":7281,"門":4672,"降":626,"限":2369,"院":5168,"除":1568,"陳":1215,"陸":5523,"陽":2042,"防":1167,"阿":4360,"離":1505,"難":627,"電":10735,"雲":2700,"隊":3986,"際":4571,"青":2396,"非":3371,"面":6432,"需":1196,"響":1404,"音":4823,"가":589,")":53309,"(":53597,"-":1381,",":211682,":":15729,";":5850," (":1546," )":2103," ,":4859,"車站,":720,"國的特":944,"植物。":2875,"植物,":1199,"基丁教":662,"等地,":2994,"民共和":1261,"。 ":3480,"、 ":1427,"」 ":591,"》 
":624,"國大陸":3816,"地區,":3519,"。這":1771,"、雲":1235,"」的":1264,"、福":680,"。現":601,"、甘":835,"。由":929,"、西":1055,"。該":1503,"四川、":1228,"、廣":1717,"、山":2030,"、安":622,"。它":1379,"、四":1327,"。在":1916,"、台":622,"。分":2945,"、勞":792,"、印":867,"、倫":1945,"。其":1173,"、湖":1562,"、河":1284,"、江":1104,"。此":759,"》是":802,"、日":634,"ああ":1707,"、《":1196,"、「":974,"》、":1063,"」、":966,"」。":2150,"》中":594,"「丕":727,"「丞":592,"《丈":880,"、休":613,"。他":1366,"「丈":1131,"《丕":588,"。乘":718,"、人":863,"、丞":3044,"。丁":930,"、丕":3444,"、不":1334,"、丁":2563,"、丈":4244,"、乘":1329,"。丞":586,"。丕":1165,"、中":959,"。丈":1017,"アア":1643,"地區的":687,"」(":1271,"」,":3388,"」)":800,"》(":2216,"》,":1818,"在中國":1179,"立於 ":1061,"栽培。":2896,"是香港":1720,"休 ":665,"使 ":756,"不、":2094,"不。":739,"丈」":981,"丈》":585,"丈、":2906,"丈。":2027,"丁、":2136,"丁。":1921,"一。":1804,"丞、":1947,"丞。":1850,"丕」":693,"丕。":1789,"丕、":2245,"中、":800,"人。":1182,"事。":826,"交。":660,"亞、":852,"倫 ":745,"丈山":729,"丈屬":937,"不家":1629,"丈小":1088,"丈家":1181,"丁屬":1243,"中國":13565,"丈學":945,"丈子":859,"一家":831,"丕大":655,"乘倫":1335,"丈大":605,"丕地":870,"丁大":588,"乘丁":705,"乘一":736,"佐。":1221,"乘丈":663,"佐、":761,"丈地":748,"丞和":584,"之一":4708,"丁國":1032,"丞勞":850,"丕勞":1045,"丈和":615,"不同":2051,"不勞":723,"中使":2667,"不區":1235,"中一":646,"丞使":1157,"丞倫":1278,"丈勞":1514,"丞人":604,"丈克":1050,"丈其":690,"丕使":877,"一勞":758,"丞佐":1297,"不公":954,"丁勞":1850,"並丁":839,"丕倫":1815,"丞休":1067,"丈倫":1781,"不使":1931,"休。":1287,"休、":1097,"丁克":614,"丞丈":3784,"丞不":839,"丞丁":2627,"丞丕":3712,"丕休":1238,"丞丞":2873,"丕佐":865,"不交":3733,"不京":1465,"不事":953,"丈休":1744,"一個":9748,"丈使":1114,"丈佐":1124,"丁倫":2045,"丕丞":2819,"丕丕":4439,"不休":656,"丕不":779,"丕丈":4929,"丕丁":2910,"丕乘":689,"丈不":986,"丈丈":10533,"丈丕":5787,"丈丞":4591,"丁休":2124,"丈之":678,"上不":1486,"一位":1048,"丈乘":1046,"丁佐":630,"丈事":641,"丁使":3115,"不丈":1385,"不丁":1326,"不丕":1047,"不不":1140,"不丞":1080,"一丈":689,"丁丕":3008,"丁丞":3158,"丁丈":5062,"丁丁":3771,"丁不":1124,"丁乘":842,"一些":1051,"份、":614,"丈一":1125,"丈丁":5951,"丕樂":705,"主教":1120,"任何":653,"丞業":742,"中文":1114,"人口":1812,"一次":950,"丈林":808,"一條":1013,"之後":1287,"種栽培":2894,"丕於":989,"他們":991,"事場":701,"丞於":627,"丈於":1045,"人使":893,"丁是":707,"丈是":753,"不拔":3356,"二十":705,"人丁":915,"人丈":695,"主席":690,"丁教":802,"丁斯":911,"丁於":991,"丁文":610,"一所":718,"交休":759,"乘大":1193,"事務":628,"不式":1114,"中學":1267,"交不":718,"使。":1730,"使、":1391,"丁店":631,"一座":888,"中央":1281,"丕小":677,"丕屬":955,"事交":1241,"丞家":1096,"丞寫":642,"佐勞":651,"中的":2868,"休如":836,"丈科":1482,"一種":4747,"丞的":2226,"份年":660,"使休":786,"使代":599,"使中":614,"二次":594,"使丈":1261,"使上":902,"使丁":966,"使丞":904,"使丕":680,"下的":914,"世界":3842,"上的":1685,"丈的":3553,"乘江":1087,"佐丞":598,"佐丕":890,"丁的":2951,"佐不":741,"佐丈":1040,"佐丁":1316,"不生":995,"交於":631,"丕的":2516,"不的":1824,"倫。":904,"倫、":1101,"丈爾":927,"丕然":602,"丁爾":1156,"也有":714,"不現":1191,"人工":3084,"丈球":618,"休丈":1588,"休丁":1289,"休丕":1088,"休丞":1470,"以及":5705,"不治":1653,"休倫":752,"也是":2445,"乘斯":612,"休佐":739,"休休":1180,"休使":949,"丁語":647,"交的":662,"何成":648,"丕西":1266,"但是":741,"人的":1304,"丕語":843,"主義":1789,"丁蘭":617,"事的":883,"人物":959,"企業":965,"丁西":590,"中華":3016,"一般":2431,"不聯":718,"人民":2356,"丈至":635,"丈草":589,"使團":1465,"前 ":869,"作家":983,"作品":1437,"使倫":938,"世紀":2133,"亞洲":978,"丁聯":1080,"乘的":932,"到 ":1125,"丞 ":1327,"丕 ":1248,"稱於「":1031,"丈 ":1426,"丈·":665,"丁 ":1214,"丁·":957,"地、":843,"商業":792,"國、":913,"倫,":2201,"又譯":655,"問宮":915,"單位":908,"員會":1240,"可能":966,"南部":1208,"名稱":1366,"參與":743,"各種":710,"名的":1486,"同的":917,"又稱":2281,"使(":775,"使,":3019,"位,":692,"佐,":2446,"台灣":5826,"何,":895,"作,":878,"只有":589,"南等":606,"休(":898,"休,":2604,"同時":1361,"和國":2079,"名於":1540,"在 
":4228,"命名":845,"和丕":658,"和丞":617,"區的":1565,"創辦":644,"和丈":1045,"和丁":617,"前身":709,"交,":1096,"人,":2364,"反應":687,"分類":693,"化的":604,"事,":1473,"勞立":1122,"國王":742,"城事":1733,"因此":1472,"國民":1001,"基丁":1219,"基丞":778,"因於":1311,"多生聞":623,"分,":730,"國大":4188,"四川":1895,"地區":6934,"國國":1282,"在台":591,"國家":4180," 公里":1210,"國丁":723,"國丈":683,"國不":1346,"國人":979,"內,":588,"在丕":590,"在不":766,"在丈":1169,"在丁":857,"地不":1518,"在中":1499,"培。":2897,"保護":897,"公園":1063,"全國":1075,"共和":2094,"共同":713,"公司":6083,"和 ":817,"分丁":734,"勞。":848,"勞、":713,"使蘭":928,"佐責":846,"倫爾":803,"及 ":587,"來自":904,"人類":938,"其他":1807,"公共":721,"倫用":717,"內信":886,"其中":1932,"兩個":962,"倫的":1758,"俄羅":1193,"內丁":817,"交通":965,"代表":1917,"使用":3276,"使理":2623,"來的":817,"使的":2298,"何的":820,"佐的":1260,"作用":690,"之間":1600,"倫州":1179,"作的":823,"何能":735,"倫有":1307,"九龍":820,"倫斯":769,"使究":2861,"倫何":704,"不部":1510,"倫休":862,"代的":605,"倫倫":983,"倫克":676,"使於":674,"使本":654,"位於":6989,"個人":1069,"丁近":678,"作於":1594,"丈車":636,"主要":4266,"一部":1589,"他的":940,"倫丈":1719,"倫丁":1808,"倫丞":1219,"倫丕":1126,"倫中":896,"信息":690,"丈體":712,"倫多":1308,"倫勞":1159,"休畫":658,"人聞":943,"休的":1677,"不面":1013,"及丈":747,"及中":2170,"丕(":1556,"丕,":3869,"下,":946,"上,":1460,"丈(":2498,"丈,":5085,"半島":716,"不,":1457,"一,":2375,"丁)":1026,"丁,":3944,"丁(":1688,"包括":3326,"參加":686,"員。":603,"中,":3370,"及其":661,"創立":812,"原名":722,"丞,":4229,"丞(":1428,"乘,":704,"動物":1355,"又名":874,"勞灣":831,"印度":2180,"可以":2687,"台不":1162,"勞的":1695,"協會":1036,"名使":755,"動畫":650,"合作":595,"包含":733,"南不":932,"公路":1194,"化學":839,"於香港":1347,"利用":694," 世紀":1443,"區域":760,"公里":2070,"內的":698,"倫體":1121,"前尚":2709,"分於":856,"勞丞":1027,"勞休":691,"勞任":888,"勞不":751,"勞丈":1379,"勞丁":1206,"勞丕":1033,"勞倫":1202,"勞佐":918,"名。":909,"分別":960,"利亞":1460,"傳統":899,"共有":696,"具有":1054,"區。":805,"區、":855,"南、":2848,"創作":724,"分布":3537,"全球":780,"加乘":907,"場,":657,"州事":599,"工作":1351,"巴乘":819,"學院":1919,"屬的":3350,"度、":646,"帝國":1099,"工引":2892,"平不":1295,"山谷":592,"年代":1818,"布在":1588,"山西":699,"小都":651,"實際":604,"建、":593,"小說":1488,"巴使":882,"希丕":1033,"尚未":2823,"定的":781,"州、":1207,"地,":4121,"川、":1254,"、雲南":1181,"對於":1252,"它的":613,"學的":914,"山坡":1674,"國,":702,"學生":932,"小的":1375,"山東":747,"屬於":2028,"德·":729,"定義":598,"媒體":593,"就是":946,"家的":1003," 年 ":13455,"形式":845,"形成":781,"德國":1532,"後來":987,"廣播":596,"廣東":1877,"建於":727,"年的":1359,"建立":1105,"引種":2905,"年至":809,"拔 ":3138,"廣乘":652,"外,":1036,"工業":864,"布於":1775,"建佐":1764,"年在":773,"廣州":765,"工程":1106,"已經":809,"常生":722,"廣場":674,"團體":692,"大學":4750,"國際":3079,"名:":4017,"名,":805,"大戰":639,"司,":903,"司(":707,"太平":715,"委員":1628,"多生":1260,"動,":770,"勞,":1822,"國立":626,"國的":3067,"基本":664,"境內":659,"區,":4922,"大利":1318,"大使":1011,"家 ":671,"天休":592,"天主":779,"大丈":858,"大丕":898,"學中":593,"大陸":4384,"學使":615,"安丁":874,"存在":831,"學名":3781,"家丈":660,"家丁":836,"學家":1871,"島、":590,"年 ":13925,"小丕":726,"小丈":935,"家國":1674,"專倫":637,"對係":1024,"尼亞":1180,"學校":1548,"它是":687,"家律":639,"宗教":606,"山丞":856,"大的":1820,"品,":659,"學、":1095,"未由人":2697,"員,":1301,"太空":651,"家、":1074,"家。":1302,"大部":622,"東不":1591,"是由":1946,"有對":649,"於美":890,"最大":1529,"成,":1008,"最早":863,"是美":1276,"日至":621,"最後":917,"時的":620,"朝宮":1146,"林中":937,"林下":865,"林丈":744,"林丕":654,"東南":851,"是於":666,"是日":750,"最丈":682,"教育":1764,"斯特":828,"是指":1896,"服務":1801," 日 ":1059,"時期":1763,"有一":1032,"有丈":631,"栽培":2950,"時間":1470," 月 ":10453,"期的":813,"未由":2698,"有的":742,"東省":644,"是香":1731," 年,":2603," 
年)":2399,"有植":939,"有時":596,"會的":752,"於香":1366,"故事":841,"後,":2045,"教丁":618,"教休":765,"文使":808,"、湖不":767,"斯丁":773,"文化":2109,"、江西":640,"於丁":1931,"於一":1279,"於丈":3104,"於不":3971,"於丕":2015,"於丞":1568,"於中":3228,"於主":1138,"式,":713,"於「":3160,"是「":751,"是台":621,"時代":1179,"是在":1454,"東、":1223,"於是":782,"於日":1024,"早期":592,"日本":5548,"於倫":790,"、湖南":720,"名:)":3240,"於了":898,"於人":919,"斯坦":674,"文學":1074,"於台":1510,"數學":761,"於勞":777,"數據":647,"於山":668,"是位":626,"是丕":1220,"是丞":853,"是中":2852,"日在":676,"是丁":1400,"是一":8457,"是不":724,"是丈":2046,"教會":604,"是 ":1586,"推不":777," 年的":915,"或稱":642,"成立":2635,"戰爭":1266,"成的":1247,"日 ":1136,"於 ":12568,"斯·":1030,"所有":1063,"應用":957,"拉丁":1113,"成於":2501,"投資":589,"月 ":10505,"有 ":1922,"技術":1647,"年(":1135,"年)":2513,"年,":3020," 年至":781,"提不":748,"提供":1573,"控制":697,"或者":865,"宮,":689,"家(":883,"家,":2956,"學(":695,"學,":1273,"子,":930,"成。":673,"年間":605,"廣西":1228," 年在":693,"屬(":1280," 年代":1445," 平不":1026,"成員":1192,"所以":843,"成何":668,"影響":1080,"小,":1059,"委員會":1211,"流行":628,"業,":586," 日)":775," 日,":1367,"源於":682,"爾·":762,"沒有":1285,"活動":1238,"比賽":1111,"江蘇":807,"江西":972,"湖不":1117,"湖南":1294,"江、":1010,"斯(":607,"文:":2388,"機構":1217,"死對":1361,"、廣東":743,"日,":1544,"日)":813,"歷使":2356," 日至":616,"民國":1569,"歐洲":1588,"民主":899,"民共":1266,"正式":1274,"時,":1278,"河南":853,"月,":734,"、廣西":861,"會,":853,"會(":980,"民族":1066,"期,":613,"河不":810,"根據":1260,"會議":766,"最高":966,"期間":1013,"有限":1519,"標佐":1052,"植物":4908," 是一":681,"概念":627," 日在":656,"的電":999,"稱「":1043,"的西":633,"發行":1127,"的重":753,"第 ":1807,"的發":596,"的第":1621,"物,":2112,"目的":1040,"的是":1255,"的最":739,"的植":2894,"直接":612,"發現":996,"的特":1537,"積 ":785,"發生":1316,"的對":648,"的小":1527,"的家":632,"發展":2338,"的大":1229,"的國":1195,"的地":4227,"目前":4386,"的丁":3310,"的一":9461,"用的":1238,"生的":842,"產生":949,"生產":964,"球隊":888,"生物":1143,"生聞":4477,"甘肅":1049,"的名":796,"的勞":1368,"的倫":1653,"的休":1719,"的使":1916,"的佐":1046,"的作":651,"的主":1361,"的中":1197,"的丕":3244,"的丞":2994,"的丈":5094,"的不":2456,"的乘":747,"的人":1899,"的交":1021,"生於":2502,"用於":1261,"由於":1686,"理論":1080,"當時":1405,"生活":843,"生在":746,"產品":771,"的。":900,"的《":764,"的「":1273,"環境":892,"由人":2997,"用來":777,"理學":1070,"現在":1079,"王朝":657,"現代":897,"獲得":1128,"的 ":1662,"用。":731,"班丁":801,"物理":701,"特有":1087,"特別":1033,"澳門":921,"、福建":587,"物。":3616,"聯合":1423,"而成":666,"聞在":1114,"等,":840,"聞於":3447,"肅、":797,"網路":1657,"總統":750,"稱,":608,"種,":743,"美國":6254,"有限公":1342,"義大":928,"群島":661,"羅斯":1391,"聯事":701,"美洲":846,"站,":1417,"站(":605,"聯休":1628,"立,":947,"經濟":1788,"統的":590,"經營":801,"、甘肅":803,"線的":712,"至 ":5663,"自 ":699,"紀念":700,"簡稱":3399,"組成":1456,"。由於":638,"聞 ":661,"米的":3024,"結構":812,"米至":2731,"系統":2899,"統治":615,"目,":621,"結丁":586,"組佐":2037,"約丈":598,"系使":1485,"節目":923,"立於":1349,"等地":3334,"的,":1232,"天主教":743,"立的":1251,"生,":616,"用,":772,"程式":650,"科技":622,"約 ":1720,"第三":1132,"第一":3441,"第二":2120,"稱於":4215,"種栽":2894,"科丈":1047,"等。":1259,"科學":1716,"福建":1133,"社會":2251,"站。":627,"角色":606,"計算":1063,"設立":673,"西部":622,"製造":737,"認於":1497,"西班":773,"要的":1104,"被稱":1048,"計劃":875,"越南":953,"資料":957,"賽事":631,"語言":1720,"設計":1777,"號,":798,"處,":614,"譯於":650,"超過":651,"路線":849,"足球":1703,"資訊":664,"行,":1162,"醫學":647,"進行":2272,"都是":704,"過程":791,"這種":792,"部分":2342,"部份":776,"遊戲":1924,"連接":662,"通常":1653,"運動":2496,"語:":3929,"說,":690,"車站":2410,"這個":1488,"這些":923,"於日本":917,"自治":728,"自然":792,"英國":2374,"自由":877,"般生":678,"線,":642,"航空":1143,"草地":752," 米的":2958,"自丁":786,"羅馬":1149,"與丈":666," 米至":2728,"米,":753,"聯賽":916,"華民":1238,"英語":2016,"者,":759,"舉行":1293,"英文":2836,"舉辦":651,"華人":1471,"著名":1732,"蘇聯":599,"處理":911,"藝術":1217,"行。":771,"聞,":628,"行於":853,"製作":1162,"西南":747,"行的":1255,"西份":1005,"西亞":1235,"西不":1211,"行不":2204,"語 
":675,"西、":2735,"中,目":813,"是位於":586,"間,":1121,"是一種":2255,"香港":8631,"高丁":607,"體的":647,"隊,":588,"高速":865,"體育":810,"及中國":2028,"公里,":1100,"體,":795,"是一個":2235,"丈(學":765,"是中國":2143,"於台灣":1145,"選舉":733,"通過":1270,"部的":1066,"賽,":689,"路,":930,"重要":1746,"銀行":879,"鐵家":726,"的第一":601,"重要的":694,"開始":2056,"部,":1123,"除了":587,"限公":1343,"鐵路":1902,"電勞":1154,"雲南":1736,"阿拉":770,"間的":1117,"開發":1451,"陸的":3080,"里,":1236,"電影":2042,"阿爾":603,"電子":1071,"面積":1525,"非洲":681,"領域":708,"項目":726,"電視":2609,"音樂":1761,"需要":625,"馬來":712,"有植物":938,"的特有":970,"是美國":1183,"的植物":2889,"日至 ":606,"前尚未":2703,"最大的":952,"於美國":799,"加乘大":716,"是日本":699,"目前尚":2707,"的地區":3107,"生聞在":1091,"生聞於":3237,"甘肅、":794,"的一部":774,"的主要":668,"的一種":1474,"年),":737,"的一個":3094,"限公司":1342,"年( ":710,"於不拔":2919,"於中國":2310," ,是":653,"南等地":586,"),":14128,"()":3756,"(,":1173,",)":1558,":,":731,":)":4068,"由人工":2898,",東":861,",有":1487,",最":1123,",曾":1063,",是":13163,")是":10839,",於":6392,",此":660,",後":1253,",從":882,",常":1047,",故":644,")於":2446,",指":584,",所":993,",或":1331,",成":1293,")的":2203,",目":3631,",簡":2031,",第":778,",現":1338,",由":3905,",用":642,",生":3524,",當":880,",西":834,",被":860,",該":1017,",經":598,",總":744,",美":597,",而":3222,",英":973,",與":1255,"(英":2412,",香":721,",通":885,",這":1582,",「":615,",《":690,")。":3711,")、":2876,"(學":3641,",它":1379,",屬":841,",小":787,",對":650,",因":2415,",在":4525,",多":1566,",大":803,",如":966,",前":662,",分":861,",南":727,",即":1028,",勞":1219,",包":1288,")和":952,",同":897,",台":652,",可":1102,",又":2944,",原":1271,",不":2771,",丕":3583,",丞":2297,",並":3993,",中":1698,",主":1659,"(今":666,",乘":1385,",也":3255,"(丈":751,",丈":5090,",丁":3996,",一":2348,",休":1371,",但":3353,",位":2751,",佐":784,",使":1949,",交":834,",亦":1195,",人":871,",以":3497,",他":1340,",份":756,",倫":1535,",共":754,",其":3556,",全":1257,"( ":3933,") ":996,", ":7569,"- ":815,"雲南、":1086,"丁屬(":787,"華民國":1215,"廣西、":800,"中國的":1325,"中國大":3838,"之一,":2265,"英語:":1339,"不拔 ":3137,"引種栽":2892,"丈屬的":703,"人工引":2892," ),":596,"乘江、":593,"特有植":935,"以及中":1990,"一般生":678,"中華人":1156,"英文:":1510,"丕西、":804,"中華民":1237,"廣東、":761,"華人民":1156,"人民共":1261,":)於":1432,":)是":1898,"),又":978,"),是":2411,"()是":1113,",),":1004,"著名的":780,"(),":1524,"位於香":588,"年至 ":713,"面積 ":770,"倫州、":844,"之間的":628,"成立於":922,"內丁使":611,"俄羅斯":1193,",香港":708,"物。分":2798,"公司(":691,"公司,":864,"行不區":1022,"共和國":1943,"分布於":1717,"分布在":1578,"》、《":944,"」、「":864,",於中":674,")是丈":645,")是一":2670,",是一":2178,",是中":1286,"湖南、":689,"あああ":1321,"尚未由":2697,"地,生":2880,",常生":667,",於 ":2197,",所以":698,",又稱":1386,",因此":1151,",多生":1249,"聞於不":2897,"(學名":3634,"。分布":2903,"、印度":793,"、乘江":625,"、丕西":798,"、倫州":850,"、山坡":668,"被稱於":856,"、四川":1250,"大陸的":3064,"アアア":1275,"學名:":3509,"(英語":1020,"(英文":1336,"義大利":914,",目前":3420,"西班丁":773,",生聞":2926,",簡稱":1929,"平不公":818,"工引種":2892,",包括":1128,"米的地":2889,",又名":585,",在 ":642,",其中":943,",以及":1003,",也是":1481,",一般":1212,",主要":1207,",位於":2550,",並丁":649,",中國":911,"屬的植":2808,"丁使、":633,"丈丈丁":628,"江西、":618,"湖不、":763,"不同的":723,"米至 ":2729,"之一。":1752,"不公里":761,"丈丈,":616},"n_words":[4924775,1867501,309785],"name":"zh-tw"}
\ No newline at end of file
# Natural Language Toolkit (NLTK)
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Authors: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
Steven Bird, Ewan Klein, and Edward Loper (2009). Natural Language Processing with Python. O'Reilly Media Inc.
http://nltk.org/book
"""
+from __future__ import print_function, absolute_import
import os
# The version number for each release is kept in the file VERSION.
try:
# If a VERSION file exists, use it!
- version_file = os.path.join(os.path.dirname(__file__), "VERSION")
- with open(version_file, "r") as infile:
+ version_file = os.path.join(os.path.dirname(__file__), 'VERSION')
+ with open(version_file, 'r') as infile:
__version__ = infile.read().strip()
except NameError:
- __version__ = "unknown (running code interactively?)"
+ __version__ = 'unknown (running code interactively?)'
except IOError as ex:
__version__ = "unknown (%s)" % ex
if __doc__ is not None: # fix for the ``python -OO`` mode, which sets __doc__ to None
- __doc__ += "\n@version: " + __version__
+ __doc__ += '\n@version: ' + __version__
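# The three fallbacks above cover: a readable VERSION file, running the
# code interactively (where __file__ is undefined, hence the NameError),
# and an unreadable file (the IOError). A minimal sketch of the same
# read-with-fallback pattern in isolation (`read_version` and `pkg_dir`
# are illustrative names, not part of NLTK):
import os

def read_version(pkg_dir):
    try:
        with open(os.path.join(pkg_dir, 'VERSION'), 'r') as infile:
            return infile.read().strip()
    except IOError as ex:
        return 'unknown (%s)' % ex
# e.g. read_version(os.path.dirname(__file__)) returns the stripped file
# contents, or an 'unknown (...)' string if the file cannot be read.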
# Copyright notice
__copyright__ = """\
-Copyright (C) 2001-2020 NLTK Project.
+Copyright (C) 2001-2017 NLTK Project.
Distributed and Licensed under the Apache License, Version 2.0,
which is included by reference.
__longdescr__ = """\
The Natural Language Toolkit (NLTK) is a Python package for
natural language processing. NLTK requires Python 2.6 or higher."""
-__keywords__ = [
- "NLP",
- "CL",
- "natural language processing",
- "computational linguistics",
- "parsing",
- "tagging",
- "tokenizing",
- "syntax",
- "linguistics",
- "language",
- "natural language",
- "text analytics",
-]
+__keywords__ = ['NLP', 'CL', 'natural language processing',
+ 'computational linguistics', 'parsing', 'tagging',
+ 'tokenizing', 'syntax', 'linguistics', 'language',
+ 'natural language', 'text analytics']
__url__ = "http://nltk.org/"
# Maintainer, contributors, etc.
# "Trove" classifiers for Python Package Index.
__classifiers__ = [
- "Development Status :: 5 - Production/Stable",
- "Intended Audience :: Developers",
- "Intended Audience :: Education",
- "Intended Audience :: Information Technology",
- "Intended Audience :: Science/Research",
- "License :: OSI Approved :: Apache Software License",
- "Operating System :: OS Independent",
- "Programming Language :: Python :: 2.6",
- "Programming Language :: Python :: 2.7",
- "Topic :: Scientific/Engineering",
- "Topic :: Scientific/Engineering :: Artificial Intelligence",
- "Topic :: Scientific/Engineering :: Human Machine Interfaces",
- "Topic :: Scientific/Engineering :: Information Analysis",
- "Topic :: Text Processing",
- "Topic :: Text Processing :: Filters",
- "Topic :: Text Processing :: General",
- "Topic :: Text Processing :: Indexing",
- "Topic :: Text Processing :: Linguistic",
+ 'Development Status :: 5 - Production/Stable',
+ 'Intended Audience :: Developers',
+ 'Intended Audience :: Education',
+ 'Intended Audience :: Information Technology',
+ 'Intended Audience :: Science/Research',
+ 'License :: OSI Approved :: Apache Software License',
+ 'Operating System :: OS Independent',
+ 'Programming Language :: Python :: 2.6',
+ 'Programming Language :: Python :: 2.7',
+ 'Topic :: Scientific/Engineering',
+ 'Topic :: Scientific/Engineering :: Artificial Intelligence',
+ 'Topic :: Scientific/Engineering :: Human Machine Interfaces',
+ 'Topic :: Scientific/Engineering :: Information Analysis',
+ 'Topic :: Text Processing',
+ 'Topic :: Text Processing :: Filters',
+ 'Topic :: Text Processing :: General',
+ 'Topic :: Text Processing :: Indexing',
+ 'Topic :: Text Processing :: Linguistic',
]
from nltk.internals import config_java
# Override missing methods in environments where subprocess cannot be used, such as GAE.
import subprocess
-
-if not hasattr(subprocess, "PIPE"):
-
+if not hasattr(subprocess, 'PIPE'):
def _fake_PIPE(*args, **kwargs):
- raise NotImplementedError("subprocess.PIPE is not supported.")
-
+ raise NotImplementedError('subprocess.PIPE is not supported.')
subprocess.PIPE = _fake_PIPE
-if not hasattr(subprocess, "Popen"):
-
+if not hasattr(subprocess, 'Popen'):
def _fake_Popen(*args, **kwargs):
- raise NotImplementedError("subprocess.Popen is not supported.")
-
+ raise NotImplementedError('subprocess.Popen is not supported.')
subprocess.Popen = _fake_Popen
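# Both stubs implement a "fail at call time" fallback: on platforms such
# as GAE that cannot spawn processes, the names subprocess.PIPE and
# subprocess.Popen still exist, but any use raises a descriptive
# NotImplementedError instead of an AttributeError deep inside library
# code. The same pattern in isolation (a sketch; `_Unsupported` is an
# illustrative helper, not part of NLTK):
class _Unsupported(object):
    """Placeholder attribute that fails loudly when called."""
    def __init__(self, name):
        self._name = name
    def __call__(self, *args, **kwargs):
        raise NotImplementedError('%s is not supported here.' % self._name)
# e.g. module.spawn = _Unsupported('module.spawn'); calling module.spawn()
# then raises NotImplementedError('module.spawn is not supported here.').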
###########################################################
# Lazily import subpackages that can safely fail at run time
from nltk import lazyimport
-
-app = lazyimport.LazyModule("nltk.app", locals(), globals())
-chat = lazyimport.LazyModule("nltk.chat", locals(), globals())
-corpus = lazyimport.LazyModule("nltk.corpus", locals(), globals())
-draw = lazyimport.LazyModule("nltk.draw", locals(), globals())
-toolbox = lazyimport.LazyModule("nltk.toolbox", locals(), globals())
+app = lazyimport.LazyModule('nltk.app', locals(), globals())
+chat = lazyimport.LazyModule('nltk.chat', locals(), globals())
+corpus = lazyimport.LazyModule('nltk.corpus', locals(), globals())
+draw = lazyimport.LazyModule('nltk.draw', locals(), globals())
+toolbox = lazyimport.LazyModule('nltk.toolbox', locals(), globals())
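# LazyModule defers the real import until the first attribute access, so
# `import nltk` stays fast even though subpackages like nltk.app pull in
# heavy dependencies such as Tkinter. A rough sketch of the idea using
# only the standard library (an illustration of the pattern, not the
# actual nltk.lazyimport implementation):
import importlib

class _LazyProxy(object):
    """Import the named module on first attribute access."""
    def __init__(self, name):
        self._name = name
        self._module = None
    def __getattr__(self, attr):
        if self._module is None:
            self._module = importlib.import_module(self._name)
        return getattr(self._module, attr)
# e.g. json_lazy = _LazyProxy('json') imports nothing up front;
# json_lazy.dumps({'a': 1}) triggers the real import, then delegates.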
# Optional loading
from nltk import cluster
from nltk.downloader import download, download_shell
-
try:
- import tkinter
+ from six.moves import tkinter
except ImportError:
pass
else:
from nltk.downloader import download_gui
except RuntimeError as e:
import warnings
-
- warnings.warn(
- "Corpus downloader GUI not loaded "
- "(RuntimeError during import: %s)" % str(e)
- )
+ warnings.warn("Corpus downloader GUI not loaded "
+ "(RuntimeError during import: %s)" % str(e))
# explicitly import all top-level modules (ensuring
# they override the same names inadvertently imported
# from a subpackage)
from nltk import tag, tbl, text, tokenize, translate, tree, treetransforms, util
-# FIXME: override any accidentally imported demo, see https://github.com/nltk/nltk/issues/2116
+# override any accidentally imported demo
def demo():
print("To run the demo code for a module, type nltk.module.demo()")
# Natural Language Toolkit: Applications package
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# Import Tkinter-based modules if Tkinter is installed
try:
- import tkinter
+ from six.moves import tkinter
except ImportError:
import warnings
-
- warnings.warn("nltk.app package not loaded " "(please install Tkinter library).")
+ warnings.warn("nltk.app package not loaded "
+ "(please install Tkinter library).")
else:
from nltk.app.chartparser_app import app as chartparser
from nltk.app.chunkparser_app import app as chunkparser
from matplotlib import pylab
except ImportError:
import warnings
-
- warnings.warn(
- "nltk.app.wordfreq not loaded " "(requires the matplotlib library)."
- )
+ warnings.warn("nltk.app.wordfreq not loaded "
+ "(requires the matplotlib library).")
else:
from nltk.app.wordfreq_app import app as wordfreq
# skip doctests from this package
def setup_module(module):
from nose import SkipTest
-
raise SkipTest("nltk.app examples are not doctests")
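# Under nose, a module-level setup_module() hook runs once before any
# test in that module; raising SkipTest inside it skips the module's
# tests entirely, which is how these GUI examples are kept out of the
# doctest run. A conditional variant of the same hook (the HAVE_DISPLAY
# flag is hypothetical):
def _setup_module_sketch(module):
    from nose import SkipTest
    HAVE_DISPLAY = False        # e.g. probe for a usable display here
    if not HAVE_DISPLAY:
        raise SkipTest('GUI tests need a display')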
# Natural Language Toolkit: Chart Parser Application
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Jean Mark Gawron <gawron@mail.sdsu.edu>
# Steven Bird <stevenbird1@gmail.com>
# widget system.
+from __future__ import division
import pickle
import os.path
-from tkinter import (
- Button,
- Canvas,
- Checkbutton,
- Frame,
- IntVar,
- Label,
- Menu,
- Scrollbar,
- Tk,
- Toplevel,
-)
-from tkinter.font import Font
-from tkinter.messagebox import showerror, showinfo
-from tkinter.filedialog import asksaveasfilename, askopenfilename
-
-from nltk.parse.chart import (
- BottomUpPredictCombineRule,
- BottomUpPredictRule,
- Chart,
- LeafEdge,
- LeafInitRule,
- SingleEdgeFundamentalRule,
- SteppingChartParser,
- TopDownInitRule,
- TopDownPredictRule,
- TreeEdge,
-)
+from six.moves.tkinter import (Button, Canvas, Checkbutton, Frame, IntVar,
+ Label, Menu, Scrollbar, Tk, Toplevel)
+from six.moves.tkinter_font import Font
+from six.moves.tkinter_messagebox import showerror, showinfo
+from six.moves.tkinter_tkfiledialog import asksaveasfilename, askopenfilename
+
+from nltk.parse.chart import (BottomUpPredictCombineRule, BottomUpPredictRule,
+ Chart, LeafEdge, LeafInitRule, SingleEdgeFundamentalRule,
+ SteppingChartParser, TopDownInitRule, TopDownPredictRule,
+ TreeEdge)
from nltk.tree import Tree
from nltk.grammar import Nonterminal, CFG
from nltk.util import in_idle
-from nltk.draw.util import (
- CanvasFrame,
- ColorizedList,
- EntryDialog,
- MutableOptionMenu,
- ShowText,
- SymbolWidget,
-)
+from nltk.draw.util import (CanvasFrame, ColorizedList,
+ EntryDialog, MutableOptionMenu,
+ ShowText, SymbolWidget)
from nltk.draw import CFGEditor, tree_to_treesegment, TreeSegmentWidget
# Known bug: ChartView doesn't handle edges generated by epsilon productions.
# Edge List
#######################################################################
-
class EdgeList(ColorizedList):
- ARROW = SymbolWidget.SYMBOLS["rightarrow"]
+ ARROW = SymbolWidget.SYMBOLS['rightarrow']
def _init_colortags(self, textwidget, options):
- textwidget.tag_config("terminal", foreground="#006000")
- textwidget.tag_config("arrow", font="symbol", underline="0")
- textwidget.tag_config("dot", foreground="#000000")
- textwidget.tag_config(
- "nonterminal", foreground="blue", font=("helvetica", -12, "bold")
- )
+ textwidget.tag_config('terminal', foreground='#006000')
+ textwidget.tag_config('arrow', font='symbol', underline='0')
+ textwidget.tag_config('dot', foreground='#000000')
+ textwidget.tag_config('nonterminal', foreground='blue',
+ font=('helvetica', -12, 'bold'))
def _item_repr(self, item):
contents = []
- contents.append(("%s\t" % item.lhs(), "nonterminal"))
- contents.append((self.ARROW, "arrow"))
+ contents.append(('%s\t' % item.lhs(), 'nonterminal'))
+ contents.append((self.ARROW, 'arrow'))
for i, elt in enumerate(item.rhs()):
if i == item.dot():
- contents.append((" *", "dot"))
+ contents.append((' *', 'dot'))
if isinstance(elt, Nonterminal):
- contents.append((" %s" % elt.symbol(), "nonterminal"))
+ contents.append((' %s' % elt.symbol(), 'nonterminal'))
else:
- contents.append((" %r" % elt, "terminal"))
+ contents.append((' %r' % elt, 'terminal'))
if item.is_complete():
- contents.append((" *", "dot"))
+ contents.append((' *', 'dot'))
return contents
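# _item_repr returns (text, colortag) pairs that ColorizedList renders
# using the tags configured in _init_colortags. For a dotted edge such
# as  S -> NP * VP  (dot after NP), the contents would be, roughly:
#
#     [('S\t', 'nonterminal'),
#      (EdgeList.ARROW, 'arrow'),
#      (' NP', 'nonterminal'),
#      (' *', 'dot'),
#      (' VP', 'nonterminal')]
#
# (Illustrative values traced from the method above, not captured output.)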
-
#######################################################################
# Chart Matrix View
#######################################################################
-
class ChartMatrixView(object):
"""
A view of a chart that displays the contents of the corresponding matrix.
"""
-
- def __init__(
- self, parent, chart, toplevel=True, title="Chart Matrix", show_numedges=False
- ):
+ def __init__(self, parent, chart, toplevel=True, title='Chart Matrix',
+ show_numedges=False):
self._chart = chart
self._cells = []
self._marks = []
if toplevel:
self._root = Toplevel(parent)
self._root.title(title)
- self._root.bind("<Control-q>", self.destroy)
+ self._root.bind('<Control-q>', self.destroy)
self._init_quit(self._root)
else:
self._root = Frame(parent)
self.draw()
def _init_quit(self, root):
- quit = Button(root, text="Quit", command=self.destroy)
- quit.pack(side="bottom", expand=0, fill="none")
+ quit = Button(root, text='Quit', command=self.destroy)
+ quit.pack(side='bottom', expand=0, fill='none')
def _init_matrix(self, root):
- cframe = Frame(root, border=2, relief="sunken")
- cframe.pack(expand=0, fill="none", padx=1, pady=3, side="top")
- self._canvas = Canvas(cframe, width=200, height=200, background="white")
- self._canvas.pack(expand=0, fill="none")
+ cframe = Frame(root, border=2, relief='sunken')
+ cframe.pack(expand=0, fill='none', padx=1, pady=3, side='top')
+ self._canvas = Canvas(cframe, width=200, height=200,
+ background='white')
+ self._canvas.pack(expand=0, fill='none')
def _init_numedges(self, root):
- self._numedges_label = Label(root, text="0 edges")
- self._numedges_label.pack(expand=0, fill="none", side="top")
+ self._numedges_label = Label(root, text='0 edges')
+ self._numedges_label.pack(expand=0, fill='none', side='top')
def _init_list(self, root):
self._list = EdgeList(root, [], width=20, height=5)
- self._list.pack(side="top", expand=1, fill="both", pady=3)
-
- def cb(edge, self=self):
- self._fire_callbacks("select", edge)
-
- self._list.add_callback("select", cb)
+ self._list.pack(side='top', expand=1, fill='both', pady=3)
+ def cb(edge, self=self): self._fire_callbacks('select', edge)
+ self._list.add_callback('select', cb)
self._list.focus()
def destroy(self, *e):
- if self._root is None:
- return
- try:
- self._root.destroy()
- except:
- pass
+ if self._root is None: return
+ try: self._root.destroy()
+ except: pass
self._root = None
def set_chart(self, chart):
self.draw()
def update(self):
- if self._root is None:
- return
+ if self._root is None: return
# Count the edges in each cell
N = len(self._cells)
for i in range(N):
for j in range(i, N):
if cell_edges[i][j] == 0:
- color = "gray20"
+ color = 'gray20'
else:
- color = "#00%02x%02x" % (
- min(255, 50 + 128 * cell_edges[i][j] / 10),
- max(0, 128 - 128 * cell_edges[i][j] / 10),
- )
+ color = ('#00%02x%02x' %
+ (min(255, 50+128*cell_edges[i][j]/10),
+ max(0, 128-128*cell_edges[i][j]/10)))
cell_tag = self._cells[i][j]
self._canvas.itemconfig(cell_tag, fill=color)
- if (i, j) == self._selected_cell:
- self._canvas.itemconfig(cell_tag, outline="#00ffff", width=3)
+ if (i,j) == self._selected_cell:
+ self._canvas.itemconfig(cell_tag, outline='#00ffff',
+ width=3)
self._canvas.tag_raise(cell_tag)
else:
- self._canvas.itemconfig(cell_tag, outline="black", width=1)
+ self._canvas.itemconfig(cell_tag, outline='black',
+ width=1)
# Update the edge list.
edges = list(self._chart.select(span=self._selected_cell))
# Update our edge count.
self._num_edges = self._chart.num_edges()
if self._numedges_label is not None:
- self._numedges_label["text"] = "%d edges" % self._num_edges
+ self._numedges_label['text'] = '%d edges' % self._num_edges
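# Worked example of the fill-colour formula above, for a cell holding
# cell_edges[i][j] == 5 edges:
#     green = min(255, 50 + 128 * 5 / 10) = 114  -> 0x72
#     blue  = max(0,  128 - 128 * 5 / 10) =  64  -> 0x40
#     color = '#007240'
# i.e. cells get greener (and less blue) as they accumulate edges, while
# empty cells stay 'gray20'.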
def activate(self):
- self._canvas.itemconfig("inactivebox", state="hidden")
+ self._canvas.itemconfig('inactivebox', state='hidden')
self.update()
def inactivate(self):
- self._canvas.itemconfig("inactivebox", state="normal")
+ self._canvas.itemconfig('inactivebox', state='normal')
self.update()
def add_callback(self, event, func):
- self._callbacks.setdefault(event, {})[func] = 1
+ self._callbacks.setdefault(event,{})[func] = 1
def remove_callback(self, event, func=None):
- if func is None:
- del self._callbacks[event]
+ if func is None: del self._callbacks[event]
else:
- try:
- del self._callbacks[event][func]
- except:
- pass
+ try: del self._callbacks[event][func]
+ except: pass
def _fire_callbacks(self, event, *args):
- if event not in self._callbacks:
- return
- for cb_func in list(self._callbacks[event].keys()):
- cb_func(*args)
+ if event not in self._callbacks: return
+ for cb_func in list(self._callbacks[event].keys()): cb_func(*args)
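# The callback machinery above is a small observer registry: each event
# name maps to a dict of callables used as a set, so re-adding a
# function is a no-op and removal is O(1). A minimal self-contained
# sketch of the same pattern (names are illustrative only):
callbacks = {}
def add_callback(event, func):
    callbacks.setdefault(event, {})[func] = 1
def fire_callbacks(event, *args):
    for func in list(callbacks.get(event, {})):
        func(*args)
add_callback('select', lambda edge: None)   # hypothetical subscriber
fire_callbacks('select', 'edge-0')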
def select_cell(self, i, j):
- if self._root is None:
- return
+ if self._root is None: return
# If the cell is already selected (and the chart contents
# haven't changed), then do nothing.
- if (i, j) == self._selected_cell and self._chart.num_edges() == self._num_edges:
- return
+ if ((i,j) == self._selected_cell and
+ self._chart.num_edges() == self._num_edges): return
- self._selected_cell = (i, j)
+ self._selected_cell = (i,j)
self.update()
# Fire the callback.
- self._fire_callbacks("select_cell", i, j)
+ self._fire_callbacks('select_cell', i, j)
def deselect_cell(self):
- if self._root is None:
- return
+ if self._root is None: return
self._selected_cell = None
self._list.set([])
self.update()
def _click_cell(self, i, j):
- if self._selected_cell == (i, j):
+ if self._selected_cell == (i,j):
self.deselect_cell()
else:
self.select_cell(i, j)
self._list.view(edge)
def mark_edge(self, edge):
- if self._root is None:
- return
+ if self._root is None: return
self.select_cell(*edge.span())
self._list.mark(edge)
def unmark_edge(self, edge=None):
- if self._root is None:
- return
+ if self._root is None: return
self._list.unmark(edge)
def markonly_edge(self, edge):
- if self._root is None:
- return
+ if self._root is None: return
self.select_cell(*edge.span())
self._list.markonly(edge)
def draw(self):
- if self._root is None:
- return
+ if self._root is None: return
LEFT_MARGIN = BOT_MARGIN = 15
TOP_MARGIN = 5
c = self._canvas
- c.delete("all")
- N = self._chart.num_leaves() + 1
- dx = (int(c["width"]) - LEFT_MARGIN) / N
- dy = (int(c["height"]) - TOP_MARGIN - BOT_MARGIN) / N
+ c.delete('all')
+ N = self._chart.num_leaves()+1
+ dx = (int(c['width'])-LEFT_MARGIN)/N
+ dy = (int(c['height'])-TOP_MARGIN-BOT_MARGIN)/N
- c.delete("all")
+ c.delete('all')
# Labels and dotted lines
for i in range(N):
- c.create_text(
- LEFT_MARGIN - 2, i * dy + dy / 2 + TOP_MARGIN, text=repr(i), anchor="e"
- )
- c.create_text(
- i * dx + dx / 2 + LEFT_MARGIN,
- N * dy + TOP_MARGIN + 1,
- text=repr(i),
- anchor="n",
- )
- c.create_line(
- LEFT_MARGIN,
- dy * (i + 1) + TOP_MARGIN,
- dx * N + LEFT_MARGIN,
- dy * (i + 1) + TOP_MARGIN,
- dash=".",
- )
- c.create_line(
- dx * i + LEFT_MARGIN,
- TOP_MARGIN,
- dx * i + LEFT_MARGIN,
- dy * N + TOP_MARGIN,
- dash=".",
- )
+ c.create_text(LEFT_MARGIN-2, i*dy+dy/2+TOP_MARGIN,
+ text=repr(i), anchor='e')
+ c.create_text(i*dx+dx/2+LEFT_MARGIN, N*dy+TOP_MARGIN+1,
+ text=repr(i), anchor='n')
+ c.create_line(LEFT_MARGIN, dy*(i+1)+TOP_MARGIN,
+ dx*N+LEFT_MARGIN, dy*(i+1)+TOP_MARGIN, dash='.')
+ c.create_line(dx*i+LEFT_MARGIN, TOP_MARGIN,
+ dx*i+LEFT_MARGIN, dy*N+TOP_MARGIN, dash='.')
# A box around the whole thing
- c.create_rectangle(
- LEFT_MARGIN, TOP_MARGIN, LEFT_MARGIN + dx * N, dy * N + TOP_MARGIN, width=2
- )
+ c.create_rectangle(LEFT_MARGIN, TOP_MARGIN,
+ LEFT_MARGIN+dx*N, dy*N+TOP_MARGIN,
+ width=2)
# Cells
self._cells = [[None for i in range(N)] for j in range(N)]
for i in range(N):
for j in range(i, N):
- t = c.create_rectangle(
- j * dx + LEFT_MARGIN,
- i * dy + TOP_MARGIN,
- (j + 1) * dx + LEFT_MARGIN,
- (i + 1) * dy + TOP_MARGIN,
- fill="gray20",
- )
+ t = c.create_rectangle(j*dx+LEFT_MARGIN, i*dy+TOP_MARGIN,
+ (j+1)*dx+LEFT_MARGIN,
+ (i+1)*dy+TOP_MARGIN,
+ fill='gray20')
self._cells[i][j] = t
-
- def cb(event, self=self, i=i, j=j):
- self._click_cell(i, j)
-
- c.tag_bind(t, "<Button-1>", cb)
+ def cb(event, self=self, i=i, j=j): self._click_cell(i,j)
+ c.tag_bind(t, '<Button-1>', cb)
# Inactive box
- xmax, ymax = int(c["width"]), int(c["height"])
- t = c.create_rectangle(
- -100,
- -100,
- xmax + 100,
- ymax + 100,
- fill="gray50",
- state="hidden",
- tag="inactivebox",
- )
+ xmax, ymax = int(c['width']), int(c['height'])
+ t = c.create_rectangle(-100, -100, xmax+100, ymax+100,
+ fill='gray50', state='hidden',
+ tag='inactivebox')
c.tag_lower(t)
# Update the cells.
def pack(self, *args, **kwargs):
self._root.pack(*args, **kwargs)
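# A minimal usage sketch for ChartMatrixView, assuming this module's own
# imports (Tkinter's Tk, nltk's Chart) as used elsewhere in this file:
root = Tk()
demo_chart = Chart(['the', 'dog', 'barked'])          # illustrative tokens
matrix = ChartMatrixView(root, demo_chart, toplevel=False)
matrix.pack(side='top', expand=1, fill='both')
matrix.add_callback('select', lambda edge: None)      # hypothetical handler
root.mainloop()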
-
#######################################################################
# Chart Results View
#######################################################################
-
class ChartResultsView(object):
def __init__(self, parent, chart, grammar, toplevel=True):
self._chart = chart
if toplevel:
self._root = Toplevel(parent)
- self._root.title("Chart Parser Application: Results")
- self._root.bind("<Control-q>", self.destroy)
+ self._root.title('Chart Parser Application: Results')
+ self._root.bind('<Control-q>', self.destroy)
else:
self._root = Frame(parent)
# Buttons
if toplevel:
buttons = Frame(self._root)
- buttons.pack(side="bottom", expand=0, fill="x")
- Button(buttons, text="Quit", command=self.destroy).pack(side="right")
- Button(buttons, text="Print All", command=self.print_all).pack(side="left")
- Button(buttons, text="Print Selection", command=self.print_selection).pack(
- side="left"
- )
+ buttons.pack(side='bottom', expand=0, fill='x')
+ Button(buttons, text='Quit',
+ command=self.destroy).pack(side='right')
+ Button(buttons, text='Print All',
+ command=self.print_all).pack(side='left')
+ Button(buttons, text='Print Selection',
+ command=self.print_selection).pack(side='left')
# Canvas frame.
self._cframe = CanvasFrame(self._root, closeenough=20)
- self._cframe.pack(side="top", expand=1, fill="both")
+ self._cframe.pack(side='top', expand=1, fill='both')
# Initial update
self.update()
def update(self, edge=None):
- if self._root is None:
- return
+ if self._root is None: return
# If the edge isn't a parse edge, do nothing.
if edge is not None:
- if edge.lhs() != self._grammar.start():
- return
- if edge.span() != (0, self._chart.num_leaves()):
- return
+ if edge.lhs() != self._grammar.start(): return
+ if edge.span() != (0, self._chart.num_leaves()): return
for parse in self._chart.parses(self._grammar.start()):
if parse not in self._trees:
c.delete(self._selectbox)
self._selection = widget
(x1, y1, x2, y2) = widget.bbox()
- self._selectbox = c.create_rectangle(x1, y1, x2, y2, width=2, outline="#088")
+ self._selectbox = c.create_rectangle(x1, y1, x2, y2,
+ width=2, outline='#088')
def _color(self, treewidget, color):
- treewidget.label()["color"] = color
+ treewidget.label()['color'] = color
for child in treewidget.subtrees():
if isinstance(child, TreeSegmentWidget):
self._color(child, color)
else:
- child["color"] = color
+ child['color'] = color
def print_all(self, *e):
- if self._root is None:
- return
+ if self._root is None: return
self._cframe.print_to_file()
def print_selection(self, *e):
- if self._root is None:
- return
+ if self._root is None: return
if self._selection is None:
- showerror("Print Error", "No tree selected")
+ showerror('Print Error', 'No tree selected')
else:
c = self._cframe.canvas()
for widget in self._treewidgets:
if widget is not self._selection:
self._cframe.destroy_widget(widget)
c.delete(self._selectbox)
- (x1, y1, x2, y2) = self._selection.bbox()
- self._selection.move(10 - x1, 10 - y1)
- c["scrollregion"] = "0 0 %s %s" % (x2 - x1 + 20, y2 - y1 + 20)
+ (x1,y1,x2,y2) = self._selection.bbox()
+ self._selection.move(10-x1,10-y1)
+ c['scrollregion'] = '0 0 %s %s' % (x2-x1+20, y2-y1+20)
self._cframe.print_to_file()
# Restore our state.
self.update()
def clear(self):
- if self._root is None:
- return
+ if self._root is None: return
for treewidget in self._treewidgets:
self._cframe.destroy_widget(treewidget)
self._trees = []
self.update()
def destroy(self, *e):
- if self._root is None:
- return
- try:
- self._root.destroy()
- except:
- pass
+ if self._root is None: return
+ try: self._root.destroy()
+ except: pass
self._root = None
def pack(self, *args, **kwargs):
self._root.pack(*args, **kwargs)
-
#######################################################################
# Chart Comparer
#######################################################################
-
class ChartComparer(object):
"""
:ivar _op_label: A Label containing the most recent operation.
"""
- _OPSYMBOL = {
- "-": "-",
- "and": SymbolWidget.SYMBOLS["intersection"],
- "or": SymbolWidget.SYMBOLS["union"],
- }
+ _OPSYMBOL = {'-': '-',
+ 'and': SymbolWidget.SYMBOLS['intersection'],
+ 'or': SymbolWidget.SYMBOLS['union']}
def __init__(self, *chart_filenames):
# This chart is displayed when we don't have a value (e.g.
# before any chart is loaded).
- faketok = [""] * 8
+ faketok = [''] * 8
self._emptychart = Chart(faketok)
# The left & right charts start out empty.
- self._left_name = "None"
- self._right_name = "None"
+ self._left_name = 'None'
+ self._right_name = 'None'
self._left_chart = self._emptychart
self._right_chart = self._emptychart
# The charts that have been loaded.
- self._charts = {"None": self._emptychart}
+ self._charts = {'None': self._emptychart}
# The output chart.
self._out_chart = self._emptychart
# Set up the root window.
self._root = Tk()
- self._root.title("Chart Comparison")
- self._root.bind("<Control-q>", self.destroy)
- self._root.bind("<Control-x>", self.destroy)
+ self._root.title('Chart Comparison')
+ self._root.bind('<Control-q>', self.destroy)
+ self._root.bind('<Control-x>', self.destroy)
# Initialize all widgets, etc.
self._init_menubar(self._root)
self.load_chart(filename)
def destroy(self, *e):
- if self._root is None:
- return
- try:
- self._root.destroy()
- except:
- pass
+ if self._root is None: return
+ try: self._root.destroy()
+ except: pass
self._root = None
def mainloop(self, *args, **kwargs):
return
self._root.mainloop(*args, **kwargs)
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Initialization
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
def _init_menubar(self, root):
menubar = Menu(root)
# File menu
filemenu = Menu(menubar, tearoff=0)
- filemenu.add_command(
- label="Load Chart",
- accelerator="Ctrl-o",
- underline=0,
- command=self.load_chart_dialog,
- )
- filemenu.add_command(
- label="Save Output",
- accelerator="Ctrl-s",
- underline=0,
- command=self.save_chart_dialog,
- )
+ filemenu.add_command(label='Load Chart', accelerator='Ctrl-o',
+ underline=0, command=self.load_chart_dialog)
+ filemenu.add_command(label='Save Output', accelerator='Ctrl-s',
+ underline=0, command=self.save_chart_dialog)
filemenu.add_separator()
- filemenu.add_command(
- label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x"
- )
- menubar.add_cascade(label="File", underline=0, menu=filemenu)
+ filemenu.add_command(label='Exit', underline=1,
+ command=self.destroy, accelerator='Ctrl-x')
+ menubar.add_cascade(label='File', underline=0, menu=filemenu)
# Compare menu
opmenu = Menu(menubar, tearoff=0)
- opmenu.add_command(
- label="Intersection", command=self._intersection, accelerator="+"
- )
- opmenu.add_command(label="Union", command=self._union, accelerator="*")
- opmenu.add_command(
- label="Difference", command=self._difference, accelerator="-"
- )
+ opmenu.add_command(label='Intersection',
+ command=self._intersection,
+ accelerator='+')
+ opmenu.add_command(label='Union',
+ command=self._union,
+ accelerator='*')
+ opmenu.add_command(label='Difference',
+ command=self._difference,
+ accelerator='-')
opmenu.add_separator()
- opmenu.add_command(label="Swap Charts", command=self._swapcharts)
- menubar.add_cascade(label="Compare", underline=0, menu=opmenu)
+ opmenu.add_command(label='Swap Charts',
+ command=self._swapcharts)
+ menubar.add_cascade(label='Compare', underline=0, menu=opmenu)
# Add the menu
self._root.config(menu=menubar)
def _init_divider(self, root):
- divider = Frame(root, border=2, relief="sunken")
- divider.pack(side="top", fill="x", ipady=2)
+ divider = Frame(root, border=2, relief='sunken')
+ divider.pack(side='top', fill='x', ipady=2)
def _init_chartviews(self, root):
- opfont = ("symbol", -36) # Font for operator.
- eqfont = ("helvetica", -36) # Font for equals sign.
+ opfont=('symbol', -36) # Font for operator.
+ eqfont=('helvetica', -36) # Font for equals sign.
- frame = Frame(root, background="#c0c0c0")
- frame.pack(side="top", expand=1, fill="both")
+ frame = Frame(root, background='#c0c0c0')
+ frame.pack(side='top', expand=1, fill='both')
# The left matrix.
- cv1_frame = Frame(frame, border=3, relief="groove")
- cv1_frame.pack(side="left", padx=8, pady=7, expand=1, fill="both")
+ cv1_frame = Frame(frame, border=3, relief='groove')
+ cv1_frame.pack(side='left', padx=8, pady=7, expand=1, fill='both')
self._left_selector = MutableOptionMenu(
- cv1_frame, list(self._charts.keys()), command=self._select_left
- )
- self._left_selector.pack(side="top", pady=5, fill="x")
- self._left_matrix = ChartMatrixView(
- cv1_frame, self._emptychart, toplevel=False, show_numedges=True
- )
- self._left_matrix.pack(side="bottom", padx=5, pady=5, expand=1, fill="both")
- self._left_matrix.add_callback("select", self.select_edge)
- self._left_matrix.add_callback("select_cell", self.select_cell)
+ cv1_frame, list(self._charts.keys()), command=self._select_left)
+ self._left_selector.pack(side='top', pady=5, fill='x')
+ self._left_matrix = ChartMatrixView(cv1_frame, self._emptychart,
+ toplevel=False,
+ show_numedges=True)
+ self._left_matrix.pack(side='bottom', padx=5, pady=5,
+ expand=1, fill='both')
+ self._left_matrix.add_callback('select', self.select_edge)
+ self._left_matrix.add_callback('select_cell', self.select_cell)
self._left_matrix.inactivate()
# The operator.
- self._op_label = Label(
- frame, text=" ", width=3, background="#c0c0c0", font=opfont
- )
- self._op_label.pack(side="left", padx=5, pady=5)
+ self._op_label = Label(frame, text=' ', width=3,
+ background='#c0c0c0', font=opfont)
+ self._op_label.pack(side='left', padx=5, pady=5)
# The right matrix.
- cv2_frame = Frame(frame, border=3, relief="groove")
- cv2_frame.pack(side="left", padx=8, pady=7, expand=1, fill="both")
+ cv2_frame = Frame(frame, border=3, relief='groove')
+ cv2_frame.pack(side='left', padx=8, pady=7, expand=1, fill='both')
self._right_selector = MutableOptionMenu(
- cv2_frame, list(self._charts.keys()), command=self._select_right
- )
- self._right_selector.pack(side="top", pady=5, fill="x")
- self._right_matrix = ChartMatrixView(
- cv2_frame, self._emptychart, toplevel=False, show_numedges=True
- )
- self._right_matrix.pack(side="bottom", padx=5, pady=5, expand=1, fill="both")
- self._right_matrix.add_callback("select", self.select_edge)
- self._right_matrix.add_callback("select_cell", self.select_cell)
+ cv2_frame, list(self._charts.keys()), command=self._select_right)
+ self._right_selector.pack(side='top', pady=5, fill='x')
+ self._right_matrix = ChartMatrixView(cv2_frame, self._emptychart,
+ toplevel=False,
+ show_numedges=True)
+ self._right_matrix.pack(side='bottom', padx=5, pady=5,
+ expand=1, fill='both')
+ self._right_matrix.add_callback('select', self.select_edge)
+ self._right_matrix.add_callback('select_cell', self.select_cell)
self._right_matrix.inactivate()
# The equals sign
- Label(frame, text="=", width=3, background="#c0c0c0", font=eqfont).pack(
- side="left", padx=5, pady=5
- )
+ Label(frame, text='=', width=3, background='#c0c0c0',
+ font=eqfont).pack(side='left', padx=5, pady=5)
# The output matrix.
- out_frame = Frame(frame, border=3, relief="groove")
- out_frame.pack(side="left", padx=8, pady=7, expand=1, fill="both")
- self._out_label = Label(out_frame, text="Output")
- self._out_label.pack(side="top", pady=9)
- self._out_matrix = ChartMatrixView(
- out_frame, self._emptychart, toplevel=False, show_numedges=True
- )
- self._out_matrix.pack(side="bottom", padx=5, pady=5, expand=1, fill="both")
- self._out_matrix.add_callback("select", self.select_edge)
- self._out_matrix.add_callback("select_cell", self.select_cell)
+ out_frame = Frame(frame, border=3, relief='groove')
+ out_frame.pack(side='left', padx=8, pady=7, expand=1, fill='both')
+ self._out_label = Label(out_frame, text='Output')
+ self._out_label.pack(side='top', pady=9)
+ self._out_matrix = ChartMatrixView(out_frame, self._emptychart,
+ toplevel=False,
+ show_numedges=True)
+ self._out_matrix.pack(side='bottom', padx=5, pady=5,
+ expand=1, fill='both')
+ self._out_matrix.add_callback('select', self.select_edge)
+ self._out_matrix.add_callback('select_cell', self.select_cell)
self._out_matrix.inactivate()
def _init_buttons(self, root):
buttons = Frame(root)
- buttons.pack(side="bottom", pady=5, fill="x", expand=0)
- Button(buttons, text="Intersection", command=self._intersection).pack(
- side="left"
- )
- Button(buttons, text="Union", command=self._union).pack(side="left")
- Button(buttons, text="Difference", command=self._difference).pack(side="left")
- Frame(buttons, width=20).pack(side="left")
- Button(buttons, text="Swap Charts", command=self._swapcharts).pack(side="left")
-
- Button(buttons, text="Detatch Output", command=self._detatch_out).pack(
- side="right"
- )
+ buttons.pack(side='bottom', pady=5, fill='x', expand=0)
+ Button(buttons, text='Intersection',
+ command=self._intersection).pack(side='left')
+ Button(buttons, text='Union',
+ command=self._union).pack(side='left')
+ Button(buttons, text='Difference',
+ command=self._difference).pack(side='left')
+ Frame(buttons, width=20).pack(side='left')
+ Button(buttons, text='Swap Charts',
+ command=self._swapcharts).pack(side='left')
+
+ Button(buttons, text='Detach Output',
+ command=self._detatch_out).pack(side='right')
def _init_bindings(self, root):
- # root.bind('<Control-s>', self.save_chart)
- root.bind("<Control-o>", self.load_chart_dialog)
- # root.bind('<Control-r>', self.reset)
+ #root.bind('<Control-s>', self.save_chart)
+ root.bind('<Control-o>', self.load_chart_dialog)
+ #root.bind('<Control-r>', self.reset)
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Input Handling
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
def _select_left(self, name):
self._left_name = name
self._left_chart = self._charts[name]
self._left_matrix.set_chart(self._left_chart)
- if name == "None":
- self._left_matrix.inactivate()
+ if name == 'None': self._left_matrix.inactivate()
self._apply_op()
def _select_right(self, name):
self._right_name = name
self._right_chart = self._charts[name]
self._right_matrix.set_chart(self._right_chart)
- if name == "None":
- self._right_matrix.inactivate()
+ if name == 'None': self._right_matrix.inactivate()
self._apply_op()
def _apply_op(self):
- if self._operator == "-":
- self._difference()
- elif self._operator == "or":
- self._union()
- elif self._operator == "and":
- self._intersection()
-
- # ////////////////////////////////////////////////////////////
+ if self._operator == '-': self._difference()
+ elif self._operator == 'or': self._union()
+ elif self._operator == 'and': self._intersection()
+
+
+ #////////////////////////////////////////////////////////////
# File
- # ////////////////////////////////////////////////////////////
- CHART_FILE_TYPES = [("Pickle file", ".pickle"), ("All files", "*")]
+ #////////////////////////////////////////////////////////////
+ CHART_FILE_TYPES = [('Pickle file', '.pickle'),
+ ('All files', '*')]
def save_chart_dialog(self, *args):
- filename = asksaveasfilename(
- filetypes=self.CHART_FILE_TYPES, defaultextension=".pickle"
- )
- if not filename:
- return
+ filename = asksaveasfilename(filetypes=self.CHART_FILE_TYPES,
+ defaultextension='.pickle')
+ if not filename: return
try:
- with open(filename, "wb") as outfile:
+ with open(filename, 'wb') as outfile:
pickle.dump(self._out_chart, outfile)
except Exception as e:
- showerror(
- "Error Saving Chart", "Unable to open file: %r\n%s" % (filename, e)
- )
+ showerror('Error Saving Chart',
+ 'Unable to open file: %r\n%s' %
+ (filename, e))
def load_chart_dialog(self, *args):
- filename = askopenfilename(
- filetypes=self.CHART_FILE_TYPES, defaultextension=".pickle"
- )
- if not filename:
- return
- try:
- self.load_chart(filename)
+ filename = askopenfilename(filetypes=self.CHART_FILE_TYPES,
+ defaultextension='.pickle')
+ if not filename: return
+ try: self.load_chart(filename)
except Exception as e:
- showerror(
- "Error Loading Chart", "Unable to open file: %r\n%s" % (filename, e)
- )
+ showerror('Error Loading Chart',
+ 'Unable to open file: %r\n%s' %
+ (filename, e))
def load_chart(self, filename):
- with open(filename, "rb") as infile:
+ with open(filename, 'rb') as infile:
chart = pickle.load(infile)
name = os.path.basename(filename)
- if name.endswith(".pickle"):
- name = name[:-7]
- if name.endswith(".chart"):
- name = name[:-6]
+ if name.endswith('.pickle'): name = name[:-7]
+ if name.endswith('.chart'): name = name[:-6]
self._charts[name] = chart
self._left_selector.add(name)
self._right_selector.add(name)
self._right_matrix.update()
self._out_matrix.update()
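# The save/load dialogs above are a plain pickle round-trip; a minimal
# sketch of that protocol (the file name is illustrative):
import pickle
demo = Chart([''] * 8)                    # same empty-chart trick as above
with open('demo.pickle', 'wb') as outfile:
    pickle.dump(demo, outfile)
with open('demo.pickle', 'rb') as infile:
    restored = pickle.load(infile)
assert restored.num_leaves() == demo.num_leaves()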
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Selection
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
def select_edge(self, edge):
if edge in self._left_chart:
self._right_matrix.select_cell(i, j)
self._out_matrix.select_cell(i, j)
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Operations
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
def _difference(self):
- if not self._checkcompat():
- return
+ if not self._checkcompat(): return
out_chart = Chart(self._left_chart.tokens())
for edge in self._left_chart:
if edge not in self._right_chart:
out_chart.insert(edge, [])
- self._update("-", out_chart)
+ self._update('-', out_chart)
def _intersection(self):
- if not self._checkcompat():
- return
+ if not self._checkcompat(): return
out_chart = Chart(self._left_chart.tokens())
for edge in self._left_chart:
if edge in self._right_chart:
out_chart.insert(edge, [])
- self._update("and", out_chart)
+ self._update('and', out_chart)
def _union(self):
- if not self._checkcompat():
- return
+ if not self._checkcompat(): return
out_chart = Chart(self._left_chart.tokens())
for edge in self._left_chart:
    out_chart.insert(edge, [])
for edge in self._right_chart:
    out_chart.insert(edge, [])
- self._update("or", out_chart)
+ self._update('or', out_chart)
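# The three operations above have ordinary set semantics over edges; a
# quick model with Python sets (strings stand in for edge objects):
left, right = {'e1', 'e2', 'e3'}, {'e2', 'e4'}
assert left - right == {'e1', 'e3'}               # difference
assert left & right == {'e2'}                     # intersection
assert left | right == {'e1', 'e2', 'e3', 'e4'}   # union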
def _swapcharts(self):
left, right = self._left_name, self._right_name
self._right_selector.set(left)
def _checkcompat(self):
- if (
- self._left_chart.tokens() != self._right_chart.tokens()
- or self._left_chart.property_names() != self._right_chart.property_names()
- or self._left_chart == self._emptychart
- or self._right_chart == self._emptychart
- ):
+ if (self._left_chart.tokens() != self._right_chart.tokens() or
+ self._left_chart.property_names() !=
+ self._right_chart.property_names() or
+ self._left_chart == self._emptychart or
+ self._right_chart == self._emptychart):
# Clear & inactivate the output chart.
self._out_chart = self._emptychart
self._out_matrix.set_chart(self._out_chart)
self._out_matrix.inactivate()
- self._out_label["text"] = "Output"
+ self._out_label['text'] = 'Output'
# Issue some other warning?
return False
else:
def _update(self, operator, out_chart):
self._operator = operator
- self._op_label["text"] = self._OPSYMBOL[operator]
+ self._op_label['text'] = self._OPSYMBOL[operator]
self._out_chart = out_chart
self._out_matrix.set_chart(out_chart)
- self._out_label["text"] = "%s %s %s" % (
- self._left_name,
- self._operator,
- self._right_name,
- )
+ self._out_label['text'] = '%s %s %s' % (self._left_name,
+ self._operator,
+ self._right_name)
def _clear_out_chart(self):
self._out_chart = self._emptychart
self._out_matrix.set_chart(self._out_chart)
- self._op_label["text"] = " "
+ self._op_label['text'] = ' '
self._out_matrix.inactivate()
def _detatch_out(self):
- ChartMatrixView(self._root, self._out_chart, title=self._out_label["text"])
+ ChartMatrixView(self._root, self._out_chart,
+ title=self._out_label['text'])
+
+
#######################################################################
# Chart View
#######################################################################
-
class ChartView(object):
"""
A component for viewing charts. This is used by ``ChartParserApp`` to
Construct a new ``Chart`` display.
"""
# Process keyword args.
- draw_tree = kw.get("draw_tree", 0)
- draw_sentence = kw.get("draw_sentence", 1)
- self._fontsize = kw.get("fontsize", -12)
+ draw_tree = kw.get('draw_tree', 0)
+ draw_sentence = kw.get('draw_sentence', 1)
+ self._fontsize = kw.get('fontsize', -12)
# The chart!
self._chart = chart
# If they didn't provide a main window, then set one up.
if root is None:
top = Tk()
- top.title("Chart View")
-
- def destroy1(e, top=top):
- top.destroy()
-
- def destroy2(top=top):
- top.destroy()
-
- top.bind("q", destroy1)
- b = Button(top, text="Done", command=destroy2)
- b.pack(side="bottom")
+ top.title('Chart View')
+ def destroy1(e, top=top): top.destroy()
+ def destroy2(top=top): top.destroy()
+ top.bind('q', destroy1)
+ b = Button(top, text='Done', command=destroy2)
+ b.pack(side='bottom')
self._root = top
else:
self._root = root
# Create the chart canvas.
(self._chart_sb, self._chart_canvas) = self._sb_canvas(self._root)
- self._chart_canvas["height"] = 300
- self._chart_canvas["closeenough"] = 15
+ self._chart_canvas['height'] = 300
+ self._chart_canvas['closeenough'] = 15
# Create the sentence canvas.
if draw_sentence:
- cframe = Frame(self._root, relief="sunk", border=2)
- cframe.pack(fill="both", side="bottom")
+ cframe = Frame(self._root, relief='sunk', border=2)
+ cframe.pack(fill='both', side='bottom')
self._sentence_canvas = Canvas(cframe, height=50)
- self._sentence_canvas["background"] = "#e0e0e0"
- self._sentence_canvas.pack(fill="both")
- # self._sentence_canvas['height'] = self._sentence_height
+ self._sentence_canvas['background'] = '#e0e0e0'
+ self._sentence_canvas.pack(fill='both')
+ #self._sentence_canvas['height'] = self._sentence_height
else:
self._sentence_canvas = None
# Create the tree canvas.
if draw_tree:
- (sb, canvas) = self._sb_canvas(self._root, "n", "x")
+ (sb, canvas) = self._sb_canvas(self._root, 'n', 'x')
(self._tree_sb, self._tree_canvas) = (sb, canvas)
- self._tree_canvas["height"] = 200
+ self._tree_canvas['height'] = 200
else:
self._tree_canvas = None
# Set up the configure callback, which will be called whenever
# the window is resized.
- self._chart_canvas.bind("<Configure>", self._configure)
+ self._chart_canvas.bind('<Configure>', self._configure)
def _init_fonts(self, root):
- self._boldfont = Font(family="helvetica", weight="bold", size=self._fontsize)
- self._font = Font(family="helvetica", size=self._fontsize)
+ self._boldfont = Font(family='helvetica', weight='bold',
+ size=self._fontsize)
+ self._font = Font(family='helvetica',
+ size=self._fontsize)
# See: <http://www.astro.washington.edu/owen/ROTKFolklore.html>
self._sysfont = Font(font=Button()["font"])
root.option_add("*Font", self._sysfont)
- def _sb_canvas(self, root, expand="y", fill="both", side="bottom"):
+ def _sb_canvas(self, root, expand='y',
+ fill='both', side='bottom'):
"""
Helper for __init__: construct a canvas with a scrollbar.
"""
- cframe = Frame(root, relief="sunk", border=2)
+ cframe = Frame(root, relief='sunk', border=2)
cframe.pack(fill=fill, expand=expand, side=side)
- canvas = Canvas(cframe, background="#e0e0e0")
+ canvas = Canvas(cframe, background='#e0e0e0')
# Give the canvas a scrollbar.
- sb = Scrollbar(cframe, orient="vertical")
- sb.pack(side="right", fill="y")
- canvas.pack(side="left", fill=fill, expand="yes")
+ sb = Scrollbar(cframe, orient='vertical')
+ sb.pack(side='right', fill='y')
+ canvas.pack(side='left', fill=fill, expand='yes')
# Connect the scrollbars to the canvas.
- sb["command"] = canvas.yview
- canvas["yscrollcommand"] = sb.set
+ sb['command']= canvas.yview
+ canvas['yscrollcommand'] = sb.set
return (sb, canvas)
def scroll_up(self, *e):
- self._chart_canvas.yview("scroll", -1, "units")
+ self._chart_canvas.yview('scroll', -1, 'units')
def scroll_down(self, *e):
- self._chart_canvas.yview("scroll", 1, "units")
+ self._chart_canvas.yview('scroll', 1, 'units')
def page_up(self, *e):
- self._chart_canvas.yview("scroll", -1, "pages")
+ self._chart_canvas.yview('scroll', -1, 'pages')
def page_down(self, *e):
- self._chart_canvas.yview("scroll", 1, "pages")
+ self._chart_canvas.yview('scroll', 1, 'pages')
def _grow(self):
"""
"""
# Grow, if need-be
N = self._chart.num_leaves()
- width = max(
- int(self._chart_canvas["width"]), N * self._unitsize + ChartView._MARGIN * 2
- )
+ width = max(int(self._chart_canvas['width']),
+ N * self._unitsize + ChartView._MARGIN * 2 )
# It won't resize without the second (height) line, but I
# don't understand why not.
self._chart_canvas.configure(width=width)
- self._chart_canvas.configure(height=self._chart_canvas["height"])
+ self._chart_canvas.configure(height=self._chart_canvas['height'])
- self._unitsize = (width - 2 * ChartView._MARGIN) / N
+ self._unitsize = (width - 2*ChartView._MARGIN) / N
# Reset the height for the sentence window.
if self._sentence_canvas is not None:
- self._sentence_canvas["height"] = self._sentence_height
+ self._sentence_canvas['height'] = self._sentence_height
def set_font_size(self, size):
self._font.configure(size=-abs(size))
canvas.
"""
N = self._chart.num_leaves()
- self._unitsize = (e.width - 2 * ChartView._MARGIN) / N
+ self._unitsize = (e.width - 2*ChartView._MARGIN) / N
self.draw()
def update(self, chart=None):
self._add_edge(edge)
self._resize()
+
def _edge_conflict(self, edge, lvl):
"""
Return True if the given edge overlaps with any edge on the given
(s1, e1) = edge.span()
for otheredge in self._edgelevels[lvl]:
(s2, e2) = otheredge.span()
- if (s1 <= s2 < e1) or (s2 <= s1 < e2) or (s1 == s2 == e1 == e2):
+ if (s1 <= s2 < e1) or (s2 <= s1 < e2) or (s1==s2==e1==e2):
return True
return False
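# A quick check of the overlap predicate above: spans are half-open
# (start, end) pairs, and equal zero-width spans also count as a conflict.
def conflicts(span_a, span_b):
    (s1, e1), (s2, e2) = span_a, span_b
    return (s1 <= s2 < e1) or (s2 <= s1 < e2) or (s1 == s2 == e1 == e2)
assert conflicts((0, 3), (2, 5))        # overlapping spans
assert not conflicts((0, 2), (2, 4))    # merely adjacent spans
assert conflicts((1, 1), (1, 1))        # equal zero-width spans
assert not conflicts((1, 1), (2, 2))    # distinct zero-width spans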
rhs = " ".join(rhselts)
else:
lhs = edge.lhs()
- rhs = ""
+ rhs = ''
for s in (lhs, rhs):
- tag = c.create_text(
- 0, 0, text=s, font=self._boldfont, anchor="nw", justify="left"
- )
+ tag = c.create_text(0,0, text=s,
+ font=self._boldfont,
+ anchor='nw', justify='left')
bbox = c.bbox(tag)
c.delete(tag)
- width = bbox[2] # + ChartView._LEAF_SPACING
+ width = bbox[2] #+ ChartView._LEAF_SPACING
edgelen = max(edge.length(), 1)
- self._unitsize = max(self._unitsize, width / edgelen)
+ self._unitsize = max(self._unitsize, width/edgelen)
self._text_height = max(self._text_height, bbox[3] - bbox[1])
def _add_edge(self, edge, minlvl=0):
- Call _draw_edge
"""
# Do NOT show leaf edges in the chart.
- if isinstance(edge, LeafEdge):
- return
+ if isinstance(edge, LeafEdge): return
- if edge in self._edgetags:
- return
+ if edge in self._edgetags: return
self._analyze_edge(edge)
self._grow()
if not self._compact:
self._edgelevels.append([edge])
- lvl = len(self._edgelevels) - 1
+ lvl = len(self._edgelevels)-1
self._draw_edge(edge, lvl)
self._resize()
return
self._resize()
# Check if we can fit the edge in this level.
- if lvl >= minlvl and not self._edge_conflict(edge, lvl):
+ if lvl>=minlvl and not self._edge_conflict(edge, lvl):
# Go ahead and draw it.
self._edgelevels[lvl].append(edge)
break
if edge in self._edgelevels[i]:
level = i
break
- if level is None:
- return
+ if level is None: return
# Try to view the new edge..
- y = (level + 1) * self._chart_level_size
+ y = (level+1) * self._chart_level_size
dy = self._text_height + 10
- self._chart_canvas.yview("moveto", 1.0)
+ self._chart_canvas.yview('moveto', 1.0)
if self._chart_height != 0:
- self._chart_canvas.yview("moveto", (y - dy) / self._chart_height)
+ self._chart_canvas.yview('moveto',
+ (y-dy)/self._chart_height)
def _draw_edge(self, edge, lvl):
"""
c = self._chart_canvas
# Draw the arrow.
- x1 = edge.start() * self._unitsize + ChartView._MARGIN
- x2 = edge.end() * self._unitsize + ChartView._MARGIN
- if x2 == x1:
- x2 += max(4, self._unitsize / 5)
- y = (lvl + 1) * self._chart_level_size
- linetag = c.create_line(x1, y, x2, y, arrow="last", width=3)
+ x1 = (edge.start() * self._unitsize + ChartView._MARGIN)
+ x2 = (edge.end() * self._unitsize + ChartView._MARGIN)
+ if x2 == x1: x2 += max(4, self._unitsize/5)
+ y = (lvl+1) * self._chart_level_size
+ linetag = c.create_line(x1, y, x2, y, arrow='last', width=3)
# Draw a label for the edge.
if isinstance(edge, TreeEdge):
rhs1 = " ".join(rhs[:pos])
rhs2 = " ".join(rhs[pos:])
- rhstag1 = c.create_text(x1 + 3, y, text=rhs1, font=self._font, anchor="nw")
+ rhstag1 = c.create_text(x1+3, y, text=rhs1,
+ font=self._font,
+ anchor='nw')
dotx = c.bbox(rhstag1)[2] + 6
- doty = (c.bbox(rhstag1)[1] + c.bbox(rhstag1)[3]) / 2
- dottag = c.create_oval(dotx - 2, doty - 2, dotx + 2, doty + 2)
- rhstag2 = c.create_text(dotx + 6, y, text=rhs2, font=self._font, anchor="nw")
- lhstag = c.create_text(
- (x1 + x2) / 2, y, text=str(edge.lhs()), anchor="s", font=self._boldfont
- )
+ doty = (c.bbox(rhstag1)[1]+c.bbox(rhstag1)[3])/2
+ dottag = c.create_oval(dotx-2, doty-2, dotx+2, doty+2)
+ rhstag2 = c.create_text(dotx+6, y, text=rhs2,
+ font=self._font,
+ anchor='nw')
+ lhstag = c.create_text((x1+x2)/2, y, text=str(edge.lhs()),
+ anchor='s',
+ font=self._boldfont)
# Keep track of the edge's tags.
- self._edgetags[edge] = (linetag, rhstag1, dottag, rhstag2, lhstag)
+ self._edgetags[edge] = (linetag, rhstag1,
+ dottag, rhstag2, lhstag)
# Register a callback for clicking on the edge.
def cb(event, self=self, edge=edge):
- self._fire_callbacks("select", edge)
-
- c.tag_bind(rhstag1, "<Button-1>", cb)
- c.tag_bind(rhstag2, "<Button-1>", cb)
- c.tag_bind(linetag, "<Button-1>", cb)
- c.tag_bind(dottag, "<Button-1>", cb)
- c.tag_bind(lhstag, "<Button-1>", cb)
+ self._fire_callbacks('select', edge)
+ c.tag_bind(rhstag1, '<Button-1>', cb)
+ c.tag_bind(rhstag2, '<Button-1>', cb)
+ c.tag_bind(linetag, '<Button-1>', cb)
+ c.tag_bind(dottag, '<Button-1>', cb)
+ c.tag_bind(lhstag, '<Button-1>', cb)
self._color_edge(edge)
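# A tiny worked example of the dotted-rule split above (pos is the
# edge's dot position; the values are illustrative):
rhs, pos = ['Det', 'N'], 1
rhs1 = " ".join(rhs[:pos])    # 'Det' -- the recognized part, left of the dot
rhs2 = " ".join(rhs[pos:])    # 'N'   -- the predicted part, right of the dot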
If no colors are specified, use intelligent defaults
(dependent on selection, etc.)
"""
- if edge not in self._edgetags:
- return
+ if edge not in self._edgetags: return
c = self._chart_canvas
if linecolor is not None and textcolor is not None:
tags = self._edgetags[edge]
c.itemconfig(tags[0], fill=linecolor)
c.itemconfig(tags[1], fill=textcolor)
- c.itemconfig(tags[2], fill=textcolor, outline=textcolor)
+ c.itemconfig(tags[2], fill=textcolor,
+ outline=textcolor)
c.itemconfig(tags[3], fill=textcolor)
c.itemconfig(tags[4], fill=textcolor)
return
N = self._chart.num_leaves()
if edge in self._marks:
self._color_edge(self._marks[edge])
- if edge.is_complete() and edge.span() == (0, N):
- self._color_edge(edge, "#084", "#042")
+ if (edge.is_complete() and edge.span() == (0, N)):
+ self._color_edge(edge, '#084', '#042')
elif isinstance(edge, LeafEdge):
- self._color_edge(edge, "#48c", "#246")
+ self._color_edge(edge, '#48c', '#246')
else:
- self._color_edge(edge, "#00f", "#008")
+ self._color_edge(edge, '#00f', '#008')
- def mark_edge(self, edge, mark="#0df"):
+ def mark_edge(self, edge, mark='#0df'):
"""
Mark an edge
"""
del self._marks[edge]
self._color_edge(edge)
- def markonly_edge(self, edge, mark="#0df"):
+ def markonly_edge(self, edge, mark='#0df'):
self.unmark_edge()
self.mark_edge(edge, mark)
to be, How big the tree should be, etc.
"""
# Figure out the text height and the unit size.
- unitsize = 70 # min unitsize
+ unitsize = 70 # min unitsize
text_height = 0
c = self._chart_canvas
# Check against all tokens
for leaf in self._chart.leaves():
- tag = c.create_text(
- 0, 0, text=repr(leaf), font=self._font, anchor="nw", justify="left"
- )
+ tag = c.create_text(0,0, text=repr(leaf),
+ font=self._font,
+ anchor='nw', justify='left')
bbox = c.bbox(tag)
c.delete(tag)
width = bbox[2] + ChartView._LEAF_SPACING
self._unitsize = unitsize
self._text_height = text_height
- self._sentence_height = self._text_height + 2 * ChartView._MARGIN
+ self._sentence_height = (self._text_height +
+ 2*ChartView._MARGIN)
# Check against edges.
for edge in self._chart.edges():
self._chart_level_size = self._text_height * 2
# Default tree size..
- self._tree_height = 3 * (ChartView._TREE_LEVEL_SIZE + self._text_height)
+ self._tree_height = (3 * (ChartView._TREE_LEVEL_SIZE +
+ self._text_height))
# Resize the scrollregions.
self._resize()
c = self._chart_canvas
# Reset the chart scroll region
- width = self._chart.num_leaves() * self._unitsize + ChartView._MARGIN * 2
+ width = ( self._chart.num_leaves() * self._unitsize +
+ ChartView._MARGIN * 2 )
levels = len(self._edgelevels)
- self._chart_height = (levels + 2) * self._chart_level_size
- c["scrollregion"] = (0, 0, width, self._chart_height)
+ self._chart_height = (levels+2)*self._chart_level_size
+ c['scrollregion']=(0,0,width,self._chart_height)
# Reset the tree scroll region
if self._tree_canvas:
- self._tree_canvas["scrollregion"] = (0, 0, width, self._tree_height)
+ self._tree_canvas['scrollregion'] = (0, 0, width,
+ self._tree_height)
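# Worked example of the scroll-region arithmetic above: with 9 leaves,
# a unitsize of 30 and _MARGIN = 15, width = 9*30 + 2*15 = 300; with 4
# edge levels and _chart_level_size = 2 * a 12px text height = 24,
# chart height = (4+2)*24 = 144, giving scrollregion (0, 0, 300, 144).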
def _draw_loclines(self):
"""
c3 = self._chart_canvas
margin = ChartView._MARGIN
self._loclines = []
- for i in range(0, self._chart.num_leaves() + 1):
- x = i * self._unitsize + margin
+ for i in range(0, self._chart.num_leaves()+1):
+ x = i*self._unitsize + margin
if c1:
- t1 = c1.create_line(x, 0, x, BOTTOM)
+ t1=c1.create_line(x, 0, x, BOTTOM)
c1.tag_lower(t1)
if c2:
- t2 = c2.create_line(x, 0, x, self._sentence_height)
+ t2=c2.create_line(x, 0, x, self._sentence_height)
c2.tag_lower(t2)
- t3 = c3.create_line(x, 0, x, BOTTOM)
+ t3=c3.create_line(x, 0, x, BOTTOM)
c3.tag_lower(t3)
- t4 = c3.create_text(x + 2, 0, text=repr(i), anchor="nw", font=self._font)
+ t4=c3.create_text(x+2, 0, text=repr(i), anchor='nw',
+ font=self._font)
c3.tag_lower(t4)
- # if i % 4 == 0:
+ #if i % 4 == 0:
# if c1: c1.itemconfig(t1, width=2, fill='gray60')
# if c2: c2.itemconfig(t2, width=2, fill='gray60')
# c3.itemconfig(t3, width=2, fill='gray60')
if i % 2 == 0:
- if c1:
- c1.itemconfig(t1, fill="gray60")
- if c2:
- c2.itemconfig(t2, fill="gray60")
- c3.itemconfig(t3, fill="gray60")
+ if c1: c1.itemconfig(t1, fill='gray60')
+ if c2: c2.itemconfig(t2, fill='gray60')
+ c3.itemconfig(t3, fill='gray60')
else:
- if c1:
- c1.itemconfig(t1, fill="gray80")
- if c2:
- c2.itemconfig(t2, fill="gray80")
- c3.itemconfig(t3, fill="gray80")
+ if c1: c1.itemconfig(t1, fill='gray80')
+ if c2: c2.itemconfig(t2, fill='gray80')
+ c3.itemconfig(t3, fill='gray80')
def _draw_sentence(self):
"""Draw the sentence string."""
- if self._chart.num_leaves() == 0:
- return
+ if self._chart.num_leaves() == 0: return
c = self._sentence_canvas
margin = ChartView._MARGIN
y = ChartView._MARGIN
for i, leaf in enumerate(self._chart.leaves()):
x1 = i * self._unitsize + margin
x2 = x1 + self._unitsize
- x = (x1 + x2) / 2
- tag = c.create_text(
- x, y, text=repr(leaf), font=self._font, anchor="n", justify="left"
- )
+ x = (x1+x2)/2
+ tag = c.create_text(x, y, text=repr(leaf),
+ font=self._font,
+ anchor='n', justify='left')
bbox = c.bbox(tag)
- rt = c.create_rectangle(
- x1 + 2,
- bbox[1] - (ChartView._LEAF_SPACING / 2),
- x2 - 2,
- bbox[3] + (ChartView._LEAF_SPACING / 2),
- fill="#f0f0f0",
- outline="#f0f0f0",
- )
+ rt=c.create_rectangle(x1+2, bbox[1]-(ChartView._LEAF_SPACING/2),
+ x2-2, bbox[3]+(ChartView._LEAF_SPACING/2),
+ fill='#f0f0f0', outline='#f0f0f0')
c.tag_lower(rt)
def erase_tree(self):
- for tag in self._tree_tags:
- self._tree_canvas.delete(tag)
+ for tag in self._tree_tags: self._tree_canvas.delete(tag)
self._treetoks = []
self._treetoks_edge = None
self._treetoks_index = 0
def draw_tree(self, edge=None):
- if edge is None and self._treetoks_edge is None:
- return
- if edge is None:
- edge = self._treetoks_edge
+ if edge is None and self._treetoks_edge is None: return
+ if edge is None: edge = self._treetoks_edge
# If it's a new edge, then get a new list of treetoks.
if self._treetoks_edge != edge:
- self._treetoks = [t for t in self._chart.trees(edge) if isinstance(t, Tree)]
+ self._treetoks = [t for t in self._chart.trees(edge)
+ if isinstance(t, Tree)]
self._treetoks_edge = edge
self._treetoks_index = 0
# Make sure there's something to draw.
- if len(self._treetoks) == 0:
- return
+ if len(self._treetoks) == 0: return
# Erase the old tree.
- for tag in self._tree_tags:
- self._tree_canvas.delete(tag)
+ for tag in self._tree_tags: self._tree_canvas.delete(tag)
# Draw the new tree.
tree = self._treetoks[self._treetoks_index]
self._draw_treecycle()
# Update the scroll region.
- w = self._chart.num_leaves() * self._unitsize + 2 * ChartView._MARGIN
- h = tree.height() * (ChartView._TREE_LEVEL_SIZE + self._text_height)
- self._tree_canvas["scrollregion"] = (0, 0, w, h)
+ w = self._chart.num_leaves()*self._unitsize+2*ChartView._MARGIN
+ h = tree.height() * (ChartView._TREE_LEVEL_SIZE+self._text_height)
+ self._tree_canvas['scrollregion'] = (0, 0, w, h)
def cycle_tree(self):
- self._treetoks_index = (self._treetoks_index + 1) % len(self._treetoks)
+ self._treetoks_index = (self._treetoks_index+1)%len(self._treetoks)
self.draw_tree(self._treetoks_edge)
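# The modular step above cycles through candidate parse trees: with
# three treetoks the index advances 0 -> 1 -> 2 and wraps back to 0.
index = 2
index = (index + 1) % 3     # -> 0, so the first tree is shown again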
def _draw_treecycle(self):
- if len(self._treetoks) <= 1:
- return
+ if len(self._treetoks) <= 1: return
# Draw the label.
- label = "%d Trees" % len(self._treetoks)
+ label = '%d Trees' % len(self._treetoks)
c = self._tree_canvas
margin = ChartView._MARGIN
- right = self._chart.num_leaves() * self._unitsize + margin - 2
- tag = c.create_text(right, 2, anchor="ne", text=label, font=self._boldfont)
+ right = self._chart.num_leaves()*self._unitsize+margin-2
+ tag = c.create_text(right, 2, anchor='ne', text=label,
+ font=self._boldfont)
self._tree_tags.append(tag)
_, _, _, y = c.bbox(tag)
# Draw the triangles.
for i in range(len(self._treetoks)):
- x = right - 20 * (len(self._treetoks) - i - 1)
- if i == self._treetoks_index:
- fill = "#084"
- else:
- fill = "#fff"
- tag = c.create_polygon(
- x, y + 10, x - 5, y, x - 10, y + 10, fill=fill, outline="black"
- )
+ x = right - 20*(len(self._treetoks)-i-1)
+ if i == self._treetoks_index: fill = '#084'
+ else: fill = '#fff'
+ tag = c.create_polygon(x, y+10, x-5, y, x-10, y+10,
+ fill=fill, outline='black')
self._tree_tags.append(tag)
# Set up a callback: show the tree if they click on its
def cb(event, self=self, i=i):
self._treetoks_index = i
self.draw_tree()
-
- c.tag_bind(tag, "<Button-1>", cb)
+ c.tag_bind(tag, '<Button-1>', cb)
def _draw_treetok(self, treetok, index, depth=0):
"""
child_xs = []
for child in treetok:
if isinstance(child, Tree):
- child_x, index = self._draw_treetok(child, index, depth + 1)
+ child_x, index = self._draw_treetok(child, index, depth+1)
child_xs.append(child_x)
else:
- child_xs.append((2 * index + 1) * self._unitsize / 2 + margin)
+ child_xs.append((2*index+1)*self._unitsize/2 + margin)
index += 1
# If we have children, then get the node's x by averaging their
# node x's. Otherwise, make room for ourselves.
if child_xs:
- nodex = sum(child_xs) / len(child_xs)
+ nodex = sum(child_xs)/len(child_xs)
else:
# [XX] breaks for null productions.
- nodex = (2 * index + 1) * self._unitsize / 2 + margin
+ nodex = (2*index+1)*self._unitsize/2 + margin
index += 1
# Draw the node
nodey = depth * (ChartView._TREE_LEVEL_SIZE + self._text_height)
- tag = c.create_text(
- nodex,
- nodey,
- anchor="n",
- justify="center",
- text=str(treetok.label()),
- fill="#042",
- font=self._boldfont,
- )
+ tag = c.create_text(nodex, nodey, anchor='n', justify='center',
+ text=str(treetok.label()), fill='#042',
+ font=self._boldfont)
self._tree_tags.append(tag)
# Draw lines to the children.
for childx, child in zip(child_xs, treetok):
if isinstance(child, Tree) and child:
# A "real" tree token:
- tag = c.create_line(
- nodex,
- nodey + self._text_height,
- childx,
- childy,
- width=2,
- fill="#084",
- )
+ tag = c.create_line(nodex, nodey + self._text_height,
+ childx, childy, width=2, fill='#084')
self._tree_tags.append(tag)
if isinstance(child, Tree) and not child:
# An unexpanded tree token:
- tag = c.create_line(
- nodex,
- nodey + self._text_height,
- childx,
- childy,
- width=2,
- fill="#048",
- dash="2 3",
- )
+ tag = c.create_line(nodex, nodey + self._text_height,
+ childx, childy, width=2,
+ fill='#048', dash='2 3')
self._tree_tags.append(tag)
if not isinstance(child, Tree):
# A leaf:
- tag = c.create_line(
- nodex,
- nodey + self._text_height,
- childx,
- 10000,
- width=2,
- fill="#084",
- )
+ tag = c.create_line(nodex, nodey + self._text_height,
+ childx, 10000, width=2, fill='#084')
self._tree_tags.append(tag)
return nodex, index
Draw everything (from scratch).
"""
if self._tree_canvas:
- self._tree_canvas.delete("all")
+ self._tree_canvas.delete('all')
self.draw_tree()
if self._sentence_canvas:
- self._sentence_canvas.delete("all")
+ self._sentence_canvas.delete('all')
self._draw_sentence()
- self._chart_canvas.delete("all")
+ self._chart_canvas.delete('all')
self._edgetags = {}
# Redraw any edges we erased.
self._draw_loclines()
def add_callback(self, event, func):
- self._callbacks.setdefault(event, {})[func] = 1
+ self._callbacks.setdefault(event,{})[func] = 1
def remove_callback(self, event, func=None):
- if func is None:
- del self._callbacks[event]
+ if func is None: del self._callbacks[event]
else:
- try:
- del self._callbacks[event][func]
- except:
- pass
+ try: del self._callbacks[event][func]
+ except: pass
def _fire_callbacks(self, event, *args):
- if event not in self._callbacks:
- return
- for cb_func in list(self._callbacks[event].keys()):
- cb_func(*args)
-
+ if event not in self._callbacks: return
+ for cb_func in list(self._callbacks[event].keys()): cb_func(*args)
#######################################################################
# Edge Rules
# These versions of the chart rules only apply to a specific edge.
# This lets the user select an edge, and then apply a rule.
-
class EdgeRule(object):
"""
To create an edge rule, make an empty class that uses
EdgeRule as the first base class, and the basic rule as the
second base class. (Order matters!)
"""
-
def __init__(self, edge):
super = self.__class__.__bases__[1]
self._edge = edge
- self.NUM_EDGES = super.NUM_EDGES - 1
-
+ self.NUM_EDGES = super.NUM_EDGES-1
def apply(self, chart, grammar, *edges):
super = self.__class__.__bases__[1]
edges += (self._edge,)
- for e in super.apply(self, chart, grammar, *edges):
- yield e
-
+ for e in super.apply(self, chart, grammar, *edges): yield e
def __str__(self):
super = self.__class__.__bases__[1]
return super.__str__(self)
-
class TopDownPredictEdgeRule(EdgeRule, TopDownPredictRule):
pass
-
-
class BottomUpEdgeRule(EdgeRule, BottomUpPredictRule):
pass
-
-
class BottomUpLeftCornerEdgeRule(EdgeRule, BottomUpPredictCombineRule):
pass
-
-
class FundamentalEdgeRule(EdgeRule, SingleEdgeFundamentalRule):
pass
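# The `super = self.__class__.__bases__[1]` trick above, reduced to a
# runnable toy (names are illustrative, not from this module): the mixin
# must come first so its __init__/apply win, while __bases__[1] reaches
# the real rule class.
class BasicRule(object):
    NUM_EDGES = 1
    def apply(self, chart, grammar, edge):
        yield ('applied-to', edge)
class Bound(object):
    def __init__(self, edge):
        base = self.__class__.__bases__[1]
        self._edge = edge
        self.NUM_EDGES = base.NUM_EDGES - 1
    def apply(self, chart, grammar, *edges):
        base = self.__class__.__bases__[1]
        for e in base.apply(self, chart, grammar, *(edges + (self._edge,))):
            yield e
class BoundBasicRule(Bound, BasicRule): pass
rule = BoundBasicRule('edge-7')
print(list(rule.apply(None, None)))   # [('applied-to', 'edge-7')]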
-
#######################################################################
# Chart Parser Application
#######################################################################
-
class ChartParserApp(object):
- def __init__(self, grammar, tokens, title="Chart Parser Application"):
+ def __init__(self, grammar, tokens, title='Chart Parser Application'):
# Initialize the parser
self._init_parser(grammar, tokens)
# Create the root window.
self._root = Tk()
self._root.title(title)
- self._root.bind("<Control-q>", self.destroy)
+ self._root.bind('<Control-q>', self.destroy)
# Set up some frames.
frame3 = Frame(self._root)
frame2 = Frame(self._root)
frame1 = Frame(self._root)
- frame3.pack(side="bottom", fill="none")
- frame2.pack(side="bottom", fill="x")
- frame1.pack(side="bottom", fill="both", expand=1)
+ frame3.pack(side='bottom', fill='none')
+ frame2.pack(side='bottom', fill='x')
+ frame1.pack(side='bottom', fill='both', expand=1)
self._init_fonts(self._root)
self._init_animation()
self._init_bindings()
except:
- print("Error creating Tree View")
+ print('Error creating Tree View')
self.destroy()
raise
def destroy(self, *args):
- if self._root is None:
- return
+ if self._root is None: return
self._root.destroy()
self._root = None
from a script); otherwise, the demo will close as soon as
the script completes.
"""
- if in_idle():
- return
+ if in_idle(): return
self._root.mainloop(*args, **kwargs)
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Initialization Helpers
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
def _init_parser(self, grammar, tokens):
self._grammar = grammar
# What's our font size (default=same as sysfont)
self._size = IntVar(root)
- self._size.set(self._sysfont.cget("size"))
+ self._size.set(self._sysfont.cget('size'))
- self._boldfont = Font(family="helvetica", weight="bold", size=self._size.get())
- self._font = Font(family="helvetica", size=self._size.get())
+ self._boldfont = Font(family='helvetica', weight='bold',
+ size=self._size.get())
+ self._font = Font(family='helvetica',
+ size=self._size.get())
def _init_animation(self):
# Are we stepping? (default=yes)
# What's our animation speed (default=fast)
self._animate = IntVar(self._root)
- self._animate.set(3) # Default speed = fast
+ self._animate.set(3) # Default speed = fast
# Are we currently animating?
self._animating = 0
def _init_chartview(self, parent):
- self._cv = ChartView(self._chart, parent, draw_tree=1, draw_sentence=1)
- self._cv.add_callback("select", self._click_cv_edge)
+ self._cv = ChartView(self._chart, parent,
+ draw_tree=1, draw_sentence=1)
+ self._cv.add_callback('select', self._click_cv_edge)
def _init_rulelabel(self, parent):
- ruletxt = "Last edge generated by:"
-
- self._rulelabel1 = Label(parent, text=ruletxt, font=self._boldfont)
- self._rulelabel2 = Label(
- parent, width=40, relief="groove", anchor="w", font=self._boldfont
- )
- self._rulelabel1.pack(side="left")
- self._rulelabel2.pack(side="left")
- step = Checkbutton(parent, variable=self._step, text="Step")
- step.pack(side="right")
+ ruletxt = 'Last edge generated by:'
+
+ self._rulelabel1 = Label(parent,text=ruletxt,
+ font=self._boldfont)
+ self._rulelabel2 = Label(parent, width=40,
+ relief='groove', anchor='w',
+ font=self._boldfont)
+ self._rulelabel1.pack(side='left')
+ self._rulelabel2.pack(side='left')
+ step = Checkbutton(parent, variable=self._step,
+ text='Step')
+ step.pack(side='right')
def _init_buttons(self, parent):
frame1 = Frame(parent)
frame2 = Frame(parent)
- frame1.pack(side="bottom", fill="x")
- frame2.pack(side="top", fill="none")
-
- Button(
- frame1,
- text="Reset\nParser",
- background="#90c0d0",
- foreground="black",
- command=self.reset,
- ).pack(side="right")
+ frame1.pack(side='bottom', fill='x')
+ frame2.pack(side='top', fill='none')
+
+ Button(frame1, text='Reset\nParser',
+ background='#90c0d0', foreground='black',
+ command=self.reset).pack(side='right')
# Button(frame1, text='Pause',
# background='#90c0d0', foreground='black',
# command=self.pause).pack(side='left')
- Button(
- frame1,
- text="Top Down\nStrategy",
- background="#90c0d0",
- foreground="black",
- command=self.top_down_strategy,
- ).pack(side="left")
- Button(
- frame1,
- text="Bottom Up\nStrategy",
- background="#90c0d0",
- foreground="black",
- command=self.bottom_up_strategy,
- ).pack(side="left")
- Button(
- frame1,
- text="Bottom Up\nLeft-Corner Strategy",
- background="#90c0d0",
- foreground="black",
- command=self.bottom_up_leftcorner_strategy,
- ).pack(side="left")
-
- Button(
- frame2,
- text="Top Down Init\nRule",
- background="#90f090",
- foreground="black",
- command=self.top_down_init,
- ).pack(side="left")
- Button(
- frame2,
- text="Top Down Predict\nRule",
- background="#90f090",
- foreground="black",
- command=self.top_down_predict,
- ).pack(side="left")
- Frame(frame2, width=20).pack(side="left")
-
- Button(
- frame2,
- text="Bottom Up Predict\nRule",
- background="#90f090",
- foreground="black",
- command=self.bottom_up,
- ).pack(side="left")
- Frame(frame2, width=20).pack(side="left")
-
- Button(
- frame2,
- text="Bottom Up Left-Corner\nPredict Rule",
- background="#90f090",
- foreground="black",
- command=self.bottom_up_leftcorner,
- ).pack(side="left")
- Frame(frame2, width=20).pack(side="left")
-
- Button(
- frame2,
- text="Fundamental\nRule",
- background="#90f090",
- foreground="black",
- command=self.fundamental,
- ).pack(side="left")
+ Button(frame1, text='Top Down\nStrategy',
+ background='#90c0d0', foreground='black',
+ command=self.top_down_strategy).pack(side='left')
+ Button(frame1, text='Bottom Up\nStrategy',
+ background='#90c0d0', foreground='black',
+ command=self.bottom_up_strategy).pack(side='left')
+ Button(frame1, text='Bottom Up\nLeft-Corner Strategy',
+ background='#90c0d0', foreground='black',
+ command=self.bottom_up_leftcorner_strategy).pack(side='left')
+
+ Button(frame2, text='Top Down Init\nRule',
+ background='#90f090', foreground='black',
+ command=self.top_down_init).pack(side='left')
+ Button(frame2, text='Top Down Predict\nRule',
+ background='#90f090', foreground='black',
+ command=self.top_down_predict).pack(side='left')
+ Frame(frame2, width=20).pack(side='left')
+
+ Button(frame2, text='Bottom Up Predict\nRule',
+ background='#90f090', foreground='black',
+ command=self.bottom_up).pack(side='left')
+ Frame(frame2, width=20).pack(side='left')
+
+ Button(frame2, text='Bottom Up Left-Corner\nPredict Rule',
+ background='#90f090', foreground='black',
+ command=self.bottom_up_leftcorner).pack(side='left')
+ Frame(frame2, width=20).pack(side='left')
+
+ Button(frame2, text='Fundamental\nRule',
+ background='#90f090', foreground='black',
+ command=self.fundamental).pack(side='left')
def _init_bindings(self):
- self._root.bind("<Up>", self._cv.scroll_up)
- self._root.bind("<Down>", self._cv.scroll_down)
- self._root.bind("<Prior>", self._cv.page_up)
- self._root.bind("<Next>", self._cv.page_down)
- self._root.bind("<Control-q>", self.destroy)
- self._root.bind("<Control-x>", self.destroy)
- self._root.bind("<F1>", self.help)
-
- self._root.bind("<Control-s>", self.save_chart)
- self._root.bind("<Control-o>", self.load_chart)
- self._root.bind("<Control-r>", self.reset)
-
- self._root.bind("t", self.top_down_strategy)
- self._root.bind("b", self.bottom_up_strategy)
- self._root.bind("c", self.bottom_up_leftcorner_strategy)
- self._root.bind("<space>", self._stop_animation)
-
- self._root.bind("<Control-g>", self.edit_grammar)
- self._root.bind("<Control-t>", self.edit_sentence)
+ self._root.bind('<Up>', self._cv.scroll_up)
+ self._root.bind('<Down>', self._cv.scroll_down)
+ self._root.bind('<Prior>', self._cv.page_up)
+ self._root.bind('<Next>', self._cv.page_down)
+ self._root.bind('<Control-q>', self.destroy)
+ self._root.bind('<Control-x>', self.destroy)
+ self._root.bind('<F1>', self.help)
+
+ self._root.bind('<Control-s>', self.save_chart)
+ self._root.bind('<Control-o>', self.load_chart)
+ self._root.bind('<Control-r>', self.reset)
+
+ self._root.bind('t', self.top_down_strategy)
+ self._root.bind('b', self.bottom_up_strategy)
+ self._root.bind('c', self.bottom_up_leftcorner_strategy)
+ self._root.bind('<space>', self._stop_animation)
+
+ self._root.bind('<Control-g>', self.edit_grammar)
+ self._root.bind('<Control-t>', self.edit_sentence)
# Animation speed control
- self._root.bind("-", lambda e, a=self._animate: a.set(1))
- self._root.bind("=", lambda e, a=self._animate: a.set(2))
- self._root.bind("+", lambda e, a=self._animate: a.set(3))
+ self._root.bind('-', lambda e,a=self._animate:a.set(1))
+ self._root.bind('=', lambda e,a=self._animate:a.set(2))
+ self._root.bind('+', lambda e,a=self._animate:a.set(3))
# Step control
- self._root.bind("s", lambda e, s=self._step: s.set(not s.get()))
+ self._root.bind('s', lambda e,s=self._step:s.set(not s.get()))
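# The a=.../s=... defaults in the bindings above freeze each value when
# the lambda is defined -- the usual fix for late binding in closures:
frozen = [lambda e=None, n=n: n for n in (1, 2, 3)]
print([f() for f in frozen])    # [1, 2, 3]
late = [lambda e=None: n for n in (1, 2, 3)]
print([f() for f in late])      # [3, 3, 3] -- every lambda sees the last n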
def _init_menubar(self):
menubar = Menu(self._root)
filemenu = Menu(menubar, tearoff=0)
- filemenu.add_command(
- label="Save Chart",
- underline=0,
- command=self.save_chart,
- accelerator="Ctrl-s",
- )
- filemenu.add_command(
- label="Load Chart",
- underline=0,
- command=self.load_chart,
- accelerator="Ctrl-o",
- )
- filemenu.add_command(
- label="Reset Chart", underline=0, command=self.reset, accelerator="Ctrl-r"
- )
+ filemenu.add_command(label='Save Chart', underline=0,
+ command=self.save_chart, accelerator='Ctrl-s')
+ filemenu.add_command(label='Load Chart', underline=0,
+ command=self.load_chart, accelerator='Ctrl-o')
+ filemenu.add_command(label='Reset Chart', underline=0,
+ command=self.reset, accelerator='Ctrl-r')
filemenu.add_separator()
- filemenu.add_command(label="Save Grammar", command=self.save_grammar)
- filemenu.add_command(label="Load Grammar", command=self.load_grammar)
+ filemenu.add_command(label='Save Grammar',
+ command=self.save_grammar)
+ filemenu.add_command(label='Load Grammar',
+ command=self.load_grammar)
filemenu.add_separator()
- filemenu.add_command(
- label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x"
- )
- menubar.add_cascade(label="File", underline=0, menu=filemenu)
+ filemenu.add_command(label='Exit', underline=1,
+ command=self.destroy, accelerator='Ctrl-x')
+ menubar.add_cascade(label='File', underline=0, menu=filemenu)
editmenu = Menu(menubar, tearoff=0)
- editmenu.add_command(
- label="Edit Grammar",
- underline=5,
- command=self.edit_grammar,
- accelerator="Ctrl-g",
- )
- editmenu.add_command(
- label="Edit Text",
- underline=5,
- command=self.edit_sentence,
- accelerator="Ctrl-t",
- )
- menubar.add_cascade(label="Edit", underline=0, menu=editmenu)
+ editmenu.add_command(label='Edit Grammar', underline=5,
+ command=self.edit_grammar,
+ accelerator='Ctrl-g')
+ editmenu.add_command(label='Edit Text', underline=5,
+ command=self.edit_sentence,
+ accelerator='Ctrl-t')
+ menubar.add_cascade(label='Edit', underline=0, menu=editmenu)
viewmenu = Menu(menubar, tearoff=0)
- viewmenu.add_command(
- label="Chart Matrix", underline=6, command=self.view_matrix
- )
- viewmenu.add_command(label="Results", underline=0, command=self.view_results)
- menubar.add_cascade(label="View", underline=0, menu=viewmenu)
+ viewmenu.add_command(label='Chart Matrix', underline=6,
+ command=self.view_matrix)
+ viewmenu.add_command(label='Results', underline=0,
+ command=self.view_results)
+ menubar.add_cascade(label='View', underline=0, menu=viewmenu)
rulemenu = Menu(menubar, tearoff=0)
- rulemenu.add_command(
- label="Top Down Strategy",
- underline=0,
- command=self.top_down_strategy,
- accelerator="t",
- )
- rulemenu.add_command(
- label="Bottom Up Strategy",
- underline=0,
- command=self.bottom_up_strategy,
- accelerator="b",
- )
- rulemenu.add_command(
- label="Bottom Up Left-Corner Strategy",
- underline=0,
- command=self.bottom_up_leftcorner_strategy,
- accelerator="c",
- )
+ rulemenu.add_command(label='Top Down Strategy', underline=0,
+ command=self.top_down_strategy,
+ accelerator='t')
+ rulemenu.add_command(label='Bottom Up Strategy', underline=0,
+ command=self.bottom_up_strategy,
+ accelerator='b')
+ rulemenu.add_command(label='Bottom Up Left-Corner Strategy', underline=0,
+ command=self.bottom_up_leftcorner_strategy,
+ accelerator='c')
rulemenu.add_separator()
- rulemenu.add_command(label="Bottom Up Rule", command=self.bottom_up)
- rulemenu.add_command(
- label="Bottom Up Left-Corner Rule", command=self.bottom_up_leftcorner
- )
- rulemenu.add_command(label="Top Down Init Rule", command=self.top_down_init)
- rulemenu.add_command(
- label="Top Down Predict Rule", command=self.top_down_predict
- )
- rulemenu.add_command(label="Fundamental Rule", command=self.fundamental)
- menubar.add_cascade(label="Apply", underline=0, menu=rulemenu)
+ rulemenu.add_command(label='Bottom Up Rule',
+ command=self.bottom_up)
+ rulemenu.add_command(label='Bottom Up Left-Corner Rule',
+ command=self.bottom_up_leftcorner)
+ rulemenu.add_command(label='Top Down Init Rule',
+ command=self.top_down_init)
+ rulemenu.add_command(label='Top Down Predict Rule',
+ command=self.top_down_predict)
+ rulemenu.add_command(label='Fundamental Rule',
+ command=self.fundamental)
+ menubar.add_cascade(label='Apply', underline=0, menu=rulemenu)
animatemenu = Menu(menubar, tearoff=0)
- animatemenu.add_checkbutton(
- label="Step", underline=0, variable=self._step, accelerator="s"
- )
+ animatemenu.add_checkbutton(label="Step", underline=0,
+ variable=self._step,
+ accelerator='s')
animatemenu.add_separator()
- animatemenu.add_radiobutton(
- label="No Animation", underline=0, variable=self._animate, value=0
- )
- animatemenu.add_radiobutton(
- label="Slow Animation",
- underline=0,
- variable=self._animate,
- value=1,
- accelerator="-",
- )
- animatemenu.add_radiobutton(
- label="Normal Animation",
- underline=0,
- variable=self._animate,
- value=2,
- accelerator="=",
- )
- animatemenu.add_radiobutton(
- label="Fast Animation",
- underline=0,
- variable=self._animate,
- value=3,
- accelerator="+",
- )
+ animatemenu.add_radiobutton(label="No Animation", underline=0,
+ variable=self._animate, value=0)
+ animatemenu.add_radiobutton(label="Slow Animation", underline=0,
+ variable=self._animate, value=1,
+ accelerator='-')
+ animatemenu.add_radiobutton(label="Normal Animation", underline=0,
+ variable=self._animate, value=2,
+ accelerator='=')
+ animatemenu.add_radiobutton(label="Fast Animation", underline=0,
+ variable=self._animate, value=3,
+ accelerator='+')
menubar.add_cascade(label="Animate", underline=1, menu=animatemenu)
zoommenu = Menu(menubar, tearoff=0)
- zoommenu.add_radiobutton(
- label="Tiny",
- variable=self._size,
- underline=0,
- value=10,
- command=self.resize,
- )
- zoommenu.add_radiobutton(
- label="Small",
- variable=self._size,
- underline=0,
- value=12,
- command=self.resize,
- )
- zoommenu.add_radiobutton(
- label="Medium",
- variable=self._size,
- underline=0,
- value=14,
- command=self.resize,
- )
- zoommenu.add_radiobutton(
- label="Large",
- variable=self._size,
- underline=0,
- value=18,
- command=self.resize,
- )
- zoommenu.add_radiobutton(
- label="Huge",
- variable=self._size,
- underline=0,
- value=24,
- command=self.resize,
- )
- menubar.add_cascade(label="Zoom", underline=0, menu=zoommenu)
+ zoommenu.add_radiobutton(label='Tiny', variable=self._size,
+ underline=0, value=10, command=self.resize)
+ zoommenu.add_radiobutton(label='Small', variable=self._size,
+ underline=0, value=12, command=self.resize)
+ zoommenu.add_radiobutton(label='Medium', variable=self._size,
+ underline=0, value=14, command=self.resize)
+ zoommenu.add_radiobutton(label='Large', variable=self._size,
+ underline=0, value=18, command=self.resize)
+ zoommenu.add_radiobutton(label='Huge', variable=self._size,
+ underline=0, value=24, command=self.resize)
+ menubar.add_cascade(label='Zoom', underline=0, menu=zoommenu)
helpmenu = Menu(menubar, tearoff=0)
- helpmenu.add_command(label="About", underline=0, command=self.about)
- helpmenu.add_command(
- label="Instructions", underline=0, command=self.help, accelerator="F1"
- )
- menubar.add_cascade(label="Help", underline=0, menu=helpmenu)
+ helpmenu.add_command(label='About', underline=0,
+ command=self.about)
+ helpmenu.add_command(label='Instructions', underline=0,
+ command=self.help, accelerator='F1')
+ menubar.add_cascade(label='Help', underline=0, menu=helpmenu)
self._root.config(menu=menubar)
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Selection Handling
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
def _click_cv_edge(self, edge):
if edge != self._selection:
def _select_edge(self, edge):
self._selection = edge
# Update the chart view.
- self._cv.markonly_edge(edge, "#f00")
+ self._cv.markonly_edge(edge, '#f00')
self._cv.draw_tree(edge)
# Update the matrix view.
- if self._matrix:
- self._matrix.markonly_edge(edge)
- if self._matrix:
- self._matrix.view_edge(edge)
+ if self._matrix: self._matrix.markonly_edge(edge)
+ if self._matrix: self._matrix.view_edge(edge)
def _deselect_edge(self):
self._selection = None
self._cv.unmark_edge()
self._cv.erase_tree()
# Update the matrix view
- if self._matrix:
- self._matrix.unmark_edge()
+ if self._matrix: self._matrix.unmark_edge()
def _show_new_edge(self, edge):
self._display_rule(self._cp.current_chartrule())
# Update the chart view.
self._cv.update()
self._cv.draw_tree(edge)
- self._cv.markonly_edge(edge, "#0df")
+ self._cv.markonly_edge(edge, '#0df')
self._cv.view_edge(edge)
# Update the matrix view.
- if self._matrix:
- self._matrix.update()
- if self._matrix:
- self._matrix.markonly_edge(edge)
- if self._matrix:
- self._matrix.view_edge(edge)
+ if self._matrix: self._matrix.update()
+ if self._matrix: self._matrix.markonly_edge(edge)
+ if self._matrix: self._matrix.view_edge(edge)
# Update the results view.
- if self._results:
- self._results.update(edge)
+ if self._results: self._results.update(edge)
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Help/usage
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
def help(self, *e):
self._animating = 0
# The default font's not very legible; try using 'fixed' instead.
try:
- ShowText(
- self._root,
- "Help: Chart Parser Application",
- (__doc__ or "").strip(),
- width=75,
- font="fixed",
- )
+ ShowText(self._root, 'Help: Chart Parser Application',
+ (__doc__ or '').strip(), width=75, font='fixed')
except:
- ShowText(
- self._root,
- "Help: Chart Parser Application",
- (__doc__ or "").strip(),
- width=75,
- )
+ ShowText(self._root, 'Help: Chart Parser Application',
+ (__doc__ or '').strip(), width=75)
def about(self, *e):
- ABOUT = "NLTK Chart Parser Application\n" + "Written by Edward Loper"
- showinfo("About: Chart Parser Application", ABOUT)
+ ABOUT = ("NLTK Chart Parser Application\n"+
+ "Written by Edward Loper")
+ showinfo('About: Chart Parser Application', ABOUT)
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# File Menu
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
- CHART_FILE_TYPES = [("Pickle file", ".pickle"), ("All files", "*")]
- GRAMMAR_FILE_TYPES = [
- ("Plaintext grammar file", ".cfg"),
- ("Pickle file", ".pickle"),
- ("All files", "*"),
- ]
+ CHART_FILE_TYPES = [('Pickle file', '.pickle'),
+ ('All files', '*')]
+ GRAMMAR_FILE_TYPES = [('Plaintext grammar file', '.cfg'),
+ ('Pickle file', '.pickle'),
+ ('All files', '*')]
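+ # Each entry is a (description, pattern) pair, the format that the
+ # tkinter file dialogs expect for their 'filetypes' option.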
def load_chart(self, *args):
"Load a chart from a pickle file"
- filename = askopenfilename(
- filetypes=self.CHART_FILE_TYPES, defaultextension=".pickle"
- )
- if not filename:
- return
+ filename = askopenfilename(filetypes=self.CHART_FILE_TYPES,
+ defaultextension='.pickle')
+ if not filename: return
try:
- with open(filename, "rb") as infile:
+ with open(filename, 'rb') as infile:
chart = pickle.load(infile)
self._chart = chart
self._cv.update(chart)
- if self._matrix:
- self._matrix.set_chart(chart)
- if self._matrix:
- self._matrix.deselect_cell()
- if self._results:
- self._results.set_chart(chart)
+ if self._matrix: self._matrix.set_chart(chart)
+ if self._matrix: self._matrix.deselect_cell()
+ if self._results: self._results.set_chart(chart)
self._cp.set_chart(chart)
except Exception as e:
- showerror("Error Loading Chart", "Unable to open file: %r" % filename)
+ showerror('Error Loading Chart',
+ 'Unable to open file: %r' % filename)
def save_chart(self, *args):
"Save a chart to a pickle file"
- filename = asksaveasfilename(
- filetypes=self.CHART_FILE_TYPES, defaultextension=".pickle"
- )
- if not filename:
- return
+ filename = asksaveasfilename(filetypes=self.CHART_FILE_TYPES,
+ defaultextension='.pickle')
+ if not filename: return
try:
- with open(filename, "wb") as outfile:
+ with open(filename, 'wb') as outfile:
pickle.dump(self._chart, outfile)
except Exception as e:
- showerror("Error Saving Chart", "Unable to open file: %r" % filename)
+ showerror('Error Saving Chart',
+ 'Unable to open file: %r' % filename)
def load_grammar(self, *args):
"Load a grammar from a pickle file"
- filename = askopenfilename(
- filetypes=self.GRAMMAR_FILE_TYPES, defaultextension=".cfg"
- )
- if not filename:
- return
+ filename = askopenfilename(filetypes=self.GRAMMAR_FILE_TYPES,
+ defaultextension='.cfg')
+ if not filename: return
try:
- if filename.endswith(".pickle"):
- with open(filename, "rb") as infile:
+ if filename.endswith('.pickle'):
+ with open(filename, 'rb') as infile:
grammar = pickle.load(infile)
else:
- with open(filename, "r") as infile:
+ with open(filename, 'r') as infile:
grammar = CFG.fromstring(infile.read())
self.set_grammar(grammar)
except Exception as e:
- showerror("Error Loading Grammar", "Unable to open file: %r" % filename)
+ showerror('Error Loading Grammar',
+ 'Unable to open file: %r' % filename)
def save_grammar(self, *args):
- filename = asksaveasfilename(
- filetypes=self.GRAMMAR_FILE_TYPES, defaultextension=".cfg"
- )
- if not filename:
- return
+ filename = asksaveasfilename(filetypes=self.GRAMMAR_FILE_TYPES,
+ defaultextension='.cfg')
+ if not filename: return
try:
- if filename.endswith(".pickle"):
- with open(filename, "wb") as outfile:
+ if filename.endswith('.pickle'):
+ with open(filename, 'wb') as outfile:
pickle.dump(self._grammar, outfile)
else:
- with open(filename, "w") as outfile:
+ with open(filename, 'w') as outfile:
prods = self._grammar.productions()
start = [p for p in prods if p.lhs() == self._grammar.start()]
rest = [p for p in prods if p.lhs() != self._grammar.start()]
- for prod in start:
- outfile.write("%s\n" % prod)
- for prod in rest:
- outfile.write("%s\n" % prod)
+ for prod in start: outfile.write('%s\n' % prod)
+ for prod in rest: outfile.write('%s\n' % prod)
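+ # Start-symbol productions are written first so that reloading the
+ # file with CFG.fromstring() recovers the intended start symbol,
+ # which is taken from the first production listed.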
except Exception as e:
- showerror("Error Saving Grammar", "Unable to open file: %r" % filename)
+ showerror('Error Saving Grammar',
+ 'Unable to open file: %r' % filename)
def reset(self, *args):
self._animating = 0
self._reset_parser()
self._cv.update(self._chart)
- if self._matrix:
- self._matrix.set_chart(self._chart)
- if self._matrix:
- self._matrix.deselect_cell()
- if self._results:
- self._results.set_chart(self._chart)
-
- # ////////////////////////////////////////////////////////////
+ if self._matrix: self._matrix.set_chart(self._chart)
+ if self._matrix: self._matrix.deselect_cell()
+ if self._results: self._results.set_chart(self._chart)
+
+ #////////////////////////////////////////////////////////////
# Edit
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
def edit_grammar(self, *e):
CFGEditor(self._root, self._grammar, self.set_grammar)
def set_grammar(self, grammar):
self._grammar = grammar
self._cp.set_grammar(grammar)
- if self._results:
- self._results.set_grammar(grammar)
+ if self._results: self._results.set_grammar(grammar)
def edit_sentence(self, *e):
sentence = " ".join(self._tokens)
- title = "Edit Text"
- instr = "Enter a new sentence to parse."
+ title = 'Edit Text'
+ instr = 'Enter a new sentence to parse.'
EntryDialog(self._root, sentence, instr, self.set_sentence, title)
def set_sentence(self, sentence):
self._tokens = list(sentence.split())
self.reset()
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# View Menu
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
def view_matrix(self, *e):
- if self._matrix is not None:
- self._matrix.destroy()
+ if self._matrix is not None: self._matrix.destroy()
self._matrix = ChartMatrixView(self._root, self._chart)
- self._matrix.add_callback("select", self._select_matrix_edge)
+ self._matrix.add_callback('select', self._select_matrix_edge)
def view_results(self, *e):
- if self._results is not None:
- self._results.destroy()
- self._results = ChartResultsView(self._root, self._chart, self._grammar)
+ if self._results is not None: self._results.destroy()
+ self._results = ChartResultsView(self._root, self._chart,
+ self._grammar)
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Zoom Menu
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
def resize(self):
self._animating = 0
def get_font_size(self):
return abs(self._size.get())
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Parsing
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
def apply_strategy(self, strategy, edge_strategy=None):
# If we're animating, then stop.
# Clear the rule display & mark.
self._display_rule(None)
- # self._cv.unmark_edge()
+ #self._cv.unmark_edge()
if self._step.get():
selection = self._selection
self._animate_strategy()
else:
for edge in self._cpstep:
- if edge is None:
- break
+ if edge is None: break
self._cv.update()
- if self._matrix:
- self._matrix.update()
- if self._results:
- self._results.update()
+ if self._matrix: self._matrix.update()
+ if self._results: self._results.update()
def _stop_animation(self, *e):
self._animating = 0
def _animate_strategy(self, speed=1):
- if self._animating == 0:
- return
+ if self._animating == 0: return
if self._apply_strategy() is not None:
if self._animate.get() == 0 or self._step.get() == 1:
return
def _display_rule(self, rule):
if rule is None:
- self._rulelabel2["text"] = ""
+ self._rulelabel2['text'] = ''
else:
name = str(rule)
- self._rulelabel2["text"] = name
+ self._rulelabel2['text'] = name
size = self._cv.get_font_size()
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Parsing Strategies
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Basic rules:
- _TD_INIT = [TopDownInitRule()]
- _TD_PREDICT = [TopDownPredictRule()]
- _BU_RULE = [BottomUpPredictRule()]
- _BU_LC_RULE = [BottomUpPredictCombineRule()]
+ _TD_INIT = [TopDownInitRule()]
+ _TD_PREDICT = [TopDownPredictRule()]
+ _BU_RULE = [BottomUpPredictRule()]
+ _BU_LC_RULE = [BottomUpPredictCombineRule()]
_FUNDAMENTAL = [SingleEdgeFundamentalRule()]
# Complete strategies:
- _TD_STRATEGY = _TD_INIT + _TD_PREDICT + _FUNDAMENTAL
+ _TD_STRATEGY = _TD_INIT + _TD_PREDICT + _FUNDAMENTAL
_BU_STRATEGY = _BU_RULE + _FUNDAMENTAL
_BU_LC_STRATEGY = _BU_LC_RULE + _FUNDAMENTAL
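+ # Roughly speaking, a strategy is just an ordered list of chart rules:
+ # one or more seeding/prediction rules followed by the fundamental
+ # rule, which combines the predicted edges. apply_strategy() above
+ # steps the parser through whichever list the user selects.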
# Button callback functions:
def top_down_init(self, *e):
self.apply_strategy(self._TD_INIT, None)
-
def top_down_predict(self, *e):
self.apply_strategy(self._TD_PREDICT, TopDownPredictEdgeRule)
-
def bottom_up(self, *e):
self.apply_strategy(self._BU_RULE, BottomUpEdgeRule)
-
def bottom_up_leftcorner(self, *e):
self.apply_strategy(self._BU_LC_RULE, BottomUpLeftCornerEdgeRule)
-
def fundamental(self, *e):
self.apply_strategy(self._FUNDAMENTAL, FundamentalEdgeRule)
-
def bottom_up_strategy(self, *e):
self.apply_strategy(self._BU_STRATEGY, BottomUpEdgeRule)
-
def bottom_up_leftcorner_strategy(self, *e):
self.apply_strategy(self._BU_LC_STRATEGY, BottomUpLeftCornerEdgeRule)
-
def top_down_strategy(self, *e):
self.apply_strategy(self._TD_STRATEGY, TopDownPredictEdgeRule)
-
def app():
- grammar = CFG.fromstring(
- """
+ grammar = CFG.fromstring("""
# Grammatical productions.
S -> NP VP
VP -> VP PP | V NP | V
N -> 'dog' | 'cookie' | 'table' | 'cake' | 'fork'
V -> 'ate' | 'saw'
P -> 'on' | 'under' | 'with'
- """
- )
+ """)
- sent = "John ate the cake on the table with a fork"
- sent = "John ate the cake on the table"
+ sent = 'John ate the cake on the table with a fork'
+ sent = 'John ate the cake on the table'
tokens = list(sent.split())
- print("grammar= (")
+ print('grammar= (')
for rule in grammar.productions():
- print((" ", repr(rule) + ","))
- print(")")
- print(("tokens = %r" % tokens))
+ print(' ', repr(rule)+',')
+ print(')')
+ print('tokens = %r' % tokens)
print('Calling "ChartParserApp(grammar, tokens)"...')
ChartParserApp(grammar, tokens).mainloop()
-
-if __name__ == "__main__":
+if __name__ == '__main__':
app()
# Chart comparer:
- # charts = ['/tmp/earley.pickle',
+ #charts = ['/tmp/earley.pickle',
# '/tmp/topdown.pickle',
# '/tmp/bottomup.pickle']
- # ChartComparer(*charts).mainloop()
+ #ChartComparer(*charts).mainloop()
- # import profile
- # profile.run('demo2()', '/tmp/profile.out')
- # import pstats
- # p = pstats.Stats('/tmp/profile.out')
- # p.strip_dirs().sort_stats('time', 'cum').print_stats(60)
- # p.strip_dirs().sort_stats('cum', 'time').print_stats(60)
+ #import profile
+ #profile.run('demo2()', '/tmp/profile.out')
+ #import pstats
+ #p = pstats.Stats('/tmp/profile.out')
+ #p.strip_dirs().sort_stats('time', 'cum').print_stats(60)
+ #p.strip_dirs().sort_stats('cum', 'time').print_stats(60)
-__all__ = ["app"]
+__all__ = ['app']
# Natural Language Toolkit: Regexp Chunk Parser Application
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
# configuration parameters to select what's being chunked (eg VP vs NP)
# and what part of the data is being used as the development set.
+from __future__ import division
import time
import textwrap
import re
import random
-from tkinter import (
- Button,
- Canvas,
- Checkbutton,
- Frame,
- IntVar,
- Label,
- Menu,
- Scrollbar,
- Text,
- Tk,
-)
-from tkinter.filedialog import askopenfilename, asksaveasfilename
-from tkinter.font import Font
+from six.moves.tkinter import (Button, Canvas, Checkbutton, Frame, IntVar,
+ Label, Menu, Scrollbar, Text, Tk)
+from six.moves.tkinter_tkfiledialog import askopenfilename, asksaveasfilename
+from six.moves.tkinter_font import Font
from nltk.tree import Tree
from nltk.util import in_idle
from nltk.chunk import ChunkScore, RegexpChunkParser
from nltk.chunk.regexp import RegexpChunkRule
-
class RegexpChunkApp(object):
"""
A graphical tool for exploring the regular expression based chunk
#: which is used in the help text. (This should probably live with
#: the conll and/or treebank corpus instead.)
TAGSET = {
- "CC": "Coordinating conjunction",
- "PRP$": "Possessive pronoun",
- "CD": "Cardinal number",
- "RB": "Adverb",
- "DT": "Determiner",
- "RBR": "Adverb, comparative",
- "EX": "Existential there",
- "RBS": "Adverb, superlative",
- "FW": "Foreign word",
- "RP": "Particle",
- "JJ": "Adjective",
- "TO": "to",
- "JJR": "Adjective, comparative",
- "UH": "Interjection",
- "JJS": "Adjective, superlative",
- "VB": "Verb, base form",
- "LS": "List item marker",
- "VBD": "Verb, past tense",
- "MD": "Modal",
- "NNS": "Noun, plural",
- "NN": "Noun, singular or masps",
- "VBN": "Verb, past participle",
- "VBZ": "Verb,3rd ps. sing. present",
- "NNP": "Proper noun, singular",
- "NNPS": "Proper noun plural",
- "WDT": "wh-determiner",
- "PDT": "Predeterminer",
- "WP": "wh-pronoun",
- "POS": "Possessive ending",
- "WP$": "Possessive wh-pronoun",
- "PRP": "Personal pronoun",
- "WRB": "wh-adverb",
- "(": "open parenthesis",
- ")": "close parenthesis",
- "``": "open quote",
- ",": "comma",
- "''": "close quote",
- ".": "period",
- "#": "pound sign (currency marker)",
- "$": "dollar sign (currency marker)",
- "IN": "Preposition/subord. conjunction",
- "SYM": "Symbol (mathematical or scientific)",
- "VBG": "Verb, gerund/present participle",
- "VBP": "Verb, non-3rd ps. sing. present",
- ":": "colon",
- }
+ 'CC': 'Coordinating conjunction', 'PRP$': 'Possessive pronoun',
+ 'CD': 'Cardinal number', 'RB': 'Adverb',
+ 'DT': 'Determiner', 'RBR': 'Adverb, comparative',
+ 'EX': 'Existential there', 'RBS': 'Adverb, superlative',
+ 'FW': 'Foreign word', 'RP': 'Particle',
+ 'JJ': 'Adjective', 'TO': 'to',
+ 'JJR': 'Adjective, comparative', 'UH': 'Interjection',
+ 'JJS': 'Adjective, superlative', 'VB': 'Verb, base form',
+ 'LS': 'List item marker', 'VBD': 'Verb, past tense',
+ 'MD': 'Modal', 'NNS': 'Noun, plural',
+ 'NN': 'Noun, singular or mass', 'VBN': 'Verb, past participle',
+ 'VBZ': 'Verb, 3rd ps. sing. present', 'NNP': 'Proper noun, singular',
+ 'NNPS': 'Proper noun plural', 'WDT': 'wh-determiner',
+ 'PDT': 'Predeterminer', 'WP': 'wh-pronoun',
+ 'POS': 'Possessive ending', 'WP$': 'Possessive wh-pronoun',
+ 'PRP': 'Personal pronoun', 'WRB': 'wh-adverb',
+ '(': 'open parenthesis', ')': 'close parenthesis',
+ '``': 'open quote', ',': 'comma',
+ "''": 'close quote', '.': 'period',
+ '#': 'pound sign (currency marker)',
+ '$': 'dollar sign (currency marker)',
+ 'IN': 'Preposition/subord. conjunction',
+ 'SYM': 'Symbol (mathematical or scientific)',
+ 'VBG': 'Verb, gerund/present participle',
+ 'VBP': 'Verb, non-3rd ps. sing. present',
+ ':': 'colon',
+ }
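+ # This tag glossary is substituted into the 'Tags' help page below,
+ # wherever the <<TAGSET>> placeholder appears.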
#: Contents for the help box. This is a list of tuples, one for
#: each help page, where each tuple has three elements:
#: like <red>...</red> to colorize the text; see ``HELP_AUTOTAG``
#: for a list of tags you can use for colorizing.
HELP = [
- (
- "Help",
- "20",
- "Welcome to the regular expression chunk-parser grammar editor. "
- "You can use this editor to develop and test chunk parser grammars "
- "based on NLTK's RegexpChunkParser class.\n\n"
- # Help box.
- "Use this box ('Help') to learn more about the editor; click on the "
- "tabs for help on specific topics:"
- "<indent>\n"
- "Rules: grammar rule types\n"
- "Regexps: regular expression syntax\n"
- "Tags: part of speech tags\n</indent>\n"
- # Grammar.
- "Use the upper-left box ('Grammar') to edit your grammar. "
- "Each line of your grammar specifies a single 'rule', "
- "which performs an action such as creating a chunk or merging "
- "two chunks.\n\n"
- # Dev set.
- "The lower-left box ('Development Set') runs your grammar on the "
- "development set, and displays the results. "
- "Your grammar's chunks are <highlight>highlighted</highlight>, and "
- "the correct (gold standard) chunks are "
- "<underline>underlined</underline>. If they "
- "match, they are displayed in <green>green</green>; otherwise, "
- "they are displayed in <red>red</red>. The box displays a single "
- "sentence from the development set at a time; use the scrollbar or "
- "the next/previous buttons view additional sentences.\n\n"
- # Performance
- "The lower-right box ('Evaluation') tracks the performance of "
- "your grammar on the development set. The 'precision' axis "
- "indicates how many of your grammar's chunks are correct; and "
- "the 'recall' axis indicates how many of the gold standard "
- "chunks your system generated. Typically, you should try to "
- "design a grammar that scores high on both metrics. The "
- "exact precision and recall of the current grammar, as well "
- "as their harmonic mean (the 'f-score'), are displayed in "
- "the status bar at the bottom of the window.",
- ),
- (
- "Rules",
- "10",
- "<h1>{...regexp...}</h1>"
- "<indent>\nChunk rule: creates new chunks from words matching "
- "regexp.</indent>\n\n"
- "<h1>}...regexp...{</h1>"
- "<indent>\nChink rule: removes words matching regexp from existing "
- "chunks.</indent>\n\n"
- "<h1>...regexp1...}{...regexp2...</h1>"
- "<indent>\nSplit rule: splits chunks that match regexp1 followed by "
- "regexp2 in two.</indent>\n\n"
- "<h1>...regexp...{}...regexp...</h1>"
- "<indent>\nMerge rule: joins consecutive chunks that match regexp1 "
- "and regexp2</indent>\n",
- ),
- (
- "Regexps",
- "10 60",
- # "Regular Expression Syntax Summary:\n\n"
- "<h1>Pattern\t\tMatches...</h1>\n"
- "<hangindent>"
- "\t<<var>T</var>>\ta word with tag <var>T</var> "
- "(where <var>T</var> may be a regexp).\n"
- "\t<var>x</var>?\tan optional <var>x</var>\n"
- "\t<var>x</var>+\ta sequence of 1 or more <var>x</var>'s\n"
- "\t<var>x</var>*\ta sequence of 0 or more <var>x</var>'s\n"
- "\t<var>x</var>|<var>y</var>\t<var>x</var> or <var>y</var>\n"
- "\t.\tmatches any character\n"
- "\t(<var>x</var>)\tTreats <var>x</var> as a group\n"
- "\t# <var>x...</var>\tTreats <var>x...</var> "
- "(to the end of the line) as a comment\n"
- "\t\\<var>C</var>\tmatches character <var>C</var> "
- "(useful when <var>C</var> is a special character "
- "like + or #)\n"
- "</hangindent>"
- "\n<h1>Examples:</h1>\n"
- "<hangindent>"
- "\t<regexp><NN></regexp>\n"
- '\t\tMatches <match>"cow/NN"</match>\n'
- '\t\tMatches <match>"green/NN"</match>\n'
- "\t<regexp><VB.*></regexp>\n"
- '\t\tMatches <match>"eating/VBG"</match>\n'
- '\t\tMatches <match>"ate/VBD"</match>\n'
- "\t<regexp><IN><DT><NN></regexp>\n"
- '\t\tMatches <match>"on/IN the/DT car/NN"</match>\n'
- "\t<regexp><RB>?<VBD></regexp>\n"
- '\t\tMatches <match>"ran/VBD"</match>\n'
- '\t\tMatches <match>"slowly/RB ate/VBD"</match>\n'
- "\t<regexp><\#><CD> # This is a comment...</regexp>\n"
- '\t\tMatches <match>"#/# 100/CD"</match>\n'
- "</hangindent>",
- ),
- (
- "Tags",
- "10 60",
- "<h1>Part of Speech Tags:</h1>\n"
- + "<hangindent>"
- + "<<TAGSET>>"
- + "</hangindent>\n", # this gets auto-substituted w/ self.TAGSET
- ),
- ]
+ ('Help', '20',
+ "Welcome to the regular expression chunk-parser grammar editor. "
+ "You can use this editor to develop and test chunk parser grammars "
+ "based on NLTK's RegexpChunkParser class.\n\n"
+ # Help box.
+ "Use this box ('Help') to learn more about the editor; click on the "
+ "tabs for help on specific topics:"
+ "<indent>\n"
+ "Rules: grammar rule types\n"
+ "Regexps: regular expression syntax\n"
+ "Tags: part of speech tags\n</indent>\n"
+ # Grammar.
+ "Use the upper-left box ('Grammar') to edit your grammar. "
+ "Each line of your grammar specifies a single 'rule', "
+ "which performs an action such as creating a chunk or merging "
+ "two chunks.\n\n"
+ # Dev set.
+ "The lower-left box ('Development Set') runs your grammar on the "
+ "development set, and displays the results. "
+ "Your grammar's chunks are <highlight>highlighted</highlight>, and "
+ "the correct (gold standard) chunks are "
+ "<underline>underlined</underline>. If they "
+ "match, they are displayed in <green>green</green>; otherwise, "
+ "they are displayed in <red>red</red>. The box displays a single "
+ "sentence from the development set at a time; use the scrollbar or "
+ "the next/previous buttons view additional sentences.\n\n"
+ # Performance
+ "The lower-right box ('Evaluation') tracks the performance of "
+ "your grammar on the development set. The 'precision' axis "
+ "indicates how many of your grammar's chunks are correct; and "
+ "the 'recall' axis indicates how many of the gold standard "
+ "chunks your system generated. Typically, you should try to "
+ "design a grammar that scores high on both metrics. The "
+ "exact precision and recall of the current grammar, as well "
+ "as their harmonic mean (the 'f-score'), are displayed in "
+ "the status bar at the bottom of the window."
+ ),
+ ('Rules', '10',
+ "<h1>{...regexp...}</h1>"
+ "<indent>\nChunk rule: creates new chunks from words matching "
+ "regexp.</indent>\n\n"
+ "<h1>}...regexp...{</h1>"
+ "<indent>\nChink rule: removes words matching regexp from existing "
+ "chunks.</indent>\n\n"
+ "<h1>...regexp1...}{...regexp2...</h1>"
+ "<indent>\nSplit rule: splits chunks that match regexp1 followed by "
+ "regexp2 in two.</indent>\n\n"
+ "<h1>...regexp...{}...regexp...</h1>"
+ "<indent>\nMerge rule: joins consecutive chunks that match regexp1 "
+ "and regexp2</indent>\n"
+ ),
+ ('Regexps', '10 60',
+ #"Regular Expression Syntax Summary:\n\n"
+ "<h1>Pattern\t\tMatches...</h1>\n"
+ "<hangindent>"
+ "\t<<var>T</var>>\ta word with tag <var>T</var> "
+ "(where <var>T</var> may be a regexp).\n"
+ "\t<var>x</var>?\tan optional <var>x</var>\n"
+ "\t<var>x</var>+\ta sequence of 1 or more <var>x</var>'s\n"
+ "\t<var>x</var>*\ta sequence of 0 or more <var>x</var>'s\n"
+ "\t<var>x</var>|<var>y</var>\t<var>x</var> or <var>y</var>\n"
+ "\t.\tmatches any character\n"
+ "\t(<var>x</var>)\tTreats <var>x</var> as a group\n"
+ "\t# <var>x...</var>\tTreats <var>x...</var> "
+ "(to the end of the line) as a comment\n"
+ "\t\\<var>C</var>\tmatches character <var>C</var> "
+ "(useful when <var>C</var> is a special character "
+ "like + or #)\n"
+ "</hangindent>"
+ "\n<h1>Examples:</h1>\n"
+ "<hangindent>"
+ '\t<regexp><NN></regexp>\n'
+ '\t\tMatches <match>"cow/NN"</match>\n'
+ '\t\tMatches <match>"green/NN"</match>\n'
+ '\t<regexp><VB.*></regexp>\n'
+ '\t\tMatches <match>"eating/VBG"</match>\n'
+ '\t\tMatches <match>"ate/VBD"</match>\n'
+ '\t<regexp><IN><DT><NN></regexp>\n'
+ '\t\tMatches <match>"on/IN the/DT car/NN"</match>\n'
+ '\t<regexp><RB>?<VBD></regexp>\n'
+ '\t\tMatches <match>"ran/VBD"</match>\n'
+ '\t\tMatches <match>"slowly/RB ate/VBD"</match>\n'
+ '\t<regexp><\\#><CD> # This is a comment...</regexp>\n'
+ '\t\tMatches <match>"#/# 100/CD"</match>\n'
+ "</hangindent>"
+ ),
+ ('Tags', '10 60',
+ "<h1>Part of Speech Tags:</h1>\n" +
+ '<hangindent>' +
+ '<<TAGSET>>' + # this gets auto-substituted w/ self.TAGSET
+ '</hangindent>\n')
+ ]
HELP_AUTOTAG = [
- ("red", dict(foreground="#a00")),
- ("green", dict(foreground="#080")),
- ("highlight", dict(background="#ddd")),
- ("underline", dict(underline=True)),
- ("h1", dict(underline=True)),
- ("indent", dict(lmargin1=20, lmargin2=20)),
- ("hangindent", dict(lmargin1=0, lmargin2=60)),
- ("var", dict(foreground="#88f")),
- ("regexp", dict(foreground="#ba7")),
- ("match", dict(foreground="#6a6")),
- ]
+ ('red', dict(foreground='#a00')),
+ ('green', dict(foreground='#080')),
+ ('highlight', dict(background='#ddd')),
+ ('underline', dict(underline=True)),
+ ('h1', dict(underline=True)),
+ ('indent', dict(lmargin1=20, lmargin2=20)),
+ ('hangindent', dict(lmargin1=0, lmargin2=60)),
+ ('var', dict(foreground='#88f')),
+ ('regexp', dict(foreground='#ba7')),
+ ('match', dict(foreground='#6a6')),
+ ]
##/////////////////////////////////////////////////////////////////
## Config Parameters
demon each time it runs."""
_EVAL_FREQ = 0.2
"""The frequency (in seconds) at which the eval demon is run"""
- _EVAL_DEMON_MIN = 0.02
+ _EVAL_DEMON_MIN = .02
"""The minimum amount of time that the eval demon should take each time
it runs -- if it takes less than this time, _EVAL_CHUNK will be
modified upwards."""
- _EVAL_DEMON_MAX = 0.04
+ _EVAL_DEMON_MAX = .04
"""The maximum amount of time that the eval demon should take each time
it runs -- if it takes more than this time, _EVAL_CHUNK will be
modified downwards."""
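+ # Taken together, these knobs keep the GUI responsive: the eval demon
+ # wakes every _EVAL_FREQ seconds, and _EVAL_CHUNK is tuned so that each
+ # wake-up costs roughly _EVAL_DEMON_MIN.._EVAL_DEMON_MAX seconds of work.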
_GRAMMARBOX_PARAMS = dict(
- width=40,
- height=12,
- background="#efe",
- highlightbackground="#efe",
- highlightthickness=1,
- relief="groove",
- border=2,
- wrap="word",
- )
+ width=40, height=12, background='#efe', highlightbackground='#efe',
+ highlightthickness=1, relief='groove', border=2, wrap='word')
_HELPBOX_PARAMS = dict(
- width=15,
- height=15,
- background="#efe",
- highlightbackground="#efe",
- foreground="#555",
- highlightthickness=1,
- relief="groove",
- border=2,
- wrap="word",
- )
+ width=15, height=15, background='#efe', highlightbackground='#efe',
+ foreground='#555',
+ highlightthickness=1, relief='groove', border=2, wrap='word')
_DEVSETBOX_PARAMS = dict(
- width=70,
- height=10,
- background="#eef",
- highlightbackground="#eef",
- highlightthickness=1,
- relief="groove",
- border=2,
- wrap="word",
- tabs=(30,),
- )
- _STATUS_PARAMS = dict(background="#9bb", relief="groove", border=2)
- _FONT_PARAMS = dict(family="helvetica", size=-20)
- _FRAME_PARAMS = dict(background="#777", padx=2, pady=2, border=3)
+ width=70, height=10, background='#eef', highlightbackground='#eef',
+ highlightthickness=1, relief='groove', border=2, wrap='word',
+ tabs=(30,))
+ _STATUS_PARAMS = dict(
+ background='#9bb', relief='groove', border=2)
+ _FONT_PARAMS = dict(
+ family='helvetica', size=-20)
+ _FRAME_PARAMS = dict(
+ background='#777', padx=2, pady=2, border=3)
_EVALBOX_PARAMS = dict(
- background="#eef",
- highlightbackground="#eef",
- highlightthickness=1,
- relief="groove",
- border=2,
- width=300,
- height=280,
- )
+ background='#eef', highlightbackground='#eef',
+ highlightthickness=1, relief='groove', border=2,
+ width=300, height=280)
_BUTTON_PARAMS = dict(
- background="#777", activebackground="#777", highlightbackground="#777"
- )
- _HELPTAB_BG_COLOR = "#aba"
- _HELPTAB_FG_COLOR = "#efe"
+ background='#777', activebackground='#777',
+ highlightbackground='#777')
+ _HELPTAB_BG_COLOR = '#aba'
+ _HELPTAB_FG_COLOR = '#efe'
- _HELPTAB_FG_PARAMS = dict(background="#efe")
- _HELPTAB_BG_PARAMS = dict(background="#aba")
+ _HELPTAB_FG_PARAMS = dict(background='#efe')
+ _HELPTAB_BG_PARAMS = dict(background='#aba')
_HELPTAB_SPACER = 6
def normalize_grammar(self, grammar):
# Strip comments
- grammar = re.sub(r"((\\.|[^#])*)(#.*)?", r"\1", grammar)
+ grammar = re.sub(r'((\\.|[^#])*)(#.*)?', r'\1', grammar)
# Normalize whitespace
- grammar = re.sub(" +", " ", grammar)
- grammar = re.sub("\n\s+", "\n", grammar)
+ grammar = re.sub(' +', ' ', grammar)
+ grammar = re.sub(r'\n\s+', '\n', grammar)
grammar = grammar.strip()
# [xx] Hack: automatically backslash $!
- grammar = re.sub(r"([^\\])\$", r"\1\\$", grammar)
+ grammar = re.sub(r'([^\\])\$', r'\1\\$', grammar)
return grammar
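+ # For example, "NP:   {<DT>?<JJ>*<NN>}   # det-adj-noun" normalizes to
+ # "NP: {<DT>?<JJ>*<NN>}": the trailing comment is stripped and runs of
+ # spaces collapse. Likewise an unescaped $ (as in <PRP$>) is rewritten
+ # to <PRP\$> so it is not treated as a regexp end-anchor.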
- def __init__(
- self,
- devset_name="conll2000",
- devset=None,
- grammar="",
- chunk_label="NP",
- tagset=None,
- ):
+ def __init__(self, devset_name='conll2000', devset=None,
+ grammar='', chunk_label='NP', tagset=None):
"""
:param devset_name: The name of the development set; used for
display & for save files. If either the name 'treebank'
"""
self._chunk_label = chunk_label
- if tagset is None:
- tagset = self.TAGSET
+ if tagset is None: tagset = self.TAGSET
self.tagset = tagset
# Named development sets:
if devset is None:
- if devset_name == "conll2000":
- devset = conll2000.chunked_sents("train.txt") # [:100]
- elif devset == "treebank":
- devset = treebank_chunk.chunked_sents() # [:100]
+ if devset_name == 'conll2000':
+ devset = conll2000.chunked_sents('train.txt')#[:100]
+ elif devset_name == 'treebank':
+ devset = treebank_chunk.chunked_sents()#[:100]
else:
- raise ValueError("Unknown development set %s" % devset_name)
+ raise ValueError('Unknown development set %s' % devset_name)
self.chunker = None
"""The chunker built from the grammar string"""
# Set up the main window.
top = self.top = Tk()
- top.geometry("+50+50")
- top.title("Regexp Chunk Parser App")
- top.bind("<Control-q>", self.destroy)
+ top.geometry('+50+50')
+ top.title('Regexp Chunk Parser App')
+ top.bind('<Control-q>', self.destroy)
# Variable that restricts how much of the devset we look at.
self._devset_size = IntVar(top)
self._init_menubar(top)
self.grammarbox.focus()
+
# If a grammar was given, then display it.
if grammar:
- self.grammarbox.insert("end", grammar + "\n")
- self.grammarbox.mark_set("insert", "1.0")
+ self.grammarbox.insert('end', grammar+'\n')
+ self.grammarbox.mark_set('insert', '1.0')
# Display the first item in the development set
self.show_devset(0)
self.update()
def _init_bindings(self, top):
- top.bind("<Control-n>", self._devset_next)
- top.bind("<Control-p>", self._devset_prev)
- top.bind("<Control-t>", self.toggle_show_trace)
- top.bind("<KeyPress>", self.update)
- top.bind("<Control-s>", lambda e: self.save_grammar())
- top.bind("<Control-o>", lambda e: self.load_grammar())
- self.grammarbox.bind("<Control-t>", self.toggle_show_trace)
- self.grammarbox.bind("<Control-n>", self._devset_next)
- self.grammarbox.bind("<Control-p>", self._devset_prev)
+ top.bind('<Control-n>', self._devset_next)
+ top.bind('<Control-p>', self._devset_prev)
+ top.bind('<Control-t>', self.toggle_show_trace)
+ top.bind('<KeyPress>', self.update)
+ top.bind('<Control-s>', lambda e: self.save_grammar())
+ top.bind('<Control-o>', lambda e: self.load_grammar())
+ self.grammarbox.bind('<Control-t>', self.toggle_show_trace)
+ self.grammarbox.bind('<Control-n>', self._devset_next)
+ self.grammarbox.bind('<Control-p>', self._devset_prev)
# Redraw the eval graph when the window size changes
- self.evalbox.bind("<Configure>", self._eval_plot)
+ self.evalbox.bind('<Configure>', self._eval_plot)
def _init_fonts(self, top):
# What's our font size (default=same as sysfont)
self._size = IntVar(top)
self._size.set(20)
- self._font = Font(family="helvetica", size=-self._size.get())
- self._smallfont = Font(
- family="helvetica", size=-(int(self._size.get() * 14 // 20))
- )
+ self._font = Font(family='helvetica',
+ size=-self._size.get())
+ self._smallfont = Font(family='helvetica',
+ size=-(int(self._size.get()*14//20)))
def _init_menubar(self, parent):
menubar = Menu(parent)
filemenu = Menu(menubar, tearoff=0)
- filemenu.add_command(label="Reset Application", underline=0, command=self.reset)
- filemenu.add_command(
- label="Save Current Grammar",
- underline=0,
- accelerator="Ctrl-s",
- command=self.save_grammar,
- )
- filemenu.add_command(
- label="Load Grammar",
- underline=0,
- accelerator="Ctrl-o",
- command=self.load_grammar,
- )
-
- filemenu.add_command(
- label="Save Grammar History", underline=13, command=self.save_history
- )
-
- filemenu.add_command(
- label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-q"
- )
- menubar.add_cascade(label="File", underline=0, menu=filemenu)
+ filemenu.add_command(label='Reset Application', underline=0,
+ command=self.reset)
+ filemenu.add_command(label='Save Current Grammar', underline=0,
+ accelerator='Ctrl-s',
+ command=self.save_grammar)
+ filemenu.add_command(label='Load Grammar', underline=0,
+ accelerator='Ctrl-o',
+ command=self.load_grammar)
+
+ filemenu.add_command(label='Save Grammar History', underline=13,
+ command=self.save_history)
+
+ filemenu.add_command(label='Exit', underline=1,
+ command=self.destroy, accelerator='Ctrl-q')
+ menubar.add_cascade(label='File', underline=0, menu=filemenu)
viewmenu = Menu(menubar, tearoff=0)
- viewmenu.add_radiobutton(
- label="Tiny",
- variable=self._size,
- underline=0,
- value=10,
- command=self.resize,
- )
- viewmenu.add_radiobutton(
- label="Small",
- variable=self._size,
- underline=0,
- value=16,
- command=self.resize,
- )
- viewmenu.add_radiobutton(
- label="Medium",
- variable=self._size,
- underline=0,
- value=20,
- command=self.resize,
- )
- viewmenu.add_radiobutton(
- label="Large",
- variable=self._size,
- underline=0,
- value=24,
- command=self.resize,
- )
- viewmenu.add_radiobutton(
- label="Huge",
- variable=self._size,
- underline=0,
- value=34,
- command=self.resize,
- )
- menubar.add_cascade(label="View", underline=0, menu=viewmenu)
+ viewmenu.add_radiobutton(label='Tiny', variable=self._size,
+ underline=0, value=10, command=self.resize)
+ viewmenu.add_radiobutton(label='Small', variable=self._size,
+ underline=0, value=16, command=self.resize)
+ viewmenu.add_radiobutton(label='Medium', variable=self._size,
+ underline=0, value=20, command=self.resize)
+ viewmenu.add_radiobutton(label='Large', variable=self._size,
+ underline=0, value=24, command=self.resize)
+ viewmenu.add_radiobutton(label='Huge', variable=self._size,
+ underline=0, value=34, command=self.resize)
+ menubar.add_cascade(label='View', underline=0, menu=viewmenu)
devsetmenu = Menu(menubar, tearoff=0)
- devsetmenu.add_radiobutton(
- label="50 sentences",
- variable=self._devset_size,
- value=50,
- command=self.set_devset_size,
- )
- devsetmenu.add_radiobutton(
- label="100 sentences",
- variable=self._devset_size,
- value=100,
- command=self.set_devset_size,
- )
- devsetmenu.add_radiobutton(
- label="200 sentences",
- variable=self._devset_size,
- value=200,
- command=self.set_devset_size,
- )
- devsetmenu.add_radiobutton(
- label="500 sentences",
- variable=self._devset_size,
- value=500,
- command=self.set_devset_size,
- )
- menubar.add_cascade(label="Development-Set", underline=0, menu=devsetmenu)
+ devsetmenu.add_radiobutton(label='50 sentences',
+ variable=self._devset_size,
+ value=50, command=self.set_devset_size)
+ devsetmenu.add_radiobutton(label='100 sentences',
+ variable=self._devset_size,
+ value=100, command=self.set_devset_size)
+ devsetmenu.add_radiobutton(label='200 sentences',
+ variable=self._devset_size,
+ value=200, command=self.set_devset_size)
+ devsetmenu.add_radiobutton(label='500 sentences',
+ variable=self._devset_size,
+ value=500, command=self.set_devset_size)
+ menubar.add_cascade(label='Development-Set', underline=0,
+ menu=devsetmenu)
helpmenu = Menu(menubar, tearoff=0)
- helpmenu.add_command(label="About", underline=0, command=self.about)
- menubar.add_cascade(label="Help", underline=0, menu=helpmenu)
+ helpmenu.add_command(label='About', underline=0,
+ command=self.about)
+ menubar.add_cascade(label='Help', underline=0, menu=helpmenu)
parent.config(menu=menubar)
self.show_devset()
else:
self.show_trace()
- return "break"
+ return 'break'
- _SCALE_N = 5 # center on the last 5 examples.
- _DRAW_LINES = False
+ _SCALE_N = 5 # center on the last 5 examples.
+ _DRAW_LINES = False
def _eval_plot(self, *e, **config):
- width = config.get("width", self.evalbox.winfo_width())
- height = config.get("height", self.evalbox.winfo_height())
+ width = config.get('width', self.evalbox.winfo_width())
+ height = config.get('height', self.evalbox.winfo_height())
# Clear the canvas
- self.evalbox.delete("all")
+ self.evalbox.delete('all')
# Draw the precision & recall labels.
- tag = self.evalbox.create_text(
- 10, height // 2 - 10, justify="left", anchor="w", text="Precision"
- )
- left, right = self.evalbox.bbox(tag)[2] + 5, width - 10
- tag = self.evalbox.create_text(
- left + (width - left) // 2,
- height - 10,
- anchor="s",
- text="Recall",
- justify="center",
- )
- top, bot = 10, self.evalbox.bbox(tag)[1] - 10
+ tag = self.evalbox.create_text(10, height//2-10, justify='left',
+ anchor='w', text='Precision')
+ left, right = self.evalbox.bbox(tag)[2] + 5, width-10
+ tag = self.evalbox.create_text(left + (width-left)//2, height-10,
+ anchor='s', text='Recall', justify='center')
+ top, bot = 10, self.evalbox.bbox(tag)[1]-10
# Draw masks for clipping the plot.
- bg = self._EVALBOX_PARAMS["background"]
- self.evalbox.lower(
- self.evalbox.create_rectangle(0, 0, left - 1, 5000, fill=bg, outline=bg)
- )
- self.evalbox.lower(
- self.evalbox.create_rectangle(0, bot + 1, 5000, 5000, fill=bg, outline=bg)
- )
+ bg = self._EVALBOX_PARAMS['background']
+ self.evalbox.lower(self.evalbox.create_rectangle(0, 0, left-1, 5000,
+ fill=bg, outline=bg))
+ self.evalbox.lower(self.evalbox.create_rectangle(0, bot+1, 5000, 5000,
+ fill=bg, outline=bg))
# Calculate the plot's scale.
if self._autoscale.get() and len(self._history) > 1:
max_precision = max_recall = 0
min_precision = min_recall = 1
- for i in range(1, min(len(self._history), self._SCALE_N + 1)):
+ for i in range(1, min(len(self._history), self._SCALE_N+1)):
grammar, precision, recall, fmeasure = self._history[-i]
min_precision = min(precision, min_precision)
min_recall = min(recall, min_recall)
max_precision = max(precision, max_precision)
max_recall = max(recall, max_recall)
- # if max_precision-min_precision > max_recall-min_recall:
- # min_recall -= (max_precision-min_precision)/2
- # max_recall += (max_precision-min_precision)/2
- # else:
- # min_precision -= (max_recall-min_recall)/2
- # max_precision += (max_recall-min_recall)/2
- # if min_recall < 0:
- # max_recall -= min_recall
- # min_recall = 0
- # if min_precision < 0:
- # max_precision -= min_precision
- # min_precision = 0
- min_precision = max(min_precision - 0.01, 0)
- min_recall = max(min_recall - 0.01, 0)
- max_precision = min(max_precision + 0.01, 1)
- max_recall = min(max_recall + 0.01, 1)
+# if max_precision-min_precision > max_recall-min_recall:
+# min_recall -= (max_precision-min_precision)/2
+# max_recall += (max_precision-min_precision)/2
+# else:
+# min_precision -= (max_recall-min_recall)/2
+# max_precision += (max_recall-min_recall)/2
+# if min_recall < 0:
+# max_recall -= min_recall
+# min_recall = 0
+# if min_precision < 0:
+# max_precision -= min_precision
+# min_precision = 0
+ min_precision = max(min_precision-.01, 0)
+ min_recall = max(min_recall-.01, 0)
+ max_precision = min(max_precision+.01, 1)
+ max_recall = min(max_recall+.01, 1)
else:
min_precision = min_recall = 0
max_precision = max_recall = 1
# Draw the axis lines & grid lines
for i in range(11):
- x = left + (right - left) * (
- (i / 10.0 - min_recall) / (max_recall - min_recall)
- )
- y = bot - (bot - top) * (
- (i / 10.0 - min_precision) / (max_precision - min_precision)
- )
+ x = left + (right-left)*((i/10.-min_recall)/
+ (max_recall-min_recall))
+ y = bot - (bot-top)*((i/10.-min_precision)/
+ (max_precision-min_precision))
if left < x < right:
- self.evalbox.create_line(x, top, x, bot, fill="#888")
+ self.evalbox.create_line(x, top, x, bot, fill='#888')
if top < y < bot:
- self.evalbox.create_line(left, y, right, y, fill="#888")
+ self.evalbox.create_line(left, y, right, y, fill='#888')
self.evalbox.create_line(left, top, left, bot)
self.evalbox.create_line(left, bot, right, bot)
# Display the plot's scale
self.evalbox.create_text(
- left - 3,
- bot,
- justify="right",
- anchor="se",
- text="%d%%" % (100 * min_precision),
- )
+ left-3, bot, justify='right', anchor='se',
+ text='%d%%' % (100*min_precision))
self.evalbox.create_text(
- left - 3,
- top,
- justify="right",
- anchor="ne",
- text="%d%%" % (100 * max_precision),
- )
+ left-3, top, justify='right', anchor='ne',
+ text='%d%%' % (100*max_precision))
self.evalbox.create_text(
- left,
- bot + 3,
- justify="center",
- anchor="nw",
- text="%d%%" % (100 * min_recall),
- )
+ left, bot+3, justify='center', anchor='nw',
+ text='%d%%' % (100*min_recall))
self.evalbox.create_text(
- right,
- bot + 3,
- justify="center",
- anchor="ne",
- text="%d%%" % (100 * max_recall),
- )
+ right, bot+3, justify='center', anchor='ne',
+ text='%d%%' % (100*max_recall))
# Display the scores.
prev_x = prev_y = None
for i, (_, precision, recall, fscore) in enumerate(self._history):
- x = left + (right - left) * (
- (recall - min_recall) / (max_recall - min_recall)
- )
- y = bot - (bot - top) * (
- (precision - min_precision) / (max_precision - min_precision)
- )
+ x = left + (right-left) * ((recall-min_recall) /
+ (max_recall-min_recall))
+ y = bot - (bot-top) * ((precision-min_precision) /
+ (max_precision-min_precision))
if i == self._history_index:
- self.evalbox.create_oval(
- x - 2, y - 2, x + 2, y + 2, fill="#0f0", outline="#000"
- )
- self.status["text"] = (
- "Precision: %.2f%%\t" % (precision * 100)
- + "Recall: %.2f%%\t" % (recall * 100)
- + "F-score: %.2f%%" % (fscore * 100)
- )
+ self.evalbox.create_oval(x-2,y-2,x+2,y+2,
+ fill='#0f0', outline='#000')
+ self.status['text'] = (
+ 'Precision: %.2f%%\t' % (precision*100)+
+ 'Recall: %.2f%%\t' % (recall*100)+
+ 'F-score: %.2f%%' % (fscore*100))
else:
self.evalbox.lower(
- self.evalbox.create_oval(
- x - 2, y - 2, x + 2, y + 2, fill="#afa", outline="#8c8"
- )
- )
+ self.evalbox.create_oval(x-2,y-2,x+2,y+2,
+ fill='#afa', outline='#8c8'))
if prev_x is not None and self._eval_lines.get():
self.evalbox.lower(
- self.evalbox.create_line(prev_x, prev_y, x, y, fill="#8c8")
- )
+ self.evalbox.create_line(prev_x, prev_y, x, y,
+ fill='#8c8'))
prev_x, prev_y = x, y
_eval_demon_running = False
-
def _eval_demon(self):
- if self.top is None:
- return
+ if self.top is None: return
if self.chunker is None:
self._eval_demon_running = False
return
t0 = time.time()
# If the user is still typing, then wait for them to finish.
- if (
- time.time() - self._last_keypress < self._EVAL_DELAY
- and self.normalized_grammar != self._eval_normalized_grammar
- ):
+ if (time.time()-self._last_keypress < self._EVAL_DELAY and
+ self.normalized_grammar != self._eval_normalized_grammar):
self._eval_demon_running = True
- return self.top.after(int(self._EVAL_FREQ * 1000), self._eval_demon)
+ return self.top.after(int(self._EVAL_FREQ*1000), self._eval_demon)
# If the grammar changed, restart the evaluation.
if self.normalized_grammar != self._eval_normalized_grammar:
# just use the old evaluation values.
for (g, p, r, f) in self._history:
if self.normalized_grammar == self.normalize_grammar(g):
- self._history.append((g, p, r, f))
+ self._history.append( (g, p, r, f) )
self._history_index = len(self._history) - 1
self._eval_plot()
self._eval_demon_running = False
# If the grammar is empty, then don't bother evaluating it, or
# recording it in history -- the score will just be 0.
- if self.normalized_grammar.strip() == "":
- # self._eval_index = self._devset_size.get()
+ if self.normalized_grammar.strip() == '':
+ #self._eval_index = self._devset_size.get()
self._eval_demon_running = False
return
# Score the next set of examples
- for gold in self.devset[
- self._eval_index : min(
- self._eval_index + self._EVAL_CHUNK, self._devset_size.get()
- )
- ]:
+ for gold in self.devset[self._eval_index:
+ min(self._eval_index+self._EVAL_CHUNK,
+ self._devset_size.get())]:
guess = self._chunkparse(gold.leaves())
self._eval_score.score(gold, guess)
# Check if we're done
if self._eval_index >= self._devset_size.get():
- self._history.append(
- (
- self._eval_grammar,
- self._eval_score.precision(),
- self._eval_score.recall(),
- self._eval_score.f_measure(),
- )
- )
- self._history_index = len(self._history) - 1
+ self._history.append( (self._eval_grammar,
+ self._eval_score.precision(),
+ self._eval_score.recall(),
+ self._eval_score.f_measure()) )
+ self._history_index = len(self._history)-1
self._eval_plot()
self._eval_demon_running = False
self._eval_normalized_grammar = None
else:
- progress = 100 * self._eval_index / self._devset_size.get()
- self.status["text"] = "Evaluating on Development Set (%d%%)" % progress
+ progress = 100*self._eval_index/self._devset_size.get()
+ self.status['text'] = ('Evaluating on Development Set (%d%%)' %
+ progress)
self._eval_demon_running = True
self._adaptively_modify_eval_chunk(time.time() - t0)
- self.top.after(int(self._EVAL_FREQ * 1000), self._eval_demon)
+ self.top.after(int(self._EVAL_FREQ*1000), self._eval_demon)
def _adaptively_modify_eval_chunk(self, t):
"""
:param t: The amount of time that the eval demon took.
"""
if t > self._EVAL_DEMON_MAX and self._EVAL_CHUNK > 5:
- self._EVAL_CHUNK = min(
- self._EVAL_CHUNK - 1,
- max(
- int(self._EVAL_CHUNK * (self._EVAL_DEMON_MAX / t)),
- self._EVAL_CHUNK - 10,
- ),
- )
+ self._EVAL_CHUNK = min(self._EVAL_CHUNK-1,
+ max(int(self._EVAL_CHUNK*(self._EVAL_DEMON_MAX/t)),
+ self._EVAL_CHUNK-10))
elif t < self._EVAL_DEMON_MIN:
- self._EVAL_CHUNK = max(
- self._EVAL_CHUNK + 1,
- min(
- int(self._EVAL_CHUNK * (self._EVAL_DEMON_MIN / t)),
- self._EVAL_CHUNK + 10,
- ),
- )
+ self._EVAL_CHUNK = max(self._EVAL_CHUNK+1,
+ min(int(self._EVAL_CHUNK*(self._EVAL_DEMON_MIN/t)),
+ self._EVAL_CHUNK+10))
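+ # Worked example: with _EVAL_CHUNK = 50 and a pass that took t = 0.08s
+ # (twice _EVAL_DEMON_MAX), int(50 * 0.04/0.08) = 25, but the chunk only
+ # shrinks to max(25, 50-10) = 40: each adjustment changes the chunk
+ # size by at least 1 and at most 10 sentences.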
def _init_widgets(self, top):
frame0 = Frame(top, **self._FRAME_PARAMS)
frame0.grid_rowconfigure(5, weight=1)
# The grammar
- self.grammarbox = Text(frame0, font=self._font, **self._GRAMMARBOX_PARAMS)
- self.grammarlabel = Label(
- frame0,
- font=self._font,
- text="Grammar:",
- highlightcolor="black",
- background=self._GRAMMARBOX_PARAMS["background"],
- )
- self.grammarlabel.grid(column=0, row=0, sticky="SW")
- self.grammarbox.grid(column=0, row=1, sticky="NEWS")
+ self.grammarbox = Text(frame0, font=self._font,
+ **self._GRAMMARBOX_PARAMS)
+ self.grammarlabel = Label(frame0, font=self._font, text='Grammar:',
+ highlightcolor='black',
+ background=self._GRAMMARBOX_PARAMS['background'])
+ self.grammarlabel.grid(column=0, row=0, sticky='SW')
+ self.grammarbox.grid(column=0, row=1, sticky='NEWS')
# Scroll bar for grammar
grammar_scrollbar = Scrollbar(frame0, command=self.grammarbox.yview)
- grammar_scrollbar.grid(column=1, row=1, sticky="NWS")
+ grammar_scrollbar.grid(column=1, row=1, sticky='NWS')
self.grammarbox.config(yscrollcommand=grammar_scrollbar.set)
# grammar buttons
- bg = self._FRAME_PARAMS["background"]
+ bg = self._FRAME_PARAMS['background']
frame3 = Frame(frame0, background=bg)
- frame3.grid(column=0, row=2, sticky="EW")
- Button(
- frame3,
- text="Prev Grammar",
- command=self._history_prev,
- **self._BUTTON_PARAMS
- ).pack(side="left")
- Button(
- frame3,
- text="Next Grammar",
- command=self._history_next,
- **self._BUTTON_PARAMS
- ).pack(side="left")
+ frame3.grid(column=0, row=2, sticky='EW')
+ Button(frame3, text='Prev Grammar', command=self._history_prev,
+ **self._BUTTON_PARAMS).pack(side='left')
+ Button(frame3, text='Next Grammar', command=self._history_next,
+ **self._BUTTON_PARAMS).pack(side='left')
# Help box
- self.helpbox = Text(frame0, font=self._smallfont, **self._HELPBOX_PARAMS)
- self.helpbox.grid(column=3, row=1, sticky="NEWS")
+ self.helpbox = Text(frame0, font=self._smallfont,
+ **self._HELPBOX_PARAMS)
+ self.helpbox.grid(column=3, row=1, sticky='NEWS')
self.helptabs = {}
- bg = self._FRAME_PARAMS["background"]
+ bg = self._FRAME_PARAMS['background']
helptab_frame = Frame(frame0, background=bg)
- helptab_frame.grid(column=3, row=0, sticky="SW")
+ helptab_frame.grid(column=3, row=0, sticky='SW')
for i, (tab, tabstops, text) in enumerate(self.HELP):
label = Label(helptab_frame, text=tab, font=self._smallfont)
- label.grid(column=i * 2, row=0, sticky="S")
- # help_frame.grid_columnconfigure(i, weight=1)
- # label.pack(side='left')
- label.bind("<ButtonPress>", lambda e, tab=tab: self.show_help(tab))
+ label.grid(column=i*2, row=0, sticky='S')
+ #help_frame.grid_columnconfigure(i, weight=1)
+ #label.pack(side='left')
+ label.bind('<ButtonPress>', lambda e, tab=tab: self.show_help(tab))
self.helptabs[tab] = label
- Frame(
- helptab_frame, height=1, width=self._HELPTAB_SPACER, background=bg
- ).grid(column=i * 2 + 1, row=0)
+ Frame(helptab_frame, height=1, width=self._HELPTAB_SPACER,
+ background=bg).grid(column=i*2+1, row=0)
self.helptabs[self.HELP[0][0]].configure(font=self._font)
- self.helpbox.tag_config("elide", elide=True)
+ self.helpbox.tag_config('elide', elide=True)
for (tag, params) in self.HELP_AUTOTAG:
- self.helpbox.tag_config("tag-%s" % tag, **params)
+ self.helpbox.tag_config('tag-%s' % tag, **params)
self.show_help(self.HELP[0][0])
# Scroll bar for helpbox
help_scrollbar = Scrollbar(frame0, command=self.helpbox.yview)
self.helpbox.config(yscrollcommand=help_scrollbar.set)
- help_scrollbar.grid(column=4, row=1, sticky="NWS")
+ help_scrollbar.grid(column=4, row=1, sticky='NWS')
# The dev set
- frame4 = Frame(frame0, background=self._FRAME_PARAMS["background"])
- self.devsetbox = Text(frame4, font=self._font, **self._DEVSETBOX_PARAMS)
- self.devsetbox.pack(expand=True, fill="both")
- self.devsetlabel = Label(
- frame0,
- font=self._font,
- text="Development Set:",
- justify="right",
- background=self._DEVSETBOX_PARAMS["background"],
- )
- self.devsetlabel.grid(column=0, row=4, sticky="SW")
- frame4.grid(column=0, row=5, sticky="NEWS")
+ frame4 = Frame(frame0, background=self._FRAME_PARAMS['background'])
+ self.devsetbox = Text(frame4, font=self._font,
+ **self._DEVSETBOX_PARAMS)
+ self.devsetbox.pack(expand=True, fill='both')
+ self.devsetlabel = Label(frame0, font=self._font,
+ text='Development Set:', justify='right',
+ background=self._DEVSETBOX_PARAMS['background'])
+ self.devsetlabel.grid(column=0, row=4, sticky='SW')
+ frame4.grid(column=0, row=5, sticky='NEWS')
# dev set scrollbars
self.devset_scroll = Scrollbar(frame0, command=self._devset_scroll)
- self.devset_scroll.grid(column=1, row=5, sticky="NWS")
- self.devset_xscroll = Scrollbar(
- frame4, command=self.devsetbox.xview, orient="horiz"
- )
- self.devsetbox["xscrollcommand"] = self.devset_xscroll.set
- self.devset_xscroll.pack(side="bottom", fill="x")
+ self.devset_scroll.grid(column=1, row=5, sticky='NWS')
+ self.devset_xscroll = Scrollbar(frame4, command=self.devsetbox.xview,
+ orient='horiz')
+ self.devsetbox['xscrollcommand'] = self.devset_xscroll.set
+ self.devset_xscroll.pack(side='bottom', fill='x')
# dev set buttons
- bg = self._FRAME_PARAMS["background"]
+ bg = self._FRAME_PARAMS['background']
frame1 = Frame(frame0, background=bg)
- frame1.grid(column=0, row=7, sticky="EW")
- Button(
- frame1,
- text="Prev Example (Ctrl-p)",
- command=self._devset_prev,
- **self._BUTTON_PARAMS
- ).pack(side="left")
- Button(
- frame1,
- text="Next Example (Ctrl-n)",
- command=self._devset_next,
- **self._BUTTON_PARAMS
- ).pack(side="left")
- self.devset_button = Button(
- frame1,
- text="Show example",
- command=self.show_devset,
- state="disabled",
- **self._BUTTON_PARAMS
- )
- self.devset_button.pack(side="right")
- self.trace_button = Button(
- frame1, text="Show trace", command=self.show_trace, **self._BUTTON_PARAMS
- )
- self.trace_button.pack(side="right")
+ frame1.grid(column=0, row=7, sticky='EW')
+ Button(frame1, text='Prev Example (Ctrl-p)',
+ command=self._devset_prev,
+ **self._BUTTON_PARAMS).pack(side='left')
+ Button(frame1, text='Next Example (Ctrl-n)',
+ command=self._devset_next,
+ **self._BUTTON_PARAMS).pack(side='left')
+ self.devset_button = Button(frame1, text='Show example',
+ command=self.show_devset,
+ state='disabled',
+ **self._BUTTON_PARAMS)
+ self.devset_button.pack(side='right')
+ self.trace_button = Button(frame1, text='Show trace',
+ command=self.show_trace,
+ **self._BUTTON_PARAMS)
+ self.trace_button.pack(side='right')
+
# evaluation box
self.evalbox = Canvas(frame0, **self._EVALBOX_PARAMS)
- label = Label(
- frame0,
- font=self._font,
- text="Evaluation:",
- justify="right",
- background=self._EVALBOX_PARAMS["background"],
- )
- label.grid(column=3, row=4, sticky="SW")
- self.evalbox.grid(column=3, row=5, sticky="NEWS", columnspan=2)
+ label = Label(frame0, font=self._font, text='Evaluation:',
+ justify='right', background=self._EVALBOX_PARAMS['background'])
+ label.grid(column=3, row=4, sticky='SW')
+ self.evalbox.grid(column=3, row=5, sticky='NEWS', columnspan=2)
# evaluation box buttons
- bg = self._FRAME_PARAMS["background"]
+ bg = self._FRAME_PARAMS['background']
frame2 = Frame(frame0, background=bg)
- frame2.grid(column=3, row=7, sticky="EW")
+ frame2.grid(column=3, row=7, sticky='EW')
self._autoscale = IntVar(self.top)
self._autoscale.set(False)
- Checkbutton(
- frame2,
- variable=self._autoscale,
- command=self._eval_plot,
- text="Zoom",
- **self._BUTTON_PARAMS
- ).pack(side="left")
+ Checkbutton(frame2, variable=self._autoscale, command=self._eval_plot,
+ text='Zoom', **self._BUTTON_PARAMS).pack(side='left')
self._eval_lines = IntVar(self.top)
self._eval_lines.set(False)
- Checkbutton(
- frame2,
- variable=self._eval_lines,
- command=self._eval_plot,
- text="Lines",
- **self._BUTTON_PARAMS
- ).pack(side="left")
- Button(frame2, text="History", **self._BUTTON_PARAMS).pack(side="right")
+ Checkbutton(frame2, variable=self._eval_lines, command=self._eval_plot,
+ text='Lines', **self._BUTTON_PARAMS).pack(side='left')
+ Button(frame2, text='History',
+ **self._BUTTON_PARAMS).pack(side='right')
# The status label
self.status = Label(frame0, font=self._font, **self._STATUS_PARAMS)
- self.status.grid(column=0, row=9, sticky="NEW", padx=3, pady=2, columnspan=5)
+ self.status.grid(column=0, row=9, sticky='NEW', padx=3, pady=2,
+ columnspan=5)
# Help box & devset box can't be edited.
- self.helpbox["state"] = "disabled"
- self.devsetbox["state"] = "disabled"
+ self.helpbox['state'] = 'disabled'
+ self.devsetbox['state'] = 'disabled'
# Spacers
- bg = self._FRAME_PARAMS["background"]
+ bg = self._FRAME_PARAMS['background']
Frame(frame0, height=10, width=0, background=bg).grid(column=0, row=3)
Frame(frame0, height=0, width=10, background=bg).grid(column=2, row=0)
Frame(frame0, height=6, width=0, background=bg).grid(column=0, row=8)
# pack the frame.
- frame0.pack(fill="both", expand=True)
+ frame0.pack(fill='both', expand=True)
# Set up colors for the devset box
- self.devsetbox.tag_config("true-pos", background="#afa", underline="True")
- self.devsetbox.tag_config("false-neg", underline="True", foreground="#800")
- self.devsetbox.tag_config("false-pos", background="#faa")
- self.devsetbox.tag_config("trace", foreground="#666", wrap="none")
- self.devsetbox.tag_config("wrapindent", lmargin2=30, wrap="none")
- self.devsetbox.tag_config("error", foreground="#800")
+ self.devsetbox.tag_config('true-pos', background='#afa',
+ underline='True')
+ self.devsetbox.tag_config('false-neg', underline='True',
+ foreground='#800')
+ self.devsetbox.tag_config('false-pos', background='#faa')
+ self.devsetbox.tag_config('trace', foreground='#666', wrap='none')
+ self.devsetbox.tag_config('wrapindent', lmargin2=30, wrap='none')
+ self.devsetbox.tag_config('error', foreground='#800')
# And for the grammarbox
- self.grammarbox.tag_config("error", background="#fec")
- self.grammarbox.tag_config("comment", foreground="#840")
- self.grammarbox.tag_config("angle", foreground="#00f")
- self.grammarbox.tag_config("brace", foreground="#0a0")
- self.grammarbox.tag_config("hangindent", lmargin1=0, lmargin2=40)
+ self.grammarbox.tag_config('error', background='#fec')
+ self.grammarbox.tag_config('comment', foreground='#840')
+ self.grammarbox.tag_config('angle', foreground='#00f')
+ self.grammarbox.tag_config('brace', foreground='#0a0')
+ self.grammarbox.tag_config('hangindent', lmargin1=0, lmargin2=40)
_showing_trace = False
-
def show_trace(self, *e):
self._showing_trace = True
- self.trace_button["state"] = "disabled"
- self.devset_button["state"] = "normal"
+ self.trace_button['state'] = 'disabled'
+ self.devset_button['state'] = 'normal'
- self.devsetbox["state"] = "normal"
- # self.devsetbox['wrap'] = 'none'
- self.devsetbox.delete("1.0", "end")
- self.devsetlabel["text"] = "Development Set (%d/%d)" % (
- (self.devset_index + 1, self._devset_size.get())
- )
+ self.devsetbox['state'] = 'normal'
+ #self.devsetbox['wrap'] = 'none'
+ self.devsetbox.delete('1.0', 'end')
+ self.devsetlabel['text']='Development Set (%d/%d)' % (
+ (self.devset_index+1, self._devset_size.get()))
if self.chunker is None:
- self.devsetbox.insert("1.0", "Trace: waiting for a valid grammar.")
- self.devsetbox.tag_add("error", "1.0", "end")
- return # can't do anything more
+ self.devsetbox.insert('1.0', 'Trace: waiting for a valid grammar.')
+ self.devsetbox.tag_add('error', '1.0', 'end')
+ return # can't do anything more
gold_tree = self.devset[self.devset_index]
rules = self.chunker.rules()
# Calculate the tag sequence
- tagseq = "\t"
+ tagseq = '\t'
charnum = [1]
for wordnum, (word, pos) in enumerate(gold_tree.leaves()):
- tagseq += "%s " % pos
+ tagseq += '%s ' % pos
charnum.append(len(tagseq))
- self.charnum = dict(
- ((i, j), charnum[j])
- for i in range(len(rules) + 1)
- for j in range(len(charnum))
- )
- self.linenum = dict((i, i * 2 + 2) for i in range(len(rules) + 1))
-
- for i in range(len(rules) + 1):
+ self.charnum = dict(((i, j), charnum[j])
+ for i in range(len(rules)+1)
+ for j in range(len(charnum)))
+ self.linenum = dict((i,i*2+2) for i in range(len(rules)+1))
+
+ for i in range(len(rules)+1):
if i == 0:
- self.devsetbox.insert("end", "Start:\n")
- self.devsetbox.tag_add("trace", "end -2c linestart", "end -2c")
+ self.devsetbox.insert('end', 'Start:\n')
+ self.devsetbox.tag_add('trace', 'end -2c linestart', 'end -2c')
else:
- self.devsetbox.insert("end", "Apply %s:\n" % rules[i - 1])
- self.devsetbox.tag_add("trace", "end -2c linestart", "end -2c")
+ self.devsetbox.insert('end', 'Apply %s:\n' % rules[i-1])
+ self.devsetbox.tag_add('trace', 'end -2c linestart', 'end -2c')
# Display the tag sequence.
- self.devsetbox.insert("end", tagseq + "\n")
- self.devsetbox.tag_add("wrapindent", "end -2c linestart", "end -2c")
+ self.devsetbox.insert('end', tagseq+'\n')
+ self.devsetbox.tag_add('wrapindent','end -2c linestart','end -2c')
# Run a partial parser, and extract gold & test chunks
chunker = RegexpChunkParser(rules[:i])
test_tree = self._chunkparse(gold_tree.leaves())
test_chunks = self._chunks(test_tree)
# Compare them.
for chunk in gold_chunks.intersection(test_chunks):
- self._color_chunk(i, chunk, "true-pos")
+ self._color_chunk(i, chunk, 'true-pos')
for chunk in gold_chunks - test_chunks:
- self._color_chunk(i, chunk, "false-neg")
+ self._color_chunk(i, chunk, 'false-neg')
for chunk in test_chunks - gold_chunks:
- self._color_chunk(i, chunk, "false-pos")
- self.devsetbox.insert("end", "Finished.\n")
- self.devsetbox.tag_add("trace", "end -2c linestart", "end -2c")
+ self._color_chunk(i, chunk, 'false-pos')
+ self.devsetbox.insert('end', 'Finished.\n')
+ self.devsetbox.tag_add('trace', 'end -2c linestart', 'end -2c')
# This is a hack, because the x-scrollbar isn't updating its
# position right -- I'm not sure what the underlying cause is
# though. (This is on OS X w/ python 2.5)
- self.top.after(100, self.devset_xscroll.set, 0, 0.3)
+ self.top.after(100, self.devset_xscroll.set, 0, .3)
def show_help(self, tab):
- self.helpbox["state"] = "normal"
- self.helpbox.delete("1.0", "end")
+ self.helpbox['state'] = 'normal'
+ self.helpbox.delete('1.0', 'end')
for (name, tabstops, text) in self.HELP:
if name == tab:
- text = text.replace(
- "<<TAGSET>>",
- "\n".join(
- (
- "\t%s\t%s" % item
- for item in sorted(
- list(self.tagset.items()),
- key=lambda t_w: re.match("\w+", t_w[0])
- and (0, t_w[0])
- or (1, t_w[0]),
- )
- )
- ),
- )
+ text = text.replace('<<TAGSET>>', '\n'.join(
+ ('\t%s\t%s' % item for item in sorted(list(self.tagset.items()),
+ key=lambda t_w:re.match('\w+',t_w[0]) and (0,t_w[0]) or (1,t_w[0])))))
self.helptabs[name].config(**self._HELPTAB_FG_PARAMS)
self.helpbox.config(tabs=tabstops)
- self.helpbox.insert("1.0", text + "\n" * 20)
- C = "1.0 + %d chars"
+ self.helpbox.insert('1.0', text+'\n'*20)
+ C = '1.0 + %d chars'
for (tag, params) in self.HELP_AUTOTAG:
- pattern = "(?s)(<%s>)(.*?)(</%s>)" % (tag, tag)
+ pattern = '(?s)(<%s>)(.*?)(</%s>)' % (tag, tag)
for m in re.finditer(pattern, text):
- self.helpbox.tag_add("elide", C % m.start(1), C % m.end(1))
- self.helpbox.tag_add(
- "tag-%s" % tag, C % m.start(2), C % m.end(2)
- )
- self.helpbox.tag_add("elide", C % m.start(3), C % m.end(3))
+ self.helpbox.tag_add('elide',
+ C % m.start(1), C % m.end(1))
+ self.helpbox.tag_add('tag-%s' % tag,
+ C % m.start(2), C % m.end(2))
+ self.helpbox.tag_add('elide',
+ C % m.start(3), C % m.end(3))
else:
self.helptabs[name].config(**self._HELPTAB_BG_PARAMS)
- self.helpbox["state"] = "disabled"
+ self.helpbox['state'] = 'disabled'
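# A standalone sketch of the elide trick used by show_help above: the literal
# <b>...</b> markers stay in the Text widget, but a tag with elide=True hides
# them, while the span between them gets a styling tag. The tag names and
# font here are placeholders.
import re
from six.moves.tkinter import Tk, Text
root = Tk()
helpbox = Text(root)
helpbox.pack()
helpbox.tag_config('elide', elide=True)
helpbox.tag_config('tag-b', font=('helvetica', 10, 'bold'))
text = 'plain <b>bold</b> plain'
helpbox.insert('1.0', text)
C = '1.0 + %d chars'
for m in re.finditer('(?s)(<b>)(.*?)(</b>)', text):
    helpbox.tag_add('elide', C % m.start(1), C % m.end(1))
    helpbox.tag_add('tag-b', C % m.start(2), C % m.end(2))
    helpbox.tag_add('elide', C % m.start(3), C % m.end(3))
root.mainloop()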
def _history_prev(self, *e):
- self._view_history(self._history_index - 1)
- return "break"
+ self._view_history(self._history_index-1)
+ return 'break'
def _history_next(self, *e):
- self._view_history(self._history_index + 1)
- return "break"
+ self._view_history(self._history_index+1)
+ return 'break'
def _view_history(self, index):
# Bounds & sanity checking:
- index = max(0, min(len(self._history) - 1, index))
- if not self._history:
- return
+ index = max(0, min(len(self._history)-1, index))
+ if not self._history: return
# Already viewing the requested history item?
if index == self._history_index:
return
# Show the requested grammar. It will get added to _history
# only if they edit it (causing self.update() to get run.)
- self.grammarbox["state"] = "normal"
- self.grammarbox.delete("1.0", "end")
- self.grammarbox.insert("end", self._history[index][0])
- self.grammarbox.mark_set("insert", "1.0")
+ self.grammarbox['state'] = 'normal'
+ self.grammarbox.delete('1.0', 'end')
+ self.grammarbox.insert('end', self._history[index][0])
+ self.grammarbox.mark_set('insert', '1.0')
self._history_index = index
self._syntax_highlight_grammar(self._history[index][0])
# Record the normalized grammar & regenerate the chunker.
- self.normalized_grammar = self.normalize_grammar(self._history[index][0])
+ self.normalized_grammar = self.normalize_grammar(
+ self._history[index][0])
if self.normalized_grammar:
- rules = [
- RegexpChunkRule.fromstring(line)
- for line in self.normalized_grammar.split("\n")
- ]
+ rules = [RegexpChunkRule.fromstring(line)
+ for line in self.normalized_grammar.split('\n')]
else:
rules = []
self.chunker = RegexpChunkParser(rules)
self._eval_plot()
# Update the devset box
self._highlight_devset()
- if self._showing_trace:
- self.show_trace()
+ if self._showing_trace: self.show_trace()
# Update the grammar label
- if self._history_index < len(self._history) - 1:
- self.grammarlabel["text"] = "Grammar %s/%s:" % (
- self._history_index + 1,
- len(self._history),
- )
+ if self._history_index < len(self._history)-1:
+ self.grammarlabel['text'] = 'Grammar %s/%s:' % (
+ self._history_index+1, len(self._history))
else:
- self.grammarlabel["text"] = "Grammar:"
+ self.grammarlabel['text'] = 'Grammar:'
def _devset_next(self, *e):
- self._devset_scroll("scroll", 1, "page")
- return "break"
+ self._devset_scroll('scroll', 1, 'page')
+ return 'break'
def _devset_prev(self, *e):
- self._devset_scroll("scroll", -1, "page")
- return "break"
+ self._devset_scroll('scroll', -1, 'page')
+ return 'break'
def destroy(self, *e):
- if self.top is None:
- return
+ if self.top is None: return
self.top.destroy()
self.top = None
def _devset_scroll(self, command, *args):
- N = 1 # size of a page -- one sentence.
+ N = 1 # size of a page -- one sentence.
showing_trace = self._showing_trace
- if command == "scroll" and args[1].startswith("unit"):
- self.show_devset(self.devset_index + int(args[0]))
- elif command == "scroll" and args[1].startswith("page"):
- self.show_devset(self.devset_index + N * int(args[0]))
- elif command == "moveto":
- self.show_devset(int(float(args[0]) * self._devset_size.get()))
+ if command == 'scroll' and args[1].startswith('unit'):
+ self.show_devset(self.devset_index+int(args[0]))
+ elif command == 'scroll' and args[1].startswith('page'):
+ self.show_devset(self.devset_index+N*int(args[0]))
+ elif command == 'moveto':
+ self.show_devset(int(float(args[0])*self._devset_size.get()))
else:
- assert 0, "bad scroll command %s %s" % (command, args)
+ assert 0, 'bad scroll command %s %s' % (command, args)
if showing_trace:
self.show_trace()
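# A standalone sketch of the Tk scrollbar protocol that _devset_scroll
# implements (hypothetical handler): dragging the thumb invokes the command
# as ('moveto', fraction); the arrows and trough invoke it as ('scroll', n,
# 'units') or ('scroll', n, 'pages'). The widget reports the visible
# fraction back with scrollbar.set(first, last), as show_devset does below.
from six.moves.tkinter import Tk, Scrollbar
root = Tk()
def on_scroll(command, *args):
    print('%s %s' % (command, args))  # e.g. "moveto ('0.42',)"
sb = Scrollbar(root, command=on_scroll)
sb.pack(side='right', fill='y')
sb.set(0.0, 0.1)  # thumb covers the first 10% of the range
root.mainloop()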
def show_devset(self, index=None):
- if index is None:
- index = self.devset_index
+ if index is None: index = self.devset_index
# Bounds checking
- index = min(max(0, index), self._devset_size.get() - 1)
+ index = min(max(0, index), self._devset_size.get()-1)
- if index == self.devset_index and not self._showing_trace:
- return
+ if index == self.devset_index and not self._showing_trace: return
self.devset_index = index
self._showing_trace = False
- self.trace_button["state"] = "normal"
- self.devset_button["state"] = "disabled"
+ self.trace_button['state'] = 'normal'
+ self.devset_button['state'] = 'disabled'
# Clear the text box.
- self.devsetbox["state"] = "normal"
- self.devsetbox["wrap"] = "word"
- self.devsetbox.delete("1.0", "end")
- self.devsetlabel["text"] = "Development Set (%d/%d)" % (
- (self.devset_index + 1, self._devset_size.get())
- )
+ self.devsetbox['state'] = 'normal'
+ self.devsetbox['wrap'] = 'word'
+ self.devsetbox.delete('1.0', 'end')
+ self.devsetlabel['text']='Development Set (%d/%d)' % (
+ (self.devset_index+1, self._devset_size.get()))
# Add the sentences
- sample = self.devset[self.devset_index : self.devset_index + 1]
+ sample = self.devset[self.devset_index:self.devset_index+1]
self.charnum = {}
- self.linenum = {0: 1}
+ self.linenum = {0:1}
for sentnum, sent in enumerate(sample):
- linestr = ""
+ linestr = ''
for wordnum, (word, pos) in enumerate(sent.leaves()):
self.charnum[sentnum, wordnum] = len(linestr)
- linestr += "%s/%s " % (word, pos)
- self.charnum[sentnum, wordnum + 1] = len(linestr)
- self.devsetbox.insert("end", linestr[:-1] + "\n\n")
+ linestr += '%s/%s ' % (word, pos)
+ self.charnum[sentnum, wordnum+1] = len(linestr)
+ self.devsetbox.insert('end', linestr[:-1]+'\n\n')
# Highlight chunks in the dev set
if self.chunker is not None:
self._highlight_devset()
- self.devsetbox["state"] = "disabled"
+ self.devsetbox['state'] = 'disabled'
# Update the scrollbar
- first = self.devset_index / self._devset_size.get()
+ first = self.devset_index/self._devset_size.get()
last = (self.devset_index + 2) / self._devset_size.get()
self.devset_scroll.set(first, last)
for child in tree:
if isinstance(child, Tree):
if child.label() == self._chunk_label:
- chunks.add((wordnum, wordnum + len(child)))
+ chunks.add( (wordnum, wordnum+len(child)) )
wordnum += len(child)
else:
wordnum += 1
return chunks
def _syntax_highlight_grammar(self, grammar):
- if self.top is None:
- return
- self.grammarbox.tag_remove("comment", "1.0", "end")
- self.grammarbox.tag_remove("angle", "1.0", "end")
- self.grammarbox.tag_remove("brace", "1.0", "end")
- self.grammarbox.tag_add("hangindent", "1.0", "end")
- for lineno, line in enumerate(grammar.split("\n")):
- if not line.strip():
- continue
- m = re.match(r"(\\.|[^#])*(#.*)?", line)
+ if self.top is None: return
+ self.grammarbox.tag_remove('comment', '1.0', 'end')
+ self.grammarbox.tag_remove('angle', '1.0', 'end')
+ self.grammarbox.tag_remove('brace', '1.0', 'end')
+ self.grammarbox.tag_add('hangindent', '1.0', 'end')
+ for lineno, line in enumerate(grammar.split('\n')):
+ if not line.strip(): continue
+ m = re.match(r'(\\.|[^#])*(#.*)?', line)
comment_start = None
if m.group(2):
comment_start = m.start(2)
- s = "%d.%d" % (lineno + 1, m.start(2))
- e = "%d.%d" % (lineno + 1, m.end(2))
- self.grammarbox.tag_add("comment", s, e)
- for m in re.finditer("[<>{}]", line):
+ s = '%d.%d' % (lineno+1, m.start(2))
+ e = '%d.%d' % (lineno+1, m.end(2))
+ self.grammarbox.tag_add('comment', s, e)
+ for m in re.finditer('[<>{}]', line):
if comment_start is not None and m.start() >= comment_start:
break
- s = "%d.%d" % (lineno + 1, m.start())
- e = "%d.%d" % (lineno + 1, m.end())
- if m.group() in "<>":
- self.grammarbox.tag_add("angle", s, e)
+ s = '%d.%d' % (lineno+1, m.start())
+ e = '%d.%d' % (lineno+1, m.end())
+ if m.group() in '<>':
+ self.grammarbox.tag_add('angle', s, e)
else:
- self.grammarbox.tag_add("brace", s, e)
+ self.grammarbox.tag_add('brace', s, e)
+
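# Quick standalone check of the comment-matching regex used here: the
# (\\.|[^#])* prefix consumes backslash-escaped characters as ordinary
# content, so an escaped '#' inside a rule does not start a comment.
import re
m = re.match(r'(\\.|[^#])*(#.*)?', r'{<NN.*>}  \# escaped  # real comment')
assert m.group(2) == '# real comment'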
def _grammarcheck(self, grammar):
- if self.top is None:
- return
- self.grammarbox.tag_remove("error", "1.0", "end")
+ if self.top is None: return
+ self.grammarbox.tag_remove('error', '1.0', 'end')
self._grammarcheck_errs = []
- for lineno, line in enumerate(grammar.split("\n")):
- line = re.sub(r"((\\.|[^#])*)(#.*)?", r"\1", line)
+ for lineno, line in enumerate(grammar.split('\n')):
+ line = re.sub(r'((\\.|[^#])*)(#.*)?', r'\1', line)
line = line.strip()
if line:
try:
RegexpChunkRule.fromstring(line)
except ValueError as e:
- self.grammarbox.tag_add(
- "error", "%s.0" % (lineno + 1), "%s.0 lineend" % (lineno + 1)
- )
- self.status["text"] = ""
+ self.grammarbox.tag_add('error', '%s.0' % (lineno+1),
+ '%s.0 lineend' % (lineno+1))
+ self.status['text'] = ''
def update(self, *event):
# Record when update was called (for grammarcheck)
self._last_keypress = time.time()
# Read the grammar from the Text box.
- self.grammar = grammar = self.grammarbox.get("1.0", "end")
+ self.grammar = grammar = self.grammarbox.get('1.0', 'end')
# If the grammar hasn't changed, do nothing:
normalized_grammar = self.normalize_grammar(grammar)
# If the grammar has changed, and we're looking at history,
# then stop looking at history.
- if self._history_index < len(self._history) - 1:
- self.grammarlabel["text"] = "Grammar:"
+ if self._history_index < len(self._history)-1:
+ self.grammarlabel['text'] = 'Grammar:'
self._syntax_highlight_grammar(grammar)
try:
# Note: the normalized grammar has no blank lines.
if normalized_grammar:
- rules = [
- RegexpChunkRule.fromstring(line)
- for line in normalized_grammar.split("\n")
- ]
+ rules = [RegexpChunkRule.fromstring(line)
+ for line in normalized_grammar.split('\n')]
else:
rules = []
except ValueError as e:
return
self.chunker = RegexpChunkParser(rules)
- self.grammarbox.tag_remove("error", "1.0", "end")
+ self.grammarbox.tag_remove('error', '1.0', 'end')
self.grammar_changed = time.time()
# Display the results
if self._showing_trace:
def _highlight_devset(self, sample=None):
if sample is None:
- sample = self.devset[self.devset_index : self.devset_index + 1]
+ sample = self.devset[self.devset_index:self.devset_index+1]
- self.devsetbox.tag_remove("true-pos", "1.0", "end")
- self.devsetbox.tag_remove("false-neg", "1.0", "end")
- self.devsetbox.tag_remove("false-pos", "1.0", "end")
+ self.devsetbox.tag_remove('true-pos', '1.0', 'end')
+ self.devsetbox.tag_remove('false-neg', '1.0', 'end')
+ self.devsetbox.tag_remove('false-pos', '1.0', 'end')
# Run the grammar on the test cases.
for sentnum, gold_tree in enumerate(sample):
test_chunks = self._chunks(test_tree)
# Compare them.
for chunk in gold_chunks.intersection(test_chunks):
- self._color_chunk(sentnum, chunk, "true-pos")
+ self._color_chunk(sentnum, chunk, 'true-pos')
for chunk in gold_chunks - test_chunks:
- self._color_chunk(sentnum, chunk, "false-neg")
+ self._color_chunk(sentnum, chunk, 'false-neg')
for chunk in test_chunks - gold_chunks:
- self._color_chunk(sentnum, chunk, "false-pos")
+ self._color_chunk(sentnum, chunk, 'false-pos')
def _chunkparse(self, words):
try:
# There's an error somewhere in the grammar, but we're not sure
# exactly where, so just mark the whole grammar as bad.
# E.g., this is caused by: "({<NN>})"
- self.grammarbox.tag_add("error", "1.0", "end")
+ self.grammarbox.tag_add('error', '1.0', 'end')
# Treat it as tagging nothing:
return words
def _color_chunk(self, sentnum, chunk, tag):
start, end = chunk
- self.devsetbox.tag_add(
- tag,
- "%s.%s" % (self.linenum[sentnum], self.charnum[sentnum, start]),
- "%s.%s" % (self.linenum[sentnum], self.charnum[sentnum, end] - 1),
- )
+ self.devsetbox.tag_add(tag,
+ '%s.%s' % (self.linenum[sentnum], self.charnum[sentnum, start]),
+ '%s.%s' % (self.linenum[sentnum], self.charnum[sentnum, end]-1))
def reset(self):
# Clear various variables
self._history = []
self._history_index = 0
# Update the on-screen display.
- self.grammarbox.delete("1.0", "end")
+ self.grammarbox.delete('1.0', 'end')
self.show_devset(0)
self.update()
- # self._eval_plot()
+ #self._eval_plot()
SAVE_GRAMMAR_TEMPLATE = (
- "# Regexp Chunk Parsing Grammar\n"
- "# Saved %(date)s\n"
- "#\n"
- "# Development set: %(devset)s\n"
- "# Precision: %(precision)s\n"
- "# Recall: %(recall)s\n"
- "# F-score: %(fscore)s\n\n"
- "%(grammar)s\n"
- )
+ '# Regexp Chunk Parsing Grammar\n'
+ '# Saved %(date)s\n'
+ '#\n'
+ '# Development set: %(devset)s\n'
+ '# Precision: %(precision)s\n'
+ '# Recall: %(recall)s\n'
+ '# F-score: %(fscore)s\n\n'
+ '%(grammar)s\n')
def save_grammar(self, filename=None):
if not filename:
- ftypes = [("Chunk Gramamr", ".chunk"), ("All files", "*")]
- filename = asksaveasfilename(filetypes=ftypes, defaultextension=".chunk")
- if not filename:
- return
- if self._history and self.normalized_grammar == self.normalize_grammar(
- self._history[-1][0]
- ):
- precision, recall, fscore = [
- "%.2f%%" % (100 * v) for v in self._history[-1][1:]
- ]
+ ftypes = [('Chunk Grammar', '.chunk'),
+ ('All files', '*')]
+ filename = asksaveasfilename(filetypes=ftypes,
+ defaultextension='.chunk')
+ if not filename: return
+ if (self._history and self.normalized_grammar ==
+ self.normalize_grammar(self._history[-1][0])):
+ precision, recall, fscore = ['%.2f%%' % (100*v) for v in
+ self._history[-1][1:]]
elif self.chunker is None:
- precision = recall = fscore = "Grammar not well formed"
+ precision = recall = fscore = 'Grammar not well formed'
else:
- precision = recall = fscore = "Not finished evaluation yet"
-
- with open(filename, "w") as outfile:
- outfile.write(
- self.SAVE_GRAMMAR_TEMPLATE
- % dict(
- date=time.ctime(),
- devset=self.devset_name,
- precision=precision,
- recall=recall,
- fscore=fscore,
- grammar=self.grammar.strip(),
- )
- )
+ precision = recall = fscore = 'Evaluation not finished yet'
+
+ with open(filename, 'w') as outfile:
+ outfile.write(self.SAVE_GRAMMAR_TEMPLATE % dict(
+ date=time.ctime(), devset=self.devset_name,
+ precision=precision, recall=recall, fscore=fscore,
+ grammar=self.grammar.strip()))
def load_grammar(self, filename=None):
if not filename:
- ftypes = [("Chunk Gramamr", ".chunk"), ("All files", "*")]
- filename = askopenfilename(filetypes=ftypes, defaultextension=".chunk")
- if not filename:
- return
- self.grammarbox.delete("1.0", "end")
+ ftypes = [('Chunk Grammar', '.chunk'),
+ ('All files', '*')]
+ filename = askopenfilename(filetypes=ftypes,
+ defaultextension='.chunk')
+ if not filename: return
+ self.grammarbox.delete('1.0', 'end')
self.update()
- with open(filename, "r") as infile:
+ with open(filename, 'r') as infile:
grammar = infile.read()
- grammar = re.sub(
- "^\# Regexp Chunk Parsing Grammar[\s\S]*" "F-score:.*\n", "", grammar
- ).lstrip()
- self.grammarbox.insert("1.0", grammar)
+ grammar = re.sub('^\# Regexp Chunk Parsing Grammar[\s\S]*'
+ 'F-score:.*\n', '', grammar).lstrip()
+ self.grammarbox.insert('1.0', grammar)
self.update()
def save_history(self, filename=None):
if not filename:
- ftypes = [("Chunk Gramamr History", ".txt"), ("All files", "*")]
- filename = asksaveasfilename(filetypes=ftypes, defaultextension=".txt")
- if not filename:
- return
-
- with open(filename, "w") as outfile:
- outfile.write("# Regexp Chunk Parsing Grammar History\n")
- outfile.write("# Saved %s\n" % time.ctime())
- outfile.write("# Development set: %s\n" % self.devset_name)
+ ftypes = [('Chunk Grammar History', '.txt'),
+ ('All files', '*')]
+ filename = asksaveasfilename(filetypes=ftypes,
+ defaultextension='.txt')
+ if not filename: return
+
+ with open(filename, 'w') as outfile:
+ outfile.write('# Regexp Chunk Parsing Grammar History\n')
+ outfile.write('# Saved %s\n' % time.ctime())
+ outfile.write('# Development set: %s\n' % self.devset_name)
for i, (g, p, r, f) in enumerate(self._history):
- hdr = (
- "Grammar %d/%d (precision=%.2f%%, recall=%.2f%%, "
- "fscore=%.2f%%)"
- % (i + 1, len(self._history), p * 100, r * 100, f * 100)
- )
- outfile.write("\n%s\n" % hdr)
- outfile.write("".join(" %s\n" % line for line in g.strip().split()))
-
- if not (
- self._history
- and self.normalized_grammar
- == self.normalize_grammar(self._history[-1][0])
- ):
+ hdr = ('Grammar %d/%d (precision=%.2f%%, recall=%.2f%%, '
+ 'fscore=%.2f%%)' % (i+1, len(self._history),
+ p*100, r*100, f*100))
+ outfile.write('\n%s\n' % hdr)
+ outfile.write(''.join(' %s\n' % line for line in g.strip().split()))
+
+ if not (self._history and self.normalized_grammar ==
+ self.normalize_grammar(self._history[-1][0])):
if self.chunker is None:
- outfile.write("\nCurrent Grammar (not well-formed)\n")
+ outfile.write('\nCurrent Grammar (not well-formed)\n')
else:
- outfile.write("\nCurrent Grammar (not evaluated)\n")
- outfile.write(
- "".join(" %s\n" % line for line in self.grammar.strip().split())
- )
+ outfile.write('\nCurrent Grammar (not evaluated)\n')
+ outfile.write(''.join(' %s\n' % line for line
+ in self.grammar.strip().split()))
def about(self, *e):
- ABOUT = "NLTK RegExp Chunk Parser Application\n" + "Written by Edward Loper"
- TITLE = "About: Regular Expression Chunk Parser Application"
+ ABOUT = ("NLTK RegExp Chunk Parser Application\n"+
+ "Written by Edward Loper")
+ TITLE = 'About: Regular Expression Chunk Parser Application'
try:
- from tkinter.messagebox import Message
-
+ from six.moves.tkinter_messagebox import Message
Message(message=ABOUT, title=TITLE).show()
except:
ShowText(self.top, TITLE, ABOUT)
def set_devset_size(self, size=None):
- if size is not None:
- self._devset_size.set(size)
+ if size is not None: self._devset_size.set(size)
self._devset_size.set(min(len(self.devset), self._devset_size.get()))
self.show_devset(1)
self.show_devset(0)
# what about history? Evaluated at diff dev set sizes!
def resize(self, size=None):
- if size is not None:
- self._size.set(size)
+ if size is not None: self._size.set(size)
size = self._size.get()
self._font.configure(size=-(abs(size)))
- self._smallfont.configure(size=min(-10, -(abs(size)) * 14 // 20))
+ self._smallfont.configure(size=min(-10, -(abs(size))*14//20))
def mainloop(self, *args, **kwargs):
"""
from a script); otherwise, the demo will close as soon as
the script completes.
"""
- if in_idle():
- return
+ if in_idle(): return
self.top.mainloop(*args, **kwargs)
-
def app():
RegexpChunkApp().mainloop()
-
-if __name__ == "__main__":
+if __name__ == '__main__':
app()
-__all__ = ["app"]
+__all__ = ['app']
# Natural Language Toolkit: Collocations Application
# Much of the GUI code is imported from concordance.py; we intend to merge these tools together.
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Sumukh Ghodke <sghodke@csse.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
#
+from __future__ import division
+
import threading
-import queue as q
-from tkinter.font import Font
-from tkinter import (
- Button,
- END,
- Frame,
- IntVar,
- LEFT,
- Label,
- Menu,
- OptionMenu,
- SUNKEN,
- Scrollbar,
- StringVar,
- Text,
- Tk,
-)
-
-from nltk.corpus import (
- cess_cat,
- brown,
- nps_chat,
- treebank,
- sinica_treebank,
- alpino,
- indian,
- floresta,
- mac_morpho,
- machado,
- cess_esp,
-)
+from six.moves import queue as q
+from six.moves.tkinter_font import Font
+from six.moves.tkinter import (Button, END, Frame, IntVar, LEFT, Label, Menu,
+ OptionMenu, SUNKEN, Scrollbar, StringVar,
+ Text, Tk)
+
+from nltk.corpus import (cess_cat, brown, nps_chat, treebank, sinica_treebank, alpino,
+ indian, floresta, mac_morpho, machado, cess_esp)
from nltk.util import in_idle
from nltk.probability import FreqDist
-CORPUS_LOADED_EVENT = "<<CL_EVENT>>"
-ERROR_LOADING_CORPUS_EVENT = "<<ELC_EVENT>>"
+CORPUS_LOADED_EVENT = '<<CL_EVENT>>'
+ERROR_LOADING_CORPUS_EVENT = '<<ELC_EVENT>>'
POLL_INTERVAL = 100
-_DEFAULT = "English: Brown Corpus (Humor)"
+_DEFAULT = 'English: Brown Corpus (Humor)'
_CORPORA = {
- "Catalan: CESS-CAT Corpus": lambda: cess_cat.words(),
- "English: Brown Corpus": lambda: brown.words(),
- "English: Brown Corpus (Press)": lambda: brown.words(
- categories=["news", "editorial", "reviews"]
- ),
- "English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"),
- "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"),
- "English: Brown Corpus (Science Fiction)": lambda: brown.words(
- categories="science_fiction"
- ),
- "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"),
- "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"),
- "English: NPS Chat Corpus": lambda: nps_chat.words(),
- "English: Wall Street Journal Corpus": lambda: treebank.words(),
- "Chinese: Sinica Corpus": lambda: sinica_treebank.words(),
- "Dutch: Alpino Corpus": lambda: alpino.words(),
- "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"),
- "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(),
- "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(),
- "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(),
- "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(),
-}
-
+ 'Catalan: CESS-CAT Corpus':
+ lambda: cess_cat.words(),
+ 'English: Brown Corpus':
+ lambda: brown.words(),
+ 'English: Brown Corpus (Press)':
+ lambda: brown.words(categories=['news', 'editorial', 'reviews']),
+ 'English: Brown Corpus (Religion)':
+ lambda: brown.words(categories='religion'),
+ 'English: Brown Corpus (Learned)':
+ lambda: brown.words(categories='learned'),
+ 'English: Brown Corpus (Science Fiction)':
+ lambda: brown.words(categories='science_fiction'),
+ 'English: Brown Corpus (Romance)':
+ lambda: brown.words(categories='romance'),
+ 'English: Brown Corpus (Humor)':
+ lambda: brown.words(categories='humor'),
+ 'English: NPS Chat Corpus':
+ lambda: nps_chat.words(),
+ 'English: Wall Street Journal Corpus':
+ lambda: treebank.words(),
+ 'Chinese: Sinica Corpus':
+ lambda: sinica_treebank.words(),
+ 'Dutch: Alpino Corpus':
+ lambda: alpino.words(),
+ 'Hindi: Indian Languages Corpus':
+ lambda: indian.words(files='hindi.pos'),
+ 'Portuguese: Floresta Corpus (Portugal)':
+ lambda: floresta.words(),
+ 'Portuguese: MAC-MORPHO Corpus (Brazil)':
+ lambda: mac_morpho.words(),
+ 'Portuguese: Machado Corpus (Brazil)':
+ lambda: machado.words(),
+ 'Spanish: CESS-ESP Corpus':
+ lambda: cess_esp.words()
+ }
class CollocationsView:
- _BACKGROUND_COLOUR = "#FFF" # white
+ _BACKGROUND_COLOUR='#FFF' #white
def __init__(self):
self.queue = q.Queue()
self.after = self.top.after(POLL_INTERVAL, self._poll)
def _init_top(self, top):
- top.geometry("550x650+50+50")
- top.title("NLTK Collocations List")
- top.bind("<Control-q>", self.destroy)
- top.protocol("WM_DELETE_WINDOW", self.destroy)
- top.minsize(550, 650)
+ top.geometry('550x650+50+50')
+ top.title('NLTK Collocations List')
+ top.bind('<Control-q>', self.destroy)
+ top.protocol('WM_DELETE_WINDOW', self.destroy)
+ top.minsize(550,650)
def _init_widgets(self, parent):
- self.main_frame = Frame(
- parent, dict(background=self._BACKGROUND_COLOUR, padx=1, pady=1, border=1)
- )
+ self.main_frame = Frame(parent, dict(background=self._BACKGROUND_COLOUR, padx=1, pady=1, border=1))
self._init_corpus_select(self.main_frame)
self._init_results_box(self.main_frame)
self._init_paging(self.main_frame)
self._init_status(self.main_frame)
- self.main_frame.pack(fill="both", expand=True)
+ self.main_frame.pack(fill='both', expand=True)
def _init_corpus_select(self, parent):
innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
self.var = StringVar(innerframe)
self.var.set(self.model.DEFAULT_CORPUS)
- Label(
- innerframe,
- justify=LEFT,
- text=" Corpus: ",
- background=self._BACKGROUND_COLOUR,
- padx=2,
- pady=1,
- border=0,
- ).pack(side="left")
-
- other_corpora = list(self.model.CORPORA.keys()).remove(
- self.model.DEFAULT_CORPUS
- )
- om = OptionMenu(
- innerframe,
- self.var,
- self.model.DEFAULT_CORPUS,
- command=self.corpus_selected,
- *self.model.non_default_corpora()
- )
- om["borderwidth"] = 0
- om["highlightthickness"] = 1
- om.pack(side="left")
- innerframe.pack(side="top", fill="x", anchor="n")
+ Label(innerframe, justify=LEFT, text=' Corpus: ', background=self._BACKGROUND_COLOUR, padx = 2, pady = 1, border = 0).pack(side='left')
+
+ other_corpora = list(self.model.CORPORA.keys()).remove(self.model.DEFAULT_CORPUS)
+ om = OptionMenu(innerframe, self.var, self.model.DEFAULT_CORPUS, command=self.corpus_selected, *self.model.non_default_corpora())
+ om['borderwidth'] = 0
+ om['highlightthickness'] = 1
+ om.pack(side='left')
+ innerframe.pack(side='top', fill='x', anchor='n')
def _init_status(self, parent):
- self.status = Label(
- parent,
- justify=LEFT,
- relief=SUNKEN,
- background=self._BACKGROUND_COLOUR,
- border=0,
- padx=1,
- pady=0,
- )
- self.status.pack(side="top", anchor="sw")
+ self.status = Label(parent, justify=LEFT, relief=SUNKEN, background=self._BACKGROUND_COLOUR, border=0, padx = 1, pady = 0)
+ self.status.pack(side='top', anchor='sw')
def _init_menubar(self):
self._result_size = IntVar(self.top)
menubar = Menu(self.top)
filemenu = Menu(menubar, tearoff=0, borderwidth=0)
- filemenu.add_command(
- label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-q"
- )
- menubar.add_cascade(label="File", underline=0, menu=filemenu)
+ filemenu.add_command(label='Exit', underline=1,
+ command=self.destroy, accelerator='Ctrl-q')
+ menubar.add_cascade(label='File', underline=0, menu=filemenu)
editmenu = Menu(menubar, tearoff=0)
rescntmenu = Menu(editmenu, tearoff=0)
- rescntmenu.add_radiobutton(
- label="20",
- variable=self._result_size,
- underline=0,
- value=20,
- command=self.set_result_size,
- )
- rescntmenu.add_radiobutton(
- label="50",
- variable=self._result_size,
- underline=0,
- value=50,
- command=self.set_result_size,
- )
- rescntmenu.add_radiobutton(
- label="100",
- variable=self._result_size,
- underline=0,
- value=100,
- command=self.set_result_size,
- )
+ rescntmenu.add_radiobutton(label='20', variable=self._result_size,
+ underline=0, value=20, command=self.set_result_size)
+ rescntmenu.add_radiobutton(label='50', variable=self._result_size,
+ underline=0, value=50, command=self.set_result_size)
+ rescntmenu.add_radiobutton(label='100', variable=self._result_size,
+ underline=0, value=100, command=self.set_result_size)
rescntmenu.invoke(1)
- editmenu.add_cascade(label="Result Count", underline=0, menu=rescntmenu)
+ editmenu.add_cascade(label='Result Count', underline=0, menu=rescntmenu)
- menubar.add_cascade(label="Edit", underline=0, menu=editmenu)
+ menubar.add_cascade(label='Edit', underline=0, menu=editmenu)
self.top.config(menu=menubar)
def set_result_size(self, **kwargs):
i1 = Frame(innerframe)
i2 = Frame(innerframe)
vscrollbar = Scrollbar(i1, borderwidth=1)
- hscrollbar = Scrollbar(i2, borderwidth=1, orient="horiz")
- self.results_box = Text(
- i1,
- font=Font(family="courier", size="16"),
- state="disabled",
- borderwidth=1,
- yscrollcommand=vscrollbar.set,
- xscrollcommand=hscrollbar.set,
- wrap="none",
- width="40",
- height="20",
- exportselection=1,
- )
- self.results_box.pack(side="left", fill="both", expand=True)
- vscrollbar.pack(side="left", fill="y", anchor="e")
+ hscrollbar = Scrollbar(i2, borderwidth=1, orient='horiz')
+ self.results_box = Text(i1,
+ font=Font(family='courier', size='16'),
+ state='disabled', borderwidth=1,
+ yscrollcommand=vscrollbar.set,
+ xscrollcommand=hscrollbar.set, wrap='none', width='40', height = '20', exportselection=1)
+ self.results_box.pack(side='left', fill='both', expand=True)
+ vscrollbar.pack(side='left', fill='y', anchor='e')
vscrollbar.config(command=self.results_box.yview)
- hscrollbar.pack(side="left", fill="x", expand=True, anchor="w")
+ hscrollbar.pack(side='left', fill='x', expand=True, anchor='w')
hscrollbar.config(command=self.results_box.xview)
- # there is no other way of avoiding the overlap of scrollbars while using pack layout manager!!!
- Label(i2, text=" ", background=self._BACKGROUND_COLOUR).pack(
- side="left", anchor="e"
- )
- i1.pack(side="top", fill="both", expand=True, anchor="n")
- i2.pack(side="bottom", fill="x", anchor="s")
- innerframe.pack(side="top", fill="both", expand=True)
+ #there is no other way of avoiding the overlap of scrollbars while using pack layout manager!!!
+ Label(i2, text=' ', background=self._BACKGROUND_COLOUR).pack(side='left', anchor='e')
+ i1.pack(side='top', fill='both', expand=True, anchor='n')
+ i2.pack(side='bottom', fill='x', anchor='s')
+ innerframe.pack(side='top', fill='both', expand=True)
def _init_paging(self, parent):
innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
- self.prev = prev = Button(
- innerframe,
- text="Previous",
- command=self.previous,
- width="10",
- borderwidth=1,
- highlightthickness=1,
- state="disabled",
- )
- prev.pack(side="left", anchor="center")
- self.next = next = Button(
- innerframe,
- text="Next",
- command=self.__next__,
- width="10",
- borderwidth=1,
- highlightthickness=1,
- state="disabled",
- )
- next.pack(side="right", anchor="center")
- innerframe.pack(side="top", fill="y")
+ self.prev = prev = Button(innerframe, text='Previous', command=self.previous, width='10', borderwidth=1, highlightthickness=1, state='disabled')
+ prev.pack(side='left', anchor='center')
+ self.next = next = Button(innerframe, text='Next', command=self.__next__, width='10', borderwidth=1, highlightthickness=1, state='disabled')
+ next.pack(side='right', anchor='center')
+ innerframe.pack(side='top', fill='y')
self.reset_current_page()
def reset_current_page(self):
self.after = self.top.after(POLL_INTERVAL, self._poll)
def handle_error_loading_corpus(self, event):
- self.status["text"] = "Error in loading " + self.var.get()
+ self.status['text'] = 'Error in loading ' + self.var.get()
self.unfreeze_editable()
self.clear_results_box()
self.freeze_editable()
self.reset_current_page()
def handle_corpus_loaded(self, event):
- self.status["text"] = self.var.get() + " is loaded"
+ self.status['text'] = self.var.get() + ' is loaded'
self.unfreeze_editable()
self.clear_results_box()
self.reset_current_page()
- # self.next()
+ #self.next()
collocations = self.model.next(self.current_page + 1)
self.write_results(collocations)
self.current_page += 1
def previous(self):
self.freeze_editable()
collocations = self.model.prev(self.current_page - 1)
- self.current_page = self.current_page - 1
+ self.current_page= self.current_page - 1
self.clear_results_box()
self.write_results(collocations)
self.unfreeze_editable()
def load_corpus(self, selection):
if self.model.selected_corpus != selection:
- self.status["text"] = "Loading " + selection + "..."
+ self.status['text'] = 'Loading ' + selection + '...'
self.freeze_editable()
self.model.load_corpus(selection)
def freeze_editable(self):
- self.prev["state"] = "disabled"
- self.next["state"] = "disabled"
+ self.prev['state'] = 'disabled'
+ self.next['state'] = 'disabled'
def clear_results_box(self):
- self.results_box["state"] = "normal"
+ self.results_box['state'] = 'normal'
self.results_box.delete("1.0", END)
- self.results_box["state"] = "disabled"
+ self.results_box['state'] = 'disabled'
def fire_event(self, event):
- # Firing an event so that rendering of widgets happens in the mainloop thread
- self.top.event_generate(event, when="tail")
+ #Firing an event so that rendering of widgets happens in the mainloop thread
+ self.top.event_generate(event, when='tail')
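# A standalone sketch (hypothetical names) of the threading pattern used in
# this file: the worker thread never touches Tk directly; it only puts a
# token on a Queue. The mainloop polls the queue with after() and turns
# tokens into virtual events, so bound handlers always run in the mainloop
# thread.
import threading
from six.moves import queue as q
from six.moves.tkinter import Tk, Label
DONE_EVENT = '<<WORK_DONE>>'
POLL_MS = 50
root = Tk()
label = Label(root, text='working...')
label.pack()
tokens = q.Queue()
results = {}
def worker():
    results['value'] = sum(range(10 ** 6))  # slow, non-GUI work
    tokens.put(DONE_EVENT)
def poll():
    try:
        event = tokens.get(block=False)
    except q.Empty:
        pass
    else:
        root.event_generate(event, when='tail')
    root.after(POLL_MS, poll)
root.bind(DONE_EVENT, lambda e: label.config(text='done: %d' % results['value']))
threading.Thread(target=worker).start()
root.after(POLL_MS, poll)
root.mainloop()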
def destroy(self, *e):
- if self.top is None:
- return
+ if self.top is None: return
self.top.after_cancel(self.after)
self.top.destroy()
self.top = None
def mainloop(self, *args, **kwargs):
- if in_idle():
- return
+ if in_idle(): return
self.top.mainloop(*args, **kwargs)
def unfreeze_editable(self):
def set_paging_button_states(self):
if self.current_page == -1 or self.current_page == 0:
- self.prev["state"] = "disabled"
+ self.prev['state'] = 'disabled'
else:
- self.prev["state"] = "normal"
+ self.prev['state'] = 'normal'
if self.model.is_last_page(self.current_page):
- self.next["state"] = "disabled"
+ self.next['state'] = 'disabled'
else:
- self.next["state"] = "normal"
+ self.next['state'] = 'normal'
def write_results(self, results):
- self.results_box["state"] = "normal"
+ self.results_box['state'] = 'normal'
row = 1
for each in results:
- self.results_box.insert(str(row) + ".0", each[0] + " " + each[1] + "\n")
+ self.results_box.insert(str(row) + '.0', each[0] + " " + each[1] + "\n")
row += 1
- self.results_box["state"] = "disabled"
-
+ self.results_box['state'] = 'disabled'
class CollocationsModel:
def __init__(self, queue):
def is_last_page(self, number):
if number < len(self.result_pages):
return False
- return self.results_returned + (
- number - len(self.result_pages)
- ) * self.result_count >= len(self.collocations)
+ return self.results_returned + (number - len(self.result_pages)) * self.result_count >= len(self.collocations)
def next(self, page):
if (len(self.result_pages) - 1) < page:
for i in range(page - (len(self.result_pages) - 1)):
- self.result_pages.append(
- self.collocations[
- self.results_returned : self.results_returned
- + self.result_count
- ]
- )
+ self.result_pages.append(self.collocations[self.results_returned:self.results_returned+self.result_count])
self.results_returned += self.result_count
return self.result_pages[page]
try:
words = self.model.CORPORA[self.name]()
from operator import itemgetter
-
text = [w for w in words if len(w) > 2]
- fd = FreqDist(tuple(text[i : i + 2]) for i in range(len(text) - 1))
+ fd = FreqDist(tuple(text[i:i+2]) for i in range(len(text)-1))
vocab = FreqDist(text)
- scored = [
- ((w1, w2), fd[(w1, w2)] ** 3 / (vocab[w1] * vocab[w2]))
- for w1, w2 in fd
- ]
+ scored = [((w1,w2), fd[(w1,w2)] ** 3 / (vocab[w1] * vocab[w2])) for w1, w2 in fd]
scored.sort(key=itemgetter(1), reverse=True)
self.model.collocations = list(map(itemgetter(0), scored))
self.model.queue.put(CORPUS_LOADED_EVENT)
print(e)
self.model.queue.put(ERROR_LOADING_CORPUS_EVENT)
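# The score computed in the loader above is count(w1, w2)**3 divided by
# count(w1) * count(w2): a mutual-information-style statistic in which
# cubing the bigram count damps one-off pairs. A tiny standalone check of
# the same formula on a hypothetical toy corpus:
from __future__ import division
from nltk.probability import FreqDist
text = 'new york is new and york is new'.split()
fd = FreqDist(tuple(text[i:i + 2]) for i in range(len(text) - 1))
vocab = FreqDist(text)
def score(w1, w2):
    return fd[(w1, w2)] ** 3 / (vocab[w1] * vocab[w2])
# fd[('york', 'is')] == 2 and vocab['york'] == vocab['is'] == 2, so
# score('york', 'is') == 2**3 / (2 * 2) == 2.0 -- the top-ranked pair --
# while the one-off ('new', 'york') scores only 1 / (3 * 2).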
-
-# def collocations():
+#def collocations():
# colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations[:num]]
-
def app():
c = CollocationsView()
c.mainloop()
-
-if __name__ == "__main__":
+if __name__ == '__main__':
app()
-__all__ = ["app"]
+__all__ = ['app']
# Natural Language Toolkit: Concordance Application
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Sumukh Ghodke <sghodke@csse.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+
+import nltk.compat
import re
import threading
-import queue as q
-from tkinter.font import Font
-from tkinter import (
- Tk,
- Button,
- END,
- Entry,
- Frame,
- IntVar,
- LEFT,
- Label,
- Menu,
- OptionMenu,
- SUNKEN,
- Scrollbar,
- StringVar,
- Text,
-)
-
-from nltk.corpus import (
- cess_cat,
- brown,
- nps_chat,
- treebank,
- sinica_treebank,
- alpino,
- indian,
- floresta,
- mac_morpho,
- cess_esp,
-)
+from six.moves import queue as q
+from six.moves.tkinter_font import Font
+from six.moves.tkinter import (Tk, Button, END, Entry, Frame, IntVar, LEFT,
+ Label, Menu, OptionMenu, SUNKEN, Scrollbar,
+ StringVar, Text)
+
+from nltk.corpus import (cess_cat, brown, nps_chat, treebank, sinica_treebank,
+ alpino, indian, floresta, mac_morpho, cess_esp)
from nltk.util import in_idle
from nltk.draw.util import ShowText
-WORD_OR_TAG = "[^/ ]+"
-BOUNDARY = r"\b"
+WORD_OR_TAG = '[^/ ]+'
+BOUNDARY = r'\b'
-CORPUS_LOADED_EVENT = "<<CL_EVENT>>"
-SEARCH_TERMINATED_EVENT = "<<ST_EVENT>>"
-SEARCH_ERROR_EVENT = "<<SE_EVENT>>"
-ERROR_LOADING_CORPUS_EVENT = "<<ELC_EVENT>>"
+CORPUS_LOADED_EVENT = '<<CL_EVENT>>'
+SEARCH_TERMINATED_EVENT = '<<ST_EVENT>>'
+SEARCH_ERROR_EVENT = '<<SE_EVENT>>'
+ERROR_LOADING_CORPUS_EVENT = '<<ELC_EVENT>>'
POLL_INTERVAL = 50
# NB All corpora must be specified in a lambda expression so as not to be
# loaded when the module is imported.
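# A minimal sketch of that lazy pattern (one hypothetical entry): the table
# maps names to zero-argument callables, so importing the module only builds
# the dict; a corpus is read the first time its entry is called.
from nltk.corpus import brown
LAZY_CORPORA = {
    'English: Brown Corpus (simplified)':
        lambda: brown.tagged_sents(tagset='universal'),
}
# nothing has been loaded yet; this call is what actually reads the corpus:
#     sents = LAZY_CORPORA['English: Brown Corpus (simplified)']()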
-_DEFAULT = "English: Brown Corpus (Humor, simplified)"
+_DEFAULT = 'English: Brown Corpus (Humor, simplified)'
_CORPORA = {
- "Catalan: CESS-CAT Corpus (simplified)": lambda: cess_cat.tagged_sents(
- tagset="universal"
- ),
- "English: Brown Corpus": lambda: brown.tagged_sents(),
- "English: Brown Corpus (simplified)": lambda: brown.tagged_sents(
- tagset="universal"
- ),
- "English: Brown Corpus (Press, simplified)": lambda: brown.tagged_sents(
- categories=["news", "editorial", "reviews"], tagset="universal"
- ),
- "English: Brown Corpus (Religion, simplified)": lambda: brown.tagged_sents(
- categories="religion", tagset="universal"
- ),
- "English: Brown Corpus (Learned, simplified)": lambda: brown.tagged_sents(
- categories="learned", tagset="universal"
- ),
- "English: Brown Corpus (Science Fiction, simplified)": lambda: brown.tagged_sents(
- categories="science_fiction", tagset="universal"
- ),
- "English: Brown Corpus (Romance, simplified)": lambda: brown.tagged_sents(
- categories="romance", tagset="universal"
- ),
- "English: Brown Corpus (Humor, simplified)": lambda: brown.tagged_sents(
- categories="humor", tagset="universal"
- ),
- "English: NPS Chat Corpus": lambda: nps_chat.tagged_posts(),
- "English: NPS Chat Corpus (simplified)": lambda: nps_chat.tagged_posts(
- tagset="universal"
- ),
- "English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(),
- "English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents(
- tagset="universal"
- ),
- "Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(),
- "Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents(
- tagset="universal"
- ),
- "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(),
- "Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents(
- tagset="universal"
- ),
- "Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"),
- "Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents(
- files="hindi.pos", tagset="universal"
- ),
- "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(),
- "Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents(
- tagset="universal"
- ),
- "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.tagged_sents(),
- "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)": lambda: mac_morpho.tagged_sents(
- tagset="universal"
- ),
- "Spanish: CESS-ESP Corpus (simplified)": lambda: cess_esp.tagged_sents(
- tagset="universal"
- ),
-}
-
+ 'Catalan: CESS-CAT Corpus (simplified)':
+ lambda: cess_cat.tagged_sents(tagset='universal'),
+ 'English: Brown Corpus':
+ lambda: brown.tagged_sents(),
+ 'English: Brown Corpus (simplified)':
+ lambda: brown.tagged_sents(tagset='universal'),
+ 'English: Brown Corpus (Press, simplified)':
+ lambda: brown.tagged_sents(categories=['news', 'editorial', 'reviews'], tagset='universal'),
+ 'English: Brown Corpus (Religion, simplified)':
+ lambda: brown.tagged_sents(categories='religion', tagset='universal'),
+ 'English: Brown Corpus (Learned, simplified)':
+ lambda: brown.tagged_sents(categories='learned', tagset='universal'),
+ 'English: Brown Corpus (Science Fiction, simplified)':
+ lambda: brown.tagged_sents(categories='science_fiction', tagset='universal'),
+ 'English: Brown Corpus (Romance, simplified)':
+ lambda: brown.tagged_sents(categories='romance', tagset='universal'),
+ 'English: Brown Corpus (Humor, simplified)':
+ lambda: brown.tagged_sents(categories='humor', tagset='universal'),
+ 'English: NPS Chat Corpus':
+ lambda: nps_chat.tagged_posts(),
+ 'English: NPS Chat Corpus (simplified)':
+ lambda: nps_chat.tagged_posts(tagset='universal'),
+ 'English: Wall Street Journal Corpus':
+ lambda: treebank.tagged_sents(),
+ 'English: Wall Street Journal Corpus (simplified)':
+ lambda: treebank.tagged_sents(tagset='universal'),
+ 'Chinese: Sinica Corpus':
+ lambda: sinica_treebank.tagged_sents(),
+ 'Chinese: Sinica Corpus (simplified)':
+ lambda: sinica_treebank.tagged_sents(tagset='universal'),
+ 'Dutch: Alpino Corpus':
+ lambda: alpino.tagged_sents(),
+ 'Dutch: Alpino Corpus (simplified)':
+ lambda: alpino.tagged_sents(tagset='universal'),
+ 'Hindi: Indian Languages Corpus':
+ lambda: indian.tagged_sents(files='hindi.pos'),
+ 'Hindi: Indian Languages Corpus (simplified)':
+ lambda: indian.tagged_sents(files='hindi.pos', tagset='universal'),
+ 'Portuguese: Floresta Corpus (Portugal)':
+ lambda: floresta.tagged_sents(),
+ 'Portuguese: Floresta Corpus (Portugal, simplified)':
+ lambda: floresta.tagged_sents(tagset='universal'),
+ 'Portuguese: MAC-MORPHO Corpus (Brazil)':
+ lambda: mac_morpho.tagged_sents(),
+ 'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)':
+ lambda: mac_morpho.tagged_sents(tagset='universal'),
+ 'Spanish: CESS-ESP Corpus (simplified)':
+ lambda: cess_esp.tagged_sents(tagset='universal'),
+ }
class ConcordanceSearchView(object):
- _BACKGROUND_COLOUR = "#FFF" # white
+ _BACKGROUND_COLOUR='#FFF' #white
- # Colour of highlighted results
- _HIGHLIGHT_WORD_COLOUR = "#F00" # red
- _HIGHLIGHT_WORD_TAG = "HL_WRD_TAG"
+ #Colour of highlighted results
+ _HIGHLIGHT_WORD_COLOUR='#F00' #red
+ _HIGHLIGHT_WORD_TAG='HL_WRD_TAG'
- _HIGHLIGHT_LABEL_COLOUR = "#C0C0C0" # dark grey
- _HIGHLIGHT_LABEL_TAG = "HL_LBL_TAG"
+ _HIGHLIGHT_LABEL_COLOUR='#C0C0C0' # light grey
+ _HIGHLIGHT_LABEL_TAG='HL_LBL_TAG'
- # Percentage of text left of the scrollbar position
- _FRACTION_LEFT_TEXT = 0.30
+
+ #Percentage of text left of the scrollbar position
+ _FRACTION_LEFT_TEXT=0.30
def __init__(self):
self.queue = q.Queue()
self.after = self.top.after(POLL_INTERVAL, self._poll)
def _init_top(self, top):
- top.geometry("950x680+50+50")
- top.title("NLTK Concordance Search")
- top.bind("<Control-q>", self.destroy)
- top.protocol("WM_DELETE_WINDOW", self.destroy)
- top.minsize(950, 680)
+ top.geometry('950x680+50+50')
+ top.title('NLTK Concordance Search')
+ top.bind('<Control-q>', self.destroy)
+ top.protocol('WM_DELETE_WINDOW', self.destroy)
+ top.minsize(950,680)
def _init_widgets(self, parent):
- self.main_frame = Frame(
- parent, dict(background=self._BACKGROUND_COLOUR, padx=1, pady=1, border=1)
- )
+ self.main_frame = Frame(parent, dict(background=self._BACKGROUND_COLOUR, padx=1, pady=1, border=1))
self._init_corpus_select(self.main_frame)
self._init_query_box(self.main_frame)
self._init_results_box(self.main_frame)
self._init_paging(self.main_frame)
self._init_status(self.main_frame)
- self.main_frame.pack(fill="both", expand=True)
+ self.main_frame.pack(fill='both', expand=True)
def _init_menubar(self):
self._result_size = IntVar(self.top)
menubar = Menu(self.top)
filemenu = Menu(menubar, tearoff=0, borderwidth=0)
- filemenu.add_command(
- label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-q"
- )
- menubar.add_cascade(label="File", underline=0, menu=filemenu)
+ filemenu.add_command(label='Exit', underline=1,
+ command=self.destroy, accelerator='Ctrl-q')
+ menubar.add_cascade(label='File', underline=0, menu=filemenu)
editmenu = Menu(menubar, tearoff=0)
rescntmenu = Menu(editmenu, tearoff=0)
- rescntmenu.add_radiobutton(
- label="20",
- variable=self._result_size,
- underline=0,
- value=20,
- command=self.set_result_size,
- )
- rescntmenu.add_radiobutton(
- label="50",
- variable=self._result_size,
- underline=0,
- value=50,
- command=self.set_result_size,
- )
- rescntmenu.add_radiobutton(
- label="100",
- variable=self._result_size,
- underline=0,
- value=100,
- command=self.set_result_size,
- )
+ rescntmenu.add_radiobutton(label='20', variable=self._result_size,
+ underline=0, value=20,
+ command=self.set_result_size)
+ rescntmenu.add_radiobutton(label='50', variable=self._result_size,
+ underline=0, value=50,
+ command=self.set_result_size)
+ rescntmenu.add_radiobutton(label='100', variable=self._result_size,
+ underline=0, value=100,
+ command=self.set_result_size)
rescntmenu.invoke(1)
- editmenu.add_cascade(label="Result Count", underline=0, menu=rescntmenu)
+ editmenu.add_cascade(label='Result Count', underline=0, menu=rescntmenu)
cntxmenu = Menu(editmenu, tearoff=0)
cntxbfmenu = Menu(cntxmenu, tearoff=0)
- cntxbfmenu.add_radiobutton(
- label="60 characters",
- variable=self._cntx_bf_len,
- underline=0,
- value=60,
- command=self.set_cntx_bf_len,
- )
- cntxbfmenu.add_radiobutton(
- label="80 characters",
- variable=self._cntx_bf_len,
- underline=0,
- value=80,
- command=self.set_cntx_bf_len,
- )
- cntxbfmenu.add_radiobutton(
- label="100 characters",
- variable=self._cntx_bf_len,
- underline=0,
- value=100,
- command=self.set_cntx_bf_len,
- )
+ cntxbfmenu.add_radiobutton(label='60 characters',
+ variable=self._cntx_bf_len,
+ underline=0, value=60,
+ command=self.set_cntx_bf_len)
+ cntxbfmenu.add_radiobutton(label='80 characters',
+ variable=self._cntx_bf_len,
+ underline=0, value=80,
+ command=self.set_cntx_bf_len)
+ cntxbfmenu.add_radiobutton(label='100 characters',
+ variable=self._cntx_bf_len,
+ underline=0, value=100,
+ command=self.set_cntx_bf_len)
cntxbfmenu.invoke(1)
- cntxmenu.add_cascade(label="Before", underline=0, menu=cntxbfmenu)
+ cntxmenu.add_cascade(label='Before', underline=0, menu=cntxbfmenu)
cntxafmenu = Menu(cntxmenu, tearoff=0)
- cntxafmenu.add_radiobutton(
- label="70 characters",
- variable=self._cntx_af_len,
- underline=0,
- value=70,
- command=self.set_cntx_af_len,
- )
- cntxafmenu.add_radiobutton(
- label="90 characters",
- variable=self._cntx_af_len,
- underline=0,
- value=90,
- command=self.set_cntx_af_len,
- )
- cntxafmenu.add_radiobutton(
- label="110 characters",
- variable=self._cntx_af_len,
- underline=0,
- value=110,
- command=self.set_cntx_af_len,
- )
+ cntxafmenu.add_radiobutton(label='70 characters',
+ variable=self._cntx_af_len,
+ underline=0, value=70,
+ command=self.set_cntx_af_len)
+ cntxafmenu.add_radiobutton(label='90 characters',
+ variable=self._cntx_af_len,
+ underline=0, value=90,
+ command=self.set_cntx_af_len)
+ cntxafmenu.add_radiobutton(label='110 characters',
+ variable=self._cntx_af_len,
+ underline=0, value=110,
+ command=self.set_cntx_af_len)
cntxafmenu.invoke(1)
- cntxmenu.add_cascade(label="After", underline=0, menu=cntxafmenu)
+ cntxmenu.add_cascade(label='After', underline=0, menu=cntxafmenu)
- editmenu.add_cascade(label="Context", underline=0, menu=cntxmenu)
+ editmenu.add_cascade(label='Context', underline=0, menu=cntxmenu)
- menubar.add_cascade(label="Edit", underline=0, menu=editmenu)
+ menubar.add_cascade(label='Edit', underline=0, menu=editmenu)
self.top.config(menu=menubar)
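
Every radiobutton group in this menubar shares one Tk variable: selecting an entry writes its value into the variable, and invoke(i) pre-selects a default programmatically. A self-contained sketch of the pattern (Python 3 tkinter names for brevity):

from tkinter import Tk, Menu, IntVar

top = Tk()
result_size = IntVar(top)
menubar = Menu(top)
rescntmenu = Menu(menubar, tearoff=0)
for n in (20, 50, 100):
    rescntmenu.add_radiobutton(label=str(n), variable=result_size, value=n)
rescntmenu.invoke(1)            # pre-select '50' -> result_size.get() == 50
menubar.add_cascade(label='Result Count', menu=rescntmenu)
top.config(menu=menubar)
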
innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
self.var = StringVar(innerframe)
self.var.set(self.model.DEFAULT_CORPUS)
- Label(
- innerframe,
- justify=LEFT,
- text=" Corpus: ",
- background=self._BACKGROUND_COLOUR,
- padx=2,
- pady=1,
- border=0,
- ).pack(side="left")
-
- other_corpora = list(self.model.CORPORA.keys()).remove(
- self.model.DEFAULT_CORPUS
- )
- om = OptionMenu(
- innerframe,
- self.var,
- self.model.DEFAULT_CORPUS,
- command=self.corpus_selected,
- *self.model.non_default_corpora()
- )
- om["borderwidth"] = 0
- om["highlightthickness"] = 1
- om.pack(side="left")
- innerframe.pack(side="top", fill="x", anchor="n")
+ Label(innerframe, justify=LEFT, text=' Corpus: ',
+ background=self._BACKGROUND_COLOUR, padx = 2, pady = 1, border = 0).pack(side='left')
+
+ other_corpora = list(self.model.CORPORA.keys()).remove(self.model.DEFAULT_CORPUS)
+ om = OptionMenu(innerframe, self.var, self.model.DEFAULT_CORPUS, command=self.corpus_selected, *self.model.non_default_corpora())
+ om['borderwidth'] = 0
+ om['highlightthickness'] = 1
+ om.pack(side='left')
+ innerframe.pack(side='top', fill='x', anchor='n')
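
One quirk is preserved on both sides of this hunk: list.remove() mutates in place and returns None, so other_corpora is always bound to None (the OptionMenu relies on non_default_corpora() instead). What that assignment presumably meant to compute, with illustrative names:

corpora = ['Brown Corpus', 'Indian Corpus', 'Sinica Treebank']
default = 'Brown Corpus'
other_corpora = [name for name in corpora if name != default]
# corpora.remove(default) would instead mutate corpora and evaluate to None
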
def _init_status(self, parent):
- self.status = Label(
- parent,
- justify=LEFT,
- relief=SUNKEN,
- background=self._BACKGROUND_COLOUR,
- border=0,
- padx=1,
- pady=0,
- )
- self.status.pack(side="top", anchor="sw")
+ self.status = Label(parent, justify=LEFT, relief=SUNKEN, background=self._BACKGROUND_COLOUR, border=0, padx = 1, pady = 0)
+ self.status.pack(side='top', anchor='sw')
def _init_query_box(self, parent):
innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
another = Frame(innerframe, background=self._BACKGROUND_COLOUR)
self.query_box = Entry(another, width=60)
- self.query_box.pack(side="left", fill="x", pady=25, anchor="center")
- self.search_button = Button(
- another,
- text="Search",
- command=self.search,
- borderwidth=1,
- highlightthickness=1,
- )
- self.search_button.pack(side="left", fill="x", pady=25, anchor="center")
- self.query_box.bind("<KeyPress-Return>", self.search_enter_keypress_handler)
+ self.query_box.pack(side='left', fill='x', pady=25, anchor='center')
+ self.search_button = Button(another, text='Search', command=self.search, borderwidth=1, highlightthickness=1)
+ self.search_button.pack(side='left', fill='x', pady=25, anchor='center')
+ self.query_box.bind('<KeyPress-Return>', self.search_enter_keypress_handler)
another.pack()
- innerframe.pack(side="top", fill="x", anchor="n")
+ innerframe.pack(side='top', fill='x', anchor='n')
def search_enter_keypress_handler(self, *event):
self.search()
i1 = Frame(innerframe)
i2 = Frame(innerframe)
vscrollbar = Scrollbar(i1, borderwidth=1)
- hscrollbar = Scrollbar(i2, borderwidth=1, orient="horiz")
- self.results_box = Text(
- i1,
- font=Font(family="courier", size="16"),
- state="disabled",
- borderwidth=1,
- yscrollcommand=vscrollbar.set,
- xscrollcommand=hscrollbar.set,
- wrap="none",
- width="40",
- height="20",
- exportselection=1,
- )
- self.results_box.pack(side="left", fill="both", expand=True)
- self.results_box.tag_config(
- self._HIGHLIGHT_WORD_TAG, foreground=self._HIGHLIGHT_WORD_COLOUR
- )
- self.results_box.tag_config(
- self._HIGHLIGHT_LABEL_TAG, foreground=self._HIGHLIGHT_LABEL_COLOUR
- )
- vscrollbar.pack(side="left", fill="y", anchor="e")
+ hscrollbar = Scrollbar(i2, borderwidth=1, orient='horiz')
+ self.results_box = Text(i1,
+ font=Font(family='courier', size='16'),
+ state='disabled', borderwidth=1,
+ yscrollcommand=vscrollbar.set,
+ xscrollcommand=hscrollbar.set, wrap='none', width='40', height = '20', exportselection=1)
+ self.results_box.pack(side='left', fill='both', expand=True)
+ self.results_box.tag_config(self._HIGHLIGHT_WORD_TAG, foreground=self._HIGHLIGHT_WORD_COLOUR)
+ self.results_box.tag_config(self._HIGHLIGHT_LABEL_TAG, foreground=self._HIGHLIGHT_LABEL_COLOUR)
+ vscrollbar.pack(side='left', fill='y', anchor='e')
vscrollbar.config(command=self.results_box.yview)
- hscrollbar.pack(side="left", fill="x", expand=True, anchor="w")
+ hscrollbar.pack(side='left', fill='x', expand=True, anchor='w')
hscrollbar.config(command=self.results_box.xview)
- # there is no other way of avoiding the overlap of scrollbars while using pack layout manager!!!
- Label(i2, text=" ", background=self._BACKGROUND_COLOUR).pack(
- side="left", anchor="e"
- )
- i1.pack(side="top", fill="both", expand=True, anchor="n")
- i2.pack(side="bottom", fill="x", anchor="s")
- innerframe.pack(side="top", fill="both", expand=True)
+        #there is no other way of avoiding the overlap of scrollbars while using the pack layout manager!!!
+ Label(i2, text=' ', background=self._BACKGROUND_COLOUR).pack(side='left', anchor='e')
+ i1.pack(side='top', fill='both', expand=True, anchor='n')
+ i2.pack(side='bottom', fill='x', anchor='s')
+ innerframe.pack(side='top', fill='both', expand=True)
def _init_paging(self, parent):
innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
- self.prev = prev = Button(
- innerframe,
- text="Previous",
- command=self.previous,
- width="10",
- borderwidth=1,
- highlightthickness=1,
- state="disabled",
- )
- prev.pack(side="left", anchor="center")
- self.next = next = Button(
- innerframe,
- text="Next",
- command=self.__next__,
- width="10",
- borderwidth=1,
- highlightthickness=1,
- state="disabled",
- )
- next.pack(side="right", anchor="center")
- innerframe.pack(side="top", fill="y")
+ self.prev = prev = Button(innerframe, text='Previous', command=self.previous, width='10', borderwidth=1, highlightthickness=1, state='disabled')
+ prev.pack(side='left', anchor='center')
+ self.next = next = Button(innerframe, text='Next', command=self.__next__, width='10', borderwidth=1, highlightthickness=1, state='disabled')
+ next.pack(side='right', anchor='center')
+ innerframe.pack(side='top', fill='y')
self.current_page = 0
def previous(self):
self.model.next(self.current_page + 1)
def about(self, *e):
- ABOUT = "NLTK Concordance Search Demo\n"
- TITLE = "About: NLTK Concordance Search Demo"
+ ABOUT = ("NLTK Concordance Search Demo\n")
+ TITLE = 'About: NLTK Concordance Search Demo'
try:
- from tkinter.messagebox import Message
-
+ from six.moves.tkinter_messagebox import Message
Message(message=ABOUT, title=TITLE, parent=self.main_frame).show()
except:
ShowText(self.top, TITLE, ABOUT)
self.after = self.top.after(POLL_INTERVAL, self._poll)
def handle_error_loading_corpus(self, event):
- self.status["text"] = "Error in loading " + self.var.get()
+ self.status['text'] = 'Error in loading ' + self.var.get()
self.unfreeze_editable()
self.clear_all()
self.freeze_editable()
def handle_corpus_loaded(self, event):
- self.status["text"] = self.var.get() + " is loaded"
+ self.status['text'] = self.var.get() + ' is loaded'
self.unfreeze_editable()
self.clear_all()
self.query_box.focus_set()
def handle_search_terminated(self, event):
- # todo: refactor the model such that it is less state sensitive
+ #todo: refactor the model such that it is less state sensitive
results = self.model.get_results()
self.write_results(results)
- self.status["text"] = ""
+ self.status['text'] = ''
if len(results) == 0:
- self.status["text"] = "No results found for " + self.model.query
+ self.status['text'] = 'No results found for ' + self.model.query
else:
- self.current_page = self.model.last_requested_page
+ self.current_page = self.model.last_requested_page
self.unfreeze_editable()
self.results_box.xview_moveto(self._FRACTION_LEFT_TEXT)
def handle_search_error(self, event):
- self.status["text"] = "Error in query " + self.model.query
+ self.status['text'] = 'Error in query ' + self.model.query
self.unfreeze_editable()
def corpus_selected(self, *args):
def load_corpus(self, selection):
if self.model.selected_corpus != selection:
- self.status["text"] = "Loading " + selection + "..."
+ self.status['text'] = 'Loading ' + selection + '...'
self.freeze_editable()
self.model.load_corpus(selection)
self.clear_results_box()
self.model.reset_results()
query = self.query_box.get()
- if len(query.strip()) == 0:
- return
- self.status["text"] = "Searching for " + query
+ if (len(query.strip()) == 0): return
+ self.status['text'] = 'Searching for ' + query
self.freeze_editable()
- self.model.search(query, self.current_page + 1)
+ self.model.search(query, self.current_page + 1, )
+
def write_results(self, results):
- self.results_box["state"] = "normal"
+ self.results_box['state'] = 'normal'
row = 1
for each in results:
sent, pos1, pos2 = each[0].strip(), each[1], each[2]
if len(sent) != 0:
- if pos1 < self._char_before:
+ if (pos1 < self._char_before):
sent, pos1, pos2 = self.pad(sent, pos1, pos2)
- sentence = sent[pos1 - self._char_before : pos1 + self._char_after]
+ sentence = sent[pos1-self._char_before:pos1+self._char_after]
if not row == len(results):
- sentence += "\n"
- self.results_box.insert(str(row) + ".0", sentence)
+ sentence += '\n'
+ self.results_box.insert(str(row) + '.0', sentence)
word_markers, label_markers = self.words_and_labels(sent, pos1, pos2)
- for marker in word_markers:
- self.results_box.tag_add(
- self._HIGHLIGHT_WORD_TAG,
- str(row) + "." + str(marker[0]),
- str(row) + "." + str(marker[1]),
- )
- for marker in label_markers:
- self.results_box.tag_add(
- self._HIGHLIGHT_LABEL_TAG,
- str(row) + "." + str(marker[0]),
- str(row) + "." + str(marker[1]),
- )
+ for marker in word_markers: self.results_box.tag_add(self._HIGHLIGHT_WORD_TAG, str(row) + '.' + str(marker[0]), str(row) + '.' + str(marker[1]))
+ for marker in label_markers: self.results_box.tag_add(self._HIGHLIGHT_LABEL_TAG, str(row) + '.' + str(marker[0]), str(row) + '.' + str(marker[1]))
row += 1
- self.results_box["state"] = "disabled"
+ self.results_box['state'] = 'disabled'
def words_and_labels(self, sentence, pos1, pos2):
search_exp = sentence[pos1:pos2]
words, labels = [], []
- labeled_words = search_exp.split(" ")
+ labeled_words = search_exp.split(' ')
index = 0
for each in labeled_words:
- if each == "":
+ if each == '':
index += 1
else:
- word, label = each.split("/")
- words.append(
- (self._char_before + index, self._char_before + index + len(word))
- )
+ word, label = each.split('/')
+ words.append((self._char_before + index, self._char_before + index + len(word)))
index += len(word) + 1
- labels.append(
- (self._char_before + index, self._char_before + index + len(label))
- )
+ labels.append((self._char_before + index, self._char_before + index + len(label)))
index += len(label)
index += 1
return words, labels
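
words_and_labels walks the matched 'word/TAG' tokens and records character spans for words and tags separately, so the view can colour them with different text tags. The bookkeeping can be checked standalone (taking _char_before as 0 for simplicity):

search_exp = 'dog/NN barks/VBZ'
index, words, labels = 0, [], []
for token in search_exp.split(' '):
    word, label = token.split('/')
    words.append((index, index + len(word)))
    index += len(word) + 1                 # skip the '/'
    labels.append((index, index + len(label)))
    index += len(label) + 1                # skip the following space
print(words)    # [(0, 3), (7, 12)]
print(labels)   # [(4, 6), (13, 16)]
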
if hstart >= self._char_before:
return sent, hstart, hend
d = self._char_before - hstart
- sent = "".join([" "] * d) + sent
+ sent = ''.join([' '] * d) + sent
return sent, hstart + d, hend + d
def destroy(self, *e):
- if self.top is None:
- return
+ if self.top is None: return
self.top.after_cancel(self.after)
self.top.destroy()
self.top = None
self.clear_results_box()
def clear_results_box(self):
- self.results_box["state"] = "normal"
+ self.results_box['state'] = 'normal'
self.results_box.delete("1.0", END)
- self.results_box["state"] = "disabled"
+ self.results_box['state'] = 'disabled'
def freeze_editable(self):
- self.query_box["state"] = "disabled"
- self.search_button["state"] = "disabled"
- self.prev["state"] = "disabled"
- self.next["state"] = "disabled"
+ self.query_box['state'] = 'disabled'
+ self.search_button['state'] = 'disabled'
+ self.prev['state'] = 'disabled'
+ self.next['state'] = 'disabled'
def unfreeze_editable(self):
- self.query_box["state"] = "normal"
- self.search_button["state"] = "normal"
+ self.query_box['state'] = 'normal'
+ self.search_button['state'] = 'normal'
self.set_paging_button_states()
def set_paging_button_states(self):
if self.current_page == 0 or self.current_page == 1:
- self.prev["state"] = "disabled"
+ self.prev['state'] = 'disabled'
else:
- self.prev["state"] = "normal"
+ self.prev['state'] = 'normal'
if self.model.has_more_pages(self.current_page):
- self.next["state"] = "normal"
+ self.next['state'] = 'normal'
else:
- self.next["state"] = "disabled"
+ self.next['state'] = 'disabled'
def fire_event(self, event):
- # Firing an event so that rendering of widgets happen in the mainloop thread
- self.top.event_generate(event, when="tail")
+        #Firing an event so that rendering of widgets happens in the mainloop thread
+ self.top.event_generate(event, when='tail')
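
The comment above captures the app's threading rule: worker threads never touch widgets; they generate a virtual event, and the handler bound to that event does the rendering inside the mainloop thread. A minimal self-contained version of the hand-off (Python 3 tkinter names for brevity):

import threading
from tkinter import Tk, Label

root = Tk()
status = Label(root, text='working...')
status.pack()
# the handler runs in the mainloop thread, so touching widgets is safe here
root.bind('<<TaskDone>>', lambda e: status.config(text='done'))

def worker():
    # ... long-running work, no widget access ...
    root.event_generate('<<TaskDone>>', when='tail')

threading.Thread(target=worker).start()
root.mainloop()
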
def mainloop(self, *args, **kwargs):
- if in_idle():
- return
+ if in_idle(): return
self.top.mainloop(*args, **kwargs)
-
class ConcordanceSearchModel(object):
def __init__(self, queue):
self.queue = queue
def run(self):
try:
ts = self.model.CORPORA[self.name]()
- self.model.tagged_sents = [
- " ".join(w + "/" + t for (w, t) in sent) for sent in ts
- ]
+ self.model.tagged_sents = [' '.join(w+'/'+t for (w,t) in sent) for sent in ts]
self.model.queue.put(CORPUS_LOADED_EVENT)
except Exception as e:
print(e)
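
The loader's list comprehension above flattens each tagged sentence into a single 'word/TAG word/TAG ...' string; that flat form is what lets the search thread run ordinary regular expressions over tagged text:

ts = [[('the', 'DT'), ('dog', 'NN')], [('it', 'PRP'), ('ran', 'VBD')]]
tagged_sents = [' '.join(w + '/' + t for (w, t) in sent) for sent in ts]
print(tagged_sents)   # ['the/DT dog/NN', 'it/PRP ran/VBD']
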
def run(self):
q = self.processed_query()
sent_pos, i, sent_count = [], 0, 0
- for sent in self.model.tagged_sents[self.model.last_sent_searched :]:
+ for sent in self.model.tagged_sents[self.model.last_sent_searched:]:
try:
m = re.search(q, sent)
except re.error:
self.model.last_sent_searched += sent_count - 1
break
sent_count += 1
- if self.count >= len(sent_pos):
+ if (self.count >= len(sent_pos)):
self.model.last_sent_searched += sent_count - 1
self.model.last_page = self.page
self.model.set_results(self.page, sent_pos)
def processed_query(self):
new = []
for term in self.model.query.split():
- term = re.sub(r"\.", r"[^/ ]", term)
- if re.match("[A-Z]+$", term):
- new.append(BOUNDARY + WORD_OR_TAG + "/" + term + BOUNDARY)
- elif "/" in term:
+ term = re.sub(r'\.', r'[^/ ]', term)
+ if re.match('[A-Z]+$', term):
+ new.append(BOUNDARY + WORD_OR_TAG + '/' + term + BOUNDARY)
+ elif '/' in term:
new.append(BOUNDARY + term + BOUNDARY)
else:
- new.append(BOUNDARY + term + "/" + WORD_OR_TAG + BOUNDARY)
- return " ".join(new)
-
+ new.append(BOUNDARY + term + '/' + WORD_OR_TAG + BOUNDARY)
+ return ' '.join(new)
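
processed_query translates the query language into a regex over that flattened form: an all-uppercase term is read as a tag, a term containing '/' is kept as-is, anything else is a word, and '.' becomes a one-character wildcard that cannot cross a word/tag boundary. With the module-level constants (BOUNDARY = r'\b' and WORD_OR_TAG = '[^/ ]+' in the NLTK source; stated here as an assumption, since they sit outside this hunk), the query 'dog NN' compiles to:

BOUNDARY = r'\b'          # assumed module constants (defined outside this hunk)
WORD_OR_TAG = '[^/ ]+'

word_term = BOUNDARY + 'dog' + '/' + WORD_OR_TAG + BOUNDARY
tag_term = BOUNDARY + WORD_OR_TAG + '/' + 'NN' + BOUNDARY
print(word_term + ' ' + tag_term)
# \bdog/[^/ ]+\b \b[^/ ]+/NN\b  -- 'dog' with any tag, then any word tagged NN
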
def app():
d = ConcordanceSearchView()
d.mainloop()
-
-if __name__ == "__main__":
+if __name__ == '__main__':
app()
-__all__ = ["app"]
+__all__ = ['app']
Instant Regular Expressions
Created by Aristide Grange
"""
+
+from six.moves.tkinter import (Frame, Label, PhotoImage, Scrollbar, Text, Tk,
+ SEL_FIRST, SEL_LAST)
import re
import itertools
-from tkinter import (
- Frame,
- Label,
- PhotoImage,
- Scrollbar,
- Text,
- Tk,
- SEL_FIRST,
- SEL_LAST,
-)
-
-
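
The import swap here follows the same pattern as elsewhere in this patch: six.moves.tkinter resolves to Tkinter on Python 2 and tkinter on Python 3, roughly equivalent to this fallback (a sketch, not six's actual implementation):

try:
    from tkinter import Frame, Label   # Python 3
except ImportError:
    from Tkinter import Frame, Label   # Python 2
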
windowTitle = "Finding (and Replacing) Nemo"
initialFind = r"n(.*?)e(.*?)m(.*?)o"
initialRepl = r"M\1A\2K\3I"
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
"""
images = {
- "FIND": "R0lGODlhMAAiAPcAMf/////37//35//n1v97Off///f/9/f37/fexvfOvfeEQvd7QvdrQvdrKfdaKfdSMfdSIe/v9+/v7+/v5+/n3u/e1u/Wxu/Gre+1lO+tnO+thO+Ua+97Y+97Oe97Me9rOe9rMe9jOe9jMe9jIe9aMefe5+fe3ufezuece+eEWudzQudaIedSIedKMedKIedCKedCId7e1t7Wzt7Oxt7Gvd69vd69rd61pd6ljN6UjN6Ue96EY95zY95rUt5rQt5jMd5SId5KIdbn59be3tbGztbGvda1rdaEa9Z7a9Z7WtZzQtZzOdZzMdZjMdZaQtZSOdZSMdZKMdZCKdZCGNY5Ic7W1s7Oxs7Gtc69xs69tc69rc6tpc6llM6clM6cjM6Ue86EY85zWs5rSs5SKc5KKc5KGMa1tcatrcalvcalnMaUpcZ7c8ZzMcZrUsZrOcZrMcZaQsZSOcZSMcZKMcZCKcZCGMYxIcYxGL3Gxr21tb21rb2lpb2crb2cjL2UnL2UlL2UhL2Ec717Wr17Ur1zWr1rMb1jUr1KMb1KIb1CIb0xGLWlrbWlpbWcnLWEe7V7c7VzY7VzUrVSKbVKMbVCMbVCIbU5KbUxIbUxEK2lta2lpa2clK2UjK2MnK2MlK2Ea617e61za61rY61rMa1jSq1aUq1aSq1SQq1KKa0xEKWlnKWcnKWUnKWUhKWMjKWEa6Vza6VrWqVjMaVaUqVaKaVSMaVCMaU5KaUxIaUxGJyclJyMe5yElJyEhJx7e5x7c5xrOZxaQpxSOZxKQpw5IZSMhJSEjJR7c5Rre5RrY5RrUpRSQpRSKZRCOZRCKZQxKZQxIYyEhIx7hIxza4xzY4xrc4xjUoxaa4xaUoxSSoxKQoxCMYw5GIR7c4Rzc4Rre4RjY4RjWoRaa4RSWoRSUoRSMYRKQoRCOYQ5KYQxIXtra3taY3taSntKOXtCMXtCKXNCMXM5MXMxIWtSUmtKSmtKQmtCOWs5MWs5KWs5IWNCKWMxIVIxKUIQCDkhGAAAACH+AS4ALAAAAAAwACIAAAj/AAEIHEiwoMGDCBMqXMiwoUOHMqxIeEiRoZVp7cpZ29WrF4WKIAd208dGAQEVbiTVChUjZMU9+pYQmPmBZpxgvVw+nDdKwQICNVcIXQEkTgKdDdUJ+/nggVAXK1xI3TEA6UIr2uJ8iBqka1cXXTlkqGoVYRZ7iLyqBSs0iiEtZQVKiDGxBI1u3NR6lUpGDKg8MSgEQCphU7Z22vhg0dILXRCpYLuSCcYJT4wqXASBQaBzU7klHxC127OHD7ZDJFpERqRt0x5OnwQpmZmCLEhrbgg4WIHO1RY+nbQ9WRGEDJlmnXwJ+9FBgXMCIzYMVijBBgYMFxIMqJBMSc0Ht7qh/+Gjpte2rnYsYeNlasWIBgQ6yCewIoPCCp/cyP/wgUGbXVu0QcADZNBDnh98gHMLGXYQUw02w61QU3wdbNWDbQVVIIhMMwFF1DaZiPLBAy7E04kafrjSizaK3LFNNc0AAYRQDsAHHQlJ2IDQJ2zE1+EKDjiAijShkECCC8Qgw4cr7ZgyzC2WaHPNLWWoNeNWPiRAw0QFWQFMhz8C+QQ20yAiVSrY+MGOJCsccsst2GCzoHFxxEGGC+8hgs0MB2kyCpgzrUDCbs1Es41UdtATHFFkWELMOtsoQsYcgvRRQw5RSDgGOjZMR1AvPQIq6KCo9AKOJWDd48owQlHR4DXEKP9iyRrK+DNNBTu4RwIPFeTAGUG7hAomkA84gEg1m6ADljy9PBKGGJY4ig0xlsTBRSn98FOFDUC8pwQOPkgHbCGAzhTkA850s0c7j6Hjix9+gBIrMXLeAccWXUCyiRBcBEECdEJ98KtAqtBCYQc/OvDENnl4gYpUxISCIjjzylkGGV9okYUVNogRhAOBuuAEhjG08wOgDYzAgA5bCjIoCe5uwUk80RKTTSppPREGGGCIISOQ9AXBg6cC6WIywvCpoMHAocRBwhP4bHLFLujYkV42xNxBRhAyGrc113EgYtRBerDDDHMoDCyQEL5sE083EkgwQyBhxGFHMM206DUixGxmE0wssbQjCQ4JCaFKFwgQTVAVVhQUwAVPIFJKrHfYYRwi6OCDzzuIJIFhXAD0EccPsYRiSyqKSDpFcWSMIcZRoBMkQyA2BGZDIKSYcggih8TRRg4VxM5QABVYYLxgwiev/PLMCxQQADs=",
- "find": "R0lGODlhMAAiAPQAMf////f39+/v7+fn597e3tbW1s7OzsbGxr29vbW1ta2traWlpZycnJSUlIyMjISEhHt7e3Nzc2tra2NjY1paWlJSUkpKSkJCQjk5OSkpKRgYGAAAAAAAAAAAAAAAAAAAACH+AS4ALAAAAAAwACIAAAX/ICCOZGmeaKquY2AGLiuvMCAUBuHWc48Kh0iFInEYCb4kSQCxPBiMxkMigRQEgJiSFVBYHNGG0RiZOHjblWAiiY4fkDhEYoBp06dAWfyAQyKAgAwDaHgnB0RwgYASgQ0IhDuGJDAIFhMRVFSLEX8QCJJ4AQM5AgQHTZqqjBAOCQQEkWkCDRMUFQsICQ4Vm5maEwwHOAsPDTpKMAsUDlO4CssTcb+2DAp8YGCyNFoCEsZwFQ3QDRTTVBRS0g1QbgsCd5QAAwgIBwYFAwStzQ8UEdCKVchky0yVBw7YuXkAKt4IAg74vXHVagqFBRgXSCAyYWAVCH0SNhDTitCJfSL5/4RbAPKPhQYYjVCYYAvCP0BxEDaD8CheAAHNwqh8MMGPSwgLeJWhwHSjqkYI+xg4MMCEgQjtRvZ7UAYCpghMF7CxONOWJkYR+rCpY4JlVpVxKDwYWEactKW9mhYRtqCTgwgWEMArERSK1j5q//6T8KXonFsShpiJkAECgQYVjykooCVA0JGHEWNiYCHThTFeb3UkoiCCBgwGEKQ1kuAJlhFwhA71h5SukwUM5qqeCSGBgicEWkfNiWSERtBad4JNIBaQBaQah1ToyGZBAnsIuIJs1qnqiAIVjIE2gnAB1T5x0icgzXT79ipgMOOEH6HBbREBMJCeGEY08IoLAkzB1YYFwjxwSUGSNULQJnNUwRYlCcyEkALIxECAP9cNMMABYpRhy3ZsSLDaR70oUAiABGCkAxowCGCAAfDYIQACXoElGRsdXWDBdg2Y90IWktDYGYAB9PWHP0PMdFZaF07SQgAFNDAMAQg0QA1UC8xoZQl22JGFPgWkOUCOL1pZQyhjxinnnCWEAAA7",
- "REPL": "R0lGODlhMAAjAPcAMf/////3//+lOf+UKf+MEPf///f39/f35/fv7/ecQvecOfecKfeUIfeUGPeUEPeUCPeMAO/37+/v9+/v3u/n3u/n1u+9jO+9c++1hO+ta++tY++tWu+tUu+tSu+lUu+lQu+lMe+UMe+UKe+UGO+UEO+UAO+MCOfv5+fvxufn7+fn5+fnzue9lOe9c+e1jOe1e+e1c+e1a+etWuetUuelQuecOeeUUueUCN7e597e3t7e1t7ezt7evd7Wzt7Oxt7Ovd7Otd7Opd7OnN7Gtd7Gpd69lN61hN6ta96lStbextberdbW3tbWztbWxtbOvdbOrda1hNalUtaECM7W1s7Ozs7Oxs7Otc7Gxs7Gvc69tc69rc69pc61jM6lc8bWlMbOvcbGxsbGpca9tca9pca1nMaMAL3OhL3Gtb21vb21tb2tpb2tnL2tlLW9tbW9pbW9e7W1pbWtjLWcKa21nK2tra2tnK2tlK2lpa2llK2ljK2le6WlnKWljKWUe6WUc6WUY5y1QpyclJycjJychJyUc5yMY5StY5SUe5SMhJSMe5SMc5SMWpSEa5SESoyUe4yMhIyEY4SlKYScWoSMe4SEe4SEa4R7c4R7Y3uMY3uEe3t7e3t7c3tza3tzY3trKXtjIXOcAHOUMXOEY3Nzc3NzWnNrSmulCGuUMWuMGGtzWmtrY2taMWtaGGOUOWOMAGNzUmNjWmNjSmNaUmNaQmNaOWNaIWNSCFqcAFpjUlpSMVpSIVpSEFpKKVKMAFJSUlJSSlJSMVJKMVJKGFJKAFI5CEqUAEqEAEpzQkpKIUpCQkpCGEpCAEo5EEoxAEJjOUJCOUJCAEI5IUIxADl7ADlaITlCOTkxMTkxKTkxEDkhADFzADFrGDE5OTExADEpEClrCCkxKSkpKSkpISkpACkhCCkhACkYACFzACFrACEhCCEYGBhjEBhjABghABgYCBgYABgQEBgQABAQABAIAAhjAAhSAAhKAAgIEAgICABaAABCAAAhAAAQAAAIAAAAAAAAACH+AS4ALAAAAAAwACMAAAj/AAEIHEiwoMGDCBMqXMiwocOHAA4cgEixIIIJO3JMmAjADIqKFU/8MHIkg5EgYXx4iaTkI0iHE6wE2TCggYILQayEAgXIy8uGCKz8sDCAQAMRG3iEcXULlJkJPwli3OFjh9UdYYLE6NBhA04UXHoVA2XoTZgfPKBWlOBDphAWOdfMcfMDLloeO3hIMjbWVCQ5Fn6E2UFxgpsgFjYIEBADrZU6luqEEfqjTqpt54z1uuWqTIcgWAk7PECGzIUQDRosDmxlUrVJkwQJkqVuX71v06YZcyUlROAdbnLAJKPFyAYFAhoMwFlnEh0rWkpz8raPHm7dqKKc/KFFkBUrVn1M/ziBcEIeLUEQI8/AYk0i9Be4sqjsrN66c9/OnbobhpR3HkIUoZ0WVnBE0AGLFKKFD0HAFUQe77HQgQI1hRBDEHMcY0899bBzihZuCPILJD8EccEGGzwAQhFaUHHQH82sUkgeNHISDBk8WCCCcsqFUEQWmOyzjz3sUGNNOO5Y48YOEgowAAQhnBScQV00k82V47jzjy9CXZBcjziFoco//4CDiSOyhPMPLkJZkEBqJmRQxA9uZGEQD8Ncmc044/zzDF2IZQBCCDYE8QMZz/iiCSx0neHGI7BIhhhNn+1gxRpokEcQAp7seWU7/PwTyxqG/iCEEVzQmUombnDRxRExzP9nBR2PCKLFD3UJwcMPa/SRqUGNWJmNOVn+M44ukMRB4KGcWDNLVhuUMEIJAlzwA3DJBHMJIXm4sQYhqyxCRQQGLSIsn1qac2UzysQSyzX/hLMGD0F0IMCODYAQBA9W/PKPOcRiw0wzwxTiokF9dLMnuv/Mo+fCZF7jBr0xbDDCACWEYKgb1vzjDp/jZNOMLX0IZxAKq2TZTjtaOjwOsXyG+s8sZJTIQsUdIGHoJPf8w487QI/TDSt5mGwQFZxc406o8HiDJchk/ltLHpSlJwSvz5DpTjvmuGNOM57koelBOaAhiCaaPBLL0wwbm003peRBnBZqJMJL1ECz/HXYYx/NdAIOOVCxQyLorswymU93o0wuwfAiTDNR/xz0MLXU0XdCE+UwSTRZAq2lsSATu+4wkGvt+TjNzPLrQyegAUku2Hij5cd8LhxyM8QIg4w18HgcdC6BTBFSDmfQqsovttveDcG7lFLHI75cE841sARCxeWsnxC4G9HADPK6ywzDCRqBo0EHHWhMgT1IJzziNci1N7PMKnSYfML96/90AiJKey/0KtbLX1QK0rrNnQ541xugQ7SHhkXBghN0SKACWRc4KlAhBwKcIOYymJCAAAA7",
- "repl": "R0lGODlhMAAjAPQAMf////f39+/v7+fn597e3tbW1s7OzsbGxr29vbW1ta2traWlpZycnJSUlIyMjISEhHt7e3Nzc2tra2NjY1paWlJSUkpKSkJCQjk5OTExMSkpKSEhIRgYGBAQEAgICAAAACH+AS4ALAAAAAAwACMAAAX/ICCOZGmeaKqubOu+gCDANBkIQ1EMQhAghFptYEAkEgjEwXBo7ISvweGgWCwUysPjwTgEoCafTySYIhYMxgLBjEQgCULvCw0QdAZdoVhUIJUFChISEAxYeQM1N1OMTAp+UwZ5eA4TEhFbDWYFdC4ECVMJjwl5BwsQa0umEhUVlhESDgqlBp0rAn5nVpBMDxeZDRQbHBgWFBSWDgtLBnFjKwRYCI9VqQsPs0YKEcMXFq0UEalFDWx4BAO2IwPjppAKDkrTWKYUGd7fEJJFEZpM00cOzCgh4EE8SaoWxKNixQooBRMyZMBwAYIRBhUgLDGS4MoBJeoANMhAgQsaCRZm/5lqaCUJhA4cNHjDoKEDBlJUHqkBlYBTiQUZNGjYMMxDhY3VWk6R4MEDBoMUak5AqoYBqANIBo4wcGGDUKIeLlzVZmWJggsVIkwAZaQSA3kdZzlKkIiEAAlDvW5oOkEBs488JTw44oeUIwdvVTFTUK7uiAAPgubt8GFDhQepqETAQCFU1UMGzlqAgFhUsAcCS0AO6lUDhw8xNRSbENGDhgWSHjWUe6ACbKITizmopZoBa6KvOwj9uuHDhwxyj3xekgDDhw5EvWKo0IB4iQLCOCC/njc7ZQ8UeGvza+ABZZgcxJNc4FO1gc0cOsCUrHevc8tdIMTIAhc4F198G2Qwwd8CBIQUAwEINABBBJUwR9R5wElgVRLwWODBBx4cGB8GEzDQIAo33CGJA8gh+JoH/clUgQU0YvDhdfmJdwEFC6Sjgg8yEPAABsPkh2F22cl2AQbn6QdTghTQ5eAJAQyQAAQV0MSBB9gRVZ4GE1mw5JZOAmiAVi1UWcAZDrDyZXYTeaOhA/bIVuIBPtKQ4h7ViYekUPdcEAEbzTzCRp5CADmAAwj+ORGPBcgwAAHo9ABGCYtm0ChwFHShlRiXhmHlkAcCiOeUodqQw5W0oXLAiamy4MOkjOyAaqxUymApDCEAADs=",
+ "FIND":"R0lGODlhMAAiAPcAMf/////37//35//n1v97Off///f/9/f37/fexvfOvfeEQvd7QvdrQvdrKfdaKfdSMfdSIe/v9+/v7+/v5+/n3u/e1u/Wxu/Gre+1lO+tnO+thO+Ua+97Y+97Oe97Me9rOe9rMe9jOe9jMe9jIe9aMefe5+fe3ufezuece+eEWudzQudaIedSIedKMedKIedCKedCId7e1t7Wzt7Oxt7Gvd69vd69rd61pd6ljN6UjN6Ue96EY95zY95rUt5rQt5jMd5SId5KIdbn59be3tbGztbGvda1rdaEa9Z7a9Z7WtZzQtZzOdZzMdZjMdZaQtZSOdZSMdZKMdZCKdZCGNY5Ic7W1s7Oxs7Gtc69xs69tc69rc6tpc6llM6clM6cjM6Ue86EY85zWs5rSs5SKc5KKc5KGMa1tcatrcalvcalnMaUpcZ7c8ZzMcZrUsZrOcZrMcZaQsZSOcZSMcZKMcZCKcZCGMYxIcYxGL3Gxr21tb21rb2lpb2crb2cjL2UnL2UlL2UhL2Ec717Wr17Ur1zWr1rMb1jUr1KMb1KIb1CIb0xGLWlrbWlpbWcnLWEe7V7c7VzY7VzUrVSKbVKMbVCMbVCIbU5KbUxIbUxEK2lta2lpa2clK2UjK2MnK2MlK2Ea617e61za61rY61rMa1jSq1aUq1aSq1SQq1KKa0xEKWlnKWcnKWUnKWUhKWMjKWEa6Vza6VrWqVjMaVaUqVaKaVSMaVCMaU5KaUxIaUxGJyclJyMe5yElJyEhJx7e5x7c5xrOZxaQpxSOZxKQpw5IZSMhJSEjJR7c5Rre5RrY5RrUpRSQpRSKZRCOZRCKZQxKZQxIYyEhIx7hIxza4xzY4xrc4xjUoxaa4xaUoxSSoxKQoxCMYw5GIR7c4Rzc4Rre4RjY4RjWoRaa4RSWoRSUoRSMYRKQoRCOYQ5KYQxIXtra3taY3taSntKOXtCMXtCKXNCMXM5MXMxIWtSUmtKSmtKQmtCOWs5MWs5KWs5IWNCKWMxIVIxKUIQCDkhGAAAACH+AS4ALAAAAAAwACIAAAj/AAEIHEiwoMGDCBMqXMiwoUOHMqxIeEiRoZVp7cpZ29WrF4WKIAd208dGAQEVbiTVChUjZMU9+pYQmPmBZpxgvVw+nDdKwQICNVcIXQEkTgKdDdUJ+/nggVAXK1xI3TEA6UIr2uJ8iBqka1cXXTlkqGoVYRZ7iLyqBSs0iiEtZQVKiDGxBI1u3NR6lUpGDKg8MSgEQCphU7Z22vhg0dILXRCpYLuSCcYJT4wqXASBQaBzU7klHxC127OHD7ZDJFpERqRt0x5OnwQpmZmCLEhrbgg4WIHO1RY+nbQ9WRGEDJlmnXwJ+9FBgXMCIzYMVijBBgYMFxIMqJBMSc0Ht7qh/+Gjpte2rnYsYeNlasWIBgQ6yCewIoPCCp/cyP/wgUGbXVu0QcADZNBDnh98gHMLGXYQUw02w61QU3wdbNWDbQVVIIhMMwFF1DaZiPLBAy7E04kafrjSizaK3LFNNc0AAYRQDsAHHQlJ2IDQJ2zE1+EKDjiAijShkECCC8Qgw4cr7ZgyzC2WaHPNLWWoNeNWPiRAw0QFWQFMhz8C+QQ20yAiVSrY+MGOJCsccsst2GCzoHFxxEGGC+8hgs0MB2kyCpgzrUDCbs1Es41UdtATHFFkWELMOtsoQsYcgvRRQw5RSDgGOjZMR1AvPQIq6KCo9AKOJWDd48owQlHR4DXEKP9iyRrK+DNNBTu4RwIPFeTAGUG7hAomkA84gEg1m6ADljy9PBKGGJY4ig0xlsTBRSn98FOFDUC8pwQOPkgHbCGAzhTkA850s0c7j6Hjix9+gBIrMXLeAccWXUCyiRBcBEECdEJ98KtAqtBCYQc/OvDENnl4gYpUxISCIjjzylkGGV9okYUVNogRhAOBuuAEhjG08wOgDYzAgA5bCjIoCe5uwUk80RKTTSppPREGGGCIISOQ9AXBg6cC6WIywvCpoMHAocRBwhP4bHLFLujYkV42xNxBRhAyGrc113EgYtRBerDDDHMoDCyQEL5sE083EkgwQyBhxGFHMM206DUixGxmE0wssbQjCQ4JCaFKFwgQTVAVVhQUwAVPIFJKrHfYYRwi6OCDzzuIJIFhXAD0EccPsYRiSyqKSDpFcWSMIcZRoBMkQyA2BGZDIKSYcggih8TRRg4VxM5QABVYYLxgwiev/PLMCxQQADs=",
+ "find":"R0lGODlhMAAiAPQAMf////f39+/v7+fn597e3tbW1s7OzsbGxr29vbW1ta2traWlpZycnJSUlIyMjISEhHt7e3Nzc2tra2NjY1paWlJSUkpKSkJCQjk5OSkpKRgYGAAAAAAAAAAAAAAAAAAAACH+AS4ALAAAAAAwACIAAAX/ICCOZGmeaKquY2AGLiuvMCAUBuHWc48Kh0iFInEYCb4kSQCxPBiMxkMigRQEgJiSFVBYHNGG0RiZOHjblWAiiY4fkDhEYoBp06dAWfyAQyKAgAwDaHgnB0RwgYASgQ0IhDuGJDAIFhMRVFSLEX8QCJJ4AQM5AgQHTZqqjBAOCQQEkWkCDRMUFQsICQ4Vm5maEwwHOAsPDTpKMAsUDlO4CssTcb+2DAp8YGCyNFoCEsZwFQ3QDRTTVBRS0g1QbgsCd5QAAwgIBwYFAwStzQ8UEdCKVchky0yVBw7YuXkAKt4IAg74vXHVagqFBRgXSCAyYWAVCH0SNhDTitCJfSL5/4RbAPKPhQYYjVCYYAvCP0BxEDaD8CheAAHNwqh8MMGPSwgLeJWhwHSjqkYI+xg4MMCEgQjtRvZ7UAYCpghMF7CxONOWJkYR+rCpY4JlVpVxKDwYWEactKW9mhYRtqCTgwgWEMArERSK1j5q//6T8KXonFsShpiJkAECgQYVjykooCVA0JGHEWNiYCHThTFeb3UkoiCCBgwGEKQ1kuAJlhFwhA71h5SukwUM5qqeCSGBgicEWkfNiWSERtBad4JNIBaQBaQah1ToyGZBAnsIuIJs1qnqiAIVjIE2gnAB1T5x0icgzXT79ipgMOOEH6HBbREBMJCeGEY08IoLAkzB1YYFwjxwSUGSNULQJnNUwRYlCcyEkALIxECAP9cNMMABYpRhy3ZsSLDaR70oUAiABGCkAxowCGCAAfDYIQACXoElGRsdXWDBdg2Y90IWktDYGYAB9PWHP0PMdFZaF07SQgAFNDAMAQg0QA1UC8xoZQl22JGFPgWkOUCOL1pZQyhjxinnnCWEAAA7",
+ "REPL":"R0lGODlhMAAjAPcAMf/////3//+lOf+UKf+MEPf///f39/f35/fv7/ecQvecOfecKfeUIfeUGPeUEPeUCPeMAO/37+/v9+/v3u/n3u/n1u+9jO+9c++1hO+ta++tY++tWu+tUu+tSu+lUu+lQu+lMe+UMe+UKe+UGO+UEO+UAO+MCOfv5+fvxufn7+fn5+fnzue9lOe9c+e1jOe1e+e1c+e1a+etWuetUuelQuecOeeUUueUCN7e597e3t7e1t7ezt7evd7Wzt7Oxt7Ovd7Otd7Opd7OnN7Gtd7Gpd69lN61hN6ta96lStbextberdbW3tbWztbWxtbOvdbOrda1hNalUtaECM7W1s7Ozs7Oxs7Otc7Gxs7Gvc69tc69rc69pc61jM6lc8bWlMbOvcbGxsbGpca9tca9pca1nMaMAL3OhL3Gtb21vb21tb2tpb2tnL2tlLW9tbW9pbW9e7W1pbWtjLWcKa21nK2tra2tnK2tlK2lpa2llK2ljK2le6WlnKWljKWUe6WUc6WUY5y1QpyclJycjJychJyUc5yMY5StY5SUe5SMhJSMe5SMc5SMWpSEa5SESoyUe4yMhIyEY4SlKYScWoSMe4SEe4SEa4R7c4R7Y3uMY3uEe3t7e3t7c3tza3tzY3trKXtjIXOcAHOUMXOEY3Nzc3NzWnNrSmulCGuUMWuMGGtzWmtrY2taMWtaGGOUOWOMAGNzUmNjWmNjSmNaUmNaQmNaOWNaIWNSCFqcAFpjUlpSMVpSIVpSEFpKKVKMAFJSUlJSSlJSMVJKMVJKGFJKAFI5CEqUAEqEAEpzQkpKIUpCQkpCGEpCAEo5EEoxAEJjOUJCOUJCAEI5IUIxADl7ADlaITlCOTkxMTkxKTkxEDkhADFzADFrGDE5OTExADEpEClrCCkxKSkpKSkpISkpACkhCCkhACkYACFzACFrACEhCCEYGBhjEBhjABghABgYCBgYABgQEBgQABAQABAIAAhjAAhSAAhKAAgIEAgICABaAABCAAAhAAAQAAAIAAAAAAAAACH+AS4ALAAAAAAwACMAAAj/AAEIHEiwoMGDCBMqXMiwocOHAA4cgEixIIIJO3JMmAjADIqKFU/8MHIkg5EgYXx4iaTkI0iHE6wE2TCggYILQayEAgXIy8uGCKz8sDCAQAMRG3iEcXULlJkJPwli3OFjh9UdYYLE6NBhA04UXHoVA2XoTZgfPKBWlOBDphAWOdfMcfMDLloeO3hIMjbWVCQ5Fn6E2UFxgpsgFjYIEBADrZU6luqEEfqjTqpt54z1uuWqTIcgWAk7PECGzIUQDRosDmxlUrVJkwQJkqVuX71v06YZcyUlROAdbnLAJKPFyAYFAhoMwFlnEh0rWkpz8raPHm7dqKKc/KFFkBUrVn1M/ziBcEIeLUEQI8/AYk0i9Be4sqjsrN66c9/OnbobhpR3HkIUoZ0WVnBE0AGLFKKFD0HAFUQe77HQgQI1hRBDEHMcY0899bBzihZuCPILJD8EccEGGzwAQhFaUHHQH82sUkgeNHISDBk8WCCCcsqFUEQWmOyzjz3sUGNNOO5Y48YOEgowAAQhnBScQV00k82V47jzjy9CXZBcjziFoco//4CDiSOyhPMPLkJZkEBqJmRQxA9uZGEQD8Ncmc044/zzDF2IZQBCCDYE8QMZz/iiCSx0neHGI7BIhhhNn+1gxRpokEcQAp7seWU7/PwTyxqG/iCEEVzQmUombnDRxRExzP9nBR2PCKLFD3UJwcMPa/SRqUGNWJmNOVn+M44ukMRB4KGcWDNLVhuUMEIJAlzwA3DJBHMJIXm4sQYhqyxCRQQGLSIsn1qac2UzysQSyzX/hLMGD0F0IMCODYAQBA9W/PKPOcRiw0wzwxTiokF9dLMnuv/Mo+fCZF7jBr0xbDDCACWEYKgb1vzjDp/jZNOMLX0IZxAKq2TZTjtaOjwOsXyG+s8sZJTIQsUdIGHoJPf8w487QI/TDSt5mGwQFZxc406o8HiDJchk/ltLHpSlJwSvz5DpTjvmuGNOM57koelBOaAhiCaaPBLL0wwbm003peRBnBZqJMJL1ECz/HXYYx/NdAIOOVCxQyLorswymU93o0wuwfAiTDNR/xz0MLXU0XdCE+UwSTRZAq2lsSATu+4wkGvt+TjNzPLrQyegAUku2Hij5cd8LhxyM8QIg4w18HgcdC6BTBFSDmfQqsovttveDcG7lFLHI75cE841sARCxeWsnxC4G9HADPK6ywzDCRqBo0EHHWhMgT1IJzziNci1N7PMKnSYfML96/90AiJKey/0KtbLX1QK0rrNnQ541xugQ7SHhkXBghN0SKACWRc4KlAhBwKcIOYymJCAAAA7",
+ "repl":"R0lGODlhMAAjAPQAMf////f39+/v7+fn597e3tbW1s7OzsbGxr29vbW1ta2traWlpZycnJSUlIyMjISEhHt7e3Nzc2tra2NjY1paWlJSUkpKSkJCQjk5OTExMSkpKSEhIRgYGBAQEAgICAAAACH+AS4ALAAAAAAwACMAAAX/ICCOZGmeaKqubOu+gCDANBkIQ1EMQhAghFptYEAkEgjEwXBo7ISvweGgWCwUysPjwTgEoCafTySYIhYMxgLBjEQgCULvCw0QdAZdoVhUIJUFChISEAxYeQM1N1OMTAp+UwZ5eA4TEhFbDWYFdC4ECVMJjwl5BwsQa0umEhUVlhESDgqlBp0rAn5nVpBMDxeZDRQbHBgWFBSWDgtLBnFjKwRYCI9VqQsPs0YKEcMXFq0UEalFDWx4BAO2IwPjppAKDkrTWKYUGd7fEJJFEZpM00cOzCgh4EE8SaoWxKNixQooBRMyZMBwAYIRBhUgLDGS4MoBJeoANMhAgQsaCRZm/5lqaCUJhA4cNHjDoKEDBlJUHqkBlYBTiQUZNGjYMMxDhY3VWk6R4MEDBoMUak5AqoYBqANIBo4wcGGDUKIeLlzVZmWJggsVIkwAZaQSA3kdZzlKkIiEAAlDvW5oOkEBs488JTw44oeUIwdvVTFTUK7uiAAPgubt8GFDhQepqETAQCFU1UMGzlqAgFhUsAcCS0AO6lUDhw8xNRSbENGDhgWSHjWUe6ACbKITizmopZoBa6KvOwj9uuHDhwxyj3xekgDDhw5EvWKo0IB4iQLCOCC/njc7ZQ8UeGvza+ABZZgcxJNc4FO1gc0cOsCUrHevc8tdIMTIAhc4F198G2Qwwd8CBIQUAwEINABBBJUwR9R5wElgVRLwWODBBx4cGB8GEzDQIAo33CGJA8gh+JoH/clUgQU0YvDhdfmJdwEFC6Sjgg8yEPAABsPkh2F22cl2AQbn6QdTghTQ5eAJAQyQAAQV0MSBB9gRVZ4GE1mw5JZOAmiAVi1UWcAZDrDyZXYTeaOhA/bIVuIBPtKQ4h7ViYekUPdcEAEbzTzCRp5CADmAAwj+ORGPBcgwAAHo9ABGCYtm0ChwFHShlRiXhmHlkAcCiOeUodqQw5W0oXLAiamy4MOkjOyAaqxUymApDCEAADs=",
}
-colors = ["#FF7B39", "#80F121"]
-emphColors = ["#DAFC33", "#F42548"]
+colors = ["#FF7B39","#80F121"]
+emphColors = ["#DAFC33","#F42548"]
fieldParams = {
- "height": 3,
- "width": 70,
- "font": ("monaco", 14),
- "highlightthickness": 0,
- "borderwidth": 0,
- "background": "white",
+ "height":3,
+ "width":70,
+ "font":("monaco",14),
+ "highlightthickness":0,
+ "borderwidth":0,
+ "background":"white",
}
textParams = {
- "bg": "#F7E0D4",
- "fg": "#2321F1",
- "highlightthickness": 0,
- "width": 1,
- "height": 10,
- "font": ("verdana", 16),
- "wrap": "word",
+ "bg":"#F7E0D4",
+ "fg":"#2321F1",
+ "highlightthickness":0,
+ "width":1,
+ "height":10,
+ "font":("verdana",16),
+ "wrap":"word",
}
def __init__(self, image, initialField, initialText):
frm = Frame(root)
frm.config(background="white")
- self.image = PhotoImage(format="gif", data=images[image.upper()])
- self.imageDimmed = PhotoImage(format="gif", data=images[image])
+ self.image = PhotoImage(format='gif',data=images[image.upper()])
+ self.imageDimmed = PhotoImage(format='gif',data=images[image])
self.img = Label(frm)
self.img.config(borderwidth=0)
- self.img.pack(side="left")
+ self.img.pack(side = "left")
self.fld = Text(frm, **fieldParams)
- self.initScrollText(frm, self.fld, initialField)
+ self.initScrollText(frm,self.fld,initialField)
frm = Frame(root)
self.txt = Text(frm, **textParams)
- self.initScrollText(frm, self.txt, initialText)
+ self.initScrollText(frm,self.txt,initialText)
for i in range(2):
- self.txt.tag_config(colors[i], background=colors[i])
- self.txt.tag_config("emph" + colors[i], foreground=emphColors[i])
-
- def initScrollText(self, frm, txt, contents):
+ self.txt.tag_config(colors[i], background = colors[i])
+ self.txt.tag_config("emph"+colors[i], foreground = emphColors[i])
+ def initScrollText(self,frm,txt,contents):
scl = Scrollbar(frm)
- scl.config(command=txt.yview)
- scl.pack(side="right", fill="y")
- txt.pack(side="left", expand=True, fill="x")
- txt.config(yscrollcommand=scl.set)
- txt.insert("1.0", contents)
- frm.pack(fill="x")
+ scl.config(command = txt.yview)
+ scl.pack(side="right",fill="y")
+ txt.pack(side = "left", expand=True, fill="x")
+ txt.config(yscrollcommand = scl.set)
+ txt.insert("1.0",contents)
+ frm.pack(fill = "x")
Frame(height=2, bd=1, relief="ridge").pack(fill="x")
-
def refresh(self):
self.colorCycle = itertools.cycle(colors)
try:
self.substitute()
- self.img.config(image=self.image)
+ self.img.config(image = self.image)
except re.error:
- self.img.config(image=self.imageDimmed)
+ self.img.config(image = self.imageDimmed)
class FindZone(Zone):
- def addTags(self, m):
+ def addTags(self,m):
color = next(self.colorCycle)
- self.txt.tag_add(color, "1.0+%sc" % m.start(), "1.0+%sc" % m.end())
+ self.txt.tag_add(color,"1.0+%sc"%m.start(),"1.0+%sc"%m.end())
try:
- self.txt.tag_add(
- "emph" + color, "1.0+%sc" % m.start("emph"), "1.0+%sc" % m.end("emph")
- )
+ self.txt.tag_add("emph"+color,"1.0+%sc"%m.start("emph"),
+ "1.0+%sc"%m.end("emph"))
except:
pass
-
- def substitute(self, *args):
+ def substitute(self,*args):
for color in colors:
- self.txt.tag_remove(color, "1.0", "end")
- self.txt.tag_remove("emph" + color, "1.0", "end")
- self.rex = re.compile("") # default value in case of misformed regexp
- self.rex = re.compile(self.fld.get("1.0", "end")[:-1], re.MULTILINE)
+ self.txt.tag_remove(color,"1.0","end")
+ self.txt.tag_remove("emph"+color,"1.0","end")
+ self.rex = re.compile("") # default value in case of misformed regexp
+ self.rex = re.compile(self.fld.get("1.0","end")[:-1],re.MULTILINE)
try:
- re.compile("(?P<emph>%s)" % self.fld.get(SEL_FIRST, SEL_LAST))
- self.rexSel = re.compile(
- "%s(?P<emph>%s)%s"
- % (
- self.fld.get("1.0", SEL_FIRST),
- self.fld.get(SEL_FIRST, SEL_LAST),
- self.fld.get(SEL_LAST, "end")[:-1],
- ),
- re.MULTILINE,
- )
+ re.compile("(?P<emph>%s)" % self.fld.get(SEL_FIRST,
+ SEL_LAST))
+ self.rexSel = re.compile("%s(?P<emph>%s)%s" % (
+ self.fld.get("1.0",SEL_FIRST),
+ self.fld.get(SEL_FIRST,SEL_LAST),
+ self.fld.get(SEL_LAST,"end")[:-1],
+ ),re.MULTILINE)
except:
self.rexSel = self.rex
- self.rexSel.sub(self.addTags, self.txt.get("1.0", "end"))
+ self.rexSel.sub(self.addTags,self.txt.get("1.0","end"))
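
FindZone.substitute wraps whatever slice of the pattern is currently selected in a named group, (?P<emph>...), so the sub-match can be located and coloured independently of the whole match. Standalone, with a simplified version of the demo's pattern and '(.*?)' as the selection:

import re

rexSel = re.compile(r'n(?P<emph>.*?)e')
m = rexSel.search('finding nemo')
print(m.start(), m.end())                # 2 10  (whole match 'nding ne')
print(m.start('emph'), m.end('emph'))    # 3 9   (just the selected slice)
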
class ReplaceZone(Zone):
- def addTags(self, m):
- s = sz.rex.sub(self.repl, m.group())
- self.txt.delete(
- "1.0+%sc" % (m.start() + self.diff), "1.0+%sc" % (m.end() + self.diff)
- )
- self.txt.insert("1.0+%sc" % (m.start() + self.diff), s, next(self.colorCycle))
+ def addTags(self,m):
+ s = sz.rex.sub(self.repl,m.group())
+ self.txt.delete("1.0+%sc"%(m.start()+self.diff),
+ "1.0+%sc"%(m.end()+self.diff))
+ self.txt.insert("1.0+%sc"%(m.start()+self.diff),s,
+ next(self.colorCycle))
self.diff += len(s) - (m.end() - m.start())
-
def substitute(self):
- self.txt.delete("1.0", "end")
- self.txt.insert("1.0", sz.txt.get("1.0", "end")[:-1])
+ self.txt.delete("1.0","end")
+ self.txt.insert("1.0",sz.txt.get("1.0","end")[:-1])
self.diff = 0
- self.repl = rex0.sub(r"\\g<\1>", self.fld.get("1.0", "end")[:-1])
- sz.rex.sub(self.addTags, sz.txt.get("1.0", "end")[:-1])
+ self.repl = rex0.sub(r"\\g<\1>",self.fld.get("1.0","end")[:-1])
+ sz.rex.sub(self.addTags,sz.txt.get("1.0","end")[:-1])
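
ReplaceZone first normalises the user's backreferences: rex0 matches \1, \2, ... (the negative lookbehind leaves escaped backslashes alone) and rewrites them to the unambiguous \g<n> form before substituting. For the demo's initial replacement string:

import re

rex0 = re.compile(r'(?<!\\)\\([0-9]+)')
repl = rex0.sub(r'\\g<\1>', r'M\1A\2K\3I')
print(repl)    # M\g<1>A\g<2>K\g<3>I
print(re.sub(r'n(.*?)e(.*?)m(.*?)o', repl, 'finding nemo'))   # fiMding nAKI
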
def launchRefresh(_):
def app():
global root, sz, rz, rex0
root = Tk()
- root.resizable(height=False, width=True)
+ root.resizable(height=False,width=True)
root.title(windowTitle)
- root.minsize(width=250, height=0)
- sz = FindZone("find", initialFind, initialText)
- sz.fld.bind("<Button-1>", launchRefresh)
- sz.fld.bind("<ButtonRelease-1>", launchRefresh)
- sz.fld.bind("<B1-Motion>", launchRefresh)
+ root.minsize(width=250,height=0)
+ sz = FindZone("find",initialFind,initialText)
+ sz.fld.bind("<Button-1>",launchRefresh)
+ sz.fld.bind("<ButtonRelease-1>",launchRefresh)
+ sz.fld.bind("<B1-Motion>",launchRefresh)
sz.rexSel = re.compile("")
- rz = ReplaceZone("repl", initialRepl, "")
+ rz = ReplaceZone("repl",initialRepl,"")
rex0 = re.compile(r"(?<!\\)\\([0-9]+)")
- root.bind_all("<Key>", launchRefresh)
+ root.bind_all("<Key>",launchRefresh)
launchRefresh(None)
root.mainloop()
-
-if __name__ == "__main__":
+if __name__ == '__main__':
app()
-__all__ = ["app"]
+__all__ = ['app']
# Natural Language Toolkit: Recursive Descent Parser Application
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
[Ctrl-p]\t Print
[q]\t Quit
"""
+from __future__ import division
-from tkinter.font import Font
-from tkinter import Listbox, IntVar, Button, Frame, Label, Menu, Scrollbar, Tk
+from six.moves.tkinter_font import Font
+from six.moves.tkinter import (Listbox, IntVar, Button, Frame, Label, Menu,
+ Scrollbar, Tk)
from nltk.tree import Tree
from nltk.util import in_idle
from nltk.draw.util import TextWidget, ShowText, CanvasFrame, EntryDialog
from nltk.draw import CFGEditor, TreeSegmentWidget, tree_to_treesegment
-
class RecursiveDescentApp(object):
"""
A graphical tool for exploring the recursive descent parser. The tool
through the parsing process, performing the operations that
``RecursiveDescentParser`` would use.
"""
-
def __init__(self, grammar, sent, trace=0):
self._sent = sent
self._parser = SteppingRecursiveDescentParser(grammar, trace)
# Set up the main window.
self._top = Tk()
- self._top.title("Recursive Descent Parser Application")
+ self._top.title('Recursive Descent Parser Application')
# Set up key bindings.
self._init_bindings()
self._parser.initialize(self._sent)
# Resize callback
- self._canvas.bind("<Configure>", self._configure)
+ self._canvas.bind('<Configure>', self._configure)
#########################################
## Initialization Helpers
        # What's our font size (default=same as sysfont)
self._size = IntVar(root)
- self._size.set(self._sysfont.cget("size"))
+ self._size.set(self._sysfont.cget('size'))
- self._boldfont = Font(family="helvetica", weight="bold", size=self._size.get())
- self._font = Font(family="helvetica", size=self._size.get())
- if self._size.get() < 0:
- big = self._size.get() - 2
- else:
- big = self._size.get() + 2
- self._bigfont = Font(family="helvetica", weight="bold", size=big)
+ self._boldfont = Font(family='helvetica', weight='bold',
+ size=self._size.get())
+ self._font = Font(family='helvetica',
+ size=self._size.get())
+ if self._size.get() < 0: big = self._size.get()-2
+ else: big = self._size.get()+2
+ self._bigfont = Font(family='helvetica', weight='bold',
+ size=big)
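
The View menu's size entries work because tkinter Font objects are named fonts: reconfiguring the Font re-renders every widget it was assigned to, which is what the resize machinery relies on. A minimal sketch:

from tkinter import Tk, Label
from tkinter.font import Font

root = Tk()
boldfont = Font(family='helvetica', weight='bold', size=12)
Label(root, text='resizable', font=boldfont).pack()
root.after(1000, lambda: boldfont.config(size=18))   # live update after 1s
root.mainloop()
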
def _init_grammar(self, parent):
# Grammar view.
self._prodframe = listframe = Frame(parent)
- self._prodframe.pack(fill="both", side="left", padx=2)
- self._prodlist_label = Label(
- self._prodframe, font=self._boldfont, text="Available Expansions"
- )
+ self._prodframe.pack(fill='both', side='left', padx=2)
+ self._prodlist_label = Label(self._prodframe, font=self._boldfont,
+ text='Available Expansions')
self._prodlist_label.pack()
- self._prodlist = Listbox(
- self._prodframe,
- selectmode="single",
- relief="groove",
- background="white",
- foreground="#909090",
- font=self._font,
- selectforeground="#004040",
- selectbackground="#c0f0c0",
- )
-
- self._prodlist.pack(side="right", fill="both", expand=1)
+ self._prodlist = Listbox(self._prodframe, selectmode='single',
+ relief='groove', background='white',
+ foreground='#909090', font=self._font,
+ selectforeground='#004040',
+ selectbackground='#c0f0c0')
+
+ self._prodlist.pack(side='right', fill='both', expand=1)
self._productions = list(self._parser.grammar().productions())
for production in self._productions:
- self._prodlist.insert("end", (" %s" % production))
+ self._prodlist.insert('end', (' %s' % production))
self._prodlist.config(height=min(len(self._productions), 25))
# Add a scrollbar if there are more than 25 productions.
if len(self._productions) > 25:
- listscroll = Scrollbar(self._prodframe, orient="vertical")
- self._prodlist.config(yscrollcommand=listscroll.set)
+ listscroll = Scrollbar(self._prodframe,
+ orient='vertical')
+ self._prodlist.config(yscrollcommand = listscroll.set)
listscroll.config(command=self._prodlist.yview)
- listscroll.pack(side="left", fill="y")
+ listscroll.pack(side='left', fill='y')
# If they select a production, apply it.
- self._prodlist.bind("<<ListboxSelect>>", self._prodlist_select)
+ self._prodlist.bind('<<ListboxSelect>>', self._prodlist_select)
def _init_bindings(self):
# Key bindings are a good thing.
- self._top.bind("<Control-q>", self.destroy)
- self._top.bind("<Control-x>", self.destroy)
- self._top.bind("<Escape>", self.destroy)
- self._top.bind("e", self.expand)
- # self._top.bind('<Alt-e>', self.expand)
- # self._top.bind('<Control-e>', self.expand)
- self._top.bind("m", self.match)
- self._top.bind("<Alt-m>", self.match)
- self._top.bind("<Control-m>", self.match)
- self._top.bind("b", self.backtrack)
- self._top.bind("<Alt-b>", self.backtrack)
- self._top.bind("<Control-b>", self.backtrack)
- self._top.bind("<Control-z>", self.backtrack)
- self._top.bind("<BackSpace>", self.backtrack)
- self._top.bind("a", self.autostep)
- # self._top.bind('<Control-a>', self.autostep)
- self._top.bind("<Control-space>", self.autostep)
- self._top.bind("<Control-c>", self.cancel_autostep)
- self._top.bind("<space>", self.step)
- self._top.bind("<Delete>", self.reset)
- self._top.bind("<Control-p>", self.postscript)
- # self._top.bind('<h>', self.help)
- # self._top.bind('<Alt-h>', self.help)
- self._top.bind("<Control-h>", self.help)
- self._top.bind("<F1>", self.help)
- # self._top.bind('<g>', self.toggle_grammar)
- # self._top.bind('<Alt-g>', self.toggle_grammar)
- # self._top.bind('<Control-g>', self.toggle_grammar)
- self._top.bind("<Control-g>", self.edit_grammar)
- self._top.bind("<Control-t>", self.edit_sentence)
+ self._top.bind('<Control-q>', self.destroy)
+ self._top.bind('<Control-x>', self.destroy)
+ self._top.bind('<Escape>', self.destroy)
+ self._top.bind('e', self.expand)
+ #self._top.bind('<Alt-e>', self.expand)
+ #self._top.bind('<Control-e>', self.expand)
+ self._top.bind('m', self.match)
+ self._top.bind('<Alt-m>', self.match)
+ self._top.bind('<Control-m>', self.match)
+ self._top.bind('b', self.backtrack)
+ self._top.bind('<Alt-b>', self.backtrack)
+ self._top.bind('<Control-b>', self.backtrack)
+ self._top.bind('<Control-z>', self.backtrack)
+ self._top.bind('<BackSpace>', self.backtrack)
+ self._top.bind('a', self.autostep)
+ #self._top.bind('<Control-a>', self.autostep)
+ self._top.bind('<Control-space>', self.autostep)
+ self._top.bind('<Control-c>', self.cancel_autostep)
+ self._top.bind('<space>', self.step)
+ self._top.bind('<Delete>', self.reset)
+ self._top.bind('<Control-p>', self.postscript)
+ #self._top.bind('<h>', self.help)
+ #self._top.bind('<Alt-h>', self.help)
+ self._top.bind('<Control-h>', self.help)
+ self._top.bind('<F1>', self.help)
+ #self._top.bind('<g>', self.toggle_grammar)
+ #self._top.bind('<Alt-g>', self.toggle_grammar)
+ #self._top.bind('<Control-g>', self.toggle_grammar)
+ self._top.bind('<Control-g>', self.edit_grammar)
+ self._top.bind('<Control-t>', self.edit_sentence)
def _init_buttons(self, parent):
# Set up the frames.
self._buttonframe = buttonframe = Frame(parent)
- buttonframe.pack(fill="none", side="bottom", padx=3, pady=2)
- Button(
- buttonframe,
- text="Step",
- background="#90c0d0",
- foreground="black",
- command=self.step,
- ).pack(side="left")
- Button(
- buttonframe,
- text="Autostep",
- background="#90c0d0",
- foreground="black",
- command=self.autostep,
- ).pack(side="left")
- Button(
- buttonframe,
- text="Expand",
- underline=0,
- background="#90f090",
- foreground="black",
- command=self.expand,
- ).pack(side="left")
- Button(
- buttonframe,
- text="Match",
- underline=0,
- background="#90f090",
- foreground="black",
- command=self.match,
- ).pack(side="left")
- Button(
- buttonframe,
- text="Backtrack",
- underline=0,
- background="#f0a0a0",
- foreground="black",
- command=self.backtrack,
- ).pack(side="left")
+ buttonframe.pack(fill='none', side='bottom', padx=3, pady=2)
+ Button(buttonframe, text='Step',
+ background='#90c0d0', foreground='black',
+ command=self.step,).pack(side='left')
+ Button(buttonframe, text='Autostep',
+ background='#90c0d0', foreground='black',
+ command=self.autostep,).pack(side='left')
+ Button(buttonframe, text='Expand', underline=0,
+ background='#90f090', foreground='black',
+ command=self.expand).pack(side='left')
+ Button(buttonframe, text='Match', underline=0,
+ background='#90f090', foreground='black',
+ command=self.match).pack(side='left')
+ Button(buttonframe, text='Backtrack', underline=0,
+ background='#f0a0a0', foreground='black',
+ command=self.backtrack).pack(side='left')
# Replace autostep...
-
- # self._autostep_button = Button(buttonframe, text='Autostep',
- # underline=0, command=self.autostep)
- # self._autostep_button.pack(side='left')
+# self._autostep_button = Button(buttonframe, text='Autostep',
+# underline=0, command=self.autostep)
+# self._autostep_button.pack(side='left')
def _configure(self, event):
self._autostep = 0
(x1, y1, x2, y2) = self._cframe.scrollregion()
y2 = event.height - 6
- self._canvas["scrollregion"] = "%d %d %d %d" % (x1, y1, x2, y2)
+ self._canvas['scrollregion'] = '%d %d %d %d' % (x1,y1,x2,y2)
self._redraw()
def _init_feedback(self, parent):
self._feedbackframe = feedbackframe = Frame(parent)
- feedbackframe.pack(fill="x", side="bottom", padx=3, pady=3)
- self._lastoper_label = Label(
- feedbackframe, text="Last Operation:", font=self._font
- )
- self._lastoper_label.pack(side="left")
- lastoperframe = Frame(feedbackframe, relief="sunken", border=1)
- lastoperframe.pack(fill="x", side="right", expand=1, padx=5)
- self._lastoper1 = Label(
- lastoperframe, foreground="#007070", background="#f0f0f0", font=self._font
- )
- self._lastoper2 = Label(
- lastoperframe,
- anchor="w",
- width=30,
- foreground="#004040",
- background="#f0f0f0",
- font=self._font,
- )
- self._lastoper1.pack(side="left")
- self._lastoper2.pack(side="left", fill="x", expand=1)
+ feedbackframe.pack(fill='x', side='bottom', padx=3, pady=3)
+ self._lastoper_label = Label(feedbackframe, text='Last Operation:',
+ font=self._font)
+ self._lastoper_label.pack(side='left')
+ lastoperframe = Frame(feedbackframe, relief='sunken', border=1)
+ lastoperframe.pack(fill='x', side='right', expand=1, padx=5)
+ self._lastoper1 = Label(lastoperframe, foreground='#007070',
+ background='#f0f0f0', font=self._font)
+ self._lastoper2 = Label(lastoperframe, anchor='w', width=30,
+ foreground='#004040', background='#f0f0f0',
+ font=self._font)
+ self._lastoper1.pack(side='left')
+ self._lastoper2.pack(side='left', fill='x', expand=1)
def _init_canvas(self, parent):
- self._cframe = CanvasFrame(
- parent,
- background="white",
- # width=525, height=250,
- closeenough=10,
- border=2,
- relief="sunken",
- )
- self._cframe.pack(expand=1, fill="both", side="top", pady=2)
+ self._cframe = CanvasFrame(parent, background='white',
+ #width=525, height=250,
+ closeenough=10,
+ border=2, relief='sunken')
+ self._cframe.pack(expand=1, fill='both', side='top', pady=2)
canvas = self._canvas = self._cframe.canvas()
# Initially, there's no tree or text
menubar = Menu(parent)
filemenu = Menu(menubar, tearoff=0)
- filemenu.add_command(
- label="Reset Parser", underline=0, command=self.reset, accelerator="Del"
- )
- filemenu.add_command(
- label="Print to Postscript",
- underline=0,
- command=self.postscript,
- accelerator="Ctrl-p",
- )
- filemenu.add_command(
- label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x"
- )
- menubar.add_cascade(label="File", underline=0, menu=filemenu)
+ filemenu.add_command(label='Reset Parser', underline=0,
+ command=self.reset, accelerator='Del')
+ filemenu.add_command(label='Print to Postscript', underline=0,
+ command=self.postscript, accelerator='Ctrl-p')
+ filemenu.add_command(label='Exit', underline=1,
+ command=self.destroy, accelerator='Ctrl-x')
+ menubar.add_cascade(label='File', underline=0, menu=filemenu)
editmenu = Menu(menubar, tearoff=0)
- editmenu.add_command(
- label="Edit Grammar",
- underline=5,
- command=self.edit_grammar,
- accelerator="Ctrl-g",
- )
- editmenu.add_command(
- label="Edit Text",
- underline=5,
- command=self.edit_sentence,
- accelerator="Ctrl-t",
- )
- menubar.add_cascade(label="Edit", underline=0, menu=editmenu)
+ editmenu.add_command(label='Edit Grammar', underline=5,
+ command=self.edit_grammar,
+ accelerator='Ctrl-g')
+ editmenu.add_command(label='Edit Text', underline=5,
+ command=self.edit_sentence,
+ accelerator='Ctrl-t')
+ menubar.add_cascade(label='Edit', underline=0, menu=editmenu)
rulemenu = Menu(menubar, tearoff=0)
- rulemenu.add_command(
- label="Step", underline=1, command=self.step, accelerator="Space"
- )
+ rulemenu.add_command(label='Step', underline=1,
+ command=self.step, accelerator='Space')
rulemenu.add_separator()
- rulemenu.add_command(
- label="Match", underline=0, command=self.match, accelerator="Ctrl-m"
- )
- rulemenu.add_command(
- label="Expand", underline=0, command=self.expand, accelerator="Ctrl-e"
- )
+ rulemenu.add_command(label='Match', underline=0,
+ command=self.match, accelerator='Ctrl-m')
+ rulemenu.add_command(label='Expand', underline=0,
+ command=self.expand, accelerator='Ctrl-e')
rulemenu.add_separator()
- rulemenu.add_command(
- label="Backtrack", underline=0, command=self.backtrack, accelerator="Ctrl-b"
- )
- menubar.add_cascade(label="Apply", underline=0, menu=rulemenu)
+ rulemenu.add_command(label='Backtrack', underline=0,
+ command=self.backtrack, accelerator='Ctrl-b')
+ menubar.add_cascade(label='Apply', underline=0, menu=rulemenu)
viewmenu = Menu(menubar, tearoff=0)
- viewmenu.add_checkbutton(
- label="Show Grammar",
- underline=0,
- variable=self._show_grammar,
- command=self._toggle_grammar,
- )
+ viewmenu.add_checkbutton(label="Show Grammar", underline=0,
+ variable=self._show_grammar,
+ command=self._toggle_grammar)
viewmenu.add_separator()
- viewmenu.add_radiobutton(
- label="Tiny",
- variable=self._size,
- underline=0,
- value=10,
- command=self.resize,
- )
- viewmenu.add_radiobutton(
- label="Small",
- variable=self._size,
- underline=0,
- value=12,
- command=self.resize,
- )
- viewmenu.add_radiobutton(
- label="Medium",
- variable=self._size,
- underline=0,
- value=14,
- command=self.resize,
- )
- viewmenu.add_radiobutton(
- label="Large",
- variable=self._size,
- underline=0,
- value=18,
- command=self.resize,
- )
- viewmenu.add_radiobutton(
- label="Huge",
- variable=self._size,
- underline=0,
- value=24,
- command=self.resize,
- )
- menubar.add_cascade(label="View", underline=0, menu=viewmenu)
+ viewmenu.add_radiobutton(label='Tiny', variable=self._size,
+ underline=0, value=10, command=self.resize)
+ viewmenu.add_radiobutton(label='Small', variable=self._size,
+ underline=0, value=12, command=self.resize)
+ viewmenu.add_radiobutton(label='Medium', variable=self._size,
+ underline=0, value=14, command=self.resize)
+ viewmenu.add_radiobutton(label='Large', variable=self._size,
+ underline=0, value=18, command=self.resize)
+ viewmenu.add_radiobutton(label='Huge', variable=self._size,
+ underline=0, value=24, command=self.resize)
+ menubar.add_cascade(label='View', underline=0, menu=viewmenu)
animatemenu = Menu(menubar, tearoff=0)
- animatemenu.add_radiobutton(
- label="No Animation", underline=0, variable=self._animation_frames, value=0
- )
- animatemenu.add_radiobutton(
- label="Slow Animation",
- underline=0,
- variable=self._animation_frames,
- value=10,
- accelerator="-",
- )
- animatemenu.add_radiobutton(
- label="Normal Animation",
- underline=0,
- variable=self._animation_frames,
- value=5,
- accelerator="=",
- )
- animatemenu.add_radiobutton(
- label="Fast Animation",
- underline=0,
- variable=self._animation_frames,
- value=2,
- accelerator="+",
- )
+ animatemenu.add_radiobutton(label="No Animation", underline=0,
+ variable=self._animation_frames,
+ value=0)
+ animatemenu.add_radiobutton(label="Slow Animation", underline=0,
+ variable=self._animation_frames,
+ value=10, accelerator='-')
+ animatemenu.add_radiobutton(label="Normal Animation", underline=0,
+ variable=self._animation_frames,
+ value=5, accelerator='=')
+ animatemenu.add_radiobutton(label="Fast Animation", underline=0,
+ variable=self._animation_frames,
+ value=2, accelerator='+')
menubar.add_cascade(label="Animate", underline=1, menu=animatemenu)
+
helpmenu = Menu(menubar, tearoff=0)
- helpmenu.add_command(label="About", underline=0, command=self.about)
- helpmenu.add_command(
- label="Instructions", underline=0, command=self.help, accelerator="F1"
- )
- menubar.add_cascade(label="Help", underline=0, menu=helpmenu)
+ helpmenu.add_command(label='About', underline=0,
+ command=self.about)
+ helpmenu.add_command(label='Instructions', underline=0,
+ command=self.help, accelerator='F1')
+ menubar.add_cascade(label='Help', underline=0, menu=helpmenu)
parent.config(menu=menubar)
#########################################
def _get(self, widget, treeloc):
- for i in treeloc:
- widget = widget.subtrees()[i]
+ for i in treeloc: widget = widget.subtrees()[i]
if isinstance(widget, TreeSegmentWidget):
widget = widget.label()
return widget
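
_get navigates by tree location, a sequence of child indices from the root; on a plain nltk Tree the same addressing looks like this:

from nltk.tree import Tree

t = Tree.fromstring('(S (NP (DT the) (NN dog)) (VP (VBD ran)))')
node = t
for i in (0, 1):        # treeloc: first child, then its second child
    node = node[i]
print(node)             # (NN dog)
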
self._canvas.delete(self._textline)
# Draw the tree.
- helv = ("helvetica", -self._size.get())
- bold = ("helvetica", -self._size.get(), "bold")
- attribs = {
- "tree_color": "#000000",
- "tree_width": 2,
- "node_font": bold,
- "leaf_font": helv,
- }
+ helv = ('helvetica', -self._size.get())
+ bold = ('helvetica', -self._size.get(), 'bold')
+ attribs = {'tree_color': '#000000', 'tree_width': 2,
+ 'node_font': bold, 'leaf_font': helv,}
tree = self._parser.tree()
self._tree = tree_to_treesegment(canvas, tree, **attribs)
self._cframe.add_widget(self._tree, 30, 5)
# Draw the text.
- helv = ("helvetica", -self._size.get())
+ helv = ('helvetica', -self._size.get())
bottom = y = self._cframe.scrollregion()[3]
- self._textwidgets = [
- TextWidget(canvas, word, font=self._font) for word in self._sent
- ]
+ self._textwidgets = [TextWidget(canvas, word, font=self._font)
+ for word in self._sent]
for twidget in self._textwidgets:
self._cframe.add_widget(twidget, 0, 0)
- twidget.move(0, bottom - twidget.bbox()[3] - 5)
+ twidget.move(0, bottom-twidget.bbox()[3]-5)
y = min(y, twidget.bbox()[1])
# Draw a line over the text, to separate it from the tree.
- self._textline = canvas.create_line(-5000, y - 5, 5000, y - 5, dash=".")
+ self._textline = canvas.create_line(-5000, y-5, 5000, y-5, dash='.')
# Highlight appropriate nodes.
self._highlight_nodes()
# Make sure the text lines up.
self._position_text()
+
def _redraw_quick(self):
# This should be more-or-less sufficient after an animation.
self._highlight_nodes()
def _highlight_nodes(self):
# Highlight the list of nodes to be checked.
- bold = ("helvetica", -self._size.get(), "bold")
+ bold = ('helvetica', -self._size.get(), 'bold')
for treeloc in self._parser.frontier()[:1]:
- self._get(self._tree, treeloc)["color"] = "#20a050"
- self._get(self._tree, treeloc)["font"] = bold
+ self._get(self._tree, treeloc)['color'] = '#20a050'
+ self._get(self._tree, treeloc)['font'] = bold
for treeloc in self._parser.frontier()[1:]:
- self._get(self._tree, treeloc)["color"] = "#008080"
+ self._get(self._tree, treeloc)['color'] = '#008080'
def _highlight_prodlist(self):
# Highlight the productions that can be expanded.
# Boy, too bad tkinter doesn't implement Listbox.itemconfig;
# that would be pretty useful here.
- self._prodlist.delete(0, "end")
+ self._prodlist.delete(0, 'end')
expandable = self._parser.expandable_productions()
untried = self._parser.untried_expandable_productions()
productions = self._productions
for index in range(len(productions)):
if productions[index] in expandable:
if productions[index] in untried:
- self._prodlist.insert(index, " %s" % productions[index])
+ self._prodlist.insert(index, ' %s' % productions[index])
else:
- self._prodlist.insert(index, " %s (TRIED)" % productions[index])
+ self._prodlist.insert(index, ' %s (TRIED)' %
+ productions[index])
self._prodlist.selection_set(index)
else:
- self._prodlist.insert(index, " %s" % productions[index])
+ self._prodlist.insert(index, ' %s' % productions[index])
def _position_text(self):
# Line up the text widgets that are matched against the tree
for i in range(0, len(leaves)):
widget = self._textwidgets[i]
leaf = leaves[i]
- widget["color"] = "#006040"
- leaf["color"] = "#006040"
+ widget['color'] = '#006040'
+ leaf['color'] = '#006040'
widget.move(leaf.bbox()[0] - widget.bbox()[0], 0)
xmax = widget.bbox()[2] + 10
# Line up the text widgets that are not matched against the tree.
for i in range(len(leaves), numwords):
widget = self._textwidgets[i]
- widget["color"] = "#a0a0a0"
+ widget['color'] = '#a0a0a0'
widget.move(xmax - widget.bbox()[0], 0)
xmax = widget.bbox()[2] + 10
# If we have a complete parse, make everything green :)
if self._parser.currently_complete():
for twidget in self._textwidgets:
- twidget["color"] = "#00a000"
+ twidget['color'] = '#00a000'
# Move the matched leaves down to the text.
for i in range(0, len(leaves)):
leaf.move(0, dy)
def _tree_leaves(self, tree=None):
- if tree is None:
- tree = self._tree
+ if tree is None: tree = self._tree
if isinstance(tree, TreeSegmentWidget):
leaves = []
- for child in tree.subtrees():
- leaves += self._tree_leaves(child)
+ for child in tree.subtrees(): leaves += self._tree_leaves(child)
return leaves
else:
return [tree]
def destroy(self, *e):
self._autostep = 0
- if self._top is None:
- return
+ if self._top is None: return
self._top.destroy()
self._top = None
def reset(self, *e):
self._autostep = 0
self._parser.initialize(self._sent)
- self._lastoper1["text"] = "Reset Application"
- self._lastoper2["text"] = ""
+ self._lastoper1['text'] = 'Reset Application'
+ self._lastoper2['text'] = ''
self._redraw()
def autostep(self, *e):
self._step()
def cancel_autostep(self, *e):
- # self._autostep_button['text'] = 'Autostep'
+ #self._autostep_button['text'] = 'Autostep'
self._autostep = 0
# Make sure to stop auto-stepping if we get any user input.
- def step(self, *e):
- self._autostep = 0
- self._step()
-
- def match(self, *e):
- self._autostep = 0
- self._match()
-
- def expand(self, *e):
- self._autostep = 0
- self._expand()
-
- def backtrack(self, *e):
- self._autostep = 0
- self._backtrack()
+ def step(self, *e): self._autostep = 0; self._step()
+ def match(self, *e): self._autostep = 0; self._match()
+ def expand(self, *e): self._autostep = 0; self._expand()
+ def backtrack(self, *e): self._autostep = 0; self._backtrack()
def _step(self):
- if self._animating_lock:
- return
+ if self._animating_lock: return
# Try expanding, matching, and backtracking (in that order)
- if self._expand():
- pass
- elif self._parser.untried_match() and self._match():
- pass
- elif self._backtrack():
- pass
+ if self._expand(): pass
+ elif self._parser.untried_match() and self._match(): pass
+ elif self._backtrack(): pass
else:
- self._lastoper1["text"] = "Finished"
- self._lastoper2["text"] = ""
+ self._lastoper1['text'] = 'Finished'
+ self._lastoper2['text'] = ''
self._autostep = 0
# Check if we just completed a parse.
if self._parser.currently_complete():
self._autostep = 0
- self._lastoper2["text"] += " [COMPLETE PARSE]"
+ self._lastoper2['text'] += ' [COMPLETE PARSE]'
def _expand(self, *e):
- if self._animating_lock:
- return
+ if self._animating_lock: return
old_frontier = self._parser.frontier()
rv = self._parser.expand()
if rv is not None:
- self._lastoper1["text"] = "Expand:"
- self._lastoper2["text"] = rv
- self._prodlist.selection_clear(0, "end")
+ self._lastoper1['text'] = 'Expand:'
+ self._lastoper2['text'] = rv
+ self._prodlist.selection_clear(0, 'end')
index = self._productions.index(rv)
self._prodlist.selection_set(index)
self._animate_expand(old_frontier[0])
return True
else:
- self._lastoper1["text"] = "Expand:"
- self._lastoper2["text"] = "(all expansions tried)"
+ self._lastoper1['text'] = 'Expand:'
+ self._lastoper2['text'] = '(all expansions tried)'
return False
def _match(self, *e):
- if self._animating_lock:
- return
+ if self._animating_lock: return
old_frontier = self._parser.frontier()
rv = self._parser.match()
if rv is not None:
- self._lastoper1["text"] = "Match:"
- self._lastoper2["text"] = rv
+ self._lastoper1['text'] = 'Match:'
+ self._lastoper2['text'] = rv
self._animate_match(old_frontier[0])
return True
else:
- self._lastoper1["text"] = "Match:"
- self._lastoper2["text"] = "(failed)"
+ self._lastoper1['text'] = 'Match:'
+ self._lastoper2['text'] = '(failed)'
return False
def _backtrack(self, *e):
- if self._animating_lock:
- return
+ if self._animating_lock: return
if self._parser.backtrack():
elt = self._parser.tree()
for i in self._parser.frontier()[0]:
elt = elt[i]
- self._lastoper1["text"] = "Backtrack"
- self._lastoper2["text"] = ""
+ self._lastoper1['text'] = 'Backtrack'
+ self._lastoper2['text'] = ''
if isinstance(elt, Tree):
self._animate_backtrack(self._parser.frontier()[0])
else:
return True
else:
self._autostep = 0
- self._lastoper1["text"] = "Finished"
- self._lastoper2["text"] = ""
+ self._lastoper1['text'] = 'Finished'
+ self._lastoper2['text'] = ''
return False
def about(self, *e):
- ABOUT = (
- "NLTK Recursive Descent Parser Application\n" + "Written by Edward Loper"
- )
- TITLE = "About: Recursive Descent Parser Application"
+ ABOUT = ("NLTK Recursive Descent Parser Application\n"+
+ "Written by Edward Loper")
+ TITLE = 'About: Recursive Descent Parser Application'
try:
- from tkinter.messagebox import Message
-
+ from six.moves.tkinter_messagebox import Message
Message(message=ABOUT, title=TITLE).show()
except:
ShowText(self._top, TITLE, ABOUT)
self._autostep = 0
# The default font's not very legible; try using 'fixed' instead.
try:
- ShowText(
- self._top,
- "Help: Recursive Descent Parser Application",
- (__doc__ or "").strip(),
- width=75,
- font="fixed",
- )
+ ShowText(self._top, 'Help: Recursive Descent Parser Application',
+ (__doc__ or '').strip(), width=75, font='fixed')
except:
- ShowText(
- self._top,
- "Help: Recursive Descent Parser Application",
- (__doc__ or "").strip(),
- width=75,
- )
+ ShowText(self._top, 'Help: Recursive Descent Parser Application',
+ (__doc__ or '').strip(), width=75)
def postscript(self, *e):
self._autostep = 0
from a script); otherwise, the demo will close as soon as
the script completes.
"""
- if in_idle():
- return
+ if in_idle(): return
self._top.mainloop(*args, **kwargs)
def resize(self, size=None):
- if size is not None:
- self._size.set(size)
+ if size is not None: self._size.set(size)
size = self._size.get()
self._font.configure(size=-(abs(size)))
self._boldfont.configure(size=-(abs(size)))
self._sysfont.configure(size=-(abs(size)))
- self._bigfont.configure(size=-(abs(size + 2)))
+ self._bigfont.configure(size=-(abs(size+2)))
self._redraw()
#########################################
def _toggle_grammar(self, *e):
if self._show_grammar.get():
- self._prodframe.pack(
- fill="both", side="left", padx=2, after=self._feedbackframe
- )
- self._lastoper1["text"] = "Show Grammar"
+ self._prodframe.pack(fill='both', side='left', padx=2,
+ after=self._feedbackframe)
+ self._lastoper1['text'] = 'Show Grammar'
else:
self._prodframe.pack_forget()
- self._lastoper1["text"] = "Hide Grammar"
- self._lastoper2["text"] = ""
-
- # def toggle_grammar(self, *e):
- # self._show_grammar = not self._show_grammar
- # if self._show_grammar:
- # self._prodframe.pack(fill='both', expand='y', side='left',
- # after=self._feedbackframe)
- # self._lastoper1['text'] = 'Show Grammar'
- # else:
- # self._prodframe.pack_forget()
- # self._lastoper1['text'] = 'Hide Grammar'
- # self._lastoper2['text'] = ''
+ self._lastoper1['text'] = 'Hide Grammar'
+ self._lastoper2['text'] = ''
+
+# def toggle_grammar(self, *e):
+# self._show_grammar = not self._show_grammar
+# if self._show_grammar:
+# self._prodframe.pack(fill='both', expand='y', side='left',
+# after=self._feedbackframe)
+# self._lastoper1['text'] = 'Show Grammar'
+# else:
+# self._prodframe.pack_forget()
+# self._lastoper1['text'] = 'Hide Grammar'
+# self._lastoper2['text'] = ''
def _prodlist_select(self, event):
selection = self._prodlist.curselection()
- if len(selection) != 1:
- return
+ if len(selection) != 1: return
index = int(selection[0])
old_frontier = self._parser.frontier()
production = self._parser.expand(self._productions[index])
if production:
- self._lastoper1["text"] = "Expand:"
- self._lastoper2["text"] = production
- self._prodlist.selection_clear(0, "end")
+ self._lastoper1['text'] = 'Expand:'
+ self._lastoper2['text'] = production
+ self._prodlist.selection_clear(0, 'end')
self._prodlist.selection_set(index)
self._animate_expand(old_frontier[0])
else:
# Reset the production selections.
- self._prodlist.selection_clear(0, "end")
+ self._prodlist.selection_clear(0, 'end')
for prod in self._parser.expandable_productions():
index = self._productions.index(prod)
self._prodlist.selection_set(index)
for i in treeloc:
tree = tree[i]
- widget = tree_to_treesegment(
- self._canvas,
- tree,
- node_font=self._boldfont,
- leaf_color="white",
- tree_width=2,
- tree_color="white",
- node_color="white",
- leaf_font=self._font,
- )
- widget.label()["color"] = "#20a050"
+ widget = tree_to_treesegment(self._canvas, tree,
+ node_font=self._boldfont,
+ leaf_color='white',
+ tree_width=2, tree_color='white',
+ node_color='white',
+ leaf_font=self._font)
+ widget.label()['color'] = '#20a050'
(oldx, oldy) = oldtree.label().bbox()[:2]
(newx, newy) = widget.label().bbox()[:2]
- widget.move(oldx - newx, oldy - newy)
+ widget.move(oldx-newx, oldy-newy)
if top:
self._cframe.add_widget(widget, 0, 5)
- widget.move(30 - widget.label().bbox()[0], 0)
+ widget.move(30-widget.label().bbox()[0], 0)
self._tree = widget
else:
oldtree.parent().replace_child(oldtree, widget)
# Move the children over so they don't overlap.
# Line the children up in a strange way.
if widget.subtrees():
- dx = (
- oldx
- + widget.label().width() / 2
- - widget.subtrees()[0].bbox()[0] / 2
- - widget.subtrees()[0].bbox()[2] / 2
- )
- for subtree in widget.subtrees():
- subtree.move(dx, 0)
+ dx = (oldx + widget.label().width()/2 -
+ widget.subtrees()[0].bbox()[0]/2 -
+ widget.subtrees()[0].bbox()[2]/2)
+ for subtree in widget.subtrees(): subtree.move(dx, 0)
self._makeroom(widget)
else:
oldtree.destroy()
- colors = [
- "gray%d" % (10 * int(10 * x / self._animation_frames.get()))
- for x in range(self._animation_frames.get(), 0, -1)
- ]
+ colors = ['gray%d' % (10*int(10*x/self._animation_frames.get()))
+ for x in range(self._animation_frames.get(),0,-1)]
# Move the text string down, if necessary.
dy = widget.bbox()[3] + 30 - self._canvas.coords(self._textline)[1]
if dy > 0:
- for twidget in self._textwidgets:
- twidget.move(0, dy)
+ for twidget in self._textwidgets: twidget.move(0, dy)
self._canvas.move(self._textline, 0, dy)
self._animate_expand_frame(widget, colors)
Make sure that no sibling tree bboxes overlap.
"""
parent = treeseg.parent()
- if not isinstance(parent, TreeSegmentWidget):
- return
+ if not isinstance(parent, TreeSegmentWidget): return
index = parent.subtrees().index(treeseg)
# Handle siblings to the right
- rsiblings = parent.subtrees()[index + 1 :]
+ rsiblings = parent.subtrees()[index+1:]
if rsiblings:
dx = treeseg.bbox()[2] - rsiblings[0].bbox()[0] + 10
- for sibling in rsiblings:
- sibling.move(dx, 0)
+ for sibling in rsiblings: sibling.move(dx, 0)
# Handle siblings to the left
if index > 0:
- lsibling = parent.subtrees()[index - 1]
+ lsibling = parent.subtrees()[index-1]
dx = max(0, lsibling.bbox()[2] - treeseg.bbox()[0] + 10)
treeseg.move(dx, 0)
def _animate_expand_frame(self, widget, colors):
if len(colors) > 0:
self._animating_lock = 1
- widget["color"] = colors[0]
+ widget['color'] = colors[0]
for subtree in widget.subtrees():
if isinstance(subtree, TreeSegmentWidget):
- subtree.label()["color"] = colors[0]
+ subtree.label()['color'] = colors[0]
else:
- subtree["color"] = colors[0]
- self._top.after(50, self._animate_expand_frame, widget, colors[1:])
+ subtree['color'] = colors[0]
+ self._top.after(50, self._animate_expand_frame,
+ widget, colors[1:])
else:
- widget["color"] = "black"
+ widget['color'] = 'black'
for subtree in widget.subtrees():
if isinstance(subtree, TreeSegmentWidget):
- subtree.label()["color"] = "black"
+ subtree.label()['color'] = 'black'
else:
- subtree["color"] = "black"
+ subtree['color'] = 'black'
self._redraw_quick()
- widget.label()["color"] = "black"
+ widget.label()['color'] = 'black'
self._animating_lock = 0
- if self._autostep:
- self._step()
+ if self._autostep: self._step()
def _animate_backtrack(self, treeloc):
# Flash red first, if we're animating.
- if self._animation_frames.get() == 0:
- colors = []
- else:
- colors = ["#a00000", "#000000", "#a00000"]
- colors += [
- "gray%d" % (10 * int(10 * x / (self._animation_frames.get())))
- for x in range(1, self._animation_frames.get() + 1)
- ]
+ if self._animation_frames.get() == 0: colors = []
+ else: colors = ['#a00000', '#000000', '#a00000']
+ colors += ['gray%d' % (10*int(10*x/(self._animation_frames.get())))
+ for x in range(1, self._animation_frames.get()+1)]
widgets = [self._get(self._tree, treeloc).parent()]
for subtree in widgets[0].subtrees():
def _animate_backtrack_frame(self, widgets, colors):
if len(colors) > 0:
self._animating_lock = 1
- for widget in widgets:
- widget["color"] = colors[0]
- self._top.after(50, self._animate_backtrack_frame, widgets, colors[1:])
+ for widget in widgets: widget['color'] = colors[0]
+ self._top.after(50, self._animate_backtrack_frame,
+ widgets, colors[1:])
else:
for widget in widgets[0].subtrees():
widgets[0].remove_child(widget)
widget.destroy()
self._redraw_quick()
self._animating_lock = 0
- if self._autostep:
- self._step()
+ if self._autostep: self._step()
def _animate_match_backtrack(self, treeloc):
widget = self._get(self._tree, treeloc)
node = widget.parent().label()
- dy = (node.bbox()[3] - widget.bbox()[1] + 14) / max(
- 1, self._animation_frames.get()
- )
- self._animate_match_backtrack_frame(self._animation_frames.get(), widget, dy)
+ dy = ((node.bbox()[3] - widget.bbox()[1] + 14) /
+ max(1, self._animation_frames.get()))
+ self._animate_match_backtrack_frame(self._animation_frames.get(),
+ widget, dy)
def _animate_match(self, treeloc):
widget = self._get(self._tree, treeloc)
- dy = (self._textwidgets[0].bbox()[1] - widget.bbox()[3] - 10.0) / max(
- 1, self._animation_frames.get()
- )
+ dy = ((self._textwidgets[0].bbox()[1] - widget.bbox()[3] - 10.0) /
+ max(1, self._animation_frames.get()))
self._animate_match_frame(self._animation_frames.get(), widget, dy)
def _animate_match_frame(self, frame, widget, dy):
if frame > 0:
self._animating_lock = 1
widget.move(0, dy)
- self._top.after(10, self._animate_match_frame, frame - 1, widget, dy)
+ self._top.after(10, self._animate_match_frame,
+ frame-1, widget, dy)
else:
- widget["color"] = "#006040"
+ widget['color'] = '#006040'
self._redraw_quick()
self._animating_lock = 0
- if self._autostep:
- self._step()
+ if self._autostep: self._step()
def _animate_match_backtrack_frame(self, frame, widget, dy):
if frame > 0:
self._animating_lock = 1
widget.move(0, dy)
- self._top.after(
- 10, self._animate_match_backtrack_frame, frame - 1, widget, dy
- )
+ self._top.after(10, self._animate_match_backtrack_frame,
+ frame-1, widget, dy)
else:
widget.parent().remove_child(widget)
widget.destroy()
self._animating_lock = 0
- if self._autostep:
- self._step()
+ if self._autostep: self._step()
def edit_grammar(self, *e):
CFGEditor(self._top, self._parser.grammar(), self.set_grammar)
def set_grammar(self, grammar):
self._parser.set_grammar(grammar)
self._productions = list(grammar.productions())
- self._prodlist.delete(0, "end")
+ self._prodlist.delete(0, 'end')
for production in self._productions:
- self._prodlist.insert("end", (" %s" % production))
+ self._prodlist.insert('end', (' %s' % production))
def edit_sentence(self, *e):
sentence = " ".join(self._sent)
- title = "Edit Text"
- instr = "Enter a new sentence to parse."
+ title = 'Edit Text'
+ instr = 'Enter a new sentence to parse.'
EntryDialog(self._top, sentence, instr, self.set_sentence, title)
def set_sentence(self, sentence):
- self._sent = sentence.split() # [XX] use tagged?
+ self._sent = sentence.split() #[XX] use tagged?
self.reset()
-
def app():
"""
Create a recursive descent parser demo, using a simple grammar and
text.
"""
from nltk.grammar import CFG
-
- grammar = CFG.fromstring(
- """
+ grammar = CFG.fromstring("""
# Grammatical productions.
S -> NP VP
NP -> Det N PP | Det N
N -> 'man' | 'park' | 'dog' | 'telescope'
V -> 'ate' | 'saw'
P -> 'in' | 'under' | 'with'
- """
- )
+ """)
- sent = "the dog saw a man in the park".split()
+ sent = 'the dog saw a man in the park'.split()
RecursiveDescentApp(grammar, sent).mainloop()
-
-if __name__ == "__main__":
+if __name__ == '__main__':
app()
-__all__ = ["app"]
+__all__ = ['app']
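For comparison outside the GUI, nltk.parse.RecursiveDescentParser runs the same expand/match/backtrack strategy non-interactively. A minimal sketch, assuming a small self-contained toy grammar (the demo's own grammar is only partially reproduced above, so this one is an assumption):

    from nltk.grammar import CFG
    from nltk.parse import RecursiveDescentParser

    # Illustrative toy grammar; not the demo's exact grammar.
    toy = CFG.fromstring("""
        S -> NP VP
        NP -> Det N | Det N PP
        VP -> V NP | V NP PP
        PP -> P NP
        Det -> 'the' | 'a'
        N -> 'man' | 'park' | 'dog' | 'telescope'
        V -> 'ate' | 'saw'
        P -> 'in' | 'under' | 'with'
    """)
    parser = RecursiveDescentParser(toy)
    for tree in parser.parse('the dog saw a man in the park'.split()):
        print(tree)  # one line per complete parse reached by expand/match/backtrack

Each printed tree corresponds to a [COMPLETE PARSE] state that the stepping demo would eventually reach.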
# Natural Language Toolkit: Shift-Reduce Parser Application
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
-from tkinter.font import Font
-from tkinter import IntVar, Listbox, Button, Frame, Label, Menu, Scrollbar, Tk
+from six.moves.tkinter_font import Font
+from six.moves.tkinter import (IntVar, Listbox, Button, Frame, Label, Menu,
+ Scrollbar, Tk)
from nltk.tree import Tree
from nltk.parse import SteppingShiftReduceParser
responsible for that.
"""
-
class ShiftReduceApp(object):
"""
A graphical tool for exploring the shift-reduce parser. The tool
the parsing process, performing the operations that
``nltk.parse.ShiftReduceParser`` would use.
"""
-
def __init__(self, grammar, sent, trace=0):
self._sent = sent
self._parser = SteppingShiftReduceParser(grammar, trace)
# Set up the main window.
self._top = Tk()
- self._top.title("Shift Reduce Parser Application")
+ self._top.title('Shift Reduce Parser Application')
# Animations. animating_lock is a lock to prevent the demo
# from performing new operations while it's animating.
self._animating_lock = 0
self._animate = IntVar(self._top)
- self._animate.set(10) # = medium
+ self._animate.set(10) # = medium
# The user can hide the grammar.
self._show_grammar = IntVar(self._top)
# Reset the demo, and set the feedback frame to empty.
self.reset()
- self._lastoper1["text"] = ""
+ self._lastoper1['text'] = ''
#########################################
## Initialization Helpers
# What's our font size (default=same as sysfont)
self._size = IntVar(root)
- self._size.set(self._sysfont.cget("size"))
+ self._size.set(self._sysfont.cget('size'))
- self._boldfont = Font(family="helvetica", weight="bold", size=self._size.get())
- self._font = Font(family="helvetica", size=self._size.get())
+ self._boldfont = Font(family='helvetica', weight='bold',
+ size=self._size.get())
+ self._font = Font(family='helvetica',
+ size=self._size.get())
def _init_grammar(self, parent):
# Grammar view.
self._prodframe = listframe = Frame(parent)
- self._prodframe.pack(fill="both", side="left", padx=2)
- self._prodlist_label = Label(
- self._prodframe, font=self._boldfont, text="Available Reductions"
- )
+ self._prodframe.pack(fill='both', side='left', padx=2)
+ self._prodlist_label = Label(self._prodframe,
+ font=self._boldfont,
+ text='Available Reductions')
self._prodlist_label.pack()
- self._prodlist = Listbox(
- self._prodframe,
- selectmode="single",
- relief="groove",
- background="white",
- foreground="#909090",
- font=self._font,
- selectforeground="#004040",
- selectbackground="#c0f0c0",
- )
+ self._prodlist = Listbox(self._prodframe, selectmode='single',
+ relief='groove', background='white',
+ foreground='#909090',
+ font=self._font,
+ selectforeground='#004040',
+ selectbackground='#c0f0c0')
- self._prodlist.pack(side="right", fill="both", expand=1)
+ self._prodlist.pack(side='right', fill='both', expand=1)
self._productions = list(self._parser.grammar().productions())
for production in self._productions:
- self._prodlist.insert("end", (" %s" % production))
+ self._prodlist.insert('end', (' %s' % production))
self._prodlist.config(height=min(len(self._productions), 25))
# Add a scrollbar if there are more than 25 productions.
- if 1: # len(self._productions) > 25:
- listscroll = Scrollbar(self._prodframe, orient="vertical")
- self._prodlist.config(yscrollcommand=listscroll.set)
+ if 1:#len(self._productions) > 25:
+ listscroll = Scrollbar(self._prodframe,
+ orient='vertical')
+ self._prodlist.config(yscrollcommand = listscroll.set)
listscroll.config(command=self._prodlist.yview)
- listscroll.pack(side="left", fill="y")
+ listscroll.pack(side='left', fill='y')
# If they select a production, apply it.
- self._prodlist.bind("<<ListboxSelect>>", self._prodlist_select)
+ self._prodlist.bind('<<ListboxSelect>>', self._prodlist_select)
# When they hover over a production, highlight it.
self._hover = -1
- self._prodlist.bind("<Motion>", self._highlight_hover)
- self._prodlist.bind("<Leave>", self._clear_hover)
+ self._prodlist.bind('<Motion>', self._highlight_hover)
+ self._prodlist.bind('<Leave>', self._clear_hover)
def _init_bindings(self):
# Quit
- self._top.bind("<Control-q>", self.destroy)
- self._top.bind("<Control-x>", self.destroy)
- self._top.bind("<Alt-q>", self.destroy)
- self._top.bind("<Alt-x>", self.destroy)
+ self._top.bind('<Control-q>', self.destroy)
+ self._top.bind('<Control-x>', self.destroy)
+ self._top.bind('<Alt-q>', self.destroy)
+ self._top.bind('<Alt-x>', self.destroy)
# Ops (step, shift, reduce, undo)
- self._top.bind("<space>", self.step)
- self._top.bind("<s>", self.shift)
- self._top.bind("<Alt-s>", self.shift)
- self._top.bind("<Control-s>", self.shift)
- self._top.bind("<r>", self.reduce)
- self._top.bind("<Alt-r>", self.reduce)
- self._top.bind("<Control-r>", self.reduce)
- self._top.bind("<Delete>", self.reset)
- self._top.bind("<u>", self.undo)
- self._top.bind("<Alt-u>", self.undo)
- self._top.bind("<Control-u>", self.undo)
- self._top.bind("<Control-z>", self.undo)
- self._top.bind("<BackSpace>", self.undo)
+ self._top.bind('<space>', self.step)
+ self._top.bind('<s>', self.shift)
+ self._top.bind('<Alt-s>', self.shift)
+ self._top.bind('<Control-s>', self.shift)
+ self._top.bind('<r>', self.reduce)
+ self._top.bind('<Alt-r>', self.reduce)
+ self._top.bind('<Control-r>', self.reduce)
+ self._top.bind('<Delete>', self.reset)
+ self._top.bind('<u>', self.undo)
+ self._top.bind('<Alt-u>', self.undo)
+ self._top.bind('<Control-u>', self.undo)
+ self._top.bind('<Control-z>', self.undo)
+ self._top.bind('<BackSpace>', self.undo)
# Misc
- self._top.bind("<Control-p>", self.postscript)
- self._top.bind("<Control-h>", self.help)
- self._top.bind("<F1>", self.help)
- self._top.bind("<Control-g>", self.edit_grammar)
- self._top.bind("<Control-t>", self.edit_sentence)
+ self._top.bind('<Control-p>', self.postscript)
+ self._top.bind('<Control-h>', self.help)
+ self._top.bind('<F1>', self.help)
+ self._top.bind('<Control-g>', self.edit_grammar)
+ self._top.bind('<Control-t>', self.edit_sentence)
# Animation speed control
- self._top.bind("-", lambda e, a=self._animate: a.set(20))
- self._top.bind("=", lambda e, a=self._animate: a.set(10))
- self._top.bind("+", lambda e, a=self._animate: a.set(4))
+ self._top.bind('-', lambda e,a=self._animate:a.set(20))
+ self._top.bind('=', lambda e,a=self._animate:a.set(10))
+ self._top.bind('+', lambda e,a=self._animate:a.set(4))
def _init_buttons(self, parent):
# Set up the frames.
self._buttonframe = buttonframe = Frame(parent)
- buttonframe.pack(fill="none", side="bottom")
- Button(
- buttonframe,
- text="Step",
- background="#90c0d0",
- foreground="black",
- command=self.step,
- ).pack(side="left")
- Button(
- buttonframe,
- text="Shift",
- underline=0,
- background="#90f090",
- foreground="black",
- command=self.shift,
- ).pack(side="left")
- Button(
- buttonframe,
- text="Reduce",
- underline=0,
- background="#90f090",
- foreground="black",
- command=self.reduce,
- ).pack(side="left")
- Button(
- buttonframe,
- text="Undo",
- underline=0,
- background="#f0a0a0",
- foreground="black",
- command=self.undo,
- ).pack(side="left")
+ buttonframe.pack(fill='none', side='bottom')
+ Button(buttonframe, text='Step',
+ background='#90c0d0', foreground='black',
+ command=self.step,).pack(side='left')
+ Button(buttonframe, text='Shift', underline=0,
+ background='#90f090', foreground='black',
+ command=self.shift).pack(side='left')
+ Button(buttonframe, text='Reduce', underline=0,
+ background='#90f090', foreground='black',
+ command=self.reduce).pack(side='left')
+ Button(buttonframe, text='Undo', underline=0,
+ background='#f0a0a0', foreground='black',
+ command=self.undo).pack(side='left')
def _init_menubar(self, parent):
menubar = Menu(parent)
filemenu = Menu(menubar, tearoff=0)
- filemenu.add_command(
- label="Reset Parser", underline=0, command=self.reset, accelerator="Del"
- )
- filemenu.add_command(
- label="Print to Postscript",
- underline=0,
- command=self.postscript,
- accelerator="Ctrl-p",
- )
- filemenu.add_command(
- label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x"
- )
- menubar.add_cascade(label="File", underline=0, menu=filemenu)
+ filemenu.add_command(label='Reset Parser', underline=0,
+ command=self.reset, accelerator='Del')
+ filemenu.add_command(label='Print to Postscript', underline=0,
+ command=self.postscript, accelerator='Ctrl-p')
+ filemenu.add_command(label='Exit', underline=1,
+ command=self.destroy, accelerator='Ctrl-x')
+ menubar.add_cascade(label='File', underline=0, menu=filemenu)
editmenu = Menu(menubar, tearoff=0)
- editmenu.add_command(
- label="Edit Grammar",
- underline=5,
- command=self.edit_grammar,
- accelerator="Ctrl-g",
- )
- editmenu.add_command(
- label="Edit Text",
- underline=5,
- command=self.edit_sentence,
- accelerator="Ctrl-t",
- )
- menubar.add_cascade(label="Edit", underline=0, menu=editmenu)
+ editmenu.add_command(label='Edit Grammar', underline=5,
+ command=self.edit_grammar,
+ accelerator='Ctrl-g')
+ editmenu.add_command(label='Edit Text', underline=5,
+ command=self.edit_sentence,
+ accelerator='Ctrl-t')
+ menubar.add_cascade(label='Edit', underline=0, menu=editmenu)
rulemenu = Menu(menubar, tearoff=0)
- rulemenu.add_command(
- label="Step", underline=1, command=self.step, accelerator="Space"
- )
+ rulemenu.add_command(label='Step', underline=1,
+ command=self.step, accelerator='Space')
rulemenu.add_separator()
- rulemenu.add_command(
- label="Shift", underline=0, command=self.shift, accelerator="Ctrl-s"
- )
- rulemenu.add_command(
- label="Reduce", underline=0, command=self.reduce, accelerator="Ctrl-r"
- )
+ rulemenu.add_command(label='Shift', underline=0,
+ command=self.shift, accelerator='Ctrl-s')
+ rulemenu.add_command(label='Reduce', underline=0,
+ command=self.reduce, accelerator='Ctrl-r')
rulemenu.add_separator()
- rulemenu.add_command(
- label="Undo", underline=0, command=self.undo, accelerator="Ctrl-u"
- )
- menubar.add_cascade(label="Apply", underline=0, menu=rulemenu)
+ rulemenu.add_command(label='Undo', underline=0,
+ command=self.undo, accelerator='Ctrl-u')
+ menubar.add_cascade(label='Apply', underline=0, menu=rulemenu)
viewmenu = Menu(menubar, tearoff=0)
- viewmenu.add_checkbutton(
- label="Show Grammar",
- underline=0,
- variable=self._show_grammar,
- command=self._toggle_grammar,
- )
+ viewmenu.add_checkbutton(label="Show Grammar", underline=0,
+ variable=self._show_grammar,
+ command=self._toggle_grammar)
viewmenu.add_separator()
- viewmenu.add_radiobutton(
- label="Tiny",
- variable=self._size,
- underline=0,
- value=10,
- command=self.resize,
- )
- viewmenu.add_radiobutton(
- label="Small",
- variable=self._size,
- underline=0,
- value=12,
- command=self.resize,
- )
- viewmenu.add_radiobutton(
- label="Medium",
- variable=self._size,
- underline=0,
- value=14,
- command=self.resize,
- )
- viewmenu.add_radiobutton(
- label="Large",
- variable=self._size,
- underline=0,
- value=18,
- command=self.resize,
- )
- viewmenu.add_radiobutton(
- label="Huge",
- variable=self._size,
- underline=0,
- value=24,
- command=self.resize,
- )
- menubar.add_cascade(label="View", underline=0, menu=viewmenu)
+ viewmenu.add_radiobutton(label='Tiny', variable=self._size,
+ underline=0, value=10, command=self.resize)
+ viewmenu.add_radiobutton(label='Small', variable=self._size,
+ underline=0, value=12, command=self.resize)
+ viewmenu.add_radiobutton(label='Medium', variable=self._size,
+ underline=0, value=14, command=self.resize)
+ viewmenu.add_radiobutton(label='Large', variable=self._size,
+ underline=0, value=18, command=self.resize)
+ viewmenu.add_radiobutton(label='Huge', variable=self._size,
+ underline=0, value=24, command=self.resize)
+ menubar.add_cascade(label='View', underline=0, menu=viewmenu)
animatemenu = Menu(menubar, tearoff=0)
- animatemenu.add_radiobutton(
- label="No Animation", underline=0, variable=self._animate, value=0
- )
- animatemenu.add_radiobutton(
- label="Slow Animation",
- underline=0,
- variable=self._animate,
- value=20,
- accelerator="-",
- )
- animatemenu.add_radiobutton(
- label="Normal Animation",
- underline=0,
- variable=self._animate,
- value=10,
- accelerator="=",
- )
- animatemenu.add_radiobutton(
- label="Fast Animation",
- underline=0,
- variable=self._animate,
- value=4,
- accelerator="+",
- )
+ animatemenu.add_radiobutton(label="No Animation", underline=0,
+ variable=self._animate, value=0)
+ animatemenu.add_radiobutton(label="Slow Animation", underline=0,
+ variable=self._animate, value=20,
+ accelerator='-')
+ animatemenu.add_radiobutton(label="Normal Animation", underline=0,
+ variable=self._animate, value=10,
+ accelerator='=')
+ animatemenu.add_radiobutton(label="Fast Animation", underline=0,
+ variable=self._animate, value=4,
+ accelerator='+')
menubar.add_cascade(label="Animate", underline=1, menu=animatemenu)
+
helpmenu = Menu(menubar, tearoff=0)
- helpmenu.add_command(label="About", underline=0, command=self.about)
- helpmenu.add_command(
- label="Instructions", underline=0, command=self.help, accelerator="F1"
- )
- menubar.add_cascade(label="Help", underline=0, menu=helpmenu)
+ helpmenu.add_command(label='About', underline=0,
+ command=self.about)
+ helpmenu.add_command(label='Instructions', underline=0,
+ command=self.help, accelerator='F1')
+ menubar.add_cascade(label='Help', underline=0, menu=helpmenu)
parent.config(menu=menubar)
def _init_feedback(self, parent):
self._feedbackframe = feedbackframe = Frame(parent)
- feedbackframe.pack(fill="x", side="bottom", padx=3, pady=3)
- self._lastoper_label = Label(
- feedbackframe, text="Last Operation:", font=self._font
- )
- self._lastoper_label.pack(side="left")
- lastoperframe = Frame(feedbackframe, relief="sunken", border=1)
- lastoperframe.pack(fill="x", side="right", expand=1, padx=5)
- self._lastoper1 = Label(
- lastoperframe, foreground="#007070", background="#f0f0f0", font=self._font
- )
- self._lastoper2 = Label(
- lastoperframe,
- anchor="w",
- width=30,
- foreground="#004040",
- background="#f0f0f0",
- font=self._font,
- )
- self._lastoper1.pack(side="left")
- self._lastoper2.pack(side="left", fill="x", expand=1)
+ feedbackframe.pack(fill='x', side='bottom', padx=3, pady=3)
+ self._lastoper_label = Label(feedbackframe, text='Last Operation:',
+ font=self._font)
+ self._lastoper_label.pack(side='left')
+ lastoperframe = Frame(feedbackframe, relief='sunken', border=1)
+ lastoperframe.pack(fill='x', side='right', expand=1, padx=5)
+ self._lastoper1 = Label(lastoperframe, foreground='#007070',
+ background='#f0f0f0', font=self._font)
+ self._lastoper2 = Label(lastoperframe, anchor='w', width=30,
+ foreground='#004040', background='#f0f0f0',
+ font=self._font)
+ self._lastoper1.pack(side='left')
+ self._lastoper2.pack(side='left', fill='x', expand=1)
def _init_canvas(self, parent):
- self._cframe = CanvasFrame(
- parent,
- background="white",
- width=525,
- closeenough=10,
- border=2,
- relief="sunken",
- )
- self._cframe.pack(expand=1, fill="both", side="top", pady=2)
+ self._cframe = CanvasFrame(parent, background='white',
+ width=525, closeenough=10,
+ border=2, relief='sunken')
+ self._cframe.pack(expand=1, fill='both', side='top', pady=2)
canvas = self._canvas = self._cframe.canvas()
self._stackwidgets = []
self._rtextwidgets = []
- self._titlebar = canvas.create_rectangle(
- 0, 0, 0, 0, fill="#c0f0f0", outline="black"
- )
- self._exprline = canvas.create_line(0, 0, 0, 0, dash=".")
- self._stacktop = canvas.create_line(0, 0, 0, 0, fill="#408080")
- size = self._size.get() + 4
- self._stacklabel = TextWidget(
- canvas, "Stack", color="#004040", font=self._boldfont
- )
- self._rtextlabel = TextWidget(
- canvas, "Remaining Text", color="#004040", font=self._boldfont
- )
+ self._titlebar = canvas.create_rectangle(0,0,0,0, fill='#c0f0f0',
+ outline='black')
+ self._exprline = canvas.create_line(0,0,0,0, dash='.')
+ self._stacktop = canvas.create_line(0,0,0,0, fill='#408080')
+ size = self._size.get()+4
+ self._stacklabel = TextWidget(canvas, 'Stack', color='#004040',
+ font=self._boldfont)
+ self._rtextlabel = TextWidget(canvas, 'Remaining Text',
+ color='#004040', font=self._boldfont)
self._cframe.add_widget(self._stacklabel)
self._cframe.add_widget(self._rtextlabel)
#########################################
def _redraw(self):
- scrollregion = self._canvas["scrollregion"].split()
+ scrollregion = self._canvas['scrollregion'].split()
(cx1, cy1, cx2, cy2) = [int(c) for c in scrollregion]
# Delete the old stack & rtext widgets.
# Position the titlebar & exprline
(x1, y1, x2, y2) = self._stacklabel.bbox()
- y = y2 - y1 + 10
- self._canvas.coords(self._titlebar, -5000, 0, 5000, y - 4)
- self._canvas.coords(self._exprline, 0, y * 2 - 10, 5000, y * 2 - 10)
+ y = y2-y1+10
+ self._canvas.coords(self._titlebar, -5000, 0, 5000, y-4)
+ self._canvas.coords(self._exprline, 0, y*2-10, 5000, y*2-10)
# Position the titlebar labels..
(x1, y1, x2, y2) = self._stacklabel.bbox()
- self._stacklabel.move(5 - x1, 3 - y1)
+ self._stacklabel.move(5-x1, 3-y1)
(x1, y1, x2, y2) = self._rtextlabel.bbox()
- self._rtextlabel.move(cx2 - x2 - 5, 3 - y1)
+ self._rtextlabel.move(cx2-x2-5, 3-y1)
# Draw the stack.
stackx = 5
for tok in self._parser.stack():
if isinstance(tok, Tree):
- attribs = {
- "tree_color": "#4080a0",
- "tree_width": 2,
- "node_font": self._boldfont,
- "node_color": "#006060",
- "leaf_color": "#006060",
- "leaf_font": self._font,
- }
- widget = tree_to_treesegment(self._canvas, tok, **attribs)
- widget.label()["color"] = "#000000"
+ attribs = {'tree_color': '#4080a0', 'tree_width': 2,
+ 'node_font': self._boldfont,
+ 'node_color': '#006060',
+ 'leaf_color': '#006060', 'leaf_font':self._font}
+ widget = tree_to_treesegment(self._canvas, tok,
+ **attribs)
+ widget.label()['color'] = '#000000'
else:
- widget = TextWidget(self._canvas, tok, color="#000000", font=self._font)
+ widget = TextWidget(self._canvas, tok,
+ color='#000000', font=self._font)
widget.bind_click(self._popup_reduce)
self._stackwidgets.append(widget)
self._cframe.add_widget(widget, stackx, y)
# Draw the remaining text.
rtextwidth = 0
for tok in self._parser.remaining_text():
- widget = TextWidget(self._canvas, tok, color="#000000", font=self._font)
+ widget = TextWidget(self._canvas, tok,
+ color='#000000', font=self._font)
self._rtextwidgets.append(widget)
self._cframe.add_widget(widget, rtextwidth, y)
rtextwidth = widget.bbox()[2] + 4
# Move the remaining text to the correct location (keep it
# right-justified, when possible); and move the remaining text
# label, if necessary.
- stackx = max(stackx, self._stacklabel.width() + 25)
- rlabelwidth = self._rtextlabel.width() + 10
- if stackx >= cx2 - max(rtextwidth, rlabelwidth):
+ stackx = max(stackx, self._stacklabel.width()+25)
+ rlabelwidth = self._rtextlabel.width()+10
+ if stackx >= cx2-max(rtextwidth, rlabelwidth):
cx2 = stackx + max(rtextwidth, rlabelwidth)
for rtextwidget in self._rtextwidgets:
- rtextwidget.move(4 + cx2 - rtextwidth, 0)
- self._rtextlabel.move(cx2 - self._rtextlabel.bbox()[2] - 5, 0)
+ rtextwidget.move(4+cx2-rtextwidth, 0)
+ self._rtextlabel.move(cx2-self._rtextlabel.bbox()[2]-5, 0)
- midx = (stackx + cx2 - max(rtextwidth, rlabelwidth)) / 2
+ midx = (stackx + cx2-max(rtextwidth, rlabelwidth))/2
self._canvas.coords(self._stacktop, midx, 0, midx, 5000)
(x1, y1, x2, y2) = self._stacklabel.bbox()
# Set up binding to allow them to shift a token by dragging it.
if len(self._rtextwidgets) > 0:
-
def drag_shift(widget, midx=midx, self=self):
- if widget.bbox()[0] < midx:
- self.shift()
- else:
- self._redraw()
-
+ if widget.bbox()[0] < midx: self.shift()
+ else: self._redraw()
self._rtextwidgets[0].bind_drag(drag_shift)
self._rtextwidgets[0].bind_click(self.shift)
def _draw_stack_top(self, widget):
# hack..
- midx = widget.bbox()[2] + 50
+ midx = widget.bbox()[2]+50
self._canvas.coords(self._stacktop, midx, 0, midx, 5000)
def _highlight_productions(self):
# Highlight the productions that can be reduced.
- self._prodlist.selection_clear(0, "end")
+ self._prodlist.selection_clear(0, 'end')
for prod in self._parser.reducible_productions():
index = self._productions.index(prod)
self._prodlist.selection_set(index)
#########################################
def destroy(self, *e):
- if self._top is None:
- return
+ if self._top is None: return
self._top.destroy()
self._top = None
def reset(self, *e):
self._parser.initialize(self._sent)
- self._lastoper1["text"] = "Reset App"
- self._lastoper2["text"] = ""
+ self._lastoper1['text'] = 'Reset App'
+ self._lastoper2['text'] = ''
self._redraw()
def step(self, *e):
- if self.reduce():
- return True
- elif self.shift():
- return True
+ if self.reduce(): return True
+ elif self.shift(): return True
else:
if list(self._parser.parses()):
- self._lastoper1["text"] = "Finished:"
- self._lastoper2["text"] = "Success"
+ self._lastoper1['text'] = 'Finished:'
+ self._lastoper2['text'] = 'Success'
else:
- self._lastoper1["text"] = "Finished:"
- self._lastoper2["text"] = "Failure"
+ self._lastoper1['text'] = 'Finished:'
+ self._lastoper2['text'] = 'Failure'
def shift(self, *e):
- if self._animating_lock:
- return
+ if self._animating_lock: return
if self._parser.shift():
tok = self._parser.stack()[-1]
- self._lastoper1["text"] = "Shift:"
- self._lastoper2["text"] = "%r" % tok
+ self._lastoper1['text'] = 'Shift:'
+ self._lastoper2['text'] = '%r' % tok
if self._animate.get():
self._animate_shift()
else:
return False
def reduce(self, *e):
- if self._animating_lock:
- return
+ if self._animating_lock: return
production = self._parser.reduce()
if production:
- self._lastoper1["text"] = "Reduce:"
- self._lastoper2["text"] = "%s" % production
+ self._lastoper1['text'] = 'Reduce:'
+ self._lastoper2['text'] = '%s' % production
if self._animate.get():
self._animate_reduce()
else:
return production
def undo(self, *e):
- if self._animating_lock:
- return
+ if self._animating_lock: return
if self._parser.undo():
self._redraw()
from a script); otherwise, the demo will close as soon as
the script completes.
"""
- if in_idle():
- return
+ if in_idle(): return
self._top.mainloop(*args, **kwargs)
#########################################
#########################################
def resize(self, size=None):
- if size is not None:
- self._size.set(size)
+ if size is not None: self._size.set(size)
size = self._size.get()
self._font.configure(size=-(abs(size)))
self._boldfont.configure(size=-(abs(size)))
self._sysfont.configure(size=-(abs(size)))
- # self._stacklabel['font'] = ('helvetica', -size-4, 'bold')
- # self._rtextlabel['font'] = ('helvetica', -size-4, 'bold')
- # self._lastoper_label['font'] = ('helvetica', -size)
- # self._lastoper1['font'] = ('helvetica', -size)
- # self._lastoper2['font'] = ('helvetica', -size)
- # self._prodlist['font'] = ('helvetica', -size)
- # self._prodlist_label['font'] = ('helvetica', -size-2, 'bold')
+ #self._stacklabel['font'] = ('helvetica', -size-4, 'bold')
+ #self._rtextlabel['font'] = ('helvetica', -size-4, 'bold')
+ #self._lastoper_label['font'] = ('helvetica', -size)
+ #self._lastoper1['font'] = ('helvetica', -size)
+ #self._lastoper2['font'] = ('helvetica', -size)
+ #self._prodlist['font'] = ('helvetica', -size)
+ #self._prodlist_label['font'] = ('helvetica', -size-2, 'bold')
self._redraw()
def help(self, *e):
# The default font's not very legible; try using 'fixed' instead.
try:
- ShowText(
- self._top,
- "Help: Shift-Reduce Parser Application",
- (__doc__ or "").strip(),
- width=75,
- font="fixed",
- )
+ ShowText(self._top, 'Help: Shift-Reduce Parser Application',
+ (__doc__ or '').strip(), width=75, font='fixed')
except:
- ShowText(
- self._top,
- "Help: Shift-Reduce Parser Application",
- (__doc__ or "").strip(),
- width=75,
- )
+ ShowText(self._top, 'Help: Shift-Reduce Parser Application',
+ (__doc__ or '').strip(), width=75)
def about(self, *e):
- ABOUT = "NLTK Shift-Reduce Parser Application\n" + "Written by Edward Loper"
- TITLE = "About: Shift-Reduce Parser Application"
+ ABOUT = ("NLTK Shift-Reduce Parser Application\n"+
+ "Written by Edward Loper")
+ TITLE = 'About: Shift-Reduce Parser Application'
try:
- from tkinter.messagebox import Message
-
+ from six.moves.tkinter_messagebox import Message
Message(message=ABOUT, title=TITLE).show()
except:
ShowText(self._top, TITLE, ABOUT)
def set_grammar(self, grammar):
self._parser.set_grammar(grammar)
self._productions = list(grammar.productions())
- self._prodlist.delete(0, "end")
+ self._prodlist.delete(0, 'end')
for production in self._productions:
- self._prodlist.insert("end", (" %s" % production))
+ self._prodlist.insert('end', (' %s' % production))
def edit_sentence(self, *e):
sentence = " ".join(self._sent)
- title = "Edit Text"
- instr = "Enter a new sentence to parse."
+ title = 'Edit Text'
+ instr = 'Enter a new sentence to parse.'
EntryDialog(self._top, sentence, instr, self.set_sentence, title)
def set_sentence(self, sent):
- self._sent = sent.split() # [XX] use tagged?
+ self._sent = sent.split() #[XX] use tagged?
self.reset()
#########################################
def _toggle_grammar(self, *e):
if self._show_grammar.get():
- self._prodframe.pack(
- fill="both", side="left", padx=2, after=self._feedbackframe
- )
- self._lastoper1["text"] = "Show Grammar"
+ self._prodframe.pack(fill='both', side='left', padx=2,
+ after=self._feedbackframe)
+ self._lastoper1['text'] = 'Show Grammar'
else:
self._prodframe.pack_forget()
- self._lastoper1["text"] = "Hide Grammar"
- self._lastoper2["text"] = ""
+ self._lastoper1['text'] = 'Hide Grammar'
+ self._lastoper2['text'] = ''
def _prodlist_select(self, event):
selection = self._prodlist.curselection()
- if len(selection) != 1:
- return
+ if len(selection) != 1: return
index = int(selection[0])
production = self._parser.reduce(self._productions[index])
if production:
- self._lastoper1["text"] = "Reduce:"
- self._lastoper2["text"] = "%s" % production
+ self._lastoper1['text'] = 'Reduce:'
+ self._lastoper2['text'] = '%s' % production
if self._animate.get():
self._animate_reduce()
else:
self._redraw()
else:
# Reset the production selections.
- self._prodlist.selection_clear(0, "end")
+ self._prodlist.selection_clear(0, 'end')
for prod in self._parser.reducible_productions():
index = self._productions.index(prod)
self._prodlist.selection_set(index)
def _popup_reduce(self, widget):
# Remove old commands.
productions = self._parser.reducible_productions()
- if len(productions) == 0:
- return
+ if len(productions) == 0: return
- self._reduce_menu.delete(0, "end")
+ self._reduce_menu.delete(0, 'end')
for production in productions:
- self._reduce_menu.add_command(label=str(production), command=self.reduce)
- self._reduce_menu.post(
- self._canvas.winfo_pointerx(), self._canvas.winfo_pointery()
- )
+ self._reduce_menu.add_command(label=str(production),
+ command=self.reduce)
+ self._reduce_menu.post(self._canvas.winfo_pointerx(),
+ self._canvas.winfo_pointery())
#########################################
## Animations
# Where are we shifting from & to?
right = widget.bbox()[0]
- if len(self._stackwidgets) == 0:
- left = 5
- else:
- left = self._stackwidgets[-1].bbox()[2] + 10
+ if len(self._stackwidgets) == 0: left = 5
+ else: left = self._stackwidgets[-1].bbox()[2]+10
# Start animating.
dt = self._animate.get()
- dx = (left - right) * 1.0 / dt
+ dx = (left-right)*1.0/dt
self._animate_shift_frame(dt, widget, dx)
def _animate_shift_frame(self, frame, widget, dx):
if frame > 0:
self._animating_lock = 1
widget.move(dx, 0)
- self._top.after(10, self._animate_shift_frame, frame - 1, widget, dx)
+ self._top.after(10, self._animate_shift_frame,
+ frame-1, widget, dx)
else:
# but: stacktop??
def _animate_reduce(self):
# What widgets are we shifting?
- numwidgets = len(self._parser.stack()[-1]) # number of children
+ numwidgets = len(self._parser.stack()[-1]) # number of children
widgets = self._stackwidgets[-numwidgets:]
# How far are we moving?
# Start animating.
dt = self._animate.get()
- dy = ydist * 2.0 / dt
- self._animate_reduce_frame(dt / 2, widgets, dy)
+ dy = ydist*2.0/dt
+ self._animate_reduce_frame(dt/2, widgets, dy)
def _animate_reduce_frame(self, frame, widgets, dy):
if frame > 0:
self._animating_lock = 1
- for widget in widgets:
- widget.move(0, dy)
- self._top.after(10, self._animate_reduce_frame, frame - 1, widgets, dy)
+ for widget in widgets: widget.move(0, dy)
+ self._top.after(10, self._animate_reduce_frame,
+ frame-1, widgets, dy)
else:
- del self._stackwidgets[-len(widgets) :]
+ del self._stackwidgets[-len(widgets):]
for widget in widgets:
self._cframe.remove_widget(widget)
tok = self._parser.stack()[-1]
- if not isinstance(tok, Tree):
- raise ValueError()
- label = TextWidget(
- self._canvas, str(tok.label()), color="#006060", font=self._boldfont
- )
- widget = TreeSegmentWidget(self._canvas, label, widgets, width=2)
+ if not isinstance(tok, Tree): raise ValueError()
+ label = TextWidget(self._canvas, str(tok.label()), color='#006060',
+ font=self._boldfont)
+ widget = TreeSegmentWidget(self._canvas, label, widgets,
+ width=2)
(x1, y1, x2, y2) = self._stacklabel.bbox()
- y = y2 - y1 + 10
- if not self._stackwidgets:
- x = 5
- else:
- x = self._stackwidgets[-1].bbox()[2] + 10
+ y = y2-y1+10
+ if not self._stackwidgets: x = 5
+ else: x = self._stackwidgets[-1].bbox()[2] + 10
self._cframe.add_widget(widget, x, y)
self._stackwidgets.append(widget)
self._draw_stack_top(widget)
self._highlight_productions()
- # # Delete the old widgets..
- # del self._stackwidgets[-len(widgets):]
- # for widget in widgets:
- # self._cframe.destroy_widget(widget)
- #
- # # Make a new one.
- # tok = self._parser.stack()[-1]
- # if isinstance(tok, Tree):
- # attribs = {'tree_color': '#4080a0', 'tree_width': 2,
- # 'node_font': bold, 'node_color': '#006060',
- # 'leaf_color': '#006060', 'leaf_font':self._font}
- # widget = tree_to_treesegment(self._canvas, tok.type(),
- # **attribs)
- # widget.node()['color'] = '#000000'
- # else:
- # widget = TextWidget(self._canvas, tok.type(),
- # color='#000000', font=self._font)
- # widget.bind_click(self._popup_reduce)
- # (x1, y1, x2, y2) = self._stacklabel.bbox()
- # y = y2-y1+10
- # if not self._stackwidgets: x = 5
- # else: x = self._stackwidgets[-1].bbox()[2] + 10
- # self._cframe.add_widget(widget, x, y)
- # self._stackwidgets.append(widget)
-
- # self._redraw()
+# # Delete the old widgets..
+# del self._stackwidgets[-len(widgets):]
+# for widget in widgets:
+# self._cframe.destroy_widget(widget)
+#
+# # Make a new one.
+# tok = self._parser.stack()[-1]
+# if isinstance(tok, Tree):
+# attribs = {'tree_color': '#4080a0', 'tree_width': 2,
+# 'node_font': bold, 'node_color': '#006060',
+# 'leaf_color': '#006060', 'leaf_font':self._font}
+# widget = tree_to_treesegment(self._canvas, tok.type(),
+# **attribs)
+# widget.node()['color'] = '#000000'
+# else:
+# widget = TextWidget(self._canvas, tok.type(),
+# color='#000000', font=self._font)
+# widget.bind_click(self._popup_reduce)
+# (x1, y1, x2, y2) = self._stacklabel.bbox()
+# y = y2-y1+10
+# if not self._stackwidgets: x = 5
+# else: x = self._stackwidgets[-1].bbox()[2] + 10
+# self._cframe.add_widget(widget, x, y)
+# self._stackwidgets.append(widget)
+
+ #self._redraw()
self._animating_lock = 0
#########################################
def _highlight_hover(self, event):
# What production are we hovering over?
index = self._prodlist.nearest(event.y)
- if self._hover == index:
- return
+ if self._hover == index: return
# Clear any previous hover highlighting.
self._clear_hover()
rhslen = len(self._productions[index].rhs())
for stackwidget in self._stackwidgets[-rhslen:]:
if isinstance(stackwidget, TreeSegmentWidget):
- stackwidget.label()["color"] = "#00a000"
+ stackwidget.label()['color'] = '#00a000'
else:
- stackwidget["color"] = "#00a000"
+ stackwidget['color'] = '#00a000'
# Remember what production we're hovering over.
self._hover = index
def _clear_hover(self, *event):
# Clear any previous hover highlighting.
- if self._hover == -1:
- return
+ if self._hover == -1: return
self._hover = -1
for stackwidget in self._stackwidgets:
if isinstance(stackwidget, TreeSegmentWidget):
- stackwidget.label()["color"] = "black"
+ stackwidget.label()['color'] = 'black'
else:
- stackwidget["color"] = "black"
+ stackwidget['color'] = 'black'
def app():
"""
from nltk.grammar import Nonterminal, Production, CFG
-
- nonterminals = "S VP NP PP P N Name V Det"
- (S, VP, NP, PP, P, N, Name, V, Det) = [Nonterminal(s) for s in nonterminals.split()]
+ nonterminals = 'S VP NP PP P N Name V Det'
+ (S, VP, NP, PP, P, N, Name, V, Det) = [Nonterminal(s)
+ for s in nonterminals.split()]
productions = (
# Syntactic Productions
Production(VP, [V, NP, PP]),
Production(VP, [V, NP]),
Production(PP, [P, NP]),
+
# Lexical Productions
- Production(NP, ["I"]),
- Production(Det, ["the"]),
- Production(Det, ["a"]),
- Production(N, ["man"]),
- Production(V, ["saw"]),
- Production(P, ["in"]),
- Production(P, ["with"]),
- Production(N, ["park"]),
- Production(N, ["dog"]),
- Production(N, ["statue"]),
- Production(Det, ["my"]),
- )
+ Production(NP, ['I']), Production(Det, ['the']),
+ Production(Det, ['a']), Production(N, ['man']),
+ Production(V, ['saw']), Production(P, ['in']),
+ Production(P, ['with']), Production(N, ['park']),
+ Production(N, ['dog']), Production(N, ['statue']),
+ Production(Det, ['my']),
+ )
grammar = CFG(S, productions)
# tokenize the sentence
- sent = "my dog saw a man in the park with a statue".split()
+ sent = 'my dog saw a man in the park with a statue'.split()
ShiftReduceApp(grammar, sent).mainloop()
-
-if __name__ == "__main__":
+if __name__ == '__main__':
app()
-__all__ = ["app"]
+__all__ = ['app']
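Likewise, nltk.parse.ShiftReduceParser applies shift and reduce non-interactively; unlike the stepping GUI it never backtracks, so it can miss a parse that exists. A minimal sketch under the same toy-grammar assumption as before, mirroring the productions built in app() above:

    from nltk.grammar import CFG
    from nltk.parse import ShiftReduceParser

    # Toy grammar (an assumption) echoing the Production(...) list above.
    toy = CFG.fromstring("""
        S -> NP VP
        NP -> Det N | Det N PP | 'I'
        VP -> V NP | V NP PP
        PP -> P NP
        Det -> 'the' | 'a' | 'my'
        N -> 'man' | 'park' | 'dog' | 'statue'
        V -> 'saw'
        P -> 'in' | 'with'
    """)
    sr = ShiftReduceParser(toy)
    for tree in sr.parse('my dog saw a man in the park with a statue'.split()):
        print(tree)  # may print nothing: a greedy shift-reduce parser can get stuck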
# Natural Language Toolkit: Wordfreq Application
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Sumukh Ghodke <sghodke@csse.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from nltk.text import Text
from nltk.corpus import gutenberg
-
def plot_word_freq_dist(text):
fd = text.vocab()
samples = [item for item, _ in fd.most_common(50)]
values = [fd[sample] for sample in samples]
- values = [sum(values[: i + 1]) * 100.0 / fd.N() for i in range(len(values))]
+ values = [sum(values[:i+1]) * 100.0/fd.N() for i in range(len(values))]
pylab.title(text.name)
pylab.xlabel("Samples")
pylab.ylabel("Cumulative Percentage")
pylab.xticks(range(len(samples)), [str(s) for s in samples], rotation=90)
pylab.show()
-
def app():
- t1 = Text(gutenberg.words("melville-moby_dick.txt"))
+ t1 = Text(gutenberg.words('melville-moby_dick.txt'))
plot_word_freq_dist(t1)
-
-if __name__ == "__main__":
+if __name__ == '__main__':
app()
-__all__ = ["app"]
+__all__ = ['app']
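The plot above is cumulative: the k-th value is the percentage of all tokens in the text covered by the k most frequent samples. The same arithmetic can be checked without pylab; a small sketch:

    from nltk import FreqDist
    from nltk.corpus import gutenberg

    fd = FreqDist(gutenberg.words('melville-moby_dick.txt'))
    samples = [w for w, _ in fd.most_common(50)]
    counts = [fd[w] for w in samples]
    # Cumulative percentage of the whole text covered by the top k samples.
    cumulative = [sum(counts[:i + 1]) * 100.0 / fd.N() for i in range(len(counts))]
    print(cumulative[0], cumulative[-1])  # top-1 coverage vs. top-50 coverage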
# Natural Language Toolkit: WordNet Browser Application
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Jussi Salmela <jtsalmela@users.sourceforge.net>
# Paul Bone <pbone@students.csse.unimelb.edu.au>
# URL: <http://nltk.org/>
# modifying to be compliant with NLTK's coding standards. Tests also
# need to be develop to ensure this continues to work in the face of
# changes to other NLTK packages.
+from __future__ import print_function
# Allow this program to run inside the NLTK source tree.
from sys import path
import base64
import pickle
import copy
-from http.server import HTTPServer, BaseHTTPRequestHandler
-from urllib.parse import unquote_plus
+from six.moves.urllib.parse import unquote_plus
+
+from nltk import compat
from nltk.corpus import wordnet as wn
from nltk.corpus.reader.wordnet import Synset, Lemma
+if compat.PY3:
+ from http.server import HTTPServer, BaseHTTPRequestHandler
+else:
+ from BaseHTTPServer import HTTPServer, BaseHTTPRequestHandler
+
# now included in local file
# from util import html_header, html_trailer, \
# get_static_index_page, get_static_page_by_path, \
class MyServerHandler(BaseHTTPRequestHandler):
+
def do_HEAD(self):
self.send_head()
def do_GET(self):
global firstClient
sp = self.path[1:]
- if unquote_plus(sp) == "SHUTDOWN THE SERVER":
+ if unquote_plus(sp) == 'SHUTDOWN THE SERVER':
if server_mode:
page = "Server must be killed with SIGTERM."
type = "text/plain"
else:
- print("Server shutting down!")
+ print('Server shutting down!')
os._exit(0)
- elif sp == "": # First request.
- type = "text/html"
+ elif sp == '': # First request.
+ type = 'text/html'
if not server_mode and firstClient:
firstClient = False
page = get_static_index_page(True)
else:
page = get_static_index_page(False)
- word = "green"
+ word = 'green'
- elif sp.endswith(".html"): # Trying to fetch an HTML file TODO:
- type = "text/html"
+ elif sp.endswith('.html'): # Trying to fetch an HTML file TODO:
+ type = 'text/html'
usp = unquote_plus(sp)
- if usp == "NLTK Wordnet Browser Database Info.html":
- word = "* Database Info *"
+ if usp == 'NLTK Wordnet Browser Database Info.html':
+ word = '* Database Info *'
if os.path.isfile(usp):
- with open(usp, "r") as infile:
+ with open(usp, 'r') as infile:
page = infile.read()
else:
- page = (
- (html_header % word) + "<p>The database info file:"
- "<p><b>"
- + usp
- + "</b>"
- + "<p>was not found. Run this:"
- + "<p><b>python dbinfo_html.py</b>"
- + "<p>to produce it."
- + html_trailer
- )
+ page = (html_header % word) + \
+ '<p>The database info file:'\
+ '<p><b>' + usp + '</b>' + \
+ '<p>was not found. Run this:' + \
+ '<p><b>python dbinfo_html.py</b>' + \
+ '<p>to produce it.' + html_trailer
else:
# Handle files here.
word = sp
page = get_static_page_by_path(usp)
elif sp.startswith("search"):
# This doesn't seem to work with MWEs.
- type = "text/html"
+ type = 'text/html'
parts = (sp.split("?")[1]).split("&")
- word = [
- p.split("=")[1].replace("+", " ")
- for p in parts
- if p.startswith("nextWord")
- ][0]
+ word = [p.split("=")[1].replace("+", " ")
+ for p in parts if p.startswith("nextWord")][0]
page, word = page_from_word(word)
elif sp.startswith("lookup_"):
# TODO: add a variation of this that takes a non-encoded word or MWE.
- type = "text/html"
- sp = sp[len("lookup_") :]
+ type = 'text/html'
+ sp = sp[len("lookup_"):]
page, word = page_from_href(sp)
elif sp == "start_page":
# if this is the first request we should display help
# information, and possibly set a default word.
- type = "text/html"
+ type = 'text/html'
page, word = page_from_word("wordnet")
else:
- type = "text/plain"
+ type = 'text/plain'
page = "Could not parse request: '%s'" % sp
# Send result.
self.send_head(type)
- self.wfile.write(page.encode("utf8"))
+ self.wfile.write(page.encode('utf8'))
+
def send_head(self, type=None):
self.send_response(200)
- self.send_header("Content-type", type)
+ self.send_header('Content-type', type)
self.end_headers()
def log_message(self, format, *args):
if logfile:
logfile.write(
- "%s - - [%s] %s\n"
- % (self.address_string(), self.log_date_time_string(), format % args)
- )
+ "%s - - [%s] %s\n" %
+ (self.address_string(),
+ self.log_date_time_string(),
+ format%args))
def get_unique_counter_from_url(sp):
Extract the unique counter from the URL if it has one. Otherwise return
None.
"""
- pos = sp.rfind("%23")
+ pos = sp.rfind('%23')
if pos != -1:
- return int(sp[(pos + 3) :])
+ return int(sp[(pos + 3):])
else:
return None
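Here '%23' is the percent-encoding of '#', so a reference such as 'lookup_dog%235' (a hypothetical example) carries unique counter 5:

    >>> get_unique_counter_from_url('lookup_dog%235')
    5
    >>> get_unique_counter_from_url('lookup_dog') is None
    True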
# Setup logging.
if logfilename:
try:
- logfile = open(logfilename, "a", 1) # 1 means 'line buffering'
+ logfile = open(logfilename, "a", 1) # 1 means 'line buffering'
except IOError as e:
- sys.stderr.write("Couldn't open %s for writing: %s", logfilename, e)
+ sys.stderr.write("Couldn't open %s for writing: %s",
+ logfilename, e)
sys.exit(1)
else:
logfile = None
# Compute URL and start web browser
- url = "http://localhost:" + str(port)
+ url = 'http://localhost:' + str(port)
server_ready = None
browser_thread = None
browser_thread = startBrowser(url, server_ready)
# Start the server.
- server = HTTPServer(("", port), MyServerHandler)
+ server = HTTPServer(('', port), MyServerHandler)
if logfile:
- logfile.write("NLTK Wordnet browser server running, serving: %s\n" % url)
+ logfile.write(
+ 'NLTK Wordnet browser server running, serving: %s\n' % url)
if runBrowser:
server_ready.set()
def startBrowser(url, server_ready):
def run():
server_ready.wait()
- time.sleep(1) # Wait a little bit more, there's still the chance of
- # a race condition.
- webbrowser.open(url, new=2, autoraise=1)
-
+ time.sleep(1) # Wait a little bit more, there's still the chance of
+ # a race condition.
+ webbrowser.open(url, new = 2, autoraise = 1)
t = threading.Thread(target=run)
t.start()
return t
-
#####################################################################
# Utilities
#####################################################################
This provides a backend to both wxbrowse and browserver.py.
"""
+\f
################################################################################
#
# Main logic for wordnet browser.
# WordNet corpus is installed.
def _pos_tuples():
return [
- (wn.NOUN, "N", "noun"),
- (wn.VERB, "V", "verb"),
- (wn.ADJ, "J", "adj"),
- (wn.ADV, "R", "adv"),
- ]
-
+ (wn.NOUN,'N','noun'),
+ (wn.VERB,'V','verb'),
+ (wn.ADJ,'J','adj'),
+ (wn.ADV,'R','adv')]
def _pos_match(pos_tuple):
"""
tuple given to it. It attempts to match it against the first
non-null component of the given pos tuple.
"""
- if pos_tuple[0] == "s":
- pos_tuple = ("a", pos_tuple[1], pos_tuple[2])
- for n, x in enumerate(pos_tuple):
+ if pos_tuple[0] == 's':
+ pos_tuple = ('a', pos_tuple[1], pos_tuple[2])
+ for n,x in enumerate(pos_tuple):
if x is not None:
break
for pt in _pos_tuples():
- if pt[n] == pos_tuple[n]:
- return pt
+ if pt[n] == pos_tuple[n]: return pt
return None
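For illustration (editor's sketch, not part of the patch), _pos_match completes a partially specified POS tuple from whichever component is present:

from nltk.corpus import wordnet as wn  # assumed to be the `wn` used above
assert _pos_match((None, 'N', None)) == (wn.NOUN, 'N', 'noun')
assert _pos_match(('v', None, None)) == (wn.VERB, 'V', 'verb')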
def lemma_property(word, synset, func):
+
def flattern(l):
if l == []:
return []
yet support things such as full hyponym vs direct hyponym.
"""
if synset.pos() == wn.NOUN:
- return (
- (HYPONYM, "Hyponyms", synset.hyponyms()),
- (INSTANCE_HYPONYM, "Instance hyponyms", synset.instance_hyponyms()),
- (HYPERNYM, "Direct hypernyms", synset.hypernyms()),
- (
- INDIRECT_HYPERNYMS,
- "Indirect hypernyms",
- rebuild_tree(synset.tree(lambda x: x.hypernyms()))[1],
- ),
- # hypernyms', 'Sister terms',
- (INSTANCE_HYPERNYM, "Instance hypernyms", synset.instance_hypernyms()),
- # (CLASS_REGIONAL, ['domain term region'], ),
- (PART_HOLONYM, "Part holonyms", synset.part_holonyms()),
- (PART_MERONYM, "Part meronyms", synset.part_meronyms()),
- (SUBSTANCE_HOLONYM, "Substance holonyms", synset.substance_holonyms()),
- (SUBSTANCE_MERONYM, "Substance meronyms", synset.substance_meronyms()),
- (MEMBER_HOLONYM, "Member holonyms", synset.member_holonyms()),
- (MEMBER_MERONYM, "Member meronyms", synset.member_meronyms()),
- (ATTRIBUTE, "Attributes", synset.attributes()),
- (ANTONYM, "Antonyms", lemma_property(word, synset, lambda l: l.antonyms())),
- (
- DERIVATIONALLY_RELATED_FORM,
- "Derivationally related form",
- lemma_property(
- word, synset, lambda l: l.derivationally_related_forms()
- ),
- ),
- )
+ return ((HYPONYM, 'Hyponyms',
+ synset.hyponyms()),
+ (INSTANCE_HYPONYM , 'Instance hyponyms',
+ synset.instance_hyponyms()),
+ (HYPERNYM, 'Direct hypernyms',
+ synset.hypernyms()),
+ (INDIRECT_HYPERNYMS, 'Indirect hypernyms',
+ rebuild_tree(synset.tree(lambda x: x.hypernyms()))[1]),
+# hypernyms', 'Sister terms',
+ (INSTANCE_HYPERNYM , 'Instance hypernyms',
+ synset.instance_hypernyms()),
+# (CLASS_REGIONAL, ['domain term region'], ),
+ (PART_HOLONYM, 'Part holonyms',
+ synset.part_holonyms()),
+ (PART_MERONYM, 'Part meronyms',
+ synset.part_meronyms()),
+ (SUBSTANCE_HOLONYM, 'Substance holonyms',
+ synset.substance_holonyms()),
+ (SUBSTANCE_MERONYM, 'Substance meronyms',
+ synset.substance_meronyms()),
+ (MEMBER_HOLONYM, 'Member holonyms',
+ synset.member_holonyms()),
+ (MEMBER_MERONYM, 'Member meronyms',
+ synset.member_meronyms()),
+ (ATTRIBUTE, 'Attributes',
+ synset.attributes()),
+ (ANTONYM, "Antonyms",
+ lemma_property(word, synset, lambda l: l.antonyms())),
+ (DERIVATIONALLY_RELATED_FORM, "Derivationally related form",
+ lemma_property(word, synset, lambda l: l.derivationally_related_forms())))
elif synset.pos() == wn.VERB:
- return (
- (ANTONYM, "Antonym", lemma_property(word, synset, lambda l: l.antonyms())),
- (HYPONYM, "Hyponym", synset.hyponyms()),
- (HYPERNYM, "Direct hypernyms", synset.hypernyms()),
- (
- INDIRECT_HYPERNYMS,
- "Indirect hypernyms",
- rebuild_tree(synset.tree(lambda x: x.hypernyms()))[1],
- ),
- (ENTAILMENT, "Entailments", synset.entailments()),
- (CAUSE, "Causes", synset.causes()),
- (ALSO_SEE, "Also see", synset.also_sees()),
- (VERB_GROUP, "Verb Groups", synset.verb_groups()),
- (
- DERIVATIONALLY_RELATED_FORM,
- "Derivationally related form",
- lemma_property(
- word, synset, lambda l: l.derivationally_related_forms()
- ),
- ),
- )
+ return ((ANTONYM, 'Antonym',
+ lemma_property(word, synset, lambda l: l.antonyms())),
+ (HYPONYM, 'Hyponym',
+ synset.hyponyms()),
+ (HYPERNYM, 'Direct hypernyms',
+ synset.hypernyms()),
+ (INDIRECT_HYPERNYMS, 'Indirect hypernyms',
+ rebuild_tree(synset.tree(lambda x: x.hypernyms()))[1]),
+ (ENTAILMENT, 'Entailments',
+ synset.entailments()),
+ (CAUSE, 'Causes',
+ synset.causes()),
+ (ALSO_SEE, 'Also see',
+ synset.also_sees()),
+ (VERB_GROUP, 'Verb Groups',
+ synset.verb_groups()),
+ (DERIVATIONALLY_RELATED_FORM, "Derivationally related form",
+ lemma_property(word, synset, lambda l: l.derivationally_related_forms())))
elif synset.pos() == wn.ADJ or synset.pos() == wn.ADJ_SAT:
- return (
- (ANTONYM, "Antonym", lemma_property(word, synset, lambda l: l.antonyms())),
- (SIMILAR, "Similar to", synset.similar_tos()),
- # Participle of verb - not supported by corpus
- (
- PERTAINYM,
- "Pertainyms",
- lemma_property(word, synset, lambda l: l.pertainyms()),
- ),
- (ATTRIBUTE, "Attributes", synset.attributes()),
- (ALSO_SEE, "Also see", synset.also_sees()),
- )
+ return ((ANTONYM, 'Antonym',
+ lemma_property(word, synset, lambda l: l.antonyms())),
+ (SIMILAR, 'Similar to',
+ synset.similar_tos()),
+ # Participle of verb - not supported by corpus
+ (PERTAINYM, 'Pertainyms',
+ lemma_property(word, synset, lambda l: l.pertainyms())),
+ (ATTRIBUTE, 'Attributes',
+ synset.attributes()),
+ (ALSO_SEE, 'Also see',
+ synset.also_sees()))
elif synset.pos() == wn.ADV:
# This is weird. Adverbs such as 'quick' and 'fast' don't seem
# to have antonyms returned by the corpus.
- return (
- (ANTONYM, "Antonym", lemma_property(word, synset, lambda l: l.antonyms())),
- )
- # Derived from adjective - not supported by corpus
+ return ((ANTONYM, 'Antonym',
+ lemma_property(word, synset, lambda l: l.antonyms())),)
+ # Derived from adjective - not supported by corpus
else:
raise TypeError("Unhandles synset POS type: " + str(synset.pos()))
-html_header = """
+html_header = '''
<!DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'
'http://www.w3.org/TR/html4/strict.dtd'>
<html>
'text/html; charset=us-ascii'>
<title>NLTK Wordnet Browser display of: %s</title></head>
<body bgcolor='#F5F5F5' text='#000000'>
-"""
-html_trailer = """
+'''
+html_trailer = '''
</body>
</html>
-"""
+'''
-explanation = """
+explanation = '''
<h3>Search Help</h3>
<ul><li>The display below the line is an example of the output the browser
shows you when you enter a search word. The search word was <b>green</b>.</li>
<b>Enter/Return</b> key or click the <b>Search</b> button.</li>
</ul>
<hr width='100%'>
-"""
+'''
# HTML oriented functions
+def _bold(txt): return '<b>%s</b>' % txt
-def _bold(txt):
- return "<b>%s</b>" % txt
-
-
-def _center(txt):
- return "<center>%s</center>" % txt
+def _center(txt): return '<center>%s</center>' % txt
+def _hlev(n,txt): return '<h%d>%s</h%d>' % (n,txt,n)
-def _hlev(n, txt):
- return "<h%d>%s</h%d>" % (n, txt, n)
-
-
-def _italic(txt):
- return "<i>%s</i>" % txt
-
-
-def _li(txt):
- return "<li>%s</li>" % txt
+def _italic(txt): return '<i>%s</i>' % txt
+def _li(txt): return '<li>%s</li>' % txt
def pg(word, body):
- """
+ '''
Return an HTML page of NLTK Browser format constructed from the
word and body
:type body: str
:return: a HTML page for the word-body combination
:rtype: str
- """
+ '''
return (html_header % word) + body + html_trailer
-
-def _ul(txt):
- return "<ul>" + txt + "</ul>"
-
+def _ul(txt): return '<ul>' + txt + '</ul>'
def _abbc(txt):
"""
abbc = asterisks, breaks, bold, center
"""
- return _center(_bold("<br>" * 10 + "*" * 10 + " " + txt + " " + "*" * 10))
-
+ return _center(_bold('<br>'*10 + '*'*10 + ' ' + txt + ' ' + '*'*10))
-full_hyponym_cont_text = _ul(_li(_italic("(has full hyponym continuation)"))) + "\n"
+full_hyponym_cont_text = \
+ _ul(_li(_italic('(has full hyponym continuation)'))) + '\n'
def _get_synset(synset_key):
"""
return wn.synset(synset_key)
-
def _collect_one_synset(word, synset, synset_relations):
- """
+ '''
Returns the HTML string for one synset or word
:param word: the current word
:type synset_relations: dict(synset_key, set(relation_id))
:return: The HTML string built for this synset
:rtype: str
- """
- if isinstance(synset, tuple): # It's a word
+ '''
+ if isinstance(synset, tuple): # It's a word
raise NotImplementedError("word not supported by _collect_one_synset")
- typ = "S"
+ typ = 'S'
pos_tuple = _pos_match((synset.pos(), None, None))
assert pos_tuple is not None, "pos_tuple is null: synset.pos(): %s" % synset.pos()
descr = pos_tuple[2]
synset_label = typ + ";"
if synset.name() in synset_relations:
synset_label = _bold(synset_label)
- s = "<li>%s (%s) " % (make_lookup_link(ref, synset_label), descr)
-
+ s = '<li>%s (%s) ' % (make_lookup_link(ref, synset_label), descr)
def format_lemma(w):
- w = w.replace("_", " ")
+ w = w.replace('_', ' ')
if w.lower() == word:
return _bold(w)
else:
ref = Reference(w)
return make_lookup_link(ref, w)
- s += ", ".join(format_lemma(l.name()) for l in synset.lemmas())
-
- gl = " (%s) <i>%s</i> " % (
- synset.definition(),
- "; ".join('"%s"' % e for e in synset.examples()),
- )
- return s + gl + _synset_relations(word, synset, synset_relations) + "</li>\n"
+ s += ', '.join(format_lemma(l.name()) for l in synset.lemmas())
+ gl = " (%s) <i>%s</i> " % \
+ (synset.definition(),
+ "; ".join("\"%s\"" % e for e in synset.examples()))
+ return s + gl + _synset_relations(word, synset, synset_relations) + '</li>\n'
def _collect_all_synsets(word, pos, synset_relations=dict()):
"""
Return an HTML unordered list of synsets for the given word and
part of speech.
"""
- return "<ul>%s\n</ul>\n" % "".join(
- (
- _collect_one_synset(word, synset, synset_relations)
- for synset in wn.synsets(word, pos)
- )
- )
-
+ return '<ul>%s\n</ul>\n' % \
+ ''.join((_collect_one_synset(word, synset, synset_relations)
+ for synset
+ in wn.synsets(word, pos)))
def _synset_relations(word, synset, synset_relations):
- """
+ '''
Builds the HTML string for the relations of a synset
:param word: The current word
:type synset_relations: dict(synset_key, set(relation_type))
:return: The HTML for a synset's relations
:rtype: str
- """
+ '''
if not synset.name() in synset_relations:
return ""
elif isinstance(r, tuple):
# It's probably a tuple containing a Synset and a list of
# similar tuples. This forms a tree of synsets.
- return "%s\n<ul>%s</ul>\n" % (
- relation_html(r[0]),
- "".join("<li>%s</li>\n" % relation_html(sr) for sr in r[1]),
- )
+ return "%s\n<ul>%s</ul>\n" % \
+ (relation_html(r[0]),
+ ''.join('<li>%s</li>\n' % relation_html(sr) for sr in r[1]))
else:
- raise TypeError(
- "r must be a synset, lemma or list, it was: type(r) = %s, r = %s"
- % (type(r), r)
- )
+ raise TypeError("r must be a synset, lemma or list, it was: type(r) = %s, r = %s" % (type(r), r))
def make_synset_html(db_name, disp_name, rels):
- synset_html = "<i>%s</i>\n" % make_lookup_link(
- copy.deepcopy(ref).toggle_synset_relation(synset, db_name).encode(),
- disp_name,
- )
+ synset_html = '<i>%s</i>\n' % \
+ make_lookup_link(
+ copy.deepcopy(ref).toggle_synset_relation(synset, db_name).encode(),
+ disp_name)
if db_name in ref.synset_relations[synset.name()]:
- synset_html += "<ul>%s</ul>\n" % "".join(
- "<li>%s</li>\n" % relation_html(r) for r in rels
- )
+ synset_html += '<ul>%s</ul>\n' % \
+ ''.join("<li>%s</li>\n" % relation_html(r) for r in rels)
return synset_html
- html = (
- "<ul>"
- + "\n".join(
- (
- "<li>%s</li>" % make_synset_html(*rel_data)
- for rel_data in get_relations_data(word, synset)
- if rel_data[2] != []
- )
- )
- + "</ul>"
- )
+ html = '<ul>' + \
+ '\n'.join(("<li>%s</li>" % make_synset_html(*rel_data) for rel_data
+ in get_relations_data(word, synset)
+ if rel_data[2] != [])) + \
+ '</ul>'
return html
"""
Return an HTML page for the given word.
- :type word: str
:param word: The currently active word
+ :type word: str
:return: A tuple (page,word), where page is the new current HTML page
- to be sent to the browser and
- word is the new current word
+ to be sent to the browser and
+ word is the new current word
:rtype: A tuple (str,str)
"""
return page_from_reference(Reference(word))
-
def page_from_href(href):
- """
+ '''
Returns a tuple of the HTML page built and the new current word
:param href: The hypertext reference to be resolved
to be sent to the browser and
word is the new current word
:rtype: A tuple (str,str)
- """
+ '''
return page_from_reference(Reference.decode(href))
-
def page_from_reference(href):
- """
+ '''
Returns a tuple of the HTML page built and the new current word
:param href: The hypertext reference to be resolved
to be sent to the browser and
word is the new current word
:rtype: A tuple (str,str)
- """
+ '''
word = href.word
pos_forms = defaultdict(list)
- words = word.split(",")
- words = [w for w in [w.strip().lower().replace(" ", "_") for w in words] if w != ""]
+ words = word.split(',')
+ words = [w for w in [w.strip().lower().replace(' ', '_')
+ for w in words]
+ if w != ""]
if len(words) == 0:
# No words were found.
return "", "Please specify a word to search for."
form = wn.morphy(w, pos)
if form and form not in pos_forms[pos]:
pos_forms[pos].append(form)
- body = ""
- for pos, pos_str, name in _pos_tuples():
+ body = ''
+ for pos,pos_str,name in _pos_tuples():
if pos in pos_forms:
- body += _hlev(3, name) + "\n"
+ body += _hlev(3, name) + '\n'
for w in pos_forms[pos]:
# Not all words of exc files are in the database, skip
# to the next word if a KeyError is raised.
return body, word
+\f
#####################################################################
# Static pages
#####################################################################
-
def get_static_page_by_path(path):
"""
Return a static HTML page from the path given.
"""
Return the static web help page.
"""
- return """
+ return \
+"""
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<html>
<!-- Natural Language Toolkit: Wordnet Interface: Graphical Wordnet Browser
- Copyright (C) 2001-2020 NLTK Project
+ Copyright (C) 2001-2017 NLTK Project
Author: Jussi Salmela <jtsalmela@users.sourceforge.net>
URL: <http://nltk.org/>
For license information, see LICENSE.TXT -->
"""
Get the static welcome page.
"""
- return """
+ return \
+"""
<h3>Search Help</h3>
<ul><li>The display below the line is an example of the output the browser
shows you when you enter a search word. The search word was <b>green</b>.</li>
</ul>
"""
-
def get_static_index_page(with_shutdown):
"""
Get the static index page.
"""
- template = """
+ template = \
+"""
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Frameset//EN" "http://www.w3.org/TR/html4/frameset.dtd">
<HTML>
<!-- Natural Language Toolkit: Wordnet Interface: Graphical Wordnet Browser
- Copyright (C) 2001-2020 NLTK Project
+ Copyright (C) 2001-2017 NLTK Project
Author: Jussi Salmela <jtsalmela@users.sourceforge.net>
URL: <http://nltk.org/>
For license information, see LICENSE.TXT -->
If with_shutdown is True then a 'shutdown' button is also provided
to shut down the server.
"""
- template = """
+ template = \
+"""
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<html>
<!-- Natural Language Toolkit: Wordnet Interface: Graphical Wordnet Browser
- Copyright (C) 2001-2020 NLTK Project
+ Copyright (C) 2001-2017 NLTK Project
Author: Jussi Salmela <jtsalmela@users.sourceforge.net>
URL: <http://nltk.org/>
For license information, see LICENSE.TXT -->
</html>
"""
if with_shutdown:
- shutdown_link = '<a href="SHUTDOWN THE SERVER">Shutdown</a>'
+ shutdown_link = "<a href=\"SHUTDOWN THE SERVER\">Shutdown</a>"
else:
shutdown_link = ""
return template % shutdown_link
+
def usage():
"""
Display the command line help message.
"""
print(__doc__)
-
def app():
# Parse and interpret options.
- (opts, _) = getopt.getopt(
- argv[1:], "l:p:sh", ["logfile=", "port=", "server-mode", "help"]
- )
+ (opts, _) = getopt.getopt(argv[1:], "l:p:sh",
+ ["logfile=", "port=", "server-mode", "help"])
port = 8000
server_mode = False
help_mode = False
else:
wnb(port, not server_mode, logfilename)
-
-if __name__ == "__main__":
+if __name__ == '__main__':
app()
-__all__ = ["app"]
+__all__ = ['app']
# Natural Language Toolkit: Some texts for exploration in chapter 1 of the book
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function
-from nltk.corpus import (
- gutenberg,
- genesis,
- inaugural,
- nps_chat,
- webtext,
- treebank,
- wordnet,
-)
+from nltk.corpus import (gutenberg, genesis, inaugural,
+ nps_chat, webtext, treebank, wordnet)
from nltk.text import Text
from nltk.probability import FreqDist
from nltk.util import bigrams
print("Type the name of the text or sentence to view it.")
print("Type: 'texts()' or 'sents()' to list the materials.")
-text1 = Text(gutenberg.words("melville-moby_dick.txt"))
+text1 = Text(gutenberg.words('melville-moby_dick.txt'))
print("text1:", text1.name)
-text2 = Text(gutenberg.words("austen-sense.txt"))
+text2 = Text(gutenberg.words('austen-sense.txt'))
print("text2:", text2.name)
-text3 = Text(genesis.words("english-kjv.txt"), name="The Book of Genesis")
+text3 = Text(genesis.words('english-kjv.txt'), name="The Book of Genesis")
print("text3:", text3.name)
text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
text5 = Text(nps_chat.words(), name="Chat Corpus")
print("text5:", text5.name)
-text6 = Text(webtext.words("grail.txt"), name="Monty Python and the Holy Grail")
+text6 = Text(webtext.words('grail.txt'),
+ name="Monty Python and the Holy Grail")
print("text6:", text6.name)
text7 = Text(treebank.words(), name="Wall Street Journal")
print("text7:", text7.name)
-text8 = Text(webtext.words("singles.txt"), name="Personals Corpus")
+text8 = Text(webtext.words('singles.txt'), name="Personals Corpus")
print("text8:", text8.name)
-text9 = Text(gutenberg.words("chesterton-thursday.txt"))
+text9 = Text(gutenberg.words('chesterton-thursday.txt'))
print("text9:", text9.name)
print("text8:", text8.name)
print("text9:", text9.name)
-
sent1 = ["Call", "me", "Ishmael", "."]
-sent2 = [
- "The",
- "family",
- "of",
- "Dashwood",
- "had",
- "long",
- "been",
- "settled",
- "in",
- "Sussex",
- ".",
-]
-sent3 = [
- "In",
- "the",
- "beginning",
- "God",
- "created",
- "the",
- "heaven",
- "and",
- "the",
- "earth",
- ".",
-]
-sent4 = [
- "Fellow",
- "-",
- "Citizens",
- "of",
- "the",
- "Senate",
- "and",
- "of",
- "the",
- "House",
- "of",
- "Representatives",
- ":",
-]
-sent5 = [
- "I",
- "have",
- "a",
- "problem",
- "with",
- "people",
- "PMing",
- "me",
- "to",
- "lol",
- "JOIN",
-]
-sent6 = [
- "SCENE",
- "1",
- ":",
- "[",
- "wind",
- "]",
- "[",
- "clop",
- "clop",
- "clop",
- "]",
- "KING",
- "ARTHUR",
- ":",
- "Whoa",
- "there",
- "!",
-]
-sent7 = [
- "Pierre",
- "Vinken",
- ",",
- "61",
- "years",
- "old",
- ",",
- "will",
- "join",
- "the",
- "board",
- "as",
- "a",
- "nonexecutive",
- "director",
- "Nov.",
- "29",
- ".",
-]
-sent8 = [
- "25",
- "SEXY",
- "MALE",
- ",",
- "seeks",
- "attrac",
- "older",
- "single",
- "lady",
- ",",
- "for",
- "discreet",
- "encounters",
- ".",
-]
-sent9 = [
- "THE",
- "suburb",
- "of",
- "Saffron",
- "Park",
- "lay",
- "on",
- "the",
- "sunset",
- "side",
- "of",
- "London",
- ",",
- "as",
- "red",
- "and",
- "ragged",
- "as",
- "a",
- "cloud",
- "of",
- "sunset",
- ".",
-]
+sent2 = ["The", "family", "of", "Dashwood", "had", "long",
+ "been", "settled", "in", "Sussex", "."]
+sent3 = ["In", "the", "beginning", "God", "created", "the",
+ "heaven", "and", "the", "earth", "."]
+sent4 = ["Fellow", "-", "Citizens", "of", "the", "Senate",
+ "and", "of", "the", "House", "of", "Representatives", ":"]
+sent5 = ["I", "have", "a", "problem", "with", "people",
+ "PMing", "me", "to", "lol", "JOIN"]
+sent6 = ['SCENE', '1', ':', '[', 'wind', ']', '[', 'clop', 'clop',
+ 'clop', ']', 'KING', 'ARTHUR', ':', 'Whoa', 'there', '!']
+sent7 = ["Pierre", "Vinken", ",", "61", "years", "old", ",",
+ "will", "join", "the", "board", "as", "a", "nonexecutive",
+ "director", "Nov.", "29", "."]
+sent8 = ['25', 'SEXY', 'MALE', ',', 'seeks', 'attrac', 'older',
+ 'single', 'lady', ',', 'for', 'discreet', 'encounters', '.']
+sent9 = ["THE", "suburb", "of", "Saffron", "Park", "lay", "on", "the",
+ "sunset", "side", "of", "London", ",", "as", "red", "and",
+ "ragged", "as", "a", "cloud", "of", "sunset", "."]
def sents():
# Natural Language Toolkit: Combinatory Categorial Grammar
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
For more information see nltk/doc/contrib/ccg/ccg.pdf
"""
-from nltk.ccg.combinator import (
- UndirectedBinaryCombinator,
- DirectedBinaryCombinator,
- ForwardCombinator,
- BackwardCombinator,
- UndirectedFunctionApplication,
- ForwardApplication,
- BackwardApplication,
- UndirectedComposition,
- ForwardComposition,
- BackwardComposition,
- BackwardBx,
- UndirectedSubstitution,
- ForwardSubstitution,
- BackwardSx,
- UndirectedTypeRaise,
- ForwardT,
- BackwardT,
-)
+from nltk.ccg.combinator import (UndirectedBinaryCombinator, DirectedBinaryCombinator,
+ ForwardCombinator, BackwardCombinator,
+ UndirectedFunctionApplication, ForwardApplication,
+ BackwardApplication, UndirectedComposition,
+ ForwardComposition, BackwardComposition,
+ BackwardBx, UndirectedSubstitution, ForwardSubstitution,
+ BackwardSx, UndirectedTypeRaise, ForwardT, BackwardT)
from nltk.ccg.chart import CCGEdge, CCGLeafEdge, CCGChartParser, CCGChart
from nltk.ccg.lexicon import CCGLexicon
# Natural Language Toolkit: CCG Categories
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
-
+from __future__ import unicode_literals
from functools import total_ordering
from abc import ABCMeta, abstractmethod
+from six import add_metaclass
from nltk.internals import raise_unorderable_types
+from nltk.compat import (python_2_unicode_compatible, unicode_repr)
+
+@add_metaclass(ABCMeta)
@total_ordering
-class AbstractCCGCategory(metaclass=ABCMeta):
- """
+class AbstractCCGCategory(object):
+ '''
Interface for categories in combinatory grammars.
- """
+ '''
@abstractmethod
def is_primitive(self):
pass
def __eq__(self, other):
- return (
- self.__class__ is other.__class__
- and self._comparison_key == other._comparison_key
- )
+ return (self.__class__ is other.__class__ and
+ self._comparison_key == other._comparison_key)
def __ne__(self, other):
return not self == other
return self._hash
+@python_2_unicode_compatible
class CCGVar(AbstractCCGCategory):
- """
+ '''
Class representing a variable CCG category.
Used for conjunctions (and possibly type-raising, if implemented as a
unary rule).
- """
-
+ '''
_maxID = 0
def __init__(self, prim_only=False):
@total_ordering
+@python_2_unicode_compatible
class Direction(object):
- """
+ '''
Class representing the direction of a function application.
Also maintains information as to which combinators
may be used with the category.
- """
-
+ '''
def __init__(self, dir, restrictions):
self._dir = dir
self._restrs = restrictions
# Testing the application direction
def is_forward(self):
- return self._dir == "/"
+ return self._dir == '/'
def is_backward(self):
- return self._dir == "\\"
+ return self._dir == '\\'
def dir(self):
return self._dir
return self._restrs
def is_variable(self):
- return self._restrs == "_"
+ return self._restrs == '_'
# Unification and substitution of variable directions.
# Used only if type-raising is implemented as a unary rule, as it
# must inherit restrictions from the argument category.
def can_unify(self, other):
if other.is_variable():
- return [("_", self.restrs())]
+ return [('_', self.restrs())]
elif self.is_variable():
- return [("_", other.restrs())]
+ return [('_', other.restrs())]
else:
if self.restrs() == other.restrs():
return []
return self
for (var, restrs) in subs:
- if var == "_":
+ if var == '_':
return Direction(self._dir, restrs)
return self
# Testing permitted combinators
def can_compose(self):
- return "," not in self._restrs
+ return (',' not in self._restrs)
def can_cross(self):
- return "." not in self._restrs
+ return ('.' not in self._restrs)
def __eq__(self, other):
- return (
- self.__class__ is other.__class__
- and self._comparison_key == other._comparison_key
- )
+ return (self.__class__ is other.__class__ and
+ self._comparison_key == other._comparison_key)
def __ne__(self, other):
return not self == other
# The negation operator reverses the direction of the application
def __neg__(self):
- if self._dir == "/":
- return Direction("\\", self._restrs)
+ if self._dir == '/':
+ return Direction('\\', self._restrs)
else:
- return Direction("/", self._restrs)
+ return Direction('/', self._restrs)
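A small behavioural sketch of Direction (editor's addition, not part of the patch):

fwd = Direction('/', [])
assert fwd.is_forward() and not fwd.is_backward()
assert (-fwd).is_backward()   # negation flips the application direction
assert fwd.can_compose()      # no ',' restriction was given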
+@python_2_unicode_compatible
class PrimitiveCategory(AbstractCCGCategory):
- """
+ '''
Class representing primitive categories.
Takes a string representation of the category, and a
list of strings specifying the morphological subcategories.
- """
-
+ '''
def __init__(self, categ, restrictions=[]):
self._categ = categ
self._restrs = restrictions
def __str__(self):
if self._restrs == []:
return "%s" % self._categ
- restrictions = "[%s]" % ",".join(repr(r) for r in self._restrs)
+ restrictions = "[%s]" % ",".join(unicode_repr(r) for r in self._restrs)
return "%s%s" % (self._categ, restrictions)
+@python_2_unicode_compatible
class FunctionalCategory(AbstractCCGCategory):
- """
+ '''
Class that represents a function application category.
Consists of argument and result categories, together with
an application direction.
- """
-
+ '''
def __init__(self, res, arg, dir):
self._res = res
self._arg = arg
sa = self._res.can_unify(other.res())
sd = self._dir.can_unify(other.dir())
if sa is not None and sd is not None:
- sb = self._arg.substitute(sa).can_unify(other.arg().substitute(sa))
+ sb = self._arg.substitute(sa).can_unify(
+ other.arg().substitute(sa))
if sb is not None:
return sa + sb
return None
# Natural Language Toolkit: Combinatory Categorial Grammar
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
This entire process is shown far more clearly in the demonstration:
python chart.py
"""
+from __future__ import print_function, division, unicode_literals
import itertools
+from six import string_types
+
from nltk.parse import ParserI
from nltk.parse.chart import AbstractChartRule, EdgeI, Chart
from nltk.tree import Tree
from nltk.ccg.lexicon import fromstring, Token
-from nltk.ccg.combinator import (
- ForwardT,
- BackwardT,
- ForwardApplication,
- BackwardApplication,
- ForwardComposition,
- BackwardComposition,
- ForwardSubstitution,
- BackwardBx,
- BackwardSx,
-)
-
+from nltk.ccg.combinator import (ForwardT, BackwardT, ForwardApplication,
+ BackwardApplication, ForwardComposition,
+ BackwardComposition, ForwardSubstitution,
+ BackwardBx, BackwardSx)
+from nltk.compat import python_2_unicode_compatible
from nltk.ccg.combinator import *
from nltk.ccg.logic import *
from nltk.sem.logic import *
self._comparison_key = (span, categ, rule)
# Accessors
- def lhs(self):
- return self._categ
-
- def span(self):
- return self._span
-
- def start(self):
- return self._span[0]
-
- def end(self):
- return self._span[1]
-
- def length(self):
- return self._span[1] - self._span[0]
-
- def rhs(self):
- return ()
-
- def dot(self):
- return 0
-
- def is_complete(self):
- return True
-
- def is_incomplete(self):
- return False
-
- def nextsym(self):
- return None
-
- def categ(self):
- return self._categ
-
- def rule(self):
- return self._rule
-
+ def lhs(self): return self._categ
+ def span(self): return self._span
+ def start(self): return self._span[0]
+ def end(self): return self._span[1]
+ def length(self): return self._span[1] - self._span[0]
+ def rhs(self): return ()
+ def dot(self): return 0
+ def is_complete(self): return True
+ def is_incomplete(self): return False
+ def nextsym(self): return None
+
+ def categ(self): return self._categ
+ def rule(self): return self._rule
class CCGLeafEdge(EdgeI):
- """
+ '''
Class representing leaf edges in a CCG derivation.
- """
-
+ '''
def __init__(self, pos, token, leaf):
self._pos = pos
self._token = token
self._comparison_key = (pos, token.categ(), leaf)
# Accessors
- def lhs(self):
- return self._token.categ()
-
- def span(self):
- return (self._pos, self._pos + 1)
-
- def start(self):
- return self._pos
-
- def end(self):
- return self._pos + 1
-
- def length(self):
- return 1
-
- def rhs(self):
- return self._leaf
-
- def dot(self):
- return 0
-
- def is_complete(self):
- return True
-
- def is_incomplete(self):
- return False
-
- def nextsym(self):
- return None
-
- def token(self):
- return self._token
-
- def categ(self):
- return self._token.categ()
-
- def leaf(self):
- return self._leaf
-
-
+ def lhs(self): return self._token.categ()
+ def span(self): return (self._pos, self._pos+1)
+ def start(self): return self._pos
+ def end(self): return self._pos + 1
+ def length(self): return 1
+ def rhs(self): return self._leaf
+ def dot(self): return 0
+ def is_complete(self): return True
+ def is_incomplete(self): return False
+ def nextsym(self): return None
+
+ def token(self): return self._token
+ def categ(self): return self._token.categ()
+ def leaf(self): return self._leaf
+
+@python_2_unicode_compatible
class BinaryCombinatorRule(AbstractChartRule):
- """
+ '''
Class implementing application of a binary combinator to a chart.
Takes the directed combinator to apply.
- """
-
+ '''
NUMEDGES = 2
-
- def __init__(self, combinator):
+ def __init__(self,combinator):
self._combinator = combinator
# Apply a combinator
# Check if the two edges are permitted to combine.
# If so, generate the corresponding edge.
- if self._combinator.can_combine(left_edge.categ(), right_edge.categ()):
+ if self._combinator.can_combine(left_edge.categ(),right_edge.categ()):
for res in self._combinator.combine(left_edge.categ(), right_edge.categ()):
- new_edge = CCGEdge(
- span=(left_edge.start(), right_edge.end()),
- categ=res,
- rule=self._combinator,
- )
- if chart.insert(new_edge, (left_edge, right_edge)):
+ new_edge = CCGEdge(span=(left_edge.start(), right_edge.end()),categ=res,rule=self._combinator)
+ if chart.insert(new_edge,(left_edge,right_edge)):
yield new_edge
# The representation of the combinator (for printing derivations)
def __str__(self):
return "%s" % self._combinator
-
# Type-raising must be handled slightly differently to the other rules, as the
# resulting rules only span a single edge, rather than both edges.
-
-
+@python_2_unicode_compatible
class ForwardTypeRaiseRule(AbstractChartRule):
- """
+ '''
Class for applying forward type raising
- """
-
+ '''
NUMEDGES = 2
def __init__(self):
- self._combinator = ForwardT
-
+ self._combinator = ForwardT
def apply(self, chart, grammar, left_edge, right_edge):
if not (left_edge.end() == right_edge.start()):
return
for res in self._combinator.combine(left_edge.categ(), right_edge.categ()):
- new_edge = CCGEdge(span=left_edge.span(), categ=res, rule=self._combinator)
- if chart.insert(new_edge, (left_edge,)):
+ new_edge = CCGEdge(span=left_edge.span(),categ=res,rule=self._combinator)
+ if chart.insert(new_edge,(left_edge,)):
yield new_edge
def __str__(self):
return "%s" % self._combinator
-
+@python_2_unicode_compatible
class BackwardTypeRaiseRule(AbstractChartRule):
- """
+ '''
Class for applying backward type raising.
- """
-
+ '''
NUMEDGES = 2
def __init__(self):
- self._combinator = BackwardT
-
+ self._combinator = BackwardT
def apply(self, chart, grammar, left_edge, right_edge):
if not (left_edge.end() == right_edge.start()):
return
for res in self._combinator.combine(left_edge.categ(), right_edge.categ()):
- new_edge = CCGEdge(span=right_edge.span(), categ=res, rule=self._combinator)
- if chart.insert(new_edge, (right_edge,)):
+ new_edge = CCGEdge(span=right_edge.span(),categ=res,rule=self._combinator)
+ if chart.insert(new_edge,(right_edge,)):
yield new_edge
def __str__(self):
# Common sets of combinators used for English derivations.
-ApplicationRuleSet = [
- BinaryCombinatorRule(ForwardApplication),
- BinaryCombinatorRule(BackwardApplication),
-]
-CompositionRuleSet = [
- BinaryCombinatorRule(ForwardComposition),
- BinaryCombinatorRule(BackwardComposition),
- BinaryCombinatorRule(BackwardBx),
-]
-SubstitutionRuleSet = [
- BinaryCombinatorRule(ForwardSubstitution),
- BinaryCombinatorRule(BackwardSx),
-]
+ApplicationRuleSet = [BinaryCombinatorRule(ForwardApplication),
+ BinaryCombinatorRule(BackwardApplication)]
+CompositionRuleSet = [BinaryCombinatorRule(ForwardComposition),
+ BinaryCombinatorRule(BackwardComposition),
+ BinaryCombinatorRule(BackwardBx)]
+SubstitutionRuleSet = [BinaryCombinatorRule(ForwardSubstitution),
+ BinaryCombinatorRule(BackwardSx)]
TypeRaiseRuleSet = [ForwardTypeRaiseRule(), BackwardTypeRaiseRule()]
# The standard English rule set.
-DefaultRuleSet = (
- ApplicationRuleSet + CompositionRuleSet + SubstitutionRuleSet + TypeRaiseRuleSet
-)
-
+DefaultRuleSet = ApplicationRuleSet + CompositionRuleSet + \
+ SubstitutionRuleSet + TypeRaiseRuleSet
class CCGChartParser(ParserI):
- """
+ '''
Chart parser for CCGs.
Based largely on the ChartParser class from NLTK.
- """
-
+ '''
def __init__(self, lexicon, rules, trace=0):
self._lexicon = lexicon
self._rules = rules
def lexicon(self):
return self._lexicon
- # Implements the CYK algorithm
+ # Implements the CYK algorithm
def parse(self, tokens):
tokens = list(tokens)
chart = CCGChart(list(tokens))
new_edge = CCGLeafEdge(index, token, chart.leaf(index))
chart.insert(new_edge, ())
+
# Select a span for the new edges
- for span in range(2, chart.num_leaves() + 1):
- for start in range(0, chart.num_leaves() - span + 1):
+ for span in range(2,chart.num_leaves()+1):
+ for start in range(0,chart.num_leaves()-span+1):
# Try all possible pairs of edges that could generate
# an edge for that span
- for part in range(1, span):
+ for part in range(1,span):
lstart = start
mid = start + part
rend = start + span
- for left in chart.select(span=(lstart, mid)):
- for right in chart.select(span=(mid, rend)):
+ for left in chart.select(span=(lstart,mid)):
+ for right in chart.select(span=(mid,rend)):
# Generate all possible combinations of the two edges
for rule in self._rules:
edges_added_by_rule = 0
- for newedge in rule.apply(chart, lex, left, right):
+ for newedge in rule.apply(chart,lex,left,right):
edges_added_by_rule += 1
# Output the resulting parses
return chart.parses(lex.start())
-
class CCGChart(Chart):
def __init__(self, tokens):
Chart.__init__(self, tokens)
if edge in memo:
return memo[edge]
- if isinstance(edge, CCGLeafEdge):
+ if isinstance(edge,CCGLeafEdge):
word = tree_class(edge.token(), [self._tokens[edge.start()]])
leaf = tree_class((edge.token(), "Leaf"), [word])
memo[edge] = [leaf]
trees = []
for cpl in self.child_pointer_lists(edge):
- child_choices = [self._trees(cp, complete, memo, tree_class) for cp in cpl]
+ child_choices = [self._trees(cp, complete, memo, tree_class)
+ for cp in cpl]
for children in itertools.product(*child_choices):
- lhs = (
- Token(
- self._tokens[edge.start() : edge.end()],
- edge.lhs(),
- compute_semantics(children, edge),
- ),
- str(edge.rule()),
- )
+ lhs = (Token(self._tokens[edge.start():edge.end()], edge.lhs(), compute_semantics(children, edge)), str(edge.rule()))
trees.append(tree_class(lhs, children))
memo[edge] = trees
if children[0].label()[0].semantics() is None:
return None
- if len(children) == 2:
+ if len(children) is 2:
if isinstance(edge.rule(), BackwardCombinator):
- children = [children[1], children[0]]
+ children = [children[1],children[0]]
combinator = edge.rule()._combinator
function = children[0].label()[0].semantics()
elif isinstance(combinator, UndirectedSubstitution):
return compute_substitution_semantics(function, argument)
else:
- raise AssertionError("Unsupported combinator '" + combinator + "'")
+ raise AssertionError('Unsupported combinator \'' + combinator + '\'')
else:
return compute_type_raised_semantics(children[0].label()[0].semantics())
-
-# --------
+#--------
# Displaying derivations
-# --------
+#--------
def printCCGDerivation(tree):
# Get the leaves and initial categories
leafcats = tree.pos()
- leafstr = ""
- catstr = ""
+ leafstr = ''
+ catstr = ''
# Construct a string with both the leaf word and corresponding
# category aligned.
nextlen = 2 + max(len(leaf), len(str_cat))
lcatlen = (nextlen - len(str_cat)) // 2
rcatlen = lcatlen + (nextlen - len(str_cat)) % 2
- catstr += " " * lcatlen + str_cat + " " * rcatlen
+ catstr += ' '*lcatlen + str_cat + ' '*rcatlen
lleaflen = (nextlen - len(leaf)) // 2
rleaflen = lleaflen + (nextlen - len(leaf)) % 2
- leafstr += " " * lleaflen + leaf + " " * rleaflen
+ leafstr += ' '*lleaflen + leaf + ' '*rleaflen
print(leafstr.rstrip())
print(catstr.rstrip())
# Display the derivation steps
- printCCGTree(0, tree)
-
+ printCCGTree(0,tree)
# Prints the sequence of derivation steps.
-def printCCGTree(lwidth, tree):
+def printCCGTree(lwidth,tree):
rwidth = lwidth
# Is a leaf (word).
# Find the width of the current derivation step
for child in tree:
- rwidth = max(rwidth, printCCGTree(rwidth, child))
+ rwidth = max(rwidth, printCCGTree(rwidth,child))
# Is a leaf node.
# Don't print anything, but account for the space occupied.
if not isinstance(tree.label(), tuple):
- return max(
- rwidth, 2 + lwidth + len("%s" % tree.label()), 2 + lwidth + len(tree[0])
- )
+ return max(rwidth,2 + lwidth + len("%s" % tree.label()),
+ 2 + lwidth + len(tree[0]))
(token, op) = tree.label()
- if op == "Leaf":
+ if op == 'Leaf':
return rwidth
# Pad to the left with spaces, followed by a sequence of '-'
# and the derivation rule.
- print(lwidth * " " + (rwidth - lwidth) * "-" + "%s" % op)
+ print(lwidth*' ' + (rwidth-lwidth)*'-' + "%s" % op)
# Print the resulting category on a new line.
str_res = "%s" % (token.categ())
if token.semantics() is not None:
str_res += " {" + str(token.semantics()) + "}"
respadlen = (rwidth - lwidth - len(str_res)) // 2 + lwidth
- print(respadlen * " " + str_res)
+ print(respadlen*' ' + str_res)
return rwidth
-
### Demonstration code
# Construct the lexicon
-lex = fromstring(
- """
+lex = fromstring('''
:- S, NP, N, VP # Primitive categories, S is the target primitive
Det :: NP/N # Family of words
mushrooms => N
parsnips => N
bacon => N
- """
-)
-
+ ''')
def demo():
parser = CCGChartParser(lex, DefaultRuleSet)
for parse in parser.parse("I might cook and eat the bacon".split()):
printCCGDerivation(parse)
-
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: Combinatory Categorial Grammar
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
CCG Combinators
"""
+from __future__ import unicode_literals
from abc import ABCMeta, abstractmethod
+from six import add_metaclass
+from nltk.compat import python_2_unicode_compatible
from nltk.ccg.api import FunctionalCategory
-class UndirectedBinaryCombinator(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class UndirectedBinaryCombinator(object):
"""
Abstract class for representing a binary combinator.
Merely defines functions for checking if the function and argument
of the combinators; these restrictions must be added in the rule
class.
"""
-
@abstractmethod
def can_combine(self, function, argument):
pass
pass
-class DirectedBinaryCombinator(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class DirectedBinaryCombinator(object):
"""
Wrapper for the undirected binary combinator.
It takes left and right categories, and decides which is to be
the function, and which the argument.
It then decides whether or not they can be combined.
"""
-
@abstractmethod
def can_combine(self, left, right):
pass
pass
+@python_2_unicode_compatible
class ForwardCombinator(DirectedBinaryCombinator):
"""
Class representing combinators where the primary functor is on the left.
Takes an undirected combinator, and a predicate which adds constraints
restricting the cases in which it may apply.
"""
-
- def __init__(self, combinator, predicate, suffix=""):
+ def __init__(self, combinator, predicate, suffix=''):
self._combinator = combinator
self._predicate = predicate
self._suffix = suffix
def can_combine(self, left, right):
- return self._combinator.can_combine(left, right) and self._predicate(
- left, right
- )
+ return (self._combinator.can_combine(left, right) and
+ self._predicate(left, right))
def combine(self, left, right):
for cat in self._combinator.combine(left, right):
return ">%s%s" % (self._combinator, self._suffix)
+@python_2_unicode_compatible
class BackwardCombinator(DirectedBinaryCombinator):
"""
The backward equivalent of the ForwardCombinator class.
"""
-
- def __init__(self, combinator, predicate, suffix=""):
+ def __init__(self, combinator, predicate, suffix=''):
self._combinator = combinator
self._predicate = predicate
self._suffix = suffix
def can_combine(self, left, right):
- return self._combinator.can_combine(right, left) and self._predicate(
- left, right
- )
+ return (self._combinator.can_combine(right, left) and
+ self._predicate(left, right))
def combine(self, left, right):
for cat in self._combinator.combine(right, left):
return "<%s%s" % (self._combinator, self._suffix)
+@python_2_unicode_compatible
class UndirectedFunctionApplication(UndirectedBinaryCombinator):
"""
Class representing function application.
yield function.res().substitute(subs)
def __str__(self):
- return ""
+ return ''
# Predicates for function application.
# Application combinator instances
-ForwardApplication = ForwardCombinator(UndirectedFunctionApplication(), forwardOnly)
-BackwardApplication = BackwardCombinator(UndirectedFunctionApplication(), backwardOnly)
+ForwardApplication = ForwardCombinator(UndirectedFunctionApplication(),
+ forwardOnly)
+BackwardApplication = BackwardCombinator(UndirectedFunctionApplication(),
+ backwardOnly)
+@python_2_unicode_compatible
class UndirectedComposition(UndirectedBinaryCombinator):
"""
Functional composition (harmonic) combinator.
X/Y Y/Z -> X/Z (B>)
And the corresponding backwards and crossed variations.
"""
-
def can_combine(self, function, argument):
# Can only combine two functions, and both functions must
# allow composition.
if function.dir().can_compose() and argument.dir().can_compose():
subs = function.arg().can_unify(argument.res())
if subs is not None:
- yield FunctionalCategory(
- function.res().substitute(subs),
- argument.arg().substitute(subs),
- argument.dir(),
- )
+ yield FunctionalCategory(function.res().substitute(subs),
+ argument.arg().substitute(subs),
+ argument.dir())
def __str__(self):
- return "B"
+ return 'B'
# Predicates for restricting application of straight composition.
# Straight composition combinators
-ForwardComposition = ForwardCombinator(UndirectedComposition(), forwardOnly)
-BackwardComposition = BackwardCombinator(UndirectedComposition(), backwardOnly)
+ForwardComposition = ForwardCombinator(UndirectedComposition(),
+ forwardOnly)
+BackwardComposition = BackwardCombinator(UndirectedComposition(),
+ backwardOnly)
# Backward crossed composition
-BackwardBx = BackwardCombinator(
- UndirectedComposition(), backwardBxConstraint, suffix="x"
-)
+BackwardBx = BackwardCombinator(UndirectedComposition(), backwardBxConstraint,
+ suffix='x')
+@python_2_unicode_compatible
class UndirectedSubstitution(UndirectedBinaryCombinator):
"""
Substitution (permutation) combinator.
Y/Z (X\Y)/Z -> X/Z (<Sx)
And other variations.
"""
-
def can_combine(self, function, argument):
if function.is_primitive() or argument.is_primitive():
return False
if not (function.dir().can_compose() and argument.dir().can_compose()):
return False
- return (function.res().arg() == argument.res()) and (
- function.arg() == argument.arg()
- )
+ return (function.res().arg() == argument.res()) and (function.arg() == argument.arg())
def combine(self, function, argument):
if self.can_combine(function, argument):
- yield FunctionalCategory(
- function.res().res(), argument.arg(), argument.dir()
- )
+ yield FunctionalCategory(function.res().res(), argument.arg(),
+ argument.dir())
def __str__(self):
- return "S"
+ return 'S'
# Predicate for forward substitution
# Instances of substitution combinators
-ForwardSubstitution = ForwardCombinator(UndirectedSubstitution(), forwardSConstraint)
-BackwardSx = BackwardCombinator(UndirectedSubstitution(), backwardSxConstraint, "x")
+ForwardSubstitution = ForwardCombinator(UndirectedSubstitution(),
+ forwardSConstraint)
+BackwardSx = BackwardCombinator(UndirectedSubstitution(),
+ backwardSxConstraint, 'x')
# Retrieves the left-most functional category.
return categ
+@python_2_unicode_compatible
class UndirectedTypeRaise(UndirectedBinaryCombinator):
"""
Undirected combinator for type raising.
"""
-
def can_combine(self, function, arg):
# The argument must be a function.
# The restriction that arg.res() must be a function
return False
def combine(self, function, arg):
- if not (
- function.is_primitive() and arg.is_function() and arg.res().is_function()
- ):
+ if not (function.is_primitive() and
+ arg.is_function() and arg.res().is_function()):
return
# Type-raising matches only the innermost application.
subs = function.can_unify(arg.arg())
if subs is not None:
xcat = arg.res().substitute(subs)
- yield FunctionalCategory(
- xcat, FunctionalCategory(xcat, function, arg.dir()), -(arg.dir())
- )
+ yield FunctionalCategory(xcat,
+ FunctionalCategory(xcat, function,
+ arg.dir()),
+ -(arg.dir()))
def __str__(self):
- return "T"
+ return 'T'
# Predicates for type-raising
# Natural Language Toolkit: Combinatory Categorial Grammar
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
CCG Lexicons
"""
+from __future__ import unicode_literals
+
import re
from collections import defaultdict
from nltk.ccg.api import PrimitiveCategory, Direction, CCGVar, FunctionalCategory
+from nltk.compat import python_2_unicode_compatible
from nltk.internals import deprecated
-from nltk.sem.logic import Expression
+from nltk.sem.logic import *
-# ------------
+#------------
# Regular expressions used for parsing components of the lexicon
-# ------------
+#------------
# Parses a primitive category and subscripts
-PRIM_RE = re.compile(r"""([A-Za-z]+)(\[[A-Za-z,]+\])?""")
+PRIM_RE = re.compile(r'''([A-Za-z]+)(\[[A-Za-z,]+\])?''')
# Separates the next primitive category from the remainder of the
# string
-NEXTPRIM_RE = re.compile(r"""([A-Za-z]+(?:\[[A-Za-z,]+\])?)(.*)""")
+NEXTPRIM_RE = re.compile(r'''([A-Za-z]+(?:\[[A-Za-z,]+\])?)(.*)''')
# Separates the next application operator from the remainder
-APP_RE = re.compile(r"""([\\/])([.,]?)([.,]?)(.*)""")
+APP_RE = re.compile(r'''([\\/])([.,]?)([.,]?)(.*)''')
# Parses the definition of the right-hand side (rhs) of either a word or a family
-LEX_RE = re.compile(r"""([\S_]+)\s*(::|[-=]+>)\s*(.+)""", re.UNICODE)
+LEX_RE = re.compile(r'''([\S_]+)\s*(::|[-=]+>)\s*(.+)''', re.UNICODE)
# Parses the right hand side that contains category and maybe semantic predicate
-RHS_RE = re.compile(r"""([^{}]*[^ {}])\s*(\{[^}]+\})?""", re.UNICODE)
+RHS_RE = re.compile(r'''([^{}]*[^ {}])\s*(\{[^}]+\})?''', re.UNICODE)
# Parses the semantic predicate
-SEMANTICS_RE = re.compile(r"""\{([^}]+)\}""", re.UNICODE)
+SEMANTICS_RE = re.compile(r'''\{([^}]+)\}''', re.UNICODE)
# Strips comments from a line
-COMMENTS_RE = re.compile("""([^#]*)(?:#.*)?""")
-
+COMMENTS_RE = re.compile('''([^#]*)(?:#.*)?''')
class Token(object):
"""
* `categ` (string)
* `semantics` (Expression)
"""
-
def __init__(self, token, categ, semantics=None):
self._token = token
self._categ = categ
self._semantics = semantics
-
+
def categ(self):
return self._categ
-
+
def semantics(self):
return self._semantics
-
+
def __str__(self):
semantics_str = ""
if self._semantics is not None:
semantics_str = " {" + str(self._semantics) + "}"
return "" + str(self._categ) + semantics_str
-
+
def __cmp__(self, other):
- if not isinstance(other, Token):
- return -1
- return cmp((self._categ, self._semantics), (other.categ(), other.semantics()))
-
+ if not isinstance(other, Token): return -1
+ return cmp((self._categ,self._semantics),
+ (other.categ(),other.semantics()))
+@python_2_unicode_compatible
class CCGLexicon(object):
"""
Class representing a lexicon for CCG grammars.
* `families`: Families of categories
* `entries`: A mapping of words to possible categories
"""
-
def __init__(self, start, primitives, families, entries):
self._start = PrimitiveCategory(start)
self._primitives = primitives
self._families = families
self._entries = entries
+
def categories(self, word):
"""
Returns all the possible categories for a word
"""
return self._entries[word]
+
def start(self):
"""
Return the target category for the parser
return string
-# -----------
+#-----------
# Parsing lexicons
-# -----------
+#-----------
def matchBrackets(string):
rest = string[1:]
inside = "("
- while rest != "" and not rest.startswith(")"):
- if rest.startswith("("):
+ while rest != "" and not rest.startswith(')'):
+ if rest.startswith('('):
(part, rest) = matchBrackets(rest)
inside = inside + part
else:
inside = inside + rest[0]
rest = rest[1:]
- if rest.startswith(")"):
- return (inside + ")", rest[1:])
- raise AssertionError("Unmatched bracket in string '" + string + "'")
+ if rest.startswith(')'):
+ return (inside + ')', rest[1:])
+ raise AssertionError('Unmatched bracket in string \'' + string + '\'')
def nextCategory(string):
Separate the string for the next portion of the category from the rest
of the string
"""
- if string.startswith("("):
+ if string.startswith('('):
return matchBrackets(string)
return NEXTPRIM_RE.match(string).groups()
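An editor's illustration (not part of the patch) of how a category string is split into its next component and the remainder:

assert matchBrackets("(S/NP)/N") == ("(S/NP)", "/N")
assert nextCategory("NP[sg]/N") == ("NP[sg]", "/N")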
-
def parseApplication(app):
"""
Parse an application operator
Parse the subscripts for a primitive category
"""
if subscr:
- return subscr[1:-1].split(",")
+ return subscr[1:-1].split(',')
return []
if catstr in primitives:
subscrs = parseSubscripts(chunks[1])
return (PrimitiveCategory(catstr, subscrs), var)
- raise AssertionError(
- "String '" + catstr + "' is neither a family nor primitive category."
- )
+ raise AssertionError('String \'' + catstr + '\' is neither a family nor primitive category.')
def augParseCategory(line, primitives, families, var=None):
"""
(cat_string, rest) = nextCategory(line)
- if cat_string.startswith("("):
+ if cat_string.startswith('('):
(res, var) = augParseCategory(cat_string[1:-1], primitives, families, var)
else:
- (res, var) = parsePrimitiveCategory(
- PRIM_RE.match(cat_string).groups(), primitives, families, var
- )
+# print rePrim.match(str).groups()
+ (res, var) =\
+ parsePrimitiveCategory(PRIM_RE.match(cat_string).groups(), primitives,
+ families, var)
while rest != "":
app = APP_RE.match(rest).groups()
rest = app[3]
(cat_string, rest) = nextCategory(rest)
- if cat_string.startswith("("):
+ if cat_string.startswith('('):
(arg, var) = augParseCategory(cat_string[1:-1], primitives, families, var)
else:
- (arg, var) = parsePrimitiveCategory(
- PRIM_RE.match(cat_string).groups(), primitives, families, var
- )
+ (arg, var) =\
+ parsePrimitiveCategory(PRIM_RE.match(cat_string).groups(),
+ primitives, families, var)
res = FunctionalCategory(res, arg, direction)
return (res, var)
-
def fromstring(lex_str, include_semantics=False):
"""
Convert string representation into a lexicon for CCGs.
if line == "":
continue
- if line.startswith(":-"):
+ if line.startswith(':-'):
# A line of primitive categories.
# The first one is the target category
# ie, :- S, N, NP, VP
- primitives = primitives + [
- prim.strip() for prim in line[2:].strip().split(",")
- ]
+ primitives = primitives + [prim.strip() for prim in line[2:].strip().split(',')]
else:
# Either a family definition, or a word definition
(ident, sep, rhs) = LEX_RE.match(line).groups()
(catstr, semantics_str) = RHS_RE.match(rhs).groups()
(cat, var) = augParseCategory(catstr, primitives, families)
- if sep == "::":
+ if sep == '::':
# Family definition
# ie, Det :: NP/N
families[ident] = (cat, var)
semantics = None
if include_semantics is True:
if semantics_str is None:
- raise AssertionError(
- line
- + " must contain semantics because include_semantics is set to True"
- )
+ raise AssertionError(line + " must contain semantics because include_semantics is set to True")
else:
- semantics = Expression.fromstring(
- SEMANTICS_RE.match(semantics_str).groups()[0]
- )
+ semantics = Expression.fromstring(SEMANTICS_RE.match(semantics_str).groups()[0])
# Word definition
# ie, which => (N\N)/(S/NP)
entries[ident].append(Token(ident, cat, semantics))
return CCGLexicon(primitives[0], primitives, families, entries)
-@deprecated("Use fromstring() instead.")
+@deprecated('Use fromstring() instead.')
def parseLexicon(lex_str):
return fromstring(lex_str)
-
-openccg_tinytiny = fromstring(
- """
+openccg_tinytiny = fromstring("""
# Rather minimal lexicon based on the openccg `tinytiny' grammar.
# Only incorporates a subset of the morphological subcategories, however.
:- S,NP,N # Primitive categories
see => TransVpl
sees => TransVsg
- """
-)
+ """)
# Natural Language Toolkit: Combinatory Categorial Grammar
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Tanin Na Nakorn (@tanin)
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from nltk.sem.logic import *
-
def compute_type_raised_semantics(semantics):
core = semantics
parent = None
while isinstance(core, LambdaExpression):
parent = core
core = core.term
-
+
var = Variable("F")
while var in core.free():
var = unique_variable(pattern=var)
core = ApplicationExpression(FunctionVariableExpression(var), core)
-
+
if parent is not None:
parent.term = core
else:
semantics = core
-
+
return LambdaExpression(var, semantics)
-
def compute_function_semantics(function, argument):
return ApplicationExpression(function, argument).simplify()
-
def compute_composition_semantics(function, argument):
- assert isinstance(argument, LambdaExpression), (
- "`" + str(argument) + "` must be a lambda expression"
- )
- return LambdaExpression(
- argument.variable, ApplicationExpression(function, argument.term).simplify()
- )
-
+ assert isinstance(argument, LambdaExpression), "`" + str(argument) + "` must be a lambda expression"
+ return LambdaExpression(argument.variable, ApplicationExpression(function, argument.term).simplify())
def compute_substitution_semantics(function, argument):
- assert isinstance(function, LambdaExpression) and isinstance(
- function.term, LambdaExpression
- ), ("`" + str(function) + "` must be a lambda expression with 2 arguments")
- assert isinstance(argument, LambdaExpression), (
- "`" + str(argument) + "` must be a lambda expression"
- )
+ assert isinstance(function, LambdaExpression) and isinstance(function.term, LambdaExpression), "`" + str(function) + "` must be a lambda expression with 2 arguments"
+ assert isinstance(argument, LambdaExpression), "`" + str(argument) + "` must be a lambda expression"
- new_argument = ApplicationExpression(
- argument, VariableExpression(function.variable)
- ).simplify()
- new_term = ApplicationExpression(function.term, new_argument).simplify()
+ new_argument = ApplicationExpression(argument, VariableExpression(function.variable)).simplify()
+ new_term = ApplicationExpression(function.term, new_argument).simplify()
return LambdaExpression(function.variable, new_term)
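A worked example (editor's addition, not part of the patch) of the composition rule B f g = \x.f(g(x)) using the helper above:

f = Expression.fromstring(r'\x.know(x)')     # Expression comes from nltk.sem.logic
g = Expression.fromstring(r'\y.mother(y)')
print(compute_composition_semantics(f, g))   # \y.know(mother(y))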
# Natural Language Toolkit: Chatbots
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Authors: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
These chatbots may not work using the windows command line or the
windows IDLE GUI.
"""
+from __future__ import print_function
from nltk.chat.util import Chat
from nltk.chat.eliza import eliza_chat
from nltk.chat.zen import zen_chat
bots = [
- (eliza_chat, "Eliza (psycho-babble)"),
- (iesha_chat, "Iesha (teen anime junky)"),
- (rude_chat, "Rude (abusive bot)"),
- (suntsu_chat, "Suntsu (Chinese sayings)"),
- (zen_chat, "Zen (gems of wisdom)"),
-]
-
+ (eliza_chat, 'Eliza (psycho-babble)'),
+ (iesha_chat, 'Iesha (teen anime junky)'),
+ (rude_chat, 'Rude (abusive bot)'),
+ (suntsu_chat, 'Suntsu (Chinese sayings)'),
+ (zen_chat, 'Zen (gems of wisdom)')]
def chatbots():
import sys
-
- print("Which chatbot would you like to talk to?")
+ print('Which chatbot would you like to talk to?')
botcount = len(bots)
for i in range(botcount):
- print(" %d: %s" % (i + 1, bots[i][1]))
+ print(' %d: %s' % (i+1, bots[i][1]))
while True:
- print("\nEnter a number in the range 1-%d: " % botcount, end=" ")
+ print('\nEnter a number in the range 1-%d: ' % botcount, end=' ')
choice = sys.stdin.readline().strip()
if choice.isdigit() and (int(choice) - 1) in range(botcount):
break
else:
- print(" Error: bad chatbot number")
+ print(' Error: bad chatbot number')
- chatbot = bots[int(choice) - 1][0]
+ chatbot = bots[int(choice)-1][0]
chatbot()
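
The menu above is the module's entry point; a session reduces to the sketch below, assuming the bot imports at the top of the module all resolve:

    from nltk.chat import chatbots

    chatbots()   # prints the numbered bot menu, then hands control to the chosen bot
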
# Natural Language Toolkit: Eliza
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Authors: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# a translation table used to convert things you say into things the
# computer says back, e.g. "I am" --> "you are"
+from __future__ import print_function
from nltk.chat.util import Chat, reflections
# a table of response pairs, where each pair consists of a
# with group-macros labelled as %1, %2.
pairs = (
- (
- r"I need (.*)",
- (
- "Why do you need %1?",
- "Would it really help you to get %1?",
- "Are you sure you need %1?",
- ),
- ),
- (
- r"Why don\'t you (.*)",
- (
- "Do you really think I don't %1?",
- "Perhaps eventually I will %1.",
- "Do you really want me to %1?",
- ),
- ),
- (
- r"Why can\'t I (.*)",
- (
- "Do you think you should be able to %1?",
- "If you could %1, what would you do?",
- "I don't know -- why can't you %1?",
- "Have you really tried?",
- ),
- ),
- (
- r"I can\'t (.*)",
- (
- "How do you know you can't %1?",
- "Perhaps you could %1 if you tried.",
- "What would it take for you to %1?",
- ),
- ),
- (
- r"I am (.*)",
- (
- "Did you come to me because you are %1?",
- "How long have you been %1?",
- "How do you feel about being %1?",
- ),
- ),
- (
- r"I\'m (.*)",
- (
- "How does being %1 make you feel?",
- "Do you enjoy being %1?",
- "Why do you tell me you're %1?",
- "Why do you think you're %1?",
- ),
- ),
- (
- r"Are you (.*)",
- (
- "Why does it matter whether I am %1?",
- "Would you prefer it if I were not %1?",
- "Perhaps you believe I am %1.",
- "I may be %1 -- what do you think?",
- ),
- ),
- (
- r"What (.*)",
- (
- "Why do you ask?",
- "How would an answer to that help you?",
- "What do you think?",
- ),
- ),
- (
- r"How (.*)",
- (
- "How do you suppose?",
- "Perhaps you can answer your own question.",
- "What is it you're really asking?",
- ),
- ),
- (
- r"Because (.*)",
- (
- "Is that the real reason?",
- "What other reasons come to mind?",
- "Does that reason apply to anything else?",
- "If %1, what else must be true?",
- ),
- ),
- (
- r"(.*) sorry (.*)",
- (
- "There are many times when no apology is needed.",
- "What feelings do you have when you apologize?",
- ),
- ),
- (
- r"Hello(.*)",
- (
- "Hello... I'm glad you could drop by today.",
- "Hi there... how are you today?",
- "Hello, how are you feeling today?",
- ),
- ),
- (
- r"I think (.*)",
- ("Do you doubt %1?", "Do you really think so?", "But you're not sure %1?"),
- ),
- (
- r"(.*) friend (.*)",
- (
- "Tell me more about your friends.",
- "When you think of a friend, what comes to mind?",
- "Why don't you tell me about a childhood friend?",
- ),
- ),
- (r"Yes", ("You seem quite sure.", "OK, but can you elaborate a bit?")),
- (
- r"(.*) computer(.*)",
- (
- "Are you really talking about me?",
- "Does it seem strange to talk to a computer?",
- "How do computers make you feel?",
- "Do you feel threatened by computers?",
- ),
- ),
- (
- r"Is it (.*)",
- (
- "Do you think it is %1?",
- "Perhaps it's %1 -- what do you think?",
- "If it were %1, what would you do?",
- "It could well be that %1.",
- ),
- ),
- (
- r"It is (.*)",
- (
- "You seem very certain.",
- "If I told you that it probably isn't %1, what would you feel?",
- ),
- ),
- (
- r"Can you (.*)",
- (
- "What makes you think I can't %1?",
- "If I could %1, then what?",
- "Why do you ask if I can %1?",
- ),
- ),
- (
- r"Can I (.*)",
- (
- "Perhaps you don't want to %1.",
- "Do you want to be able to %1?",
- "If you could %1, would you?",
- ),
- ),
- (
- r"You are (.*)",
- (
- "Why do you think I am %1?",
- "Does it please you to think that I'm %1?",
- "Perhaps you would like me to be %1.",
- "Perhaps you're really talking about yourself?",
- ),
- ),
- (
- r"You\'re (.*)",
- (
- "Why do you say I am %1?",
- "Why do you think I am %1?",
- "Are we talking about you, or me?",
- ),
- ),
- (
- r"I don\'t (.*)",
- ("Don't you really %1?", "Why don't you %1?", "Do you want to %1?"),
- ),
- (
- r"I feel (.*)",
- (
- "Good, tell me more about these feelings.",
- "Do you often feel %1?",
- "When do you usually feel %1?",
- "When you feel %1, what do you do?",
- ),
- ),
- (
- r"I have (.*)",
- (
- "Why do you tell me that you've %1?",
- "Have you really %1?",
- "Now that you have %1, what will you do next?",
- ),
- ),
- (
- r"I would (.*)",
- (
- "Could you explain why you would %1?",
- "Why would you %1?",
- "Who else knows that you would %1?",
- ),
- ),
- (
- r"Is there (.*)",
- (
- "Do you think there is %1?",
- "It's likely that there is %1.",
- "Would you like there to be %1?",
- ),
- ),
- (
- r"My (.*)",
- (
- "I see, your %1.",
- "Why do you say that your %1?",
- "When your %1, how do you feel?",
- ),
- ),
- (
- r"You (.*)",
- (
- "We should be discussing you, not me.",
- "Why do you say that about me?",
- "Why do you care whether I %1?",
- ),
- ),
- (r"Why (.*)", ("Why don't you tell me the reason why %1?", "Why do you think %1?")),
- (
- r"I want (.*)",
- (
- "What would it mean to you if you got %1?",
- "Why do you want %1?",
- "What would you do if you got %1?",
- "If you got %1, then what would you do?",
- ),
- ),
- (
- r"(.*) mother(.*)",
- (
- "Tell me more about your mother.",
- "What was your relationship with your mother like?",
- "How do you feel about your mother?",
- "How does this relate to your feelings today?",
- "Good family relations are important.",
- ),
- ),
- (
- r"(.*) father(.*)",
- (
- "Tell me more about your father.",
- "How did your father make you feel?",
- "How do you feel about your father?",
- "Does your relationship with your father relate to your feelings today?",
- "Do you have trouble showing affection with your family?",
- ),
- ),
- (
- r"(.*) child(.*)",
- (
- "Did you have close friends as a child?",
- "What is your favorite childhood memory?",
- "Do you remember any dreams or nightmares from childhood?",
- "Did the other children sometimes tease you?",
- "How do you think your childhood experiences relate to your feelings today?",
- ),
- ),
- (
- r"(.*)\?",
- (
- "Why do you ask that?",
- "Please consider whether you can answer your own question.",
- "Perhaps the answer lies within yourself?",
- "Why don't you tell me?",
- ),
- ),
- (
- r"quit",
- (
- "Thank you for talking with me.",
- "Good-bye.",
- "Thank you, that will be $150. Have a good day!",
- ),
- ),
- (
- r"(.*)",
- (
- "Please tell me more.",
- "Let's change focus a bit... Tell me about your family.",
- "Can you elaborate on that?",
- "Why do you say that %1?",
- "I see.",
- "Very interesting.",
- "%1.",
- "I see. And what does that tell you?",
- "How does that make you feel?",
- "How do you feel when you say that?",
- ),
- ),
+ (r'I need (.*)',
+ ( "Why do you need %1?",
+ "Would it really help you to get %1?",
+ "Are you sure you need %1?")),
+
+ (r'Why don\'t you (.*)',
+ ( "Do you really think I don't %1?",
+ "Perhaps eventually I will %1.",
+ "Do you really want me to %1?")),
+
+ (r'Why can\'t I (.*)',
+ ( "Do you think you should be able to %1?",
+ "If you could %1, what would you do?",
+ "I don't know -- why can't you %1?",
+ "Have you really tried?")),
+
+ (r'I can\'t (.*)',
+ ( "How do you know you can't %1?",
+ "Perhaps you could %1 if you tried.",
+ "What would it take for you to %1?")),
+
+ (r'I am (.*)',
+ ( "Did you come to me because you are %1?",
+ "How long have you been %1?",
+ "How do you feel about being %1?")),
+
+ (r'I\'m (.*)',
+ ( "How does being %1 make you feel?",
+ "Do you enjoy being %1?",
+ "Why do you tell me you're %1?",
+ "Why do you think you're %1?")),
+
+ (r'Are you (.*)',
+ ( "Why does it matter whether I am %1?",
+ "Would you prefer it if I were not %1?",
+ "Perhaps you believe I am %1.",
+ "I may be %1 -- what do you think?")),
+
+ (r'What (.*)',
+ ( "Why do you ask?",
+ "How would an answer to that help you?",
+ "What do you think?")),
+
+ (r'How (.*)',
+ ( "How do you suppose?",
+ "Perhaps you can answer your own question.",
+ "What is it you're really asking?")),
+
+ (r'Because (.*)',
+ ( "Is that the real reason?",
+ "What other reasons come to mind?",
+ "Does that reason apply to anything else?",
+ "If %1, what else must be true?")),
+
+ (r'(.*) sorry (.*)',
+ ( "There are many times when no apology is needed.",
+ "What feelings do you have when you apologize?")),
+
+ (r'Hello(.*)',
+ ( "Hello... I'm glad you could drop by today.",
+ "Hi there... how are you today?",
+ "Hello, how are you feeling today?")),
+
+ (r'I think (.*)',
+ ( "Do you doubt %1?",
+ "Do you really think so?",
+ "But you're not sure %1?")),
+
+ (r'(.*) friend (.*)',
+ ( "Tell me more about your friends.",
+ "When you think of a friend, what comes to mind?",
+ "Why don't you tell me about a childhood friend?")),
+
+ (r'Yes',
+ ( "You seem quite sure.",
+ "OK, but can you elaborate a bit?")),
+
+ (r'(.*) computer(.*)',
+ ( "Are you really talking about me?",
+ "Does it seem strange to talk to a computer?",
+ "How do computers make you feel?",
+ "Do you feel threatened by computers?")),
+
+ (r'Is it (.*)',
+ ( "Do you think it is %1?",
+ "Perhaps it's %1 -- what do you think?",
+ "If it were %1, what would you do?",
+ "It could well be that %1.")),
+
+ (r'It is (.*)',
+ ( "You seem very certain.",
+ "If I told you that it probably isn't %1, what would you feel?")),
+
+ (r'Can you (.*)',
+ ( "What makes you think I can't %1?",
+ "If I could %1, then what?",
+ "Why do you ask if I can %1?")),
+
+ (r'Can I (.*)',
+ ( "Perhaps you don't want to %1.",
+ "Do you want to be able to %1?",
+ "If you could %1, would you?")),
+
+ (r'You are (.*)',
+ ( "Why do you think I am %1?",
+ "Does it please you to think that I'm %1?",
+ "Perhaps you would like me to be %1.",
+ "Perhaps you're really talking about yourself?")),
+
+ (r'You\'re (.*)',
+ ( "Why do you say I am %1?",
+ "Why do you think I am %1?",
+ "Are we talking about you, or me?")),
+
+ (r'I don\'t (.*)',
+ ( "Don't you really %1?",
+ "Why don't you %1?",
+ "Do you want to %1?")),
+
+ (r'I feel (.*)',
+ ( "Good, tell me more about these feelings.",
+ "Do you often feel %1?",
+ "When do you usually feel %1?",
+ "When you feel %1, what do you do?")),
+
+ (r'I have (.*)',
+ ( "Why do you tell me that you've %1?",
+ "Have you really %1?",
+ "Now that you have %1, what will you do next?")),
+
+ (r'I would (.*)',
+ ( "Could you explain why you would %1?",
+ "Why would you %1?",
+ "Who else knows that you would %1?")),
+
+ (r'Is there (.*)',
+ ( "Do you think there is %1?",
+ "It's likely that there is %1.",
+ "Would you like there to be %1?")),
+
+ (r'My (.*)',
+ ( "I see, your %1.",
+ "Why do you say that your %1?",
+ "When your %1, how do you feel?")),
+
+ (r'You (.*)',
+ ( "We should be discussing you, not me.",
+ "Why do you say that about me?",
+ "Why do you care whether I %1?")),
+
+ (r'Why (.*)',
+ ( "Why don't you tell me the reason why %1?",
+ "Why do you think %1?" )),
+
+ (r'I want (.*)',
+ ( "What would it mean to you if you got %1?",
+ "Why do you want %1?",
+ "What would you do if you got %1?",
+ "If you got %1, then what would you do?")),
+
+ (r'(.*) mother(.*)',
+ ( "Tell me more about your mother.",
+ "What was your relationship with your mother like?",
+ "How do you feel about your mother?",
+ "How does this relate to your feelings today?",
+ "Good family relations are important.")),
+
+ (r'(.*) father(.*)',
+ ( "Tell me more about your father.",
+ "How did your father make you feel?",
+ "How do you feel about your father?",
+ "Does your relationship with your father relate to your feelings today?",
+ "Do you have trouble showing affection with your family?")),
+
+ (r'(.*) child(.*)',
+ ( "Did you have close friends as a child?",
+ "What is your favorite childhood memory?",
+ "Do you remember any dreams or nightmares from childhood?",
+ "Did the other children sometimes tease you?",
+ "How do you think your childhood experiences relate to your feelings today?")),
+
+ (r'(.*)\?',
+ ( "Why do you ask that?",
+ "Please consider whether you can answer your own question.",
+ "Perhaps the answer lies within yourself?",
+ "Why don't you tell me?")),
+
+ (r'quit',
+ ( "Thank you for talking with me.",
+ "Good-bye.",
+ "Thank you, that will be $150. Have a good day!")),
+
+ (r'(.*)',
+ ( "Please tell me more.",
+ "Let's change focus a bit... Tell me about your family.",
+ "Can you elaborate on that?",
+ "Why do you say that %1?",
+ "I see.",
+ "Very interesting.",
+ "%1.",
+ "I see. And what does that tell you?",
+ "How does that make you feel?",
+ "How do you feel when you say that?"))
)
eliza_chatbot = Chat(pairs, reflections)
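
Besides the interactive converse() loop used below, the Chat instance can be queried directly, which is convenient for exercising the pair table non-interactively. A small sketch; the reply is chosen at random from the matched tuple, so the exact output varies:

    from nltk.chat.eliza import eliza_chatbot

    # Matches the r"I need (.*)" pair above; %1 is filled from the captured group.
    print(eliza_chatbot.respond("I need a holiday"))
    # e.g. "Why do you need a holiday?"
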
-
def eliza_chat():
print("Therapist\n---------")
print("Talk to the program by typing in plain English, using normal upper-")
print('and lower-case letters and punctuation. Enter "quit" when done.')
- print("=" * 72)
+ print('='*72)
print("Hello. How are you feeling today?")
eliza_chatbot.converse()
-
def demo():
eliza_chat()
-
if __name__ == "__main__":
demo()
+
# Natural Language Toolkit: Teen Chatbot
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Selina Dennis <sjmd@csse.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
anime junky that frequents YahooMessenger or MSNM.
All spelling mistakes and flawed grammar are intentional.
"""
+from __future__ import print_function
from nltk.chat.util import Chat
reflections = {
- "am": "r",
- "was": "were",
- "i": "u",
- "i'd": "u'd",
- "i've": "u'v",
- "ive": "u'v",
- "i'll": "u'll",
- "my": "ur",
- "are": "am",
- "you're": "im",
- "you've": "ive",
- "you'll": "i'll",
- "your": "my",
- "yours": "mine",
- "you": "me",
- "u": "me",
- "ur": "my",
- "urs": "mine",
- "me": "u",
+ "am" : "r",
+ "was" : "were",
+ "i" : "u",
+ "i'd" : "u'd",
+ "i've" : "u'v",
+ "ive" : "u'v",
+ "i'll" : "u'll",
+ "my" : "ur",
+ "are" : "am",
+ "you're" : "im",
+ "you've" : "ive",
+ "you'll" : "i'll",
+ "your" : "my",
+ "yours" : "mine",
+ "you" : "me",
+ "u" : "me",
+ "ur" : "my",
+ "urs" : "mine",
+ "me" : "u"
}
# Note: %1/2/etc are used without spaces prior as the chat bot seems
# to add a superfluous space when matching.
pairs = (
- (
- r"I\'m (.*)",
- (
- "ur%1?? that's so cool! kekekekeke ^_^ tell me more!",
- "ur%1? neat!! kekeke >_<",
- ),
- ),
- (
- r"(.*) don\'t you (.*)",
- (
- "u think I can%2??! really?? kekeke \<_\<",
- "what do u mean%2??!",
- "i could if i wanted, don't you think!! kekeke",
- ),
- ),
- (r"ye[as] [iI] (.*)", ("u%1? cool!! how?", "how come u%1??", "u%1? so do i!!")),
- (
- r"do (you|u) (.*)\??",
- ("do i%2? only on tuesdays! kekeke *_*", "i dunno! do u%2??"),
- ),
- (
- r"(.*)\?",
- (
- "man u ask lots of questions!",
- "booooring! how old r u??",
- "boooooring!! ur not very fun",
- ),
- ),
- (
- r"(cos|because) (.*)",
- ("hee! i don't believe u! >_<", "nuh-uh! >_<", "ooooh i agree!"),
- ),
- (
- r"why can\'t [iI] (.*)",
- (
- "i dunno! y u askin me for!",
- "try harder, silly! hee! ^_^",
- "i dunno! but when i can't%1 i jump up and down!",
- ),
- ),
- (
- r"I can\'t (.*)",
- (
- "u can't what??! >_<",
- "that's ok! i can't%1 either! kekekekeke ^_^",
- "try harder, silly! hee! ^&^",
- ),
- ),
- (
- r"(.*) (like|love|watch) anime",
- (
- "omg i love anime!! do u like sailor moon??! ^&^",
- "anime yay! anime rocks sooooo much!",
- "oooh anime! i love anime more than anything!",
- "anime is the bestest evar! evangelion is the best!",
- "hee anime is the best! do you have ur fav??",
- ),
- ),
- (
- r"I (like|love|watch|play) (.*)",
- ("yay! %2 rocks!", "yay! %2 is neat!", "cool! do u like other stuff?? ^_^"),
- ),
- (
- r"anime sucks|(.*) (hate|detest) anime",
- (
- "ur a liar! i'm not gonna talk to u nemore if u h8 anime *;*",
- "no way! anime is the best ever!",
- "nuh-uh, anime is the best!",
- ),
- ),
- (
- r"(are|r) (you|u) (.*)",
- ("am i%1??! how come u ask that!", "maybe! y shud i tell u?? kekeke >_>"),
- ),
- (
- r"what (.*)",
- ("hee u think im gonna tell u? .v.", "booooooooring! ask me somethin else!"),
- ),
- (r"how (.*)", ("not tellin!! kekekekekeke ^_^",)),
- (r"(hi|hello|hey) (.*)", ("hi!!! how r u!!",)),
- (
- r"quit",
- (
- "mom says i have to go eat dinner now :,( bye!!",
- "awww u have to go?? see u next time!!",
- "how to see u again soon! ^_^",
- ),
- ),
- (
- r"(.*)",
- (
- "ur funny! kekeke",
- "boooooring! talk about something else! tell me wat u like!",
- "do u like anime??",
- "do u watch anime? i like sailor moon! ^_^",
- "i wish i was a kitty!! kekekeke ^_^",
- ),
- ),
-)
+ (r'I\'m (.*)',
+ ( "ur%1?? that's so cool! kekekekeke ^_^ tell me more!",
+ "ur%1? neat!! kekeke >_<")),
-iesha_chatbot = Chat(pairs, reflections)
+ (r'(.*) don\'t you (.*)',
+ ( "u think I can%2??! really?? kekeke \<_\<",
+ "what do u mean%2??!",
+ "i could if i wanted, don't you think!! kekeke")),
+
+ (r'ye[as] [iI] (.*)',
+ ( "u%1? cool!! how?",
+ "how come u%1??",
+ "u%1? so do i!!")),
+
+ (r'do (you|u) (.*)\??',
+ ( "do i%2? only on tuesdays! kekeke *_*",
+ "i dunno! do u%2??")),
+
+ (r'(.*)\?',
+ ( "man u ask lots of questions!",
+ "booooring! how old r u??",
+ "boooooring!! ur not very fun")),
+
+ (r'(cos|because) (.*)',
+ ( "hee! i don't believe u! >_<",
+ "nuh-uh! >_<",
+ "ooooh i agree!")),
+
+ (r'why can\'t [iI] (.*)',
+ ( "i dunno! y u askin me for!",
+ "try harder, silly! hee! ^_^",
+ "i dunno! but when i can't%1 i jump up and down!")),
+
+ (r'I can\'t (.*)',
+ ( "u can't what??! >_<",
+ "that's ok! i can't%1 either! kekekekeke ^_^",
+ "try harder, silly! hee! ^&^")),
+
+ (r'(.*) (like|love|watch) anime',
+ ( "omg i love anime!! do u like sailor moon??! ^&^",
+ "anime yay! anime rocks sooooo much!",
+ "oooh anime! i love anime more than anything!",
+ "anime is the bestest evar! evangelion is the best!",
+ "hee anime is the best! do you have ur fav??")),
+
+ (r'I (like|love|watch|play) (.*)',
+ ( "yay! %2 rocks!",
+ "yay! %2 is neat!",
+ "cool! do u like other stuff?? ^_^")),
+
+ (r'anime sucks|(.*) (hate|detest) anime',
+ ( "ur a liar! i'm not gonna talk to u nemore if u h8 anime *;*",
+ "no way! anime is the best ever!",
+ "nuh-uh, anime is the best!")),
+ (r'(are|r) (you|u) (.*)',
+ ( "am i%1??! how come u ask that!",
+ "maybe! y shud i tell u?? kekeke >_>")),
+
+ (r'what (.*)',
+ ( "hee u think im gonna tell u? .v.",
+ "booooooooring! ask me somethin else!")),
+
+ (r'how (.*)',
+ ( "not tellin!! kekekekekeke ^_^",)),
+
+ (r'(hi|hello|hey) (.*)',
+ ( "hi!!! how r u!!",)),
+
+ (r'quit',
+ ( "mom says i have to go eat dinner now :,( bye!!",
+ "awww u have to go?? see u next time!!",
+ "how to see u again soon! ^_^")),
+
+ (r'(.*)',
+ ( "ur funny! kekeke",
+ "boooooring! talk about something else! tell me wat u like!",
+ "do u like anime??",
+ "do u watch anime? i like sailor moon! ^_^",
+ "i wish i was a kitty!! kekekeke ^_^"))
+ )
+
+iesha_chatbot = Chat(pairs, reflections)
def iesha_chat():
print("Iesha the TeenBoT\n---------")
print("Talk to the program by typing in plain English, using normal upper-")
print('and lower-case letters and punctuation. Enter "quit" when done.')
- print("=" * 72)
+ print('='*72)
print("hi!! i'm iesha! who r u??!")
iesha_chatbot.converse()
-
def demo():
iesha_chat()
-
if __name__ == "__main__":
demo()
# Natural Language Toolkit: Rude Chatbot
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Peter Spiller <pspiller@csse.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function
from nltk.chat.util import Chat, reflections
pairs = (
- (
- r"We (.*)",
- (
- "What do you mean, 'we'?",
- "Don't include me in that!",
- "I wouldn't be so sure about that.",
- ),
- ),
- (
- r"You should (.*)",
- ("Don't tell me what to do, buddy.", "Really? I should, should I?"),
- ),
- (
- r"You\'re(.*)",
- (
- "More like YOU'RE %1!",
- "Hah! Look who's talking.",
- "Come over here and tell me I'm %1.",
- ),
- ),
- (
- r"You are(.*)",
- (
- "More like YOU'RE %1!",
- "Hah! Look who's talking.",
- "Come over here and tell me I'm %1.",
- ),
- ),
- (
- r"I can\'t(.*)",
- (
- "You do sound like the type who can't %1.",
- "Hear that splashing sound? That's my heart bleeding for you.",
- "Tell somebody who might actually care.",
- ),
- ),
- (
- r"I think (.*)",
- (
- "I wouldn't think too hard if I were you.",
- "You actually think? I'd never have guessed...",
- ),
- ),
- (
- r"I (.*)",
- (
- "I'm getting a bit tired of hearing about you.",
- "How about we talk about me instead?",
- "Me, me, me... Frankly, I don't care.",
- ),
- ),
- (
- r"How (.*)",
- (
- "How do you think?",
- "Take a wild guess.",
- "I'm not even going to dignify that with an answer.",
- ),
- ),
- (r"What (.*)", ("Do I look like an encyclopedia?", "Figure it out yourself.")),
- (
- r"Why (.*)",
- (
- "Why not?",
- "That's so obvious I thought even you'd have already figured it out.",
- ),
- ),
- (
- r"(.*)shut up(.*)",
- (
- "Make me.",
- "Getting angry at a feeble NLP assignment? Somebody's losing it.",
- "Say that again, I dare you.",
- ),
- ),
- (
- r"Shut up(.*)",
- (
- "Make me.",
- "Getting angry at a feeble NLP assignment? Somebody's losing it.",
- "Say that again, I dare you.",
- ),
- ),
- (
- r"Hello(.*)",
- ("Oh good, somebody else to talk to. Joy.", "'Hello'? How original..."),
- ),
- (
- r"(.*)",
- (
- "I'm getting bored here. Become more interesting.",
- "Either become more thrilling or get lost, buddy.",
- "Change the subject before I die of fatal boredom.",
- ),
- ),
+ (r'We (.*)',
+ ("What do you mean, 'we'?",
+ "Don't include me in that!",
+ "I wouldn't be so sure about that.")),
+
+ (r'You should (.*)',
+ ("Don't tell me what to do, buddy.",
+ "Really? I should, should I?")),
+
+ (r'You\'re(.*)',
+ ("More like YOU'RE %1!",
+ "Hah! Look who's talking.",
+ "Come over here and tell me I'm %1.")),
+
+ (r'You are(.*)',
+ ("More like YOU'RE %1!",
+ "Hah! Look who's talking.",
+ "Come over here and tell me I'm %1.")),
+
+ (r'I can\'t(.*)',
+ ("You do sound like the type who can't %1.",
+ "Hear that splashing sound? That's my heart bleeding for you.",
+ "Tell somebody who might actually care.")),
+
+ (r'I think (.*)',
+ ("I wouldn't think too hard if I were you.",
+ "You actually think? I'd never have guessed...")),
+
+ (r'I (.*)',
+ ("I'm getting a bit tired of hearing about you.",
+ "How about we talk about me instead?",
+ "Me, me, me... Frankly, I don't care.")),
+
+ (r'How (.*)',
+ ("How do you think?",
+ "Take a wild guess.",
+ "I'm not even going to dignify that with an answer.")),
+
+ (r'What (.*)',
+ ("Do I look like an encyclopedia?",
+ "Figure it out yourself.")),
+
+ (r'Why (.*)',
+ ("Why not?",
+ "That's so obvious I thought even you'd have already figured it out.")),
+
+ (r'(.*)shut up(.*)',
+ ("Make me.",
+ "Getting angry at a feeble NLP assignment? Somebody's losing it.",
+ "Say that again, I dare you.")),
+
+ (r'Shut up(.*)',
+ ("Make me.",
+ "Getting angry at a feeble NLP assignment? Somebody's losing it.",
+ "Say that again, I dare you.")),
+
+ (r'Hello(.*)',
+ ("Oh good, somebody else to talk to. Joy.",
+ "'Hello'? How original...")),
+
+ (r'(.*)',
+ ("I'm getting bored here. Become more interesting.",
+ "Either become more thrilling or get lost, buddy.",
+ "Change the subject before I die of fatal boredom."))
)
rude_chatbot = Chat(pairs, reflections)
-
def rude_chat():
print("Talk to the program by typing in plain English, using normal upper-")
print('and lower-case letters and punctuation. Enter "quit" when done.')
- print("=" * 72)
+ print('='*72)
print("I suppose I should say hello.")
rude_chatbot.converse()
-
def demo():
rude_chat()
-
if __name__ == "__main__":
demo()
# Natural Language Toolkit: Sun Tsu-Bot
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Sam Huston 2007
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
Hosted by the Gutenberg Project
http://www.gutenberg.org/
"""
+from __future__ import print_function
from nltk.chat.util import Chat, reflections
pairs = (
- (r"quit", ("Good-bye.", "Plan well", "May victory be your future")),
- (
- r"[^\?]*\?",
- (
- "Please consider whether you can answer your own question.",
- "Ask me no questions!",
- ),
- ),
- (
- r"[0-9]+(.*)",
- (
- "It is the rule in war, if our forces are ten to the enemy's one, to surround him; if five to one, to attack him; if twice as numerous, to divide our army into two.",
- "There are five essentials for victory",
- ),
- ),
- (
- r"[A-Ca-c](.*)",
- (
- "The art of war is of vital importance to the State.",
- "All warfare is based on deception.",
- "If your opponent is secure at all points, be prepared for him. If he is in superior strength, evade him.",
- "If the campaign is protracted, the resources of the State will not be equal to the strain.",
- "Attack him where he is unprepared, appear where you are not expected.",
- "There is no instance of a country having benefited from prolonged warfare.",
- ),
- ),
- (
- r"[D-Fd-f](.*)",
- (
- "The skillful soldier does not raise a second levy, neither are his supply-wagons loaded more than twice.",
- "Bring war material with you from home, but forage on the enemy.",
- "In war, then, let your great object be victory, not lengthy campaigns.",
- "To fight and conquer in all your battles is not supreme excellence; supreme excellence consists in breaking the enemy's resistance without fighting.",
- ),
- ),
- (
- r"[G-Ig-i](.*)",
- (
- "Heaven signifies night and day, cold and heat, times and seasons.",
- "It is the rule in war, if our forces are ten to the enemy's one, to surround him; if five to one, to attack him; if twice as numerous, to divide our army into two.",
- "The good fighters of old first put themselves beyond the possibility of defeat, and then waited for an opportunity of defeating the enemy.",
- "One may know how to conquer without being able to do it.",
- ),
- ),
- (
- r"[J-Lj-l](.*)",
- (
- "There are three ways in which a ruler can bring misfortune upon his army.",
- "By commanding the army to advance or to retreat, being ignorant of the fact that it cannot obey. This is called hobbling the army.",
- "By attempting to govern an army in the same way as he administers a kingdom, being ignorant of the conditions which obtain in an army. This causes restlessness in the soldier's minds.",
- "By employing the officers of his army without discrimination, through ignorance of the military principle of adaptation to circumstances. This shakes the confidence of the soldiers.",
- "There are five essentials for victory",
- "He will win who knows when to fight and when not to fight.",
- "He will win who knows how to handle both superior and inferior forces.",
- "He will win whose army is animated by the same spirit throughout all its ranks.",
- "He will win who, prepared himself, waits to take the enemy unprepared.",
- "He will win who has military capacity and is not interfered with by the sovereign.",
- ),
- ),
- (
- r"[M-Om-o](.*)",
- (
- "If you know the enemy and know yourself, you need not fear the result of a hundred battles.",
- "If you know yourself but not the enemy, for every victory gained you will also suffer a defeat.",
- "If you know neither the enemy nor yourself, you will succumb in every battle.",
- "The control of a large force is the same principle as the control of a few men: it is merely a question of dividing up their numbers.",
- ),
- ),
- (
- r"[P-Rp-r](.*)",
- (
- "Security against defeat implies defensive tactics; ability to defeat the enemy means taking the offensive.",
- "Standing on the defensive indicates insufficient strength; attacking, a superabundance of strength.",
- "He wins his battles by making no mistakes. Making no mistakes is what establishes the certainty of victory, for it means conquering an enemy that is already defeated.",
- "A victorious army opposed to a routed one, is as a pound's weight placed in the scale against a single grain.",
- "The onrush of a conquering force is like the bursting of pent-up waters into a chasm a thousand fathoms deep.",
- ),
- ),
- (
- r"[S-Us-u](.*)",
- (
- "What the ancients called a clever fighter is one who not only wins, but excels in winning with ease.",
- "Hence his victories bring him neither reputation for wisdom nor credit for courage.",
- "Hence the skillful fighter puts himself into a position which makes defeat impossible, and does not miss the moment for defeating the enemy.",
- "In war the victorious strategist only seeks battle after the victory has been won, whereas he who is destined to defeat first fights and afterwards looks for victory.",
- "There are not more than five musical notes, yet the combinations of these five give rise to more melodies than can ever be heard.",
- "Appear at points which the enemy must hasten to defend; march swiftly to places where you are not expected.",
- ),
- ),
- (
- r"[V-Zv-z](.*)",
- (
- "It is a matter of life and death, a road either to safety or to ruin.",
- "Hold out baits to entice the enemy. Feign disorder, and crush him.",
- "All men can see the tactics whereby I conquer, but what none can see is the strategy out of which victory is evolved.",
- "Do not repeat the tactics which have gained you one victory, but let your methods be regulated by the infinite variety of circumstances.",
- "So in war, the way is to avoid what is strong and to strike at what is weak.",
- "Just as water retains no constant shape, so in warfare there are no constant conditions.",
- ),
- ),
- (r"(.*)", ("Your statement insults me.", "")),
+
+ (r'quit',
+ ( "Good-bye.",
+ "Plan well",
+ "May victory be your future")),
+
+ (r'[^\?]*\?',
+ ("Please consider whether you can answer your own question.",
+ "Ask me no questions!")),
+
+ (r'[0-9]+(.*)',
+ ("It is the rule in war, if our forces are ten to the enemy's one, to surround him; if five to one, to attack him; if twice as numerous, to divide our army into two.",
+ "There are five essentials for victory")),
+
+
+ (r'[A-Ca-c](.*)',
+ ("The art of war is of vital importance to the State.",
+ "All warfare is based on deception.",
+ "If your opponent is secure at all points, be prepared for him. If he is in superior strength, evade him.",
+ "If the campaign is protracted, the resources of the State will not be equal to the strain.",
+ "Attack him where he is unprepared, appear where you are not expected.",
+ "There is no instance of a country having benefited from prolonged warfare.")),
+
+ (r'[D-Fd-f](.*)',
+ ("The skillful soldier does not raise a second levy, neither are his supply-wagons loaded more than twice.",
+ "Bring war material with you from home, but forage on the enemy.",
+ "In war, then, let your great object be victory, not lengthy campaigns.",
+ "To fight and conquer in all your battles is not supreme excellence; supreme excellence consists in breaking the enemy's resistance without fighting.")),
+
+ (r'[G-Ig-i](.*)',
+ ("Heaven signifies night and day, cold and heat, times and seasons.",
+ "It is the rule in war, if our forces are ten to the enemy's one, to surround him; if five to one, to attack him; if twice as numerous, to divide our army into two.",
+ "The good fighters of old first put themselves beyond the possibility of defeat, and then waited for an opportunity of defeating the enemy.",
+ "One may know how to conquer without being able to do it.")),
+
+ (r'[J-Lj-l](.*)',
+ ("There are three ways in which a ruler can bring misfortune upon his army.",
+ "By commanding the army to advance or to retreat, being ignorant of the fact that it cannot obey. This is called hobbling the army.",
+ "By attempting to govern an army in the same way as he administers a kingdom, being ignorant of the conditions which obtain in an army. This causes restlessness in the soldier's minds.",
+ "By employing the officers of his army without discrimination, through ignorance of the military principle of adaptation to circumstances. This shakes the confidence of the soldiers.",
+ "There are five essentials for victory",
+ "He will win who knows when to fight and when not to fight.",
+ "He will win who knows how to handle both superior and inferior forces.",
+ "He will win whose army is animated by the same spirit throughout all its ranks.",
+ "He will win who, prepared himself, waits to take the enemy unprepared.",
+ "He will win who has military capacity and is not interfered with by the sovereign.")),
+
+ (r'[M-Om-o](.*)',
+ ("If you know the enemy and know yourself, you need not fear the result of a hundred battles.",
+ "If you know yourself but not the enemy, for every victory gained you will also suffer a defeat.",
+ "If you know neither the enemy nor yourself, you will succumb in every battle.",
+ "The control of a large force is the same principle as the control of a few men: it is merely a question of dividing up their numbers.")),
+
+ (r'[P-Rp-r](.*)',
+ ("Security against defeat implies defensive tactics; ability to defeat the enemy means taking the offensive.",
+ "Standing on the defensive indicates insufficient strength; attacking, a superabundance of strength.",
+ "He wins his battles by making no mistakes. Making no mistakes is what establishes the certainty of victory, for it means conquering an enemy that is already defeated.",
+ "A victorious army opposed to a routed one, is as a pound's weight placed in the scale against a single grain.",
+ "The onrush of a conquering force is like the bursting of pent-up waters into a chasm a thousand fathoms deep.")),
+
+ (r'[S-Us-u](.*)',
+ ("What the ancients called a clever fighter is one who not only wins, but excels in winning with ease.",
+ "Hence his victories bring him neither reputation for wisdom nor credit for courage.",
+ "Hence the skillful fighter puts himself into a position which makes defeat impossible, and does not miss the moment for defeating the enemy.",
+ "In war the victorious strategist only seeks battle after the victory has been won, whereas he who is destined to defeat first fights and afterwards looks for victory.",
+ "There are not more than five musical notes, yet the combinations of these five give rise to more melodies than can ever be heard.",
+ "Appear at points which the enemy must hasten to defend; march swiftly to places where you are not expected.")),
+
+ (r'[V-Zv-z](.*)',
+ ("It is a matter of life and death, a road either to safety or to ruin.",
+ "Hold out baits to entice the enemy. Feign disorder, and crush him.",
+ "All men can see the tactics whereby I conquer, but what none can see is the strategy out of which victory is evolved.",
+ "Do not repeat the tactics which have gained you one victory, but let your methods be regulated by the infinite variety of circumstances.",
+ "So in war, the way is to avoid what is strong and to strike at what is weak.",
+ "Just as water retains no constant shape, so in warfare there are no constant conditions.")),
+
+ (r'(.*)',
+ ( "Your statement insults me.",
+ ""))
)
suntsu_chatbot = Chat(pairs, reflections)
-
def suntsu_chat():
print("Talk to the program by typing in plain English, using normal upper-")
print('and lower-case letters and punctuation. Enter "quit" when done.')
- print("=" * 72)
+ print('='*72)
print("You seek enlightenment?")
suntsu_chatbot.converse()
-
def demo():
suntsu_chat()
-
if __name__ == "__main__":
demo()
+
# Natural Language Toolkit: Chatbot Utilities
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Authors: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
# Based on an Eliza implementation by Joe Strout <joe@strout.net>,
# Jeff Epler <jepler@inetnebr.com> and Jez Higgins <jez@jezuk.co.uk>.
+from __future__ import print_function
import re
import random
+from six.moves import input
+
reflections = {
- "i am": "you are",
- "i was": "you were",
- "i": "you",
- "i'm": "you are",
- "i'd": "you would",
- "i've": "you have",
- "i'll": "you will",
- "my": "your",
- "you are": "I am",
- "you were": "I was",
- "you've": "I have",
- "you'll": "I will",
- "your": "my",
- "yours": "mine",
- "you": "me",
- "me": "you",
+ "i am" : "you are",
+ "i was" : "you were",
+ "i" : "you",
+ "i'm" : "you are",
+ "i'd" : "you would",
+ "i've" : "you have",
+ "i'll" : "you will",
+ "my" : "your",
+ "you are" : "I am",
+ "you were" : "I was",
+ "you've" : "I have",
+ "you'll" : "I will",
+ "your" : "my",
+ "yours" : "mine",
+ "you" : "me",
+ "me" : "you"
}
-
class Chat(object):
def __init__(self, pairs, reflections={}):
"""
:rtype: None
"""
- self._pairs = [(re.compile(x, re.IGNORECASE), y) for (x, y) in pairs]
+ self._pairs = [(re.compile(x, re.IGNORECASE),y) for (x,y) in pairs]
self._reflections = reflections
self._regex = self._compile_reflections()
+
def _compile_reflections(self):
- sorted_refl = sorted(self._reflections, key=len, reverse=True)
- return re.compile(
- r"\b({0})\b".format("|".join(map(re.escape, sorted_refl))), re.IGNORECASE
- )
+ sorted_refl = sorted(self._reflections.keys(), key=len,
+ reverse=True)
+ return re.compile(r"\b({0})\b".format("|".join(map(re.escape,
+ sorted_refl))), re.IGNORECASE)
def _substitute(self, str):
"""
:rtype: str
"""
- return self._regex.sub(
- lambda mo: self._reflections[mo.string[mo.start() : mo.end()]], str.lower()
- )
+ return self._regex.sub(lambda mo:
+ self._reflections[mo.string[mo.start():mo.end()]],
+ str.lower())
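
The descending sort by key length matters here: it puts 'i am' before 'i' in the alternation, so multi-word phrases flip as a unit. A sketch that pokes the private helper purely for illustration:

    from nltk.chat.util import Chat, reflections

    c = Chat((), reflections)   # no pairs are needed to test reflection
    print(c._substitute("i am sure i was right"))
    # "you are sure you were right" -- 'i am' wins over bare 'i'
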
def _wildcards(self, response, match):
- pos = response.find("%")
+ pos = response.find('%')
while pos >= 0:
- num = int(response[pos + 1 : pos + 2])
- response = (
- response[:pos]
- + self._substitute(match.group(num))
- + response[pos + 2 :]
- )
- pos = response.find("%")
+ num = int(response[pos+1:pos+2])
+ response = response[:pos] + \
+ self._substitute(match.group(num)) + \
+ response[pos+2:]
+ pos = response.find('%')
return response
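
So %N in a response template is replaced by the reflected text of match group N; note the single-character slice response[pos+1:pos+2], which means only groups 1-9 are addressable. A worked sketch (again calling the private helper only for illustration):

    import re
    from nltk.chat.util import Chat, reflections

    c = Chat(((r"I need (.*)", ("Why do you need %1?",)),), reflections)
    m = re.match(r"I need (.*)", "I need my mother", re.IGNORECASE)
    print(c._wildcards("Why do you need %1?", m))
    # "Why do you need your mother?" -- 'my' reflected to 'your'
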
def respond(self, str):
# did the pattern match?
if match:
- resp = random.choice(response) # pick a random response
- resp = self._wildcards(resp, match) # process wildcards
+ resp = random.choice(response) # pick a random response
+ resp = self._wildcards(resp, match) # process wildcards
# fix munged punctuation at the end
- if resp[-2:] == "?.":
- resp = resp[:-2] + "."
- if resp[-2:] == "??":
- resp = resp[:-2] + "?"
+ if resp[-2:] == '?.': resp = resp[:-2] + '.'
+ if resp[-2:] == '??': resp = resp[:-2] + '?'
return resp
# Hold a conversation with a chatbot
user_input = ""
while user_input != quit:
user_input = quit
- try:
- user_input = input(">")
+ try: user_input = input(">")
except EOFError:
print(user_input)
if user_input:
- while user_input[-1] in "!.":
- user_input = user_input[:-1]
+ while user_input[-1] in "!.": user_input = user_input[:-1]
print(self.respond(user_input))
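
Taken together, a complete minimal bot needs nothing beyond a pair table and a reflection map; matching, reflection, wildcard filling, and punctuation cleanup all come from the class above. A self-contained sketch:

    from nltk.chat.util import Chat, reflections

    pairs = (
        (r"my name is (.*)", ("Nice to meet you, %1.",)),
        (r"quit", ("Goodbye!",)),
        (r"(.*)", ("Tell me more.",)),
    )

    bot = Chat(pairs, reflections)
    print(bot.respond("My name is Ada"))   # "Nice to meet you, ada."
    # (the substitution path lower-cases matched text, hence 'ada')
    # bot.converse() would start the interactive read-respond loop
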
# Natural Language Toolkit: Zen Chatbot
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Amy Holland <amyrh@csse.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
respond to a question by asking a different question, in much the same way
as Eliza.
"""
+from __future__ import print_function
from nltk.chat.util import Chat, reflections
# responses are matched top to bottom, so non-specific matches occur later
# for each match, a list of possible responses is provided
responses = (
- # Zen Chatbot opens with the line "Welcome, my child." The usual
- # response will be a greeting. Problem: 'good' matches "good morning",
- # "good day" etc, but also "good grief!" and other sentences starting
- # with the word 'good' that may not be a greeting
- (
- r"(hello(.*))|(good [a-zA-Z]+)",
- (
- "The path to enlightenment is often difficult to see.",
- "Greetings. I sense your mind is troubled. Tell me of your troubles.",
- "Ask the question you have come to ask.",
- "Hello. Do you seek englightenment?",
- ),
- ),
- # "I need" and "I want" can be followed by a thing (eg 'help')
- # or an action (eg 'to see you')
- #
- # This is a problem with this style of response -
- # person: "I need you"
- # chatbot: "me can be achieved by hard work and dedication of the mind"
- # i.e. 'you' is not really a thing that can be mapped this way, so this
- # interpretation only makes sense for some inputs
- #
- (
- r"i need (.*)",
- (
- "%1 can be achieved by hard work and dedication of the mind.",
- "%1 is not a need, but a desire of the mind. Clear your mind of such concerns.",
- "Focus your mind on%1, and you will find what you need.",
- ),
- ),
- (
- r"i want (.*)",
- (
- "Desires of the heart will distract you from the path to enlightenment.",
- "Will%1 help you attain enlightenment?",
- "Is%1 a desire of the mind, or of the heart?",
- ),
- ),
- # why questions are separated into three types:
- # "why..I" e.g. "why am I here?" "Why do I like cake?"
- # "why..you" e.g. "why are you here?" "Why won't you tell me?"
- # "why..." e.g. "Why is the sky blue?"
- # problems:
- # person: "Why can't you tell me?"
- # chatbot: "Are you sure I tell you?"
- # - this style works for positives (e.g. "why do you like cake?")
- # but does not work for negatives (e.g. "why don't you like cake?")
- (r"why (.*) i (.*)\?", ("You%1%2?", "Perhaps you only think you%1%2")),
- (r"why (.*) you(.*)\?", ("Why%1 you%2?", "%2 I%1", "Are you sure I%2?")),
- (r"why (.*)\?", ("I cannot tell you why%1.", "Why do you think %1?")),
- # e.g. "are you listening?", "are you a duck"
- (
- r"are you (.*)\?",
- ("Maybe%1, maybe not%1.", "Whether I am%1 or not is God's business."),
- ),
- # e.g. "am I a duck?", "am I going to die?"
- (
- r"am i (.*)\?",
- ("Perhaps%1, perhaps not%1.", "Whether you are%1 or not is not for me to say."),
- ),
- # what questions, e.g. "what time is it?"
- # problems:
- # person: "What do you want?"
- # chatbot: "Seek truth, not what do me want."
- (r"what (.*)\?", ("Seek truth, not what%1.", "What%1 should not concern you.")),
- # how questions, e.g. "how do you do?"
- (
- r"how (.*)\?",
- (
- "How do you suppose?",
- "Will an answer to that really help in your search for enlightenment?",
- "Ask yourself not how, but why.",
- ),
- ),
- # can questions, e.g. "can you run?", "can you come over here please?"
- (
- r"can you (.*)\?",
- (
- "I probably can, but I may not.",
- "Maybe I can%1, and maybe I cannot.",
- "I can do all, and I can do nothing.",
- ),
- ),
- # can questions, e.g. "can I have some cake?", "can I know truth?"
- (
- r"can i (.*)\?",
- (
- "You can%1 if you believe you can%1, and have a pure spirit.",
- "Seek truth and you will know if you can%1.",
- ),
- ),
- # e.g. "It is raining" - implies the speaker is certain of a fact
- (
- r"it is (.*)",
- (
- "How can you be certain that%1, when you do not even know yourself?",
- "Whether it is%1 or not does not change the way the world is.",
- ),
- ),
- # e.g. "is there a doctor in the house?"
- (
- r"is there (.*)\?",
- ("There is%1 if you believe there is.", "It is possible that there is%1."),
- ),
- # e.g. "is it possible?", "is this true?"
- (r"is(.*)\?", ("%1 is not relevant.", "Does this matter?")),
- # non-specific question
- (
- r"(.*)\?",
- (
- "Do you think %1?",
- "You seek the truth. Does the truth seek you?",
- "If you intentionally pursue the answers to your questions, the answers become hard to see.",
- "The answer to your question cannot be told. It must be experienced.",
- ),
- ),
- # expression of hate of form "I hate you" or "Kelly hates cheese"
- (
- r"(.*) (hate[s]?)|(dislike[s]?)|(don\'t like)(.*)",
- (
- "Perhaps it is not about hating %2, but about hate from within.",
- "Weeds only grow when we dislike them",
- "Hate is a very strong emotion.",
- ),
- ),
- # statement containing the word 'truth'
- (
- r"(.*) truth(.*)",
- (
- "Seek truth, and truth will seek you.",
- "Remember, it is not the spoon which bends - only yourself.",
- "The search for truth is a long journey.",
- ),
- ),
- # desire to do an action
- # e.g. "I want to go shopping"
- (
- r"i want to (.*)",
- ("You may %1 if your heart truly desires to.", "You may have to %1."),
- ),
- # desire for an object
- # e.g. "I want a pony"
- (
- r"i want (.*)",
- (
- "Does your heart truly desire %1?",
- "Is this a desire of the heart, or of the mind?",
- ),
- ),
- # e.g. "I can't wait" or "I can't do this"
- (
- r"i can\'t (.*)",
- (
- "What we can and can't do is a limitation of the mind.",
- "There are limitations of the body, and limitations of the mind.",
- "Have you tried to%1 with a clear mind?",
- ),
- ),
- # "I think.." indicates uncertainty. e.g. "I think so."
- # problem: exceptions...
- # e.g. "I think, therefore I am"
- (
- r"i think (.*)",
- (
- "Uncertainty in an uncertain world.",
- "Indeed, how can we be certain of anything in such uncertain times.",
- "Are you not, in fact, certain that%1?",
- ),
- ),
- # "I feel...emotions/sick/light-headed..."
- (
- r"i feel (.*)",
- (
- "Your body and your emotions are both symptoms of your mind."
- "What do you believe is the root of such feelings?",
- "Feeling%1 can be a sign of your state-of-mind.",
- ),
- ),
- # exclamation mark indicating emotion
- # e.g. "Wow!" or "No!"
- (
- r"(.*)!",
- (
- "I sense that you are feeling emotional today.",
- "You need to calm your emotions.",
- ),
- ),
- # because [statement]
- # e.g. "because I said so"
- (
- r"because (.*)",
- (
- "Does knowning the reasons behind things help you to understand"
- " the things themselves?",
- "If%1, what else must be true?",
- ),
- ),
- # yes or no - raise an issue of certainty/correctness
- (
- r"(yes)|(no)",
- (
- "Is there certainty in an uncertain world?",
- "It is better to be right than to be certain.",
- ),
- ),
- # sentence containing word 'love'
- (
- r"(.*)love(.*)",
- (
- "Think of the trees: they let the birds perch and fly with no intention to call them when they come, and no longing for their return when they fly away. Let your heart be like the trees.",
- "Free love!",
- ),
- ),
- # sentence containing word 'understand' - r
- (
- r"(.*)understand(.*)",
- (
- "If you understand, things are just as they are;"
- " if you do not understand, things are just as they are.",
- "Imagination is more important than knowledge.",
- ),
- ),
- # 'I', 'me', 'my' - person is talking about themself.
- # this breaks down when words contain these - eg 'Thyme', 'Irish'
- (
- r"(.*)(me )|( me)|(my)|(mine)|(i)(.*)",
- (
- "'I', 'me', 'my'... these are selfish expressions.",
- "Have you ever considered that you might be a selfish person?",
- "Try to consider others, not just yourself.",
- "Think not just of yourself, but of others.",
- ),
- ),
- # 'you' starting a sentence
- # e.g. "you stink!"
- (
- r"you (.*)",
- ("My path is not of conern to you.", "I am but one, and you but one more."),
- ),
- # say goodbye with some extra Zen wisdom.
- (
- r"exit",
- (
- "Farewell. The obstacle is the path.",
- "Farewell. Life is a journey, not a destination.",
- "Good bye. We are cups, constantly and quietly being filled."
- "\nThe trick is knowning how to tip ourselves over and let the beautiful stuff out.",
- ),
- ),
- # fall through case -
- # when stumped, respond with generic zen wisdom
- #
- (
- r"(.*)",
- (
- "When you're enlightened, every word is wisdom.",
- "Random talk is useless.",
- "The reverse side also has a reverse side.",
- "Form is emptiness, and emptiness is form.",
- "I pour out a cup of water. Is the cup empty?",
- ),
- ),
+
+# Zen Chatbot opens with the line "Welcome, my child." The usual
+# response will be a greeting. Problem: 'good' matches "good morning",
+# "good day" etc, but also "good grief!" and other sentences starting
+# with the word 'good' that may not be a greeting
+
+ (r'(hello(.*))|(good [a-zA-Z]+)',
+ ( "The path to enlightenment is often difficult to see.",
+ "Greetings. I sense your mind is troubled. Tell me of your troubles.",
+ "Ask the question you have come to ask.",
+ "Hello. Do you seek englightenment?")),
+
+
+# "I need" and "I want" can be followed by a thing (eg 'help')
+# or an action (eg 'to see you')
+#
+# This is a problem with this style of response -
+# person: "I need you"
+# chatbot: "me can be achieved by hard work and dedication of the mind"
+# i.e. 'you' is not really a thing that can be mapped this way, so this
+# interpretation only makes sense for some inputs
+#
+ (r'i need (.*)',
+ ( "%1 can be achieved by hard work and dedication of the mind.",
+ "%1 is not a need, but a desire of the mind. Clear your mind of such concerns.",
+ "Focus your mind on%1, and you will find what you need.")),
+
+ (r'i want (.*)',
+ ( "Desires of the heart will distract you from the path to enlightenment.",
+ "Will%1 help you attain enlightenment?",
+ "Is%1 a desire of the mind, or of the heart?")),
+
+
+# why questions are separated into three types:
+# "why..I" e.g. "why am I here?" "Why do I like cake?"
+# "why..you" e.g. "why are you here?" "Why won't you tell me?"
+# "why..." e.g. "Why is the sky blue?"
+# problems:
+# person: "Why can't you tell me?"
+# chatbot: "Are you sure I tell you?"
+# - this style works for positives (e.g. "why do you like cake?")
+# but does not work for negatives (e.g. "why don't you like cake?")
+ (r'why (.*) i (.*)\?',
+ ( "You%1%2?",
+ "Perhaps you only think you%1%2")),
+
+ (r'why (.*) you(.*)\?',
+ ( "Why%1 you%2?",
+ "%2 I%1",
+ "Are you sure I%2?")),
+
+ (r'why (.*)\?',
+ ( "I cannot tell you why%1.",
+ "Why do you think %1?" )),
+
+# e.g. "are you listening?", "are you a duck"
+ (r'are you (.*)\?',
+ ( "Maybe%1, maybe not%1.",
+ "Whether I am%1 or not is God's business.")),
+
+# e.g. "am I a duck?", "am I going to die?"
+ (r'am i (.*)\?',
+ ( "Perhaps%1, perhaps not%1.",
+ "Whether you are%1 or not is not for me to say.")),
+
+# what questions, e.g. "what time is it?"
+# problems:
+# person: "What do you want?"
+# chatbot: "Seek truth, not what do me want."
+ (r'what (.*)\?',
+ ( "Seek truth, not what%1.",
+ "What%1 should not concern you.")),
+
+# how questions, e.g. "how do you do?"
+ (r'how (.*)\?',
+ ( "How do you suppose?",
+ "Will an answer to that really help in your search for enlightenment?",
+ "Ask yourself not how, but why.")),
+
+# can questions, e.g. "can you run?", "can you come over here please?"
+ (r'can you (.*)\?',
+ ( "I probably can, but I may not.",
+ "Maybe I can%1, and maybe I cannot.",
+ "I can do all, and I can do nothing.")),
+
+# can questions, e.g. "can I have some cake?", "can I know truth?"
+ (r'can i (.*)\?',
+ ( "You can%1 if you believe you can%1, and have a pure spirit.",
+ "Seek truth and you will know if you can%1.")),
+
+# e.g. "It is raining" - implies the speaker is certain of a fact
+ (r'it is (.*)',
+ ( "How can you be certain that%1, when you do not even know yourself?",
+ "Whether it is%1 or not does not change the way the world is.")),
+
+# e.g. "is there a doctor in the house?"
+ (r'is there (.*)\?',
+ ( "There is%1 if you believe there is.",
+ "It is possible that there is%1.")),
+
+# e.g. "is it possible?", "is this true?"
+ (r'is(.*)\?',
+ ( "%1 is not relevant.",
+ "Does this matter?")),
+
+# non-specific question
+ (r'(.*)\?',
+ ( "Do you think %1?",
+ "You seek the truth. Does the truth seek you?",
+ "If you intentionally pursue the answers to your questions, the answers become hard to see.",
+ "The answer to your question cannot be told. It must be experienced.")),
+
+# expression of hate of form "I hate you" or "Kelly hates cheese"
+ (r'(.*) (hate[s]?)|(dislike[s]?)|(don\'t like)(.*)',
+ ( "Perhaps it is not about hating %2, but about hate from within.",
+ "Weeds only grow when we dislike them",
+ "Hate is a very strong emotion.")),
+
+# statement containing the word 'truth'
+ (r'(.*) truth(.*)',
+ ( "Seek truth, and truth will seek you.",
+ "Remember, it is not the spoon which bends - only yourself.",
+ "The search for truth is a long journey.")),
+
+# desire to do an action
+# e.g. "I want to go shopping"
+ (r'i want to (.*)',
+ ( "You may %1 if your heart truly desires to.",
+ "You may have to %1.")),
+
+# desire for an object
+# e.g. "I want a pony"
+ (r'i want (.*)',
+ ( "Does your heart truly desire %1?",
+ "Is this a desire of the heart, or of the mind?")),
+
+# e.g. "I can't wait" or "I can't do this"
+ (r'i can\'t (.*)',
+ ( "What we can and can't do is a limitation of the mind.",
+ "There are limitations of the body, and limitations of the mind.",
+ "Have you tried to%1 with a clear mind?")),
+
+# "I think.." indicates uncertainty. e.g. "I think so."
+# problem: exceptions...
+# e.g. "I think, therefore I am"
+ (r'i think (.*)',
+ ( "Uncertainty in an uncertain world.",
+ "Indeed, how can we be certain of anything in such uncertain times.",
+ "Are you not, in fact, certain that%1?")),
+
+# "I feel...emotions/sick/light-headed..."
+ (r'i feel (.*)',
+ ( "Your body and your emotions are both symptoms of your mind."
+ "What do you believe is the root of such feelings?",
+ "Feeling%1 can be a sign of your state-of-mind.")),
+
+
+# exclamation mark indicating emotion
+# e.g. "Wow!" or "No!"
+ (r'(.*)!',
+ ( "I sense that you are feeling emotional today.",
+ "You need to calm your emotions.")),
+
+# because [statement]
+# e.g. "because I said so"
+ (r'because (.*)',
+ ( "Does knowning the reasons behind things help you to understand"
+ " the things themselves?",
+ "If%1, what else must be true?")),
+
+# yes or no - raise an issue of certainty/correctness
+ (r'(yes)|(no)',
+ ( "Is there certainty in an uncertain world?",
+ "It is better to be right than to be certain.")),
+
+# sentence containing word 'love'
+ (r'(.*)love(.*)',
+ ( "Think of the trees: they let the birds perch and fly with no intention to call them when they come, and no longing for their return when they fly away. Let your heart be like the trees.",
+ "Free love!")),
+
+# sentence containing word 'understand' - r
+ (r'(.*)understand(.*)',
+ ( "If you understand, things are just as they are;"
+ " if you do not understand, things are just as they are.",
+ "Imagination is more important than knowledge.")),
+
+# 'I', 'me', 'my' - person is talking about themself.
+# this breaks down when words contain these - eg 'Thyme', 'Irish'
+ (r'(.*)(me )|( me)|(my)|(mine)|(i)(.*)',
+ ( "'I', 'me', 'my'... these are selfish expressions.",
+ "Have you ever considered that you might be a selfish person?",
+ "Try to consider others, not just yourself.",
+ "Think not just of yourself, but of others.")),
+
+# 'you' starting a sentence
+# e.g. "you stink!"
+ (r'you (.*)',
+ ( "My path is not of conern to you.",
+ "I am but one, and you but one more.")),
+
+# say goodbye with some extra Zen wisdom.
+ (r'exit',
+ ( "Farewell. The obstacle is the path.",
+ "Farewell. Life is a journey, not a destination.",
+ "Good bye. We are cups, constantly and quietly being filled."
+ "\nThe trick is knowning how to tip ourselves over and let the beautiful stuff out.")),
+
+
+# fall through case -
+# when stumped, respond with generic zen wisdom
+#
+ (r'(.*)',
+ ( "When you're enlightened, every word is wisdom.",
+ "Random talk is useless.",
+ "The reverse side also has a reverse side.",
+ "Form is emptiness, and emptiness is form.",
+ "I pour out a cup of water. Is the cup empty?"))
)
zen_chatbot = Chat(responses, reflections)
-
def zen_chat():
- print("*" * 75)
+ print('*'*75)
print("Zen Chatbot!".center(75))
- print("*" * 75)
+ print('*'*75)
print('"Look beyond mere words and letters - look into your mind"'.center(75))
print("* Talk your way to truth with Zen Chatbot.")
print("* Type 'quit' when you have had enough.")
- print("*" * 75)
+ print('*'*75)
print("Welcome, my child.")
zen_chatbot.converse()
-
def demo():
zen_chat()
-
if __name__ == "__main__":
demo()
# Natural Language Toolkit: Chunkers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
from nltk.data import load
from nltk.chunk.api import ChunkParserI
-from nltk.chunk.util import (
- ChunkScore,
- accuracy,
- tagstr2tree,
- conllstr2tree,
- conlltags2tree,
- tree2conlltags,
- tree2conllstr,
- tree2conlltags,
- ieerstr2tree,
-)
+from nltk.chunk.util import (ChunkScore, accuracy, tagstr2tree, conllstr2tree,
+ conlltags2tree, tree2conlltags, tree2conllstr, tree2conlltags,
+ ieerstr2tree)
from nltk.chunk.regexp import RegexpChunkParser, RegexpParser
# Standard treebank POS tagger
-_BINARY_NE_CHUNKER = "chunkers/maxent_ne_chunker/english_ace_binary.pickle"
-_MULTICLASS_NE_CHUNKER = "chunkers/maxent_ne_chunker/english_ace_multiclass.pickle"
-
+_BINARY_NE_CHUNKER = 'chunkers/maxent_ne_chunker/english_ace_binary.pickle'
+_MULTICLASS_NE_CHUNKER = 'chunkers/maxent_ne_chunker/english_ace_multiclass.pickle'
def ne_chunk(tagged_tokens, binary=False):
"""
chunker = load(chunker_pickle)
return chunker.parse(tagged_tokens)
-
def ne_chunk_sents(tagged_sentences, binary=False):
"""
Use NLTK's currently recommended named entity chunker to chunk the
chunker_pickle = _MULTICLASS_NE_CHUNKER
chunker = load(chunker_pickle)
return chunker.parse_sents(tagged_sentences)
+
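# Usage sketch for the two entry points above (hedged): ne_chunk() expects
# pos-tagged tokens, so the 'punkt', 'averaged_perceptron_tagger',
# 'maxent_ne_chunker' and 'words' data packages must be fetched with
# nltk.download() first. The exact labels depend on the pickled model.
import nltk

tokens = nltk.pos_tag(nltk.word_tokenize("Samuel lives in London."))
print(nltk.ne_chunk(tokens, binary=False))
# e.g. (S (PERSON Samuel/NNP) lives/VBZ in/IN (GPE London/NNP) ./.)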
# Natural Language Toolkit: Chunk parsing API
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com> (minor additions)
# URL: <http://nltk.org/>
from nltk.chunk.util import ChunkScore
-
class ChunkParserI(ParserI):
"""
A processing interface for identifying non-overlapping groups in
``ParserI``, ``ChunkParserI`` guarantees that the ``parse()`` method
will always generate a parse.
"""
-
def parse(self, tokens):
"""
Return the best chunk structure for the given tokens
for correct in gold:
chunkscore.score(correct, self.parse(correct.leaves()))
return chunkscore
+
# Natural Language Toolkit: Chunk parsing API
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Named entity chunker
"""
+from __future__ import print_function
import os, re, pickle
from xml.etree import ElementTree as ET
from nltk.chunk.api import ChunkParserI
from nltk.chunk.util import ChunkScore
-
class NEChunkParserTagger(ClassifierBasedTagger):
"""
The IOB tagger used by the chunk parser.
"""
-
def __init__(self, train):
ClassifierBasedTagger.__init__(
- self, train=train, classifier_builder=self._classifier_builder
- )
+ self, train=train,
+ classifier_builder=self._classifier_builder)
def _classifier_builder(self, train):
- return MaxentClassifier.train(
- train, algorithm="megam", gaussian_prior_sigma=1, trace=2
- )
+ return MaxentClassifier.train(train, algorithm='megam',
+ gaussian_prior_sigma=1,
+ trace=2)
def _english_wordlist(self):
try:
wl = self._en_wordlist
except AttributeError:
from nltk.corpus import words
-
- self._en_wordlist = set(words.words("en-basic"))
+ self._en_wordlist = set(words.words('en-basic'))
wl = self._en_wordlist
return wl
prevpos = prevprevpos = None
prevshape = prevtag = prevprevtag = None
elif index == 1:
- prevword = tokens[index - 1][0].lower()
+ prevword = tokens[index-1][0].lower()
prevprevword = None
- prevpos = simplify_pos(tokens[index - 1][1])
+ prevpos = simplify_pos(tokens[index-1][1])
prevprevpos = None
- prevtag = history[index - 1][0]
+ prevtag = history[index-1][0]
prevshape = prevprevtag = None
else:
- prevword = tokens[index - 1][0].lower()
- prevprevword = tokens[index - 2][0].lower()
- prevpos = simplify_pos(tokens[index - 1][1])
- prevprevpos = simplify_pos(tokens[index - 2][1])
- prevtag = history[index - 1]
- prevprevtag = history[index - 2]
+ prevword = tokens[index-1][0].lower()
+ prevprevword = tokens[index-2][0].lower()
+ prevpos = simplify_pos(tokens[index-1][1])
+ prevprevpos = simplify_pos(tokens[index-2][1])
+ prevtag = history[index-1]
+ prevprevtag = history[index-2]
prevshape = shape(prevword)
- if index == len(tokens) - 1:
+ if index == len(tokens)-1:
nextword = nextnextword = None
nextpos = nextnextpos = None
- elif index == len(tokens) - 2:
- nextword = tokens[index + 1][0].lower()
- nextpos = tokens[index + 1][1].lower()
+ elif index == len(tokens)-2:
+ nextword = tokens[index+1][0].lower()
+ nextpos = tokens[index+1][1].lower()
nextnextword = None
nextnextpos = None
else:
- nextword = tokens[index + 1][0].lower()
- nextpos = tokens[index + 1][1].lower()
- nextnextword = tokens[index + 2][0].lower()
- nextnextpos = tokens[index + 2][1].lower()
+ nextword = tokens[index+1][0].lower()
+ nextpos = tokens[index+1][1].lower()
+ nextnextword = tokens[index+2][0].lower()
+ nextnextpos = tokens[index+2][1].lower()
# 89.6
features = {
- "bias": True,
- "shape": shape(word),
- "wordlen": len(word),
- "prefix3": word[:3].lower(),
- "suffix3": word[-3:].lower(),
- "pos": pos,
- "word": word,
- "en-wordlist": (word in self._english_wordlist()),
- "prevtag": prevtag,
- "prevpos": prevpos,
- "nextpos": nextpos,
- "prevword": prevword,
- "nextword": nextword,
- "word+nextpos": "{0}+{1}".format(word.lower(), nextpos),
- "pos+prevtag": "{0}+{1}".format(pos, prevtag),
- "shape+prevtag": "{0}+{1}".format(prevshape, prevtag),
- }
+ 'bias': True,
+ 'shape': shape(word),
+ 'wordlen': len(word),
+ 'prefix3': word[:3].lower(),
+ 'suffix3': word[-3:].lower(),
+ 'pos': pos,
+ 'word': word,
+ 'en-wordlist': (word in self._english_wordlist()),
+ 'prevtag': prevtag,
+ 'prevpos': prevpos,
+ 'nextpos': nextpos,
+ 'prevword': prevword,
+ 'nextword': nextword,
+ 'word+nextpos': '{0}+{1}'.format(word.lower(), nextpos),
+ 'pos+prevtag': '{0}+{1}'.format(pos, prevtag),
+ 'shape+prevtag': '{0}+{1}'.format(prevshape, prevtag),
+ }
return features
-
class NEChunkParser(ChunkParserI):
"""
Expected input: list of pos-tagged words
"""
-
def __init__(self, train):
self._train(train)
"""
Convert a list of tagged tokens to a chunk-parse tree.
"""
- sent = Tree("S", [])
+ sent = Tree('S', [])
- for (tok, tag) in tagged_tokens:
- if tag == "O":
+ for (tok,tag) in tagged_tokens:
+ if tag == 'O':
sent.append(tok)
- elif tag.startswith("B-"):
+ elif tag.startswith('B-'):
sent.append(Tree(tag[2:], [tok]))
- elif tag.startswith("I-"):
- if sent and isinstance(sent[-1], Tree) and sent[-1].label() == tag[2:]:
+ elif tag.startswith('I-'):
+ if (sent and isinstance(sent[-1], Tree) and
+ sent[-1].label() == tag[2:]):
sent[-1].append(tok)
else:
sent.append(Tree(tag[2:], [tok]))
if len(child) == 0:
print("Warning -- empty chunk in sentence")
continue
- toks.append((child[0], "B-{0}".format(child.label())))
+ toks.append((child[0], 'B-{0}'.format(child.label())))
for tok in child[1:]:
- toks.append((tok, "I-{0}".format(child.label())))
+ toks.append((tok, 'I-{0}'.format(child.label())))
else:
- toks.append((child, "O"))
+ toks.append((child, 'O'))
return toks
-
def shape(word):
- if re.match("[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$", word, re.UNICODE):
- return "number"
- elif re.match("\W+$", word, re.UNICODE):
- return "punct"
- elif re.match("\w+$", word, re.UNICODE):
+ if re.match('[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$', word, re.UNICODE):
+ return 'number'
+ elif re.match('\W+$', word, re.UNICODE):
+ return 'punct'
+ elif re.match('\w+$', word, re.UNICODE):
if word.istitle():
- return "upcase"
+ return 'upcase'
elif word.islower():
- return "downcase"
+ return 'downcase'
else:
- return "mixedcase"
+ return 'mixedcase'
else:
- return "other"
-
+ return 'other'
def simplify_pos(s):
- if s.startswith("V"):
- return "V"
- else:
- return s.split("-")[0]
-
+ if s.startswith('V'): return "V"
+ else: return s.split('-')[0]
def postag_tree(tree):
# Part-of-speech tagging.
words = tree.leaves()
tag_iter = (pos for (word, pos) in pos_tag(words))
- newtree = Tree("S", [])
+ newtree = Tree('S', [])
for child in tree:
if isinstance(child, Tree):
newtree.append(Tree(child.label(), []))
for subchild in child:
- newtree[-1].append((subchild, next(tag_iter)))
+ newtree[-1].append( (subchild, next(tag_iter)) )
else:
- newtree.append((child, next(tag_iter)))
+ newtree.append( (child, next(tag_iter)) )
return newtree
-
-def load_ace_data(roots, fmt="binary", skip_bnews=True):
+def load_ace_data(roots, fmt='binary', skip_bnews=True):
for root in roots:
for root, dirs, files in os.walk(root):
- if root.endswith("bnews") and skip_bnews:
+ if root.endswith('bnews') and skip_bnews:
continue
for f in files:
- if f.endswith(".sgm"):
+ if f.endswith('.sgm'):
for sent in load_ace_file(os.path.join(root, f), fmt):
yield sent
-
def load_ace_file(textfile, fmt):
- print(" - {0}".format(os.path.split(textfile)[1]))
- annfile = textfile + ".tmx.rdc.xml"
+ print(' - {0}'.format(os.path.split(textfile)[1]))
+ annfile = textfile+'.tmx.rdc.xml'
# Read the xml file, and get a list of entities
entities = []
- with open(annfile, "r") as infile:
+ with open(annfile, 'r') as infile:
xml = ET.parse(infile).getroot()
- for entity in xml.findall("document/entity"):
- typ = entity.find("entity_type").text
- for mention in entity.findall("entity_mention"):
- if mention.get("TYPE") != "NAME":
- continue # only NEs
- s = int(mention.find("head/charseq/start").text)
- e = int(mention.find("head/charseq/end").text) + 1
- entities.append((s, e, typ))
+ for entity in xml.findall('document/entity'):
+ typ = entity.find('entity_type').text
+ for mention in entity.findall('entity_mention'):
+ if mention.get('TYPE') != 'NAME': continue # only NEs
+ s = int(mention.find('head/charseq/start').text)
+ e = int(mention.find('head/charseq/end').text)+1
+ entities.append( (s, e, typ) )
# Read the text file, and mark the entities.
- with open(textfile, "r") as infile:
+ with open(textfile, 'r') as infile:
text = infile.read()
# Strip XML tags, since they don't count towards the indices
- text = re.sub("<(?!/?TEXT)[^>]+>", "", text)
+ text = re.sub('<(?!/?TEXT)[^>]+>', '', text)
# Blank out anything before/after <TEXT>
- def subfunc(m):
- return " " * (m.end() - m.start() - 6)
-
- text = re.sub("[\s\S]*<TEXT>", subfunc, text)
- text = re.sub("</TEXT>[\s\S]*", "", text)
+ def subfunc(m): return ' '*(m.end()-m.start()-6)
+ text = re.sub('[\s\S]*<TEXT>', subfunc, text)
+ text = re.sub('</TEXT>[\s\S]*', '', text)
# Simplify quotes
text = re.sub("``", ' "', text)
text = re.sub("''", '" ', text)
- entity_types = set(typ for (s, e, typ) in entities)
+ entity_types = set(typ for (s,e,typ) in entities)
# Binary distinction (NE or not NE)
- if fmt == "binary":
+ if fmt == 'binary':
i = 0
- toks = Tree("S", [])
- for (s, e, typ) in sorted(entities):
- if s < i:
- s = i # Overlapping! Deal with this better?
- if e <= s:
- continue
+ toks = Tree('S', [])
+ for (s,e,typ) in sorted(entities):
+ if s < i: s = i # Overlapping! Deal with this better?
+ if e <= s: continue
toks.extend(word_tokenize(text[i:s]))
- toks.append(Tree("NE", text[s:e].split()))
+ toks.append(Tree('NE', text[s:e].split()))
i = e
toks.extend(word_tokenize(text[i:]))
yield toks
# Multiclass distinction (NE type)
- elif fmt == "multiclass":
+ elif fmt == 'multiclass':
i = 0
- toks = Tree("S", [])
- for (s, e, typ) in sorted(entities):
- if s < i:
- s = i # Overlapping! Deal with this better?
- if e <= s:
- continue
+ toks = Tree('S', [])
+ for (s,e,typ) in sorted(entities):
+ if s < i: s = i # Overlapping! Deal with this better?
+ if e <= s: continue
toks.extend(word_tokenize(text[i:s]))
toks.append(Tree(typ, text[s:e].split()))
i = e
yield toks
else:
- raise ValueError("bad fmt value")
-
+ raise ValueError('bad fmt value')
# This probably belongs in a more general-purpose location (as does
# the parse_to_tagged function).
guessed = NEChunkParser._parse_to_tagged(guessed)
ellipsis = False
for (w, ct), (w, gt) in zip(correct, guessed):
- if ct == gt == "O":
+ if ct == gt == 'O':
if not ellipsis:
print(" {:15} {:15} {2}".format(ct, gt, w))
- print(" {:15} {:15} {2}".format("...", "...", "..."))
+ print(' {0:15} {1:15} {2}'.format('...', '...', '...'))
ellipsis = True
else:
ellipsis = False
print(" {:15} {:15} {2}".format(ct, gt, w))
-
-def build_model(fmt="binary"):
- print("Loading training data...")
- train_paths = [
- find("corpora/ace_data/ace.dev"),
- find("corpora/ace_data/ace.heldout"),
- find("corpora/ace_data/bbn.dev"),
- find("corpora/ace_data/muc.dev"),
- ]
+def build_model(fmt='binary'):
+ print('Loading training data...')
+ train_paths = [find('corpora/ace_data/ace.dev'),
+ find('corpora/ace_data/ace.heldout'),
+ find('corpora/ace_data/bbn.dev'),
+ find('corpora/ace_data/muc.dev')]
train_trees = load_ace_data(train_paths, fmt)
train_data = [postag_tree(t) for t in train_trees]
- print("Training...")
+ print('Training...')
cp = NEChunkParser(train_data)
del train_data
- print("Loading eval data...")
- eval_paths = [find("corpora/ace_data/ace.eval")]
+ print('Loading eval data...')
+ eval_paths = [find('corpora/ace_data/ace.eval')]
eval_trees = load_ace_data(eval_paths, fmt)
eval_data = [postag_tree(t) for t in eval_trees]
- print("Evaluating...")
+ print('Evaluating...')
chunkscore = ChunkScore()
for i, correct in enumerate(eval_data):
guess = cp.parse(correct.leaves())
chunkscore.score(correct, guess)
- if i < 3:
- cmp_chunks(correct, guess)
+ if i < 3: cmp_chunks(correct, guess)
print(chunkscore)
- outfilename = "/tmp/ne_chunker_{0}.pickle".format(fmt)
- print("Saving chunker to {0}...".format(outfilename))
+ outfilename = '/tmp/ne_chunker_{0}.pickle'.format(fmt)
+ print('Saving chunker to {0}...'.format(outfilename))
- with open(outfilename, "wb") as outfile:
+ with open(outfilename, 'wb') as outfile:
pickle.dump(cp, outfile, -1)
return cp
-if __name__ == "__main__":
+if __name__ == '__main__':
# Make sure that the pickled object has the right class name:
from nltk.chunk.named_entity import build_model
- build_model("binary")
- build_model("multiclass")
+ build_model('binary')
+ build_model('multiclass')
+
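# Quick, illustrative check of the feature helpers defined above (assumes
# nltk is installed; shape() and simplify_pos() are module-level functions
# in nltk.chunk.named_entity):
from nltk.chunk.named_entity import shape, simplify_pos

print(shape("1999"), shape("London"), shape("iPhone"), shape("..."))
# -> number upcase mixedcase punct
print(simplify_pos("VBZ"), simplify_pos("NNP-HL"))
# -> V NNP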
# Natural Language Toolkit: Regular Expression Chunkers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com> (minor additions)
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals
+from __future__ import division
import re
+from six import string_types
+
from nltk.tree import Tree
from nltk.chunk.api import ChunkParserI
+from nltk.compat import python_2_unicode_compatible, unicode_repr
##//////////////////////////////////////////////////////
## ChunkString
##//////////////////////////////////////////////////////
-
+@python_2_unicode_compatible
class ChunkString(object):
"""
A string-based encoding of a particular chunking of a text.
:cvar IN_CHINK_PATTERN: A zero-width regexp pattern string that
will only match positions that are in chinks.
"""
- CHUNK_TAG_CHAR = r"[^\{\}<>]"
- CHUNK_TAG = r"(<%s+?>)" % CHUNK_TAG_CHAR
+ CHUNK_TAG_CHAR = r'[^\{\}<>]'
+ CHUNK_TAG = r'(<%s+?>)' % CHUNK_TAG_CHAR
-
- IN_CHUNK_PATTERN = r"(?=[^\{]*\})"
- IN_CHINK_PATTERN = r"(?=[^\}]*(\{|$))"
+ IN_CHUNK_PATTERN = r'(?=[^\{]*\})'
+ IN_CHINK_PATTERN = r'(?=[^\}]*(\{|$))'
# These are used by _verify
- _CHUNK = r"(\{%s+?\})+?" % CHUNK_TAG
- _CHINK = r"(%s+?)+?" % CHUNK_TAG
- _VALID = re.compile(r"^(\{?%s\}?)*?$" % CHUNK_TAG)
- _BRACKETS = re.compile("[^\{\}]+")
- _BALANCED_BRACKETS = re.compile(r"(\{\})*$")
+ _CHUNK = r'(\{%s+?\})+?' % CHUNK_TAG
+ _CHINK = r'(%s+?)+?' % CHUNK_TAG
+ _VALID = re.compile(r'^(\{?%s\}?)*?$' % CHUNK_TAG)
+ _BRACKETS = re.compile('[^\{\}]+')
+ _BALANCED_BRACKETS = re.compile(r'(\{\})*$')
def __init__(self, chunk_struct, debug_level=1):
"""
self._root_label = chunk_struct.label()
self._pieces = chunk_struct[:]
tags = [self._tag(tok) for tok in self._pieces]
- self._str = "<" + "><".join(tags) + ">"
+ self._str = '<' + '><'.join(tags) + '>'
self._debug = debug_level
def _tag(self, tok):
elif isinstance(tok, Tree):
return tok.label()
else:
- raise ValueError("chunk structures must contain tagged " "tokens or trees")
+ raise ValueError('chunk structures must contain tagged '
+ 'tokens or trees')
def _verify(self, s, verify_tags):
"""
"""
# Check overall form
if not ChunkString._VALID.match(s):
- raise ValueError(
- "Transformation generated invalid " "chunkstring:\n %s" % s
- )
+ raise ValueError('Transformation generated invalid '
+ 'chunkstring:\n %s' % s)
# Check that parens are balanced. If the string is long, we
# have to do this in pieces, to avoid a maximum recursion
# depth limit for regular expressions.
- brackets = ChunkString._BRACKETS.sub("", s)
+ brackets = ChunkString._BRACKETS.sub('', s)
for i in range(1 + len(brackets) // 5000):
- substr = brackets[i * 5000 : i * 5000 + 5000]
+ substr = brackets[i*5000:i*5000+5000]
if not ChunkString._BALANCED_BRACKETS.match(substr):
- raise ValueError(
- "Transformation generated invalid " "chunkstring:\n %s" % s
- )
+ raise ValueError('Transformation generated invalid '
+ 'chunkstring:\n %s' % s)
- if verify_tags <= 0:
- return
+ if verify_tags<=0: return
- tags1 = (re.split(r"[\{\}<>]+", s))[1:-1]
+ tags1 = (re.split(r'[\{\}<>]+', s))[1:-1]
tags2 = [self._tag(piece) for piece in self._pieces]
if tags1 != tags2:
- raise ValueError(
- "Transformation generated invalid " "chunkstring: tag changed"
- )
+ raise ValueError('Transformation generated invalid '
+ 'chunkstring: tag changed')
- def to_chunkstruct(self, chunk_label="CHUNK"):
+ def to_chunkstruct(self, chunk_label='CHUNK'):
"""
Return the chunk structure encoded by this ``ChunkString``.
:raise ValueError: If a transformation has generated an
invalid chunkstring.
"""
- if self._debug > 0:
- self._verify(self._str, 1)
+ if self._debug > 0: self._verify(self._str, 1)
# Use this alternating list to create the chunkstruct.
pieces = []
index = 0
piece_in_chunk = 0
- for piece in re.split("[{}]", self._str):
+ for piece in re.split('[{}]', self._str):
# Find the list of tokens contained in this piece.
- length = piece.count("<")
- subsequence = self._pieces[index : index + length]
+ length = piece.count('<')
+ subsequence = self._pieces[index:index+length]
# Add this list of tokens to our pieces.
if piece_in_chunk:
# The substitution might have generated "empty chunks"
# (substrings of the form "{}"). Remove them, so they don't
# interfere with other transformations.
- s = re.sub("\{\}", "", s)
+ s = re.sub('\{\}', '', s)
# Make sure that the transformation was legal.
- if self._debug > 1:
- self._verify(s, self._debug - 2)
+ if self._debug > 1: self._verify(s, self._debug-2)
# Commit the transformation.
self._str = s
:rtype: str
"""
- return "<ChunkString: %s>" % repr(self._str)
+ return '<ChunkString: %s>' % unicode_repr(self._str)
def __str__(self):
"""
:rtype: str
"""
# Add spaces to make everything line up.
- str = re.sub(r">(?!\})", r"> ", self._str)
- str = re.sub(r"([^\{])<", r"\1 <", str)
- if str[0] == "<":
- str = " " + str
+ str = re.sub(r'>(?!\})', r'> ', self._str)
+ str = re.sub(r'([^\{])<', r'\1 <', str)
+ if str[0] == '<': str = ' ' + str
return str
-
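# Illustrative sketch of the encoding (assumes nltk is installed): the tags
# become the string "<DT><NN><VBD>", a rule rewrites braces into that
# string, and to_chunkstruct() converts the result back into a tree.
# ChunkRule is defined further below.
from nltk.tree import Tree
from nltk.chunk.regexp import ChunkString, ChunkRule

cs = ChunkString(Tree('S', [('the', 'DT'), ('cat', 'NN'), ('sat', 'VBD')]))
ChunkRule('<DT><NN>', 'chunk determiner + noun').apply(cs)
print(cs.to_chunkstruct())
# -> (S (CHUNK the/DT cat/NN) sat/VBD)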
##//////////////////////////////////////////////////////
## Chunking Rules
##//////////////////////////////////////////////////////
-
+@python_2_unicode_compatible
class RegexpChunkRule(object):
"""
A rule specifying how to modify the chunking in a ``ChunkString``,
of angle-bracket delimited tags. Furthermore, this transformation
may not result in nested or mismatched bracketing.
"""
-
def __init__(self, regexp, repl, descr):
"""
Construct a new RegexpChunkRule.
:param descr: A short description of the purpose and/or effect
of this rule.
"""
- if isinstance(regexp, str):
+ if isinstance(regexp, string_types):
regexp = re.compile(regexp)
self._repl = repl
self._descr = descr
:rtype: str
"""
- return (
- "<RegexpChunkRule: "
- + repr(self._regexp.pattern)
- + "->"
- + repr(self._repl)
- + ">"
- )
+ return ('<RegexpChunkRule: '+unicode_repr(self._regexp.pattern)+
+ '->'+unicode_repr(self._repl)+'>')
@staticmethod
def fromstring(s):
<ChunkRule: '<DT>?<NN.*>+'>
"""
# Split off the comment (but don't split on '\#')
- m = re.match(r"(?P<rule>(\\.|[^#])*)(?P<comment>#.*)?", s)
- rule = m.group("rule").strip()
- comment = (m.group("comment") or "")[1:].strip()
+ m = re.match(r'(?P<rule>(\\.|[^#])*)(?P<comment>#.*)?', s)
+ rule = m.group('rule').strip()
+ comment = (m.group('comment') or '')[1:].strip()
# Pattern bodies: chunk, chink, split, merge
try:
if not rule:
- raise ValueError("Empty chunk pattern")
- if rule[0] == "{" and rule[-1] == "}":
+ raise ValueError('Empty chunk pattern')
+ if rule[0] == '{' and rule[-1] == '}':
return ChunkRule(rule[1:-1], comment)
- elif rule[0] == "}" and rule[-1] == "{":
+ elif rule[0] == '}' and rule[-1] == '{':
return ChinkRule(rule[1:-1], comment)
- elif "}{" in rule:
- left, right = rule.split("}{")
+ elif '}{' in rule:
+ left, right = rule.split('}{')
return SplitRule(left, right, comment)
- elif "{}" in rule:
- left, right = rule.split("{}")
+ elif '{}' in rule:
+ left, right = rule.split('{}')
return MergeRule(left, right, comment)
- elif re.match("[^{}]*{[^{}]*}[^{}]*", rule):
- left, chunk, right = re.split("[{}]", rule)
+ elif re.match('[^{}]*{[^{}]*}[^{}]*', rule):
+ left, chunk, right = re.split('[{}]', rule)
return ChunkRuleWithContext(left, chunk, right, comment)
else:
- raise ValueError("Illegal chunk pattern: %s" % rule)
+ raise ValueError('Illegal chunk pattern: %s' % rule)
except (ValueError, re.error):
- raise ValueError("Illegal chunk pattern: %s" % rule)
+ raise ValueError('Illegal chunk pattern: %s' % rule)
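# The four rule syntaxes accepted by fromstring(), parsed (illustrative;
# a fifth form, left{body}right, yields ChunkRuleWithContext):
from nltk.chunk.regexp import RegexpChunkRule

print(RegexpChunkRule.fromstring('{<DT>?<NN.*>+}'))   # -> ChunkRule
print(RegexpChunkRule.fromstring('}<VBD|IN>{'))       # -> ChinkRule
print(RegexpChunkRule.fromstring('<NN.*>}{<DT>'))     # -> SplitRule
print(RegexpChunkRule.fromstring('<NN.*>{}<NN.*>'))   # -> MergeRule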
+@python_2_unicode_compatible
class ChunkRule(RegexpChunkRule):
"""
A rule specifying how to add chunks to a ``ChunkString``, using a
already part of a chunk, and create a new chunk containing that
substring.
"""
-
def __init__(self, tag_pattern, descr):
"""
of this rule.
"""
self._pattern = tag_pattern
- regexp = re.compile(
- "(?P<chunk>%s)%s"
- % (tag_pattern2re_pattern(tag_pattern), ChunkString.IN_CHINK_PATTERN)
- )
- RegexpChunkRule.__init__(self, regexp, "{\g<chunk>}", descr)
+ regexp = re.compile('(?P<chunk>%s)%s' %
+ (tag_pattern2re_pattern(tag_pattern),
+ ChunkString.IN_CHINK_PATTERN))
+ RegexpChunkRule.__init__(self, regexp, '{\g<chunk>}', descr)
def __repr__(self):
"""
:rtype: str
"""
- return "<ChunkRule: " + repr(self._pattern) + ">"
-
+ return '<ChunkRule: '+unicode_repr(self._pattern)+'>'
+@python_2_unicode_compatible
class ChinkRule(RegexpChunkRule):
"""
A rule specifying how to remove chinks to a ``ChunkString``,
tag pattern and that is contained in a chunk, and remove it
from that chunk, thus creating two new chunks.
"""
-
def __init__(self, tag_pattern, descr):
"""
Construct a new ``ChinkRule``.
of this rule.
"""
self._pattern = tag_pattern
- regexp = re.compile(
- "(?P<chink>%s)%s"
- % (tag_pattern2re_pattern(tag_pattern), ChunkString.IN_CHUNK_PATTERN)
- )
- RegexpChunkRule.__init__(self, regexp, "}\g<chink>{", descr)
+ regexp = re.compile('(?P<chink>%s)%s' %
+ (tag_pattern2re_pattern(tag_pattern),
+ ChunkString.IN_CHUNK_PATTERN))
+ RegexpChunkRule.__init__(self, regexp, '}\g<chink>{', descr)
def __repr__(self):
"""
:rtype: str
"""
- return "<ChinkRule: " + repr(self._pattern) + ">"
+ return '<ChinkRule: '+unicode_repr(self._pattern)+'>'
+@python_2_unicode_compatible
class UnChunkRule(RegexpChunkRule):
"""
A rule specifying how to remove chunks to a ``ChunkString``,
``ChunkString``, it will find any complete chunk that matches this
tag pattern, and un-chunk it.
"""
-
def __init__(self, tag_pattern, descr):
"""
Construct a new ``UnChunkRule``.
of this rule.
"""
self._pattern = tag_pattern
- regexp = re.compile("\{(?P<chunk>%s)\}" % tag_pattern2re_pattern(tag_pattern))
- RegexpChunkRule.__init__(self, regexp, "\g<chunk>", descr)
+ regexp = re.compile('\{(?P<chunk>%s)\}' %
+ tag_pattern2re_pattern(tag_pattern))
+ RegexpChunkRule.__init__(self, regexp, '\g<chunk>', descr)
def __repr__(self):
"""
:rtype: str
"""
- return "<UnChunkRule: " + repr(self._pattern) + ">"
+ return '<UnChunkRule: '+unicode_repr(self._pattern)+'>'
+@python_2_unicode_compatible
class MergeRule(RegexpChunkRule):
"""
A rule specifying how to merge chunks in a ``ChunkString``, using
beginning matches right pattern. It will then merge those two
chunks into a single chunk.
"""
-
def __init__(self, left_tag_pattern, right_tag_pattern, descr):
"""
Construct a new ``MergeRule``.
self._left_tag_pattern = left_tag_pattern
self._right_tag_pattern = right_tag_pattern
- regexp = re.compile(
- "(?P<left>%s)}{(?=%s)"
- % (
- tag_pattern2re_pattern(left_tag_pattern),
- tag_pattern2re_pattern(right_tag_pattern),
- )
- )
- RegexpChunkRule.__init__(self, regexp, "\g<left>", descr)
+ regexp = re.compile('(?P<left>%s)}{(?=%s)' %
+ (tag_pattern2re_pattern(left_tag_pattern),
+ tag_pattern2re_pattern(right_tag_pattern)))
+ RegexpChunkRule.__init__(self, regexp, '\g<left>', descr)
def __repr__(self):
"""
:rtype: str
"""
- return (
- "<MergeRule: "
- + repr(self._left_tag_pattern)
- + ", "
- + repr(self._right_tag_pattern)
- + ">"
- )
+ return ('<MergeRule: '+unicode_repr(self._left_tag_pattern)+', '+
+ unicode_repr(self._right_tag_pattern)+'>')
+@python_2_unicode_compatible
class SplitRule(RegexpChunkRule):
"""
A rule specifying how to split chunks in a ``ChunkString``, using
then split the chunk into two new chunks, at the point between the
two pattern matches.
"""
-
def __init__(self, left_tag_pattern, right_tag_pattern, descr):
"""
Construct a new ``SplitRule``.
self._left_tag_pattern = left_tag_pattern
self._right_tag_pattern = right_tag_pattern
- regexp = re.compile(
- "(?P<left>%s)(?=%s)"
- % (
- tag_pattern2re_pattern(left_tag_pattern),
- tag_pattern2re_pattern(right_tag_pattern),
- )
- )
- RegexpChunkRule.__init__(self, regexp, r"\g<left>}{", descr)
+ regexp = re.compile('(?P<left>%s)(?=%s)' %
+ (tag_pattern2re_pattern(left_tag_pattern),
+ tag_pattern2re_pattern(right_tag_pattern)))
+ RegexpChunkRule.__init__(self, regexp, r'\g<left>}{', descr)
def __repr__(self):
"""
:rtype: str
"""
- return (
- "<SplitRule: "
- + repr(self._left_tag_pattern)
- + ", "
- + repr(self._right_tag_pattern)
- + ">"
- )
+ return ('<SplitRule: '+unicode_repr(self._left_tag_pattern)+', '+
+ unicode_repr(self._right_tag_pattern)+'>')
+@python_2_unicode_compatible
class ExpandLeftRule(RegexpChunkRule):
"""
A rule specifying how to expand chunks in a ``ChunkString`` to the left,
end matches left pattern. It will then expand the chunk to incorporate
the new material on the left.
"""
-
def __init__(self, left_tag_pattern, right_tag_pattern, descr):
"""
Construct a new ``ExpandRightRule``.
self._left_tag_pattern = left_tag_pattern
self._right_tag_pattern = right_tag_pattern
- regexp = re.compile(
- "(?P<left>%s)\{(?P<right>%s)"
- % (
- tag_pattern2re_pattern(left_tag_pattern),
- tag_pattern2re_pattern(right_tag_pattern),
- )
- )
- RegexpChunkRule.__init__(self, regexp, "{\g<left>\g<right>", descr)
+ regexp = re.compile('(?P<left>%s)\{(?P<right>%s)' %
+ (tag_pattern2re_pattern(left_tag_pattern),
+ tag_pattern2re_pattern(right_tag_pattern)))
+ RegexpChunkRule.__init__(self, regexp, '{\g<left>\g<right>', descr)
def __repr__(self):
"""
:rtype: str
"""
- return (
- "<ExpandLeftRule: "
- + repr(self._left_tag_pattern)
- + ", "
- + repr(self._right_tag_pattern)
- + ">"
- )
+ return ('<ExpandLeftRule: '+unicode_repr(self._left_tag_pattern)+', '+
+ unicode_repr(self._right_tag_pattern)+'>')
+@python_2_unicode_compatible
class ExpandRightRule(RegexpChunkRule):
"""
A rule specifying how to expand chunks in a ``ChunkString`` to the
a chink whose beginning matches right pattern. It will then
expand the chunk to incorporate the new material on the right.
"""
-
def __init__(self, left_tag_pattern, right_tag_pattern, descr):
"""
Construct a new ``ExpandRightRule``.
self._left_tag_pattern = left_tag_pattern
self._right_tag_pattern = right_tag_pattern
- regexp = re.compile(
- "(?P<left>%s)\}(?P<right>%s)"
- % (
- tag_pattern2re_pattern(left_tag_pattern),
- tag_pattern2re_pattern(right_tag_pattern),
- )
- )
- RegexpChunkRule.__init__(self, regexp, "\g<left>\g<right>}", descr)
+ regexp = re.compile('(?P<left>%s)\}(?P<right>%s)' %
+ (tag_pattern2re_pattern(left_tag_pattern),
+ tag_pattern2re_pattern(right_tag_pattern)))
+ RegexpChunkRule.__init__(self, regexp, '\g<left>\g<right>}', descr)
def __repr__(self):
"""
:rtype: str
"""
- return (
- "<ExpandRightRule: "
- + repr(self._left_tag_pattern)
- + ", "
- + repr(self._right_tag_pattern)
- + ">"
- )
+ return ('<ExpandRightRule: '+unicode_repr(self._left_tag_pattern)+', '+
+ unicode_repr(self._right_tag_pattern)+'>')
+@python_2_unicode_compatible
class ChunkRuleWithContext(RegexpChunkRule):
"""
A rule specifying how to add chunks to a ``ChunkString``, using
rule matches; therefore, if you need to find overlapping matches,
you will need to apply your rule more than once.
"""
-
- def __init__(
- self,
- left_context_tag_pattern,
- chunk_tag_pattern,
- right_context_tag_pattern,
- descr,
- ):
+ def __init__(self, left_context_tag_pattern, chunk_tag_pattern,
+ right_context_tag_pattern, descr):
"""
Construct a new ``ChunkRuleWithContext``.
self._left_context_tag_pattern = left_context_tag_pattern
self._chunk_tag_pattern = chunk_tag_pattern
self._right_context_tag_pattern = right_context_tag_pattern
- regexp = re.compile(
- "(?P<left>%s)(?P<chunk>%s)(?P<right>%s)%s"
- % (
- tag_pattern2re_pattern(left_context_tag_pattern),
- tag_pattern2re_pattern(chunk_tag_pattern),
- tag_pattern2re_pattern(right_context_tag_pattern),
- ChunkString.IN_CHINK_PATTERN,
- )
- )
- replacement = r"\g<left>{\g<chunk>}\g<right>"
+ regexp = re.compile('(?P<left>%s)(?P<chunk>%s)(?P<right>%s)%s' %
+ (tag_pattern2re_pattern(left_context_tag_pattern),
+ tag_pattern2re_pattern(chunk_tag_pattern),
+ tag_pattern2re_pattern(right_context_tag_pattern),
+ ChunkString.IN_CHINK_PATTERN))
+ replacement = r'\g<left>{\g<chunk>}\g<right>'
RegexpChunkRule.__init__(self, regexp, replacement, descr)
def __repr__(self):
:rtype: str
"""
- return "<ChunkRuleWithContext: %r, %r, %r>" % (
- self._left_context_tag_pattern,
- self._chunk_tag_pattern,
- self._right_context_tag_pattern,
- )
-
+ return '<ChunkRuleWithContext: %r, %r, %r>' % (
+ self._left_context_tag_pattern, self._chunk_tag_pattern,
+ self._right_context_tag_pattern)
##//////////////////////////////////////////////////////
## Tag Pattern Format Conversion
# this should probably be made more strict than it is -- e.g., it
# currently accepts 'foo'.
-CHUNK_TAG_PATTERN = re.compile(
- r"^((%s|<%s>)*)$" % ("([^\{\}<>]|\{\d+,?\}|\{\d*,\d+\})+", "[^\{\}<>]+")
-)
+CHUNK_TAG_PATTERN = re.compile(r'^((%s|<%s>)*)$' %
+ ('([^\{\}<>]|\{\d+,?\}|\{\d*,\d+\})+',
+ '[^\{\}<>]+'))
+
+
+
def tag_pattern2re_pattern(tag_pattern):
``tag_pattern``.
"""
# Clean up the regular expression
- tag_pattern = re.sub(r"\s", "", tag_pattern)
- tag_pattern = re.sub(r"<", "(<(", tag_pattern)
- tag_pattern = re.sub(r">", ")>)", tag_pattern)
+ tag_pattern = re.sub(r'\s', '', tag_pattern)
+ tag_pattern = re.sub(r'<', '(<(', tag_pattern)
+ tag_pattern = re.sub(r'>', ')>)', tag_pattern)
# Check the regular expression
if not CHUNK_TAG_PATTERN.match(tag_pattern):
- raise ValueError("Bad tag pattern: %r" % tag_pattern)
+ raise ValueError('Bad tag pattern: %r' % tag_pattern)
# Replace "." with CHUNK_TAG_CHAR.
# We have to do this after, since it adds {}[]<>s, which would
def reverse_str(str):
lst = list(str)
lst.reverse()
- return "".join(lst)
-
+ return ''.join(lst)
tc_rev = reverse_str(ChunkString.CHUNK_TAG_CHAR)
reversed = reverse_str(tag_pattern)
- reversed = re.sub(r"\.(?!\\(\\\\)*($|[^\\]))", tc_rev, reversed)
+ reversed = re.sub(r'\.(?!\\(\\\\)*($|[^\\]))', tc_rev, reversed)
tag_pattern = reverse_str(reversed)
return tag_pattern
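# Illustrative conversion: each angle-bracketed tag is wrapped in two
# capturing groups, matching the ChunkString encoding above.
from nltk.chunk.regexp import tag_pattern2re_pattern
print(tag_pattern2re_pattern('<NN>+'))
# -> (<(NN)>)+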
## RegexpChunkParser
##//////////////////////////////////////////////////////
-
+@python_2_unicode_compatible
class RegexpChunkParser(ChunkParserI):
"""
A regular expression based chunk parser. ``RegexpChunkParser`` uses a
:ivar _trace: The default level of tracing.
"""
-
- def __init__(self, rules, chunk_label="NP", root_label="S", trace=0):
+ def __init__(self, rules, chunk_label='NP', root_label='S', trace=0):
"""
Construct a new ``RegexpChunkParser``.
:param verbose: Whether output should be verbose.
:rtype: None
"""
- print("# Input:")
+ print('# Input:')
print(chunkstr)
for rule in self._rules:
rule.apply(chunkstr)
if verbose:
- print("#", rule.descr() + " (" + repr(rule) + "):")
+ print('#', rule.descr()+' ('+unicode_repr(rule)+'):')
else:
- print("#", rule.descr() + ":")
+ print('#', rule.descr()+':')
print(chunkstr)
def _notrace_apply(self, chunkstr):
used to define this ``RegexpChunkParser``.
"""
if len(chunk_struct) == 0:
- print("Warning: parsing empty text")
+ print('Warning: parsing empty text')
return Tree(self._root_label, [])
try:
chunk_struct = Tree(self._root_label, chunk_struct)
# Use the default trace value?
- if trace is None:
- trace = self._trace
+ if trace is None: trace = self._trace
chunkstr = ChunkString(chunk_struct)
# Apply the sequence of rules to the chunkstring.
if trace:
- verbose = trace > 1
+ verbose = (trace>1)
self._trace_apply(chunkstr, verbose)
else:
self._notrace_apply(chunkstr)
for rule in self._rules:
margin = max(margin, len(rule.descr()))
if margin < 35:
- format = " %" + repr(-(margin + 3)) + "s%s\n"
+ format = " %" + repr(-(margin+3)) + "s%s\n"
else:
format = " %s\n %s\n"
for rule in self._rules:
- s += format % (rule.descr(), repr(rule))
+ s += format % (rule.descr(), unicode_repr(rule))
return s[:-1]
-
##//////////////////////////////////////////////////////
## Chunk Grammar
##//////////////////////////////////////////////////////
-
+@python_2_unicode_compatible
class RegexpParser(ChunkParserI):
"""
A grammar based chunk parser. ``chunk.RegexpParser`` uses a set of
:ivar _stages: The list of parsing stages corresponding to the grammar
"""
-
- def __init__(self, grammar, root_label="S", loop=1, trace=0):
+ def __init__(self, grammar, root_label='S', loop=1, trace=0):
"""
Create a new chunk parser, from the given start state
and set of chunk patterns.
self._grammar = grammar
self._loop = loop
- if isinstance(grammar, str):
+ if isinstance(grammar, string_types):
self._read_grammar(grammar, root_label, trace)
else:
 # Make sure the grammar looks like it has the right type:
- type_err = (
- "Expected string or list of RegexpChunkParsers " "for the grammar."
- )
- try:
- grammar = list(grammar)
- except:
- raise TypeError(type_err)
+ type_err = ('Expected string or list of RegexpChunkParsers '
+ 'for the grammar.')
+ try: grammar = list(grammar)
+ except: raise TypeError(type_err)
for elt in grammar:
if not isinstance(elt, RegexpChunkParser):
raise TypeError(type_err)
"""
rules = []
lhs = None
- for line in grammar.split("\n"):
+ for line in grammar.split('\n'):
line = line.strip()
# New stage begins if there's an unescaped ':'
- m = re.match("(?P<nonterminal>(\\.|[^:])*)(:(?P<rule>.*))", line)
+ m = re.match('(?P<nonterminal>(\\.|[^:])*)(:(?P<rule>.*))', line)
if m:
# Record the stage that we just completed.
self._add_stage(rules, lhs, root_label, trace)
# Start a new stage.
- lhs = m.group("nonterminal").strip()
+ lhs = m.group('nonterminal').strip()
rules = []
- line = m.group("rule").strip()
+ line = m.group('rule').strip()
# Skip blank & comment-only lines
- if line == "" or line.startswith("#"):
- continue
+ if line=='' or line.startswith('#'): continue
# Add the rule
rules.append(RegexpChunkRule.fromstring(line))
"""
if rules != []:
if not lhs:
- raise ValueError("Expected stage marker (eg NP:)")
- parser = RegexpChunkParser(
- rules, chunk_label=lhs, root_label=root_label, trace=trace
- )
+ raise ValueError('Expected stage marker (eg NP:)')
+ parser = RegexpChunkParser(rules, chunk_label=lhs,
+ root_label=root_label, trace=trace)
self._stages.append(parser)
def parse(self, chunk_struct, trace=None):
:return: the chunked output.
:rtype: Tree
"""
- if trace is None:
- trace = self._trace
+ if trace is None: trace = self._trace
for i in range(self._loop):
for parser in self._stages:
chunk_struct = parser.parse(chunk_struct, trace=trace)
s += "%s\n" % parser
return s[:-1]
-
##//////////////////////////////////////////////////////
## Demonstration code
##//////////////////////////////////////////////////////
-
def demo_eval(chunkparser, text):
"""
Demonstration code for evaluating a chunk parser, using a
# Evaluate our chunk parser.
chunkscore = chunk.ChunkScore()
- for sentence in text.split("\n"):
+ for sentence in text.split('\n'):
print(sentence)
sentence = sentence.strip()
- if not sentence:
- continue
+ if not sentence: continue
gold = chunk.tagstr2tree(sentence)
tokens = gold.leaves()
- test = chunkparser.parse(Tree("S", tokens), trace=1)
+ test = chunkparser.parse(Tree('S', tokens), trace=1)
chunkscore.score(gold, test)
print()
- print("/" + ("=" * 75) + "\\")
- print("Scoring", chunkparser)
- print(("-" * 77))
- print("Precision: %5.1f%%" % (chunkscore.precision() * 100), " " * 4, end=" ")
- print("Recall: %5.1f%%" % (chunkscore.recall() * 100), " " * 6, end=" ")
- print("F-Measure: %5.1f%%" % (chunkscore.f_measure() * 100))
+ print('/'+('='*75)+'\\')
+ print('Scoring', chunkparser)
+ print(('-'*77))
+ print('Precision: %5.1f%%' % (chunkscore.precision()*100), ' '*4, end=' ')
+ print('Recall: %5.1f%%' % (chunkscore.recall()*100), ' '*6, end=' ')
+ print('F-Measure: %5.1f%%' % (chunkscore.f_measure()*100))
+
# Missed chunks.
if chunkscore.missed():
- print("Missed:")
+ print('Missed:')
missed = chunkscore.missed()
for chunk in missed[:10]:
- print(" ", " ".join(map(str, chunk)))
+ print(' ', ' '.join(map(str,chunk)))
if len(chunkscore.missed()) > 10:
- print(" ...")
+ print(' ...')
# Incorrect chunks.
if chunkscore.incorrect():
- print("Incorrect:")
+ print('Incorrect:')
incorrect = chunkscore.incorrect()
for chunk in incorrect[:10]:
- print(" ", " ".join(map(str, chunk)))
+ print(' ', ' '.join(map(str,chunk)))
if len(chunkscore.incorrect()) > 10:
- print(" ...")
+ print(' ...')
- print("\\" + ("=" * 75) + "/")
+ print('\\'+('='*75)+'/')
print()
-
def demo():
"""
A demonstration for the ``RegexpChunkParser`` class. A single text is
[ John/NNP ] thinks/VBZ [ Mary/NN ] saw/VBD [ the/DT cat/NN ] sit/VB on/IN [ the/DT mat/NN ]./.
"""
- print("*" * 75)
- print("Evaluation text:")
+ print('*'*75)
+ print('Evaluation text:')
print(text)
- print("*" * 75)
+ print('*'*75)
print()
grammar = r"""
cp = chunk.RegexpParser(grammar)
demo_eval(cp, text)
- # Evaluation
+# Evaluation
from nltk.corpus import conll2000
print("Demonstration of empty grammar:")
cp = chunk.RegexpParser("")
- print(chunk.accuracy(cp, conll2000.chunked_sents("test.txt", chunk_types=("NP",))))
+ print(chunk.accuracy(cp, conll2000.chunked_sents('test.txt',
+ chunk_types=('NP',))))
print()
print("Demonstration of accuracy evaluation using CoNLL tags:")
<DT|JJ>{}<NN.*> # merge det/adj with nouns
"""
cp = chunk.RegexpParser(grammar)
- print(chunk.accuracy(cp, conll2000.chunked_sents("test.txt")[:5]))
+ print(chunk.accuracy(cp, conll2000.chunked_sents('test.txt')[:5]))
print()
print("Demonstration of tagged token input")
VP: {<VB.*><NP|PP>*} # VP = verb words + NPs and PPs
"""
cp = chunk.RegexpParser(grammar)
- print(
- cp.parse(
- [
- ("the", "DT"),
- ("little", "JJ"),
- ("cat", "NN"),
- ("sat", "VBD"),
- ("on", "IN"),
- ("the", "DT"),
- ("mat", "NN"),
- (".", "."),
- ]
- )
- )
-
-
-if __name__ == "__main__":
+ print(cp.parse([("the","DT"), ("little","JJ"), ("cat", "NN"),
+ ("sat", "VBD"), ("on", "IN"), ("the", "DT"),
+ ("mat", "NN"), (".", ".")]))
+
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: Chunk format conversions
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com> (minor additions)
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals, division
import re
from nltk.tree import Tree
from nltk.tag.mapping import map_tag
from nltk.tag.util import str2tuple
+from nltk.compat import python_2_unicode_compatible
##//////////////////////////////////////////////////////
## EVALUATION
##//////////////////////////////////////////////////////
from nltk.metrics import accuracy as _accuracy
-
-
def accuracy(chunker, gold):
"""
Score the accuracy of the chunker against the gold standard.
gold_tags += tree2conlltags(gold_tree)
test_tags += tree2conlltags(test_tree)
- # print 'GOLD:', gold_tags[:50]
- # print 'TEST:', test_tags[:50]
+# print 'GOLD:', gold_tags[:50]
+# print 'TEST:', test_tags[:50]
return _accuracy(gold_tags, test_tags)
:type _fn_num: int
:ivar _fn_num: Number of false negatives.
"""
-
def __init__(self, **kwargs):
self._correct = set()
self._guessed = set()
self._tp = set()
self._fp = set()
self._fn = set()
- self._max_tp = kwargs.get("max_tp_examples", 100)
- self._max_fp = kwargs.get("max_fp_examples", 100)
- self._max_fn = kwargs.get("max_fn_examples", 100)
- self._chunk_label = kwargs.get("chunk_label", ".*")
+ self._max_tp = kwargs.get('max_tp_examples', 100)
+ self._max_fp = kwargs.get('max_fp_examples', 100)
+ self._max_fn = kwargs.get('max_fn_examples', 100)
+ self._chunk_label = kwargs.get('chunk_label', '.*')
self._tp_num = 0
self._fp_num = 0
self._fn_num = 0
self._measuresNeedUpdate = False
def _updateMeasures(self):
- if self._measuresNeedUpdate:
- self._tp = self._guessed & self._correct
- self._fn = self._correct - self._guessed
- self._fp = self._guessed - self._correct
- self._tp_num = len(self._tp)
- self._fp_num = len(self._fp)
- self._fn_num = len(self._fn)
- self._measuresNeedUpdate = False
+ if (self._measuresNeedUpdate):
+ self._tp = self._guessed & self._correct
+ self._fn = self._correct - self._guessed
+ self._fp = self._guessed - self._correct
+ self._tp_num = len(self._tp)
+ self._fp_num = len(self._fp)
+ self._fn_num = len(self._fn)
+ self._measuresNeedUpdate = False
def score(self, correct, guessed):
"""
# is too deeply nested to be printed in CoNLL format."
correct_tags = guessed_tags = ()
self._tags_total += len(correct_tags)
- self._tags_correct += sum(
- 1 for (t, g) in zip(guessed_tags, correct_tags) if t == g
- )
+ self._tags_correct += sum(1 for (t,g) in zip(guessed_tags,
+ correct_tags)
+ if t==g)
def accuracy(self):
"""
:rtype: float
"""
- if self._tags_total == 0:
- return 1
- return self._tags_correct / self._tags_total
+ if self._tags_total == 0: return 1
+ return self._tags_correct/self._tags_total
def precision(self):
"""
"""
self._updateMeasures()
div = self._tp_num + self._fp_num
- if div == 0:
- return 0
- else:
- return self._tp_num / div
+ if div == 0: return 0
+ else: return self._tp_num / div
def recall(self):
"""
"""
self._updateMeasures()
div = self._tp_num + self._fn_num
- if div == 0:
- return 0
- else:
- return self._tp_num / div
+ if div == 0: return 0
+ else: return self._tp_num / div
def f_measure(self, alpha=0.5):
"""
self._updateMeasures()
p = self.precision()
r = self.recall()
- if p == 0 or r == 0: # what if alpha is 0 or 1?
+ if p == 0 or r == 0: # what if alpha is 0 or 1?
return 0
- return 1 / (alpha / p + (1 - alpha) / r)
+ return 1/(alpha/p + (1-alpha)/r)
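 # Worked example with the default alpha = 0.5 (the harmonic mean):
 # p = 0.5, r = 1.0 -> f = 1 / (0.5/0.5 + 0.5/1.0) = 1/1.5 ~= 0.667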
def missed(self):
"""
:rtype: str
"""
- return "<ChunkScoring of " + repr(len(self)) + " chunks>"
+ return '<ChunkScoring of '+repr(len(self))+' chunks>'
def __str__(self):
"""
:rtype: str
"""
- return (
- "ChunkParse score:\n"
- + (" IOB Accuracy: {:5.1f}%%\n".format(self.accuracy() * 100))
- + (" Precision: {:5.1f}%%\n".format(self.precision() * 100))
- + (" Recall: {:5.1f}%%\n".format(self.recall() * 100))
- + (" F-Measure: {:5.1f}%%".format(self.f_measure() * 100))
- )
-
+ return ("ChunkParse score:\n" +
+ (" IOB Accuracy: {:5.1f}%%\n".format(self.accuracy()*100)) +
+ (" Precision: {:5.1f}%%\n".format(self.precision()*100)) +
+ (" Recall: {:5.1f}%%\n".format(self.recall()*100))+
+ (" F-Measure: {:5.1f}%%".format(self.f_measure()*100)))
# extract chunks, and assign unique id, the absolute position of
# the first word of the chunk
return set(chunks)
-def tagstr2tree(
- s, chunk_label="NP", root_label="S", sep="/", source_tagset=None, target_tagset=None
-):
+def tagstr2tree(s, chunk_label="NP", root_label="S", sep='/',
+ source_tagset=None, target_tagset=None):
"""
Divide a string of bracketted tagged text into
chunks and unchunked tokens, and produce a Tree.
:rtype: Tree
"""
- WORD_OR_BRACKET = re.compile(r"\[|\]|[^\[\]\s]+")
+ WORD_OR_BRACKET = re.compile(r'\[|\]|[^\[\]\s]+')
stack = [Tree(root_label, [])]
for match in WORD_OR_BRACKET.finditer(s):
text = match.group()
- if text[0] == "[":
+ if text[0] == '[':
if len(stack) != 1:
- raise ValueError("Unexpected [ at char {:d}".format(match.start()))
+ raise ValueError('Unexpected [ at char {:d}'.format(match.start()))
chunk = Tree(chunk_label, [])
stack[-1].append(chunk)
stack.append(chunk)
- elif text[0] == "]":
+ elif text[0] == ']':
if len(stack) != 2:
- raise ValueError("Unexpected ] at char {:d}".format(match.start()))
+ raise ValueError('Unexpected ] at char {:d}'.format(match.start()))
stack.pop()
else:
if sep is None:
stack[-1].append((word, tag))
if len(stack) != 1:
- raise ValueError("Expected ] at char {:d}".format(len(s)))
+ raise ValueError('Expected ] at char {:d}'.format(len(s)))
return stack[0]
-
### CONLL
-_LINE_RE = re.compile("(\S+)\s+(\S+)\s+([IOB])-?(\S+)?")
-
-
-def conllstr2tree(s, chunk_types=("NP", "PP", "VP"), root_label="S"):
+_LINE_RE = re.compile('(\S+)\s+(\S+)\s+([IOB])-?(\S+)?')
+def conllstr2tree(s, chunk_types=('NP', 'PP', 'VP'), root_label="S"):
"""
Return a chunk structure for a single sentence
encoded in the given CONLL 2000 style string.
stack = [Tree(root_label, [])]
- for lineno, line in enumerate(s.split("\n")):
- if not line.strip():
- continue
+ for lineno, line in enumerate(s.split('\n')):
+ if not line.strip(): continue
# Decode the line.
match = _LINE_RE.match(line)
if match is None:
- raise ValueError("Error on line {:d}".format(lineno))
+ raise ValueError('Error on line {:d}'.format(lineno))
(word, tag, state, chunk_type) = match.groups()
# If it's a chunk type we don't care about, treat it as O.
- if chunk_types is not None and chunk_type not in chunk_types:
- state = "O"
+ if (chunk_types is not None and
+ chunk_type not in chunk_types):
+ state = 'O'
# For "Begin"/"Outside", finish any completed chunks -
# also do so for "Inside" which don't match the previous token.
- mismatch_I = state == "I" and chunk_type != stack[-1].label()
- if state in "BO" or mismatch_I:
- if len(stack) == 2:
- stack.pop()
+ mismatch_I = state == 'I' and chunk_type != stack[-1].label()
+ if state in 'BO' or mismatch_I:
+ if len(stack) == 2: stack.pop()
# For "Begin", start a new chunk.
- if state == "B" or mismatch_I:
+ if state == 'B' or mismatch_I:
chunk = Tree(chunk_type, [])
stack[-1].append(chunk)
stack.append(chunk)
return stack[0]
-
def tree2conlltags(t):
"""
Return a list of 3-tuples containing ``(word, tag, IOB-tag)``.
prefix = "B-"
for contents in child:
if isinstance(contents, Tree):
- raise ValueError(
- "Tree is too deeply nested to be printed in CoNLL format"
- )
- tags.append((contents[0], contents[1], prefix + category))
+ raise ValueError("Tree is too deeply nested to be printed in CoNLL format")
+ tags.append((contents[0], contents[1], prefix+category))
prefix = "I-"
except AttributeError:
tags.append((child[0], child[1], "O"))
return tags
-
-def conlltags2tree(
- sentence, chunk_types=("NP", "PP", "VP"), root_label="S", strict=False
-):
+def conlltags2tree(sentence, chunk_types=('NP','PP','VP'),
+ root_label='S', strict=False):
"""
Convert the CoNLL IOB format to a tree.
"""
raise ValueError("Bad conll tag sequence")
else:
# Treat as O
- tree.append((word, postag))
- elif chunktag.startswith("B-"):
- tree.append(Tree(chunktag[2:], [(word, postag)]))
- elif chunktag.startswith("I-"):
- if (
- len(tree) == 0
- or not isinstance(tree[-1], Tree)
- or tree[-1].label() != chunktag[2:]
- ):
+ tree.append((word,postag))
+ elif chunktag.startswith('B-'):
+ tree.append(Tree(chunktag[2:], [(word,postag)]))
+ elif chunktag.startswith('I-'):
+ if (len(tree)==0 or not isinstance(tree[-1], Tree) or
+ tree[-1].label() != chunktag[2:]):
if strict:
raise ValueError("Bad conll tag sequence")
else:
# Treat as B-*
- tree.append(Tree(chunktag[2:], [(word, postag)]))
+ tree.append(Tree(chunktag[2:], [(word,postag)]))
else:
- tree[-1].append((word, postag))
- elif chunktag == "O":
- tree.append((word, postag))
+ tree[-1].append((word,postag))
+ elif chunktag == 'O':
+ tree.append((word,postag))
else:
raise ValueError("Bad conll tag {0!r}".format(chunktag))
return tree
-
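# Illustrative round trip between IOB triples and the tree form (assumes
# nltk is installed):
from nltk.chunk.util import conlltags2tree, tree2conlltags

triples = [('the', 'DT', 'B-NP'), ('cat', 'NN', 'I-NP'), ('sat', 'VBD', 'O')]
tree = conlltags2tree(triples)
print(tree)   # -> (S (NP the/DT cat/NN) sat/VBD)
assert tree2conlltags(tree) == triples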
def tree2conllstr(t):
"""
Return a multiline string where each line contains a word, tag and IOB tag.
:rtype: str
"""
lines = [" ".join(token) for token in tree2conlltags(t)]
- return "\n".join(lines)
-
+ return '\n'.join(lines)
### IEER
-_IEER_DOC_RE = re.compile(
- r"<DOC>\s*"
- r"(<DOCNO>\s*(?P<docno>.+?)\s*</DOCNO>\s*)?"
- r"(<DOCTYPE>\s*(?P<doctype>.+?)\s*</DOCTYPE>\s*)?"
- r"(<DATE_TIME>\s*(?P<date_time>.+?)\s*</DATE_TIME>\s*)?"
- r"<BODY>\s*"
- r"(<HEADLINE>\s*(?P<headline>.+?)\s*</HEADLINE>\s*)?"
- r"<TEXT>(?P<text>.*?)</TEXT>\s*"
- r"</BODY>\s*</DOC>\s*",
- re.DOTALL,
-)
+_IEER_DOC_RE = re.compile(r'<DOC>\s*'
+ r'(<DOCNO>\s*(?P<docno>.+?)\s*</DOCNO>\s*)?'
+ r'(<DOCTYPE>\s*(?P<doctype>.+?)\s*</DOCTYPE>\s*)?'
+ r'(<DATE_TIME>\s*(?P<date_time>.+?)\s*</DATE_TIME>\s*)?'
+ r'<BODY>\s*'
+ r'(<HEADLINE>\s*(?P<headline>.+?)\s*</HEADLINE>\s*)?'
+ r'<TEXT>(?P<text>.*?)</TEXT>\s*'
+ r'</BODY>\s*</DOC>\s*', re.DOTALL)
_IEER_TYPE_RE = re.compile('<b_\w+\s+[^>]*?type="(?P<type>\w+)"')
-
def _ieer_read_text(s, root_label):
stack = [Tree(root_label, [])]
# s will be None if there is no headline in the text
# return the empty list in place of a Tree
if s is None:
return []
- for piece_m in re.finditer("<[^>]+>|[^\s<]+", s):
+ for piece_m in re.finditer('<[^>]+>|[^\s<]+', s):
piece = piece_m.group()
try:
- if piece.startswith("<b_"):
+ if piece.startswith('<b_'):
m = _IEER_TYPE_RE.match(piece)
- if m is None:
- print("XXXX", piece)
- chunk = Tree(m.group("type"), [])
+ if m is None: print('XXXX', piece)
+ chunk = Tree(m.group('type'), [])
stack[-1].append(chunk)
stack.append(chunk)
- elif piece.startswith("<e_"):
+ elif piece.startswith('<e_'):
stack.pop()
- # elif piece.startswith('<'):
- # print "ERROR:", piece
- # raise ValueError # Unexpected HTML
+# elif piece.startswith('<'):
+# print "ERROR:", piece
+# raise ValueError # Unexpected HTML
else:
stack[-1].append(piece)
except (IndexError, ValueError):
- raise ValueError(
- "Bad IEER string (error at character {:d})".format(piece_m.start())
- )
+ raise ValueError('Bad IEER string (error at character {:d})'.format \
+ (piece_m.start()))
if len(stack) != 1:
- raise ValueError("Bad IEER string")
+ raise ValueError('Bad IEER string')
return stack[0]
-
-def ieerstr2tree(
- s,
- chunk_types=[
- "LOCATION",
- "ORGANIZATION",
- "PERSON",
- "DURATION",
- "DATE",
- "CARDINAL",
- "PERCENT",
- "MONEY",
- "MEASURE",
- ],
- root_label="S",
-):
+def ieerstr2tree(s, chunk_types = ['LOCATION', 'ORGANIZATION', 'PERSON', 'DURATION',
+ 'DATE', 'CARDINAL', 'PERCENT', 'MONEY', 'MEASURE'], root_label="S"):
"""
Return a chunk structure containing the chunked tagged text that is
encoded in the given IEER style string.
m = _IEER_DOC_RE.match(s)
if m:
return {
- "text": _ieer_read_text(m.group("text"), root_label),
- "docno": m.group("docno"),
- "doctype": m.group("doctype"),
- "date_time": m.group("date_time"),
+ 'text': _ieer_read_text(m.group('text'), root_label),
+ 'docno': m.group('docno'),
+ 'doctype': m.group('doctype'),
+ 'date_time': m.group('date_time'),
#'headline': m.group('headline')
# we want to capture NEs in the headline too!
- "headline": _ieer_read_text(m.group("headline"), root_label),
- }
+ 'headline': _ieer_read_text(m.group('headline'), root_label),
+ }
else:
return _ieer_read_text(s, root_label)
s = "[ Pierre/NNP Vinken/NNP ] ,/, [ 61/CD years/NNS ] old/JJ ,/, will/MD join/VB [ the/DT board/NN ] ./."
import nltk
-
- t = nltk.chunk.tagstr2tree(s, chunk_label="NP")
+ t = nltk.chunk.tagstr2tree(s, chunk_label='NP')
t.pprint()
print()
. . O
"""
- conll_tree = conllstr2tree(s, chunk_types=("NP", "PP"))
+ conll_tree = conllstr2tree(s, chunk_types=('NP', 'PP'))
conll_tree.pprint()
# Demonstrate CoNLL output
print()
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
+
# Natural Language Toolkit: Classifiers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
>>> from nltk.corpus import gutenberg
>>> for fileid in gutenberg.fileids(): # doctest: +SKIP
... doc = gutenberg.words(fileid) # doctest: +SKIP
- ... print(fileid, classifier.classify(document_features(doc))) # doctest: +SKIP
+ ... print fileid, classifier.classify(document_features(doc)) # doctest: +SKIP
The parameters that a feature detector expects will vary, depending on
the task and the needs of the feature detector. For example, a
from nltk.classify.rte_classify import rte_classifier, rte_features, RTEFeatureExtractor
from nltk.classify.util import accuracy, apply_features, log_likelihood
from nltk.classify.scikitlearn import SklearnClassifier
-from nltk.classify.maxent import (
- MaxentClassifier,
- BinaryMaxentFeatureEncoding,
- TypedMaxentFeatureEncoding,
- ConditionalExponentialClassifier,
-)
+from nltk.classify.maxent import (MaxentClassifier, BinaryMaxentFeatureEncoding,
+ TypedMaxentFeatureEncoding,
+ ConditionalExponentialClassifier)
from nltk.classify.senna import Senna
from nltk.classify.textcat import TextCat
# Natural Language Toolkit: Classifier Interface
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com> (minor additions)
# URL: <http://nltk.org/>
from nltk.internals import overridden
##//////////////////////////////////////////////////////
-# { Classification Interfaces
+#{ Classification Interfaces
##//////////////////////////////////////////////////////
-
class ClassifierI(object):
"""
A processing interface for labeling tokens with a single category
Subclasses may define:
- either ``prob_classify()`` or ``prob_classify_many()`` (or both)
"""
-
def labels(self):
"""
:return: the list of category labels used by this classifier.
Subclasses may define:
- either ``prob_classify()`` or ``prob_classify_many()`` (or both)
"""
-
def labels(self):
"""
:return: the list of category labels used by this classifier.
# of ``featuresets``.
# """
# raise NotImplementedError()
+
# Natural Language Toolkit: Decision Tree Classifiers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
the basis of a tree structure, where branches correspond to conditions
on feature values, and leaves correspond to label assignments.
"""
+from __future__ import print_function, unicode_literals, division
from collections import defaultdict
from nltk.probability import FreqDist, MLEProbDist, entropy
from nltk.classify.api import ClassifierI
+from nltk.compat import python_2_unicode_compatible
-
+@python_2_unicode_compatible
class DecisionTreeClassifier(ClassifierI):
def __init__(self, label, feature_name=None, decisions=None, default=None):
"""
for featureset, label in labeled_featuresets:
if self.classify(featureset) != label:
errors += 1
- return errors / len(labeled_featuresets)
+ return errors/len(labeled_featuresets)
- def pretty_format(self, width=70, prefix="", depth=4):
+ def pretty_format(self, width=70, prefix='', depth=4):
"""
Return a string containing a pretty-printed version of this
decision tree. Each line in this string corresponds to a
"""
# [xx] display default!!
if self._fname is None:
- n = width - len(prefix) - 15
- return '{0}{1} {2}\n'.format(prefix, '.' * n, self._label)
+ n = width-len(prefix)-15
+ return '{0}{1} {2}\n'.format(prefix, '.'*n, self._label)
s = ''
- for i, (fval, result) in enumerate(sorted(self._decisions.items(),
- key=lambda item:
- (item[0] in [None, False, True], str(item[0]).lower())
- )
- ):
+ for i, (fval, result) in enumerate(sorted(self._decisions.items())):
hdr = '{0}{1}={2}? '.format(prefix, self._fname, fval)
- n = width - 15 - len(hdr)
- s += "{0}{1} {2}\n".format(hdr, "." * (n), result._label)
- if result._fname is not None and depth > 1:
- s += result.pretty_format(width, prefix + " ", depth - 1)
+ n = width-15-len(hdr)
+ s += '{0}{1} {2}\n'.format(hdr, '.'*(n), result._label)
+ if result._fname is not None and depth>1:
+ s += result.pretty_format(width, prefix+' ', depth-1)
if self._default is not None:
- n = width - len(prefix) - 21
- s += "{0}else: {1} {2}\n".format(prefix, "." * n, self._default._label)
- if self._default._fname is not None and depth > 1:
- s += self._default.pretty_format(width, prefix + " ", depth - 1)
+ n = width-len(prefix)-21
+ s += '{0}else: {1} {2}\n'.format(prefix, '.'*n, self._default._label)
+ if self._default._fname is not None and depth>1:
+ s += self._default.pretty_format(width, prefix+' ', depth-1)
return s
- def pseudocode(self, prefix="", depth=4):
+ def pseudocode(self, prefix='', depth=4):
"""
Return a string representation of this decision tree that
expresses the decisions it makes as a nested set of pseudocode
if self._fname is None:
return "{0}return {1!r}\n".format(prefix, self._label)
s = ''
- for (fval, result) in sorted(self._decisions.items(),
- key=lambda item:
- (item[0] in [None, False, True], str(item[0]).lower())
- ):
+ for (fval, result) in sorted(self._decisions.items()):
s += '{0}if {1} == {2!r}: '.format(prefix, self._fname, fval)
- if result._fname is not None and depth > 1:
- s += "\n" + result.pseudocode(prefix + " ", depth - 1)
+ if result._fname is not None and depth>1:
+ s += '\n'+result.pseudocode(prefix+' ', depth-1)
else:
- s += "return {0!r}\n".format(result._label)
+ s += 'return {0!r}\n'.format(result._label)
if self._default is not None:
if len(self._decisions) == 1:
- s += "{0}if {1} != {2!r}: ".format(
- prefix, self._fname, list(self._decisions.keys())[0]
- )
+ s += '{0}if {1} != {2!r}: '.format(prefix, self._fname,
+ list(self._decisions.keys())[0])
else:
- s += "{0}else: ".format(prefix)
- if self._default._fname is not None and depth > 1:
- s += "\n" + self._default.pseudocode(prefix + " ", depth - 1)
+ s += '{0}else: '.format(prefix)
+ if self._default._fname is not None and depth>1:
+ s += '\n'+self._default.pseudocode(prefix+' ', depth-1)
else:
- s += "return {0!r}\n".format(self._default._label)
+ s += 'return {0!r}\n'.format(self._default._label)
return s
def __str__(self):
return self.pretty_format()
@staticmethod
- def train(
- labeled_featuresets,
- entropy_cutoff=0.05,
- depth_cutoff=100,
- support_cutoff=10,
- binary=False,
- feature_values=None,
- verbose=False,
- ):
+ def train(labeled_featuresets, entropy_cutoff=0.05, depth_cutoff=100,
+ support_cutoff=10, binary=False, feature_values=None,
+ verbose=False):
"""
:param binary: If true, then treat all feature/value pairs as
individual binary features, rather than using a single n-way
# Start with a stump.
if not binary:
tree = DecisionTreeClassifier.best_stump(
- feature_names, labeled_featuresets, verbose
- )
+ feature_names, labeled_featuresets, verbose)
else:
tree = DecisionTreeClassifier.best_binary_stump(
- feature_names, labeled_featuresets, feature_values, verbose
- )
+ feature_names, labeled_featuresets, feature_values, verbose)
# Refine the stump.
- tree.refine(
- labeled_featuresets,
- entropy_cutoff,
- depth_cutoff - 1,
- support_cutoff,
- binary,
- feature_values,
- verbose,
- )
+ tree.refine(labeled_featuresets, entropy_cutoff, depth_cutoff-1,
+ support_cutoff, binary, feature_values, verbose)
# Return it
return tree
@staticmethod
def leaf(labeled_featuresets):
- label = FreqDist(label for (featureset, label) in labeled_featuresets).max()
+ label = FreqDist(label for (featureset, label)
+ in labeled_featuresets).max()
return DecisionTreeClassifier(label)
@staticmethod
def stump(feature_name, labeled_featuresets):
- label = FreqDist(label for (featureset, label) in labeled_featuresets).max()
+ label = FreqDist(label for (featureset, label)
+ in labeled_featuresets).max()
# Find the best label for each value.
- freqs = defaultdict(FreqDist) # freq(label|value)
+ freqs = defaultdict(FreqDist) # freq(label|value)
for featureset, label in labeled_featuresets:
feature_value = featureset.get(feature_name)
freqs[feature_value][label] += 1
- decisions = dict(
- (val, DecisionTreeClassifier(freqs[val].max())) for val in freqs
- )
+ decisions = dict((val, DecisionTreeClassifier(freqs[val].max()))
+ for val in freqs)
return DecisionTreeClassifier(label, feature_name, decisions)
- def refine(
- self,
- labeled_featuresets,
- entropy_cutoff,
- depth_cutoff,
- support_cutoff,
- binary=False,
- feature_values=None,
- verbose=False,
- ):
- if len(labeled_featuresets) <= support_cutoff:
- return
- if self._fname is None:
- return
- if depth_cutoff <= 0:
- return
+ def refine(self, labeled_featuresets, entropy_cutoff, depth_cutoff,
+ support_cutoff, binary=False, feature_values=None,
+ verbose=False):
+ if len(labeled_featuresets) <= support_cutoff: return
+ if self._fname is None: return
+ if depth_cutoff <= 0: return
for fval in self._decisions:
- fval_featuresets = [
- (featureset, label)
- for (featureset, label) in labeled_featuresets
- if featureset.get(self._fname) == fval
- ]
+ fval_featuresets = [(featureset, label) for (featureset, label)
+ in labeled_featuresets
+ if featureset.get(self._fname) == fval]
- label_freqs = FreqDist(label for (featureset, label) in fval_featuresets)
+ label_freqs = FreqDist(label for (featureset, label)
+ in fval_featuresets)
if entropy(MLEProbDist(label_freqs)) > entropy_cutoff:
self._decisions[fval] = DecisionTreeClassifier.train(
- fval_featuresets,
- entropy_cutoff,
- depth_cutoff,
- support_cutoff,
- binary,
- feature_values,
- verbose,
- )
+ fval_featuresets, entropy_cutoff, depth_cutoff,
+ support_cutoff, binary, feature_values, verbose)
if self._default is not None:
- default_featuresets = [
- (featureset, label)
- for (featureset, label) in labeled_featuresets
- if featureset.get(self._fname) not in self._decisions
- ]
- label_freqs = FreqDist(label for (featureset, label) in default_featuresets)
+ default_featuresets = [(featureset, label) for (featureset, label)
+ in labeled_featuresets
+ if featureset.get(self._fname) not in
+ self._decisions]
+ label_freqs = FreqDist(label for (featureset, label)
+ in default_featuresets)
if entropy(MLEProbDist(label_freqs)) > entropy_cutoff:
self._default = DecisionTreeClassifier.train(
- default_featuresets,
- entropy_cutoff,
- depth_cutoff,
- support_cutoff,
- binary,
- feature_values,
- verbose,
- )
+ default_featuresets, entropy_cutoff, depth_cutoff,
+ support_cutoff, binary, feature_values, verbose)
@staticmethod
def best_stump(feature_names, labeled_featuresets, verbose=False):
best_error = stump_error
best_stump = stump
if verbose:
- print(
- (
- "best stump for {:6d} toks uses {:20} err={:6.4f}".format(
- len(labeled_featuresets), best_stump._fname, best_error
- )
- )
- )
+ print(('best stump for {:6d} toks uses {:20} err={:6.4f}'.format \
+ (len(labeled_featuresets), best_stump._fname, best_error)))
return best_stump
@staticmethod
def binary_stump(feature_name, feature_value, labeled_featuresets):
- label = FreqDist(label for (featureset, label) in labeled_featuresets).max()
+ label = FreqDist(label for (featureset, label)
+ in labeled_featuresets).max()
# Find the best label for each value.
pos_fdist = FreqDist()
else:
neg_fdist[label] += 1
+
decisions = {}
default = label
# But hopefully we have observations!
return DecisionTreeClassifier(label, feature_name, decisions, default)
@staticmethod
- def best_binary_stump(
- feature_names, labeled_featuresets, feature_values, verbose=False
- ):
+ def best_binary_stump(feature_names, labeled_featuresets, feature_values,
+ verbose=False):
best_stump = DecisionTreeClassifier.leaf(labeled_featuresets)
best_error = best_stump.error(labeled_featuresets)
for fname in feature_names:
for fval in feature_values[fname]:
stump = DecisionTreeClassifier.binary_stump(
- fname, fval, labeled_featuresets
- )
+ fname, fval, labeled_featuresets)
stump_error = stump.error(labeled_featuresets)
if stump_error < best_error:
best_error = stump_error
best_stump = stump
if verbose:
if best_stump._decisions:
- descr = "{0}={1}".format(
- best_stump._fname, list(best_stump._decisions.keys())[0]
- )
+ descr = '{0}={1}'.format(best_stump._fname,
+ list(best_stump._decisions.keys())[0])
else:
- descr = "(default)"
- print(
- (
- "best stump for {:6d} toks uses {:20} err={:6.4f}".format(
- len(labeled_featuresets), descr, best_error
- )
- )
- )
+ descr = '(default)'
+ print(('best stump for {:6d} toks uses {:20} err={:6.4f}'.format \
+ (len(labeled_featuresets), descr, best_error)))
return best_stump
-
##//////////////////////////////////////////////////////
## Demo
##//////////////////////////////////////////////////////
-
def f(x):
return DecisionTreeClassifier.train(x, binary=True, verbose=True)
-
def demo():
from nltk.classify.util import names_demo, binary_names_demo_features
-
- classifier = names_demo(
- f, binary_names_demo_features # DecisionTreeClassifier.train,
- )
- print(classifier.pretty_format(depth=7))
+ classifier = names_demo(f, #DecisionTreeClassifier.train,
+ binary_names_demo_features)
+ print(classifier.pp(depth=7))
print(classifier.pseudocode(depth=7))
-
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
+
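A hedged usage sketch for the decision-tree trainer above; the featuresets are toy data invented for illustration.
    >>> from nltk.classify import DecisionTreeClassifier
    >>> train = [({'size': 'big'}, 'heavy'), ({'size': 'small'}, 'light'),
    ...          ({'size': 'big'}, 'heavy')]  # toy data
    >>> tree = DecisionTreeClassifier.train(train)
    >>> tree.classify({'size': 'small'})
    'light'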
# Natural Language Toolkit: Maximum Entropy Classifiers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Dmitry Chichkov <dchichkov@gmail.com> (TypedMaxentFeatureEncoding)
# URL: <http://nltk.org/>
performed by classes that implement the ``MaxentFeatureEncodingI``
interface.
"""
+from __future__ import print_function, unicode_literals
+
try:
    import numpy
except ImportError:
    pass
import tempfile
import os
from collections import defaultdict
+from six import integer_types
+
+from nltk import compat
from nltk.data import gzip_open_unicode
from nltk.util import OrderedDict
from nltk.probability import DictionaryProbDist
from nltk.classify.api import ClassifierI
from nltk.classify.util import CutoffChecker, accuracy, log_likelihood
-from nltk.classify.megam import call_megam, write_megam_file, parse_megam_weights
+from nltk.classify.megam import (call_megam,
+ write_megam_file, parse_megam_weights)
from nltk.classify.tadm import call_tadm, write_tadm_file, parse_tadm_weights
-__docformat__ = "epytext en"
+__docformat__ = 'epytext en'
######################################################################
-# { Classifier Model
+#{ Classifier Model
######################################################################
-
+@compat.python_2_unicode_compatible
class MaxentClassifier(ClassifierI):
"""
A maximum entropy classifier (also known as a "conditional
exponential classifier").
dotprod(a,b) = sum(x*y for (x,y) in zip(a,b))
"""
-
def __init__(self, encoding, weights, logarithmic=True):
"""
Construct a new maxent classifier model. Typically, new
self._encoding = encoding
self._weights = weights
self._logarithmic = logarithmic
- # self._logarithmic = False
+ #self._logarithmic = False
assert encoding.length() == len(weights)
def labels(self):
prob_dict[label] = prod
# Normalize the dictionary to give a probability distribution
- return DictionaryProbDist(prob_dict, log=self._logarithmic, normalize=True)
+ return DictionaryProbDist(prob_dict, log=self._logarithmic,
+ normalize=True)
def explain(self, featureset, columns=4):
"""
probabilities of each label for that featureset.
"""
descr_width = 50
- TEMPLATE = " %-" + str(descr_width - 2) + "s%s%8.3f"
+ TEMPLATE = ' %-'+str(descr_width-2)+'s%s%8.3f'
pdist = self.prob_classify(featureset)
labels = sorted(pdist.samples(), key=pdist.prob, reverse=True)
labels = labels[:columns]
- print(
- " Feature".ljust(descr_width)
- + "".join("%8s" % (("%s" % l)[:7]) for l in labels)
- )
- print(" " + "-" * (descr_width - 2 + 8 * len(labels)))
+ print(' Feature'.ljust(descr_width)+''.join(
+ '%8s' % (("%s" % l)[:7]) for l in labels))
+ print(' '+'-'*(descr_width-2+8*len(labels)))
sums = defaultdict(int)
for i, label in enumerate(labels):
feature_vector = self._encoding.encode(featureset, label)
- feature_vector.sort(
- key=lambda fid__: abs(self._weights[fid__[0]]), reverse=True
- )
+ feature_vector.sort(key=lambda fid__: abs(self._weights[fid__[0]]),
+ reverse=True)
for (f_id, f_val) in feature_vector:
if self._logarithmic:
score = self._weights[f_id] * f_val
- else:
- score = self._weights[f_id] ** f_val
+ else: score = self._weights[f_id] ** f_val
descr = self._encoding.describe(f_id)
- descr = descr.split(" and label is ")[0] # hack
- descr += " (%s)" % f_val # hack
+ descr = descr.split(' and label is ')[0] # hack
+ descr += ' (%s)' % f_val # hack
if len(descr) > 47:
- descr = descr[:44] + "..."
- print(TEMPLATE % (descr, i * 8 * " ", score))
+ descr = descr[:44]+'...'
+ print(TEMPLATE % (descr, i*8*' ', score))
sums[label] += score
- print(" " + "-" * (descr_width - 1 + 8 * len(labels)))
- print(
- " TOTAL:".ljust(descr_width) + "".join("%8.3f" % sums[l] for l in labels)
- )
- print(
- " PROBS:".ljust(descr_width)
- + "".join("%8.3f" % pdist.prob(l) for l in labels)
- )
-
- def most_informative_features(self, n=10):
- """
- Generates the ranked list of informative features from most to least.
- """
- if hasattr(self, "_most_informative_features"):
- return self._most_informative_features[:n]
- else:
- self._most_informative_features = sorted(
- list(range(len(self._weights))),
- key=lambda fid: abs(self._weights[fid]),
- reverse=True,
- )
- return self._most_informative_features[:n]
-
- def show_most_informative_features(self, n=10, show="all"):
+ print(' '+'-'*(descr_width-1+8*len(labels)))
+ print(' TOTAL:'.ljust(descr_width)+''.join(
+ '%8.3f' % sums[l] for l in labels))
+ print(' PROBS:'.ljust(descr_width)+''.join(
+ '%8.3f' % pdist.prob(l) for l in labels))
+
+ def show_most_informative_features(self, n=10, show='all'):
"""
:param show: all, neg, or pos (for negative-only or positive-only)
- :type show: str
- :param n: The no. of top features
- :type n: int
"""
- # Use None the full list of ranked features.
- fids = self.most_informative_features(None)
- if show == "pos":
+ fids = sorted(list(range(len(self._weights))),
+ key=lambda fid: abs(self._weights[fid]),
+ reverse=True)
+ if show == 'pos':
fids = [fid for fid in fids if self._weights[fid] > 0]
- elif show == "neg":
+ elif show == 'neg':
fids = [fid for fid in fids if self._weights[fid] < 0]
for fid in fids[:n]:
- print("%8.3f %s" % (self._weights[fid], self._encoding.describe(fid)))
+ print('%8.3f %s' % (self._weights[fid],
+ self._encoding.describe(fid)))
def __repr__(self):
- return "<ConditionalExponentialClassifier: %d labels, %d features>" % (
- len(self._encoding.labels()),
- self._encoding.length(),
- )
+ return ('<ConditionalExponentialClassifier: %d labels, %d features>' %
+ (len(self._encoding.labels()), self._encoding.length()))
#: A list of the algorithm names that are accepted for the
#: ``train()`` method's ``algorithm`` parameter.
- ALGORITHMS = ["GIS", "IIS", "MEGAM", "TADM"]
+ ALGORITHMS = ['GIS', 'IIS', 'MEGAM', 'TADM']
@classmethod
- def train(
- cls,
- train_toks,
- algorithm=None,
- trace=3,
- encoding=None,
- labels=None,
- gaussian_prior_sigma=0,
- **cutoffs
- ):
+ def train(cls, train_toks, algorithm=None, trace=3, encoding=None,
+ labels=None, gaussian_prior_sigma=0, **cutoffs):
"""
Train a new maxent classifier based on the given corpus of
training samples. This classifier will have its weights
log likelihood by less than ``v``.
"""
if algorithm is None:
- algorithm = "iis"
+ algorithm = 'iis'
for key in cutoffs:
- if key not in (
- "max_iter",
- "min_ll",
- "min_lldelta",
- "max_acc",
- "min_accdelta",
- "count_cutoff",
- "norm",
- "explicit",
- "bernoulli",
- ):
- raise TypeError("Unexpected keyword arg %r" % key)
+ if key not in ('max_iter', 'min_ll', 'min_lldelta',
+ 'max_acc', 'min_accdelta', 'count_cutoff',
+ 'norm', 'explicit', 'bernoulli'):
+ raise TypeError('Unexpected keyword arg %r' % key)
algorithm = algorithm.lower()
- if algorithm == "iis":
+ if algorithm == 'iis':
return train_maxent_classifier_with_iis(
- train_toks, trace, encoding, labels, **cutoffs
- )
- elif algorithm == "gis":
+ train_toks, trace, encoding, labels, **cutoffs)
+ elif algorithm == 'gis':
return train_maxent_classifier_with_gis(
- train_toks, trace, encoding, labels, **cutoffs
- )
- elif algorithm == "megam":
+ train_toks, trace, encoding, labels, **cutoffs)
+ elif algorithm == 'megam':
return train_maxent_classifier_with_megam(
- train_toks, trace, encoding, labels, gaussian_prior_sigma, **cutoffs
- )
- elif algorithm == "tadm":
+ train_toks, trace, encoding, labels,
+ gaussian_prior_sigma, **cutoffs)
+ elif algorithm == 'tadm':
kwargs = cutoffs
- kwargs["trace"] = trace
- kwargs["encoding"] = encoding
- kwargs["labels"] = labels
- kwargs["gaussian_prior_sigma"] = gaussian_prior_sigma
+ kwargs['trace'] = trace
+ kwargs['encoding'] = encoding
+ kwargs['labels'] = labels
+ kwargs['gaussian_prior_sigma'] = gaussian_prior_sigma
return TadmMaxentClassifier.train(train_toks, **kwargs)
else:
- raise ValueError("Unknown algorithm %s" % algorithm)
+ raise ValueError('Unknown algorithm %s' % algorithm)
#: Alias for MaxentClassifier.
######################################################################
-# { Feature Encodings
+#{ Feature Encodings
######################################################################
-
class MaxentFeatureEncodingI(object):
"""
A mapping that converts a set of input-feature values to a vector
input-feature values and labels that are present in a given
corpus.
"""
-
def encode(self, featureset, label):
"""
Given a (featureset, label) pair, return the corresponding
"""
raise NotImplementedError()
-
class FunctionBackedMaxentFeatureEncoding(MaxentFeatureEncodingI):
"""
A feature encoding that calls a user-supplied function to map a
given featureset/label pair to a sparse joint-feature vector.
"""
-
def __init__(self, func, length, labels):
"""
Construct a new feature encoding based on the given function.
return self._labels
def describe(self, fid):
- return "no description available"
-
+ return 'no description available'
class BinaryMaxentFeatureEncoding(MaxentFeatureEncodingI):
"""
These always-on features allow the maxent model to directly model
the prior probabilities of each label.
"""
-
- def __init__(self, labels, mapping, unseen_features=False, alwayson_features=False):
+ def __init__(self, labels, mapping, unseen_features=False,
+ alwayson_features=False):
"""
:param labels: A list of the \"known labels\" for this encoding.
features in the generated joint-feature vectors.
"""
if set(mapping.values()) != set(range(len(mapping))):
- raise ValueError(
- "Mapping values must be exactly the "
- "set of integers from 0...len(mapping)"
- )
+ raise ValueError('Mapping values must be exactly the '
+ 'set of integers from 0...len(mapping)')
self._labels = list(labels)
"""A list of attested labels."""
"""dict mapping from fname -> fid"""
if alwayson_features:
- self._alwayson = dict(
- (label, i + self._length) for (i, label) in enumerate(labels)
- )
+ self._alwayson = dict((label, i+self._length)
+ for (i, label) in enumerate(labels))
self._length += len(self._alwayson)
if unseen_features:
fnames = set(fname for (fname, fval, label) in mapping)
- self._unseen = dict(
- (fname, i + self._length) for (i, fname) in enumerate(fnames)
- )
+ self._unseen = dict((fname, i+self._length)
+ for (i, fname) in enumerate(fnames))
self._length += len(fnames)
def encode(self, featureset, label):
# Have we seen this fname/fval combination with any label?
for label2 in self._labels:
if (fname, fval, label2) in self._mapping:
- break # we've seen this fname/fval combo
+ break # we've seen this fname/fval combo
# We haven't -- fire the unseen-value feature
else:
if fname in self._unseen:
def describe(self, f_id):
# Inherit docs.
- if not isinstance(f_id, int):
- raise TypeError("describe() expected an int")
+ if not isinstance(f_id, integer_types):
+ raise TypeError('describe() expected an int')
try:
self._inv_mapping
except AttributeError:
- self._inv_mapping = [-1] * len(self._mapping)
+ self._inv_mapping = [-1]*len(self._mapping)
for (info, i) in self._mapping.items():
self._inv_mapping[i] = info
if f_id < len(self._mapping):
(fname, fval, label) = self._inv_mapping[f_id]
- return "%s==%r and label is %r" % (fname, fval, label)
+ return '%s==%r and label is %r' % (fname, fval, label)
elif self._alwayson and f_id in self._alwayson.values():
for (label, f_id2) in self._alwayson.items():
if f_id == f_id2:
- return "label is %r" % label
+ return 'label is %r' % label
elif self._unseen and f_id in self._unseen.values():
for (fname, f_id2) in self._unseen.items():
if f_id == f_id2:
- return "%s is unseen" % fname
+ return '%s is unseen' % fname
else:
- raise ValueError("Bad feature id")
+ raise ValueError('Bad feature id')
def labels(self):
# Inherit docs.
:param options: Extra parameters for the constructor, such as
``unseen_features`` and ``alwayson_features``.
"""
- mapping = {} # maps (fname, fval, label) -> fid
- seen_labels = set() # The set of labels we've encountered
+ mapping = {} # maps (fname, fval, label) -> fid
+ seen_labels = set() # The set of labels we've encountered
count = defaultdict(int) # maps (fname, fval) -> count
for (tok, label) in train_toks:
if labels and label not in labels:
- raise ValueError("Unexpected label %s" % label)
+ raise ValueError('Unexpected label %s' % label)
seen_labels.add(label)
# Record each of the features.
labels = seen_labels
return cls(labels, mapping, **options)
-
class GISEncoding(BinaryMaxentFeatureEncoding):
"""
A binary feature encoding which adds one new joint-feature to the
- The feature vector must sum to a constant non-negative number
for every token.
"""
-
- def __init__(
- self, labels, mapping, unseen_features=False, alwayson_features=False, C=None
- ):
+ def __init__(self, labels, mapping, unseen_features=False,
+ alwayson_features=False, C=None):
"""
:param C: The correction constant. The value of the correction
feature is based on this value. In particular, its value is
:seealso: ``BinaryMaxentFeatureEncoding.__init__``
"""
BinaryMaxentFeatureEncoding.__init__(
- self, labels, mapping, unseen_features, alwayson_features
- )
+ self, labels, mapping, unseen_features, alwayson_features)
if C is None:
- C = len(set(fname for (fname, fval, label) in mapping)) + 1
+ C = len(set(fname for (fname, fval, label) in mapping))+1
self._C = C
@property
# Add a correction feature.
total = sum(v for (f, v) in encoding)
if total >= self._C:
- raise ValueError("Correction feature is not high enough!")
- encoding.append((base_length, self._C - total))
+ raise ValueError('Correction feature is not high enough!')
+ encoding.append((base_length, self._C-total))
# Return the result
return encoding
def describe(self, f_id):
if f_id == BinaryMaxentFeatureEncoding.length(self):
- return "Correction feature (%s)" % self._C
+ return 'Correction feature (%s)' % self._C
else:
return BinaryMaxentFeatureEncoding.describe(self, f_id)
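To make the correction feature concrete: whatever the base features sum to, ``GISEncoding.encode`` appends one extra joint-feature so that every vector sums to the constant ``C``. A minimal sketch with invented training pairs:
    >>> from nltk.classify.maxent import GISEncoding
    >>> enc = GISEncoding.train([({'a': 1, 'b': 1}, 'x'), ({'a': 1}, 'y')])  # toy data
    >>> sum(val for (fid, val) in enc.encode({'a': 1}, 'x')) == enc.C
    True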
class TadmEventMaxentFeatureEncoding(BinaryMaxentFeatureEncoding):
- def __init__(self, labels, mapping, unseen_features=False, alwayson_features=False):
+ def __init__(self, labels, mapping, unseen_features=False,
+ alwayson_features=False):
self._mapping = OrderedDict(mapping)
self._label_mapping = OrderedDict()
- BinaryMaxentFeatureEncoding.__init__(
- self, labels, self._mapping, unseen_features, alwayson_features
- )
+ BinaryMaxentFeatureEncoding.__init__(self, labels, self._mapping,
+ unseen_features,
+ alwayson_features)
def encode(self, featureset, label):
encoding = []
self._label_mapping[value] = len(self._label_mapping)
else:
self._label_mapping[value] = value
- encoding.append(
- (self._mapping[(feature, label)], self._label_mapping[value])
- )
+ encoding.append((self._mapping[(feature, label)],
+ self._label_mapping[value]))
return encoding
def labels(self):
These always-on features allow the maxent model to directly model
the prior probabilities of each label.
"""
-
- def __init__(self, labels, mapping, unseen_features=False, alwayson_features=False):
+ def __init__(self, labels, mapping, unseen_features=False,
+ alwayson_features=False):
"""
:param labels: A list of the \"known labels\" for this encoding.
features in the generated joint-feature vectors.
"""
if set(mapping.values()) != set(range(len(mapping))):
- raise ValueError(
- "Mapping values must be exactly the "
- "set of integers from 0...len(mapping)"
- )
+ raise ValueError('Mapping values must be exactly the '
+ 'set of integers from 0...len(mapping)')
self._labels = list(labels)
"""A list of attested labels."""
"""dict mapping from fname -> fid"""
if alwayson_features:
- self._alwayson = dict(
- (label, i + self._length) for (i, label) in enumerate(labels)
- )
+ self._alwayson = dict((label, i+self._length)
+ for (i, label) in enumerate(labels))
self._length += len(self._alwayson)
if unseen_features:
fnames = set(fname for (fname, fval, label) in mapping)
- self._unseen = dict(
- (fname, i + self._length) for (i, fname) in enumerate(fnames)
- )
+ self._unseen = dict((fname, i+self._length)
+ for (i, fname) in enumerate(fnames))
self._length += len(fnames)
def encode(self, featureset, label):
# Convert input-features to joint-features:
for fname, fval in featureset.items():
- if isinstance(fval, (int, float)):
+ if isinstance(fval, (integer_types, float)):
# Known feature name & value:
if (fname, type(fval), label) in self._mapping:
- encoding.append((self._mapping[fname, type(fval), label], fval))
+ encoding.append((self._mapping[fname, type(fval),
+ label], fval))
else:
# Known feature name & value:
if (fname, fval, label) in self._mapping:
# Have we seen this fname/fval combination with any label?
for label2 in self._labels:
if (fname, fval, label2) in self._mapping:
- break # we've seen this fname/fval combo
+ break # we've seen this fname/fval combo
# We haven't -- fire the unseen-value feature
else:
if fname in self._unseen:
encoding.append((self._unseen[fname], 1))
+
# Add always-on features:
if self._alwayson and label in self._alwayson:
encoding.append((self._alwayson[label], 1))
def describe(self, f_id):
# Inherit docs.
- if not isinstance(f_id, int):
- raise TypeError("describe() expected an int")
+ if not isinstance(f_id, integer_types):
+ raise TypeError('describe() expected an int')
try:
self._inv_mapping
except AttributeError:
- self._inv_mapping = [-1] * len(self._mapping)
+ self._inv_mapping = [-1]*len(self._mapping)
for (info, i) in self._mapping.items():
self._inv_mapping[i] = info
if f_id < len(self._mapping):
(fname, fval, label) = self._inv_mapping[f_id]
- return "%s==%r and label is %r" % (fname, fval, label)
+ return '%s==%r and label is %r' % (fname, fval, label)
elif self._alwayson and f_id in self._alwayson.values():
for (label, f_id2) in self._alwayson.items():
if f_id == f_id2:
- return "label is %r" % label
+ return 'label is %r' % label
elif self._unseen and f_id in self._unseen.values():
for (fname, f_id2) in self._unseen.items():
if f_id == f_id2:
- return "%s is unseen" % fname
+ return '%s is unseen' % fname
else:
- raise ValueError("Bad feature id")
+ raise ValueError('Bad feature id')
def labels(self):
# Inherit docs.
:param options: Extra parameters for the constructor, such as
``unseen_features`` and ``alwayson_features``.
"""
- mapping = {} # maps (fname, fval, label) -> fid
- seen_labels = set() # The set of labels we've encountered
+ mapping = {} # maps (fname, fval, label) -> fid
+ seen_labels = set() # The set of labels we've encountered
count = defaultdict(int) # maps (fname, fval) -> count
for (tok, label) in train_toks:
if labels and label not in labels:
- raise ValueError("Unexpected label %s" % label)
+ raise ValueError('Unexpected label %s' % label)
seen_labels.add(label)
# Record each of the features.
return cls(labels, mapping, **options)
+
+
######################################################################
-# { Classifier Trainer: Generalized Iterative Scaling
+#{ Classifier Trainer: Generalized Iterative Scaling
######################################################################
-
-def train_maxent_classifier_with_gis(
- train_toks, trace=3, encoding=None, labels=None, **cutoffs
-):
+def train_maxent_classifier_with_gis(train_toks, trace=3, encoding=None,
+ labels=None, **cutoffs):
"""
Train a new ``ConditionalExponentialClassifier``, using the given
training samples, using the Generalized Iterative Scaling
:see: ``train_maxent_classifier()`` for parameter descriptions.
"""
- cutoffs.setdefault("max_iter", 100)
+ cutoffs.setdefault('max_iter', 100)
cutoffchecker = CutoffChecker(cutoffs)
# Construct an encoding from the training data.
if encoding is None:
encoding = GISEncoding.train(train_toks, labels=labels)
- if not hasattr(encoding, "C"):
- raise TypeError(
- "The GIS algorithm requires an encoding that "
- "defines C (e.g., GISEncoding)."
- )
+ if not hasattr(encoding, 'C'):
+ raise TypeError('The GIS algorithm requires an encoding that '
+ 'defines C (e.g., GISEncoding).')
# Cinv is the inverse of the sum of each joint feature vector.
# This controls the learning rate: higher Cinv (or lower C) gives
# faster learning.
- Cinv = 1.0 / encoding.C
+ Cinv = 1.0/encoding.C
# Count how many times each feature occurs in the training data.
empirical_fcount = calculate_empirical_fcount(train_toks, encoding)
# Build the classifier. Start with weight=0 for each attested
# feature, and weight=-infinity for each unattested feature.
- weights = numpy.zeros(len(empirical_fcount), "d")
+ weights = numpy.zeros(len(empirical_fcount), 'd')
for fid in unattested:
weights[fid] = numpy.NINF
classifier = ConditionalExponentialClassifier(encoding, weights)
del empirical_fcount
if trace > 0:
- print(" ==> Training (%d iterations)" % cutoffs["max_iter"])
+ print(' ==> Training (%d iterations)' % cutoffs['max_iter'])
if trace > 2:
print()
- print(" Iteration Log Likelihood Accuracy")
- print(" ---------------------------------------")
+ print(' Iteration Log Likelihood Accuracy')
+ print(' ---------------------------------------')
# Train the classifier.
try:
ll = cutoffchecker.ll or log_likelihood(classifier, train_toks)
acc = cutoffchecker.acc or accuracy(classifier, train_toks)
iternum = cutoffchecker.iter
- print(" %9d %14.5f %9.3f" % (iternum, ll, acc))
+ print(' %9d %14.5f %9.3f' % (iternum, ll, acc))
# Use the model to estimate the number of times each
# feature should occur in the training data.
estimated_fcount = calculate_estimated_fcount(
- classifier, train_toks, encoding
- )
+ classifier, train_toks, encoding)
# Take the log of estimated fcount (avoid taking log(0).)
for fid in unattested:
break
except KeyboardInterrupt:
- print(" Training stopped: keyboard interrupt")
+ print(' Training stopped: keyboard interrupt')
except:
raise
if trace > 2:
ll = log_likelihood(classifier, train_toks)
acc = accuracy(classifier, train_toks)
- print(" Final %14.5f %9.3f" % (ll, acc))
+ print(' Final %14.5f %9.3f' % (ll, acc))
- # Return the classifier.
+# Return the classifier.
return classifier
-
def calculate_empirical_fcount(train_toks, encoding):
- fcount = numpy.zeros(encoding.length(), "d")
+ fcount = numpy.zeros(encoding.length(), 'd')
for tok, label in train_toks:
for (index, val) in encoding.encode(tok, label):
return fcount
-
def calculate_estimated_fcount(classifier, train_toks, encoding):
- fcount = numpy.zeros(encoding.length(), "d")
+ fcount = numpy.zeros(encoding.length(), 'd')
for tok, label in train_toks:
pdist = classifier.prob_classify(tok)
for label in pdist.samples():
prob = pdist.prob(label)
for (fid, fval) in encoding.encode(tok, label):
- fcount[fid] += prob * fval
+ fcount[fid] += prob*fval
return fcount
######################################################################
-# { Classifier Trainer: Improved Iterative Scaling
+#{ Classifier Trainer: Improved Iterative Scaling
######################################################################
-
-def train_maxent_classifier_with_iis(
- train_toks, trace=3, encoding=None, labels=None, **cutoffs
-):
+def train_maxent_classifier_with_iis(train_toks, trace=3, encoding=None,
+ labels=None, **cutoffs):
"""
Train a new ``ConditionalExponentialClassifier``, using the given
training samples, using the Improved Iterative Scaling algorithm.
:see: ``train_maxent_classifier()`` for parameter descriptions.
"""
- cutoffs.setdefault("max_iter", 100)
+ cutoffs.setdefault('max_iter', 100)
cutoffchecker = CutoffChecker(cutoffs)
# Construct an encoding from the training data.
encoding = BinaryMaxentFeatureEncoding.train(train_toks, labels=labels)
# Count how many times each feature occurs in the training data.
- empirical_ffreq = calculate_empirical_fcount(train_toks, encoding) / len(train_toks)
+ empirical_ffreq = (calculate_empirical_fcount(train_toks, encoding) /
+ len(train_toks))
# Find the nf map, and related variables nfarray and nfident.
# nf is the sum of the features for a given labeled text.
# nfarray performs the reverse operation. nfident is
# nfarray multiplied by an identity matrix.
nfmap = calculate_nfmap(train_toks, encoding)
- nfarray = numpy.array(sorted(nfmap, key=nfmap.__getitem__), "d")
+ nfarray = numpy.array(sorted(nfmap, key=nfmap.__getitem__), 'd')
nftranspose = numpy.reshape(nfarray, (len(nfarray), 1))
# Check for any features that are not attested in train_toks.
# Build the classifier. Start with weight=0 for each attested
# feature, and weight=-infinity for each unattested feature.
- weights = numpy.zeros(len(empirical_ffreq), "d")
+ weights = numpy.zeros(len(empirical_ffreq), 'd')
for fid in unattested:
weights[fid] = numpy.NINF
classifier = ConditionalExponentialClassifier(encoding, weights)
if trace > 0:
- print(" ==> Training (%d iterations)" % cutoffs["max_iter"])
+ print(' ==> Training (%d iterations)' % cutoffs['max_iter'])
if trace > 2:
print()
- print(" Iteration Log Likelihood Accuracy")
- print(" ---------------------------------------")
+ print(' Iteration Log Likelihood Accuracy')
+ print(' ---------------------------------------')
# Train the classifier.
try:
ll = cutoffchecker.ll or log_likelihood(classifier, train_toks)
acc = cutoffchecker.acc or accuracy(classifier, train_toks)
iternum = cutoffchecker.iter
- print(" %9d %14.5f %9.3f" % (iternum, ll, acc))
+ print(' %9d %14.5f %9.3f' % (iternum, ll, acc))
# Calculate the deltas for this iteration, using Newton's method.
deltas = calculate_deltas(
- train_toks,
- classifier,
- unattested,
- empirical_ffreq,
- nfmap,
- nfarray,
- nftranspose,
- encoding,
- )
+ train_toks, classifier, unattested, empirical_ffreq,
+ nfmap, nfarray, nftranspose, encoding)
# Use the deltas to update our weights.
weights = classifier.weights()
break
except KeyboardInterrupt:
- print(" Training stopped: keyboard interrupt")
+ print(' Training stopped: keyboard interrupt')
except:
raise
+
if trace > 2:
ll = log_likelihood(classifier, train_toks)
acc = accuracy(classifier, train_toks)
- print(" Final %14.5f %9.3f" % (ll, acc))
+ print(' Final %14.5f %9.3f' % (ll, acc))
# Return the classifier.
return classifier
-
def calculate_nfmap(train_toks, encoding):
"""
Construct a map that can be used to compress ``nf`` (which is
typically sparse).
nfset.add(sum(val for (id, val) in encoding.encode(tok, label)))
return dict((nf, i) for (i, nf) in enumerate(nfset))
-
-def calculate_deltas(
- train_toks,
- classifier,
- unattested,
- ffreq_empirical,
- nfmap,
- nfarray,
- nftranspose,
- encoding,
-):
+def calculate_deltas(train_toks, classifier, unattested, ffreq_empirical,
+ nfmap, nfarray, nftranspose, encoding):
"""
Calculate the update values for the classifier weights for
this iteration of IIS. These update weights are the value of
NEWTON_CONVERGE = 1e-12
MAX_NEWTON = 300
- deltas = numpy.ones(encoding.length(), "d")
+ deltas = numpy.ones(encoding.length(), 'd')
# Precompute the A matrix:
# A[nf][id] = sum ( p(fs) * p(label|fs) * f(fs,label) )
# over all label,fs s.t. num_features[label,fs]=nf
- A = numpy.zeros((len(nfmap), encoding.length()), "d")
+ A = numpy.zeros((len(nfmap), encoding.length()), 'd')
for tok, label in train_toks:
dist = classifier.prob_classify(tok)
deltas -= (ffreq_empirical - sum1) / -sum2
# We can stop once we converge.
- n_error = numpy.sum(abs((ffreq_empirical - sum1))) / numpy.sum(abs(deltas))
+ n_error = (numpy.sum(abs((ffreq_empirical-sum1)))/
+ numpy.sum(abs(deltas)))
if n_error < NEWTON_CONVERGE:
return deltas
return deltas
-
######################################################################
-# { Classifier Trainer: megam
+#{ Classifier Trainer: megam
######################################################################
# [xx] possible extension: add support for using implicit file format;
# this would need to put requirements on what encoding is used. But
# we may need this for other maxent classifier trainers that require
# implicit formats anyway.
-def train_maxent_classifier_with_megam(
- train_toks, trace=3, encoding=None, labels=None, gaussian_prior_sigma=0, **kwargs
-):
+def train_maxent_classifier_with_megam(train_toks, trace=3, encoding=None,
+ labels=None, gaussian_prior_sigma=0,
+ **kwargs):
"""
Train a new ``ConditionalExponentialClassifier``, using the given
training samples, using the external ``megam`` library. This
explicit = True
bernoulli = True
- if "explicit" in kwargs:
- explicit = kwargs["explicit"]
- if "bernoulli" in kwargs:
- bernoulli = kwargs["bernoulli"]
+ if 'explicit' in kwargs:
+ explicit = kwargs['explicit']
+ if 'bernoulli' in kwargs:
+ bernoulli = kwargs['bernoulli']
# Construct an encoding from the training data.
if encoding is None:
# Count cutoff can also be controlled by megam with the -minfc
# option. Not sure where the best place for it is.
- count_cutoff = kwargs.get("count_cutoff", 0)
- encoding = BinaryMaxentFeatureEncoding.train(
- train_toks, count_cutoff, labels=labels, alwayson_features=True
- )
+ count_cutoff = kwargs.get('count_cutoff', 0)
+ encoding = BinaryMaxentFeatureEncoding.train(train_toks, count_cutoff,
+ labels=labels,
+ alwayson_features=True)
elif labels is not None:
- raise ValueError("Specify encoding or labels, not both")
+ raise ValueError('Specify encoding or labels, not both')
# Write a training file for megam.
try:
- fd, trainfile_name = tempfile.mkstemp(prefix="nltk-")
- with open(trainfile_name, "w") as trainfile:
- write_megam_file(
- train_toks, encoding, trainfile, explicit=explicit, bernoulli=bernoulli
- )
+ fd, trainfile_name = tempfile.mkstemp(prefix='nltk-')
+ with open(trainfile_name, 'w') as trainfile:
+ write_megam_file(train_toks, encoding, trainfile,
+ explicit=explicit, bernoulli=bernoulli)
os.close(fd)
except (OSError, IOError, ValueError) as e:
- raise ValueError("Error while creating megam training file: %s" % e)
+ raise ValueError('Error while creating megam training file: %s' % e)
# Run megam on the training file.
options = []
- options += ["-nobias", "-repeat", "10"]
+ options += ['-nobias', '-repeat', '10']
if explicit:
- options += ["-explicit"]
+ options += ['-explicit']
if not bernoulli:
- options += ["-fvals"]
+ options += ['-fvals']
if gaussian_prior_sigma:
# Lambda is just the precision of the Gaussian prior, i.e. it's the
# inverse variance, so the parameter conversion is 1.0/sigma**2.
# See http://www.umiacs.umd.edu/~hal/docs/daume04cg-bfgs.pdf.
- inv_variance = 1.0 / gaussian_prior_sigma ** 2
+ inv_variance = 1.0 / gaussian_prior_sigma**2
else:
inv_variance = 0
- options += ["-lambda", "%.2f" % inv_variance, "-tune"]
+ options += ['-lambda', '%.2f' % inv_variance, '-tune']
if trace < 3:
- options += ["-quiet"]
- if "max_iter" in kwargs:
- options += ["-maxi", "%s" % kwargs["max_iter"]]
- if "ll_delta" in kwargs:
+ options += ['-quiet']
+ if 'max_iter' in kwargs:
+ options += ['-maxi', '%s' % kwargs['max_iter']]
+ if 'll_delta' in kwargs:
# [xx] this is actually a perplexity delta, not a log
# likelihood delta
- options += ["-dpp", "%s" % abs(kwargs["ll_delta"])]
- if hasattr(encoding, "cost"):
- options += ["-multilabel"] # each possible la
- options += ["multiclass", trainfile_name]
+ options += ['-dpp', '%s' % abs(kwargs['ll_delta'])]
+ if hasattr(encoding, 'cost'):
+ options += ['-multilabel'] # each possible la
+ options += ['multiclass', trainfile_name]
stdout = call_megam(options)
- # print('./megam_i686.opt ', ' '.join(options))
+ # print './megam_i686.opt ', ' '.join(options)
# Delete the training file
try:
os.remove(trainfile_name)
except (OSError, IOError) as e:
- print("Warning: unable to delete %s: %s" % (trainfile_name, e))
+ print('Warning: unable to delete %s: %s' % (trainfile_name, e))
# Parse the generated weight vector.
weights = parse_megam_weights(stdout, encoding.length(), explicit)
# Build the classifier
return MaxentClassifier(encoding, weights)
-
######################################################################
-# { Classifier Trainer: tadm
+#{ Classifier Trainer: tadm
######################################################################
-
class TadmMaxentClassifier(MaxentClassifier):
@classmethod
def train(cls, train_toks, **kwargs):
- algorithm = kwargs.get("algorithm", "tao_lmvm")
- trace = kwargs.get("trace", 3)
- encoding = kwargs.get("encoding", None)
- labels = kwargs.get("labels", None)
- sigma = kwargs.get("gaussian_prior_sigma", 0)
- count_cutoff = kwargs.get("count_cutoff", 0)
- max_iter = kwargs.get("max_iter")
- ll_delta = kwargs.get("min_lldelta")
+ algorithm = kwargs.get('algorithm', 'tao_lmvm')
+ trace = kwargs.get('trace', 3)
+ encoding = kwargs.get('encoding', None)
+ labels = kwargs.get('labels', None)
+ sigma = kwargs.get('gaussian_prior_sigma', 0)
+ count_cutoff = kwargs.get('count_cutoff', 0)
+ max_iter = kwargs.get('max_iter')
+ ll_delta = kwargs.get('min_lldelta')
# Construct an encoding from the training data.
if not encoding:
- encoding = TadmEventMaxentFeatureEncoding.train(
- train_toks, count_cutoff, labels=labels
- )
+ encoding = TadmEventMaxentFeatureEncoding.train(train_toks,
+ count_cutoff,
+ labels=labels)
- trainfile_fd, trainfile_name = tempfile.mkstemp(
- prefix="nltk-tadm-events-", suffix=".gz"
- )
- weightfile_fd, weightfile_name = tempfile.mkstemp(prefix="nltk-tadm-weights-")
+ trainfile_fd, trainfile_name = \
+ tempfile.mkstemp(prefix='nltk-tadm-events-', suffix='.gz')
+ weightfile_fd, weightfile_name = \
+ tempfile.mkstemp(prefix='nltk-tadm-weights-')
- trainfile = gzip_open_unicode(trainfile_name, "w")
+ trainfile = gzip_open_unicode(trainfile_name, 'w')
write_tadm_file(train_toks, encoding, trainfile)
trainfile.close()
options = []
- options.extend(["-monitor"])
- options.extend(["-method", algorithm])
+ options.extend(['-monitor'])
+ options.extend(['-method', algorithm])
if sigma:
- options.extend(["-l2", "%.6f" % sigma ** 2])
+ options.extend(['-l2', '%.6f' % sigma**2])
if max_iter:
- options.extend(["-max_it", "%d" % max_iter])
+ options.extend(['-max_it', '%d' % max_iter])
if ll_delta:
- options.extend(["-fatol", "%.6f" % abs(ll_delta)])
- options.extend(["-events_in", trainfile_name])
- options.extend(["-params_out", weightfile_name])
+ options.extend(['-fatol', '%.6f' % abs(ll_delta)])
+ options.extend(['-events_in', trainfile_name])
+ options.extend(['-params_out', weightfile_name])
if trace < 3:
- options.extend(["2>&1"])
+ options.extend(['2>&1'])
else:
- options.extend(["-summary"])
+ options.extend(['-summary'])
call_tadm(options)
- with open(weightfile_name, "r") as weightfile:
+ with open(weightfile_name, 'r') as weightfile:
weights = parse_tadm_weights(weightfile)
os.remove(trainfile_name)
# Build the classifier
return cls(encoding, weights)
-
######################################################################
-# { Demo
+#{ Demo
######################################################################
def demo():
from nltk.classify.util import names_demo
-
classifier = names_demo(MaxentClassifier.train)
-
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
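A hedged end-to-end sketch of the trainer dispatch above, using the pure-Python IIS algorithm so that no external binary is required (numpy must be installed; the data is a toy example):
    >>> from nltk.classify import MaxentClassifier
    >>> train = [({'a': True}, 'x'), ({'b': True}, 'y')] * 10  # toy, linearly separable
    >>> m = MaxentClassifier.train(train, algorithm='iis', trace=0, max_iter=5)
    >>> m.classify({'a': True})
    'x'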
# Natural Language Toolkit: Interface to Megam Classifier
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
.. _megam: http://www.umiacs.umd.edu/~hal/megam/index.html
"""
+from __future__ import print_function
+
import subprocess
-from nltk.internals import find_binary
+from six import string_types
+from nltk import compat
+from nltk.internals import find_binary
try:
import numpy
except ImportError:
numpy = None
######################################################################
-# { Configuration
+#{ Configuration
######################################################################
_megam_bin = None
-
-
def config_megam(bin=None):
"""
Configure NLTK's interface to the ``megam`` maxent optimization
"""
global _megam_bin
_megam_bin = find_binary(
- "megam",
- bin,
- env_vars=["MEGAM"],
- binary_names=["megam.opt", "megam", "megam_686", "megam_i686.opt"],
- url="http://www.umiacs.umd.edu/~hal/megam/index.html",
- )
-
+ 'megam', bin,
+ env_vars=['MEGAM'],
+ binary_names=['megam.opt', 'megam', 'megam_686', 'megam_i686.opt'],
+ url='http://www.umiacs.umd.edu/~hal/megam/index.html')
######################################################################
-# { Megam Interface Functions
+#{ Megam Interface Functions
######################################################################
-
-def write_megam_file(train_toks, encoding, stream, bernoulli=True, explicit=True):
+def write_megam_file(train_toks, encoding, stream,
+ bernoulli=True, explicit=True):
"""
Generate an input file for ``megam`` based on the given corpus of
classified tokens.
# Write the file, which contains one line per instance.
for featureset, label in train_toks:
# First, the instance number (or, in the weighted multiclass case, the cost of each label).
- if hasattr(encoding, "cost"):
- stream.write(
- ":".join(str(encoding.cost(featureset, label, l)) for l in labels)
- )
+ if hasattr(encoding, 'cost'):
+ stream.write(':'.join(str(encoding.cost(featureset, label, l))
+ for l in labels))
else:
- stream.write("%d" % labelnum[label])
+ stream.write('%d' % labelnum[label])
# For implicit file formats, just list the features that fire
# for this instance's actual label.
if not explicit:
- _write_megam_features(encoding.encode(featureset, label), stream, bernoulli)
+ _write_megam_features(encoding.encode(featureset, label),
+ stream, bernoulli)
# For explicit formats, list the features that would fire for
# any of the possible labels.
else:
for l in labels:
- stream.write(" #")
- _write_megam_features(encoding.encode(featureset, l), stream, bernoulli)
+ stream.write(' #')
+ _write_megam_features(encoding.encode(featureset, l),
+ stream, bernoulli)
# End of the instance.
- stream.write("\n")
-
+ stream.write('\n')
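For intuition, in the implicit Bernoulli layout written above each instance becomes one line: the label's index followed by the ids of the joint-features that fire. The ids below are invented for illustration:
    0 3 17 42
    1 3 8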
def parse_megam_weights(s, features_count, explicit=True):
"""
Given the stdout output generated by ``megam`` when training a model, parse it and return the corresponding weight vector. This function does not currently handle bias features.
"""
if numpy is None:
- raise ValueError("This function requires that numpy be installed")
- assert explicit, "non-explicit not supported yet"
- lines = s.strip().split("\n")
- weights = numpy.zeros(features_count, "d")
+ raise ValueError('This function requires that numpy be installed')
+ assert explicit, 'non-explicit not supported yet'
+ lines = s.strip().split('\n')
+ weights = numpy.zeros(features_count, 'd')
for line in lines:
if line.strip():
fid, weight = line.split()
weights[int(fid)] = float(weight)
return weights
-
def _write_megam_features(vector, stream, bernoulli):
if not vector:
- raise ValueError(
- "MEGAM classifier requires the use of an " "always-on feature."
- )
+ raise ValueError('MEGAM classifier requires the use of an '
+ 'always-on feature.')
for (fid, fval) in vector:
if bernoulli:
if fval == 1:
- stream.write(" %s" % fid)
+ stream.write(' %s' % fid)
elif fval != 0:
- raise ValueError(
- "If bernoulli=True, then all" "features must be binary."
- )
+ raise ValueError('If bernoulli=True, then all '
+ 'features must be binary.')
else:
- stream.write(" %s %s" % (fid, fval))
-
+ stream.write(' %s %s' % (fid, fval))
def call_megam(args):
"""
Call the ``megam`` binary with the given arguments.
"""
- if isinstance(args, str):
- raise TypeError("args should be a list of strings")
+ if isinstance(args, string_types):
+ raise TypeError('args should be a list of strings')
if _megam_bin is None:
config_megam()
if p.returncode != 0:
print()
print(stderr)
- raise OSError("megam command failed!")
+ raise OSError('megam command failed!')
- if isinstance(stdout, str):
+ if isinstance(stdout, string_types):
return stdout
else:
- return stdout.decode("utf-8")
+ return stdout.decode('utf-8')
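A hedged sketch of wiring up the external binary before training; the path is hypothetical, and ``config_megam`` can also locate the binary on its own via ``$MEGAM`` or the usual binary names:
    >>> from nltk.classify.megam import config_megam
    >>> config_megam('/usr/local/bin/megam')  # hypothetical install path
    >>> # then: MaxentClassifier.train(train_toks, algorithm='megam')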
# Natural Language Toolkit: Naive Bayes Classifiers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
|                      P(label) * P(f1|label) * ... * P(fn|label)
| P(label|features) = --------------------------------------------
|                       SUM[l]( P(l) * P(f1|l) * ... * P(fn|l) )
"""
+from __future__ import print_function, unicode_literals
from collections import defaultdict
## Naive Bayes Classifier
##//////////////////////////////////////////////////////
-
class NaiveBayesClassifier(ClassifierI):
"""
A Naive Bayes classifier. Naive Bayes classifiers are
you generally should not use 'None' as a feature value for one of
your own features.
"""
-
def __init__(self, label_probdist, feature_probdist):
"""
:param label_probdist: P(label), the probability distribution
if (label, fname) in self._feature_probdist:
break
else:
- # print('Ignoring unseen feature %s' % fname)
+ #print 'Ignoring unseen feature %s' % fname
del featureset[fname]
# Find the log probability of each label, given the features.
# nb: This case will never come up if the
# classifier was created by
# NaiveBayesClassifier.train().
- logprob[label] += sum_logs([]) # = -INF.
+ logprob[label] += sum_logs([]) # = -INF.
return DictionaryProbDist(logprob, normalize=True, log=True)
def show_most_informative_features(self, n=10):
# Determine the most relevant features, and display them.
cpdist = self._feature_probdist
- print("Most Informative Features")
+ print('Most Informative Features')
for (fname, fval) in self.most_informative_features(n):
-
def labelprob(l):
return cpdist[l, fname].prob(fval)
- labels = sorted(
- [l for l in self._labels if fval in cpdist[l, fname].samples()],
- key=lambda element: (-labelprob(element), element),
- reverse=True
- )
+ labels = sorted([l for l in self._labels
+ if fval in cpdist[l, fname].samples()],
+ key=labelprob)
if len(labels) == 1:
continue
l0 = labels[0]
l1 = labels[-1]
if cpdist[l0, fname].prob(fval) == 0:
- ratio = "INF"
+ ratio = 'INF'
else:
- ratio = "%8.1f" % (
- cpdist[l1, fname].prob(fval) / cpdist[l0, fname].prob(fval)
- )
- print(
- (
- "%24s = %-14r %6s : %-6s = %s : 1.0"
- % (fname, fval, ("%s" % l1)[:6], ("%s" % l0)[:6], ratio)
- )
- )
+ ratio = '%8.1f' % (cpdist[l1, fname].prob(fval) /
+ cpdist[l0, fname].prob(fval))
+ print(('%24s = %-14r %6s : %-6s = %s : 1.0' %
+ (fname, fval, ("%s" % l1)[:6], ("%s" % l0)[:6], ratio)))
def most_informative_features(self, n=100):
"""
| max[ P(fname=fval|label1) / P(fname=fval|label2) ]
"""
- if hasattr(self, "_most_informative_features"):
- return self._most_informative_features[:n]
- else:
- # The set of (fname, fval) pairs used by this classifier.
- features = set()
- # The max & min probability associated w/ each (fname, fval)
- # pair. Maps (fname,fval) -> float.
- maxprob = defaultdict(lambda: 0.0)
- minprob = defaultdict(lambda: 1.0)
-
- for (label, fname), probdist in self._feature_probdist.items():
- for fval in probdist.samples():
- feature = (fname, fval)
- features.add(feature)
- p = probdist.prob(fval)
- maxprob[feature] = max(p, maxprob[feature])
- minprob[feature] = min(p, minprob[feature])
- if minprob[feature] == 0:
- features.discard(feature)
-
- # Convert features to a list, & sort it by how informative
- # features are.
- self._most_informative_features = sorted(
- features, key=lambda feature_: (minprob[feature_] / maxprob[feature_], feature_[0],
- feature_[1] in [None, False, True], str(feature_[1]).lower())
- )
- return self._most_informative_features[:n]
+ # The set of (fname, fval) pairs used by this classifier.
+ features = set()
+ # The max & min probability associated w/ each (fname, fval)
+ # pair. Maps (fname,fval) -> float.
+ maxprob = defaultdict(lambda: 0.0)
+ minprob = defaultdict(lambda: 1.0)
+
+ for (label, fname), probdist in self._feature_probdist.items():
+ for fval in probdist.samples():
+ feature = (fname, fval)
+ features.add(feature)
+ p = probdist.prob(fval)
+ maxprob[feature] = max(p, maxprob[feature])
+ minprob[feature] = min(p, minprob[feature])
+ if minprob[feature] == 0:
+ features.discard(feature)
+
+ # Convert features to a list, & sort it by how informative
+ # features are.
+ features = sorted(features,
+ key=lambda feature_:
+ minprob[feature_]/maxprob[feature_])
+ return features[:n]
@classmethod
def train(cls, labeled_featuresets, estimator=ELEProbDist):
return cls(label_probdist, feature_probdist)
-
##//////////////////////////////////////////////////////
## Demo
##//////////////////////////////////////////////////////
-
def demo():
from nltk.classify.util import names_demo
-
classifier = names_demo(NaiveBayesClassifier.train)
classifier.show_most_informative_features()
-
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
+
+
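A minimal hedged sketch of the Naive Bayes trainer on toy names-style data (featuresets invented for illustration):
    >>> from nltk.classify import NaiveBayesClassifier
    >>> train = [({'last_letter': 'a'}, 'female'), ({'last_letter': 'k'}, 'male')] * 5
    >>> nb = NaiveBayesClassifier.train(train)
    >>> nb.classify({'last_letter': 'a'})
    'female'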
We use the sports sentences as positive examples, the mixed ones as unlabeled examples:
- >>> positive_featuresets = map(features, sports_sentences)
- >>> unlabeled_featuresets = map(features, various_sentences)
+ >>> positive_featuresets = list(map(features, sports_sentences))
+ >>> unlabeled_featuresets = list(map(features, various_sentences))
>>> classifier = PositiveNaiveBayesClassifier.train(positive_featuresets,
... unlabeled_featuresets)
## Positive Naive Bayes Classifier
##//////////////////////////////////////////////////////
-
class PositiveNaiveBayesClassifier(NaiveBayesClassifier):
@staticmethod
- def train(
- positive_featuresets,
- unlabeled_featuresets,
- positive_prob_prior=0.5,
- estimator=ELEProbDist,
- ):
+ def train(positive_featuresets, unlabeled_featuresets, positive_prob_prior=0.5,
+ estimator=ELEProbDist):
"""
- :param positive_featuresets: An iterable of featuresets that are known as positive
+ :param positive_featuresets: A list of featuresets that are known as positive
examples (i.e., their label is ``True``).
- :param unlabeled_featuresets: An iterable of featuresets whose label is unknown.
+ :param unlabeled_featuresets: A list of featuresets whose label is unknown.
:param positive_prob_prior: A prior estimate of the probability of the label
``True`` (default 0.5).
fnames = set()
# Count up how many times each feature value occurred in positive examples.
- num_positive_examples = 0
for featureset in positive_featuresets:
for fname, fval in featureset.items():
positive_feature_freqdist[fname][fval] += 1
feature_values[fname].add(fval)
fnames.add(fname)
- num_positive_examples += 1
# Count up how many times each feature value occurred in unlabeled examples.
- num_unlabeled_examples = 0
for featureset in unlabeled_featuresets:
for fname, fval in featureset.items():
unlabeled_feature_freqdist[fname][fval] += 1
feature_values[fname].add(fval)
fnames.add(fname)
- num_unlabeled_examples += 1
# If a feature didn't have a value given for an instance, then we assume that
# it gets the implicit value 'None'.
+ num_positive_examples = len(positive_featuresets)
for fname in fnames:
count = positive_feature_freqdist[fname].N()
positive_feature_freqdist[fname][None] += num_positive_examples - count
feature_values[fname].add(None)
+ num_unlabeled_examples = len(unlabeled_featuresets)
for fname in fnames:
count = unlabeled_feature_freqdist[fname].N()
unlabeled_feature_freqdist[fname][None] += num_unlabeled_examples - count
negative_prob_prior = 1.0 - positive_prob_prior
# Create the P(label) distribution.
- label_probdist = DictionaryProbDist(
- {True: positive_prob_prior, False: negative_prob_prior}
- )
+ label_probdist = DictionaryProbDist({True: positive_prob_prior,
+ False: negative_prob_prior})
# Create the P(fval|label, fname) distribution.
feature_probdist = {}
global_probdist = estimator(freqdist, bins=len(feature_values[fname]))
negative_feature_probs = {}
for fval in feature_values[fname]:
- prob = (
- global_probdist.prob(fval)
- - positive_prob_prior * feature_probdist[True, fname].prob(fval)
- ) / negative_prob_prior
+ prob = (global_probdist.prob(fval)
+ - positive_prob_prior *
+ feature_probdist[True, fname].prob(fval)) \
+ / negative_prob_prior
# TODO: We need to add some kind of smoothing here, instead of
# setting negative probabilities to zero and normalizing.
negative_feature_probs[fval] = max(prob, 0.0)
- feature_probdist[False, fname] = DictionaryProbDist(
- negative_feature_probs, normalize=True
- )
+ feature_probdist[False, fname] = DictionaryProbDist(negative_feature_probs,
+ normalize=True)
return PositiveNaiveBayesClassifier(label_probdist, feature_probdist)
-
##//////////////////////////////////////////////////////
## Demo
##//////////////////////////////////////////////////////
-
def demo():
from nltk.classify.util import partial_names_demo
-
classifier = partial_names_demo(PositiveNaiveBayesClassifier.train)
classifier.show_most_informative_features()
+
# Natural Language Toolkit: RTE Classifier
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
TO DO: better Named Entity classification
TO DO: add lemmatization
"""
+from __future__ import print_function
from nltk.tokenize import RegexpTokenizer
from nltk.classify.util import accuracy, check_megam_config
from nltk.classify.maxent import MaxentClassifier
-
class RTEFeatureExtractor(object):
"""
This builds a bag of words for both the text and the hypothesis after
throwing away some stopwords, then calculates overlap and difference.
"""
-
def __init__(self, rtepair, stop=True, use_lemmatize=False):
"""
:param rtepair: a ``RTEPair`` from which features should be extracted
:type stop: bool
"""
self.stop = stop
- self.stopwords = set(
- [
- "a",
- "the",
- "it",
- "they",
- "of",
- "in",
- "to",
- "is",
- "have",
- "are",
- "were",
- "and",
- "very",
- ".",
- ",",
- ]
- )
-
- self.negwords = set(["no", "not", "never", "failed", "rejected", "denied"])
+ self.stopwords = set(['a', 'the', 'it', 'they', 'of', 'in', 'to', 'is',
+ 'have', 'are', 'were', 'and', 'very', '.', ','])
+
+ self.negwords = set(['no', 'not', 'never', 'failed', 'rejected',
+ 'denied'])
# Try to tokenize so that abbreviations, monetary amounts, email
# addresses, URLs are single tokens.
- tokenizer = RegexpTokenizer("[\w.@:/]+|\w+|\$[\d.]+")
+ tokenizer = RegexpTokenizer(r'[\w.@:/]+|\w+|\$[\d.]+')
- # Get the set of word types for text and hypothesis
+ #Get the set of word types for text and hypothesis
self.text_tokens = tokenizer.tokenize(rtepair.text)
self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
self.text_words = set(self.text_tokens)
self._hyp_extra = self.hyp_words - self.text_words
self._txt_extra = self.text_words - self.hyp_words
+
def overlap(self, toktype, debug=False):
"""
Compute the overlap between text and hypothesis.
:type toktype: 'ne' or 'word'
"""
ne_overlap = set(token for token in self._overlap if self._ne(token))
- if toktype == "ne":
+ if toktype == 'ne':
if debug:
print("ne overlap", ne_overlap)
return ne_overlap
- elif toktype == "word":
+ elif toktype == 'word':
if debug:
print("word overlap", self._overlap - ne_overlap)
return self._overlap - ne_overlap
:type toktype: 'ne' or 'word'
"""
ne_extra = set(token for token in self._hyp_extra if self._ne(token))
- if toktype == "ne":
+ if toktype == 'ne':
return ne_extra
- elif toktype == "word":
+ elif toktype == 'word':
return self._hyp_extra - ne_extra
else:
raise ValueError("Type not recognized: '%s'" % toktype)
def rte_features(rtepair):
extractor = RTEFeatureExtractor(rtepair)
features = {}
- features["alwayson"] = True
- features["word_overlap"] = len(extractor.overlap("word"))
- features["word_hyp_extra"] = len(extractor.hyp_extra("word"))
- features["ne_overlap"] = len(extractor.overlap("ne"))
- features["ne_hyp_extra"] = len(extractor.hyp_extra("ne"))
- features["neg_txt"] = len(extractor.negwords & extractor.text_words)
- features["neg_hyp"] = len(extractor.negwords & extractor.hyp_words)
+ features['alwayson'] = True
+ features['word_overlap'] = len(extractor.overlap('word'))
+ features['word_hyp_extra'] = len(extractor.hyp_extra('word'))
+ features['ne_overlap'] = len(extractor.overlap('ne'))
+ features['ne_hyp_extra'] = len(extractor.hyp_extra('ne'))
+ features['neg_txt'] = len(extractor.negwords & extractor.text_words)
+ features['neg_hyp'] = len(extractor.negwords & extractor.hyp_words)
return features
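# A hypothetical illustration (not from the NLTK source): rte_features only
# needs an object with ``text`` and ``hyp`` attributes, so a toy stand-in is
# enough to see the shape of the feature dict:
#
# >>> class ToyPair:
# ...     text = 'John Smith works for Google in London'
# ...     hyp = 'John Smith works for Google'
# >>> sorted(rte_features(ToyPair()))  # doctest: +SKIP
# ['alwayson', 'ne_hyp_extra', 'ne_overlap', 'neg_hyp', 'neg_txt',
#  'word_hyp_extra', 'word_overlap']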
def rte_classifier(algorithm):
from nltk.corpus import rte as rte_corpus
-
- train_set = rte_corpus.pairs(["rte1_dev.xml", "rte2_dev.xml", "rte3_dev.xml"])
- test_set = rte_corpus.pairs(["rte1_test.xml", "rte2_test.xml", "rte3_test.xml"])
+ train_set = rte_corpus.pairs(['rte1_dev.xml', 'rte2_dev.xml', 'rte3_dev.xml'])
+ test_set = rte_corpus.pairs(['rte1_test.xml', 'rte2_test.xml', 'rte3_test.xml'])
featurized_train_set = rte_featurize(train_set)
featurized_test_set = rte_featurize(test_set)
# Train the classifier
- print("Training classifier...")
- if algorithm in ["megam", "BFGS"]: # MEGAM based algorithms.
+ print('Training classifier...')
+ if algorithm in ['megam', 'BFGS']: # MEGAM based algorithms.
# Ensure that MEGAM is configured first.
check_megam_config()
clf = lambda x: MaxentClassifier.train(featurized_train_set, algorithm)
- elif algorithm in ["GIS", "IIS"]: # Use default GIS/IIS MaxEnt algorithm
+ elif algorithm in ['GIS', 'IIS']: # Use default GIS/IIS MaxEnt algorithm
clf = MaxentClassifier.train(featurized_train_set, algorithm)
else:
- err_msg = str(
- "RTEClassifier only supports these algorithms:\n "
- "'megam', 'BFGS', 'GIS', 'IIS'.\n"
- )
+ err_msg = str("RTEClassifier only supports these algorithms:\n "
+ "'megam', 'BFGS', 'GIS', 'IIS'.\n")
raise Exception(err_msg)
- print("Testing classifier...")
+ print('Testing classifier...')
acc = accuracy(clf, featurized_test_set)
- print("Accuracy: %6.4f" % acc)
+ print('Accuracy: %6.4f' % acc)
return clf
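# Hypothetical usage (requires the RTE corpus files named above; the
# accuracy printed depends on the data and the chosen algorithm):
#
# >>> clf = rte_classifier('IIS')  # doctest: +SKIP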
... ('nb', MultinomialNB())])
>>> classif = SklearnClassifier(pipeline)
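A hypothetical continuation (outputs not verified here): ``train`` accepts
the usual NLTK ``(featureset, label)`` pairs and returns the wrapper itself.
>>> train_data = [({'a': 4, 'b': 1}, 'x'), ({'a': 5, 'b': 2}, 'x'),
...               ({'a': 0, 'b': 3}, 'y')]
>>> classif.train(train_data)  # doctest: +SKIP
>>> classif.classify_many([{'a': 3, 'b': 1}, {'a': 0, 'b': 4}])  # doctest: +SKIP
['x', 'y']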
"""
+from __future__ import print_function, unicode_literals
+
+from six.moves import zip
from nltk.classify.api import ClassifierI
from nltk.probability import DictionaryProbDist
+from nltk import compat
try:
from sklearn.feature_extraction import DictVectorizer
except ImportError:
pass
-__all__ = ["SklearnClassifier"]
+__all__ = ['SklearnClassifier']
+@compat.python_2_unicode_compatible
class SklearnClassifier(ClassifierI):
"""Wrapper for scikit-learn classifiers."""
def _make_probdist(self, y_proba):
classes = self._encoder.classes_
- return DictionaryProbDist(dict((classes[i], p) for i, p in enumerate(y_proba)))
+ return DictionaryProbDist(dict((classes[i], p)
+ for i, p in enumerate(y_proba)))
# skip doctests if scikit-learn is not installed
def setup_module(module):
from nose import SkipTest
-
try:
import sklearn
except ImportError:
# Bernoulli Naive Bayes is designed for binary classification. We set the
# binarize option to False since we know we're passing boolean features.
print("scikit-learn Naive Bayes:")
- names_demo(
- SklearnClassifier(BernoulliNB(binarize=False)).train,
- features=names_demo_features,
- )
+ names_demo(SklearnClassifier(BernoulliNB(binarize=False)).train,
+ features=names_demo_features)
# The C parameter on logistic regression (MaxEnt) controls regularization.
# The higher it's set, the less regularized the classifier is.
print("\n\nscikit-learn logistic regression:")
- names_demo(
- SklearnClassifier(LogisticRegression(C=1000)).train,
- features=names_demo_features,
- )
+ names_demo(SklearnClassifier(LogisticRegression(C=1000)).train,
+ features=names_demo_features)
# encoding: utf-8
# Natural Language Toolkit: Senna Interface
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Rami Al-Rfou' <ralrfou@cs.stonybrook.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
Note: Unit tests for this module can be found in test/unit/test_senna.py
+ >>> from __future__ import unicode_literals
>>> from nltk.classify import Senna
>>> pipeline = Senna('/usr/share/senna-v3.0', ['pos', 'chk', 'ner'])
>>> sent = 'Dusseldorf is an international business center'.split()
('international', 'I-NP', 'O', 'JJ'), ('business', 'I-NP', 'O', 'NN'), ('center', 'I-NP', 'O', 'NN')]
"""
+
+from __future__ import unicode_literals
from os import path, sep, environ
from subprocess import Popen, PIPE
from platform import architecture, system
+from six import text_type
+
from nltk.tag.api import TaggerI
+from nltk.compat import python_2_unicode_compatible
-_senna_url = "http://ml.nec-labs.com/senna/"
+_senna_url = 'http://ml.nec-labs.com/senna/'
+@python_2_unicode_compatible
class Senna(TaggerI):
- SUPPORTED_OPERATIONS = ["pos", "chk", "ner"]
+ SUPPORTED_OPERATIONS = ['pos', 'chk', 'ner']
- def __init__(self, senna_path, operations, encoding="utf-8"):
+ def __init__(self, senna_path, operations, encoding='utf-8'):
self._encoding = encoding
self._path = path.normpath(senna_path) + sep
# Verify that the executable exists at self._path first
- # senna_binary_file_1 = self.executable(self._path)
+ #senna_binary_file_1 = self.executable(self._path)
exe_file_1 = self.executable(self._path)
if not path.isfile(exe_file_1):
# Check for the system environment
- if "SENNA" in environ:
- # self._path = path.join(environ['SENNA'],'')
- self._path = path.normpath(environ["SENNA"]) + sep
+ if 'SENNA' in environ:
+ #self._path = path.join(environ['SENNA'],'')
+ self._path = path.normpath(environ['SENNA']) + sep
exe_file_2 = self.executable(self._path)
if not path.isfile(exe_file_2):
- raise OSError(
- "Senna executable expected at %s or %s but not found"
- % (exe_file_1, exe_file_2)
- )
+ raise OSError("Senna executable expected at %s or %s but not found" % (exe_file_1,exe_file_2))
self.operations = operations
+
def executable(self, base_path):
"""
The function that determines the system-specific binary that should
be used.
"""
os_name = system()
- if os_name == "Linux":
+ if os_name == 'Linux':
bits = architecture()[0]
- if bits == "64bit":
- return path.join(base_path, "senna-linux64")
- return path.join(base_path, "senna-linux32")
- if os_name == "Windows":
- return path.join(base_path, "senna-win32.exe")
- if os_name == "Darwin":
- return path.join(base_path, "senna-osx")
- return path.join(base_path, "senna")
+ if bits == '64bit':
+ return path.join(base_path, 'senna-linux64')
+ return path.join(base_path, 'senna-linux32')
+ if os_name == 'Windows':
+ return path.join(base_path, 'senna-win32.exe')
+ if os_name == 'Darwin':
+ return path.join(base_path, 'senna-osx')
+ return path.join(base_path, 'senna')
def _map(self):
"""
for operation in Senna.SUPPORTED_OPERATIONS:
if operation in self.operations:
_map[operation] = i
- i += 1
+ i+= 1
return _map
def tag(self, tokens):
encoding = self._encoding
if not path.isfile(self.executable(self._path)):
- raise OSError(
- "Senna executable expected at %s but not found"
- % self.executable(self._path)
- )
+ raise OSError("Senna executable expected at %s but not found" % self.executable(self._path))
+
# Build the senna command to run the tagger
- _senna_cmd = [
- self.executable(self._path),
- "-path",
- self._path,
- "-usrtokens",
- "-iobtags",
- ]
- _senna_cmd.extend(["-" + op for op in self.operations])
+ _senna_cmd = [self.executable(self._path), '-path', self._path, '-usrtokens', '-iobtags']
+ _senna_cmd.extend(['-'+op for op in self.operations])
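# For operations ['pos', 'chk', 'ner'] this builds, e.g.:
# [<senna binary>, '-path', <senna dir>, '-usrtokens', '-iobtags',
#  '-pos', '-chk', '-ner']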
# Serialize the actual sentences to a temporary string
- _input = "\n".join((" ".join(x) for x in sentences)) + "\n"
- if isinstance(_input, str) and encoding:
+ _input = '\n'.join((' '.join(x) for x in sentences))+'\n'
+ if isinstance(_input, text_type) and encoding:
_input = _input.encode(encoding)
# Run the tagger and get the output
# Check the return code.
if p.returncode != 0:
- raise RuntimeError("Senna command failed! Details: %s" % stderr)
+ raise RuntimeError('Senna command failed! Details: %s' % stderr)
if encoding:
senna_output = stdout.decode(encoding)
sentence_index += 1
token_index = 0
continue
- tags = tagged_word.split("\t")
+ tags = tagged_word.split('\t')
result = {}
for tag in map_:
- result[tag] = tags[map_[tag]].strip()
+ result[tag] = tags[map_[tag]].strip()
try:
- result["word"] = sentences[sentence_index][token_index]
+ result['word'] = sentences[sentence_index][token_index]
except IndexError:
- raise IndexError(
- "Misalignment error occurred at sentence number %d. Possible reason"
- " is that the sentence size exceeded the maximum size. Check the "
- "documentation of Senna class for more information."
- % sentence_index
- )
+ raise IndexError(
+ "Misalignment error occurred at sentence number %d. Possible reason"
+ " is that the sentence size exceeded the maximum size. Check the "
+ "documentation of Senna class for more information."
+ % sentence_index)
tagged_sentences[-1].append(result)
token_index += 1
return tagged_sentences
# skip doctests if Senna is not installed
def setup_module(module):
from nose import SkipTest
-
try:
- tagger = Senna("/usr/share/senna-v3.0", ["pos", "chk", "ner"])
+ tagger = Senna('/usr/share/senna-v3.0', ['pos', 'chk', 'ner'])
except OSError:
raise SkipTest("Senna executable not found")
# Natural Language Toolkit: SVM-based classifier
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Leon Derczynski <leon@dcs.shef.ac.uk>
#
# URL: <http://nltk.org/>
on support vector machines SVMs use nltk.classify.scikitlearn
(or `scikit-learn <http://scikit-learn.org>`_ directly).
"""
-
-
class SvmClassifier(object):
def __init__(self, *args, **kwargs):
raise NotImplementedError(__doc__)
# Natural Language Toolkit: Interface to TADM Classifier
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Joseph Frazee <jfrazee@mail.utexas.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals
import sys
import subprocess
-from nltk.internals import find_binary
+from six import string_types
+from nltk.internals import find_binary
try:
import numpy
except ImportError:
pass
_tadm_bin = None
-
-
def config_tadm(bin=None):
global _tadm_bin
_tadm_bin = find_binary(
- "tadm", bin, env_vars=["TADM"], binary_names=["tadm"], url="http://tadm.sf.net"
- )
-
+ 'tadm', bin,
+ env_vars=['TADM'],
+ binary_names=['tadm'],
+ url='http://tadm.sf.net')
def write_tadm_file(train_toks, encoding, stream):
"""
# http://sf.net/forum/forum.php?thread_id=1675097&forum_id=473054
labels = encoding.labels()
for featureset, label in train_toks:
- length_line = "%d\n" % len(labels)
+ length_line = '%d\n' % len(labels)
stream.write(length_line)
for known_label in labels:
v = encoding.encode(featureset, known_label)
- line = "%d %d %s\n" % (
+ line = '%d %d %s\n' % (
int(label == known_label),
len(v),
- " ".join("%d %d" % u for u in v),
+ ' '.join('%d %d' % u for u in v)
)
stream.write(line)
-
def parse_tadm_weights(paramfile):
"""
Given the stdout output generated by ``tadm`` when training a
weights = []
for line in paramfile:
weights.append(float(line.strip()))
- return numpy.array(weights, "d")
-
+ return numpy.array(weights, 'd')
def call_tadm(args):
"""
Call the ``tadm`` binary with the given arguments.
"""
- if isinstance(args, str):
- raise TypeError("args should be a list of strings")
+ if isinstance(args, string_types):
+ raise TypeError('args should be a list of strings')
if _tadm_bin is None:
config_tadm()
if p.returncode != 0:
print()
print(stderr)
- raise OSError("tadm command failed!")
-
+ raise OSError('tadm command failed!')
def names_demo():
from nltk.classify.util import names_demo
from nltk.classify.maxent import TadmMaxentClassifier
-
classifier = names_demo(TadmMaxentClassifier.train)
-
def encoding_demo():
import sys
from nltk.classify.maxent import TadmEventMaxentFeatureEncoding
-
- tokens = [
- ({"f0": 1, "f1": 1, "f3": 1}, "A"),
- ({"f0": 1, "f2": 1, "f4": 1}, "B"),
- ({"f0": 2, "f2": 1, "f3": 1, "f4": 1}, "A"),
- ]
+ tokens = [({'f0':1, 'f1':1, 'f3':1}, 'A'),
+ ({'f0':1, 'f2':1, 'f4':1}, 'B'),
+ ({'f0':2, 'f2':1, 'f3':1, 'f4':1}, 'A')]
encoding = TadmEventMaxentFeatureEncoding.train(tokens)
write_tadm_file(tokens, encoding, sys.stdout)
print()
for i in range(encoding.length()):
- print("%s --> %d" % (encoding.describe(i), i))
+ print('%s --> %d' % (encoding.describe(i), i))
print()
-
-if __name__ == "__main__":
+if __name__ == '__main__':
encoding_demo()
names_demo()
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Language ID module using TextCat algorithm
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Avital Pekker <avital.pekker@utoronto.ca>
#
# URL: <http://nltk.org/>
"""
A module for language identification using the TextCat algorithm.
An implementation of the text categorization algorithm
-presented in Cavnar, W. B. and J. M. Trenkle,
+presented in Cavnar, W. B. and J. M. Trenkle,
"N-Gram-Based Text Categorization".
-The algorithm takes advantage of Zipf's law and uses
+The algorithm takes advantage of Zipf's law and uses
n-gram frequencies to profile languages and the text yet to
be identified, then compares them using a distance measure.
Language n-grams are provided by the "An Crubadan"
-project. A corpus reader was created separately to read
+project. A corpus reader was created separately to read
those files.
For details regarding the algorithm, see:
http://borel.slu.edu/crubadan/index.html
"""
-from sys import maxsize
+# Ensure that literal strings default to unicode rather than str.
+from __future__ import print_function, unicode_literals
+from nltk.compat import PY3
from nltk.util import trigrams
+if PY3:
+ from sys import maxsize
+else:
+ from sys import maxint
+
# Note: this is NOT "re" you're likely used to. The regex module
# is an alternative to the standard re module that supports
# Unicode codepoint properties with the \p{} syntax.
## Language identification using TextCat
######################################################################
-
class TextCat(object):
_corpus = None
fingerprints = {}
_START_CHAR = "<"
_END_CHAR = ">"
-
+
last_distances = {}
-
+
def __init__(self):
if not re:
- raise EnvironmentError(
- "classify.textcat requires the regex module that "
- "supports unicode. Try '$ pip install regex' and "
- "see https://pypi.python.org/pypi/regex for "
- "further details."
- )
+ raise EnvironmentError("classify.textcat requires the regex module that "
+ "supports unicode. Try '$ pip install regex' and "
+ "see https://pypi.python.org/pypi/regex for "
+ "further details.")
from nltk.corpus import crubadan
-
self._corpus = crubadan
# Load all language ngrams into cache
for lang in self._corpus.langs():
self._corpus.lang_freq(lang)
-
+
def remove_punctuation(self, text):
- """ Get rid of punctuation except apostrophes """
+ ''' Get rid of punctuation except apostrophes '''
return re.sub(r"[^\P{P}\']+", "", text)
-
+
def profile(self, text):
- """ Create FreqDist of trigrams within text """
+ ''' Create FreqDist of trigrams within text '''
from nltk import word_tokenize, FreqDist
clean_text = self.remove_punctuation(text)
tokens = word_tokenize(clean_text)
-
+
fingerprint = FreqDist()
for t in tokens:
token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
- token_trigrams = ["".join(tri) for tri in token_trigram_tuples]
+ token_trigrams = [''.join(tri) for tri in token_trigram_tuples]
for cur_trigram in token_trigrams:
if cur_trigram in fingerprint:
fingerprint[cur_trigram] += 1
else:
fingerprint[cur_trigram] = 1
return fingerprint
-
+
def calc_dist(self, lang, trigram, text_profile):
- """ Calculate the "out-of-place" measure between the
- text and language profile for a single trigram """
+ ''' Calculate the "out-of-place" measure between the
+ text and language profile for a single trigram '''
lang_fd = self._corpus.lang_freq(lang)
dist = 0
idx_lang_profile = list(lang_fd.keys()).index(trigram)
idx_text = list(text_profile.keys()).index(trigram)
- # print(idx_lang_profile, ", ", idx_text)
- dist = abs(idx_lang_profile - idx_text)
+ #print(idx_lang_profile, ", ", idx_text)
+ dist = abs(idx_lang_profile - idx_text)
else:
# Arbitrary but should be larger than
# any possible trigram file length
# in terms of total lines
- dist = maxsize
+ if PY3:
+ dist = maxsize
+ else:
+ dist = maxint
return dist
-
+
def lang_dists(self, text):
- """ Calculate the "out-of-place" measure between
- the text and all languages """
-
+ ''' Calculate the "out-of-place" measure between
+ the text and all languages '''
+
distances = {}
profile = self.profile(text)
# For all the languages
lang_dist = 0
for trigram in profile:
lang_dist += self.calc_dist(lang, trigram, profile)
-
+
distances[lang] = lang_dist
-
+
return distances
-
+
def guess_language(self, text):
- """ Find the language with the min distance
- to the text and return its ISO 639-3 code """
+ ''' Find the language with the min distance
+ to the text and return its ISO 639-3 code '''
self.last_distances = self.lang_dists(text)
-
+
return min(self.last_distances, key=self.last_distances.get)
-
def demo():
from nltk.corpus import udhr
- langs = [
- "Kurdish-UTF8",
- "Abkhaz-UTF8",
- "Farsi_Persian-UTF8",
- "Hindi-UTF8",
- "Hawaiian-UTF8",
- "Russian-UTF8",
- "Vietnamese-UTF8",
- "Serbian_Srpski-UTF8",
- "Esperanto-UTF8",
- ]
-
- friendly = {
- "kmr": "Northern Kurdish",
- "abk": "Abkhazian",
- "pes": "Iranian Persian",
- "hin": "Hindi",
- "haw": "Hawaiian",
- "rus": "Russian",
- "vie": "Vietnamese",
- "srp": "Serbian",
- "epo": "Esperanto",
- }
-
+ langs = ['Kurdish-UTF8', 'Abkhaz-UTF8', 'Farsi_Persian-UTF8',
+ 'Hindi-UTF8', 'Hawaiian-UTF8', 'Russian-UTF8', 'Vietnamese-UTF8',
+ 'Serbian_Srpski-UTF8','Esperanto-UTF8']
+
+ friendly = {'kmr':'Northern Kurdish',
+ 'abk':'Abkhazian',
+ 'pes':'Iranian Persian',
+ 'hin':'Hindi',
+ 'haw':'Hawaiian',
+ 'rus':'Russian',
+ 'vie':'Vietnamese',
+ 'srp':'Serbian',
+ 'epo':'Esperanto'}
+
tc = TextCat()
for cur_lang in langs:
rows = len(raw_sentences) - 1
cols = list(map(len, raw_sentences))
- sample = ""
-
+ sample = ''
+
# Generate a sample text of the language
for i in range(0, rows):
- cur_sent = ""
+ cur_sent = ''
for j in range(0, cols[i]):
- cur_sent += " " + raw_sentences[i][j]
-
+ cur_sent += ' ' + raw_sentences[i][j]
+
sample += cur_sent
-
+
# Try to detect what it is
- print("Language snippet: " + sample[0:140] + "...")
+ print('Language snippet: ' + sample[0:140] + '...')
guess = tc.guess_language(sample)
- print("Language detection: %s (%s)" % (guess, friendly[guess]))
- print("#" * 140)
+ print('Language detection: %s (%s)' % (guess, friendly[guess]))
+ print('#' * 140)
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: Classifier Utility Functions
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com> (minor additions)
# URL: <http://nltk.org/>
"""
Utility functions and classes for classifiers.
"""
+from __future__ import print_function, division
import math
-# from nltk.util import Deprecated
-import nltk.classify.util # for accuracy & log_likelihood
+#from nltk.util import Deprecated
+import nltk.classify.util # for accuracy & log_likelihood
from nltk.util import LazyMap
######################################################################
-# { Helper Functions
+#{ Helper Functions
######################################################################
# alternative name possibility: 'map_featurefunc()'?
if labeled is None:
labeled = toks and isinstance(toks[0], (tuple, list))
if labeled:
-
def lazy_func(labeled_token):
return (feature_func(labeled_token[0]), labeled_token[1])
-
return LazyMap(lazy_func, toks)
else:
return LazyMap(feature_func, toks)
-
def attested_labels(tokens):
"""
:return: A tuple of all labels that are attested in the given list of tokens.
"""
return tuple(set(label for (tok, label) in tokens))
-
def log_likelihood(classifier, gold):
results = classifier.prob_classify_many([fs for (fs, l) in gold])
ll = [pdist.prob(l) for ((fs, l), pdist) in zip(gold, results)]
return math.log(sum(ll) / len(ll))
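# Note: log_likelihood above returns the log of the *mean* gold-label
# probability, log((1/N) * sum_i P(l_i | fs_i)), not the mean of the
# per-example log-probabilities.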
-
def accuracy(classifier, gold):
results = classifier.classify_many([fs for (fs, l) in gold])
correct = [l == r for ((fs, l), r) in zip(gold, results)]
else:
return 0
-
class CutoffChecker(object):
"""
A helper class that implements cutoff checks based on number of
Accuracy cutoffs are also implemented, but they're almost never
a good idea to use.
"""
-
def __init__(self, cutoffs):
self.cutoffs = cutoffs.copy()
- if "min_ll" in cutoffs:
- cutoffs["min_ll"] = -abs(cutoffs["min_ll"])
- if "min_lldelta" in cutoffs:
- cutoffs["min_lldelta"] = abs(cutoffs["min_lldelta"])
+ if 'min_ll' in cutoffs:
+ cutoffs['min_ll'] = -abs(cutoffs['min_ll'])
+ if 'min_lldelta' in cutoffs:
+ cutoffs['min_lldelta'] = abs(cutoffs['min_lldelta'])
self.ll = None
self.acc = None
self.iter = 1
def check(self, classifier, train_toks):
cutoffs = self.cutoffs
self.iter += 1
- if "max_iter" in cutoffs and self.iter >= cutoffs["max_iter"]:
- return True # iteration cutoff.
+ if 'max_iter' in cutoffs and self.iter >= cutoffs['max_iter']:
+ return True # iteration cutoff.
new_ll = nltk.classify.util.log_likelihood(classifier, train_toks)
if math.isnan(new_ll):
return True
- if "min_ll" in cutoffs or "min_lldelta" in cutoffs:
- if "min_ll" in cutoffs and new_ll >= cutoffs["min_ll"]:
- return True # log likelihood cutoff
- if (
- "min_lldelta" in cutoffs
- and self.ll
- and ((new_ll - self.ll) <= abs(cutoffs["min_lldelta"]))
- ):
- return True # log likelihood delta cutoff
+ if 'min_ll' in cutoffs or 'min_lldelta' in cutoffs:
+ if 'min_ll' in cutoffs and new_ll >= cutoffs['min_ll']:
+ return True # log likelihood cutoff
+ if ('min_lldelta' in cutoffs and self.ll and
+ ((new_ll - self.ll) <= abs(cutoffs['min_lldelta']))):
+ return True # log likelihood delta cutoff
self.ll = new_ll
- if "max_acc" in cutoffs or "min_accdelta" in cutoffs:
- new_acc = nltk.classify.util.log_likelihood(classifier, train_toks)
- if "max_acc" in cutoffs and new_acc >= cutoffs["max_acc"]:
- return True # log likelihood cutoff
- if (
- "min_accdelta" in cutoffs
- and self.acc
- and ((new_acc - self.acc) <= abs(cutoffs["min_accdelta"]))
- ):
- return True # log likelihood delta cutoff
+ if 'max_acc' in cutoffs or 'min_accdelta' in cutoffs:
+ new_acc = nltk.classify.util.accuracy(
+ classifier, train_toks)
+ if 'max_acc' in cutoffs and new_acc >= cutoffs['max_acc']:
+ return True # accuracy cutoff
+ if ('min_accdelta' in cutoffs and self.acc and
+ ((new_acc - self.acc) <= abs(cutoffs['min_accdelta']))):
+ return True # accuracy delta cutoff
self.acc = new_acc
- return False # no cutoff reached.
-
+ return False # no cutoff reached.
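# A hypothetical usage sketch (mirrors how the iterative trainers in
# nltk.classify use this class): build it once from the cutoff dict, then
# poll it after every training iteration.
#
#     checker = CutoffChecker({'max_iter': 100, 'min_lldelta': 1e-4})
#     while not checker.check(classifier, train_toks):
#         pass  # run one more training iteration here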
######################################################################
-# { Demos
+#{ Demos
######################################################################
-
def names_demo_features(name):
features = {}
- features["alwayson"] = True
- features["startswith"] = name[0].lower()
- features["endswith"] = name[-1].lower()
- for letter in "abcdefghijklmnopqrstuvwxyz":
- features["count(%s)" % letter] = name.lower().count(letter)
- features["has(%s)" % letter] = letter in name.lower()
+ features['alwayson'] = True
+ features['startswith'] = name[0].lower()
+ features['endswith'] = name[-1].lower()
+ for letter in 'abcdefghijklmnopqrstuvwxyz':
+ features['count(%s)' % letter] = name.lower().count(letter)
+ features['has(%s)' % letter] = letter in name.lower()
return features
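# For example (deterministic given the code above):
#
# >>> feats = names_demo_features('Alice')
# >>> feats['alwayson'], feats['startswith'], feats['endswith']
# (True, 'a', 'e')
# >>> feats['count(l)'], feats['has(z)']
# (1, False)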
-
def binary_names_demo_features(name):
features = {}
- features["alwayson"] = True
- features["startswith(vowel)"] = name[0].lower() in "aeiouy"
- features["endswith(vowel)"] = name[-1].lower() in "aeiouy"
- for letter in "abcdefghijklmnopqrstuvwxyz":
- features["count(%s)" % letter] = name.lower().count(letter)
- features["has(%s)" % letter] = letter in name.lower()
- features["startswith(%s)" % letter] = letter == name[0].lower()
- features["endswith(%s)" % letter] = letter == name[-1].lower()
+ features['alwayson'] = True
+ features['startswith(vowel)'] = name[0].lower() in 'aeiouy'
+ features['endswith(vowel)'] = name[-1].lower() in 'aeiouy'
+ for letter in 'abcdefghijklmnopqrstuvwxyz':
+ features['count(%s)' % letter] = name.lower().count(letter)
+ features['has(%s)' % letter] = letter in name.lower()
+ features['startswith(%s)' % letter] = (letter == name[0].lower())
+ features['endswith(%s)' % letter] = (letter == name[-1].lower())
return features
-
def names_demo(trainer, features=names_demo_features):
from nltk.corpus import names
import random
# Construct a list of classified names, using the names corpus.
- namelist = [(name, "male") for name in names.words("male.txt")] + [
- (name, "female") for name in names.words("female.txt")
- ]
+ namelist = ([(name, 'male') for name in names.words('male.txt')] +
+ [(name, 'female') for name in names.words('female.txt')])
# Randomly split the names into a test & train set.
random.seed(123456)
test = namelist[5000:5500]
# Train up a classifier.
- print("Training classifier...")
- classifier = trainer([(features(n), g) for (n, g) in train])
+ print('Training classifier...')
+ classifier = trainer( [(features(n), g) for (n, g) in train] )
# Run the classifier on the test data.
- print("Testing classifier...")
+ print('Testing classifier...')
acc = accuracy(classifier, [(features(n), g) for (n, g) in test])
- print("Accuracy: %6.4f" % acc)
+ print('Accuracy: %6.4f' % acc)
# For classifiers that can find probabilities, show the log
# likelihood and some sample probability distributions.
try:
test_featuresets = [features(n) for (n, g) in test]
pdists = classifier.prob_classify_many(test_featuresets)
- ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
- print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
+ ll = [pdist.logprob(gold)
+ for ((name, gold), pdist) in zip(test, pdists)]
+ print('Avg. log likelihood: %6.4f' % (sum(ll) / len(test)))
print()
- print("Unseen Names P(Male) P(Female)\n" + "-" * 40)
+ print('Unseen Names P(Male) P(Female)\n'+'-'*40)
for ((name, gender), pdist) in list(zip(test, pdists))[:5]:
- if gender == "male":
- fmt = " %-15s *%6.4f %6.4f"
+ if gender == 'male':
+ fmt = ' %-15s *%6.4f %6.4f'
else:
- fmt = " %-15s %6.4f *%6.4f"
- print(fmt % (name, pdist.prob("male"), pdist.prob("female")))
+ fmt = ' %-15s %6.4f *%6.4f'
+ print(fmt % (name, pdist.prob('male'), pdist.prob('female')))
except NotImplementedError:
pass
# Return the classifier
return classifier
-
def partial_names_demo(trainer, features=names_demo_features):
from nltk.corpus import names
import random
- male_names = names.words("male.txt")
- female_names = names.words("female.txt")
+ male_names = names.words('male.txt')
+ female_names = names.words('female.txt')
random.seed(654321)
random.shuffle(male_names)
unlabeled = map(features, male_names[2000:2500] + female_names[:500])
# Create a test set with correctly-labeled male and female names
- test = [(name, True) for name in male_names[2500:2750]] + [
- (name, False) for name in female_names[500:750]
- ]
+ test = [(name, True) for name in male_names[2500:2750]] \
+ + [(name, False) for name in female_names[500:750]]
random.shuffle(test)
# Train up a classifier.
- print("Training classifier...")
+ print('Training classifier...')
classifier = trainer(positive, unlabeled)
# Run the classifier on the test data.
- print("Testing classifier...")
+ print('Testing classifier...')
acc = accuracy(classifier, [(features(n), m) for (n, m) in test])
- print("Accuracy: %6.4f" % acc)
+ print('Accuracy: %6.4f' % acc)
# For classifiers that can find probabilities, show the log
# likelihood and some sample probability distributions.
try:
test_featuresets = [features(n) for (n, m) in test]
pdists = classifier.prob_classify_many(test_featuresets)
- ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
- print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
+ ll = [pdist.logprob(gold)
+ for ((name, gold), pdist) in zip(test, pdists)]
+ print('Avg. log likelihood: %6.4f' % (sum(ll) / len(test)))
print()
- print("Unseen Names P(Male) P(Female)\n" + "-" * 40)
+ print('Unseen Names P(Male) P(Female)\n'+'-'*40)
for ((name, is_male), pdist) in list(zip(test, pdists))[:5]:
if is_male:
- fmt = " %-15s *%6.4f %6.4f"
+ fmt = ' %-15s *%6.4f %6.4f'
else:
- fmt = " %-15s %6.4f *%6.4f"
+ fmt = ' %-15s %6.4f *%6.4f'
print(fmt % (name, pdist.prob(True), pdist.prob(False)))
except NotImplementedError:
pass
# Return the classifier
return classifier
-
_inst_cache = {}
-
-
def wsd_demo(trainer, word, features, n=1000):
from nltk.corpus import senseval
import random
# Get the instances.
- print("Reading data...")
+ print('Reading data...')
global _inst_cache
if word not in _inst_cache:
_inst_cache[word] = [(i, i.senses[0]) for i in senseval.instances(word)]
if n > len(instances):
n = len(instances)
senses = list(set(l for (i, l) in instances))
- print(" Senses: " + " ".join(senses))
+ print(' Senses: ' + ' '.join(senses))
# Randomly split the names into a test & train set.
- print("Splitting into test & train...")
+ print('Splitting into test & train...')
random.seed(123456)
random.shuffle(instances)
- train = instances[: int(0.8 * n)]
- test = instances[int(0.8 * n) : n]
+ train = instances[:int(.8*n)]
+ test = instances[int(.8*n):n]
# Train up a classifier.
- print("Training classifier...")
+ print('Training classifier...')
classifier = trainer([(features(i), l) for (i, l) in train])
# Run the classifier on the test data.
- print("Testing classifier...")
+ print('Testing classifier...')
acc = accuracy(classifier, [(features(i), l) for (i, l) in test])
- print("Accuracy: %6.4f" % acc)
+ print('Accuracy: %6.4f' % acc)
# For classifiers that can find probabilities, show the log
# likelihood and some sample probability distributions.
try:
test_featuresets = [features(i) for (i, n) in test]
pdists = classifier.prob_classify_many(test_featuresets)
- ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
- print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
+ ll = [pdist.logprob(gold)
+ for ((name, gold), pdist) in zip(test, pdists)]
+ print('Avg. log likelihood: %6.4f' % (sum(ll) / len(test)))
except NotImplementedError:
pass
return classifier
-def check_megam_config():
+
+def check_megam_config():
"""
Checks whether the MEGAM binary is configured.
"""
try:
_megam_bin
except NameError:
- err_msg = str(
- "Please configure your megam binary first, e.g.\n"
- ">>> nltk.config_megam('/usr/bin/local/megam')"
- )
+ err_msg = str("Please configure your megam binary first, e.g.\n"
+ ">>> nltk.config_megam('/usr/bin/local/megam')")
raise NameError(err_msg)
# Natural Language Toolkit: Interface to Weka Classifiers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Classifiers that make use of the external 'Weka' package.
"""
-
+from __future__ import print_function
import time
import tempfile
import os
import zipfile
from sys import stdin
+from six import integer_types, string_types
+
from nltk.probability import DictionaryProbDist
from nltk.internals import java, config_java
from nltk.classify.api import ClassifierI
_weka_classpath = None
-_weka_search = [
- ".",
- "/usr/share/weka",
- "/usr/local/share/weka",
- "/usr/lib/weka",
- "/usr/local/lib/weka",
-]
-
-
+_weka_search = ['.',
+ '/usr/share/weka',
+ '/usr/local/share/weka',
+ '/usr/lib/weka',
+ '/usr/local/lib/weka',]
def config_weka(classpath=None):
global _weka_classpath
if _weka_classpath is None:
searchpath = _weka_search
- if "WEKAHOME" in os.environ:
- searchpath.insert(0, os.environ["WEKAHOME"])
+ if 'WEKAHOME' in os.environ:
+ searchpath.insert(0, os.environ['WEKAHOME'])
for path in searchpath:
- if os.path.exists(os.path.join(path, "weka.jar")):
- _weka_classpath = os.path.join(path, "weka.jar")
+ if os.path.exists(os.path.join(path, 'weka.jar')):
+ _weka_classpath = os.path.join(path, 'weka.jar')
version = _check_weka_version(_weka_classpath)
if version:
- print(
- ("[Found Weka: %s (version %s)]" % (_weka_classpath, version))
- )
+ print(('[Found Weka: %s (version %s)]' %
+ (_weka_classpath, version)))
else:
- print("[Found Weka: %s]" % _weka_classpath)
+ print('[Found Weka: %s]' % _weka_classpath)
_check_weka_version(_weka_classpath)
if _weka_classpath is None:
- raise LookupError(
- "Unable to find weka.jar! Use config_weka() "
- "or set the WEKAHOME environment variable. "
- "For more information about Weka, please see "
- "http://www.cs.waikato.ac.nz/ml/weka/"
- )
-
+ raise LookupError('Unable to find weka.jar! Use config_weka() '
+ 'or set the WEKAHOME environment variable. '
+ 'For more information about Weka, please see '
+ 'http://www.cs.waikato.ac.nz/ml/weka/')
def _check_weka_version(jar):
try:
return None
try:
try:
- return zf.read("weka/core/version.txt")
+ return zf.read('weka/core/version.txt')
except KeyError:
return None
finally:
zf.close()
-
class WekaClassifier(ClassifierI):
def __init__(self, formatter, model_filename):
self._formatter = formatter
self._model = model_filename
def prob_classify_many(self, featuresets):
- return self._classify_many(featuresets, ["-p", "0", "-distribution"])
+ return self._classify_many(featuresets, ['-p', '0', '-distribution'])
def classify_many(self, featuresets):
- return self._classify_many(featuresets, ["-p", "0"])
+ return self._classify_many(featuresets, ['-p', '0'])
def _classify_many(self, featuresets, options):
# Make sure we can find java & weka.
temp_dir = tempfile.mkdtemp()
try:
# Write the test data file.
- test_filename = os.path.join(temp_dir, "test.arff")
+ test_filename = os.path.join(temp_dir, 'test.arff')
self._formatter.write(test_filename, featuresets)
# Call weka to classify the data.
- cmd = [
- "weka.classifiers.bayes.NaiveBayes",
- "-l",
- self._model,
- "-T",
- test_filename,
- ] + options
- (stdout, stderr) = java(
- cmd,
- classpath=_weka_classpath,
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE,
- )
+ cmd = ['weka.classifiers.bayes.NaiveBayes',
+ '-l', self._model, '-T', test_filename] + options
+ (stdout, stderr) = java(cmd, classpath=_weka_classpath,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE)
# Check if something went wrong:
if stderr and not stdout:
- if "Illegal options: -distribution" in stderr:
- raise ValueError(
- "The installed version of weka does "
- "not support probability distribution "
- "output."
- )
+ if 'Illegal options: -distribution' in stderr:
+ raise ValueError('The installed version of weka does '
+ 'not support probability distribution '
+ 'output.')
else:
- raise ValueError("Weka failed to generate output:\n%s" % stderr)
+ raise ValueError('Weka failed to generate output:\n%s'
+ % stderr)
# Parse weka's output.
- return self.parse_weka_output(stdout.decode(stdin.encoding).split("\n"))
+ return self.parse_weka_output(stdout.decode(stdin.encoding).split('\n'))
finally:
for f in os.listdir(temp_dir):
os.remove(os.path.join(temp_dir, f))
os.rmdir(temp_dir)
def parse_weka_distribution(self, s):
- probs = [float(v) for v in re.split("[*,]+", s) if v.strip()]
+ probs = [float(v) for v in re.split('[*,]+', s) if v.strip()]
probs = dict(zip(self._formatter.labels(), probs))
return DictionaryProbDist(probs)
def parse_weka_output(self, lines):
# Strip unwanted text from stdout
- for i, line in enumerate(lines):
+ for i,line in enumerate(lines):
if line.strip().startswith("inst#"):
lines = lines[i:]
break
- if lines[0].split() == ["inst#", "actual", "predicted", "error", "prediction"]:
- return [line.split()[2].split(":")[1] for line in lines[1:] if line.strip()]
- elif lines[0].split() == [
- "inst#",
- "actual",
- "predicted",
- "error",
- "distribution",
- ]:
- return [
- self.parse_weka_distribution(line.split()[-1])
- for line in lines[1:]
- if line.strip()
- ]
+ if lines[0].split() == ['inst#', 'actual', 'predicted',
+ 'error', 'prediction']:
+ return [line.split()[2].split(':')[1]
+ for line in lines[1:] if line.strip()]
+ elif lines[0].split() == ['inst#', 'actual', 'predicted',
+ 'error', 'distribution']:
+ return [self.parse_weka_distribution(line.split()[-1])
+ for line in lines[1:] if line.strip()]
# is this safe?
- elif re.match(r"^0 \w+ [01]\.[0-9]* \?\s*$", lines[0]):
+ elif re.match(r'^0 \w+ [01]\.[0-9]* \?\s*$', lines[0]):
return [line.split()[1] for line in lines if line.strip()]
else:
for line in lines[:10]:
print(line)
- raise ValueError(
- "Unhandled output format -- your version "
- "of weka may not be supported.\n"
- " Header: %s" % lines[0]
- )
+ raise ValueError('Unhandled output format -- your version '
+ 'of weka may not be supported.\n'
+ ' Header: %s' % lines[0])
+
# [xx] full list of classifiers (some may be abstract?):
# ADTree, AODE, BayesNet, ComplementNaiveBayes, ConjunctiveRule,
# VotedPerceptron, Winnow, ZeroR
_CLASSIFIER_CLASS = {
- "naivebayes": "weka.classifiers.bayes.NaiveBayes",
- "C4.5": "weka.classifiers.trees.J48",
- "log_regression": "weka.classifiers.functions.Logistic",
- "svm": "weka.classifiers.functions.SMO",
- "kstar": "weka.classifiers.lazy.KStar",
- "ripper": "weka.classifiers.rules.JRip",
- }
-
+ 'naivebayes': 'weka.classifiers.bayes.NaiveBayes',
+ 'C4.5': 'weka.classifiers.trees.J48',
+ 'log_regression': 'weka.classifiers.functions.Logistic',
+ 'svm': 'weka.classifiers.functions.SMO',
+ 'kstar': 'weka.classifiers.lazy.KStar',
+ 'ripper': 'weka.classifiers.rules.JRip',
+ }
@classmethod
- def train(
- cls,
- model_filename,
- featuresets,
- classifier="naivebayes",
- options=[],
- quiet=True,
- ):
+ def train(cls, model_filename, featuresets,
+ classifier='naivebayes', options=[], quiet=True):
# Make sure we can find java & weka.
config_weka()
temp_dir = tempfile.mkdtemp()
try:
# Write the training data file.
- train_filename = os.path.join(temp_dir, "train.arff")
+ train_filename = os.path.join(temp_dir, 'train.arff')
formatter.write(train_filename, featuresets)
if classifier in cls._CLASSIFIER_CLASS:
elif classifier in cls._CLASSIFIER_CLASS.values():
javaclass = classifier
else:
- raise ValueError("Unknown classifier %s" % classifier)
+ raise ValueError('Unknown classifier %s' % classifier)
# Train the weka model.
- cmd = [javaclass, "-d", model_filename, "-t", train_filename]
+ cmd = [javaclass, '-d', model_filename, '-t', train_filename]
cmd += list(options)
if quiet:
stdout = subprocess.PIPE
- else:
- stdout = None
+ else: stdout = None
java(cmd, classpath=_weka_classpath, stdout=stdout)
# Return the new classifier.
def write(self, outfile, tokens):
"""Writes ARFF data to a file for the given data."""
- if not hasattr(outfile, "write"):
- outfile = open(outfile, "w")
+ if not hasattr(outfile, 'write'):
+ outfile = open(outfile, 'w')
outfile.write(self.format(tokens))
outfile.close()
for tok, label in tokens:
for (fname, fval) in tok.items():
if issubclass(type(fval), bool):
- ftype = "{True, False}"
- elif issubclass(type(fval), (int, float, bool)):
- ftype = "NUMERIC"
- elif issubclass(type(fval), str):
- ftype = "STRING"
+ ftype = '{True, False}'
+ elif issubclass(type(fval), (integer_types, float, bool)):
+ ftype = 'NUMERIC'
+ elif issubclass(type(fval), string_types):
+ ftype = 'STRING'
elif fval is None:
- continue # can't tell the type.
+ continue # can't tell the type.
else:
- raise ValueError("Unsupported value type %r" % ftype)
+ raise ValueError('Unsupported value type %r' % type(fval))
if features.get(fname, ftype) != ftype:
- raise ValueError("Inconsistent type for %s" % fname)
+ raise ValueError('Inconsistent type for %s' % fname)
features[fname] = ftype
features = sorted(features.items())
def header_section(self):
"""Returns an ARFF header as a string."""
# Header comment.
- s = (
- "% Weka ARFF file\n"
- + "% Generated automatically by NLTK\n"
- + "%% %s\n\n" % time.ctime()
- )
+ s = ('% Weka ARFF file\n' +
+ '% Generated automatically by NLTK\n' +
+ '%% %s\n\n' % time.ctime())
# Relation name
- s += "@RELATION rel\n\n"
+ s += '@RELATION rel\n\n'
# Input attribute specifications
for fname, ftype in self._features:
- s += "@ATTRIBUTE %-30r %s\n" % (fname, ftype)
+ s += '@ATTRIBUTE %-30r %s\n' % (fname, ftype)
# Label attribute specification
- s += "@ATTRIBUTE %-30r {%s}\n" % ("-label-", ",".join(self._labels))
+ s += '@ATTRIBUTE %-30r {%s}\n' % ('-label-', ','.join(self._labels))
return s
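# A sketch of the header this produces for two features and the labels
# male/female (timestamp abbreviated; the feature names are hypothetical):
#
#     % Weka ARFF file
#     % Generated automatically by NLTK
#     %% Sun Jan  1 00:00:00 2017
#
#     @RELATION rel
#
#     @ATTRIBUTE 'count(e)'                  NUMERIC
#     @ATTRIBUTE 'has(x)'                    {True, False}
#     @ATTRIBUTE '-label-'                   {male,female}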
tokens = [(tok, None) for tok in tokens]
# Data section
- s = "\n@DATA\n"
+ s = '\n@DATA\n'
for (tok, label) in tokens:
for fname, ftype in self._features:
- s += "%s," % self._fmt_arff_val(tok.get(fname))
- s += "%s\n" % self._fmt_arff_val(label)
+ s += '%s,' % self._fmt_arff_val(tok.get(fname))
+ s += '%s\n' % self._fmt_arff_val(label)
return s
def _fmt_arff_val(self, fval):
if fval is None:
- return "?"
- elif isinstance(fval, (bool, int)):
- return "%s" % fval
+ return '?'
+ elif isinstance(fval, (bool, integer_types)):
+ return '%s' % fval
elif isinstance(fval, float):
- return "%r" % fval
+ return '%r' % fval
else:
- return "%r" % fval
+ return '%r' % str(fval)
-if __name__ == "__main__":
+if __name__ == '__main__':
from nltk.classify.util import names_demo, binary_names_demo_features
-
def make_classifier(featuresets):
- return WekaClassifier.train("/tmp/name.model", featuresets, "C4.5")
-
+ return WekaClassifier.train('/tmp/name.model', featuresets,
+ 'C4.5')
classifier = names_demo(make_classifier, binary_names_demo_features)
+++ /dev/null
-# -*- coding: utf-8 -*-
-# Natural Language Toolkit: NLTK Command-Line Interface
-#
-# Copyright (C) 2001-2020 NLTK Project
-# URL: <http://nltk.org/>
-# For license information, see LICENSE.TXT
-
-
-from functools import partial
-from itertools import chain
-from tqdm import tqdm
-
-import click
-
-from nltk import word_tokenize
-from nltk.util import parallelize_preprocess
-
-CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])
-
-
-@click.group(context_settings=CONTEXT_SETTINGS)
-@click.version_option()
-def cli():
- pass
-
-
-@cli.command("tokenize")
-@click.option(
- "--language",
- "-l",
- default="en",
- help="The language for the Punkt sentence tokenization.",
-)
-@click.option(
- "--preserve-line",
- "-l",
- default=True,
- is_flag=True,
- help="An option to keep the preserve the sentence and not sentence tokenize it.",
-)
-@click.option("--processes", "-j", default=1, help="No. of processes.")
-@click.option("--encoding", "-e", default="utf8", help="Specify encoding of file.")
-@click.option(
- "--delimiter", "-d", default=" ", help="Specify delimiter to join the tokens."
-)
-def tokenize_file(language, preserve_line, processes, encoding, delimiter):
- """ This command tokenizes text stream using nltk.word_tokenize """
- with click.get_text_stream("stdin", encoding=encoding) as fin:
- with click.get_text_stream("stdout", encoding=encoding) as fout:
- # If it's single process, joblib parallization is slower,
- # so just process line by line normally.
- if processes == 1:
- for line in tqdm(fin.readlines()):
- print(delimiter.join(word_tokenize(line)), end="\n", file=fout)
- else:
- for outline in parallelize_preprocess(
- word_tokenize, fin.readlines(), processes, progress_bar=True
- ):
- print(delimiter.join(outline), end="\n", file=fout)
# Natural Language Toolkit: Clusterers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
efficiency when required.
"""
-from nltk.cluster.util import (
- VectorSpaceClusterer,
- Dendrogram,
- euclidean_distance,
- cosine_distance,
-)
+from nltk.cluster.util import (VectorSpaceClusterer, Dendrogram,
+ euclidean_distance, cosine_distance)
from nltk.cluster.kmeans import KMeansClusterer
from nltk.cluster.gaac import GAAClusterer
from nltk.cluster.em import EMClusterer
# Natural Language Toolkit: Clusterer Interfaces
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# Porting: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from abc import ABCMeta, abstractmethod
+from six import add_metaclass
from nltk.probability import DictionaryProbDist
-
-class ClusterI(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class ClusterI(object):
"""
Interface covering basic clustering functionality.
"""
-
@abstractmethod
def cluster(self, vectors, assign_clusters=False):
"""
# Natural Language Toolkit: Expectation Maximization Clusterer
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
-
+from __future__ import print_function, unicode_literals
try:
import numpy
except ImportError:
pass
+from nltk.compat import python_2_unicode_compatible
from nltk.cluster.util import VectorSpaceClusterer
-
+@python_2_unicode_compatible
class EMClusterer(VectorSpaceClusterer):
"""
The Gaussian EM clusterer models the vectors as being produced by
the likelihood of the data does not significantly increase.
"""
- def __init__(
- self,
- initial_means,
- priors=None,
- covariance_matrices=None,
- conv_threshold=1e-6,
- bias=0.1,
- normalise=False,
- svd_dimensions=None,
- ):
+ def __init__(self, initial_means, priors=None, covariance_matrices=None,
+ conv_threshold=1e-6, bias=0.1, normalise=False,
+ svd_dimensions=None):
"""
Creates an EM clusterer with the given starting parameters,
convergence threshold and vector mangling parameters.
means = self._means
priors = self._priors
if not priors:
- priors = self._priors = (
- numpy.ones(self._num_clusters, numpy.float64) / self._num_clusters
- )
+ priors = self._priors = numpy.ones(self._num_clusters,
+ numpy.float64) / self._num_clusters
covariances = self._covariance_matrices
if not covariances:
- covariances = self._covariance_matrices = [
- numpy.identity(dimensions, numpy.float64)
- for i in range(self._num_clusters)
- ]
+ covariances = self._covariance_matrices = \
+ [ numpy.identity(dimensions, numpy.float64)
+ for i in range(self._num_clusters) ]
# do the E and M steps until the likelihood plateaus
lastl = self._loglikelihood(vectors, priors, means, covariances)
converged = False
while not converged:
- if trace:
- print("iteration; loglikelihood", lastl)
+ if trace: print('iteration; loglikelihood', lastl)
# E-step, calculate hidden variables, h[i,j]
- h = numpy.zeros((len(vectors), self._num_clusters), numpy.float64)
+ h = numpy.zeros((len(vectors), self._num_clusters),
+ numpy.float64)
for i in range(len(vectors)):
for j in range(self._num_clusters):
- h[i, j] = priors[j] * self._gaussian(
- means[j], covariances[j], vectors[i]
- )
- h[i, :] /= sum(h[i, :])
+ h[i,j] = priors[j] * self._gaussian(means[j],
+ covariances[j], vectors[i])
+ h[i,:] /= sum(h[i,:])
# M-step, update parameters - cvm, p, mean
for j in range(self._num_clusters):
covariance_before = covariances[j]
- new_covariance = numpy.zeros((dimensions, dimensions), numpy.float64)
+ new_covariance = numpy.zeros((dimensions, dimensions),
+ numpy.float64)
new_mean = numpy.zeros(dimensions, numpy.float64)
sum_hj = 0.0
for i in range(len(vectors)):
delta = vectors[i] - means[j]
- new_covariance += h[i, j] * numpy.multiply.outer(delta, delta)
- sum_hj += h[i, j]
- new_mean += h[i, j] * vectors[i]
+ new_covariance += h[i,j] * \
+ numpy.multiply.outer(delta, delta)
+ sum_hj += h[i,j]
+ new_mean += h[i,j] * vectors[i]
covariances[j] = new_covariance / sum_hj
means[j] = new_mean / sum_hj
priors[j] = sum_hj / len(vectors)
# bias term to stop covariance matrix being singular
- covariances[j] += self._bias * numpy.identity(dimensions, numpy.float64)
+ covariances[j] += self._bias * \
+ numpy.identity(dimensions, numpy.float64)
# calculate likelihood - FIXME: may be broken
l = self._loglikelihood(vectors, priors, means, covariances)
def classify_vectorspace(self, vector):
best = None
for j in range(self._num_clusters):
- p = self._priors[j] * self._gaussian(
- self._means[j], self._covariance_matrices[j], vector
- )
+ p = self._priors[j] * self._gaussian(self._means[j],
+ self._covariance_matrices[j], vector)
if not best or p > best[0]:
best = (p, j)
return best[1]
def likelihood_vectorspace(self, vector, cluster):
cid = self.cluster_names().index(cluster)
- return self._priors[cluster] * self._gaussian(
- self._means[cluster], self._covariance_matrices[cluster], vector
- )
+ return self._priors[cluster] * self._gaussian(self._means[cluster],
+ self._covariance_matrices[cluster], vector)
def _gaussian(self, mean, cvm, x):
m = len(mean)
- assert cvm.shape == (m, m), "bad sized covariance matrix, %s" % str(cvm.shape)
+ assert cvm.shape == (m, m), \
+ 'bad sized covariance matrix, %s' % str(cvm.shape)
try:
det = numpy.linalg.det(cvm)
inv = numpy.linalg.inv(cvm)
a = det ** -0.5 * (2 * numpy.pi) ** (-m / 2.0)
dx = x - mean
- b = -0.5 * numpy.dot(numpy.dot(dx, inv), dx)
+ b = -0.5 * numpy.dot( numpy.dot(dx, inv), dx)
return a * numpy.exp(b)
except OverflowError:
# happens when the exponent is negative infinity - i.e. b = 0
for vector in vectors:
p = 0
for j in range(len(priors)):
- p += priors[j] * self._gaussian(means[j], covariances[j], vector)
+ p += priors[j] * \
+ self._gaussian(means[j], covariances[j], vector)
llh += numpy.log(p)
return llh
def __repr__(self):
- return "<EMClusterer means=%s>" % list(self._means)
-
+ return '<EMClusterer means=%s>' % list(self._means)
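# Sanity check of the density computed by _gaussian above (an editorial
# sketch, not part of the patch): at the mean of a 2-D standard normal
# the pdf equals 1/(2*pi).
#
#     import numpy
#     m, mean, x = 2, numpy.zeros(2), numpy.zeros(2)
#     cvm = numpy.identity(2)
#     a = numpy.linalg.det(cvm) ** -0.5 * (2 * numpy.pi) ** (-m / 2.0)
#     b = -0.5 * numpy.dot(numpy.dot(x - mean, numpy.linalg.inv(cvm)), x - mean)
#     assert abs(a * numpy.exp(b) - 1 / (2 * numpy.pi)) < 1e-12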
def demo():
"""
clusterer = cluster.EMClusterer(means, bias=0.1)
clusters = clusterer.cluster(vectors, True, trace=True)
- print("Clustered:", vectors)
- print("As: ", clusters)
+ print('Clustered:', vectors)
+ print('As: ', clusters)
print()
for c in range(2):
- print("Cluster:", c)
- print("Prior: ", clusterer._priors[c])
- print("Mean: ", clusterer._means[c])
- print("Covar: ", clusterer._covariance_matrices[c])
+ print('Cluster:', c)
+ print('Prior: ', clusterer._priors[c])
+ print('Mean: ', clusterer._means[c])
+ print('Covar: ', clusterer._covariance_matrices[c])
print()
# classify a new vector
vector = numpy.array([2, 2])
- print("classify(%s):" % vector, end=" ")
+ print('classify(%s):' % vector, end=' ')
print(clusterer.classify(vector))
# show the classification probabilities
vector = numpy.array([2, 2])
- print("classification_probdist(%s):" % vector)
+ print('classification_probdist(%s):' % vector)
pdist = clusterer.classification_probdist(vector)
for sample in pdist.samples():
- print("%s => %.0f%%" % (sample, pdist.prob(sample) * 100))
+ print('%s => %.0f%%' % (sample,
+ pdist.prob(sample) *100))
-if __name__ == "__main__":
+#
+# The following demo code is broken.
+#
+# # use a set of tokens with 2D indices
+# vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]
+
+# # test the EM clusterer with means given by k-means (2) and
+# # dimensionality reduction
+# clusterer = cluster.KMeans(2, euclidean_distance, svd_dimensions=1)
+# print 'Clusterer:', clusterer
+# clusters = clusterer.cluster(vectors)
+# means = clusterer.means()
+# print 'Means:', clusterer.means()
+# print
+
+# clusterer = cluster.EMClusterer(means, svd_dimensions=1)
+# clusters = clusterer.cluster(vectors, True)
+# print 'Clusterer:', clusterer
+# print 'Clustered:', str(vectors)[:60], '...'
+# print 'As:', str(clusters)[:60], '...'
+# print
+
+# # classify a new vector
+# vector = numpy.array([3, 3])
+# print 'classify(%s):' % vector,
+# print clusterer.classify(vector)
+# print
+
+# # show the classification probabilities
+# vector = numpy.array([2.2, 2])
+# print 'classification_probdist(%s)' % vector
+# pdist = clusterer.classification_probdist(vector)
+# for sample in pdist:
+# print '%s => %.0f%%' % (sample, pdist.prob(sample) *100)
+
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: Group Average Agglomerative Clusterer
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals, division
try:
import numpy
pass
from nltk.cluster.util import VectorSpaceClusterer, Dendrogram, cosine_distance
+from nltk.compat import python_2_unicode_compatible
-
+@python_2_unicode_compatible
class GAAClusterer(VectorSpaceClusterer):
"""
The Group Average Agglomerative clusterer starts with each of the N vectors as singleton
def cluster(self, vectors, assign_clusters=False, trace=False):
# stores the merge order
self._dendrogram = Dendrogram(
- [numpy.array(vector, numpy.float64) for vector in vectors]
- )
+ [numpy.array(vector, numpy.float64) for vector in vectors])
return VectorSpaceClusterer.cluster(self, vectors, assign_clusters, trace)
def cluster_vectorspace(self, vectors, trace=False):
# variables describing the initial situation
N = len(vectors)
- cluster_len = [1] * N
+ cluster_len = [1]*N
cluster_count = N
index_map = numpy.arange(N)
# construct the similarity matrix
dims = (N, N)
- dist = numpy.ones(dims, dtype=numpy.float) * numpy.inf
+ dist = numpy.ones(dims, dtype=numpy.float)*numpy.inf
for i in range(N):
- for j in range(i + 1, N):
+ for j in range(i+1, N):
dist[i, j] = cosine_distance(vectors[i], vectors[j])
while cluster_count > max(self._num_clusters, 1):
dist[j, :] = numpy.inf
# merge the clusters
- cluster_len[i] = cluster_len[i] + cluster_len[j]
+ cluster_len[i] = cluster_len[i]+cluster_len[j]
self._dendrogram.merge(index_map[i], index_map[j])
cluster_count -= 1
# update the index map to reflect the indexes if we
# had removed j
- index_map[j + 1 :] -= 1
+ index_map[j+1:] -= 1
index_map[j] = N
self.update_clusters(self._num_clusters)
# number of points in the clusters i and j
i_weight = cluster_len[i]
j_weight = cluster_len[j]
- weight_sum = i_weight + j_weight
+ weight_sum = i_weight+j_weight
# update for x<i
- dist[:i, i] = dist[:i, i] * i_weight + dist[:i, j] * j_weight
+ dist[:i, i] = dist[:i, i]*i_weight + dist[:i, j]*j_weight
dist[:i, i] /= weight_sum
# update for i<x<j
- dist[i, i + 1 : j] = (
- dist[i, i + 1 : j] * i_weight + dist[i + 1 : j, j] * j_weight
- )
+ dist[i, i+1:j] = dist[i, i+1:j]*i_weight + dist[i+1:j, j]*j_weight
# update for i<j<x
- dist[i, j + 1 :] = dist[i, j + 1 :] * i_weight + dist[j, j + 1 :] * j_weight
- dist[i, i + 1 :] /= weight_sum
+ dist[i, j+1:] = dist[i, j+1:]*i_weight + dist[j, j+1:]*j_weight
+ dist[i, i+1:] /= weight_sum
def update_clusters(self, num_clusters):
clusters = self._dendrogram.groups(num_clusters)
return self._num_clusters
def __repr__(self):
- return "<GroupAverageAgglomerative Clusterer n=%d>" % self._num_clusters
-
+ return '<GroupAverageAgglomerative Clusterer n=%d>' % self._num_clusters
def demo():
"""
clusterer = GAAClusterer(4)
clusters = clusterer.cluster(vectors, True)
- print("Clusterer:", clusterer)
- print("Clustered:", vectors)
- print("As:", clusters)
+ print('Clusterer:', clusterer)
+ print('Clustered:', vectors)
+ print('As:', clusters)
print()
# show the dendrogram
# classify a new vector
vector = numpy.array([3, 3])
- print("classify(%s):" % vector, end=" ")
+ print('classify(%s):' % vector, end=' ')
print(clusterer.classify(vector))
print()
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
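# A short usage sketch for GAAClusterer, assuming the dendrogram() accessor
# and Dendrogram.show() rendering implied by the demo above:
import numpy
from nltk.cluster import GAAClusterer

vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]
clusterer = GAAClusterer(4)
clusters = clusterer.cluster(vectors, assign_clusters=True)
clusterer.dendrogram().show()  # ASCII rendering of the merge order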
# Natural Language Toolkit: K-Means Clusterer
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals, division
import copy
import random
from nltk.cluster.util import VectorSpaceClusterer
+from nltk.compat import python_2_unicode_compatible
+@python_2_unicode_compatible
class KMeansClusterer(VectorSpaceClusterer):
"""
The K-means clusterer starts with k arbitrarily chosen means, then allocates
commonly occurring output means are chosen.
"""
- def __init__(
- self,
- num_means,
- distance,
- repeats=1,
- conv_test=1e-6,
- initial_means=None,
- normalise=False,
- svd_dimensions=None,
- rng=None,
- avoid_empty_clusters=False,
- ):
+ def __init__(self, num_means, distance, repeats=1,
+ conv_test=1e-6, initial_means=None,
+ normalise=False, svd_dimensions=None,
+ rng=None, avoid_empty_clusters=False):
"""
:param num_means: the number of means to use (may use fewer)
assert repeats >= 1
assert not (initial_means and repeats > 1)
self._repeats = repeats
- self._rng = rng if rng else random.Random()
+ self._rng = (rng if rng else random.Random())
self._avoid_empty_clusters = avoid_empty_clusters
def cluster_vectorspace(self, vectors, trace=False):
if self._means and self._repeats > 1:
- print("Warning: means will be discarded for subsequent trials")
+ print('Warning: means will be discarded for subsequent trials')
meanss = []
for trial in range(self._repeats):
- if trace:
- print("k-means trial", trial)
+ if trace: print('k-means trial', trial)
if not self._means or trial > 1:
self._means = self._rng.sample(list(vectors), self._num_means)
self._cluster_vectorspace(vectors, trace)
index = self.classify_vectorspace(vector)
clusters[index].append(vector)
- if trace:
- print("iteration")
- # for i in range(self._num_means):
- # print ' mean', i, 'allocated', len(clusters[i]), 'vectors'
+ if trace: print('iteration')
+ #for i in range(self._num_means):
+ #print ' mean', i, 'allocated', len(clusters[i]), 'vectors'
# recalculate cluster means by computing the centroid of each cluster
new_means = list(map(self._centroid, clusters, self._means))
centroid = copy.copy(mean)
for vector in cluster:
centroid += vector
- return centroid / (1 + len(cluster))
+ return centroid / (1+len(cluster))
else:
if not len(cluster):
- sys.stderr.write("Error: no centroid defined for empty cluster.\n")
- sys.stderr.write(
- "Try setting argument 'avoid_empty_clusters' to True\n"
- )
- assert False
+ sys.stderr.write('Error: no centroid defined for empty cluster.\n')
+ sys.stderr.write('Try setting argument \'avoid_empty_clusters\' to True\n')
+ assert(False)
centroid = copy.copy(cluster[0])
for vector in cluster[1:]:
centroid += vector
return centroid / len(cluster)
def __repr__(self):
- return "<KMeansClusterer means=%s repeats=%d>" % (self._means, self._repeats)
-
+ return '<KMeansClusterer means=%s repeats=%d>' % \
+ (self._means, self._repeats)
#################################################################################
-
def demo():
# example from figure 14.9, page 517, Manning and Schutze
clusterer = KMeansClusterer(2, euclidean_distance, initial_means=means)
clusters = clusterer.cluster(vectors, True, trace=True)
- print("Clustered:", vectors)
- print("As:", clusters)
- print("Means:", clusterer.means())
+ print('Clustered:', vectors)
+ print('As:', clusters)
+ print('Means:', clusterer.means())
print()
vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]
clusterer = KMeansClusterer(2, euclidean_distance, repeats=10)
clusters = clusterer.cluster(vectors, True)
- print("Clustered:", vectors)
- print("As:", clusters)
- print("Means:", clusterer.means())
+ print('Clustered:', vectors)
+ print('As:', clusters)
+ print('Means:', clusterer.means())
print()
# classify a new vector
vector = numpy.array([3, 3])
- print("classify(%s):" % vector, end=" ")
+ print('classify(%s):' % vector, end=' ')
print(clusterer.classify(vector))
print()
-
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
+
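# A sketch of reproducible k-means runs, assuming the rng parameter in the
# constructor above accepts any random.Random-like object:
import random
import numpy
from nltk.cluster import KMeansClusterer, euclidean_distance

vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]
clusterer = KMeansClusterer(2, euclidean_distance, repeats=10,
                            rng=random.Random(42))  # seeded for repeatability
print(clusterer.cluster(vectors, assign_clusters=True))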
# Natural Language Toolkit: Clusterer Utilities
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# Contributor: J Richard Snape
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals, division
from abc import abstractmethod
import copy
pass
from nltk.cluster.api import ClusterI
+from nltk.compat import python_2_unicode_compatible
class VectorSpaceClusterer(ClusterI):
Optionally performs singular value decomposition to reduce the
dimensionality.
"""
-
def __init__(self, normalise=False, svd_dimensions=None):
"""
:param normalise: should vectors be normalised to length 1
# use SVD to reduce the dimensionality
if self._svd_dimensions and self._svd_dimensions < len(vectors[0]):
- [u, d, vt] = numpy.linalg.svd(numpy.transpose(numpy.array(vectors)))
- S = d[: self._svd_dimensions] * numpy.identity(
- self._svd_dimensions, numpy.float64
- )
- T = u[:, : self._svd_dimensions]
- Dt = vt[: self._svd_dimensions, :]
+ [u, d, vt] = numpy.linalg.svd(numpy.transpose(
+ numpy.array(vectors)))
+ S = d[:self._svd_dimensions] * \
+ numpy.identity(self._svd_dimensions, numpy.float64)
+ T = u[:, :self._svd_dimensions]
+ Dt = vt[:self._svd_dimensions, :]
vectors = numpy.transpose(numpy.dot(S, Dt))
self._Tt = numpy.transpose(T)
Returns the likelihood of the vector belonging to the cluster.
"""
predicted = self.classify_vectorspace(vector)
- return 1.0 if cluster == predicted else 0.0
+ return (1.0 if cluster == predicted else 0.0)
def vector(self, vector):
"""
Returns 1 minus the cosine of the angle between vectors v and u. This is
equal to 1 - (u.v / |u||v|).
"""
- return 1 - (numpy.dot(u, v) / (sqrt(numpy.dot(u, u)) * sqrt(numpy.dot(v, v))))
+ return 1 - (numpy.dot(u, v) / (
+ sqrt(numpy.dot(u, u)) * sqrt(numpy.dot(v, v))))
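# A worked example of cosine_distance: for u = [1, 0] and v = [1, 1],
# u.v = 1 and |u||v| = sqrt(2), so the distance is 1 - 1/sqrt(2) ~= 0.293;
# parallel vectors give 0.0 and orthogonal vectors give 1.0.
import numpy
from nltk.cluster.util import cosine_distance

print(cosine_distance(numpy.array([1, 0]), numpy.array([1, 1])))  # ~0.2929
print(cosine_distance(numpy.array([1, 0]), numpy.array([0, 1])))  # 1.0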
class _DendrogramNode(object):
return cosine_distance(self._value, comparator._value) < 0
+@python_2_unicode_compatible
class Dendrogram(object):
"""
Represents a dendrogram, a tree with a specified branching order. This
"""
# ASCII rendering characters
- JOIN, HLINK, VLINK = "+", "-", "|"
+ JOIN, HLINK, VLINK = '+', '-', '|'
# find the root (or create one)
if len(self._items) > 1:
rhalf = int(width - lhalf - 1)
# display functions
- def format(centre, left=" ", right=" "):
- return "%s%s%s" % (lhalf * left, centre, right * rhalf)
+ def format(centre, left=' ', right=' '):
+ return '%s%s%s' % (lhalf*left, centre, right*rhalf)
def display(str):
stdout.write(str)
# for each merge, top down
queue = [(root._value, root)]
- verticals = [format(" ") for leaf in leaves]
+ verticals = [format(' ') for leaf in leaves]
while queue:
priority, node = queue.pop()
- child_left_leaf = list(map(lambda c: c.leaves(False)[0], node._children))
+ child_left_leaf = list(map(
+ lambda c: c.leaves(False)[0], node._children))
indices = list(map(leaves.index, child_left_leaf))
if child_left_leaf:
min_idx = min(indices)
for i in range(len(leaves)):
if leaves[i] in child_left_leaf:
if i == min_idx:
- display(format(JOIN, " ", HLINK))
+ display(format(JOIN, ' ', HLINK))
elif i == max_idx:
- display(format(JOIN, HLINK, " "))
+ display(format(JOIN, HLINK, ' '))
else:
display(format(JOIN, HLINK, HLINK))
verticals[i] = format(VLINK)
display(format(HLINK, HLINK, HLINK))
else:
display(verticals[i])
- display("\n")
+ display('\n')
for child in node._children:
if child._children:
queue.append((child._value, child))
for vertical in verticals:
display(vertical)
- display("\n")
+ display('\n')
# finally, display the last line
- display("".join(item.center(width) for item in last_row))
- display("\n")
+ display(''.join(item.center(width) for item in last_row))
+ display('\n')
def __repr__(self):
if len(self._items) > 1:
else:
root = self._items[0]
leaves = root.leaves(False)
- return "<Dendrogram with %d leaves>" % len(leaves)
+ return '<Dendrogram with %d leaves>' % len(leaves)
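# A sketch of the Dendrogram API used by the clusterers above: items become
# leaves, merge() joins them by index, and groups(n) cuts the tree into n
# groups (the leaf_labels argument to show() is assumed from the rendering
# code above):
import numpy
from nltk.cluster.util import Dendrogram

leaves = [numpy.array(v, numpy.float64) for v in [[1, 0], [1, 1], [0, 1]]]
d = Dendrogram(leaves)
d.merge(0, 1)                        # join the first two vectors
print(d.groups(2))                   # e.g. the merged pair and the odd one out
d.show(leaf_labels=['u', 'v', 'w'])  # ASCII tree via JOIN/HLINK/VLINK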
# Natural Language Toolkit: Collections
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function, absolute_import
+import locale
+import re
+import types
+import textwrap
+import pydoc
import bisect
-from itertools import islice, chain
+import os
+from itertools import islice, chain, combinations
from functools import total_ordering
-
-# this unused import is for python 2.7
from collections import defaultdict, deque, Counter
+from six import text_type
+
from nltk.internals import slice_bounds, raise_unorderable_types
+from nltk.compat import python_2_unicode_compatible
##########################################################################
# Ordered Dictionary
##########################################################################
-
class OrderedDict(dict):
def __init__(self, data=None, **kwargs):
- self._keys = self.keys(data, kwargs.get("keys"))
- self._default_factory = kwargs.get("default_factory")
+ self._keys = self.keys(data, kwargs.get('keys'))
+ self._default_factory = kwargs.get('default_factory')
if data is None:
dict.__init__(self)
else:
assert len(data) == len(keys)
return keys
else:
- assert (
- isinstance(data, dict)
- or isinstance(data, OrderedDict)
- or isinstance(data, list)
- )
+ assert isinstance(data, dict) or \
+ isinstance(data, OrderedDict) or \
+ isinstance(data, list)
if isinstance(data, dict) or isinstance(data, OrderedDict):
return data.keys()
elif isinstance(data, list):
return [key for (key, value) in data]
- elif "_keys" in self.__dict__:
+ elif '_keys' in self.__dict__:
return self._keys
else:
return []
# returns iterator under python 3
return map(self.get, self._keys)
-
######################################################################
# Lazy Sequences
######################################################################
-
@total_ordering
+@python_2_unicode_compatible
class AbstractLazySequence(object):
"""
An abstract base class for read-only sequences whose values are
Subclasses are required to define two methods: ``__len__()``
and ``iterate_from()``.
"""
-
def __len__(self):
"""
Return the number of tokens in the corpus file underlying this
corpus view.
"""
- raise NotImplementedError("should be implemented by subclass")
+ raise NotImplementedError('should be implemented by subclass')
def iterate_from(self, start):
"""
``start``. If ``start>=len(self)``, then this iterator will
generate no tokens.
"""
- raise NotImplementedError("should be implemented by subclass")
+ raise NotImplementedError('should be implemented by subclass')
def __getitem__(self, i):
"""
return LazySubsequence(self, start, stop)
else:
# Handle negative indices
- if i < 0:
- i += len(self)
- if i < 0:
- raise IndexError("index out of range")
+ if i < 0: i += len(self)
+ if i < 0: raise IndexError('index out of range')
# Use iterate_from to extract it.
try:
return next(self.iterate_from(i))
except StopIteration:
- raise IndexError("index out of range")
+ raise IndexError('index out of range')
def __iter__(self):
"""Return an iterator that generates the tokens in the corpus
def count(self, value):
"""Return the number of times this list contains ``value``."""
- return sum(1 for elt in self if elt == value)
+ return sum(1 for elt in self if elt==value)
def index(self, value, start=None, stop=None):
"""Return the index of the first occurrence of ``value`` in this
slice bounds -- i.e., they count from the end of the list."""
start, stop = slice_bounds(self, slice(start, stop))
for i, elt in enumerate(islice(self, start, stop)):
- if elt == value:
- return i + start
- raise ValueError("index(x): x not in list")
+ if elt == value: return i+start
+ raise ValueError('index(x): x not in list')
def __contains__(self, value):
"""Return true if this list contains ``value``."""
return LazyConcatenation([self] * count)
_MAX_REPR_SIZE = 60
-
def __repr__(self):
"""
Return a string representation for this corpus view that is
pieces.append(repr(elt))
length += len(pieces[-1]) + 2
if length > self._MAX_REPR_SIZE and len(pieces) > 2:
- return "[%s, ...]" % ", ".join(pieces[:-1])
- return "[%s]" % ", ".join(pieces)
+ return '[%s, ...]' % text_type(', ').join(pieces[:-1])
+ return '[%s]' % text_type(', ').join(pieces)
def __eq__(self, other):
- return type(self) == type(other) and list(self) == list(other)
+ return (type(self) == type(other) and list(self) == list(other))
def __ne__(self, other):
return not self == other
"""
:raise ValueError: Corpus view objects are unhashable.
"""
- raise ValueError("%s objects are unhashable" % self.__class__.__name__)
+ raise ValueError('%s objects are unhashable' %
+ self.__class__.__name__)
class LazySubsequence(AbstractLazySequence):
of a list) or greater than the length of ``source``.
"""
# If the slice is small enough, just use a tuple.
- if stop - start < cls.MIN_SIZE:
- return list(islice(source.iterate_from(start), stop - start))
+ if stop-start < cls.MIN_SIZE:
+ return list(islice(source.iterate_from(start), stop-start))
else:
return object.__new__(cls)
return self._stop - self._start
def iterate_from(self, start):
- return islice(
- self._source.iterate_from(start + self._start), max(0, len(self) - start)
- )
+ return islice(self._source.iterate_from(start+self._start),
+ max(0, len(self)-start))
class LazyConcatenation(AbstractLazySequence):
between offsets in the concatenated lists and offsets in the
sublists.
"""
-
def __init__(self, list_of_lists):
self._list = list_of_lists
self._offsets = [0]
def __len__(self):
if len(self._offsets) <= len(self._list):
- for tok in self.iterate_from(self._offsets[-1]):
- pass
+ for tok in self.iterate_from(self._offsets[-1]): pass
return self._offsets[-1]
def iterate_from(self, start_index):
if start_index < self._offsets[-1]:
- sublist_index = bisect.bisect_right(self._offsets, start_index) - 1
+ sublist_index = bisect.bisect_right(self._offsets, start_index)-1
else:
- sublist_index = len(self._offsets) - 1
+ sublist_index = len(self._offsets)-1
index = self._offsets[sublist_index]
sublist_iter = islice(self._list, sublist_index, None)
for sublist in sublist_iter:
- if sublist_index == (len(self._offsets) - 1):
- assert (
- index + len(sublist) >= self._offsets[-1]
- ), "offests not monotonic increasing!"
- self._offsets.append(index + len(sublist))
+ if sublist_index == (len(self._offsets)-1):
+ assert index+len(sublist) >= self._offsets[-1], (
+ 'offsets not monotonically increasing!')
+ self._offsets.append(index+len(sublist))
else:
- assert self._offsets[sublist_index + 1] == index + len(
- sublist
- ), "inconsistent list value (num elts)"
+ assert self._offsets[sublist_index+1] == index+len(sublist), (
+ 'inconsistent list value (num elts)')
- for value in sublist[max(0, start_index - index) :]:
+ for value in sublist[max(0, start_index-index):]:
yield value
index += len(sublist)
using a ``LazyMap`` can significantly reduce memory usage when
training and running classifiers.
"""
-
def __init__(self, function, *lists, **config):
"""
:param function: The function that should be applied to
by this lazy map. (default=5)
"""
if not lists:
- raise TypeError("LazyMap requires at least two args")
+ raise TypeError('LazyMap requires at least two args')
self._lists = lists
self._func = function
- self._cache_size = config.get("cache_size", 5)
- self._cache = {} if self._cache_size > 0 else None
+ self._cache_size = config.get('cache_size', 5)
+ self._cache = ({} if self._cache_size > 0 else None)
# If you just take bool() of sum() here, _all_lazy will be true
# whenever at least one list is an AbstractLazySequence, which is
# presumably not what's intended.
- self._all_lazy = sum(
- isinstance(lst, AbstractLazySequence) for lst in lists
- ) == len(lists)
+ self._all_lazy = sum(isinstance(lst, AbstractLazySequence)
+ for lst in lists) == len(lists)
def iterate_from(self, index):
# Special case: one lazy sublist
# Special case: one non-lazy sublist
elif len(self._lists) == 1:
while True:
- try:
- yield self._func(self._lists[0][index])
- except IndexError:
- return
+ try: yield self._func(self._lists[0][index])
+ except IndexError: return
index += 1
# Special case: n lazy sublists
while True:
elements = []
for iterator in iterators:
- try:
- elements.append(next(iterator))
- except: # FIXME: What is this except really catching? StopIteration?
- elements.append(None)
+ try: elements.append(next(iterator))
+ except: elements.append(None)
if elements == [None] * len(self._lists):
return
yield self._func(*elements)
# general case
else:
while True:
- try:
- elements = [lst[index] for lst in self._lists]
+ try: elements = [lst[index] for lst in self._lists]
except IndexError:
elements = [None] * len(self._lists)
for i, lst in enumerate(self._lists):
- try:
- elements[i] = lst[index]
- except IndexError:
- pass
+ try: elements[i] = lst[index]
+ except IndexError: pass
if elements == [None] * len(self._lists):
return
yield self._func(*elements)
return LazyMap(self._func, *sliced_lists)
else:
# Handle negative indices
- if index < 0:
- index += len(self)
- if index < 0:
- raise IndexError("index out of range")
+ if index < 0: index += len(self)
+ if index < 0: raise IndexError('index out of range')
# Check the cache
if self._cache is not None and index in self._cache:
return self._cache[index]
# Calculate the value
- try:
- val = next(self.iterate_from(index))
+ try: val = next(self.iterate_from(index))
except StopIteration:
- raise IndexError("index out of range")
+ raise IndexError('index out of range')
# Update the cache
if self._cache is not None:
if len(self._cache) > self._cache_size:
- self._cache.popitem() # discard random entry
+ self._cache.popitem() # discard random entry
self._cache[index] = val
# Return the value
return val
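# A usage sketch for LazyMap as motivated above, assuming this module is
# importable as nltk.collections: the function is applied on demand (with a
# small cache), so the mapped list is never materialised in memory.
from nltk.collections import LazyMap

tokens = ['the', 'quick', 'brown', 'fox'] * 10000
features = LazyMap(lambda w: {'length': len(w)}, tokens)
print(features[1])    # computed on first access: {'length': 5}
print(len(features))  # delegates to the underlying list: 40000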
avoiding the creation of an additional long sequence, memory usage can be
significantly reduced.
"""
-
def __init__(self, *lists):
"""
:param lists: the underlying lists
"""
LazyZip.__init__(self, range(len(lst)), lst)
-
class LazyIteratorList(AbstractLazySequence):
"""
Wraps an iterator, loading its elements on demand
and making them subscriptable.
__repr__ displays only the first few elements.
"""
-
def __init__(self, it, known_len=None):
self._it = it
self._len = known_len
def iterate_from(self, start):
"""Create a new iterator over this list starting at the given offset."""
- while len(self._cache) < start:
+ while len(self._cache)<start:
v = next(self._it)
self._cache.append(v)
i = start
- while i < len(self._cache):
+ while i<len(self._cache):
yield self._cache[i]
i += 1
while True:
"""Return a list concatenating other with self."""
return type(self)(chain(other, self))
-
######################################################################
# Trie Implementation
######################################################################
-class Trie(dict):
+class Trie(defaultdict):
"""A Trie implementation for strings"""
-
LEAF = True
def __init__(self, strings=None):
- """Builds a Trie object, which is built around a ``dict``
+ """Builds a Trie object, which is built around a ``defaultdict``
If ``strings`` is provided, it will add the ``strings``, which
consist of a ``list`` of ``strings``, to the Trie.
:type strings: list(str)
"""
- super(Trie, self).__init__()
+ defaultdict.__init__(self, Trie)
if strings:
for string in strings:
self.insert(string)
:Example:
>>> from nltk.collections import Trie
- >>> trie = Trie(["abc", "def"])
- >>> expected = {'a': {'b': {'c': {True: None}}}, \
- 'd': {'e': {'f': {True: None}}}}
- >>> trie == expected
- True
+ >>> trie = Trie(["ab"])
+ >>> trie
+ defaultdict(<class 'nltk.collections.Trie'>, {'a': defaultdict(<class 'nltk.collections.Trie'>, {'b': defaultdict(<class 'nltk.collections.Trie'>, {True: None})})})
"""
if len(string):
# mark that the string is complete
self[Trie.LEAF] = None
- def __missing__(self, key):
- self[key] = Trie()
- return self[key]
+ def __str__(self):
+ return str(self.as_dict())
+
+ def as_dict(self, d=None):
+ """Convert ``defaultdict`` to common ``dict`` representation.
+
+ :param d: A defaultdict containing strings mapped to nested defaultdicts.
+ This is the structure of the trie. (Default is None)
+ :type d: defaultdict(str -> defaultdict)
+ :return: The ``dict`` representation of the ``Trie``. Although
+ ``defaultdict`` is a subclass of ``dict`` and could be passed to
+ ``dict()``, that only converts the top level of this nested
+ ``defaultdict``, so a recursive conversion is used to strip the
+ ``defaultdict(<class 'nltk.collections.Trie'>, ...`` wrappers.
+ :rtype: dict(str -> dict(bool -> None))
+ Note: there can be an arbitrarily deeply nested
+ ``dict(str -> dict(str -> dict(..))``, but the last
+ level will have ``dict(str -> dict(bool -> None))``
+
+ :Example:
+
+ >>> from nltk.collections import Trie
+ >>> trie = Trie(["abc", "def"])
+ >>> expected = {'a': {'b': {'c': {True: None}}}, 'd': {'e': {'f': {True: None}}}}
+ >>> trie.as_dict() == expected
+ True
+
+ """
+ def _default_to_regular(d):
+ """
+ Source: http://stackoverflow.com/a/26496899/4760801
+
+ :param d: Nested ``defaultdict`` to convert to regular ``dict``
+ :type d: defaultdict(str -> defaultdict(...))
+ :return: A dict representation of the defaultdict
+ :rtype: dict(str -> dict(str -> ...))
+
+ :Example:
+
+ >>> from collections import defaultdict
+ >>> d = defaultdict(defaultdict)
+ >>> d["one"]["two"] = "three"
+ >>> d
+ defaultdict(<type 'collections.defaultdict'>, {'one': defaultdict(None, {'two': 'three'})})
+ >>> _default_to_regular(d)
+ {'one': {'two': 'three'}}
+
+ """
+ if isinstance(d, defaultdict):
+ d = {k: _default_to_regular(v) for k, v in d.items()}
+ return d
+
+ return _default_to_regular(self)
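# A usage sketch for the Trie above, assuming this module is importable as
# nltk.collections: insert() marks complete strings with the Trie.LEAF key,
# and as_dict() strips the defaultdict wrappers.
from nltk.collections import Trie

trie = Trie(['cat', 'car'])
print(Trie.LEAF in trie['c']['a']['t'])  # True: 'cat' is a complete string
print(Trie.LEAF in trie['c']['a'])       # False: 'ca' is only a prefix
print(trie.as_dict())                    # plain nested dicts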
# Natural Language Toolkit: Collocations and Association Measures
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Joel Nothman <jnothman@student.usyd.edu.au>
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT
ngram given appropriate frequency counts. A number of standard association
measures are provided in bigram_measures and trigram_measures.
"""
+from __future__ import print_function
# Possible TODOs:
# - consider the distinction between f(x,_) and f(x) and whether our
# and unigram counts (raw_freq, pmi, student_t)
import itertools as _itertools
+from six import iteritems
from nltk.probability import FreqDist
from nltk.util import ngrams
-
-# these two unused imports are referenced in collocations.doctest
-from nltk.metrics import (
- ContingencyMeasures,
- BigramAssocMeasures,
- TrigramAssocMeasures,
- QuadgramAssocMeasures,
-)
+from nltk.metrics import ContingencyMeasures, BigramAssocMeasures, TrigramAssocMeasures
from nltk.metrics.spearman import ranks_from_scores, spearman_correlation
self.ngram_fd = ngram_fd
@classmethod
- def _build_new_documents(
- cls, documents, window_size, pad_left=False, pad_right=False, pad_symbol=None
- ):
- """
+ def _build_new_documents(cls, documents, window_size, pad_left=False, pad_right=False, pad_symbol=None):
+ '''
Pad the document with the placeholder according to the window_size
- """
+ '''
padding = (pad_symbol,) * (window_size - 1)
if pad_right:
- return _itertools.chain.from_iterable(
- _itertools.chain(doc, padding) for doc in documents
- )
+ return _itertools.chain.from_iterable(_itertools.chain(doc, padding) for doc in documents)
if pad_left:
- return _itertools.chain.from_iterable(
- _itertools.chain(padding, doc) for doc in documents
- )
+ return _itertools.chain.from_iterable(_itertools.chain(padding, doc) for doc in documents)
@classmethod
def from_documents(cls, documents):
"""Constructs a collocation finder given a collection of documents,
each of which is a list (or iterable) of tokens.
"""
- # return cls.from_words(_itertools.chain(*documents))
- return cls.from_words(
- cls._build_new_documents(documents, cls.default_ws, pad_right=True)
- )
+ #return cls.from_words(_itertools.chain(*documents))
+ return cls.from_words(cls._build_new_documents(documents, cls.default_ws, pad_right=True))
@staticmethod
def _ngram_freqdist(words, n):
- return FreqDist(tuple(words[i : i + n]) for i in range(len(words) - 1))
+ return FreqDist(tuple(words[i:i + n]) for i in range(len(words) - 1))
def _apply_filter(self, fn=lambda ngram, freq: False):
"""Generic filter removes ngrams from the frequency distribution
if the function returns True when passed an ngram tuple.
"""
tmp_ngram = FreqDist()
- for ngram, freq in self.ngram_fd.items():
+ for ngram, freq in iteritems(self.ngram_fd):
if not fn(ngram, freq):
tmp_ngram[ngram] = freq
self.ngram_fd = tmp_ngram
association measures. It is often useful to use from_words() rather than
constructing an instance directly.
"""
-
default_ws = 2
def __init__(self, word_fd, bigram_fd, window_size=2):
association measures. It is often useful to use from_words() rather than
constructing an instance directly.
"""
-
default_ws = 3
def __init__(self, word_fd, bigram_fd, wildcard_fd, trigram_fd):
n_ixx = self.word_fd[w1]
n_xix = self.word_fd[w2]
n_xxi = self.word_fd[w3]
- return score_fn(n_iii, (n_iix, n_ixi, n_xii), (n_ixx, n_xix, n_xxi), n_all)
+ return score_fn(n_iii,
+ (n_iix, n_ixi, n_xii),
+ (n_ixx, n_xix, n_xxi),
+ n_all)
class QuadgramCollocationFinder(AbstractCollocationFinder):
"""A tool for the finding and ranking of quadgram collocations or other association measures.
It is often useful to use from_words() rather than constructing an instance directly.
"""
-
default_ws = 4
def __init__(self, word_fd, quadgram_fd, ii, iii, ixi, ixxi, iixi, ixii):
n_xixx = self.word_fd[w2]
n_xxix = self.word_fd[w3]
n_xxxi = self.word_fd[w4]
- return score_fn(
- n_iiii,
- (n_iiix, n_iixi, n_ixii, n_xiii),
- (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix),
- (n_ixxx, n_xixx, n_xxix, n_xxxi),
- n_all,
- )
+ return score_fn(n_iiii,
+ (n_iiix, n_iixi, n_ixii, n_xiii),
+ (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix),
+ (n_ixxx, n_xixx, n_xxix, n_xxxi),
+ n_all)
def demo(scorer=None, compare_scorer=None):
"""Finds bigram collocations in the files of the WebText corpus."""
- from nltk.metrics import (
- BigramAssocMeasures,
- spearman_correlation,
- ranks_from_scores,
- )
+ from nltk.metrics import BigramAssocMeasures, spearman_correlation, ranks_from_scores
if scorer is None:
scorer = BigramAssocMeasures.likelihood_ratio
from nltk.corpus import stopwords, webtext
- ignored_words = stopwords.words("english")
+ ignored_words = stopwords.words('english')
word_filter = lambda w: len(w) < 3 or w.lower() in ignored_words
for file in webtext.fileids():
- words = [word.lower() for word in webtext.words(file)]
+ words = [word.lower()
+ for word in webtext.words(file)]
cf = BigramCollocationFinder.from_words(words)
cf.apply_freq_filter(3)
cf.apply_word_filter(word_filter)
- corr = spearman_correlation(
- ranks_from_scores(cf.score_ngrams(scorer)),
- ranks_from_scores(cf.score_ngrams(compare_scorer)),
- )
+ corr = spearman_correlation(ranks_from_scores(cf.score_ngrams(scorer)),
+ ranks_from_scores(cf.score_ngrams(compare_scorer)))
print(file)
- print("\t", [" ".join(tup) for tup in cf.nbest(scorer, 15)])
- print("\t Correlation to %s: %0.4f" % (compare_scorer.__name__, corr))
-
+ print('\t', [' '.join(tup) for tup in cf.nbest(scorer, 15)])
+ print('\t Correlation to %s: %0.4f' % (compare_scorer.__name__, corr))
# Slows down loading too much
# bigram_measures = BigramAssocMeasures()
# trigram_measures = TrigramAssocMeasures()
-if __name__ == "__main__":
+if __name__ == '__main__':
import sys
from nltk.metrics import BigramAssocMeasures
try:
- scorer = eval("BigramAssocMeasures." + sys.argv[1])
+ scorer = eval('BigramAssocMeasures.' + sys.argv[1])
except IndexError:
scorer = None
try:
- compare_scorer = eval("BigramAssocMeasures." + sys.argv[2])
+ compare_scorer = eval('BigramAssocMeasures.' + sys.argv[2])
except IndexError:
compare_scorer = None
demo(scorer, compare_scorer)
-__all__ = [
- "BigramCollocationFinder",
- "TrigramCollocationFinder",
- "QuadgramCollocationFinder",
-]
+__all__ = ['BigramCollocationFinder',
+ 'TrigramCollocationFinder', 'QuadgramCollocationFinder']
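# A short sketch of the finder API above, assuming the genesis corpus has
# been downloaded: from_words() builds the frequency distributions and
# nbest() ranks ngrams by the chosen association measure.
from nltk.corpus import genesis
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

finder = BigramCollocationFinder.from_words(genesis.words('english-web.txt'))
finder.apply_freq_filter(3)  # ignore bigrams seen fewer than 3 times
print(finder.nbest(BigramAssocMeasures.likelihood_ratio, 10))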
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Compatibility
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import absolute_import, print_function
import os
-from functools import wraps
+import sys
+from functools import update_wrapper, wraps
+import fractions
+import unicodedata
+
+from six import string_types, text_type
+
+# Python 2/3 compatibility layer. Based on six.
+
+PY3 = sys.version_info[0] == 3
+
+if PY3:
+ def get_im_class(meth):
+ return meth.__self__.__class__
+
+ import io
+ StringIO = io.StringIO
+ BytesIO = io.BytesIO
+
+ from datetime import timezone
+ UTC = timezone.utc
+
+ from tempfile import TemporaryDirectory
+
+else:
+ def get_im_class(meth):
+ return meth.im_class
+
+ try:
+ from cStringIO import StringIO
+ except ImportError:
+ from StringIO import StringIO
+ BytesIO = StringIO
+
+ from datetime import tzinfo, timedelta
+
+ ZERO = timedelta(0)
+ HOUR = timedelta(hours=1)
+
+ # A UTC class for python 2.7
+ class UTC(tzinfo):
+ """UTC"""
+
+ def utcoffset(self, dt):
+ return ZERO
+
+ def tzname(self, dt):
+ return "UTC"
+
+ def dst(self, dt):
+ return ZERO
+
+ UTC = UTC()
+
+ import csv
+ import codecs
+ import cStringIO
+
+ class UnicodeWriter:
+ """
+ A CSV writer which will write rows to CSV file "f",
+ which is encoded in the given encoding.
+ see https://docs.python.org/2/library/csv.html
+ """
+
+ def __init__(self, f, dialect=csv.excel, encoding="utf-8",
+ errors='replace', **kwds):
+ # Redirect output to a queue
+ self.queue = cStringIO.StringIO()
+ self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
+ self.stream = f
+ encoder_cls = codecs.getincrementalencoder(encoding)
+ self.encoder = encoder_cls(errors=errors)
+
+ def encode(self, data):
+ if isinstance(data, string_types):
+ return data.encode("utf-8")
+ else:
+ return data
+
+ def writerow(self, row):
+ self.writer.writerow([self.encode(s) for s in row])
+ # Fetch UTF-8 output from the queue ...
+ data = self.queue.getvalue()
+ data = data.decode("utf-8")
+ # ... and reencode it into the target encoding
+ data = self.encoder.encode(data, 'replace')
+ # write to the target stream
+ self.stream.write(data)
+ # empty queue
+ self.queue.truncate(0)
+
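# A usage sketch for UnicodeWriter (Python 2 only; 'rows.csv' is just an
# illustrative filename):
#
#     with open('rows.csv', 'wb') as f:
#         writer = UnicodeWriter(f, encoding='utf-8')
#         writer.writerow([u'caf\xe9', u'3'])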
+ import warnings as _warnings
+ import os as _os
+ from tempfile import mkdtemp
+
+ class TemporaryDirectory(object):
+ """Create and return a temporary directory. This has the same
+ behavior as mkdtemp but can be used as a context manager. For
+ example:
+
+ with TemporaryDirectory() as tmpdir:
+ ...
+
+ Upon exiting the context, the directory and everything contained
+ in it are removed.
+
+ http://stackoverflow.com/questions/19296146/tempfile-temporarydirectory-context-manager-in-python-2-7
+ """
+
+ def __init__(self, suffix="", prefix="tmp", dir=None):
+ self._closed = False
+ self.name = None # Handle mkdtemp raising an exception
+ self.name = mkdtemp(suffix, prefix, dir)
+
+ def __repr__(self):
+ return "<{} {!r}>".format(self.__class__.__name__, self.name)
+
+ def __enter__(self):
+ return self.name
+
+ def cleanup(self, _warn=False):
+ if self.name and not self._closed:
+ try:
+ self._rmtree(self.name)
+ except (TypeError, AttributeError) as ex:
+ # Issue #10188: Emit a warning on stderr
+ # if the directory could not be cleaned
+ # up due to missing globals
+ if "None" not in str(ex):
+ raise
+ print("ERROR: {!r} while cleaning up {!r}".format(ex,
+ self),
+ file=sys.stderr)
+ return
+ self._closed = True
+ if _warn:
+ self._warn("Implicitly cleaning up {!r}".format(self),
+ Warning)
+
+ def __exit__(self, exc, value, tb):
+ self.cleanup()
+
+ def __del__(self):
+ # Issue a Warning if implicit cleanup needed
+ self.cleanup(_warn=True)
+
+ # XXX (ncoghlan): The following code attempts to make
+ # this class tolerant of the module nulling out process
+ # that happens during CPython interpreter shutdown
+ # Alas, it doesn't actually manage it. See issue #10188
+ _listdir = staticmethod(_os.listdir)
+ _path_join = staticmethod(_os.path.join)
+ _isdir = staticmethod(_os.path.isdir)
+ _islink = staticmethod(_os.path.islink)
+ _remove = staticmethod(_os.remove)
+ _rmdir = staticmethod(_os.rmdir)
+ _warn = _warnings.warn
+
+ def _rmtree(self, path):
+ # Essentially a stripped down version of shutil.rmtree. We can't
+ # use globals because they may be None'ed out at shutdown.
+ for name in self._listdir(path):
+ fullname = self._path_join(path, name)
+ try:
+ isdir = (self._isdir(fullname) and not
+ self._islink(fullname))
+ except OSError:
+ isdir = False
+ if isdir:
+ self._rmtree(fullname)
+ else:
+ try:
+ self._remove(fullname)
+ except OSError:
+ pass
+ try:
+ self._rmdir(path)
+ except OSError:
+ pass
# ======= Compatibility for datasets that care about Python versions ========
# The following datasets have a /PY3 subdirectory containing
# a full copy of the data which has been re-encoded or repickled.
-DATA_UPDATES = [
- ("chunkers", "maxent_ne_chunker"),
- ("help", "tagsets"),
- ("taggers", "maxent_treebank_pos_tagger"),
- ("tokenizers", "punkt"),
-]
+DATA_UPDATES = [("chunkers", "maxent_ne_chunker"),
+ ("help", "tagsets"),
+ ("taggers", "maxent_treebank_pos_tagger"),
+ ("tokenizers", "punkt")]
_PY3_DATA_UPDATES = [os.path.join(*path_list) for path_list in DATA_UPDATES]
+
def add_py3_data(path):
- for item in _PY3_DATA_UPDATES:
- if item in str(path) and "/PY3" not in str(path):
- pos = path.index(item) + len(item)
- if path[pos : pos + 4] == ".zip":
- pos += 4
- path = path[:pos] + "/PY3" + path[pos:]
- break
+ if PY3:
+ for item in _PY3_DATA_UPDATES:
+ if item in str(path) and "/PY3" not in str(path):
+ pos = path.index(item) + len(item)
+ if path[pos:pos + 4] == ".zip":
+ pos += 4
+ path = path[:pos] + "/PY3" + path[pos:]
+ break
return path
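# A worked example of the rewrite above, assuming a POSIX-style path:
# "tokenizers/punkt" matches an entry of _PY3_DATA_UPDATES and ".zip" is
# skipped over before "/PY3" is spliced in, so under Python 3
#     add_py3_data('corpora/tokenizers/punkt.zip/english.pickle')
# returns 'corpora/tokenizers/punkt.zip/PY3/english.pickle'.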
def _decorator(*args, **kwargs):
args = (args[0], add_py3_data(args[1])) + args[2:]
return init_func(*args, **kwargs)
-
return wraps(init_func)(_decorator)
+
+
+# ======= Compatibility layer for __str__ and __repr__ ==========
+def remove_accents(text):
+
+ if isinstance(text, bytes):
+ text = text.decode('ascii')
+
+ category = unicodedata.category # this gives a small (~10%) speedup
+ return ''.join(
+ c for c in unicodedata.normalize('NFKD', text) if category(c) != 'Mn'
+ )
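# For example, NFKD decomposition splits base characters from their
# combining marks (category 'Mn'), which are then dropped:
#
#     remove_accents(u'caf\xe9')       # -> u'cafe'
#     remove_accents(u'Zu\u0308rich')  # -> u'Zurich'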
+
+
+# Select the best transliteration method:
+try:
+ # Older versions of Unidecode are licensed under Artistic License;
+ # assume an older version is installed.
+ from unidecode import unidecode as transliterate
+except ImportError:
+ try:
+ # text-unidecode implementation is worse than Unidecode
+ # implementation so Unidecode is preferred.
+ from text_unidecode import unidecode as transliterate
+ except ImportError:
+ # This transliteration method should be enough
+ # for many Western languages.
+ transliterate = remove_accents
+
+
+def python_2_unicode_compatible(klass):
+ """
+ This decorator defines __unicode__ method and fixes
+ __repr__ and __str__ methods under Python 2.
+
+ To support Python 2 and 3 with a single code base,
+ define __str__ and __repr__ methods returning unicode
+ text and apply this decorator to the class.
+
+ Original __repr__ and __str__ would be available
+ as unicode_repr and __unicode__ (under both Python 2
+ and Python 3).
+ """
+
+ if not issubclass(klass, object):
+ raise ValueError("This decorator doesn't work for old-style classes")
+
+ # both __unicode__ and unicode_repr are public because they
+ # may be useful in console under Python 2.x
+
+ # if __str__ or __repr__ are not overridden in a subclass,
+ # they may be already fixed by this decorator in a parent class
+ # and we shouldn't fix them again
+
+ if not _was_fixed(klass.__str__):
+ klass.__unicode__ = klass.__str__
+ if not PY3:
+ klass.__str__ = _7bit(_transliterated(klass.__unicode__))
+
+ if not _was_fixed(klass.__repr__):
+ klass.unicode_repr = klass.__repr__
+ if not PY3:
+ klass.__repr__ = _7bit(klass.unicode_repr)
+
+ return klass
+
+
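# A sketch of the decorator contract described above (Token is a made-up
# class): define __str__/__repr__ once, returning text, and the decorator
# keeps them byte-safe under Python 2.
@python_2_unicode_compatible
class Token(object):
    def __init__(self, text):
        self.text = text
    def __str__(self):
        return self.text  # becomes __unicode__ under Python 2
    def __repr__(self):
        return 'Token(%r)' % self.text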
+def unicode_repr(obj):
+ """
+ For classes that were fixed with @python_2_unicode_compatible,
+ ``unicode_repr`` returns ``obj.unicode_repr()``; for unicode strings
+ the result is returned without the "u" prefix (to make output the
+ same under Python 2.x and Python 3.x); for other variables
+ it is the same as ``repr``.
+ """
+ if PY3:
+ return repr(obj)
+
+ # Python 2.x
+ if hasattr(obj, 'unicode_repr'):
+ return obj.unicode_repr()
+
+ if isinstance(obj, text_type):
+ return repr(obj)[1:] # strip the "u" prefix from the output
+
+ return repr(obj)
+
+
+def _transliterated(method):
+ def wrapper(self):
+ return transliterate(method(self))
+
+ update_wrapper(wrapper, method, ["__name__", "__doc__"])
+ if hasattr(method, "_nltk_compat_7bit"):
+ wrapper._nltk_compat_7bit = method._nltk_compat_7bit
+
+ wrapper._nltk_compat_transliterated = True
+ return wrapper
+
+
+def _7bit(method):
+ def wrapper(self):
+ return method(self).encode('ascii', 'backslashreplace')
+
+ update_wrapper(wrapper, method, ["__name__", "__doc__"])
+
+ if hasattr(method, "_nltk_compat_transliterated"):
+ wrapper._nltk_compat_transliterated = (
+ method._nltk_compat_transliterated
+ )
+
+ wrapper._nltk_compat_7bit = True
+ return wrapper
+
+
+def _was_fixed(method):
+ return (getattr(method, "_nltk_compat_7bit", False) or
+ getattr(method, "_nltk_compat_transliterated", False))
+
+
+class Fraction(fractions.Fraction):
+ """
+ This is a simplified backwards compatible version of fractions.Fraction
+ from Python >=3.5. It adds the `_normalize` parameter such that it does
+ not normalize the denominator to the Greatest Common Divisor (gcd) when
+ the numerator is 0.
+
+ This is most probably only used by nltk.translate.bleu_score, where the
+ numerator and denominator of the different ngram precisions are mutable.
+ But the idea of a "mutable" fraction might not be applicable to other
+ usages; see http://stackoverflow.com/questions/34561265
+
+ This object should be deprecated once NLTK stops supporting Python < 3.5.
+ See https://github.com/nltk/nltk/issues/1330
+ """
+ def __new__(cls, numerator=0, denominator=None, _normalize=True):
+ cls = super(Fraction, cls).__new__(cls, numerator, denominator)
+ # To emulate fraction.Fraction.from_float across Python >=2.7,
+ # check that numerator is an integer and denominator is not None.
+ if not _normalize and type(numerator) == int and denominator:
+ cls._numerator = numerator
+ cls._denominator = denominator
+ return cls
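# A worked example of the _normalize flag described above: with
# normalisation disabled and a zero numerator, the denominator survives,
# which is what the mutable precision fractions in bleu_score rely on.
print(Fraction(0, 9))                                # 0 (normalised to 0/1)
print(Fraction(0, 9, _normalize=False).denominator)  # 9 (preserved)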
# Natural Language Toolkit: Corpus Readers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from nltk.corpus.reader import *
abc = LazyCorpusLoader(
- "abc",
- PlaintextCorpusReader,
- r"(?!\.).*\.txt",
- encoding=[("science", "latin_1"), ("rural", "utf8")],
-)
-alpino = LazyCorpusLoader("alpino", AlpinoCorpusReader, tagset="alpino")
+ 'abc', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding=[
+ ('science', 'latin_1'),
+ ('rural', 'utf8')])
+alpino = LazyCorpusLoader(
+ 'alpino', AlpinoCorpusReader, tagset='alpino')
brown = LazyCorpusLoader(
- "brown",
- CategorizedTaggedCorpusReader,
- r"c[a-z]\d\d",
- cat_file="cats.txt",
- tagset="brown",
- encoding="ascii",
-)
+ 'brown', CategorizedTaggedCorpusReader, r'c[a-z]\d\d',
+ cat_file='cats.txt', tagset='brown', encoding="ascii")
cess_cat = LazyCorpusLoader(
- "cess_cat",
- BracketParseCorpusReader,
- r"(?!\.).*\.tbf",
- tagset="unknown",
- encoding="ISO-8859-15",
-)
+ 'cess_cat', BracketParseCorpusReader, r'(?!\.).*\.tbf',
+ tagset='unknown', encoding='ISO-8859-15')
cess_esp = LazyCorpusLoader(
- "cess_esp",
- BracketParseCorpusReader,
- r"(?!\.).*\.tbf",
- tagset="unknown",
- encoding="ISO-8859-15",
-)
-cmudict = LazyCorpusLoader("cmudict", CMUDictCorpusReader, ["cmudict"])
-comtrans = LazyCorpusLoader("comtrans", AlignedCorpusReader, r"(?!\.).*\.txt")
+ 'cess_esp', BracketParseCorpusReader, r'(?!\.).*\.tbf',
+ tagset='unknown', encoding='ISO-8859-15')
+cmudict = LazyCorpusLoader(
+ 'cmudict', CMUDictCorpusReader, ['cmudict'])
+comtrans = LazyCorpusLoader(
+ 'comtrans', AlignedCorpusReader, r'(?!\.).*\.txt')
comparative_sentences = LazyCorpusLoader(
- "comparative_sentences",
- ComparativeSentencesCorpusReader,
- r"labeledSentences\.txt",
- encoding="latin-1",
-)
+ 'comparative_sentences', ComparativeSentencesCorpusReader, r'labeledSentences\.txt',
+ encoding='latin-1')
conll2000 = LazyCorpusLoader(
- "conll2000",
- ConllChunkCorpusReader,
- ["train.txt", "test.txt"],
- ("NP", "VP", "PP"),
- tagset="wsj",
- encoding="ascii",
-)
+ 'conll2000', ConllChunkCorpusReader,
+ ['train.txt', 'test.txt'], ('NP','VP','PP'),
+ tagset='wsj', encoding='ascii')
conll2002 = LazyCorpusLoader(
- "conll2002",
- ConllChunkCorpusReader,
- ".*\.(test|train).*",
- ("LOC", "PER", "ORG", "MISC"),
- encoding="utf-8",
-)
+ 'conll2002', ConllChunkCorpusReader, '.*\.(test|train).*',
+ ('LOC', 'PER', 'ORG', 'MISC'), encoding='utf-8')
conll2007 = LazyCorpusLoader(
- "conll2007",
- DependencyCorpusReader,
- ".*\.(test|train).*",
- encoding=[("eus", "ISO-8859-2"), ("esp", "utf8")],
-)
-crubadan = LazyCorpusLoader("crubadan", CrubadanCorpusReader, ".*\.txt")
+ 'conll2007', DependencyCorpusReader, '.*\.(test|train).*', encoding=[
+ ('eus', 'ISO-8859-2'),
+ ('esp', 'utf8')])
+crubadan = LazyCorpusLoader(
+ 'crubadan', CrubadanCorpusReader, '.*\.txt')
dependency_treebank = LazyCorpusLoader(
- "dependency_treebank", DependencyCorpusReader, ".*\.dp", encoding="ascii"
-)
+ 'dependency_treebank', DependencyCorpusReader, '.*\.dp',
+ encoding='ascii')
floresta = LazyCorpusLoader(
- "floresta",
- BracketParseCorpusReader,
- r"(?!\.).*\.ptb",
- "#",
- tagset="unknown",
- encoding="ISO-8859-15",
-)
+ 'floresta', BracketParseCorpusReader, r'(?!\.).*\.ptb', '#',
+ tagset='unknown', encoding='ISO-8859-15')
framenet15 = LazyCorpusLoader(
- "framenet_v15",
- FramenetCorpusReader,
- [
- "frRelation.xml",
- "frameIndex.xml",
- "fulltextIndex.xml",
- "luIndex.xml",
- "semTypes.xml",
- ],
-)
+ 'framenet_v15', FramenetCorpusReader, ['frRelation.xml','frameIndex.xml','fulltextIndex.xml','luIndex.xml','semTypes.xml'])
framenet = LazyCorpusLoader(
- "framenet_v17",
- FramenetCorpusReader,
- [
- "frRelation.xml",
- "frameIndex.xml",
- "fulltextIndex.xml",
- "luIndex.xml",
- "semTypes.xml",
- ],
-)
+ 'framenet_v17', FramenetCorpusReader, ['frRelation.xml','frameIndex.xml','fulltextIndex.xml','luIndex.xml','semTypes.xml'])
gazetteers = LazyCorpusLoader(
- "gazetteers", WordListCorpusReader, r"(?!LICENSE|\.).*\.txt", encoding="ISO-8859-2"
-)
+ 'gazetteers', WordListCorpusReader, r'(?!LICENSE|\.).*\.txt',
+ encoding='ISO-8859-2')
genesis = LazyCorpusLoader(
- "genesis",
- PlaintextCorpusReader,
- r"(?!\.).*\.txt",
- encoding=[
- ("finnish|french|german", "latin_1"),
- ("swedish", "cp865"),
- (".*", "utf_8"),
- ],
-)
+ 'genesis', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding=[
+ ('finnish|french|german', 'latin_1'),
+ ('swedish', 'cp865'),
+ ('.*', 'utf_8')])
gutenberg = LazyCorpusLoader(
- "gutenberg", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="latin1"
-)
-ieer = LazyCorpusLoader("ieer", IEERCorpusReader, r"(?!README|\.).*")
+ 'gutenberg', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding='latin1')
+ieer = LazyCorpusLoader(
+ 'ieer', IEERCorpusReader, r'(?!README|\.).*')
inaugural = LazyCorpusLoader(
- "inaugural", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="latin1"
-)
+ 'inaugural', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding='latin1')
# [XX] This should probably just use TaggedCorpusReader:
indian = LazyCorpusLoader(
- "indian", IndianCorpusReader, r"(?!\.).*\.pos", tagset="unknown", encoding="utf8"
-)
+ 'indian', IndianCorpusReader, r'(?!\.).*\.pos',
+ tagset='unknown', encoding='utf8')
-jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*\.chasen", encoding="utf-8")
-knbc = LazyCorpusLoader("knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp")
-lin_thesaurus = LazyCorpusLoader("lin_thesaurus", LinThesaurusCorpusReader, r".*\.lsp")
+jeita = LazyCorpusLoader(
+ 'jeita', ChasenCorpusReader, r'.*\.chasen', encoding='utf-8')
+knbc = LazyCorpusLoader(
+ 'knbc/corpus1', KNBCorpusReader, r'.*/KN.*', encoding='euc-jp')
+lin_thesaurus = LazyCorpusLoader(
+ 'lin_thesaurus', LinThesaurusCorpusReader, r'.*\.lsp')
mac_morpho = LazyCorpusLoader(
- "mac_morpho",
- MacMorphoCorpusReader,
- r"(?!\.).*\.txt",
- tagset="unknown",
- encoding="latin-1",
-)
+ 'mac_morpho', MacMorphoCorpusReader, r'(?!\.).*\.txt',
+ tagset='unknown', encoding='latin-1')
machado = LazyCorpusLoader(
- "machado",
- PortugueseCategorizedPlaintextCorpusReader,
- r"(?!\.).*\.txt",
- cat_pattern=r"([a-z]*)/.*",
- encoding="latin-1",
-)
+ 'machado', PortugueseCategorizedPlaintextCorpusReader,
+ r'(?!\.).*\.txt', cat_pattern=r'([a-z]*)/.*', encoding='latin-1')
masc_tagged = LazyCorpusLoader(
- "masc_tagged",
- CategorizedTaggedCorpusReader,
- r"(spoken|written)/.*\.txt",
- cat_file="categories.txt",
- tagset="wsj",
- encoding="utf-8",
- sep="_",
-)
+ 'masc_tagged', CategorizedTaggedCorpusReader, r'(spoken|written)/.*\.txt',
+ cat_file='categories.txt', tagset='wsj', encoding="utf-8", sep="_")
movie_reviews = LazyCorpusLoader(
- "movie_reviews",
- CategorizedPlaintextCorpusReader,
- r"(?!\.).*\.txt",
- cat_pattern=r"(neg|pos)/.*",
- encoding="ascii",
-)
+ 'movie_reviews', CategorizedPlaintextCorpusReader,
+ r'(?!\.).*\.txt', cat_pattern=r'(neg|pos)/.*',
+ encoding='ascii')
multext_east = LazyCorpusLoader(
- "mte_teip5", MTECorpusReader, r"(oana).*\.xml", encoding="utf-8"
-)
+ 'mte_teip5', MTECorpusReader, r'(oana).*\.xml', encoding="utf-8")
names = LazyCorpusLoader(
- "names", WordListCorpusReader, r"(?!\.).*\.txt", encoding="ascii"
-)
+ 'names', WordListCorpusReader, r'(?!\.).*\.txt', encoding='ascii')
nps_chat = LazyCorpusLoader(
- "nps_chat", NPSChatCorpusReader, r"(?!README|\.).*\.xml", tagset="wsj"
-)
+ 'nps_chat', NPSChatCorpusReader, r'(?!README|\.).*\.xml', tagset='wsj')
opinion_lexicon = LazyCorpusLoader(
- "opinion_lexicon",
- OpinionLexiconCorpusReader,
- r"(\w+)\-words\.txt",
- encoding="ISO-8859-2",
-)
+ 'opinion_lexicon', OpinionLexiconCorpusReader, r'(\w+)\-words\.txt',
+ encoding='ISO-8859-2')
ppattach = LazyCorpusLoader(
- "ppattach", PPAttachmentCorpusReader, ["training", "test", "devset"]
-)
+ 'ppattach', PPAttachmentCorpusReader, ['training', 'test', 'devset'])
product_reviews_1 = LazyCorpusLoader(
- "product_reviews_1", ReviewsCorpusReader, r"^(?!Readme).*\.txt", encoding="utf8"
-)
+ 'product_reviews_1', ReviewsCorpusReader, r'^(?!Readme).*\.txt', encoding='utf8')
product_reviews_2 = LazyCorpusLoader(
- "product_reviews_2", ReviewsCorpusReader, r"^(?!Readme).*\.txt", encoding="utf8"
-)
+ 'product_reviews_2', ReviewsCorpusReader, r'^(?!Readme).*\.txt', encoding='utf8')
pros_cons = LazyCorpusLoader(
- "pros_cons",
- ProsConsCorpusReader,
- r"Integrated(Cons|Pros)\.txt",
- cat_pattern=r"Integrated(Cons|Pros)\.txt",
- encoding="ISO-8859-2",
-)
-ptb = LazyCorpusLoader( # Penn Treebank v3: WSJ and Brown portions
- "ptb",
- CategorizedBracketParseCorpusReader,
- r"(WSJ/\d\d/WSJ_\d\d|BROWN/C[A-Z]/C[A-Z])\d\d.MRG",
- cat_file="allcats.txt",
- tagset="wsj",
-)
+ 'pros_cons', ProsConsCorpusReader, r'Integrated(Cons|Pros)\.txt',
+ cat_pattern=r'Integrated(Cons|Pros)\.txt', encoding='ISO-8859-2')
+ptb = LazyCorpusLoader( # Penn Treebank v3: WSJ and Brown portions
+ 'ptb', CategorizedBracketParseCorpusReader, r'(WSJ/\d\d/WSJ_\d\d|BROWN/C[A-Z]/C[A-Z])\d\d.MRG',
+ cat_file='allcats.txt', tagset='wsj')
qc = LazyCorpusLoader(
- "qc", StringCategoryCorpusReader, ["train.txt", "test.txt"], encoding="ISO-8859-2"
-)
+ 'qc', StringCategoryCorpusReader, ['train.txt', 'test.txt'], encoding='ISO-8859-2')
reuters = LazyCorpusLoader(
- "reuters",
- CategorizedPlaintextCorpusReader,
- "(training|test).*",
- cat_file="cats.txt",
- encoding="ISO-8859-2",
-)
-rte = LazyCorpusLoader("rte", RTECorpusReader, r"(?!\.).*\.xml")
-senseval = LazyCorpusLoader("senseval", SensevalCorpusReader, r"(?!\.).*\.pos")
+ 'reuters', CategorizedPlaintextCorpusReader, '(training|test).*',
+ cat_file='cats.txt', encoding='ISO-8859-2')
+rte = LazyCorpusLoader(
+ 'rte', RTECorpusReader, r'(?!\.).*\.xml')
+senseval = LazyCorpusLoader(
+ 'senseval', SensevalCorpusReader, r'(?!\.).*\.pos')
sentence_polarity = LazyCorpusLoader(
- "sentence_polarity",
- CategorizedSentencesCorpusReader,
- r"rt-polarity\.(neg|pos)",
- cat_pattern=r"rt-polarity\.(neg|pos)",
- encoding="utf-8",
-)
+ 'sentence_polarity', CategorizedSentencesCorpusReader, r'rt-polarity\.(neg|pos)',
+ cat_pattern=r'rt-polarity\.(neg|pos)', encoding='utf-8')
sentiwordnet = LazyCorpusLoader(
- "sentiwordnet", SentiWordNetCorpusReader, "SentiWordNet_3.0.0.txt", encoding="utf-8"
-)
-shakespeare = LazyCorpusLoader("shakespeare", XMLCorpusReader, r"(?!\.).*\.xml")
+ 'sentiwordnet', SentiWordNetCorpusReader, 'SentiWordNet_3.0.0.txt', encoding='utf-8')
+shakespeare = LazyCorpusLoader(
+ 'shakespeare', XMLCorpusReader, r'(?!\.).*\.xml')
sinica_treebank = LazyCorpusLoader(
- "sinica_treebank",
- SinicaTreebankCorpusReader,
- ["parsed"],
- tagset="unknown",
- encoding="utf-8",
-)
+ 'sinica_treebank', SinicaTreebankCorpusReader, ['parsed'],
+ tagset='unknown', encoding='utf-8')
state_union = LazyCorpusLoader(
- "state_union", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="ISO-8859-2"
-)
+ 'state_union', PlaintextCorpusReader, r'(?!\.).*\.txt',
+ encoding='ISO-8859-2')
stopwords = LazyCorpusLoader(
- "stopwords", WordListCorpusReader, r"(?!README|\.).*", encoding="utf8"
-)
+ 'stopwords', WordListCorpusReader, r'(?!README|\.).*', encoding='utf8')
subjectivity = LazyCorpusLoader(
- "subjectivity",
- CategorizedSentencesCorpusReader,
- r"(quote.tok.gt9|plot.tok.gt9)\.5000",
- cat_map={"quote.tok.gt9.5000": ["subj"], "plot.tok.gt9.5000": ["obj"]},
- encoding="latin-1",
-)
+ 'subjectivity', CategorizedSentencesCorpusReader, r'(quote.tok.gt9|plot.tok.gt9)\.5000',
+ cat_map={'quote.tok.gt9.5000':['subj'], 'plot.tok.gt9.5000':['obj']}, encoding='latin-1')
swadesh = LazyCorpusLoader(
- "swadesh", SwadeshCorpusReader, r"(?!README|\.).*", encoding="utf8"
-)
+ 'swadesh', SwadeshCorpusReader, r'(?!README|\.).*', encoding='utf8')
swadesh110 = LazyCorpusLoader(
- 'panlex_swadesh', PanlexSwadeshCorpusReader, r'swadesh110/.*\.txt', encoding='utf8'
-)
+ 'panlex_swadesh', SwadeshCorpusReader, r'swadesh110/.*\.txt', encoding='utf8')
swadesh207 = LazyCorpusLoader(
- 'panlex_swadesh', PanlexSwadeshCorpusReader, r'swadesh207/.*\.txt', encoding='utf8'
-)
-switchboard = LazyCorpusLoader("switchboard", SwitchboardCorpusReader, tagset="wsj")
-timit = LazyCorpusLoader("timit", TimitCorpusReader)
+ 'panlex_swadesh', SwadeshCorpusReader, r'swadesh207/.*\.txt', encoding='utf8')
+switchboard = LazyCorpusLoader(
+ 'switchboard', SwitchboardCorpusReader, tagset='wsj')
+timit = LazyCorpusLoader(
+ 'timit', TimitCorpusReader)
timit_tagged = LazyCorpusLoader(
- "timit", TimitTaggedCorpusReader, ".+\.tags", tagset="wsj", encoding="ascii"
-)
+ 'timit', TimitTaggedCorpusReader, '.+\.tags',
+ tagset='wsj', encoding='ascii')
toolbox = LazyCorpusLoader(
- "toolbox", ToolboxCorpusReader, r"(?!.*(README|\.)).*\.(dic|txt)"
-)
+ 'toolbox', ToolboxCorpusReader, r'(?!.*(README|\.)).*\.(dic|txt)')
treebank = LazyCorpusLoader(
- "treebank/combined",
- BracketParseCorpusReader,
- r"wsj_.*\.mrg",
- tagset="wsj",
- encoding="ascii",
-)
+ 'treebank/combined', BracketParseCorpusReader, r'wsj_.*\.mrg',
+ tagset='wsj', encoding='ascii')
treebank_chunk = LazyCorpusLoader(
- "treebank/tagged",
- ChunkedCorpusReader,
- r"wsj_.*\.pos",
- sent_tokenizer=RegexpTokenizer(r"(?<=/\.)\s*(?![^\[]*\])", gaps=True),
- para_block_reader=tagged_treebank_para_block_reader,
- tagset="wsj",
- encoding="ascii",
-)
+ 'treebank/tagged', ChunkedCorpusReader, r'wsj_.*\.pos',
+ sent_tokenizer=RegexpTokenizer(r'(?<=/\.)\s*(?![^\[]*\])', gaps=True),
+ para_block_reader=tagged_treebank_para_block_reader, tagset='wsj', encoding='ascii')
treebank_raw = LazyCorpusLoader(
- "treebank/raw", PlaintextCorpusReader, r"wsj_.*", encoding="ISO-8859-2"
-)
-twitter_samples = LazyCorpusLoader("twitter_samples", TwitterCorpusReader, ".*\.json")
-udhr = LazyCorpusLoader("udhr", UdhrCorpusReader)
-udhr2 = LazyCorpusLoader("udhr2", PlaintextCorpusReader, r".*\.txt", encoding="utf8")
+ 'treebank/raw', PlaintextCorpusReader, r'wsj_.*', encoding='ISO-8859-2')
+twitter_samples = LazyCorpusLoader(
+ 'twitter_samples', TwitterCorpusReader, '.*\.json')
+udhr = LazyCorpusLoader(
+ 'udhr', UdhrCorpusReader)
+udhr2 = LazyCorpusLoader(
+ 'udhr2', PlaintextCorpusReader, r'.*\.txt', encoding='utf8')
universal_treebanks = LazyCorpusLoader(
- "universal_treebanks_v20",
- ConllCorpusReader,
- r".*\.conll",
- columntypes=(
- "ignore",
- "words",
- "ignore",
- "ignore",
- "pos",
- "ignore",
- "ignore",
- "ignore",
- "ignore",
- "ignore",
- ),
-)
-verbnet = LazyCorpusLoader("verbnet", VerbnetCorpusReader, r"(?!\.).*\.xml")
+ 'universal_treebanks_v20', ConllCorpusReader, r'.*\.conll',
+ columntypes = ('ignore', 'words', 'ignore', 'ignore', 'pos',
+ 'ignore', 'ignore', 'ignore', 'ignore', 'ignore'))
+verbnet = LazyCorpusLoader(
+ 'verbnet', VerbnetCorpusReader, r'(?!\.).*\.xml')
webtext = LazyCorpusLoader(
- "webtext", PlaintextCorpusReader, r"(?!README|\.).*\.txt", encoding="ISO-8859-2"
-)
+ 'webtext', PlaintextCorpusReader, r'(?!README|\.).*\.txt', encoding='ISO-8859-2')
wordnet = LazyCorpusLoader(
- "wordnet",
- WordNetCorpusReader,
- LazyCorpusLoader("omw", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
-)
-wordnet_ic = LazyCorpusLoader("wordnet_ic", WordNetICCorpusReader, ".*\.dat")
+ 'wordnet', WordNetCorpusReader,
+ LazyCorpusLoader('omw', CorpusReader, r'.*/wn-data-.*\.tab', encoding='utf8'))
+wordnet_ic = LazyCorpusLoader(
+ 'wordnet_ic', WordNetICCorpusReader, '.*\.dat')
words = LazyCorpusLoader(
- "words", WordListCorpusReader, r"(?!README|\.).*", encoding="ascii"
-)
+ 'words', WordListCorpusReader, r'(?!README|\.).*', encoding='ascii')
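# Each loader above is lazy: the underlying reader is only constructed on
# first access, assuming the corpus has been fetched with nltk.download(),
# e.g.
#
#     from nltk.corpus import brown
#     brown.words()[:5]  # triggers the actual CategorizedTaggedCorpusReader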
# defined after treebank
propbank = LazyCorpusLoader(
- "propbank",
- PropbankCorpusReader,
- "prop.txt",
- "frames/.*\.xml",
- "verbs.txt",
- lambda filename: re.sub(r"^wsj/\d\d/", "", filename),
- treebank,
-) # Must be defined *after* treebank corpus.
+ 'propbank', PropbankCorpusReader,
+ 'prop.txt', 'frames/.*\.xml', 'verbs.txt',
+ lambda filename: re.sub(r'^wsj/\d\d/', '', filename),
+ treebank) # Must be defined *after* treebank corpus.
nombank = LazyCorpusLoader(
- "nombank.1.0",
- NombankCorpusReader,
- "nombank.1.0",
- "frames/.*\.xml",
- "nombank.1.0.words",
- lambda filename: re.sub(r"^wsj/\d\d/", "", filename),
- treebank,
-) # Must be defined *after* treebank corpus.
+ 'nombank.1.0', NombankCorpusReader,
+ 'nombank.1.0', 'frames/.*\.xml', 'nombank.1.0.words',
+ lambda filename: re.sub(r'^wsj/\d\d/', '', filename),
+ treebank) # Must be defined *after* treebank corpus.
propbank_ptb = LazyCorpusLoader(
- "propbank",
- PropbankCorpusReader,
- "prop.txt",
- "frames/.*\.xml",
- "verbs.txt",
+ 'propbank', PropbankCorpusReader,
+ 'prop.txt', 'frames/.*\.xml', 'verbs.txt',
lambda filename: filename.upper(),
- ptb,
-) # Must be defined *after* ptb corpus.
+ ptb) # Must be defined *after* ptb corpus.
nombank_ptb = LazyCorpusLoader(
- "nombank.1.0",
- NombankCorpusReader,
- "nombank.1.0",
- "frames/.*\.xml",
- "nombank.1.0.words",
+ 'nombank.1.0', NombankCorpusReader,
+ 'nombank.1.0', 'frames/.*\.xml', 'nombank.1.0.words',
lambda filename: filename.upper(),
- ptb,
-) # Must be defined *after* ptb corpus.
+ ptb) # Must be defined *after* ptb corpus.
semcor = LazyCorpusLoader(
- "semcor", SemcorCorpusReader, r"brown./tagfiles/br-.*\.xml", wordnet
-) # Must be defined *after* wordnet corpus.
+ 'semcor', SemcorCorpusReader, r'brown./tagfiles/br-.*\.xml',
+ wordnet) # Must be defined *after* wordnet corpus.
nonbreaking_prefixes = LazyCorpusLoader(
- "nonbreaking_prefixes",
- NonbreakingPrefixesCorpusReader,
- r"(?!README|\.).*",
- encoding="utf8",
-)
+ 'nonbreaking_prefixes', NonbreakingPrefixesCorpusReader, r'(?!README|\.).*', encoding='utf8')
perluniprops = LazyCorpusLoader(
- "perluniprops",
- UnicharsCorpusReader,
- r"(?!README|\.).*",
- nltk_data_subdir="misc",
- encoding="utf8",
-)
+ 'perluniprops', UnicharsCorpusReader, r'(?!README|\.).*', nltk_data_subdir='misc', encoding='utf8')
# mwa_ppdb = LazyCorpusLoader(
# 'mwa_ppdb', MWAPPDBCorpusReader, r'(?!README|\.).*', nltk_data_subdir='misc', encoding='utf8')
# nkjp = LazyCorpusLoader(
# 'nkjp', NKJPCorpusReader, r'', encoding='utf8')
#
-# panlex_lite = LazyCorpusLoader(
+#panlex_lite = LazyCorpusLoader(
# 'panlex_lite', PanLexLiteCorpusReader)
#
# ycoe = LazyCorpusLoader(
# 'ycoe', YCOECorpusReader)
#
# corpus not available with NLTK; these lines caused help(nltk.corpus) to break
-# hebrew_treebank = LazyCorpusLoader(
+#hebrew_treebank = LazyCorpusLoader(
# 'hebrew_treebank', BracketParseCorpusReader, r'.*\.txt')
-# FIXME: override any imported demo from various corpora, see https://github.com/nltk/nltk/issues/2116
+
def demo():
# This is out-of-date:
abc.demo()
brown.demo()
- # chat80.demo()
+# chat80.demo()
cmudict.demo()
conll2000.demo()
conll2002.demo()
udhr.demo()
webtext.demo()
words.demo()
-
-
# ycoe.demo()
-if __name__ == "__main__":
- # demo()
+if __name__ == '__main__':
+ #demo()
pass
# ** this is for nose **
# unload all corpus after tests
def teardown_module(module=None):
import nltk.corpus
-
for name in dir(nltk.corpus):
obj = getattr(nltk.corpus, name, None)
- if isinstance(obj, CorpusReader) and hasattr(obj, "_unload"):
+ if isinstance(obj, CorpusReader) and hasattr(obj, '_unload'):
obj._unload()
# Natural Language Toolkit: Europarl Corpus Readers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Nitin Madnani <nmadnani@umiacs.umd.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
# Create a new corpus reader instance for each European language
danish = LazyCorpusLoader(
- "europarl_raw/danish", EuroparlCorpusReader, r"ep-.*\.da", encoding="utf-8"
-)
+ 'europarl_raw/danish', EuroparlCorpusReader, r'ep-.*\.da', encoding='utf-8')
dutch = LazyCorpusLoader(
- "europarl_raw/dutch", EuroparlCorpusReader, r"ep-.*\.nl", encoding="utf-8"
-)
+ 'europarl_raw/dutch', EuroparlCorpusReader, r'ep-.*\.nl', encoding='utf-8')
english = LazyCorpusLoader(
- "europarl_raw/english", EuroparlCorpusReader, r"ep-.*\.en", encoding="utf-8"
-)
+ 'europarl_raw/english', EuroparlCorpusReader, r'ep-.*\.en', encoding='utf-8')
finnish = LazyCorpusLoader(
- "europarl_raw/finnish", EuroparlCorpusReader, r"ep-.*\.fi", encoding="utf-8"
-)
+ 'europarl_raw/finnish', EuroparlCorpusReader, r'ep-.*\.fi', encoding='utf-8')
french = LazyCorpusLoader(
- "europarl_raw/french", EuroparlCorpusReader, r"ep-.*\.fr", encoding="utf-8"
-)
+ 'europarl_raw/french', EuroparlCorpusReader, r'ep-.*\.fr', encoding='utf-8')
german = LazyCorpusLoader(
- "europarl_raw/german", EuroparlCorpusReader, r"ep-.*\.de", encoding="utf-8"
-)
+ 'europarl_raw/german', EuroparlCorpusReader, r'ep-.*\.de', encoding='utf-8')
greek = LazyCorpusLoader(
- "europarl_raw/greek", EuroparlCorpusReader, r"ep-.*\.el", encoding="utf-8"
-)
+ 'europarl_raw/greek', EuroparlCorpusReader, r'ep-.*\.el', encoding='utf-8')
italian = LazyCorpusLoader(
- "europarl_raw/italian", EuroparlCorpusReader, r"ep-.*\.it", encoding="utf-8"
-)
+ 'europarl_raw/italian', EuroparlCorpusReader, r'ep-.*\.it', encoding='utf-8')
portuguese = LazyCorpusLoader(
- "europarl_raw/portuguese", EuroparlCorpusReader, r"ep-.*\.pt", encoding="utf-8"
-)
+ 'europarl_raw/portuguese', EuroparlCorpusReader, r'ep-.*\.pt', encoding='utf-8')
spanish = LazyCorpusLoader(
- "europarl_raw/spanish", EuroparlCorpusReader, r"ep-.*\.es", encoding="utf-8"
-)
+ 'europarl_raw/spanish', EuroparlCorpusReader, r'ep-.*\.es', encoding='utf-8')
swedish = LazyCorpusLoader(
- "europarl_raw/swedish", EuroparlCorpusReader, r"ep-.*\.sv", encoding="utf-8"
-)
+ 'europarl_raw/swedish', EuroparlCorpusReader, r'ep-.*\.sv', encoding='utf-8')
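A quick usage sketch for these loaders (assumes the europarl_raw data package has been installed, e.g. via nltk.download('europarl_raw')):

    from nltk.corpus.europarl_raw import english

    print(english.words()[:8])   # plain word tokens
    print(english.sents()[0])    # first sentence as a list of tokens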
# Natural Language Toolkit: Corpus Readers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
from nltk.corpus.reader.categorized_sents import *
from nltk.corpus.reader.comparative_sents import *
from nltk.corpus.reader.panlex_lite import *
-from nltk.corpus.reader.panlex_swadesh import *
# Make sure that nltk.corpus.reader.bracket_parse gives the module, not
# the function bracket_parse() defined in nltk.tree:
from nltk.corpus.reader import bracket_parse
__all__ = [
- 'CorpusReader',
- 'CategorizedCorpusReader',
- 'PlaintextCorpusReader',
- 'find_corpus_fileids',
- 'TaggedCorpusReader',
- 'CMUDictCorpusReader',
- 'ConllChunkCorpusReader',
- 'WordListCorpusReader',
- 'PPAttachmentCorpusReader',
- 'SensevalCorpusReader',
- 'IEERCorpusReader',
- 'ChunkedCorpusReader',
- 'SinicaTreebankCorpusReader',
- 'BracketParseCorpusReader',
- 'IndianCorpusReader',
- 'ToolboxCorpusReader',
- 'TimitCorpusReader',
- 'YCOECorpusReader',
- 'MacMorphoCorpusReader',
- 'SyntaxCorpusReader',
- 'AlpinoCorpusReader',
- 'RTECorpusReader',
- 'StringCategoryCorpusReader',
- 'EuroparlCorpusReader',
+ 'CorpusReader', 'CategorizedCorpusReader',
+ 'PlaintextCorpusReader', 'find_corpus_fileids',
+ 'TaggedCorpusReader', 'CMUDictCorpusReader',
+ 'ConllChunkCorpusReader', 'WordListCorpusReader',
+ 'PPAttachmentCorpusReader', 'SensevalCorpusReader',
+ 'IEERCorpusReader', 'ChunkedCorpusReader',
+ 'SinicaTreebankCorpusReader', 'BracketParseCorpusReader',
+ 'IndianCorpusReader', 'ToolboxCorpusReader',
+ 'TimitCorpusReader', 'YCOECorpusReader',
+ 'MacMorphoCorpusReader', 'SyntaxCorpusReader',
+ 'AlpinoCorpusReader', 'RTECorpusReader',
+ 'StringCategoryCorpusReader','EuroparlCorpusReader',
'CategorizedBracketParseCorpusReader',
'CategorizedTaggedCorpusReader',
'CategorizedPlaintextCorpusReader',
'PortugueseCategorizedPlaintextCorpusReader',
'tagged_treebank_para_block_reader',
- 'PropbankCorpusReader',
- 'VerbnetCorpusReader',
- 'BNCCorpusReader',
- 'ConllCorpusReader',
- 'XMLCorpusReader',
- 'NPSChatCorpusReader',
- 'SwadeshCorpusReader',
- 'WordNetCorpusReader',
- 'WordNetICCorpusReader',
- 'SwitchboardCorpusReader',
- 'DependencyCorpusReader',
- 'NombankCorpusReader',
- 'IPIPANCorpusReader',
- 'Pl196xCorpusReader',
- 'TEICorpusView',
- 'KNBCorpusReader',
- 'ChasenCorpusReader',
- 'CHILDESCorpusReader',
- 'AlignedCorpusReader',
- 'TimitTaggedCorpusReader',
- 'LinThesaurusCorpusReader',
- 'SemcorCorpusReader',
- 'FramenetCorpusReader',
- 'UdhrCorpusReader',
- 'BNCCorpusReader',
- 'SentiWordNetCorpusReader',
- 'SentiSynset',
- 'TwitterCorpusReader',
- 'NKJPCorpusReader',
- 'CrubadanCorpusReader',
- 'MTECorpusReader',
- 'ReviewsCorpusReader',
- 'OpinionLexiconCorpusReader',
- 'ProsConsCorpusReader',
- 'CategorizedSentencesCorpusReader',
- 'ComparativeSentencesCorpusReader',
- 'PanLexLiteCorpusReader',
- 'NonbreakingPrefixesCorpusReader',
- 'UnicharsCorpusReader',
+ 'PropbankCorpusReader', 'VerbnetCorpusReader',
+ 'BNCCorpusReader', 'ConllCorpusReader',
+ 'XMLCorpusReader', 'NPSChatCorpusReader',
+ 'SwadeshCorpusReader', 'WordNetCorpusReader',
+ 'WordNetICCorpusReader', 'SwitchboardCorpusReader',
+ 'DependencyCorpusReader', 'NombankCorpusReader',
+ 'IPIPANCorpusReader', 'Pl196xCorpusReader',
+ 'TEICorpusView', 'KNBCorpusReader', 'ChasenCorpusReader',
+ 'CHILDESCorpusReader', 'AlignedCorpusReader',
+ 'TimitTaggedCorpusReader', 'LinThesaurusCorpusReader',
+ 'SemcorCorpusReader', 'FramenetCorpusReader', 'UdhrCorpusReader',
+ 'BNCCorpusReader', 'SentiWordNetCorpusReader', 'SentiSynset',
+ 'TwitterCorpusReader', 'NKJPCorpusReader', 'CrubadanCorpusReader',
+ 'MTECorpusReader', 'ReviewsCorpusReader', 'OpinionLexiconCorpusReader',
+ 'ProsConsCorpusReader', 'CategorizedSentencesCorpusReader',
+ 'ComparativeSentencesCorpusReader', 'PanLexLiteCorpusReader',
+ 'NonbreakingPrefixesCorpusReader', 'UnicharsCorpusReader',
'MWAPPDBCorpusReader',
- 'PanlexSwadeshCorpusReader',
]
# Natural Language Toolkit: Aligned Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# URL: <http://nltk.org/>
# Author: Steven Bird <stevenbird1@gmail.com>
# For license information, see LICENSE.TXT
+from six import string_types
+
from nltk.tokenize import WhitespaceTokenizer, RegexpTokenizer
from nltk.translate import AlignedSent, Alignment
from nltk.corpus.reader.api import CorpusReader
-from nltk.corpus.reader.util import (
- StreamBackedCorpusView,
- concat,
- read_alignedsent_block,
-)
-
+from nltk.corpus.reader.util import StreamBackedCorpusView, concat,\
+ read_alignedsent_block
class AlignedCorpusReader(CorpusReader):
"""
Reader for corpora of word-aligned sentences. Tokens are assumed
to be separated by whitespace. Sentences begin on separate lines.
"""
-
- def __init__(
- self,
- root,
- fileids,
- sep="/",
- word_tokenizer=WhitespaceTokenizer(),
- sent_tokenizer=RegexpTokenizer("\n", gaps=True),
- alignedsent_block_reader=read_alignedsent_block,
- encoding="latin1",
- ):
+ def __init__(self, root, fileids,
+ sep='/', word_tokenizer=WhitespaceTokenizer(),
+ sent_tokenizer=RegexpTokenizer('\n', gaps=True),
+ alignedsent_block_reader=read_alignedsent_block,
+ encoding='latin1'):
"""
Construct a new Aligned Corpus reader for a set of documents
located at the given root directory. Example usage:
:return: the given file(s) as a single string.
:rtype: str
"""
- if fileids is None:
- fileids = self._fileids
- elif isinstance(fileids, str):
- fileids = [fileids]
+ if fileids is None: fileids = self._fileids
+ elif isinstance(fileids, string_types): fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
def words(self, fileids=None):
and punctuation symbols.
:rtype: list(str)
"""
- return concat(
- [
- AlignedSentCorpusView(
- fileid,
- enc,
- False,
- False,
- self._word_tokenizer,
- self._sent_tokenizer,
- self._alignedsent_block_reader,
- )
- for (fileid, enc) in self.abspaths(fileids, True)
- ]
- )
+ return concat([AlignedSentCorpusView(fileid, enc, False, False,
+ self._word_tokenizer,
+ self._sent_tokenizer,
+ self._alignedsent_block_reader)
+ for (fileid, enc) in self.abspaths(fileids, True)])
def sents(self, fileids=None):
"""
strings.
:rtype: list(list(str))
"""
- return concat(
- [
- AlignedSentCorpusView(
- fileid,
- enc,
- False,
- True,
- self._word_tokenizer,
- self._sent_tokenizer,
- self._alignedsent_block_reader,
- )
- for (fileid, enc) in self.abspaths(fileids, True)
- ]
- )
+ return concat([AlignedSentCorpusView(fileid, enc, False, True,
+ self._word_tokenizer,
+ self._sent_tokenizer,
+ self._alignedsent_block_reader)
+ for (fileid, enc) in self.abspaths(fileids, True)])
def aligned_sents(self, fileids=None):
"""
:return: the given file(s) as a list of AlignedSent objects.
:rtype: list(AlignedSent)
"""
- return concat(
- [
- AlignedSentCorpusView(
- fileid,
- enc,
- True,
- True,
- self._word_tokenizer,
- self._sent_tokenizer,
- self._alignedsent_block_reader,
- )
- for (fileid, enc) in self.abspaths(fileids, True)
- ]
- )
-
+ return concat([AlignedSentCorpusView(fileid, enc, True, True,
+ self._word_tokenizer,
+ self._sent_tokenizer,
+ self._alignedsent_block_reader)
+ for (fileid, enc) in self.abspaths(fileids, True)])
class AlignedSentCorpusView(StreamBackedCorpusView):
"""
``AlignedSentCorpusView`` objects are typically created by
``AlignedCorpusReader`` (not directly by nltk users).
"""
-
- def __init__(
- self,
- corpus_file,
- encoding,
- aligned,
- group_by_sent,
- word_tokenizer,
- sent_tokenizer,
- alignedsent_block_reader,
- ):
+ def __init__(self, corpus_file, encoding, aligned, group_by_sent,
+ word_tokenizer, sent_tokenizer, alignedsent_block_reader):
self._aligned = aligned
self._group_by_sent = group_by_sent
self._word_tokenizer = word_tokenizer
StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)
def read_block(self, stream):
- block = [
- self._word_tokenizer.tokenize(sent_str)
- for alignedsent_str in self._alignedsent_block_reader(stream)
- for sent_str in self._sent_tokenizer.tokenize(alignedsent_str)
- ]
+ block = [self._word_tokenizer.tokenize(sent_str)
+ for alignedsent_str in self._alignedsent_block_reader(stream)
+ for sent_str in self._sent_tokenizer.tokenize(alignedsent_str)]
if self._aligned:
- block[2] = Alignment.fromstring(
- " ".join(block[2])
- ) # kludge; we shouldn't have tokenized the alignment string
+ block[2] = Alignment.fromstring(" ".join(block[2])) # kludge; we shouldn't have tokenized the alignment string
block = [AlignedSent(*block)]
elif self._group_by_sent:
block = [block[0]]
# Natural Language Toolkit: API for Corpus Readers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
"""
API for corpus readers.
"""
+from __future__ import unicode_literals
import os
import re
from collections import defaultdict
from itertools import chain
+from six import string_types
+
+from nltk import compat
from nltk.data import PathPointer, FileSystemPathPointer, ZipFilePathPointer
from nltk.corpus.reader.util import *
-
+@compat.python_2_unicode_compatible
class CorpusReader(object):
"""
A base class for "corpus reader" classes, each of which can be
be used to select which portion of the corpus should be returned.
"""
- def __init__(self, root, fileids, encoding="utf8", tagset=None):
+ def __init__(self, root, fileids, encoding='utf8', tagset=None):
"""
:type root: PathPointer or str
:param root: A path pointer identifying the root directory for
tagged_...() methods.
"""
# Convert the root to a path pointer, if necessary.
- if isinstance(root, str) and not isinstance(root, PathPointer):
- m = re.match("(.*\.zip)/?(.*)$|", root)
+ if isinstance(root, string_types) and not isinstance(root, PathPointer):
+ m = re.match(r'(.*\.zip)/?(.*)$|', root)
zipfile, zipentry = m.groups()
if zipfile:
root = ZipFilePathPointer(zipfile, zipentry)
else:
root = FileSystemPathPointer(root)
elif not isinstance(root, PathPointer):
- raise TypeError("CorpusReader: expected a string or a PathPointer")
+ raise TypeError('CorpusReader: expected a string or a PathPointer')
# If `fileids` is a regexp, then expand it.
- if isinstance(fileids, str):
+ if isinstance(fileids, string_types):
fileids = find_corpus_fileids(root, fileids)
self._fileids = fileids
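The zip-path regexp above splits a root such as '.../abc.zip/abc/' into the archive and the entry inside it; for an ordinary directory the empty alternative matches and both groups are None. A quick check:

    import re

    pattern = r'(.*\.zip)/?(.*)$|'
    print(re.match(pattern, '/data/corpora/abc.zip/abc/').groups())
    # ('/data/corpora/abc.zip', 'abc/')  -> ZipFilePathPointer
    print(re.match(pattern, '/data/corpora/abc').groups())
    # (None, None)                       -> FileSystemPathPointer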
def __repr__(self):
if isinstance(self._root, ZipFilePathPointer):
- path = "%s/%s" % (self._root.zipfile.filename, self._root.entry)
+ path = '%s/%s' % (self._root.zipfile.filename, self._root.entry)
else:
- path = "%s" % self._root.path
- return "<%s in %r>" % (self.__class__.__name__, path)
+ path = '%s' % self._root.path
+ return '<%s in %r>' % (self.__class__.__name__, path)
def ensure_loaded(self):
"""
make sure a corpus is loaded -- e.g., in case a user wants to
do help(some_corpus).
"""
- pass # no need to actually do anything.
+ pass # no need to actually do anything.
def readme(self):
"""
"""
return self._root.join(fileid)
- def abspaths(self, fileids=None, include_encoding=False, include_fileid=False):
+ def abspaths(self, fileids=None, include_encoding=False,
+ include_fileid=False):
"""
Return a list of the absolute paths for all fileids in this corpus;
or for the given list of fileids, if specified.
"""
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
paths = [self._root.join(f) for f in fileids]
else:
return self._encoding
- def _get_root(self):
- return self._root
-
- root = property(
- _get_root,
- doc="""
+ def _get_root(self): return self._root
+ root = property(_get_root, doc="""
The directory where this corpus is stored.
- :type: PathPointer""",
- )
+ :type: PathPointer""")
######################################################################
-# { Corpora containing categorized items
+#{ Corpora containing categorized items
######################################################################
-
class CategorizedCorpusReader(object):
"""
A mixin class used to aid in the implementation of corpus readers
more than one argument is specified, an exception will be
raised.
"""
- self._f2c = None #: file-to-category mapping
- self._c2f = None #: category-to-file mapping
-
- self._pattern = None #: regexp specifying the mapping
- self._map = None #: dict specifying the mapping
- self._file = None #: fileid of file containing the mapping
- self._delimiter = None #: delimiter for ``self._file``
-
- if "cat_pattern" in kwargs:
- self._pattern = kwargs["cat_pattern"]
- del kwargs["cat_pattern"]
- elif "cat_map" in kwargs:
- self._map = kwargs["cat_map"]
- del kwargs["cat_map"]
- elif "cat_file" in kwargs:
- self._file = kwargs["cat_file"]
- del kwargs["cat_file"]
- if "cat_delimiter" in kwargs:
- self._delimiter = kwargs["cat_delimiter"]
- del kwargs["cat_delimiter"]
+ self._f2c = None #: file-to-category mapping
+ self._c2f = None #: category-to-file mapping
+
+ self._pattern = None #: regexp specifying the mapping
+ self._map = None #: dict specifying the mapping
+ self._file = None #: fileid of file containing the mapping
+ self._delimiter = None #: delimiter for ``self._file``
+
+ if 'cat_pattern' in kwargs:
+ self._pattern = kwargs['cat_pattern']
+ del kwargs['cat_pattern']
+ elif 'cat_map' in kwargs:
+ self._map = kwargs['cat_map']
+ del kwargs['cat_map']
+ elif 'cat_file' in kwargs:
+ self._file = kwargs['cat_file']
+ del kwargs['cat_file']
+ if 'cat_delimiter' in kwargs:
+ self._delimiter = kwargs['cat_delimiter']
+ del kwargs['cat_delimiter']
else:
- raise ValueError(
- "Expected keyword argument cat_pattern or " "cat_map or cat_file."
- )
+ raise ValueError('Expected keyword argument cat_pattern or '
+ 'cat_map or cat_file.')
- if "cat_pattern" in kwargs or "cat_map" in kwargs or "cat_file" in kwargs:
- raise ValueError(
- "Specify exactly one of: cat_pattern, " "cat_map, cat_file."
- )
+
+ if ('cat_pattern' in kwargs or 'cat_map' in kwargs or
+ 'cat_file' in kwargs):
+ raise ValueError('Specify exactly one of: cat_pattern, '
+ 'cat_map, cat_file.')
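The movie_reviews corpus is a stock example of the cat_pattern mapping, with the top-level directory name serving as the category (assumes nltk.download('movie_reviews')):

    from nltk.corpus import movie_reviews

    print(movie_reviews.categories())           # ['neg', 'pos']
    first_pos = movie_reviews.fileids('pos')[0]
    print(movie_reviews.categories(first_pos))  # ['pos']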
def _init(self):
self._f2c = defaultdict(set)
line = line.strip()
file_id, categories = line.split(self._delimiter, 1)
if file_id not in self.fileids():
- raise ValueError(
- "In category mapping file %s: %s "
- "not found" % (self._file, file_id)
- )
+ raise ValueError('In category mapping file %s: %s '
+ 'not found' % (self._file, file_id))
for category in categories.split(self._delimiter):
self._add(file_id, category)
self._init()
if fileids is None:
return sorted(self._c2f)
- if isinstance(fileids, str):
+ if isinstance(fileids, string_types):
fileids = [fileids]
return sorted(set.union(*[self._f2c[d] for d in fileids]))
"""
if categories is None:
return super(CategorizedCorpusReader, self).fileids()
- elif isinstance(categories, str):
+ elif isinstance(categories, string_types):
if self._f2c is None:
self._init()
if categories in self._c2f:
return sorted(self._c2f[categories])
else:
- raise ValueError("Category %s not found" % categories)
+ raise ValueError('Category %s not found' % categories)
else:
if self._f2c is None:
self._init()
return sorted(set.union(*[self._c2f[c] for c in categories]))
-
######################################################################
-# { Treebank readers
+#{ Treebank readers
######################################################################
-# [xx] is it worth it to factor this out?
+#[xx] is it worth it to factor this out?
class SyntaxCorpusReader(CorpusReader):
"""
An abstract base class for reading corpora consisting of
- ``_parse``, which takes a block and returns a list of parsed
sentences.
"""
-
def _parse(self, s):
raise NotImplementedError()
-
def _word(self, s):
raise NotImplementedError()
-
def _tag(self, s):
raise NotImplementedError()
-
def _read_block(self, stream):
raise NotImplementedError()
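A hypothetical minimal subclass (not part of NLTK) showing what the four hooks must provide, for a toy format with one 'word/TAG' sentence per line:

    from nltk.corpus.reader.util import read_line_block

    class TinyTaggedReader(SyntaxCorpusReader):
        """One sentence per line; tokens are word/TAG pairs (illustrative)."""
        def _read_block(self, stream):
            return read_line_block(stream)
        def _word(self, s):
            return [tok.split('/')[0] for tok in s.split()]
        def _tag(self, s, tagset=None):
            return [tuple(tok.split('/', 1)) for tok in s.split()]
        def _parse(self, s):
            return None   # this toy format carries no tree structure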
def raw(self, fileids=None):
- if fileids is None:
- fileids = self._fileids
- elif isinstance(fileids, str):
- fileids = [fileids]
+ if fileids is None: fileids = self._fileids
+ elif isinstance(fileids, string_types): fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
def parsed_sents(self, fileids=None):
reader = self._read_parsed_sent_block
- return concat(
- [
- StreamBackedCorpusView(fileid, reader, encoding=enc)
- for fileid, enc in self.abspaths(fileids, True)
- ]
- )
+ return concat([StreamBackedCorpusView(fileid, reader, encoding=enc)
+ for fileid, enc in self.abspaths(fileids, True)])
def tagged_sents(self, fileids=None, tagset=None):
def reader(stream):
return self._read_tagged_sent_block(stream, tagset)
-
- return concat(
- [
- StreamBackedCorpusView(fileid, reader, encoding=enc)
- for fileid, enc in self.abspaths(fileids, True)
- ]
- )
+ return concat([StreamBackedCorpusView(fileid, reader, encoding=enc)
+ for fileid, enc in self.abspaths(fileids, True)])
def sents(self, fileids=None):
reader = self._read_sent_block
- return concat(
- [
- StreamBackedCorpusView(fileid, reader, encoding=enc)
- for fileid, enc in self.abspaths(fileids, True)
- ]
- )
+ return concat([StreamBackedCorpusView(fileid, reader, encoding=enc)
+ for fileid, enc in self.abspaths(fileids, True)])
def tagged_words(self, fileids=None, tagset=None):
def reader(stream):
return self._read_tagged_word_block(stream, tagset)
-
- return concat(
- [
- StreamBackedCorpusView(fileid, reader, encoding=enc)
- for fileid, enc in self.abspaths(fileids, True)
- ]
- )
+ return concat([StreamBackedCorpusView(fileid, reader, encoding=enc)
+ for fileid, enc in self.abspaths(fileids, True)])
def words(self, fileids=None):
- return concat(
- [
- StreamBackedCorpusView(fileid, self._read_word_block, encoding=enc)
- for fileid, enc in self.abspaths(fileids, True)
- ]
- )
+ return concat([StreamBackedCorpusView(fileid,
+ self._read_word_block,
+ encoding=enc)
+ for fileid, enc in self.abspaths(fileids, True)])
- # ------------------------------------------------------------
- # { Block Readers
+ #------------------------------------------------------------
+ #{ Block Readers
def _read_word_block(self, stream):
return list(chain(*self._read_sent_block(stream)))
return list(filter(None, [self._word(t) for t in self._read_block(stream)]))
def _read_tagged_sent_block(self, stream, tagset=None):
- return list(
- filter(None, [self._tag(t, tagset) for t in self._read_block(stream)])
- )
+ return list(filter(None, [self._tag(t, tagset)
+ for t in self._read_block(stream)]))
def _read_parsed_sent_block(self, stream):
return list(filter(None, [self._parse(t) for t in self._read_block(stream)]))
- # } End of Block Readers
- # ------------------------------------------------------------
+ #} End of Block Readers
+ #------------------------------------------------------------
# Natural Language Toolkit: Plaintext Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
word tokens. Otherwise, leave the spaces on the tokens.
:param stem: If true, then use word stems instead of word strings.
"""
- tag = "c5" if c5 else "pos"
+ tag = 'c5' if c5 else 'pos'
return self._views(fileids, False, tag, strip_space, stem)
def sents(self, fileids=None, strip_space=True, stem=False):
word tokens. Otherwise, leave the spaces on the tokens.
:param stem: If true, then use word stems instead of word strings.
"""
- tag = "c5" if c5 else "pos"
- return self._views(
- fileids, sent=True, tag=tag, strip_space=strip_space, stem=stem
- )
+ tag = 'c5' if c5 else 'pos'
+ return self._views(fileids, sent=True, tag=tag, strip_space=strip_space, stem=stem)
def _views(self, fileids=None, sent=False, tag=False, strip_space=True, stem=False):
"""A helper function that instantiates BNCWordViews or the list of words/sentences."""
f = BNCWordView if self._lazy else self._words
- return concat(
- [
- f(fileid, sent, tag, strip_space, stem)
- for fileid in self.abspaths(fileids)
- ]
- )
+ return concat([f(fileid, sent, tag, strip_space, stem) for fileid in self.abspaths(fileids)])
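How the c5/pos switch plays out on a single BNC-style <w> element (the attribute values here are hypothetical):

    from xml.etree import ElementTree

    w = ElementTree.fromstring('<w c5="NN1" hw="dog" pos="SUBST">dogs </w>')
    word = w.text.strip()                     # strip_space=True
    print(w.get('hw', word))                  # 'dog'   (stem=True)
    print((word, w.get('c5')))                # ('dogs', 'NN1')   with tag='c5'
    print((word, w.get('pos', w.get('c5'))))  # ('dogs', 'SUBST') with tag='pos'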
def _words(self, fileid, bracket_sent, tag, strip_space, stem):
"""
result = []
xmldoc = ElementTree.parse(fileid).getroot()
- for xmlsent in xmldoc.findall(".//s"):
+ for xmlsent in xmldoc.findall('.//s'):
sent = []
for xmlword in _all_xmlwords_in(xmlsent):
word = xmlword.text
if strip_space or stem:
word = word.strip()
if stem:
- word = xmlword.get("hw", word)
- if tag == "c5":
- word = (word, xmlword.get("c5"))
- elif tag == "pos":
- word = (word, xmlword.get("pos", xmlword.get("c5")))
+ word = xmlword.get('hw', word)
+ if tag == 'c5':
+ word = (word, xmlword.get('c5'))
+ elif tag == 'pos':
+ word = (word, xmlword.get('pos', xmlword.get('c5')))
sent.append(word)
if bracket_sent:
- result.append(BNCSentence(xmlsent.attrib["n"], sent))
+ result.append(BNCSentence(xmlsent.attrib['n'], sent))
else:
result.extend(sent)
if result is None:
result = []
for child in elt:
- if child.tag in ("c", "w"):
+ if child.tag in ('c', 'w'):
result.append(child)
else:
_all_xmlwords_in(child, result)
A list of words, augmented by an attribute ``num`` used to record
the sentence identifier (the ``n`` attribute from the XML).
"""
-
def __init__(self, num, items):
self.num = num
list.__init__(self, items)
"""
tags_to_ignore = set(
- ["pb", "gap", "vocal", "event", "unclear", "shift", "pause", "align"]
+ ['pb', 'gap', 'vocal', 'event', 'unclear', 'shift', 'pause', 'align']
)
"""These tags are ignored. For their description refer to the
technical documentation, for example,
:param stem: If true, then substitute stems for words.
"""
if sent:
- tagspec = ".*/s"
+ tagspec = '.*/s'
else:
- tagspec = ".*/s/(.*/)?(c|w)"
+ tagspec = '.*/s/(.*/)?(c|w)'
self._sent = sent
self._tag = tag
self._strip_space = strip_space
# Read in a tasty header.
self._open()
- self.read_block(self._stream, ".*/teiHeader$", self.handle_header)
+ self.read_block(self._stream, '.*/teiHeader$', self.handle_header)
self.close()
# Reset tag context.
def handle_header(self, elt, context):
# Set up some metadata!
- titles = elt.findall("titleStmt/title")
+ titles = elt.findall('titleStmt/title')
if titles:
- self.title = "\n".join(title.text.strip() for title in titles)
+ self.title = '\n'.join(title.text.strip() for title in titles)
- authors = elt.findall("titleStmt/author")
+ authors = elt.findall('titleStmt/author')
if authors:
- self.author = "\n".join(author.text.strip() for author in authors)
+ self.author = '\n'.join(author.text.strip() for author in authors)
- editors = elt.findall("titleStmt/editor")
+ editors = elt.findall('titleStmt/editor')
if editors:
- self.editor = "\n".join(editor.text.strip() for editor in editors)
+ self.editor = '\n'.join(editor.text.strip() for editor in editors)
- resps = elt.findall("titleStmt/respStmt")
+ resps = elt.findall('titleStmt/respStmt')
if resps:
- self.resps = "\n\n".join(
- "\n".join(resp_elt.text.strip() for resp_elt in resp) for resp in resps
+ self.resps = '\n\n'.join(
+ '\n'.join(
+ resp_elt.text.strip() for resp_elt in resp
+ ) for resp in resps
)
def handle_elt(self, elt, context):
if self._strip_space or self._stem:
word = word.strip()
if self._stem:
- word = elt.get("hw", word)
- if self._tag == "c5":
- word = (word, elt.get("c5"))
- elif self._tag == "pos":
- word = (word, elt.get("pos", elt.get("c5")))
+ word = elt.get('hw', word)
+ if self._tag == 'c5':
+ word = (word, elt.get('c5'))
+ elif self._tag == 'pos':
+ word = (word, elt.get('pos', elt.get('c5')))
return word
def handle_sent(self, elt):
sent = []
for child in elt:
- if child.tag in ("mw", "hi", "corr", "trunc"):
+ if child.tag in ('mw', 'hi', 'corr', 'trunc'):
sent += [self.handle_word(w) for w in child]
- elif child.tag in ("w", "c"):
+ elif child.tag in ('w', 'c'):
sent.append(self.handle_word(child))
elif child.tag not in self.tags_to_ignore:
- raise ValueError("Unexpected element %s" % child.tag)
- return BNCSentence(elt.attrib["n"], sent)
+ raise ValueError('Unexpected element %s' % child.tag)
+ return BNCSentence(elt.attrib['n'], sent)
# Natural Language Toolkit: Penn Treebank Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
from nltk.corpus.reader.api import *
# we use [^\s()]+ instead of \S+? to avoid matching ()
-SORTTAGWRD = re.compile(r"\((\d+) ([^\s()]+) ([^\s()]+)\)")
-TAGWORD = re.compile(r"\(([^\s()]+) ([^\s()]+)\)")
-WORD = re.compile(r"\([^\s()]+ ([^\s()]+)\)")
-EMPTY_BRACKETS = re.compile(r"\s*\(\s*\(")
-
+SORTTAGWRD = re.compile(r'\((\d+) ([^\s()]+) ([^\s()]+)\)')
+TAGWORD = re.compile(r'\(([^\s()]+) ([^\s()]+)\)')
+WORD = re.compile(r'\([^\s()]+ ([^\s()]+)\)')
+EMPTY_BRACKETS = re.compile(r'\s*\(\s*\(')
class BracketParseCorpusReader(SyntaxCorpusReader):
"""
e.g. "(S (NP (DT the) (JJ little) (NN dog)) (VP (VBD barked)))".
"""
-
- def __init__(
- self,
- root,
- fileids,
- comment_char=None,
- detect_blocks="unindented_paren",
- encoding="utf8",
- tagset=None,
- ):
+ def __init__(self, root, fileids, comment_char=None,
+ detect_blocks='unindented_paren', encoding='utf8',
+ tagset=None):
"""
:param root: The root directory for this corpus.
:param fileids: A list or regexp specifying the fileids in this corpus.
for normalizing or converting the POS tags returned by the
tagged_...() methods.
"""
- # FIXME: Why is it inheritting from SyntaxCorpusReader but initializing
- # from CorpusReader?
CorpusReader.__init__(self, root, fileids, encoding)
self._comment_char = comment_char
self._detect_blocks = detect_blocks
self._tagset = tagset
def _read_block(self, stream):
- if self._detect_blocks == "sexpr":
+ if self._detect_blocks == 'sexpr':
return read_sexpr_block(stream, comment_char=self._comment_char)
- elif self._detect_blocks == "blankline":
+ elif self._detect_blocks == 'blankline':
return read_blankline_block(stream)
- elif self._detect_blocks == "unindented_paren":
+ elif self._detect_blocks == 'unindented_paren':
# Tokens start with unindented left parens.
- toks = read_regexp_block(stream, start_re=r"^\(")
+ toks = read_regexp_block(stream, start_re=r'^\(')
# Strip any comments out of the tokens.
if self._comment_char:
- toks = [
- re.sub("(?m)^%s.*" % re.escape(self._comment_char), "", tok)
- for tok in toks
- ]
+ toks = [re.sub('(?m)^%s.*'%re.escape(self._comment_char),
+ '', tok)
+ for tok in toks]
return toks
else:
- assert 0, "bad block type"
+ assert 0, 'bad block type'
def _normalize(self, t):
+ # If there's an empty set of brackets surrounding the actual
+ # parse, then strip them off.
+ if EMPTY_BRACKETS.match(t):
+ t = t.strip()[1:-1]
# Replace leaves of the form (!), (,), with (! !), (, ,)
t = re.sub(r"\((.)\)", r"(\1 \1)", t)
# Replace leaves of the form (tag word root) with (tag word)
def _parse(self, t):
try:
- tree = Tree.fromstring(self._normalize(t))
- # If there's an empty node at the top, strip it off
- if tree.label() == '' and len(tree) == 1:
- return tree[0]
- else:
- return tree
+ return Tree.fromstring(self._normalize(t))
except ValueError as e:
sys.stderr.write("Bad tree detected; trying to recover...\n")
# Try to recover, if we can:
- if e.args == ("mismatched parens",):
+ if e.args == ('mismatched parens',):
for n in range(1, 5):
try:
- v = Tree(self._normalize(t + ")" * n))
- sys.stderr.write(
- " Recovered by adding %d close " "paren(s)\n" % n
- )
+ v = Tree(self._normalize(t+')'*n))
+ sys.stderr.write(" Recovered by adding %d close "
+ "paren(s)\n" % n)
return v
- except ValueError:
- pass
+ except ValueError: pass
# Try something else:
sys.stderr.write(" Recovered by returning a flat parse.\n")
- # sys.stderr.write(' '.join(t.split())+'\n')
- return Tree("S", self._tag(t))
+ #sys.stderr.write(' '.join(t.split())+'\n')
+ return Tree('S', self._tag(t))
def _tag(self, t, tagset=None):
- tagged_sent = [(w, p) for (p, w) in TAGWORD.findall(self._normalize(t))]
+ tagged_sent = [(w,p) for (p,w) in TAGWORD.findall(self._normalize(t))]
if tagset and tagset != self._tagset:
- tagged_sent = [
- (w, map_tag(self._tagset, tagset, p)) for (w, p) in tagged_sent
- ]
+ tagged_sent = [(w, map_tag(self._tagset, tagset, p)) for (w,p) in tagged_sent]
return tagged_sent
def _word(self, t):
return WORD.findall(self._normalize(t))
-
-class CategorizedBracketParseCorpusReader(
- CategorizedCorpusReader, BracketParseCorpusReader
-):
+class CategorizedBracketParseCorpusReader(CategorizedCorpusReader,
+ BracketParseCorpusReader):
"""
A reader for parsed corpora whose documents are
divided into categories based on their file identifiers.
@author: Nathan Schneider <nschneid@cs.cmu.edu>
"""
-
def __init__(self, *args, **kwargs):
"""
Initialize the corpus reader. Categorization arguments
def _resolve(self, fileids, categories):
if fileids is not None and categories is not None:
- raise ValueError("Specify fileids or categories, not both")
+ raise ValueError('Specify fileids or categories, not both')
if categories is not None:
return self.fileids(categories)
else:
return fileids
-
def raw(self, fileids=None, categories=None):
- return BracketParseCorpusReader.raw(self, self._resolve(fileids, categories))
-
+ return BracketParseCorpusReader.raw(
+ self, self._resolve(fileids, categories))
def words(self, fileids=None, categories=None):
- return BracketParseCorpusReader.words(self, self._resolve(fileids, categories))
-
+ return BracketParseCorpusReader.words(
+ self, self._resolve(fileids, categories))
def sents(self, fileids=None, categories=None):
- return BracketParseCorpusReader.sents(self, self._resolve(fileids, categories))
-
+ return BracketParseCorpusReader.sents(
+ self, self._resolve(fileids, categories))
def paras(self, fileids=None, categories=None):
- return BracketParseCorpusReader.paras(self, self._resolve(fileids, categories))
-
+ return BracketParseCorpusReader.paras(
+ self, self._resolve(fileids, categories))
def tagged_words(self, fileids=None, categories=None, tagset=None):
return BracketParseCorpusReader.tagged_words(
- self, self._resolve(fileids, categories), tagset
- )
-
+ self, self._resolve(fileids, categories), tagset)
def tagged_sents(self, fileids=None, categories=None, tagset=None):
return BracketParseCorpusReader.tagged_sents(
- self, self._resolve(fileids, categories), tagset
- )
-
+ self, self._resolve(fileids, categories), tagset)
def tagged_paras(self, fileids=None, categories=None, tagset=None):
return BracketParseCorpusReader.tagged_paras(
- self, self._resolve(fileids, categories), tagset
- )
-
+ self, self._resolve(fileids, categories), tagset)
def parsed_words(self, fileids=None, categories=None):
return BracketParseCorpusReader.parsed_words(
- self, self._resolve(fileids, categories)
- )
-
+ self, self._resolve(fileids, categories))
def parsed_sents(self, fileids=None, categories=None):
return BracketParseCorpusReader.parsed_sents(
- self, self._resolve(fileids, categories)
- )
-
+ self, self._resolve(fileids, categories))
def parsed_paras(self, fileids=None, categories=None):
return BracketParseCorpusReader.parsed_paras(
- self, self._resolve(fileids, categories)
- )
-
+ self, self._resolve(fileids, categories))
class AlpinoCorpusReader(BracketParseCorpusReader):
"""
Unfortunately this puts punctuation and some other words out of the sentence
order in the xml element tree. This is no good for tag_ and word_
_tag and _word will be overridden to use a non-default new parameter 'ordered'
- to the overridden _normalize function. The _parse function can then remain
+ to the overridden _normalize function. The _parse function can then remain
untouched.
"""
+ def __init__(self, root, encoding='ISO-8859-1', tagset=None):
+ BracketParseCorpusReader.__init__(self, root, r'alpino\.xml',
+ detect_blocks='blankline',
+ encoding=encoding,
+ tagset=tagset)
- def __init__(self, root, encoding="ISO-8859-1", tagset=None):
- BracketParseCorpusReader.__init__(
- self,
- root,
- "alpino\.xml",
- detect_blocks="blankline",
- encoding=encoding,
- tagset=tagset,
- )
-
- def _normalize(self, t, ordered=False):
+ def _normalize(self, t, ordered = False):
"""Normalize the xml sentence element in t.
- The sentence elements <alpino_ds>, although embedded in a few overall
- xml elements, are seperated by blank lines. That's how the reader can
+ The sentence elements <alpino_ds>, although embedded in a few overall
+ xml elements, are separated by blank lines. That's how the reader can
deliver them one at a time.
Each sentence has a few category subnodes that are of no use to us.
The remaining word nodes may or may not appear in the proper order.
- begin : the position of the word in the sentence
- pos : Part of Speech: the Tag
- word : the actual word
- The return value is a string with all xml elementes replaced by
+ The return value is a string with all xml elements replaced by
clauses: either a cat clause with nested clauses, or a word clause.
The order of the bracket clauses closely follows the xml.
If ordered == True, the word clauses include an order sequence number.
# convert XML to sexpr notation
t = re.sub(r' <node .*? cat="(\w+)".*>', r"(\1", t)
if ordered:
- t = re.sub(
- r' <node. *?begin="(\d+)".*? pos="(\w+)".*? word="([^"]+)".*?/>',
- r"(\1 \2 \3)",
- t,
- )
- else:
+ t = re.sub(r' <node. *?begin="(\d+)".*? pos="(\w+)".*? word="([^"]+)".*?/>', r"(\1 \2 \3)", t)
+ else:
t = re.sub(r' <node .*?pos="(\w+)".*? word="([^"]+)".*?/>', r"(\1 \2)", t)
t = re.sub(r" </node>", r")", t)
t = re.sub(r"<sentence>.*</sentence>", r"", t)
return t
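A toy illustration of the rewriting above (a hypothetical two-word fragment, not verbatim Alpino output):

    frag = '''<alpino_ds>
     <node begin="0" cat="top" end="2" id="0" rel="top">
     <node begin="0" end="1" id="1" pos="det" rel="det" word="de"/>
     <node begin="1" end="2" id="2" pos="noun" rel="hd" word="man"/>
     </node>
     <sentence>de man</sentence>
    </alpino_ds>'''
    # _normalize(frag) yields roughly '(top (det de) (noun man) )';
    # with ordered=True the word clauses become '(0 det de)' and '(1 noun man)'.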
def _tag(self, t, tagset=None):
- tagged_sent = [
- (int(o), w, p)
- for (o, p, w) in SORTTAGWRD.findall(self._normalize(t, ordered=True))
- ]
+ tagged_sent = [(int(o), w, p) for (o,p,w) in SORTTAGWRD.findall(self._normalize(t, ordered = True))]
tagged_sent.sort()
if tagset and tagset != self._tagset:
- tagged_sent = [
- (w, map_tag(self._tagset, tagset, p)) for (o, w, p) in tagged_sent
- ]
+ tagged_sent = [(w, map_tag(self._tagset, tagset, p)) for (o,w,p) in tagged_sent]
else:
- tagged_sent = [(w, p) for (o, w, p) in tagged_sent]
+ tagged_sent = [(w,p) for (o,w,p) in tagged_sent]
return tagged_sent
def _word(self, t):
"""Return a correctly ordered list if words"""
tagged_sent = self._tag(t)
- return [w for (w, p) in tagged_sent]
+ return [w for (w,p) in tagged_sent]
+
# Natural Language Toolkit: Categorized Sentences Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
sentiment categorization with respect to rating scales". Proceedings of the
ACL, 2005.
"""
+from six import string_types
from nltk.corpus.reader.api import *
from nltk.tokenize import *
-
class CategorizedSentencesCorpusReader(CategorizedCorpusReader, CorpusReader):
"""
A reader for corpora in which each row represents a single instance, mainly
CorpusView = StreamBackedCorpusView
- def __init__(
- self,
- root,
- fileids,
- word_tokenizer=WhitespaceTokenizer(),
- sent_tokenizer=None,
- encoding="utf8",
- **kwargs
- ):
+ def __init__(self, root, fileids, word_tokenizer=WhitespaceTokenizer(),
+ sent_tokenizer=None, encoding='utf8', **kwargs):
"""
:param root: The root directory for the corpus.
:param fileids: a list or regexp specifying the fileids in the corpus.
def _resolve(self, fileids, categories):
if fileids is not None and categories is not None:
- raise ValueError("Specify fileids or categories, not both")
+ raise ValueError('Specify fileids or categories, not both')
if categories is not None:
return self.fileids(categories)
else:
fileids = self._resolve(fileids, categories)
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
fileids = self._resolve(fileids, categories)
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
- return concat(
- [
- self.CorpusView(path, self._read_sent_block, encoding=enc)
- for (path, enc, fileid) in self.abspaths(fileids, True, True)
- ]
- )
+ return concat([self.CorpusView(path, self._read_sent_block, encoding=enc)
+ for (path, enc, fileid) in self.abspaths(fileids, True, True)])
def words(self, fileids=None, categories=None):
"""
fileids = self._resolve(fileids, categories)
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
- return concat(
- [
- self.CorpusView(path, self._read_word_block, encoding=enc)
- for (path, enc, fileid) in self.abspaths(fileids, True, True)
- ]
- )
+ return concat([self.CorpusView(path, self._read_word_block, encoding=enc)
+ for (path, enc, fileid) in self.abspaths(fileids, True, True)])
def _read_sent_block(self, stream):
sents = []
- for i in range(20): # Read 20 lines at a time.
+ for i in range(20): # Read 20 lines at a time.
line = stream.readline()
if not line:
continue
if self._sent_tokenizer:
- sents.extend(
- [
- self._word_tokenizer.tokenize(sent)
- for sent in self._sent_tokenizer.tokenize(line)
- ]
- )
+ sents.extend([self._word_tokenizer.tokenize(sent)
+ for sent in self._sent_tokenizer.tokenize(line)])
else:
sents.append(self._word_tokenizer.tokenize(line))
return sents
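The sentence_polarity data package ships with a reader of this class; a sketch (assumes nltk.download('sentence_polarity')):

    from nltk.corpus import sentence_polarity

    print(sentence_polarity.categories())                   # ['neg', 'pos']
    print(sentence_polarity.sents(categories='pos')[0][:8])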
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Masato Hagiwara <hagisan@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
# For more information, see http://lilyx.net/pages/nltkjapanesecorpus.html
+from __future__ import print_function
import sys
+from six import string_types
+
from nltk.corpus.reader import util
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
-
class ChasenCorpusReader(CorpusReader):
- def __init__(self, root, fileids, encoding="utf8", sent_splitter=None):
+
+ def __init__(self, root, fileids, encoding='utf8', sent_splitter=None):
self._sent_splitter = sent_splitter
CorpusReader.__init__(self, root, fileids, encoding)
def raw(self, fileids=None):
- if fileids is None:
- fileids = self._fileids
- elif isinstance(fileids, str):
- fileids = [fileids]
+ if fileids is None: fileids = self._fileids
+ elif isinstance(fileids, string_types): fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
def words(self, fileids=None):
- return concat(
- [
- ChasenCorpusView(fileid, enc, False, False, False, self._sent_splitter)
- for (fileid, enc) in self.abspaths(fileids, True)
- ]
- )
+ return concat([ChasenCorpusView(fileid, enc,
+ False, False, False, self._sent_splitter)
+ for (fileid, enc) in self.abspaths(fileids, True)])
def tagged_words(self, fileids=None):
- return concat(
- [
- ChasenCorpusView(fileid, enc, True, False, False, self._sent_splitter)
- for (fileid, enc) in self.abspaths(fileids, True)
- ]
- )
+ return concat([ChasenCorpusView(fileid, enc,
+ True, False, False, self._sent_splitter)
+ for (fileid, enc) in self.abspaths(fileids, True)])
def sents(self, fileids=None):
- return concat(
- [
- ChasenCorpusView(fileid, enc, False, True, False, self._sent_splitter)
- for (fileid, enc) in self.abspaths(fileids, True)
- ]
- )
+ return concat([ChasenCorpusView(fileid, enc,
+ False, True, False, self._sent_splitter)
+ for (fileid, enc) in self.abspaths(fileids, True)])
def tagged_sents(self, fileids=None):
- return concat(
- [
- ChasenCorpusView(fileid, enc, True, True, False, self._sent_splitter)
- for (fileid, enc) in self.abspaths(fileids, True)
- ]
- )
+ return concat([ChasenCorpusView(fileid, enc,
+ True, True, False, self._sent_splitter)
+ for (fileid, enc) in self.abspaths(fileids, True)])
def paras(self, fileids=None):
- return concat(
- [
- ChasenCorpusView(fileid, enc, False, True, True, self._sent_splitter)
- for (fileid, enc) in self.abspaths(fileids, True)
- ]
- )
+ return concat([ChasenCorpusView(fileid, enc,
+ False, True, True, self._sent_splitter)
+ for (fileid, enc) in self.abspaths(fileids, True)])
def tagged_paras(self, fileids=None):
- return concat(
- [
- ChasenCorpusView(fileid, enc, True, True, True, self._sent_splitter)
- for (fileid, enc) in self.abspaths(fileids, True)
- ]
- )
+ return concat([ChasenCorpusView(fileid, enc,
+ True, True, True, self._sent_splitter)
+ for (fileid, enc) in self.abspaths(fileids, True)])
class ChasenCorpusView(StreamBackedCorpusView):
but this uses a fixed word tokenizer and sentence tokenizer.
"""
- def __init__(
- self,
- corpus_file,
- encoding,
- tagged,
- group_by_sent,
- group_by_para,
- sent_splitter=None,
- ):
+ def __init__(self, corpus_file, encoding,
+ tagged, group_by_sent, group_by_para, sent_splitter=None):
self._tagged = tagged
self._group_by_sent = group_by_sent
self._group_by_para = group_by_para
self._sent_splitter = sent_splitter
StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)
+
def read_block(self, stream):
"""Reads one paragraph at a time."""
block = []
sent = []
for line in para_str.splitlines():
- _eos = line.strip() == "EOS"
- _cells = line.split("\t")
- w = (_cells[0], "\t".join(_cells[1:]))
- if not _eos:
- sent.append(w)
+ _eos = line.strip() == 'EOS'
+ _cells = line.split('\t')
+ w = (_cells[0], '\t'.join(_cells[1:]))
+ if not _eos: sent.append(w)
if _eos or (self._sent_splitter and self._sent_splitter(w)):
if not self._tagged:
- sent = [w for (w, t) in sent]
+ sent = [w for (w,t) in sent]
if self._group_by_sent:
para.append(sent)
else:
para.extend(sent)
sent = []
- if len(sent) > 0:
+ if len(sent)>0:
if not self._tagged:
- sent = [w for (w, t) in sent]
+ sent = [w for (w,t) in sent]
if self._group_by_sent:
para.append(sent)
return block
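Each non-EOS line is one token: the surface form is the first tab-separated cell and the remaining analysis cells are kept verbatim as the "tag". A hypothetical ChaSen-style row:

    line = 'dogs\tDOGS\tdog\tnoun-common'   # hypothetical, not real ChaSen output
    cells = line.split('\t')
    w = (cells[0], '\t'.join(cells[1:]))
    print(w)   # ('dogs', 'DOGS\tdog\tnoun-common') -- (surface, analysis)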
-
def demo():
import nltk
from nltk.corpus.util import LazyCorpusLoader
- jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8")
- print("/".join(jeita.words()[22100:22140]))
+ jeita = LazyCorpusLoader(
+ 'jeita', ChasenCorpusReader, r'.*chasen', encoding='utf-8')
+ print('/'.join(jeita.words()[22100:22140]))
- print(
- "\nEOS\n".join(
- "\n".join("%s/%s" % (w[0], w[1].split("\t")[2]) for w in sent)
- for sent in jeita.tagged_sents()[2170:2173]
- )
- )
+ print('\nEOS\n'.join('\n'.join("%s/%s" % (w[0],w[1].split('\t')[2]) for w in sent)
+ for sent in jeita.tagged_sents()[2170:2173]))
def test():
from nltk.corpus.util import LazyCorpusLoader
- jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8")
-
- assert isinstance(jeita.tagged_words()[0][1], str)
+ jeita = LazyCorpusLoader(
+ 'jeita', ChasenCorpusReader, r'.*chasen', encoding='utf-8')
+ assert isinstance(jeita.tagged_words()[0][1], string_types)
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
test()
# CHILDES XML Corpus Reader
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Tomonori Nagano <tnagano@gc.cuny.edu>
# Alexis Dimitriadis <A.Dimitriadis@uu.nl>
# URL: <http://nltk.org/>
"""
Corpus reader for the XML version of the CHILDES corpus.
"""
+from __future__ import print_function, division
-__docformat__ = "epytext en"
+__docformat__ = 'epytext en'
import re
from collections import defaultdict
+from six import string_types
from nltk.util import flatten, LazyMap, LazyConcatenation
from nltk.corpus.reader.xmldocs import XMLCorpusReader, ElementTree
# to resolve the namespace issue
-NS = "http://www.talkbank.org/ns/talkbank"
-
+NS = 'http://www.talkbank.org/ns/talkbank'
class CHILDESCorpusReader(XMLCorpusReader):
"""
Corpus reader for the XML version of the CHILDES corpus.
- The CHILDES corpus is available at ``https://childes.talkbank.org/``. The XML
- version of CHILDES is located at ``https://childes.talkbank.org/data-xml/``.
+ The CHILDES corpus is available at ``http://childes.psy.cmu.edu/``. The XML
+ version of CHILDES is located at ``http://childes.psy.cmu.edu/data-xml/``.
Copy the needed parts of the CHILDES XML corpus into the NLTK data directory
(``nltk_data/corpora/CHILDES/``).
For access to the file text use the usual nltk functions,
``words()``, ``sents()``, ``tagged_words()`` and ``tagged_sents()``.
"""
-
def __init__(self, root, fileids, lazy=True):
XMLCorpusReader.__init__(self, root, fileids)
self._lazy = lazy
- def words(
- self,
- fileids=None,
- speaker="ALL",
- stem=False,
- relation=False,
- strip_space=True,
- replace=False,
- ):
+ def words(self, fileids=None, speaker='ALL', stem=False,
+ relation=False, strip_space=True, replace=False):
"""
:return: the given file(s) as a list of words
:rtype: list(str)
:param replace: If true, then use the replaced (intended) word instead
of the original word (e.g., 'wat' will be replaced with 'watch')
"""
- sent = None
- pos = False
+ sent=None
+ pos=False
if not self._lazy:
- return [
- self._get_words(
- fileid, speaker, sent, stem, relation, pos, strip_space, replace
- )
- for fileid in self.abspaths(fileids)
- ]
-
- get_words = lambda fileid: self._get_words(
- fileid, speaker, sent, stem, relation, pos, strip_space, replace
- )
+ return [self._get_words(fileid, speaker, sent, stem, relation,
+ pos, strip_space, replace) for fileid in self.abspaths(fileids)]
+
+ get_words = lambda fileid: self._get_words(fileid, speaker, sent, stem, relation,
+ pos, strip_space, replace)
return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
- def tagged_words(
- self,
- fileids=None,
- speaker="ALL",
- stem=False,
- relation=False,
- strip_space=True,
- replace=False,
- ):
+ def tagged_words(self, fileids=None, speaker='ALL', stem=False,
+ relation=False, strip_space=True, replace=False):
"""
:return: the given file(s) as a list of tagged
words and punctuation symbols, encoded as tuples
:param replace: If true, then use the replaced (intended) word instead
of the original word (e.g., 'wat' will be replaced with 'watch')
"""
- sent = None
- pos = True
+ sent=None
+ pos=True
if not self._lazy:
- return [
- self._get_words(
- fileid, speaker, sent, stem, relation, pos, strip_space, replace
- )
- for fileid in self.abspaths(fileids)
- ]
-
- get_words = lambda fileid: self._get_words(
- fileid, speaker, sent, stem, relation, pos, strip_space, replace
- )
+ return [self._get_words(fileid, speaker, sent, stem, relation,
+ pos, strip_space, replace) for fileid in self.abspaths(fileids)]
+
+ get_words = lambda fileid: self._get_words(fileid, speaker, sent, stem, relation,
+ pos, strip_space, replace)
return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
- def sents(
- self,
- fileids=None,
- speaker="ALL",
- stem=False,
- relation=None,
- strip_space=True,
- replace=False,
- ):
+ def sents(self, fileids=None, speaker='ALL', stem=False,
+ relation=None, strip_space=True, replace=False):
"""
:return: the given file(s) as a list of sentences or utterances, each
encoded as a list of word strings.
:param replace: If true, then use the replaced (intended) word instead
of the original word (e.g., 'wat' will be replaced with 'watch')
"""
- sent = True
- pos = False
+ sent=True
+ pos=False
if not self._lazy:
- return [
- self._get_words(
- fileid, speaker, sent, stem, relation, pos, strip_space, replace
- )
- for fileid in self.abspaths(fileids)
- ]
-
- get_words = lambda fileid: self._get_words(
- fileid, speaker, sent, stem, relation, pos, strip_space, replace
- )
+ return [self._get_words(fileid, speaker, sent, stem, relation,
+ pos, strip_space, replace) for fileid in self.abspaths(fileids)]
+
+ get_words = lambda fileid: self._get_words(fileid, speaker, sent, stem, relation,
+ pos, strip_space, replace)
return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
- def tagged_sents(
- self,
- fileids=None,
- speaker="ALL",
- stem=False,
- relation=None,
- strip_space=True,
- replace=False,
- ):
+ def tagged_sents(self, fileids=None, speaker='ALL', stem=False,
+ relation=None, strip_space=True, replace=False):
"""
:return: the given file(s) as a list of
sentences, each encoded as a list of ``(word,tag)`` tuples.
:param replace: If true, then use the replaced (intended) word instead
of the original word (e.g., 'wat' will be replaced with 'watch')
"""
- sent = True
- pos = True
+ sent=True
+ pos=True
if not self._lazy:
- return [
- self._get_words(
- fileid, speaker, sent, stem, relation, pos, strip_space, replace
- )
- for fileid in self.abspaths(fileids)
- ]
-
- get_words = lambda fileid: self._get_words(
- fileid, speaker, sent, stem, relation, pos, strip_space, replace
- )
+ return [self._get_words(fileid, speaker, sent, stem, relation,
+ pos, strip_space, replace) for fileid in self.abspaths(fileids)]
+
+ get_words = lambda fileid: self._get_words(fileid, speaker, sent, stem, relation,
+ pos, strip_space, replace)
return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
def corpus(self, fileids=None):
xmldoc = ElementTree.parse(fileid).getroot()
# getting participants' data
pat = dictOfDicts()
- for participant in xmldoc.findall(
- ".//{%s}Participants/{%s}participant" % (NS, NS)
- ):
- for (key, value) in participant.items():
- pat[participant.get("id")][key] = value
+ for participant in xmldoc.findall('.//{%s}Participants/{%s}participant'
+ % (NS,NS)):
+ for (key,value) in participant.items():
+ pat[participant.get('id')][key] = value
return pat
- def age(self, fileids=None, speaker="CHI", month=False):
+ def age(self, fileids=None, speaker='CHI', month=False):
"""
:return: the given file(s) as string or int
:rtype: list or int
:param month: If true, return months instead of year-month-date
"""
if not self._lazy:
- return [
- self._get_age(fileid, speaker, month)
- for fileid in self.abspaths(fileids)
- ]
+ return [self._get_age(fileid, speaker, month)
+ for fileid in self.abspaths(fileids)]
get_age = lambda fileid: self._get_age(fileid, speaker, month)
return LazyMap(get_age, self.abspaths(fileids))
def _get_age(self, fileid, speaker, month):
xmldoc = ElementTree.parse(fileid).getroot()
- for pat in xmldoc.findall(".//{%s}Participants/{%s}participant" % (NS, NS)):
+ for pat in xmldoc.findall('.//{%s}Participants/{%s}participant'
+ % (NS,NS)):
try:
- if pat.get("id") == speaker:
- age = pat.get("age")
+ if pat.get('id') == speaker:
+ age = pat.get('age')
if month:
age = self.convert_age(age)
return age
def convert_age(self, age_year):
"Caclculate age in months from a string in CHILDES format"
- m = re.match("P(\d+)Y(\d+)M?(\d?\d?)D?", age_year)
- age_month = int(m.group(1)) * 12 + int(m.group(2))
+ m = re.match("P(\d+)Y(\d+)M?(\d?\d?)D?",age_year)
+ age_month = int(m.group(1))*12 + int(m.group(2))
try:
if int(m.group(3)) > 15:
age_month += 1
pass
return age_month
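Worked examples of the conversion; this standalone mirror of convert_age adds a month once more than 15 days of the current month have passed:

    import re

    def months(age_string):   # mirrors convert_age above
        m = re.match(r'P(\d+)Y(\d+)M?(\d?\d?)D?', age_string)
        n = int(m.group(1)) * 12 + int(m.group(2))
        if m.group(3) and int(m.group(3)) > 15:
            n += 1            # round up past mid-month
        return n

    print(months('P2Y6M14D'))   # 30 (24 + 6; day 14 does not round up)
    print(months('P1Y9M21D'))   # 22 (12 + 9; day 21 rounds up)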
- def MLU(self, fileids=None, speaker="CHI"):
+ def MLU(self, fileids=None, speaker='CHI'):
"""
:return: the given file(s) as a floating number
:rtype: list(float)
"""
if not self._lazy:
- return [
- self._getMLU(fileid, speaker=speaker)
- for fileid in self.abspaths(fileids)
- ]
+ return [self._getMLU(fileid, speaker=speaker)
+ for fileid in self.abspaths(fileids)]
get_MLU = lambda fileid: self._getMLU(fileid, speaker=speaker)
return LazyMap(get_MLU, self.abspaths(fileids))
def _getMLU(self, fileid, speaker):
- sents = self._get_words(
- fileid,
- speaker=speaker,
- sent=True,
- stem=True,
- relation=False,
- pos=True,
- strip_space=True,
- replace=True,
- )
+ sents = self._get_words(fileid, speaker=speaker, sent=True, stem=True,
+ relation=False, pos=True, strip_space=True, replace=True)
results = []
lastSent = []
numFillers = 0
sentDiscount = 0
for sent in sents:
- posList = [pos for (word, pos) in sent]
+ posList = [pos for (word,pos) in sent]
            # skip the sentence if any part of it is unintelligible
- if any(pos == "unk" for pos in posList):
- continue
+ if any(pos == 'unk' for pos in posList):
+                continue
# if the sentence is null
elif sent == []:
- continue
+                continue
# if the sentence is the same as the last sent
elif sent == lastSent:
- continue
+                continue
else:
- results.append([word for (word, pos) in sent])
+ results.append([word for (word,pos) in sent])
# count number of fillers
- if len(set(["co", None]).intersection(posList)) > 0:
- numFillers += posList.count("co")
+ if len(set(['co',None]).intersection(posList)) > 0:
+ numFillers += posList.count('co')
numFillers += posList.count(None)
sentDiscount += 1
lastSent = sent
thisWordList = flatten(results)
# count number of morphemes
# (e.g., 'read' = 1 morpheme but 'read-PAST' is 2 morphemes)
- numWords = (
- len(flatten([word.split("-") for word in thisWordList])) - numFillers
- )
+ numWords = len(flatten([word.split('-')
+ for word in thisWordList])) - numFillers
numSents = len(results) - sentDiscount
- mlu = numWords / numSents
+ mlu = numWords/numSents
except ZeroDivisionError:
mlu = 0
# return {'mlu':mlu,'wordNum':numWords,'sentNum':numSents}
return mlu
- def _get_words(
- self, fileid, speaker, sent, stem, relation, pos, strip_space, replace
- ):
- if (
- isinstance(speaker, str) and speaker != "ALL"
- ): # ensure we have a list of speakers
- speaker = [speaker]
+ def _get_words(self, fileid, speaker, sent, stem, relation, pos,
+ strip_space, replace):
+ if isinstance(speaker, string_types) and speaker != 'ALL': # ensure we have a list of speakers
+ speaker = [ speaker ]
xmldoc = ElementTree.parse(fileid).getroot()
# processing each xml doc
results = []
- for xmlsent in xmldoc.findall(".//{%s}u" % NS):
+ for xmlsent in xmldoc.findall('.//{%s}u' % NS):
sents = []
# select speakers
- if speaker == "ALL" or xmlsent.get("who") in speaker:
- for xmlword in xmlsent.findall(".//{%s}w" % NS):
- infl = None
- suffixStem = None
- suffixTag = None
+ if speaker == 'ALL' or xmlsent.get('who') in speaker:
+ for xmlword in xmlsent.findall('.//{%s}w' % NS):
+ infl = None ; suffixStem = None; suffixTag = None
# getting replaced words
- if replace and xmlsent.find(".//{%s}w/{%s}replacement" % (NS, NS)):
- xmlword = xmlsent.find(
- ".//{%s}w/{%s}replacement/{%s}w" % (NS, NS, NS)
- )
- elif replace and xmlsent.find(".//{%s}w/{%s}wk" % (NS, NS)):
- xmlword = xmlsent.find(".//{%s}w/{%s}wk" % (NS, NS))
+ if replace and xmlsent.find('.//{%s}w/{%s}replacement'
+ % (NS,NS)):
+ xmlword = xmlsent.find('.//{%s}w/{%s}replacement/{%s}w'
+ % (NS,NS,NS))
+ elif replace and xmlsent.find('.//{%s}w/{%s}wk' % (NS,NS)):
+ xmlword = xmlsent.find('.//{%s}w/{%s}wk' % (NS,NS))
# get text
if xmlword.text:
word = xmlword.text
else:
- word = ""
+ word = ''
                    # strip trailing space
if strip_space:
word = word.strip()
# stem
if relation or stem:
try:
- xmlstem = xmlword.find(".//{%s}stem" % NS)
+ xmlstem = xmlword.find('.//{%s}stem' % NS)
word = xmlstem.text
except AttributeError as e:
pass
# if there is an inflection
try:
- xmlinfl = xmlword.find(
- ".//{%s}mor/{%s}mw/{%s}mk" % (NS, NS, NS)
- )
- word += "-" + xmlinfl.text
+ xmlinfl = xmlword.find('.//{%s}mor/{%s}mw/{%s}mk'
+ % (NS,NS,NS))
+ word += '-' + xmlinfl.text
except:
pass
# if there is a suffix
try:
- xmlsuffix = xmlword.find(
- ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}stem"
- % (NS, NS, NS, NS)
- )
+ xmlsuffix = xmlword.find('.//{%s}mor/{%s}mor-post/{%s}mw/{%s}stem'
+ % (NS,NS,NS,NS))
suffixStem = xmlsuffix.text
except AttributeError:
suffixStem = ""
if suffixStem:
- word += "~" + suffixStem
+ word += "~"+suffixStem
# pos
if relation or pos:
try:
xmlpos = xmlword.findall(".//{%s}c" % NS)
xmlpos2 = xmlword.findall(".//{%s}s" % NS)
if xmlpos2 != []:
- tag = xmlpos[0].text + ":" + xmlpos2[0].text
+ tag = xmlpos[0].text+":"+xmlpos2[0].text
else:
tag = xmlpos[0].text
- except (AttributeError, IndexError) as e:
+ except (AttributeError,IndexError) as e:
tag = ""
try:
- xmlsuffixpos = xmlword.findall(
- ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}c"
- % (NS, NS, NS, NS, NS)
- )
- xmlsuffixpos2 = xmlword.findall(
- ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}s"
- % (NS, NS, NS, NS, NS)
- )
+ xmlsuffixpos = xmlword.findall('.//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}c'
+ % (NS,NS,NS,NS,NS))
+ xmlsuffixpos2 = xmlword.findall('.//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}s'
+ % (NS,NS,NS,NS,NS))
if xmlsuffixpos2:
- suffixTag = (
- xmlsuffixpos[0].text + ":" + xmlsuffixpos2[0].text
- )
+ suffixTag = xmlsuffixpos[0].text+":"+xmlsuffixpos2[0].text
else:
suffixTag = xmlsuffixpos[0].text
except:
pass
if suffixTag:
- tag += "~" + suffixTag
+ tag += "~"+suffixTag
word = (word, tag)
# relational
# the gold standard is stored in
# <mor></mor><mor type="trn"><gra type="grt">
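                    # a sketch of that markup (attribute values assumed):
                    #   <mor><gra index="1" head="2" relation="SUBJ"/></mor>
                    #   <mor type="trn"><gra type="grt" index="1" head="2"
                    #                        relation="SUBJ"/></mor>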
if relation == True:
- for xmlstem_rel in xmlword.findall(
- ".//{%s}mor/{%s}gra" % (NS, NS)
- ):
- if not xmlstem_rel.get("type") == "grt":
- word = (
- word[0],
- word[1],
- xmlstem_rel.get("index")
- + "|"
- + xmlstem_rel.get("head")
- + "|"
- + xmlstem_rel.get("relation"),
- )
+ for xmlstem_rel in xmlword.findall('.//{%s}mor/{%s}gra'
+ % (NS,NS)):
+ if not xmlstem_rel.get('type') == 'grt':
+ word = (word[0], word[1],
+ xmlstem_rel.get('index')
+ + "|" + xmlstem_rel.get('head')
+ + "|" + xmlstem_rel.get('relation'))
else:
- word = (
- word[0],
- word[1],
- word[2],
- word[0],
- word[1],
- xmlstem_rel.get("index")
- + "|"
- + xmlstem_rel.get("head")
- + "|"
- + xmlstem_rel.get("relation"),
- )
+ word = (word[0], word[1], word[2],
+ word[0], word[1],
+ xmlstem_rel.get('index')
+ + "|" + xmlstem_rel.get('head')
+ + "|" + xmlstem_rel.get('relation'))
try:
- for xmlpost_rel in xmlword.findall(
- ".//{%s}mor/{%s}mor-post/{%s}gra" % (NS, NS, NS)
- ):
- if not xmlpost_rel.get("type") == "grt":
- suffixStem = (
- suffixStem[0],
- suffixStem[1],
- xmlpost_rel.get("index")
- + "|"
- + xmlpost_rel.get("head")
- + "|"
- + xmlpost_rel.get("relation"),
- )
+ for xmlpost_rel in xmlword.findall('.//{%s}mor/{%s}mor-post/{%s}gra'
+ % (NS,NS,NS)):
+ if not xmlpost_rel.get('type') == 'grt':
+ suffixStem = (suffixStem[0],
+ suffixStem[1],
+ xmlpost_rel.get('index')
+ + "|" + xmlpost_rel.get('head')
+ + "|" + xmlpost_rel.get('relation'))
else:
- suffixStem = (
- suffixStem[0],
- suffixStem[1],
- suffixStem[2],
- suffixStem[0],
- suffixStem[1],
- xmlpost_rel.get("index")
- + "|"
- + xmlpost_rel.get("head")
- + "|"
- + xmlpost_rel.get("relation"),
- )
+ suffixStem = (suffixStem[0], suffixStem[1],
+ suffixStem[2], suffixStem[0],
+ suffixStem[1],
+ xmlpost_rel.get('index')
+ + "|" + xmlpost_rel.get('head')
+ + "|" + xmlpost_rel.get('relation'))
except:
pass
sents.append(word)
results.extend(sents)
return LazyMap(lambda x: x, results)
+
# Ready-to-use browser opener
"""
shouldn't need to be changed, unless CHILDES changes the configuration
of their server or unless the user sets up their own corpus webserver.
"""
- childes_url_base = r"https://childes.talkbank.org/browser/index.php?url="
+ childes_url_base = r'http://childes.psy.cmu.edu/browser/index.php?url='
+
def webview_file(self, fileid, urlbase=None):
"""Map a corpus file to its web version on the CHILDES website,
        e.g., if the corpus root points to the Cornell folder, pass urlbase='Eng-USA/Cornell'.
"""
- import webbrowser
+ import webbrowser, re
if urlbase:
- path = urlbase + "/" + fileid
+ path = urlbase+"/"+fileid
else:
full = self.root + "/" + fileid
- full = re.sub(r"\\", "/", full)
- if "/childes/" in full.lower():
+ full = re.sub(r'\\', '/', full)
+ if '/childes/' in full.lower():
# Discard /data-xml/ if present
- path = re.findall(r"(?i)/childes(?:/data-xml)?/(.*)\.xml", full)[0]
- elif "eng-usa" in full.lower():
- path = "Eng-USA/" + re.findall(r"/(?i)Eng-USA/(.*)\.xml", full)[0]
+ path = re.findall(r'(?i)/childes(?:/data-xml)?/(.*)\.xml', full)[0]
+ elif 'eng-usa' in full.lower():
+ path = 'Eng-USA/' + re.findall(r'/(?i)Eng-USA/(.*)\.xml', full)[0]
else:
path = fileid
# Strip ".xml" and add ".cha", as necessary:
- if path.endswith(".xml"):
+ if path.endswith('.xml'):
path = path[:-4]
- if not path.endswith(".cha"):
- path = path + ".cha"
+ if not path.endswith('.cha'):
+ path = path+'.cha'
url = self.childes_url_base + path
# raw_input("Hit Return to continue")
+
def demo(corpus_root=None):
"""
The CHILDES corpus should be manually downloaded and saved
"""
if not corpus_root:
from nltk.data import find
-
- corpus_root = find("corpora/childes/data-xml/Eng-USA/")
+ corpus_root = find('corpora/childes/data-xml/Eng-USA/')
try:
- childes = CHILDESCorpusReader(corpus_root, ".*.xml")
+ childes = CHILDESCorpusReader(corpus_root, '.*.xml')
# describe all corpus
for file in childes.fileids()[:5]:
- corpus = ""
- corpus_id = ""
- for (key, value) in childes.corpus(file)[0].items():
- if key == "Corpus":
- corpus = value
- if key == "Id":
- corpus_id = value
- print("Reading", corpus, corpus_id, " .....")
- print("words:", childes.words(file)[:7], "...")
- print(
- "words with replaced words:",
- childes.words(file, replace=True)[:7],
- " ...",
- )
- print("words with pos tags:", childes.tagged_words(file)[:7], " ...")
- print("words (only MOT):", childes.words(file, speaker="MOT")[:7], "...")
- print("words (only CHI):", childes.words(file, speaker="CHI")[:7], "...")
- print("stemmed words:", childes.words(file, stem=True)[:7], " ...")
- print(
- "words with relations and pos-tag:",
- childes.words(file, relation=True)[:5],
- " ...",
- )
- print("sentence:", childes.sents(file)[:2], " ...")
+ corpus = ''
+ corpus_id = ''
+ for (key,value) in childes.corpus(file)[0].items():
+ if key == "Corpus": corpus = value
+ if key == "Id": corpus_id = value
+ print('Reading', corpus,corpus_id,' .....')
+ print("words:", childes.words(file)[:7],"...")
+ print("words with replaced words:", childes.words(file, replace=True)[:7]," ...")
+ print("words with pos tags:", childes.tagged_words(file)[:7]," ...")
+ print("words (only MOT):", childes.words(file, speaker='MOT')[:7], "...")
+ print("words (only CHI):", childes.words(file, speaker='CHI')[:7], "...")
+ print("stemmed words:", childes.words(file, stem=True)[:7]," ...")
+ print("words with relations and pos-tag:", childes.words(file, relation=True)[:5]," ...")
+ print("sentence:", childes.sents(file)[:2]," ...")
for (participant, values) in childes.participants(file)[0].items():
- for (key, value) in values.items():
- print("\tparticipant", participant, key, ":", value)
+ for (key, value) in values.items():
+ print("\tparticipant", participant, key, ":", value)
print("num of sent:", len(childes.sents(file)))
print("num of morphemes:", len(childes.words(file, stem=True)))
print("age:", childes.age(file))
print()
except LookupError as e:
- print(
- """The CHILDES corpus, or the parts you need, should be manually
- downloaded from https://childes.talkbank.org/data-xml/ and saved at
+ print("""The CHILDES corpus, or the parts you need, should be manually
+ downloaded from http://childes.psy.cmu.edu/data-xml/ and saved at
[NLTK_Data_Dir]/corpora/childes/
Alternately, you can call the demo with the path to a portion of the CHILDES corpus, e.g.:
demo('/path/to/childes/data-xml/Eng-USA/')
- """
- )
- # corpus_root_http = urllib2.urlopen('https://childes.talkbank.org/data-xml/Eng-USA/Bates.zip')
- # corpus_root_http_bates = zipfile.ZipFile(cStringIO.StringIO(corpus_root_http.read()))
+ """)
+ #corpus_root_http = urllib2.urlopen('http://childes.psy.cmu.edu/data-xml/Eng-USA/Bates.zip')
+ #corpus_root_http_bates = zipfile.ZipFile(cStringIO.StringIO(corpus_root_http.read()))
##this fails
- # childes = CHILDESCorpusReader(corpus_root_http_bates,corpus_root_http_bates.namelist())
+ #childes = CHILDESCorpusReader(corpus_root_http_bates,corpus_root_http_bates.namelist())
if __name__ == "__main__":
# Natural Language Toolkit: Chunked Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
import os.path, codecs
+from six import string_types
+
import nltk
from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
from nltk.tree import Tree
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
-
class ChunkedCorpusReader(CorpusReader):
"""
    Reader for chunked (and optionally tagged) corpora. Paragraphs
    are split on blank lines; sentences are listed one per line; and
    sentences are parsed into chunk trees using ``nltk.chunk.tagstr2tree``.
"""
-
- def __init__(
- self,
- root,
- fileids,
- extension="",
- str2chunktree=tagstr2tree,
- sent_tokenizer=RegexpTokenizer("\n", gaps=True),
- para_block_reader=read_blankline_block,
- encoding="utf8",
- tagset=None,
- ):
+ def __init__(self, root, fileids, extension='',
+ str2chunktree=tagstr2tree,
+ sent_tokenizer=RegexpTokenizer('\n', gaps=True),
+ para_block_reader=read_blankline_block,
+ encoding='utf8', tagset=None):
"""
:param root: The root directory for this corpus.
:param fileids: A list or regexp specifying the fileids in this corpus.
:return: the given file(s) as a single string.
:rtype: str
"""
- if fileids is None:
- fileids = self._fileids
- elif isinstance(fileids, str):
- fileids = [fileids]
+ if fileids is None: fileids = self._fileids
+ elif isinstance(fileids, string_types): fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
def words(self, fileids=None):
and punctuation symbols.
:rtype: list(str)
"""
- return concat(
- [
- ChunkedCorpusView(f, enc, 0, 0, 0, 0, *self._cv_args)
- for (f, enc) in self.abspaths(fileids, True)
- ]
- )
+ return concat([ChunkedCorpusView(f, enc, 0, 0, 0, 0, *self._cv_args)
+ for (f, enc) in self.abspaths(fileids, True)])
def sents(self, fileids=None):
"""
strings.
:rtype: list(list(str))
"""
- return concat(
- [
- ChunkedCorpusView(f, enc, 0, 1, 0, 0, *self._cv_args)
- for (f, enc) in self.abspaths(fileids, True)
- ]
- )
+ return concat([ChunkedCorpusView(f, enc, 0, 1, 0, 0, *self._cv_args)
+ for (f, enc) in self.abspaths(fileids, True)])
def paras(self, fileids=None):
"""
in turn encoded as lists of word strings.
:rtype: list(list(list(str)))
"""
- return concat(
- [
- ChunkedCorpusView(f, enc, 0, 1, 1, 0, *self._cv_args)
- for (f, enc) in self.abspaths(fileids, True)
- ]
- )
+ return concat([ChunkedCorpusView(f, enc, 0, 1, 1, 0, *self._cv_args)
+ for (f, enc) in self.abspaths(fileids, True)])
def tagged_words(self, fileids=None, tagset=None):
"""
``(word,tag)``.
:rtype: list(tuple(str,str))
"""
- return concat(
- [
- ChunkedCorpusView(
- f, enc, 1, 0, 0, 0, *self._cv_args, target_tagset=tagset
- )
- for (f, enc) in self.abspaths(fileids, True)
- ]
- )
+ return concat([ChunkedCorpusView(f, enc, 1, 0, 0, 0, *self._cv_args, target_tagset=tagset)
+ for (f, enc) in self.abspaths(fileids, True)])
def tagged_sents(self, fileids=None, tagset=None):
"""
:rtype: list(list(tuple(str,str)))
"""
- return concat(
- [
- ChunkedCorpusView(
- f, enc, 1, 1, 0, 0, *self._cv_args, target_tagset=tagset
- )
- for (f, enc) in self.abspaths(fileids, True)
- ]
- )
+ return concat([ChunkedCorpusView(f, enc, 1, 1, 0, 0, *self._cv_args, target_tagset=tagset)
+ for (f, enc) in self.abspaths(fileids, True)])
def tagged_paras(self, fileids=None, tagset=None):
"""
in turn encoded as lists of ``(word,tag)`` tuples.
:rtype: list(list(list(tuple(str,str))))
"""
- return concat(
- [
- ChunkedCorpusView(
- f, enc, 1, 1, 1, 0, *self._cv_args, target_tagset=tagset
- )
- for (f, enc) in self.abspaths(fileids, True)
- ]
- )
+ return concat([ChunkedCorpusView(f, enc, 1, 1, 1, 0, *self._cv_args, target_tagset=tagset)
+ for (f, enc) in self.abspaths(fileids, True)])
def chunked_words(self, fileids=None, tagset=None):
"""
trees over ``(word,tag)`` tuples or word strings.
:rtype: list(tuple(str,str) and Tree)
"""
- return concat(
- [
- ChunkedCorpusView(
- f, enc, 1, 0, 0, 1, *self._cv_args, target_tagset=tagset
- )
- for (f, enc) in self.abspaths(fileids, True)
- ]
- )
+ return concat([ChunkedCorpusView(f, enc, 1, 0, 0, 1, *self._cv_args, target_tagset=tagset)
+ for (f, enc) in self.abspaths(fileids, True)])
def chunked_sents(self, fileids=None, tagset=None):
"""
tags).
:rtype: list(Tree)
"""
- return concat(
- [
- ChunkedCorpusView(
- f, enc, 1, 1, 0, 1, *self._cv_args, target_tagset=tagset
- )
- for (f, enc) in self.abspaths(fileids, True)
- ]
- )
+ return concat([ChunkedCorpusView(f, enc, 1, 1, 0, 1, *self._cv_args, target_tagset=tagset)
+ for (f, enc) in self.abspaths(fileids, True)])
def chunked_paras(self, fileids=None, tagset=None):
"""
has tags) or word strings (if the corpus has no tags).
:rtype: list(list(Tree))
"""
- return concat(
- [
- ChunkedCorpusView(
- f, enc, 1, 1, 1, 1, *self._cv_args, target_tagset=tagset
- )
- for (f, enc) in self.abspaths(fileids, True)
- ]
- )
+ return concat([ChunkedCorpusView(f, enc, 1, 1, 1, 1, *self._cv_args, target_tagset=tagset)
+ for (f, enc) in self.abspaths(fileids, True)])
def _read_block(self, stream):
return [tagstr2tree(t) for t in read_blankline_block(stream)]
-
class ChunkedCorpusView(StreamBackedCorpusView):
- def __init__(
- self,
- fileid,
- encoding,
- tagged,
- group_by_sent,
- group_by_para,
- chunked,
- str2chunktree,
- sent_tokenizer,
- para_block_reader,
- source_tagset=None,
- target_tagset=None,
- ):
+ def __init__(self, fileid, encoding, tagged, group_by_sent,
+ group_by_para, chunked, str2chunktree, sent_tokenizer,
+ para_block_reader, source_tagset=None, target_tagset=None):
StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)
self._tagged = tagged
self._group_by_sent = group_by_sent
for para_str in self._para_block_reader(stream):
para = []
for sent_str in self._sent_tokenizer.tokenize(para_str):
- sent = self._str2chunktree(
- sent_str,
- source_tagset=self._source_tagset,
- target_tagset=self._target_tagset,
- )
+ sent = self._str2chunktree(sent_str, source_tagset=self._source_tagset,
+ target_tagset=self._target_tagset)
# If requested, throw away the tags.
if not self._tagged:
elif isinstance(child, tuple):
tree[i] = child[0]
else:
- raise ValueError("expected child to be Tree or tuple")
+ raise ValueError('expected child to be Tree or tuple')
return tree
# Natural Language Toolkit: Carnegie Mellon Pronouncing Dictionary Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
ZH seizure S IY ZH ER
"""
+import codecs
+
+from six import string_types
+
+from nltk import compat
from nltk.util import Index
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
-
class CMUDictCorpusReader(CorpusReader):
def entries(self):
"""
:return: the cmudict lexicon as a list of entries
containing (word, transcriptions) tuples.
"""
- return concat(
- [
- StreamBackedCorpusView(fileid, read_cmudict_block, encoding=enc)
- for fileid, enc in self.abspaths(None, True)
- ]
- )
+ return concat([StreamBackedCorpusView(fileid, read_cmudict_block,
+ encoding=enc)
+ for fileid, enc in self.abspaths(None, True)])
def raw(self):
"""
:return: the cmudict lexicon as a raw string.
"""
fileids = self._fileids
- if isinstance(fileids, str):
+ if isinstance(fileids, string_types):
fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
"""
return dict(Index(self.entries()))
-
def read_cmudict_block(stream):
entries = []
- while len(entries) < 100: # Read 100 at a time.
+ while len(entries) < 100: # Read 100 at a time.
line = stream.readline()
- if line == "":
- return entries # end of file.
+ if line == '': return entries # end of file.
pieces = line.split()
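        # Each line is 'WORD N PH1 PH2 ...', where N apparently numbers the
        # pronunciation variant; hence pieces[0] (word) and pieces[2:]
        # (phones) below.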
- entries.append((pieces[0].lower(), pieces[2:]))
+ entries.append( (pieces[0].lower(), pieces[2:]) )
return entries
# Natural Language Toolkit: Comparative Sentence Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
import re
+from six import string_types
+
from nltk.corpus.reader.api import *
from nltk.tokenize import *
# Regular expressions for dataset components
-STARS = re.compile(r"^\*+$")
-COMPARISON = re.compile(r"<cs-[1234]>")
-CLOSE_COMPARISON = re.compile(r"</cs-[1234]>")
-GRAD_COMPARISON = re.compile(r"<cs-[123]>")
-NON_GRAD_COMPARISON = re.compile(r"<cs-4>")
+STARS = re.compile(r'^\*+$')
+COMPARISON = re.compile(r'<cs-[1234]>')
+CLOSE_COMPARISON = re.compile(r'</cs-[1234]>')
+GRAD_COMPARISON = re.compile(r'<cs-[123]>')
+NON_GRAD_COMPARISON = re.compile(r'<cs-4>')
ENTITIES_FEATS = re.compile(r"(\d)_((?:[\.\w\s/-](?!\d_))+)")
-KEYWORD = re.compile(r"\((?!.*\()(.*)\)$")
-
+KEYWORD = re.compile(r'\((?!.*\()(.*)\)$')
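# A sketch of the annotated format these patterns target (values assumed):
# a sentence line carries tags such as <cs-2>...</cs-2>; the relation line
# that follows looks like '1_camera 2_phone 3_picture quality (better)',
# where ENTITIES_FEATS captures the '<digit>_<text>' fields, KEYWORD the
# trailing parenthesized keyword, and STARS the '****' section separators.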
class Comparison(object):
"""
A Comparison represents a comparative sentence and its constituents.
"""
-
- def __init__(
- self,
- text=None,
- comp_type=None,
- entity_1=None,
- entity_2=None,
- feature=None,
- keyword=None,
- ):
+ def __init__(self, text=None, comp_type=None, entity_1=None, entity_2=None,
+ feature=None, keyword=None):
"""
        :param text: a string (optionally tokenized) containing a comparison.
        :param comp_type: an integer defining the type of comparison expressed (1-3: graded; 4: non-graded).
self.keyword = keyword
def __repr__(self):
- return (
- 'Comparison(text="{}", comp_type={}, entity_1="{}", entity_2="{}", '
- 'feature="{}", keyword="{}")'
- ).format(
- self.text,
- self.comp_type,
- self.entity_1,
- self.entity_2,
- self.feature,
- self.keyword,
- )
-
+ return ("Comparison(text=\"{}\", comp_type={}, entity_1=\"{}\", entity_2=\"{}\", "
+ "feature=\"{}\", keyword=\"{}\")").format(self.text, self.comp_type,
+ self.entity_1, self.entity_2, self.feature, self.keyword)
class ComparativeSentencesCorpusReader(CorpusReader):
"""
>>> len(comparative_sentences.comparisons())
853
"""
-
CorpusView = StreamBackedCorpusView
- def __init__(
- self,
- root,
- fileids,
- word_tokenizer=WhitespaceTokenizer(),
- sent_tokenizer=None,
- encoding="utf8",
- ):
+ def __init__(self, root, fileids, word_tokenizer=WhitespaceTokenizer(),
+ sent_tokenizer=None, encoding='utf8'):
"""
:param root: The root directory for this corpus.
:param fileids: a list or regexp specifying the fileids in this corpus.
"""
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
- return concat(
- [
- self.CorpusView(path, self._read_comparison_block, encoding=enc)
- for (path, enc, fileid) in self.abspaths(fileids, True, True)
- ]
- )
+ return concat([self.CorpusView(path, self._read_comparison_block, encoding=enc)
+ for (path, enc, fileid) in self.abspaths(fileids, True, True)])
def keywords(self, fileids=None):
"""
:return: the set of keywords and comparative phrases used in the corpus.
:rtype: set(str)
"""
- all_keywords = concat(
- [
- self.CorpusView(path, self._read_keyword_block, encoding=enc)
- for (path, enc, fileid) in self.abspaths(fileids, True, True)
- ]
- )
-
- keywords_set = set(keyword.lower() for keyword in all_keywords if keyword)
+ all_keywords = concat([self.CorpusView(path, self._read_keyword_block, encoding=enc)
+ for (path, enc, fileid)
+ in self.abspaths(fileids, True, True)])
+
+ keywords_set = set([keyword.lower() for keyword in all_keywords if keyword])
return keywords_set
def keywords_readme(self):
"""
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
strings, if no word tokenizer is specified).
:rtype: list(list(str)) or list(str)
"""
- return concat(
- [
- self.CorpusView(path, self._read_sent_block, encoding=enc)
- for (path, enc, fileid) in self.abspaths(fileids, True, True)
- ]
- )
+ return concat([self.CorpusView(path, self._read_sent_block, encoding=enc)
+ for (path, enc, fileid) in self.abspaths(fileids, True, True)])
def words(self, fileids=None):
"""
:return: the given file(s) as a list of words and punctuation symbols.
:rtype: list(str)
"""
- return concat(
- [
- self.CorpusView(path, self._read_word_block, encoding=enc)
- for (path, enc, fileid) in self.abspaths(fileids, True, True)
- ]
- )
+ return concat([self.CorpusView(path, self._read_word_block, encoding=enc)
+ for (path, enc, fileid)
+ in self.abspaths(fileids, True, True)])
def _read_comparison_block(self, stream):
while True:
line = stream.readline()
if not line:
- return [] # end of file.
+ return [] # end of file.
comparison_tags = re.findall(COMPARISON, line)
if comparison_tags:
grad_comparisons = re.findall(GRAD_COMPARISON, line)
if grad_comparisons:
# Each comparison tag has its own relations on a separate line
for comp in grad_comparisons:
- comp_type = int(re.match(r"<cs-(\d)>", comp).group(1))
- comparison = Comparison(
- text=comparison_text, comp_type=comp_type
- )
+ comp_type = int(re.match(r'<cs-(\d)>', comp).group(1))
+ comparison = Comparison(text=comparison_text, comp_type=comp_type)
line = stream.readline()
entities_feats = ENTITIES_FEATS.findall(line)
if entities_feats:
for (code, entity_feat) in entities_feats:
- if code == "1":
+ if code == '1':
comparison.entity_1 = entity_feat.strip()
- elif code == "2":
+ elif code == '2':
comparison.entity_2 = entity_feat.strip()
- elif code == "3":
+ elif code == '3':
comparison.feature = entity_feat.strip()
keyword = KEYWORD.findall(line)
if keyword:
if non_grad_comparisons:
for comp in non_grad_comparisons:
# comp_type in this case should always be 4.
- comp_type = int(re.match(r"<cs-(\d)>", comp).group(1))
- comparison = Comparison(
- text=comparison_text, comp_type=comp_type
- )
+ comp_type = int(re.match(r'<cs-(\d)>', comp).group(1))
+ comparison = Comparison(text=comparison_text, comp_type=comp_type)
comparison_bundle.append(comparison)
# Flatten the list of comparisons before returning them
# return concat([comparison_bundle])
if re.match(STARS, line):
break
continue
- if (
- not re.findall(COMPARISON, line)
- and not ENTITIES_FEATS.findall(line)
- and not re.findall(CLOSE_COMPARISON, line)
- ):
+ if not re.findall(COMPARISON, line) and not ENTITIES_FEATS.findall(line) \
+ and not re.findall(CLOSE_COMPARISON, line):
if self._sent_tokenizer:
- return [
- self._word_tokenizer.tokenize(sent)
- for sent in self._sent_tokenizer.tokenize(line)
- ]
+ return [self._word_tokenizer.tokenize(sent)
+ for sent in self._sent_tokenizer.tokenize(line)]
else:
return [self._word_tokenizer.tokenize(line)]
# Natural Language Toolkit: CONLL Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
Read CoNLL-style chunk fileids.
"""
+from __future__ import unicode_literals
+
+import os
+import codecs
import textwrap
+from six import string_types
+
+from nltk import compat
from nltk.tree import Tree
from nltk.util import LazyMap, LazyConcatenation
from nltk.tag import map_tag
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
-
class ConllCorpusReader(CorpusReader):
"""
A corpus reader for CoNLL-style files. These files consist of a
annotation type. The set of columns used by CoNLL-style files can
vary from corpus to corpus; the ``ConllCorpusReader`` constructor
therefore takes an argument, ``columntypes``, which is used to
- specify the columns that are used by a given corpus. By default
- columns are split by consecutive whitespaces, with the
- ``separator`` argument you can set a string to split by (e.g.
- ``\'\t\'``).
-
+ specify the columns that are used by a given corpus.
@todo: Add support for reading from corpora where different
parallel files contain different columns.
    document at a time (e.g. parsed_documents()).
"""
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# Column Types
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
- WORDS = "words" #: column type for words
- POS = "pos" #: column type for part-of-speech tags
- TREE = "tree" #: column type for parse trees
- CHUNK = "chunk" #: column type for chunk structures
- NE = "ne" #: column type for named entities
- SRL = "srl" #: column type for semantic role labels
- IGNORE = "ignore" #: column type for column that should be ignored
+ WORDS = 'words' #: column type for words
+ POS = 'pos' #: column type for part-of-speech tags
+ TREE = 'tree' #: column type for parse trees
+ CHUNK = 'chunk' #: column type for chunk structures
+ NE = 'ne' #: column type for named entities
+ SRL = 'srl' #: column type for semantic role labels
+ IGNORE = 'ignore' #: column type for column that should be ignored
#: A list of all column types supported by the conll corpus reader.
COLUMN_TYPES = (WORDS, POS, TREE, CHUNK, NE, SRL, IGNORE)
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# Constructor
- # /////////////////////////////////////////////////////////////////
-
- def __init__(
- self,
- root,
- fileids,
- columntypes,
- chunk_types=None,
- root_label="S",
- pos_in_tree=False,
- srl_includes_roleset=True,
- encoding="utf8",
- tree_class=Tree,
- tagset=None,
- separator=None,
- ):
+ #/////////////////////////////////////////////////////////////////
+
+ def __init__(self, root, fileids, columntypes,
+ chunk_types=None, root_label='S', pos_in_tree=False,
+ srl_includes_roleset=True, encoding='utf8',
+ tree_class=Tree, tagset=None):
for columntype in columntypes:
if columntype not in self.COLUMN_TYPES:
- raise ValueError("Bad column type %r" % columntype)
- if isinstance(chunk_types, str):
+ raise ValueError('Bad column type %r' % columntype)
+ if isinstance(chunk_types, string_types):
chunk_types = [chunk_types]
self._chunk_types = chunk_types
- self._colmap = dict((c, i) for (i, c) in enumerate(columntypes))
+ self._colmap = dict((c,i) for (i,c) in enumerate(columntypes))
self._pos_in_tree = pos_in_tree
- self._root_label = root_label # for chunks
+ self._root_label = root_label # for chunks
self._srl_includes_roleset = srl_includes_roleset
self._tree_class = tree_class
CorpusReader.__init__(self, root, fileids, encoding)
self._tagset = tagset
- self.sep = separator
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# Data Access Methods
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
def raw(self, fileids=None):
- if fileids is None:
- fileids = self._fileids
- elif isinstance(fileids, str):
- fileids = [fileids]
+ if fileids is None: fileids = self._fileids
+ elif isinstance(fileids, string_types): fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
def words(self, fileids=None):
def tagged_words(self, fileids=None, tagset=None):
self._require(self.WORDS, self.POS)
-
def get_tagged_words(grid):
return self._get_tagged_words(grid, tagset)
-
- return LazyConcatenation(LazyMap(get_tagged_words, self._grids(fileids)))
+ return LazyConcatenation(LazyMap(get_tagged_words,
+ self._grids(fileids)))
def tagged_sents(self, fileids=None, tagset=None):
self._require(self.WORDS, self.POS)
-
def get_tagged_words(grid):
return self._get_tagged_words(grid, tagset)
-
return LazyMap(get_tagged_words, self._grids(fileids))
- def chunked_words(self, fileids=None, chunk_types=None, tagset=None):
+ def chunked_words(self, fileids=None, chunk_types=None,
+ tagset=None):
self._require(self.WORDS, self.POS, self.CHUNK)
- if chunk_types is None:
- chunk_types = self._chunk_types
-
- def get_chunked_words(grid): # capture chunk_types as local var
+ if chunk_types is None: chunk_types = self._chunk_types
+ def get_chunked_words(grid): # capture chunk_types as local var
return self._get_chunked_words(grid, chunk_types, tagset)
+ return LazyConcatenation(LazyMap(get_chunked_words,
+ self._grids(fileids)))
- return LazyConcatenation(LazyMap(get_chunked_words, self._grids(fileids)))
-
- def chunked_sents(self, fileids=None, chunk_types=None, tagset=None):
+ def chunked_sents(self, fileids=None, chunk_types=None,
+ tagset=None):
self._require(self.WORDS, self.POS, self.CHUNK)
- if chunk_types is None:
- chunk_types = self._chunk_types
-
- def get_chunked_words(grid): # capture chunk_types as local var
+ if chunk_types is None: chunk_types = self._chunk_types
+ def get_chunked_words(grid): # capture chunk_types as local var
return self._get_chunked_words(grid, chunk_types, tagset)
-
return LazyMap(get_chunked_words, self._grids(fileids))
def parsed_sents(self, fileids=None, pos_in_tree=None, tagset=None):
self._require(self.WORDS, self.POS, self.TREE)
- if pos_in_tree is None:
- pos_in_tree = self._pos_in_tree
-
- def get_parsed_sent(grid): # capture pos_in_tree as local var
+ if pos_in_tree is None: pos_in_tree = self._pos_in_tree
+ def get_parsed_sent(grid): # capture pos_in_tree as local var
return self._get_parsed_sent(grid, pos_in_tree, tagset)
-
return LazyMap(get_parsed_sent, self._grids(fileids))
def srl_spans(self, fileids=None):
def srl_instances(self, fileids=None, pos_in_tree=None, flatten=True):
self._require(self.WORDS, self.POS, self.TREE, self.SRL)
- if pos_in_tree is None:
- pos_in_tree = self._pos_in_tree
-
- def get_srl_instances(grid): # capture pos_in_tree as local var
+ if pos_in_tree is None: pos_in_tree = self._pos_in_tree
+ def get_srl_instances(grid): # capture pos_in_tree as local var
return self._get_srl_instances(grid, pos_in_tree)
-
result = LazyMap(get_srl_instances, self._grids(fileids))
- if flatten:
- result = LazyConcatenation(result)
+ if flatten: result = LazyConcatenation(result)
return result
def iob_words(self, fileids=None, tagset=None):
:type fileids: None or str or list
"""
self._require(self.WORDS, self.POS, self.CHUNK)
-
def get_iob_words(grid):
return self._get_iob_words(grid, tagset)
-
return LazyConcatenation(LazyMap(get_iob_words, self._grids(fileids)))
def iob_sents(self, fileids=None, tagset=None):
:type fileids: None or str or list
"""
self._require(self.WORDS, self.POS, self.CHUNK)
-
def get_iob_words(grid):
return self._get_iob_words(grid, tagset)
-
return LazyMap(get_iob_words, self._grids(fileids))
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# Grid Reading
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
def _grids(self, fileids=None):
# n.b.: we could cache the object returned here (keyed on
# fileids), which would let us reuse the same corpus view for
# different things (eg srl and parse trees).
- return concat(
- [
- StreamBackedCorpusView(fileid, self._read_grid_block, encoding=enc)
- for (fileid, enc) in self.abspaths(fileids, True)
- ]
- )
+ return concat([StreamBackedCorpusView(fileid, self._read_grid_block,
+ encoding=enc)
+ for (fileid, enc) in self.abspaths(fileids, True)])
def _read_grid_block(self, stream):
grids = []
for block in read_blankline_block(stream):
block = block.strip()
- if not block:
- continue
+ if not block: continue
- grid = [line.split(self.sep) for line in block.split("\n")]
+ grid = [line.split() for line in block.split('\n')]
# If there's a docstart row, then discard. ([xx] eventually it
# would be good to actually use it)
- if grid[0][self._colmap.get("words", 0)] == "-DOCSTART-":
+ if grid[0][self._colmap.get('words', 0)] == '-DOCSTART-':
del grid[0]
# Check that the grid is consistent.
for row in grid:
if len(row) != len(grid[0]):
- raise ValueError("Inconsistent number of columns:\n%s" % block)
+ raise ValueError('Inconsistent number of columns:\n%s'
+ % block)
grids.append(grid)
return grids
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# Transforms
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# given a grid, transform it into some representation (e.g.,
# a list of words or a parse tree).
def _get_words(self, grid):
- return self._get_column(grid, self._colmap["words"])
+ return self._get_column(grid, self._colmap['words'])
def _get_tagged_words(self, grid, tagset=None):
- pos_tags = self._get_column(grid, self._colmap["pos"])
+ pos_tags = self._get_column(grid, self._colmap['pos'])
if tagset and tagset != self._tagset:
pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
- return list(zip(self._get_column(grid, self._colmap["words"]), pos_tags))
+ return list(zip(self._get_column(grid, self._colmap['words']), pos_tags))
def _get_iob_words(self, grid, tagset=None):
- pos_tags = self._get_column(grid, self._colmap["pos"])
+ pos_tags = self._get_column(grid, self._colmap['pos'])
if tagset and tagset != self._tagset:
pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
- return list(
- zip(
- self._get_column(grid, self._colmap["words"]),
- pos_tags,
- self._get_column(grid, self._colmap["chunk"]),
- )
- )
+ return list(zip(self._get_column(grid, self._colmap['words']), pos_tags,
+ self._get_column(grid, self._colmap['chunk'])))
def _get_chunked_words(self, grid, chunk_types, tagset=None):
# n.b.: this method is very similar to conllstr2tree.
- words = self._get_column(grid, self._colmap["words"])
- pos_tags = self._get_column(grid, self._colmap["pos"])
+ words = self._get_column(grid, self._colmap['words'])
+ pos_tags = self._get_column(grid, self._colmap['pos'])
if tagset and tagset != self._tagset:
pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
- chunk_tags = self._get_column(grid, self._colmap["chunk"])
+ chunk_tags = self._get_column(grid, self._colmap['chunk'])
stack = [Tree(self._root_label, [])]
for (word, pos_tag, chunk_tag) in zip(words, pos_tags, chunk_tags):
- if chunk_tag == "O":
- state, chunk_type = "O", ""
+ if chunk_tag == 'O':
+ state, chunk_type = 'O', ''
else:
- (state, chunk_type) = chunk_tag.split("-")
+ (state, chunk_type) = chunk_tag.split('-')
# If it's a chunk we don't care about, treat it as O.
if chunk_types is not None and chunk_type not in chunk_types:
- state = "O"
+ state = 'O'
# Treat a mismatching I like a B.
- if state == "I" and chunk_type != stack[-1].label():
- state = "B"
+ if state == 'I' and chunk_type != stack[-1].label():
+ state = 'B'
# For B or I: close any open chunks
- if state in "BO" and len(stack) == 2:
+ if state in 'BO' and len(stack) == 2:
stack.pop()
# For B: start a new chunk.
- if state == "B":
+ if state == 'B':
new_chunk = Tree(chunk_type, [])
stack[-1].append(new_chunk)
stack.append(new_chunk)
return stack[0]
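        # e.g. rows ('he','PRP','B-NP'), ('saw','VBD','O'), ('the','DT','B-NP'),
        # ('dog','NN','I-NP') yield Tree('S', [Tree('NP', [('he', 'PRP')]),
        # ('saw', 'VBD'), Tree('NP', [('the', 'DT'), ('dog', 'NN')])]).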
def _get_parsed_sent(self, grid, pos_in_tree, tagset=None):
- words = self._get_column(grid, self._colmap["words"])
- pos_tags = self._get_column(grid, self._colmap["pos"])
+ words = self._get_column(grid, self._colmap['words'])
+ pos_tags = self._get_column(grid, self._colmap['pos'])
if tagset and tagset != self._tagset:
pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
- parse_tags = self._get_column(grid, self._colmap["tree"])
+ parse_tags = self._get_column(grid, self._colmap['tree'])
- treestr = ""
+ treestr = ''
for (word, pos_tag, parse_tag) in zip(words, pos_tags, parse_tags):
- if word == "(":
- word = "-LRB-"
- if word == ")":
- word = "-RRB-"
- if pos_tag == "(":
- pos_tag = "-LRB-"
- if pos_tag == ")":
- pos_tag = "-RRB-"
- (left, right) = parse_tag.split("*")
- right = right.count(")") * ")" # only keep ')'.
- treestr += "%s (%s %s) %s" % (left, pos_tag, word, right)
+ if word == '(': word = '-LRB-'
+ if word == ')': word = '-RRB-'
+ if pos_tag == '(': pos_tag = '-LRB-'
+ if pos_tag == ')': pos_tag = '-RRB-'
+ (left, right) = parse_tag.split('*')
+ right = right.count(')')*')' # only keep ')'.
+ treestr += '%s (%s %s) %s' % (left, pos_tag, word, right)
try:
tree = self._tree_class.fromstring(treestr)
except (ValueError, IndexError):
- tree = self._tree_class.fromstring("(%s %s)" % (self._root_label, treestr))
+ tree = self._tree_class.fromstring('(%s %s)' %
+ (self._root_label, treestr))
if not pos_in_tree:
for subtree in tree.subtrees():
for i, child in enumerate(subtree):
- if (
- isinstance(child, Tree)
- and len(child) == 1
- and isinstance(child[0], str)
- ):
+ if (isinstance(child, Tree) and len(child)==1 and
+ isinstance(child[0], string_types)):
subtree[i] = (child[0], child.label())
return tree
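        # e.g. words ('the','dog','barks'), tags ('DT','NN','VBZ') and a tree
        # column ('(S(NP*', '*)', '*)') are stitched into the string
        # '(S(NP (DT the) (NN dog) ) (VBZ barks) )' for tree_class.fromstring.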
        list of lists of ((start, end), tag) tuples
"""
if self._srl_includes_roleset:
- predicates = self._get_column(grid, self._colmap["srl"] + 1)
- start_col = self._colmap["srl"] + 2
+ predicates = self._get_column(grid, self._colmap['srl']+1)
+ start_col = self._colmap['srl']+2
else:
- predicates = self._get_column(grid, self._colmap["srl"])
- start_col = self._colmap["srl"] + 1
+ predicates = self._get_column(grid, self._colmap['srl'])
+ start_col = self._colmap['srl']+1
# Count how many predicates there are. This tells us how many
# columns to expect for SRL data.
- num_preds = len([p for p in predicates if p != "-"])
+ num_preds = len([p for p in predicates if p != '-'])
spanlists = []
for i in range(num_preds):
- col = self._get_column(grid, start_col + i)
+ col = self._get_column(grid, start_col+i)
spanlist = []
stack = []
for wordnum, srl_tag in enumerate(col):
- (left, right) = srl_tag.split("*")
- for tag in left.split("("):
+ (left, right) = srl_tag.split('*')
+ for tag in left.split('('):
if tag:
stack.append((tag, wordnum))
- for i in range(right.count(")")):
+ for i in range(right.count(')')):
(tag, start) = stack.pop()
- spanlist.append(((start, wordnum + 1), tag))
+ spanlist.append( ((start, wordnum+1), tag) )
spanlists.append(spanlist)
return spanlists
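        # e.g. an SRL column ('(A0*', '*)', '(V*)') yields the spanlist
        # [((0, 2), 'A0'), ((2, 3), 'V')].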
tree = self._get_parsed_sent(grid, pos_in_tree)
spanlists = self._get_srl_spans(grid)
if self._srl_includes_roleset:
- predicates = self._get_column(grid, self._colmap["srl"] + 1)
- rolesets = self._get_column(grid, self._colmap["srl"])
+ predicates = self._get_column(grid, self._colmap['srl']+1)
+ rolesets = self._get_column(grid, self._colmap['srl'])
else:
- predicates = self._get_column(grid, self._colmap["srl"])
+ predicates = self._get_column(grid, self._colmap['srl'])
rolesets = [None] * len(predicates)
instances = ConllSRLInstanceList(tree)
for wordnum, predicate in enumerate(predicates):
- if predicate == "-":
- continue
+ if predicate == '-': continue
# Decide which spanlist to use. Don't assume that they're
# sorted in the same order as the predicates (even though
# they usually are).
for spanlist in spanlists:
for (start, end), tag in spanlist:
- if wordnum in range(start, end) and tag in ("V", "C-V"):
+ if wordnum in range(start,end) and tag in ('V', 'C-V'):
break
- else:
- continue
+ else: continue
break
else:
- raise ValueError("No srl column found for %r" % predicate)
- instances.append(
- ConllSRLInstance(tree, wordnum, predicate, rolesets[wordnum], spanlist)
- )
+ raise ValueError('No srl column found for %r' % predicate)
+ instances.append(ConllSRLInstance(tree, wordnum, predicate,
+ rolesets[wordnum], spanlist))
return instances
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# Helper Methods
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
def _require(self, *columntypes):
for columntype in columntypes:
if columntype not in self._colmap:
- raise ValueError(
- "This corpus does not contain a %s " "column." % columntype
- )
+ raise ValueError('This corpus does not contain a %s '
+ 'column.' % columntype)
@staticmethod
def _get_column(grid, column_index):
return [grid[i][column_index] for i in range(len(grid))]
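        # e.g. _get_column([['He', 'PRP'], ['ran', 'VBD']], 1) -> ['PRP', 'VBD']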
+@compat.python_2_unicode_compatible
class ConllSRLInstance(object):
"""
    An SRL instance from a CoNLL corpus, which identifies and
    provides labels for the arguments of a single verb.
"""
-
# [xx] add inst.core_arguments, inst.argm_arguments?
def __init__(self, tree, verb_head, verb_stem, roleset, tagged_spans):
# Fill in the self.verb and self.arguments values.
for (start, end), tag in tagged_spans:
- if tag in ("V", "C-V"):
+ if tag in ('V', 'C-V'):
self.verb += list(range(start, end))
else:
- self.arguments.append(((start, end), tag))
+ self.arguments.append( ((start, end), tag) )
def __repr__(self):
-        # Originally, it's:
- ##plural = 's' if len(self.arguments) != 1 else ''
- plural = "s" if len(self.arguments) != 1 else ""
- return "<ConllSRLInstance for %r with %d argument%s>" % (
- (self.verb_stem, len(self.arguments), plural)
- )
+ plural = len(self.arguments)!=1 and 's' or ''
+ return '<ConllSRLInstance for %r with %d argument%s>' % (
+ (self.verb_stem, len(self.arguments), plural))
def pprint(self):
- verbstr = " ".join(self.words[i][0] for i in self.verb)
- hdr = "SRL for %r (stem=%r):\n" % (verbstr, self.verb_stem)
- s = ""
+ verbstr = ' '.join(self.words[i][0] for i in self.verb)
+ hdr = 'SRL for %r (stem=%r):\n' % (verbstr, self.verb_stem)
+ s = ''
for i, word in enumerate(self.words):
- if isinstance(word, tuple):
- word = word[0]
+ if isinstance(word, tuple): word = word[0]
for (start, end), argid in self.arguments:
- if i == start:
- s += "[%s " % argid
- if i == end:
- s += "] "
- if i in self.verb:
- word = "<<%s>>" % word
- s += word + " "
- return hdr + textwrap.fill(
- s.replace(" ]", "]"), initial_indent=" ", subsequent_indent=" "
- )
-
-
+ if i == start: s += '[%s ' % argid
+ if i == end: s += '] '
+ if i in self.verb: word = '<<%s>>' % word
+ s += word + ' '
+ return hdr + textwrap.fill(s.replace(' ]', ']'),
+ initial_indent=' ',
+ subsequent_indent=' ')
+
+@compat.python_2_unicode_compatible
class ConllSRLInstanceList(list):
"""
Set of instances for a single sentence
"""
-
def __init__(self, tree, instances=()):
self.tree = tree
list.__init__(self, instances)
# Sanity check: trees should be the same
for inst in self:
if inst.tree != self.tree:
- raise ValueError("Tree mismatch!")
+ raise ValueError('Tree mismatch!')
# If desired, add trees:
if include_tree:
words = self.tree.leaves()
pos = [None] * len(words)
- synt = ["*"] * len(words)
+ synt = ['*'] * len(words)
self._tree2conll(self.tree, 0, words, pos, synt)
- s = ""
+ s = ''
for i in range(len(words)):
# optional tree columns
if include_tree:
- s += "%-20s " % words[i]
- s += "%-8s " % pos[i]
- s += "%15s*%-8s " % tuple(synt[i].split("*"))
+ s += '%-20s ' % words[i]
+ s += '%-8s ' % pos[i]
+ s += '%15s*%-8s ' % tuple(synt[i].split('*'))
# verb head column
for inst in self:
if i == inst.verb_head:
- s += "%-20s " % inst.verb_stem
+ s += '%-20s ' % inst.verb_stem
break
else:
- s += "%-20s " % "-"
+ s += '%-20s ' % '-'
# Remaining columns: self
for inst in self:
- argstr = "*"
+ argstr = '*'
for (start, end), argid in inst.tagged_spans:
- if i == start:
- argstr = "(%s%s" % (argid, argstr)
- if i == (end - 1):
- argstr += ")"
- s += "%-12s " % argstr
- s += "\n"
+ if i==start: argstr = '(%s%s' % (argid, argstr)
+ if i==(end-1): argstr += ')'
+ s += '%-12s ' % argstr
+ s += '\n'
return s
def _tree2conll(self, tree, wordnum, words, pos, synt):
assert isinstance(tree, Tree)
- if len(tree) == 1 and isinstance(tree[0], str):
+ if len(tree) == 1 and isinstance(tree[0], string_types):
pos[wordnum] = tree.label()
assert words[wordnum] == tree[0]
- return wordnum + 1
+ return wordnum+1
elif len(tree) == 1 and isinstance(tree[0], tuple):
assert len(tree[0]) == 2
            words[wordnum], pos[wordnum] = tree[0]
- return wordnum + 1
+ return wordnum+1
else:
- synt[wordnum] = "(%s%s" % (tree.label(), synt[wordnum])
+ synt[wordnum] = '(%s%s' % (tree.label(), synt[wordnum])
for child in tree:
- wordnum = self._tree2conll(child, wordnum, words, pos, synt)
- synt[wordnum - 1] += ")"
+ wordnum = self._tree2conll(child, wordnum, words,
+ pos, synt)
+ synt[wordnum-1] += ')'
return wordnum
-
class ConllChunkCorpusReader(ConllCorpusReader):
"""
A ConllCorpusReader whose data file contains three columns: words,
pos, and chunk.
"""
-
- def __init__(
- self, root, fileids, chunk_types, encoding="utf8", tagset=None, separator=None
- ):
+ def __init__(self, root, fileids, chunk_types, encoding='utf8',
+ tagset=None):
ConllCorpusReader.__init__(
- self,
- root,
- fileids,
- ("words", "pos", "chunk"),
- chunk_types=chunk_types,
- encoding=encoding,
- tagset=tagset,
- separator=separator,
- )
+ self, root, fileids, ('words', 'pos', 'chunk'),
+ chunk_types=chunk_types, encoding=encoding,
+ tagset=tagset)
# -*- coding: utf-8 -*-
# Natural Language Toolkit: An Crubadan N-grams Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Avital Pekker <avital.pekker@utoronto.ca>
#
# URL: <http://nltk.org/>
http://borel.slu.edu/crubadan/index.html
"""
+from __future__ import print_function, unicode_literals
+
import re
+from nltk.compat import PY3
from os import path
-
from nltk.corpus.reader import CorpusReader
from nltk.probability import FreqDist
from nltk.data import ZipFilePathPointer
-
class CrubadanCorpusReader(CorpusReader):
"""
    A corpus reader used to access An Crubadan language n-gram files.
"""
-
- _LANG_MAPPER_FILE = "table.txt"
+
+ _LANG_MAPPER_FILE = 'table.txt'
_all_lang_freq = {}
-
- def __init__(self, root, fileids, encoding="utf8", tagset=None):
- super(CrubadanCorpusReader, self).__init__(root, fileids, encoding="utf8")
+
+ def __init__(self, root, fileids, encoding='utf8', tagset=None):
+ super(CrubadanCorpusReader, self).__init__(root, fileids, encoding='utf8')
self._lang_mapping_data = []
self._load_lang_mapping_data()
-
+
def lang_freq(self, lang):
- """ Return n-gram FreqDist for a specific language
- given ISO 639-3 language code """
-
+ ''' Return n-gram FreqDist for a specific language
+ given ISO 639-3 language code '''
+
if lang not in self._all_lang_freq:
self._all_lang_freq[lang] = self._load_lang_ngrams(lang)
return self._all_lang_freq[lang]
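        # Usage sketch (assumes the 'crubadan' corpus is installed and the
        # code is listed in table.txt):
        # >>> from nltk.corpus import crubadan
        # >>> crubadan.lang_freq('en').most_common(3)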
-
+
def langs(self):
- """ Return a list of supported languages as ISO 639-3 codes """
+ ''' Return a list of supported languages as ISO 639-3 codes '''
return [row[1] for row in self._lang_mapping_data]
-
+
def iso_to_crubadan(self, lang):
- """ Return internal Crubadan code based on ISO 639-3 code """
+ ''' Return internal Crubadan code based on ISO 639-3 code '''
for i in self._lang_mapping_data:
if i[1].lower() == lang.lower():
return i[0]
-
+
def crubadan_to_iso(self, lang):
- """ Return ISO 639-3 code given internal Crubadan code """
+ ''' Return ISO 639-3 code given internal Crubadan code '''
for i in self._lang_mapping_data:
if i[0].lower() == lang.lower():
return i[1]
-
+
def _load_lang_mapping_data(self):
- """ Load language mappings between codes and description from table.txt """
+ ''' Load language mappings between codes and description from table.txt '''
if isinstance(self.root, ZipFilePathPointer):
- raise RuntimeError(
- "Please install the 'crubadan' corpus first, use nltk.download()"
- )
-
+ raise RuntimeError("Please install the 'crubadan' corpus first, use nltk.download()")
+
mapper_file = path.join(self.root, self._LANG_MAPPER_FILE)
if self._LANG_MAPPER_FILE not in self.fileids():
raise RuntimeError("Could not find language mapper file: " + mapper_file)
- raw = open(mapper_file, "r", encoding="utf-8").read().strip()
-
- self._lang_mapping_data = [row.split("\t") for row in raw.split("\n")]
+ if PY3:
+ raw = open(mapper_file, 'r', encoding='utf-8').read().strip()
+ else:
+ raw = open(mapper_file, 'rU').read().decode('utf-8').strip()
+ self._lang_mapping_data = [row.split('\t') for row in raw.split('\n')]
+
def _load_lang_ngrams(self, lang):
- """ Load single n-gram language file given the ISO 639-3 language code
- and return its FreqDist """
+ ''' Load single n-gram language file given the ISO 639-3 language code
+ and return its FreqDist '''
if lang not in self.langs():
raise RuntimeError("Unsupported language.")
crubadan_code = self.iso_to_crubadan(lang)
- ngram_file = path.join(self.root, crubadan_code + "-3grams.txt")
+ ngram_file = path.join(self.root, crubadan_code + '-3grams.txt')
if not path.isfile(ngram_file):
raise RuntimeError("No N-gram file found for requested language.")
counts = FreqDist()
- f = open(ngram_file, "r", encoding="utf-8")
+ if PY3:
+ f = open(ngram_file, 'r', encoding='utf-8')
+ else:
+ f = open(ngram_file, 'rU')
for line in f:
- data = line.split(" ")
+ if PY3:
+ data = line.split(' ')
+ else:
+ data = line.decode('utf8').split(' ')
- ngram = data[1].strip("\n")
+ ngram = data[1].strip('\n')
freq = int(data[0])
-
+
counts[ngram] = freq
-
+
return counts
+
# Natural Language Toolkit: Dependency Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Kepa Sarasola <kepa.sarasola@ehu.es>
# Iker Manterola <returntothehangar@hotmail.com>
#
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
-
class DependencyCorpusReader(SyntaxCorpusReader):
- def __init__(
- self,
- root,
- fileids,
- encoding="utf8",
- word_tokenizer=TabTokenizer(),
- sent_tokenizer=RegexpTokenizer("\n", gaps=True),
- para_block_reader=read_blankline_block,
- ):
-    # FIXME: Why is it inheriting from SyntaxCorpusReader but initializing
- # from CorpusReader?
+
+ def __init__(self, root, fileids, encoding='utf8',
+ word_tokenizer=TabTokenizer(),
+ sent_tokenizer=RegexpTokenizer('\n', gaps=True),
+ para_block_reader=read_blankline_block):
+
CorpusReader.__init__(self, root, fileids, encoding)
#########################################################
return concat(result)
def words(self, fileids=None):
- return concat(
- [
- DependencyCorpusView(fileid, False, False, False, encoding=enc)
- for fileid, enc in self.abspaths(fileids, include_encoding=True)
- ]
- )
+ return concat([DependencyCorpusView(fileid, False, False, False, encoding=enc)
+ for fileid, enc in self.abspaths(fileids, include_encoding=True)])
def tagged_words(self, fileids=None):
- return concat(
- [
- DependencyCorpusView(fileid, True, False, False, encoding=enc)
- for fileid, enc in self.abspaths(fileids, include_encoding=True)
- ]
- )
+ return concat([DependencyCorpusView(fileid, True, False, False, encoding=enc)
+ for fileid, enc in self.abspaths(fileids, include_encoding=True)])
def sents(self, fileids=None):
- return concat(
- [
- DependencyCorpusView(fileid, False, True, False, encoding=enc)
- for fileid, enc in self.abspaths(fileids, include_encoding=True)
- ]
- )
+ return concat([DependencyCorpusView(fileid, False, True, False, encoding=enc)
+ for fileid, enc in self.abspaths(fileids, include_encoding=True)])
def tagged_sents(self, fileids=None):
- return concat(
- [
- DependencyCorpusView(fileid, True, True, False, encoding=enc)
- for fileid, enc in self.abspaths(fileids, include_encoding=True)
- ]
- )
+ return concat([DependencyCorpusView(fileid, True, True, False, encoding=enc)
+ for fileid, enc in self.abspaths(fileids, include_encoding=True)])
def parsed_sents(self, fileids=None):
- sents = concat(
- [
- DependencyCorpusView(fileid, False, True, True, encoding=enc)
- for fileid, enc in self.abspaths(fileids, include_encoding=True)
- ]
- )
+ sents=concat([DependencyCorpusView(fileid, False, True, True, encoding=enc)
+ for fileid, enc in self.abspaths(fileids, include_encoding=True)])
return [DependencyGraph(sent) for sent in sents]
class DependencyCorpusView(StreamBackedCorpusView):
-    _DOCSTART = "-DOCSTART- -DOCSTART- O\n"  # marks the start of a document
-
- def __init__(
- self,
- corpus_file,
- tagged,
- group_by_sent,
- dependencies,
- chunk_types=None,
- encoding="utf8",
- ):
+    _DOCSTART = '-DOCSTART- -DOCSTART- O\n' #marks the start of a document
+
+ def __init__(self, corpus_file, tagged, group_by_sent, dependencies,
+ chunk_types=None, encoding='utf8'):
self._tagged = tagged
self._dependencies = dependencies
self._group_by_sent = group_by_sent
sent = read_blankline_block(stream)[0].strip()
# Strip off the docstart marker, if present.
if sent.startswith(self._DOCSTART):
- sent = sent[len(self._DOCSTART) :].lstrip()
+ sent = sent[len(self._DOCSTART):].lstrip()
# extract word and tag from any of the formats
if not self._dependencies:
- lines = [line.split("\t") for line in sent.split("\n")]
+ lines = [line.split('\t') for line in sent.split('\n')]
if len(lines[0]) == 3 or len(lines[0]) == 4:
sent = [(line[0], line[1]) for line in lines]
elif len(lines[0]) == 10:
sent = [(line[1], line[4]) for line in lines]
else:
- raise ValueError("Unexpected number of fields in dependency tree file")
+ raise ValueError('Unexpected number of fields in dependency tree file')
# discard tags if they weren't requested
if not self._tagged:
# Natural Language Toolkit: Framenet Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Authors: Chuck Wooters <wooters@icsi.berkeley.edu>,
# Nathan Schneider <nathan.schneider@georgetown.edu>
# URL: <http://nltk.org/>
"""
Corpus reader for the FrameNet 1.7 lexicon and corpus.
"""
+from __future__ import print_function, unicode_literals
-import os
+import os, sys
import re
import textwrap
import itertools
-import sys
import types
-from collections import defaultdict, OrderedDict
-from operator import itemgetter
-from itertools import zip_longest
-from pprint import pprint
+from six import string_types, text_type
+from six.moves import zip_longest
+from collections import defaultdict, OrderedDict
+from pprint import pprint, pformat
+from nltk.internals import ElementWrapper
from nltk.corpus.reader import XMLCorpusReader, XMLCorpusView
+from nltk.compat import python_2_unicode_compatible
+from nltk.util import AbstractLazySequence, LazyConcatenation, LazyMap, LazyIteratorList
-from nltk.util import LazyConcatenation, LazyMap, LazyIteratorList
-
-__docformat__ = "epytext en"
-
+__docformat__ = 'epytext en'
def mimic_wrap(lines, wrap_at=65, **kwargs):
"""
Wrap the first of 'lines' with textwrap and the remaining lines at exactly the same
positions as the first.
"""
- l0 = textwrap.fill(lines[0], wrap_at, drop_whitespace=False).split("\n")
+ l0 = textwrap.fill(lines[0], wrap_at, drop_whitespace=False).split('\n')
yield l0
def _(line):
il0 = 0
- while line and il0 < len(l0) - 1:
- yield line[: len(l0[il0])]
- line = line[len(l0[il0]) :]
+ while line and il0<len(l0)-1:
+ yield line[:len(l0[il0])]
+ line = line[len(l0[il0]):]
il0 += 1
- if line: # Remaining stuff on this line past the end of the mimicked line.
+ if line: # Remaining stuff on this line past the end of the mimicked line.
# So just textwrap this line.
- for ln in textwrap.fill(line, wrap_at, drop_whitespace=False).split("\n"):
+ for ln in textwrap.fill(line, wrap_at, drop_whitespace=False).split('\n'):
yield ln
for l in lines[1:]:
yield list(_(l))
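# Illustrative sketch (not part of the patch): mimic_wrap() yields the
# textwrap'd pieces of the first line, then slices every later line at the
# same column widths so annotation rows stay aligned under the text; any
# overflow past the mimicked width is re-wrapped.
#
# >>> text = "The quick brown fox jumps over the lazy dog"
# >>> bars = "--- ----- ----- --- ----- ---- --- ---- ---"
# >>> for row in mimic_wrap([text, bars], wrap_at=20):
# ...     print(row)  # each row is a list of column-aligned segments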
-
-def _pretty_longstring(defstr, prefix="", wrap_at=65):
+def _pretty_longstring(defstr, prefix='', wrap_at=65):
"""
Helper function for pretty-printing a long string.
"""
outstr = ""
- for line in textwrap.fill(defstr, wrap_at).split("\n"):
- outstr += prefix + line + "\n"
+ for line in textwrap.fill(defstr, wrap_at).split('\n'):
+ outstr += prefix + line + '\n'
return outstr
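# Illustrative sketch (not part of the patch): re-wrap a long definition and
# indent every resulting line with the given prefix.
#
# >>> print(_pretty_longstring("A very long FrameNet definition ... " * 5,
# ...                          prefix='  ', wrap_at=40))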
-
def _pretty_any(obj):
"""
outstr = ""
for k in obj:
- if isinstance(obj[k], str) and len(obj[k]) > 65:
+ if isinstance(obj[k], string_types) and len(obj[k]) > 65:
outstr += "[{0}]\n".format(k)
- outstr += "{0}".format(_pretty_longstring(obj[k], prefix=" "))
- outstr += "\n"
+ outstr += "{0}".format(_pretty_longstring(obj[k], prefix=' '))
+ outstr += '\n'
else:
outstr += "[{0}] {1}\n".format(k, obj[k])
return outstr
-
def _pretty_semtype(st):
"""
"""
semkeys = st.keys()
- if len(semkeys) == 1:
- return "<None>"
+ if len(semkeys) == 1: return "<None>"
outstr = ""
outstr += "semantic type ({0.ID}): {0.name}\n".format(st)
- if "abbrev" in semkeys:
+ if 'abbrev' in semkeys:
outstr += "[abbrev] {0}\n".format(st.abbrev)
- if "definition" in semkeys:
+ if 'definition' in semkeys:
outstr += "[definition]\n"
- outstr += _pretty_longstring(st.definition, " ")
+ outstr += _pretty_longstring(st.definition,' ')
outstr += "[rootType] {0}({1})\n".format(st.rootType.name, st.rootType.ID)
if st.superType is None:
outstr += "[superType] <None>\n"
else:
outstr += "[superType] {0}({1})\n".format(st.superType.name, st.superType.ID)
outstr += "[subTypes] {0} subtypes\n".format(len(st.subTypes))
- outstr += (
- " "
- + ", ".join("{0}({1})".format(x.name, x.ID) for x in st.subTypes)
- + "\n" * (len(st.subTypes) > 0)
- )
+ outstr += " " + ", ".join('{0}({1})'.format(x.name, x.ID) for x in st.subTypes) + '\n'*(len(st.subTypes)>0)
return outstr
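# Illustrative sketch (not part of the patch): _pretty_semtype() is what
# renders a semantic type lookup; the ID below is hypothetical.
#
# >>> from nltk.corpus import framenet as fn
# >>> print(fn.semtype(5))  # shows ID, name, [rootType], [superType], [subTypes]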
-
def _pretty_frame_relation_type(freltyp):
"""
:return: A nicely formatted string representation of the frame relation type.
:rtype: str
"""
- outstr = "<frame relation type ({0.ID}): {0.superFrameName} -- {0.name} -> {0.subFrameName}>".format(
- freltyp
- )
+ outstr = "<frame relation type ({0.ID}): {0.superFrameName} -- {0.name} -> {0.subFrameName}>".format(freltyp)
return outstr
-
def _pretty_frame_relation(frel):
"""
:return: A nicely formatted string representation of the frame relation.
:rtype: str
"""
- outstr = "<{0.type.superFrameName}={0.superFrameName} -- {0.type.name} -> {0.type.subFrameName}={0.subFrameName}>".format(
- frel
- )
+ outstr = "<{0.type.superFrameName}={0.superFrameName} -- {0.type.name} -> {0.type.subFrameName}={0.subFrameName}>".format(frel)
return outstr
-
def _pretty_fe_relation(ferel):
"""
:return: A nicely formatted string representation of the FE relation.
:rtype: str
"""
- outstr = "<{0.type.superFrameName}={0.frameRelation.superFrameName}.{0.superFEName} -- {0.type.name} -> {0.type.subFrameName}={0.frameRelation.subFrameName}.{0.subFEName}>".format(
- ferel
- )
+ outstr = "<{0.type.superFrameName}={0.frameRelation.superFrameName}.{0.superFEName} -- {0.type.name} -> {0.type.subFrameName}={0.frameRelation.subFrameName}.{0.subFEName}>".format(ferel)
return outstr
-
def _pretty_lu(lu):
"""
lukeys = lu.keys()
outstr = ""
outstr += "lexical unit ({0.ID}): {0.name}\n\n".format(lu)
- if "definition" in lukeys:
+ if 'definition' in lukeys:
outstr += "[definition]\n"
- outstr += _pretty_longstring(lu.definition, " ")
- if "frame" in lukeys:
- outstr += "\n[frame] {0}({1})\n".format(lu.frame.name, lu.frame.ID)
- if "incorporatedFE" in lukeys:
+ outstr += _pretty_longstring(lu.definition,' ')
+ if 'frame' in lukeys:
+ outstr += "\n[frame] {0}({1})\n".format(lu.frame.name,lu.frame.ID)
+ if 'incorporatedFE' in lukeys:
outstr += "\n[incorporatedFE] {0}\n".format(lu.incorporatedFE)
- if "POS" in lukeys:
+ if 'POS' in lukeys:
outstr += "\n[POS] {0}\n".format(lu.POS)
- if "status" in lukeys:
+ if 'status' in lukeys:
outstr += "\n[status] {0}\n".format(lu.status)
- if "totalAnnotated" in lukeys:
- outstr += "\n[totalAnnotated] {0} annotated examples\n".format(
- lu.totalAnnotated
- )
- if "lexemes" in lukeys:
- outstr += "\n[lexemes] {0}\n".format(
- " ".join("{0}/{1}".format(lex.name, lex.POS) for lex in lu.lexemes)
- )
- if "semTypes" in lukeys:
+ if 'totalAnnotated' in lukeys:
+ outstr += "\n[totalAnnotated] {0} annotated examples\n".format(lu.totalAnnotated)
+ if 'lexemes' in lukeys:
+ outstr += "\n[lexemes] {0}\n".format(' '.join('{0}/{1}'.format(lex.name,lex.POS) for lex in lu.lexemes))
+ if 'semTypes' in lukeys:
outstr += "\n[semTypes] {0} semantic types\n".format(len(lu.semTypes))
- outstr += (
- " " * (len(lu.semTypes) > 0)
- + ", ".join("{0}({1})".format(x.name, x.ID) for x in lu.semTypes)
- + "\n" * (len(lu.semTypes) > 0)
- )
- if "URL" in lukeys:
+ outstr += " "*(len(lu.semTypes)>0) + ", ".join('{0}({1})'.format(x.name, x.ID) for x in lu.semTypes) + '\n'*(len(lu.semTypes)>0)
+ if 'URL' in lukeys:
outstr += "\n[URL] {0}\n".format(lu.URL)
- if "subCorpus" in lukeys:
+ if 'subCorpus' in lukeys:
subc = [x.name for x in lu.subCorpus]
outstr += "\n[subCorpus] {0} subcorpora\n".format(len(lu.subCorpus))
- for line in textwrap.fill(", ".join(sorted(subc)), 60).split("\n"):
+ for line in textwrap.fill(", ".join(sorted(subc)), 60).split('\n'):
outstr += " {0}\n".format(line)
- if "exemplars" in lukeys:
- outstr += "\n[exemplars] {0} sentences across all subcorpora\n".format(
- len(lu.exemplars)
- )
+ if 'exemplars' in lukeys:
+ outstr += "\n[exemplars] {0} sentences across all subcorpora\n".format(len(lu.exemplars))
return outstr
-
def _pretty_exemplars(exemplars, lu):
"""
Helper function for pretty-printing a list of exemplar sentences for a lexical unit.
outstr = ""
outstr += "exemplar sentences for {0.name} in {0.frame.name}:\n\n".format(lu)
- for i, sent in enumerate(exemplars):
+ for i,sent in enumerate(exemplars):
outstr += "[{0}] {1}\n".format(i, sent.text)
outstr += "\n"
return outstr
-
def _pretty_fulltext_sentences(sents):
"""
Helper function for pretty-printing a list of annotated sentences for a full-text document.
outstr = ""
outstr += "full-text document ({0.ID}) {0.name}:\n\n".format(sents)
- outstr += "[corpid] {0.corpid}\n[corpname] {0.corpname}\n[description] {0.description}\n[URL] {0.URL}\n\n".format(
- sents
- )
+ outstr += "[corpid] {0.corpid}\n[corpname] {0.corpname}\n[description] {0.description}\n[URL] {0.URL}\n\n".format(sents)
outstr += "[sentence]\n".format(sents)
- for i, sent in enumerate(sents.sentence):
+ for i,sent in enumerate(sents.sentence):
outstr += "[{0}] {1}\n".format(i, sent.text)
outstr += "\n"
return outstr
-
def _pretty_fulltext_sentence(sent):
"""
Helper function for pretty-printing an annotated sentence from a full-text document.
"""
outstr = ""
- outstr += "full-text sentence ({0.ID}) in {1}:\n\n".format(
- sent, sent.doc.get("name", sent.doc.description)
- )
+ outstr += "full-text sentence ({0.ID}) in {1}:\n\n".format(sent, sent.doc.get('name',sent.doc.description))
outstr += "\n[POS] {0} tags\n".format(len(sent.POS))
outstr += "\n[POS_tagset] {0}\n\n".format(sent.POS_tagset)
outstr += "[text] + [annotationSet]\n\n"
- outstr += sent._ascii() # -> _annotation_ascii()
+ outstr += sent._ascii() # -> _annotation_ascii()
outstr += "\n"
return outstr
-
def _pretty_pos(aset):
"""
Helper function for pretty-printing a sentence with its POS tags.
"""
outstr = ""
- outstr += "POS annotation set ({0.ID}) {0.POS_tagset} in sentence {0.sent.ID}:\n\n".format(
- aset
- )
+ outstr += "POS annotation set ({0.ID}) {0.POS_tagset} in sentence {0.sent.ID}:\n\n".format(aset)
# list the target spans and their associated aset index
overt = sorted(aset.POS)
sent = aset.sent
s0 = sent.text
- s1 = ""
- s2 = ""
+ s1 = ''
+ s2 = ''
i = 0
adjust = 0
- for j, k, lbl in overt:
- assert j >= i, ("Overlapping targets?", (j, k, lbl))
- s1 += " " * (j - i) + "-" * (k - j)
- if len(lbl) > (k - j):
+ for j,k,lbl in overt:
+ assert j>=i,('Overlapping targets?',(j,k,lbl))
+ s1 += ' '*(j-i) + '-'*(k-j)
+ if len(lbl)>(k-j):
# add space in the sentence to make room for the annotation index
- amt = len(lbl) - (k - j)
- s0 = (
- s0[: k + adjust] + "~" * amt + s0[k + adjust :]
- ) # '~' to prevent line wrapping
- s1 = s1[: k + adjust] + " " * amt + s1[k + adjust :]
+ amt = len(lbl)-(k-j)
+ s0 = s0[:k+adjust]+ '~'*amt + s0[k+adjust:] # '~' to prevent line wrapping
+ s1 = s1[:k+adjust]+ ' '*amt + s1[k+adjust:]
adjust += amt
- s2 += " " * (j - i) + lbl.ljust(k - j)
+ s2 += ' '*(j-i) + lbl.ljust(k-j)
i = k
long_lines = [s0, s1, s2]
- outstr += "\n\n".join(
- map("\n".join, zip_longest(*mimic_wrap(long_lines), fillvalue=" "))
- ).replace("~", " ")
+ outstr += '\n\n'.join(map('\n'.join, zip_longest(*mimic_wrap(long_lines), fillvalue=' '))).replace('~',' ')
outstr += "\n"
return outstr
-
def _pretty_annotation(sent, aset_level=False):
"""
Helper function for pretty-printing an exemplar sentence for a lexical unit.
sentkeys = sent.keys()
outstr = "annotation set" if aset_level else "exemplar sentence"
outstr += " ({0.ID}):\n".format(sent)
- if aset_level: # TODO: any UNANN exemplars?
+ if aset_level: # TODO: any UNANN exemplars?
outstr += "\n[status] {0}\n".format(sent.status)
- for k in ("corpID", "docID", "paragNo", "sentNo", "aPos"):
+ for k in ('corpID', 'docID', 'paragNo', 'sentNo', 'aPos'):
if k in sentkeys:
outstr += "[{0}] {1}\n".format(k, sent[k])
- outstr += (
- "\n[LU] ({0.ID}) {0.name} in {0.frame.name}\n".format(sent.LU)
- if sent.LU
- else "\n[LU] Not found!"
- )
- outstr += "\n[frame] ({0.ID}) {0.name}\n".format(
- sent.frame
- ) # redundant with above, but .frame is convenient
+ outstr += "\n[LU] ({0.ID}) {0.name} in {0.frame.name}\n".format(sent.LU) if sent.LU else '\n[LU] Not found!'
+ outstr += "\n[frame] ({0.ID}) {0.name}\n".format(sent.frame) # redundant with above, but .frame is convenient
if not aset_level:
- outstr += "\n[annotationSet] {0} annotation sets\n".format(
- len(sent.annotationSet)
- )
+ outstr += "\n[annotationSet] {0} annotation sets\n".format(len(sent.annotationSet))
outstr += "\n[POS] {0} tags\n".format(len(sent.POS))
outstr += "\n[POS_tagset] {0}\n".format(sent.POS_tagset)
- outstr += "\n[GF] {0} relation{1}\n".format(
- len(sent.GF), "s" if len(sent.GF) != 1 else ""
- )
- outstr += "\n[PT] {0} phrase{1}\n".format(
- len(sent.PT), "s" if len(sent.PT) != 1 else ""
- )
+ outstr += "\n[GF] {0} relation{1}\n".format(len(sent.GF), "s" if len(sent.GF)!=1 else "")
+ outstr += "\n[PT] {0} phrase{1}\n".format(len(sent.PT), "s" if len(sent.PT)!=1 else "")
"""
Special Layers
--------------
Gov (governor), X. Gov and X always co-occur.
>>> from nltk.corpus import framenet as fn
- >>> def f(luRE, lyr, ignore=set()):
- ... for i,ex in enumerate(fn.exemplars(luRE)):
- ... if lyr in ex and ex[lyr] and set(zip(*ex[lyr])[2]) - ignore:
- ... print(i,ex[lyr])
+>>> def f(luRE, lyr, ignore=set()):
+... for i,ex in enumerate(fn.exemplars(luRE)):
+... if lyr in ex and ex[lyr] and set(zip(*ex[lyr])[2]) - ignore:
+... print(i,ex[lyr])
- Verb: Asp, Non-Asp
- Noun: Cop, Supp, Ctrlr, Gov, X
- Scon: (none)
- Art: (none)
"""
- for lyr in ("NER", "WSL", "Other", "Sent"):
+ for lyr in ('NER', 'WSL', 'Other', 'Sent'):
if lyr in sent and sent[lyr]:
- outstr += "\n[{0}] {1} entr{2}\n".format(
- lyr, len(sent[lyr]), "ies" if len(sent[lyr]) != 1 else "y"
- )
+ outstr += "\n[{0}] {1} entr{2}\n".format(lyr, len(sent[lyr]), "ies" if len(sent[lyr])!=1 else "y")
outstr += "\n[text] + [Target] + [FE]"
# POS-specific layers: syntactically important words that are neither the target
# nor the FEs. Include these along with the first FE layer but with '^' underlining.
- for lyr in ("Verb", "Noun", "Adj", "Adv", "Prep", "Scon", "Art"):
+ for lyr in ('Verb', 'Noun', 'Adj', 'Adv', 'Prep', 'Scon', 'Art'):
if lyr in sent and sent[lyr]:
outstr += " + [{0}]".format(lyr)
- if "FE2" in sentkeys:
+ if 'FE2' in sentkeys:
outstr += " + [FE2]"
- if "FE3" in sentkeys:
+ if 'FE3' in sentkeys:
outstr += " + [FE3]"
outstr += "\n\n"
- outstr += sent._ascii() # -> _annotation_ascii()
+ outstr += sent._ascii() # -> _annotation_ascii()
outstr += "\n"
return outstr
-
def _annotation_ascii(sent):
- """
+ '''
Given a sentence or FE annotation set, construct the width-limited string showing
an ASCII visualization of the sentence's annotations, calling either
_annotation_ascii_frames() or _annotation_ascii_FEs() as appropriate.
This will be attached as a method to appropriate AttrDict instances
and called in the full pretty-printing of the instance.
- """
- if sent._type == "fulltext_sentence" or (
- "annotationSet" in sent and len(sent.annotationSet) > 2
- ):
+ '''
+ if sent._type=='fulltext_sentence' or ('annotationSet' in sent and len(sent.annotationSet)>2):
# a full-text sentence OR sentence with multiple targets.
# (multiple targets = >2 annotation sets, because the first annotation set is POS.)
return _annotation_ascii_frames(sent)
- else: # an FE annotation set, or an LU sentence with 1 target
+ else: # an FE annotation set, or an LU sentence with 1 target
return _annotation_ascii_FEs(sent)
-
def _annotation_ascii_frames(sent):
- """
+ '''
ASCII string rendering of the sentence along with its targets and frame names.
Called for all full-text sentences, as well as the few LU sentences with multiple
targets (e.g., fn.lu(6412).exemplars[82] has two want.v targets).
Line-wrapped to limit the display width.
- """
+ '''
# list the target spans and their associated aset index
overt = []
- for a, aset in enumerate(sent.annotationSet[1:]):
- for j, k in aset.Target:
- indexS = "[{0}]".format(a + 1)
- if aset.status == "UNANN" or aset.LU.status == "Problem":
+ for a,aset in enumerate(sent.annotationSet[1:]):
+ for j,k in aset.Target:
+ indexS = "[{0}]".format(a+1)
+ if aset.status=='UNANN' or aset.LU.status=='Problem':
indexS += " "
- if aset.status == "UNANN":
- indexS += (
- "!"
- ) # warning indicator that there is a frame annotation but no FE annotation
- if aset.LU.status == "Problem":
- indexS += (
- "?"
- ) # warning indicator that there is a missing LU definition (because the LU has Problem status)
- overt.append((j, k, aset.LU.frame.name, indexS))
+ if aset.status=='UNANN':
+ indexS += "!" # warning indicator that there is a frame annotation but no FE annotation
+ if aset.LU.status=='Problem':
+ indexS += "?" # warning indicator that there is a missing LU definition (because the LU has Problem status)
+ overt.append((j,k,aset.LU.frame.name,indexS))
overt = sorted(overt)
duplicates = set()
- for o, (j, k, fname, asetIndex) in enumerate(overt):
- if o > 0 and j <= overt[o - 1][1]:
+ for o,(j,k,fname,asetIndex) in enumerate(overt):
+ if o>0 and j<=overt[o-1][1]:
# multiple annotation sets on the same target
# (e.g. due to a coordination construction or multiple annotators)
- if (
- overt[o - 1][:2] == (j, k) and overt[o - 1][2] == fname
- ): # same target, same frame
+ if overt[o-1][:2]==(j,k) and overt[o-1][2]==fname: # same target, same frame
# splice indices together
- combinedIndex = (
- overt[o - 1][3] + asetIndex
- ) # e.g., '[1][2]', '[1]! [2]'
- combinedIndex = combinedIndex.replace(" !", "! ").replace(" ?", "? ")
- overt[o - 1] = overt[o - 1][:3] + (combinedIndex,)
+ combinedIndex = overt[o-1][3] + asetIndex # e.g., '[1][2]', '[1]! [2]'
+ combinedIndex = combinedIndex.replace(' !', '! ').replace(' ?', '? ')
+ overt[o-1] = overt[o-1][:3]+(combinedIndex,)
duplicates.add(o)
- else: # different frames, same or overlapping targets
+ else: # different frames, same or overlapping targets
s = sent.text
- for j, k, fname, asetIndex in overt:
- s += "\n" + asetIndex + " " + sent.text[j:k] + " :: " + fname
- s += "\n(Unable to display sentence with targets marked inline due to overlap)"
+ for j,k,fname,asetIndex in overt:
+ s += '\n' + asetIndex + ' ' + sent.text[j:k] + ' :: ' + fname
+ s += '\n(Unable to display sentence with targets marked inline due to overlap)'
return s
for o in reversed(sorted(duplicates)):
del overt[o]
s0 = sent.text
- s1 = ""
- s11 = ""
- s2 = ""
+ s1 = ''
+ s11 = ''
+ s2 = ''
i = 0
adjust = 0
fAbbrevs = OrderedDict()
- for j, k, fname, asetIndex in overt:
- if not j >= i:
- assert j >= i, (
- "Overlapping targets?"
- + (
- " UNANN"
- if any(aset.status == "UNANN" for aset in sent.annotationSet[1:])
- else ""
- ),
- (j, k, asetIndex),
- )
- s1 += " " * (j - i) + "*" * (k - j)
- short = fname[: k - j]
- if (k - j) < len(fname):
+ for j,k,fname,asetIndex in overt:
+ if not j>=i:
+ assert j>=i,('Overlapping targets?'+(' UNANN' if any(aset.status=='UNANN' for aset in sent.annotationSet[1:]) else ''),(j,k,asetIndex))
+ s1 += ' '*(j-i) + '*'*(k-j)
+ short = fname[:k-j]
+ if (k-j)<len(fname):
r = 0
while short in fAbbrevs:
- if fAbbrevs[short] == fname:
+ if fAbbrevs[short]==fname:
break
r += 1
- short = fname[: k - j - 1] + str(r)
- else: # short not in fAbbrevs
+ short = fname[:k-j-1] + str(r)
+ else: # short not in fAbbrevs
fAbbrevs[short] = fname
- s11 += " " * (j - i) + short.ljust(k - j)
- if len(asetIndex) > (k - j):
+ s11 += ' '*(j-i) + short.ljust(k-j)
+ if len(asetIndex)>(k-j):
# add space in the sentence to make room for the annotation index
- amt = len(asetIndex) - (k - j)
- s0 = (
- s0[: k + adjust] + "~" * amt + s0[k + adjust :]
- ) # '~' to prevent line wrapping
- s1 = s1[: k + adjust] + " " * amt + s1[k + adjust :]
- s11 = s11[: k + adjust] + " " * amt + s11[k + adjust :]
+ amt = len(asetIndex)-(k-j)
+ s0 = s0[:k+adjust]+ '~'*amt + s0[k+adjust:] # '~' to prevent line wrapping
+ s1 = s1[:k+adjust]+ ' '*amt + s1[k+adjust:]
+ s11 = s11[:k+adjust]+ ' '*amt + s11[k+adjust:]
adjust += amt
- s2 += " " * (j - i) + asetIndex.ljust(k - j)
+ s2 += ' '*(j-i) + asetIndex.ljust(k-j)
i = k
long_lines = [s0, s1, s11, s2]
- outstr = "\n\n".join(
- map("\n".join, zip_longest(*mimic_wrap(long_lines), fillvalue=" "))
- ).replace("~", " ")
- outstr += "\n"
+ outstr = '\n\n'.join(map('\n'.join, zip_longest(*mimic_wrap(long_lines), fillvalue=' '))).replace('~',' ')
+ outstr += '\n'
if fAbbrevs:
- outstr += " (" + ", ".join("=".join(pair) for pair in fAbbrevs.items()) + ")"
- assert len(fAbbrevs) == len(dict(fAbbrevs)), "Abbreviation clash"
+ outstr += ' ('+', '.join('='.join(pair) for pair in fAbbrevs.items())+')'
+ assert len(fAbbrevs)==len(dict(fAbbrevs)),'Abbreviation clash'
return outstr
-
def _annotation_ascii_FE_layer(overt, ni, feAbbrevs):
- """Helper for _annotation_ascii_FEs()."""
- s1 = ""
- s2 = ""
+ '''Helper for _annotation_ascii_FEs().'''
+ s1 = ''
+ s2 = ''
i = 0
- for j, k, fename in overt:
- s1 += " " * (j - i) + ("^" if fename.islower() else "-") * (k - j)
- short = fename[: k - j]
- if len(fename) > len(short):
+ for j,k,fename in overt:
+ s1 += ' '*(j-i) + ('^' if fename.islower() else '-')*(k-j)
+ short = fename[:k-j]
+ if len(fename)>len(short):
r = 0
while short in feAbbrevs:
- if feAbbrevs[short] == fename:
+ if feAbbrevs[short]==fename:
break
r += 1
- short = fename[: k - j - 1] + str(r)
- else: # short not in feAbbrevs
+ short = fename[:k-j-1] + str(r)
+ else: # short not in feAbbrevs
feAbbrevs[short] = fename
- s2 += " " * (j - i) + short.ljust(k - j)
+ s2 += ' '*(j-i) + short.ljust(k-j)
i = k
- sNI = ""
+ sNI = ''
if ni:
- sNI += " [" + ", ".join(":".join(x) for x in sorted(ni.items())) + "]"
- return [s1, s2, sNI]
-
+ sNI += ' ['+', '.join(':'.join(x) for x in sorted(ni.items()))+']'
+ return [s1,s2,sNI]
def _annotation_ascii_FEs(sent):
- """
+ '''
ASCII string rendering of the sentence along with a single target and its FEs.
Secondary and tertiary FE layers are included if present.
'sent' can be an FE annotation set or an LU sentence with a single target.
Line-wrapped to limit the display width.
- """
+ '''
feAbbrevs = OrderedDict()
- posspec = [] # POS-specific layer spans (e.g., Supp[ort], Cop[ula])
+ posspec = [] # POS-specific layer spans (e.g., Supp[ort], Cop[ula])
posspec_separate = False
- for lyr in ("Verb", "Noun", "Adj", "Adv", "Prep", "Scon", "Art"):
+ for lyr in ('Verb', 'Noun', 'Adj', 'Adv', 'Prep', 'Scon', 'Art'):
if lyr in sent and sent[lyr]:
- for a, b, lbl in sent[lyr]:
- if (
- lbl == "X"
- ): # skip this, which covers an entire phrase typically containing the target and all its FEs
+ for a,b,lbl in sent[lyr]:
+ if lbl=='X': # skip this, which covers an entire phrase typically containing the target and all its FEs
# (but do display the Gov)
continue
- if any(1 for x, y, felbl in sent.FE[0] if x <= a < y or a <= x < b):
+ if any(1 for x,y,felbl in sent.FE[0] if x<=a<y or a<=x<b):
# overlap between one of the POS-specific layers and first FE layer
- posspec_separate = (
- True
- ) # show POS-specific layers on a separate line
- posspec.append(
- (a, b, lbl.lower().replace("-", ""))
- ) # lowercase Cop=>cop, Non-Asp=>nonasp, etc. to distinguish from FE names
+ posspec_separate = True # show POS-specific layers on a separate line
+ posspec.append((a,b,lbl.lower().replace('-',''))) # lowercase Cop=>cop, Non-Asp=>nonasp, etc. to distinguish from FE names
if posspec_separate:
POSSPEC = _annotation_ascii_FE_layer(posspec, {}, feAbbrevs)
- FE1 = _annotation_ascii_FE_layer(
- sorted(sent.FE[0] + (posspec if not posspec_separate else [])),
- sent.FE[1],
- feAbbrevs,
- )
+ FE1 = _annotation_ascii_FE_layer(sorted(sent.FE[0] + (posspec if not posspec_separate else [])), sent.FE[1], feAbbrevs)
FE2 = FE3 = None
- if "FE2" in sent:
+ if 'FE2' in sent:
FE2 = _annotation_ascii_FE_layer(sent.FE2[0], sent.FE2[1], feAbbrevs)
- if "FE3" in sent:
+ if 'FE3' in sent:
FE3 = _annotation_ascii_FE_layer(sent.FE3[0], sent.FE3[1], feAbbrevs)
- for i, j in sent.Target:
+ for i,j in sent.Target:
FE1span, FE1name, FE1exp = FE1
- if len(FE1span) < j:
- FE1span += " " * (j - len(FE1span))
- if len(FE1name) < j:
- FE1name += " " * (j - len(FE1name))
+ if len(FE1span)<j:
+ FE1span += ' '*(j-len(FE1span))
+ if len(FE1name)<j:
+ FE1name += ' '*(j-len(FE1name))
FE1[1] = FE1name
- FE1[0] = (
- FE1span[:i] + FE1span[i:j].replace(" ", "*").replace("-", "=") + FE1span[j:]
- )
+ FE1[0] = FE1span[:i] + FE1span[i:j].replace(' ','*').replace('-','=') + FE1span[j:]
long_lines = [sent.text]
if posspec_separate:
long_lines.extend(POSSPEC[:2])
- long_lines.extend([FE1[0], FE1[1] + FE1[2]]) # lines with no length limit
+ long_lines.extend([FE1[0], FE1[1]+FE1[2]]) # lines with no length limit
if FE2:
- long_lines.extend([FE2[0], FE2[1] + FE2[2]])
+ long_lines.extend([FE2[0], FE2[1]+FE2[2]])
if FE3:
- long_lines.extend([FE3[0], FE3[1] + FE3[2]])
- long_lines.append("")
- outstr = "\n".join(
- map("\n".join, zip_longest(*mimic_wrap(long_lines), fillvalue=" "))
- )
+ long_lines.extend([FE3[0], FE3[1]+FE3[2]])
+ long_lines.append('')
+ outstr = '\n'.join(map('\n'.join, zip_longest(*mimic_wrap(long_lines), fillvalue=' ')))
if feAbbrevs:
- outstr += "(" + ", ".join("=".join(pair) for pair in feAbbrevs.items()) + ")"
- assert len(feAbbrevs) == len(dict(feAbbrevs)), "Abbreviation clash"
+ outstr += '('+', '.join('='.join(pair) for pair in feAbbrevs.items())+')'
+ assert len(feAbbrevs)==len(dict(feAbbrevs)),'Abbreviation clash'
outstr += "\n"
return outstr
-
def _pretty_fe(fe):
"""
"""
fekeys = fe.keys()
outstr = ""
- outstr += "frame element ({0.ID}): {0.name}\n of {1.name}({1.ID})\n".format(
- fe, fe.frame
- )
- if "definition" in fekeys:
+ outstr += "frame element ({0.ID}): {0.name}\n of {1.name}({1.ID})\n".format(fe, fe.frame)
+ if 'definition' in fekeys:
outstr += "[definition]\n"
- outstr += _pretty_longstring(fe.definition, " ")
- if "abbrev" in fekeys:
+ outstr += _pretty_longstring(fe.definition,' ')
+ if 'abbrev' in fekeys:
outstr += "[abbrev] {0}\n".format(fe.abbrev)
- if "coreType" in fekeys:
+ if 'coreType' in fekeys:
outstr += "[coreType] {0}\n".format(fe.coreType)
- if "requiresFE" in fekeys:
+ if 'requiresFE' in fekeys:
outstr += "[requiresFE] "
if fe.requiresFE is None:
outstr += "<None>\n"
else:
outstr += "{0}({1})\n".format(fe.requiresFE.name, fe.requiresFE.ID)
- if "excludesFE" in fekeys:
+ if 'excludesFE' in fekeys:
outstr += "[excludesFE] "
if fe.excludesFE is None:
outstr += "<None>\n"
else:
outstr += "{0}({1})\n".format(fe.excludesFE.name, fe.excludesFE.ID)
- if "semType" in fekeys:
+ if 'semType' in fekeys:
outstr += "[semType] "
if fe.semType is None:
outstr += "<None>\n"
else:
- outstr += "\n " + "{0}({1})".format(fe.semType.name, fe.semType.ID) + "\n"
+ outstr += "\n " + "{0}({1})".format(fe.semType.name, fe.semType.ID) + '\n'
return outstr
-
def _pretty_frame(frame):
"""
outstr += "frame ({0.ID}): {0.name}\n\n".format(frame)
outstr += "[URL] {0}\n\n".format(frame.URL)
outstr += "[definition]\n"
- outstr += _pretty_longstring(frame.definition, " ") + "\n"
+ outstr += _pretty_longstring(frame.definition, ' ') + '\n'
outstr += "[semTypes] {0} semantic types\n".format(len(frame.semTypes))
- outstr += (
- " " * (len(frame.semTypes) > 0)
- + ", ".join("{0}({1})".format(x.name, x.ID) for x in frame.semTypes)
- + "\n" * (len(frame.semTypes) > 0)
- )
+ outstr += " "*(len(frame.semTypes)>0) + ", ".join("{0}({1})".format(x.name, x.ID) for x in frame.semTypes) + '\n'*(len(frame.semTypes)>0)
- outstr += "\n[frameRelations] {0} frame relations\n".format(
- len(frame.frameRelations)
- )
- outstr += " " + "\n ".join(repr(frel) for frel in frame.frameRelations) + "\n"
+ outstr += "\n[frameRelations] {0} frame relations\n".format(len(frame.frameRelations))
+ outstr += ' ' + '\n '.join(repr(frel) for frel in frame.frameRelations) + '\n'
outstr += "\n[lexUnit] {0} lexical units\n".format(len(frame.lexUnit))
lustrs = []
- for luName, lu in sorted(frame.lexUnit.items()):
- tmpstr = "{0} ({1})".format(luName, lu.ID)
+ for luName,lu in sorted(frame.lexUnit.items()):
+ tmpstr = '{0} ({1})'.format(luName, lu.ID)
lustrs.append(tmpstr)
- outstr += "{0}\n".format(_pretty_longstring(", ".join(lustrs), prefix=" "))
+ outstr += "{0}\n".format(_pretty_longstring(', '.join(lustrs),prefix=' '))
outstr += "\n[FE] {0} frame elements\n".format(len(frame.FE))
fes = {}
- for feName, fe in sorted(frame.FE.items()):
+ for feName,fe in sorted(frame.FE.items()):
try:
fes[fe.coreType].append("{0} ({1})".format(feName, fe.ID))
except KeyError:
fes[fe.coreType] = []
fes[fe.coreType].append("{0} ({1})".format(feName, fe.ID))
- for ct in sorted(
- fes.keys(),
- key=lambda ct2: [
- "Core",
- "Core-Unexpressed",
- "Peripheral",
- "Extra-Thematic",
- ].index(ct2),
- ):
- outstr += "{0:>16}: {1}\n".format(ct, ", ".join(sorted(fes[ct])))
-
- outstr += "\n[FEcoreSets] {0} frame element core sets\n".format(
- len(frame.FEcoreSets)
- )
- outstr += (
- " "
- + "\n ".join(
- ", ".join([x.name for x in coreSet]) for coreSet in frame.FEcoreSets
- )
- + "\n"
- )
+ for ct in sorted(fes.keys(), key=lambda ct2: ['Core','Core-Unexpressed','Peripheral','Extra-Thematic'].index(ct2)):
+ outstr += "{0:>16}: {1}\n".format(ct, ', '.join(sorted(fes[ct])))
- return outstr
+ outstr += "\n[FEcoreSets] {0} frame element core sets\n".format(len(frame.FEcoreSets))
+ outstr += " " + '\n '.join(", ".join([x.name for x in coreSet]) for coreSet in frame.FEcoreSets) + '\n'
+ return outstr
class FramenetError(Exception):
"""An exception class for framenet-related errors."""
-
+@python_2_unicode_compatible
class AttrDict(dict):
"""A class that wraps a dict and allows accessing the keys of the
def __init__(self, *args, **kwargs):
super(AttrDict, self).__init__(*args, **kwargs)
- # self.__dict__ = self
+ #self.__dict__ = self
def __setattr__(self, name, value):
self[name] = value
-
def __getattr__(self, name):
- if name == "_short_repr":
+ if name=='_short_repr':
return self._short_repr
return self[name]
-
def __getitem__(self, name):
- v = super(AttrDict, self).__getitem__(name)
- if isinstance(v, Future):
+ v = super(AttrDict,self).__getitem__(name)
+ if isinstance(v,Future):
return v._data()
return v
def _short_repr(self):
- if "_type" in self:
- if self["_type"].endswith("relation"):
+ if '_type' in self:
+ if self['_type'].endswith('relation'):
return self.__repr__()
try:
- return "<{0} ID={1} name={2}>".format(
- self["_type"], self["ID"], self["name"]
- )
+ return "<{0} ID={1} name={2}>".format(self['_type'], self['ID'], self['name'])
except KeyError:
- try: # no ID--e.g., for _type=lusubcorpus
- return "<{0} name={1}>".format(self["_type"], self["name"])
- except KeyError: # no name--e.g., for _type=lusentence
- return "<{0} ID={1}>".format(self["_type"], self["ID"])
+ try: # no ID--e.g., for _type=lusubcorpus
+ return "<{0} name={1}>".format(self['_type'], self['name'])
+ except KeyError: # no name--e.g., for _type=lusentence
+ return "<{0} ID={1}>".format(self['_type'], self['ID'])
else:
return self.__repr__()
def _str(self):
outstr = ""
- if "_type" not in self:
+ if not '_type' in self:
outstr = _pretty_any(self)
- elif self["_type"] == "frame":
+ elif self['_type'] == 'frame':
outstr = _pretty_frame(self)
- elif self["_type"] == "fe":
+ elif self['_type'] == 'fe':
outstr = _pretty_fe(self)
- elif self["_type"] == "lu":
+ elif self['_type'] == 'lu':
outstr = _pretty_lu(self)
- elif self["_type"] == "luexemplars": # list of ALL exemplars for LU
+ elif self['_type'] == 'luexemplars': # list of ALL exemplars for LU
outstr = _pretty_exemplars(self, self[0].LU)
- elif (
- self["_type"] == "fulltext_annotation"
- ): # list of all sentences for full-text doc
+ elif self['_type'] == 'fulltext_annotation': # list of all sentences for full-text doc
outstr = _pretty_fulltext_sentences(self)
- elif self["_type"] == "lusentence":
+ elif self['_type'] == 'lusentence':
outstr = _pretty_annotation(self)
- elif self["_type"] == "fulltext_sentence":
+ elif self['_type'] == 'fulltext_sentence':
outstr = _pretty_fulltext_sentence(self)
- elif self["_type"] in ("luannotationset", "fulltext_annotationset"):
+ elif self['_type'] in ('luannotationset', 'fulltext_annotationset'):
outstr = _pretty_annotation(self, aset_level=True)
- elif self["_type"] == "posannotationset":
+ elif self['_type'] == 'posannotationset':
outstr = _pretty_pos(self)
- elif self["_type"] == "semtype":
+ elif self['_type'] == 'semtype':
outstr = _pretty_semtype(self)
- elif self["_type"] == "framerelationtype":
+ elif self['_type'] == 'framerelationtype':
outstr = _pretty_frame_relation_type(self)
- elif self["_type"] == "framerelation":
+ elif self['_type'] == 'framerelation':
outstr = _pretty_frame_relation(self)
- elif self["_type"] == "ferelation":
+ elif self['_type'] == 'ferelation':
outstr = _pretty_fe_relation(self)
else:
outstr = _pretty_any(self)
# ensure result is unicode string prior to applying the
- # decorator (because non-ASCII characters
+ # @python_2_unicode_compatible decorator (because non-ASCII characters
# could in principle occur in the data and would trigger an encoding error when
# passed as arguments to str.format()).
# assert isinstance(outstr, unicode) # not in Python 3.2
def __str__(self):
return self._str()
-
def __repr__(self):
return self.__str__()
-
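# Illustrative sketch (not part of the patch): AttrDict exposes dict keys as
# attributes and resolves any Future value transparently on access; toy data:
#
# >>> d = AttrDict({'_type': 'frame', 'ID': 7, 'name': 'Motion'})
# >>> d.name          # equivalent to d['name']
# 'Motion'
# >>> d._short_repr()
# '<frame ID=7 name=Motion>'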
+@python_2_unicode_compatible
class SpecialList(list):
"""
A list subclass which adds a '_type' attribute for special printing
(similar to an AttrDict, though this is NOT an AttrDict subclass).
"""
-
def __init__(self, typ, *args, **kwargs):
- super(SpecialList, self).__init__(*args, **kwargs)
+ super(SpecialList,self).__init__(*args, **kwargs)
self._type = typ
def _str(self):
outstr = ""
assert self._type
- if len(self) == 0:
+ if len(self)==0:
outstr = "[]"
- elif self._type == "luexemplars": # list of ALL exemplars for LU
+ elif self._type == 'luexemplars': # list of ALL exemplars for LU
outstr = _pretty_exemplars(self, self[0].LU)
else:
- assert False, self._type
+ assert False,self._type
return outstr
def __str__(self):
return self._str()
-
def __repr__(self):
return self.__str__()
-
class Future(object):
"""
Wraps and acts as a proxy for a value to be loaded lazily (on demand).
Adapted from https://gist.github.com/sergey-miryanov/2935416
"""
-
def __init__(self, loader, *args, **kwargs):
"""
:param loader: when called with no arguments, returns the value to be stored
:type loader: callable
"""
- super(Future, self).__init__(*args, **kwargs)
+ super (Future, self).__init__(*args, **kwargs)
self._loader = loader
self._d = None
-
def _data(self):
if callable(self._loader):
self._d = self._loader()
- self._loader = None # the data is now cached
+ self._loader = None # the data is now cached
return self._d
def __nonzero__(self):
return bool(self._data())
-
def __len__(self):
return len(self._data())
def __setitem__(self, key, value):
- return self._data().__setitem__(key, value)
-
+ return self._data ().__setitem__(key, value)
def __getitem__(self, key):
- return self._data().__getitem__(key)
-
+ return self._data ().__getitem__(key)
def __getattr__(self, key):
return self._data().__getattr__(key)
def __str__(self):
return self._data().__str__()
-
def __repr__(self):
return self._data().__repr__()
-
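# Illustrative sketch (not part of the patch): Future defers a computation
# until the value is first touched, then caches it.
#
# >>> fut = Future(lambda: sorted(range(5), reverse=True))  # loader not run yet
# >>> len(fut)   # first access runs and caches the loader
# 5
# >>> fut[0]     # served from the cached value
# 4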
+@python_2_unicode_compatible
class PrettyDict(AttrDict):
"""
Displays an abbreviated repr of values where possible.
Inherits from AttrDict, so a callable value will
be lazily converted to an actual value.
"""
-
def __init__(self, *args, **kwargs):
- _BREAK_LINES = kwargs.pop("breakLines", False)
+ _BREAK_LINES = kwargs.pop('breakLines', False)
super(PrettyDict, self).__init__(*args, **kwargs)
- dict.__setattr__(self, "_BREAK_LINES", _BREAK_LINES)
-
+ dict.__setattr__(self, '_BREAK_LINES', _BREAK_LINES)
def __repr__(self):
parts = []
- for k, v in sorted(self.items()):
- kv = repr(k) + ": "
+ for k,v in sorted(self.items()):
+ kv = repr(k)+': '
try:
kv += v._short_repr()
except AttributeError:
kv += repr(v)
parts.append(kv)
- return "{" + (",\n " if self._BREAK_LINES else ", ").join(parts) + "}"
-
+ return '{'+(',\n ' if self._BREAK_LINES else ', ').join(parts)+'}'
+@python_2_unicode_compatible
class PrettyList(list):
"""
Displays an abbreviated repr of only the first several elements, not the whole list.
"""
-
# from nltk.util
def __init__(self, *args, **kwargs):
- self._MAX_REPR_SIZE = kwargs.pop("maxReprSize", 60)
- self._BREAK_LINES = kwargs.pop("breakLines", False)
+ self._MAX_REPR_SIZE = kwargs.pop('maxReprSize', 60)
+ self._BREAK_LINES = kwargs.pop('breakLines', False)
super(PrettyList, self).__init__(*args, **kwargs)
-
def __repr__(self):
"""
Return a string representation for this corpus view that is
length = 5
for elt in self:
- pieces.append(
- elt._short_repr()
- ) # key difference from inherited version: call to _short_repr()
+ pieces.append(elt._short_repr()) # key difference from inherited version: call to _short_repr()
length += len(pieces[-1]) + 2
if self._MAX_REPR_SIZE and length > self._MAX_REPR_SIZE and len(pieces) > 2:
- return "[%s, ...]" % str(
- ",\n " if self._BREAK_LINES else ", "
- ).join(pieces[:-1])
- return "[%s]" % str(",\n " if self._BREAK_LINES else ", ").join(pieces)
-
+ return "[%s, ...]" % text_type(',\n ' if self._BREAK_LINES else ', ').join(pieces[:-1])
+ return "[%s]" % text_type(',\n ' if self._BREAK_LINES else ', ').join(pieces)
+@python_2_unicode_compatible
class PrettyLazyMap(LazyMap):
"""
Displays an abbreviated repr of only the first several elements, not the whole list.
"""
-
# from nltk.util
_MAX_REPR_SIZE = 60
-
def __repr__(self):
"""
Return a string representation for this corpus view that is
pieces = []
length = 5
for elt in self:
- pieces.append(
- elt._short_repr()
- ) # key difference from inherited version: call to _short_repr()
+ pieces.append(elt._short_repr()) # key difference from inherited version: call to _short_repr()
length += len(pieces[-1]) + 2
if length > self._MAX_REPR_SIZE and len(pieces) > 2:
- return "[%s, ...]" % str(", ").join(pieces[:-1])
- return "[%s]" % str(", ").join(pieces)
-
+ return "[%s, ...]" % text_type(', ').join(pieces[:-1])
+ return "[%s]" % text_type(', ').join(pieces)
+@python_2_unicode_compatible
class PrettyLazyIteratorList(LazyIteratorList):
"""
Displays an abbreviated repr of only the first several elements, not the whole list.
"""
-
# from nltk.util
_MAX_REPR_SIZE = 60
-
def __repr__(self):
"""
Return a string representation for this corpus view that is
pieces = []
length = 5
for elt in self:
- pieces.append(
- elt._short_repr()
- ) # key difference from inherited version: call to _short_repr()
+ pieces.append(elt._short_repr()) # key difference from inherited version: call to _short_repr()
length += len(pieces[-1]) + 2
if length > self._MAX_REPR_SIZE and len(pieces) > 2:
- return "[%s, ...]" % str(", ").join(pieces[:-1])
- return "[%s]" % str(", ").join(pieces)
-
+ return "[%s, ...]" % text_type(', ').join(pieces[:-1])
+ return "[%s]" % text_type(', ').join(pieces)
+@python_2_unicode_compatible
class PrettyLazyConcatenation(LazyConcatenation):
"""
Displays an abbreviated repr of only the first several elements, not the whole list.
"""
-
# from nltk.util
_MAX_REPR_SIZE = 60
-
def __repr__(self):
"""
Return a string representation for this corpus view that is
pieces = []
length = 5
for elt in self:
- pieces.append(
- elt._short_repr()
- ) # key difference from inherited version: call to _short_repr()
+ pieces.append(elt._short_repr()) # key difference from inherited version: call to _short_repr()
length += len(pieces[-1]) + 2
if length > self._MAX_REPR_SIZE and len(pieces) > 2:
- return "[%s, ...]" % str(", ").join(pieces[:-1])
- return "[%s]" % str(", ").join(pieces)
+ return "[%s, ...]" % text_type(', ').join(pieces[:-1])
+ return "[%s]" % text_type(', ').join(pieces)
def __add__(self, other):
"""Return a list concatenating self with other."""
True
"""
- _bad_statuses = ["Problem"]
+ _bad_statuses = ['Problem']
"""
When loading LUs for a frame, those whose status is in this list will be ignored.
Due to caching, if user code modifies this, it should do so before loading any data.
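# Illustrative sketch (not part of the patch): user code that wants to skip
# further LU statuses (the status name below is hypothetical) must extend the
# list before anything is loaded:
#
# >>> from nltk.corpus import framenet as fn
# >>> fn._bad_statuses.append('Insufficient_Attestations')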
# Indexes used for faster look-ups
self._frame_idx = None
- self._cached_frames = {} # name -> ID
+ self._cached_frames = {} # name -> ID
self._lu_idx = None
self._fulltext_idx = None
self._semtypes = None
- self._freltyp_idx = None # frame relation types (Inheritance, Using, etc.)
- self._frel_idx = None # frame-to-frame relation instances
+ self._freltyp_idx = None # frame relation types (Inheritance, Using, etc.)
+ self._frel_idx = None # frame-to-frame relation instances
self._ferel_idx = None # FE-to-FE relation instances
- self._frel_f_idx = None # frame-to-frame relations associated with each frame
+ self._frel_f_idx = None # frame-to-frame relations associated with each frame
def help(self, attrname=None):
"""Display help information summarizing the main methods."""
# as it's easier to just call frame().
# Also not mentioning lu_basic().
+
msg = """
-Citation: Nathan Schneider and Chuck Wooters (2017),
-"The NLTK FrameNet API: Designing for Discoverability with a Rich Linguistic Resource".
+Citation: Nathan Schneider and Chuck Wooters (2017),
+"The NLTK FrameNet API: Designing for Discoverability with a Rich Linguistic Resource".
Proceedings of EMNLP: System Demonstrations. https://arxiv.org/abs/1703.07438
Use the following methods to access data in FrameNet.
self._buildrelationindex() # always load frame relations before frames,
# otherwise weird ordering effects might result in incomplete information
self._frame_idx = {}
- for f in XMLCorpusView(
- self.abspath("frameIndex.xml"), "frameIndex/frame", self._handle_elt
- ):
- self._frame_idx[f["ID"]] = f
+ for f in XMLCorpusView(self.abspath("frameIndex.xml"),
+ 'frameIndex/frame', self._handle_elt):
+ self._frame_idx[f['ID']] = f
def _buildcorpusindex(self):
# The total number of fulltext annotated documents in Framenet
# is fairly small (~90) so this index should not be very large
self._fulltext_idx = {}
- for doclist in XMLCorpusView(
- self.abspath("fulltextIndex.xml"),
- "fulltextIndex/corpus",
- self._handle_fulltextindex_elt,
- ):
+ for doclist in XMLCorpusView(self.abspath("fulltextIndex.xml"),
+ 'fulltextIndex/corpus',
+ self._handle_fulltextindex_elt):
for doc in doclist:
self._fulltext_idx[doc.ID] = doc
# The number of LUs in Framenet is about 13,000 so this index
# should not be very large
self._lu_idx = {}
- for lu in XMLCorpusView(
- self.abspath("luIndex.xml"), "luIndex/lu", self._handle_elt
- ):
- self._lu_idx[
- lu["ID"]
- ] = lu # populate with LU index entries. if any of these
+ for lu in XMLCorpusView(self.abspath("luIndex.xml"),
+ 'luIndex/lu', self._handle_elt):
+ self._lu_idx[lu['ID']] = lu # populate with LU index entries. If any of these
# are looked up they will be replaced by full LU objects.
def _buildrelationindex(self):
- # print('building relation index...', file=sys.stderr)
- freltypes = PrettyList(
- x
- for x in XMLCorpusView(
- self.abspath("frRelation.xml"),
- "frameRelations/frameRelationType",
- self._handle_framerelationtype_elt,
- )
- )
+ #print('building relation index...', file=sys.stderr)
+ freltypes = PrettyList(x for x in XMLCorpusView(self.abspath("frRelation.xml"),
+ 'frameRelations/frameRelationType',
+ self._handle_framerelationtype_elt))
self._freltyp_idx = {}
self._frel_idx = {}
self._frel_f_idx = defaultdict(set)
for freltyp in freltypes:
self._freltyp_idx[freltyp.ID] = freltyp
for frel in freltyp.frameRelations:
- supF = frel.superFrame = frel[freltyp.superFrameName] = Future(
- (lambda fID: lambda: self.frame_by_id(fID))(frel.supID)
- )
- subF = frel.subFrame = frel[freltyp.subFrameName] = Future(
- (lambda fID: lambda: self.frame_by_id(fID))(frel.subID)
- )
+ supF = frel.superFrame = frel[freltyp.superFrameName] = Future((lambda fID: lambda: self.frame_by_id(fID))(frel.supID))
+ subF = frel.subFrame = frel[freltyp.subFrameName] = Future((lambda fID: lambda: self.frame_by_id(fID))(frel.subID))
self._frel_idx[frel.ID] = frel
self._frel_f_idx[frel.supID].add(frel.ID)
self._frel_f_idx[frel.subID].add(frel.ID)
for ferel in frel.feRelations:
ferel.superFrame = supF
ferel.subFrame = subF
- ferel.superFE = Future(
- (lambda fer: lambda: fer.superFrame.FE[fer.superFEName])(ferel)
- )
- ferel.subFE = Future(
- (lambda fer: lambda: fer.subFrame.FE[fer.subFEName])(ferel)
- )
+ ferel.superFE = Future((lambda fer: lambda: fer.superFrame.FE[fer.superFEName])(ferel))
+ ferel.subFE = Future((lambda fer: lambda: fer.subFrame.FE[fer.subFEName])(ferel))
self._ferel_idx[ferel.ID] = ferel
- # print('...done building relation index', file=sys.stderr)
+ #print('...done building relation index', file=sys.stderr)
def _warn(self, *message, **kwargs):
if self._warnings:
- kwargs.setdefault("file", sys.stderr)
+ kwargs.setdefault('file', sys.stderr)
print(*message, **kwargs)
def readme(self):
raise FramenetError("Unknown document id: {0}".format(fn_docid))
# construct the path name for the xml file containing the document info
- locpath = os.path.join("{0}".format(self._root), self._fulltext_dir, xmlfname)
+ locpath = os.path.join(
+ "{0}".format(self._root), self._fulltext_dir, xmlfname)
# Grab the top-level xml element containing the fulltext annotation
- elt = XMLCorpusView(locpath, "fullTextAnnotation")[0]
+ elt = XMLCorpusView(locpath, 'fullTextAnnotation')[0]
info = self._handle_fulltextannotation_elt(elt)
# add metadata
- for k, v in self._fulltext_idx[fn_docid].items():
+ for k,v in self._fulltext_idx[fn_docid].items():
info[k] = v
return info
# get the name of the frame with this id number
try:
fentry = self._frame_idx[fn_fid]
- if "_type" in fentry:
- return fentry # full frame object is cached
- name = fentry["name"]
+ if '_type' in fentry:
+ return fentry # full frame object is cached
+ name = fentry['name']
except TypeError:
self._buildframeindex()
- name = self._frame_idx[fn_fid]["name"]
+ name = self._frame_idx[fn_fid]['name']
except KeyError:
- raise FramenetError("Unknown frame id: {0}".format(fn_fid))
+ raise FramenetError('Unknown frame id: {0}'.format(fn_fid))
return self.frame_by_name(name, ignorekeys, check_cache=False)
# construct the path name for the xml file containing the Frame info
locpath = os.path.join(
- "{0}".format(self._root), self._frame_dir, fn_fname + ".xml"
- )
- # print(locpath, file=sys.stderr)
+ "{0}".format(self._root), self._frame_dir, fn_fname + ".xml")
+ #print(locpath, file=sys.stderr)
# Grab the xml for the frame
try:
- elt = XMLCorpusView(locpath, "frame")[0]
+ elt = XMLCorpusView(locpath, 'frame')[0]
except IOError:
- raise FramenetError("Unknown frame: {0}".format(fn_fname))
+ raise FramenetError('Unknown frame: {0}'.format(fn_fname))
fentry = self._handle_frame_elt(elt, ignorekeys)
assert fentry
- fentry.URL = self._fnweb_url + "/" + self._frame_dir + "/" + fn_fname + ".xml"
+ fentry.URL = self._fnweb_url + '/' + self._frame_dir + '/' + fn_fname + '.xml'
# INFERENCE RULE: propagate lexical semtypes from the frame to all its LUs
for st in fentry.semTypes:
- if st.rootType.name == "Lexical_type":
+ if st.rootType.name=='Lexical_type':
for lu in fentry.lexUnit.values():
- if not any(
- x is st for x in lu.semTypes
- ): # identity containment check
+ if not any(x is st for x in lu.semTypes): # identity containment check
lu.semTypes.append(st)
+
self._frame_idx[fentry.ID] = fentry
self._cached_frames[fentry.name] = fentry.ID
- """
+ '''
# now set up callables to resolve the LU pointers lazily.
# (could also do this here--caching avoids infinite recursion.)
for luName,luinfo in fentry.lexUnit.items():
fentry.lexUnit[luName] = (lambda luID: Future(lambda: self.lu(luID)))(luinfo.ID)
- """
+ '''
return fentry
def frame(self, fn_fid_or_fname, ignorekeys=[]):
"""
# get the frame info by name or id number
- if isinstance(fn_fid_or_fname, str):
+ if isinstance(fn_fid_or_fname, string_types):
f = self.frame_by_name(fn_fid_or_fname, ignorekeys)
else:
f = self.frame_by_id(fn_fid_or_fname, ignorekeys)
search through ALL of the frame XML files in the db.
>>> from nltk.corpus import framenet as fn
- >>> from nltk.corpus.reader.framenet import PrettyList
- >>> PrettyList(sorted(fn.frames_by_lemma(r'(?i)a little'), key=itemgetter('ID'))) # doctest: +ELLIPSIS
+ >>> fn.frames_by_lemma(r'(?i)a little') # doctest: +ELLIPSIS
[<frame ID=189 name=Quanti...>, <frame ID=2001 name=Degree>]
:return: A list of frame objects.
:rtype: list(AttrDict)
"""
- return PrettyList(
- f
- for f in self.frames()
- if any(re.search(pat, luName) for luName in f.lexUnit)
- )
+ return PrettyList(f for f in self.frames() if any(re.search(pat, luName) for luName in f.lexUnit))
def lu_basic(self, fn_luid):
"""
>>> lu # doctest: +ELLIPSIS
{'ID': 256,
'POS': 'V',
- 'URL': 'https://framenet2.icsi.berkeley.edu/fnReports/data/lu/lu256.xml',
+ 'URL': u'https://framenet2.icsi.berkeley.edu/fnReports/data/lu/lu256.xml',
'_type': 'lu',
'cBy': ...,
'cDate': '02/08/2001 01:27:50 PST Thu',
:return: Basic information about the lexical unit
:rtype: dict
"""
- return self.lu(fn_luid, ignorekeys=["subCorpus", "exemplars"])
+ return self.lu(fn_luid, ignorekeys=['subCorpus', 'exemplars'])
def lu(self, fn_luid, ignorekeys=[], luName=None, frameID=None, frameName=None):
"""
# LU not in the index. We create a placeholder by falling back to
# luName, frameID, and frameName. However, this will not be listed
# among the LUs for its frame.
- self._warn(
- "LU ID not found: {0} ({1}) in {2} ({3})".format(
- luName, fn_luid, frameName, frameID
- )
- )
- luinfo = AttrDict(
- {
- "_type": "lu",
- "ID": fn_luid,
- "name": luName,
- "frameID": frameID,
- "status": "Problem",
- }
- )
+ self._warn('LU ID not found: {0} ({1}) in {2} ({3})'.format(luName, fn_luid, frameName, frameID))
+ luinfo = AttrDict({'_type': 'lu', 'ID': fn_luid, 'name': luName,
+ 'frameID': frameID, 'status': 'Problem'})
f = self.frame_by_id(luinfo.frameID)
- assert f.name == frameName, (f.name, frameName)
- luinfo["frame"] = f
+ assert f.name==frameName,(f.name,frameName)
+ luinfo['frame'] = f
self._lu_idx[fn_luid] = luinfo
- elif "_type" not in luinfo:
+ elif '_type' not in luinfo:
# we only have an index entry for the LU. loading the frame will replace this.
f = self.frame_by_id(luinfo.frameID)
luinfo = self._lu_idx[fn_luid]
if ignorekeys:
- return AttrDict(
- dict((k, v) for k, v in luinfo.items() if k not in ignorekeys)
- )
+ return AttrDict(dict((k, v) for k, v in luinfo.items() if k not in ignorekeys))
return luinfo
fname = "lu{0}.xml".format(fn_luid)
locpath = os.path.join("{0}".format(self._root), self._lu_dir, fname)
- # print(locpath, file=sys.stderr)
+ #print(locpath, file=sys.stderr)
if not self._lu_idx:
self._buildluindex()
try:
- elt = XMLCorpusView(locpath, "lexUnit")[0]
+ elt = XMLCorpusView(locpath, 'lexUnit')[0]
except IOError:
- raise FramenetError("Unknown LU id: {0}".format(fn_luid))
+ raise FramenetError('Unknown LU id: {0}'.format(fn_luid))
lu2 = self._handle_lexunit_elt(elt, ignorekeys)
- lu.URL = self._fnweb_url + "/" + self._lu_dir + "/" + fname
+ lu.URL = self._fnweb_url + '/' + self._lu_dir + '/' + fname
lu.subCorpus = lu2.subCorpus
- lu.exemplars = SpecialList(
- "luexemplars", [sent for subc in lu.subCorpus for sent in subc.sentence]
- )
+ lu.exemplars = SpecialList('luexemplars',
+ [sent for subc in lu.subCorpus for sent in subc.sentence])
for sent in lu.exemplars:
- sent["LU"] = lu
- sent["frame"] = lu.frame
+ sent['LU'] = lu
+ sent['frame'] = lu.frame
for aset in sent.annotationSet:
- aset["LU"] = lu
- aset["frame"] = lu.frame
+ aset['LU'] = lu
+ aset['frame'] = lu.frame
return lu
def _loadsemtypes(self):
"""Create the semantic types index."""
self._semtypes = AttrDict()
- semtypeXML = [
- x
- for x in XMLCorpusView(
- self.abspath("semTypes.xml"),
- "semTypes/semType",
- self._handle_semtype_elt,
- )
- ]
+ semtypeXML = [x for x in XMLCorpusView(self.abspath("semTypes.xml"),
+ 'semTypes/semType',
+ self._handle_semtype_elt)]
for st in semtypeXML:
- n = st["name"]
- a = st["abbrev"]
- i = st["ID"]
+ n = st['name']
+ a = st['abbrev']
+ i = st['ID']
# Both name and abbrev should be able to retrieve the
# ID. The ID will retrieve the semantic type dict itself.
self._semtypes[n] = i
st.superType = self.semtype(st.superType.supID)
st.superType.subTypes.append(st)
else:
- if st not in roots:
- roots.append(st)
+ if st not in roots: roots.append(st)
st.rootType = st
queue = list(roots)
assert queue
for child in st.subTypes:
child.rootType = st.rootType
queue.append(child)
- # self.propagate_semtypes() # apply inferencing over FE relations
+ #self.propagate_semtypes() # apply inferencing over FE relations
def propagate_semtypes(self):
"""
try:
if superST and superST is not subST:
# propagate downward
- assert subST is None or self.semtype_inherits(subST, superST), (
- superST.name,
- ferel,
- subST.name,
- )
+ assert subST is None or self.semtype_inherits(subST, superST),(superST.name,ferel,subST.name)
if subST is None:
ferel.subFE.semType = subST = superST
changed = True
nPropagations += 1
- if (
- ferel.type.name in ["Perspective_on", "Subframe", "Precedes"]
- and subST
- and subST is not superST
- ):
+ if ferel.type.name in ['Perspective_on', 'Subframe', 'Precedes'] and subST \
+ and subST is not superST:
# propagate upward
- assert superST is None, (superST.name, ferel, subST.name)
+ assert superST is None,(superST.name,ferel,subST.name)
ferel.superFE.semType = superST = subST
changed = True
nPropagations += 1
except AssertionError as ex:
# bug in the data! ignore
- # print(ex, file=sys.stderr)
+ #print(ex, file=sys.stderr)
continue
- # print(i, nPropagations, file=sys.stderr)
+ #print(i, nPropagations, file=sys.stderr)
def semtype(self, key):
"""
>>> len(fn.frames()) in (1019, 1221) # FN 1.5 and 1.7, resp.
True
>>> x = PrettyList(fn.frames(r'(?i)crim'), maxReprSize=0, breakLines=True)
- >>> x.sort(key=itemgetter('ID'))
+ >>> x.sort(key=lambda f: f.ID)
>>> x
[<frame ID=200 name=Criminal_process>,
<frame ID=500 name=Criminal_investigation>,
fIDs = list(self._frame_idx.keys())
if name is not None:
- return PrettyList(
- self.frame(fID) for fID, finfo in self.frame_ids_and_names(name).items()
- )
+ return PrettyList(self.frame(fID) for fID,finfo in self.frame_ids_and_names(name).items())
else:
return PrettyLazyMap(self.frame, fIDs)
"""
if not self._frame_idx:
self._buildframeindex()
- return dict(
- (fID, finfo.name)
- for fID, finfo in self._frame_idx.items()
- if name is None or re.search(name, finfo.name) is not None
- )
+ return dict((fID, finfo.name) for fID,finfo in self._frame_idx.items() if name is None or re.search(name, finfo.name) is not None)
def fes(self, name=None, frame=None):
- """
+ '''
Lists frame element objects. If 'name' is provided, this is treated as
a case-insensitive regular expression to filter by frame name.
(Case-insensitivity is because casing of frame element names is not always
:type name: str
:return: A list of matching frame elements
:rtype: list(AttrDict)
- """
+ '''
# what frames are we searching in?
if frame is not None:
if isinstance(frame, int):
frames = [self.frame(frame)]
- elif isinstance(frame, str):
+ elif isinstance(frame, string_types):
frames = self.frames(frame)
else:
frames = [frame]
else:
frames = self.frames()
- return PrettyList(
- fe
- for f in frames
- for fename, fe in f.FE.items()
- if name is None or re.search(name, fename, re.I)
- )
+ return PrettyList(fe for f in frames for fename,fe in f.FE.items() if name is None or re.search(name, fename, re.I))
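# Illustrative sketch (not part of the patch): hypothetical FE queries using
# the name/frame filters described in the docstring above.
#
# >>> from nltk.corpus import framenet as fn
# >>> fn.fes(r'(?i)noise')                   # FEs matching a name pattern
# >>> fn.fes('Degree', frame='Temperature')  # restricted to one frame (hypothetical)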
def lus(self, name=None, frame=None):
"""
>>> from nltk.corpus import framenet as fn
>>> len(fn.lus()) in (11829, 13572) # FN 1.5 and 1.7, resp.
True
- >>> PrettyList(sorted(fn.lus(r'(?i)a little'), key=itemgetter('ID')), maxReprSize=0, breakLines=True)
- [<lu ID=14733 name=a little.n>,
- <lu ID=14743 name=a little.adv>,
- <lu ID=14744 name=a little bit.adv>]
- >>> PrettyList(sorted(fn.lus(r'interest', r'(?i)stimulus'), key=itemgetter('ID')))
- [<lu ID=14894 name=interested.a>, <lu ID=14920 name=interesting.a>]
+ >>> PrettyList(fn.lus(r'(?i)a little'), maxReprSize=0, breakLines=True)
+ [<lu ID=14744 name=a little bit.adv>,
+ <lu ID=14733 name=a little.n>,
+ <lu ID=14743 name=a little.adv>]
+ >>> fn.lus(r'interest', r'(?i)stimulus')
+ [<lu ID=14920 name=interesting.a>, <lu ID=14894 name=interested.a>]
A brief intro to Lexical Units (excerpted from "FrameNet II:
Extended Theory and Practice" by Ruppenhofer et. al., 2010):
if not self._lu_idx:
self._buildluindex()
- if name is not None: # match LUs, then restrict by frame
- result = PrettyList(
- self.lu(luID) for luID, luName in self.lu_ids_and_names(name).items()
- )
+
+
+ if name is not None: # match LUs, then restrict by frame
+ result = PrettyList(self.lu(luID) for luID,luName in self.lu_ids_and_names(name).items())
if frame is not None:
if isinstance(frame, int):
frameIDs = {frame}
- elif isinstance(frame, str):
+ elif isinstance(frame, string_types):
frameIDs = {f.ID for f in self.frames(frame)}
else:
frameIDs = {frame.ID}
result = PrettyList(lu for lu in result if lu.frame.ID in frameIDs)
- elif frame is not None: # all LUs in matching frames
+ elif frame is not None: # all LUs in matching frames
if isinstance(frame, int):
frames = [self.frame(frame)]
- elif isinstance(frame, str):
+ elif isinstance(frame, string_types):
frames = self.frames(frame)
else:
frames = [frame]
- result = PrettyLazyIteratorList(
- iter(LazyConcatenation(list(f.lexUnit.values()) for f in frames))
- )
- else: # all LUs
- luIDs = [
- luID
- for luID, lu in self._lu_idx.items()
- if lu.status not in self._bad_statuses
- ]
+ result = PrettyLazyIteratorList(iter(LazyConcatenation(list(f.lexUnit.values()) for f in frames)))
+ else: # all LUs
+ luIDs = [luID for luID,lu in self._lu_idx.items() if lu.status not in self._bad_statuses]
result = PrettyLazyMap(self.lu, luIDs)
return result
"""
if not self._lu_idx:
self._buildluindex()
- return {
- luID: luinfo.name
- for luID, luinfo in self._lu_idx.items()
- if luinfo.status not in self._bad_statuses
- and (name is None or re.search(name, luinfo.name) is not None)
- }
+ return {luID: luinfo.name for luID,luinfo in self._lu_idx.items()
+ if luinfo.status not in self._bad_statuses
+ and (name is None or re.search(name, luinfo.name) is not None)}
def docs_metadata(self, name=None):
"""
if name is None:
return ftlist
else:
- return PrettyList(
- x for x in ftlist if re.search(name, x["filename"]) is not None
- )
+ return PrettyList(x for x in ftlist if re.search(name, x['filename']) is not None)
def docs(self, name=None):
"""
"""
if exemplars:
- epart = PrettyLazyIteratorList(
- sent.frameAnnotation for sent in self.exemplars(luNamePattern)
- )
+ epart = PrettyLazyIteratorList(sent.frameAnnotation for sent in self.exemplars(luNamePattern))
else:
epart = []
if full_text:
if luNamePattern is not None:
matchedLUIDs = set(self.lu_ids_and_names(luNamePattern).keys())
- ftpart = PrettyLazyIteratorList(
- aset
- for sent in self.ft_sents()
- for aset in sent.annotationSet[1:]
- if luNamePattern is None or aset.get("luID", "CXN_ASET") in matchedLUIDs
- )
+ ftpart = PrettyLazyIteratorList(aset for sent in self.ft_sents() for aset in sent.annotationSet[1:] if luNamePattern is None or aset.get('luID','CXN_ASET') in matchedLUIDs)
else:
ftpart = []
be specified to retrieve sentences with both overt FEs (in either order).
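+    Sketch of such a call (illustrative only; 'Topic' and 'Medium' are real
+    FrameNet FE names, but exemplar counts vary across FN releases):
+    >>> fn.exemplars(fe='Topic', fe2='Medium') # doctest: +SKIP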
"""
if fe is None and fe2 is not None:
- raise FramenetError("exemplars(..., fe=None, fe2=<value>) is not allowed")
+ raise FramenetError('exemplars(..., fe=None, fe2=<value>) is not allowed')
elif fe is not None and fe2 is not None:
- if not isinstance(fe2, str):
- if isinstance(fe, str):
+ if not isinstance(fe2, string_types):
+ if isinstance(fe, string_types):
# fe2 is specific to a particular frame. swap fe and fe2 so fe is always used to determine the frame.
fe, fe2 = fe2, fe
- elif fe.frame is not fe2.frame: # ensure frames match
- raise FramenetError(
- "exemplars() call with inconsistent `fe` and `fe2` specification (frames must match)"
- )
- if frame is None and fe is not None and not isinstance(fe, str):
+ elif fe.frame is not fe2.frame: # ensure frames match
+ raise FramenetError('exemplars() call with inconsistent `fe` and `fe2` specification (frames must match)')
+ if frame is None and fe is not None and not isinstance(fe, string_types):
frame = fe.frame
# narrow down to frames matching criteria
- lusByFrame = defaultdict(
- list
- ) # frame name -> matching LUs, if luNamePattern is specified
+ lusByFrame = defaultdict(list) # frame name -> matching LUs, if luNamePattern is specified
if frame is not None or luNamePattern is not None:
- if frame is None or isinstance(frame, str):
+ if frame is None or isinstance(frame, string_types):
if luNamePattern is not None:
frames = set()
for lu in self.lus(luNamePattern, frame=frame):
else:
frames = self.frames(frame)
else:
- if isinstance(frame, int):
+ if isinstance(frame,int):
frames = [self.frame(frame)]
- else: # frame object
+ else: # frame object
frames = [frame]
if luNamePattern is not None:
lusByFrame = {frame.name: self.lus(luNamePattern, frame=frame)}
if fe is not None: # narrow to frames that define this FE
- if isinstance(fe, str):
- frames = PrettyLazyIteratorList(
- f
- for f in frames
- if fe in f.FE
- or any(re.search(fe, ffe, re.I) for ffe in f.FE.keys())
- )
+ if isinstance(fe, string_types):
+ frames = PrettyLazyIteratorList(f for f in frames if fe in f.FE or any(re.search(fe, ffe, re.I) for ffe in f.FE.keys()))
else:
if fe.frame not in frames:
- raise FramenetError(
- "exemplars() call with inconsistent `frame` and `fe` specification"
- )
+ raise FramenetError('exemplars() call with inconsistent `frame` and `fe` specification')
frames = [fe.frame]
- if fe2 is not None: # narrow to frames that ALSO define this FE
- if isinstance(fe2, str):
- frames = PrettyLazyIteratorList(
- f
- for f in frames
- if fe2 in f.FE
- or any(re.search(fe2, ffe, re.I) for ffe in f.FE.keys())
- )
+ if fe2 is not None: # narrow to frames that ALSO define this FE
+ if isinstance(fe2, string_types):
+ frames = PrettyLazyIteratorList(f for f in frames if fe2 in f.FE or any(re.search(fe2, ffe, re.I) for ffe in f.FE.keys()))
# else we already narrowed it to a single frame
- else: # frame, luNamePattern are None. fe, fe2 are None or strings
+ else: # frame, luNamePattern are None. fe, fe2 are None or strings
if fe is not None:
frames = {ffe.frame.ID for ffe in self.fes(fe)}
if fe2 is not None:
def _matching_exs():
for f in frames:
- fes = fes2 = None # FEs of interest
+ fes = fes2 = None # FEs of interest
if fe is not None:
- fes = (
- {ffe for ffe in f.FE.keys() if re.search(fe, ffe, re.I)}
- if isinstance(fe, str)
- else {fe.name}
- )
+ fes = {ffe for ffe in f.FE.keys() if re.search(fe, ffe, re.I)} if isinstance(fe, string_types) else {fe.name}
if fe2 is not None:
- fes2 = (
- {ffe for ffe in f.FE.keys() if re.search(fe2, ffe, re.I)}
- if isinstance(fe2, str)
- else {fe2.name}
- )
-
- for lu in (
- lusByFrame[f.name]
- if luNamePattern is not None
- else f.lexUnit.values()
- ):
+ fes2 = {ffe for ffe in f.FE.keys() if re.search(fe2, ffe, re.I)} if isinstance(fe2, string_types) else {fe2.name}
+
+ for lu in lusByFrame[f.name] if luNamePattern is not None else f.lexUnit.values():
for ex in lu.exemplars:
- if (fes is None or self._exemplar_of_fes(ex, fes)) and (
- fes2 is None or self._exemplar_of_fes(ex, fes2)
- ):
+ if (fes is None or self._exemplar_of_fes(ex, fes)) and (fes2 is None or self._exemplar_of_fes(ex, fes2)):
yield ex
return PrettyLazyIteratorList(_matching_exs())
If 'fes' is None, returns all overt FE names.
"""
overtNames = set(list(zip(*ex.FE[0]))[2]) if ex.FE[0] else set()
- if "FE2" in ex:
+ if 'FE2' in ex:
overtNames |= set(list(zip(*ex.FE2[0]))[2]) if ex.FE2[0] else set()
- if "FE3" in ex:
+ if 'FE3' in ex:
overtNames |= set(list(zip(*ex.FE3[0]))[2]) if ex.FE3[0] else set()
return overtNames & fes if fes is not None else overtNames
"""
Full-text annotation sentences, optionally filtered by document name.
"""
- return PrettyLazyIteratorList(
- sent for d in self.docs(docNamePattern) for sent in d.sentence
- )
+ return PrettyLazyIteratorList(sent for d in self.docs(docNamePattern) for sent in d.sentence)
+
def frame_relation_types(self):
"""
Obtain a list of frame relation types.
>>> from nltk.corpus import framenet as fn
- >>> frts = sorted(fn.frame_relation_types(), key=itemgetter('ID'))
+ >>> frts = list(fn.frame_relation_types())
>>> isinstance(frts, list)
True
>>> len(frts) in (9, 10) # FN 1.5 and 1.7, resp.
if relation_type is not None:
if not isinstance(relation_type, dict):
- type = [rt for rt in self.frame_relation_types() if rt.name == type][0]
- assert isinstance(type, dict)
+ type = [rt for rt in self.frame_relation_types() if rt.name==relation_type][0]
+ assert isinstance(type,dict)
# lookup by 'frame'
if frame is not None:
- if isinstance(frame, dict) and "frameRelations" in frame:
+ if isinstance(frame,dict) and 'frameRelations' in frame:
rels = PrettyList(frame.frameRelations)
else:
if not isinstance(frame, int):
# filter by 'frame2'
if frame2 is not None:
if frame is None:
- raise FramenetError(
- "frame_relations(frame=None, frame2=<value>) is not allowed"
- )
+ raise FramenetError("frame_relations(frame=None, frame2=<value>) is not allowed")
if not isinstance(frame2, int):
if isinstance(frame2, dict):
frame2 = frame2.ID
else:
frame2 = self.frame_by_name(frame2).ID
- if frame == frame2:
- raise FramenetError(
- "The two frame arguments to frame_relations() must be different frames"
- )
- rels = [
- rel
- for rel in rels
- if rel.superFrame.ID == frame2 or rel.subFrame.ID == frame2
- ]
-
- return PrettyList(
- sorted(
- rels,
- key=lambda frel: (frel.type.ID, frel.superFrameName, frel.subFrameName),
- )
- )
+ if frame==frame2:
+ raise FramenetError("The two frame arguments to frame_relations() must be different frames")
+ rels = [rel for rel in rels if rel.superFrame.ID==frame2 or rel.subFrame.ID==frame2]
+
+ return PrettyList(sorted(rels,
+ key=lambda frel: (frel.type.ID, frel.superFrameName, frel.subFrameName)))
def fe_relations(self):
"""
"""
if not self._ferel_idx:
self._buildrelationindex()
- return PrettyList(
- sorted(
- self._ferel_idx.values(),
- key=lambda ferel: (
- ferel.type.ID,
- ferel.frameRelation.superFrameName,
- ferel.superFEName,
- ferel.frameRelation.subFrameName,
- ferel.subFEName,
- ),
- )
- )
+ return PrettyList(sorted(self._ferel_idx.values(),
+ key=lambda ferel: (ferel.type.ID, ferel.frameRelation.superFrameName,
+ ferel.superFEName, ferel.frameRelation.subFrameName, ferel.subFEName)))
def semtypes(self):
"""
"""
if not self._semtypes:
self._loadsemtypes()
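+        # The _semtypes index is keyed both by numeric ID and by name/abbreviation
+        # strings; filtering on the int keys returns each semtype exactly once.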
- return PrettyList(
- self._semtypes[i] for i in self._semtypes if isinstance(i, int)
- )
+ return PrettyList(self._semtypes[i] for i in self._semtypes if isinstance(i, int))
def _load_xml_attributes(self, d, elt):
"""
return d
# Ignore these attributes when loading attributes from an xml node
- ignore_attrs = [ #'cBy', 'cDate', 'mDate', # <-- annotation metadata that could be of interest
- "xsi",
- "schemaLocation",
- "xmlns",
- "bgColor",
- "fgColor",
- ]
+ ignore_attrs = [ #'cBy', 'cDate', 'mDate', # <-- annotation metadata that could be of interest
+ 'xsi', 'schemaLocation', 'xmlns', 'bgColor', 'fgColor']
for attr in attr_dict:
"""
try:
- """
+ '''
# Look for boundary issues in markup. (Sometimes FEs are pluralized in definitions.)
m = re.search(r'\w[<][^/]|[<][/][^>]+[>](s\w|[a-rt-z0-9])', data)
if m:
print('Markup boundary:', data[max(0,m.start(0)-10):m.end(0)+10].replace('\n',' '), file=sys.stderr)
- """
-
- data = data.replace("<t>", "")
- data = data.replace("</t>", "")
- data = re.sub('<fex name="[^"]+">', "", data)
- data = data.replace("</fex>", "")
- data = data.replace("<fen>", "")
- data = data.replace("</fen>", "")
- data = data.replace("<m>", "")
- data = data.replace("</m>", "")
- data = data.replace("<ment>", "")
- data = data.replace("</ment>", "")
- data = data.replace("<ex>", "'")
- data = data.replace("</ex>", "'")
- data = data.replace("<gov>", "")
- data = data.replace("</gov>", "")
- data = data.replace("<x>", "")
- data = data.replace("</x>", "")
+ '''
+
+ data = data.replace('<t>', '')
+ data = data.replace('</t>', '')
+ data = re.sub('<fex name="[^"]+">', '', data)
+ data = data.replace('</fex>', '')
+ data = data.replace('<fen>', '')
+ data = data.replace('</fen>', '')
+ data = data.replace('<m>', '')
+ data = data.replace('</m>', '')
+ data = data.replace('<ment>', '')
+ data = data.replace('</ment>', '')
+ data = data.replace('<ex>', "'")
+ data = data.replace('</ex>', "'")
+ data = data.replace('<gov>', '')
+ data = data.replace('</gov>', '')
+ data = data.replace('<x>', '')
+ data = data.replace('</x>', '')
# Get rid of <def-root> and </def-root> tags
- data = data.replace("<def-root>", "")
- data = data.replace("</def-root>", "")
+ data = data.replace('<def-root>', '')
+ data = data.replace('</def-root>', '')
- data = data.replace("\n", " ")
+ data = data.replace('\n', ' ')
except AttributeError:
pass
corpid = ftinfo.ID
retlist = []
for sub in elt:
- if sub.tag.endswith("document"):
+ if sub.tag.endswith('document'):
doc = self._load_xml_attributes(AttrDict(), sub)
- if "name" in doc:
+ if 'name' in doc:
docname = doc.name
else:
docname = doc.description
doc.filename = "{0}__{1}.xml".format(corpname, docname)
- doc.URL = (
- self._fnweb_url + "/" + self._fulltext_dir + "/" + doc.filename
- )
+ doc.URL = self._fnweb_url + '/' + self._fulltext_dir + '/' + doc.filename
doc.corpname = corpname
doc.corpid = corpid
retlist.append(doc)
"""Load the info for a Frame from a frame xml file"""
frinfo = self._load_xml_attributes(AttrDict(), elt)
- frinfo["_type"] = "frame"
- frinfo["definition"] = ""
- frinfo["definitionMarkup"] = ""
- frinfo["FE"] = PrettyDict()
- frinfo["FEcoreSets"] = []
- frinfo["lexUnit"] = PrettyDict()
- frinfo["semTypes"] = []
+ frinfo['_type'] = 'frame'
+ frinfo['definition'] = ""
+ frinfo['definitionMarkup'] = ""
+ frinfo['FE'] = PrettyDict()
+ frinfo['FEcoreSets'] = []
+ frinfo['lexUnit'] = PrettyDict()
+ frinfo['semTypes'] = []
for k in ignorekeys:
if k in frinfo:
del frinfo[k]
for sub in elt:
- if sub.tag.endswith("definition") and "definition" not in ignorekeys:
- frinfo["definitionMarkup"] = sub.text
- frinfo["definition"] = self._strip_tags(sub.text)
- elif sub.tag.endswith("FE") and "FE" not in ignorekeys:
+ if sub.tag.endswith('definition') and 'definition' not in ignorekeys:
+ frinfo['definitionMarkup'] = sub.text
+ frinfo['definition'] = self._strip_tags(sub.text)
+ elif sub.tag.endswith('FE') and 'FE' not in ignorekeys:
feinfo = self._handle_fe_elt(sub)
- frinfo["FE"][feinfo.name] = feinfo
- feinfo["frame"] = frinfo # backpointer
- elif sub.tag.endswith("FEcoreSet") and "FEcoreSet" not in ignorekeys:
+ frinfo['FE'][feinfo.name] = feinfo
+ feinfo['frame'] = frinfo # backpointer
+ elif sub.tag.endswith('FEcoreSet') and 'FEcoreSet' not in ignorekeys:
coreset = self._handle_fecoreset_elt(sub)
# assumes all FEs have been loaded before coresets
- frinfo["FEcoreSets"].append(
- PrettyList(frinfo["FE"][fe.name] for fe in coreset)
- )
- elif sub.tag.endswith("lexUnit") and "lexUnit" not in ignorekeys:
+ frinfo['FEcoreSets'].append(PrettyList(frinfo['FE'][fe.name] for fe in coreset))
+ elif sub.tag.endswith('lexUnit') and 'lexUnit' not in ignorekeys:
luentry = self._handle_framelexunit_elt(sub)
- if luentry["status"] in self._bad_statuses:
+ if luentry['status'] in self._bad_statuses:
# problematic LU entry; ignore it
continue
- luentry["frame"] = frinfo
- luentry["URL"] = (
- self._fnweb_url
- + "/"
- + self._lu_dir
- + "/"
- + "lu{0}.xml".format(luentry["ID"])
- )
- luentry["subCorpus"] = Future(
- (lambda lu: lambda: self._lu_file(lu).subCorpus)(luentry)
- )
- luentry["exemplars"] = Future(
- (lambda lu: lambda: self._lu_file(lu).exemplars)(luentry)
- )
- frinfo["lexUnit"][luentry.name] = luentry
+ luentry['frame'] = frinfo
+ luentry['URL'] = self._fnweb_url + '/' + self._lu_dir + '/' + "lu{0}.xml".format(luentry['ID'])
+ luentry['subCorpus'] = Future((lambda lu: lambda: self._lu_file(lu).subCorpus)(luentry))
+ luentry['exemplars'] = Future((lambda lu: lambda: self._lu_file(lu).exemplars)(luentry))
+ frinfo['lexUnit'][luentry.name] = luentry
if not self._lu_idx:
self._buildluindex()
self._lu_idx[luentry.ID] = luentry
- elif sub.tag.endswith("semType") and "semTypes" not in ignorekeys:
+ elif sub.tag.endswith('semType') and 'semTypes' not in ignorekeys:
semtypeinfo = self._load_xml_attributes(AttrDict(), sub)
- frinfo["semTypes"].append(self.semtype(semtypeinfo.ID))
+ frinfo['semTypes'].append(self.semtype(semtypeinfo.ID))
- frinfo["frameRelations"] = self.frame_relations(frame=frinfo)
+ frinfo['frameRelations'] = self.frame_relations(frame=frinfo)
# resolve 'requires' and 'excludes' links between FEs of this frame
for fe in frinfo.FE.values():
if fe.requiresFE:
name, ID = fe.requiresFE.name, fe.requiresFE.ID
fe.requiresFE = frinfo.FE[name]
- assert fe.requiresFE.ID == ID
+ assert fe.requiresFE.ID==ID
if fe.excludesFE:
name, ID = fe.excludesFE.name, fe.excludesFE.ID
fe.excludesFE = frinfo.FE[name]
- assert fe.excludesFE.ID == ID
+ assert fe.excludesFE.ID==ID
return frinfo
def _handle_framerelationtype_elt(self, elt, *args):
"""Load frame-relation element and its child fe-relation elements from frRelation.xml."""
info = self._load_xml_attributes(AttrDict(), elt)
- info["_type"] = "framerelationtype"
- info["frameRelations"] = PrettyList()
+ info['_type'] = 'framerelationtype'
+ info['frameRelations'] = PrettyList()
for sub in elt:
- if sub.tag.endswith("frameRelation"):
+ if sub.tag.endswith('frameRelation'):
frel = self._handle_framerelation_elt(sub)
- frel["type"] = info # backpointer
+ frel['type'] = info # backpointer
for ferel in frel.feRelations:
- ferel["type"] = info
- info["frameRelations"].append(frel)
+ ferel['type'] = info
+ info['frameRelations'].append(frel)
return info
def _handle_framerelation_elt(self, elt):
"""Load frame-relation element and its child fe-relation elements from frRelation.xml."""
info = self._load_xml_attributes(AttrDict(), elt)
- assert info["superFrameName"] != info["subFrameName"], (elt, info)
- info["_type"] = "framerelation"
- info["feRelations"] = PrettyList()
+ assert info['superFrameName']!=info['subFrameName'],(elt,info)
+ info['_type'] = 'framerelation'
+ info['feRelations'] = PrettyList()
for sub in elt:
- if sub.tag.endswith("FERelation"):
+ if sub.tag.endswith('FERelation'):
ferel = self._handle_elt(sub)
- ferel["_type"] = "ferelation"
- ferel["frameRelation"] = info # backpointer
- info["feRelations"].append(ferel)
+ ferel['_type'] = 'ferelation'
+ ferel['frameRelation'] = info # backpointer
+ info['feRelations'].append(ferel)
return info
element (which we ignore here) and a bunch of 'sentence'
elements."""
info = AttrDict()
- info["_type"] = "fulltext_annotation"
- info["sentence"] = []
+ info['_type'] = 'fulltext_annotation'
+ info['sentence'] = []
for sub in elt:
- if sub.tag.endswith("header"):
+ if sub.tag.endswith('header'):
continue # not used
- elif sub.tag.endswith("sentence"):
+ elif sub.tag.endswith('sentence'):
s = self._handle_fulltext_sentence_elt(sub)
s.doc = info
- info["sentence"].append(s)
+ info['sentence'].append(s)
return info
'sentence' element contains "text" and "annotationSet" sub
elements."""
info = self._load_xml_attributes(AttrDict(), elt)
- info["_type"] = "fulltext_sentence"
- info["annotationSet"] = []
- info["targets"] = []
+ info['_type'] = "fulltext_sentence"
+ info['annotationSet'] = []
+ info['targets'] = []
target_spans = set()
- info["_ascii"] = types.MethodType(
- _annotation_ascii, info
- ) # attach a method for this instance
- info["text"] = ""
+ info['_ascii'] = types.MethodType(_annotation_ascii, info) # attach a method for this instance
+ info['text'] = ""
for sub in elt:
- if sub.tag.endswith("text"):
- info["text"] = self._strip_tags(sub.text)
- elif sub.tag.endswith("annotationSet"):
- a = self._handle_fulltextannotationset_elt(
- sub, is_pos=(len(info["annotationSet"]) == 0)
- )
- if "cxnID" in a: # ignoring construction annotations for now
+ if sub.tag.endswith('text'):
+ info['text'] = self._strip_tags(sub.text)
+ elif sub.tag.endswith('annotationSet'):
+ a = self._handle_fulltextannotationset_elt(sub, is_pos=(len(info['annotationSet'])==0))
+ if 'cxnID' in a: # ignoring construction annotations for now
continue
a.sent = info
a.text = info.text
- info["annotationSet"].append(a)
- if "Target" in a:
+ info['annotationSet'].append(a)
+ if 'Target' in a:
for tspan in a.Target:
if tspan in target_spans:
- self._warn(
- 'Duplicate target span "{0}"'.format(
- info.text[slice(*tspan)]
- ),
- tspan,
- "in sentence",
- info["ID"],
- info.text,
- )
+ self._warn('Duplicate target span "{0}"'.format(info.text[slice(*tspan)]),
+ tspan, 'in sentence',info['ID'], info.text)
# this can happen in cases like "chemical and biological weapons"
# being annotated as "chemical weapons" and "biological weapons"
else:
target_spans.add(tspan)
- info["targets"].append((a.Target, a.luName, a.frameName))
+ info['targets'].append((a.Target, a.luName, a.frameName))
- assert info["annotationSet"][0].status == "UNANN"
- info["POS"] = info["annotationSet"][0].POS
- info["POS_tagset"] = info["annotationSet"][0].POS_tagset
+ assert info['annotationSet'][0].status=='UNANN'
+ info['POS'] = info['annotationSet'][0].POS
+ info['POS_tagset'] = info['annotationSet'][0].POS_tagset
return info
def _handle_fulltextannotationset_elt(self, elt, is_pos=False):
info = self._handle_luannotationset_elt(elt, is_pos=is_pos)
if not is_pos:
- info["_type"] = "fulltext_annotationset"
- if "cxnID" not in info: # ignoring construction annotations for now
- info["LU"] = self.lu(
- info.luID,
- luName=info.luName,
- frameID=info.frameID,
- frameName=info.frameName,
- )
- info["frame"] = info.LU.frame
+ info['_type'] = 'fulltext_annotationset'
+ if 'cxnID' not in info: # ignoring construction annotations for now
+ info['LU'] = self.lu(info.luID, luName=info.luName, frameID=info.frameID, frameName=info.frameName)
+ info['frame'] = info.LU.frame
return info
def _handle_fulltextlayer_elt(self, elt):
"""Load information from the given 'layer' element. Each
'layer' contains several "label" elements."""
info = self._load_xml_attributes(AttrDict(), elt)
- info["_type"] = "layer"
- info["label"] = []
+ info['_type'] = 'layer'
+ info['label'] = []
for sub in elt:
- if sub.tag.endswith("label"):
+ if sub.tag.endswith('label'):
l = self._load_xml_attributes(AttrDict(), sub)
- info["label"].append(l)
+ info['label'].append(l)
return info
def _handle_framelexunit_elt(self, elt):
"""Load the lexical unit info from an xml element in a frame's xml file."""
luinfo = AttrDict()
- luinfo["_type"] = "lu"
+ luinfo['_type'] = 'lu'
luinfo = self._load_xml_attributes(luinfo, elt)
luinfo["definition"] = ""
luinfo["definitionMarkup"] = ""
luinfo["sentenceCount"] = PrettyDict()
- luinfo["lexemes"] = PrettyList() # multiword LUs have multiple lexemes
- luinfo["semTypes"] = PrettyList() # an LU can have multiple semtypes
+ luinfo['lexemes'] = PrettyList() # multiword LUs have multiple lexemes
+ luinfo['semTypes'] = PrettyList() # an LU can have multiple semtypes
for sub in elt:
- if sub.tag.endswith("definition"):
- luinfo["definitionMarkup"] = sub.text
- luinfo["definition"] = self._strip_tags(sub.text)
- elif sub.tag.endswith("sentenceCount"):
- luinfo["sentenceCount"] = self._load_xml_attributes(PrettyDict(), sub)
- elif sub.tag.endswith("lexeme"):
+ if sub.tag.endswith('definition'):
+ luinfo['definitionMarkup'] = sub.text
+ luinfo['definition'] = self._strip_tags(sub.text)
+ elif sub.tag.endswith('sentenceCount'):
+ luinfo['sentenceCount'] = self._load_xml_attributes(
+ PrettyDict(), sub)
+ elif sub.tag.endswith('lexeme'):
lexemeinfo = self._load_xml_attributes(PrettyDict(), sub)
- if not isinstance(lexemeinfo.name, str):
+ if not isinstance(lexemeinfo.name, string_types):
# some lexeme names are ints by default: e.g.,
# thousand.num has lexeme with name="1000"
lexemeinfo.name = str(lexemeinfo.name)
- luinfo["lexemes"].append(lexemeinfo)
- elif sub.tag.endswith("semType"):
+ luinfo['lexemes'].append(lexemeinfo)
+ elif sub.tag.endswith('semType'):
semtypeinfo = self._load_xml_attributes(PrettyDict(), sub)
- luinfo["semTypes"].append(self.semtype(semtypeinfo.ID))
+ luinfo['semTypes'].append(self.semtype(semtypeinfo.ID))
# sort lexemes by 'order' attribute
# otherwise, e.g., 'write down.v' may have lexemes in the wrong order
- luinfo["lexemes"].sort(key=lambda x: x.order)
+ luinfo['lexemes'].sort(key=lambda x: x.order)
return luinfo
(which are not included in frame files).
"""
luinfo = self._load_xml_attributes(AttrDict(), elt)
- luinfo["_type"] = "lu"
- luinfo["definition"] = ""
- luinfo["definitionMarkup"] = ""
- luinfo["subCorpus"] = PrettyList()
- luinfo["lexemes"] = PrettyList() # multiword LUs have multiple lexemes
- luinfo["semTypes"] = PrettyList() # an LU can have multiple semtypes
+ luinfo['_type'] = 'lu'
+ luinfo['definition'] = ""
+ luinfo['definitionMarkup'] = ""
+ luinfo['subCorpus'] = PrettyList()
+ luinfo['lexemes'] = PrettyList() # multiword LUs have multiple lexemes
+ luinfo['semTypes'] = PrettyList() # an LU can have multiple semtypes
for k in ignorekeys:
if k in luinfo:
del luinfo[k]
for sub in elt:
- if sub.tag.endswith("header"):
+ if sub.tag.endswith('header'):
continue # not used
- elif sub.tag.endswith("valences"):
+ elif sub.tag.endswith('valences'):
continue # not used
- elif sub.tag.endswith("definition") and "definition" not in ignorekeys:
- luinfo["definitionMarkup"] = sub.text
- luinfo["definition"] = self._strip_tags(sub.text)
- elif sub.tag.endswith("subCorpus") and "subCorpus" not in ignorekeys:
+ elif sub.tag.endswith('definition') and 'definition' not in ignorekeys:
+ luinfo['definitionMarkup'] = sub.text
+ luinfo['definition'] = self._strip_tags(sub.text)
+ elif sub.tag.endswith('subCorpus') and 'subCorpus' not in ignorekeys:
sc = self._handle_lusubcorpus_elt(sub)
if sc is not None:
- luinfo["subCorpus"].append(sc)
- elif sub.tag.endswith("lexeme") and "lexeme" not in ignorekeys:
- luinfo["lexemes"].append(self._load_xml_attributes(PrettyDict(), sub))
- elif sub.tag.endswith("semType") and "semType" not in ignorekeys:
+ luinfo['subCorpus'].append(sc)
+ elif sub.tag.endswith('lexeme') and 'lexeme' not in ignorekeys:
+ luinfo['lexemes'].append(self._load_xml_attributes(PrettyDict(), sub))
+ elif sub.tag.endswith('semType') and 'semType' not in ignorekeys:
semtypeinfo = self._load_xml_attributes(AttrDict(), sub)
- luinfo["semTypes"].append(self.semtype(semtypeinfo.ID))
+ luinfo['semTypes'].append(self.semtype(semtypeinfo.ID))
return luinfo
"""Load a subcorpus of a lexical unit from the given xml."""
sc = AttrDict()
try:
- sc["name"] = elt.get("name")
+ sc['name'] = elt.get('name')
except AttributeError:
return None
- sc["_type"] = "lusubcorpus"
- sc["sentence"] = []
+ sc['_type'] = "lusubcorpus"
+ sc['sentence'] = []
for sub in elt:
- if sub.tag.endswith("sentence"):
+ if sub.tag.endswith('sentence'):
s = self._handle_lusentence_elt(sub)
if s is not None:
- sc["sentence"].append(s)
+ sc['sentence'].append(s)
return sc
def _handle_lusentence_elt(self, elt):
"""Load a sentence from a subcorpus of an LU from xml."""
info = self._load_xml_attributes(AttrDict(), elt)
- info["_type"] = "lusentence"
- info["annotationSet"] = []
- info["_ascii"] = types.MethodType(
- _annotation_ascii, info
- ) # attach a method for this instance
+ info['_type'] = 'lusentence'
+ info['annotationSet'] = []
+ info['_ascii'] = types.MethodType(_annotation_ascii, info) # attach a method for this instance
for sub in elt:
- if sub.tag.endswith("text"):
- info["text"] = self._strip_tags(sub.text)
- elif sub.tag.endswith("annotationSet"):
- annset = self._handle_luannotationset_elt(
- sub, is_pos=(len(info["annotationSet"]) == 0)
- )
+ if sub.tag.endswith('text'):
+ info['text'] = self._strip_tags(sub.text)
+ elif sub.tag.endswith('annotationSet'):
+ annset = self._handle_luannotationset_elt(sub, is_pos=(len(info['annotationSet'])==0))
if annset is not None:
- assert annset.status == "UNANN" or "FE" in annset, annset
- if annset.status != "UNANN":
- info["frameAnnotation"] = annset
+ assert annset.status=='UNANN' or 'FE' in annset,annset
+ if annset.status!='UNANN':
+ info['frameAnnotation'] = annset
# copy layer info up to current level
- for k in (
- "Target",
- "FE",
- "FE2",
- "FE3",
- "GF",
- "PT",
- "POS",
- "POS_tagset",
- "Other",
- "Sent",
- "Verb",
- "Noun",
- "Adj",
- "Adv",
- "Prep",
- "Scon",
- "Art",
- ):
+ for k in ('Target', 'FE', 'FE2', 'FE3', 'GF', 'PT', 'POS', 'POS_tagset',
+ 'Other', 'Sent', 'Verb', 'Noun', 'Adj', 'Adv', 'Prep', 'Scon', 'Art'):
if k in annset:
info[k] = annset[k]
- info["annotationSet"].append(annset)
- annset["sent"] = info
- annset["text"] = info.text
+ info['annotationSet'].append(annset)
+ annset['sent'] = info
+ annset['text'] = info.text
return info
def _handle_luannotationset_elt(self, elt, is_pos=False):
"""Load an annotation set from a sentence in an subcorpus of an LU"""
info = self._load_xml_attributes(AttrDict(), elt)
- info["_type"] = "posannotationset" if is_pos else "luannotationset"
- info["layer"] = []
- info["_ascii"] = types.MethodType(
- _annotation_ascii, info
- ) # attach a method for this instance
+ info['_type'] = 'posannotationset' if is_pos else 'luannotationset'
+ info['layer'] = []
+ info['_ascii'] = types.MethodType(_annotation_ascii, info) # attach a method for this instance
- if "cxnID" in info: # ignoring construction annotations for now.
+ if 'cxnID' in info: # ignoring construction annotations for now.
return info
for sub in elt:
- if sub.tag.endswith("layer"):
+ if sub.tag.endswith('layer'):
l = self._handle_lulayer_elt(sub)
if l is not None:
overt = []
- ni = {} # null instantiations
+ ni = {} # null instantiations
- info["layer"].append(l)
+ info['layer'].append(l)
for lbl in l.label:
- if "start" in lbl:
- thespan = (lbl.start, lbl.end + 1, lbl.name)
- if l.name not in (
- "Sent",
- "Other",
- ): # 'Sent' and 'Other' layers sometimes contain accidental duplicate spans
- assert thespan not in overt, (info.ID, l.name, thespan)
+ if 'start' in lbl:
+ thespan = (lbl.start,lbl.end+1,lbl.name)
+ if l.name not in ('Sent','Other'): # 'Sent' and 'Other' layers sometimes contain accidental duplicate spans
+ assert thespan not in overt,(info.ID,l.name,thespan)
overt.append(thespan)
- else: # null instantiation
+ else: # null instantiation
if lbl.name in ni:
- self._warn(
- "FE with multiple NI entries:",
- lbl.name,
- ni[lbl.name],
- lbl.itype,
- )
+ self._warn('FE with multiple NI entries:', lbl.name, ni[lbl.name], lbl.itype)
else:
ni[lbl.name] = lbl.itype
overt = sorted(overt)
- if l.name == "Target":
+ if l.name=='Target':
if not overt:
- self._warn(
- "Skipping empty Target layer in annotation set ID={0}".format(
- info.ID
- )
- )
+ self._warn('Skipping empty Target layer in annotation set ID={0}'.format(info.ID))
continue
- assert all(lblname == "Target" for i, j, lblname in overt)
- if "Target" in info:
- self._warn(
- "Annotation set {0} has multiple Target layers".format(
- info.ID
- )
- )
+ assert all(lblname=='Target' for i,j,lblname in overt)
+ if 'Target' in info:
+ self._warn('Annotation set {0} has multiple Target layers'.format(info.ID))
else:
- info["Target"] = [(i, j) for (i, j, _) in overt]
- elif l.name == "FE":
- if l.rank == 1:
- assert "FE" not in info
- info["FE"] = (overt, ni)
- # assert False,info
+ info['Target'] = [(i,j) for (i,j,_) in overt]
+ elif l.name=='FE':
+ if l.rank==1:
+ assert 'FE' not in info
+ info['FE'] = (overt, ni)
+ #assert False,info
else:
# sometimes there are 3 FE layers! e.g. Change_position_on_a_scale.fall.v
- assert 2 <= l.rank <= 3, l.rank
- k = "FE" + str(l.rank)
+ assert 2<=l.rank<=3,l.rank
+ k = 'FE'+str(l.rank)
assert k not in info
info[k] = (overt, ni)
- elif l.name in ("GF", "PT"):
- assert l.rank == 1
+ elif l.name in ('GF', 'PT'):
+ assert l.rank==1
info[l.name] = overt
- elif l.name in ("BNC", "PENN"):
- assert l.rank == 1
- info["POS"] = overt
- info["POS_tagset"] = l.name
+ elif l.name in ('BNC', 'PENN'):
+ assert l.rank==1
+ info['POS'] = overt
+ info['POS_tagset'] = l.name
else:
if is_pos:
- if l.name not in ("NER", "WSL"):
- self._warn(
- "Unexpected layer in sentence annotationset:",
- l.name,
- )
+ if l.name not in ('NER', 'WSL'):
+ self._warn('Unexpected layer in sentence annotationset:', l.name)
else:
- if l.name not in (
- "Sent",
- "Verb",
- "Noun",
- "Adj",
- "Adv",
- "Prep",
- "Scon",
- "Art",
- "Other",
- ):
- self._warn(
- "Unexpected layer in frame annotationset:", l.name
- )
+ if l.name not in ('Sent', 'Verb', 'Noun', 'Adj', 'Adv', 'Prep', 'Scon', 'Art', 'Other'):
+ self._warn('Unexpected layer in frame annotationset:', l.name)
info[l.name] = overt
- if not is_pos and "cxnID" not in info:
- if "Target" not in info:
- self._warn("Missing target in annotation set ID={0}".format(info.ID))
- assert "FE" in info
- if "FE3" in info:
- assert "FE2" in info
+ if not is_pos and 'cxnID' not in info:
+ if 'Target' not in info:
+ self._warn('Missing target in annotation set ID={0}'.format(info.ID))
+ assert 'FE' in info
+ if 'FE3' in info:
+ assert 'FE2' in info
return info
def _handle_lulayer_elt(self, elt):
"""Load a layer from an annotation set"""
layer = self._load_xml_attributes(AttrDict(), elt)
- layer["_type"] = "lulayer"
- layer["label"] = []
+ layer['_type'] = 'lulayer'
+ layer['label'] = []
for sub in elt:
- if sub.tag.endswith("label"):
+ if sub.tag.endswith('label'):
l = self._load_xml_attributes(AttrDict(), sub)
if l is not None:
- layer["label"].append(l)
+ layer['label'].append(l)
return layer
def _handle_fe_elt(self, elt):
feinfo = self._load_xml_attributes(AttrDict(), elt)
- feinfo["_type"] = "fe"
- feinfo["definition"] = ""
- feinfo["definitionMarkup"] = ""
- feinfo["semType"] = None
- feinfo["requiresFE"] = None
- feinfo["excludesFE"] = None
+ feinfo['_type'] = 'fe'
+ feinfo['definition'] = ""
+ feinfo['definitionMarkup'] = ""
+ feinfo['semType'] = None
+ feinfo['requiresFE'] = None
+ feinfo['excludesFE'] = None
for sub in elt:
- if sub.tag.endswith("definition"):
- feinfo["definitionMarkup"] = sub.text
- feinfo["definition"] = self._strip_tags(sub.text)
- elif sub.tag.endswith("semType"):
+ if sub.tag.endswith('definition'):
+ feinfo['definitionMarkup'] = sub.text
+ feinfo['definition'] = self._strip_tags(sub.text)
+ elif sub.tag.endswith('semType'):
stinfo = self._load_xml_attributes(AttrDict(), sub)
- feinfo["semType"] = self.semtype(stinfo.ID)
- elif sub.tag.endswith("requiresFE"):
- feinfo["requiresFE"] = self._load_xml_attributes(AttrDict(), sub)
- elif sub.tag.endswith("excludesFE"):
- feinfo["excludesFE"] = self._load_xml_attributes(AttrDict(), sub)
+ feinfo['semType'] = self.semtype(stinfo.ID)
+ elif sub.tag.endswith('requiresFE'):
+ feinfo['requiresFE'] = self._load_xml_attributes(AttrDict(), sub)
+ elif sub.tag.endswith('excludesFE'):
+ feinfo['excludesFE'] = self._load_xml_attributes(AttrDict(), sub)
return feinfo
def _handle_semtype_elt(self, elt, tagspec=None):
semt = self._load_xml_attributes(AttrDict(), elt)
- semt["_type"] = "semtype"
- semt["superType"] = None
- semt["subTypes"] = PrettyList()
+ semt['_type'] = 'semtype'
+ semt['superType'] = None
+ semt['subTypes'] = PrettyList()
for sub in elt:
if sub.text is not None:
- semt["definitionMarkup"] = sub.text
- semt["definition"] = self._strip_tags(sub.text)
+ semt['definitionMarkup'] = sub.text
+ semt['definition'] = self._strip_tags(sub.text)
else:
supertypeinfo = self._load_xml_attributes(AttrDict(), sub)
- semt["superType"] = supertypeinfo
+ semt['superType'] = supertypeinfo
# the supertype may not have been loaded yet
return semt
# buildindexes(). We do this here just for demo purposes. If the
# indexes are not built explicitly, they will be built as needed.
#
- print("Building the indexes...")
+ print('Building the indexes...')
fn.buildindexes()
#
# Get some statistics about the corpus
#
- print("Number of Frames:", len(fn.frames()))
- print("Number of Lexical Units:", len(fn.lus()))
- print("Number of annotated documents:", len(fn.docs()))
+ print('Number of Frames:', len(fn.frames()))
+ print('Number of Lexical Units:', len(fn.lus()))
+ print('Number of annotated documents:', len(fn.docs()))
print()
#
# Frames
#
+ print('getting frames whose name matches the (case insensitive) regex: "(?i)medical"')
+ medframes = fn.frames(r'(?i)medical')
print(
- 'getting frames whose name matches the (case insensitive) regex: "(?i)medical"'
- )
- medframes = fn.frames(r"(?i)medical")
- print('Found {0} Frames whose name matches "(?i)medical":'.format(len(medframes)))
+ 'Found {0} Frames whose name matches "(?i)medical":'.format(len(medframes)))
print([(f.name, f.ID) for f in medframes])
#
# get the frame relations
#
print(
- '\nNumber of frame relations for the "{0}" ({1}) frame:'.format(
- m_frame.name, m_frame.ID
- ),
- len(m_frame.frameRelations),
- )
+ '\nNumber of frame relations for the "{0}" ({1}) frame:'.format(m_frame.name,
+ m_frame.ID),
+ len(m_frame.frameRelations))
for fr in m_frame.frameRelations:
- print(" ", fr)
+ print(' ', fr)
#
# get the names of the Frame Elements
#
print(
'\nNumber of Frame Elements in the "{0}" frame:'.format(m_frame.name),
- len(m_frame.FE),
- )
- print(" ", [x for x in m_frame.FE])
+ len(m_frame.FE))
+ print(' ', [x for x in m_frame.FE])
#
# get the names of the "Core" Frame Elements
#
- print('\nThe "core" Frame Elements in the "{0}" frame:'.format(m_frame.name))
- print(" ", [x.name for x in m_frame.FE.values() if x.coreType == "Core"])
+ print(
+ '\nThe "core" Frame Elements in the "{0}" frame:'.format(m_frame.name))
+ print(' ', [x.name for x in m_frame.FE.values() if x.coreType == "Core"])
#
# get all of the Lexical Units that are incorporated in the
#
print('\nAll Lexical Units that are incorporated in the "Ailment" FE:')
m_frame = fn.frame(239)
- ailment_lus = [
- x
- for x in m_frame.lexUnit.values()
- if "incorporatedFE" in x and x.incorporatedFE == "Ailment"
- ]
- print(" ", [x.name for x in ailment_lus])
+ ailment_lus = [x for x in m_frame.lexUnit.values() if 'incorporatedFE' in x and x.incorporatedFE == 'Ailment']
+ print(' ', [x.name for x in ailment_lus])
#
# get all of the Lexical Units for the frame
#
- print(
- '\nNumber of Lexical Units in the "{0}" frame:'.format(m_frame.name),
- len(m_frame.lexUnit),
- )
- print(" ", [x.name for x in m_frame.lexUnit.values()][:5], "...")
+ print('\nNumber of Lexical Units in the "{0}" frame:'.format(m_frame.name),
+ len(m_frame.lexUnit))
+ print(' ', [x.name for x in m_frame.lexUnit.values()][:5], '...')
#
# get basic info on the second LU in the frame
#
- tmp_id = m_frame.lexUnit["ailment.n"].ID # grab the id of the specified LU
+ tmp_id = m_frame.lexUnit['ailment.n'].ID # grab the id of the specified LU
luinfo = fn.lu_basic(tmp_id) # get basic info on the LU
- print("\nInformation on the LU: {0}".format(luinfo.name))
+ print('\nInformation on the LU: {0}'.format(luinfo.name))
pprint(luinfo)
#
# Get a list of all of the corpora used for fulltext annotation
#
- print("\nNames of all of the corpora used for fulltext annotation:")
- allcorpora = set(x.corpname for x in fn.docs_metadata())
+ print('\nNames of all of the corpora used for fulltext annotation:')
+ allcorpora = set([x.corpname for x in fn.docs_metadata()])
pprint(list(allcorpora))
#
#
firstcorp = list(allcorpora)[0]
firstcorp_docs = fn.docs(firstcorp)
- print('\nNames of the annotated documents in the "{0}" corpus:'.format(firstcorp))
+ print(
+ '\nNames of the annotated documents in the "{0}" corpus:'.format(firstcorp))
pprint([x.filename for x in firstcorp_docs])
#
# lemmas to frames because each time frames_by_lemma() is
# called, it has to search through ALL of the frame XML files
# in the db.
- print(
- '\nSearching for all Frames that have a lemma that matches the regexp: "^run.v$":'
- )
- pprint(fn.frames_by_lemma(r"^run.v$"))
-
+ print('\nSearching for all Frames that have a lemma that matches the regexp: "^run.v$":')
+ pprint(fn.frames_by_lemma(r'^run.v$'))
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: IEER Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
The corpus contains the following files: APW_19980314, APW_19980424,
APW_19980429, NYT_19980315, NYT_19980403, and NYT_19980407.
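+Example (a sketch; assumes the corpus has been downloaded, e.g. via
+``nltk.download('ieer')``):
+    >>> from nltk.corpus import ieer
+    >>> doc = ieer.parsed_docs('NYT_19980315')[0] # doctest: +SKIP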
"""
+from __future__ import unicode_literals
+
+from six import string_types
import nltk
+from nltk import compat
from nltk.corpus.reader.api import *
#: A dictionary whose keys are the names of documents in this corpus;
#: and whose values are descriptions of those documents' contents.
titles = {
- "APW_19980314": "Associated Press Weekly, 14 March 1998",
- "APW_19980424": "Associated Press Weekly, 24 April 1998",
- "APW_19980429": "Associated Press Weekly, 29 April 1998",
- "NYT_19980315": "New York Times, 15 March 1998",
- "NYT_19980403": "New York Times, 3 April 1998",
- "NYT_19980407": "New York Times, 7 April 1998",
-}
+ 'APW_19980314': 'Associated Press Weekly, 14 March 1998',
+ 'APW_19980424': 'Associated Press Weekly, 24 April 1998',
+ 'APW_19980429': 'Associated Press Weekly, 29 April 1998',
+ 'NYT_19980315': 'New York Times, 15 March 1998',
+ 'NYT_19980403': 'New York Times, 3 April 1998',
+ 'NYT_19980407': 'New York Times, 7 April 1998',
+ }
#: A list of all documents in this corpus.
documents = sorted(titles)
-
-
+@compat.python_2_unicode_compatible
class IEERDocument(object):
- def __init__(self, text, docno=None, doctype=None, date_time=None, headline=""):
+ def __init__(self, text, docno=None, doctype=None,
+ date_time=None, headline=''):
self.text = text
self.docno = docno
self.doctype = doctype
def __repr__(self):
if self.headline:
- headline = " ".join(self.headline.leaves())
+ headline = ' '.join(self.headline.leaves())
else:
- headline = (
- " ".join([w for w in self.text.leaves() if w[:1] != "<"][:12]) + "..."
- )
+ headline = ' '.join([w for w in self.text.leaves()
+ if w[:1] != '<'][:12])+'...'
if self.docno is not None:
- return "<IEERDocument %s: %r>" % (self.docno, headline)
+ return '<IEERDocument %s: %r>' % (self.docno, headline)
else:
- return "<IEERDocument: %r>" % headline
-
+ return '<IEERDocument: %r>' % headline
class IEERCorpusReader(CorpusReader):
"""
"""
-
def raw(self, fileids=None):
- if fileids is None:
- fileids = self._fileids
- elif isinstance(fileids, str):
- fileids = [fileids]
+ if fileids is None: fileids = self._fileids
+ elif isinstance(fileids, string_types): fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
def docs(self, fileids=None):
- return concat(
- [
- StreamBackedCorpusView(fileid, self._read_block, encoding=enc)
- for (fileid, enc) in self.abspaths(fileids, True)
- ]
- )
+ return concat([StreamBackedCorpusView(fileid, self._read_block,
+ encoding=enc)
+ for (fileid, enc) in self.abspaths(fileids, True)])
def parsed_docs(self, fileids=None):
- return concat(
- [
- StreamBackedCorpusView(fileid, self._read_parsed_block, encoding=enc)
- for (fileid, enc) in self.abspaths(fileids, True)
- ]
- )
-
- def _read_parsed_block(self, stream):
+ return concat([StreamBackedCorpusView(fileid,
+ self._read_parsed_block,
+ encoding=enc)
+ for (fileid, enc) in self.abspaths(fileids, True)])
+
+ def _read_parsed_block(self,stream):
# TODO: figure out why empty documents are being returned
- return [
- self._parse(doc)
- for doc in self._read_block(stream)
- if self._parse(doc).docno is not None
- ]
+ return [self._parse(doc) for doc in self._read_block(stream)
+ if self._parse(doc).docno is not None]
def _parse(self, doc):
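+        # nltk.chunk.ieerstr2tree parses the document's inline IEER entity
+        # markup into a chunk structure rooted at DOCUMENT.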
val = nltk.chunk.ieerstr2tree(doc, root_label="DOCUMENT")
# Skip any preamble.
while True:
line = stream.readline()
- if not line:
- break
- if line.strip() == "<DOC>":
- break
+ if not line: break
+ if line.strip() == '<DOC>': break
out.append(line)
# Read the document
while True:
line = stream.readline()
- if not line:
- break
+ if not line: break
out.append(line)
- if line.strip() == "</DOC>":
- break
+ if line.strip() == '</DOC>': break
# Return the document
- return ["\n".join(out)]
+ return ['\n'.join(out)]
# Natural Language Toolkit: Indian Language POS-Tagged Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
- Telugu: IIIT Hyderabad
"""
+from six import string_types
+
from nltk.tag import str2tuple, map_tag
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
-
class IndianCorpusReader(CorpusReader):
"""
List of words, one per line. Blank lines are ignored.
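+    Example (a sketch; assumes the 'indian' corpus is installed; 'hindi.pos'
+    is one of its fileids):
+    >>> from nltk.corpus import indian
+    >>> indian.tagged_words('hindi.pos')[:2] # doctest: +SKIP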
"""
-
def words(self, fileids=None):
- return concat(
- [
- IndianCorpusView(fileid, enc, False, False)
- for (fileid, enc) in self.abspaths(fileids, True)
- ]
- )
+ return concat([IndianCorpusView(fileid, enc,
+ False, False)
+ for (fileid, enc) in self.abspaths(fileids, True)])
def tagged_words(self, fileids=None, tagset=None):
if tagset and tagset != self._tagset:
tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t)
else:
tag_mapping_function = None
- return concat(
- [
- IndianCorpusView(fileid, enc, True, False, tag_mapping_function)
- for (fileid, enc) in self.abspaths(fileids, True)
- ]
- )
+ return concat([IndianCorpusView(fileid, enc,
+ True, False, tag_mapping_function)
+ for (fileid, enc) in self.abspaths(fileids, True)])
def sents(self, fileids=None):
- return concat(
- [
- IndianCorpusView(fileid, enc, False, True)
- for (fileid, enc) in self.abspaths(fileids, True)
- ]
- )
+ return concat([IndianCorpusView(fileid, enc,
+ False, True)
+ for (fileid, enc) in self.abspaths(fileids, True)])
def tagged_sents(self, fileids=None, tagset=None):
if tagset and tagset != self._tagset:
tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t)
else:
tag_mapping_function = None
- return concat(
- [
- IndianCorpusView(fileid, enc, True, True, tag_mapping_function)
- for (fileid, enc) in self.abspaths(fileids, True)
- ]
- )
+ return concat([IndianCorpusView(fileid, enc,
+ True, True, tag_mapping_function)
+ for (fileid, enc) in self.abspaths(fileids, True)])
def raw(self, fileids=None):
- if fileids is None:
- fileids = self._fileids
- elif isinstance(fileids, str):
- fileids = [fileids]
+ if fileids is None: fileids = self._fileids
+ elif isinstance(fileids, string_types): fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
class IndianCorpusView(StreamBackedCorpusView):
- def __init__(
- self, corpus_file, encoding, tagged, group_by_sent, tag_mapping_function=None
- ):
+ def __init__(self, corpus_file, encoding, tagged,
+ group_by_sent, tag_mapping_function=None):
self._tagged = tagged
self._group_by_sent = group_by_sent
self._tag_mapping_function = tag_mapping_function
def read_block(self, stream):
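+        # Each block is a single line holding one sentence of 'word_TAG'
+        # tokens; lines starting with '<' are metadata and yield no tokens.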
line = stream.readline()
- if line.startswith("<"):
+ if line.startswith('<'):
return []
- sent = [str2tuple(word, sep="_") for word in line.split()]
+ sent = [str2tuple(word, sep='_') for word in line.split()]
if self._tag_mapping_function:
- sent = [(w, self._tag_mapping_function(t)) for (w, t) in sent]
- if not self._tagged:
- sent = [w for (w, t) in sent]
+ sent = [(w, self._tag_mapping_function(t)) for (w,t) in sent]
+ if not self._tagged: sent = [w for (w,t) in sent]
if self._group_by_sent:
return [sent]
else:
# Natural Language Toolkit: IPI PAN Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Konrad Goluchowski <kodie@mimuw.edu.pl>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
import functools
+from six import string_types
+
from nltk.corpus.reader.util import StreamBackedCorpusView, concat
from nltk.corpus.reader.api import CorpusReader
-
def _parse_args(fun):
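+    # Shared decorator for the reader methods below: it drops any stray 'tags'
+    # keyword and substitutes the full fileid list when none is given.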
@functools.wraps(fun)
def decorator(self, fileids=None, **kwargs):
- kwargs.pop("tags", None)
+ kwargs.pop('tags', None)
if not fileids:
fileids = self.fileids()
return fun(self, fileids, **kwargs)
-
return decorator
-
class IPIPANCorpusReader(CorpusReader):
"""
Corpus reader designed to work with corpus created by IPI PAN.
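+    Usage sketch (the root path and fileid here are hypothetical; the IPI PAN
+    corpus is distributed separately from NLTK):
+    >>> reader = IPIPANCorpusReader('/path/to/ipipan', ['sample/morph.xml']) # doctest: +SKIP
+    >>> reader.tagged_words(disamb_only=True) # doctest: +SKIP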
filecontents = []
for fileid in self._list_morph_files(fileids):
- with open(fileid, "r") as infile:
+ with open(fileid, 'r') as infile:
filecontents.append(infile.read())
- return "".join(filecontents)
+ return ''.join(filecontents)
def channels(self, fileids=None):
if not fileids:
fileids = self.fileids()
- return self._parse_header(fileids, "channel")
+ return self._parse_header(fileids, 'channel')
def domains(self, fileids=None):
if not fileids:
fileids = self.fileids()
- return self._parse_header(fileids, "domain")
+ return self._parse_header(fileids, 'domain')
def categories(self, fileids=None):
if not fileids:
fileids = self.fileids()
- return [
- self._map_category(cat) for cat in self._parse_header(fileids, "keyTerm")
- ]
+ return [self._map_category(cat)
+ for cat in self._parse_header(fileids, 'keyTerm')]
def fileids(self, channels=None, domains=None, categories=None):
- if channels is not None and domains is not None and categories is not None:
- raise ValueError(
- "You can specify only one of channels, domains "
- "and categories parameter at once"
- )
- if channels is None and domains is None and categories is None:
+ if channels is not None and domains is not None and \
+ categories is not None:
+ raise ValueError('You can specify only one of channels, domains '
+ 'and categories parameter at once')
+ if channels is None and domains is None and \
+ categories is None:
return CorpusReader.fileids(self)
- if isinstance(channels, str):
+ if isinstance(channels, string_types):
channels = [channels]
- if isinstance(domains, str):
+ if isinstance(domains, string_types):
domains = [domains]
- if isinstance(categories, str):
+ if isinstance(categories, string_types):
categories = [categories]
if channels:
- return self._list_morph_files_by("channel", channels)
+ return self._list_morph_files_by('channel', channels)
elif domains:
- return self._list_morph_files_by("domain", domains)
+ return self._list_morph_files_by('domain', domains)
else:
- return self._list_morph_files_by(
- "keyTerm", categories, map=self._map_category
- )
+ return self._list_morph_files_by('keyTerm', categories,
+ map=self._map_category)
@_parse_args
def sents(self, fileids=None, **kwargs):
- return concat(
- [
- self._view(
- fileid, mode=IPIPANCorpusView.SENTS_MODE, tags=False, **kwargs
- )
- for fileid in self._list_morph_files(fileids)
- ]
- )
+ return concat([self._view(fileid,
+ mode=IPIPANCorpusView.SENTS_MODE, tags=False, **kwargs)
+ for fileid in self._list_morph_files(fileids)])
@_parse_args
def paras(self, fileids=None, **kwargs):
- return concat(
- [
- self._view(
- fileid, mode=IPIPANCorpusView.PARAS_MODE, tags=False, **kwargs
- )
- for fileid in self._list_morph_files(fileids)
- ]
- )
+ return concat([self._view(fileid,
+ mode=IPIPANCorpusView.PARAS_MODE, tags=False, **kwargs)
+ for fileid in self._list_morph_files(fileids)])
@_parse_args
def words(self, fileids=None, **kwargs):
- return concat(
- [
- self._view(fileid, tags=False, **kwargs)
- for fileid in self._list_morph_files(fileids)
- ]
- )
+ return concat([self._view(fileid, tags=False, **kwargs)
+ for fileid in self._list_morph_files(fileids)])
@_parse_args
def tagged_sents(self, fileids=None, **kwargs):
- return concat(
- [
- self._view(fileid, mode=IPIPANCorpusView.SENTS_MODE, **kwargs)
- for fileid in self._list_morph_files(fileids)
- ]
- )
+ return concat([self._view(fileid, mode=IPIPANCorpusView.SENTS_MODE,
+ **kwargs)
+ for fileid in self._list_morph_files(fileids)])
@_parse_args
def tagged_paras(self, fileids=None, **kwargs):
- return concat(
- [
- self._view(fileid, mode=IPIPANCorpusView.PARAS_MODE, **kwargs)
- for fileid in self._list_morph_files(fileids)
- ]
- )
+ return concat([self._view(fileid, mode=IPIPANCorpusView.PARAS_MODE,
+ **kwargs)
+ for fileid in self._list_morph_files(fileids)])
@_parse_args
def tagged_words(self, fileids=None, **kwargs):
- return concat(
- [self._view(fileid, **kwargs) for fileid in self._list_morph_files(fileids)]
- )
+ return concat([self._view(fileid, **kwargs)
+ for fileid in self._list_morph_files(fileids)])
def _list_morph_files(self, fileids):
return [f for f in self.abspaths(fileids)]
def _list_header_files(self, fileids):
- return [
- f.replace("morph.xml", "header.xml")
- for f in self._list_morph_files(fileids)
- ]
+ return [f.replace('morph.xml', 'header.xml')
+ for f in self._list_morph_files(fileids)]
def _parse_header(self, fileids, tag):
values = set()
fileids = self.fileids()
ret_fileids = set()
for f in fileids:
- fp = self.abspath(f).replace("morph.xml", "header.xml")
+ fp = self.abspath(f).replace('morph.xml', 'header.xml')
values_list = self._get_tag(fp, tag)
for value in values_list:
if map is not None:
def _get_tag(self, f, tag):
tags = []
- with open(f, "r") as infile:
+ with open(f, 'r') as infile:
header = infile.read()
tag_end = 0
while True:
- tag_pos = header.find("<" + tag, tag_end)
- if tag_pos < 0:
- return tags
- tag_end = header.find("</" + tag + ">", tag_pos)
- tags.append(header[tag_pos + len(tag) + 2 : tag_end])
+ tag_pos = header.find('<'+tag, tag_end)
+ if tag_pos < 0: return tags
+ tag_end = header.find('</'+tag+'>', tag_pos)
+ tags.append(header[tag_pos+len(tag)+2:tag_end])
def _map_category(self, cat):
- pos = cat.find(">")
+ pos = cat.find('>')
if pos == -1:
return cat
else:
- return cat[pos + 1 :]
+ return cat[pos+1:]
def _view(self, filename, **kwargs):
- tags = kwargs.pop("tags", True)
- mode = kwargs.pop("mode", 0)
- simplify_tags = kwargs.pop("simplify_tags", False)
- one_tag = kwargs.pop("one_tag", True)
- disamb_only = kwargs.pop("disamb_only", True)
- append_no_space = kwargs.pop("append_no_space", False)
- append_space = kwargs.pop("append_space", False)
- replace_xmlentities = kwargs.pop("replace_xmlentities", True)
+ tags = kwargs.pop('tags', True)
+ mode = kwargs.pop('mode', 0)
+ simplify_tags = kwargs.pop('simplify_tags', False)
+ one_tag = kwargs.pop('one_tag', True)
+ disamb_only = kwargs.pop('disamb_only', True)
+ append_no_space = kwargs.pop('append_no_space', False)
+ append_space = kwargs.pop('append_space', False)
+ replace_xmlentities = kwargs.pop('replace_xmlentities', True)
if len(kwargs) > 0:
- raise ValueError("Unexpected arguments: %s" % kwargs.keys())
+ raise ValueError('Unexpected arguments: %s' % kwargs.keys())
if not one_tag and not disamb_only:
- raise ValueError(
- "You cannot specify both one_tag=False and " "disamb_only=False"
- )
+ raise ValueError('You cannot specify both one_tag=False and '
+ 'disamb_only=False')
if not tags and (simplify_tags or not one_tag or not disamb_only):
- raise ValueError(
- "You cannot specify simplify_tags, one_tag or "
- "disamb_only with functions other than tagged_*"
- )
-
- return IPIPANCorpusView(
- filename,
- tags=tags,
- mode=mode,
- simplify_tags=simplify_tags,
- one_tag=one_tag,
- disamb_only=disamb_only,
- append_no_space=append_no_space,
- append_space=append_space,
- replace_xmlentities=replace_xmlentities,
- )
+ raise ValueError('You cannot specify simplify_tags, one_tag or '
+ 'disamb_only with functions other than tagged_*')
+
+ return IPIPANCorpusView(filename,
+ tags=tags, mode=mode, simplify_tags=simplify_tags,
+ one_tag=one_tag, disamb_only=disamb_only,
+ append_no_space=append_no_space,
+ append_space=append_space,
+ replace_xmlentities=replace_xmlentities
+ )
class IPIPANCorpusView(StreamBackedCorpusView):
self.in_sentence = False
self.position = 0
- self.show_tags = kwargs.pop("tags", True)
- self.disamb_only = kwargs.pop("disamb_only", True)
- self.mode = kwargs.pop("mode", IPIPANCorpusView.WORDS_MODE)
- self.simplify_tags = kwargs.pop("simplify_tags", False)
- self.one_tag = kwargs.pop("one_tag", True)
- self.append_no_space = kwargs.pop("append_no_space", False)
- self.append_space = kwargs.pop("append_space", False)
- self.replace_xmlentities = kwargs.pop("replace_xmlentities", True)
+ self.show_tags = kwargs.pop('tags', True)
+ self.disamb_only = kwargs.pop('disamb_only', True)
+ self.mode = kwargs.pop('mode', IPIPANCorpusView.WORDS_MODE)
+ self.simplify_tags = kwargs.pop('simplify_tags', False)
+ self.one_tag = kwargs.pop('one_tag', True)
+ self.append_no_space = kwargs.pop('append_no_space', False)
+ self.append_space = kwargs.pop('append_space', False)
+ self.replace_xmlentities = kwargs.pop('replace_xmlentities', True)
def read_block(self, stream):
sentence = []
self._seek(stream)
lines = self._read_data(stream)
- if lines == [""]:
+ if lines == ['']:
assert not sentences
return []
self.in_sentence = True
elif line.startswith('<chunk type="p"'):
pass
- elif line.startswith("<tok"):
+ elif line.startswith('<tok'):
if self.append_space and space and not no_space:
self._append_space(sentence)
space = True
no_space = False
orth = ""
tags = set()
- elif line.startswith("</chunk"):
+ elif line.startswith('</chunk'):
if self.in_sentence:
self.in_sentence = False
self._seek(stream)
elif self.mode == self.PARAS_MODE:
self._seek(stream)
return [sentences]
- elif line.startswith("<orth"):
+ elif line.startswith('<orth'):
orth = line[6:-7]
if self.replace_xmlentities:
- orth = orth.replace(""", '"').replace("&", "&")
- elif line.startswith("<lex"):
- if not self.disamb_only or line.find("disamb=") != -1:
- tag = line[line.index("<ctag") + 6 : line.index("</ctag")]
+ orth = orth.replace('&quot;', '"').replace('&amp;', '&')
+ elif line.startswith('<lex'):
+ if not self.disamb_only or line.find('disamb=') != -1:
+ tag = line[line.index('<ctag')+6 : line.index('</ctag') ]
tags.add(tag)
- elif line.startswith("</tok"):
+ elif line.startswith('</tok'):
if self.show_tags:
if self.simplify_tags:
- tags = [t.split(":")[0] for t in tags]
+ tags = [t.split(':')[0] for t in tags]
if not self.one_tag or not self.disamb_only:
sentence.append((orth, tuple(tags)))
else:
sentence.append((orth, tags.pop()))
else:
sentence.append(orth)
- elif line.startswith("<ns/>"):
+ elif line.startswith('<ns/>'):
if self.append_space:
no_space = True
if self.append_no_space:
if self.show_tags:
- sentence.append(("", "no-space"))
+ sentence.append(('', 'no-space'))
else:
- sentence.append("")
- elif line.startswith("</cesAna"):
+ sentence.append('')
+ elif line.startswith('</cesAna'):
pass
def _read_data(self, stream):
self.position = stream.tell()
buff = stream.read(4096)
- lines = buff.split("\n")
+ lines = buff.split('\n')
lines.reverse()
return lines
def _append_space(self, sentence):
if self.show_tags:
- sentence.append((" ", "space"))
+ sentence.append((' ', 'space'))
else:
- sentence.append(" ")
+ sentence.append(' ')
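# Shape sketch: the branches above emit each token in one of four forms,
# depending on the view's options (the IPI PAN tags below are illustrative):
#
#     'słowa'                              # tags=False
#     ('słowa', 'subst:pl:nom:n')          # default: one_tag and disamb_only
#     ('słowa', ('subst', 'adj'))          # one_tag=False or disamb_only=False
#     (' ', 'space') / ('', 'no-space')    # append_space / append_no_space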
#! /usr/bin/env python
# KNB Corpus reader
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Masato Hagiwara <hagisan@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
# For more information, see http://lilyx.net/pages/nltkjapanesecorpus.html
+from __future__ import print_function
import re
+from six import string_types
from nltk.parse import DependencyGraph
from nltk.corpus.reader.api import SyntaxCorpusReader, CorpusReader
# default function to convert morphlist to str for tree representation
-_morphs2str_default = lambda morphs: "/".join(m[0] for m in morphs if m[0] != "EOS")
+_morphs2str_default = lambda morphs: '/'.join(m[0] for m in morphs if m[0] != 'EOS')
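# Quick illustration: the default converter joins surface forms with '/'
# and drops the trailing EOS sentinel, e.g.
#
#     _morphs2str_default([('猫', '...'), ('が', '...'), ('EOS', '')])
#     # -> '猫/が'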
class KNBCorpusReader(SyntaxCorpusReader):
"""
- def __init__(self, root, fileids, encoding="utf8", morphs2str=_morphs2str_default):
+ def __init__(self, root, fileids, encoding='utf8', morphs2str=_morphs2str_default):
"""
Initialize KNBCorpusReader
morphs2str is a function to convert morphlist to str for tree representation
for _parse()
"""
- # FIXME: Why is it inheriting from SyntaxCorpusReader but initializing
- # from CorpusReader?
CorpusReader.__init__(self, root, fileids, encoding)
self.morphs2str = morphs2str
if not re.match(r"EOS|\*|\#|\+", line):
cells = line.strip().split(" ")
# convert cells to morph tuples
- res.append((cells[0], " ".join(cells[1:])))
+ res.append((cells[0], ' '.join(cells[1:])))
return res
dg = DependencyGraph()
i = 0
for line in t.splitlines():
- if line[0] in "*+":
+ if line[0] in '*+':
# start of bunsetsu or tag
cells = line.strip().split(" ", 3)
assert m is not None
node = dg.nodes[i]
- node.update({"address": i, "rel": m.group(2), "word": []})
+ node.update(
+ {
+ 'address': i,
+ 'rel': m.group(2),
+ 'word': [],
+ }
+ )
dep_parent = int(m.group(1))
if dep_parent == -1:
dg.root = node
else:
- dg.nodes[dep_parent]["deps"].append(i)
+ dg.nodes[dep_parent]['deps'].append(i)
i += 1
- elif line[0] != "#":
+ elif line[0] != '#':
# normal morph
cells = line.strip().split(" ")
# convert cells to morph tuples
- morph = cells[0], " ".join(cells[1:])
- dg.nodes[i - 1]["word"].append(morph)
+ morph = cells[0], ' '.join(cells[1:])
+ dg.nodes[i - 1]['word'].append(morph)
if self.morphs2str:
for node in dg.nodes.values():
- node["word"] = self.morphs2str(node["word"])
+ node['word'] = self.morphs2str(node['word'])
return dg.tree()
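# Input sketch: _parse consumes KNP-style analyses in which lines opening
# with '*' or '+' start a bunsetsu or tag unit (the elided regex match m
# captures the parent index, -1 for the root, and the relation letter from
# a cell such as '1D'), '#' lines are comments, and every other line is a
# 'surface features...' morpheme appended to the current unit.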
-
######################################################################
# Demo
######################################################################
import nltk
from nltk.corpus.util import LazyCorpusLoader
- root = nltk.data.find("corpora/knbc/corpus1")
- fileids = [
- f
- for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
- if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)
- ]
+ root = nltk.data.find('corpora/knbc/corpus1')
+ fileids = [f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
+ if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)]
def _knbc_fileids_sort(x):
- cells = x.split("-")
+ cells = x.split('-')
return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))
- knbc = LazyCorpusLoader(
- "knbc/corpus1",
- KNBCorpusReader,
- sorted(fileids, key=_knbc_fileids_sort),
- encoding="euc-jp",
- )
+ knbc = LazyCorpusLoader('knbc/corpus1', KNBCorpusReader,
+ sorted(fileids, key=_knbc_fileids_sort), encoding='euc-jp')
print(knbc.fileids()[:10])
- print("".join(knbc.words()[:100]))
+ print(''.join(knbc.words()[:100]))
- print("\n\n".join(str(tree) for tree in knbc.parsed_sents()[:2]))
+ print('\n\n'.join(str(tree) for tree in knbc.parsed_sents()[:2]))
- knbc.morphs2str = lambda morphs: "/".join(
- "%s(%s)" % (m[0], m[1].split(" ")[2]) for m in morphs if m[0] != "EOS"
- ).encode("utf-8")
+ knbc.morphs2str = lambda morphs: '/'.join(
+ "%s(%s)" % (m[0], m[1].split(' ')[2]) for m in morphs if m[0] != 'EOS'
+ ).encode('utf-8')
- print("\n\n".join("%s" % tree for tree in knbc.parsed_sents()[:2]))
+ print('\n\n'.join('%s' % tree for tree in knbc.parsed_sents()[:2]))
print(
- "\n".join(
- " ".join("%s/%s" % (w[0], w[1].split(" ")[2]) for w in sent)
+ '\n'.join(
+ ' '.join("%s/%s" % (w[0], w[1].split(' ')[2]) for w in sent)
for sent in knbc.tagged_sents()[0:2]
)
)
def test():
from nltk.corpus.util import LazyCorpusLoader
-
knbc = LazyCorpusLoader(
- "knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp"
- )
- assert isinstance(knbc.words()[0], str)
- assert isinstance(knbc.sents()[0][0], str)
+ 'knbc/corpus1', KNBCorpusReader, r'.*/KN.*', encoding='euc-jp')
+ assert isinstance(knbc.words()[0], string_types)
+ assert isinstance(knbc.sents()[0][0], string_types)
assert isinstance(knbc.tagged_words()[0], tuple)
assert isinstance(knbc.tagged_sents()[0][0], tuple)
-
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: Lin's Thesaurus
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Dan Blanchard <dblanchard@ets.org>
# URL: <http://nltk.org/>
# For license information, see LICENSE.txt
+from __future__ import print_function
import re
from collections import defaultdict
@staticmethod
def __defaultdict_factory():
- """ Factory for creating defaultdict of defaultdict(dict)s """
+ ''' Factory for creating defaultdict of defaultdict(dict)s '''
return defaultdict(dict)
def __init__(self, root, badscore=0.0):
- """
+ '''
Initialize the thesaurus.
:param root: root directory containing thesaurus LISP files
:type root: C{string}
:param badscore: the score to give to words which do not appear in each other's sets of synonyms
:type badscore: C{float}
- """
+ '''
- super(LinThesaurusCorpusReader, self).__init__(root, r"sim[A-Z]\.lsp")
+ super(LinThesaurusCorpusReader, self).__init__(root, r'sim[A-Z]\.lsp')
self._thesaurus = defaultdict(LinThesaurusCorpusReader.__defaultdict_factory)
self._badscore = badscore
- for path, encoding, fileid in self.abspaths(
- include_encoding=True, include_fileid=True
- ):
+ for path, encoding, fileid in self.abspaths(include_encoding=True, include_fileid=True):
with open(path) as lin_file:
first = True
for line in lin_file:
line = line.strip()
# Start of entry
if first:
- key = LinThesaurusCorpusReader._key_re.sub(r"\1", line)
+ key = LinThesaurusCorpusReader._key_re.sub(r'\1', line)
first = False
# End of entry
- elif line == "))":
+ elif line == '))':
first = True
# Lines with pairs of ngrams and scores
else:
- split_line = line.split("\t")
+ split_line = line.split('\t')
if len(split_line) == 2:
ngram, score = split_line
- self._thesaurus[fileid][key][ngram.strip('"')] = float(
- score
- )
+ self._thesaurus[fileid][key][ngram.strip('"')] = float(score)
def similarity(self, ngram1, ngram2, fileid=None):
- """
+ '''
Returns the similarity score for two ngrams.
:param ngram1: first ngram to compare
:type fileid: C{string}
:return: If fileid is specified, just the score for the two ngrams; otherwise,
list of tuples of fileids and scores.
- """
+ '''
# Entries don't contain themselves, so make sure similarity between item and itself is 1.0
if ngram1 == ngram2:
if fileid:
return [(fid, 1.0) for fid in self._fileids]
else:
if fileid:
- return (
- self._thesaurus[fileid][ngram1][ngram2]
- if ngram2 in self._thesaurus[fileid][ngram1]
- else self._badscore
- )
+ return self._thesaurus[fileid][ngram1][ngram2] if ngram2 in self._thesaurus[fileid][ngram1] else self._badscore
else:
- return [
- (
- fid,
- (
- self._thesaurus[fid][ngram1][ngram2]
- if ngram2 in self._thesaurus[fid][ngram1]
- else self._badscore
- ),
- )
- for fid in self._fileids
- ]
+ return [(fid, (self._thesaurus[fid][ngram1][ngram2] if ngram2 in self._thesaurus[fid][ngram1]
+ else self._badscore)) for fid in self._fileids]
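# Usage sketch (assuming the 'lin_thesaurus' corpus is installed; the
# fileid is one of the simA/simN/simV.lsp files matched above):
#
#     from nltk.corpus import lin_thesaurus as thes
#     thes.similarity("business", "enterprise", fileid="simN.lsp")  # float
#     thes.similarity("business", "enterprise")  # [(fileid, score), ...]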
def scored_synonyms(self, ngram, fileid=None):
- """
+ '''
Returns a list of scored synonyms (tuples of synonyms and scores) for the current ngram
:param ngram: ngram to lookup
:return: If fileid is specified, list of tuples of scores and synonyms; otherwise,
list of tuples of fileids and lists, where inner lists consist of tuples of
scores and synonyms.
- """
+ '''
if fileid:
return self._thesaurus[fileid][ngram].items()
else:
- return [
- (fileid, self._thesaurus[fileid][ngram].items())
- for fileid in self._fileids
- ]
+ return [(fileid, self._thesaurus[fileid][ngram].items()) for fileid in self._fileids]
def synonyms(self, ngram, fileid=None):
- """
+ '''
Returns a list of synonyms for the current ngram.
:param ngram: ngram to lookup
:type fileid: C{string}
:return: If fileid is specified, list of synonyms; otherwise, list of tuples of fileids and
lists, where inner lists contain synonyms.
- """
+ '''
if fileid:
return self._thesaurus[fileid][ngram].keys()
else:
- return [
- (fileid, self._thesaurus[fileid][ngram].keys())
- for fileid in self._fileids
- ]
+ return [(fileid, self._thesaurus[fileid][ngram].keys()) for fileid in self._fileids]
def __contains__(self, ngram):
- """
+ '''
Determines whether or not the given ngram is in the thesaurus.
:param ngram: ngram to lookup
:type ngram: C{string}
:return: whether the given ngram is in the thesaurus.
- """
- return reduce(
- lambda accum, fileid: accum or (ngram in self._thesaurus[fileid]),
- self._fileids,
- False,
- )
+ '''
+ return reduce(lambda accum, fileid: accum or (ngram in self._thesaurus[fileid]), self._fileids, False)
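# The reduce() above backs the `in` operator, so membership can be probed
# across all thesaurus files at once, e.g. `"business" in thes` is True if
# any simX.lsp file carries an entry for the ngram.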
######################################################################
# Demo
######################################################################
-
def demo():
from nltk.corpus import lin_thesaurus as thes
print(thes.similarity(word1, word2))
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
import re
from functools import reduce
+from six import string_types
+
from nltk.corpus.reader import concat, TaggedCorpusReader
from nltk.corpus.reader.xmldocs import XMLCorpusView
def xpath(root, path, ns):
return root.findall(path, ns)
-
class MTECorpusView(XMLCorpusView):
"""
Class for lazy viewing the MTE Corpus.
XMLCorpusView.__init__(self, fileid, tagspec, elt_handler)
def read_block(self, stream, tagspec=None, elt_handler=None):
- return list(
- filter(
- lambda x: x is not None,
- XMLCorpusView.read_block(self, stream, tagspec, elt_handler),
- )
- )
-
+ return list(filter(lambda x: x is not None, XMLCorpusView.read_block(self, stream, tagspec, elt_handler)))
class MTEFileReader:
"""
parses the xml files and does some tag-filtering depending on the
given method parameters.
"""
-
- ns = {
- "tei": "http://www.tei-c.org/ns/1.0",
- "xml": "http://www.w3.org/XML/1998/namespace",
- }
- tag_ns = "{http://www.tei-c.org/ns/1.0}"
- xml_ns = "{http://www.w3.org/XML/1998/namespace}"
+ ns = {'tei': 'http://www.tei-c.org/ns/1.0',
+ 'xml': 'http://www.w3.org/XML/1998/namespace'}
+ tag_ns = '{http://www.tei-c.org/ns/1.0}'
+ xml_ns = '{http://www.w3.org/XML/1998/namespace}'
word_path = "TEI/text/body/div/div/p/s/(w|c)"
sent_path = "TEI/text/body/div/div/p/s"
para_path = "TEI/text/body/div/div/p"
+
def __init__(self, file_path):
self.__file_path = file_path
@classmethod
- def _word_elt(cls, elt, context):
+ def _word_elt(self, elt, context):
return elt.text
@classmethod
- def _sent_elt(cls, elt, context):
- return [cls._word_elt(w, None) for w in xpath(elt, "*", cls.ns)]
+ def _sent_elt(self, elt, context):
+ return [self._word_elt(w, None) for w in xpath(elt, '*', self.ns)]
@classmethod
- def _para_elt(cls, elt, context):
- return [cls._sent_elt(s, None) for s in xpath(elt, "*", cls.ns)]
+ def _para_elt(self, elt, context):
+ return [self._sent_elt(s, None) for s in xpath(elt, '*', self.ns)]
@classmethod
- def _tagged_word_elt(cls, elt, context):
- if "ana" not in elt.attrib:
- return (elt.text, "")
-
- if cls.__tags == "" and cls.__tagset == "msd":
- return (elt.text, elt.attrib["ana"])
- elif cls.__tags == "" and cls.__tagset == "universal":
- return (elt.text, MTETagConverter.msd_to_universal(elt.attrib["ana"]))
+ def _tagged_word_elt(self, elt, context):
+ if ('ana' not in elt.attrib):
+ return (elt.text, '')
+
+ if self.__tags == "" and self.__tagset == "msd":
+ return (elt.text, elt.attrib['ana'])
+ elif self.__tags == "" and self.__tagset == "universal":
+ return (elt.text, MTETagConverter.msd_to_universal(elt.attrib['ana']))
else:
- tags = re.compile("^" + re.sub("-", ".", cls.__tags) + ".*$")
- if tags.match(elt.attrib["ana"]):
- if cls.__tagset == "msd":
- return (elt.text, elt.attrib["ana"])
+ tags = re.compile('^' + re.sub("-", ".", self.__tags) + '.*$')
+ if (tags.match(elt.attrib['ana'])):
+ if self.__tagset == "msd":
+ return (elt.text, elt.attrib['ana'])
else:
- return (
- elt.text,
- MTETagConverter.msd_to_universal(elt.attrib["ana"]),
- )
+ return (elt.text, MTETagConverter.msd_to_universal(elt.attrib['ana']))
else:
return None
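# Filtering sketch: with tags="" every word keeps its tag ('msd' returns it
# raw, 'universal' maps it through MTETagConverter); a prefix pattern such
# as tags="N-c" is compiled to the regex '^N.c.*$', and words whose 'ana'
# attribute does not match return None and are dropped by the *_sent_elt /
# *_para_elt handlers below.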
@classmethod
- def _tagged_sent_elt(cls, elt, context):
- return list(
- filter(
- lambda x: x is not None,
- [cls._tagged_word_elt(w, None) for w in xpath(elt, "*", cls.ns)],
- )
- )
+ def _tagged_sent_elt(self, elt, context):
+ return list(filter(lambda x: x is not None, [self._tagged_word_elt(w, None) for w in xpath(elt, '*', self.ns)]))
@classmethod
- def _tagged_para_elt(cls, elt, context):
- return list(
- filter(
- lambda x: x is not None,
- [cls._tagged_sent_elt(s, None) for s in xpath(elt, "*", cls.ns)],
- )
- )
+ def _tagged_para_elt(self, elt, context):
+ return list(filter(lambda x: x is not None, [self._tagged_sent_elt(s, None) for s in xpath(elt, '*', self.ns)]))
@classmethod
- def _lemma_word_elt(cls, elt, context):
- if "lemma" not in elt.attrib:
- return (elt.text, "")
+ def _lemma_word_elt(self, elt, context):
+ if ('lemma' not in elt.attrib):
+ return (elt.text, '')
else:
- return (elt.text, elt.attrib["lemma"])
+ return (elt.text, elt.attrib['lemma'])
@classmethod
- def _lemma_sent_elt(cls, elt, context):
- return [cls._lemma_word_elt(w, None) for w in xpath(elt, "*", cls.ns)]
+ def _lemma_sent_elt(self, elt, context):
+ return [self._lemma_word_elt(w, None) for w in xpath(elt, '*', self.ns)]
@classmethod
- def _lemma_para_elt(cls, elt, context):
- return [cls._lemma_sent_elt(s, None) for s in xpath(elt, "*", cls.ns)]
+ def _lemma_para_elt(self, elt, context):
+ return [self._lemma_sent_elt(s, None) for s in xpath(elt, '*', self.ns)]
def words(self):
- return MTECorpusView(
- self.__file_path, MTEFileReader.word_path, MTEFileReader._word_elt
- )
+ return MTECorpusView(self.__file_path, MTEFileReader.word_path, MTEFileReader._word_elt)
def sents(self):
- return MTECorpusView(
- self.__file_path, MTEFileReader.sent_path, MTEFileReader._sent_elt
- )
+ return MTECorpusView(self.__file_path, MTEFileReader.sent_path, MTEFileReader._sent_elt)
def paras(self):
- return MTECorpusView(
- self.__file_path, MTEFileReader.para_path, MTEFileReader._para_elt
- )
+ return MTECorpusView(self.__file_path, MTEFileReader.para_path, MTEFileReader._para_elt)
def lemma_words(self):
- return MTECorpusView(
- self.__file_path, MTEFileReader.word_path, MTEFileReader._lemma_word_elt
- )
+ return MTECorpusView(self.__file_path, MTEFileReader.word_path, MTEFileReader._lemma_word_elt)
def tagged_words(self, tagset, tags):
MTEFileReader.__tagset = tagset
MTEFileReader.__tags = tags
- return MTECorpusView(
- self.__file_path, MTEFileReader.word_path, MTEFileReader._tagged_word_elt
- )
+ return MTECorpusView(self.__file_path, MTEFileReader.word_path, MTEFileReader._tagged_word_elt)
def lemma_sents(self):
- return MTECorpusView(
- self.__file_path, MTEFileReader.sent_path, MTEFileReader._lemma_sent_elt
- )
+ return MTECorpusView(self.__file_path, MTEFileReader.sent_path, MTEFileReader._lemma_sent_elt)
def tagged_sents(self, tagset, tags):
MTEFileReader.__tagset = tagset
MTEFileReader.__tags = tags
- return MTECorpusView(
- self.__file_path, MTEFileReader.sent_path, MTEFileReader._tagged_sent_elt
- )
+ return MTECorpusView(self.__file_path, MTEFileReader.sent_path, MTEFileReader._tagged_sent_elt)
def lemma_paras(self):
- return MTECorpusView(
- self.__file_path, MTEFileReader.para_path, MTEFileReader._lemma_para_elt
- )
+ return MTECorpusView(self.__file_path, MTEFileReader.para_path, MTEFileReader._lemma_para_elt)
def tagged_paras(self, tagset, tags):
MTEFileReader.__tagset = tagset
MTEFileReader.__tags = tags
- return MTECorpusView(
- self.__file_path, MTEFileReader.para_path, MTEFileReader._tagged_para_elt
- )
+ return MTECorpusView(self.__file_path, MTEFileReader.para_path, MTEFileReader._tagged_para_elt)
class MTETagConverter:
"""
mapping_msd_universal = {
- "A": "ADJ",
- "S": "ADP",
- "R": "ADV",
- "C": "CONJ",
- "D": "DET",
- "N": "NOUN",
- "M": "NUM",
- "Q": "PRT",
- "P": "PRON",
- "V": "VERB",
- ".": ".",
- "-": "X",
- }
+ 'A': 'ADJ', 'S': 'ADP', 'R': 'ADV', 'C': 'CONJ',
+ 'D': 'DET', 'N': 'NOUN', 'M': 'NUM', 'Q': 'PRT',
+ 'P': 'PRON', 'V': 'VERB', '.': '.', '-': 'X'}
@staticmethod
def msd_to_universal(tag):
indicator = tag[0] if not tag[0] == "#" else tag[1]
if not indicator in MTETagConverter.mapping_msd_universal:
- indicator = "-"
+ indicator = '-'
return MTETagConverter.mapping_msd_universal[indicator]
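# Mapping sketch: the converter keys on the first MSD character (skipping a
# leading '#') and falls back to 'X' for anything unmapped, e.g.
#
#     MTETagConverter.msd_to_universal('Ncmsn')  # 'N' -> 'NOUN'
#     MTETagConverter.msd_to_universal('#Vmp')   # '#' skipped, 'V' -> 'VERB'
#     MTETagConverter.msd_to_universal('Z')      # unmapped -> '-' -> 'X'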
-
class MTECorpusReader(TaggedCorpusReader):
"""
Reader for corpora following the TEI-p5 xml scheme, such as MULTEXT-East.
scheme. These tags can be converted to the Universal tagset
"""
- def __init__(self, root=None, fileids=None, encoding="utf8"):
+ def __init__(self, root=None, fileids=None, encoding='utf8'):
"""
Construct a new MTECorpusreader for a set of documents
located at the given root directory. Example usage:
TaggedCorpusReader.__init__(self, root, fileids, encoding)
def __fileids(self, fileids):
- if fileids is None:
- fileids = self._fileids
- elif isinstance(fileids, str):
- fileids = [fileids]
+ if fileids is None: fileids = self._fileids
+ elif isinstance(fileids, string_types): fileids = [fileids]
# filter wrong userinput
- fileids = filter(lambda x: x in self._fileids, fileids)
+ fileids = filter(lambda x : x in self._fileids, fileids)
# filter multext-east sourcefiles that are not compatible to the teip5 specification
- fileids = filter(lambda x: x not in ["oana-bg.xml", "oana-mk.xml"], fileids)
+ fileids = filter(lambda x : x not in ["oana-bg.xml", "oana-mk.xml"], fileids)
if not fileids:
print("No valid multext-east file specified")
return fileids
:return: the given file(s) as a list of words and punctuation symbols.
:rtype: list(str)
"""
- return concat(
- [
- MTEFileReader(os.path.join(self._root, f)).words()
- for f in self.__fileids(fileids)
- ]
- )
+ return concat([MTEFileReader(os.path.join(self._root, f)).words() for f in self.__fileids(fileids)])
def sents(self, fileids=None):
"""
each encoded as a list of word strings
:rtype: list(list(str))
"""
- return concat(
- [
- MTEFileReader(os.path.join(self._root, f)).sents()
- for f in self.__fileids(fileids)
- ]
- )
+ return concat([MTEFileReader(os.path.join(self._root, f)).sents() for f in self.__fileids(fileids)])
def paras(self, fileids=None):
"""
of sentences, which are in turn encoded as lists of word string
:rtype: list(list(list(str)))
"""
- return concat(
- [
- MTEFileReader(os.path.join(self._root, f)).paras()
- for f in self.__fileids(fileids)
- ]
- )
+ return concat([MTEFileReader(os.path.join(self._root, f)).paras() for f in self.__fileids(fileids)])
def lemma_words(self, fileids=None):
"""
and punctuation symbols, encoded as tuples (word, lemma)
:rtype: list(tuple(str,str))
"""
- return concat(
- [
- MTEFileReader(os.path.join(self._root, f)).lemma_words()
- for f in self.__fileids(fileids)
- ]
- )
+ return concat([MTEFileReader(os.path.join(self._root, f)).lemma_words() for f in self.__fileids(fileids)])
def tagged_words(self, fileids=None, tagset="msd", tags=""):
"""
:rtype: list(tuple(str, str))
"""
if tagset == "universal" or tagset == "msd":
- return concat(
- [
- MTEFileReader(os.path.join(self._root, f)).tagged_words(
- tagset, tags
- )
- for f in self.__fileids(fileids)
- ]
- )
+ return concat([MTEFileReader(os.path.join(self._root, f)).tagged_words(tagset, tags) for f in self.__fileids(fileids)])
else:
print("Unknown tagset specified.")
lemma (word, lemma)
:rtype: list(list(tuple(str, str)))
"""
- return concat(
- [
- MTEFileReader(os.path.join(self._root, f)).lemma_sents()
- for f in self.__fileids(fileids)
- ]
- )
+ return concat([MTEFileReader(os.path.join(self._root, f)).lemma_sents() for f in self.__fileids(fileids)])
+
def tagged_sents(self, fileids=None, tagset="msd", tags=""):
"""
:rtype: list(list(tuple(str, str)))
"""
if tagset == "universal" or tagset == "msd":
- return concat(
- [
- MTEFileReader(os.path.join(self._root, f)).tagged_sents(
- tagset, tags
- )
- for f in self.__fileids(fileids)
- ]
- )
+ return concat([MTEFileReader(os.path.join(self._root, f)).tagged_sents(tagset, tags) for f in self.__fileids(fileids)])
else:
print("Unknown tagset specified.")
tuples of the word and the corresponding lemma (word, lemma)
:rtype: list(List(List(tuple(str, str))))
"""
- return concat(
- [
- MTEFileReader(os.path.join(self._root, f)).lemma_paras()
- for f in self.__fileids(fileids)
- ]
- )
+ return concat([MTEFileReader(os.path.join(self._root, f)).lemma_paras() for f in self.__fileids(fileids)])
def tagged_paras(self, fileids=None, tagset="msd", tags=""):
"""
:rtype: list(list(list(tuple(str, str))))
"""
if tagset == "universal" or tagset == "msd":
- return concat(
- [
- MTEFileReader(os.path.join(self._root, f)).tagged_paras(
- tagset, tags
- )
- for f in self.__fileids(fileids)
- ]
- )
+ return concat([MTEFileReader(os.path.join(self._root, f)).tagged_paras(tagset, tags) for f in self.__fileids(fileids)])
else:
print("Unknown tagset specified.")
# Natural Language Toolkit: NKJP Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Gabriela Kaczka
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
import functools
import os
-import re
import tempfile
+from six import string_types
+
from nltk.corpus.reader.util import concat
from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView
+import re
def _parse_args(fun):
Wraps function arguments:
if fileids not specified then function set NKJPCorpusReader paths.
"""
-
@functools.wraps(fun)
def decorator(self, fileids=None, **kwargs):
if not fileids:
HEADER_MODE = 2
RAW_MODE = 3
- def __init__(self, root, fileids=".*"):
+ def __init__(self, root, fileids='.*'):
"""
Corpus reader designed to work with National Corpus of Polish.
See http://nkjp.pl/ for more details about NKJP.
x.header(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'])
x.tagged_words(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'], tags=['subst', 'comp'])
"""
- if isinstance(fileids, str):
- XMLCorpusReader.__init__(self, root, fileids + ".*/header.xml")
+ if isinstance(fileids, string_types):
+ XMLCorpusReader.__init__(self, root, fileids + '.*/header.xml')
else:
- XMLCorpusReader.__init__(
- self, root, [fileid + "/header.xml" for fileid in fileids]
- )
+ XMLCorpusReader.__init__(self, root, [fileid + '/header.xml' for fileid in fileids])
self._paths = self.get_paths()
def get_paths(self):
- return [
- os.path.join(str(self._root), f.split("header.xml")[0])
- for f in self._fileids
- ]
+ return [os.path.join(str(self._root), f.split("header.xml")[0]) for f in self._fileids]
def fileids(self):
"""
"""
Returns a view specialised for use with particular corpus file.
"""
- mode = kwargs.pop("mode", NKJPCorpusReader.WORDS_MODE)
+ mode = kwargs.pop('mode', NKJPCorpusReader.WORDS_MODE)
if mode is NKJPCorpusReader.WORDS_MODE:
return NKJPCorpus_Morph_View(filename, tags=tags)
elif mode is NKJPCorpusReader.SENTS_MODE:
elif mode is NKJPCorpusReader.HEADER_MODE:
return NKJPCorpus_Header_View(filename, tags=tags)
elif mode is NKJPCorpusReader.RAW_MODE:
- return NKJPCorpus_Text_View(
- filename, tags=tags, mode=NKJPCorpus_Text_View.RAW_MODE
- )
+ return NKJPCorpus_Text_View(filename, tags=tags, mode=NKJPCorpus_Text_View.RAW_MODE)
else:
- raise NameError("No such mode!")
+ raise NameError('No such mode!')
def add_root(self, fileid):
"""
"""
Returns header(s) of specified fileids.
"""
- return concat(
- [
- self._view(
- self.add_root(fileid), mode=NKJPCorpusReader.HEADER_MODE, **kwargs
- ).handle_query()
- for fileid in fileids
- ]
- )
+ return concat([self._view(self.add_root(fileid),
+ mode=NKJPCorpusReader.HEADER_MODE, **kwargs).handle_query()
+ for fileid in fileids])
@_parse_args
def sents(self, fileids=None, **kwargs):
"""
Returns sentences in specified fileids.
"""
- return concat(
- [
- self._view(
- self.add_root(fileid), mode=NKJPCorpusReader.SENTS_MODE, **kwargs
- ).handle_query()
- for fileid in fileids
- ]
- )
+ return concat([self._view(self.add_root(fileid),
+ mode=NKJPCorpusReader.SENTS_MODE, **kwargs).handle_query()
+ for fileid in fileids])
@_parse_args
def words(self, fileids=None, **kwargs):
Returns words in specified fileids.
"""
- return concat(
- [
- self._view(
- self.add_root(fileid), mode=NKJPCorpusReader.WORDS_MODE, **kwargs
- ).handle_query()
- for fileid in fileids
- ]
- )
+ return concat([self._view(self.add_root(fileid),
+ mode=NKJPCorpusReader.WORDS_MODE, **kwargs).handle_query()
+ for fileid in fileids])
@_parse_args
def tagged_words(self, fileids=None, **kwargs):
Call with specified tags as a list, e.g. tags=['subst', 'comp'].
Returns tagged words in specified fileids.
"""
- tags = kwargs.pop("tags", [])
- return concat(
- [
- self._view(
- self.add_root(fileid),
- mode=NKJPCorpusReader.WORDS_MODE,
- tags=tags,
- **kwargs
- ).handle_query()
- for fileid in fileids
- ]
- )
+ tags = kwargs.pop('tags', [])
+ return concat([self._view(self.add_root(fileid),
+ mode=NKJPCorpusReader.WORDS_MODE, tags=tags, **kwargs).handle_query()
+ for fileid in fileids])
@_parse_args
def raw(self, fileids=None, **kwargs):
"""
Returns words in specified fileids.
"""
- return concat(
- [
- self._view(
- self.add_root(fileid), mode=NKJPCorpusReader.RAW_MODE, **kwargs
- ).handle_query()
- for fileid in fileids
- ]
- )
+ return concat([self._view(self.add_root(fileid),
+ mode=NKJPCorpusReader.RAW_MODE, **kwargs).handle_query()
+ for fileid in fileids])
class NKJPCorpus_Header_View(XMLCorpusView):
+
def __init__(self, filename, **kwargs):
"""
HEADER_MODE
header.xml files in NKJP corpus.
"""
self.tagspec = ".*/sourceDesc$"
- XMLCorpusView.__init__(self, filename + "header.xml", self.tagspec)
+ XMLCorpusView.__init__(self, filename + 'header.xml', self.tagspec)
def handle_query(self):
self._open()
return header
def handle_elt(self, elt, context):
- titles = elt.findall("bibl/title")
+ titles = elt.findall('bibl/title')
title = []
if titles:
- title = "\n".join(title.text.strip() for title in titles)
+ title = '\n'.join(title.text.strip() for title in titles)
- authors = elt.findall("bibl/author")
+ authors = elt.findall('bibl/author')
author = []
if authors:
- author = "\n".join(author.text.strip() for author in authors)
+ author = '\n'.join(author.text.strip() for author in authors)
- dates = elt.findall("bibl/date")
+ dates = elt.findall('bibl/date')
date = []
if dates:
- date = "\n".join(date.text.strip() for date in dates)
+ date = '\n'.join(date.text.strip() for date in dates)
- publishers = elt.findall("bibl/publisher")
+ publishers = elt.findall('bibl/publisher')
publisher = []
if publishers:
- publisher = "\n".join(publisher.text.strip() for publisher in publishers)
+ publisher = '\n'.join(publisher.text.strip() for publisher in publishers)
- idnos = elt.findall("bibl/idno")
+ idnos = elt.findall('bibl/idno')
idno = []
if idnos:
- idno = "\n".join(idno.text.strip() for idno in idnos)
+ idno = '\n'.join(idno.text.strip() for idno in idnos)
- notes = elt.findall("bibl/note")
+ notes = elt.findall('bibl/note')
note = []
if notes:
- note = "\n".join(note.text.strip() for note in notes)
+ note = '\n'.join(note.text.strip() for note in notes)
- return {
- "title": title,
- "author": author,
- "date": date,
- "publisher": publisher,
- "idno": idno,
- "note": note,
- }
+ return {'title': title, 'author': author, 'date': date, 'publisher': publisher,
+ 'idno': idno, 'note': note}
-class XML_Tool:
+class XML_Tool():
"""
Helper class that copies an xml file into one without references to the nkjp: namespace.
That's needed because the XMLCorpusView assumes that one can find short substrings
of XML that are valid XML, which is not true if a namespace is declared at top level
"""
-
def __init__(self, root, filename):
self.read_file = os.path.join(root, filename)
self.write_file = tempfile.NamedTemporaryFile(delete=False)
def build_preprocessed_file(self):
try:
- fr = open(self.read_file, "r")
+ fr = open(self.read_file, 'r')
fw = self.write_file
- line = " "
+ line = ' '
while len(line):
line = fr.readline()
- x = re.split(r"nkjp:[^ ]* ", line) # in all files
- ret = " ".join(x)
- x = re.split("<nkjp:paren>", ret) # in ann_segmentation.xml
- ret = " ".join(x)
- x = re.split("</nkjp:paren>", ret) # in ann_segmentation.xml
- ret = " ".join(x)
- x = re.split("<choice>", ret) # in ann_segmentation.xml
- ret = " ".join(x)
- x = re.split("</choice>", ret) # in ann_segmentation.xml
- ret = " ".join(x)
+ x = re.split(r'nkjp:[^ ]* ', line) #in all files
+ ret = ' '.join(x)
+ x = re.split('<nkjp:paren>', ret) #in ann_segmentation.xml
+ ret = ' '.join(x)
+ x = re.split('</nkjp:paren>', ret) #in ann_segmentation.xml
+ ret = ' '.join(x)
+ x = re.split('<choice>', ret) #in ann_segmentation.xml
+ ret = ' '.join(x)
+ x = re.split('</choice>', ret) #in ann_segmentation.xml
+ ret = ' '.join(x)
fw.write(ret)
fr.close()
fw.close()
def remove_preprocessed_file(self):
os.remove(self.write_file.name)
+ pass
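# Preprocessing sketch: the substitutions in build_preprocessed_file()
# splice out nkjp: namespace noise so XMLCorpusView can parse fragments in
# isolation, roughly:
#
#     '<seg nkjp:nps="true" corresp="..."/>'  ->  '<seg  corresp="..."/>'
#     '<nkjp:paren>', '</nkjp:paren>', '<choice>', '</choice>'  -> dropped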
class NKJPCorpus_Segmentation_View(XMLCorpusView):
"""
def __init__(self, filename, **kwargs):
- self.tagspec = ".*p/.*s"
- # intersperse NKJPCorpus_Text_View
- self.text_view = NKJPCorpus_Text_View(
- filename, mode=NKJPCorpus_Text_View.SENTS_MODE
- )
+ self.tagspec = '.*p/.*s'
+ #intersperse NKJPCorpus_Text_View
+ self.text_view = NKJPCorpus_Text_View(filename, mode=NKJPCorpus_Text_View.SENTS_MODE)
self.text_view.handle_query()
- # xml preprocessing
- self.xml_tool = XML_Tool(filename, "ann_segmentation.xml")
- # base class init
- XMLCorpusView.__init__(
- self, self.xml_tool.build_preprocessed_file(), self.tagspec
- )
+ #xml preprocessing
+ self.xml_tool = XML_Tool(filename, 'ann_segmentation.xml')
+ #base class init
+ XMLCorpusView.__init__(self, self.xml_tool.build_preprocessed_file(), self.tagspec)
def get_segm_id(self, example_word):
- return example_word.split("(")[1].split(",")[0]
+ return example_word.split('(')[1].split(',')[0]
def get_sent_beg(self, beg_word):
- # returns index of beginning letter in sentence
- return int(beg_word.split(",")[1])
+ #returns index of beginning letter in sentence
+ return int(beg_word.split(',')[1])
def get_sent_end(self, end_word):
- # returns index of end letter in sentence
- splitted = end_word.split(")")[0].split(",")
+ #returns index of end letter in sentence
+ splitted = end_word.split(')')[0].split(',')
return int(splitted[1]) + int(splitted[2])
def get_sentences(self, sent_segm):
- # returns one sentence
+ #returns one sentence
id = self.get_segm_id(sent_segm[0])
- segm = self.text_view.segm_dict[id] # text segment
+ segm = self.text_view.segm_dict[id] #text segment
beg = self.get_sent_beg(sent_segm[0])
- end = self.get_sent_end(sent_segm[len(sent_segm) - 1])
+ end = self.get_sent_end(sent_segm[len(sent_segm)-1])
return segm[beg:end]
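# Offset sketch: corresp values have (roughly) the form
# 'text.xml#string-range(txt_1.1-ab,0,7)', so for that value the helpers
# above yield id='txt_1.1-ab', beg=0, end=0+7=7, and get_sentences()
# returns segm[0:7] of the matching text segment.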
def remove_choice(self, segm):
prev_txt_nr = -1
for word in segm:
txt_nr = self.get_segm_id(word)
- # get increasing sequence of ids: in case of choice get first possibility
- if self.get_sent_beg(word) > prev_txt_end - 1 or prev_txt_nr != txt_nr:
+ #get increasing sequence of ids: in case of choice get first possibility
+ if self.get_sent_beg(word) > prev_txt_end-1 or prev_txt_nr != txt_nr:
ret.append(word)
prev_txt_end = self.get_sent_end(word)
prev_txt_nr = txt_nr
def handle_elt(self, elt, context):
ret = []
for seg in elt:
- ret.append(seg.get("corresp"))
+ ret.append(seg.get('corresp'))
return ret
A stream backed corpus view specialized for use with
text.xml files in NKJP corpus.
"""
-
SENTS_MODE = 0
RAW_MODE = 1
def __init__(self, filename, **kwargs):
- self.mode = kwargs.pop("mode", 0)
- self.tagspec = ".*/div/ab"
+ self.mode = kwargs.pop('mode', 0)
+ self.tagspec = '.*/div/ab'
self.segm_dict = dict()
- # xml preprocessing
- self.xml_tool = XML_Tool(filename, "text.xml")
- # base class init
- XMLCorpusView.__init__(
- self, self.xml_tool.build_preprocessed_file(), self.tagspec
- )
+ #xml preprocessing
+ self.xml_tool = XML_Tool(filename, 'text.xml')
+ #base class init
+ XMLCorpusView.__init__(self, self.xml_tool.build_preprocessed_file(), self.tagspec)
def handle_query(self):
try:
for part in segm:
txt.append(part)
- return [" ".join([segm for segm in txt])]
+ return [' '.join([segm for segm in txt])]
def get_segm_id(self, elt):
for attr in elt.attrib:
- if attr.endswith("id"):
+ if attr.endswith('id'):
return elt.get(attr)
def handle_elt(self, elt, context):
- # fill dictionary to use later in sents mode
+ #fill dictionary to use later in sents mode
if self.mode is NKJPCorpus_Text_View.SENTS_MODE:
self.segm_dict[self.get_segm_id(elt)] = elt.text
return elt.text
"""
def __init__(self, filename, **kwargs):
- self.tags = kwargs.pop("tags", None)
- self.tagspec = ".*/seg/fs"
- self.xml_tool = XML_Tool(filename, "ann_morphosyntax.xml")
- XMLCorpusView.__init__(
- self, self.xml_tool.build_preprocessed_file(), self.tagspec
- )
+ self.tags = kwargs.pop('tags', None)
+ self.tagspec = '.*/seg/fs'
+ self.xml_tool = XML_Tool(filename, 'ann_morphosyntax.xml')
+ XMLCorpusView.__init__(self, self.xml_tool.build_preprocessed_file(), self.tagspec)
def handle_query(self):
try:
raise Exception
def handle_elt(self, elt, context):
- word = ""
+ word = ''
flag = False
is_not_interp = True
- # if tags not specified, then always return word
+ #if tags not specified, then always return word
if self.tags is None:
flag = True
for child in elt:
- # get word
- if "name" in child.keys() and child.attrib["name"] == "orth":
+ #get word
+ if 'name' in child.keys() and child.attrib['name'] == 'orth':
for symbol in child:
- if symbol.tag == "string":
+ if symbol.tag == 'string':
word = symbol.text
- elif "name" in child.keys() and child.attrib["name"] == "interps":
+ elif 'name' in child.keys() and child.attrib['name'] == 'interps':
for symbol in child:
- if "type" in symbol.keys() and symbol.attrib["type"] == "lex":
+ if 'type' in symbol.keys() and symbol.attrib['type'] == 'lex':
for symbol2 in symbol:
- if (
- "name" in symbol2.keys()
- and symbol2.attrib["name"] == "ctag"
- ):
+ if 'name' in symbol2.keys() and symbol2.attrib['name'] == 'ctag':
for symbol3 in symbol2:
- if (
- "value" in symbol3.keys()
- and self.tags is not None
- and symbol3.attrib["value"] in self.tags
- ):
+ if 'value' in symbol3.keys() and self.tags is not None and symbol3.attrib['value'] in self.tags:
flag = True
- elif (
- "value" in symbol3.keys()
- and symbol3.attrib["value"] == "interp"
- ):
+ elif 'value' in symbol3.keys() and symbol3.attrib['value'] == 'interp':
is_not_interp = False
if flag and is_not_interp:
return word
# Natural Language Toolkit: NomBank Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Authors: Paul Bedaride <paul.bedaride@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import unicode_literals
from xml.etree import ElementTree
from functools import total_ordering
+from six import string_types
+
from nltk.tree import Tree
from nltk.internals import raise_unorderable_types
+from nltk.compat import python_2_unicode_compatible
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
-
class NombankCorpusReader(CorpusReader):
"""
Corpus reader for the nombank corpus, which augments the Penn
each "roleset", the frameset file provides descriptions of the
argument roles, along with examples.
"""
-
- def __init__(
- self,
- root,
- nomfile,
- framefiles="",
- nounsfile=None,
- parse_fileid_xform=None,
- parse_corpus=None,
- encoding="utf8",
- ):
+ def __init__(self, root, nomfile, framefiles='',
+ nounsfile=None, parse_fileid_xform=None,
+ parse_corpus=None, encoding='utf8'):
"""
:param root: The root directory for this corpus.
:param nomfile: The name of the file containing the predicate-
corresponding to this corpus. These parse trees are
necessary to resolve the tree pointers used by nombank.
"""
-
# If framefiles is specified as a regexp, expand it.
- if isinstance(framefiles, str):
- self._fileids = find_corpus_fileids(root, framefiles)
- else:
- self._fileids = list(framefiles)
+ if isinstance(framefiles, string_types):
+ framefiles = find_corpus_fileids(root, framefiles)
+ framefiles = list(framefiles)
# Initialize the corpus reader.
- CorpusReader.__init__(self, root, framefiles, encoding)
+ CorpusReader.__init__(self, root, [nomfile, nounsfile] + framefiles,
+ encoding)
- # Record our nom file & nouns file.
+ # Record our frame fileids & nom file.
self._nomfile = nomfile
+ self._framefiles = framefiles
self._nounsfile = nounsfile
self._parse_fileid_xform = parse_fileid_xform
self._parse_corpus = parse_corpus
"""
:return: the text contents of the given fileids, as a single string.
"""
- if fileids is None:
- fileids = self._fileids
- elif isinstance(fileids, str):
- fileids = [fileids]
+ if fileids is None: fileids = self._fileids
+ elif isinstance(fileids, string_types): fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
def instances(self, baseform=None):
"""
kwargs = {}
if baseform is not None:
- kwargs["instance_filter"] = lambda inst: inst.baseform == baseform
- return StreamBackedCorpusView(
- self.abspath(self._nomfile),
- lambda stream: self._read_instance_block(stream, **kwargs),
- encoding=self.encoding(self._nomfile),
- )
+ kwargs['instance_filter'] = lambda inst: inst.baseform==baseform
+ return StreamBackedCorpusView(self.abspath(self._nomfile),
+ lambda stream: self._read_instance_block(stream, **kwargs),
+ encoding=self.encoding(self._nomfile))
def lines(self):
"""
:return: a corpus view that acts as a list of strings, one for
each line in the predicate-argument annotation file.
"""
- return StreamBackedCorpusView(
- self.abspath(self._nomfile),
- read_line_block,
- encoding=self.encoding(self._nomfile),
- )
+ return StreamBackedCorpusView(self.abspath(self._nomfile),
+ read_line_block,
+ encoding=self.encoding(self._nomfile))
def roleset(self, roleset_id):
"""
:return: the xml description for the given roleset.
"""
- baseform = roleset_id.split(".")[0]
- baseform = baseform.replace("perc-sign", "%")
- baseform = baseform.replace("oneslashonezero", "1/10").replace(
- "1/10", "1-slash-10"
- )
- framefile = "frames/%s.xml" % baseform
- if framefile not in self.fileids():
- raise ValueError("Frameset file for %s not found" % roleset_id)
+ baseform = roleset_id.split('.')[0]
+ baseform = baseform.replace('perc-sign','%')
+ baseform = baseform.replace('oneslashonezero', '1/10').replace('1/10','1-slash-10')
+ framefile = 'frames/%s.xml' % baseform
+ if framefile not in self._framefiles:
+ raise ValueError('Frameset file for %s not found' %
+ roleset_id)
# n.b.: The encoding for XML fileids is specified by the file
# itself; so we ignore self._encoding here.
etree = ElementTree.parse(self.abspath(framefile).open()).getroot()
- for roleset in etree.findall("predicate/roleset"):
- if roleset.attrib["id"] == roleset_id:
+ for roleset in etree.findall('predicate/roleset'):
+ if roleset.attrib['id'] == roleset_id:
return roleset
- raise ValueError("Roleset %s not found in %s" % (roleset_id, framefile))
+ raise ValueError('Roleset %s not found in %s' % (roleset_id, framefile))
def rolesets(self, baseform=None):
"""
:return: list of xml descriptions for rolesets.
"""
if baseform is not None:
- framefile = "frames/%s.xml" % baseform
- if framefile not in self.fileids():
- raise ValueError("Frameset file for %s not found" % baseform)
+ framefile = 'frames/%s.xml' % baseform
+ if framefile not in self._framefiles:
+ raise ValueError('Frameset file for %s not found' %
+ baseform)
framefiles = [framefile]
else:
- framefiles = self.fileids()
+ framefiles = self._framefiles
rsets = []
for framefile in framefiles:
# n.b.: The encoding for XML fileids is specified by the file
# itself; so we ignore self._encoding here.
etree = ElementTree.parse(self.abspath(framefile).open()).getroot()
- rsets.append(etree.findall("predicate/roleset"))
+ rsets.append(etree.findall('predicate/roleset'))
return LazyConcatenation(rsets)
def nouns(self):
:return: a corpus view that acts as a list of all noun lemmas
in this corpus (from the nombank.1.0.words file).
"""
- return StreamBackedCorpusView(
- self.abspath(self._nounsfile),
- read_line_block,
- encoding=self.encoding(self._nounsfile),
- )
+ return StreamBackedCorpusView(self.abspath(self._nounsfile),
+ read_line_block,
+ encoding=self.encoding(self._nounsfile))
def _read_instance_block(self, stream, instance_filter=lambda inst: True):
block = []
line = stream.readline().strip()
if line:
inst = NombankInstance.parse(
- line, self._parse_fileid_xform, self._parse_corpus
- )
+ line, self._parse_fileid_xform,
+ self._parse_corpus)
if instance_filter(inst):
block.append(inst)
return block
-
######################################################################
-# { Nombank Instance & related datatypes
+#{ Nombank Instance & related datatypes
######################################################################
-
+@python_2_unicode_compatible
class NombankInstance(object):
- def __init__(
- self,
- fileid,
- sentnum,
- wordnum,
- baseform,
- sensenumber,
- predicate,
- predid,
- arguments,
- parse_corpus=None,
- ):
+
+ def __init__(self, fileid, sentnum, wordnum, baseform, sensenumber,
+ predicate, predid, arguments, parse_corpus=None):
self.fileid = fileid
"""The name of the file containing the parse tree for this
"""The name of the roleset used by this instance's predicate.
Use ``nombank.roleset() <NombankCorpusReader.roleset>`` to
look up information about the roleset."""
- r = self.baseform.replace("%", "perc-sign")
- r = r.replace("1/10", "1-slash-10").replace("1-slash-10", "oneslashonezero")
- return "%s.%s" % (r, self.sensenumber)
+ r = self.baseform.replace('%', 'perc-sign')
+ r = r.replace('1/10', '1-slash-10').replace('1-slash-10', 'oneslashonezero')
+ return '%s.%s'%(r, self.sensenumber)
def __repr__(self):
- return "<NombankInstance: %s, sent %s, word %s>" % (
- self.fileid,
- self.sentnum,
- self.wordnum,
- )
+ return ('<NombankInstance: %s, sent %s, word %s>' %
+ (self.fileid, self.sentnum, self.wordnum))
def __str__(self):
- s = "%s %s %s %s %s" % (
- self.fileid,
- self.sentnum,
- self.wordnum,
- self.baseform,
- self.sensenumber,
- )
- items = self.arguments + ((self.predicate, "rel"),)
+ s = '%s %s %s %s %s' % (self.fileid, self.sentnum, self.wordnum,
+ self.baseform, self.sensenumber)
+ items = self.arguments + ((self.predicate, 'rel'),)
for (argloc, argid) in sorted(items):
- s += " %s-%s" % (argloc, argid)
+ s += ' %s-%s' % (argloc, argid)
return s
def _get_tree(self):
- if self.parse_corpus is None:
- return None
- if self.fileid not in self.parse_corpus.fileids():
- return None
+ if self.parse_corpus is None: return None
+ if self.fileid not in self.parse_corpus.fileids(): return None
return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum]
-
- tree = property(
- _get_tree,
- doc="""
+ tree = property(_get_tree, doc="""
The parse tree corresponding to this instance, or None if
- the corresponding tree is not available.""",
- )
+ the corresponding tree is not available.""")
@staticmethod
def parse(s, parse_fileid_xform=None, parse_corpus=None):
pieces = s.split()
if len(pieces) < 6:
- raise ValueError("Badly formatted nombank line: %r" % s)
+ raise ValueError('Badly formatted nombank line: %r' % s)
# Divide the line into its basic pieces.
- (fileid, sentnum, wordnum, baseform, sensenumber) = pieces[:5]
+ (fileid, sentnum, wordnum,
+ baseform, sensenumber) = pieces[:5]
args = pieces[5:]
- rel = [args.pop(i) for i, p in enumerate(args) if "-rel" in p]
+ rel = [args.pop(i) for i,p in enumerate(args) if '-rel' in p]
if len(rel) != 1:
- raise ValueError("Badly formatted nombank line: %r" % s)
+ raise ValueError('Badly formatted nombank line: %r' % s)
# Apply the fileid selector, if any.
if parse_fileid_xform is not None:
# Parse the predicate location.
- predloc, predid = rel[0].split("-", 1)
+ predloc, predid = rel[0].split('-', 1)
predicate = NombankTreePointer.parse(predloc)
# Parse the arguments.
arguments = []
for arg in args:
- argloc, argid = arg.split("-", 1)
- arguments.append((NombankTreePointer.parse(argloc), argid))
+ argloc, argid = arg.split('-', 1)
+ arguments.append( (NombankTreePointer.parse(argloc), argid) )
# Put it all together.
- return NombankInstance(
- fileid,
- sentnum,
- wordnum,
- baseform,
- sensenumber,
- predicate,
- predid,
- arguments,
- parse_corpus,
- )
-
+ return NombankInstance(fileid, sentnum, wordnum, baseform, sensenumber,
+ predicate, predid, arguments, parse_corpus)
class NombankPointer(object):
"""
chains in a tree. It consists of a sequence of pieces, which
can be ``NombankTreePointer`` or ``NombankSplitTreePointer`` pointers.
"""
-
def __init__(self):
if self.__class__ == NombankPointer:
raise NotImplementedError()
-
+@python_2_unicode_compatible
class NombankChainTreePointer(NombankPointer):
def __init__(self, pieces):
self.pieces = pieces
``NombankTreePointer`` pointers."""
def __str__(self):
- return "*".join("%s" % p for p in self.pieces)
-
+ return '*'.join('%s' % p for p in self.pieces)
def __repr__(self):
- return "<NombankChainTreePointer: %s>" % self
-
+ return '<NombankChainTreePointer: %s>' % self
def select(self, tree):
- if tree is None:
- raise ValueError("Parse tree not avaialable")
- return Tree("*CHAIN*", [p.select(tree) for p in self.pieces])
-
+ if tree is None: raise ValueError('Parse tree not available')
+ return Tree('*CHAIN*', [p.select(tree) for p in self.pieces])
+@python_2_unicode_compatible
class NombankSplitTreePointer(NombankPointer):
def __init__(self, pieces):
self.pieces = pieces
all ``NombankTreePointer`` pointers."""
def __str__(self):
- return ",".join("%s" % p for p in self.pieces)
-
+ return ','.join('%s' % p for p in self.pieces)
def __repr__(self):
- return "<NombankSplitTreePointer: %s>" % self
-
+ return '<NombankSplitTreePointer: %s>' % self
def select(self, tree):
- if tree is None:
- raise ValueError("Parse tree not avaialable")
- return Tree("*SPLIT*", [p.select(tree) for p in self.pieces])
-
+ if tree is None: raise ValueError('Parse tree not available')
+ return Tree('*SPLIT*', [p.select(tree) for p in self.pieces])
@total_ordering
+@python_2_unicode_compatible
class NombankTreePointer(NombankPointer):
"""
wordnum:height*wordnum:height*...
wordnum:height,
"""
-
def __init__(self, wordnum, height):
self.wordnum = wordnum
self.height = height
@staticmethod
def parse(s):
# Deal with chains (xx*yy*zz)
- pieces = s.split("*")
+ pieces = s.split('*')
if len(pieces) > 1:
- return NombankChainTreePointer(
- [NombankTreePointer.parse(elt) for elt in pieces]
- )
+ return NombankChainTreePointer([NombankTreePointer.parse(elt)
+ for elt in pieces])
# Deal with split args (xx,yy,zz)
- pieces = s.split(",")
+ pieces = s.split(',')
if len(pieces) > 1:
- return NombankSplitTreePointer(
- [NombankTreePointer.parse(elt) for elt in pieces]
- )
+ return NombankSplitTreePointer([NombankTreePointer.parse(elt)
+ for elt in pieces])
# Deal with normal pointers.
- pieces = s.split(":")
- if len(pieces) != 2:
- raise ValueError("bad nombank pointer %r" % s)
+ pieces = s.split(':')
+ if len(pieces) != 2: raise ValueError('bad nombank pointer %r' % s)
return NombankTreePointer(int(pieces[0]), int(pieces[1]))
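# Parsing sketch: parse() fans out on the pointer syntax documented above:
#
#     NombankTreePointer.parse('2:1')      # NombankTreePointer(2, 1)
#     NombankTreePointer.parse('2:1*5:0')  # NombankChainTreePointer
#     NombankTreePointer.parse('2:1,5:0')  # NombankSplitTreePointer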
def __str__(self):
- return "%s:%s" % (self.wordnum, self.height)
+ return '%s:%s' % (self.wordnum, self.height)
def __repr__(self):
- return "NombankTreePointer(%d, %d)" % (self.wordnum, self.height)
+ return 'NombankTreePointer(%d, %d)' % (self.wordnum, self.height)
def __eq__(self, other):
- while isinstance(other, (NombankChainTreePointer, NombankSplitTreePointer)):
+ while isinstance(other, (NombankChainTreePointer,
+ NombankSplitTreePointer)):
other = other.pieces[0]
if not isinstance(other, NombankTreePointer):
return self is other
- return self.wordnum == other.wordnum and self.height == other.height
+ return (self.wordnum == other.wordnum and self.height == other.height)
def __ne__(self, other):
return not self == other
def __lt__(self, other):
- while isinstance(other, (NombankChainTreePointer, NombankSplitTreePointer)):
+ while isinstance(other, (NombankChainTreePointer,
+ NombankSplitTreePointer)):
other = other.pieces[0]
if not isinstance(other, NombankTreePointer):
return (self.wordnum, -self.height) < (other.wordnum, -other.height)
def select(self, tree):
- if tree is None:
- raise ValueError("Parse tree not avaialable")
+ if tree is None: raise ValueError('Parse tree not avaialable')
return tree[self.treepos(tree)]
def treepos(self, tree):
Convert this pointer to a standard 'tree position' pointer,
given that it points to the given tree.
"""
- if tree is None:
- raise ValueError("Parse tree not avaialable")
+ if tree is None: raise ValueError('Parse tree not avaialable')
stack = [tree]
treepos = []
wordnum = 0
while True:
+ #print treepos
+ #print stack[-1]
# tree node:
if isinstance(stack[-1], Tree):
# Select the next child.
# word node:
else:
if wordnum == self.wordnum:
- return tuple(treepos[: len(treepos) - self.height - 1])
+ return tuple(treepos[:len(treepos)-self.height-1])
else:
wordnum += 1
stack.pop()
# Natural Language Toolkit: NPS Chat Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import unicode_literals
import re
import textwrap
from nltk.corpus.reader.api import *
from nltk.corpus.reader.xmldocs import *
-
class NPSChatCorpusReader(XMLCorpusReader):
+
def __init__(self, root, fileids, wrap_etree=False, tagset=None):
XMLCorpusReader.__init__(self, root, fileids, wrap_etree)
self._tagset = tagset
def xml_posts(self, fileids=None):
if self._wrap_etree:
- return concat(
- [
- XMLCorpusView(fileid, "Session/Posts/Post", self._wrap_elt)
- for fileid in self.abspaths(fileids)
- ]
- )
+ return concat([XMLCorpusView(fileid, 'Session/Posts/Post',
+ self._wrap_elt)
+ for fileid in self.abspaths(fileids)])
else:
- return concat(
- [
- XMLCorpusView(fileid, "Session/Posts/Post")
- for fileid in self.abspaths(fileids)
- ]
- )
+ return concat([XMLCorpusView(fileid, 'Session/Posts/Post')
+ for fileid in self.abspaths(fileids)])
def posts(self, fileids=None):
- return concat(
- [
- XMLCorpusView(
- fileid, "Session/Posts/Post/terminals", self._elt_to_words
- )
- for fileid in self.abspaths(fileids)
- ]
- )
+ return concat([XMLCorpusView(fileid, 'Session/Posts/Post/terminals',
+ self._elt_to_words)
+ for fileid in self.abspaths(fileids)])
def tagged_posts(self, fileids=None, tagset=None):
def reader(elt, handler):
return self._elt_to_tagged_words(elt, handler, tagset)
-
- return concat(
- [
- XMLCorpusView(fileid, "Session/Posts/Post/terminals", reader)
- for fileid in self.abspaths(fileids)
- ]
- )
+ return concat([XMLCorpusView(fileid, 'Session/Posts/Post/terminals',
+ reader)
+ for fileid in self.abspaths(fileids)])
def words(self, fileids=None):
return LazyConcatenation(self.posts(fileids))
return ElementWrapper(elt)
def _elt_to_words(self, elt, handler):
- return [self._simplify_username(t.attrib["word"]) for t in elt.findall("t")]
+ return [self._simplify_username(t.attrib['word'])
+ for t in elt.findall('t')]
def _elt_to_tagged_words(self, elt, handler, tagset=None):
- tagged_post = [
- (self._simplify_username(t.attrib["word"]), t.attrib["pos"])
- for t in elt.findall("t")
- ]
+ tagged_post = [(self._simplify_username(t.attrib['word']),
+ t.attrib['pos']) for t in elt.findall('t')]
if tagset and tagset != self._tagset:
- tagged_post = [
- (w, map_tag(self._tagset, tagset, t)) for (w, t) in tagged_post
- ]
+ tagged_post = [(w, map_tag(self._tagset, tagset, t)) for (w, t) in tagged_post]
return tagged_post
@staticmethod
def _simplify_username(word):
- if "User" in word:
- word = "U" + word.split("User", 1)[1]
+ if 'User' in word:
+ word = 'U' + word.split('User', 1)[1]
elif isinstance(word, bytes):
- word = word.decode("ascii")
+ word = word.decode('ascii')
return word
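# Behaviour sketch: anonymised NPS Chat screen names are collapsed to a
# short 'U<digits>' form, e.g.
#
#     NPSChatCorpusReader._simplify_username('10-19-20sUser7')  # -> 'U7'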
# Natural Language Toolkit: Opinion Lexicon Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
Comparing Opinions on the Web". Proceedings of the 14th International World
Wide Web conference (WWW-2005), May 10-14, 2005, Chiba, Japan.
"""
+from six import string_types
from nltk.corpus.reader import WordListCorpusReader
from nltk.corpus.reader.api import *
-
class IgnoreReadmeCorpusView(StreamBackedCorpusView):
"""
This CorpusView is used to skip the initial readme block of the corpus.
"""
-
def __init__(self, *args, **kwargs):
StreamBackedCorpusView.__init__(self, *args, **kwargs)
# open self._stream
:return: the given file(s) as a list of words and punctuation symbols.
:rtype: list(str)
"""
- if fileids is None:
- fileids = self._fileids
- elif isinstance(fileids, str):
- fileids = [fileids]
- return concat(
- [
- self.CorpusView(path, self._read_word_block, encoding=enc)
- for (path, enc, fileid) in self.abspaths(fileids, True, True)
- ]
- )
+ if fileids is None: fileids = self._fileids
+ elif isinstance(fileids, string_types): fileids = [fileids]
+ return concat([self.CorpusView(path, self._read_word_block, encoding=enc)
+ for (path, enc, fileid) in self.abspaths(fileids, True, True)])
def positive(self):
"""
:return: a list of positive words.
:rtype: list(str)
"""
- return self.words("positive-words.txt")
+ return self.words('positive-words.txt')
def negative(self):
"""
:return: a list of negative words.
:rtype: list(str)
"""
- return self.words("negative-words.txt")
+ return self.words('negative-words.txt')
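# Usage sketch (assuming the 'opinion_lexicon' corpus is installed):
#
#     from nltk.corpus import opinion_lexicon
#     opinion_lexicon.positive()[:3]        # e.g. ['a+', 'abound', 'abounds']
#     "bad" in opinion_lexicon.negative()   # True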
def _read_word_block(self, stream):
words = []
- for i in range(20): # Read 20 lines at a time.
+ for i in range(20): # Read 20 lines at a time.
line = stream.readline()
if not line:
continue
# Natural Language Toolkit: PanLex Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: David Kamholz <kamholz@panlex.org>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from nltk.corpus.reader.api import CorpusReader
-
class PanLexLiteCorpusReader(CorpusReader):
MEANING_Q = """
SELECT dnx2.mn, dnx2.uq, dnx2.ap, dnx2.ui, ex2.tt, ex2.lv
"""
def __init__(self, root):
- self._c = sqlite3.connect(os.path.join(root, "db.sqlite")).cursor()
+ self._c = sqlite3.connect(os.path.join(root, 'db.sqlite')).cursor()
self._uid_lv = {}
self._lv_uid = {}
- for row in self._c.execute("SELECT uid, lv FROM lv"):
+ for row in self._c.execute('SELECT uid, lv FROM lv'):
self._uid_lv[row[0]] = row[1]
self._lv_uid[row[1]] = row[0]
:rtype: list(tuple)
"""
- if lc is None:
- return self._c.execute("SELECT uid, tt FROM lv ORDER BY uid").fetchall()
+ if lc == None:
+ return self._c.execute('SELECT uid, tt FROM lv ORDER BY uid').fetchall()
else:
- return self._c.execute(
- "SELECT uid, tt FROM lv WHERE lc = ? ORDER BY uid", (lc,)
- ).fetchall()
+ return self._c.execute('SELECT uid, tt FROM lv WHERE lc = ? ORDER BY uid', (lc,)).fetchall()
def meanings(self, expr_uid, expr_tt):
"""
uid = self._lv_uid[i[5]]
if not mn in mn_info:
- mn_info[mn] = {
- "uq": i[1],
- "ap": i[2],
- "ui": i[3],
- "ex": {expr_uid: [expr_tt]},
- }
+ mn_info[mn] = { 'uq': i[1], 'ap': i[2], 'ui': i[3], 'ex': { expr_uid: [expr_tt] } }
- if not uid in mn_info[mn]["ex"]:
- mn_info[mn]["ex"][uid] = []
+ if not uid in mn_info[mn]['ex']:
+ mn_info[mn]['ex'][uid] = []
- mn_info[mn]["ex"][uid].append(i[4])
+ mn_info[mn]['ex'][uid].append(i[4])
- return [Meaning(mn, mn_info[mn]) for mn in mn_info]
+ return [ Meaning(mn, mn_info[mn]) for mn in mn_info ]
def translations(self, from_uid, from_tt, to_uid):
"""
:param from_tt: the source expression's text.
:param to_uid: the target language variety, as a seven-character
uniform identifier.
- :return: a list of translation tuples. The first element is the expression
+ :return: a list of translation tuples. The first element is the expression
text and the second element is the translation quality.
:rtype: list(tuple)
"""
return self._c.execute(self.TRANSLATION_Q, (from_lv, from_tt, to_lv)).fetchall()
-
class Meaning(dict):
"""
Represents a single PanLex meaning. A meaning is a translation set derived
def __init__(self, mn, attr):
super(Meaning, self).__init__(**attr)
- self["mn"] = mn
+ self['mn'] = mn
def id(self):
"""
:return: the meaning's id.
:rtype: int
"""
- return self["mn"]
+ return self['mn']
def quality(self):
"""
:return: the meaning's source's quality (0=worst, 9=best).
:rtype: int
"""
- return self["uq"]
+ return self['uq']
def source(self):
"""
:return: the meaning's source id.
:rtype: int
"""
- return self["ap"]
+ return self['ap']
def source_group(self):
"""
:return: the meaning's source group id.
:rtype: int
"""
- return self["ui"]
+ return self['ui']
def expressions(self):
"""
texts.
:rtype: dict
"""
- return self["ex"]
+ return self['ex']
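# Editor's note: a query sketch for PanLexLiteCorpusReader, assuming the
# 'panlex_lite' data package is installed; 'eng-000' and 'spa-000' are
# PanLex seven-character uniform identifiers:

from nltk.corpus import panlex_lite

# Each translation tuple is (expression text, translation quality).
for tt, uq in panlex_lite.translations('eng-000', 'book', 'spa-000')[:5]:
    print(tt, uq)

# Meanings expose their metadata through the accessors defined above.
m = panlex_lite.meanings('eng-000', 'book')[0]
print(m.id(), m.quality(), m.expressions())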
+++ /dev/null
-# -*- coding: utf-8 -*-
-# Natural Language Toolkit: Word List Corpus Reader
-#
-# Copyright (C) 2001-2020 NLTK Project
-# Author: Steven Bird <stevenbird1@gmail.com>
-# Edward Loper <edloper@gmail.com>
-# URL: <http://nltk.org/>
-# For license information, see LICENSE.TXT
-
-
-from collections import namedtuple, defaultdict
-import re
-
-from nltk.tokenize import line_tokenize
-
-from nltk.corpus.reader.wordlist import WordListCorpusReader
-from nltk.corpus.reader.util import *
-from nltk.corpus.reader.api import *
-
-PanlexLanguage = namedtuple('PanlexLanguage',
- ['panlex_uid', # (1) PanLex UID
- 'iso639', # (2) ISO 639 language code
- 'iso639_type', # (3) ISO 639 language type, see README
- 'script', # (4) normal scripts of expressions
- 'name', # (5) PanLex default name
- 'langvar_uid' # (6) UID of the language variety in which the default name is an expression
- ])
-
-class PanlexSwadeshCorpusReader(WordListCorpusReader):
- """
- This is a class to read the PanLex Swadesh list from
-
- David Kamholz, Jonathan Pool, and Susan M. Colowick (2014).
- PanLex: Building a Resource for Panlingual Lexical Translation.
- In LREC. http://www.lrec-conf.org/proceedings/lrec2014/pdf/1029_Paper.pdf
-
- License: CC0 1.0 Universal
- https://creativecommons.org/publicdomain/zero/1.0/legalcode
- """
- def __init__(self, *args, **kwargs):
- super(PanlexSwadeshCorpusReader, self).__init__(*args, **kwargs)
- # Find the swadesh size using the fileids' path.
- self.swadesh_size = re.match(r'swadesh([0-9].*)\/', self.fileids()[0]).group(1)
- self._languages = {lang.panlex_uid:lang for lang in self.get_languages()}
- self._macro_langauges = self.get_macrolanguages()
-
- def license(self):
- print('CC0 1.0 Universal')
-
- def readme(self):
- print(self.raw('README'))
-
- def language_codes(self):
- return self._languages.keys()
-
- def get_languages(self):
- for line in self.raw('langs{}.txt'.format(self.swadesh_size)).split('\n'):
- if not line.strip(): # Skip empty lines.
- continue
- yield PanlexLanguage(*line.strip().split('\t'))
-
- def get_macrolanguages(self):
- macro_langauges = defaultdict(list)
- for lang in self._languages.values():
- macro_langauges[lang.iso639].append(lang.panlex_uid)
- return macro_langauges
-
- def words_by_lang(self, lang_code):
- """
- :return: a list of list(str)
- """
- fileid = 'swadesh{}/{}.txt'.format(self.swadesh_size, lang_code)
- return [concept.split('\t') for concept in self.words(fileid)]
-
- def words_by_iso639(self, iso63_code):
- """
- :return: a list of list(str)
- """
- fileids = ['swadesh{}/{}.txt'.format(self.swadesh_size, lang_code)
- for lang_code in self._macro_langauges[iso63_code]]
- return [concept.split('\t') for fileid in fileids for concept in self.words(fileid)]
-
- def entries(self, fileids=None):
- """
- :return: a tuple of words for the specified fileids.
- """
- if not fileids:
- fileids = self.fileids()
-
- wordlists = [self.words(f) for f in fileids]
- return list(zip(*wordlists))
# Natural Language Toolkit:
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Piotr Kasprzyk <p.j.kasprzyk@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from six import string_types
+
from nltk.corpus.reader.api import *
from nltk.corpus.reader.xmldocs import XMLCorpusReader
-PARA = re.compile(r"<p(?: [^>]*){0,1}>(.*?)</p>")
-SENT = re.compile(r"<s(?: [^>]*){0,1}>(.*?)</s>")
+PARA = re.compile(r'<p(?: [^>]*){0,1}>(.*?)</p>')
+SENT = re.compile(r'<s(?: [^>]*){0,1}>(.*?)</s>')
-TAGGEDWORD = re.compile(r"<([wc](?: [^>]*){0,1}>)(.*?)</[wc]>")
-WORD = re.compile(r"<[wc](?: [^>]*){0,1}>(.*?)</[wc]>")
+TAGGEDWORD = re.compile(r'<([wc](?: [^>]*){0,1}>)(.*?)</[wc]>')
+WORD = re.compile(r'<[wc](?: [^>]*){0,1}>(.*?)</[wc]>')
TYPE = re.compile(r'type="(.*?)"')
ANA = re.compile(r'ana="(.*?)"')
class TEICorpusView(StreamBackedCorpusView):
- def __init__(
- self,
- corpus_file,
- tagged,
- group_by_sent,
- group_by_para,
- tagset=None,
- head_len=0,
- textids=None,
- ):
+ def __init__(self, corpus_file,
+ tagged, group_by_sent, group_by_para,
+ tagset=None, head_len=0, textids=None):
self._tagged = tagged
self._textids = textids
def read_block(self, stream):
block = stream.readlines(self._pagesize)
block = concat(block)
- while (block.count("<text id") > block.count("</text>")) or block.count(
- "<text id"
- ) == 0:
+ while (block.count('<text id') > block.count('</text>')) \
+ or block.count('<text id') == 0:
tmp = stream.readline()
if len(tmp) <= 0:
break
block += tmp
- block = block.replace("\n", "")
+ block = block.replace('\n', '')
textids = TEXTID.findall(block)
if self._textids:
for tid in textids:
if tid not in self._textids:
beg = block.find(tid) - 1
- end = block[beg:].find("</text>") + len("</text>")
- block = block[:beg] + block[beg + end :]
+ end = block[beg:].find('</text>') + len('</text>')
+ block = block[:beg] + block[beg + end:]
output = []
for para_str in PARA.findall(block):
if not self._tagged:
sent = WORD.findall(sent_str)
else:
- sent = list(map(self._parse_tag, TAGGEDWORD.findall(sent_str)))
+ sent = list(
+ map(self._parse_tag, TAGGEDWORD.findall(sent_str)))
if self._group_by_sent:
para.append(sent)
else:
def _parse_tag(self, tag_word_tuple):
(tag, word) = tag_word_tuple
- if tag.startswith("w"):
+ if tag.startswith('w'):
tag = ANA.search(tag).group(1)
else: # tag.startswith('c')
tag = TYPE.search(tag).group(1)
head_len = 2770
def __init__(self, *args, **kwargs):
- if "textid_file" in kwargs:
- self._textids = kwargs["textid_file"]
+ if 'textid_file' in kwargs:
+ self._textids = kwargs['textid_file']
else:
self._textids = None
with open(self._textids) as fp:
for line in fp:
line = line.strip()
- file_id, text_ids = line.split(" ", 1)
+ file_id, text_ids = line.split(' ', 1)
if file_id not in self.fileids():
raise ValueError(
- "In text_id mapping file %s: %s not found"
+ 'In text_id mapping file %s: %s not found'
% (self._textids, file_id)
)
for text_id in text_ids.split(self._delimiter):
def _resolve(self, fileids, categories, textids=None):
tmp = None
- if (
- len(list(
- filter(
- lambda accessor: accessor is None, (fileids, categories, textids)
- )
- ))
- != 1
- ):
-
- raise ValueError(
- "Specify exactly one of: fileids, " "categories or textids"
- )
+ if len(list(filter(lambda accessor: accessor is None,
+ (fileids, categories, textids)))) != 1:
+
+ raise ValueError('Specify exactly one of: fileids, '
+ 'categories or textids')
if fileids is not None:
return fileids, None
return self.fileids(categories), None
if textids is not None:
- if isinstance(textids, str):
+ if isinstance(textids, string_types):
textids = [textids]
files = sum((self._t2f[t] for t in textids), [])
tdict = dict()
for f in files:
- tdict[f] = set(self._f2t[f]) & set(textids)
+ tdict[f] = (set(self._f2t[f]) & set(textids))
return files, tdict
def decode_tag(self, tag):
of required chunks---giving much more control to the user.
"""
fileids, _ = self._resolve(fileids, categories)
- if fileids is None:
- return sorted(self._t2f)
+ if fileids is None: return sorted(self._t2f)
- if isinstance(fileids, str):
+ if isinstance(fileids, string_types):
fileids = [fileids]
return sorted(sum((self._f2t[d] for d in fileids), []))
fileids, textids = self._resolve(fileids, categories, textids)
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
if textids:
- return concat(
- [
- TEICorpusView(
- self.abspath(fileid),
- False,
- False,
- False,
- head_len=self.head_len,
- textids=textids[fileid],
- )
- for fileid in fileids
- ]
- )
+ return concat([TEICorpusView(self.abspath(fileid),
+ False, False, False,
+ head_len=self.head_len,
+ textids=textids[fileid])
+ for fileid in fileids])
else:
- return concat(
- [
- TEICorpusView(
- self.abspath(fileid),
- False,
- False,
- False,
- head_len=self.head_len,
- )
- for fileid in fileids
- ]
- )
+ return concat([TEICorpusView(self.abspath(fileid),
+ False, False, False,
+ head_len=self.head_len)
+ for fileid in fileids])
def sents(self, fileids=None, categories=None, textids=None):
fileids, textids = self._resolve(fileids, categories, textids)
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
if textids:
- return concat(
- [
- TEICorpusView(
- self.abspath(fileid),
- False,
- True,
- False,
- head_len=self.head_len,
- textids=textids[fileid],
- )
- for fileid in fileids
- ]
- )
+ return concat([TEICorpusView(self.abspath(fileid),
+ False, True, False,
+ head_len=self.head_len,
+ textids=textids[fileid])
+ for fileid in fileids])
else:
- return concat(
- [
- TEICorpusView(
- self.abspath(fileid), False, True, False, head_len=self.head_len
- )
- for fileid in fileids
- ]
- )
+ return concat([TEICorpusView(self.abspath(fileid),
+ False, True, False,
+ head_len=self.head_len)
+ for fileid in fileids])
def paras(self, fileids=None, categories=None, textids=None):
fileids, textids = self._resolve(fileids, categories, textids)
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
if textids:
- return concat(
- [
- TEICorpusView(
- self.abspath(fileid),
- False,
- True,
- True,
- head_len=self.head_len,
- textids=textids[fileid],
- )
- for fileid in fileids
- ]
- )
+ return concat([TEICorpusView(self.abspath(fileid),
+ False, True, True,
+ head_len=self.head_len,
+ textids=textids[fileid])
+ for fileid in fileids])
else:
- return concat(
- [
- TEICorpusView(
- self.abspath(fileid), False, True, True, head_len=self.head_len
- )
- for fileid in fileids
- ]
- )
+ return concat([TEICorpusView(self.abspath(fileid),
+ False, True, True,
+ head_len=self.head_len)
+ for fileid in fileids])
def tagged_words(self, fileids=None, categories=None, textids=None):
fileids, textids = self._resolve(fileids, categories, textids)
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
if textids:
- return concat(
- [
- TEICorpusView(
- self.abspath(fileid),
- True,
- False,
- False,
- head_len=self.head_len,
- textids=textids[fileid],
- )
- for fileid in fileids
- ]
- )
+ return concat([TEICorpusView(self.abspath(fileid),
+ True, False, False,
+ head_len=self.head_len,
+ textids=textids[fileid])
+ for fileid in fileids])
else:
- return concat(
- [
- TEICorpusView(
- self.abspath(fileid), True, False, False, head_len=self.head_len
- )
- for fileid in fileids
- ]
- )
+ return concat([TEICorpusView(self.abspath(fileid),
+ True, False, False,
+ head_len=self.head_len)
+ for fileid in fileids])
def tagged_sents(self, fileids=None, categories=None, textids=None):
fileids, textids = self._resolve(fileids, categories, textids)
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
if textids:
- return concat(
- [
- TEICorpusView(
- self.abspath(fileid),
- True,
- True,
- False,
- head_len=self.head_len,
- textids=textids[fileid],
- )
- for fileid in fileids
- ]
- )
+ return concat([TEICorpusView(self.abspath(fileid),
+ True, True, False,
+ head_len=self.head_len,
+ textids=textids[fileid])
+ for fileid in fileids])
else:
- return concat(
- [
- TEICorpusView(
- self.abspath(fileid), True, True, False, head_len=self.head_len
- )
- for fileid in fileids
- ]
- )
+ return concat([TEICorpusView(self.abspath(fileid),
+ True, True, False,
+ head_len=self.head_len)
+ for fileid in fileids])
def tagged_paras(self, fileids=None, categories=None, textids=None):
fileids, textids = self._resolve(fileids, categories, textids)
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
if textids:
- return concat(
- [
- TEICorpusView(
- self.abspath(fileid),
- True,
- True,
- True,
- head_len=self.head_len,
- textids=textids[fileid],
- )
- for fileid in fileids
- ]
- )
+ return concat([TEICorpusView(self.abspath(fileid),
+ True, True, True,
+ head_len=self.head_len,
+ textids=textids[fileid])
+ for fileid in fileids])
else:
- return concat(
- [
- TEICorpusView(
- self.abspath(fileid), True, True, True, head_len=self.head_len
- )
- for fileid in fileids
- ]
- )
+ return concat([TEICorpusView(self.abspath(fileid),
+ True, True, True,
+ head_len=self.head_len)
+ for fileid in fileids])
def xml(self, fileids=None, categories=None):
fileids, _ = self._resolve(fileids, categories)
if len(fileids) == 1:
return XMLCorpusReader.xml(self, fileids[0])
else:
- raise TypeError("Expected a single file")
+ raise TypeError('Expected a single file')
def raw(self, fileids=None, categories=None):
fileids, _ = self._resolve(fileids, categories)
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
# Natural Language Toolkit: Plaintext Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# Nitin Madnani <nmadnani@umiacs.umd.edu>
A reader for corpora that consist of plaintext documents.
"""
+from six import string_types
+import codecs
+
import nltk.data
from nltk.tokenize import *
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
-
class PlaintextCorpusReader(CorpusReader):
"""
Reader for corpora that consist of plaintext documents. Paragraphs
``PlaintextCorpusReader`` may specify alternative corpus view
classes (e.g., to skip the preface sections of documents.)"""
- def __init__(
- self,
- root,
- fileids,
- word_tokenizer=WordPunctTokenizer(),
- sent_tokenizer=nltk.data.LazyLoader("tokenizers/punkt/english.pickle"),
- para_block_reader=read_blankline_block,
- encoding="utf8",
- ):
+ def __init__(self, root, fileids,
+ word_tokenizer=WordPunctTokenizer(),
+ sent_tokenizer=nltk.data.LazyLoader(
+ 'tokenizers/punkt/english.pickle'),
+ para_block_reader=read_blankline_block,
+ encoding='utf8'):
"""
Construct a new plaintext corpus reader for a set of documents
located at the given root directory. Example usage:
:return: the given file(s) as a single string.
:rtype: str
"""
- if fileids is None:
- fileids = self._fileids
- elif isinstance(fileids, str):
- fileids = [fileids]
+ if fileids is None: fileids = self._fileids
+ elif isinstance(fileids, string_types): fileids = [fileids]
raw_texts = []
for f in fileids:
_fin = self.open(f)
and punctuation symbols.
:rtype: list(str)
"""
- return concat(
- [
- self.CorpusView(path, self._read_word_block, encoding=enc)
- for (path, enc, fileid) in self.abspaths(fileids, True, True)
- ]
- )
+ return concat([self.CorpusView(path, self._read_word_block, encoding=enc)
+ for (path, enc, fileid)
+ in self.abspaths(fileids, True, True)])
def sents(self, fileids=None):
"""
:rtype: list(list(str))
"""
if self._sent_tokenizer is None:
- raise ValueError("No sentence tokenizer for this corpus")
+ raise ValueError('No sentence tokenizer for this corpus')
- return concat(
- [
- self.CorpusView(path, self._read_sent_block, encoding=enc)
- for (path, enc, fileid) in self.abspaths(fileids, True, True)
- ]
- )
+ return concat([self.CorpusView(path, self._read_sent_block, encoding=enc)
+ for (path, enc, fileid)
+ in self.abspaths(fileids, True, True)])
def paras(self, fileids=None):
"""
:rtype: list(list(list(str)))
"""
if self._sent_tokenizer is None:
- raise ValueError("No sentence tokenizer for this corpus")
+ raise ValueError('No sentence tokenizer for this corpus')
- return concat(
- [
- self.CorpusView(path, self._read_para_block, encoding=enc)
- for (path, enc, fileid) in self.abspaths(fileids, True, True)
- ]
- )
+ return concat([self.CorpusView(path, self._read_para_block, encoding=enc)
+ for (path, enc, fileid)
+ in self.abspaths(fileids, True, True)])
def _read_word_block(self, stream):
words = []
- for i in range(20): # Read 20 lines at a time.
+ for i in range(20): # Read 20 lines at a time.
words.extend(self._word_tokenizer.tokenize(stream.readline()))
return words
def _read_sent_block(self, stream):
sents = []
for para in self._para_block_reader(stream):
- sents.extend(
- [
- self._word_tokenizer.tokenize(sent)
- for sent in self._sent_tokenizer.tokenize(para)
- ]
- )
+ sents.extend([self._word_tokenizer.tokenize(sent)
+ for sent in self._sent_tokenizer.tokenize(para)])
return sents
def _read_para_block(self, stream):
paras = []
for para in self._para_block_reader(stream):
- paras.append(
- [
- self._word_tokenizer.tokenize(sent)
- for sent in self._sent_tokenizer.tokenize(para)
- ]
- )
+ paras.append([self._word_tokenizer.tokenize(sent)
+ for sent in self._sent_tokenizer.tokenize(para)])
return paras
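# Editor's note: constructing a PlaintextCorpusReader over a local directory;
# the root path and filename pattern below are hypothetical:

from nltk.corpus.reader import PlaintextCorpusReader

reader = PlaintextCorpusReader('/tmp/mycorpus', r'.*\.txt')
print(reader.fileids())      # every .txt file under the root
print(reader.words()[:10])   # tokenized with WordPunctTokenizer by default
print(reader.sents()[0])     # split with the punkt English sentence model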
-class CategorizedPlaintextCorpusReader(CategorizedCorpusReader, PlaintextCorpusReader):
+class CategorizedPlaintextCorpusReader(CategorizedCorpusReader,
+ PlaintextCorpusReader):
"""
A reader for plaintext corpora whose documents are divided into
categories based on their file identifiers.
"""
-
def __init__(self, *args, **kwargs):
"""
Initialize the corpus reader. Categorization arguments
def _resolve(self, fileids, categories):
if fileids is not None and categories is not None:
- raise ValueError("Specify fileids or categories, not both")
+ raise ValueError('Specify fileids or categories, not both')
if categories is not None:
return self.fileids(categories)
else:
return fileids
-
def raw(self, fileids=None, categories=None):
- return PlaintextCorpusReader.raw(self, self._resolve(fileids, categories))
-
+ return PlaintextCorpusReader.raw(
+ self, self._resolve(fileids, categories))
def words(self, fileids=None, categories=None):
- return PlaintextCorpusReader.words(self, self._resolve(fileids, categories))
-
+ return PlaintextCorpusReader.words(
+ self, self._resolve(fileids, categories))
def sents(self, fileids=None, categories=None):
- return PlaintextCorpusReader.sents(self, self._resolve(fileids, categories))
-
+ return PlaintextCorpusReader.sents(
+ self, self._resolve(fileids, categories))
def paras(self, fileids=None, categories=None):
- return PlaintextCorpusReader.paras(self, self._resolve(fileids, categories))
+ return PlaintextCorpusReader.paras(
+ self, self._resolve(fileids, categories))
-
-# FIXME: Is there a better way? How to not hardcode this?
-# Possibly, add a language kwargs to CategorizedPlaintextCorpusReader to
-# override the `sent_tokenizer`.
+# FIXME: is there a better way than hardcoding the tokenizer language here?
class PortugueseCategorizedPlaintextCorpusReader(CategorizedPlaintextCorpusReader):
def __init__(self, *args, **kwargs):
CategorizedCorpusReader.__init__(self, kwargs)
- kwargs["sent_tokenizer"] = nltk.data.LazyLoader(
- "tokenizers/punkt/portuguese.pickle"
- )
+ kwargs['sent_tokenizer'] = nltk.data.LazyLoader('tokenizers/punkt/portuguese.pickle')
PlaintextCorpusReader.__init__(self, *args, **kwargs)
-
class EuroparlCorpusReader(PlaintextCorpusReader):
"""
def _read_word_block(self, stream):
words = []
- for i in range(20): # Read 20 lines at a time.
+ for i in range(20): # Read 20 lines at a time.
words.extend(stream.readline().split())
return words
in turn encoded as lists of word strings.
:rtype: list(list(list(str)))
"""
- return concat(
- [
- self.CorpusView(fileid, self._read_para_block, encoding=enc)
- for (fileid, enc) in self.abspaths(fileids, True)
- ]
- )
+ return concat([self.CorpusView(fileid, self._read_para_block,
+ encoding=enc)
+ for (fileid, enc) in self.abspaths(fileids, True)])
def paras(self, fileids=None):
- raise NotImplementedError(
- "The Europarl corpus reader does not support paragraphs. Please use chapters() instead."
- )
+ raise NotImplementedError('The Europarl corpus reader does not support paragraphs. Please use chapters() instead.')
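# Editor's note: as the exception above says, the Europarl reader groups text
# into chapters rather than paragraphs. A sketch, assuming the 'europarl_raw'
# data package:

from nltk.corpus import europarl_raw

chapter = europarl_raw.english.chapters()[0]   # a chapter: list of sentences
print(chapter[0])                              # a sentence: list of words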
# Natural Language Toolkit: PP Attachment Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
The PP Attachment Corpus is distributed with NLTK with the permission
of the author.
"""
+from __future__ import unicode_literals
+from six import string_types
+
+from nltk import compat
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
+@compat.python_2_unicode_compatible
class PPAttachment(object):
def __init__(self, sent, verb, noun1, prep, noun2, attachment):
self.sent = sent
self.attachment = attachment
def __repr__(self):
- return (
- "PPAttachment(sent=%r, verb=%r, noun1=%r, prep=%r, "
- "noun2=%r, attachment=%r)"
- % (self.sent, self.verb, self.noun1, self.prep, self.noun2, self.attachment)
- )
-
+ return ('PPAttachment(sent=%r, verb=%r, noun1=%r, prep=%r, '
+ 'noun2=%r, attachment=%r)' %
+ (self.sent, self.verb, self.noun1, self.prep,
+ self.noun2, self.attachment))
class PPAttachmentCorpusReader(CorpusReader):
"""
sentence_id verb noun1 preposition noun2 attachment
"""
-
def attachments(self, fileids):
- return concat(
- [
- StreamBackedCorpusView(fileid, self._read_obj_block, encoding=enc)
- for (fileid, enc) in self.abspaths(fileids, True)
- ]
- )
+ return concat([StreamBackedCorpusView(fileid, self._read_obj_block,
+ encoding=enc)
+ for (fileid, enc) in self.abspaths(fileids, True)])
def tuples(self, fileids):
- return concat(
- [
- StreamBackedCorpusView(fileid, self._read_tuple_block, encoding=enc)
- for (fileid, enc) in self.abspaths(fileids, True)
- ]
- )
+ return concat([StreamBackedCorpusView(fileid, self._read_tuple_block,
+ encoding=enc)
+ for (fileid, enc) in self.abspaths(fileids, True)])
def raw(self, fileids=None):
- if fileids is None:
- fileids = self._fileids
- elif isinstance(fileids, str):
- fileids = [fileids]
+ if fileids is None: fileids = self._fileids
+ elif isinstance(fileids, string_types): fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
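# Editor's note: a usage sketch for the PP attachment reader, assuming the
# 'ppattach' data package with its standard 'training' fileid:

from nltk.corpus import ppattach

inst = ppattach.attachments('training')[0]
# attachment is 'N' (noun attachment) or 'V' (verb attachment)
print(inst.verb, inst.noun1, inst.prep, inst.noun2, '->', inst.attachment)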
def _read_tuple_block(self, stream):
# Natural Language Toolkit: PropBank Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import unicode_literals
import re
from functools import total_ordering
from xml.etree import ElementTree
+from six import string_types
+
from nltk.tree import Tree
from nltk.internals import raise_unorderable_types
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
-
class PropbankCorpusReader(CorpusReader):
"""
Corpus reader for the propbank corpus, which augments the Penn
each "roleset", the frameset file provides descriptions of the
argument roles, along with examples.
"""
-
- def __init__(
- self,
- root,
- propfile,
- framefiles="",
- verbsfile=None,
- parse_fileid_xform=None,
- parse_corpus=None,
- encoding="utf8",
- ):
+ def __init__(self, root, propfile, framefiles='',
+ verbsfile=None, parse_fileid_xform=None,
+ parse_corpus=None, encoding='utf8'):
"""
:param root: The root directory for this corpus.
:param propfile: The name of the file containing the predicate-
necessary to resolve the tree pointers used by propbank.
"""
# If framefiles is specified as a regexp, expand it.
- if isinstance(framefiles, str):
+ if isinstance(framefiles, string_types):
framefiles = find_corpus_fileids(root, framefiles)
framefiles = list(framefiles)
# Initialize the corpus reader.
- CorpusReader.__init__(self, root, [propfile, verbsfile] + framefiles, encoding)
+ CorpusReader.__init__(self, root, [propfile, verbsfile] + framefiles,
+ encoding)
# Record our frame fileids & prop file.
self._propfile = propfile
"""
:return: the text contents of the given fileids, as a single string.
"""
- if fileids is None:
- fileids = self._fileids
- elif isinstance(fileids, str):
- fileids = [fileids]
+ if fileids is None: fileids = self._fileids
+ elif isinstance(fileids, string_types): fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
def instances(self, baseform=None):
"""
kwargs = {}
if baseform is not None:
- kwargs["instance_filter"] = lambda inst: inst.baseform == baseform
- return StreamBackedCorpusView(
- self.abspath(self._propfile),
- lambda stream: self._read_instance_block(stream, **kwargs),
- encoding=self.encoding(self._propfile),
- )
+ kwargs['instance_filter'] = lambda inst: inst.baseform==baseform
+ return StreamBackedCorpusView(self.abspath(self._propfile),
+ lambda stream: self._read_instance_block(stream, **kwargs),
+ encoding=self.encoding(self._propfile))
def lines(self):
"""
:return: a corpus view that acts as a list of strings, one for
each line in the predicate-argument annotation file.
"""
- return StreamBackedCorpusView(
- self.abspath(self._propfile),
- read_line_block,
- encoding=self.encoding(self._propfile),
- )
+ return StreamBackedCorpusView(self.abspath(self._propfile),
+ read_line_block,
+ encoding=self.encoding(self._propfile))
def roleset(self, roleset_id):
"""
:return: the xml description for the given roleset.
"""
- baseform = roleset_id.split(".")[0]
- framefile = "frames/%s.xml" % baseform
+ baseform = roleset_id.split('.')[0]
+ framefile = 'frames/%s.xml' % baseform
if framefile not in self._framefiles:
- raise ValueError("Frameset file for %s not found" % roleset_id)
+ raise ValueError('Frameset file for %s not found' %
+ roleset_id)
# n.b.: The encoding for XML fileids is specified by the file
# itself; so we ignore self._encoding here.
etree = ElementTree.parse(self.abspath(framefile).open()).getroot()
- for roleset in etree.findall("predicate/roleset"):
- if roleset.attrib["id"] == roleset_id:
+ for roleset in etree.findall('predicate/roleset'):
+ if roleset.attrib['id'] == roleset_id:
return roleset
- raise ValueError("Roleset %s not found in %s" % (roleset_id, framefile))
+ raise ValueError('Roleset %s not found in %s' % (roleset_id, framefile))
def rolesets(self, baseform=None):
"""
:return: list of xml descriptions for rolesets.
"""
if baseform is not None:
- framefile = "frames/%s.xml" % baseform
+ framefile = 'frames/%s.xml' % baseform
if framefile not in self._framefiles:
- raise ValueError("Frameset file for %s not found" % baseform)
+ raise ValueError('Frameset file for %s not found' %
+ baseform)
framefiles = [framefile]
else:
framefiles = self._framefiles
# n.b.: The encoding for XML fileids is specified by the file
# itself; so we ignore self._encoding here.
etree = ElementTree.parse(self.abspath(framefile).open()).getroot()
- rsets.append(etree.findall("predicate/roleset"))
+ rsets.append(etree.findall('predicate/roleset'))
return LazyConcatenation(rsets)
def verbs(self):
:return: a corpus view that acts as a list of all verb lemmas
in this corpus (from the verbs.txt file).
"""
- return StreamBackedCorpusView(
- self.abspath(self._verbsfile),
- read_line_block,
- encoding=self.encoding(self._verbsfile),
- )
+ return StreamBackedCorpusView(self.abspath(self._verbsfile),
+ read_line_block,
+ encoding=self.encoding(self._verbsfile))
def _read_instance_block(self, stream, instance_filter=lambda inst: True):
block = []
line = stream.readline().strip()
if line:
inst = PropbankInstance.parse(
- line, self._parse_fileid_xform, self._parse_corpus
- )
+ line, self._parse_fileid_xform,
+ self._parse_corpus)
if instance_filter(inst):
block.append(inst)
return block
-
######################################################################
-# { Propbank Instance & related datatypes
+#{ Propbank Instance & related datatypes
######################################################################
-
-
+@compat.python_2_unicode_compatible
class PropbankInstance(object):
- def __init__(
- self,
- fileid,
- sentnum,
- wordnum,
- tagger,
- roleset,
- inflection,
- predicate,
- arguments,
- parse_corpus=None,
- ):
+
+ def __init__(self, fileid, sentnum, wordnum, tagger, roleset,
+ inflection, predicate, arguments, parse_corpus=None):
self.fileid = fileid
"""The name of the file containing the parse tree for this
@property
def baseform(self):
"""The baseform of the predicate."""
- return self.roleset.split(".")[0]
+ return self.roleset.split('.')[0]
@property
def sensenumber(self):
"""The sense number of the predicate."""
- return self.roleset.split(".")[1]
+ return self.roleset.split('.')[1]
@property
def predid(self):
"""Identifier of the predicate."""
- return "rel"
+ return 'rel'
def __repr__(self):
- return "<PropbankInstance: %s, sent %s, word %s>" % (
- self.fileid,
- self.sentnum,
- self.wordnum,
- )
+ return ('<PropbankInstance: %s, sent %s, word %s>' %
+ (self.fileid, self.sentnum, self.wordnum))
def __str__(self):
- s = "%s %s %s %s %s %s" % (
- self.fileid,
- self.sentnum,
- self.wordnum,
- self.tagger,
- self.roleset,
- self.inflection,
- )
- items = self.arguments + ((self.predicate, "rel"),)
+ s = '%s %s %s %s %s %s' % (self.fileid, self.sentnum, self.wordnum,
+ self.tagger, self.roleset, self.inflection)
+ items = self.arguments + ((self.predicate, 'rel'),)
for (argloc, argid) in sorted(items):
- s += " %s-%s" % (argloc, argid)
+ s += ' %s-%s' % (argloc, argid)
return s
def _get_tree(self):
- if self.parse_corpus is None:
- return None
- if self.fileid not in self.parse_corpus.fileids():
- return None
+ if self.parse_corpus is None: return None
+ if self.fileid not in self.parse_corpus.fileids(): return None
return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum]
-
- tree = property(
- _get_tree,
- doc="""
+ tree = property(_get_tree, doc="""
The parse tree corresponding to this instance, or None if
- the corresponding tree is not available.""",
- )
+ the corresponding tree is not available.""")
@staticmethod
def parse(s, parse_fileid_xform=None, parse_corpus=None):
pieces = s.split()
if len(pieces) < 7:
- raise ValueError("Badly formatted propbank line: %r" % s)
+ raise ValueError('Badly formatted propbank line: %r' % s)
# Divide the line into its basic pieces.
- (fileid, sentnum, wordnum, tagger, roleset, inflection) = pieces[:6]
- rel = [p for p in pieces[6:] if p.endswith("-rel")]
- args = [p for p in pieces[6:] if not p.endswith("-rel")]
+ (fileid, sentnum, wordnum,
+ tagger, roleset, inflection) = pieces[:6]
+ rel = [p for p in pieces[6:] if p.endswith('-rel')]
+ args = [p for p in pieces[6:] if not p.endswith('-rel')]
if len(rel) != 1:
- raise ValueError("Badly formatted propbank line: %r" % s)
+ raise ValueError('Badly formatted propbank line: %r' % s)
# Apply the fileid selector, if any.
if parse_fileid_xform is not None:
# Parse the arguments.
arguments = []
for arg in args:
- argloc, argid = arg.split("-", 1)
- arguments.append((PropbankTreePointer.parse(argloc), argid))
+ argloc, argid = arg.split('-', 1)
+ arguments.append( (PropbankTreePointer.parse(argloc), argid) )
# Put it all together.
- return PropbankInstance(
- fileid,
- sentnum,
- wordnum,
- tagger,
- roleset,
- inflection,
- predicate,
- arguments,
- parse_corpus,
- )
-
+ return PropbankInstance(fileid, sentnum, wordnum, tagger,
+ roleset, inflection, predicate,
+ arguments, parse_corpus)
class PropbankPointer(object):
"""
chains in a tree. It consists of a sequence of pieces, which
can be ``PropbankTreePointer`` or ``PropbankSplitTreePointer`` pointers.
"""
-
def __init__(self):
if self.__class__ == PropbankPointer:
raise NotImplementedError()
-
-
+@compat.python_2_unicode_compatible
class PropbankChainTreePointer(PropbankPointer):
def __init__(self, pieces):
self.pieces = pieces
``PropbankTreePointer`` pointers."""
def __str__(self):
- return "*".join("%s" % p for p in self.pieces)
-
+ return '*'.join('%s' % p for p in self.pieces)
def __repr__(self):
- return "<PropbankChainTreePointer: %s>" % self
-
+ return '<PropbankChainTreePointer: %s>' % self
def select(self, tree):
- if tree is None:
- raise ValueError("Parse tree not avaialable")
- return Tree("*CHAIN*", [p.select(tree) for p in self.pieces])
-
+ if tree is None: raise ValueError('Parse tree not available')
+ return Tree('*CHAIN*', [p.select(tree) for p in self.pieces])
+@compat.python_2_unicode_compatible
class PropbankSplitTreePointer(PropbankPointer):
def __init__(self, pieces):
self.pieces = pieces
all ``PropbankTreePointer`` pointers."""
def __str__(self):
- return ",".join("%s" % p for p in self.pieces)
-
+ return ','.join('%s' % p for p in self.pieces)
def __repr__(self):
- return "<PropbankSplitTreePointer: %s>" % self
-
+ return '<PropbankSplitTreePointer: %s>' % self
def select(self, tree):
- if tree is None:
- raise ValueError("Parse tree not avaialable")
- return Tree("*SPLIT*", [p.select(tree) for p in self.pieces])
+ if tree is None: raise ValueError('Parse tree not available')
+ return Tree('*SPLIT*', [p.select(tree) for p in self.pieces])
@total_ordering
-
+@compat.python_2_unicode_compatible
class PropbankTreePointer(PropbankPointer):
"""
wordnum:height*wordnum:height*...
wordnum:height,
"""
-
def __init__(self, wordnum, height):
self.wordnum = wordnum
self.height = height
@staticmethod
def parse(s):
# Deal with chains (xx*yy*zz)
- pieces = s.split("*")
+ pieces = s.split('*')
if len(pieces) > 1:
- return PropbankChainTreePointer(
- [PropbankTreePointer.parse(elt) for elt in pieces]
- )
+ return PropbankChainTreePointer([PropbankTreePointer.parse(elt)
+ for elt in pieces])
# Deal with split args (xx,yy,zz)
- pieces = s.split(",")
+ pieces = s.split(',')
if len(pieces) > 1:
- return PropbankSplitTreePointer(
- [PropbankTreePointer.parse(elt) for elt in pieces]
- )
+ return PropbankSplitTreePointer([PropbankTreePointer.parse(elt)
+ for elt in pieces])
# Deal with normal pointers.
- pieces = s.split(":")
- if len(pieces) != 2:
- raise ValueError("bad propbank pointer %r" % s)
+ pieces = s.split(':')
+ if len(pieces) != 2: raise ValueError('bad propbank pointer %r' % s)
return PropbankTreePointer(int(pieces[0]), int(pieces[1]))
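# Editor's note: the pointer grammar handled by parse() above, sketched:

from nltk.corpus.reader.propbank import (
    PropbankTreePointer, PropbankChainTreePointer, PropbankSplitTreePointer)

p = PropbankTreePointer.parse('2:1')      # word 2, one constituent up
c = PropbankTreePointer.parse('0:1*3:0')  # '*' joins a trace chain
s = PropbankTreePointer.parse('0:1,3:0')  # ',' joins a split argument
print(p, type(c).__name__, type(s).__name__)
# -> 2:1 PropbankChainTreePointer PropbankSplitTreePointer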
def __str__(self):
- return "%s:%s" % (self.wordnum, self.height)
+ return '%s:%s' % (self.wordnum, self.height)
def __repr__(self):
- return "PropbankTreePointer(%d, %d)" % (self.wordnum, self.height)
+ return 'PropbankTreePointer(%d, %d)' % (self.wordnum, self.height)
def __eq__(self, other):
- while isinstance(other, (PropbankChainTreePointer, PropbankSplitTreePointer)):
+ while isinstance(other, (PropbankChainTreePointer,
+ PropbankSplitTreePointer)):
other = other.pieces[0]
if not isinstance(other, PropbankTreePointer):
return self is other
- return self.wordnum == other.wordnum and self.height == other.height
+ return (self.wordnum == other.wordnum and self.height == other.height)
def __ne__(self, other):
return not self == other
def __lt__(self, other):
- while isinstance(other, (PropbankChainTreePointer, PropbankSplitTreePointer)):
+ while isinstance(other, (PropbankChainTreePointer,
+ PropbankSplitTreePointer)):
other = other.pieces[0]
if not isinstance(other, PropbankTreePointer):
return (self.wordnum, -self.height) < (other.wordnum, -other.height)
def select(self, tree):
- if tree is None:
- raise ValueError("Parse tree not avaialable")
+ if tree is None: raise ValueError('Parse tree not available')
return tree[self.treepos(tree)]
def treepos(self, tree):
Convert this pointer to a standard 'tree position' pointer,
given that it points to the given tree.
"""
- if tree is None:
- raise ValueError("Parse tree not avaialable")
+ if tree is None: raise ValueError('Parse tree not available')
stack = [tree]
treepos = []
wordnum = 0
while True:
+ #print treepos
+ #print stack[-1]
# tree node:
if isinstance(stack[-1], Tree):
# Select the next child.
# word node:
else:
if wordnum == self.wordnum:
- return tuple(treepos[: len(treepos) - self.height - 1])
+ return tuple(treepos[:len(treepos)-self.height-1])
else:
wordnum += 1
stack.pop()
-
-
+@compat.python_2_unicode_compatible
class PropbankInflection(object):
- # { Inflection Form
- INFINITIVE = "i"
- GERUND = "g"
- PARTICIPLE = "p"
- FINITE = "v"
- # { Inflection Tense
- FUTURE = "f"
- PAST = "p"
- PRESENT = "n"
- # { Inflection Aspect
- PERFECT = "p"
- PROGRESSIVE = "o"
- PERFECT_AND_PROGRESSIVE = "b"
- # { Inflection Person
- THIRD_PERSON = "3"
- # { Inflection Voice
- ACTIVE = "a"
- PASSIVE = "p"
- # { Inflection
- NONE = "-"
- # }
-
- def __init__(self, form="-", tense="-", aspect="-", person="-", voice="-"):
+ #{ Inflection Form
+ INFINITIVE = 'i'
+ GERUND = 'g'
+ PARTICIPLE = 'p'
+ FINITE = 'v'
+ #{ Inflection Tense
+ FUTURE = 'f'
+ PAST = 'p'
+ PRESENT = 'n'
+ #{ Inflection Aspect
+ PERFECT = 'p'
+ PROGRESSIVE = 'o'
+ PERFECT_AND_PROGRESSIVE = 'b'
+ #{ Inflection Person
+ THIRD_PERSON = '3'
+ #{ Inflection Voice
+ ACTIVE = 'a'
+ PASSIVE = 'p'
+ #{ Inflection
+ NONE = '-'
+ #}
+
+ def __init__(self, form='-', tense='-', aspect='-', person='-', voice='-'):
self.form = form
self.tense = tense
self.aspect = aspect
self.voice = voice
def __str__(self):
- return self.form + self.tense + self.aspect + self.person + self.voice
+ return self.form+self.tense+self.aspect+self.person+self.voice
def __repr__(self):
- return "<PropbankInflection: %s>" % self
+ return '<PropbankInflection: %s>' % self
- _VALIDATE = re.compile(r"[igpv\-][fpn\-][pob\-][3\-][ap\-]$")
+ _VALIDATE = re.compile(r'[igpv\-][fpn\-][pob\-][3\-][ap\-]$')
@staticmethod
def parse(s):
- if not isinstance(s, str):
- raise TypeError("expected a string")
- if len(s) != 5 or not PropbankInflection._VALIDATE.match(s):
- raise ValueError("Bad propbank inflection string %r" % s)
+ if not isinstance(s, string_types):
+ raise TypeError('expected a string')
+ if (len(s) != 5 or
+ not PropbankInflection._VALIDATE.match(s)):
+ raise ValueError('Bad propbank inflection string %r' % s)
return PropbankInflection(*s)
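# Editor's note: the five positions of an inflection string are form, tense,
# aspect, person, voice, each a single character or '-'. A sketch:

from nltk.corpus.reader.propbank import PropbankInflection

infl = PropbankInflection.parse('vp--a')   # finite, past, active
print(infl.form, infl.tense, infl.voice)   # v p a
print(str(infl))                           # vp--a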
# Natural Language Toolkit: Pros and Cons Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
import re
+from six import string_types
+
from nltk.corpus.reader.api import *
from nltk.tokenize import *
>>> pros_cons.words('IntegratedPros.txt')
['Easy', 'to', 'use', ',', 'economical', '!', ...]
"""
-
CorpusView = StreamBackedCorpusView
- def __init__(
- self,
- root,
- fileids,
- word_tokenizer=WordPunctTokenizer(),
- encoding="utf8",
- **kwargs
- ):
+ def __init__(self, root, fileids, word_tokenizer=WordPunctTokenizer(),
+ encoding='utf8', **kwargs):
"""
:param root: The root directory for the corpus.
:param fileids: a list or regexp specifying the fileids in the corpus.
fileids = self._resolve(fileids, categories)
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
- return concat(
- [
- self.CorpusView(path, self._read_sent_block, encoding=enc)
- for (path, enc, fileid) in self.abspaths(fileids, True, True)
- ]
- )
+ return concat([self.CorpusView(path, self._read_sent_block, encoding=enc)
+ for (path, enc, fileid) in self.abspaths(fileids, True, True)])
def words(self, fileids=None, categories=None):
"""
fileids = self._resolve(fileids, categories)
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
- return concat(
- [
- self.CorpusView(path, self._read_word_block, encoding=enc)
- for (path, enc, fileid) in self.abspaths(fileids, True, True)
- ]
- )
+ return concat([self.CorpusView(path, self._read_word_block, encoding=enc)
+ for (path, enc, fileid) in self.abspaths(fileids, True, True)])
def _read_sent_block(self, stream):
sents = []
- for i in range(20): # Read 20 lines at a time.
+ for i in range(20): # Read 20 lines at a time.
line = stream.readline()
if not line:
continue
def _resolve(self, fileids, categories):
if fileids is not None and categories is not None:
- raise ValueError("Specify fileids or categories, not both")
+ raise ValueError('Specify fileids or categories, not both')
if categories is not None:
return self.fileids(categories)
else:
# Natural Language Toolkit: Product Reviews Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
consideration.
"""
+from __future__ import division
+
+from six import string_types
+
import re
from nltk.corpus.reader.api import *
from nltk.tokenize import *
-TITLE = re.compile(r"^\[t\](.*)$") # [t] Title
-FEATURES = re.compile(
- r"((?:(?:\w+\s)+)?\w+)\[((?:\+|\-)\d)\]"
-) # find 'feature' in feature[+3]
-NOTES = re.compile(r"\[(?!t)(p|u|s|cc|cs)\]") # find 'p' in camera[+2][p]
-SENT = re.compile(r"##(.*)$") # find tokenized sentence
+TITLE = re.compile(r'^\[t\](.*)$') # [t] Title
+FEATURES = re.compile(r'((?:(?:\w+\s)+)?\w+)\[((?:\+|\-)\d)\]') # find 'feature' in feature[+3]
+NOTES = re.compile(r'\[(?!t)(p|u|s|cc|cs)\]') # find 'p' in camera[+2][p]
+SENT = re.compile(r'##(.*)$') # find tokenized sentence
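# Editor's note: what the FEATURES pattern extracts, on a synthetic line:

import re
FEATURES = re.compile(r'((?:(?:\w+\s)+)?\w+)\[((?:\+|\-)\d)\]')
print(FEATURES.findall('picture quality[+2], battery life[-1]'))
# -> [('picture quality', '+2'), ('battery life', '-1')]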
+@compat.python_2_unicode_compatible
class Review(object):
"""
A Review is the main block of a ReviewsCorpusReader.
"""
-
def __init__(self, title=None, review_lines=None):
"""
:param title: the title of the review.
return [review_line.sent for review_line in self.review_lines]
def __repr__(self):
- return 'Review(title="{}", review_lines={})'.format(
- self.title, self.review_lines
- )
+ return 'Review(title=\"{}\", review_lines={})'.format(self.title, self.review_lines)
+@compat.python_2_unicode_compatible
class ReviewLine(object):
"""
A ReviewLine represents a sentence of the review, together with (optional)
annotations of its features and notes about the reviewed item.
"""
-
def __init__(self, sent, features=None, notes=None):
self.sent = sent
if features is None:
self.notes = notes
def __repr__(self):
- return "ReviewLine(features={}, notes={}, sent={})".format(
- self.features, self.notes, self.sent
- )
+ return ('ReviewLine(features={}, notes={}, sent={})'.format(
+ self.features, self.notes, self.sent))
class ReviewsCorpusReader(CorpusReader):
We can compute stats for specific product features:
+ >>> from __future__ import division
>>> n_reviews = len([(feat,score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
>>> tot = sum([int(score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
+ >>> # __future__ division keeps the ratio below a float on Python 2.7 too
>>> mean = tot / n_reviews
>>> print(n_reviews, tot, mean)
15 24 1.6
"""
-
CorpusView = StreamBackedCorpusView
- def __init__(
- self, root, fileids, word_tokenizer=WordPunctTokenizer(), encoding="utf8"
- ):
+ def __init__(self, root, fileids, word_tokenizer=WordPunctTokenizer(),
+ encoding='utf8'):
"""
:param root: The root directory for the corpus.
:param fileids: a list or regexp specifying the fileids in the corpus.
"""
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
- return concat(
- [
- self.CorpusView(fileid, self._read_features, encoding=enc)
- for (fileid, enc) in self.abspaths(fileids, True)
- ]
- )
+ return concat([self.CorpusView(fileid, self._read_features, encoding=enc)
+ for (fileid, enc) in self.abspaths(fileids, True)])
def raw(self, fileids=None):
"""
"""
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
"""
if fileids is None:
fileids = self._fileids
- return concat(
- [
- self.CorpusView(fileid, self._read_review_block, encoding=enc)
- for (fileid, enc) in self.abspaths(fileids, True)
- ]
- )
+ return concat([self.CorpusView(fileid, self._read_review_block, encoding=enc)
+ for (fileid, enc) in self.abspaths(fileids, True)])
def sents(self, fileids=None):
"""
list of word strings.
:rtype: list(list(str))
"""
- return concat(
- [
- self.CorpusView(path, self._read_sent_block, encoding=enc)
- for (path, enc, fileid) in self.abspaths(fileids, True, True)
- ]
- )
+ return concat([self.CorpusView(path, self._read_sent_block, encoding=enc)
+ for (path, enc, fileid)
+ in self.abspaths(fileids, True, True)])
def words(self, fileids=None):
"""
:return: the given file(s) as a list of words and punctuation symbols.
:rtype: list(str)
"""
- return concat(
- [
- self.CorpusView(path, self._read_word_block, encoding=enc)
- for (path, enc, fileid) in self.abspaths(fileids, True, True)
- ]
- )
+ return concat([self.CorpusView(path, self._read_word_block, encoding=enc)
+ for (path, enc, fileid)
+ in self.abspaths(fileids, True, True)])
def _read_features(self, stream):
features = []
while True:
line = stream.readline()
if not line:
- return [] # end of file.
+ return [] # end of file.
title_match = re.match(TITLE, line)
if title_match:
- review = Review(
- title=title_match.group(1).strip()
- ) # We create a new review
+ review = Review(title=title_match.group(1).strip()) # We create a new review
break
# Scan until we find another line matching the regexp, or EOF.
def _read_word_block(self, stream):
words = []
- for i in range(20): # Read 20 lines at a time.
+ for i in range(20): # Read 20 lines at a time.
line = stream.readline()
sent = re.findall(SENT, line)
if sent:
# Natural Language Toolkit: RTE Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
file, taking values 1, 2 or 3. The GID is formatted 'm-n', where 'm' is the
challenge number and 'n' is the pair ID.
"""
+from __future__ import unicode_literals
+
+from six import string_types
+
+from nltk import compat
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
from nltk.corpus.reader.xmldocs import *
:rtype: int
"""
- valdict = {"TRUE": 1, "FALSE": 0, "YES": 1, "NO": 0}
+ valdict = {"TRUE": 1,
+ "FALSE": 0,
+ "YES": 1,
+ "NO": 0}
return valdict[value_string.upper()]
-
+@compat.python_2_unicode_compatible
class RTEPair(object):
"""
Container for RTE text-hypothesis pairs.
``entailment`` in RTE2 and RTE3. These both get mapped on to the ``entailment``
attribute of this class.
"""
-
- def __init__(
- self,
- pair,
- challenge=None,
- id=None,
- text=None,
- hyp=None,
- value=None,
- task=None,
- length=None,
- ):
+ def __init__(self, pair, challenge=None, id=None, text=None, hyp=None,
+ value=None, task=None, length=None):
"""
:param challenge: version of the RTE challenge (i.e., RTE1, RTE2 or RTE3)
:param id: identifier for the pair
:param task: attribute for the particular NLP task that the data was drawn from
:param length: attribute for the length of the text of the pair
"""
- self.challenge = challenge
+ self.challenge = challenge
self.id = pair.attrib["id"]
self.gid = "%s-%s" % (self.challenge, self.id)
self.text = pair[0].text
def __repr__(self):
if self.challenge:
- return "<RTEPair: gid=%s-%s>" % (self.challenge, self.id)
+ return '<RTEPair: gid=%s-%s>' % (self.challenge, self.id)
else:
- return "<RTEPair: id=%s>" % self.id
+ return '<RTEPair: id=%s>' % self.id
class RTECorpusReader(XMLCorpusReader):
:rtype: list(RTEPair)
"""
try:
- challenge = doc.attrib["challenge"]
+ challenge = doc.attrib['challenge']
except KeyError:
challenge = None
- return [RTEPair(pair, challenge=challenge) for pair in doc.getiterator("pair")]
+ return [RTEPair(pair, challenge=challenge)
+ for pair in doc.getiterator("pair")]
+
def pairs(self, fileids):
"""
:type: list
:rtype: list(RTEPair)
"""
- if isinstance(fileids, str):
- fileids = [fileids]
+ if isinstance(fileids, string_types): fileids = [fileids]
return concat([self._read_etree(self.xml(fileid)) for fileid in fileids])
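# Editor's note: a usage sketch, assuming the 'rte' data package whose
# fileids include 'rte1_dev.xml', 'rte2_dev.xml', and so on:

from nltk.corpus import rte

pair = rte.pairs(['rte1_dev.xml'])[0]
print(pair.gid, pair.value)   # challenge-id pair and 0/1 entailment value
print(pair.text[:60])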
# Natural Language Toolkit: SemCor Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Nathan Schneider <nschneid@cs.cmu.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Corpus reader for the SemCor Corpus.
"""
-
-__docformat__ = "epytext en"
+from __future__ import absolute_import, unicode_literals
+__docformat__ = 'epytext en'
from nltk.corpus.reader.api import *
from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView
from nltk.tree import Tree
-
class SemcorCorpusReader(XMLCorpusReader):
"""
Corpus reader for the SemCor Corpus.
method. For access to simple word lists and tagged word lists, use
``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``.
"""
-
def __init__(self, root, fileids, wordnet, lazy=True):
XMLCorpusReader.__init__(self, root, fileids)
self._lazy = lazy
:return: the given file(s) as a list of words and punctuation symbols.
:rtype: list(str)
"""
- return self._items(fileids, "word", False, False, False)
+ return self._items(fileids, 'word', False, False, False)
def chunks(self, fileids=None):
"""
that form a unit.
:rtype: list(list(str))
"""
- return self._items(fileids, "chunk", False, False, False)
+ return self._items(fileids, 'chunk', False, False, False)
- def tagged_chunks(self, fileids=None, tag=("pos" or "sem" or "both")):
+ def tagged_chunks(self, fileids=None, tag=('pos' or 'sem' or 'both')):
"""
:return: the given file(s) as a list of tagged chunks, represented
in tree form.
have no lemma. Other chunks not in WordNet have no semantic tag.
Punctuation tokens have `None` for their part of speech tag.)
"""
- return self._items(fileids, "chunk", False, tag != "sem", tag != "pos")
+ return self._items(fileids, 'chunk', False, tag!='sem', tag!='pos')
def sents(self, fileids=None):
"""
as a list of word strings.
:rtype: list(list(str))
"""
- return self._items(fileids, "word", True, False, False)
+ return self._items(fileids, 'word', True, False, False)
def chunk_sents(self, fileids=None):
"""
as a list of chunks.
:rtype: list(list(list(str)))
"""
- return self._items(fileids, "chunk", True, False, False)
+ return self._items(fileids, 'chunk', True, False, False)
- def tagged_sents(self, fileids=None, tag=("pos" or "sem" or "both")):
+ def tagged_sents(self, fileids=None, tag=('pos' or 'sem' or 'both')):
"""
:return: the given file(s) as a list of sentences. Each sentence
is represented as a list of tagged chunks (in tree form).
have no lemma. Other chunks not in WordNet have no semantic tag.
Punctuation tokens have `None` for their part of speech tag.)
"""
- return self._items(fileids, "chunk", True, tag != "sem", tag != "pos")
+ return self._items(fileids, 'chunk', True, tag!='sem', tag!='pos')
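# Editor's note: a usage sketch, assuming the 'semcor' and 'wordnet' data
# packages are installed:

from nltk.corpus import semcor

print(semcor.words()[:7])
# With tag='sem', chunks come back as Trees labelled with WordNet senses.
print(semcor.tagged_chunks(tag='sem')[:3])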
def _items(self, fileids, unit, bracket_sent, pos_tag, sem_tag):
- if unit == "word" and not bracket_sent:
+ if unit=='word' and not bracket_sent:
# the result of the SemcorWordView may be a multiword unit, so the
# LazyConcatenation will make sure the sentence is flattened
- _ = lambda *args: LazyConcatenation(
- (SemcorWordView if self._lazy else self._words)(*args)
- )
+ _ = lambda *args: LazyConcatenation((SemcorWordView if self._lazy else self._words)(*args))
else:
_ = SemcorWordView if self._lazy else self._words
- return concat(
- [
- _(fileid, unit, bracket_sent, pos_tag, sem_tag, self._wordnet)
- for fileid in self.abspaths(fileids)
- ]
- )
+ return concat([_(fileid, unit, bracket_sent, pos_tag, sem_tag, self._wordnet)
+ for fileid in self.abspaths(fileids)])
def _words(self, fileid, unit, bracket_sent, pos_tag, sem_tag):
"""
:param sem_tag: Whether to include semantic tags, namely WordNet lemma
and OOV named entity status.
"""
- assert unit in ("token", "word", "chunk")
+ assert unit in ('token', 'word', 'chunk')
result = []
xmldoc = ElementTree.parse(fileid).getroot()
- for xmlsent in xmldoc.findall(".//s"):
+ for xmlsent in xmldoc.findall('.//s'):
sent = []
for xmlword in _all_xmlwords_in(xmlsent):
- itm = SemcorCorpusReader._word(
- xmlword, unit, pos_tag, sem_tag, self._wordnet
- )
- if unit == "word":
+ itm = SemcorCorpusReader._word(xmlword, unit, pos_tag, sem_tag, self._wordnet)
+ if unit=='word':
sent.extend(itm)
else:
sent.append(itm)
if bracket_sent:
- result.append(SemcorSentence(xmlsent.attrib["snum"], sent))
+ result.append(SemcorSentence(xmlsent.attrib['snum'], sent))
else:
result.extend(sent)
def _word(xmlword, unit, pos_tag, sem_tag, wordnet):
tkn = xmlword.text
if not tkn:
- tkn = "" # fixes issue 337?
+ tkn = "" # fixes issue 337?
- lemma = xmlword.get("lemma", tkn) # lemma or NE class
- lexsn = xmlword.get("lexsn") # lex_sense (locator for the lemma's sense)
+ lemma = xmlword.get('lemma', tkn) # lemma or NE class
+ lexsn = xmlword.get('lexsn') # lex_sense (locator for the lemma's sense)
if lexsn is not None:
- sense_key = lemma + "%" + lexsn
- wnpos = ("n", "v", "a", "r", "s")[
- int(lexsn.split(":")[0]) - 1
- ] # see http://wordnet.princeton.edu/man/senseidx.5WN.html
+ sense_key = lemma + '%' + lexsn
+ wnpos = ('n','v','a','r','s')[int(lexsn.split(':')[0])-1] # see http://wordnet.princeton.edu/man/senseidx.5WN.html
else:
sense_key = wnpos = None
- redef = xmlword.get(
- "rdf", tkn
- ) # redefinition--this indicates the lookup string
+ redef = xmlword.get('rdf', tkn) # redefinition--this indicates the lookup string
# does not exactly match the enclosed string, e.g. due to typographical adjustments
# or discontinuity of a multiword expression. If a redefinition has occurred,
# the "rdf" attribute holds its inflected form and "lemma" holds its lemma.
# For NEs, "rdf", "lemma", and "pn" all hold the same value (the NE class).
- sensenum = xmlword.get("wnsn") # WordNet sense number
- isOOVEntity = "pn" in xmlword.keys() # a "personal name" (NE) not in WordNet
- pos = xmlword.get(
- "pos"
- ) # part of speech for the whole chunk (None for punctuation)
+ sensenum = xmlword.get('wnsn') # WordNet sense number
+ isOOVEntity = 'pn' in xmlword.keys() # a "personal name" (NE) not in WordNet
+ pos = xmlword.get('pos') # part of speech for the whole chunk (None for punctuation)
- if unit == "token":
+ if unit=='token':
if not pos_tag and not sem_tag:
itm = tkn
else:
- itm = (
- (tkn,)
- + ((pos,) if pos_tag else ())
- + ((lemma, wnpos, sensenum, isOOVEntity) if sem_tag else ())
- )
+ itm = (tkn,) + ((pos,) if pos_tag else ()) + ((lemma, wnpos, sensenum, isOOVEntity) if sem_tag else ())
return itm
else:
- ww = tkn.split("_") # TODO: case where punctuation intervenes in MWE
- if unit == "word":
+ ww = tkn.split('_') # TODO: case where punctuation intervenes in MWE
+ if unit=='word':
return ww
else:
if sensenum is not None:
try:
- sense = wordnet.lemma_from_key(sense_key) # Lemma object
+ sense = wordnet.lemma_from_key(sense_key) # Lemma object
except Exception:
# cannot retrieve the wordnet.Lemma object. possible reasons:
# (a) the wordnet corpus is not downloaded;
- # (b) a nonexistant sense is annotated: e.g., such.s.00 triggers:
+ # (b) a nonexistent sense is annotated: e.g., such.s.00 triggers:
# nltk.corpus.reader.wordnet.WordNetError: No synset found for key u'such%5:00:01:specified:00'
# solution: just use the lemma name as a string
try:
- sense = "%s.%s.%02d" % (
- lemma,
- wnpos,
- int(sensenum),
- ) # e.g.: reach.v.02
+ sense = '%s.%s.%02d' % (lemma, wnpos, int(sensenum)) # e.g.: reach.v.02
except ValueError:
- sense = (
- lemma + "." + wnpos + "." + sensenum
- ) # e.g. the sense number may be "2;1"
+ sense = lemma+'.'+wnpos+'.'+sensenum # e.g. the sense number may be "2;1"
bottom = [Tree(pos, ww)] if pos_tag else ww
if sem_tag and isOOVEntity:
if sensenum is not None:
- return Tree(sense, [Tree("NE", bottom)])
- else: # 'other' NE
- return Tree("NE", bottom)
+ return Tree(sense, [Tree('NE', bottom)])
+ else: # 'other' NE
+ return Tree('NE', bottom)
elif sem_tag and sensenum is not None:
return Tree(sense, bottom)
elif pos_tag:
return bottom[0]
else:
- return bottom # chunk as a list
-
+ return bottom # chunk as a list
def _all_xmlwords_in(elt, result=None):
- if result is None:
- result = []
+ if result is None: result = []
for child in elt:
- if child.tag in ("wf", "punc"):
- result.append(child)
- else:
- _all_xmlwords_in(child, result)
+ if child.tag in ('wf', 'punc'): result.append(child)
+ else: _all_xmlwords_in(child, result)
return result
-
class SemcorSentence(list):
"""
A list of words, augmented by an attribute ``num`` used to record
the sentence identifier (the ``n`` attribute from the XML).
"""
-
def __init__(self, num, items):
self.num = num
list.__init__(self, items)
-
class SemcorWordView(XMLCorpusView):
"""
A stream-backed corpus view specialized for use with the SemCor corpus.
"""
-
def __init__(self, fileid, unit, bracket_sent, pos_tag, sem_tag, wordnet):
"""
:param fileid: The name of the underlying file.
:param sem_tag: Whether to include semantic tags, namely WordNet lemma
and OOV named entity status.
"""
- if bracket_sent:
- tagspec = ".*/s"
- else:
- tagspec = ".*/s/(punc|wf)"
+ if bracket_sent: tagspec = '.*/s'
+ else: tagspec = '.*/s/(punc|wf)'
self._unit = unit
self._sent = bracket_sent
XMLCorpusView.__init__(self, fileid, tagspec)
def handle_elt(self, elt, context):
- if self._sent:
- return self.handle_sent(elt)
- else:
- return self.handle_word(elt)
+ if self._sent: return self.handle_sent(elt)
+ else: return self.handle_word(elt)
def handle_word(self, elt):
- return SemcorCorpusReader._word(
- elt, self._unit, self._pos_tag, self._sem_tag, self._wordnet
- )
+ return SemcorCorpusReader._word(elt, self._unit, self._pos_tag, self._sem_tag, self._wordnet)
def handle_sent(self, elt):
sent = []
for child in elt:
- if child.tag in ("wf", "punc"):
+ if child.tag in ('wf','punc'):
itm = self.handle_word(child)
- if self._unit == "word":
+ if self._unit=='word':
sent.extend(itm)
else:
sent.append(itm)
else:
- raise ValueError("Unexpected element %s" % child.tag)
- return SemcorSentence(elt.attrib["snum"], sent)
+ raise ValueError('Unexpected element %s' % child.tag)
+ return SemcorSentence(elt.attrib['snum'], sent)
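For orientation, the sense keys assembled above (lemma + '%' + lexsn) feed
straight into WordNet. A minimal sketch, assuming the wordnet corpus is
installed; the key shown is illustrative and resolves only if present in
the installed WordNet version:

    from nltk.corpus import wordnet as wn
    # the digit before the first ':' of the lexsn encodes the POS; it is
    # what _word() maps through ('n', 'v', 'a', 'r', 's')
    lem = wn.lemma_from_key('live%2:42:06::')
    lem.synset()    # e.g. Synset('exist.v.01')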
# Natural Language Toolkit: Senseval 2 Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# Steven Bird <stevenbird1@gmail.com> (modifications)
# URL: <http://nltk.org/>
Each instance of the ambiguous words "hard", "interest", "line", and "serve"
is tagged with a sense identifier, and supplied with context.
"""
+from __future__ import print_function, unicode_literals
+
+from six import string_types
import re
from xml.etree import ElementTree
+from nltk import compat
from nltk.tokenize import *
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
-
+@compat.python_2_unicode_compatible
class SensevalInstance(object):
def __init__(self, word, position, context, senses):
self.word = word
self.context = context
def __repr__(self):
- return "SensevalInstance(word=%r, position=%r, " "context=%r, senses=%r)" % (
- self.word,
- self.position,
- self.context,
- self.senses,
- )
+ return ('SensevalInstance(word=%r, position=%r, '
+ 'context=%r, senses=%r)' %
+ (self.word, self.position, self.context, self.senses))
class SensevalCorpusReader(CorpusReader):
def instances(self, fileids=None):
- return concat(
- [
- SensevalCorpusView(fileid, enc)
- for (fileid, enc) in self.abspaths(fileids, True)
- ]
- )
+ return concat([SensevalCorpusView(fileid, enc)
+ for (fileid, enc) in self.abspaths(fileids, True)])
def raw(self, fileids=None):
"""
:return: the text contents of the given fileids, as a single string.
"""
- if fileids is None:
- fileids = self._fileids
- elif isinstance(fileids, str):
- fileids = [fileids]
+ if fileids is None: fileids = self._fileids
+ elif isinstance(fileids, string_types): fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
def _entry(self, tree):
elts = []
- for lexelt in tree.findall("lexelt"):
- for inst in lexelt.findall("instance"):
- sense = inst[0].attrib["senseid"]
- context = [(w.text, w.attrib["pos"]) for w in inst[1]]
- elts.append((sense, context))
+ for lexelt in tree.findall('lexelt'):
+ for inst in lexelt.findall('instance'):
+ sense = inst[0].attrib['senseid']
+ context = [(w.text, w.attrib['pos'])
+ for w in inst[1]]
+ elts.append( (sense, context) )
return elts
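A minimal usage sketch of the reader, assuming the senseval corpus has been
downloaded (output abridged):

    from nltk.corpus import senseval
    inst = senseval.instances('hard.pos')[0]
    inst.word      # 'hard-a'
    inst.senses    # e.g. ('HARD1',)
    inst.context   # [(token, pos), ...] with the head word at inst.position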
StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)
self._word_tokenizer = WhitespaceTokenizer()
- self._lexelt_starts = [0] # list of streampos
- self._lexelts = [None] # list of lexelt names
+ self._lexelt_starts = [0] # list of streampos
+ self._lexelts = [None] # list of lexelt names
def read_block(self, stream):
# Decide which lexical element we're in.
- lexelt_num = bisect.bisect_right(self._lexelt_starts, stream.tell()) - 1
+ lexelt_num = bisect.bisect_right(self._lexelt_starts, stream.tell())-1
lexelt = self._lexelts[lexelt_num]
instance_lines = []
in_instance = False
while True:
line = stream.readline()
- if line == "":
+ if line == '':
assert instance_lines == []
return []
# Start of a lexical element?
- if line.lstrip().startswith("<lexelt"):
+ if line.lstrip().startswith('<lexelt'):
lexelt_num += 1
- m = re.search("item=(\"[^\"]+\"|'[^']+')", line)
- assert m is not None # <lexelt> has no 'item=...'
+ m = re.search('item=("[^"]+"|\'[^\']+\')', line)
+ assert m is not None # <lexelt> has no 'item=...'
lexelt = m.group(1)[1:-1]
if lexelt_num < len(self._lexelts):
assert lexelt == self._lexelts[lexelt_num]
self._lexelt_starts.append(stream.tell())
# Start of an instance?
- if line.lstrip().startswith("<instance"):
+ if line.lstrip().startswith('<instance'):
assert instance_lines == []
in_instance = True
instance_lines.append(line)
# End of an instance?
- if line.lstrip().startswith("</instance"):
- xml_block = "\n".join(instance_lines)
+ if line.lstrip().startswith('</instance'):
+ xml_block = '\n'.join(instance_lines)
xml_block = _fixXML(xml_block)
inst = ElementTree.fromstring(xml_block)
return [self._parse_instance(inst, lexelt)]
context = []
position = None
for child in instance:
- if child.tag == "answer":
- senses.append(child.attrib["senseid"])
- elif child.tag == "context":
+ if child.tag == 'answer':
+ senses.append(child.attrib['senseid'])
+ elif child.tag == 'context':
context += self._word_tokenizer.tokenize(child.text)
for cword in child:
- if cword.tag == "compound":
- cword = cword[0] # is this ok to do?
+ if cword.tag == 'compound':
+ cword = cword[0] # is this ok to do?
- if cword.tag == "head":
+ if cword.tag == 'head':
# Some sanity checks:
- assert position is None, "head specified twice"
- assert cword.text.strip() or len(cword) == 1
- assert not (cword.text.strip() and len(cword) == 1)
+ assert position is None, 'head specified twice'
+ assert cword.text.strip() or len(cword)==1
+ assert not (cword.text.strip() and len(cword)==1)
# Record the position of the head:
position = len(context)
# Add on the head word itself:
if cword.text.strip():
context.append(cword.text.strip())
- elif cword[0].tag == "wf":
- context.append((cword[0].text, cword[0].attrib["pos"]))
+ elif cword[0].tag == 'wf':
+ context.append((cword[0].text,
+ cword[0].attrib['pos']))
if cword[0].tail:
- context += self._word_tokenizer.tokenize(cword[0].tail)
+ context += self._word_tokenizer.tokenize(
+ cword[0].tail)
else:
- assert False, "expected CDATA or wf in <head>"
- elif cword.tag == "wf":
- context.append((cword.text, cword.attrib["pos"]))
- elif cword.tag == "s":
- pass # Sentence boundary marker.
+ assert False, 'expected CDATA or wf in <head>'
+ elif cword.tag == 'wf':
+ context.append((cword.text, cword.attrib['pos']))
+ elif cword.tag == 's':
+ pass # Sentence boundary marker.
else:
- print("ACK", cword.tag)
- assert False, "expected CDATA or <wf> or <head>"
+ print('ACK', cword.tag)
+ assert False, 'expected CDATA or <wf> or <head>'
if cword.tail:
context += self._word_tokenizer.tokenize(cword.tail)
else:
- assert False, "unexpected tag %s" % child.tag
+ assert False, 'unexpected tag %s' % child.tag
return SensevalInstance(lexelt, position, context, senses)
-
def _fixXML(text):
"""
Fix the various issues with Senseval pseudo-XML.
"""
# <~> or <^> => ~ or ^
- text = re.sub(r"<([~\^])>", r"\1", text)
+ text = re.sub(r'<([~\^])>', r'\1', text)
# fix lone &
- text = re.sub(r"(\s+)\&(\s+)", r"\1&\2", text)
+ text = re.sub(r'(\s+)\&(\s+)', r'\1&\2', text)
# fix """
- text = re.sub(r'"""', "'\"'", text)
+ text = re.sub(r'"""', '\'"\'', text)
# fix <s snum=dd> => <s snum="dd"/>
text = re.sub(r'(<[^<]*snum=)([^">]+)>', r'\1"\2"/>', text)
# fix foreign word tag
- text = re.sub(r"<\&frasl>\s*<p[^>]*>", "FRASL", text)
+ text = re.sub(r'<\&frasl>\s*<p[^>]*>', 'FRASL', text)
# remove <&I .>
- text = re.sub(r"<\&I[^>]*>", "", text)
+ text = re.sub(r'<\&I[^>]*>', '', text)
# fix <{word}>
- text = re.sub(r"<{([^}]+)}>", r"\1", text)
+ text = re.sub(r'<{([^}]+)}>', r'\1', text)
# remove <@>, <p>, </p>
- text = re.sub(r"<(@|/?p)>", r"", text)
+ text = re.sub(r'<(@|/?p)>', r'', text)
# remove <&M .> and <&T .> and <&Ms .>
- text = re.sub(r"<&\w+ \.>", r"", text)
+ text = re.sub(r'<&\w+ \.>', r'', text)
# remove <!DOCTYPE... > lines
- text = re.sub(r"<!DOCTYPE[^>]*>", r"", text)
+ text = re.sub(r'<!DOCTYPE[^>]*>', r'', text)
# remove <[hi]> and <[/p]> etc
- text = re.sub(r"<\[\/?[^>]+\]*>", r"", text)
+ text = re.sub(r'<\[\/?[^>]+\]*>', r'', text)
# take the thing out of the brackets: <&hellip;>
- text = re.sub(r"<(\&\w+;)>", r"\1", text)
+ text = re.sub(r'<(\&\w+;)>', r'\1', text)
# and remove the & for those patterns that aren't regular XML
- text = re.sub(r"&(?!amp|gt|lt|apos|quot)", r"", text)
+ text = re.sub(r'&(?!amp|gt|lt|apos|quot)', r'', text)
# fix 'abc <p="foo"/>' style tags - now <wf pos="foo">abc</wf>
- text = re.sub(
- r'[ \t]*([^<>\s]+?)[ \t]*<p="([^"]*"?)"/>', r' <wf pos="\2">\1</wf>', text
- )
+ text = re.sub(r'[ \t]*([^<>\s]+?)[ \t]*<p="([^"]*"?)"/>',
+ r' <wf pos="\2">\1</wf>', text)
text = re.sub(r'\s*"\s*<p=\'"\'/>', " <wf pos='\"'>\"</wf>", text)
return text
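A small worked example of the clean-up above; the input line is invented,
not taken from the corpus:

    raw = 'the <{whole}> nine yards & more'
    _fixXML(raw)    # -> 'the whole nine yards &amp; more'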
# -*- coding: utf-8 -*-
# Natural Language Toolkit: SentiWordNet
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Christopher Potts <cgpotts@stanford.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
import re
-
+from nltk.compat import python_2_unicode_compatible
from nltk.corpus.reader import CorpusReader
-
+@python_2_unicode_compatible
class SentiWordNetCorpusReader(CorpusReader):
- def __init__(self, root, fileids, encoding="utf-8"):
+ def __init__(self, root, fileids, encoding='utf-8'):
"""
Construct a new SentiWordNet Corpus Reader, using data from
the specified file.
- """
- super(SentiWordNetCorpusReader, self).__init__(root, fileids, encoding=encoding)
+ """
+ super(SentiWordNetCorpusReader, self).__init__(root, fileids,
+ encoding=encoding)
if len(self._fileids) != 1:
- raise ValueError("Exactly one file must be specified")
+ raise ValueError('Exactly one file must be specified')
self._db = {}
self._parse_src_file()
def _parse_src_file(self):
lines = self.open(self._fileids[0]).read().splitlines()
- lines = filter((lambda x: not re.search(r"^\s*#", x)), lines)
+ lines = filter((lambda x : not re.search(r"^\s*#", x)), lines)
for i, line in enumerate(lines):
fields = [field.strip() for field in re.split(r"\t+", line)]
- try:
+ try:
pos, offset, pos_score, neg_score, synset_terms, gloss = fields
except:
- raise ValueError("Line %s formatted incorrectly: %s\n" % (i, line))
+ raise ValueError('Line %s formatted incorrectly: %s\n' % (i, line))
if pos and offset:
offset = int(offset)
self._db[(pos, offset)] = (float(pos_score), float(neg_score))
- def senti_synset(self, *vals):
+ def senti_synset(self, *vals):
from nltk.corpus import wordnet as wn
-
if tuple(vals) in self._db:
pos_score, neg_score = self._db[tuple(vals)]
pos, offset = vals
- if pos == "s":
- pos = "a"
- synset = wn.synset_from_pos_and_offset(pos, offset)
+ if pos == 's':
+ pos = 'a'
+ synset = wn._synset_from_pos_and_offset(pos, offset)
return SentiSynset(pos_score, neg_score, synset)
else:
synset = wn.synset(vals[0])
pos = synset.pos()
- if pos == "s":
- pos = "a"
+ if pos == 's':
+ pos = 'a'
offset = synset.offset()
if (pos, offset) in self._db:
pos_score, neg_score = self._db[(pos, offset)]
def senti_synsets(self, string, pos=None):
from nltk.corpus import wordnet as wn
-
sentis = []
synset_list = wn.synsets(string, pos)
for synset in synset_list:
sentis.append(self.senti_synset(synset.name()))
- sentis = filter(lambda x: x, sentis)
+ sentis = filter(lambda x : x, sentis)
return sentis
def all_senti_synsets(self):
from nltk.corpus import wordnet as wn
-
for key, fields in self._db.items():
pos, offset = key
pos_score, neg_score = fields
- synset = wn.synset_from_pos_and_offset(pos, offset)
+ synset = wn._synset_from_pos_and_offset(pos, offset)
yield SentiSynset(pos_score, neg_score, synset)
+@python_2_unicode_compatible
class SentiSynset(object):
def __init__(self, pos_score, neg_score, synset):
self._pos_score = pos_score
def __repr__(self):
return "Senti" + repr(self.synset)
+
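A minimal usage sketch, assuming the sentiwordnet and wordnet corpora are
installed (scores as shipped in the standard SentiWordNet file):

    from nltk.corpus import sentiwordnet as swn
    bd = swn.senti_synset('breakdown.n.03')
    bd.pos_score(), bd.neg_score()    # (0.0, 0.25)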
# Natural Language Toolkit: Sinica Treebank Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
Language and Knowledge Processing Group, Institute of Information
Science, Academia Sinica
-The data is distributed with the Natural Language Toolkit under the terms of
+It is distributed with the Natural Language Toolkit under the terms of
the Creative Commons Attribution-NonCommercial-ShareAlike License
[http://creativecommons.org/licenses/by-nc-sa/2.5/].
Extraction, Proceedings of IJCNLP-04, pp560-565.
"""
+import os
+import re
+
from nltk.tree import sinica_parse
from nltk.tag import map_tag
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
-IDENTIFIER = re.compile(r"^#\S+\s")
-APPENDIX = re.compile(r"(?<=\))#.*$")
-TAGWORD = re.compile(r":([^:()|]+):([^:()|]+)")
-WORD = re.compile(r":[^:()|]+:([^:()|]+)")
-
+IDENTIFIER = re.compile(r'^#\S+\s')
+APPENDIX = re.compile(r'(?<=\))#.*$')
+TAGWORD = re.compile(r':([^:()|]+):([^:()|]+)')
+WORD = re.compile(r':[^:()|]+:([^:()|]+)')
class SinicaTreebankCorpusReader(SyntaxCorpusReader):
"""
Reader for the sinica treebank.
"""
-
def _read_block(self, stream):
sent = stream.readline()
- sent = IDENTIFIER.sub("", sent)
- sent = APPENDIX.sub("", sent)
+ sent = IDENTIFIER.sub('', sent)
+ sent = APPENDIX.sub('', sent)
return [sent]
def _parse(self, sent):
return sinica_parse(sent)
def _tag(self, sent, tagset=None):
- tagged_sent = [(w, t) for (t, w) in TAGWORD.findall(sent)]
+ tagged_sent = [(w,t) for (t,w) in TAGWORD.findall(sent)]
if tagset and tagset != self._tagset:
- tagged_sent = [
- (w, map_tag(self._tagset, tagset, t)) for (w, t) in tagged_sent
- ]
+ tagged_sent = [(w, map_tag(self._tagset, tagset, t)) for (w,t) in tagged_sent]
return tagged_sent
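To illustrate the TAGWORD pattern on an invented string in the Sinica
bracket format:

    TAGWORD.findall('S(agent:NP(Head:Nhaa:我)|Head:VE2:希望)')
    # -> [('Nhaa', '我'), ('VE2', '希望')]; _tag() flips each pair to (word, tag)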
def _word(self, sent):
# Natural Language Toolkit: String Category Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
"""
# based on PPAttachmentCorpusReader
+from six import string_types
+
+from nltk import compat
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
# in nltk, we use the form (data, tag) -- e.g., tagged words and
# labeled texts for classifiers.
class StringCategoryCorpusReader(CorpusReader):
- def __init__(self, root, fileids, delimiter=" ", encoding="utf8"):
+ def __init__(self, root, fileids, delimiter=' ', encoding='utf8'):
"""
:param root: The root directory for this corpus.
:param fileids: A list or regexp specifying the fileids in this corpus.
self._delimiter = delimiter
def tuples(self, fileids=None):
- if fileids is None:
- fileids = self._fileids
- elif isinstance(fileids, str):
- fileids = [fileids]
- return concat(
- [
- StreamBackedCorpusView(fileid, self._read_tuple_block, encoding=enc)
- for (fileid, enc) in self.abspaths(fileids, True)
- ]
- )
+ if fileids is None: fileids = self._fileids
+ elif isinstance(fileids, string_types): fileids = [fileids]
+ return concat([StreamBackedCorpusView(fileid, self._read_tuple_block,
+ encoding=enc)
+ for (fileid, enc) in self.abspaths(fileids, True)])
def raw(self, fileids=None):
"""
:return: the text contents of the given fileids, as a single string.
"""
- if fileids is None:
- fileids = self._fileids
- elif isinstance(fileids, str):
- fileids = [fileids]
+ if fileids is None: fileids = self._fileids
+ elif isinstance(fileids, string_types): fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
def _read_tuple_block(self, stream):
# Natural Language Toolkit: Switchboard Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import unicode_literals
import re
from nltk.tag import str2tuple, map_tag
+from nltk import compat
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
+@compat.python_2_unicode_compatible
class SwitchboardTurn(list):
"""
A specialized list object used to encode switchboard utterances.
speaker identifier and utterance id. Note that utterance ids
are only unique within a given discourse.
"""
-
def __init__(self, words, speaker, id):
list.__init__(self, words)
self.speaker = speaker
def __repr__(self):
if len(self) == 0:
- text = ""
+ text = ''
elif isinstance(self[0], tuple):
- text = " ".join("%s/%s" % w for w in self)
+ text = ' '.join('%s/%s' % w for w in self)
else:
- text = " ".join(self)
- return "<%s.%s: %r>" % (self.speaker, self.id, text)
+ text = ' '.join(self)
+ return '<%s.%s: %r>' % (self.speaker, self.id, text)
class SwitchboardCorpusReader(CorpusReader):
- _FILES = ["tagged"]
+ _FILES = ['tagged']
# Use the "tagged" file even for non-tagged data methods, since
# it's tokenized.
self._tagset = tagset
def words(self):
- return StreamBackedCorpusView(self.abspath("tagged"), self._words_block_reader)
+ return StreamBackedCorpusView(self.abspath('tagged'),
+ self._words_block_reader)
def tagged_words(self, tagset=None):
def tagged_words_block_reader(stream):
return self._tagged_words_block_reader(stream, tagset)
-
- return StreamBackedCorpusView(self.abspath("tagged"), tagged_words_block_reader)
+ return StreamBackedCorpusView(self.abspath('tagged'),
+ tagged_words_block_reader)
def turns(self):
- return StreamBackedCorpusView(self.abspath("tagged"), self._turns_block_reader)
+ return StreamBackedCorpusView(self.abspath('tagged'),
+ self._turns_block_reader)
def tagged_turns(self, tagset=None):
def tagged_turns_block_reader(stream):
return self._tagged_turns_block_reader(stream, tagset)
-
- return StreamBackedCorpusView(self.abspath("tagged"), tagged_turns_block_reader)
+ return StreamBackedCorpusView(self.abspath('tagged'),
+ tagged_turns_block_reader)
def discourses(self):
- return StreamBackedCorpusView(
- self.abspath("tagged"), self._discourses_block_reader
- )
+ return StreamBackedCorpusView(self.abspath('tagged'),
+ self._discourses_block_reader)
def tagged_discourses(self, tagset=False):
def tagged_discourses_block_reader(stream):
return self._tagged_discourses_block_reader(stream, tagset)
-
- return StreamBackedCorpusView(
- self.abspath("tagged"), tagged_discourses_block_reader
- )
+ return StreamBackedCorpusView(self.abspath('tagged'),
+ tagged_discourses_block_reader)
def _discourses_block_reader(self, stream):
# returns at most 1 discourse. (The other methods depend on this.)
- return [
- [
- self._parse_utterance(u, include_tag=False)
- for b in read_blankline_block(stream)
- for u in b.split("\n")
- if u.strip()
- ]
- ]
+ return [[self._parse_utterance(u, include_tag=False)
+ for b in read_blankline_block(stream)
+ for u in b.split('\n') if u.strip()]]
def _tagged_discourses_block_reader(self, stream, tagset=None):
# returns at most 1 discourse. (The other methods depend on this.)
- return [
- [
- self._parse_utterance(u, include_tag=True, tagset=tagset)
- for b in read_blankline_block(stream)
- for u in b.split("\n")
- if u.strip()
- ]
- ]
+ return [[self._parse_utterance(u, include_tag=True,
+ tagset=tagset)
+ for b in read_blankline_block(stream)
+ for u in b.split('\n') if u.strip()]]
def _turns_block_reader(self, stream):
return self._discourses_block_reader(stream)[0]
return sum(self._discourses_block_reader(stream)[0], [])
def _tagged_words_block_reader(self, stream, tagset=None):
- return sum(self._tagged_discourses_block_reader(stream, tagset)[0], [])
-
- _UTTERANCE_RE = re.compile("(\w+)\.(\d+)\:\s*(.*)")
- _SEP = "/"
+ return sum(self._tagged_discourses_block_reader(stream,
+ tagset)[0], [])
+ _UTTERANCE_RE = re.compile('(\w+)\.(\d+)\:\s*(.*)')
+ _SEP = '/'
def _parse_utterance(self, utterance, include_tag, tagset=None):
m = self._UTTERANCE_RE.match(utterance)
if m is None:
- raise ValueError("Bad utterance %r" % utterance)
+ raise ValueError('Bad utterance %r' % utterance)
speaker, id, text = m.groups()
words = [str2tuple(s, self._SEP) for s in text.split()]
if not include_tag:
- words = [w for (w, t) in words]
+ words = [w for (w,t) in words]
elif tagset and tagset != self._tagset:
- words = [(w, map_tag(self._tagset, tagset, t)) for (w, t) in words]
+ words = [(w, map_tag(self._tagset, tagset, t)) for (w,t) in words]
return SwitchboardTurn(words, speaker, id)
+
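A minimal sketch of what _parse_utterance does to one invented line of the
tagged file:

    line = 'A.1: Uh/UH first/RB I/PRP need/VBP'
    # _UTTERANCE_RE peels off speaker 'A' and utterance id '1';
    # str2tuple(s, '/') then yields [('Uh', 'UH'), ('first', 'RB'), ...]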
# Natural Language Toolkit: Tagged Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# Jacob Perkins <japerk@gmail.com>
import os
+from six import string_types
+
from nltk.tag import str2tuple, map_tag
from nltk.tokenize import *
from nltk.corpus.reader.util import *
from nltk.corpus.reader.timit import read_timit_block
-
class TaggedCorpusReader(CorpusReader):
"""
Reader for simple part-of-speech tagged corpora. Paragraphs are
constructor. Part of speech tags are case-normalized to upper
case.
"""
-
- def __init__(
- self,
- root,
- fileids,
- sep="/",
- word_tokenizer=WhitespaceTokenizer(),
- sent_tokenizer=RegexpTokenizer("\n", gaps=True),
- para_block_reader=read_blankline_block,
- encoding="utf8",
- tagset=None,
- ):
+ def __init__(self, root, fileids,
+ sep='/', word_tokenizer=WhitespaceTokenizer(),
+ sent_tokenizer=RegexpTokenizer('\n', gaps=True),
+ para_block_reader=read_blankline_block,
+ encoding='utf8',
+ tagset=None):
"""
Construct a new Tagged Corpus reader for a set of documents
located at the given root directory. Example usage:
:return: the given file(s) as a single string.
:rtype: str
"""
- if fileids is None:
- fileids = self._fileids
- elif isinstance(fileids, str):
- fileids = [fileids]
+ if fileids is None: fileids = self._fileids
+ elif isinstance(fileids, string_types): fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
def words(self, fileids=None):
and punctuation symbols.
:rtype: list(str)
"""
- return concat(
- [
- TaggedCorpusView(
- fileid,
- enc,
- False,
- False,
- False,
- self._sep,
- self._word_tokenizer,
- self._sent_tokenizer,
- self._para_block_reader,
- None,
- )
- for (fileid, enc) in self.abspaths(fileids, True)
- ]
- )
+ return concat([TaggedCorpusView(fileid, enc,
+ False, False, False,
+ self._sep, self._word_tokenizer,
+ self._sent_tokenizer,
+ self._para_block_reader,
+ None)
+ for (fileid, enc) in self.abspaths(fileids, True)])
def sents(self, fileids=None):
"""
strings.
:rtype: list(list(str))
"""
- return concat(
- [
- TaggedCorpusView(
- fileid,
- enc,
- False,
- True,
- False,
- self._sep,
- self._word_tokenizer,
- self._sent_tokenizer,
- self._para_block_reader,
- None,
- )
- for (fileid, enc) in self.abspaths(fileids, True)
- ]
- )
+ return concat([TaggedCorpusView(fileid, enc,
+ False, True, False,
+ self._sep, self._word_tokenizer,
+ self._sent_tokenizer,
+ self._para_block_reader,
+ None)
+ for (fileid, enc) in self.abspaths(fileids, True)])
def paras(self, fileids=None):
"""
in turn encoded as lists of word strings.
:rtype: list(list(list(str)))
"""
- return concat(
- [
- TaggedCorpusView(
- fileid,
- enc,
- False,
- True,
- True,
- self._sep,
- self._word_tokenizer,
- self._sent_tokenizer,
- self._para_block_reader,
- None,
- )
- for (fileid, enc) in self.abspaths(fileids, True)
- ]
- )
+ return concat([TaggedCorpusView(fileid, enc,
+ False, True, True,
+ self._sep, self._word_tokenizer,
+ self._sent_tokenizer,
+ self._para_block_reader,
+ None)
+ for (fileid, enc) in self.abspaths(fileids, True)])
def tagged_words(self, fileids=None, tagset=None):
"""
tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t)
else:
tag_mapping_function = None
- return concat(
- [
- TaggedCorpusView(
- fileid,
- enc,
- True,
- False,
- False,
- self._sep,
- self._word_tokenizer,
- self._sent_tokenizer,
- self._para_block_reader,
- tag_mapping_function,
- )
- for (fileid, enc) in self.abspaths(fileids, True)
- ]
- )
+ return concat([TaggedCorpusView(fileid, enc,
+ True, False, False,
+ self._sep, self._word_tokenizer,
+ self._sent_tokenizer,
+ self._para_block_reader,
+ tag_mapping_function)
+ for (fileid, enc) in self.abspaths(fileids, True)])
def tagged_sents(self, fileids=None, tagset=None):
"""
tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t)
else:
tag_mapping_function = None
- return concat(
- [
- TaggedCorpusView(
- fileid,
- enc,
- True,
- True,
- False,
- self._sep,
- self._word_tokenizer,
- self._sent_tokenizer,
- self._para_block_reader,
- tag_mapping_function,
- )
- for (fileid, enc) in self.abspaths(fileids, True)
- ]
- )
+ return concat([TaggedCorpusView(fileid, enc,
+ True, True, False,
+ self._sep, self._word_tokenizer,
+ self._sent_tokenizer,
+ self._para_block_reader,
+ tag_mapping_function)
+ for (fileid, enc) in self.abspaths(fileids, True)])
def tagged_paras(self, fileids=None, tagset=None):
"""
tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t)
else:
tag_mapping_function = None
- return concat(
- [
- TaggedCorpusView(
- fileid,
- enc,
- True,
- True,
- True,
- self._sep,
- self._word_tokenizer,
- self._sent_tokenizer,
- self._para_block_reader,
- tag_mapping_function,
- )
- for (fileid, enc) in self.abspaths(fileids, True)
- ]
- )
-
-
-class CategorizedTaggedCorpusReader(CategorizedCorpusReader, TaggedCorpusReader):
+ return concat([TaggedCorpusView(fileid, enc,
+ True, True, True,
+ self._sep, self._word_tokenizer,
+ self._sent_tokenizer,
+ self._para_block_reader,
+ tag_mapping_function)
+ for (fileid, enc) in self.abspaths(fileids, True)])
+
+class CategorizedTaggedCorpusReader(CategorizedCorpusReader,
+ TaggedCorpusReader):
"""
A reader for part-of-speech tagged corpora whose documents are
divided into categories based on their file identifiers.
"""
-
def __init__(self, *args, **kwargs):
"""
Initialize the corpus reader. Categorization arguments
def _resolve(self, fileids, categories):
if fileids is not None and categories is not None:
- raise ValueError("Specify fileids or categories, not both")
+ raise ValueError('Specify fileids or categories, not both')
if categories is not None:
return self.fileids(categories)
else:
return fileids
-
def raw(self, fileids=None, categories=None):
- return TaggedCorpusReader.raw(self, self._resolve(fileids, categories))
-
+ return TaggedCorpusReader.raw(
+ self, self._resolve(fileids, categories))
def words(self, fileids=None, categories=None):
- return TaggedCorpusReader.words(self, self._resolve(fileids, categories))
-
+ return TaggedCorpusReader.words(
+ self, self._resolve(fileids, categories))
def sents(self, fileids=None, categories=None):
- return TaggedCorpusReader.sents(self, self._resolve(fileids, categories))
-
+ return TaggedCorpusReader.sents(
+ self, self._resolve(fileids, categories))
def paras(self, fileids=None, categories=None):
- return TaggedCorpusReader.paras(self, self._resolve(fileids, categories))
-
+ return TaggedCorpusReader.paras(
+ self, self._resolve(fileids, categories))
def tagged_words(self, fileids=None, categories=None, tagset=None):
return TaggedCorpusReader.tagged_words(
- self, self._resolve(fileids, categories), tagset
- )
-
+ self, self._resolve(fileids, categories), tagset)
def tagged_sents(self, fileids=None, categories=None, tagset=None):
return TaggedCorpusReader.tagged_sents(
- self, self._resolve(fileids, categories), tagset
- )
-
+ self, self._resolve(fileids, categories), tagset)
def tagged_paras(self, fileids=None, categories=None, tagset=None):
return TaggedCorpusReader.tagged_paras(
- self, self._resolve(fileids, categories), tagset
- )
-
+ self, self._resolve(fileids, categories), tagset)
class TaggedCorpusView(StreamBackedCorpusView):
"""
``TaggedCorpusView`` objects are typically created by
``TaggedCorpusReader`` (not directly by nltk users).
"""
-
- def __init__(
- self,
- corpus_file,
- encoding,
- tagged,
- group_by_sent,
- group_by_para,
- sep,
- word_tokenizer,
- sent_tokenizer,
- para_block_reader,
- tag_mapping_function=None,
- ):
+ def __init__(self, corpus_file, encoding, tagged, group_by_sent,
+ group_by_para, sep, word_tokenizer, sent_tokenizer,
+ para_block_reader, tag_mapping_function=None):
self._tagged = tagged
self._group_by_sent = group_by_sent
self._group_by_para = group_by_para
for para_str in self._para_block_reader(stream):
para = []
for sent_str in self._sent_tokenizer.tokenize(para_str):
- sent = [
- str2tuple(s, self._sep)
- for s in self._word_tokenizer.tokenize(sent_str)
- ]
+ sent = [str2tuple(s, self._sep) for s in
+ self._word_tokenizer.tokenize(sent_str)]
if self._tag_mapping_function:
- sent = [(w, self._tag_mapping_function(t)) for (w, t) in sent]
+ sent = [(w, self._tag_mapping_function(t)) for (w,t) in sent]
if not self._tagged:
- sent = [w for (w, t) in sent]
+ sent = [w for (w,t) in sent]
if self._group_by_sent:
para.append(sent)
else:
block.extend(para)
return block
-
# needs to implement simplified tags
class MacMorphoCorpusReader(TaggedCorpusReader):
"""
``self.paras()`` and ``self.tagged_paras()`` contains a single
sentence.
"""
-
- def __init__(self, root, fileids, encoding="utf8", tagset=None):
+ def __init__(self, root, fileids, encoding='utf8', tagset=None):
TaggedCorpusReader.__init__(
- self,
- root,
- fileids,
- sep="_",
+ self, root, fileids, sep='_',
word_tokenizer=LineTokenizer(),
- sent_tokenizer=RegexpTokenizer(".*\n"),
+ sent_tokenizer=RegexpTokenizer('.*\n'),
para_block_reader=self._read_block,
encoding=encoding,
- tagset=tagset,
- )
+ tagset=tagset)
def _read_block(self, stream):
- return read_regexp_block(stream, r".*", r".*_\.")
-
+ return read_regexp_block(stream, r'.*', r'.*_\.')
class TimitTaggedCorpusReader(TaggedCorpusReader):
"""
A corpus reader for tagged sentences that are included in the TIMIT corpus.
"""
-
def __init__(self, *args, **kwargs):
TaggedCorpusReader.__init__(
- self, para_block_reader=read_timit_block, *args, **kwargs
- )
+ self, para_block_reader=read_timit_block, *args, **kwargs)
def paras(self):
- raise NotImplementedError("use sents() instead")
+ raise NotImplementedError('use sents() instead')
def tagged_paras(self):
- raise NotImplementedError("use tagged_sents() instead")
+ raise NotImplementedError('use tagged_sents() instead')
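A minimal usage sketch of TaggedCorpusReader; the path and fileid pattern
are placeholders:

    from nltk.corpus.reader import TaggedCorpusReader
    reader = TaggedCorpusReader('/path/to/corpus', r'.*\.pos')
    reader.tagged_sents()[0]    # e.g. [('The', 'AT'), ('Fulton', 'NP-TL'), ...]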
timit.audiodata function.
"""
+from __future__ import print_function, unicode_literals
+
import sys
import os
import re
import tempfile
import time
+from six import string_types
+
+from nltk import compat
from nltk.tree import Tree
from nltk.internals import import_from_stdlib
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
-
class TimitCorpusReader(CorpusReader):
"""
Reader for the TIMIT corpus (or any other corpus with the same
- <utterance-id>.wav: utterance sound file
"""
- _FILE_RE = r"(\w+-\w+/\w+\.(phn|txt|wav|wrd))|" + r"timitdic\.txt|spkrinfo\.txt"
+ _FILE_RE = (r'(\w+-\w+/\w+\.(phn|txt|wav|wrd))|' +
+ r'timitdic\.txt|spkrinfo\.txt')
"""A regexp matching fileids that are used by this corpus reader."""
- _UTTERANCE_RE = r"\w+-\w+/\w+\.txt"
+ _UTTERANCE_RE = r'\w+-\w+/\w+\.txt'
- def __init__(self, root, encoding="utf8"):
+ def __init__(self, root, encoding='utf8'):
"""
Construct a new TIMIT corpus reader in the given directory.
:param root: The root directory for this corpus.
"""
# Ensure that wave files don't get treated as unicode data:
- if isinstance(encoding, str):
- encoding = [(".*\.wav", None), (".*", encoding)]
+ if isinstance(encoding, string_types):
+ encoding = [('.*\.wav', None), ('.*', encoding)]
- CorpusReader.__init__(
- self, root, find_corpus_fileids(root, self._FILE_RE), encoding=encoding
- )
+ CorpusReader.__init__(self, root,
+ find_corpus_fileids(root, self._FILE_RE),
+ encoding=encoding)
- self._utterances = [
- name[:-4] for name in find_corpus_fileids(root, self._UTTERANCE_RE)
- ]
+ self._utterances = [name[:-4] for name in
+ find_corpus_fileids(root, self._UTTERANCE_RE)]
"""A list of the utterance identifiers for all utterances in
this corpus."""
self._speakerinfo = None
self._root = root
- self.speakers = sorted(set(u.split("/")[0] for u in self._utterances))
+ self.speakers = sorted(set(u.split('/')[0] for u in self._utterances))
def fileids(self, filetype=None):
"""
"""
if filetype is None:
return CorpusReader.fileids(self)
- elif filetype in ("txt", "wrd", "phn", "wav"):
- return ["%s.%s" % (u, filetype) for u in self._utterances]
- elif filetype == "metadata":
- return ["timitdic.txt", "spkrinfo.txt"]
+ elif filetype in ('txt', 'wrd', 'phn', 'wav'):
+ return ['%s.%s' % (u, filetype) for u in self._utterances]
+ elif filetype == 'metadata':
+ return ['timitdic.txt', 'spkrinfo.txt']
else:
- raise ValueError("Bad value for filetype: %r" % filetype)
+ raise ValueError('Bad value for filetype: %r' % filetype)
- def utteranceids(
- self, dialect=None, sex=None, spkrid=None, sent_type=None, sentid=None
- ):
+ def utteranceids(self, dialect=None, sex=None, spkrid=None,
+ sent_type=None, sentid=None):
"""
:return: A list of the utterance identifiers for all
utterances in this corpus, or for the given speaker, dialect
region, gender, sentence type, or sentence number, if
specified.
"""
- if isinstance(dialect, str):
- dialect = [dialect]
- if isinstance(sex, str):
- sex = [sex]
- if isinstance(spkrid, str):
- spkrid = [spkrid]
- if isinstance(sent_type, str):
- sent_type = [sent_type]
- if isinstance(sentid, str):
- sentid = [sentid]
+ if isinstance(dialect, string_types): dialect = [dialect]
+ if isinstance(sex, string_types): sex = [sex]
+ if isinstance(spkrid, string_types): spkrid = [spkrid]
+ if isinstance(sent_type, string_types): sent_type = [sent_type]
+ if isinstance(sentid, string_types): sentid = [sentid]
utterances = self._utterances[:]
if dialect is not None:
each word.
"""
_transcriptions = {}
- for line in self.open("timitdic.txt"):
- if not line.strip() or line[0] == ";":
- continue
- m = re.match(r"\s*(\S+)\s+/(.*)/\s*$", line)
- if not m:
- raise ValueError("Bad line: %r" % line)
+ for line in self.open('timitdic.txt'):
+ if not line.strip() or line[0] == ';': continue
+ m = re.match(r'\s*(\S+)\s+/(.*)/\s*$', line)
+ if not m: raise ValueError('Bad line: %r' % line)
_transcriptions[m.group(1)] = m.group(2).split()
return _transcriptions
def spkrid(self, utterance):
- return utterance.split("/")[0]
+ return utterance.split('/')[0]
def sentid(self, utterance):
- return utterance.split("/")[1]
+ return utterance.split('/')[1]
def utterance(self, spkrid, sentid):
- return "%s/%s" % (spkrid, sentid)
+ return '%s/%s' % (spkrid, sentid)
def spkrutteranceids(self, speaker):
"""
:return: A list of all utterances associated with a given
speaker.
"""
- return [
- utterance
- for utterance in self._utterances
- if utterance.startswith(speaker + "/")
- ]
+ return [utterance for utterance in self._utterances
+ if utterance.startswith(speaker+'/')]
def spkrinfo(self, speaker):
"""
if self._speakerinfo is None:
self._speakerinfo = {}
- for line in self.open("spkrinfo.txt"):
- if not line.strip() or line[0] == ";":
- continue
+ for line in self.open('spkrinfo.txt'):
+ if not line.strip() or line[0] == ';': continue
rec = line.strip().split(None, 9)
- key = "dr%s-%s%s" % (rec[2], rec[1].lower(), rec[0].lower())
+ key = "dr%s-%s%s" % (rec[2],rec[1].lower(),rec[0].lower())
self._speakerinfo[key] = SpeakerInfo(*rec)
return self._speakerinfo[speaker]
def phones(self, utterances=None):
- return [
- line.split()[-1]
- for fileid in self._utterance_fileids(utterances, ".phn")
- for line in self.open(fileid)
- if line.strip()
- ]
+ return [line.split()[-1]
+ for fileid in self._utterance_fileids(utterances, '.phn')
+ for line in self.open(fileid) if line.strip()]
def phone_times(self, utterances=None):
"""
offset is represented as a number of 16kHz samples!
"""
- return [
- (line.split()[2], int(line.split()[0]), int(line.split()[1]))
- for fileid in self._utterance_fileids(utterances, ".phn")
- for line in self.open(fileid)
- if line.strip()
- ]
+ return [(line.split()[2], int(line.split()[0]), int(line.split()[1]))
+ for fileid in self._utterance_fileids(utterances, '.phn')
+ for line in self.open(fileid) if line.strip()]
def words(self, utterances=None):
- return [
- line.split()[-1]
- for fileid in self._utterance_fileids(utterances, ".wrd")
- for line in self.open(fileid)
- if line.strip()
- ]
+ return [line.split()[-1]
+ for fileid in self._utterance_fileids(utterances, '.wrd')
+ for line in self.open(fileid) if line.strip()]
def word_times(self, utterances=None):
- return [
- (line.split()[2], int(line.split()[0]), int(line.split()[1]))
- for fileid in self._utterance_fileids(utterances, ".wrd")
- for line in self.open(fileid)
- if line.strip()
- ]
+ return [(line.split()[2], int(line.split()[0]), int(line.split()[1]))
+ for fileid in self._utterance_fileids(utterances, '.wrd')
+ for line in self.open(fileid) if line.strip()]
def sents(self, utterances=None):
- return [
- [line.split()[-1] for line in self.open(fileid) if line.strip()]
- for fileid in self._utterance_fileids(utterances, ".wrd")
- ]
+ return [[line.split()[-1]
+ for line in self.open(fileid) if line.strip()]
+ for fileid in self._utterance_fileids(utterances, '.wrd')]
def sent_times(self, utterances=None):
- return [
- (
- line.split(None, 2)[-1].strip(),
- int(line.split()[0]),
- int(line.split()[1]),
- )
- for fileid in self._utterance_fileids(utterances, ".txt")
- for line in self.open(fileid)
- if line.strip()
- ]
+ return [(line.split(None,2)[-1].strip(),
+ int(line.split()[0]), int(line.split()[1]))
+ for fileid in self._utterance_fileids(utterances, '.txt')
+ for line in self.open(fileid) if line.strip()]
def phone_trees(self, utterances=None):
- if utterances is None:
- utterances = self._utterances
- if isinstance(utterances, str):
- utterances = [utterances]
+ if utterances is None: utterances = self._utterances
+ if isinstance(utterances, string_types): utterances = [utterances]
trees = []
for utterance in utterances:
while sent_times:
(sent, sent_start, sent_end) = sent_times.pop(0)
- trees.append(Tree("S", []))
- while (
- word_times and phone_times and phone_times[0][2] <= word_times[0][1]
- ):
+ trees.append(Tree('S', []))
+ while (word_times and phone_times and
+ phone_times[0][2] <= word_times[0][1]):
trees[-1].append(phone_times.pop(0)[0])
while word_times and word_times[0][2] <= sent_end:
(word, word_start, word_end) = word_times.pop(0)
# fileids.
def wav(self, utterance, start=0, end=None):
# nltk.chunk conflicts with the stdlib module 'chunk'
- wave = import_from_stdlib("wave")
+ wave = import_from_stdlib('wave')
- w = wave.open(self.open(utterance + ".wav"), "rb")
+ w = wave.open(self.open(utterance+'.wav'), 'rb')
if end is None:
end = w.getnframes()
# Skip past frames before start, then read the frames we want
w.readframes(start)
- frames = w.readframes(end - start)
+ frames = w.readframes(end-start)
# Open a new temporary file -- the wave module requires
# an actual file, and won't work w/ stringio. :(
tf = tempfile.TemporaryFile()
- out = wave.open(tf, "w")
+ out = wave.open(tf, 'w')
# Write the parameters & data to the new file.
out.setparams(w.getparams())
return tf.read()
def audiodata(self, utterance, start=0, end=None):
- assert end is None or end > start
+ assert(end is None or end > start)
headersize = 44
if end is None:
- data = self.open(utterance + ".wav").read()
+ data = self.open(utterance+'.wav').read()
else:
- data = self.open(utterance + ".wav").read(headersize + end * 2)
- return data[headersize + start * 2 :]
+ data = self.open(utterance+'.wav').read(headersize+end*2)
+ return data[headersize+start*2:]
def _utterance_fileids(self, utterances, extension):
- if utterances is None:
- utterances = self._utterances
- if isinstance(utterances, str):
- utterances = [utterances]
- return ["%s%s" % (u, extension) for u in utterances]
+ if utterances is None: utterances = self._utterances
+ if isinstance(utterances, string_types): utterances = [utterances]
+ return ['%s%s' % (u, extension) for u in utterances]
def play(self, utterance, start=0, end=None):
"""
# Method 1: os audio dev.
try:
import ossaudiodev
-
try:
- dsp = ossaudiodev.open("w")
+ dsp = ossaudiodev.open('w')
dsp.setfmt(ossaudiodev.AFMT_S16_LE)
dsp.channels(1)
dsp.speed(16000)
dsp.write(self.audiodata(utterance, start, end))
dsp.close()
except IOError as e:
- print(
- (
- "can't acquire the audio device; please "
- "activate your audio device."
- ),
- file=sys.stderr,
- )
+ print(("can't acquire the audio device; please "
+ "activate your audio device."), file=sys.stderr)
print("system error message:", str(e), file=sys.stderr)
return
except ImportError:
try:
# FIXME: this won't work under python 3
import pygame.mixer, StringIO
-
pygame.mixer.init(16000)
f = StringIO.StringIO(self.wav(utterance, start, end))
pygame.mixer.Sound(f).play()
pass
# Method 3: complain. :)
- print(
- ("you must install pygame or ossaudiodev " "for audio playback."),
- file=sys.stderr,
- )
+ print(("you must install pygame or ossaudiodev "
+ "for audio playback."), file=sys.stderr)
+@compat.python_2_unicode_compatible
class SpeakerInfo(object):
- def __init__(
- self, id, sex, dr, use, recdate, birthdate, ht, race, edu, comments=None
- ):
+ def __init__(self, id, sex, dr, use, recdate, birthdate,
+ ht, race, edu, comments=None):
self.id = id
self.sex = sex
self.dr = dr
self.comments = comments
def __repr__(self):
- attribs = "id sex dr use recdate birthdate ht race edu comments"
- args = ["%s=%r" % (attr, getattr(self, attr)) for attr in attribs.split()]
- return "SpeakerInfo(%s)" % (", ".join(args))
+ attribs = 'id sex dr use recdate birthdate ht race edu comments'
+ args = ['%s=%r' % (attr, getattr(self, attr))
+ for attr in attribs.split()]
+ return 'SpeakerInfo(%s)' % (', '.join(args))
def read_timit_block(stream):
number that will be ignored.
"""
line = stream.readline()
- if not line:
- return []
- n, sent = line.split(" ", 1)
+ if not line: return []
+ n, sent = line.split(' ', 1)
return [sent]
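A minimal usage sketch, assuming the TIMIT sample corpus is installed; the
utterance id shown is illustrative:

    from nltk.corpus import timit
    utt = timit.utteranceids()[0]    # e.g. 'dr1-fvmh0/sa1'
    timit.words(utt)[:3]
    timit.phone_times(utt)[:2]       # (phone, start, end), offsets in 16kHz samples
    # audiodata() slices the .wav payload at headersize + offset*2,
    # since the samples are 16-bit (2 bytes each)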
# Natural Language Toolkit: Toolbox Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Greg Aumann <greg_aumann@sil.org>
# Stuart Robinson <Stuart.Robinson@mpi.nl>
# Steven Bird <stevenbird1@gmail.com>
Toolbox databases and settings fileids.
"""
+import os
+import re
+import codecs
+
+from six import string_types
+
from nltk.toolbox import ToolboxData
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
-
class ToolboxCorpusReader(CorpusReader):
def xml(self, fileids, key=None):
- return concat(
- [
- ToolboxData(path, enc).parse(key=key)
- for (path, enc) in self.abspaths(fileids, True)
- ]
- )
+ return concat([ToolboxData(path, enc).parse(key=key)
+ for (path, enc) in self.abspaths(fileids, True)])
- def fields(
- self,
- fileids,
- strip=True,
- unwrap=True,
- encoding="utf8",
- errors="strict",
- unicode_fields=None,
- ):
- return concat(
- [
- list(
- ToolboxData(fileid, enc).fields(
- strip, unwrap, encoding, errors, unicode_fields
- )
- )
- for (fileid, enc) in self.abspaths(fileids, include_encoding=True)
- ]
- )
+ def fields(self, fileids, strip=True, unwrap=True, encoding='utf8',
+ errors='strict', unicode_fields=None):
+ return concat([list(ToolboxData(fileid,enc).fields(
+ strip, unwrap, encoding, errors, unicode_fields))
+ for (fileid, enc)
+ in self.abspaths(fileids, include_encoding=True)])
# should probably be done lazily:
def entries(self, fileids, **kwargs):
- if "key" in kwargs:
- key = kwargs["key"]
- del kwargs["key"]
+ if 'key' in kwargs:
+ key = kwargs['key']
+ del kwargs['key']
else:
- key = "lx" # the default key in MDF
+ key = 'lx' # the default key in MDF
entries = []
for marker, contents in self.fields(fileids, **kwargs):
if marker == key:
pass
return entries
- def words(self, fileids, key="lx"):
+ def words(self, fileids, key='lx'):
return [contents for marker, contents in self.fields(fileids) if marker == key]
def raw(self, fileids):
- if fileids is None:
- fileids = self._fileids
- elif isinstance(fileids, str):
- fileids = [fileids]
+ if fileids is None: fileids = self._fileids
+ elif isinstance(fileids, string_types): fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
def demo():
pass
-
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
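A minimal usage sketch, assuming the toolbox sample data (rotokas.dic) has
been downloaded:

    from nltk.corpus import toolbox
    lexicon = toolbox.entries('rotokas.dic')   # [(headword, [(marker, value), ...]), ...]
    headword, fields = lexicon[0]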
# Natural Language Toolkit: Twitter Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
import json
import os
+from six import string_types
+
from nltk.tokenize import TweetTokenizer
from nltk.corpus.reader.util import StreamBackedCorpusView, concat, ZipFilePathPointer
The corpus view class used by this reader.
"""
- def __init__(
- self, root, fileids=None, word_tokenizer=TweetTokenizer(), encoding="utf8"
- ):
+ def __init__(self, root, fileids=None,
+ word_tokenizer=TweetTokenizer(),
+ encoding='utf8'):
"""
:param root: The root directory for this corpus.
self._word_tokenizer = word_tokenizer
+
+
def docs(self, fileids=None):
"""
Returns the full Tweet objects, as specified by `Twitter
from JSON.
:rtype: list(dict)
"""
- return concat(
- [
- self.CorpusView(path, self._read_tweets, encoding=enc)
- for (path, enc, fileid) in self.abspaths(fileids, True, True)
- ]
- )
+ return concat([self.CorpusView(path, self._read_tweets, encoding=enc)
+ for (path, enc, fileid) in self.abspaths(fileids, True, True)])
+
def strings(self, fileids=None):
"""
tweets = []
for jsono in fulltweets:
try:
- text = jsono["text"]
+ text = jsono['text']
if isinstance(text, bytes):
text = text.decode(self.encoding)
tweets.append(text)
pass
return tweets
+
def tokenized(self, fileids=None):
"""
:return: the given file(s) as a list of the text content of Tweets as
tokenizer = self._word_tokenizer
return [tokenizer.tokenize(t) for t in tweets]
+
def raw(self, fileids=None):
"""
Return the corpora in their raw form.
"""
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, str):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
+
def _read_tweets(self, stream):
"""
Assumes that each line in ``stream`` is a JSON-serialised object.
"""
UDHR corpus reader. It mostly deals with encodings.
"""
+from __future__ import absolute_import, unicode_literals
from nltk.corpus.reader.util import find_corpus_fileids
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
-
class UdhrCorpusReader(PlaintextCorpusReader):
ENCODINGS = [
- (".*-Latin1$", "latin-1"),
- (".*-Hebrew$", "hebrew"),
- (".*-Arabic$", "cp1256"),
- ("Czech_Cesky-UTF8", "cp1250"), # yeah
- (".*-Cyrillic$", "cyrillic"),
- (".*-SJIS$", "SJIS"),
- (".*-GB2312$", "GB2312"),
- (".*-Latin2$", "ISO-8859-2"),
- (".*-Greek$", "greek"),
- (".*-UTF8$", "utf-8"),
- ("Hungarian_Magyar-Unicode", "utf-16-le"),
- ("Amahuaca", "latin1"),
- ("Turkish_Turkce-Turkish", "latin5"),
- ("Lithuanian_Lietuviskai-Baltic", "latin4"),
- ("Japanese_Nihongo-EUC", "EUC-JP"),
- ("Japanese_Nihongo-JIS", "iso2022_jp"),
- ("Chinese_Mandarin-HZ", "hz"),
- ("Abkhaz\-Cyrillic\+Abkh", "cp1251"),
+ ('.*-Latin1$', 'latin-1'),
+ ('.*-Hebrew$', 'hebrew'),
+ ('.*-Arabic$', 'cp1256'),
+ ('Czech_Cesky-UTF8', 'cp1250'), # yeah
+ ('.*-Cyrillic$', 'cyrillic'),
+ ('.*-SJIS$', 'SJIS'),
+ ('.*-GB2312$', 'GB2312'),
+ ('.*-Latin2$', 'ISO-8859-2'),
+ ('.*-Greek$', 'greek'),
+ ('.*-UTF8$', 'utf-8'),
+
+ ('Hungarian_Magyar-Unicode', 'utf-16-le'),
+ ('Amahuaca', 'latin1'),
+ ('Turkish_Turkce-Turkish', 'latin5'),
+ ('Lithuanian_Lietuviskai-Baltic', 'latin4'),
+ ('Japanese_Nihongo-EUC', 'EUC-JP'),
+ ('Japanese_Nihongo-JIS', 'iso2022_jp'),
+ ('Chinese_Mandarin-HZ', 'hz'),
+ ('Abkhaz\-Cyrillic\+Abkh', 'cp1251'),
]
- SKIP = set(
- [
- # The following files are not fully decodable because they
- # were truncated at wrong bytes:
- "Burmese_Myanmar-UTF8",
- "Japanese_Nihongo-JIS",
- "Chinese_Mandarin-HZ",
- "Chinese_Mandarin-UTF8",
- "Gujarati-UTF8",
- "Hungarian_Magyar-Unicode",
- "Lao-UTF8",
- "Magahi-UTF8",
- "Marathi-UTF8",
- "Tamil-UTF8",
- # Unfortunately, encodings required for reading
- # the following files are not supported by Python:
- "Vietnamese-VPS",
- "Vietnamese-VIQR",
- "Vietnamese-TCVN",
- "Magahi-Agra",
- "Bhojpuri-Agra",
- "Esperanto-T61", # latin3 raises an exception
- # The following files are encoded for specific fonts:
- "Burmese_Myanmar-WinResearcher",
- "Armenian-DallakHelv",
- "Tigrinya_Tigrigna-VG2Main",
- "Amharic-Afenegus6..60375", # ?
- "Navaho_Dine-Navajo-Navaho-font",
- # What are these?
- "Azeri_Azerbaijani_Cyrillic-Az.Times.Cyr.Normal0117",
- "Azeri_Azerbaijani_Latin-Az.Times.Lat0117",
- # The following files are unintended:
- "Czech-Latin2-err",
- "Russian_Russky-UTF8~",
- ]
- )
+ SKIP = set([
+ # The following files are not fully decodable because they
+ # were truncated at wrong bytes:
+ 'Burmese_Myanmar-UTF8',
+ 'Japanese_Nihongo-JIS',
+ 'Chinese_Mandarin-HZ',
+ 'Chinese_Mandarin-UTF8',
+ 'Gujarati-UTF8',
+ 'Hungarian_Magyar-Unicode',
+ 'Lao-UTF8',
+ 'Magahi-UTF8',
+ 'Marathi-UTF8',
+ 'Tamil-UTF8',
+
+ # Unfortunately, encodings required for reading
+ # the following files are not supported by Python:
+ 'Vietnamese-VPS',
+ 'Vietnamese-VIQR',
+ 'Vietnamese-TCVN',
+ 'Magahi-Agra',
+ 'Bhojpuri-Agra',
+ 'Esperanto-T61', # latin3 raises an exception
+
+ # The following files are encoded for specific fonts:
+ 'Burmese_Myanmar-WinResearcher',
+ 'Armenian-DallakHelv',
+ 'Tigrinya_Tigrigna-VG2Main',
+ 'Amharic-Afenegus6..60375', # ?
+ 'Navaho_Dine-Navajo-Navaho-font',
+
+ # What are these?
+ 'Azeri_Azerbaijani_Cyrillic-Az.Times.Cyr.Normal0117',
+ 'Azeri_Azerbaijani_Latin-Az.Times.Lat0117',
+
+ # The following files are unintended:
+ 'Czech-Latin2-err',
+ 'Russian_Russky-UTF8~',
+ ])
+
- def __init__(self, root="udhr"):
- fileids = find_corpus_fileids(root, r"(?!README|\.).*")
+ def __init__(self, root='udhr'):
+ fileids = find_corpus_fileids(root, r'(?!README|\.).*')
super(UdhrCorpusReader, self).__init__(
root,
[fileid for fileid in fileids if fileid not in self.SKIP],
- encoding=self.ENCODINGS,
+ encoding=self.ENCODINGS
)
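Each fileid is decoded with the first (regexp, encoding) pair above that
matches it. For example, assuming the udhr corpus is installed:

    from nltk.corpus import udhr
    udhr.words('English-Latin1')[:6]    # decoded via the ('.*-Latin1$', 'latin-1') pair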
# Natural Language Toolkit: Corpus Reader Utilities
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
import bisect
import re
import tempfile
-import pickle
+from six import string_types, text_type
from functools import reduce
-from xml.etree import ElementTree
+try:
+ import cPickle as pickle
+except ImportError:
+ import pickle
+
+# Use the c version of ElementTree, which is faster, if possible:
+try: from xml.etree import cElementTree as ElementTree
+except ImportError: from xml.etree import ElementTree
from nltk.tokenize import wordpunct_tokenize
from nltk.internals import slice_bounds
from nltk.util import AbstractLazySequence, LazySubsequence, LazyConcatenation, py25
######################################################################
-# { Corpus View
+#{ Corpus View
######################################################################
-
class StreamBackedCorpusView(AbstractLazySequence):
"""
A 'view' of a corpus file, which acts like a sequence of tokens:
end_toknum is the token index of the first token not in the
block; and tokens is a list of the tokens in the block.
"""
-
- def __init__(self, fileid, block_reader=None, startpos=0, encoding="utf8"):
+ def __init__(self, fileid, block_reader=None, startpos=0,
+ encoding='utf8'):
"""
Create a new corpus view, based on the file ``fileid``, and
read with ``block_reader``. See the class documentation
else:
self._eofpos = os.stat(self._fileid).st_size
except Exception as exc:
- raise ValueError("Unable to open or access %r -- %s" % (fileid, exc))
+ raise ValueError('Unable to open or access %r -- %s' %
+ (fileid, exc))
# Maintain a cache of the most recently read block, to
# increase efficiency of random access.
self._cache = (-1, -1, None)
- fileid = property(
- lambda self: self._fileid,
- doc="""
+ fileid = property(lambda self: self._fileid, doc="""
The fileid of the file that is accessed by this view.
- :type: str or PathPointer""",
- )
+ :type: str or PathPointer""")
def read_block(self, stream):
"""
:param stream: an input stream
:type stream: stream
"""
- raise NotImplementedError("Abstract Method")
+ raise NotImplementedError('Abstract Method')
def _open(self):
"""
self._stream = self._fileid.open(self._encoding)
elif self._encoding:
self._stream = SeekableUnicodeStreamReader(
- open(self._fileid, "rb"), self._encoding
- )
+ open(self._fileid, 'rb'), self._encoding)
else:
- self._stream = open(self._fileid, "rb")
+ self._stream = open(self._fileid, 'rb')
def close(self):
"""
if self._len is None:
# iterate_from() sets self._len when it reaches the end
# of the file:
- for tok in self.iterate_from(self._toknum[-1]):
- pass
+ for tok in self.iterate_from(self._toknum[-1]): pass
return self._len
def __getitem__(self, i):
# Check if it's in the cache.
offset = self._cache[0]
if offset <= start and stop <= self._cache[1]:
- return self._cache[2][start - offset : stop - offset]
+ return self._cache[2][start-offset:stop-offset]
# Construct & return the result.
return LazySubsequence(self, start, stop)
else:
# Handle negative indices
- if i < 0:
- i += len(self)
- if i < 0:
- raise IndexError("index out of range")
+ if i < 0: i += len(self)
+ if i < 0: raise IndexError('index out of range')
# Check if it's in the cache.
offset = self._cache[0]
if offset <= i < self._cache[1]:
- return self._cache[2][i - offset]
+ return self._cache[2][i-offset]
# Use iterate_from to extract it.
try:
return next(self.iterate_from(i))
except StopIteration:
- raise IndexError("index out of range")
+ raise IndexError('index out of range')
# If we wanted to be thread-safe, then this method would need to
# do some locking.
def iterate_from(self, start_tok):
# Start by feeding from the cache, if possible.
if self._cache[0] <= start_tok < self._cache[1]:
- for tok in self._cache[2][start_tok - self._cache[0] :]:
+ for tok in self._cache[2][start_tok-self._cache[0]:]:
yield tok
start_tok += 1
# our mapping, then we can jump straight to the correct block;
# otherwise, start at the last block we've processed.
if start_tok < self._toknum[-1]:
- block_index = bisect.bisect_right(self._toknum, start_tok) - 1
+ block_index = bisect.bisect_right(self._toknum, start_tok)-1
toknum = self._toknum[block_index]
filepos = self._filepos[block_index]
else:
- block_index = len(self._toknum) - 1
+ block_index = len(self._toknum)-1
toknum = self._toknum[-1]
filepos = self._filepos[-1]
self._current_blocknum = block_index
tokens = self.read_block(self._stream)
assert isinstance(tokens, (tuple, list, AbstractLazySequence)), (
- "block reader %s() should return list or tuple."
- % self.read_block.__name__
- )
+ 'block reader %s() should return list or tuple.' %
+ self.read_block.__name__)
num_toks = len(tokens)
new_filepos = self._stream.tell()
assert new_filepos > filepos, (
- "block reader %s() should consume at least 1 byte (filepos=%d)"
- % (self.read_block.__name__, filepos)
- )
+ 'block reader %s() should consume at least 1 byte (filepos=%d)' %
+ (self.read_block.__name__, filepos))
# Update our cache.
- self._cache = (toknum, toknum + num_toks, list(tokens))
+ self._cache = (toknum, toknum+num_toks, list(tokens))
# Update our mapping.
assert toknum <= self._toknum[-1]
if num_toks > 0:
block_index += 1
if toknum == self._toknum[-1]:
- assert new_filepos > self._filepos[-1] # monotonic!
+ assert new_filepos > self._filepos[-1] # monotonic!
self._filepos.append(new_filepos)
- self._toknum.append(toknum + num_toks)
+ self._toknum.append(toknum+num_toks)
else:
# Check for consistency:
- assert (
- new_filepos == self._filepos[block_index]
- ), "inconsistent block reader (num chars read)"
- assert (
- toknum + num_toks == self._toknum[block_index]
- ), "inconsistent block reader (num tokens returned)"
+ assert new_filepos == self._filepos[block_index], (
+ 'inconsistent block reader (num chars read)')
+ assert toknum+num_toks == self._toknum[block_index], (
+ 'inconsistent block reader (num tokens returned)')
# If we reached the end of the file, then update self._len
if new_filepos == self._eofpos:
# Generate the tokens in this block (but skip any tokens
# before start_tok). Note that between yields, our state
# may be modified.
- for tok in tokens[max(0, start_tok - toknum) :]:
+ for tok in tokens[max(0, start_tok-toknum):]:
yield tok
# If we're at the end of the file, then we're done.
assert new_filepos <= self._eofpos
# when possible.
def __add__(self, other):
return concat([self, other])
-
def __radd__(self, other):
return concat([other, self])
-
def __mul__(self, count):
return concat([self] * count)
-
def __rmul__(self, count):
return concat([self] * count)
-
class ConcatenatedCorpusView(AbstractLazySequence):
"""
A 'view' of a corpus file that joins together one or more
``StreamBackedCorpusViews<StreamBackedCorpusView>``. At most
one file handle is left open at any time.
"""
-
def __init__(self, corpus_views):
self._pieces = corpus_views
"""A list of the corpus subviews that make up this
def __len__(self):
if len(self._offsets) <= len(self._pieces):
# Iterate to the end of the corpus.
- for tok in self.iterate_from(self._offsets[-1]):
- pass
+ for tok in self.iterate_from(self._offsets[-1]): pass
return self._offsets[-1]
piece.close()
def iterate_from(self, start_tok):
- piecenum = bisect.bisect_right(self._offsets, start_tok) - 1
+ piecenum = bisect.bisect_right(self._offsets, start_tok)-1
while piecenum < len(self._pieces):
offset = self._offsets[piecenum]
self._open_piece = piece
# Get everything we can from this piece.
- for tok in piece.iterate_from(max(0, start_tok - offset)):
+ for tok in piece.iterate_from(max(0, start_tok-offset)):
yield tok
# Update the offset table.
- if piecenum + 1 == len(self._offsets):
+ if piecenum+1 == len(self._offsets):
self._offsets.append(self._offsets[-1] + len(piece))
# Move on to the next piece.
piecenum += 1
-
def concat(docs):
"""
Concatenate together the contents of multiple documents from a
if len(docs) == 1:
return docs[0]
if len(docs) == 0:
- raise ValueError("concat() expects at least one object!")
+ raise ValueError('concat() expects at least one object!')
types = set(d.__class__ for d in docs)
# If they're all strings, use string concatenation.
- if all(isinstance(doc, str) for doc in docs):
- return "".join(docs)
+ if all(isinstance(doc, string_types) for doc in docs):
+ return ''.join(docs)
# If they're all corpus views, then use ConcatenatedCorpusView.
for typ in types:
- if not issubclass(typ, (StreamBackedCorpusView, ConcatenatedCorpusView)):
+ if not issubclass(typ, (StreamBackedCorpusView,
+ ConcatenatedCorpusView)):
break
else:
return ConcatenatedCorpusView(docs)
typ = list(types)[0]
if issubclass(typ, list):
- return reduce((lambda a, b: a + b), docs, [])
+ return reduce((lambda a,b:a+b), docs, [])
if issubclass(typ, tuple):
- return reduce((lambda a, b: a + b), docs, ())
+ return reduce((lambda a,b:a+b), docs, ())
if ElementTree.iselement(typ):
- xmltree = ElementTree.Element("documents")
- for doc in docs:
- xmltree.append(doc)
+ xmltree = ElementTree.Element('documents')
+ for doc in docs: xmltree.append(doc)
return xmltree
# No method found!
raise ValueError("Don't know how to concatenate types: %r" % types)
-
######################################################################
-# { Corpus View for Pickled Sequences
+#{ Corpus View for Pickled Sequences
######################################################################
-
class PickleCorpusView(StreamBackedCorpusView):
"""
A stream backed corpus view for corpus files that consist of
>>> PickleCorpusView.write(feature_corpus, some_fileid) # doctest: +SKIP
>>> pcv = PickleCorpusView(some_fileid) # doctest: +SKIP
"""
-
BLOCK_SIZE = 100
PROTOCOL = -1
def read_block(self, stream):
result = []
for i in range(self.BLOCK_SIZE):
- try:
- result.append(pickle.load(stream))
- except EOFError:
- break
+ try: result.append(pickle.load(stream))
+ except EOFError: break
return result
def __del__(self):
fileid. (This method is called whenever a
``PickleCorpusView`` is garbage-collected.)
"""
- if getattr(self, "_delete_on_gc"):
+ if getattr(self, '_delete_on_gc', False):
if os.path.exists(self._fileid):
- try:
- os.remove(self._fileid)
- except (OSError, IOError):
- pass
- self.__dict__.clear() # make the garbage collector's job easier
+ try: os.remove(self._fileid)
+ except (OSError, IOError): pass
+ self.__dict__.clear() # make the garbage collector's job easier
@classmethod
def write(cls, sequence, output_file):
- if isinstance(output_file, str):
- output_file = open(output_file, "wb")
+ if isinstance(output_file, string_types):
+ output_file = open(output_file, 'wb')
for item in sequence:
pickle.dump(item, output_file, cls.PROTOCOL)
deleted whenever this object gets garbage-collected.
"""
try:
- fd, output_file_name = tempfile.mkstemp(".pcv", "nltk-")
- output_file = os.fdopen(fd, "wb")
+ fd, output_file_name = tempfile.mkstemp('.pcv', 'nltk-')
+ output_file = os.fdopen(fd, 'wb')
cls.write(sequence, output_file)
output_file.close()
return PickleCorpusView(output_file_name, delete_on_gc)
except (OSError, IOError) as e:
- raise ValueError("Error while creating temp file: %s" % e)
+ raise ValueError('Error while creating temp file: %s' % e)
+
######################################################################
-# { Block Readers
+#{ Block Readers
######################################################################
-
def read_whitespace_block(stream):
toks = []
- for i in range(20): # Read 20 lines at a time.
+ for i in range(20): # Read 20 lines at a time.
toks.extend(stream.readline().split())
return toks
-
def read_wordpunct_block(stream):
toks = []
- for i in range(20): # Read 20 lines at a time.
+ for i in range(20): # Read 20 lines at a time.
toks.extend(wordpunct_tokenize(stream.readline()))
return toks
-
def read_line_block(stream):
toks = []
for i in range(20):
line = stream.readline()
- if not line:
- return toks
- toks.append(line.rstrip("\n"))
+ if not line: return toks
+ toks.append(line.rstrip('\n'))
return toks
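A minimal sketch of the block-reader contract (assumes a plain in-memory
stream; real corpus views pass a SeekableUnicodeStreamReader):
>>> from io import StringIO # doctest: +SKIP
>>> read_whitespace_block(StringIO('a b\nc d e\n')) # doctest: +SKIP
['a', 'b', 'c', 'd', 'e']
>>> read_line_block(StringIO('one\ntwo\n')) # doctest: +SKIP
['one', 'two']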
-
def read_blankline_block(stream):
- s = ""
+ s = ''
while True:
line = stream.readline()
# End of file:
if not line:
- if s:
- return [s]
- else:
- return []
+ if s: return [s]
+ else: return []
# Blank line:
elif line and not line.strip():
- if s:
- return [s]
+ if s: return [s]
# Other line:
else:
s += line
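Sketch: each call returns one blank-line-delimited "paragraph" (assumes a
plain in-memory stream):
>>> from io import StringIO # doctest: +SKIP
>>> read_blankline_block(StringIO('a\nb\n\nc\n')) # doctest: +SKIP
['a\nb\n']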
-
def read_alignedsent_block(stream):
- s = ""
+ s = ''
while True:
line = stream.readline()
- if line[0] == "=" or line[0] == "\n" or line[:2] == "\r\n":
+ # Skip separator and blank lines; guard against EOF, where line == ''.
+ if line and (line[0] == '=' or line[0] == '\n' or line[:2] == '\r\n'):
continue
# End of file:
if not line:
- if s:
- return [s]
- else:
- return []
+ if s: return [s]
+ else: return []
# Other line:
else:
s += line
- if re.match("^\d+-\d+", line) is not None:
+ if re.match(r'^\d+-\d+', line) is not None:
return [s]
-
def read_regexp_block(stream, start_re, end_re=None):
"""
Read a sequence of tokens from a stream, where tokens begin with
# Scan until we find a line matching the start regexp.
while True:
line = stream.readline()
- if not line:
- return [] # end of file.
- if re.match(start_re, line):
- break
+ if not line: return [] # end of file.
+ if re.match(start_re, line): break
# Scan until we find another line matching the regexp, or EOF.
lines = [line]
line = stream.readline()
# End of file:
if not line:
- return ["".join(lines)]
+ return [''.join(lines)]
# End of token:
if end_re is not None and re.match(end_re, line):
- return ["".join(lines)]
+ return [''.join(lines)]
# Start of new token: backup to just before it starts, and
# return the token we've already collected.
if end_re is None and re.match(start_re, line):
stream.seek(oldpos)
- return ["".join(lines)]
+ return [''.join(lines)]
# Anything else is part of the token.
lines.append(line)
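Sketch: with only start_re, a token runs from one matching line up to (but
not including) the next match, and the stream is rewound so the next call
resumes there (assumes a seekable in-memory stream):
>>> from io import StringIO # doctest: +SKIP
>>> read_regexp_block(StringIO('id=1\nfoo\nid=2\nbar\n'), start_re=r'id=') # doctest: +SKIP
['id=1\nfoo\n']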
-
def read_sexpr_block(stream, block_size=16384, comment_char=None):
"""
Read a sequence of s-expressions from the stream, and leave the
"""
start = stream.tell()
block = stream.read(block_size)
- encoding = getattr(stream, "encoding", None)
- assert encoding is not None or isinstance(block, str)
- if encoding not in (None, "utf-8"):
+ encoding = getattr(stream, 'encoding', None)
+ assert encoding is not None or isinstance(block, text_type)
+ if encoding not in (None, 'utf-8'):
import warnings
-
- warnings.warn(
- "Parsing may fail, depending on the properties "
- "of the %s encoding!" % encoding
- )
+ warnings.warn('Parsing may fail, depending on the properties '
+ 'of the %s encoding!' % encoding)
# (e.g., the utf-16 encoding does not work because it insists
# on adding BOMs to the beginning of encoded strings.)
if comment_char:
- COMMENT = re.compile("(?m)^%s.*$" % re.escape(comment_char))
+ COMMENT = re.compile('(?m)^%s.*$' % re.escape(comment_char))
while True:
try:
# If we're stripping comments, then make sure our block ends
# Read the block.
tokens, offset = _parse_sexpr_block(block)
# Skip whitespace
- offset = re.compile(r"\s*").search(block, offset).end()
+ offset = re.compile(r'\s*').search(block, offset).end()
# Move to the end position.
if encoding is None:
- stream.seek(start + offset)
+ stream.seek(start+offset)
else:
- stream.seek(start + len(block[:offset].encode(encoding)))
+ stream.seek(start+len(block[:offset].encode(encoding)))
# Return the list of tokens we processed
return tokens
except ValueError as e:
- if e.args[0] == "Block too small":
+ if e.args[0] == 'Block too small':
next_block = stream.read(block_size)
if next_block:
block += next_block
else:
# The file ended mid-sexpr -- return what we got.
return [block.strip()]
- else:
- raise
-
+ else: raise
def _sub_space(m):
"""Helper function: given a regexp match, return a string of
spaces that's the same length as the matched string."""
- return " " * (m.end() - m.start())
-
+ return ' '*(m.end()-m.start())
def _parse_sexpr_block(block):
tokens = []
start = end = 0
while end < len(block):
- m = re.compile(r"\S").search(block, end)
+ m = re.compile(r'\S').search(block, end)
if not m:
return tokens, end
start = m.start()
# Case 1: sexpr is not parenthesized.
- if m.group() != "(":
- m2 = re.compile(r"[\s(]").search(block, start)
+ if m.group() != '(':
+ m2 = re.compile(r'[\s(]').search(block, start)
if m2:
end = m2.start()
else:
- if tokens:
- return tokens, end
- raise ValueError("Block too small")
+ if tokens: return tokens, end
+ raise ValueError('Block too small')
# Case 2: parenthesized sexpr.
else:
nesting = 0
- for m in re.compile(r"[()]").finditer(block, start):
- if m.group() == "(":
- nesting += 1
- else:
- nesting -= 1
+ for m in re.compile(r'[()]').finditer(block, start):
+ if m.group()=='(': nesting += 1
+ else: nesting -= 1
if nesting == 0:
end = m.end()
break
else:
- if tokens:
- return tokens, end
- raise ValueError("Block too small")
+ if tokens: return tokens, end
+ raise ValueError('Block too small')
tokens.append(block[start:end])
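Sketch of the s-expression reader on an in-memory stream (both
parenthesized and bare tokens count as s-expressions):
>>> from io import StringIO # doctest: +SKIP
>>> read_sexpr_block(StringIO('(a b) (c (d e))')) # doctest: +SKIP
['(a b)', '(c (d e))']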
######################################################################
-# { Finding Corpus Items
+#{ Finding Corpus Items
######################################################################
-
def find_corpus_fileids(root, regexp):
if not isinstance(root, PathPointer):
- raise TypeError("find_corpus_fileids: expected a PathPointer")
- regexp += "$"
+ raise TypeError('find_corpus_fileids: expected a PathPointer')
+ regexp += '$'
# Find fileids in a zipfile: scan the zipfile's namelist. Filter
# out entries that end in '/' -- they're directories.
if isinstance(root, ZipFilePathPointer):
- fileids = [
- name[len(root.entry) :]
- for name in root.zipfile.namelist()
- if not name.endswith("/")
- ]
+ fileids = [name[len(root.entry):] for name in root.zipfile.namelist()
+ if not name.endswith('/')]
items = [name for name in fileids if re.match(regexp, name)]
return sorted(items)
# workaround for py25 which doesn't support followlinks
kwargs = {}
if not py25():
- kwargs = {"followlinks": True}
+ kwargs = {'followlinks': True}
for dirname, subdirs, fileids in os.walk(root.path, **kwargs):
- prefix = "".join("%s/" % p for p in _path_from(root.path, dirname))
- items += [
- prefix + fileid
- for fileid in fileids
- if re.match(regexp, prefix + fileid)
- ]
+ prefix = ''.join('%s/' % p for p in _path_from(root.path, dirname))
+ items += [prefix+fileid for fileid in fileids
+ if re.match(regexp, prefix+fileid)]
# Don't visit svn directories:
- if ".svn" in subdirs:
- subdirs.remove(".svn")
+ if '.svn' in subdirs: subdirs.remove('.svn')
return sorted(items)
else:
raise AssertionError("Don't know how to handle %r" % root)
-
def _path_from(parent, child):
- if os.path.split(parent)[1] == "":
+ if os.path.split(parent)[1] == '':
parent = os.path.split(parent)[0]
path = []
while parent != child:
assert os.path.split(child)[0] != child
return path
-
######################################################################
-# { Paragraph structure in Treebank files
+#{ Paragraph structure in Treebank files
######################################################################
-
def tagged_treebank_para_block_reader(stream):
# Read the next paragraph.
- para = ""
+ para = ''
while True:
line = stream.readline()
# End of paragraph:
- if re.match("======+\s*$", line):
- if para.strip():
- return [para]
+ if re.match(r'======+\s*$', line):
+ if para.strip(): return [para]
# End of file:
- elif line == "":
- if para.strip():
- return [para]
- else:
- return []
+ elif line == '':
+ if para.strip(): return [para]
+ else: return []
# Content line:
else:
para += line
# Natural Language Toolkit: Verbnet Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
For details about VerbNet see:
https://verbs.colorado.edu/~mpalmer/projects/verbnet.html
"""
+from __future__ import unicode_literals
import re
import textwrap
from collections import defaultdict
+from six import string_types
+
from nltk.corpus.reader.xmldocs import XMLCorpusReader
# runs 2-30 times faster.
self._quick_index()
- _LONGID_RE = re.compile(r"([^\-\.]*)-([\d+.\-]+)$")
+ _LONGID_RE = re.compile(r'([^\-\.]*)-([\d+.\-]+)$')
"""Regular expression that matches (and decomposes) longids"""
- _SHORTID_RE = re.compile(r"[\d+.\-]+$")
+ _SHORTID_RE = re.compile(r'[\d+.\-]+$')
"""Regular expression that matches shortids"""
- _INDEX_RE = re.compile(
- r'<MEMBER name="\??([^"]+)" wn="([^"]*)"[^>]+>|' r'<VNSUBCLASS ID="([^"]+)"/?>'
- )
+ _INDEX_RE = re.compile(r'<MEMBER name="\??([^"]+)" wn="([^"]*)"[^>]+>|'
+ r'<VNSUBCLASS ID="([^"]+)"/?>')
"""Regular expression used by ``_index()`` to quickly scan the corpus
for basic information."""
return sorted(self._lemma_to_class.keys())
else:
# [xx] should this include subclass members?
- if isinstance(vnclass, str):
+ if isinstance(vnclass, string_types):
vnclass = self.vnclass(vnclass)
- return [member.get("name") for member in vnclass.findall("MEMBERS/MEMBER")]
+ return [member.get('name') for member in
+ vnclass.findall('MEMBERS/MEMBER')]
def wordnetids(self, vnclass=None):
"""
return sorted(self._wordnet_to_class.keys())
else:
# [xx] should this include subclass members?
- if isinstance(vnclass, str):
+ if isinstance(vnclass, string_types):
vnclass = self.vnclass(vnclass)
- return sum(
- [
- member.get("wn", "").split()
- for member in vnclass.findall("MEMBERS/MEMBER")
- ],
- [],
- )
+ return sum([member.get('wn', '').split() for member in
+ vnclass.findall('MEMBERS/MEMBER')], [])
def classids(self, lemma=None, wordnetid=None, fileid=None, classid=None):
"""
If nothing is specified, return all classids within VerbNet
"""
if fileid is not None:
- return [c for (c, f) in self._class_to_fileid.items() if f == fileid]
+ return [c for (c, f) in self._class_to_fileid.items()
+ if f == fileid]
elif lemma is not None:
return self._lemma_to_class[lemma]
elif wordnetid is not None:
return self._wordnet_to_class[wordnetid]
elif classid is not None:
xmltree = self.vnclass(classid)
- return [
- subclass.get("ID")
- for subclass in xmltree.findall("SUBCLASSES/VNSUBCLASS")
- ]
+ return [subclass.get('ID') for subclass in
+ xmltree.findall('SUBCLASSES/VNSUBCLASS')]
else:
return sorted(self._class_to_fileid.keys())
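Usage sketch (assumes the 'verbnet' corpus data has been downloaded):
>>> from nltk.corpus import verbnet # doctest: +SKIP
>>> verbnet.classids(lemma='accept') # doctest: +SKIP
['approve-77', 'characterize-29.2-1-1', 'obtain-13.5.2']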
def vnclass(self, fileid_or_classid):
"""Returns VerbNet class ElementTree
-
+
Return an ElementTree containing the xml for the specified
VerbNet class.
if classid in self._class_to_fileid:
fileid = self._class_to_fileid[self.longid(classid)]
tree = self.xml(fileid)
- if classid == tree.get("ID"):
+ if classid == tree.get('ID'):
return tree
else:
- for subclass in tree.findall(".//VNSUBCLASS"):
- if classid == subclass.get("ID"):
+ for subclass in tree.findall('.//VNSUBCLASS'):
+ if classid == subclass.get('ID'):
return subclass
else:
assert False # we saw it during _index()!
else:
- raise ValueError("Unknown identifier {}".format(fileid_or_classid))
+ raise ValueError('Unknown identifier {}'.format(fileid_or_classid))
def fileids(self, vnclass_ids=None):
"""
"""
if vnclass_ids is None:
return self._fileids
- elif isinstance(vnclass_ids, str):
+ elif isinstance(vnclass_ids, string_types):
return [self._class_to_fileid[self.longid(vnclass_ids)]]
else:
- return [
- self._class_to_fileid[self.longid(vnclass_id)]
- for vnclass_id in vnclass_ids
- ]
+ return [self._class_to_fileid[self.longid(vnclass_id)]
+ for vnclass_id in vnclass_ids]
def frames(self, vnclass):
"""Given a VerbNet class, this method returns VerbNet frames
-
+
The members returned are:
1) Example
2) Description
3) Syntax
4) Semantics
-
+
:param vnclass: A VerbNet class identifier; or an ElementTree
containing the xml contents of a VerbNet class.
:return: frames - a list of frame dictionaries
"""
- if isinstance(vnclass, str):
+ if isinstance(vnclass, string_types):
vnclass = self.vnclass(vnclass)
frames = []
- vnframes = vnclass.findall("FRAMES/FRAME")
+ vnframes = vnclass.findall('FRAMES/FRAME')
for vnframe in vnframes:
- frames.append(
- {
- "example": self._get_example_within_frame(vnframe),
- "description": self._get_description_within_frame(vnframe),
- "syntax": self._get_syntactic_list_within_frame(vnframe),
- "semantics": self._get_semantics_within_frame(vnframe),
- }
- )
+ frames.append({
+ 'example': self._get_example_within_frame(vnframe),
+ 'description': self._get_description_within_frame(vnframe),
+ 'syntax': self._get_syntactic_list_within_frame(vnframe),
+ 'semantics': self._get_semantics_within_frame(vnframe)
+ })
return frames
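Usage sketch: each frame is a plain dict with the four keys built above
(assumes the 'verbnet' corpus data has been downloaded):
>>> from nltk.corpus import verbnet # doctest: +SKIP
>>> frame = verbnet.frames('approve-77')[0] # doctest: +SKIP
>>> sorted(frame.keys()) # doctest: +SKIP
['description', 'example', 'semantics', 'syntax']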
def subclasses(self, vnclass):
- """Returns subclass ids, if any exist
-
+ """Returns subclass ids, if any exist
+
Given a VerbNet class, this method returns subclass ids (if they exist)
in a list of strings.
-
+
:param vnclass: A VerbNet class identifier; or an ElementTree
containing the xml contents of a VerbNet class.
:return: list of subclasses
"""
- if isinstance(vnclass, str):
+ if isinstance(vnclass, string_types):
vnclass = self.vnclass(vnclass)
- subclasses = [
- subclass.get("ID") for subclass in vnclass.findall("SUBCLASSES/VNSUBCLASS")
- ]
+ subclasses = [subclass.get('ID') for subclass in
+ vnclass.findall('SUBCLASSES/VNSUBCLASS')]
return subclasses
def themroles(self, vnclass):
"""Returns thematic roles participating in a VerbNet class
-
+
Members returned as part of roles are-
1) Type
2) Modifiers
-
+
:param vnclass: A VerbNet class identifier; or an ElementTree
containing the xml contents of a VerbNet class.
:return: themroles: A list of thematic roles in the VerbNet class
"""
- if isinstance(vnclass, str):
+ if isinstance(vnclass, string_types):
vnclass = self.vnclass(vnclass)
themroles = []
- for trole in vnclass.findall("THEMROLES/THEMROLE"):
- themroles.append(
- {
- "type": trole.get("type"),
- "modifiers": [
- {"value": restr.get("Value"), "type": restr.get("type")}
- for restr in trole.findall("SELRESTRS/SELRESTR")
- ],
- }
- )
+ for trole in vnclass.findall('THEMROLES/THEMROLE'):
+ themroles.append({
+ 'type': trole.get('type'),
+ 'modifiers': [{'value': restr.get('Value'), 'type': restr.get('type')}
+ for restr in trole.findall('SELRESTRS/SELRESTR')]
+ })
return themroles
######################################################################
"""
Initialize the indexes ``_lemma_to_class``,
``_wordnet_to_class``, and ``_class_to_fileid`` by scanning
- through the corpus fileids. This is fast if ElementTree
- uses the C implementation (<0.1 secs), but quite slow (>10 secs)
- if only the python implementation is available.
+ through the corpus fileids. This is fast with cElementTree
+ (<0.1 secs), but quite slow (>10 secs) with the python
+ implementation of ElementTree.
"""
for fileid in self._fileids:
self._index_helper(self.xml(fileid), fileid)
def _index_helper(self, xmltree, fileid):
"""Helper for ``_index()``"""
- vnclass = xmltree.get("ID")
+ vnclass = xmltree.get('ID')
self._class_to_fileid[vnclass] = fileid
self._shortid_to_longid[self.shortid(vnclass)] = vnclass
- for member in xmltree.findall("MEMBERS/MEMBER"):
- self._lemma_to_class[member.get("name")].append(vnclass)
- for wn in member.get("wn", "").split():
+ for member in xmltree.findall('MEMBERS/MEMBER'):
+ self._lemma_to_class[member.get('name')].append(vnclass)
+ for wn in member.get('wn', '').split():
self._wordnet_to_class[wn].append(vnclass)
- for subclass in xmltree.findall("SUBCLASSES/VNSUBCLASS"):
+ for subclass in xmltree.findall('SUBCLASSES/VNSUBCLASS'):
self._index_helper(subclass, fileid)
def _quick_index(self):
through the corpus fileids. This doesn't do proper xml parsing,
but is good enough to find everything in the standard VerbNet
corpus -- and it runs about 30 times faster than xml parsing
- (with the python ElementTree; only 2-3 times faster
- if ElementTree uses the C implementation).
+ (with the python ElementTree; only 2-3 times faster with
+ cElementTree).
"""
# nb: if we got rid of wordnet_to_class, this would run 2-3
# times faster.
vnclass = groups[2] # for <MEMBER> elts.
self._shortid_to_longid[self.shortid(vnclass)] = vnclass
else:
- assert False, "unexpected match condition"
+ assert False, 'unexpected match condition'
######################################################################
# { Identifier conversion
def longid(self, shortid):
"""Returns longid of a VerbNet class
-
+
Given a short VerbNet class identifier (e.g. '37.10'), map it
to a long id (e.g. 'confess-37.10'). If ``shortid`` is already a
long id, then return it as-is"""
if self._LONGID_RE.match(shortid):
return shortid # it's already a longid.
elif not self._SHORTID_RE.match(shortid):
- raise ValueError("vnclass identifier %r not found" % shortid)
+ raise ValueError('vnclass identifier %r not found' % shortid)
try:
return self._shortid_to_longid[shortid]
except KeyError:
- raise ValueError("vnclass identifier %r not found" % shortid)
+ raise ValueError('vnclass identifier %r not found' % shortid)
def shortid(self, longid):
"""Returns shortid of a VerbNet class
-
+
Given a long VerbNet class identifier (e.g. 'confess-37.10'),
map it to a short id (e.g. '37.10'). If ``longid`` is already a
short id, then return it as-is."""
if m:
return m.group(2)
else:
- raise ValueError("vnclass identifier %r not found" % longid)
+ raise ValueError('vnclass identifier %r not found' % longid)
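Usage sketch of the two identifier conversions (assumes the 'verbnet'
corpus data has been downloaded):
>>> from nltk.corpus import verbnet # doctest: +SKIP
>>> verbnet.longid('37.10') # doctest: +SKIP
'confess-37.10'
>>> verbnet.shortid('confess-37.10') # doctest: +SKIP
'37.10'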
######################################################################
# { Frame access utility functions
def _get_semantics_within_frame(self, vnframe):
"""Returns semantics within a single frame
-
+
A utility function to retrieve semantics within a frame in VerbNet
Members of the semantics dictionary:
- 1) Predicate value
+ 1) Predicate value
2) Arguments
-
+
:param vnframe: An ElementTree containing the xml contents of
a VerbNet frame.
:return: semantics: semantics dictionary
"""
semantics_within_single_frame = []
- for pred in vnframe.findall("SEMANTICS/PRED"):
- arguments = [
- {"type": arg.get("type"), "value": arg.get("value")}
- for arg in pred.findall("ARGS/ARG")
- ]
- semantics_within_single_frame.append(
- {"predicate_value": pred.get("value"), "arguments": arguments}
- )
+ for pred in vnframe.findall('SEMANTICS/PRED'):
+ arguments = [{'type': arg.get('type'), 'value': arg.get('value')}
+ for arg in pred.findall('ARGS/ARG')]
+ semantics_within_single_frame.append({
+ 'predicate_value': pred.get('value'),
+ 'arguments': arguments
+ })
return semantics_within_single_frame
def _get_example_within_frame(self, vnframe):
"""Returns example within a frame
-
+
A utility function to retrieve an example within a frame in VerbNet.
-
+
:param vnframe: An ElementTree containing the xml contents of
a VerbNet frame.
:return: example_text: The example sentence for this particular frame
"""
- example_element = vnframe.find("EXAMPLES/EXAMPLE")
+ example_element = vnframe.find('EXAMPLES/EXAMPLE')
if example_element is not None:
example_text = example_element.text
else:
def _get_description_within_frame(self, vnframe):
"""Returns member description within frame
-
+
A utility function to retrieve a description of participating members
within a frame in VerbNet.
-
+
:param vnframe: An ElementTree containing the xml contents of
a VerbNet frame.
- :return: description: a description dictionary with members - primary and secondary
+ :return: description: a description dictionary with members - primary and secondary
"""
- description_element = vnframe.find("DESCRIPTION")
+ description_element = vnframe.find('DESCRIPTION')
return {
- "primary": description_element.attrib["primary"],
- "secondary": description_element.get("secondary", ""),
+ 'primary': description_element.attrib['primary'],
+ 'secondary': description_element.get('secondary', '')
}
def _get_syntactic_list_within_frame(self, vnframe):
"""Returns semantics within a frame
-
+
A utility function to retrieve the syntactic structure within a frame in VerbNet.
Members of the syntactic dictionary:
1) POS Tag
2) Modifiers
-
+
:param vnframe: An ElementTree containing the xml contents of
a VerbNet frame.
:return: syntax_within_single_frame
"""
syntax_within_single_frame = []
- for elt in vnframe.find("SYNTAX"):
+ for elt in vnframe.find('SYNTAX'):
pos_tag = elt.tag
modifiers = dict()
- modifiers["value"] = elt.get("value") if "value" in elt.attrib else ""
- modifiers["selrestrs"] = [
- {"value": restr.get("Value"), "type": restr.get("type")}
- for restr in elt.findall("SELRESTRS/SELRESTR")
- ]
- modifiers["synrestrs"] = [
- {"value": restr.get("Value"), "type": restr.get("type")}
- for restr in elt.findall("SYNRESTRS/SYNRESTR")
- ]
- syntax_within_single_frame.append(
- {"pos_tag": pos_tag, "modifiers": modifiers}
- )
+ modifiers['value'] = elt.get('value') if 'value' in elt.attrib else ""
+ modifiers['selrestrs'] = [{'value': restr.get('Value'), 'type': restr.get('type')}
+ for restr in elt.findall('SELRESTRS/SELRESTR')]
+ modifiers['synrestrs'] = [{'value': restr.get('Value'), 'type': restr.get('type')}
+ for restr in elt.findall('SYNRESTRS/SYNRESTR')]
+ syntax_within_single_frame.append({
+ 'pos_tag': pos_tag,
+ 'modifiers': modifiers
+ })
return syntax_within_single_frame
######################################################################
def pprint(self, vnclass):
"""Returns pretty printed version of a VerbNet class
-
+
Return a string containing a pretty-printed representation of
the given VerbNet class.
:param vnclass: A VerbNet class identifier; or an ElementTree
containing the xml contents of a VerbNet class.
"""
- if isinstance(vnclass, str):
+ if isinstance(vnclass, string_types):
vnclass = self.vnclass(vnclass)
- s = vnclass.get("ID") + "\n"
- s += self.pprint_subclasses(vnclass, indent=" ") + "\n"
- s += self.pprint_members(vnclass, indent=" ") + "\n"
- s += " Thematic roles:\n"
- s += self.pprint_themroles(vnclass, indent=" ") + "\n"
- s += " Frames:\n"
- s += self.pprint_frames(vnclass, indent=" ")
+ s = vnclass.get('ID') + '\n'
+ s += self.pprint_subclasses(vnclass, indent=' ') + '\n'
+ s += self.pprint_members(vnclass, indent=' ') + '\n'
+ s += ' Thematic roles:\n'
+ s += self.pprint_themroles(vnclass, indent=' ') + '\n'
+ s += ' Frames:\n'
+ s += self.pprint_frames(vnclass, indent=' ')
return s
- def pprint_subclasses(self, vnclass, indent=""):
+ def pprint_subclasses(self, vnclass, indent=''):
"""Returns pretty printed version of subclasses of VerbNet class
-
+
Return a string containing a pretty-printed representation of
the given VerbNet class's subclasses.
:param vnclass: A VerbNet class identifier; or an ElementTree
containing the xml contents of a VerbNet class.
"""
- if isinstance(vnclass, str):
+ if isinstance(vnclass, string_types):
vnclass = self.vnclass(vnclass)
subclasses = self.subclasses(vnclass)
- if not subclasses:
- subclasses = ["(none)"]
- s = "Subclasses: " + " ".join(subclasses)
- return textwrap.fill(
- s, 70, initial_indent=indent, subsequent_indent=indent + " "
- )
-
- def pprint_members(self, vnclass, indent=""):
- """Returns pretty printed version of members in a VerbNet class
+ if not subclasses: subclasses = ['(none)']
+ s = 'Subclasses: ' + ' '.join(subclasses)
+ return textwrap.fill(s, 70, initial_indent=indent,
+ subsequent_indent=indent + ' ')
+ def pprint_members(self, vnclass, indent=''):
+ """Returns pretty printed version of members in a VerbNet class
+
Return a string containing a pretty-printed representation of
the given VerbNet class's member verbs.
:param vnclass: A VerbNet class identifier; or an ElementTree
containing the xml contents of a VerbNet class.
"""
- if isinstance(vnclass, str):
+ if isinstance(vnclass, string_types):
vnclass = self.vnclass(vnclass)
members = self.lemmas(vnclass)
if not members:
- members = ["(none)"]
- s = "Members: " + " ".join(members)
- return textwrap.fill(
- s, 70, initial_indent=indent, subsequent_indent=indent + " "
- )
+ members = ['(none)']
+ s = 'Members: ' + ' '.join(members)
+ return textwrap.fill(s, 70, initial_indent=indent,
+ subsequent_indent=indent + ' ')
- def pprint_themroles(self, vnclass, indent=""):
+ def pprint_themroles(self, vnclass, indent=''):
"""Returns pretty printed version of thematic roles in a VerbNet class
-
+
Return a string containing a pretty-printed representation of
the given VerbNet class's thematic roles.
:param vnclass: A VerbNet class identifier; or an ElementTree
containing the xml contents of a VerbNet class.
"""
- if isinstance(vnclass, str):
+ if isinstance(vnclass, string_types):
vnclass = self.vnclass(vnclass)
pieces = []
for themrole in self.themroles(vnclass):
- piece = indent + "* " + themrole.get("type")
- modifiers = [
- modifier["value"] + modifier["type"]
- for modifier in themrole["modifiers"]
- ]
+ piece = indent + '* ' + themrole.get('type')
+ modifiers = [modifier['value'] + modifier['type']
+ for modifier in themrole['modifiers']]
if modifiers:
- piece += "[{}]".format(" ".join(modifiers))
+ piece += '[{}]'.format(' '.join(modifiers))
pieces.append(piece)
- return "\n".join(pieces)
+ return '\n'.join(pieces)
- def pprint_frames(self, vnclass, indent=""):
+ def pprint_frames(self, vnclass, indent=''):
"""Returns pretty version of all frames in a VerbNet class
-
+
Return a string containing a pretty-printed representation of
the list of frames within the VerbNet class.
:param vnclass: A VerbNet class identifier; or an ElementTree
containing the xml contents of a VerbNet class.
"""
- if isinstance(vnclass, str):
+ if isinstance(vnclass, string_types):
vnclass = self.vnclass(vnclass)
pieces = []
for vnframe in self.frames(vnclass):
pieces.append(self._pprint_single_frame(vnframe, indent))
- return "\n".join(pieces)
+ return '\n'.join(pieces)
- def _pprint_single_frame(self, vnframe, indent=""):
+ def _pprint_single_frame(self, vnframe, indent=''):
"""Returns pretty printed version of a single frame in a VerbNet class
-
+
Returns a string containing a pretty-printed representation of
the given frame.
-
+
:param vnframe: An ElementTree containing the xml contents of
a VerbNet frame.
"""
- frame_string = self._pprint_description_within_frame(vnframe, indent) + "\n"
- frame_string += self._pprint_example_within_frame(vnframe, indent + " ") + "\n"
- frame_string += (
- self._pprint_syntax_within_frame(vnframe, indent + " Syntax: ") + "\n"
- )
- frame_string += indent + " Semantics:\n"
- frame_string += self._pprint_semantics_within_frame(vnframe, indent + " ")
+ frame_string = self._pprint_description_within_frame(vnframe, indent) + '\n'
+ frame_string += self._pprint_example_within_frame(vnframe, indent + ' ') + '\n'
+ frame_string += self._pprint_syntax_within_frame(vnframe, indent + ' Syntax: ') + '\n'
+ frame_string += indent + ' Semantics:\n'
+ frame_string += self._pprint_semantics_within_frame(vnframe, indent + ' ')
return frame_string
- def _pprint_example_within_frame(self, vnframe, indent=""):
+ def _pprint_example_within_frame(self, vnframe, indent=''):
"""Returns pretty printed version of example within frame in a VerbNet class
-
+
Return a string containing a pretty-printed representation of
the given VerbNet frame example.
:param vnframe: An ElementTree containing the xml contents of
a VerbNet frame.
"""
- if vnframe["example"]:
- return indent + " Example: " + vnframe["example"]
+ if vnframe['example']:
+ return indent + ' Example: ' + vnframe['example']
- def _pprint_description_within_frame(self, vnframe, indent=""):
+ def _pprint_description_within_frame(self, vnframe, indent=''):
"""Returns pretty printed version of a VerbNet frame description
-
+
Return a string containing a pretty-printed representation of
the given VerbNet frame description.
:param vnframe: An ElementTree containing the xml contents of
a VerbNet frame.
"""
- description = indent + vnframe["description"]["primary"]
- if vnframe["description"]["secondary"]:
- description += " ({})".format(vnframe["description"]["secondary"])
+ description = indent + vnframe['description']['primary']
+ if vnframe['description']['secondary']:
+ description += ' ({})'.format(vnframe['description']['secondary'])
return description
- def _pprint_syntax_within_frame(self, vnframe, indent=""):
- """Returns pretty printed version of syntax within a frame in a VerbNet class
-
+ def _pprint_syntax_within_frame(self, vnframe, indent=''):
+ """Returns pretty printed version of syntax within a frame in a VerbNet class
+
Return a string containing a pretty-printed representation of
the given VerbNet frame syntax.
a VerbNet frame.
"""
pieces = []
- for element in vnframe["syntax"]:
- piece = element["pos_tag"]
+ for element in vnframe['syntax']:
+ piece = element['pos_tag']
modifier_list = []
- if "value" in element["modifiers"] and element["modifiers"]["value"]:
- modifier_list.append(element["modifiers"]["value"])
- modifier_list += [
- "{}{}".format(restr["value"], restr["type"])
- for restr in (
- element["modifiers"]["selrestrs"]
- + element["modifiers"]["synrestrs"]
- )
- ]
+ if 'value' in element['modifiers'] and element['modifiers']['value']:
+ modifier_list.append(element['modifiers']['value'])
+ modifier_list += ['{}{}'.format(restr['value'], restr['type'])
+ for restr in (element['modifiers']['selrestrs'] +
+ element['modifiers']['synrestrs'])]
if modifier_list:
- piece += "[{}]".format(" ".join(modifier_list))
+ piece += '[{}]'.format(' '.join(modifier_list))
pieces.append(piece)
- return indent + " ".join(pieces)
+ return indent + ' '.join(pieces)
- def _pprint_semantics_within_frame(self, vnframe, indent=""):
+ def _pprint_semantics_within_frame(self, vnframe, indent=''):
"""Returns a pretty printed version of semantics within frame in a VerbNet class
-
+
Return a string containing a pretty-printed representation of
the given VerbNet frame semantics.
a VerbNet frame.
"""
pieces = []
- for predicate in vnframe["semantics"]:
- arguments = [argument["value"] for argument in predicate["arguments"]]
- pieces.append(
- "{}({})".format(predicate["predicate_value"], ", ".join(arguments))
- )
- return "\n".join("{}* {}".format(indent, piece) for piece in pieces)
+ for predicate in vnframe['semantics']:
+ arguments = [argument['value'] for argument in predicate['arguments']]
+ pieces.append('{}({})'.format(predicate['predicate_value'], ', '.join(arguments)))
+ return '\n'.join('{}* {}'.format(indent, piece) for piece in pieces)
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Word List Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from six import string_types
+
from nltk.tokenize import line_tokenize
from nltk.corpus.reader.util import *
"""
List of words, one per line. Blank lines are ignored.
"""
-
- def words(self, fileids=None, ignore_lines_startswith="\n"):
- return [
- line
- for line in line_tokenize(self.raw(fileids))
- if not line.startswith(ignore_lines_startswith)
- ]
+ def words(self, fileids=None, ignore_lines_startswith='\n'):
+ return [line for line in line_tokenize(self.raw(fileids))
+ if not line.startswith(ignore_lines_startswith)]
def raw(self, fileids=None):
- if fileids is None:
- fileids = self._fileids
- elif isinstance(fileids, str):
- fileids = [fileids]
+ if fileids is None: fileids = self._fileids
+ elif isinstance(fileids, string_types): fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
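Usage sketch: the stock stopwords corpus is a WordListCorpusReader
(assumes the 'stopwords' corpus data has been downloaded):
>>> from nltk.corpus import stopwords # doctest: +SKIP
>>> stopwords.words('english')[:3] # doctest: +SKIP
['i', 'me', 'my']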
Moses Machine Translation toolkit. These lists are used in the Python port
of the Moses word tokenizer.
"""
-
- available_langs = {
- "catalan": "ca",
- "czech": "cs",
- "german": "de",
- "greek": "el",
- "english": "en",
- "spanish": "es",
- "finnish": "fi",
- "french": "fr",
- "hungarian": "hu",
- "icelandic": "is",
- "italian": "it",
- "latvian": "lv",
- "dutch": "nl",
- "polish": "pl",
- "portuguese": "pt",
- "romanian": "ro",
- "russian": "ru",
- "slovak": "sk",
- "slovenian": "sl",
- "swedish": "sv",
- "tamil": "ta",
- }
+ available_langs = {'catalan': 'ca', 'czech': 'cs', 'german': 'de',
+ 'greek': 'el', 'english': 'en', 'spanish': 'es',
+ 'finnish': 'fi', 'french': 'fr', 'hungarian': 'hu',
+ 'icelandic': 'is', 'italian': 'it', 'latvian': 'lv',
+ 'dutch': 'nl', 'polish': 'pl', 'portuguese': 'pt',
+ 'romanian': 'ro', 'russian': 'ru', 'slovak': 'sk',
+ 'slovenian': 'sl', 'swedish': 'sv', 'tamil': 'ta'}
# Also, add the lang IDs as the keys.
- available_langs.update({v: v for v in available_langs.values()})
+ available_langs.update({v:v for v in available_langs.values()})
- def words(self, lang=None, fileids=None, ignore_lines_startswith="#"):
+ def words(self, lang=None, fileids=None, ignore_lines_startswith='#'):
"""
Return a list of nonbreaking prefixes for the specified
language(s).
# all languages when fileids==None.
if lang in self.available_langs:
lang = self.available_langs[lang]
- fileids = ["nonbreaking_prefix." + lang]
- return [
- line
- for line in line_tokenize(self.raw(fileids))
- if not line.startswith(ignore_lines_startswith)
- ]
-
+ fileids = ['nonbreaking_prefix.'+lang]
+ return [line for line in line_tokenize(self.raw(fileids))
+ if not line.startswith(ignore_lines_startswith)]
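Usage sketch (assumes the 'nonbreaking_prefixes' corpus data has been
downloaded); both full names and ISO codes work as the lang argument:
>>> from nltk.corpus import nonbreaking_prefixes as nbp # doctest: +SKIP
>>> nbp.words('en')[:5] # doctest: +SKIP
['A', 'B', 'C', 'D', 'E']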
class UnicharsCorpusReader(WordListCorpusReader):
"""
The files in perluniprops.zip are extracted using the Unicode::Tussle
module from http://search.cpan.org/~bdfoy/Unicode-Tussle-1.11/lib/Unicode/Tussle.pm
"""
-
# These are categories similar to the Perl Unicode Properties
- available_categories = [
- "Close_Punctuation",
- "Currency_Symbol",
- "IsAlnum",
- "IsAlpha",
- "IsLower",
- "IsN",
- "IsSc",
- "IsSo",
- "IsUpper",
- "Line_Separator",
- "Number",
- "Open_Punctuation",
- "Punctuation",
- "Separator",
- "Symbol",
- ]
+ available_categories = ['Close_Punctuation', 'Currency_Symbol',
+ 'IsAlnum', 'IsAlpha', 'IsLower', 'IsN', 'IsSc',
+ 'IsSo', 'IsUpper', 'Line_Separator', 'Number',
+ 'Open_Punctuation', 'Punctuation', 'Separator',
+ 'Symbol']
def chars(self, category=None, fileids=None):
"""
:return: a list of characters given the specific unicode character category
"""
if category in self.available_categories:
- fileids = [category + ".txt"]
+ fileids = [category+'.txt']
return list(self.raw(fileids).strip())
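Usage sketch (assumes the 'perluniprops' corpus data has been downloaded):
>>> from nltk.corpus import perluniprops as pup # doctest: +SKIP
>>> pup.chars('Open_Punctuation')[:3] # doctest: +SKIP
['(', '[', '{']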
:return: a list of tuples of similar lexical terms.
"""
-
- mwa_ppdb_xxxl_file = "ppdb-1.0-xxxl-lexical.extended.synonyms.uniquepairs"
-
+ mwa_ppdb_xxxl_file = 'ppdb-1.0-xxxl-lexical.extended.synonyms.uniquepairs'
def entries(self, fileids=mwa_ppdb_xxxl_file):
"""
:return: a tuple of synonym word pairs.
"""
- return [tuple(line.split("\t")) for line in line_tokenize(self.raw(fileids))]
+ return [tuple(line.split('\t')) for line in line_tokenize(self.raw(fileids))]
# -*- coding: utf-8 -*-
# Natural Language Toolkit: WordNet
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bethard <Steven.Bethard@colorado.edu>
# Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
"""
+from __future__ import print_function, unicode_literals
+
import math
import re
from itertools import islice, chain
from operator import itemgetter
from collections import defaultdict, deque
+from six import iteritems
+from six.moves import range
+
from nltk.corpus.reader import CorpusReader
from nltk.util import binary_search_file as _binary_search_file
from nltk.probability import FreqDist
+from nltk.compat import python_2_unicode_compatible
from nltk.internals import deprecated
######################################################################
_INF = 1e300
# { Part-of-speech constants
-ADJ, ADJ_SAT, ADV, NOUN, VERB = "a", "s", "r", "n", "v"
+ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'
# }
POS_LIST = [NOUN, VERB, ADJ, ADV]
"Somebody %s INFINITIVE",
"Somebody %s VERB-ing",
"It %s that CLAUSE",
- "Something %s INFINITIVE",
-)
+ "Something %s INFINITIVE")
-SENSENUM_RE = re.compile(r"\.[\d]+\.")
+SENSENUM_RE = re.compile(r'\.[\d]+\.')
######################################################################
"""A common base class for lemmas and synsets."""
def hypernyms(self):
- return self._related("@")
+ return self._related('@')
def _hypernyms(self):
- return self._related("@")
+ return self._related('@')
def instance_hypernyms(self):
- return self._related("@i")
+ return self._related('@i')
def _instance_hypernyms(self):
- return self._related("@i")
+ return self._related('@i')
def hyponyms(self):
- return self._related("~")
+ return self._related('~')
def instance_hyponyms(self):
- return self._related("~i")
+ return self._related('~i')
def member_holonyms(self):
- return self._related("#m")
+ return self._related('#m')
def substance_holonyms(self):
- return self._related("#s")
+ return self._related('#s')
def part_holonyms(self):
- return self._related("#p")
+ return self._related('#p')
def member_meronyms(self):
- return self._related("%m")
+ return self._related('%m')
def substance_meronyms(self):
- return self._related("%s")
+ return self._related('%s')
def part_meronyms(self):
- return self._related("%p")
+ return self._related('%p')
def topic_domains(self):
- return self._related(";c")
-
- def in_topic_domains(self):
- return self._related("-c")
+ return self._related(';c')
def region_domains(self):
- return self._related(";r")
-
- def in_region_domains(self):
- return self._related("-r")
+ return self._related(';r')
def usage_domains(self):
- return self._related(";u")
-
- def in_usage_domains(self):
- return self._related("-u")
+ return self._related(';u')
def attributes(self):
- return self._related("=")
+ return self._related('=')
def entailments(self):
- return self._related("*")
+ return self._related('*')
def causes(self):
- return self._related(">")
+ return self._related('>')
def also_sees(self):
- return self._related("^")
+ return self._related('^')
def verb_groups(self):
- return self._related("$")
+ return self._related('$')
def similar_tos(self):
- return self._related("&")
+ return self._related('&')
def __hash__(self):
return hash(self._name)
return self._name < other._name
+@python_2_unicode_compatible
class Lemma(_WordNetObject):
"""
The lexical entry for a single morphological form of a
'salt.n.03' has the Lemmas 'salt.n.03.salt', 'salt.n.03.saltiness' and
'salt.n.03.salinity'.
- Lemma attributes, accessible via methods with the same name:
+ Lemma attributes, accessible via methods with the same name::
- name: The canonical name of this lemma.
- synset: The synset that this lemma belongs to.
- syntactic_marker: For adjectives, the WordNet string identifying the
syntactic position relative to the modified noun. See:
- https://wordnet.princeton.edu/documentation/wninput5wn
+ http://wordnet.princeton.edu/man/wninput.5WN.html#sect10
For all other parts of speech, this attribute is None.
- count: The frequency of this lemma in wordnet.
Lemmas have the following methods for retrieving related Lemmas. They
correspond to the names for the pointer symbols defined here:
- https://wordnet.princeton.edu/documentation/wninput5wn
+ http://wordnet.princeton.edu/man/wninput.5WN.html#sect3
These methods all return lists of Lemmas:
- antonyms
- pertainyms
"""
- __slots__ = [
- "_wordnet_corpus_reader",
- "_name",
- "_syntactic_marker",
- "_synset",
- "_frame_strings",
- "_frame_ids",
- "_lexname_index",
- "_lex_id",
- "_lang",
- "_key",
- ]
-
- def __init__(
- self,
- wordnet_corpus_reader,
- synset,
- name,
- lexname_index,
- lex_id,
- syntactic_marker,
- ):
+ __slots__ = ['_wordnet_corpus_reader', '_name', '_syntactic_marker',
+ '_synset', '_frame_strings', '_frame_ids',
+ '_lexname_index', '_lex_id', '_lang', '_key']
+
+ def __init__(self, wordnet_corpus_reader, synset, name,
+ lexname_index, lex_id, syntactic_marker):
self._wordnet_corpus_reader = wordnet_corpus_reader
self._name = name
self._syntactic_marker = syntactic_marker
self._frame_ids = []
self._lexname_index = lexname_index
self._lex_id = lex_id
- self._lang = "eng"
+ self._lang = 'eng'
self._key = None # gets set later.
def _related(self, relation_symbol):
get_synset = self._wordnet_corpus_reader.synset_from_pos_and_offset
- if (self._name, relation_symbol) not in self._synset._lemma_pointers:
- return []
- return [
+ return sorted([
get_synset(pos, offset)._lemmas[lemma_index]
- for pos, offset, lemma_index in self._synset._lemma_pointers[
- self._name, relation_symbol
- ]
- ]
+ for pos, offset, lemma_index
+ in self._synset._lemma_pointers[self._name, relation_symbol]
+ ])
def count(self):
"""Return the frequency count for this Lemma"""
return self._wordnet_corpus_reader.lemma_count(self)
def antonyms(self):
- return self._related("!")
+ return self._related('!')
def derivationally_related_forms(self):
- return self._related("+")
+ return self._related('+')
def pertainyms(self):
- return self._related("\\")
+ return self._related('\\')
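Usage sketch of lemma-level pointers (assumes the 'wordnet' corpus data
has been downloaded):
>>> from nltk.corpus import wordnet as wn # doctest: +SKIP
>>> wn.lemma('good.a.01.good').antonyms() # doctest: +SKIP
[Lemma('bad.a.01.bad')]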
+@python_2_unicode_compatible
class Synset(_WordNetObject):
"""Create a Synset from a "<lemma>.<pos>.<number>" string where:
<lemma> is the word's morphological stem
Synsets have the following methods for retrieving related Synsets.
They correspond to the names for the pointer symbols defined here:
- https://wordnet.princeton.edu/documentation/wninput5wn
+ http://wordnet.princeton.edu/man/wninput.5WN.html#sect3
These methods all return lists of Synsets.
- hypernyms, instance_hypernyms
- pertainyms
"""
- __slots__ = [
- "_pos",
- "_offset",
- "_name",
- "_frame_ids",
- "_lemmas",
- "_lemma_names",
- "_definition",
- "_examples",
- "_lexname",
- "_pointers",
- "_lemma_pointers",
- "_max_depth",
- "_min_depth",
- ]
+ __slots__ = ['_pos', '_offset', '_name', '_frame_ids',
+ '_lemmas', '_lemma_names',
+ '_definition', '_examples', '_lexname',
+ '_pointers', '_lemma_pointers', '_max_depth',
+ '_min_depth']
def __init__(self, wordnet_corpus_reader):
self._wordnet_corpus_reader = wordnet_corpus_reader
self._all_hypernyms = None
self._pointers = defaultdict(set)
- self._lemma_pointers = defaultdict(list)
+ self._lemma_pointers = defaultdict(set)
def pos(self):
return self._pos
def _needs_root(self):
if self._pos == NOUN:
- if self._wordnet_corpus_reader.get_version() == "1.6":
+ if self._wordnet_corpus_reader.get_version() == '1.6':
return True
else:
return False
elif self._pos == VERB:
return True
- def lemma_names(self, lang="eng"):
- """Return all the lemma_names associated with the synset"""
- if lang == "eng":
+ def lemma_names(self, lang='eng'):
+ '''Return all the lemma_names associated with the synset'''
+ if lang == 'eng':
return self._lemma_names
else:
self._wordnet_corpus_reader._load_lang_data(lang)
- i = self._wordnet_corpus_reader.ss2of(self, lang)
+ i = self._wordnet_corpus_reader.ss2of(self)
if i in self._wordnet_corpus_reader._lang_data[lang][0]:
return self._wordnet_corpus_reader._lang_data[lang][0][i]
else:
return []
- def lemmas(self, lang="eng"):
- """Return all the lemma objects associated with the synset"""
- if lang == "eng":
+ def lemmas(self, lang='eng'):
+ '''Return all the lemma objects associated with the synset'''
+ if lang == 'eng':
return self._lemmas
else:
self._wordnet_corpus_reader._load_lang_data(lang)
self._wordnet_corpus_reader,
self,
lem,
- self._wordnet_corpus_reader._lexnames.index(self.lexname()),
+ self._wordnet_corpus_reader._lexnames.index(
+ self.lexname()
+ ),
0,
- None,
+ None
)
temp._lang = lang
lemmark.append(temp)
next_synset = todo.pop()
if next_synset not in seen:
seen.add(next_synset)
- next_hypernyms = (
- next_synset.hypernyms() + next_synset.instance_hypernyms()
- )
+ next_hypernyms = next_synset.hypernyms() + \
+ next_synset.instance_hypernyms()
if not next_hypernyms:
result.append(next_synset)
else:
todo.extend(next_hypernyms)
return result
- # Simpler implementation which makes incorrect assumption that
- # hypernym hierarchy is acyclic:
- #
- # if not self.hypernyms():
- # return [self]
- # else:
- # return list(set(root for h in self.hypernyms()
- # for root in h.root_hypernyms()))
+# Simpler implementation which makes incorrect assumption that
+# hypernym hierarchy is acyclic:
+#
+# if not self.hypernyms():
+# return [self]
+# else:
+# return list(set(root for h in self.hypernyms()
+# for root in h.root_hypernyms()))
def max_depth(self):
"""
:return: The length of the longest hypernym path from this
"""
from nltk.util import breadth_first
-
synset_offsets = []
for synset in breadth_first(self, rel, depth):
if synset._offset != self._offset:
)
return list(self._all_hypernyms.intersection(other._all_hypernyms))
- def lowest_common_hypernyms(self, other, simulate_root=False, use_min_depth=False):
+ def lowest_common_hypernyms(
+ self, other, simulate_root=False, use_min_depth=False
+ ):
"""
Get a list of lowest synset(s) that both synsets have as a hypernym.
When `use_min_depth == False` this means that the synset which appears
synsets = self.common_hypernyms(other)
if simulate_root:
fake_synset = Synset(None)
- fake_synset._name = "*ROOT*"
+ fake_synset._name = '*ROOT*'
fake_synset.hypernyms = lambda: []
fake_synset.instance_hypernyms = lambda: []
synsets.append(fake_synset)
try:
if use_min_depth:
max_depth = max(s.min_depth() for s in synsets)
- unsorted_lch = [s for s in synsets if s.min_depth() == max_depth]
+ unsorted_lch = [
+ s for s in synsets if s.min_depth() == max_depth
+ ]
else:
max_depth = max(s.max_depth() for s in synsets)
- unsorted_lch = [s for s in synsets if s.max_depth() == max_depth]
+ unsorted_lch = [
+ s for s in synsets if s.max_depth() == max_depth
+ ]
return sorted(unsorted_lch)
except ValueError:
return []
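Usage sketch (assumes the 'wordnet' corpus data has been downloaded):
>>> from nltk.corpus import wordnet as wn # doctest: +SKIP
>>> wn.synset('dog.n.01').lowest_common_hypernyms(wn.synset('cat.n.01')) # doctest: +SKIP
[Synset('carnivore.n.01')]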
"""
distances = set([(self, distance)])
for hypernym in self._hypernyms() + self._instance_hypernyms():
- distances |= hypernym.hypernym_distances(distance + 1, simulate_root=False)
+ distances |= hypernym.hypernym_distances(
+ distance+1,
+ simulate_root=False
+ )
if simulate_root:
fake_synset = Synset(None)
- fake_synset._name = "*ROOT*"
+ fake_synset._name = '*ROOT*'
fake_synset_distance = max(distances, key=itemgetter(1))[1]
- distances.add((fake_synset, fake_synset_distance + 1))
+ distances.add((fake_synset, fake_synset_distance+1))
return distances
def _shortest_hypernym_paths(self, simulate_root):
- if self._name == "*ROOT*":
+ if self._name == '*ROOT*':
return {self: 0}
queue = deque([(self, 0)])
if simulate_root:
fake_synset = Synset(None)
- fake_synset._name = "*ROOT*"
+ fake_synset._name = '*ROOT*'
path[fake_synset] = max(path.values()) + 1
return path
# For each ancestor synset common to both subject synsets, find the
# connecting path length. Return the shortest of these.
- inf = float("inf")
+ inf = float('inf')
path_distance = inf
- for synset, d1 in dist_dict1.items():
+ for synset, d1 in iteritems(dist_dict1):
d2 = dist_dict2.get(synset, inf)
path_distance = min(path_distance, d1 + d2)
tree = [self]
if depth != 0:
- tree += [x.tree(rel, depth - 1, cut_mark) for x in rel(self)]
+ tree += [x.tree(rel, depth-1, cut_mark) for x in rel(self)]
elif cut_mark:
tree += [cut_mark]
return tree
"""
distance = self.shortest_path_distance(
- other, simulate_root=simulate_root and self._needs_root()
+ other,
+ simulate_root=simulate_root and self._needs_root()
)
if distance is None or distance < 0:
return None
if self._pos != other._pos:
raise WordNetError(
- "Computing the lch similarity requires "
- "%s and %s to have the same part of speech." % (self, other)
+ 'Computing the lch similarity requires '
+ '%s and %s to have the same part of speech.' %
+ (self, other)
)
need_root = self._needs_root()
if self._pos not in self._wordnet_corpus_reader._max_depth:
- self._wordnet_corpus_reader._compute_max_depth(self._pos, need_root)
+ self._wordnet_corpus_reader._compute_max_depth(
+ self._pos, need_root
+ )
depth = self._wordnet_corpus_reader._max_depth[self._pos]
distance = self.shortest_path_distance(
- other, simulate_root=simulate_root and need_root
+ other,
+ simulate_root=simulate_root and need_root
)
if distance is None or distance < 0 or depth == 0:
# It is possible that more accurate results could be obtained by
# removing this setting and it should be tested later on
subsumers = self.lowest_common_hypernyms(
- other, simulate_root=simulate_root and need_root, use_min_depth=True
+ other,
+ simulate_root=simulate_root and need_root, use_min_depth=True
)
# If no LCS was found return None
# subsuming. Add this to the LCS path length to get the path
# length from each synset to the root.
len1 = self.shortest_path_distance(
- subsumer, simulate_root=simulate_root and need_root
+ subsumer,
+ simulate_root=simulate_root and need_root
)
len2 = other.shortest_path_distance(
- subsumer, simulate_root=simulate_root and need_root
+ subsumer,
+ simulate_root=simulate_root and need_root
)
if len1 is None or len2 is None:
return None
for synset in todo:
seen.add(synset)
yield todo
- todo = [
- hypernym
- for synset in todo
- for hypernym in (synset.hypernyms() + synset.instance_hypernyms())
- if hypernym not in seen
- ]
+ todo = [hypernym
+ for synset in todo
+ for hypernym in (
+ synset.hypernyms() + synset.instance_hypernyms()
+ )
+ if hypernym not in seen]
def __repr__(self):
return "%s('%s')" % (type(self).__name__, self._name)
def _related(self, relation_symbol, sort=True):
get_synset = self._wordnet_corpus_reader.synset_from_pos_and_offset
- if relation_symbol not in self._pointers:
- return []
pointer_tuples = self._pointers[relation_symbol]
r = [get_synset(pos, offset) for pos, offset in pointer_tuples]
if sort:
# WordNet Corpus Reader
######################################################################
-
class WordNetCorpusReader(CorpusReader):
"""
A corpus reader used to access wordnet or its variants.
"""
- _ENCODING = "utf8"
+ _ENCODING = 'utf8'
# { Part-of-speech constants
- ADJ, ADJ_SAT, ADV, NOUN, VERB = "a", "s", "r", "n", "v"
+ ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'
# }
# { Filename constants
- _FILEMAP = {ADJ: "adj", ADV: "adv", NOUN: "noun", VERB: "verb"}
+ _FILEMAP = {ADJ: 'adj', ADV: 'adv', NOUN: 'noun', VERB: 'verb'}
# }
# { Part of speech constants
#: A list of file identifiers for all the fileids used by this
#: corpus reader.
- _FILES = (
- "cntlist.rev",
- "lexnames",
- "index.sense",
- "index.adj",
- "index.adv",
- "index.noun",
- "index.verb",
- "data.adj",
- "data.adv",
- "data.noun",
- "data.verb",
- "adj.exc",
- "adv.exc",
- "noun.exc",
- "verb.exc",
- )
+ _FILES = ('cntlist.rev', 'lexnames', 'index.sense',
+ 'index.adj', 'index.adv', 'index.noun', 'index.verb',
+ 'data.adj', 'data.adv', 'data.noun', 'data.verb',
+ 'adj.exc', 'adv.exc', 'noun.exc', 'verb.exc', )
def __init__(self, root, omw_reader):
"""
Construct a new wordnet corpus reader, with the given root
directory.
"""
- super(WordNetCorpusReader, self).__init__(
- root, self._FILES, encoding=self._ENCODING
- )
+ super(WordNetCorpusReader, self).__init__(root, self._FILES,
+ encoding=self._ENCODING)
# A index that provides the file offset
# Map from lemma -> pos -> synset_index -> offset
self._key_synset_file = None
# Load the lexnames
- for i, line in enumerate(self.open("lexnames")):
+ for i, line in enumerate(self.open('lexnames')):
index, lexname, _ = line.split()
assert int(index) == i
self._lexnames.append(lexname)
# load the exception file data into memory
self._load_exception_map()
- # Open Multilingual WordNet functions, contributed by
- # Nasruddin A’aidil Shari, Sim Wei Ying Geraldine, and Soe Lynn
+# Open Multilingual WordNet functions, contributed by
+# Nasruddin A’aidil Shari, Sim Wei Ying Geraldine, and Soe Lynn
def of2ss(self, of):
- """ take an id and return the synsets """
+ ''' take an id and return the corresponding synset '''
return self.synset_from_pos_and_offset(of[-1], int(of[:8]))
- def ss2of(self, ss, lang=None):
- """ return the ID of the synset """
- pos = ss.pos()
- # Only these 3 WordNets retain the satellite pos tag
- if lang not in ["nld", "lit", "slk"] and pos == "s":
- pos = "a"
- return "{:08d}-{}".format(ss.offset(), pos)
+ def ss2of(self, ss):
+ ''' return the ID of the synset '''
+ return ("{:08d}-{}".format(ss.offset(), ss.pos()))
def _load_lang_data(self, lang):
- """ load the wordnet data of the requested language from the file to
- the cache, _lang_data """
+ ''' load the wordnet data of the requested language from the file to
+ the cache, _lang_data '''
if lang in self._lang_data.keys():
return
if lang not in self.langs():
raise WordNetError("Language is not supported.")
- f = self._omw_reader.open("{0:}/wn-data-{0:}.tab".format(lang))
+ f = self._omw_reader.open('{0:}/wn-data-{0:}.tab'.format(lang))
self.custom_lemmas(f, lang)
f.close()
def langs(self):
- """ return a list of languages supported by Multilingual Wordnet """
+ ''' return a list of languages supported by Multilingual Wordnet '''
import os
-
- langs = ["eng"]
+ langs = ['eng']
fileids = self._omw_reader.fileids()
for fileid in fileids:
file_name, file_extension = os.path.splitext(fileid)
- if file_extension == ".tab":
- langs.append(file_name.split("-")[-1])
+ if file_extension == '.tab':
+ langs.append(file_name.split('-')[-1])
return langs
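# A hedged illustration: with the OMW data installed this returns 'eng'
# plus one ISO 639-3 code per bundled .tab file, e.g.
#     >>> 'eng' in wn.langs()
#     True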
for suffix in self._FILEMAP.values():
# parse each line of the file (ignoring comment lines)
- for i, line in enumerate(self.open("index.%s" % suffix)):
- if line.startswith(" "):
+ for i, line in enumerate(self.open('index.%s' % suffix)):
+ if line.startswith(' '):
continue
_iter = iter(line.split())
- def _next_token():
- return next(_iter)
+ def _next_token(): return next(_iter)
try:
_next_token()
# get synset offsets
- synset_offsets = [int(_next_token()) for _ in range(n_synsets)]
+ synset_offsets = [
+ int(_next_token()) for _ in range(n_synsets)
+ ]
# raise more informative error with file name and line number
except (AssertionError, ValueError) as e:
- tup = ("index.%s" % suffix), (i + 1), e
- raise WordNetError("file %s, line %i: %s" % tup)
+ tup = ('index.%s' % suffix), (i + 1), e
+ raise WordNetError('file %s, line %i: %s' % tup)
# map lemmas and parts of speech to synsets
self._lemma_pos_offset_map[lemma][pos] = synset_offsets
# load the exception file data into memory
for pos, suffix in self._FILEMAP.items():
self._exception_map[pos] = {}
- for line in self.open("%s.exc" % suffix):
+ for line in self.open('%s.exc' % suffix):
terms = line.split()
self._exception_map[pos][terms[0]] = terms[1:]
self._exception_map[ADJ_SAT] = self._exception_map[ADJ]
def get_version(self):
fh = self._data_file(ADJ)
for line in fh:
- match = re.search(r"WordNet (\d+\.\d+) Copyright", line)
+ match = re.search(r'WordNet (\d+\.\d+) Copyright', line)
if match is not None:
version = match.group(1)
fh.seek(0)
# Loading Lemmas
#############################################################
- def lemma(self, name, lang="eng"):
- """Return lemma object that matches the name"""
+ def lemma(self, name, lang='eng'):
+ '''Return lemma object that matches the name'''
# cannot simply split on first '.',
# e.g.: '.45_caliber.a.01..45_caliber'
- separator = SENSENUM_RE.search(name).end()
-
- synset_name, lemma_name = name[: separator - 1], name[separator:]
+ separator = SENSENUM_RE.search(name).start()
+ leadingZero = int(name[separator+1]) == 0
+ if (leadingZero):
+ synset_name, lemma_name = name[:separator+3], name[separator+4:]
+ else:
+ synset_name, lemma_name = name[:separator+2], name[separator+3:]
+
synset = self.synset(synset_name)
for lemma in synset.lemmas(lang):
if lemma._name == lemma_name:
return lemma
- raise WordNetError("no lemma %r in %r" % (lemma_name, synset_name))
+ raise WordNetError('no lemma %r in %r' % (lemma_name, synset_name))
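# Illustrative usage (assuming the standard English WordNet data):
#     >>> wn.lemma('dog.n.01.dog')
#     Lemma('dog.n.01.dog')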
def lemma_from_key(self, key):
# Keys are case sensitive and always lower-case
key = key.lower()
- lemma_name, lex_sense = key.split("%")
- pos_number, lexname_index, lex_id, _, _ = lex_sense.split(":")
+ lemma_name, lex_sense = key.split('%')
+ pos_number, lexname_index, lex_id, _, _ = lex_sense.split(':')
pos = self._pos_names[int(pos_number)]
# open the key -> synset file if necessary
if self._key_synset_file is None:
- self._key_synset_file = self.open("index.sense")
+ self._key_synset_file = self.open('index.sense')
# Find the synset for the lemma.
synset_line = _binary_search_file(self._key_synset_file, key)
#############################################################
def synset(self, name):
# split name into lemma, part of speech and synset number
- lemma, pos, synset_index_str = name.lower().rsplit(".", 2)
+ lemma, pos, synset_index_str = name.lower().rsplit('.', 2)
synset_index = int(synset_index_str) - 1
# get the offset for this synset
try:
offset = self._lemma_pos_offset_map[lemma][pos][synset_index]
except KeyError:
- message = "no lemma %r with part of speech %r"
+ message = 'no lemma %r with part of speech %r'
raise WordNetError(message % (lemma, pos))
except IndexError:
n_senses = len(self._lemma_pos_offset_map[lemma][pos])
synset = self.synset_from_pos_and_offset(pos, offset)
# some basic sanity checks on loaded attributes
- if pos == "s" and synset._pos == "a":
- message = (
- "adjective satellite requested but only plain "
- "adjective found for lemma %r"
- )
+ if pos == 's' and synset._pos == 'a':
+ message = ('adjective satellite requested but only plain '
+ 'adjective found for lemma %r')
raise WordNetError(message % lemma)
- assert synset._pos == pos or (pos == "a" and synset._pos == "s")
+ assert synset._pos == pos or (pos == 'a' and synset._pos == 's')
# Return the synset object.
return synset
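# Illustrative usage; a name that cannot be resolved raises WordNetError
# instead:
#     >>> wn.synset('dog.n.01')
#     Synset('dog.n.01')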
if pos == ADJ_SAT:
pos = ADJ
if self._data_file_map.get(pos) is None:
- fileid = "data.%s" % self._FILEMAP[pos]
+ fileid = 'data.%s' % self._FILEMAP[pos]
self._data_file_map[pos] = self.open(fileid)
return self._data_file_map[pos]
self._synset_offset_cache[pos][offset] = synset
return synset
- @deprecated("Use public method synset_from_pos_and_offset() instead")
+ @deprecated('Use public method synset_from_pos_and_offset() instead')
def _synset_from_pos_and_offset(self, *args, **kwargs):
"""
Hack to help people like the readers of
try:
# parse out the definitions and examples from the gloss
- columns_str, gloss = data_file_line.strip().split("|")
- definition = re.sub(r"[\"].*?[\"]", "", gloss).strip()
- examples = re.findall(r'"([^"]*)"', gloss)
- for example in examples:
- synset._examples.append(example)
-
- synset._definition = definition.strip("; ")
+ columns_str, gloss = data_file_line.split('|')
+ gloss = gloss.strip()
+ definitions = []
+ for gloss_part in gloss.split(';'):
+ gloss_part = gloss_part.strip()
+ if gloss_part.startswith('"'):
+ synset._examples.append(gloss_part.strip('"'))
+ else:
+ definitions.append(gloss_part)
+ synset._definition = '; '.join(definitions)
# split the other info into fields
_iter = iter(columns_str.split())
- def _next_token():
- return next(_iter)
+ def _next_token(): return next(_iter)
# get the offset
synset._offset = int(_next_token())
# get the lex_id (used for sense_keys)
lex_id = int(_next_token(), 16)
# If the lemma has a syntactic marker, extract it.
- m = re.match(r"(.*?)(\(.*\))?$", lemma_name)
+ m = re.match(r'(.*?)(\(.*\))?$', lemma_name)
lemma_name, syn_mark = m.groups()
# create the lemma object
- lemma = Lemma(self, synset, lemma_name, lexname_index, lex_id, syn_mark)
+ lemma = Lemma(self, synset, lemma_name, lexname_index,
+ lex_id, syn_mark)
synset._lemmas.append(lemma)
synset._lemma_names.append(lemma._name)
offset = int(_next_token())
pos = _next_token()
lemma_ids_str = _next_token()
- if lemma_ids_str == "0000":
+ if lemma_ids_str == '0000':
synset._pointers[symbol].add((pos, offset))
else:
source_index = int(lemma_ids_str[:2], 16) - 1
source_lemma_name = synset._lemmas[source_index]._name
lemma_pointers = synset._lemma_pointers
tups = lemma_pointers[source_lemma_name, symbol]
- tups.append((pos, offset, target_index))
+ tups.add((pos, offset, target_index))
# read the verb frames
try:
for _ in range(frame_count):
# read the plus sign
plus = _next_token()
- assert plus == "+"
+ assert plus == '+'
# read the frame and lemma number
frame_number = int(_next_token())
frame_string_fmt = VERB_FRAME_STRINGS[frame_number]
synset._frame_ids.append(frame_number)
for lemma in synset._lemmas:
lemma._frame_ids.append(frame_number)
- lemma._frame_strings.append(frame_string_fmt % lemma._name)
+ lemma._frame_strings.append(
+ frame_string_fmt % lemma._name
+ )
# only a specific word in the synset
else:
lemma = synset._lemmas[lemma_number - 1]
lemma._frame_ids.append(frame_number)
- lemma._frame_strings.append(frame_string_fmt % lemma._name)
+ lemma._frame_strings.append(
+ frame_string_fmt % lemma._name
+ )
# raise a more informative error with line text
except ValueError as e:
- raise WordNetError("line %r: %s" % (data_file_line, e))
+ raise WordNetError('line %r: %s' % (data_file_line, e))
# set sense keys for Lemma objects - note that this has to be
# done afterwards so that the relations are available
if synset._pos == ADJ_SAT:
head_lemma = synset.similar_tos()[0]._lemmas[0]
head_name = head_lemma._name
- head_id = "%02d" % head_lemma._lex_id
+ head_id = '%02d' % head_lemma._lex_id
else:
- head_name = head_id = ""
- tup = (
- lemma._name,
- WordNetCorpusReader._pos_numbers[synset._pos],
- lemma._lexname_index,
- lemma._lex_id,
- head_name,
- head_id,
- )
- lemma._key = ("%s%%%d:%02d:%02d:%s:%s" % tup).lower()
+ head_name = head_id = ''
+ tup = (lemma._name, WordNetCorpusReader._pos_numbers[synset._pos],
+ lemma._lexname_index, lemma._lex_id, head_name, head_id)
+ lemma._key = ('%s%%%d:%02d:%02d:%s:%s' % tup).lower()
# the canonical name is based on the first lemma
lemma_name = synset._lemmas[0]._name.lower()
offsets = self._lemma_pos_offset_map[lemma_name][synset._pos]
sense_index = offsets.index(synset._offset)
tup = lemma_name, synset._pos, sense_index + 1
- synset._name = "%s.%s.%02i" % tup
+ synset._name = '%s.%s.%02i' % tup
return synset
- def synset_from_sense_key(self, sense_key):
- """
- Retrieves synset based on a given sense_key. Sense keys can be
- obtained from lemma.key()
-
- From https://wordnet.princeton.edu/documentation/senseidx5wn:
- A sense_key is represented as:
- lemma % lex_sense (e.g. 'dog%1:18:01::')
- where lex_sense is encoded as:
- ss_type:lex_filenum:lex_id:head_word:head_id
-
- lemma: ASCII text of word/collocation, in lower case
- ss_type: synset type for the sense (1 digit int)
- The synset type is encoded as follows:
- 1 NOUN
- 2 VERB
- 3 ADJECTIVE
- 4 ADVERB
- 5 ADJECTIVE SATELLITE
- lex_filenum: name of lexicographer file containing the synset for the sense (2 digit int)
- lex_id: when paired with lemma, uniquely identifies a sense in the lexicographer file (2 digit int)
- head_word: lemma of the first word in satellite's head synset
- Only used if sense is in an adjective satellite synset
- head_id: uniquely identifies sense in a lexicographer file when paired with head_word
- Only used if head_word is present (2 digit int)
- """
- sense_key_regex = re.compile(r"(.*)\%(.*):(.*):(.*):(.*):(.*)")
- synset_types = {1: NOUN, 2: VERB, 3: ADJ, 4: ADV, 5: ADJ_SAT}
- lemma, ss_type, _, lex_id, _, _ = sense_key_regex.match(sense_key).groups()
-
- # check that information extracted from sense_key is valid
- error = None
- if not lemma:
- error = "lemma"
- elif int(ss_type) not in synset_types:
- error = "ss_type"
- elif int(lex_id) < 0 or int(lex_id) > 99:
- error = "lex_id"
- if error:
- raise WordNetError(
- "valid {} could not be extracted from the sense key".format(error)
- )
-
- synset_id = ".".join([lemma, synset_types[int(ss_type)], lex_id])
- return self.synset(synset_id)
-
#############################################################
# Retrieve synsets and lemmas.
#############################################################
- def synsets(self, lemma, pos=None, lang="eng", check_exceptions=True):
+ def synsets(self, lemma, pos=None, lang='eng', check_exceptions=True):
"""Load all synsets with a given lemma and part of speech tag.
If no pos is specified, all synsets for all parts of speech
will be loaded.
"""
lemma = lemma.lower()
- if lang == "eng":
+ if lang == 'eng':
get_synset = self.synset_from_pos_and_offset
index = self._lemma_pos_offset_map
if pos is None:
pos = POS_LIST
- return [
- get_synset(p, offset)
- for p in pos
- for form in self._morphy(lemma, p, check_exceptions)
- for offset in index[form].get(p, [])
- ]
+ return [get_synset(p, offset)
+ for p in pos
+ for form in self._morphy(lemma, p, check_exceptions)
+ for offset in index[form].get(p, [])]
else:
self._load_lang_data(lang)
synset_list = []
- if lemma in self._lang_data[lang][1]:
- for l in self._lang_data[lang][1][lemma]:
- if pos is not None and l[-1] != pos:
- continue
- synset_list.append(self.of2ss(l))
+ for l in self._lang_data[lang][1][lemma]:
+ if pos is not None and l[-1] != pos:
+ continue
+ synset_list.append(self.of2ss(l))
return synset_list
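# Illustrative usage (ordering and contents assume WordNet 3.0):
#     >>> wn.synsets('dog', pos='n')[:2]
#     [Synset('dog.n.01'), Synset('frump.n.01')]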
- def lemmas(self, lemma, pos=None, lang="eng"):
+ def lemmas(self, lemma, pos=None, lang='eng'):
"""Return all Lemma objects with a name matching the specified lemma
name and part of speech tag. Matches any part of speech tag if none is
specified."""
lemma = lemma.lower()
- if lang == "eng":
- return [
- lemma_obj
- for synset in self.synsets(lemma, pos)
- for lemma_obj in synset.lemmas()
- if lemma_obj.name().lower() == lemma
- ]
+ if lang == 'eng':
+ return [lemma_obj
+ for synset in self.synsets(lemma, pos)
+ for lemma_obj in synset.lemmas()
+ if lemma_obj.name().lower() == lemma]
else:
self._load_lang_data(lang)
lemmas.append(lemma_obj)
return lemmas
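# Illustrative usage (assuming WordNet 3.0):
#     >>> wn.lemmas('dog', pos='v')
#     [Lemma('chase.v.01.dog')]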
- def all_lemma_names(self, pos=None, lang="eng"):
+ def all_lemma_names(self, pos=None, lang='eng'):
"""Return all lemma names for all synsets for the given
part of speech tag and language or languages. If pos is
not specified, all synsets for all parts of speech will
be used."""
- if lang == "eng":
+ if lang == 'eng':
if pos is None:
return iter(self._lemma_pos_offset_map)
else:
return (
- lemma
- for lemma in self._lemma_pos_offset_map
+ lemma for lemma in self._lemma_pos_offset_map
if pos in self._lemma_pos_offset_map[lemma]
)
else:
continue
lemma.extend(self._lang_data[lang][0][i])
- lemma = iter(set(lemma))
+ lemma = list(set(lemma))
return lemma
def all_synsets(self, pos=None):
# be moved while we're not looking.
if pos_tag == ADJ_SAT:
pos_tag = ADJ
- fileid = "data.%s" % self._FILEMAP[pos_tag]
+ fileid = 'data.%s' % self._FILEMAP[pos_tag]
data_file = self.open(fileid)
try:
else:
data_file.close()
- def words(self, lang="eng"):
+ def words(self, lang='eng'):
"""return lemmas of the given language as list of words"""
return self.all_lemma_names(lang=lang)
- def license(self, lang="eng"):
+ def license(self, lang='eng'):
"""Return the contents of LICENSE (for omw)
use lang=lang to get the license for an individual language"""
- if lang == "eng":
+ if lang == 'eng':
return self.open("LICENSE").read()
elif lang in self.langs():
return self._omw_reader.open("{}/LICENSE".format(lang)).read()
- elif lang == "omw":
+ elif lang == 'omw':
# under the assumption you don't mean Omwunra-Toqura
return self._omw_reader.open("LICENSE").read()
elif lang in self._lang_data:
- raise WordNetError("Cannot determine license for user-provided tab file")
+ raise WordNetError(
+ "Cannot determine license for user-provided tab file"
+ )
else:
raise WordNetError("Language is not supported.")
- def readme(self, lang="omw"):
+ def readme(self, lang='omw'):
"""Return the contents of README (for omw)
use lang=lang to get the readme for an individual language"""
- if lang == "eng":
+ if lang == 'eng':
return self.open("README").read()
elif lang in self.langs():
return self._omw_reader.open("{}/README".format(lang)).read()
- elif lang == "omw":
+ elif lang == 'omw':
# under the assumption you don't mean Omwunra-Toqura
return self._omw_reader.open("README").read()
elif lang in self._lang_data:
else:
raise WordNetError("Language is not supported.")
- def citation(self, lang="omw"):
+ def citation(self, lang='omw'):
"""Return the contents of citation.bib file (for omw)
use lang=lang to get the citation for an individual language"""
- if lang == "eng":
+ if lang == 'eng':
return self.open("citation.bib").read()
elif lang in self.langs():
return self._omw_reader.open("{}/citation.bib".format(lang)).read()
- elif lang == "omw":
+ elif lang == 'omw':
# under the assumption you don't mean Omwunra-Toqura
return self._omw_reader.open("citation.bib").read()
elif lang in self._lang_data:
def lemma_count(self, lemma):
"""Return the frequency count for this Lemma"""
# Currently, counts only work for English
- if lemma._lang != "eng":
+ if lemma._lang != 'eng':
return 0
# open the count file if we haven't already
if self._key_count_file is None:
- self._key_count_file = self.open("cntlist.rev")
+ self._key_count_file = self.open('cntlist.rev')
# find the key in the counts file and return the count
line = _binary_search_file(self._key_count_file, lemma._key)
if line:
- return int(line.rsplit(" ", 1)[-1])
+ return int(line.rsplit(' ', 1)[-1])
else:
return 0
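# Illustrative usage; the exact count depends on the installed
# cntlist.rev, so this only sketches the expected behaviour:
#     >>> wn.lemma_count(wn.lemma('dog.n.01.dog')) > 0
#     True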
- def path_similarity(self, synset1, synset2, verbose=False, simulate_root=True):
+ def path_similarity(
+ self, synset1, synset2, verbose=False, simulate_root=True
+ ):
return synset1.path_similarity(synset2, verbose, simulate_root)
-
path_similarity.__doc__ = Synset.path_similarity.__doc__
- def lch_similarity(self, synset1, synset2, verbose=False, simulate_root=True):
+ def lch_similarity(
+ self, synset1, synset2, verbose=False, simulate_root=True
+ ):
return synset1.lch_similarity(synset2, verbose, simulate_root)
-
lch_similarity.__doc__ = Synset.lch_similarity.__doc__
- def wup_similarity(self, synset1, synset2, verbose=False, simulate_root=True):
+ def wup_similarity(
+ self, synset1, synset2, verbose=False, simulate_root=True
+ ):
return synset1.wup_similarity(synset2, verbose, simulate_root)
-
wup_similarity.__doc__ = Synset.wup_similarity.__doc__
def res_similarity(self, synset1, synset2, ic, verbose=False):
return synset1.res_similarity(synset2, ic, verbose)
-
res_similarity.__doc__ = Synset.res_similarity.__doc__
def jcn_similarity(self, synset1, synset2, ic, verbose=False):
return synset1.jcn_similarity(synset2, ic, verbose)
-
jcn_similarity.__doc__ = Synset.jcn_similarity.__doc__
def lin_similarity(self, synset1, synset2, ic, verbose=False):
return synset1.lin_similarity(synset2, ic, verbose)
-
lin_similarity.__doc__ = Synset.lin_similarity.__doc__
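# These wrappers just delegate to the corresponding Synset methods; an
# illustrative value (assuming WordNet 3.0):
#     >>> wn.path_similarity(wn.synset('dog.n.01'), wn.synset('cat.n.01'))
#     0.2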
#############################################################
return None
MORPHOLOGICAL_SUBSTITUTIONS = {
- NOUN: [
- ("s", ""),
- ("ses", "s"),
- ("ves", "f"),
- ("xes", "x"),
- ("zes", "z"),
- ("ches", "ch"),
- ("shes", "sh"),
- ("men", "man"),
- ("ies", "y"),
- ],
- VERB: [
- ("s", ""),
- ("ies", "y"),
- ("es", "e"),
- ("es", ""),
- ("ed", "e"),
- ("ed", ""),
- ("ing", "e"),
- ("ing", ""),
- ],
- ADJ: [("er", ""), ("est", ""), ("er", "e"), ("est", "e")],
- ADV: [],
- }
+ NOUN: [('s', ''), ('ses', 's'), ('ves', 'f'), ('xes', 'x'),
+ ('zes', 'z'), ('ches', 'ch'), ('shes', 'sh'),
+ ('men', 'man'), ('ies', 'y')],
+ VERB: [('s', ''), ('ies', 'y'), ('es', 'e'), ('es', ''),
+ ('ed', 'e'), ('ed', ''), ('ing', 'e'), ('ing', '')],
+ ADJ: [('er', ''), ('est', ''), ('er', 'e'), ('est', 'e')],
+ ADV: []}
MORPHOLOGICAL_SUBSTITUTIONS[ADJ_SAT] = MORPHOLOGICAL_SUBSTITUTIONS[ADJ]
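# Each (old, new) pair strips suffix `old` and appends `new`; for a NOUN
# like 'churches' this proposes 'churche' (via 's' -> '') and 'church'
# (via 'ches' -> 'ch'), and only candidates actually present in the
# lexicon survive the later filter_forms() step.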
substitutions = self.MORPHOLOGICAL_SUBSTITUTIONS[pos]
def apply_rules(forms):
- return [
- form[: -len(old)] + new
- for form in forms
- for old, new in substitutions
- if form.endswith(old)
- ]
+ return [form[:-len(old)] + new
+ for form in forms
+ for old, new in substitutions
+ if form.endswith(old)]
def filter_forms(forms):
result = []
:param lang: ISO 639-3 code of the language of the tab file
"""
if len(lang) != 3:
- raise ValueError("lang should be a (3 character) ISO 639-3 code")
+ raise ValueError('lang should be a (3 character) ISO 639-3 code')
self._lang_data[lang] = [defaultdict(list), defaultdict(list)]
- for line in tab_file.readlines():
- if isinstance(line, bytes):
+ for l in tab_file.readlines():
+ if isinstance(l, bytes):
# Support byte-stream files (e.g. as returned by Python 2's
# open() function) as well as text-stream ones
- line = line.decode("utf-8")
- if not line.startswith("#"):
- offset_pos, lemma_type, lemma = line.strip().split("\t")
- lemma = lemma.strip().replace(" ", "_")
- self._lang_data[lang][0][offset_pos].append(lemma)
- self._lang_data[lang][1][lemma.lower()].append(offset_pos)
- # Make sure no more entries are accidentally added subsequently
- self._lang_data[lang][0].default_factory = None
- self._lang_data[lang][1].default_factory = None
+ l = l.decode('utf-8')
+ l = l.replace('\n', '')
+ l = l.replace(' ', '_')
+ if l[0] != '#':
+ word = l.split('\t')
+ self._lang_data[lang][0][word[0]].append(word[2])
+ self._lang_data[lang][1][word[2].lower()].append(word[0])
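# A hedged sketch of the tab-separated layout this loop expects, with
# columns offset-pos, type tag, and lemma (the values are illustrative):
#     02084071-n<TAB>lemma<TAB>dog
# word[0] keys the synset ('02084071-n'); word[2] holds the lemma.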
######################################################################
# WordNet Information Content Corpus Reader
######################################################################
-
class WordNetICCorpusReader(CorpusReader):
"""
A corpus reader for the WordNet information content corpus.
"""
def __init__(self, root, fileids):
- CorpusReader.__init__(self, root, fileids, encoding="utf8")
+ CorpusReader.__init__(self, root, fileids, encoding='utf8')
# this load function would be more efficient if the data was pickled
# Note that we can't use NLTK's frequency distributions because
# More information about the metrics is available at
# http://marimba.d.umn.edu/similarity/measures.html
-
def path_similarity(synset1, synset2, verbose=False, simulate_root=True):
return synset1.path_similarity(synset2, verbose, simulate_root)
"""
if synset1._pos != synset2._pos:
raise WordNetError(
- "Computing the least common subsumer requires "
- "%s and %s to have the same part of speech." % (synset1, synset2)
+ 'Computing the least common subsumer requires '
+ '%s and %s to have the same part of speech.' %
+ (synset1, synset2)
)
ic1 = information_content(synset1, ic)
# Utility functions
-
def information_content(synset, ic):
try:
icpos = ic[synset._pos]
except KeyError:
- msg = "Information content file has no entries for part-of-speech: %s"
+ msg = 'Information content file has no entries for part-of-speech: %s'
raise WordNetError(msg % synset._pos)
counts = icpos[synset._offset]
# get the part of speech (NOUN or VERB) from the information content record
# (each identifier has a 'n' or 'v' suffix)
-
def _get_pos(field):
- if field[-1] == "n":
+ if field[-1] == 'n':
return NOUN
- elif field[-1] == "v":
+ elif field[-1] == 'v':
return VERB
else:
msg = (
# unload corpus after tests
def teardown_module(module=None):
from nltk.corpus import wordnet
-
wordnet._unload()
+
# Natural Language Toolkit: XML Corpus Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
(note -- not named 'xml' to avoid conflicting w/ standard xml package)
"""
+from __future__ import print_function, unicode_literals
import codecs
-from xml.etree import ElementTree
+
+# Use the c version of ElementTree, which is faster, if possible:
+try: from xml.etree import cElementTree as ElementTree
+except ImportError: from xml.etree import ElementTree
+
+from six import string_types
from nltk.data import SeekableUnicodeStreamReader
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.util import *
-
class XMLCorpusReader(CorpusReader):
"""
Corpus reader for corpora whose documents are xml files.
``encoding`` argument, because the unicode encoding is specified by
the XML files themselves. See the XML specs for more info.
"""
-
def __init__(self, root, fileids, wrap_etree=False):
self._wrap_etree = wrap_etree
CorpusReader.__init__(self, root, fileids)
# Make sure we have exactly one file -- no concatenating XML.
if fileid is None and len(self._fileids) == 1:
fileid = self._fileids[0]
- if not isinstance(fileid, str):
- raise TypeError("Expected a single file identifier string")
+ if not isinstance(fileid, string_types):
+ raise TypeError('Expected a single file identifier string')
# Read the XML in using ElementTree.
elt = ElementTree.parse(self.abspath(fileid).open()).getroot()
# If requested, wrap it.
elt = self.xml(fileid)
encoding = self.encoding(fileid)
- word_tokenizer = WordPunctTokenizer()
+ word_tokenizer=WordPunctTokenizer()
iterator = elt.getiterator()
out = []
return out
def raw(self, fileids=None):
- if fileids is None:
- fileids = self._fileids
- elif isinstance(fileids, str):
- fileids = [fileids]
+ if fileids is None: fileids = self._fileids
+ elif isinstance(fileids, string_types): fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
elt_handler(elt, tagspec) -> value
"""
- if elt_handler:
- self.handle_elt = elt_handler
+ if elt_handler: self.handle_elt = elt_handler
- self._tagspec = re.compile(tagspec + r"\Z")
+ self._tagspec = re.compile(tagspec+r'\Z')
"""The tag specification for this corpus view."""
self._tag_context = {0: ()}
finally:
infile.close()
else:
- with open(fileid, "rb") as infile:
+ with open(fileid, 'rb') as infile:
s = infile.readline()
if s.startswith(codecs.BOM_UTF16_BE):
- return "utf-16-be"
+ return 'utf-16-be'
if s.startswith(codecs.BOM_UTF16_LE):
- return "utf-16-le"
+ return 'utf-16-le'
if s.startswith(codecs.BOM_UTF32_BE):
- return "utf-32-be"
+ return 'utf-32-be'
if s.startswith(codecs.BOM_UTF32_LE):
- return "utf-32-le"
+ return 'utf-32-le'
if s.startswith(codecs.BOM_UTF8):
- return "utf-8"
+ return 'utf-8'
m = re.match(br'\s*<\?xml\b.*\bencoding="([^"]+)"', s)
if m:
return m.group(1).decode()
if m:
return m.group(1).decode()
# No encoding found -- what should the default be?
- return "utf-8"
+ return 'utf-8'
def handle_elt(self, elt, context):
"""
#: A regular expression that matches XML fragments that do not
#: contain any un-closed tags.
- _VALID_XML_RE = re.compile(
- r"""
+ _VALID_XML_RE = re.compile(r"""
[^<]*
(
((<!--.*?-->) | # comment
(<[^!>][^>]*>)) # tag or PI
[^<]*)*
\Z""",
- re.DOTALL | re.VERBOSE,
- )
+ re.DOTALL|re.VERBOSE)
#: A regular expression used to extract the tag name from a start tag,
#: end tag, or empty-elt tag string.
- _XML_TAG_NAME = re.compile("<\s*/?\s*([^\s>]+)")
+ _XML_TAG_NAME = re.compile('<\s*/?\s*([^\s>]+)')
#: A regular expression used to find all start-tags, end-tags, and
#: empty-elt tags in an XML file. This regexp is more lenient than
#: the XML spec -- e.g., it allows spaces in some places where the
#: spec does not.
- _XML_PIECE = re.compile(
- r"""
+ _XML_PIECE = re.compile(r"""
# Include these so we can skip them:
(?P<COMMENT> <!--.*?--> )|
(?P<CDATA> <![CDATA[.*?]]> )|
(?P<EMPTY_ELT_TAG> <\s*[^>/\?!\s][^>]*/\s*> )|
(?P<START_TAG> <\s*[^>/\?!\s][^>]*> )|
(?P<END_TAG> <\s*/[^>/\?!\s][^>]*> )""",
- re.DOTALL | re.VERBOSE,
- )
+ re.DOTALL|re.VERBOSE)
def _read_xml_fragment(self, stream):
"""
then this function either backtracks to the last '<', or reads
another block.
"""
- fragment = ""
+ fragment = ''
if isinstance(stream, SeekableUnicodeStreamReader):
startpos = stream.tell()
return fragment
# Do we have a fragment that will never be well-formed?
- if re.search("[<>]", fragment).group(0) == ">":
+ if re.search('[<>]', fragment).group(0) == '>':
pos = stream.tell() - (
- len(fragment) - re.search("[<>]", fragment).end()
- )
+ len(fragment)-re.search('[<>]', fragment).end())
raise ValueError('Unexpected ">" near char %s' % pos)
# End of file?
if not xml_block:
- raise ValueError("Unexpected end of file: tag not closed")
+ raise ValueError('Unexpected end of file: tag not closed')
# If not, then we must be in the middle of a <..tag..>.
# If appropriate, backtrack to the most recent '<'
# character.
- last_open_bracket = fragment.rfind("<")
+ last_open_bracket = fragment.rfind('<')
if last_open_bracket > 0:
if self._VALID_XML_RE.match(fragment[:last_open_bracket]):
if isinstance(stream, SeekableUnicodeStreamReader):
stream.seek(startpos)
stream.char_seek_forward(last_open_bracket)
else:
- stream.seek(-(len(fragment) - last_open_bracket), 1)
+ stream.seek(-(len(fragment)-last_open_bracket), 1)
return fragment[:last_open_bracket]
# Otherwise, read another block. (i.e., return to the
matches ``tagspec``, and return the result of applying
``elt_handler`` to each element found.
"""
- if tagspec is None:
- tagspec = self._tagspec
- if elt_handler is None:
- elt_handler = self.handle_elt
+ if tagspec is None: tagspec = self._tagspec
+ if elt_handler is None: elt_handler = self.handle_elt
# Use a stack of strings to keep track of our context:
context = list(self._tag_context.get(stream.tell()))
- assert context is not None # check this -- could it ever happen?
+ assert context is not None # check this -- could it ever happen?
elts = []
- elt_start = None # where does the elt start
- elt_depth = None # what context depth
- elt_text = ""
+ elt_start = None # where does the elt start
+ elt_depth = None # what context depth
+ elt_text = ''
- while elts == [] or elt_start is not None:
+ while elts==[] or elt_start is not None:
if isinstance(stream, SeekableUnicodeStreamReader):
startpos = stream.tell()
xml_fragment = self._read_xml_fragment(stream)
# End of file.
if not xml_fragment:
- if elt_start is None:
- break
- else:
- raise ValueError("Unexpected end of file")
+ if elt_start is None: break
+ else: raise ValueError('Unexpected end of file')
# Process each <tag> in the xml fragment.
for piece in self._XML_PIECE.finditer(xml_fragment):
if self._DEBUG:
- print("%25s %s" % ("/".join(context)[-20:], piece.group()))
+ print('%25s %s' % ('/'.join(context)[-20:], piece.group()))
- if piece.group("START_TAG"):
+ if piece.group('START_TAG'):
name = self._XML_TAG_NAME.match(piece.group()).group(1)
# Keep context up-to-date.
context.append(name)
# Is this one of the elts we're looking for?
if elt_start is None:
- if re.match(tagspec, "/".join(context)):
+ if re.match(tagspec, '/'.join(context)):
elt_start = piece.start()
elt_depth = len(context)
- elif piece.group("END_TAG"):
+ elif piece.group('END_TAG'):
name = self._XML_TAG_NAME.match(piece.group()).group(1)
# sanity checks:
if not context:
- raise ValueError("Unmatched tag </%s>" % name)
+ raise ValueError('Unmatched tag </%s>' % name)
if name != context[-1]:
- raise ValueError(
- "Unmatched tag <%s>...</%s>" % (context[-1], name)
- )
+ raise ValueError('Unmatched tag <%s>...</%s>' %
+ (context[-1], name))
# Is this the end of an element?
if elt_start is not None and elt_depth == len(context):
- elt_text += xml_fragment[elt_start : piece.end()]
- elts.append((elt_text, "/".join(context)))
+ elt_text += xml_fragment[elt_start:piece.end()]
+ elts.append( (elt_text, '/'.join(context)) )
elt_start = elt_depth = None
- elt_text = ""
+ elt_text = ''
# Keep context up-to-date
context.pop()
- elif piece.group("EMPTY_ELT_TAG"):
+ elif piece.group('EMPTY_ELT_TAG'):
name = self._XML_TAG_NAME.match(piece.group()).group(1)
if elt_start is None:
- if re.match(tagspec, "/".join(context) + "/" + name):
- elts.append((piece.group(), "/".join(context) + "/" + name))
+ if re.match(tagspec, '/'.join(context)+'/'+name):
+ elts.append((piece.group(),
+ '/'.join(context)+'/'+name))
if elt_start is not None:
# If we haven't found any elements yet, then keep
# take back the last start-tag, and return what
# we've gotten so far (elts is non-empty).
if self._DEBUG:
- print(" " * 36 + "(backtrack)")
+ print(' '*36+'(backtrack)')
if isinstance(stream, SeekableUnicodeStreamReader):
stream.seek(startpos)
stream.char_seek_forward(elt_start)
else:
- stream.seek(-(len(xml_fragment) - elt_start), 1)
- context = context[: elt_depth - 1]
+ stream.seek(-(len(xml_fragment)-elt_start), 1)
+ context = context[:elt_depth-1]
elt_start = elt_depth = None
- elt_text = ""
+ elt_text = ''
# Update the _tag_context dict.
pos = stream.tell()
else:
self._tag_context[pos] = tuple(context)
- return [
- elt_handler(
- ElementTree.fromstring(elt.encode("ascii", "xmlcharrefreplace")),
- context,
- )
- for (elt, context) in elts
- ]
+ return [elt_handler(ElementTree.fromstring(
+ elt.encode('ascii', 'xmlcharrefreplace')),
+ context)
+ for (elt, context) in elts]
import os
import re
+from six import string_types
+
from nltk.tokenize import RegexpTokenizer
from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
from nltk.corpus.reader.tagged import TaggedCorpusReader
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
-
class YCOECorpusReader(CorpusReader):
"""
Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old
English Prose (YCOE), a 1.5 million word syntactically-annotated
corpus of Old English prose texts.
"""
-
- def __init__(self, root, encoding="utf8"):
+ def __init__(self, root, encoding='utf8'):
CorpusReader.__init__(self, root, [], encoding)
self._psd_reader = YCOEParseCorpusReader(
- self.root.join("psd"), ".*", ".psd", encoding=encoding
- )
- self._pos_reader = YCOETaggedCorpusReader(self.root.join("pos"), ".*", ".pos")
+ self.root.join('psd'), '.*', '.psd', encoding=encoding)
+ self._pos_reader = YCOETaggedCorpusReader(
+ self.root.join('pos'), '.*', '.pos')
# Make sure we have a consistent set of items:
documents = set(f[:-4] for f in self._psd_reader.fileids())
if set(f[:-4] for f in self._pos_reader.fileids()) != documents:
- raise ValueError('Items in "psd" and "pos" ' "subdirectories do not match.")
+ raise ValueError('Items in "psd" and "pos" '
+ 'subdirectories do not match.')
- fileids = sorted(
- ["%s.psd" % doc for doc in documents]
- + ["%s.pos" % doc for doc in documents]
- )
+ fileids = sorted(['%s.psd' % doc for doc in documents] +
+ ['%s.pos' % doc for doc in documents])
CorpusReader.__init__(self, root, fileids, encoding)
self._documents = sorted(documents)
"""
if fileids is None:
return self._documents
- if isinstance(fileids, str):
+ if isinstance(fileids, string_types):
fileids = [fileids]
for f in fileids:
if f not in self._fileids:
- raise KeyError("File id %s not found" % fileids)
+ raise KeyError('File id %s not found' % fileids)
# Strip off the '.pos' and '.psd' extensions.
return sorted(set(f[:-4] for f in fileids))
"""
if documents is None:
return self._fileids
- elif isinstance(documents, str):
+ elif isinstance(documents, string_types):
documents = [documents]
- return sorted(
- set(
- ["%s.pos" % doc for doc in documents]
- + ["%s.psd" % doc for doc in documents]
- )
- )
+ return sorted(set(['%s.pos' % doc for doc in documents] +
+ ['%s.psd' % doc for doc in documents]))
def _getfileids(self, documents, subcorpus):
"""
if documents is None:
documents = self._documents
else:
- if isinstance(documents, str):
+ if isinstance(documents, string_types):
documents = [documents]
for document in documents:
if document not in self._documents:
- if document[-4:] in (".pos", ".psd"):
+ if document[-4:] in ('.pos', '.psd'):
raise ValueError(
- "Expected a document identifier, not a file "
- "identifier. (Use corpus.documents() to get "
- "a list of document identifiers."
- )
+ 'Expected a document identifier, not a file '
+ 'identifier. (Use corpus.documents() to get '
+ 'a list of document identifiers.')
else:
- raise ValueError("Document identifier %s not found" % document)
- return ["%s.%s" % (d, subcorpus) for d in documents]
+ raise ValueError('Document identifier %s not found'
+ % document)
+ return ['%s.%s' % (d, subcorpus) for d in documents]
# Delegate to one of our two sub-readers:
def words(self, documents=None):
- return self._pos_reader.words(self._getfileids(documents, "pos"))
-
+ return self._pos_reader.words(self._getfileids(documents, 'pos'))
def sents(self, documents=None):
- return self._pos_reader.sents(self._getfileids(documents, "pos"))
-
+ return self._pos_reader.sents(self._getfileids(documents, 'pos'))
def paras(self, documents=None):
- return self._pos_reader.paras(self._getfileids(documents, "pos"))
-
+ return self._pos_reader.paras(self._getfileids(documents, 'pos'))
def tagged_words(self, documents=None):
- return self._pos_reader.tagged_words(self._getfileids(documents, "pos"))
-
+ return self._pos_reader.tagged_words(self._getfileids(documents, 'pos'))
def tagged_sents(self, documents=None):
- return self._pos_reader.tagged_sents(self._getfileids(documents, "pos"))
-
+ return self._pos_reader.tagged_sents(self._getfileids(documents, 'pos'))
def tagged_paras(self, documents=None):
- return self._pos_reader.tagged_paras(self._getfileids(documents, "pos"))
-
+ return self._pos_reader.tagged_paras(self._getfileids(documents, 'pos'))
def parsed_sents(self, documents=None):
- return self._psd_reader.parsed_sents(self._getfileids(documents, "psd"))
+ return self._psd_reader.parsed_sents(self._getfileids(documents, 'psd'))
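# Illustrative usage of the delegating reader (assuming the YCOE data is
# installed; see the `documents` mapping below for valid ids):
#     >>> from nltk.corpus import ycoe  # doctest: +SKIP
#     >>> ycoe.tagged_words('coadrian.o34')  # doctest: +SKIP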
class YCOEParseCorpusReader(BracketParseCorpusReader):
"""Specialized version of the standard bracket parse corpus reader
that strips out (CODE ...) and (ID ...) nodes."""
-
def _parse(self, t):
- t = re.sub(r"(?u)\((CODE|ID)[^\)]*\)", "", t)
- if re.match(r"\s*\(\s*\)\s*$", t):
- return None
+ t = re.sub(r'(?u)\((CODE|ID)[^\)]*\)', '', t)
+ if re.match(r'\s*\(\s*\)\s*$', t): return None
return BracketParseCorpusReader._parse(self, t)
-
class YCOETaggedCorpusReader(TaggedCorpusReader):
- def __init__(self, root, items, encoding="utf8"):
- gaps_re = r"(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*"
+ def __init__(self, root, items, encoding='utf8'):
+ gaps_re = r'(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*'
sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
- TaggedCorpusReader.__init__(
- self, root, items, sep="_", sent_tokenizer=sent_tokenizer
- )
-
+ TaggedCorpusReader.__init__(self, root, items, sep='_',
+ sent_tokenizer=sent_tokenizer)
#: A list of all documents and their titles in ycoe.
documents = {
- "coadrian.o34": "Adrian and Ritheus",
- "coaelhom.o3": "Ælfric, Supplemental Homilies",
- "coaelive.o3": "Ælfric's Lives of Saints",
- "coalcuin": "Alcuin De virtutibus et vitiis",
- "coalex.o23": "Alexander's Letter to Aristotle",
- "coapollo.o3": "Apollonius of Tyre",
- "coaugust": "Augustine",
- "cobede.o2": "Bede's History of the English Church",
- "cobenrul.o3": "Benedictine Rule",
- "coblick.o23": "Blickling Homilies",
- "coboeth.o2": "Boethius' Consolation of Philosophy",
- "cobyrhtf.o3": "Byrhtferth's Manual",
- "cocanedgD": "Canons of Edgar (D)",
- "cocanedgX": "Canons of Edgar (X)",
- "cocathom1.o3": "Ælfric's Catholic Homilies I",
- "cocathom2.o3": "Ælfric's Catholic Homilies II",
- "cochad.o24": "Saint Chad",
- "cochdrul": "Chrodegang of Metz, Rule",
- "cochristoph": "Saint Christopher",
- "cochronA.o23": "Anglo-Saxon Chronicle A",
- "cochronC": "Anglo-Saxon Chronicle C",
- "cochronD": "Anglo-Saxon Chronicle D",
- "cochronE.o34": "Anglo-Saxon Chronicle E",
- "cocura.o2": "Cura Pastoralis",
- "cocuraC": "Cura Pastoralis (Cotton)",
- "codicts.o34": "Dicts of Cato",
- "codocu1.o1": "Documents 1 (O1)",
- "codocu2.o12": "Documents 2 (O1/O2)",
- "codocu2.o2": "Documents 2 (O2)",
- "codocu3.o23": "Documents 3 (O2/O3)",
- "codocu3.o3": "Documents 3 (O3)",
- "codocu4.o24": "Documents 4 (O2/O4)",
- "coeluc1": "Honorius of Autun, Elucidarium 1",
- "coeluc2": "Honorius of Autun, Elucidarium 1",
- "coepigen.o3": "Ælfric's Epilogue to Genesis",
- "coeuphr": "Saint Euphrosyne",
- "coeust": "Saint Eustace and his companions",
- "coexodusP": "Exodus (P)",
- "cogenesiC": "Genesis (C)",
- "cogregdC.o24": "Gregory's Dialogues (C)",
- "cogregdH.o23": "Gregory's Dialogues (H)",
- "coherbar": "Pseudo-Apuleius, Herbarium",
- "coinspolD.o34": "Wulfstan's Institute of Polity (D)",
- "coinspolX": "Wulfstan's Institute of Polity (X)",
- "cojames": "Saint James",
- "colacnu.o23": "Lacnunga",
- "colaece.o2": "Leechdoms",
- "colaw1cn.o3": "Laws, Cnut I",
- "colaw2cn.o3": "Laws, Cnut II",
- "colaw5atr.o3": "Laws, Æthelred V",
- "colaw6atr.o3": "Laws, Æthelred VI",
- "colawaf.o2": "Laws, Alfred",
- "colawafint.o2": "Alfred's Introduction to Laws",
- "colawger.o34": "Laws, Gerefa",
- "colawine.ox2": "Laws, Ine",
- "colawnorthu.o3": "Northumbra Preosta Lagu",
- "colawwllad.o4": "Laws, William I, Lad",
- "coleofri.o4": "Leofric",
- "colsigef.o3": "Ælfric's Letter to Sigefyrth",
- "colsigewB": "Ælfric's Letter to Sigeweard (B)",
- "colsigewZ.o34": "Ælfric's Letter to Sigeweard (Z)",
- "colwgeat": "Ælfric's Letter to Wulfgeat",
- "colwsigeT": "Ælfric's Letter to Wulfsige (T)",
- "colwsigeXa.o34": "Ælfric's Letter to Wulfsige (Xa)",
- "colwstan1.o3": "Ælfric's Letter to Wulfstan I",
- "colwstan2.o3": "Ælfric's Letter to Wulfstan II",
- "comargaC.o34": "Saint Margaret (C)",
- "comargaT": "Saint Margaret (T)",
- "comart1": "Martyrology, I",
- "comart2": "Martyrology, II",
- "comart3.o23": "Martyrology, III",
- "comarvel.o23": "Marvels of the East",
- "comary": "Mary of Egypt",
- "coneot": "Saint Neot",
- "conicodA": "Gospel of Nicodemus (A)",
- "conicodC": "Gospel of Nicodemus (C)",
- "conicodD": "Gospel of Nicodemus (D)",
- "conicodE": "Gospel of Nicodemus (E)",
- "coorosiu.o2": "Orosius",
- "cootest.o3": "Heptateuch",
- "coprefcath1.o3": "Ælfric's Preface to Catholic Homilies I",
- "coprefcath2.o3": "Ælfric's Preface to Catholic Homilies II",
- "coprefcura.o2": "Preface to the Cura Pastoralis",
- "coprefgen.o3": "Ælfric's Preface to Genesis",
- "copreflives.o3": "Ælfric's Preface to Lives of Saints",
- "coprefsolilo": "Preface to Augustine's Soliloquies",
- "coquadru.o23": "Pseudo-Apuleius, Medicina de quadrupedibus",
- "corood": "History of the Holy Rood-Tree",
- "cosevensl": "Seven Sleepers",
- "cosolilo": "St. Augustine's Soliloquies",
- "cosolsat1.o4": "Solomon and Saturn I",
- "cosolsat2": "Solomon and Saturn II",
- "cotempo.o3": "Ælfric's De Temporibus Anni",
- "coverhom": "Vercelli Homilies",
- "coverhomE": "Vercelli Homilies (E)",
- "coverhomL": "Vercelli Homilies (L)",
- "covinceB": "Saint Vincent (Bodley 343)",
- "covinsal": "Vindicta Salvatoris",
- "cowsgosp.o3": "West-Saxon Gospels",
- "cowulf.o34": "Wulfstan's Homilies",
-}
+ 'coadrian.o34': 'Adrian and Ritheus',
+ 'coaelhom.o3': 'Ælfric, Supplemental Homilies',
+ 'coaelive.o3': 'Ælfric\'s Lives of Saints',
+ 'coalcuin': 'Alcuin De virtutibus et vitiis',
+ 'coalex.o23': 'Alexander\'s Letter to Aristotle',
+ 'coapollo.o3': 'Apollonius of Tyre',
+ 'coaugust': 'Augustine',
+ 'cobede.o2': 'Bede\'s History of the English Church',
+ 'cobenrul.o3': 'Benedictine Rule',
+ 'coblick.o23': 'Blickling Homilies',
+ 'coboeth.o2': 'Boethius\' Consolation of Philosophy',
+ 'cobyrhtf.o3': 'Byrhtferth\'s Manual',
+ 'cocanedgD': 'Canons of Edgar (D)',
+ 'cocanedgX': 'Canons of Edgar (X)',
+ 'cocathom1.o3': 'Ælfric\'s Catholic Homilies I',
+ 'cocathom2.o3': 'Ælfric\'s Catholic Homilies II',
+ 'cochad.o24': 'Saint Chad',
+ 'cochdrul': 'Chrodegang of Metz, Rule',
+ 'cochristoph': 'Saint Christopher',
+ 'cochronA.o23': 'Anglo-Saxon Chronicle A',
+ 'cochronC': 'Anglo-Saxon Chronicle C',
+ 'cochronD': 'Anglo-Saxon Chronicle D',
+ 'cochronE.o34': 'Anglo-Saxon Chronicle E',
+ 'cocura.o2': 'Cura Pastoralis',
+ 'cocuraC': 'Cura Pastoralis (Cotton)',
+ 'codicts.o34': 'Dicts of Cato',
+ 'codocu1.o1': 'Documents 1 (O1)',
+ 'codocu2.o12': 'Documents 2 (O1/O2)',
+ 'codocu2.o2': 'Documents 2 (O2)',
+ 'codocu3.o23': 'Documents 3 (O2/O3)',
+ 'codocu3.o3': 'Documents 3 (O3)',
+ 'codocu4.o24': 'Documents 4 (O2/O4)',
+ 'coeluc1': 'Honorius of Autun, Elucidarium 1',
+ 'coeluc2': 'Honorius of Autun, Elucidarium 1',
+ 'coepigen.o3': 'Ælfric\'s Epilogue to Genesis',
+ 'coeuphr': 'Saint Euphrosyne',
+ 'coeust': 'Saint Eustace and his companions',
+ 'coexodusP': 'Exodus (P)',
+ 'cogenesiC': 'Genesis (C)',
+ 'cogregdC.o24': 'Gregory\'s Dialogues (C)',
+ 'cogregdH.o23': 'Gregory\'s Dialogues (H)',
+ 'coherbar': 'Pseudo-Apuleius, Herbarium',
+ 'coinspolD.o34': 'Wulfstan\'s Institute of Polity (D)',
+ 'coinspolX': 'Wulfstan\'s Institute of Polity (X)',
+ 'cojames': 'Saint James',
+ 'colacnu.o23': 'Lacnunga',
+ 'colaece.o2': 'Leechdoms',
+ 'colaw1cn.o3': 'Laws, Cnut I',
+ 'colaw2cn.o3': 'Laws, Cnut II',
+ 'colaw5atr.o3': 'Laws, Æthelred V',
+ 'colaw6atr.o3': 'Laws, Æthelred VI',
+ 'colawaf.o2': 'Laws, Alfred',
+ 'colawafint.o2': 'Alfred\'s Introduction to Laws',
+ 'colawger.o34': 'Laws, Gerefa',
+ 'colawine.ox2': 'Laws, Ine',
+ 'colawnorthu.o3': 'Northumbra Preosta Lagu',
+ 'colawwllad.o4': 'Laws, William I, Lad',
+ 'coleofri.o4': 'Leofric',
+ 'colsigef.o3': 'Ælfric\'s Letter to Sigefyrth',
+ 'colsigewB': 'Ælfric\'s Letter to Sigeweard (B)',
+ 'colsigewZ.o34': 'Ælfric\'s Letter to Sigeweard (Z)',
+ 'colwgeat': 'Ælfric\'s Letter to Wulfgeat',
+ 'colwsigeT': 'Ælfric\'s Letter to Wulfsige (T)',
+ 'colwsigeXa.o34': 'Ælfric\'s Letter to Wulfsige (Xa)',
+ 'colwstan1.o3': 'Ælfric\'s Letter to Wulfstan I',
+ 'colwstan2.o3': 'Ælfric\'s Letter to Wulfstan II',
+ 'comargaC.o34': 'Saint Margaret (C)',
+ 'comargaT': 'Saint Margaret (T)',
+ 'comart1': 'Martyrology, I',
+ 'comart2': 'Martyrology, II',
+ 'comart3.o23': 'Martyrology, III',
+ 'comarvel.o23': 'Marvels of the East',
+ 'comary': 'Mary of Egypt',
+ 'coneot': 'Saint Neot',
+ 'conicodA': 'Gospel of Nicodemus (A)',
+ 'conicodC': 'Gospel of Nicodemus (C)',
+ 'conicodD': 'Gospel of Nicodemus (D)',
+ 'conicodE': 'Gospel of Nicodemus (E)',
+ 'coorosiu.o2': 'Orosius',
+ 'cootest.o3': 'Heptateuch',
+ 'coprefcath1.o3': 'Ælfric\'s Preface to Catholic Homilies I',
+ 'coprefcath2.o3': 'Ælfric\'s Preface to Catholic Homilies II',
+ 'coprefcura.o2': 'Preface to the Cura Pastoralis',
+ 'coprefgen.o3': 'Ælfric\'s Preface to Genesis',
+ 'copreflives.o3': 'Ælfric\'s Preface to Lives of Saints',
+ 'coprefsolilo': 'Preface to Augustine\'s Soliloquies',
+ 'coquadru.o23': 'Pseudo-Apuleius, Medicina de quadrupedibus',
+ 'corood': 'History of the Holy Rood-Tree',
+ 'cosevensl': 'Seven Sleepers',
+ 'cosolilo': 'St. Augustine\'s Soliloquies',
+ 'cosolsat1.o4': 'Solomon and Saturn I',
+ 'cosolsat2': 'Solomon and Saturn II',
+ 'cotempo.o3': 'Ælfric\'s De Temporibus Anni',
+ 'coverhom': 'Vercelli Homilies',
+ 'coverhomE': 'Vercelli Homilies (E)',
+ 'coverhomL': 'Vercelli Homilies (L)',
+ 'covinceB': 'Saint Vincent (Bodley 343)',
+ 'covinsal': 'Vindicta Salvatoris',
+ 'cowsgosp.o3': 'West-Saxon Gospels',
+ 'cowulf.o34': 'Wulfstan\'s Homilies'
+ }
# Natural Language Toolkit: Corpus Reader Utility Functions
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
######################################################################
-# { Lazy Corpus Loader
+#{ Lazy Corpus Loader
######################################################################
+from __future__ import unicode_literals
import re
import gc
import nltk
+from nltk.compat import python_2_unicode_compatible
TRY_ZIPFILE_FIRST = False
-
+@python_2_unicode_compatible
class LazyCorpusLoader(object):
"""
To see the API documentation for this lazily loaded corpus, first
run corpus.ensure_loaded(), and then run help(this_corpus).
-
+
LazyCorpusLoader is a proxy object which is used to stand in for a
corpus object before the corpus is loaded. This allows NLTK to
create an object for each corpus, but defer the costs associated
NLTK data package. Once they've properly installed the data
package (or modified ``nltk.data.path`` to point to its location),
they can then use the corpus object without restarting python.
-
+
:param name: The name of the corpus
:type name: str
:param reader_cls: The specific CorpusReader class, e.g. PlaintextCorpusReader, WordListCorpusReader
:param *args: Any other non-keyword arguments that `reader_cls` might need.
:param *kwargs: Any other keyword arguments that `reader_cls` might need.
"""
-
def __init__(self, name, reader_cls, *args, **kwargs):
from nltk.corpus.reader.api import CorpusReader
-
assert issubclass(reader_cls, CorpusReader)
self.__name = self.__name__ = name
self.__reader_cls = reader_cls
- # If nltk_data_subdir is set explicitly
- if "nltk_data_subdir" in kwargs:
+ # If nltk_data_subdir is set explicitly
+ if 'nltk_data_subdir' in kwargs:
# Use the specified subdirectory path
- self.subdir = kwargs["nltk_data_subdir"]
+ self.subdir = kwargs['nltk_data_subdir']
# Pops the `nltk_data_subdir` argument, we don't need it anymore.
- kwargs.pop("nltk_data_subdir", None)
- else: # Otherwise use 'nltk_data/corpora'
- self.subdir = "corpora"
+ kwargs.pop('nltk_data_subdir', None)
+ else: # Otherwise use 'nltk_data/corpora'
+ self.subdir = 'corpora'
self.__args = args
self.__kwargs = kwargs
def __load(self):
# Find the corpus root directory.
- zip_name = re.sub(r"(([^/]+)(/.*)?)", r"\2.zip/\1/", self.__name)
+ zip_name = re.sub(r'(([^/]*)(/.*)?)', r'\2.zip/\1/', self.__name)
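# e.g. this maps 'treebank/combined' to (roughly)
# 'treebank.zip/treebank/combined/', so a zipped copy of the corpus can
# be probed as well as the unpacked directory tree.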
if TRY_ZIPFILE_FIRST:
try:
- root = nltk.data.find("{}/{}".format(self.subdir, zip_name))
+ root = nltk.data.find('{}/{}'.format(self.subdir, zip_name))
except LookupError as e:
- try:
- root = nltk.data.find("{}/{}".format(self.subdir, self.__name))
- except LookupError:
- raise e
+ try: root = nltk.data.find('{}/{}'.format(self.subdir, self.__name))
+ except LookupError: raise e
else:
try:
- root = nltk.data.find("{}/{}".format(self.subdir, self.__name))
+ root = nltk.data.find('{}/{}'.format(self.subdir, self.__name))
except LookupError as e:
- try:
- root = nltk.data.find("{}/{}".format(self.subdir, zip_name))
- except LookupError:
- raise e
+ try: root = nltk.data.find('{}/{}'.format(self.subdir, zip_name))
+ except LookupError: raise e
# Load the corpus.
corpus = self.__reader_cls(root, *self.__args, **self.__kwargs)
# the corpus by modifying our own __dict__ and __class__ to
# match that of the corpus.
- args, kwargs = self.__args, self.__kwargs
+ args, kwargs = self.__args, self.__kwargs
name, reader_cls = self.__name, self.__reader_cls
self.__dict__ = corpus.__dict__
# (see http://bugs.python.org/issue1225107).
# Without this fix tests may take extra 1.5GB RAM
# because all corpora gets loaded during test collection.
- if attr == "__bases__":
+ if attr == '__bases__':
raise AttributeError("LazyCorpusLoader object has no attribute '__bases__'")
self.__load()
return getattr(self, attr)
def __repr__(self):
- return "<%s in %r (not loaded yet)>" % (
- self.__reader_cls.__name__,
- ".../corpora/" + self.__name,
- )
+ return '<%s in %r (not loaded yet)>' % (
+ self.__reader_cls.__name__, '.../corpora/'+self.__name)
def _unload(self):
# If an exception occurs during corpus loading then
"""
Magic for creating bound methods (used for _unload).
"""
-
class Foo(object):
- def meth(self):
- pass
-
+ def meth(self): pass
f = Foo()
bound_method = type(f.meth)
try:
return bound_method(func, self, self.__class__)
- except TypeError: # python3
+ except TypeError: # python3
return bound_method(func, self)
# Natural Language Toolkit: Utility functions
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
adds it to a resource cache; and ``retrieve()`` copies a given resource
to a local file.
"""
+from __future__ import print_function, unicode_literals
+from __future__ import division
+from abc import ABCMeta, abstractmethod
+from six import add_metaclass
import functools
import textwrap
import io
-from io import BytesIO
import os
import re
import sys
import zipfile
import codecs
-import pickle
-from abc import ABCMeta, abstractmethod
-from gzip import GzipFile, WRITE as GZ_WRITE
+from gzip import GzipFile, READ as GZ_READ, WRITE as GZ_WRITE
-from urllib.request import urlopen, url2pathname
+try: # Python 3.
+ textwrap_indent = functools.partial(textwrap.indent, prefix=' ')
+except AttributeError: # Python 2; indent() not available for Python2.
+ textwrap_fill = functools.partial(textwrap.fill,
+ initial_indent=' ',
+ subsequent_indent=' ',
+ replace_whitespace=False)
+ def textwrap_indent(text):
+ return '\n'.join(textwrap_fill(line) for line in text.splitlines())
try:
from zlib import Z_SYNC_FLUSH as FLUSH
except ImportError:
from zlib import Z_FINISH as FLUSH
+try:
+ import cPickle as pickle
+except ImportError:
+ import pickle
+
+from six import string_types, text_type
+from six.moves.urllib.request import urlopen, url2pathname
+
# this import should be more specific:
import nltk
-from nltk.compat import py3_data, add_py3_data
-from nltk.internals import deprecated
-
-textwrap_indent = functools.partial(textwrap.indent, prefix=" ")
+from nltk.compat import py3_data, add_py3_data, BytesIO
######################################################################
# Search Path
(e.g., in their home directory under ~/nltk_data)."""
# User-specified locations:
-_paths_from_env = os.environ.get("NLTK_DATA", str("")).split(os.pathsep)
+_paths_from_env = os.environ.get('NLTK_DATA', str('')).split(os.pathsep)
path += [d for d in _paths_from_env if d]
-if "APPENGINE_RUNTIME" not in os.environ and os.path.expanduser("~/") != "~/":
- path.append(os.path.expanduser(str("~/nltk_data")))
+if 'APPENGINE_RUNTIME' not in os.environ and os.path.expanduser('~/') != '~/':
+ path.append(os.path.expanduser(str('~/nltk_data')))
-if sys.platform.startswith("win"):
+if sys.platform.startswith('win'):
# Common locations on Windows:
path += [
- os.path.join(sys.prefix, str("nltk_data")),
- os.path.join(sys.prefix, str("share"), str("nltk_data")),
- os.path.join(sys.prefix, str("lib"), str("nltk_data")),
- os.path.join(os.environ.get(str("APPDATA"), str("C:\\")), str("nltk_data")),
- str(r"C:\nltk_data"),
- str(r"D:\nltk_data"),
- str(r"E:\nltk_data"),
+ str(r'C:\nltk_data'), str(r'D:\nltk_data'), str(r'E:\nltk_data'),
+ os.path.join(sys.prefix, str('nltk_data')),
+ os.path.join(sys.prefix, str('lib'), str('nltk_data')),
+ os.path.join(
+ os.environ.get(str('APPDATA'), str('C:\\')), str('nltk_data'))
]
else:
# Common locations on UNIX & OS X:
path += [
- os.path.join(sys.prefix, str("nltk_data")),
- os.path.join(sys.prefix, str("share"), str("nltk_data")),
- os.path.join(sys.prefix, str("lib"), str("nltk_data")),
- str("/usr/share/nltk_data"),
- str("/usr/local/share/nltk_data"),
- str("/usr/lib/nltk_data"),
- str("/usr/local/lib/nltk_data"),
+ str('/usr/share/nltk_data'),
+ str('/usr/local/share/nltk_data'),
+ str('/usr/lib/nltk_data'),
+ str('/usr/local/lib/nltk_data'),
+ os.path.join(sys.prefix, str('nltk_data')),
+ os.path.join(sys.prefix, str('lib'), str('nltk_data'))
]
# Util Functions
######################################################################
-
-def gzip_open_unicode(
- filename,
- mode="rb",
- compresslevel=9,
- encoding="utf-8",
- fileobj=None,
- errors=None,
- newline=None,
-):
+def gzip_open_unicode(filename, mode="rb", compresslevel=9, encoding='utf-8',
+ fileobj=None, errors=None, newline=None):
if fileobj is None:
fileobj = GzipFile(filename, mode, compresslevel, fileobj)
return io.TextIOWrapper(fileobj, encoding, errors, newline)
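# Minimal usage sketch (the filename is hypothetical):
#     >>> with gzip_open_unicode('counts.txt.gz') as f:
#     ...     first_line = f.readline()  # decoded text, not bytes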
>>> split_resource_url('file:///C:/home/nltk')
('file', '/C:/home/nltk')
"""
- protocol, path_ = resource_url.split(":", 1)
- if protocol == "nltk":
+ protocol, path_ = resource_url.split(':', 1)
+ if protocol == 'nltk':
pass
- elif protocol == "file":
- if path_.startswith("/"):
- path_ = "/" + path_.lstrip("/")
+ elif protocol == 'file':
+ if path_.startswith('/'):
+ path_ = '/' + path_.lstrip('/')
else:
- path_ = re.sub(r"^/{0,2}", "", path_)
+ path_ = re.sub(r'^/{0,2}', '', path_)
return protocol, path_
protocol, name = split_resource_url(resource_url)
except ValueError:
# the resource url has no protocol, use the nltk protocol by default
- protocol = "nltk"
+ protocol = 'nltk'
name = resource_url
# use file protocol if the path is an absolute path
- if protocol == "nltk" and os.path.isabs(name):
- protocol = "file://"
+ if protocol == 'nltk' and os.path.isabs(name):
+ protocol = 'file://'
name = normalize_resource_name(name, False, None)
- elif protocol == "file":
- protocol = "file://"
+ elif protocol == 'file':
+ protocol = 'file://'
# name is absolute
name = normalize_resource_name(name, False, None)
- elif protocol == "nltk":
- protocol = "nltk:"
+ elif protocol == 'nltk':
+ protocol = 'nltk:'
name = normalize_resource_name(name, True)
else:
# handled by urllib
- protocol += "://"
- return "".join([protocol, name])
+ protocol += '://'
+ return ''.join([protocol, name])
def normalize_resource_name(resource_name, allow_relative=True, relative_path=None):
>>> windows or normalize_resource_name('/dir/file', True, '/') == '/dir/file'
True
"""
- is_dir = bool(re.search(r"[\\/.]$", resource_name)) or resource_name.endswith(
- os.path.sep
- )
- if sys.platform.startswith("win"):
- resource_name = resource_name.lstrip("/")
+ is_dir = bool(re.search(r'[\\/.]$', resource_name)) or resource_name.endswith(os.path.sep)
+ if sys.platform.startswith('win'):
+ resource_name = resource_name.lstrip('/')
else:
- resource_name = re.sub(r"^/+", "/", resource_name)
+ resource_name = re.sub(r'^/+', '/', resource_name)
if allow_relative:
resource_name = os.path.normpath(resource_name)
else:
if relative_path is None:
relative_path = os.curdir
- resource_name = os.path.abspath(os.path.join(relative_path, resource_name))
- resource_name = resource_name.replace("\\", "/").replace(os.path.sep, "/")
- if sys.platform.startswith("win") and os.path.isabs(resource_name):
- resource_name = "/" + resource_name
- if is_dir and not resource_name.endswith("/"):
- resource_name += "/"
+ resource_name = os.path.abspath(
+ os.path.join(relative_path, resource_name))
+ resource_name = resource_name.replace('\\', '/').replace(os.path.sep, '/')
+ if sys.platform.startswith('win') and os.path.isabs(resource_name):
+ resource_name = '/' + resource_name
+ if is_dir and not resource_name.endswith('/'):
+ resource_name += '/'
return resource_name
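+# For example, on a POSIX system (a sketch of the rules above):
+#     normalize_resource_name('dir/./file')  -> 'dir/file'
+#     normalize_resource_name('dir/file/')   -> 'dir/file/'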
# Path Pointers
######################################################################
-
-class PathPointer(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class PathPointer(object):
"""
An abstract base class for 'path pointers,' used by NLTK's data
package to identify specific paths. Two subclasses exist:
"""
-class FileSystemPathPointer(PathPointer, str):
+class FileSystemPathPointer(PathPointer, text_type):
"""
A path pointer that identifies a file which can be accessed
directly via a given absolute path.
"""
-
@py3_data
def __init__(self, _path):
"""
_path = os.path.abspath(_path)
if not os.path.exists(_path):
- raise IOError("No such file or directory: %r" % _path)
+ raise IOError('No such file or directory: %r' % _path)
self._path = _path
# There's no need to call str.__init__(), since it's a no-op;
return self._path
def open(self, encoding=None):
- stream = open(self._path, "rb")
+ stream = open(self._path, 'rb')
if encoding is not None:
stream = SeekableUnicodeStreamReader(stream, encoding)
return stream
return FileSystemPathPointer(_path)
def __repr__(self):
- return "FileSystemPathPointer(%r)" % self._path
+ # This should be a byte string under Python 2.x;
+ # we don't want transliteration here so
+ # @python_2_unicode_compatible is not used.
+ return str('FileSystemPathPointer(%r)' % self._path)
def __str__(self):
return self._path
-@deprecated("Use gzip.GzipFile instead as it also uses a buffer.")
+
class BufferedGzipFile(GzipFile):
- """A ``GzipFile`` subclass for compatibility with older nltk releases.
+ """
+ A ``GzipFile`` subclass that buffers calls to ``read()`` and ``write()``.
+ This allows faster reads and writes of data to and from gzip-compressed
+ files at the cost of using more memory.
+
+ The default buffer size is 2MB.
- Use ``GzipFile`` directly as it also buffers in all supported
- Python versions.
+ ``BufferedGzipFile`` is useful for loading large gzipped pickle objects
+ as well as writing large encoded feature files for classifier training.
"""
+ MB = 2 ** 20
+ SIZE = 2 * MB
@py3_data
- def __init__(
- self, filename=None, mode=None, compresslevel=9, fileobj=None, **kwargs
- ):
- """Return a buffered gzip file object."""
+ def __init__(self, filename=None, mode=None, compresslevel=9,
+ fileobj=None, **kwargs):
+ """
+ Return a buffered gzip file object.
+
+ :param filename: a filesystem path
+ :type filename: str
+ :param mode: a file mode which can be any of 'r', 'rb', 'a', 'ab',
+ 'w', or 'wb'
+ :type mode: str
+ :param compresslevel: The compresslevel argument is an integer from 1
+ to 9 controlling the level of compression; 1 is fastest and
+ produces the least compression, and 9 is slowest and produces the
+ most compression. The default is 9.
+ :type compresslevel: int
+ :param fileobj: a BytesIO stream to read from instead of a file.
+ :type fileobj: BytesIO
+ :param size: number of bytes to buffer during calls to read() and write()
+ :type size: int
+ :rtype: BufferedGzipFile
+ """
GzipFile.__init__(self, filename, mode, compresslevel, fileobj)
+ self._size = kwargs.get('size', self.SIZE)
+ self._nltk_buffer = BytesIO()
+ # cStringIO does not support len.
+ self._len = 0
+
+ def _reset_buffer(self):
+ # For some reason, calling BytesIO.truncate() here leads to
+ # inconsistent writes, so just point _nltk_buffer at a new BytesIO object.
+ self._nltk_buffer = BytesIO()
+ self._len = 0
+
+ def _write_buffer(self, data):
+ # Simply write to the buffer and increment the buffer size.
+ if data is not None:
+ self._nltk_buffer.write(data)
+ self._len += len(data)
+
+ def _write_gzip(self, data):
+ # Write the current buffer to the GzipFile.
+ GzipFile.write(self, self._nltk_buffer.getvalue())
+ # Then reset the buffer and write the new data to the buffer.
+ self._reset_buffer()
+ self._write_buffer(data)
+
+ def close(self):
+ # GzipFile.close() doesn't actually close anything.
+ if self.mode == GZ_WRITE:
+ self._write_gzip(None)
+ self._reset_buffer()
+ return GzipFile.close(self)
+
+ def flush(self, lib_mode=FLUSH):
+ self._nltk_buffer.flush()
+ GzipFile.flush(self, lib_mode)
- def write(self, data):
- # This is identical to GzipFile.write but does not return
- # the bytes written to retain compatibility.
- super().write(data)
+ def read(self, size=None):
+ if not size:
+ size = self._size
+ contents = BytesIO()
+ while True:
+ blocks = GzipFile.read(self, size)
+ if not blocks:
+ contents.flush()
+ break
+ contents.write(blocks)
+ return contents.getvalue()
+ else:
+ return GzipFile.read(self, size)
+
+ def write(self, data, size=-1):
+ """
+ :param data: bytes to write to file or buffer
+ :type data: bytes
+ :param size: buffer at least size bytes before writing to file
+ :type size: int
+ """
+ if not size:
+ size = self._size
+ if self._len + len(data) <= size:
+ self._write_buffer(data)
+ else:
+ self._write_gzip(data)
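+
+# Illustrative use of BufferedGzipFile (a sketch; the path is hypothetical):
+#
+#     with BufferedGzipFile('corpora/big.pickle.gz', 'rb') as f:
+#         data = f.read()    # pulls the whole file through the 2MB buffer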
class GzipFileSystemPathPointer(FileSystemPathPointer):
"""
def open(self, encoding=None):
- stream = GzipFile(self._path, "rb")
+ # Note: in Python >= 3.5, GzipFile already uses a buffered
+ # reader internally (its self._buffer attribute).
+ # See https://github.com/nltk/nltk/issues/1308
+ if sys.version.startswith('2.7') or sys.version.startswith('3.4'):
+ stream = BufferedGzipFile(self._path, 'rb')
+ else:
+ stream = GzipFile(self._path, 'rb')
if encoding:
stream = SeekableUnicodeStreamReader(stream, encoding)
return stream
A path pointer that identifies a file contained within a zipfile,
which can be accessed by reading that zipfile.
"""
-
@py3_data
- def __init__(self, zipfile, entry=""):
+ def __init__(self, zipfile, entry=''):
"""
Create a new path pointer pointing at the specified entry
in the given zipfile.
:raise IOError: If the given zipfile does not exist, or if it
does not contain the specified entry.
"""
- if isinstance(zipfile, str):
+ if isinstance(zipfile, string_types):
zipfile = OpenOnDemandZipFile(os.path.abspath(zipfile))
+ # Normalize the entry string, it should be relative:
+ entry = normalize_resource_name(entry, True, '/').lstrip('/')
+
# Check that the entry exists:
if entry:
-
- # Normalize the entry string, it should be relative:
- entry = normalize_resource_name(entry, True, "/").lstrip("/")
-
try:
zipfile.getinfo(entry)
except Exception:
# the zip file. So if `entry` is a directory name,
# then check if the zipfile contains any files that
# are under the given directory.
- if entry.endswith("/") and [
- n for n in zipfile.namelist() if n.startswith(entry)
- ]:
+ if (entry.endswith('/') and
+ [n for n in zipfile.namelist() if n.startswith(entry)]):
pass # zipfile contains a file in that directory.
else:
# Otherwise, complain.
- raise IOError(
- "Zipfile %r does not contain %r" % (zipfile.filename, entry)
- )
+ raise IOError('Zipfile %r does not contain %r' %
+ (zipfile.filename, entry))
self._zipfile = zipfile
self._entry = entry
def open(self, encoding=None):
data = self._zipfile.read(self._entry)
stream = BytesIO(data)
- if self._entry.endswith(".gz"):
- stream = GzipFile(self._entry, fileobj=stream)
+ if self._entry.endswith('.gz'):
+ # Note: in Python >= 3.5, GzipFile already uses a buffered
+ # reader internally (its self._buffer attribute).
+ # See https://github.com/nltk/nltk/issues/1308
+ if sys.version.startswith('2.7') or sys.version.startswith('3.4'):
+ stream = BufferedGzipFile(self._entry, fileobj=stream)
+ else:
+ stream = GzipFile(self._entry, fileobj=stream)
elif encoding is not None:
stream = SeekableUnicodeStreamReader(stream, encoding)
return stream
return self._zipfile.getinfo(self._entry).file_size
def join(self, fileid):
- entry = "%s/%s" % (self._entry, fileid)
+ entry = '%s/%s' % (self._entry, fileid)
return ZipFilePathPointer(self._zipfile, entry)
def __repr__(self):
- return str("ZipFilePathPointer(%r, %r)") % (self._zipfile.filename, self._entry)
+ return str('ZipFilePathPointer(%r, %r)') % (
+ self._zipfile.filename, self._entry)
def __str__(self):
- return os.path.normpath(os.path.join(self._zipfile.filename, self._entry))
+ return os.path.normpath(os.path.join(self._zipfile.filename,
+ self._entry))
######################################################################
paths = path
# Check if the resource name includes a zipfile name
- m = re.match(r"(.*\.zip)/?(.*)$|", resource_name)
+ m = re.match(r'(.*\.zip)/?(.*)$|', resource_name)
zipfile, zipentry = m.groups()
# Check each item in our path
for path_ in paths:
# Is the path item a zipfile?
- if path_ and (os.path.isfile(path_) and path_.endswith(".zip")):
+ if path_ and (os.path.isfile(path_) and path_.endswith('.zip')):
try:
return ZipFilePathPointer(path_, resource_name)
except IOError:
if zipfile is None:
p = os.path.join(path_, url2pathname(resource_name))
if os.path.exists(p):
- if p.endswith(".gz"):
+ if p.endswith('.gz'):
return GzipFileSystemPathPointer(p)
else:
return FileSystemPathPointer(p)
# again, assuming that one of the path components is inside a
# zipfile of the same name.
if zipfile is None:
- pieces = resource_name.split("/")
+ pieces = resource_name.split('/')
for i in range(len(pieces)):
- modified_name = "/".join(pieces[:i] + [pieces[i] + ".zip"] + pieces[i:])
+ modified_name = '/'.join(pieces[:i] +
+ [pieces[i] + '.zip'] + pieces[i:])
try:
return find(modified_name, paths)
except LookupError:
pass
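+ # For example, a lookup of 'corpora/brown/ca01' retries as
+ # 'corpora.zip/corpora/brown/ca01', then 'corpora/brown.zip/brown/ca01',
+ # then 'corpora/brown/ca01.zip/ca01'.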
# Identify the package (i.e. the .zip file) to download.
- resource_zipname = resource_name.split("/")[1]
- if resource_zipname.endswith(".zip"):
- resource_zipname = resource_zipname.rpartition(".")[0]
+ resource_zipname = resource_name.split('/')[1]
+ if resource_zipname.endswith('.zip'):
+ resource_zipname = resource_zipname.rpartition('.')[0]
# Display a friendly error message if the resource wasn't found:
- msg = str(
- "Resource \33[93m{resource}\033[0m not found.\n"
- "Please use the NLTK Downloader to obtain the resource:\n\n"
- "\33[31m" # To display red text in terminal.
- ">>> import nltk\n"
- ">>> nltk.download('{resource}')\n"
- "\033[0m"
- ).format(resource=resource_zipname)
+ msg = str("Resource \33[93m{resource}\033[0m not found.\n"
+ "Please use the NLTK Downloader to obtain the resource:\n\n"
+ "\33[31m" # To display red text in terminal.
+ ">>> import nltk\n"
+ ">>> nltk.download(\'{resource}\')\n"
+ "\033[0m").format(resource=resource_zipname)
msg = textwrap_indent(msg)
- msg += "\n For more information see: https://www.nltk.org/data.html\n"
-
- msg += "\n Attempted to load \33[93m{resource_name}\033[0m\n".format(
- resource_name=resource_name
- )
-
- msg += "\n Searched in:" + "".join("\n - %r" % d for d in paths)
- sep = "*" * 70
- resource_not_found = "\n%s\n%s\n%s\n" % (sep, msg, sep)
+ msg += '\n Searched in:' + ''.join('\n - %r' % d for d in paths)
+ sep = '*' * 70
+ resource_not_found = '\n%s\n%s\n%s\n' % (sep, msg, sep)
raise LookupError(resource_not_found)
"""
resource_url = normalize_resource_url(resource_url)
if filename is None:
- if resource_url.startswith("file:"):
+ if resource_url.startswith('file:'):
filename = os.path.split(resource_url)[-1]
else:
- filename = re.sub(r"(^\w+:)?.*/", "", resource_url)
+ filename = re.sub(r'(^\w+:)?.*/', '', resource_url)
if os.path.exists(filename):
filename = os.path.abspath(filename)
raise ValueError("File %r already exists!" % filename)
if verbose:
- print("Retrieving %r, saving to %r" % (resource_url, filename))
+ print('Retrieving %r, saving to %r' % (resource_url, filename))
# Open the input & output streams.
infile = _open(resource_url)
#: load() method. Keys are format names, and values are format
#: descriptions.
FORMATS = {
- "pickle": "A serialized python object, stored using the pickle module.",
- "json": "A serialized python object, stored using the json module.",
- "yaml": "A serialized python object, stored using the yaml module.",
- "cfg": "A context free grammar.",
- "pcfg": "A probabilistic CFG.",
- "fcfg": "A feature CFG.",
- "fol": "A list of first order logic expressions, parsed with "
- "nltk.sem.logic.Expression.fromstring.",
- "logic": "A list of first order logic expressions, parsed with "
- "nltk.sem.logic.LogicParser. Requires an additional logic_parser "
- "parameter",
- "val": "A semantic valuation, parsed by nltk.sem.Valuation.fromstring.",
- "raw": "The raw (byte string) contents of a file.",
- "text": "The raw (unicode string) contents of a file. ",
+ 'pickle': "A serialized python object, stored using the pickle module.",
+ 'json': "A serialized python object, stored using the json module.",
+ 'yaml': "A serialized python object, stored using the yaml module.",
+ 'cfg': "A context free grammar.",
+ 'pcfg': "A probabilistic CFG.",
+ 'fcfg': "A feature CFG.",
+ 'fol': "A list of first order logic expressions, parsed with "
+ "nltk.sem.logic.Expression.fromstring.",
+ 'logic': "A list of first order logic expressions, parsed with "
+ "nltk.sem.logic.LogicParser. Requires an additional logic_parser "
+ "parameter",
+ 'val': "A semantic valuation, parsed by nltk.sem.Valuation.fromstring.",
+ 'raw': "The raw (byte string) contents of a file.",
+ 'text': "The raw (unicode string) contents of a file. "
}
#: A dictionary mapping from file extensions to format names, used
#: by load() when format="auto" to decide the format for a
#: given resource url.
AUTO_FORMATS = {
- "pickle": "pickle",
- "json": "json",
- "yaml": "yaml",
- "cfg": "cfg",
- "pcfg": "pcfg",
- "fcfg": "fcfg",
- "fol": "fol",
- "logic": "logic",
- "val": "val",
- "txt": "text",
- "text": "text",
+ 'pickle': 'pickle',
+ 'json': 'json',
+ 'yaml': 'yaml',
+ 'cfg': 'cfg',
+ 'pcfg': 'pcfg',
+ 'fcfg': 'fcfg',
+ 'fol': 'fol',
+ 'logic': 'logic',
+ 'val': 'val',
+ 'txt': 'text',
+ 'text': 'text',
}
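+# Resolution sketch for format='auto' (mirrors the code in load() below):
+#     parts = 'toy.pickle.gz'.split('.')    # ['toy', 'pickle', 'gz']
+#     ext = parts[-2] if parts[-1] == 'gz' else parts[-1]
+#     AUTO_FORMATS[ext]                     # -> 'pickle'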
-def load(
- resource_url,
- format="auto",
- cache=True,
- verbose=False,
- logic_parser=None,
- fstruct_reader=None,
- encoding=None,
-):
+def load(resource_url, format='auto', cache=True, verbose=False,
+ logic_parser=None, fstruct_reader=None, encoding=None):
"""
Load a given resource from the NLTK data package. The following
resource formats are currently supported:
:type cache: bool
:param cache: If true, add this resource to a cache. If load()
finds a resource in its cache, then it will return it from the
- cache rather than loading it.
+ cache rather than loading it. The cache uses weak references,
+ so a resource will automatically be expunged from the cache
+ when no more objects are using it.
:type verbose: bool
:param verbose: If true, print a message when loading a resource.
Messages are not displayed when a resource is retrieved from
resource_url = add_py3_data(resource_url)
# Determine the format of the resource.
- if format == "auto":
- resource_url_parts = resource_url.split(".")
+ if format == 'auto':
+ resource_url_parts = resource_url.split('.')
ext = resource_url_parts[-1]
- if ext == "gz":
+ if ext == 'gz':
ext = resource_url_parts[-2]
format = AUTO_FORMATS.get(ext)
if format is None:
- raise ValueError(
- "Could not determine format for %s based "
- 'on its file\nextension; use the "format" '
- "argument to specify the format explicitly." % resource_url
- )
+ raise ValueError('Could not determine format for %s based '
+ 'on its file\nextension; use the "format" '
+ 'argument to specify the format explicitly.'
+ % resource_url)
if format not in FORMATS:
- raise ValueError("Unknown format type: %s!" % (format,))
+ raise ValueError('Unknown format type: %s!' % (format,))
# If we've cached the resource, then just return it.
if cache:
resource_val = _resource_cache.get((resource_url, format))
if resource_val is not None:
if verbose:
- print("<<Using cached copy of %s>>" % (resource_url,))
+ print('<<Using cached copy of %s>>' % (resource_url,))
return resource_val
# Let the user know what's going on.
if verbose:
- print("<<Loading %s>>" % (resource_url,))
+ print('<<Loading %s>>' % (resource_url,))
# Load the resource.
opened_resource = _open(resource_url)
- if format == "raw":
+ if format == 'raw':
resource_val = opened_resource.read()
- elif format == "pickle":
+ elif format == 'pickle':
resource_val = pickle.load(opened_resource)
- elif format == "json":
+ elif format == 'json':
import json
from nltk.jsontags import json_tags
-
resource_val = json.load(opened_resource)
tag = None
if len(resource_val) != 1:
tag = next(iter(resource_val.keys()))
if tag not in json_tags:
- raise ValueError("Unknown json tag.")
- elif format == "yaml":
+ raise ValueError('Unknown json tag.')
+ elif format == 'yaml':
import yaml
-
- resource_val = yaml.safe_load(opened_resource)
+ resource_val = yaml.load(opened_resource)
else:
# The resource is a text format.
binary_data = opened_resource.read()
string_data = binary_data.decode(encoding)
else:
try:
- string_data = binary_data.decode("utf-8")
+ string_data = binary_data.decode('utf-8')
except UnicodeDecodeError:
- string_data = binary_data.decode("latin-1")
- if format == "text":
+ string_data = binary_data.decode('latin-1')
+ if format == 'text':
resource_val = string_data
- elif format == "cfg":
- resource_val = nltk.grammar.CFG.fromstring(string_data, encoding=encoding)
- elif format == "pcfg":
- resource_val = nltk.grammar.PCFG.fromstring(string_data, encoding=encoding)
- elif format == "fcfg":
+ elif format == 'cfg':
+ resource_val = nltk.grammar.CFG.fromstring(
+ string_data, encoding=encoding)
+ elif format == 'pcfg':
+ resource_val = nltk.grammar.PCFG.fromstring(
+ string_data, encoding=encoding)
+ elif format == 'fcfg':
resource_val = nltk.grammar.FeatureGrammar.fromstring(
- string_data,
- logic_parser=logic_parser,
- fstruct_reader=fstruct_reader,
- encoding=encoding,
- )
- elif format == "fol":
+ string_data, logic_parser=logic_parser,
+ fstruct_reader=fstruct_reader, encoding=encoding)
+ elif format == 'fol':
resource_val = nltk.sem.read_logic(
- string_data,
- logic_parser=nltk.sem.logic.LogicParser(),
- encoding=encoding,
- )
- elif format == "logic":
+ string_data, logic_parser=nltk.sem.logic.LogicParser(),
+ encoding=encoding)
+ elif format == 'logic':
resource_val = nltk.sem.read_logic(
- string_data, logic_parser=logic_parser, encoding=encoding
- )
- elif format == "val":
- resource_val = nltk.sem.read_valuation(string_data, encoding=encoding)
+ string_data, logic_parser=logic_parser, encoding=encoding)
+ elif format == 'val':
+ resource_val = nltk.sem.read_valuation(
+ string_data, encoding=encoding)
else:
- raise AssertionError(
- "Internal NLTK error: Format %s isn't "
- "handled by nltk.data.load()" % (format,)
- )
+ raise AssertionError("Internal NLTK error: Format %s isn't "
+ "handled by nltk.data.load()" % (format,))
opened_resource.close()
return resource_val
-def show_cfg(resource_url, escape="##"):
+def show_cfg(resource_url, escape='##'):
"""
Write out a grammar file, ignoring escaped and empty lines.
:param escape: Prepended string that signals lines to be ignored
"""
resource_url = normalize_resource_url(resource_url)
- resource_val = load(resource_url, format="text", cache=False)
+ resource_val = load(resource_url, format='text', cache=False)
lines = resource_val.splitlines()
for l in lines:
if l.startswith(escape):
continue
- if re.match("^$", l):
+ if re.match('^$', l):
continue
print(l)
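+# For example (a hypothetical resource id):
+#     show_cfg('grammars/sample_grammars/toy.cfg')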
resource_url = normalize_resource_url(resource_url)
protocol, path_ = split_resource_url(resource_url)
- if protocol is None or protocol.lower() == "nltk":
- return find(path_, path + [""]).open()
- elif protocol.lower() == "file":
+ if protocol is None or protocol.lower() == 'nltk':
+ return find(path_, path + ['']).open()
+ elif protocol.lower() == 'file':
# urllib might not use mode='rb', so handle this one ourselves:
- return find(path_, [""]).open()
+ return find(path_, ['']).open()
else:
return urlopen(resource_url)
-
######################################################################
# Lazy Resource Loader
######################################################################
+# We shouldn't apply @python_2_unicode_compatible
+# decorator to LazyLoader, this is resource.__class__ responsibility.
+
class LazyLoader(object):
+
@py3_data
def __init__(self, _path):
self._path = _path
# __class__ to something new:
return repr(self)
-
######################################################################
# Open-On-Demand ZipFile
######################################################################
file-like object (to allow re-opening). ``OpenOnDemandZipFile`` is
read-only (i.e., ``write()`` and ``writestr()`` are disabled).
"""
-
@py3_data
def __init__(self, filename):
- if not isinstance(filename, str):
- raise TypeError("ReopenableZipFile filename must be a string")
+ if not isinstance(filename, string_types):
+ raise TypeError('ReopenableZipFile filename must be a string')
zipfile.ZipFile.__init__(self, filename)
assert self.filename == filename
self.close()
def read(self, name):
assert self.fp is None
- self.fp = open(self.filename, "rb")
+ self.fp = open(self.filename, 'rb')
value = zipfile.ZipFile.read(self, name)
# Note: _fileRefCnt must be set for Python 2 and 3 compatible code.
# Since we only opened one file here, we add 1.
def write(self, *args, **kwargs):
""":raise NotImplementedError: OpenOnDemandZipfile is read-only"""
- raise NotImplementedError("OpenOnDemandZipfile is read-only")
+ raise NotImplementedError('OpenOnDemandZipfile is read-only')
def writestr(self, *args, **kwargs):
""":raise NotImplementedError: OpenOnDemandZipfile is read-only"""
- raise NotImplementedError("OpenOnDemandZipfile is read-only")
+ raise NotImplementedError('OpenOnDemandZipfile is read-only')
def __repr__(self):
- return repr(str("OpenOnDemandZipFile(%r)") % self.filename)
-
+ return repr(str('OpenOnDemandZipFile(%r)') % self.filename)
######################################################################
-# { Seekable Unicode Stream Reader
+#{ Seekable Unicode Stream Reader
######################################################################
this shouldn't cause a problem with any of python's builtin
unicode encodings.
"""
-
DEBUG = True  #: If true, then perform extra sanity checks.
@py3_data
- def __init__(self, stream, encoding, errors="strict"):
+ def __init__(self, stream, encoding, errors='strict'):
# Rewind the stream to its beginning.
stream.seek(0)
"""The function that is used to decode byte strings into
unicode strings."""
- self.bytebuffer = b""
+ self.bytebuffer = b''
"""A buffer to use bytes that have been read but have not yet
been decoded. This is only used when the final bytes from
a read do not form a complete encoding for a character."""
"""The length of the byte order marker at the beginning of
the stream (or None for no byte order marker)."""
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# Read methods
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
def read(self, size=None):
"""
# If linebuffer is not empty, then include it in the result
if self.linebuffer:
- chars = "".join(self.linebuffer) + chars
+ chars = ''.join(self.linebuffer) + chars
self.linebuffer = None
self._rewind_numchars = None
return chars
- def discard_line(self):
- if self.linebuffer and len(self.linebuffer) > 1:
- line = self.linebuffer.pop(0)
- self._rewind_numchars += len(line)
- else:
- self.stream.readline()
-
def readline(self, size=None):
"""
Read a line of text, decode it using this reader's encoding,
return line
readsize = size or 72
- chars = ""
+ chars = ''
# If there's a remaining incomplete line in the buffer, add it.
if self.linebuffer:
# If we're at a '\r', then read one extra character, since
# it might be a '\n', to get the proper line ending.
- if new_chars and new_chars.endswith("\r"):
+ if new_chars and new_chars.endswith('\r'):
new_chars += self._read(1)
chars += new_chars
if len(lines) > 1:
line = lines[0]
self.linebuffer = lines[1:]
- self._rewind_numchars = len(new_chars) - (len(chars) - len(line))
+ self._rewind_numchars = (len(new_chars) -
+ (len(chars) - len(line)))
self._rewind_checkpoint = startpos
break
elif len(lines) == 1:
"""Return self"""
return self
- def __del__(self):
- # let garbage collector deal with still opened streams
- if not self.closed:
- self.close()
-
def xreadlines(self):
"""Return self"""
return self
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# Pass-through methods & properties
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
@property
def closed(self):
"""
self.stream.close()
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# Seek and tell
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
def seek(self, offset, whence=0):
"""
typically be negative).
"""
if whence == 1:
- raise ValueError(
- "Relative seek is not supported for "
- "SeekableUnicodeStreamReader -- consider "
- "using char_seek_forward() instead."
- )
+ raise ValueError('Relative seek is not supported for '
+ 'SeekableUnicodeStreamReader -- consider '
+ 'using char_seek_forward() instead.')
self.stream.seek(offset, whence)
self.linebuffer = None
- self.bytebuffer = b""
+ self.bytebuffer = b''
self._rewind_numchars = None
self._rewind_checkpoint = self.stream.tell()
Move the read pointer forward by ``offset`` characters.
"""
if offset < 0:
- raise ValueError("Negative offsets are not supported")
+ raise ValueError('Negative offsets are not supported')
# Clear all buffers.
self.seek(self.tell())
# Perform the seek operation.
"""
if est_bytes is None:
est_bytes = offset
- bytes = b""
+ bytes = b''
while True:
# Read in a block of bytes.
orig_filepos = self.stream.tell()
# Calculate an estimate of where we think the newline is.
- bytes_read = (orig_filepos - len(self.bytebuffer)) - self._rewind_checkpoint
+ bytes_read = ((orig_filepos - len(self.bytebuffer)) -
+ self._rewind_checkpoint)
buf_size = sum(len(line) for line in self.linebuffer)
- est_bytes = int(
- (bytes_read * self._rewind_numchars / (self._rewind_numchars + buf_size))
- )
+ est_bytes = int((bytes_read * self._rewind_numchars /
+ (self._rewind_numchars + buf_size)))
self.stream.seek(self._rewind_checkpoint)
self._char_seek_forward(self._rewind_numchars, est_bytes)
if self.DEBUG:
self.stream.seek(filepos)
check1 = self._incr_decode(self.stream.read(50))[0]
- check2 = "".join(self.linebuffer)
+ check2 = ''.join(self.linebuffer)
assert check1.startswith(check2) or check2.startswith(check1)
# Return to our original filepos (so we don't have to throw
# Return the calculated filepos
return filepos
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# Helper methods
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
def _read(self, size=None):
"""
unicode string. ``linebuffer`` is not included in the result.
"""
if size == 0:
- return ""
+ return ''
# Skip past the byte order marker, if present.
if self._bom and self.stream.tell() == 0:
"""
while True:
try:
- return self.decode(bytes, "strict")
+ return self.decode(bytes, 'strict')
except UnicodeDecodeError as exc:
# If the exception occurs at the end of the string,
# then assume that it's a truncation error.
if exc.end == len(bytes):
- return self.decode(bytes[: exc.start], self.errors)
+ return self.decode(bytes[:exc.start], self.errors)
# Otherwise, if we're being strict, then raise it.
- elif self.errors == "strict":
+ elif self.errors == 'strict':
raise
# If we're not strict, then re-process it with our
return self.decode(bytes, self.errors)
_BOM_TABLE = {
- "utf8": [(codecs.BOM_UTF8, None)],
- "utf16": [(codecs.BOM_UTF16_LE, "utf16-le"), (codecs.BOM_UTF16_BE, "utf16-be")],
- "utf16le": [(codecs.BOM_UTF16_LE, None)],
- "utf16be": [(codecs.BOM_UTF16_BE, None)],
- "utf32": [(codecs.BOM_UTF32_LE, "utf32-le"), (codecs.BOM_UTF32_BE, "utf32-be")],
- "utf32le": [(codecs.BOM_UTF32_LE, None)],
- "utf32be": [(codecs.BOM_UTF32_BE, None)],
+ 'utf8': [(codecs.BOM_UTF8, None)],
+ 'utf16': [(codecs.BOM_UTF16_LE, 'utf16-le'),
+ (codecs.BOM_UTF16_BE, 'utf16-be')],
+ 'utf16le': [(codecs.BOM_UTF16_LE, None)],
+ 'utf16be': [(codecs.BOM_UTF16_BE, None)],
+ 'utf32': [(codecs.BOM_UTF32_LE, 'utf32-le'),
+ (codecs.BOM_UTF32_BE, 'utf32-be')],
+ 'utf32le': [(codecs.BOM_UTF32_LE, None)],
+ 'utf32be': [(codecs.BOM_UTF32_BE, None)],
}
def _check_bom(self):
# Normalize our encoding name
- enc = re.sub("[ -]", "", self.encoding.lower())
+ enc = re.sub('[ -]', '', self.encoding.lower())
# Look up our encoding in the BOM table.
bom_info = self._BOM_TABLE.get(enc)
return None
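+# Illustrative round-trip with SeekableUnicodeStreamReader (a sketch):
+#
+#     from io import BytesIO
+#     reader = SeekableUnicodeStreamReader(
+#         BytesIO(u'caf\xe9 latte'.encode('utf-8')), 'utf-8')
+#     pos = reader.tell()
+#     text = reader.read()    # decode the whole stream
+#     reader.seek(pos)        # rewind to a byte offset and reread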
-__all__ = [
- "path",
- "PathPointer",
- "FileSystemPathPointer",
- "BufferedGzipFile",
- "GzipFileSystemPathPointer",
- "GzipFileSystemPathPointer",
- "find",
- "retrieve",
- "FORMATS",
- "AUTO_FORMATS",
- "load",
- "show_cfg",
- "clear_cache",
- "LazyLoader",
- "OpenOnDemandZipFile",
- "GzipFileSystemPathPointer",
- "SeekableUnicodeStreamReader",
-]
+__all__ = ['path', 'PathPointer', 'FileSystemPathPointer', 'BufferedGzipFile',
+ 'GzipFileSystemPathPointer', 'find', 'retrieve', 'FORMATS',
+ 'AUTO_FORMATS', 'load', 'show_cfg', 'clear_cache', 'LazyLoader',
+ 'OpenOnDemandZipFile', 'SeekableUnicodeStreamReader']
Included in NLTK for its support of a nice memoization decorator.
"""
-
-__docformat__ = "restructuredtext en"
+from __future__ import print_function
+__docformat__ = 'restructuredtext en'
## The basic trick is to generate the source code for the decorated function
## with the right signature and to evaluate it.
# Hack to keep NLTK's "tokenize" module from colliding with the "tokenize" in
# the Python standard library.
-OLD_SYS_PATH = sys.path[:]
-sys.path = [p for p in sys.path if p and "nltk" not in p]
+old_sys_path = sys.path[:]
+sys.path = [p for p in sys.path if "nltk" not in p]
import inspect
+sys.path = old_sys_path
-sys.path = OLD_SYS_PATH
-
-def __legacysignature(signature):
- """
- For retrocompatibility reasons, we don't use a standard Signature.
- Instead, we use the string generated by this method.
- Basically, from a Signature we create a string and remove the default values.
- """
- listsignature = str(signature)[1:-1].split(",")
- for counter, param in enumerate(listsignature):
- if param.count("=") > 0:
- listsignature[counter] = param[0:param.index("=")].strip()
- else:
- listsignature[counter] = param.strip()
- return ", ".join(listsignature)
-
+try:
+ set
+except NameError:
+ from sets import Set as set
def getinfo(func):
"""
- argnames (the names of the arguments : list)
- defaults (the values of the default arguments : tuple)
- signature (the signature : str)
- - fullsignature (the full signature : Signature)
- doc (the docstring : str)
- module (the module name : str)
- dict (the function __dict__ : str)
>>> info["signature"]
'self, x, y, *args, **kw'
-
- >>> info["fullsignature"]
- <Signature (self, x=1, y=2, *args, **kw)>
"""
assert inspect.ismethod(func) or inspect.isfunction(func)
- argspec = inspect.getfullargspec(func)
- regargs, varargs, varkwargs = argspec[:3]
+ if sys.version_info[0] >= 3:
+ argspec = inspect.getfullargspec(func)
+ else:
+ argspec = inspect.getargspec(func)
+ regargs, varargs, varkwargs, defaults = argspec[:4]
argnames = list(regargs)
if varargs:
argnames.append(varargs)
if varkwargs:
argnames.append(varkwargs)
- fullsignature = inspect.signature(func)
- # Convert Signature to str
- signature = __legacysignature(fullsignature)
-
+ signature = inspect.formatargspec(regargs, varargs, varkwargs, defaults,
+ formatvalue=lambda value: "")[1:-1]
# pypy compatibility
- if hasattr(func, "__closure__"):
+ if hasattr(func, '__closure__'):
_closure = func.__closure__
_globals = func.__globals__
else:
_closure = func.func_closure
_globals = func.func_globals
- return dict(
- name=func.__name__,
- argnames=argnames,
- signature=signature,
- fullsignature=fullsignature,
- defaults=func.__defaults__,
- doc=func.__doc__,
- module=func.__module__,
- dict=func.__dict__,
- globals=_globals,
- closure=_closure,
- )
-
+ return dict(name=func.__name__, argnames=argnames, signature=signature,
+ defaults = func.__defaults__, doc=func.__doc__,
+ module=func.__module__, dict=func.__dict__,
+ globals=_globals, closure=_closure)
+# akin to functools.update_wrapper
def update_wrapper(wrapper, model, infodict=None):
- " akin to functools.update_wrapper "
infodict = infodict or getinfo(model)
- wrapper.__name__ = infodict["name"]
- wrapper.__doc__ = infodict["doc"]
- wrapper.__module__ = infodict["module"]
- wrapper.__dict__.update(infodict["dict"])
- wrapper.__defaults__ = infodict["defaults"]
+ wrapper.__name__ = infodict['name']
+ wrapper.__doc__ = infodict['doc']
+ wrapper.__module__ = infodict['module']
+ wrapper.__dict__.update(infodict['dict'])
+ wrapper.__defaults__ = infodict['defaults']
wrapper.undecorated = model
return wrapper
-
def new_wrapper(wrapper, model):
"""
An improvement over functools.update_wrapper. The wrapper is a generic
"""
if isinstance(model, dict):
infodict = model
- else: # assume model is a function
+ else: # assume model is a function
infodict = getinfo(model)
- assert (
- not "_wrapper_" in infodict["argnames"]
- ), '"_wrapper_" is a reserved argument name!'
+ assert not '_wrapper_' in infodict["argnames"], (
+ '"_wrapper_" is a reserved argument name!')
src = "lambda %(signature)s: _wrapper_(%(signature)s)" % infodict
funcopy = eval(src, dict(_wrapper_=wrapper))
return update_wrapper(funcopy, model, infodict)
-
# helper used in decorator_factory
def __call__(self, func):
- return new_wrapper(lambda *a, **k: self.call(func, *a, **k), func)
-
+ return new_wrapper(lambda *a, **k : self.call(func, *a, **k), func)
def decorator_factory(cls):
"""
method.
"""
attrs = set(dir(cls))
- if "__call__" in attrs:
- raise TypeError(
- "You cannot decorate a class with a nontrivial " "__call__ method"
- )
- if "call" not in attrs:
- raise TypeError("You cannot decorate a class without a " ".call method")
+ if '__call__' in attrs:
+ raise TypeError('You cannot decorate a class with a nontrivial '
+ '__call__ method')
+ if 'call' not in attrs:
+ raise TypeError('You cannot decorate a class without a '
+ '.call method')
cls.__call__ = __call__
return cls
-
def decorator(caller):
"""
General purpose decorator factory: takes a caller function as
"""
if inspect.isclass(caller):
return decorator_factory(caller)
-
- def _decorator(func): # the real meat is here
+ def _decorator(func): # the real meat is here
infodict = getinfo(func)
- argnames = infodict["argnames"]
- assert not (
- "_call_" in argnames or "_func_" in argnames
- ), "You cannot use _call_ or _func_ as argument names!"
+ argnames = infodict['argnames']
+ assert not ('_call_' in argnames or '_func_' in argnames), (
+ 'You cannot use _call_ or _func_ as argument names!')
src = "lambda %(signature)s: _call_(_func_, %(signature)s)" % infodict
# import sys; print >> sys.stderr, src # for debugging purposes
dec_func = eval(src, dict(_func_=func, _call_=caller))
return update_wrapper(dec_func, func, infodict)
-
return update_wrapper(_decorator, caller)
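+# For example, a minimal tracing caller (a sketch):
+#
+#     def trace(f, *args, **kw):
+#         print("calling %s" % f.__name__)
+#         return f(*args, **kw)
+#     trace = decorator(trace)    # now usable as @trace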
-
def getattr_(obj, name, default_thunk):
"Similar to .setdefault in dictionaries."
try:
setattr(obj, name, default)
return default
-
@decorator
def memoize(func, *args):
dic = getattr_(func, "memoize_dic", dict)
# memoize_dic is created at the first call
if args in dic:
return dic[args]
- result = func(*args)
- dic[args] = result
- return result
+ else:
+ result = func(*args)
+ dic[args] = result
+ return result
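+# e.g.:
+#     @memoize
+#     def fib(n):
+#         return n if n < 2 else fib(n - 1) + fib(n - 2)
+#     fib(30)    # each distinct n is computed only once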
########################## LEGALESE ###############################
# Natural Language Toolkit: Corpus & Model Downloader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
python -m nltk.downloader [-d DATADIR] [-q] [-f] [-k] PACKAGE_IDS
"""
-# ----------------------------------------------------------------------
+#----------------------------------------------------------------------
+from __future__ import print_function, division, unicode_literals
"""
default: unzip or not?
"""
-import time, os, zipfile, sys, textwrap, threading, itertools, shutil, functools
-import subprocess
+import time, os, zipfile, sys, textwrap, threading, itertools, shutil
from hashlib import md5
-from xml.etree import ElementTree
try:
TKINTER = True
- from tkinter import (
- Tk,
- Frame,
- Label,
- Entry,
- Button,
- Canvas,
- Menu,
- IntVar,
- TclError,
- )
- from tkinter.messagebox import showerror
+ from six.moves.tkinter import (Tk, Frame, Label, Entry, Button, Canvas,
+ Menu, IntVar, TclError)
+ from six.moves.tkinter_messagebox import showerror
from nltk.draw.table import Table
from nltk.draw.util import ShowText
-except ImportError:
+except:
TKINTER = False
TclError = ValueError
-from urllib.request import urlopen
-from urllib.error import HTTPError, URLError
+from xml.etree import ElementTree
+
+from six import string_types, text_type
+from six.moves import input
+from six.moves.urllib.request import urlopen
+from six.moves.urllib.error import HTTPError, URLError
import nltk
-
-# urllib2 = nltk.internals.import_from_stdlib('urllib2')
+from nltk.compat import python_2_unicode_compatible
+#urllib2 = nltk.internals.import_from_stdlib('urllib2')
######################################################################
# Directory entry objects (from the data server's index file)
######################################################################
+@python_2_unicode_compatible
class Package(object):
"""
A directory entry for a downloadable package. These entries are
that file is a zip file, then it can be automatically decompressed
when the package is installed.
"""
-
- def __init__(
- self,
- id,
- url,
- name=None,
- subdir="",
- size=None,
- unzipped_size=None,
- checksum=None,
- svn_revision=None,
- copyright="Unknown",
- contact="Unknown",
- license="Unknown",
- author="Unknown",
- unzip=True,
- **kw
- ):
+ def __init__(self, id, url, name=None, subdir='',
+ size=None, unzipped_size=None,
+ checksum=None, svn_revision=None,
+ copyright='Unknown', contact='Unknown',
+ license='Unknown', author='Unknown',
+ unzip=True,
+ **kw):
self.id = id
"""A unique identifier for this package."""
self.author = author
"""Author of this package."""
- ext = os.path.splitext(url.split("/")[-1])[1]
- self.filename = os.path.join(subdir, id + ext)
+ ext = os.path.splitext(url.split('/')[-1])[1]
+ self.filename = os.path.join(subdir, id+ext)
"""The filename that should be used for this package's file. It
is formed by joining ``self.subdir`` with ``self.id``, and
using the same extension as ``url``."""
- self.unzip = bool(int(unzip)) # '0' or '1'
+ self.unzip = bool(int(unzip)) # '0' or '1'
"""A flag indicating whether this corpus should be unzipped by
default."""
@staticmethod
def fromxml(xml):
- if isinstance(xml, str):
+ if isinstance(xml, string_types):
xml = ElementTree.parse(xml)
for key in xml.attrib:
- xml.attrib[key] = str(xml.attrib[key])
+ xml.attrib[key] = text_type(xml.attrib[key])
return Package(**xml.attrib)
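+ # An index entry looks roughly like (attributes abridged):
+ #     <package id="brown" url=".../brown.zip" subdir="corpora" ... />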
def __lt__(self, other):
return self.id < other.id
def __repr__(self):
- return "<Package %s>" % self.id
-
+ return '<Package %s>' % self.id
+@python_2_unicode_compatible
class Collection(object):
"""
A directory entry for a collection of downloadable packages.
These entries are extracted from the XML index file that is
downloaded by ``Downloader``.
"""
-
def __init__(self, id, children, name=None, **kw):
self.id = id
"""A unique identifier for this collection."""
@staticmethod
def fromxml(xml):
- if isinstance(xml, str):
+ if isinstance(xml, string_types):
xml = ElementTree.parse(xml)
for key in xml.attrib:
- xml.attrib[key] = str(xml.attrib[key])
- children = [child.get("ref") for child in xml.findall("item")]
+ xml.attrib[key] = text_type(xml.attrib[key])
+ children = [child.get('ref') for child in xml.findall('item')]
return Collection(children=children, **xml.attrib)
def __lt__(self, other):
return self.id < other.id
def __repr__(self):
- return "<Collection %s>" % self.id
-
+ return '<Collection %s>' % self.id
######################################################################
# Message Passing Objects
######################################################################
-
class DownloaderMessage(object):
"""A status message object, used by ``incr_download`` to
communicate its progress."""
-
-
class StartCollectionMessage(DownloaderMessage):
"""Data server has started working on a collection of packages."""
-
- def __init__(self, collection):
- self.collection = collection
-
-
+ def __init__(self, collection): self.collection = collection
class FinishCollectionMessage(DownloaderMessage):
"""Data server has finished working on a collection of packages."""
-
- def __init__(self, collection):
- self.collection = collection
-
-
+ def __init__(self, collection): self.collection = collection
class StartPackageMessage(DownloaderMessage):
"""Data server has started working on a package."""
-
- def __init__(self, package):
- self.package = package
-
-
+ def __init__(self, package): self.package = package
class FinishPackageMessage(DownloaderMessage):
"""Data server has finished working on a package."""
-
- def __init__(self, package):
- self.package = package
-
-
+ def __init__(self, package): self.package = package
class StartDownloadMessage(DownloaderMessage):
"""Data server has started downloading a package."""
-
- def __init__(self, package):
- self.package = package
-
-
+ def __init__(self, package): self.package = package
class FinishDownloadMessage(DownloaderMessage):
"""Data server has finished downloading a package."""
-
- def __init__(self, package):
- self.package = package
-
-
+ def __init__(self, package): self.package = package
class StartUnzipMessage(DownloaderMessage):
"""Data server has started unzipping a package."""
-
- def __init__(self, package):
- self.package = package
-
-
+ def __init__(self, package): self.package = package
class FinishUnzipMessage(DownloaderMessage):
"""Data server has finished unzipping a package."""
-
- def __init__(self, package):
- self.package = package
-
-
+ def __init__(self, package): self.package = package
class UpToDateMessage(DownloaderMessage):
"""The package download file is already up-to-date"""
-
- def __init__(self, package):
- self.package = package
-
-
+ def __init__(self, package): self.package = package
class StaleMessage(DownloaderMessage):
"""The package download file is out-of-date or corrupt"""
-
- def __init__(self, package):
- self.package = package
-
-
+ def __init__(self, package): self.package = package
class ErrorMessage(DownloaderMessage):
"""Data server encountered an error"""
-
def __init__(self, package, message):
self.package = package
if isinstance(message, Exception):
else:
self.message = message
-
class ProgressMessage(DownloaderMessage):
"""Indicates how much progress the data server has made"""
-
- def __init__(self, progress):
- self.progress = progress
-
-
+ def __init__(self, progress): self.progress = progress
class SelectDownloadDirMessage(DownloaderMessage):
"""Indicates what download directory the data server is using"""
-
- def __init__(self, download_dir):
- self.download_dir = download_dir
-
+ def __init__(self, download_dir): self.download_dir = download_dir
######################################################################
# NLTK Data Server
######################################################################
-
class Downloader(object):
"""
A class used to access the NLTK data server, which can be used to
download corpora and other data packages.
"""
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# Configuration
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
- INDEX_TIMEOUT = 60 * 60 # 1 hour
+ INDEX_TIMEOUT = 60*60 # 1 hour
"""The amount of time after which the cached copy of the data
server index will be considered 'stale,' and will be
re-downloaded."""
- DEFAULT_URL = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml"
+ DEFAULT_URL = 'https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml'
"""The default URL for the NLTK data server's index. An
alternative URL can be specified when creating a new
``Downloader`` object."""
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# Status Constants
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
- INSTALLED = "installed"
+ INSTALLED = 'installed'
"""A status string indicating that a package or collection is
installed and up-to-date."""
- NOT_INSTALLED = "not installed"
+ NOT_INSTALLED = 'not installed'
"""A status string indicating that a package or collection is
not installed."""
- STALE = "out of date"
+ STALE = 'out of date'
"""A status string indicating that a package or collection is
corrupt or out-of-date."""
- PARTIAL = "partial"
+ PARTIAL = 'partial'
"""A status string indicating that a collection is partially
installed (i.e., only some of its packages are installed.)"""
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# Constructor
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
def __init__(self, server_index_url=None, download_dir=None):
self._url = server_index_url or self.DEFAULT_URL
if self._download_dir is None:
self._download_dir = self.default_download_dir()
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# Information
- # /////////////////////////////////////////////////////////////////
-
- def list(
- self,
- download_dir=None,
- show_packages=True,
- show_collections=True,
- header=True,
- more_prompt=False,
- skip_installed=False,
- ):
- lines = 0 # for more_prompt
+ #/////////////////////////////////////////////////////////////////
+
+ def list(self, download_dir=None, show_packages=True,
+ show_collections=True, header=True, more_prompt=False,
+ skip_installed=False):
+ lines = 0 # for more_prompt
if download_dir is None:
download_dir = self._download_dir
- print("Using default data directory (%s)" % download_dir)
+ print('Using default data directory (%s)' % download_dir)
if header:
- print("=" * (26 + len(self._url)))
- print(" Data server index for <%s>" % self._url)
- print("=" * (26 + len(self._url)))
- lines += 3 # for more_prompt
+ print('='*(26+len(self._url)))
+ print(' Data server index for <%s>' % self._url)
+ print('='*(26+len(self._url)))
+ lines += 3 # for more_prompt
stale = partial = False
categories = []
- if show_packages:
- categories.append("packages")
- if show_collections:
- categories.append("collections")
+ if show_packages: categories.append('packages')
+ if show_collections: categories.append('collections')
for category in categories:
- print("%s:" % category.capitalize())
- lines += 1 # for more_prompt
+ print('%s:' % category.capitalize())
+ lines += 1 # for more_prompt
for info in sorted(getattr(self, category)(), key=str):
status = self.status(info, download_dir)
- if status == self.INSTALLED and skip_installed:
- continue
- if status == self.STALE:
- stale = True
- if status == self.PARTIAL:
- partial = True
- prefix = {
- self.INSTALLED: "*",
- self.STALE: "-",
- self.PARTIAL: "P",
- self.NOT_INSTALLED: " ",
- }[status]
- name = textwrap.fill(
- "-" * 27 + (info.name or info.id), 75, subsequent_indent=27 * " "
- )[27:]
- print(" [%s] %s %s" % (prefix, info.id.ljust(20, "."), name))
- lines += len(name.split("\n")) # for more_prompt
+ if status == self.INSTALLED and skip_installed: continue
+ if status == self.STALE: stale = True
+ if status == self.PARTIAL: partial = True
+ prefix = {self.INSTALLED:'*', self.STALE:'-',
+ self.PARTIAL:'P', self.NOT_INSTALLED: ' '}[status]
+ name = textwrap.fill('-'*27 + (info.name or info.id),
+ 75, subsequent_indent=27*' ')[27:]
+ print(' [%s] %s %s' % (prefix, info.id.ljust(20, '.'), name))
+ lines += len(name.split('\n')) # for more_prompt
if more_prompt and lines > 20:
user_input = input("Hit Enter to continue: ")
- if user_input.lower() in ("x", "q"):
- return
+ if (user_input.lower() in ('x', 'q')): return
lines = 0
print()
- msg = "([*] marks installed packages"
- if stale:
- msg += "; [-] marks out-of-date or corrupt packages"
- if partial:
- msg += "; [P] marks partially installed collections"
- print(textwrap.fill(msg + ")", subsequent_indent=" ", width=76))
+ msg = '([*] marks installed packages'
+ if stale: msg += '; [-] marks out-of-date or corrupt packages'
+ if partial: msg += '; [P] marks partially installed collections'
+ print(textwrap.fill(msg+')', subsequent_indent=' ', width=76))
def packages(self):
self._update_index()
def corpora(self):
self._update_index()
- return [pkg for (id, pkg) in self._packages.items() if pkg.subdir == "corpora"]
+ return [pkg for (id,pkg) in self._packages.items()
+ if pkg.subdir == 'corpora']
def models(self):
self._update_index()
- return [pkg for (id, pkg) in self._packages.items() if pkg.subdir != "corpora"]
+ return [pkg for (id,pkg) in self._packages.items()
+ if pkg.subdir != 'corpora']
def collections(self):
self._update_index()
return self._collections.values()
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# Downloading
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
def _info_or_id(self, info_or_id):
- if isinstance(info_or_id, str):
+ if isinstance(info_or_id, string_types):
return self.info(info_or_id)
else:
return info_or_id
yield SelectDownloadDirMessage(download_dir)
# If they gave us a list of ids, then download each one.
- if isinstance(info_or_id, (list, tuple)):
+ if isinstance(info_or_id, (list,tuple)):
for msg in self._download_list(info_or_id, download_dir, force):
yield msg
return
# Look up the requested collection or package.
- try:
- info = self._info_or_id(info_or_id)
+ try: info = self._info_or_id(info_or_id)
except (IOError, ValueError) as e:
- yield ErrorMessage(None, "Error loading %s: %s" % (info_or_id, e))
+ yield ErrorMessage(None, 'Error loading %s: %s' %
+ (info_or_id, e))
return
# Handle collections.
yield msg
def _num_packages(self, item):
- if isinstance(item, Package):
- return 1
- else:
- return len(item.packages)
+ if isinstance(item, Package): return 1
+ else: return len(item.packages)
def _download_list(self, items, download_dir, force):
# Look up the requested items.
for i in range(len(items)):
- try:
- items[i] = self._info_or_id(items[i])
+ try: items[i] = self._info_or_id(items[i])
except (IOError, ValueError) as e:
yield ErrorMessage(items[i], e)
return
progress = 0
for i, item in enumerate(items):
if isinstance(item, Package):
- delta = 1.0 / num_packages
+ delta = 1./num_packages
else:
- delta = len(item.packages) / num_packages
+ delta = len(item.packages)/num_packages
for msg in self.incr_download(item, download_dir, force):
if isinstance(msg, ProgressMessage):
- yield ProgressMessage(progress + msg.progress * delta)
+ yield ProgressMessage(progress + msg.progress*delta)
else:
yield msg
- progress += 100 * delta
+ progress += 100*delta
def _download_package(self, info, download_dir, force):
yield StartPackageMessage(info)
yield ProgressMessage(5)
try:
infile = urlopen(info.url)
- with open(filepath, "wb") as outfile:
- num_blocks = max(1, info.size / (1024 * 16))
+ with open(filepath, 'wb') as outfile:
+ #print info.size
+ num_blocks = max(1, info.size/(1024*16))
for block in itertools.count():
- s = infile.read(1024 * 16) # 16k blocks.
+ s = infile.read(1024*16) # 16k blocks.
outfile.write(s)
- if not s:
- break
- if block % 2 == 0: # how often?
- yield ProgressMessage(min(80, 5 + 75 * (block / num_blocks)))
+ if not s: break
+ if block % 2 == 0: # how often?
+ yield ProgressMessage(min(80, 5+75*(block/num_blocks)))
infile.close()
except IOError as e:
- yield ErrorMessage(
- info,
- "Error downloading %r from <%s>:" "\n %s" % (info.id, info.url, e),
- )
+ yield ErrorMessage(info, 'Error downloading %r from <%s>:'
+ '\n %s' % (info.id, info.url, e))
return
yield FinishDownloadMessage(info)
yield ProgressMessage(80)
# If it's a zipfile, uncompress it.
- if info.filename.endswith(".zip"):
+ if info.filename.endswith('.zip'):
zipdir = os.path.join(download_dir, info.subdir)
# Unzip if we're unzipping by default; *or* if it's already
# been unzipped (presumably a previous version).
yield FinishPackageMessage(info)
- def download(
- self,
- info_or_id=None,
- download_dir=None,
- quiet=False,
- force=False,
- prefix="[nltk_data] ",
- halt_on_error=True,
- raise_on_error=False,
- print_error_to=sys.stderr,
- ):
-
- print_to = functools.partial(print, file=print_error_to)
+ def download(self, info_or_id=None, download_dir=None, quiet=False,
+ force=False, prefix='[nltk_data] ', halt_on_error=True,
+ raise_on_error=False):
# If no info or id is given, then use the interactive shell.
if info_or_id is None:
# [xx] hmm -- changing self._download_dir here seems like
# the wrong thing to do. Maybe the _interactive_download
# function should make a new copy of self to use?
- if download_dir is not None:
- self._download_dir = download_dir
+ if download_dir is not None: self._download_dir = download_dir
self._interactive_download()
return True
else:
# Define a helper function for displaying output:
- def show(s, prefix2=""):
- print_to(
- textwrap.fill(
- s,
- initial_indent=prefix + prefix2,
- subsequent_indent=prefix + prefix2 + " " * 4,
- )
- )
+ def show(s, prefix2=''):
+ print(textwrap.fill(s, initial_indent=prefix+prefix2,
+ subsequent_indent=prefix+prefix2+' '*4))
for msg in self.incr_download(info_or_id, download_dir, force):
# Error messages
return False
self._errors = True
if not quiet:
- print_to("Error installing package. Retry? [n/y/e]")
+ print("Error installing package. Retry? [n/y/e]")
choice = input().strip()
- if choice in ["y", "Y"]:
- if not self.download(
- msg.package.id,
- download_dir,
- quiet,
- force,
- prefix,
- halt_on_error,
- raise_on_error,
- ):
+ if choice in ['y', 'Y']:
+ if not self.download(msg.package.id, download_dir,
+ quiet, force, prefix,
+ halt_on_error, raise_on_error):
return False
- elif choice in ["e", "E"]:
+ elif choice in ['e', 'E']:
return False
# All other messages
if not quiet:
# Collection downloading messages:
if isinstance(msg, StartCollectionMessage):
- show("Downloading collection %r" % msg.collection.id)
- prefix += " | "
- print_to(prefix)
+ show('Downloading collection %r' % msg.collection.id)
+ prefix += ' | '
+ print(prefix)
elif isinstance(msg, FinishCollectionMessage):
- print_to(prefix)
+ print(prefix)
prefix = prefix[:-4]
if self._errors:
- show(
- "Downloaded collection %r with errors"
- % msg.collection.id
- )
+ show('Downloaded collection %r with errors' %
+ msg.collection.id)
else:
- show("Done downloading collection %s" % msg.collection.id)
+ show('Done downloading collection %s' %
+ msg.collection.id)
# Package downloading messages:
elif isinstance(msg, StartPackageMessage):
- show(
- "Downloading package %s to %s..."
- % (msg.package.id, download_dir)
- )
+ show('Downloading package %s to %s...' %
+ (msg.package.id, download_dir))
elif isinstance(msg, UpToDateMessage):
- show("Package %s is already up-to-date!" % msg.package.id, " ")
- # elif isinstance(msg, StaleMessage):
+ show('Package %s is already up-to-date!' %
+ msg.package.id, ' ')
+ #elif isinstance(msg, StaleMessage):
# show('Package %s is out-of-date or corrupt' %
# msg.package.id, ' ')
elif isinstance(msg, StartUnzipMessage):
- show("Unzipping %s." % msg.package.filename, " ")
+ show('Unzipping %s.' % msg.package.filename, ' ')
# Data directory message:
elif isinstance(msg, SelectDownloadDirMessage):
or collection. Status can be one of ``INSTALLED``,
``NOT_INSTALLED``, ``STALE``, or ``PARTIAL``.
"""
- if download_dir is None:
- download_dir = self._download_dir
+ if download_dir is None: download_dir = self._download_dir
info = self._info_or_id(info_or_id)
# Handle collections:
return self.STALE
elif self.PARTIAL in pkg_status:
return self.PARTIAL
- elif self.INSTALLED in pkg_status and self.NOT_INSTALLED in pkg_status:
+ elif (self.INSTALLED in pkg_status and
+ self.NOT_INSTALLED in pkg_status):
return self.PARTIAL
elif self.NOT_INSTALLED in pkg_status:
return self.NOT_INSTALLED
return self._pkg_status(info, filepath)
else:
if info.id not in self._status_cache:
- self._status_cache[info.id] = self._pkg_status(info, filepath)
+ self._status_cache[info.id] = self._pkg_status(info,
+ filepath)
return self._status_cache[info.id]
def _pkg_status(self, info, filepath):
return self.NOT_INSTALLED
# Check if the file has the correct size.
- try:
- filestat = os.stat(filepath)
- except OSError:
- return self.NOT_INSTALLED
+ try: filestat = os.stat(filepath)
+ except OSError: return self.NOT_INSTALLED
if filestat.st_size != int(info.size):
return self.STALE
# If it's a zipfile, and it's been at least partially
# unzipped, then check if it's been fully unzipped.
- if filepath.endswith(".zip"):
+ if filepath.endswith('.zip'):
unzipdir = filepath[:-4]
if not os.path.exists(unzipdir):
- return self.INSTALLED # but not unzipped -- ok!
+ return self.INSTALLED # but not unzipped -- ok!
if not os.path.isdir(unzipdir):
return self.STALE
- unzipped_size = sum(
- os.stat(os.path.join(d, f)).st_size
- for d, _, files in os.walk(unzipdir)
- for f in files
- )
+ unzipped_size = sum(os.stat(os.path.join(d, f)).st_size
+ for d, _, files in os.walk(unzipdir)
+ for f in files)
if unzipped_size != info.unzipped_size:
return self.STALE
# Otherwise, everything looks good.
return self.INSTALLED
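A usage sketch for the status values returned above (the constants are the same class attributes this method returns; 'punkt' is an illustrative package id):

    from nltk.downloader import Downloader

    d = Downloader()
    # Re-fetch anything that is missing, or whose size/unzipped size no longer matches.
    if d.status('punkt') in (d.NOT_INSTALLED, d.STALE):
        d.download('punkt', quiet=True)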
- def update(self, quiet=False, prefix="[nltk_data] "):
+ def update(self, quiet=False, prefix='[nltk_data] '):
"""
Re-download any packages whose status is STALE.
"""
if self.status(pkg) == self.STALE:
self.download(pkg, quiet=quiet, prefix=prefix)
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# Index
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
def _update_index(self, url=None):
"""A helper function that ensures that self._index is
up-to-date. If the index is older than self.INDEX_TIMEOUT,
then download it again."""
# Check if the index is already up-to-date. If so, do nothing.
- if not (
- self._index is None
- or url is not None
- or time.time() - self._index_timestamp > self.INDEX_TIMEOUT
- ):
+ if not (self._index is None or url is not None or
+ time.time()-self._index_timestamp > self.INDEX_TIMEOUT):
return
# If a URL was specified, then update our URL.
# Download the index file.
self._index = nltk.internals.ElementWrapper(
- ElementTree.parse(urlopen(self._url)).getroot()
- )
+ ElementTree.parse(urlopen(self._url)).getroot())
self._index_timestamp = time.time()
# Build a dictionary of packages.
- packages = [Package.fromxml(p) for p in self._index.findall("packages/package")]
+ packages = [Package.fromxml(p) for p in
+ self._index.findall('packages/package')]
self._packages = dict((p.id, p) for p in packages)
# Build a dictionary of collections.
- collections = [
- Collection.fromxml(c) for c in self._index.findall("collections/collection")
- ]
+ collections = [Collection.fromxml(c) for c in
+ self._index.findall('collections/collection')]
self._collections = dict((c.id, c) for c in collections)
# Replace identifiers with actual children in collection.children.
elif child_id in self._collections:
collection.children[i] = self._collections[child_id]
else:
- print(
- "removing collection member with no package: {}".format(
- child_id
- )
- )
+ print('removing collection member with no package: {}'.format(child_id))
del collection.children[i]
# Fill in collection.packages for each collection.
"""Return the ``Package`` or ``Collection`` record for the
given item."""
self._update_index()
- if id in self._packages:
- return self._packages[id]
- if id in self._collections:
- return self._collections[id]
- raise ValueError("Package %r not found in index" % id)
+ if id in self._packages: return self._packages[id]
+ if id in self._collections: return self._collections[id]
+ raise ValueError('Package %r not found in index' % id)
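For orientation, a sketch of the index shape this parsing code expects. The packages/package and collections/collection paths come from the findall() calls above; the <item ref="..."> child form and the concrete ids/URL are assumptions for illustration:

    from xml.etree import ElementTree

    # Assumed index layout; only the element paths queried above are certain.
    index = ElementTree.fromstring(
        '<nltk_data>'
        '<packages><package id="punkt" url="http://example.org/punkt.zip"/></packages>'
        '<collections><collection id="all"><item ref="punkt"/></collection></collections>'
        '</nltk_data>'
    )
    print([p.get('id') for p in index.findall('packages/package')])        # ['punkt']
    print([c.get('id') for c in index.findall('collections/collection')])  # ['all']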
def xmlinfo(self, id):
"""Return the XML info record for the given item"""
self._update_index()
- for package in self._index.findall("packages/package"):
- if package.get("id") == id:
+ for package in self._index.findall('packages/package'):
+ if package.get('id') == id:
return package
- for collection in self._index.findall("collections/collection"):
- if collection.get("id") == id:
+ for collection in self._index.findall('collections/collection'):
+ if collection.get('id') == id:
return collection
- raise ValueError("Package %r not found in index" % id)
+ raise ValueError('Package %r not found in index' % id)
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# URL & Data Directory
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
def _get_url(self):
"""The URL for the data server's index file."""
return self._url
-
def _set_url(self, url):
"""
Set a new URL for the data server. If we're unable to contact
except:
self._url = original_url
raise
-
url = property(_get_url, _set_url)
def default_download_dir(self):
``/usr/lib/nltk_data``, ``/usr/local/lib/nltk_data``, ``~/nltk_data``.
"""
# Check if we are on GAE where we cannot write into filesystem.
- if "APPENGINE_RUNTIME" in os.environ:
+ if 'APPENGINE_RUNTIME' in os.environ:
return
# Check if we have sufficient permissions to install in a
# variety of system-wide locations.
for nltkdir in nltk.data.path:
- if os.path.exists(nltkdir) and nltk.internals.is_writable(nltkdir):
+ if (os.path.exists(nltkdir) and
+ nltk.internals.is_writable(nltkdir)):
return nltkdir
# On Windows, use %APPDATA%
- if sys.platform == "win32" and "APPDATA" in os.environ:
- homedir = os.environ["APPDATA"]
+ if sys.platform == 'win32' and 'APPDATA' in os.environ:
+ homedir = os.environ['APPDATA']
# Otherwise, install in the user's home directory.
else:
- homedir = os.path.expanduser("~/")
- if homedir == "~/":
+ homedir = os.path.expanduser('~/')
+ if homedir == '~/':
raise ValueError("Could not find a default download directory")
# append "nltk_data" to the home directory
- return os.path.join(homedir, "nltk_data")
+ return os.path.join(homedir, 'nltk_data')
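The resolution order implemented above (writable entries on nltk.data.path first, then %APPDATA% on Windows, then the home directory) can be inspected directly:

    from nltk.downloader import Downloader
    print(Downloader().default_download_dir())   # e.g. '/home/user/nltk_data' on Unix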
def _get_download_dir(self):
"""
``download_dir`` argument when calling ``download()``.
"""
return self._download_dir
-
def _set_download_dir(self, download_dir):
self._download_dir = download_dir
# Clear the status cache.
self._status_cache.clear()
-
download_dir = property(_get_download_dir, _set_download_dir)
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# Interactive Shell
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
def _interactive_download(self):
# Try the GUI first; if that doesn't work, try the simple
else:
DownloaderShell(self).run()
-
class DownloaderShell(object):
def __init__(self, dataserver):
self._ds = dataserver
def _simple_interactive_menu(self, *options):
- print("-" * 75)
- spc = (68 - sum(len(o) for o in options)) // (len(options) - 1) * " "
- print(" " + spc.join(options))
- print("-" * 75)
+ print('-'*75)
+ spc = (68 - sum(len(o) for o in options))//(len(options)-1)*' '
+ print(' ' + spc.join(options))
+ #w = 76/len(options)
+ #fmt = ' ' + ('%-'+str(w)+'s')*(len(options)-1) + '%s'
+ #print fmt % options
+ print('-'*75)
def run(self):
- print("NLTK Downloader")
+ print('NLTK Downloader')
while True:
self._simple_interactive_menu(
- "d) Download",
- "l) List",
- " u) Update",
- "c) Config",
- "h) Help",
- "q) Quit",
- )
- user_input = input("Downloader> ").strip()
- if not user_input:
- print()
- continue
+ 'd) Download', 'l) List', ' u) Update', 'c) Config', 'h) Help', 'q) Quit')
+ user_input = input('Downloader> ').strip()
+ if not user_input: print(); continue
command = user_input.lower().split()[0]
args = user_input.split()[1:]
try:
- if command == "l":
+ if command == 'l':
print()
- self._ds.list(self._ds.download_dir, header=False, more_prompt=True)
- elif command == "h":
+ self._ds.list(self._ds.download_dir, header=False,
+ more_prompt=True)
+ elif command == 'h':
self._simple_interactive_help()
- elif command == "c":
+ elif command == 'c':
self._simple_interactive_config()
- elif command in ("q", "x"):
+ elif command in ('q', 'x'):
return
- elif command == "d":
+ elif command == 'd':
self._simple_interactive_download(args)
- elif command == "u":
+ elif command == 'u':
self._simple_interactive_update()
else:
- print("Command %r unrecognized" % user_input)
+ print('Command %r unrecognized' % user_input)
except HTTPError as e:
- print("Error reading from server: %s" % e)
+ print('Error reading from server: %s'%e)
except URLError as e:
- print("Error connecting to server: %s" % e.reason)
+ print('Error connecting to server: %s'%e.reason)
# try checking if user_input is a package name, &
# downloading it?
print()
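This shell is normally entered through the module-level download_shell() helper rather than by instantiating the class by hand (a sketch):

    import nltk.downloader
    nltk.downloader.download_shell()   # runs the same 'Downloader>' loop as run() above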
def _simple_interactive_download(self, args):
if args:
for arg in args:
- try:
- self._ds.download(arg, prefix=" ")
- except (IOError, ValueError) as e:
- print(e)
+ try: self._ds.download(arg, prefix=' ')
+ except (IOError, ValueError) as e: print(e)
else:
while True:
print()
- print("Download which package (l=list; x=cancel)?")
- user_input = input(" Identifier> ")
- if user_input.lower() == "l":
- self._ds.list(
- self._ds.download_dir,
- header=False,
- more_prompt=True,
- skip_installed=True,
- )
+ print('Download which package (l=list; x=cancel)?')
+ user_input = input(' Identifier> ')
+ if user_input.lower()=='l':
+ self._ds.list(self._ds.download_dir, header=False,
+ more_prompt=True, skip_installed=True)
continue
- elif user_input.lower() in ("x", "q", ""):
+ elif user_input.lower() in ('x', 'q', ''):
return
elif user_input:
for id in user_input.split():
- try:
- self._ds.download(id, prefix=" ")
- except (IOError, ValueError) as e:
- print(e)
+ try: self._ds.download(id, prefix=' ')
+ except (IOError, ValueError) as e: print(e)
break
def _simple_interactive_update(self):
while True:
stale_packages = []
stale = partial = False
- for info in sorted(getattr(self._ds, "packages")(), key=str):
+ for info in sorted(getattr(self._ds, 'packages')(), key=str):
if self._ds.status(info) == self._ds.STALE:
stale_packages.append((info.id, info.name))
print()
if stale_packages:
- print("Will update following packages (o=ok; x=cancel)")
+ print('Will update following packages (o=ok; x=cancel)')
for pid, pname in stale_packages:
- name = textwrap.fill(
- "-" * 27 + (pname), 75, subsequent_indent=27 * " "
- )[27:]
- print(" [ ] %s %s" % (pid.ljust(20, "."), name))
+ name = textwrap.fill('-'*27 + (pname),
+ 75, subsequent_indent=27*' ')[27:]
+ print(' [ ] %s %s' % (pid.ljust(20, '.'), name))
print()
- user_input = input(" Identifier> ")
- if user_input.lower() == "o":
+ user_input = input(' Identifier> ')
+ if user_input.lower()=='o':
for pid, pname in stale_packages:
- try:
- self._ds.download(pid, prefix=" ")
- except (IOError, ValueError) as e:
- print(e)
+ try: self._ds.download(pid, prefix=' ')
+ except (IOError, ValueError) as e: print(e)
break
- elif user_input.lower() in ("x", "q", ""):
+ elif user_input.lower() in ('x', 'q', ''):
return
else:
- print("Nothing to update.")
+ print('Nothing to update.')
return
def _simple_interactive_help(self):
print()
- print("Commands:")
- print(
- " d) Download a package or collection u) Update out of date packages"
- )
- print(" l) List packages & collections h) Help")
- print(" c) View & Modify Configuration q) Quit")
+ print('Commands:')
+ print(' d) Download a package or collection u) Update out of date packages')
+ print(' l) List packages & collections h) Help')
+ print(' c) View & Modify Configuration q) Quit')
def _show_config(self):
print()
- print("Data Server:")
- print(" - URL: <%s>" % self._ds.url)
- print((" - %d Package Collections Available" % len(self._ds.collections())))
- print((" - %d Individual Packages Available" % len(self._ds.packages())))
+ print('Data Server:')
+ print(' - URL: <%s>' % self._ds.url)
+ print((' - %d Package Collections Available' %
+ len(self._ds.collections())))
+ print((' - %d Individual Packages Available' %
+ len(self._ds.packages())))
print()
- print("Local Machine:")
- print(" - Data directory: %s" % self._ds.download_dir)
+ print('Local Machine:')
+ print(' - Data directory: %s' % self._ds.download_dir)
def _simple_interactive_config(self):
self._show_config()
while True:
print()
self._simple_interactive_menu(
- "s) Show Config", "u) Set Server URL", "d) Set Data Dir", "m) Main Menu"
- )
- user_input = input("Config> ").strip().lower()
- if user_input == "s":
+ 's) Show Config', 'u) Set Server URL',
+ 'd) Set Data Dir', 'm) Main Menu')
+ user_input = input('Config> ').strip().lower()
+ if user_input == 's':
self._show_config()
- elif user_input == "d":
- new_dl_dir = input(" New Directory> ").strip()
- if new_dl_dir in ("", "x", "q", "X", "Q"):
- print(" Cancelled!")
+ elif user_input == 'd':
+ new_dl_dir = input(' New Directory> ').strip()
+ if new_dl_dir in ('', 'x', 'q', 'X', 'Q'):
+ print(' Cancelled!')
elif os.path.isdir(new_dl_dir):
self._ds.download_dir = new_dl_dir
else:
- print(("Directory %r not found! Create it first." % new_dl_dir))
- elif user_input == "u":
- new_url = input(" New URL> ").strip()
- if new_url in ("", "x", "q", "X", "Q"):
- print(" Cancelled!")
+ print(('Directory %r not found! Create it first.' %
+ new_dl_dir))
+ elif user_input == 'u':
+ new_url = input(' New URL> ').strip()
+ if new_url in ('', 'x', 'q', 'X', 'Q'):
+ print(' Cancelled!')
else:
- if not new_url.startswith(("http://", "https://")):
- new_url = "http://" + new_url
- try:
- self._ds.url = new_url
+ if not new_url.startswith(('http://', 'https://')):
+ new_url = 'http://'+new_url
+ try: self._ds.url = new_url
except Exception as e:
- print("Error reading <%r>:\n %s" % (new_url, e))
- elif user_input == "m":
+ print('Error reading <%r>:\n %s' % (new_url, e))
+ elif user_input == 'm':
break
-
class DownloaderGUI(object):
"""
Graphical interface for downloading packages from the NLTK data
server.
"""
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# Column Configuration
- # /////////////////////////////////////////////////////////////////
-
- COLUMNS = [
- "",
- "Identifier",
- "Name",
- "Size",
- "Status",
- "Unzipped Size",
- "Copyright",
- "Contact",
- "License",
- "Author",
- "Subdir",
- "Checksum",
- ]
+ #/////////////////////////////////////////////////////////////////
+
+ COLUMNS = ['', 'Identifier', 'Name', 'Size', 'Status',
+ 'Unzipped Size',
+ 'Copyright', 'Contact', 'License', 'Author',
+ 'Subdir', 'Checksum']
"""A list of the names of columns. This controls the order in
which the columns will appear. If this is edited, then
``_package_to_columns()`` may need to be edited to match."""
- COLUMN_WEIGHTS = {"": 0, "Name": 5, "Size": 0, "Status": 0}
+ COLUMN_WEIGHTS = {'': 0, 'Name': 5, 'Size': 0, 'Status': 0}
"""A dictionary specifying how columns should be resized when the
table is resized. Columns with weight 0 will not be resized at
all; and columns with high weight will be resized more.
Default weight (for columns not explicitly listed) is 1."""
- COLUMN_WIDTHS = {
- "": 1,
- "Identifier": 20,
- "Name": 45,
- "Size": 10,
- "Unzipped Size": 10,
- "Status": 12,
- }
+ COLUMN_WIDTHS = {'':1, 'Identifier':20, 'Name':45,
+ 'Size': 10, 'Unzipped Size': 10,
+ 'Status': 12}
"""A dictionary specifying how wide each column should be, in
characters. The default width (for columns not explicitly
listed) is specified by ``DEFAULT_COLUMN_WIDTH``."""
"""The default width for columns that are not explicitly listed
in ``COLUMN_WIDTHS``."""
- INITIAL_COLUMNS = ["", "Identifier", "Name", "Size", "Status"]
+ INITIAL_COLUMNS = ['', 'Identifier', 'Name', 'Size', 'Status']
"""The set of columns that should be displayed by default."""
# Perform a few import-time sanity checks to make sure that the
# column configuration variables are defined consistently:
- for c in COLUMN_WEIGHTS:
- assert c in COLUMNS
- for c in COLUMN_WIDTHS:
- assert c in COLUMNS
- for c in INITIAL_COLUMNS:
- assert c in COLUMNS
-
- # /////////////////////////////////////////////////////////////////
+ for c in COLUMN_WEIGHTS: assert c in COLUMNS
+ for c in COLUMN_WIDTHS: assert c in COLUMNS
+ for c in INITIAL_COLUMNS: assert c in COLUMNS
+
+ #/////////////////////////////////////////////////////////////////
# Color Configuration
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
- _BACKDROP_COLOR = ("#000", "#ccc")
+ _BACKDROP_COLOR = ('#000', '#ccc')
- _ROW_COLOR = {
- Downloader.INSTALLED: ("#afa", "#080"),
- Downloader.PARTIAL: ("#ffa", "#880"),
- Downloader.STALE: ("#faa", "#800"),
- Downloader.NOT_INSTALLED: ("#fff", "#888"),
- }
+ _ROW_COLOR = {Downloader.INSTALLED: ('#afa', '#080'),
+ Downloader.PARTIAL: ('#ffa', '#880'),
+ Downloader.STALE: ('#faa', '#800'),
+ Downloader.NOT_INSTALLED: ('#fff', '#888')}
- _MARK_COLOR = ("#000", "#ccc")
+ _MARK_COLOR = ('#000', '#ccc')
- # _FRONT_TAB_COLOR = ('#ccf', '#008')
- # _BACK_TAB_COLOR = ('#88a', '#448')
- _FRONT_TAB_COLOR = ("#fff", "#45c")
- _BACK_TAB_COLOR = ("#aaa", "#67a")
+ #_FRONT_TAB_COLOR = ('#ccf', '#008')
+ #_BACK_TAB_COLOR = ('#88a', '#448')
+ _FRONT_TAB_COLOR = ('#fff', '#45c')
+ _BACK_TAB_COLOR = ('#aaa', '#67a')
- _PROGRESS_COLOR = ("#f00", "#aaa")
+ _PROGRESS_COLOR = ('#f00', '#aaa')
- _TAB_FONT = "helvetica -16 bold"
+ _TAB_FONT = 'helvetica -16 bold'
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# Constructor
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
def __init__(self, dataserver, use_threads=True):
self._ds = dataserver
# A message log.
self._log_messages = []
self._log_indent = 0
- self._log("NLTK Downloader Started!")
+ self._log('NLTK Downloader Started!')
# Create the main window.
top = self.top = Tk()
- top.geometry("+50+50")
- top.title("NLTK Downloader")
+ top.geometry('+50+50')
+ top.title('NLTK Downloader')
top.configure(background=self._BACKDROP_COLOR[1])
# Set up some bindings now, in case anything goes wrong.
- top.bind("<Control-q>", self.destroy)
- top.bind("<Control-x>", self.destroy)
+ top.bind('<Control-q>', self.destroy)
+ top.bind('<Control-x>', self.destroy)
self._destroyed = False
self._column_vars = {}
try:
self._fill_table()
except HTTPError as e:
- showerror("Error reading from server", e)
+ showerror('Error reading from server', e)
except URLError as e:
- showerror("Error connecting to server", e.reason)
+ showerror('Error connecting to server', e.reason)
self._show_info()
self._select_columns()
# Make sure we get notified when we're destroyed, so we can
# cancel any download in progress.
- self._table.bind("<Destroy>", self._destroy)
+ self._table.bind('<Destroy>', self._destroy)
def _log(self, msg):
- self._log_messages.append(
- "%s %s%s" % (time.ctime(), " | " * self._log_indent, msg)
- )
+ self._log_messages.append('%s %s%s' % (time.ctime(),
+ ' | '*self._log_indent, msg))
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# Internals
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
def _init_widgets(self):
# Create the top-level frame structures
- f1 = Frame(self.top, relief="raised", border=2, padx=8, pady=0)
- f1.pack(sid="top", expand=True, fill="both")
+ f1 = Frame(self.top, relief='raised', border=2, padx=8, pady=0)
+ f1.pack(side='top', expand=True, fill='both')
f1.grid_rowconfigure(2, weight=1)
f1.grid_columnconfigure(0, weight=1)
- Frame(f1, height=8).grid(column=0, row=0) # spacer
+ Frame(f1, height=8).grid(column=0, row=0) # spacer
tabframe = Frame(f1)
- tabframe.grid(column=0, row=1, sticky="news")
+ tabframe.grid(column=0, row=1, sticky='news')
tableframe = Frame(f1)
- tableframe.grid(column=0, row=2, sticky="news")
+ tableframe.grid(column=0, row=2, sticky='news')
buttonframe = Frame(f1)
- buttonframe.grid(column=0, row=3, sticky="news")
- Frame(f1, height=8).grid(column=0, row=4) # spacer
+ buttonframe.grid(column=0, row=3, sticky='news')
+ Frame(f1, height=8).grid(column=0, row=4) # spacer
infoframe = Frame(f1)
- infoframe.grid(column=0, row=5, sticky="news")
- Frame(f1, height=8).grid(column=0, row=6) # spacer
- progressframe = Frame(
- self.top, padx=3, pady=3, background=self._BACKDROP_COLOR[1]
- )
- progressframe.pack(side="bottom", fill="x")
- self.top["border"] = 0
- self.top["highlightthickness"] = 0
+ infoframe.grid(column=0, row=5, sticky='news')
+ Frame(f1, height=8).grid(column=0, row=6) # spacer
+ progressframe = Frame(self.top, padx=3, pady=3,
+ background=self._BACKDROP_COLOR[1])
+ progressframe.pack(side='bottom', fill='x')
+ self.top['border'] = 0
+ self.top['highlightthickness'] = 0
# Create the tabs
- self._tab_names = ["Collections", "Corpora", "Models", "All Packages"]
+ self._tab_names = ['Collections', 'Corpora',
+ 'Models', 'All Packages',]
self._tabs = {}
for i, tab in enumerate(self._tab_names):
label = Label(tabframe, text=tab, font=self._TAB_FONT)
- label.pack(side="left", padx=((i + 1) % 2) * 10)
- label.bind("<Button-1>", self._select_tab)
+ label.pack(side='left', padx=((i+1)%2)*10)
+ label.bind('<Button-1>', self._select_tab)
self._tabs[tab.lower()] = label
# Create the table.
- column_weights = [self.COLUMN_WEIGHTS.get(column, 1) for column in self.COLUMNS]
- self._table = Table(
- tableframe,
- self.COLUMNS,
- column_weights=column_weights,
- highlightthickness=0,
- listbox_height=16,
- reprfunc=self._table_reprfunc,
- )
- self._table.columnconfig(0, foreground=self._MARK_COLOR[0]) # marked
+ column_weights = [self.COLUMN_WEIGHTS.get(column, 1)
+ for column in self.COLUMNS]
+ self._table = Table(tableframe, self.COLUMNS,
+ column_weights=column_weights,
+ highlightthickness=0, listbox_height=16,
+ reprfunc=self._table_reprfunc)
+ self._table.columnconfig(0, foreground=self._MARK_COLOR[0]) # marked
for i, column in enumerate(self.COLUMNS):
width = self.COLUMN_WIDTHS.get(column, self.DEFAULT_COLUMN_WIDTH)
self._table.columnconfig(i, width=width)
- self._table.pack(expand=True, fill="both")
+ self._table.pack(expand=True, fill='both')
self._table.focus()
- self._table.bind_to_listboxes("<Double-Button-1>", self._download)
- self._table.bind("<space>", self._table_mark)
- self._table.bind("<Return>", self._download)
- self._table.bind("<Left>", self._prev_tab)
- self._table.bind("<Right>", self._next_tab)
- self._table.bind("<Control-a>", self._mark_all)
+ self._table.bind_to_listboxes('<Double-Button-1>',
+ self._download)
+ self._table.bind('<space>', self._table_mark)
+ self._table.bind('<Return>', self._download)
+ self._table.bind('<Left>', self._prev_tab)
+ self._table.bind('<Right>', self._next_tab)
+ self._table.bind('<Control-a>', self._mark_all)
# Create entry boxes for URL & download_dir
infoframe.grid_columnconfigure(1, weight=1)
- info = [
- ("url", "Server Index:", self._set_url),
- ("download_dir", "Download Directory:", self._set_download_dir),
- ]
+ info = [('url', 'Server Index:', self._set_url),
+ ('download_dir','Download Directory:',self._set_download_dir)]
self._info = {}
for (i, (key, label, callback)) in enumerate(info):
- Label(infoframe, text=label).grid(column=0, row=i, sticky="e")
- entry = Entry(
- infoframe, font="courier", relief="groove", disabledforeground="black"
- )
+ Label(infoframe, text=label).grid(column=0, row=i, sticky='e')
+ entry = Entry(infoframe, font='courier', relief='groove',
+ disabledforeground='black')
self._info[key] = (entry, callback)
- entry.bind("<Return>", self._info_save)
- entry.bind("<Button-1>", lambda e, key=key: self._info_edit(key))
- entry.grid(column=1, row=i, sticky="ew")
+ entry.bind('<Return>', self._info_save)
+ entry.bind('<Button-1>', lambda e,key=key: self._info_edit(key))
+ entry.grid(column=1, row=i, sticky='ew')
# If the user edits url or download_dir, and then clicks outside
# the entry box, then save their results.
- self.top.bind("<Button-1>", self._info_save)
+ self.top.bind('<Button-1>', self._info_save)
# Create Download & Refresh buttons.
self._download_button = Button(
- buttonframe, text="Download", command=self._download, width=8
- )
- self._download_button.pack(side="left")
+ buttonframe, text='Download', command=self._download, width=8)
+ self._download_button.pack(side='left')
self._refresh_button = Button(
- buttonframe, text="Refresh", command=self._refresh, width=8
- )
- self._refresh_button.pack(side="right")
+ buttonframe, text='Refresh', command=self._refresh, width=8)
+ self._refresh_button.pack(side='right')
# Create Progress bar
- self._progresslabel = Label(
- progressframe,
- text="",
- foreground=self._BACKDROP_COLOR[0],
- background=self._BACKDROP_COLOR[1],
- )
- self._progressbar = Canvas(
- progressframe,
- width=200,
- height=16,
- background=self._PROGRESS_COLOR[1],
- relief="sunken",
- border=1,
- )
+ self._progresslabel = Label(progressframe, text='',
+ foreground=self._BACKDROP_COLOR[0],
+ background=self._BACKDROP_COLOR[1])
+ self._progressbar = Canvas(progressframe, width=200, height=16,
+ background=self._PROGRESS_COLOR[1],
+ relief='sunken', border=1)
self._init_progressbar()
- self._progressbar.pack(side="right")
- self._progresslabel.pack(side="left")
+ self._progressbar.pack(side='right')
+ self._progresslabel.pack(side='left')
def _init_menu(self):
menubar = Menu(self.top)
filemenu = Menu(menubar, tearoff=0)
- filemenu.add_command(
- label="Download", underline=0, command=self._download, accelerator="Return"
- )
+ filemenu.add_command(label='Download', underline=0,
+ command=self._download, accelerator='Return')
filemenu.add_separator()
- filemenu.add_command(
- label="Change Server Index",
- underline=7,
- command=lambda: self._info_edit("url"),
- )
- filemenu.add_command(
- label="Change Download Directory",
- underline=0,
- command=lambda: self._info_edit("download_dir"),
- )
+ filemenu.add_command(label='Change Server Index', underline=7,
+ command=lambda: self._info_edit('url'))
+ filemenu.add_command(label='Change Download Directory', underline=0,
+ command=lambda: self._info_edit('download_dir'))
filemenu.add_separator()
- filemenu.add_command(label="Show Log", underline=5, command=self._show_log)
+ filemenu.add_command(label='Show Log', underline=5,
+ command=self._show_log)
filemenu.add_separator()
- filemenu.add_command(
- label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x"
- )
- menubar.add_cascade(label="File", underline=0, menu=filemenu)
+ filemenu.add_command(label='Exit', underline=1,
+ command=self.destroy, accelerator='Ctrl-x')
+ menubar.add_cascade(label='File', underline=0, menu=filemenu)
# Create a menu to control which columns of the table are
# shown. n.b.: we never hide the first two columns (mark and
var = IntVar(self.top)
assert column not in self._column_vars
self._column_vars[column] = var
- if column in self.INITIAL_COLUMNS:
- var.set(1)
- viewmenu.add_checkbutton(
- label=column, underline=0, variable=var, command=self._select_columns
- )
- menubar.add_cascade(label="View", underline=0, menu=viewmenu)
+ if column in self.INITIAL_COLUMNS: var.set(1)
+ viewmenu.add_checkbutton(label=column, underline=0, variable=var,
+ command=self._select_columns)
+ menubar.add_cascade(label='View', underline=0, menu=viewmenu)
# Create a sort menu
# [xx] this should be selectbuttons; and it should include
# reversed sorts as options.
sortmenu = Menu(menubar, tearoff=0)
for column in self._table.column_names[1:]:
- sortmenu.add_command(
- label="Sort by %s" % column,
- command=(lambda c=column: self._table.sort_by(c, "ascending")),
- )
+ sortmenu.add_command(label='Sort by %s' % column,
+ command=(lambda c=column:
+ self._table.sort_by(c, 'ascending')))
sortmenu.add_separator()
- # sortmenu.add_command(label='Descending Sort:')
+ #sortmenu.add_command(label='Descending Sort:')
for column in self._table.column_names[1:]:
- sortmenu.add_command(
- label="Reverse sort by %s" % column,
- command=(lambda c=column: self._table.sort_by(c, "descending")),
- )
- menubar.add_cascade(label="Sort", underline=0, menu=sortmenu)
+ sortmenu.add_command(label='Reverse sort by %s' % column,
+ command=(lambda c=column:
+ self._table.sort_by(c, 'descending')))
+ menubar.add_cascade(label='Sort', underline=0, menu=sortmenu)
helpmenu = Menu(menubar, tearoff=0)
- helpmenu.add_command(label="About", underline=0, command=self.about)
- helpmenu.add_command(
- label="Instructions", underline=0, command=self.help, accelerator="F1"
- )
- menubar.add_cascade(label="Help", underline=0, menu=helpmenu)
- self.top.bind("<F1>", self.help)
+ helpmenu.add_command(label='About', underline=0,
+ command=self.about)
+ helpmenu.add_command(label='Instructions', underline=0,
+ command=self.help, accelerator='F1')
+ menubar.add_cascade(label='Help', underline=0, menu=helpmenu)
+ self.top.bind('<F1>', self.help)
self.top.config(menu=menubar)
try:
self._fill_table()
except HTTPError as e:
- showerror("Error reading from server", e)
+ showerror('Error reading from server', e)
except URLError as e:
- showerror("Error connecting to server", e.reason)
+ showerror('Error connecting to server', e.reason)
self._table.select(0)
def _info_edit(self, info_key):
- self._info_save() # just in case.
+ self._info_save() # just in case.
(entry, callback) = self._info[info_key]
- entry["state"] = "normal"
- entry["relief"] = "sunken"
+ entry['state'] = 'normal'
+ entry['relief'] = 'sunken'
entry.focus()
def _info_save(self, e=None):
focus = self._table
for entry, callback in self._info.values():
- if entry["state"] == "disabled":
- continue
- if e is not None and e.widget is entry and e.keysym != "Return":
+ if entry['state'] == 'disabled': continue
+ if e is not None and e.widget is entry and e.keysym != 'Return':
focus = entry
else:
- entry["state"] = "disabled"
- entry["relief"] = "groove"
+ entry['state'] = 'disabled'
+ entry['relief'] = 'groove'
callback(entry.get())
focus.focus()
def _table_reprfunc(self, row, col, val):
- if self._table.column_names[col].endswith("Size"):
- if isinstance(val, str):
- return " %s" % val
- elif val < 1024 ** 2:
- return " %.1f KB" % (val / 1024.0 ** 1)
- elif val < 1024 ** 3:
- return " %.1f MB" % (val / 1024.0 ** 2)
- else:
- return " %.1f GB" % (val / 1024.0 ** 3)
+ if self._table.column_names[col].endswith('Size'):
+ if isinstance(val, string_types): return ' %s' % val
+ elif val < 1024**2: return ' %.1f KB' % (val/1024.**1)
+ elif val < 1024**3: return ' %.1f MB' % (val/1024.**2)
+ else: return ' %.1f GB' % (val/1024.**3)
- if col in (0, ""):
- return str(val)
- else:
- return " %s" % val
+ if col in (0, ''): return str(val)
+ else: return ' %s' % val
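A worked example of the size formatting above: a raw value of 5500000 is at least 1024**2 but below 1024**3, so it takes the MB branch and renders as ' %.1f MB' % (5500000/1024.**2), i.e. ' 5.2 MB'; string values such as 'n/a' pass through the first branch unchanged.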
def _set_url(self, url):
- if url == self._ds.url:
- return
+ if url == self._ds.url: return
try:
self._ds.url = url
self._fill_table()
except IOError as e:
- showerror("Error Setting Server Index", str(e))
+ showerror('Error Setting Server Index', str(e))
self._show_info()
+
def _set_download_dir(self, download_dir):
- if self._ds.download_dir == download_dir:
- return
+ if self._ds.download_dir == download_dir: return
# check if the dir exists, and if not, ask if we should create it?
# Clear our status cache, & re-check what's installed
try:
self._fill_table()
except HTTPError as e:
- showerror("Error reading from server", e)
+ showerror('Error reading from server', e)
except URLError as e:
- showerror("Error connecting to server", e.reason)
+ showerror('Error connecting to server', e.reason)
self._show_info()
def _show_info(self):
- print("showing info", self._ds.url)
- for entry, cb in self._info.values():
- entry["state"] = "normal"
- entry.delete(0, "end")
- self._info["url"][0].insert(0, self._ds.url)
- self._info["download_dir"][0].insert(0, self._ds.download_dir)
- for entry, cb in self._info.values():
- entry["state"] = "disabled"
+ print('showing info', self._ds.url)
+ for entry,cb in self._info.values():
+ entry['state'] = 'normal'
+ entry.delete(0, 'end')
+ self._info['url'][0].insert(0, self._ds.url)
+ self._info['download_dir'][0].insert(0, self._ds.download_dir)
+ for entry,cb in self._info.values():
+ entry['state'] = 'disabled'
def _prev_tab(self, *e):
for i, tab in enumerate(self._tab_names):
if tab.lower() == self._tab and i > 0:
- self._tab = self._tab_names[i - 1].lower()
+ self._tab = self._tab_names[i-1].lower()
try:
return self._fill_table()
except HTTPError as e:
- showerror("Error reading from server", e)
+ showerror('Error reading from server', e)
except URLError as e:
- showerror("Error connecting to server", e.reason)
+ showerror('Error connecting to server', e.reason)
def _next_tab(self, *e):
for i, tab in enumerate(self._tab_names):
- if tab.lower() == self._tab and i < (len(self._tabs) - 1):
- self._tab = self._tab_names[i + 1].lower()
+ if tab.lower() == self._tab and i < (len(self._tabs)-1):
+ self._tab = self._tab_names[i+1].lower()
try:
return self._fill_table()
except HTTPError as e:
- showerror("Error reading from server", e)
+ showerror('Error reading from server', e)
except URLError as e:
- showerror("Error connecting to server", e.reason)
+ showerror('Error connecting to server', e.reason)
def _select_tab(self, event):
- self._tab = event.widget["text"].lower()
+ self._tab = event.widget['text'].lower()
try:
self._fill_table()
except HTTPError as e:
- showerror("Error reading from server", e)
+ showerror('Error reading from server', e)
except URLError as e:
- showerror("Error connecting to server", e.reason)
+ showerror('Error connecting to server', e.reason)
- _tab = "collections"
- # _tab = 'corpora'
+ _tab = 'collections'
+ #_tab = 'corpora'
_rows = None
-
def _fill_table(self):
selected_row = self._table.selected_row()
self._table.clear()
- if self._tab == "all packages":
+ if self._tab == 'all packages':
items = self._ds.packages()
- elif self._tab == "corpora":
+ elif self._tab == 'corpora':
items = self._ds.corpora()
- elif self._tab == "models":
+ elif self._tab == 'models':
items = self._ds.models()
- elif self._tab == "collections":
+ elif self._tab == 'collections':
items = self._ds.collections()
else:
- assert 0, "bad tab value %r" % self._tab
+ assert 0, 'bad tab value %r' % self._tab
rows = [self._package_to_columns(item) for item in items]
self._table.extend(rows)
# Highlight the active tab.
for tab, label in self._tabs.items():
if tab == self._tab:
- label.configure(
- foreground=self._FRONT_TAB_COLOR[0],
- background=self._FRONT_TAB_COLOR[1],
- )
+ label.configure(foreground=self._FRONT_TAB_COLOR[0],
+ background=self._FRONT_TAB_COLOR[1])
else:
- label.configure(
- foreground=self._BACK_TAB_COLOR[0],
- background=self._BACK_TAB_COLOR[1],
- )
+ label.configure(foreground=self._BACK_TAB_COLOR[0],
+ background=self._BACK_TAB_COLOR[1])
- self._table.sort_by("Identifier", order="ascending")
+ self._table.sort_by('Identifier', order='ascending')
self._color_table()
self._table.select(selected_row)
# though. (This is on OS X w/ python 2.5) The length of
# delay that's necessary seems to depend on how fast the
# computer is. :-/
- self.top.after(150, self._table._scrollbar.set, *self._table._mlb.yview())
- self.top.after(300, self._table._scrollbar.set, *self._table._mlb.yview())
+ self.top.after(150, self._table._scrollbar.set,
+ *self._table._mlb.yview())
+ self.top.after(300, self._table._scrollbar.set,
+ *self._table._mlb.yview())
def _update_table_status(self):
for row_num in range(len(self._table)):
- status = self._ds.status(self._table[row_num, "Identifier"])
- self._table[row_num, "Status"] = status
+ status = self._ds.status(self._table[row_num, 'Identifier'])
+ self._table[row_num, 'Status'] = status
self._color_table()
def _download(self, *e):
if self._use_threads:
return self._download_threaded(*e)
- marked = [
- self._table[row, "Identifier"]
- for row in range(len(self._table))
- if self._table[row, 0] != ""
- ]
+ marked = [self._table[row, 'Identifier']
+ for row in range(len(self._table))
+ if self._table[row, 0] != '']
selection = self._table.selected_row()
if not marked and selection is not None:
- marked = [self._table[selection, "Identifier"]]
+ marked = [self._table[selection, 'Identifier']]
download_iter = self._ds.incr_download(marked, self._ds.download_dir)
self._log_indent = 0
self._download_cb(download_iter, marked)
- _DL_DELAY = 10
-
+ _DL_DELAY=10
def _download_cb(self, download_iter, ids):
- try:
- msg = next(download_iter)
+ try: msg = next(download_iter)
except StopIteration:
- # self._fill_table(sort=False)
+ #self._fill_table(sort=False)
self._update_table_status()
afterid = self.top.after(10, self._show_progress, 0)
- self._afterid["_download_cb"] = afterid
+ self._afterid['_download_cb'] = afterid
return
def show(s):
- self._progresslabel["text"] = s
+ self._progresslabel['text'] = s
self._log(s)
-
if isinstance(msg, ProgressMessage):
self._show_progress(msg.progress)
elif isinstance(msg, ErrorMessage):
if msg.package is not None:
self._select(msg.package.id)
self._show_progress(None)
- return # halt progress.
+ return # halt progress.
elif isinstance(msg, StartCollectionMessage):
- show("Downloading collection %s" % msg.collection.id)
+ show('Downloading collection %s' % msg.collection.id)
self._log_indent += 1
elif isinstance(msg, StartPackageMessage):
- show("Downloading package %s" % msg.package.id)
+ show('Downloading package %s' % msg.package.id)
elif isinstance(msg, UpToDateMessage):
- show("Package %s is up-to-date!" % msg.package.id)
- # elif isinstance(msg, StaleMessage):
+ show('Package %s is up-to-date!' % msg.package.id)
+ #elif isinstance(msg, StaleMessage):
# show('Package %s is out-of-date or corrupt' % msg.package.id)
elif isinstance(msg, FinishDownloadMessage):
- show("Finished downloading %r." % msg.package.id)
+ show('Finished downloading %r.' % msg.package.id)
elif isinstance(msg, StartUnzipMessage):
- show("Unzipping %s" % msg.package.filename)
+ show('Unzipping %s' % msg.package.filename)
elif isinstance(msg, FinishCollectionMessage):
self._log_indent -= 1
- show("Finished downloading collection %r." % msg.collection.id)
+ show('Finished downloading collection %r.' % msg.collection.id)
self._clear_mark(msg.collection.id)
elif isinstance(msg, FinishPackageMessage):
self._clear_mark(msg.package.id)
- afterid = self.top.after(self._DL_DELAY, self._download_cb, download_iter, ids)
- self._afterid["_download_cb"] = afterid
+ afterid = self.top.after(self._DL_DELAY, self._download_cb,
+ download_iter, ids)
+ self._afterid['_download_cb'] = afterid
def _select(self, id):
for row in range(len(self._table)):
- if self._table[row, "Identifier"] == id:
+ if self._table[row, 'Identifier'] == id:
self._table.select(row)
return
def _color_table(self):
# Color rows according to status.
for row in range(len(self._table)):
- bg, sbg = self._ROW_COLOR[self._table[row, "Status"]]
- fg, sfg = ("black", "white")
- self._table.rowconfig(
- row,
- foreground=fg,
- selectforeground=sfg,
- background=bg,
- selectbackground=sbg,
- )
+ bg, sbg = self._ROW_COLOR[self._table[row, 'Status']]
+ fg, sfg = ('black', 'white')
+ self._table.rowconfig(row, foreground=fg, selectforeground=sfg,
+ background=bg, selectbackground=sbg)
# Color the marked column
- self._table.itemconfigure(
- row, 0, foreground=self._MARK_COLOR[0], background=self._MARK_COLOR[1]
- )
+ self._table.itemconfigure(row, 0,
+ foreground=self._MARK_COLOR[0],
+ background=self._MARK_COLOR[1])
+
def _clear_mark(self, id):
for row in range(len(self._table)):
- if self._table[row, "Identifier"] == id:
- self._table[row, 0] = ""
+ if self._table[row, 'Identifier'] == id:
+ self._table[row, 0] = ''
def _mark_all(self, *e):
for row in range(len(self._table)):
- self._table[row, 0] = "X"
+ self._table[row,0] = 'X'
def _table_mark(self, *e):
selection = self._table.selected_row()
if selection >= 0:
- if self._table[selection][0] != "":
- self._table[selection, 0] = ""
+ if self._table[selection][0] != '':
+ self._table[selection,0] = ''
else:
- self._table[selection, 0] = "X"
+ self._table[selection,0] = 'X'
self._table.select(delta=1)
def _show_log(self):
- text = "\n".join(self._log_messages)
- ShowText(self.top, "NLTK Downloader Log", text)
+ text = '\n'.join(self._log_messages)
+ ShowText(self.top, 'NLTK Downloader Log', text)
def _package_to_columns(self, pkg):
"""
"""
row = []
for column_index, column_name in enumerate(self.COLUMNS):
- if column_index == 0: # Mark:
- row.append("")
- elif column_name == "Identifier":
+ if column_index == 0: # Mark:
+ row.append('')
+ elif column_name == 'Identifier':
row.append(pkg.id)
- elif column_name == "Status":
+ elif column_name == 'Status':
row.append(self._ds.status(pkg))
else:
- attr = column_name.lower().replace(" ", "_")
- row.append(getattr(pkg, attr, "n/a"))
+ attr = column_name.lower().replace(' ', '_')
+ row.append(getattr(pkg, attr, 'n/a'))
return row
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# External Interface
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
def destroy(self, *e):
- if self._destroyed:
- return
+ if self._destroyed: return
self.top.destroy()
self._destroyed = True
def mainloop(self, *args, **kwargs):
self.top.mainloop(*args, **kwargs)
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# HELP
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
- HELP = textwrap.dedent(
- """\
+ HELP = textwrap.dedent("""\
This tool can be used to download a variety of corpora and models
that can be used with NLTK. Each corpus or model is distributed
in a single zip file, known as a \"package file.\" You can
[down]\t Select next package
[left]\t Select previous tab
[right]\t Select next tab
- """
- )
+ """)
def help(self, *e):
# The default font's not very legible; try using 'fixed' instead.
try:
- ShowText(
- self.top,
- "Help: NLTK Dowloader",
- self.HELP.strip(),
- width=75,
- font="fixed",
- )
+ ShowText(self.top, 'Help: NLTK Downloader',
+ self.HELP.strip(), width=75, font='fixed')
except:
- ShowText(self.top, "Help: NLTK Downloader", self.HELP.strip(), width=75)
+ ShowText(self.top, 'Help: NLTK Downloader',
+ self.HELP.strip(), width=75)
def about(self, *e):
- ABOUT = "NLTK Downloader\n" + "Written by Edward Loper"
- TITLE = "About: NLTK Downloader"
+ ABOUT = ("NLTK Downloader\n"+
+ "Written by Edward Loper")
+ TITLE = 'About: NLTK Downloader'
try:
- from tkinter.messagebox import Message
-
+ from six.moves.tkinter_messagebox import Message
Message(message=ABOUT, title=TITLE).show()
except ImportError:
ShowText(self.top, TITLE, ABOUT)
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# Progress Bar
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
_gradient_width = 5
-
def _init_progressbar(self):
c = self._progressbar
- width, height = int(c["width"]), int(c["height"])
- for i in range(0, (int(c["width"]) * 2) // self._gradient_width):
- c.create_line(
- i * self._gradient_width + 20,
- -20,
- i * self._gradient_width - height - 20,
- height + 20,
- width=self._gradient_width,
- fill="#%02x0000" % (80 + abs(i % 6 - 3) * 12),
- )
- c.addtag_all("gradient")
- c.itemconfig("gradient", state="hidden")
+ width, height = int(c['width']), int(c['height'])
+ for i in range(0, (int(c['width'])*2)//self._gradient_width):
+ c.create_line(i*self._gradient_width+20, -20,
+ i*self._gradient_width-height-20, height+20,
+ width=self._gradient_width,
+ fill='#%02x0000' % (80 + abs(i%6-3)*12))
+ c.addtag_all('gradient')
+ c.itemconfig('gradient', state='hidden')
# This is used to display progress
- c.addtag_withtag(
- "redbox", c.create_rectangle(0, 0, 0, 0, fill=self._PROGRESS_COLOR[0])
- )
+ c.addtag_withtag('redbox', c.create_rectangle(
+ 0, 0, 0, 0, fill=self._PROGRESS_COLOR[0]))
def _show_progress(self, percent):
c = self._progressbar
if percent is None:
- c.coords("redbox", 0, 0, 0, 0)
- c.itemconfig("gradient", state="hidden")
+ c.coords('redbox', 0, 0, 0, 0)
+ c.itemconfig('gradient', state='hidden')
else:
- width, height = int(c["width"]), int(c["height"])
+ width, height = int(c['width']), int(c['height'])
x = percent * int(width) // 100 + 1
- c.coords("redbox", 0, 0, x, height + 1)
+ c.coords('redbox', 0, 0, x, height+1)
def _progress_alive(self):
c = self._progressbar
if not self._downloading:
- c.itemconfig("gradient", state="hidden")
+ c.itemconfig('gradient', state='hidden')
else:
- c.itemconfig("gradient", state="normal")
- x1, y1, x2, y2 = c.bbox("gradient")
+ c.itemconfig('gradient', state='normal')
+ x1, y1, x2, y2 = c.bbox('gradient')
if x1 <= -100:
- c.move("gradient", (self._gradient_width * 6) - 4, 0)
+ c.move('gradient', (self._gradient_width*6)-4, 0)
else:
- c.move("gradient", -4, 0)
+ c.move('gradient', -4, 0)
afterid = self.top.after(200, self._progress_alive)
- self._afterid["_progress_alive"] = afterid
+ self._afterid['_progress_alive'] = afterid
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# Threaded downloader
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
def _download_threaded(self, *e):
# If the user tries to start a new download while we're already
return
# Change the 'download' button to an 'abort' button.
- self._download_button["text"] = "Cancel"
+ self._download_button['text'] = 'Cancel'
- marked = [
- self._table[row, "Identifier"]
- for row in range(len(self._table))
- if self._table[row, 0] != ""
- ]
+ marked = [self._table[row, 'Identifier']
+ for row in range(len(self._table))
+ if self._table[row, 0] != '']
selection = self._table.selected_row()
if not marked and selection is not None:
- marked = [self._table[selection, "Identifier"]]
+ marked = [self._table[selection, 'Identifier']]
# Create a new data server object for the download operation,
# just in case the user modifies our data server during the
# Start downloading in a separate thread.
assert self._download_msg_queue == []
assert self._download_abort_queue == []
- self._DownloadThread(
- ds,
- marked,
- self._download_lock,
- self._download_msg_queue,
- self._download_abort_queue,
- ).start()
+ self._DownloadThread(ds, marked, self._download_lock,
+ self._download_msg_queue,
+ self._download_abort_queue).start()
# Monitor the download message queue & display its progress.
self._log_indent = 0
def _abort_download(self):
if self._downloading:
self._download_lock.acquire()
- self._download_abort_queue.append("abort")
+ self._download_abort_queue.append('abort')
self._download_lock.release()
class _DownloadThread(threading.Thread):
self.abort = abort
threading.Thread.__init__(self)
- def run(self):
+ def run(self):
for msg in self.data_server.incr_download(self.items):
self.lock.acquire()
self.message_queue.append(msg)
# Check if we've been told to kill ourselves:
if self.abort:
- self.message_queue.append("aborted")
+ self.message_queue.append('aborted')
self.lock.release()
return
self.lock.release()
self.lock.acquire()
- self.message_queue.append("finished")
+ self.message_queue.append('finished')
self.lock.release()
- _MONITOR_QUEUE_DELAY = 100
-
+ _MONITOR_QUEUE_DELAY=100
def _monitor_message_queue(self):
def show(s):
- self._progresslabel["text"] = s
+ self._progresslabel['text'] = s
self._log(s)
# Try to acquire the lock; if it's busy, then just try again later.
for msg in self._download_msg_queue:
# Done downloading?
- if msg == "finished" or msg == "aborted":
- # self._fill_table(sort=False)
+ if msg == 'finished' or msg == 'aborted':
+ #self._fill_table(sort=False)
self._update_table_status()
self._downloading = False
- self._download_button["text"] = "Download"
+ self._download_button['text'] = 'Download'
del self._download_msg_queue[:]
del self._download_abort_queue[:]
self._download_lock.release()
- if msg == "aborted":
- show("Download aborted!")
+ if msg == 'aborted':
+ show('Download aborted!')
self._show_progress(None)
else:
afterid = self.top.after(100, self._show_progress, None)
- self._afterid["_monitor_message_queue"] = afterid
+ self._afterid['_monitor_message_queue'] = afterid
return
# All other messages
self._select(msg.package.id)
self._show_progress(None)
self._downloading = False
- return # halt progress.
+ return # halt progress.
elif isinstance(msg, StartCollectionMessage):
- show("Downloading collection %r" % msg.collection.id)
+ show('Downloading collection %r' % msg.collection.id)
self._log_indent += 1
elif isinstance(msg, StartPackageMessage):
self._ds.clear_status_cache(msg.package.id)
- show("Downloading package %r" % msg.package.id)
+ show('Downloading package %r' % msg.package.id)
elif isinstance(msg, UpToDateMessage):
- show("Package %s is up-to-date!" % msg.package.id)
- # elif isinstance(msg, StaleMessage):
+ show('Package %s is up-to-date!' % msg.package.id)
+ #elif isinstance(msg, StaleMessage):
# show('Package %s is out-of-date or corrupt; updating it' %
# msg.package.id)
elif isinstance(msg, FinishDownloadMessage):
- show("Finished downloading %r." % msg.package.id)
+ show('Finished downloading %r.' % msg.package.id)
elif isinstance(msg, StartUnzipMessage):
- show("Unzipping %s" % msg.package.filename)
+ show('Unzipping %s' % msg.package.filename)
elif isinstance(msg, FinishUnzipMessage):
- show("Finished installing %s" % msg.package.id)
+ show('Finished installing %s' % msg.package.id)
elif isinstance(msg, FinishCollectionMessage):
self._log_indent -= 1
- show("Finished downloading collection %r." % msg.collection.id)
+ show('Finished downloading collection %r.' % msg.collection.id)
self._clear_mark(msg.collection.id)
elif isinstance(msg, FinishPackageMessage):
self._update_table_status()
# waiting for a good point to abort it, so we don't end up
# with a partially unzipped package or anything like that).
if self._download_abort_queue:
- self._progresslabel["text"] = "Aborting download..."
+ self._progresslabel['text'] = 'Aborting download...'
# Clear the message queue and then release the lock
del self._download_msg_queue[:]
self._download_lock.release()
# Check the queue again after MONITOR_QUEUE_DELAY msec.
- afterid = self.top.after(self._MONITOR_QUEUE_DELAY, self._monitor_message_queue)
- self._afterid["_monitor_message_queue"] = afterid
-
+ afterid = self.top.after(self._MONITOR_QUEUE_DELAY,
+ self._monitor_message_queue)
+ self._afterid['_monitor_message_queue'] = afterid
######################################################################
# Helper Functions
######################################################################
# [xx] It may make sense to move these to nltk.internals.
-
def md5_hexdigest(file):
"""
Calculate and return the MD5 checksum for a given file.
``file`` may either be a filename or an open stream.
"""
- if isinstance(file, str):
- with open(file, "rb") as infile:
+ if isinstance(file, string_types):
+ with open(file, 'rb') as infile:
return _md5_hexdigest(infile)
return _md5_hexdigest(file)
-
def _md5_hexdigest(fp):
md5_digest = md5()
while True:
- block = fp.read(1024 * 16) # 16k blocks
- if not block:
- break
+ block = fp.read(1024*16) # 16k blocks
+ if not block: break
md5_digest.update(block)
return md5_digest.hexdigest()
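A quick equivalence check for the chunked digest above (a sketch; 'corpus.zip' is an illustrative path, and hashlib is the stdlib module the md5 constructor is expected to come from):

    import hashlib
    with open('corpus.zip', 'rb') as f:
        # Chunked and whole-file reads of the same bytes yield the same digest.
        assert md5_hexdigest('corpus.zip') == hashlib.md5(f.read()).hexdigest()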
if isinstance(message, ErrorMessage):
raise Exception(message)
-
def _unzip_iter(filename, root, verbose=True):
if verbose:
- sys.stdout.write("Unzipping %s" % os.path.split(filename)[1])
+ sys.stdout.write('Unzipping %s' % os.path.split(filename)[1])
sys.stdout.flush()
- try:
- zf = zipfile.ZipFile(filename)
+ try: zf = zipfile.ZipFile(filename)
except zipfile.error as e:
- yield ErrorMessage(filename, "Error with downloaded zip file")
+ yield ErrorMessage(filename, 'Error with downloaded zip file')
return
except Exception as e:
yield ErrorMessage(filename, e)
return
- zf.extractall(root)
+ # Get lists of directories & files
+ namelist = zf.namelist()
+ dirlist = set()
+ for x in namelist:
+ if x.endswith('/'):
+ dirlist.add(x)
+ else:
+ dirlist.add(x.rsplit('/',1)[0] + '/')
+ filelist = [x for x in namelist if not x.endswith('/')]
+
+ # Create the target directory if it doesn't exist
+ if not os.path.exists(root):
+ os.mkdir(root)
+
+ # Create the directory structure
+ for dirname in sorted(dirlist):
+ pieces = dirname[:-1].split('/')
+ for i in range(len(pieces)):
+ dirpath = os.path.join(root, *pieces[:i+1])
+ if not os.path.exists(dirpath):
+ os.mkdir(dirpath)
+ # Extract files.
+ for i, filename in enumerate(filelist):
+ filepath = os.path.join(root, *filename.split('/'))
+
+ try:
+ with open(filepath, 'wb') as dstfile, zf.open(filename) as srcfile:
+ shutil.copyfileobj(srcfile, dstfile)
+ except Exception as e:
+ yield ErrorMessage(filename, e)
+ return
+
+ if verbose and (i*10/len(filelist) > (i-1)*10/len(filelist)):
+ sys.stdout.write('.')
+ sys.stdout.flush()
if verbose:
print()
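A consumption sketch for the generator above (paths are illustrative; ErrorMessage is the message class defined earlier in this module):

    for msg in _unzip_iter('/tmp/punkt.zip', '/tmp/nltk_data/corpora', verbose=False):
        if isinstance(msg, ErrorMessage):
            print('unzip failed: %s' % msg.message)
            break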
-
######################################################################
# Index Builder
######################################################################
# This may move to a different file sometime.
-
+import subprocess, zipfile
def build_index(root, base_url):
"""
"""
# Find all packages.
packages = []
- for pkg_xml, zf, subdir in _find_packages(os.path.join(root, "packages")):
+ for pkg_xml, zf, subdir in _find_packages(os.path.join(root, 'packages')):
zipstat = os.stat(zf.filename)
- url = "%s/%s/%s" % (base_url, subdir, os.path.split(zf.filename)[1])
+ url = '%s/%s/%s' % (base_url, subdir, os.path.split(zf.filename)[1])
unzipped_size = sum(zf_info.file_size for zf_info in zf.infolist())
# Fill in several fields of the package xml with calculated values.
- pkg_xml.set("unzipped_size", "%s" % unzipped_size)
- pkg_xml.set("size", "%s" % zipstat.st_size)
- pkg_xml.set("checksum", "%s" % md5_hexdigest(zf.filename))
- pkg_xml.set("subdir", subdir)
- # pkg_xml.set('svn_revision', _svn_revision(zf.filename))
- if not pkg_xml.get("url"):
- pkg_xml.set("url", url)
+ pkg_xml.set('unzipped_size', '%s' % unzipped_size)
+ pkg_xml.set('size', '%s' % zipstat.st_size)
+ pkg_xml.set('checksum', '%s' % md5_hexdigest(zf.filename))
+ pkg_xml.set('subdir', subdir)
+ #pkg_xml.set('svn_revision', _svn_revision(zf.filename))
+ if not pkg_xml.get('url'):
+ pkg_xml.set('url', url)
# Record the package.
packages.append(pkg_xml)
# Find all collections
- collections = list(_find_collections(os.path.join(root, "collections")))
+ collections = list(_find_collections(os.path.join(root, 'collections')))
# Check that all UIDs are unique
uids = set()
- for item in packages + collections:
- if item.get("id") in uids:
- raise ValueError("Duplicate UID: %s" % item.get("id"))
- uids.add(item.get("id"))
+ for item in packages+collections:
+ if item.get('id') in uids:
+ raise ValueError('Duplicate UID: %s' % item.get('id'))
+ uids.add(item.get('id'))
# Put it all together
- top_elt = ElementTree.Element("nltk_data")
- top_elt.append(ElementTree.Element("packages"))
- for package in packages:
- top_elt[0].append(package)
- top_elt.append(ElementTree.Element("collections"))
- for collection in collections:
- top_elt[1].append(collection)
+ top_elt = ElementTree.Element('nltk_data')
+ top_elt.append(ElementTree.Element('packages'))
+ for package in packages: top_elt[0].append(package)
+ top_elt.append(ElementTree.Element('collections'))
+ for collection in collections: top_elt[1].append(collection)
_indent_xml(top_elt)
return top_elt
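# Example (sketch): regenerating an index file from a local data tree; the
# root path and URL here are hypothetical.
#
#     >>> from xml.etree import ElementTree                              # doctest: +SKIP
#     >>> index = build_index('/var/nltk_data', 'https://example.com/nltk_data')
#     >>> with open('index.xml', 'wb') as out:
#     ...     out.write(ElementTree.tostring(index))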
-
-def _indent_xml(xml, prefix=""):
+def _indent_xml(xml, prefix=''):
"""
    Helper for ``build_index()``: Given an XML ``ElementTree``, modify
    its (and its descendants') ``text`` and ``tail`` attributes to
    generate an indented tree, where each nested element is indented by 2
    spaces with respect to its parent.
"""
if len(xml) > 0:
- xml.text = (xml.text or "").strip() + "\n" + prefix + " "
+ xml.text = (xml.text or '').strip() + '\n' + prefix + ' '
for child in xml:
- _indent_xml(child, prefix + " ")
+ _indent_xml(child, prefix+' ')
for child in xml[:-1]:
- child.tail = (child.tail or "").strip() + "\n" + prefix + " "
- xml[-1].tail = (xml[-1].tail or "").strip() + "\n" + prefix
-
+ child.tail = (child.tail or '').strip() + '\n' + prefix + ' '
+ xml[-1].tail = (xml[-1].tail or '').strip() + '\n' + prefix
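# Example (sketch): ``_indent_xml`` mutates the tree in place, so that
# serializing it afterwards yields one element per line.
#
#     >>> elt = ElementTree.fromstring('<a><b/><c/></a>')  # doctest: +SKIP
#     >>> _indent_xml(elt)
#     >>> ElementTree.tostring(elt)  # now contains newlines and indentation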
def _check_package(pkg_xml, zipfilename, zf):
"""
"""
    # The filename must match the id given in the XML file.
uid = os.path.splitext(os.path.split(zipfilename)[1])[0]
- if pkg_xml.get("id") != uid:
- raise ValueError(
- "package identifier mismatch (%s vs %s)" % (pkg_xml.get("id"), uid)
- )
+ if pkg_xml.get('id') != uid:
+ raise ValueError('package identifier mismatch (%s vs %s)' %
+ (pkg_xml.get('id'), uid))
# Zip file must expand to a subdir whose name matches uid.
- if sum((name != uid and not name.startswith(uid + "/")) for name in zf.namelist()):
- raise ValueError(
- "Zipfile %s.zip does not expand to a single "
- "subdirectory %s/" % (uid, uid)
- )
-
+ if sum( (name!=uid and not name.startswith(uid+'/'))
+ for name in zf.namelist() ):
+ raise ValueError('Zipfile %s.zip does not expand to a single '
+ 'subdirectory %s/' % (uid, uid))
# update for git?
def _svn_revision(filename):
    """
    Helper for ``build_index()``: Calculate the subversion revision
number for a given file (by using ``subprocess`` to run ``svn``).
"""
- p = subprocess.Popen(
- ["svn", "status", "-v", filename],
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE,
- )
+ p = subprocess.Popen(['svn', 'status', '-v', filename],
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE)
(stdout, stderr) = p.communicate()
if p.returncode != 0 or stderr or not stdout:
- raise ValueError(
- "Error determining svn_revision for %s: %s"
- % (os.path.split(filename)[1], textwrap.fill(stderr))
- )
+ raise ValueError('Error determining svn_revision for %s: %s' %
+ (os.path.split(filename)[1], textwrap.fill(stderr)))
return stdout.split()[2]
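# A minimal git-based equivalent, in case the data tree moves off of
# subversion (see the "update for git?" note above); ``_git_revision``
# is a hypothetical helper, not part of NLTK.
def _git_revision(filename):
    p = subprocess.Popen(['git', 'log', '-n', '1', '--format=%H', '--', filename],
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    (stdout, stderr) = p.communicate()
    if p.returncode != 0 or not stdout:
        raise ValueError('Error determining git revision for %s' %
                         os.path.split(filename)[1])
    # The commit hash is the only token on the output line.
    return stdout.split()[0]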
-
def _find_collections(root):
"""
    Helper for ``build_index()``: Yield a list of ElementTree.Element
    objects, one for each collection XML file found under ``root``.
    """
packages = []
for dirname, subdirs, files in os.walk(root):
for filename in files:
- if filename.endswith(".xml"):
+ if filename.endswith('.xml'):
xmlfile = os.path.join(dirname, filename)
yield ElementTree.parse(xmlfile).getroot()
-
def _find_packages(root):
"""
    Helper for ``build_index()``: Yield a list of tuples
    ``(pkg_xml, zf, subdir)``, where ``pkg_xml`` is the parsed package
    xml, ``zf`` is the package's ``zipfile.ZipFile``, and ``subdir``
    is the subdirectory (relative to ``root``) in which
    the package was found (e.g. 'corpora' or 'grammars').
"""
from nltk.corpus.reader.util import _path_from
-
# Find all packages.
packages = []
for dirname, subdirs, files in os.walk(root):
- relpath = "/".join(_path_from(root, dirname))
+ relpath = '/'.join(_path_from(root, dirname))
for filename in files:
- if filename.endswith(".xml"):
+ if filename.endswith('.xml'):
xmlfilename = os.path.join(dirname, filename)
- zipfilename = xmlfilename[:-4] + ".zip"
- try:
- zf = zipfile.ZipFile(zipfilename)
+ zipfilename = xmlfilename[:-4]+'.zip'
+ try: zf = zipfile.ZipFile(zipfilename)
except Exception as e:
- raise ValueError("Error reading file %r!\n%s" % (zipfilename, e))
- try:
- pkg_xml = ElementTree.parse(xmlfilename).getroot()
+ raise ValueError('Error reading file %r!\n%s' %
+ (zipfilename, e))
+ try: pkg_xml = ElementTree.parse(xmlfilename).getroot()
except Exception as e:
- raise ValueError("Error reading file %r!\n%s" % (xmlfilename, e))
+ raise ValueError('Error reading file %r!\n%s' %
+ (xmlfilename, e))
# Check that the UID matches the filename
uid = os.path.split(xmlfilename[:-4])[1]
- if pkg_xml.get("id") != uid:
- raise ValueError(
- "package identifier mismatch (%s "
- "vs %s)" % (pkg_xml.get("id"), uid)
- )
+ if pkg_xml.get('id') != uid:
+ raise ValueError('package identifier mismatch (%s '
+ 'vs %s)' % (pkg_xml.get('id'), uid))
# Check that the zipfile expands to a subdir whose
# name matches the uid.
- if sum(
- (name != uid and not name.startswith(uid + "/"))
- for name in zf.namelist()
- ):
- raise ValueError(
- "Zipfile %s.zip does not expand to a "
- "single subdirectory %s/" % (uid, uid)
- )
+ if sum( (name!=uid and not name.startswith(uid+'/'))
+ for name in zf.namelist() ):
+ raise ValueError('Zipfile %s.zip does not expand to a '
+ 'single subdirectory %s/' % (uid, uid))
yield pkg_xml, zf, relpath
# Don't recurse into svn subdirectories:
- try:
- subdirs.remove(".svn")
- except ValueError:
- pass
-
+ try: subdirs.remove('.svn')
+ except ValueError: pass
######################################################################
# Main:
######################################################################
_downloader = Downloader()
download = _downloader.download
-
def download_shell():
DownloaderShell(_downloader).run()
-
def download_gui():
DownloaderGUI(_downloader).mainloop()
-
def update():
_downloader.update()
-
-if __name__ == "__main__":
+if __name__ == '__main__':
from optparse import OptionParser
-
parser = OptionParser()
- parser.add_option(
- "-d",
- "--dir",
- dest="dir",
- help="download package to directory DIR",
- metavar="DIR",
- )
- parser.add_option(
- "-q",
- "--quiet",
- dest="quiet",
- action="store_true",
- default=False,
- help="work quietly",
- )
- parser.add_option(
- "-f",
- "--force",
- dest="force",
- action="store_true",
- default=False,
- help="download even if already installed",
- )
- parser.add_option(
- "-e",
- "--exit-on-error",
- dest="halt_on_error",
- action="store_true",
- default=False,
- help="exit if an error occurs",
- )
- parser.add_option(
- "-u",
- "--url",
- dest="server_index_url",
- default=os.environ.get("NLTK_DOWNLOAD_URL"),
- help="download server index url",
- )
+ parser.add_option("-d", "--dir", dest="dir",
+ help="download package to directory DIR", metavar="DIR")
+ parser.add_option("-q", "--quiet", dest="quiet", action="store_true",
+ default=False, help="work quietly")
+ parser.add_option("-f", "--force", dest="force", action="store_true",
+ default=False, help="download even if already installed")
+ parser.add_option("-e", "--exit-on-error", dest="halt_on_error", action="store_true",
+ default=False, help="exit if an error occurs")
+ parser.add_option("-u", "--url", dest="server_index_url",
+ default=os.environ.get('NLTK_DOWNLOAD_URL'),
+ help="download server index url")
(options, args) = parser.parse_args()
- downloader = Downloader(server_index_url=options.server_index_url)
+ downloader = Downloader(server_index_url = options.server_index_url)
if args:
for pkg_id in args:
- rv = downloader.download(
- info_or_id=pkg_id,
- download_dir=options.dir,
- quiet=options.quiet,
- force=options.force,
- halt_on_error=options.halt_on_error,
- )
- if rv == False and options.halt_on_error:
+ rv = downloader.download(info_or_id=pkg_id, download_dir=options.dir,
+ quiet=options.quiet, force=options.force,
+ halt_on_error=options.halt_on_error)
+ if rv==False and options.halt_on_error:
break
else:
- downloader.download(
- download_dir=options.dir,
- quiet=options.quiet,
- force=options.force,
- halt_on_error=options.halt_on_error,
- )
+ downloader.download(download_dir=options.dir,
+ quiet=options.quiet, force=options.force,
+ halt_on_error=options.halt_on_error)
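# Example: typical command-line invocations of this module (package ids
# and paths are illustrative):
#
#     python -m nltk.downloader punkt
#     python -m nltk.downloader -d /tmp/nltk_data -q all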
# Natural Language Toolkit: graphical representations package
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# Import Tkinter-based modules if Tkinter is installed
try:
- import tkinter
+ from six.moves import tkinter
except ImportError:
import warnings
-
- warnings.warn("nltk.draw package not loaded " "(please install Tkinter library).")
+ warnings.warn("nltk.draw package not loaded "
+ "(please install Tkinter library).")
else:
from nltk.draw.cfg import ProductionList, CFGEditor, CFGDemo
- from nltk.draw.tree import (
- TreeSegmentWidget,
- tree_to_treesegment,
- TreeWidget,
- TreeView,
- draw_trees,
- )
+ from nltk.draw.tree import (TreeSegmentWidget, tree_to_treesegment,
+ TreeWidget, TreeView, draw_trees)
from nltk.draw.table import Table
from nltk.draw.dispersion import dispersion_plot
# skip doctests from this package
def setup_module(module):
from nose import SkipTest
-
raise SkipTest("nltk.draw examples are not doctests")
# Natural Language Toolkit: CFG visualization
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
import re
-from tkinter import (
- Button,
- Canvas,
- Entry,
- Frame,
- IntVar,
- Label,
- Scrollbar,
- Text,
- Tk,
- Toplevel,
-)
-
-from nltk.grammar import CFG, _read_cfg_production, Nonterminal, nonterminals
+from six import string_types
+from six.moves.tkinter import (Button, Canvas, Entry, Frame, IntVar, Label,
+ Scrollbar, Text, Tk, Toplevel)
+
+from nltk.grammar import (CFG, _read_cfg_production,
+ Nonterminal, nonterminals)
from nltk.tree import Tree
from nltk.draw.tree import TreeSegmentWidget, tree_to_treesegment
-from nltk.draw.util import (
- CanvasFrame,
- ColorizedList,
- ShowText,
- SymbolWidget,
- TextWidget,
-)
+from nltk.draw.util import (CanvasFrame, ColorizedList, ShowText,
+ SymbolWidget, TextWidget)
######################################################################
# Production List
######################################################################
-
class ProductionList(ColorizedList):
- ARROW = SymbolWidget.SYMBOLS["rightarrow"]
+ ARROW = SymbolWidget.SYMBOLS['rightarrow']
def _init_colortags(self, textwidget, options):
- textwidget.tag_config("terminal", foreground="#006000")
- textwidget.tag_config("arrow", font="symbol", underline="0")
- textwidget.tag_config(
- "nonterminal", foreground="blue", font=("helvetica", -12, "bold")
- )
+ textwidget.tag_config('terminal', foreground='#006000')
+ textwidget.tag_config('arrow', font='symbol', underline='0')
+ textwidget.tag_config('nonterminal', foreground='blue',
+ font=('helvetica', -12, 'bold'))
def _item_repr(self, item):
contents = []
- contents.append(("%s\t" % item.lhs(), "nonterminal"))
- contents.append((self.ARROW, "arrow"))
+ contents.append(('%s\t' % item.lhs(), 'nonterminal'))
+ contents.append((self.ARROW, 'arrow'))
for elt in item.rhs():
if isinstance(elt, Nonterminal):
- contents.append((" %s" % elt.symbol(), "nonterminal"))
+ contents.append((' %s' % elt.symbol(), 'nonterminal'))
else:
- contents.append((" %r" % elt, "terminal"))
+ contents.append((' %r' % elt, 'terminal'))
return contents
-
######################################################################
# CFG Editor
######################################################################
"""
-
class CFGEditor(object):
"""
    A dialog window for creating and editing context free grammars.
    ``CFGEditor`` imposes the following restrictions:

    - All nonterminals must be strings consisting of word characters.
    - All terminals must be strings consisting of word characters
      and space characters.
"""
-
# Regular expressions used by _analyze_line. Precompile them, so
# we can process the text faster.
- ARROW = SymbolWidget.SYMBOLS["rightarrow"]
- _LHS_RE = re.compile(r"(^\s*\w+\s*)(->|(" + ARROW + "))")
- _ARROW_RE = re.compile("\s*(->|(" + ARROW + "))\s*")
- _PRODUCTION_RE = re.compile(
- r"(^\s*\w+\s*)"
- + "(->|(" # LHS
- + ARROW
- + "))\s*"
- + r"((\w+|'[\w ]*'|\"[\w ]*\"|\|)\s*)*$" # arrow
- ) # RHS
- _TOKEN_RE = re.compile("\\w+|->|'[\\w ]+'|\"[\\w ]+\"|(" + ARROW + ")")
- _BOLD = ("helvetica", -12, "bold")
+ ARROW = SymbolWidget.SYMBOLS['rightarrow']
+ _LHS_RE = re.compile(r"(^\s*\w+\s*)(->|("+ARROW+"))")
+ _ARROW_RE = re.compile("\s*(->|("+ARROW+"))\s*")
+ _PRODUCTION_RE = re.compile(r"(^\s*\w+\s*)" + # LHS
+ "(->|("+ARROW+"))\s*" + # arrow
+ r"((\w+|'[\w ]*'|\"[\w ]*\"|\|)\s*)*$") # RHS
+ _TOKEN_RE = re.compile("\\w+|->|'[\\w ]+'|\"[\\w ]+\"|("+ARROW+")")
+ _BOLD = ('helvetica', -12, 'bold')
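    # Example (sketch): how ``_PRODUCTION_RE`` classifies a line; the
    # strings here are illustrative.
    #
    #     >>> bool(CFGEditor._PRODUCTION_RE.match("NP -> Det N | 'I'"))  # doctest: +SKIP
    #     True
    #     >>> bool(CFGEditor._PRODUCTION_RE.match("NP Det N"))           # doctest: +SKIP
    #     False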
def __init__(self, parent, cfg=None, set_cfg_callback=None):
self._parent = parent
- if cfg is not None:
- self._cfg = cfg
- else:
- self._cfg = CFG(Nonterminal("S"), [])
+ if cfg is not None: self._cfg = cfg
+ else: self._cfg = CFG(Nonterminal('S'), [])
self._set_cfg_callback = set_cfg_callback
self._highlight_matching_nonterminals = 1
self._init_bindings()
self._init_startframe()
- self._startframe.pack(side="top", fill="x", expand=0)
+ self._startframe.pack(side='top', fill='x', expand=0)
self._init_prodframe()
- self._prodframe.pack(side="top", fill="both", expand=1)
+ self._prodframe.pack(side='top', fill='both', expand=1)
self._init_buttons()
- self._buttonframe.pack(side="bottom", fill="x", expand=0)
+ self._buttonframe.pack(side='bottom', fill='x', expand=0)
self._textwidget.focus()
def _init_startframe(self):
frame = self._startframe = Frame(self._top)
self._start = Entry(frame)
- self._start.pack(side="right")
- Label(frame, text="Start Symbol:").pack(side="right")
- Label(frame, text="Productions:").pack(side="left")
+ self._start.pack(side='right')
+ Label(frame, text='Start Symbol:').pack(side='right')
+ Label(frame, text='Productions:').pack(side='left')
self._start.insert(0, self._cfg.start().symbol())
def _init_buttons(self):
frame = self._buttonframe = Frame(self._top)
- Button(frame, text="Ok", command=self._ok, underline=0, takefocus=0).pack(
- side="left"
- )
- Button(frame, text="Apply", command=self._apply, underline=0, takefocus=0).pack(
- side="left"
- )
- Button(frame, text="Reset", command=self._reset, underline=0, takefocus=0).pack(
- side="left"
- )
- Button(
- frame, text="Cancel", command=self._cancel, underline=0, takefocus=0
- ).pack(side="left")
- Button(frame, text="Help", command=self._help, underline=0, takefocus=0).pack(
- side="right"
- )
+ Button(frame, text='Ok', command=self._ok,
+ underline=0, takefocus=0).pack(side='left')
+ Button(frame, text='Apply', command=self._apply,
+ underline=0, takefocus=0).pack(side='left')
+ Button(frame, text='Reset', command=self._reset,
+ underline=0, takefocus=0,).pack(side='left')
+ Button(frame, text='Cancel', command=self._cancel,
+ underline=0, takefocus=0).pack(side='left')
+ Button(frame, text='Help', command=self._help,
+ underline=0, takefocus=0).pack(side='right')
def _init_bindings(self):
- self._top.title("CFG Editor")
- self._top.bind("<Control-q>", self._cancel)
- self._top.bind("<Alt-q>", self._cancel)
- self._top.bind("<Control-d>", self._cancel)
- # self._top.bind('<Control-x>', self._cancel)
- self._top.bind("<Alt-x>", self._cancel)
- self._top.bind("<Escape>", self._cancel)
- # self._top.bind('<Control-c>', self._cancel)
- self._top.bind("<Alt-c>", self._cancel)
-
- self._top.bind("<Control-o>", self._ok)
- self._top.bind("<Alt-o>", self._ok)
- self._top.bind("<Control-a>", self._apply)
- self._top.bind("<Alt-a>", self._apply)
- self._top.bind("<Control-r>", self._reset)
- self._top.bind("<Alt-r>", self._reset)
- self._top.bind("<Control-h>", self._help)
- self._top.bind("<Alt-h>", self._help)
- self._top.bind("<F1>", self._help)
+ self._top.title('CFG Editor')
+ self._top.bind('<Control-q>', self._cancel)
+ self._top.bind('<Alt-q>', self._cancel)
+ self._top.bind('<Control-d>', self._cancel)
+ #self._top.bind('<Control-x>', self._cancel)
+ self._top.bind('<Alt-x>', self._cancel)
+ self._top.bind('<Escape>', self._cancel)
+ #self._top.bind('<Control-c>', self._cancel)
+ self._top.bind('<Alt-c>', self._cancel)
+
+ self._top.bind('<Control-o>', self._ok)
+ self._top.bind('<Alt-o>', self._ok)
+ self._top.bind('<Control-a>', self._apply)
+ self._top.bind('<Alt-a>', self._apply)
+ self._top.bind('<Control-r>', self._reset)
+ self._top.bind('<Alt-r>', self._reset)
+ self._top.bind('<Control-h>', self._help)
+ self._top.bind('<Alt-h>', self._help)
+ self._top.bind('<F1>', self._help)
def _init_prodframe(self):
self._prodframe = Frame(self._top)
# Create the basic Text widget & scrollbar.
- self._textwidget = Text(
- self._prodframe, background="#e0e0e0", exportselection=1
- )
- self._textscroll = Scrollbar(self._prodframe, takefocus=0, orient="vertical")
- self._textwidget.config(yscrollcommand=self._textscroll.set)
+ self._textwidget = Text(self._prodframe, background='#e0e0e0',
+ exportselection=1)
+ self._textscroll = Scrollbar(self._prodframe, takefocus=0,
+ orient='vertical')
+ self._textwidget.config(yscrollcommand = self._textscroll.set)
self._textscroll.config(command=self._textwidget.yview)
- self._textscroll.pack(side="right", fill="y")
- self._textwidget.pack(expand=1, fill="both", side="left")
+ self._textscroll.pack(side='right', fill='y')
+ self._textwidget.pack(expand=1, fill='both', side='left')
# Initialize the colorization tags. Each nonterminal gets its
# own tag, so they aren't listed here.
- self._textwidget.tag_config("terminal", foreground="#006000")
- self._textwidget.tag_config("arrow", font="symbol")
- self._textwidget.tag_config("error", background="red")
+ self._textwidget.tag_config('terminal', foreground='#006000')
+ self._textwidget.tag_config('arrow', font='symbol')
+ self._textwidget.tag_config('error', background='red')
# Keep track of what line they're on. We use that to remember
# to re-analyze a line whenever they leave it.
self._linenum = 0
# Expand "->" to an arrow.
- self._top.bind(">", self._replace_arrows)
+ self._top.bind('>', self._replace_arrows)
# Re-colorize lines when appropriate.
- self._top.bind("<<Paste>>", self._analyze)
- self._top.bind("<KeyPress>", self._check_analyze)
- self._top.bind("<ButtonPress>", self._check_analyze)
+ self._top.bind('<<Paste>>', self._analyze)
+ self._top.bind('<KeyPress>', self._check_analyze)
+ self._top.bind('<ButtonPress>', self._check_analyze)
# Tab cycles focus. (why doesn't this work??)
def cycle(e, textwidget=self._textwidget):
textwidget.tk_focusNext().focus()
-
- self._textwidget.bind("<Tab>", cycle)
-
- prod_tuples = [(p.lhs(), [p.rhs()]) for p in self._cfg.productions()]
- for i in range(len(prod_tuples) - 1, 0, -1):
- if prod_tuples[i][0] == prod_tuples[i - 1][0]:
- if () in prod_tuples[i][1]:
- continue
- if () in prod_tuples[i - 1][1]:
- continue
- print(prod_tuples[i - 1][1])
+ self._textwidget.bind('<Tab>', cycle)
+
+ prod_tuples = [(p.lhs(),[p.rhs()]) for p in self._cfg.productions()]
+ for i in range(len(prod_tuples)-1,0,-1):
+ if (prod_tuples[i][0] == prod_tuples[i-1][0]):
+ if () in prod_tuples[i][1]: continue
+ if () in prod_tuples[i-1][1]: continue
+ print(prod_tuples[i-1][1])
print(prod_tuples[i][1])
- prod_tuples[i - 1][1].extend(prod_tuples[i][1])
+ prod_tuples[i-1][1].extend(prod_tuples[i][1])
del prod_tuples[i]
for lhs, rhss in prod_tuples:
print(lhs, rhss)
- s = "%s ->" % lhs
+ s = '%s ->' % lhs
for rhs in rhss:
for elt in rhs:
- if isinstance(elt, Nonterminal):
- s += " %s" % elt
- else:
- s += " %r" % elt
- s += " |"
- s = s[:-2] + "\n"
- self._textwidget.insert("end", s)
+ if isinstance(elt, Nonterminal): s += ' %s' % elt
+ else: s += ' %r' % elt
+ s += ' |'
+ s = s[:-2] + '\n'
+ self._textwidget.insert('end', s)
self._analyze()
-        # # Add the productions to the text widget, and colorize them.
- # prod_by_lhs = {}
- # for prod in self._cfg.productions():
- # if len(prod.rhs()) > 0:
- # prod_by_lhs.setdefault(prod.lhs(),[]).append(prod)
- # for (lhs, prods) in prod_by_lhs.items():
- # self._textwidget.insert('end', '%s ->' % lhs)
- # self._textwidget.insert('end', self._rhs(prods[0]))
- # for prod in prods[1:]:
- # print '\t|'+self._rhs(prod),
- # self._textwidget.insert('end', '\t|'+self._rhs(prod))
- # print
- # self._textwidget.insert('end', '\n')
- # for prod in self._cfg.productions():
- # if len(prod.rhs()) == 0:
- # self._textwidget.insert('end', '%s' % prod)
- # self._analyze()
-
- # def _rhs(self, prod):
- # s = ''
- # for elt in prod.rhs():
- # if isinstance(elt, Nonterminal): s += ' %s' % elt.symbol()
- # else: s += ' %r' % elt
- # return s
+# # Add the productions to the text widget, and colorize them.
+# prod_by_lhs = {}
+# for prod in self._cfg.productions():
+# if len(prod.rhs()) > 0:
+# prod_by_lhs.setdefault(prod.lhs(),[]).append(prod)
+# for (lhs, prods) in prod_by_lhs.items():
+# self._textwidget.insert('end', '%s ->' % lhs)
+# self._textwidget.insert('end', self._rhs(prods[0]))
+# for prod in prods[1:]:
+# print '\t|'+self._rhs(prod),
+# self._textwidget.insert('end', '\t|'+self._rhs(prod))
+# print
+# self._textwidget.insert('end', '\n')
+# for prod in self._cfg.productions():
+# if len(prod.rhs()) == 0:
+# self._textwidget.insert('end', '%s' % prod)
+# self._analyze()
+
+# def _rhs(self, prod):
+# s = ''
+# for elt in prod.rhs():
+# if isinstance(elt, Nonterminal): s += ' %s' % elt.symbol()
+# else: s += ' %r' % elt
+# return s
def _clear_tags(self, linenum):
"""
Remove all tags (except ``arrow`` and ``sel``) from the given
line of the text widget used for editing the productions.
"""
- start = "%d.0" % linenum
- end = "%d.end" % linenum
+ start = '%d.0'%linenum
+ end = '%d.end'%linenum
for tag in self._textwidget.tag_names():
- if tag not in ("arrow", "sel"):
+ if tag not in ('arrow', 'sel'):
self._textwidget.tag_remove(tag, start, end)
    def _check_analyze(self, *e):
        """
        Check if we've moved to a new line.  If we have, then remove
        all colorization from the line we moved to, and re-colorize
the line that we moved from.
"""
- linenum = int(self._textwidget.index("insert").split(".")[0])
+ linenum = int(self._textwidget.index('insert').split('.')[0])
if linenum != self._linenum:
self._clear_tags(linenum)
self._analyze_line(self._linenum)
            self._linenum = linenum

    def _replace_arrows(self, *e):
        """
        Replace any ``->`` text strings with arrows (rendered in the
        symbol font). This searches the whole buffer, but is fast
enough to be done anytime they press '>'.
"""
- arrow = "1.0"
+ arrow = '1.0'
while True:
- arrow = self._textwidget.search("->", arrow, "end+1char")
- if arrow == "":
- break
- self._textwidget.delete(arrow, arrow + "+2char")
- self._textwidget.insert(arrow, self.ARROW, "arrow")
- self._textwidget.insert(arrow, "\t")
-
- arrow = "1.0"
+ arrow = self._textwidget.search('->', arrow, 'end+1char')
+ if arrow == '': break
+ self._textwidget.delete(arrow, arrow+'+2char')
+ self._textwidget.insert(arrow, self.ARROW, 'arrow')
+ self._textwidget.insert(arrow, '\t')
+
+ arrow = '1.0'
while True:
- arrow = self._textwidget.search(self.ARROW, arrow + "+1char", "end+1char")
- if arrow == "":
- break
- self._textwidget.tag_add("arrow", arrow, arrow + "+1char")
+ arrow = self._textwidget.search(self.ARROW, arrow+'+1char',
+ 'end+1char')
+ if arrow == '': break
+ self._textwidget.tag_add('arrow', arrow, arrow+'+1char')
def _analyze_token(self, match, linenum):
"""
the line).
"""
# What type of token is it?
- if match.group()[0] in "'\"":
- tag = "terminal"
- elif match.group() in ("->", self.ARROW):
- tag = "arrow"
+ if match.group()[0] in "'\"": tag = 'terminal'
+ elif match.group() in ('->', self.ARROW): tag = 'arrow'
else:
# If it's a nonterminal, then set up new bindings, so we
# can highlight all instances of that nonterminal when we
# put the mouse over it.
- tag = "nonterminal_" + match.group()
+ tag = 'nonterminal_'+match.group()
if tag not in self._textwidget.tag_names():
self._init_nonterminal_tag(tag)
- start = "%d.%d" % (linenum, match.start())
- end = "%d.%d" % (linenum, match.end())
+ start = '%d.%d' % (linenum, match.start())
+ end = '%d.%d' % (linenum, match.end())
self._textwidget.tag_add(tag, start, end)
- def _init_nonterminal_tag(self, tag, foreground="blue"):
- self._textwidget.tag_config(tag, foreground=foreground, font=CFGEditor._BOLD)
+ def _init_nonterminal_tag(self, tag, foreground='blue'):
+ self._textwidget.tag_config(tag, foreground=foreground,
+ font=CFGEditor._BOLD)
if not self._highlight_matching_nonterminals:
return
-
def enter(e, textwidget=self._textwidget, tag=tag):
- textwidget.tag_config(tag, background="#80ff80")
-
+ textwidget.tag_config(tag, background='#80ff80')
def leave(e, textwidget=self._textwidget, tag=tag):
- textwidget.tag_config(tag, background="")
-
- self._textwidget.tag_bind(tag, "<Enter>", enter)
- self._textwidget.tag_bind(tag, "<Leave>", leave)
+ textwidget.tag_config(tag, background='')
+ self._textwidget.tag_bind(tag, '<Enter>', enter)
+ self._textwidget.tag_bind(tag, '<Leave>', leave)
def _analyze_line(self, linenum):
"""
self._clear_tags(linenum)
        # Get the line's text string.
- line = self._textwidget.get(repr(linenum) + ".0", repr(linenum) + ".end")
+ line = self._textwidget.get(repr(linenum)+'.0', repr(linenum)+'.end')
# If it's a valid production, then colorize each token.
if CFGEditor._PRODUCTION_RE.match(line):
            # Find each token with a regexp sub, and call analyze_token on it.
def analyze_token(match, self=self, linenum=linenum):
self._analyze_token(match, linenum)
- return ""
-
+ return ''
CFGEditor._TOKEN_RE.sub(analyze_token, line)
- elif line.strip() != "":
+ elif line.strip() != '':
# It's invalid; show the user where the error is.
self._mark_error(linenum, line)
    def _mark_error(self, linenum, line):
        """
        Mark the location of an error in a line.
        """
        arrowmatch = CFGEditor._ARROW_RE.search(line)
if not arrowmatch:
# If there's no arrow at all, highlight the whole line.
- start = "%d.0" % linenum
- end = "%d.end" % linenum
+ start = '%d.0' % linenum
+ end = '%d.end' % linenum
elif not CFGEditor._LHS_RE.match(line):
# Otherwise, if the LHS is bad, highlight it.
- start = "%d.0" % linenum
- end = "%d.%d" % (linenum, arrowmatch.start())
+ start = '%d.0' % linenum
+ end = '%d.%d' % (linenum, arrowmatch.start())
else:
# Otherwise, highlight the RHS.
- start = "%d.%d" % (linenum, arrowmatch.end())
- end = "%d.end" % linenum
+ start = '%d.%d' % (linenum, arrowmatch.end())
+ end = '%d.end' % linenum
# If we're highlighting 0 chars, highlight the whole line.
- if self._textwidget.compare(start, "==", end):
- start = "%d.0" % linenum
- end = "%d.end" % linenum
- self._textwidget.tag_add("error", start, end)
+ if self._textwidget.compare(start, '==', end):
+ start = '%d.0' % linenum
+ end = '%d.end' % linenum
+ self._textwidget.tag_add('error', start, end)
def _analyze(self, *e):
"""
Replace ``->`` with arrows, and colorize the entire buffer.
"""
self._replace_arrows()
- numlines = int(self._textwidget.index("end").split(".")[0])
- for linenum in range(1, numlines + 1): # line numbers start at 1.
+ numlines = int(self._textwidget.index('end').split('.')[0])
+ for linenum in range(1, numlines+1): # line numbers start at 1.
self._analyze_line(linenum)
def _parse_productions(self):
productions = []
# Get the text, normalize it, and split it into lines.
- text = self._textwidget.get("1.0", "end")
- text = re.sub(self.ARROW, "->", text)
- text = re.sub("\t", " ", text)
- lines = text.split("\n")
+ text = self._textwidget.get('1.0', 'end')
+ text = re.sub(self.ARROW, '->', text)
+ text = re.sub('\t', ' ', text)
+ lines = text.split('\n')
# Convert each line to a CFG production
for line in lines:
line = line.strip()
- if line == "":
- continue
+ if line=='': continue
productions += _read_cfg_production(line)
- # if line.strip() == '': continue
- # if not CFGEditor._PRODUCTION_RE.match(line):
+ #if line.strip() == '': continue
+ #if not CFGEditor._PRODUCTION_RE.match(line):
# raise ValueError('Bad production string %r' % line)
#
- # (lhs_str, rhs_str) = line.split('->')
- # lhs = Nonterminal(lhs_str.strip())
- # rhs = []
- # def parse_token(match, rhs=rhs):
+ #(lhs_str, rhs_str) = line.split('->')
+ #lhs = Nonterminal(lhs_str.strip())
+ #rhs = []
+ #def parse_token(match, rhs=rhs):
# token = match.group()
# if token[0] in "'\"": rhs.append(token[1:-1])
# else: rhs.append(Nonterminal(token))
# return ''
- # CFGEditor._TOKEN_RE.sub(parse_token, rhs_str)
+ #CFGEditor._TOKEN_RE.sub(parse_token, rhs_str)
#
- # productions.append(Production(lhs, *rhs))
+ #productions.append(Production(lhs, *rhs))
return productions
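    # Example (sketch): each nonempty line is handed to
    # ``_read_cfg_production``, which expands ``|`` alternatives into
    # separate productions.
    #
    #     >>> _read_cfg_production("NP -> Det N | 'I'")  # doctest: +SKIP
    #     [NP -> Det N, NP -> 'I']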
def _destroy(self, *e):
- if self._top is None:
- return
+ if self._top is None: return
self._top.destroy()
self._top = None
    def _apply(self, *e):
        productions = self._parse_productions()
        start = Nonterminal(self._start.get())
        cfg = CFG(start, productions)
        if self._set_cfg_callback is not None:
            self._set_cfg_callback(cfg)
def _reset(self, *e):
- self._textwidget.delete("1.0", "end")
+ self._textwidget.delete('1.0', 'end')
for production in self._cfg.productions():
- self._textwidget.insert("end", "%s\n" % production)
+ self._textwidget.insert('end', '%s\n' % production)
self._analyze()
if self._set_cfg_callback is not None:
self._set_cfg_callback(self._cfg)
def _cancel(self, *e):
- try:
- self._reset()
- except:
- pass
+ try: self._reset()
+ except: pass
self._destroy()
def _help(self, *e):
# The default font's not very legible; try using 'fixed' instead.
try:
- ShowText(
- self._parent,
- "Help: Chart Parser Demo",
- (_CFGEditor_HELP).strip(),
- width=75,
- font="fixed",
- )
+ ShowText(self._parent, 'Help: Chart Parser Demo',
+ (_CFGEditor_HELP).strip(), width=75, font='fixed')
except:
- ShowText(
- self._parent,
- "Help: Chart Parser Demo",
- (_CFGEditor_HELP).strip(),
- width=75,
- )
-
+ ShowText(self._parent, 'Help: Chart Parser Demo',
+ (_CFGEditor_HELP).strip(), width=75)
######################################################################
# New Demo (built tree based on cfg)
######################################################################
-
class CFGDemo(object):
def __init__(self, grammar, text):
        self._grammar = grammar
        self._text = text
# Set up the main window.
self._top = Tk()
- self._top.title("Context Free Grammar Demo")
+ self._top.title('Context Free Grammar Demo')
# Base font size
self._size = IntVar(self._top)
- self._size.set(12) # = medium
+ self._size.set(12) # = medium
# Set up the key bindings
self._init_bindings(self._top)
# Create the basic frames
frame1 = Frame(self._top)
- frame1.pack(side="left", fill="y", expand=0)
+ frame1.pack(side='left', fill='y', expand=0)
self._init_menubar(self._top)
self._init_buttons(self._top)
self._init_grammar(frame1)
self._init_treelet(frame1)
self._init_workspace(self._top)
- # //////////////////////////////////////////////////
+ #//////////////////////////////////////////////////
# Initialization
- # //////////////////////////////////////////////////
+ #//////////////////////////////////////////////////
def _init_bindings(self, top):
- top.bind("<Control-q>", self.destroy)
+ top.bind('<Control-q>', self.destroy)
- def _init_menubar(self, parent):
- pass
+ def _init_menubar(self, parent): pass
- def _init_buttons(self, parent):
- pass
+ def _init_buttons(self, parent): pass
def _init_grammar(self, parent):
self._prodlist = ProductionList(parent, self._grammar, width=20)
- self._prodlist.pack(side="top", fill="both", expand=1)
+ self._prodlist.pack(side='top', fill='both', expand=1)
self._prodlist.focus()
- self._prodlist.add_callback("select", self._selectprod_cb)
- self._prodlist.add_callback("move", self._selectprod_cb)
+ self._prodlist.add_callback('select', self._selectprod_cb)
+ self._prodlist.add_callback('move', self._selectprod_cb)
def _init_treelet(self, parent):
- self._treelet_canvas = Canvas(parent, background="white")
- self._treelet_canvas.pack(side="bottom", fill="x")
+ self._treelet_canvas = Canvas(parent, background='white')
+ self._treelet_canvas.pack(side='bottom', fill='x')
self._treelet = None
def _init_workspace(self, parent):
- self._workspace = CanvasFrame(parent, background="white")
- self._workspace.pack(side="right", fill="both", expand=1)
+ self._workspace = CanvasFrame(parent, background='white')
+ self._workspace.pack(side='right', fill='both', expand=1)
self._tree = None
self.reset_workspace()
- # //////////////////////////////////////////////////
+ #//////////////////////////////////////////////////
# Workspace
- # //////////////////////////////////////////////////
+ #//////////////////////////////////////////////////
def reset_workspace(self):
c = self._workspace.canvas()
fontsize = int(self._size.get())
- node_font = ("helvetica", -(fontsize + 4), "bold")
- leaf_font = ("helvetica", -(fontsize + 2))
+ node_font = ('helvetica', -(fontsize+4), 'bold')
+ leaf_font = ('helvetica', -(fontsize+2))
# Remove the old tree
        if self._tree is not None:
            self._workspace.remove_widget(self._tree)
        # Make the new tree: a root node plus one leaf per word.
        start = self._grammar.start().symbol()
        rootnode = TextWidget(c, start, font=node_font, draggable=1)
        leaves = []
        for word in self._text:
leaves.append(TextWidget(c, word, font=leaf_font, draggable=1))
# Put it all together into one tree
- self._tree = TreeSegmentWidget(c, rootnode, leaves, color="white")
+ self._tree = TreeSegmentWidget(c, rootnode, leaves,
+ color='white')
# Add it to the workspace.
self._workspace.add_widget(self._tree)
# Move the leaves to the bottom of the workspace.
- for leaf in leaves:
- leaf.move(0, 100)
+ for leaf in leaves: leaf.move(0,100)
- # self._nodes = {start:1}
- # self._leaves = dict([(l,1) for l in leaves])
+ #self._nodes = {start:1}
+ #self._leaves = dict([(l,1) for l in leaves])
def workspace_markprod(self, production):
pass
def _markproduction(self, prod, tree=None):
- if tree is None:
- tree = self._tree
- for i in range(len(tree.subtrees()) - len(prod.rhs())):
- if tree["color", i] == "white":
- self._markproduction # FIXME: Is this necessary at all?
+ if tree is None: tree = self._tree
+ for i in range(len(tree.subtrees())-len(prod.rhs())):
+ if tree['color', i] == 'white':
+ self._markproduction
for j, node in enumerate(prod.rhs()):
- widget = tree.subtrees()[i + j]
- if (
- isinstance(node, Nonterminal)
- and isinstance(widget, TreeSegmentWidget)
- and node.symbol == widget.label().text()
- ):
- pass # matching nonterminal
- elif (
- isinstance(node, str)
- and isinstance(widget, TextWidget)
- and node == widget.text()
- ):
-                    pass  # matching terminal
- else:
- break
+ widget = tree.subtrees()[i+j]
+ if (isinstance(node, Nonterminal) and
+ isinstance(widget, TreeSegmentWidget) and
+ node.symbol == widget.label().text()):
+ pass # matching nonterminal
+ elif (isinstance(node, string_types) and
+ isinstance(widget, TextWidget) and
+ node == widget.text()):
+                    pass # matching terminal
+ else: break
else:
# Everything matched!
- print("MATCH AT", i)
+ print('MATCH AT', i)
- # //////////////////////////////////////////////////
+ #//////////////////////////////////////////////////
# Grammar
- # //////////////////////////////////////////////////
+ #//////////////////////////////////////////////////
def _selectprod_cb(self, production):
canvas = self._treelet_canvas
self._prodlist.highlight(production)
- if self._treelet is not None:
- self._treelet.destroy()
+ if self._treelet is not None: self._treelet.destroy()
# Convert the production to a tree.
rhs = production.rhs()
for (i, elt) in enumerate(rhs):
- if isinstance(elt, Nonterminal):
- elt = Tree(elt)
+ if isinstance(elt, Nonterminal): elt = Tree(elt)
tree = Tree(production.lhs().symbol(), *rhs)
# Draw the tree in the treelet area.
fontsize = int(self._size.get())
- node_font = ("helvetica", -(fontsize + 4), "bold")
- leaf_font = ("helvetica", -(fontsize + 2))
- self._treelet = tree_to_treesegment(
- canvas, tree, node_font=node_font, leaf_font=leaf_font
- )
- self._treelet["draggable"] = 1
+ node_font = ('helvetica', -(fontsize+4), 'bold')
+ leaf_font = ('helvetica', -(fontsize+2))
+ self._treelet = tree_to_treesegment(canvas, tree,
+ node_font=node_font,
+ leaf_font=leaf_font)
+ self._treelet['draggable'] = 1
# Center the treelet.
(x1, y1, x2, y2) = self._treelet.bbox()
- w, h = int(canvas["width"]), int(canvas["height"])
- self._treelet.move((w - x1 - x2) / 2, (h - y1 - y2) / 2)
+ w, h = int(canvas['width']), int(canvas['height'])
+ self._treelet.move((w-x1-x2)/2, (h-y1-y2)/2)
# Mark the places where we can add it to the workspace.
self._markproduction(production)
def mainloop(self, *args, **kwargs):
self._top.mainloop(*args, **kwargs)
-
def demo2():
from nltk import Nonterminal, Production, CFG
-
- nonterminals = "S VP NP PP P N Name V Det"
- (S, VP, NP, PP, P, N, Name, V, Det) = [Nonterminal(s) for s in nonterminals.split()]
+ nonterminals = 'S VP NP PP P N Name V Det'
+ (S, VP, NP, PP, P, N, Name, V, Det) = [Nonterminal(s)
+ for s in nonterminals.split()]
productions = (
# Syntactic Productions
Production(S, [NP, VP]),
Production(VP, [V, NP]),
Production(PP, [P, NP]),
Production(PP, []),
- Production(PP, ["up", "over", NP]),
+
+ Production(PP, ['up', 'over', NP]),
+
# Lexical Productions
- Production(NP, ["I"]),
- Production(Det, ["the"]),
- Production(Det, ["a"]),
- Production(N, ["man"]),
- Production(V, ["saw"]),
- Production(P, ["in"]),
- Production(P, ["with"]),
- Production(N, ["park"]),
- Production(N, ["dog"]),
- Production(N, ["statue"]),
- Production(Det, ["my"]),
- )
+ Production(NP, ['I']), Production(Det, ['the']),
+ Production(Det, ['a']), Production(N, ['man']),
+ Production(V, ['saw']), Production(P, ['in']),
+ Production(P, ['with']), Production(N, ['park']),
+ Production(N, ['dog']), Production(N, ['statue']),
+ Production(Det, ['my']),
+ )
grammar = CFG(S, productions)
- text = "I saw a man in the park".split()
- d = CFGDemo(grammar, text)
+ text = 'I saw a man in the park'.split()
+ d=CFGDemo(grammar, text)
d.mainloop()
-
######################################################################
# Old Demo
######################################################################
-
def demo():
from nltk import Nonterminal, CFG
+ nonterminals = 'S VP NP PP P N Name V Det'
+ (S, VP, NP, PP, P, N, Name, V, Det) = [Nonterminal(s)
+ for s in nonterminals.split()]
- nonterminals = "S VP NP PP P N Name V Det"
- (S, VP, NP, PP, P, N, Name, V, Det) = [Nonterminal(s) for s in nonterminals.split()]
-
- grammar = CFG.fromstring(
- """
+ grammar = CFG.fromstring("""
S -> NP VP
PP -> P NP
NP -> Det N
P -> 'up'
P -> 'over'
P -> 'with'
- """
- )
-
- def cb(grammar):
- print(grammar)
+ """)
+ def cb(grammar): print(grammar)
top = Tk()
editor = CFGEditor(top, grammar, cb)
- Label(top, text="\nTesting CFG Editor\n").pack()
- Button(top, text="Quit", command=top.destroy).pack()
+ Label(top, text='\nTesting CFG Editor\n').pack()
+ Button(top, text='Quit', command=top.destroy).pack()
top.mainloop()
-
def demo3():
from nltk import Production
-
- (S, VP, NP, PP, P, N, Name, V, Det) = nonterminals(
- "S, VP, NP, PP, P, N, Name, V, Det"
- )
+ (S, VP, NP, PP, P, N, Name, V, Det) = \
+ nonterminals('S, VP, NP, PP, P, N, Name, V, Det')
productions = (
# Syntactic Productions
Production(VP, [V, NP]),
Production(PP, [P, NP]),
Production(PP, []),
- Production(PP, ["up", "over", NP]),
- # Lexical Productions
- Production(NP, ["I"]),
- Production(Det, ["the"]),
- Production(Det, ["a"]),
- Production(N, ["man"]),
- Production(V, ["saw"]),
- Production(P, ["in"]),
- Production(P, ["with"]),
- Production(N, ["park"]),
- Production(N, ["dog"]),
- Production(N, ["statue"]),
- Production(Det, ["my"]),
- )
- t = Tk()
+ Production(PP, ['up', 'over', NP]),
- def destroy(e, t=t):
- t.destroy()
+ # Lexical Productions
+ Production(NP, ['I']), Production(Det, ['the']),
+ Production(Det, ['a']), Production(N, ['man']),
+ Production(V, ['saw']), Production(P, ['in']),
+ Production(P, ['with']), Production(N, ['park']),
+ Production(N, ['dog']), Production(N, ['statue']),
+ Production(Det, ['my']),
+ )
- t.bind("q", destroy)
+ t = Tk()
+ def destroy(e, t=t): t.destroy()
+ t.bind('q', destroy)
p = ProductionList(t, productions)
- p.pack(expand=1, fill="both")
- p.add_callback("select", p.markonly)
- p.add_callback("move", p.markonly)
+ p.pack(expand=1, fill='both')
+ p.add_callback('select', p.markonly)
+ p.add_callback('move', p.markonly)
p.focus()
p.mark(productions[2])
p.mark(productions[8])
-
-if __name__ == "__main__":
- demo()
+if __name__ == '__main__': demo()
# Natural Language Toolkit: Dispersion Plots
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
A utility for displaying lexical dispersion.
"""
-
def dispersion_plot(text, words, ignore_case=False, title="Lexical Dispersion Plot"):
"""
Generate a lexical dispersion plot.
try:
from matplotlib import pylab
except ImportError:
-        raise ValueError(
-            "The plot function requires matplotlib to be installed. "
-            "See http://matplotlib.org/"
-        )
+        raise ValueError('The plot function requires matplotlib to be installed. '
+                         'See http://matplotlib.org/')
text = list(text)
words.reverse()
words_to_comp = words
text_to_comp = text
- points = [
- (x, y)
- for x in range(len(text_to_comp))
- for y in range(len(words_to_comp))
- if text_to_comp[x] == words_to_comp[y]
- ]
+ points = [(x,y) for x in range(len(text_to_comp))
+ for y in range(len(words_to_comp))
+ if text_to_comp[x] == words_to_comp[y]]
if points:
x, y = list(zip(*points))
else:
x = y = ()
- pylab.plot(x, y, "b|", scalex=0.1)
+ pylab.plot(x, y, "b|", scalex=.1)
pylab.yticks(list(range(len(words))), words, color="b")
pylab.ylim(-1, len(words))
pylab.title(title)
pylab.xlabel("Word Offset")
pylab.show()
-
-if __name__ == "__main__":
+if __name__ == '__main__':
+ import nltk.compat
from nltk.corpus import gutenberg
-
- words = ["Elinor", "Marianne", "Edward", "Willoughby"]
- dispersion_plot(gutenberg.words("austen-sense.txt"), words)
+ words = ['Elinor', 'Marianne', 'Edward', 'Willoughby']
+ dispersion_plot(gutenberg.words('austen-sense.txt'), words)
# Natural Language Toolkit: Table widget
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
Tkinter widgets for displaying multi-column listboxes and tables.
"""
+from __future__ import division
+
+
import operator
-from tkinter import Frame, Label, Listbox, Scrollbar, Tk
+from six.moves.tkinter import (Frame, Label, Listbox, Scrollbar, Tk)
######################################################################
# Multi-Column Listbox
######################################################################
-
class MultiListbox(Frame):
"""
    A multi-column listbox, where the current selection applies to an
    entire row. For the most part, ``MultiListbox`` methods delegate to
    its contained listboxes. For any methods that do not have docstrings,
see ``Tkinter.Listbox`` for a description of what that method does.
"""
-
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# Configuration
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
#: Default configuration values for the frame.
- FRAME_CONFIG = dict(background="#888", takefocus=True, highlightthickness=1)
+ FRAME_CONFIG = dict(background='#888',
+ takefocus=True,
+ highlightthickness=1)
#: Default configurations for the column labels.
- LABEL_CONFIG = dict(
- borderwidth=1,
- relief="raised",
- font="helvetica -16 bold",
- background="#444",
- foreground="white",
- )
+ LABEL_CONFIG = dict(borderwidth=1, relief='raised',
+ font='helvetica -16 bold',
+ background='#444', foreground='white')
#: Default configuration for the column listboxes.
- LISTBOX_CONFIG = dict(
- borderwidth=1,
- selectborderwidth=0,
- highlightthickness=0,
- exportselection=False,
- selectbackground="#888",
- activestyle="none",
- takefocus=False,
- )
-
- # /////////////////////////////////////////////////////////////////
+ LISTBOX_CONFIG = dict(borderwidth=1,
+ selectborderwidth=0,
+ highlightthickness=0,
+ exportselection=False,
+ selectbackground='#888',
+ activestyle='none',
+ takefocus=False)
+
+ #/////////////////////////////////////////////////////////////////
# Constructor
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
def __init__(self, master, columns, column_weights=None, cnf={}, **kw):
"""
if column_weights is None:
column_weights = [1] * len(columns)
elif len(column_weights) != len(columns):
- raise ValueError("Expected one column_weight for each column")
+ raise ValueError('Expected one column_weight for each column')
self._column_weights = column_weights
        # Configure our widgets.
        Frame.__init__(self, master, **self.FRAME_CONFIG)
        self.grid_rowconfigure(1, weight=1)
        for i, label in enumerate(self._column_names):
            self.grid_columnconfigure(i, weight=column_weights[i])
            # Create a label for the column
            if include_labels:
l = Label(self, text=label, **self.LABEL_CONFIG)
self._labels.append(l)
- l.grid(column=i, row=0, sticky="news", padx=0, pady=0)
+ l.grid(column=i, row=0, sticky='news', padx=0, pady=0)
l.column_index = i
# Create a listbox for the column
lb = Listbox(self, **self.LISTBOX_CONFIG)
self._listboxes.append(lb)
- lb.grid(column=i, row=1, sticky="news", padx=0, pady=0)
+ lb.grid(column=i, row=1, sticky='news', padx=0, pady=0)
lb.column_index = i
# Clicking or dragging selects:
- lb.bind("<Button-1>", self._select)
- lb.bind("<B1-Motion>", self._select)
+ lb.bind('<Button-1>', self._select)
+ lb.bind('<B1-Motion>', self._select)
            # Scroll wheel scrolls:
- lb.bind("<Button-4>", lambda e: self._scroll(-1))
- lb.bind("<Button-5>", lambda e: self._scroll(+1))
- lb.bind("<MouseWheel>", lambda e: self._scroll(e.delta))
+ lb.bind('<Button-4>', lambda e: self._scroll(-1))
+ lb.bind('<Button-5>', lambda e: self._scroll(+1))
+ lb.bind('<MouseWheel>', lambda e: self._scroll(e.delta))
# Button 2 can be used to scan:
- lb.bind("<Button-2>", lambda e: self.scan_mark(e.x, e.y))
- lb.bind("<B2-Motion>", lambda e: self.scan_dragto(e.x, e.y))
+ lb.bind('<Button-2>', lambda e: self.scan_mark(e.x, e.y))
+ lb.bind('<B2-Motion>', lambda e: self.scan_dragto(e.x, e.y))
            # Dragging outside the window has no effect (disable
# the default listbox behavior, which scrolls):
- lb.bind("<B1-Leave>", lambda e: "break")
+ lb.bind('<B1-Leave>', lambda e: 'break')
# Columns can be resized by dragging them:
- l.bind("<Button-1>", self._resize_column)
+ l.bind('<Button-1>', self._resize_column)
# Columns can be resized by dragging them. (This binding is
# used if they click on the grid between columns:)
- self.bind("<Button-1>", self._resize_column)
+ self.bind('<Button-1>', self._resize_column)
# Set up key bindings for the widget:
- self.bind("<Up>", lambda e: self.select(delta=-1))
- self.bind("<Down>", lambda e: self.select(delta=1))
- self.bind("<Prior>", lambda e: self.select(delta=-self._pagesize()))
- self.bind("<Next>", lambda e: self.select(delta=self._pagesize()))
+ self.bind('<Up>', lambda e: self.select(delta=-1))
+ self.bind('<Down>', lambda e: self.select(delta=1))
+ self.bind('<Prior>', lambda e: self.select(delta=-self._pagesize()))
+ self.bind('<Next>', lambda e: self.select(delta=self._pagesize()))
# Configuration customizations
self.configure(cnf, **kw)
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# Column Resizing
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
def _resize_column(self, event):
"""
"""
# If we're already waiting for a button release, then ignore
# the new button press.
- if event.widget.bind("<ButtonRelease>"):
+ if event.widget.bind('<ButtonRelease>'):
return False
# Decide which column (if any) to resize.
self._resize_column_index = None
if event.widget is self:
for i, lb in enumerate(self._listboxes):
- if abs(event.x - (lb.winfo_x() + lb.winfo_width())) < 10:
+ if abs(event.x-(lb.winfo_x()+lb.winfo_width())) < 10:
self._resize_column_index = i
- elif event.x > (event.widget.winfo_width() - 5):
+ elif event.x > (event.widget.winfo_width()-5):
self._resize_column_index = event.widget.column_index
elif event.x < 5 and event.widget.column_index != 0:
- self._resize_column_index = event.widget.column_index - 1
+ self._resize_column_index = event.widget.column_index-1
# Bind callbacks that are used to resize it.
if self._resize_column_index is not None:
- event.widget.bind("<Motion>", self._resize_column_motion_cb)
- event.widget.bind(
- "<ButtonRelease-%d>" % event.num, self._resize_column_buttonrelease_cb
- )
+ event.widget.bind('<Motion>', self._resize_column_motion_cb)
+ event.widget.bind('<ButtonRelease-%d>' % event.num,
+ self._resize_column_buttonrelease_cb)
return True
else:
return False
def _resize_column_motion_cb(self, event):
lb = self._listboxes[self._resize_column_index]
- charwidth = lb.winfo_width() / lb["width"]
+ charwidth = lb.winfo_width() / lb['width']
x1 = event.x + event.widget.winfo_x()
x2 = lb.winfo_x() + lb.winfo_width()
- lb["width"] = max(3, lb["width"] + (x1 - x2) // charwidth)
+ lb['width'] = max(3, lb['width'] + (x1-x2) // charwidth)
def _resize_column_buttonrelease_cb(self, event):
- event.widget.unbind("<ButtonRelease-%d>" % event.num)
- event.widget.unbind("<Motion>")
+ event.widget.unbind('<ButtonRelease-%d>' % event.num)
+ event.widget.unbind('<Motion>')
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# Properties
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
@property
def column_names(self):
"""
return tuple(self._listboxes)
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# Mouse & Keyboard Callback Functions
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
def _select(self, e):
i = e.widget.nearest(e.y)
- self.selection_clear(0, "end")
+ self.selection_clear(0, 'end')
self.selection_set(i)
self.activate(i)
self.focus()
def _scroll(self, delta):
for lb in self._listboxes:
- lb.yview_scroll(delta, "unit")
- return "break"
+ lb.yview_scroll(delta, 'unit')
+ return 'break'
def _pagesize(self):
""":return: The number of rows that makes up one page"""
- return int(self.index("@0,1000000")) - int(self.index("@0,0"))
+ return int(self.index('@0,1000000')) - int(self.index('@0,0'))
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# Row selection
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
def select(self, index=None, delta=None, see=True):
"""
selected index, to ensure that it is visible.
"""
if (index is not None) and (delta is not None):
- raise ValueError("specify index or delta, but not both")
+ raise ValueError('specify index or delta, but not both')
# If delta was given, then calculate index.
if delta is not None:
index = int(self.curselection()[0]) + delta
# Clear all selected rows.
- self.selection_clear(0, "end")
+ self.selection_clear(0, 'end')
# Select the specified index
if index is not None:
- index = min(max(index, 0), self.size() - 1)
- # self.activate(index)
+ index = min(max(index, 0), self.size()-1)
+ #self.activate(index)
self.selection_set(index)
- if see:
- self.see(index)
+ if see: self.see(index)
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# Configuration
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
def configure(self, cnf={}, **kw):
"""
"""
cnf = dict(list(cnf.items()) + list(kw.items()))
for (key, val) in list(cnf.items()):
- if key.startswith("label_") or key.startswith("label-"):
+ if key.startswith('label_') or key.startswith('label-'):
for label in self._labels:
label.configure({key[6:]: val})
- elif key.startswith("listbox_") or key.startswith("listbox-"):
+ elif key.startswith('listbox_') or key.startswith('listbox-'):
for listbox in self._listboxes:
listbox.configure({key[8:]: val})
else:
- Frame.configure(self, {key: val})
+ Frame.configure(self, {key:val})
def __setitem__(self, key, val):
"""
Configure this widget. This is equivalent to
``self.configure({key,val``)}. See ``configure()``.
"""
- self.configure({key: val})
+ self.configure({key:val})
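    # Example (sketch): the ``label_``/``listbox_`` prefixes route options
    # to the sub-widgets; the values here are illustrative.
    #
    #     >>> mlb = MultiListbox(Tk(), ('Word', 'Tag'))  # doctest: +SKIP
    #     >>> mlb['label_foreground'] = 'yellow'   # configures every label
    #     >>> mlb['listbox_font'] = 'courier'      # configures every listbox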
def rowconfigure(self, row_index, cnf={}, **kw):
"""
arguments are: ``background``, ``bg``, ``foreground``, ``fg``,
``selectbackground``, ``selectforeground``.
"""
- for lb in self._listboxes:
- lb.itemconfigure(row_index, cnf, **kw)
+ for lb in self._listboxes: lb.itemconfigure(row_index, cnf, **kw)
def columnconfigure(self, col_index, cnf={}, **kw):
"""
cnf = dict(list(cnf.items()) + list(kw.items()))
for (key, val) in list(cnf.items()):
- if key in (
- "background",
- "bg",
- "foreground",
- "fg",
- "selectbackground",
- "selectforeground",
- ):
- for i in range(lb.size()):
- lb.itemconfigure(i, {key: val})
+ if key in ('background', 'bg', 'foreground', 'fg',
+ 'selectbackground', 'selectforeground'):
+ for i in range(lb.size()): lb.itemconfigure(i, {key:val})
else:
- lb.configure({key: val})
+ lb.configure({key:val})
def itemconfigure(self, row_index, col_index, cnf=None, **kw):
"""
lb = self._listboxes[col_index]
return lb.itemconfigure(row_index, cnf, **kw)
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# Value Access
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
def insert(self, index, *rows):
"""
"""
for elt in rows:
if len(elt) != len(self._column_names):
- raise ValueError(
- "rows should be tuples whose length "
- "is equal to the number of columns"
- )
- for (lb, elts) in zip(self._listboxes, list(zip(*rows))):
+ raise ValueError('rows should be tuples whose length '
+ 'is equal to the number of columns')
+ for (lb,elts) in zip(self._listboxes, list(zip(*rows))):
lb.insert(index, *elts)
def get(self, first, last=None):
"""
dx, dy, _, _ = self.grid_bbox(row=0, column=col)
x, y, w, h = self._listboxes[col].bbox(row)
- return int(x) + int(dx), int(y) + int(dy), int(w), int(h)
+ return int(x)+int(dx), int(y)+int(dy), int(w), int(h)
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# Hide/Show Columns
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
    def hide_column(self, col_index):
        """
        Hide the given column by un-gridding its label and listbox.
        """
        if self._labels:
            self._labels[col_index].grid_forget()
        self._listboxes[col_index].grid_forget()
        self.grid_columnconfigure(col_index, weight=0)

    def show_column(self, col_index):
        """
        Display a column that was previously hidden.
        """
weight = self._column_weights[col_index]
if self._labels:
- self._labels[col_index].grid(
- column=col_index, row=0, sticky="news", padx=0, pady=0
- )
- self._listboxes[col_index].grid(
- column=col_index, row=1, sticky="news", padx=0, pady=0
- )
+ self._labels[col_index].grid(column=col_index, row=0,
+ sticky='news', padx=0, pady=0)
+ self._listboxes[col_index].grid(column=col_index, row=1,
+ sticky='news', padx=0, pady=0)
self.grid_columnconfigure(col_index, weight=weight)
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# Binding Methods
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
def bind_to_labels(self, sequence=None, func=None, add=None):
"""
functions (if any), allowing for their deletion (to
prevent a memory leak).
"""
- return [label.bind(sequence, func, add) for label in self.column_labels]
+ return [label.bind(sequence, func, add)
+ for label in self.column_labels]
def bind_to_listboxes(self, sequence=None, func=None, add=None):
"""
functions (if any), allowing for their deletion (to
prevent a memory leak).
"""
- return self.bind_to_labels(sequence, func, add) + self.bind_to_listboxes(
- sequence, func, add
- )
+ return (self.bind_to_labels(sequence, func, add) +
+ self.bind_to_listboxes(sequence, func, add))
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# Simple Delegation
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# These methods delegate to the first listbox:
def curselection(self, *args, **kwargs):
return self._listboxes[0].curselection(*args, **kwargs)
-
def selection_includes(self, *args, **kwargs):
return self._listboxes[0].selection_includes(*args, **kwargs)
-
def itemcget(self, *args, **kwargs):
return self._listboxes[0].itemcget(*args, **kwargs)
-
def size(self, *args, **kwargs):
return self._listboxes[0].size(*args, **kwargs)
-
def index(self, *args, **kwargs):
return self._listboxes[0].index(*args, **kwargs)
-
def nearest(self, *args, **kwargs):
return self._listboxes[0].nearest(*args, **kwargs)
# These methods delegate to each listbox (and return None):
def activate(self, *args, **kwargs):
- for lb in self._listboxes:
- lb.activate(*args, **kwargs)
-
+ for lb in self._listboxes: lb.activate(*args, **kwargs)
def delete(self, *args, **kwargs):
- for lb in self._listboxes:
- lb.delete(*args, **kwargs)
-
+ for lb in self._listboxes: lb.delete(*args, **kwargs)
def scan_mark(self, *args, **kwargs):
- for lb in self._listboxes:
- lb.scan_mark(*args, **kwargs)
-
+ for lb in self._listboxes: lb.scan_mark(*args, **kwargs)
def scan_dragto(self, *args, **kwargs):
- for lb in self._listboxes:
- lb.scan_dragto(*args, **kwargs)
-
+ for lb in self._listboxes: lb.scan_dragto(*args, **kwargs)
def see(self, *args, **kwargs):
- for lb in self._listboxes:
- lb.see(*args, **kwargs)
-
+ for lb in self._listboxes: lb.see(*args, **kwargs)
def selection_anchor(self, *args, **kwargs):
- for lb in self._listboxes:
- lb.selection_anchor(*args, **kwargs)
-
+ for lb in self._listboxes: lb.selection_anchor(*args, **kwargs)
def selection_clear(self, *args, **kwargs):
- for lb in self._listboxes:
- lb.selection_clear(*args, **kwargs)
-
+ for lb in self._listboxes: lb.selection_clear(*args, **kwargs)
def selection_set(self, *args, **kwargs):
- for lb in self._listboxes:
- lb.selection_set(*args, **kwargs)
-
+ for lb in self._listboxes: lb.selection_set(*args, **kwargs)
def yview(self, *args, **kwargs):
- for lb in self._listboxes:
- v = lb.yview(*args, **kwargs)
- return v # if called with no arguments
-
+ for lb in self._listboxes: v = lb.yview(*args, **kwargs)
+ return v # if called with no arguments
def yview_moveto(self, *args, **kwargs):
- for lb in self._listboxes:
- lb.yview_moveto(*args, **kwargs)
-
+ for lb in self._listboxes: lb.yview_moveto(*args, **kwargs)
def yview_scroll(self, *args, **kwargs):
- for lb in self._listboxes:
- lb.yview_scroll(*args, **kwargs)
+ for lb in self._listboxes: lb.yview_scroll(*args, **kwargs)
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# Aliases
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
itemconfig = itemconfigure
rowconfig = rowconfigure
select_includes = selection_includes
select_set = selection_set
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# These listbox methods are not defined for multi-listbox
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# def xview(self, *what): pass
# def xview_moveto(self, fraction): pass
# def xview_scroll(self, number, what): pass
-
######################################################################
# Table
######################################################################
-
class Table(object):
"""
A display widget for a table of values, based on a ``MultiListbox``
table. Each element of _rows is a row value, i.e., a list of
cell values, one for each column in the row.
"""
-
- def __init__(
- self,
- master,
- column_names,
- rows=None,
- column_weights=None,
- scrollbar=True,
- click_to_sort=True,
- reprfunc=None,
- cnf={},
- **kw
- ):
+ def __init__(self, master, column_names, rows=None,
+ column_weights=None,
+ scrollbar=True, click_to_sort=True,
+ reprfunc=None, cnf={}, **kw):
"""
Construct a new Table widget.
self._reprfunc = reprfunc
self._frame = Frame(master)
- self._column_name_to_index = dict((c, i) for (i, c) in enumerate(column_names))
+ self._column_name_to_index = dict((c,i) for (i,c) in
+ enumerate(column_names))
# Make a copy of the rows & check that it's valid.
- if rows is None:
- self._rows = []
- else:
- self._rows = [[v for v in row] for row in rows]
- for row in self._rows:
- self._checkrow(row)
+ if rows is None: self._rows = []
+ else: self._rows = [[v for v in row] for row in rows]
+ for row in self._rows: self._checkrow(row)
# Create our multi-list box.
- self._mlb = MultiListbox(self._frame, column_names, column_weights, cnf, **kw)
- self._mlb.pack(side="left", expand=True, fill="both")
+ self._mlb = MultiListbox(self._frame, column_names,
+ column_weights, cnf, **kw)
+ self._mlb.pack(side='left', expand=True, fill='both')
# Optional scrollbar
if scrollbar:
- sb = Scrollbar(self._frame, orient="vertical", command=self._mlb.yview)
- self._mlb.listboxes[0]["yscrollcommand"] = sb.set
- # for listbox in self._mlb.listboxes:
+ sb = Scrollbar(self._frame, orient='vertical',
+ command=self._mlb.yview)
+ self._mlb.listboxes[0]['yscrollcommand'] = sb.set
+ #for listbox in self._mlb.listboxes:
# listbox['yscrollcommand'] = sb.set
- sb.pack(side="right", fill="y")
+ sb.pack(side='right', fill='y')
self._scrollbar = sb
# Set up sorting
self._sortkey = None
if click_to_sort:
for i, l in enumerate(self._mlb.column_labels):
- l.bind("<Button-1>", self._sort)
+ l.bind('<Button-1>', self._sort)
# Fill in our multi-list box.
self._fill_table()
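# A minimal construction sketch ('root' is a plain Tk instance; the
# column names and rows are illustrative):
#
#     table = Table(root, ['Word', 'Count'],
#                   rows=[['cat', 3], ['dog', 5]],
#                   scrollbar=True, click_to_sort=True)
#     table.pack(expand=True, fill='both')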
- # /////////////////////////////////////////////////////////////////
- # { Widget-like Methods
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
+ #{ Widget-like Methods
+ #/////////////////////////////////////////////////////////////////
# These all just delegate to either our frame or our MLB.
def pack(self, *args, **kwargs):
columnconfig = columnconfigure
itemconfig = itemconfigure
- # /////////////////////////////////////////////////////////////////
- # { Table as list-of-lists
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
+ #{ Table as list-of-lists
+ #/////////////////////////////////////////////////////////////////
def insert(self, row_index, rowvalue):
"""
self._checkrow(rowvalue)
self._rows.insert(row_index, rowvalue)
if self._reprfunc is not None:
- rowvalue = [
- self._reprfunc(row_index, j, v) for (j, v) in enumerate(rowvalue)
- ]
+ rowvalue = [self._reprfunc(row_index,j,v)
+ for (j,v) in enumerate(rowvalue)]
self._mlb.insert(row_index, rowvalue)
- if self._DEBUG:
- self._check_table_vs_mlb()
+ if self._DEBUG: self._check_table_vs_mlb()
def extend(self, rowvalues):
"""
table. Each row value should be a tuple of cell values,
one for each column in the row.
"""
- for rowvalue in rowvalues:
- self.append(rowvalue)
- if self._DEBUG:
- self._check_table_vs_mlb()
+ for rowvalue in rowvalues: self.append(rowvalue)
+ if self._DEBUG: self._check_table_vs_mlb()
def append(self, rowvalue):
"""
in the new row.
"""
self.insert(len(self._rows), rowvalue)
- if self._DEBUG:
- self._check_table_vs_mlb()
+ if self._DEBUG: self._check_table_vs_mlb()
def clear(self):
"""
Delete all rows in this table.
"""
self._rows = []
- self._mlb.delete(0, "end")
- if self._DEBUG:
- self._check_table_vs_mlb()
+ self._mlb.delete(0, 'end')
+ if self._DEBUG: self._check_table_vs_mlb()
def __getitem__(self, index):
"""
``i``th row and the ``j``th column.
"""
if isinstance(index, slice):
- raise ValueError("Slicing not supported")
- elif isinstance(index, tuple) and len(index) == 2:
+ raise ValueError('Slicing not supported')
+ elif isinstance(index, tuple) and len(index)==2:
return self._rows[index[0]][self.column_index(index[1])]
else:
return tuple(self._rows[index])
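# Indexing sketch: table[i] yields the i-th row as a tuple, while
# table[i, 'Word'] addresses one cell by (row, column name); slices
# raise ValueError.
#
#     row = table[0]              # e.g. ('cat', 3)
#     cell = table[0, 'Word']     # e.g. 'cat'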
``val``.
"""
if isinstance(index, slice):
- raise ValueError("Slicing not supported")
+ raise ValueError('Slicing not supported')
+
# table[i,j] = val
- elif isinstance(index, tuple) and len(index) == 2:
+ elif isinstance(index, tuple) and len(index)==2:
i, j = index[0], self.column_index(index[1])
config_cookie = self._save_config_info([i])
self._rows[i][j] = val
if self._reprfunc is not None:
val = self._reprfunc(i, j, val)
self._mlb.listboxes[j].insert(i, val)
- self._mlb.listboxes[j].delete(i + 1)
+ self._mlb.listboxes[j].delete(i+1)
self._restore_config_info(config_cookie)
# table[i] = val
self._checkrow(val)
self._rows[index] = list(val)
if self._reprfunc is not None:
- val = [self._reprfunc(index, j, v) for (j, v) in enumerate(val)]
+ val = [self._reprfunc(index,j,v) for (j,v) in enumerate(val)]
self._mlb.insert(index, val)
- self._mlb.delete(index + 1)
+ self._mlb.delete(index+1)
self._restore_config_info(config_cookie)
def __delitem__(self, row_index):
Delete the ``row_index``th row from this table.
"""
if isinstance(row_index, slice):
- raise ValueError("Slicing not supported")
- if isinstance(row_index, tuple) and len(row_index) == 2:
- raise ValueError("Cannot delete a single cell!")
+ raise ValueError('Slicing not supported')
+ if isinstance(row_index, tuple) and len(row_index)==2:
+ raise ValueError('Cannot delete a single cell!')
del self._rows[row_index]
self._mlb.delete(row_index)
- if self._DEBUG:
- self._check_table_vs_mlb()
+ if self._DEBUG: self._check_table_vs_mlb()
def __len__(self):
"""
number of elements; and if not, raise an exception.
"""
if len(rowvalue) != self._num_columns:
- raise ValueError(
- "Row %r has %d columns; expected %d"
- % (rowvalue, len(rowvalue), self._num_columns)
- )
+ raise ValueError('Row %r has %d columns; expected %d' %
+ (rowvalue, len(rowvalue), self._num_columns))
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# Columns
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
@property
def column_names(self):
""":see: ``MultiListbox.show_column()``"""
self._mlb.show_column(self.column_index(column_index))
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# Selection
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
def selected_row(self):
"""
``table[table.selected_row()]``.
"""
sel = self._mlb.curselection()
- if sel:
- return int(sel[0])
- else:
- return None
+ if sel: return int(sel[0])
+ else: return None
def select(self, index=None, delta=None, see=True):
""":see: ``MultiListbox.select()``"""
self._mlb.select(index, delta, see)
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# Sorting
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
- def sort_by(self, column_index, order="toggle"):
+ def sort_by(self, column_index, order='toggle'):
"""
Sort the rows in this table, using the specified column's
values as a sort key.
then reverse the rows; otherwise sort in ascending
order.
"""
- if order not in ("ascending", "descending", "toggle"):
- raise ValueError(
- 'sort_by(): order should be "ascending", ' '"descending", or "toggle".'
- )
+ if order not in ('ascending', 'descending', 'toggle'):
+ raise ValueError('sort_by(): order should be "ascending", '
+ '"descending", or "toggle".')
column_index = self.column_index(column_index)
config_cookie = self._save_config_info(index_by_id=True)
# Sort the rows.
- if order == "toggle" and column_index == self._sortkey:
+ if order == 'toggle' and column_index == self._sortkey:
self._rows.reverse()
else:
- self._rows.sort(
- key=operator.itemgetter(column_index), reverse=(order == "descending")
- )
+ self._rows.sort(key=operator.itemgetter(column_index),
+ reverse=(order=='descending'))
self._sortkey = column_index
# Redraw the table.
self._fill_table()
self._restore_config_info(config_cookie, index_by_id=True, see=True)
- if self._DEBUG:
- self._check_table_vs_mlb()
+ if self._DEBUG: self._check_table_vs_mlb()
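# Sorting sketch: with the default order='toggle', calling sort_by()
# again on the current sort column just reverses the rows:
#
#     table.sort_by('Count', order='descending')
#     table.sort_by('Count')      # toggle -> back to ascending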
def _sort(self, event):
"""Event handler for clicking on a column label -- sort by
# If they click on the far-left or far-right of a column's
# label, then resize rather than sorting.
if self._mlb._resize_column(event):
- return "continue"
+ return 'continue'
# Otherwise, sort.
else:
self.sort_by(column_index)
- return "continue"
+ return 'continue'
- # /////////////////////////////////////////////////////////////////
- # { Table Drawing Helpers
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
+ #{ Table Drawing Helpers
+ #/////////////////////////////////////////////////////////////////
def _fill_table(self, save_config=True):
"""
selection will also be lost -- i.e., no row will be selected
after this call completes.
"""
- self._mlb.delete(0, "end")
+ self._mlb.delete(0, 'end')
for i, row in enumerate(self._rows):
if self._reprfunc is not None:
- row = [self._reprfunc(i, j, v) for (j, v) in enumerate(row)]
- self._mlb.insert("end", row)
+ row = [self._reprfunc(i,j,v) for (j,v) in enumerate(row)]
+ self._mlb.insert('end', row)
def _get_itemconfig(self, r, c):
- return dict(
- (k, self._mlb.itemconfig(r, c, k)[-1])
- for k in (
- "foreground",
- "selectforeground",
- "background",
- "selectbackground",
- )
- )
+ return dict( (k, self._mlb.itemconfig(r, c, k)[-1])
+ for k in ('foreground', 'selectforeground',
+ 'background', 'selectbackground') )
def _save_config_info(self, row_indices=None, index_by_id=False):
"""
# Look up the color configuration info for each row.
if index_by_id:
- config = dict(
- (
- id(self._rows[r]),
- [self._get_itemconfig(r, c) for c in range(self._num_columns)],
- )
- for r in row_indices
- )
+ config = dict((id(self._rows[r]), [self._get_itemconfig(r, c)
+ for c in range(self._num_columns)])
+ for r in row_indices)
else:
- config = dict(
- (r, [self._get_itemconfig(r, c) for c in range(self._num_columns)])
- for r in row_indices
- )
+ config = dict((r, [self._get_itemconfig(r, c)
+ for c in range(self._num_columns)])
+ for r in row_indices)
+
return selection, config
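# The (selection, config) pair is an opaque cookie: sort_by() and
# __setitem__() hand it back to _restore_config_info() after redrawing,
# so per-row colors and the current selection survive the redraw.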
# Clear the selection.
if selection is None:
- self._mlb.selection_clear(0, "end")
+ self._mlb.selection_clear(0, 'end')
# Restore selection & color config
if index_by_id:
for c in range(self._num_columns):
self._mlb.itemconfigure(r, c, config[r][c])
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# Debugging (Invariant Checker)
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
_DEBUG = False
"""If true, then run ``_check_table_vs_mlb()`` after any operation
for row in self:
assert len(row) == self._num_columns
assert self._num_columns == len(self._mlb.column_names)
- # assert self._column_names == self._mlb.column_names
+ #assert self._column_names == self._mlb.column_names
for i, row in enumerate(self):
for j, cell in enumerate(row):
if self._reprfunc is not None:
cell = self._reprfunc(i, j, cell)
assert self._mlb.get(i)[j] == cell
-
######################################################################
# Demo/Test Function
######################################################################
# update this to use new WordNet API
def demo():
root = Tk()
- root.bind("<Control-q>", lambda e: root.destroy())
+ root.bind('<Control-q>', lambda e: root.destroy())
- table = Table(
- root,
- "Word Synset Hypernym Hyponym".split(),
- column_weights=[0, 1, 1, 1],
- reprfunc=(lambda i, j, s: " %s" % s),
- )
- table.pack(expand=True, fill="both")
+ table = Table(root, 'Word Synset Hypernym Hyponym'.split(),
+ column_weights=[0, 1, 1, 1],
+ reprfunc=(lambda i,j,s: ' %s' % s))
+ table.pack(expand=True, fill='both')
from nltk.corpus import wordnet
from nltk.corpus import brown
-
for word, pos in sorted(set(brown.tagged_words()[:500])):
- if pos[0] != "N":
- continue
+ if pos[0] != 'N': continue
word = word.lower()
for synset in wordnet.synsets(word):
try:
hyper_def = synset.hypernyms()[0].definition()
except:
- hyper_def = "*none*"
+ hyper_def = '*none*'
try:
hypo_def = synset.hyponyms()[0].definition()
except:
- hypo_def = "*none*"
- table.append([word, synset.definition(), hyper_def, hypo_def])
-
- table.columnconfig("Word", background="#afa")
- table.columnconfig("Synset", background="#efe")
- table.columnconfig("Hypernym", background="#fee")
- table.columnconfig("Hyponym", background="#ffe")
+ hypo_def = '*none*'
+ table.append([word,
+ synset.definition(),
+ hyper_def,
+ hypo_def])
+
+ table.columnconfig('Word', background='#afa')
+ table.columnconfig('Synset', background='#efe')
+ table.columnconfig('Hypernym', background='#fee')
+ table.columnconfig('Hyponym', background='#ffe')
for row in range(len(table)):
- for column in ("Hypernym", "Hyponym"):
- if table[row, column] == "*none*":
- table.itemconfig(
- row, column, foreground="#666", selectforeground="#666"
- )
+ for column in ('Hypernym', 'Hyponym'):
+ if table[row, column] == '*none*':
+ table.itemconfig(row, column, foreground='#666',
+ selectforeground='#666')
root.mainloop()
-
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: Graphical Representations for Trees
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
Graphically display a Tree.
"""
-from tkinter import IntVar, Menu, Tk
+from six.moves.tkinter import IntVar, Menu, Tk
from nltk.util import in_idle
from nltk.tree import Tree
-from nltk.draw.util import (
- CanvasFrame,
- CanvasWidget,
- BoxWidget,
- TextWidget,
- ParenWidget,
- OvalWidget,
-)
+from nltk.draw.util import (CanvasFrame, CanvasWidget, BoxWidget,
+ TextWidget, ParenWidget, OvalWidget)
##//////////////////////////////////////////////////////
## Tree Segment
##//////////////////////////////////////////////////////
-
class TreeSegmentWidget(CanvasWidget):
"""
A canvas widget that displays a single segment of a hierarchical
branch downwards).
- ``draggable``: whether the widget can be dragged by the user.
"""
-
def __init__(self, canvas, label, subtrees, **attribs):
"""
:type node:
self._ordered = False
# Create canvas objects.
- self._lines = [canvas.create_line(0, 0, 0, 0, fill="#006060") for c in subtrees]
- self._polygon = canvas.create_polygon(
- 0, 0, fill="", state="hidden", outline="#006060"
- )
+ self._lines = [canvas.create_line(0,0,0,0, fill='#006060')
+ for c in subtrees]
+ self._polygon = canvas.create_polygon(0,0, fill='', state='hidden',
+ outline='#006060')
# Register child widgets (label + subtrees)
self._add_child_widget(label)
def __setitem__(self, attr, value):
canvas = self.canvas()
- if attr == "roof":
+ if attr == 'roof':
self._roof = value
if self._roof:
- for l in self._lines:
- canvas.itemconfig(l, state="hidden")
- canvas.itemconfig(self._polygon, state="normal")
+ for l in self._lines: canvas.itemconfig(l, state='hidden')
+ canvas.itemconfig(self._polygon, state='normal')
else:
- for l in self._lines:
- canvas.itemconfig(l, state="normal")
- canvas.itemconfig(self._polygon, state="hidden")
- elif attr == "orientation":
- if value == "horizontal":
- self._horizontal = 1
- elif value == "vertical":
- self._horizontal = 0
+ for l in self._lines: canvas.itemconfig(l, state='normal')
+ canvas.itemconfig(self._polygon, state='hidden')
+ elif attr == 'orientation':
+ if value == 'horizontal': self._horizontal = 1
+ elif value == 'vertical': self._horizontal = 0
else:
- raise ValueError("orientation must be horizontal or vertical")
- elif attr == "color":
- for l in self._lines:
- canvas.itemconfig(l, fill=value)
+ raise ValueError('orientation must be horizontal or vertical')
+ elif attr == 'color':
+ for l in self._lines: canvas.itemconfig(l, fill=value)
canvas.itemconfig(self._polygon, outline=value)
- elif isinstance(attr, tuple) and attr[0] == "color":
+ elif isinstance(attr, tuple) and attr[0] == 'color':
# Set the color of an individual line.
l = self._lines[int(attr[1])]
canvas.itemconfig(l, fill=value)
- elif attr == "fill":
+ elif attr == 'fill':
canvas.itemconfig(self._polygon, fill=value)
- elif attr == "width":
- canvas.itemconfig(self._polygon, {attr: value})
- for l in self._lines:
- canvas.itemconfig(l, {attr: value})
- elif attr in ("xspace", "yspace"):
- if attr == "xspace":
- self._xspace = value
- elif attr == "yspace":
- self._yspace = value
+ elif attr == 'width':
+ canvas.itemconfig(self._polygon, {attr:value})
+ for l in self._lines: canvas.itemconfig(l, {attr:value})
+ elif attr in ('xspace', 'yspace'):
+ if attr == 'xspace': self._xspace = value
+ elif attr == 'yspace': self._yspace = value
self.update(self._label)
- elif attr == "ordered":
+ elif attr == 'ordered':
self._ordered = value
else:
CanvasWidget.__setitem__(self, attr, value)
def __getitem__(self, attr):
- if attr == "roof":
- return self._roof
- elif attr == "width":
+ if attr == 'roof': return self._roof
+ elif attr == 'width':
return self.canvas().itemcget(self._polygon, attr)
- elif attr == "color":
- return self.canvas().itemcget(self._polygon, "outline")
- elif isinstance(attr, tuple) and attr[0] == "color":
+ elif attr == 'color':
+ return self.canvas().itemcget(self._polygon, 'outline')
+ elif isinstance(attr, tuple) and attr[0] == 'color':
l = self._lines[int(attr[1])]
- return self.canvas().itemcget(l, "fill")
- elif attr == "xspace":
- return self._xspace
- elif attr == "yspace":
- return self._yspace
- elif attr == "orientation":
- if self._horizontal:
- return "horizontal"
- else:
- return "vertical"
- elif attr == "ordered":
+ return self.canvas().itemcget(l, 'fill')
+ elif attr == 'xspace': return self._xspace
+ elif attr == 'yspace': return self._yspace
+ elif attr == 'orientation':
+ if self._horizontal: return 'horizontal'
+ else: return 'vertical'
+ elif attr == 'ordered':
return self._ordered
else:
return CanvasWidget.__getitem__(self, attr)
canvas = self.canvas()
self._subtrees.insert(index, child)
self._add_child_widget(child)
- self._lines.append(canvas.create_line(0, 0, 0, 0, fill="#006060"))
+ self._lines.append(canvas.create_line(0,0,0,0, fill='#006060'))
self.update(self._label)
# but.. lines???
else:
bbox = child.bbox()
if self._horizontal:
- return (bbox[0], (bbox[1] + bbox[3]) / 2.0)
+ return (bbox[0], (bbox[1]+bbox[3])/2.0)
else:
- return ((bbox[0] + bbox[2]) / 2.0, bbox[1])
+ return ((bbox[0]+bbox[2])/2.0, bbox[1])
def _node_bottom(self):
bbox = self._label.bbox()
if self._horizontal:
- return (bbox[2], (bbox[1] + bbox[3]) / 2.0)
+ return (bbox[2], (bbox[1]+bbox[3])/2.0)
else:
- return ((bbox[0] + bbox[2]) / 2.0, bbox[3])
+ return ((bbox[0]+bbox[2])/2.0, bbox[3])
def _update(self, child):
- if len(self._subtrees) == 0:
- return
- if self._label.bbox() is None:
- return # [XX] ???
+ if len(self._subtrees) == 0: return
+ if self._label.bbox() is None: return # [XX] ???
# Which lines need to be redrawn?
- if child is self._label:
- need_update = self._subtrees
- else:
- need_update = [child]
+ if child is self._label: need_update = self._subtrees
+ else: need_update = [child]
if self._ordered and not self._managing:
need_update = self._maintain_order(child)
ymax = max(ymax, bbox[3])
if self._horizontal:
- self.canvas().coords(
- self._polygon, nodex, nodey, xmin, ymin, xmin, ymax, nodex, nodey
- )
+ self.canvas().coords(self._polygon, nodex, nodey, xmin,
+ ymin, xmin, ymax, nodex, nodey)
else:
- self.canvas().coords(
- self._polygon, nodex, nodey, xmin, ymin, xmax, ymin, nodex, nodey
- )
+ self.canvas().coords(self._polygon, nodex, nodey, xmin,
+ ymin, xmax, ymin, nodex, nodey)
# Redraw all lines that need it.
for subtree in need_update:
# Check all the leaves
for subtree in self._subtrees:
(x1, y1, x2, y2) = subtree.bbox()
- if bot + self._yspace > y1:
- subtree.move(0, bot + self._yspace - y1)
+ if bot+self._yspace > y1:
+ subtree.move(0,bot+self._yspace-y1)
return self._subtrees
else:
# Check leaves to our right.
x = right + self._xspace
- for i in range(index + 1, len(self._subtrees)):
+ for i in range(index+1, len(self._subtrees)):
(x1, y1, x2, y2) = self._subtrees[i].bbox()
if x > x1:
- self._subtrees[i].move(x - x1, 0)
- x += x2 - x1 + self._xspace
+ self._subtrees[i].move(x-x1, 0)
+ x += x2-x1 + self._xspace
moved.append(self._subtrees[i])
# Check leaves to our left.
x = left - self._xspace
- for i in range(index - 1, -1, -1):
+ for i in range(index-1, -1, -1):
(x1, y1, x2, y2) = self._subtrees[i].bbox()
if x < x2:
- self._subtrees[i].move(x - x2, 0)
- x -= x2 - x1 + self._xspace
+ self._subtrees[i].move(x-x2, 0)
+ x -= x2-x1 + self._xspace
moved.append(self._subtrees[i])
# Check the node
(x1, y1, x2, y2) = self._label.bbox()
- if y2 > top - self._yspace:
- self._label.move(0, top - self._yspace - y2)
+ if y2 > top-self._yspace:
+ self._label.move(0, top-self._yspace-y2)
moved = self._subtrees
# Return a list of the nodes we moved
# Check all the leaves
for subtree in self._subtrees:
(x1, y1, x2, y2) = subtree.bbox()
- if right + self._xspace > x1:
- subtree.move(right + self._xspace - x1)
+ if right+self._xspace > x1:
+ subtree.move(right+self._xspace-x1)
return self._subtrees
else:
# Check leaves below us.
y = bot + self._yspace
- for i in range(index + 1, len(self._subtrees)):
+ for i in range(index+1, len(self._subtrees)):
(x1, y1, x2, y2) = self._subtrees[i].bbox()
if y > y1:
- self._subtrees[i].move(0, y - y1)
- y += y2 - y1 + self._yspace
+ self._subtrees[i].move(0, y-y1)
+ y += y2-y1 + self._yspace
moved.append(self._subtrees[i])
# Check leaves above us
y = top - self._yspace
- for i in range(index - 1, -1, -1):
+ for i in range(index-1, -1, -1):
(x1, y1, x2, y2) = self._subtrees[i].bbox()
if y < y2:
- self._subtrees[i].move(0, y - y2)
- y -= y2 - y1 + self._yspace
+ self._subtrees[i].move(0, y-y2)
+ y -= y2-y1 + self._yspace
moved.append(self._subtrees[i])
# Check the node
(x1, y1, x2, y2) = self._label.bbox()
- if x2 > left - self._xspace:
- self._label.move(left - self._xspace - x2, 0)
+ if x2 > left-self._xspace:
+ self._label.move(left-self._xspace-x2, 0)
moved = self._subtrees
# Return a list of the nodes we moved
# Center the subtrees with the node.
for subtree in self._subtrees:
- subtree.move(0, nodey - center)
+ subtree.move(0, nodey-center)
def _manage_vertical(self):
(nodex, nodey) = self._node_bottom()
# Find the center of their tops.
center = 0.0
for subtree in self._subtrees:
- center += self._subtree_top(subtree)[0] / len(self._subtrees)
+ center += self._subtree_top(subtree)[0]/len(self._subtrees)
# Center the subtrees with the node.
for subtree in self._subtrees:
- subtree.move(nodex - center, 0)
+ subtree.move(nodex-center, 0)
def _manage(self):
self._managing = True
(nodex, nodey) = self._node_bottom()
- if len(self._subtrees) == 0:
- return
+ if len(self._subtrees) == 0: return
- if self._horizontal:
- self._manage_horizontal()
- else:
- self._manage_vertical()
+ if self._horizontal: self._manage_horizontal()
+ else: self._manage_vertical()
# Update lines to subtrees.
for subtree in self._subtrees:
self._managing = False
def __repr__(self):
- return "[TreeSeg %s: %s]" % (self._label, self._subtrees)
-
-
-def _tree_to_treeseg(
- canvas,
- t,
- make_node,
- make_leaf,
- tree_attribs,
- node_attribs,
- leaf_attribs,
- loc_attribs,
-):
+ return '[TreeSeg %s: %s]' % (self._label, self._subtrees)
+
+def _tree_to_treeseg(canvas, t, make_node, make_leaf,
+ tree_attribs, node_attribs,
+ leaf_attribs, loc_attribs):
if isinstance(t, Tree):
label = make_node(canvas, t.label(), **node_attribs)
- subtrees = [
- _tree_to_treeseg(
- canvas,
- child,
- make_node,
- make_leaf,
- tree_attribs,
- node_attribs,
- leaf_attribs,
- loc_attribs,
- )
- for child in t
- ]
+ subtrees = [_tree_to_treeseg(canvas, child, make_node, make_leaf,
+ tree_attribs, node_attribs,
+ leaf_attribs, loc_attribs)
+ for child in t]
return TreeSegmentWidget(canvas, label, subtrees, **tree_attribs)
else:
return make_leaf(canvas, t, **leaf_attribs)
-
-def tree_to_treesegment(
- canvas, t, make_node=TextWidget, make_leaf=TextWidget, **attribs
-):
+def tree_to_treesegment(canvas, t, make_node=TextWidget,
+ make_leaf=TextWidget, **attribs):
"""
Convert a Tree into a ``TreeSegmentWidget``.
loc_attribs = {}
for (key, value) in list(attribs.items()):
- if key[:5] == "tree_":
- tree_attribs[key[5:]] = value
- elif key[:5] == "node_":
- node_attribs[key[5:]] = value
- elif key[:5] == "leaf_":
- leaf_attribs[key[5:]] = value
- elif key[:4] == "loc_":
- loc_attribs[key[4:]] = value
- else:
- raise ValueError("Bad attribute: %s" % key)
- return _tree_to_treeseg(
- canvas,
- t,
- make_node,
- make_leaf,
- tree_attribs,
- node_attribs,
- leaf_attribs,
- loc_attribs,
- )
-
+ if key[:5] == 'tree_': tree_attribs[key[5:]] = value
+ elif key[:5] == 'node_': node_attribs[key[5:]] = value
+ elif key[:5] == 'leaf_': leaf_attribs[key[5:]] = value
+ elif key[:4] == 'loc_': loc_attribs[key[4:]] = value
+ else: raise ValueError('Bad attribute: %s' % key)
+ return _tree_to_treeseg(canvas, t, make_node, make_leaf,
+ tree_attribs, node_attribs,
+ leaf_attribs, loc_attribs)
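# Attribute-routing sketch: each prefix is stripped and the remainder is
# forwarded to the matching widget constructor, e.g. (values illustrative):
#
#     seg = tree_to_treesegment(canvas, t,
#                               tree_color='#006060',
#                               node_font=('helvetica', -12, 'bold'),
#                               leaf_color='green4')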
##//////////////////////////////////////////////////////
## Tree Widget
##//////////////////////////////////////////////////////
-
class TreeWidget(CanvasWidget):
"""
A canvas widget that displays a single Tree.
segments.
- ``draggable``: whether the widget can be dragged by the user.
"""
-
- def __init__(
- self, canvas, t, make_node=TextWidget, make_leaf=TextWidget, **attribs
- ):
+ def __init__(self, canvas, t, make_node=TextWidget,
+ make_leaf=TextWidget, **attribs):
# Node & leaf canvas widget constructors
self._make_node = make_node
self._make_leaf = make_leaf
# Attributes.
self._nodeattribs = {}
self._leafattribs = {}
- self._locattribs = {"color": "#008000"}
- self._line_color = "#008080"
+ self._locattribs = {'color': '#008000'}
+ self._line_color = '#008080'
self._line_width = 1
- self._roof_color = "#008080"
- self._roof_fill = "#c0c0c0"
+ self._roof_color = '#008080'
+ self._roof_fill = '#c0c0c0'
self._shapeable = False
self._xspace = 10
self._yspace = 10
- self._orientation = "vertical"
+ self._orientation = 'vertical'
self._ordered = False
# Build trees.
- self._keys = {} # treeseg -> key
+ self._keys = {} # treeseg -> key
self._expanded_trees = {}
self._collapsed_trees = {}
self._nodes = []
self._leaves = []
- # self._locs = []
+ #self._locs = []
self._make_collapsed_trees(canvas, t, ())
self._treeseg = self._make_expanded_tree(canvas, t, ())
self._add_child_widget(self._treeseg)
"""
Add a binding to all leaves.
"""
- for leaf in self._leaves:
- leaf.bind_click(callback, button)
- for leaf in self._leaves:
- leaf.bind_click(callback, button)
+ for leaf in self._leaves: leaf.bind_click(callback, button)
def bind_drag_leaves(self, callback, button=1):
"""
Add a binding to all leaves.
"""
- for leaf in self._leaves:
- leaf.bind_drag(callback, button)
- for leaf in self._leaves:
- leaf.bind_drag(callback, button)
+ for leaf in self._leaves: leaf.bind_drag(callback, button)
def bind_click_nodes(self, callback, button=1):
"""
Add a binding to all nodes.
"""
- for node in self._nodes:
- node.bind_click(callback, button)
- for node in self._nodes:
- node.bind_click(callback, button)
+ for node in self._nodes: node.bind_click(callback, button)
def bind_drag_nodes(self, callback, button=1):
"""
Add a binding to all nodes.
"""
- for node in self._nodes:
- node.bind_drag(callback, button)
- for node in self._nodes:
- node.bind_drag(callback, button)
+ for node in self._nodes: node.bind_drag(callback, button)
def _make_collapsed_trees(self, canvas, t, key):
- if not isinstance(t, Tree):
- return
+ if not isinstance(t, Tree): return
make_node = self._make_node
make_leaf = self._make_leaf
node = make_node(canvas, t.label(), **self._nodeattribs)
self._nodes.append(node)
- leaves = [make_leaf(canvas, l, **self._leafattribs) for l in t.leaves()]
+ leaves = [make_leaf(canvas, l, **self._leafattribs)
+ for l in t.leaves()]
self._leaves += leaves
- treeseg = TreeSegmentWidget(
- canvas,
- node,
- leaves,
- roof=1,
- color=self._roof_color,
- fill=self._roof_fill,
- width=self._line_width,
- )
+ treeseg = TreeSegmentWidget(canvas, node, leaves, roof=1,
+ color=self._roof_color,
+ fill=self._roof_fill,
+ width=self._line_width)
self._collapsed_trees[key] = treeseg
self._keys[treeseg] = key
- # self._add_child_widget(treeseg)
+ #self._add_child_widget(treeseg)
treeseg.hide()
# Build trees for children.
node = make_node(canvas, t.label(), **self._nodeattribs)
self._nodes.append(node)
children = t
- subtrees = [
- self._make_expanded_tree(canvas, children[i], key + (i,))
- for i in range(len(children))
- ]
- treeseg = TreeSegmentWidget(
- canvas, node, subtrees, color=self._line_color, width=self._line_width
- )
+ subtrees = [self._make_expanded_tree(canvas, children[i], key+(i,))
+ for i in range(len(children))]
+ treeseg = TreeSegmentWidget(canvas, node, subtrees,
+ color=self._line_color,
+ width=self._line_width)
self._expanded_trees[key] = treeseg
self._keys[treeseg] = key
return treeseg
return leaf
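# Both caches are keyed by the tuple of child indices that leads to the
# subtree (e.g. (), (0,), (0, 1)), so toggle_collapsed() can swap an
# expanded segment for the collapsed one at the same position.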
def __setitem__(self, attr, value):
- if attr[:5] == "node_":
- for node in self._nodes:
- node[attr[5:]] = value
- elif attr[:5] == "leaf_":
- for leaf in self._leaves:
- leaf[attr[5:]] = value
- elif attr == "line_color":
+ if attr[:5] == 'node_':
+ for node in self._nodes: node[attr[5:]] = value
+ elif attr[:5] == 'leaf_':
+ for leaf in self._leaves: leaf[attr[5:]] = value
+ elif attr == 'line_color':
self._line_color = value
- for tseg in list(self._expanded_trees.values()):
- tseg["color"] = value
- elif attr == "line_width":
+ for tseg in list(self._expanded_trees.values()): tseg['color'] = value
+ elif attr == 'line_width':
self._line_width = value
- for tseg in list(self._expanded_trees.values()):
- tseg["width"] = value
- for tseg in list(self._collapsed_trees.values()):
- tseg["width"] = value
- elif attr == "roof_color":
+ for tseg in list(self._expanded_trees.values()): tseg['width'] = value
+ for tseg in list(self._collapsed_trees.values()): tseg['width'] = value
+ elif attr == 'roof_color':
self._roof_color = value
- for tseg in list(self._collapsed_trees.values()):
- tseg["color"] = value
- elif attr == "roof_fill":
+ for tseg in list(self._collapsed_trees.values()): tseg['color'] = value
+ elif attr == 'roof_fill':
self._roof_fill = value
- for tseg in list(self._collapsed_trees.values()):
- tseg["fill"] = value
- elif attr == "shapeable":
+ for tseg in list(self._collapsed_trees.values()): tseg['fill'] = value
+ elif attr == 'shapeable':
self._shapeable = value
for tseg in list(self._expanded_trees.values()):
- tseg["draggable"] = value
+ tseg['draggable'] = value
for tseg in list(self._collapsed_trees.values()):
- tseg["draggable"] = value
- for leaf in self._leaves:
- leaf["draggable"] = value
- elif attr == "xspace":
+ tseg['draggable'] = value
+ for leaf in self._leaves: leaf['draggable'] = value
+ elif attr == 'xspace':
self._xspace = value
for tseg in list(self._expanded_trees.values()):
- tseg["xspace"] = value
+ tseg['xspace'] = value
for tseg in list(self._collapsed_trees.values()):
- tseg["xspace"] = value
+ tseg['xspace'] = value
self.manage()
- elif attr == "yspace":
+ elif attr == 'yspace':
self._yspace = value
for tseg in list(self._expanded_trees.values()):
- tseg["yspace"] = value
+ tseg['yspace'] = value
for tseg in list(self._collapsed_trees.values()):
- tseg["yspace"] = value
+ tseg['yspace'] = value
self.manage()
- elif attr == "orientation":
+ elif attr == 'orientation':
self._orientation = value
for tseg in list(self._expanded_trees.values()):
- tseg["orientation"] = value
+ tseg['orientation'] = value
for tseg in list(self._collapsed_trees.values()):
- tseg["orientation"] = value
+ tseg['orientation'] = value
self.manage()
- elif attr == "ordered":
+ elif attr == 'ordered':
self._ordered = value
for tseg in list(self._expanded_trees.values()):
- tseg["ordered"] = value
+ tseg['ordered'] = value
for tseg in list(self._collapsed_trees.values()):
- tseg["ordered"] = value
- else:
- CanvasWidget.__setitem__(self, attr, value)
+ tseg['ordered'] = value
+ else: CanvasWidget.__setitem__(self, attr, value)
def __getitem__(self, attr):
- if attr[:5] == "node_":
+ if attr[:5] == 'node_':
return self._nodeattribs.get(attr[5:], None)
- elif attr[:5] == "leaf_":
+ elif attr[:5] == 'leaf_':
return self._leafattribs.get(attr[5:], None)
- elif attr[:4] == "loc_":
+ elif attr[:4] == 'loc_':
return self._locattribs.get(attr[4:], None)
- elif attr == "line_color":
- return self._line_color
- elif attr == "line_width":
- return self._line_width
- elif attr == "roof_color":
- return self._roof_color
- elif attr == "roof_fill":
- return self._roof_fill
- elif attr == "shapeable":
- return self._shapeable
- elif attr == "xspace":
- return self._xspace
- elif attr == "yspace":
- return self._yspace
- elif attr == "orientation":
- return self._orientation
- else:
- return CanvasWidget.__getitem__(self, attr)
-
- def _tags(self):
- return []
+ elif attr == 'line_color': return self._line_color
+ elif attr == 'line_width': return self._line_width
+ elif attr == 'roof_color': return self._roof_color
+ elif attr == 'roof_fill': return self._roof_fill
+ elif attr == 'shapeable': return self._shapeable
+ elif attr == 'xspace': return self._xspace
+ elif attr == 'yspace': return self._yspace
+ elif attr == 'orientation': return self._orientation
+ else: return CanvasWidget.__getitem__(self, attr)
+
+ def _tags(self): return []
def _manage(self):
- segs = list(self._expanded_trees.values()) + list(
- self._collapsed_trees.values()
- )
+ segs = list(self._expanded_trees.values()) + list(self._collapsed_trees.values())
for tseg in segs:
if tseg.hidden():
tseg.show()
Collapse/expand a tree.
"""
old_treeseg = treeseg
- if old_treeseg["roof"]:
+ if old_treeseg['roof']:
new_treeseg = self._expanded_trees[self._keys[old_treeseg]]
else:
new_treeseg = self._collapsed_trees[self._keys[old_treeseg]]
new_treeseg.show()
(newx, newy) = new_treeseg.label().bbox()[:2]
(oldx, oldy) = old_treeseg.label().bbox()[:2]
- new_treeseg.move(oldx - newx, oldy - newy)
+ new_treeseg.move(oldx-newx, oldy-newy)
# Hide the old tree
old_treeseg.hide()
# We could do parent.manage() here instead, if we wanted.
new_treeseg.parent().update(new_treeseg)
-
##//////////////////////////////////////////////////////
## draw_trees
##//////////////////////////////////////////////////////
-
class TreeView(object):
def __init__(self, *trees):
from math import sqrt, ceil
self._trees = trees
self._top = Tk()
- self._top.title("NLTK")
- self._top.bind("<Control-x>", self.destroy)
- self._top.bind("<Control-q>", self.destroy)
+ self._top.title('NLTK')
+ self._top.bind('<Control-x>', self.destroy)
+ self._top.bind('<Control-q>', self.destroy)
cf = self._cframe = CanvasFrame(self._top)
- self._top.bind("<Control-p>", self._cframe.print_to_file)
+ self._top.bind('<Control-p>', self._cframe.print_to_file)
# Size is variable.
self._size = IntVar(self._top)
self._size.set(12)
- bold = ("helvetica", -self._size.get(), "bold")
- helv = ("helvetica", -self._size.get())
+ bold = ('helvetica', -self._size.get(), 'bold')
+ helv = ('helvetica', -self._size.get())
# Lay the trees out in a square.
self._width = int(ceil(sqrt(len(trees))))
self._widgets = []
for i in range(len(trees)):
- widget = TreeWidget(
- cf.canvas(),
- trees[i],
- node_font=bold,
- leaf_color="#008040",
- node_color="#004080",
- roof_color="#004040",
- roof_fill="white",
- line_color="#004040",
- draggable=1,
- leaf_font=helv,
- )
+ widget = TreeWidget(cf.canvas(), trees[i], node_font=bold,
+ leaf_color='#008040', node_color='#004080',
+ roof_color='#004040', roof_fill='white',
+ line_color='#004040', draggable=1,
+ leaf_font=helv)
widget.bind_click_trees(widget.toggle_collapsed)
self._widgets.append(widget)
cf.add_widget(widget, 0, 0)
self._layout()
- self._cframe.pack(expand=1, fill="both")
+ self._cframe.pack(expand=1, fill='both')
self._init_menubar()
def _layout(self):
if i % width == 0:
y = ymax
x = 0
- widget.move(x - oldx, y - oldy)
+ widget.move(x-oldx, y-oldy)
x = widget.bbox()[2] + 10
ymax = max(ymax, widget.bbox()[3] + 10)
menubar = Menu(self._top)
filemenu = Menu(menubar, tearoff=0)
- filemenu.add_command(
- label="Print to Postscript",
- underline=0,
- command=self._cframe.print_to_file,
- accelerator="Ctrl-p",
- )
- filemenu.add_command(
- label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x"
- )
- menubar.add_cascade(label="File", underline=0, menu=filemenu)
+ filemenu.add_command(label='Print to Postscript', underline=0,
+ command=self._cframe.print_to_file,
+ accelerator='Ctrl-p')
+ filemenu.add_command(label='Exit', underline=1,
+ command=self.destroy, accelerator='Ctrl-x')
+ menubar.add_cascade(label='File', underline=0, menu=filemenu)
zoommenu = Menu(menubar, tearoff=0)
- zoommenu.add_radiobutton(
- label="Tiny",
- variable=self._size,
- underline=0,
- value=10,
- command=self.resize,
- )
- zoommenu.add_radiobutton(
- label="Small",
- variable=self._size,
- underline=0,
- value=12,
- command=self.resize,
- )
- zoommenu.add_radiobutton(
- label="Medium",
- variable=self._size,
- underline=0,
- value=14,
- command=self.resize,
- )
- zoommenu.add_radiobutton(
- label="Large",
- variable=self._size,
- underline=0,
- value=28,
- command=self.resize,
- )
- zoommenu.add_radiobutton(
- label="Huge",
- variable=self._size,
- underline=0,
- value=50,
- command=self.resize,
- )
- menubar.add_cascade(label="Zoom", underline=0, menu=zoommenu)
+ zoommenu.add_radiobutton(label='Tiny', variable=self._size,
+ underline=0, value=10, command=self.resize)
+ zoommenu.add_radiobutton(label='Small', variable=self._size,
+ underline=0, value=12, command=self.resize)
+ zoommenu.add_radiobutton(label='Medium', variable=self._size,
+ underline=0, value=14, command=self.resize)
+ zoommenu.add_radiobutton(label='Large', variable=self._size,
+ underline=0, value=28, command=self.resize)
+ zoommenu.add_radiobutton(label='Huge', variable=self._size,
+ underline=0, value=50, command=self.resize)
+ menubar.add_cascade(label='Zoom', underline=0, menu=zoommenu)
self._top.config(menu=menubar)
def resize(self, *e):
- bold = ("helvetica", -self._size.get(), "bold")
- helv = ("helvetica", -self._size.get())
+ bold = ('helvetica', -self._size.get(), 'bold')
+ helv = ('helvetica', -self._size.get())
xspace = self._size.get()
yspace = self._size.get()
for widget in self._widgets:
- widget["node_font"] = bold
- widget["leaf_font"] = helv
- widget["xspace"] = xspace
- widget["yspace"] = yspace
- if self._size.get() < 20:
- widget["line_width"] = 1
- elif self._size.get() < 30:
- widget["line_width"] = 2
- else:
- widget["line_width"] = 3
+ widget['node_font'] = bold
+ widget['leaf_font'] = helv
+ widget['xspace'] = xspace
+ widget['yspace'] = yspace
+ if self._size.get() < 20: widget['line_width'] = 1
+ elif self._size.get() < 30: widget['line_width'] = 2
+ else: widget['line_width'] = 3
self._layout()
def destroy(self, *e):
- if self._top is None:
- return
+ if self._top is None: return
self._top.destroy()
self._top = None
from a script); otherwise, the demo will close as soon as
the script completes.
"""
- if in_idle():
- return
+ if in_idle(): return
self._top.mainloop(*args, **kwargs)
-
def draw_trees(*trees):
"""
Open a new window containing a graphical diagram of the given
TreeView(*trees).mainloop()
return
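# Usage sketch:
#
#     from nltk.tree import Tree
#     t = Tree.fromstring('(S (NP Mary) (VP walks))')
#     draw_trees(t)    # opens a Tk window; Ctrl-x or Ctrl-q closes it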
-
##//////////////////////////////////////////////////////
## Demo Code
##//////////////////////////////////////////////////////
-
def demo():
import random
-
def fill(cw):
- cw["fill"] = "#%06d" % random.randint(0, 999999)
+ cw['fill'] = '#%06d' % random.randint(0,999999)
cf = CanvasFrame(width=550, height=450, closeenough=2)
- t = Tree.fromstring(
- """
+ t = Tree.fromstring('''
(S (NP the very big cat)
- (VP (Adv sorta) (V saw) (NP (Det the) (N dog))))"""
- )
-
- tc = TreeWidget(
- cf.canvas(),
- t,
- draggable=1,
- node_font=("helvetica", -14, "bold"),
- leaf_font=("helvetica", -12, "italic"),
- roof_fill="white",
- roof_color="black",
- leaf_color="green4",
- node_color="blue2",
- )
- cf.add_widget(tc, 10, 10)
+ (VP (Adv sorta) (V saw) (NP (Det the) (N dog))))''')
- def boxit(canvas, text):
- big = ("helvetica", -16, "bold")
- return BoxWidget(canvas, TextWidget(canvas, text, font=big), fill="green")
+ tc = TreeWidget(cf.canvas(), t, draggable=1,
+ node_font=('helvetica', -14, 'bold'),
+ leaf_font=('helvetica', -12, 'italic'),
+ roof_fill='white', roof_color='black',
+ leaf_color='green4', node_color='blue2')
+ cf.add_widget(tc,10,10)
+ def boxit(canvas, text):
+ big = ('helvetica', -16, 'bold')
+ return BoxWidget(canvas, TextWidget(canvas, text,
+ font=big), fill='green')
def ovalit(canvas, text):
- return OvalWidget(canvas, TextWidget(canvas, text), fill="cyan")
+ return OvalWidget(canvas, TextWidget(canvas, text),
+ fill='cyan')
- treetok = Tree.fromstring("(S (NP this tree) (VP (V is) (AdjP shapeable)))")
+ treetok = Tree.fromstring('(S (NP this tree) (VP (V is) (AdjP shapeable)))')
tc2 = TreeWidget(cf.canvas(), treetok, boxit, ovalit, shapeable=1)
def color(node):
- node["color"] = "#%04d00" % random.randint(0, 9999)
-
+ node['color'] = '#%04d00' % random.randint(0,9999)
def color2(treeseg):
- treeseg.label()["fill"] = "#%06d" % random.randint(0, 9999)
- treeseg.label().child()["color"] = "white"
+ treeseg.label()['fill'] = '#%06d' % random.randint(0,9999)
+ treeseg.label().child()['color'] = 'white'
tc.bind_click_trees(tc.toggle_collapsed)
tc2.bind_click_trees(tc2.toggle_collapsed)
tc2.expanded_tree().bind_click(color2, 3)
paren = ParenWidget(cf.canvas(), tc2)
- cf.add_widget(paren, tc.bbox()[2] + 10, 10)
+ cf.add_widget(paren, tc.bbox()[2]+10, 10)
- tree3 = Tree.fromstring(
- """
+ tree3 = Tree.fromstring('''
(S (NP this tree) (AUX was)
- (VP (V built) (PP (P with) (NP (N tree_to_treesegment)))))"""
- )
- tc3 = tree_to_treesegment(
- cf.canvas(), tree3, tree_color="green4", tree_xspace=2, tree_width=2
- )
- tc3["draggable"] = 1
- cf.add_widget(tc3, 10, tc.bbox()[3] + 10)
+ (VP (V built) (PP (P with) (NP (N tree_to_treesegment)))))''')
+ tc3 = tree_to_treesegment(cf.canvas(), tree3, tree_color='green4',
+ tree_xspace=2, tree_width=2)
+ tc3['draggable'] = 1
+ cf.add_widget(tc3, 10, tc.bbox()[3]+10)
def orientswitch(treewidget):
- if treewidget["orientation"] == "horizontal":
- treewidget.expanded_tree(1, 1).subtrees()[0].set_text("vertical")
- treewidget.collapsed_tree(1, 1).subtrees()[0].set_text("vertical")
- treewidget.collapsed_tree(1).subtrees()[1].set_text("vertical")
- treewidget.collapsed_tree().subtrees()[3].set_text("vertical")
- treewidget["orientation"] = "vertical"
+ if treewidget['orientation'] == 'horizontal':
+ treewidget.expanded_tree(1,1).subtrees()[0].set_text('vertical')
+ treewidget.collapsed_tree(1,1).subtrees()[0].set_text('vertical')
+ treewidget.collapsed_tree(1).subtrees()[1].set_text('vertical')
+ treewidget.collapsed_tree().subtrees()[3].set_text('vertical')
+ treewidget['orientation'] = 'vertical'
else:
- treewidget.expanded_tree(1, 1).subtrees()[0].set_text("horizontal")
- treewidget.collapsed_tree(1, 1).subtrees()[0].set_text("horizontal")
- treewidget.collapsed_tree(1).subtrees()[1].set_text("horizontal")
- treewidget.collapsed_tree().subtrees()[3].set_text("horizontal")
- treewidget["orientation"] = "horizontal"
+ treewidget.expanded_tree(1,1).subtrees()[0].set_text('horizontal')
+ treewidget.collapsed_tree(1,1).subtrees()[0].set_text('horizontal')
+ treewidget.collapsed_tree(1).subtrees()[1].set_text('horizontal')
+ treewidget.collapsed_tree().subtrees()[3].set_text('horizontal')
+ treewidget['orientation'] = 'horizontal'
text = """
Try clicking, right clicking, and dragging
and OvalWidget). The bottom-left tree is
built from tree_to_treesegment."""
twidget = TextWidget(cf.canvas(), text.strip())
- textbox = BoxWidget(cf.canvas(), twidget, fill="white", draggable=1)
- cf.add_widget(textbox, tc3.bbox()[2] + 10, tc2.bbox()[3] + 10)
-
- tree4 = Tree.fromstring("(S (NP this tree) (VP (V is) (Adj horizontal)))")
- tc4 = TreeWidget(
- cf.canvas(),
- tree4,
- draggable=1,
- line_color="brown2",
- roof_color="brown2",
- node_font=("helvetica", -12, "bold"),
- node_color="brown4",
- orientation="horizontal",
- )
+ textbox = BoxWidget(cf.canvas(), twidget, fill='white', draggable=1)
+ cf.add_widget(textbox, tc3.bbox()[2]+10, tc2.bbox()[3]+10)
+
+ tree4 = Tree.fromstring('(S (NP this tree) (VP (V is) (Adj horizontal)))')
+ tc4 = TreeWidget(cf.canvas(), tree4, draggable=1,
+ line_color='brown2', roof_color='brown2',
+ node_font=('helvetica', -12, 'bold'),
+ node_color='brown4', orientation='horizontal')
tc4.manage()
- cf.add_widget(tc4, tc3.bbox()[2] + 10, textbox.bbox()[3] + 10)
+ cf.add_widget(tc4, tc3.bbox()[2]+10, textbox.bbox()[3]+10)
tc4.bind_click(orientswitch)
tc4.bind_click_trees(tc4.toggle_collapsed, 3)
# Run mainloop
cf.mainloop()
-
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: Drawing utilities
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
from abc import ABCMeta, abstractmethod
-from tkinter import (
- Button,
- Canvas,
- Entry,
- Frame,
- Label,
- Menu,
- Menubutton,
- Scrollbar,
- StringVar,
- Text,
- Tk,
- Toplevel,
- Widget,
- RAISED,
-)
-from tkinter.filedialog import asksaveasfilename
+from six import add_metaclass
+from six.moves.tkinter import (Button, Canvas, Entry, Frame, Label, Menu,
+ Menubutton, Scrollbar, StringVar, Text, Tk,
+ Toplevel, Widget, RAISED)
+from six.moves.tkinter_tkfiledialog import asksaveasfilename
from nltk.util import in_idle
##//////////////////////////////////////////////////////
-class CanvasWidget(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class CanvasWidget(object):
"""
A collection of graphical elements and bindings used to display a
complex object on a Tkinter ``Canvas``. A canvas widget is
argument, which is the ``CanvasWidget`` that triggered the
callback.
"""
-
def __init__(self, canvas, parent=None, **attribs):
"""
Create a new canvas widget. This constructor should only be
:param attribs: The new canvas widget's attributes.
"""
if self.__class__ == CanvasWidget:
- raise TypeError("CanvasWidget is an abstract base class")
+ raise TypeError('CanvasWidget is an abstract base class')
if not isinstance(canvas, Canvas):
- raise TypeError("Expected a canvas!")
+ raise TypeError('Expected a canvas!')
self.__canvas = canvas
self.__parent = parent
# If the subclass constructor called _add_child_widget, then
# self.__children will already exist.
- if not hasattr(self, "_CanvasWidget__children"):
- self.__children = []
+ if not hasattr(self, '_CanvasWidget__children'): self.__children = []
# Is this widget hidden?
self.__hidden = 0
self.__draggable = 0
# Set up attributes.
- for (attr, value) in list(attribs.items()):
- self[attr] = value
+ for (attr, value) in list(attribs.items()): self[attr] = value
# Manage this canvas widget
self._manage()
# Register any new bindings
for tag in self._tags():
- self.__canvas.tag_bind(tag, "<ButtonPress-1>", self.__press_cb)
- self.__canvas.tag_bind(tag, "<ButtonPress-2>", self.__press_cb)
- self.__canvas.tag_bind(tag, "<ButtonPress-3>", self.__press_cb)
+ self.__canvas.tag_bind(tag, '<ButtonPress-1>',
+ self.__press_cb)
+ self.__canvas.tag_bind(tag, '<ButtonPress-2>',
+ self.__press_cb)
+ self.__canvas.tag_bind(tag, '<ButtonPress-3>',
+ self.__press_cb)
##//////////////////////////////////////////////////////
## Inherited methods.
specified with respect to the coordinate space of the ``Canvas``.
:rtype: tuple(int, int, int, int)
"""
- if self.__hidden:
- return (0, 0, 0, 0)
- if len(self.tags()) == 0:
- raise ValueError("No tags")
+ if self.__hidden: return (0,0,0,0)
+ if len(self.tags()) == 0: raise ValueError('No tags')
return self.__canvas.bbox(*self.tags())
def width(self):
its ``Canvas``'s coordinate space.
:rtype: int
"""
- if len(self.tags()) == 0:
- raise ValueError("No tags")
+ if len(self.tags()) == 0: raise ValueError('No tags')
bbox = self.__canvas.bbox(*self.tags())
- return bbox[2] - bbox[0]
+ return bbox[2]-bbox[0]
def height(self):
"""
its ``Canvas``'s coordinate space.
:rtype: int
"""
- if len(self.tags()) == 0:
- raise ValueError("No tags")
+ if len(self.tags()) == 0: raise ValueError('No tags')
bbox = self.__canvas.bbox(*self.tags())
- return bbox[3] - bbox[1]
+ return bbox[3]-bbox[1]
def parent(self):
"""
downwards.
:rtype: None
"""
- if dx == dy == 0:
- return
+ if dx == dy == 0: return
for tag in self.tags():
self.__canvas.move(tag, dx, dy)
- if self.__parent:
- self.__parent.update(self)
+ if self.__parent: self.__parent.update(self)
- def moveto(self, x, y, anchor="NW"):
+ def moveto(self, x, y, anchor='NW'):
"""
Move this canvas widget to the given location. In particular,
shift the canvas widget such that the corner or side of the
specifies the top center; ``'NE'`` specifies the top right
corner; etc.
"""
- x1, y1, x2, y2 = self.bbox()
- if anchor == "NW":
- self.move(x - x1, y - y1)
- if anchor == "N":
- self.move(x - x1 / 2 - x2 / 2, y - y1)
- if anchor == "NE":
- self.move(x - x2, y - y1)
- if anchor == "E":
- self.move(x - x2, y - y1 / 2 - y2 / 2)
- if anchor == "SE":
- self.move(x - x2, y - y2)
- if anchor == "S":
- self.move(x - x1 / 2 - x2 / 2, y - y2)
- if anchor == "SW":
- self.move(x - x1, y - y2)
- if anchor == "W":
- self.move(x - x1, y - y1 / 2 - y2 / 2)
+ x1,y1,x2,y2 = self.bbox()
+ if anchor == 'NW': self.move(x-x1, y-y1)
+ if anchor == 'N': self.move(x-x1/2-x2/2, y-y1)
+ if anchor == 'NE': self.move(x-x2, y-y1)
+ if anchor == 'E': self.move(x-x2, y-y1/2-y2/2)
+ if anchor == 'SE': self.move(x-x2, y-y2)
+ if anchor == 'S': self.move(x-x1/2-x2/2, y-y2)
+ if anchor == 'SW': self.move(x-x1, y-y2)
+ if anchor == 'W': self.move(x-x1, y-y1/2-y2/2)
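# e.g. widget.moveto(0, 0, anchor='SE') puts the widget's bottom-right
# corner at the canvas origin; the default 'NW' puts its top-left
# corner there instead.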
def destroy(self):
"""
return
for tag in self.tags():
- self.__canvas.tag_unbind(tag, "<ButtonPress-1>")
- self.__canvas.tag_unbind(tag, "<ButtonPress-2>")
- self.__canvas.tag_unbind(tag, "<ButtonPress-3>")
+ self.__canvas.tag_unbind(tag, '<ButtonPress-1>')
+ self.__canvas.tag_unbind(tag, '<ButtonPress-2>')
+ self.__canvas.tag_unbind(tag, '<ButtonPress-3>')
self.__canvas.delete(*self.tags())
self.__canvas = None
:param child: The child widget that changed.
:type child: CanvasWidget
"""
- if self.__hidden or child.__hidden:
- return
+ if self.__hidden or child.__hidden: return
# If we're already updating, then do nothing. This prevents
# infinite loops when _update modifies its children.
- if self.__updating:
- return
+ if self.__updating: return
self.__updating = 1
# Update this CanvasWidget.
self._update(child)
# Propagate update request to the parent.
- if self.__parent:
- self.__parent.update(self)
+ if self.__parent: self.__parent.update(self)
# We're done updating.
self.__updating = 0
:rtype: None
"""
- if self.__hidden:
- return
- for child in self.__children:
- child.manage()
+ if self.__hidden: return
+ for child in self.__children: child.manage()
self._manage()
def tags(self):
:rtype: list of int
"""
if self.__canvas is None:
- raise ValueError("Attempt to access a destroyed canvas widget")
+ raise ValueError('Attempt to access a destroyed canvas widget')
tags = []
tags += self._tags()
for child in self.__children:
:rtype: None
"""
- if attr == "draggable":
+ if attr == 'draggable':
self.__draggable = value
else:
- raise ValueError("Unknown attribute %r" % attr)
+ raise ValueError('Unknown attribute %r' % attr)
def __getitem__(self, attr):
"""
canvas widget.
:rtype: (any)
"""
- if attr == "draggable":
+ if attr == 'draggable':
return self.__draggable
else:
- raise ValueError("Unknown attribute %r" % attr)
+ raise ValueError('Unknown attribute %r' % attr)
def __repr__(self):
"""
:return: a string representation of this canvas widget.
:rtype: str
"""
- return "<%s>" % self.__class__.__name__
+ return '<%s>' % self.__class__.__name__
def hide(self):
"""
"""
self.__hidden = 1
for tag in self.tags():
- self.__canvas.itemconfig(tag, state="hidden")
+ self.__canvas.itemconfig(tag, state='hidden')
def show(self):
"""
"""
self.__hidden = 0
for tag in self.tags():
- self.__canvas.itemconfig(tag, state="normal")
+ self.__canvas.itemconfig(tag, state='normal')
def hidden(self):
"""
will be called with this ``CanvasWidget`` as its argument.
"""
self.__draggable = 1
- self.__callbacks["drag"] = callback
+ self.__callbacks['drag'] = callback
def unbind_click(self, button=1):
"""
this ``CanvasWidget``. Typically, this should be 1 (left
button), 3 (right button), or 2 (middle button).
"""
- try:
- del self.__callbacks[button]
- except:
- pass
+ try: del self.__callbacks[button]
+ except: pass
def unbind_drag(self):
"""
Remove a callback that was registered with ``bind_drag``.
"""
- try:
- del self.__callbacks["drag"]
- except:
- pass
+ try: del self.__callbacks['drag']
+ except: pass
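+ # A short usage sketch for the binding methods above ('cw' is assumed
+ # to be an existing CanvasWidget on a live canvas):
+ #
+ #   def on_click(widget): print('clicked %r' % widget)
+ #   def on_drag(widget): print('dragged %r' % widget)
+ #   cw['draggable'] = 1
+ #   cw.bind_click(on_click)    # left button by default
+ #   cw.bind_drag(on_drag)
+ #   cw.unbind_click(); cw.unbind_drag()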
##//////////////////////////////////////////////////////
## Callback internals
"""
# If we're already waiting for a button release, then ignore
# this new button press.
- if (
- self.__canvas.bind("<ButtonRelease-1>")
- or self.__canvas.bind("<ButtonRelease-2>")
- or self.__canvas.bind("<ButtonRelease-3>")
- ):
+ if (self.__canvas.bind('<ButtonRelease-1>') or
+ self.__canvas.bind('<ButtonRelease-2>') or
+ self.__canvas.bind('<ButtonRelease-3>')):
return
# Unbind motion (just in case; this shouldn't be necessary)
- self.__canvas.unbind("<Motion>")
+ self.__canvas.unbind('<Motion>')
# Record the button press event.
self.__press = event
if event.num == 1:
widget = self
while widget is not None:
- if widget["draggable"]:
+ if widget['draggable']:
widget.__start_drag(event)
break
widget = widget.parent()
# Set up the button release callback.
- self.__canvas.bind("<ButtonRelease-%d>" % event.num, self.__release_cb)
+ self.__canvas.bind('<ButtonRelease-%d>' % event.num,
+ self.__release_cb)
def __start_drag(self, event):
"""
- register a motion callback
- record the drag coordinates
"""
- self.__canvas.bind("<Motion>", self.__motion_cb)
+ self.__canvas.bind('<Motion>', self.__motion_cb)
self.__drag_x = event.x
self.__drag_y = event.y
- move this object to the new location
- record the new drag coordinates
"""
- self.move(event.x - self.__drag_x, event.y - self.__drag_y)
+ self.move(event.x-self.__drag_x, event.y-self.__drag_y)
self.__drag_x = event.x
self.__drag_y = event.y
- call the appropriate handler.
"""
# Unbind the button release & motion callbacks.
- self.__canvas.unbind("<ButtonRelease-%d>" % event.num)
- self.__canvas.unbind("<Motion>")
+ self.__canvas.unbind('<ButtonRelease-%d>' % event.num)
+ self.__canvas.unbind('<Motion>')
# Is it a click or a drag?
- if (
- event.time - self.__press.time < 100
- and abs(event.x - self.__press.x) + abs(event.y - self.__press.y) < 5
- ):
+ if (event.time - self.__press.time < 100 and
+ abs(event.x-self.__press.x) + abs(event.y-self.__press.y) < 5):
# Move it back, if we were dragging.
if self.__draggable and event.num == 1:
- self.move(
- self.__press.x - self.__drag_x, self.__press.y - self.__drag_y
- )
+ self.move(self.__press.x - self.__drag_x,
+ self.__press.y - self.__drag_y)
self.__click(event.num)
elif event.num == 1:
self.__drag()
call it. If no ancestors have a drag callback, do nothing.
"""
if self.__draggable:
- if "drag" in self.__callbacks:
- cb = self.__callbacks["drag"]
+ if 'drag' in self.__callbacks:
+ cb = self.__callbacks['drag']
try:
cb(self)
except:
- print("Error in drag callback for %r" % self)
+ print('Error in drag callback for %r' % self)
elif self.__parent is not None:
self.__parent.__drag()
"""
if button in self.__callbacks:
cb = self.__callbacks[button]
- # try:
+ #try:
cb(self)
- # except:
- # print('Error in click callback for %r' % self)
+ #except:
+ # print 'Error in click callback for %r' % self
# raise
elif self.__parent is not None:
self.__parent.__click(button)
have a parent.
:type child: CanvasWidget
"""
- if not hasattr(self, "_CanvasWidget__children"):
- self.__children = []
+ if not hasattr(self, '_CanvasWidget__children'): self.__children = []
if child.__parent is not None:
- raise ValueError("{} already has a parent".format(child))
+ raise ValueError('%s already has a parent' % child)
child.__parent = self
self.__children.append(child)
:rtype: None
"""
-
##//////////////////////////////////////////////////////
## Basic widgets.
##//////////////////////////////////////////////////////
-
class TextWidget(CanvasWidget):
"""
A canvas widget that displays a single string of text.
this width, it will be line-wrapped at whitespace.
- ``draggable``: whether the text can be dragged by the user.
"""
-
def __init__(self, canvas, text, **attribs):
"""
Create a new text widget.
CanvasWidget.__init__(self, canvas, **attribs)
def __setitem__(self, attr, value):
- if attr in ("color", "font", "justify", "width"):
- if attr == "color":
- attr = "fill"
- self.canvas().itemconfig(self._tag, {attr: value})
+ if attr in ('color', 'font', 'justify', 'width'):
+ if attr == 'color': attr = 'fill'
+ self.canvas().itemconfig(self._tag, {attr:value})
else:
CanvasWidget.__setitem__(self, attr, value)
def __getitem__(self, attr):
- if attr == "width":
+ if attr == 'width':
return int(self.canvas().itemcget(self._tag, attr))
- elif attr in ("color", "font", "justify"):
- if attr == "color":
- attr = "fill"
+ elif attr in ('color', 'font', 'justify'):
+ if attr == 'color': attr = 'fill'
return self.canvas().itemcget(self._tag, attr)
else:
return CanvasWidget.__getitem__(self, attr)
- def _tags(self):
- return [self._tag]
+ def _tags(self): return [self._tag]
def text(self):
"""
:return: The text displayed by this text widget.
:rtype: str
"""
- return self.canvas().itemcget(self._tag, "TEXT")
+ return self.canvas().itemcget(self._tag, 'TEXT')
def set_text(self, text):
"""
self.parent().update(self)
def __repr__(self):
- return "[Text: %r]" % self._text
-
+ return '[Text: %r]' % self._text
class SymbolWidget(TextWidget):
"""
:cvar SYMBOLS: A dictionary mapping from symbols to the character
in the ``symbol`` font used to render them.
"""
-
- SYMBOLS = {
- "neg": "\330",
- "disj": "\332",
- "conj": "\331",
- "lambda": "\154",
- "merge": "\304",
- "forall": "\042",
- "exists": "\044",
- "subseteq": "\315",
- "subset": "\314",
- "notsubset": "\313",
- "emptyset": "\306",
- "imp": "\336",
- "rightarrow": chr(222), #'\256',
- "equal": "\75",
- "notequal": "\271",
- "intersection": "\307",
- "union": "\310",
- "epsilon": "e",
- }
+ SYMBOLS = {'neg':'\330', 'disj':'\332', 'conj': '\331',
+ 'lambda': '\154', 'merge': '\304',
+ 'forall': '\042', 'exists': '\044',
+ 'subseteq': '\315', 'subset': '\314',
+ 'notsubset': '\313', 'emptyset': '\306',
+ 'imp': '\336', 'rightarrow': chr(222), #'\256',
+ 'equal': '\75', 'notequal': '\271',
+ 'intersection': '\307', 'union': '\310',
+ 'epsilon': 'e',
+ }
def __init__(self, canvas, symbol, **attribs):
"""
:param symbol: The name of the symbol to display.
:param attribs: The new canvas widget's attributes.
"""
- attribs["font"] = "symbol"
- TextWidget.__init__(self, canvas, "", **attribs)
+ attribs['font'] = 'symbol'
+ TextWidget.__init__(self, canvas, '', **attribs)
self.set_symbol(symbol)
def symbol(self):
:param symbol: The name of the symbol to display.
"""
if symbol not in SymbolWidget.SYMBOLS:
- raise ValueError("Unknown symbol: %s" % symbol)
+ raise ValueError('Unknown symbol: %s' % symbol)
self._symbol = symbol
self.set_text(SymbolWidget.SYMBOLS[symbol])
def __repr__(self):
- return "[Symbol: %r]" % self._symbol
+ return '[Symbol: %r]' % self._symbol
@staticmethod
def symbolsheet(size=20):
``SymbolWidget.SYMBOLS`` dictionary.
"""
top = Tk()
-
- def destroy(e, top=top):
- top.destroy()
-
- top.bind("q", destroy)
- Button(top, text="Quit", command=top.destroy).pack(side="bottom")
- text = Text(top, font=("helvetica", -size), width=20, height=30)
- text.pack(side="left")
- sb = Scrollbar(top, command=text.yview)
- text["yscrollcommand"] = sb.set
- sb.pack(side="right", fill="y")
- text.tag_config("symbol", font=("symbol", -size))
+ def destroy(e, top=top): top.destroy()
+ top.bind('q', destroy)
+ Button(top, text='Quit', command=top.destroy).pack(side='bottom')
+ text = Text(top, font=('helvetica', -size), width=20, height=30)
+ text.pack(side='left')
+ sb=Scrollbar(top, command=text.yview)
+ text['yscrollcommand']=sb.set
+ sb.pack(side='right', fill='y')
+ text.tag_config('symbol', font=('symbol', -size))
for i in range(256):
- if i in (0, 10):
- continue # null and newline
- for k, v in list(SymbolWidget.SYMBOLS.items()):
+ if i in (0,10): continue # null and newline
+ for k,v in list(SymbolWidget.SYMBOLS.items()):
if v == chr(i):
- text.insert("end", "%-10s\t" % k)
+ text.insert('end', '%-10s\t' % k)
break
else:
- text.insert("end", "%-10d \t" % i)
- text.insert("end", "[%s]\n" % chr(i), "symbol")
+ text.insert('end', '%-10d \t' % i)
+ text.insert('end', '[%s]\n' % chr(i), 'symbol')
top.mainloop()
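+ # Quick way to see every available symbol name (opens its own Tk
+ # window, so a display is assumed):
+ #
+ #   SymbolWidget.symbolsheet(size=24)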
and any subclasses that define attributes should define
``__setitem__`` and ``__getitem__``.
"""
-
def __init__(self, canvas, child, **attribs):
"""
Create a new container widget. This constructor should only
def __repr__(self):
name = self.__class__.__name__
- if name[-6:] == "Widget":
- name = name[:-6]
- return "[%s: %r]" % (name, self._child)
-
+ if name[-6:] == 'Widget': name = name[:-6]
+ return '[%s: %r]' % (name, self._child)
class BoxWidget(AbstractContainerWidget):
"""
and the box.
- ``draggable``: whether the text can be dragged by the user.
"""
-
def __init__(self, canvas, child, **attribs):
"""
Create a new box widget.
"""
self._child = child
self._margin = 1
- self._box = canvas.create_rectangle(1, 1, 1, 1)
+ self._box = canvas.create_rectangle(1,1,1,1)
canvas.tag_lower(self._box)
AbstractContainerWidget.__init__(self, canvas, child, **attribs)
def __setitem__(self, attr, value):
- if attr == "margin":
- self._margin = value
- elif attr in ("outline", "fill", "width"):
- self.canvas().itemconfig(self._box, {attr: value})
+ if attr == 'margin': self._margin = value
+ elif attr in ('outline', 'fill', 'width'):
+ self.canvas().itemconfig(self._box, {attr:value})
else:
CanvasWidget.__setitem__(self, attr, value)
def __getitem__(self, attr):
- if attr == "margin":
- return self._margin
- elif attr == "width":
+ if attr == 'margin': return self._margin
+ elif attr == 'width':
return float(self.canvas().itemcget(self._box, attr))
- elif attr in ("outline", "fill", "width"):
+ elif attr in ('outline', 'fill', 'width'):
return self.canvas().itemcget(self._box, attr)
else:
return CanvasWidget.__getitem__(self, attr)
def _update(self, child):
(x1, y1, x2, y2) = child.bbox()
- margin = self._margin + self["width"] / 2
- self.canvas().coords(
- self._box, x1 - margin, y1 - margin, x2 + margin, y2 + margin
- )
-
- def _tags(self):
- return [self._box]
+ margin = self._margin + self['width']/2
+ self.canvas().coords(self._box, x1-margin, y1-margin,
+ x2+margin, y2+margin)
+ def _tags(self): return [self._box]
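+ # The attribute protocol just defined, in use (sketch; 'c' and 'child'
+ # are assumed to be an existing Canvas and CanvasWidget):
+ #
+ #   box = BoxWidget(c, child, outline='red', margin=10)
+ #   box['margin'] = 4       # routed through __setitem__ above
+ #   box['margin']           # => 4, via __getitem__ above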
class OvalWidget(AbstractContainerWidget):
"""
- ``draggable``: whether the text can be dragged by the user.
- ``double``: If true, then a double-oval is drawn.
"""
-
def __init__(self, canvas, child, **attribs):
"""
Create a new oval widget.
"""
self._child = child
self._margin = 1
- self._oval = canvas.create_oval(1, 1, 1, 1)
- self._circle = attribs.pop("circle", False)
- self._double = attribs.pop("double", False)
+ self._oval = canvas.create_oval(1,1,1,1)
+ self._circle = attribs.pop('circle', False)
+ self._double = attribs.pop('double', False)
if self._double:
- self._oval2 = canvas.create_oval(1, 1, 1, 1)
+ self._oval2 = canvas.create_oval(1,1,1,1)
else:
self._oval2 = None
canvas.tag_lower(self._oval)
def __setitem__(self, attr, value):
c = self.canvas()
- if attr == "margin":
- self._margin = value
- elif attr == "double":
- if value == True and self._oval2 is None:
+ if attr == 'margin': self._margin = value
+ elif attr == 'double':
+ if value==True and self._oval2 is None:
# Copy attributes & position from self._oval.
x1, y1, x2, y2 = c.bbox(self._oval)
- w = self["width"] * 2
- self._oval2 = c.create_oval(
- x1 - w,
- y1 - w,
- x2 + w,
- y2 + w,
- outline=c.itemcget(self._oval, "outline"),
- width=c.itemcget(self._oval, "width"),
- )
+ w = self['width']*2
+ self._oval2 = c.create_oval(x1-w, y1-w, x2+w, y2+w,
+ outline=c.itemcget(self._oval, 'outline'),
+ width=c.itemcget(self._oval, 'width'))
c.tag_lower(self._oval2)
- if value == False and self._oval2 is not None:
+ if value==False and self._oval2 is not None:
c.delete(self._oval2)
self._oval2 = None
- elif attr in ("outline", "fill", "width"):
- c.itemconfig(self._oval, {attr: value})
- if self._oval2 is not None and attr != "fill":
- c.itemconfig(self._oval2, {attr: value})
- if self._oval2 is not None and attr != "fill":
- self.canvas().itemconfig(self._oval2, {attr: value})
+ elif attr in ('outline', 'fill', 'width'):
+ c.itemconfig(self._oval, {attr:value})
+ if self._oval2 is not None and attr!='fill':
+ c.itemconfig(self._oval2, {attr:value})
else:
CanvasWidget.__setitem__(self, attr, value)
def __getitem__(self, attr):
- if attr == "margin":
- return self._margin
- elif attr == "double":
- return self._double is not None
- elif attr == "width":
+ if attr == 'margin': return self._margin
+ elif attr == 'double': return self._oval2 is not None
+ elif attr == 'width':
return float(self.canvas().itemcget(self._oval, attr))
- elif attr in ("outline", "fill", "width"):
+ elif attr in ('outline', 'fill', 'width'):
return self.canvas().itemcget(self._oval, attr)
else:
return CanvasWidget.__getitem__(self, attr)
# If we're a circle, pretend our contents are square.
if self._circle:
- dx, dy = abs(x1 - x2), abs(y1 - y2)
+ dx, dy = abs(x1-x2), abs(y1-y2)
if dx > dy:
- y = (y1 + y2) / 2
- y1, y2 = y - dx / 2, y + dx / 2
+ y = (y1+y2)/2
+ y1, y2 = y-dx/2, y+dx/2
elif dy > dx:
- x = (x1 + x2) / 2
- x1, x2 = x - dy / 2, x + dy / 2
+ x = (x1+x2)/2
+ x1, x2 = x-dy/2, x+dy/2
# Find the four corners.
- left = int((x1 * (1 + R) + x2 * (1 - R)) / 2)
- right = left + int((x2 - x1) * R)
- top = int((y1 * (1 + R) + y2 * (1 - R)) / 2)
- bot = top + int((y2 - y1) * R)
- self.canvas().coords(
- self._oval, left - margin, top - margin, right + margin, bot + margin
- )
+ left = int(( x1*(1+R) + x2*(1-R) ) / 2)
+ right = left + int((x2-x1)*R)
+ top = int(( y1*(1+R) + y2*(1-R) ) / 2)
+ bot = top + int((y2-y1)*R)
+ self.canvas().coords(self._oval, left-margin, top-margin,
+ right+margin, bot+margin)
if self._oval2 is not None:
- self.canvas().coords(
- self._oval2,
- left - margin + 2,
- top - margin + 2,
- right + margin - 2,
- bot + margin - 2,
- )
+ self.canvas().coords(self._oval2, left-margin+2, top-margin+2,
+ right+margin-2, bot+margin-2)
def _tags(self):
if self._oval2 is None:
else:
return [self._oval, self._oval2]
-
class ParenWidget(AbstractContainerWidget):
"""
A canvas widget that places a pair of parentheses around a child
- ``width``: The width of the parentheses.
- ``draggable``: whether the text can be dragged by the user.
"""
-
def __init__(self, canvas, child, **attribs):
"""
Create a new parenthesis widget.
:param attribs: The new canvas widget's attributes.
"""
self._child = child
- self._oparen = canvas.create_arc(1, 1, 1, 1, style="arc", start=90, extent=180)
- self._cparen = canvas.create_arc(1, 1, 1, 1, style="arc", start=-90, extent=180)
+ self._oparen = canvas.create_arc(1,1,1,1, style='arc',
+ start=90, extent=180)
+ self._cparen = canvas.create_arc(1,1,1,1, style='arc',
+ start=-90, extent=180)
AbstractContainerWidget.__init__(self, canvas, child, **attribs)
def __setitem__(self, attr, value):
- if attr == "color":
+ if attr == 'color':
self.canvas().itemconfig(self._oparen, outline=value)
self.canvas().itemconfig(self._cparen, outline=value)
- elif attr == "width":
+ elif attr == 'width':
self.canvas().itemconfig(self._oparen, width=value)
self.canvas().itemconfig(self._cparen, width=value)
else:
CanvasWidget.__setitem__(self, attr, value)
def __getitem__(self, attr):
- if attr == "color":
- return self.canvas().itemcget(self._oparen, "outline")
- elif attr == "width":
- return self.canvas().itemcget(self._oparen, "width")
+ if attr == 'color':
+ return self.canvas().itemcget(self._oparen, 'outline')
+ elif attr == 'width':
+ return self.canvas().itemcget(self._oparen, 'width')
else:
return CanvasWidget.__getitem__(self, attr)
def _update(self, child):
(x1, y1, x2, y2) = child.bbox()
- width = max((y2 - y1) / 6, 4)
- self.canvas().coords(self._oparen, x1 - width, y1, x1 + width, y2)
- self.canvas().coords(self._cparen, x2 - width, y1, x2 + width, y2)
-
- def _tags(self):
- return [self._oparen, self._cparen]
+ width = max((y2-y1)/6, 4)
+ self.canvas().coords(self._oparen, x1-width, y1, x1+width, y2)
+ self.canvas().coords(self._cparen, x2-width, y1, x2+width, y2)
+ def _tags(self): return [self._oparen, self._cparen]
class BracketWidget(AbstractContainerWidget):
"""
- ``width``: The width of the brackets.
- ``draggable``: whether the text can be dragged by the user.
"""
-
def __init__(self, canvas, child, **attribs):
"""
Create a new bracket widget.
:param attribs: The new canvas widget's attributes.
"""
self._child = child
- self._obrack = canvas.create_line(1, 1, 1, 1, 1, 1, 1, 1)
- self._cbrack = canvas.create_line(1, 1, 1, 1, 1, 1, 1, 1)
+ self._obrack = canvas.create_line(1,1,1,1,1,1,1,1)
+ self._cbrack = canvas.create_line(1,1,1,1,1,1,1,1)
AbstractContainerWidget.__init__(self, canvas, child, **attribs)
def __setitem__(self, attr, value):
- if attr == "color":
+ if attr == 'color':
self.canvas().itemconfig(self._obrack, fill=value)
self.canvas().itemconfig(self._cbrack, fill=value)
- elif attr == "width":
+ elif attr == 'width':
self.canvas().itemconfig(self._obrack, width=value)
self.canvas().itemconfig(self._cbrack, width=value)
else:
CanvasWidget.__setitem__(self, attr, value)
def __getitem__(self, attr):
- if attr == "color":
- return self.canvas().itemcget(self._obrack, "outline")
- elif attr == "width":
- return self.canvas().itemcget(self._obrack, "width")
+ if attr == 'color':
+ return self.canvas().itemcget(self._obrack, 'outline')
+ elif attr == 'width':
+ return self.canvas().itemcget(self._obrack, 'width')
else:
return CanvasWidget.__getitem__(self, attr)
def _update(self, child):
(x1, y1, x2, y2) = child.bbox()
- width = max((y2 - y1) / 8, 2)
- self.canvas().coords(
- self._obrack, x1, y1, x1 - width, y1, x1 - width, y2, x1, y2
- )
- self.canvas().coords(
- self._cbrack, x2, y1, x2 + width, y1, x2 + width, y2, x2, y2
- )
-
- def _tags(self):
- return [self._obrack, self._cbrack]
+ width = max((y2-y1)/8, 2)
+ self.canvas().coords(self._obrack, x1, y1, x1-width, y1,
+ x1-width, y2, x1, y2)
+ self.canvas().coords(self._cbrack, x2, y1, x2+width, y1,
+ x2+width, y2, x2, y2)
+ def _tags(self): return [self._obrack, self._cbrack]
class SequenceWidget(CanvasWidget):
"""
- ``ordered``: If true, then keep the children in their
original order.
"""
-
def __init__(self, canvas, *children, **attribs):
"""
Create a new sequence widget.
:type children: list(CanvasWidget)
:param attribs: The new canvas widget's attributes.
"""
- self._align = "center"
+ self._align = 'center'
self._space = 1
self._ordered = False
self._children = list(children)
- for child in children:
- self._add_child_widget(child)
+ for child in children: self._add_child_widget(child)
CanvasWidget.__init__(self, canvas, **attribs)
def __setitem__(self, attr, value):
- if attr == "align":
- if value not in ("top", "bottom", "center"):
- raise ValueError("Bad alignment: %r" % value)
+ if attr == 'align':
+ if value not in ('top', 'bottom', 'center'):
+ raise ValueError('Bad alignment: %r' % value)
self._align = value
- elif attr == "space":
- self._space = value
- elif attr == "ordered":
- self._ordered = value
- else:
- CanvasWidget.__setitem__(self, attr, value)
+ elif attr == 'space': self._space = value
+ elif attr == 'ordered': self._ordered = value
+ else: CanvasWidget.__setitem__(self, attr, value)
def __getitem__(self, attr):
- if attr == "align":
- return self._align
- elif attr == "space":
- return self._space
- elif attr == "ordered":
- return self._ordered
- else:
- return CanvasWidget.__getitem__(self, attr)
+ if attr == 'align': return self._align
+ elif attr == 'space': return self._space
+ elif attr == 'ordered': return self._ordered
+ else: return CanvasWidget.__getitem__(self, attr)
- def _tags(self):
- return []
+ def _tags(self): return []
def _yalign(self, top, bot):
- if self._align == "top":
- return top
- if self._align == "bottom":
- return bot
- if self._align == "center":
- return (top + bot) / 2
+ if self._align == 'top': return top
+ if self._align == 'bottom': return bot
+ if self._align == 'center': return (top+bot)/2
def _update(self, child):
# Align all children with child.
y = self._yalign(top, bot)
for c in self._children:
(x1, y1, x2, y2) = c.bbox()
- c.move(0, y - self._yalign(y1, y2))
+ c.move(0, y-self._yalign(y1,y2))
if self._ordered and len(self._children) > 1:
index = self._children.index(child)
x = right + self._space
- for i in range(index + 1, len(self._children)):
+ for i in range(index+1, len(self._children)):
(x1, y1, x2, y2) = self._children[i].bbox()
if x > x1:
- self._children[i].move(x - x1, 0)
- x += x2 - x1 + self._space
+ self._children[i].move(x-x1, 0)
+ x += x2-x1 + self._space
x = left - self._space
- for i in range(index - 1, -1, -1):
+ for i in range(index-1, -1, -1):
(x1, y1, x2, y2) = self._children[i].bbox()
if x < x2:
- self._children[i].move(x - x2, 0)
- x -= x2 - x1 + self._space
+ self._children[i].move(x-x2, 0)
+ x -= x2-x1 + self._space
def _manage(self):
- if len(self._children) == 0:
- return
+ if len(self._children) == 0: return
child = self._children[0]
# Align all children with child.
# Line up children to the right of child.
x = right + self._space
- for i in range(index + 1, len(self._children)):
+ for i in range(index+1, len(self._children)):
(x1, y1, x2, y2) = self._children[i].bbox()
- self._children[i].move(x - x1, y - self._yalign(y1, y2))
- x += x2 - x1 + self._space
+ self._children[i].move(x-x1, y-self._yalign(y1,y2))
+ x += x2-x1 + self._space
# Line up children to the left of child.
x = left - self._space
- for i in range(index - 1, -1, -1):
+ for i in range(index-1, -1, -1):
(x1, y1, x2, y2) = self._children[i].bbox()
- self._children[i].move(x - x2, y - self._yalign(y1, y2))
- x -= x2 - x1 + self._space
+ self._children[i].move(x-x2, y-self._yalign(y1,y2))
+ x -= x2-x1 + self._space
def __repr__(self):
- return "[Sequence: " + repr(self._children)[1:-1] + "]"
+ return '[Sequence: ' + repr(self._children)[1:-1]+']'
# Provide an alias for the child_widgets() member.
children = CanvasWidget.child_widgets
self._children.insert(index, child)
self._add_child_widget(child)
-
class StackWidget(CanvasWidget):
"""
A canvas widget that keeps a list of canvas widgets in a vertical
- ``ordered``: If true, then keep the children in their
original order.
"""
-
def __init__(self, canvas, *children, **attribs):
"""
Create a new stack widget.
:type children: list(CanvasWidget)
:param attribs: The new canvas widget's attributes.
"""
- self._align = "center"
+ self._align = 'center'
self._space = 1
self._ordered = False
self._children = list(children)
- for child in children:
- self._add_child_widget(child)
+ for child in children: self._add_child_widget(child)
CanvasWidget.__init__(self, canvas, **attribs)
def __setitem__(self, attr, value):
- if attr == "align":
- if value not in ("left", "right", "center"):
- raise ValueError("Bad alignment: %r" % value)
+ if attr == 'align':
+ if value not in ('left', 'right', 'center'):
+ raise ValueError('Bad alignment: %r' % value)
self._align = value
- elif attr == "space":
- self._space = value
- elif attr == "ordered":
- self._ordered = value
- else:
- CanvasWidget.__setitem__(self, attr, value)
+ elif attr == 'space': self._space = value
+ elif attr == 'ordered': self._ordered = value
+ else: CanvasWidget.__setitem__(self, attr, value)
def __getitem__(self, attr):
- if attr == "align":
- return self._align
- elif attr == "space":
- return self._space
- elif attr == "ordered":
- return self._ordered
- else:
- return CanvasWidget.__getitem__(self, attr)
+ if attr == 'align': return self._align
+ elif attr == 'space': return self._space
+ elif attr == 'ordered': return self._ordered
+ else: return CanvasWidget.__getitem__(self, attr)
- def _tags(self):
- return []
+ def _tags(self): return []
def _xalign(self, left, right):
- if self._align == "left":
- return left
- if self._align == "right":
- return right
- if self._align == "center":
- return (left + right) / 2
+ if self._align == 'left': return left
+ if self._align == 'right': return right
+ if self._align == 'center': return (left+right)/2
def _update(self, child):
# Align all children with child.
x = self._xalign(left, right)
for c in self._children:
(x1, y1, x2, y2) = c.bbox()
- c.move(x - self._xalign(x1, x2), 0)
+ c.move(x-self._xalign(x1,x2), 0)
if self._ordered and len(self._children) > 1:
index = self._children.index(child)
y = bot + self._space
- for i in range(index + 1, len(self._children)):
+ for i in range(index+1, len(self._children)):
(x1, y1, x2, y2) = self._children[i].bbox()
if y > y1:
- self._children[i].move(0, y - y1)
- y += y2 - y1 + self._space
+ self._children[i].move(0, y-y1)
+ y += y2-y1 + self._space
y = top - self._space
- for i in range(index - 1, -1, -1):
+ for i in range(index-1, -1, -1):
(x1, y1, x2, y2) = self._children[i].bbox()
if y < y2:
- self._children[i].move(0, y - y2)
- y -= y2 - y1 + self._space
+ self._children[i].move(0, y-y2)
+ y -= y2-y1 + self._space
def _manage(self):
- if len(self._children) == 0:
- return
+ if len(self._children) == 0: return
child = self._children[0]
# Align all children with child.
# Line up children below the child.
y = bot + self._space
- for i in range(index + 1, len(self._children)):
+ for i in range(index+1, len(self._children)):
(x1, y1, x2, y2) = self._children[i].bbox()
- self._children[i].move(x - self._xalign(x1, x2), y - y1)
- y += y2 - y1 + self._space
+ self._children[i].move(x-self._xalign(x1,x2), y-y1)
+ y += y2-y1 + self._space
# Line up children above the child.
y = top - self._space
- for i in range(index - 1, -1, -1):
+ for i in range(index-1, -1, -1):
(x1, y1, x2, y2) = self._children[i].bbox()
- self._children[i].move(x - self._xalign(x1, x2), y - y2)
- y -= y2 - y1 + self._space
+ self._children[i].move(x-self._xalign(x1,x2), y-y2)
+ y -= y2-y1 + self._space
def __repr__(self):
- return "[Stack: " + repr(self._children)[1:-1] + "]"
+ return '[Stack: ' + repr(self._children)[1:-1]+']'
# Provide an alias for the child_widgets() member.
children = CanvasWidget.child_widgets
self._children.insert(index, child)
self._add_child_widget(child)
-
class SpaceWidget(CanvasWidget):
"""
A canvas widget that takes up space but does not display
height of zero; and if you wish to only create vertical space, use
a width of zero.
"""
-
def __init__(self, canvas, width, height, **attribs):
"""
Create a new space widget.
:param attribs: The new canvas widget's attributes.
"""
# For some reason, the line takes up 4 extra pixels; shrink the requested size to compensate.
- if width > 4:
- width -= 4
- if height > 4:
- height -= 4
- self._tag = canvas.create_line(1, 1, width, height, fill="")
+ if width > 4: width -= 4
+ if height > 4: height -= 4
+ self._tag = canvas.create_line(1, 1, width, height, fill='')
CanvasWidget.__init__(self, canvas, **attribs)
# note: width() and height() are already defined by CanvasWidget.
:rtype: None
"""
[x1, y1, x2, y2] = self.bbox()
- self.canvas().coords(self._tag, x1, y1, x1 + width, y2)
+ self.canvas().coords(self._tag, x1, y1, x1+width, y2)
def set_height(self, height):
"""
:rtype: None
"""
[x1, y1, x2, y2] = self.bbox()
- self.canvas().coords(self._tag, x1, y1, x2, y1 + height)
-
- def _tags(self):
- return [self._tag]
+ self.canvas().coords(self._tag, x1, y1, x2, y1+height)
- def __repr__(self):
- return "[Space]"
+ def _tags(self): return [self._tag]
+ def __repr__(self): return '[Space]'
class ScrollWatcherWidget(CanvasWidget):
"""
scroll-watcher widget will only increase the size of the
``Canvas``'s scrollregion; it will never decrease it.
"""
-
def __init__(self, canvas, *children, **attribs):
"""
Create a new scroll-watcher widget.
scrollregion.
:param attribs: The new canvas widget's attributes.
"""
- for child in children:
- self._add_child_widget(child)
+ for child in children: self._add_child_widget(child)
CanvasWidget.__init__(self, canvas, **attribs)
def add_child(self, canvaswidget):
"""
self._remove_child_widget(canvaswidget)
- def _tags(self):
- return []
+ def _tags(self): return []
def _update(self, child):
self._adjust_scrollregion()
"""
bbox = self.bbox()
canvas = self.canvas()
- scrollregion = [int(n) for n in canvas["scrollregion"].split()]
- if len(scrollregion) != 4:
- return
- if (
- bbox[0] < scrollregion[0]
- or bbox[1] < scrollregion[1]
- or bbox[2] > scrollregion[2]
- or bbox[3] > scrollregion[3]
- ):
- scrollregion = "%d %d %d %d" % (
- min(bbox[0], scrollregion[0]),
- min(bbox[1], scrollregion[1]),
- max(bbox[2], scrollregion[2]),
- max(bbox[3], scrollregion[3]),
- )
- canvas["scrollregion"] = scrollregion
-
+ scrollregion = [int(n) for n in canvas['scrollregion'].split()]
+ if len(scrollregion) != 4: return
+ if (bbox[0] < scrollregion[0] or bbox[1] < scrollregion[1] or
+ bbox[2] > scrollregion[2] or bbox[3] > scrollregion[3]):
+ scrollregion = ('%d %d %d %d' %
+ (min(bbox[0], scrollregion[0]),
+ min(bbox[1], scrollregion[1]),
+ max(bbox[2], scrollregion[2]),
+ max(bbox[3], scrollregion[3])))
+ canvas['scrollregion'] = scrollregion
##//////////////////////////////////////////////////////
## Canvas Frame
##//////////////////////////////////////////////////////
-
class CanvasFrame(object):
"""
A ``Tkinter`` frame containing a canvas and scrollbars.
its own main window, including a "Done" button and a "Print"
button.
"""
-
def __init__(self, parent=None, **kw):
"""
Create a new ``CanvasFrame``.
# If no parent was given, set up a top-level window.
if parent is None:
self._parent = Tk()
- self._parent.title("NLTK")
- self._parent.bind("<Control-p>", lambda e: self.print_to_file())
- self._parent.bind("<Control-x>", self.destroy)
- self._parent.bind("<Control-q>", self.destroy)
+ self._parent.title('NLTK')
+ self._parent.bind('<Control-p>', lambda e: self.print_to_file())
+ self._parent.bind('<Control-x>', self.destroy)
+ self._parent.bind('<Control-q>', self.destroy)
else:
self._parent = parent
# Create a frame for the canvas & scrollbars
self._frame = frame = Frame(self._parent)
self._canvas = canvas = Canvas(frame, **kw)
- xscrollbar = Scrollbar(self._frame, orient="horizontal")
- yscrollbar = Scrollbar(self._frame, orient="vertical")
- xscrollbar["command"] = canvas.xview
- yscrollbar["command"] = canvas.yview
- canvas["xscrollcommand"] = xscrollbar.set
- canvas["yscrollcommand"] = yscrollbar.set
- yscrollbar.pack(fill="y", side="right")
- xscrollbar.pack(fill="x", side="bottom")
- canvas.pack(expand=1, fill="both", side="left")
+ xscrollbar = Scrollbar(self._frame, orient='horizontal')
+ yscrollbar = Scrollbar(self._frame, orient='vertical')
+ xscrollbar['command'] = canvas.xview
+ yscrollbar['command'] = canvas.yview
+ canvas['xscrollcommand'] = xscrollbar.set
+ canvas['yscrollcommand'] = yscrollbar.set
+ yscrollbar.pack(fill='y', side='right')
+ xscrollbar.pack(fill='x', side='bottom')
+ canvas.pack(expand=1, fill='both', side='left')
# Set initial scroll region.
- scrollregion = "0 0 %s %s" % (canvas["width"], canvas["height"])
- canvas["scrollregion"] = scrollregion
+ scrollregion = '0 0 %s %s' % (canvas['width'], canvas['height'])
+ canvas['scrollregion'] = scrollregion
self._scrollwatcher = ScrollWatcherWidget(canvas)
# If no parent was given, pack the frame, and add a menu.
if parent is None:
- self.pack(expand=1, fill="both")
+ self.pack(expand=1, fill='both')
self._init_menubar()
def _init_menubar(self):
menubar = Menu(self._parent)
filemenu = Menu(menubar, tearoff=0)
- filemenu.add_command(
- label="Print to Postscript",
- underline=0,
- command=self.print_to_file,
- accelerator="Ctrl-p",
- )
- filemenu.add_command(
- label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x"
- )
- menubar.add_cascade(label="File", underline=0, menu=filemenu)
+ filemenu.add_command(label='Print to Postscript', underline=0,
+ command=self.print_to_file, accelerator='Ctrl-p')
+ filemenu.add_command(label='Exit', underline=1,
+ command=self.destroy, accelerator='Ctrl-x')
+ menubar.add_cascade(label='File', underline=0, menu=filemenu)
self._parent.config(menu=menubar)
:rtype: None
"""
if filename is None:
- ftypes = [("Postscript files", ".ps"), ("All files", "*")]
- filename = asksaveasfilename(filetypes=ftypes, defaultextension=".ps")
- if not filename:
- return
+ ftypes = [('Postscript files', '.ps'),
+ ('All files', '*')]
+ filename = asksaveasfilename(filetypes=ftypes,
+ defaultextension='.ps')
+ if not filename: return
(x0, y0, w, h) = self.scrollregion()
- postscript = self._canvas.postscript(
- x=x0,
- y=y0,
- width=w + 2,
- height=h + 2,
- pagewidth=w + 2, # points = 1/72 inch
- pageheight=h + 2, # points = 1/72 inch
- pagex=0,
- pagey=0,
- )
+ postscript = self._canvas.postscript(x=x0, y=y0,
+ width=w+2, height=h+2,
+ pagewidth=w+2, # points = 1/72 inch
+ pageheight=h+2, # points = 1/72 inch
+ pagex=0, pagey=0)
# workaround for bug in Tk font handling
- postscript = postscript.replace(" 0 scalefont ", " 9 scalefont ")
- with open(filename, "wb") as f:
- f.write(postscript.encode("utf8"))
+ postscript = postscript.replace(' 0 scalefont ', ' 9 scalefont ')
+ with open(filename, 'wb') as f:
+ f.write(postscript.encode('utf8'))
def scrollregion(self):
"""
this ``CanvasFrame``.
:rtype: 4-tuple of int
"""
- (x1, y1, x2, y2) = self._canvas["scrollregion"].split()
+ (x1, y1, x2, y2) = self._canvas['scrollregion'].split()
return (int(x1), int(y1), int(x2), int(y2))
def canvas(self):
(x, y) = self._find_room(canvaswidget, x, y)
# Move to (x,y)
- (x1, y1, x2, y2) = canvaswidget.bbox()
- canvaswidget.move(x - x1, y - y1)
+ (x1,y1,x2,y2) = canvaswidget.bbox()
+ canvaswidget.move(x-x1,y-y1)
# Register with scrollwatcher.
self._scrollwatcher.add_child(canvaswidget)
w = widget.width()
h = widget.height()
- if w >= (right - left):
- return (0, 0)
- if h >= (bot - top):
- return (0, 0)
+ if w >= (right-left): return (0,0)
+ if h >= (bot-top): return (0,0)
# Move the widget out of the way, for now.
- (x1, y1, x2, y2) = widget.bbox()
- widget.move(left - x2 - 50, top - y2 - 50)
+ (x1,y1,x2,y2) = widget.bbox()
+ widget.move(left-x2-50, top-y2-50)
if desired_x is not None:
x = desired_x
- for y in range(top, bot - h, int((bot - top - h) / 10)):
- if not self._canvas.find_overlapping(
- x - 5, y - 5, x + w + 5, y + h + 5
- ):
- return (x, y)
+ for y in range(top, bot-h, int((bot-top-h)/10)):
+ if not self._canvas.find_overlapping(x-5, y-5, x+w+5, y+h+5):
+ return (x,y)
if desired_y is not None:
y = desired_y
- for x in range(left, right - w, int((right - left - w) / 10)):
- if not self._canvas.find_overlapping(
- x - 5, y - 5, x + w + 5, y + h + 5
- ):
- return (x, y)
-
- for y in range(top, bot - h, int((bot - top - h) / 10)):
- for x in range(left, right - w, int((right - left - w) / 10)):
- if not self._canvas.find_overlapping(
- x - 5, y - 5, x + w + 5, y + h + 5
- ):
- return (x, y)
- return (0, 0)
+ for x in range(left, right-w, int((right-left-w)/10)):
+ if not self._canvas.find_overlapping(x-5, y-5, x+w+5, y+h+5):
+ return (x,y)
+
+ for y in range(top, bot-h, int((bot-top-h)/10)):
+ for x in range(left, right-w, int((right-left-w)/10)):
+ if not self._canvas.find_overlapping(x-5, y-5, x+w+5, y+h+5):
+ return (x,y)
+ return (0,0)
def destroy_widget(self, canvaswidget):
"""
Destroy this ``CanvasFrame``. If this ``CanvasFrame`` created a
top-level window, then this will close that window.
"""
- if self._parent is None:
- return
+ if self._parent is None: return
self._parent.destroy()
self._parent = None
from a script); otherwise, the frame will close as soon as
the script completes.
"""
- if in_idle():
- return
+ if in_idle(): return
self._parent.mainloop(*args, **kwargs)
-
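+ # Minimal end-to-end sketch of CanvasFrame (assumes a display; with no
+ # parent it creates and manages its own Tk window, as above):
+ #
+ #   cf = CanvasFrame(width=300, height=200)
+ #   cw = TextWidget(cf.canvas(), 'hello world', draggable=1)
+ #   cf.add_widget(cw, 10, 10)
+ #   # cf.print_to_file('canvas.ps')   # same as Ctrl-p
+ #   cf.mainloop()                     # returns immediately under IDLE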
##//////////////////////////////////////////////////////
## Text display
##//////////////////////////////////////////////////////
-
class ShowText(object):
"""
A ``Tkinter`` window used to display a text. ``ShowText`` is
typically used by graphical tools to display help text, or similar
information.
"""
-
- def __init__(self, root, title, text, width=None, height=None, **textbox_options):
+ def __init__(self, root, title, text, width=None, height=None,
+ **textbox_options):
if width is None or height is None:
(width, height) = self.find_dimentions(text, width, height)
self._top = top = Toplevel(root)
top.title(title)
- b = Button(top, text="Ok", command=self.destroy)
- b.pack(side="bottom")
+ b = Button(top, text='Ok', command=self.destroy)
+ b.pack(side='bottom')
tbf = Frame(top)
- tbf.pack(expand=1, fill="both")
- scrollbar = Scrollbar(tbf, orient="vertical")
- scrollbar.pack(side="right", fill="y")
- textbox = Text(tbf, wrap="word", width=width, height=height, **textbox_options)
- textbox.insert("end", text)
- textbox["state"] = "disabled"
- textbox.pack(side="left", expand=1, fill="both")
- scrollbar["command"] = textbox.yview
- textbox["yscrollcommand"] = scrollbar.set
+ tbf.pack(expand=1, fill='both')
+ scrollbar = Scrollbar(tbf, orient='vertical')
+ scrollbar.pack(side='right', fill='y')
+ textbox = Text(tbf, wrap='word', width=width,
+ height=height, **textbox_options)
+ textbox.insert('end', text)
+ textbox['state'] = 'disabled'
+ textbox.pack(side='left', expand=1, fill='both')
+ scrollbar['command'] = textbox.yview
+ textbox['yscrollcommand'] = scrollbar.set
# Make it easy to close the window.
- top.bind("q", self.destroy)
- top.bind("x", self.destroy)
- top.bind("c", self.destroy)
- top.bind("<Return>", self.destroy)
- top.bind("<Escape>", self.destroy)
+ top.bind('q', self.destroy)
+ top.bind('x', self.destroy)
+ top.bind('c', self.destroy)
+ top.bind('<Return>', self.destroy)
+ top.bind('<Escape>', self.destroy)
# Focus the scrollbar, so they can use up/down, etc.
scrollbar.focus()
def find_dimentions(self, text, width, height):
- lines = text.split("\n")
+ lines = text.split('\n')
if width is None:
maxwidth = max(len(line) for line in lines)
width = min(maxwidth, 80)
height = 0
for line in lines:
while len(line) > width:
- brk = line[:width].rfind(" ")
+ brk = line[:width].rfind(' ')
line = line[brk:]
height += 1
height += 1
return (width, height)
def destroy(self, *e):
- if self._top is None:
- return
+ if self._top is None: return
self._top.destroy()
self._top = None
from a script); otherwise, the window will close as soon as
the script completes.
"""
- if in_idle():
- return
+ if in_idle(): return
self._top.mainloop(*args, **kwargs)
-
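+ # Usage sketch: a scrollable, read-only text window that closes on
+ # q, x, c, Return or Escape:
+ #
+ #   ShowText(None, 'Help', 'some long help text...')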
##//////////////////////////////////////////////////////
## Entry dialog
##//////////////////////////////////////////////////////
-
class EntryDialog(object):
"""
A dialog box for entering
"""
-
- def __init__(
- self, parent, original_text="", instructions="", set_callback=None, title=None
- ):
+ def __init__(self, parent, original_text='', instructions='',
+ set_callback=None, title=None):
self._parent = parent
self._original_text = original_text
self._set_callback = set_callback
- width = int(max(30, len(original_text) * 3 / 2))
+ width = int(max(30, len(original_text)*3/2))
self._top = Toplevel(parent)
- if title:
- self._top.title(title)
+ if title: self._top.title(title)
# The text entry box.
entryframe = Frame(self._top)
- entryframe.pack(expand=1, fill="both", padx=5, pady=5, ipady=10)
+ entryframe.pack(expand=1, fill='both', padx=5, pady=5,ipady=10)
if instructions:
- l = Label(entryframe, text=instructions)
- l.pack(side="top", anchor="w", padx=30)
+ l=Label(entryframe, text=instructions)
+ l.pack(side='top', anchor='w', padx=30)
self._entry = Entry(entryframe, width=width)
- self._entry.pack(expand=1, fill="x", padx=30)
+ self._entry.pack(expand=1, fill='x', padx=30)
self._entry.insert(0, original_text)
# A divider
- divider = Frame(self._top, borderwidth=1, relief="sunken")
- divider.pack(fill="x", ipady=1, padx=10)
+ divider = Frame(self._top, borderwidth=1, relief='sunken')
+ divider.pack(fill='x', ipady=1, padx=10)
# The buttons.
buttons = Frame(self._top)
- buttons.pack(expand=0, fill="x", padx=5, pady=5)
- b = Button(buttons, text="Cancel", command=self._cancel, width=8)
- b.pack(side="right", padx=5)
- b = Button(buttons, text="Ok", command=self._ok, width=8, default="active")
- b.pack(side="left", padx=5)
- b = Button(buttons, text="Apply", command=self._apply, width=8)
- b.pack(side="left")
-
- self._top.bind("<Return>", self._ok)
- self._top.bind("<Control-q>", self._cancel)
- self._top.bind("<Escape>", self._cancel)
+ buttons.pack(expand=0, fill='x', padx=5, pady=5)
+ b = Button(buttons, text='Cancel', command=self._cancel, width=8)
+ b.pack(side='right', padx=5)
+ b = Button(buttons, text='Ok', command=self._ok,
+ width=8, default='active')
+ b.pack(side='left', padx=5)
+ b = Button(buttons, text='Apply', command=self._apply, width=8)
+ b.pack(side='left')
+
+ self._top.bind('<Return>', self._ok)
+ self._top.bind('<Control-q>', self._cancel)
+ self._top.bind('<Escape>', self._cancel)
self._entry.focus()
def _reset(self, *e):
- self._entry.delete(0, "end")
+ self._entry.delete(0,'end')
self._entry.insert(0, self._original_text)
if self._set_callback:
self._set_callback(self._original_text)
def _cancel(self, *e):
- try:
- self._reset()
- except:
- pass
+ try: self._reset()
+ except: pass
self._destroy()
def _ok(self, *e):
self._set_callback(self._entry.get())
def _destroy(self, *e):
- if self._top is None:
- return
+ if self._top is None: return
self._top.destroy()
self._top = None
-
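+ # Usage sketch ('root' is assumed to be an existing Tk widget); the
+ # entered text is handed to set_callback (see _ok above):
+ #
+ #   def got(text): print('entered: %r' % text)
+ #   EntryDialog(root, original_text='default',
+ #               instructions='Type something:',
+ #               set_callback=got, title='Input')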
##//////////////////////////////////////////////////////
## Colorized List
##//////////////////////////////////////////////////////
-
class ColorizedList(object):
"""
An abstract base class for displaying a colorized list of items.
:note: Typically, you will want to register a callback for
``'select'`` that calls ``mark`` on the given item.
"""
-
def __init__(self, parent, items=[], **options):
"""
Construct a new list.
self._init_itemframe(options.copy())
# Set up key & mouse bindings.
- self._textwidget.bind("<KeyPress>", self._keypress)
- self._textwidget.bind("<ButtonPress>", self._buttonpress)
+ self._textwidget.bind('<KeyPress>', self._keypress)
+ self._textwidget.bind('<ButtonPress>', self._buttonpress)
# Fill in the given CFG's items.
self._items = None
self.set(items)
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Abstract methods
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
@abstractmethod
def _init_colortags(self, textwidget, options):
"""
strings returned may not contain newline characters.
"""
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Item Access
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
def get(self, index=None):
"""
Modify the list of items contained by this list.
"""
items = list(items)
- if self._items == items:
- return
+ if self._items == items: return
self._items = list(items)
- self._textwidget["state"] = "normal"
- self._textwidget.delete("1.0", "end")
+ self._textwidget['state'] = 'normal'
+ self._textwidget.delete('1.0', 'end')
for item in items:
for (text, colortag) in self._item_repr(item):
- assert "\n" not in text, "item repr may not contain newline"
- self._textwidget.insert("end", text, colortag)
- self._textwidget.insert("end", "\n")
+ assert '\n' not in text, 'item repr may not contain newline'
+ self._textwidget.insert('end', text, colortag)
+ self._textwidget.insert('end', '\n')
# Remove the final newline
- self._textwidget.delete("end-1char", "end")
- self._textwidget.mark_set("insert", "1.0")
- self._textwidget["state"] = "disabled"
+ self._textwidget.delete('end-1char', 'end')
+ self._textwidget.mark_set('insert', '1.0')
+ self._textwidget['state'] = 'disabled'
# Clear all marks
self._marks.clear()
"""
if item is None:
self._marks.clear()
- self._textwidget.tag_remove("highlight", "1.0", "end+1char")
+ self._textwidget.tag_remove('highlight', '1.0', 'end+1char')
else:
index = self._items.index(item)
del self._marks[item]
- (start, end) = ("%d.0" % (index + 1), "%d.0" % (index + 2))
- self._textwidget.tag_remove("highlight", start, end)
+ (start, end) = ('%d.0' % (index+1), '%d.0' % (index+2))
+ self._textwidget.tag_remove('highlight', start, end)
def mark(self, item):
"""
"""
self._marks[item] = 1
index = self._items.index(item)
- (start, end) = ("%d.0" % (index + 1), "%d.0" % (index + 2))
- self._textwidget.tag_add("highlight", start, end)
+ (start, end) = ('%d.0' % (index+1), '%d.0' % (index+2))
+ self._textwidget.tag_add('highlight', start, end)
def markonly(self, item):
"""
the item is already visible, then do nothing.
"""
index = self._items.index(item)
- self._textwidget.see("%d.0" % (index + 1))
+ self._textwidget.see('%d.0' % (index+1))
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Callbacks
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
def add_callback(self, event, func):
"""
single item as its argument. (The item selected
or the item moved to).
"""
- if event == "select":
- events = ["click1", "space", "return"]
- elif event == "move":
- events = ["up", "down", "next", "prior"]
- else:
- events = [event]
+ if event == 'select': events = ['click1', 'space', 'return']
+ elif event == 'move': events = ['up', 'down', 'next', 'prior']
+ else: events = [event]
for e in events:
- self._callbacks.setdefault(e, {})[func] = 1
+ self._callbacks.setdefault(e,{})[func] = 1
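+ # As the class docstring suggests, the typical registration is a
+ # 'select' callback that marks the chosen item ('clist' is assumed to
+ # be an instance of a concrete subclass):
+ #
+ #   clist.add_callback('select', clist.mark)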
def remove_callback(self, event, func=None):
"""
Deregister a callback function. If ``func`` is none, then
all callbacks are removed for the given event.
"""
- if event is None:
- events = list(self._callbacks.keys())
- elif event == "select":
- events = ["click1", "space", "return"]
- elif event == "move":
- events = ["up", "down", "next", "prior"]
- else:
- events = [event]
+ if event is None: events = list(self._callbacks.keys())
+ elif event == 'select': events = ['click1', 'space', 'return']
+ elif event == 'move': events = ['up', 'down', 'next', 'prior']
+ else: events = [event]
for e in events:
- if func is None:
- del self._callbacks[e]
+ if func is None: del self._callbacks[e]
else:
- try:
- del self._callbacks[e][func]
- except:
- pass
+ try: del self._callbacks[e][func]
+ except: pass
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Tkinter Methods
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
def pack(self, cnf={}, **kw):
- # "@include: Tkinter.Pack.pack"
+# "@include: Tkinter.Pack.pack"
self._itemframe.pack(cnf, **kw)
def grid(self, cnf={}, **kw):
- # "@include: Tkinter.Grid.grid"
+# "@include: Tkinter.Grid.grid"
self._itemframe.grid(cnf, *kw)
def focus(self):
- # "@include: Tkinter.Widget.focus"
+# "@include: Tkinter.Widget.focus"
self._textwidget.focus()
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Internal Methods
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
def _init_itemframe(self, options):
self._itemframe = Frame(self._parent)
# Create the basic Text widget & scrollbar.
- options.setdefault("background", "#e0e0e0")
+ options.setdefault('background', '#e0e0e0')
self._textwidget = Text(self._itemframe, **options)
- self._textscroll = Scrollbar(self._itemframe, takefocus=0, orient="vertical")
- self._textwidget.config(yscrollcommand=self._textscroll.set)
+ self._textscroll = Scrollbar(self._itemframe, takefocus=0,
+ orient='vertical')
+ self._textwidget.config(yscrollcommand = self._textscroll.set)
self._textscroll.config(command=self._textwidget.yview)
- self._textscroll.pack(side="right", fill="y")
- self._textwidget.pack(expand=1, fill="both", side="left")
+ self._textscroll.pack(side='right', fill='y')
+ self._textwidget.pack(expand=1, fill='both', side='left')
# Initialize the colorization tags
- self._textwidget.tag_config(
- "highlight", background="#e0ffff", border="1", relief="raised"
- )
+ self._textwidget.tag_config('highlight', background='#e0ffff',
+ border='1', relief='raised')
self._init_colortags(self._textwidget, options)
# How do I want to mark keyboard selection?
- self._textwidget.tag_config("sel", foreground="")
- self._textwidget.tag_config(
- "sel", foreground="", background="", border="", underline=1
- )
- self._textwidget.tag_lower("highlight", "sel")
+ self._textwidget.tag_config('sel', foreground='')
+ self._textwidget.tag_config('sel', foreground='', background='',
+ border='', underline=1)
+ self._textwidget.tag_lower('highlight', 'sel')
def _fire_callback(self, event, itemnum):
- if event not in self._callbacks:
- return
+ if event not in self._callbacks: return
if 0 <= itemnum < len(self._items):
item = self._items[itemnum]
else:
cb_func(item)
def _buttonpress(self, event):
- clickloc = "@%d,%d" % (event.x, event.y)
+ clickloc = '@%d,%d' % (event.x,event.y)
insert_point = self._textwidget.index(clickloc)
- itemnum = int(insert_point.split(".")[0]) - 1
- self._fire_callback("click%d" % event.num, itemnum)
+ itemnum = int(insert_point.split('.')[0])-1
+ self._fire_callback('click%d' % event.num, itemnum)
def _keypress(self, event):
- if event.keysym == "Return" or event.keysym == "space":
- insert_point = self._textwidget.index("insert")
- itemnum = int(insert_point.split(".")[0]) - 1
+ if event.keysym == 'Return' or event.keysym == 'space':
+ insert_point = self._textwidget.index('insert')
+ itemnum = int(insert_point.split('.')[0])-1
self._fire_callback(event.keysym.lower(), itemnum)
return
- elif event.keysym == "Down":
- delta = "+1line"
- elif event.keysym == "Up":
- delta = "-1line"
- elif event.keysym == "Next":
- delta = "+10lines"
- elif event.keysym == "Prior":
- delta = "-10lines"
- else:
- return "continue"
-
- self._textwidget.mark_set("insert", "insert" + delta)
- self._textwidget.see("insert")
- self._textwidget.tag_remove("sel", "1.0", "end+1char")
- self._textwidget.tag_add("sel", "insert linestart", "insert lineend")
-
- insert_point = self._textwidget.index("insert")
- itemnum = int(insert_point.split(".")[0]) - 1
+ elif event.keysym == 'Down': delta='+1line'
+ elif event.keysym == 'Up': delta='-1line'
+ elif event.keysym == 'Next': delta='+10lines'
+ elif event.keysym == 'Prior': delta='-10lines'
+ else: return 'continue'
+
+ self._textwidget.mark_set('insert', 'insert'+delta)
+ self._textwidget.see('insert')
+ self._textwidget.tag_remove('sel', '1.0', 'end+1char')
+ self._textwidget.tag_add('sel', 'insert linestart', 'insert lineend')
+
+ insert_point = self._textwidget.index('insert')
+ itemnum = int(insert_point.split('.')[0])-1
self._fire_callback(event.keysym.lower(), itemnum)
- return "break"
-
+ return 'break'
##//////////////////////////////////////////////////////
## Improved OptionMenu
##//////////////////////////////////////////////////////
-
class MutableOptionMenu(Menubutton):
def __init__(self, master, values, **options):
- self._callback = options.get("command")
- if "command" in options:
- del options["command"]
+ self._callback = options.get('command')
+ if 'command' in options: del options['command']
# Create a variable
self._variable = variable = StringVar()
if len(values) > 0:
variable.set(values[0])
- kw = {
- "borderwidth": 2,
- "textvariable": variable,
- "indicatoron": 1,
- "relief": RAISED,
- "anchor": "c",
- "highlightthickness": 2,
- }
+ kw = {"borderwidth": 2, "textvariable": variable,
+ "indicatoron": 1, "relief": RAISED, "anchor": "c",
+ "highlightthickness": 2}
kw.update(options)
Widget.__init__(self, master, "menubutton", kw)
- self.widgetName = "tk_optionMenu"
- self._menu = Menu(self, name="menu", tearoff=0)
+ self.widgetName = 'tk_optionMenu'
+ self._menu = Menu(self, name="menu", tearoff=0,)
self.menuname = self._menu._w
self._values = []
- for value in values:
- self.add(value)
+ for value in values: self.add(value)
self["menu"] = self._menu
def add(self, value):
- if value in self._values:
- return
-
- def set(value=value):
- self.set(value)
-
+ if value in self._values: return
+ def set(value=value): self.set(value)
self._menu.add_command(label=value, command=set)
self._values.append(value)
self._menu.delete(i, i)
def __getitem__(self, name):
- if name == "menu":
+ if name == 'menu':
return self.__menu
return Widget.__getitem__(self, name)
Menubutton.destroy(self)
self._menu = None
-
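+ # Usage sketch ('root' is assumed to be a Tk window; that 'command'
+ # receives the chosen value is an assumption, it is only stored above):
+ #
+ #   def chose(value): print('picked %r' % value)
+ #   mom = MutableOptionMenu(root, ['red', 'green'], command=chose)
+ #   mom.pack()
+ #   mom.add('blue')   # values can grow after creation, hence "mutable"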
##//////////////////////////////////////////////////////
## Test code.
##//////////////////////////////////////////////////////
-
def demo():
"""
A simple demonstration showing how to use canvas widgets.
"""
-
def fill(cw):
from random import randint
-
- cw["fill"] = "#00%04d" % randint(0, 9999)
-
+ cw['fill'] = '#00%04d' % randint(0,9999)
def color(cw):
from random import randint
-
- cw["color"] = "#ff%04d" % randint(0, 9999)
+ cw['color'] = '#ff%04d' % randint(0,9999)
cf = CanvasFrame(closeenough=10, width=300, height=300)
c = cf.canvas()
- ct3 = TextWidget(c, "hiya there", draggable=1)
- ct2 = TextWidget(c, "o o\n||\n___\n U", draggable=1, justify="center")
- co = OvalWidget(c, ct2, outline="red")
- ct = TextWidget(c, "o o\n||\n\\___/", draggable=1, justify="center")
- cp = ParenWidget(c, ct, color="red")
- cb = BoxWidget(c, cp, fill="cyan", draggable=1, width=3, margin=10)
- equation = SequenceWidget(
- c,
- SymbolWidget(c, "forall"),
- TextWidget(c, "x"),
- SymbolWidget(c, "exists"),
- TextWidget(c, "y: "),
- TextWidget(c, "x"),
- SymbolWidget(c, "notequal"),
- TextWidget(c, "y"),
- )
+ ct3 = TextWidget(c, 'hiya there', draggable=1)
+ ct2 = TextWidget(c, 'o o\n||\n___\n U', draggable=1, justify='center')
+ co = OvalWidget(c, ct2, outline='red')
+ ct = TextWidget(c, 'o o\n||\n\\___/', draggable=1, justify='center')
+ cp = ParenWidget(c, ct, color='red')
+ cb = BoxWidget(c, cp, fill='cyan', draggable=1, width=3, margin=10)
+ equation = SequenceWidget(c,
+ SymbolWidget(c, 'forall'), TextWidget(c, 'x'),
+ SymbolWidget(c, 'exists'), TextWidget(c, 'y: '),
+ TextWidget(c, 'x'), SymbolWidget(c, 'notequal'),
+ TextWidget(c, 'y'))
space = SpaceWidget(c, 0, 30)
- cstack = StackWidget(c, cb, ct3, space, co, equation, align="center")
- prompt_msg = TextWidget(
- c, "try clicking\nand dragging", draggable=1, justify="center"
- )
- cs = SequenceWidget(c, cstack, prompt_msg)
- zz = BracketWidget(c, cs, color="green4", width=3)
+ cstack = StackWidget(c, cb, ct3, space, co, equation, align='center')
+ foo = TextWidget(c, 'try clicking\nand dragging',
+ draggable=1, justify='center')
+ cs = SequenceWidget(c, cstack, foo)
+ zz = BracketWidget(c, cs, color='green4', width=3)
cf.add_widget(zz, 60, 30)
cb.bind_click(fill)
ct3.bind_click(color)
cf.mainloop()
- # ShowText(None, 'title', ((('this is text'*150)+'\n')*5))
-
+ #ShowText(None, 'title', ((('this is text'*150)+'\n')*5))
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: Feature Structures
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>,
# Rob Speer,
# Steven Bird <stevenbird1@gmail.com>
or if you plan to use them as dictionary keys, it is strongly
recommended that you use full-fledged ``FeatStruct`` objects.
"""
+from __future__ import print_function, unicode_literals, division
import re
import copy
from functools import total_ordering
+from six import integer_types, string_types
+
from nltk.internals import read_str, raise_unorderable_types
-from nltk.sem.logic import (
- Variable,
- Expression,
- SubstituteBindingsI,
- LogicParser,
- LogicalExpressionException,
-)
+from nltk.sem.logic import (Variable, Expression, SubstituteBindingsI,
+ LogicParser, LogicalExpressionException)
+from nltk.compat import python_2_unicode_compatible, unicode_repr
######################################################################
# Feature Structure
######################################################################
-
@total_ordering
class FeatStruct(SubstituteBindingsI):
"""
feature structure."""
##////////////////////////////////////////////////////////////
- # { Constructor
+ #{ Constructor
##////////////////////////////////////////////////////////////
def __new__(cls, features=None, **morefeatures):
elif _is_mapping(features):
return FeatDict.__new__(FeatDict, features, **morefeatures)
elif morefeatures:
- raise TypeError(
- "Keyword arguments may only be specified "
- "if features is None or is a mapping."
- )
- if isinstance(features, str):
+ raise TypeError('Keyword arguments may only be specified '
+ 'if features is None or is a mapping.')
+ if isinstance(features, string_types):
if FeatStructReader._START_FDICT_RE.match(features):
return FeatDict.__new__(FeatDict, features, **morefeatures)
else:
elif _is_sequence(features):
return FeatList.__new__(FeatList, features)
else:
- raise TypeError("Expected string or mapping or sequence")
+ raise TypeError('Expected string or mapping or sequence')
# Otherwise, construct the object as normal.
else:
- return super(FeatStruct, cls).__new__(cls, features, **morefeatures)
+ return super(FeatStruct, cls).__new__(cls, features,
+ **morefeatures)
##////////////////////////////////////////////////////////////
- # { Uniform Accessor Methods
+ #{ Uniform Accessor Methods
##////////////////////////////////////////////////////////////
# These helper functions allow the methods defined by FeatStruct
# to treat all feature structures as mappings, even if they're
def _keys(self):
"""Return an iterable of the feature identifiers used by this
FeatStruct."""
- raise NotImplementedError() # Implemented by subclasses.
+ raise NotImplementedError() # Implemented by subclasses.
def _values(self):
"""Return an iterable of the feature values directly defined
by this FeatStruct."""
- raise NotImplementedError() # Implemented by subclasses.
+ raise NotImplementedError() # Implemented by subclasses.
def _items(self):
"""Return an iterable of (fid,fval) pairs, where fid is a
feature identifier and fval is the corresponding feature
value, for all features defined by this FeatStruct."""
- raise NotImplementedError() # Implemented by subclasses.
+ raise NotImplementedError() # Implemented by subclasses.
##////////////////////////////////////////////////////////////
- # { Equality & Hashing
+ #{ Equality & Hashing
##////////////////////////////////////////////////////////////
def equal_values(self, other, check_reentrance=False):
otherwise, raise ``TypeError``.
"""
if not self._frozen:
- raise TypeError("FeatStructs must be frozen before they " "can be hashed.")
- try:
- return self._hash
+ raise TypeError('FeatStructs must be frozen before they '
+ 'can be hashed.')
+ try: return self._hash
except AttributeError:
self._hash = self._calculate_hashvalue(set())
return self._hash
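# Sketch (illustrative, standard FeatStruct behavior): hashing requires
# freezing first.
#
#     fs = FeatStruct('[A=1]')
#     hash(fs)        # TypeError: FeatStructs must be frozen ...
#     fs.freeze()
#     hash(fs)        # ok -- computed once, then cached in self._hash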
- def _equal(
- self, other, check_reentrance, visited_self, visited_other, visited_pairs
- ):
+ def _equal(self, other, check_reentrance, visited_self,
+ visited_other, visited_pairs):
"""
Return True iff self and other have equal values.
for all pairs of feature structures we've already visited.
"""
# If we're the same object, then we're equal.
- if self is other:
- return True
+ if self is other: return True
# If we have different classes, we're definitely not equal.
- if self.__class__ != other.__class__:
- return False
+ if self.__class__ != other.__class__: return False
# If we define different features, we're definitely not equal.
# (Perform len test first because it's faster -- we should
# do profiling to see if this actually helps)
- if len(self) != len(other):
- return False
- if set(self._keys()) != set(other._keys()):
- return False
+ if len(self) != len(other): return False
+ if set(self._keys()) != set(other._keys()): return False
# If we're checking reentrance, then any time we revisit a
# structure, make sure that it was paired with the same
# Keep track of which nodes we've visited.
visited_self.add(id(self))
visited_other.add(id(other))
- visited_pairs.add((id(self), id(other)))
+ visited_pairs.add( (id(self), id(other)) )
# Now we have to check all values. If any of them don't match,
# then return false.
for (fname, self_fval) in self._items():
other_fval = other[fname]
if isinstance(self_fval, FeatStruct):
- if not self_fval._equal(
- other_fval,
- check_reentrance,
- visited_self,
- visited_other,
- visited_pairs,
- ):
+ if not self_fval._equal(other_fval, check_reentrance,
+ visited_self, visited_other,
+ visited_pairs):
return False
else:
- if self_fval != other_fval:
- return False
+ if self_fval != other_fval: return False
# Everything matched up; return true.
return True
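# Sketch: reentrance checking distinguishes shared values from merely
# equal ones. FeatStruct('[A=(1)[X=1], B->(1)]') (A and B share one
# structure) and FeatStruct('[A=[X=1], B=[X=1]]') (two equal copies)
# compare equal with check_reentrance=False, but unequal with
# check_reentrance=True.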
:param visited: A set containing the ids of all feature
structures we've already visited while hashing.
"""
- if id(self) in visited:
- return 1
+ if id(self) in visited: return 1
visited.add(id(self))
hashval = 5831
else:
hashval += hash(fval)
# Convert to a 32 bit int.
- hashval = int(hashval & 0x7FFFFFFF)
+ hashval = int(hashval & 0x7fffffff)
return hashval
##////////////////////////////////////////////////////////////
- # { Freezing
+ #{ Freezing
##////////////////////////////////////////////////////////////
#: Error message used by mutating methods when called on a frozen
'freeze' any feature value that is not a ``FeatStruct``; it
is recommended that you use only immutable feature values.
"""
- if self._frozen:
- return
+ if self._frozen: return
self._freeze(set())
def frozen(self):
:param visited: A set containing the ids of all feature
structures we've already visited while freezing.
"""
- if id(self) in visited:
- return
+ if id(self) in visited: return
visited.add(id(self))
self._frozen = True
for (fname, fval) in sorted(self._items()):
fval._freeze(visited)
##////////////////////////////////////////////////////////////
- # { Copying
+ #{ Copying
##////////////////////////////////////////////////////////////
def copy(self, deep=True):
# Subclasses should define __deepcopy__ to ensure that the new
# copy will not be frozen.
def __deepcopy__(self, memo):
- raise NotImplementedError() # Implemented by subclasses.
+ raise NotImplementedError() # Implemented by subclasses.
##////////////////////////////////////////////////////////////
- # { Structural Information
+ #{ Structural Information
##////////////////////////////////////////////////////////////
def cyclic(self):
:param visited: A set containing the ids of all feature
structures we've already visited while freezing.
"""
- raise NotImplementedError() # Implemented by subclasses.
+ raise NotImplementedError() # Implemented by subclasses.
def _walk(self, visited):
- if id(self) in visited:
- return
+ if id(self) in visited: return
visited.add(id(self))
yield self
for fval in self._values():
return reentrances
##////////////////////////////////////////////////////////////
- # { Variables & Bindings
+ #{ Variables & Bindings
##////////////////////////////////////////////////////////////
def substitute_bindings(self, bindings):
return remove_variables(self)
##////////////////////////////////////////////////////////////
- # { Unification
+ #{ Unification
##////////////////////////////////////////////////////////////
- def unify(self, other, bindings=None, trace=False, fail=None, rename_vars=True):
+ def unify(self, other, bindings=None, trace=False,
+ fail=None, rename_vars=True):
return unify(self, other, bindings, trace, fail, rename_vars)
def subsumes(self, other):
return subsumes(self, other)
##////////////////////////////////////////////////////////////
- # { String Representations
+ #{ String Representations
##////////////////////////////////////////////////////////////
def __repr__(self):
"""
raise NotImplementedError()
-
# Mutation: disable if frozen.
_FROZEN_ERROR = "Frozen FeatStructs may not be modified."
_FROZEN_NOTICE = "\n%sIf self is frozen, raise ValueError."
-
-
-def _check_frozen(method, indent=""):
+def _check_frozen(method, indent=''):
"""
Given a method function, return a new method function that first
checks if ``self._frozen`` is true; and if so, raises ``ValueError``
with an appropriate message. Otherwise, call the method and return
its result.
"""
-
def wrapped(self, *args, **kwargs):
- if self._frozen:
- raise ValueError(_FROZEN_ERROR)
- else:
- return method(self, *args, **kwargs)
-
+ if self._frozen: raise ValueError(_FROZEN_ERROR)
+ else: return method(self, *args, **kwargs)
wrapped.__name__ = method.__name__
- wrapped.__doc__ = (method.__doc__ or "") + (_FROZEN_NOTICE % indent)
+ wrapped.__doc__ = (method.__doc__ or '') + (_FROZEN_NOTICE % indent)
return wrapped
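# Sketch of the effect: mutators inherited from dict/list are wrapped, so
# mutation fails once a structure is frozen.
#
#     fs = FeatStruct(A='x')
#     fs.freeze()
#     fs.setdefault('B', 1)   # ValueError: Frozen FeatStructs may not
#                             # be modified.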
# Feature Dictionary
######################################################################
-
-
+@python_2_unicode_compatible
class FeatDict(FeatStruct, dict):
"""
A feature structure that acts like a Python dictionary. I.e., a
:see: ``FeatStruct`` for information about feature paths, reentrance,
cyclic feature structures, mutability, freezing, and hashing.
"""
-
def __init__(self, features=None, **morefeatures):
"""
Create a new feature dictionary, with the specified features.
``morefeatures``, then the value from ``morefeatures`` will be
used.
"""
- if isinstance(features, str):
+ if isinstance(features, string_types):
FeatStructReader().fromstring(features, self)
self.update(**morefeatures)
else:
# update() checks the types of features.
self.update(features, **morefeatures)
- # ////////////////////////////////////////////////////////////
- # { Dict methods
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
+ #{ Dict methods
+ #////////////////////////////////////////////////////////////
_INDEX_ERROR = str("Expected feature name or path. Got %r.")
def __getitem__(self, name_or_path):
"""If the feature with the given name or path exists, return
its value; otherwise, raise ``KeyError``."""
- if isinstance(name_or_path, (str, Feature)):
+ if isinstance(name_or_path, (string_types, Feature)):
return dict.__getitem__(self, name_or_path)
elif isinstance(name_or_path, tuple):
try:
val = self
for fid in name_or_path:
if not isinstance(val, FeatStruct):
- raise KeyError # path contains base value
+ raise KeyError # path contains base value
val = val[fid]
return val
except (KeyError, IndexError):
def get(self, name_or_path, default=None):
"""If the feature with the given name or path exists, return its
value; otherwise, return ``default``."""
- try:
- return self[name_or_path]
- except KeyError:
- return default
+ try: return self[name_or_path]
+ except KeyError: return default
def __contains__(self, name_or_path):
"""Return true if a feature with the given name or path exists."""
- try:
- self[name_or_path]
- return True
- except KeyError:
- return False
+ try: self[name_or_path]; return True
+ except KeyError: return False
def has_key(self, name_or_path):
"""Return true if a feature with the given name or path exists."""
def __delitem__(self, name_or_path):
"""If the feature with the given name or path exists, delete
its value; otherwise, raise ``KeyError``."""
- if self._frozen:
- raise ValueError(_FROZEN_ERROR)
- if isinstance(name_or_path, (str, Feature)):
+ if self._frozen: raise ValueError(_FROZEN_ERROR)
+ if isinstance(name_or_path, (string_types, Feature)):
return dict.__delitem__(self, name_or_path)
elif isinstance(name_or_path, tuple):
if len(name_or_path) == 0:
else:
parent = self[name_or_path[:-1]]
if not isinstance(parent, FeatStruct):
- raise KeyError(name_or_path) # path contains base value
+ raise KeyError(name_or_path) # path contains base value
del parent[name_or_path[-1]]
else:
raise TypeError(self._INDEX_ERROR % name_or_path)
"""Set the value for the feature with the given name or path
to ``value``. If ``name_or_path`` is an invalid path, raise
``KeyError``."""
- if self._frozen:
- raise ValueError(_FROZEN_ERROR)
- if isinstance(name_or_path, (str, Feature)):
+ if self._frozen: raise ValueError(_FROZEN_ERROR)
+ if isinstance(name_or_path, (string_types, Feature)):
return dict.__setitem__(self, name_or_path, value)
elif isinstance(name_or_path, tuple):
if len(name_or_path) == 0:
else:
parent = self[name_or_path[:-1]]
if not isinstance(parent, FeatStruct):
- raise KeyError(name_or_path) # path contains base value
+ raise KeyError(name_or_path) # path contains base value
parent[name_or_path[-1]] = value
else:
raise TypeError(self._INDEX_ERROR % name_or_path)
setdefault = _check_frozen(dict.setdefault)
def update(self, features=None, **morefeatures):
- if self._frozen:
- raise ValueError(_FROZEN_ERROR)
+ if self._frozen: raise ValueError(_FROZEN_ERROR)
if features is None:
items = ()
- elif hasattr(features, "items") and callable(features.items):
+ elif hasattr(features, 'items') and callable(features.items):
items = features.items()
- elif hasattr(features, "__iter__"):
+ elif hasattr(features, '__iter__'):
items = features
else:
- raise ValueError("Expected mapping or list of tuples")
+ raise ValueError('Expected mapping or list of tuples')
for key, val in items:
- if not isinstance(key, (str, Feature)):
- raise TypeError("Feature names must be strings")
+ if not isinstance(key, (string_types, Feature)):
+ raise TypeError('Feature names must be strings')
self[key] = val
for key, val in morefeatures.items():
- if not isinstance(key, (str, Feature)):
- raise TypeError("Feature names must be strings")
+ if not isinstance(key, (string_types, Feature)):
+ raise TypeError('Feature names must be strings')
self[key] = val
##////////////////////////////////////////////////////////////
- # { Copying
+ #{ Copying
##////////////////////////////////////////////////////////////
def __deepcopy__(self, memo):
memo[id(self)] = selfcopy = self.__class__()
for (key, val) in self._items():
- selfcopy[copy.deepcopy(key, memo)] = copy.deepcopy(val, memo)
+ selfcopy[copy.deepcopy(key,memo)] = copy.deepcopy(val,memo)
return selfcopy
##////////////////////////////////////////////////////////////
- # { Uniform Accessor Methods
+ #{ Uniform Accessor Methods
##////////////////////////////////////////////////////////////
- def _keys(self):
- return self.keys()
-
- def _values(self):
- return self.values()
-
- def _items(self):
- return self.items()
+ def _keys(self): return self.keys()
+ def _values(self): return self.values()
+ def _items(self): return self.items()
##////////////////////////////////////////////////////////////
- # { String Representations
+ #{ String Representations
##////////////////////////////////////////////////////////////
def __str__(self):
Display a multi-line representation of this feature dictionary
as an FVM (feature value matrix).
"""
- return "\n".join(self._str(self._find_reentrances({}), {}))
+ return '\n'.join(self._str(self._find_reentrances({}), {}))
def _repr(self, reentrances, reentrance_ids):
segments = []
- prefix = ""
- suffix = ""
+ prefix = ''
+ suffix = ''
# If this is the first time we've seen a reentrant structure,
# then assign it a unique identifier.
if reentrances[id(self)]:
assert id(self) not in reentrance_ids
- reentrance_ids[id(self)] = repr(len(reentrance_ids) + 1)
+ reentrance_ids[id(self)] = repr(len(reentrance_ids)+1)
# sorting note: keys are unique strings, so we'll never fall
# through to comparing values.
for (fname, fval) in sorted(self.items()):
- display = getattr(fname, "display", None)
+ display = getattr(fname, 'display', None)
if id(fval) in reentrance_ids:
- segments.append("%s->(%s)" % (fname, reentrance_ids[id(fval)]))
- elif (
- display == "prefix"
- and not prefix
- and isinstance(fval, (Variable, str))
- ):
- prefix = "%s" % fval
- elif display == "slash" and not suffix:
+ segments.append('%s->(%s)' %
+ (fname, reentrance_ids[id(fval)]))
+ elif (display == 'prefix' and not prefix and
+ isinstance(fval, (Variable, string_types))):
+ prefix = '%s' % fval
+ elif display == 'slash' and not suffix:
if isinstance(fval, Variable):
- suffix = "/%s" % fval.name
+ suffix = '/%s' % fval.name
else:
- suffix = "/%s" % repr(fval)
+ suffix = '/%s' % unicode_repr(fval)
elif isinstance(fval, Variable):
- segments.append("%s=%s" % (fname, fval.name))
+ segments.append('%s=%s' % (fname, fval.name))
elif fval is True:
- segments.append("+%s" % fname)
+ segments.append('+%s' % fname)
elif fval is False:
- segments.append("-%s" % fname)
+ segments.append('-%s' % fname)
elif isinstance(fval, Expression):
- segments.append("%s=<%s>" % (fname, fval))
+ segments.append('%s=<%s>' % (fname, fval))
elif not isinstance(fval, FeatStruct):
- segments.append("%s=%s" % (fname, repr(fval)))
+ segments.append('%s=%s' % (fname, unicode_repr(fval)))
else:
fval_repr = fval._repr(reentrances, reentrance_ids)
- segments.append("%s=%s" % (fname, fval_repr))
+ segments.append('%s=%s' % (fname, fval_repr))
# If it's reentrant, then add on an identifier tag.
if reentrances[id(self)]:
- prefix = "(%s)%s" % (reentrance_ids[id(self)], prefix)
- return "%s[%s]%s" % (prefix, ", ".join(segments), suffix)
+ prefix = '(%s)%s' % (reentrance_ids[id(self)], prefix)
+ return '%s[%s]%s' % (prefix, ', '.join(segments), suffix)
def _str(self, reentrances, reentrance_ids):
"""
# then tack on an id string.
if reentrances[id(self)]:
assert id(self) not in reentrance_ids
- reentrance_ids[id(self)] = repr(len(reentrance_ids) + 1)
+ reentrance_ids[id(self)] = repr(len(reentrance_ids)+1)
# Special case: empty feature dict.
if len(self) == 0:
if reentrances[id(self)]:
- return ["(%s) []" % reentrance_ids[id(self)]]
+ return ['(%s) []' % reentrance_ids[id(self)]]
else:
- return ["[]"]
+ return ['[]']
# What's the longest feature name? Use this to align names.
maxfnamelen = max(len("%s" % k) for k in self.keys())
for (fname, fval) in sorted(self.items()):
fname = ("%s" % fname).ljust(maxfnamelen)
if isinstance(fval, Variable):
- lines.append("%s = %s" % (fname, fval.name))
+ lines.append('%s = %s' % (fname,fval.name))
elif isinstance(fval, Expression):
- lines.append("%s = <%s>" % (fname, fval))
+ lines.append('%s = <%s>' % (fname, fval))
elif isinstance(fval, FeatList):
fval_repr = fval._repr(reentrances, reentrance_ids)
- lines.append("%s = %s" % (fname, repr(fval_repr)))
+ lines.append('%s = %s' % (fname, unicode_repr(fval_repr)))
elif not isinstance(fval, FeatDict):
# It's not a nested feature structure -- just print it.
- lines.append("%s = %s" % (fname, repr(fval)))
+ lines.append('%s = %s' % (fname, unicode_repr(fval)))
elif id(fval) in reentrance_ids:
# It's a feature structure we've seen before -- print
# the reentrance id.
- lines.append("%s -> (%s)" % (fname, reentrance_ids[id(fval)]))
+ lines.append('%s -> (%s)' % (fname, reentrance_ids[id(fval)]))
else:
# It's a new feature structure. Separate it from
# other values by a blank line.
- if lines and lines[-1] != "":
- lines.append("")
+ if lines and lines[-1] != '': lines.append('')
# Recursively print the feature's value (fval).
fval_lines = fval._str(reentrances, reentrance_ids)
# Indent each line to make room for fname.
- fval_lines = [(" " * (maxfnamelen + 3)) + l for l in fval_lines]
+ fval_lines = [(' '*(maxfnamelen+3))+l for l in fval_lines]
# Pick which line we'll display fname on, & splice it in.
- nameline = (len(fval_lines) - 1) // 2
+ nameline = (len(fval_lines)-1) // 2
fval_lines[nameline] = (
- fname + " =" + fval_lines[nameline][maxfnamelen + 2 :]
- )
+ fname+' ='+fval_lines[nameline][maxfnamelen+2:])
# Add the feature structure to the output.
lines += fval_lines
# Separate FeatStructs by a blank line.
- lines.append("")
+ lines.append('')
# Get rid of any excess blank lines.
- if lines[-1] == "":
- lines.pop()
+ if lines[-1] == '': lines.pop()
# Add brackets around everything.
maxlen = max(len(line) for line in lines)
- lines = ["[ %s%s ]" % (line, " " * (maxlen - len(line))) for line in lines]
+ lines = ['[ %s%s ]' % (line, ' '*(maxlen-len(line))) for line in lines]
# If it's reentrant, then add on an identifier tag.
if reentrances[id(self)]:
- idstr = "(%s) " % reentrance_ids[id(self)]
- lines = [(" " * len(idstr)) + l for l in lines]
- idline = (len(lines) - 1) // 2
- lines[idline] = idstr + lines[idline][len(idstr) :]
+ idstr = '(%s) ' % reentrance_ids[id(self)]
+ lines = [(' '*len(idstr))+l for l in lines]
+ idline = (len(lines)-1) // 2
+ lines[idline] = idstr + lines[idline][len(idstr):]
return lines
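# Sketch of the resulting FVM layout (spacing is computed from the
# longest feature name; shown approximately):
#
#     >>> print(FeatStruct('[NAME=?x, SPEC=[NUM=sg]]'))
#     [ NAME = ?x             ]
#     [                       ]
#     [ SPEC = [ NUM = 'sg' ] ]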
# Feature List
######################################################################
-
class FeatList(FeatStruct, list):
"""
A list of feature values, where each feature value is either a
:see: ``FeatStruct`` for information about feature paths, reentrance,
cyclic feature structures, mutability, freezing, and hashing.
"""
-
def __init__(self, features=()):
"""
Create a new feature list, with the specified features.
``FeatStructReader``. Otherwise, it should be a sequence
of basic values and nested feature structures.
"""
- if isinstance(features, str):
+ if isinstance(features, string_types):
FeatStructReader().fromstring(features, self)
else:
list.__init__(self, features)
- # ////////////////////////////////////////////////////////////
- # { List methods
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
+ #{ List methods
+ #////////////////////////////////////////////////////////////
_INDEX_ERROR = "Expected int or feature path. Got %r."
def __getitem__(self, name_or_path):
- if isinstance(name_or_path, int):
+ if isinstance(name_or_path, integer_types):
return list.__getitem__(self, name_or_path)
elif isinstance(name_or_path, tuple):
try:
val = self
for fid in name_or_path:
if not isinstance(val, FeatStruct):
- raise KeyError # path contains base value
+ raise KeyError # path contains base value
val = val[fid]
return val
except (KeyError, IndexError):
def __delitem__(self, name_or_path):
"""If the feature with the given name or path exists, delete
its value; otherwise, raise ``KeyError``."""
- if self._frozen:
- raise ValueError(_FROZEN_ERROR)
- if isinstance(name_or_path, (int, slice)):
+ if self._frozen: raise ValueError(_FROZEN_ERROR)
+ if isinstance(name_or_path, (integer_types, slice)):
return list.__delitem__(self, name_or_path)
elif isinstance(name_or_path, tuple):
if len(name_or_path) == 0:
else:
parent = self[name_or_path[:-1]]
if not isinstance(parent, FeatStruct):
- raise KeyError(name_or_path) # path contains base value
+ raise KeyError(name_or_path) # path contains base value
del parent[name_or_path[-1]]
else:
raise TypeError(self._INDEX_ERROR % name_or_path)
"""Set the value for the feature with the given name or path
to ``value``. If ``name_or_path`` is an invalid path, raise
``KeyError``."""
- if self._frozen:
- raise ValueError(_FROZEN_ERROR)
- if isinstance(name_or_path, (int, slice)):
+ if self._frozen: raise ValueError(_FROZEN_ERROR)
+ if isinstance(name_or_path, (integer_types, slice)):
return list.__setitem__(self, name_or_path, value)
elif isinstance(name_or_path, tuple):
if len(name_or_path) == 0:
else:
parent = self[name_or_path[:-1]]
if not isinstance(parent, FeatStruct):
- raise KeyError(name_or_path) # path contains base value
+ raise KeyError(name_or_path) # path contains base value
parent[name_or_path[-1]] = value
else:
raise TypeError(self._INDEX_ERROR % name_or_path)
- # __delslice__ = _check_frozen(list.__delslice__, ' ')
- # __setslice__ = _check_frozen(list.__setslice__, ' ')
+# __delslice__ = _check_frozen(list.__delslice__, ' ')
+# __setslice__ = _check_frozen(list.__setslice__, ' ')
__iadd__ = _check_frozen(list.__iadd__)
__imul__ = _check_frozen(list.__imul__)
append = _check_frozen(list.append)
sort = _check_frozen(list.sort)
##////////////////////////////////////////////////////////////
- # { Copying
+ #{ Copying
##////////////////////////////////////////////////////////////
def __deepcopy__(self, memo):
memo[id(self)] = selfcopy = self.__class__()
- selfcopy.extend(copy.deepcopy(fval, memo) for fval in self)
+ selfcopy.extend(copy.deepcopy(fval,memo) for fval in self)
return selfcopy
##////////////////////////////////////////////////////////////
- # { Uniform Accessor Methods
+ #{ Uniform Accessor Methods
##////////////////////////////////////////////////////////////
- def _keys(self):
- return list(range(len(self)))
-
- def _values(self):
- return self
-
- def _items(self):
- return enumerate(self)
+ def _keys(self): return list(range(len(self)))
+ def _values(self): return self
+ def _items(self): return enumerate(self)
##////////////////////////////////////////////////////////////
- # { String Representations
+ #{ String Representations
##////////////////////////////////////////////////////////////
# Special handling for: reentrances, variables, expressions.
# then assign it a unique identifier.
if reentrances[id(self)]:
assert id(self) not in reentrance_ids
- reentrance_ids[id(self)] = repr(len(reentrance_ids) + 1)
- prefix = "(%s)" % reentrance_ids[id(self)]
+ reentrance_ids[id(self)] = repr(len(reentrance_ids)+1)
+ prefix = '(%s)' % reentrance_ids[id(self)]
else:
- prefix = ""
+ prefix = ''
segments = []
for fval in self:
if id(fval) in reentrance_ids:
- segments.append("->(%s)" % reentrance_ids[id(fval)])
+ segments.append('->(%s)' % reentrance_ids[id(fval)])
elif isinstance(fval, Variable):
segments.append(fval.name)
elif isinstance(fval, Expression):
- segments.append("%s" % fval)
+ segments.append('%s' % fval)
elif isinstance(fval, FeatStruct):
segments.append(fval._repr(reentrances, reentrance_ids))
else:
- segments.append("%s" % repr(fval))
-
- return "%s[%s]" % (prefix, ", ".join(segments))
+ segments.append('%s' % unicode_repr(fval))
+ return '%s[%s]' % (prefix, ', '.join(segments))
######################################################################
# Variables & Bindings
######################################################################
-
-def substitute_bindings(fstruct, bindings, fs_class="default"):
+def substitute_bindings(fstruct, bindings, fs_class='default'):
"""
Return the feature structure that is obtained by replacing each
variable bound by ``bindings`` with its binding. If a variable is
:type bindings: dict(Variable -> any)
:param bindings: A dictionary mapping from variables to values.
"""
- if fs_class == "default":
- fs_class = _default_fs_class(fstruct)
+ if fs_class == 'default': fs_class = _default_fs_class(fstruct)
fstruct = copy.deepcopy(fstruct)
_substitute_bindings(fstruct, bindings, fs_class, set())
return fstruct
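# Sketch: with bindings = {Variable('?x'): 'dog'},
# substitute_bindings(FeatStruct('[A=?x, B=?y]'), bindings) returns
# [A='dog', B=?y] -- bound variables are replaced, unbound ones are kept.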
-
def _substitute_bindings(fstruct, bindings, fs_class, visited):
# Visit each node only once:
- if id(fstruct) in visited:
- return
+ if id(fstruct) in visited: return
visited.add(id(fstruct))
- if _is_mapping(fstruct):
- items = fstruct.items()
- elif _is_sequence(fstruct):
- items = enumerate(fstruct)
- else:
- raise ValueError("Expected mapping or sequence")
+ if _is_mapping(fstruct): items = fstruct.items()
+ elif _is_sequence(fstruct): items = enumerate(fstruct)
+ else: raise ValueError('Expected mapping or sequence')
for (fname, fval) in items:
- while isinstance(fval, Variable) and fval in bindings:
+ while (isinstance(fval, Variable) and fval in bindings):
fval = fstruct[fname] = bindings[fval]
if isinstance(fval, fs_class):
_substitute_bindings(fval, bindings, fs_class, visited)
elif isinstance(fval, SubstituteBindingsI):
fstruct[fname] = fval.substitute_bindings(bindings)
-
-def retract_bindings(fstruct, bindings, fs_class="default"):
+def retract_bindings(fstruct, bindings, fs_class='default'):
"""
Return the feature structure that is obtained by replacing each
feature structure value that is bound by ``bindings`` with the
values in ``bindings`` may be modified if they are contained in
``fstruct``.
"""
- if fs_class == "default":
- fs_class = _default_fs_class(fstruct)
+ if fs_class == 'default': fs_class = _default_fs_class(fstruct)
(fstruct, new_bindings) = copy.deepcopy((fstruct, bindings))
bindings.update(new_bindings)
- inv_bindings = dict((id(val), var) for (var, val) in bindings.items())
+ inv_bindings = dict((id(val),var) for (var,val) in bindings.items())
_retract_bindings(fstruct, inv_bindings, fs_class, set())
return fstruct
-
def _retract_bindings(fstruct, inv_bindings, fs_class, visited):
# Visit each node only once:
- if id(fstruct) in visited:
- return
+ if id(fstruct) in visited: return
visited.add(id(fstruct))
- if _is_mapping(fstruct):
- items = fstruct.items()
- elif _is_sequence(fstruct):
- items = enumerate(fstruct)
- else:
- raise ValueError("Expected mapping or sequence")
+ if _is_mapping(fstruct): items = fstruct.items()
+ elif _is_sequence(fstruct): items = enumerate(fstruct)
+ else: raise ValueError('Expected mapping or sequence')
for (fname, fval) in items:
if isinstance(fval, fs_class):
if id(fval) in inv_bindings:
_retract_bindings(fval, inv_bindings, fs_class, visited)
-def find_variables(fstruct, fs_class="default"):
+def find_variables(fstruct, fs_class='default'):
"""
:return: The set of variables used by this feature structure.
:rtype: set(Variable)
"""
- if fs_class == "default":
- fs_class = _default_fs_class(fstruct)
+ if fs_class == 'default': fs_class = _default_fs_class(fstruct)
return _variables(fstruct, set(), fs_class, set())
-
def _variables(fstruct, vars, fs_class, visited):
# Visit each node only once:
- if id(fstruct) in visited:
- return
+ if id(fstruct) in visited: return
visited.add(id(fstruct))
- if _is_mapping(fstruct):
- items = fstruct.items()
- elif _is_sequence(fstruct):
- items = enumerate(fstruct)
- else:
- raise ValueError("Expected mapping or sequence")
+ if _is_mapping(fstruct): items = fstruct.items()
+ elif _is_sequence(fstruct): items = enumerate(fstruct)
+ else: raise ValueError('Expected mapping or sequence')
for (fname, fval) in items:
if isinstance(fval, Variable):
vars.add(fval)
vars.update(fval.variables())
return vars
-
-def rename_variables(
- fstruct, vars=None, used_vars=(), new_vars=None, fs_class="default"
-):
+def rename_variables(fstruct, vars=None, used_vars=(), new_vars=None,
+ fs_class='default'):
"""
Return the feature structure that is obtained by replacing
any of this feature structure's variables that are in ``vars``
If new_vars is not specified, then an empty dictionary is used.
"""
- if fs_class == "default":
- fs_class = _default_fs_class(fstruct)
+ if fs_class == 'default': fs_class = _default_fs_class(fstruct)
# Default values:
- if new_vars is None:
- new_vars = {}
- if vars is None:
- vars = find_variables(fstruct, fs_class)
- else:
- vars = set(vars)
+ if new_vars is None: new_vars = {}
+ if vars is None: vars = find_variables(fstruct, fs_class)
+ else: vars = set(vars)
# Add our own variables to used_vars.
used_vars = find_variables(fstruct, fs_class).union(used_vars)
# Copy ourselves, and rename variables in the copy.
- return _rename_variables(
- copy.deepcopy(fstruct), vars, used_vars, new_vars, fs_class, set()
- )
-
+ return _rename_variables(copy.deepcopy(fstruct), vars, used_vars,
+ new_vars, fs_class, set())
def _rename_variables(fstruct, vars, used_vars, new_vars, fs_class, visited):
- if id(fstruct) in visited:
- return
+ if id(fstruct) in visited: return
visited.add(id(fstruct))
- if _is_mapping(fstruct):
- items = fstruct.items()
- elif _is_sequence(fstruct):
- items = enumerate(fstruct)
- else:
- raise ValueError("Expected mapping or sequence")
+ if _is_mapping(fstruct): items = fstruct.items()
+ elif _is_sequence(fstruct): items = enumerate(fstruct)
+ else: raise ValueError('Expected mapping or sequence')
for (fname, fval) in items:
if isinstance(fval, Variable):
# If it's in new_vars, then rebind it.
fstruct[fname] = new_vars[fval]
used_vars.add(new_vars[fval])
elif isinstance(fval, fs_class):
- _rename_variables(fval, vars, used_vars, new_vars, fs_class, visited)
+ _rename_variables(fval, vars, used_vars, new_vars,
+ fs_class, visited)
elif isinstance(fval, SubstituteBindingsI):
# Pick new names for any variables in `vars`
for var in fval.variables():
fstruct[fname] = fval.substitute_bindings(new_vars)
return fstruct
-
def _rename_variable(var, used_vars):
- name, n = re.sub("\d+$", "", var.name), 2
- if not name:
- name = "?"
- while Variable("%s%s" % (name, n)) in used_vars:
- n += 1
- return Variable("%s%s" % (name, n))
-
+ name, n = re.sub('\d+$', '', var.name), 2
+ if not name: name = '?'
+ while Variable('%s%s' % (name, n)) in used_vars: n += 1
+ return Variable('%s%s' % (name, n))
-def remove_variables(fstruct, fs_class="default"):
+def remove_variables(fstruct, fs_class='default'):
"""
:rtype: FeatStruct
:return: The feature structure that is obtained by deleting
all features whose values are ``Variables``.
"""
- if fs_class == "default":
- fs_class = _default_fs_class(fstruct)
+ if fs_class == 'default': fs_class = _default_fs_class(fstruct)
return _remove_variables(copy.deepcopy(fstruct), fs_class, set())
-
def _remove_variables(fstruct, fs_class, visited):
if id(fstruct) in visited:
return
elif _is_sequence(fstruct):
items = list(enumerate(fstruct))
else:
- raise ValueError("Expected mapping or sequence")
+ raise ValueError('Expected mapping or sequence')
for (fname, fval) in items:
if isinstance(fval, Variable):
# Unification
######################################################################
-
-
+@python_2_unicode_compatible
class _UnificationFailure(object):
def __repr__(self):
- return "nltk.featstruct.UnificationFailure"
-
+ return 'nltk.featstruct.UnificationFailure'
UnificationFailure = _UnificationFailure()
"""A unique value used to indicate unification failure. It can be
returned by ``Feature.unify_base_values()`` or by custom ``fail()``
functions to indicate that unification should fail."""
-
# The basic unification algorithm:
# 1. Make copies of self and other (preserving reentrance)
# 2. Destructively unify self and other
# 3. Apply forward pointers, to preserve reentrance.
# 4. Replace bound variables with their values.
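# For example: unify(FeatStruct('[A=?x, B=?x]'), FeatStruct('[A=1]'))
# yields [A=1, B=1]; step 2 binds ?x to 1, and step 4 substitutes that
# binding everywhere ?x occurred.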
-def unify(
- fstruct1,
- fstruct2,
- bindings=None,
- trace=False,
- fail=None,
- rename_vars=True,
- fs_class="default",
-):
+def unify(fstruct1, fstruct2, bindings=None, trace=False,
+ fail=None, rename_vars=True, fs_class='default'):
"""
Unify ``fstruct1`` with ``fstruct2``, and return the resulting feature
structure. This unified feature structure is the minimal
"""
# Decide which class(es) will be treated as feature structures,
# for the purposes of unification.
- if fs_class == "default":
+ if fs_class == 'default':
fs_class = _default_fs_class(fstruct1)
if _default_fs_class(fstruct2) != fs_class:
- raise ValueError(
- "Mixing FeatStruct objects with Python "
- "dicts and lists is not supported."
- )
+ raise ValueError("Mixing FeatStruct objects with Python "
+ "dicts and lists is not supported.")
assert isinstance(fstruct1, fs_class)
assert isinstance(fstruct2, fs_class)
# If bindings are unspecified, use an empty set of bindings.
- user_bindings = bindings is not None
- if bindings is None:
- bindings = {}
+ user_bindings = (bindings is not None)
+ if bindings is None: bindings = {}
# Make copies of fstruct1 and fstruct2 (since the unification
# algorithm is destructive). Do it all at once, to preserve
# reentrance links between fstruct1 and fstruct2. Copy bindings
# as well, in case there are any bound vars that contain parts
# of fstruct1 or fstruct2.
- (fstruct1copy, fstruct2copy, bindings_copy) = copy.deepcopy(
- (fstruct1, fstruct2, bindings)
- )
+ (fstruct1copy, fstruct2copy, bindings_copy) = (
+ copy.deepcopy((fstruct1, fstruct2, bindings)))
# Copy the bindings back to the original bindings dict.
bindings.update(bindings_copy)
# Do the actual unification. If it fails, return None.
forward = {}
- if trace:
- _trace_unify_start((), fstruct1copy, fstruct2copy)
- try:
- result = _destructively_unify(
- fstruct1copy, fstruct2copy, bindings, forward, trace, fail, fs_class, ()
- )
- except _UnificationFailureError:
- return None
+ if trace: _trace_unify_start((), fstruct1copy, fstruct2copy)
+ try: result = _destructively_unify(fstruct1copy, fstruct2copy, bindings,
+ forward, trace, fail, fs_class, ())
+ except _UnificationFailureError: return None
# _destructively_unify might return UnificationFailure, e.g. if we
# tried to unify a mapping with a sequence.
if result is UnificationFailure:
- if fail is None:
- return None
- else:
- return fail(fstruct1copy, fstruct2copy, ())
+ if fail is None: return None
+ else: return fail(fstruct1copy, fstruct2copy, ())
# Replace any feature structure that has a forward pointer
# with the target of its forward pointer.
result = _apply_forwards(result, forward, fs_class, set())
- if user_bindings:
- _apply_forwards_to_bindings(forward, bindings)
+ if user_bindings: _apply_forwards_to_bindings(forward, bindings)
# Replace bound vars with values.
_resolve_aliases(bindings)
_substitute_bindings(result, bindings, fs_class, set())
# Return the result.
- if trace:
- _trace_unify_succeed((), result)
- if trace:
- _trace_bindings((), bindings)
+ if trace: _trace_unify_succeed((), result)
+ if trace: _trace_bindings((), bindings)
return result
-
class _UnificationFailureError(Exception):
"""An exception that is used by ``_destructively_unify`` to abort
unification when a failure is encountered."""
-
-def _destructively_unify(
- fstruct1, fstruct2, bindings, forward, trace, fail, fs_class, path
-):
+def _destructively_unify(fstruct1, fstruct2, bindings, forward,
+ trace, fail, fs_class, path):
"""
Attempt to unify ``fstruct1`` and ``fstruct2`` by modifying them
in-place. If the unification succeeds, then ``fstruct1`` will
# Note: this, together with the forward pointers, ensures
# that unification will terminate even for cyclic structures.
if fstruct1 is fstruct2:
- if trace:
- _trace_unify_identity(path, fstruct1)
+ if trace: _trace_unify_identity(path, fstruct1)
return fstruct1
# Set fstruct2's forward pointer to point to fstruct1; this makes
# Unifying two mappings:
if _is_mapping(fstruct1) and _is_mapping(fstruct2):
for fname in fstruct1:
- if getattr(fname, "default", None) is not None:
+ if getattr(fname, 'default', None) is not None:
fstruct2.setdefault(fname, fname.default)
for fname in fstruct2:
- if getattr(fname, "default", None) is not None:
+ if getattr(fname, 'default', None) is not None:
fstruct1.setdefault(fname, fname.default)
# Unify any values that are defined in both fstruct1 and
for fname, fval2 in sorted(fstruct2.items()):
if fname in fstruct1:
fstruct1[fname] = _unify_feature_values(
- fname,
- fstruct1[fname],
- fval2,
- bindings,
- forward,
- trace,
- fail,
- fs_class,
- path + (fname,),
- )
+ fname, fstruct1[fname], fval2, bindings,
+ forward, trace, fail, fs_class, path+(fname,))
else:
fstruct1[fname] = fval2
- return fstruct1 # Contains the unified value.
+ return fstruct1 # Contains the unified value.
# Unifying two sequences:
elif _is_sequence(fstruct1) and _is_sequence(fstruct2):
# Unify corresponding values in fstruct1 and fstruct2.
for findex in range(len(fstruct1)):
fstruct1[findex] = _unify_feature_values(
- findex,
- fstruct1[findex],
- fstruct2[findex],
- bindings,
- forward,
- trace,
- fail,
- fs_class,
- path + (findex,),
- )
-
- return fstruct1 # Contains the unified value.
+ findex, fstruct1[findex], fstruct2[findex], bindings,
+ forward, trace, fail, fs_class, path+(findex,))
+
+ return fstruct1 # Contains the unified value.
# Unifying sequence & mapping: fail. The failure function
# doesn't get a chance to recover in this case.
- elif (_is_sequence(fstruct1) or _is_mapping(fstruct1)) and (
- _is_sequence(fstruct2) or _is_mapping(fstruct2)
- ):
+ elif ((_is_sequence(fstruct1) or _is_mapping(fstruct1)) and
+ (_is_sequence(fstruct2) or _is_mapping(fstruct2))):
return UnificationFailure
# Unifying anything else: not allowed!
- raise TypeError("Expected mappings or sequences")
-
+ raise TypeError('Expected mappings or sequences')
-def _unify_feature_values(
- fname, fval1, fval2, bindings, forward, trace, fail, fs_class, fpath
-):
+def _unify_feature_values(fname, fval1, fval2, bindings, forward,
+ trace, fail, fs_class, fpath):
"""
Attempt to unify ``fval1`` and ``fval2``, and return the
resulting unified value. The method of unification will depend on
5. If they're both base values, then unify them. By default,
this will succeed if they are equal, and fail otherwise.
"""
- if trace:
- _trace_unify_start(fpath, fval1, fval2)
+ if trace: _trace_unify_start(fpath, fval1, fval2)
# Look up the "canonical" copy of fval1 and fval2
- while id(fval1) in forward:
- fval1 = forward[id(fval1)]
- while id(fval2) in forward:
- fval2 = forward[id(fval2)]
+ while id(fval1) in forward: fval1 = forward[id(fval1)]
+ while id(fval2) in forward: fval2 = forward[id(fval2)]
# If fval1 or fval2 is a bound variable, then
# replace it by the variable's bound value. This
# Case 1: Two feature structures (recursive case)
if isinstance(fval1, fs_class) and isinstance(fval2, fs_class):
- result = _destructively_unify(
- fval1, fval2, bindings, forward, trace, fail, fs_class, fpath
- )
+ result = _destructively_unify(fval1, fval2, bindings, forward,
+ trace, fail, fs_class, fpath)
# Case 2: Two unbound variables (create alias)
- elif isinstance(fval1, Variable) and isinstance(fval2, Variable):
- if fval1 != fval2:
- bindings[fval2] = fval1
+ elif (isinstance(fval1, Variable) and
+ isinstance(fval2, Variable)):
+ if fval1 != fval2: bindings[fval2] = fval1
result = fval1
# Case 3: An unbound variable and a value (bind)
elif isinstance(fval1, CustomFeatureValue):
result = fval1.unify(fval2)
# Sanity check: unify value should be symmetric
- if isinstance(fval2, CustomFeatureValue) and result != fval2.unify(fval1):
+ if (isinstance(fval2, CustomFeatureValue) and
+ result != fval2.unify(fval1)):
raise AssertionError(
- "CustomFeatureValue objects %r and %r disagree "
- "about unification value: %r vs. %r"
- % (fval1, fval2, result, fval2.unify(fval1))
- )
+ 'CustomFeatureValue objects %r and %r disagree '
+ 'about unification value: %r vs. %r' %
+ (fval1, fval2, result, fval2.unify(fval1)))
elif isinstance(fval2, CustomFeatureValue):
result = fval2.unify(fval1)
# Case 5c: Simple values -- check if they're equal.
# If unification failed, call the failure function; it
# might decide to continue anyway.
if result is UnificationFailure:
- if fail is not None:
- result = fail(fval1, fval2, fpath)
- if trace:
- _trace_unify_fail(fpath[:-1], result)
+ if fail is not None: result = fail(fval1, fval2, fpath)
+ if trace: _trace_unify_fail(fpath[:-1], result)
if result is UnificationFailure:
raise _UnificationFailureError
if isinstance(result, fs_class):
result = _apply_forwards(result, forward, fs_class, set())
- if trace:
- _trace_unify_succeed(fpath, result)
+ if trace: _trace_unify_succeed(fpath, result)
if trace and isinstance(result, fs_class):
_trace_bindings(fpath, bindings)
return result
-
def _apply_forwards_to_bindings(forward, bindings):
"""
Replace any feature structure that has a forward pointer with
value = forward[id(value)]
bindings[var] = value
-
def _apply_forwards(fstruct, forward, fs_class, visited):
"""
Replace any feature structure that has a forward pointer with
the target of its forward pointer (to preserve reentrancy).
"""
# Follow our own forward pointers (if any)
- while id(fstruct) in forward:
- fstruct = forward[id(fstruct)]
+ while id(fstruct) in forward: fstruct = forward[id(fstruct)]
# Visit each node only once:
- if id(fstruct) in visited:
- return
+ if id(fstruct) in visited: return
visited.add(id(fstruct))
- if _is_mapping(fstruct):
- items = fstruct.items()
- elif _is_sequence(fstruct):
- items = enumerate(fstruct)
- else:
- raise ValueError("Expected mapping or sequence")
+ if _is_mapping(fstruct): items = fstruct.items()
+ elif _is_sequence(fstruct): items = enumerate(fstruct)
+ else: raise ValueError('Expected mapping or sequence')
for fname, fval in items:
if isinstance(fval, fs_class):
# Replace w/ forwarded value.
return fstruct
-
def _resolve_aliases(bindings):
"""
Replace any bound aliased vars with their binding; and replace
while isinstance(value, Variable) and value in bindings:
value = bindings[var] = bindings[value]
-
def _trace_unify_start(path, fval1, fval2):
if path == ():
- print("\nUnification trace:")
+ print('\nUnification trace:')
else:
- fullname = ".".join("%s" % n for n in path)
- print(" " + "| " * (len(path) - 1) + "|")
- print(" " + "| " * (len(path) - 1) + "| Unify feature: %s" % fullname)
- print(" " + "| " * len(path) + " / " + _trace_valrepr(fval1))
- print(" " + "| " * len(path) + "|\\ " + _trace_valrepr(fval2))
-
-
+ fullname = '.'.join("%s" % n for n in path)
+ print(' '+'| '*(len(path)-1)+'|')
+ print(' '+'| '*(len(path)-1)+'| Unify feature: %s' % fullname)
+ print(' '+'| '*len(path)+' / '+_trace_valrepr(fval1))
+ print(' '+'| '*len(path)+'|\\ '+_trace_valrepr(fval2))
def _trace_unify_identity(path, fval1):
- print(" " + "| " * len(path) + "|")
- print(" " + "| " * len(path) + "| (identical objects)")
- print(" " + "| " * len(path) + "|")
- print(" " + "| " * len(path) + "+-->" + repr(fval1))
-
-
+ print(' '+'| '*len(path)+'|')
+ print(' '+'| '*len(path)+'| (identical objects)')
+ print(' '+'| '*len(path)+'|')
+ print(' '+'| '*len(path)+'+-->'+unicode_repr(fval1))
def _trace_unify_fail(path, result):
- if result is UnificationFailure:
- resume = ""
- else:
- resume = " (nonfatal)"
- print(" " + "| " * len(path) + "| |")
- print(" " + "X " * len(path) + "X X <-- FAIL" + resume)
-
-
+ if result is UnificationFailure: resume = ''
+ else: resume = ' (nonfatal)'
+ print(' '+'| '*len(path)+'| |')
+ print(' '+'X '*len(path)+'X X <-- FAIL'+resume)
def _trace_unify_succeed(path, fval1):
# Print the result.
- print(" " + "| " * len(path) + "|")
- print(" " + "| " * len(path) + "+-->" + repr(fval1))
-
-
+ print(' '+'| '*len(path)+'|')
+ print(' '+'| '*len(path)+'+-->'+unicode_repr(fval1))
def _trace_bindings(path, bindings):
# Print the bindings (if any).
if len(bindings) > 0:
- binditems = sorted(bindings.items(), key=lambda v: v[0].name)
- bindstr = "{%s}" % ", ".join(
- "%s: %s" % (var, _trace_valrepr(val)) for (var, val) in binditems
- )
- print(" " + "| " * len(path) + " Bindings: " + bindstr)
-
-
+ binditems = sorted(bindings.items(), key=lambda v:v[0].name)
+ bindstr = '{%s}' % ', '.join(
+ '%s: %s' % (var, _trace_valrepr(val))
+ for (var, val) in binditems)
+ print(' '+'| '*len(path)+' Bindings: '+bindstr)
def _trace_valrepr(val):
if isinstance(val, Variable):
- return "%s" % val
+ return '%s' % val
else:
- return "%s" % repr(val)
-
+ return '%s' % unicode_repr(val)
def subsumes(fstruct1, fstruct2):
"""
"""
return fstruct2 == unify(fstruct1, fstruct2)
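# Sketch: FeatStruct('[A=1]') subsumes FeatStruct('[A=1, B=2]'), since
# unifying them returns a structure equal to the second argument; the
# reverse direction fails this test.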
-
def conflicts(fstruct1, fstruct2, trace=0):
"""
Return a list of the feature paths of all features which are
:rtype: list(tuple)
"""
conflict_list = []
-
def add_conflict(fval1, fval2, path):
conflict_list.append(path)
return fval1
-
unify(fstruct1, fstruct2, fail=add_conflict, trace=trace)
return conflict_list
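# Sketch: conflicts(FeatStruct('[A=1, B=2]'), FeatStruct('[A=1, B=3]'))
# returns [('B',)] -- add_conflict records the path of each clash and
# returns fval1, so unification continues instead of aborting.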
-
######################################################################
# Helper Functions
######################################################################
-
def _is_mapping(v):
- return hasattr(v, "__contains__") and hasattr(v, "keys")
-
+ return hasattr(v, '__contains__') and hasattr(v, 'keys')
def _is_sequence(v):
- return (
- hasattr(v, "__iter__")
- and hasattr(v, "__len__")
- and not isinstance(v, str)
- )
-
+ return (hasattr(v, '__iter__') and hasattr(v, '__len__') and
+ not isinstance(v, string_types))
def _default_fs_class(obj):
- if isinstance(obj, FeatStruct):
- return FeatStruct
- if isinstance(obj, (dict, list)):
- return (dict, list)
+ if isinstance(obj, FeatStruct): return FeatStruct
+ if isinstance(obj, (dict, list)): return (dict, list)
else:
- raise ValueError(
- "To unify objects of type %s, you must specify "
- "fs_class explicitly." % obj.__class__.__name__
- )
-
-
+ raise ValueError('To unify objects of type %s, you must specify '
+ 'fs_class explicitly.' % obj.__class__.__name__)
######################################################################
# FeatureValueSet & FeatureValueTuple
######################################################################
-
class SubstituteBindingsSequence(SubstituteBindingsI):
"""
A mixin class for sequence classes that distributes variables() and
substitute_bindings() over the object's elements.
"""
-
def variables(self):
- return [elt for elt in self if isinstance(elt, Variable)] + sum(
- [
- list(elt.variables())
- for elt in self
- if isinstance(elt, SubstituteBindingsI)
- ],
- [],
- )
+ return ([elt for elt in self if isinstance(elt, Variable)] +
+ sum([list(elt.variables()) for elt in self
+ if isinstance(elt, SubstituteBindingsI)], []))
def substitute_bindings(self, bindings):
return self.__class__([self.subst(v, bindings) for v in self])
else:
return bindings.get(v, v)
-
-
+@python_2_unicode_compatible
class FeatureValueTuple(SubstituteBindingsSequence, tuple):
"""
A base feature value that is a tuple of other base feature values.
variable substitutions will be propagated to the elements
contained by the tuple. A ``FeatureValueTuple`` is immutable.
"""
-
- def __repr__(self): # [xx] really use %s here?
- if len(self) == 0:
- return "()"
- return "(%s)" % ", ".join("%s" % (b,) for b in self)
-
+ def __repr__(self): # [xx] really use %s here?
+ if len(self) == 0: return '()'
+ return '(%s)' % ', '.join('%s' % (b,) for b in self)
+@python_2_unicode_compatible
class FeatureValueSet(SubstituteBindingsSequence, frozenset):
"""
A base feature value that is a set of other base feature values.
variable substitutions will be propagated to the elements
contained by the set. A ``FeatureValueSet`` is immutable.
"""
-
- def __repr__(self): # [xx] really use %s here?
- if len(self) == 0:
- return "{/}" # distinguish from dict.
+ def __repr__(self): # [xx] really use %s here?
+ if len(self) == 0: return '{/}' # distinguish from dict.
# n.b., we sort the string reprs of our elements, to ensure
# that our own repr is deterministic.
- return "{%s}" % ", ".join(sorted("%s" % (b,) for b in self))
-
+ return '{%s}' % ', '.join(sorted('%s' % (b,) for b in self))
__str__ = __repr__
-
-
+@python_2_unicode_compatible
class FeatureValueUnion(SubstituteBindingsSequence, frozenset):
"""
A base feature value that represents the union of two or more
``FeatureValueSet`` or ``Variable``.
"""
-
def __new__(cls, values):
# If values contains FeatureValueUnions, then collapse them.
values = _flatten(values, FeatureValueUnion)
# n.b., we sort the string reprs of our elements, to ensure
# that our own repr is deterministic. also, note that len(self)
# is guaranteed to be 2 or more.
- return "{%s}" % "+".join(sorted("%s" % (b,) for b in self))
-
-
+ return '{%s}' % '+'.join(sorted('%s' % (b,) for b in self))
+@python_2_unicode_compatible
class FeatureValueConcat(SubstituteBindingsSequence, tuple):
"""
A base feature value that represents the concatenation of two or
more ``FeatureValueTuple`` or ``Variable``.
"""
-
def __new__(cls, values):
# If values contains FeatureValueConcats, then collapse them.
values = _flatten(values, FeatureValueConcat)
def __repr__(self):
# n.b.: len(self) is guaranteed to be 2 or more.
- return "(%s)" % "+".join("%s" % (b,) for b in self)
+ return '(%s)' % '+'.join('%s' % (b,) for b in self)
def _flatten(lst, cls):
"""
result = []
for elt in lst:
- if isinstance(elt, cls):
- result.extend(elt)
- else:
- result.append(elt)
+ if isinstance(elt, cls): result.extend(elt)
+ else: result.append(elt)
return result
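# Sketch: _flatten([FeatureValueTuple((1, 2)), 3], FeatureValueTuple)
# returns [1, 2, 3] -- nested instances of cls are spliced in, anything
# else is appended unchanged.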
-
######################################################################
# Specialized Features
######################################################################
-
@total_ordering
-
+@python_2_unicode_compatible
class Feature(object):
"""
A feature identifier that's specialized to put additional
constraints, default values, etc.
"""
-
def __init__(self, name, default=None, display=None):
- assert display in (None, "prefix", "slash")
+ assert display in (None, 'prefix', 'slash')
- self._name = name # [xx] rename to .identifier?
- self._default = default # [xx] not implemented yet.
+ self._name = name # [xx] rename to .identifier?
+ self._default = default # [xx] not implemented yet.
self._display = display
- if self._display == "prefix":
+ if self._display == 'prefix':
self._sortkey = (-1, self._name)
- elif self._display == "slash":
+ elif self._display == 'slash':
self._sortkey = (1, self._name)
else:
self._sortkey = (0, self._name)
return self._display
def __repr__(self):
- return "*%s*" % self.name
+ return '*%s*' % self.name
def __lt__(self, other):
- if isinstance(other, str):
+ if isinstance(other, string_types):
return True
if not isinstance(other, Feature):
raise_unorderable_types("<", self, other)
def __hash__(self):
return hash(self._name)
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# These can be overridden by subclasses:
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
def read_value(self, s, position, reentrances, parser):
return parser.read_value(s, position, reentrances)
If possible, return a single value. If not, return
the value ``UnificationFailure``.
"""
- if fval1 == fval2:
- return fval1
- else:
- return UnificationFailure
+ if fval1 == fval2: return fval1
+ else: return UnificationFailure
class SlashFeature(Feature):
def read_value(self, s, position, reentrances, parser):
return parser.read_partial(s, position, reentrances)
-
class RangeFeature(Feature):
- RANGE_RE = re.compile("(-?\d+):(-?\d+)")
-
+ RANGE_RE = re.compile('(-?\d+):(-?\d+)')
def read_value(self, s, position, reentrances, parser):
m = self.RANGE_RE.match(s, position)
- if not m:
- raise ValueError("range", position)
+ if not m: raise ValueError('range', position)
return (int(m.group(1)), int(m.group(2))), m.end()
def unify_base_values(self, fval1, fval2, bindings):
- if fval1 is None:
- return fval2
- if fval2 is None:
- return fval1
+ if fval1 is None: return fval2
+ if fval2 is None: return fval1
rng = max(fval1[0], fval2[0]), min(fval1[1], fval2[1])
- if rng[1] < rng[0]:
- return UnificationFailure
+ if rng[1] < rng[0]: return UnificationFailure
return rng
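# Sketch: two ranges unify to their intersection, so (1, 5) and (3, 8)
# yield (3, 5), while (1, 2) and (4, 6) give rng == (4, 2), where
# rng[1] < rng[0], so unification fails.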
-
-SLASH = SlashFeature("slash", default=False, display="slash")
-TYPE = Feature("type", display="prefix")
-
+SLASH = SlashFeature('slash', default=False, display='slash')
+TYPE = Feature('type', display='prefix')
######################################################################
# Specialized Feature Values
######################################################################
-
@total_ordering
class CustomFeatureValue(object):
"""
Subclasses must define ``unify()``, ``__eq__()`` and ``__lt__()``.
Subclasses may also wish to define ``__hash__()``.
"""
-
def unify(self, other):
"""
If this base value unifies with ``other``, then return the
unified value. Otherwise, return ``UnificationFailure``.
"""
- raise NotImplementedError("abstract base class")
+ raise NotImplementedError('abstract base class')
def __eq__(self, other):
- raise NotImplementedError("abstract base class")
+ raise NotImplementedError('abstract base class')
def __ne__(self, other):
return not self == other
def __lt__(self, other):
- raise NotImplementedError("abstract base class")
+ raise NotImplementedError('abstract base class')
def __hash__(self):
- raise TypeError("%s objects or unhashable" % self.__class__.__name__)
-
+ raise TypeError('%s objects or unhashable' % self.__class__.__name__)
######################################################################
# Feature Structure Reader
######################################################################
-
class FeatStructReader(object):
- def __init__(
- self,
- features=(SLASH, TYPE),
- fdict_class=FeatStruct,
- flist_class=FeatList,
- logic_parser=None,
- ):
- self._features = dict((f.name, f) for f in features)
+ def __init__(self, features=(SLASH, TYPE), fdict_class=FeatStruct,
+ flist_class=FeatList, logic_parser=None):
+ self._features = dict((f.name,f) for f in features)
self._fdict_class = fdict_class
self._flist_class = flist_class
self._prefix_feature = None
self._slash_feature = None
for feature in features:
- if feature.display == "slash":
+ if feature.display == 'slash':
if self._slash_feature:
- raise ValueError("Multiple features w/ display=slash")
+ raise ValueError('Multiple features w/ display=slash')
self._slash_feature = feature
- if feature.display == "prefix":
+ if feature.display == 'prefix':
if self._prefix_feature:
- raise ValueError("Multiple features w/ display=prefix")
+ raise ValueError('Multiple features w/ display=prefix')
self._prefix_feature = feature
- self._features_with_defaults = [
- feature for feature in features if feature.default is not None
- ]
+ self._features_with_defaults = [feature for feature in features
+ if feature.default is not None]
if logic_parser is None:
logic_parser = LogicParser()
self._logic_parser = logic_parser
s = s.strip()
value, position = self.read_partial(s, 0, {}, fstruct)
if position != len(s):
- self._error(s, "end of string", position)
+ self._error(s, 'end of string', position)
return value
- _START_FSTRUCT_RE = re.compile(r"\s*(?:\((\d+)\)\s*)?(\??[\w-]+)?(\[)")
- _END_FSTRUCT_RE = re.compile(r"\s*]\s*")
- _SLASH_RE = re.compile(r"/")
+ _START_FSTRUCT_RE = re.compile(r'\s*(?:\((\d+)\)\s*)?(\??[\w-]+)?(\[)')
+ _END_FSTRUCT_RE = re.compile(r'\s*]\s*')
+ _SLASH_RE = re.compile(r'/')
_FEATURE_NAME_RE = re.compile(r'\s*([+-]?)([^\s\(\)<>"\'\-=\[\],]+)\s*')
- _REENTRANCE_RE = re.compile(r"\s*->\s*")
- _TARGET_RE = re.compile(r"\s*\((\d+)\)\s*")
- _ASSIGN_RE = re.compile(r"\s*=\s*")
- _COMMA_RE = re.compile(r"\s*,\s*")
- _BARE_PREFIX_RE = re.compile(r"\s*(?:\((\d+)\)\s*)?(\??[\w-]+\s*)()")
+ _REENTRANCE_RE = re.compile(r'\s*->\s*')
+ _TARGET_RE = re.compile(r'\s*\((\d+)\)\s*')
+ _ASSIGN_RE = re.compile(r'\s*=\s*')
+ _COMMA_RE = re.compile(r'\s*,\s*')
+ _BARE_PREFIX_RE = re.compile(r'\s*(?:\((\d+)\)\s*)?(\??[\w-]+\s*)()')
# This one is used to distinguish fdicts from flists:
- _START_FDICT_RE = re.compile(
- r"(%s)|(%s\s*(%s\s*(=|->)|[+-]%s|\]))"
- % (
- _BARE_PREFIX_RE.pattern,
- _START_FSTRUCT_RE.pattern,
- _FEATURE_NAME_RE.pattern,
- _FEATURE_NAME_RE.pattern,
- )
- )
+ _START_FDICT_RE = re.compile(r'(%s)|(%s\s*(%s\s*(=|->)|[+-]%s|\]))' % (
+ _BARE_PREFIX_RE.pattern, _START_FSTRUCT_RE.pattern,
+ _FEATURE_NAME_RE.pattern, _FEATURE_NAME_RE.pattern))
def read_partial(self, s, position=0, reentrances=None, fstruct=None):
"""
parsing and the position where the parsed feature structure ends.
:rtype: tuple(FeatStruct, int)
"""
- if reentrances is None:
- reentrances = {}
+ if reentrances is None: reentrances = {}
try:
return self._read_partial(s, position, reentrances, fstruct)
except ValueError as e:
- if len(e.args) != 2:
- raise
+ if len(e.args) != 2: raise
self._error(s, *e.args)
def _read_partial(self, s, position, reentrances, fstruct=None):
if not match:
match = self._BARE_PREFIX_RE.match(s, position)
if not match:
- raise ValueError("open bracket or identifier", position)
+ raise ValueError('open bracket or identifier', position)
position = match.end()
# If there was an identifier, record it.
if match.group(1):
identifier = match.group(1)
if identifier in reentrances:
- raise ValueError("new identifier", match.start(1))
+ raise ValueError('new identifier', match.start(1))
reentrances[identifier] = fstruct
if isinstance(fstruct, FeatDict):
fstruct.clear()
- return self._read_partial_featdict(s, position, match, reentrances, fstruct)
+ return self._read_partial_featdict(s, position, match,
+ reentrances, fstruct)
else:
del fstruct[:]
- return self._read_partial_featlist(s, position, match, reentrances, fstruct)
+ return self._read_partial_featlist(s, position, match,
+ reentrances, fstruct)
- def _read_partial_featlist(self, s, position, match, reentrances, fstruct):
+ def _read_partial_featlist(self, s, position, match,
+ reentrances, fstruct):
# Prefix features are not allowed:
- if match.group(2):
- raise ValueError("open bracket")
+ if match.group(2): raise ValueError('open bracket')
# Bare prefixes are not allowed:
- if not match.group(3):
- raise ValueError("open bracket")
+ if not match.group(3): raise ValueError('open bracket')
# Build a list of the features defined by the structure.
while position < len(s):
if match:
position = match.end()
match = self._TARGET_RE.match(s, position)
- if not match:
- raise ValueError("identifier", position)
+ if not match: raise ValueError('identifier', position)
target = match.group(1)
if target not in reentrances:
- raise ValueError("bound identifier", position)
+ raise ValueError('bound identifier', position)
position = match.end()
fstruct.append(reentrances[target])
# Anything else is a value.
else:
- value, position = self._read_value(0, s, position, reentrances)
+ value, position = (
+ self._read_value(0, s, position, reentrances))
fstruct.append(value)
# If there's a close bracket, handle it at the top of the loop.
# Otherwise, there should be a comma
match = self._COMMA_RE.match(s, position)
- if match is None:
- raise ValueError("comma", position)
+ if match is None: raise ValueError('comma', position)
position = match.end()
# We never saw a close bracket.
- raise ValueError("close bracket", position)
+ raise ValueError('close bracket', position)
- def _read_partial_featdict(self, s, position, match, reentrances, fstruct):
+ def _read_partial_featdict(self, s, position, match,
+ reentrances, fstruct):
# If there was a prefix feature, record it.
if match.group(2):
if self._prefix_feature is None:
- raise ValueError("open bracket or identifier", match.start(2))
+ raise ValueError('open bracket or identifier', match.start(2))
prefixval = match.group(2).strip()
- if prefixval.startswith("?"):
+ if prefixval.startswith('?'):
prefixval = Variable(prefixval)
fstruct[self._prefix_feature] = prefixval
# Get the feature name's name
match = self._FEATURE_NAME_RE.match(s, position)
- if match is None:
- raise ValueError("feature name", position)
+ if match is None: raise ValueError('feature name', position)
name = match.group(2)
position = match.end()
# Check if it's a special feature.
- if name[0] == "*" and name[-1] == "*":
+ if name[0] == '*' and name[-1] == '*':
name = self._features.get(name[1:-1])
if name is None:
- raise ValueError("known special feature", match.start(2))
+ raise ValueError('known special feature', match.start(2))
# Check if this feature has a value already.
if name in fstruct:
- raise ValueError("new name", match.start(2))
+ raise ValueError('new name', match.start(2))
# Boolean value ("+name" or "-name")
- if match.group(1) == "+":
- value = True
- if match.group(1) == "-":
- value = False
+ if match.group(1) == '+': value = True
+ if match.group(1) == '-': value = False
# Reentrance link ("-> (target)")
if value is None:
position = match.end()
match = self._TARGET_RE.match(s, position)
if not match:
- raise ValueError("identifier", position)
+ raise ValueError('identifier', position)
target = match.group(1)
if target not in reentrances:
- raise ValueError("bound identifier", position)
+ raise ValueError('bound identifier', position)
position = match.end()
value = reentrances[target]
match = self._ASSIGN_RE.match(s, position)
if match:
position = match.end()
- value, position = self._read_value(name, s, position, reentrances)
+ value, position = (
+ self._read_value(name, s, position, reentrances))
# None of the above: error.
else:
- raise ValueError("equals sign", position)
+ raise ValueError('equals sign', position)
# Store the value.
fstruct[name] = value
# Otherwise, there should be a comma
match = self._COMMA_RE.match(s, position)
- if match is None:
- raise ValueError("comma", position)
+ if match is None: raise ValueError('comma', position)
position = match.end()
# We never saw a close bracket.
- raise ValueError("close bracket", position)
+ raise ValueError('close bracket', position)
def _finalize(self, s, pos, reentrances, fstruct):
"""
v, pos = self._read_value(name, s, match.end(), reentrances)
fstruct[name] = v
## Add any default features. -- handle in unification instead?
- # for feature in self._features_with_defaults:
+ #for feature in self._features_with_defaults:
# fstruct.setdefault(feature, feature.default)
# Return the value.
return fstruct, pos
if match:
handler_func = getattr(self, handler)
return handler_func(s, position, reentrances, match)
- raise ValueError("value", position)
+ raise ValueError('value', position)
def _error(self, s, expected, position):
- lines = s.split("\n")
+ lines = s.split('\n')
while position > len(lines[0]):
- position -= len(lines.pop(0)) + 1 # +1 for the newline.
- estr = (
- "Error parsing feature structure\n "
- + lines[0]
- + "\n "
- + " " * position
- + "^ "
- + "Expected %s" % expected
- )
+ position -= len(lines.pop(0))+1 # +1 for the newline.
+ estr = ('Error parsing feature structure\n ' +
+ lines[0] + '\n ' + ' '*position + '^ ' +
+ 'Expected %s' % expected)
raise ValueError(estr)
- # ////////////////////////////////////////////////////////////
- # { Value Readers
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
+ #{ Value Readers
+ #////////////////////////////////////////////////////////////
#: A table indicating how feature values should be processed. Each
#: entry in the table is a pair (handler, regexp). The first entry
#: the string position where the value ended. (n.b.: order is
#: important here!)
VALUE_HANDLERS = [
- ("read_fstruct_value", _START_FSTRUCT_RE),
- ("read_var_value", re.compile(r"\?[a-zA-Z_][a-zA-Z0-9_]*")),
- ("read_str_value", re.compile("[uU]?[rR]?(['\"])")),
- ("read_int_value", re.compile(r"-?\d+")),
- ("read_sym_value", re.compile(r"[a-zA-Z_][a-zA-Z0-9_]*")),
- (
- "read_app_value",
- re.compile(r"<(app)\((\?[a-z][a-z]*)\s*," r"\s*(\?[a-z][a-z]*)\)>"),
- ),
- # ('read_logic_value', re.compile(r'<([^>]*)>')),
- # lazily match any character after '<' until we hit a '>' not preceded by '-'
- ("read_logic_value", re.compile(r"<(.*?)(?<!-)>")),
- ("read_set_value", re.compile(r"{")),
- ("read_tuple_value", re.compile(r"\(")),
- ]
+ ('read_fstruct_value', _START_FSTRUCT_RE),
+ ('read_var_value', re.compile(r'\?[a-zA-Z_][a-zA-Z0-9_]*')),
+ ('read_str_value', re.compile("[uU]?[rR]?(['\"])")),
+ ('read_int_value', re.compile(r'-?\d+')),
+ ('read_sym_value', re.compile(r'[a-zA-Z_][a-zA-Z0-9_]*')),
+ ('read_app_value', re.compile(r'<(app)\((\?[a-z][a-z]*)\s*,'
+ r'\s*(\?[a-z][a-z]*)\)>')),
+# ('read_logic_value', re.compile(r'<([^>]*)>')),
+ #lazily match any character after '<' until we hit a '>' not preceded by '-'
+ ('read_logic_value', re.compile(r'<(.*?)(?<!-)>')),
+ ('read_set_value', re.compile(r'{')),
+ ('read_tuple_value', re.compile(r'\(')),
+ ]
def read_fstruct_value(self, s, position, reentrances, match):
return self.read_partial(s, position, reentrances)
def read_var_value(self, s, position, reentrances, match):
return Variable(match.group()), match.end()
- _SYM_CONSTS = {"None": None, "True": True, "False": False}
-
+ _SYM_CONSTS = {'None':None, 'True':True, 'False':False}
def read_sym_value(self, s, position, reentrances, match):
val, end = match.group(), match.end()
return self._SYM_CONSTS.get(val, val), end
def read_app_value(self, s, position, reentrances, match):
"""Mainly included for backwards compat."""
- return self._logic_parser.parse("%s(%s)" % match.group(2, 3)), match.end()
+ return self._logic_parser.parse('%s(%s)' % match.group(2,3)), match.end()
def read_logic_value(self, s, position, reentrances, match):
try:
raise ValueError()
return expr, match.end()
except ValueError:
- raise ValueError("logic expression", match.start(1))
+ raise ValueError('logic expression', match.start(1))
def read_tuple_value(self, s, position, reentrances, match):
- return self._read_seq_value(
- s, position, reentrances, match, ")", FeatureValueTuple, FeatureValueConcat
- )
+ return self._read_seq_value(s, position, reentrances, match, ')',
+ FeatureValueTuple, FeatureValueConcat)
def read_set_value(self, s, position, reentrances, match):
- return self._read_seq_value(
- s, position, reentrances, match, "}", FeatureValueSet, FeatureValueUnion
- )
+ return self._read_seq_value(s, position, reentrances, match, '}',
+ FeatureValueSet, FeatureValueUnion)
- def _read_seq_value(
- self, s, position, reentrances, match, close_paren, seq_class, plus_class
- ):
+ def _read_seq_value(self, s, position, reentrances, match,
+ close_paren, seq_class, plus_class):
"""
Helper function used by read_tuple_value and read_set_value.
"""
cp = re.escape(close_paren)
position = match.end()
# Special syntax for empty tuples:
- m = re.compile(r"\s*/?\s*%s" % cp).match(s, position)
- if m:
- return seq_class(), m.end()
+ m = re.compile(r'\s*/?\s*%s' % cp).match(s, position)
+ if m: return seq_class(), m.end()
# Read values:
values = []
seen_plus = False
while True:
# Close paren: return value.
- m = re.compile(r"\s*%s" % cp).match(s, position)
+ m = re.compile(r'\s*%s' % cp).match(s, position)
if m:
- if seen_plus:
- return plus_class(values), m.end()
- else:
- return seq_class(values), m.end()
+ if seen_plus: return plus_class(values), m.end()
+ else: return seq_class(values), m.end()
# Read the next value.
val, position = self.read_value(s, position, reentrances)
values.append(val)
# Comma or looking at close paren
- m = re.compile(r"\s*(,|\+|(?=%s))\s*" % cp).match(s, position)
- if not m:
- raise ValueError("',' or '+' or '%s'" % cp, position)
- if m.group(1) == "+":
- seen_plus = True
+ m = re.compile(r'\s*(,|\+|(?=%s))\s*' % cp).match(s, position)
+ if not m: raise ValueError("',' or '+' or '%s'" % cp, position)
+ if m.group(1) == '+': seen_plus = True
position = m.end()
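# Usage sketch (illustrative; assumes the reader's fromstring() entry
# point, whose body appears above): parse a complete string, or use
# read_partial() to parse a prefix and learn where it ended.
def _reader_example():
    reader = FeatStructReader()
    fs = reader.fromstring('[cat=NP, agr=[num=sg, person=3]]')
    value, pos = reader.read_partial('[cat=VP] and more', 0, {})
    # pos marks where the feature structure ended within the string.
    return fs, value, pos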
-
######################################################################
-# { Demo
+#{ Demo
######################################################################
-
-def display_unification(fs1, fs2, indent=" "):
+def display_unification(fs1, fs2, indent=' '):
# Print the two input feature structures, side by side.
- fs1_lines = ("%s" % fs1).split("\n")
- fs2_lines = ("%s" % fs2).split("\n")
+ fs1_lines = ("%s" % fs1).split('\n')
+ fs2_lines = ("%s" % fs2).split('\n')
if len(fs1_lines) > len(fs2_lines):
- blankline = "[" + " " * (len(fs2_lines[0]) - 2) + "]"
- fs2_lines += [blankline] * len(fs1_lines)
+ blankline = '['+' '*(len(fs2_lines[0])-2)+']'
+ fs2_lines += [blankline]*len(fs1_lines)
else:
- blankline = "[" + " " * (len(fs1_lines[0]) - 2) + "]"
- fs1_lines += [blankline] * len(fs2_lines)
+ blankline = '['+' '*(len(fs1_lines[0])-2)+']'
+ fs1_lines += [blankline]*len(fs2_lines)
for (fs1_line, fs2_line) in zip(fs1_lines, fs2_lines):
- print(indent + fs1_line + " " + fs2_line)
- print(indent + "-" * len(fs1_lines[0]) + " " + "-" * len(fs2_lines[0]))
+ print(indent + fs1_line + ' ' + fs2_line)
+ print(indent+'-'*len(fs1_lines[0])+' '+'-'*len(fs2_lines[0]))
- linelen = len(fs1_lines[0]) * 2 + 3
- print(indent + "| |".center(linelen))
- print(indent + "+-----UNIFY-----+".center(linelen))
- print(indent + "|".center(linelen))
- print(indent + "V".center(linelen))
+ linelen = len(fs1_lines[0])*2+3
+ print(indent+'| |'.center(linelen))
+ print(indent+'+-----UNIFY-----+'.center(linelen))
+ print(indent+'|'.center(linelen))
+ print(indent+'V'.center(linelen))
bindings = {}
result = fs1.unify(fs2, bindings)
if result is None:
- print(indent + "(FAILED)".center(linelen))
+ print(indent+'(FAILED)'.center(linelen))
else:
- print(
- "\n".join(indent + l.center(linelen) for l in ("%s" % result).split("\n"))
- )
+ print('\n'.join(indent+l.center(linelen)
+ for l in ("%s" % result).split('\n')))
if bindings and len(bindings.bound_variables()) > 0:
print(repr(bindings).center(linelen))
return result
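# Illustrative sketch: calling display_unification() directly with two
# small structures (interactive_demo() below drives it interactively).
def _display_example():
    fs1 = FeatStruct('[agr=[num=sg]]')
    fs2 = FeatStruct('[agr=[person=3]]')
    # Prints both structures side by side, then the unified result.
    return display_unification(fs1, fs2)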
-
def interactive_demo(trace=False):
import random, sys
- HELP = """
+ HELP = '''
1-%d: Select the corresponding feature structure
q: Quit
t: Turn tracing on or off
l: List all feature structures
?: Help
- """
+ '''
- print(
- """
+ print('''
This demo will repeatedly present you with a list of feature
structures, and ask you to choose two for unification. Whenever a
new feature structure is generated, it is added to the list of
random subset for you to choose between at a given time. If you
want to see the complete lists, type "l". For a list of valid
commands, type "?".
- """
- )
+ ''')
print('Press "Enter" to continue...')
sys.stdin.readline()
fstruct_strings = [
- "[agr=[number=sing, gender=masc]]",
- "[agr=[gender=masc, person=3]]",
- "[agr=[gender=fem, person=3]]",
- "[subj=[agr=(1)[]], agr->(1)]",
- "[obj=?x]",
- "[subj=?x]",
- "[/=None]",
- "[/=NP]",
- "[cat=NP]",
- "[cat=VP]",
- "[cat=PP]",
- "[subj=[agr=[gender=?y]], obj=[agr=[gender=?y]]]",
- "[gender=masc, agr=?C]",
- "[gender=?S, agr=[gender=?S,person=3]]",
- ]
+ '[agr=[number=sing, gender=masc]]',
+ '[agr=[gender=masc, person=3]]',
+ '[agr=[gender=fem, person=3]]',
+ '[subj=[agr=(1)[]], agr->(1)]',
+ '[obj=?x]', '[subj=?x]',
+ '[/=None]', '[/=NP]',
+ '[cat=NP]', '[cat=VP]', '[cat=PP]',
+ '[subj=[agr=[gender=?y]], obj=[agr=[gender=?y]]]',
+ '[gender=masc, agr=?C]',
+ '[gender=?S, agr=[gender=?S,person=3]]'
+ ]
- all_fstructs = [
- (i, FeatStruct(fstruct_strings[i])) for i in range(len(fstruct_strings))
- ]
+ all_fstructs = [(i, FeatStruct(fstruct_strings[i]))
+ for i in range(len(fstruct_strings))]
def list_fstructs(fstructs):
for i, fstruct in fstructs:
print()
- lines = ("%s" % fstruct).split("\n")
- print("%3d: %s" % (i + 1, lines[0]))
- for line in lines[1:]:
- print(" " + line)
+ lines = ("%s" % fstruct).split('\n')
+ print('%3d: %s' % (i+1, lines[0]))
+ for line in lines[1:]: print(' '+line)
print()
+
while True:
# Pick 5 feature structures at random from the master list.
MAX_CHOICES = 5
else:
fstructs = all_fstructs
- print("_" * 75)
+ print('_'*75)
- print("Choose two feature structures to unify:")
+ print('Choose two feature structures to unify:')
list_fstructs(fstructs)
- selected = [None, None]
- for (nth, i) in (("First", 0), ("Second", 1)):
+ selected = [None,None]
+ for (nth,i) in (('First',0), ('Second',1)):
while selected[i] is None:
- print(
- (
- "%s feature structure (1-%d,q,t,l,?): "
- % (nth, len(all_fstructs))
- ),
- end=" ",
- )
+ print(('%s feature structure (1-%d,q,t,l,?): '
+ % (nth, len(all_fstructs))), end=' ')
try:
input = sys.stdin.readline().strip()
- if input in ("q", "Q", "x", "X"):
- return
- if input in ("t", "T"):
+ if input in ('q', 'Q', 'x', 'X'): return
+ if input in ('t', 'T'):
trace = not trace
- print(" Trace = %s" % trace)
- continue
- if input in ("h", "H", "?"):
- print(HELP % len(fstructs))
+ print(' Trace = %s' % trace)
continue
- if input in ("l", "L"):
- list_fstructs(all_fstructs)
- continue
- num = int(input) - 1
+ if input in ('h', 'H', '?'):
+ print(HELP % len(fstructs)); continue
+ if input in ('l', 'L'):
+ list_fstructs(all_fstructs); continue
+ num = int(input)-1
selected[i] = all_fstructs[num][1]
print()
except:
- print("Bad sentence number")
+ print('Bad sentence number')
continue
if trace:
result = display_unification(selected[0], selected[1])
if result is not None:
for i, fstruct in all_fstructs:
- if repr(result) == repr(fstruct):
- break
+ if repr(result) == repr(fstruct): break
else:
all_fstructs.append((len(all_fstructs), result))
print('\nType "Enter" to continue unifying; or "q" to quit.')
input = sys.stdin.readline().strip()
- if input in ("q", "Q", "x", "X"):
- return
-
+ if input in ('q', 'Q', 'x', 'X'): return
def demo(trace=False):
"""
Just for testing
"""
- # import random
+ #import random
# processor breaks with values like '3rd'
fstruct_strings = [
- "[agr=[number=sing, gender=masc]]",
- "[agr=[gender=masc, person=3]]",
- "[agr=[gender=fem, person=3]]",
- "[subj=[agr=(1)[]], agr->(1)]",
- "[obj=?x]",
- "[subj=?x]",
- "[/=None]",
- "[/=NP]",
- "[cat=NP]",
- "[cat=VP]",
- "[cat=PP]",
- "[subj=[agr=[gender=?y]], obj=[agr=[gender=?y]]]",
- "[gender=masc, agr=?C]",
- "[gender=?S, agr=[gender=?S,person=3]]",
+ '[agr=[number=sing, gender=masc]]',
+ '[agr=[gender=masc, person=3]]',
+ '[agr=[gender=fem, person=3]]',
+ '[subj=[agr=(1)[]], agr->(1)]',
+ '[obj=?x]', '[subj=?x]',
+ '[/=None]', '[/=NP]',
+ '[cat=NP]', '[cat=VP]', '[cat=PP]',
+ '[subj=[agr=[gender=?y]], obj=[agr=[gender=?y]]]',
+ '[gender=masc, agr=?C]',
+ '[gender=?S, agr=[gender=?S,person=3]]'
]
all_fstructs = [FeatStruct(fss) for fss in fstruct_strings]
- # MAX_CHOICES = 5
- # if len(all_fstructs) > MAX_CHOICES:
- # fstructs = random.sample(all_fstructs, MAX_CHOICES)
- # fstructs.sort()
- # else:
- # fstructs = all_fstructs
+ #MAX_CHOICES = 5
+ #if len(all_fstructs) > MAX_CHOICES:
+ #fstructs = random.sample(all_fstructs, MAX_CHOICES)
+ #fstructs.sort()
+ #else:
+ #fstructs = all_fstructs
for fs1 in all_fstructs:
for fs2 in all_fstructs:
- print(
- "\n*******************\nfs1 is:\n%s\n\nfs2 is:\n%s\n\nresult is:\n%s"
- % (fs1, fs2, unify(fs1, fs2))
- )
+ print("\n*******************\nfs1 is:\n%s\n\nfs2 is:\n%s\n\nresult is:\n%s" % (fs1, fs2, unify(fs1, fs2)))
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
-__all__ = [
- "FeatStruct",
- "FeatDict",
- "FeatList",
- "unify",
- "subsumes",
- "conflicts",
- "Feature",
- "SlashFeature",
- "RangeFeature",
- "SLASH",
- "TYPE",
- "FeatStructReader",
-]
+__all__ = ['FeatStruct', 'FeatDict', 'FeatList', 'unify', 'subsumes', 'conflicts',
+ 'Feature', 'SlashFeature', 'RangeFeature', 'SLASH', 'TYPE',
+ 'FeatStructReader']
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Context Free Grammars
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# Jason Narad <jason.narad@gmail.com>
with the right hand side (*rhs*) in a tree (*tree*) is known as
"expanding" *lhs* to *rhs* in *tree*.
"""
+from __future__ import print_function, unicode_literals, division
+
import re
from functools import total_ordering
+from six import string_types
+
from nltk.util import transitive_closure, invert_graph
+from nltk.compat import python_2_unicode_compatible, unicode_repr
from nltk.internals import raise_unorderable_types
from nltk.probability import ImmutableProbabilisticMixIn
from nltk.featstruct import FeatStruct, FeatDict, FeatStructReader, SLASH, TYPE
-
#################################################################
# Nonterminal
#################################################################
-
@total_ordering
+@python_2_unicode_compatible
class Nonterminal(object):
"""
A non-terminal symbol for a context free grammar. ``Nonterminal``
:ivar _symbol: The node value corresponding to this
``Nonterminal``. This value must be immutable and hashable.
"""
-
def __init__(self, symbol):
"""
Construct a new non-terminal from the given symbol.
:rtype: str
"""
- if isinstance(self._symbol, str):
- return "%s" % self._symbol
+ if isinstance(self._symbol, string_types):
+ return '%s' % self._symbol
else:
- return "%s" % repr(self._symbol)
+ return '%s' % unicode_repr(self._symbol)
def __str__(self):
"""
:rtype: str
"""
- if isinstance(self._symbol, str):
- return "%s" % self._symbol
+ if isinstance(self._symbol, string_types):
+ return '%s' % self._symbol
else:
- return "%s" % repr(self._symbol)
+ return '%s' % unicode_repr(self._symbol)
def __div__(self, rhs):
"""
:type rhs: Nonterminal
:rtype: Nonterminal
"""
- return Nonterminal("%s/%s" % (self._symbol, rhs._symbol))
+ return Nonterminal('%s/%s' % (self._symbol, rhs._symbol))
+
def __truediv__(self, rhs):
"""
"""
return self.__div__(rhs)
-
def nonterminals(symbols):
"""
Given a string containing a list of symbol names, return a list of
in the same order as the symbols names.
:rtype: list(Nonterminal)
"""
- if "," in symbols:
- symbol_list = symbols.split(",")
- else:
- symbol_list = symbols.split()
+ if ',' in symbols: symbol_list = symbols.split(',')
+ else: symbol_list = symbols.split()
return [Nonterminal(s.strip()) for s in symbol_list]
-
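# Usage sketch (illustrative): nonterminals() splits on commas when the
# string contains any, otherwise on whitespace, so both calls below
# build the same three Nonterminal objects.
def _nonterminals_example():
    S, NP, VP = nonterminals('S, NP, VP')
    S2, NP2, VP2 = nonterminals('S NP VP')
    return (S, NP, VP) == (S2, NP2, VP2)  # True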
class FeatStructNonterminal(FeatDict, Nonterminal):
"""A feature structure that's also a nonterminal. It acts as its
own symbol, and automatically freezes itself when hashed."""
-
def __hash__(self):
self.freeze()
return FeatStruct.__hash__(self)
-
def symbol(self):
return self
-
def is_nonterminal(item):
"""
:return: True if the item is a ``Nonterminal``.
# Terminals
#################################################################
-
def is_terminal(item):
"""
Return True if the item is a terminal, which currently is
:rtype: bool
"""
- return hasattr(item, "__hash__") and not isinstance(item, Nonterminal)
+ return hasattr(item, '__hash__') and not isinstance(item, Nonterminal)
#################################################################
# Productions
#################################################################
-
@total_ordering
-
+@python_2_unicode_compatible
class Production(object):
"""
A grammar production. Each production maps a single symbol
:param rhs: The right-hand side of the new ``Production``.
:type rhs: sequence(Nonterminal and terminal)
"""
- if isinstance(rhs, str):
- raise TypeError(
- "production right hand side should be a list, " "not a string"
- )
+ if isinstance(rhs, string_types):
+ raise TypeError('production right hand side should be a list, '
+ 'not a string')
self._lhs = lhs
self._rhs = tuple(rhs)
self._hash = hash((self._lhs, self._rhs))
:rtype: str
"""
- result = "%s -> " % repr(self._lhs)
- result += " ".join(repr(el) for el in self._rhs)
+ result = '%s -> ' % unicode_repr(self._lhs)
+ result += " ".join(unicode_repr(el) for el in self._rhs)
return result
def __repr__(self):
:rtype: str
"""
- return "%s" % self
+ return '%s' % self
def __eq__(self, other):
"""
:rtype: bool
"""
- return (
- type(self) == type(other)
- and self._lhs == other._lhs
- and self._rhs == other._rhs
- )
+ return (type(self) == type(other) and
+ self._lhs == other._lhs and
+ self._rhs == other._rhs)
def __ne__(self, other):
return not self == other
return self._hash
-
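# Illustrative sketch: constructing a Production directly. The right-hand
# side must be a sequence of terminals/Nonterminals; a bare string is
# rejected by the type check in __init__ above.
def _production_example():
    S, NP, VP = nonterminals('S, NP, VP')
    return Production(S, [NP, VP])  # S -> NP VP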
+@python_2_unicode_compatible
class DependencyProduction(Production):
"""
A dependency grammar production. Each production maps a single
head word to an unordered list of one or more modifier words.
"""
-
def __str__(self):
"""
Return a verbose string representation of the ``DependencyProduction``.
:rtype: str
"""
- result = "'%s' ->" % (self._lhs,)
+ result = '\'%s\' ->' % (self._lhs,)
for elt in self._rhs:
- result += " '%s'" % (elt,)
+ result += ' \'%s\'' % (elt,)
return result
-
+@python_2_unicode_compatible
class ProbabilisticProduction(Production, ImmutableProbabilisticMixIn):
"""
A probabilistic context free grammar production.
:see: ``Production``
"""
-
def __init__(self, lhs, rhs, **prob):
"""
Construct a new ``ProbabilisticProduction``.
Production.__init__(self, lhs, rhs)
def __str__(self):
- return super().__str__() + (
- " [1.0]" if (self.prob() == 1.0) else " [%g]" % self.prob()
- )
+ return Production.__unicode__(self) + \
+ (' [1.0]' if (self.prob() == 1.0) else ' [%g]' % self.prob())
def __eq__(self, other):
- return (
- type(self) == type(other)
- and self._lhs == other._lhs
- and self._rhs == other._rhs
- and self.prob() == other.prob()
- )
+ return (type(self) == type(other) and
+ self._lhs == other._lhs and
+ self._rhs == other._rhs and
+ self.prob() == other.prob())
def __ne__(self, other):
return not self == other
def __hash__(self):
return hash((self._lhs, self._rhs, self.prob()))
-
#################################################################
# Grammars
#################################################################
-
-
+@python_2_unicode_compatible
class CFG(object):
"""
A context-free grammar. A grammar consists of a start state and
If you need efficient key-based access to productions, you
can use a subclass to implement it.
"""
-
def __init__(self, start, productions, calculate_leftcorners=True):
"""
Create a new context-free grammar, from the given start state
:type calculate_leftcorners: bool
"""
if not is_nonterminal(start):
- raise TypeError(
- "start should be a Nonterminal object,"
- " not a %s" % type(start).__name__
- )
+ raise TypeError("start should be a Nonterminal object,"
+ " not a %s" % type(start).__name__)
self._start = start
self._productions = productions
def _calculate_leftcorners(self):
# Calculate leftcorner relations, for use in optimized parsing.
- self._immediate_leftcorner_categories = dict(
- (cat, set([cat])) for cat in self._categories
- )
- self._immediate_leftcorner_words = dict(
- (cat, set()) for cat in self._categories
- )
+ self._immediate_leftcorner_categories = dict((cat, set([cat])) for cat in self._categories)
+ self._immediate_leftcorner_words = dict((cat, set()) for cat in self._categories)
for prod in self.productions():
if len(prod) > 0:
cat, left = prod.lhs(), prod.rhs()[0]
self._leftcorners = lc
self._leftcorner_parents = invert_graph(lc)
- nr_leftcorner_categories = sum(
- map(len, self._immediate_leftcorner_categories.values())
- )
+ nr_leftcorner_categories = sum(map(len, self._immediate_leftcorner_categories.values()))
nr_leftcorner_words = sum(map(len, self._immediate_leftcorner_words.values()))
if nr_leftcorner_words > nr_leftcorner_categories > 10000:
# If the grammar is big, the leftcorner-word dictionary will be too large.
@classmethod
def fromstring(cls, input, encoding=None):
"""
- Return the grammar instance corresponding to the input string(s).
+ Return the ``CFG`` corresponding to the input string(s).
:param input: a grammar, either in the form of a string or as a list of strings.
"""
- start, productions = read_grammar(
- input, standard_nonterm_parser, encoding=encoding
- )
- return cls(start, productions)
+ start, productions = read_grammar(input, standard_nonterm_parser,
+ encoding=encoding)
+ return CFG(start, productions)
def start(self):
"""
:rtype: list(Production)
"""
if rhs and empty:
- raise ValueError(
- "You cannot select empty and non-empty " "productions at the same time."
- )
+ raise ValueError("You cannot select empty and non-empty "
+ "productions at the same time.")
# no constraints so return everything
if not lhs and not rhs:
# intersect
else:
- return [
- prod
- for prod in self._lhs_index.get(lhs, [])
- if prod in self._rhs_index.get(rhs, [])
- ]
+ return [prod for prod in self._lhs_index.get(lhs, [])
+ if prod in self._rhs_index.get(rhs, [])]
def leftcorners(self, cat):
"""
elif self._leftcorner_words:
return left in self._leftcorner_words.get(cat, set())
else:
- return any(
- left in self._immediate_leftcorner_words.get(parent, set())
- for parent in self.leftcorners(cat)
- )
+ return any(left in self._immediate_leftcorner_words.get(parent, set())
+ for parent in self.leftcorners(cat))
def leftcorner_parents(self, cat):
"""
:type tokens: list(str)
"""
- missing = [tok for tok in tokens if not self._lexical_index.get(tok)]
+ missing = [tok for tok in tokens
+ if not self._lexical_index.get(tok)]
if missing:
- missing = ", ".join("%r" % (w,) for w in missing)
- raise ValueError(
- "Grammar does not cover some of the " "input words: %r." % missing
- )
+ missing = ', '.join('%r' % (w,) for w in missing)
+ raise ValueError("Grammar does not cover some of the "
+ "input words: %r." % missing)
def _calculate_grammar_forms(self):
"""
"""
prods = self._productions
self._is_lexical = all(p.is_lexical() for p in prods)
- self._is_nonlexical = all(p.is_nonlexical() for p in prods if len(p) != 1)
+ self._is_nonlexical = all(p.is_nonlexical() for p in prods
+ if len(p) != 1)
self._min_len = min(len(p) for p in prods)
self._max_len = max(len(p) for p in prods)
- self._all_unary_are_lexical = all(p.is_lexical() for p in prods if len(p) == 1)
+ self._all_unary_are_lexical = all(p.is_lexical() for p in prods
+ if len(p) == 1)
def is_lexical(self):
"""
Return True if the grammar is of Chomsky Normal Form, i.e. all productions
are of the form A -> B C, or A -> "s".
"""
- return self.is_flexible_chomsky_normal_form() and self._all_unary_are_lexical
-
- def chomsky_normal_form(self, new_token_padding="@$@", flexible=False):
- """
- Returns a new grammar that is in Chomsky normal form.
- :param new_token_padding: customises new rule formation
- during binarisation
- """
- if self.is_chomsky_normal_form():
- return self
- if self.productions(empty=True):
- raise ValueError(
- ("Grammar has Empty rules. " "Cannot deal with them at the moment")
- )
-
- # check for mixed rules
- for rule in self.productions():
- if rule.is_lexical() and len(rule.rhs()) > 1:
- raise ValueError(
- "Cannot handled mixed rule {} => {}".format(rule.lhs(), rule.rhs())
- )
-
- step1 = CFG.eliminate_start(self)
- step2 = CFG.binarize(step1, new_token_padding)
- if flexible:
- return step2
- step3 = CFG.remove_unitary_rules(step2)
- return step3
-
- @classmethod
- def remove_unitary_rules(cls, grammar):
- """
- Remove nonlexical unitary rules and convert them to
- lexical
- """
- result = []
- unitary = []
- for rule in grammar.productions():
- if len(rule) == 1 and rule.is_nonlexical():
- unitary.append(rule)
- else:
- result.append(rule)
-
- while unitary:
- rule = unitary.pop(0)
- for item in grammar.productions(lhs=rule.rhs()[0]):
- new_rule = Production(rule.lhs(), item.rhs())
- if len(new_rule) != 1 or new_rule.is_lexical():
- result.append(new_rule)
- else:
- unitary.append(new_rule)
-
- n_grammar = CFG(grammar.start(), result)
- return n_grammar
-
- @classmethod
- def binarize(cls, grammar, padding="@$@"):
- """
- Convert all non-binary rules into binary by introducing
- new tokens.
- Example::
- Original:
- A => B C D
- After Conversion:
- A => B A@$@B
- A@$@B => C D
- """
- result = []
-
- for rule in grammar.productions():
- if len(rule.rhs()) > 2:
- # this rule needs to be broken down
- left_side = rule.lhs()
- for k in range(0, len(rule.rhs()) - 2):
- tsym = rule.rhs()[k]
- new_sym = Nonterminal(left_side.symbol() + padding + tsym.symbol())
- new_production = Production(left_side, (tsym, new_sym))
- left_side = new_sym
- result.append(new_production)
- last_prd = Production(left_side, rule.rhs()[-2:])
- result.append(last_prd)
- else:
- result.append(rule)
-
- n_grammar = CFG(grammar.start(), result)
- return n_grammar
-
- @classmethod
- def eliminate_start(cls, grammar):
- """
- Eliminate the start rule in case it appears on an RHS.
- Example: S -> S0 S1 and S0 -> S1 S
- Then another rule S0_SIGMA -> S is added
- """
- start = grammar.start()
- result = []
- need_to_add = None
- for rule in grammar.productions():
- if start in rule.rhs():
- need_to_add = True
- result.append(rule)
- if need_to_add:
- start = Nonterminal("S0_SIGMA")
- result.append(Production(start, [grammar.start()]))
- n_grammar = CFG(start, result)
- return n_grammar
- return grammar
+ return (self.is_flexible_chomsky_normal_form() and
+ self._all_unary_are_lexical)
def __repr__(self):
- return "<Grammar with %d productions>" % len(self._productions)
+ return '<Grammar with %d productions>' % len(self._productions)
def __str__(self):
- result = "Grammar with %d productions" % len(self._productions)
- result += " (start state = %r)" % self._start
+ result = 'Grammar with %d productions' % len(self._productions)
+ result += ' (start state = %r)' % self._start
for production in self._productions:
- result += "\n %s" % production
+ result += '\n %s' % production
return result
productions. The set of terminals and nonterminals
is implicitly specified by the productions.
"""
-
def __init__(self, start, productions):
"""
Create a new feature-based grammar, from the given start
self._lexical_index.setdefault(token, set()).add(prod)
@classmethod
- def fromstring(
- cls, input, features=None, logic_parser=None, fstruct_reader=None, encoding=None
- ):
+ def fromstring(cls, input, features=None, logic_parser=None, fstruct_reader=None,
+ encoding=None):
"""
- Return a feature structure based grammar.
+ Return a feature structure based ``FeatureGrammar``.
:param input: a grammar, either in the form of a string or else
as a list of strings.
features = (SLASH, TYPE)
if fstruct_reader is None:
- fstruct_reader = FeatStructReader(
- features, FeatStructNonterminal, logic_parser=logic_parser
- )
+ fstruct_reader = FeatStructReader(features, FeatStructNonterminal,
+ logic_parser=logic_parser)
elif logic_parser is not None:
- raise Exception(
- "'logic_parser' and 'fstruct_reader' must " "not both be set"
- )
+ raise Exception('\'logic_parser\' and \'fstruct_reader\' must '
+ 'not both be set')
+
+ start, productions = read_grammar(input, fstruct_reader.read_partial,
+ encoding=encoding)
+ return FeatureGrammar(start, productions)
- start, productions = read_grammar(
- input, fstruct_reader.read_partial, encoding=encoding
- )
- return cls(start, productions)
def productions(self, lhs=None, rhs=None, empty=False):
"""
:rtype: list(Production)
"""
if rhs and empty:
- raise ValueError(
- "You cannot select empty and non-empty " "productions at the same time."
- )
+ raise ValueError("You cannot select empty and non-empty "
+ "productions at the same time.")
# no constraints so return everything
if not lhs and not rhs:
# intersect
else:
- return [
- prod
- for prod in self._lhs_index.get(self._get_type_if_possible(lhs), [])
- if prod in self._rhs_index.get(self._get_type_if_possible(rhs), [])
- ]
+ return [prod for prod in self._lhs_index.get(self._get_type_if_possible(lhs), [])
+ if prod in self._rhs_index.get(self._get_type_if_possible(rhs), [])]
def leftcorners(self, cat):
"""
else:
return item
-
@total_ordering
-
+@python_2_unicode_compatible
class FeatureValueType(object):
"""
A helper class for ``FeatureGrammars``, designed to be different
from ordinary strings. This is to stop the ``FeatStruct``
``FOO[]`` from being compared equal to the terminal "FOO".
"""
-
def __init__(self, value):
self._value = value
self._hash = hash(value)
def __repr__(self):
- return "<%s>" % self._value
+ return '<%s>' % self._value
def __eq__(self, other):
return type(self) == type(other) and self._value == other._value
return self._hash
-
+@python_2_unicode_compatible
class DependencyGrammar(object):
"""
A dependency grammar. A DependencyGrammar consists of a set of
productions. Each production specifies a head/modifier relationship
between a pair of words.
"""
-
def __init__(self, productions):
"""
Create a new dependency grammar, from the set of ``Productions``.
@classmethod
def fromstring(cls, input):
productions = []
- for linenum, line in enumerate(input.split("\n")):
+ for linenum, line in enumerate(input.split('\n')):
line = line.strip()
- if line.startswith("#") or line == "":
- continue
- try:
- productions += _read_dependency_production(line)
+ if line.startswith('#') or line=='': continue
+ try: productions += _read_dependency_production(line)
except ValueError:
- raise ValueError("Unable to parse line %s: %s" % (linenum, line))
+ raise ValueError('Unable to parse line %s: %s' % (linenum, line))
if len(productions) == 0:
- raise ValueError("No productions found!")
- return cls(productions)
+ raise ValueError('No productions found!')
+ return DependencyGrammar(productions)
def contains(self, head, mod):
"""
"""
for production in self._productions:
for possibleMod in production._rhs:
- if production._lhs == head and possibleMod == mod:
+ if(production._lhs == head and possibleMod == mod):
return True
return False
"""
for production in self._productions:
for possibleMod in production._rhs:
- if production._lhs == head and possibleMod == mod:
+ if(production._lhs == head and possibleMod == mod):
return True
return False
# return True
# return False
+
def __str__(self):
"""
Return a verbose string representation of the ``DependencyGrammar``
:rtype: str
"""
- str = "Dependency grammar with %d productions" % len(self._productions)
+ str = 'Dependency grammar with %d productions' % len(self._productions)
for production in self._productions:
- str += "\n %s" % production
+ str += '\n %s' % production
return str
def __repr__(self):
"""
Return a concise string representation of the ``DependencyGrammar``
"""
- return "Dependency grammar with %d productions" % len(self._productions)
-
+ return 'Dependency grammar with %d productions' % len(self._productions)
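# Usage sketch (illustrative): each line of the grammar string names a
# head and its possible modifiers, with '|' separating alternative
# right-hand sides (see dg_demo() below for a fuller example).
def _dependency_grammar_example():
    g = DependencyGrammar.fromstring("'taught' -> 'play' | 'man'")
    return g.contains('taught', 'man')  # True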
+@python_2_unicode_compatible
class ProbabilisticDependencyGrammar(object):
"""
"""
for production in self._productions:
for possibleMod in production._rhs:
- if production._lhs == head and possibleMod == mod:
+ if(production._lhs == head and possibleMod == mod):
return True
return False
:rtype: str
"""
- str = "Statistical dependency grammar with %d productions" % len(
- self._productions
- )
+ str = 'Statistical dependency grammar with %d productions' % len(self._productions)
for production in self._productions:
- str += "\n %s" % production
- str += "\nEvents:"
+ str += '\n %s' % production
+ str += '\nEvents:'
for event in self._events:
- str += "\n %d:%s" % (self._events[event], event)
- str += "\nTags:"
+ str += '\n %d:%s' % (self._events[event], event)
+ str += '\nTags:'
for tag_word in self._tags:
- str += "\n %s:\t(%s)" % (tag_word, self._tags[tag_word])
+ str += '\n %s:\t(%s)' % (tag_word, self._tags[tag_word])
return str
def __repr__(self):
"""
Return a concise string representation of the ``ProbabilisticDependencyGrammar``
"""
- return "Statistical Dependency grammar with %d productions" % len(
- self._productions
- )
+ return 'Statistical Dependency grammar with %d productions' % len(self._productions)
class PCFG(CFG):
productions with a given left-hand side have probabilities
that sum to 1.
"""
-
EPSILON = 0.01
def __init__(self, start, productions, calculate_leftcorners=True):
# Make sure that the probabilities sum to one.
probs = {}
for production in productions:
- probs[production.lhs()] = probs.get(production.lhs(), 0) + production.prob()
+ probs[production.lhs()] = (probs.get(production.lhs(), 0) +
+ production.prob())
for (lhs, p) in probs.items():
- if not ((1 - PCFG.EPSILON) < p < (1 + PCFG.EPSILON)):
+ if not ((1-PCFG.EPSILON) < p <
+ (1+PCFG.EPSILON)):
raise ValueError("Productions for %r do not sum to 1" % lhs)
+
@classmethod
def fromstring(cls, input, encoding=None):
"""
- Return a probabilistic context-free grammar corresponding to the
+ Return the ``PCFG`` corresponding to the
input string(s).
:param input: a grammar, either in the form of a string or else
as a list of strings.
"""
- start, productions = read_grammar(
- input, standard_nonterm_parser, probabilistic=True, encoding=encoding
- )
- return cls(start, productions)
+ start, productions = read_grammar(input, standard_nonterm_parser,
+ probabilistic=True, encoding=encoding)
+ return PCFG(start, productions)
#################################################################
# Contributed by Nathan Bodenstab <bodenstab@cslu.ogi.edu>
-
def induce_pcfg(start, productions):
"""
Induce a PCFG grammar from a list of productions.
for prod in productions:
lcount[prod.lhs()] = lcount.get(prod.lhs(), 0) + 1
- pcount[prod] = pcount.get(prod, 0) + 1
+ pcount[prod] = pcount.get(prod, 0) + 1
- prods = [
- ProbabilisticProduction(p.lhs(), p.rhs(), prob=pcount[p] / lcount[p.lhs()])
- for p in pcount
- ]
+ prods = [ProbabilisticProduction(p.lhs(), p.rhs(),
+ prob=pcount[p] / lcount[p.lhs()])
+ for p in pcount]
return PCFG(start, prods)
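# Sketch of induce_pcfg() in use (illustrative): each production's
# probability is its relative frequency among the productions that
# share its left-hand side.
def _induce_pcfg_example():
    S, NP, VP = nonterminals('S, NP, VP')
    prods = [Production(S, [NP, VP]),
             Production(S, [NP, VP]),
             Production(S, [VP])]
    # S -> NP VP occurred 2 of 3 times, so it gets probability 2/3.
    return induce_pcfg(S, prods)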
# Helper functions for reading productions
#################################################################
-
def _read_cfg_production(input):
"""
Return a list of context-free ``Productions``.
"""
return _read_production(input, standard_nonterm_parser)
-
def _read_pcfg_production(input):
"""
Return a list of PCFG ``ProbabilisticProductions``.
"""
return _read_production(input, standard_nonterm_parser, probabilistic=True)
-
def _read_fcfg_production(input, fstruct_reader):
"""
Return a list of feature-based ``Productions``.
# Parsing generic grammars
-_ARROW_RE = re.compile(r"\s* -> \s*", re.VERBOSE)
-_PROBABILITY_RE = re.compile(r"( \[ [\d\.]+ \] ) \s*", re.VERBOSE)
+_ARROW_RE = re.compile(r'\s* -> \s*', re.VERBOSE)
+_PROBABILITY_RE = re.compile(r'( \[ [\d\.]+ \] ) \s*', re.VERBOSE)
_TERMINAL_RE = re.compile(r'( "[^"]+" | \'[^\']+\' ) \s*', re.VERBOSE)
-_DISJUNCTION_RE = re.compile(r"\| \s*", re.VERBOSE)
-
+_DISJUNCTION_RE = re.compile(r'\| \s*', re.VERBOSE)
def _read_production(line, nonterm_parser, probabilistic=False):
"""
# Skip over the arrow.
m = _ARROW_RE.match(line, pos)
- if not m:
- raise ValueError("Expected an arrow")
+ if not m: raise ValueError('Expected an arrow')
pos = m.end()
# Parse the right hand side.
pos = m.end()
probabilities[-1] = float(m.group(1)[1:-1])
if probabilities[-1] > 1.0:
- raise ValueError(
- "Production probability %f, "
- "should not be greater than 1.0" % (probabilities[-1],)
- )
+ raise ValueError('Production probability %f, '
+ 'should not be greater than 1.0' %
+ (probabilities[-1],))
# String -- add terminal.
- elif line[pos] in "'\"":
+ elif line[pos] in "\'\"":
m = _TERMINAL_RE.match(line, pos)
- if not m:
- raise ValueError("Unterminated string")
+ if not m: raise ValueError('Unterminated string')
rhsides[-1].append(m.group(1)[1:-1])
pos = m.end()
# Vertical bar -- start new rhside.
- elif line[pos] == "|":
+ elif line[pos] == '|':
m = _DISJUNCTION_RE.match(line, pos)
probabilities.append(0.0)
rhsides.append([])
rhsides[-1].append(nonterm)
if probabilistic:
- return [
- ProbabilisticProduction(lhs, rhs, prob=probability)
- for (rhs, probability) in zip(rhsides, probabilities)
- ]
+ return [ProbabilisticProduction(lhs, rhs, prob=probability)
+ for (rhs, probability) in zip(rhsides, probabilities)]
else:
return [Production(lhs, rhs) for rhs in rhsides]
# Reading Phrase Structure Grammars
#################################################################
-
def read_grammar(input, nonterm_parser, probabilistic=False, encoding=None):
"""
Return a pair consisting of a starting category and a list of
"""
if encoding is not None:
input = input.decode(encoding)
- if isinstance(input, str):
- lines = input.split("\n")
+ if isinstance(input, string_types):
+ lines = input.split('\n')
else:
lines = input
start = None
productions = []
- continue_line = ""
+ continue_line = ''
for linenum, line in enumerate(lines):
line = continue_line + line.strip()
- if line.startswith("#") or line == "":
- continue
- if line.endswith("\\"):
- continue_line = line[:-1].rstrip() + " "
+ if line.startswith('#') or line=='': continue
+ if line.endswith('\\'):
+ continue_line = line[:-1].rstrip()+' '
continue
- continue_line = ""
+ continue_line = ''
try:
- if line[0] == "%":
+ if line[0] == '%':
directive, args = line[1:].split(None, 1)
- if directive == "start":
+ if directive == 'start':
start, pos = nonterm_parser(args, 0)
if pos != len(args):
- raise ValueError("Bad argument to start directive")
+ raise ValueError('Bad argument to start directive')
else:
- raise ValueError("Bad directive")
+ raise ValueError('Bad directive')
else:
# expand out the disjunctions on the RHS
productions += _read_production(line, nonterm_parser, probabilistic)
except ValueError as e:
- raise ValueError("Unable to parse line %s: %s\n%s" % (linenum + 1, line, e))
+ raise ValueError('Unable to parse line %s: %s\n%s' %
+ (linenum+1, line, e))
if not productions:
- raise ValueError("No productions found!")
+ raise ValueError('No productions found!')
if not start:
start = productions[0].lhs()
return (start, productions)
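# Usage sketch (illustrative): read_grammar() (reached here via
# CFG.fromstring) honours '%start' directives and backslash-continued
# lines.
def _read_grammar_example():
    g = CFG.fromstring("""
        %start S
        S -> NP VP
        NP -> 'I'
        VP -> 'sleep'
        """)
    return g.start()  # Nonterminal('S')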
-
-_STANDARD_NONTERM_RE = re.compile("( [\w/][\w/^<>-]* ) \s*", re.VERBOSE)
-
+_STANDARD_NONTERM_RE = re.compile('( [\w/][\w/^<>-]* ) \s*', re.VERBOSE)
def standard_nonterm_parser(string, pos):
m = _STANDARD_NONTERM_RE.match(string, pos)
- if not m:
- raise ValueError("Expected a nonterminal, found: " + string[pos:])
+ if not m: raise ValueError('Expected a nonterminal, found: '
+ + string[pos:])
return (Nonterminal(m.group(1)), m.end())
# Reading Dependency Grammars
#################################################################
-_READ_DG_RE = re.compile(
- r"""^\s* # leading whitespace
+_READ_DG_RE = re.compile(r'''^\s* # leading whitespace
('[^']+')\s* # single-quoted lhs
(?:[-=]+>)\s* # arrow
(?:( # rhs:
| \| # disjunction
)
\s*) # trailing space
- *$""", # zero or more copies
- re.VERBOSE,
-)
-_SPLIT_DG_RE = re.compile(r"""('[^']'|[-=]+>|"[^"]+"|'[^']+'|\|)""")
-
+ *$''', # zero or more copies
+ re.VERBOSE)
+_SPLIT_DG_RE = re.compile(r'''('[^']'|[-=]+>|"[^"]+"|'[^']+'|\|)''')
def _read_dependency_production(s):
if not _READ_DG_RE.match(s):
- raise ValueError("Bad production string")
+ raise ValueError('Bad production string')
pieces = _SPLIT_DG_RE.split(s)
- pieces = [p for i, p in enumerate(pieces) if i % 2 == 1]
- lhside = pieces[0].strip("'\"")
+ pieces = [p for i,p in enumerate(pieces) if i%2==1]
+ lhside = pieces[0].strip('\'\"')
rhsides = [[]]
for piece in pieces[2:]:
- if piece == "|":
+ if piece == '|':
rhsides.append([])
else:
- rhsides[-1].append(piece.strip("'\""))
+ rhsides[-1].append(piece.strip('\'\"'))
return [DependencyProduction(lhside, rhside) for rhside in rhsides]
# Demonstration
#################################################################
-
def cfg_demo():
"""
A demonstration showing how ``CFGs`` can be created and used.
from nltk import nonterminals, Production, CFG
# Create some nonterminals
- S, NP, VP, PP = nonterminals("S, NP, VP, PP")
- N, V, P, Det = nonterminals("N, V, P, Det")
- VP_slash_NP = VP / NP
+ S, NP, VP, PP = nonterminals('S, NP, VP, PP')
+ N, V, P, Det = nonterminals('N, V, P, Det')
+ VP_slash_NP = VP/NP
- print("Some nonterminals:", [S, NP, VP, PP, N, V, P, Det, VP / NP])
- print(" S.symbol() =>", repr(S.symbol()))
+ print('Some nonterminals:', [S, NP, VP, PP, N, V, P, Det, VP/NP])
+ print(' S.symbol() =>', repr(S.symbol()))
print()
print(Production(S, [NP]))
# Create some Grammar Productions
- grammar = CFG.fromstring(
- """
+ grammar = CFG.fromstring("""
S -> NP VP
PP -> P NP
NP -> Det N | NP PP
N -> 'dog' | 'cat'
V -> 'chased' | 'sat'
P -> 'on' | 'in'
- """
- )
+ """)
- print("A Grammar:", repr(grammar))
- print(" grammar.start() =>", repr(grammar.start()))
- print(" grammar.productions() =>", end=" ")
+ print('A Grammar:', repr(grammar))
+ print(' grammar.start() =>', repr(grammar.start()))
+ print(' grammar.productions() =>', end=' ')
# string.replace(...) is used to line-wrap the output.
- print(repr(grammar.productions()).replace(",", ",\n" + " " * 25))
+ print(repr(grammar.productions()).replace(',', ',\n'+' '*25))
print()
-
-toy_pcfg1 = PCFG.fromstring(
- """
+toy_pcfg1 = PCFG.fromstring("""
S -> NP VP [1.0]
NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
Det -> 'the' [0.8] | 'my' [0.2]
V -> 'ate' [0.35] | 'saw' [0.65]
PP -> P NP [1.0]
P -> 'with' [0.61] | 'under' [0.39]
- """
-)
+ """)
-toy_pcfg2 = PCFG.fromstring(
- """
+toy_pcfg2 = PCFG.fromstring("""
S -> NP VP [1.0]
VP -> V NP [.59]
VP -> V [.40]
Det -> 'the' [.41]
Det -> 'a' [.31]
Det -> 'my' [.28]
- """
-)
-
+ """)
def pcfg_demo():
"""
pcfg_prods = toy_pcfg1.productions()
pcfg_prod = pcfg_prods[2]
- print("A PCFG production:", repr(pcfg_prod))
- print(" pcfg_prod.lhs() =>", repr(pcfg_prod.lhs()))
- print(" pcfg_prod.rhs() =>", repr(pcfg_prod.rhs()))
- print(" pcfg_prod.prob() =>", repr(pcfg_prod.prob()))
+ print('A PCFG production:', repr(pcfg_prod))
+ print(' pcfg_prod.lhs() =>', repr(pcfg_prod.lhs()))
+ print(' pcfg_prod.rhs() =>', repr(pcfg_prod.rhs()))
+ print(' pcfg_prod.prob() =>', repr(pcfg_prod.prob()))
print()
grammar = toy_pcfg2
- print("A PCFG grammar:", repr(grammar))
- print(" grammar.start() =>", repr(grammar.start()))
- print(" grammar.productions() =>", end=" ")
+ print('A PCFG grammar:', repr(grammar))
+ print(' grammar.start() =>', repr(grammar.start()))
+ print(' grammar.productions() =>', end=' ')
# .replace(...) is used to line-wrap the output.
- print(repr(grammar.productions()).replace(",", ",\n" + " " * 26))
+ print(repr(grammar.productions()).replace(',', ',\n'+' '*26))
print()
# extract productions from three trees and induce the PCFG
item = treebank._fileids[0]
for tree in treebank.parsed_sents(item)[:3]:
# perform optional tree transformations, e.g.:
- tree.collapse_unary(collapsePOS=False)
- tree.chomsky_normal_form(horzMarkov=2)
+ tree.collapse_unary(collapsePOS = False)
+ tree.chomsky_normal_form(horzMarkov = 2)
productions += tree.productions()
- S = Nonterminal("S")
+ S = Nonterminal('S')
grammar = induce_pcfg(S, productions)
print(grammar)
print()
parser.trace(3)
# doesn't work as tokens are different:
- # sent = treebank.tokenized('wsj_0001.mrg')[0]
+ #sent = treebank.tokenized('wsj_0001.mrg')[0]
sent = treebank.parsed_sents(item)[0].leaves()
print(sent)
for parse in parser.parse(sent):
print(parse)
-
def fcfg_demo():
import nltk.data
-
- g = nltk.data.load("grammars/book_grammars/feat0.fcfg")
+ g = nltk.data.load('grammars/book_grammars/feat0.fcfg')
print(g)
print()
-
def dg_demo():
"""
A demonstration showing the creation and inspection of a
``DependencyGrammar``.
"""
- grammar = DependencyGrammar.fromstring(
- """
+ grammar = DependencyGrammar.fromstring("""
'scratch' -> 'cats' | 'walls'
'walls' -> 'the'
'cats' -> 'the'
- """
- )
+ """)
print(grammar)
-
def sdg_demo():
"""
A demonstration of how to read a string representation of
"""
from nltk.parse import DependencyGraph
- dg = DependencyGraph(
- """
+ dg = DependencyGraph("""
1 Ze ze Pron Pron per|3|evofmv|nom 2 su _ _
2 had heb V V trans|ovt|1of2of3|ev 0 ROOT _ _
3 met met Prep Prep voor 8 mod _ _
11 of of Conj Conj neven 7 vc _ _
12 terrassen terras N N soort|mv|neut 11 cnj _ _
13 . . Punc Punc punt 12 punct _ _
- """
- )
+ """)
tree = dg.tree()
print(tree.pprint())
-
def demo():
cfg_demo()
pcfg_demo()
dg_demo()
sdg_demo()
-
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
-__all__ = [
- "Nonterminal",
- "nonterminals",
- "CFG",
- "Production",
- "PCFG",
- "ProbabilisticProduction",
- "DependencyGrammar",
- "DependencyProduction",
- "ProbabilisticDependencyGrammar",
- "induce_pcfg",
- "read_grammar",
-]
+__all__ = ['Nonterminal', 'nonterminals',
+ 'CFG', 'Production',
+ 'PCFG', 'ProbabilisticProduction',
+ 'DependencyGrammar', 'DependencyProduction',
+ 'ProbabilisticDependencyGrammar',
+ 'induce_pcfg', 'read_grammar']
# Natural Language Toolkit (NLTK) Help
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Authors: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Provide structured access to documentation.
"""
+from __future__ import print_function
import re
from textwrap import wrap
from nltk.data import load
-
def brown_tagset(tagpattern=None):
_format_tagset("brown_tagset", tagpattern)
-
def claws5_tagset(tagpattern=None):
_format_tagset("claws5_tagset", tagpattern)
-
def upenn_tagset(tagpattern=None):
_format_tagset("upenn_tagset", tagpattern)
-
#####################################################################
# UTILITIES
#####################################################################
-
def _print_entries(tags, tagdict):
for tag in tags:
entry = tagdict[tag]
defn = [tag + ": " + entry[0]]
- examples = wrap(
- entry[1], width=75, initial_indent=" ", subsequent_indent=" "
- )
+ examples = wrap(entry[1], width=75, initial_indent=' ', subsequent_indent=' ')
print("\n".join(defn + examples))
-
def _format_tagset(tagset, tagpattern=None):
tagdict = load("help/tagsets/" + tagset + ".pickle")
if not tagpattern:
else:
print("No matching tags found.")
-
-if __name__ == "__main__":
- brown_tagset(r"NN.*")
- upenn_tagset(r".*\$")
- claws5_tagset("UNDEFINED")
- brown_tagset(r"NN")
+if __name__ == '__main__':
+ brown_tagset(r'NN.*')
+ upenn_tagset(r'.*\$')
+ claws5_tagset('UNDEFINED')
+ brown_tagset(r'NN')
# Natural Language Toolkit: Inference
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Dan Garrette <dhgarrette@gmail.com>
# Ewan Klein <ewan@inf.ed.ac.uk>
#
from nltk.inference.prover9 import Prover9, Prover9Command
from nltk.inference.resolution import ResolutionProver, ResolutionProverCommand
from nltk.inference.tableau import TableauProver, TableauProverCommand
-from nltk.inference.discourse import (
- ReadingCommand,
- CfgReadingCommand,
- DrtGlueReadingCommand,
- DiscourseTester,
-)
+from nltk.inference.discourse import (ReadingCommand, CfgReadingCommand,
+ DrtGlueReadingCommand, DiscourseTester)
goal *G*, the model builder tries to find a counter-model, in the sense of a model that will satisfy
the assumptions plus the negation of *G*.
"""
-
+from __future__ import print_function
from abc import ABCMeta, abstractmethod
+from six import add_metaclass
+
import threading
import time
-class Prover(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class Prover(object):
"""
Interface for trying to prove a goal from assumptions. Both the goal and
the assumptions are constrained to be formulas of ``logic.Expression``.
"""
-
def prove(self, goal=None, assumptions=None, verbose=False):
"""
:return: Whether the proof was successful or not.
"""
-class ModelBuilder(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class ModelBuilder(object):
"""
Interface for trying to build a model of set of formulas.
Open formulas are assumed to be universally quantified.
Both the goal and the assumptions are constrained to be formulas
of ``logic.Expression``.
"""
-
def build_model(self, goal=None, assumptions=None, verbose=False):
"""
Perform the actual model building.
"""
-class TheoremToolCommand(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class TheoremToolCommand(object):
"""
This class holds a goal and a list of assumptions to be used in proving
or model building.
"""
-
@abstractmethod
def add_assumptions(self, new_assumptions):
"""
This class holds a ``Prover``, a goal, and a list of assumptions. When
prove() is called, the ``Prover`` is executed with the goal and assumptions.
"""
-
@abstractmethod
def prove(self, verbose=False):
"""
When build_model() is called, the ``ModelBuilder`` is executed with the goal
and assumptions.
"""
-
@abstractmethod
def build_model(self, verbose=False):
"""
This class holds a goal and a list of assumptions to be used in proving
or model building.
"""
-
def __init__(self, goal=None, assumptions=None):
"""
:param goal: Input expression to prove
This class holds a ``Prover``, a goal, and a list of assumptions. When
prove() is called, the ``Prover`` is executed with the goal and assumptions.
"""
-
def __init__(self, prover, goal=None, assumptions=None):
"""
:param prover: The theorem tool to execute with the assumptions
re-proving.
"""
if self._result is None:
- self._result, self._proof = self._prover._prove(
- self.goal(), self.assumptions(), verbose
- )
+ self._result, self._proof = self._prover._prove(self.goal(),
+ self.assumptions(),
+ verbose)
return self._result
def proof(self, simplify=True):
build_model() is called, the ``ModelBuilder`` is executed with the goal and
assumptions.
"""
-
def __init__(self, modelbuilder, goal=None, assumptions=None):
"""
:param modelbuilder: The theorem tool to execute with the assumptions
re-building.
"""
if self._result is None:
- self._result, self._model = self._modelbuilder._build_model(
- self.goal(), self.assumptions(), verbose
- )
+ self._result, self._model = \
+ self._modelbuilder._build_model(self.goal(),
+ self.assumptions(),
+ verbose)
return self._result
def model(self, format=None):
:return: str
"""
if self._result is None:
- raise LookupError("You have to call build_model() first to " "get a model!")
+ raise LookupError('You have to call build_model() first to '
+ 'get a model!')
else:
return self._decorate_model(self._model, format)
A base decorator for the ``ProverCommandDecorator`` and
``ModelBuilderCommandDecorator`` classes from which decorators can extend.
"""
-
def __init__(self, command):
"""
:param command: ``TheoremToolCommand`` to decorate
A base decorator for the ``ProverCommand`` class from which other
prover command decorators can extend.
"""
-
def __init__(self, proverCommand):
"""
:param proverCommand: ``ProverCommand`` to decorate
def prove(self, verbose=False):
if self._result is None:
prover = self.get_prover()
- self._result, self._proof = prover._prove(
- self.goal(), self.assumptions(), verbose
- )
+ self._result, self._proof = prover._prove(self.goal(),
+ self.assumptions(),
+ verbose)
return self._result
def proof(self, simplify=True):
A base decorator for the ``ModelBuilderCommand`` class from which other
prover command decorators can extend.
"""
-
def __init__(self, modelBuilderCommand):
"""
:param modelBuilderCommand: ``ModelBuilderCommand`` to decorate
"""
if self._result is None:
modelbuilder = self.get_model_builder()
- self._result, self._model = modelbuilder._build_model(
- self.goal(), self.assumptions(), verbose
- )
+ self._result, self._model = \
+ modelbuilder._build_model(self.goal(),
+ self.assumptions(),
+ verbose)
return self._result
def model(self, format=None):
:return: str
"""
if self._result is None:
- raise LookupError("You have to call build_model() first to " "get a model!")
+ raise LookupError('You have to call build_model() first to '
+ 'get a model!')
else:
return self._decorate_model(self._model, format)
parallel. Whichever finishes first, the prover or the model builder, is the
result that will be used.
"""
-
def __init__(self, prover, modelbuilder):
self._prover = prover
self._modelbuilder = modelbuilder
def _prove(self, goal=None, assumptions=None, verbose=False):
- return self._run(goal, assumptions, verbose), ""
+ return self._run(goal, assumptions, verbose), ''
def _build_model(self, goal=None, assumptions=None, verbose=False):
- return not self._run(goal, assumptions, verbose), ""
+ return not self._run(goal, assumptions, verbose), ''
def _run(self, goal, assumptions, verbose):
# Set up two threads, Prover and ModelBuilder, to run in parallel
- tp_thread = TheoremToolThread(
- lambda: self._prover.prove(goal, assumptions, verbose), verbose, "TP"
- )
- mb_thread = TheoremToolThread(
- lambda: self._modelbuilder.build_model(goal, assumptions, verbose),
- verbose,
- "MB",
- )
+ tp_thread = TheoremToolThread(lambda: self._prover.prove(goal, assumptions, verbose), verbose, 'TP')
+ mb_thread = TheoremToolThread(lambda: self._modelbuilder.build_model(goal, assumptions, verbose), verbose, 'MB')
tp_thread.start()
mb_thread.start()
Because the theorem prover result is the opposite of the model builder
result, we will treat self._result as meaning "proof found/no model found".
"""
-
def __init__(self, prover, modelbuilder, goal=None, assumptions=None):
BaseProverCommand.__init__(self, prover, goal, assumptions)
BaseModelBuilderCommand.__init__(self, modelbuilder, goal, assumptions)
def _run(self, verbose):
# Set up two threads, Prover and ModelBuilder, to run in parallel
- tp_thread = TheoremToolThread(
- lambda: BaseProverCommand.prove(self, verbose), verbose, "TP"
- )
- mb_thread = TheoremToolThread(
- lambda: BaseModelBuilderCommand.build_model(self, verbose), verbose, "MB"
- )
+ tp_thread = TheoremToolThread(lambda: BaseProverCommand.prove(self, verbose), verbose, 'TP')
+ mb_thread = TheoremToolThread(lambda: BaseModelBuilderCommand.build_model(self, verbose), verbose, 'MB')
tp_thread.start()
mb_thread.start()
try:
self._result = self._command()
if self._verbose:
- print(
- "Thread %s finished with result %s at %s"
- % (self._name, self._result, time.localtime(time.time()))
- )
+ print('Thread %s finished with result %s at %s' % \
+ (self._name, self._result, time.localtime(time.time())))
except Exception as e:
print(e)
- print("Thread %s completed abnormally" % (self._name))
+ print('Thread %s completed abnormally' % (self._name))
@property
- def result(self):
- return self._result
+ def result(self): return self._result
(This is not intended to scale beyond very short discourses!) The method ``readings(filter=True)`` will only show
those threads which are consistent (taking into account any background assumptions).
"""
-
-import os
+from __future__ import print_function
from abc import ABCMeta, abstractmethod
+from six import add_metaclass
+import os
+
from operator import and_, add
from functools import reduce
-
from nltk.data import show_cfg
from nltk.tag import RegexpTagger
from nltk.parse import load_parser
from nltk.inference.prover9 import Prover9Command
-class ReadingCommand(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class ReadingCommand(object):
@abstractmethod
def parse_to_readings(self, sentence):
"""
:param gramfile: name of file where grammar can be loaded
:type gramfile: str
"""
- self._gramfile = (
- gramfile if gramfile else "grammars/book_grammars/discourse.fcfg"
- )
+ self._gramfile = (gramfile if gramfile else 'grammars/book_grammars/discourse.fcfg')
self._parser = load_parser(self._gramfile)
def parse_to_readings(self, sentence):
""":see: ReadingCommand.parse_to_readings()"""
from nltk.sem import root_semrep
-
tokens = sentence.split()
trees = self._parser.parse(tokens)
return [root_semrep(tree) for tree in trees]
class DrtGlueReadingCommand(ReadingCommand):
- def __init__(self, semtype_file=None, remove_duplicates=False, depparser=None):
+ def __init__(self, semtype_file=None, remove_duplicates=False,
+ depparser=None):
"""
:param semtype_file: name of file where grammar can be loaded
:param remove_duplicates: should duplicates be removed?
:param depparser: the dependency parser
"""
if semtype_file is None:
- semtype_file = os.path.join(
- "grammars", "sample_grammars", "drt_glue.semtype"
- )
- self._glue = DrtGlue(
- semtype_file=semtype_file,
- remove_duplicates=remove_duplicates,
- depparser=depparser,
- )
+ semtype_file = os.path.join('grammars', 'sample_grammars','drt_glue.semtype')
+ self._glue = DrtGlue(semtype_file=semtype_file,
+ remove_duplicates=remove_duplicates,
+ depparser=depparser)
def parse_to_readings(self, sentence):
""":see: ReadingCommand.parse_to_readings()"""
"""
Check properties of an ongoing discourse.
"""
-
def __init__(self, input, reading_command=None, background=None):
"""
Initialize a ``DiscourseTester``.
:type background: list(Expression)
"""
self._input = input
- self._sentences = dict([("s%s" % i, sent) for i, sent in enumerate(input)])
+ self._sentences = dict([('s%s' % i, sent) for i, sent in enumerate(input)])
self._models = None
self._readings = {}
- self._reading_command = (
- reading_command if reading_command else CfgReadingCommand()
- )
+ self._reading_command = (reading_command if reading_command else CfgReadingCommand())
self._threads = {}
self._filtered_threads = {}
if background is not None:
from nltk.sem.logic import Expression
-
for e in background:
assert isinstance(e, Expression)
self._background = background
for id in sorted(self._sentences):
print("%s: %s" % (id, self._sentences[id]))
- def add_sentence(self, sentence, informchk=False, consistchk=False):
+ def add_sentence(self, sentence, informchk=False, consistchk=False):
"""
Add a sentence to the current discourse.
for sent_reading in self._get_readings(sentence):
tp = Prover9Command(goal=sent_reading, assumptions=assumptions)
if tp.prove():
- print(
- "Sentence '%s' under reading '%s':"
- % (sentence, str(sent_reading))
- )
+ print("Sentence '%s' under reading '%s':" % (sentence, str(sent_reading)))
print("Not informative relative to thread '%s'" % tid)
self._input.append(sentence)
- self._sentences = dict(
- [("s%s" % i, sent) for i, sent in enumerate(self._input)]
- )
+ self._sentences = dict([('s%s' % i, sent) for i, sent in enumerate(self._input)])
# check whether adding the new sentence to the discourse preserves consistency
# (i.e. a model can be found for the combined set of assumptions)
if consistchk:
try:
self._input.remove(sentence)
except ValueError:
- print(
- "Retraction failed. The sentence '%s' is not part of the current discourse:"
- % sentence
- )
+ print("Retraction failed. The sentence '%s' is not part of the current discourse:" % sentence)
self.sentences()
return None
- self._sentences = dict(
- [("s%s" % i, sent) for i, sent in enumerate(self._input)]
- )
+ self._sentences = dict([('s%s' % i, sent) for i, sent in enumerate(self._input)])
self.readings(verbose=False)
if verbose:
print("Current sentences are ")
for sid in sorted(self._sentences):
sentence = self._sentences[sid]
readings = self._get_readings(sentence)
- self._readings[sid] = dict(
- [
- ("%s-r%s" % (sid, rid), reading.simplify())
- for rid, reading in enumerate(sorted(readings, key=str))
- ]
- )
+ self._readings[sid] = dict([("%s-r%s" % (sid, rid), reading.simplify())
+ for rid, reading in enumerate(sorted(readings, key=str))])
def _construct_threads(self):
"""
thread_list = [[]]
for sid in sorted(self._readings):
thread_list = self.multiply(thread_list, sorted(self._readings[sid]))
- self._threads = dict(
- [("d%s" % tid, thread) for tid, thread in enumerate(thread_list)]
- )
+ self._threads = dict([("d%s" % tid, thread) for tid, thread in enumerate(thread_list)])
# re-initialize the filtered threads
self._filtered_threads = {}
# keep the same ids, but only include threads which get models
else:
for sid in sorted(self._readings):
print()
- print("%s readings:" % sid)
- print() #'-' * 30
+ print('%s readings:' % sid)
+ print() #'-' * 30
for rid in sorted(self._readings[sid]):
lf = self._readings[sid][rid]
print("%s: %s" % (rid, lf.normalize()))
"""
Print out the value of ``self._threads`` or ``self._filtered_threads``
"""
- threads = self._filtered_threads if filter else self._threads
+ threads = (self._filtered_threads if filter else self._threads)
for tid in sorted(threads):
if show_thread_readings:
- readings = [
- self._readings[rid.split("-")[0]][rid] for rid in self._threads[tid]
- ]
+ readings = [self._readings[rid.split('-')[0]][rid]
+ for rid in self._threads[tid]]
try:
- thread_reading = (
- ": %s"
- % self._reading_command.combine_readings(readings).normalize()
- )
+ thread_reading = ": %s" % \
+ self._reading_command.combine_readings(readings).normalize()
except Exception as e:
- thread_reading = ": INVALID: %s" % e.__class__.__name__
+ thread_reading = ': INVALID: %s' % e.__class__.__name__
else:
- thread_reading = ""
+ thread_reading = ''
print("%s:" % tid, self._threads[tid], thread_reading)
- def readings(
- self,
- sentence=None,
- threaded=False,
- verbose=True,
- filter=False,
- show_thread_readings=False,
- ):
+
+ def readings(self, sentence=None, threaded=False, verbose=True,
+ filter=False, show_thread_readings=False):
"""
Construct and show the readings of the discourse (or of a single sentence).
if not threaded:
self._show_readings(sentence=sentence)
else:
- self._show_threads(
- filter=filter, show_thread_readings=show_thread_readings
- )
+ self._show_threads(filter=filter,
+ show_thread_readings=show_thread_readings)
def expand_threads(self, thread_id, threads=None):
"""
"""
if threads is None:
threads = self._threads
- return [
- (rid, self._readings[sid][rid])
- for rid in threads[thread_id]
- for sid in rid.split("-")[:1]
- ]
+ return [(rid, self._readings[sid][rid]) for rid in threads[thread_id] for sid in rid.split('-')[:1]]
+
###############################
# Models and Background
def _check_consistency(self, threads, show=False, verbose=False):
results = []
for tid in sorted(threads):
- assumptions = [
- reading for (rid, reading) in self.expand_threads(tid, threads=threads)
- ]
- assumptions = list(
- map(
- self._reading_command.to_fol,
- self._reading_command.process_thread(assumptions),
- )
- )
+ assumptions = [reading for (rid, reading) in self.expand_threads(tid, threads=threads)]
+ assumptions = list(map(self._reading_command.to_fol, self._reading_command.process_thread(assumptions)))
if assumptions:
assumptions += self._background
# if Mace4 finds a model, it always seems to find it quickly
print(a)
spacer(80)
if modelfound:
- print(mb.model(format="cooked"))
+ print(mb.model(format='cooked'))
else:
print("No model found!\n")
return results
"""
self._construct_readings()
self._construct_threads()
- threads = {thread_id: self._threads[thread_id]} if thread_id else self._threads
+ threads = ({thread_id: self._threads[thread_id]} if thread_id else self._threads)
- for (tid, modelfound) in self._check_consistency(
- threads, show=show, verbose=verbose
- ):
+ for (tid, modelfound) in self._check_consistency(threads, show=show, verbose=verbose):
idlist = [rid for rid in threads[tid]]
if not modelfound:
:type background: list(Expression)
"""
from nltk.sem.logic import Expression
-
for (count, e) in enumerate(background):
assert isinstance(e, Expression)
if verbose:
print("Adding assumption %s to background" % count)
self._background.append(e)
- # update the state
+ #update the state
self._construct_readings()
self._construct_threads()
for e in self._background:
print(str(e))
- ###############################
+ ###############################
# Misc
###############################
result.append(new)
return result
+#multiply = DiscourseTester.multiply
+#L1 = [['A'], ['B']]
+#L2 = ['a', 'b', 'c']
+#print multiply(L1,L2)
+
def load_fol(s):
"""
statements = []
for linenum, line in enumerate(s.splitlines()):
line = line.strip()
- if line.startswith("#") or line == "":
+ if line.startswith('#') or line == '':
continue
try:
statements.append(Expression.fromstring(line))
except Exception:
- raise ValueError("Unable to parse line %s: %s" % (linenum, line))
+ raise ValueError('Unable to parse line %s: %s' % (linenum, line))
return statements
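Example, with a hypothetical inline rule string ('#' lines and blank lines
are skipped):

    # hypothetical background-knowledge rules, one formula per line
    rules = '''
    # background knowledge
    all x.(boxer(x) -> person(x))
    '''
    for stmt in load_fol(rules):
        print(stmt)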
"""
Illustrate the various methods of ``DiscourseTester``
"""
- dt = DiscourseTester(
- ["A boxer walks", "Every boxer chases a girl"], reading_command
- )
+ dt = DiscourseTester(['A boxer walks', 'Every boxer chases a girl'],
+ reading_command)
dt.models()
print()
# dt.grammar()
print()
dt.readings(threaded=True)
print()
- dt.models("d1")
- dt.add_sentence("John is a boxer")
+ dt.models('d1')
+ dt.add_sentence('John is a boxer')
print()
dt.sentences()
print()
dt.readings(threaded=True)
print()
- dt = DiscourseTester(
- ["A student dances", "Every student is a person"], reading_command
- )
+ dt = DiscourseTester(['A student dances', 'Every student is a person'],
+ reading_command)
print()
- dt.add_sentence("No person dances", consistchk=True)
+ dt.add_sentence('No person dances', consistchk=True)
print()
dt.readings()
print()
- dt.retract_sentence("No person dances", verbose=True)
+ dt.retract_sentence('No person dances', verbose=True)
print()
dt.models()
print()
- dt.readings("A person dances")
+ dt.readings('A person dances')
print()
- dt.add_sentence("A person dances", informchk=True)
- dt = DiscourseTester(
- ["Vincent is a boxer", "Fido is a boxer", "Vincent is married", "Fido barks"],
- reading_command,
- )
+ dt.add_sentence('A person dances', informchk=True)
+ dt = DiscourseTester(['Vincent is a boxer', 'Fido is a boxer',
+ 'Vincent is married', 'Fido barks'],
+ reading_command)
dt.readings(filter=True)
import nltk.data
-
- background_file = os.path.join("grammars", "book_grammars", "background.fol")
+ background_file = os.path.join('grammars', 'book_grammars', 'background.fol')
background = nltk.data.load(background_file)
print()
"""
Illustrate the various methods of ``DiscourseTester``
"""
- dt = DiscourseTester(["every dog chases a boy", "he runs"], reading_command)
+ dt = DiscourseTester(['every dog chases a boy', 'he runs'],
+ reading_command)
dt.models()
print()
dt.sentences()
def spacer(num=30):
- print("-" * num)
+ print('-' * num)
def demo():
discourse_demo()
- tagger = RegexpTagger(
- [
- ("^(chases|runs)$", "VB"),
- ("^(a)$", "ex_quant"),
- ("^(every)$", "univ_quant"),
- ("^(dog|boy)$", "NN"),
- ("^(he)$", "PRP"),
- ]
- )
+ tagger = RegexpTagger([('^(chases|runs)$', 'VB'),
+ ('^(a)$', 'ex_quant'),
+ ('^(every)$', 'univ_quant'),
+ ('^(dog|boy)$', 'NN'),
+ ('^(he)$', 'PRP')])
depparser = MaltParser(tagger=tagger)
- drt_discourse_demo(
- DrtGlueReadingCommand(remove_duplicates=False, depparser=depparser)
- )
+ drt_discourse_demo(DrtGlueReadingCommand(remove_duplicates=False,
+ depparser=depparser))
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
"""
A model builder that makes use of the external 'Mace4' package.
"""
+from __future__ import print_function
import os
import tempfile
a print_assumptions() method that is used to print the list
of assumptions in multiple formats.
"""
-
_interpformat_bin = None
def __init__(self, goal=None, assumptions=None, max_models=500, model_builder=None):
BaseModelBuilderCommand.__init__(self, model_builder, goal, assumptions)
@property
- def valuation(mbc):
- return mbc.model("valuation")
+ def valuation(mbc): return mbc.model('valuation')
def _convert2val(self, valuation_str):
"""
:return: A model if one is generated; None otherwise.
:rtype: sem.Valuation
"""
- valuation_standard_format = self._transform_output(valuation_str, "standard")
+ valuation_standard_format = self._transform_output(valuation_str, 'standard')
val = []
for line in valuation_standard_format.splitlines(False):
l = line.strip()
- if l.startswith("interpretation"):
+ if l.startswith('interpretation'):
# find the number of entities in the model
- num_entities = int(l[l.index("(") + 1 : l.index(",")].strip())
+ num_entities = int(l[l.index('(')+1:l.index(',')].strip())
- elif l.startswith("function") and l.find("_") == -1:
+ elif l.startswith('function') and l.find('_') == -1:
# replace the integer identifier with a corresponding alphabetic character
- name = l[l.index("(") + 1 : l.index(",")].strip()
+ name = l[l.index('(')+1:l.index(',')].strip()
if is_indvar(name):
name = name.upper()
- value = int(l[l.index("[") + 1 : l.index("]")].strip())
+ value = int(l[l.index('[')+1:l.index(']')].strip())
val.append((name, MaceCommand._make_model_var(value)))
- elif l.startswith("relation"):
- l = l[l.index("(") + 1 :]
- if "(" in l:
- # relation is not nullary
- name = l[: l.index("(")].strip()
- values = [
- int(v.strip())
- for v in l[l.index("[") + 1 : l.index("]")].split(",")
- ]
- val.append(
- (name, MaceCommand._make_relation_set(num_entities, values))
- )
+ elif l.startswith('relation'):
+ l = l[l.index('(')+1:]
+ if '(' in l:
+ #relation is not nullary
+ name = l[:l.index('(')].strip()
+ values = [int(v.strip()) for v in l[l.index('[')+1:l.index(']')].split(',')]
+ val.append((name, MaceCommand._make_relation_set(num_entities, values)))
else:
- # relation is nullary
- name = l[: l.index(",")].strip()
- value = int(l[l.index("[") + 1 : l.index("]")].strip())
+ #relation is nullary
+ name = l[:l.index(',')].strip()
+ value = int(l[l.index('[')+1:l.index(']')].strip())
val.append((name, value == 1))
return Valuation(val)
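Sketch of the round trip from Mace4 output to an ``nltk.sem.Valuation``
(Mace4 binary assumed installed; the assumption formula is illustrative):

    from nltk.sem import Expression
    from nltk.inference import MaceCommand

    mc = MaceCommand(None, [Expression.fromstring('man(John)')])
    if mc.build_model():
        # parsed via _convert2val under the hood
        print(mc.valuation)  # maps 'John' to an entity, 'man' to a relation set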
:type values: list of int
"""
r = set()
- for position in [pos for (pos, v) in enumerate(values) if v == 1]:
- r.add(
- tuple(MaceCommand._make_relation_tuple(position, values, num_entities))
- )
+ for position in [pos for (pos,v) in enumerate(values) if v == 1]:
+ r.add(tuple(MaceCommand._make_relation_tuple(position, values, num_entities)))
return r
@staticmethod
sublist_start = position // sublist_size
sublist_position = int(position % sublist_size)
- sublist = values[
- sublist_start * sublist_size : (sublist_start + 1) * sublist_size
- ]
- return [
- MaceCommand._make_model_var(sublist_start)
- ] + MaceCommand._make_relation_tuple(
- sublist_position, sublist, num_entities
- )
+ sublist = values[sublist_start*sublist_size:(sublist_start+1)*sublist_size]
+ return [MaceCommand._make_model_var(sublist_start)] + \
+ MaceCommand._make_relation_tuple(sublist_position,
+ sublist,
+ num_entities)
@staticmethod
def _make_model_var(value):
:param value: where to index into the list of characters
:type value: int
"""
- letter = [
- "a",
- "b",
- "c",
- "d",
- "e",
- "f",
- "g",
- "h",
- "i",
- "j",
- "k",
- "l",
- "m",
- "n",
- "o",
- "p",
- "q",
- "r",
- "s",
- "t",
- "u",
- "v",
- "w",
- "x",
- "y",
- "z",
- ][value]
+ # index modulo 26 so values past 'z' wrap around to 'a1', 'b1', ...
+ # instead of raising an IndexError
+ letter = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n',
+          'o','p','q','r','s','t','u','v','w','x','y','z'][value % 26]
num = value // 26
- return letter + str(num) if num > 0 else letter
+ return (letter + str(num) if num > 0 else letter)
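The intended naming scheme, for illustration:

    0 -> 'a', 1 -> 'b', ..., 25 -> 'z', 26 -> 'a1', 27 -> 'b1', ...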
def _decorate_model(self, valuation_str, format):
"""
"""
if not format:
return valuation_str
- elif format == "valuation":
+ elif format == 'valuation':
return self._convert2val(valuation_str)
else:
return self._transform_output(valuation_str, format)
:param format: Output format for displaying models.
:type format: str
"""
- if format in [
- "standard",
- "standard2",
- "portable",
- "tabular",
- "raw",
- "cooked",
- "xml",
- "tex",
- ]:
+ if format in ['standard', 'standard2', 'portable', 'tabular',
+ 'raw', 'cooked', 'xml', 'tex']:
return self._call_interpformat(valuation_str, [format])[0]
else:
raise LookupError("The specified format does not exist")
"""
if self._interpformat_bin is None:
self._interpformat_bin = self._modelbuilder._find_binary(
- "interpformat", verbose
- )
+ 'interpformat', verbose)
- return self._modelbuilder._call(
- input_str, self._interpformat_bin, args, verbose
- )
+ return self._modelbuilder._call(input_str, self._interpformat_bin,
+ args, verbose)
class Mace(Prover9Parent, ModelBuilder):
if not assumptions:
assumptions = []
- stdout, returncode = self._call_mace4(
- self.prover9_input(goal, assumptions), verbose=verbose
- )
+ stdout, returncode = self._call_mace4(self.prover9_input(goal, assumptions),
+ verbose=verbose)
return (returncode == 0, stdout)
def _call_mace4(self, input_str, args=[], verbose=False):
:see: ``config_prover9``
"""
if self._mace4_bin is None:
- self._mace4_bin = self._find_binary("mace4", verbose)
+ self._mace4_bin = self._find_binary('mace4', verbose)
- updated_input_str = ""
+ updated_input_str = ''
if self._end_size > 0:
- updated_input_str += "assign(end_size, %d).\n\n" % self._end_size
+ updated_input_str += 'assign(end_size, %d).\n\n' % self._end_size
updated_input_str += input_str
return self._call(updated_input_str, self._mace4_bin, args, verbose)
def spacer(num=30):
- print("-" * num)
-
+ print('-' * num)
def decode_result(found):
"""
:param found: The output of model_found()
:type found: bool
"""
- return {True: "Countermodel found", False: "No countermodel found", None: "None"}[
- found
- ]
-
+ return {True: 'Countermodel found', False: 'No countermodel found', None: 'None'}[found]
def test_model_found(arguments):
"""
m = MaceCommand(g, assumptions=alist, max_models=50)
found = m.build_model()
for a in alist:
- print(" %s" % a)
- print("|- %s: %s\n" % (g, decode_result(found)))
+ print(' %s' % a)
+ print('|- %s: %s\n' % (g, decode_result(found)))
def test_build_model(arguments):
"""
Try to build a ``nltk.sem.Valuation``.
"""
- g = Expression.fromstring("all x.man(x)")
- alist = [
- Expression.fromstring(a)
- for a in [
- "man(John)",
- "man(Socrates)",
- "man(Bill)",
- "some x.(-(x = John) & man(x) & sees(John,x))",
- "some x.(-(x = Bill) & man(x))",
- "all x.some y.(man(x) -> gives(Socrates,x,y))",
- ]
- ]
+ g = Expression.fromstring('all x.man(x)')
+ alist = [Expression.fromstring(a) for a in ['man(John)',
+ 'man(Socrates)',
+ 'man(Bill)',
+ 'some x.(-(x = John) & man(x) & sees(John,x))',
+ 'some x.(-(x = Bill) & man(x))',
+ 'all x.some y.(man(x) -> gives(Socrates,x,y))']]
m = MaceCommand(g, assumptions=alist)
m.build_model()
print("Assumptions and Goal")
spacer()
for a in alist:
- print(" %s" % a)
- print("|- %s: %s\n" % (g, decode_result(m.build_model())))
+ print(' %s' % a)
+ print('|- %s: %s\n' % (g, decode_result(m.build_model())))
spacer()
- # print(m.model('standard'))
- # print(m.model('cooked'))
+ #print m.model('standard')
+ #print m.model('cooked')
print("Valuation")
spacer()
- print(m.valuation, "\n")
-
+ print(m.valuation, '\n')
def test_transform_output(argument_pair):
"""
m = MaceCommand(g, assumptions=alist)
m.build_model()
for a in alist:
- print(" %s" % a)
- print("|- %s: %s\n" % (g, m.build_model()))
- for format in ["standard", "portable", "xml", "cooked"]:
+ print(' %s' % a)
+ print('|- %s: %s\n' % (g, m.build_model()))
+ for format in ['standard', 'portable', 'xml', 'cooked']:
spacer()
print("Using '%s' format" % format)
spacer()
print(m.model(format=format))
-
def test_make_relation_set():
- print(
- MaceCommand._make_relation_set(num_entities=3, values=[1, 0, 1])
- == set([("c",), ("a",)])
- )
- print(
- MaceCommand._make_relation_set(
- num_entities=3, values=[0, 0, 0, 0, 0, 0, 1, 0, 0]
- )
- == set([("c", "a")])
- )
- print(
- MaceCommand._make_relation_set(num_entities=2, values=[0, 0, 1, 0, 0, 0, 1, 0])
- == set([("a", "b", "a"), ("b", "b", "a")])
- )
-
+ print(MaceCommand._make_relation_set(num_entities=3, values=[1,0,1]) == set([('c',), ('a',)]))
+ print(MaceCommand._make_relation_set(num_entities=3, values=[0,0,0,0,0,0,1,0,0]) == set([('c', 'a')]))
+ print(MaceCommand._make_relation_set(num_entities=2, values=[0,0,1,0,0,0,1,0]) == set([('a', 'b', 'a'), ('b', 'b', 'a')]))
arguments = [
- ("mortal(Socrates)", ["all x.(man(x) -> mortal(x))", "man(Socrates)"]),
- ("(not mortal(Socrates))", ["all x.(man(x) -> mortal(x))", "man(Socrates)"]),
+ ('mortal(Socrates)', ['all x.(man(x) -> mortal(x))', 'man(Socrates)']),
+ ('(not mortal(Socrates))', ['all x.(man(x) -> mortal(x))', 'man(Socrates)'])
]
-
def demo():
test_model_found(arguments)
test_build_model(arguments)
test_transform_output(arguments[1])
-
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
#
# Author: Daniel H. Garrette <dhgarrette@gmail.com>
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT
this module are based on "Logical Foundations of Artificial Intelligence" by
Michael R. Genesereth and Nils J. Nilsson.
"""
+from __future__ import print_function, unicode_literals
+from nltk.inference.prover9 import Prover9, Prover9Command
from collections import defaultdict
from functools import reduce
-from nltk.inference.prover9 import Prover9, Prover9Command
-from nltk.sem.logic import (
- VariableExpression,
- EqualityExpression,
- ApplicationExpression,
- Expression,
- AbstractVariableExpression,
- AllExpression,
- BooleanExpression,
- NegatedExpression,
- ExistsExpression,
- Variable,
- ImpExpression,
- AndExpression,
- unique_variable,
- operator,
-)
+from nltk.sem.logic import (VariableExpression, EqualityExpression,
+ ApplicationExpression, Expression,
+ AbstractVariableExpression, AllExpression,
+ BooleanExpression, NegatedExpression,
+ ExistsExpression, Variable, ImpExpression,
+ AndExpression, unique_variable, operator)
from nltk.inference.api import Prover, ProverCommandDecorator
+from nltk.compat import python_2_unicode_compatible
-
-class ProverParseError(Exception):
- pass
-
+class ProverParseError(Exception): pass
def get_domain(goal, assumptions):
if goal is None:
all_expressions = assumptions + [-goal]
return reduce(operator.or_, (a.constants() for a in all_expressions), set())
-
class ClosedDomainProver(ProverCommandDecorator):
"""
This is a prover decorator that adds domain closure assumptions before
proving.
"""
-
def assumptions(self):
assumptions = [a for a in self._command.assumptions()]
goal = self._command.goal()
:return: ``Expression``
"""
if isinstance(ex, AllExpression):
- conjuncts = [
- ex.term.replace(ex.variable, VariableExpression(d)) for d in domain
- ]
+ conjuncts = [ex.term.replace(ex.variable, VariableExpression(d))
+ for d in domain]
conjuncts = [self.replace_quants(c, domain) for c in conjuncts]
- return reduce(lambda x, y: x & y, conjuncts)
+ return reduce(lambda x,y: x&y, conjuncts)
elif isinstance(ex, BooleanExpression):
- return ex.__class__(
- self.replace_quants(ex.first, domain),
- self.replace_quants(ex.second, domain),
- )
+ return ex.__class__(self.replace_quants(ex.first, domain),
+ self.replace_quants(ex.second, domain) )
elif isinstance(ex, NegatedExpression):
return -self.replace_quants(ex.term, domain)
elif isinstance(ex, ExistsExpression):
- disjuncts = [
- ex.term.replace(ex.variable, VariableExpression(d)) for d in domain
- ]
+ disjuncts = [ex.term.replace(ex.variable, VariableExpression(d))
+ for d in domain]
disjuncts = [self.replace_quants(d, domain) for d in disjuncts]
- return reduce(lambda x, y: x | y, disjuncts)
+ return reduce(lambda x,y: x|y, disjuncts)
else:
return ex
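For a domain {j, m} this rewrites, for example:

    all x.walk(x)     ->  (walk(j) & walk(m))
    exists x.walk(x)  ->  (walk(j) | walk(m))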
-
class UniqueNamesProver(ProverCommandDecorator):
"""
This is a prover decorator that adds unique names assumptions before
proving.
"""
-
def assumptions(self):
"""
- Domain = union([e.free()|e.constants() for e in all_expressions])
domain = list(get_domain(self._command.goal(), assumptions))
- # build a dictionary of obvious equalities
+ #build a dictionary of obvious equalities
eq_sets = SetHolder()
for a in assumptions:
if isinstance(a, EqualityExpression):
av = a.first.variable
bv = a.second.variable
- # put 'a' and 'b' in the same set
+ #put 'a' and 'b' in the same set
eq_sets[av].add(bv)
new_assumptions = []
- for i, a in enumerate(domain):
- for b in domain[i + 1 :]:
- # if a and b are not already in the same equality set
+ for i,a in enumerate(domain):
+ for b in domain[i+1:]:
+ #if a and b are not already in the same equality set
if b not in eq_sets[a]:
- newEqEx = EqualityExpression(
- VariableExpression(a), VariableExpression(b)
- )
+ newEqEx = EqualityExpression(VariableExpression(a),
+ VariableExpression(b))
if Prover9().prove(newEqEx, assumptions):
- # we can prove that the names are the same entity.
- # remember that they are equal so we don't re-check.
+ #we can prove that the names are the same entity.
+ #remember that they are equal so we don't re-check.
eq_sets[a].add(b)
else:
- # we can't prove it, so assume unique names
+ #we can't prove it, so assume unique names
new_assumptions.append(-newEqEx)
return assumptions + new_assumptions
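For example, given the constants Socrates and Bill with no provable equality
between them, the decorator adds the single extra assumption
-(Socrates = Bill).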
-
class SetHolder(list):
"""
A list of sets of Variables.
"""
-
def __getitem__(self, item):
"""
:param item: ``Variable``
for s in self:
if item in s:
return s
- # item is not found in any existing set. so create a new set
+ #item is not found in any existing set. so create a new set
new = set([item])
self.append(new)
return new
-
class ClosedWorldProver(ProverCommandDecorator):
"""
This is a prover decorator that completes predicates before proving.
-------------------
-bird(Sam)
"""
-
def assumptions(self):
assumptions = self._command.assumptions()
disjuncts = []
- # Turn the signatures into disjuncts
+ #Turn the signatures into disjuncts
for sig in predHolder.signatures:
equality_exs = []
- for v1, v2 in zip(new_sig_exs, sig):
- equality_exs.append(EqualityExpression(v1, v2))
- disjuncts.append(reduce(lambda x, y: x & y, equality_exs))
+ for v1,v2 in zip(new_sig_exs, sig):
+ equality_exs.append(EqualityExpression(v1,v2))
+ disjuncts.append(reduce(lambda x,y: x&y, equality_exs))
- # Turn the properties into disjuncts
+ #Turn the properties into disjuncts
for prop in predHolder.properties:
- # replace variables from the signature with new sig variables
+ #replace variables from the signature with new sig variables
bindings = {}
- for v1, v2 in zip(new_sig_exs, prop[0]):
+ for v1,v2 in zip(new_sig_exs, prop[0]):
bindings[v2] = v1
disjuncts.append(prop[1].substitute_bindings(bindings))
- # make the assumption
+ #make the assumption
if disjuncts:
- # disjuncts exist, so make an implication
+ #disjuncts exist, so make an implication
antecedent = self._make_antecedent(p, new_sig)
- consequent = reduce(lambda x, y: x | y, disjuncts)
+ consequent = reduce(lambda x,y: x|y, disjuncts)
accum = ImpExpression(antecedent, consequent)
else:
- # nothing has property 'p'
+ #nothing has property 'p'
accum = NegatedExpression(self._make_antecedent(p, new_sig))
- # quantify the implication
+ #quantify the implication
for new_sig_var in new_sig[::-1]:
accum = AllExpression(new_sig_var, accum)
new_assumptions.append(accum)
self._map_predicates(expression.first, predDict)
self._map_predicates(expression.second, predDict)
elif isinstance(expression, AllExpression):
- # collect all the universally quantified variables
+ #collect all the universally quantified variables
sig = [expression.variable]
term = expression.term
while isinstance(term, AllExpression):
sig.append(term.variable)
term = term.term
if isinstance(term, ImpExpression):
- if isinstance(term.first, ApplicationExpression) and isinstance(
- term.second, ApplicationExpression
- ):
+ if isinstance(term.first, ApplicationExpression) and \
+ isinstance(term.second, ApplicationExpression):
func1, args1 = term.first.uncurry()
func2, args2 = term.second.uncurry()
- if (
- isinstance(func1, AbstractVariableExpression)
- and isinstance(func2, AbstractVariableExpression)
- and sig == [v.variable for v in args1]
- and sig == [v.variable for v in args2]
- ):
+ if isinstance(func1, AbstractVariableExpression) and \
+ isinstance(func2, AbstractVariableExpression) and \
+ sig == [v.variable for v in args1] and \
+ sig == [v.variable for v in args2]:
predDict[func2].append_prop((tuple(sig), term.first))
predDict[func1].validate_sig_len(sig)
-
+@python_2_unicode_compatible
class PredHolder(object):
"""
This class will be used by a dictionary that will store information
'all x.all y.(see(x,y) -> know(x,y))' would result in "((x,y),('see(x,y)'))"
for 'know'.
"""
-
def __init__(self):
self.signatures = []
self.properties = []
raise Exception("Signature lengths do not match")
def __str__(self):
- return "(%s,%s,%s)" % (self.signatures, self.properties, self.signature_len)
+ return '(%s,%s,%s)' % (self.signatures, self.properties,
+ self.signature_len)
def __repr__(self):
return "%s" % self
-
def closed_domain_demo():
lexpr = Expression.fromstring
- p1 = lexpr(r"exists x.walk(x)")
- p2 = lexpr(r"man(Socrates)")
- c = lexpr(r"walk(Socrates)")
- prover = Prover9Command(c, [p1, p2])
+ p1 = lexpr(r'exists x.walk(x)')
+ p2 = lexpr(r'man(Socrates)')
+ c = lexpr(r'walk(Socrates)')
+ prover = Prover9Command(c, [p1,p2])
print(prover.prove())
cdp = ClosedDomainProver(prover)
- print("assumptions:")
- for a in cdp.assumptions():
- print(" ", a)
- print("goal:", cdp.goal())
+ print('assumptions:')
+ for a in cdp.assumptions(): print(' ', a)
+ print('goal:', cdp.goal())
print(cdp.prove())
- p1 = lexpr(r"exists x.walk(x)")
- p2 = lexpr(r"man(Socrates)")
- p3 = lexpr(r"-walk(Bill)")
- c = lexpr(r"walk(Socrates)")
- prover = Prover9Command(c, [p1, p2, p3])
+ p1 = lexpr(r'exists x.walk(x)')
+ p2 = lexpr(r'man(Socrates)')
+ p3 = lexpr(r'-walk(Bill)')
+ c = lexpr(r'walk(Socrates)')
+ prover = Prover9Command(c, [p1,p2,p3])
print(prover.prove())
cdp = ClosedDomainProver(prover)
- print("assumptions:")
- for a in cdp.assumptions():
- print(" ", a)
- print("goal:", cdp.goal())
+ print('assumptions:')
+ for a in cdp.assumptions(): print(' ', a)
+ print('goal:', cdp.goal())
print(cdp.prove())
- p1 = lexpr(r"exists x.walk(x)")
- p2 = lexpr(r"man(Socrates)")
- p3 = lexpr(r"-walk(Bill)")
- c = lexpr(r"walk(Socrates)")
- prover = Prover9Command(c, [p1, p2, p3])
+ p1 = lexpr(r'exists x.walk(x)')
+ p2 = lexpr(r'man(Socrates)')
+ p3 = lexpr(r'-walk(Bill)')
+ c = lexpr(r'walk(Socrates)')
+ prover = Prover9Command(c, [p1,p2,p3])
print(prover.prove())
cdp = ClosedDomainProver(prover)
- print("assumptions:")
- for a in cdp.assumptions():
- print(" ", a)
- print("goal:", cdp.goal())
+ print('assumptions:')
+ for a in cdp.assumptions(): print(' ', a)
+ print('goal:', cdp.goal())
print(cdp.prove())
- p1 = lexpr(r"walk(Socrates)")
- p2 = lexpr(r"walk(Bill)")
- c = lexpr(r"all x.walk(x)")
- prover = Prover9Command(c, [p1, p2])
+ p1 = lexpr(r'walk(Socrates)')
+ p2 = lexpr(r'walk(Bill)')
+ c = lexpr(r'all x.walk(x)')
+ prover = Prover9Command(c, [p1,p2])
print(prover.prove())
cdp = ClosedDomainProver(prover)
- print("assumptions:")
- for a in cdp.assumptions():
- print(" ", a)
- print("goal:", cdp.goal())
+ print('assumptions:')
+ for a in cdp.assumptions(): print(' ', a)
+ print('goal:', cdp.goal())
print(cdp.prove())
- p1 = lexpr(r"girl(mary)")
- p2 = lexpr(r"dog(rover)")
- p3 = lexpr(r"all x.(girl(x) -> -dog(x))")
- p4 = lexpr(r"all x.(dog(x) -> -girl(x))")
- p5 = lexpr(r"chase(mary, rover)")
- c = lexpr(r"exists y.(dog(y) & all x.(girl(x) -> chase(x,y)))")
- prover = Prover9Command(c, [p1, p2, p3, p4, p5])
+ p1 = lexpr(r'girl(mary)')
+ p2 = lexpr(r'dog(rover)')
+ p3 = lexpr(r'all x.(girl(x) -> -dog(x))')
+ p4 = lexpr(r'all x.(dog(x) -> -girl(x))')
+ p5 = lexpr(r'chase(mary, rover)')
+ c = lexpr(r'exists y.(dog(y) & all x.(girl(x) -> chase(x,y)))')
+ prover = Prover9Command(c, [p1,p2,p3,p4,p5])
print(prover.prove())
cdp = ClosedDomainProver(prover)
- print("assumptions:")
- for a in cdp.assumptions():
- print(" ", a)
- print("goal:", cdp.goal())
+ print('assumptions:')
+ for a in cdp.assumptions(): print(' ', a)
+ print('goal:', cdp.goal())
print(cdp.prove())
-
def unique_names_demo():
lexpr = Expression.fromstring
- p1 = lexpr(r"man(Socrates)")
- p2 = lexpr(r"man(Bill)")
- c = lexpr(r"exists x.exists y.(x != y)")
- prover = Prover9Command(c, [p1, p2])
+ p1 = lexpr(r'man(Socrates)')
+ p2 = lexpr(r'man(Bill)')
+ c = lexpr(r'exists x.exists y.(x != y)')
+ prover = Prover9Command(c, [p1,p2])
print(prover.prove())
unp = UniqueNamesProver(prover)
- print("assumptions:")
- for a in unp.assumptions():
- print(" ", a)
- print("goal:", unp.goal())
+ print('assumptions:')
+ for a in unp.assumptions(): print(' ', a)
+ print('goal:', unp.goal())
print(unp.prove())
- p1 = lexpr(r"all x.(walk(x) -> (x = Socrates))")
- p2 = lexpr(r"Bill = William")
- p3 = lexpr(r"Bill = Billy")
- c = lexpr(r"-walk(William)")
- prover = Prover9Command(c, [p1, p2, p3])
+ p1 = lexpr(r'all x.(walk(x) -> (x = Socrates))')
+ p2 = lexpr(r'Bill = William')
+ p3 = lexpr(r'Bill = Billy')
+ c = lexpr(r'-walk(William)')
+ prover = Prover9Command(c, [p1,p2,p3])
print(prover.prove())
unp = UniqueNamesProver(prover)
- print("assumptions:")
- for a in unp.assumptions():
- print(" ", a)
- print("goal:", unp.goal())
+ print('assumptions:')
+ for a in unp.assumptions(): print(' ', a)
+ print('goal:', unp.goal())
print(unp.prove())
-
def closed_world_demo():
lexpr = Expression.fromstring
- p1 = lexpr(r"walk(Socrates)")
- p2 = lexpr(r"(Socrates != Bill)")
- c = lexpr(r"-walk(Bill)")
- prover = Prover9Command(c, [p1, p2])
+ p1 = lexpr(r'walk(Socrates)')
+ p2 = lexpr(r'(Socrates != Bill)')
+ c = lexpr(r'-walk(Bill)')
+ prover = Prover9Command(c, [p1,p2])
print(prover.prove())
cwp = ClosedWorldProver(prover)
- print("assumptions:")
- for a in cwp.assumptions():
- print(" ", a)
- print("goal:", cwp.goal())
+ print('assumptions:')
+ for a in cwp.assumptions(): print(' ', a)
+ print('goal:', cwp.goal())
print(cwp.prove())
- p1 = lexpr(r"see(Socrates, John)")
- p2 = lexpr(r"see(John, Mary)")
- p3 = lexpr(r"(Socrates != John)")
- p4 = lexpr(r"(John != Mary)")
- c = lexpr(r"-see(Socrates, Mary)")
- prover = Prover9Command(c, [p1, p2, p3, p4])
+ p1 = lexpr(r'see(Socrates, John)')
+ p2 = lexpr(r'see(John, Mary)')
+ p3 = lexpr(r'(Socrates != John)')
+ p4 = lexpr(r'(John != Mary)')
+ c = lexpr(r'-see(Socrates, Mary)')
+ prover = Prover9Command(c, [p1,p2,p3,p4])
print(prover.prove())
cwp = ClosedWorldProver(prover)
- print("assumptions:")
- for a in cwp.assumptions():
- print(" ", a)
- print("goal:", cwp.goal())
+ print('assumptions:')
+ for a in cwp.assumptions(): print(' ', a)
+ print('goal:', cwp.goal())
print(cwp.prove())
- p1 = lexpr(r"all x.(ostrich(x) -> bird(x))")
- p2 = lexpr(r"bird(Tweety)")
- p3 = lexpr(r"-ostrich(Sam)")
- p4 = lexpr(r"Sam != Tweety")
- c = lexpr(r"-bird(Sam)")
- prover = Prover9Command(c, [p1, p2, p3, p4])
+ p1 = lexpr(r'all x.(ostrich(x) -> bird(x))')
+ p2 = lexpr(r'bird(Tweety)')
+ p3 = lexpr(r'-ostrich(Sam)')
+ p4 = lexpr(r'Sam != Tweety')
+ c = lexpr(r'-bird(Sam)')
+ prover = Prover9Command(c, [p1,p2,p3,p4])
print(prover.prove())
cwp = ClosedWorldProver(prover)
- print("assumptions:")
- for a in cwp.assumptions():
- print(" ", a)
- print("goal:", cwp.goal())
+ print('assumptions:')
+ for a in cwp.assumptions(): print(' ', a)
+ print('goal:', cwp.goal())
print(cwp.prove())
-
def combination_prover_demo():
lexpr = Expression.fromstring
- p1 = lexpr(r"see(Socrates, John)")
- p2 = lexpr(r"see(John, Mary)")
- c = lexpr(r"-see(Socrates, Mary)")
- prover = Prover9Command(c, [p1, p2])
+ p1 = lexpr(r'see(Socrates, John)')
+ p2 = lexpr(r'see(John, Mary)')
+ c = lexpr(r'-see(Socrates, Mary)')
+ prover = Prover9Command(c, [p1,p2])
print(prover.prove())
- command = ClosedDomainProver(UniqueNamesProver(ClosedWorldProver(prover)))
- for a in command.assumptions():
- print(a)
+ command = ClosedDomainProver(
+ UniqueNamesProver(
+ ClosedWorldProver(prover)))
+ for a in command.assumptions(): print(a)
print(command.prove())
-
def default_reasoning_demo():
lexpr = Expression.fromstring
premises = []
- # define taxonomy
- premises.append(lexpr(r"all x.(elephant(x) -> animal(x))"))
- premises.append(lexpr(r"all x.(bird(x) -> animal(x))"))
- premises.append(lexpr(r"all x.(dove(x) -> bird(x))"))
- premises.append(lexpr(r"all x.(ostrich(x) -> bird(x))"))
- premises.append(lexpr(r"all x.(flying_ostrich(x) -> ostrich(x))"))
-
- # default properties
- premises.append(
- lexpr(r"all x.((animal(x) & -Ab1(x)) -> -fly(x))")
- ) # normal animals don't fly
- premises.append(
- lexpr(r"all x.((bird(x) & -Ab2(x)) -> fly(x))")
- ) # normal birds fly
- premises.append(
- lexpr(r"all x.((ostrich(x) & -Ab3(x)) -> -fly(x))")
- ) # normal ostriches don't fly
-
- # specify abnormal entities
- premises.append(lexpr(r"all x.(bird(x) -> Ab1(x))")) # flight
- premises.append(lexpr(r"all x.(ostrich(x) -> Ab2(x))")) # non-flying bird
- premises.append(lexpr(r"all x.(flying_ostrich(x) -> Ab3(x))")) # flying ostrich
-
- # define entities
- premises.append(lexpr(r"elephant(E)"))
- premises.append(lexpr(r"dove(D)"))
- premises.append(lexpr(r"ostrich(O)"))
-
- # print the assumptions
+ #define taxonomy
+ premises.append(lexpr(r'all x.(elephant(x) -> animal(x))'))
+ premises.append(lexpr(r'all x.(bird(x) -> animal(x))'))
+ premises.append(lexpr(r'all x.(dove(x) -> bird(x))'))
+ premises.append(lexpr(r'all x.(ostrich(x) -> bird(x))'))
+ premises.append(lexpr(r'all x.(flying_ostrich(x) -> ostrich(x))'))
+
+ #default properties
+ premises.append(lexpr(r'all x.((animal(x) & -Ab1(x)) -> -fly(x))')) #normal animals don't fly
+ premises.append(lexpr(r'all x.((bird(x) & -Ab2(x)) -> fly(x))')) #normal birds fly
+ premises.append(lexpr(r'all x.((ostrich(x) & -Ab3(x)) -> -fly(x))')) #normal ostriches don't fly
+
+ #specify abnormal entities
+ premises.append(lexpr(r'all x.(bird(x) -> Ab1(x))')) #flight
+ premises.append(lexpr(r'all x.(ostrich(x) -> Ab2(x))')) #non-flying bird
+ premises.append(lexpr(r'all x.(flying_ostrich(x) -> Ab3(x))')) #flying ostrich
+
+ #define entities
+ premises.append(lexpr(r'elephant(E)'))
+ premises.append(lexpr(r'dove(D)'))
+ premises.append(lexpr(r'ostrich(O)'))
+
+ #print the assumptions
prover = Prover9Command(None, premises)
command = UniqueNamesProver(ClosedWorldProver(prover))
- for a in command.assumptions():
- print(a)
-
- print_proof("-fly(E)", premises)
- print_proof("fly(D)", premises)
- print_proof("-fly(O)", premises)
+ for a in command.assumptions(): print(a)
+ print_proof('-fly(E)', premises)
+ print_proof('fly(D)', premises)
+ print_proof('-fly(O)', premises)
def print_proof(goal, premises):
lexpr = Expression.fromstring
command = UniqueNamesProver(ClosedWorldProver(prover))
print(goal, prover.prove(), command.prove())
-
def demo():
closed_domain_demo()
unique_names_demo()
combination_prover_demo()
default_reasoning_demo()
-
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: Interface to the Prover9 Theorem Prover
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Dan Garrette <dhgarrette@gmail.com>
# Ewan Klein <ewan@inf.ed.ac.uk>
#
"""
A theorem prover that makes use of the external 'Prover9' package.
"""
+from __future__ import print_function
import os
import subprocess
import nltk
-from nltk.sem.logic import (
- Expression,
- ExistsExpression,
- AllExpression,
- NegatedExpression,
- AndExpression,
- IffExpression,
- OrExpression,
- EqualityExpression,
- ImpExpression,
-)
+from nltk.sem.logic import Expression, ExistsExpression, AllExpression, \
+ NegatedExpression, AndExpression, IffExpression, OrExpression, \
+ EqualityExpression, ImpExpression
from nltk.inference.api import BaseProverCommand, Prover
#
#
p9_return_codes = {
0: True,
- 1: "(FATAL)", # A fatal error occurred (user's syntax error).
- 2: False, # (SOS_EMPTY) Prover9 ran out of things to do
- # (sos list exhausted).
- 3: "(MAX_MEGS)", # The max_megs (memory limit) parameter was exceeded.
- 4: "(MAX_SECONDS)", # The max_seconds parameter was exceeded.
- 5: "(MAX_GIVEN)", # The max_given parameter was exceeded.
- 6: "(MAX_KEPT)", # The max_kept parameter was exceeded.
- 7: "(ACTION)", # A Prover9 action terminated the search.
- 101: "(SIGSEGV)", # Prover9 crashed, most probably due to a bug.
-}
+ 1: "(FATAL)", #A fatal error occurred (user's syntax error).
+ 2: False, # (SOS_EMPTY) Prover9 ran out of things to do
+ # (sos list exhausted).
+ 3: "(MAX_MEGS)", # The max_megs (memory limit) parameter was exceeded.
+ 4: "(MAX_SECONDS)", # The max_seconds parameter was exceeded.
+ 5: "(MAX_GIVEN)", # The max_given parameter was exceeded.
+ 6: "(MAX_KEPT)", # The max_kept parameter was exceeded.
+ 7: "(ACTION)", # A Prover9 action terminated the search.
+ 101: "(SIGSEGV)", # Prover9 crashed, most probably due to a bug.
+ }
class Prover9CommandParent(object):
which is responsible for maintaining a goal and a set of assumptions,
and generating prover9-style input files from them.
"""
-
- def print_assumptions(self, output_format="nltk"):
+ def print_assumptions(self, output_format='nltk'):
"""
Print the list of the current assumptions.
"""
- if output_format.lower() == "nltk":
+ if output_format.lower() == 'nltk':
for a in self.assumptions():
print(a)
- elif output_format.lower() == "prover9":
+ elif output_format.lower() == 'prover9':
for a in convert_to_prover9(self.assumptions()):
print(a)
else:
- raise NameError(
- "Unrecognized value for 'output_format': %s" % output_format
- )
-
+ raise NameError("Unrecognized value for 'output_format': %s" %
+ output_format)
class Prover9Command(Prover9CommandParent, BaseProverCommand):
"""
a print_assumptions() method that is used to print the list
of assumptions in multiple formats.
"""
-
def __init__(self, goal=None, assumptions=None, timeout=60, prover=None):
"""
:param goal: Input expression to prove
:see BaseProverCommand.decorate_proof()
"""
if simplify:
- return self._prover._call_prooftrans(proof_string, ["striplabels"])[
- 0
- ].rstrip()
+ return self._prover._call_prooftrans(proof_string, ['striplabels'])[0].rstrip()
else:
return proof_string.rstrip()
self._binary_location = None
self._prover9_bin = None
else:
- name = "prover9"
+ name = 'prover9'
self._prover9_bin = nltk.internals.find_binary(
- name,
- path_to_bin=binary_location,
- env_vars=["PROVER9"],
- url="http://www.cs.unm.edu/~mccune/prover9/",
- binary_names=[name, name + ".exe"],
- verbose=verbose,
- )
+ name,
+ path_to_bin=binary_location,
+ env_vars=['PROVER9'],
+ url='http://www.cs.unm.edu/~mccune/prover9/',
+ binary_names=[name, name + '.exe'],
+ verbose=verbose)
self._binary_location = self._prover9_bin.rsplit(os.path.sep, 1)
def prover9_input(self, goal, assumptions):
prover9 binary. This string is formed based on the goal,
assumptions, and timeout value of this object.
"""
- s = ""
+ s = ''
if assumptions:
- s += "formulas(assumptions).\n"
+ s += 'formulas(assumptions).\n'
for p9_assumption in convert_to_prover9(assumptions):
- s += " %s.\n" % p9_assumption
- s += "end_of_list.\n\n"
+ s += ' %s.\n' % p9_assumption
+ s += 'end_of_list.\n\n'
if goal:
- s += "formulas(goals).\n"
- s += " %s.\n" % convert_to_prover9(goal)
- s += "end_of_list.\n\n"
+ s += 'formulas(goals).\n'
+ s += ' %s.\n' % convert_to_prover9(goal)
+ s += 'end_of_list.\n\n'
return s
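For a goal ``walk(j)`` with the single assumption ``man(j)``, the generated
input looks like this (a sketch; the exact indentation is cosmetic):

    formulas(assumptions).
        man(j).
    end_of_list.

    formulas(goals).
        walk(j).
    end_of_list.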
executables. This list is used by ``config_prover9`` when searching
for the prover9 executables.
"""
- return [
- "/usr/local/bin/prover9",
- "/usr/local/bin/prover9/bin",
- "/usr/local/bin",
- "/usr/bin",
- "/usr/local/prover9",
- "/usr/local/share/prover9",
- ]
+ return ['/usr/local/bin/prover9',
+ '/usr/local/bin/prover9/bin',
+ '/usr/local/bin',
+ '/usr/bin',
+ '/usr/local/prover9',
+ '/usr/local/share/prover9']
def _find_binary(self, name, verbose=False):
binary_locations = self.binary_locations()
if self._binary_location is not None:
binary_locations += [self._binary_location]
- return nltk.internals.find_binary(
- name,
+ return nltk.internals.find_binary(name,
searchpath=binary_locations,
- env_vars=["PROVER9"],
- url="http://www.cs.unm.edu/~mccune/prover9/",
- binary_names=[name, name + ".exe"],
- verbose=verbose,
- )
+ env_vars=['PROVER9'],
+ url='http://www.cs.unm.edu/~mccune/prover9/',
+ binary_names=[name, name + '.exe'],
+ verbose=verbose)
def _call(self, input_str, binary, args=[], verbose=False):
"""
:see: ``config_prover9``
"""
if verbose:
- print("Calling:", binary)
- print("Args:", args)
- print("Input:\n", input_str, "\n")
+ print('Calling:', binary)
+ print('Args:', args)
+ print('Input:\n', input_str, '\n')
# Call prover9 via a subprocess
cmd = [binary] + args
input_str = input_str.encode("utf8")
except AttributeError:
pass
- p = subprocess.Popen(
- cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, stdin=subprocess.PIPE
- )
+ p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
+ stderr=subprocess.STDOUT,
+ stdin=subprocess.PIPE)
(stdout, stderr) = p.communicate(input=input_str)
if verbose:
- print("Return code:", p.returncode)
- if stdout:
- print("stdout:\n", stdout, "\n")
- if stderr:
- print("stderr:\n", stderr, "\n")
+ print('Return code:', p.returncode)
+ if stdout: print('stdout:\n', stdout, '\n')
+ if stderr: print('stderr:\n', stderr, '\n')
return (stdout.decode("utf-8"), p.returncode)
try:
result.append(_convert_to_prover9(s.simplify()))
except:
- print("input %s cannot be converted to Prover9 input syntax" % input)
+ print('input %s cannot be converted to Prover9 input syntax' % input)
raise
return result
else:
try:
return _convert_to_prover9(input.simplify())
except:
- print("input %s cannot be converted to Prover9 input syntax" % input)
+ print('input %s cannot be converted to Prover9 input syntax' % input)
raise
-
def _convert_to_prover9(expression):
"""
Convert ``logic.Expression`` to Prover9 formatted string.
"""
if isinstance(expression, ExistsExpression):
- return (
- "exists "
- + str(expression.variable)
- + " "
- + _convert_to_prover9(expression.term)
- )
+ return 'exists ' + str(expression.variable) + ' ' + _convert_to_prover9(expression.term)
elif isinstance(expression, AllExpression):
- return (
- "all "
- + str(expression.variable)
- + " "
- + _convert_to_prover9(expression.term)
- )
+ return 'all ' + str(expression.variable) + ' ' + _convert_to_prover9(expression.term)
elif isinstance(expression, NegatedExpression):
- return "-(" + _convert_to_prover9(expression.term) + ")"
+ return '-(' + _convert_to_prover9(expression.term) + ')'
elif isinstance(expression, AndExpression):
- return (
- "("
- + _convert_to_prover9(expression.first)
- + " & "
- + _convert_to_prover9(expression.second)
- + ")"
- )
+ return '(' + _convert_to_prover9(expression.first) + ' & ' + \
+ _convert_to_prover9(expression.second) + ')'
elif isinstance(expression, OrExpression):
- return (
- "("
- + _convert_to_prover9(expression.first)
- + " | "
- + _convert_to_prover9(expression.second)
- + ")"
- )
+ return '(' + _convert_to_prover9(expression.first) + ' | ' + \
+ _convert_to_prover9(expression.second) + ')'
elif isinstance(expression, ImpExpression):
- return (
- "("
- + _convert_to_prover9(expression.first)
- + " -> "
- + _convert_to_prover9(expression.second)
- + ")"
- )
+ return '(' + _convert_to_prover9(expression.first) + ' -> ' + \
+ _convert_to_prover9(expression.second) + ')'
elif isinstance(expression, IffExpression):
- return (
- "("
- + _convert_to_prover9(expression.first)
- + " <-> "
- + _convert_to_prover9(expression.second)
- + ")"
- )
+ return '(' + _convert_to_prover9(expression.first) + ' <-> ' + \
+ _convert_to_prover9(expression.second) + ')'
elif isinstance(expression, EqualityExpression):
- return (
- "("
- + _convert_to_prover9(expression.first)
- + " = "
- + _convert_to_prover9(expression.second)
- + ")"
- )
+ return '(' + _convert_to_prover9(expression.first) + ' = ' + \
+ _convert_to_prover9(expression.second) + ')'
else:
return str(expression)
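For instance:

    from nltk.sem import Expression
    from nltk.inference.prover9 import convert_to_prover9

    # pure string conversion, no external binary needed
    print(convert_to_prover9(Expression.fromstring('all x.(man(x) -> mortal(x))')))
    # -> all x (man(x) -> mortal(x))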
if not assumptions:
assumptions = []
- stdout, returncode = self._call_prover9(
- self.prover9_input(goal, assumptions), verbose=verbose
- )
+ stdout, returncode = self._call_prover9(self.prover9_input(goal, assumptions),
+ verbose=verbose)
return (returncode == 0, stdout)
def prover9_input(self, goal, assumptions):
"""
:see: Prover9Parent.prover9_input
"""
- s = "clear(auto_denials).\n" # only one proof required
+ s = 'clear(auto_denials).\n' #only one proof required
return s + Prover9Parent.prover9_input(self, goal, assumptions)
def _call_prover9(self, input_str, args=[], verbose=False):
:see: ``config_prover9``
"""
if self._prover9_bin is None:
- self._prover9_bin = self._find_binary("prover9", verbose)
+ self._prover9_bin = self._find_binary('prover9', verbose)
- updated_input_str = ""
+ updated_input_str = ''
if self._timeout > 0:
- updated_input_str += "assign(max_seconds, %d).\n\n" % self._timeout
+ updated_input_str += 'assign(max_seconds, %d).\n\n' % self._timeout
updated_input_str += input_str
- stdout, returncode = self._call(
- updated_input_str, self._prover9_bin, args, verbose
- )
+ stdout, returncode = self._call(updated_input_str, self._prover9_bin, args, verbose)
- if returncode not in [0, 2]:
- errormsgprefix = "%%ERROR:"
+ if returncode not in [0,2]:
+ errormsgprefix = '%%ERROR:'
if errormsgprefix in stdout:
msgstart = stdout.index(errormsgprefix)
errormsg = stdout[msgstart:].strip()
else:
errormsg = None
- if returncode in [3, 4, 5, 6]:
+ if returncode in [3,4,5,6]:
raise Prover9LimitExceededException(returncode, errormsg)
else:
raise Prover9FatalException(returncode, errormsg)
:see: ``config_prover9``
"""
if self._prooftrans_bin is None:
- self._prooftrans_bin = self._find_binary("prooftrans", verbose)
+ self._prooftrans_bin = self._find_binary('prooftrans', verbose)
return self._call(input_str, self._prooftrans_bin, args, verbose)
def __init__(self, returncode, message):
msg = p9_return_codes[returncode]
if message:
- msg += "\n%s" % message
+ msg += '\n%s' % message
Exception.__init__(self, msg)
-
class Prover9FatalException(Prover9Exception):
pass
-
class Prover9LimitExceededException(Prover9Exception):
pass
+
######################################################################
-# { Tests and Demos
+#{ Tests and Demos
######################################################################
-
def test_config():
- a = Expression.fromstring("(walk(j) & sing(j))")
- g = Expression.fromstring("walk(j)")
+ a = Expression.fromstring('(walk(j) & sing(j))')
+ g = Expression.fromstring('walk(j)')
p = Prover9Command(g, assumptions=[a])
p._executable_path = None
- p.prover9_search = []
+ p.prover9_search=[]
p.prove()
- # config_prover9('/usr/local/bin')
+ #config_prover9('/usr/local/bin')
print(p.prove())
print(p.proof())
-
def test_convert_to_prover9(expr):
"""
Test that parsing works OK.
e = Expression.fromstring(t)
print(convert_to_prover9(e))
-
def test_prove(arguments):
"""
Try some proofs and exhibit the results.
alist = [Expression.fromstring(a) for a in assumptions]
p = Prover9Command(g, assumptions=alist).prove()
for a in alist:
- print(" %s" % a)
- print("|- %s: %s\n" % (g, p))
-
+ print(' %s' % a)
+ print('|- %s: %s\n' % (g, p))
arguments = [
- ("(man(x) <-> (not (not man(x))))", []),
- ("(not (man(x) & (not man(x))))", []),
- ("(man(x) | (not man(x)))", []),
- ("(man(x) & (not man(x)))", []),
- ("(man(x) -> man(x))", []),
- ("(not (man(x) & (not man(x))))", []),
- ("(man(x) | (not man(x)))", []),
- ("(man(x) -> man(x))", []),
- ("(man(x) <-> man(x))", []),
- ("(not (man(x) <-> (not man(x))))", []),
- ("mortal(Socrates)", ["all x.(man(x) -> mortal(x))", "man(Socrates)"]),
- ("((all x.(man(x) -> walks(x)) & man(Socrates)) -> some y.walks(y))", []),
- ("(all x.man(x) -> all x.man(x))", []),
- ("some x.all y.sees(x,y)", []),
- (
- "some e3.(walk(e3) & subj(e3, mary))",
- [
- "some e1.(see(e1) & subj(e1, john) & some e2.(pred(e1, e2) & walk(e2) & subj(e2, mary)))"
- ],
- ),
- (
- "some x e1.(see(e1) & subj(e1, x) & some e2.(pred(e1, e2) & walk(e2) & subj(e2, mary)))",
- [
- "some e1.(see(e1) & subj(e1, john) & some e2.(pred(e1, e2) & walk(e2) & subj(e2, mary)))"
- ],
- ),
-]
-
-expressions = [
- r"some x y.sees(x,y)",
- r"some x.(man(x) & walks(x))",
- r"\x.(man(x) & walks(x))",
- r"\x y.sees(x,y)",
- r"walks(john)",
- r"\x.big(x, \y.mouse(y))",
- r"(walks(x) & (runs(x) & (threes(x) & fours(x))))",
- r"(walks(x) -> runs(x))",
- r"some x.(PRO(x) & sees(John, x))",
- r"some x.(man(x) & (not walks(x)))",
- r"all x.(man(x) -> walks(x))",
+ ('(man(x) <-> (not (not man(x))))', []),
+ ('(not (man(x) & (not man(x))))', []),
+ ('(man(x) | (not man(x)))', []),
+ ('(man(x) & (not man(x)))', []),
+ ('(man(x) -> man(x))', []),
+ ('(not (man(x) & (not man(x))))', []),
+ ('(man(x) | (not man(x)))', []),
+ ('(man(x) -> man(x))', []),
+ ('(man(x) <-> man(x))', []),
+ ('(not (man(x) <-> (not man(x))))', []),
+ ('mortal(Socrates)', ['all x.(man(x) -> mortal(x))', 'man(Socrates)']),
+ ('((all x.(man(x) -> walks(x)) & man(Socrates)) -> some y.walks(y))', []),
+ ('(all x.man(x) -> all x.man(x))', []),
+ ('some x.all y.sees(x,y)', []),
+ ('some e3.(walk(e3) & subj(e3, mary))',
+ ['some e1.(see(e1) & subj(e1, john) & some e2.(pred(e1, e2) & walk(e2) & subj(e2, mary)))']),
+ ('some x e1.(see(e1) & subj(e1, x) & some e2.(pred(e1, e2) & walk(e2) & subj(e2, mary)))',
+ ['some e1.(see(e1) & subj(e1, john) & some e2.(pred(e1, e2) & walk(e2) & subj(e2, mary)))'])
]
+expressions = [r'some x y.sees(x,y)',
+ r'some x.(man(x) & walks(x))',
+ r'\x.(man(x) & walks(x))',
+ r'\x y.sees(x,y)',
+ r'walks(john)',
+ r'\x.big(x, \y.mouse(y))',
+ r'(walks(x) & (runs(x) & (threes(x) & fours(x))))',
+ r'(walks(x) -> runs(x))',
+ r'some x.(PRO(x) & sees(John, x))',
+ r'some x.(man(x) & (not walks(x)))',
+ r'all x.(man(x) -> walks(x))']
def spacer(num=45):
- print("-" * num)
-
+ print('-' * num)
def demo():
print("Testing configuration")
spacer()
test_prove(arguments)
-
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
#
# Author: Dan Garrette <dhgarrette@gmail.com>
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT
"""
Module for a resolution-based First Order theorem prover.
"""
+from __future__ import print_function, unicode_literals
import operator
from collections import defaultdict
from functools import reduce
from nltk.sem import skolemize
-from nltk.sem.logic import (
- VariableExpression,
- EqualityExpression,
- ApplicationExpression,
- Expression,
- NegatedExpression,
- Variable,
- AndExpression,
- unique_variable,
- OrExpression,
- is_indvar,
- IndividualVariableExpression,
- Expression,
-)
+from nltk.sem.logic import (VariableExpression, EqualityExpression,
+ ApplicationExpression, Expression,
+ NegatedExpression, Variable,
+ AndExpression, unique_variable, OrExpression,
+ is_indvar, IndividualVariableExpression, Expression)
from nltk.inference.api import Prover, BaseProverCommand
+from nltk.compat import python_2_unicode_compatible
-
-class ProverParseError(Exception):
- pass
-
+class ProverParseError(Exception): pass
class ResolutionProver(Prover):
- ANSWER_KEY = "ANSWER"
- _assume_false = True
+ ANSWER_KEY = 'ANSWER'
+ _assume_false=True
def _prove(self, goal=None, assumptions=None, verbose=False):
"""
if verbose:
print(ResolutionProverCommand._decorate_clauses(clauses))
except RuntimeError as e:
- if self._assume_false and str(e).startswith(
- "maximum recursion depth exceeded"
- ):
+ if self._assume_false and str(e).startswith('maximum recursion depth exceeded'):
result = False
clauses = []
else:
return (result, clauses)
def _attempt_proof(self, clauses):
- # map indices to lists of indices, to store attempted unifications
+ #map indices to lists of indices, to store attempted unifications
tried = defaultdict(list)
i = 0
while i < len(clauses):
if not clauses[i].is_tautology():
- # since we try clauses in order, we should start after the last
- # index tried
+ #since we try clauses in order, we should start after the last
+ #index tried
if tried[i]:
j = tried[i][-1] + 1
else:
- j = i + 1 # nothing tried yet for 'i', so start with the next
+ j = i + 1 #nothing tried yet for 'i', so start with the next
while j < len(clauses):
- # don't: 1) unify a clause with itself,
+ #don't: 1) unify a clause with itself,
# 2) use tautologies
if i != j and j and not clauses[j].is_tautology():
tried[i].append(j)
newclauses = clauses[i].unify(clauses[j])
if newclauses:
for newclause in newclauses:
- newclause._parents = (i + 1, j + 1)
+ newclause._parents = (i+1, j+1)
clauses.append(newclause)
- if not len(newclause): # if there's an empty clause
+ if not len(newclause): #if there's an empty clause
return (True, clauses)
- i = -1 # since we added a new clause, restart from the top
+ i=-1 #since we added a new clause, restart from the top
break
j += 1
i += 1
return (False, clauses)
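# One possible trace for the Socrates argument, in the format produced by
# ResolutionProverCommand._decorate_clauses (a sketch; exact clause order may
# vary):
#
#     [1] {-mortal(Socrates)}    A        (negated goal)
#     [2] {-man(x), mortal(x)}   A
#     [3] {man(Socrates)}        A
#     [4] {-man(Socrates)}       (1, 2)
#     [5] {}                     (3, 4)   empty clause, so the proof succeeds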
-
class ResolutionProverCommand(BaseProverCommand):
def __init__(self, goal=None, assumptions=None, prover=None):
"""
re-proving.
"""
if self._result is None:
- self._result, clauses = self._prover._prove(
- self.goal(), self.assumptions(), verbose
- )
+ self._result, clauses = self._prover._prove(self.goal(),
+ self.assumptions(),
+ verbose)
self._clauses = clauses
self._proof = ResolutionProverCommand._decorate_clauses(clauses)
return self._result
answer_ex = VariableExpression(Variable(ResolutionProver.ANSWER_KEY))
for clause in self._clauses:
for term in clause:
- if (
- isinstance(term, ApplicationExpression)
- and term.function == answer_ex
- and not isinstance(term.argument, IndividualVariableExpression)
- ):
+ if isinstance(term, ApplicationExpression) and\
+ term.function == answer_ex and\
+ not isinstance(term.argument, IndividualVariableExpression):
answers.add(term.argument)
return answers
"""
Decorate the proof output.
"""
- out = ""
+ out = ''
max_clause_len = max([len(str(clause)) for clause in clauses])
max_seq_len = len(str(len(clauses)))
for i in range(len(clauses)):
- parents = "A"
- taut = ""
+ parents = 'A'
+ taut = ''
if clauses[i].is_tautology():
- taut = "Tautology"
+ taut = 'Tautology'
if clauses[i]._parents:
parents = str(clauses[i]._parents)
- parents = " " * (max_clause_len - len(str(clauses[i])) + 1) + parents
- seq = " " * (max_seq_len - len(str(i + 1))) + str(i + 1)
- out += "[%s] %s %s %s\n" % (seq, clauses[i], parents, taut)
+ parents = ' '*(max_clause_len-len(str(clauses[i]))+1) + parents
+ seq = ' '*(max_seq_len-len(str(i+1))) + str(i+1)
+ out += '[%s] %s %s %s\n' % (seq, clauses[i], parents, taut)
return out
-
+@python_2_unicode_compatible
class Clause(list):
def __init__(self, data):
list.__init__(self, data)
:return: list containing all the resulting ``Clause`` objects that could be
obtained by unification
"""
- if bindings is None:
- bindings = BindingDict()
- if used is None:
- used = ([], [])
- if skipped is None:
- skipped = ([], [])
- if isinstance(debug, bool):
- debug = DebugObject(debug)
-
- newclauses = _iterate_first(
- self, other, bindings, used, skipped, _complete_unify_path, debug
- )
-
- # remove subsumed clauses. make a list of all indices of subsumed
- # clauses, and then remove them from the list
+ if bindings is None: bindings = BindingDict()
+ if used is None: used = ([],[])
+ if skipped is None: skipped = ([],[])
+ if isinstance(debug, bool): debug = DebugObject(debug)
+
+ newclauses = _iterate_first(self, other, bindings, used, skipped, _complete_unify_path, debug)
+
+ #remove subsumed clauses. make a list of all indices of subsumed
+ #clauses, and then remove them from the list
subsumed = []
for i, c1 in enumerate(newclauses):
if i not in subsumed:
for j, c2 in enumerate(newclauses):
- if i != j and j not in subsumed and c1.subsumes(c2):
+ if i!=j and j not in subsumed and c1.subsumes(c2):
subsumed.append(j)
result = []
for i in range(len(newclauses)):
negatedotherClause = Clause(negatedother)
bindings = BindingDict()
- used = ([], [])
- skipped = ([], [])
+ used = ([],[])
+ skipped = ([],[])
debug = DebugObject(False)
- return (
- len(
- _iterate_first(
- self,
- negatedotherClause,
- bindings,
- used,
- skipped,
- _subsumes_finalize,
- debug,
- )
- )
- > 0
- )
+ return len(_iterate_first(self, negatedotherClause, bindings, used,
+ skipped, _subsumes_finalize,
+ debug)) > 0
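# Subsumption sketch: clause C1 subsumes C2 when some binding maps every
# literal of C1 onto a literal of C2 (e.g. {man(x)} subsumes
# {man(Socrates), mortal(Socrates)}), so C2 adds nothing new and is dropped
# from the unify() results above.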
def __getslice__(self, start, end):
return Clause(list.__getslice__(self, start, end))
"""
if self._is_tautology is not None:
return self._is_tautology
- for i, a in enumerate(self):
+ for i,a in enumerate(self):
if not isinstance(a, EqualityExpression):
- j = len(self) - 1
+ j = len(self)-1
while j > i:
b = self[j]
if isinstance(a, NegatedExpression):
return Clause([atom.substitute_bindings(bindings) for atom in self])
def __str__(self):
- return "{" + ", ".join("%s" % item for item in self) + "}"
+ return '{' + ', '.join("%s" % item for item in self) + '}'
def __repr__(self):
return "%s" % self
-
def _iterate_first(first, second, bindings, used, skipped, finalize_method, debug):
"""
This method facilitates movement through the terms of 'self'
"""
- debug.line("unify(%s,%s) %s" % (first, second, bindings))
+ debug.line('unify(%s,%s) %s'%(first, second, bindings))
- if not len(first) or not len(second): # if no more recursions can be performed
+ if not len(first) or not len(second): #if no more recursions can be performed
return finalize_method(first, second, bindings, used, skipped, debug)
else:
- # explore this 'self' atom
- result = _iterate_second(
- first, second, bindings, used, skipped, finalize_method, debug + 1
- )
+ #explore this 'self' atom
+ result = _iterate_second(first, second, bindings, used, skipped, finalize_method, debug+1)
- # skip this possible 'self' atom
- newskipped = (skipped[0] + [first[0]], skipped[1])
- result += _iterate_first(
- first[1:], second, bindings, used, newskipped, finalize_method, debug + 1
- )
+ #skip this possible 'self' atom
+ newskipped = (skipped[0]+[first[0]], skipped[1])
+ result += _iterate_first(first[1:], second, bindings, used, newskipped, finalize_method, debug+1)
try:
- newbindings, newused, unused = _unify_terms(
- first[0], second[0], bindings, used
- )
- # Unification found, so progress with this line of unification
- # put skipped and unused terms back into play for later unification.
+ newbindings, newused, unused = _unify_terms(first[0], second[0], bindings, used)
+ #Unification found, so progress with this line of unification
+ #put skipped and unused terms back into play for later unification.
newfirst = first[1:] + skipped[0] + unused[0]
newsecond = second[1:] + skipped[1] + unused[1]
- result += _iterate_first(
- newfirst,
- newsecond,
- newbindings,
- newused,
- ([], []),
- finalize_method,
- debug + 1,
- )
+ result += _iterate_first(newfirst, newsecond, newbindings, newused, ([],[]), finalize_method, debug+1)
except BindingException:
- # the atoms could not be unified,
+ #the atoms could not be unified,
pass
return result
-
def _iterate_second(first, second, bindings, used, skipped, finalize_method, debug):
"""
This method facilitates movement through the terms of 'other'
"""
- debug.line("unify(%s,%s) %s" % (first, second, bindings))
+ debug.line('unify(%s,%s) %s'%(first, second, bindings))
- if not len(first) or not len(second): # if no more recursions can be performed
+ if not len(first) or not len(second): #if no more recursions can be performed
return finalize_method(first, second, bindings, used, skipped, debug)
else:
- # skip this possible pairing and move to the next
- newskipped = (skipped[0], skipped[1] + [second[0]])
- result = _iterate_second(
- first, second[1:], bindings, used, newskipped, finalize_method, debug + 1
- )
+ #skip this possible pairing and move to the next
+ newskipped = (skipped[0], skipped[1]+[second[0]])
+ result = _iterate_second(first, second[1:], bindings, used, newskipped, finalize_method, debug+1)
try:
- newbindings, newused, unused = _unify_terms(
- first[0], second[0], bindings, used
- )
- # Unification found, so progress with this line of unification
- # put skipped and unused terms back into play for later unification.
+ newbindings, newused, unused = _unify_terms(first[0], second[0], bindings, used)
+ #Unification found, so progress with this line of unification
+ #put skipped and unused terms back into play for later unification.
newfirst = first[1:] + skipped[0] + unused[0]
newsecond = second[1:] + skipped[1] + unused[1]
- result += _iterate_second(
- newfirst,
- newsecond,
- newbindings,
- newused,
- ([], []),
- finalize_method,
- debug + 1,
- )
+ result += _iterate_second(newfirst, newsecond, newbindings, newused, ([],[]), finalize_method, debug+1)
except BindingException:
- # the atoms could not be unified,
+ #the atoms could not be unified,
pass
return result
-
def _unify_terms(a, b, bindings=None, used=None):
"""
This method attempts to unify two terms. Two expressions are unifiable
assert isinstance(a, Expression)
assert isinstance(b, Expression)
- if bindings is None:
- bindings = BindingDict()
- if used is None:
- used = ([], [])
+ if bindings is None: bindings = BindingDict()
+ if used is None: used = ([],[])
# Use resolution
if isinstance(a, NegatedExpression) and isinstance(b, ApplicationExpression):
newbindings = most_general_unification(a.term, b, bindings)
- newused = (used[0] + [a], used[1] + [b])
- unused = ([], [])
+ newused = (used[0]+[a], used[1]+[b])
+ unused = ([],[])
elif isinstance(a, ApplicationExpression) and isinstance(b, NegatedExpression):
newbindings = most_general_unification(a, b.term, bindings)
- newused = (used[0] + [a], used[1] + [b])
- unused = ([], [])
+ newused = (used[0]+[a], used[1]+[b])
+ unused = ([],[])
# Use demodulation
elif isinstance(a, EqualityExpression):
newbindings = BindingDict([(a.first.variable, a.second)])
- newused = (used[0] + [a], used[1])
- unused = ([], [b])
+ newused = (used[0]+[a], used[1])
+ unused = ([],[b])
elif isinstance(b, EqualityExpression):
newbindings = BindingDict([(b.first.variable, b.second)])
- newused = (used[0], used[1] + [b])
- unused = ([a], [])
+ newused = (used[0], used[1]+[b])
+ unused = ([a],[])
else:
raise BindingException((a, b))
return newbindings, newused, unused
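# Example (a sketch, assuming the nltk.sem.logic reader):
#
#     a = Expression.fromstring('-man(x)')
#     b = Expression.fromstring('man(Socrates)')
#     bindings, used, unused = _unify_terms(a, b)
#     print(bindings)    # expected: {x: Socrates}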
-
def _complete_unify_path(first, second, bindings, used, skipped, debug):
- if used[0] or used[1]: # if bindings were made along the path
+ if used[0] or used[1]: #if bindings were made along the path
newclause = Clause(skipped[0] + skipped[1] + first + second)
- debug.line(" -> New Clause: %s" % newclause)
+ debug.line(' -> New Clause: %s' % newclause)
return [newclause.substitute_bindings(bindings)]
- else: # no bindings made means no unification occurred. so no result
- debug.line(" -> End")
+ else: #no bindings made means no unification occurred. so no result
+ debug.line(' -> End')
return []
-
def _subsumes_finalize(first, second, bindings, used, skipped, debug):
if not len(skipped[0]) and not len(first):
- # If there are no skipped terms and no terms left in 'first', then
- # all of the terms in the original 'self' were unified with terms
- # in 'other'. Therefore, there exists a binding (this one) such that
- # every term in self can be unified with a term in other, which
- # is the definition of subsumption.
+ #If there are no skipped terms and no terms left in 'first', then
+ #all of the terms in the original 'self' were unified with terms
+ #in 'other'. Therefore, there exists a binding (this one) such that
+ #every term in self can be unified with a term in other, which
+ #is the definition of subsumption.
return [True]
else:
return []
-
def clausify(expression):
"""
Skolemize, clausify, and standardize the variables apart.
clause_list.append(clause)
return clause_list
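# Example (a sketch): the CNF of '(P(x) & Q(x)) | R(x)' is
# '((P(x) | R(x)) & (Q(x) | R(x)))', so clausify is expected to yield two
# clauses along the lines of [{P(x), R(x)}, {Q(x), R(x)}], modulo the
# standardizing-apart of variables.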
-
def _clausify(expression):
"""
:param expression: a skolemized expression in CNF
raise ProverParseError()
+@python_2_unicode_compatible
class BindingDict(object):
def __init__(self, binding_list=None):
"""
if not existing or binding2 == existing:
self.d[binding.variable] = binding2
else:
- raise BindingException(
- "Variable %s already bound to another " "value" % (variable)
- )
+ raise BindingException('Variable %s already bound to another '
+ 'value' % (variable))
else:
- raise BindingException(
- "Variable %s already bound to another " "value" % (variable)
- )
+ raise BindingException('Variable %s already bound to another '
+ 'value' % (variable))
def __getitem__(self, variable):
"""
combined[v] = other.d[v]
return combined
except BindingException:
- raise BindingException(
- "Attempting to add two contradicting "
- "BindingDicts: '%s' and '%s'" % (self, other)
- )
+ raise BindingException("Attempting to add two contradicting "
+ "BindingDicts: '%s' and '%s'"
+ % (self, other))
def __len__(self):
return len(self.d)
def __str__(self):
- data_str = ", ".join("%s: %s" % (v, self.d[v]) for v in sorted(self.d.keys()))
- return "{" + data_str + "}"
+ data_str = ', '.join('%s: %s' % (v, self.d[v]) for v in sorted(self.d.keys()))
+ return '{' + data_str + '}'
def __repr__(self):
return "%s" % self
return _mgu_var(a, b, bindings)
elif isinstance(b, IndividualVariableExpression):
return _mgu_var(b, a, bindings)
- elif isinstance(a, ApplicationExpression) and isinstance(b, ApplicationExpression):
- return most_general_unification(
- a.function, b.function, bindings
- ) + most_general_unification(a.argument, b.argument, bindings)
+ elif isinstance(a, ApplicationExpression) and\
+ isinstance(b, ApplicationExpression):
+ return most_general_unification(a.function, b.function, bindings) +\
+ most_general_unification(a.argument, b.argument, bindings)
raise BindingException((a, b))
-
def _mgu_var(var, expression, bindings):
- if var.variable in expression.free() | expression.constants():
+ if var.variable in expression.free()|expression.constants():
raise BindingException((var, expression))
else:
return BindingDict([(var.variable, expression)]) + bindings
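# The free()/constants() membership test above is the occurs check: binding a
# variable to an expression that already contains it (e.g. unifying 'x' with
# 'f(x)') could never terminate, so a BindingException is raised instead.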
else:
Exception.__init__(self, arg)
-
class UnificationException(Exception):
def __init__(self, a, b):
- Exception.__init__(self, "'%s' cannot unify with '%s'" % (a, b))
+ Exception.__init__(self, "'%s' cannot unify with '%s'" % (a,b))
class DebugObject(object):
self.indent = indent
def __add__(self, i):
- return DebugObject(self.enabled, self.indent + i)
+ return DebugObject(self.enabled, self.indent+i)
def line(self, line):
if self.enabled:
- print(" " * self.indent + line)
+ print(' '*self.indent + line)
def testResolutionProver():
- resolution_test(r"man(x)")
- resolution_test(r"(man(x) -> man(x))")
- resolution_test(r"(man(x) -> --man(x))")
- resolution_test(r"-(man(x) and -man(x))")
- resolution_test(r"(man(x) or -man(x))")
- resolution_test(r"(man(x) -> man(x))")
- resolution_test(r"-(man(x) and -man(x))")
- resolution_test(r"(man(x) or -man(x))")
- resolution_test(r"(man(x) -> man(x))")
- resolution_test(r"(man(x) iff man(x))")
- resolution_test(r"-(man(x) iff -man(x))")
- resolution_test("all x.man(x)")
- resolution_test("-all x.some y.F(x,y) & some x.all y.(-F(x,y))")
- resolution_test("some x.all y.sees(x,y)")
-
- p1 = Expression.fromstring(r"all x.(man(x) -> mortal(x))")
- p2 = Expression.fromstring(r"man(Socrates)")
- c = Expression.fromstring(r"mortal(Socrates)")
- print("%s, %s |- %s: %s" % (p1, p2, c, ResolutionProver().prove(c, [p1, p2])))
-
- p1 = Expression.fromstring(r"all x.(man(x) -> walks(x))")
- p2 = Expression.fromstring(r"man(John)")
- c = Expression.fromstring(r"some y.walks(y)")
- print("%s, %s |- %s: %s" % (p1, p2, c, ResolutionProver().prove(c, [p1, p2])))
-
- p = Expression.fromstring(r"some e1.some e2.(believe(e1,john,e2) & walk(e2,mary))")
- c = Expression.fromstring(r"some e0.walk(e0,mary)")
- print("%s |- %s: %s" % (p, c, ResolutionProver().prove(c, [p])))
-
+ resolution_test(r'man(x)')
+ resolution_test(r'(man(x) -> man(x))')
+ resolution_test(r'(man(x) -> --man(x))')
+ resolution_test(r'-(man(x) and -man(x))')
+ resolution_test(r'(man(x) or -man(x))')
+ resolution_test(r'(man(x) -> man(x))')
+ resolution_test(r'-(man(x) and -man(x))')
+ resolution_test(r'(man(x) or -man(x))')
+ resolution_test(r'(man(x) -> man(x))')
+ resolution_test(r'(man(x) iff man(x))')
+ resolution_test(r'-(man(x) iff -man(x))')
+ resolution_test('all x.man(x)')
+ resolution_test('-all x.some y.F(x,y) & some x.all y.(-F(x,y))')
+ resolution_test('some x.all y.sees(x,y)')
+
+ p1 = Expression.fromstring(r'all x.(man(x) -> mortal(x))')
+ p2 = Expression.fromstring(r'man(Socrates)')
+ c = Expression.fromstring(r'mortal(Socrates)')
+ print('%s, %s |- %s: %s' % (p1, p2, c, ResolutionProver().prove(c, [p1,p2])))
+
+ p1 = Expression.fromstring(r'all x.(man(x) -> walks(x))')
+ p2 = Expression.fromstring(r'man(John)')
+ c = Expression.fromstring(r'some y.walks(y)')
+ print('%s, %s |- %s: %s' % (p1, p2, c, ResolutionProver().prove(c, [p1,p2])))
+
+ p = Expression.fromstring(r'some e1.some e2.(believe(e1,john,e2) & walk(e2,mary))')
+ c = Expression.fromstring(r'some e0.walk(e0,mary)')
+ print('%s |- %s: %s' % (p, c, ResolutionProver().prove(c, [p])))
def resolution_test(e):
f = Expression.fromstring(e)
t = ResolutionProver().prove(f)
- print("|- %s: %s" % (f, t))
-
+ print('|- %s: %s' % (f, t))
def test_clausify():
lexpr = Expression.fromstring
- print(clausify(lexpr("P(x) | Q(x)")))
- print(clausify(lexpr("(P(x) & Q(x)) | R(x)")))
- print(clausify(lexpr("P(x) | (Q(x) & R(x))")))
- print(clausify(lexpr("(P(x) & Q(x)) | (R(x) & S(x))")))
+ print(clausify(lexpr('P(x) | Q(x)')))
+ print(clausify(lexpr('(P(x) & Q(x)) | R(x)')))
+ print(clausify(lexpr('P(x) | (Q(x) & R(x))')))
+ print(clausify(lexpr('(P(x) & Q(x)) | (R(x) & S(x))')))
- print(clausify(lexpr("P(x) | Q(x) | R(x)")))
- print(clausify(lexpr("P(x) | (Q(x) & R(x)) | S(x)")))
+ print(clausify(lexpr('P(x) | Q(x) | R(x)')))
+ print(clausify(lexpr('P(x) | (Q(x) & R(x)) | S(x)')))
- print(clausify(lexpr("exists x.P(x) | Q(x)")))
+ print(clausify(lexpr('exists x.P(x) | Q(x)')))
- print(clausify(lexpr("-(-P(x) & Q(x))")))
- print(clausify(lexpr("P(x) <-> Q(x)")))
- print(clausify(lexpr("-(P(x) <-> Q(x))")))
- print(clausify(lexpr("-(all x.P(x))")))
- print(clausify(lexpr("-(some x.P(x))")))
+ print(clausify(lexpr('-(-P(x) & Q(x))')))
+ print(clausify(lexpr('P(x) <-> Q(x)')))
+ print(clausify(lexpr('-(P(x) <-> Q(x))')))
+ print(clausify(lexpr('-(all x.P(x))')))
+ print(clausify(lexpr('-(some x.P(x))')))
- print(clausify(lexpr("some x.P(x)")))
- print(clausify(lexpr("some x.all y.P(x,y)")))
- print(clausify(lexpr("all y.some x.P(x,y)")))
- print(clausify(lexpr("all z.all y.some x.P(x,y,z)")))
- print(clausify(lexpr("all x.(all y.P(x,y) -> -all y.(Q(x,y) -> R(x,y)))")))
+ print(clausify(lexpr('some x.P(x)')))
+ print(clausify(lexpr('some x.all y.P(x,y)')))
+ print(clausify(lexpr('all y.some x.P(x,y)')))
+ print(clausify(lexpr('all z.all y.some x.P(x,y,z)')))
+ print(clausify(lexpr('all x.(all y.P(x,y) -> -all y.(Q(x,y) -> R(x,y)))')))
def demo():
testResolutionProver()
print()
- p = Expression.fromstring("man(x)")
+ p = Expression.fromstring('man(x)')
print(ResolutionProverCommand(p, [p]).prove())
-
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: First-Order Tableau Theorem Prover
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Dan Garrette <dhgarrette@gmail.com>
#
# URL: <http://nltk.org/>
"""
Module for a tableau-based First Order theorem prover.
"""
+from __future__ import print_function, unicode_literals
from nltk.internals import Counter
-from nltk.sem.logic import (
- VariableExpression,
- EqualityExpression,
- ApplicationExpression,
- Expression,
- AbstractVariableExpression,
- AllExpression,
- NegatedExpression,
- ExistsExpression,
- Variable,
- ImpExpression,
- AndExpression,
- unique_variable,
- LambdaExpression,
- IffExpression,
- OrExpression,
- FunctionVariableExpression,
-)
+from nltk.sem.logic import (VariableExpression, EqualityExpression,
+ ApplicationExpression, Expression,
+ AbstractVariableExpression, AllExpression,
+ NegatedExpression,
+ ExistsExpression, Variable, ImpExpression,
+ AndExpression, unique_variable,
+ LambdaExpression, IffExpression,
+ OrExpression, FunctionVariableExpression)
from nltk.inference.api import Prover, BaseProverCommand
_counter = Counter()
-
-class ProverParseError(Exception):
- pass
-
+class ProverParseError(Exception): pass
class TableauProver(Prover):
- _assume_false = False
+ _assume_false=False
def _prove(self, goal=None, assumptions=None, verbose=False):
if not assumptions:
debugger = Debug(verbose)
result = self._attempt_proof(agenda, set(), set(), debugger)
except RuntimeError as e:
- if self._assume_false and str(e).startswith(
- "maximum recursion depth exceeded"
- ):
+ if self._assume_false and str(e).startswith('maximum recursion depth exceeded'):
result = False
else:
if verbose:
print(e)
else:
raise e
- return (result, "\n".join(debugger.lines))
+ return (result, '\n'.join(debugger.lines))
def _attempt_proof(self, agenda, accessible_vars, atoms, debug):
(current, context), category = agenda.pop_first()
- # if there's nothing left in the agenda, and we haven't closed the path
+ #if there's nothing left in the agenda, and we haven't closed the path
if not current:
- debug.line("AGENDA EMPTY")
+ debug.line('AGENDA EMPTY')
return False
- proof_method = {
- Categories.ATOM: self._attempt_proof_atom,
- Categories.PROP: self._attempt_proof_prop,
- Categories.N_ATOM: self._attempt_proof_n_atom,
- Categories.N_PROP: self._attempt_proof_n_prop,
- Categories.APP: self._attempt_proof_app,
- Categories.N_APP: self._attempt_proof_n_app,
- Categories.N_EQ: self._attempt_proof_n_eq,
- Categories.D_NEG: self._attempt_proof_d_neg,
- Categories.N_ALL: self._attempt_proof_n_all,
- Categories.N_EXISTS: self._attempt_proof_n_some,
- Categories.AND: self._attempt_proof_and,
- Categories.N_OR: self._attempt_proof_n_or,
- Categories.N_IMP: self._attempt_proof_n_imp,
- Categories.OR: self._attempt_proof_or,
- Categories.IMP: self._attempt_proof_imp,
- Categories.N_AND: self._attempt_proof_n_and,
- Categories.IFF: self._attempt_proof_iff,
- Categories.N_IFF: self._attempt_proof_n_iff,
- Categories.EQ: self._attempt_proof_eq,
- Categories.EXISTS: self._attempt_proof_some,
- Categories.ALL: self._attempt_proof_all,
- }[category]
+ proof_method = { Categories.ATOM: self._attempt_proof_atom,
+ Categories.PROP: self._attempt_proof_prop,
+ Categories.N_ATOM: self._attempt_proof_n_atom,
+ Categories.N_PROP: self._attempt_proof_n_prop,
+ Categories.APP: self._attempt_proof_app,
+ Categories.N_APP: self._attempt_proof_n_app,
+ Categories.N_EQ: self._attempt_proof_n_eq,
+ Categories.D_NEG: self._attempt_proof_d_neg,
+ Categories.N_ALL: self._attempt_proof_n_all,
+ Categories.N_EXISTS: self._attempt_proof_n_some,
+ Categories.AND: self._attempt_proof_and,
+ Categories.N_OR: self._attempt_proof_n_or,
+ Categories.N_IMP: self._attempt_proof_n_imp,
+ Categories.OR: self._attempt_proof_or,
+ Categories.IMP: self._attempt_proof_imp,
+ Categories.N_AND: self._attempt_proof_n_and,
+ Categories.IFF: self._attempt_proof_iff,
+ Categories.N_IFF: self._attempt_proof_n_iff,
+ Categories.EQ: self._attempt_proof_eq,
+ Categories.EXISTS: self._attempt_proof_some,
+ Categories.ALL: self._attempt_proof_all,
+ }[category]
debug.line((current, context))
return proof_method(current, context, agenda, accessible_vars, atoms, debug)
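# Dispatch sketch: conjunctive rules (AND, N_OR, N_IMP) extend the current
# branch in place, while disjunctive rules (OR, IMP, N_AND, IFF, N_IFF) clone
# the agenda and require both resulting branches to close; see the individual
# _attempt_proof_* methods below.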
- def _attempt_proof_atom(
- self, current, context, agenda, accessible_vars, atoms, debug
- ):
+ def _attempt_proof_atom(self, current, context, agenda, accessible_vars, atoms, debug):
# Check if the branch is closed. Return 'True' if it is
if (current, True) in atoms:
- debug.line("CLOSED", 1)
+ debug.line('CLOSED', 1)
return True
if context:
if isinstance(context.term, NegatedExpression):
current = current.negate()
agenda.put(context(current).simplify())
- return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1)
+ return self._attempt_proof(agenda, accessible_vars, atoms, debug+1)
else:
- # mark all AllExpressions as 'not exhausted' into the agenda since we are (potentially) adding new accessible vars
- agenda.mark_alls_fresh()
- return self._attempt_proof(
- agenda,
- accessible_vars | set(current.args),
- atoms | set([(current, False)]),
- debug + 1,
- )
-
- def _attempt_proof_n_atom(
- self, current, context, agenda, accessible_vars, atoms, debug
- ):
+ #mark all AllExpressions as 'not exhausted' into the agenda since we are (potentially) adding new accessible vars
+ agenda.mark_alls_fresh();
+ return self._attempt_proof(agenda, accessible_vars|set(current.args), atoms|set([(current, False)]), debug+1)
+
+ def _attempt_proof_n_atom(self, current, context, agenda, accessible_vars, atoms, debug):
# Check if the branch is closed. Return 'True' if it is
if (current.term, False) in atoms:
- debug.line("CLOSED", 1)
+ debug.line('CLOSED', 1)
return True
if context:
if isinstance(context.term, NegatedExpression):
current = current.negate()
agenda.put(context(current).simplify())
- return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1)
+ return self._attempt_proof(agenda, accessible_vars, atoms, debug+1)
else:
- # mark all AllExpressions as 'not exhausted' into the agenda since we are (potentially) adding new accessible vars
- agenda.mark_alls_fresh()
- return self._attempt_proof(
- agenda,
- accessible_vars | set(current.term.args),
- atoms | set([(current.term, True)]),
- debug + 1,
- )
-
- def _attempt_proof_prop(
- self, current, context, agenda, accessible_vars, atoms, debug
- ):
+ #mark all AllExpressions as 'not exhausted' into the agenda since we are (potentially) adding new accessible vars
+ agenda.mark_alls_fresh();
+ return self._attempt_proof(agenda, accessible_vars|set(current.term.args), atoms|set([(current.term, True)]), debug+1)
+
+ def _attempt_proof_prop(self, current, context, agenda, accessible_vars, atoms, debug):
# Check if the branch is closed. Return 'True' if it is
if (current, True) in atoms:
- debug.line("CLOSED", 1)
+ debug.line('CLOSED', 1)
return True
- # mark all AllExpressions as 'not exhausted' into the agenda since we are (potentially) adding new accessible vars
- agenda.mark_alls_fresh()
- return self._attempt_proof(
- agenda, accessible_vars, atoms | set([(current, False)]), debug + 1
- )
+ #mark all AllExpressions as 'not exhausted' into the agenda since we are (potentially) adding new accessible vars
+ agenda.mark_alls_fresh();
+ return self._attempt_proof(agenda, accessible_vars, atoms|set([(current, False)]), debug+1)
- def _attempt_proof_n_prop(
- self, current, context, agenda, accessible_vars, atoms, debug
- ):
+ def _attempt_proof_n_prop(self, current, context, agenda, accessible_vars, atoms, debug):
# Check if the branch is closed. Return 'True' if it is
if (current.term, False) in atoms:
- debug.line("CLOSED", 1)
+ debug.line('CLOSED', 1)
return True
- # mark all AllExpressions as 'not exhausted' into the agenda since we are (potentially) adding new accessible vars
- agenda.mark_alls_fresh()
- return self._attempt_proof(
- agenda, accessible_vars, atoms | set([(current.term, True)]), debug + 1
- )
+ #mark all AllExpressions as 'not exhausted' into the agenda since we are (potentially) adding new accessible vars
+ agenda.mark_alls_fresh();
+ return self._attempt_proof(agenda, accessible_vars, atoms|set([(current.term, True)]), debug+1)
- def _attempt_proof_app(
- self, current, context, agenda, accessible_vars, atoms, debug
- ):
+ def _attempt_proof_app(self, current, context, agenda, accessible_vars, atoms, debug):
f, args = current.uncurry()
for i, arg in enumerate(args):
if not TableauProver.is_atom(arg):
ctx = f
- nv = Variable("X%s" % _counter.get())
- for j, a in enumerate(args):
- ctx = ctx(VariableExpression(nv)) if i == j else ctx(a)
+ nv = Variable('X%s' % _counter.get())
+ for j,a in enumerate(args):
+ ctx = (ctx(VariableExpression(nv)) if i == j else ctx(a))
if context:
ctx = context(ctx).simplify()
ctx = LambdaExpression(nv, ctx)
agenda.put(arg, ctx)
- return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1)
- raise Exception("If this method is called, there must be a non-atomic argument")
+ return self._attempt_proof(agenda, accessible_vars, atoms, debug+1)
+ raise Exception('If this method is called, there must be a non-atomic argument')
- def _attempt_proof_n_app(
- self, current, context, agenda, accessible_vars, atoms, debug
- ):
+ def _attempt_proof_n_app(self, current, context, agenda, accessible_vars, atoms, debug):
f, args = current.term.uncurry()
for i, arg in enumerate(args):
if not TableauProver.is_atom(arg):
ctx = f
- nv = Variable("X%s" % _counter.get())
- for j, a in enumerate(args):
- ctx = ctx(VariableExpression(nv)) if i == j else ctx(a)
+ nv = Variable('X%s' % _counter.get())
+ for j,a in enumerate(args):
+ ctx = (ctx(VariableExpression(nv)) if i == j else ctx(a))
if context:
- # combine new context with existing
+ #combine new context with existing
ctx = context(ctx).simplify()
ctx = LambdaExpression(nv, -ctx)
agenda.put(-arg, ctx)
- return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1)
- raise Exception("If this method is called, there must be a non-atomic argument")
+ return self._attempt_proof(agenda, accessible_vars, atoms, debug+1)
+ raise Exception('If this method is called, there must be a non-atomic argument')
- def _attempt_proof_n_eq(
- self, current, context, agenda, accessible_vars, atoms, debug
- ):
+ def _attempt_proof_n_eq(self, current, context, agenda, accessible_vars, atoms, debug):
###########################################################################
# Since 'current' is of type '~(a=b)', the path is closed if 'a' == 'b'
###########################################################################
if current.term.first == current.term.second:
- debug.line("CLOSED", 1)
+ debug.line('CLOSED', 1)
return True
- agenda[Categories.N_EQ].add((current, context))
+ agenda[Categories.N_EQ].add((current,context))
current._exhausted = True
- return self._attempt_proof(
- agenda,
- accessible_vars | set([current.term.first, current.term.second]),
- atoms,
- debug + 1,
- )
-
- def _attempt_proof_d_neg(
- self, current, context, agenda, accessible_vars, atoms, debug
- ):
+ return self._attempt_proof(agenda, accessible_vars|set([current.term.first, current.term.second]), atoms, debug+1)
+
+ def _attempt_proof_d_neg(self, current, context, agenda, accessible_vars, atoms, debug):
agenda.put(current.term.term, context)
- return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1)
-
- def _attempt_proof_n_all(
- self, current, context, agenda, accessible_vars, atoms, debug
- ):
- agenda[Categories.EXISTS].add(
- (ExistsExpression(current.term.variable, -current.term.term), context)
- )
- return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1)
-
- def _attempt_proof_n_some(
- self, current, context, agenda, accessible_vars, atoms, debug
- ):
- agenda[Categories.ALL].add(
- (AllExpression(current.term.variable, -current.term.term), context)
- )
- return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1)
-
- def _attempt_proof_and(
- self, current, context, agenda, accessible_vars, atoms, debug
- ):
+ return self._attempt_proof(agenda, accessible_vars, atoms, debug+1)
+
+ def _attempt_proof_n_all(self, current, context, agenda, accessible_vars, atoms, debug):
+ agenda[Categories.EXISTS].add((ExistsExpression(current.term.variable, -current.term.term), context))
+ return self._attempt_proof(agenda, accessible_vars, atoms, debug+1)
+
+ def _attempt_proof_n_some(self, current, context, agenda, accessible_vars, atoms, debug):
+ agenda[Categories.ALL].add((AllExpression(current.term.variable, -current.term.term), context))
+ return self._attempt_proof(agenda, accessible_vars, atoms, debug+1)
+
+ def _attempt_proof_and(self, current, context, agenda, accessible_vars, atoms, debug):
agenda.put(current.first, context)
agenda.put(current.second, context)
- return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1)
+ return self._attempt_proof(agenda, accessible_vars, atoms, debug+1)
- def _attempt_proof_n_or(
- self, current, context, agenda, accessible_vars, atoms, debug
- ):
+ def _attempt_proof_n_or(self, current, context, agenda, accessible_vars, atoms, debug):
agenda.put(-current.term.first, context)
agenda.put(-current.term.second, context)
- return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1)
+ return self._attempt_proof(agenda, accessible_vars, atoms, debug+1)
- def _attempt_proof_n_imp(
- self, current, context, agenda, accessible_vars, atoms, debug
- ):
+ def _attempt_proof_n_imp(self, current, context, agenda, accessible_vars, atoms, debug):
agenda.put(current.term.first, context)
agenda.put(-current.term.second, context)
- return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1)
+ return self._attempt_proof(agenda, accessible_vars, atoms, debug+1)
- def _attempt_proof_or(
- self, current, context, agenda, accessible_vars, atoms, debug
- ):
+ def _attempt_proof_or(self, current, context, agenda, accessible_vars, atoms, debug):
new_agenda = agenda.clone()
agenda.put(current.first, context)
new_agenda.put(current.second, context)
- return self._attempt_proof(
- agenda, accessible_vars, atoms, debug + 1
- ) and self._attempt_proof(new_agenda, accessible_vars, atoms, debug + 1)
+ return self._attempt_proof(agenda, accessible_vars, atoms, debug+1) and \
+ self._attempt_proof(new_agenda, accessible_vars, atoms, debug+1)
- def _attempt_proof_imp(
- self, current, context, agenda, accessible_vars, atoms, debug
- ):
+ def _attempt_proof_imp(self, current, context, agenda, accessible_vars, atoms, debug):
new_agenda = agenda.clone()
agenda.put(-current.first, context)
new_agenda.put(current.second, context)
- return self._attempt_proof(
- agenda, accessible_vars, atoms, debug + 1
- ) and self._attempt_proof(new_agenda, accessible_vars, atoms, debug + 1)
+ return self._attempt_proof(agenda, accessible_vars, atoms, debug+1) and \
+ self._attempt_proof(new_agenda, accessible_vars, atoms, debug+1)
- def _attempt_proof_n_and(
- self, current, context, agenda, accessible_vars, atoms, debug
- ):
+ def _attempt_proof_n_and(self, current, context, agenda, accessible_vars, atoms, debug):
new_agenda = agenda.clone()
agenda.put(-current.term.first, context)
new_agenda.put(-current.term.second, context)
- return self._attempt_proof(
- agenda, accessible_vars, atoms, debug + 1
- ) and self._attempt_proof(new_agenda, accessible_vars, atoms, debug + 1)
+ return self._attempt_proof(agenda, accessible_vars, atoms, debug+1) and \
+ self._attempt_proof(new_agenda, accessible_vars, atoms, debug+1)
- def _attempt_proof_iff(
- self, current, context, agenda, accessible_vars, atoms, debug
- ):
+ def _attempt_proof_iff(self, current, context, agenda, accessible_vars, atoms, debug):
new_agenda = agenda.clone()
agenda.put(current.first, context)
agenda.put(current.second, context)
new_agenda.put(-current.first, context)
new_agenda.put(-current.second, context)
- return self._attempt_proof(
- agenda, accessible_vars, atoms, debug + 1
- ) and self._attempt_proof(new_agenda, accessible_vars, atoms, debug + 1)
+ return self._attempt_proof(agenda, accessible_vars, atoms, debug+1) and \
+ self._attempt_proof(new_agenda, accessible_vars, atoms, debug+1)
- def _attempt_proof_n_iff(
- self, current, context, agenda, accessible_vars, atoms, debug
- ):
+ def _attempt_proof_n_iff(self, current, context, agenda, accessible_vars, atoms, debug):
new_agenda = agenda.clone()
agenda.put(current.term.first, context)
agenda.put(-current.term.second, context)
new_agenda.put(-current.term.first, context)
new_agenda.put(current.term.second, context)
- return self._attempt_proof(
- agenda, accessible_vars, atoms, debug + 1
- ) and self._attempt_proof(new_agenda, accessible_vars, atoms, debug + 1)
+ return self._attempt_proof(agenda, accessible_vars, atoms, debug+1) and \
+ self._attempt_proof(new_agenda, accessible_vars, atoms, debug+1)
- def _attempt_proof_eq(
- self, current, context, agenda, accessible_vars, atoms, debug
- ):
+ def _attempt_proof_eq(self, current, context, agenda, accessible_vars, atoms, debug):
#########################################################################
# Since 'current' is of the form '(a = b)', replace ALL free instances
# of 'a' with 'b'
agenda.put_atoms(atoms)
agenda.replace_all(current.first, current.second)
accessible_vars.discard(current.first)
- agenda.mark_neqs_fresh()
- return self._attempt_proof(agenda, accessible_vars, set(), debug + 1)
+ agenda.mark_neqs_fresh();
+ return self._attempt_proof(agenda, accessible_vars, set(), debug+1)
- def _attempt_proof_some(
- self, current, context, agenda, accessible_vars, atoms, debug
- ):
+ def _attempt_proof_some(self, current, context, agenda, accessible_vars, atoms, debug):
new_unique_variable = VariableExpression(unique_variable())
agenda.put(current.term.replace(current.variable, new_unique_variable), context)
agenda.mark_alls_fresh()
- return self._attempt_proof(
- agenda, accessible_vars | set([new_unique_variable]), atoms, debug + 1
- )
+ return self._attempt_proof(agenda, accessible_vars|set([new_unique_variable]), atoms, debug+1)
- def _attempt_proof_all(
- self, current, context, agenda, accessible_vars, atoms, debug
- ):
+ def _attempt_proof_all(self, current, context, agenda, accessible_vars, atoms, debug):
try:
current._used_vars
except AttributeError:
current._used_vars = set()
- # if there are accessible_vars on the path
+ #if there are accessible_vars on the path
if accessible_vars:
# get the set of bound variables that have not been used by this AllExpression
bv_available = accessible_vars - current._used_vars
if bv_available:
variable_to_use = list(bv_available)[0]
- debug.line("--> Using '%s'" % variable_to_use, 2)
+ debug.line('--> Using \'%s\'' % variable_to_use, 2)
current._used_vars |= set([variable_to_use])
- agenda.put(
- current.term.replace(current.variable, variable_to_use), context
- )
- agenda[Categories.ALL].add((current, context))
- return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1)
+ agenda.put(current.term.replace(current.variable, variable_to_use), context)
+ agenda[Categories.ALL].add((current,context))
+ return self._attempt_proof(agenda, accessible_vars, atoms, debug+1)
else:
- # no more available variables to substitute
- debug.line("--> Variables Exhausted", 2)
+ #no more available variables to substitute
+ debug.line('--> Variables Exhausted', 2)
current._exhausted = True
- agenda[Categories.ALL].add((current, context))
- return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1)
+ agenda[Categories.ALL].add((current,context))
+ return self._attempt_proof(agenda, accessible_vars, atoms, debug+1)
else:
new_unique_variable = VariableExpression(unique_variable())
- debug.line("--> Using '%s'" % new_unique_variable, 2)
+ debug.line('--> Using \'%s\'' % new_unique_variable, 2)
current._used_vars |= set([new_unique_variable])
- agenda.put(
- current.term.replace(current.variable, new_unique_variable), context
- )
- agenda[Categories.ALL].add((current, context))
+ agenda.put(current.term.replace(current.variable, new_unique_variable), context)
+ agenda[Categories.ALL].add((current,context))
agenda.mark_alls_fresh()
- return self._attempt_proof(
- agenda, accessible_vars | set([new_unique_variable]), atoms, debug + 1
- )
+ return self._attempt_proof(agenda, accessible_vars|set([new_unique_variable]), atoms, debug+1)
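# Gamma-rule sketch: an AllExpression is re-queued after every instantiation,
# consuming one accessible variable at a time via _used_vars; once every
# accessible variable has been tried it is marked _exhausted until
# mark_alls_fresh() signals that new variables are available on the branch.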
@staticmethod
def is_atom(e):
if not TableauProver.is_atom(arg):
return False
return True
- elif isinstance(e, AbstractVariableExpression) or isinstance(
- e, LambdaExpression
- ):
+ elif isinstance(e, AbstractVariableExpression) or \
+ isinstance(e, LambdaExpression):
return True
else:
return False
set_list = [s.copy() for s in self.sets]
new_allExs = set()
- for allEx, _ in set_list[Categories.ALL]:
+ for allEx,_ in set_list[Categories.ALL]:
new_allEx = AllExpression(allEx.variable, allEx.term)
try:
new_allEx._used_vars = set(used for used in allEx._used_vars)
except AttributeError:
new_allEx._used_vars = set()
- new_allExs.add((new_allEx, None))
+ new_allExs.add((new_allEx,None))
set_list[Categories.ALL] = new_allExs
- set_list[Categories.N_EQ] = set(
- (NegatedExpression(n_eq.term), ctx)
- for (n_eq, ctx) in set_list[Categories.N_EQ]
- )
+ set_list[Categories.N_EQ] = set((NegatedExpression(n_eq.term),ctx)
+ for (n_eq,ctx) in set_list[Categories.N_EQ])
new_agenda.sets = tuple(set_list)
return new_agenda
def put_atoms(self, atoms):
for atom, neg in atoms:
if neg:
- self[Categories.N_ATOM].add((-atom, None))
+ self[Categories.N_ATOM].add((-atom,None))
else:
- self[Categories.ATOM].add((atom, None))
+ self[Categories.ATOM].add((atom,None))
def pop_first(self):
""" Pop the first expression that appears in the agenda """
- for i, s in enumerate(self.sets):
+ for i,s in enumerate(self.sets):
if s:
if i in [Categories.N_EQ, Categories.ALL]:
for ex in s:
def replace_all(self, old, new):
for s in self.sets:
- for ex, ctx in s:
+ for ex,ctx in s:
ex.replace(old.variable, new)
if ctx is not None:
ctx.replace(old.variable, new)
def mark_alls_fresh(self):
- for u, _ in self.sets[Categories.ALL]:
+ for u,_ in self.sets[Categories.ALL]:
u._exhausted = False
def mark_neqs_fresh(self):
- for neq, _ in self.sets[Categories.N_EQ]:
+ for neq,_ in self.sets[Categories.N_EQ]:
neq._exhausted = False
def _categorize_expression(self, current):
elif isinstance(current, ApplicationExpression):
return Categories.APP
else:
- raise ProverParseError("cannot categorize %s" % current.__class__.__name__)
+ raise ProverParseError("cannot categorize %s" % \
+ current.__class__.__name__)
def _categorize_NegatedExpression(self, current):
negated = current.term
elif isinstance(negated, ApplicationExpression):
return Categories.N_APP
else:
- raise ProverParseError("cannot categorize %s" % negated.__class__.__name__)
+ raise ProverParseError("cannot categorize %s" % \
+ negated.__class__.__name__)
class Debug(object):
self.verbose = verbose
self.indent = indent
- if not lines:
- lines = []
+ if not lines: lines = []
self.lines = lines
def __add__(self, increment):
- return Debug(self.verbose, self.indent + 1, self.lines)
+ return Debug(self.verbose, self.indent+1, self.lines)
def line(self, data, indent=0):
if isinstance(data, tuple):
ex, ctx = data
if ctx:
- data = "%s, %s" % (ex, ctx)
+ data = '%s, %s' % (ex, ctx)
else:
- data = "%s" % ex
+ data = '%s' % ex
if isinstance(ex, AllExpression):
try:
- used_vars = "[%s]" % (
- ",".join("%s" % ve.variable.name for ve in ex._used_vars)
- )
- data += ": %s" % used_vars
+ used_vars = "[%s]" % (",".join("%s" % ve.variable.name for ve in ex._used_vars))
+ data += ': %s' % used_vars
except AttributeError:
- data += ": []"
+ data += ': []'
- newline = "%s%s" % (" " * (self.indent + indent), data)
+ newline = '%s%s' % (' '*(self.indent+indent), data)
self.lines.append(newline)
if self.verbose:
class Categories(object):
- ATOM = 0
- PROP = 1
- N_ATOM = 2
- N_PROP = 3
- APP = 4
- N_APP = 5
- N_EQ = 6
- D_NEG = 7
- N_ALL = 8
+ ATOM = 0
+ PROP = 1
+ N_ATOM = 2
+ N_PROP = 3
+ APP = 4
+ N_APP = 5
+ N_EQ = 6
+ D_NEG = 7
+ N_ALL = 8
N_EXISTS = 9
- AND = 10
- N_OR = 11
- N_IMP = 12
- OR = 13
- IMP = 14
- N_AND = 15
- IFF = 16
- N_IFF = 17
- EQ = 18
- EXISTS = 19
- ALL = 20
+ AND = 10
+ N_OR = 11
+ N_IMP = 12
+ OR = 13
+ IMP = 14
+ N_AND = 15
+ IFF = 16
+ N_IFF = 17
+ EQ = 18
+ EXISTS = 19
+ ALL = 20
def testTableauProver():
- tableau_test("P | -P")
- tableau_test("P & -P")
- tableau_test("Q", ["P", "(P -> Q)"])
- tableau_test("man(x)")
- tableau_test("(man(x) -> man(x))")
- tableau_test("(man(x) -> --man(x))")
- tableau_test("-(man(x) and -man(x))")
- tableau_test("(man(x) or -man(x))")
- tableau_test("(man(x) -> man(x))")
- tableau_test("-(man(x) and -man(x))")
- tableau_test("(man(x) or -man(x))")
- tableau_test("(man(x) -> man(x))")
- tableau_test("(man(x) iff man(x))")
- tableau_test("-(man(x) iff -man(x))")
- tableau_test("all x.man(x)")
- tableau_test("all x.all y.((x = y) -> (y = x))")
- tableau_test("all x.all y.all z.(((x = y) & (y = z)) -> (x = z))")
- # tableau_test('-all x.some y.F(x,y) & some x.all y.(-F(x,y))')
- # tableau_test('some x.all y.sees(x,y)')
-
- p1 = "all x.(man(x) -> mortal(x))"
- p2 = "man(Socrates)"
- c = "mortal(Socrates)"
+ tableau_test('P | -P')
+ tableau_test('P & -P')
+ tableau_test('Q', ['P', '(P -> Q)'])
+ tableau_test('man(x)')
+ tableau_test('(man(x) -> man(x))')
+ tableau_test('(man(x) -> --man(x))')
+ tableau_test('-(man(x) and -man(x))')
+ tableau_test('(man(x) or -man(x))')
+ tableau_test('(man(x) -> man(x))')
+ tableau_test('-(man(x) and -man(x))')
+ tableau_test('(man(x) or -man(x))')
+ tableau_test('(man(x) -> man(x))')
+ tableau_test('(man(x) iff man(x))')
+ tableau_test('-(man(x) iff -man(x))')
+ tableau_test('all x.man(x)')
+ tableau_test('all x.all y.((x = y) -> (y = x))')
+ tableau_test('all x.all y.all z.(((x = y) & (y = z)) -> (x = z))')
+# tableau_test('-all x.some y.F(x,y) & some x.all y.(-F(x,y))')
+# tableau_test('some x.all y.sees(x,y)')
+
+ p1 = 'all x.(man(x) -> mortal(x))'
+ p2 = 'man(Socrates)'
+ c = 'mortal(Socrates)'
tableau_test(c, [p1, p2])
- p1 = "all x.(man(x) -> walks(x))"
- p2 = "man(John)"
- c = "some y.walks(y)"
+ p1 = 'all x.(man(x) -> walks(x))'
+ p2 = 'man(John)'
+ c = 'some y.walks(y)'
tableau_test(c, [p1, p2])
- p = "((x = y) & walks(y))"
- c = "walks(x)"
+ p = '((x = y) & walks(y))'
+ c = 'walks(x)'
tableau_test(c, [p])
- p = "((x = y) & ((y = z) & (z = w)))"
- c = "(x = w)"
+ p = '((x = y) & ((y = z) & (z = w)))'
+ c = '(x = w)'
tableau_test(c, [p])
- p = "some e1.some e2.(believe(e1,john,e2) & walk(e2,mary))"
- c = "some e0.walk(e0,mary)"
+ p = 'some e1.some e2.(believe(e1,john,e2) & walk(e2,mary))'
+ c = 'some e0.walk(e0,mary)'
tableau_test(c, [p])
- c = "(exists x.exists z3.((x = Mary) & ((z3 = John) & sees(z3,x))) <-> exists x.exists z4.((x = John) & ((z4 = Mary) & sees(x,z4))))"
+ c = '(exists x.exists z3.((x = Mary) & ((z3 = John) & sees(z3,x))) <-> exists x.exists z4.((x = John) & ((z4 = Mary) & sees(x,z4))))'
tableau_test(c)
-
# p = 'some e1.some e2.((believe e1 john e2) and (walk e2 mary))'
# c = 'some x.some e3.some e4.((believe e3 x e4) and (walk e4 mary))'
# tableau_test(c, [p])
def testHigherOrderTableauProver():
- tableau_test("believe(j, -lie(b))", ["believe(j, -lie(b) & -cheat(b))"])
- tableau_test("believe(j, lie(b) & cheat(b))", ["believe(j, lie(b))"])
- tableau_test(
- "believe(j, lie(b))", ["lie(b)"]
- ) # how do we capture that John believes all things that are true
- tableau_test(
- "believe(j, know(b, cheat(b)))",
- ["believe(j, know(b, lie(b)) & know(b, steals(b) & cheat(b)))"],
- )
- tableau_test("P(Q(y), R(y) & R(z))", ["P(Q(x) & Q(y), R(y) & R(z))"])
+ tableau_test('believe(j, -lie(b))', ['believe(j, -lie(b) & -cheat(b))'])
+ tableau_test('believe(j, lie(b) & cheat(b))', ['believe(j, lie(b))'])
+ tableau_test('believe(j, lie(b))', ['lie(b)']) #how do we capture that John believes all things that are true
+ tableau_test('believe(j, know(b, cheat(b)))', ['believe(j, know(b, lie(b)) & know(b, steals(b) & cheat(b)))'])
+ tableau_test('P(Q(y), R(y) & R(z))', ['P(Q(x) & Q(y), R(y) & R(z))'])
- tableau_test("believe(j, cheat(b) & lie(b))", ["believe(j, lie(b) & cheat(b))"])
- tableau_test("believe(j, -cheat(b) & -lie(b))", ["believe(j, -lie(b) & -cheat(b))"])
+ tableau_test('believe(j, cheat(b) & lie(b))', ['believe(j, lie(b) & cheat(b))'])
+ tableau_test('believe(j, -cheat(b) & -lie(b))', ['believe(j, -lie(b) & -cheat(b))'])
def tableau_test(c, ps=None, verbose=False):
pc = Expression.fromstring(c)
- pps = [Expression.fromstring(p) for p in ps] if ps else []
+ pps = ([Expression.fromstring(p) for p in ps] if ps else [])
if not ps:
ps = []
- print(
- "%s |- %s: %s"
- % (", ".join(ps), pc, TableauProver().prove(pc, pps, verbose=verbose))
- )
-
+ print('%s |- %s: %s' % (', '.join(ps), pc, TableauProver().prove(pc, pps, verbose=verbose)))
def demo():
testTableauProver()
testHigherOrderTableauProver()
-
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
+
# Natural Language Toolkit: Internal utility functions
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# Nitin Madnani <nmadnani@ets.org>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function
import subprocess
import os
import sys
import stat
import locale
-from xml.etree import ElementTree
+
+# Use the c version of ElementTree, which is faster, if possible:
+try:
+ from xml.etree import cElementTree as ElementTree
+except ImportError:
+ from xml.etree import ElementTree
+
+from six import string_types
+
+from nltk import __file__
+from nltk import compat
##########################################################################
# Java Via Command-Line
:type options: list(str)
"""
global _java_bin, _java_options
- _java_bin = find_binary(
- "java",
- bin,
- env_vars=["JAVAHOME", "JAVA_HOME"],
- verbose=verbose,
- binary_names=["java.exe"],
- )
+ _java_bin = find_binary('java', bin, env_vars=['JAVAHOME', 'JAVA_HOME'], verbose=verbose, binary_names=['java.exe'])
if options is not None:
- if isinstance(options, str):
+ if isinstance(options, string_types):
options = options.split()
_java_options = list(options)
-
-def java(cmd, classpath=None, stdin=None, stdout=None, stderr=None, blocking=True):
+def java(cmd, classpath=None, stdin=None, stdout=None, stderr=None,
+ blocking=True):
"""
Execute the given java command, by opening a subprocess that calls
Java. If java has not yet been configured, it will be configured
standard input, standard output and standard error file
handles, respectively. Valid values are ``subprocess.PIPE``,
an existing file descriptor (a positive integer), an existing
- file object, 'pipe', 'stdout', 'devnull' and None. ``subprocess.PIPE`` indicates that a
+ file object, and None. ``subprocess.PIPE`` indicates that a
new pipe to the child should be created. With None, no
redirection will occur; the child's file handles will be
inherited from the parent. Additionally, stderr can be
:raise OSError: If the java command returns a nonzero return code.
"""
-
- subprocess_output_dict = {
- "pipe": subprocess.PIPE,
- "stdout": subprocess.STDOUT,
- "devnull": subprocess.DEVNULL,
- }
-
- stdin = subprocess_output_dict.get(stdin, stdin)
- stdout = subprocess_output_dict.get(stdout, stdout)
- stderr = subprocess_output_dict.get(stderr, stderr)
-
- if isinstance(cmd, str):
- raise TypeError("cmd should be a list of strings")
+ if stdin == 'pipe': stdin = subprocess.PIPE
+ if stdout == 'pipe': stdout = subprocess.PIPE
+ if stderr == 'pipe': stderr = subprocess.PIPE
+ if isinstance(cmd, string_types):
+ raise TypeError('cmd should be a list of strings')
# Make sure we know where a java binary is.
if _java_bin is None:
config_java()
# Set up the classpath.
- if isinstance(classpath, str):
- classpaths = [classpath]
+ if isinstance(classpath, string_types):
+ classpaths=[classpath]
else:
- classpaths = list(classpath)
- classpath = os.path.pathsep.join(classpaths)
+ classpaths=list(classpath)
+ classpath=os.path.pathsep.join(classpaths)
# Construct the full command string.
cmd = list(cmd)
- cmd = ["-cp", classpath] + cmd
+ cmd = ['-cp', classpath] + cmd
cmd = [_java_bin] + _java_options + cmd
# Call java via a subprocess
p = subprocess.Popen(cmd, stdin=stdin, stdout=stdout, stderr=stderr)
- if not blocking:
- return p
+ if not blocking: return p
(stdout, stderr) = p.communicate()
# Check the return code.
if p.returncode != 0:
print(_decode_stdoutdata(stderr))
- raise OSError("Java command failed : " + str(cmd))
+ raise OSError('Java command failed : ' + str(cmd))
return (stdout, stderr)
-
if 0:
- # config_java(options='-Xmx512m')
+ #config_java(options='-Xmx512m')
# Write:
- # java('weka.classifiers.bayes.NaiveBayes',
+ #java('weka.classifiers.bayes.NaiveBayes',
# ['-d', '/tmp/names.model', '-t', '/tmp/train.arff'],
# classpath='/Users/edloper/Desktop/weka/weka.jar')
# Read:
- (a, b) = java(
- [
- "weka.classifiers.bayes.NaiveBayes",
- "-l",
- "/tmp/names.model",
- "-T",
- "/tmp/test.arff",
- "-p",
- "0",
- ], # , '-distribution'],
- classpath="/Users/edloper/Desktop/weka/weka.jar",
- )
+ (a,b) = java(['weka.classifiers.bayes.NaiveBayes',
+ '-l', '/tmp/names.model', '-T', '/tmp/test.arff',
+ '-p', '0'],#, '-distribution'],
+ classpath='/Users/edloper/Desktop/weka/weka.jar')
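# For illustration, a minimal hand-runnable sketch of driving java() (the jar
# path and options here are hypothetical; in the restored code stdout='pipe'
# maps to subprocess.PIPE):
#
#     config_java(options='-Xmx512m')
#     (out, err) = java(['weka.core.Version'],
#                       classpath='/opt/weka/weka.jar',
#                       stdout='pipe')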
######################################################################
# Parsing
######################################################################
-
class ReadError(ValueError):
"""
Exception raised by read_* functions when they fail.
:param position: The index in the input string where an error occurred.
:param expected: What was expected when an error occurred.
"""
-
def __init__(self, expected, position):
ValueError.__init__(self, expected, position)
self.expected = expected
self.position = position
-
def __str__(self):
- return "Expected %s at %s" % (self.expected, self.position)
-
+ return 'Expected %s at %s' % (self.expected, self.position)
_STRING_START_RE = re.compile(r"[uU]?[rR]?(\"\"\"|\'\'\'|\"|\')")
-
-
def read_str(s, start_position):
"""
If a Python string literal begins at the specified position in the
"""
# Read the open quote, and any modifiers.
m = _STRING_START_RE.match(s, start_position)
- if not m:
- raise ReadError("open quote", start_position)
+ if not m: raise ReadError('open quote', start_position)
quotemark = m.group(1)
# Find the close quote.
- _STRING_END_RE = re.compile(r"\\|%s" % quotemark)
+ _STRING_END_RE = re.compile(r'\\|%s' % quotemark)
position = m.end()
while True:
match = _STRING_END_RE.search(s, position)
- if not match:
- raise ReadError("close quote", position)
- if match.group(0) == "\\":
- position = match.end() + 1
- else:
- break
+ if not match: raise ReadError('close quote', position)
+ if match.group(0) == '\\': position = match.end()+1
+ else: break
# Process it, using eval. Strings with invalid escape sequences
# might raise ValueError.
try:
- return eval(s[start_position : match.end()]), match.end()
+ return eval(s[start_position:match.end()]), match.end()
except ValueError as e:
- raise ReadError("invalid string (%s)" % e)
-
-
-_READ_INT_RE = re.compile(r"-?\d+")
-
+ raise ReadError('invalid string (%s)' % e)
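# A quick illustration of read_str -- it returns the parsed value together
# with the offset just past the closing quote:
#
#     >>> read_str("'hello', 'world'", 0)
#     ('hello', 7)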
+_READ_INT_RE = re.compile(r'-?\d+')
def read_int(s, start_position):
"""
If an integer begins at the specified position in the given
"""
m = _READ_INT_RE.match(s, start_position)
- if not m:
- raise ReadError("integer", start_position)
+ if not m: raise ReadError('integer', start_position)
return int(m.group()), m.end()
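# Likewise for read_int:
#
#     >>> read_int('42 is the answer', 0)
#     (42, 2)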
-
-_READ_NUMBER_VALUE = re.compile(r"-?(\d*)([.]?\d*)?")
-
-
+_READ_NUMBER_VALUE = re.compile(r'-?(\d*)([.]?\d*)?')
def read_number(s, start_position):
"""
If an integer or float begins at the specified position in the
"""
m = _READ_NUMBER_VALUE.match(s, start_position)
if not m or not (m.group(1) or m.group(2)):
- raise ReadError("number", start_position)
- if m.group(2):
- return float(m.group()), m.end()
- else:
- return int(m.group()), m.end()
+ raise ReadError('number', start_position)
+ if m.group(2): return float(m.group()), m.end()
+ else: return int(m.group()), m.end()
+
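# And for read_number, which falls back to int when there is no fractional
# part:
#
#     >>> read_number('3.14 etc', 0)
#     (3.14, 4)
#     >>> read_number('7 samples', 0)
#     (7, 1)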
######################################################################
# Check if a method has been overridden
######################################################################
-
def overridden(method):
"""
:return: True if ``method`` overrides some method with the same
:type method: instance method
"""
- if isinstance(method, types.MethodType) and method.__self__.__class__ is not None:
+ # [xx] breaks on classic classes!
+ if isinstance(method, types.MethodType) and compat.get_im_class(method) is not None:
name = method.__name__
- funcs = [
- cls.__dict__[name]
- for cls in _mro(method.__self__.__class__)
- if name in cls.__dict__
- ]
+ funcs = [cls.__dict__[name]
+ for cls in _mro(compat.get_im_class(method))
+ if name in cls.__dict__]
return len(funcs) > 1
else:
- raise TypeError("Expected an instance method.")
-
+ raise TypeError('Expected an instance method.')
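# A small sketch of how overridden() behaves on bound methods:
#
#     >>> class A(object):
#     ...     def f(self): pass
#     >>> class B(A):
#     ...     def f(self): pass
#     >>> overridden(B().f)
#     True
#     >>> overridden(A().f)
#     False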
def _mro(cls):
"""
return cls.__mro__
else:
mro = [cls]
- for base in cls.__bases__:
- mro.extend(_mro(base))
+ for base in cls.__bases__: mro.extend(_mro(base))
return mro
-
######################################################################
# Deprecation decorator & base class
######################################################################
# [xx] dedent msg first if it comes from a docstring.
-
def _add_epytext_field(obj, field, message):
"""Add an epytext @field to a given object's docstring."""
- indent = ""
+ indent = ''
# If we already have a docstring, then add a blank line to separate
# it from the new field, and check its indentation.
if obj.__doc__:
- obj.__doc__ = obj.__doc__.rstrip() + "\n\n"
- indents = re.findall(r"(?<=\n)[ ]+(?!\s)", obj.__doc__.expandtabs())
- if indents:
- indent = min(indents)
+ obj.__doc__ = obj.__doc__.rstrip()+'\n\n'
+ indents = re.findall(r'(?<=\n)[ ]+(?!\s)', obj.__doc__.expandtabs())
+ if indents: indent = min(indents)
# If we don't have a docstring, add an empty one.
else:
- obj.__doc__ = ""
-
- obj.__doc__ += textwrap.fill(
- "@%s: %s" % (field, message),
- initial_indent=indent,
- subsequent_indent=indent + " ",
- )
+ obj.__doc__ = ''
+ obj.__doc__ += textwrap.fill('@%s: %s' % (field, message),
+ initial_indent=indent,
+ subsequent_indent=indent+' ')
def deprecated(message):
"""
"""
def decorator(func):
- msg = "Function %s() has been deprecated. %s" % (func.__name__, message)
- msg = "\n" + textwrap.fill(msg, initial_indent=" ", subsequent_indent=" ")
-
+ msg = ("Function %s() has been deprecated. %s"
+ % (func.__name__, message))
+ msg = '\n' + textwrap.fill(msg, initial_indent=' ',
+ subsequent_indent=' ')
def newFunc(*args, **kwargs):
warnings.warn(msg, category=DeprecationWarning, stacklevel=2)
return func(*args, **kwargs)
newFunc.__doc__ = func.__doc__
newFunc.__deprecated__ = True
# Add a @deprecated field to the docstring.
- _add_epytext_field(newFunc, "deprecated", message)
+ _add_epytext_field(newFunc, 'deprecated', message)
return newFunc
-
return decorator
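# For example (a sketch; calling the wrapped function also emits a
# DeprecationWarning on stderr):
#
#     >>> @deprecated('Use new_function() instead.')
#     ... def old_function():
#     ...     return 42
#     >>> old_function()
#     42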
-
class Deprecated(object):
"""
A base class used to mark deprecated classes. A typical usage is to
The docstring of the deprecated class will be used in the
deprecation warning message.
"""
-
def __new__(cls, *args, **kwargs):
# Figure out which class is the deprecated one.
dep_cls = None
for base in _mro(cls):
if Deprecated in base.__bases__:
- dep_cls = base
- break
- assert dep_cls, "Unable to determine which base is deprecated."
+ dep_cls = base; break
+ assert dep_cls, 'Unable to determine which base is deprecated.'
# Construct an appropriate warning.
- doc = (dep_cls.__doc__ or "").strip()
+ doc = (dep_cls.__doc__ or '').strip()
# If there's a @deprecated field, strip off the field marker.
- doc = re.sub(r"\A\s*@deprecated:", r"", doc)
+ doc = re.sub(r'\A\s*@deprecated:', r'', doc)
# Strip off any indentation.
- doc = re.sub(r"(?m)^\s*", "", doc)
+ doc = re.sub(r'(?m)^\s*', '', doc)
# Construct a 'name' string.
- name = "Class %s" % dep_cls.__name__
+ name = 'Class %s' % dep_cls.__name__
if cls != dep_cls:
- name += " (base class for %s)" % cls.__name__
+ name += ' (base class for %s)' % cls.__name__
# Put it all together.
- msg = "%s has been deprecated. %s" % (name, doc)
+ msg = '%s has been deprecated. %s' % (name, doc)
# Wrap it.
- msg = "\n" + textwrap.fill(msg, initial_indent=" ", subsequent_indent=" ")
+ msg = '\n' + textwrap.fill(msg, initial_indent=' ',
+ subsequent_indent=' ')
warnings.warn(msg, category=DeprecationWarning, stacklevel=2)
# Do the actual work of __new__.
return object.__new__(cls)
-
##########################################################################
# COUNTER, FOR UNIQUE NAMING
##########################################################################
-
class Counter:
"""
A counter that auto-increments each time its value is read.
"""
-
def __init__(self, initial_value=0):
self._value = initial_value
-
def get(self):
self._value += 1
return self._value
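# Usage is as simple as it sounds:
#
#     >>> c = Counter()
#     >>> c.get(), c.get(), c.get()
#     (1, 2, 3)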
-
##########################################################################
# Search for files/binaries
##########################################################################
-
-def find_file_iter(
- filename,
- env_vars=(),
- searchpath=(),
- file_names=None,
- url=None,
- verbose=False,
- finding_dir=False,
-):
+def find_file_iter(filename, env_vars=(), searchpath=(),
+ file_names=None, url=None, verbose=False, finding_dir=False):
"""
Search for a file to be used by nltk.
:param verbose: Whether or not to print path when a file is found.
"""
file_names = [filename] + (file_names or [])
- assert isinstance(filename, str)
- assert not isinstance(file_names, str)
- assert not isinstance(searchpath, str)
- if isinstance(env_vars, str):
+ assert isinstance(filename, string_types)
+ assert not isinstance(file_names, string_types)
+ assert not isinstance(searchpath, string_types)
+ if isinstance(env_vars, string_types):
env_vars = env_vars.split()
yielded = False
path_to_file = os.path.join(filename, alternative)
if os.path.isfile(path_to_file):
if verbose:
- print("[Found %s: %s]" % (filename, path_to_file))
+ print('[Found %s: %s]' % (filename, path_to_file))
yielded = True
yield path_to_file
# Check the bare alternatives
if os.path.isfile(alternative):
if verbose:
- print("[Found %s: %s]" % (filename, alternative))
+ print('[Found %s: %s]' % (filename, alternative))
yielded = True
yield alternative
# Check if the alternative is inside a 'file' directory
- path_to_file = os.path.join(filename, "file", alternative)
+ path_to_file = os.path.join(filename, 'file', alternative)
if os.path.isfile(path_to_file):
if verbose:
- print("[Found %s: %s]" % (filename, path_to_file))
+ print('[Found %s: %s]' % (filename, path_to_file))
yielded = True
yield path_to_file
# Check environment variables
for env_var in env_vars:
if env_var in os.environ:
- if finding_dir: # This is to find a directory instead of a file
+ if finding_dir: # This is to find a directory instead of a file
yielded = True
yield os.environ[env_var]
# Check if the environment variable contains a direct path to the bin
if os.path.isfile(env_dir):
if verbose:
- print("[Found %s: %s]" % (filename, env_dir))
+ print('[Found %s: %s]'%(filename, env_dir))
yielded = True
yield env_dir
# Check if the possible bin names exist inside the environment variable directories
path_to_file = os.path.join(env_dir, alternative)
if os.path.isfile(path_to_file):
if verbose:
- print("[Found %s: %s]" % (filename, path_to_file))
+ print('[Found %s: %s]'%(filename, path_to_file))
yielded = True
yield path_to_file
# Check if the alternative is inside a 'file' directory
# path_to_file = os.path.join(env_dir, 'file', alternative)
# Check if the alternative is inside a 'bin' directory
- path_to_file = os.path.join(env_dir, "bin", alternative)
+ path_to_file = os.path.join(env_dir, 'bin', alternative)
if os.path.isfile(path_to_file):
if verbose:
- print("[Found %s: %s]" % (filename, path_to_file))
+ print('[Found %s: %s]' % (filename, path_to_file))
yielded = True
yield path_to_file
# If we're on a POSIX system, then try using the 'which' command
# to find the file.
- if os.name == "posix":
+ if os.name == 'posix':
for alternative in file_names:
try:
- p = subprocess.Popen(
- ["which", alternative],
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE,
- )
+ p = subprocess.Popen(['which', alternative],
+ stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = p.communicate()
path = _decode_stdoutdata(stdout).strip()
if path.endswith(alternative) and os.path.exists(path):
if verbose:
- print("[Found %s: %s]" % (filename, path))
+ print('[Found %s: %s]' % (filename, path))
yielded = True
yield path
except (KeyboardInterrupt, SystemExit, OSError):
raise
- finally:
+ except:
pass
if not yielded:
- msg = (
- "NLTK was unable to find the %s file!"
- "\nUse software specific "
- "configuration paramaters" % filename
- )
- if env_vars:
- msg += " or set the %s environment variable" % env_vars[0]
- msg += "."
+ msg = ("NLTK was unable to find the %s file!" "\nUse software specific "
+ "configuration paramaters" % filename)
+ if env_vars: msg += ' or set the %s environment variable' % env_vars[0]
+ msg += '.'
if searchpath:
- msg += "\n\n Searched in:"
- msg += "".join("\n - %s" % d for d in searchpath)
- if url:
- msg += "\n\n For more information on %s, see:\n <%s>" % (filename, url)
- div = "=" * 75
- raise LookupError("\n\n%s\n%s\n%s" % (div, msg, div))
-
-
-def find_file(
- filename, env_vars=(), searchpath=(), file_names=None, url=None, verbose=False
-):
- return next(
- find_file_iter(filename, env_vars, searchpath, file_names, url, verbose)
- )
-
-
-def find_dir(
- filename, env_vars=(), searchpath=(), file_names=None, url=None, verbose=False
-):
- return next(
- find_file_iter(
- filename, env_vars, searchpath, file_names, url, verbose, finding_dir=True
- )
- )
-
-
-def find_binary_iter(
- name,
- path_to_bin=None,
- env_vars=(),
- searchpath=(),
- binary_names=None,
- url=None,
- verbose=False,
-):
+ msg += '\n\n Searched in:'
+ msg += ''.join('\n - %s' % d for d in searchpath)
+ if url: msg += ('\n\n For more information on %s, see:\n <%s>' %
+ (filename, url))
+ div = '='*75
+ raise LookupError('\n\n%s\n%s\n%s' % (div, msg, div))
+
+
+def find_file(filename, env_vars=(), searchpath=(),
+ file_names=None, url=None, verbose=False):
+ return next(find_file_iter(filename, env_vars, searchpath,
+ file_names, url, verbose))
+
+
+def find_dir(filename, env_vars=(), searchpath=(),
+ file_names=None, url=None, verbose=False):
+ return next(find_file_iter(filename, env_vars, searchpath,
+ file_names, url, verbose, finding_dir=True))
+
+
+def find_binary_iter(name, path_to_bin=None, env_vars=(), searchpath=(),
+ binary_names=None, url=None, verbose=False):
"""
Search for a file to be used by nltk.
:param url: URL presented to user for download help.
:param verbose: Whether or not to print path when a file is found.
"""
- for file in find_file_iter(
- path_to_bin or name, env_vars, searchpath, binary_names, url, verbose
- ):
+ for file in find_file_iter(path_to_bin or name, env_vars, searchpath, binary_names,
+ url, verbose):
yield file
+def find_binary(name, path_to_bin=None, env_vars=(), searchpath=(),
+ binary_names=None, url=None, verbose=False):
+ return next(find_binary_iter(name, path_to_bin, env_vars, searchpath,
+ binary_names, url, verbose))
-def find_binary(
- name,
- path_to_bin=None,
- env_vars=(),
- searchpath=(),
- binary_names=None,
- url=None,
- verbose=False,
-):
- return next(
- find_binary_iter(
- name, path_to_bin, env_vars, searchpath, binary_names, url, verbose
- )
- )
-
-
-def find_jar_iter(
- name_pattern,
- path_to_jar=None,
- env_vars=(),
- searchpath=(),
- url=None,
- verbose=False,
- is_regex=False,
-):
+def find_jar_iter(name_pattern, path_to_jar=None, env_vars=(),
+ searchpath=(), url=None, verbose=False, is_regex=False):
"""
Search for a jar that is used by nltk.
:param is_regex: Whether name is a regular expression.
"""
- assert isinstance(name_pattern, str)
- assert not isinstance(searchpath, str)
- if isinstance(env_vars, str):
+ assert isinstance(name_pattern, string_types)
+ assert not isinstance(searchpath, string_types)
+ if isinstance(env_vars, string_types):
env_vars = env_vars.split()
yielded = False
# Make sure we check the CLASSPATH first
- env_vars = ["CLASSPATH"] + list(env_vars)
+ env_vars = ['CLASSPATH'] + list(env_vars)
# If an explicit location was given, then check it, and yield it if
# it's present; otherwise, complain.
yielded = True
yield path_to_jar
else:
- raise LookupError(
- "Could not find %s jar file at %s" % (name_pattern, path_to_jar)
- )
+ raise LookupError('Could not find %s jar file at %s' %
+ (name_pattern, path_to_jar))
# Check environment variables
for env_var in env_vars:
if env_var in os.environ:
- if env_var == "CLASSPATH":
- classpath = os.environ["CLASSPATH"]
+ if env_var == 'CLASSPATH':
+ classpath = os.environ['CLASSPATH']
for cp in classpath.split(os.path.pathsep):
if os.path.isfile(cp):
- filename = os.path.basename(cp)
- if (
- is_regex
- and re.match(name_pattern, filename)
- or (not is_regex and filename == name_pattern)
- ):
+ filename=os.path.basename(cp)
+ if is_regex and re.match(name_pattern, filename) or \
+ (not is_regex and filename == name_pattern):
if verbose:
- print("[Found %s: %s]" % (name_pattern, cp))
+ print('[Found %s: %s]' % (name_pattern, cp))
yielded = True
yield cp
# The case where user put directory containing the jar file in the classpath
if os.path.isdir(cp):
if not is_regex:
- if os.path.isfile(os.path.join(cp, name_pattern)):
+ if os.path.isfile(os.path.join(cp,name_pattern)):
if verbose:
- print("[Found %s: %s]" % (name_pattern, cp))
+ print('[Found %s: %s]' % (name_pattern, cp))
yielded = True
- yield os.path.join(cp, name_pattern)
+ yield os.path.join(cp,name_pattern)
else:
# Look for file using regular expression
for file_name in os.listdir(cp):
- if re.match(name_pattern, file_name):
+ if re.match(name_pattern,file_name):
if verbose:
- print(
- "[Found %s: %s]"
- % (
- name_pattern,
- os.path.join(cp, file_name),
- )
- )
+ print('[Found %s: %s]' % (name_pattern, os.path.join(cp,file_name)))
yielded = True
- yield os.path.join(cp, file_name)
+ yield os.path.join(cp,file_name)
else:
jar_env = os.environ[env_var]
- jar_iter = (
- (
- os.path.join(jar_env, path_to_jar)
- for path_to_jar in os.listdir(jar_env)
- )
- if os.path.isdir(jar_env)
- else (jar_env,)
- )
+ jar_iter = ((os.path.join(jar_env, path_to_jar) for path_to_jar in os.listdir(jar_env))
+ if os.path.isdir(jar_env) else (jar_env,))
for path_to_jar in jar_iter:
if os.path.isfile(path_to_jar):
- filename = os.path.basename(path_to_jar)
- if (
- is_regex
- and re.match(name_pattern, filename)
- or (not is_regex and filename == name_pattern)
- ):
+ filename=os.path.basename(path_to_jar)
+ if is_regex and re.match(name_pattern, filename) or \
+ (not is_regex and filename == name_pattern):
if verbose:
- print("[Found %s: %s]" % (name_pattern, path_to_jar))
+ print('[Found %s: %s]' % (name_pattern, path_to_jar))
yielded = True
yield path_to_jar
if os.path.isfile(path_to_jar):
if re.match(name_pattern, filename):
if verbose:
- print("[Found %s: %s]" % (filename, path_to_jar))
+ print('[Found %s: %s]' % (filename, path_to_jar))
yielded = True
yield path_to_jar
else:
path_to_jar = os.path.join(directory, name_pattern)
if os.path.isfile(path_to_jar):
if verbose:
- print("[Found %s: %s]" % (name_pattern, path_to_jar))
+ print('[Found %s: %s]' % (name_pattern, path_to_jar))
yielded = True
yield path_to_jar
if not yielded:
# If nothing was found, raise an error
- msg = "NLTK was unable to find %s!" % name_pattern
- if env_vars:
- msg += " Set the %s environment variable" % env_vars[0]
- msg = textwrap.fill(msg + ".", initial_indent=" ", subsequent_indent=" ")
+ msg = ("NLTK was unable to find %s!" % name_pattern)
+ if env_vars: msg += ' Set the %s environment variable' % env_vars[0]
+ msg = textwrap.fill(msg+'.', initial_indent=' ',
+ subsequent_indent=' ')
if searchpath:
- msg += "\n\n Searched in:"
- msg += "".join("\n - %s" % d for d in searchpath)
+ msg += '\n\n Searched in:'
+ msg += ''.join('\n - %s' % d for d in searchpath)
if url:
- msg += "\n\n For more information, on %s, see:\n <%s>" % (
- name_pattern,
- url,
- )
- div = "=" * 75
- raise LookupError("\n\n%s\n%s\n%s" % (div, msg, div))
-
-
-def find_jar(
- name_pattern,
- path_to_jar=None,
- env_vars=(),
- searchpath=(),
- url=None,
- verbose=False,
- is_regex=False,
-):
- return next(
- find_jar_iter(
- name_pattern, path_to_jar, env_vars, searchpath, url, verbose, is_regex
- )
- )
+ msg += ('\n\n For more information on %s, see:\n <%s>' %
+ (name_pattern, url))
+ div = '='*75
+ raise LookupError('\n\n%s\n%s\n%s' % (div, msg, div))
+def find_jar(name_pattern, path_to_jar=None, env_vars=(),
+ searchpath=(), url=None, verbose=False, is_regex=False):
+ return next(find_jar_iter(name_pattern, path_to_jar, env_vars,
+ searchpath, url, verbose, is_regex))
-def find_jars_within_path(path_to_jars):
- return [
- os.path.join(root, filename)
- for root, dirnames, filenames in os.walk(path_to_jars)
- for filename in fnmatch.filter(filenames, "*.jar")
- ]
+def find_jars_within_path(path_to_jars):
+ return [os.path.join(root, filename)
+ for root, dirnames, filenames in os.walk(path_to_jars)
+ for filename in fnmatch.filter(filenames, '*.jar')]
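# A typical use is assembling a Java classpath from everything under a
# directory (the path here is purely illustrative):
#
#     jars = find_jars_within_path('/opt/stanford-corenlp')
#     classpath = os.path.pathsep.join(jars)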
def _decode_stdoutdata(stdoutdata):
""" Convert data read from stdout/stderr to unicode """
return stdoutdata.decode()
return stdoutdata.decode(encoding)
-
##########################################################################
# Import Stdlib Module
##########################################################################
-
def import_from_stdlib(module):
"""
When python is run from within the nltk/ directory tree, the
instead (causing the import to fail).
"""
old_path = sys.path
- sys.path = [d for d in sys.path if d not in ("", ".")]
+ sys.path = [d for d in sys.path if d not in ('', '.')]
m = __import__(module)
sys.path = old_path
return m
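# For example, import_from_stdlib('inspect') returns the standard library's
# inspect module even when the current directory contains a shadowing
# inspect.py.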
# Wrapper for ElementTree Elements
##########################################################################
-
-
+@compat.python_2_unicode_compatible
class ElementWrapper(object):
"""
A wrapper around ElementTree Element objects whose main purpose is
<Element "<?xml version='1.0' encoding='utf8'?>\n<test />">
"""
- if isinstance(etree, str):
+ if isinstance(etree, string_types):
etree = ElementTree.fromstring(etree)
- self.__dict__["_etree"] = etree
+ self.__dict__['_etree'] = etree
def unwrap(self):
"""
return self._etree
##////////////////////////////////////////////////////////////
- # { String Representation
+ #{ String Representation
##////////////////////////////////////////////////////////////
def __repr__(self):
- s = ElementTree.tostring(self._etree, encoding="utf8").decode("utf8")
+ s = ElementTree.tostring(self._etree, encoding='utf8').decode('utf8')
if len(s) > 60:
- e = s.rfind("<")
- if (len(s) - e) > 30:
- e = -20
- s = "%s...%s" % (s[:30], s[e:])
- return "<Element %r>" % s
+ e = s.rfind('<')
+ if (len(s)-e) > 30: e = -20
+ s = '%s...%s' % (s[:30], s[e:])
+ return '<Element %r>' % s
def __str__(self):
"""
:return: the result of applying ``ElementTree.tostring()`` to
the wrapped Element object.
"""
- return (
- ElementTree.tostring(self._etree, encoding="utf8").decode("utf8").rstrip()
- )
+ return ElementTree.tostring(self._etree, encoding='utf8').decode('utf8').rstrip()
##////////////////////////////////////////////////////////////
- # { Element interface Delegation (pass-through)
+ #{ Element interface Delegation (pass-through)
##////////////////////////////////////////////////////////////
def __getattr__(self, attrib):
return len(self._etree)
##////////////////////////////////////////////////////////////
- # { Element interface Delegation (wrap result)
+ #{ Element interface Delegation (wrap result)
##////////////////////////////////////////////////////////////
def __getitem__(self, index):
return [ElementWrapper(elt) for elt in self._etree]
def getiterator(self, tag=None):
- return (ElementWrapper(elt) for elt in self._etree.getiterator(tag))
+ return (ElementWrapper(elt)
+ for elt in self._etree.getiterator(tag))
def makeelement(self, tag, attrib):
return ElementWrapper(self._etree.makeelement(tag, attrib))
def find(self, path):
elt = self._etree.find(path)
- if elt is None:
- return elt
- else:
- return ElementWrapper(elt)
+ if elt is None: return elt
+ else: return ElementWrapper(elt)
def findall(self, path):
return [ElementWrapper(elt) for elt in self._etree.findall(path)]
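# A minimal sketch of wrapping and unwrapping:
#
#     >>> wrapped = ElementWrapper('<test/>')
#     >>> wrapped.unwrap().tag
#     'test'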
-
######################################################################
# Helper for Handling Slicing
######################################################################
-
def slice_bounds(sequence, slice_obj, allow_step=False):
"""
Given a slice, return the corresponding (start, stop) bounds,
# value tuple.
if allow_step:
step = slice_obj.step
- if step is None:
- step = 1
+ if step is None: step = 1
# Use a recursive call without allow_step to find the slice
# bounds. If step is negative, then the roles of start and
# stop (in terms of default values, etc), are swapped.
# Otherwise, make sure that no non-default step value is used.
elif slice_obj.step not in (None, 1):
- raise ValueError(
- "slices with steps are not supported by %s" % sequence.__class__.__name__
- )
+ raise ValueError('slices with steps are not supported by %s' %
+ sequence.__class__.__name__)
# Supply default offsets.
- if start is None:
- start = 0
- if stop is None:
- stop = len(sequence)
+ if start is None: start = 0
+ if stop is None: stop = len(sequence)
# Handle negative indices.
- if start < 0:
- start = max(0, len(sequence) + start)
- if stop < 0:
- stop = max(0, len(sequence) + stop)
+ if start < 0: start = max(0, len(sequence)+start)
+ if stop < 0: stop = max(0, len(sequence)+stop)
# Make sure stop doesn't go past the end of the list. Note that
# we avoid calculating len(sequence) if possible, because for lazy
# sequences, calculating the length of a sequence can be expensive.
if stop > 0:
- try:
- sequence[stop - 1]
- except IndexError:
- stop = len(sequence)
+ try: sequence[stop-1]
+ except IndexError: stop = len(sequence)
# Make sure start isn't past stop.
start = min(start, stop)
# That's all folks!
return start, stop
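# For example, on a four-element list:
#
#     >>> slice_bounds(['a', 'b', 'c', 'd'], slice(1, None))
#     (1, 4)
#     >>> slice_bounds(['a', 'b', 'c', 'd'], slice(-3, -1))
#     (1, 3)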
-
######################################################################
# Permission Checking
######################################################################
-
def is_writable(path):
# Ensure that it exists.
if not os.path.exists(path):
return False
# If we're on a posix system, check its permissions.
- if hasattr(os, "getuid"):
+ if hasattr(os, 'getuid'):
statdata = os.stat(path)
perm = stat.S_IMODE(statdata.st_mode)
# is it world-writable?
- if perm & 0o002:
+ if (perm & 0o002):
return True
# do we own it?
elif statdata.st_uid == os.getuid() and (perm & 0o200):
return True
# are we in a group that can write to it?
- elif (statdata.st_gid in [os.getgid()] + os.getgroups()) and (perm & 0o020):
+ elif (statdata.st_gid in [os.getgid()] + os.getgroups()) \
+ and (perm & 0o020):
return True
# otherwise, we can't write to it.
else:
# [xx] should we do other checks on other platforms?
return True
-
######################################################################
# NLTK Error reporting
######################################################################
-
def raise_unorderable_types(ordering, a, b):
- raise TypeError(
- "unorderable types: %s() %s %s()"
- % (type(a).__name__, ordering, type(b).__name__)
- )
+ raise TypeError("unorderable types: %s() %s %s()" % (type(a).__name__, ordering, type(b).__name__))
# -*- coding: utf-8 -*-
# Natural Language Toolkit: JSON Encoder/Decoder Helpers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Xu <xxu@student.unimelb.edu.au>
#
# URL: <http://nltk.org/>
json_tags = {}
-TAG_PREFIX = "!"
-
+TAG_PREFIX = '!'
def register_tag(cls):
"""
Decorates a class to register its JSON tag.
"""
- json_tags[TAG_PREFIX + getattr(cls, "json_tag")] = cls
+ json_tags[TAG_PREFIX+getattr(cls, 'json_tag')] = cls
return cls
-
class JSONTaggedEncoder(json.JSONEncoder):
def default(self, obj):
- obj_tag = getattr(obj, "json_tag", None)
+ obj_tag = getattr(obj, 'json_tag', None)
if obj_tag is None:
return super(JSONTaggedEncoder, self).default(obj)
obj_tag = TAG_PREFIX + obj_tag
obj = obj.encode_json_obj()
return {obj_tag: obj}
-
class JSONTaggedDecoder(json.JSONDecoder):
def decode(self, s):
return self.decode_obj(super(JSONTaggedDecoder, self).decode(s))
if not isinstance(obj, dict) or len(obj) != 1:
return obj
obj_tag = next(iter(obj.keys()))
- if not obj_tag.startswith("!"):
+ if not obj_tag.startswith('!'):
return obj
if obj_tag not in json_tags:
- raise ValueError("Unknown tag", obj_tag)
+ raise ValueError('Unknown tag', obj_tag)
obj_cls = json_tags[obj_tag]
return obj_cls.decode_json_obj(obj[obj_tag])
-
-__all__ = ["register_tag", "json_tags", "JSONTaggedEncoder", "JSONTaggedDecoder"]
+__all__ = ['register_tag', 'json_tags',
+ 'JSONTaggedEncoder', 'JSONTaggedDecoder']
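# A sketch of how a class plugs into this machinery (the Point class and its
# 'point' tag are hypothetical):
#
#     >>> @register_tag
#     ... class Point(object):
#     ...     json_tag = 'point'
#     ...     def __init__(self, x, y):
#     ...         self.x, self.y = x, y
#     ...     def encode_json_obj(self):
#     ...         return {'x': self.x, 'y': self.y}
#     ...     @classmethod
#     ...     def decode_json_obj(cls, obj):
#     ...         return cls(obj['x'], obj['y'])
#     >>> import json
#     >>> json.dumps(Point(1, 2), cls=JSONTaggedEncoder, sort_keys=True)
#     '{"!point": {"x": 1, "y": 2}}'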
See the documentation for further information on copyrights,
or contact the author. All Rights Reserved.
"""
+from __future__ import print_function
### Constants
###
-
class LazyModule:
""" Lazy module class.
t = ISO.Week(1998,1,1)
"""
-
# Flag which indicates whether the LazyModule is initialized or not
__lazymodule_init = 0
# Name of the module to load
- __lazymodule_name = ""
+ __lazymodule_name = ''
# Flag which indicates whether the module was loaded or not
__lazymodule_loaded = 0
if globals is None:
globals = locals
self.__lazymodule_globals = globals
- mainname = globals.get("__name__", "")
+ mainname = globals.get('__name__', '')
if mainname:
- self.__name__ = mainname + "." + name
+ self.__name__ = mainname + '.' + name
self.__lazymodule_name = name
else:
self.__name__ = self.__lazymodule_name = name
if self.__lazymodule_loaded:
return self.__lazymodule_locals[name]
if _debug:
- print("LazyModule: Loading module %r" % name)
- self.__lazymodule_locals[name] = module = __import__(
- name, self.__lazymodule_locals, self.__lazymodule_globals, "*"
- )
+ print('LazyModule: Loading module %r' % name)
+ self.__lazymodule_locals[name] \
+ = module \
+ = __import__(name,
+ self.__lazymodule_locals,
+ self.__lazymodule_globals,
+ '*')
# Fill namespace with all symbols from original module to
# provide faster access.
self.__dict__.update(module.__dict__)
# Set import flag
- self.__dict__["__lazymodule_loaded"] = 1
+ self.__dict__['__lazymodule_loaded'] = 1
if _debug:
- print("LazyModule: Module %r loaded" % name)
+ print('LazyModule: Module %r loaded' % name)
return module
def __getattr__(self, name):
if self.__lazymodule_loaded:
raise AttributeError(name)
if _debug:
- print(
- "LazyModule: "
- "Module load triggered by attribute %r read access" % name
- )
+ print('LazyModule: ' \
+ 'Module load triggered by attribute %r read access' % name)
module = self.__lazymodule_import()
return getattr(module, name)
self.__dict__[name] = value
return
if _debug:
- print(
- "LazyModule: "
- "Module load triggered by attribute %r write access" % name
- )
+ print('LazyModule: ' \
+ 'Module load triggered by attribute %r write access' % name)
module = self.__lazymodule_import()
setattr(module, name, value)
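# A minimal sketch: construction is cheap, and the real import only happens
# on first attribute access.
#
#     >>> lazy_os = LazyModule('os', locals())
#     >>> lazy_os.getcwd()  # first attribute access triggers the import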
+++ /dev/null
-# Natural Language Toolkit: Language Models
-#
-# Copyright (C) 2001-2020 NLTK Project
-# Authors: Ilia Kurenkov <ilia.kurenkov@gmail.com>
-# URL: <http://nltk.org/>
-# For license information, see LICENSE.TXT
-"""
-NLTK Language Modeling Module.
-------------------------------
-
-Currently this module covers only ngram language models, but it should be easy
-to extend to neural models.
-
-
-Preparing Data
-==============
-
-Before we train our ngram models it is necessary to make sure the data we put in
-them is in the right format.
-Let's say we have a text that is a list of sentences, where each sentence is
-a list of strings. For simplicity we just consider a text consisting of
-characters instead of words.
-
- >>> text = [['a', 'b', 'c'], ['a', 'c', 'd', 'c', 'e', 'f']]
-
-If we want to train a bigram model, we need to turn this text into bigrams.
-Here's what the first sentence of our text would look like if we use a function
-from NLTK for this.
-
- >>> from nltk.util import bigrams
- >>> list(bigrams(text[0]))
- [('a', 'b'), ('b', 'c')]
-
-Notice how "b" occurs both as the first and second member of different bigrams
-but "a" and "c" don't? Wouldn't it be nice to somehow indicate how often sentences
-start with "a" and end with "c"?
-A standard way to deal with this is to add special "padding" symbols to the
-sentence before splitting it into ngrams.
-Fortunately, NLTK also has a function for that, let's see what it does to the
-first sentence.
-
- >>> from nltk.util import pad_sequence
- >>> list(pad_sequence(text[0],
- ... pad_left=True,
- ... left_pad_symbol="<s>",
- ... pad_right=True,
- ... right_pad_symbol="</s>",
- ... n=2))
- ['<s>', 'a', 'b', 'c', '</s>']
-
-Note the `n` argument, that tells the function we need padding for bigrams.
-Now, passing all these parameters every time is tedious, and in most cases they
-can safely be left at their defaults anyway.
-Thus our module provides a convenience function that has all these arguments
-already set while the other arguments remain the same as for `pad_sequence`.
-
- >>> from nltk.lm.preprocessing import pad_both_ends
- >>> list(pad_both_ends(text[0], n=2))
- ['<s>', 'a', 'b', 'c', '</s>']
-
-Combining the two parts discussed so far we get the following preparation steps
-for one sentence.
-
- >>> list(bigrams(pad_both_ends(text[0], n=2)))
- [('<s>', 'a'), ('a', 'b'), ('b', 'c'), ('c', '</s>')]
-
-To make our model more robust we could also train it on unigrams (single words)
-as well as bigrams, its main source of information.
-NLTK once again helpfully provides a function called `everygrams`.
-While not the most efficient, it is conceptually simple.
-
-
- >>> from nltk.util import everygrams
- >>> padded_bigrams = list(pad_both_ends(text[0], n=2))
- >>> list(everygrams(padded_bigrams, max_len=2))
- [('<s>',),
- ('a',),
- ('b',),
- ('c',),
- ('</s>',),
- ('<s>', 'a'),
- ('a', 'b'),
- ('b', 'c'),
- ('c', '</s>')]
-
-We are almost ready to start counting ngrams, just one more step left.
-During training and evaluation our model will rely on a vocabulary that
-defines which words are "known" to the model.
-To create this vocabulary we need to pad our sentences (just like for counting
-ngrams) and then combine the sentences into one flat stream of words.
-
- >>> from nltk.lm.preprocessing import flatten
- >>> list(flatten(pad_both_ends(sent, n=2) for sent in text))
- ['<s>', 'a', 'b', 'c', '</s>', '<s>', 'a', 'c', 'd', 'c', 'e', 'f', '</s>']
-
-In most cases we want to use the same text as the source for both vocabulary
-and ngram counts.
-Now that we understand what this means for our preprocessing, we can simply import
-a function that does everything for us.
-
- >>> from nltk.lm.preprocessing import padded_everygram_pipeline
- >>> train, vocab = padded_everygram_pipeline(2, text)
-
-So as to avoid re-creating the text in memory, both `train` and `vocab` are lazy
-iterators. They are evaluated on demand at training time.
-
-
-Training
-========
-Having prepared our data we are ready to start training a model.
-As a simple example, let us train a Maximum Likelihood Estimator (MLE).
-We only need to specify the highest ngram order to instantiate it.
-
- >>> from nltk.lm import MLE
- >>> lm = MLE(2)
-
-This automatically creates an empty vocabulary...
-
- >>> len(lm.vocab)
- 0
-
-... which gets filled as we fit the model.
-
- >>> lm.fit(train, vocab)
- >>> print(lm.vocab)
- <Vocabulary with cutoff=1 unk_label='<UNK>' and 9 items>
- >>> len(lm.vocab)
- 9
-
-The vocabulary helps us handle words that have not occurred during training.
-
- >>> lm.vocab.lookup(text[0])
- ('a', 'b', 'c')
- >>> lm.vocab.lookup(["aliens", "from", "Mars"])
- ('<UNK>', '<UNK>', '<UNK>')
-
-Moreover, in some cases we want to ignore words that we did see during training
-but that didn't occur frequently enough to provide us with useful information.
-You can tell the vocabulary to ignore such words.
-To find out how that works, check out the docs for the `Vocabulary` class.
-
-
-Using a Trained Model
-=====================
-When it comes to ngram models the training boils down to counting up the ngrams
-from the training corpus.
-
- >>> print(lm.counts)
- <NgramCounter with 2 ngram orders and 24 ngrams>
-
-This provides a convenient interface to access counts for unigrams...
-
- >>> lm.counts['a']
- 2
-
-...and bigrams (in this case "a b")
-
- >>> lm.counts[['a']]['b']
- 1
-
-And so on. However, the real purpose of training a language model is to have it
-score how probable words are in certain contexts.
-This being MLE, the model returns the item's relative frequency as its score.
-
- >>> lm.score("a")
- 0.15384615384615385
-
-Items that are not seen during training are mapped to the vocabulary's
-"unknown label" token. This is "<UNK>" by default.
-
- >>> lm.score("<UNK>") == lm.score("aliens")
- True
-
-Here's how you get the score for a word given some preceding context.
-For example, we want to know the chance that "b" follows "a".
-
- >>> lm.score("b", ["a"])
- 0.5
-
-To avoid underflow when working with many small score values it makes sense to
-take their logarithm.
-For convenience this can be done with the `logscore` method.
-
- >>> lm.logscore("a")
- -2.700439718141092
-
-Building on this method, we can also evaluate our model's cross-entropy and
-perplexity with respect to sequences of ngrams.
-
- >>> test = [('a', 'b'), ('c', 'd')]
- >>> lm.entropy(test)
- 1.292481250360578
- >>> lm.perplexity(test)
- 2.449489742783178
-
-It is advisable to preprocess your test text exactly the same way as you did
-the training text.
-
-One cool feature of ngram models is that they can be used to generate text.
-
- >>> lm.generate(1, random_seed=3)
- '<s>'
- >>> lm.generate(5, random_seed=3)
- ['<s>', 'a', 'b', 'c', 'd']
-
-Provide `random_seed` if you want to consistently reproduce the same text all
-other things being equal. Here we are using it to test the examples.
-
-You can also condition your generation on some preceding text with the
-`text_seed` argument.
-
- >>> lm.generate(5, text_seed=['c'], random_seed=3)
- ['</s>', 'c', 'd', 'c', 'd']
-
-Note that an ngram model is restricted in how much preceding context it can
-take into account. For example, a trigram model can only condition its output
-on 2 preceding words. If you pass in a 4-word context, the first two words
-will be ignored.
-"""
-
-from nltk.lm.models import (
- MLE,
- Lidstone,
- Laplace,
- WittenBellInterpolated,
- KneserNeyInterpolated,
-)
-from nltk.lm.counter import NgramCounter
-from nltk.lm.vocabulary import Vocabulary
-
-__all__ = [
- "Vocabulary",
- "NgramCounter",
- "MLE",
- "Lidstone",
- "Laplace",
- "WittenBellInterpolated",
- "KneserNeyInterpolated",
-]
+++ /dev/null
-# Natural Language Toolkit: Language Models
-#
-# Copyright (C) 2001-2020 NLTK Project
-# Authors: Ilia Kurenkov <ilia.kurenkov@gmail.com>
-# URL: <http://nltk.org/>
-# For license information, see LICENSE.TXT
-"""Language Model Interface."""
-
-import random
-from abc import ABCMeta, abstractmethod
-from bisect import bisect
-
-
-from nltk.lm.counter import NgramCounter
-from nltk.lm.util import log_base2
-from nltk.lm.vocabulary import Vocabulary
-
-from itertools import accumulate
-
-
-class Smoothing(metaclass=ABCMeta):
- """Ngram Smoothing Interface
-
- Implements Chen & Goodman 1995's idea that all smoothing algorithms have
- certain features in common. This should ideally allow smoothing algorithms to
- work both with Backoff and Interpolation.
- """
-
- def __init__(self, vocabulary, counter):
- """
- :param vocabulary: The Ngram vocabulary object.
- :type vocabulary: nltk.lm.vocab.Vocabulary
- :param counter: The counts of the vocabulary items.
- :type counter: nltk.lm.counter.NgramCounter
- """
- self.vocab = vocabulary
- self.counts = counter
-
- @abstractmethod
- def unigram_score(self, word):
- raise NotImplementedError()
-
- @abstractmethod
- def alpha_gamma(self, word, context):
- raise NotImplementedError()
-
-
-def _mean(items):
- """Return average (aka mean) for sequence of items."""
- return sum(items) / len(items)
-
-
-def _random_generator(seed_or_generator):
- if isinstance(seed_or_generator, random.Random):
- return seed_or_generator
- return random.Random(seed_or_generator)
-
-
-def _weighted_choice(population, weights, random_generator=None):
- """Like random.choice, but with weights.
-
- Heavily inspired by python 3.6 `random.choices`.
- """
- if not population:
- raise ValueError("Can't choose from empty population")
- if len(population) != len(weights):
- raise ValueError("The number of weights does not match the population")
- cum_weights = list(accumulate(weights))
- total = cum_weights[-1]
- threshold = random_generator.random()
- return population[bisect(cum_weights, total * threshold)]
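-# For illustration: _weighted_choice(['a', 'b'], [3, 1], random.Random(3))
-# deterministically returns 'a', since the seeded draw falls below the first
-# cumulative weight (3 out of a total of 4).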
-
-
-class LanguageModel(metaclass=ABCMeta):
- """ABC for Language Models.
-
- Cannot be directly instantiated itself.
-
- """
-
- def __init__(self, order, vocabulary=None, counter=None):
- """Creates new LanguageModel.
-
- :param vocabulary: If provided, this vocabulary will be used instead
- of creating a new one when training.
- :type vocabulary: `nltk.lm.Vocabulary` or None
- :param counter: If provided, use this object to count ngrams.
- :type counter: `nltk.lm.NgramCounter` or None
- :param ngrams_fn: If given, defines how sentences in training text are turned into ngram
- sequences.
- :type ngrams_fn: function or None
- :param pad_fn: If given, defines how sentences in training text are padded.
- :type pad_fn: function or None
-
- """
- self.order = order
- self.vocab = Vocabulary() if vocabulary is None else vocabulary
- self.counts = NgramCounter() if counter is None else counter
-
- def fit(self, text, vocabulary_text=None):
- """Trains the model on a text.
-
- :param text: Training text as a sequence of sentences.
-
- """
- if not self.vocab:
- if vocabulary_text is None:
- raise ValueError(
- "Cannot fit without a vocabulary or text to create it from."
- )
- self.vocab.update(vocabulary_text)
- self.counts.update(self.vocab.lookup(sent) for sent in text)
-
- def score(self, word, context=None):
- """Masks out of vocab (OOV) words and computes their model score.
-
- For model-specific logic of calculating scores, see the `unmasked_score`
- method.
- """
- return self.unmasked_score(
- self.vocab.lookup(word), self.vocab.lookup(context) if context else None
- )
-
- @abstractmethod
- def unmasked_score(self, word, context=None):
- """Score a word given some optional context.
-
- Concrete models are expected to provide an implementation.
- Note that this method does not mask its arguments with the OOV label.
- Use the `score` method for that.
-
- :param str word: Word for which we want the score
- :param tuple(str) context: Context the word is in.
- If `None`, compute unigram score.
- :param context: tuple(str) or None
- :rtype: float
-
- """
- raise NotImplementedError()
-
- def logscore(self, word, context=None):
- """Evaluate the log score of this word in this context.
-
- The arguments are the same as for `score` and `unmasked_score`.
-
- """
- return log_base2(self.score(word, context))
-
- def context_counts(self, context):
- """Helper method for retrieving counts for a given context.
-
- Assumes context has been checked and oov words in it masked.
- :type context: tuple(str) or None
-
- """
- return (
- self.counts[len(context) + 1][context] if context else self.counts.unigrams
- )
-
- def entropy(self, text_ngrams):
- """Calculate cross-entropy of model for given evaluation text.
-
- :param Iterable(tuple(str)) text_ngrams: A sequence of ngram tuples.
- :rtype: float
-
- """
- return -1 * _mean(
- [self.logscore(ngram[-1], ngram[:-1]) for ngram in text_ngrams]
- )
-
- def perplexity(self, text_ngrams):
- """Calculates the perplexity of the given text.
-
- This is simply 2 ** cross-entropy for the text, so the arguments are the same.
-
- """
- return pow(2.0, self.entropy(text_ngrams))
-
- def generate(self, num_words=1, text_seed=None, random_seed=None):
- """Generate words from the model.
-
- :param int num_words: How many words to generate. By default 1.
- :param text_seed: Generation can be conditioned on preceding context.
- :param random_seed: A random seed or an instance of `random.Random`. If provided,
- makes the random sampling part of generation reproducible.
- :return: One (str) word or a list of words generated from model.
-
- Examples:
-
- >>> from nltk.lm import MLE
- >>> lm = MLE(2)
- >>> lm.fit([[("a", "b"), ("b", "c")]], vocabulary_text=['a', 'b', 'c'])
- >>> lm.fit([[("a",), ("b",), ("c",)]])
- >>> lm.generate(random_seed=3)
- 'a'
- >>> lm.generate(text_seed=['a'])
- 'b'
-
- """
- text_seed = [] if text_seed is None else list(text_seed)
- random_generator = _random_generator(random_seed)
- # This is the base recursion case.
- if num_words == 1:
- context = (
- text_seed[-self.order + 1 :]
- if len(text_seed) >= self.order
- else text_seed
- )
- samples = self.context_counts(self.vocab.lookup(context))
- while context and not samples:
- context = context[1:] if len(context) > 1 else []
- samples = self.context_counts(self.vocab.lookup(context))
- # Sorting samples achieves two things:
- # - reproducible randomness when sampling
- # - turns Mapping into Sequence which `_weighted_choice` expects
- samples = sorted(samples)
- return _weighted_choice(
- samples,
- tuple(self.score(w, context) for w in samples),
- random_generator,
- )
- # We build up text one word at a time using the preceding context.
- generated = []
- for _ in range(num_words):
- generated.append(
- self.generate(
- num_words=1,
- text_seed=text_seed + generated,
- random_seed=random_generator,
- )
- )
- return generated
+++ /dev/null
-# Natural Language Toolkit
-#
-# Copyright (C) 2001-2020 NLTK Project
-# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
-# URL: <http://nltk.org/>
-# For license information, see LICENSE.TXT
-"""
-Language Model Counter
-----------------------
-"""
-
-from collections import defaultdict
-from collections.abc import Sequence
-
-from nltk.probability import ConditionalFreqDist, FreqDist
-
-
-class NgramCounter:
- """Class for counting ngrams.
-
- Will count any ngram sequence you give it ;)
-
- First we need to make sure we are feeding the counter sentences of ngrams.
-
- >>> text = [["a", "b", "c", "d"], ["a", "c", "d", "c"]]
- >>> from nltk.util import ngrams
- >>> text_bigrams = [ngrams(sent, 2) for sent in text]
- >>> text_unigrams = [ngrams(sent, 1) for sent in text]
-
- The counting itself is very simple.
-
- >>> from nltk.lm import NgramCounter
- >>> ngram_counts = NgramCounter(text_bigrams + text_unigrams)
-
- You can conveniently access ngram counts using standard python dictionary notation.
- String keys will give you unigram counts.
-
- >>> ngram_counts['a']
- 2
- >>> ngram_counts['aliens']
- 0
-
- If you want to access counts for higher order ngrams, use a list or a tuple.
- These are treated as "context" keys, so what you get is a frequency distribution
- over all continuations after the given context.
-
- >>> sorted(ngram_counts[['a']].items())
- [('b', 1), ('c', 1)]
- >>> sorted(ngram_counts[('a',)].items())
- [('b', 1), ('c', 1)]
-
- This is equivalent to specifying explicitly the order of the ngram (in this case
- 2 for bigram) and indexing on the context.
-
- >>> ngram_counts[2][('a',)] is ngram_counts[['a']]
- True
-
- Note that the keys in `ConditionalFreqDist` cannot be lists, only tuples!
- It is generally advisable to use the less verbose and more flexible square
- bracket notation.
-
- To get the count of the full ngram "a b", do this:
-
- >>> ngram_counts[['a']]['b']
- 1
-
- Specifying the ngram order as a number can be useful for accessing all ngrams
- in that order.
-
- >>> ngram_counts[2]
- <ConditionalFreqDist with 4 conditions>
-
- The keys of this `ConditionalFreqDist` are the contexts we discussed earlier.
- Unigrams can also be accessed with a human-friendly alias.
-
- >>> ngram_counts.unigrams is ngram_counts[1]
- True
-
- Similarly to `collections.Counter`, you can update counts after initialization.
-
- >>> ngram_counts['e']
- 0
- >>> ngram_counts.update([ngrams(["d", "e", "f"], 1)])
- >>> ngram_counts['e']
- 1
-
- """
-
- def __init__(self, ngram_text=None):
- """Creates a new NgramCounter.
-
- If `ngram_text` is specified, counts ngrams from it, otherwise waits for
- `update` method to be called explicitly.
-
- :param ngram_text: Optional text containing sentences of ngrams, as for `update` method.
- :type ngram_text: Iterable(Iterable(tuple(str))) or None
-
- """
- self._counts = defaultdict(ConditionalFreqDist)
- self._counts[1] = self.unigrams = FreqDist()
-
- if ngram_text:
- self.update(ngram_text)
-
- def update(self, ngram_text):
- """Updates ngram counts from `ngram_text`.
-
- Expects `ngram_text` to be a sequence of sentences (sequences).
- Each sentence consists of ngrams as tuples of strings.
-
- :param Iterable(Iterable(tuple(str))) ngram_text: Text containing sentences of ngrams.
- :raises TypeError: if the ngrams are not tuples.
-
- """
-
- for sent in ngram_text:
- for ngram in sent:
- if not isinstance(ngram, tuple):
- raise TypeError(
- "Ngram <{0}> isn't a tuple, "
- "but {1}".format(ngram, type(ngram))
- )
-
- ngram_order = len(ngram)
- if ngram_order == 1:
- self.unigrams[ngram[0]] += 1
- continue
-
- context, word = ngram[:-1], ngram[-1]
- self[ngram_order][context][word] += 1
-
- def N(self):
- """Returns grand total number of ngrams stored.
-
- This includes ngrams from all orders, so some duplication is expected.
- :rtype: int
-
- >>> from nltk.lm import NgramCounter
- >>> counts = NgramCounter([[("a", "b"), ("c",), ("d", "e")]])
- >>> counts.N()
- 3
-
- """
- return sum(val.N() for val in self._counts.values())
-
- def __getitem__(self, item):
- """User-friendly access to ngram counts."""
- if isinstance(item, int):
- return self._counts[item]
- elif isinstance(item, str):
- return self._counts.__getitem__(1)[item]
- elif isinstance(item, Sequence):
- return self._counts.__getitem__(len(item) + 1)[tuple(item)]
-
- def __str__(self):
- return "<{0} with {1} ngram orders and {2} ngrams>".format(
- self.__class__.__name__, len(self._counts), self.N()
- )
-
- def __len__(self):
- return self._counts.__len__()
-
- def __contains__(self, item):
- return item in self._counts
+++ /dev/null
-# Natural Language Toolkit: Language Models
-#
-# Copyright (C) 2001-2020 NLTK Project
-# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
-# URL: <http://nltk.org/>
-# For license information, see LICENSE.TXT
-"""Language Models"""
-
-from nltk.lm.api import LanguageModel, Smoothing
-from nltk.lm.smoothing import KneserNey, WittenBell
-
-
-class MLE(LanguageModel):
- """Class for providing MLE ngram model scores.
-
- Inherits initialization from LanguageModel.
- """
-
- def unmasked_score(self, word, context=None):
- """Returns the MLE score for a word given a context.
-
- Args:
- - word is expected to be a string
- - context is expected to be something reasonably convertible to a tuple
- """
- return self.context_counts(context).freq(word)
-
-
-class Lidstone(LanguageModel):
- """Provides Lidstone-smoothed scores.
-
- In addition to the initialization arguments from LanguageModel, it also
- requires a number by which to increase the counts, gamma.
- """
-
- def __init__(self, gamma, *args, **kwargs):
- super().__init__(*args, **kwargs)
- self.gamma = gamma
-
- def unmasked_score(self, word, context=None):
- """Add-one smoothing: Lidstone or Laplace.
-
- To see what kind, look at `gamma` attribute on the class.
-
- """
- counts = self.context_counts(context)
- word_count = counts[word]
- norm_count = counts.N()
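- # i.e. (count(w|h) + gamma) / (count(h) + gamma * |V|); gamma == 1 gives Laplace.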
- return (word_count + self.gamma) / (norm_count + len(self.vocab) * self.gamma)
-
-
-class Laplace(Lidstone):
- """Implements Laplace (add one) smoothing.
-
- Initialization identical to LanguageModel because gamma is always 1.
- """
-
- def __init__(self, *args, **kwargs):
- super().__init__(1, *args, **kwargs)
-
-
-class InterpolatedLanguageModel(LanguageModel):
- """Logic common to all interpolated language models.
-
- The idea to abstract this comes from Chen & Goodman 1995.
- Do not instantiate this class directly!
- """
-
- def __init__(self, smoothing_cls, order, **kwargs):
- assert issubclass(smoothing_cls, Smoothing)
- params = kwargs.pop("params", {})
- super().__init__(order, **kwargs)
- self.estimator = smoothing_cls(self.vocab, self.counts, **params)
-
- def unmasked_score(self, word, context=None):
- if not context:
- # The base recursion case: no context, we only have a unigram.
- return self.estimator.unigram_score(word)
- if not self.counts[context]:
- # It can also happen that we have no data for this context.
- # In that case we defer to the lower-order ngram.
- # This is the same as setting alpha to 0 and gamma to 1.
- return self.unmasked_score(word, context[1:])
- alpha, gamma = self.estimator.alpha_gamma(word, context)
- return alpha + gamma * self.unmasked_score(word, context[1:])
-
-
-class WittenBellInterpolated(InterpolatedLanguageModel):
- """Interpolated version of Witten-Bell smoothing."""
-
- def __init__(self, order, **kwargs):
- super().__init__(WittenBell, order, **kwargs)
-
-
-class KneserNeyInterpolated(InterpolatedLanguageModel):
- """Interpolated version of Kneser-Ney smoothing."""
-
- def __init__(self, order, discount=0.1, **kwargs):
- super().__init__(KneserNey, order, params={"discount": discount}, **kwargs)
+++ /dev/null
-# Natural Language Toolkit: Language Model Preprocessing
-#
-# Copyright (C) 2001-2020 NLTK Project
-# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
-# URL: <http://nltk.org/>
-# For license information, see LICENSE.TXT
-from functools import partial
-from itertools import chain
-
-from nltk.util import everygrams, pad_sequence
-
-flatten = chain.from_iterable
-pad_both_ends = partial(
- pad_sequence,
- pad_left=True,
- left_pad_symbol="<s>",
- pad_right=True,
- right_pad_symbol="</s>",
-)
-pad_both_ends.__doc__ = """Pads both ends of a sentence to length specified by ngram order.
-
- Following convention, <s> pads the start of the sentence and </s> pads its end.
- """
-
-
-def padded_everygrams(order, sentence):
- """Helper with some useful defaults.
-
- Applies pad_both_ends to sentence and follows it up with everygrams.
- """
- return everygrams(list(pad_both_ends(sentence, n=order)), max_len=order)
-
-
-def padded_everygram_pipeline(order, text):
- """Default preprocessing for a sequence of sentences.
-
- Creates two iterators:
- - sentences padded and turned into sequences of `nltk.util.everygrams`
- - sentences padded as above and chained together for a flat stream of words
-
- :param order: Largest ngram length produced by `everygrams`.
- :param text: Text to iterate over. Expected to be an iterable of sentences:
- Iterable[Iterable[str]]
- :return: iterator over text as ngrams, iterator over text as vocabulary data
- """
- padding_fn = partial(pad_both_ends, n=order)
- return (
- (everygrams(list(padding_fn(sent)), max_len=order) for sent in text),
- flatten(map(padding_fn, text)),
- )
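-
-# For example, padded_everygram_pipeline(2, [['a', 'b']]) yields, per sentence,
-# the everygrams over ['<s>', 'a', 'b', '</s>'], plus those same padded tokens
-# chained into one flat stream for building the vocabulary.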
+++ /dev/null
-# Natural Language Toolkit: Language Model Smoothing
-#
-# Copyright (C) 2001-2020 NLTK Project
-# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
-# URL: <http://nltk.org/>
-# For license information, see LICENSE.TXT
-"""Smoothing algorithms for language modeling.
-
-According to Chen & Goodman 1995 these should work with both Backoff and
-Interpolation.
-"""
-
-from nltk.lm.api import Smoothing
-
-
-def _count_non_zero_vals(dictionary):
- return sum(1.0 for c in dictionary.values() if c > 0)
-
-
-class WittenBell(Smoothing):
- """Witten-Bell smoothing."""
-
- def __init__(self, vocabulary, counter, **kwargs):
- super().__init__(vocabulary, counter, **kwargs)
-
- def alpha_gamma(self, word, context):
- alpha = self.counts[context].freq(word)
- gamma = self._gamma(context)
- return (1.0 - gamma) * alpha, gamma
-
- def _gamma(self, context):
- n_plus = _count_non_zero_vals(self.counts[context])
- return n_plus / (n_plus + self.counts[len(context) + 1].N())
-
- def unigram_score(self, word):
- return self.counts.unigrams.freq(word)
-
-
-class KneserNey(Smoothing):
- """Kneser-Ney Smoothing."""
-
- def __init__(self, vocabulary, counter, discount=0.1, **kwargs):
- super().__init__(vocabulary, counter, **kwargs)
- self.discount = discount
-
- def unigram_score(self, word):
- return 1.0 / len(self.vocab)
-
- def alpha_gamma(self, word, context):
- prefix_counts = self.counts[context]
- prefix_total_ngrams = prefix_counts.N()
- alpha = max(prefix_counts[word] - self.discount, 0.0) / prefix_total_ngrams
- gamma = (
- self.discount * _count_non_zero_vals(prefix_counts) / prefix_total_ngrams
- )
- return alpha, gamma
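-
-# In symbols (a sketch of alpha_gamma above), for context h and word w:
-#   alpha(w, h) = max(c(h, w) - d, 0) / c(h)
-#   gamma(h) = d * N1+(h) / c(h)
-# where d is the discount and N1+(h) is the number of distinct words
-# observed after h.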
+++ /dev/null
-# Natural Language Toolkit
-#
-# Copyright (C) 2001-2020 NLTK Project
-# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
-# URL: <http://nltk.org/>
-# For license information, see LICENSE.TXT
-"""Language Model Utilities"""
-
-from math import log
-
-NEG_INF = float("-inf")
-POS_INF = float("inf")
-
-
-def log_base2(score):
- """Convenience function for computing logarithms with base 2."""
- if score == 0.0:
- return NEG_INF
- return log(score, 2)
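-
-# e.g. log_base2(0.25) == -2.0, while log_base2(0) returns NEG_INF.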
+++ /dev/null
-# Natural Language Toolkit
-#
-# Copyright (C) 2001-2020 NLTK Project
-# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
-# URL: <http://nltk.org/>
-# For license information, see LICENSE.TXT
-"""Language Model Vocabulary"""
-
-import sys
-from collections import Counter
-from collections.abc import Iterable
-from itertools import chain
-from functools import singledispatch
-
-
-@singledispatch
-def _dispatched_lookup(words, vocab):
- raise TypeError(
- "Unsupported type for looking up in vocabulary: {0}".format(type(words))
- )
-
-
-@_dispatched_lookup.register(Iterable)
-def _(words, vocab):
- """Look up a sequence of words in the vocabulary.
-
- Returns a tuple of the looked-up words.
-
- """
- return tuple(_dispatched_lookup(w, vocab) for w in words)
-
-
-@_dispatched_lookup.register(str)
-def _string_lookup(word, vocab):
- """Looks up one word in the vocabulary."""
- return word if word in vocab else vocab.unk_label
-
-
-class Vocabulary:
- """Stores language model vocabulary.
-
- Satisfies two common language modeling requirements for a vocabulary:
- - When checking membership and calculating its size, filters items
- by comparing their counts to a cutoff value.
- - Adds a special "unknown" token which unseen words are mapped to.
-
- >>> words = ['a', 'c', '-', 'd', 'c', 'a', 'b', 'r', 'a', 'c', 'd']
- >>> from nltk.lm import Vocabulary
- >>> vocab = Vocabulary(words, unk_cutoff=2)
-
- Tokens with counts greater than or equal to the cutoff value will
- be considered part of the vocabulary.
-
- >>> vocab['c']
- 3
- >>> 'c' in vocab
- True
- >>> vocab['d']
- 2
- >>> 'd' in vocab
- True
-
- Tokens with frequency counts less than the cutoff value will be considered not
- part of the vocabulary even though their entries in the count dictionary are
- preserved.
-
- >>> vocab['b']
- 1
- >>> 'b' in vocab
- False
- >>> vocab['aliens']
- 0
- >>> 'aliens' in vocab
- False
-
- Keeping the count entries for seen words allows us to change the cutoff value
- without having to recalculate the counts.
-
- >>> vocab2 = Vocabulary(vocab.counts, unk_cutoff=1)
- >>> "b" in vocab2
- True
-
- The cutoff value influences not only membership checking but also the result of
- getting the size of the vocabulary using the built-in `len`.
- Note that while the number of keys in the vocabulary's counter stays the same,
- the items in the vocabulary differ depending on the cutoff.
- We use `sorted` to demonstrate because it keeps the order consistent.
-
- >>> sorted(vocab2.counts)
- ['-', 'a', 'b', 'c', 'd', 'r']
- >>> sorted(vocab2)
- ['-', '<UNK>', 'a', 'b', 'c', 'd', 'r']
- >>> sorted(vocab.counts)
- ['-', 'a', 'b', 'c', 'd', 'r']
- >>> sorted(vocab)
- ['<UNK>', 'a', 'c', 'd']
-
- In addition to items it gets populated with, the vocabulary stores a special
- token that stands in for so-called "unknown" items. By default it's "<UNK>".
-
- >>> "<UNK>" in vocab
- True
-
- We can look up words in a vocabulary using its `lookup` method.
- "Unseen" words (with counts less than cutoff) are looked up as the unknown label.
- If given one word (a string) as an input, this method will return a string.
-
- >>> vocab.lookup("a")
- 'a'
- >>> vocab.lookup("aliens")
- '<UNK>'
-
- If given a sequence, it will return a tuple of the looked-up words.
-
- >>> vocab.lookup(["p", 'a', 'r', 'd', 'b', 'c'])
- ('<UNK>', 'a', '<UNK>', 'd', '<UNK>', 'c')
-
- It's possible to update the counts after the vocabulary has been created.
- In general, the interface is the same as that of `collections.Counter`.
-
- >>> vocab['b']
- 1
- >>> vocab.update(["b", "b", "c"])
- >>> vocab['b']
- 3
- """
-
- def __init__(self, counts=None, unk_cutoff=1, unk_label="<UNK>"):
- """Create a new Vocabulary.
-
- :param counts: Optional iterable or `collections.Counter` instance to
- pre-seed the Vocabulary. If it is an iterable, the counts
- are calculated from it.
- :param int unk_cutoff: Words that occur less frequently than this value
- are not considered part of the vocabulary.
- :param unk_label: Label for marking words not part of vocabulary.
-
- """
- if isinstance(counts, Counter):
- self.counts = counts
- else:
- self.counts = Counter()
- if isinstance(counts, Iterable):
- self.counts.update(counts)
- self.unk_label = unk_label
- if unk_cutoff < 1:
- raise ValueError(
- "Cutoff value cannot be less than 1. Got: {0}".format(unk_cutoff)
- )
- self._cutoff = unk_cutoff
-
- @property
- def cutoff(self):
- """Cutoff value.
-
- Items with count below this value are not considered part of vocabulary.
-
- """
- return self._cutoff
-
- def update(self, *counter_args, **counter_kwargs):
- """Update vocabulary counts.
-
- Wraps `collections.Counter.update` method.
-
- """
- self.counts.update(*counter_args, **counter_kwargs)
-
- def lookup(self, words):
- """Look up one or more words in the vocabulary.
-
- If passed a single word as a string, will return that word or `self.unk_label`.
- Otherwise will assume it was passed a sequence of words, will look each
- of them up and return a tuple of the looked-up words.
-
- :param words: Word(s) to look up.
- :type words: Iterable(str) or str
- :rtype: tuple(str) or str
- :raises: TypeError for types other than strings or iterables
-
- >>> from nltk.lm import Vocabulary
- >>> vocab = Vocabulary(["a", "b", "c", "a", "b"], unk_cutoff=2)
- >>> vocab.lookup("a")
- 'a'
- >>> vocab.lookup("aliens")
- '<UNK>'
- >>> vocab.lookup(["a", "b", "c", ["x", "b"]])
- ('a', 'b', '<UNK>', ('<UNK>', 'b'))
-
- """
- return _dispatched_lookup(words, self)
-
- def __getitem__(self, item):
- return self._cutoff if item == self.unk_label else self.counts[item]
-
- def __contains__(self, item):
- """Only consider items with counts GE to cutoff as being in the
- vocabulary."""
- return self[item] >= self.cutoff
-
- def __iter__(self):
- """Building on membership check define how to iterate over
- vocabulary."""
- return chain(
- (item for item in self.counts if item in self),
- [self.unk_label] if self.counts else [],
- )
-
- def __len__(self):
- """Computing size of vocabulary reflects the cutoff."""
- return sum(1 for _ in self)
-
- def __eq__(self, other):
- return (
- self.unk_label == other.unk_label
- and self.cutoff == other.cutoff
- and self.counts == other.counts
- )
-
- def __str__(self):
- return "<{0} with cutoff={1} unk_label='{2}' and {3} items>".format(
- self.__class__.__name__, self.cutoff, self.unk_label, len(self)
- )
# Natural Language Toolkit: Metrics
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
Classes and methods for scoring processing modules.
"""
-from nltk.metrics.scores import (
- accuracy,
- precision,
- recall,
- f_measure,
- log_likelihood,
- approxrand,
-)
+from nltk.metrics.scores import (accuracy, precision, recall, f_measure,
+ log_likelihood, approxrand)
from nltk.metrics.confusionmatrix import ConfusionMatrix
-from nltk.metrics.distance import (
- edit_distance,
- edit_distance_align,
- binary_distance,
- jaccard_distance,
- masi_distance,
- interval_distance,
- custom_distance,
- presence,
- fractional_presence,
-)
-from nltk.metrics.paice import Paice
-from nltk.metrics.segmentation import windowdiff, ghd, pk
-from nltk.metrics.agreement import AnnotationTask
-from nltk.metrics.association import (
- NgramAssocMeasures,
- BigramAssocMeasures,
- TrigramAssocMeasures,
- QuadgramAssocMeasures,
- ContingencyMeasures,
-)
-from nltk.metrics.spearman import (
- spearman_correlation,
- ranks_from_sequence,
- ranks_from_scores,
-)
-from nltk.metrics.aline import align
+from nltk.metrics.distance import (edit_distance, binary_distance,
+ jaccard_distance, masi_distance,
+ interval_distance, custom_distance,
+ presence, fractional_presence)
+from nltk.metrics.paice import Paice
+from nltk.metrics.segmentation import windowdiff, ghd, pk
+from nltk.metrics.agreement import AnnotationTask
+from nltk.metrics.association import (NgramAssocMeasures, BigramAssocMeasures,
+ TrigramAssocMeasures, ContingencyMeasures)
+from nltk.metrics.spearman import (spearman_correlation, ranks_from_sequence,
+ ranks_from_scores)
+from nltk.metrics.aline import align
# Natural Language Toolkit: Agreement Metrics
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Tom Lippincott <tom@cs.columbia.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
1.0
"""
+from __future__ import print_function, unicode_literals, division
import logging
from itertools import groupby
from operator import itemgetter
+from six import iteritems
+
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.internals import deprecated
+from nltk.compat import python_2_unicode_compatible
from nltk.metrics.distance import binary_distance
-log = logging.getLogger(__name__)
-
+log = logging.getLogger(__file__)
+@python_2_unicode_compatible
class AnnotationTask(object):
"""Represents an annotation task, i.e. people assign labels to items.
self.load_array(data)
def __str__(self):
- return "\r\n".join(
- map(
- lambda x: "%s\t%s\t%s"
- % (x["coder"], x["item"].replace("_", "\t"), ",".join(x["labels"])),
- self.data,
- )
- )
+ return "\r\n".join(map(lambda x:"%s\t%s\t%s" %
+ (x['coder'], x['item'].replace('_', "\t"),
+ ",".join(x['labels'])), self.data))
def load_array(self, array):
"""Load an sequence of annotation results, appending to any data already loaded.
self.C.add(coder)
self.K.add(labels)
self.I.add(item)
- self.data.append({"coder": coder, "labels": labels, "item": item})
+ self.data.append({'coder':coder, 'labels':labels, 'item':item})
def agr(self, cA, cB, i, data=None):
"""Agreement between two coders on a given item
# cfedermann: we don't know what combination of coder/item will come
# first in x; to avoid StopIteration problems due to assuming an order
# cA,cB, we allow either for k1 and then look up the missing as k2.
- k1 = next((x for x in data if x["coder"] in (cA, cB) and x["item"] == i))
- if k1["coder"] == cA:
- k2 = next((x for x in data if x["coder"] == cB and x["item"] == i))
+ k1 = next((x for x in data if x['coder'] in (cA,cB) and x['item']==i))
+ if k1['coder'] == cA:
+ k2 = next((x for x in data if x['coder']==cB and x['item']==i))
else:
- k2 = next((x for x in data if x["coder"] == cA and x["item"] == i))
+ k2 = next((x for x in data if x['coder']==cA and x['item']==i))
- ret = 1.0 - float(self.distance(k1["labels"], k2["labels"]))
- log.debug("Observed agreement between %s and %s on %s: %f", cA, cB, i, ret)
- log.debug(
- 'Distance between "%r" and "%r": %f', k1["labels"], k2["labels"], 1.0 - ret
- )
+ ret = 1.0 - float(self.distance(k1['labels'], k2['labels']))
+ log.debug("Observed agreement between %s and %s on %s: %f",
+ cA, cB, i, ret)
+ log.debug("Distance between \"%r\" and \"%r\": %f",
+ k1['labels'], k2['labels'], 1.0 - ret)
return ret
def Nk(self, k):
- return float(sum(1 for x in self.data if x["labels"] == k))
+ return float(sum(1 for x in self.data if x['labels'] == k))
def Nik(self, i, k):
- return float(sum(1 for x in self.data if x["item"] == i and x["labels"] == k))
+ return float(sum(1 for x in self.data if x['item'] == i and x['labels'] == k))
def Nck(self, c, k):
- return float(sum(1 for x in self.data if x["coder"] == c and x["labels"] == k))
+ return float(sum(1 for x in self.data if x['coder'] == c and x['labels'] == k))
- @deprecated("Use Nk, Nik or Nck instead")
+ @deprecated('Use Nk, Nik or Nck instead')
def N(self, k=None, i=None, c=None):
"""Implements the "n-notation" used in Artstein and Poesio (2007)
elif k is not None and c is not None and i is None:
ret = self.Nck(c, k)
else:
- raise ValueError(
- "You must pass either i or c, not both! (k=%r,i=%r,c=%r)" % (k, i, c)
- )
+ raise ValueError("You must pass either i or c, not both! (k=%r,i=%r,c=%r)" % (k, i, c))
log.debug("Count on N[%s,%s,%s]: %d", k, i, c, ret)
return ret
"""Observed agreement between two coders on all items.
"""
- data = self._grouped_data(
- "item", (x for x in self.data if x["coder"] in (cA, cB))
- )
- ret = sum(self.agr(cA, cB, item, item_data) for item, item_data in data) / len(
- self.I
- )
+ data = self._grouped_data('item', (x for x in self.data if x['coder'] in (cA, cB)))
+ ret = sum(self.agr(cA, cB, item, item_data) for item, item_data in data) / len(self.I)
log.debug("Observed agreement between %s and %s: %f", cA, cB, ret)
return ret
log.debug("Average observed agreement: %f", ret)
return ret
- def Do_Kw_pairwise(self, cA, cB, max_distance=1.0):
+ def Do_alpha(self):
+ """The observed disagreement for the alpha coefficient.
+
+ The alpha coefficient, unlike the other metrics, uses this rather than
+ observed agreement.
+ """
+ total = 0.0
+ for i, itemdata in self._grouped_data('item'):
+ label_freqs = FreqDist(x['labels'] for x in itemdata)
+
+ for j, nj in iteritems(label_freqs):
+ for l, nl in iteritems(label_freqs):
+ total += float(nj * nl) * self.distance(l, j)
+ ret = (1.0 / (len(self.I) * len(self.C) * (len(self.C) - 1))) * total
+ log.debug("Observed disagreement: %f", ret)
+ return ret
+
+ def Do_Kw_pairwise(self,cA,cB,max_distance=1.0):
"""The observed disagreement for the weighted kappa coefficient.
"""
total = 0.0
- data = (x for x in self.data if x["coder"] in (cA, cB))
- for i, itemdata in self._grouped_data("item", data):
+ data = (x for x in self.data if x['coder'] in (cA, cB))
+ for i, itemdata in self._grouped_data('item', data):
# we should have two items; distance doesn't care which comes first
- total += self.distance(next(itemdata)["labels"], next(itemdata)["labels"])
+ total += self.distance(next(itemdata)['labels'],
+ next(itemdata)['labels'])
ret = total / (len(self.I) * max_distance)
log.debug("Observed disagreement between %s and %s: %f", cA, cB, ret)
"""Averaged over all labelers
"""
- ret = self._pairwise_average(
- lambda cA, cB: self.Do_Kw_pairwise(cA, cB, max_distance)
- )
+ ret = self._pairwise_average(lambda cA, cB: self.Do_Kw_pairwise(cA, cB, max_distance))
log.debug("Observed disagreement: %f", ret)
return ret
"""
total = 0.0
- label_freqs = FreqDist(x["labels"] for x in self.data)
- for k, f in label_freqs.items():
+ label_freqs = FreqDist(x['labels'] for x in self.data)
+ for k, f in iteritems(label_freqs):
total += f ** 2
Ae = total / ((len(self.I) * len(self.C)) ** 2)
return (self.avg_Ao() - Ae) / (1 - Ae)
def Ae_kappa(self, cA, cB):
Ae = 0.0
nitems = float(len(self.I))
- label_freqs = ConditionalFreqDist((x["labels"], x["coder"]) for x in self.data)
+ label_freqs = ConditionalFreqDist((x['labels'], x['coder']) for x in self.data)
for k in label_freqs.conditions():
Ae += (label_freqs[k][cA] / nitems) * (label_freqs[k][cB] / nitems)
return Ae
Ae = self._pairwise_average(self.Ae_kappa)
return (self.avg_Ao() - Ae) / (1.0 - Ae)
- def Disagreement(self, label_freqs):
- total_labels = sum(label_freqs.values())
- pairs = 0.0
- for j, nj in label_freqs.items():
- for l, nl in label_freqs.items():
- pairs += float(nj * nl) * self.distance(l, j)
- return 1.0 * pairs / (total_labels * (total_labels - 1))
-
def alpha(self):
"""Krippendorff 1980
"""
# check for degenerate cases
- if len(self.K) == 0:
+ if len(self.K)==0:
raise ValueError("Cannot calculate alpha, no data present!")
if len(self.K) == 1:
log.debug("Only one annotation value, allpha returning 1.")
return 1
- if len(self.C) == 1 and len(self.I) == 1:
+ if len(self.C)==1 and len(self.I) == 1:
raise ValueError("Cannot calculate alpha, only one coder and item present!")
- total_disagreement = 0.0
- total_ratings = 0
- all_valid_labels_freq = FreqDist([])
-
- total_do = 0.0 # Total observed disagreement for all items.
- for i, itemdata in self._grouped_data("item"):
- label_freqs = FreqDist(x["labels"] for x in itemdata)
- labels_count = sum(label_freqs.values())
- if labels_count < 2:
- # Ignore the item.
- continue
- all_valid_labels_freq += label_freqs
- total_do += self.Disagreement(label_freqs) * labels_count
-
- do = total_do / sum(all_valid_labels_freq.values())
+ De = 0.0
- de = self.Disagreement(all_valid_labels_freq) # Expected disagreement.
- k_alpha = 1.0 - do / de
-
- return k_alpha
+ label_freqs = FreqDist(x['labels'] for x in self.data)
+ for j in self.K:
+ nj = label_freqs[j]
+ for l in self.K:
+ De += float(nj * label_freqs[l]) * self.distance(j, l)
+ try:
+ De = (1.0 / (len(self.I) * len(self.C) * (len(self.I) * len(self.C) - 1))) * De
+ log.debug("Expected disagreement: %f", De)
+ ret = 1.0 - (self.Do_alpha() / De)
+ except ZeroDivisionError:
+ raise ValueError("Cannot calculate alpha, expected disagreement zero, check the distance function!")
+ return ret
def weighted_kappa_pairwise(self, cA, cB, max_distance=1.0):
"""Cohen 1968
"""
total = 0.0
- label_freqs = ConditionalFreqDist(
- (x["coder"], x["labels"]) for x in self.data if x["coder"] in (cA, cB)
- )
+ label_freqs = ConditionalFreqDist((x['coder'], x['labels'])
+ for x in self.data
+ if x['coder'] in (cA, cB))
for j in self.K:
for l in self.K:
total += label_freqs[cA][j] * label_freqs[cB][l] * self.distance(j, l)
"""Cohen 1968
"""
- return self._pairwise_average(
- lambda cA, cB: self.weighted_kappa_pairwise(cA, cB, max_distance)
- )
+ return self._pairwise_average(lambda cA, cB: self.weighted_kappa_pairwise(cA, cB, max_distance))
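
For orientation, a minimal sketch of driving this class from Python rather
than through the command-line block below (toy data; each triple is
(coder, item, labels)):

    from nltk.metrics.agreement import AnnotationTask

    data = [
        ("c1", "item1", "pos"), ("c2", "item1", "pos"),
        ("c1", "item2", "neg"), ("c2", "item2", "pos"),
    ]
    task = AnnotationTask(data=data)  # binary_distance by default
    task.avg_Ao()                     # average observed agreement
    task.kappa()                      # chance-corrected, averaged pairwise
    task.alpha()                      # Krippendorff's alpha

Note that the two alpha implementations on either side of this diff can
return different values when items carry unequal numbers of labels.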
-if __name__ == "__main__":
+if __name__ == '__main__':
import re
import optparse
# process command-line arguments
parser = optparse.OptionParser()
- parser.add_option(
- "-d",
- "--distance",
- dest="distance",
- default="binary_distance",
- help="distance metric to use",
- )
- parser.add_option(
- "-a",
- "--agreement",
- dest="agreement",
- default="kappa",
- help="agreement coefficient to calculate",
- )
- parser.add_option(
- "-e",
- "--exclude",
- dest="exclude",
- action="append",
- default=[],
- help="coder names to exclude (may be specified multiple times)",
- )
- parser.add_option(
- "-i",
- "--include",
- dest="include",
- action="append",
- default=[],
- help="coder names to include, same format as exclude",
- )
- parser.add_option(
- "-f",
- "--file",
- dest="file",
- help="file to read labelings from, each line with three columns: 'labeler item labels'",
- )
- parser.add_option(
- "-v",
- "--verbose",
- dest="verbose",
- default="0",
- help="how much debugging to print on stderr (0-4)",
- )
- parser.add_option(
- "-c",
- "--columnsep",
- dest="columnsep",
- default="\t",
- help="char/string that separates the three columns in the file, defaults to tab",
- )
- parser.add_option(
- "-l",
- "--labelsep",
- dest="labelsep",
- default=",",
- help="char/string that separates labels (if labelers can assign more than one), defaults to comma",
- )
- parser.add_option(
- "-p",
- "--presence",
- dest="presence",
- default=None,
- help="convert each labeling into 1 or 0, based on presence of LABEL",
- )
- parser.add_option(
- "-T",
- "--thorough",
- dest="thorough",
- default=False,
- action="store_true",
- help="calculate agreement for every subset of the annotators",
- )
+ parser.add_option("-d", "--distance", dest="distance", default="binary_distance",
+ help="distance metric to use")
+ parser.add_option("-a", "--agreement", dest="agreement", default="kappa",
+ help="agreement coefficient to calculate")
+ parser.add_option("-e", "--exclude", dest="exclude", action="append",
+ default=[], help="coder names to exclude (may be specified multiple times)")
+ parser.add_option("-i", "--include", dest="include", action="append", default=[],
+ help="coder names to include, same format as exclude")
+ parser.add_option("-f", "--file", dest="file",
+ help="file to read labelings from, each line with three columns: 'labeler item labels'")
+ parser.add_option("-v", "--verbose", dest="verbose", default='0',
+ help="how much debugging to print on stderr (0-4)")
+ parser.add_option("-c", "--columnsep", dest="columnsep", default="\t",
+ help="char/string that separates the three columns in the file, defaults to tab")
+ parser.add_option("-l", "--labelsep", dest="labelsep", default=",",
+ help="char/string that separates labels (if labelers can assign more than one), defaults to comma")
+ parser.add_option("-p", "--presence", dest="presence", default=None,
+ help="convert each labeling into 1 or 0, based on presence of LABEL")
+ parser.add_option("-T", "--thorough", dest="thorough", default=False, action="store_true",
+ help="calculate agreement for every subset of the annotators")
(options, remainder) = parser.parse_args()
if not options.file:
# read in data from the specified file
data = []
- with open(options.file, "r") as infile:
+ with open(options.file, 'r') as infile:
for l in infile:
toks = l.split(options.columnsep)
- coder, object_, labels = (
- toks[0],
- str(toks[1:-1]),
- frozenset(toks[-1].strip().split(options.labelsep)),
- )
- if (
- (options.include == options.exclude)
- or (len(options.include) > 0 and coder in options.include)
- or (len(options.exclude) > 0 and coder not in options.exclude)
- ):
+ coder, object_, labels = toks[0], str(toks[1:-1]), frozenset(toks[-1].strip().split(options.labelsep))
+ if ((options.include == options.exclude) or
+ (len(options.include) > 0 and coder in options.include) or
+ (len(options.exclude) > 0 and coder not in options.exclude)):
data.append((coder, object_, labels))
if options.presence:
- task = AnnotationTask(
- data, getattr(distance, options.distance)(options.presence)
- )
+ task = AnnotationTask(data, getattr(distance, options.distance)(options.presence))
else:
task = AnnotationTask(data, getattr(distance, options.distance))
# -*- coding: utf-8 -*-
# Natural Language Toolkit: ALINE
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Greg Kondrak <gkondrak@ualberta.ca>
# Geoff Bacon <bacon@berkeley.edu> (Python port)
# URL: <http://nltk.org/>
University of Toronto.
"""
+from __future__ import unicode_literals
+
try:
import numpy as np
except ImportError:
# === Constants ===
-inf = float("inf")
+inf = float('inf')
# Default values for maximum similarity scores (Kondrak 2002: 54)
-C_skip = 10 # Indels
-C_sub = 35 # Substitutions
-C_exp = 45 # Expansions/compressions
-C_vwl = 5 # Vowel/consonant relative weight (decreased from 10)
-
-consonants = [
- "B",
- "N",
- "R",
- "b",
- "c",
- "d",
- "f",
- "g",
- "h",
- "j",
- "k",
- "l",
- "m",
- "n",
- "p",
- "q",
- "r",
- "s",
- "t",
- "v",
- "x",
- "z",
- "ç",
- "ð",
- "ħ",
- "ŋ",
- "ɖ",
- "ɟ",
- "ɢ",
- "ɣ",
- "ɦ",
- "ɬ",
- "ɮ",
- "ɰ",
- "ɱ",
- "ɲ",
- "ɳ",
- "ɴ",
- "ɸ",
- "ɹ",
- "ɻ",
- "ɽ",
- "ɾ",
- "ʀ",
- "ʁ",
- "ʂ",
- "ʃ",
- "ʈ",
- "ʋ",
- "ʐ ",
- "ʒ",
- "ʔ",
- "ʕ",
- "ʙ",
- "ʝ",
- "β",
- "θ",
- "χ",
- "ʐ",
- "w",
-]
+C_skip = 10 # Indels
+C_sub = 35 # Substitutions
+C_exp = 45 # Expansions/compressions
+C_vwl = 5 # Vowel/consonant relative weight (decreased from 10)
+
+consonants = ['B', 'N', 'R', 'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm',
+ 'n', 'p', 'q', 'r', 's', 't', 'v', 'x', 'z', 'ç', 'ð', 'ħ',
+ 'ŋ', 'ɖ', 'ɟ', 'ɢ', 'ɣ', 'ɦ', 'ɬ', 'ɮ', 'ɰ', 'ɱ', 'ɲ', 'ɳ', 'ɴ',
+ 'ɸ', 'ɹ', 'ɻ', 'ɽ', 'ɾ', 'ʀ', 'ʁ', 'ʂ', 'ʃ', 'ʈ', 'ʋ', 'ʐ ', 'ʒ',
+ 'ʔ', 'ʕ', 'ʙ', 'ʝ', 'β', 'θ', 'χ', 'ʐ', 'w']
# Relevant features for comparing consonants and vowels
-R_c = [
- "aspirated",
- "lateral",
- "manner",
- "nasal",
- "place",
- "retroflex",
- "syllabic",
- "voice",
-]
+R_c = ['aspirated', 'lateral', 'manner', 'nasal', 'place', 'retroflex',
+ 'syllabic', 'voice']
# 'high' taken out of R_v because same as manner
-R_v = [
- "back",
- "lateral",
- "long",
- "manner",
- "nasal",
- "place",
- "retroflex",
- "round",
- "syllabic",
- "voice",
-]
+R_v = ['back', 'lateral', 'long', 'manner', 'nasal', 'place',
+ 'retroflex', 'round', 'syllabic', 'voice']
# Flattened feature matrix (Kondrak 2002: 56)
similarity_matrix = {
- # place
- "bilabial": 1.0,
- "labiodental": 0.95,
- "dental": 0.9,
- "alveolar": 0.85,
- "retroflex": 0.8,
- "palato-alveolar": 0.75,
- "palatal": 0.7,
- "velar": 0.6,
- "uvular": 0.5,
- "pharyngeal": 0.3,
- "glottal": 0.1,
- "labiovelar": 1.0,
- "vowel": -1.0, # added 'vowel'
- # manner
- "stop": 1.0,
- "affricate": 0.9,
- "fricative": 0.85, # increased fricative from 0.8
- "trill": 0.7,
- "tap": 0.65,
- "approximant": 0.6,
- "high vowel": 0.4,
- "mid vowel": 0.2,
- "low vowel": 0.0,
- "vowel2": 0.5, # added vowel
- # high
- "high": 1.0,
- "mid": 0.5,
- "low": 0.0,
- # back
- "front": 1.0,
- "central": 0.5,
- "back": 0.0,
- # binary features
- "plus": 1.0,
- "minus": 0.0,
+ #place
+ 'bilabial': 1.0, 'labiodental': 0.95, 'dental': 0.9,
+ 'alveolar': 0.85, 'retroflex': 0.8, 'palato-alveolar': 0.75,
+ 'palatal': 0.7, 'velar': 0.6, 'uvular': 0.5, 'pharyngeal': 0.3,
+ 'glottal': 0.1, 'labiovelar': 1.0, 'vowel': -1.0, # added 'vowel'
+ #manner
+ 'stop': 1.0, 'affricate': 0.9, 'fricative': 0.85, # increased fricative from 0.8
+ 'trill': 0.7, 'tap': 0.65, 'approximant': 0.6, 'high vowel': 0.4,
+ 'mid vowel': 0.2, 'low vowel': 0.0, 'vowel2': 0.5, # added vowel
+ #high
+ 'high': 1.0, 'mid': 0.5, 'low': 0.0,
+ #back
+ 'front': 1.0, 'central': 0.5, 'back': 0.0,
+ #binary features
+ 'plus': 1.0, 'minus': 0.0
}
# Relative weights of phonetic features (Kondrak 2002: 55)
salience = {
- "syllabic": 5,
- "place": 40,
- "manner": 50,
- "voice": 5, # decreased from 10
- "nasal": 20, # increased from 10
- "retroflex": 10,
- "lateral": 10,
- "aspirated": 5,
- "long": 0, # decreased from 1
- "high": 3, # decreased from 5
- "back": 2, # decreased from 5
- "round": 2, # decreased from 5
+ 'syllabic': 5,
+ 'place': 40,
+ 'manner': 50,
+ 'voice': 5, # decreased from 10
+ 'nasal': 20, # increased from 10
+ 'retroflex': 10,
+ 'lateral': 10,
+ 'aspirated': 5,
+ 'long': 0, # decreased from 1
+ 'high': 3, # decreased from 5
+ 'back': 2, # decreased from 5
+ 'round': 2 # decreased from 5
}
# (Kondrak 2002: 59-60)
feature_matrix = {
- # Consonants
- "p": {
- "place": "bilabial",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "minus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "b": {
- "place": "bilabial",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "t": {
- "place": "alveolar",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "minus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "d": {
- "place": "alveolar",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ʈ": {
- "place": "retroflex",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "minus",
- "nasal": "minus",
- "retroflex": "plus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ɖ": {
- "place": "retroflex",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "plus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "c": {
- "place": "palatal",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "minus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ɟ": {
- "place": "palatal",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "k": {
- "place": "velar",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "minus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "g": {
- "place": "velar",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "q": {
- "place": "uvular",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "minus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ɢ": {
- "place": "uvular",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ʔ": {
- "place": "glottal",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "minus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "m": {
- "place": "bilabial",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "plus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ɱ": {
- "place": "labiodental",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "plus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "n": {
- "place": "alveolar",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "plus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ɳ": {
- "place": "retroflex",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "plus",
- "retroflex": "plus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ɲ": {
- "place": "palatal",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "plus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ŋ": {
- "place": "velar",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "plus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ɴ": {
- "place": "uvular",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "plus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "N": {
- "place": "uvular",
- "manner": "stop",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "plus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ʙ": {
- "place": "bilabial",
- "manner": "trill",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "B": {
- "place": "bilabial",
- "manner": "trill",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "r": {
- "place": "alveolar",
- "manner": "trill",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "plus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ʀ": {
- "place": "uvular",
- "manner": "trill",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "R": {
- "place": "uvular",
- "manner": "trill",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ɾ": {
- "place": "alveolar",
- "manner": "tap",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ɽ": {
- "place": "retroflex",
- "manner": "tap",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "plus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ɸ": {
- "place": "bilabial",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "minus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "β": {
- "place": "bilabial",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "f": {
- "place": "labiodental",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "minus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "v": {
- "place": "labiodental",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "θ": {
- "place": "dental",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "minus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ð": {
- "place": "dental",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "s": {
- "place": "alveolar",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "minus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "z": {
- "place": "alveolar",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ʃ": {
- "place": "palato-alveolar",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "minus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ʒ": {
- "place": "palato-alveolar",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ʂ": {
- "place": "retroflex",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "minus",
- "nasal": "minus",
- "retroflex": "plus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ʐ": {
- "place": "retroflex",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "plus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ç": {
- "place": "palatal",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "minus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ʝ": {
- "place": "palatal",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "x": {
- "place": "velar",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "minus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ɣ": {
- "place": "velar",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "χ": {
- "place": "uvular",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "minus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ʁ": {
- "place": "uvular",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ħ": {
- "place": "pharyngeal",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "minus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ʕ": {
- "place": "pharyngeal",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "h": {
- "place": "glottal",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "minus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ɦ": {
- "place": "glottal",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ɬ": {
- "place": "alveolar",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "minus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "plus",
- "aspirated": "minus",
- },
- "ɮ": {
- "place": "alveolar",
- "manner": "fricative",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "plus",
- "aspirated": "minus",
- },
- "ʋ": {
- "place": "labiodental",
- "manner": "approximant",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ɹ": {
- "place": "alveolar",
- "manner": "approximant",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ɻ": {
- "place": "retroflex",
- "manner": "approximant",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "plus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "j": {
- "place": "palatal",
- "manner": "approximant",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "ɰ": {
- "place": "velar",
- "manner": "approximant",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- "l": {
- "place": "alveolar",
- "manner": "approximant",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "plus",
- "aspirated": "minus",
- },
- "w": {
- "place": "labiovelar",
- "manner": "approximant",
- "syllabic": "minus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "aspirated": "minus",
- },
- # Vowels
- "i": {
- "place": "vowel",
- "manner": "vowel2",
- "syllabic": "plus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "high": "high",
- "back": "front",
- "round": "minus",
- "long": "minus",
- "aspirated": "minus",
- },
- "y": {
- "place": "vowel",
- "manner": "vowel2",
- "syllabic": "plus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "high": "high",
- "back": "front",
- "round": "plus",
- "long": "minus",
- "aspirated": "minus",
- },
- "e": {
- "place": "vowel",
- "manner": "vowel2",
- "syllabic": "plus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "high": "mid",
- "back": "front",
- "round": "minus",
- "long": "minus",
- "aspirated": "minus",
- },
- "E": {
- "place": "vowel",
- "manner": "vowel2",
- "syllabic": "plus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "high": "mid",
- "back": "front",
- "round": "minus",
- "long": "plus",
- "aspirated": "minus",
- },
- "ø": {
- "place": "vowel",
- "manner": "vowel2",
- "syllabic": "plus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "high": "mid",
- "back": "front",
- "round": "plus",
- "long": "minus",
- "aspirated": "minus",
- },
- "ɛ": {
- "place": "vowel",
- "manner": "vowel2",
- "syllabic": "plus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "high": "mid",
- "back": "front",
- "round": "minus",
- "long": "minus",
- "aspirated": "minus",
- },
- "œ": {
- "place": "vowel",
- "manner": "vowel2",
- "syllabic": "plus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "high": "mid",
- "back": "front",
- "round": "plus",
- "long": "minus",
- "aspirated": "minus",
- },
- "æ": {
- "place": "vowel",
- "manner": "vowel2",
- "syllabic": "plus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "high": "low",
- "back": "front",
- "round": "minus",
- "long": "minus",
- "aspirated": "minus",
- },
- "a": {
- "place": "vowel",
- "manner": "vowel2",
- "syllabic": "plus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "high": "low",
- "back": "front",
- "round": "minus",
- "long": "minus",
- "aspirated": "minus",
- },
- "A": {
- "place": "vowel",
- "manner": "vowel2",
- "syllabic": "plus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "high": "low",
- "back": "front",
- "round": "minus",
- "long": "plus",
- "aspirated": "minus",
- },
- "ɨ": {
- "place": "vowel",
- "manner": "vowel2",
- "syllabic": "plus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "high": "high",
- "back": "central",
- "round": "minus",
- "long": "minus",
- "aspirated": "minus",
- },
- "ʉ": {
- "place": "vowel",
- "manner": "vowel2",
- "syllabic": "plus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "high": "high",
- "back": "central",
- "round": "plus",
- "long": "minus",
- "aspirated": "minus",
- },
- "ə": {
- "place": "vowel",
- "manner": "vowel2",
- "syllabic": "plus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "high": "mid",
- "back": "central",
- "round": "minus",
- "long": "minus",
- "aspirated": "minus",
- },
- "u": {
- "place": "vowel",
- "manner": "vowel2",
- "syllabic": "plus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "high": "high",
- "back": "back",
- "round": "plus",
- "long": "minus",
- "aspirated": "minus",
- },
- "U": {
- "place": "vowel",
- "manner": "vowel2",
- "syllabic": "plus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "high": "high",
- "back": "back",
- "round": "plus",
- "long": "plus",
- "aspirated": "minus",
- },
- "o": {
- "place": "vowel",
- "manner": "vowel2",
- "syllabic": "plus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "high": "mid",
- "back": "back",
- "round": "plus",
- "long": "minus",
- "aspirated": "minus",
- },
- "O": {
- "place": "vowel",
- "manner": "vowel2",
- "syllabic": "plus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "high": "mid",
- "back": "back",
- "round": "plus",
- "long": "plus",
- "aspirated": "minus",
- },
- "ɔ": {
- "place": "vowel",
- "manner": "vowel2",
- "syllabic": "plus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "high": "mid",
- "back": "back",
- "round": "plus",
- "long": "minus",
- "aspirated": "minus",
- },
- "ɒ": {
- "place": "vowel",
- "manner": "vowel2",
- "syllabic": "plus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "high": "low",
- "back": "back",
- "round": "minus",
- "long": "minus",
- "aspirated": "minus",
- },
- "I": {
- "place": "vowel",
- "manner": "vowel2",
- "syllabic": "plus",
- "voice": "plus",
- "nasal": "minus",
- "retroflex": "minus",
- "lateral": "minus",
- "high": "high",
- "back": "front",
- "round": "minus",
- "long": "plus",
- "aspirated": "minus",
- },
+# Consonants
+'p': {'place': 'bilabial', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'b': {'place': 'bilabial', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'t': {'place': 'alveolar', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'d': {'place': 'alveolar', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ʈ': {'place': 'retroflex', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'plus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɖ': {'place': 'retroflex', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'plus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'c': {'place': 'palatal', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɟ': {'place': 'palatal', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'k': {'place': 'velar', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'g': {'place': 'velar', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'q': {'place': 'uvular', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɢ': {'place': 'uvular', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ʔ': {'place': 'glottal', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'m': {'place': 'bilabial', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'plus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɱ': {'place': 'labiodental', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'plus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'n': {'place': 'alveolar', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'plus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɳ': {'place': 'retroflex', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'plus', 'retroflex': 'plus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɲ': {'place': 'palatal', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'plus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ŋ': {'place': 'velar', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'plus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɴ': {'place': 'uvular', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'plus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'N': {'place': 'uvular', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'plus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ʙ': {'place': 'bilabial', 'manner': 'trill', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'B': {'place': 'bilabial', 'manner': 'trill', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'r': {'place': 'alveolar', 'manner': 'trill', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'plus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ʀ': {'place': 'uvular', 'manner': 'trill', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'R': {'place': 'uvular', 'manner': 'trill', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɾ': {'place': 'alveolar', 'manner': 'tap', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɽ': {'place': 'retroflex', 'manner': 'tap', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'plus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɸ': {'place': 'bilabial', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'β': {'place': 'bilabial', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'f': {'place': 'labiodental', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'v': {'place': 'labiodental', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'θ': {'place': 'dental', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ð': {'place': 'dental', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'s': {'place': 'alveolar', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'z': {'place': 'alveolar', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ʃ': {'place': 'palato-alveolar', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ʒ': {'place': 'palato-alveolar', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ʂ': {'place': 'retroflex', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'plus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ʐ': {'place': 'retroflex', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'plus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ç': {'place': 'palatal', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ʝ': {'place': 'palatal', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'x': {'place': 'velar', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɣ': {'place': 'velar', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'χ': {'place': 'uvular', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ʁ': {'place': 'uvular', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ħ': {'place': 'pharyngeal', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ʕ': {'place': 'pharyngeal', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'h': {'place': 'glottal', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɦ': {'place': 'glottal', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɬ': {'place': 'alveolar', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'plus', 'aspirated': 'minus'},
+
+'ɮ': {'place': 'alveolar', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'plus', 'aspirated': 'minus'},
+
+'ʋ': {'place': 'labiodental', 'manner': 'approximant', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɹ': {'place': 'alveolar', 'manner': 'approximant', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɻ': {'place': 'retroflex', 'manner': 'approximant', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'plus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'j': {'place': 'palatal', 'manner': 'approximant', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɰ': {'place': 'velar', 'manner': 'approximant', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'l': {'place': 'alveolar', 'manner': 'approximant', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'plus', 'aspirated': 'minus'},
+
+'w': {'place': 'labiovelar', 'manner': 'approximant', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+# Vowels
+
+'i': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'high',
+'back': 'front','round': 'minus', 'long': 'minus', 'aspirated': 'minus'},
+
+'y': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'high',
+'back': 'front','round': 'plus', 'long': 'minus', 'aspirated': 'minus'},
+
+'e': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'mid',
+'back': 'front','round': 'minus', 'long': 'minus', 'aspirated': 'minus'},
+
+'E': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'mid',
+'back': 'front','round': 'minus', 'long': 'plus', 'aspirated': 'minus'},
+
+'ø': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'mid',
+'back': 'front','round': 'plus', 'long': 'minus', 'aspirated': 'minus'},
+
+'ɛ': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'mid',
+'back': 'front','round': 'minus', 'long': 'minus', 'aspirated': 'minus'},
+
+'œ': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'mid',
+'back': 'front','round': 'plus', 'long': 'minus', 'aspirated': 'minus'},
+
+'æ': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'low',
+'back': 'front','round': 'minus', 'long': 'minus', 'aspirated': 'minus'},
+
+'a': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'low',
+'back': 'front','round': 'minus', 'long': 'minus', 'aspirated': 'minus'},
+
+'A': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'low',
+'back': 'front','round': 'minus', 'long': 'plus', 'aspirated': 'minus'},
+
+'ɨ': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'high',
+'back': 'central','round': 'minus', 'long': 'minus', 'aspirated': 'minus'},
+
+'ʉ': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'high',
+'back': 'central','round': 'plus', 'long': 'minus', 'aspirated': 'minus'},
+
+'ə': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'mid',
+'back': 'central','round': 'minus', 'long': 'minus', 'aspirated': 'minus'},
+
+'u': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'high',
+'back': 'back','round': 'plus', 'long': 'minus', 'aspirated': 'minus'},
+
+'U': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'high',
+'back': 'back','round': 'plus', 'long': 'plus', 'aspirated': 'minus'},
+
+'o': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'mid',
+'back': 'back','round': 'plus', 'long': 'minus', 'aspirated': 'minus'},
+
+'O': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'mid',
+'back': 'back','round': 'plus', 'long': 'plus', 'aspirated': 'minus'},
+
+'ɔ': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'mid',
+'back': 'back','round': 'plus', 'long': 'minus', 'aspirated': 'minus'},
+
+'ɒ': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'low',
+'back': 'back','round': 'minus', 'long': 'minus', 'aspirated': 'minus'},
+
+'I': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'high',
+'back': 'front','round': 'minus', 'long': 'plus', 'aspirated': 'minus'},
+
}
# === Algorithm ===
-
def align(str1, str2, epsilon=0):
"""
Compute the alignment of two phonetic strings.
(Kondrak 2002: 51)
"""
- if np is None:
- raise ImportError("You need numpy in order to use the align function")
+ if np is None:
+ raise ImportError('You need numpy in order to use the align function')
assert 0.0 <= epsilon <= 1.0, "Epsilon must be between 0.0 and 1.0."
m = len(str1)
n = len(str2)
# This includes Kondrak's initialization of row 0 and column 0 to all 0s.
- S = np.zeros((m + 1, n + 1), dtype=float)
+ S = np.zeros((m+1, n+1), dtype=float)
# If i <= 1 or j <= 1, don't allow expansions as it doesn't make sense,
# and breaks array and string indices. Make sure they never get chosen
# by setting them to -inf.
- for i in range(1, m + 1):
- for j in range(1, n + 1):
- edit1 = S[i - 1, j] + sigma_skip(str1[i - 1])
- edit2 = S[i, j - 1] + sigma_skip(str2[j - 1])
- edit3 = S[i - 1, j - 1] + sigma_sub(str1[i - 1], str2[j - 1])
+ for i in range(1, m+1):
+ for j in range(1, n+1):
+ edit1 = S[i-1, j] + sigma_skip(str1[i-1])
+ edit2 = S[i, j-1] + sigma_skip(str2[j-1])
+ edit3 = S[i-1, j-1] + sigma_sub(str1[i-1], str2[j-1])
if i > 1:
- edit4 = S[i - 2, j - 1] + sigma_exp(str2[j - 1], str1[i - 2 : i])
+ edit4 = S[i-2, j-1] + sigma_exp(str2[j-1], str1[i-2:i])
else:
edit4 = -inf
if j > 1:
- edit5 = S[i - 1, j - 2] + sigma_exp(str1[i - 1], str2[j - 2 : j])
+ edit5 = S[i-1, j-2] + sigma_exp(str1[i-1], str2[j-2:j])
else:
edit5 = -inf
S[i, j] = max(edit1, edit2, edit3, edit4, edit5, 0)
- T = (1 - epsilon) * np.amax(S) # Threshold score for near-optimal alignments
+ T = (1-epsilon)*np.amax(S) # Threshold score for near-optimal alignments
alignments = []
- for i in range(1, m + 1):
- for j in range(1, n + 1):
- if S[i, j] >= T:
+ for i in range(1, m+1):
+ for j in range(1, n+1):
+ if S[i,j] >= T:
alignments.append(_retrieve(i, j, 0, S, T, str1, str2, []))
return alignments
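# Usage sketch: assuming numpy is installed and the module's
# feature_matrix covers the input segments (as it does for the demo
# pairs below), align() returns a list of near-optimal alignments,
# each a list of (segment1, segment2) tuples in which '-' marks a skip:
#
#     for a in align('pematesiweni', 'pematesewen', epsilon=0.1):
#         print(' '.join('({}, {})'.format(x, y) for x, y in a))
#
# Raising epsilon admits every alignment scoring at least (1 - epsilon)
# times the optimum, so epsilon=0 returns only the best-scoring paths.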
-
def _retrieve(i, j, s, S, T, str1, str2, out):
"""
Retrieve the path through the similarity matrix S starting at (i, j).
if S[i, j] == 0:
return out
else:
- if j > 1 and S[i - 1, j - 2] + sigma_exp(str1[i - 1], str2[j - 2 : j]) + s >= T:
- out.insert(0, (str1[i - 1], str2[j - 2 : j]))
- _retrieve(
- i - 1,
- j - 2,
- s + sigma_exp(str1[i - 1], str2[j - 2 : j]),
- S,
- T,
- str1,
- str2,
- out,
- )
- elif (
- i > 1 and S[i - 2, j - 1] + sigma_exp(str2[j - 1], str1[i - 2 : i]) + s >= T
- ):
- out.insert(0, (str1[i - 2 : i], str2[j - 1]))
- _retrieve(
- i - 2,
- j - 1,
- s + sigma_exp(str2[j - 1], str1[i - 2 : i]),
- S,
- T,
- str1,
- str2,
- out,
- )
- elif S[i, j - 1] + sigma_skip(str2[j - 1]) + s >= T:
- out.insert(0, ("-", str2[j - 1]))
- _retrieve(i, j - 1, s + sigma_skip(str2[j - 1]), S, T, str1, str2, out)
- elif S[i - 1, j] + sigma_skip(str1[i - 1]) + s >= T:
- out.insert(0, (str1[i - 1], "-"))
- _retrieve(i - 1, j, s + sigma_skip(str1[i - 1]), S, T, str1, str2, out)
- elif S[i - 1, j - 1] + sigma_sub(str1[i - 1], str2[j - 1]) + s >= T:
- out.insert(0, (str1[i - 1], str2[j - 1]))
- _retrieve(
- i - 1,
- j - 1,
- s + sigma_sub(str1[i - 1], str2[j - 1]),
- S,
- T,
- str1,
- str2,
- out,
- )
+ if j > 1 and S[i-1, j-2] + sigma_exp(str1[i-1], str2[j-2:j]) + s >= T:
+ out.insert(0, (str1[i-1], str2[j-2:j]))
+ _retrieve(i-1, j-2, s+sigma_exp(str1[i-1], str2[j-2:j]), S, T, str1, str2, out)
+ elif i > 1 and S[i-2, j-1] + sigma_exp(str2[j-1], str1[i-2:i]) + s >= T:
+ out.insert(0, (str1[i-2:i], str2[j-1]))
+ _retrieve(i-2, j-1, s+sigma_exp(str2[j-1], str1[i-2:i]), S, T, str1, str2, out)
+ elif S[i, j-1] + sigma_skip(str2[j-1]) + s >= T:
+ out.insert(0, ('-', str2[j-1]))
+ _retrieve(i, j-1, s+sigma_skip(str2[j-1]), S, T, str1, str2, out)
+ elif S[i-1, j] + sigma_skip(str1[i-1]) + s >= T:
+ out.insert(0, (str1[i-1], '-'))
+ _retrieve(i-1, j, s+sigma_skip(str1[i-1]), S, T, str1, str2, out)
+ elif S[i-1, j-1] + sigma_sub(str1[i-1], str2[j-1]) + s >= T:
+ out.insert(0, (str1[i-1], str2[j-1]))
+ _retrieve(i-1, j-1, s+sigma_sub(str1[i-1], str2[j-1]), S, T, str1, str2, out)
return out
-
def sigma_skip(p):
"""
Returns score of an indel of P.
"""
return C_skip
-
def sigma_sub(p, q):
"""
Returns score of a substitution of P with Q.
"""
return C_sub - delta(p, q) - V(p) - V(q)
-
def sigma_exp(p, q):
"""
Returns score of an expansion/compression.
q2 = q[1]
return C_exp - delta(p, q1) - delta(p, q2) - V(p) - max(V(q1), V(q2))
-
def delta(p, q):
"""
Return weighted sum of difference between P and Q.
total += diff(p, q, f) * salience[f]
return total
-
def diff(p, q, f):
"""
Returns difference between phonetic segments P and Q for feature F.
p_features, q_features = feature_matrix[p], feature_matrix[q]
return abs(similarity_matrix[p_features[f]] - similarity_matrix[q_features[f]])
-
def R(p, q):
"""
Return relevant features for segment comparison.
return R_c
return R_v
-
def V(p):
"""
Return vowel weight if P is vowel.
return 0
return C_vwl
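# Illustration of the scoring scheme above, with made-up feature values
# and salience weights (the real similarity_matrix, salience and C_*
# constants are defined earlier in this module): delta(p, q) sums the
# salience-weighted absolute differences of feature values over the
# relevant features R(p, q), and sigma_sub discounts C_sub by delta and
# by the vowel penalties V(p) and V(q).
toy_values = {'high': 40, 'mid': 50, 'low': 60}   # hypothetical scale
toy_salience = {'high': 5}                        # hypothetical weight
def toy_delta(p_feats, q_feats):
    return sum(abs(toy_values[p_feats[f]] - toy_values[q_feats[f]]) * w
               for f, w in toy_salience.items())
# toy_delta({'high': 'mid'}, {'high': 'low'}) == 50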
-
# === Test ===
-
def demo():
"""
A demonstration of the result of aligning phonetic sequences
used in Kondrak's (2002) dissertation.
"""
- data = [pair.split(",") for pair in cognate_data.split("\n")]
+ data = [pair.split(',') for pair in cognate_data.split('\n')]
for pair in data:
alignment = align(pair[0], pair[1])[0]
- alignment = ["({}, {})".format(a[0], a[1]) for a in alignment]
- alignment = " ".join(alignment)
- print("{} ~ {} : {}".format(pair[0], pair[1], alignment))
-
+ alignment = ['({}, {})'.format(a[0], a[1]) for a in alignment]
+ alignment = ' '.join(alignment)
+ print('{} ~ {} : {}'.format(pair[0], pair[1], alignment))
cognate_data = """jo,ʒə
tu,ty
pematesiweni,pematesewen
asenja,aʔsɛn"""
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: Ngram Association Measures
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Joel Nothman <jnothman@student.usyd.edu.au>
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT
``BigramAssocMeasures`` and ``TrigramAssocMeasures``.
"""
-import math as _math
+from __future__ import division
from abc import ABCMeta, abstractmethod
+from six import add_metaclass
+import math as _math
from functools import reduce
-
-
_log2 = lambda x: _math.log(x, 2.0)
_ln = _math.log
try:
from scipy.stats import fisher_exact
except ImportError:
-
def fisher_exact(*_args, **_kwargs):
raise NotImplementedError
-
### Indices to marginals arguments:
NGRAM = 0
"""Marginals index for the number of words in the data"""
-class NgramAssocMeasures(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class NgramAssocMeasures(object):
"""
An abstract class defining a collection of generic association measures.
Each public method returns a score, taking the following arguments::
@abstractmethod
def _contingency(*marginals):
"""Calculates values of a contingency table from marginal values."""
- raise NotImplementedError(
- "The contingency table is not available" "in the general ngram case"
- )
+ raise NotImplementedError("The contingency table is not available"
+ "in the general ngram case")
@staticmethod
@abstractmethod
def _marginals(*contingency):
"""Calculates values of contingency table marginals from its values."""
- raise NotImplementedError(
- "The contingency table is not available" "in the general ngram case"
- )
+ raise NotImplementedError("The contingency table is not available"
+ "in the general ngram case")
@classmethod
def _expected_values(cls, cont):
# For each contingency table cell
for i in range(len(cont)):
# Yield the expected value
- yield (
- _product(
- sum(cont[x] for x in range(2 ** cls._n) if (x & j) == (i & j))
- for j in bits
- )
- / (n_all ** (cls._n - 1))
- )
+ yield (_product(sum(cont[x] for x in range(2 ** cls._n)
+ if (x & j) == (i & j))
+ for j in bits) /
+ (n_all ** (cls._n - 1)))
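# For the bigram case (cls._n == 2) the bit arithmetic above reduces to
# the familiar 2x2 expected counts: each cell's expectation is the
# product of its row and column marginals over the grand total. A hand
# check with hypothetical counts:
n_ii, n_io, n_oi, n_oo = 10, 30, 40, 920
n_all = n_ii + n_io + n_oi + n_oo                    # 1000
expected_ii = (n_ii + n_io) * (n_ii + n_oi) / n_all  # 40 * 50 / 1000 = 2.0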
@staticmethod
def raw_freq(*marginals):
"""Scores ngrams using Student's t test with independence hypothesis
for unigrams, as in Manning and Schutze 5.3.1.
"""
- return (
- marginals[NGRAM]
- - _product(marginals[UNIGRAMS]) / (marginals[TOTAL] ** (cls._n - 1))
- ) / (marginals[NGRAM] + _SMALL) ** 0.5
+ return ((marginals[NGRAM] -
+ _product(marginals[UNIGRAMS]) /
+ (marginals[TOTAL] ** (cls._n - 1))) /
+ (marginals[NGRAM] + _SMALL) ** .5)
@classmethod
def chi_sq(cls, *marginals):
"""
cont = cls._contingency(*marginals)
exps = cls._expected_values(cont)
- return sum((obs - exp) ** 2 / (exp + _SMALL) for obs, exp in zip(cont, exps))
+ return sum((obs - exp) ** 2 / (exp + _SMALL)
+ for obs, exp in zip(cont, exps))
@staticmethod
def mi_like(*marginals, **kwargs):
argument power sets an exponent (default 3) for the numerator. No
logarithm of the result is calculated.
"""
- return marginals[NGRAM] ** kwargs.get("power", 3) / _product(
- marginals[UNIGRAMS]
- )
+ return (marginals[NGRAM] ** kwargs.get('power', 3) /
+ _product(marginals[UNIGRAMS]))
@classmethod
def pmi(cls, *marginals):
"""Scores ngrams by pointwise mutual information, as in Manning and
Schutze 5.4.
"""
- return _log2(marginals[NGRAM] * marginals[TOTAL] ** (cls._n - 1)) - _log2(
- _product(marginals[UNIGRAMS])
- )
+ return (_log2(marginals[NGRAM] * marginals[TOTAL] ** (cls._n - 1)) -
+ _log2(_product(marginals[UNIGRAMS])))
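# Hand computation of pmi() for the bigram case, with hypothetical
# counts n_ii = 20, (n_ix, n_xi) = (42, 20) and n_xx = 1000000:
import math
pmi = math.log(20 * 1000000, 2) - math.log(42 * 20, 2)
# log2(20000000 / 840) ~= 14.54 bits above chance co-occurrence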
@classmethod
def likelihood_ratio(cls, *marginals):
"""Scores ngrams using likelihood ratios as in Manning and Schutze 5.3.4.
"""
cont = cls._contingency(*marginals)
- return cls._n * sum(
- obs * _ln(obs / (exp + _SMALL) + _SMALL)
- for obs, exp in zip(cont, cls._expected_values(cont))
- )
+ return (cls._n *
+ sum(obs * _ln(obs / (exp + _SMALL) + _SMALL)
+ for obs, exp in zip(cont, cls._expected_values(cont))))
@classmethod
def poisson_stirling(cls, *marginals):
"""Scores ngrams using the Poisson-Stirling measure."""
- exp = _product(marginals[UNIGRAMS]) / (marginals[TOTAL] ** (cls._n - 1))
+ exp = (_product(marginals[UNIGRAMS]) /
+ (marginals[TOTAL] ** (cls._n - 1)))
return marginals[NGRAM] * (_log2(marginals[NGRAM] / exp) - 1)
@classmethod
"""
n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals)
- return (n_ii * n_oo - n_io * n_oi) ** 2 / (
- (n_ii + n_io) * (n_ii + n_oi) * (n_io + n_oo) * (n_oi + n_oo)
- )
+ return ((n_ii*n_oo - n_io*n_oi)**2 /
+ ((n_ii + n_io) * (n_ii + n_oi) * (n_io + n_oo) * (n_oi + n_oo)))
@classmethod
def chi_sq(cls, n_ii, n_ix_xi_tuple, n_xx):
n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals)
- (odds, pvalue) = fisher_exact([[n_ii, n_io], [n_oi, n_oo]], alternative="less")
+ (odds, pvalue) = fisher_exact([[n_ii, n_io], [n_oi, n_oo]], alternative='less')
return pvalue
@staticmethod
n_ioo = n_ixx - n_iii - n_ioi - n_iio
n_ooo = n_xxx - n_iii - n_oii - n_ioi - n_iio - n_ooi - n_oio - n_ioo
- return (n_iii, n_oii, n_ioi, n_ooi, n_iio, n_oio, n_ioo, n_ooo)
+ return (n_iii, n_oii, n_ioi, n_ooi,
+ n_iio, n_oio, n_ioo, n_ooo)
@staticmethod
def _marginals(*contingency):
(1, (1, 1, 1), (1, 73, 1), 2000)
"""
n_iii, n_oii, n_ioi, n_ooi, n_iio, n_oio, n_ioo, n_ooo = contingency
- return (
- n_iii,
- (n_iii + n_iio, n_iii + n_ioi, n_iii + n_oii),
- (
- n_iii + n_ioi + n_iio + n_ioo,
- n_iii + n_oii + n_iio + n_oio,
- n_iii + n_oii + n_ioi + n_ooi,
- ),
- sum(contingency),
- )
+ return (n_iii,
+ (n_iii + n_iio, n_iii + n_ioi, n_iii + n_oii),
+ (n_iii + n_ioi + n_iio + n_ioo,
+ n_iii + n_oii + n_iio + n_oio,
+ n_iii + n_oii + n_ioi + n_ooi),
+ sum(contingency))
class QuadgramAssocMeasures(NgramAssocMeasures):
n_iioo = n_iixx - n_iiii - n_iioi - n_iiio
n_oioo = n_xixx - n_iiii - n_oiii - n_iioi - n_iiio - n_oioi - n_oiio - n_iioo
n_iooo = n_ixxx - n_iiii - n_ioii - n_iioi - n_iiio - n_iooi - n_iioo - n_ioio
- n_oooo = (
- n_xxxx
- - n_iiii
- - n_oiii
- - n_ioii
- - n_iioi
- - n_ooii
- - n_oioi
- - n_iooi
- - n_oooi
- - n_iiio
- - n_oiio
- - n_ioio
- - n_ooio
- - n_iioo
- - n_oioo
- - n_iooo
- )
-
- return (
- n_iiii,
- n_oiii,
- n_ioii,
- n_ooii,
- n_iioi,
- n_oioi,
- n_iooi,
- n_oooi,
- n_iiio,
- n_oiio,
- n_ioio,
- n_ooio,
- n_iioo,
- n_oioo,
- n_iooo,
- n_oooo,
- )
+ n_oooo = n_xxxx - n_iiii - n_oiii - n_ioii - n_iioi - n_ooii - n_oioi - n_iooi - \
+ n_oooi - n_iiio - n_oiio - n_ioio - n_ooio - n_iioo - n_oioo - n_iooo
+
+ return (n_iiii, n_oiii, n_ioii, n_ooii, n_iioi,
+ n_oioi, n_iooi, n_oooi, n_iiio, n_oiio,
+ n_ioio, n_ooio, n_iioo, n_oioo, n_iooo, n_oooo)
@staticmethod
def _marginals(*contingency):
QuadgramAssocMeasures._marginals(1, 0, 2, 46, 552, 825, 2577, 34967, 1, 0, 2, 48, 7250, 9031, 28585, 356653)
(1, (2, 553, 3, 1), (7804, 6, 3132, 1378, 49, 2), (38970, 17660, 100, 38970), 440540)
"""
- n_iiii, n_oiii, n_ioii, n_ooii, n_iioi, n_oioi, n_iooi, n_oooi, n_iiio, n_oiio, n_ioio, n_ooio, n_iioo, n_oioo, n_iooo, n_oooo = (
- contingency
- )
+ n_iiii, n_oiii, n_ioii, n_ooii, n_iioi, n_oioi, n_iooi, n_oooi, n_iiio, n_oiio, n_ioio, n_ooio, \
+ n_iioo, n_oioo, n_iooo, n_oooo = contingency
n_iiix = n_iiii + n_iiio
n_iixi = n_iiii + n_iioi
n_all = sum(contingency)
- return (
- n_iiii,
- (n_iiix, n_iixi, n_ixii, n_xiii),
- (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix),
- (n_ixxx, n_xixx, n_xxix, n_xxxi),
- n_all,
- )
+ return (n_iiii,
+ (n_iiix, n_iixi, n_ixii, n_xiii),
+ (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix),
+ (n_ixxx, n_xixx, n_xxix, n_xxxi),
+ n_all)
class ContingencyMeasures(object):
def __init__(self, measures):
"""Constructs a ContingencyMeasures given a NgramAssocMeasures class"""
- self.__class__.__name__ = "Contingency" + measures.__class__.__name__
+ self.__class__.__name__ = 'Contingency' + measures.__class__.__name__
for k in dir(measures):
- if k.startswith("__"):
+ if k.startswith('__'):
continue
v = getattr(measures, k)
- if not k.startswith("_"):
+ if not k.startswith('_'):
v = self._make_contingency_fn(measures, v)
setattr(self, k, v)
"""From an association measure function, produces a new function which
accepts contingency table values as its arguments.
"""
-
def res(*contingency):
return old_fn(*measures._marginals(*contingency))
-
res.__doc__ = old_fn.__doc__
res.__name__ = old_fn.__name__
return res
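# Usage sketch for the wrapper above: ContingencyMeasures re-exposes
# every public measure so that it takes raw contingency cells instead
# of marginals. With symmetric hypothetical counts,
#
#     cm = ContingencyMeasures(BigramAssocMeasures())
#     cm.pmi(20, 22, 22, 999936)
#
# scores the same event as BigramAssocMeasures().pmi(20, (42, 42), 1000000),
# since _marginals() recovers (n_ii, (n_ix, n_xi), n_xx) from the cells.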
# Natural Language Toolkit: Confusion Matrices
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
-
+from __future__ import print_function, unicode_literals
from nltk.probability import FreqDist
+from nltk.compat import python_2_unicode_compatible
-
+@python_2_unicode_compatible
class ConfusionMatrix(object):
"""
The confusion matrix between a list of reference values and a
the same length.
"""
if len(reference) != len(test):
- raise ValueError("Lists must have the same length.")
+ raise ValueError('Lists must have the same length.')
# Get a list of all values.
if sort_by_count:
ref_fdist = FreqDist(reference)
test_fdist = FreqDist(test)
-
- def key(v):
- return -(ref_fdist[v] + test_fdist[v])
-
- values = sorted(set(reference + test), key=key)
+ def key(v): return -(ref_fdist[v]+test_fdist[v])
+ values = sorted(set(reference+test), key=key)
else:
- values = sorted(set(reference + test))
+ values = sorted(set(reference+test))
# Construct a value->index dictionary
- indices = dict((val, i) for (i, val) in enumerate(values))
+ indices = dict((val,i) for (i,val) in enumerate(values))
# Make a confusion matrix table.
confusion = [[0 for val in values] for val in values]
- max_conf = 0 # Maximum confusion
- for w, g in zip(reference, test):
+ max_conf = 0 # Maximum confusion
+ for w,g in zip(reference, test):
confusion[indices[w]][indices[g]] += 1
max_conf = max(max_conf, confusion[indices[w]][indices[g]])
return self._confusion[i][j]
def __repr__(self):
- return "<ConfusionMatrix: %s/%s correct>" % (self._correct, self._total)
+ return '<ConfusionMatrix: %s/%s correct>' % (self._correct,
+ self._total)
def __str__(self):
return self.pretty_format()
- def pretty_format(
- self,
- show_percents=False,
- values_in_chart=True,
- truncate=None,
- sort_by_count=False,
- ):
+ def pretty_format(self, show_percents=False, values_in_chart=True,
+ truncate=None, sort_by_count=False):
"""
:return: A multi-line string representation of this confusion matrix.
:type truncate: int
values = self._values
if sort_by_count:
- values = sorted(
- values, key=lambda v: -sum(self._confusion[self._indices[v]])
- )
+ values = sorted(values, key=lambda v:
+ -sum(self._confusion[self._indices[v]]))
if truncate:
values = values[:truncate]
if values_in_chart:
value_strings = ["%s" % val for val in values]
else:
- value_strings = [str(n + 1) for n in range(len(values))]
+ value_strings = [str(n+1) for n in range(len(values))]
# Construct a format string for row values
valuelen = max(len(val) for val in value_strings)
- value_format = "%" + repr(valuelen) + "s | "
+ value_format = '%' + repr(valuelen) + 's | '
# Construct a format string for matrix entries
if show_percents:
entrylen = 6
- entry_format = "%5.1f%%"
- zerostr = " ."
+ entry_format = '%5.1f%%'
+ zerostr = ' .'
else:
entrylen = len(repr(self._max_conf))
- entry_format = "%" + repr(entrylen) + "d"
- zerostr = " " * (entrylen - 1) + "."
+ entry_format = '%' + repr(entrylen) + 'd'
+ zerostr = ' '*(entrylen-1) + '.'
# Write the column values.
- s = ""
+ s = ''
for i in range(valuelen):
- s += (" " * valuelen) + " |"
+ s += (' '*valuelen)+' |'
for val in value_strings:
- if i >= valuelen - len(val):
- s += val[i - valuelen + len(val)].rjust(entrylen + 1)
+ if i >= valuelen-len(val):
+ s += val[i-valuelen+len(val)].rjust(entrylen+1)
else:
- s += " " * (entrylen + 1)
- s += " |\n"
+ s += ' '*(entrylen+1)
+ s += ' |\n'
# Write a dividing line
- s += "%s-+-%s+\n" % ("-" * valuelen, "-" * ((entrylen + 1) * len(values)))
+ s += '%s-+-%s+\n' % ('-'*valuelen, '-'*((entrylen+1)*len(values)))
# Write the entries.
for val, li in zip(value_strings, values):
if confusion[i][j] == 0:
s += zerostr
elif show_percents:
- s += entry_format % (100.0 * confusion[i][j] / self._total)
+ s += entry_format % (100.0*confusion[i][j]/self._total)
else:
s += entry_format % confusion[i][j]
if i == j:
- prevspace = s.rfind(" ")
- s = s[:prevspace] + "<" + s[prevspace + 1 :] + ">"
- else:
- s += " "
- s += "|\n"
+ prevspace = s.rfind(' ')
+ s = s[:prevspace] + '<' + s[prevspace+1:] + '>'
+ else: s += ' '
+ s += '|\n'
# Write a dividing line
- s += "%s-+-%s+\n" % ("-" * valuelen, "-" * ((entrylen + 1) * len(values)))
+ s += '%s-+-%s+\n' % ('-'*valuelen, '-'*((entrylen+1)*len(values)))
# Write a key
- s += "(row = reference; col = test)\n"
+ s += '(row = reference; col = test)\n'
if not values_in_chart:
- s += "Value key:\n"
+ s += 'Value key:\n'
for i, value in enumerate(values):
- s += "%6d: %s\n" % (i + 1, value)
+ s += '%6d: %s\n' % (i+1, value)
return s
def key(self):
values = self._values
- str = "Value key:\n"
- indexlen = len(repr(len(values) - 1))
- key_format = " %" + repr(indexlen) + "d: %s\n"
+ str = 'Value key:\n'
+ indexlen = len(repr(len(values)-1))
+ key_format = ' %'+repr(indexlen)+'d: %s\n'
for i in range(len(values)):
str += key_format % (i, values[i])
return str
-
def demo():
- reference = "DET NN VB DET JJ NN NN IN DET NN".split()
- test = "DET VB VB DET NN NN NN IN DET NN".split()
- print("Reference =", reference)
- print("Test =", test)
- print("Confusion matrix:")
+ reference = 'DET NN VB DET JJ NN NN IN DET NN'.split()
+ test = 'DET VB VB DET NN NN NN IN DET NN'.split()
+ print('Reference =', reference)
+ print('Test =', test)
+ print('Confusion matrix:')
print(ConfusionMatrix(reference, test))
print(ConfusionMatrix(reference, test).pretty_format(sort_by_count=True))
-
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
-# -*- coding: utf-8 -*-
# Natural Language Toolkit: Distance Metrics
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# Tom Lippincott <tom@cs.columbia.edu>
3. d(a, c) <= d(a, b) + d(b, c)
"""
-import warnings
-import operator
+from __future__ import print_function
+from __future__ import division
def _edit_dist_init(len1, len2):
for i in range(len1):
lev.append([0] * len2) # initialize 2D array to zero
for i in range(len1):
- lev[i][0] = i # column 0: 0,1,2,3,4,...
+ lev[i][0] = i # column 0: 0,1,2,3,4,...
for j in range(len2):
- lev[0][j] = j # row 0: 0,1,2,3,4,...
+ lev[0][j] = j # row 0: 0,1,2,3,4,...
return lev
been done in other orders, but at least three steps are needed.
Allows specifying the cost of substitution edits (e.g., "a" -> "b"),
- because sometimes it makes sense to assign greater penalties to
- substitutions.
+ because sometimes it makes sense to assign greater penalties to substitutions.
This also optionally allows transposition edits (e.g., "ab" -> "ba"),
though this is disabled by default.
# iterate over the array
for i in range(len1):
for j in range(len2):
- _edit_dist_step(
- lev,
- i + 1,
- j + 1,
- s1,
- s2,
- substitution_cost=substitution_cost,
- transpositions=transpositions,
- )
+ _edit_dist_step(lev, i + 1, j + 1, s1, s2,
+ substitution_cost=substitution_cost, transpositions=transpositions)
return lev[len1][len2]
-def _edit_dist_backtrace(lev):
- i, j = len(lev) - 1, len(lev[0]) - 1
- alignment = [(i, j)]
-
- while (i, j) != (0, 0):
- directions = [
- (i - 1, j), # skip s1
- (i, j - 1), # skip s2
- (i - 1, j - 1), # substitution
- ]
-
- direction_costs = (
- (lev[i][j] if (i >= 0 and j >= 0) else float("inf"), (i, j))
- for i, j in directions
- )
- _, (i, j) = min(direction_costs, key=operator.itemgetter(0))
-
- alignment.append((i, j))
- return list(reversed(alignment))
-
-
-def edit_distance_align(s1, s2, substitution_cost=1):
- """
- Calculate the minimum Levenshtein edit-distance based alignment
- mapping between two strings. The alignment finds the mapping
- from string s1 to s2 that minimizes the edit distance cost.
- For example, mapping "rain" to "shine" would involve 2
- substitutions, 2 matches and an insertion resulting in
- the following mapping:
- [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (4, 5)]
- NB: (0, 0) is the start state without any letters associated
- See more: https://web.stanford.edu/class/cs124/lec/med.pdf
-
- In case of multiple valid minimum-distance alignments, the
- backtrace has the following operation precedence:
- 1. Skip s1 character
- 2. Skip s2 character
- 3. Substitute s1 and s2 characters
- The backtrace is carried out in reverse string order.
-
- This function does not support transposition.
-
- :param s1, s2: The strings to be aligned
- :type s1: str
- :type s2: str
- :type substitution_cost: int
- :rtype List[Tuple(int, int)]
- """
- # set up a 2-D array
- len1 = len(s1)
- len2 = len(s2)
- lev = _edit_dist_init(len1 + 1, len2 + 1)
-
- # iterate over the array
- for i in range(len1):
- for j in range(len2):
- _edit_dist_step(
- lev,
- i + 1,
- j + 1,
- s1,
- s2,
- substitution_cost=substitution_cost,
- transpositions=False,
- )
-
- # backtrace to find alignment
- alignment = _edit_dist_backtrace(lev)
- return alignment
-
-
def binary_distance(label1, label2):
"""Simple equality test.
"""Distance metric comparing set-similarity.
"""
- return (len(label1.union(label2)) - len(label1.intersection(label2))) / len(
- label1.union(label2)
- )
+ return (len(label1.union(label2)) - len(label1.intersection(label2)))/len(label1.union(label2))
def masi_distance(label1, label2):
>>> from nltk.metrics import masi_distance
>>> masi_distance(set([1, 2]), set([1, 2, 3, 4]))
- 0.665
+ 0.335
Passonneau 2006, Measuring Agreement on Set-Valued Items (MASI)
for Semantic and Pragmatic Annotation.
else:
m = 0
- return 1 - len_intersection / len_union * m
+ return (1 - (len_intersection / float(len_union))) * m
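# Hand check of the doctest value above: for label1 = {1, 2} and
# label2 = {1, 2, 3, 4}, the intersection has 2 members, the union 4,
# and the subset branch of this function (not shown in this hunk)
# sets m = 0.67, so the distance is (1 - 2/4) * 0.67 = 0.335.
len_intersection, len_union, m = 2, 4, 0.67
assert round((1 - (len_intersection / float(len_union))) * m, 3) == 0.335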
-def interval_distance(label1, label2):
+def interval_distance(label1,label2):
"""Krippendorff's interval distance metric
>>> from nltk.metrics import interval_distance
try:
return pow(label1 - label2, 2)
- # return pow(list(label1)[0]-list(label2)[0],2)
+# return pow(list(label1)[0]-list(label2)[0],2)
except:
print("non-numeric labels not supported with interval distance")
def fractional_presence(label):
- return (
- lambda x, y: abs(((1.0 / len(x)) - (1.0 / len(y))))
- * (label in x and label in y)
- or 0.0 * (label not in x and label not in y)
- or abs((1.0 / len(x))) * (label in x and label not in y)
+ return lambda x, y:\
+ abs(((1.0 / len(x)) - (1.0 / len(y)))) * (label in x and label in y) \
+ or 0.0 * (label not in x and label not in y) \
+ or abs((1.0 / len(x))) * (label in x and label not in y) \
or ((1.0 / len(y))) * (label not in x and label in y)
- )
def custom_distance(file):
data = {}
- with open(file, "r") as infile:
+ with open(file, 'r') as infile:
for l in infile:
labelA, labelB, dist = l.strip().split("\t")
labelA = frozenset([labelA])
labelB = frozenset([labelB])
- data[frozenset([labelA, labelB])] = float(dist)
- return lambda x, y: data[frozenset([x, y])]
-
-
-def jaro_similarity(s1, s2):
- """
- Computes the Jaro similarity between 2 sequences from:
-
- Matthew A. Jaro (1989). Advances in record linkage methodology
- as applied to the 1985 census of Tampa Florida. Journal of the
- American Statistical Association. 84 (406): 414-20.
-
- The Jaro distance between is the min no. of single-character transpositions
- required to change one word into another. The Jaro similarity formula from
- https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance :
-
- jaro_sim = 0 if m = 0 else 1/3 * (m/|s_1| + m/s_2 + (m-t)/m)
-
- where:
- - |s_i| is the length of string s_i
- - m is the no. of matching characters
- - t is the half no. of possible transpositions.
-
- """
- # First, store the length of the strings
- # because they will be re-used several times.
- len_s1, len_s2 = len(s1), len(s2)
-
- # The upper bound of the distance for being a matched character.
- match_bound = max(len_s1, len_s2) // 2 - 1
-
- # Initialize the counts for matches and transpositions.
- matches = 0 # no.of matched characters in s1 and s2
- transpositions = 0 # no. of transpositions between s1 and s2
- flagged_1 = [] # positions in s1 which are matches to some character in s2
- flagged_2 = [] # positions in s2 which are matches to some character in s1
-
- # Iterate through sequences, check for matches and compute transpositions.
- for i in range(len_s1): # Iterate through each character.
- upperbound = min(i + match_bound, len_s2 - 1)
- lowerbound = max(0, i - match_bound)
- for j in range(lowerbound, upperbound + 1):
- if s1[i] == s2[j] and j not in flagged_2:
- matches += 1
- flagged_1.append(i)
- flagged_2.append(j)
- break
- flagged_2.sort()
- for i, j in zip(flagged_1, flagged_2):
- if s1[i] != s2[j]:
- transpositions += 1
-
- if matches == 0:
- return 0
- else:
- return (
- 1
- / 3
- * (
- matches / len_s1
- + matches / len_s2
- + (matches - transpositions // 2) / matches
- )
- )
-
-
-def jaro_winkler_similarity(s1, s2, p=0.1, max_l=4):
- """
- The Jaro Winkler distance is an extension of the Jaro similarity in:
-
- William E. Winkler. 1990. String Comparator Metrics and Enhanced
- Decision Rules in the Fellegi-Sunter Model of Record Linkage.
- Proceedings of the Section on Survey Research Methods.
- American Statistical Association: 354-359.
- such that:
-
- jaro_winkler_sim = jaro_sim + ( l * p * (1 - jaro_sim) )
-
- where,
-
- - jaro_sim is the output from the Jaro Similarity,
- see jaro_similarity()
- - l is the length of common prefix at the start of the string
- - this implementation provides an upperbound for the l value
- to keep the prefixes.A common value of this upperbound is 4.
- - p is the constant scaling factor to overweigh common prefixes.
- The Jaro-Winkler similarity will fall within the [0, 1] bound,
- given that max(p)<=0.25 , default is p=0.1 in Winkler (1990)
-
-
- Test using outputs from https://www.census.gov/srd/papers/pdf/rr93-8.pdf
- from "Table 5 Comparison of String Comparators Rescaled between 0 and 1"
-
- >>> winkler_examples = [("billy", "billy"), ("billy", "bill"), ("billy", "blily"),
- ... ("massie", "massey"), ("yvette", "yevett"), ("billy", "bolly"), ("dwayne", "duane"),
- ... ("dixon", "dickson"), ("billy", "susan")]
-
- >>> winkler_scores = [1.000, 0.967, 0.947, 0.944, 0.911, 0.893, 0.858, 0.853, 0.000]
- >>> jaro_scores = [1.000, 0.933, 0.933, 0.889, 0.889, 0.867, 0.822, 0.790, 0.000]
-
- # One way to match the values on the Winkler's paper is to provide a different
- # p scaling factor for different pairs of strings, e.g.
- >>> p_factors = [0.1, 0.125, 0.20, 0.125, 0.20, 0.20, 0.20, 0.15, 0.1]
-
- >>> for (s1, s2), jscore, wscore, p in zip(winkler_examples, jaro_scores, winkler_scores, p_factors):
- ... assert round(jaro_similarity(s1, s2), 3) == jscore
- ... assert round(jaro_winkler_similarity(s1, s2, p=p), 3) == wscore
-
-
- Test using outputs from https://www.census.gov/srd/papers/pdf/rr94-5.pdf from
- "Table 2.1. Comparison of String Comparators Using Last Names, First Names, and Street Names"
-
- >>> winkler_examples = [('SHACKLEFORD', 'SHACKELFORD'), ('DUNNINGHAM', 'CUNNIGHAM'),
- ... ('NICHLESON', 'NICHULSON'), ('JONES', 'JOHNSON'), ('MASSEY', 'MASSIE'),
- ... ('ABROMS', 'ABRAMS'), ('HARDIN', 'MARTINEZ'), ('ITMAN', 'SMITH'),
- ... ('JERALDINE', 'GERALDINE'), ('MARHTA', 'MARTHA'), ('MICHELLE', 'MICHAEL'),
- ... ('JULIES', 'JULIUS'), ('TANYA', 'TONYA'), ('DWAYNE', 'DUANE'), ('SEAN', 'SUSAN'),
- ... ('JON', 'JOHN'), ('JON', 'JAN'), ('BROOKHAVEN', 'BRROKHAVEN'),
- ... ('BROOK HALLOW', 'BROOK HLLW'), ('DECATUR', 'DECATIR'), ('FITZRUREITER', 'FITZENREITER'),
- ... ('HIGBEE', 'HIGHEE'), ('HIGBEE', 'HIGVEE'), ('LACURA', 'LOCURA'), ('IOWA', 'IONA'), ('1ST', 'IST')]
-
- >>> jaro_scores = [0.970, 0.896, 0.926, 0.790, 0.889, 0.889, 0.722, 0.467, 0.926,
- ... 0.944, 0.869, 0.889, 0.867, 0.822, 0.783, 0.917, 0.000, 0.933, 0.944, 0.905,
- ... 0.856, 0.889, 0.889, 0.889, 0.833, 0.000]
-
- >>> winkler_scores = [0.982, 0.896, 0.956, 0.832, 0.944, 0.922, 0.722, 0.467, 0.926,
- ... 0.961, 0.921, 0.933, 0.880, 0.858, 0.805, 0.933, 0.000, 0.947, 0.967, 0.943,
- ... 0.913, 0.922, 0.922, 0.900, 0.867, 0.000]
-
- # One way to match the values on the Winkler's paper is to provide a different
- # p scaling factor for different pairs of strings, e.g.
- >>> p_factors = [0.1, 0.1, 0.1, 0.1, 0.125, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.20,
- ... 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
-
-
- >>> for (s1, s2), jscore, wscore, p in zip(winkler_examples, jaro_scores, winkler_scores, p_factors):
- ... if (s1, s2) in [('JON', 'JAN'), ('1ST', 'IST')]:
- ... continue # Skip bad examples from the paper.
- ... assert round(jaro_similarity(s1, s2), 3) == jscore
- ... assert round(jaro_winkler_similarity(s1, s2, p=p), 3) == wscore
-
-
-
- This test-case proves that the output of Jaro-Winkler similarity depends on
- the product l * p and not on the product max_l * p. Here the product max_l * p > 1
- however the product l * p <= 1
-
- >>> round(jaro_winkler_similarity('TANYA', 'TONYA', p=0.1, max_l=100), 3)
- 0.88
-
-
- """
- # To ensure that the output of the Jaro-Winkler's similarity
- # falls between [0,1], the product of l * p needs to be
- # also fall between [0,1].
- if not 0 <= max_l * p <= 1:
- warnings.warn(
- str(
- "The product `max_l * p` might not fall between [0,1]."
- "Jaro-Winkler similarity might not be between 0 and 1."
- )
- )
-
- # Compute the Jaro similarity
- jaro_sim = jaro_similarity(s1, s2)
-
- # Initialize the upper bound for the no. of prefixes.
- # if user did not pre-define the upperbound,
- # use shorter length between s1 and s2
-
- # Compute the prefix matches.
- l = 0
- # zip() will automatically loop until the end of shorter string.
- for s1_i, s2_i in zip(s1, s2):
- if s1_i == s2_i:
- l += 1
- else:
- break
- if l == max_l:
- break
- # Return the similarity value as described in docstring.
- return jaro_sim + (l * p * (1 - jaro_sim))
+ data[frozenset([labelA,labelB])] = float(dist)
+ return lambda x,y:data[frozenset([x,y])]
def demo():
- string_distance_examples = [
- ("rain", "shine"),
- ("abcdef", "acbdef"),
- ("language", "lnaguaeg"),
- ("language", "lnaugage"),
- ("language", "lngauage"),
- ]
- for s1, s2 in string_distance_examples:
- print("Edit distance btwn '%s' and '%s':" % (s1, s2), edit_distance(s1, s2))
- print(
- "Edit dist with transpositions btwn '%s' and '%s':" % (s1, s2),
- edit_distance(s1, s2, transpositions=True),
- )
- print("Jaro similarity btwn '%s' and '%s':" % (s1, s2), jaro_similarity(s1, s2))
- print(
- "Jaro-Winkler similarity btwn '%s' and '%s':" % (s1, s2),
- jaro_winkler_similarity(s1, s2),
- )
- print(
- "Jaro-Winkler distance btwn '%s' and '%s':" % (s1, s2),
- 1 - jaro_winkler_similarity(s1, s2),
- )
+ edit_distance_examples = [
+ ("rain", "shine"), ("abcdef", "acbdef"), ("language", "lnaguaeg"),
+ ("language", "lnaugage"), ("language", "lngauage")]
+ for s1, s2 in edit_distance_examples:
+ print("Edit distance between '%s' and '%s':" % (s1, s2), edit_distance(s1, s2))
+ for s1, s2 in edit_distance_examples:
+ print("Edit distance with transpositions between '%s' and '%s':" % (s1, s2), edit_distance(s1, s2, transpositions=True))
+
s1 = set([1, 2, 3, 4])
s2 = set([3, 4, 5])
print("s1:", s1)
print("Jaccard distance:", jaccard_distance(s1, s2))
print("MASI distance:", masi_distance(s1, s2))
-
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: Agreement Metrics
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Lauri Hallila <laurihallila@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
def get_words_from_dictionary(lemmas):
- """
+ '''
Get original set of words used for analysis.
:param lemmas: A dictionary where keys are lemmas and values are sets
:type lemmas: dict(str): list(str)
:return: Set of words that exist as values in the dictionary
:rtype: set(str)
- """
+ '''
words = set()
for lemma in lemmas:
words.update(set(lemmas[lemma]))
def _truncate(words, cutlength):
- """Group words by stems defined by truncating them at given length.
+ '''Group words by stems defined by truncating them at given length.
:param words: Set of words used for analysis
:param cutlength: Words are stemmed by cutting at this length.
:return: Dictionary where keys are stems and values are sets of words
corresponding to that stem.
:rtype: dict(str): set(str)
- """
+ '''
stems = {}
for word in words:
stem = word[:cutlength]
# Reference: http://en.wikipedia.org/wiki/Line-line_intersection
def _count_intersection(l1, l2):
- """Count intersection between two line segments defined by coordinate pairs.
+ '''Count intersection between two line segments defined by coordinate pairs.
:param l1: Tuple of two coordinate pairs defining the first line segment
:param l2: Tuple of two coordinate pairs defining the second line segment
:type l2: tuple(float, float)
:return: Coordinates of the intersection
:rtype: tuple(float, float)
- """
+ '''
x1, y1 = l1[0]
x2, y2 = l1[1]
x3, y3 = l2[0]
denominator = (x1 - x2) * (y3 - y4) - (y1 - y2) * (x3 - x4)
- if denominator == 0.0: # lines are parallel
+ if denominator == 0.0: # lines are parallel
if x1 == x2 == x3 == x4 == 0.0:
# When lines are parallel, they must be on the y-axis.
# We can ignore x-axis because we stop counting the
# OI (y-axis) diminishes when we go along the truncation line.
return (0.0, y4)
- x = (
- (x1 * y2 - y1 * x2) * (x3 - x4) - (x1 - x2) * (x3 * y4 - y3 * x4)
- ) / denominator
- y = (
- (x1 * y2 - y1 * x2) * (y3 - y4) - (y1 - y2) * (x3 * y4 - y3 * x4)
- ) / denominator
+ x = ((x1 * y2 - y1 * x2) * (x3 - x4) - (x1 - x2) * (x3 * y4 - y3 * x4)) / denominator
+ y = ((x1 * y2 - y1 * x2) * (y3 - y4) - (y1 - y2) * (x3 * y4 - y3 * x4)) / denominator
return (x, y)
def _get_derivative(coordinates):
- """Get derivative of the line from (0,0) to given coordinates.
+ '''Get derivative of the line from (0,0) to given coordinates.
:param coordinates: A coordinate pair
:type coordinates: tuple(float, float)
:return: Derivative; inf if x is zero
:rtype: float
- """
+ '''
try:
return coordinates[1] / coordinates[0]
except ZeroDivisionError:
- return float("inf")
+ return float('inf')
def _calculate_cut(lemmawords, stems):
- """Count understemmed and overstemmed pairs for (lemma, stem) pair with common words.
+ '''Count understemmed and overstemmed pairs for (lemma, stem) pair with common words.
:param lemmawords: Set or list of words corresponding to certain lemma.
:param stems: A dictionary where keys are stems and values are sets
:return: Amount of understemmed and overstemmed pairs contributed by words
existing in both lemmawords and stems.
:rtype: tuple(float, float)
- """
+ '''
umt, wmt = 0.0, 0.0
for stem in stems:
cut = set(lemmawords) & set(stems[stem])
def _calculate(lemmas, stems):
- """Calculate actual and maximum possible amounts of understemmed and overstemmed word pairs.
+ '''Calculate actual and maximum possible amounts of understemmed and overstemmed word pairs.
:param lemmas: A dictionary where keys are lemmas and values are sets
or lists of words corresponding to that lemma.
global wrongly merged total (gwmt) and
global desired non-merge total (gdnt).
:rtype: tuple(float, float, float, float)
- """
+ '''
n = sum(len(lemmas[word]) for word in lemmas)
def _indexes(gumt, gdmt, gwmt, gdnt):
- """Count Understemming Index (UI), Overstemming Index (OI) and Stemming Weight (SW).
+ '''Count Understemming Index (UI), Overstemming Index (OI) and Stemming Weight (SW).
:param gumt, gdmt, gwmt, gdnt: Global unachieved merge total (gumt),
global desired merge total (gdmt),
Overstemming Index (OI) and
Stemming Weight (SW).
:rtype: tuple(float, float, float)
- """
+ '''
# Calculate Understemming Index (UI),
# Overstemming Index (OI) and Stemming Weight (SW)
try:
except ZeroDivisionError:
if oi == 0.0:
# OI and UI are 0, define SW as 'not a number'
- sw = float("nan")
+ sw = float('nan')
else:
# UI is 0, define SW as infinity
- sw = float("inf")
+ sw = float('inf')
return (ui, oi, sw)
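# Hand computation of the three indexes with hypothetical totals,
# using the ratios spelled out in Paice.__str__ below
# (UI = GUMT/GDMT, OI = GWMT/GDNT, SW = OI/UI):
gumt, gdmt, gwmt, gdnt = 1.0, 4.0, 2.0, 16.0
ui = gumt / gdmt    # 0.25   understemming index
oi = gwmt / gdnt    # 0.125  overstemming index
sw = oi / ui        # 0.5    a low SW suggests a light (understemming) stemmer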
class Paice(object):
- """Class for storing lemmas, stems and evaluation metrics."""
-
+ '''Class for storing lemmas, stems and evaluation metrics.'''
def __init__(self, lemmas, stems):
- """
+ '''
:param lemmas: A dictionary where keys are lemmas and values are sets
or lists of words corresponding to that lemma.
:param stems: A dictionary where keys are stems and values are sets
or lists of words corresponding to that stem.
:type lemmas: dict(str): list(str)
:type stems: dict(str): set(str)
- """
+ '''
self.lemmas = lemmas
self.stems = stems
self.coords = []
self.update()
def __str__(self):
- text = ["Global Unachieved Merge Total (GUMT): %s\n" % self.gumt]
- text.append("Global Desired Merge Total (GDMT): %s\n" % self.gdmt)
- text.append("Global Wrongly-Merged Total (GWMT): %s\n" % self.gwmt)
- text.append("Global Desired Non-merge Total (GDNT): %s\n" % self.gdnt)
- text.append("Understemming Index (GUMT / GDMT): %s\n" % self.ui)
- text.append("Overstemming Index (GWMT / GDNT): %s\n" % self.oi)
- text.append("Stemming Weight (OI / UI): %s\n" % self.sw)
- text.append("Error-Rate Relative to Truncation (ERRT): %s\r\n" % self.errt)
- coordinates = " ".join(["(%s, %s)" % item for item in self.coords])
- text.append("Truncation line: %s" % coordinates)
- return "".join(text)
+ text = ['Global Unachieved Merge Total (GUMT): %s\n' % self.gumt]
+ text.append('Global Desired Merge Total (GDMT): %s\n' % self.gdmt)
+ text.append('Global Wrongly-Merged Total (GWMT): %s\n' % self.gwmt)
+ text.append('Global Desired Non-merge Total (GDNT): %s\n' % self.gdnt)
+ text.append('Understemming Index (GUMT / GDMT): %s\n' % self.ui)
+ text.append('Overstemming Index (GWMT / GDNT): %s\n' % self.oi)
+ text.append('Stemming Weight (OI / UI): %s\n' % self.sw)
+ text.append('Error-Rate Relative to Truncation (ERRT): %s\r\n' % self.errt)
+ coordinates = ' '.join(['(%s, %s)' % item for item in self.coords])
+ text.append('Truncation line: %s' % coordinates)
+ return ''.join(text)
def _get_truncation_indexes(self, words, cutlength):
- """Count (UI, OI) when stemming is done by truncating words at \'cutlength\'.
+ '''Count (UI, OI) when stemming is done by truncating words at \'cutlength\'.
:param words: Words used for the analysis
:param cutlength: Words are stemmed by cutting them at this length
:type cutlength: int
:return: Understemming and overstemming indexes
:rtype: tuple(int, int)
- """
+ '''
truncated = _truncate(words, cutlength)
gumt, gdmt, gwmt, gdnt = _calculate(self.lemmas, truncated)
return (ui, oi)
def _get_truncation_coordinates(self, cutlength=0):
- """Count (UI, OI) pairs for truncation points until we find the segment where (ui, oi) crosses the truncation line.
+ '''Count (UI, OI) pairs for truncation points until we find the segment where (ui, oi) crosses the truncation line.
:param cutlength: Optional parameter to start counting from (ui, oi)
coordinates gotten by stemming at this length. Useful for speeding up
:type cutlength: int
:return: List of coordinate pairs that define the truncation line
:rtype: list(tuple(float, float))
- """
+ '''
words = get_words_from_dictionary(self.lemmas)
maxlength = max(len(word) for word in words)
return coords
def _errt(self):
- """Count Error-Rate Relative to Truncation (ERRT).
+ '''Count Error-Rate Relative to Truncation (ERRT).
:return: ERRT, length of the line from origo to (UI, OI) divided by
the length of the line from origo to the point defined by the same
line when extended until the truncation line.
:rtype: float
- """
+ '''
# Count (UI, OI) pairs for truncation points until we find the segment where (ui, oi) crosses the truncation line
self.coords = self._get_truncation_coordinates()
if (0.0, 0.0) in self.coords:
# Truncation line goes through origo, so ERRT cannot be counted
if (self.ui, self.oi) != (0.0, 0.0):
- return float("inf")
+ return float('inf')
else:
- return float("nan")
+ return float('nan')
if (self.ui, self.oi) == (0.0, 0.0):
# (ui, oi) is origo; define errt as 0.0
return 0.0
# Count the intersection point
# Note that (self.ui, self.oi) cannot be (0.0, 0.0) and self.coords has different coordinates
# so we have actual line segments instead of a line segment and a point
- intersection = _count_intersection(
- ((0, 0), (self.ui, self.oi)), self.coords[-2:]
- )
+ intersection = _count_intersection(((0, 0), (self.ui, self.oi)),
+ self.coords[-2:]
+ )
# Count OP (length of the line from origo to (ui, oi))
op = sqrt(self.ui ** 2 + self.oi ** 2)
# Count OT (length of the line from origo to truncation line that goes through (ui, oi))
return op / ot
def update(self):
- """Update statistics after lemmas and stems have been set."""
+ '''Update statistics after lemmas and stems have been set.'''
self.gumt, self.gdmt, self.gwmt, self.gdnt = _calculate(self.lemmas, self.stems)
self.ui, self.oi, self.sw = _indexes(self.gumt, self.gdmt, self.gwmt, self.gdnt)
self.errt = self._errt()
def demo():
- """Demonstration of the module."""
+ '''Demonstration of the module.'''
# Some words with their real lemmas
- lemmas = {
- "kneel": ["kneel", "knelt"],
- "range": ["range", "ranged"],
- "ring": ["ring", "rang", "rung"],
- }
+ lemmas = {'kneel': ['kneel', 'knelt'],
+ 'range': ['range', 'ranged'],
+ 'ring': ['ring', 'rang', 'rung']
+ }
# Same words with stems from a stemming algorithm
- stems = {
- "kneel": ["kneel"],
- "knelt": ["knelt"],
- "rang": ["rang", "range", "ranged"],
- "ring": ["ring"],
- "rung": ["rung"],
- }
- print("Words grouped by their lemmas:")
+ stems = {'kneel': ['kneel'],
+ 'knelt': ['knelt'],
+ 'rang': ['rang', 'range', 'ranged'],
+ 'ring': ['ring'],
+ 'rung': ['rung']
+ }
+ print('Words grouped by their lemmas:')
for lemma in sorted(lemmas):
- print("%s => %s" % (lemma, " ".join(lemmas[lemma])))
+ print('%s => %s' % (lemma, ' '.join(lemmas[lemma])))
print()
- print("Same words grouped by a stemming algorithm:")
+ print('Same words grouped by a stemming algorithm:')
for stem in sorted(stems):
- print("%s => %s" % (stem, " ".join(stems[stem])))
+ print('%s => %s' % (stem, ' '.join(stems[stem])))
print()
p = Paice(lemmas, stems)
print(p)
print()
# Let's "change" results from a stemming algorithm
- stems = {
- "kneel": ["kneel"],
- "knelt": ["knelt"],
- "rang": ["rang"],
- "range": ["range", "ranged"],
- "ring": ["ring"],
- "rung": ["rung"],
- }
- print("Counting stats after changing stemming results:")
+ stems = {'kneel': ['kneel'],
+ 'knelt': ['knelt'],
+ 'rang': ['rang'],
+ 'range': ['range', 'ranged'],
+ 'ring': ['ring'],
+ 'rung': ['rung']
+ }
+ print('Counting stats after changing stemming results:')
for stem in sorted(stems):
- print("%s => %s" % (stem, " ".join(stems[stem])))
+ print('%s => %s' % (stem, ' '.join(stems[stem])))
print()
p.stems = stems
p.update()
print(p)
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: Evaluation
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function, division
from math import fabs
import operator
from random import shuffle
from functools import reduce
+from six.moves import range, zip
+
try:
from scipy.stats.stats import betai
except ImportError:
from nltk.util import LazyConcatenation, LazyMap
-
def accuracy(reference, test):
"""
Given a list of reference values and a corresponding list of test
raise ValueError("Lists must have the same length.")
return sum(x == y for x, y in zip(reference, test)) / len(test)
-
def precision(reference, test):
"""
Given a set of reference values and a set of test values, return
:param test: A set of values to compare against the reference set.
:rtype: float or None
"""
- if not hasattr(reference, "intersection") or not hasattr(test, "intersection"):
- raise TypeError("reference and test should be sets")
+ if (not hasattr(reference, 'intersection') or
+ not hasattr(test, 'intersection')):
+ raise TypeError('reference and test should be sets')
if len(test) == 0:
return None
else:
return len(reference.intersection(test)) / len(test)
-
def recall(reference, test):
"""
Given a set of reference values and a set of test values, return
:param test: A set of values to compare against the reference set.
:rtype: float or None
"""
- if not hasattr(reference, "intersection") or not hasattr(test, "intersection"):
- raise TypeError("reference and test should be sets")
+ if (not hasattr(reference, 'intersection') or
+ not hasattr(test, 'intersection')):
+ raise TypeError('reference and test should be sets')
if len(reference) == 0:
return None
else:
return len(reference.intersection(test)) / len(reference)
-
def f_measure(reference, test, alpha=0.5):
"""
Given a set of reference values and a set of test values, return
return None
if p == 0 or r == 0:
return 0
- return 1.0 / (alpha / p + (1 - alpha) / r)
-
+ return 1.0 / (alpha / p + (1-alpha) / r)
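# Hand check of f_measure(): alpha weights precision against recall,
# and alpha = 0.5 recovers the usual harmonic mean 2*p*r / (p + r).
p, r, alpha = 0.6, 0.75, 0.5
f = 1.0 / (alpha / p + (1 - alpha) / r)
assert abs(f - 2 * p * r / (p + r)) < 1e-12   # both equal 2/3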
def log_likelihood(reference, test):
"""
raise ValueError("Lists must have the same length.")
# Return the average value of dist.logprob(val).
- total_likelihood = sum(dist.logprob(val) for (val, dist) in zip(reference, test))
+ total_likelihood = sum(dist.logprob(val)
+ for (val, dist) in zip(reference, test))
return total_likelihood / len(reference)
-
def approxrand(a, b, **kwargs):
"""
Returns an approximate significance level between two lists of
:param b: another list of independently generated test values
:type b: list
"""
- shuffles = kwargs.get("shuffles", 999)
+ shuffles = kwargs.get('shuffles', 999)
# there's no point in trying to shuffle beyond all possible permutations
- shuffles = min(shuffles, reduce(operator.mul, range(1, len(a) + len(b) + 1)))
- stat = kwargs.get("statistic", lambda lst: sum(lst) / len(lst))
- verbose = kwargs.get("verbose", False)
+ shuffles = \
+ min(shuffles, reduce(operator.mul, range(1, len(a) + len(b) + 1)))
+ stat = kwargs.get('statistic', lambda lst: sum(lst) / len(lst))
+ verbose = kwargs.get('verbose', False)
if verbose:
- print("shuffles: %d" % shuffles)
+ print('shuffles: %d' % shuffles)
actual_stat = fabs(stat(a) - stat(b))
if verbose:
- print("actual statistic: %f" % actual_stat)
- print("-" * 60)
+ print('actual statistic: %f' % actual_stat)
+ print('-' * 60)
c = 1e-100
lst = LazyConcatenation([a, b])
for i in range(shuffles):
if verbose and i % 10 == 0:
- print("shuffle: %d" % i)
+ print('shuffle: %d' % i)
shuffle(indices)
- pseudo_stat_a = stat(LazyMap(lambda i: lst[i], indices[: len(a)]))
- pseudo_stat_b = stat(LazyMap(lambda i: lst[i], indices[len(a) :]))
+ pseudo_stat_a = stat(LazyMap(lambda i: lst[i], indices[:len(a)]))
+ pseudo_stat_b = stat(LazyMap(lambda i: lst[i], indices[len(a):]))
pseudo_stat = fabs(pseudo_stat_a - pseudo_stat_b)
if pseudo_stat >= actual_stat:
c += 1
if verbose and i % 10 == 0:
- print("pseudo-statistic: %f" % pseudo_stat)
- print("significance: %f" % ((c + 1) / (i + 1)))
- print("-" * 60)
+ print('pseudo-statistic: %f' % pseudo_stat)
+ print('significance: %f' % ((c + 1) / (i + 1)))
+ print('-' * 60)
significance = (c + 1) / (shuffles + 1)
if verbose:
- print("significance: %f" % significance)
+ print('significance: %f' % significance)
if betai:
for phi in [0.01, 0.05, 0.10, 0.15, 0.25, 0.50]:
print("prob(phi<=%f): %f" % (phi, betai(c, shuffles, phi)))
def demo():
- print("-" * 75)
- reference = "DET NN VB DET JJ NN NN IN DET NN".split()
- test = "DET VB VB DET NN NN NN IN DET NN".split()
- print("Reference =", reference)
- print("Test =", test)
- print("Accuracy:", accuracy(reference, test))
-
- print("-" * 75)
+ print('-'*75)
+ reference = 'DET NN VB DET JJ NN NN IN DET NN'.split()
+ test = 'DET VB VB DET NN NN NN IN DET NN'.split()
+ print('Reference =', reference)
+ print('Test =', test)
+ print('Accuracy:', accuracy(reference, test))
+
+ print('-'*75)
reference_set = set(reference)
test_set = set(test)
- print("Reference =", reference_set)
- print("Test = ", test_set)
- print("Precision:", precision(reference_set, test_set))
- print(" Recall:", recall(reference_set, test_set))
- print("F-Measure:", f_measure(reference_set, test_set))
- print("-" * 75)
-
-
-if __name__ == "__main__":
+ print('Reference =', reference_set)
+ print('Test = ', test_set)
+ print('Precision:', precision(reference_set, test_set))
+ print(' Recall:', recall(reference_set, test_set))
+ print('F-Measure:', f_measure(reference_set, test_set))
+ print('-'*75)
+
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: Text Segmentation Metrics
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# David Doukhan <david.doukhan@gmail.com>
# For license information, see LICENSE.TXT
+
"""
Text Segmentation Metrics
except ImportError:
pass
+from six.moves import range
+
def windowdiff(seg1, seg2, k, boundary="1", weighted=False):
"""
if len(seg1) != len(seg2):
raise ValueError("Segmentations have unequal length")
if k > len(seg1):
- raise ValueError(
- "Window width k should be smaller or equal than segmentation lengths"
- )
+ raise ValueError("Window width k should be smaller or equal than segmentation lengths")
wd = 0
for i in range(len(seg1) - k + 1):
- ndiff = abs(seg1[i : i + k].count(boundary) - seg2[i : i + k].count(boundary))
+ ndiff = abs(seg1[i:i+k].count(boundary) - seg2[i:i+k].count(boundary))
if weighted:
wd += ndiff
else:
wd += min(1, ndiff)
- return wd / (len(seg1) - k + 1.0)
+ return wd / (len(seg1) - k + 1.)
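# Hand trace of windowdiff() on toy segmentations: with seg1 = "0100",
# seg2 = "0010" and k = 2, the three windows hold boundary counts
# 1, 1, 0 versus 0, 1, 1, so the per-window differences (capped at 1
# when unweighted) are 1, 0, 1.
seg1, k = "0100", 2
wd = (1 + 0 + 1) / (len(seg1) - k + 1.0)   # 2/3 ~= 0.667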
-# Generalized Hamming Distance
+# Generalized Hamming Distance
def _init_mat(nrows, ncols, ins_cost, del_cost):
mat = np.empty((nrows, ncols))
mat[i + 1, j + 1] = min(tcost, shift_cost)
-def ghd(ref, hyp, ins_cost=2.0, del_cost=2.0, shift_cost_coeff=1.0, boundary="1"):
+def ghd(ref, hyp, ins_cost=2.0, del_cost=2.0, shift_cost_coeff=1.0, boundary='1'):
"""
Compute the Generalized Hamming Distance for a reference and a hypothetical
segmentation, corresponding to the cost related to the transformation
# Beeferman's Pk text segmentation evaluation metric
-
-def pk(ref, hyp, k=None, boundary="1"):
+def pk(ref, hyp, k=None, boundary='1'):
"""
Compute the Pk metric for a pair of segmentations. A segmentation
is any sequence over a vocabulary of two items (e.g. "0", "1"),
"""
if k is None:
- k = int(round(len(ref) / (ref.count(boundary) * 2.0)))
+ k = int(round(len(ref) / (ref.count(boundary) * 2.)))
err = 0
- for i in range(len(ref) - k + 1):
- r = ref[i : i + k].count(boundary) > 0
- h = hyp[i : i + k].count(boundary) > 0
+ for i in range(len(ref)-k +1):
+ r = ref[i:i+k].count(boundary) > 0
+ h = hyp[i:i+k].count(boundary) > 0
if r != h:
- err += 1
- return err / (len(ref) - k + 1.0)
+ err += 1
+ return err / (len(ref)-k +1.)
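# Hand trace of pk(): for ref = "01000", hyp = "00100" and k = 2, the
# reference sees a boundary in windows 0 and 1, the hypothesis in
# windows 1 and 2, so windows 0 and 2 disagree. When k is omitted it
# defaults to half the mean reference segment length.
ref, k = "01000", 2
score = 2 / (len(ref) - k + 1.0)   # 2/4 = 0.5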
# skip doctests if numpy is not installed
def setup_module(module):
from nose import SkipTest
-
try:
import numpy
except ImportError:
# Natural Language Toolkit: Spearman Rank Correlation
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Joel Nothman <jnothman@student.usyd.edu.au>
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT
+from __future__ import division
"""
Tools for comparing ranked lists.
"""
-
def _rank_dists(ranks1, ranks2):
"""Finds the difference between the values in ranks1 and ranks2 for keys
present in both dicts. If the arguments are not dicts, they are converted
res += d * d
n += 1
try:
- return 1 - (6 * res / (n * (n * n - 1)))
+ return 1 - (6 * res / (n * (n*n - 1)))
except ZeroDivisionError:
# Result is undefined if only one item is ranked
return 0.0
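# Hand computation of the formula above,
# rho = 1 - 6 * sum(d**2) / (n * (n*n - 1)), for two toy rankings:
# ranks1 = {'a': 0, 'b': 1, 'c': 2} and ranks2 = {'a': 0, 'b': 2, 'c': 1}
# give rank differences 0, -1, 1, so sum(d**2) = 2 and n = 3.
rho = 1 - (6 * 2 / (3 * (3 * 3 - 1)))   # 1 - 12/24 = 0.5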
yield key, rank
prev_score = score
+
# Natural Language Toolkit: Miscellaneous modules
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
module is kept in NLTK source code in order to provide better error
messages for people following the NLTK Book 2.0.
"""
-
+from __future__ import print_function
def babelize_shell():
print("Babelfish online translation service is no longer available.")
(CHOMSKY n) -- for example
(CHOMSKY 5) generates half a screen of linguistic truth.
"""
+from __future__ import print_function
leadins = """To characterize a linguistic level L,
On the other hand,
is necessary to impose an interpretation on
appears to correlate rather closely with
is rather different from"""
-# List of VERBs chosen for autorecursive obfuscation.
+#List of VERBs chosen for autorecursive obfuscation.
objects = """ problems of phonemic and morphological analysis.
a corpus of utterance tokens upon which conformity has been defined \
import textwrap, random
from itertools import chain, islice
+from six.moves import zip
+
def generate_chomsky(times=5, line_length=72):
parts = []
output = chain(*islice(zip(*parts), 0, times))
print(textwrap.fill(" ".join(output), line_length))
-
-if __name__ == "__main__":
+if __name__ == '__main__':
generate_chomsky()
# Natural Language Toolkit: Minimal Sets
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT
from collections import defaultdict
-
class MinimalSet(object):
"""
Find contexts where more than one possible target value can
cases like wind (noun) 'air in rapid motion', vs wind (verb)
'coil, wrap'.
"""
-
def __init__(self, parameters=None):
"""
Create a new minimal set.
:type parameters: list(tuple(str, str, str))
"""
self._targets = set() # the contrastive information
- self._contexts = set() # what we are controlling for
+ self._contexts = set() # what we are controlling for
self._seen = defaultdict(set) # to record what we have seen
- self._displays = {} # what we will display
+ self._displays = {} # what we will display
if parameters:
for context, target, display in parameters:
result = []
for target in self._targets:
x = self.display(context, target)
- if x:
- result.append(x)
+ if x: result.append(x)
return result
def targets(self):
return self._targets
+
# Natural Language Toolkit: List Sorting
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
illustrate the many different algorithms (recipes) for solving a
problem, and how to analyze algorithms experimentally.
"""
+from __future__ import print_function, division
+
# These algorithms are taken from:
# Levitin (2004) The Design and Analysis of Algorithms
# Selection Sort
##################################################################
-
def selection(a):
"""
Selection Sort: scan the list to find its smallest element, then
for i in range(len(a) - 1):
min = i
- for j in range(i + 1, len(a)):
+ for j in range(i+1, len(a)):
if a[j] < a[min]:
min = j
count += 1
- a[min], a[i] = a[i], a[min]
+ a[min],a[i] = a[i],a[min]
return count
-
##################################################################
# Bubble Sort
##################################################################
-
def bubble(a):
"""
Bubble Sort: compare adjacent elements of the list left-to-right,
apply the same method to this list, and so on.
"""
count = 0
- for i in range(len(a) - 1):
- for j in range(len(a) - i - 1):
- if a[j + 1] < a[j]:
- a[j], a[j + 1] = a[j + 1], a[j]
+ for i in range(len(a)-1):
+ for j in range(len(a)-i-1):
+ if a[j+1] < a[j]:
+ a[j],a[j+1] = a[j+1],a[j]
count += 1
return count
# Merge Sort
##################################################################
-
def _merge_lists(b, c):
count = 0
i = j = 0
a = []
- while i < len(b) and j < len(c):
+ while (i < len(b) and j < len(c)):
count += 1
if b[i] <= c[j]:
a.append(b[i])
a += b[i:]
return a, count
-
def merge(a):
"""
Merge Sort: split the list in half, and sort each half, then
count_b = merge(b)
count_c = merge(c)
result, count_a = _merge_lists(b, c)
- a[:] = result # copy the result back into a.
+ a[:] = result # copy the result back into a.
count = count_a + count_b + count_c
return count
-
##################################################################
# Quick Sort
##################################################################
-
def _partition(a, l, r):
- p = a[l]
- i = l
- j = r + 1
+ p = a[l]; i = l; j = r+1
count = 0
while True:
while i < r:
i += 1
- if a[i] >= p:
- break
+ if a[i] >= p: break
while j > l:
j -= 1
- if j < l or a[j] <= p:
- break
- a[i], a[j] = a[j], a[i] # swap
+ if j < l or a[j] <= p: break
+ a[i],a[j] = a[j],a[i] # swap
count += 1
- if i >= j:
- break
- a[i], a[j] = a[j], a[i] # undo last swap
- a[l], a[j] = a[j], a[l]
+ if i >= j: break
+ a[i],a[j] = a[j],a[i] # undo last swap
+ a[l],a[j] = a[j],a[l]
return j, count
-
def _quick(a, l, r):
count = 0
- if l < r:
+ if l<r:
s, count = _partition(a, l, r)
- count += _quick(a, l, s - 1)
- count += _quick(a, s + 1, r)
+ count += _quick(a, l, s-1)
+ count += _quick(a, s+1, r)
return count
-
def quick(a):
- return _quick(a, 0, len(a) - 1)
-
+ return _quick(a, 0, len(a)-1)
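# Usage sketch: each recipe above sorts its argument in place and
# returns an operation count (comparisons for selection/merge, swaps
# for bubble/quick), so one shuffled list can be copied to each:
#
#     data = [3, 1, 4, 1, 5, 9, 2, 6]
#     print(selection(data[:]), bubble(data[:]),
#           merge(data[:]), quick(data[:]))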
##################################################################
# Demonstration
##################################################################
-
def demo():
from random import shuffle
a = list(range(size))
# various sort methods
- shuffle(a)
- count_selection = selection(a)
- shuffle(a)
- count_bubble = bubble(a)
- shuffle(a)
- count_merge = merge(a)
- shuffle(a)
- count_quick = quick(a)
-
- print(
- (
- ("size=%5d: selection=%8d, bubble=%8d, " "merge=%6d, quick=%6d")
- % (size, count_selection, count_bubble, count_merge, count_quick)
- )
- )
-
-
-if __name__ == "__main__":
+ shuffle(a); count_selection = selection(a)
+ shuffle(a); count_bubble = bubble(a)
+ shuffle(a); count_merge = merge(a)
+ shuffle(a); count_quick = quick(a)
+
+ print((("size=%5d: selection=%8d, bubble=%8d, "
+ "merge=%6d, quick=%6d") %
+ (size, count_selection, count_bubble,
+ count_merge, count_quick)))
+
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: Word Finder
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
# Simplified from PHP version by Robert Klein <brathna@gmail.com>
# http://fswordfinder.sourceforge.net/
+from __future__ import print_function
import random
# reverse a word with probability 0.5
def revword(word):
- if random.randint(1, 2) == 1:
+ if random.randint(1,2) == 1:
return word[::-1]
return word
-
# try to insert word at position x,y; direction encoded in xf,yf
def step(word, x, xf, y, yf, grid):
for i in range(len(word)):
grid[xf(i)][yf(i)] = word[i]
return True
-
# try to insert word at position x,y, in direction dir
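# (dir encodes the direction of travel from (x, y): 1 = up-left
#  diagonal, 2 = straight up, 3 = up-right diagonal, 4 = straight
#  left; together with revword() above this covers all eight
#  orientations)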
def check(word, dir, x, y, grid, rows, cols):
- if dir == 1:
- if x - len(word) < 0 or y - len(word) < 0:
+ if dir==1:
+ if x-len(word)<0 or y-len(word)<0:
return False
- return step(word, x, lambda i: x - i, y, lambda i: y - i, grid)
- elif dir == 2:
- if x - len(word) < 0:
+ return step(word, x, lambda i:x-i, y, lambda i:y-i, grid)
+ elif dir==2:
+ if x-len(word)<0:
return False
- return step(word, x, lambda i: x - i, y, lambda i: y, grid)
- elif dir == 3:
- if x - len(word) < 0 or y + (len(word) - 1) >= cols:
+ return step(word, x, lambda i:x-i, y, lambda i:y, grid)
+ elif dir==3:
+ if x-len(word)<0 or y+(len(word)-1)>=cols:
return False
- return step(word, x, lambda i: x - i, y, lambda i: y + i, grid)
- elif dir == 4:
- if y - len(word) < 0:
+ return step(word, x, lambda i:x-i, y, lambda i:y+i, grid)
+ elif dir==4:
+ if y-len(word)<0:
return False
- return step(word, x, lambda i: x, y, lambda i: y - i, grid)
-
+ return step(word, x, lambda i:x, y, lambda i:y-i, grid)
-def wordfinder(words, rows=20, cols=20, attempts=50, alph="ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
+def wordfinder(words, rows=20, cols=20, attempts=50,
+ alph='ABCDEFGHIJKLMNOPQRSTUVWXYZ'):
"""
Attempt to arrange words into a letter-grid with the specified
number of rows and columns. Try each word in several positions
# try to place each word
for word in words:
word = word.strip().upper() # normalize
- save = word # keep a record of the word
+ save = word # keep a record of the word
word = revword(word)
for attempt in range(attempts):
r = random.randint(0, len(word))
- dir = random.choice([1, 2, 3, 4])
- x = random.randint(0, rows)
- y = random.randint(0, cols)
- if dir == 1:
- x += r
- y += r
- elif dir == 2:
- x += r
- elif dir == 3:
- x += r
- y -= r
- elif dir == 4:
- y += r
- if 0 <= x < rows and 0 <= y < cols:
+ dir = random.choice([1,2,3,4])
+ x = random.randint(0,rows)
+ y = random.randint(0,cols)
+ if dir==1: x+=r; y+=r
+ elif dir==2: x+=r
+ elif dir==3: x+=r; y-=r
+ elif dir==4: y+=r
+ if 0<=x<rows and 0<=y<cols:
if check(word, dir, x, y, grid, rows, cols):
- # used.append((save, dir, x, y, word))
+# used.append((save, dir, x, y, word))
used.append(save)
break
# Fill up the remaining spaces
for i in range(rows):
for j in range(cols):
- if grid[i][j] == "":
+ if grid[i][j] == '':
grid[i][j] = random.choice(alph)
return grid, used
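# Usage sketch (word_finder() below does the same with 200 random
# dictionary words):
#
#     grid, used = wordfinder(["PYTHON", "GRAMMAR", "CORPUS"])
#     # grid is a rows x cols matrix of single letters; used lists
#     # the words that were successfully placed.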
-
def word_finder():
from nltk.corpus import words
-
wordlist = words.words()
random.shuffle(wordlist)
wordlist = wordlist[:200]
print("Word Finder\n")
for i in range(len(grid)):
for j in range(len(grid[i])):
- print(grid[i][j], end=" ")
+ print(grid[i][j], end=' ')
print()
print()
for i in range(len(used)):
- print("%d:" % (i + 1), used[i])
-
+ print("%d:" % (i+1), used[i])
-if __name__ == "__main__":
+if __name__ == '__main__':
word_finder()
# Natural Language Toolkit: Parsers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
"""
from nltk.parse.api import ParserI
-from nltk.parse.chart import (
- ChartParser,
- SteppingChartParser,
- TopDownChartParser,
- BottomUpChartParser,
- BottomUpLeftCornerChartParser,
- LeftCornerChartParser,
-)
-from nltk.parse.featurechart import (
- FeatureChartParser,
- FeatureTopDownChartParser,
- FeatureBottomUpChartParser,
- FeatureBottomUpLeftCornerChartParser,
-)
-from nltk.parse.earleychart import (
- IncrementalChartParser,
- EarleyChartParser,
- IncrementalTopDownChartParser,
- IncrementalBottomUpChartParser,
- IncrementalBottomUpLeftCornerChartParser,
- IncrementalLeftCornerChartParser,
- FeatureIncrementalChartParser,
- FeatureEarleyChartParser,
- FeatureIncrementalTopDownChartParser,
- FeatureIncrementalBottomUpChartParser,
- FeatureIncrementalBottomUpLeftCornerChartParser,
-)
-from nltk.parse.pchart import (
- BottomUpProbabilisticChartParser,
- InsideChartParser,
- RandomChartParser,
- UnsortedChartParser,
- LongestChartParser,
-)
-from nltk.parse.recursivedescent import (
- RecursiveDescentParser,
- SteppingRecursiveDescentParser,
-)
-from nltk.parse.shiftreduce import ShiftReduceParser, SteppingShiftReduceParser
+from nltk.parse.chart import (ChartParser, SteppingChartParser, TopDownChartParser,
+ BottomUpChartParser, BottomUpLeftCornerChartParser,
+ LeftCornerChartParser)
+from nltk.parse.featurechart import (FeatureChartParser, FeatureTopDownChartParser,
+ FeatureBottomUpChartParser,
+ FeatureBottomUpLeftCornerChartParser)
+from nltk.parse.earleychart import (IncrementalChartParser, EarleyChartParser,
+ IncrementalTopDownChartParser,
+ IncrementalBottomUpChartParser,
+ IncrementalBottomUpLeftCornerChartParser,
+ IncrementalLeftCornerChartParser,
+ FeatureIncrementalChartParser,
+ FeatureEarleyChartParser,
+ FeatureIncrementalTopDownChartParser,
+ FeatureIncrementalBottomUpChartParser,
+ FeatureIncrementalBottomUpLeftCornerChartParser)
+from nltk.parse.pchart import (BottomUpProbabilisticChartParser, InsideChartParser,
+ RandomChartParser, UnsortedChartParser,
+ LongestChartParser)
+from nltk.parse.recursivedescent import (RecursiveDescentParser,
+ SteppingRecursiveDescentParser)
+from nltk.parse.shiftreduce import (ShiftReduceParser, SteppingShiftReduceParser)
from nltk.parse.util import load_parser, TestGrammar, extract_test_sentences
from nltk.parse.viterbi import ViterbiParser
from nltk.parse.dependencygraph import DependencyGraph
-from nltk.parse.projectivedependencyparser import (
- ProjectiveDependencyParser,
- ProbabilisticProjectiveDependencyParser,
-)
-from nltk.parse.nonprojectivedependencyparser import (
- NonprojectiveDependencyParser,
- NaiveBayesDependencyScorer,
- ProbabilisticNonprojectiveParser,
-)
+from nltk.parse.projectivedependencyparser import (ProjectiveDependencyParser,
+ ProbabilisticProjectiveDependencyParser)
+from nltk.parse.nonprojectivedependencyparser import (NonprojectiveDependencyParser,
+ NaiveBayesDependencyScorer,
+ ProbabilisticNonprojectiveParser)
from nltk.parse.malt import MaltParser
from nltk.parse.evaluate import DependencyEvaluator
from nltk.parse.transitionparser import TransitionParser
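# Usage sketch for the names re-exported above -- pair a small CFG
# with one of the chart parsers:
#
#     from nltk import CFG
#     from nltk.parse import BottomUpLeftCornerChartParser
#     grammar = CFG.fromstring("S -> 'a' S | 'a'")
#     for tree in BottomUpLeftCornerChartParser(grammar).parse("a a".split()):
#         print(tree)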
# Natural Language Toolkit: Parser API
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
from nltk.internals import overridden
-
class ParserI(object):
"""
A processing class for deriving trees that represent possible
Subclasses may define:
- ``grammar()``
"""
-
def grammar(self):
"""
:return: The grammar used by this parser.
if overridden(self.parse_sents):
return next(self.parse_sents([sent], *args, **kwargs))
elif overridden(self.parse_one):
- return (
- tree
- for tree in [self.parse_one(sent, *args, **kwargs)]
- if tree is not None
- )
+ return (tree for tree in [self.parse_one(sent, *args, **kwargs)] if tree is not None)
elif overridden(self.parse_all):
return iter(self.parse_all(sent, *args, **kwargs))
else:
#
# Author: David McClosky <dmcc@bigasterisk.com>
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function
+
from nltk.parse.api import ParserI
from nltk.tree import Tree
on BLLIP Parser's Python interface.
"""
-__all__ = ["BllipParser"]
+__all__ = ['BllipParser']
# this block allows this module to be imported even if bllipparser isn't
# available
def _ensure_bllip_import_or_error():
pass
-
-
except ImportError as ie:
-
def _ensure_bllip_import_or_error(ie=ie):
raise ImportError("Couldn't import bllipparser module: %s" % ie)
-
def _ensure_ascii(words):
try:
for i, word in enumerate(words):
- word.decode("ascii")
+ word.decode('ascii')
except UnicodeDecodeError:
- raise ValueError(
- "Token %d (%r) is non-ASCII. BLLIP Parser "
- "currently doesn't support non-ASCII inputs." % (i, word)
- )
-
+ raise ValueError("Token %d (%r) is non-ASCII. BLLIP Parser "
+ "currently doesn't support non-ASCII inputs." %
+ (i, word))
def _scored_parse_to_nltk_tree(scored_parse):
return Tree.fromstring(str(scored_parse.ptb_parse))
-
class BllipParser(ParserI):
"""
Interface for parsing with BLLIP Parser. BllipParser objects can be
constructed with the ``BllipParser.from_unified_model_dir`` class
method or manually using the ``BllipParser`` constructor.
"""
-
- def __init__(
- self,
- parser_model=None,
- reranker_features=None,
- reranker_weights=None,
- parser_options=None,
- reranker_options=None,
- ):
+ def __init__(self, parser_model=None, reranker_features=None,
+ reranker_weights=None, parser_options=None,
+ reranker_options=None):
"""
Load a BLLIP Parser model from scratch. You'll typically want to
use the ``from_unified_model_dir()`` class method to construct
self.rrp = RerankingParser()
self.rrp.load_parser_model(parser_model, **parser_options)
if reranker_features and reranker_weights:
- self.rrp.load_reranker_model(
- features_filename=reranker_features,
- weights_filename=reranker_weights,
- **reranker_options
- )
+ self.rrp.load_reranker_model(features_filename=reranker_features,
+ weights_filename=reranker_weights,
+ **reranker_options)
def parse(self, sentence):
"""
yield _scored_parse_to_nltk_tree(scored_parse)
@classmethod
- def from_unified_model_dir(
- cls, model_dir, parser_options=None, reranker_options=None
- ):
+ def from_unified_model_dir(this_class, model_dir, parser_options=None,
+ reranker_options=None):
"""
Create a ``BllipParser`` object from a unified parsing model
directory. Unified parsing model directories are a standardized
:type reranker_options: dict(str)
:rtype: BllipParser
"""
- (
- parser_model_dir,
- reranker_features_filename,
- reranker_weights_filename,
- ) = get_unified_model_parameters(model_dir)
- return cls(
- parser_model_dir,
- reranker_features_filename,
- reranker_weights_filename,
- parser_options,
- reranker_options,
- )
-
+ (parser_model_dir, reranker_features_filename,
+ reranker_weights_filename) = get_unified_model_parameters(model_dir)
+ return this_class(parser_model_dir, reranker_features_filename,
+ reranker_weights_filename, parser_options,
+ reranker_options)
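# Usage sketch (assumes the bllipparser module and the
# bllip_wsj_no_aux model are installed; see demo() below):
#
#     from nltk.data import find
#     bllip = BllipParser.from_unified_model_dir(
#         find('models/bllip_wsj_no_aux').path)
#     tree = next(bllip.parse('I saw the man .'.split()))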
def demo():
"""This assumes the Python module bllipparser is installed."""
# sudo python -m nltk.downloader bllip_wsj_no_aux
from nltk.data import find
- model_dir = find("models/bllip_wsj_no_aux").path
+ model_dir = find('models/bllip_wsj_no_aux').path
-
- print("Loading BLLIP Parsing models...")
+ print('Loading BLLIP Parsing models...')
# the easiest way to get started is to use a unified model
bllip = BllipParser.from_unified_model_dir(model_dir)
- print("Done.")
+ print('Done.')
- sentence1 = "British left waffles on Falklands .".split()
- sentence2 = "I saw the man with the telescope .".split()
+ sentence1 = 'British left waffles on Falklands .'.split()
+ sentence2 = 'I saw the man with the telescope .'.split()
# this sentence is known to fail under the WSJ parsing model
- fail1 = "# ! ? : -".split()
+ fail1 = '# ! ? : -'.split()
for sentence in (sentence1, sentence2, fail1):
- print("Sentence: %r" % " ".join(sentence))
+ print('Sentence: %r' % ' '.join(sentence))
try:
tree = next(bllip.parse(sentence))
print(tree)
# n-best parsing demo
for i, parse in enumerate(bllip.parse(sentence1)):
- print("parse %d:\n%s" % (i, parse))
+ print('parse %d:\n%s' % (i, parse))
# using external POS tag constraints
- print(
- "forcing 'tree' to be 'NN':",
- next(bllip.tagged_parse([("A", None), ("tree", "NN")])),
- )
- print(
- "forcing 'A' to be 'DT' and 'tree' to be 'NNP':",
- next(bllip.tagged_parse([("A", "DT"), ("tree", "NNP")])),
- )
+ print("forcing 'tree' to be 'NN':",
+ next(bllip.tagged_parse([('A', None), ('tree', 'NN')])))
+ print("forcing 'A' to be 'DT' and 'tree' to be 'NNP':",
+ next(bllip.tagged_parse([('A', 'DT'), ('tree', 'NNP')])))
# constraints don't have to make sense... (though on more complicated
# sentences, they may cause the parse to fail)
- print(
- "forcing 'A' to be 'NNP':",
- next(bllip.tagged_parse([("A", "NNP"), ("tree", None)])),
- )
-
+ print("forcing 'A' to be 'NNP':",
+ next(bllip.tagged_parse([('A', 'NNP'), ('tree', None)])))
def setup_module(module):
from nose import SkipTest
try:
_ensure_bllip_import_or_error()
except ImportError:
- raise SkipTest(
- "doctests from nltk.parse.bllip are skipped because "
- "the bllipparser module is not installed"
- )
+ raise SkipTest('doctests from nltk.parse.bllip are skipped because '
+ 'the bllipparser module is not installed')
+
+
# -*- coding: utf-8 -*-
# Natural Language Toolkit: A Chart Parser
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# Jean Mark Gawron <gawron@mail.sdsu.edu>
- ``SteppingChartParser`` is a subclass of ``ChartParser`` that can
be used to step through the parsing process.
"""
+from __future__ import print_function, division, unicode_literals
import itertools
import re
import warnings
from functools import total_ordering
+from six.moves import range
+
from nltk.tree import Tree
from nltk.grammar import PCFG, is_nonterminal, is_terminal
from nltk.util import OrderedDict
from nltk.internals import raise_unorderable_types
+from nltk.compat import python_2_unicode_compatible, unicode_repr
from nltk.parse.api import ParserI
## Edges
########################################################################
-
@total_ordering
class EdgeI(object):
"""
The ``EdgeI`` interface provides a common interface to both types
of edge, allowing chart parsers to treat them in a uniform manner.
"""
-
def __init__(self):
if self.__class__ == EdgeI:
- raise TypeError("Edge is an abstract interface")
+ raise TypeError('Edge is an abstract interface')
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Span
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
def span(self):
"""
"""
raise NotImplementedError()
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Left Hand Side
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
def lhs(self):
"""
"""
raise NotImplementedError()
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Right Hand Side
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
def rhs(self):
"""
"""
raise NotImplementedError()
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Comparisons & hashing
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
def __eq__(self, other):
- return (
- self.__class__ is other.__class__
- and self._comparison_key == other._comparison_key
- )
+ return (self.__class__ is other.__class__ and
+ self._comparison_key == other._comparison_key)
def __ne__(self, other):
return not self == other
return self._hash
+@python_2_unicode_compatible
class TreeEdge(EdgeI):
"""
An edge that records the fact that a tree is (partially)
For more information about edges, see the ``EdgeI`` interface.
"""
-
def __init__(self, span, lhs, rhs, dot=0):
"""
Construct a new ``TreeEdge``.
:rtype: TreeEdge
"""
- return TreeEdge(
- span=(index, index), lhs=production.lhs(), rhs=production.rhs(), dot=0
- )
+ return TreeEdge(span=(index, index), lhs=production.lhs(),
+ rhs=production.rhs(), dot=0)
def move_dot_forward(self, new_end):
"""
:type new_end: int
:rtype: TreeEdge
"""
- return TreeEdge(
- span=(self._span[0], new_end),
- lhs=self._lhs,
- rhs=self._rhs,
- dot=self._dot + 1,
- )
+ return TreeEdge(span=(self._span[0], new_end),
+ lhs=self._lhs, rhs=self._rhs,
+ dot=self._dot+1)
# Accessors
- def lhs(self):
- return self._lhs
-
- def span(self):
- return self._span
-
- def start(self):
- return self._span[0]
-
- def end(self):
- return self._span[1]
-
- def length(self):
- return self._span[1] - self._span[0]
-
- def rhs(self):
- return self._rhs
-
- def dot(self):
- return self._dot
-
- def is_complete(self):
- return self._dot == len(self._rhs)
-
- def is_incomplete(self):
- return self._dot != len(self._rhs)
-
+ def lhs(self): return self._lhs
+ def span(self): return self._span
+ def start(self): return self._span[0]
+ def end(self): return self._span[1]
+ def length(self): return self._span[1] - self._span[0]
+ def rhs(self): return self._rhs
+ def dot(self): return self._dot
+ def is_complete(self): return self._dot == len(self._rhs)
+ def is_incomplete(self): return self._dot != len(self._rhs)
def nextsym(self):
- if self._dot >= len(self._rhs):
- return None
- else:
- return self._rhs[self._dot]
+ if self._dot >= len(self._rhs): return None
+ else: return self._rhs[self._dot]
# String representation
def __str__(self):
- str = "[%s:%s] " % (self._span[0], self._span[1])
- str += "%-2r ->" % (self._lhs,)
+ str = '[%s:%s] ' % (self._span[0], self._span[1])
+ str += '%-2r ->' % (self._lhs,)
for i in range(len(self._rhs)):
- if i == self._dot:
- str += " *"
- str += " %s" % repr(self._rhs[i])
- if len(self._rhs) == self._dot:
- str += " *"
+ if i == self._dot: str += ' *'
+ str += ' %s' % unicode_repr(self._rhs[i])
+ if len(self._rhs) == self._dot: str += ' *'
return str
def __repr__(self):
- return "[Edge: %s]" % self
+ return '[Edge: %s]' % self
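# Illustration: the dotted edge
#     TreeEdge(span=(1, 2), lhs=VP, rhs=(V, NP), dot=1)
# prints as "[1:2] VP -> V * NP": a VP starting at token 1 has found
# its V child; nextsym() is NP and is_incomplete() is True.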
+@python_2_unicode_compatible
class LeafEdge(EdgeI):
"""
An edge that records the fact that a leaf value is consistent with
side is ``()``. Its span is ``[index, index+1]``, and its dot
position is ``0``.
"""
-
def __init__(self, leaf, index):
"""
Construct a new ``LeafEdge``.
self._comparison_key = (leaf, index)
# Accessors
- def lhs(self):
- return self._leaf
-
- def span(self):
- return (self._index, self._index + 1)
-
- def start(self):
- return self._index
-
- def end(self):
- return self._index + 1
-
- def length(self):
- return 1
-
- def rhs(self):
- return ()
-
- def dot(self):
- return 0
-
- def is_complete(self):
- return True
-
- def is_incomplete(self):
- return False
-
- def nextsym(self):
- return None
+ def lhs(self): return self._leaf
+ def span(self): return (self._index, self._index+1)
+ def start(self): return self._index
+ def end(self): return self._index+1
+ def length(self): return 1
+ def rhs(self): return ()
+ def dot(self): return 0
+ def is_complete(self): return True
+ def is_incomplete(self): return False
+ def nextsym(self): return None
# String representations
def __str__(self):
- return "[%s:%s] %s" % (self._index, self._index + 1, repr(self._leaf))
-
+ return '[%s:%s] %s' % (self._index, self._index+1, unicode_repr(self._leaf))
def __repr__(self):
- return "[Edge: %s]" % (self)
-
+ return '[Edge: %s]' % (self)
########################################################################
## Chart
########################################################################
-
class Chart(object):
"""
A blackboard for hypotheses about the syntactic constituents of a
to indices, where each index maps the corresponding edge
attribute values to lists of edges.
"""
-
def __init__(self, tokens):
"""
Construct a new chart. The chart is initialized with the
# (used by select()).
self._indexes = {}
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Sentence Access
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
def num_leaves(self):
"""
"""
return self._tokens
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Edge access
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
def edges(self):
"""
:rtype: iter(EdgeI)
"""
# If there are no restrictions, then return all edges.
- if restrictions == {}:
- return iter(self._edges)
+ if restrictions=={}: return iter(self._edges)
# Find the index corresponding to the given restrictions.
restr_keys = sorted(restrictions.keys())
# Make sure it's a valid index.
for key in restr_keys:
if not hasattr(EdgeI, key):
- raise ValueError("Bad restriction: %s" % key)
+ raise ValueError('Bad restriction: %s' % key)
# Create the index.
index = self._indexes[restr_keys] = {}
vals = tuple(getattr(edge, key)() for key in restr_keys)
index.setdefault(vals, []).append(edge)
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Edge Insertion
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
def insert_with_backpointer(self, new_edge, previous_edge, child_edge):
"""
Add a new edge to the chart, using a pointer to the previous edge.
"""
cpls = self.child_pointer_lists(previous_edge)
- new_cpls = [cpl + (child_edge,) for cpl in cpls]
+ new_cpls = [cpl+(child_edge,) for cpl in cpls]
return self.insert(new_edge, *new_cpls)
def insert(self, edge, *child_pointer_lists):
def _append_edge(self, edge):
self._edges.append(edge)
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Tree extraction & child pointer lists
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
def parses(self, root, tree_class=Tree):
"""
# Get the set of child choices for each child pointer.
# child_choices[i] is the set of choices for the tree's
# ith child.
- child_choices = [self._trees(cp, complete, memo, tree_class) for cp in cpl]
+ child_choices = [self._trees(cp, complete, memo, tree_class)
+ for cp in cpl]
# For each combination of children, add a tree.
for children in itertools.product(*child_choices):
# If the edge is incomplete, then extend it with "partial trees":
if edge.is_incomplete():
- unexpanded = [tree_class(elt, []) for elt in edge.rhs()[edge.dot() :]]
+ unexpanded = [tree_class(elt,[])
+ for elt in edge.rhs()[edge.dot():]]
for tree in trees:
tree.extend(unexpanded)
# Make a copy, in case they modify it.
return self._edge_to_cpls.get(edge, {}).keys()
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Display
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
def pretty_format_edge(self, edge, width=None):
"""
Return a pretty-printed string representation of a given edge
:param width: The number of characters allotted to each
index in the sentence.
"""
- if width is None:
- width = 50 // (self.num_leaves() + 1)
+ if width is None: width = 50 // (self.num_leaves()+1)
(start, end) = (edge.start(), edge.end())
- str = "|" + ("." + " " * (width - 1)) * start
+ str = '|' + ('.'+' '*(width-1))*start
# Zero-width edges are "#" if complete, ">" if incomplete
if start == end:
- if edge.is_complete():
- str += "#"
- else:
- str += ">"
+ if edge.is_complete(): str += '#'
+ else: str += '>'
# Spanning complete edges are "[===]"; Other edges are
# "[---]" if complete, "[--->" if incomplete
- elif edge.is_complete() and edge.span() == (0, self._num_leaves):
- str += "[" + ("=" * width) * (end - start - 1) + "=" * (width - 1) + "]"
+ elif edge.is_complete() and edge.span() == (0,self._num_leaves):
+ str += '['+('='*width)*(end-start-1) + '='*(width-1)+']'
elif edge.is_complete():
- str += "[" + ("-" * width) * (end - start - 1) + "-" * (width - 1) + "]"
+ str += '['+('-'*width)*(end-start-1) + '-'*(width-1)+']'
else:
- str += "[" + ("-" * width) * (end - start - 1) + "-" * (width - 1) + ">"
+ str += '['+('-'*width)*(end-start-1) + '-'*(width-1)+'>'
- str += (" " * (width - 1) + ".") * (self._num_leaves - end)
- return str + "| %s" % edge
+ str += (' '*(width-1)+'.')*(self._num_leaves-end)
+ return str + '| %s' % edge
def pretty_format_leaves(self, width=None):
"""
chart's leaves. This string can be used as a header
for calls to ``pretty_format_edge``.
"""
- if width is None:
- width = 50 // (self.num_leaves() + 1)
+ if width is None: width = 50 // (self.num_leaves()+1)
- if self._tokens is not None and width > 1:
- header = "|."
+ if self._tokens is not None and width>1:
+ header = '|.'
for tok in self._tokens:
- header += tok[: width - 1].center(width - 1) + "."
- header += "|"
+ header += tok[:width-1].center(width-1)+'.'
+ header += '|'
else:
- header = ""
+ header = ''
return header
index in the sentence.
:rtype: str
"""
- if width is None:
- width = 50 // (self.num_leaves() + 1)
+ if width is None: width = 50 // (self.num_leaves()+1)
# sort edges: primary key=length, secondary key=start index.
# (and filter out the token edges)
edges = sorted([(e.length(), e.start(), e) for e in self])
- edges = [e for (_, _, e) in edges]
+ edges = [e for (_,_,e) in edges]
- return (
- self.pretty_format_leaves(width)
- + "\n"
- + "\n".join(self.pretty_format_edge(edge, width) for edge in edges)
- )
+ return (self.pretty_format_leaves(width) + '\n' +
+ '\n'.join(self.pretty_format_edge(edge, width) for edge in edges))
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Display: Dot (AT&T Graphviz)
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
def dot_digraph(self):
# Header
- s = "digraph nltk_chart {\n"
- # s += ' size="5,5";\n'
- s += " rankdir=LR;\n"
- s += " node [height=0.1,width=0.1];\n"
+ s = 'digraph nltk_chart {\n'
+ #s += ' size="5,5";\n'
+ s += ' rankdir=LR;\n'
+ s += ' node [height=0.1,width=0.1];\n'
s += ' node [style=filled, color="lightgray"];\n'
# Set up the nodes
for y in range(self.num_edges(), -1, -1):
if y == 0:
s += ' node [style=filled, color="black"];\n'
- for x in range(self.num_leaves() + 1):
- if y == 0 or (
- x <= self._edges[y - 1].start() or x >= self._edges[y - 1].end()
- ):
- s += ' %04d.%04d [label=""];\n' % (x, y)
+ for x in range(self.num_leaves()+1):
+ if y == 0 or (x <= self._edges[y-1].start() or
+ x >= self._edges[y-1].end()):
+ s += ' %04d.%04d [label=""];\n' % (x,y)
# Add a spacer
- s += " x [style=invis]; x->0000.0000 [style=invis];\n"
+ s += ' x [style=invis]; x->0000.0000 [style=invis];\n'
# Declare ranks.
- for x in range(self.num_leaves() + 1):
- s += " {rank=same;"
- for y in range(self.num_edges() + 1):
- if y == 0 or (
- x <= self._edges[y - 1].start() or x >= self._edges[y - 1].end()
- ):
- s += " %04d.%04d" % (x, y)
- s += "}\n"
+ for x in range(self.num_leaves()+1):
+ s += ' {rank=same;'
+ for y in range(self.num_edges()+1):
+ if y == 0 or (x <= self._edges[y-1].start() or
+ x >= self._edges[y-1].end()):
+ s += ' %04d.%04d' % (x,y)
+ s += '}\n'
# Add the leaves
- s += " edge [style=invis, weight=100];\n"
- s += " node [shape=plaintext]\n"
- s += " 0000.0000"
+ s += ' edge [style=invis, weight=100];\n'
+ s += ' node [shape=plaintext]\n'
+ s += ' 0000.0000'
for x in range(self.num_leaves()):
- s += "->%s->%04d.0000" % (self.leaf(x), x + 1)
- s += ";\n\n"
+ s += '->%s->%04d.0000' % (self.leaf(x), x+1)
+ s += ';\n\n'
# Add the edges
- s += " edge [style=solid, weight=1];\n"
+ s += ' edge [style=solid, weight=1];\n'
for y, edge in enumerate(self):
for x in range(edge.start()):
- s += ' %04d.%04d -> %04d.%04d [style="invis"];\n' % (
- x,
- y + 1,
- x + 1,
- y + 1,
- )
- s += ' %04d.%04d -> %04d.%04d [label="%s"];\n' % (
- edge.start(),
- y + 1,
- edge.end(),
- y + 1,
- edge,
- )
+ s += (' %04d.%04d -> %04d.%04d [style="invis"];\n' %
+ (x, y+1, x+1, y+1))
+ s += (' %04d.%04d -> %04d.%04d [label="%s"];\n' %
+ (edge.start(), y+1, edge.end(), y+1, edge))
for x in range(edge.end(), self.num_leaves()):
- s += ' %04d.%04d -> %04d.%04d [style="invis"];\n' % (
- x,
- y + 1,
- x + 1,
- y + 1,
- )
- s += "}\n"
+ s += (' %04d.%04d -> %04d.%04d [style="invis"];\n' %
+ (x, y+1, x+1, y+1))
+ s += '}\n'
return s
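# Usage sketch: render a chart with Graphviz --
#     with open('chart.dot', 'w') as out:
#         out.write(chart.dot_digraph())
#     # then, from a shell:  dot -Tpng chart.dot -o chart.png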
-
########################################################################
## Chart Rules
########################################################################
-
class ChartRuleI(object):
"""
A rule that specifies what new edges are licensed by any given set
to license new edges. Typically, this number ranges from zero
to two.
"""
-
def apply(self, chart, grammar, *edges):
"""
Return a generator that will add edges licensed by this rule
raise NotImplementedError()
+@python_2_unicode_compatible
class AbstractChartRule(ChartRuleI):
"""
An abstract base class for chart rules. ``AbstractChartRule``
for e1 in chart:
for e2 in chart:
for e3 in chart:
- for new_edge in self.apply(chart, grammar, e1, e2, e3):
+ for new_edge in self.apply(chart,grammar,e1,e2,e3):
yield new_edge
else:
- raise AssertionError("NUM_EDGES>3 is not currently supported")
+ raise AssertionError('NUM_EDGES>3 is not currently supported')
# Default: return a name based on the class name.
def __str__(self):
# Add spaces between InitialCapsWords.
- return re.sub("([a-z])([A-Z])", r"\1 \2", self.__class__.__name__)
-
+ return re.sub('([a-z])([A-Z])', r'\1 \2', self.__class__.__name__)
-# ////////////////////////////////////////////////////////////
+#////////////////////////////////////////////////////////////
# Fundamental Rule
-# ////////////////////////////////////////////////////////////
-
+#////////////////////////////////////////////////////////////
class FundamentalRule(AbstractChartRule):
"""
- ``[A -> alpha B * beta][i:j]``
"""
-
NUM_EDGES = 2
-
def apply(self, chart, grammar, left_edge, right_edge):
# Make sure the rule is applicable.
- if not (
- left_edge.is_incomplete()
- and right_edge.is_complete()
- and left_edge.end() == right_edge.start()
- and left_edge.nextsym() == right_edge.lhs()
- ):
+ if not (left_edge.is_incomplete() and
+ right_edge.is_complete() and
+ left_edge.end() == right_edge.start() and
+ left_edge.nextsym() == right_edge.lhs()):
return
# Construct the new edge.
if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
yield new_edge
-
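# Illustration: combining the incomplete edge [VP -> V * NP][1:2]
# with the complete edge [NP -> Det N *][2:4], the rule above licenses
# the new edge [VP -> V NP *][1:4], inserted with a backpointer to the
# NP edge.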
class SingleEdgeFundamentalRule(FundamentalRule):
"""
A rule that joins a given edge with adjacent edges in the chart,
:note: This is basically ``FundamentalRule``, with one edge left
unspecified.
"""
-
NUM_EDGES = 1
def apply(self, chart, grammar, edge):
yield new_edge
def _apply_complete(self, chart, grammar, right_edge):
- for left_edge in chart.select(
- end=right_edge.start(), is_complete=False, nextsym=right_edge.lhs()
- ):
+ for left_edge in chart.select(end=right_edge.start(),
+ is_complete=False,
+ nextsym=right_edge.lhs()):
new_edge = left_edge.move_dot_forward(right_edge.end())
if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
yield new_edge
def _apply_incomplete(self, chart, grammar, left_edge):
- for right_edge in chart.select(
- start=left_edge.end(), is_complete=True, lhs=left_edge.nextsym()
- ):
+ for right_edge in chart.select(start=left_edge.end(),
+ is_complete=True,
+ lhs=left_edge.nextsym()):
new_edge = left_edge.move_dot_forward(right_edge.end())
if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
yield new_edge
-
-# ////////////////////////////////////////////////////////////
+#////////////////////////////////////////////////////////////
# Inserting Terminal Leafs
-# ////////////////////////////////////////////////////////////
-
+#////////////////////////////////////////////////////////////
class LeafInitRule(AbstractChartRule):
- NUM_EDGES = 0
-
+ NUM_EDGES=0
def apply(self, chart, grammar):
for index in range(chart.num_leaves()):
new_edge = LeafEdge(chart.leaf(index), index)
if chart.insert(new_edge, ()):
yield new_edge
-
-# ////////////////////////////////////////////////////////////
+#////////////////////////////////////////////////////////////
# Top-Down Prediction
-# ////////////////////////////////////////////////////////////
-
+#////////////////////////////////////////////////////////////
class TopDownInitRule(AbstractChartRule):
"""
``[S -> \* alpha][0:i]`` is licensed for each grammar production
``S -> alpha``, where ``S`` is the grammar's start symbol.
"""
-
NUM_EDGES = 0
-
def apply(self, chart, grammar):
for prod in grammar.productions(lhs=grammar.start()):
new_edge = TreeEdge.from_production(prod, 0)
if chart.insert(new_edge, ()):
yield new_edge
-
class TopDownPredictRule(AbstractChartRule):
"""
A rule licensing edges corresponding to the grammar productions
:note: This rule corresponds to the Predictor Rule in Earley parsing.
"""
-
NUM_EDGES = 1
-
def apply(self, chart, grammar, edge):
- if edge.is_complete():
- return
+ if edge.is_complete(): return
for prod in grammar.productions(lhs=edge.nextsym()):
new_edge = TreeEdge.from_production(prod, edge.end())
if chart.insert(new_edge, ()):
yield new_edge
-
class CachedTopDownPredictRule(TopDownPredictRule):
"""
A cached version of ``TopDownPredictRule``. After the first time
If ``chart`` or ``grammar`` are changed, then the cache is flushed.
"""
-
def __init__(self):
TopDownPredictRule.__init__(self)
self._done = {}
def apply(self, chart, grammar, edge):
- if edge.is_complete():
- return
+ if edge.is_complete(): return
nextsym, index = edge.nextsym(), edge.end()
- if not is_nonterminal(nextsym):
- return
+ if not is_nonterminal(nextsym): return
# If we've already applied this rule to an edge with the same
# next & end, and the chart & grammar have not changed, then
# just return (no new edges to add).
- done = self._done.get((nextsym, index), (None, None))
- if done[0] is chart and done[1] is grammar:
- return
+ done = self._done.get((nextsym, index), (None,None))
+ if done[0] is chart and done[1] is grammar: return
# Add all the edges indicated by the top down expand rule.
for prod in grammar.productions(lhs=nextsym):
if prod.rhs():
first = prod.rhs()[0]
if is_terminal(first):
- if index >= chart.num_leaves() or first != chart.leaf(index):
- continue
+ if index >= chart.num_leaves() or first != chart.leaf(index): continue
new_edge = TreeEdge.from_production(prod, index)
if chart.insert(new_edge, ()):
# Record the fact that we've applied this rule.
self._done[nextsym, index] = (chart, grammar)
-
-# ////////////////////////////////////////////////////////////
+#////////////////////////////////////////////////////////////
# Bottom-Up Prediction
-# ////////////////////////////////////////////////////////////
-
+#////////////////////////////////////////////////////////////
class BottomUpPredictRule(AbstractChartRule):
"""
particular, this rule specifies that ``[A -> alpha \*]`` licenses
the edge ``[B -> \* A beta]`` for each grammar production ``B -> A beta``.
"""
-
NUM_EDGES = 1
-
def apply(self, chart, grammar, edge):
- if edge.is_incomplete():
- return
+ if edge.is_incomplete(): return
for prod in grammar.productions(rhs=edge.lhs()):
new_edge = TreeEdge.from_production(prod, edge.start())
if chart.insert(new_edge, ()):
yield new_edge
-
class BottomUpPredictCombineRule(BottomUpPredictRule):
"""
A rule licensing any edge corresponding to a production whose
:note: This is like ``BottomUpPredictRule``, but it also applies
the ``FundamentalRule`` to the resulting edge.
"""
-
NUM_EDGES = 1
-
def apply(self, chart, grammar, edge):
- if edge.is_incomplete():
- return
+ if edge.is_incomplete(): return
for prod in grammar.productions(rhs=edge.lhs()):
new_edge = TreeEdge(edge.span(), prod.lhs(), prod.rhs(), 1)
if chart.insert(new_edge, (edge,)):
yield new_edge
-
class EmptyPredictRule(AbstractChartRule):
"""
A rule that inserts all empty productions as passive edges,
in every position in the chart.
"""
-
NUM_EDGES = 0
-
def apply(self, chart, grammar):
for prod in grammar.productions(empty=True):
for index in range(chart.num_leaves() + 1):
## Filtered Bottom Up
########################################################################
-
class FilteredSingleEdgeFundamentalRule(SingleEdgeFundamentalRule):
def _apply_complete(self, chart, grammar, right_edge):
end = right_edge.end()
nexttoken = end < chart.num_leaves() and chart.leaf(end)
- for left_edge in chart.select(
- end=right_edge.start(), is_complete=False, nextsym=right_edge.lhs()
- ):
+ for left_edge in chart.select(end=right_edge.start(),
+ is_complete=False,
+ nextsym=right_edge.lhs()):
if _bottomup_filter(grammar, nexttoken, left_edge.rhs(), left_edge.dot()):
new_edge = left_edge.move_dot_forward(right_edge.end())
if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
yield new_edge
def _apply_incomplete(self, chart, grammar, left_edge):
- for right_edge in chart.select(
- start=left_edge.end(), is_complete=True, lhs=left_edge.nextsym()
- ):
+ for right_edge in chart.select(start=left_edge.end(),
+ is_complete=True,
+ lhs=left_edge.nextsym()):
end = right_edge.end()
nexttoken = end < chart.num_leaves() and chart.leaf(end)
if _bottomup_filter(grammar, nexttoken, left_edge.rhs(), left_edge.dot()):
if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
yield new_edge
-
class FilteredBottomUpPredictCombineRule(BottomUpPredictCombineRule):
def apply(self, chart, grammar, edge):
if edge.is_incomplete():
if chart.insert(new_edge, (edge,)):
yield new_edge
-
def _bottomup_filter(grammar, nexttoken, rhs, dot=0):
if len(rhs) <= dot + 1:
return True
## Generic Chart Parser
########################################################################
-TD_STRATEGY = [
- LeafInitRule(),
- TopDownInitRule(),
- CachedTopDownPredictRule(),
- SingleEdgeFundamentalRule(),
-]
-BU_STRATEGY = [
- LeafInitRule(),
- EmptyPredictRule(),
- BottomUpPredictRule(),
- SingleEdgeFundamentalRule(),
-]
-BU_LC_STRATEGY = [
- LeafInitRule(),
- EmptyPredictRule(),
- BottomUpPredictCombineRule(),
- SingleEdgeFundamentalRule(),
-]
-
-LC_STRATEGY = [
- LeafInitRule(),
- FilteredBottomUpPredictCombineRule(),
- FilteredSingleEdgeFundamentalRule(),
-]
-
+TD_STRATEGY = [LeafInitRule(),
+ TopDownInitRule(),
+ CachedTopDownPredictRule(),
+ SingleEdgeFundamentalRule()]
+BU_STRATEGY = [LeafInitRule(),
+ EmptyPredictRule(),
+ BottomUpPredictRule(),
+ SingleEdgeFundamentalRule()]
+BU_LC_STRATEGY = [LeafInitRule(),
+ EmptyPredictRule(),
+ BottomUpPredictCombineRule(),
+ SingleEdgeFundamentalRule()]
+
+LC_STRATEGY = [LeafInitRule(),
+ FilteredBottomUpPredictCombineRule(),
+ FilteredSingleEdgeFundamentalRule()]
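# A strategy is simply an ordered list of rule instances, so custom
# mixes can be assembled the same way (sketch):
#     MY_STRATEGY = [LeafInitRule(), TopDownInitRule(),
#                    TopDownPredictRule(), SingleEdgeFundamentalRule()]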
class ChartParser(ParserI):
"""
| Apply *rule* to any applicable edges in the chart.
| Return any complete parses in the chart
"""
-
- def __init__(
- self,
- grammar,
- strategy=BU_LC_STRATEGY,
- trace=0,
- trace_chart_width=50,
- use_agenda=True,
- chart_class=Chart,
- ):
+ def __init__(self, grammar, strategy=BU_LC_STRATEGY, trace=0,
+ trace_chart_width=50, use_agenda=True, chart_class=Chart):
"""
Create a new chart parser, that uses ``grammar`` to parse
texts.
return self._grammar
def _trace_new_edges(self, chart, rule, new_edges, trace, edge_width):
- if not trace:
- return
+ if not trace: return
print_rule_header = trace > 1
for edge in new_edges:
if print_rule_header:
- print("%s:" % rule)
+ print('%s:' % rule)
print_rule_header = False
print(chart.pretty_format_edge(edge, edge_width))
:type tokens: list(str)
:rtype: Chart
"""
- if trace is None:
- trace = self._trace
+ if trace is None: trace = self._trace
trace_new_edges = self._trace_new_edges
tokens = list(tokens)
# Width, for printing trace edges.
trace_edge_width = self._trace_chart_width // (chart.num_leaves() + 1)
- if trace:
- print(chart.pretty_format_leaves(trace_edge_width))
+ if trace: print(chart.pretty_format_leaves(trace_edge_width))
if self._use_agenda:
# Use an agenda-based algorithm.
chart = self.chart_parse(tokens)
return iter(chart.parses(self._grammar.start(), tree_class=tree_class))
-
class TopDownChartParser(ChartParser):
"""
A ``ChartParser`` using a top-down parsing strategy.
See ``ChartParser`` for more information.
"""
-
def __init__(self, grammar, **parser_args):
ChartParser.__init__(self, grammar, TD_STRATEGY, **parser_args)
-
class BottomUpChartParser(ChartParser):
"""
A ``ChartParser`` using a bottom-up parsing strategy.
See ``ChartParser`` for more information.
"""
-
def __init__(self, grammar, **parser_args):
if isinstance(grammar, PCFG):
- warnings.warn(
- "BottomUpChartParser only works for CFG, "
- "use BottomUpProbabilisticChartParser instead",
- category=DeprecationWarning,
- )
+ warnings.warn("BottomUpChartParser only works for CFG, "
+ "use BottomUpProbabilisticChartParser instead",
+ category=DeprecationWarning)
ChartParser.__init__(self, grammar, BU_STRATEGY, **parser_args)
-
class BottomUpLeftCornerChartParser(ChartParser):
"""
A ``ChartParser`` using a bottom-up left-corner parsing strategy.
This strategy is often more efficient than standard bottom-up.
See ``ChartParser`` for more information.
"""
-
def __init__(self, grammar, **parser_args):
ChartParser.__init__(self, grammar, BU_LC_STRATEGY, **parser_args)
-
class LeftCornerChartParser(ChartParser):
def __init__(self, grammar, **parser_args):
if not grammar.is_nonempty():
- raise ValueError(
- "LeftCornerParser only works for grammars " "without empty productions."
- )
+ raise ValueError("LeftCornerParser only works for grammars "
+ "without empty productions.")
ChartParser.__init__(self, grammar, LC_STRATEGY, **parser_args)
-
########################################################################
## Stepping Chart Parser
########################################################################
-
class SteppingChartParser(ChartParser):
"""
A ``ChartParser`` that allows you to step through the parsing
or chart has been changed. If so, then ``step`` must restart
the parsing algorithm.
"""
-
def __init__(self, grammar, strategy=[], trace=0):
self._chart = None
self._current_chartrule = None
self._restart = False
ChartParser.__init__(self, grammar, strategy, trace)
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Initialization
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
def initialize(self, tokens):
"Begin parsing the given tokens."
self._chart = Chart(list(tokens))
self._restart = True
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Stepping
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
def step(self):
"""
added with the current strategy and grammar.
"""
if self._chart is None:
- raise ValueError("Parser must be initialized first")
+ raise ValueError('Parser must be initialized first')
while True:
self._restart = False
- w = 50 // (self._chart.num_leaves() + 1)
+ w = 50 // (self._chart.num_leaves()+1)
for e in self._parse():
- if self._trace > 1:
- print(self._current_chartrule)
- if self._trace > 0:
- print(self._chart.pretty_format_edge(e, w))
+ if self._trace > 1: print(self._current_chartrule)
+ if self._trace > 0: print(self._chart.pretty_format_edge(e,w))
yield e
- if self._restart:
- break
+ if self._restart: break
else:
- yield None # No more edges.
+ yield None # No more edges.
def _parse(self):
"""
edges_added += 1
yield e
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Accessors
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
def strategy(self):
"Return the strategy used by this parser."
"Return the parse trees currently contained in the chart."
return self._chart.parses(self._grammar.start(), tree_class)
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Parser modification
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
def set_strategy(self, strategy):
"""
:param strategy: A list of rules that should be used to decide
what edges to add to the chart.
"""
- if strategy == self._strategy:
- return
- self._strategy = strategy[:] # Make a copy.
+ if strategy == self._strategy: return
+ self._strategy = strategy[:] # Make a copy.
self._restart = True
def set_grammar(self, grammar):
"Change the grammar used by the parser."
- if grammar is self._grammar:
- return
+ if grammar is self._grammar: return
self._grammar = grammar
self._restart = True
def set_chart(self, chart):
"Load a given chart into the chart parser."
- if chart is self._chart:
- return
+ if chart is self._chart: return
self._chart = chart
self._restart = True
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Standard parser methods
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
def parse(self, tokens, tree_class=Tree):
tokens = list(tokens)
# Step until no more edges are generated.
for e in self.step():
- if e is None:
- break
+ if e is None: break
# Return an iterator of complete parses.
return self.parses(tree_class=tree_class)
-
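# Usage sketch (see demo() below for the full version) -- stepping
# through a parse one edge at a time:
#
#     sp = SteppingChartParser(grammar, strategy=BU_STRATEGY)
#     sp.initialize('I saw John'.split())
#     for edge in sp.step():
#         if edge is None:
#             break    # no more edges can be added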
########################################################################
## Demo Code
########################################################################
-
def demo_grammar():
from nltk.grammar import CFG
-
- return CFG.fromstring(
- """
+ return CFG.fromstring("""
S -> NP VP
PP -> "with" NP
NP -> NP PP
Verb -> "saw"
Prep -> "with"
Prep -> "under"
-"""
- )
-
-
-def demo(
- choice=None,
- print_times=True,
- print_grammar=False,
- print_trees=True,
- trace=2,
- sent="I saw John with a dog with my cookie",
- numparses=5,
-):
+""")
+
+def demo(choice=None,
+ print_times=True, print_grammar=False,
+ print_trees=True, trace=2,
+ sent='I saw John with a dog with my cookie', numparses=5):
"""
A demonstration of the chart parsers.
"""
# Ask the user which parser to test,
# if the parser wasn't provided as an argument
if choice is None:
- print(" 1: Top-down chart parser")
- print(" 2: Bottom-up chart parser")
- print(" 3: Bottom-up left-corner chart parser")
- print(" 4: Left-corner chart parser with bottom-up filter")
- print(" 5: Stepping chart parser (alternating top-down & bottom-up)")
- print(" 6: All parsers")
- print("\nWhich parser (1-6)? ", end=" ")
+ print(' 1: Top-down chart parser')
+ print(' 2: Bottom-up chart parser')
+ print(' 3: Bottom-up left-corner chart parser')
+ print(' 4: Left-corner chart parser with bottom-up filter')
+ print(' 5: Stepping chart parser (alternating top-down & bottom-up)')
+ print(' 6: All parsers')
+ print('\nWhich parser (1-6)? ', end=' ')
choice = sys.stdin.readline().strip()
print()
choice = str(choice)
if choice not in "123456":
- print("Bad parser number")
+ print('Bad parser number')
return
# Keep track of how long each parser takes.
times = {}
- strategies = {
- "1": ("Top-down", TD_STRATEGY),
- "2": ("Bottom-up", BU_STRATEGY),
- "3": ("Bottom-up left-corner", BU_LC_STRATEGY),
- "4": ("Filtered left-corner", LC_STRATEGY),
- }
+ strategies = {'1': ('Top-down', TD_STRATEGY),
+ '2': ('Bottom-up', BU_STRATEGY),
+ '3': ('Bottom-up left-corner', BU_LC_STRATEGY),
+ '4': ('Filtered left-corner', LC_STRATEGY)}
choices = []
- if choice in strategies:
- choices = [choice]
- if choice == "6":
- choices = "1234"
+ if choice in strategies: choices = [choice]
+ if choice=='6': choices = "1234"
# Run the requested chart parser(s), except the stepping parser.
for strategy in choices:
chart = cp.chart_parse(tokens)
parses = list(chart.parses(grammar.start()))
- times[strategies[strategy][0]] = time.time() - t
+ times[strategies[strategy][0]] = time.time()-t
print("Nr edges in chart:", len(chart.edges()))
if numparses:
- assert len(parses) == numparses, "Not all parses found"
+ assert len(parses)==numparses, 'Not all parses found'
if print_trees:
- for tree in parses:
- print(tree)
+ for tree in parses: print(tree)
else:
print("Nr trees:", len(parses))
print()
cp = SteppingChartParser(grammar, trace=trace)
cp.initialize(tokens)
for i in range(5):
- print("*** SWITCH TO TOP DOWN")
+ print('*** SWITCH TO TOP DOWN')
cp.set_strategy(TD_STRATEGY)
for j, e in enumerate(cp.step()):
- if j > 20 or e is None:
- break
- print("*** SWITCH TO BOTTOM UP")
+ if j>20 or e is None: break
+ print('*** SWITCH TO BOTTOM UP')
cp.set_strategy(BU_STRATEGY)
for j, e in enumerate(cp.step()):
- if j > 20 or e is None:
- break
- times["Stepping"] = time.time() - t
+ if j>20 or e is None: break
+ times['Stepping'] = time.time()-t
print("Nr edges in chart:", len(cp.chart().edges()))
if numparses:
- assert len(list(cp.parses())) == numparses, "Not all parses found"
+ assert len(list(cp.parses()))==numparses, 'Not all parses found'
if print_trees:
- for tree in cp.parses():
- print(tree)
+ for tree in cp.parses(): print(tree)
else:
print("Nr trees:", len(list(cp.parses())))
print()
# Print the times of all parsers:
- if not (print_times and times):
- return
+ if not (print_times and times): return
print("* Parsing times")
print()
maxlen = max(len(key) for key in times)
- format = "%" + repr(maxlen) + "s parser: %6.3fsec"
+ format = '%' + repr(maxlen) + 's parser: %6.3fsec'
times_items = times.items()
- for (parser, t) in sorted(times_items, key=lambda a: a[1]):
+ for (parser, t) in sorted(times_items, key=lambda a:a[1]):
print(format % (parser, t))
-
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to the CoreNLP REST API.
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2016 NLTK Project
# Author: Dmitrijs Milajevs <dimazest@gmail.com>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import unicode_literals
+
import re
import json
import time
from nltk.internals import find_jar_iter, config_java, java, _java_options
-from nltk.tag.api import TaggerI
from nltk.parse.api import ParserI
from nltk.tokenize.api import TokenizerI
from nltk.parse.dependencygraph import DependencyGraph
from nltk.tree import Tree
-from unittest import skip
-
-_stanford_url = "http://stanfordnlp.github.io/CoreNLP/"
+_stanford_url = 'http://stanfordnlp.github.io/CoreNLP/'
class CoreNLPServerError(EnvironmentError):
def try_port(port=0):
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
- sock.bind(("", port))
+ sock.bind(('', port))
p = sock.getsockname()[1]
sock.close()
class CoreNLPServer(object):
- _MODEL_JAR_PATTERN = r"stanford-corenlp-(\d+)\.(\d+)\.(\d+)-models\.jar"
- _JAR = r"stanford-corenlp-(\d+)\.(\d+)\.(\d+)\.jar"
+ _MODEL_JAR_PATTERN = r'stanford-corenlp-(\d+)\.(\d+)\.(\d+)-models\.jar'
+ _JAR = r'stanford-corenlp-(\d+)\.(\d+)\.(\d+)\.jar'
def __init__(
- self,
- path_to_jar=None,
- path_to_models_jar=None,
- verbose=False,
- java_options=None,
- corenlp_options=None,
- port=None,
+ self, path_to_jar=None, path_to_models_jar=None, verbose=False,
+ java_options=None, corenlp_options=None, port=None,
):
if corenlp_options is None:
- corenlp_options = ["-preload", "tokenize,ssplit,pos,lemma,parse,depparse"]
+ corenlp_options = [
+ '-preload', 'tokenize,ssplit,pos,lemma,parse,depparse',
+ ]
- jars = list(
- find_jar_iter(
- self._JAR,
- path_to_jar,
- env_vars=("CORENLP",),
- searchpath=(),
- url=_stanford_url,
- verbose=verbose,
- is_regex=True,
- )
- )
+ jars = list(find_jar_iter(
+ self._JAR,
+ path_to_jar,
+ env_vars=('CORENLP', ),
+ searchpath=(),
+ url=_stanford_url,
+ verbose=verbose,
+ is_regex=True,
+ ))
# find the most recent code and model jar
- stanford_jar = max(jars, key=lambda model_name: re.match(self._JAR, model_name))
+ stanford_jar = max(
+ jars,
+ key=lambda model_name: re.match(self._JAR, model_name)
+ )
if port is None:
try:
else:
try_port(port)
- self.url = "http://localhost:{}".format(port)
+ self.url = 'http://localhost:{}'.format(port)
model_jar = max(
find_jar_iter(
self._MODEL_JAR_PATTERN,
path_to_models_jar,
- env_vars=("CORENLP_MODELS",),
+ env_vars=('CORENLP_MODELS', ),
searchpath=(),
url=_stanford_url,
verbose=verbose,
is_regex=True,
),
- key=lambda model_name: re.match(self._MODEL_JAR_PATTERN, model_name),
+ key=lambda model_name: re.match(self._MODEL_JAR_PATTERN, model_name)
)
self.verbose = verbose
self._classpath = stanford_jar, model_jar
self.corenlp_options = corenlp_options
- self.java_options = java_options or ["-mx2g"]
+ self.java_options = java_options or ['-mx2g']
- def start(self, stdout="devnull", stderr="devnull"):
- """ Starts the CoreNLP server
-
- :param stdout, stderr: Specifies where CoreNLP output is redirected. Valid values are 'devnull', 'stdout', 'pipe'
- """
+ def start(self):
import requests
- cmd = ["edu.stanford.nlp.pipeline.StanfordCoreNLPServer"]
+ cmd = ['edu.stanford.nlp.pipeline.StanfordCoreNLPServer']
if self.corenlp_options:
cmd.extend(self.corenlp_options)
# Configure java.
- default_options = " ".join(_java_options)
+ default_options = ' '.join(_java_options)
config_java(options=self.java_options, verbose=self.verbose)
try:
+ # TODO: it's probably a bad idea to pipe stdout, as it will
+ # accumulate when lots of text is being parsed.
self.popen = java(
cmd,
classpath=self._classpath,
blocking=False,
- stdout=stdout,
- stderr=stderr,
+ stdout='pipe',
+ stderr='pipe',
)
finally:
# Return java configurations to their default values.
_, stderrdata = self.popen.communicate()
raise CoreNLPServerError(
returncode,
- "Could not start the server. "
- "The error was: {}".format(stderrdata.decode("ascii")),
+ 'Could not start the server. '
+ 'The error was: {}'.format(stderrdata.decode('ascii'))
)
for i in range(30):
try:
- response = requests.get(requests.compat.urljoin(self.url, "live"))
+ response = requests.get(requests.compat.urljoin(self.url, 'live'))
except requests.exceptions.ConnectionError:
time.sleep(1)
else:
if response.ok:
break
else:
- raise CoreNLPServerError("Could not connect to the server.")
+ raise CoreNLPServerError(
+ 'Could not connect to the server.'
+ )
for i in range(60):
try:
- response = requests.get(requests.compat.urljoin(self.url, "ready"))
+ response = requests.get(requests.compat.urljoin(self.url, 'ready'))
except requests.exceptions.ConnectionError:
time.sleep(1)
else:
if response.ok:
break
else:
- raise CoreNLPServerError("The server is not ready.")
+ raise CoreNLPServerError(
+ 'The server is not ready.'
+ )
def stop(self):
self.popen.terminate()
return False
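# Hedged usage sketch for the server/parser pair above: assumes a local
# CoreNLP distribution (CORENLP and CORENLP_MODELS env vars pointing at
# the jars), a free port 9000, and the `requests` package installed.
from nltk.parse.corenlp import CoreNLPServer, CoreNLPParser

server = CoreNLPServer(port=9000)
server.start()  # polls the /live and /ready endpoints before returning
try:
    parser = CoreNLPParser(url='http://localhost:9000')
    tree = next(parser.raw_parse('The quick brown fox jumps over the lazy dog .'))
    tree.pprint()
finally:
    server.stop()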
-class GenericCoreNLPParser(ParserI, TokenizerI, TaggerI):
+class GenericCoreNLPParser(ParserI, TokenizerI):
"""Interface to the CoreNLP Parser."""
- def __init__(self, url="http://localhost:9000", encoding="utf8", tagtype=None):
+ def __init__(self, url='http://localhost:9000', encoding='utf8'):
import requests
self.url = url
self.encoding = encoding
- if tagtype not in ["pos", "ner", None]:
- raise ValueError("tagtype must be either 'pos', 'ner' or None")
-
- self.tagtype = tagtype
-
self.session = requests.Session()
def parse_sents(self, sentences, *args, **kwargs):
:rtype: iter(iter(Tree))
"""
# Converting list(list(str)) -> list(str)
- sentences = (" ".join(words) for words in sentences)
+ sentences = (' '.join(words) for words in sentences)
return self.raw_parse_sents(sentences, *args, **kwargs)
def raw_parse(self, sentence, properties=None, *args, **kwargs):
:type sentence: str
:rtype: iter(Tree)
"""
- default_properties = {"tokenize.whitespace": "false"}
+ default_properties = {
+ 'tokenize.whitespace': 'false',
+ }
default_properties.update(properties or {})
return next(
self.raw_parse_sents(
- [sentence], properties=default_properties, *args, **kwargs
+ [sentence],
+ properties=default_properties,
+ *args,
+ **kwargs
)
)
- def api_call(self, data, properties=None, timeout=60):
+ def api_call(self, data, properties=None):
default_properties = {
- "outputFormat": "json",
- "annotators": "tokenize,pos,lemma,ssplit,{parser_annotator}".format(
- parser_annotator=self.parser_annotator
+ 'outputFormat': 'json',
+ 'annotators': 'tokenize,pos,lemma,ssplit,{parser_annotator}'.format(
+ parser_annotator=self.parser_annotator,
),
}
response = self.session.post(
self.url,
- params={"properties": json.dumps(default_properties)},
+ params={
+ 'properties': json.dumps(default_properties),
+ },
data=data.encode(self.encoding),
- timeout=timeout,
+ timeout=60,
)
response.raise_for_status()
return response.json()
def raw_parse_sents(
- self, sentences, verbose=False, properties=None, *args, **kwargs
+ self,
+ sentences,
+ verbose=False,
+ properties=None,
+ *args,
+ **kwargs
):
"""Parse multiple sentences.
"""
default_properties = {
# Only splits on '\n', never inside the sentence.
- "ssplit.eolonly": "true"
+ 'ssplit.eolonly': 'true',
}
default_properties.update(properties or {})
tree = self.make_tree(parse)
yield iter([tree])
"""
- parsed_data = self.api_call("\n".join(sentences), properties=default_properties)
- for parsed_sent in parsed_data["sentences"]:
+ parsed_data = self.api_call('\n'.join(sentences), properties=default_properties)
+ for parsed_sent in parsed_data['sentences']:
tree = self.make_tree(parsed_sent)
yield iter([tree])
+
def parse_text(self, text, *args, **kwargs):
"""Parse a piece of text.
"""
parsed_data = self.api_call(text, *args, **kwargs)
- for parse in parsed_data["sentences"]:
+ for parse in parsed_data['sentences']:
yield self.make_tree(parse)
def tokenize(self, text, properties=None):
['The', 'color', 'of', 'the', 'wall', 'is', 'blue', '.']
"""
- default_properties = {"annotators": "tokenize,ssplit"}
+ default_properties = {
+ 'annotators': 'tokenize,ssplit',
+ }
default_properties.update(properties or {})
result = self.api_call(text, properties=default_properties)
- for sentence in result["sentences"]:
- for token in sentence["tokens"]:
- yield token["originalText"] or token["word"]
-
- def tag_sents(self, sentences):
- """
- Tag multiple sentences.
-
- Takes multiple sentences as a list where each sentence is a list of
- tokens.
-
- :param sentences: Input sentences to tag
- :type sentences: list(list(str))
- :rtype: list(list(tuple(str, str))
- """
- # Converting list(list(str)) -> list(str)
- sentences = (" ".join(words) for words in sentences)
- return [sentences[0] for sentences in self.raw_tag_sents(sentences)]
-
- def tag(self, sentence):
- """
- Tag a list of tokens.
-
- :rtype: list(tuple(str, str))
-
- >>> parser = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
- >>> tokens = 'Rami Eid is studying at Stony Brook University in NY'.split()
- >>> parser.tag(tokens)
- [('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'), ('at', 'O'), ('Stony', 'ORGANIZATION'),
- ('Brook', 'ORGANIZATION'), ('University', 'ORGANIZATION'), ('in', 'O'), ('NY', 'O')]
-
- >>> parser = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
- >>> tokens = "What is the airspeed of an unladen swallow ?".split()
- >>> parser.tag(tokens)
- [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'),
- ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'),
- ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
- """
- return self.tag_sents([sentence])[0]
-
- def raw_tag_sents(self, sentences):
- """
- Tag multiple sentences.
-
- Takes multiple sentences as a list where each sentence is a string.
-
- :param sentences: Input sentences to tag
- :type sentences: list(str)
- :rtype: list(list(list(tuple(str, str)))
- """
- default_properties = {
- "ssplit.isOneSentence": "true",
- "annotators": "tokenize,ssplit,",
- }
-
- # Supports only 'pos' or 'ner' tags.
- assert self.tagtype in ["pos", "ner"]
- default_properties["annotators"] += self.tagtype
- for sentence in sentences:
- tagged_data = self.api_call(sentence, properties=default_properties)
- yield [
- [
- (token["word"], token[self.tagtype])
- for token in tagged_sentence["tokens"]
- ]
- for tagged_sentence in tagged_data["sentences"]
- ]
+ for sentence in result['sentences']:
+ for token in sentence['tokens']:
+ yield token['originalText'] or token['word']
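# Sketch of the tokenize() generator above against an already running
# server (the URL is an assumption; the expected output is taken from
# the docstring):
from nltk.parse.corenlp import CoreNLPParser

parser = CoreNLPParser(url='http://localhost:9000')
print(list(parser.tokenize('The color of the wall is blue.')))
# ['The', 'color', 'of', 'the', 'wall', 'is', 'blue', '.']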
class CoreNLPParser(GenericCoreNLPParser):
"""
- _OUTPUT_FORMAT = "penn"
- parser_annotator = "parse"
+ _OUTPUT_FORMAT = 'penn'
+ parser_annotator = 'parse'
def make_tree(self, result):
- return Tree.fromstring(result["parse"])
+ return Tree.fromstring(result['parse'])
class CoreNLPDependencyParser(GenericCoreNLPParser):
"""
- _OUTPUT_FORMAT = "conll2007"
- parser_annotator = "depparse"
+ _OUTPUT_FORMAT = 'conll2007'
+ parser_annotator = 'depparse'
def make_tree(self, result):
return DependencyGraph(
(
- " ".join(n_items[1:]) # NLTK expects an iterable of strings...
+ ' '.join(n_items[1:]) # NLTK expects an iterable of strings...
for n_items in sorted(transform(result))
),
- cell_separator=" ", # To make sure that a non-breaking space is kept inside of a token.
+ cell_separator=' ', # To make sure that a non-breaking space is kept inside of a token.
)
def transform(sentence):
- for dependency in sentence["basicDependencies"]:
- dependent_index = dependency["dependent"]
- token = sentence["tokens"][dependent_index - 1]
+ for dependency in sentence['basicDependencies']:
+ dependent_index = dependency['dependent']
+ token = sentence['tokens'][dependent_index - 1]
# Return values that we don't know as '_'. Also, consider tag and ctag
# to be equal.
yield (
dependent_index,
- "_",
- token["word"],
- token["lemma"],
- token["pos"],
- token["pos"],
- "_",
- str(dependency["governor"]),
- dependency["dep"],
- "_",
- "_",
+ '_',
+ token['word'],
+ token['lemma'],
+ token['pos'],
+ token['pos'],
+ '_',
+ str(dependency['governor']),
+ dependency['dep'],
+ '_',
+ '_',
)
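# Sketch of the dependency path (a server with the depparse annotator
# is assumed): make_tree() feeds transform()'s CoNLL-style rows into a
# DependencyGraph, which can then be rendered back out as CoNLL.
from nltk.parse.corenlp import CoreNLPDependencyParser

dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
graph = next(dep_parser.raw_parse('The quick brown fox jumps over the lazy dog .'))
print(graph.to_conll(4))  # word / tag / head / rel columns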
-@skip("Skipping all CoreNLP tests.")
def setup_module(module):
from nose import SkipTest
global server
-
try:
server = CoreNLPServer(port=9000)
except LookupError as e:
- raise SkipTest("Could not instantiate CoreNLPServer.")
+ raise SkipTest('Could not instantiate CoreNLPServer.')
try:
server.start()
except CoreNLPServerError as e:
raise SkipTest(
- "Skipping CoreNLP tests because the server could not be started. "
- "Make sure that the 9000 port is free. "
- "{}".format(e.strerror)
+ 'Skipping CoreNLP tests because the server could not be started. '
+ 'Make sure that the 9000 port is free. '
+ '{}'.format(e.strerror)
)
-@skip("Skipping all CoreNLP tests.")
def teardown_module(module):
server.stop()
# Natural Language Toolkit: Dependency Grammars
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Jason Narad <jason.narad@gmail.com>
# Steven Bird <stevenbird1@gmail.com> (modifications)
#
The input is assumed to be in Malt-TAB format
(http://stp.lingfil.uu.se/~nivre/research/MaltXML.html).
"""
+from __future__ import print_function, unicode_literals
from collections import defaultdict
from itertools import chain
import subprocess
import warnings
+from six import string_types
+
from nltk.tree import Tree
+from nltk.compat import python_2_unicode_compatible
+
#################################################################
# DependencyGraph Class
#################################################################
+@python_2_unicode_compatible
class DependencyGraph(object):
"""
A container for the nodes and labelled edges of a dependency structure.
"""
- def __init__(
- self,
- tree_str=None,
- cell_extractor=None,
- zero_based=False,
- cell_separator=None,
- top_relation_label="ROOT",
- ):
+ def __init__(self, tree_str=None, cell_extractor=None, zero_based=False, cell_separator=None, top_relation_label='ROOT'):
"""Dependency graph.
We place a dummy `TOP` node with the index 0, since the root node is
identified as, for example, `ROOT`, `null` or `TOP`.
"""
- self.nodes = defaultdict(
- lambda: {
- "address": None,
- "word": None,
- "lemma": None,
- "ctag": None,
- "tag": None,
- "feats": None,
- "head": None,
- "deps": defaultdict(list),
- "rel": None,
+ self.nodes = defaultdict(lambda: {'address': None,
+ 'word': None,
+ 'lemma': None,
+ 'ctag': None,
+ 'tag': None,
+ 'feats': None,
+ 'head': None,
+ 'deps': defaultdict(list),
+ 'rel': None,
+ })
+
+ self.nodes[0].update(
+ {
+ 'ctag': 'TOP',
+ 'tag': 'TOP',
+ 'address': 0,
}
)
- self.nodes[0].update({"ctag": "TOP", "tag": "TOP", "address": 0})
-
self.root = None
if tree_str:
"""
for node in self.nodes.values():
new_deps = []
- for dep in node["deps"]:
+ for dep in node['deps']:
if dep in originals:
new_deps.append(redirect)
else:
new_deps.append(dep)
- node["deps"] = new_deps
+ node['deps'] = new_deps
def add_arc(self, head_address, mod_address):
"""
Adds an arc from the node specified by head_address to the
node specified by the mod address.
"""
- relation = self.nodes[mod_address]["rel"]
- self.nodes[head_address]["deps"].setdefault(relation, [])
- self.nodes[head_address]["deps"][relation].append(mod_address)
- # self.nodes[head_address]['deps'].append(mod_address)
+ relation = self.nodes[mod_address]['rel']
+ self.nodes[head_address]['deps'].setdefault(relation, [])
+ self.nodes[head_address]['deps'][relation].append(mod_address)
+ #self.nodes[head_address]['deps'].append(mod_address)
+
def connect_graph(self):
"""
"""
for node1 in self.nodes.values():
for node2 in self.nodes.values():
- if node1["address"] != node2["address"] and node2["rel"] != "TOP":
- relation = node2["rel"]
- node1["deps"].setdefault(relation, [])
- node1["deps"][relation].append(node2["address"])
- # node1['deps'].append(node2['address'])
+ if node1['address'] != node2['address'] and node2['rel'] != 'TOP':
+ relation = node2['rel']
+ node1['deps'].setdefault(relation, [])
+ node1['deps'][relation].append(node2['address'])
+ #node1['deps'].append(node2['address'])
def get_by_address(self, node_address):
"""Return the node with the given address."""
"""
# Start the digraph specification
- s = "digraph G{\n"
- s += "edge [dir=forward]\n"
- s += "node [shape=plaintext]\n"
+ s = 'digraph G{\n'
+ s += 'edge [dir=forward]\n'
+ s += 'node [shape=plaintext]\n'
# Draw the remaining nodes
- for node in sorted(self.nodes.values(), key=lambda v: v["address"]):
- s += '\n%s [label="%s (%s)"]' % (
- node["address"],
- node["address"],
- node["word"],
- )
- for rel, deps in node["deps"].items():
+ for node in sorted(self.nodes.values(), key=lambda v: v['address']):
+ s += '\n%s [label="%s (%s)"]' % (node['address'], node['address'], node['word'])
+ for rel, deps in node['deps'].items():
for dep in deps:
if rel is not None:
- s += '\n%s -> %s [label="%s"]' % (node["address"], dep, rel)
+ s += '\n%s -> %s [label="%s"]' % (node['address'], dep, rel)
else:
- s += "\n%s -> %s " % (node["address"], dep)
+ s += '\n%s -> %s ' % (node['address'], dep)
s += "\n}"
return s
try:
process = subprocess.Popen(
- ["dot", "-Tsvg"],
+ ['dot', '-Tsvg'],
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
universal_newlines=True,
)
except OSError:
- raise Exception("Cannot find the dot binary from Graphviz package")
+ raise Exception('Cannot find the dot binary from Graphviz package')
out, err = process.communicate(dot_string)
if err:
raise Exception(
- "Cannot create svg representation by running dot from string: {}"
- "".format(dot_string)
- )
+ 'Cannot create svg representation by running dot from string: {}'
+ ''.format(dot_string))
return out
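# Standalone sketch of the Graphviz round trip above; it requires the
# `dot` binary from Graphviz on PATH, and the digraph string is a toy.
import subprocess

dot_string = 'digraph G{\n0 -> 1 [label="SUB"]\n}'
process = subprocess.Popen(
    ['dot', '-Tsvg'],
    stdin=subprocess.PIPE,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    universal_newlines=True,
)
svg, err = process.communicate(dot_string)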
def __str__(self):
return "<DependencyGraph with {0} nodes>".format(len(self.nodes))
@staticmethod
- def load(
- filename, zero_based=False, cell_separator=None, top_relation_label="ROOT"
- ):
+ def load(filename, zero_based=False, cell_separator=None, top_relation_label='ROOT'):
"""
:param filename: a name of a file in Malt-TAB format
:param zero_based: nodes in the input file are numbered starting from 0
cell_separator=cell_separator,
top_relation_label=top_relation_label,
)
- for tree_str in infile.read().split("\n\n")
+ for tree_str in infile.read().split('\n\n')
]
def left_children(self, node_index):
Returns the number of left children under the node specified
by the given address.
"""
- children = chain.from_iterable(self.nodes[node_index]["deps"].values())
- index = self.nodes[node_index]["address"]
+ children = chain.from_iterable(self.nodes[node_index]['deps'].values())
+ index = self.nodes[node_index]['address']
return sum(1 for c in children if c < index)
def right_children(self, node_index):
Returns the number of right children under the node specified
by the given address.
"""
- children = chain.from_iterable(self.nodes[node_index]["deps"].values())
- index = self.nodes[node_index]["address"]
+ children = chain.from_iterable(self.nodes[node_index]['deps'].values())
+ index = self.nodes[node_index]['address']
return sum(1 for c in children if c > index)
def add_node(self, node):
- if not self.contains_address(node["address"]):
- self.nodes[node["address"]].update(node)
-
- def _parse(
- self,
- input_,
- cell_extractor=None,
- zero_based=False,
- cell_separator=None,
- top_relation_label="ROOT",
- ):
+ if not self.contains_address(node['address']):
+ self.nodes[node['address']].update(node)
+
+ def _parse(self, input_, cell_extractor=None, zero_based=False, cell_separator=None, top_relation_label='ROOT'):
"""Parse a sentence.
:param extractor: a function that given a tuple of cells returns a
def extract_3_cells(cells, index):
word, tag, head = cells
- return index, word, word, tag, tag, "", head, ""
+ return index, word, word, tag, tag, '', head, ''
def extract_4_cells(cells, index):
word, tag, head, rel = cells
- return index, word, word, tag, tag, "", head, rel
+ return index, word, word, tag, tag, '', head, rel
def extract_7_cells(cells, index):
line_index, word, lemma, tag, _, head, rel = cells
except ValueError:
# index can't be parsed as an integer, use default
pass
- return index, word, lemma, tag, tag, "", head, rel
+ return index, word, lemma, tag, tag, '', head, rel
def extract_10_cells(cells, index):
line_index, word, lemma, ctag, tag, feats, head, rel, _, _ = cells
10: extract_10_cells,
}
- if isinstance(input_, str):
- input_ = (line for line in input_.split("\n"))
+ if isinstance(input_, string_types):
+ input_ = (line for line in input_.split('\n'))
lines = (l.rstrip() for l in input_)
lines = (l for l in lines if l)
cell_extractor = extractors[cell_number]
except KeyError:
raise ValueError(
- "Number of tab-delimited fields ({0}) not supported by "
- "CoNLL(10) or Malt-Tab(4) format".format(cell_number)
+ 'Number of tab-delimited fields ({0}) not supported by '
+ 'CoNLL(10) or Malt-Tab(4) format'.format(cell_number)
)
try:
- index, word, lemma, ctag, tag, feats, head, rel = cell_extractor(
- cells, index
- )
+ index, word, lemma, ctag, tag, feats, head, rel = cell_extractor(cells, index)
except (TypeError, ValueError):
# cell_extractor doesn't take 2 arguments or doesn't return 8
# values; assume the cell_extractor is an older external
# extractor and doesn't accept or return an index.
word, lemma, ctag, tag, feats, head, rel = cell_extractor(cells)
- if head == "_":
+ if head == '_':
continue
head = int(head)
self.nodes[index].update(
{
- "address": index,
- "word": word,
- "lemma": lemma,
- "ctag": ctag,
- "tag": tag,
- "feats": feats,
- "head": head,
- "rel": rel,
+ 'address': index,
+ 'word': word,
+ 'lemma': lemma,
+ 'ctag': ctag,
+ 'tag': tag,
+ 'feats': feats,
+ 'head': head,
+ 'rel': rel,
}
)
# Make sure that the fake root node has labeled dependencies.
if (cell_number == 3) and (head == 0):
rel = top_relation_label
- self.nodes[head]["deps"][rel].append(index)
+ self.nodes[head]['deps'][rel].append(index)
- if self.nodes[0]["deps"][top_relation_label]:
- root_address = self.nodes[0]["deps"][top_relation_label][0]
+ if self.nodes[0]['deps'][top_relation_label]:
+ root_address = self.nodes[0]['deps'][top_relation_label][0]
self.root = self.nodes[root_address]
self.top_relation_label = top_relation_label
else:
warnings.warn(
- "The graph doesn't contain a node " "that depends on the root element."
+ "The graph doesn't contain a node "
+ "that depends on the root element."
)
def _word(self, node, filter=True):
- w = node["word"]
+ w = node['word']
if filter:
- if w != ",":
+ if w != ',':
return w
return w
:return: either a word (if the indexed node is a leaf) or a ``Tree``.
"""
node = self.get_by_address(i)
- word = node["word"]
- deps = sorted(chain.from_iterable(node["deps"].values()))
+ word = node['word']
+ deps = sorted(chain.from_iterable(node['deps'].values()))
if deps:
return Tree(word, [self._tree(dep) for dep in deps])
"""
node = self.root
- word = node["word"]
- deps = sorted(chain.from_iterable(node["deps"].values()))
+ word = node['word']
+ deps = sorted(chain.from_iterable(node['deps'].values()))
return Tree(word, [self._tree(dep) for dep in deps])
def triples(self, node=None):
if not node:
node = self.root
- head = (node["word"], node["ctag"])
- for i in sorted(chain.from_iterable(node["deps"].values())):
+ head = (node['word'], node['ctag'])
+ for i in sorted(chain.from_iterable(node['deps'].values())):
dep = self.get_by_address(i)
- yield (head, dep["rel"], (dep["word"], dep["ctag"]))
+ yield (head, dep['rel'], (dep['word'], dep['ctag']))
for triple in self.triples(node=dep):
yield triple
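# Shape sketch for triples(): each item is
# ((head_word, head_ctag), rel, (dep_word, dep_ctag)), yielded
# depth-first from the root (`dg` is an assumed DependencyGraph):
for head, rel, dep in dg.triples():
    print(head, rel, dep)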
def _hd(self, i):
try:
- return self.nodes[i]["head"]
+ return self.nodes[i]['head']
except IndexError:
return None
def _rel(self, i):
try:
- return self.nodes[i]["rel"]
+ return self.nodes[i]['rel']
except IndexError:
return None
distances = {}
for node in self.nodes.values():
- for dep in node["deps"]:
- key = tuple([node["address"], dep])
+ for dep in node['deps']:
+ key = tuple([node['address'], dep])
distances[key] = 1
for _ in self.nodes:
return False # return []?
def get_cycle_path(self, curr_node, goal_node_index):
- for dep in curr_node["deps"]:
+ for dep in curr_node['deps']:
if dep == goal_node_index:
- return [curr_node["address"]]
- for dep in curr_node["deps"]:
+ return [curr_node['address']]
+ for dep in curr_node['deps']:
path = self.get_cycle_path(self.get_by_address(dep), goal_node_index)
if len(path) > 0:
- path.insert(0, curr_node["address"])
+ path.insert(0, curr_node['address'])
return path
return []
"""
if style == 3:
- template = "{word}\t{tag}\t{head}\n"
+ template = '{word}\t{tag}\t{head}\n'
elif style == 4:
- template = "{word}\t{tag}\t{head}\t{rel}\n"
+ template = '{word}\t{tag}\t{head}\t{rel}\n'
elif style == 10:
- template = (
- "{i}\t{word}\t{lemma}\t{ctag}\t{tag}\t{feats}\t{head}\t{rel}\t_\t_\n"
- )
+ template = '{i}\t{word}\t{lemma}\t{ctag}\t{tag}\t{feats}\t{head}\t{rel}\t_\t_\n'
else:
raise ValueError(
- "Number of tab-delimited fields ({0}) not supported by "
- "CoNLL(10) or Malt-Tab(4) format".format(style)
+ 'Number of tab-delimited fields ({0}) not supported by '
+ 'CoNLL(10) or Malt-Tab(4) format'.format(style)
)
- return "".join(
- template.format(i=i, **node)
- for i, node in sorted(self.nodes.items())
- if node["tag"] != "TOP"
- )
+ return ''.join(template.format(i=i, **node) for i, node in sorted(self.nodes.items()) if node['tag'] != 'TOP')
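# Round-trip sketch for the Malt-TAB parsing and to_conll() above,
# using a toy three-token sentence in the 4-field format (word, tag,
# head, rel):
from nltk.parse import DependencyGraph

dg = DependencyGraph('John NNP 2 SUB\nloves VBZ 0 ROOT\nMary NNP 2 OBJ\n')
print(dg.tree())       # (loves John Mary)
print(dg.to_conll(4))  # every node except the dummy TOP, tab-separated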
def nx_graph(self):
"""Convert the data in a ``nodelist`` into a networkx labeled directed graph."""
nx_nodelist = list(range(1, len(self.nodes)))
nx_edgelist = [
- (n, self._hd(n), self._rel(n)) for n in nx_nodelist if self._hd(n)
+ (n, self._hd(n), self._rel(n))
+ for n in nx_nodelist if self._hd(n)
]
self.nx_labels = {}
for n in nx_nodelist:
- self.nx_labels[n] = self.nodes[n]["word"]
+ self.nx_labels[n] = self.nodes[n]['word']
g = networkx.MultiDiGraph()
g.add_nodes_from(nx_nodelist)
A demonstration of the result of reading a dependency
version of the first sentence of the Penn Treebank.
"""
- dg = DependencyGraph(
- """Pierre NNP 2 NMOD
+ dg = DependencyGraph("""Pierre NNP 2 NMOD
Vinken NNP 8 SUB
, , 2 P
61 CD 5 NMOD
Nov. NNP 9 VMOD
29 CD 16 NMOD
. . 9 VMOD
-"""
- )
+""")
tree = dg.tree()
tree.pprint()
if nx:
networkx.draw_networkx_labels(g, pos, dg.nx_labels)
pylab.xticks([])
pylab.yticks([])
- pylab.savefig("tree.png")
+ pylab.savefig('tree.png')
pylab.show()
def conll_file_demo():
- print("Mass conll_read demo...")
- graphs = [DependencyGraph(entry) for entry in conll_data2.split("\n\n") if entry]
+ print('Mass conll_read demo...')
+ graphs = [DependencyGraph(entry)
+ for entry in conll_data2.split('\n\n') if entry]
for graph in graphs:
tree = graph.tree()
- print("\n")
+ print('\n')
tree.pprint()
dg = DependencyGraph(treebank_data)
print(dg.contains_cycle())
cyclic_dg = DependencyGraph()
- cyclic_dg.add_node({"word": None, "deps": [1], "rel": "TOP", "address": 0})
- cyclic_dg.add_node({"word": None, "deps": [2], "rel": "NTOP", "address": 1})
- cyclic_dg.add_node({"word": None, "deps": [4], "rel": "NTOP", "address": 2})
- cyclic_dg.add_node({"word": None, "deps": [1], "rel": "NTOP", "address": 3})
- cyclic_dg.add_node({"word": None, "deps": [3], "rel": "NTOP", "address": 4})
+ cyclic_dg.add_node({'word': None, 'deps': [1], 'rel': 'TOP', 'address': 0})
+ cyclic_dg.add_node({'word': None, 'deps': [2], 'rel': 'NTOP', 'address': 1})
+ cyclic_dg.add_node({'word': None, 'deps': [4], 'rel': 'NTOP', 'address': 2})
+ cyclic_dg.add_node({'word': None, 'deps': [1], 'rel': 'NTOP', 'address': 3})
+ cyclic_dg.add_node({'word': None, 'deps': [3], 'rel': 'NTOP', 'address': 4})
print(cyclic_dg.contains_cycle())
-
treebank_data = """Pierre NNP 2 NMOD
Vinken NNP 8 SUB
, , 2 P
16 . . Punc Punc punt 15 punct _ _
"""
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# -*- coding: utf-8 -*-
# Natural Language Toolkit: An Incremental Earley Chart Parser
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
# Rob Speer <rspeer@mit.edu>
# Edward Loper <edloper@gmail.com>
The main parser class is ``EarleyChartParser``, which is a top-down
algorithm, originally formulated by Jay Earley (1970).
"""
-
-from time import perf_counter
-
-from nltk.parse.chart import (
- Chart,
- ChartParser,
- EdgeI,
- LeafEdge,
- LeafInitRule,
- BottomUpPredictRule,
- BottomUpPredictCombineRule,
- TopDownInitRule,
- SingleEdgeFundamentalRule,
- EmptyPredictRule,
- CachedTopDownPredictRule,
- FilteredSingleEdgeFundamentalRule,
- FilteredBottomUpPredictCombineRule,
-)
-from nltk.parse.featurechart import (
- FeatureChart,
- FeatureChartParser,
- FeatureTopDownInitRule,
- FeatureTopDownPredictRule,
- FeatureEmptyPredictRule,
- FeatureBottomUpPredictRule,
- FeatureBottomUpPredictCombineRule,
- FeatureSingleEdgeFundamentalRule,
-)
-
-# ////////////////////////////////////////////////////////////
+from __future__ import print_function, division
+
+from six.moves import range
+
+from nltk.parse.chart import (Chart, ChartParser, EdgeI, LeafEdge, LeafInitRule,
+ BottomUpPredictRule, BottomUpPredictCombineRule,
+ TopDownInitRule, SingleEdgeFundamentalRule,
+ EmptyPredictRule,
+ CachedTopDownPredictRule,
+ FilteredSingleEdgeFundamentalRule,
+ FilteredBottomUpPredictCombineRule)
+from nltk.parse.featurechart import (FeatureChart, FeatureChartParser,
+ FeatureTopDownInitRule,
+ FeatureTopDownPredictRule,
+ FeatureEmptyPredictRule,
+ FeatureBottomUpPredictRule,
+ FeatureBottomUpPredictCombineRule,
+ FeatureSingleEdgeFundamentalRule)
+
+#////////////////////////////////////////////////////////////
# Incremental Chart
-# ////////////////////////////////////////////////////////////
-
+#////////////////////////////////////////////////////////////
class IncrementalChart(Chart):
def initialize(self):
edgelist = self._edgelists[end]
# If there are no restrictions, then return all edges.
- if restrictions == {}:
- return iter(edgelist)
+ if restrictions=={}: return iter(edgelist)
# Find the index corresponding to the given restrictions.
restr_keys = sorted(restrictions.keys())
# Make sure it's a valid index.
for key in restr_keys:
if not hasattr(EdgeI, key):
- raise ValueError("Bad restriction: %s" % key)
+ raise ValueError('Bad restriction: %s' % key)
# Create the index.
index = self._indexes[restr_keys] = tuple({} for x in self._positions())
edgelist = self._edgelists[end]
# If there are no restrictions, then return all edges.
- if restrictions == {}:
- return iter(edgelist)
+ if restrictions=={}: return iter(edgelist)
# Find the index corresponding to the given restrictions.
restr_keys = sorted(restrictions.keys())
if restr_keys not in self._indexes:
self._add_index(restr_keys)
- vals = tuple(
- self._get_type_if_possible(restrictions[key]) for key in restr_keys
- )
+ vals = tuple(self._get_type_if_possible(restrictions[key])
+ for key in restr_keys)
return iter(self._indexes[restr_keys][end].get(vals, []))
def _add_index(self, restr_keys):
# Make sure it's a valid index.
for key in restr_keys:
if not hasattr(EdgeI, key):
- raise ValueError("Bad restriction: %s" % key)
+ raise ValueError('Bad restriction: %s' % key)
# Create the index.
index = self._indexes[restr_keys] = tuple({} for x in self._positions())
for end, edgelist in enumerate(self._edgelists):
this_index = index[end]
for edge in edgelist:
- vals = tuple(
- self._get_type_if_possible(getattr(edge, key)())
- for key in restr_keys
- )
+ vals = tuple(self._get_type_if_possible(getattr(edge, key)())
+ for key in restr_keys)
this_index.setdefault(vals, []).append(edge)
def _register_with_indexes(self, edge):
end = edge.end()
for (restr_keys, index) in self._indexes.items():
- vals = tuple(
- self._get_type_if_possible(getattr(edge, key)()) for key in restr_keys
- )
+ vals = tuple(self._get_type_if_possible(getattr(edge, key)())
+ for key in restr_keys)
index[end].setdefault(vals, []).append(edge)
-
-# ////////////////////////////////////////////////////////////
+#////////////////////////////////////////////////////////////
# Incremental CFG Rules
-# ////////////////////////////////////////////////////////////
-
+#////////////////////////////////////////////////////////////
class CompleteFundamentalRule(SingleEdgeFundamentalRule):
def _apply_incomplete(self, chart, grammar, left_edge):
end = left_edge.end()
# When the chart is incremental, we only have to look for
# empty complete edges here.
- for right_edge in chart.select(
- start=end, end=end, is_complete=True, lhs=left_edge.nextsym()
- ):
+ for right_edge in chart.select(start=end, end=end,
+ is_complete=True,
+ lhs=left_edge.nextsym()):
new_edge = left_edge.move_dot_forward(right_edge.end())
if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
yield new_edge
-
class CompleterRule(CompleteFundamentalRule):
_fundamental_rule = CompleteFundamentalRule()
-
def apply(self, chart, grammar, edge):
if not isinstance(edge, LeafEdge):
for new_edge in self._fundamental_rule.apply(chart, grammar, edge):
yield new_edge
-
class ScannerRule(CompleteFundamentalRule):
_fundamental_rule = CompleteFundamentalRule()
-
def apply(self, chart, grammar, edge):
if isinstance(edge, LeafEdge):
for new_edge in self._fundamental_rule.apply(chart, grammar, edge):
yield new_edge
-
class PredictorRule(CachedTopDownPredictRule):
pass
-
class FilteredCompleteFundamentalRule(FilteredSingleEdgeFundamentalRule):
def apply(self, chart, grammar, edge):
# Since the Filtered rule only works for grammars without empty productions,
for new_edge in self._apply_complete(chart, grammar, edge):
yield new_edge
-
-# ////////////////////////////////////////////////////////////
+#////////////////////////////////////////////////////////////
# Incremental FCFG Rules
-# ////////////////////////////////////////////////////////////
-
+#////////////////////////////////////////////////////////////
class FeatureCompleteFundamentalRule(FeatureSingleEdgeFundamentalRule):
def _apply_incomplete(self, chart, grammar, left_edge):
end = left_edge.end()
# When the chart is incremental, we only have to look for
# empty complete edges here.
- for right_edge in chart.select(
- start=end, end=end, is_complete=True, lhs=left_edge.nextsym()
- ):
+ for right_edge in chart.select(start=end, end=end,
+ is_complete=True,
+ lhs=left_edge.nextsym()):
for new_edge in fr.apply(chart, grammar, left_edge, right_edge):
yield new_edge
-
class FeatureCompleterRule(CompleterRule):
_fundamental_rule = FeatureCompleteFundamentalRule()
-
class FeatureScannerRule(ScannerRule):
_fundamental_rule = FeatureCompleteFundamentalRule()
-
class FeaturePredictorRule(FeatureTopDownPredictRule):
pass
-
-# ////////////////////////////////////////////////////////////
+#////////////////////////////////////////////////////////////
# Incremental CFG Chart Parsers
-# ////////////////////////////////////////////////////////////
-
-EARLEY_STRATEGY = [
- LeafInitRule(),
- TopDownInitRule(),
- CompleterRule(),
- ScannerRule(),
- PredictorRule(),
-]
-TD_INCREMENTAL_STRATEGY = [
- LeafInitRule(),
- TopDownInitRule(),
- CachedTopDownPredictRule(),
- CompleteFundamentalRule(),
-]
-BU_INCREMENTAL_STRATEGY = [
- LeafInitRule(),
- EmptyPredictRule(),
- BottomUpPredictRule(),
- CompleteFundamentalRule(),
-]
-BU_LC_INCREMENTAL_STRATEGY = [
- LeafInitRule(),
- EmptyPredictRule(),
- BottomUpPredictCombineRule(),
- CompleteFundamentalRule(),
-]
-
-LC_INCREMENTAL_STRATEGY = [
- LeafInitRule(),
- FilteredBottomUpPredictCombineRule(),
- FilteredCompleteFundamentalRule(),
-]
-
+#////////////////////////////////////////////////////////////
+
+EARLEY_STRATEGY = [LeafInitRule(),
+ TopDownInitRule(),
+ CompleterRule(),
+ ScannerRule(),
+ PredictorRule()]
+TD_INCREMENTAL_STRATEGY = [LeafInitRule(),
+ TopDownInitRule(),
+ CachedTopDownPredictRule(),
+ CompleteFundamentalRule()]
+BU_INCREMENTAL_STRATEGY = [LeafInitRule(),
+ EmptyPredictRule(),
+ BottomUpPredictRule(),
+ CompleteFundamentalRule()]
+BU_LC_INCREMENTAL_STRATEGY = [LeafInitRule(),
+ EmptyPredictRule(),
+ BottomUpPredictCombineRule(),
+ CompleteFundamentalRule()]
+
+LC_INCREMENTAL_STRATEGY = [LeafInitRule(),
+ FilteredBottomUpPredictCombineRule(),
+ FilteredCompleteFundamentalRule()]
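# Strategy sketch: a strategy is just an ordered rule list; rules with
# NUM_EDGES == 0 act as axioms that seed the chart, and NUM_EDGES == 1
# rules fire per edge inside chart_parse()'s per-position agenda
# (`grammar` is an assumed nltk CFG covering the toy sentence):
parser = IncrementalChartParser(grammar, strategy=EARLEY_STRATEGY)
chart = parser.chart_parse('I saw John'.split())
trees = list(chart.parses(grammar.start()))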
class IncrementalChartParser(ChartParser):
"""
| Apply CompleterRule to edge
| Return any complete parses in the chart
"""
-
- def __init__(
- self,
- grammar,
- strategy=BU_LC_INCREMENTAL_STRATEGY,
- trace=0,
- trace_chart_width=50,
- chart_class=IncrementalChart,
- ):
+ def __init__(self, grammar, strategy=BU_LC_INCREMENTAL_STRATEGY,
+ trace=0, trace_chart_width=50,
+ chart_class=IncrementalChart):
"""
Create a new Earley chart parser, that uses ``grammar`` to
parse texts.
elif rule.NUM_EDGES == 1:
self._inference_rules.append(rule)
else:
- raise ValueError(
- "Incremental inference rules must have " "NUM_EDGES == 0 or 1"
- )
+ raise ValueError("Incremental inference rules must have "
+ "NUM_EDGES == 0 or 1")
def chart_parse(self, tokens, trace=None):
- if trace is None:
- trace = self._trace
+ if trace is None: trace = self._trace
trace_new_edges = self._trace_new_edges
tokens = list(tokens)
# Width, for printing trace edges.
trace_edge_width = self._trace_chart_width // (chart.num_leaves() + 1)
- if trace:
- print(chart.pretty_format_leaves(trace_edge_width))
+ if trace: print(chart.pretty_format_leaves(trace_edge_width))
for axiom in self._axioms:
new_edges = list(axiom.apply(chart, grammar))
trace_new_edges(chart, axiom, new_edges, trace, trace_edge_width)
inference_rules = self._inference_rules
- for end in range(chart.num_leaves() + 1):
- if trace > 1:
- print("\n* Processing queue:", end, "\n")
+ for end in range(chart.num_leaves()+1):
+ if trace > 1: print("\n* Processing queue:", end, "\n")
agenda = list(chart.select(end=end))
while agenda:
edge = agenda.pop()
new_edges = list(rule.apply(chart, grammar, edge))
trace_new_edges(chart, rule, new_edges, trace, trace_edge_width)
for new_edge in new_edges:
- if new_edge.end() == end:
+ if new_edge.end()==end:
agenda.append(new_edge)
return chart
-
class EarleyChartParser(IncrementalChartParser):
def __init__(self, grammar, **parser_args):
IncrementalChartParser.__init__(self, grammar, EARLEY_STRATEGY, **parser_args)
-
+ pass
class IncrementalTopDownChartParser(IncrementalChartParser):
def __init__(self, grammar, **parser_args):
- IncrementalChartParser.__init__(
- self, grammar, TD_INCREMENTAL_STRATEGY, **parser_args
- )
-
+ IncrementalChartParser.__init__(self, grammar, TD_INCREMENTAL_STRATEGY, **parser_args)
class IncrementalBottomUpChartParser(IncrementalChartParser):
def __init__(self, grammar, **parser_args):
- IncrementalChartParser.__init__(
- self, grammar, BU_INCREMENTAL_STRATEGY, **parser_args
- )
-
+ IncrementalChartParser.__init__(self, grammar, BU_INCREMENTAL_STRATEGY, **parser_args)
class IncrementalBottomUpLeftCornerChartParser(IncrementalChartParser):
def __init__(self, grammar, **parser_args):
- IncrementalChartParser.__init__(
- self, grammar, BU_LC_INCREMENTAL_STRATEGY, **parser_args
- )
-
+ IncrementalChartParser.__init__(self, grammar, BU_LC_INCREMENTAL_STRATEGY, **parser_args)
class IncrementalLeftCornerChartParser(IncrementalChartParser):
def __init__(self, grammar, **parser_args):
if not grammar.is_nonempty():
- raise ValueError(
- "IncrementalLeftCornerParser only works for grammars "
- "without empty productions."
- )
- IncrementalChartParser.__init__(
- self, grammar, LC_INCREMENTAL_STRATEGY, **parser_args
- )
+ raise ValueError("IncrementalLeftCornerParser only works for grammars "
+ "without empty productions.")
+ IncrementalChartParser.__init__(self, grammar, LC_INCREMENTAL_STRATEGY, **parser_args)
-
-# ////////////////////////////////////////////////////////////
+#////////////////////////////////////////////////////////////
# Incremental FCFG Chart Parsers
-# ////////////////////////////////////////////////////////////
-
-EARLEY_FEATURE_STRATEGY = [
- LeafInitRule(),
- FeatureTopDownInitRule(),
- FeatureCompleterRule(),
- FeatureScannerRule(),
- FeaturePredictorRule(),
-]
-TD_INCREMENTAL_FEATURE_STRATEGY = [
- LeafInitRule(),
- FeatureTopDownInitRule(),
- FeatureTopDownPredictRule(),
- FeatureCompleteFundamentalRule(),
-]
-BU_INCREMENTAL_FEATURE_STRATEGY = [
- LeafInitRule(),
- FeatureEmptyPredictRule(),
- FeatureBottomUpPredictRule(),
- FeatureCompleteFundamentalRule(),
-]
-BU_LC_INCREMENTAL_FEATURE_STRATEGY = [
- LeafInitRule(),
- FeatureEmptyPredictRule(),
- FeatureBottomUpPredictCombineRule(),
- FeatureCompleteFundamentalRule(),
-]
-
+#////////////////////////////////////////////////////////////
+
+EARLEY_FEATURE_STRATEGY = [LeafInitRule(),
+ FeatureTopDownInitRule(),
+ FeatureCompleterRule(),
+ FeatureScannerRule(),
+ FeaturePredictorRule()]
+TD_INCREMENTAL_FEATURE_STRATEGY = [LeafInitRule(),
+ FeatureTopDownInitRule(),
+ FeatureTopDownPredictRule(),
+ FeatureCompleteFundamentalRule()]
+BU_INCREMENTAL_FEATURE_STRATEGY = [LeafInitRule(),
+ FeatureEmptyPredictRule(),
+ FeatureBottomUpPredictRule(),
+ FeatureCompleteFundamentalRule()]
+BU_LC_INCREMENTAL_FEATURE_STRATEGY = [LeafInitRule(),
+ FeatureEmptyPredictRule(),
+ FeatureBottomUpPredictCombineRule(),
+ FeatureCompleteFundamentalRule()]
class FeatureIncrementalChartParser(IncrementalChartParser, FeatureChartParser):
- def __init__(
- self,
- grammar,
- strategy=BU_LC_INCREMENTAL_FEATURE_STRATEGY,
- trace_chart_width=20,
- chart_class=FeatureIncrementalChart,
- **parser_args
- ):
- IncrementalChartParser.__init__(
- self,
- grammar,
- strategy=strategy,
- trace_chart_width=trace_chart_width,
- chart_class=chart_class,
- **parser_args
- )
-
+ def __init__(self, grammar,
+ strategy=BU_LC_INCREMENTAL_FEATURE_STRATEGY,
+ trace_chart_width=20,
+ chart_class=FeatureIncrementalChart,
+ **parser_args):
+ IncrementalChartParser.__init__(self, grammar,
+ strategy=strategy,
+ trace_chart_width=trace_chart_width,
+ chart_class=chart_class,
+ **parser_args)
class FeatureEarleyChartParser(FeatureIncrementalChartParser):
def __init__(self, grammar, **parser_args):
- FeatureIncrementalChartParser.__init__(
- self, grammar, EARLEY_FEATURE_STRATEGY, **parser_args
- )
-
+ FeatureIncrementalChartParser.__init__(self, grammar, EARLEY_FEATURE_STRATEGY, **parser_args)
class FeatureIncrementalTopDownChartParser(FeatureIncrementalChartParser):
def __init__(self, grammar, **parser_args):
- FeatureIncrementalChartParser.__init__(
- self, grammar, TD_INCREMENTAL_FEATURE_STRATEGY, **parser_args
- )
-
+ FeatureIncrementalChartParser.__init__(self, grammar, TD_INCREMENTAL_FEATURE_STRATEGY, **parser_args)
class FeatureIncrementalBottomUpChartParser(FeatureIncrementalChartParser):
def __init__(self, grammar, **parser_args):
- FeatureIncrementalChartParser.__init__(
- self, grammar, BU_INCREMENTAL_FEATURE_STRATEGY, **parser_args
- )
-
+ FeatureIncrementalChartParser.__init__(self, grammar, BU_INCREMENTAL_FEATURE_STRATEGY, **parser_args)
class FeatureIncrementalBottomUpLeftCornerChartParser(FeatureIncrementalChartParser):
def __init__(self, grammar, **parser_args):
- FeatureIncrementalChartParser.__init__(
- self, grammar, BU_LC_INCREMENTAL_FEATURE_STRATEGY, **parser_args
- )
+ FeatureIncrementalChartParser.__init__(self, grammar, BU_LC_INCREMENTAL_FEATURE_STRATEGY, **parser_args)
-# ////////////////////////////////////////////////////////////
+#////////////////////////////////////////////////////////////
# Demonstration
-# ////////////////////////////////////////////////////////////
+#////////////////////////////////////////////////////////////
-
-def demo(
- print_times=True,
- print_grammar=False,
- print_trees=True,
- trace=2,
- sent="I saw John with a dog with my cookie",
- numparses=5,
-):
+def demo(print_times=True, print_grammar=False,
+ print_trees=True, trace=2,
+ sent='I saw John with a dog with my cookie', numparses=5):
"""
A demonstration of the Earley parsers.
"""
# Do the parsing.
earley = EarleyChartParser(grammar, trace=trace)
- t = perf_counter()
+ t = time.clock()
chart = earley.chart_parse(tokens)
parses = list(chart.parses(grammar.start()))
- t = perf_counter() - t
+ t = time.clock()-t
# Print results.
if numparses:
- assert len(parses) == numparses, "Not all parses found"
+ assert len(parses)==numparses, 'Not all parses found'
if print_trees:
- for tree in parses:
- print(tree)
+ for tree in parses: print(tree)
else:
print("Nr trees:", len(parses))
if print_times:
print("Time:", t)
-
-if __name__ == "__main__":
- demo()
+if __name__ == '__main__': demo()
#
# Author: Long Duong <longdt219@gmail.com>
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import division
+
import unicodedata
>>> de = DependencyEvaluator([parsed_sent],[gold_sent])
>>> las, uas = de.eval()
>>> las
- 0.6...
- >>> uas
0.8...
- >>> abs(uas - 0.8) < 0.00001
+ >>> abs(uas - 0.6) < 0.00001
True
"""
:return : tuple(float,float)
"""
- if len(self._parsed_sents) != len(self._gold_sents):
- raise ValueError(
- " Number of parsed sentence is different with number of gold sentence."
- )
+ if (len(self._parsed_sents) != len(self._gold_sents)):
+ raise ValueError(" Number of parsed sentence is different with number of gold sentence.")
corr = 0
corrL = 0
parsed_sent_nodes = self._parsed_sents[i].nodes
gold_sent_nodes = self._gold_sents[i].nodes
- if len(parsed_sent_nodes) != len(gold_sent_nodes):
+ if (len(parsed_sent_nodes) != len(gold_sent_nodes)):
raise ValueError("Sentences must have equal length.")
for parsed_node_address, parsed_node in parsed_sent_nodes.items():
if parsed_node["rel"] == gold_node["rel"]:
corrL += 1
- return corrL / total, corr / total
+ return corr / total, corrL / total
+
+
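# Worked sketch of the scores above (`parsed_sent` and `gold_sent` are
# assumed DependencyGraphs of equal length): corr counts tokens whose
# head matches gold, corrL additionally requires the relation label to
# match, so the labelled ratio can never exceed the unlabelled one.
# With the return order restored above, the attachment-only ratio
# comes first.
from nltk.parse import DependencyEvaluator

de = DependencyEvaluator([parsed_sent], [gold_sent])
unlabelled, labelled = de.eval()
assert labelled <= unlabelled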
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Chart Parser for Feature-Based Grammars
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Rob Speer <rspeer@mit.edu>
# Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
# URL: <http://nltk.org/>
Extension of chart parsing implementation to handle grammars with
feature structures as nodes.
"""
-from time import perf_counter
+from __future__ import print_function, unicode_literals
+from six.moves import range
+
+from nltk.compat import python_2_unicode_compatible
from nltk.featstruct import FeatStruct, unify, TYPE, find_variables
from nltk.sem import logic
from nltk.tree import Tree
-from nltk.grammar import (
- Nonterminal,
- Production,
- CFG,
- FeatStructNonterminal,
- is_nonterminal,
- is_terminal,
-)
-from nltk.parse.chart import (
- TreeEdge,
- Chart,
- ChartParser,
- EdgeI,
- FundamentalRule,
- LeafInitRule,
- EmptyPredictRule,
- BottomUpPredictRule,
- SingleEdgeFundamentalRule,
- BottomUpPredictCombineRule,
- CachedTopDownPredictRule,
- TopDownInitRule,
-)
-
-# ////////////////////////////////////////////////////////////
+from nltk.grammar import (Nonterminal, Production, CFG,
+ FeatStructNonterminal, is_nonterminal,
+ is_terminal)
+from nltk.parse.chart import (TreeEdge, Chart, ChartParser, EdgeI,
+ FundamentalRule, LeafInitRule,
+ EmptyPredictRule, BottomUpPredictRule,
+ SingleEdgeFundamentalRule,
+ BottomUpPredictCombineRule,
+ CachedTopDownPredictRule,
+ TopDownInitRule)
+
+#////////////////////////////////////////////////////////////
# Tree Edge
-# ////////////////////////////////////////////////////////////
-
+#////////////////////////////////////////////////////////////
+@python_2_unicode_compatible
class FeatureTreeEdge(TreeEdge):
"""
A specialized tree edge that allows shared variable bindings
every nonterminal in the edge whose symbol implements the
interface ``SubstituteBindingsI``.
"""
-
def __init__(self, span, lhs, rhs, dot=0, bindings=None):
"""
Construct a new edge. If the edge is incomplete (i.e., if
clear the bindings. See ``TreeEdge`` for a description of
the other arguments.
"""
- if bindings is None:
- bindings = {}
+ if bindings is None: bindings = {}
# If the edge is complete, then substitute in the bindings,
# and then throw them away. (If we didn't throw them away, we
``(index,index)``; and its dot position will be ``0``.
:rtype: TreeEdge
"""
- return FeatureTreeEdge(
- span=(index, index), lhs=production.lhs(), rhs=production.rhs(), dot=0
- )
+ return FeatureTreeEdge(span=(index, index), lhs=production.lhs(),
+ rhs=production.rhs(), dot=0)
def move_dot_forward(self, new_end, bindings=None):
"""
:param bindings: Bindings for the new edge.
:type bindings: dict
"""
- return FeatureTreeEdge(
- span=(self._span[0], new_end),
- lhs=self._lhs,
- rhs=self._rhs,
- dot=self._dot + 1,
- bindings=bindings,
- )
+ return FeatureTreeEdge(span=(self._span[0], new_end),
+ lhs=self._lhs, rhs=self._rhs,
+ dot=self._dot+1, bindings=bindings)
def _bind(self, nt, bindings):
- if not isinstance(nt, FeatStructNonterminal):
- return nt
+ if not isinstance(nt, FeatStructNonterminal): return nt
return nt.substitute_bindings(bindings)
def next_with_bindings(self):
:return: The set of variables used by this edge.
:rtype: set(Variable)
"""
- return find_variables(
- [self._lhs]
- + list(self._rhs)
- + list(self._bindings.keys())
- + list(self._bindings.values()),
- fs_class=FeatStruct,
- )
+ return find_variables([self._lhs] + list(self._rhs) +
+ list(self._bindings.keys()) +
+ list(self._bindings.values()),
+ fs_class=FeatStruct)
def __str__(self):
if self.is_complete():
- return super().__str__()
+ return TreeEdge.__unicode__(self)
else:
- bindings = "{%s}" % ", ".join(
- "%s: %r" % item for item in sorted(self._bindings.items())
- )
- return "%s %s" % (super().__str__(), bindings)
+ bindings = '{%s}' % ', '.join('%s: %r' % item for item in
+ sorted(self._bindings.items()))
+ return '%s %s' % (TreeEdge.__unicode__(self), bindings)
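# Dotted-edge sketch (`prod` is an assumed feature production, e.g. the
# S -> NP VP rule of demo_grammar() further below): a predicted edge
# starts as a zero-width span, and move_dot_forward() advances the dot
# over recognized material.
edge = FeatureTreeEdge.from_production(prod, 0)  # [0:0] S -> * NP VP
edge = edge.move_dot_forward(2)                  # [0:2] S -> NP * VP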
-# ////////////////////////////////////////////////////////////
+#////////////////////////////////////////////////////////////
# A specialized Chart for feature grammars
-# ////////////////////////////////////////////////////////////
+#////////////////////////////////////////////////////////////
# TODO: subsumes check when adding new edges
-
class FeatureChart(Chart):
"""
A Chart for feature grammars.
``restrictions`` on the edges.
"""
# If there are no restrictions, then return all edges.
- if restrictions == {}:
- return iter(self._edges)
+ if restrictions=={}: return iter(self._edges)
# Find the index corresponding to the given restrictions.
restr_keys = sorted(restrictions.keys())
if restr_keys not in self._indexes:
self._add_index(restr_keys)
- vals = tuple(
- self._get_type_if_possible(restrictions[key]) for key in restr_keys
- )
+ vals = tuple(self._get_type_if_possible(restrictions[key])
+ for key in restr_keys)
return iter(self._indexes[restr_keys].get(vals, []))
def _add_index(self, restr_keys):
# Make sure it's a valid index.
for key in restr_keys:
if not hasattr(EdgeI, key):
- raise ValueError("Bad restriction: %s" % key)
+ raise ValueError('Bad restriction: %s' % key)
# Create the index.
index = self._indexes[restr_keys] = {}
# Add all existing edges to the index.
for edge in self._edges:
- vals = tuple(
- self._get_type_if_possible(getattr(edge, key)()) for key in restr_keys
- )
+ vals = tuple(self._get_type_if_possible(getattr(edge, key)())
+ for key in restr_keys)
index.setdefault(vals, []).append(edge)
def _register_with_indexes(self, edge):
edge with all existing indexes.
"""
for (restr_keys, index) in self._indexes.items():
- vals = tuple(
- self._get_type_if_possible(getattr(edge, key)()) for key in restr_keys
- )
+ vals = tuple(self._get_type_if_possible(getattr(edge, key)())
+ for key in restr_keys)
index.setdefault(vals, []).append(edge)
def _get_type_if_possible(self, item):
def parses(self, start, tree_class=Tree):
for edge in self.select(start=0, end=self._num_leaves):
- if (
- (isinstance(edge, FeatureTreeEdge))
- and (edge.lhs()[TYPE] == start[TYPE])
- and (unify(edge.lhs(), start, rename_vars=True))
- ):
+ if ((isinstance(edge, FeatureTreeEdge)) and
+ (edge.lhs()[TYPE] == start[TYPE]) and
+ (unify(edge.lhs(), start, rename_vars=True))
+ ):
for tree in self.trees(edge, complete=True, tree_class=tree_class):
yield tree
-# ////////////////////////////////////////////////////////////
+#////////////////////////////////////////////////////////////
# Fundamental Rule
-# ////////////////////////////////////////////////////////////
-
+#////////////////////////////////////////////////////////////
class FeatureFundamentalRule(FundamentalRule):
"""
assuming that B1 and B2 can be unified to generate B3.
"""
-
def apply(self, chart, grammar, left_edge, right_edge):
# Make sure the rule is applicable.
- if not (
- left_edge.end() == right_edge.start()
- and left_edge.is_incomplete()
- and right_edge.is_complete()
- and isinstance(left_edge, FeatureTreeEdge)
- ):
+ if not (left_edge.end() == right_edge.start() and
+ left_edge.is_incomplete() and
+ right_edge.is_complete() and
+ isinstance(left_edge, FeatureTreeEdge)):
return
found = right_edge.lhs()
nextsym = left_edge.nextsym()
if isinstance(right_edge, FeatureTreeEdge):
- if not is_nonterminal(nextsym):
- return
- if left_edge.nextsym()[TYPE] != right_edge.lhs()[TYPE]:
- return
+ if not is_nonterminal(nextsym): return
+ if left_edge.nextsym()[TYPE] != right_edge.lhs()[TYPE]: return
# Create a copy of the bindings.
bindings = left_edge.bindings()
# We rename vars here, because we don't want variables
# Unify B1 (left_edge.nextsym) with B2 (right_edge.lhs) to
# generate B3 (result).
result = unify(nextsym, found, bindings, rename_vars=False)
- if result is None:
- return
+ if result is None: return
else:
- if nextsym != found:
- return
+ if nextsym != found: return
# Create a copy of the bindings.
bindings = left_edge.bindings()
if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
yield new_edge
-
class FeatureSingleEdgeFundamentalRule(SingleEdgeFundamentalRule):
"""
A specialized version of the completer / single edge fundamental rule
Rather than simply comparing the nonterminals for equality, they are
unified.
"""
-
_fundamental_rule = FeatureFundamentalRule()
def _apply_complete(self, chart, grammar, right_edge):
fr = self._fundamental_rule
- for left_edge in chart.select(
- end=right_edge.start(), is_complete=False, nextsym=right_edge.lhs()
- ):
+ for left_edge in chart.select(end=right_edge.start(),
+ is_complete=False,
+ nextsym=right_edge.lhs()):
for new_edge in fr.apply(chart, grammar, left_edge, right_edge):
yield new_edge
def _apply_incomplete(self, chart, grammar, left_edge):
fr = self._fundamental_rule
- for right_edge in chart.select(
- start=left_edge.end(), is_complete=True, lhs=left_edge.nextsym()
- ):
+ for right_edge in chart.select(start=left_edge.end(),
+ is_complete=True,
+ lhs=left_edge.nextsym()):
for new_edge in fr.apply(chart, grammar, left_edge, right_edge):
yield new_edge
-# ////////////////////////////////////////////////////////////
+#////////////////////////////////////////////////////////////
# Top-Down Prediction
-# ////////////////////////////////////////////////////////////
-
+#////////////////////////////////////////////////////////////
class FeatureTopDownInitRule(TopDownInitRule):
def apply(self, chart, grammar):
if chart.insert(new_edge, ()):
yield new_edge
-
class FeatureTopDownPredictRule(CachedTopDownPredictRule):
"""
A specialized version of the (cached) top down predict rule that operates
for each grammar production ``B2 -> gamma``, assuming that B1
and B2 can be unified.
"""
-
def apply(self, chart, grammar, edge):
- if edge.is_complete():
- return
+ if edge.is_complete(): return
nextsym, index = edge.nextsym(), edge.end()
- if not is_nonterminal(nextsym):
- return
+ if not is_nonterminal(nextsym): return
# If we've already applied this rule to an edge with the same
# next & end, and the chart & grammar have not changed, then
if prod.rhs():
first = prod.rhs()[0]
if is_terminal(first):
- if index >= chart.num_leaves():
- continue
- if first != chart.leaf(index):
- continue
+ if index >= chart.num_leaves(): continue
+ if first != chart.leaf(index): continue
# We rename vars here, because we don't want variables
# from the two different productions to match.
self._done[nextsym_with_bindings, index] = (chart, grammar)
-# ////////////////////////////////////////////////////////////
+#////////////////////////////////////////////////////////////
# Bottom-Up Prediction
-# ////////////////////////////////////////////////////////////
-
+#////////////////////////////////////////////////////////////
class FeatureBottomUpPredictRule(BottomUpPredictRule):
def apply(self, chart, grammar, edge):
- if edge.is_incomplete():
- return
+ if edge.is_incomplete(): return
for prod in grammar.productions(rhs=edge.lhs()):
if isinstance(edge, FeatureTreeEdge):
_next = prod.rhs()[0]
- if not is_nonterminal(_next):
- continue
+ if not is_nonterminal(_next): continue
new_edge = FeatureTreeEdge.from_production(prod, edge.start())
if chart.insert(new_edge, ()):
yield new_edge
-
class FeatureBottomUpPredictCombineRule(BottomUpPredictCombineRule):
def apply(self, chart, grammar, edge):
- if edge.is_incomplete():
- return
+ if edge.is_incomplete(): return
found = edge.lhs()
for prod in grammar.productions(rhs=found):
bindings = {}
if isinstance(edge, FeatureTreeEdge):
_next = prod.rhs()[0]
- if not is_nonterminal(_next):
- continue
+ if not is_nonterminal(_next): continue
# We rename vars here, because we don't want variables
# from the two different productions to match.
- used_vars = find_variables(
- (prod.lhs(),) + prod.rhs(), fs_class=FeatStruct
- )
+ used_vars = find_variables((prod.lhs(),) + prod.rhs(),
+ fs_class=FeatStruct)
found = found.rename_variables(used_vars=used_vars)
result = unify(_next, found, bindings, rename_vars=False)
- if result is None:
- continue
+ if result is None: continue
- new_edge = FeatureTreeEdge.from_production(
- prod, edge.start()
- ).move_dot_forward(edge.end(), bindings)
+ new_edge = (FeatureTreeEdge.from_production(prod, edge.start())
+ .move_dot_forward(edge.end(), bindings))
if chart.insert(new_edge, (edge,)):
yield new_edge
-
class FeatureEmptyPredictRule(EmptyPredictRule):
def apply(self, chart, grammar):
for prod in grammar.productions(empty=True):
yield new_edge
-# ////////////////////////////////////////////////////////////
+#////////////////////////////////////////////////////////////
# Feature Chart Parser
-# ////////////////////////////////////////////////////////////
-
-TD_FEATURE_STRATEGY = [
- LeafInitRule(),
- FeatureTopDownInitRule(),
- FeatureTopDownPredictRule(),
- FeatureSingleEdgeFundamentalRule(),
-]
-BU_FEATURE_STRATEGY = [
- LeafInitRule(),
- FeatureEmptyPredictRule(),
- FeatureBottomUpPredictRule(),
- FeatureSingleEdgeFundamentalRule(),
-]
-BU_LC_FEATURE_STRATEGY = [
- LeafInitRule(),
- FeatureEmptyPredictRule(),
- FeatureBottomUpPredictCombineRule(),
- FeatureSingleEdgeFundamentalRule(),
-]
-
+#////////////////////////////////////////////////////////////
+
+TD_FEATURE_STRATEGY = [LeafInitRule(),
+ FeatureTopDownInitRule(),
+ FeatureTopDownPredictRule(),
+ FeatureSingleEdgeFundamentalRule()]
+BU_FEATURE_STRATEGY = [LeafInitRule(),
+ FeatureEmptyPredictRule(),
+ FeatureBottomUpPredictRule(),
+ FeatureSingleEdgeFundamentalRule()]
+BU_LC_FEATURE_STRATEGY = [LeafInitRule(),
+ FeatureEmptyPredictRule(),
+ FeatureBottomUpPredictCombineRule(),
+ FeatureSingleEdgeFundamentalRule()]
class FeatureChartParser(ChartParser):
- def __init__(
- self,
- grammar,
- strategy=BU_LC_FEATURE_STRATEGY,
- trace_chart_width=20,
- chart_class=FeatureChart,
- **parser_args
- ):
- ChartParser.__init__(
- self,
- grammar,
- strategy=strategy,
- trace_chart_width=trace_chart_width,
- chart_class=chart_class,
- **parser_args
- )
-
+ def __init__(self, grammar,
+ strategy=BU_LC_FEATURE_STRATEGY,
+ trace_chart_width=20,
+ chart_class=FeatureChart,
+ **parser_args):
+ ChartParser.__init__(self, grammar,
+ strategy=strategy,
+ trace_chart_width=trace_chart_width,
+ chart_class=chart_class,
+ **parser_args)
class FeatureTopDownChartParser(FeatureChartParser):
def __init__(self, grammar, **parser_args):
FeatureChartParser.__init__(self, grammar, TD_FEATURE_STRATEGY, **parser_args)
-
class FeatureBottomUpChartParser(FeatureChartParser):
def __init__(self, grammar, **parser_args):
FeatureChartParser.__init__(self, grammar, BU_FEATURE_STRATEGY, **parser_args)
-
class FeatureBottomUpLeftCornerChartParser(FeatureChartParser):
def __init__(self, grammar, **parser_args):
- FeatureChartParser.__init__(
- self, grammar, BU_LC_FEATURE_STRATEGY, **parser_args
- )
+ FeatureChartParser.__init__(self, grammar, BU_LC_FEATURE_STRATEGY, **parser_args)
-# ////////////////////////////////////////////////////////////
+#////////////////////////////////////////////////////////////
# Instantiate Variable Chart
-# ////////////////////////////////////////////////////////////
-
+#////////////////////////////////////////////////////////////
class InstantiateVarsChart(FeatureChart):
"""
variables in the edge's ``lhs`` whose names start with '@' will be
replaced by unique new ``Variable``s.
"""
-
def __init__(self, tokens):
FeatureChart.__init__(self, tokens)
FeatureChart.initialize(self)
def insert(self, edge, child_pointer_list):
- if edge in self._instantiated:
- return False
+ if edge in self._instantiated: return False
self.instantiate_edge(edge)
return FeatureChart.insert(self, edge, child_pointer_list)
"""
# If the edge is a leaf, or is not complete, or is
# already in the chart, then just return it as-is.
- if not isinstance(edge, FeatureTreeEdge):
- return
- if not edge.is_complete():
- return
- if edge in self._edge_to_cpls:
- return
+ if not isinstance(edge, FeatureTreeEdge): return
+ if not edge.is_complete(): return
+ if edge in self._edge_to_cpls: return
# Get a list of variables that need to be instantiated.
# If there are none, then return as-is.
inst_vars = self.inst_vars(edge)
- if not inst_vars:
- return
+ if not inst_vars: return
# Instantiate the edge!
self._instantiated.add(edge)
edge._lhs = edge.lhs().substitute_bindings(inst_vars)
def inst_vars(self, edge):
- return dict(
- (var, logic.unique_variable())
- for var in edge.lhs().variables()
- if var.name.startswith("@")
- )
+ return dict((var, logic.unique_variable())
+ for var in edge.lhs().variables()
+ if var.name.startswith('@'))
-# ////////////////////////////////////////////////////////////
+#////////////////////////////////////////////////////////////
# Demo
-# ////////////////////////////////////////////////////////////
-
+#////////////////////////////////////////////////////////////
def demo_grammar():
from nltk.grammar import FeatureGrammar
-
- return FeatureGrammar.fromstring(
- """
+ return FeatureGrammar.fromstring("""
S -> NP VP
PP -> Prep NP
NP -> NP PP
Verb -> "saw"
Prep -> "with"
Prep -> "under"
-"""
- )
-
-
-def demo(
- print_times=True,
- print_grammar=True,
- print_trees=True,
- print_sentence=True,
- trace=1,
- parser=FeatureChartParser,
- sent="I saw John with a dog with my cookie",
-):
- import sys, time
+""")
+def demo(print_times=True, print_grammar=True,
+ print_trees=True, print_sentence=True,
+ trace=1,
+ parser=FeatureChartParser,
+ sent='I saw John with a dog with my cookie'):
+ import sys, time
print()
grammar = demo_grammar()
if print_grammar:
if print_sentence:
print("Sentence:", sent)
tokens = sent.split()
- t = perf_counter()
+ t = time.clock()
cp = parser(grammar, trace=trace)
chart = cp.chart_parse(tokens)
trees = list(chart.parses(grammar.start()))
if print_times:
- print("Time: %s" % (perf_counter() - t))
+ print("Time: %s" % (time.clock() - t))
if print_trees:
- for tree in trees:
- print(tree)
+ for tree in trees: print(tree)
else:
print("Nr trees:", len(trees))
-
def run_profile():
import profile
-
- profile.run("for i in range(1): demo()", "/tmp/profile.out")
+ profile.run('for i in range(1): demo()', '/tmp/profile.out')
import pstats
+ p = pstats.Stats('/tmp/profile.out')
+ p.strip_dirs().sort_stats('time', 'cum').print_stats(60)
+ p.strip_dirs().sort_stats('cum', 'time').print_stats(60)
- p = pstats.Stats("/tmp/profile.out")
- p.strip_dirs().sort_stats("time", "cum").print_stats(60)
- p.strip_dirs().sort_stats("cum", "time").print_stats(60)
-
-
-if __name__ == "__main__":
+if __name__ == '__main__':
from nltk.data import load
-
demo()
print()
- grammar = load("grammars/book_grammars/feat0.fcfg")
+ grammar = load('grammars/book_grammars/feat0.fcfg')
cp = FeatureChartParser(grammar, trace=2)
- sent = "Kim likes children"
+ sent = 'Kim likes children'
tokens = sent.split()
trees = cp.parse(tokens)
for tree in trees:
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Generating from a CFG
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
#
+from __future__ import print_function
import itertools
import sys
except RuntimeError as _error:
if _error.message == "maximum recursion depth exceeded":
# Helpful error message while still showing the recursion stack.
- raise RuntimeError(
- "The grammar has rule(s) that yield infinite recursion!!"
- )
+ raise RuntimeError("The grammar has rule(s) that yield infinite recursion!!")
else:
raise
else:
if depth > 0:
if isinstance(item, Nonterminal):
for prod in grammar.productions(lhs=item):
- for frag in _generate_all(grammar, prod.rhs(), depth - 1):
+ for frag in _generate_all(grammar, prod.rhs(), depth-1):
yield frag
else:
yield [item]
-
demo_grammar = """
S -> NP VP
NP -> Det N
def demo(N=23):
from nltk.grammar import CFG
- print("Generating the first %d sentences for demo grammar:" % (N,))
+ print('Generating the first %d sentences for demo grammar:' % (N,))
print(demo_grammar)
grammar = CFG.fromstring(demo_grammar)
for n, sent in enumerate(generate(grammar, n=N), 1):
- print("%3d. %s" % (n, " ".join(sent)))
+ print('%3d. %s' % (n, ' '.join(sent)))
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# Author: Dan Garrette <dhgarrette@gmail.com>
# Contributor: Liling Tan, Mustufain, osamamukhtar11
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function
+from __future__ import unicode_literals
+
+from six import text_type
import os
import sys
import tempfile
def malt_regex_tagger():
from nltk.tag import RegexpTagger
-
_tagger = RegexpTagger(
- [
- (r"\.$", "."),
- (r"\,$", ","),
- (r"\?$", "?"), # fullstop, comma, Qmark
- (r"\($", "("),
- (r"\)$", ")"), # round brackets
- (r"\[$", "["),
- (r"\]$", "]"), # square brackets
- (r"^-?[0-9]+(.[0-9]+)?$", "CD"), # cardinal numbers
- (r"(The|the|A|a|An|an)$", "DT"), # articles
- (r"(He|he|She|she|It|it|I|me|Me|You|you)$", "PRP"), # pronouns
- (r"(His|his|Her|her|Its|its)$", "PRP$"), # possesive
- (r"(my|Your|your|Yours|yours)$", "PRP$"), # possesive
- (r"(on|On|in|In|at|At|since|Since)$", "IN"), # time prepopsitions
- (r"(for|For|ago|Ago|before|Before)$", "IN"), # time prepopsitions
- (r"(till|Till|until|Until)$", "IN"), # time prepopsitions
- (r"(by|By|beside|Beside)$", "IN"), # space prepopsitions
- (r"(under|Under|below|Below)$", "IN"), # space prepopsitions
- (r"(over|Over|above|Above)$", "IN"), # space prepopsitions
- (r"(across|Across|through|Through)$", "IN"), # space prepopsitions
- (r"(into|Into|towards|Towards)$", "IN"), # space prepopsitions
- (r"(onto|Onto|from|From)$", "IN"), # space prepopsitions
- (r".*able$", "JJ"), # adjectives
- (r".*ness$", "NN"), # nouns formed from adjectives
- (r".*ly$", "RB"), # adverbs
- (r".*s$", "NNS"), # plural nouns
- (r".*ing$", "VBG"), # gerunds
- (r".*ed$", "VBD"), # past tense verbs
- (r".*", "NN"), # nouns (default)
- ]
- )
+ [(r'\.$','.'), (r'\,$',','), (r'\?$','?'), # fullstop, comma, Qmark
+ (r'\($','('), (r'\)$',')'), # round brackets
+ (r'\[$','['), (r'\]$',']'), # square brackets
+ (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'), # cardinal numbers
+ (r'(The|the|A|a|An|an)$', 'DT'), # articles
+ (r'(He|he|She|she|It|it|I|me|Me|You|you)$', 'PRP'), # pronouns
+ (r'(His|his|Her|her|Its|its)$', 'PRP$'), # possessive
+ (r'(my|Your|your|Yours|yours)$', 'PRP$'), # possessive
+ (r'(on|On|in|In|at|At|since|Since)$', 'IN'),# time prepositions
+ (r'(for|For|ago|Ago|before|Before)$', 'IN'),# time prepositions
+ (r'(till|Till|until|Until)$', 'IN'), # time prepositions
+ (r'(by|By|beside|Beside)$', 'IN'), # space prepositions
+ (r'(under|Under|below|Below)$', 'IN'), # space prepositions
+ (r'(over|Over|above|Above)$', 'IN'), # space prepositions
+ (r'(across|Across|through|Through)$', 'IN'),# space prepositions
+ (r'(into|Into|towards|Towards)$', 'IN'), # space prepositions
+ (r'(onto|Onto|from|From)$', 'IN'), # space prepositions
+ (r'.*able$', 'JJ'), # adjectives
+ (r'.*ness$', 'NN'), # nouns formed from adjectives
+ (r'.*ly$', 'RB'), # adverbs
+ (r'.*s$', 'NNS'), # plural nouns
+ (r'.*ing$', 'VBG'), # gerunds
+ (r'.*ed$', 'VBD'), # past tense verbs
+ (r'.*', 'NN'), # nouns (default)
+ ])
return _tagger.tag
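+ # A minimal usage sketch (the sentence is an arbitrary example): the
+ # regexps above are tried in order, so a word ending in "ed" tags as VBD
+ # and anything unmatched falls through to the final NN default.
+ #
+ #     tag = malt_regex_tagger()
+ #     tag('The dog barked .'.split())
+ #     [('The', 'DT'), ('dog', 'NN'), ('barked', 'VBD'), ('.', '.')]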
"""
Find the MaltParser .jar file and its dependencies.
"""
- if os.path.exists(parser_dirname): # If a full path is given.
+ if os.path.exists(parser_dirname): # If a full path is given.
_malt_dir = parser_dirname
- else: # Try to find path to maltparser directory in environment variables.
- _malt_dir = find_dir(parser_dirname, env_vars=("MALT_PARSER",))
+ else: # Try to find path to maltparser directory in environment variables.
+ _malt_dir = find_dir(parser_dirname, env_vars=('MALT_PARSER',))
# Checks that the found directory contains all the necessary .jar
- malt_dependencies = ["", "", ""]
+ malt_dependencies = ['','','']
_malt_jars = set(find_jars_within_path(_malt_dir))
_jars = set(os.path.split(jar)[1] for jar in _malt_jars)
- malt_dependencies = set(["log4j.jar", "libsvm.jar", "liblinear-1.8.jar"])
+ malt_dependencies = set(['log4j.jar', 'libsvm.jar', 'liblinear-1.8.jar'])
assert malt_dependencies.issubset(_jars)
- assert any(
- filter(lambda i: i.startswith("maltparser-") and i.endswith(".jar"), _jars)
- )
+ assert any(filter(lambda i: i.startswith('maltparser-') and i.endswith('.jar'), _jars))
return list(_malt_jars)
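+ # Illustrative only (the directory and version are assumptions): with
+ # MALT_PARSER=/opt/maltparser-1.8.1, find_maltparser('maltparser-1.8.1')
+ # would return maltparser-1.8.1.jar together with log4j.jar, libsvm.jar
+ # and liblinear-1.8.jar found under that directory.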
"""
Find a pre-trained MaltParser model.
"""
- if model_filename is None:
- return "malt_temp.mco"
- elif os.path.exists(model_filename): # If a full path is given.
+ if model_filename is None:
+ return 'malt_temp.mco'
+ elif os.path.exists(model_filename): # If a full path is given.
return model_filename
- else: # Try to find path to malt model in environment variables.
- return find_file(model_filename, env_vars=("MALT_MODEL",), verbose=False)
+ else: # Try to find path to malt model in environment variables.
+ return find_file(model_filename, env_vars=('MALT_MODEL',), verbose=False)
class MaltParser(ParserI):
>>> mp.parse_one('I shot an elephant in my pajamas .'.split()).tree() # doctest: +SKIP
(shot I (elephant an) (in (pajamas my)) .)
"""
-
- def __init__(
- self,
- parser_dirname,
- model_filename=None,
- tagger=None,
- additional_java_args=None,
- ):
+ def __init__(self, parser_dirname, model_filename=None, tagger=None, additional_java_args=None):
"""
An interface for parsing with the Malt Parser.
# Find all the necessary jar files for MaltParser.
self.malt_jars = find_maltparser(parser_dirname)
# Initialize additional java arguments.
- self.additional_java_args = (
- additional_java_args if additional_java_args is not None else []
- )
+ self.additional_java_args = additional_java_args if \
+ additional_java_args is not None else []
# Initialize model.
self.model = find_malt_model(model_filename)
- self._trained = self.model != "malt_temp.mco"
+ self._trained = self.model != 'malt_temp.mco'
# Set the working_dir parameter, i.e. `-w` from MaltParser's options.
self.working_dir = tempfile.gettempdir()
# Initialize POS tagger.
self.tagger = tagger if tagger is not None else malt_regex_tagger()
- def parse_tagged_sents(self, sentences, verbose=False, top_relation_label="null"):
+ def parse_tagged_sents(self, sentences, verbose=False, top_relation_label='null'):
"""
Use MaltParser to parse multiple POS tagged sentences. Takes multiple
sentences where each sentence is a list of (word, tag) tuples.
if not self._trained:
raise Exception("Parser has not been trained. Call train() first.")
- with tempfile.NamedTemporaryFile(
- prefix="malt_input.conll.", dir=self.working_dir, mode="w", delete=False
- ) as input_file:
- with tempfile.NamedTemporaryFile(
- prefix="malt_output.conll.",
- dir=self.working_dir,
- mode="w",
- delete=False,
- ) as output_file:
+ with tempfile.NamedTemporaryFile(prefix='malt_input.conll.',
+ dir=self.working_dir, mode='w', delete=False) as input_file:
+ with tempfile.NamedTemporaryFile(prefix='malt_output.conll.',
+ dir=self.working_dir, mode='w', delete=False) as output_file:
# Convert list of sentences to CONLL format.
for line in taggedsents_to_conll(sentences):
- input_file.write(str(line))
+ input_file.write(text_type(line))
input_file.close()
# Generate command to run maltparser.
- cmd = self.generate_malt_command(
- input_file.name, output_file.name, mode="parse"
- )
+ cmd = self.generate_malt_command(input_file.name,
+ output_file.name, mode="parse")
# This is a MaltParser quirk: it needs to be run
# from where the model file is, otherwise it runs into awkward
# missing-.jar or strange -w working_dir problems.
- _current_path = os.getcwd() # Remembers the current path.
- try: # Change to modelfile path
+ _current_path = os.getcwd() # Remembers the current path.
+ try: # Change to modelfile path
os.chdir(os.path.split(self.model)[0])
except:
pass
- ret = self._execute(cmd, verbose) # Run command.
- os.chdir(_current_path) # Change back to current path.
+ ret = self._execute(cmd, verbose) # Run command.
+ os.chdir(_current_path) # Change back to current path.
- if ret != 0:
- raise Exception(
- "MaltParser parsing (%s) failed with exit "
- "code %d" % (" ".join(cmd), ret)
- )
+ if ret != 0:
+ raise Exception("MaltParser parsing (%s) failed with exit "
+ "code %d" % (' '.join(cmd), ret))
# Must return iter(iter(Tree))
with open(output_file.name) as infile:
- for tree_str in infile.read().split("\n\n"):
- yield (
- iter(
- [
- DependencyGraph(
- tree_str, top_relation_label=top_relation_label
- )
- ]
- )
- )
+ for tree_str in infile.read().split('\n\n'):
+ yield(iter([DependencyGraph(tree_str, top_relation_label=top_relation_label)]))
os.remove(input_file.name)
os.remove(output_file.name)
- def parse_sents(self, sentences, verbose=False, top_relation_label="null"):
+ def parse_sents(self, sentences, verbose=False, top_relation_label='null'):
"""
Use MaltParser to parse multiple sentences.
Takes a list of sentences, where each sentence is a list of words.
:return: iter(DependencyGraph)
"""
tagged_sentences = (self.tagger(sentence) for sentence in sentences)
- return self.parse_tagged_sents(
- tagged_sentences, verbose, top_relation_label=top_relation_label
- )
+ return self.parse_tagged_sents(tagged_sentences, verbose, top_relation_label=top_relation_label)
def generate_malt_command(self, inputfilename, outputfilename=None, mode=None):
"""
:type outputfilename: str
"""
- cmd = ["java"]
- cmd += self.additional_java_args # Adds additional java arguments
+ cmd = ['java']
+ cmd += self.additional_java_args # Adds additional java arguments
# Join classpaths with ";" on Windows and with ":" on Linux/Mac
- classpaths_separator = ";" if sys.platform.startswith("win") else ":"
- cmd += [
- "-cp",
- classpaths_separator.join(self.malt_jars),
- ] # Adds classpaths for jars
- cmd += ["org.maltparser.Malt"] # Adds the main function.
+ classpaths_separator = ';' if sys.platform.startswith('win') else ':'
+ cmd += ['-cp', classpaths_separator.join(self.malt_jars)] # Adds classpaths for jars
+ cmd += ['org.maltparser.Malt'] # Adds the main function.
# Adds the model file.
- if os.path.exists(self.model): # when parsing
- cmd += ["-c", os.path.split(self.model)[-1]]
- else: # when learning
- cmd += ["-c", self.model]
-
- cmd += ["-i", inputfilename]
- if mode == "parse":
- cmd += ["-o", outputfilename]
- cmd += ["-m", mode] # mode use to generate parses.
+ if os.path.exists(self.model): # when parsing
+ cmd += ['-c', os.path.split(self.model)[-1]]
+ else: # when learning
+ cmd += ['-c', self.model]
+
+ cmd += ['-i', inputfilename]
+ if mode == 'parse':
+ cmd += ['-o', outputfilename]
+ cmd += ['-m', mode] # mode used to generate parses.
return cmd
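+ # For illustration only (paths and model name are made up), a parse-mode
+ # command assembled above looks like:
+ #     ['java', '-cp', '<jars joined by ; or :>', 'org.maltparser.Malt',
+ #      '-c', 'engmalt.linear-1.7.mco', '-i', '/tmp/malt_input.conll.xxxx',
+ #      '-o', '/tmp/malt_output.conll.xxxx', '-m', 'parse']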
@staticmethod
"""
# Write the conll_str to malt_train.conll file in /tmp/
- with tempfile.NamedTemporaryFile(
- prefix="malt_train.conll.", dir=self.working_dir, mode="w", delete=False
- ) as input_file:
- input_str = "\n".join(dg.to_conll(10) for dg in depgraphs)
- input_file.write(str(input_str))
+ with tempfile.NamedTemporaryFile(prefix='malt_train.conll.',
+ dir=self.working_dir, mode='w', delete=False) as input_file:
+ input_str = ('\n'.join(dg.to_conll(10) for dg in depgraphs))
+ input_file.write(text_type(input_str))
# Trains the model with the malt_train.conll
self.train_from_file(input_file.name, verbose=verbose)
# Removes the malt_train.conll once training finishes.
# If conll_file is a ZipFilePathPointer,
# then we need to do some extra massaging
if isinstance(conll_file, ZipFilePathPointer):
- with tempfile.NamedTemporaryFile(
- prefix="malt_train.conll.", dir=self.working_dir, mode="w", delete=False
- ) as input_file:
+ with tempfile.NamedTemporaryFile(prefix='malt_train.conll.',
+ dir=self.working_dir, mode='w', delete=False) as input_file:
with conll_file.open() as conll_input_file:
conll_str = conll_input_file.read()
- input_file.write(str(conll_str))
+ input_file.write(text_type(conll_str))
return self.train_from_file(input_file.name, verbose=verbose)
# Generate command to run maltparser.
- cmd = self.generate_malt_command(conll_file, mode="learn")
+ cmd = self.generate_malt_command(conll_file, mode="learn")
ret = self._execute(cmd, verbose)
if ret != 0:
- raise Exception(
- "MaltParser training (%s) failed with exit "
- "code %d" % (" ".join(cmd), ret)
- )
+ raise Exception("MaltParser training (%s) failed with exit "
+ "code %d" % (' '.join(cmd), ret))
self._trained = True
if __name__ == '__main__':
- """
- A demonstration function to show how NLTK users can use the malt parser API.
+ '''
+ A demonstration function to show how NLTK users can use the malt parser API.
>>> from nltk import pos_tag
>>> assert 'MALT_PARSER' in os.environ, str(
>>> # Parse a single sentence.
>>> parsed_sent1 = mp.parse_one(sent1)
>>> parsed_sent2 = mp.parse_one(sent2)
- >>> print(parsed_sent1.tree())
+ >>> print(parsed_sent1.tree())
(sees John Mary .)
- >>> print(parsed_sent2.tree())
+ >>> print(parsed_sent2.tree())
(walks John (dog a) .)
>>>
>>> # Parsing multiple sentences.
(shot I (elephant an) (in (pajamas my)) .)
>>> print(next(next(parsed_sents)).tree())
(flies Time (like banana) .)
- """
-
+ '''
import doctest
doctest.testmod()
# Natural Language Toolkit: Dependency Grammars
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Jason Narad <jason.narad@gmail.com>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
#
+from __future__ import print_function
import math
import logging
+from six.moves import range
+
from nltk.parse.dependencygraph import DependencyGraph
logger = logging.getLogger(__name__)
def __init__(self):
if self.__class__ == DependencyScorerI:
- raise TypeError("DependencyScorerI is an abstract interface")
+ raise TypeError('DependencyScorerI is an abstract interface')
def train(self, graphs):
"""
"""
raise NotImplementedError()
-
#################################################################
# NaiveBayesDependencyScorer
#################################################################
for graph in graphs:
for head_node in graph.nodes.values():
for child_index, child_node in graph.nodes.items():
- if child_index in head_node["deps"]:
+ if child_index in head_node['deps']:
label = "T"
else:
label = "F"
labeled_examples.append(
(
dict(
- a=head_node["word"],
- b=head_node["tag"],
- c=child_node["word"],
- d=child_node["tag"],
+ a=head_node['word'],
+ b=head_node['tag'],
+ c=child_node['word'],
+ d=child_node['tag'],
),
label,
)
edges.append(
(
dict(
- a=head_node["word"],
- b=head_node["tag"],
- c=child_node["word"],
- d=child_node["tag"],
+ a=head_node['word'],
+ b=head_node['tag'],
+ c=child_node['word'],
+ d=child_node['tag'],
)
)
)
row = []
count = 0
for pdist in self.classifier.prob_classify_many(edges):
- logger.debug("%.4f %.4f", pdist.prob("T"), pdist.prob("F"))
+ logger.debug('%.4f %.4f', pdist.prob('T'), pdist.prob('F'))
# smoothing in case the probability = 0
- row.append([math.log(pdist.prob("T") + 0.00000000001)])
+ row.append([math.log(pdist.prob("T")+0.00000000001)])
count += 1
if count == len(graph.nodes):
edge_scores.append(row)
# A short class necessary to show the parsing example from the paper
class DemoScorer(DependencyScorerI):
def train(self, graphs):
- print("Training...")
+ print('Training...')
def score(self, graph):
# scores for Keith Hall 'K-best Spanning Tree Parsing' paper
- return [
- [[], [5], [1], [1]],
- [[], [], [11], [4]],
- [[], [10], [], [5]],
- [[], [8], [8], []],
- ]
-
+ return [[[], [5], [1], [1]],
+ [[], [], [11], [4]],
+ [[], [10], [], [5]],
+ [[], [8], [8], []]]
#################################################################
# Non-Projective Probabilistic Parsing
4
"""
-
def __init__(self):
"""
Creates a new non-projective parser.
"""
- logging.debug("initializing prob. nonprojective...")
+ logging.debug('initializing prob. nonprojective...')
def train(self, graphs, dependency_scorer):
"""
:type g_graph, b_graph, c_graph: DependencyGraph
:param g_graph, b_graph, c_graph: Graphs which need to be updated.
"""
- logger.debug("Collapsing nodes...")
+ logger.debug('Collapsing nodes...')
# Collapse all cycle nodes into v_n+1 in G_Graph
for cycle_node_index in cycle_path:
g_graph.remove_by_address(cycle_node_index)
g_graph.add_node(new_node)
- g_graph.redirect_arcs(cycle_path, new_node["address"])
+ g_graph.redirect_arcs(cycle_path, new_node['address'])
def update_edge_scores(self, new_node, cycle_path):
"""
:type cycle_path: A list of integers.
:param cycle_path: A list of node addresses that belong to the cycle.
"""
- logger.debug("cycle %s", cycle_path)
+ logger.debug('cycle %s', cycle_path)
cycle_path = self.compute_original_indexes(cycle_path)
- logger.debug("old cycle %s", cycle_path)
- logger.debug("Prior to update: %s", self.scores)
+ logger.debug('old cycle %s', cycle_path)
+ logger.debug('Prior to update: %s', self.scores)
for i, row in enumerate(self.scores):
for j, column in enumerate(self.scores[i]):
logger.debug(self.scores[i][j])
- if j in cycle_path and i not in cycle_path and self.scores[i][j]:
+ if (
+ j in cycle_path
+ and i not in cycle_path
+ and self.scores[i][j]
+ ):
subtract_val = self.compute_max_subtract_score(j, cycle_path)
- logger.debug("%s - %s", self.scores[i][j], subtract_val)
+ logger.debug('%s - %s', self.scores[i][j], subtract_val)
new_vals = []
for cur_val in self.scores[i][j]:
if i in cycle_path and j in cycle_path:
self.scores[i][j] = []
- logger.debug("After update: %s", self.scores)
+ logger.debug('After update: %s', self.scores)
def compute_original_indexes(self, new_indexes):
"""
the node that is arced to.
"""
originals = self.compute_original_indexes([node_index])
- logger.debug("originals: %s", originals)
+ logger.debug('originals: %s', originals)
max_arc = None
max_score = None
for row_index in range(len(self.scores)):
for col_index in range(len(self.scores[row_index])):
- if col_index in originals and (
- max_score is None or self.scores[row_index][col_index] > max_score
- ):
+ # print self.scores[row_index][col_index]
+ if col_index in originals and (max_score is None or self.scores[row_index][col_index] > max_score):
max_score = self.scores[row_index][col_index]
max_arc = row_index
- logger.debug("%s, %s", row_index, col_index)
+ logger.debug('%s, %s', row_index, col_index)
logger.debug(max_score)
max_orig = None
for row_index in range(len(self.scores)):
for col_index in range(len(self.scores[row_index])):
- if col_index in originals and (
- max_score is None or self.scores[row_index][col_index] > max_score
- ):
+ if col_index in originals and (max_score is None or self.scores[row_index][col_index] > max_score):
max_score = self.scores[row_index][col_index]
max_arc = row_index
max_orig = col_index
g_graph = DependencyGraph()
for index, token in enumerate(tokens):
g_graph.nodes[index + 1].update(
- {"word": token, "tag": tags[index], "rel": "NTOP", "address": index + 1}
+ {
+ 'word': token,
+ 'tag': tags[index],
+ 'rel': 'NTOP',
+ 'address': index + 1,
+ }
)
+ #print (g_graph.nodes)
+
# Fully connect non-root nodes in g_graph
g_graph.connect_graph()
original_graph = DependencyGraph()
for index, token in enumerate(tokens):
original_graph.nodes[index + 1].update(
- {"word": token, "tag": tags[index], "rel": "NTOP", "address": index + 1}
+ {
+ 'word': token,
+ 'tag': tags[index],
+ 'rel': 'NTOP',
+ 'address': index+1,
+ }
)
b_graph = DependencyGraph()
for index, token in enumerate(tokens):
c_graph.nodes[index + 1].update(
- {"word": token, "tag": tags[index], "rel": "NTOP", "address": index + 1}
+ {
+ 'word': token,
+ 'tag': tags[index],
+ 'rel': 'NTOP',
+ 'address': index + 1,
+ }
)
# Assign initial scores to g_graph edges
self.initialize_edge_scores(g_graph)
logger.debug(self.scores)
# Initialize a list of unvisited vertices (by node address)
- unvisited_vertices = [vertex["address"] for vertex in c_graph.nodes.values()]
+ unvisited_vertices = [
+ vertex['address'] for vertex in c_graph.nodes.values()
+ ]
# Iterate over unvisited vertices
nr_vertices = len(tokens)
betas = {}
while unvisited_vertices:
# Mark current node as visited
current_vertex = unvisited_vertices.pop(0)
- logger.debug("current_vertex: %s", current_vertex)
+ logger.debug('current_vertex: %s', current_vertex)
# Get corresponding node n_i to vertex v_i
current_node = g_graph.get_by_address(current_vertex)
- logger.debug("current_node: %s", current_node)
+ logger.debug('current_node: %s', current_node)
# Get best in-edge node b for current node
best_in_edge = self.best_incoming_arc(current_vertex)
betas[current_vertex] = self.original_best_arc(current_vertex)
- logger.debug("best in arc: %s --> %s", best_in_edge, current_vertex)
+ logger.debug('best in arc: %s --> %s', best_in_edge, current_vertex)
# b_graph = Union(b_graph, b)
for new_vertex in [current_vertex, best_in_edge]:
b_graph.nodes[new_vertex].update(
- {"word": "TEMP", "rel": "NTOP", "address": new_vertex}
+ {
+ 'word': 'TEMP',
+ 'rel': 'NTOP',
+ 'address': new_vertex,
+ }
)
b_graph.add_arc(best_in_edge, current_vertex)
# Beta(current node) = b - stored for parse recovery
cycle_path = b_graph.contains_cycle()
if cycle_path:
# Create a new node v_n+1 with address = len(nodes) + 1
- new_node = {"word": "NONE", "rel": "NTOP", "address": nr_vertices + 1}
+ new_node = {
+ 'word': 'NONE',
+ 'rel': 'NTOP',
+ 'address': nr_vertices + 1,
+ }
# c_graph = Union(c_graph, v_n+1)
c_graph.add_node(new_node)
# Collapse all nodes in cycle C into v_n+1
self.update_edge_scores(new_node, cycle_path)
self.collapse_nodes(new_node, cycle_path, g_graph, b_graph, c_graph)
for cycle_index in cycle_path:
- c_graph.add_arc(new_node["address"], cycle_index)
+ c_graph.add_arc(new_node['address'], cycle_index)
# self.replaced_by[cycle_index] = new_node['address']
- self.inner_nodes[new_node["address"]] = cycle_path
+ self.inner_nodes[new_node['address']] = cycle_path
# Add v_n+1 to list of unvisited vertices
unvisited_vertices.insert(0, nr_vertices + 1)
for cycle_node_address in cycle_path:
b_graph.remove_by_address(cycle_node_address)
- logger.debug("g_graph: %s", g_graph)
- logger.debug("b_graph: %s", b_graph)
- logger.debug("c_graph: %s", c_graph)
- logger.debug("Betas: %s", betas)
- logger.debug("replaced nodes %s", self.inner_nodes)
+ logger.debug('g_graph: %s', g_graph)
+ logger.debug('b_graph: %s', b_graph)
+ logger.debug('c_graph: %s', c_graph)
+ logger.debug('Betas: %s', betas)
+ logger.debug('replaced nodes %s', self.inner_nodes)
# Recover parse tree
- logger.debug("Final scores: %s", self.scores)
+ logger.debug('Final scores: %s', self.scores)
- logger.debug("Recovering parse...")
+ logger.debug('Recovering parse...')
for i in range(len(tokens) + 1, nr_vertices + 1):
betas[betas[i][1]] = betas[i]
- logger.debug("Betas: %s", betas)
+ logger.debug('Betas: %s', betas)
for node in original_graph.nodes.values():
# TODO: It's dangerous to assume that deps is a dictionary
# because it's a default dictionary. Ideally, here we should not
# be concerned with how dependencies are stored inside a dependency
# graph.
- node["deps"] = {}
+ node['deps'] = {}
for i in range(1, len(tokens) + 1):
original_graph.add_arc(betas[i][0], betas[i][1])
- logger.debug("Done.")
+ logger.debug('Done.')
yield original_graph
-
#################################################################
# Rule-based Non-Projective Parser
#################################################################
for index, token in enumerate(tokens):
self._graph.nodes[index] = {
- "word": token,
- "deps": [],
- "rel": "NTOP",
- "address": index,
+ 'word': token,
+ 'deps': [],
+ 'rel': 'NTOP',
+ 'address': index,
}
for head_node in self._graph.nodes.values():
deps = []
- for dep_node in self._graph.nodes.values():
+ for dep_node in self._graph.nodes.values():
if (
- self._grammar.contains(head_node["word"], dep_node["word"])
- and head_node["word"] != dep_node["word"]
+ self._grammar.contains(head_node['word'], dep_node['word'])
+ and head_node['word'] != dep_node['word']
):
- deps.append(dep_node["address"])
- head_node["deps"] = deps
+ deps.append(dep_node['address'])
+ head_node['deps'] = deps
# Create lattice of possible heads
roots = []
graph = DependencyGraph()
graph.root = graph.nodes[analysis.index(-1) + 1]
- for address, (token, head_index) in enumerate(
- zip(tokens, analysis), start=1
- ):
+ for address, (token, head_index) in enumerate(zip(tokens, analysis), start=1):
head_address = head_index + 1
node = graph.nodes[address]
- node.update({"word": token, "address": address})
+ node.update(
+ {
+ 'word': token,
+ 'address': address,
+ }
+ )
if head_address == 0:
- rel = "ROOT"
+ rel = 'ROOT'
else:
- rel = ""
- graph.nodes[head_index + 1]["deps"][rel].append(address)
+ rel = ''
+ graph.nodes[head_index + 1]['deps'][rel].append(address)
# TODO: check for cycles
yield graph
# Demos
#################################################################
-
def demo():
# hall_demo()
nonprojective_conll_parse_demo()
def hall_demo():
npp = ProbabilisticNonprojectiveParser()
npp.train([], DemoScorer())
- for parse_graph in npp.parse(["v1", "v2", "v3"], [None, None, None]):
+ for parse_graph in npp.parse(['v1', 'v2', 'v3'], [None, None, None]):
print(parse_graph)
def nonprojective_conll_parse_demo():
from nltk.parse.dependencygraph import conll_data2
- graphs = [DependencyGraph(entry) for entry in conll_data2.split("\n\n") if entry]
+ graphs = [
+ DependencyGraph(entry) for entry in conll_data2.split('\n\n') if entry
+ ]
npp = ProbabilisticNonprojectiveParser()
npp.train(graphs, NaiveBayesDependencyScorer())
- for parse_graph in npp.parse(
- ["Cathy", "zag", "hen", "zwaaien", "."], ["N", "V", "Pron", "Adj", "N", "Punc"]
- ):
+ for parse_graph in npp.parse(['Cathy', 'zag', 'hen', 'zwaaien', '.'], ['N', 'V', 'Pron', 'Adj', 'N', 'Punc']):
print(parse_graph)
def rule_based_demo():
from nltk.grammar import DependencyGrammar
- grammar = DependencyGrammar.fromstring(
- """
+ grammar = DependencyGrammar.fromstring("""
'taught' -> 'play' | 'man'
'man' -> 'the' | 'in'
'in' -> 'corner'
'corner' -> 'the'
'play' -> 'golf' | 'dachshund' | 'to'
'dachshund' -> 'his'
- """
- )
+ """)
print(grammar)
ndp = NonprojectiveDependencyParser(grammar)
- graphs = ndp.parse(
- [
- "the",
- "man",
- "in",
- "the",
- "corner",
- "taught",
- "his",
- "dachshund",
- "to",
- "play",
- "golf",
- ]
- )
- print("Graphs:")
+ graphs = ndp.parse(['the', 'man', 'in', 'the', 'corner', 'taught', 'his', 'dachshund', 'to', 'play', 'golf'])
+ print('Graphs:')
for graph in graphs:
print(graph)
-
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: Probabilistic Chart Parsers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
argument beam_size. If non-zero, this controls the size of the beam
(aka the edge queue). This option is most useful with InsideChartParser.
"""
+from __future__ import print_function, unicode_literals
##//////////////////////////////////////////////////////
## Bottom-Up PCFG Chart Parser
# [XX] This might not be implemented quite right -- it would be better
# to associate probabilities with child pointer lists.
-import random
from functools import reduce
from nltk.tree import Tree, ProbabilisticTree
from nltk.grammar import Nonterminal, PCFG
from nltk.parse.api import ParserI
from nltk.parse.chart import Chart, LeafEdge, TreeEdge, AbstractChartRule
+from nltk.compat import python_2_unicode_compatible
# Probabilistic edges
class ProbabilisticLeafEdge(LeafEdge):
- def prob(self):
- return 1.0
-
+ def prob(self): return 1.0
class ProbabilisticTreeEdge(TreeEdge):
def __init__(self, prob, *args, **kwargs):
# two edges with different probabilities are not equal.
self._comparison_key = (self._comparison_key, prob)
- def prob(self):
- return self._prob
+ def prob(self): return self._prob
@staticmethod
def from_production(production, index, p):
- return ProbabilisticTreeEdge(
- p, (index, index), production.lhs(), production.rhs(), 0
- )
-
+ return ProbabilisticTreeEdge(p, (index, index), production.lhs(),
+ production.rhs(), 0)
# Rules using probabilistic edges
class ProbabilisticBottomUpInitRule(AbstractChartRule):
- NUM_EDGES = 0
-
+ NUM_EDGES=0
def apply(self, chart, grammar):
for index in range(chart.num_leaves()):
new_edge = ProbabilisticLeafEdge(chart.leaf(index), index)
if chart.insert(new_edge, ()):
yield new_edge
-
class ProbabilisticBottomUpPredictRule(AbstractChartRule):
- NUM_EDGES = 1
-
+ NUM_EDGES=1
def apply(self, chart, grammar, edge):
- if edge.is_incomplete():
- return
+ if edge.is_incomplete(): return
for prod in grammar.productions():
if edge.lhs() == prod.rhs()[0]:
- new_edge = ProbabilisticTreeEdge.from_production(
- prod, edge.start(), prod.prob()
- )
+ new_edge = ProbabilisticTreeEdge.from_production(prod, edge.start(), prod.prob())
if chart.insert(new_edge, ()):
yield new_edge
-
class ProbabilisticFundamentalRule(AbstractChartRule):
- NUM_EDGES = 2
-
+ NUM_EDGES=2
def apply(self, chart, grammar, left_edge, right_edge):
# Make sure the rule is applicable.
- if not (
- left_edge.end() == right_edge.start()
- and left_edge.nextsym() == right_edge.lhs()
- and left_edge.is_incomplete()
- and right_edge.is_complete()
- ):
+ if not (left_edge.end() == right_edge.start() and
+ left_edge.nextsym() == right_edge.lhs() and
+ left_edge.is_incomplete() and right_edge.is_complete()):
return
# Construct the new edge.
p = left_edge.prob() * right_edge.prob()
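+ # e.g. (illustrative numbers) 0.5 * 0.4 = 0.2; the probabilities
+ # multiply because the new edge commits to both subanalyses.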
- new_edge = ProbabilisticTreeEdge(
- p,
- span=(left_edge.start(), right_edge.end()),
- lhs=left_edge.lhs(),
- rhs=left_edge.rhs(),
- dot=left_edge.dot() + 1,
- )
+ new_edge = ProbabilisticTreeEdge(p,
+ span=(left_edge.start(), right_edge.end()),
+ lhs=left_edge.lhs(), rhs=left_edge.rhs(),
+ dot=left_edge.dot()+1)
# Add it to the chart, with appropriate child pointers.
changed_chart = False
for cpl1 in chart.child_pointer_lists(left_edge):
- if chart.insert(new_edge, cpl1 + (right_edge,)):
+ if chart.insert(new_edge, cpl1+(right_edge,)):
changed_chart = True
# If we changed the chart, then generate the edge.
- if changed_chart:
- yield new_edge
-
+ if changed_chart: yield new_edge
+@python_2_unicode_compatible
class SingleEdgeProbabilisticFundamentalRule(AbstractChartRule):
- NUM_EDGES = 1
+ NUM_EDGES=1
_fundamental_rule = ProbabilisticFundamentalRule()
fr = self._fundamental_rule
if edge1.is_incomplete():
# edge1 = left_edge; edge2 = right_edge
- for edge2 in chart.select(
- start=edge1.end(), is_complete=True, lhs=edge1.nextsym()
- ):
+ for edge2 in chart.select(start=edge1.end(), is_complete=True,
+ lhs=edge1.nextsym()):
for new_edge in fr.apply(chart, grammar, edge1, edge2):
yield new_edge
else:
# edge2 = left_edge; edge1 = right_edge
- for edge2 in chart.select(
- end=edge1.start(), is_complete=False, nextsym=edge1.lhs()
- ):
+ for edge2 in chart.select(end=edge1.start(), is_complete=False,
+ nextsym=edge1.lhs()):
for new_edge in fr.apply(chart, grammar, edge2, edge1):
yield new_edge
def __str__(self):
- return "Fundamental Rule"
-
+ return 'Fundamental Rule'
class BottomUpProbabilisticChartParser(ParserI):
"""
:ivar _trace: The level of tracing output that should be generated
when parsing a text.
"""
-
def __init__(self, grammar, beam_size=0, trace=0):
"""
Create a new ``BottomUpProbabilisticChartParser``, that uses
# Initialize the chart.
for edge in bu_init.apply(chart, grammar):
if self._trace > 1:
- print(
- " %-50s [%s]"
- % (chart.pretty_format_edge(edge, width=2), edge.prob())
- )
+ print(' %-50s [%s]' % (chart.pretty_format_edge(edge,width=2),
+ edge.prob()))
queue.append(edge)
while len(queue) > 0:
# Get the best edge.
edge = queue.pop()
if self._trace > 0:
- print(
- " %-50s [%s]"
- % (chart.pretty_format_edge(edge, width=2), edge.prob())
- )
+ print(' %-50s [%s]' % (chart.pretty_format_edge(edge,width=2),
+ edge.prob()))
# Apply BU & FR to it.
queue.extend(bu.apply(chart, grammar, edge))
return iter(parses)
def _setprob(self, tree, prod_probs):
- if tree.prob() is not None:
- return
+ if tree.prob() is not None: return
# Get the prob of the CFG production.
lhs = Nonterminal(tree.label())
def _prune(self, queue, chart):
""" Discard items in the queue if the queue is longer than the beam."""
if len(queue) > self.beam_size:
- split = len(queue) - self.beam_size
+ split = len(queue)-self.beam_size
if self._trace > 2:
for edge in queue[:split]:
- print(" %-50s [DISCARDED]" % chart.pretty_format_edge(edge, 2))
+ print(' %-50s [DISCARDED]' % chart.pretty_format_edge(edge,2))
del queue[:split]
-
class InsideChartParser(BottomUpProbabilisticChartParser):
"""
A bottom-up parser for ``PCFG`` grammars that tries edges in descending
This sorting order results in a type of lowest-cost-first search
strategy.
"""
-
# Inherit constructor.
def sort_queue(self, queue, chart):
"""
"""
queue.sort(key=lambda edge: edge.prob())
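+ # The queue is popped from the end, so an ascending sort by probability
+ # expands the most probable edge next (lowest-cost-first search).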
-
# Eventually, this will become some sort of inside-outside parser:
# class InsideOutsideParser(BottomUpProbabilisticChartParser):
# def __init__(self, grammar, trace=0):
# bestp.get(elt,0))
#
# self._bestp = bestp
-# for (k,v) in self._bestp.items(): print(k,v)
+# for (k,v) in self._bestp.items(): print k,v
#
# def _sortkey(self, edge):
# return edge.structure()[PROB] * self._bestp[edge.lhs()]
# def sort_queue(self, queue, chart):
# queue.sort(key=self._sortkey)
-
+import random
class RandomChartParser(BottomUpProbabilisticChartParser):
"""
A bottom-up parser for ``PCFG`` grammars that tries edges in random order.
This sorting order results in a random search strategy.
"""
-
# Inherit constructor
def sort_queue(self, queue, chart):
- i = random.randint(0, len(queue) - 1)
+ i = random.randint(0, len(queue)-1)
(queue[-1], queue[i]) = (queue[i], queue[-1])
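+ # Swapping a random element into the last slot makes the subsequent
+ # pop() a uniform random choice.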
-
class UnsortedChartParser(BottomUpProbabilisticChartParser):
"""
A bottom-up parser for ``PCFG`` grammars that tries edges in whatever order.
"""
-
# Inherit constructor
- def sort_queue(self, queue, chart):
- return
-
+ def sort_queue(self, queue, chart): return
class LongestChartParser(BottomUpProbabilisticChartParser):
"""
shorter ones. This sorting order results in a type of best-first
search strategy.
"""
-
# Inherit constructor
def sort_queue(self, queue, chart):
queue.sort(key=lambda edge: edge.length())
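+ # Ascending sort by length + pop() from the end expands the longest
+ # edge next.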
-
##//////////////////////////////////////////////////////
## Test Code
##//////////////////////////////////////////////////////
-
def demo(choice=None, draw_parses=None, print_parses=None):
"""
A demonstration of the probabilistic parsers. The user is
from nltk.parse import pchart
# Define two demos. Each demo has a sentence and a grammar.
- toy_pcfg1 = PCFG.fromstring(
- """
+ toy_pcfg1 = PCFG.fromstring("""
S -> NP VP [1.0]
NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
Det -> 'the' [0.8] | 'my' [0.2]
V -> 'ate' [0.35] | 'saw' [0.65]
PP -> P NP [1.0]
P -> 'with' [0.61] | 'under' [0.39]
- """
- )
+ """)
- toy_pcfg2 = PCFG.fromstring(
- """
+ toy_pcfg2 = PCFG.fromstring("""
S -> NP VP [1.0]
VP -> V NP [.59]
VP -> V [.40]
Det -> 'the' [.41]
Det -> 'a' [.31]
Det -> 'my' [.28]
- """
- )
+ """)
- demos = [
- ("I saw John with my telescope", toy_pcfg1),
- ("the boy saw Jack with Bob under the table with a telescope", toy_pcfg2),
- ]
+ demos = [('I saw John with my telescope', toy_pcfg1),
+ ('the boy saw Jack with Bob under the table with a telescope',
+ toy_pcfg2)]
if choice is None:
# Ask the user which demo they want to use.
print()
for i in range(len(demos)):
- print("%3s: %s" % (i + 1, demos[i][0]))
- print(" %r" % demos[i][1])
+ print('%3s: %s' % (i+1, demos[i][0]))
+ print(' %r' % demos[i][1])
print()
- print("Which demo (%d-%d)? " % (1, len(demos)), end=" ")
- choice = int(sys.stdin.readline().strip()) - 1
+ print('Which demo (%d-%d)? ' % (1, len(demos)), end=' ')
+ choice = int(sys.stdin.readline().strip())-1
try:
sent, grammar = demos[choice]
except:
- print("Bad sentence number")
+ print('Bad sentence number')
return
# Tokenize the sentence.
pchart.RandomChartParser(grammar),
pchart.UnsortedChartParser(grammar),
pchart.LongestChartParser(grammar),
- pchart.InsideChartParser(grammar, beam_size=len(tokens) + 1), # was BeamParser
- ]
+ pchart.InsideChartParser(grammar, beam_size = len(tokens)+1) # was BeamParser
+ ]
# Run the parsers on the tokenized sentence.
times = []
num_parses = []
all_parses = {}
for parser in parsers:
- print("\ns: %s\nparser: %s\ngrammar: %s" % (sent, parser, grammar))
+ print('\ns: %s\nparser: %s\ngrammar: %s' % (sent,parser,grammar))
parser.trace(3)
t = time.time()
parses = list(parser.parse(tokens))
- times.append(time.time() - t)
- p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses) if parses else 0
+ times.append(time.time()-t)
+ p = (reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses) if parses else 0)
average_p.append(p)
num_parses.append(len(parses))
- for p in parses:
- all_parses[p.freeze()] = 1
+ for p in parses: all_parses[p.freeze()] = 1
# Print some summary statistics
print()
- print(" Parser Beam | Time (secs) # Parses Average P(parse)")
- print("------------------------+------------------------------------------")
+ print(' Parser Beam | Time (secs) # Parses Average P(parse)')
+ print('------------------------+------------------------------------------')
for i in range(len(parsers)):
- print(
- "%18s %4d |%11.4f%11d%19.14f"
- % (
- parsers[i].__class__.__name__,
- parsers[i].beam_size,
- times[i],
- num_parses[i],
- average_p[i],
- )
- )
+ print('%18s %4d |%11.4f%11d%19.14f' % (parsers[i].__class__.__name__,
+ parsers[i].beam_size,
+ times[i],num_parses[i],average_p[i]))
parses = all_parses.keys()
- if parses:
- p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
- else:
- p = 0
- print("------------------------+------------------------------------------")
- print("%18s |%11s%11d%19.14f" % ("(All Parses)", "n/a", len(parses), p))
+ if parses: p = reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses)
+ else: p = 0
+ print('------------------------+------------------------------------------')
+ print('%18s |%11s%11d%19.14f' % ('(All Parses)', 'n/a', len(parses), p))
if draw_parses is None:
# Ask the user if we should draw the parses.
print()
- print("Draw parses (y/n)? ", end=" ")
- draw_parses = sys.stdin.readline().strip().lower().startswith("y")
+ print('Draw parses (y/n)? ', end=' ')
+ draw_parses = sys.stdin.readline().strip().lower().startswith('y')
if draw_parses:
from nltk.draw.tree import draw_trees
-
- print(" please wait...")
+ print(' please wait...')
draw_trees(*parses)
if print_parses is None:
# Ask the user if we should print the parses.
print()
- print("Print parses (y/n)? ", end=" ")
- print_parses = sys.stdin.readline().strip().lower().startswith("y")
+ print('Print parses (y/n)? ', end=' ')
+ print_parses = sys.stdin.readline().strip().lower().startswith('y')
if print_parses:
for parse in parses:
print(parse)
-
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: Dependency Grammars
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Jason Narad <jason.narad@gmail.com>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
#
+from __future__ import print_function, unicode_literals
from collections import defaultdict
from itertools import chain
from functools import total_ordering
-from nltk.grammar import (
- DependencyProduction,
- DependencyGrammar,
- ProbabilisticDependencyGrammar,
-)
+from nltk.grammar import (DependencyProduction, DependencyGrammar,
+ ProbabilisticDependencyGrammar)
from nltk.parse.dependencygraph import DependencyGraph
from nltk.internals import raise_unorderable_types
-
+from nltk.compat import python_2_unicode_compatible
#################################################################
# Dependency Span
#################################################################
-
@total_ordering
+@python_2_unicode_compatible
class DependencySpan(object):
"""
A contiguous span over some part of the input string representing
to the head word for the entire span. This would be the same as the root
node if the dependency structure were depicted as a graph.
"""
-
def __init__(self, start_index, end_index, head_index, arcs, tags):
self._start_index = start_index
self._end_index = end_index
:return: A concise string representation of the ``DependencySpan``.
:rtype: str.
"""
- return "Span %d-%d; Head Index: %d" % (
- self._start_index,
- self._end_index,
- self._head_index,
- )
+ return 'Span %d-%d; Head Index: %d' % (self._start_index, self._end_index, self._head_index)
def __str__(self):
"""
:return: A verbose string representation of the ``DependencySpan``.
:rtype: str
"""
- str = "Span %d-%d; Head Index: %d" % (
- self._start_index,
- self._end_index,
- self._head_index,
- )
+ str = 'Span %d-%d; Head Index: %d' % (self._start_index, self._end_index, self._head_index)
for i in range(len(self._arcs)):
- str += "\n%d <- %d, %s" % (i, self._arcs[i], self._tags[i])
+ str += '\n%d <- %d, %s' % (i, self._arcs[i], self._tags[i])
return str
def __eq__(self, other):
- return (
- type(self) == type(other) and self._comparison_key == other._comparison_key
- )
+ return (type(self) == type(other) and
+ self._comparison_key == other._comparison_key)
def __ne__(self, other):
return not self == other
"""
return self._hash
-
#################################################################
# Chart Cell
#################################################################
-
+@python_2_unicode_compatible
class ChartCell(object):
"""
A cell from the parse chart formed when performing the CYK algorithm.
Each cell keeps track of its x and y coordinates (though this will probably
be discarded), and a list of spans serving as the cell's entries.
"""
-
def __init__(self, x, y):
"""
:param x: This cell's x coordinate.
:return: A verbose string representation of this ``ChartCell``.
:rtype: str.
"""
- return "CC[%d,%d]: %s" % (self._x, self._y, self._entries)
+ return 'CC[%d,%d]: %s' % (self._x, self._y, self._entries)
def __repr__(self):
"""
:return: A concise string representation of this ``ChartCell``.
:rtype: str.
"""
- return "%s" % self
+ return '%s' % self
#################################################################
for i in range(0, len(self._tokens) + 1):
chart.append([])
for j in range(0, len(self._tokens) + 1):
- chart[i].append(ChartCell(i, j))
- if i == j + 1:
- chart[i][j].add(DependencySpan(i - 1, i, i - 1, [-1], ["null"]))
+ chart[i].append(ChartCell(i,j))
+ if i==j+1:
+ chart[i][j].add(DependencySpan(i-1,i,i-1,[-1], ['null']))
- for i in range(1, len(self._tokens) + 1):
- for j in range(i - 2, -1, -1):
- for k in range(i - 1, j, -1):
+ for i in range(1,len(self._tokens)+1):
+ for j in range(i-2,-1,-1):
+ for k in range(i-1,j,-1):
for span1 in chart[k][j]._entries:
for span2 in chart[i][k]._entries:
for newspan in self.concatenate(span1, span2):
for parse in chart[len(self._tokens)][0]._entries:
conll_format = ""
- # malt_format = ""
+# malt_format = ""
for i in range(len(tokens)):
- # malt_format += '%s\t%s\t%d\t%s\n' % (tokens[i], 'null', parse._arcs[i] + 1, 'null')
- # conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (i+1, tokens[i], tokens[i], 'null', 'null', 'null', parse._arcs[i] + 1, 'null', '-', '-')
- # Modify to comply with the new Dependency Graph requirement (at least must have an root elements)
- conll_format += "\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n" % (
- i + 1,
- tokens[i],
- tokens[i],
- "null",
- "null",
- "null",
- parse._arcs[i] + 1,
- "ROOT",
- "-",
- "-",
- )
+# malt_format += '%s\t%s\t%d\t%s\n' % (tokens[i], 'null', parse._arcs[i] + 1, 'null')
+ #conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (i+1, tokens[i], tokens[i], 'null', 'null', 'null', parse._arcs[i] + 1, 'null', '-', '-')
+ # Modify to comply with the new Dependency Graph requirement (it must have at least a ROOT element)
+ conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (i+1, tokens[i], tokens[i], 'null', 'null', 'null', parse._arcs[i] + 1, 'ROOT', '-', '-')
dg = DependencyGraph(conll_format)
- # if self.meets_arity(dg):
+# if self.meets_arity(dg):
yield dg.tree()
+
def concatenate(self, span1, span2):
"""
Concatenates the two spans in whichever way possible. This
"""
spans = []
if span1._start_index == span2._start_index:
- print("Error: Mismatched spans - replace this with thrown error")
+ print('Error: Mismatched spans - replace this with thrown error')
if span1._start_index > span2._start_index:
temp_span = span1
span1 = span2
# adjacent rightward covered concatenation
new_arcs = span1._arcs + span2._arcs
new_tags = span1._tags + span2._tags
- if self._grammar.contains(
- self._tokens[span1._head_index], self._tokens[span2._head_index]
- ):
- # print('Performing rightward cover %d to %d' % (span1._head_index, span2._head_index))
+ if self._grammar.contains(self._tokens[span1._head_index], self._tokens[span2._head_index]):
+# print 'Performing rightward cover %d to %d' % (span1._head_index, span2._head_index)
new_arcs[span2._head_index - span1._start_index] = span1._head_index
- spans.append(
- DependencySpan(
- span1._start_index,
- span2._end_index,
- span1._head_index,
- new_arcs,
- new_tags,
- )
- )
+ spans.append(DependencySpan(span1._start_index, span2._end_index, span1._head_index, new_arcs, new_tags))
# adjacent leftward covered concatenation
new_arcs = span1._arcs + span2._arcs
- if self._grammar.contains(
- self._tokens[span2._head_index], self._tokens[span1._head_index]
- ):
- # print('performing leftward cover %d to %d' % (span2._head_index, span1._head_index))
+ if self._grammar.contains(self._tokens[span2._head_index], self._tokens[span1._head_index]):
+# print 'performing leftward cover %d to %d' % (span2._head_index, span1._head_index)
new_arcs[span1._head_index - span1._start_index] = span2._head_index
- spans.append(
- DependencySpan(
- span1._start_index,
- span2._end_index,
- span2._head_index,
- new_arcs,
- new_tags,
- )
- )
+ spans.append(DependencySpan(span1._start_index, span2._end_index, span2._head_index, new_arcs, new_tags))
return spans
for i in range(0, len(self._tokens) + 1):
chart.append([])
for j in range(0, len(self._tokens) + 1):
- chart[i].append(ChartCell(i, j))
- if i == j + 1:
- if tokens[i - 1] in self._grammar._tags:
- for tag in self._grammar._tags[tokens[i - 1]]:
- chart[i][j].add(
- DependencySpan(i - 1, i, i - 1, [-1], [tag])
- )
+ chart[i].append(ChartCell(i,j))
+ if i==j+1:
+ if tokens[i-1] in self._grammar._tags:
+ for tag in self._grammar._tags[tokens[i-1]]:
+ chart[i][j].add(DependencySpan(i-1,i,i-1,[-1], [tag]))
else:
- print(
- "No tag found for input token '%s', parse is impossible."
- % tokens[i - 1]
- )
+ print('No tag found for input token \'%s\', parse is impossible.' % tokens[i-1])
return []
- for i in range(1, len(self._tokens) + 1):
- for j in range(i - 2, -1, -1):
- for k in range(i - 1, j, -1):
+ for i in range(1,len(self._tokens)+1):
+ for j in range(i-2,-1,-1):
+ for k in range(i-1,j,-1):
for span1 in chart[k][j]._entries:
- for span2 in chart[i][k]._entries:
- for newspan in self.concatenate(span1, span2):
- chart[i][j].add(newspan)
+ for span2 in chart[i][k]._entries:
+ for newspan in self.concatenate(span1, span2):
+ chart[i][j].add(newspan)
trees = []
max_parse = None
max_score = 0
conll_format = ""
malt_format = ""
for i in range(len(tokens)):
- malt_format += "%s\t%s\t%d\t%s\n" % (
- tokens[i],
- "null",
- parse._arcs[i] + 1,
- "null",
- )
- # conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (i+1, tokens[i], tokens[i], parse._tags[i], parse._tags[i], 'null', parse._arcs[i] + 1, 'null', '-', '-')
- # Modify to comply with recent change in dependency graph such that there must be a ROOT element.
- conll_format += "\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n" % (
- i + 1,
- tokens[i],
- tokens[i],
- parse._tags[i],
- parse._tags[i],
- "null",
- parse._arcs[i] + 1,
- "ROOT",
- "-",
- "-",
- )
+ malt_format += '%s\t%s\t%d\t%s\n' % (tokens[i], 'null', parse._arcs[i] + 1, 'null')
+ #conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (i+1, tokens[i], tokens[i], parse._tags[i], parse._tags[i], 'null', parse._arcs[i] + 1, 'null', '-', '-')
+ # Modify to comply with the recent change in the dependency graph: there must be a ROOT element.
+ conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (i+1, tokens[i], tokens[i], parse._tags[i], parse._tags[i], 'null', parse._arcs[i] + 1, 'ROOT', '-', '-')
dg = DependencyGraph(conll_format)
- score = self.compute_prob(dg)
+ score = self.compute_prob(dg)
trees.append((score, dg.tree()))
trees.sort()
return (tree for (score, tree) in trees)
+
def concatenate(self, span1, span2):
"""
Concatenates the two spans in whichever way possible. This
"""
spans = []
if span1._start_index == span2._start_index:
- print("Error: Mismatched spans - replace this with thrown error")
+ print('Error: Mismatched spans - replace this with thrown error')
if span1._start_index > span2._start_index:
temp_span = span1
span1 = span2
# adjacent rightward covered concatenation
new_arcs = span1._arcs + span2._arcs
new_tags = span1._tags + span2._tags
- if self._grammar.contains(
- self._tokens[span1._head_index], self._tokens[span2._head_index]
- ):
+ if self._grammar.contains(self._tokens[span1._head_index], self._tokens[span2._head_index]):
new_arcs[span2._head_index - span1._start_index] = span1._head_index
- spans.append(
- DependencySpan(
- span1._start_index,
- span2._end_index,
- span1._head_index,
- new_arcs,
- new_tags,
- )
- )
+ spans.append(DependencySpan(span1._start_index, span2._end_index, span1._head_index, new_arcs, new_tags))
# adjacent leftward covered concatenation
new_arcs = span1._arcs + span2._arcs
new_tags = span1._tags + span2._tags
- if self._grammar.contains(
- self._tokens[span2._head_index], self._tokens[span1._head_index]
- ):
+ if self._grammar.contains(self._tokens[span2._head_index], self._tokens[span1._head_index]):
new_arcs[span1._head_index - span1._start_index] = span2._head_index
- spans.append(
- DependencySpan(
- span1._start_index,
- span2._end_index,
- span2._head_index,
- new_arcs,
- new_tags,
- )
- )
+ spans.append(DependencySpan(span1._start_index, span2._end_index, span2._head_index, new_arcs, new_tags))
return spans
def train(self, graphs):
tags = {}
for dg in graphs:
for node_index in range(1, len(dg.nodes)):
- # children = dg.nodes[node_index]['deps']
- children = list(chain(*dg.nodes[node_index]["deps"].values()))
-
+ #children = dg.nodes[node_index]['deps']
+ children = list(chain(*dg.nodes[node_index]['deps'].values()))
+
nr_left_children = dg.left_children(node_index)
nr_right_children = dg.right_children(node_index)
nr_children = nr_left_children + nr_right_children
- for child_index in range(
- 0 - (nr_left_children + 1), nr_right_children + 2
- ):
- head_word = dg.nodes[node_index]["word"]
- head_tag = dg.nodes[node_index]["tag"]
+ for child_index in range(0 - (nr_left_children + 1), nr_right_children + 2):
+ head_word = dg.nodes[node_index]['word']
+ head_tag = dg.nodes[node_index]['tag']
if head_word in tags:
tags[head_word].add(head_tag)
else:
tags[head_word] = set([head_tag])
- child = "STOP"
- child_tag = "STOP"
- prev_word = "START"
- prev_tag = "START"
+ child = 'STOP'
+ child_tag = 'STOP'
+ prev_word = 'START'
+ prev_tag = 'START'
if child_index < 0:
array_index = child_index + nr_left_children
if array_index >= 0:
- child = dg.nodes[children[array_index]]["word"]
- child_tag = dg.nodes[children[array_index]]["tag"]
+ child = dg.nodes[children[array_index]]['word']
+ child_tag = dg.nodes[children[array_index]]['tag']
if child_index != -1:
- prev_word = dg.nodes[children[array_index + 1]]["word"]
- prev_tag = dg.nodes[children[array_index + 1]]["tag"]
- if child != "STOP":
+ prev_word = dg.nodes[children[array_index + 1]]['word']
+ prev_tag = dg.nodes[children[array_index + 1]]['tag']
+ if child != 'STOP':
productions.append(DependencyProduction(head_word, [child]))
- head_event = "(head (%s %s) (mods (%s, %s, %s) left))" % (
- child,
- child_tag,
- prev_tag,
- head_word,
- head_tag,
- )
- mod_event = "(mods (%s, %s, %s) left))" % (
- prev_tag,
- head_word,
- head_tag,
- )
+ head_event = '(head (%s %s) (mods (%s, %s, %s) left))' % (child, child_tag, prev_tag, head_word, head_tag)
+ mod_event = '(mods (%s, %s, %s) left))' % (prev_tag, head_word, head_tag)
events[head_event] += 1
events[mod_event] += 1
elif child_index > 0:
array_index = child_index + nr_left_children - 1
if array_index < nr_children:
- child = dg.nodes[children[array_index]]["word"]
- child_tag = dg.nodes[children[array_index]]["tag"]
+ child = dg.nodes[children[array_index]]['word']
+ child_tag = dg.nodes[children[array_index]]['tag']
if child_index != 1:
- prev_word = dg.nodes[children[array_index - 1]]["word"]
- prev_tag = dg.nodes[children[array_index - 1]]["tag"]
- if child != "STOP":
+ prev_word = dg.nodes[children[array_index - 1]]['word']
+ prev_tag = dg.nodes[children[array_index - 1]]['tag']
+ if child != 'STOP':
productions.append(DependencyProduction(head_word, [child]))
- head_event = "(head (%s %s) (mods (%s, %s, %s) right))" % (
- child,
- child_tag,
- prev_tag,
- head_word,
- head_tag,
- )
- mod_event = "(mods (%s, %s, %s) right))" % (
- prev_tag,
- head_word,
- head_tag,
- )
+ head_event = '(head (%s %s) (mods (%s, %s, %s) right))' % (child, child_tag, prev_tag, head_word, head_tag)
+ mod_event = '(mods (%s, %s, %s) right))' % (prev_tag, head_word, head_tag)
events[head_event] += 1
events[mod_event] += 1
self._grammar = ProbabilisticDependencyGrammar(productions, events, tags)
"""
prob = 1.0
for node_index in range(1, len(dg.nodes)):
- # children = dg.nodes[node_index]['deps']
- children = list(chain(*dg.nodes[node_index]["deps"].values()))
-
+ #children = dg.nodes[node_index]['deps']
+ children = list(chain(*dg.nodes[node_index]['deps'].values()))
+
nr_left_children = dg.left_children(node_index)
nr_right_children = dg.right_children(node_index)
nr_children = nr_left_children + nr_right_children
for child_index in range(0 - (nr_left_children + 1), nr_right_children + 2):
- head_word = dg.nodes[node_index]["word"]
- head_tag = dg.nodes[node_index]["tag"]
- child = "STOP"
- child_tag = "STOP"
- prev_word = "START"
- prev_tag = "START"
+ head_word = dg.nodes[node_index]['word']
+ head_tag = dg.nodes[node_index]['tag']
+ child = 'STOP'
+ child_tag = 'STOP'
+ prev_word = 'START'
+ prev_tag = 'START'
if child_index < 0:
array_index = child_index + nr_left_children
if array_index >= 0:
- child = dg.nodes[children[array_index]]["word"]
- child_tag = dg.nodes[children[array_index]]["tag"]
+ child = dg.nodes[children[array_index]]['word']
+ child_tag = dg.nodes[children[array_index]]['tag']
if child_index != -1:
- prev_word = dg.nodes[children[array_index + 1]]["word"]
- prev_tag = dg.nodes[children[array_index + 1]]["tag"]
- head_event = "(head (%s %s) (mods (%s, %s, %s) left))" % (
- child,
- child_tag,
- prev_tag,
- head_word,
- head_tag,
- )
- mod_event = "(mods (%s, %s, %s) left))" % (
- prev_tag,
- head_word,
- head_tag,
- )
+ prev_word = dg.nodes[children[array_index + 1]]['word']
+ prev_tag = dg.nodes[children[array_index + 1]]['tag']
+ head_event = '(head (%s %s) (mods (%s, %s, %s) left))' % (child, child_tag, prev_tag, head_word, head_tag)
+ mod_event = '(mods (%s, %s, %s) left))' % (prev_tag, head_word, head_tag)
h_count = self._grammar._events[head_event]
m_count = self._grammar._events[mod_event]
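# h_count / m_count is a relative-frequency estimate of
# P(child, child_tag | prev_tag, head_word, head_tag, direction):
# the head event is simply the mod event extended with the chosen
# child, so the ratio of their counts conditions on the modifier
# context.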
-
- # If the grammar is not covered
+
+ # If this event is not covered by the grammar
if m_count != 0:
- prob *= h_count / m_count
+ prob *= (h_count / m_count)
else:
- prob = 0.00000001 # Very small number
-
+ prob = 0.00000001 # Very small number
+
elif child_index > 0:
array_index = child_index + nr_left_children - 1
if array_index < nr_children:
- child = dg.nodes[children[array_index]]["word"]
- child_tag = dg.nodes[children[array_index]]["tag"]
+ child = dg.nodes[children[array_index]]['word']
+ child_tag = dg.nodes[children[array_index]]['tag']
if child_index != 1:
- prev_word = dg.nodes[children[array_index - 1]]["word"]
- prev_tag = dg.nodes[children[array_index - 1]]["tag"]
- head_event = "(head (%s %s) (mods (%s, %s, %s) right))" % (
- child,
- child_tag,
- prev_tag,
- head_word,
- head_tag,
- )
- mod_event = "(mods (%s, %s, %s) right))" % (
- prev_tag,
- head_word,
- head_tag,
- )
+ prev_word = dg.nodes[children[array_index - 1]]['word']
+ prev_tag = dg.nodes[children[array_index - 1]]['tag']
+ head_event = '(head (%s %s) (mods (%s, %s, %s) right))' % (child, child_tag, prev_tag, head_word, head_tag)
+ mod_event = '(mods (%s, %s, %s) right))' % (prev_tag, head_word, head_tag)
h_count = self._grammar._events[head_event]
m_count = self._grammar._events[mod_event]
if m_count != 0:
- prob *= h_count / m_count
+ prob *= (h_count / m_count)
else:
- prob = 0.00000001 # Very small number
+ prob = 0.00000001 # Very small number
return prob
# Demos
#################################################################
-
def demo():
projective_rule_parse_demo()
- # arity_parse_demo()
+# arity_parse_demo()
projective_prob_parse_demo()
``DependencyGrammar`` to perform a projective dependency
parse.
"""
- grammar = DependencyGrammar.fromstring(
- """
+ grammar = DependencyGrammar.fromstring("""
'scratch' -> 'cats' | 'walls'
'walls' -> 'the'
'cats' -> 'the'
- """
- )
+ """)
print(grammar)
pdp = ProjectiveDependencyParser(grammar)
- trees = pdp.parse(["the", "cats", "scratch", "the", "walls"])
+ trees = pdp.parse(['the', 'cats', 'scratch', 'the', 'walls'])
for tree in trees:
print(tree)
-
def arity_parse_demo():
"""
A demonstration showing the creation of a ``DependencyGrammar``
in which a specific number of modifiers is listed for a given
head. This can further constrain the number of possible parses
created by a ``ProjectiveDependencyParser``.
"""
print()
- print("A grammar with no arity constraints. Each DependencyProduction")
- print("specifies a relationship between one head word and only one")
- print("modifier word.")
- grammar = DependencyGrammar.fromstring(
- """
+ print('A grammar with no arity constraints. Each DependencyProduction')
+ print('specifies a relationship between one head word and only one')
+ print('modifier word.')
+ grammar = DependencyGrammar.fromstring("""
'fell' -> 'price' | 'stock'
'price' -> 'of' | 'the'
'of' -> 'stock'
'stock' -> 'the'
- """
- )
+ """)
print(grammar)
print()
- print("For the sentence 'The price of the stock fell', this grammar")
- print("will produce the following three parses:")
+ print('For the sentence \'The price of the stock fell\', this grammar')
+ print('will produce the following three parses:')
pdp = ProjectiveDependencyParser(grammar)
- trees = pdp.parse(["the", "price", "of", "the", "stock", "fell"])
+ trees = pdp.parse(['the', 'price', 'of', 'the', 'stock', 'fell'])
for tree in trees:
print(tree)
print()
- print("By contrast, the following grammar contains a ")
- print("DependencyProduction that specifies a relationship")
- print("between a single head word, 'price', and two modifier")
- print("words, 'of' and 'the'.")
- grammar = DependencyGrammar.fromstring(
- """
+ print('By contrast, the following grammar contains a ')
+ print('DependencyProduction that specifies a relationship')
+ print('between a single head word, \'price\', and two modifier')
+ print('words, \'of\' and \'the\'.')
+ grammar = DependencyGrammar.fromstring("""
'fell' -> 'price' | 'stock'
'price' -> 'of' 'the'
'of' -> 'stock'
'stock' -> 'the'
- """
- )
+ """)
print(grammar)
print()
- print(
- "This constrains the number of possible parses to just one:"
- ) # unimplemented, soon to replace
+ print('This constrains the number of possible parses to just one:') # unimplemented, soon to replace
pdp = ProjectiveDependencyParser(grammar)
- trees = pdp.parse(["the", "price", "of", "the", "stock", "fell"])
+ trees = pdp.parse(['the', 'price', 'of', 'the', 'stock', 'fell'])
for tree in trees:
print(tree)
"""
from nltk.parse.dependencygraph import conll_data2
- graphs = [DependencyGraph(entry) for entry in conll_data2.split("\n\n") if entry]
+ graphs = [DependencyGraph(entry)
+ for entry in conll_data2.split('\n\n') if entry]
ppdp = ProbabilisticProjectiveDependencyParser()
- print("Training Probabilistic Projective Dependency Parser...")
+ print('Training Probabilistic Projective Dependency Parser...')
ppdp.train(graphs)
-
- sent = ["Cathy", "zag", "hen", "wild", "zwaaien", "."]
- print("Parsing '", " ".join(sent), "'...")
- print("Parse:")
+
+ sent = ['Cathy', 'zag', 'hen', 'wild', 'zwaaien', '.']
+ print('Parsing \'', ' '.join(sent), '\'...')
+ print('Parse:')
for tree in ppdp.parse(sent):
print(tree)
-
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: Recursive Descent Parser
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals
from nltk.grammar import Nonterminal
from nltk.tree import Tree, ImmutableTree
+from nltk.compat import unicode_repr
from nltk.parse.api import ParserI
:see: ``nltk.grammar``
"""
-
def __init__(self, grammar, trace=0):
"""
Create a new ``RecursiveDescentParser``, that uses ``grammar``
"""
tree_leaf = tree[frontier[0]]
- if len(rtext) > 0 and tree_leaf == rtext[0]:
+ if (len(rtext) > 0 and tree_leaf == rtext[0]):
# If it's a terminal that matches rtext[0], then substitute
# in the token, and continue parsing.
newtree = tree.copy(deep=True)
leaves that have not yet been matched.
"""
- if production is None:
- productions = self._grammar.productions()
- else:
- productions = [production]
+ if production is None: productions = self._grammar.productions()
+ else: productions = [production]
for production in productions:
lhs = production.lhs().symbol()
else:
newtree = tree.copy(deep=True)
newtree[frontier[0]] = subtree
- new_frontier = [
- frontier[0] + (i,) for i in range(len(production.rhs()))
- ]
+ new_frontier = [frontier[0]+(i,) for i in
+ range(len(production.rhs()))]
if self._trace:
self._trace_expand(newtree, new_frontier, production)
- for result in self._parse(
- remaining_text, newtree, new_frontier + frontier[1:]
- ):
+ for result in self._parse(remaining_text, newtree,
+ new_frontier + frontier[1:]):
yield result
def _production_to_tree(self, production):
:rtype: None
"""
- if treeloc == ():
- print("*", end=" ")
+ if treeloc == (): print("*", end=' ')
if isinstance(tree, Tree):
if len(tree) == 0:
- print(repr(Nonterminal(tree.label())), end=" ")
+ print(unicode_repr(Nonterminal(tree.label())), end=' ')
for i in range(len(tree)):
if treeloc is not None and i == treeloc[0]:
self._trace_fringe(tree[i], treeloc[1:])
else:
self._trace_fringe(tree[i])
else:
- print(repr(tree), end=" ")
+ print(unicode_repr(tree), end=' ')
def _trace_tree(self, tree, frontier, operation):
"""
generated the current state.
:rtype: None
"""
- if self._trace == 2:
- print(" %c [" % operation, end=" ")
- else:
- print(" [", end=" ")
- if len(frontier) > 0:
- self._trace_fringe(tree, frontier[0])
- else:
- self._trace_fringe(tree)
- print("]")
+ if self._trace == 2: print(' %c [' % operation, end=' ')
+ else: print(' [', end=' ')
+ if len(frontier) > 0: self._trace_fringe(tree, frontier[0])
+ else: self._trace_fringe(tree)
+ print(']')
def _trace_start(self, tree, frontier, text):
- print("Parsing %r" % " ".join(text))
- if self._trace > 2:
- print("Start:")
- if self._trace > 1:
- self._trace_tree(tree, frontier, " ")
+ print('Parsing %r' % " ".join(text))
+ if self._trace > 2: print('Start:')
+ if self._trace > 1: self._trace_tree(tree, frontier, ' ')
def _trace_expand(self, tree, frontier, production):
- if self._trace > 2:
- print("Expand: %s" % production)
- if self._trace > 1:
- self._trace_tree(tree, frontier, "E")
+ if self._trace > 2: print('Expand: %s' % production)
+ if self._trace > 1: self._trace_tree(tree, frontier, 'E')
def _trace_match(self, tree, frontier, tok):
- if self._trace > 2:
- print("Match: %r" % tok)
- if self._trace > 1:
- self._trace_tree(tree, frontier, "M")
+ if self._trace > 2: print('Match: %r' % tok)
+ if self._trace > 1: self._trace_tree(tree, frontier, 'M')
def _trace_succeed(self, tree, frontier):
- if self._trace > 2:
- print("GOOD PARSE:")
- if self._trace == 1:
- print("Found a parse:\n%s" % tree)
- if self._trace > 1:
- self._trace_tree(tree, frontier, "+")
+ if self._trace > 2: print('GOOD PARSE:')
+ if self._trace == 1: print('Found a parse:\n%s' % tree)
+ if self._trace > 1: self._trace_tree(tree, frontier, '+')
def _trace_backtrack(self, tree, frontier, toks=None):
if self._trace > 2:
- if toks:
- print("Backtrack: %r match failed" % toks[0])
- else:
- print("Backtrack")
-
+ if toks: print('Backtrack: %r match failed' % toks[0])
+ else: print('Backtrack')
##//////////////////////////////////////////////////////
## Stepping Recursive Descent Parser
or not to match a token.
:see: ``nltk.grammar``
"""
-
def __init__(self, grammar, trace=0):
super(SteppingRecursiveDescentParser, self).__init__(grammar, trace)
self._rtext = None
# something nicer when we get the chance.
def _freeze(self, tree):
c = tree.copy()
- # for pos in c.treepositions('leaves'):
- # c[pos] = c[pos].freeze()
+# for pos in c.treepositions('leaves'):
+# c[pos] = c[pos].freeze()
return ImmutableTree.convert(c)
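# Freezing matters because ordinary Tree objects are mutable and
# unhashable; the immutable copy can be used as a dictionary key when
# recording which expansions and matches have been tried
# (self._tried_e / self._tried_m below).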
def parse(self, tokens):
# Try matching (if we haven't already)
if self.untried_match():
token = self.match()
- if token is not None:
- return token
+ if token is not None: return token
# Try expanding.
production = self.expand()
- if production is not None:
- return production
+ if production is not None: return production
# Try backtracking
if self.backtrack():
# If they didn't specify a production, check all untried ones.
if production is None:
productions = self.untried_expandable_productions()
- else:
- productions = [production]
+ else: productions = [production]
parses = []
for prod in productions:
:return: true if an operation was successfully undone.
:rtype: bool
"""
- if len(self._history) == 0:
- return False
+ if len(self._history) == 0: return False
(self._rtext, self._tree, self._frontier) = self._history.pop()
return True
:rtype: list(Production)
"""
# Make sure we *can* expand.
- if len(self._frontier) == 0:
- return []
+ if len(self._frontier) == 0: return []
frontier_child = self._tree[self._frontier[0]]
- if len(self._frontier) == 0 or not isinstance(frontier_child, Tree):
+ if (len(self._frontier) == 0 or
+ not isinstance(frontier_child, Tree)):
return []
- return [
- p
- for p in self._grammar.productions()
- if p.lhs().symbol() == frontier_child.label()
- ]
+ return [p for p in self._grammar.productions()
+ if p.lhs().symbol() == frontier_child.label()]
def untried_expandable_productions(self):
"""
"""
tried_expansions = self._tried_e.get(self._freeze(self._tree), [])
- return [p for p in self.expandable_productions() if p not in tried_expansions]
+ return [p for p in self.expandable_productions()
+ if p not in tried_expansions]
def untried_match(self):
"""
:rtype: bool
"""
- if len(self._rtext) == 0:
- return False
+ if len(self._rtext) == 0: return False
tried_matches = self._tried_m.get(self._freeze(self._tree), [])
- return self._rtext[0] not in tried_matches
+ return (self._rtext[0] not in tried_matches)
def currently_complete(self):
"""
complete parse.
:rtype: bool
"""
- return len(self._frontier) == 0 and len(self._rtext) == 0
+ return (len(self._frontier) == 0 and len(self._rtext) == 0)
def _parse(self, remaining_text, tree, frontier):
"""
:return: ``[1]``
:rtype: list of int
"""
- self._history.append((self._rtext, self._tree, self._frontier))
+ self._history.append( (self._rtext, self._tree, self._frontier) )
self._rtext = remaining_text
self._tree = tree
self._frontier = frontier
# Is it a good parse? If so, record it.
- if len(frontier) == 0 and len(remaining_text) == 0:
+ if (len(frontier) == 0 and len(remaining_text) == 0):
self._parses.append(tree)
self._trace_succeed(self._tree, self._frontier)
"""
self._grammar = grammar
-
##//////////////////////////////////////////////////////
## Demonstration Code
##//////////////////////////////////////////////////////
-
def demo():
"""
A demonstration of the recursive descent parser.
from nltk import parse, CFG
- grammar = CFG.fromstring(
- """
+ grammar = CFG.fromstring("""
S -> NP VP
NP -> Det N | Det N PP
VP -> V NP | V NP PP
Det -> 'the' | 'a'
P -> 'in' | 'with'
V -> 'saw'
- """
- )
+ """)
for prod in grammar.productions():
print(prod)
- sent = "I saw a man in the park".split()
+ sent = 'I saw a man in the park'.split()
parser = parse.RecursiveDescentParser(grammar, trace=2)
for p in parser.parse(sent):
print(p)
-
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# Natural Language Toolkit: Shift-Reduce Parser
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals
from nltk.grammar import Nonterminal
from nltk.tree import Tree
+from nltk.compat import unicode_repr
from nltk.parse.api import ParserI
:see: ``nltk.grammar``
"""
-
def __init__(self, grammar, trace=0):
"""
Create a new ``ShiftReduceParser``, that uses ``grammar`` to
# Trace output.
if self._trace:
- print("Parsing %r" % " ".join(tokens))
+ print('Parsing %r' % " ".join(tokens))
self._trace_stack(stack, remaining_text)
# iterate through the text, pushing the token onto
# the stack, then reducing the stack.
while len(remaining_text) > 0:
self._shift(stack, remaining_text)
- while self._reduce(stack, remaining_text):
- pass
+ while self._reduce(stack, remaining_text): pass
# Did we reduce everything?
- if len(stack) == 1:
+ if len(stack) == 1:
# Did we end up with the right category?
if stack[0].label() == self._grammar.start().symbol():
yield stack[0]
"""
stack.append(remaining_text[0])
remaining_text.remove(remaining_text[0])
- if self._trace:
- self._trace_shift(stack, remaining_text)
+ if self._trace: self._trace_shift(stack, remaining_text)
def _match_rhs(self, rhs, rightmost_stack):
"""
stack.
"""
- if len(rightmost_stack) != len(rhs):
- return False
+ if len(rightmost_stack) != len(rhs): return False
for i in range(len(rightmost_stack)):
if isinstance(rightmost_stack[i], Tree):
- if not isinstance(rhs[i], Nonterminal):
- return False
- if rightmost_stack[i].label() != rhs[i].symbol():
- return False
+ if not isinstance(rhs[i], Nonterminal): return False
+ if rightmost_stack[i].label() != rhs[i].symbol(): return False
else:
- if isinstance(rhs[i], Nonterminal):
- return False
- if rightmost_stack[i] != rhs[i]:
- return False
+ if isinstance(rhs[i], Nonterminal): return False
+ if rightmost_stack[i] != rhs[i]: return False
return True
def _reduce(self, stack, remaining_text, production=None):
# 3: display which tokens & productions are shifted/reduced
self._trace = trace
- def _trace_stack(self, stack, remaining_text, marker=" "):
+ def _trace_stack(self, stack, remaining_text, marker=' '):
"""
Print trace output displaying the given stack and text.
stack. This is used with trace level 2 to print 'S'
before shifted stacks and 'R' before reduced stacks.
"""
- s = " " + marker + " [ "
+ s = ' '+marker+' [ '
for elt in stack:
if isinstance(elt, Tree):
- s += repr(Nonterminal(elt.label())) + " "
+ s += unicode_repr(Nonterminal(elt.label())) + ' '
else:
- s += repr(elt) + " "
- s += "* " + " ".join(remaining_text) + "]"
+ s += unicode_repr(elt) + ' '
+ s += '* ' + ' '.join(remaining_text) + ']'
print(s)
def _trace_shift(self, stack, remaining_text):
:rtype: None
"""
- if self._trace > 2:
- print("Shift %r:" % stack[-1])
- if self._trace == 2:
- self._trace_stack(stack, remaining_text, "S")
- elif self._trace > 0:
- self._trace_stack(stack, remaining_text)
+ if self._trace > 2: print('Shift %r:' % stack[-1])
+ if self._trace == 2: self._trace_stack(stack, remaining_text, 'S')
+ elif self._trace > 0: self._trace_stack(stack, remaining_text)
def _trace_reduce(self, stack, production, remaining_text):
"""
"""
if self._trace > 2:
rhs = " ".join(production.rhs())
- print("Reduce %r <- %s" % (production.lhs(), rhs))
- if self._trace == 2:
- self._trace_stack(stack, remaining_text, "R")
- elif self._trace > 1:
- self._trace_stack(stack, remaining_text)
+ print('Reduce %r <- %s' % (production.lhs(), rhs))
+ if self._trace == 2: self._trace_stack(stack, remaining_text, 'R')
+ elif self._trace > 1: self._trace_stack(stack, remaining_text)
def _check_grammar(self):
"""
# Any production whose RHS is an extension of another production's RHS
# will never be used.
for i in range(len(productions)):
- for j in range(i + 1, len(productions)):
+ for j in range(i+1, len(productions)):
rhs1 = productions[i].rhs()
rhs2 = productions[j].rhs()
- if rhs1[: len(rhs2)] == rhs2:
- print("Warning: %r will never be used" % productions[i])
-
+ if rhs1[:len(rhs2)] == rhs2:
+ print('Warning: %r will never be used' % productions[i])
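# For example, given both NP -> Det N and NP -> Det N PP, the parser
# reduces 'Det N' as soon as it reaches the top of the stack, so the
# full 'Det N PP' sequence can never appear there and the longer
# production is unreachable.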
##//////////////////////////////////////////////////////
## Stepping Shift/Reduce Parser
history is used to implement the ``undo`` operation.
:see: ``nltk.grammar``
"""
-
def __init__(self, grammar, trace=0):
super(SteppingShiftReduceParser, self).__init__(grammar, trace)
self._stack = None
:return: True if the shift operation was successful.
:rtype: bool
"""
- if len(self._remaining_text) == 0:
- return False
- self._history.append((self._stack[:], self._remaining_text[:]))
+ if len(self._remaining_text) == 0: return False
+ self._history.append( (self._stack[:], self._remaining_text[:]) )
self._shift(self._stack, self._remaining_text)
return True
:rtype: Production or None
"""
- self._history.append((self._stack[:], self._remaining_text[:]))
- return_val = self._reduce(self._stack, self._remaining_text, production)
+ self._history.append( (self._stack[:], self._remaining_text[:]) )
+ return_val = self._reduce(self._stack, self._remaining_text,
+ production)
- if not return_val:
- self._history.pop()
+ if not return_val: self._history.pop()
return return_val
def undo(self):
:return: true if an operation was successfully undone.
:rtype: bool
"""
- if len(self._history) == 0:
- return False
+ if len(self._history) == 0: return False
(self._stack, self._remaining_text) = self._history.pop()
return True
parser so far.
:rtype: iter(Tree)
"""
- if (
- len(self._remaining_text) == 0
- and len(self._stack) == 1
- and self._stack[0].label() == self._grammar.start().symbol()
- ):
+ if (len(self._remaining_text) == 0 and
+ len(self._stack) == 1 and
+ self._stack[0].label() == self._grammar.start().symbol()
+ ):
yield self._stack[0]
- # copied from nltk.parser
+# copied from nltk.parser
def set_grammar(self, grammar):
"""
"""
self._grammar = grammar
-
##//////////////////////////////////////////////////////
## Demonstration Code
##//////////////////////////////////////////////////////
-
def demo():
"""
A demonstration of the shift-reduce parser.
from nltk import parse, CFG
- grammar = CFG.fromstring(
- """
+ grammar = CFG.fromstring("""
S -> NP VP
NP -> Det N | Det N PP
VP -> V NP | V NP PP
Det -> 'the' | 'a'
P -> 'in' | 'with'
V -> 'saw'
- """
- )
+ """)
- sent = "I saw a man in the park".split()
+ sent = 'I saw a man in the park'.split()
parser = parse.ShiftReduceParser(grammar, trace=2)
for p in parser.parse(sent):
print(p)
-
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to the Stanford Parser
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Xu <xxu@student.unimelb.edu.au>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import unicode_literals
+
import tempfile
import os
+import re
import warnings
-from unittest import skip
from subprocess import PIPE
+from io import StringIO
+
+from six import text_type
-from nltk.internals import (
- find_jar_iter,
- config_java,
- java,
- _java_options,
- find_jars_within_path,
-)
+from nltk.internals import find_jar, find_jar_iter, config_java, java, _java_options, find_jars_within_path
from nltk.parse.api import ParserI
from nltk.parse.dependencygraph import DependencyGraph
from nltk.tree import Tree
-_stanford_url = "https://nlp.stanford.edu/software/lex-parser.shtml"
-
+_stanford_url = 'https://nlp.stanford.edu/software/lex-parser.shtml'
class GenericStanfordParser(ParserI):
"""Interface to the Stanford Parser"""
- _MODEL_JAR_PATTERN = r"stanford-parser-(\d+)(\.(\d+))+-models\.jar"
- _JAR = r"stanford-parser\.jar"
- _MAIN_CLASS = "edu.stanford.nlp.parser.lexparser.LexicalizedParser"
+ _MODEL_JAR_PATTERN = r'stanford-parser-(\d+)(\.(\d+))+-models\.jar'
+ _JAR = r'stanford-parser\.jar'
+ _MAIN_CLASS = 'edu.stanford.nlp.parser.lexparser.LexicalizedParser'
_USE_STDIN = False
_DOUBLE_SPACED_OUTPUT = False
- def __init__(
- self,
- path_to_jar=None,
- path_to_models_jar=None,
- model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz",
- encoding="utf8",
- verbose=False,
- java_options="-mx4g",
- corenlp_options="",
- ):
+ def __init__(self, path_to_jar=None, path_to_models_jar=None,
+ model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
+ encoding='utf8', verbose=False,
+ java_options='-mx1000m', corenlp_options=''):
# find the most recent code and model jar
stanford_jar = max(
find_jar_iter(
- self._JAR,
- path_to_jar,
- env_vars=("STANFORD_PARSER", "STANFORD_CORENLP"),
- searchpath=(),
- url=_stanford_url,
- verbose=verbose,
- is_regex=True,
+ self._JAR, path_to_jar,
+ env_vars=('STANFORD_PARSER', 'STANFORD_CORENLP'),
+ searchpath=(), url=_stanford_url,
+ verbose=verbose, is_regex=True
),
- key=lambda model_path: os.path.dirname(model_path),
+ key=lambda model_path: os.path.dirname(model_path)
)
- model_jar = max(
+ model_jar=max(
find_jar_iter(
- self._MODEL_JAR_PATTERN,
- path_to_models_jar,
- env_vars=("STANFORD_MODELS", "STANFORD_CORENLP"),
- searchpath=(),
- url=_stanford_url,
- verbose=verbose,
- is_regex=True,
+ self._MODEL_JAR_PATTERN, path_to_models_jar,
+ env_vars=('STANFORD_MODELS', 'STANFORD_CORENLP'),
+ searchpath=(), url=_stanford_url,
+ verbose=verbose, is_regex=True
),
- key=lambda model_path: os.path.dirname(model_path),
+ key=lambda model_path: os.path.dirname(model_path)
)
- # self._classpath = (stanford_jar, model_jar)
+
+ #self._classpath = (stanford_jar, model_jar)
# Adding logging jar files to classpath
stanford_dir = os.path.split(stanford_jar)[0]
cur_trees = []
blank = False
for line in output_.splitlines(False):
- if line == "":
+ if line == '':
if blank:
res.append(iter(cur_trees))
cur_trees = []
blank = False
elif self._DOUBLE_SPACED_OUTPUT:
- cur_trees.append(self._make_tree("\n".join(cur_lines)))
+ cur_trees.append(self._make_tree('\n'.join(cur_lines)))
cur_lines = []
blank = True
else:
- res.append(iter([self._make_tree("\n".join(cur_lines))]))
+ res.append(iter([self._make_tree('\n'.join(cur_lines))]))
cur_lines = []
else:
cur_lines.append(line)
"""
cmd = [
self._MAIN_CLASS,
- "-model",
- self.model_path,
- "-sentences",
- "newline",
- "-outputFormat",
- self._OUTPUT_FORMAT,
- "-tokenized",
- "-escaper",
- "edu.stanford.nlp.process.PTBEscapingProcessor",
+ '-model', self.model_path,
+ '-sentences', 'newline',
+ '-outputFormat', self._OUTPUT_FORMAT,
+ '-tokenized',
+ '-escaper', 'edu.stanford.nlp.process.PTBEscapingProcessor',
]
- return self._parse_trees_output(
- self._execute(
- cmd, "\n".join(" ".join(sentence) for sentence in sentences), verbose
- )
- )
+ return self._parse_trees_output(self._execute(
+ cmd, '\n'.join(' '.join(sentence) for sentence in sentences), verbose))
def raw_parse(self, sentence, verbose=False):
"""
"""
cmd = [
self._MAIN_CLASS,
- "-model",
- self.model_path,
- "-sentences",
- "newline",
- "-outputFormat",
- self._OUTPUT_FORMAT,
+ '-model', self.model_path,
+ '-sentences', 'newline',
+ '-outputFormat', self._OUTPUT_FORMAT,
]
- return self._parse_trees_output(
- self._execute(cmd, "\n".join(sentences), verbose)
- )
+ return self._parse_trees_output(self._execute(cmd, '\n'.join(sentences), verbose))
def tagged_parse(self, sentence, verbose=False):
"""
:type sentences: list(list(tuple(str, str)))
:rtype: iter(iter(Tree))
"""
- tag_separator = "/"
+ tag_separator = '/'
cmd = [
self._MAIN_CLASS,
- "-model",
- self.model_path,
- "-sentences",
- "newline",
- "-outputFormat",
- self._OUTPUT_FORMAT,
- "-tokenized",
- "-tagSeparator",
- tag_separator,
- "-tokenizerFactory",
- "edu.stanford.nlp.process.WhitespaceTokenizer",
- "-tokenizerMethod",
- "newCoreLabelTokenizerFactory",
+ '-model', self.model_path,
+ '-sentences', 'newline',
+ '-outputFormat', self._OUTPUT_FORMAT,
+ '-tokenized',
+ '-tagSeparator', tag_separator,
+ '-tokenizerFactory', 'edu.stanford.nlp.process.WhitespaceTokenizer',
+ '-tokenizerMethod', 'newCoreLabelTokenizerFactory',
]
# We don't need to escape slashes as "splitting is done on the last instance of the character in the token"
- return self._parse_trees_output(
- self._execute(
- cmd,
- "\n".join(
- " ".join(tag_separator.join(tagged) for tagged in sentence)
- for sentence in sentences
- ),
- verbose,
- )
- )
+ return self._parse_trees_output(self._execute(
+ cmd, '\n'.join(' '.join(tag_separator.join(tagged) for tagged in sentence) for sentence in sentences), verbose))
def _execute(self, cmd, input_, verbose=False):
encoding = self._encoding
- cmd.extend(["-encoding", encoding])
+ cmd.extend(['-encoding', encoding])
if self.corenlp_options:
cmd.append(self.corenlp_options)
- default_options = " ".join(_java_options)
+ default_options = ' '.join(_java_options)
# Configure java.
config_java(options=self.java_options, verbose=verbose)
# Windows is incompatible with NamedTemporaryFile() without passing in delete=False.
- with tempfile.NamedTemporaryFile(mode="wb", delete=False) as input_file:
+ with tempfile.NamedTemporaryFile(mode='wb', delete=False) as input_file:
# Write the actual sentences to the temporary input file
- if isinstance(input_, str) and encoding:
+ if isinstance(input_, text_type) and encoding:
input_ = input_.encode(encoding)
input_file.write(input_)
input_file.flush()
# Run the tagger and get the output.
if self._USE_STDIN:
input_file.seek(0)
- stdout, stderr = java(
- cmd,
- classpath=self._classpath,
- stdin=input_file,
- stdout=PIPE,
- stderr=PIPE,
- )
+ stdout, stderr = java(cmd, classpath=self._classpath,
+ stdin=input_file, stdout=PIPE, stderr=PIPE)
else:
cmd.append(input_file.name)
- stdout, stderr = java(
- cmd, classpath=self._classpath, stdout=PIPE, stderr=PIPE
- )
+ stdout, stderr = java(cmd, classpath=self._classpath,
+ stdout=PIPE, stderr=PIPE)
- stdout = stdout.replace(b"\xc2\xa0", b" ")
- stdout = stdout.replace(b"\x00\xa0", b" ")
+ stdout = stdout.replace(b'\xc2\xa0',b' ')
+ stdout = stdout.replace(b'\x00\xa0',b' ')
stdout = stdout.decode(encoding)
os.unlink(input_file.name)
return stdout
-
class StanfordParser(GenericStanfordParser):
"""
>>> parser=StanfordParser(
[Tree('ROOT', [Tree('S', [Tree('NP', [Tree('PRP', ['I'])]), Tree('VP', [Tree('VBP', ["'m"]),
Tree('NP', [Tree('DT', ['a']), Tree('NN', ['dog'])])])])]), Tree('ROOT', [Tree('S', [Tree('NP',
[Tree('DT', ['This'])]), Tree('VP', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('NP', [Tree('PRP$', ['my']),
- Tree('NNS', ['friends']), Tree('POS', ["'"])]), Tree('NN', ['cat'])]), Tree('PRN', [Tree('-LRB-', [Tree('', []),
- Tree('NP', [Tree('DT', ['the']), Tree('NN', ['tabby'])]), Tree('-RRB-', [])])])])])])])]
+ Tree('NNS', ['friends']), Tree('POS', ["'"])]), Tree('NN', ['cat'])]), Tree('PRN', [Tree('-LRB-', ['-LRB-']),
+ Tree('NP', [Tree('DT', ['the']), Tree('NN', ['tabby'])]), Tree('-RRB-', ['-RRB-'])])])])])])]
>>> sum([list(dep_graphs) for dep_graphs in parser.tagged_parse_sents((
... (
[Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])]), Tree('.', ['.'])])])]
"""
- _OUTPUT_FORMAT = "penn"
-
- def __init__(self, *args, **kwargs):
- warnings.warn(
- "The StanfordParser will be deprecated\n"
- "Please use \033[91mnltk.parse.corenlp.CoreNLPParser\033[0m instead.",
- DeprecationWarning,
- stacklevel=2,
- )
-
- super(StanfordParser, self).__init__(*args, **kwargs)
+ _OUTPUT_FORMAT = 'penn'
def _make_tree(self, result):
return Tree.fromstring(result)
"""
- _OUTPUT_FORMAT = "conll2007"
-
- def __init__(self, *args, **kwargs):
- warnings.warn(
- "The StanfordDependencyParser will be deprecated\n"
- "Please use \033[91mnltk.parse.corenlp.CoreNLPDependencyParser\033[0m instead.",
- DeprecationWarning,
- stacklevel=2,
- )
-
- super(StanfordDependencyParser, self).__init__(*args, **kwargs)
+ _OUTPUT_FORMAT = 'conll2007'
def _make_tree(self, result):
- return DependencyGraph(result, top_relation_label="root")
+ return DependencyGraph(result, top_relation_label='root')
class StanfordNeuralDependencyParser(GenericStanfordParser):
- """
+ '''
>>> from nltk.parse.stanford import StanfordNeuralDependencyParser
- >>> dep_parser=StanfordNeuralDependencyParser(java_options='-mx4g')
+ >>> dep_parser=StanfordNeuralDependencyParser(java_options='-mx3g')
>>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE
[Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy']), '.'])]
... ))], []) # doctest: +NORMALIZE_WHITESPACE
[Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends',
['my', "'"]), Tree('tabby', ['-LRB-', 'the', '-RRB-'])])]
- """
+ '''
- _OUTPUT_FORMAT = "conll"
- _MAIN_CLASS = "edu.stanford.nlp.pipeline.StanfordCoreNLP"
- _JAR = r"stanford-corenlp-(\d+)(\.(\d+))+\.jar"
- _MODEL_JAR_PATTERN = r"stanford-corenlp-(\d+)(\.(\d+))+-models\.jar"
+ _OUTPUT_FORMAT = 'conll'
+ _MAIN_CLASS = 'edu.stanford.nlp.pipeline.StanfordCoreNLP'
+ _JAR = r'stanford-corenlp-(\d+)(\.(\d+))+\.jar'
+ _MODEL_JAR_PATTERN = r'stanford-corenlp-(\d+)(\.(\d+))+-models\.jar'
_USE_STDIN = True
_DOUBLE_SPACED_OUTPUT = True
def __init__(self, *args, **kwargs):
- warnings.warn(
- "The StanfordNeuralDependencyParser will be deprecated\n"
- "Please use \033[91mnltk.parse.corenlp.CoreNLPDependencyParser\033[0m instead.",
- DeprecationWarning,
- stacklevel=2,
- )
-
super(StanfordNeuralDependencyParser, self).__init__(*args, **kwargs)
- self.corenlp_options += "-annotators tokenize,ssplit,pos,depparse"
+ self.corenlp_options += '-annotators tokenize,ssplit,pos,depparse'
def tagged_parse_sents(self, sentences, verbose=False):
- """
+ '''
Currently unimplemented because the neural dependency parser (and
the StanfordCoreNLP pipeline class) doesn't support passing in pre-
tagged tokens.
- """
+ '''
raise NotImplementedError(
- "tagged_parse[_sents] is not supported by "
- "StanfordNeuralDependencyParser; use "
- "parse[_sents] or raw_parse[_sents] instead."
+ 'tagged_parse[_sents] is not supported by '
+ 'StanfordNeuralDependencyParser; use '
+ 'parse[_sents] or raw_parse[_sents] instead.'
)
def _make_tree(self, result):
- return DependencyGraph(result, top_relation_label="ROOT")
+ return DependencyGraph(result, top_relation_label='ROOT')
-@skip("doctests from nltk.parse.stanford are skipped because it's deprecated")
def setup_module(module):
from nose import SkipTest
try:
StanfordParser(
- model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
+ model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'
)
StanfordNeuralDependencyParser()
except LookupError:
- raise SkipTest(
- "doctests from nltk.parse.stanford are skipped because one of the stanford parser or CoreNLP jars doesn't exist"
- )
+ raise SkipTest('doctests from nltk.parse.stanford are skipped because one of the stanford parser or CoreNLP jars doesn\'t exist')
#
# Author: Long Duong <longdt219@gmail.com>
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
import tempfile
import pickle
from os import remove
from copy import deepcopy
from operator import itemgetter
-
try:
from numpy import array
from scipy import sparse
from nltk.parse import ParserI, DependencyGraph, DependencyEvaluator
+
class Configuration(object):
"""
Class for holding a configuration, which is the partial analysis of the input sentence.
self._max_address = len(self.buffer)
def __str__(self):
- return (
- "Stack : "
- + str(self.stack)
- + " Buffer : "
- + str(self.buffer)
- + " Arcs : "
- + str(self.arcs)
- )
+ return 'Stack : ' + \
+ str(self.stack) + ' Buffer : ' + str(self.buffer) + ' Arcs : ' + str(self.arcs)
def _check_informative(self, feat, flag=False):
"""
"""
if feat is None:
return False
- if feat == "":
+ if feat == '':
return False
if flag is False:
- if feat == "_":
+ if feat == '_':
return False
return True
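# In CoNLL input, '_' is the conventional placeholder for a missing
# column, so it is treated as uninformative unless flag=True (callers
# pass True for word forms, which may legitimately be '_').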
# Stack 0
stack_idx0 = self.stack[len(self.stack) - 1]
token = self._tokens[stack_idx0]
- if self._check_informative(token["word"], True):
- result.append("STK_0_FORM_" + token["word"])
- if "lemma" in token and self._check_informative(token["lemma"]):
- result.append("STK_0_LEMMA_" + token["lemma"])
- if self._check_informative(token["tag"]):
- result.append("STK_0_POS_" + token["tag"])
- if "feats" in token and self._check_informative(token["feats"]):
- feats = token["feats"].split("|")
+ if self._check_informative(token['word'], True):
+ result.append('STK_0_FORM_' + token['word'])
+ if 'lemma' in token and self._check_informative(token['lemma']):
+ result.append('STK_0_LEMMA_' + token['lemma'])
+ if self._check_informative(token['tag']):
+ result.append('STK_0_POS_' + token['tag'])
+ if 'feats' in token and self._check_informative(token['feats']):
+ feats = token['feats'].split("|")
for feat in feats:
- result.append("STK_0_FEATS_" + feat)
+ result.append('STK_0_FEATS_' + feat)
# Stack 1
if len(self.stack) > 1:
stack_idx1 = self.stack[len(self.stack) - 2]
token = self._tokens[stack_idx1]
- if self._check_informative(token["tag"]):
- result.append("STK_1_POS_" + token["tag"])
+ if self._check_informative(token['tag']):
+ result.append('STK_1_POS_' + token['tag'])
# Left most, right most dependency of stack[0]
left_most = 1000000
right_most = -1
- dep_left_most = ""
- dep_right_most = ""
+ dep_left_most = ''
+ dep_right_most = ''
for (wi, r, wj) in self.arcs:
if wi == stack_idx0:
if (wj > wi) and (wj > right_most):
left_most = wj
dep_left_most = r
if self._check_informative(dep_left_most):
- result.append("STK_0_LDEP_" + dep_left_most)
+ result.append('STK_0_LDEP_' + dep_left_most)
if self._check_informative(dep_right_most):
- result.append("STK_0_RDEP_" + dep_right_most)
+ result.append('STK_0_RDEP_' + dep_right_most)
# Check Buffered 0
if len(self.buffer) > 0:
# Buffer 0
buffer_idx0 = self.buffer[0]
token = self._tokens[buffer_idx0]
- if self._check_informative(token["word"], True):
- result.append("BUF_0_FORM_" + token["word"])
- if "lemma" in token and self._check_informative(token["lemma"]):
- result.append("BUF_0_LEMMA_" + token["lemma"])
- if self._check_informative(token["tag"]):
- result.append("BUF_0_POS_" + token["tag"])
- if "feats" in token and self._check_informative(token["feats"]):
- feats = token["feats"].split("|")
+ if self._check_informative(token['word'], True):
+ result.append('BUF_0_FORM_' + token['word'])
+ if 'lemma' in token and self._check_informative(token['lemma']):
+ result.append('BUF_0_LEMMA_' + token['lemma'])
+ if self._check_informative(token['tag']):
+ result.append('BUF_0_POS_' + token['tag'])
+ if 'feats' in token and self._check_informative(token['feats']):
+ feats = token['feats'].split("|")
for feat in feats:
- result.append("BUF_0_FEATS_" + feat)
+ result.append('BUF_0_FEATS_' + feat)
# Buffer 1
if len(self.buffer) > 1:
buffer_idx1 = self.buffer[1]
token = self._tokens[buffer_idx1]
- if self._check_informative(token["word"], True):
- result.append("BUF_1_FORM_" + token["word"])
- if self._check_informative(token["tag"]):
- result.append("BUF_1_POS_" + token["tag"])
+ if self._check_informative(token['word'], True):
+ result.append('BUF_1_FORM_' + token['word'])
+ if self._check_informative(token['tag']):
+ result.append('BUF_1_POS_' + token['tag'])
if len(self.buffer) > 2:
buffer_idx2 = self.buffer[2]
token = self._tokens[buffer_idx2]
- if self._check_informative(token["tag"]):
- result.append("BUF_2_POS_" + token["tag"])
+ if self._check_informative(token['tag']):
+ result.append('BUF_2_POS_' + token['tag'])
if len(self.buffer) > 3:
buffer_idx3 = self.buffer[3]
token = self._tokens[buffer_idx3]
- if self._check_informative(token["tag"]):
- result.append("BUF_3_POS_" + token["tag"])
+ if self._check_informative(token['tag']):
+ result.append('BUF_3_POS_' + token['tag'])
# Left most, right most dependency of stack[0]
left_most = 1000000
right_most = -1
- dep_left_most = ""
- dep_right_most = ""
+ dep_left_most = ''
+ dep_right_most = ''
for (wi, r, wj) in self.arcs:
if wi == buffer_idx0:
if (wj > wi) and (wj > right_most):
left_most = wj
dep_left_most = r
if self._check_informative(dep_left_most):
- result.append("BUF_0_LDEP_" + dep_left_most)
+ result.append('BUF_0_LDEP_' + dep_left_most)
if self._check_informative(dep_right_most):
- result.append("BUF_0_RDEP_" + dep_right_most)
+ result.append('BUF_0_RDEP_' + dep_right_most)
return result
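# A sketch of the kind of feature list this returns, assuming a
# configuration whose stack top is 'dog'/NN and whose buffer starts
# with 'barks'/VBZ:
#
#   ['STK_0_FORM_dog', 'STK_0_POS_NN',
#    'BUF_0_FORM_barks', 'BUF_0_POS_VBZ', ...]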
This class defines a set of transitions which are applied to a configuration to get another configuration.
Note that for different parsing algorithms, the transitions are different.
"""
-
# Define set of transitions
- LEFT_ARC = "LEFTARC"
- RIGHT_ARC = "RIGHTARC"
- SHIFT = "SHIFT"
- REDUCE = "REDUCE"
+ LEFT_ARC = 'LEFTARC'
+ RIGHT_ARC = 'RIGHTARC'
+ SHIFT = 'SHIFT'
+ REDUCE = 'REDUCE'
def __init__(self, alg_option):
"""
"""
self._algo = alg_option
if alg_option not in [
- TransitionParser.ARC_STANDARD,
- TransitionParser.ARC_EAGER,
- ]:
- raise ValueError(
- " Currently we only support %s and %s "
- % (TransitionParser.ARC_STANDARD, TransitionParser.ARC_EAGER)
- )
+ TransitionParser.ARC_STANDARD,
+ TransitionParser.ARC_EAGER]:
+ raise ValueError(" Currently we only support %s and %s " %
+ (TransitionParser.ARC_STANDARD, TransitionParser.ARC_EAGER))
def left_arc(self, conf, relation):
"""
"""
Class for a transition-based parser. Implements two algorithms: "arc-standard" and "arc-eager"
"""
-
- ARC_STANDARD = "arc-standard"
- ARC_EAGER = "arc-eager"
+ ARC_STANDARD = 'arc-standard'
+ ARC_EAGER = 'arc-eager'
def __init__(self, algorithm):
"""
:param algorithm: the algorithm option of this parser. Currently supports the `arc-standard` and `arc-eager` algorithms
:type algorithm: str
"""
- if not (algorithm in [self.ARC_STANDARD, self.ARC_EAGER]):
- raise ValueError(
- " Currently we only support %s and %s "
- % (self.ARC_STANDARD, self.ARC_EAGER)
- )
+ if not(algorithm in [self.ARC_STANDARD, self.ARC_EAGER]):
+ raise ValueError(" Currently we only support %s and %s " %
+ (self.ARC_STANDARD, self.ARC_EAGER))
self._algorithm = algorithm
self._dictionary = {}
p_node = depgraph.nodes[idx_parent]
c_node = depgraph.nodes[idx_child]
- if c_node["word"] is None:
+ if c_node['word'] is None:
return None # Root word
- if c_node["head"] == p_node["address"]:
- return c_node["rel"]
+ if c_node['head'] == p_node['address']:
+ return c_node['rel']
else:
return None
unsorted_result.append(self._dictionary[feature])
# Default value of each feature is 1.0
- return " ".join(
- str(featureID) + ":1.0" for featureID in sorted(unsorted_result)
- )
+ return ' '.join(str(featureID) + ':1.0' for featureID in sorted(unsorted_result))
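# The result is a libsvm/svmlight-style sparse feature line such as
# '3:1.0 17:1.0 42:1.0', with feature IDs sorted ascending as the
# format requires.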
def _is_projective(self, depgraph):
arc_list = []
for key in depgraph.nodes:
node = depgraph.nodes[key]
- if "head" in node:
- childIdx = node["address"]
- parentIdx = node["head"]
+ if 'head' in node:
+ childIdx = node['address']
+ parentIdx = node['head']
if parentIdx is not None:
arc_list.append((parentIdx, childIdx))
self._transition.setdefault(key, len(self._transition) + 1)
self._match_transition[self._transition[key]] = key
- input_str = str(self._transition[key]) + " " + binary_features + "\n"
- input_file.write(input_str.encode("utf-8"))
+ input_str = str(self._transition[key]) + ' ' + binary_features + '\n'
+ input_file.write(input_str.encode('utf-8'))
def _create_training_examples_arc_std(self, depgraphs, input_file):
"""
# Left-arc operation
rel = self._get_dep_relation(b0, s0, depgraph)
if rel is not None:
- key = Transition.LEFT_ARC + ":" + rel
+ key = Transition.LEFT_ARC + ':' + rel
self._write_to_file(key, binary_features, input_file)
operation.left_arc(conf, rel)
training_seq.append(key)
precondition = False
if precondition:
- key = Transition.RIGHT_ARC + ":" + rel
- self._write_to_file(key, binary_features, input_file)
+ key = Transition.RIGHT_ARC + ':' + rel
+ self._write_to_file(
+ key,
+ binary_features,
+ input_file)
operation.right_arc(conf, rel)
training_seq.append(key)
continue
# Left-arc operation
rel = self._get_dep_relation(b0, s0, depgraph)
if rel is not None:
- key = Transition.LEFT_ARC + ":" + rel
+ key = Transition.LEFT_ARC + ':' + rel
self._write_to_file(key, binary_features, input_file)
operation.left_arc(conf, rel)
training_seq.append(key)
# Right-arc operation
rel = self._get_dep_relation(s0, b0, depgraph)
if rel is not None:
- key = Transition.RIGHT_ARC + ":" + rel
+ key = Transition.RIGHT_ARC + ':' + rel
self._write_to_file(key, binary_features, input_file)
operation.right_arc(conf, rel)
training_seq.append(key)
try:
input_file = tempfile.NamedTemporaryFile(
- prefix="transition_parse.train", dir=tempfile.gettempdir(), delete=False
- )
+ prefix='transition_parse.train',
+ dir=tempfile.gettempdir(),
+ delete=False)
if self._algorithm == self.ARC_STANDARD:
self._create_training_examples_arc_std(depgraphs, input_file)
# TODO: probability=True makes training very slow because of internal
# cross-validation. Need to improve the speed here
model = svm.SVC(
- kernel="poly",
+ kernel='poly',
degree=2,
coef0=0,
gamma=0.2,
C=0.5,
verbose=verbose,
- probability=True,
- )
+ probability=True)
model.fit(x_train, y_train)
# Save the model to file name (as pickle)
- pickle.dump(model, open(modelfile, "wb"))
+ pickle.dump(model, open(modelfile, 'wb'))
finally:
remove(input_file.name)
"""
result = []
# First load the model
- model = pickle.load(open(modelFile, "rb"))
+ model = pickle.load(open(modelFile, 'rb'))
operation = Transition(self._algorithm)
for depgraph in depgraphs:
np_row = array(row)
np_data = array(data)
- x_test = sparse.csr_matrix(
- (np_data, (np_row, np_col)), shape=(1, len(self._dictionary))
- )
+ x_test = sparse.csr_matrix((np_data, (np_row, np_col)), shape=(1, len(self._dictionary)))
# It's best to use the decision function as follows, BUT it's not supported yet for sparse SVM
# Using the decision function to build the votes array
- # dec_func = model.decision_function(x_test)[0]
- # votes = {}
- # k = 0
+ #dec_func = model.decision_function(x_test)[0]
+ #votes = {}
+ #k = 0
# for i in range(len(model.classes_)):
# for j in range(i+1, len(model.classes_)):
# #if dec_func[k] > 0:
# votes[j] +=1
# k +=1
# Sort votes according to the values
- # sorted_votes = sorted(votes.items(), key=itemgetter(1), reverse=True)
+ #sorted_votes = sorted(votes.items(), key=itemgetter(1), reverse=True)
# We will use predict_proba instead of decision_function
prob_dict = {}
pred_prob = model.predict_proba(x_test)[0]
for i in range(len(pred_prob)):
prob_dict[i] = pred_prob[i]
- sorted_Prob = sorted(prob_dict.items(), key=itemgetter(1), reverse=True)
+ sorted_Prob = sorted(
+ prob_dict.items(),
+ key=itemgetter(1),
+ reverse=True)
# Note that SHIFT is always a valid operation
for (y_pred_idx, confidence) in sorted_Prob:
- # y_pred = model.predict(x_test)[0]
+ #y_pred = model.predict(x_test)[0]
# From the prediction match to the operation
y_pred = model.classes_[y_pred_idx]
baseTransition = strTransition.split(":")[0]
if baseTransition == Transition.LEFT_ARC:
- if (
- operation.left_arc(conf, strTransition.split(":")[1])
- != -1
- ):
+ if operation.left_arc(conf, strTransition.split(":")[1]) != -1:
break
elif baseTransition == Transition.RIGHT_ARC:
- if (
- operation.right_arc(conf, strTransition.split(":")[1])
- != -1
- ):
+ if operation.right_arc(conf, strTransition.split(":")[1]) != -1:
break
elif baseTransition == Transition.REDUCE:
if operation.reduce(conf) != -1:
if operation.shift(conf) != -1:
break
else:
- raise ValueError(
- "The predicted transition is not recognized, expected errors"
- )
+ raise ValueError("The predicted transition is not recognized, expected errors")
# Finish with operations build the dependency graph from Conf.arcs
new_depgraph = deepcopy(depgraph)
for key in new_depgraph.nodes:
node = new_depgraph.nodes[key]
- node["rel"] = ""
+ node['rel'] = ''
# With the default, all the token depend on the Root
- node["head"] = 0
+ node['head'] = 0
for (head, rel, child) in conf.arcs:
c_node = new_depgraph.nodes[child]
- c_node["head"] = head
- c_node["rel"] = rel
+ c_node['head'] = head
+ c_node['rel'] = rel
result.append(new_depgraph)
return result
Note that the result is very poor because there is only one training example.
"""
+
#
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Utility functions for parsers.
"""
+from __future__ import print_function
from nltk.grammar import CFG, FeatureGrammar, PCFG
from nltk.data import load
from nltk.parse.pchart import InsideChartParser
from nltk.parse.featurechart import FeatureChart, FeatureChartParser
-
-def load_parser(
- grammar_url, trace=0, parser=None, chart_class=None, beam_size=0, **load_args
-):
+def load_parser(grammar_url, trace=0,
+ parser=None, chart_class=None,
+ beam_size=0, **load_args):
"""
Load a grammar from a file, and build a parser based on that grammar.
The parser depends on the grammar format, and might also depend
"""
grammar = load(grammar_url, **load_args)
if not isinstance(grammar, CFG):
- raise ValueError("The grammar must be a CFG, " "or a subclass thereof.")
+ raise ValueError("The grammar must be a CFG, "
+ "or a subclass thereof.")
if isinstance(grammar, PCFG):
if parser is None:
parser = InsideChartParser
chart_class = FeatureChart
return parser(grammar, trace=trace, chart_class=chart_class)
- else: # Plain CFG.
+ else: # Plain CFG.
if parser is None:
parser = ChartParser
if chart_class is None:
chart_class = Chart
return parser(grammar, trace=trace, chart_class=chart_class)
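# A minimal usage sketch (the grammar path is an assumed nltk_data
# resource; any CFG covering the sentence would do):
#
#   parser = load_parser('grammars/sample_grammars/toy.cfg', trace=0)
#   for tree in parser.parse('I saw a man in the park'.split()):
#       print(tree)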
-
def taggedsent_to_conll(sentence):
- """
- A module to convert a single POS tagged sentence into CONLL format.
-
- >>> from nltk import word_tokenize, pos_tag
- >>> text = "This is a foobar sentence."
- >>> for line in taggedsent_to_conll(pos_tag(word_tokenize(text))):
- ... print(line, end="")
+ """
+ A module to convert a single POS tagged sentence into CONLL format.
+
+ >>> from nltk import word_tokenize, pos_tag
+ >>> text = "This is a foobar sentence."
+ >>> for line in taggedsent_to_conll(pos_tag(word_tokenize(text))):
+ ... print(line, end="")
1 This _ DT DT _ 0 a _ _
2 is _ VBZ VBZ _ 0 a _ _
3 a _ DT DT _ 0 a _ _
4 foobar _ JJ JJ _ 0 a _ _
5 sentence _ NN NN _ 0 a _ _
6 . _ . . _ 0 a _ _
-
- :param sentence: A single input sentence to parse
- :type sentence: list(tuple(str, str))
- :rtype: iter(str)
- :return: a generator yielding a single sentence in CONLL format.
- """
- for (i, (word, tag)) in enumerate(sentence, start=1):
- input_str = [str(i), word, "_", tag, tag, "_", "0", "a", "_", "_"]
- input_str = "\t".join(input_str) + "\n"
- yield input_str
+
+ :param sentence: A single input sentence to parse
+ :type sentence: list(tuple(str, str))
+ :rtype: iter(str)
+ :return: a generator yielding a single sentence in CONLL format.
+ """
+ for (i, (word, tag)) in enumerate(sentence, start=1):
+ input_str = [str(i), word, '_', tag, tag, '_', '0', 'a', '_', '_']
+ input_str = "\t".join(input_str) + "\n"
+ yield input_str
def taggedsents_to_conll(sentences):
- """
- A module to convert a POS tagged document stream
- (i.e. list of list of tuples, a list of sentences) and yield lines
- in CONLL format. This module yields one line per word and two newlines
- for end of sentence.
-
- >>> from nltk import word_tokenize, sent_tokenize, pos_tag
- >>> text = "This is a foobar sentence. Is that right?"
- >>> sentences = [pos_tag(word_tokenize(sent)) for sent in sent_tokenize(text)]
- >>> for line in taggedsents_to_conll(sentences):
- ... if line:
- ... print(line, end="")
- 1 This _ DT DT _ 0 a _ _
- 2 is _ VBZ VBZ _ 0 a _ _
- 3 a _ DT DT _ 0 a _ _
- 4 foobar _ JJ JJ _ 0 a _ _
- 5 sentence _ NN NN _ 0 a _ _
- 6 . _ . . _ 0 a _ _
- <BLANKLINE>
- <BLANKLINE>
- 1 Is _ VBZ VBZ _ 0 a _ _
- 2 that _ IN IN _ 0 a _ _
- 3 right _ NN NN _ 0 a _ _
- 4 ? _ . . _ 0 a _ _
- <BLANKLINE>
- <BLANKLINE>
-
- :param sentences: Input sentences to parse
- :type sentence: list(list(tuple(str, str)))
- :rtype: iter(str)
- :return: a generator yielding sentences in CONLL format.
- """
- for sentence in sentences:
- for input_str in taggedsent_to_conll(sentence):
- yield input_str
- yield "\n\n"
-
+ """
+ A module to convert a POS tagged document stream
+ (i.e. list of list of tuples, a list of sentences) and yield lines
+ in CONLL format. This module yields one line per word and two newlines
+ for end of sentence.
+
+ >>> from nltk import word_tokenize, sent_tokenize, pos_tag
+ >>> text = "This is a foobar sentence. Is that right?"
+ >>> sentences = [pos_tag(word_tokenize(sent)) for sent in sent_tokenize(text)]
+ >>> for line in taggedsents_to_conll(sentences):
+ ... if line:
+ ... print(line, end="")
+ 1 This _ DT DT _ 0 a _ _
+ 2 is _ VBZ VBZ _ 0 a _ _
+ 3 a _ DT DT _ 0 a _ _
+ 4 foobar _ JJ JJ _ 0 a _ _
+ 5 sentence _ NN NN _ 0 a _ _
+ 6 . _ . . _ 0 a _ _
+ <BLANKLINE>
+ <BLANKLINE>
+ 1 Is _ VBZ VBZ _ 0 a _ _
+ 2 that _ IN IN _ 0 a _ _
+ 3 right _ NN NN _ 0 a _ _
+ 4 ? _ . . _ 0 a _ _
+ <BLANKLINE>
+ <BLANKLINE>
+
+ :param sentences: Input sentences to parse
+ :type sentence: list(list(tuple(str, str)))
+ :rtype: iter(str)
+ :return: a generator yielding sentences in CONLL format.
+ """
+ for sentence in sentences:
+ for input_str in taggedsent_to_conll(sentence):
+ yield input_str
+ yield '\n\n'
######################################################################
-# { Test Suites
+#{ Test Suites
######################################################################
-
class TestGrammar(object):
"""
Unit tests for CFG.
"""
-
def __init__(self, grammar, suite, accept=None, reject=None):
self.test_grammar = grammar
self._accept = accept
self._reject = reject
+
def run(self, show_trees=False):
"""
Sentences in the test suite are divided into two classes:
according to the grammar, then the value of ``trees`` will be None.
"""
for test in self.suite:
- print(test["doc"] + ":", end=" ")
- for key in ["accept", "reject"]:
+ print(test['doc'] + ":", end=' ')
+ for key in ['accept', 'reject']:
for sent in test[key]:
tokens = sent.split()
trees = list(self.cp.parse(tokens))
print(sent)
for tree in trees:
print(tree)
- if key == "accept":
+ if key == 'accept':
if trees == []:
raise ValueError("Sentence '%s' failed to parse'" % sent)
else:
if accepted and rejected:
print("All tests passed!")
-
def extract_test_sentences(string, comment_chars="#%;", encoding=None):
"""
Parses a string with one test sentence per line.
if encoding is not None:
string = string.decode(encoding)
sentences = []
- for sentence in string.split("\n"):
- if sentence == "" or sentence[0] in comment_chars:
+ for sentence in string.split('\n'):
+ if sentence == '' or sentence[0] in comment_chars:
continue
- split_info = sentence.split(":", 1)
+ split_info = sentence.split(':', 1)
result = None
if len(split_info) == 2:
- if split_info[0] in ["True", "true", "False", "false"]:
- result = split_info[0] in ["True", "true"]
+ if split_info[0] in ['True','true','False','false']:
+ result = split_info[0] in ['True','true']
sentence = split_info[1]
else:
result = int(split_info[0])
sentences += [(tokens, result)]
return sentences
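# A sketch of the expected input format (comment lines are skipped; an
# optional integer or True/False prefix before ':' becomes the result):
#
#   % this line is ignored
#   3: the dog saw the man
#   True: the cat sleeps
#
# would yield [(['the', 'dog', 'saw', 'the', 'man'], 3),
#              (['the', 'cat', 'sleeps'], True)]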
-
# nose thinks it is a test
extract_test_sentences.__test__ = False
# Natural Language Toolkit: Viterbi Probabilistic Parser
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals
from functools import reduce
from nltk.tree import Tree, ProbabilisticTree
+from nltk.compat import python_2_unicode_compatible
from nltk.parse.api import ParserI
## Viterbi PCFG Parser
##//////////////////////////////////////////////////////
-
+@python_2_unicode_compatible
class ViterbiParser(ParserI):
"""
A bottom-up ``PCFG`` parser that uses dynamic programming to find
:ivar _trace: The level of tracing output that should be generated
when parsing a text.
"""
-
def __init__(self, grammar, trace=0):
"""
Create a new ``ViterbiParser`` parser, that uses ``grammar`` to
# Initialize the constituents dictionary with the words from
# the text.
- if self._trace:
- print(("Inserting tokens into the most likely" + " constituents table..."))
+ if self._trace: print(('Inserting tokens into the most likely'+
+ ' constituents table...'))
for index in range(len(tokens)):
token = tokens[index]
- constituents[index, index + 1, token] = token
+ constituents[index,index+1,token] = token
if self._trace > 1:
self._trace_lexical_insertion(token, index, len(tokens))
# Consider each span of length 1, 2, ..., n; and add any trees
# that might cover that span to the constituents dictionary.
- for length in range(1, len(tokens) + 1):
+ for length in range(1, len(tokens)+1):
if self._trace:
- print(
- (
- "Finding the most likely constituents"
- + " spanning %d text elements..." % length
- )
- )
- for start in range(len(tokens) - length + 1):
- span = (start, start + length)
- self._add_constituents_spanning(span, constituents, tokens)
+ print(('Finding the most likely constituents'+
+ ' spanning %d text elements...' % length))
+ for start in range(len(tokens)-length+1):
+ span = (start, start+length)
+ self._add_constituents_spanning(span, constituents,
+ tokens)
# Return the tree that spans the entire text & have the right cat
tree = constituents.get((0, len(tokens), self._grammar.start()))
# probability.
for (production, children) in instantiations:
subtrees = [c for c in children if isinstance(c, Tree)]
- p = reduce(lambda pr, t: pr * t.prob(), subtrees, production.prob())
+ p = reduce(lambda pr,t:pr*t.prob(),
+ subtrees, production.prob())
node = production.lhs().symbol()
tree = ProbabilisticTree(node, children, prob=p)
if self._trace > 1:
if c is None or c != tree:
if c is None or c.prob() < tree.prob():
- print(" Insert:", end=" ")
+ print(' Insert:', end=' ')
else:
- print(" Discard:", end=" ")
+ print(' Discard:', end=' ')
self._trace_production(production, p, span, len(tokens))
if c is None or c.prob() < tree.prob():
constituents[span[0], span[1], production.lhs()] = tree
childlists = self._match_rhs(production.rhs(), span, constituents)
for childlist in childlists:
- rv.append((production, childlist))
+ rv.append( (production, childlist) )
return rv
def _match_rhs(self, rhs, span, constituents):
(start, end) = span
# Base case
- if start >= end and rhs == ():
- return [[]]
- if start >= end or rhs == ():
- return []
+ if start >= end and rhs == (): return [[]]
+ if start >= end or rhs == (): return []
# Find everything that matches the 1st symbol of the RHS
childlists = []
- for split in range(start, end + 1):
- l = constituents.get((start, split, rhs[0]))
+ for split in range(start, end+1):
+ l=constituents.get((start,split,rhs[0]))
if l is not None:
- rights = self._match_rhs(rhs[1:], (split, end), constituents)
- childlists += [[l] + r for r in rights]
+ rights = self._match_rhs(rhs[1:], (split,end), constituents)
+ childlists += [[l]+r for r in rights]
return childlists
:rtype: None
"""
- str = "|" + "." * span[0]
- str += "=" * (span[1] - span[0])
- str += "." * (width - span[1]) + "| "
- str += "%s" % production
- if self._trace > 2:
- str = "%-40s %12.10f " % (str, p)
+ str = '|' + '.' * span[0]
+ str += '=' * (span[1] - span[0])
+ str += '.' * (width - span[1]) + '| '
+ str += '%s' % production
+ if self._trace > 2: str = '%-40s %12.10f ' % (str, p)
print(str)
def _trace_lexical_insertion(self, token, index, width):
- str = " Insert: |" + "." * index + "=" + "." * (width - index - 1) + "| "
- str += "%s" % (token,)
+ str = ' Insert: |' + '.' * index + '=' + '.' * (width-index-1) + '| '
+ str += '%s' % (token,)
print(str)
def __repr__(self):
- return "<ViterbiParser for %r>" % self._grammar
+ return '<ViterbiParser for %r>' % self._grammar
##//////////////////////////////////////////////////////
## Test Code
##//////////////////////////////////////////////////////
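# A minimal usage sketch for ViterbiParser, reusing the toy grammar that
# demo() below imports; parse() yields the single most likely tree.
def _viterbi_usage_sketch():
    from nltk.grammar import toy_pcfg1
    parser = ViterbiParser(toy_pcfg1)
    for tree in parser.parse('I saw the man with my telescope'.split()):
        print(tree.prob(), tree)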
-
def demo():
"""
A demonstration of the probabilistic parsers. The user is
from nltk.grammar import toy_pcfg1, toy_pcfg2
# Define two demos. Each demo has a sentence and a grammar.
- demos = [
- ("I saw the man with my telescope", toy_pcfg1),
- ("the boy saw Jack with Bob under the table with a telescope", toy_pcfg2),
- ]
+ demos = [('I saw the man with my telescope', toy_pcfg1),
+ ('the boy saw Jack with Bob under the table with a telescope', toy_pcfg2)]
# Ask the user which demo they want to use.
print()
for i in range(len(demos)):
- print("%3s: %s" % (i + 1, demos[i][0]))
- print(" %r" % demos[i][1])
+ print('%3s: %s' % (i+1, demos[i][0]))
+ print(' %r' % demos[i][1])
print()
- print("Which demo (%d-%d)? " % (1, len(demos)), end=" ")
+ print('Which demo (%d-%d)? ' % (1, len(demos)), end=' ')
try:
- snum = int(sys.stdin.readline().strip()) - 1
+ snum = int(sys.stdin.readline().strip())-1
sent, grammar = demos[snum]
except:
- print("Bad sentence number")
+ print('Bad sentence number')
return
# Tokenize the sentence.
parser = ViterbiParser(grammar)
all_parses = {}
- print("\nsent: %s\nparser: %s\ngrammar: %s" % (sent, parser, grammar))
+ print('\nsent: %s\nparser: %s\ngrammar: %s' % (sent,parser,grammar))
parser.trace(3)
t = time.time()
parses = parser.parse_all(tokens)
- time = time.time() - t
- average = (
- reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses) if parses else 0
- )
+ time = time.time()-t
+ average = (reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses)
+ if parses else 0)
num_parses = len(parses)
for p in parses:
all_parses[p.freeze()] = 1
# Print some summary statistics
print()
- print("Time (secs) # Parses Average P(parse)")
- print("-----------------------------------------")
- print("%11.4f%11d%19.14f" % (time, num_parses, average))
+ print('Time (secs) # Parses Average P(parse)')
+ print('-----------------------------------------')
+ print('%11.4f%11d%19.14f' % (time, num_parses, average))
parses = all_parses.keys()
if parses:
- p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
- else:
- p = 0
- print("------------------------------------------")
- print("%11s%11d%19.14f" % ("n/a", len(parses), p))
+ p = reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses)
+ else: p = 0
+ print('------------------------------------------')
+ print('%11s%11d%19.14f' % ('n/a', len(parses), p))
# Ask the user if we should draw the parses.
print()
- print("Draw parses (y/n)? ", end=" ")
- if sys.stdin.readline().strip().lower().startswith("y"):
+ print('Draw parses (y/n)? ', end=' ')
+ if sys.stdin.readline().strip().lower().startswith('y'):
from nltk.draw.tree import draw_trees
-
- print(" please wait...")
+ print(' please wait...')
draw_trees(*parses)
# Ask the user if we should print the parses.
print()
- print("Print parses (y/n)? ", end=" ")
- if sys.stdin.readline().strip().lower().startswith("y"):
+ print('Print parses (y/n)? ', end=' ')
+ if sys.stdin.readline().strip().lower().startswith('y'):
for parse in parses:
print(parse)
-
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Probability and Statistics
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com> (additions)
# Trevor Cohn <tacohn@cs.mu.oz.au> (additions)
``ConditionalProbDist``, a derived distribution.
"""
+from __future__ import print_function, unicode_literals, division
import math
import random
import warnings
import array
+from operator import itemgetter
from collections import defaultdict, Counter
from functools import reduce
from abc import ABCMeta, abstractmethod
+from six import itervalues, text_type, add_metaclass
+
+from nltk import compat
from nltk.internals import raise_unorderable_types
-_NINF = float("-1e300")
+_NINF = float('-1e300')
##//////////////////////////////////////////////////////
## Frequency Distributions
##//////////////////////////////////////////////////////
-
-
+@compat.python_2_unicode_compatible
class FreqDist(Counter):
"""
A frequency distribution for the outcomes of an experiment. A
"""
return [item for item in self if self[item] == 1]
+
def Nr(self, r, bins=None):
return self.r_Nr(bins)[r]
:rtype: any or None
"""
if len(self) == 0:
- raise ValueError(
- "A FreqDist must have at least one sample before max is defined."
- )
+ raise ValueError('A FreqDist must have at least one sample before max is defined.')
return self.most_common(1)[0][0]
def plot(self, *args, **kwargs):
:type title: bool
"""
try:
- import matplotlib.pyplot as plt
+ from matplotlib import pylab
except ImportError:
- raise ValueError(
- "The plot function requires matplotlib to be installed."
- "See http://matplotlib.org/"
- )
+ raise ValueError('The plot function requires matplotlib to be installed.'
+ 'See http://matplotlib.org/')
if len(args) == 0:
args = [len(self)]
samples = [item for item, _ in self.most_common(*args)]
- cumulative = _get_kwarg(kwargs, "cumulative", False)
- percents = _get_kwarg(kwargs, "percents", False)
+ cumulative = _get_kwarg(kwargs, 'cumulative', False)
if cumulative:
freqs = list(self._cumulative_frequencies(samples))
ylabel = "Cumulative Counts"
- if percents:
- freqs = [f / freqs[len(freqs) - 1] * 100 for f in freqs]
- ylabel = "Cumulative Percents"
else:
freqs = [self[sample] for sample in samples]
ylabel = "Counts"
# percents = [f * 100 for f in freqs] only in ProbDist?
- ax = plt.gca()
- ax.grid(True, color="silver")
-
- if "linewidth" not in kwargs:
+ pylab.grid(True, color="silver")
+ if not "linewidth" in kwargs:
kwargs["linewidth"] = 2
if "title" in kwargs:
- ax.set_title(kwargs["title"])
+ pylab.title(kwargs["title"])
del kwargs["title"]
-
- ax.plot(freqs, **kwargs)
- ax.set_xticks(range(len(samples)))
- ax.set_xticklabels([str(s) for s in samples], rotation=90)
- ax.set_xlabel("Samples")
- ax.set_ylabel(ylabel)
-
- plt.show()
-
- return ax
+ pylab.plot(freqs, **kwargs)
+ pylab.xticks(range(len(samples)), [text_type(s) for s in samples], rotation=90)
+ pylab.xlabel("Samples")
+ pylab.ylabel(ylabel)
+ pylab.show()
def tabulate(self, *args, **kwargs):
"""
args = [len(self)]
samples = [item for item, _ in self.most_common(*args)]
- cumulative = _get_kwarg(kwargs, "cumulative", False)
+ cumulative = _get_kwarg(kwargs, 'cumulative', False)
if cumulative:
freqs = list(self._cumulative_frequencies(samples))
else:
freqs = [self[sample] for sample in samples]
# percents = [f * 100 for f in freqs] only in ProbDist?
- width = max(len("{}".format(s)) for s in samples)
+ width = max(len("%s" % s) for s in samples)
width = max(width, max(len("%d" % f) for f in freqs))
for i in range(len(samples)):
- print("%*s" % (width, samples[i]), end=" ")
+ print("%*s" % (width, samples[i]), end=' ')
print()
for i in range(len(samples)):
- print("%*d" % (width, freqs[i]), end=" ")
+ print("%*d" % (width, freqs[i]), end=' ')
print()
def copy(self):
return self.__class__(super(FreqDist, self).__and__(other))
def __le__(self, other):
- """
- Returns True if this frequency distribution is a subset of the other
- and for no key the value exceeds the value of the same key from
- the other frequency distribution.
-
- The <= operator forms partial order and satisfying the axioms
- reflexivity, antisymmetry and transitivity.
-
- >>> FreqDist('a') <= FreqDist('a')
- True
- >>> a = FreqDist('abc')
- >>> b = FreqDist('aabc')
- >>> (a <= b, b <= a)
- (True, False)
- >>> FreqDist('a') <= FreqDist('abcd')
- True
- >>> FreqDist('abc') <= FreqDist('xyz')
- False
- >>> FreqDist('xyz') <= FreqDist('abc')
- False
- >>> c = FreqDist('a')
- >>> d = FreqDist('aa')
- >>> e = FreqDist('aaa')
- >>> c <= d and d <= e and c <= e
- True
- """
if not isinstance(other, FreqDist):
raise_unorderable_types("<=", self, other)
- return set(self).issubset(other) and all(
- self[key] <= other[key] for key in self
- )
-
- def __ge__(self, other):
- if not isinstance(other, FreqDist):
- raise_unorderable_types(">=", self, other)
- return set(self).issuperset(other) and all(
- self[key] >= other[key] for key in other
- )
+ return set(self).issubset(other) and all(self[key] <= other[key] for key in self)
+ # @total_ordering doesn't work here, since the class inherits from a builtin class
+ __ge__ = lambda self, other: not self <= other or self == other
__lt__ = lambda self, other: self <= other and not self == other
- __gt__ = lambda self, other: self >= other and not self == other
+ __gt__ = lambda self, other: not self <= other
def __repr__(self):
"""
:type maxlen: int
:rtype: string
"""
- items = ["{0!r}: {1!r}".format(*item) for item in self.most_common(maxlen)]
+ items = ['{0!r}: {1!r}'.format(*item) for item in self.most_common(maxlen)]
if len(self) > maxlen:
- items.append("...")
- return "FreqDist({{{0}}})".format(", ".join(items))
+ items.append('...')
+ return 'FreqDist({{{0}}})'.format(', '.join(items))
def __str__(self):
"""
:rtype: string
"""
- return "<FreqDist with %d samples and %d outcomes>" % (len(self), self.N())
-
- def __iter__(self):
- """
- Return an iterator which yields tokens ordered by frequency.
-
- :rtype: iterator
- """
- for token, _ in self.most_common(self.B()):
- yield token
+ return '<FreqDist with %d samples and %d outcomes>' % (len(self), self.N())
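# A minimal usage sketch for FreqDist: counting the characters of a
# string (any iterable of hashable samples works the same way).
def _freqdist_usage_sketch():
    fd = FreqDist('abracadabra')
    print(fd['a'])       # 5: count of the sample 'a'
    print(fd.N())        # 11: total number of outcomes
    print(fd.max())      # 'a': the most frequent sample
    print(fd.freq('a'))  # 5/11: relative frequency of 'a'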
##//////////////////////////////////////////////////////
## Probability Distributions
##//////////////////////////////////////////////////////
-
-class ProbDistI(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class ProbDistI(object):
"""
A probability distribution for the outcomes of an experiment. A
probability distribution specifies how likely it is that an
used to model the probability distribution of the experiment used
to generate a frequency distribution.
"""
-
SUM_TO_ONE = True
"""True if the probabilities of the samples in this probability
distribution will always sum to one."""
"""
# Default definition, in terms of prob()
p = self.prob(sample)
- return math.log(p, 2) if p != 0 else _NINF
+ return (math.log(p, 2) if p != 0 else _NINF)
@abstractmethod
def max(self):
p_init = p
for sample in self.samples():
p -= self.prob(sample)
- if p <= 0:
- return sample
+ if p <= 0: return sample
# allow for some rounding error:
- if p < 0.0001:
+ if p < .0001:
return sample
# we *should* never get here
if self.SUM_TO_ONE:
- warnings.warn(
- "Probability distribution %r sums to %r; generate()"
- " is returning an arbitrary sample." % (self, p_init - p)
- )
+ warnings.warn("Probability distribution %r sums to %r; generate()"
+ " is returning an arbitrary sample." % (self, p_init-p))
return random.choice(list(self.samples()))
-
+@compat.python_2_unicode_compatible
class UniformProbDist(ProbDistI):
"""
A probability distribution that assigns equal probability to each
sample in a given set; and a zero probability to all other
samples.
"""
-
def __init__(self, samples):
"""
Construct a new uniform probability distribution, that assigns
:raise ValueError: If ``samples`` is empty.
"""
if len(samples) == 0:
- raise ValueError(
- "A Uniform probability distribution must " + "have at least one sample."
- )
+ raise ValueError('A Uniform probability distribution must '+
+ 'have at least one sample.')
self._sampleset = set(samples)
- self._prob = 1.0 / len(self._sampleset)
+ self._prob = 1.0/len(self._sampleset)
self._samples = list(self._sampleset)
def prob(self, sample):
- return self._prob if sample in self._sampleset else 0
+ return (self._prob if sample in self._sampleset else 0)
def max(self):
return self._samples[0]
return self._samples
def __repr__(self):
- return "<UniformProbDist with %d samples>" % len(self._sampleset)
-
+ return '<UniformProbDist with %d samples>' % len(self._sampleset)
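# A quick numeric check: every sample in the set receives 1/len(set),
# and anything outside it receives zero.
def _uniform_probdist_sketch():
    pd = UniformProbDist(['a', 'b', 'c', 'd'])
    print(pd.prob('a'))  # 0.25
    print(pd.prob('z'))  # 0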
+@compat.python_2_unicode_compatible
class RandomProbDist(ProbDistI):
"""
Generates a random probability distribution whereby each sample
will be assigned a random probability between 0 and 1 (a uniform
random distribution, also called a continuous uniform distribution).
"""
-
def __init__(self, samples):
if len(samples) == 0:
- raise ValueError(
- "A probability distribution must " + "have at least one sample."
- )
+ raise ValueError('A probability distribution must '+
+ 'have at least one sample.')
self._probs = self.unirand(samples)
self._samples = list(self._probs.keys())
randrow = [random.random() for i in range(len(samples))]
total = sum(randrow)
for i, x in enumerate(randrow):
- randrow[i] = x / total
+ randrow[i] = x/total
total = sum(randrow)
if total != 1:
- # this difference, if present, is so small (near NINF) that it
- # can be subtracted from any element without risking probs not (0 1)
+ #this difference, if present, is so small (near NINF) that it
+ #can be subtracted from any element without risking probs not (0 1)
randrow[-1] -= total - 1
return dict((s, randrow[i]) for i, s in enumerate(samples))
- def max(self):
- if not hasattr(self, "_max"):
- self._max = max((p, v) for (v, p) in self._probs.items())[1]
- return self._max
-
def prob(self, sample):
return self._probs.get(sample, 0)
return self._samples
def __repr__(self):
- return "<RandomUniformProbDist with %d samples>" % len(self._probs)
-
+ return '<RandomUniformProbDist with %d samples>' %len(self._probs)
+@compat.python_2_unicode_compatible
class DictionaryProbDist(ProbDistI):
"""
A probability distribution whose probabilities are directly
specified by a given dictionary. The given dictionary maps
samples to probabilities.
"""
-
def __init__(self, prob_dict=None, log=False, normalize=False):
"""
Construct a new probability distribution from the given
distribution assigns zero probability to all values.
"""
- self._prob_dict = prob_dict.copy() if prob_dict is not None else {}
+ self._prob_dict = (prob_dict.copy() if prob_dict is not None else {})
self._log = log
# Normalize the distribution, if requested.
if normalize:
if len(prob_dict) == 0:
- raise ValueError(
- "A DictionaryProbDist must have at least one sample "
- + "before it can be normalized."
- )
+ raise ValueError('A DictionaryProbDist must have at least one sample ' +
+ 'before it can be normalized.')
if log:
value_sum = sum_logs(list(self._prob_dict.values()))
if value_sum <= _NINF:
- logp = math.log(1.0 / len(prob_dict), 2)
+ logp = math.log(1.0/len(prob_dict), 2)
for x in prob_dict:
self._prob_dict[x] = logp
else:
else:
value_sum = sum(self._prob_dict.values())
if value_sum == 0:
- p = 1.0 / len(prob_dict)
+ p = 1.0/len(prob_dict)
for x in prob_dict:
self._prob_dict[x] = p
else:
- norm_factor = 1.0 / value_sum
+ norm_factor = 1.0/value_sum
for (x, p) in self._prob_dict.items():
self._prob_dict[x] *= norm_factor
def prob(self, sample):
if self._log:
- return 2 ** (self._prob_dict[sample]) if sample in self._prob_dict else 0
+ return (2**(self._prob_dict[sample]) if sample in self._prob_dict else 0)
else:
return self._prob_dict.get(sample, 0)
if self._log:
return self._prob_dict.get(sample, _NINF)
else:
- if sample not in self._prob_dict:
- return _NINF
- elif self._prob_dict[sample] == 0:
- return _NINF
- else:
- return math.log(self._prob_dict[sample], 2)
+ if sample not in self._prob_dict: return _NINF
+ elif self._prob_dict[sample] == 0: return _NINF
+ else: return math.log(self._prob_dict[sample], 2)
def max(self):
- if not hasattr(self, "_max"):
- self._max = max((p, v) for (v, p) in self._prob_dict.items())[1]
+ if not hasattr(self, '_max'):
+ self._max = max((p,v) for (v,p) in self._prob_dict.items())[1]
return self._max
-
def samples(self):
return self._prob_dict.keys()
-
def __repr__(self):
- return "<ProbDist with %d samples>" % len(self._prob_dict)
-
+ return '<ProbDist with %d samples>' % len(self._prob_dict)
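# A minimal sketch: with normalize=True the dictionary values are
# rescaled to sum to one before being used as probabilities.
def _dictionary_probdist_sketch():
    pd = DictionaryProbDist({'win': 3, 'lose': 1}, normalize=True)
    print(pd.prob('win'))     # 0.75
    print(pd.logprob('win'))  # log2(0.75)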
+@compat.python_2_unicode_compatible
class MLEProbDist(ProbDistI):
"""
The maximum likelihood estimate for the probability distribution
each sample as the frequency of that sample in the frequency
distribution.
"""
-
def __init__(self, freqdist, bins=None):
"""
Use the maximum likelihood estimate to create a probability
:rtype: str
:return: A string representation of this ``ProbDist``.
"""
- return "<MLEProbDist based on %d samples>" % self._freqdist.N()
-
+ return '<MLEProbDist based on %d samples>' % self._freqdist.N()
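# A quick numeric check: the MLE simply reuses relative frequencies,
# so prob(x) equals freqdist.freq(x) and unseen samples get no mass.
def _mle_probdist_sketch():
    fd = FreqDist('aabbbc')
    pd = MLEProbDist(fd)
    print(pd.prob('b'))  # 3/6 = 0.5
    print(pd.prob('z'))  # 0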
+@compat.python_2_unicode_compatible
class LidstoneProbDist(ProbDistI):
"""
The Lidstone estimate for the probability distribution of the
*gamma* to the count for each bin, and taking the maximum
likelihood estimate of the resulting frequency distribution.
"""
-
SUM_TO_ONE = False
-
def __init__(self, freqdist, gamma, bins=None):
"""
Use the Lidstone estimate to create a probability distribution
"""
if (bins == 0) or (bins is None and freqdist.N() == 0):
name = self.__class__.__name__[:-8]
- raise ValueError(
- "A %s probability distribution " % name + "must have at least one bin."
- )
+ raise ValueError('A %s probability distribution ' % name +
+ 'must have at least one bin.')
if (bins is not None) and (bins < freqdist.B()):
name = self.__class__.__name__[:-8]
- raise ValueError(
- "\nThe number of bins in a %s distribution " % name
- + "(%d) must be greater than or equal to\n" % bins
- + "the number of bins in the FreqDist used "
- + "to create it (%d)." % freqdist.B()
- )
+ raise ValueError('\nThe number of bins in a %s distribution ' % name +
+ '(%d) must be greater than or equal to\n' % bins +
+ 'the number of bins in the FreqDist used ' +
+ 'to create it (%d).' % freqdist.B())
self._freqdist = freqdist
self._gamma = float(gamma)
:rtype: str
"""
- return "<LidstoneProbDist based on %d samples>" % self._freqdist.N()
-
+ return '<LidstoneProbDist based on %d samples>' % self._freqdist.N()
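# A quick numeric check of the Lidstone formula implemented here:
# prob(x) = (c(x) + gamma) / (N + bins * gamma).
def _lidstone_probdist_sketch():
    fd = FreqDist('aabbbc')                 # N = 6, B = 3
    pd = LidstoneProbDist(fd, 0.5, bins=4)  # one extra bin for unseen samples
    print(pd.prob('b'))  # (3 + 0.5) / (6 + 4 * 0.5) = 0.4375
    print(pd.prob('z'))  # (0 + 0.5) / (6 + 4 * 0.5) = 0.0625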
+@compat.python_2_unicode_compatible
class LaplaceProbDist(LidstoneProbDist):
"""
The Laplace estimate for the probability distribution of the
each bin, and taking the maximum likelihood estimate of the
resulting frequency distribution.
"""
-
def __init__(self, freqdist, bins=None):
"""
Use the Laplace estimate to create a probability distribution
:rtype: str
:return: A string representation of this ``ProbDist``.
"""
- return "<LaplaceProbDist based on %d samples>" % self._freqdist.N()
-
+ return '<LaplaceProbDist based on %d samples>' % self._freqdist.N()
+@compat.python_2_unicode_compatible
class ELEProbDist(LidstoneProbDist):
"""
The expected likelihood estimate for the probability distribution
to the count for each bin, and taking the maximum likelihood
estimate of the resulting frequency distribution.
"""
-
def __init__(self, freqdist, bins=None):
"""
Use the expected likelihood estimate to create a probability
:rtype: str
"""
- return "<ELEProbDist based on %d samples>" % self._freqdist.N()
-
+ return '<ELEProbDist based on %d samples>' % self._freqdist.N()
+@compat.python_2_unicode_compatible
class HeldoutProbDist(ProbDistI):
"""
The heldout estimate for the probability distribution of the
in the base distribution. ``_max_r`` is used to decide how
large ``_estimate`` must be.
"""
-
SUM_TO_ONE = False
-
def __init__(self, base_fdist, heldout_fdist, bins=None):
"""
Use the heldout estimate to create a probability distribution
# Calculate Tr, Nr, and N.
Tr = self._calculate_Tr()
r_Nr = base_fdist.r_Nr(bins)
- Nr = [r_Nr[r] for r in range(self._max_r + 1)]
+ Nr = [r_Nr[r] for r in range(self._max_r+1)]
N = heldout_fdist.N()
# Use Tr, Nr, and N to compute the probability estimate for
:rtype: list(float)
"""
- Tr = [0.0] * (self._max_r + 1)
+ Tr = [0.0] * (self._max_r+1)
for sample in self._heldout_fdist:
r = self._base_fdist[sample]
Tr[r] += self._heldout_fdist[sample]
frequency distribution.
"""
estimate = []
- for r in range(self._max_r + 1):
- if Nr[r] == 0:
- estimate.append(None)
- else:
- estimate.append(Tr[r] / (Nr[r] * N))
+ for r in range(self._max_r+1):
+ if Nr[r] == 0: estimate.append(None)
+ else: estimate.append(Tr[r]/(Nr[r]*N))
return estimate
def base_fdist(self):
:rtype: str
:return: A string representation of this ``ProbDist``.
"""
- s = "<HeldoutProbDist: %d base samples; %d heldout samples>"
+ s = '<HeldoutProbDist: %d base samples; %d heldout samples>'
return s % (self._base_fdist.N(), self._heldout_fdist.N())
-
+@compat.python_2_unicode_compatible
class CrossValidationProbDist(ProbDistI):
"""
The cross-validation estimate for the probability distribution of
is found by averaging the held-out estimates for the sample in
each pair of frequency distributions.
"""
-
SUM_TO_ONE = False
-
def __init__(self, freqdists, bins):
"""
Use the cross-validation estimate to create a probability
prob = 0.0
for heldout_probdist in self._heldout_probdists:
prob += heldout_probdist.prob(sample)
- return prob / len(self._heldout_probdists)
+ return prob/len(self._heldout_probdists)
def discount(self):
raise NotImplementedError()
:rtype: str
"""
- return "<CrossValidationProbDist: %d-way>" % len(self._freqdists)
-
+ return '<CrossValidationProbDist: %d-way>' % len(self._freqdists)
+@compat.python_2_unicode_compatible
class WittenBellProbDist(ProbDistI):
"""
The Witten-Bell estimate of a probability distribution. This distribution
it's assumed to be equal to that of the ``freqdist``
:type bins: int
"""
- assert bins is None or bins >= freqdist.B(), (
- "bins parameter must not be less than %d=freqdist.B()" % freqdist.B()
- )
+ assert bins is None or bins >= freqdist.B(),\
+ 'bins parameter must not be less than %d=freqdist.B()' % freqdist.B()
if bins is None:
bins = freqdist.B()
self._freqdist = freqdist
self._Z = bins - self._freqdist.B()
self._N = self._freqdist.N()
# self._P0 is P(0), precalculated for efficiency:
- if self._N == 0:
+ if self._N==0:
# if freqdist is empty, we approximate P(0) by a UniformProbDist:
self._P0 = 1.0 / self._Z
else:
def prob(self, sample):
# inherit docs from ProbDistI
c = self._freqdist[sample]
- return c / (self._N + self._T) if c != 0 else self._P0
+ return (c / (self._N + self._T) if c != 0 else self._P0)
def max(self):
return self._freqdist.max()
:rtype: str
"""
- return "<WittenBellProbDist based on %d samples>" % self._freqdist.N()
+ return '<WittenBellProbDist based on %d samples>' % self._freqdist.N()
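# A quick numeric check matching the prob() and _P0 definitions above:
# seen samples get c / (N + T); the reserved mass T / (N + T) is spread
# uniformly over the Z = bins - B unseen bins.
def _witten_bell_sketch():
    fd = FreqDist('aabbbc')               # N = 6, B = T = 3
    pd = WittenBellProbDist(fd, bins=10)  # Z = 10 - 3 = 7
    print(pd.prob('b'))  # 3 / (6 + 3) = 1/3
    print(pd.prob('z'))  # 3 / (7 * (6 + 3)) ~= 0.048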
##//////////////////////////////////////////////////////
## Simple Good-Turing Probability Distributions
##//////////////////////////////////////////////////////
-
-
+@compat.python_2_unicode_compatible
class SimpleGoodTuringProbDist(ProbDistI):
"""
SimpleGoodTuring ProbDist approximates from frequency to frequency of
- slope: b = sigma ((xi-E(x))(yi-E(y))) / sigma ((xi-E(x))(xi-E(x)))
- intercept: a = E(y) - b.E(x)
"""
-
SUM_TO_ONE = False
-
def __init__(self, freqdist, bins=None):
"""
:param freqdist: The frequency counts upon which to base the
then it's assumed to be equal to ``freqdist``.B() + 1
:type bins: int
"""
- assert (
- bins is None or bins > freqdist.B()
- ), "bins parameter must not be less than %d=freqdist.B()+1" % (freqdist.B() + 1)
+ assert bins is None or bins > freqdist.B(),\
+ 'bins parameter must not be less than %d=freqdist.B()+1' % (freqdist.B()+1)
if bins is None:
bins = freqdist.B() + 1
self._freqdist = freqdist
zr = []
for j in range(len(r)):
- i = r[j - 1] if j > 0 else 0
- k = 2 * r[j] - i if j == len(r) - 1 else r[j + 1]
+ i = (r[j-1] if j > 0 else 0)
+ k = (2 * r[j] - i if j == len(r) - 1 else r[j+1])
zr_ = 2.0 * nr[j] / (k - i)
zr.append(zr_)
y_mean = sum(log_zr) / len(log_zr)
for (x, y) in zip(log_r, log_zr):
xy_cov += (x - x_mean) * (y - y_mean)
- x_var += (x - x_mean) ** 2
- self._slope = xy_cov / x_var if x_var != 0 else 0.0
+ x_var += (x - x_mean)**2
+ self._slope = (xy_cov / x_var if x_var != 0 else 0.0)
if self._slope >= -1:
- warnings.warn(
- "SimpleGoodTuring did not find a proper best fit "
- "line for smoothing probabilities of occurrences. "
- "The probability estimates are likely to be "
- "unreliable."
- )
+ warnings.warn('SimpleGoodTuring did not find a proper best fit '
+ 'line for smoothing probabilities of occurrences. '
+ 'The probability estimates are likely to be '
+ 'unreliable.')
self._intercept = y_mean - self._slope * x_mean
def _switch(self, r, nr):
when estimating E[Nr].
"""
for i, r_ in enumerate(r):
- if len(r) == i + 1 or r[i + 1] != r_ + 1:
+ if len(r) == i + 1 or r[i+1] != r_ + 1:
# We are at the end of r, or there is a gap in r
self._switch_at = r_
break
Sr = self.smoothedNr
- smooth_r_star = (r_ + 1) * Sr(r_ + 1) / Sr(r_)
- unsmooth_r_star = (r_ + 1) * nr[i + 1] / nr[i]
+ smooth_r_star = (r_ + 1) * Sr(r_+1) / Sr(r_)
+ unsmooth_r_star = (r_ + 1) * nr[i+1] / nr[i]
- std = math.sqrt(self._variance(r_, nr[i], nr[i + 1]))
- if abs(unsmooth_r_star - smooth_r_star) <= 1.96 * std:
+ std = math.sqrt(self._variance(r_, nr[i], nr[i+1]))
+ if abs(unsmooth_r_star-smooth_r_star) <= 1.96 * std:
self._switch_at = r_
break
r = float(r)
nr = float(nr)
nr_1 = float(nr_1)
- return (r + 1.0) ** 2 * (nr_1 / nr ** 2) * (1.0 + nr_1 / nr)
+ return (r + 1.0)**2 * (nr_1 / nr**2) * (1.0 + nr_1 / nr)
def _renormalize(self, r, nr):
"""
"""
prob_cov = 0.0
for r_, nr_ in zip(r, nr):
- prob_cov += nr_ * self._prob_measure(r_)
+ prob_cov += nr_ * self._prob_measure(r_)
if prob_cov:
self._renormal = (1 - self._prob_measure(0)) / prob_cov
return p
def _prob_measure(self, count):
- if count == 0 and self._freqdist.N() == 0:
+ if count == 0 and self._freqdist.N() == 0 :
return 1.0
elif count == 0 and self._freqdist.N() != 0:
return self._freqdist.Nr(1) / self._freqdist.N()
if self._switch_at > count:
- Er_1 = self._freqdist.Nr(count + 1)
+ Er_1 = self._freqdist.Nr(count+1)
Er = self._freqdist.Nr(count)
else:
- Er_1 = self.smoothedNr(count + 1)
+ Er_1 = self.smoothedNr(count+1)
Er = self.smoothedNr(count)
r_star = (count + 1) * Er_1 / Er
def check(self):
prob_sum = 0.0
- for i in range(0, len(self._Nr)):
+ for i in range(0, len(self._Nr)):
prob_sum += self._Nr[i] * self._prob_measure(i) / self._renormal
print("Probability Sum:", prob_sum)
- # assert prob_sum != 1.0, "probability sum should be one!"
+ #assert prob_sum == 1.0, "probability sum should be one!"
def discount(self):
"""
This function returns the total mass of probability transfers from the
seen samples to the unseen samples.
"""
- return self.smoothedNr(1) / self._freqdist.N()
+ return self.smoothedNr(1) / self._freqdist.N()
def max(self):
return self._freqdist.max()
:rtype: str
"""
- return "<SimpleGoodTuringProbDist based on %d samples>" % self._freqdist.N()
+ return '<SimpleGoodTuringProbDist based on %d samples>'\
+ % self._freqdist.N()
class MutableProbDist(ProbDistI):
self._data[i] = prob_dist.prob(samples[i])
self._logs = store_logs
- def max(self):
- # inherit documentation
- return max((p, v) for (v, p) in self._sample_dict.items())[1]
-
def samples(self):
# inherit documentation
return self._samples
i = self._sample_dict.get(sample)
if i is None:
return 0.0
- return 2 ** (self._data[i]) if self._logs else self._data[i]
+ return (2**(self._data[i]) if self._logs else self._data[i])
def logprob(self, sample):
# inherit documentation
i = self._sample_dict.get(sample)
if i is None:
- return float("-inf")
- return self._data[i] if self._logs else math.log(self._data[i], 2)
+ return float('-inf')
+ return (self._data[i] if self._logs else math.log(self._data[i], 2))
def update(self, sample, prob, log=True):
"""
i = self._sample_dict.get(sample)
assert i is not None
if self._logs:
- self._data[i] = prob if log else math.log(prob, 2)
+ self._data[i] = (prob if log else math.log(prob, 2))
else:
- self._data[i] = 2 ** (prob) if log else prob
-
+ self._data[i] = (2**(prob) if log else prob)
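# A minimal usage sketch, assuming the standard MutableProbDist
# constructor signature (prob_dist, samples, store_logs=True):
# probabilities can be reassigned after construction via update().
def _mutable_probdist_sketch():
    base = MLEProbDist(FreqDist('aabb'))
    pd = MutableProbDist(base, ['a', 'b'])
    pd.update('a', 0.7, log=False)
    pd.update('b', 0.3, log=False)
    print(pd.prob('a'), pd.prob('b'))  # 0.7 0.3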
##/////////////////////////////////////////////////////
## Kneser-Ney Probability Distribution
# and take advantage of storing and retrieving information in dictionaries
# where possible.
-
-
+@compat.python_2_unicode_compatible
class KneserNeyProbDist(ProbDistI):
"""
Kneser-Ney estimate of a probability distribution. This is a version of
value can be specified. The default discount is set to 0.75.
"""
-
def __init__(self, freqdist, bins=None, discount=0.75):
"""
:param freqdist: The trigram frequency distribution upon which to base
self._trigrams_contain = defaultdict(float)
self._wordtypes_before = defaultdict(float)
for w0, w1, w2 in freqdist:
- self._bigrams[(w0, w1)] += freqdist[(w0, w1, w2)]
- self._wordtypes_after[(w0, w1)] += 1
+ self._bigrams[(w0,w1)] += freqdist[(w0, w1, w2)]
+ self._wordtypes_after[(w0,w1)] += 1
self._trigrams_contain[w1] += 1
- self._wordtypes_before[(w1, w2)] += 1
+ self._wordtypes_before[(w1,w2)] += 1
def prob(self, trigram):
# sample must be a triple
if len(trigram) != 3:
- raise ValueError("Expected an iterable with 3 members.")
+ raise ValueError('Expected an iterable with 3 members.')
trigram = tuple(trigram)
w0, w1, w2 = trigram
else:
# if the sample trigram was seen during training
if trigram in self._trigrams:
- prob = (self._trigrams[trigram] - self.discount()) / self._bigrams[
- (w0, w1)
- ]
+ prob = (self._trigrams[trigram]
+ - self.discount())/self._bigrams[(w0, w1)]
# else if the 'rougher' environment was seen during training
- elif (w0, w1) in self._bigrams and (w1, w2) in self._wordtypes_before:
+ elif (w0,w1) in self._bigrams and (w1,w2) in self._wordtypes_before:
aftr = self._wordtypes_after[(w0, w1)]
bfr = self._wordtypes_before[(w1, w2)]
# the probability left over from alphas
- leftover_prob = (aftr * self.discount()) / self._bigrams[(w0, w1)]
+ leftover_prob = ((aftr * self.discount())
+ / self._bigrams[(w0, w1)])
# the beta (including normalization)
- beta = bfr / (self._trigrams_contain[w1] - aftr)
+ beta = bfr /(self._trigrams_contain[w1] - aftr)
prob = leftover_prob * beta
return self._trigrams.max()
def __repr__(self):
- """
+ '''
Return a string representation of this ProbDist
:rtype: str
- """
- return "<KneserNeyProbDist based on {0} trigrams".format(self._trigrams.N())
-
+ '''
+ return '<KneserNeyProbDist based on {0} trigrams>'.format(self._trigrams.N())
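# A minimal usage sketch: KneserNeyProbDist expects a FreqDist over
# trigrams; seen trigrams are discounted by 0.75 unless another
# discount is given.
def _kneser_ney_sketch():
    from nltk.util import trigrams
    words = 'the cat sat on the mat and the cat slept'.split()
    fd = FreqDist(trigrams(words))
    pd = KneserNeyProbDist(fd)
    print(pd.prob(('the', 'cat', 'sat')))  # discounted trigram estimate
    print(pd.discount())                   # 0.75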
##//////////////////////////////////////////////////////
## Probability Distribution Operations
##//////////////////////////////////////////////////////
-
def log_likelihood(test_pdist, actual_pdist):
- if not isinstance(test_pdist, ProbDistI) or not isinstance(actual_pdist, ProbDistI):
- raise ValueError("expected a ProbDist.")
+ if (not isinstance(test_pdist, ProbDistI) or
+ not isinstance(actual_pdist, ProbDistI)):
+ raise ValueError('expected a ProbDist.')
# Is this right?
- return sum(
- actual_pdist.prob(s) * math.log(test_pdist.prob(s), 2) for s in actual_pdist
- )
-
+ return sum(actual_pdist.prob(s) * math.log(test_pdist.prob(s), 2)
+ for s in actual_pdist)
def entropy(pdist):
probs = (pdist.prob(s) for s in pdist.samples())
- return -sum(p * math.log(p, 2) for p in probs)
-
+ return -sum(p * math.log(p,2) for p in probs)
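# A quick sanity check: a uniform distribution over four samples has an
# entropy of exactly two bits.
def _entropy_sketch():
    print(entropy(UniformProbDist(['a', 'b', 'c', 'd'])))  # 2.0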
##//////////////////////////////////////////////////////
## Conditional Distributions
##//////////////////////////////////////////////////////
-
-
+@compat.python_2_unicode_compatible
class ConditionalFreqDist(defaultdict):
"""
A collection of frequency distributions for a single experiment
condition.
"""
-
def __init__(self, cond_samples=None):
"""
Construct a new empty conditional frequency distribution. In
:rtype: int
"""
- return sum(fdist.N() for fdist in self.values())
+ return sum(fdist.N() for fdist in itervalues(self))
def plot(self, *args, **kwargs):
"""
:type conditions: list
"""
try:
- import matplotlib.pyplot as plt #import statment fix
+ from matplotlib import pylab
except ImportError:
- raise ValueError(
- "The plot function requires matplotlib to be installed."
- "See http://matplotlib.org/"
- )
+ raise ValueError('The plot function requires matplotlib to be installed.'
+ 'See http://matplotlib.org/')
cumulative = _get_kwarg(kwargs, 'cumulative', False)
- percents = _get_kwarg(kwargs, 'percents', False)
- conditions = [c for c in _get_kwarg(kwargs, 'conditions', self.conditions()) if c in self] # conditions should be in self
+ conditions = _get_kwarg(kwargs, 'conditions', sorted(self.conditions()))
title = _get_kwarg(kwargs, 'title', '')
- samples = _get_kwarg(
- kwargs, 'samples', sorted(set(v
- for c in conditions
- for v in self[c]))
- ) # this computation could be wasted
- if "linewidth" not in kwargs:
+ samples = _get_kwarg(kwargs, 'samples',
+ sorted(set(v for c in conditions for v in self[c]))) # this computation could be wasted
+ if not "linewidth" in kwargs:
kwargs["linewidth"] = 2
- ax = plt.gca()
- if (len(conditions) != 0):
- freqs = []
- for condition in conditions:
- if cumulative:
- # freqs should be a list of list where each sub list will be a frequency of a condition
- freqs.append(list(self[condition]._cumulative_frequencies(samples)))
- ylabel = "Cumulative Counts"
- legend_loc = 'lower right'
- if percents:
- freqs[-1] = [f / freqs[len(freqs) - 1] * 100 for f in freqs]
- ylabel = "Cumulative Percents"
- else:
- freqs.append([self[condition][sample] for sample in samples])
- ylabel = "Counts"
- legend_loc = 'upper right'
- # percents = [f * 100 for f in freqs] only in ConditionalProbDist?
-
- i = 0
- for freq in freqs:
- kwargs['label'] = conditions[i] #label for each condition
- i += 1
- ax.plot(freq, *args, **kwargs)
- ax.legend(loc=legend_loc)
- ax.grid(True, color="silver")
- ax.set_xticks(range(len(samples)))
- ax.set_xticklabels([str(s) for s in samples], rotation=90)
- if title:
- ax.set_title(title)
- ax.set_xlabel("Samples")
- ax.set_ylabel(ylabel)
- plt.show()
-
- return ax
+
+ for condition in conditions:
+ if cumulative:
+ freqs = list(self[condition]._cumulative_frequencies(samples))
+ ylabel = "Cumulative Counts"
+ legend_loc = 'lower right'
+ else:
+ freqs = [self[condition][sample] for sample in samples]
+ ylabel = "Counts"
+ legend_loc = 'upper right'
+ # percents = [f * 100 for f in freqs] only in ConditionalProbDist?
+ kwargs['label'] = "%s" % condition
+ pylab.plot(freqs, *args, **kwargs)
+
+ pylab.legend(loc=legend_loc)
+ pylab.grid(True, color="silver")
+ pylab.xticks(range(len(samples)), [text_type(s) for s in samples], rotation=90)
+ if title:
+ pylab.title(title)
+ pylab.xlabel("Samples")
+ pylab.ylabel(ylabel)
+ pylab.show()
def tabulate(self, *args, **kwargs):
"""
:type title: bool
"""
- cumulative = _get_kwarg(kwargs, "cumulative", False)
- conditions = _get_kwarg(kwargs, "conditions", sorted(self.conditions()))
- samples = _get_kwarg(
- kwargs,
- "samples",
- sorted(set(v for c in conditions if c in self for v in self[c])),
- ) # this computation could be wasted
+ cumulative = _get_kwarg(kwargs, 'cumulative', False)
+ conditions = _get_kwarg(kwargs, 'conditions', sorted(self.conditions()))
+ samples = _get_kwarg(kwargs, 'samples',
+ sorted(set(v for c in conditions for v in self[c]))) # this computation could be wasted
width = max(len("%s" % s) for s in samples)
freqs = dict()
width = max(width, max(len("%d" % f) for f in freqs[c]))
condition_size = max(len("%s" % c) for c in conditions)
- print(" " * condition_size, end=" ")
+ print(' ' * condition_size, end=' ')
for s in samples:
- print("%*s" % (width, s), end=" ")
+ print("%*s" % (width, s), end=' ')
print()
for c in conditions:
- print("%*s" % (condition_size, c), end=" ")
+ print("%*s" % (condition_size, c), end=' ')
for f in freqs[c]:
- print("%*d" % (width, f), end=" ")
+ print("%*d" % (width, f), end=' ')
print()
# Mathematical operators
def __le__(self, other):
if not isinstance(other, ConditionalFreqDist):
raise_unorderable_types("<=", self, other)
- return set(self.conditions()).issubset(other.conditions()) and all(
- self[c] <= other[c] for c in self.conditions()
- )
-
+ return set(self.conditions()).issubset(other.conditions()) \
+ and all(self[c] <= other[c] for c in self.conditions())
def __lt__(self, other):
if not isinstance(other, ConditionalFreqDist):
raise_unorderable_types("<", self, other)
return self <= other and self != other
-
def __ge__(self, other):
if not isinstance(other, ConditionalFreqDist):
raise_unorderable_types(">=", self, other)
return other <= self
-
def __gt__(self, other):
if not isinstance(other, ConditionalFreqDist):
raise_unorderable_types(">", self, other)
:rtype: str
"""
- return "<ConditionalFreqDist with %d conditions>" % len(self)
-
+ return '<ConditionalFreqDist with %d conditions>' % len(self)
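# A minimal usage sketch: a ConditionalFreqDist built from
# (condition, sample) pairs keeps one FreqDist per condition.
def _cfd_usage_sketch():
    words = 'the cat sat on the mat'.split()
    cfd = ConditionalFreqDist((len(w), w) for w in words)
    print(cfd[3]['the'])  # 2: 'the' seen twice under condition 3
    print(cfd.N())        # 6: outcomes summed over all conditions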
-class ConditionalProbDistI(dict, metaclass=ABCMeta):
+@compat.python_2_unicode_compatible
+@add_metaclass(ABCMeta)
+class ConditionalProbDistI(dict):
"""
A collection of probability distributions for a single experiment
run under different conditions. Conditional probability
condition to the ``ProbDist`` for the experiment under that
condition.
"""
-
@abstractmethod
def __init__(self):
"""
:rtype: str
"""
- return "<%s with %d conditions>" % (type(self).__name__, len(self))
+ return '<%s with %d conditions>' % (type(self).__name__, len(self))
class ConditionalProbDist(ConditionalProbDistI):
0.423...
"""
-
- def __init__(self, cfdist, probdist_factory, *factory_args, **factory_kw_args):
+ def __init__(self, cfdist, probdist_factory,
+ *factory_args, **factory_kw_args):
"""
Construct a new conditional probability distribution, based on
the given conditional frequency distribution and ``ProbDist``
self._factory_kw_args = factory_kw_args
for condition in cfdist:
- self[condition] = probdist_factory(
- cfdist[condition], *factory_args, **factory_kw_args
- )
+ self[condition] = probdist_factory(cfdist[condition],
+ *factory_args, **factory_kw_args)
def __missing__(self, key):
- self[key] = self._probdist_factory(
- FreqDist(), *self._factory_args, **self._factory_kw_args
- )
+ self[key] = self._probdist_factory(FreqDist(),
+ *self._factory_args,
+ **self._factory_kw_args)
return self[key]
-
class DictionaryConditionalProbDist(ConditionalProbDistI):
"""
An alternative ConditionalProbDist that simply wraps a dictionary of
self[key] = DictionaryProbDist()
return self[key]
-
##//////////////////////////////////////////////////////
## Adding in log-space.
##//////////////////////////////////////////////////////
# If the difference is bigger than this, then just take the bigger one:
_ADD_LOGS_MAX_DIFF = math.log(1e-30, 2)
-
def add_logs(logx, logy):
"""
Given two numbers ``logx`` = *log(x)* and ``logy`` = *log(y)*, return
``log(2**(logx)+2**(logy))``, but the actual implementation
avoids overflow errors that could result from direct computation.
"""
- if logx < logy + _ADD_LOGS_MAX_DIFF:
+ if (logx < logy + _ADD_LOGS_MAX_DIFF):
return logy
- if logy < logx + _ADD_LOGS_MAX_DIFF:
+ if (logy < logx + _ADD_LOGS_MAX_DIFF):
return logx
base = min(logx, logy)
- return base + math.log(2 ** (logx - base) + 2 ** (logy - base), 2)
-
+ return base + math.log(2**(logx-base) + 2**(logy-base), 2)
def sum_logs(logs):
- return reduce(add_logs, logs[1:], logs[0]) if len(logs) != 0 else _NINF
-
+ return (reduce(add_logs, logs[1:], logs[0]) if len(logs) != 0 else _NINF)
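# A quick numeric check of the log-space helpers: adding log2(0.25) to
# log2(0.25) in log space gives log2(0.5), i.e. -1.0.
def _log_space_sketch():
    print(add_logs(math.log(0.25, 2), math.log(0.25, 2)))   # -1.0
    print(sum_logs([math.log(0.5, 2), math.log(0.25, 2)]))  # log2(0.75)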
##//////////////////////////////////////////////////////
## Probabilistic Mix-in
##//////////////////////////////////////////////////////
-
class ProbabilisticMixIn(object):
"""
A mix-in class to associate probabilities with other classes
You should generally also redefine the string representation
methods, the comparison methods, and the hashing method.
"""
-
def __init__(self, **kwargs):
"""
Initialize this object's probability. This initializer should
the object.
:type logprob: float
"""
- if "prob" in kwargs:
- if "logprob" in kwargs:
- raise TypeError("Must specify either prob or logprob " "(not both)")
+ if 'prob' in kwargs:
+ if 'logprob' in kwargs:
+ raise TypeError('Must specify either prob or logprob '
+ '(not both)')
else:
- ProbabilisticMixIn.set_prob(self, kwargs["prob"])
- elif "logprob" in kwargs:
- ProbabilisticMixIn.set_logprob(self, kwargs["logprob"])
+ ProbabilisticMixIn.set_prob(self, kwargs['prob'])
+ elif 'logprob' in kwargs:
+ ProbabilisticMixIn.set_logprob(self, kwargs['logprob'])
else:
self.__prob = self.__logprob = None
:rtype: float
"""
if self.__prob is None:
- if self.__logprob is None:
- return None
- self.__prob = 2 ** (self.__logprob)
+ if self.__logprob is None: return None
+ self.__prob = 2**(self.__logprob)
return self.__prob
def logprob(self):
:rtype: float
"""
if self.__logprob is None:
- if self.__prob is None:
- return None
+ if self.__prob is None: return None
self.__logprob = math.log(self.__prob, 2)
return self.__logprob
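# A minimal sketch of the mix-in contract described above: a subclass
# forwards a 'prob' or 'logprob' keyword to ProbabilisticMixIn.__init__
# (the _ProbToken class is illustrative only).
class _ProbToken(ProbabilisticMixIn):
    def __init__(self, name, **kwargs):
        self.name = name
        ProbabilisticMixIn.__init__(self, **kwargs)

def _probabilistic_mixin_sketch():
    t = _ProbToken('hello', prob=0.25)
    print(t.prob(), t.logprob())  # 0.25 -2.0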
-
class ImmutableProbabilisticMixIn(ProbabilisticMixIn):
def set_prob(self, prob):
- raise ValueError("%s is immutable" % self.__class__.__name__)
-
+ raise ValueError('%s is immutable' % self.__class__.__name__)
def set_logprob(self, prob):
- raise ValueError("%s is immutable" % self.__class__.__name__)
-
+ raise ValueError('%s is immutable' % self.__class__.__name__)
## Helper function for processing keyword arguments
-
def _get_kwarg(kwargs, key, default):
if key in kwargs:
arg = kwargs[key]
arg = default
return arg
-
##//////////////////////////////////////////////////////
## Demonstration
##//////////////////////////////////////////////////////
-
def _create_rand_fdist(numsamples, numoutcomes):
"""
Create a new frequency distribution, with random samples. The
samples are numbers from 1 to ``numsamples``, and are generated by
summing two numbers, each of which has a uniform distribution.
"""
-
+ import random
fdist = FreqDist()
for x in range(numoutcomes):
- y = random.randint(1, (1 + numsamples) // 2) + random.randint(
- 0, numsamples // 2
- )
+ y = (random.randint(1, (1 + numsamples) // 2) +
+ random.randint(0, numsamples // 2))
fdist[y] += 1
return fdist
-
def _create_sum_pdist(numsamples):
"""
Return the true probability distribution for the experiment
fdist = FreqDist()
for x in range(1, (1 + numsamples) // 2 + 1):
for y in range(0, numsamples // 2 + 1):
- fdist[x + y] += 1
+ fdist[x+y] += 1
return MLEProbDist(fdist)
-
def demo(numsamples=6, numoutcomes=500):
"""
A demonstration of frequency distributions and probability
# Find the probability of each sample.
vals = []
- for n in range(1, numsamples + 1):
- vals.append(tuple([n, fdist1.freq(n)] + [pdist.prob(n) for pdist in pdists]))
+ for n in range(1,numsamples+1):
+ vals.append(tuple([n, fdist1.freq(n)] +
+ [pdist.prob(n) for pdist in pdists]))
# Print the results in a formatted table.
- print(
- (
- "%d samples (1-%d); %d outcomes were sampled for each FreqDist"
- % (numsamples, numsamples, numoutcomes)
- )
- )
- print("=" * 9 * (len(pdists) + 2))
- FORMATSTR = " FreqDist " + "%8s " * (len(pdists) - 1) + "| Actual"
+ print(('%d samples (1-%d); %d outcomes were sampled for each FreqDist' %
+ (numsamples, numsamples, numoutcomes)))
+ print('='*9*(len(pdists)+2))
+ FORMATSTR = ' FreqDist '+ '%8s '*(len(pdists)-1) + '| Actual'
print(FORMATSTR % tuple(repr(pdist)[1:9] for pdist in pdists[:-1]))
- print("-" * 9 * (len(pdists) + 2))
- FORMATSTR = "%3d %8.6f " + "%8.6f " * (len(pdists) - 1) + "| %8.6f"
+ print('-'*9*(len(pdists)+2))
+ FORMATSTR = '%3d %8.6f ' + '%8.6f '*(len(pdists)-1) + '| %8.6f'
for val in vals:
print(FORMATSTR % val)
# Print the totals for each column (should all be 1.0)
zvals = list(zip(*vals))
sums = [sum(val) for val in zvals[1:]]
- print("-" * 9 * (len(pdists) + 2))
- FORMATSTR = "Total " + "%8.6f " * (len(pdists)) + "| %8.6f"
+ print('-'*9*(len(pdists)+2))
+ FORMATSTR = 'Total ' + '%8.6f '*(len(pdists)) + '| %8.6f'
print(FORMATSTR % tuple(sums))
- print("=" * 9 * (len(pdists) + 2))
+ print('='*9*(len(pdists)+2))
# Display the distributions themselves, if they're short enough.
if len("%s" % fdist1) < 70:
- print(" fdist1: %s" % fdist1)
- print(" fdist2: %s" % fdist2)
- print(" fdist3: %s" % fdist3)
+ print(' fdist1: %s' % fdist1)
+ print(' fdist2: %s' % fdist2)
+ print(' fdist3: %s' % fdist3)
print()
- print("Generating:")
+ print('Generating:')
for pdist in pdists:
fdist = FreqDist(pdist.generate() for i in range(5000))
- print("%20s %s" % (pdist.__class__.__name__[:20], ("%s" % fdist)[:55]))
+ print('%20s %s' % (pdist.__class__.__name__[:20], ("%s" % fdist)[:55]))
print()
-
def gt_demo():
from nltk import corpus
-
- emma_words = corpus.gutenberg.words("austen-emma.txt")
+ emma_words = corpus.gutenberg.words('austen-emma.txt')
fd = FreqDist(emma_words)
sgt = SimpleGoodTuringProbDist(fd)
- print("%18s %8s %14s" % ("word", "freqency", "SimpleGoodTuring"))
- fd_keys_sorted = (
- key for key, value in sorted(fd.items(), key=lambda item: item[1], reverse=True)
- )
+ print('%18s %8s %14s' \
+ % ("word", "freqency", "SimpleGoodTuring"))
+ fd_keys_sorted=(key for key, value in sorted(fd.items(), key=lambda item: item[1], reverse=True))
for key in fd_keys_sorted:
- print("%18s %8d %14e" % (key, fd[key], sgt.prob(key)))
-
+ print('%18s %8d %14e' \
+ % (key, fd[key], sgt.prob(key)))
-if __name__ == "__main__":
+if __name__ == '__main__':
demo(6, 10)
demo(5, 5000)
gt_demo()
-__all__ = [
- "ConditionalFreqDist",
- "ConditionalProbDist",
- "ConditionalProbDistI",
- "CrossValidationProbDist",
- "DictionaryConditionalProbDist",
- "DictionaryProbDist",
- "ELEProbDist",
- "FreqDist",
- "SimpleGoodTuringProbDist",
- "HeldoutProbDist",
- "ImmutableProbabilisticMixIn",
- "LaplaceProbDist",
- "LidstoneProbDist",
- "MLEProbDist",
- "MutableProbDist",
- "KneserNeyProbDist",
- "ProbDistI",
- "ProbabilisticMixIn",
- "UniformProbDist",
- "WittenBellProbDist",
- "add_logs",
- "log_likelihood",
- "sum_logs",
- "entropy",
-]
+__all__ = ['ConditionalFreqDist', 'ConditionalProbDist',
+ 'ConditionalProbDistI', 'CrossValidationProbDist',
+ 'DictionaryConditionalProbDist', 'DictionaryProbDist', 'ELEProbDist',
+ 'FreqDist', 'SimpleGoodTuringProbDist', 'HeldoutProbDist',
+ 'ImmutableProbabilisticMixIn', 'LaplaceProbDist', 'LidstoneProbDist',
+ 'MLEProbDist', 'MutableProbDist', 'KneserNeyProbDist', 'ProbDistI', 'ProbabilisticMixIn',
+ 'UniformProbDist', 'WittenBellProbDist', 'add_logs',
+ 'log_likelihood', 'sum_logs', 'entropy']
# Natural Language Toolkit: Semantic Interpretation
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
>>> m = Model(dom, val)
"""
-from nltk.sem.util import parse_sents, interpret_sents, evaluate_sents, root_semrep
-from nltk.sem.evaluate import (
- Valuation,
- Assignment,
- Model,
- Undefined,
- is_rel,
- set2rel,
- arity,
- read_valuation,
-)
-from nltk.sem.logic import (
- boolean_ops,
- binding_ops,
- equality_preds,
- read_logic,
- Variable,
- Expression,
- ApplicationExpression,
- LogicalExpressionException,
-)
+from nltk.sem.util import (parse_sents, interpret_sents, evaluate_sents,
+ root_semrep)
+from nltk.sem.evaluate import (Valuation, Assignment, Model, Undefined,
+ is_rel, set2rel, arity, read_valuation)
+from nltk.sem.logic import (boolean_ops, binding_ops, equality_preds,
+ read_logic, Variable, Expression,
+ ApplicationExpression, LogicalExpressionException)
from nltk.sem.skolemize import skolemize
from nltk.sem.lfg import FStructure
-from nltk.sem.relextract import extract_rels, rtuple, clause
+from nltk.sem.relextract import (extract_rels, rtuple, clause)
from nltk.sem.boxer import Boxer
from nltk.sem.drt import DrtExpression, DRS
#
# Author: Dan Garrette <dhgarrette@gmail.com>
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
models/
boxer/
"""
+from __future__ import print_function, unicode_literals
import os
import re
from nltk.internals import find_binary
-from nltk.sem.logic import (
- ExpectedMoreTokensException,
- LogicalExpressionException,
- UnexpectedTokenException,
- Variable,
-)
-
-from nltk.sem.drt import (
- DRS,
- DrtApplicationExpression,
- DrtEqualityExpression,
- DrtNegatedExpression,
- DrtOrExpression,
- DrtParser,
- DrtProposition,
- DrtTokens,
- DrtVariableExpression,
-)
+from nltk.sem.logic import (ExpectedMoreTokensException, LogicalExpressionException,
+ UnexpectedTokenException, Variable)
+from nltk.sem.drt import (DRS, DrtApplicationExpression, DrtEqualityExpression,
+ DrtNegatedExpression, DrtOrExpression, DrtParser,
+ DrtProposition, DrtTokens, DrtVariableExpression)
+
+from nltk.compat import python_2_unicode_compatible
class Boxer(object):
"""
semantic parser that produces Discourse Representation Structures (DRSs).
"""
- def __init__(
- self,
- boxer_drs_interpreter=None,
- elimeq=False,
- bin_dir=None,
- verbose=False,
- resolve=True,
- ):
+ def __init__(self, boxer_drs_interpreter=None, elimeq=False, bin_dir=None, verbose=False, resolve=True):
"""
:param boxer_drs_interpreter: A class that converts from the
``AbstractBoxerDrs`` object hierarchy to a different object. The
:param elimeq: When set to true, Boxer removes all equalities from the
DRSs and discourse referents standing in the equality relation are
unified, but only if this can be done in a meaning-preserving manner.
- :param resolve: When set to true, Boxer will resolve all anaphoric DRSs and perform merge-reduction.
+ :param resolve: When set to true, Boxer will resolve all anaphoric DRSs and perform merge-reduction.
Resolution follows Van der Sandt's theory of binding and accommodation.
"""
if boxer_drs_interpreter is None:
self.set_bin_dir(bin_dir, verbose)
def set_bin_dir(self, bin_dir, verbose=False):
- self._candc_bin = self._find_binary("candc", bin_dir, verbose)
- self._candc_models_path = os.path.normpath(
- os.path.join(self._candc_bin[:-5], "../models")
- )
- self._boxer_bin = self._find_binary("boxer", bin_dir, verbose)
+ self._candc_bin = self._find_binary('candc', bin_dir, verbose)
+ self._candc_models_path = os.path.normpath(os.path.join(self._candc_bin[:-5], '../models'))
+ self._boxer_bin = self._find_binary('boxer', bin_dir, verbose)
def interpret(self, input, discourse_id=None, question=False, verbose=False):
"""
:param discourse_id: str An identifier to be inserted into each occurrence-indexed predicate.
:return: ``drt.DrtExpression``
"""
- discourse_ids = [discourse_id] if discourse_id is not None else None
+ discourse_ids = ([discourse_id] if discourse_id is not None else None)
d, = self.interpret_multi_sents([[input]], discourse_ids, question, verbose)
if not d:
raise Exception('Unable to interpret: "{0}"'.format(input))
:param discourse_id: str An identifier to be inserted into each occurrence-indexed predicate.
:return: ``drt.DrtExpression``
"""
- discourse_ids = [discourse_id] if discourse_id is not None else None
+ discourse_ids = ([discourse_id] if discourse_id is not None else None)
d, = self.interpret_multi_sents([input], discourse_ids, question, verbose)
if not d:
raise Exception('Unable to interpret: "{0}"'.format(input))
return d
- def interpret_sents(
- self, inputs, discourse_ids=None, question=False, verbose=False
- ):
+ def interpret_sents(self, inputs, discourse_ids=None, question=False, verbose=False):
"""
Use Boxer to give a first order representation.
:param discourse_ids: list of str Identifiers to be inserted into each occurrence-indexed predicate.
:return: list of ``drt.DrtExpression``
"""
- return self.interpret_multi_sents(
- [[input] for input in inputs], discourse_ids, question, verbose
- )
+ return self.interpret_multi_sents([[input] for input in inputs], discourse_ids, question, verbose)
- def interpret_multi_sents(
- self, inputs, discourse_ids=None, question=False, verbose=False
- ):
+ def interpret_multi_sents(self, inputs, discourse_ids=None, question=False, verbose=False):
"""
Use Boxer to give a first order representation.
candc_out = self._call_candc(inputs, discourse_ids, question, verbose=verbose)
boxer_out = self._call_boxer(candc_out, verbose=verbose)
- # if 'ERROR: input file contains no ccg/2 terms.' in boxer_out:
- # raise UnparseableInputException('Could not parse with candc: "%s"' % input_str)
+# if 'ERROR: input file contains no ccg/2 terms.' in boxer_out:
+# raise UnparseableInputException('Could not parse with candc: "%s"' % input_str)
drs_dict = self._parse_to_drs_dict(boxer_out, use_disc_id)
return [drs_dict.get(id, None) for id in discourse_ids]
:param filename: str A filename for the output file
:return: stdout
"""
- args = [
- "--models",
- os.path.join(self._candc_models_path, ["boxer", "questions"][question]),
- "--candc-printer",
- "boxer",
- ]
- return self._call(
- "\n".join(
- sum(
- (
- ["<META>'{0}'".format(id)] + d
- for d, id in zip(inputs, discourse_ids)
- ),
- [],
- )
- ),
- self._candc_bin,
- args,
- verbose,
- )
+ args = ['--models', os.path.join(self._candc_models_path, ['boxer','questions'][question]),
+ '--candc-printer', 'boxer']
+ return self._call('\n'.join(sum((["<META>'{0}'".format(id)] + d for d,id in zip(inputs,discourse_ids)), [])), self._candc_bin, args, verbose)
def _call_boxer(self, candc_out, verbose=False):
"""
"""
f = None
try:
- fd, temp_filename = tempfile.mkstemp(
- prefix="boxer-", suffix=".in", text=True
- )
- f = os.fdopen(fd, "w")
+ fd, temp_filename = tempfile.mkstemp(prefix='boxer-', suffix='.in', text=True)
+ f = os.fdopen(fd, 'w')
f.write(candc_out)
finally:
- if f:
- f.close()
-
- args = [
- "--box",
- "false",
- "--semantics",
- "drs",
- #'--flat', 'false', # removed from boxer
- "--resolve",
- ["false", "true"][self._resolve],
- "--elimeq",
- ["false", "true"][self._elimeq],
- "--format",
- "prolog",
- "--instantiate",
- "true",
- "--input",
- temp_filename,
- ]
+ if f: f.close()
+
+ args = ['--box', 'false',
+ '--semantics', 'drs',
+ #'--flat', 'false', # removed from boxer
+ '--resolve', ['false','true'][self._resolve],
+ '--elimeq', ['false','true'][self._elimeq],
+ '--format', 'prolog',
+ '--instantiate', 'true',
+ '--input', temp_filename]
stdout = self._call(None, self._boxer_bin, args, verbose)
os.remove(temp_filename)
return stdout
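# Note the ['false', 'true'][flag] idiom in the args above: a Python bool
# indexes the two-element list (False -> 'false', True -> 'true'), mapping the
# constructor's _resolve/_elimeq flags onto boxer's command-line values.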
def _find_binary(self, name, bin_dir, verbose=False):
- return find_binary(
- name,
+ return find_binary(name,
path_to_bin=bin_dir,
- env_vars=["CANDC"],
- url="http://svn.ask.it.usyd.edu.au/trac/candc/",
- binary_names=[name, name + ".exe"],
- verbose=verbose,
- )
+ env_vars=['CANDC'],
+ url='http://svn.ask.it.usyd.edu.au/trac/candc/',
+ binary_names=[name, name + '.exe'],
+ verbose=verbose)
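# Note (not part of the patch): _find_binary consults the CANDC environment
# variable, so either pass bin_dir explicitly or export the variable before
# constructing Boxer. The path below is hypothetical.
import os
os.environ['CANDC'] = '/usr/local/candc/bin'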
def _call(self, input_str, binary, args=[], verbose=False):
"""
Call the given binary on ``input_str`` via a subprocess.
:return: stdout
"""
if verbose:
- print("Calling:", binary)
- print("Args:", args)
- print("Input:", input_str)
- print("Command:", binary + " " + " ".join(args))
+ print('Calling:', binary)
+ print('Args:', args)
+ print('Input:', input_str)
+ print('Command:', binary + ' ' + ' '.join(args))
# Call via a subprocess
if input_str is None:
cmd = [binary] + args
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
else:
- cmd = 'echo "{0}" | {1} {2}'.format(input_str, binary, " ".join(args))
- p = subprocess.Popen(
- cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True
- )
+ cmd = 'echo "{0}" | {1} {2}'.format(input_str, binary, ' '.join(args))
+ p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
stdout, stderr = p.communicate()
if verbose:
- print("Return code:", p.returncode)
- if stdout:
- print("stdout:\n", stdout, "\n")
- if stderr:
- print("stderr:\n", stderr, "\n")
+ print('Return code:', p.returncode)
+ if stdout: print('stdout:\n', stdout, '\n')
+ if stderr: print('stderr:\n', stderr, '\n')
if p.returncode != 0:
- raise Exception(
- "ERROR CALLING: {0} {1}\nReturncode: {2}\n{3}".format(
- binary, " ".join(args), p.returncode, stderr
- )
- )
+ raise Exception('ERROR CALLING: {0} {1}\nReturncode: {2}\n{3}'.format(binary, ' '.join(args), p.returncode, stderr))
return stdout
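# For reference (not part of the patch): when input_str is given, the call
# above amounts to the shell pipeline
#
#     echo "<input_str>" | <binary> <args...>
#
# i.e. input is piped through echo under shell=True rather than written to the
# process's stdin directly.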
def _parse_to_drs_dict(self, boxer_out, use_disc_id):
- lines = boxer_out.split("\n")
+ lines = boxer_out.split('\n')
drs_dict = {}
i = 0
while i < len(lines):
line = lines[i]
- if line.startswith("id("):
- comma_idx = line.index(",")
+ if line.startswith('id('):
+ comma_idx = line.index(',')
discourse_id = line[3:comma_idx]
if discourse_id[0] == "'" and discourse_id[-1] == "'":
discourse_id = discourse_id[1:-1]
- drs_id = line[comma_idx + 1 : line.index(")")]
+ drs_id = line[comma_idx+1:line.index(')')]
i += 1
line = lines[i]
- assert line.startswith("sem({0},".format(drs_id))
+ assert line.startswith('sem({0},'.format(drs_id))
if line[-4:] == "').'":
line = line[:-4] + ")."
- assert line.endswith(")."), "can't parse line: {0}".format(line)
+ assert line.endswith(').'), "can't parse line: {0}".format(line)
- search_start = len("sem({0},[".format(drs_id))
+ search_start = len('sem({0},['.format(drs_id))
brace_count = 1
drs_start = -1
- for j, c in enumerate(line[search_start:]):
- if c == "[":
+ for j,c in enumerate(line[search_start:]):
+ if(c == '['):
brace_count += 1
- if c == "]":
+ if(c == ']'):
brace_count -= 1
- if brace_count == 0:
+ if(brace_count == 0):
drs_start = search_start + j + 1
- if line[drs_start : drs_start + 3] == "','":
+ if line[drs_start:drs_start+3] == "','":
drs_start = drs_start + 3
else:
drs_start = drs_start + 1
return drs_dict
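# The loop above scans Prolog-style boxer output of the shape (illustrative):
#
#     id('d0',1).
#     sem(1,[...],drs([[1001]:_G3943],[[1002]:pred(_G3943,dog,n,0)])).
#
# id(...) pairs a (possibly quoted) discourse id with a DRS number; the matching
# sem(...) line carries the DRS itself, whose start is located by counting
# square brackets.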
def _parse_drs(self, drs_string, discourse_id, use_disc_id):
- return BoxerOutputDrsParser([None, discourse_id][use_disc_id]).parse(drs_string)
+ return BoxerOutputDrsParser([None,discourse_id][use_disc_id]).parse(drs_string)
class BoxerOutputDrsParser(DrtParser):
return DrtParser.parse(self, data, signature)
def get_all_symbols(self):
- return ["(", ")", ",", "[", "]", ":"]
+ return ['(', ')', ',', '[', ']',':']
def handle(self, tok, context):
return self.handle_drs(tok)
return accum
def handle_drs(self, tok):
- if tok == "drs":
+ if tok == 'drs':
return self.parse_drs()
- elif tok in ["merge", "smerge"]:
+ elif tok in ['merge', 'smerge']:
return self._handle_binary_expression(self._make_merge_expression)(None, [])
- elif tok in ["alfa"]:
+ elif tok in ['alfa']:
return self._handle_alfa(self._make_merge_expression)(None, [])
def handle_condition(self, tok, indices):
:param indices: list of int
:return: list of ``DrtExpression``
"""
- if tok == "not":
+ if tok == 'not':
return [self._handle_not()]
- if tok == "or":
+ if tok == 'or':
conds = [self._handle_binary_expression(self._make_or_expression)]
- elif tok == "imp":
+ elif tok == 'imp':
conds = [self._handle_binary_expression(self._make_imp_expression)]
- elif tok == "eq":
+ elif tok == 'eq':
conds = [self._handle_eq()]
- elif tok == "prop":
+ elif tok == 'prop':
conds = [self._handle_prop()]
- elif tok == "pred":
+ elif tok == 'pred':
conds = [self._handle_pred()]
- elif tok == "named":
+ elif tok == 'named':
conds = [self._handle_named()]
- elif tok == "rel":
+ elif tok == 'rel':
conds = [self._handle_rel()]
- elif tok == "timex":
+ elif tok == 'timex':
conds = self._handle_timex()
- elif tok == "card":
+ elif tok == 'card':
conds = [self._handle_card()]
- elif tok == "whq":
+ elif tok == 'whq':
conds = [self._handle_whq()]
- elif tok == "duplex":
- conds = [self._handle_duplex()]
+ elif tok == 'duplex':
+ conds = [self._handle_duplex()]
else:
conds = []
- return sum(
- [
- [cond(sent_index, word_indices) for cond in conds]
- for sent_index, word_indices in self._sent_and_word_indices(indices)
- ],
- [],
- )
+ return sum([[cond(sent_index, word_indices) for cond in conds] for sent_index, word_indices in self._sent_and_word_indices(indices)], [])
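# The sum(list_of_lists, []) idiom above flattens one level: each condition
# callback runs once per (sent_index, word_indices) pair, and the per-pair
# lists of conditions are concatenated into a single list.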
def _handle_not(self):
- self.assertToken(self.token(), "(")
+ self.assertToken(self.token(), '(')
drs = self.process_next_expression(None)
- self.assertToken(self.token(), ")")
+ self.assertToken(self.token(), ')')
return BoxerNot(drs)
def _handle_pred(self):
- # pred(_G3943, dog, n, 0)
- self.assertToken(self.token(), "(")
+ #pred(_G3943, dog, n, 0)
+ self.assertToken(self.token(), '(')
variable = self.parse_variable()
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
name = self.token()
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
pos = self.token()
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
sense = int(self.token())
- self.assertToken(self.token(), ")")
+ self.assertToken(self.token(), ')')
def _handle_pred_f(sent_index, word_indices):
- return BoxerPred(
- self.discourse_id, sent_index, word_indices, variable, name, pos, sense
- )
-
+ return BoxerPred(self.discourse_id, sent_index, word_indices, variable, name, pos, sense)
return _handle_pred_f
def _handle_duplex(self):
- # duplex(whq, drs(...), var, drs(...))
- self.assertToken(self.token(), "(")
+ #duplex(whq, drs(...), var, drs(...))
+ self.assertToken(self.token(), '(')
# self.assertToken(self.token(), '[')
ans_types = []
# while self.token(0) != ']':
# else:
# ans_types.append(self.token())
# self.token() #swallow the ']'
-
- self.assertToken(self.token(), "whq")
- self.assertToken(self.token(), ",")
+
+ self.assertToken(self.token(), 'whq')
+ self.assertToken(self.token(), ',')
d1 = self.process_next_expression(None)
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
ref = self.parse_variable()
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
d2 = self.process_next_expression(None)
- self.assertToken(self.token(), ")")
- return lambda sent_index, word_indices: BoxerWhq(
- self.discourse_id, sent_index, word_indices, ans_types, d1, ref, d2
- )
+ self.assertToken(self.token(), ')')
+ return lambda sent_index, word_indices: BoxerWhq(self.discourse_id, sent_index, word_indices, ans_types, d1, ref, d2)
+
def _handle_named(self):
- # named(x0, john, per, 0)
- self.assertToken(self.token(), "(")
+ #named(x0, john, per, 0)
+ self.assertToken(self.token(), '(')
variable = self.parse_variable()
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
name = self.token()
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
type = self.token()
- self.assertToken(self.token(), ",")
- sense = self.token() # as per boxer rev 2554
- self.assertToken(self.token(), ")")
- return lambda sent_index, word_indices: BoxerNamed(
- self.discourse_id, sent_index, word_indices, variable, name, type, sense
- )
+ self.assertToken(self.token(), ',')
+ sense = self.token() # as per boxer rev 2554
+ self.assertToken(self.token(), ')')
+ return lambda sent_index, word_indices: BoxerNamed(self.discourse_id, sent_index, word_indices, variable, name, type, sense)
def _handle_rel(self):
- # rel(_G3993, _G3943, agent, 0)
- self.assertToken(self.token(), "(")
+ #rel(_G3993, _G3943, agent, 0)
+ self.assertToken(self.token(), '(')
var1 = self.parse_variable()
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
var2 = self.parse_variable()
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
rel = self.token()
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
sense = int(self.token())
- self.assertToken(self.token(), ")")
- return lambda sent_index, word_indices: BoxerRel(
- self.discourse_id, sent_index, word_indices, var1, var2, rel, sense
- )
+ self.assertToken(self.token(), ')')
+ return lambda sent_index, word_indices: BoxerRel(self.discourse_id, sent_index, word_indices, var1, var2, rel, sense)
def _handle_timex(self):
- # timex(_G18322, date([]: (+), []:'XXXX', [1004]:'04', []:'XX'))
- self.assertToken(self.token(), "(")
+ #timex(_G18322, date([]: (+), []:'XXXX', [1004]:'04', []:'XX'))
+ self.assertToken(self.token(), '(')
arg = self.parse_variable()
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
new_conds = self._handle_time_expression(arg)
- self.assertToken(self.token(), ")")
+ self.assertToken(self.token(), ')')
return new_conds
def _handle_time_expression(self, arg):
- # date([]: (+), []:'XXXX', [1004]:'04', []:'XX')
+ #date([]: (+), []:'XXXX', [1004]:'04', []:'XX')
tok = self.token()
- self.assertToken(self.token(), "(")
- if tok == "date":
+ self.assertToken(self.token(), '(')
+ if tok == 'date':
conds = self._handle_date(arg)
- elif tok == "time":
+ elif tok == 'time':
conds = self._handle_time(arg)
else:
return None
- self.assertToken(self.token(), ")")
- return [
- lambda sent_index, word_indices: BoxerPred(
- self.discourse_id, sent_index, word_indices, arg, tok, "n", 0
- )
- ] + [lambda sent_index, word_indices: cond for cond in conds]
+ self.assertToken(self.token(), ')')
+ return [lambda sent_index, word_indices: BoxerPred(self.discourse_id, sent_index, word_indices, arg, tok, 'n', 0)] + \
+ [lambda sent_index, word_indices: cond for cond in conds]
def _handle_date(self, arg):
- # []: (+), []:'XXXX', [1004]:'04', []:'XX'
+ #[]: (+), []:'XXXX', [1004]:'04', []:'XX'
conds = []
- (sent_index, word_indices), = self._sent_and_word_indices(
- self._parse_index_list()
- )
- self.assertToken(self.token(), "(")
+ (sent_index, word_indices), = self._sent_and_word_indices(self._parse_index_list())
+ self.assertToken(self.token(), '(')
pol = self.token()
- self.assertToken(self.token(), ")")
- conds.append(
- BoxerPred(
- self.discourse_id,
- sent_index,
- word_indices,
- arg,
- "date_pol_{0}".format(pol),
- "a",
- 0,
- )
- )
- self.assertToken(self.token(), ",")
-
- (sent_index, word_indices), = self._sent_and_word_indices(
- self._parse_index_list()
- )
+ self.assertToken(self.token(), ')')
+ conds.append(BoxerPred(self.discourse_id, sent_index, word_indices, arg, 'date_pol_{0}'.format(pol), 'a', 0))
+ self.assertToken(self.token(), ',')
+
+ (sent_index, word_indices), = self._sent_and_word_indices(self._parse_index_list())
year = self.token()
- if year != "XXXX":
- year = year.replace(":", "_")
- conds.append(
- BoxerPred(
- self.discourse_id,
- sent_index,
- word_indices,
- arg,
- "date_year_{0}".format(year),
- "a",
- 0,
- )
- )
- self.assertToken(self.token(), ",")
-
- (sent_index, word_indices), = self._sent_and_word_indices(
- self._parse_index_list()
- )
+ if year != 'XXXX':
+ year = year.replace(':', '_')
+ conds.append(BoxerPred(self.discourse_id, sent_index, word_indices, arg, 'date_year_{0}'.format(year), 'a', 0))
+ self.assertToken(self.token(), ',')
+
+ (sent_index, word_indices), = self._sent_and_word_indices(self._parse_index_list())
month = self.token()
- if month != "XX":
- conds.append(
- BoxerPred(
- self.discourse_id,
- sent_index,
- word_indices,
- arg,
- "date_month_{0}".format(month),
- "a",
- 0,
- )
- )
- self.assertToken(self.token(), ",")
-
- (sent_index, word_indices), = self._sent_and_word_indices(
- self._parse_index_list()
- )
+ if month != 'XX':
+ conds.append(BoxerPred(self.discourse_id, sent_index, word_indices, arg, 'date_month_{0}'.format(month), 'a', 0))
+ self.assertToken(self.token(), ',')
+
+ (sent_index, word_indices), = self._sent_and_word_indices(self._parse_index_list())
day = self.token()
- if day != "XX":
- conds.append(
- BoxerPred(
- self.discourse_id,
- sent_index,
- word_indices,
- arg,
- "date_day_{0}".format(day),
- "a",
- 0,
- )
- )
+ if day != 'XX':
+ conds.append(BoxerPred(self.discourse_id, sent_index, word_indices, arg, 'date_day_{0}'.format(day), 'a', 0))
return conds
def _handle_time(self, arg):
- # time([1018]:'18', []:'XX', []:'XX')
+ #time([1018]:'18', []:'XX', []:'XX')
conds = []
self._parse_index_list()
hour = self.token()
- if hour != "XX":
- conds.append(self._make_atom("r_hour_2", arg, hour))
- self.assertToken(self.token(), ",")
+ if hour != 'XX':
+ conds.append(self._make_atom('r_hour_2',arg,hour))
+ self.assertToken(self.token(), ',')
self._parse_index_list()
min = self.token()
- if min != "XX":
- conds.append(self._make_atom("r_min_2", arg, min))
- self.assertToken(self.token(), ",")
+ if min != 'XX':
+ conds.append(self._make_atom('r_min_2',arg,min))
+ self.assertToken(self.token(), ',')
self._parse_index_list()
sec = self.token()
- if sec != "XX":
- conds.append(self._make_atom("r_sec_2", arg, sec))
+ if sec != 'XX':
+ conds.append(self._make_atom('r_sec_2',arg,sec))
return conds
def _handle_card(self):
- # card(_G18535, 28, ge)
- self.assertToken(self.token(), "(")
+ #card(_G18535, 28, ge)
+ self.assertToken(self.token(), '(')
variable = self.parse_variable()
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
value = self.token()
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
type = self.token()
- self.assertToken(self.token(), ")")
- return lambda sent_index, word_indices: BoxerCard(
- self.discourse_id, sent_index, word_indices, variable, value, type
- )
+ self.assertToken(self.token(), ')')
+ return lambda sent_index, word_indices: BoxerCard(self.discourse_id, sent_index, word_indices, variable, value, type)
def _handle_prop(self):
- # prop(_G15949, drs(...))
- self.assertToken(self.token(), "(")
+ #prop(_G15949, drs(...))
+ self.assertToken(self.token(), '(')
variable = self.parse_variable()
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
drs = self.process_next_expression(None)
- self.assertToken(self.token(), ")")
- return lambda sent_index, word_indices: BoxerProp(
- self.discourse_id, sent_index, word_indices, variable, drs
- )
+ self.assertToken(self.token(), ')')
+ return lambda sent_index, word_indices: BoxerProp(self.discourse_id, sent_index, word_indices, variable, drs)
def _parse_index_list(self):
- # [1001,1002]:
+ #[1001,1002]:
indices = []
- self.assertToken(self.token(), "[")
- while self.token(0) != "]":
+ self.assertToken(self.token(), '[')
+ while self.token(0) != ']':
indices.append(self.parse_index())
- if self.token(0) == ",":
- self.token() # swallow ','
- self.token() # swallow ']'
- self.assertToken(self.token(), ":")
+ if self.token(0) == ',':
+ self.token() #swallow ','
+ self.token() #swallow ']'
+ self.assertToken(self.token(), ':')
return indices
def parse_drs(self):
- # drs([[1001]:_G3943],
+ #drs([[1001]:_G3943],
# [[1002]:pred(_G3943, dog, n, 0)]
# )
- self.assertToken(self.token(), "(")
- self.assertToken(self.token(), "[")
+ self.assertToken(self.token(), '(')
+ self.assertToken(self.token(), '[')
refs = set()
- while self.token(0) != "]":
+ while self.token(0) != ']':
indices = self._parse_index_list()
refs.add(self.parse_variable())
- if self.token(0) == ",":
- self.token() # swallow ','
- self.token() # swallow ']'
- self.assertToken(self.token(), ",")
- self.assertToken(self.token(), "[")
+ if self.token(0) == ',':
+ self.token() #swallow ','
+ self.token() #swallow ']'
+ self.assertToken(self.token(), ',')
+ self.assertToken(self.token(), '[')
conds = []
- while self.token(0) != "]":
+ while self.token(0) != ']':
indices = self._parse_index_list()
conds.extend(self.parse_condition(indices))
- if self.token(0) == ",":
- self.token() # swallow ','
- self.token() # swallow ']'
- self.assertToken(self.token(), ")")
+ if self.token(0) == ',':
+ self.token() #swallow ','
+ self.token() #swallow ']'
+ self.assertToken(self.token(), ')')
return BoxerDrs(list(refs), conds)
def _handle_binary_expression(self, make_callback):
- self.assertToken(self.token(), "(")
+ self.assertToken(self.token(), '(')
drs1 = self.process_next_expression(None)
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
drs2 = self.process_next_expression(None)
- self.assertToken(self.token(), ")")
- return lambda sent_index, word_indices: make_callback(
- sent_index, word_indices, drs1, drs2
- )
+ self.assertToken(self.token(), ')')
+ return lambda sent_index, word_indices: make_callback(sent_index, word_indices, drs1, drs2)
def _handle_alfa(self, make_callback):
- self.assertToken(self.token(), "(")
+ self.assertToken(self.token(), '(')
type = self.token()
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
drs1 = self.process_next_expression(None)
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
drs2 = self.process_next_expression(None)
- self.assertToken(self.token(), ")")
- return lambda sent_index, word_indices: make_callback(
- sent_index, word_indices, drs1, drs2
- )
+ self.assertToken(self.token(), ')')
+ return lambda sent_index, word_indices: make_callback(sent_index, word_indices, drs1, drs2)
def _handle_eq(self):
- self.assertToken(self.token(), "(")
+ self.assertToken(self.token(), '(')
var1 = self.parse_variable()
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
var2 = self.parse_variable()
- self.assertToken(self.token(), ")")
- return lambda sent_index, word_indices: BoxerEq(
- self.discourse_id, sent_index, word_indices, var1, var2
- )
+ self.assertToken(self.token(), ')')
+ return lambda sent_index, word_indices: BoxerEq(self.discourse_id, sent_index, word_indices, var1, var2)
+
def _handle_whq(self):
- self.assertToken(self.token(), "(")
- self.assertToken(self.token(), "[")
+ self.assertToken(self.token(), '(')
+ self.assertToken(self.token(), '[')
ans_types = []
- while self.token(0) != "]":
+ while self.token(0) != ']':
cat = self.token()
- self.assertToken(self.token(), ":")
- if cat == "des":
+ self.assertToken(self.token(), ':')
+ if cat == 'des':
ans_types.append(self.token())
- elif cat == "num":
- ans_types.append("number")
+ elif cat == 'num':
+ ans_types.append('number')
typ = self.token()
- if typ == "cou":
- ans_types.append("count")
+ if typ == 'cou':
+ ans_types.append('count')
else:
ans_types.append(typ)
else:
ans_types.append(self.token())
- self.token() # swallow the ']'
+ self.token() #swallow the ']'
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
d1 = self.process_next_expression(None)
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
ref = self.parse_variable()
- self.assertToken(self.token(), ",")
+ self.assertToken(self.token(), ',')
d2 = self.process_next_expression(None)
- self.assertToken(self.token(), ")")
- return lambda sent_index, word_indices: BoxerWhq(
- self.discourse_id, sent_index, word_indices, ans_types, d1, ref, d2
- )
+ self.assertToken(self.token(), ')')
+ return lambda sent_index, word_indices: BoxerWhq(self.discourse_id, sent_index, word_indices, ans_types, d1, ref, d2)
def _make_merge_expression(self, sent_index, word_indices, drs1, drs2):
return BoxerDrs(drs1.refs + drs2.refs, drs1.conds + drs2.conds)
def parse_variable(self):
var = self.token()
- assert re.match("^[exps]\d+$", var), var
+ assert re.match('^[exps]\d+$', var), var
return var
def parse_index(self):
"""
:return: list of (sent_index, word_indices) tuples
"""
- sent_indices = set((i / 1000) - 1 for i in indices if i >= 0)
+ sent_indices = set((i / 1000)-1 for i in indices if i>=0)
if sent_indices:
pairs = []
for sent_index in sent_indices:
- word_indices = [
- (i % 1000) - 1 for i in indices if sent_index == (i / 1000) - 1
- ]
+ word_indices = [(i % 1000)-1 for i in indices if sent_index == (i / 1000)-1]
pairs.append((sent_index, word_indices))
return pairs
else:
- word_indices = [(i % 1000) - 1 for i in indices]
+ word_indices = [(i % 1000)-1 for i in indices]
return [(None, word_indices)]
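# Worked example of the 1000-based packing decoded above (Python 2 integer
# division): boxer encodes 1-based sentence s and word w as s*1000 + w, so
#
#     _sent_and_word_indices([1002, 2001]) == [(0, [1]), (1, [0])]
#
# i.e. 1002 -> sentence 0, word 1 and 2001 -> sentence 1, word 0.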
"""
Reparse the str form of subclasses of ``AbstractBoxerDrs``
"""
-
def __init__(self, discourse_id=None):
DrtParser.__init__(self)
self.discourse_id = discourse_id
def get_all_symbols(self):
- return [
- DrtTokens.OPEN,
- DrtTokens.CLOSE,
- DrtTokens.COMMA,
- DrtTokens.OPEN_BRACKET,
- DrtTokens.CLOSE_BRACKET,
- ]
+ return [DrtTokens.OPEN, DrtTokens.CLOSE, DrtTokens.COMMA, DrtTokens.OPEN_BRACKET, DrtTokens.CLOSE_BRACKET]
def attempt_adjuncts(self, expression, context):
return expression
def handle(self, tok, context):
try:
- # if tok == 'drs':
- # self.assertNextToken(DrtTokens.OPEN)
- # label = int(self.token())
- # self.assertNextToken(DrtTokens.COMMA)
- # refs = list(map(int, self.handle_refs()))
- # self.assertNextToken(DrtTokens.COMMA)
- # conds = self.handle_conds(None)
- # self.assertNextToken(DrtTokens.CLOSE)
- # return BoxerDrs(label, refs, conds)
- if tok == "pred":
+# if tok == 'drs':
+# self.assertNextToken(DrtTokens.OPEN)
+# label = int(self.token())
+# self.assertNextToken(DrtTokens.COMMA)
+# refs = list(map(int, self.handle_refs()))
+# self.assertNextToken(DrtTokens.COMMA)
+# conds = self.handle_conds(None)
+# self.assertNextToken(DrtTokens.CLOSE)
+# return BoxerDrs(label, refs, conds)
+ if tok == 'pred':
self.assertNextToken(DrtTokens.OPEN)
- disc_id = (
- self.discourse_id if self.discourse_id is not None else self.token()
- )
+ disc_id = (self.token(), self.discourse_id)[self.discourse_id is not None]
self.assertNextToken(DrtTokens.COMMA)
sent_id = self.nullableIntToken()
self.assertNextToken(DrtTokens.COMMA)
sense = int(self.token())
self.assertNextToken(DrtTokens.CLOSE)
return BoxerPred(disc_id, sent_id, word_ids, variable, name, pos, sense)
- elif tok == "named":
+ elif tok == 'named':
self.assertNextToken(DrtTokens.OPEN)
- disc_id = (
- self.discourse_id if self.discourse_id is not None else self.token()
- )
+ disc_id = (self.token(), self.discourse_id)[self.discourse_id is not None]
self.assertNextToken(DrtTokens.COMMA)
sent_id = int(self.token())
self.assertNextToken(DrtTokens.COMMA)
self.assertNextToken(DrtTokens.COMMA)
sense = int(self.token())
self.assertNextToken(DrtTokens.CLOSE)
- return BoxerNamed(
- disc_id, sent_id, word_ids, variable, name, type, sense
- )
- elif tok == "rel":
+ return BoxerNamed(disc_id, sent_id, word_ids, variable, name, type, sense)
+ elif tok == 'rel':
self.assertNextToken(DrtTokens.OPEN)
- disc_id = (
- self.discourse_id if self.discourse_id is not None else self.token()
- )
+ disc_id = (self.token(), self.discourse_id)[self.discourse_id is not None]
self.assertNextToken(DrtTokens.COMMA)
sent_id = self.nullableIntToken()
self.assertNextToken(DrtTokens.COMMA)
sense = int(self.token())
self.assertNextToken(DrtTokens.CLOSE)
return BoxerRel(disc_id, sent_id, word_ids, var1, var2, rel, sense)
- elif tok == "prop":
+ elif tok == 'prop':
self.assertNextToken(DrtTokens.OPEN)
- disc_id = (
- self.discourse_id if self.discourse_id is not None else self.token()
- )
+ disc_id = (self.token(), self.discourse_id)[self.discourse_id is not None]
self.assertNextToken(DrtTokens.COMMA)
sent_id = int(self.token())
self.assertNextToken(DrtTokens.COMMA)
drs = self.process_next_expression(None)
self.assertNextToken(DrtTokens.CLOSE)
return BoxerProp(disc_id, sent_id, word_ids, variable, drs)
- elif tok == "not":
+ elif tok == 'not':
self.assertNextToken(DrtTokens.OPEN)
drs = self.process_next_expression(None)
self.assertNextToken(DrtTokens.CLOSE)
return BoxerNot(drs)
- elif tok == "imp":
+ elif tok == 'imp':
self.assertNextToken(DrtTokens.OPEN)
drs1 = self.process_next_expression(None)
self.assertNextToken(DrtTokens.COMMA)
drs2 = self.process_next_expression(None)
self.assertNextToken(DrtTokens.CLOSE)
return BoxerDrs(drs1.refs, drs1.conds, drs2)
- elif tok == "or":
+ elif tok == 'or':
self.assertNextToken(DrtTokens.OPEN)
- disc_id = (
- self.discourse_id if self.discourse_id is not None else self.token()
- )
+ disc_id = (self.token(), self.discourse_id)[self.discourse_id is not None]
self.assertNextToken(DrtTokens.COMMA)
sent_id = self.nullableIntToken()
self.assertNextToken(DrtTokens.COMMA)
drs2 = self.process_next_expression(None)
self.assertNextToken(DrtTokens.CLOSE)
return BoxerOr(disc_id, sent_id, word_ids, drs1, drs2)
- elif tok == "eq":
+ elif tok == 'eq':
self.assertNextToken(DrtTokens.OPEN)
- disc_id = (
- self.discourse_id if self.discourse_id is not None else self.token()
- )
+ disc_id = (self.token(), self.discourse_id)[self.discourse_id is not None]
self.assertNextToken(DrtTokens.COMMA)
sent_id = self.nullableIntToken()
self.assertNextToken(DrtTokens.COMMA)
var2 = int(self.token())
self.assertNextToken(DrtTokens.CLOSE)
return BoxerEq(disc_id, sent_id, word_ids, var1, var2)
- elif tok == "card":
+ elif tok == 'card':
self.assertNextToken(DrtTokens.OPEN)
- disc_id = (
- self.discourse_id if self.discourse_id is not None else self.token()
- )
+ disc_id = (self.token(), self.discourse_id)[self.discourse_id is not None]
self.assertNextToken(DrtTokens.COMMA)
sent_id = self.nullableIntToken()
self.assertNextToken(DrtTokens.COMMA)
type = self.token()
self.assertNextToken(DrtTokens.CLOSE)
return BoxerCard(disc_id, sent_id, word_ids, var, value, type)
- elif tok == "whq":
+ elif tok == 'whq':
self.assertNextToken(DrtTokens.OPEN)
- disc_id = (
- self.discourse_id if self.discourse_id is not None else self.token()
- )
+ disc_id = (self.token(), self.discourse_id)[self.discourse_id is not None]
self.assertNextToken(DrtTokens.COMMA)
sent_id = self.nullableIntToken()
self.assertNextToken(DrtTokens.COMMA)
def nullableIntToken(self):
t = self.token()
- return int(t) if t != "None" else None
+ return [None,int(t)][t != 'None']
def get_next_token_variable(self, description):
try:
return self.token()
except ExpectedMoreTokensException as e:
- raise ExpectedMoreTokensException(e.index, "Variable expected.")
+ raise ExpectedMoreTokensException(e.index, 'Variable expected.')
+
class AbstractBoxerDrs(object):
def variable_types(self):
vartypes = {}
- for t, vars in zip(("z", "e", "p"), self.variables()):
+ for t,vars in zip(('z','e','p'), self.variables()):
for v in vars:
vartypes[v] = t
return vartypes
return self
def _clean_name(self, name):
- return name.replace("-", "_").replace("'", "_")
+ return name.replace('-','_').replace("'", "_")
def renumber_sentences(self, f):
return self
return hash("{0}".format(self))
+@python_2_unicode_compatible
class BoxerDrs(AbstractBoxerDrs):
def __init__(self, refs, conds, consequent=None):
AbstractBoxerDrs.__init__(self)
def _variables(self):
variables = (set(), set(), set())
for cond in self.conds:
- for s, v in zip(variables, cond._variables()):
+ for s,v in zip(variables, cond._variables()):
s.update(v)
if self.consequent is not None:
- for s, v in zip(variables, self.consequent._variables()):
+ for s,v in zip(variables, self.consequent._variables()):
s.update(v)
return variables
return atoms
def clean(self):
- consequent = self.consequent.clean() if self.consequent else None
+ consequent = (self.consequent.clean() if self.consequent else None)
return BoxerDrs(self.refs, [c.clean() for c in self.conds], consequent)
def renumber_sentences(self, f):
- consequent = self.consequent.renumber_sentences(f) if self.consequent else None
- return BoxerDrs(
- self.refs, [c.renumber_sentences(f) for c in self.conds], consequent
- )
+ consequent = (self.consequent.renumber_sentences(f) if self.consequent else None)
+ return BoxerDrs(self.refs, [c.renumber_sentences(f) for c in self.conds], consequent)
def __repr__(self):
- s = "drs([%s], [%s])" % (
- ", ".join("%s" % r for r in self.refs),
- ", ".join("%s" % c for c in self.conds),
- )
+ s = 'drs([%s], [%s])' % (', '.join("%s" % r for r in self.refs),
+ ', '.join("%s" % c for c in self.conds))
if self.consequent is not None:
- s = "imp(%s, %s)" % (s, self.consequent)
+ s = 'imp(%s, %s)' % (s, self.consequent)
return s
def __eq__(self, other):
- return (
- self.__class__ == other.__class__
- and self.refs == other.refs
- and len(self.conds) == len(other.conds)
- and reduce(
- operator.and_, (c1 == c2 for c1, c2 in zip(self.conds, other.conds))
- )
- and self.consequent == other.consequent
- )
+ return self.__class__ == other.__class__ and \
+ self.refs == other.refs and \
+ len(self.conds) == len(other.conds) and \
+ reduce(operator.and_, (c1==c2 for c1,c2 in zip(self.conds, other.conds))) and \
+ self.consequent == other.consequent
def __ne__(self, other):
return not self == other
__hash__ = AbstractBoxerDrs.__hash__
+@python_2_unicode_compatible
class BoxerNot(AbstractBoxerDrs):
def __init__(self, drs):
AbstractBoxerDrs.__init__(self)
return BoxerNot(self.drs.renumber_sentences(f))
def __repr__(self):
- return "not(%s)" % (self.drs)
+ return 'not(%s)' % (self.drs)
def __eq__(self, other):
return self.__class__ == other.__class__ and self.drs == other.drs
__hash__ = AbstractBoxerDrs.__hash__
-
+@python_2_unicode_compatible
class BoxerIndexed(AbstractBoxerDrs):
def __init__(self, discourse_id, sent_index, word_indices):
AbstractBoxerDrs.__init__(self)
return set([self])
def __eq__(self, other):
- return (
- self.__class__ == other.__class__
- and self.discourse_id == other.discourse_id
- and self.sent_index == other.sent_index
- and self.word_indices == other.word_indices
- and reduce(operator.and_, (s == o for s, o in zip(self, other)))
- )
+ return self.__class__ == other.__class__ and \
+ self.discourse_id == other.discourse_id and \
+ self.sent_index == other.sent_index and \
+ self.word_indices == other.word_indices and \
+ reduce(operator.and_, (s==o for s,o in zip(self, other)))
def __ne__(self, other):
return not self == other
__hash__ = AbstractBoxerDrs.__hash__
def __repr__(self):
- s = "%s(%s, %s, [%s]" % (
- self._pred(),
- self.discourse_id,
- self.sent_index,
- ", ".join("%s" % wi for wi in self.word_indices),
- )
+ s = '%s(%s, %s, [%s]' % (self._pred(), self.discourse_id,
+ self.sent_index, ', '.join("%s" % wi for wi in self.word_indices))
for v in self:
- s += ", %s" % v
- return s + ")"
-
+ s += ', %s' % v
+ return s + ')'
class BoxerPred(BoxerIndexed):
def __init__(self, discourse_id, sent_index, word_indices, var, name, pos, sense):
return (set([self.var]), set(), set())
def change_var(self, var):
- return BoxerPred(
- self.discourse_id,
- self.sent_index,
- self.word_indices,
- var,
- self.name,
- self.pos,
- self.sense,
- )
+ return BoxerPred(self.discourse_id, self.sent_index, self.word_indices, var, self.name, self.pos, self.sense)
def clean(self):
- return BoxerPred(
- self.discourse_id,
- self.sent_index,
- self.word_indices,
- self.var,
- self._clean_name(self.name),
- self.pos,
- self.sense,
- )
+ return BoxerPred(self.discourse_id, self.sent_index, self.word_indices, self.var, self._clean_name(self.name), self.pos, self.sense)
def renumber_sentences(self, f):
new_sent_index = f(self.sent_index)
- return BoxerPred(
- self.discourse_id,
- new_sent_index,
- self.word_indices,
- self.var,
- self.name,
- self.pos,
- self.sense,
- )
+ return BoxerPred(self.discourse_id, new_sent_index, self.word_indices, self.var, self.name, self.pos, self.sense)
def __iter__(self):
return iter((self.var, self.name, self.pos, self.sense))
def _pred(self):
- return "pred"
-
+ return 'pred'
class BoxerNamed(BoxerIndexed):
def __init__(self, discourse_id, sent_index, word_indices, var, name, type, sense):
return (set([self.var]), set(), set())
def change_var(self, var):
- return BoxerNamed(
- self.discourse_id,
- self.sent_index,
- self.word_indices,
- var,
- self.name,
- self.type,
- self.sense,
- )
+ return BoxerNamed(self.discourse_id, self.sent_index, self.word_indices, var, self.name, self.type, self.sense)
def clean(self):
- return BoxerNamed(
- self.discourse_id,
- self.sent_index,
- self.word_indices,
- self.var,
- self._clean_name(self.name),
- self.type,
- self.sense,
- )
+ return BoxerNamed(self.discourse_id, self.sent_index, self.word_indices, self.var, self._clean_name(self.name), self.type, self.sense)
def renumber_sentences(self, f):
- return BoxerNamed(
- self.discourse_id,
- f(self.sent_index),
- self.word_indices,
- self.var,
- self.name,
- self.type,
- self.sense,
- )
+ return BoxerNamed(self.discourse_id, f(self.sent_index), self.word_indices, self.var, self.name, self.type, self.sense)
def __iter__(self):
return iter((self.var, self.name, self.type, self.sense))
def _pred(self):
- return "named"
-
+ return 'named'
class BoxerRel(BoxerIndexed):
def __init__(self, discourse_id, sent_index, word_indices, var1, var2, rel, sense):
return (set([self.var1, self.var2]), set(), set())
def clean(self):
- return BoxerRel(
- self.discourse_id,
- self.sent_index,
- self.word_indices,
- self.var1,
- self.var2,
- self._clean_name(self.rel),
- self.sense,
- )
+ return BoxerRel(self.discourse_id, self.sent_index, self.word_indices, self.var1, self.var2, self._clean_name(self.rel), self.sense)
def renumber_sentences(self, f):
- return BoxerRel(
- self.discourse_id,
- f(self.sent_index),
- self.word_indices,
- self.var1,
- self.var2,
- self.rel,
- self.sense,
- )
+ return BoxerRel(self.discourse_id, f(self.sent_index), self.word_indices, self.var1, self.var2, self.rel, self.sense)
def __iter__(self):
return iter((self.var1, self.var2, self.rel, self.sense))
def _pred(self):
- return "rel"
-
+ return 'rel'
class BoxerProp(BoxerIndexed):
def __init__(self, discourse_id, sent_index, word_indices, var, drs):
self.drs = drs
def _variables(self):
- return tuple(
- map(operator.or_, (set(), set(), set([self.var])), self.drs._variables())
- )
+ return tuple(map(operator.or_, (set(), set(), set([self.var])), self.drs._variables()))
def referenced_labels(self):
return set([self.drs])
return self.drs.atoms()
def clean(self):
- return BoxerProp(
- self.discourse_id,
- self.sent_index,
- self.word_indices,
- self.var,
- self.drs.clean(),
- )
+ return BoxerProp(self.discourse_id, self.sent_index, self.word_indices, self.var, self.drs.clean())
def renumber_sentences(self, f):
- return BoxerProp(
- self.discourse_id,
- f(self.sent_index),
- self.word_indices,
- self.var,
- self.drs.renumber_sentences(f),
- )
+ return BoxerProp(self.discourse_id, f(self.sent_index), self.word_indices, self.var, self.drs.renumber_sentences(f))
def __iter__(self):
return iter((self.var, self.drs))
def _pred(self):
- return "prop"
-
+ return 'prop'
class BoxerEq(BoxerIndexed):
def __init__(self, discourse_id, sent_index, word_indices, var1, var2):
return set()
def renumber_sentences(self, f):
- return BoxerEq(
- self.discourse_id,
- f(self.sent_index),
- self.word_indices,
- self.var1,
- self.var2,
- )
+ return BoxerEq(self.discourse_id, f(self.sent_index), self.word_indices, self.var1, self.var2)
def __iter__(self):
return iter((self.var1, self.var2))
def _pred(self):
- return "eq"
-
+ return 'eq'
class BoxerCard(BoxerIndexed):
def __init__(self, discourse_id, sent_index, word_indices, var, value, type):
return (set([self.var]), set(), set())
def renumber_sentences(self, f):
- return BoxerCard(
- self.discourse_id,
- f(self.sent_index),
- self.word_indices,
- self.var,
- self.value,
- self.type,
- )
+ return BoxerCard(self.discourse_id, f(self.sent_index), self.word_indices, self.var, self.value, self.type)
def __iter__(self):
return iter((self.var, self.value, self.type))
def _pred(self):
- return "card"
-
+ return 'card'
class BoxerOr(BoxerIndexed):
def __init__(self, discourse_id, sent_index, word_indices, drs1, drs2):
return self.drs1.atoms() | self.drs2.atoms()
def clean(self):
- return BoxerOr(
- self.discourse_id,
- self.sent_index,
- self.word_indices,
- self.drs1.clean(),
- self.drs2.clean(),
- )
+ return BoxerOr(self.discourse_id, self.sent_index, self.word_indices, self.drs1.clean(), self.drs2.clean())
def renumber_sentences(self, f):
- return BoxerOr(
- self.discourse_id,
- f(self.sent_index),
- self.word_indices,
- self.drs1,
- self.drs2,
- )
+ return BoxerOr(self.discourse_id, f(self.sent_index), self.word_indices, self.drs1, self.drs2)
def __iter__(self):
return iter((self.drs1, self.drs2))
def _pred(self):
- return "or"
-
+ return 'or'
class BoxerWhq(BoxerIndexed):
- def __init__(
- self, discourse_id, sent_index, word_indices, ans_types, drs1, variable, drs2
- ):
+ def __init__(self, discourse_id, sent_index, word_indices, ans_types, drs1, variable, drs2):
BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
self.ans_types = ans_types
self.drs1 = drs1
self.drs2 = drs2
def _variables(self):
- return tuple(
- map(
- operator.or_,
- (set([self.variable]), set(), set()),
- self.drs1._variables(),
- self.drs2._variables(),
- )
- )
+ return tuple(map(operator.or_, (set([self.variable]), set(), set()), self.drs1._variables(), self.drs2._variables()))
def atoms(self):
return self.drs1.atoms() | self.drs2.atoms()
def clean(self):
- return BoxerWhq(
- self.discourse_id,
- self.sent_index,
- self.word_indices,
- self.ans_types,
- self.drs1.clean(),
- self.variable,
- self.drs2.clean(),
- )
+ return BoxerWhq(self.discourse_id, self.sent_index, self.word_indices, self.ans_types, self.drs1.clean(), self.variable, self.drs2.clean())
def renumber_sentences(self, f):
- return BoxerWhq(
- self.discourse_id,
- f(self.sent_index),
- self.word_indices,
- self.ans_types,
- self.drs1,
- self.variable,
- self.drs2,
- )
+ return BoxerWhq(self.discourse_id, f(self.sent_index), self.word_indices, self.ans_types, self.drs1, self.variable, self.drs2)
def __iter__(self):
- return iter(
- ("[" + ",".join(self.ans_types) + "]", self.drs1, self.variable, self.drs2)
- )
+ return iter(('['+','.join(self.ans_types)+']', self.drs1, self.variable, self.drs2))
def _pred(self):
- return "whq"
+ return 'whq'
+
class PassthroughBoxerDrsInterpreter(object):
:return: ``DrtExpression``
"""
if isinstance(ex, BoxerDrs):
- drs = DRS(
- [Variable(r) for r in ex.refs], list(map(self.interpret, ex.conds))
- )
+ drs = DRS([Variable(r) for r in ex.refs], list(map(self.interpret, ex.conds)))
if ex.consequent is not None:
drs.consequent = self.interpret(ex.consequent)
return drs
elif isinstance(ex, BoxerNot):
return DrtNegatedExpression(self.interpret(ex.drs))
elif isinstance(ex, BoxerPred):
- pred = self._add_occur_indexing("%s_%s" % (ex.pos, ex.name), ex)
+ pred = self._add_occur_indexing('%s_%s' % (ex.pos, ex.name), ex)
return self._make_atom(pred, ex.var)
elif isinstance(ex, BoxerNamed):
- pred = self._add_occur_indexing("ne_%s_%s" % (ex.type, ex.name), ex)
+ pred = self._add_occur_indexing('ne_%s_%s' % (ex.type, ex.name), ex)
return self._make_atom(pred, ex.var)
elif isinstance(ex, BoxerRel):
- pred = self._add_occur_indexing("%s" % (ex.rel), ex)
+ pred = self._add_occur_indexing('%s' % (ex.rel), ex)
return self._make_atom(pred, ex.var1, ex.var2)
elif isinstance(ex, BoxerProp):
return DrtProposition(Variable(ex.var), self.interpret(ex.drs))
elif isinstance(ex, BoxerEq):
- return DrtEqualityExpression(
- DrtVariableExpression(Variable(ex.var1)),
- DrtVariableExpression(Variable(ex.var2)),
- )
+ return DrtEqualityExpression(DrtVariableExpression(Variable(ex.var1)),
+ DrtVariableExpression(Variable(ex.var2)))
elif isinstance(ex, BoxerCard):
- pred = self._add_occur_indexing("card_%s_%s" % (ex.type, ex.value), ex)
+ pred = self._add_occur_indexing('card_%s_%s' % (ex.type, ex.value), ex)
return self._make_atom(pred, ex.var)
elif isinstance(ex, BoxerOr):
return DrtOrExpression(self.interpret(ex.drs1), self.interpret(ex.drs2))
drs1 = self.interpret(ex.drs1)
drs2 = self.interpret(ex.drs2)
return DRS(drs1.refs + drs2.refs, drs1.conds + drs2.conds)
- assert False, "%s: %s" % (ex.__class__.__name__, ex)
+ assert False, '%s: %s' % (ex.__class__.__name__, ex)
def _make_atom(self, pred, *args):
accum = DrtVariableExpression(Variable(pred))
for arg in args:
- accum = DrtApplicationExpression(
- accum, DrtVariableExpression(Variable(arg))
- )
+ accum = DrtApplicationExpression(accum, DrtVariableExpression(Variable(arg)))
return accum
def _add_occur_indexing(self, base, ex):
if self._occur_index and ex.sent_index is not None:
if ex.discourse_id:
- base += "_%s" % ex.discourse_id
- base += "_s%s" % ex.sent_index
- base += "_w%s" % sorted(ex.word_indices)[0]
+ base += '_%s' % ex.discourse_id
+ base += '_s%s' % ex.sent_index
+ base += '_w%s' % sorted(ex.word_indices)[0]
return base
pass
-if __name__ == "__main__":
+if __name__ == '__main__':
opts = OptionParser("usage: %prog TEXT [options]")
- opts.add_option(
- "--verbose",
- "-v",
- help="display verbose logs",
- action="store_true",
- default=False,
- dest="verbose",
- )
- opts.add_option(
- "--fol", "-f", help="output FOL", action="store_true", default=False, dest="fol"
- )
- opts.add_option(
- "--question",
- "-q",
- help="input is a question",
- action="store_true",
- default=False,
- dest="question",
- )
- opts.add_option(
- "--occur",
- "-o",
- help="occurrence index",
- action="store_true",
- default=False,
- dest="occur_index",
- )
+ opts.add_option("--verbose", "-v", help="display verbose logs", action="store_true", default=False, dest="verbose")
+ opts.add_option("--fol", "-f", help="output FOL", action="store_true", default=False, dest="fol")
+ opts.add_option("--question", "-q", help="input is a question", action="store_true", default=False, dest="question")
+ opts.add_option("--occur", "-o", help="occurrence index", action="store_true", default=False, dest="occur_index")
(options, args) = opts.parse_args()
if len(args) != 1:
opts.error("incorrect number of arguments")
interpreter = NltkDrtBoxerDrsInterpreter(occur_index=options.occur_index)
- drs = Boxer(interpreter).interpret_multi(
- args[0].split(r"\n"), question=options.question, verbose=options.verbose
- )
+ drs = Boxer(interpreter).interpret_multi(args[0].split(r'\n'), question=options.question, verbose=options.verbose)
if drs is None:
print(None)
else:
# Natural Language Toolkit: Chat-80 KB Reader
# See http://www.w3.org/TR/swbp-skos-core-guide/
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>,
# URL: <http://nltk.sourceforge.net>
# For license information, see LICENSE.TXT
current directory.
"""
+from __future__ import print_function, unicode_literals
import re
import shelve
import os
import sys
+from six import string_types
+
import nltk.data
+from nltk.compat import python_2_unicode_compatible
###########################################################################
# Chat-80 relation metadata bundles needed to build the valuation
###########################################################################
-borders = {
- "rel_name": "borders",
- "closures": ["symmetric"],
- "schema": ["region", "border"],
- "filename": "borders.pl",
-}
-
-contains = {
- "rel_name": "contains0",
- "closures": ["transitive"],
- "schema": ["region", "contain"],
- "filename": "contain.pl",
-}
-
-city = {
- "rel_name": "city",
- "closures": [],
- "schema": ["city", "country", "population"],
- "filename": "cities.pl",
-}
-
-country = {
- "rel_name": "country",
- "closures": [],
- "schema": [
- "country",
- "region",
- "latitude",
- "longitude",
- "area",
- "population",
- "capital",
- "currency",
- ],
- "filename": "countries.pl",
-}
-
-circle_of_lat = {
- "rel_name": "circle_of_latitude",
- "closures": [],
- "schema": ["circle_of_latitude", "degrees"],
- "filename": "world1.pl",
-}
-
-circle_of_long = {
- "rel_name": "circle_of_longitude",
- "closures": [],
- "schema": ["circle_of_longitude", "degrees"],
- "filename": "world1.pl",
-}
-
-continent = {
- "rel_name": "continent",
- "closures": [],
- "schema": ["continent"],
- "filename": "world1.pl",
-}
-
-region = {
- "rel_name": "in_continent",
- "closures": [],
- "schema": ["region", "continent"],
- "filename": "world1.pl",
-}
-
-ocean = {
- "rel_name": "ocean",
- "closures": [],
- "schema": ["ocean"],
- "filename": "world1.pl",
-}
-
-sea = {"rel_name": "sea", "closures": [], "schema": ["sea"], "filename": "world1.pl"}
-
-
-items = [
- "borders",
- "contains",
- "city",
- "country",
- "circle_of_lat",
- "circle_of_long",
- "continent",
- "region",
- "ocean",
- "sea",
-]
+borders = {'rel_name': 'borders',
+ 'closures': ['symmetric'],
+ 'schema': ['region', 'border'],
+ 'filename': 'borders.pl'}
+
+contains = {'rel_name': 'contains0',
+ 'closures': ['transitive'],
+ 'schema': ['region', 'contain'],
+ 'filename': 'contain.pl'}
+
+city = {'rel_name': 'city',
+ 'closures': [],
+ 'schema': ['city', 'country', 'population'],
+ 'filename': 'cities.pl'}
+
+country = {'rel_name': 'country',
+ 'closures': [],
+ 'schema': ['country', 'region', 'latitude', 'longitude',
+ 'area', 'population', 'capital', 'currency'],
+ 'filename': 'countries.pl'}
+
+circle_of_lat = {'rel_name': 'circle_of_latitude',
+ 'closures': [],
+ 'schema': ['circle_of_latitude', 'degrees'],
+ 'filename': 'world1.pl'}
+
+circle_of_long = {'rel_name': 'circle_of_longitude',
+ 'closures': [],
+ 'schema': ['circle_of_longitude', 'degrees'],
+ 'filename': 'world1.pl'}
+
+continent = {'rel_name': 'continent',
+ 'closures': [],
+ 'schema': ['continent'],
+ 'filename': 'world1.pl'}
+
+region = {'rel_name': 'in_continent',
+ 'closures': [],
+ 'schema': ['region', 'continent'],
+ 'filename': 'world1.pl'}
+
+ocean = {'rel_name': 'ocean',
+ 'closures': [],
+ 'schema': ['ocean'],
+ 'filename': 'world1.pl'}
+
+sea = {'rel_name': 'sea',
+ 'closures': [],
+ 'schema': ['sea'],
+ 'filename': 'world1.pl'}
+
+
+
+items = ['borders', 'contains', 'city', 'country', 'circle_of_lat',
+ 'circle_of_long', 'continent', 'region', 'ocean', 'sea']
items = tuple(sorted(items))
item_metadata = {
- "borders": borders,
- "contains": contains,
- "city": city,
- "country": country,
- "circle_of_lat": circle_of_lat,
- "circle_of_long": circle_of_long,
- "continent": continent,
- "region": region,
- "ocean": ocean,
- "sea": sea,
-}
+ 'borders': borders,
+ 'contains': contains,
+ 'city': city,
+ 'country': country,
+ 'circle_of_lat': circle_of_lat,
+ 'circle_of_long': circle_of_long,
+ 'continent': continent,
+ 'region': region,
+ 'ocean': ocean,
+ 'sea': sea
+ }
rels = item_metadata.values()
-not_unary = ["borders.pl", "contain.pl"]
+not_unary = ['borders.pl', 'contain.pl']
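# Each bundle above parameterizes one clause2concepts call (illustrative):
#
#     clause2concepts('borders.pl', 'borders', ['region', 'border'], ['symmetric'])
#
# i.e. filename, rel_name, schema, closures, exactly as process_bundle below
# unpacks them.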
###########################################################################
-
+@python_2_unicode_compatible
class Concept(object):
"""
A Concept class, loosely based on SKOS
(http://www.w3.org/TR/swbp-skos-core-guide/).
"""
-
def __init__(self, prefLabel, arity, altLabels=[], closures=[], extension=set()):
"""
:param prefLabel: the preferred label for the concept
self.arity = arity
self.altLabels = altLabels
self.closures = closures
- # keep _extension internally as a set
+ #keep _extension internally as a set
self._extension = extension
- # public access is via a list (for slicing)
+ #public access is via a list (for slicing)
self.extension = sorted(list(extension))
def __str__(self):
- # _extension = ''
- # for element in sorted(self.extension):
- # if isinstance(element, tuple):
- # element = '(%s, %s)' % (element)
- # _extension += element + ', '
- # _extension = _extension[:-1]
-
- return "Label = '%s'\nArity = %s\nExtension = %s" % (
- self.prefLabel,
- self.arity,
- self.extension,
- )
+ #_extension = ''
+ #for element in sorted(self.extension):
+ #if isinstance(element, tuple):
+ #element = '(%s, %s)' % (element)
+ #_extension += element + ', '
+ #_extension = _extension[:-1]
+
+ return "Label = '%s'\nArity = %s\nExtension = %s" % \
+ (self.prefLabel, self.arity, self.extension)
def __repr__(self):
return "Concept('%s')" % self.prefLabel
self.extension = sorted(list(self._extension))
return self._extension
+
def _make_graph(self, s):
"""
Convert a set of pairs into an adjacency linked list encoding of a graph.
pairs.append((node, adjacent))
return set(pairs)
+
def close(self):
"""
Close a binary relation in the ``Concept``'s extension set.
relation is closed under a given property
"""
from nltk.sem import is_rel
-
assert is_rel(self._extension)
- if "symmetric" in self.closures:
+ if 'symmetric' in self.closures:
pairs = []
for (x, y) in self._extension:
pairs.append((y, x))
sym = set(pairs)
self._extension = self._extension.union(sym)
- if "transitive" in self.closures:
- all = self._make_graph(self._extension)
- closed = self._transclose(all)
+ if 'transitive' in self.closures:
+ all = self._make_graph(self._extension)
+ closed = self._transclose(all)
trans = self._make_pairs(closed)
+ #print sorted(trans)
self._extension = self._extension.union(trans)
self.extension = sorted(list(self._extension))
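# A worked sketch of close() (not part of the patch): with
# closures=['symmetric'], an extension holding ('france', 'spain') also gains
# ('spain', 'france'); with 'transitive', ('a', 'b') and ('b', 'c') gain
# ('a', 'c').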
return concepts
-
def cities2table(filename, rel_name, dbname, verbose=False, setup=False):
"""
Convert a file of Prolog clauses into a database table.
:type schema: str
"""
import sqlite3
-
records = _str2records(filename, rel_name)
- connection = sqlite3.connect(dbname)
+ connection = sqlite3.connect(dbname)
cur = connection.cursor()
if setup:
- cur.execute(
- """CREATE TABLE city_table
- (City text, Country text, Population int)"""
- )
+ cur.execute('''CREATE TABLE city_table
+ (City text, Country text, Population int)''')
table_name = "city_table"
for t in records:
- cur.execute("insert into %s values (?,?,?)" % table_name, t)
+ cur.execute('insert into %s values (?,?,?)' % table_name, t)
if verbose:
print("inserting values into %s: " % table_name, t)
connection.commit()
print("Committing update to %s" % dbname)
cur.close()
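# Illustrative end-to-end use (not part of the patch; paths hypothetical):
#
#     cities2table('cities.pl', 'city', 'city.db', setup=True)
#     for row in sql_query('corpora/city_database/city.db',
#                          "SELECT City FROM city_table WHERE Population > 1000"):
#         print(row)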
-
def sql_query(dbname, query):
"""
Execute an SQL query over a database.
:type rel_name: str
"""
import sqlite3
-
try:
path = nltk.data.find(dbname)
- connection = sqlite3.connect(str(path))
+ connection = sqlite3.connect(str(path))
cur = connection.cursor()
return cur.execute(query)
except (ValueError, sqlite3.OperationalError):
import warnings
-
- warnings.warn(
- "Make sure the database file %s is installed and uncompressed." % dbname
- )
+ warnings.warn("Make sure the database file %s is installed and uncompressed." % dbname)
raise
-
def _str2records(filename, rel):
"""
Read a file into memory and convert each relation clause into a list.
contents = nltk.data.load("corpora/chat80/%s" % filename, format="text")
for line in contents.splitlines():
if line.startswith(rel):
- line = re.sub(rel + r"\(", "", line)
- line = re.sub(r"\)\.$", "", line)
- record = line.split(",")
+ line = re.sub(rel+r'\(', '', line)
+ line = re.sub(r'\)\.$', '', line)
+ record = line.split(',')
recs.append(record)
return recs
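# Worked example of the clause parsing above: with rel = 'borders' and the
# (illustrative) line
#
#     borders(france,spain).
#
# the two re.sub calls strip 'borders(' and the trailing ').', leaving
# 'france,spain', so record == ['france', 'spain'].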
-
def unary_concept(label, subj, records):
"""
Make a unary concept out of the primary key in a record.
c.augment(record[subj])
return c
-
def binary_concept(label, closures, subj, obj, records):
"""
Make a binary concept out of the primary key and another field in a record.
:return: ``Concept`` of arity 2
:rtype: Concept
"""
- if not label == "border" and not label == "contain":
- label = label + "_of"
+ if not label == 'border' and not label == 'contain':
+ label = label + '_of'
c = Concept(label, arity=2, closures=closures, extension=set())
for record in records:
c.augment((record[subj], record[obj]))
"""
concepts = {}
for rel in rels:
- rel_name = rel["rel_name"]
- closures = rel["closures"]
- schema = rel["schema"]
- filename = rel["filename"]
+ rel_name = rel['rel_name']
+ closures = rel['closures']
+ schema = rel['schema']
+ filename = rel['filename']
concept_list = clause2concepts(filename, rel_name, schema, closures)
for c in concept_list:
label = c.prefLabel
- if label in concepts:
+ if (label in concepts):
for data in c.extension:
concepts[label].augment(data)
concepts[label].close()
for c in concepts:
vals.append((c.prefLabel, c.extension))
- if lexicon:
- read = True
+ if lexicon: read = True
if read:
from nltk.sem import Valuation
-
val = Valuation({})
val.update(vals)
# add labels for individuals
"""
concepts = process_bundle(rels).values()
valuation = make_valuation(concepts, read=True)
- db_out = shelve.open(db, "n")
+ db_out = shelve.open(db, 'n')
db_out.update(valuation)
The suffix '.db' should be omitted from the name.
:type db: str
"""
- dbname = db + ".db"
+ dbname = db+".db"
if not os.access(dbname, os.R_OK):
sys.exit("Cannot read file: %s" % dbname)
else:
db_in = shelve.open(db)
from nltk.sem import Valuation
-
val = Valuation(db_in)
- # val.read(db_in.items())
+# val.read(db_in.items())
return val
-# def alpha(str):
-# """
-# Utility to filter out non-alphabetic constants.
+#def alpha(str):
+ #"""
+ #Utility to filter out non-alphabetic constants.
-#:param str: candidate constant
-#:type str: string
-#:rtype: bool
-# """
-# try:
-# int(str)
-# return False
-# except ValueError:
-## some unknown values in records are labeled '?'
-# if not str == '?':
-# return True
+ #:param str: candidate constant
+ #:type str: string
+ #:rtype: bool
+ #"""
+ #try:
+ #int(str)
+ #return False
+ #except ValueError:
+ ## some unknown values in records are labeled '?'
+ #if not str == '?':
+ #return True
def label_indivs(valuation, lexicon=False):
pairs = [(e, e) for e in domain]
if lexicon:
lex = make_lex(domain)
- with open("chat_pnames.cfg", "w") as outfile:
+ with open("chat_pnames.cfg", 'w') as outfile:
outfile.writelines(lex)
# read the pairs into the valuation
valuation.update(pairs)
return valuation
-
def make_lex(symbols):
"""
Create lexical CFG rules for each individual symbol.
template = "PropN[num=sg, sem=<\P.(P %s)>] -> '%s'\n"
for s in symbols:
- parts = s.split("_")
+ parts = s.split('_')
caps = [p.capitalize() for p in parts]
- pname = "_".join(caps)
+ pname = '_'.join(caps)
rule = template % (s, pname)
lex.append(rule)
return lex
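# Worked example of the template above: the symbol 'south_africa' yields the
# CFG rule
#
#     PropN[num=sg, sem=<\P.(P south_africa)>] -> 'South_Africa'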
# Interface function to emulate other corpus readers
###########################################################################
-
-def concepts(items=items):
+def concepts(items = items):
"""
Build a list of concepts corresponding to the relation names in ``items``.
:return: the ``Concept`` objects which are extracted from the relations
:rtype: list(Concept)
"""
- if isinstance(items, str):
- items = (items,)
+ if isinstance(items, string_types): items = (items,)
rels = [item_metadata[r] for r in items]
return concept_map.values()
+
+
###########################################################################
def main():
import sys
from optparse import OptionParser
-
- description = """
+ description = \
+ """
Extract data from the Chat-80 Prolog files and convert them into a
Valuation object for use in the NLTK semantics package.
"""
opts = OptionParser(description=description)
opts.set_defaults(verbose=True, lex=False, vocab=False)
- opts.add_option(
- "-s", "--store", dest="outdb", help="store a valuation in DB", metavar="DB"
- )
- opts.add_option(
- "-l",
- "--load",
- dest="indb",
- help="load a stored valuation from DB",
- metavar="DB",
- )
- opts.add_option(
- "-c",
- "--concepts",
- action="store_true",
- help="print concepts instead of a valuation",
- )
- opts.add_option(
- "-r",
- "--relation",
- dest="label",
- help="print concept with label REL (check possible labels with '-v' option)",
- metavar="REL",
- )
- opts.add_option(
- "-q",
- "--quiet",
- action="store_false",
- dest="verbose",
- help="don't print out progress info",
- )
- opts.add_option(
- "-x",
- "--lex",
- action="store_true",
- dest="lex",
- help="write a file of lexical entries for country names, then exit",
- )
- opts.add_option(
- "-v",
- "--vocab",
- action="store_true",
- dest="vocab",
- help="print out the vocabulary of concept labels and their arity, then exit",
- )
+ opts.add_option("-s", "--store", dest="outdb",
+ help="store a valuation in DB", metavar="DB")
+ opts.add_option("-l", "--load", dest="indb",
+ help="load a stored valuation from DB", metavar="DB")
+ opts.add_option("-c", "--concepts", action="store_true",
+ help="print concepts instead of a valuation")
+ opts.add_option("-r", "--relation", dest="label",
+ help="print concept with label REL (check possible labels with '-v' option)", metavar="REL")
+ opts.add_option("-q", "--quiet", action="store_false", dest="verbose",
+ help="don't print out progress info")
+ opts.add_option("-x", "--lex", action="store_true", dest="lex",
+ help="write a file of lexical entries for country names, then exit")
+ opts.add_option("-v", "--vocab", action="store_true", dest="vocab",
+ help="print out the vocabulary of concept labels and their arity, then exit")
(options, args) = opts.parse_args()
if options.outdb and options.indb:
opts.error("Options --store and --load are mutually exclusive")
+
if options.outdb:
# write the valuation to a persistent database
if options.verbose:
- outdb = options.outdb + ".db"
+ outdb = options.outdb+".db"
print("Dumping a valuation to %s" % outdb)
val_dump(rels, options.outdb)
sys.exit(0)
else:
# try to read in a valuation from a database
if options.indb is not None:
- dbname = options.indb + ".db"
+ dbname = options.indb+".db"
if not os.access(dbname, os.R_OK):
sys.exit("Cannot read file: %s" % dbname)
else:
"""
print()
print("Using SQL to extract rows from 'city.db' RDB.")
- for row in sql_query("corpora/city_database/city.db", "SELECT * FROM city_table"):
+ for row in sql_query('corpora/city_database/city.db', "SELECT * FROM city_table"):
print(row)
-if __name__ == "__main__":
+if __name__ == '__main__':
main()
sql_demo()
# Natural Language Toolkit: Cooper storage for Quantifier Ambiguity
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function
from nltk.sem.logic import LambdaExpression, ApplicationExpression, Variable
from nltk.parse import load_parser
from nltk.parse.featurechart import InstantiateVarsChart
-
class CooperStore(object):
"""
A container for handling quantifier ambiguity via Cooper storage.
"""
-
def __init__(self, featstruct):
"""
:param featstruct: The value of the ``sem`` node in a tree from
self.featstruct = featstruct
self.readings = []
try:
- self.core = featstruct["CORE"]
- self.store = featstruct["STORE"]
+ self.core = featstruct['CORE']
+ self.store = featstruct['STORE']
except KeyError:
print("%s is not a Cooper storage structure" % featstruct)
:type lst: list
:rtype: iter
"""
- remove = lambda lst0, index: lst0[:index] + lst0[index + 1 :]
+ remove = lambda lst0, index: lst0[:index] + lst0[index+1:]
if lst:
for index, x in enumerate(lst):
for y in self._permute(remove(lst, index)):
- yield (x,) + y
- else:
- yield ()
+ yield (x,)+y
+ else: yield ()
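# Editor's sketch (illustrative; not part of the upstream patch): _permute()
# lazily yields every ordering of its input as tuples, recursing on the
# remainder after removing one element at a time:
#
#     >>> store = CooperStore(fs)    # fs: any Cooper storage featstruct
#     >>> list(store._permute([1, 2, 3]))
#     [(1, 2, 3), (1, 3, 2), (2, 1, 3), (2, 3, 1), (3, 1, 2), (3, 2, 1)]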
def s_retrieve(self, trace=False):
"""
"""
for perm, store_perm in enumerate(self._permute(self.store)):
if trace:
- print("Permutation %s" % (perm + 1))
+ print("Permutation %s" % (perm+1))
term = self.core
for bindop in store_perm:
# we just want the arguments that are wrapped by the 'bo' predicate
quant, varex = tuple(bindop.args)
# use var to make an abstraction over the current term and then
# apply the quantifier to it
- term = ApplicationExpression(
- quant, LambdaExpression(varex.variable, term)
- )
+ term = ApplicationExpression(quant, LambdaExpression(varex.variable, term))
if trace:
print(" ", term)
term = term.simplify()
Use a grammar with Binding Operators to parse a sentence.
"""
if not grammar:
- grammar = "grammars/book_grammars/storage.fcfg"
+ grammar = 'grammars/book_grammars/storage.fcfg'
parser = load_parser(grammar, trace=trace, chart_class=InstantiateVarsChart)
# Parse the sentence.
tokens = sentence.split()
def demo():
from nltk.sem import cooper_storage as cs
-
sentence = "every girl chases a dog"
- # sentence = "a man gives a bone to every dog"
+ #sentence = "a man gives a bone to every dog"
print()
print("Analyis of sentence '%s'" % sentence)
print("=" * 50)
trees = cs.parse_with_bindops(sentence, trace=0)
for tree in trees:
- semrep = cs.CooperStore(tree.label()["SEM"])
+ semrep = cs.CooperStore(tree.label()['SEM'])
print()
print("Binding operators:")
print("-" * 15)
print("-" * 15)
for i, reading in enumerate(semrep.readings):
- print("%s: %s" % (i + 1, reading))
-
+ print("%s: %s" % (i+1, reading))
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
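# Editor's sketch (illustrative; not part of the upstream patch): run
# standalone, the storage mechanism enumerates both scopings of
# "every girl chases a dog", assuming the book_grammars data is installed:
#
#     from nltk.sem import cooper_storage as cs
#     trees = cs.parse_with_bindops("every girl chases a dog", trace=0)
#     for tree in trees:
#         semrep = cs.CooperStore(tree.label()['SEM'])
#         semrep.s_retrieve()
#         for reading in semrep.readings:
#             print(reading)
#
# Two readings are expected (modulo variable names):
#     all x.(girl(x) -> exists z.(dog(z) & chase(x,z)))
#     exists z.(dog(z) & all x.(girl(x) -> chase(x,z)))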
#
# Author: Dan Garrette <dhgarrette@gmail.com>
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals
import operator
from functools import reduce
from itertools import chain
-from nltk.sem.logic import (
- APP,
- AbstractVariableExpression,
- AllExpression,
- AndExpression,
- ApplicationExpression,
- BinaryExpression,
- BooleanExpression,
- ConstantExpression,
- EqualityExpression,
- EventVariableExpression,
- ExistsExpression,
- Expression,
- FunctionVariableExpression,
- ImpExpression,
- IndividualVariableExpression,
- LambdaExpression,
- Tokens,
- LogicParser,
- NegatedExpression,
- OrExpression,
- Variable,
- is_eventvar,
- is_funcvar,
- is_indvar,
- unique_variable,
-)
+from six import string_types
+
+from nltk.compat import python_2_unicode_compatible
+from nltk.sem.logic import (APP, AbstractVariableExpression, AllExpression,
+ AndExpression, ApplicationExpression, BinaryExpression,
+ BooleanExpression, ConstantExpression, EqualityExpression,
+ EventVariableExpression, ExistsExpression, Expression,
+ FunctionVariableExpression, ImpExpression,
+ IndividualVariableExpression, LambdaExpression, Tokens,
+ LogicParser, NegatedExpression, OrExpression, Variable,
+ is_eventvar, is_funcvar, is_indvar, unique_variable)
# Import Tkinter-based modules if they are available
try:
- from tkinter import Canvas, Tk
- from tkinter.font import Font
+ from six.moves.tkinter import Canvas, Tk
+ from six.moves.tkinter_font import Font
from nltk.util import in_idle
except ImportError:
# No need to print a warning here, nltk.draw has already printed one.
pass
-
class DrtTokens(Tokens):
- DRS = "DRS"
- DRS_CONC = "+"
- PRONOUN = "PRO"
- OPEN_BRACKET = "["
- CLOSE_BRACKET = "]"
- COLON = ":"
+ DRS = 'DRS'
+ DRS_CONC = '+'
+ PRONOUN = 'PRO'
+ OPEN_BRACKET = '['
+ CLOSE_BRACKET = ']'
+ COLON = ':'
PUNCT = [DRS_CONC, OPEN_BRACKET, CLOSE_BRACKET, COLON]
class DrtParser(LogicParser):
"""A lambda calculus expression parser."""
-
def __init__(self):
LogicParser.__init__(self)
self.operator_precedence = dict(
- [(x, 1) for x in DrtTokens.LAMBDA_LIST]
- + [(x, 2) for x in DrtTokens.NOT_LIST]
- + [(APP, 3)]
- + [(x, 4) for x in DrtTokens.EQ_LIST + Tokens.NEQ_LIST]
- + [(DrtTokens.COLON, 5)]
- + [(DrtTokens.DRS_CONC, 6)]
- + [(x, 7) for x in DrtTokens.OR_LIST]
- + [(x, 8) for x in DrtTokens.IMP_LIST]
- + [(None, 9)]
- )
+ [(x,1) for x in DrtTokens.LAMBDA_LIST] + \
+ [(x,2) for x in DrtTokens.NOT_LIST] + \
+ [(APP,3)] + \
+ [(x,4) for x in DrtTokens.EQ_LIST+Tokens.NEQ_LIST] + \
+ [(DrtTokens.COLON,5)] + \
+ [(DrtTokens.DRS_CONC,6)] + \
+ [(x,7) for x in DrtTokens.OR_LIST] + \
+ [(x,8) for x in DrtTokens.IMP_LIST] + \
+ [(None,9)])
def get_all_symbols(self):
"""This method exists to be overridden"""
def handle_DRS(self, tok, context):
# a DRS
refs = self.handle_refs()
- if (
- self.inRange(0) and self.token(0) == DrtTokens.COMMA
- ): # if there is a comma (it's optional)
- self.token() # swallow the comma
+ if self.inRange(0) and self.token(0) == DrtTokens.COMMA: #if there is a comma (it's optional)
+ self.token() # swallow the comma
conds = self.handle_conds(context)
self.assertNextToken(DrtTokens.CLOSE)
return DRS(refs, conds, None)
self.assertNextToken(DrtTokens.OPEN_BRACKET)
refs = []
while self.inRange(0) and self.token(0) != DrtTokens.CLOSE_BRACKET:
- # Support expressions like: DRS([x y],C) == DRS([x,y],C)
+ # Support expressions like: DRS([x y],C) == DRS([x,y],C)
if refs and self.token(0) == DrtTokens.COMMA:
- self.token() # swallow the comma
- refs.append(self.get_next_token_variable("quantified"))
+ self.token() # swallow the comma
+ refs.append(self.get_next_token_variable('quantified'))
self.assertNextToken(DrtTokens.CLOSE_BRACKET)
return refs
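# Editor's sketch (illustrative; not part of the upstream patch): per the
# comment above, commas between referents are optional, so these parse alike:
#
#     >>> from nltk.sem.drt import DrtExpression
#     >>> dexpr = DrtExpression.fromstring
#     >>> dexpr(r'([x y],[dog(x), cat(y)])') == dexpr(r'([x,y],[dog(x), cat(y)])')
#     True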
while self.inRange(0) and self.token(0) != DrtTokens.CLOSE_BRACKET:
# Support expressions like: DRS([x y],C) == DRS([x, y],C)
if conds and self.token(0) == DrtTokens.COMMA:
- self.token() # swallow the comma
+ self.token() # swallow the comma
conds.append(self.process_next_expression(context))
self.assertNextToken(DrtTokens.CLOSE_BRACKET)
return conds
def handle_prop(self, tok, context):
variable = self.make_VariableExpression(tok)
- self.assertNextToken(":")
+ self.assertNextToken(':')
drs = self.process_next_expression(DrtTokens.COLON)
return DrtProposition(variable, drs)
elif tok in DrtTokens.OR_LIST:
return DrtOrExpression
elif tok in DrtTokens.IMP_LIST:
-
def make_imp_expression(first, second):
if isinstance(first, DRS):
return DRS(first.refs, first.conds, second)
if isinstance(first, DrtConcatenation):
return DrtConcatenation(first.first, first.second, second)
- raise Exception("Antecedent of implication must be a DRS")
-
+ raise Exception('Antecedent of implication must be a DRS')
return make_imp_expression
else:
return None
return DRS(self.refs, self.conds, other)
if isinstance(self, DrtConcatenation):
return DrtConcatenation(self.first, self.second, other)
- raise Exception("Antecedent of implication must be a DRS")
+ raise Exception('Antecedent of implication must be a DRS')
def equiv(self, other, prover=None):
"""
"""
assert isinstance(other, DrtExpression)
- f1 = self.simplify().fol()
- f2 = other.simplify().fol()
+ f1 = self.simplify().fol()
+ f2 = other.simplify().fol()
return f1.equiv(f2, prover)
@property
def type(self):
- raise AttributeError(
- "'%s' object has no attribute 'type'" % self.__class__.__name__
- )
+ raise AttributeError("'%s' object has no attribute 'type'" %
+ self.__class__.__name__)
def typecheck(self, signature=None):
raise NotImplementedError()
def is_pronoun_function(self):
""" Is self of the form "PRO(x)"? """
- return (
- isinstance(self, DrtApplicationExpression)
- and isinstance(self.function, DrtAbstractVariableExpression)
- and self.function.variable.name == DrtTokens.PRONOUN
- and isinstance(self.argument, DrtIndividualVariableExpression)
- )
+ return isinstance(self, DrtApplicationExpression) and \
+ isinstance(self.function, DrtAbstractVariableExpression) and \
+ self.function.variable.name == DrtTokens.PRONOUN and \
+ isinstance(self.argument, DrtIndividualVariableExpression)
def make_EqualityExpression(self, first, second):
return DrtEqualityExpression(first, second)
return resolve_anaphora(self)
def eliminate_equality(self):
- return self.visit_structured(lambda e: e.eliminate_equality(), self.__class__)
+ return self.visit_structured(lambda e: e.eliminate_equality(),
+ self.__class__)
def pretty_format(self):
"""
Draw the DRS
:return: the pretty print string
"""
- return "\n".join(self._pretty())
+ return '\n'.join(self._pretty())
def pretty_print(self):
print(self.pretty_format())
DrsDrawer(self).draw()
+@python_2_unicode_compatible
class DRS(DrtExpression, Expression):
"""A Discourse Representation Structure."""
-
def __init__(self, refs, conds, consequent=None):
"""
:param refs: list of ``DrtIndividualVariableExpression`` for the
"""Replace all instances of variable v with expression E in self,
where v is free in self."""
if variable in self.refs:
- # if a bound variable is the thing being replaced
+ #if a bound variable is the thing being replaced
if not replace_bound:
return self
else:
i = self.refs.index(variable)
if self.consequent:
- consequent = self.consequent.replace(
- variable, expression, True, alpha_convert
- )
+ consequent = self.consequent.replace(variable, expression, True, alpha_convert)
else:
consequent = None
- return DRS(
- self.refs[:i] + [expression.variable] + self.refs[i + 1 :],
- [
- cond.replace(variable, expression, True, alpha_convert)
- for cond in self.conds
- ],
- consequent,
- )
+ return DRS(self.refs[:i]+[expression.variable]+self.refs[i+1:],
+ [cond.replace(variable, expression, True, alpha_convert)
+ for cond in self.conds],
+ consequent)
else:
if alpha_convert:
# any bound variable that appears in the expression must
# be alpha converted to avoid a conflict
- for ref in set(self.refs) & expression.free():
+ for ref in (set(self.refs) & expression.free()):
newvar = unique_variable(ref)
newvarex = DrtVariableExpression(newvar)
i = self.refs.index(ref)
if self.consequent:
- consequent = self.consequent.replace(
- ref, newvarex, True, alpha_convert
- )
+ consequent = self.consequent.replace(ref, newvarex, True, alpha_convert)
else:
consequent = None
- self = DRS(
- self.refs[:i] + [newvar] + self.refs[i + 1 :],
- [
- cond.replace(ref, newvarex, True, alpha_convert)
- for cond in self.conds
- ],
- consequent,
- )
-
- # replace in the conditions
+ self = DRS(self.refs[:i]+[newvar]+self.refs[i+1:],
+ [cond.replace(ref, newvarex, True, alpha_convert)
+ for cond in self.conds],
+ consequent)
+
+ #replace in the conditions
if self.consequent:
- consequent = self.consequent.replace(
- variable, expression, replace_bound, alpha_convert
- )
+ consequent = self.consequent.replace(variable, expression, replace_bound, alpha_convert)
else:
consequent = None
- return DRS(
- self.refs,
- [
- cond.replace(variable, expression, replace_bound, alpha_convert)
- for cond in self.conds
- ],
- consequent,
- )
+ return DRS(self.refs,
+ [cond.replace(variable, expression, replace_bound, alpha_convert)
+ for cond in self.conds],
+ consequent)
def free(self):
""":see: Expression.free()"""
def get_refs(self, recursive=False):
""":see: AbstractExpression.get_refs()"""
if recursive:
- conds_refs = self.refs + list(
- chain(*(c.get_refs(True) for c in self.conds))
- )
+ conds_refs = self.refs + list(chain(*(c.get_refs(True) for c in self.conds)))
if self.consequent:
conds_refs.extend(self.consequent.get_refs(True))
return conds_refs
def visit_structured(self, function, combinator):
""":see: Expression.visit_structured()"""
- consequent = function(self.consequent) if self.consequent else None
+ consequent = (function(self.consequent) if self.consequent else None)
return combinator(self.refs, list(map(function, self.conds)), consequent)
def eliminate_equality(self):
i = 0
while i < len(drs.conds):
cond = drs.conds[i]
- if (
- isinstance(cond, EqualityExpression)
- and isinstance(cond.first, AbstractVariableExpression)
- and isinstance(cond.second, AbstractVariableExpression)
- ):
- drs = DRS(
- list(set(drs.refs) - set([cond.second.variable])),
- drs.conds[:i] + drs.conds[i + 1 :],
- drs.consequent,
- )
+ if isinstance(cond, EqualityExpression) and \
+ isinstance(cond.first, AbstractVariableExpression) and \
+ isinstance(cond.second, AbstractVariableExpression):
+ drs = DRS(list(set(drs.refs)-set([cond.second.variable])),
+ drs.conds[:i]+drs.conds[i+1:],
+ drs.consequent)
if cond.second.variable != cond.first.variable:
drs = drs.replace(cond.second.variable, cond.first, False, False)
i = 0
for cond in drs.conds:
new_cond = cond.eliminate_equality()
new_cond_simp = new_cond.simplify()
- if (
- not isinstance(new_cond_simp, DRS)
- or new_cond_simp.refs
- or new_cond_simp.conds
- or new_cond_simp.consequent
- ):
+ if not isinstance(new_cond_simp, DRS) or \
+ new_cond_simp.refs or new_cond_simp.conds or \
+ new_cond_simp.consequent:
conds.append(new_cond)
- consequent = drs.consequent.eliminate_equality() if drs.consequent else None
+ consequent = (drs.consequent.eliminate_equality() if drs.consequent else None)
return DRS(drs.refs, conds, consequent)
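# Editor's sketch (illustrative; not part of the upstream patch): an 'x = y'
# condition is folded away by dropping one referent and substituting the other:
#
#     >>> from nltk.sem.drt import DrtExpression
#     >>> print(DrtExpression.fromstring(r'([x,y],[x=y, dog(x), walk(y)])').eliminate_equality())
#     ([x],[dog(x), walk(x)])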
def fol(self):
return accum
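# Editor's sketch (illustrative; not part of the upstream patch): fol()
# translates a DRS into ordinary first-order logic, existentially closing
# its discourse referents:
#
#     >>> from nltk.sem.drt import DrtExpression
#     >>> print(DrtExpression.fromstring(r'([x],[man(x), walks(x)])').fol())
#     exists x.(man(x) & walks(x))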
def _pretty(self):
- refs_line = " ".join(self._order_ref_strings(self.refs))
+ refs_line = ' '.join(self._order_ref_strings(self.refs))
- cond_lines = [
- cond
- for cond_line in [
- filter(lambda s: s.strip(), cond._pretty()) for cond in self.conds
- ]
- for cond in cond_line
- ]
+ cond_lines = [cond for cond_line in [filter(lambda s: s.strip(), cond._pretty())
+ for cond in self.conds]
+ for cond in cond_line]
length = max([len(refs_line)] + list(map(len, cond_lines)))
- drs = (
- [
- " _" + "_" * length + "_ ",
- "| " + refs_line.ljust(length) + " |",
- "|-" + "-" * length + "-|",
- ]
- + ["| " + line.ljust(length) + " |" for line in cond_lines]
- + ["|_" + "_" * length + "_|"]
- )
+ drs = ([' _' + '_' * length + '_ ',
+ '| ' + refs_line.ljust(length) + ' |',
+ '|-' + '-' * length + '-|'] +
+ ['| ' + line.ljust(length) + ' |' for line in cond_lines] +
+ ['|_' + '_' * length + '_|'])
if self.consequent:
- return DrtBinaryExpression._assemble_pretty(
- drs, DrtTokens.IMP, self.consequent._pretty()
- )
+ return DrtBinaryExpression._assemble_pretty(drs, DrtTokens.IMP,
+ self.consequent._pretty())
return drs
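# Editor's sketch (illustrative; not part of the upstream patch): _pretty()
# renders the classic DRS box, which pretty_print() then joins line by line;
# DrtExpression.fromstring(r'([x],[man(x)])').pretty_print() draws roughly:
#
#      ________
#     | x      |
#     |--------|
#     | man(x) |
#     |________|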
def _order_ref_strings(self, refs):
event_vars.append(s)
else:
other_vars.append(s)
- return (
- sorted(other_vars)
- + sorted(event_vars, key=lambda v: int([v[2:], -1][len(v[2:]) == 0]))
- + sorted(func_vars, key=lambda v: (v[0], int([v[1:], -1][len(v[1:]) == 0])))
- + sorted(ind_vars, key=lambda v: (v[0], int([v[1:], -1][len(v[1:]) == 0])))
- )
+ return sorted(other_vars) + \
+ sorted(event_vars, key=lambda v: int([v[2:],-1][len(v[2:]) == 0])) + \
+ sorted(func_vars, key=lambda v: (v[0], int([v[1:],-1][len(v[1:])==0]))) + \
+ sorted(ind_vars, key=lambda v: (v[0], int([v[1:],-1][len(v[1:])==0])))
def __eq__(self, other):
r"""Defines equality modulo alphabetic variance.
for (r1, r2) in zip(self.refs, converted_other.refs):
varex = self.make_VariableExpression(r1)
converted_other = converted_other.replace(r2, varex, True)
- if self.consequent == converted_other.consequent and len(
- self.conds
- ) == len(converted_other.conds):
+ if self.consequent == converted_other.consequent and \
+ len(self.conds) == len(converted_other.conds):
for c1, c2 in zip(self.conds, converted_other.conds):
if not (c1 == c2):
return False
__hash__ = Expression.__hash__
def __str__(self):
- drs = "([%s],[%s])" % (
- ",".join(self._order_ref_strings(self.refs)),
- ", ".join("%s" % cond for cond in self.conds),
- ) # map(str, self.conds)))
+ drs = '([%s],[%s])' % (','.join(self._order_ref_strings(self.refs)),
+ ', '.join("%s" % cond for cond in self.conds)) # map(str, self.conds)))
if self.consequent:
- return (
- DrtTokens.OPEN
- + drs
- + " "
- + DrtTokens.IMP
- + " "
- + "%s" % self.consequent
- + DrtTokens.CLOSE
- )
+ return DrtTokens.OPEN + drs + ' ' + DrtTokens.IMP + ' ' + \
+ "%s" % self.consequent + DrtTokens.CLOSE
return drs
def _pretty(self):
s = "%s" % self
- blank = " " * len(s)
+ blank = ' '*len(s)
return [blank, blank, s, blank]
def eliminate_equality(self):
return self
-
-class DrtIndividualVariableExpression(
- DrtAbstractVariableExpression, IndividualVariableExpression
-):
+class DrtIndividualVariableExpression(DrtAbstractVariableExpression, IndividualVariableExpression):
pass
-
-class DrtFunctionVariableExpression(
- DrtAbstractVariableExpression, FunctionVariableExpression
-):
+class DrtFunctionVariableExpression(DrtAbstractVariableExpression, FunctionVariableExpression):
pass
-
-class DrtEventVariableExpression(
- DrtIndividualVariableExpression, EventVariableExpression
-):
+class DrtEventVariableExpression(DrtIndividualVariableExpression, EventVariableExpression):
pass
-
class DrtConstantExpression(DrtAbstractVariableExpression, ConstantExpression):
pass
+@python_2_unicode_compatible
class DrtProposition(DrtExpression, Expression):
def __init__(self, variable, drs):
self.variable = variable
def replace(self, variable, expression, replace_bound=False, alpha_convert=True):
if self.variable == variable:
- assert isinstance(
- expression, DrtAbstractVariableExpression
- ), "Can only replace a proposition label with a variable"
- return DrtProposition(
- expression.variable,
- self.drs.replace(variable, expression, replace_bound, alpha_convert),
- )
+ assert isinstance(expression, DrtAbstractVariableExpression), "Can only replace a proposition label with a variable"
+ return DrtProposition(expression.variable, self.drs.replace(variable, expression, replace_bound, alpha_convert))
else:
- return DrtProposition(
- self.variable,
- self.drs.replace(variable, expression, replace_bound, alpha_convert),
- )
+ return DrtProposition(self.variable, self.drs.replace(variable, expression, replace_bound, alpha_convert))
def eliminate_equality(self):
return DrtProposition(self.variable, self.drs.eliminate_equality())
def get_refs(self, recursive=False):
- return self.drs.get_refs(True) if recursive else []
+ return (self.drs.get_refs(True) if recursive else [])
def __eq__(self, other):
- return (
- self.__class__ == other.__class__
- and self.variable == other.variable
- and self.drs == other.drs
- )
+ return self.__class__ == other.__class__ and \
+ self.variable == other.variable and \
+ self.drs == other.drs
def __ne__(self, other):
return not self == other
def _pretty(self):
drs_s = self.drs._pretty()
- blank = " " * len("%s" % self.variable)
- return (
- [blank + " " + line for line in drs_s[:1]]
- + ["%s" % self.variable + ":" + line for line in drs_s[1:2]]
- + [blank + " " + line for line in drs_s[2:]]
- )
+ blank = ' ' * len("%s" % self.variable)
+ return ([blank + ' ' + line for line in drs_s[:1]] +
+ ["%s" % self.variable + ':' + line for line in drs_s[1:2]] +
+ [blank + ' ' + line for line in drs_s[2:]])
def visit(self, function, combinator):
""":see: Expression.visit()"""
return combinator(self.variable, function(self.drs))
def __str__(self):
- return "prop(%s, %s)" % (self.variable, self.drs)
+ return 'prop(%s, %s)' % (self.variable, self.drs)
class DrtNegatedExpression(DrtExpression, NegatedExpression):
def _pretty(self):
term_lines = self.term._pretty()
- return (
- [" " + line for line in term_lines[:2]]
- + ["__ " + line for line in term_lines[2:3]]
- + [" | " + line for line in term_lines[3:4]]
- + [" " + line for line in term_lines[4:]]
- )
-
+ return ([' ' + line for line in term_lines[:2]] +
+ ['__ ' + line for line in term_lines[2:3]] +
+ [' | ' + line for line in term_lines[3:4]] +
+ [' ' + line for line in term_lines[4:]])
class DrtLambdaExpression(DrtExpression, LambdaExpression):
def alpha_convert(self, newvar):
binder in the expression to ``newvar``.
:param newvar: ``Variable``, for the new variable
"""
- return self.__class__(
- newvar,
- self.term.replace(self.variable, DrtVariableExpression(newvar), True),
- )
+ return self.__class__(newvar, self.term.replace(self.variable,
+ DrtVariableExpression(newvar), True))
def fol(self):
return LambdaExpression(self.variable, self.term.fol())
while term.__class__ == self.__class__:
variables.append(term.variable)
term = term.term
- var_string = " ".join("%s" % v for v in variables) + DrtTokens.DOT
+ var_string = ' '.join("%s" % v for v in variables) + DrtTokens.DOT
term_lines = term._pretty()
- blank = " " * len(var_string)
- return (
- [" " + blank + line for line in term_lines[:1]]
- + [" \ " + blank + line for line in term_lines[1:2]]
- + [" /\ " + var_string + line for line in term_lines[2:3]]
- + [" " + blank + line for line in term_lines[3:]]
- )
-
+ blank = ' ' * len(var_string)
+ return ([' ' + blank + line for line in term_lines[:1]] +
+ [' \ ' + blank + line for line in term_lines[1:2]] +
+ [' /\ ' + var_string + line for line in term_lines[2:3]] +
+ [' ' + blank + line for line in term_lines[3:]])
class DrtBinaryExpression(DrtExpression, BinaryExpression):
def get_refs(self, recursive=False):
""":see: AbstractExpression.get_refs()"""
- return (
- self.first.get_refs(True) + self.second.get_refs(True) if recursive else []
- )
+ return self.first.get_refs(True) + self.second.get_refs(True) if recursive else []
def _pretty(self):
- return DrtBinaryExpression._assemble_pretty(
- self._pretty_subex(self.first),
- self.getOp(),
- self._pretty_subex(self.second),
- )
+ return DrtBinaryExpression._assemble_pretty(self._pretty_subex(self.first), self.getOp(), self._pretty_subex(self.second))
@staticmethod
def _assemble_pretty(first_lines, op, second_lines):
max_lines = max(len(first_lines), len(second_lines))
first_lines = _pad_vertically(first_lines, max_lines)
second_lines = _pad_vertically(second_lines, max_lines)
- blank = " " * len(op)
+ blank = ' ' * len(op)
first_second_lines = list(zip(first_lines, second_lines))
- return (
- [
- " " + first_line + " " + blank + " " + second_line + " "
- for first_line, second_line in first_second_lines[:2]
- ]
- + [
- "(" + first_line + " " + op + " " + second_line + ")"
- for first_line, second_line in first_second_lines[2:3]
- ]
- + [
- " " + first_line + " " + blank + " " + second_line + " "
- for first_line, second_line in first_second_lines[3:]
- ]
- )
+ return ([' ' + first_line + ' ' + blank + ' ' + second_line + ' ' for first_line, second_line in first_second_lines[:2]] +
+ ['(' + first_line + ' ' + op + ' ' + second_line + ')' for first_line, second_line in first_second_lines[2:3]] +
+ [' ' + first_line + ' ' + blank + ' ' + second_line + ' ' for first_line, second_line in first_second_lines[3:]])
def _pretty_subex(self, subex):
return subex._pretty()
-
class DrtBooleanExpression(DrtBinaryExpression, BooleanExpression):
pass
-
class DrtOrExpression(DrtBooleanExpression, OrExpression):
def fol(self):
return OrExpression(self.first.fol(), self.second.fol())
return [line[1:-1] for line in subex._pretty()]
return DrtBooleanExpression._pretty_subex(self, subex)
-
class DrtEqualityExpression(DrtBinaryExpression, EqualityExpression):
def fol(self):
return EqualityExpression(self.first.fol(), self.second.fol())
-
+@python_2_unicode_compatible
class DrtConcatenation(DrtBooleanExpression):
"""DRS of the form '(DRS + DRS)'"""
-
def __init__(self, first, second, consequent=None):
DrtBooleanExpression.__init__(self, first, second)
self.consequent = consequent
# If variable is bound
if variable in self.get_refs():
if replace_bound:
- first = first.replace(
- variable, expression, replace_bound, alpha_convert
- )
- second = second.replace(
- variable, expression, replace_bound, alpha_convert
- )
+ first = first.replace(variable, expression, replace_bound, alpha_convert)
+ second = second.replace(variable, expression, replace_bound, alpha_convert)
if consequent:
- consequent = consequent.replace(
- variable, expression, replace_bound, alpha_convert
- )
+ consequent = consequent.replace(variable, expression, replace_bound, alpha_convert)
else:
if alpha_convert:
# alpha convert every ref that is free in 'expression'
- for ref in set(self.get_refs(True)) & expression.free():
+ for ref in (set(self.get_refs(True)) & expression.free()):
v = DrtVariableExpression(unique_variable(ref))
- first = first.replace(ref, v, True, alpha_convert)
+ first = first.replace(ref, v, True, alpha_convert)
second = second.replace(ref, v, True, alpha_convert)
if consequent:
consequent = consequent.replace(ref, v, True, alpha_convert)
- first = first.replace(variable, expression, replace_bound, alpha_convert)
+ first = first.replace(variable, expression, replace_bound, alpha_convert)
second = second.replace(variable, expression, replace_bound, alpha_convert)
if consequent:
- consequent = consequent.replace(
- variable, expression, replace_bound, alpha_convert
- )
+ consequent = consequent.replace(variable, expression, replace_bound, alpha_convert)
return self.__class__(first, second, consequent)
def eliminate_equality(self):
- # TODO: at some point. for now, simplify.
+ #TODO: at some point. for now, simplify.
drs = self.simplify()
assert not isinstance(drs, DrtConcatenation)
return drs.eliminate_equality()
def simplify(self):
first = self.first.simplify()
second = self.second.simplify()
- consequent = self.consequent.simplify() if self.consequent else None
+ consequent = (self.consequent.simplify() if self.consequent else None)
if isinstance(first, DRS) and isinstance(second, DRS):
# For any ref that is in both 'first' and 'second'
- for ref in set(first.get_refs(True)) & set(second.get_refs(True)):
+ for ref in (set(first.get_refs(True)) & set(second.get_refs(True))):
# alpha convert the ref in 'second' to prevent collision
newvar = DrtVariableExpression(unique_variable(ref))
second = second.replace(ref, newvar, True)
other_refs = other.get_refs()
if len(self_refs) == len(other_refs):
converted_other = other
- for (r1, r2) in zip(self_refs, other_refs):
+ for (r1,r2) in zip(self_refs, other_refs):
varex = self.make_VariableExpression(r1)
converted_other = converted_other.replace(r2, varex, True)
- return (
- self.first == converted_other.first
- and self.second == converted_other.second
- and self.consequent == converted_other.consequent
- )
+ return self.first == converted_other.first and \
+ self.second == converted_other.second and \
+ self.consequent == converted_other.consequent
return False
def __ne__(self, other):
return e
def _pretty(self):
- drs = DrtBinaryExpression._assemble_pretty(
- self._pretty_subex(self.first),
- self.getOp(),
- self._pretty_subex(self.second),
- )
+ drs = DrtBinaryExpression._assemble_pretty(self._pretty_subex(self.first),
+ self.getOp(),
+ self._pretty_subex(self.second))
if self.consequent:
- drs = DrtBinaryExpression._assemble_pretty(
- drs, DrtTokens.IMP, self._pretty(self.consequent)
- )
+ drs = DrtBinaryExpression._assemble_pretty(drs, DrtTokens.IMP,
+ self._pretty(self.consequent))
return drs
def _pretty_subex(self, subex):
return [line[1:-1] for line in subex._pretty()]
return DrtBooleanExpression._pretty_subex(self, subex)
+
def visit(self, function, combinator):
""":see: Expression.visit()"""
if self.consequent:
- return combinator(
- [function(self.first), function(self.second), function(self.consequent)]
- )
+ return combinator([function(self.first), function(self.second), function(self.consequent)])
else:
return combinator([function(self.first), function(self.second)])
def __str__(self):
first = self._str_subex(self.first)
second = self._str_subex(self.second)
- drs = Tokens.OPEN + first + " " + self.getOp() + " " + second + Tokens.CLOSE
+ drs = Tokens.OPEN + first + ' ' + self.getOp() \
+ + ' ' + second + Tokens.CLOSE
if self.consequent:
- return (
- DrtTokens.OPEN
- + drs
- + " "
- + DrtTokens.IMP
- + " "
- + "%s" % self.consequent
- + DrtTokens.CLOSE
- )
+ return DrtTokens.OPEN + drs + ' ' + DrtTokens.IMP + ' ' + \
+ "%s" % self.consequent + DrtTokens.CLOSE
return drs
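# Editor's sketch (illustrative; not part of the upstream patch): simplify()
# merges a concatenation of two DRSs into a single box, alpha-converting any
# shared referents first:
#
#     >>> from nltk.sem.drt import DrtExpression
#     >>> print(DrtExpression.fromstring(r'(([x],[dog(x)]) + ([y],[walk(y)]))').simplify())
#     ([x,y],[dog(x), walk(y)])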
def _str_subex(self, subex):
def get_refs(self, recursive=False):
""":see: AbstractExpression.get_refs()"""
- return (
- self.function.get_refs(True) + self.argument.get_refs(True)
- if recursive
- else []
- )
+ return (self.function.get_refs(True) + self.argument.get_refs(True)
+ if recursive else [])
def _pretty(self):
function, args = self.uncurry()
function_lines = _pad_vertically(function_lines, max_lines)
args_lines = [_pad_vertically(arg_lines, max_lines) for arg_lines in args_lines]
func_args_lines = list(zip(function_lines, list(zip(*args_lines))))
- return (
- [
- func_line + " " + " ".join(args_line) + " "
- for func_line, args_line in func_args_lines[:2]
- ]
- + [
- func_line + "(" + ",".join(args_line) + ")"
- for func_line, args_line in func_args_lines[2:3]
- ]
- + [
- func_line + " " + " ".join(args_line) + " "
- for func_line, args_line in func_args_lines[3:]
- ]
- )
+ return ([func_line + ' ' + ' '.join(args_line) + ' ' for func_line, args_line in func_args_lines[:2]] +
+ [func_line + '(' + ','.join(args_line) + ')' for func_line, args_line in func_args_lines[2:3]] +
+ [func_line + ' ' + ' '.join(args_line) + ' ' for func_line, args_line in func_args_lines[3:]])
def _pad_vertically(lines, max_lines):
- pad_line = [" " * len(lines[0])]
+ pad_line = [' ' * len(lines[0])]
return lines + pad_line * (max_lines - len(lines))
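def _pad_vertically_demo():
    # Editor's sketch (illustrative; not part of the upstream patch): the
    # helper above bottom-pads a column of equal-width lines with blanks so
    # that several columns can be zipped together row by row.
    print(_pad_vertically(['ab'], 3))    # -> ['ab', '  ', '  ']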
+@python_2_unicode_compatible
class PossibleAntecedents(list, DrtExpression, Expression):
def free(self):
"""Set of free variables."""
def _pretty(self):
s = "%s" % self
- blank = " " * len(s)
+ blank = ' ' * len(s)
return [blank, blank, s]
def __str__(self):
- return "[" + ",".join("%s" % it for it in self) + "]"
+ return '[' + ','.join("%s" % it for it in self) + ']'
class AnaphoraResolutionException(Exception):
for ref in ancestor.get_refs():
refex = expression.make_VariableExpression(ref)
- # ==========================================================
+ #==========================================================
# Don't allow resolution to itself or other types
- # ==========================================================
- if refex.__class__ == expression.argument.__class__ and not (
- refex == expression.argument
- ):
+ #==========================================================
+ if refex.__class__ == expression.argument.__class__ and \
+ not (refex == expression.argument):
possible_antecedents.append(refex)
if len(possible_antecedents) == 1:
# if the condition is of the form '(x = [])' then raise exception
if isinstance(r_cond, EqualityExpression):
if isinstance(r_cond.first, PossibleAntecedents):
- # Reverse the order so that the variable is on the left
+ #Reverse the order so that the variable is on the left
temp = r_cond.first
r_cond.first = r_cond.second
r_cond.second = temp
if isinstance(r_cond.second, PossibleAntecedents):
if not r_cond.second:
- raise AnaphoraResolutionException(
- "Variable '%s' does not "
- "resolve to anything." % r_cond.first
- )
+ raise AnaphoraResolutionException("Variable '%s' does not "
+ "resolve to anything." % r_cond.first)
r_conds.append(r_cond)
if expression.consequent:
return expression
elif isinstance(expression, NegatedExpression):
- return expression.__class__(
- resolve_anaphora(expression.term, trail + [expression])
- )
+ return expression.__class__(resolve_anaphora(expression.term, trail + [expression]))
elif isinstance(expression, DrtConcatenation):
if expression.consequent:
consequent = resolve_anaphora(expression.consequent, trail + [expression])
else:
consequent = None
- return expression.__class__(
- resolve_anaphora(expression.first, trail + [expression]),
- resolve_anaphora(expression.second, trail + [expression]),
- consequent,
- )
+ return expression.__class__(resolve_anaphora(expression.first, trail + [expression]),
+ resolve_anaphora(expression.second, trail + [expression]),
+ consequent)
elif isinstance(expression, BinaryExpression):
- return expression.__class__(
- resolve_anaphora(expression.first, trail + [expression]),
- resolve_anaphora(expression.second, trail + [expression]),
- )
+ return expression.__class__(resolve_anaphora(expression.first, trail + [expression]),
+ resolve_anaphora(expression.second, trail + [expression]))
elif isinstance(expression, LambdaExpression):
- return expression.__class__(
- expression.variable, resolve_anaphora(expression.term, trail + [expression])
- )
+ return expression.__class__(expression.variable, resolve_anaphora(expression.term, trail + [expression]))
class DrsDrawer(object):
- BUFFER = 3 # Space between elements
- TOPSPACE = 10 # Space above whole DRS
- OUTERSPACE = 6 # Space to the left, right, and bottom of the whole DRS
+ BUFFER = 3 #Space between elements
+ TOPSPACE = 10 #Space above whole DRS
+ OUTERSPACE = 6 #Space to the left, right, and bottom of the whole DRS
def __init__(self, drs, size_canvas=True, canvas=None):
"""
master = Tk()
master.title("DRT")
- font = Font(family="helvetica", size=12)
+ font = Font(family='helvetica', size=12)
if size_canvas:
canvas = Canvas(master, width=0, height=0)
self.canvas = canvas
(right, bottom) = self._visit(drs, self.OUTERSPACE, self.TOPSPACE)
- width = max(right + self.OUTERSPACE, 100)
- height = bottom + self.OUTERSPACE
- canvas = Canvas(master, width=width, height=height) # , bg='white')
+ width = max(right+self.OUTERSPACE, 100)
+ height = bottom+self.OUTERSPACE
+ canvas = Canvas(master, width=width, height=height)#, bg='white')
else:
canvas = Canvas(master, width=300, height=300)
:param y: the left side of the current drawing area
:return: the bottom-rightmost point
"""
- if isinstance(item, str):
- self.canvas.create_text(x, y, anchor="nw", font=self.canvas.font, text=item)
+ if isinstance(item, string_types):
+ self.canvas.create_text(x, y, anchor='nw', font=self.canvas.font, text=item)
elif isinstance(item, tuple):
# item is the lower-right of a box
(right, bottom) = item
self.canvas.create_rectangle(x, y, right, bottom)
- horiz_line_y = (
- y + self._get_text_height() + (self.BUFFER * 2)
- ) # the line separating refs from conds
+ horiz_line_y = y + self._get_text_height() + (self.BUFFER * 2) #the line separating refs from conds
self.canvas.create_line(x, horiz_line_y, right, horiz_line_y)
return self._visit_command(item, x, y)
:param y: the left side of the current drawing area
:return: the bottom-rightmost point
"""
- if isinstance(item, str):
+ if isinstance(item, string_types):
return (x + self.canvas.font.measure(item), y + self._get_text_height())
elif isinstance(item, tuple):
return item
:return: the bottom-rightmost point
"""
if command == self._visit_command:
- # if we don't need to draw the item, then we can use the cached values
+ #if we don't need to draw the item, then we can use the cached values
try:
- # attempt to retrieve cached values
+ #attempt to retrieve cached values
right = expression._drawing_width + x
bottom = expression._drawing_height + y
return (right, bottom)
except AttributeError:
- # the values have not been cached yet, so compute them
+ #the values have not been cached yet, so compute them
pass
if isinstance(expression, DrtAbstractVariableExpression):
(right, bottom) = factory(expression, command, x, y)
- # cache the values
+ #cache the values
expression._drawing_width = right - x
expression._drawing_height = bottom - y
(right, bottom) = self._handle(expression.term, command, right, y)
# Handle variables now that we know the y-coordinate
- command(
- DrtTokens.NOT,
- x,
- self._get_centered_top(y, bottom - y, self._get_text_height()),
- )
+ command(DrtTokens.NOT, x, self._get_centered_top(y, bottom - y, self._get_text_height()))
return (right, bottom)
def _handle_DRS(self, expression, command, x, y):
- left = x + self.BUFFER # indent the left side
- bottom = y + self.BUFFER # indent the top
+ left = x + self.BUFFER #indent the left side
+ bottom = y + self.BUFFER #indent the top
# Handle Discourse Referents
if expression.refs:
- refs = " ".join("%s" % r for r in expression.refs)
+ refs = ' '.join("%s"%r for r in expression.refs)
else:
- refs = " "
+ refs = ' '
(max_right, bottom) = command(refs, left, bottom)
- bottom += self.BUFFER * 2
+ bottom += (self.BUFFER * 2)
# Handle Conditions
if expression.conds:
def _handle_ApplicationExpression(self, expression, command, x, y):
function, args = expression.uncurry()
if not isinstance(function, DrtAbstractVariableExpression):
- # It's not a predicate expression ("P(x,y)"), so leave arguments curried
+ #It's not a predicate expression ("P(x,y)"), so leave arguments curried
function = expression.function
args = [expression.argument]
# Get the max bottom of any element on the line
function_bottom = self._visit(function, x, y)[1]
- max_bottom = max(
- [function_bottom] + [self._visit(arg, x, y)[1] for arg in args]
- )
+ max_bottom = max([function_bottom] + [self._visit(arg, x, y)[1] for arg in args])
line_height = max_bottom - y
# Handle 'function'
- function_drawing_top = self._get_centered_top(
- y, line_height, function._drawing_height
- )
+ function_drawing_top = self._get_centered_top(y, line_height, function._drawing_height)
right = self._handle(function, command, x, function_drawing_top)[0]
# Handle open paren
- centred_string_top = self._get_centered_top(
- y, line_height, self._get_text_height()
- )
+ centred_string_top = self._get_centered_top(y, line_height, self._get_text_height())
right = command(DrtTokens.OPEN, right, centred_string_top)[0]
# Handle each arg
- for (i, arg) in enumerate(args):
- arg_drawing_top = self._get_centered_top(
- y, line_height, arg._drawing_height
- )
+ for (i,arg) in enumerate(args):
+ arg_drawing_top = self._get_centered_top(y, line_height, arg._drawing_height)
right = self._handle(arg, command, right, arg_drawing_top)[0]
- if i + 1 < len(args):
- # since it's not the last arg, add a comma
- right = command(DrtTokens.COMMA + " ", right, centred_string_top)[0]
+ if i+1 < len(args):
+ #since it's not the last arg, add a comma
+ right = command(DrtTokens.COMMA + ' ', right, centred_string_top)[0]
# Handle close paren
right = command(DrtTokens.CLOSE, right, centred_string_top)[0]
(right, bottom) = self._handle(expression.term, command, right, y)
# Handle variables now that we know the y-coordinate
- command(
- variables, x, self._get_centered_top(y, bottom - y, self._get_text_height())
- )
+ command(variables, x, self._get_centered_top(y, bottom - y, self._get_text_height()))
return (right, bottom)
line_height = max(first_height, second_height)
# Handle open paren
- centred_string_top = self._get_centered_top(
- y, line_height, self._get_text_height()
- )
+ centred_string_top = self._get_centered_top(y, line_height, self._get_text_height())
right = command(DrtTokens.OPEN, x, centred_string_top)[0]
# Handle the first operand
first_height = expression.first._drawing_height
- (right, first_bottom) = self._handle(
- expression.first,
- command,
- right,
- self._get_centered_top(y, line_height, first_height),
- )
+ (right, first_bottom) = self._handle(expression.first, command, right, self._get_centered_top(y, line_height, first_height))
# Handle the operator
- right = command(" %s " % expression.getOp(), right, centred_string_top)[0]
+ right = command(' %s ' % expression.getOp(), right, centred_string_top)[0]
# Handle the second operand
second_height = expression.second._drawing_height
- (right, second_bottom) = self._handle(
- expression.second,
- command,
- right,
- self._get_centered_top(y, line_height, second_height),
- )
+ (right, second_bottom) = self._handle(expression.second, command, right, self._get_centered_top(y, line_height, second_height))
# Handle close paren
right = command(DrtTokens.CLOSE, right, centred_string_top)[0]
def demo():
- print("=" * 20 + "TEST PARSE" + "=" * 20)
+ print('='*20 + 'TEST PARSE' + '='*20)
dexpr = DrtExpression.fromstring
- print(dexpr(r"([x,y],[sees(x,y)])"))
- print(dexpr(r"([x],[man(x), walks(x)])"))
- print(dexpr(r"\x.\y.([],[sees(x,y)])"))
- print(dexpr(r"\x.([],[walks(x)])(john)"))
- print(dexpr(r"(([x],[walks(x)]) + ([y],[runs(y)]))"))
- print(dexpr(r"(([],[walks(x)]) -> ([],[runs(x)]))"))
- print(dexpr(r"([x],[PRO(x), sees(John,x)])"))
- print(dexpr(r"([x],[man(x), -([],[walks(x)])])"))
- print(dexpr(r"([],[(([x],[man(x)]) -> ([],[walks(x)]))])"))
-
- print("=" * 20 + "Test fol()" + "=" * 20)
- print(dexpr(r"([x,y],[sees(x,y)])").fol())
-
- print("=" * 20 + "Test alpha conversion and lambda expression equality" + "=" * 20)
- e1 = dexpr(r"\x.([],[P(x)])")
+ print(dexpr(r'([x,y],[sees(x,y)])'))
+ print(dexpr(r'([x],[man(x), walks(x)])'))
+ print(dexpr(r'\x.\y.([],[sees(x,y)])'))
+ print(dexpr(r'\x.([],[walks(x)])(john)'))
+ print(dexpr(r'(([x],[walks(x)]) + ([y],[runs(y)]))'))
+ print(dexpr(r'(([],[walks(x)]) -> ([],[runs(x)]))'))
+ print(dexpr(r'([x],[PRO(x), sees(John,x)])'))
+ print(dexpr(r'([x],[man(x), -([],[walks(x)])])'))
+ print(dexpr(r'([],[(([x],[man(x)]) -> ([],[walks(x)]))])'))
+
+ print('='*20 + 'Test fol()' + '='*20)
+ print(dexpr(r'([x,y],[sees(x,y)])').fol())
+
+ print('='*20 + 'Test alpha conversion and lambda expression equality' + '='*20)
+ e1 = dexpr(r'\x.([],[P(x)])')
print(e1)
- e2 = e1.alpha_convert(Variable("z"))
+ e2 = e1.alpha_convert(Variable('z'))
print(e2)
print(e1 == e2)
- print("=" * 20 + "Test resolve_anaphora()" + "=" * 20)
- print(resolve_anaphora(dexpr(r"([x,y,z],[dog(x), cat(y), walks(z), PRO(z)])")))
- print(
- resolve_anaphora(dexpr(r"([],[(([x],[dog(x)]) -> ([y],[walks(y), PRO(y)]))])"))
- )
- print(resolve_anaphora(dexpr(r"(([x,y],[]) + ([],[PRO(x)]))")))
+ print('='*20 + 'Test resolve_anaphora()' + '='*20)
+ print(resolve_anaphora(dexpr(r'([x,y,z],[dog(x), cat(y), walks(z), PRO(z)])')))
+ print(resolve_anaphora(dexpr(r'([],[(([x],[dog(x)]) -> ([y],[walks(y), PRO(y)]))])')))
+ print(resolve_anaphora(dexpr(r'(([x,y],[]) + ([],[PRO(x)]))')))
- print("=" * 20 + "Test pretty_print()" + "=" * 20)
+ print('='*20 + 'Test pretty_print()' + '='*20)
dexpr(r"([],[])").pretty_print()
- dexpr(
- r"([],[([x],[big(x), dog(x)]) -> ([],[bark(x)]) -([x],[walk(x)])])"
- ).pretty_print()
+ dexpr(r"([],[([x],[big(x), dog(x)]) -> ([],[bark(x)]) -([x],[walk(x)])])").pretty_print()
dexpr(r"([x,y],[x=y]) + ([z],[dog(z), walk(z)])").pretty_print()
dexpr(r"([],[([x],[]) | ([y],[]) | ([z],[dog(z), walk(z)])])").pretty_print()
dexpr(r"\P.\Q.(([x],[]) + P(x) + Q(x))(\x.([],[dog(x)]))").pretty_print()
def test_draw():
try:
- from tkinter import Tk
+ from six.moves.tkinter import Tk
except ImportError:
from nose import SkipTest
-
raise SkipTest("tkinter is required, but it's not available.")
expressions = [
- r"x",
- r"([],[])",
- r"([x],[])",
- r"([x],[man(x)])",
- r"([x,y],[sees(x,y)])",
- r"([x],[man(x), walks(x)])",
- r"\x.([],[man(x), walks(x)])",
- r"\x y.([],[sees(x,y)])",
- r"([],[(([],[walks(x)]) + ([],[runs(x)]))])",
- r"([x],[man(x), -([],[walks(x)])])",
- r"([],[(([x],[man(x)]) -> ([],[walks(x)]))])",
- ]
+ r'x',
+ r'([],[])',
+ r'([x],[])',
+ r'([x],[man(x)])',
+
+ r'([x,y],[sees(x,y)])',
+ r'([x],[man(x), walks(x)])',
+ r'\x.([],[man(x), walks(x)])',
+ r'\x y.([],[sees(x,y)])',
+ r'([],[(([],[walks(x)]) + ([],[runs(x)]))])',
+
+ r'([x],[man(x), -([],[walks(x)])])',
+ r'([],[(([x],[man(x)]) -> ([],[walks(x)]))])'
+ ]
for e in expressions:
d = DrtExpression.fromstring(e)
d.draw()
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
#
# Author: Dan Garrette <dhgarrette@gmail.com>
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
try:
- from tkinter import (
- Button,
- Frame,
- IntVar,
- Label,
- Listbox,
- Menu,
- Scrollbar,
- Tk,
- )
- from tkinter.font import Font
+ from six.moves.tkinter import (Button, Frame, IntVar, Label, Listbox, Menu,
+ Scrollbar, Tk)
+ from six.moves.tkinter_font import Font
from nltk.draw.util import CanvasFrame, ShowText
except ImportError:
def __init__(self, examples):
# Set up the main window.
self._top = Tk()
- self._top.title("DRT Glue Demo")
+ self._top.title('DRT Glue Demo')
# Set up key bindings.
self._init_bindings()
self._init_canvas(self._top)
# Resize callback
- self._canvas.bind("<Configure>", self._configure)
+ self._canvas.bind('<Configure>', self._configure)
#########################################
## Initialization Helpers
def _init_glue(self):
tagger = RegexpTagger(
- [
- ("^(David|Mary|John)$", "NNP"),
- (
- "^(walks|sees|eats|chases|believes|gives|sleeps|chases|persuades|tries|seems|leaves)$",
- "VB",
- ),
- ("^(go|order|vanish|find|approach)$", "VB"),
- ("^(a)$", "ex_quant"),
- ("^(every)$", "univ_quant"),
- ("^(sandwich|man|dog|pizza|unicorn|cat|senator)$", "NN"),
- ("^(big|gray|former)$", "JJ"),
- ("^(him|himself)$", "PRP"),
- ]
- )
+ [('^(David|Mary|John)$', 'NNP'),
+ ('^(walks|sees|eats|chases|believes|gives|sleeps|chases|persuades|tries|seems|leaves)$', 'VB'),
+ ('^(go|order|vanish|find|approach)$', 'VB'),
+ ('^(a)$', 'ex_quant'),
+ ('^(every)$', 'univ_quant'),
+ ('^(sandwich|man|dog|pizza|unicorn|cat|senator)$', 'NN'),
+ ('^(big|gray|former)$', 'JJ'),
+ ('^(him|himself)$', 'PRP')
+ ])
depparser = MaltParser(tagger=tagger)
self._glue = DrtGlue(depparser=depparser, remove_duplicates=False)
# What's our font size (default: same as sysfont)
self._size = IntVar(root)
- self._size.set(self._sysfont.cget("size"))
+ self._size.set(self._sysfont.cget('size'))
- self._boldfont = Font(family="helvetica", weight="bold", size=self._size.get())
- self._font = Font(family="helvetica", size=self._size.get())
- if self._size.get() < 0:
- big = self._size.get() - 2
- else:
- big = self._size.get() + 2
- self._bigfont = Font(family="helvetica", weight="bold", size=big)
+ self._boldfont = Font(family='helvetica', weight='bold',
+ size=self._size.get())
+ self._font = Font(family='helvetica',
+ size=self._size.get())
+ if self._size.get() < 0: big = self._size.get()-2
+ else: big = self._size.get()+2
+ self._bigfont = Font(family='helvetica', weight='bold',
+ size=big)
def _init_exampleListbox(self, parent):
self._exampleFrame = listframe = Frame(parent)
- self._exampleFrame.pack(fill="both", side="left", padx=2)
- self._exampleList_label = Label(
- self._exampleFrame, font=self._boldfont, text="Examples"
- )
+ self._exampleFrame.pack(fill='both', side='left', padx=2)
+ self._exampleList_label = Label(self._exampleFrame, font=self._boldfont,
+ text='Examples')
self._exampleList_label.pack()
- self._exampleList = Listbox(
- self._exampleFrame,
- selectmode="single",
- relief="groove",
- background="white",
- foreground="#909090",
- font=self._font,
- selectforeground="#004040",
- selectbackground="#c0f0c0",
- )
-
- self._exampleList.pack(side="right", fill="both", expand=1)
+ self._exampleList = Listbox(self._exampleFrame, selectmode='single',
+ relief='groove', background='white',
+ foreground='#909090', font=self._font,
+ selectforeground='#004040',
+ selectbackground='#c0f0c0')
+
+ self._exampleList.pack(side='right', fill='both', expand=1)
for example in self._examples:
- self._exampleList.insert("end", (" %s" % example))
+ self._exampleList.insert('end', (' %s' % example))
self._exampleList.config(height=min(len(self._examples), 25), width=40)
# Add a scrollbar if there are more than 25 examples.
if len(self._examples) > 25:
- listscroll = Scrollbar(self._exampleFrame, orient="vertical")
- self._exampleList.config(yscrollcommand=listscroll.set)
+ listscroll = Scrollbar(self._exampleFrame,
+ orient='vertical')
+ self._exampleList.config(yscrollcommand = listscroll.set)
listscroll.config(command=self._exampleList.yview)
- listscroll.pack(side="left", fill="y")
+ listscroll.pack(side='left', fill='y')
# If they select a example, apply it.
- self._exampleList.bind("<<ListboxSelect>>", self._exampleList_select)
+ self._exampleList.bind('<<ListboxSelect>>', self._exampleList_select)
def _init_readingListbox(self, parent):
self._readingFrame = listframe = Frame(parent)
- self._readingFrame.pack(fill="both", side="left", padx=2)
- self._readingList_label = Label(
- self._readingFrame, font=self._boldfont, text="Readings"
- )
+ self._readingFrame.pack(fill='both', side='left', padx=2)
+ self._readingList_label = Label(self._readingFrame, font=self._boldfont,
+ text='Readings')
self._readingList_label.pack()
- self._readingList = Listbox(
- self._readingFrame,
- selectmode="single",
- relief="groove",
- background="white",
- foreground="#909090",
- font=self._font,
- selectforeground="#004040",
- selectbackground="#c0f0c0",
- )
-
- self._readingList.pack(side="right", fill="both", expand=1)
+ self._readingList = Listbox(self._readingFrame, selectmode='single',
+ relief='groove', background='white',
+ foreground='#909090', font=self._font,
+ selectforeground='#004040',
+ selectbackground='#c0f0c0')
+
+ self._readingList.pack(side='right', fill='both', expand=1)
# Add a scrollbar if there are more than 25 examples.
- listscroll = Scrollbar(self._readingFrame, orient="vertical")
- self._readingList.config(yscrollcommand=listscroll.set)
+ listscroll = Scrollbar(self._readingFrame,
+ orient='vertical')
+ self._readingList.config(yscrollcommand = listscroll.set)
listscroll.config(command=self._readingList.yview)
- listscroll.pack(side="right", fill="y")
+ listscroll.pack(side='right', fill='y')
self._populate_readingListbox()
def _populate_readingListbox(self):
# Populate the listbox with integers
- self._readingList.delete(0, "end")
+ self._readingList.delete(0, 'end')
for i in range(len(self._readings)):
- self._readingList.insert("end", (" %s" % (i + 1)))
+ self._readingList.insert('end', (' %s' % (i+1)))
self._readingList.config(height=min(len(self._readings), 25), width=5)
# If they select a example, apply it.
- self._readingList.bind("<<ListboxSelect>>", self._readingList_select)
+ self._readingList.bind('<<ListboxSelect>>', self._readingList_select)
def _init_bindings(self):
# Key bindings are a good thing.
- self._top.bind("<Control-q>", self.destroy)
- self._top.bind("<Control-x>", self.destroy)
- self._top.bind("<Escape>", self.destroy)
- self._top.bind("n", self.next)
- self._top.bind("<space>", self.next)
- self._top.bind("p", self.prev)
- self._top.bind("<BackSpace>", self.prev)
+ self._top.bind('<Control-q>', self.destroy)
+ self._top.bind('<Control-x>', self.destroy)
+ self._top.bind('<Escape>', self.destroy)
+ self._top.bind('n', self.next)
+ self._top.bind('<space>', self.next)
+ self._top.bind('p', self.prev)
+ self._top.bind('<BackSpace>', self.prev)
def _init_buttons(self, parent):
# Set up the frames.
self._buttonframe = buttonframe = Frame(parent)
- buttonframe.pack(fill="none", side="bottom", padx=3, pady=2)
- Button(
- buttonframe,
- text="Prev",
- background="#90c0d0",
- foreground="black",
- command=self.prev,
- ).pack(side="left")
- Button(
- buttonframe,
- text="Next",
- background="#90c0d0",
- foreground="black",
- command=self.next,
- ).pack(side="left")
+ buttonframe.pack(fill='none', side='bottom', padx=3, pady=2)
+ Button(buttonframe, text='Prev',
+ background='#90c0d0', foreground='black',
+ command=self.prev,).pack(side='left')
+ Button(buttonframe, text='Next',
+ background='#90c0d0', foreground='black',
+ command=self.next,).pack(side='left')
def _configure(self, event):
self._autostep = 0
(x1, y1, x2, y2) = self._cframe.scrollregion()
y2 = event.height - 6
- self._canvas["scrollregion"] = "%d %d %d %d" % (x1, y1, x2, y2)
+ self._canvas['scrollregion'] = '%d %d %d %d' % (x1,y1,x2,y2)
self._redraw()
def _init_canvas(self, parent):
- self._cframe = CanvasFrame(
- parent,
- background="white",
- # width=525, height=250,
- closeenough=10,
- border=2,
- relief="sunken",
- )
- self._cframe.pack(expand=1, fill="both", side="top", pady=2)
+ self._cframe = CanvasFrame(parent, background='white',
+ #width=525, height=250,
+ closeenough=10,
+ border=2, relief='sunken')
+ self._cframe.pack(expand=1, fill='both', side='top', pady=2)
canvas = self._canvas = self._cframe.canvas()
# Initially, there's no tree or text
menubar = Menu(parent)
filemenu = Menu(menubar, tearoff=0)
- filemenu.add_command(
- label="Exit", underline=1, command=self.destroy, accelerator="q"
- )
- menubar.add_cascade(label="File", underline=0, menu=filemenu)
+ filemenu.add_command(label='Exit', underline=1,
+ command=self.destroy, accelerator='q')
+ menubar.add_cascade(label='File', underline=0, menu=filemenu)
actionmenu = Menu(menubar, tearoff=0)
- actionmenu.add_command(
- label="Next", underline=0, command=self.next, accelerator="n, Space"
- )
- actionmenu.add_command(
- label="Previous", underline=0, command=self.prev, accelerator="p, Backspace"
- )
- menubar.add_cascade(label="Action", underline=0, menu=actionmenu)
+ actionmenu.add_command(label='Next', underline=0,
+ command=self.next, accelerator='n, Space')
+ actionmenu.add_command(label='Previous', underline=0,
+ command=self.prev, accelerator='p, Backspace')
+ menubar.add_cascade(label='Action', underline=0, menu=actionmenu)
optionmenu = Menu(menubar, tearoff=0)
- optionmenu.add_checkbutton(
- label="Remove Duplicates",
- underline=0,
- variable=self._glue.remove_duplicates,
- command=self._toggle_remove_duplicates,
- accelerator="r",
- )
- menubar.add_cascade(label="Options", underline=0, menu=optionmenu)
+ optionmenu.add_checkbutton(label='Remove Duplicates', underline=0,
+ variable=self._glue.remove_duplicates,
+ command=self._toggle_remove_duplicates,
+ accelerator='r')
+ menubar.add_cascade(label='Options', underline=0, menu=optionmenu)
viewmenu = Menu(menubar, tearoff=0)
- viewmenu.add_radiobutton(
- label="Tiny",
- variable=self._size,
- underline=0,
- value=10,
- command=self.resize,
- )
- viewmenu.add_radiobutton(
- label="Small",
- variable=self._size,
- underline=0,
- value=12,
- command=self.resize,
- )
- viewmenu.add_radiobutton(
- label="Medium",
- variable=self._size,
- underline=0,
- value=14,
- command=self.resize,
- )
- viewmenu.add_radiobutton(
- label="Large",
- variable=self._size,
- underline=0,
- value=18,
- command=self.resize,
- )
- viewmenu.add_radiobutton(
- label="Huge",
- variable=self._size,
- underline=0,
- value=24,
- command=self.resize,
- )
- menubar.add_cascade(label="View", underline=0, menu=viewmenu)
+ viewmenu.add_radiobutton(label='Tiny', variable=self._size,
+ underline=0, value=10, command=self.resize)
+ viewmenu.add_radiobutton(label='Small', variable=self._size,
+ underline=0, value=12, command=self.resize)
+ viewmenu.add_radiobutton(label='Medium', variable=self._size,
+ underline=0, value=14, command=self.resize)
+ viewmenu.add_radiobutton(label='Large', variable=self._size,
+ underline=0, value=18, command=self.resize)
+ viewmenu.add_radiobutton(label='Huge', variable=self._size,
+ underline=0, value=24, command=self.resize)
+ menubar.add_cascade(label='View', underline=0, menu=viewmenu)
helpmenu = Menu(menubar, tearoff=0)
- helpmenu.add_command(label="About", underline=0, command=self.about)
- menubar.add_cascade(label="Help", underline=0, menu=helpmenu)
+ helpmenu.add_command(label='About', underline=0,
+ command=self.about)
+ menubar.add_cascade(label='Help', underline=0, menu=helpmenu)
parent.config(menu=menubar)
self._drsWidget.clear()
if self._drs:
- self._drsWidget = DrsWidget(self._canvas, self._drs)
+ self._drsWidget = DrsWidget( self._canvas, self._drs )
self._drsWidget.draw()
if self._error:
- self._drsWidget = DrsWidget(self._canvas, self._error)
+ self._drsWidget = DrsWidget( self._canvas, self._error )
self._drsWidget.draw()
#########################################
def destroy(self, *e):
self._autostep = 0
- if self._top is None:
- return
+ if self._top is None: return
self._top.destroy()
self._top = None
if index <= 0:
self._select_previous_example()
else:
- self._readingList_store_selection(index - 1)
+ self._readingList_store_selection(index-1)
else:
- # select its first reading
- self._readingList_store_selection(readingListSize - 1)
+ #select its first reading
+ self._readingList_store_selection(readingListSize-1)
else:
self._select_previous_example()
+
def _select_previous_example(self):
- # if the current example is not the first example
+ #if the current example is not the first example
if self._curExample > 0:
- self._exampleList_store_selection(self._curExample - 1)
+ self._exampleList_store_selection(self._curExample-1)
else:
- # go to the last example
- self._exampleList_store_selection(len(self._examples) - 1)
+ #go to the last example
+ self._exampleList_store_selection(len(self._examples)-1)
def next(self, *e):
selection = self._readingList.curselection()
index = int(selection[0])
# if it's on (or past) the last item
- if index >= (readingListSize - 1):
+ if index >= (readingListSize-1):
self._select_next_example()
else:
- self._readingList_store_selection(index + 1)
+ self._readingList_store_selection(index+1)
else:
- # select its first reading
+ #select its first reading
self._readingList_store_selection(0)
else:
self._select_next_example()
def _select_next_example(self):
- # if the current example is not the last example
- if self._curExample < len(self._examples) - 1:
- self._exampleList_store_selection(self._curExample + 1)
+ #if the current example is not the last example
+ if self._curExample < len(self._examples)-1:
+ self._exampleList_store_selection(self._curExample+1)
else:
- # go to the first example
+ #go to the first example
self._exampleList_store_selection(0)
+
def about(self, *e):
- ABOUT = (
- "NLTK Discourse Representation Theory (DRT) Glue Semantics Demo\n"
- + "Written by Daniel H. Garrette"
- )
- TITLE = "About: NLTK DRT Glue Demo"
+ ABOUT = ("NLTK Discourse Representation Theory (DRT) Glue Semantics Demo\n"+
+ "Written by Daniel H. Garrette")
+ TITLE = 'About: NLTK DRT Glue Demo'
try:
- from tkinter.messagebox import Message
-
+ from six.moves.tkinter_messagebox import Message
Message(message=ABOUT, title=TITLE).show()
except:
ShowText(self._top, TITLE, ABOUT)
from a script); otherwise, the demo will close as soon as
the script completes.
"""
- if in_idle():
- return
+ if in_idle(): return
self._top.mainloop(*args, **kwargs)
def resize(self, size=None):
- if size is not None:
- self._size.set(size)
+ if size is not None: self._size.set(size)
size = self._size.get()
self._font.configure(size=-(abs(size)))
self._boldfont.configure(size=-(abs(size)))
self._sysfont.configure(size=-(abs(size)))
- self._bigfont.configure(size=-(abs(size + 2)))
+ self._bigfont.configure(size=-(abs(size+2)))
self._redraw()
def _toggle_remove_duplicates(self):
self._glue.remove_duplicates = not self._glue.remove_duplicates
- self._exampleList.selection_clear(0, "end")
+ self._exampleList.selection_clear(0, 'end')
self._readings = []
self._populate_readingListbox()
self._readingCache = [None for ex in self._examples]
self._drs = None
self._redraw()
+
def _exampleList_select(self, event):
selection = self._exampleList.curselection()
- if len(selection) != 1:
- return
+ if len(selection) != 1: return
self._exampleList_store_selection(int(selection[0]))
def _exampleList_store_selection(self, index):
self._curExample = index
example = self._examples[index]
- self._exampleList.selection_clear(0, "end")
+ self._exampleList.selection_clear(0, 'end')
if example:
cache = self._readingCache[index]
if cache:
self._readingCache[index] = self._readings
except Exception as e:
self._readings = []
- self._error = DrtVariableExpression(Variable("Error: " + str(e)))
+ self._error = DrtVariableExpression(Variable('Error: ' + str(e)))
self._readingCache[index] = self._error
- # add a star to the end of the example
+ #add a star to the end of the example
self._exampleList.delete(index)
- self._exampleList.insert(index, (" %s *" % example))
- self._exampleList.config(
- height=min(len(self._examples), 25), width=40
- )
+ self._exampleList.insert(index, (' %s *' % example))
+ self._exampleList.config(height=min(len(self._examples), 25), width=40)
self._populate_readingListbox()
self._drs = None
self._redraw()
+
def _readingList_select(self, event):
selection = self._readingList.curselection()
- if len(selection) != 1:
- return
+ if len(selection) != 1: return
self._readingList_store_selection(int(selection[0]))
def _readingList_store_selection(self, index):
reading = self._readings[index]
- self._readingList.selection_clear(0, "end")
+ self._readingList.selection_clear(0, 'end')
if reading:
self._readingList.selection_set(index)
def __init__(self, canvas, drs, **attribs):
self._drs = drs
self._canvas = canvas
- canvas.font = Font(
- font=canvas.itemcget(canvas.create_text(0, 0, text=""), "font")
- )
+ canvas.font = Font(font=canvas.itemcget(canvas.create_text(0, 0, text=''), 'font'))
canvas._BUFFER = 3
self.bbox = (0, 0, 0, 0)
def draw(self):
(right, bottom) = DrsDrawer(self._drs, canvas=self._canvas).draw()
- self.bbox = (0, 0, right + 1, bottom + 1)
+ self.bbox = (0, 0, right+1, bottom+1)
def clear(self):
- self._canvas.create_rectangle(self.bbox, fill="white", width="0")
-
+ self._canvas.create_rectangle(self.bbox, fill="white", width="0" )
def demo():
- examples = [
- "John walks",
- "David sees Mary",
- "David eats a sandwich",
- "every man chases a dog",
- # 'every man believes a dog yawns',
- # 'John gives David a sandwich',
- "John chases himself",
- # 'John persuades David to order a pizza',
- # 'John tries to go',
- # 'John tries to find a unicorn',
- # 'John seems to vanish',
- # 'a unicorn seems to approach',
- # 'every big cat leaves',
- # 'every gray cat leaves',
- # 'every big gray cat leaves',
- # 'a former senator leaves',
- # 'John likes a cat',
- # 'John likes every cat',
- # 'he walks',
- # 'John walks and he leaves'
- ]
+ examples = ['John walks',
+ 'David sees Mary',
+ 'David eats a sandwich',
+ 'every man chases a dog',
+# 'every man believes a dog yawns',
+# 'John gives David a sandwich',
+ 'John chases himself',
+# 'John persuades David to order a pizza',
+# 'John tries to go',
+# 'John tries to find a unicorn',
+# 'John seems to vanish',
+# 'a unicorn seems to approach',
+# 'every big cat leaves',
+# 'every gray cat leaves',
+# 'every big gray cat leaves',
+# 'a former senator leaves',
+# 'John likes a cat',
+# 'John likes every cat',
+# 'he walks',
+# 'John walks and he leaves'
+ ]
DrtGlueDemo(examples).mainloop()
-
-if __name__ == "__main__":
- demo()
+if __name__ == '__main__': demo()
# Natural Language Toolkit: Models for first-order languages with lambda
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>,
# URL: <http://nltk.sourceforge.net>
# For license information, see LICENSE.TXT
-# TODO:
-# - fix tracing
-# - fix iterator-based approach to existentials
+#TODO:
+ #- fix tracing
+ #- fix iterator-based approach to existentials
"""
This module provides data structures for representing first-order
models.
"""
+from __future__ import print_function, unicode_literals
from pprint import pformat
import inspect
import re
import sys
import textwrap  # used by Valuation below; appears to be missing from this hunk's imports
-from nltk.decorators import decorator # this used in code that is commented out
+from six import string_types
-from nltk.sem.logic import (
- AbstractVariableExpression,
- AllExpression,
- Expression,
- AndExpression,
- ApplicationExpression,
- EqualityExpression,
- ExistsExpression,
- IffExpression,
- ImpExpression,
- IndividualVariableExpression,
- LambdaExpression,
- NegatedExpression,
- OrExpression,
- Variable,
- is_indvar,
-)
+from nltk.decorators import decorator # this used in code that is commented out
+from nltk.compat import python_2_unicode_compatible
+from nltk.sem.logic import (AbstractVariableExpression, AllExpression, Expression,
+ AndExpression, ApplicationExpression, EqualityExpression,
+ ExistsExpression, IffExpression, ImpExpression,
+ IndividualVariableExpression, LambdaExpression,
+ NegatedExpression, OrExpression,
+ Variable, is_indvar)
-class Error(Exception):
- pass
+class Error(Exception): pass
-class Undefined(Error):
- pass
-
+class Undefined(Error): pass
def trace(f, *args, **kw):
- argspec = inspect.getfullargspec(f)
+ if sys.version_info[0] >= 3:
+ argspec = inspect.getfullargspec(f)
+ else:
+ argspec = inspect.getargspec(f)
d = dict(zip(argspec[0], args))
- if d.pop("trace", None):
+ if d.pop('trace', None):
print()
for item in d.items():
print("%s => %s" % item)
return f(*args, **kw)
-
def is_rel(s):
"""
Check whether a set represents a relation (of any arity).
if len(s) == 0:
return True
# all the elements are tuples of the same length
- elif all(isinstance(el, tuple) for el in s) and len(max(s)) == len(min(s)):
+ elif all(isinstance(el, tuple) for el in s) and len(max(s))==len(min(s)):
return True
else:
raise ValueError("Set %r contains sequences of different lengths" % s)
-
def set2rel(s):
"""
Convert a set containing individuals (strings or numbers) into a set of
"""
new = set()
for elem in s:
- if isinstance(elem, str):
+ if isinstance(elem, string_types):
new.add((elem,))
elif isinstance(elem, int):
- new.add((str(elem)))
+ new.add((str(elem),))
else:
new.add(elem)
return new
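# Sketch of the normalization (with the 1-tuple fix above, ints are
# converted to strings and wrapped just like bare strings are):
#   set2rel(set(['a', 'b']))  # -> set([('a',), ('b',)])
#   set2rel(set([3, 4]))      # -> set([('3',), ('4',)])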
-
def arity(rel):
"""
Check the arity of a relation.
return len(list(rel)[0])
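# e.g. arity(set([('a', 'b'), ('c', 'd')])) == 2, assuming a well-formed
# relation as checked by is_rel.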
+@python_2_unicode_compatible
class Valuation(dict):
"""
A dictionary which represents a model-theoretic Valuation of non-logical constants.
just behave like a standard dictionary) if indexed with an expression that
is not in its list of symbols.
"""
-
def __init__(self, xs):
"""
:param xs: a list of (symbol, value) pairs.
"""
super(Valuation, self).__init__()
for (sym, val) in xs:
- if isinstance(val, str) or isinstance(val, bool):
+ if isinstance(val, string_types) or isinstance(val, bool):
self[sym] = val
elif isinstance(val, set):
self[sym] = set2rel(val)
else:
- msg = textwrap.fill(
- "Error in initializing Valuation. "
- "Unrecognized value for symbol '%s':\n%s" % (sym, val),
- width=66,
- )
+ msg = textwrap.fill("Error in initializing Valuation. "
+ "Unrecognized value for symbol '%s':\n%s" % (sym, val), width=66)
raise ValueError(msg)
"""Set-theoretic domain of the value-space of a Valuation."""
dom = []
for val in self.values():
- if isinstance(val, str):
+ if isinstance(val, string_types):
dom.append(val)
elif not isinstance(val, bool):
- dom.extend(
- [elem for tuple_ in val for elem in tuple_ if elem is not None]
- )
+ dom.extend([elem for tuple_ in val for elem in tuple_ if elem is not None])
return set(dom)
@property
##########################################
# REs used by the _read_valuation function
##########################################
-_VAL_SPLIT_RE = re.compile(r"\s*=+>\s*")
-_ELEMENT_SPLIT_RE = re.compile(r"\s*,\s*")
-_TUPLES_RE = re.compile(
- r"""\s*
+_VAL_SPLIT_RE = re.compile(r'\s*=+>\s*')
+_ELEMENT_SPLIT_RE = re.compile(r'\s*,\s*')
+_TUPLES_RE = re.compile(r"""\s*
(\([^)]+\)) # tuple-expression
- \s*""",
- re.VERBOSE,
-)
-
+ \s*""", re.VERBOSE)
def _read_valuation_line(s):
"""
symbol = pieces[0]
value = pieces[1]
# check whether the value is meant to be a set
- if value.startswith("{"):
+ if value.startswith('{'):
value = value[1:-1]
tuple_strings = _TUPLES_RE.findall(value)
# are the set elements tuples?
value = set(set_elements)
return symbol, value
-
def read_valuation(s, encoding=None):
"""
Convert a valuation string into a valuation.
statements = []
for linenum, line in enumerate(s.splitlines()):
line = line.strip()
- if line.startswith("#") or line == "":
- continue
+ if line.startswith('#') or line=='': continue
try:
statements.append(_read_valuation_line(line))
except ValueError:
- raise ValueError("Unable to parse line %s: %s" % (linenum, line))
+ raise ValueError('Unable to parse line %s: %s' % (linenum, line))
return Valuation(statements)
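# A quick sketch of read_valuation on the format illustrated above:
s = """
john => b1
girl => {g1, g2}
"""
val = read_valuation(s)
assert val['john'] == 'b1'
assert val['girl'] == set([('g1',), ('g2',)])  # normalized by the Valuation constructor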
+@python_2_unicode_compatible
class Assignment(dict):
"""
A dictionary which represents an assignment of values to variables.
self.domain = domain
if assign:
for (var, val) in assign:
- assert val in self.domain, "'%s' is not in the domain: %s" % (
- val,
- self.domain,
- )
- assert is_indvar(var), (
- "Wrong format for an Individual Variable: '%s'" % var
- )
+ assert val in self.domain,\
+ "'%s' is not in the domain: %s" % (val, self.domain)
+ assert is_indvar(var),\
+ "Wrong format for an Individual Variable: '%s'" % var
self[var] = val
self.variant = None
self._addvariant()
``self.variant``.
"""
- assert val in self.domain, "%s is not in the domain %s" % (val, self.domain)
- assert is_indvar(var), "Wrong format for an Individual Variable: '%s'" % var
+ assert val in self.domain,\
+ "%s is not in the domain %s" % (val, self.domain)
+ assert is_indvar(var),\
+ "Wrong format for an Individual Variable: '%s'" % var
self[var] = val
self._addvariant()
return self
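# A minimal sketch, assuming the signature add(var, val) implied by the
# asserts above:
dom = set(['b1', 'g2'])
g = Assignment(dom, [('x', 'b1')])
g.add('y', 'g2')   # the value must lie in dom; 'y' must be an individual variable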
+@python_2_unicode_compatible
class Model(object):
"""
A first order model is a domain *D* of discourse and a valuation *V*.
self.domain = domain
self.valuation = valuation
if not domain.issuperset(valuation.domain):
- raise Error(
- "The valuation domain, %s, must be a subset of the model's domain, %s"
- % (valuation.domain, domain)
- )
+ raise Error("The valuation domain, %s, must be a subset of the model's domain, %s"\
+ % (valuation.domain, domain))
def __repr__(self):
return "(%r, %r)" % (self.domain, self.valuation)
value = self.satisfy(parsed, g, trace=trace)
if trace:
print()
- print("'%s' evaluates to %s under M, %s" % (expr, value, g))
+ print("'%s' evaluates to %s under M, %s" % (expr, value, g))
return value
except Undefined:
if trace:
print()
- print("'%s' is undefined under M, %s" % (expr, g))
- return "Undefined"
+ print("'%s' is undefined under M, %s" % (expr, g))
+ return 'Undefined'
+
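# Sketch of the contract: evaluate() returns a truth value, or the string
# 'Undefined' when interpretation fails (e.g. an unassigned free variable):
#   m.evaluate('boy(adam)', g)  # -> True
#   m.evaluate('boy(x)', g)     # -> 'Undefined' if 'x' is not bound by g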
def satisfy(self, parsed, g, trace=None):
"""
if isinstance(parsed, ApplicationExpression):
function, arguments = parsed.uncurry()
if isinstance(function, AbstractVariableExpression):
- # It's a predicate expression ("P(x,y)"), so use uncurried arguments
+ #It's a predicate expression ("P(x,y)"), so use uncurried arguments
funval = self.satisfy(function, g)
argvals = tuple(self.satisfy(arg, g) for arg in arguments)
return argvals in funval
else:
- # It must be a lambda expression, so use curried form
+ #It must be a lambda expression, so use curried form
funval = self.satisfy(parsed.function, g)
argval = self.satisfy(parsed.argument, g)
return funval[argval]
elif isinstance(parsed, NegatedExpression):
return not self.satisfy(parsed.term, g)
elif isinstance(parsed, AndExpression):
- return self.satisfy(parsed.first, g) and self.satisfy(parsed.second, g)
+ return self.satisfy(parsed.first, g) and \
+ self.satisfy(parsed.second, g)
elif isinstance(parsed, OrExpression):
- return self.satisfy(parsed.first, g) or self.satisfy(parsed.second, g)
+ return self.satisfy(parsed.first, g) or \
+ self.satisfy(parsed.second, g)
elif isinstance(parsed, ImpExpression):
- return (not self.satisfy(parsed.first, g)) or self.satisfy(parsed.second, g)
+ return (not self.satisfy(parsed.first, g)) or \
+ self.satisfy(parsed.second, g)
elif isinstance(parsed, IffExpression):
- return self.satisfy(parsed.first, g) == self.satisfy(parsed.second, g)
+ return self.satisfy(parsed.first, g) == \
+ self.satisfy(parsed.second, g)
elif isinstance(parsed, EqualityExpression):
- return self.satisfy(parsed.first, g) == self.satisfy(parsed.second, g)
+ return self.satisfy(parsed.first, g) == \
+ self.satisfy(parsed.second, g)
elif isinstance(parsed, AllExpression):
new_g = g.copy()
for u in self.domain:
else:
return self.i(parsed, g, trace)
- # @decorator(trace_eval)
+ #@decorator(trace_eval)
def i(self, parsed, g, trace=False):
"""
An interpretation function.
:return: a set of the entities that satisfy ``parsed``.
"""
- spacer = " "
+ spacer = ' '
indent = spacer + (spacer * nesting)
candidates = []
- if isinstance(varex, str):
+ if isinstance(varex, string_types):
var = Variable(varex)
else:
var = varex
if var in parsed.free():
if trace:
print()
- print(
- (spacer * nesting)
- + "Open formula is '%s' with assignment %s" % (parsed, g)
- )
+ print((spacer * nesting) + "Open formula is '%s' with assignment %s" % (parsed, g))
for u in self.domain:
new_g = g.copy()
new_g.add(var.name, u)
if trace and trace > 1:
- lowtrace = trace - 1
+ lowtrace = trace-1
else:
lowtrace = 0
value = self.satisfy(parsed, new_g, lowtrace)
# parsed == False under g[u/var]?
if value == False:
if trace:
- print(
- indent + "value of '%s' under %s is False" % (parsed, new_g)
- )
+ print(indent + "value of '%s' under %s is False" % (parsed, new_g))
# so g[u/var] is a satisfying assignment
else:
candidates.append(u)
if trace:
- print(
- indent
- + "value of '%s' under %s is %s" % (parsed, new_g, value)
- )
+ print(indent + "value of '%s' under %s is %s" % (parsed, new_g, value))
result = set(c for c in candidates)
# var isn't free in parsed
return result
-# //////////////////////////////////////////////////////////////////////
+
+
+
+#//////////////////////////////////////////////////////////////////////
# Demo..
-# //////////////////////////////////////////////////////////////////////
+#//////////////////////////////////////////////////////////////////////
# number of spacer chars
mult = 30
"""Example of a propositional model."""
global val1, dom1, m1, g1
- val1 = Valuation([("P", True), ("Q", True), ("R", False)])
+ val1 = Valuation([('P', True), ('Q', True), ('R', False)])
dom1 = set([])
m1 = Model(dom1, val1)
g1 = Assignment(dom1)
print()
- print("*" * mult)
+ print('*' * mult)
print("Propositional Formulas Demo")
- print("*" * mult)
- print("(Propositional constants treated as nullary predicates)")
+ print('*' * mult)
+ print('(Propositional constants treated as nullary predicates)')
print()
print("Model m1:\n", m1)
- print("*" * mult)
+ print('*' * mult)
sentences = [
- "(P & Q)",
- "(P & R)",
- "- P",
- "- R",
- "- - P",
- "- (P & R)",
- "(P | R)",
- "(R | P)",
- "(R | R)",
- "(- P | R)",
- "(P | - P)",
- "(P -> Q)",
- "(P -> R)",
- "(R -> P)",
- "(P <-> P)",
- "(R <-> R)",
- "(P <-> R)",
+ '(P & Q)',
+ '(P & R)',
+ '- P',
+ '- R',
+ '- - P',
+ '- (P & R)',
+ '(P | R)',
+ '(R | P)',
+ '(R | R)',
+ '(- P | R)',
+ '(P | - P)',
+ '(P -> Q)',
+ '(P -> R)',
+ '(R -> P)',
+ '(P <-> P)',
+ '(R <-> R)',
+ '(P <-> R)',
]
for sent in sentences:
else:
print("The value of '%s' is: %s" % (sent, m1.evaluate(sent, g1)))
-
# Demo 2: FOL Model
#############
-
def folmodel(quiet=False, trace=None):
"""Example of a first-order model."""
global val2, v2, dom2, m2, g2
- v2 = [
- ("adam", "b1"),
- ("betty", "g1"),
- ("fido", "d1"),
- ("girl", set(["g1", "g2"])),
- ("boy", set(["b1", "b2"])),
- ("dog", set(["d1"])),
- ("love", set([("b1", "g1"), ("b2", "g2"), ("g1", "b1"), ("g2", "b1")])),
- ]
+ v2 = [('adam', 'b1'), ('betty', 'g1'), ('fido', 'd1'),\
+ ('girl', set(['g1', 'g2'])), ('boy', set(['b1', 'b2'])), ('dog', set(['d1'])),
+ ('love', set([('b1', 'g1'), ('b2', 'g2'), ('g1', 'b1'), ('g2', 'b1')]))]
val2 = Valuation(v2)
dom2 = val2.domain
m2 = Model(dom2, val2)
- g2 = Assignment(dom2, [("x", "b1"), ("y", "g2")])
+ g2 = Assignment(dom2, [('x', 'b1'), ('y', 'g2')])
if not quiet:
print()
- print("*" * mult)
+ print('*' * mult)
print("Models Demo")
print("*" * mult)
- print("Model m2:\n", "-" * 14, "\n", m2)
+ print("Model m2:\n", "-" * 14,"\n", m2)
print("Variable assignment = ", g2)
- exprs = ["adam", "boy", "love", "walks", "x", "y", "z"]
+ exprs = ['adam', 'boy', 'love', 'walks', 'x', 'y', 'z']
parsed_exprs = [Expression.fromstring(e) for e in exprs]
print()
for parsed in parsed_exprs:
try:
- print(
- "The interpretation of '%s' in m2 is %s"
- % (parsed, m2.i(parsed, g2))
- )
+ print("The interpretation of '%s' in m2 is %s" % (parsed, m2.i(parsed, g2)))
except Undefined:
print("The interpretation of '%s' in m2 is Undefined" % parsed)
- applications = [
- ("boy", ("adam")),
- ("walks", ("adam",)),
- ("love", ("adam", "y")),
- ("love", ("y", "adam")),
- ]
+
+ applications = [('boy', ('adam',)), ('walks', ('adam',)), ('love', ('adam', 'y')), ('love', ('y', 'adam'))]
for (fun, args) in applications:
try:
except Undefined:
print("%s(%s) evaluates to Undefined" % (fun, args))
-
# Demo 3: FOL
#########
-
def foldemo(trace=None):
"""
Interpretation of closed expressions in a first-order model.
folmodel(quiet=True)
print()
- print("*" * mult)
+ print('*' * mult)
print("FOL Formulas Demo")
- print("*" * mult)
+ print('*' * mult)
formulas = [
- "love (adam, betty)",
- "(adam = mia)",
- "\\x. (boy(x) | girl(x))",
- "\\x. boy(x)(adam)",
- "\\x y. love(x, y)",
- "\\x y. love(x, y)(adam)(betty)",
- "\\x y. love(x, y)(adam, betty)",
- "\\x y. (boy(x) & love(x, y))",
- "\\x. exists y. (boy(x) & love(x, y))",
- "exists z1. boy(z1)",
- "exists x. (boy(x) & -(x = adam))",
- "exists x. (boy(x) & all y. love(y, x))",
- "all x. (boy(x) | girl(x))",
- "all x. (girl(x) -> exists y. boy(y) & love(x, y))", # Every girl loves exists boy.
- "exists x. (boy(x) & all y. (girl(y) -> love(y, x)))", # There is exists boy that every girl loves.
- "exists x. (boy(x) & all y. (girl(y) -> love(x, y)))", # exists boy loves every girl.
- "all x. (dog(x) -> - girl(x))",
- "exists x. exists y. (love(x, y) & love(x, y))",
+ 'love (adam, betty)',
+ '(adam = mia)',
+ '\\x. (boy(x) | girl(x))',
+ '\\x. boy(x)(adam)',
+ '\\x y. love(x, y)',
+ '\\x y. love(x, y)(adam)(betty)',
+ '\\x y. love(x, y)(adam, betty)',
+ '\\x y. (boy(x) & love(x, y))',
+ '\\x. exists y. (boy(x) & love(x, y))',
+ 'exists z1. boy(z1)',
+ 'exists x. (boy(x) & -(x = adam))',
+ 'exists x. (boy(x) & all y. love(y, x))',
+ 'all x. (boy(x) | girl(x))',
+ 'all x. (girl(x) -> exists y. boy(y) & love(x, y))', #Every girl loves a boy.
+ 'exists x. (boy(x) & all y. (girl(y) -> love(y, x)))', #There is a boy whom every girl loves.
+ 'exists x. (boy(x) & all y. (girl(y) -> love(x, y)))', #There is a boy who loves every girl.
+ 'all x. (dog(x) -> - girl(x))',
+ 'exists x. exists y. (love(x, y) & love(x, y))'
]
+
for fmla in formulas:
g2.purge()
if trace:
# Demo 4: Satisfaction
#############
-
def satdemo(trace=None):
"""Satisfiers of an open formula in a first order model."""
print()
- print("*" * mult)
+ print('*' * mult)
print("Satisfiers Demo")
- print("*" * mult)
+ print('*' * mult)
folmodel(quiet=True)
formulas = [
- "boy(x)",
- "(x = x)",
- "(boy(x) | girl(x))",
- "(boy(x) & girl(x))",
- "love(adam, x)",
- "love(x, adam)",
- "-(x = adam)",
- "exists z22. love(x, z22)",
- "exists y. love(y, x)",
- "all y. (girl(y) -> love(x, y))",
- "all y. (girl(y) -> love(y, x))",
- "all y. (girl(y) -> (boy(x) & love(y, x)))",
- "(boy(x) & all y. (girl(y) -> love(x, y)))",
- "(boy(x) & all y. (girl(y) -> love(y, x)))",
- "(boy(x) & exists y. (girl(y) & love(y, x)))",
- "(girl(x) -> dog(x))",
- "all y. (dog(y) -> (x = y))",
- "exists y. love(y, x)",
- "exists y. (love(adam, y) & love(y, x))",
- ]
+ 'boy(x)',
+ '(x = x)',
+ '(boy(x) | girl(x))',
+ '(boy(x) & girl(x))',
+ 'love(adam, x)',
+ 'love(x, adam)',
+ '-(x = adam)',
+ 'exists z22. love(x, z22)',
+ 'exists y. love(y, x)',
+ 'all y. (girl(y) -> love(x, y))',
+ 'all y. (girl(y) -> love(y, x))',
+ 'all y. (girl(y) -> (boy(x) & love(y, x)))',
+ '(boy(x) & all y. (girl(y) -> love(x, y)))',
+ '(boy(x) & all y. (girl(y) -> love(y, x)))',
+ '(boy(x) & exists y. (girl(y) & love(y, x)))',
+ '(girl(x) -> dog(x))',
+ 'all y. (dog(y) -> (x = y))',
+ 'exists y. love(y, x)',
+ 'exists y. (love(adam, y) & love(y, x))'
+ ]
if trace:
print(m2)
for p in parsed:
g2.purge()
- print("The satisfiers of '%s' are: %s" % (p, m2.satisfiers(p, "x", g2, trace)))
+ print("The satisfiers of '%s' are: %s" % (p, m2.satisfiers(p, 'x', g2, trace)))
def demo(num=0, trace=None):
:param trace: trace = 1, or trace = 2 for more verbose tracing
"""
- demos = {1: propdemo, 2: folmodel, 3: foldemo, 4: satdemo}
+ demos = {
+ 1: propdemo,
+ 2: folmodel,
+ 3: foldemo,
+ 4: satdemo}
try:
demos[num](trace=trace)
#
# Author: Dan Garrette <dhgarrette@gmail.com>
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function, division, unicode_literals
import os
from itertools import chain
+from six import string_types
+
import nltk
from nltk.internals import Counter
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger, RegexpTagger
-from nltk.sem.logic import (
- Expression,
- Variable,
- VariableExpression,
- LambdaExpression,
- AbstractVariableExpression,
-)
+from nltk.sem.logic import (Expression, Variable, VariableExpression,
+ LambdaExpression, AbstractVariableExpression)
+from nltk.compat import python_2_unicode_compatible
from nltk.sem import drt
from nltk.sem import linearlogic
-SPEC_SEMTYPES = {
- "a": "ex_quant",
- "an": "ex_quant",
- "every": "univ_quant",
- "the": "def_art",
- "no": "no_quant",
- "default": "ex_quant",
-}
-
-OPTIONAL_RELATIONSHIPS = ["nmod", "vmod", "punct"]
+SPEC_SEMTYPES = {'a' : 'ex_quant',
+ 'an' : 'ex_quant',
+ 'every' : 'univ_quant',
+ 'the' : 'def_art',
+ 'no' : 'no_quant',
+ 'default' : 'ex_quant'}
+OPTIONAL_RELATIONSHIPS = ['nmod', 'vmod', 'punct']
+@python_2_unicode_compatible
class GlueFormula(object):
def __init__(self, meaning, glue, indices=None):
if not indices:
indices = set()
- if isinstance(meaning, str):
+ if isinstance(meaning, string_types):
self.meaning = Expression.fromstring(meaning)
elif isinstance(meaning, Expression):
self.meaning = meaning
else:
- raise RuntimeError(
- "Meaning term neither string or expression: %s, %s"
- % (meaning, meaning.__class__)
- )
+ raise RuntimeError('Meaning term neither string nor expression: %s, %s' % (meaning, meaning.__class__))
- if isinstance(glue, str):
+ if isinstance(glue, string_types):
self.glue = linearlogic.LinearLogicParser().parse(glue)
elif isinstance(glue, linearlogic.Expression):
self.glue = glue
else:
- raise RuntimeError(
- "Glue term neither string or expression: %s, %s"
- % (glue, glue.__class__)
- )
+ raise RuntimeError('Glue term neither string nor expression: %s, %s' % (glue, glue.__class__))
self.indices = indices
arg = (john , subj)
returns ((walk john), f)
"""
- if self.indices & arg.indices: # if the sets are NOT disjoint
- raise linearlogic.LinearLogicApplicationException(
- "'%s' applied to '%s'. Indices are not disjoint." % (self, arg)
- )
- else: # if the sets ARE disjoint
- return_indices = self.indices | arg.indices
+ if self.indices & arg.indices: # if the sets are NOT disjoint
+ raise linearlogic.LinearLogicApplicationException("'%s' applied to '%s'. Indices are not disjoint." % (self, arg))
+ else: # if the sets ARE disjoint
+ return_indices = (self.indices | arg.indices)
try:
- return_glue = linearlogic.ApplicationExpression(
- self.glue, arg.glue, arg.indices
- )
+ return_glue = linearlogic.ApplicationExpression(self.glue, arg.glue, arg.indices)
except linearlogic.LinearLogicApplicationException:
- raise linearlogic.LinearLogicApplicationException(
- "'%s' applied to '%s'" % (self.simplify(), arg.simplify())
- )
+ raise linearlogic.LinearLogicApplicationException("'%s' applied to '%s'" % (self.simplify(), arg.simplify()))
arg_meaning_abstracted = arg.meaning
if return_indices:
- for dep in self.glue.simplify().antecedent.dependencies[
- ::-1
- ]: # if self.glue is (A -o B), dep is in A.dependencies
- arg_meaning_abstracted = self.make_LambdaExpression(
- Variable("v%s" % dep), arg_meaning_abstracted
- )
+ for dep in self.glue.simplify().antecedent.dependencies[::-1]: # if self.glue is (A -o B), dep is in A.dependencies
+ arg_meaning_abstracted = self.make_LambdaExpression(Variable('v%s' % dep),
+ arg_meaning_abstracted)
return_meaning = self.meaning.applyto(arg_meaning_abstracted)
return self.__class__(return_meaning, return_glue, return_indices)
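# Sketch of what applyto computes for, e.g., "John walks":
#   \x.walks(x) : (subj -o f)   applied to   john : subj
#   ==> walks(john) : f
# Meanings combine by beta-reduction, glue terms by linear-logic modus
# ponens, and the index sets are unioned after the disjointness check.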
def lambda_abstract(self, other):
assert isinstance(other, GlueFormula)
assert isinstance(other.meaning, AbstractVariableExpression)
- return self.__class__(
- self.make_LambdaExpression(other.meaning.variable, self.meaning),
- linearlogic.ImpExpression(other.glue, self.glue),
- )
+ return self.__class__(self.make_LambdaExpression(other.meaning.variable,
+ self.meaning),
+ linearlogic.ImpExpression(other.glue, self.glue))
def compile(self, counter=None):
"""From Iddo Lev's PhD Dissertation p108-109"""
if not counter:
counter = Counter()
- (compiled_glue, new_forms) = self.glue.simplify().compile_pos(
- counter, self.__class__
- )
- return new_forms + [
- self.__class__(self.meaning, compiled_glue, set([counter.get()]))
- ]
+ (compiled_glue, new_forms) = self.glue.simplify().compile_pos(counter, self.__class__)
+ return new_forms + [self.__class__(self.meaning, compiled_glue, set([counter.get()]))]
def simplify(self):
- return self.__class__(
- self.meaning.simplify(), self.glue.simplify(), self.indices
- )
+ return self.__class__(self.meaning.simplify(), self.glue.simplify(), self.indices)
def __eq__(self, other):
- return (
- self.__class__ == other.__class__
- and self.meaning == other.meaning
- and self.glue == other.glue
- )
+ return self.__class__ == other.__class__ and self.meaning == other.meaning and self.glue == other.glue
def __ne__(self, other):
return not self == other
def __str__(self):
assert isinstance(self.indices, set)
- accum = "%s : %s" % (self.meaning, self.glue)
+ accum = '%s : %s' % (self.meaning, self.glue)
if self.indices:
- accum += " : {" + ", ".join(str(index) for index in self.indices) + "}"
+ accum += ' : {' + ', '.join(str(index) for index in self.indices) + '}'
return accum
def __repr__(self):
return "%s" % self
-
+@python_2_unicode_compatible
class GlueDict(dict):
def __init__(self, filename, encoding=None):
self.filename = filename
self.clear()
try:
- contents = nltk.data.load(
- self.filename, format="text", encoding=self.file_encoding
- )
+ contents = nltk.data.load(self.filename, format='text', encoding=self.file_encoding)
# TODO: the above can't handle zip files, but this should anyway be fixed in nltk.data.load()
except LookupError as e:
try:
- contents = nltk.data.load(
- "file:" + self.filename, format="text", encoding=self.file_encoding
- )
+ contents = nltk.data.load('file:' + self.filename, format='text', encoding=self.file_encoding)
except LookupError:
raise e
lines = contents.splitlines()
- for line in lines: # example: 'n : (\\x.(<word> x), (v-or))'
- # lambdacalc -^ linear logic -^
- line = line.strip() # remove trailing newline
- if not len(line):
- continue # skip empty lines
- if line[0] == "#":
- continue # skip commented out lines
+ for line in lines: # example: 'n : (\\x.(<word> x), (v-or))'
+ # lambdacalc -^ linear logic -^
+ line = line.strip() # remove trailing newline
+ if not len(line): continue # skip empty lines
+ if line[0] == '#': continue # skip commented out lines
- parts = line.split(
- " : ", 2
- ) # ['verb', '(\\x.(<word> x), ( subj -o f ))', '[subj]']
+ parts = line.split(' : ', 2) # ['verb', '(\\x.(<word> x), ( subj -o f ))', '[subj]']
glue_formulas = []
paren_count = 0
if len(parts) > 1:
for (i, c) in enumerate(parts[1]):
- if c == "(":
- if paren_count == 0: # if it's the first '(' of a tuple
- tuple_start = i + 1 # then save the index
+ if c == '(':
+ if paren_count == 0: # if it's the first '(' of a tuple
+ tuple_start = i+1 # then save the index
paren_count += 1
- elif c == ")":
+ elif c == ')':
paren_count -= 1
- if paren_count == 0: # if it's the last ')' of a tuple
- meaning_term = parts[1][
- tuple_start:tuple_comma
- ] # '\\x.(<word> x)'
- glue_term = parts[1][tuple_comma + 1 : i] # '(v-r)'
- glue_formulas.append(
- [meaning_term, glue_term]
- ) # add the GlueFormula to the list
- elif c == ",":
- if (
- paren_count == 1
- ): # if it's a comma separating the parts of the tuple
- tuple_comma = i # then save the index
- elif c == "#": # skip comments at the ends of lines
- if (
- paren_count != 0
- ): # if the line hasn't parsed correctly so far
- raise RuntimeError(
- "Formula syntax is incorrect for entry " + line
- )
- break # break to the next line
-
- if len(parts) > 2: # if there is a relationship entry at the end
- rel_start = parts[2].index("[") + 1
- rel_end = parts[2].index("]")
+ if paren_count == 0: # if it's the last ')' of a tuple
+ meaning_term = parts[1][tuple_start:tuple_comma] # '\\x.(<word> x)'
+ glue_term = parts[1][tuple_comma+1:i] # '(v-r)'
+ glue_formulas.append([meaning_term, glue_term]) # add the GlueFormula to the list
+ elif c == ',':
+ if paren_count == 1: # if it's a comma separating the parts of the tuple
+ tuple_comma = i # then save the index
+ elif c == '#': # skip comments at the ends of lines
+ if paren_count != 0: # if the line hasn't parsed correctly so far
+ raise RuntimeError('Formula syntax is incorrect for entry ' + line)
+ break # break to the next line
+
+ if len(parts) > 2: #if there is a relationship entry at the end
+ rel_start = parts[2].index('[')+1
+ rel_end = parts[2].index(']')
if rel_start == rel_end:
relationships = frozenset()
else:
- relationships = frozenset(
- r.strip() for r in parts[2][rel_start:rel_end].split(",")
- )
+ relationships = frozenset(r.strip() for r in parts[2][rel_start:rel_end].split(','))
try:
- start_inheritance = parts[0].index("(")
- end_inheritance = parts[0].index(")")
+ start_inheritance = parts[0].index('(')
+ end_inheritance = parts[0].index(')')
sem = parts[0][:start_inheritance].strip()
- supertype = parts[0][start_inheritance + 1 : end_inheritance]
+ supertype = parts[0][start_inheritance+1:end_inheritance]
except:
sem = parts[0].strip()
supertype = None
if sem not in self:
self[sem] = {}
- if (
- relationships is None
- ): # if not specified for a specific relationship set
- # add all relationship entries for parents
+ if relationships is None: #if not specified for a specific relationship set
+ #add all relationship entries for parents
if supertype:
for rels in self[supertype]:
if rels not in self[sem]:
self[sem][rels] = []
glue = self[supertype][rels]
self[sem][rels].extend(glue)
- self[sem][rels].extend(
- glue_formulas
- ) # add the glue formulas to every rel entry
+ self[sem][rels].extend(glue_formulas) # add the glue formulas to every rel entry
else:
if None not in self[sem]:
self[sem][None] = []
- self[sem][None].extend(
- glue_formulas
- ) # add the glue formulas to every rel entry
+ self[sem][None].extend(glue_formulas) # add the glue formulas to every rel entry
else:
if relationships not in self[sem]:
self[sem][relationships] = []
if supertype:
self[sem][relationships].extend(self[supertype][relationships])
- self[sem][relationships].extend(
- glue_formulas
- ) # add the glue entry to the dictionary
+ self[sem][relationships].extend(glue_formulas) # add the glue entry to the dictionary
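# For reference, the semtype entries parsed above have the shape
#   verb : (\x.(<word> x), (subj -o f)) : [subj]
# i.e. "semtype : (meaning, glue) tuples : [required dependent relations]",
# where the optional bracketed part selects the relationship-set key.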
def __str__(self):
- accum = ""
+ accum = ''
for pos in self:
str_pos = "%s" % pos
for relset in self[pos]:
i = 1
for gf in self[pos][relset]:
if i == 1:
- accum += str_pos + ": "
+ accum += str_pos + ': '
else:
- accum += " " * (len(str_pos) + 2)
+ accum += ' '*(len(str_pos)+2)
accum += "%s" % gf
if relset and i == len(self[pos][relset]):
- accum += " : %s" % relset
- accum += "\n"
+ accum += ' : %s' % relset
+ accum += '\n'
i += 1
return accum
if node is None:
# TODO: should it be depgraph.root? Is this code tested?
top = depgraph.nodes[0]
- depList = list(chain(*top["deps"].values()))
+ depList = list(chain(*top['deps'].values()))
root = depgraph.nodes[depList[0]]
return self.to_glueformula_list(depgraph, root, Counter(), verbose)
glueformulas = self.lookup(node, depgraph, counter)
- for dep_idx in chain(*node["deps"].values()):
+ for dep_idx in chain(*node['deps'].values()):
dep = depgraph.nodes[dep_idx]
- glueformulas.extend(
- self.to_glueformula_list(depgraph, dep, counter, verbose)
- )
+ glueformulas.extend(self.to_glueformula_list(depgraph, dep, counter, verbose))
return glueformulas
def lookup(self, node, depgraph, counter):
if not len(lookup):
raise KeyError(
"There is no GlueDict entry for sem type of '%s' "
- "with tag '%s', and rel '%s'" % (node["word"], node["tag"], node["rel"])
- )
+ "with tag '%s', and rel '%s'" %
+ (node['word'], node['tag'], node['rel'])
+ )
- return self.get_glueformulas_from_semtype_entry(
- lookup, node["word"], node, depgraph, counter
- )
+ return self.get_glueformulas_from_semtype_entry(lookup, node['word'], node, depgraph, counter)
def add_missing_dependencies(self, node, depgraph):
- rel = node["rel"].lower()
+ rel = node['rel'].lower()
- if rel == "main":
- headnode = depgraph.nodes[node["head"]]
- subj = self.lookup_unique("subj", headnode, depgraph)
- relation = subj["rel"]
- node["deps"].setdefault(relation, [])
- node["deps"][relation].append(subj["address"])
- # node['deps'].append(subj['address'])
+ if rel == 'main':
+ headnode = depgraph.nodes[node['head']]
+ subj = self.lookup_unique('subj', headnode, depgraph)
+ relation = subj['rel']
+ node['deps'].setdefault(relation,[])
+ node['deps'][relation].append(subj['address'])
+ #node['deps'].append(subj['address'])
def _lookup_semtype_option(self, semtype, node, depgraph):
relationships = frozenset(
- depgraph.nodes[dep]["rel"].lower()
- for dep in chain(*node["deps"].values())
- if depgraph.nodes[dep]["rel"].lower() not in OPTIONAL_RELATIONSHIPS
+ depgraph.nodes[dep]['rel'].lower()
+ for dep in chain(*node['deps'].values())
+ if depgraph.nodes[dep]['rel'].lower() not in OPTIONAL_RELATIONSHIPS
)
try:
# most relations of any possible relationship set that is a subset
# of the actual depgraph
best_match = frozenset()
- for relset_option in set(semtype) - set([None]):
- if (
- len(relset_option) > len(best_match)
- and relset_option < relationships
- ):
+ for relset_option in set(semtype)-set([None]):
+ if len(relset_option) > len(best_match) and \
+ relset_option < relationships:
best_match = relset_option
if not best_match:
if None in semtype:
Based on the node, return a list of plausible semtypes in order of
plausibility.
"""
- rel = node["rel"].lower()
- word = node["word"].lower()
+ rel = node['rel'].lower()
+ word = node['word'].lower()
- if rel == "spec":
+ if rel == 'spec':
if word in SPEC_SEMTYPES:
return [SPEC_SEMTYPES[word]]
else:
- return [SPEC_SEMTYPES["default"]]
- elif rel in ["nmod", "vmod"]:
- return [node["tag"], rel]
+ return [SPEC_SEMTYPES['default']]
+ elif rel in ['nmod', 'vmod']:
+ return [node['tag'], rel]
else:
- return [node["tag"]]
+ return [node['tag']]
- def get_glueformulas_from_semtype_entry(
- self, lookup, word, node, depgraph, counter
- ):
+ def get_glueformulas_from_semtype_entry(self, lookup, word, node, depgraph, counter):
glueformulas = []
glueFormulaFactory = self.get_GlueFormula_factory()
if not len(glueformulas):
gf.word = word
else:
- gf.word = "%s%s" % (word, len(glueformulas) + 1)
+ gf.word = '%s%s' % (word, len(glueformulas)+1)
gf.glue = self.initialize_labels(gf.glue, node, depgraph, counter.get())
parameter "<word>"
:param word: The actual word to replace "<word>"
"""
- word = word.replace(".", "")
- return generic.replace("<word>", word)
+ word = word.replace('.', '')
+ return generic.replace('<word>', word)
def initialize_labels(self, expr, node, depgraph, unique_index):
if isinstance(expr, linearlogic.AtomicExpression):
else:
return linearlogic.ImpExpression(
self.initialize_labels(expr.antecedent, node, depgraph, unique_index),
- self.initialize_labels(expr.consequent, node, depgraph, unique_index),
+ self.initialize_labels(expr.consequent, node, depgraph, unique_index)
)
def find_label_name(self, name, node, depgraph, unique_index):
try:
- dot = name.index(".")
+ dot = name.index('.')
before_dot = name[:dot]
- after_dot = name[dot + 1 :]
- if before_dot == "super":
- return self.find_label_name(
- after_dot, depgraph.nodes[node["head"]], depgraph, unique_index
- )
+ after_dot = name[dot+1:]
+ if before_dot == 'super':
+ return self.find_label_name(after_dot, depgraph.nodes[node['head']], depgraph, unique_index)
else:
- return self.find_label_name(
- after_dot,
- self.lookup_unique(before_dot, node, depgraph),
- depgraph,
- unique_index,
- )
+ return self.find_label_name(after_dot, self.lookup_unique(before_dot, node, depgraph), depgraph, unique_index)
except ValueError:
lbl = self.get_label(node)
- if name == "f":
+ if name == 'f':
return lbl
- elif name == "v":
- return "%sv" % lbl
- elif name == "r":
- return "%sr" % lbl
- elif name == "super":
- return self.get_label(depgraph.nodes[node["head"]])
- elif name == "var":
- return "%s%s" % (lbl.upper(), unique_index)
- elif name == "a":
- return self.get_label(self.lookup_unique("conja", node, depgraph))
- elif name == "b":
- return self.get_label(self.lookup_unique("conjb", node, depgraph))
+ elif name == 'v':
+ return '%sv' % lbl
+ elif name == 'r':
+ return '%sr' % lbl
+ elif name == 'super':
+ return self.get_label(depgraph.nodes[node['head']])
+ elif name == 'var':
+ return '%s%s' % (lbl.upper(), unique_index)
+ elif name == 'a':
+ return self.get_label(self.lookup_unique('conja', node, depgraph))
+ elif name == 'b':
+ return self.get_label(self.lookup_unique('conjb', node, depgraph))
else:
return self.get_label(self.lookup_unique(name, node, depgraph))
:param value: where to index into the list of characters
:type value: int
"""
- value = node["address"]
-
- letter = [
- "f",
- "g",
- "h",
- "i",
- "j",
- "k",
- "l",
- "m",
- "n",
- "o",
- "p",
- "q",
- "r",
- "s",
- "t",
- "u",
- "v",
- "w",
- "x",
- "y",
- "z",
- "a",
- "b",
- "c",
- "d",
- "e",
- ][value - 1]
+ value = node['address']
+
+ letter = ['f','g','h','i','j','k','l','m','n','o','p','q','r','s',
+ 't','u','v','w','x','y','z','a','b','c','d','e'][value-1]
num = int(value) // 26
if num > 0:
return letter + str(num)
"""
deps = [
depgraph.nodes[dep]
- for dep in chain(*node["deps"].values())
- if depgraph.nodes[dep]["rel"].lower() == rel.lower()
+ for dep in chain(*node['deps'].values())
+ if depgraph.nodes[dep]['rel'].lower() == rel.lower()
]
if len(deps) == 0:
- raise KeyError("'%s' doesn't contain a feature '%s'" % (node["word"], rel))
+ raise KeyError("'%s' doesn't contain a feature '%s'" % (node['word'], rel))
elif len(deps) > 1:
- raise KeyError(
- "'%s' should only have one feature '%s'" % (node["word"], rel)
- )
+ raise KeyError("'%s' should only have one feature '%s'" % (node['word'], rel))
else:
return deps[0]
class Glue(object):
- def __init__(
- self, semtype_file=None, remove_duplicates=False, depparser=None, verbose=False
- ):
+ def __init__(self, semtype_file=None, remove_duplicates=False,
+ depparser=None, verbose=False):
self.verbose = verbose
self.remove_duplicates = remove_duplicates
self.depparser = depparser
from nltk import Prover9
-
self.prover = Prover9()
if semtype_file:
self.semtype_file = semtype_file
else:
- self.semtype_file = os.path.join(
- "grammars", "sample_grammars", "glue.semtype"
- )
+ self.semtype_file = os.path.join('grammars', 'sample_grammars','glue.semtype')
def train_depparser(self, depgraphs=None):
if depgraphs:
self.depparser.train(depgraphs)
else:
- self.depparser.train_from_file(
- nltk.data.find(
- os.path.join("grammars", "sample_grammars", "glue_train.conll")
- )
- )
+ self.depparser.train_from_file(nltk.data.find(
+ os.path.join('grammars', 'sample_grammars',
+ 'glue_train.conll')))
def parse_to_meaning(self, sentence):
readings = []
agenda_length = len(agenda)
atomics = dict()
nonatomics = dict()
- while agenda: # is not empty
+ while agenda: # is not empty
cur = agenda.pop()
glue_simp = cur.glue.simplify()
- if isinstance(
- glue_simp, linearlogic.ImpExpression
- ): # if cur.glue is non-atomic
+ if isinstance(glue_simp, linearlogic.ImpExpression): # if cur.glue is non-atomic
for key in atomics:
try:
if isinstance(cur.glue, linearlogic.ApplicationExpression):
bindings = linearlogic.BindingDict()
glue_simp.antecedent.unify(key, bindings)
for atomic in atomics[key]:
- if not (
- cur.indices & atomic.indices
- ): # if the sets of indices are disjoint
+ if not (cur.indices & atomic.indices): # if the sets of indices are disjoint
try:
agenda.append(cur.applyto(atomic))
except linearlogic.LinearLogicApplicationException:
except KeyError:
nonatomics[glue_simp.antecedent] = [cur]
- else: # else cur.glue is atomic
+ else: # else cur.glue is atomic
for key in nonatomics:
for nonatomic in nonatomics[key]:
try:
- if isinstance(
- nonatomic.glue, linearlogic.ApplicationExpression
- ):
+ if isinstance(nonatomic.glue, linearlogic.ApplicationExpression):
bindings = nonatomic.glue.bindings
else:
bindings = linearlogic.BindingDict()
glue_simp.unify(key, bindings)
- if not (
- cur.indices & nonatomic.indices
- ): # if the sets of indices are disjoint
+ if not (cur.indices & nonatomic.indices): # if the sets of indices are disjoint
try:
agenda.append(nonatomic.applyto(cur))
except linearlogic.LinearLogicApplicationException:
add_reading = False
break
except Exception as e:
- # if there is an exception, the syntax of the formula
- # may not be understandable by the prover, so don't
- # throw out the reading.
- print("Error when checking logical equality of statements", e)
-
+ #if there is an exception, the syntax of the formula
+ #may not be understandable by the prover, so don't
+ #throw out the reading.
+ print('Error when checking logical equality of statements', e)
+ pass
if add_reading:
reading_list.append(glueformula.meaning)
:rtype: DependencyGraph
"""
- # Lazy-initialize the depparser
+ #Lazy-initialize the depparser
if self.depparser is None:
from nltk.parse import MaltParser
-
self.depparser = MaltParser(tagger=self.get_pos_tagger())
if not self.depparser._trained:
self.train_depparser()
return_list.extend(gf.compile(index_counter))
if self.verbose:
- print("Compiled Glue Premises:")
+ print('Compiled Glue Premises:')
for cgf in return_list:
print(cgf)
def get_pos_tagger(self):
from nltk.corpus import brown
-
regexp_tagger = RegexpTagger(
- [
- (r"^-?[0-9]+(.[0-9]+)?$", "CD"), # cardinal numbers
- (r"(The|the|A|a|An|an)$", "AT"), # articles
- (r".*able$", "JJ"), # adjectives
- (r".*ness$", "NN"), # nouns formed from adjectives
- (r".*ly$", "RB"), # adverbs
- (r".*s$", "NNS"), # plural nouns
- (r".*ing$", "VBG"), # gerunds
- (r".*ed$", "VBD"), # past tense verbs
- (r".*", "NN"), # nouns (default)
- ]
- )
- brown_train = brown.tagged_sents(categories="news")
+ [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
+ (r'(The|the|A|a|An|an)$', 'AT'), # articles
+ (r'.*able$', 'JJ'), # adjectives
+ (r'.*ness$', 'NN'), # nouns formed from adjectives
+ (r'.*ly$', 'RB'), # adverbs
+ (r'.*s$', 'NNS'), # plural nouns
+ (r'.*ing$', 'VBG'), # gerunds
+ (r'.*ed$', 'VBD'), # past tense verbs
+ (r'.*', 'NN') # nouns (default)
+ ])
+ brown_train = brown.tagged_sents(categories='news')
unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)
- # Override particular words
+ #Override particular words
main_tagger = RegexpTagger(
- [(r"(A|a|An|an)$", "ex_quant"), (r"(Every|every|All|all)$", "univ_quant")],
- backoff=trigram_tagger,
- )
+ [(r'(A|a|An|an)$', 'ex_quant'),
+ (r'(Every|every|All|all)$', 'univ_quant')
+ ], backoff=trigram_tagger)
return main_tagger
if not indices:
indices = set()
- if isinstance(meaning, str):
+ if isinstance(meaning, string_types):
self.meaning = drt.DrtExpression.fromstring(meaning)
elif isinstance(meaning, drt.DrtExpression):
self.meaning = meaning
else:
- raise RuntimeError(
- "Meaning term neither string or expression: %s, %s"
- % (meaning, meaning.__class__)
- )
+ raise RuntimeError('Meaning term neither string nor expression: %s, %s' % (meaning, meaning.__class__))
- if isinstance(glue, str):
+ if isinstance(glue, string_types):
self.glue = linearlogic.LinearLogicParser().parse(glue)
elif isinstance(glue, linearlogic.Expression):
self.glue = glue
else:
- raise RuntimeError(
- "Glue term neither string or expression: %s, %s"
- % (glue, glue.__class__)
- )
+ raise RuntimeError('Glue term neither string nor expression: %s, %s' % (glue, glue.__class__))
self.indices = indices
def make_LambdaExpression(self, variable, term):
return drt.DrtLambdaExpression(variable, term)
-
class DrtGlueDict(GlueDict):
def get_GlueFormula_factory(self):
return DrtGlueFormula
-
class DrtGlue(Glue):
- def __init__(
- self, semtype_file=None, remove_duplicates=False, depparser=None, verbose=False
- ):
+ def __init__(self, semtype_file=None, remove_duplicates=False,
+ depparser=None, verbose=False):
if not semtype_file:
- semtype_file = os.path.join(
- "grammars", "sample_grammars", "drt_glue.semtype"
- )
+ semtype_file = os.path.join('grammars', 'sample_grammars','drt_glue.semtype')
Glue.__init__(self, semtype_file, remove_duplicates, depparser, verbose)
def get_glue_dict(self):
def demo(show_example=-1):
from nltk.parse import MaltParser
-
- examples = [
- "David sees Mary",
- "David eats a sandwich",
- "every man chases a dog",
- "every man believes a dog sleeps",
- "John gives David a sandwich",
- "John chases himself",
- ]
- # 'John persuades David to order a pizza',
- # 'John tries to go',
- # 'John tries to find a unicorn',
- # 'John seems to vanish',
- # 'a unicorn seems to approach',
- # 'every big cat leaves',
- # 'every gray cat leaves',
- # 'every big gray cat leaves',
- # 'a former senator leaves',
-
- print("============== DEMO ==============")
+ examples = ['David sees Mary',
+ 'David eats a sandwich',
+ 'every man chases a dog',
+ 'every man believes a dog sleeps',
+ 'John gives David a sandwich',
+ 'John chases himself']
+# 'John persuades David to order a pizza',
+# 'John tries to go',
+# 'John tries to find a unicorn',
+# 'John seems to vanish',
+# 'a unicorn seems to approach',
+# 'every big cat leaves',
+# 'every gray cat leaves',
+# 'every big gray cat leaves',
+# 'a former senator leaves',
+
+ print('============== DEMO ==============')
tagger = RegexpTagger(
- [
- ("^(David|Mary|John)$", "NNP"),
- (
- "^(sees|eats|chases|believes|gives|sleeps|chases|persuades|tries|seems|leaves)$",
- "VB",
- ),
- ("^(go|order|vanish|find|approach)$", "VB"),
- ("^(a)$", "ex_quant"),
- ("^(every)$", "univ_quant"),
- ("^(sandwich|man|dog|pizza|unicorn|cat|senator)$", "NN"),
- ("^(big|gray|former)$", "JJ"),
- ("^(him|himself)$", "PRP"),
- ]
- )
+ [('^(David|Mary|John)$', 'NNP'),
+ ('^(sees|eats|chases|believes|gives|sleeps|persuades|tries|seems|leaves)$', 'VB'),
+ ('^(go|order|vanish|find|approach)$', 'VB'),
+ ('^(a)$', 'ex_quant'),
+ ('^(every)$', 'univ_quant'),
+ ('^(sandwich|man|dog|pizza|unicorn|cat|senator)$', 'NN'),
+ ('^(big|gray|former)$', 'JJ'),
+ ('^(him|himself)$', 'PRP')
+ ])
depparser = MaltParser(tagger=tagger)
glue = Glue(depparser=depparser, verbose=False)
for (i, sentence) in enumerate(examples):
- if i == show_example or show_example == -1:
- print("[[[Example %s]]] %s" % (i, sentence))
+ if i==show_example or show_example==-1:
+ print('[[[Example %s]]] %s' % (i, sentence))
for reading in glue.parse_to_meaning(sentence.split()):
print(reading.simplify())
- print("")
+ print('')
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# Author: Peter Wang
# Updated by: Dan Garrette <dhgarrette@gmail.com>
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT
representation that is not easy to read. We use a "plugging" algorithm to
convert that representation into first-order logic formulas.
"""
+from __future__ import print_function, unicode_literals
from functools import reduce
+from six import itervalues
+
+from nltk import compat
from nltk.parse import load_parser
from nltk.sem.skolemize import skolemize
-from nltk.sem.logic import (
- AllExpression,
- AndExpression,
- ApplicationExpression,
- ExistsExpression,
- IffExpression,
- ImpExpression,
- LambdaExpression,
- NegatedExpression,
- OrExpression,
-)
+from nltk.sem.logic import (AllExpression, AndExpression, ApplicationExpression,
+ ExistsExpression, IffExpression, ImpExpression,
+ LambdaExpression, NegatedExpression, OrExpression)
# Note that in this code there may be multiple types of trees being referred to:
# 4. the search space when plugging (search tree)
#
-
class Constants(object):
- ALL = "ALL"
- EXISTS = "EXISTS"
- NOT = "NOT"
- AND = "AND"
- OR = "OR"
- IMP = "IMP"
- IFF = "IFF"
- PRED = "PRED"
- LEQ = "LEQ"
- HOLE = "HOLE"
- LABEL = "LABEL"
-
- MAP = {
- ALL: lambda v, e: AllExpression(v.variable, e),
- EXISTS: lambda v, e: ExistsExpression(v.variable, e),
- NOT: NegatedExpression,
- AND: AndExpression,
- OR: OrExpression,
- IMP: ImpExpression,
- IFF: IffExpression,
- PRED: ApplicationExpression,
- }
+ ALL = 'ALL'
+ EXISTS = 'EXISTS'
+ NOT = 'NOT'
+ AND = 'AND'
+ OR = 'OR'
+ IMP = 'IMP'
+ IFF = 'IFF'
+ PRED = 'PRED'
+ LEQ = 'LEQ'
+ HOLE = 'HOLE'
+ LABEL = 'LABEL'
+
+ MAP = {ALL: lambda v, e: AllExpression(v.variable, e),
+ EXISTS: lambda v, e: ExistsExpression(v.variable, e),
+ NOT: NegatedExpression,
+ AND: AndExpression,
+ OR: OrExpression,
+ IMP: ImpExpression,
+ IFF: IffExpression,
+ PRED: ApplicationExpression}
class HoleSemantics(object):
then provides some operations on the semantics dealing with holes, labels
and finding legal ways to plug holes with labels.
"""
-
def __init__(self, usr):
"""
Constructor. `usr' is a ``sem.Expression`` representing an
def _find_top_nodes(self, node_list):
top_nodes = node_list.copy()
- for f in self.fragments.values():
+ for f in itervalues(self.fragments):
# the label is the first argument of the predicate
args = f[1]
for arg in args:
(node, ancestors) = queue[0]
if node in self.holes:
# The node is a hole, try to plug it.
- self._plug_hole(
- node, ancestors, queue[1:], potential_labels, plug_acc, record
- )
+ self._plug_hole(node, ancestors, queue[1:], potential_labels, plug_acc, record)
else:
assert node in self.labels
# The node is a label. Replace it in the queue by the holes and
head = [(a, ancestors) for a in args if self.is_node(a)]
self._plug_nodes(head + queue[1:], potential_labels, plug_acc, record)
else:
- raise Exception("queue empty")
+ raise Exception('queue empty')
- def _plug_hole(self, hole, ancestors0, queue, potential_labels0, plug_acc0, record):
+ def _plug_hole(self, hole, ancestors0, queue, potential_labels0,
+ plug_acc0, record):
"""
Try all possible ways of plugging a single hole.
See _plug_nodes for the meanings of the parameters.
# before filling level i+1.
# A depth-first search would work as well since the trees must
# be finite but the bookkeeping would be harder.
- self._plug_nodes(
- queue + [(l, ancestors)], potential_labels, plug_acc, record
- )
+ self._plug_nodes(queue + [(l, ancestors)], potential_labels, plug_acc, record)
def _violates_constraints(self, label, ancestors):
"""
return node
+@compat.python_2_unicode_compatible
class Constraint(object):
"""
This class represents a constraint of the form (L =< N),
where L is a label and N is a node (a label or a hole).
"""
-
def __init__(self, lhs, rhs):
self.lhs = lhs
self.rhs = rhs
return hash(repr(self))
def __repr__(self):
- return "(%s < %s)" % (self.lhs, self.rhs)
+ return '(%s < %s)' % (self.lhs, self.rhs)
def hole_readings(sentence, grammar_filename=None, verbose=False):
if not grammar_filename:
- grammar_filename = "grammars/sample_grammars/hole.fcfg"
+ grammar_filename = 'grammars/sample_grammars/hole.fcfg'
if verbose:
- print("Reading grammar file", grammar_filename)
+ print('Reading grammar file', grammar_filename)
parser = load_parser(grammar_filename)
tokens = sentence.split()
trees = list(parser.parse(tokens))
if verbose:
- print("Got %d different parses" % len(trees))
+ print('Got %d different parses' % len(trees))
all_readings = []
for tree in trees:
# Get the semantic feature from the top of the parse tree.
- sem = tree.label()["SEM"].simplify()
+ sem = tree.label()['SEM'].simplify()
# Print the raw semantic representation.
if verbose:
- print("Raw: ", sem)
+ print('Raw: ', sem)
# Skolemize away all quantifiers. All variables become unique.
while isinstance(sem, LambdaExpression):
skolemized = skolemize(sem)
if verbose:
- print("Skolemized:", skolemized)
+ print('Skolemized:', skolemized)
# Break the hole semantics representation down into its components
# i.e. holes, labels, formula fragments and constraints.
# Maybe show the details of the semantic representation.
if verbose:
- print("Holes: ", hole_sem.holes)
- print("Labels: ", hole_sem.labels)
- print("Constraints: ", hole_sem.constraints)
- print("Top hole: ", hole_sem.top_hole)
- print("Top labels: ", hole_sem.top_most_labels)
- print("Fragments:")
+ print('Holes: ', hole_sem.holes)
+ print('Labels: ', hole_sem.labels)
+ print('Constraints: ', hole_sem.constraints)
+ print('Top hole: ', hole_sem.top_hole)
+ print('Top labels: ', hole_sem.top_most_labels)
+ print('Fragments:')
for l, f in hole_sem.fragments.items():
- print("\t%s: %s" % (l, f))
+ print('\t%s: %s' % (l, f))
# Find all the possible ways to plug the formulas together.
pluggings = hole_sem.pluggings()
if verbose:
for i, r in enumerate(readings):
print()
- print("%d. %s" % (i, r))
+ print('%d. %s' % (i, r))
print()
all_readings.extend(readings)
return all_readings
-if __name__ == "__main__":
- for r in hole_readings("a dog barks"):
+if __name__ == '__main__':
+ for r in hole_readings('a dog barks'):
print(r)
print()
- for r in hole_readings("every girl chases a dog"):
+ for r in hole_readings('every girl chases a dog'):
print(r)
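# A sketch of the expected output: 'a dog barks' has a single reading,
# schematically exists x.(dog(x) & bark(x)), while 'every girl chases a dog'
# yields two readings, one per relative scope of the two quantifiers
# (every >> a and a >> every). Predicate names and variable numbering depend
# on the sample grammar and the skolemization counter.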
#
# Author: Dan Garrette <dhgarrette@gmail.com>
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function, division, unicode_literals
from itertools import chain
from nltk.internals import Counter
+from nltk.compat import python_2_unicode_compatible
+@python_2_unicode_compatible
class FStructure(dict):
def safeappend(self, key, item):
"""
def to_depgraph(self, rel=None):
from nltk.parse.dependencygraph import DependencyGraph
-
depgraph = DependencyGraph()
nodes = depgraph.nodes
- self._to_depgraph(nodes, 0, "ROOT")
+ self._to_depgraph(nodes, 0, 'ROOT')
# Add all the dependencies for all the nodes
for address, node in nodes.items():
- for n2 in (n for n in nodes.values() if n["rel"] != "TOP"):
- if n2["head"] == address:
- relation = n2["rel"]
- node["deps"].setdefault(relation, [])
- node["deps"][relation].append(n2["address"])
+ for n2 in (n for n in nodes.values() if n['rel'] != 'TOP'):
+ if n2['head'] == address:
+ relation = n2['rel']
+ node['deps'].setdefault(relation,[])
+ node['deps'][relation].append(n2['address'])
depgraph.root = nodes[1]
nodes[index].update(
{
- "address": index,
- "word": self.pred[0],
- "tag": self.pred[1],
- "head": head,
- "rel": rel,
+ 'address': index,
+ 'word': self.pred[0],
+ 'tag': self.pred[1],
+ 'head': head,
+ 'rel': rel,
}
)
new_index = len(nodes)
nodes[new_index].update(
{
- "address": new_index,
- "word": item[0],
- "tag": item[1],
- "head": index,
- "rel": feature,
+ 'address': new_index,
+ 'word': item[0],
+ 'tag': item[1],
+ 'head': index,
+ 'rel': feature,
}
)
elif isinstance(item, list):
for n in item:
n._to_depgraph(nodes, index, feature)
else:
- raise Exception(
- "feature %s is not an FStruct, a list, or a tuple" % feature
- )
+ raise Exception('feature %s is not an FStruct, a list, or a tuple' % feature)
@staticmethod
def read_depgraph(depgraph):
if not label_counter:
label_counter = Counter()
- if node["rel"].lower() in ["spec", "punct"]:
+ if node['rel'].lower() in ['spec', 'punct']:
# the value of a 'spec' entry is a word, not an FStructure
- return (node["word"], node["tag"])
+ return (node['word'], node['tag'])
else:
fstruct = FStructure()
fstruct.parent = parent
- word, tag = node["word"], node["tag"]
- if tag[:2] == "VB":
- if tag[2:3] == "D":
- fstruct.safeappend("tense", ("PAST", "tense"))
+ word, tag = node['word'], node['tag']
+ if tag[:2] == 'VB':
+ if tag[2:3] == 'D':
+ fstruct.safeappend('tense', ('PAST', 'tense'))
fstruct.pred = (word, tag[:2])
if not fstruct.pred:
fstruct.pred = (word, tag)
- children = [depgraph.nodes[idx] for idx in chain(*node["deps"].values())]
+ children = [depgraph.nodes[idx] for idx in chain(*node['deps'].values())]
for child in children:
- fstruct.safeappend(
- child["rel"],
- FStructure._read_depgraph(child, depgraph, label_counter, fstruct),
- )
+ fstruct.safeappend(child['rel'], FStructure._read_depgraph(child, depgraph, label_counter, fstruct))
return fstruct
:param value: where to index into the list of characters
:type value: int
"""
- letter = [
- "f",
- "g",
- "h",
- "i",
- "j",
- "k",
- "l",
- "m",
- "n",
- "o",
- "p",
- "q",
- "r",
- "s",
- "t",
- "u",
- "v",
- "w",
- "x",
- "y",
- "z",
- "a",
- "b",
- "c",
- "d",
- "e",
- ][value - 1]
+ letter = ['f','g','h','i','j','k','l','m','n','o','p','q','r','s',
+ 't','u','v','w','x','y','z','a','b','c','d','e'][value-1]
num = int(value) // 26
if num > 0:
return letter + str(num)
return letter
def __repr__(self):
- return self.__str__().replace("\n", "")
+ return self.__unicode__().replace('\n', '')
def __str__(self):
return self.pretty_format()
def pretty_format(self, indent=3):
try:
- accum = "%s:[" % self.label
+ accum = '%s:[' % self.label
except NameError:
- accum = "["
+ accum = '['
try:
- accum += "pred '%s'" % (self.pred[0])
+ accum += 'pred \'%s\'' % (self.pred[0])
except NameError:
pass
for feature in sorted(self):
for item in self[feature]:
if isinstance(item, FStructure):
- next_indent = indent + len(feature) + 3 + len(self.label)
- accum += "\n%s%s %s" % (
- " " * (indent),
- feature,
- item.pretty_format(next_indent),
- )
+ next_indent = indent+len(feature)+3+len(self.label)
+ accum += '\n%s%s %s' % (' '*(indent), feature, item.pretty_format(next_indent))
elif isinstance(item, tuple):
- accum += "\n%s%s '%s'" % (" " * (indent), feature, item[0])
+ accum += '\n%s%s \'%s\'' % (' '*(indent), feature, item[0])
elif isinstance(item, list):
- accum += "\n%s%s {%s}" % (
- " " * (indent),
- feature,
- ("\n%s" % (" " * (indent + len(feature) + 2))).join(item),
- )
- else: # ERROR
- raise Exception(
- "feature %s is not an FStruct, a list, or a tuple" % feature
- )
- return accum + "]"
+ accum += '\n%s%s {%s}' % (' '*(indent), feature, ('\n%s' % (' '*(indent+len(feature)+2))).join(item))
+ else: # ERROR
+ raise Exception('feature %s is not an FStruct, a list, or a tuple' % feature)
+ return accum+']'
+
def demo_read_depgraph():
from nltk.parse.dependencygraph import DependencyGraph
-
- dg1 = DependencyGraph(
- """\
+ dg1 = DependencyGraph("""\
Esso NNP 2 SUB
said VBD 0 ROOT
the DT 5 NMOD
started VBD 2 VMOD
production NN 6 OBJ
Tuesday NNP 6 VMOD
-"""
- )
- dg2 = DependencyGraph(
- """\
+""")
+ dg2 = DependencyGraph("""\
John NNP 2 SUB
sees VBP 0 ROOT
Mary NNP 2 OBJ
-"""
- )
- dg3 = DependencyGraph(
- """\
+""")
+ dg3 = DependencyGraph("""\
a DT 2 SPEC
man NN 3 SUBJ
walks VB 0 ROOT
-"""
- )
- dg4 = DependencyGraph(
- """\
+""")
+ dg4 = DependencyGraph("""\
every DT 2 SPEC
girl NN 3 SUBJ
chases VB 0 ROOT
a DT 5 SPEC
dog NN 3 OBJ
-"""
- )
+""")
- depgraphs = [dg1, dg2, dg3, dg4]
+ depgraphs = [dg1,dg2,dg3,dg4]
for dg in depgraphs:
print(FStructure.read_depgraph(dg))
-
-if __name__ == "__main__":
+if __name__ == '__main__':
demo_read_depgraph()
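# A rough sketch of what the demo prints for dg3 ('a man walks'); labels are
# assigned from a per-call counter, so the letters may differ:
#
#   f:[pred 'walks'
#      SUBJ g:[pred 'man'
#              SPEC 'a']]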
#
# Author: Dan Garrette <dhgarrette@gmail.com>
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals
+
+from six import string_types
from nltk.internals import Counter
+from nltk.compat import python_2_unicode_compatible
from nltk.sem.logic import LogicParser, APP
_counter = Counter()
-
class Tokens(object):
- # Punctuation
- OPEN = "("
- CLOSE = ")"
+ #Punctuation
+ OPEN = '('
+ CLOSE = ')'
- # Operations
- IMP = "-o"
+ #Operations
+ IMP = '-o'
PUNCT = [OPEN, CLOSE]
TOKENS = PUNCT + [IMP]
-
class LinearLogicParser(LogicParser):
"""A linear logic expression parser."""
-
def __init__(self):
LogicParser.__init__(self)
argument expression."""
if self.has_priority(APP, context):
if self.inRange(0) and self.token(0) == Tokens.OPEN:
- self.token() # swallow then open paren
+ self.token() #swallow the open paren
argument = self.process_next_expression(APP)
self.assertNextToken(Tokens.CLOSE)
expression = ApplicationExpression(expression, argument, None)
return ConstantExpression(name)
+@python_2_unicode_compatible
class Expression(object):
_linear_logic_parser = LinearLogicParser()
return self.applyto(other)
def __repr__(self):
- return "<%s %s>" % (self.__class__.__name__, self)
+ return '<%s %s>' % (self.__class__.__name__, self)
+@python_2_unicode_compatible
class AtomicExpression(Expression):
def __init__(self, name, dependencies=None):
"""
:param name: str for the constant name
:param dependencies: list of int for the indices on which this atom is dependent
"""
- assert isinstance(name, str)
+ assert isinstance(name, string_types)
self.name = name
if not dependencies:
def __hash__(self):
return hash(self.name)
-
class ConstantExpression(AtomicExpression):
def unify(self, other, bindings):
"""
return bindings
raise UnificationException(self, other, bindings)
-
class VariableExpression(AtomicExpression):
def unify(self, other, bindings):
"""
except VariableBindingException:
raise UnificationException(self, other, bindings)
-
+@python_2_unicode_compatible
class ImpExpression(Expression):
def __init__(self, antecedent, consequent):
"""
self.consequent = consequent
def simplify(self, bindings=None):
- return self.__class__(
- self.antecedent.simplify(bindings), self.consequent.simplify(bindings)
- )
+ return self.__class__(self.antecedent.simplify(bindings), self.consequent.simplify(bindings))
def unify(self, other, bindings):
"""
"""
assert isinstance(other, ImpExpression)
try:
- return (
- bindings
- + self.antecedent.unify(other.antecedent, bindings)
- + self.consequent.unify(other.consequent, bindings)
- )
+ return bindings + self.antecedent.unify(other.antecedent, bindings) + self.consequent.unify(other.consequent, bindings)
except VariableBindingException:
raise UnificationException(self, other, bindings)
"""
(a, a_new) = self.antecedent.compile_neg(index_counter, glueFormulaFactory)
(c, c_new) = self.consequent.compile_pos(index_counter, glueFormulaFactory)
- return (ImpExpression(a, c), a_new + c_new)
+ return (ImpExpression(a,c), a_new + c_new)
def compile_neg(self, index_counter, glueFormulaFactory):
"""
(c, c_new) = self.consequent.compile_neg(index_counter, glueFormulaFactory)
fresh_index = index_counter.get()
c.dependencies.append(fresh_index)
- new_v = glueFormulaFactory("v%s" % fresh_index, a, set([fresh_index]))
+ new_v = glueFormulaFactory('v%s' % fresh_index, a, set([fresh_index]))
return (c, a_new + c_new + [new_v])
def initialize_labels(self, fstruct):
self.consequent.initialize_labels(fstruct)
def __eq__(self, other):
- return (
- self.__class__ == other.__class__
- and self.antecedent == other.antecedent
- and self.consequent == other.consequent
- )
+ return self.__class__ == other.__class__ and \
+ self.antecedent == other.antecedent and self.consequent == other.consequent
def __ne__(self, other):
return not self == other
def __str__(self):
return "%s%s %s %s%s" % (
- Tokens.OPEN,
- self.antecedent,
- Tokens.IMP,
- self.consequent,
- Tokens.CLOSE,
- )
+ Tokens.OPEN, self.antecedent, Tokens.IMP, self.consequent, Tokens.CLOSE)
def __hash__(self):
- return hash(
- "%s%s%s" % (hash(self.antecedent), Tokens.IMP, hash(self.consequent))
- )
-
+ return hash('%s%s%s' % (hash(self.antecedent), Tokens.IMP, hash(self.consequent)))
+@python_2_unicode_compatible
class ApplicationExpression(Expression):
def __init__(self, function, argument, argument_indices=None):
"""
bindings += argument.bindings
bindings += function_simp.antecedent.unify(argument_simp, bindings)
except UnificationException as e:
- raise LinearLogicApplicationException(
- "Cannot apply %s to %s. %s" % (function_simp, argument_simp, e)
- )
+ raise LinearLogicApplicationException('Cannot apply %s to %s. %s' % (function_simp, argument_simp, e))
# If you are running it on compiled premises, more conditions apply
if argument_indices:
# A.dependencies of (A -o (B -o C)) must be a proper subset of argument_indices
if not set(function_simp.antecedent.dependencies) < argument_indices:
- raise LinearLogicApplicationException(
- "Dependencies unfulfilled when attempting to apply Linear Logic formula %s to %s"
- % (function_simp, argument_simp)
- )
+ raise LinearLogicApplicationException('Dependencies unfulfilled when attempting to apply Linear Logic formula %s to %s' % (function_simp, argument_simp))
if set(function_simp.antecedent.dependencies) == argument_indices:
- raise LinearLogicApplicationException(
- "Dependencies not a proper subset of indices when attempting to apply Linear Logic formula %s to %s"
- % (function_simp, argument_simp)
- )
+ raise LinearLogicApplicationException('Dependencies not a proper subset of indices when attempting to apply Linear Logic formula %s to %s' % (function_simp, argument_simp))
self.function = function
self.argument = argument
return self.function.simplify(bindings).consequent
def __eq__(self, other):
- return (
- self.__class__ == other.__class__
- and self.function == other.function
- and self.argument == other.argument
- )
+ return self.__class__ == other.__class__ and \
+ self.function == other.function and self.argument == other.argument
def __ne__(self, other):
return not self == other
return "%s" % self.function + Tokens.OPEN + "%s" % self.argument + Tokens.CLOSE
def __hash__(self):
- return hash(
- "%s%s%s" % (hash(self.antecedent), Tokens.OPEN, hash(self.consequent))
- )
-
+ return hash('%s%s%s' % (hash(self.antecedent), Tokens.OPEN, hash(self.consequent)))
+@python_2_unicode_compatible
class BindingDict(object):
def __init__(self, bindings=None):
"""
if not existing or binding == existing:
self.d[variable] = binding
else:
- raise VariableBindingException(
- "Variable %s already bound to another value" % (variable)
- )
+ raise VariableBindingException('Variable %s already bound to another value' % (variable))
def __getitem__(self, variable):
"""
combined[v] = other.d[v]
return combined
except VariableBindingException:
- raise VariableBindingException(
- "Attempting to add two contradicting"
- " VariableBindingsLists: %s, %s" % (self, other)
- )
+ raise VariableBindingException('Attempting to add two contradicting'\
+ ' VariableBindingsLists: %s, %s' % (self, other))
def __ne__(self, other):
return not self == other
return self.d == other.d
def __str__(self):
- return "{" + ", ".join("%s: %s" % (v, self.d[v]) for v in self.d) + "}"
+ return '{' + ', '.join('%s: %s' % (v, self.d[v]) for v in self.d) + '}'
def __repr__(self):
- return "BindingDict: %s" % self
-
+ return 'BindingDict: %s' % self
class VariableBindingException(Exception):
pass
-
class UnificationException(Exception):
def __init__(self, a, b, bindings):
- Exception.__init__(self, "Cannot unify %s with %s given %s" % (a, b, bindings))
-
+ Exception.__init__(self, 'Cannot unify %s with %s given %s' % (a, b, bindings))
class LinearLogicApplicationException(Exception):
pass
def demo():
lexpr = Expression.fromstring
- print(lexpr(r"f"))
- print(lexpr(r"(g -o f)"))
- print(lexpr(r"((g -o G) -o G)"))
- print(lexpr(r"g -o h -o f"))
- print(lexpr(r"(g -o f)(g)").simplify())
- print(lexpr(r"(H -o f)(g)").simplify())
- print(lexpr(r"((g -o G) -o G)((g -o f))").simplify())
- print(lexpr(r"(H -o H)((g -o f))").simplify())
+ print(lexpr(r'f'))
+ print(lexpr(r'(g -o f)'))
+ print(lexpr(r'((g -o G) -o G)'))
+ print(lexpr(r'g -o h -o f'))
+ print(lexpr(r'(g -o f)(g)').simplify())
+ print(lexpr(r'(H -o f)(g)').simplify())
+ print(lexpr(r'((g -o G) -o G)((g -o f))').simplify())
+ print(lexpr(r'(H -o H)((g -o f))').simplify())
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
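# Expected simplifications from the demo, as a sketch (unification binds the
# upper-case variables G and H):
#
#   (g -o f)(g)                =>  f
#   (H -o f)(g)                =>  f
#   ((g -o G) -o G)((g -o f))  =>  f
#   (H -o H)((g -o f))         =>  (g -o f)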
#
# Author: Dan Garrette <dhgarrette@gmail.com>
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT
A version of first order predicate logic, built on
top of the typed lambda calculus.
"""
+from __future__ import print_function, unicode_literals
import re
import operator
from collections import defaultdict
from functools import reduce, total_ordering
+from six import string_types
+
from nltk.util import Trie
from nltk.internals import Counter
+from nltk.compat import python_2_unicode_compatible
-APP = "APP"
+APP = 'APP'
_counter = Counter()
-
class Tokens(object):
- LAMBDA = "\\"
- LAMBDA_LIST = ["\\"]
-
- # Quantifiers
- EXISTS = "exists"
- EXISTS_LIST = ["some", "exists", "exist"]
- ALL = "all"
- ALL_LIST = ["all", "forall"]
-
- # Punctuation
- DOT = "."
- OPEN = "("
- CLOSE = ")"
- COMMA = ","
-
- # Operations
- NOT = "-"
- NOT_LIST = ["not", "-", "!"]
- AND = "&"
- AND_LIST = ["and", "&", "^"]
- OR = "|"
- OR_LIST = ["or", "|"]
- IMP = "->"
- IMP_LIST = ["implies", "->", "=>"]
- IFF = "<->"
- IFF_LIST = ["iff", "<->", "<=>"]
- EQ = "="
- EQ_LIST = ["=", "=="]
- NEQ = "!="
- NEQ_LIST = ["!="]
-
- # Collections of tokens
+ LAMBDA = '\\'; LAMBDA_LIST = ['\\']
+
+ #Quantifiers
+ EXISTS = 'exists'; EXISTS_LIST = ['some', 'exists', 'exist']
+ ALL = 'all'; ALL_LIST = ['all', 'forall']
+
+ #Punctuation
+ DOT = '.'
+ OPEN = '('
+ CLOSE = ')'
+ COMMA = ','
+
+ #Operations
+ NOT = '-'; NOT_LIST = ['not', '-', '!']
+ AND = '&'; AND_LIST = ['and', '&', '^']
+ OR = '|'; OR_LIST = ['or', '|']
+ IMP = '->'; IMP_LIST = ['implies', '->', '=>']
+ IFF = '<->'; IFF_LIST = ['iff', '<->', '<=>']
+ EQ = '='; EQ_LIST = ['=', '==']
+ NEQ = '!='; NEQ_LIST = ['!=']
+
+ #Collections of tokens
BINOPS = AND_LIST + OR_LIST + IMP_LIST + IFF_LIST
QUANTS = EXISTS_LIST + ALL_LIST
PUNCT = [DOT, OPEN, CLOSE, COMMA]
TOKENS = BINOPS + EQ_LIST + NEQ_LIST + QUANTS + LAMBDA_LIST + PUNCT + NOT_LIST
- # Special
- SYMBOLS = [x for x in TOKENS if re.match(r"^[-\\.(),!&^|>=<]*$", x)]
+ #Special
+ SYMBOLS = [x for x in TOKENS if re.match(r'^[-\\.(),!&^|>=<]*$', x)]
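# The *_LIST spellings are interchangeable on input: for example,
# 'man(x) and tall(x)', 'man(x) & tall(x)' and 'man(x) ^ tall(x)' all parse
# to the same AndExpression (a sketch; see LogicParser below).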
def boolean_ops():
"""
Boolean operators
"""
- names = ["negation", "conjunction", "disjunction", "implication", "equivalence"]
+ names = ["negation", "conjunction", "disjunction", "implication", "equivalence"]
for pair in zip(names, [Tokens.NOT, Tokens.AND, Tokens.OR, Tokens.IMP, Tokens.IFF]):
- print("%-15s\t%s" % pair)
-
+ print("%-15s\t%s" % pair)
def equality_preds():
"""
Equality predicates
"""
- names = ["equality", "inequality"]
+ names = ["equality", "inequality"]
for pair in zip(names, [Tokens.EQ, Tokens.NEQ]):
- print("%-15s\t%s" % pair)
-
+ print("%-15s\t%s" % pair)
def binding_ops():
"""
Binding operators
"""
- names = ["existential", "universal", "lambda"]
+ names = ["existential", "universal", "lambda"]
for pair in zip(names, [Tokens.EXISTS, Tokens.ALL, Tokens.LAMBDA]):
- print("%-15s\t%s" % pair)
+ print("%-15s\t%s" % pair)
+@python_2_unicode_compatible
class LogicParser(object):
"""A lambda calculus expression parser."""
self.quote_chars = []
self.operator_precedence = dict(
- [(x, 1) for x in Tokens.LAMBDA_LIST]
- + [(x, 2) for x in Tokens.NOT_LIST]
- + [(APP, 3)]
- + [(x, 4) for x in Tokens.EQ_LIST + Tokens.NEQ_LIST]
- + [(x, 5) for x in Tokens.QUANTS]
- + [(x, 6) for x in Tokens.AND_LIST]
- + [(x, 7) for x in Tokens.OR_LIST]
- + [(x, 8) for x in Tokens.IMP_LIST]
- + [(x, 9) for x in Tokens.IFF_LIST]
- + [(None, 10)]
- )
+ [(x,1) for x in Tokens.LAMBDA_LIST] + \
+ [(x,2) for x in Tokens.NOT_LIST] + \
+ [(APP,3)] + \
+ [(x,4) for x in Tokens.EQ_LIST+Tokens.NEQ_LIST] + \
+ [(x,5) for x in Tokens.QUANTS] + \
+ [(x,6) for x in Tokens.AND_LIST] + \
+ [(x,7) for x in Tokens.OR_LIST] + \
+ [(x,8) for x in Tokens.IMP_LIST] + \
+ [(x,9) for x in Tokens.IFF_LIST] + \
+ [(None,10)])
self.right_associated_operations = [APP]
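# Lower numbers bind tighter: negation (2) outranks conjunction (6), so
# '-man(x) & walks(x)' parses as '(-man(x) & walks(x))', with the negation
# scoping only over man(x).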
def parse(self, data, signature=None):
try:
result = self.process_next_expression(None)
if self.inRange(0):
- raise UnexpectedTokenException(self._currentIndex + 1, self.token(0))
+ raise UnexpectedTokenException(self._currentIndex+1, self.token(0))
except LogicalExpressionException as e:
- msg = "%s\n%s\n%s^" % (e, data, " " * mapping[e.index - 1])
+ msg = '%s\n%s\n%s^' % (e, data, ' '*mapping[e.index-1])
raise LogicalExpressionException(None, msg)
if self.type_check:
out = []
mapping = {}
tokenTrie = Trie(self.get_all_symbols())
- token = ""
+ token = ''
data_idx = 0
token_start_idx = data_idx
while data_idx < len(data):
st = tokenTrie
c = data[data_idx]
- symbol = ""
+ symbol = ''
while c in st:
symbol += c
st = st[c]
- if len(data) - data_idx > len(symbol):
- c = data[data_idx + len(symbol)]
+ if len(data)-data_idx > len(symbol):
+ c = data[data_idx+len(symbol)]
else:
break
if Trie.LEAF in st:
- # token is a complete symbol
+ #token is a complete symbol
if token:
mapping[len(out)] = token_start_idx
out.append(token)
- token = ""
+ token = ''
mapping[len(out)] = data_idx
out.append(symbol)
data_idx += len(symbol)
else:
- if data[data_idx] in " \t\n": # any whitespace
+ if data[data_idx] in ' \t\n': #any whitespace
if token:
mapping[len(out)] = token_start_idx
out.append(token)
- token = ""
+ token = ''
else:
if not token:
token_start_idx = data_idx
mapping[len(out)] = token_start_idx
out.append(token)
mapping[len(out)] = len(data)
- mapping[len(out) + 1] = len(data) + 1
+ mapping[len(out)+1] = len(data)+1
return out, mapping
def process_quoted_token(self, data_idx, data):
- token = ""
+ token = ''
c = data[data_idx]
i = data_idx
for start, end, escape, incl_quotes in self.quote_chars:
if incl_quotes:
token += data[i]
i += 1
- if len(data) == i: # if there are no more chars
- raise LogicalExpressionException(
- None,
- "End of input reached. "
- "Escape character [%s] found at end." % escape,
- )
+ if len(data) == i: #if there are no more chars
+ raise LogicalExpressionException(None, "End of input reached. "
+ "Escape character [%s] found at end."
+ % escape)
token += data[i]
else:
token += data[i]
i += 1
if len(data) == i:
- raise LogicalExpressionException(
- None, "End of input reached. " "Expected: [%s]" % end
- )
+ raise LogicalExpressionException(None, "End of input reached. "
+ "Expected: [%s]" % end)
if incl_quotes:
token += data[i]
i += 1
if not token:
- raise LogicalExpressionException(None, "Empty quoted token found")
+ raise LogicalExpressionException(None, 'Empty quoted token found')
break
return token, i
def inRange(self, location):
"""Return TRUE if the given location is within the buffer"""
- return self._currentIndex + location < len(self._buffer)
+ return self._currentIndex+location < len(self._buffer)
def token(self, location=None):
"""Get the next waiting token. If a location is given, then
tok = self._buffer[self._currentIndex]
self._currentIndex += 1
else:
- tok = self._buffer[self._currentIndex + location]
+ tok = self._buffer[self._currentIndex+location]
return tok
except IndexError:
- raise ExpectedMoreTokensException(self._currentIndex + 1)
+ raise ExpectedMoreTokensException(self._currentIndex+1)
def isvariable(self, tok):
return tok not in Tokens.TOKENS
try:
tok = self.token()
except ExpectedMoreTokensException:
- raise ExpectedMoreTokensException(
- self._currentIndex + 1, message="Expression expected."
- )
+ raise ExpectedMoreTokensException(self._currentIndex+1, message='Expression expected.')
accum = self.handle(tok, context)
if not accum:
- raise UnexpectedTokenException(
- self._currentIndex, tok, message="Expression expected."
- )
+ raise UnexpectedTokenException(self._currentIndex, tok, message='Expression expected.')
return self.attempt_adjuncts(accum, context)
def attempt_adjuncts(self, expression, context):
cur_idx = None
- while cur_idx != self._currentIndex: # while adjuncts are added
+ while cur_idx != self._currentIndex: #while adjuncts are added
cur_idx = self._currentIndex
expression = self.attempt_EqualityExpression(expression, context)
expression = self.attempt_ApplicationExpression(expression, context)
return NegatedExpression(expression)
def handle_variable(self, tok, context):
- # It's either: 1) a predicate expression: sees(x,y)
+ #It's either: 1) a predicate expression: sees(x,y)
# 2) an application expression: P(x)
# 3) a solo variable: john OR x
accum = self.make_VariableExpression(tok)
if self.inRange(0) and self.token(0) == Tokens.OPEN:
- # The predicate has arguments
- if not isinstance(accum, FunctionVariableExpression) and not isinstance(
- accum, ConstantExpression
- ):
- raise LogicalExpressionException(
- self._currentIndex,
- "'%s' is an illegal predicate name. "
- "Individual variables may not be used as "
- "predicates." % tok,
- )
- self.token() # swallow the Open Paren
-
- # curry the arguments
- accum = self.make_ApplicationExpression(
- accum, self.process_next_expression(APP)
- )
+ #The predicate has arguments
+ if not isinstance(accum, FunctionVariableExpression) and \
+ not isinstance(accum, ConstantExpression):
+ raise LogicalExpressionException(self._currentIndex,
+ "'%s' is an illegal predicate name. "
+ "Individual variables may not be used as "
+ "predicates." % tok)
+ self.token() #swallow the Open Paren
+
+ #curry the arguments
+ accum = self.make_ApplicationExpression(accum, self.process_next_expression(APP))
while self.inRange(0) and self.token(0) == Tokens.COMMA:
- self.token() # swallow the comma
- accum = self.make_ApplicationExpression(
- accum, self.process_next_expression(APP)
- )
+ self.token() #swallow the comma
+ accum = self.make_ApplicationExpression(accum, self.process_next_expression(APP))
self.assertNextToken(Tokens.CLOSE)
return accum
try:
tok = self.token()
except ExpectedMoreTokensException as e:
- raise ExpectedMoreTokensException(e.index, "Variable expected.")
+ raise ExpectedMoreTokensException(e.index, 'Variable expected.')
if isinstance(self.make_VariableExpression(tok), ConstantExpression):
- raise LogicalExpressionException(
- self._currentIndex,
- "'%s' is an illegal variable name. "
- "Constants may not be %s." % (tok, description),
- )
+ raise LogicalExpressionException(self._currentIndex,
+ "'%s' is an illegal variable name. "
+ "Constants may not be %s." % (tok, description))
return Variable(tok)
def handle_lambda(self, tok, context):
# Expression is a lambda expression
if not self.inRange(0):
- raise ExpectedMoreTokensException(
- self._currentIndex + 2,
- message="Variable and Expression expected following lambda operator.",
- )
- vars = [self.get_next_token_variable("abstracted")]
+ raise ExpectedMoreTokensException(self._currentIndex+2,
+ message="Variable and Expression expected following lambda operator.")
+ vars = [self.get_next_token_variable('abstracted')]
while True:
- if not self.inRange(0) or (
- self.token(0) == Tokens.DOT and not self.inRange(1)
- ):
- raise ExpectedMoreTokensException(
- self._currentIndex + 2, message="Expression expected."
- )
+ if not self.inRange(0) or (self.token(0) == Tokens.DOT and not self.inRange(1)):
+ raise ExpectedMoreTokensException(self._currentIndex+2, message="Expression expected.")
if not self.isvariable(self.token(0)):
break
# Support expressions like: \x y.M == \x.\y.M
- vars.append(self.get_next_token_variable("abstracted"))
+ vars.append(self.get_next_token_variable('abstracted'))
if self.inRange(0) and self.token(0) == Tokens.DOT:
- self.token() # swallow the dot
+ self.token() #swallow the dot
accum = self.process_next_expression(tok)
while vars:
factory = self.get_QuantifiedExpression_factory(tok)
if not self.inRange(0):
- raise ExpectedMoreTokensException(
- self._currentIndex + 2,
- message="Variable and Expression expected following quantifier '%s'."
- % tok,
- )
- vars = [self.get_next_token_variable("quantified")]
+ raise ExpectedMoreTokensException(self._currentIndex+2,
+ message="Variable and Expression expected following quantifier '%s'." % tok)
+ vars = [self.get_next_token_variable('quantified')]
while True:
- if not self.inRange(0) or (
- self.token(0) == Tokens.DOT and not self.inRange(1)
- ):
- raise ExpectedMoreTokensException(
- self._currentIndex + 2, message="Expression expected."
- )
+ if not self.inRange(0) or (self.token(0) == Tokens.DOT and not self.inRange(1)):
+ raise ExpectedMoreTokensException(self._currentIndex+2, message="Expression expected.")
if not self.isvariable(self.token(0)):
break
# Support expressions like: some x y.M == some x.some y.M
- vars.append(self.get_next_token_variable("quantified"))
+ vars.append(self.get_next_token_variable('quantified'))
if self.inRange(0) and self.token(0) == Tokens.DOT:
- self.token() # swallow the dot
+ self.token() #swallow the dot
accum = self.process_next_expression(tok)
while vars:
return factory(variable, term)
def handle_open(self, tok, context):
- # Expression is in parens
+ #Expression is in parens
accum = self.process_next_expression(None)
self.assertNextToken(Tokens.CLOSE)
return accum
Otherwise, the parameter will be returned."""
if self.inRange(0):
tok = self.token(0)
- if tok in Tokens.EQ_LIST + Tokens.NEQ_LIST and self.has_priority(
- tok, context
- ):
- self.token() # swallow the "=" or "!="
- expression = self.make_EqualityExpression(
- expression, self.process_next_expression(tok)
- )
+ if tok in Tokens.EQ_LIST + Tokens.NEQ_LIST and self.has_priority(tok, context):
+ self.token() #swallow the "=" or "!="
+ expression = self.make_EqualityExpression(expression, self.process_next_expression(tok))
if tok in Tokens.NEQ_LIST:
expression = self.make_NegatedExpression(expression)
return expression
tok = self.token(0)
factory = self.get_BooleanExpression_factory(tok)
if factory and self.has_priority(tok, context):
- self.token() # swallow the operator
- expression = self.make_BooleanExpression(
- factory, expression, self.process_next_expression(tok)
- )
+ self.token() #swallow the operator
+ expression = self.make_BooleanExpression(factory, expression,
+ self.process_next_expression(tok))
else:
break
return expression
argument expression."""
if self.has_priority(APP, context):
if self.inRange(0) and self.token(0) == Tokens.OPEN:
- if (
- not isinstance(expression, LambdaExpression)
- and not isinstance(expression, ApplicationExpression)
- and not isinstance(expression, FunctionVariableExpression)
- and not isinstance(expression, ConstantExpression)
- ):
- raise LogicalExpressionException(
- self._currentIndex,
- ("The function '%s" % expression)
- + "' is not a Lambda Expression, an "
- "Application Expression, or a "
- "functional predicate, so it may "
- "not take arguments.",
- )
- self.token() # swallow then open paren
- # curry the arguments
- accum = self.make_ApplicationExpression(
- expression, self.process_next_expression(APP)
- )
+ if not isinstance(expression, LambdaExpression) and \
+ not isinstance(expression, ApplicationExpression) and \
+ not isinstance(expression, FunctionVariableExpression) and \
+ not isinstance(expression, ConstantExpression):
+ raise LogicalExpressionException(self._currentIndex,
+ ("The function '%s" % expression) +
+ "' is not a Lambda Expression, an "
+ "Application Expression, or a "
+ "functional predicate, so it may "
+ "not take arguments.")
+ self.token() #swallow the open paren
+ #curry the arguments
+ accum = self.make_ApplicationExpression(expression, self.process_next_expression(APP))
while self.inRange(0) and self.token(0) == Tokens.COMMA:
- self.token() # swallow the comma
- accum = self.make_ApplicationExpression(
- accum, self.process_next_expression(APP)
- )
+ self.token() #swallow the comma
+ accum = self.make_ApplicationExpression(accum, self.process_next_expression(APP))
self.assertNextToken(Tokens.CLOSE)
return accum
return expression
return LambdaExpression(variable, term)
def has_priority(self, operation, context):
- return self.operator_precedence[operation] < self.operator_precedence[
- context
- ] or (
- operation in self.right_associated_operations
- and self.operator_precedence[operation] == self.operator_precedence[context]
- )
+ return self.operator_precedence[operation] < self.operator_precedence[context] or \
+ (operation in self.right_associated_operations and \
+ self.operator_precedence[operation] == self.operator_precedence[context])
def assertNextToken(self, expected):
try:
tok = self.token()
except ExpectedMoreTokensException as e:
- raise ExpectedMoreTokensException(
- e.index, message="Expected token '%s'." % expected
- )
+ raise ExpectedMoreTokensException(e.index, message="Expected token '%s'." % expected)
if isinstance(expected, list):
if tok not in expected:
def __repr__(self):
if self.inRange(0):
- msg = "Next token: " + self.token(0)
+ msg = 'Next token: ' + self.token(0)
else:
- msg = "No more tokens"
- return "<" + self.__class__.__name__ + ": " + msg + ">"
+ msg = 'No more tokens'
+ return '<' + self.__class__.__name__ + ': ' + msg + '>'
def read_logic(s, logic_parser=None, encoding=None):
statements = []
for linenum, line in enumerate(s.splitlines()):
line = line.strip()
- if line.startswith("#") or line == "":
- continue
+ if line.startswith('#') or line=='': continue
try:
statements.append(logic_parser.parse(line))
except LogicalExpressionException:
- raise ValueError("Unable to parse line %s: %s" % (linenum, line))
+ raise ValueError('Unable to parse line %s: %s' % (linenum, line))
return statements
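# A usage sketch (assuming the elided body falls back to a default
# LogicParser when none is passed; comment lines and blank lines are skipped):
#
#   read_logic('# people\nman(adam)\nwoman(eve)')
#   => [<ApplicationExpression man(adam)>, <ApplicationExpression woman(eve)>]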
@total_ordering
+@python_2_unicode_compatible
class Variable(object):
def __init__(self, name):
"""
:param name: the name of the variable
"""
- assert isinstance(name, str), "%s is not a string" % name
+ assert isinstance(name, string_types), "%s is not a string" % name
self.name = name
def __eq__(self, other):
"""
if pattern is not None:
if is_indvar(pattern.name):
- prefix = "z"
+ prefix = 'z'
elif is_funcvar(pattern.name):
- prefix = "F"
+ prefix = 'F'
elif is_eventvar(pattern.name):
- prefix = "e0"
+ prefix = 'e0'
else:
assert False, "Cannot generate a unique variable"
else:
- prefix = "z"
+ prefix = 'z'
v = Variable("%s%s" % (prefix, _counter.get()))
while ignore is not None and v in ignore:
v = Variable("%s%s" % (prefix, _counter.get()))
return v
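# Sketch: unique_variable() returns a fresh Variable such as z1, z2, ...;
# pattern=Variable('F') switches to function-variable names (F1, ...) and
# pattern=Variable('e1') to event-variable names (e01, ...). Exact numbers
# come from the module-level counter.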
-
def skolem_function(univ_scope=None):
"""
Return a skolem function over the variables in univ_scope
param univ_scope
"""
- skolem = VariableExpression(Variable("F%s" % _counter.get()))
+ skolem = VariableExpression(Variable('F%s' % _counter.get()))
if univ_scope:
for v in list(univ_scope):
skolem = skolem(VariableExpression(v))
return skolem
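# Sketch: with univ_scope = {Variable('x'), Variable('y')}, skolem_function
# returns an application such as F1(x,y) -- a fresh function variable applied
# to each variable in scope (numbering and argument order vary with the
# counter and set iteration order).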
+@python_2_unicode_compatible
class Type(object):
def __repr__(self):
return "%s" % self
def fromstring(cls, s):
return read_type(s)
-
+@python_2_unicode_compatible
class ComplexType(Type):
def __init__(self, first, second):
- assert isinstance(first, Type), "%s is not a Type" % first
- assert isinstance(second, Type), "%s is not a Type" % second
+ assert(isinstance(first, Type)), "%s is not a Type" % first
+ assert(isinstance(second, Type)), "%s is not a Type" % second
self.first = first
self.second = second
def __eq__(self, other):
- return (
- isinstance(other, ComplexType)
- and self.first == other.first
- and self.second == other.second
- )
+ return isinstance(other, ComplexType) and \
+ self.first == other.first and \
+ self.second == other.second
def __ne__(self, other):
return not self == other
def matches(self, other):
if isinstance(other, ComplexType):
- return self.first.matches(other.first) and self.second.matches(other.second)
+ return self.first.matches(other.first) and \
+ self.second.matches(other.second)
else:
return self == ANY_TYPE
f = self.first.resolve(other.first)
s = self.second.resolve(other.second)
if f and s:
- return ComplexType(f, s)
+ return ComplexType(f,s)
else:
return None
elif self == ANY_TYPE:
if self == ANY_TYPE:
return "%s" % ANY_TYPE
else:
- return "<%s,%s>" % (self.first, self.second)
+ return '<%s,%s>' % (self.first, self.second)
def str(self):
if self == ANY_TYPE:
return ANY_TYPE.str()
else:
- return "(%s -> %s)" % (self.first.str(), self.second.str())
-
+ return '(%s -> %s)' % (self.first.str(), self.second.str())
class BasicType(Type):
def __eq__(self, other):
else:
return None
-
+@python_2_unicode_compatible
class EntityType(BasicType):
def __str__(self):
- return "e"
+ return 'e'
def str(self):
- return "IND"
-
+ return 'IND'
+@python_2_unicode_compatible
class TruthValueType(BasicType):
def __str__(self):
- return "t"
+ return 't'
def str(self):
- return "BOOL"
-
+ return 'BOOL'
+@python_2_unicode_compatible
class EventType(BasicType):
def __str__(self):
- return "v"
+ return 'v'
def str(self):
- return "EVENT"
-
+ return 'EVENT'
+@python_2_unicode_compatible
class AnyType(BasicType, ComplexType):
def __init__(self):
pass
@property
- def first(self):
- return self
+ def first(self): return self
@property
- def second(self):
- return self
+ def second(self): return self
def __eq__(self, other):
return isinstance(other, AnyType) or other.__eq__(self)
return other
def __str__(self):
- return "?"
+ return '?'
def str(self):
- return "ANY"
+ return 'ANY'
TRUTH_TYPE = TruthValueType()
def read_type(type_string):
- assert isinstance(type_string, str)
- type_string = type_string.replace(" ", "") # remove spaces
+ assert isinstance(type_string, string_types)
+ type_string = type_string.replace(' ', '') #remove spaces
- if type_string[0] == "<":
- assert type_string[-1] == ">"
+ if type_string[0] == '<':
+ assert type_string[-1] == '>'
paren_count = 0
- for i, char in enumerate(type_string):
- if char == "<":
+ for i,char in enumerate(type_string):
+ if char == '<':
paren_count += 1
- elif char == ">":
+ elif char == '>':
paren_count -= 1
assert paren_count > 0
- elif char == ",":
+ elif char == ',':
if paren_count == 1:
break
- return ComplexType(
- read_type(type_string[1:i]), read_type(type_string[i + 1 : -1])
- )
+ return ComplexType(read_type(type_string[1:i]),
+ read_type(type_string[i+1:-1]))
elif type_string[0] == "%s" % ENTITY_TYPE:
return ENTITY_TYPE
elif type_string[0] == "%s" % TRUTH_TYPE:
elif type_string[0] == "%s" % ANY_TYPE:
return ANY_TYPE
else:
- raise LogicalExpressionException(
- None, "Unexpected character: '%s'." % type_string[0]
- )
+ raise LogicalExpressionException(None, "Unexpected character: '%s'." % type_string[0])
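# A few sketches of read_type (the repr of a type is its angle-bracket form):
#
#   read_type('e')          =>  e
#   read_type('<e,t>')      =>  <e,t>
#   read_type('<e,<e,t>>')  =>  <e,<e,t>>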
class TypeException(Exception):
def __init__(self, msg):
super(TypeException, self).__init__(msg)
-
class InconsistentTypeHierarchyException(TypeException):
def __init__(self, variable, expression=None):
if expression:
- msg = (
- "The variable '%s' was found in multiple places with different"
+ msg = "The variable '%s' was found in multiple places with different"\
" types in '%s'." % (variable, expression)
- )
else:
- msg = (
- "The variable '%s' was found in multiple places with different"
+ msg = "The variable '%s' was found in multiple places with different"\
" types." % (variable)
- )
super(InconsistentTypeHierarchyException, self).__init__(msg)
-
class TypeResolutionException(TypeException):
def __init__(self, expression, other_type):
super(TypeResolutionException, self).__init__(
- "The type of '%s', '%s', cannot be resolved with type '%s'"
- % (expression, expression.type, other_type)
- )
-
+ "The type of '%s', '%s', cannot be resolved with type '%s'" %
+ (expression, expression.type, other_type))
class IllegalTypeException(TypeException):
def __init__(self, expression, other_type, allowed_type):
super(IllegalTypeException, self).__init__(
- "Cannot set type of %s '%s' to '%s'; must match type '%s'."
- % (expression.__class__.__name__, expression, other_type, allowed_type)
- )
-
+ "Cannot set type of %s '%s' to '%s'; must match type '%s'." %
+ (expression.__class__.__name__, expression, other_type,
+ allowed_type))
def typecheck(expressions, signature=None):
"""
:param signature: dict that maps variable names to types (or string
representations of types)
"""
- # typecheck and create master signature
+ #typecheck and create master signature
for expression in expressions:
signature = expression.typecheck(signature)
- # apply master signature to all expressions
+ #apply master signature to all expressions
for expression in expressions[:-1]:
expression.typecheck(signature)
return signature
An interface for classes that can perform substitutions for
variables.
"""
-
def substitute_bindings(self, bindings):
"""
:return: The object that is obtained by replacing
raise NotImplementedError()
+@python_2_unicode_compatible
class Expression(SubstituteBindingsI):
"""This is the base abstract object for all logical expressions"""
if prover is None:
from nltk.inference import Prover9
-
prover = Prover9()
bicond = IffExpression(self.simplify(), other.simplify())
return prover.prove(bicond)
if isinstance(val, Variable):
val = self.make_VariableExpression(val)
elif not isinstance(val, Expression):
- raise ValueError(
- "Can not substitute a non-expression "
- "value into an expression: %r" % (val,)
- )
+ raise ValueError('Can not substitute a non-expression '
+ 'value into an expression: %r' % (val,))
# Substitute bindings in the target value.
val = val.substitute_bindings(bindings)
# Replace var w/ the target value.
:param alpha_convert: bool Alpha convert automatically to avoid name clashes?
"""
assert isinstance(variable, Variable), "%s is not a Variable" % variable
- assert isinstance(expression, Expression), (
- "%s is not an Expression" % expression
- )
+ assert isinstance(expression, Expression), "%s is not an Expression" % expression
- return self.visit_structured(
- lambda e: e.replace(variable, expression, replace_bound, alpha_convert),
- self.__class__,
- )
+ return self.visit_structured(lambda e: e.replace(variable, expression,
+ replace_bound, alpha_convert),
+ self.__class__)
def normalize(self, newvars=None):
"""Rename auto-generated unique variables"""
-
def get_indiv_vars(e):
if isinstance(e, IndividualVariableExpression):
return set([e])
elif isinstance(e, AbstractVariableExpression):
return set()
else:
- return e.visit(
- get_indiv_vars, lambda parts: reduce(operator.or_, parts, set())
- )
+ return e.visit(get_indiv_vars,
+ lambda parts: reduce(operator.or_, parts, set()))
result = self
- for i, e in enumerate(sorted(get_indiv_vars(self), key=lambda e: e.variable)):
- if isinstance(e, EventVariableExpression):
- newVar = e.__class__(Variable("e0%s" % (i + 1)))
- elif isinstance(e, IndividualVariableExpression):
- newVar = e.__class__(Variable("z%s" % (i + 1)))
+ for i,e in enumerate(sorted(get_indiv_vars(self), key=lambda e: e.variable)):
+ if isinstance(e,EventVariableExpression):
+ newVar = e.__class__(Variable('e0%s' % (i+1)))
+ elif isinstance(e,IndividualVariableExpression):
+ newVar = e.__class__(Variable('z%s' % (i+1)))
else:
newVar = e
result = result.replace(e.variable, newVar, True)
return self.visit(function, lambda parts: combinator(*parts))
def __repr__(self):
- return "<%s %s>" % (self.__class__.__name__, self)
+ return '<%s %s>' % (self.__class__.__name__, self)
def __str__(self):
return self.str()
variables and any variable starting with '?' or '@'.
:return: set of ``Variable`` objects
"""
- return self.free() | set(
- p for p in self.predicates() | self.constants() if re.match("^[?@]", p.name)
- )
+ return self.free() | set(p for p in self.predicates()|self.constants()
+ if re.match('^[?@]', p.name))
def free(self):
"""
both individual and predicate variables, but not constants.
:return: set of ``Variable`` objects
"""
- return self.visit(
- lambda e: e.free(), lambda parts: reduce(operator.or_, parts, set())
- )
+ return self.visit(lambda e: e.free(),
+ lambda parts: reduce(operator.or_, parts, set()))
def constants(self):
"""
Return a set of individual constants (non-predicates).
:return: set of ``Variable`` objects
"""
- return self.visit(
- lambda e: e.constants(), lambda parts: reduce(operator.or_, parts, set())
- )
+ return self.visit(lambda e: e.constants(),
+ lambda parts: reduce(operator.or_, parts, set()))
def predicates(self):
"""
Return a set of predicates (constants, not variables).
:return: set of ``Variable`` objects
"""
- return self.visit(
- lambda e: e.predicates(), lambda parts: reduce(operator.or_, parts, set())
- )
+ return self.visit(lambda e: e.predicates(),
+ lambda parts: reduce(operator.or_, parts, set()))
def simplify(self):
"""
return VariableExpression(variable)
+@python_2_unicode_compatible
class ApplicationExpression(Expression):
r"""
This class is used to represent two related types of logical expressions.
``AbstractVariableExpression``). This means that the example from above
will be returned as "(\x y.see(x,y)(john))(mary)".
"""
-
def __init__(self, function, argument):
"""
:param function: ``Expression``, for the function expression
self.argument._set_type(ANY_TYPE, signature)
try:
- self.function._set_type(
- ComplexType(self.argument.type, other_type), signature
- )
+ self.function._set_type(ComplexType(self.argument.type, other_type), signature)
except TypeResolutionException:
raise TypeException(
- "The function '%s' is of type '%s' and cannot be applied "
- "to '%s' of type '%s'. Its argument must match type '%s'."
- % (
- self.function,
- self.function.type,
- self.argument,
- self.argument.type,
- self.function.type.first,
- )
- )
+ "The function '%s' is of type '%s' and cannot be applied "
+ "to '%s' of type '%s'. Its argument must match type '%s'."
+ % (self.function, self.function.type, self.argument,
+ self.argument.type, self.function.type.first))
def findtype(self, variable):
""":see Expression.findtype()"""
if self.is_atom():
function, args = self.uncurry()
else:
- # It's not a predicate expression ("P(x,y)"), so leave args curried
+ #It's not a predicate expression ("P(x,y)"), so leave args curried
function = self.function
args = [self.argument]
- found = [arg.findtype(variable) for arg in [function] + args]
+ found = [arg.findtype(variable) for arg in [function]+args]
unique = []
for f in found:
return combinator([function(self.function), function(self.argument)])
def __eq__(self, other):
- return (
- isinstance(other, ApplicationExpression)
- and self.function == other.function
- and self.argument == other.argument
- )
+ return isinstance(other, ApplicationExpression) and \
+ self.function == other.function and \
+ self.argument == other.argument
def __ne__(self, other):
return not self == other
# uncurry the arguments and find the base function
if self.is_atom():
function, args = self.uncurry()
- arg_str = ",".join("%s" % arg for arg in args)
+ arg_str = ','.join("%s" % arg for arg in args)
else:
- # Leave arguments curried
+ #Leave arguments curried
function = self.function
arg_str = "%s" % self.argument
parenthesize_function = False
if isinstance(function, LambdaExpression):
if isinstance(function.term, ApplicationExpression):
- if not isinstance(function.term.function, AbstractVariableExpression):
+ if not isinstance(function.term.function,
+ AbstractVariableExpression):
parenthesize_function = True
elif not isinstance(function.term, BooleanExpression):
parenthesize_function = True
function = self.function
args = [self.argument]
while isinstance(function, ApplicationExpression):
- # (\x.\y.sees(x,y)(john))(mary)
+ #(\x.\y.sees(x,y)(john))(mary)
args.insert(0, function.argument)
function = function.function
return (function, args)
@total_ordering
+@python_2_unicode_compatible
class AbstractVariableExpression(Expression):
"""This class represents a variable to be used as a predicate or entity"""
-
def __init__(self, variable):
"""
:param variable: ``Variable``, for the variable
def replace(self, variable, expression, replace_bound=False, alpha_convert=True):
""":see: Expression.replace()"""
assert isinstance(variable, Variable), "%s is not a Variable" % variable
- assert isinstance(expression, Expression), (
- "%s is not an Expression" % expression
- )
+ assert isinstance(expression, Expression), "%s is not an Expression" % expression
if self.variable == variable:
return expression
else:
def __eq__(self, other):
"""Allow equality between instances of ``AbstractVariableExpression``
subtypes."""
- return (
- isinstance(other, AbstractVariableExpression)
- and self.variable == other.variable
- )
+ return isinstance(other, AbstractVariableExpression) and \
+ self.variable == other.variable
def __ne__(self, other):
return not self == other
def __str__(self):
return "%s" % self.variable
-
class IndividualVariableExpression(AbstractVariableExpression):
"""This class represents variables that take the form of a single lowercase
character (other than 'e') followed by zero or more digits."""
-
def _set_type(self, other_type=ANY_TYPE, signature=None):
""":see Expression._set_type()"""
assert isinstance(other_type, Type)
signature[self.variable.name].append(self)
- def _get_type(self):
- return ENTITY_TYPE
-
+ def _get_type(self): return ENTITY_TYPE
type = property(_get_type, _set_type)
def free(self):
""":see: Expression.constants()"""
return set()
-
class FunctionVariableExpression(AbstractVariableExpression):
"""This class represents variables that take the form of a single uppercase
character followed by zero or more digits."""
-
type = ANY_TYPE
def free(self):
""":see: Expression.constants()"""
return set()
-
class EventVariableExpression(IndividualVariableExpression):
"""This class represents variables that take the form of a single lowercase
'e' character followed by zero or more digits."""
-
type = EVENT_TYPE
-
class ConstantExpression(AbstractVariableExpression):
"""This class represents variables that do not take the form of a single
character followed by zero or more digits."""
-
type = ENTITY_TYPE
def _set_type(self, other_type=ANY_TYPE, signature=None):
signature = defaultdict(list)
if other_type == ANY_TYPE:
- # entity type by default, for individuals
+ #entity type by default, for individuals
resolution = ENTITY_TYPE
else:
resolution = other_type
class VariableBinderExpression(Expression):
"""This an abstract class for any Expression that binds a variable in an
Expression. This includes LambdaExpressions and Quantified Expressions"""
-
def __init__(self, variable, term):
"""
:param variable: ``Variable``, for the variable
def replace(self, variable, expression, replace_bound=False, alpha_convert=True):
""":see: Expression.replace()"""
assert isinstance(variable, Variable), "%s is not a Variable" % variable
- assert isinstance(expression, Expression), (
- "%s is not an Expression" % expression
- )
- # if the bound variable is the thing being replaced
+ assert isinstance(expression, Expression), "%s is not an Expression" % expression
+ #if the bound variable is the thing being replaced
if self.variable == variable:
if replace_bound:
- assert isinstance(expression, AbstractVariableExpression), (
- "%s is not a AbstractVariableExpression" % expression
- )
- return self.__class__(
- expression.variable,
- self.term.replace(variable, expression, True, alpha_convert),
- )
+ assert isinstance(expression, AbstractVariableExpression),\
+ "%s is not a AbstractVariableExpression" % expression
+ return self.__class__(expression.variable,
+ self.term.replace(variable, expression, True, alpha_convert))
else:
return self
else:
if alpha_convert and self.variable in expression.free():
self = self.alpha_convert(unique_variable(pattern=self.variable))
- # replace in the term
- return self.__class__(
- self.variable,
- self.term.replace(variable, expression, replace_bound, alpha_convert),
- )
+ #replace in the term
+ return self.__class__(self.variable,
+ self.term.replace(variable, expression, replace_bound, alpha_convert))
def alpha_convert(self, newvar):
"""Rename all occurrences of the variable introduced by this variable
:param newvar: ``Variable``, for the new variable
"""
assert isinstance(newvar, Variable), "%s is not a Variable" % newvar
- return self.__class__(
- newvar, self.term.replace(self.variable, VariableExpression(newvar), True)
- )
+ return self.__class__(newvar,
+ self.term.replace(self.variable,
+ VariableExpression(newvar),
+ True))
def free(self):
""":see: Expression.free()"""
def __eq__(self, other):
r"""Defines equality modulo alphabetic variance. If we are comparing
\x.M and \y.N, then check equality of M and N[x/y]."""
- if isinstance(self, other.__class__) or isinstance(other, self.__class__):
+ if isinstance(self, other.__class__) or \
+ isinstance(other, self.__class__):
if self.variable == other.variable:
return self.term == other.term
else:
__hash__ = Expression.__hash__
+@python_2_unicode_compatible
class LambdaExpression(VariableBinderExpression):
@property
def type(self):
- return ComplexType(self.term.findtype(self.variable), self.term.type)
+ return ComplexType(self.term.findtype(self.variable),
+ self.term.type)
def _set_type(self, other_type=ANY_TYPE, signature=None):
""":see Expression._set_type()"""
while term.__class__ == self.__class__:
variables.append(term.variable)
term = term.term
- return (
- Tokens.LAMBDA
- + " ".join("%s" % v for v in variables)
- + Tokens.DOT
- + "%s" % term
- )
+ return Tokens.LAMBDA + ' '.join("%s" % v for v in variables) + \
+ Tokens.DOT + "%s" % term
+@python_2_unicode_compatible
class QuantifiedExpression(VariableBinderExpression):
@property
- def type(self):
- return TRUTH_TYPE
+ def type(self): return TRUTH_TYPE
def _set_type(self, other_type=ANY_TYPE, signature=None):
""":see Expression._set_type()"""
while term.__class__ == self.__class__:
variables.append(term.variable)
term = term.term
- return (
- self.getQuantifier()
- + " "
- + " ".join("%s" % v for v in variables)
- + Tokens.DOT
- + "%s" % term
- )
-
+ return self.getQuantifier() + ' ' + ' '.join("%s" % v for v in variables) + \
+ Tokens.DOT + "%s" % term
class ExistsExpression(QuantifiedExpression):
def getQuantifier(self):
return Tokens.EXISTS
-
class AllExpression(QuantifiedExpression):
def getQuantifier(self):
return Tokens.ALL
+@python_2_unicode_compatible
class NegatedExpression(Expression):
def __init__(self, term):
assert isinstance(term, Expression), "%s is not an Expression" % term
self.term = term
@property
- def type(self):
- return TRUTH_TYPE
+ def type(self): return TRUTH_TYPE
def _set_type(self, other_type=ANY_TYPE, signature=None):
""":see Expression._set_type()"""
return Tokens.NOT + "%s" % self.term
+@python_2_unicode_compatible
class BinaryExpression(Expression):
def __init__(self, first, second):
assert isinstance(first, Expression), "%s is not an Expression" % first
self.second = second
@property
- def type(self):
- return TRUTH_TYPE
+ def type(self): return TRUTH_TYPE
def findtype(self, variable):
""":see Expression.findtype()"""
return combinator([function(self.first), function(self.second)])
def __eq__(self, other):
- return (
- (isinstance(self, other.__class__) or isinstance(other, self.__class__))
- and self.first == other.first
- and self.second == other.second
- )
+ return (isinstance(self, other.__class__) or \
+ isinstance(other, self.__class__)) and \
+ self.first == other.first and self.second == other.second
def __ne__(self, other):
return not self == other
def __str__(self):
first = self._str_subex(self.first)
second = self._str_subex(self.second)
- return Tokens.OPEN + first + " " + self.getOp() + " " + second + Tokens.CLOSE
+ return Tokens.OPEN + first + ' ' + self.getOp() \
+ + ' ' + second + Tokens.CLOSE
def _str_subex(self, subex):
return "%s" % subex
self.first._set_type(TRUTH_TYPE, signature)
self.second._set_type(TRUTH_TYPE, signature)
-
class AndExpression(BooleanExpression):
"""This class represents conjunctions"""
-
def getOp(self):
return Tokens.AND
return s[1:-1]
return s
-
class OrExpression(BooleanExpression):
"""This class represents disjunctions"""
-
def getOp(self):
return Tokens.OR
return s[1:-1]
return s
-
class ImpExpression(BooleanExpression):
"""This class represents implications"""
-
def getOp(self):
return Tokens.IMP
-
class IffExpression(BooleanExpression):
"""This class represents biconditionals"""
-
def getOp(self):
return Tokens.IFF
class EqualityExpression(BinaryExpression):
"""This class represents equality expressions like "(x = y)"."""
-
def _set_type(self, other_type=ANY_TYPE, signature=None):
""":see Expression._set_type()"""
assert isinstance(other_type, Type)
### Utilities
-
class LogicalExpressionException(Exception):
def __init__(self, index, message):
self.index = index
Exception.__init__(self, message)
-
class UnexpectedTokenException(LogicalExpressionException):
def __init__(self, index, unexpected=None, expected=None, message=None):
if unexpected and expected:
- msg = "Unexpected token: '%s'. " "Expected token '%s'." % (
- unexpected,
- expected,
- )
+ msg = "Unexpected token: '%s'. " \
+ "Expected token '%s'." % (unexpected, expected)
elif unexpected:
msg = "Unexpected token: '%s'." % unexpected
if message:
- msg += " " + message
+ msg += ' '+message
else:
msg = "Expected token '%s'." % expected
LogicalExpressionException.__init__(self, index, msg)
-
class ExpectedMoreTokensException(LogicalExpressionException):
def __init__(self, index, message=None):
if not message:
- message = "More tokens expected."
- LogicalExpressionException.__init__(
- self, index, "End of input found. " + message
- )
+ message = 'More tokens expected.'
+ LogicalExpressionException.__init__(self, index, 'End of input found. ' + message)
def is_indvar(expr):
:param expr: str
:return: bool True if expr is of the correct form
"""
- assert isinstance(expr, str), "%s is not a string" % expr
- return re.match(r"^[a-df-z]\d*$", expr) is not None
-
+ assert isinstance(expr, string_types), "%s is not a string" % expr
+ return re.match(r'^[a-df-z]\d*$', expr) is not None
def is_funcvar(expr):
"""
:param expr: str
:return: bool True if expr is of the correct form
"""
- assert isinstance(expr, str), "%s is not a string" % expr
- return re.match(r"^[A-Z]\d*$", expr) is not None
-
+ assert isinstance(expr, string_types), "%s is not a string" % expr
+ return re.match(r'^[A-Z]\d*$', expr) is not None
def is_eventvar(expr):
"""
:param expr: str
:return: bool True if expr is of the correct form
"""
- assert isinstance(expr, str), "%s is not a string" % expr
- return re.match(r"^e\d*$", expr) is not None
+ assert isinstance(expr, string_types), "%s is not a string" % expr
+ return re.match(r'^e\d*$', expr) is not None
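Together, is_indvar, is_funcvar and is_eventvar encode the variable-naming convention used throughout this module: 'e' plus optional digits names an event variable, any other lowercase letter ([a-df-z]) an individual variable, and a single uppercase letter a function variable. A small sketch that mirrors the three regexes (classify_var is illustrative only, not part of the library):

    import re

    def classify_var(name):
        # Check events first: 'e42' must not be mistaken for an individual.
        if re.match(r'^e\d*$', name):
            return 'event'
        if re.match(r'^[a-df-z]\d*$', name):
            return 'individual'
        if re.match(r'^[A-Z]\d*$', name):
            return 'function'
        return 'unknown'

    assert classify_var('e2') == 'event'
    assert classify_var('x1') == 'individual'
    assert classify_var('P') == 'function'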
def demo():
lexpr = Expression.fromstring
- print("=" * 20 + "Test reader" + "=" * 20)
- print(lexpr(r"john"))
- print(lexpr(r"man(x)"))
- print(lexpr(r"-man(x)"))
- print(lexpr(r"(man(x) & tall(x) & walks(x))"))
- print(lexpr(r"exists x.(man(x) & tall(x) & walks(x))"))
- print(lexpr(r"\x.man(x)"))
- print(lexpr(r"\x.man(x)(john)"))
- print(lexpr(r"\x y.sees(x,y)"))
- print(lexpr(r"\x y.sees(x,y)(a,b)"))
- print(lexpr(r"(\x.exists y.walks(x,y))(x)"))
- print(lexpr(r"exists x.x = y"))
- print(lexpr(r"exists x.(x = y)"))
- print(lexpr("P(x) & x=y & P(y)"))
- print(lexpr(r"\P Q.exists x.(P(x) & Q(x))"))
- print(lexpr(r"man(x) <-> tall(x)"))
-
- print("=" * 20 + "Test simplify" + "=" * 20)
- print(lexpr(r"\x.\y.sees(x,y)(john)(mary)").simplify())
- print(lexpr(r"\x.\y.sees(x,y)(john, mary)").simplify())
- print(lexpr(r"all x.(man(x) & (\x.exists y.walks(x,y))(x))").simplify())
- print(lexpr(r"(\P.\Q.exists x.(P(x) & Q(x)))(\x.dog(x))(\x.bark(x))").simplify())
-
- print("=" * 20 + "Test alpha conversion and binder expression equality" + "=" * 20)
- e1 = lexpr("exists x.P(x)")
+ print('='*20 + 'Test reader' + '='*20)
+ print(lexpr(r'john'))
+ print(lexpr(r'man(x)'))
+ print(lexpr(r'-man(x)'))
+ print(lexpr(r'(man(x) & tall(x) & walks(x))'))
+ print(lexpr(r'exists x.(man(x) & tall(x) & walks(x))'))
+ print(lexpr(r'\x.man(x)'))
+ print(lexpr(r'\x.man(x)(john)'))
+ print(lexpr(r'\x y.sees(x,y)'))
+ print(lexpr(r'\x y.sees(x,y)(a,b)'))
+ print(lexpr(r'(\x.exists y.walks(x,y))(x)'))
+ print(lexpr(r'exists x.x = y'))
+ print(lexpr(r'exists x.(x = y)'))
+ print(lexpr('P(x) & x=y & P(y)'))
+ print(lexpr(r'\P Q.exists x.(P(x) & Q(x))'))
+ print(lexpr(r'man(x) <-> tall(x)'))
+
+ print('='*20 + 'Test simplify' + '='*20)
+ print(lexpr(r'\x.\y.sees(x,y)(john)(mary)').simplify())
+ print(lexpr(r'\x.\y.sees(x,y)(john, mary)').simplify())
+ print(lexpr(r'all x.(man(x) & (\x.exists y.walks(x,y))(x))').simplify())
+ print(lexpr(r'(\P.\Q.exists x.(P(x) & Q(x)))(\x.dog(x))(\x.bark(x))').simplify())
+
+ print('='*20 + 'Test alpha conversion and binder expression equality' + '='*20)
+ e1 = lexpr('exists x.P(x)')
print(e1)
- e2 = e1.alpha_convert(Variable("z"))
+ e2 = e1.alpha_convert(Variable('z'))
print(e2)
print(e1 == e2)
-
def demo_errors():
- print("=" * 20 + "Test reader errors" + "=" * 20)
- demoException("(P(x) & Q(x)")
- demoException("((P(x) &) & Q(x))")
- demoException("P(x) -> ")
- demoException("P(x")
- demoException("P(x,")
- demoException("P(x,)")
- demoException("exists")
- demoException("exists x.")
- demoException("\\")
- demoException("\\ x y.")
- demoException("P(x)Q(x)")
- demoException("(P(x)Q(x)")
- demoException("exists x -> y")
-
+ print('='*20 + 'Test reader errors' + '='*20)
+ demoException('(P(x) & Q(x)')
+ demoException('((P(x) &) & Q(x))')
+ demoException('P(x) -> ')
+ demoException('P(x')
+ demoException('P(x,')
+ demoException('P(x,)')
+ demoException('exists')
+ demoException('exists x.')
+ demoException('\\')
+ demoException('\\ x y.')
+ demoException('P(x)Q(x)')
+ demoException('(P(x)Q(x)')
+ demoException('exists x -> y')
def demoException(s):
    try:
        Expression.fromstring(s)
    except LogicalExpressionException as e:
print("%s: %s" % (e.__class__.__name__, e))
-
def printtype(ex):
print("%s : %s" % (ex.str(), ex.type))
-
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# demo_errors()
# Natural Language Toolkit: Relation Extraction
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
- A clause is an atom of the form ``relsym(subjsym, objsym)``,
where the relation, subject and object have been canonicalized to single strings.
"""
+from __future__ import print_function
# todo: get a more general solution to canonicalized symbols for clauses -- maybe use xmlcharrefs?
from collections import defaultdict
-import html
import re
+from six.moves import html_entities
+
# Dictionary that associates corpora with NE classes
NE_CLASSES = {
- "ieer": [
- "LOCATION",
- "ORGANIZATION",
- "PERSON",
- "DURATION",
- "DATE",
- "CARDINAL",
- "PERCENT",
- "MONEY",
- "MEASURE",
- ],
- "conll2002": ["LOC", "PER", "ORG"],
- "ace": [
- "LOCATION",
- "ORGANIZATION",
- "PERSON",
- "DURATION",
- "DATE",
- "CARDINAL",
- "PERCENT",
- "MONEY",
- "MEASURE",
- "FACILITY",
- "GPE",
- ],
-}
+ 'ieer': ['LOCATION', 'ORGANIZATION', 'PERSON', 'DURATION',
+ 'DATE', 'CARDINAL', 'PERCENT', 'MONEY', 'MEASURE'],
+ 'conll2002': ['LOC', 'PER', 'ORG'],
+ 'ace': ['LOCATION', 'ORGANIZATION', 'PERSON', 'DURATION',
+ 'DATE', 'CARDINAL', 'PERCENT', 'MONEY', 'MEASURE', 'FACILITY', 'GPE'],
+ }
# Allow abbreviated class labels
-short2long = dict(LOC="LOCATION", ORG="ORGANIZATION", PER="PERSON")
-long2short = dict(LOCATION="LOC", ORGANIZATION="ORG", PERSON="PER")
+short2long = dict(LOC = 'LOCATION', ORG = 'ORGANIZATION', PER = 'PERSON')
+long2short = dict(LOCATION ='LOC', ORGANIZATION = 'ORG', PERSON = 'PER')
def _expand(type):
except KeyError:
return type
-
def class_abbrev(type):
"""
Abbreviate an NE class name.
return type
-def _join(lst, sep=" ", untag=False):
+def _join(lst, sep=' ', untag=False):
"""
Join a list into a string, turning tags tuples into tag strings or just words.
:param untag: if ``True``, omit the tag from tagged input strings.
if untag:
return sep.join(tup[0] for tup in lst)
from nltk.tag import tuple2str
-
return sep.join(tuple2str(tup) for tup in lst)
-
-def descape_entity(m, defs=html.entities.entitydefs):
+def descape_entity(m, defs=html_entities.entitydefs):
"""
Translate one entity to its ISO Latin value.
Inspired by example from effbot.org
"""
+ #s = 'mcglashan_&_sarrail'
+ #l = ['mcglashan', '&', 'sarrail']
+ #pattern = re.compile("&(\w+?);")
+ #new = list2sym(l)
+ #s = pattern.sub(descape_entity, s)
+ #print s, new
try:
return defs[m.group(1)]
except KeyError:
- return m.group(0) # use as is
-
+ return m.group(0) # use as is
def list2sym(lst):
"""
:return: a Unicode string without whitespace
:rtype: unicode
"""
- sym = _join(lst, "_", untag=True)
+ sym = _join(lst, '_', untag=True)
sym = sym.lower()
ENT = re.compile("&(\w+?);")
sym = ENT.sub(descape_entity, sym)
- sym = sym.replace(".", "")
+ sym = sym.replace('.', '')
return sym
-
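As the body above shows, list2sym joins with underscores, lowercases, descapes HTML entities, and strips periods. A rough equivalent minus the entity handling (normalize is a hypothetical stand-in for illustration):

    def normalize(words):
        # join -> lowercase -> drop periods, mirroring list2sym
        return '_'.join(words).lower().replace('.', '')

    assert normalize(['U.S.', 'Steel']) == 'us_steel'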
def tree2semi_rel(tree):
"""
Group a chunk structure into a list of 'semi-relations' of the form (list(str), ``Tree``).
result = []
while len(pairs) > 2:
reldict = defaultdict(str)
- reldict["lcon"] = _join(pairs[0][0][-window:])
- reldict["subjclass"] = pairs[0][1].label()
- reldict["subjtext"] = _join(pairs[0][1].leaves())
- reldict["subjsym"] = list2sym(pairs[0][1].leaves())
- reldict["filler"] = _join(pairs[1][0])
- reldict["untagged_filler"] = _join(pairs[1][0], untag=True)
- reldict["objclass"] = pairs[1][1].label()
- reldict["objtext"] = _join(pairs[1][1].leaves())
- reldict["objsym"] = list2sym(pairs[1][1].leaves())
- reldict["rcon"] = _join(pairs[2][0][:window])
+ reldict['lcon'] = _join(pairs[0][0][-window:])
+ reldict['subjclass'] = pairs[0][1].label()
+ reldict['subjtext'] = _join(pairs[0][1].leaves())
+ reldict['subjsym'] = list2sym(pairs[0][1].leaves())
+ reldict['filler'] = _join(pairs[1][0])
+ reldict['untagged_filler'] = _join(pairs[1][0], untag=True)
+ reldict['objclass'] = pairs[1][1].label()
+ reldict['objtext'] = _join(pairs[1][1].leaves())
+ reldict['objsym'] = list2sym(pairs[1][1].leaves())
+ reldict['rcon'] = _join(pairs[2][0][:window])
if trace:
- print(
- "(%s(%s, %s)"
- % (
- reldict["untagged_filler"],
- reldict["subjclass"],
- reldict["objclass"],
- )
- )
+ print("(%s(%s, %s)" % (reldict['untagged_filler'], reldict['subjclass'], reldict['objclass']))
result.append(reldict)
pairs = pairs[1:]
return result
-
-def extract_rels(subjclass, objclass, doc, corpus="ace", pattern=None, window=10):
+def extract_rels(subjclass, objclass, doc, corpus='ace', pattern=None, window=10):
"""
Filter the output of ``semi_rel2reldict`` according to specified NE classes and a filler pattern.
if _expand(subjclass) in NE_CLASSES[corpus]:
subjclass = _expand(subjclass)
else:
- raise ValueError(
- "your value for the subject type has not been recognized: %s"
- % subjclass
- )
+ raise ValueError("your value for the subject type has not been recognized: %s" % subjclass)
if objclass and objclass not in NE_CLASSES[corpus]:
if _expand(objclass) in NE_CLASSES[corpus]:
objclass = _expand(objclass)
else:
- raise ValueError(
- "your value for the object type has not been recognized: %s" % objclass
- )
+ raise ValueError("your value for the object type has not been recognized: %s" % objclass)
- if corpus == "ace" or corpus == "conll2002":
+ if corpus == 'ace' or corpus == 'conll2002':
pairs = tree2semi_rel(doc)
- elif corpus == "ieer":
+ elif corpus == 'ieer':
pairs = tree2semi_rel(doc.text) + tree2semi_rel(doc.headline)
else:
raise ValueError("corpus type not recognized")
reldicts = semi_rel2reldict(pairs)
- relfilter = lambda x: (
- x["subjclass"] == subjclass
- and len(x["filler"].split()) <= window
- and pattern.match(x["filler"])
- and x["objclass"] == objclass
- )
+ relfilter = lambda x: (x['subjclass'] == subjclass and
+ len(x['filler'].split()) <= window and
+ pattern.match(x['filler']) and
+ x['objclass'] == objclass)
return list(filter(relfilter, reldicts))
:param reldict: a relation dictionary
:type reldict: defaultdict
"""
- items = [
- class_abbrev(reldict["subjclass"]),
- reldict["subjtext"],
- reldict["filler"],
- class_abbrev(reldict["objclass"]),
- reldict["objtext"],
- ]
- format = "[%s: %r] %r [%s: %r]"
+ items = [class_abbrev(reldict['subjclass']), reldict['subjtext'], reldict['filler'], class_abbrev(reldict['objclass']), reldict['objtext']]
+ format = '[%s: %r] %r [%s: %r]'
if lcon:
- items = [reldict["lcon"]] + items
- format = "...%r)" + format
+ items = [reldict['lcon']] + items
+ format = '...%r)' + format
if rcon:
- items.append(reldict["rcon"])
- format = format + "(%r..."
+ items.append(reldict['rcon'])
+ format = format + '(%r...'
printargs = tuple(items)
return format % printargs
-
def clause(reldict, relsym):
"""
Print the relation in clausal form.
:param relsym: a label for the relation
:type relsym: str
"""
- items = (relsym, reldict["subjsym"], reldict["objsym"])
+ items = (relsym, reldict['subjsym'], reldict['objsym'])
return "%s(%r, %r)" % items
query.
"""
from nltk.corpus import ieer
-
if sql:
try:
import sqlite3
-
- connection = sqlite3.connect(":memory:")
+ connection = sqlite3.connect(":memory:")
connection.text_factory = sqlite3.OptimizedUnicode
cur = connection.cursor()
- cur.execute(
- """create table Locations
- (OrgName text, LocationName text, DocID text)"""
- )
+ cur.execute("""create table Locations
+ (OrgName text, LocationName text, DocID text)""")
except ImportError:
import warnings
-
warnings.warn("Cannot import sqlite; sql flag will be ignored.")
- IN = re.compile(r".*\bin\b(?!\b.+ing)")
+
+ IN = re.compile(r'.*\bin\b(?!\b.+ing)')
print()
print("IEER: in(ORG, LOC) -- just the clauses:")
if trace:
print(doc.docno)
print("=" * 15)
- for rel in extract_rels("ORG", "LOC", doc, corpus="ieer", pattern=IN):
- print(clause(rel, relsym="IN"))
+ for rel in extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
+ print(clause(rel, relsym='IN'))
if sql:
try:
- rtuple = (rel["subjtext"], rel["objtext"], doc.docno)
- cur.execute(
- """insert into Locations
- values (?, ?, ?)""",
- rtuple,
- )
+ rtuple = (rel['subjtext'], rel['objtext'], doc.docno)
+ cur.execute("""insert into Locations
+ values (?, ?, ?)""", rtuple)
connection.commit()
except NameError:
pass
if sql:
try:
- cur.execute(
- """select OrgName from Locations
- where LocationName = 'Atlanta'"""
- )
+ cur.execute("""select OrgName from Locations
+ where LocationName = 'Atlanta'""")
print()
print("Extract data from SQL table: ORGs in Atlanta")
print("-" * 15)
# Example of has_role(PER, LOC)
############################################
-
def roles_demo(trace=0):
from nltk.corpus import ieer
-
roles = """
(.*( # assorted roles
analyst|
print(doc.docno)
print("=" * 15)
lcon = rcon = True
- for rel in extract_rels("PER", "ORG", doc, corpus="ieer", pattern=ROLES):
+ for rel in extract_rels('PER', 'ORG', doc, corpus='ieer', pattern=ROLES):
print(rtuple(rel, lcon=lcon, rcon=rcon))
print("IEER: First 20 Headlines")
print("=" * 45)
- trees = [
- (doc.docno, doc.headline)
- for file in ieer.fileids()
- for doc in ieer.parsed_docs(file)
- ]
+ trees = [(doc.docno, doc.headline) for file in ieer.fileids() for doc in ieer.parsed_docs(file)]
for tree in trees[:20]:
print()
print("%s:\n%s" % tree)
+
#############################################
## Dutch CONLL2002: take_on_role(PER, ORG)
#############################################
-
def conllned(trace=1):
"""
Find the copula+'van' relation ('of') in the Dutch tagged training corpus
print("Dutch CoNLL2002: van(PER, ORG) -- raw rtuples with context:")
print("=" * 45)
- for doc in conll2002.chunked_sents("ned.train"):
+
+ for doc in conll2002.chunked_sents('ned.train'):
lcon = rcon = False
if trace:
- lcon = rcon = True
- for rel in extract_rels(
- "PER", "ORG", doc, corpus="conll2002", pattern=VAN, window=10
- ):
+ lcon = rcon = True
+ for rel in extract_rels('PER', 'ORG', doc, corpus='conll2002', pattern=VAN, window=10):
print(rtuple(rel, lcon=lcon, rcon=rcon))
-
#############################################
## Spanish CONLL2002: (PER, ORG)
#############################################
-
def conllesp():
from nltk.corpus import conll2002
print()
print("Spanish CoNLL2002: de(ORG, LOC) -- just the first 10 clauses:")
print("=" * 45)
- rels = [
- rel
- for doc in conll2002.chunked_sents("esp.train")
- for rel in extract_rels("ORG", "LOC", doc, corpus="conll2002", pattern=DE)
- ]
- for r in rels[:10]:
- print(clause(r, relsym="DE"))
+ rels = [rel for doc in conll2002.chunked_sents('esp.train')
+ for rel in extract_rels('ORG', 'LOC', doc, corpus='conll2002', pattern = DE)]
+ for r in rels[:10]: print(clause(r, relsym='DE'))
print()
print()
print("1500 Sentences from Penn Treebank, as processed by NLTK NE Chunker")
print("=" * 45)
- ROLE = re.compile(
- r".*(chairman|president|trader|scientist|economist|analyst|partner).*"
- )
+ ROLE = re.compile(r'.*(chairman|president|trader|scientist|economist|analyst|partner).*')
rels = []
for i, sent in enumerate(nltk.corpus.treebank.tagged_sents()[:1500]):
sent = nltk.ne_chunk(sent)
- rels = extract_rels("PER", "ORG", sent, corpus="ace", pattern=ROLE, window=7)
+ rels = extract_rels('PER', 'ORG', sent, corpus='ace', pattern=ROLE, window=7)
for rel in rels:
- print("{0:<5}{1}".format(i, rtuple(rel)))
+ print('{0:<5}{1}'.format(i, rtuple(rel)))
-if __name__ == "__main__":
+if __name__ == '__main__':
import nltk
from nltk.sem import relextract
-
in_demo(trace=0)
roles_demo(trace=0)
conllned()
#
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
-from nltk.sem.logic import (
- AllExpression,
- AndExpression,
- ApplicationExpression,
- EqualityExpression,
- ExistsExpression,
- IffExpression,
- ImpExpression,
- NegatedExpression,
- OrExpression,
- VariableExpression,
- skolem_function,
- unique_variable,
-)
-
+from nltk.sem.logic import (AllExpression, AndExpression, ApplicationExpression,
+ EqualityExpression, ExistsExpression, IffExpression,
+ ImpExpression, NegatedExpression, OrExpression,
+ VariableExpression, skolem_function, unique_variable)
def skolemize(expression, univ_scope=None, used_variables=None):
"""
used_variables = set()
if isinstance(expression, AllExpression):
- term = skolemize(
- expression.term,
- univ_scope | set([expression.variable]),
- used_variables | set([expression.variable]),
- )
- return term.replace(
- expression.variable,
- VariableExpression(unique_variable(ignore=used_variables)),
- )
+ term = skolemize(expression.term, univ_scope|set([expression.variable]), used_variables|set([expression.variable]))
+ return term.replace(expression.variable, VariableExpression(unique_variable(ignore=used_variables)))
elif isinstance(expression, AndExpression):
- return skolemize(expression.first, univ_scope, used_variables) & skolemize(
- expression.second, univ_scope, used_variables
- )
+ return skolemize(expression.first, univ_scope, used_variables) &\
+ skolemize(expression.second, univ_scope, used_variables)
elif isinstance(expression, OrExpression):
- return to_cnf(
- skolemize(expression.first, univ_scope, used_variables),
- skolemize(expression.second, univ_scope, used_variables),
- )
+ return to_cnf(skolemize(expression.first, univ_scope, used_variables),
+ skolemize(expression.second, univ_scope, used_variables))
elif isinstance(expression, ImpExpression):
- return to_cnf(
- skolemize(-expression.first, univ_scope, used_variables),
- skolemize(expression.second, univ_scope, used_variables),
- )
+ return to_cnf(skolemize(-expression.first, univ_scope, used_variables),
+ skolemize(expression.second, univ_scope, used_variables))
elif isinstance(expression, IffExpression):
- return to_cnf(
- skolemize(-expression.first, univ_scope, used_variables),
- skolemize(expression.second, univ_scope, used_variables),
- ) & to_cnf(
- skolemize(expression.first, univ_scope, used_variables),
- skolemize(-expression.second, univ_scope, used_variables),
- )
+ return to_cnf(skolemize(-expression.first, univ_scope, used_variables),
+ skolemize(expression.second, univ_scope, used_variables)) &\
+ to_cnf(skolemize(expression.first, univ_scope, used_variables),
+ skolemize(-expression.second, univ_scope, used_variables))
elif isinstance(expression, EqualityExpression):
return expression
elif isinstance(expression, NegatedExpression):
negated = expression.term
if isinstance(negated, AllExpression):
- term = skolemize(
- -negated.term, univ_scope, used_variables | set([negated.variable])
- )
+ term = skolemize(-negated.term, univ_scope, used_variables|set([negated.variable]))
if univ_scope:
return term.replace(negated.variable, skolem_function(univ_scope))
else:
- skolem_constant = VariableExpression(
- unique_variable(ignore=used_variables)
- )
+ skolem_constant = VariableExpression(unique_variable(ignore=used_variables))
return term.replace(negated.variable, skolem_constant)
elif isinstance(negated, AndExpression):
- return to_cnf(
- skolemize(-negated.first, univ_scope, used_variables),
- skolemize(-negated.second, univ_scope, used_variables),
- )
+ return to_cnf(skolemize(-negated.first, univ_scope, used_variables),
+ skolemize(-negated.second, univ_scope, used_variables))
elif isinstance(negated, OrExpression):
- return skolemize(-negated.first, univ_scope, used_variables) & skolemize(
- -negated.second, univ_scope, used_variables
- )
+ return skolemize(-negated.first, univ_scope, used_variables) &\
+ skolemize(-negated.second, univ_scope, used_variables)
elif isinstance(negated, ImpExpression):
- return skolemize(negated.first, univ_scope, used_variables) & skolemize(
- -negated.second, univ_scope, used_variables
- )
+ return skolemize(negated.first, univ_scope, used_variables) &\
+ skolemize(-negated.second, univ_scope, used_variables)
elif isinstance(negated, IffExpression):
- return to_cnf(
- skolemize(-negated.first, univ_scope, used_variables),
- skolemize(-negated.second, univ_scope, used_variables),
- ) & to_cnf(
- skolemize(negated.first, univ_scope, used_variables),
- skolemize(negated.second, univ_scope, used_variables),
- )
+ return to_cnf(skolemize(-negated.first, univ_scope, used_variables),
+ skolemize(-negated.second, univ_scope, used_variables)) &\
+ to_cnf(skolemize(negated.first, univ_scope, used_variables),
+ skolemize(negated.second, univ_scope, used_variables))
elif isinstance(negated, EqualityExpression):
return expression
elif isinstance(negated, NegatedExpression):
return skolemize(negated.term, univ_scope, used_variables)
elif isinstance(negated, ExistsExpression):
- term = skolemize(
- -negated.term,
- univ_scope | set([negated.variable]),
- used_variables | set([negated.variable]),
- )
- return term.replace(
- negated.variable,
- VariableExpression(unique_variable(ignore=used_variables)),
- )
+ term = skolemize(-negated.term, univ_scope|set([negated.variable]), used_variables|set([negated.variable]))
+ return term.replace(negated.variable, VariableExpression(unique_variable(ignore=used_variables)))
elif isinstance(negated, ApplicationExpression):
return expression
else:
- raise Exception("'%s' cannot be skolemized" % expression)
+ raise Exception('\'%s\' cannot be skolemized' % expression)
elif isinstance(expression, ExistsExpression):
- term = skolemize(
- expression.term, univ_scope, used_variables | set([expression.variable])
- )
+ term = skolemize(expression.term, univ_scope, used_variables|set([expression.variable]))
if univ_scope:
return term.replace(expression.variable, skolem_function(univ_scope))
else:
elif isinstance(expression, ApplicationExpression):
return expression
else:
- raise Exception("'%s' cannot be skolemized" % expression)
-
+ raise Exception('\'%s\' cannot be skolemized' % expression)
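For orientation: skolemize eliminates quantifiers, and an existential nested under universals is replaced by a Skolem function of the universally bound variables. A minimal sketch (fresh variable and function names are generated internally, so the exact identifiers may differ):

    from nltk.sem.logic import Expression
    from nltk.sem.skolemize import skolemize

    e = Expression.fromstring(r'all x.exists y.loves(x,y)')
    print(skolemize(e))   # something like: loves(z1,F1(z1))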
def to_cnf(first, second):
"""
#
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
syntax tree, followed by evaluation of the semantic representation in
a first-order model.
"""
+from __future__ import print_function, unicode_literals
import codecs
from nltk.sem import evaluate
## Utility functions for connecting parse output to semantics
##############################################################
-
def parse_sents(inputs, grammar, trace=0):
"""
Convert input sentences into syntactic trees.
cp = load_parser(grammar, trace=trace)
parses = []
for sent in inputs:
- tokens = sent.split() # use a tokenizer?
+ tokens = sent.split() # use a tokenizer?
syntrees = list(cp.parse(tokens))
parses.append(syntrees)
return parses
-
-def root_semrep(syntree, semkey="SEM"):
+def root_semrep(syntree, semkey='SEM'):
"""
Find the semantic representation at the root of a tree.
try:
return node[semkey]
except KeyError:
- print(node, end=" ")
+ print(node, end=' ')
print("has no specification for the feature %s" % semkey)
raise
-
-def interpret_sents(inputs, grammar, semkey="SEM", trace=0):
+def interpret_sents(inputs, grammar, semkey='SEM', trace=0):
"""
Add the semantic representation to each syntactic parse tree
of each input sentence.
:return: a mapping from sentences to lists of pairs (parse-tree, semantic-representations)
:rtype: list(list(tuple(nltk.tree.Tree, nltk.sem.logic.ConstantExpression)))
"""
- return [
- [(syn, root_semrep(syn, semkey)) for syn in syntrees]
- for syntrees in parse_sents(inputs, grammar, trace=trace)
- ]
-
+ return [[(syn, root_semrep(syn, semkey)) for syn in syntrees]
+ for syntrees in parse_sents(inputs, grammar, trace=trace)]
def evaluate_sents(inputs, grammar, model, assignment, trace=0):
"""
:return: a mapping from sentences to lists of triples (parse-tree, semantic-representations, evaluation-in-model)
:rtype: list(list(tuple(nltk.tree.Tree, nltk.sem.logic.ConstantExpression, bool or dict(str): bool)))
"""
- return [
- [
- (syn, sem, model.evaluate("%s" % sem, assignment, trace=trace))
- for (syn, sem) in interpretations
- ]
- for interpretations in interpret_sents(inputs, grammar)
- ]
+ return [[(syn, sem, model.evaluate("%s" % sem, assignment, trace=trace))
+ for (syn, sem) in interpretations]
+ for interpretations in interpret_sents(inputs, grammar)]
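A minimal end-to-end sketch of interpret_sents, assuming the sample grammars from the NLTK data package are installed; the exact formula printed depends on the grammar:

    from nltk.sem.util import interpret_sents

    readings = interpret_sents(['every girl chases a dog'],
                               'grammars/sample_grammars/sem2.fcfg')
    for syn, sem in readings[0]:
        print(sem)   # roughly: all x.(girl(x) -> exists z.(dog(z) & chase(x,z)))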
def demo_model0():
global m0, g0
- # Initialize a valuation of non-logical constants."""
- v = [
- ("john", "b1"),
- ("mary", "g1"),
- ("suzie", "g2"),
- ("fido", "d1"),
- ("tess", "d2"),
- ("noosa", "n"),
- ("girl", set(["g1", "g2"])),
- ("boy", set(["b1", "b2"])),
- ("dog", set(["d1", "d2"])),
- ("bark", set(["d1", "d2"])),
- ("walk", set(["b1", "g2", "d1"])),
- ("chase", set([("b1", "g1"), ("b2", "g1"), ("g1", "d1"), ("g2", "d2")])),
- (
- "see",
- set([("b1", "g1"), ("b2", "d2"), ("g1", "b1"), ("d2", "b1"), ("g2", "n")]),
- ),
- ("in", set([("b1", "n"), ("b2", "n"), ("d2", "n")])),
- ("with", set([("b1", "g1"), ("g1", "b1"), ("d1", "b1"), ("b1", "d1")])),
- ]
- # Read in the data from ``v``
+ #Initialize a valuation of non-logical constants.
+ v = [('john', 'b1'),
+ ('mary', 'g1'),
+ ('suzie', 'g2'),
+ ('fido', 'd1'),
+ ('tess', 'd2'),
+ ('noosa', 'n'),
+ ('girl', set(['g1', 'g2'])),
+ ('boy', set(['b1', 'b2'])),
+ ('dog', set(['d1', 'd2'])),
+ ('bark', set(['d1', 'd2'])),
+ ('walk', set(['b1', 'g2', 'd1'])),
+ ('chase', set([('b1', 'g1'), ('b2', 'g1'), ('g1', 'd1'), ('g2', 'd2')])),
+ ('see', set([('b1', 'g1'), ('b2', 'd2'), ('g1', 'b1'),('d2', 'b1'), ('g2', 'n')])),
+ ('in', set([('b1', 'n'), ('b2', 'n'), ('d2', 'n')])),
+ ('with', set([('b1', 'g1'), ('g1', 'b1'), ('d1', 'b1'), ('b1', 'd1')]))
+ ]
+ #Read in the data from ``v``
val = evaluate.Valuation(v)
- # Bind ``dom`` to the ``domain`` property of ``val``
+ #Bind ``dom`` to the ``domain`` property of ``val``
dom = val.domain
- # Initialize a model with parameters ``dom`` and ``val``.
+ #Initialize a model with parameters ``dom`` and ``val``.
m0 = evaluate.Model(dom, val)
- # Initialize a variable assignment with parameter ``dom``
+ #Initialize a variable assignment with parameter ``dom``
g0 = evaluate.Assignment(dom)
-def read_sents(filename, encoding="utf8"):
- with codecs.open(filename, "r", encoding) as fp:
+def read_sents(filename, encoding='utf8'):
+ with codecs.open(filename, 'r', encoding) as fp:
sents = [l.rstrip() for l in fp]
# get rid of blank lines
sents = [l for l in sents if len(l) > 0]
- sents = [l for l in sents if not l[0] == "#"]
+ sents = [l for l in sents if not l[0] == '#']
return sents
-
def demo_legacy_grammar():
"""
Check that interpret_sents() is compatible with legacy grammars that use
"""
from nltk.grammar import FeatureGrammar
- g = FeatureGrammar.fromstring(
- """
+ g = FeatureGrammar.fromstring("""
% start S
S[sem=<hello>] -> 'hello'
- """
- )
+ """)
print("Reading grammar: %s" % g)
print("*" * 20)
- for reading in interpret_sents(["hello"], g, semkey="sem"):
+ for reading in interpret_sents(['hello'], g, semkey='sem'):
syn, sem = reading[0]
print()
print("output: ", sem)
-
def demo():
import sys
from optparse import OptionParser
-
- description = """
+ description = \
+ """
Parse and evaluate some sentences.
"""
opts = OptionParser(description=description)
- opts.set_defaults(
- evaluate=True,
- beta=True,
- syntrace=0,
- semtrace=0,
- demo="default",
- grammar="",
- sentences="",
- )
-
- opts.add_option(
- "-d",
- "--demo",
- dest="demo",
- help="choose demo D; omit this for the default demo, or specify 'chat80'",
- metavar="D",
- )
- opts.add_option(
- "-g", "--gram", dest="grammar", help="read in grammar G", metavar="G"
- )
- opts.add_option(
- "-m",
- "--model",
- dest="model",
- help="import model M (omit '.py' suffix)",
- metavar="M",
- )
- opts.add_option(
- "-s",
- "--sentences",
- dest="sentences",
- help="read in a file of test sentences S",
- metavar="S",
- )
- opts.add_option(
- "-e",
- "--no-eval",
- action="store_false",
- dest="evaluate",
- help="just do a syntactic analysis",
- )
- opts.add_option(
- "-b",
- "--no-beta-reduction",
- action="store_false",
- dest="beta",
- help="don't carry out beta-reduction",
- )
- opts.add_option(
- "-t",
- "--syntrace",
- action="count",
- dest="syntrace",
- help="set syntactic tracing on; requires '-e' option",
- )
- opts.add_option(
- "-T",
- "--semtrace",
- action="count",
- dest="semtrace",
- help="set semantic tracing on",
- )
+ opts.set_defaults(evaluate=True, beta=True, syntrace=0,
+ semtrace=0, demo='default', grammar='', sentences='')
+
+ opts.add_option("-d", "--demo", dest="demo",
+ help="choose demo D; omit this for the default demo, or specify 'chat80'", metavar="D")
+ opts.add_option("-g", "--gram", dest="grammar",
+ help="read in grammar G", metavar="G")
+ opts.add_option("-m", "--model", dest="model",
+ help="import model M (omit '.py' suffix)", metavar="M")
+ opts.add_option("-s", "--sentences", dest="sentences",
+ help="read in a file of test sentences S", metavar="S")
+ opts.add_option("-e", "--no-eval", action="store_false", dest="evaluate",
+ help="just do a syntactic analysis")
+ opts.add_option("-b", "--no-beta-reduction", action="store_false",
+ dest="beta", help="don't carry out beta-reduction")
+ opts.add_option("-t", "--syntrace", action="count", dest="syntrace",
+ help="set syntactic tracing on; requires '-e' option")
+ opts.add_option("-T", "--semtrace", action="count", dest="semtrace",
+ help="set semantic tracing on")
(options, args) = opts.parse_args()
- SPACER = "-" * 30
+ SPACER = '-' * 30
demo_model0()
sents = [
- "Fido sees a boy with Mary",
- "John sees Mary",
- "every girl chases a dog",
- "every boy chases a girl",
- "John walks with a girl in Noosa",
- "who walks",
- ]
+ 'Fido sees a boy with Mary',
+ 'John sees Mary',
+ 'every girl chases a dog',
+ 'every boy chases a girl',
+ 'John walks with a girl in Noosa',
+ 'who walks']
- gramfile = "grammars/sample_grammars/sem2.fcfg"
+ gramfile = 'grammars/sample_grammars/sem2.fcfg'
if options.sentences:
sentsfile = options.sentences
g = g0
if options.evaluate:
- evaluations = evaluate_sents(sents, gramfile, model, g, trace=options.semtrace)
+ evaluations = \
+ evaluate_sents(sents, gramfile, model, g, trace=options.semtrace)
else:
- semreps = interpret_sents(sents, gramfile, trace=options.syntrace)
+ semreps = \
+ interpret_sents(sents, gramfile, trace=options.syntrace)
for i, sent in enumerate(sents):
n = 1
- print("\nSentence: %s" % sent)
+ print('\nSentence: %s' % sent)
print(SPACER)
if options.evaluate:
for (syntree, semrep, value) in evaluations[i]:
if isinstance(value, dict):
value = set(value.keys())
- print("%d: %s" % (n, semrep))
+ print('%d: %s' % (n, semrep))
print(value)
n += 1
else:
for (syntree, semrep) in semreps[i]:
- print("%d: %s" % (n, semrep))
+ print('%d: %s' % (n, semrep))
n += 1
-
if __name__ == "__main__":
demo()
demo_legacy_grammar()
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Sentiment Analysis
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
#
# Natural Language Toolkit: Sentiment Analyzer
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
purposes.
"""
-import sys
+from __future__ import print_function
from collections import defaultdict
from nltk.classify.util import apply_features, accuracy as eval_accuracy
from nltk.collocations import BigramCollocationFinder
-from nltk.metrics import (
- BigramAssocMeasures,
- precision as eval_precision,
- recall as eval_recall,
- f_measure as eval_f_measure,
-)
+from nltk.metrics import (BigramAssocMeasures, precision as eval_precision,
+ recall as eval_recall, f_measure as eval_f_measure)
from nltk.probability import FreqDist
+from nltk.sentiment.util import save_file, timer
class SentimentAnalyzer(object):
"""
A Sentiment Analysis tool based on machine learning approaches.
"""
-
def __init__(self, classifier=None):
self.feat_extractors = defaultdict(list)
self.classifier = classifier
"""
# Stopwords are not removed
unigram_feats_freqs = FreqDist(word for word in words)
- return [
- w
- for w, f in unigram_feats_freqs.most_common(top_n)
- if unigram_feats_freqs[w] > min_freq
- ]
+ return [w for w, f in unigram_feats_freqs.most_common(top_n)
+ if unigram_feats_freqs[w] > min_freq]
- def bigram_collocation_feats(
- self, documents, top_n=None, min_freq=3, assoc_measure=BigramAssocMeasures.pmi
- ):
+ def bigram_collocation_feats(self, documents, top_n=None, min_freq=3,
+ assoc_measure=BigramAssocMeasures.pmi):
"""
Return `top_n` bigram features (using `assoc_measure`).
Note that this method is based on bigram collocation measures, and not
:param kwargs: additional parameters that will be passed as arguments to
the classifier `train` function.
:return: A classifier instance trained on the training set.
- :rtype:
+ :rtype:
"""
print("Training classifier")
self.classifier = trainer(training_set, **kwargs)
if save_classifier:
- self.save_file(self.classifier, save_classifier)
+ save_file(self.classifier, save_classifier)
return self.classifier
- def save_file(self, content, filename):
- """
- Store `content` in `filename`. Can be used to store a SentimentAnalyzer.
- """
- print("Saving", filename, file=sys.stderr)
- with open(filename, 'wb') as storage_file:
- # The protocol=2 parameter is for python2 compatibility
- pickle.dump(content, storage_file, protocol=2)
-
- def evaluate(
- self,
- test_set,
- classifier=None,
- accuracy=True,
- f_measure=True,
- precision=True,
- recall=True,
- verbose=False,
- ):
+ def evaluate(self, test_set, classifier=None, accuracy=True, f_measure=True,
+ precision=True, recall=True, verbose=False):
"""
Evaluate and print classifier performance on the test set.
metrics_results = {}
if accuracy == True:
accuracy_score = eval_accuracy(classifier, test_set)
- metrics_results["Accuracy"] = accuracy_score
+ metrics_results['Accuracy'] = accuracy_score
gold_results = defaultdict(set)
test_results = defaultdict(set)
for label in labels:
if precision == True:
- precision_score = eval_precision(
- gold_results[label], test_results[label]
- )
- metrics_results["Precision [{0}]".format(label)] = precision_score
+ precision_score = eval_precision(gold_results[label],
+ test_results[label])
+ metrics_results['Precision [{0}]'.format(label)] = precision_score
if recall == True:
- recall_score = eval_recall(gold_results[label], test_results[label])
- metrics_results["Recall [{0}]".format(label)] = recall_score
+ recall_score = eval_recall(gold_results[label],
+ test_results[label])
+ metrics_results['Recall [{0}]'.format(label)] = recall_score
if f_measure == True:
- f_measure_score = eval_f_measure(
- gold_results[label], test_results[label]
- )
- metrics_results["F-measure [{0}]".format(label)] = f_measure_score
+ f_measure_score = eval_f_measure(gold_results[label],
+ test_results[label])
+ metrics_results['F-measure [{0}]'.format(label)] = f_measure_score
# Print evaluation results (in alphabetical order)
if verbose == True:
for result in sorted(metrics_results):
- print("{0}: {1}".format(result, metrics_results[result]))
+ print('{0}: {1}'.format(result, metrics_results[result]))
return metrics_results
#
# Natural Language Toolkit: Sentiment Analyzer
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Utility methods for Sentiment Analysis.
"""
+from __future__ import division
import codecs
import csv
import sys
import time
from copy import deepcopy
+from itertools import tee
import nltk
from nltk.corpus import CategorizedPlaintextCorpusReader
from nltk.data import load
from nltk.tokenize.casual import EMOTICON_RE
+from nltk.twitter.common import outf_writer_compat, extract_fields
-# ////////////////////////////////////////////////////////////
-# { Regular expressions
-# ////////////////////////////////////////////////////////////
+#////////////////////////////////////////////////////////////
+#{ Regular expressions
+#////////////////////////////////////////////////////////////
# Regular expression for negation by Christopher Potts
NEGATION = r"""
NEGATION_RE = re.compile(NEGATION, re.VERBOSE)
-CLAUSE_PUNCT = r"^[.:;!?]$"
+CLAUSE_PUNCT = r'^[.:;!?]$'
CLAUSE_PUNCT_RE = re.compile(CLAUSE_PUNCT)
# Happy and sad emoticons
-HAPPY = set(
- [
- ":-)",
- ":)",
- ";)",
- ":o)",
- ":]",
- ":3",
- ":c)",
- ":>",
- "=]",
- "8)",
- "=)",
- ":}",
- ":^)",
- ":-D",
- ":D",
- "8-D",
- "8D",
- "x-D",
- "xD",
- "X-D",
- "XD",
- "=-D",
- "=D",
- "=-3",
- "=3",
- ":-))",
- ":'-)",
- ":')",
- ":*",
- ":^*",
- ">:P",
- ":-P",
- ":P",
- "X-P",
- "x-p",
- "xp",
- "XP",
- ":-p",
- ":p",
- "=p",
- ":-b",
- ":b",
- ">:)",
- ">;)",
- ">:-)",
- "<3",
- ]
-)
-
-SAD = set(
- [
- ":L",
- ":-/",
- ">:/",
- ":S",
- ">:[",
- ":@",
- ":-(",
- ":[",
- ":-||",
- "=L",
- ":<",
- ":-[",
- ":-<",
- "=\\",
- "=/",
- ">:(",
- ":(",
- ">.<",
- ":'-(",
- ":'(",
- ":\\",
- ":-c",
- ":c",
- ":{",
- ">:\\",
- ";(",
- ]
-)
+HAPPY = set([
+ ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
+ ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
+ '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
+ 'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
+ '<3'
+ ])
+
+SAD = set([
+ ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
+ ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
+ ':c', ':{', '>:\\', ';('
+ ])
def timer(method):
"""
A timer decorator to measure execution performance of methods.
"""
-
def timed(*args, **kw):
start = time.time()
result = method(*args, **kw)
# in Python 2.x round() will return a float, so we convert it to int
secs = int(round(tot_time % 60))
if hours == 0 and mins == 0 and secs < 10:
- print("[TIMER] {0}(): {:.3f} seconds".format(method.__name__, tot_time))
+ print('[TIMER] {0}(): {1:.3f} seconds'.format(method.__name__, tot_time))  # explicit index: mixing {0} with {} raises ValueError
else:
- print(
- "[TIMER] {0}(): {1}h {2}m {3}s".format(
- method.__name__, hours, mins, secs
- )
- )
+ print('[TIMER] {0}(): {1}h {2}m {3}s'.format(method.__name__, hours, mins, secs))
return result
-
return timed
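Usage is the ordinary decorator pattern; any function wrapped with @timer reports its wall-clock time after returning (the explicit-index format fix above is what makes the fast path work). A tiny sketch:

    import time
    from nltk.sentiment.util import timer

    @timer
    def pause():
        time.sleep(0.05)

    pause()   # prints something like: [TIMER] pause(): 0.050 seconds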
-# ////////////////////////////////////////////////////////////
-# { Feature extractor functions
-# ////////////////////////////////////////////////////////////
+def pairwise(iterable):
+ """s -> (s0,s1), (s1,s2), (s2, s3), ..."""
+ a, b = tee(iterable)
+ next(b, None)
+ return zip(a, b)
+
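The added pairwise helper is the standard tee/zip sliding-window recipe; shown self-contained for reference:

    from itertools import tee

    def pairwise(iterable):
        a, b = tee(iterable)
        next(b, None)          # advance the second iterator by one
        return zip(a, b)

    assert list(pairwise([1, 2, 3, 4])) == [(1, 2), (2, 3), (3, 4)]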
+#////////////////////////////////////////////////////////////
+#{ Feature extractor functions
+#////////////////////////////////////////////////////////////
"""
Feature extractor functions are declared outside the SentimentAnalyzer class.
Users should be able to create their own feature extractors
without modifying SentimentAnalyzer.
"""
-
def extract_unigram_feats(document, unigrams, handle_negation=False):
"""
Populate a dictionary of unigram features, reflecting the presence/absence in
if handle_negation:
document = mark_negation(document)
for word in unigrams:
- features["contains({0})".format(word)] = word in set(document)
+ features['contains({0})'.format(word)] = word in set(document)
return features
-
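The resulting dictionary has one boolean entry per candidate unigram, keyed 'contains(word)'. For example:

    from nltk.sentiment.util import extract_unigram_feats

    feats = extract_unigram_feats(['a', 'truly', 'nice', 'movie'],
                                  unigrams=['nice', 'awful'])
    print(feats)   # {'contains(nice)': True, 'contains(awful)': False}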
def extract_bigram_feats(document, bigrams):
"""
Populate a dictionary of bigram features, reflecting the presence/absence in
"""
features = {}
for bigr in bigrams:
- features["contains({0} - {1})".format(bigr[0], bigr[1])] = bigr in nltk.bigrams(
- document
- )
+ features['contains({0} - {1})'.format(bigr[0], bigr[1])] = bigr in nltk.bigrams(document)
return features
-
-# ////////////////////////////////////////////////////////////
-# { Helper Functions
-# ////////////////////////////////////////////////////////////
-
+#////////////////////////////////////////////////////////////
+#{ Helper Functions
+#////////////////////////////////////////////////////////////
def mark_negation(document, double_neg_flip=False, shallow=False):
"""
neg_scope = not neg_scope
continue
else:
- doc[i] += "_NEG"
+ doc[i] += '_NEG'
elif neg_scope and CLAUSE_PUNCT_RE.search(word):
neg_scope = not neg_scope
elif neg_scope and not CLAUSE_PUNCT_RE.search(word):
- doc[i] += "_NEG"
+ doc[i] += '_NEG'
return document
-
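For reference, the _NEG marking behaves as follows: tokens after a negation trigger get the suffix until clause punctuation closes the scope (this mirrors the example in NLTK's own docstring):

    from nltk.sentiment.util import mark_negation

    sent = "I didn't like this movie . it was bad".split()
    print(mark_negation(sent))
    # ['I', "didn't", 'like_NEG', 'this_NEG', 'movie_NEG', '.', 'it', 'was', 'bad']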
def output_markdown(filename, **kwargs):
"""
Write the output of an analysis to a file.
"""
- with codecs.open(filename, "at") as outfile:
- text = "\n*** \n\n"
- text += "{0} \n\n".format(time.strftime("%d/%m/%Y, %H:%M"))
+ with codecs.open(filename, 'at') as outfile:
+ text = '\n*** \n\n'
+ text += '{0} \n\n'.format(time.strftime("%d/%m/%Y, %H:%M"))
for k in sorted(kwargs):
if isinstance(kwargs[k], dict):
dictionary = kwargs[k]
- text += " - **{0}:**\n".format(k)
+ text += ' - **{0}:**\n'.format(k)
for entry in sorted(dictionary):
- text += " - {0}: {1} \n".format(entry, dictionary[entry])
+ text += ' - {0}: {1} \n'.format(entry, dictionary[entry])
elif isinstance(kwargs[k], list):
- text += " - **{0}:**\n".format(k)
+ text += ' - **{0}:**\n'.format(k)
for entry in kwargs[k]:
- text += " - {0}\n".format(entry)
+ text += ' - {0}\n'.format(entry)
else:
- text += " - **{0}:** {1} \n".format(k, kwargs[k])
+ text += ' - **{0}:** {1} \n'.format(k, kwargs[k])
outfile.write(text)
+def save_file(content, filename):
+ """
+ Store `content` in `filename`. Can be used to store a SentimentAnalyzer.
+ """
+ print("Saving", filename)
+ with codecs.open(filename, 'wb') as storage_file:
+ # The protocol=2 parameter is for python2 compatibility
+ pickle.dump(content, storage_file, protocol=2)
def split_train_test(all_instances, n=None):
"""
random.shuffle(all_instances)
if not n or n > len(all_instances):
n = len(all_instances)
- train_set = all_instances[: int(0.8 * n)]
- test_set = all_instances[int(0.8 * n) : n]
+ train_set = all_instances[:int(.8*n)]
+ test_set = all_instances[int(.8*n):n]
return train_set, test_set
-
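split_train_test shuffles in place and slices at the 80% mark; n merely caps the number of instances drawn from the pool. A quick sanity check:

    from nltk.sentiment.util import split_train_test

    train, test = split_train_test(list(range(10)))
    assert len(train) == 8 and len(test) == 2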
def _show_plot(x_values, y_values, x_labels=None, y_labels=None):
try:
import matplotlib.pyplot as plt
except ImportError:
- raise ImportError(
- "The plot function requires matplotlib to be installed."
- "See http://matplotlib.org/"
- )
+ raise ImportError('The plot function requires matplotlib to be installed.'
+ 'See http://matplotlib.org/')
- plt.locator_params(axis="y", nbins=3)
+ plt.locator_params(axis='y', nbins=3)
axes = plt.axes()
axes.yaxis.grid()
- plt.plot(x_values, y_values, "ro", color="red")
+ plt.plot(x_values, y_values, 'ro', color='red')
plt.ylim(ymin=-1.2, ymax=1.2)
plt.tight_layout(pad=5)
if x_labels:
- plt.xticks(x_values, x_labels, rotation="vertical")
+ plt.xticks(x_values, x_labels, rotation='vertical')
if y_labels:
- plt.yticks([-1, 0, 1], y_labels, rotation="horizontal")
+ plt.yticks([-1, 0, 1], y_labels, rotation='horizontal')
# Pad margins so that markers are not clipped by the axes
plt.margins(0.2)
plt.show()
-# ////////////////////////////////////////////////////////////
-# { Parsing and conversion functions
-# ////////////////////////////////////////////////////////////
+#////////////////////////////////////////////////////////////
+#{ Parsing and conversion functions
+#////////////////////////////////////////////////////////////
-
-
-def json2csv_preprocess(
- json_file,
- outfile,
- fields,
- encoding="utf8",
- errors="replace",
- gzip_compress=False,
- skip_retweets=True,
- skip_tongue_tweets=True,
- skip_ambiguous_tweets=True,
- strip_off_emoticons=True,
- remove_duplicates=True,
- limit=None,
-):
+def json2csv_preprocess(json_file, outfile, fields, encoding='utf8', errors='replace',
+ gzip_compress=False, skip_retweets=True, skip_tongue_tweets=True,
+ skip_ambiguous_tweets=True, strip_off_emoticons=True, remove_duplicates=True,
+ limit=None):
"""
Convert a JSON file to a CSV file, preprocessing each row to obtain a suitable
dataset for tweet sentiment analysis.
subsets of the original tweets json data.
"""
with codecs.open(json_file, encoding=encoding) as fp:
- (writer, outf) = _outf_writer(outfile, encoding, errors, gzip_compress)
+ (writer, outf) = outf_writer_compat(outfile, encoding, errors, gzip_compress)
# write the list of fields as header
writer.writerow(fields)
tweet = json.loads(line)
row = extract_fields(tweet, fields)
try:
- text = row[fields.index("text")]
+ text = row[fields.index('text')]
# Remove retweets
if skip_retweets == True:
- if re.search(r"\bRT\b", text):
+ if re.search(r'\bRT\b', text):
continue
# Remove tweets containing ":P" and ":-P" emoticons
if skip_tongue_tweets == True:
- if re.search(r"\:\-?P\b", text):
+ if re.search(r'\:\-?P\b', text):
continue
# Remove tweets containing both happy and sad emoticons
if skip_ambiguous_tweets == True:
continue
# Strip off emoticons from all tweets
if strip_off_emoticons == True:
- row[fields.index("text")] = re.sub(
- r"(?!\n)\s+", " ", EMOTICON_RE.sub("", text)
- )
+ row[fields.index('text')] = re.sub(r'(?!\n)\s+', ' ', EMOTICON_RE.sub('', text))
# Remove duplicate tweets
if remove_duplicates == True:
- if row[fields.index("text")] in tweets_cache:
+ if row[fields.index('text')] in tweets_cache:
continue
else:
- tweets_cache.append(row[fields.index("text")])
+ tweets_cache.append(row[fields.index('text')])
except ValueError:
pass
writer.writerow(row)
break
outf.close()
-
-def parse_tweets_set(
- filename, label, word_tokenizer=None, sent_tokenizer=None, skip_header=True
-):
+def parse_tweets_set(filename, label, word_tokenizer=None, sent_tokenizer=None,
+ skip_header=True):
"""
Parse a CSV file containing tweets and return the data as a list of (text, label) tuples.
"""
tweets = []
if not sent_tokenizer:
- sent_tokenizer = load("tokenizers/punkt/english.pickle")
-
- with codecs.open(filename, "rt") as csvfile:
- reader = csv.reader(csvfile)
- if skip_header == True:
- next(reader, None) # skip the header
- i = 0
- for tweet_id, text in reader:
- # text = text[1]
- i += 1
- sys.stdout.write("Loaded {0} tweets\r".format(i))
- # Apply sentence and word tokenizer to text
- if word_tokenizer:
- tweet = [
- w
- for sent in sent_tokenizer.tokenize(text)
- for w in word_tokenizer.tokenize(sent)
- ]
- else:
- tweet = text
- tweets.append((tweet, label))
-
+ sent_tokenizer = load('tokenizers/punkt/english.pickle')
+
+ # If we use Python3.x we can proceed using the 'rt' flag
+ if sys.version_info[0] == 3:
+ with codecs.open(filename, 'rt') as csvfile:
+ reader = csv.reader(csvfile)
+ if skip_header == True:
+ next(reader, None) # skip the header
+ i = 0
+ for tweet_id, text in reader:
+ # text = text[1]
+ i += 1
+ sys.stdout.write('Loaded {0} tweets\r'.format(i))
+ # Apply sentence and word tokenizer to text
+ if word_tokenizer:
+ tweet = [w for sent in sent_tokenizer.tokenize(text)
+ for w in word_tokenizer.tokenize(sent)]
+ else:
+ tweet = text
+ tweets.append((tweet, label))
+ # If we use Python2.x we need to handle encoding problems
+ elif sys.version_info[0] < 3:
+ with codecs.open(filename) as csvfile:
+ reader = csv.reader(csvfile)
+ if skip_header == True:
+ next(reader, None) # skip the header
+ i = 0
+ for row in reader:
+ unicode_row = [x.decode('utf8') for x in row]
+ text = unicode_row[1]
+ i += 1
+ sys.stdout.write('Loaded {0} tweets\r'.format(i))
+ # Apply sentence and word tokenizer to text
+ if word_tokenizer:
+ tweet = [w.encode('utf8') for sent in sent_tokenizer.tokenize(text)
+ for w in word_tokenizer.tokenize(sent)]
+ else:
+ tweet = text
+ tweets.append((tweet, label))
print("Loaded {0} tweets".format(i))
return tweets
-
-# ////////////////////////////////////////////////////////////
-# { Demos
-# ////////////////////////////////////////////////////////////
-
+#////////////////////////////////////////////////////////////
+#{ Demos
+#////////////////////////////////////////////////////////////
def demo_tweets(trainer, n_instances=None, output=None):
"""
# tokenizer = TweetTokenizer(reduce_len=True, strip_handles=True)
if n_instances is not None:
- n_instances = int(n_instances / 2)
+ n_instances = int(n_instances/2)
- fields = ["id", "text"]
+ fields = ['id', 'text']
positive_json = twitter_samples.abspath("positive_tweets.json")
- positive_csv = "positive_tweets.csv"
+ positive_csv = 'positive_tweets.csv'
json2csv_preprocess(positive_json, positive_csv, fields, limit=n_instances)
negative_json = twitter_samples.abspath("negative_tweets.json")
- negative_csv = "negative_tweets.csv"
+ negative_csv = 'negative_tweets.csv'
json2csv_preprocess(negative_json, negative_csv, fields, limit=n_instances)
- neg_docs = parse_tweets_set(negative_csv, label="neg", word_tokenizer=tokenizer)
- pos_docs = parse_tweets_set(positive_csv, label="pos", word_tokenizer=tokenizer)
+ neg_docs = parse_tweets_set(negative_csv, label='neg', word_tokenizer=tokenizer)
+ pos_docs = parse_tweets_set(positive_csv, label='pos', word_tokenizer=tokenizer)
# We separately split positive and negative instances to keep a balanced
# uniform class distribution in both train and test sets.
train_pos_docs, test_pos_docs = split_train_test(pos_docs)
train_neg_docs, test_neg_docs = split_train_test(neg_docs)
- training_tweets = train_pos_docs + train_neg_docs
- testing_tweets = test_pos_docs + test_neg_docs
+ training_tweets = train_pos_docs+train_neg_docs
+ testing_tweets = test_pos_docs+test_neg_docs
sentim_analyzer = SentimentAnalyzer()
# stopwords = stopwords.words('english')
sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
# Add bigram collocation features
- bigram_collocs_feats = sentim_analyzer.bigram_collocation_feats(
- [tweet[0] for tweet in training_tweets], top_n=100, min_freq=12
- )
- sentim_analyzer.add_feat_extractor(
- extract_bigram_feats, bigrams=bigram_collocs_feats
- )
+ bigram_collocs_feats = sentim_analyzer.bigram_collocation_feats([tweet[0] for tweet in training_tweets],
+ top_n=100, min_freq=12)
+ sentim_analyzer.add_feat_extractor(extract_bigram_feats, bigrams=bigram_collocs_feats)
training_set = sentim_analyzer.apply_features(training_tweets)
test_set = sentim_analyzer.apply_features(testing_tweets)
try:
classifier.show_most_informative_features()
except AttributeError:
- print(
- "Your classifier does not provide a show_most_informative_features() method."
- )
+ print('Your classifier does not provide a show_most_informative_features() method.')
results = sentim_analyzer.evaluate(test_set)
if output:
extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
- output_markdown(
- output,
- Dataset="labeled_tweets",
- Classifier=type(classifier).__name__,
- Tokenizer=tokenizer.__class__.__name__,
- Feats=extr,
- Results=results,
- Instances=n_instances,
- )
-
+ output_markdown(output, Dataset='labeled_tweets', Classifier=type(classifier).__name__,
+ Tokenizer=tokenizer.__class__.__name__, Feats=extr,
+ Results=results, Instances=n_instances)
def demo_movie_reviews(trainer, n_instances=None, output=None):
"""
from nltk.sentiment import SentimentAnalyzer
if n_instances is not None:
- n_instances = int(n_instances / 2)
-
- pos_docs = [
- (list(movie_reviews.words(pos_id)), "pos")
- for pos_id in movie_reviews.fileids("pos")[:n_instances]
- ]
- neg_docs = [
- (list(movie_reviews.words(neg_id)), "neg")
- for neg_id in movie_reviews.fileids("neg")[:n_instances]
- ]
+ n_instances = int(n_instances/2)
+
+ pos_docs = [(list(movie_reviews.words(pos_id)), 'pos') for pos_id in movie_reviews.fileids('pos')[:n_instances]]
+ neg_docs = [(list(movie_reviews.words(neg_id)), 'neg') for neg_id in movie_reviews.fileids('neg')[:n_instances]]
# We separately split positive and negative instances to keep a balanced
# uniform class distribution in both train and test sets.
train_pos_docs, test_pos_docs = split_train_test(pos_docs)
train_neg_docs, test_neg_docs = split_train_test(neg_docs)
- training_docs = train_pos_docs + train_neg_docs
- testing_docs = test_pos_docs + test_neg_docs
+ training_docs = train_pos_docs+train_neg_docs
+ testing_docs = test_pos_docs+test_neg_docs
sentim_analyzer = SentimentAnalyzer()
all_words = sentim_analyzer.all_words(training_docs)
try:
classifier.show_most_informative_features()
except AttributeError:
- print(
- "Your classifier does not provide a show_most_informative_features() method."
- )
+ print('Your classifier does not provide a show_most_informative_features() method.')
results = sentim_analyzer.evaluate(test_set)
if output:
extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
- output_markdown(
- output,
- Dataset="Movie_reviews",
- Classifier=type(classifier).__name__,
- Tokenizer="WordPunctTokenizer",
- Feats=extr,
- Results=results,
- Instances=n_instances,
- )
-
+ output_markdown(output, Dataset='Movie_reviews', Classifier=type(classifier).__name__,
+ Tokenizer='WordPunctTokenizer', Feats=extr, Results=results,
+ Instances=n_instances)
def demo_subjectivity(trainer, save_analyzer=False, n_instances=None, output=None):
"""
from nltk.corpus import subjectivity
if n_instances is not None:
- n_instances = int(n_instances / 2)
+ n_instances = int(n_instances/2)
- subj_docs = [
- (sent, "subj") for sent in subjectivity.sents(categories="subj")[:n_instances]
- ]
- obj_docs = [
- (sent, "obj") for sent in subjectivity.sents(categories="obj")[:n_instances]
- ]
+ subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]]
+ obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]]
# We separately split subjective and objective instances to keep a balanced
# uniform class distribution in both train and test sets.
train_subj_docs, test_subj_docs = split_train_test(subj_docs)
train_obj_docs, test_obj_docs = split_train_test(obj_docs)
- training_docs = train_subj_docs + train_obj_docs
- testing_docs = test_subj_docs + test_obj_docs
+ training_docs = train_subj_docs+train_obj_docs
+ testing_docs = test_subj_docs+test_obj_docs
sentim_analyzer = SentimentAnalyzer()
- all_words_neg = sentim_analyzer.all_words(
- [mark_negation(doc) for doc in training_docs]
- )
+ all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])
# Add simple unigram word features handling negation
unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
try:
classifier.show_most_informative_features()
except AttributeError:
- print(
- "Your classifier does not provide a show_most_informative_features() method."
- )
+ print('Your classifier does not provide a show_most_informative_features() method.')
results = sentim_analyzer.evaluate(test_set)
if save_analyzer == True:
- save_file(sentim_analyzer, "sa_subjectivity.pickle")
+ save_file(sentim_analyzer, 'sa_subjectivity.pickle')
if output:
extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
- output_markdown(
- output,
- Dataset="subjectivity",
- Classifier=type(classifier).__name__,
- Tokenizer="WhitespaceTokenizer",
- Feats=extr,
- Instances=n_instances,
- Results=results,
- )
+ output_markdown(output, Dataset='subjectivity', Classifier=type(classifier).__name__,
+ Tokenizer='WhitespaceTokenizer', Feats=extr,
+ Instances=n_instances, Results=results)
return sentim_analyzer
-
def demo_sent_subjectivity(text):
"""
Classify a single sentence as subjective or objective using a stored
"""
from nltk.classify import NaiveBayesClassifier
from nltk.tokenize import regexp
-
word_tokenizer = regexp.WhitespaceTokenizer()
try:
- sentim_analyzer = load("sa_subjectivity.pickle")
+ sentim_analyzer = load('sa_subjectivity.pickle')
except LookupError:
- print("Cannot find the sentiment analyzer you want to load.")
- print("Training a new one using NaiveBayesClassifier.")
+ print('Cannot find the sentiment analyzer you want to load.')
+ print('Training a new one using NaiveBayesClassifier.')
sentim_analyzer = demo_subjectivity(NaiveBayesClassifier.train, True)
# Tokenize and convert to lower case
tokenized_text = [word.lower() for word in word_tokenizer.tokenize(text)]
print(sentim_analyzer.classify(tokenized_text))
-
def demo_liu_hu_lexicon(sentence, plot=False):
"""
Basic example of sentiment classification using the Liu and Hu opinion lexicon:
positive and negative lexicon hits in the sentence are counted and the
majority wins.
"""
from nltk.corpus import opinion_lexicon
from nltk.tokenize import treebank
tokenizer = treebank.TreebankWordTokenizer()
pos_words = 0
neg_words = 0
tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)]
- x = list(range(len(tokenized_sent))) # x axis for the plot
+ x = list(range(len(tokenized_sent))) # x axis for the plot
y = []
for word in tokenized_sent:
if word in opinion_lexicon.positive():
pos_words += 1
- y.append(1) # positive
+ y.append(1) # positive
elif word in opinion_lexicon.negative():
neg_words += 1
- y.append(-1) # negative
+ y.append(-1) # negative
else:
- y.append(0) # neutral
+ y.append(0) # neutral
if pos_words > neg_words:
- print("Positive")
+ print('Positive')
elif pos_words < neg_words:
- print("Negative")
+ print('Negative')
elif pos_words == neg_words:
- print("Neutral")
+ print('Neutral')
if plot == True:
- _show_plot(
- x, y, x_labels=tokenized_sent, y_labels=["Negative", "Neutral", "Positive"]
- )
-
+ _show_plot(x, y, x_labels=tokenized_sent, y_labels=['Negative', 'Neutral', 'Positive'])
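# A usage sketch, assuming the opinion_lexicon corpus is downloaded; 'great'
# is in the positive lexicon, so this prints 'Positive'.
demo_liu_hu_lexicon("This movie was great", plot=False)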
def demo_vader_instance(text):
"""
:param text: a text whose polarity has to be evaluated.
"""
from nltk.sentiment import SentimentIntensityAnalyzer
-
vader_analyzer = SentimentIntensityAnalyzer()
print(vader_analyzer.polarity_scores(text))
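# A usage sketch, assuming the vader_lexicon resource is installed; the
# printed dict has 'neg', 'neu', 'pos' and 'compound' keys.
demo_vader_instance("VADER is smart, handsome, and funny!")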
-
def demo_vader_tweets(n_instances=None, output=None):
"""
Classify 10000 positive and negative tweets using the Vader approach.
"""
from collections import defaultdict
from nltk.corpus import twitter_samples
from nltk.sentiment import SentimentIntensityAnalyzer
- from nltk.metrics import (
- accuracy as eval_accuracy,
- precision as eval_precision,
- recall as eval_recall,
- f_measure as eval_f_measure,
- )
+ from nltk.metrics import (accuracy as eval_accuracy, precision as eval_precision,
+ recall as eval_recall, f_measure as eval_f_measure)
if n_instances is not None:
- n_instances = int(n_instances / 2)
+ n_instances = int(n_instances/2)
- fields = ["id", "text"]
+ fields = ['id', 'text']
positive_json = twitter_samples.abspath("positive_tweets.json")
- positive_csv = "positive_tweets.csv"
- json2csv_preprocess(
- positive_json,
- positive_csv,
- fields,
- strip_off_emoticons=False,
- limit=n_instances,
- )
+ positive_csv = 'positive_tweets.csv'
+ json2csv_preprocess(positive_json, positive_csv, fields, strip_off_emoticons=False,
+ limit=n_instances)
negative_json = twitter_samples.abspath("negative_tweets.json")
- negative_csv = "negative_tweets.csv"
- json2csv_preprocess(
- negative_json,
- negative_csv,
- fields,
- strip_off_emoticons=False,
- limit=n_instances,
- )
+ negative_csv = 'negative_tweets.csv'
+ json2csv_preprocess(negative_json, negative_csv, fields, strip_off_emoticons=False,
+ limit=n_instances)
- pos_docs = parse_tweets_set(positive_csv, label="pos")
- neg_docs = parse_tweets_set(negative_csv, label="neg")
+ pos_docs = parse_tweets_set(positive_csv, label='pos')
+ neg_docs = parse_tweets_set(negative_csv, label='neg')
# We separately split subjective and objective instances to keep a balanced
# uniform class distribution in both train and test sets.
train_pos_docs, test_pos_docs = split_train_test(pos_docs)
train_neg_docs, test_neg_docs = split_train_test(neg_docs)
- training_tweets = train_pos_docs + train_neg_docs
- testing_tweets = test_pos_docs + test_neg_docs
+ training_tweets = train_pos_docs+train_neg_docs
+ testing_tweets = test_pos_docs+test_neg_docs
vader_analyzer = SentimentIntensityAnalyzer()
labels.add(label)
gold_results[label].add(i)
acc_gold_results.append(label)
- score = vader_analyzer.polarity_scores(text)["compound"]
+ score = vader_analyzer.polarity_scores(text)['compound']
if score > 0:
- observed = "pos"
+ observed = 'pos'
else:
- observed = "neg"
+ observed = 'neg'
num += 1
acc_test_results.append(observed)
test_results[observed].add(i)
metrics_results = {}
for label in labels:
- accuracy_score = eval_accuracy(acc_gold_results, acc_test_results)
- metrics_results["Accuracy"] = accuracy_score
- precision_score = eval_precision(gold_results[label], test_results[label])
- metrics_results["Precision [{0}]".format(label)] = precision_score
- recall_score = eval_recall(gold_results[label], test_results[label])
- metrics_results["Recall [{0}]".format(label)] = recall_score
- f_measure_score = eval_f_measure(gold_results[label], test_results[label])
- metrics_results["F-measure [{0}]".format(label)] = f_measure_score
+ accuracy_score = eval_accuracy(acc_gold_results,
+ acc_test_results)
+ metrics_results['Accuracy'] = accuracy_score
+ precision_score = eval_precision(gold_results[label],
+ test_results[label])
+ metrics_results['Precision [{0}]'.format(label)] = precision_score
+ recall_score = eval_recall(gold_results[label],
+ test_results[label])
+ metrics_results['Recall [{0}]'.format(label)] = recall_score
+ f_measure_score = eval_f_measure(gold_results[label],
+ test_results[label])
+ metrics_results['F-measure [{0}]'.format(label)] = f_measure_score
for result in sorted(metrics_results):
- print("{0}: {1}".format(result, metrics_results[result]))
+ print('{0}: {1}'.format(result, metrics_results[result]))
if output:
- output_markdown(
- output,
- Approach="Vader",
- Dataset="labeled_tweets",
- Instances=n_instances,
- Results=metrics_results,
- )
-
+ output_markdown(output, Approach='Vader', Dataset='labeled_tweets',
+ Instances=n_instances, Results=metrics_results)
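# The set-based metrics computed above follow the standard definitions; a
# minimal self-contained sketch (hypothetical gold/test index sets) of what
# eval_precision/eval_recall/eval_f_measure return:
gold = {0, 1, 2}  # indices whose gold label is 'pos'
test = {1, 2, 3}  # indices the analyzer labeled 'pos'
precision = float(len(gold & test)) / len(test)  # 2/3
recall = float(len(gold & test)) / len(gold)  # 2/3
f_measure = 2 * precision * recall / (precision + recall)  # 2/3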
-if __name__ == "__main__":
+if __name__ == '__main__':
from nltk.classify import NaiveBayesClassifier, MaxentClassifier
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import LinearSVC
- from nltk.twitter.common import _outf_writer, extract_fields
naive_bayes = NaiveBayesClassifier.train
svm = SklearnClassifier(LinearSVC()).train
# coding: utf-8
# Natural Language Toolkit: vader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: C.J. Hutto <Clayton.Hutto@gtri.gatech.edu>
# Ewan Klein <ewan@inf.ed.ac.uk> (modifications)
# Pierpaolo Pantone <24alsecondo@gmail.com> (modifications)
# George Berry <geb97@cornell.edu> (modifications)
-# Malavika Suresh <malavika.suresh0794@gmail.com> (modifications)
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
#
Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
"""
+import codecs
import math
import re
import string
from itertools import product
-
import nltk.data
-from nltk.util import pairwise
-
-class VaderConstants:
+from .util import pairwise
+
+##Constants##
+
+# (empirically derived mean sentiment intensity rating increase for booster words)
+B_INCR = 0.293
+B_DECR = -0.293
+
+# (empirically derived mean sentiment intensity rating increase for using
+# ALLCAPs to emphasize a word)
+C_INCR = 0.733
+
+N_SCALAR = -0.74
+
+# for removing punctuation
+REGEX_REMOVE_PUNCTUATION = re.compile('[{0}]'.format(re.escape(string.punctuation)))
+
+PUNC_LIST = [".", "!", "?", ",", ";", ":", "-", "'", "\"",
+ "!!", "!!!", "??", "???", "?!?", "!?!", "?!?!", "!?!?"]
+NEGATE = {"aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt", "doesnt",
+ "ain't", "aren't", "can't", "couldn't", "daren't", "didn't", "doesn't",
+ "dont", "hadnt", "hasnt", "havent", "isnt", "mightnt", "mustnt", "neither",
+ "don't", "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't",
+ "neednt", "needn't", "never", "none", "nope", "nor", "not", "nothing", "nowhere",
+ "oughtnt", "shant", "shouldnt", "uhuh", "wasnt", "werent",
+ "oughtn't", "shan't", "shouldn't", "uh-uh", "wasn't", "weren't",
+ "without", "wont", "wouldnt", "won't", "wouldn't", "rarely", "seldom", "despite"}
+
+# booster/dampener 'intensifiers' or 'degree adverbs'
+# http://en.wiktionary.org/wiki/Category:English_degree_adverbs
+
+BOOSTER_DICT = \
+{"absolutely": B_INCR, "amazingly": B_INCR, "awfully": B_INCR, "completely": B_INCR, "considerably": B_INCR,
+ "decidedly": B_INCR, "deeply": B_INCR, "effing": B_INCR, "enormously": B_INCR,
+ "entirely": B_INCR, "especially": B_INCR, "exceptionally": B_INCR, "extremely": B_INCR,
+ "fabulously": B_INCR, "flipping": B_INCR, "flippin": B_INCR,
+ "fricking": B_INCR, "frickin": B_INCR, "frigging": B_INCR, "friggin": B_INCR, "fully": B_INCR, "fucking": B_INCR,
+ "greatly": B_INCR, "hella": B_INCR, "highly": B_INCR, "hugely": B_INCR, "incredibly": B_INCR,
+ "intensely": B_INCR, "majorly": B_INCR, "more": B_INCR, "most": B_INCR, "particularly": B_INCR,
+ "purely": B_INCR, "quite": B_INCR, "really": B_INCR, "remarkably": B_INCR,
+ "so": B_INCR, "substantially": B_INCR,
+ "thoroughly": B_INCR, "totally": B_INCR, "tremendously": B_INCR,
+ "uber": B_INCR, "unbelievably": B_INCR, "unusually": B_INCR, "utterly": B_INCR,
+ "very": B_INCR,
+ "almost": B_DECR, "barely": B_DECR, "hardly": B_DECR, "just enough": B_DECR,
+ "kind of": B_DECR, "kinda": B_DECR, "kindof": B_DECR, "kind-of": B_DECR,
+ "less": B_DECR, "little": B_DECR, "marginally": B_DECR, "occasionally": B_DECR, "partly": B_DECR,
+ "scarcely": B_DECR, "slightly": B_DECR, "somewhat": B_DECR,
+ "sort of": B_DECR, "sorta": B_DECR, "sortof": B_DECR, "sort-of": B_DECR}
+
+# check for special case idioms using a sentiment-laden keyword known to SAGE
+SPECIAL_CASE_IDIOMS = {"the shit": 3, "the bomb": 3, "bad ass": 1.5, "yeah right": -2,
+ "cut the mustard": 2, "kiss of death": -1.5, "hand to mouth": -2}
+
+
+##Static methods##
+
+def negated(input_words, include_nt=True):
"""
- A class to keep the Vader lists and constants.
+ Determine if input contains negation words
"""
- ##Constants##
- # (empirically derived mean sentiment intensity rating increase for booster words)
- B_INCR = 0.293
- B_DECR = -0.293
-
- # (empirically derived mean sentiment intensity rating increase for using
- # ALLCAPs to emphasize a word)
- C_INCR = 0.733
-
- N_SCALAR = -0.74
-
- NEGATE = {
- "aint",
- "arent",
- "cannot",
- "cant",
- "couldnt",
- "darent",
- "didnt",
- "doesnt",
- "ain't",
- "aren't",
- "can't",
- "couldn't",
- "daren't",
- "didn't",
- "doesn't",
- "dont",
- "hadnt",
- "hasnt",
- "havent",
- "isnt",
- "mightnt",
- "mustnt",
- "neither",
- "don't",
- "hadn't",
- "hasn't",
- "haven't",
- "isn't",
- "mightn't",
- "mustn't",
- "neednt",
- "needn't",
- "never",
- "none",
- "nope",
- "nor",
- "not",
- "nothing",
- "nowhere",
- "oughtnt",
- "shant",
- "shouldnt",
- "uhuh",
- "wasnt",
- "werent",
- "oughtn't",
- "shan't",
- "shouldn't",
- "uh-uh",
- "wasn't",
- "weren't",
- "without",
- "wont",
- "wouldnt",
- "won't",
- "wouldn't",
- "rarely",
- "seldom",
- "despite",
- }
-
- # booster/dampener 'intensifiers' or 'degree adverbs'
- # http://en.wiktionary.org/wiki/Category:English_degree_adverbs
-
- BOOSTER_DICT = {
- "absolutely": B_INCR,
- "amazingly": B_INCR,
- "awfully": B_INCR,
- "completely": B_INCR,
- "considerably": B_INCR,
- "decidedly": B_INCR,
- "deeply": B_INCR,
- "effing": B_INCR,
- "enormously": B_INCR,
- "entirely": B_INCR,
- "especially": B_INCR,
- "exceptionally": B_INCR,
- "extremely": B_INCR,
- "fabulously": B_INCR,
- "flipping": B_INCR,
- "flippin": B_INCR,
- "fricking": B_INCR,
- "frickin": B_INCR,
- "frigging": B_INCR,
- "friggin": B_INCR,
- "fully": B_INCR,
- "fucking": B_INCR,
- "greatly": B_INCR,
- "hella": B_INCR,
- "highly": B_INCR,
- "hugely": B_INCR,
- "incredibly": B_INCR,
- "intensely": B_INCR,
- "majorly": B_INCR,
- "more": B_INCR,
- "most": B_INCR,
- "particularly": B_INCR,
- "purely": B_INCR,
- "quite": B_INCR,
- "really": B_INCR,
- "remarkably": B_INCR,
- "so": B_INCR,
- "substantially": B_INCR,
- "thoroughly": B_INCR,
- "totally": B_INCR,
- "tremendously": B_INCR,
- "uber": B_INCR,
- "unbelievably": B_INCR,
- "unusually": B_INCR,
- "utterly": B_INCR,
- "very": B_INCR,
- "almost": B_DECR,
- "barely": B_DECR,
- "hardly": B_DECR,
- "just enough": B_DECR,
- "kind of": B_DECR,
- "kinda": B_DECR,
- "kindof": B_DECR,
- "kind-of": B_DECR,
- "less": B_DECR,
- "little": B_DECR,
- "marginally": B_DECR,
- "occasionally": B_DECR,
- "partly": B_DECR,
- "scarcely": B_DECR,
- "slightly": B_DECR,
- "somewhat": B_DECR,
- "sort of": B_DECR,
- "sorta": B_DECR,
- "sortof": B_DECR,
- "sort-of": B_DECR,
- }
-
- # check for special case idioms using a sentiment-laden keyword known to SAGE
- SPECIAL_CASE_IDIOMS = {
- "the shit": 3,
- "the bomb": 3,
- "bad ass": 1.5,
- "yeah right": -2,
- "cut the mustard": 2,
- "kiss of death": -1.5,
- "hand to mouth": -2,
- }
-
- # for removing punctuation
- REGEX_REMOVE_PUNCTUATION = re.compile("[{0}]".format(re.escape(string.punctuation)))
-
- PUNC_LIST = [
- ".",
- "!",
- "?",
- ",",
- ";",
- ":",
- "-",
- "'",
- '"',
- "!!",
- "!!!",
- "??",
- "???",
- "?!?",
- "!?!",
- "?!?!",
- "!?!?",
- ]
-
- def __init__(self):
- pass
-
- def negated(self, input_words, include_nt=True):
- """
- Determine if input contains negation words
- """
- neg_words = self.NEGATE
- if any(word.lower() in neg_words for word in input_words):
+ neg_words = NEGATE
+ if any(word.lower() in neg_words for word in input_words):
+ return True
+ if include_nt:
+ if any("n't" in word.lower() for word in input_words):
return True
- if include_nt:
- if any("n't" in word.lower() for word in input_words):
- return True
- for first, second in pairwise(input_words):
- if second.lower() == "least" and first.lower() != "at":
- return True
- return False
-
- def normalize(self, score, alpha=15):
- """
- Normalize the score to be between -1 and 1 using an alpha that
- approximates the max expected value
- """
- norm_score = score / math.sqrt((score * score) + alpha)
- return norm_score
+ for first, second in pairwise(input_words):
+ if second.lower() == "least" and first.lower() != 'at':
+ return True
+ return False
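# Behaviour sketch for negated(), a plain module function in this version:
assert negated(["this", "is", "not", "good"]) is True  # 'not' is in NEGATE
assert negated(["it", "isn't", "bad"], include_nt=True) is True  # "n't" contraction
assert negated(["all", "good"]) is False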
- def scalar_inc_dec(self, word, valence, is_cap_diff):
- """
- Check if the preceding words increase, decrease, or negate/nullify the
- valence
- """
- scalar = 0.0
- word_lower = word.lower()
- if word_lower in self.BOOSTER_DICT:
- scalar = self.BOOSTER_DICT[word_lower]
- if valence < 0:
- scalar *= -1
- # check if booster/dampener word is in ALLCAPS (while others aren't)
- if word.isupper() and is_cap_diff:
- if valence > 0:
- scalar += self.C_INCR
- else:
- scalar -= self.C_INCR
- return scalar
+def normalize(score, alpha=15):
+ """
+ Normalize the score to be between -1 and 1 using an alpha that
+ approximates the max expected value
+ """
+ norm_score = score/math.sqrt((score*score) + alpha)
+ return norm_score
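# Worked numbers for score/sqrt(score*score + alpha): with the default
# alpha=15, a raw sum of 4 maps to 4/sqrt(31), roughly 0.718, and the result
# approaches +/-1 as |score| grows.
assert abs(normalize(4.0) - 0.7184) < 1e-3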
-class SentiText:
+def allcap_differential(words):
+ """
+ Check whether just some words in the input are ALL CAPS
+
+ :param list words: The words to inspect
+ :returns: `True` if some but not all items in `words` are ALL CAPS
+ """
+ is_different = False
+ allcap_words = 0
+ for word in words:
+ if word.isupper():
+ allcap_words += 1
+ cap_differential = len(words) - allcap_words
+ if cap_differential > 0 and cap_differential < len(words):
+ is_different = True
+ return is_different
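# Behaviour sketch: True only when some, but not all, words are ALL CAPS.
assert allcap_differential(["GREAT", "day"]) is True
assert allcap_differential(["GREAT", "DAY"]) is False
assert allcap_differential(["great", "day"]) is False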
+
+
+def scalar_inc_dec(word, valence, is_cap_diff):
+ """
+ Check if the preceding words increase, decrease, or negate/nullify the
+ valence
+ """
+ scalar = 0.0
+ word_lower = word.lower()
+ if word_lower in BOOSTER_DICT:
+ scalar = BOOSTER_DICT[word_lower]
+ if valence < 0:
+ scalar *= -1
+ #check if booster/dampener word is in ALLCAPS (while others aren't)
+ if word.isupper() and is_cap_diff:
+ if valence > 0:
+ scalar += C_INCR
+ else: scalar -= C_INCR
+ return scalar
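# Behaviour sketch using the constants above (B_INCR = 0.293, C_INCR = 0.733):
assert scalar_inc_dec("very", 2.0, is_cap_diff=False) == 0.293  # plain booster
assert scalar_inc_dec("very", -2.0, is_cap_diff=False) == -0.293  # sign follows valence
assert abs(scalar_inc_dec("VERY", 2.0, is_cap_diff=True) - 1.026) < 1e-9  # ALLCAPS adds C_INCR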
+
+class SentiText(object):
"""
Identify sentiment-relevant string-level properties of input text.
"""
-
- def __init__(self, text, punc_list, regex_remove_punctuation):
+ def __init__(self, text):
if not isinstance(text, str):
- text = str(text.encode("utf-8"))
+ text = str(text.encode('utf-8'))
self.text = text
- self.PUNC_LIST = punc_list
- self.REGEX_REMOVE_PUNCTUATION = regex_remove_punctuation
self.words_and_emoticons = self._words_and_emoticons()
- # doesn't separate words from
+ # doesn't separate words from\
# adjacent punctuation (keeps emoticons & contractions)
- self.is_cap_diff = self.allcap_differential(self.words_and_emoticons)
+ self.is_cap_diff = allcap_differential(self.words_and_emoticons)
def _words_plus_punc(self):
"""
',cat': 'cat',
}
"""
- no_punc_text = self.REGEX_REMOVE_PUNCTUATION.sub("", self.text)
+ no_punc_text = REGEX_REMOVE_PUNCTUATION.sub('', self.text)
# removes punctuation (but loses emoticons & contractions)
words_only = no_punc_text.split()
# remove singletons
- words_only = set(w for w in words_only if len(w) > 1)
+ words_only = set( w for w in words_only if len(w) > 1 )
# the product gives ('cat', ',') and (',', 'cat')
- punc_before = {"".join(p): p[1] for p in product(self.PUNC_LIST, words_only)}
- punc_after = {"".join(p): p[0] for p in product(words_only, self.PUNC_LIST)}
+ punc_before = {''.join(p): p[1] for p in product(PUNC_LIST, words_only)}
+ punc_after = {''.join(p): p[0] for p in product(words_only, PUNC_LIST)}
words_punc_dict = punc_before
words_punc_dict.update(punc_after)
return words_punc_dict
wes[i] = words_punc_dict[we]
return wes
- def allcap_differential(self, words):
- """
- Check whether just some words in the input are ALL CAPS
-
- :param list words: The words to inspect
- :returns: `True` if some but not all items in `words` are ALL CAPS
- """
- is_different = False
- allcap_words = 0
- for word in words:
- if word.isupper():
- allcap_words += 1
- cap_differential = len(words) - allcap_words
- if 0 < cap_differential < len(words):
- is_different = True
- return is_different
-
-
-class SentimentIntensityAnalyzer:
+class SentimentIntensityAnalyzer(object):
"""
Give a sentiment intensity score to sentences.
"""
-
- def __init__(
- self, lexicon_file="sentiment/vader_lexicon.zip/vader_lexicon/vader_lexicon.txt",
- ):
+ def __init__(self, lexicon_file="sentiment/vader_lexicon.zip/vader_lexicon/vader_lexicon.txt"):
self.lexicon_file = nltk.data.load(lexicon_file)
self.lexicon = self.make_lex_dict()
- self.constants = VaderConstants()
def make_lex_dict(self):
"""
Convert lexicon file to a dictionary
"""
lex_dict = {}
- for line in self.lexicon_file.split("\n"):
- (word, measure) = line.strip().split("\t")[0:2]
+ for line in self.lexicon_file.split('\n'):
+ (word, measure) = line.strip().split('\t')[0:2]
lex_dict[word] = float(measure)
return lex_dict
Positive values are positive valence, negative values are negative
valence.
"""
- # text, words_and_emoticons, is_cap_diff = self.preprocess(text)
- sentitext = SentiText(text, self.constants.PUNC_LIST,
- self.constants.REGEX_REMOVE_PUNCTUATION)
+ sentitext = SentiText(text)
+ #text, words_and_emoticons, is_cap_diff = self.preprocess(text)
+
sentiments = []
words_and_emoticons = sentitext.words_and_emoticons
for item in words_and_emoticons:
valence = 0
i = words_and_emoticons.index(item)
- if (
- i < len(words_and_emoticons) - 1
- and item.lower() == "kind"
- and words_and_emoticons[i + 1].lower() == "of"
- ) or item.lower() in self.constants.BOOSTER_DICT:
+ if (i < len(words_and_emoticons) - 1 and item.lower() == "kind" and \
+ words_and_emoticons[i+1].lower() == "of") or \
+ item.lower() in BOOSTER_DICT:
sentiments.append(valence)
continue
words_and_emoticons = sentitext.words_and_emoticons
item_lowercase = item.lower()
if item_lowercase in self.lexicon:
- # get the sentiment valence
+ #get the sentiment valence
valence = self.lexicon[item_lowercase]
- # check if sentiment laden word is in ALL CAPS (while others aren't)
+ #check if sentiment laden word is in ALL CAPS (while others aren't)
if item.isupper() and is_cap_diff:
if valence > 0:
- valence += self.constants.C_INCR
+ valence += C_INCR
else:
- valence -= self.constants.C_INCR
-
- for start_i in range(0, 3):
- if (
- i > start_i
- and words_and_emoticons[i - (start_i + 1)].lower()
- not in self.lexicon
- ):
+ valence -= C_INCR
+
+ for start_i in range(0,3):
+ if i > start_i and words_and_emoticons[i-(start_i+1)].lower() not in self.lexicon:
# dampen the scalar modifier of preceding words and emoticons
# (excluding the ones that immediately precede the item) based
# on their distance from the current item.
- s = self.constants.scalar_inc_dec(
- words_and_emoticons[i - (start_i + 1)], valence, is_cap_diff
- )
+ s = scalar_inc_dec(words_and_emoticons[i-(start_i+1)], valence, is_cap_diff)
if start_i == 1 and s != 0:
- s = s * 0.95
+ s = s*0.95
if start_i == 2 and s != 0:
- s = s * 0.9
- valence = valence + s
- valence = self._never_check(
- valence, words_and_emoticons, start_i, i
- )
+ s = s*0.9
+ valence = valence+s
+ valence = self._never_check(valence, words_and_emoticons, start_i, i)
if start_i == 2:
valence = self._idioms_check(valence, words_and_emoticons, i)
def _least_check(self, valence, words_and_emoticons, i):
# check for negation case using "least"
- if (
- i > 1
- and words_and_emoticons[i - 1].lower() not in self.lexicon
- and words_and_emoticons[i - 1].lower() == "least"
- ):
- if (
- words_and_emoticons[i - 2].lower() != "at"
- and words_and_emoticons[i - 2].lower() != "very"
- ):
- valence = valence * self.constants.N_SCALAR
- elif (
- i > 0
- and words_and_emoticons[i - 1].lower() not in self.lexicon
- and words_and_emoticons[i - 1].lower() == "least"
- ):
- valence = valence * self.constants.N_SCALAR
+ if i > 1 and words_and_emoticons[i-1].lower() not in self.lexicon \
+ and words_and_emoticons[i-1].lower() == "least":
+ if words_and_emoticons[i-2].lower() != "at" and words_and_emoticons[i-2].lower() != "very":
+ valence = valence*N_SCALAR
+ elif i > 0 and words_and_emoticons[i-1].lower() not in self.lexicon \
+ and words_and_emoticons[i-1].lower() == "least":
+ valence = valence*N_SCALAR
return valence
def _but_check(self, words_and_emoticons, sentiments):
- but = {"but", "BUT"} & set(words_and_emoticons)
- if but:
- bi = words_and_emoticons.index(next(iter(but)))
- for sidx, sentiment in enumerate(sentiments):
- if sidx < bi:
- sentiments[sidx] = sentiment * 0.5
- elif sidx > bi:
- sentiments[sidx] = sentiment * 1.5
+ # check for modification in sentiment due to contrastive conjunction 'but'
+ if 'but' in words_and_emoticons or 'BUT' in words_and_emoticons:
+ try:
+ bi = words_and_emoticons.index('but')
+ except ValueError:
+ bi = words_and_emoticons.index('BUT')
+ for sentiment in sentiments:
+ si = sentiments.index(sentiment)
+ if si < bi:
+ sentiments.pop(si)
+ sentiments.insert(si, sentiment*0.5)
+ elif si > bi:
+ sentiments.pop(si)
+ sentiments.insert(si, sentiment*1.5)
return sentiments
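# A worked sketch of the rule above (hypothetical inputs; constructing the
# analyzer assumes the vader_lexicon resource is available): scores before
# 'but' are halved and scores after it are scaled by 1.5.
sia = SentimentIntensityAnalyzer()
print(sia._but_check(["good", "but", "awful"], [2.0, 0.0, -1.0]))  # [1.0, 0.0, -1.5]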
def _idioms_check(self, valence, words_and_emoticons, i):
- onezero = "{0} {1}".format(words_and_emoticons[i - 1], words_and_emoticons[i])
+ onezero = "{0} {1}".format(words_and_emoticons[i-1], words_and_emoticons[i])
- twoonezero = "{0} {1} {2}".format(
- words_and_emoticons[i - 2],
- words_and_emoticons[i - 1],
- words_and_emoticons[i],
- )
+ twoonezero = "{0} {1} {2}".format(words_and_emoticons[i-2],
+ words_and_emoticons[i-1], words_and_emoticons[i])
- twoone = "{0} {1}".format(
- words_and_emoticons[i - 2], words_and_emoticons[i - 1]
- )
+ twoone = "{0} {1}".format(words_and_emoticons[i-2], words_and_emoticons[i-1])
- threetwoone = "{0} {1} {2}".format(
- words_and_emoticons[i - 3],
- words_and_emoticons[i - 2],
- words_and_emoticons[i - 1],
- )
+ threetwoone = "{0} {1} {2}".format(words_and_emoticons[i-3],
+ words_and_emoticons[i-2], words_and_emoticons[i-1])
- threetwo = "{0} {1}".format(
- words_and_emoticons[i - 3], words_and_emoticons[i - 2]
- )
+ threetwo = "{0} {1}".format(words_and_emoticons[i-3], words_and_emoticons[i-2])
sequences = [onezero, twoonezero, twoone, threetwoone, threetwo]
for seq in sequences:
- if seq in self.constants.SPECIAL_CASE_IDIOMS:
- valence = self.constants.SPECIAL_CASE_IDIOMS[seq]
+ if seq in SPECIAL_CASE_IDIOMS:
+ valence = SPECIAL_CASE_IDIOMS[seq]
break
- if len(words_and_emoticons) - 1 > i:
- zeroone = "{0} {1}".format(
- words_and_emoticons[i], words_and_emoticons[i + 1]
- )
- if zeroone in self.constants.SPECIAL_CASE_IDIOMS:
- valence = self.constants.SPECIAL_CASE_IDIOMS[zeroone]
- if len(words_and_emoticons) - 1 > i + 1:
- zeroonetwo = "{0} {1} {2}".format(
- words_and_emoticons[i],
- words_and_emoticons[i + 1],
- words_and_emoticons[i + 2],
- )
- if zeroonetwo in self.constants.SPECIAL_CASE_IDIOMS:
- valence = self.constants.SPECIAL_CASE_IDIOMS[zeroonetwo]
+ if len(words_and_emoticons)-1 > i:
+ zeroone = "{0} {1}".format(words_and_emoticons[i], words_and_emoticons[i+1])
+ if zeroone in SPECIAL_CASE_IDIOMS:
+ valence = SPECIAL_CASE_IDIOMS[zeroone]
+ if len(words_and_emoticons)-1 > i+1:
+ zeroonetwo = "{0} {1} {2}".format(words_and_emoticons[i], words_and_emoticons[i+1], words_and_emoticons[i+2])
+ if zeroonetwo in SPECIAL_CASE_IDIOMS:
+ valence = SPECIAL_CASE_IDIOMS[zeroonetwo]
# check for booster/dampener bi-grams such as 'sort of' or 'kind of'
- if threetwo in self.constants.BOOSTER_DICT or twoone in self.constants.BOOSTER_DICT:
- valence = valence + self.constants.B_DECR
+ if threetwo in BOOSTER_DICT or twoone in BOOSTER_DICT:
+ valence = valence+B_DECR
return valence
def _never_check(self, valence, words_and_emoticons, start_i, i):
if start_i == 0:
- if self.constants.negated([words_and_emoticons[i - 1]]):
- valence = valence * self.constants.N_SCALAR
+ if negated([words_and_emoticons[i-1]]):
+ valence = valence*N_SCALAR
if start_i == 1:
- if words_and_emoticons[i - 2] == "never" and (
- words_and_emoticons[i - 1] == "so"
- or words_and_emoticons[i - 1] == "this"
- ):
- valence = valence * 1.5
- elif self.constants.negated([words_and_emoticons[i - (start_i + 1)]]):
- valence = valence * self.constants.N_SCALAR
+ if words_and_emoticons[i-2] == "never" and\
+ (words_and_emoticons[i-1] == "so" or
+ words_and_emoticons[i-1] == "this"):
+ valence = valence*1.5
+ elif negated([words_and_emoticons[i-(start_i+1)]]):
+ valence = valence*N_SCALAR
if start_i == 2:
- if (
- words_and_emoticons[i - 3] == "never"
- and (
- words_and_emoticons[i - 2] == "so"
- or words_and_emoticons[i - 2] == "this"
- )
- or (
- words_and_emoticons[i - 1] == "so"
- or words_and_emoticons[i - 1] == "this"
- )
- ):
- valence = valence * 1.25
- elif self.constants.negated([words_and_emoticons[i - (start_i + 1)]]):
- valence = valence * self.constants.N_SCALAR
+ if words_and_emoticons[i-3] == "never" and \
+ (words_and_emoticons[i-2] == "so" or words_and_emoticons[i-2] == "this") or \
+ (words_and_emoticons[i-1] == "so" or words_and_emoticons[i-1] == "this"):
+ valence = valence*1.25
+ elif negated([words_and_emoticons[i-(start_i+1)]]):
+ valence = valence*N_SCALAR
return valence
def _punctuation_emphasis(self, sum_s, text):
# add emphasis from exclamation points and question marks
ep_amplifier = self._amplify_ep(text)
qm_amplifier = self._amplify_qm(text)
- punct_emph_amplifier = ep_amplifier + qm_amplifier
+ punct_emph_amplifier = ep_amplifier+qm_amplifier
return punct_emph_amplifier
def _amplify_ep(self, text):
ep_count = 4
# (empirically derived mean sentiment intensity rating increase for
# exclamation points)
- ep_amplifier = ep_count * 0.292
+ ep_amplifier = ep_count*0.292
return ep_amplifier
def _amplify_qm(self, text):
if qm_count <= 3:
# (empirically derived mean sentiment intensity rating increase for
# question marks)
- qm_amplifier = qm_count * 0.18
+ qm_amplifier = qm_count*0.18
else:
qm_amplifier = 0.96
return qm_amplifier
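# Worked numbers for the two amplifiers: '!' adds 0.292 each (capped at four),
# '?' adds 0.18 each up to three and then plateaus at 0.96, e.g.
# "good!!" -> 2 * 0.292 = 0.584, "why???" -> 3 * 0.18 = 0.54, "what?????" -> 0.96.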
neu_count = 0
for sentiment_score in sentiments:
if sentiment_score > 0:
- pos_sum += (
- float(sentiment_score) + 1
- ) # compensates for neutral words that are counted as 1
+ pos_sum += (float(sentiment_score) +1) # compensates for neutral words that are counted as 1
if sentiment_score < 0:
- neg_sum += (
- float(sentiment_score) - 1
- ) # when used with math.fabs(), compensates for neutrals
+ neg_sum += (float(sentiment_score) -1) # when used with math.fabs(), compensates for neutrals
if sentiment_score == 0:
neu_count += 1
return pos_sum, neg_sum, neu_count
punct_emph_amplifier = self._punctuation_emphasis(sum_s, text)
if sum_s > 0:
sum_s += punct_emph_amplifier
- elif sum_s < 0:
+ elif sum_s < 0:
sum_s -= punct_emph_amplifier
- compound = self.constants.normalize(sum_s)
+ compound = normalize(sum_s)
# discriminate between positive, negative and neutral sentiment scores
pos_sum, neg_sum, neu_count = self._sift_sentiment_scores(sentiments)
if pos_sum > math.fabs(neg_sum):
- pos_sum += punct_emph_amplifier
+ pos_sum += (punct_emph_amplifier)
elif pos_sum < math.fabs(neg_sum):
- neg_sum -= punct_emph_amplifier
+ neg_sum -= (punct_emph_amplifier)
total = pos_sum + math.fabs(neg_sum) + neu_count
pos = math.fabs(pos_sum / total)
neg = 0.0
neu = 0.0
- sentiment_dict = {
- "neg": round(neg, 3),
- "neu": round(neu, 3),
- "pos": round(pos, 3),
- "compound": round(compound, 4),
- }
+ sentiment_dict = \
+ {"neg" : round(neg, 3),
+ "neu" : round(neu, 3),
+ "pos" : round(pos, 3),
+ "compound" : round(compound, 4)}
return sentiment_dict
# Natural Language Toolkit: Stemmers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.rslp import RSLPStemmer
-from nltk.stem.cistem import Cistem
# Natural Language Toolkit: Stemmer Interface
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# For license information, see LICENSE.TXT
from abc import ABCMeta, abstractmethod
+from six import add_metaclass
-class StemmerI(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class StemmerI(object):
"""
A processing interface for removing morphological affixes from
words. This process is known as stemming.
"""
-
@abstractmethod
def stem(self, token):
"""
#
# Natural Language Toolkit: ARLSTem Stemmer
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
#
# Author: Kheireddine Abainia (x-programer) <k.abainia@gmail.com>
# Algorithms: Kheireddine Abainia <k.abainia@gmail.com>
ARLSTem is promising and produces high performance. This stemmer is not
based on any dictionary and can be used online effectively.
"""
+from __future__ import unicode_literals
import re
from nltk.stem.api import StemmerI
class ARLSTem(StemmerI):
- """
+ '''
ARLSTem stemmer : a light Arabic Stemming algorithm without any dictionary.
Department of Telecommunication & Information Processing. USTHB University,
Algiers, Algeria.
ARLSTem.stem(token) returns the Arabic stem for the input token.
The ARLSTem Stemmer requires that all tokens are encoded using Unicode
encoding.
- """
+ '''
def __init__(self):
# different Alif with hamza
- self.re_hamzated_alif = re.compile(r"[\u0622\u0623\u0625]")
- self.re_alifMaqsura = re.compile(r"[\u0649]")
- self.re_diacritics = re.compile(r"[\u064B-\u065F]")
+ self.re_hamzated_alif = re.compile(r'[\u0622\u0623\u0625]')
+ self.re_alifMaqsura = re.compile(r'[\u0649]')
+ self.re_diacritics = re.compile(r'[\u064B-\u065F]')
# Alif Laam, Laam Laam, Fa Laam, Fa Ba
- self.pr2 = ["\u0627\u0644", "\u0644\u0644", "\u0641\u0644", "\u0641\u0628"]
+ self.pr2 = [
+ '\u0627\u0644', '\u0644\u0644',
+ '\u0641\u0644', '\u0641\u0628'
+ ]
# Ba Alif Laam, Kaaf Alif Laam, Waaw Alif Laam
- self.pr3 = ["\u0628\u0627\u0644", "\u0643\u0627\u0644", "\u0648\u0627\u0644"]
+ self.pr3 = [
+ '\u0628\u0627\u0644',
+ '\u0643\u0627\u0644',
+ '\u0648\u0627\u0644'
+ ]
# Fa Laam Laam, Waaw Laam Laam
- self.pr32 = ["\u0641\u0644\u0644", "\u0648\u0644\u0644"]
+ self.pr32 = ['\u0641\u0644\u0644', '\u0648\u0644\u0644']
# Fa Ba Alif Laam, Waaw Ba Alif Laam, Fa Kaaf Alif Laam
self.pr4 = [
- "\u0641\u0628\u0627\u0644",
- "\u0648\u0628\u0627\u0644",
- "\u0641\u0643\u0627\u0644",
- ]
+ '\u0641\u0628\u0627\u0644',
+ '\u0648\u0628\u0627\u0644',
+ '\u0641\u0643\u0627\u0644'
+ ]
# Kaf Yaa, Kaf Miim
- self.su2 = ["\u0643\u064A", "\u0643\u0645"]
+ self.su2 = [
+ '\u0643\u064A',
+ '\u0643\u0645'
+ ]
# Ha Alif, Ha Miim
- self.su22 = ["\u0647\u0627", "\u0647\u0645"]
+ self.su22 = ['\u0647\u0627', '\u0647\u0645']
# Kaf Miim Alif, Kaf Noon Shadda
- self.su3 = ["\u0643\u0645\u0627", "\u0643\u0646\u0651"]
+ self.su3 = ['\u0643\u0645\u0627', '\u0643\u0646\u0651']
# Ha Miim Alif, Ha Noon Shadda
- self.su32 = ["\u0647\u0645\u0627", "\u0647\u0646\u0651"]
+ self.su32 = ['\u0647\u0645\u0627', '\u0647\u0646\u0651']
# Alif Noon, Ya Noon, Waaw Noon
- self.pl_si2 = ["\u0627\u0646", "\u064A\u0646", "\u0648\u0646"]
+ self.pl_si2 = ['\u0627\u0646', '\u064A\u0646', '\u0648\u0646']
# Taa Alif Noon, Taa Ya Noon
- self.pl_si3 = ["\u062A\u0627\u0646", "\u062A\u064A\u0646"]
+ self.pl_si3 = ['\u062A\u0627\u0646', '\u062A\u064A\u0646']
# Alif Noon, Waaw Noon
- self.verb_su2 = ["\u0627\u0646", "\u0648\u0646"]
+ self.verb_su2 = ['\u0627\u0646', '\u0648\u0646']
# Siin Taa, Siin Yaa
- self.verb_pr2 = ["\u0633\u062A", "\u0633\u064A"]
+ self.verb_pr2 = ['\u0633\u062A', '\u0633\u064A']
# Siin Alif, Siin Noon
- self.verb_pr22 = ["\u0633\u0627", "\u0633\u0646"]
- # Lam Noon, Lam Taa, Lam Yaa, Lam Hamza
- self.verb_pr33 = [
- "\u0644\u0646",
- "\u0644\u062A",
- "\u0644\u064A",
- "\u0644\u0623",
- ]
+ self.verb_pr22 = ['\u0633\u0627', '\u0633\u0646']
+
# Taa Miim Alif, Taa Noon Shadda
- self.verb_suf3 = ["\u062A\u0645\u0627", "\u062A\u0646\u0651"]
+ self.verb_suf3 = ['\u062A\u0645\u0627', '\u062A\u0646\u0651']
# Noon Alif, Taa Miim, Taa Alif, Waaw Alif
self.verb_suf2 = [
- "\u0646\u0627",
- "\u062A\u0645",
- "\u062A\u0627",
- "\u0648\u0627",
- ]
+ '\u0646\u0627', '\u062A\u0645',
+ '\u062A\u0627', '\u0648\u0627'
+ ]
# Taa, Alif, Noon
- self.verb_suf1 = ["\u062A", "\u0627", "\u0646"]
+ self.verb_suf1 = ['\u062A', '\u0627', '\u0646']
def stem(self, token):
"""
"""
try:
if token is None:
- raise ValueError(
- "The word could not be stemmed, because \
- it is empty !"
- )
+ raise ValueError("The word could not be stemmed, because \
+ it is empty !")
# remove Arabic diacritics and replace some letters with others
token = self.norm(token)
# strip common prefixes of the nouns
beginning.
"""
# strip Arabic diacritics
- token = self.re_diacritics.sub("", token)
+ token = self.re_diacritics.sub('', token)
# replace Hamzated Alif with Alif bare
- token = self.re_hamzated_alif.sub("\u0627", token)
+ token = self.re_hamzated_alif.sub('\u0627', token)
# replace alifMaqsura with Yaa
- token = self.re_alifMaqsura.sub("\u064A", token)
+ token = self.re_alifMaqsura.sub('\u064A', token)
# strip the Waaw from the word beginning if the remaining is 3 letters
# at least
- if token.startswith("\u0648") and len(token) > 3:
+ if token.startswith('\u0648') and len(token) > 3:
token = token[1:]
return token
"""
remove suffixes from the word's end.
"""
- if token.endswith("\u0643") and len(token) > 3:
+ if token.endswith('\u0643') and len(token) > 3:
return token[:-1]
if len(token) > 4:
for s2 in self.su2:
for s3 in self.su3:
if token.endswith(s3):
return token[:-3]
- if token.endswith("\u0647") and len(token) > 3:
+ if token.endswith('\u0647') and len(token) > 3:
token = token[:-1]
return token
if len(token) > 4:
for s3 in self.su32:
if token.endswith(s3):
return token[:-3]
- if token.endswith("\u0646\u0627") and len(token) > 4:
+ if token.endswith('\u0646\u0627') and len(token) > 4:
return token[:-2]
return token
"""
transform the word from the feminine form to the masculine form.
"""
- if token.endswith("\u0629") and len(token) > 3:
+ if token.endswith('\u0629') and len(token) > 3:
return token[:-1]
def plur2sing(self, token):
for ps3 in self.pl_si3:
if token.endswith(ps3):
return token[:-3]
- if len(token) > 3 and token.endswith("\u0627\u062A"):
+ if len(token) > 3 and token.endswith('\u0627\u062A'):
return token[:-2]
- if len(token) > 3 and token.startswith("\u0627") and token[2] == "\u0627":
+ if (len(token) > 3 and token.startswith('\u0627')
+ and token[2] == '\u0627'):
return token[:2] + token[3:]
- if len(token) > 4 and token.startswith("\u0627") and token[-2] == "\u0627":
+ if (len(token) > 4 and token.startswith('\u0627')
+ and token[-2] == '\u0627'):
return token[1:-2] + token[-1]
def verb(self, token):
vb = self.verb_t4(token)
if vb is not None:
return vb
- vb = self.verb_t5(token)
- if vb is not None:
- return vb
- return self.verb_t6(token)
+ return self.verb_t5(token)
def verb_t1(self, token):
"""
stem the present prefixes and suffixes
"""
- if len(token) > 5 and token.startswith("\u062A"): # Taa
+ if len(token) > 5 and token.startswith('\u062A'): # Taa
for s2 in self.pl_si2:
if token.endswith(s2):
return token[1:-2]
- if len(token) > 5 and token.startswith("\u064A"): # Yaa
+ if len(token) > 5 and token.startswith('\u064A'): # Yaa
for s2 in self.verb_su2:
if token.endswith(s2):
return token[1:-2]
- if len(token) > 4 and token.startswith("\u0627"): # Alif
+ if len(token) > 4 and token.startswith('\u0627'): # Alif
# Waaw Alif
- if len(token) > 5 and token.endswith("\u0648\u0627"):
+ if len(token) > 5 and token.endswith('\u0648\u0627'):
return token[1:-2]
# Yaa
- if token.endswith("\u064A"):
+ if token.endswith('\u064A'):
return token[1:-1]
# Alif
- if token.endswith("\u0627"):
+ if token.endswith('\u0627'):
return token[1:-1]
# Noon
- if token.endswith("\u0646"):
+ if token.endswith('\u0646'):
return token[1:-1]
# ^Yaa, Noon$
- if len(token) > 4 and token.startswith("\u064A") and token.endswith("\u0646"):
+ if (len(token) > 4
+ and token.startswith('\u064A')
+ and token.endswith('\u0646')):
return token[1:-1]
# ^Taa, Noon$
- if len(token) > 4 and token.startswith("\u062A") and token.endswith("\u0646"):
+ if (len(token) > 4
+ and token.startswith('\u062A')
+ and token.endswith('\u0646')):
return token[1:-1]
def verb_t2(self, token):
if len(token) > 6:
for s2 in self.pl_si2:
# ^Siin Taa
- if token.startswith(self.verb_pr2[0]) and token.endswith(s2):
+ if (token.startswith(self.verb_pr2[0])
+ and token.endswith(s2)):
return token[2:-2]
# ^Siin Yaa, Alif Noon$
- if token.startswith(self.verb_pr2[1]) and token.endswith(self.pl_si2[0]):
+ if (token.startswith(self.verb_pr2[1])
+ and token.endswith(self.pl_si2[0])):
return token[2:-2]
# ^Siin Yaa, Waaw Noon$
- if token.startswith(self.verb_pr2[1]) and token.endswith(self.pl_si2[2]):
+ if (token.startswith(self.verb_pr2[1])
+ and token.endswith(self.pl_si2[2])):
return token[2:-2]
# ^Siin Taa, Noon$
- if (
- len(token) > 5
- and token.startswith(self.verb_pr2[0])
- and token.endswith("\u0646")
- ):
+ if (len(token) > 5
+ and token.startswith(self.verb_pr2[0])
+ and token.endswith('\u0646')):
return token[2:-1]
# ^Siin Yaa, Noon$
- if (
- len(token) > 5
- and token.startswith(self.verb_pr2[1])
- and token.endswith("\u0646")
- ):
+ if (len(token) > 5
+ and token.startswith(self.verb_pr2[1])
+ and token.endswith('\u0646')):
return token[2:-1]
def verb_t3(self, token):
"""
if len(token) > 5:
for su3 in self.verb_suf3:
- if token.endswith(su3):
+ if(token.endswith(su3)):
return token[:-3]
if len(token) > 4:
for su2 in self.verb_suf2:
for pr1 in self.verb_suf1:
if token.startswith(pr1):
return token[1:]
- if token.startswith("\u064A"):
+ if token.startswith('\u064A'):
return token[1:]
def verb_t5(self, token):
if token.startswith(pr2):
return token[2:]
return token
-
- def verb_t6(self, token):
- """
- stem the order prefixes
- """
- if len(token) > 4:
- for pr3 in self.verb_pr33:
- if token.startswith(pr3):
- return token[2:]
- return token
+++ /dev/null
-# -*- coding: utf-8 -*-
-# Natural Language Toolkit: CISTEM Stemmer for German
-# Copyright (C) 2001-2020 NLTK Project
-# Author: Leonie Weissweiler <l.weissweiler@outlook.de>
-# Algorithm: Leonie Weissweiler <l.weissweiler@outlook.de>
-# Alexander Fraser <fraser@cis.lmu.de>
-# URL: <http://nltk.org/>
-# For license information, see LICENSE.TXT
-
-import re
-from nltk.stem.api import StemmerI
-
-
-class Cistem(StemmerI):
- """
- CISTEM Stemmer for German
-
- This is the official Python implementation of the CISTEM stemmer.
- It is based on the paper
- Leonie Weissweiler, Alexander Fraser (2017). Developing a Stemmer for German
- Based on a Comparative Analysis of Publicly Available Stemmers.
- In Proceedings of the German Society for Computational Linguistics and Language
- Technology (GSCL)
- which can be read here:
- http://www.cis.lmu.de/~weissweiler/cistem/
-
- In the paper, we conducted an analysis of publicly available stemmers,
- developed two gold standards for German stemming and evaluated the stemmers
- based on the two gold standards. We then proposed the stemmer implemented here
- and show that it achieves slightly better f-measure than the other stemmers and
- is thrice as fast as the Snowball stemmer for German while being about as fast
- as most other stemmers.
-
- case_insensitive is a boolean specifying whether case-insensitive stemming
- should be used. Case insensitivity improves performance only if words in the
- text may be incorrectly upper case. For all-lowercase and correctly cased
- text, best performance is achieved by setting case_insensitive to False.
-
- :param case_insensitive: if True, the stemming is case insensitive. False by default.
- :type case_insensitive: bool
- """
-
- strip_ge = re.compile(r"^ge(.{4,})")
- repl_xx = re.compile(r"(.)\1")
- strip_emr = re.compile(r"e[mr]$")
- strip_nd = re.compile(r"nd$")
- strip_t = re.compile(r"t$")
- strip_esn = re.compile(r"[esn]$")
- repl_xx_back = re.compile(r"(.)\*")
-
- def __init__(self, case_insensitive=False):
- self._case_insensitive = case_insensitive
-
- @staticmethod
- def replace_to(word):
- word = word.replace("sch", "$")
- word = word.replace("ei", "%")
- word = word.replace("ie", "&")
- word = Cistem.repl_xx.sub(r"\1*", word)
-
- return word
-
- @staticmethod
- def replace_back(word):
- word = Cistem.repl_xx_back.sub(r"\1\1", word)
- word = word.replace("%", "ei")
- word = word.replace("&", "ie")
- word = word.replace("$", "sch")
-
- return word
-
- def stem(self, word):
- """
- This method takes the word to be stemmed and returns the stemmed word.
-
- :param word: the word that is to be stemmed
- :type word: unicode
- :return word: the stemmed word
- :rtype: unicode
-
- >>> from nltk.stem.cistem import Cistem
- >>> stemmer = Cistem()
- >>> s1 = "Speicherbehältern"
- >>> stemmer.stem(s1)
- 'speicherbehalt'
- >>> s2 = "Grenzpostens"
- >>> stemmer.stem(s2)
- 'grenzpost'
- >>> s3 = "Ausgefeiltere"
- >>> stemmer.stem(s3)
- 'ausgefeilt'
- >>> stemmer = Cistem(True)
- >>> stemmer.stem(s1)
- 'speicherbehal'
- >>> stemmer.stem(s2)
- 'grenzpo'
- >>> stemmer.stem(s3)
- 'ausgefeil'
- """
- if len(word) == 0:
- return word
-
- upper = word[0].isupper()
- word = word.lower()
-
- word = word.replace("ü", "u")
- word = word.replace("ö", "o")
- word = word.replace("ä", "a")
- word = word.replace("ß", "ss")
-
- word = Cistem.strip_ge.sub(r"\1", word)
- word = Cistem.replace_to(word)
-
- while len(word) > 3:
- if len(word) > 5:
- (word, success) = Cistem.strip_emr.subn("", word)
- if success != 0:
- continue
-
- (word, success) = Cistem.strip_nd.subn("", word)
- if success != 0:
- continue
-
- if not upper or self._case_insensitive:
- (word, success) = Cistem.strip_t.subn("", word)
- if success != 0:
- continue
-
- (word, success) = Cistem.strip_esn.subn("", word)
- if success != 0:
- continue
- else:
- break
-
- word = Cistem.replace_back(word)
-
- return word
-
- def segment(self, word):
- """
- This method works very similarly to stem (:func:`cistem.stem`). The difference is that in
- addition to returning the stem, it also returns the rest that was removed at
- the end. To be able to return the stem unchanged, so that the stem and the rest
- can be concatenated to form the original word, all substitutions that altered
- the stem in any way other than by removing letters at the end were left out.
-
- :param word: the word that is to be stemmed
- :type word: unicode
- :return word: the stemmed word
- :rtype: unicode
- :return word: the removed suffix
- :rtype: unicode
-
- >>> from nltk.stem.cistem import Cistem
- >>> stemmer = Cistem()
- >>> s1 = "Speicherbehältern"
- >>> print("('" + stemmer.segment(s1)[0] + "', '" + stemmer.segment(s1)[1] + "')")
- ('speicherbehält', 'ern')
- >>> s2 = "Grenzpostens"
- >>> stemmer.segment(s2)
- ('grenzpost', 'ens')
- >>> s3 = "Ausgefeiltere"
- >>> stemmer.segment(s3)
- ('ausgefeilt', 'ere')
- >>> stemmer = Cistem(True)
- >>> print("('" + stemmer.segment(s1)[0] + "', '" + stemmer.segment(s1)[1] + "')")
- ('speicherbehäl', 'tern')
- >>> stemmer.segment(s2)
- ('grenzpo', 'stens')
- >>> stemmer.segment(s3)
- ('ausgefeil', 'tere')
- """
-
- rest_length = 0
-
- if len(word) == 0:
- return ("", "")
-
- upper = word[0].isupper()
- word = word.lower()
-
- original = word[:]
-
- word = Cistem.replace_to(word)
-
- while len(word) > 3:
- if len(word) > 5:
- (word, success) = Cistem.strip_emr.subn("", word)
- if success != 0:
- rest_length += 2
- continue
-
- (word, success) = Cistem.strip_nd.subn("", word)
- if success != 0:
- rest_length += 2
- continue
-
- if not upper or self._case_insensitive:
- (word, success) = Cistem.strip_t.subn("", word)
- if success != 0:
- rest_length += 1
- continue
-
- (word, success) = Cistem.strip_esn.subn("", word)
- if success != 0:
- rest_length += 1
- continue
- else:
- break
-
- word = Cistem.replace_back(word)
-
- if rest_length:
- rest = original[-rest_length:]
- else:
- rest = ""
-
- return (word, rest)
#
# Natural Language Toolkit: The ISRI Arabic Stemmer
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Algorithm: Kazem Taghva, Rania Elkhoury, and Jeffrey Coombs (2005)
# Author: Hosam Algasaier <hosam_hme@yahoo.com>
# URL: <http://nltk.org/>
increases the word ambiguities and changes the original root.
"""
+from __future__ import unicode_literals
import re
from nltk.stem.api import StemmerI
class ISRIStemmer(StemmerI):
- """
+ '''
ISRI Arabic stemmer based on algorithm: Arabic Stemming without a root dictionary.
Information Science Research Institute. University of Nevada, Las Vegas, USA.
The ISRI Stemmer requires that all tokens have Unicode string types.
If you use Python IDLE on Arabic Windows you have to decode text first
using Arabic '1256' coding.
- """
+ '''
def __init__(self):
# length three prefixes
- self.p3 = [
- "\u0643\u0627\u0644",
- "\u0628\u0627\u0644",
- "\u0648\u0644\u0644",
- "\u0648\u0627\u0644",
- ]
+ self.p3 = ['\u0643\u0627\u0644', '\u0628\u0627\u0644',
+ '\u0648\u0644\u0644', '\u0648\u0627\u0644']
# length two prefixes
- self.p2 = ["\u0627\u0644", "\u0644\u0644"]
+ self.p2 = ['\u0627\u0644', '\u0644\u0644']
# length one prefixes
- self.p1 = [
- "\u0644",
- "\u0628",
- "\u0641",
- "\u0633",
- "\u0648",
- "\u064a",
- "\u062a",
- "\u0646",
- "\u0627",
- ]
+ self.p1 = ['\u0644', '\u0628', '\u0641', '\u0633', '\u0648',
+ '\u064a', '\u062a', '\u0646', '\u0627']
# length three suffixes
- self.s3 = [
- "\u062a\u0645\u0644",
- "\u0647\u0645\u0644",
- "\u062a\u0627\u0646",
- "\u062a\u064a\u0646",
- "\u0643\u0645\u0644",
- ]
+ self.s3 = ['\u062a\u0645\u0644', '\u0647\u0645\u0644',
+ '\u062a\u0627\u0646', '\u062a\u064a\u0646',
+ '\u0643\u0645\u0644']
# length two suffixes
- self.s2 = [
- "\u0648\u0646",
- "\u0627\u062a",
- "\u0627\u0646",
- "\u064a\u0646",
- "\u062a\u0646",
- "\u0643\u0645",
- "\u0647\u0646",
- "\u0646\u0627",
- "\u064a\u0627",
- "\u0647\u0627",
- "\u062a\u0645",
- "\u0643\u0646",
- "\u0646\u064a",
- "\u0648\u0627",
- "\u0645\u0627",
- "\u0647\u0645",
- ]
+ self.s2 = ['\u0648\u0646', '\u0627\u062a', '\u0627\u0646',
+ '\u064a\u0646', '\u062a\u0646', '\u0643\u0645',
+ '\u0647\u0646', '\u0646\u0627', '\u064a\u0627',
+ '\u0647\u0627', '\u062a\u0645', '\u0643\u0646',
+ '\u0646\u064a', '\u0648\u0627', '\u0645\u0627',
+ '\u0647\u0645']
# length one suffixes
- self.s1 = ["\u0629", "\u0647", "\u064a", "\u0643", "\u062a", "\u0627", "\u0646"]
+ self.s1 = ['\u0629', '\u0647', '\u064a', '\u0643', '\u062a',
+ '\u0627', '\u0646']
# groups of length four patterns
- self.pr4 = {
- 0: ["\u0645"],
- 1: ["\u0627"],
- 2: ["\u0627", "\u0648", "\u064A"],
- 3: ["\u0629"],
- }
+ self.pr4 = {0: ['\u0645'], 1: ['\u0627'],
+ 2: ['\u0627', '\u0648', '\u064A'], 3: ['\u0629']}
# Groups of length five patterns and length three roots
- self.pr53 = {
- 0: ["\u0627", "\u062a"],
- 1: ["\u0627", "\u064a", "\u0648"],
- 2: ["\u0627", "\u062a", "\u0645"],
- 3: ["\u0645", "\u064a", "\u062a"],
- 4: ["\u0645", "\u062a"],
- 5: ["\u0627", "\u0648"],
- 6: ["\u0627", "\u0645"],
- }
-
- self.re_short_vowels = re.compile(r"[\u064B-\u0652]")
- self.re_hamza = re.compile(r"[\u0621\u0624\u0626]")
- self.re_initial_hamza = re.compile(r"^[\u0622\u0623\u0625]")
-
- self.stop_words = [
- "\u064a\u0643\u0648\u0646",
- "\u0648\u0644\u064a\u0633",
- "\u0648\u0643\u0627\u0646",
- "\u0643\u0630\u0644\u0643",
- "\u0627\u0644\u062a\u064a",
- "\u0648\u0628\u064a\u0646",
- "\u0639\u0644\u064a\u0647\u0627",
- "\u0645\u0633\u0627\u0621",
- "\u0627\u0644\u0630\u064a",
- "\u0648\u0643\u0627\u0646\u062a",
- "\u0648\u0644\u0643\u0646",
- "\u0648\u0627\u0644\u062a\u064a",
- "\u062a\u0643\u0648\u0646",
- "\u0627\u0644\u064a\u0648\u0645",
- "\u0627\u0644\u0644\u0630\u064a\u0646",
- "\u0639\u0644\u064a\u0647",
- "\u0643\u0627\u0646\u062a",
- "\u0644\u0630\u0644\u0643",
- "\u0623\u0645\u0627\u0645",
- "\u0647\u0646\u0627\u0643",
- "\u0645\u0646\u0647\u0627",
- "\u0645\u0627\u0632\u0627\u0644",
- "\u0644\u0627\u0632\u0627\u0644",
- "\u0644\u0627\u064a\u0632\u0627\u0644",
- "\u0645\u0627\u064a\u0632\u0627\u0644",
- "\u0627\u0635\u0628\u062d",
- "\u0623\u0635\u0628\u062d",
- "\u0623\u0645\u0633\u0649",
- "\u0627\u0645\u0633\u0649",
- "\u0623\u0636\u062d\u0649",
- "\u0627\u0636\u062d\u0649",
- "\u0645\u0627\u0628\u0631\u062d",
- "\u0645\u0627\u0641\u062a\u0626",
- "\u0645\u0627\u0627\u0646\u0641\u0643",
- "\u0644\u0627\u0633\u064a\u0645\u0627",
- "\u0648\u0644\u0627\u064a\u0632\u0627\u0644",
- "\u0627\u0644\u062d\u0627\u0644\u064a",
- "\u0627\u0644\u064a\u0647\u0627",
- "\u0627\u0644\u0630\u064a\u0646",
- "\u0641\u0627\u0646\u0647",
- "\u0648\u0627\u0644\u0630\u064a",
- "\u0648\u0647\u0630\u0627",
- "\u0644\u0647\u0630\u0627",
- "\u0641\u0643\u0627\u0646",
- "\u0633\u062a\u0643\u0648\u0646",
- "\u0627\u0644\u064a\u0647",
- "\u064a\u0645\u0643\u0646",
- "\u0628\u0647\u0630\u0627",
- "\u0627\u0644\u0630\u0649",
- ]
+ self.pr53 = {0: ['\u0627', '\u062a'],
+ 1: ['\u0627', '\u064a', '\u0648'],
+ 2: ['\u0627', '\u062a', '\u0645'],
+ 3: ['\u0645', '\u064a', '\u062a'],
+ 4: ['\u0645', '\u062a'],
+ 5: ['\u0627', '\u0648'],
+ 6: ['\u0627', '\u0645']}
+
+ self.re_short_vowels = re.compile(r'[\u064B-\u0652]')
+ self.re_hamza = re.compile(r'[\u0621\u0624\u0626]')
+ self.re_initial_hamza = re.compile(r'^[\u0622\u0623\u0625]')
+
+ self.stop_words = ['\u064a\u0643\u0648\u0646',
+ '\u0648\u0644\u064a\u0633',
+ '\u0648\u0643\u0627\u0646',
+ '\u0643\u0630\u0644\u0643',
+ '\u0627\u0644\u062a\u064a',
+ '\u0648\u0628\u064a\u0646',
+ '\u0639\u0644\u064a\u0647\u0627',
+ '\u0645\u0633\u0627\u0621',
+ '\u0627\u0644\u0630\u064a',
+ '\u0648\u0643\u0627\u0646\u062a',
+ '\u0648\u0644\u0643\u0646',
+ '\u0648\u0627\u0644\u062a\u064a',
+ '\u062a\u0643\u0648\u0646',
+ '\u0627\u0644\u064a\u0648\u0645',
+ '\u0627\u0644\u0644\u0630\u064a\u0646',
+ '\u0639\u0644\u064a\u0647',
+ '\u0643\u0627\u0646\u062a',
+ '\u0644\u0630\u0644\u0643',
+ '\u0623\u0645\u0627\u0645',
+ '\u0647\u0646\u0627\u0643',
+ '\u0645\u0646\u0647\u0627',
+ '\u0645\u0627\u0632\u0627\u0644',
+ '\u0644\u0627\u0632\u0627\u0644',
+ '\u0644\u0627\u064a\u0632\u0627\u0644',
+ '\u0645\u0627\u064a\u0632\u0627\u0644',
+ '\u0627\u0635\u0628\u062d',
+ '\u0623\u0635\u0628\u062d',
+ '\u0623\u0645\u0633\u0649',
+ '\u0627\u0645\u0633\u0649',
+ '\u0623\u0636\u062d\u0649',
+ '\u0627\u0636\u062d\u0649',
+ '\u0645\u0627\u0628\u0631\u062d',
+ '\u0645\u0627\u0641\u062a\u0626',
+ '\u0645\u0627\u0627\u0646\u0641\u0643',
+ '\u0644\u0627\u0633\u064a\u0645\u0627',
+ '\u0648\u0644\u0627\u064a\u0632\u0627\u0644',
+ '\u0627\u0644\u062d\u0627\u0644\u064a',
+ '\u0627\u0644\u064a\u0647\u0627',
+ '\u0627\u0644\u0630\u064a\u0646',
+ '\u0641\u0627\u0646\u0647',
+ '\u0648\u0627\u0644\u0630\u064a',
+ '\u0648\u0647\u0630\u0627',
+ '\u0644\u0647\u0630\u0627',
+ '\u0641\u0643\u0627\u0646',
+ '\u0633\u062a\u0643\u0648\u0646',
+ '\u0627\u0644\u064a\u0647',
+ '\u064a\u0645\u0643\u0646',
+ '\u0628\u0647\u0630\u0627',
+ '\u0627\u0644\u0630\u0649']
def stem(self, token):
"""
Stemming a word token using the ISRI stemmer.
"""
- token = self.norm(
- token, 1
- ) # remove diacritics which represent Arabic short vowels
+ token = self.norm(token, 1) # remove diacritics which represent Arabic short vowels
if token in self.stop_words:
- return token # exclude stop words from being processed
- token = self.pre32(
- token
- ) # remove length three and length two prefixes in this order
- token = self.suf32(
- token
- ) # remove length three and length two suffixes in this order
- token = self.waw(
- token
- ) # remove connective ‘و’ if it precedes a word beginning with ‘و’
- token = self.norm(token, 2) # normalize initial hamza to bare alif
+ return token # exclude stop words from being processed
+ token = self.pre32(token) # remove length three and length two prefixes in this order
+ token = self.suf32(token) # remove length three and length two suffixes in this order
+ token = self.waw(token) # remove connective ‘و’ if it precedes a word beginning with ‘و’
+ token = self.norm(token, 2) # normalize initial hamza to bare alif
# if 4 <= word length <= 7, then stem; otherwise, no stemming
- if len(token) == 4: # length 4 word
+ if len(token) == 4: # length 4 word
token = self.pro_w4(token)
- elif len(token) == 5: # length 5 word
+ elif len(token) == 5: # length 5 word
token = self.pro_w53(token)
token = self.end_w5(token)
- elif len(token) == 6: # length 6 word
+ elif len(token) == 6: # length 6 word
token = self.pro_w6(token)
token = self.end_w6(token)
- elif len(token) == 7: # length 7 word
+ elif len(token) == 7: # length 7 word
token = self.suf1(token)
if len(token) == 7:
token = self.pre1(token)
num=3 both 1&2
"""
if num == 1:
- word = self.re_short_vowels.sub("", word)
+ word = self.re_short_vowels.sub('', word)
elif num == 2:
- word = self.re_initial_hamza.sub("\u0627", word)
+ word = self.re_initial_hamza.sub('\u0627', word)
elif num == 3:
- word = self.re_short_vowels.sub("", word)
- word = self.re_initial_hamza.sub("\u0627", word)
+ word = self.re_short_vowels.sub('', word)
+ word = self.re_initial_hamza.sub('\u0627', word)
return word
def pre32(self, word):
def waw(self, word):
"""remove connective ‘و’ if it precedes a word beginning with ‘و’ """
- if len(word) >= 4 and word[:2] == "\u0648\u0648":
+ if len(word) >= 4 and word[:2] == '\u0648\u0648':
word = word[1:]
return word
def pro_w4(self, word):
"""process length four patterns and extract length three roots"""
- if word[0] in self.pr4[0]: # مفعل
+ if word[0] in self.pr4[0]: # مفعل
word = word[1:]
- elif word[1] in self.pr4[1]: # فاعل
+ elif word[1] in self.pr4[1]: # فاعل
word = word[:1] + word[2:]
- elif word[2] in self.pr4[2]: # فعال - فعول - فعيل
+ elif word[2] in self.pr4[2]: # فعال - فعول - فعيل
word = word[:2] + word[3]
- elif word[3] in self.pr4[3]: # فعلة
+ elif word[3] in self.pr4[3]: # فعلة
word = word[:-1]
else:
- word = self.suf1(word) # do - normalize short suffix
+ word = self.suf1(word) # do - normalize short suffix
if len(word) == 4:
word = self.pre1(word) # do - normalize short prefix
return word
def pro_w53(self, word):
"""process length five patterns and extract length three roots"""
- if word[2] in self.pr53[0] and word[0] == "\u0627": # افتعل - افاعل
+ if word[2] in self.pr53[0] and word[0] == '\u0627': # افتعل - افاعل
word = word[1] + word[3:]
- elif word[3] in self.pr53[1] and word[0] == "\u0645": # مفعول - مفعال - مفعيل
+ elif word[3] in self.pr53[1] and word[0] == '\u0645': # مفعول - مفعال - مفعيل
word = word[1:3] + word[4]
- elif word[0] in self.pr53[2] and word[4] == "\u0629": # مفعلة - تفعلة - افعلة
+ elif word[0] in self.pr53[2] and word[4] == '\u0629': # مفعلة - تفعلة - افعلة
word = word[1:4]
- elif word[0] in self.pr53[3] and word[2] == "\u062a": # مفتعل - يفتعل - تفتعل
+ elif word[0] in self.pr53[3] and word[2] == '\u062a': # مفتعل - يفتعل - تفتعل
word = word[1] + word[3:]
- elif word[0] in self.pr53[4] and word[2] == "\u0627": # مفاعل - تفاعل
+ elif word[0] in self.pr53[4] and word[2] == '\u0627': # مفاعل - تفاعل
word = word[1] + word[3:]
- elif word[2] in self.pr53[5] and word[4] == "\u0629": # فعولة - فعالة
+ elif word[2] in self.pr53[5] and word[4] == '\u0629': # فعولة - فعالة
word = word[:2] + word[3]
- elif word[0] in self.pr53[6] and word[1] == "\u0646": # انفعل - منفعل
+ elif word[0] in self.pr53[6] and word[1] == '\u0646': # انفعل - منفعل
word = word[2:]
- elif word[3] == "\u0627" and word[0] == "\u0627": # افعال
+ elif word[3] == '\u0627' and word[0] == '\u0627': # افعال
word = word[1:3] + word[4]
- elif word[4] == "\u0646" and word[3] == "\u0627": # فعلان
+ elif word[4] == '\u0646' and word[3] == '\u0627': # فعلان
word = word[:3]
- elif word[3] == "\u064a" and word[0] == "\u062a": # تفعيل
+ elif word[3] == '\u064a' and word[0] == '\u062a': # تفعيل
word = word[1:3] + word[4]
- elif word[3] == "\u0648" and word[1] == "\u0627": # فاعول
+ elif word[3] == '\u0648' and word[1] == '\u0627': # فاعول
word = word[0] + word[2] + word[4]
- elif word[2] == "\u0627" and word[1] == "\u0648": # فواعل
+ elif word[2] == '\u0627' and word[1] == '\u0648': # فواعل
word = word[0] + word[3:]
- elif word[3] == "\u0626" and word[2] == "\u0627": # فعائل
+ elif word[3] == '\u0626' and word[2] == '\u0627': # فعائل
word = word[:2] + word[4]
- elif word[4] == "\u0629" and word[1] == "\u0627": # فاعلة
+ elif word[4] == '\u0629' and word[1] == '\u0627': # فاعلة
word = word[0] + word[2:4]
- elif word[4] == "\u064a" and word[2] == "\u0627": # فعالي
+ elif word[4] == '\u064a' and word[2] == '\u0627': # فعالي
word = word[:2] + word[3]
else:
- word = self.suf1(word) # do - normalize short sufix
+ word = self.suf1(word) # do - normalize short suffix
if len(word) == 5:
word = self.pre1(word) # do - normalize short prefix
return word
"""process length five patterns and extract length four roots"""
if word[0] in self.pr53[2]: # تفعلل - افعلل - مفعلل
word = word[1:]
- elif word[4] == "\u0629": # فعللة
+ elif word[4] == '\u0629': # فعللة
word = word[:4]
- elif word[2] == "\u0627": # فعالل
+ elif word[2] == '\u0627': # فعالل
word = word[:2] + word[3:]
return word
def pro_w6(self, word):
"""process length six patterns and extract length three roots"""
- if word.startswith("\u0627\u0633\u062a") or word.startswith(
- "\u0645\u0633\u062a"
- ): # مستفعل - استفعل
+ if word.startswith('\u0627\u0633\u062a') or word.startswith('\u0645\u0633\u062a'): # مستفعل - استفعل
word = word[3:]
- elif (
- word[0] == "\u0645" and word[3] == "\u0627" and word[5] == "\u0629"
- ): # مفعالة
+ elif word[0] == '\u0645' and word[3] == '\u0627' and word[5] == '\u0629': # مفعالة
word = word[1:3] + word[4]
- elif (
- word[0] == "\u0627" and word[2] == "\u062a" and word[4] == "\u0627"
- ): # افتعال
+ elif word[0] == '\u0627' and word[2] == '\u062a' and word[4] == '\u0627': # افتعال
word = word[1] + word[3] + word[5]
- elif (
- word[0] == "\u0627" and word[3] == "\u0648" and word[2] == word[4]
- ): # افعوعل
+ elif word[0] == '\u0627' and word[3] == '\u0648' and word[2] == word[4]: # افعوعل
word = word[1] + word[4:]
- elif (
- word[0] == "\u062a" and word[2] == "\u0627" and word[4] == "\u064a"
- ): # تفاعيل new pattern
+ elif word[0] == '\u062a' and word[2] == '\u0627' and word[4] == '\u064a': # تفاعيل new pattern
word = word[1] + word[3] + word[5]
else:
- word = self.suf1(word) # do - normalize short sufix
+ word = self.suf1(word) # do - normalize short suffix
if len(word) == 6:
word = self.pre1(word) # do - normalize short prefix
return word
def pro_w64(self, word):
"""process length six patterns and extract length four roots"""
- if word[0] == "\u0627" and word[4] == "\u0627": # افعلال
+ if word[0] == '\u0627' and word[4] == '\u0627': # افعلال
word = word[1:4] + word[5]
- elif word.startswith("\u0645\u062a"): # متفعلل
+ elif word.startswith('\u0645\u062a'): # متفعلل
word = word[2:]
return word
if word.startswith(sp1):
return word[1:]
return word
+
+
# Natural Language Toolkit: Stemmers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Tomcavage <stomcava@law.upenn.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
A word stemmer based on the Lancaster (Paice/Husk) stemming algorithm.
Paice, Chris D. "Another Stemmer." ACM SIGIR Forum 24.3 (1990): 56-61.
"""
+from __future__ import unicode_literals
import re
from nltk.stem.api import StemmerI
+from nltk.compat import python_2_unicode_compatible
-
+@python_2_unicode_compatible
class LancasterStemmer(StemmerI):
"""
Lancaster Stemmer
# The rule list is static since it doesn't change between instances
default_rule_tuple = (
- "ai*2.", # -ia > - if intact
- "a*1.", # -a > - if intact
- "bb1.", # -bb > -b
- "city3s.", # -ytic > -ys
- "ci2>", # -ic > -
- "cn1t>", # -nc > -nt
- "dd1.", # -dd > -d
- "dei3y>", # -ied > -y
+ "ai*2.", # -ia > - if intact
+ "a*1.", # -a > - if intact
+ "bb1.", # -bb > -b
+ "city3s.", # -ytic > -ys
+ "ci2>", # -ic > -
+ "cn1t>", # -nc > -nt
+ "dd1.", # -dd > -d
+ "dei3y>", # -ied > -y
"deec2ss.", # -ceed >", -cess
- "dee1.", # -eed > -ee
- "de2>", # -ed > -
- "dooh4>", # -hood > -
- "e1>", # -e > -
- "feil1v.", # -lief > -liev
- "fi2>", # -if > -
- "gni3>", # -ing > -
- "gai3y.", # -iag > -y
- "ga2>", # -ag > -
- "gg1.", # -gg > -g
- "ht*2.", # -th > - if intact
- "hsiug5ct.", # -guish > -ct
- "hsi3>", # -ish > -
- "i*1.", # -i > - if intact
- "i1y>", # -i > -y
- "ji1d.", # -ij > -id -- see nois4j> & vis3j>
- "juf1s.", # -fuj > -fus
- "ju1d.", # -uj > -ud
- "jo1d.", # -oj > -od
- "jeh1r.", # -hej > -her
- "jrev1t.", # -verj > -vert
- "jsim2t.", # -misj > -mit
- "jn1d.", # -nj > -nd
- "j1s.", # -j > -s
+ "dee1.", # -eed > -ee
+ "de2>", # -ed > -
+ "dooh4>", # -hood > -
+ "e1>", # -e > -
+ "feil1v.", # -lief > -liev
+ "fi2>", # -if > -
+ "gni3>", # -ing > -
+ "gai3y.", # -iag > -y
+ "ga2>", # -ag > -
+ "gg1.", # -gg > -g
+ "ht*2.", # -th > - if intact
+ "hsiug5ct.", # -guish > -ct
+ "hsi3>", # -ish > -
+ "i*1.", # -i > - if intact
+ "i1y>", # -i > -y
+ "ji1d.", # -ij > -id -- see nois4j> & vis3j>
+ "juf1s.", # -fuj > -fus
+ "ju1d.", # -uj > -ud
+ "jo1d.", # -oj > -od
+ "jeh1r.", # -hej > -her
+ "jrev1t.", # -verj > -vert
+ "jsim2t.", # -misj > -mit
+ "jn1d.", # -nj > -nd
+ "j1s.", # -j > -s
"lbaifi6.", # -ifiabl > -
- "lbai4y.", # -iabl > -y
- "lba3>", # -abl > -
- "lbi3.", # -ibl > -
- "lib2l>", # -bil > -bl
- "lc1.", # -cl > c
- "lufi4y.", # -iful > -y
- "luf3>", # -ful > -
- "lu2.", # -ul > -
- "lai3>", # -ial > -
- "lau3>", # -ual > -
- "la2>", # -al > -
- "ll1.", # -ll > -l
- "mui3.", # -ium > -
- "mu*2.", # -um > - if intact
- "msi3>", # -ism > -
- "mm1.", # -mm > -m
- "nois4j>", # -sion > -j
+ "lbai4y.", # -iabl > -y
+ "lba3>", # -abl > -
+ "lbi3.", # -ibl > -
+ "lib2l>", # -bil > -bl
+ "lc1.", # -cl > c
+ "lufi4y.", # -iful > -y
+ "luf3>", # -ful > -
+ "lu2.", # -ul > -
+ "lai3>", # -ial > -
+ "lau3>", # -ual > -
+ "la2>", # -al > -
+ "ll1.", # -ll > -l
+ "mui3.", # -ium > -
+ "mu*2.", # -um > - if intact
+ "msi3>", # -ism > -
+ "mm1.", # -mm > -m
+ "nois4j>", # -sion > -j
"noix4ct.", # -xion > -ct
- "noi3>", # -ion > -
- "nai3>", # -ian > -
- "na2>", # -an > -
- "nee0.", # protect -een
- "ne2>", # -en > -
- "nn1.", # -nn > -n
- "pihs4>", # -ship > -
- "pp1.", # -pp > -p
- "re2>", # -er > -
- "rae0.", # protect -ear
- "ra2.", # -ar > -
- "ro2>", # -or > -
- "ru2>", # -ur > -
- "rr1.", # -rr > -r
- "rt1>", # -tr > -t
- "rei3y>", # -ier > -y
- "sei3y>", # -ies > -y
- "sis2.", # -sis > -s
- "si2>", # -is > -
- "ssen4>", # -ness > -
- "ss0.", # protect -ss
- "suo3>", # -ous > -
- "su*2.", # -us > - if intact
- "s*1>", # -s > - if intact
- "s0.", # -s > -s
- "tacilp4y.", # -plicat > -ply
- "ta2>", # -at > -
- "tnem4>", # -ment > -
- "tne3>", # -ent > -
- "tna3>", # -ant > -
- "tpir2b.", # -ript > -rib
- "tpro2b.", # -orpt > -orb
- "tcud1.", # -duct > -duc
- "tpmus2.", # -sumpt > -sum
+ "noi3>", # -ion > -
+ "nai3>", # -ian > -
+ "na2>", # -an > -
+ "nee0.", # protect -een
+ "ne2>", # -en > -
+ "nn1.", # -nn > -n
+ "pihs4>", # -ship > -
+ "pp1.", # -pp > -p
+ "re2>", # -er > -
+ "rae0.", # protect -ear
+ "ra2.", # -ar > -
+ "ro2>", # -or > -
+ "ru2>", # -ur > -
+ "rr1.", # -rr > -r
+ "rt1>", # -tr > -t
+ "rei3y>", # -ier > -y
+ "sei3y>", # -ies > -y
+ "sis2.", # -sis > -s
+ "si2>", # -is > -
+ "ssen4>", # -ness > -
+ "ss0.", # protect -ss
+ "suo3>", # -ous > -
+ "su*2.", # -us > - if intact
+ "s*1>", # -s > - if intact
+ "s0.", # -s > -s
+ "tacilp4y.", # -plicat > -ply
+ "ta2>", # -at > -
+ "tnem4>", # -ment > -
+ "tne3>", # -ent > -
+ "tna3>", # -ant > -
+ "tpir2b.", # -ript > -rib
+ "tpro2b.", # -orpt > -orb
+ "tcud1.", # -duct > -duc
+ "tpmus2.", # -sumpt > -sum
"tpec2iv.", # -cept > -ceiv
- "tulo2v.", # -olut > -olv
- "tsis0.", # protect -sist
- "tsi3>", # -ist > -
- "tt1.", # -tt > -t
- "uqi3.", # -iqu > -
- "ugo1.", # -ogu > -og
- "vis3j>", # -siv > -j
- "vie0.", # protect -eiv
- "vi2>", # -iv > -
- "ylb1>", # -bly > -bl
- "yli3y>", # -ily > -y
- "ylp0.", # protect -ply
- "yl2>", # -ly > -
- "ygo1.", # -ogy > -og
- "yhp1.", # -phy > -ph
- "ymo1.", # -omy > -om
- "ypo1.", # -opy > -op
- "yti3>", # -ity > -
- "yte3>", # -ety > -
- "ytl2.", # -lty > -l
- "yrtsi5.", # -istry > -
- "yra3>", # -ary > -
- "yro3>", # -ory > -
- "yfi3.", # -ify > -
- "ycn2t>", # -ncy > -nt
- "yca3>", # -acy > -
- "zi2>", # -iz > -
- "zy1s.", # -yz > -ys
+ "tulo2v.", # -olut > -olv
+ "tsis0.", # protect -sist
+ "tsi3>", # -ist > -
+ "tt1.", # -tt > -t
+ "uqi3.", # -iqu > -
+ "ugo1.", # -ogu > -og
+ "vis3j>", # -siv > -j
+ "vie0.", # protect -eiv
+ "vi2>", # -iv > -
+ "ylb1>", # -bly > -bl
+ "yli3y>", # -ily > -y
+ "ylp0.", # protect -ply
+ "yl2>", # -ly > -
+ "ygo1.", # -ogy > -og
+ "yhp1.", # -phy > -ph
+ "ymo1.", # -omy > -om
+ "ypo1.", # -opy > -op
+ "yti3>", # -ity > -
+ "yte3>", # -ety > -
+ "ytl2.", # -lty > -l
+ "yrtsi5.", # -istry > -
+ "yra3>", # -ary > -
+ "yro3>", # -ory > -
+ "yfi3.", # -ify > -
+ "ycn2t>", # -ncy > -nt
+ "yca3>", # -acy > -
+ "zi2>", # -iz > -
+ "zy1s." # -yz > -ys
)
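# Each rule string above packs five fields: the suffix reversed, an
# optional '*' (apply to intact words only), the count of characters to
# remove, an optional replacement, and '.' (stop) or '>' (continue).
# A decoding sketch mirroring the five groups unpacked in stem() below:
# >>> import re
# >>> re.match(r"^([a-z]+)(\*?)(\d)([a-z]*)([>.]?)$", "gni3>").groups()
# ('gni', '', '3', '', '>')    # i.e. strip -ing, then keep stemming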
def __init__(self, rule_tuple=None, strip_prefix_flag=False):
last_letter_position = self.__getLastLetter(word)
# Only stem the word if it has a last letter and a rule matching that last letter
- if (
- last_letter_position < 0
- or word[last_letter_position] not in self.rule_dictionary
- ):
+ if last_letter_position < 0 or word[last_letter_position] not in self.rule_dictionary:
proceed = False
else:
for rule in self.rule_dictionary[word[last_letter_position]]:
rule_match = valid_rule.match(rule)
if rule_match:
- (
- ending_string,
- intact_flag,
- remove_total,
- append_string,
- cont_flag,
- ) = rule_match.groups()
+ (ending_string,
+ intact_flag,
+ remove_total,
+ append_string,
+ cont_flag) = rule_match.groups()
# Convert the number of chars to remove when stemming
# from a string to an integer
# Proceed if word's ending matches rule's word ending
if word.endswith(ending_string[::-1]):
if intact_flag:
- if word == intact_word and self.__isAcceptable(
- word, remove_total
- ):
- word = self.__applyRule(
- word, remove_total, append_string
- )
+ if (word == intact_word and
+ self.__isAcceptable(word, remove_total)):
+ word = self.__applyRule(word,
+ remove_total,
+ append_string)
rule_was_applied = True
- if cont_flag == ".":
+ if cont_flag == '.':
proceed = False
break
elif self.__isAcceptable(word, remove_total):
- word = self.__applyRule(
- word, remove_total, append_string
- )
+ word = self.__applyRule(word,
+ remove_total,
+ append_string)
rule_was_applied = True
- if cont_flag == ".":
+ if cont_flag == '.':
proceed = False
break
# If no rules apply, the word doesn't need any more stemming
# If the word starts with a vowel, it must be at least 2
# characters long to be stemmed
if word[0] in "aeiouy":
- if len(word) - remove_total >= 2:
+ if (len(word) - remove_total >= 2):
word_is_acceptable = True
# If the word starts with a consonant, it must be at least 3
# characters long (including one vowel) to be stemmed
- elif len(word) - remove_total >= 3:
+ elif (len(word) - remove_total >= 3):
if word[1] in "aeiouy":
word_is_acceptable = True
elif word[2] in "aeiouy":
word_is_acceptable = True
return word_is_acceptable
+
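# A hypothetical standalone restatement of the acceptability test above:
# def is_acceptable(stem):
#     if stem[0] in "aeiouy":
#         return len(stem) >= 2
#     return len(stem) >= 3 and any(c in "aeiouy" for c in stem[1:3])
# e.g. "run" -> True, "ru" -> False, "cry" -> True (the final "y"
# counts as a vowel in this check).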
def __applyRule(self, word, remove_total, append_string):
"""Apply the stemming rule to the word
"""
This function was originally taken from Whoosh.
"""
- for prefix in (
- "kilo",
- "micro",
- "milli",
- "intra",
- "ultra",
- "mega",
- "nano",
- "pico",
- "pseudo",
- ):
+ for prefix in ("kilo", "micro", "milli", "intra", "ultra", "mega",
+ "nano", "pico", "pseudo"):
if word.startswith(prefix):
- return word[len(prefix) :]
+ return word[len(prefix):]
return word
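# A minimal standalone sketch of the prefix loop above:
# for prefix in ("kilo", "micro", "milli"):
#     if "kilometer".startswith(prefix):
#         print("kilometer"[len(prefix):])    # -> "meter"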
def __repr__(self):
- return "<LancasterStemmer>"
+ return '<LancasterStemmer>'
in many languages.
"""
-__docformat__ = "plaintext"
+from __future__ import print_function, unicode_literals
+
+__docformat__ = 'plaintext'
import re
from nltk.stem.api import StemmerI
+from nltk.compat import python_2_unicode_compatible
-
+@python_2_unicode_compatible
class PorterStemmer(StemmerI):
"""
A word stemmer based on the Porter stemming algorithm.
Porter, M. "An algorithm for suffix stripping."
Program 14.3 (1980): 130-137.
-
+
See http://www.tartarus.org/~martin/PorterStemmer/ for the homepage
of the algorithm.
-
+
Martin Porter has endorsed several modifications to the Porter
algorithm since writing his original paper, and those extensions are
included in the implementations on his website. Additionally, others
PorterStemmer.ORIGINAL_ALGORITHM
- Implementation that is faithful to the original paper.
-
+
Note that Martin Porter has deprecated this version of the
algorithm. Martin distributes implementations of the Porter
Stemmer in many languages, hosted at:
-
+
http://www.tartarus.org/~martin/PorterStemmer/
-
+
and all of these implementations include his extensions. He
strongly recommends against using the original, published
version of the algorithm; only use this mode if you clearly
understand why you are choosing to do so.
-
+
PorterStemmer.MARTIN_EXTENSIONS
- Implementation that only uses the modifications to the
algorithm that are included in the implementations on Martin
Porter's website. He has declared Porter frozen, so the
behaviour of those implementations should never change.
-
+
PorterStemmer.NLTK_EXTENSIONS (default)
- Implementation that includes further improvements devised by
NLTK contributors or taken from other modified implementations
found on the web.
-
+
For the best stemming, you should use the default NLTK_EXTENSIONS
version. However, if you need to get the same results as either the
original algorithm or one of Martin Porter's hosted versions for
compatibility with an existing implementation or dataset, you can use
one of the other modes instead.
"""
-
+
# Modes the Stemmer can be instantiated in
- NLTK_EXTENSIONS = "NLTK_EXTENSIONS"
- MARTIN_EXTENSIONS = "MARTIN_EXTENSIONS"
- ORIGINAL_ALGORITHM = "ORIGINAL_ALGORITHM"
+ NLTK_EXTENSIONS = 'NLTK_EXTENSIONS'
+ MARTIN_EXTENSIONS = 'MARTIN_EXTENSIONS'
+ ORIGINAL_ALGORITHM = 'ORIGINAL_ALGORITHM'
def __init__(self, mode=NLTK_EXTENSIONS):
if mode not in (
self.NLTK_EXTENSIONS,
self.MARTIN_EXTENSIONS,
- self.ORIGINAL_ALGORITHM,
+ self.ORIGINAL_ALGORITHM
):
raise ValueError(
"Mode must be one of PorterStemmer.NLTK_EXTENSIONS, "
"PorterStemmer.MARTIN_EXTENSIONS, or "
"PorterStemmer.ORIGINAL_ALGORITHM"
)
-
+
self.mode = mode
-
+
if self.mode == self.NLTK_EXTENSIONS:
# This is a table of irregular forms. It is quite short,
# but still reflects the errors actually drawn to Martin
# Porter's attention over a 20 year period!
irregular_forms = {
- "sky": ["sky", "skies"],
- "die": ["dying"],
- "lie": ["lying"],
- "tie": ["tying"],
- "news": ["news"],
- "inning": ["innings", "inning"],
- "outing": ["outings", "outing"],
- "canning": ["cannings", "canning"],
- "howe": ["howe"],
- "proceed": ["proceed"],
- "exceed": ["exceed"],
- "succeed": ["succeed"],
+ "sky" : ["sky", "skies"],
+ "die" : ["dying"],
+ "lie" : ["lying"],
+ "tie" : ["tying"],
+ "news" : ["news"],
+ "inning" : ["innings", "inning"],
+ "outing" : ["outings", "outing"],
+ "canning" : ["cannings", "canning"],
+ "howe" : ["howe"],
+ "proceed" : ["proceed"],
+ "exceed" : ["exceed"],
+ "succeed" : ["succeed"],
}
self.pool = {}
for val in irregular_forms[key]:
self.pool[val] = key
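# After this loop, e.g. self.pool["skies"] == "sky" and
# self.pool["dying"] == "die", letting stem() short-circuit these
# irregular forms in NLTK_EXTENSIONS mode.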
- self.vowels = frozenset(["a", "e", "i", "o", "u"])
+ self.vowels = frozenset(['a', 'e', 'i', 'o', 'u'])
def _is_consonant(self, word, i):
"""Returns True if word[i] is a consonant, False otherwise
-
+
A consonant is defined in the paper as follows:
-
+
A consonant in a word is a letter other than A, E, I, O or
U, and other than Y preceded by a consonant. (The fact that
the term `consonant' is defined to some extent in terms of
"""
if word[i] in self.vowels:
return False
- if word[i] == "y":
+ if word[i] == 'y':
if i == 0:
return True
else:
- return not self._is_consonant(word, i - 1)
+ return (not self._is_consonant(word, i - 1))
return True
-
+
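# As in the paper: in "toy" the consonants are "t" and "y"; in "syzygy"
# they are "s", "z" and "g". A doctest-style sketch:
# >>> p = PorterStemmer()
# >>> [p._is_consonant("toy", i) for i in range(3)]
# [True, False, True]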
def _measure(self, stem):
"""Returns the 'measure' of stem, per definition in the paper
-
+
From the paper:
-
+
A consonant will be denoted by c, a vowel by v. A list
ccc... of length greater than 0 will be denoted by C, and a
list vvv... of length greater than 0 will be denoted by V.
CVCV ... V
VCVC ... C
VCVC ... V
-
+
These may all be represented by the single form
-
+
[C]VCVC ... [V]
-
+
where the square brackets denote arbitrary presence of their
contents. Using (VC){m} to denote VC repeated m times, this
may again be written as
m=1 TROUBLE, OATS, TREES, IVY.
m=2 TROUBLES, PRIVATE, OATEN, ORRERY.
"""
- cv_sequence = ""
-
+ cv_sequence = ''
+
# Construct a string of 'c's and 'v's representing whether each
# character in `stem` is a consonant or a vowel.
# e.g. 'falafel' becomes 'cvcvcvc',
# 'architecture' becomes 'vcccvcvccvcv'
for i in range(len(stem)):
if self._is_consonant(stem, i):
- cv_sequence += "c"
+ cv_sequence += 'c'
else:
- cv_sequence += "v"
-
+ cv_sequence += 'v'
+
# Count the number of 'vc' occurrences, which is equivalent to
# the number of 'VC' occurrences in Porter's reduced form in the
# docstring above, which is in turn equivalent to `m`
- return cv_sequence.count("vc")
-
+ return cv_sequence.count('vc')
+
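# Doctest-style sketch of the measure, using the docstring's own examples:
# >>> p = PorterStemmer()
# >>> [p._measure(w) for w in ("tr", "ee", "tree", "by")]            # m=0
# [0, 0, 0, 0]
# >>> [p._measure(w) for w in ("trouble", "oats", "trees", "ivy")]   # m=1
# [1, 1, 1, 1]
# >>> [p._measure(w) for w in ("troubles", "private", "oaten")]      # m=2
# [2, 2, 2]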
def _has_positive_measure(self, stem):
return self._measure(stem) > 0
if not self._is_consonant(stem, i):
return True
return False
-
+
def _ends_double_consonant(self, word):
"""Implements condition *d from the paper
-
+
Returns True if word ends with a double consonant
"""
return (
- len(word) >= 2
- and word[-1] == word[-2]
- and self._is_consonant(word, len(word) - 1)
+ len(word) >= 2 and
+ word[-1] == word[-2] and
+ self._is_consonant(word, len(word)-1)
)
def _ends_cvc(self, word):
"""Implements condition *o from the paper
-
+
From the paper:
-
+
*o - the stem ends cvc, where the second c is not W, X or Y
(e.g. -WIL, -HOP).
"""
return (
- len(word) >= 3
- and self._is_consonant(word, len(word) - 3)
- and not self._is_consonant(word, len(word) - 2)
- and self._is_consonant(word, len(word) - 1)
- and word[-1] not in ("w", "x", "y")
+ len(word) >= 3 and
+ self._is_consonant(word, len(word) - 3) and
+ not self._is_consonant(word, len(word) - 2) and
+ self._is_consonant(word, len(word) - 1) and
+ word[-1] not in ('w', 'x', 'y')
) or (
- self.mode == self.NLTK_EXTENSIONS
- and len(word) == 2
- and not self._is_consonant(word, 0)
- and self._is_consonant(word, 1)
+ self.mode == self.NLTK_EXTENSIONS and
+ len(word) == 2 and
+ not self._is_consonant(word, 0) and
+ self._is_consonant(word, 1)
)
-
+
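# e.g. stems ending in -wil or -hop satisfy *o, while "snow", "box" and
# "tray" do not, since a final w, x or y is excluded:
# >>> p = PorterStemmer()
# >>> [p._ends_cvc(w) for w in ("wil", "hop", "snow", "box", "tray")]
# [True, True, False, False, False]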
def _replace_suffix(self, word, suffix, replacement):
"""Replaces `suffix` of `word` with `replacement"""
assert word.endswith(suffix), "Given word doesn't end with given suffix"
- if suffix == "":
+ if suffix == '':
return word + replacement
else:
- return word[: -len(suffix)] + replacement
-
+ return word[:-len(suffix)] + replacement
+
def _apply_rule_list(self, word, rules):
"""Applies the first applicable suffix-removal rule to the word
-
+
Takes a word and a list of suffix-removal rules represented as
3-tuples, with the first element being the suffix to remove,
the second element being the string to replace it with, and the
"""
for rule in rules:
suffix, replacement, condition = rule
- if suffix == "*d" and self._ends_double_consonant(word):
+ if suffix == '*d' and self._ends_double_consonant(word):
stem = word[:-2]
if condition is None or condition(stem):
return stem + replacement
# Don't try any further rules
return word
if word.endswith(suffix):
- stem = self._replace_suffix(word, suffix, "")
+ stem = self._replace_suffix(word, suffix, '')
if condition is None or condition(stem):
return stem + replacement
else:
# Don't try any further rules
return word
-
+
return word
-
+
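# e.g. with a single Step 3 rule (sketch):
# >>> p = PorterStemmer()
# >>> p._apply_rule_list("goodness", [("ness", "", p._has_positive_measure)])
# 'good'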
def _step1a(self, word):
"""Implements Step 1a from "An algorithm for suffix stripping"
-
+
From the paper:
-
+
SSES -> SS caresses -> caress
IES -> I ponies -> poni
ties -> ti
# this NLTK-only rule extends the original algorithm, so
# that 'flies'->'fli' but 'dies'->'die' etc
if self.mode == self.NLTK_EXTENSIONS:
- if word.endswith("ies") and len(word) == 4:
- return self._replace_suffix(word, "ies", "ie")
-
- return self._apply_rule_list(
- word,
- [
- ("sses", "ss", None), # SSES -> SS
- ("ies", "i", None), # IES -> I
- ("ss", "ss", None), # SS -> SS
- ("s", "", None), # S ->
- ],
- )
-
+ if word.endswith('ies') and len(word) == 4:
+ return self._replace_suffix(word, 'ies', 'ie')
+
+ return self._apply_rule_list(word, [
+ ('sses', 'ss', None), # SSES -> SS
+ ('ies', 'i', None), # IES -> I
+ ('ss', 'ss', None), # SS -> SS
+ ('s', '', None), # S ->
+ ])
+
def _step1b(self, word):
"""Implements Step 1b from "An algorithm for suffix stripping"
-
+
From the paper:
-
+
(m>0) EED -> EE feed -> feed
agreed -> agree
(*v*) ED -> plastered -> plaster
bled -> bled
(*v*) ING -> motoring -> motor
sing -> sing
-
+
If the second or third of the rules in Step 1b is successful,
the following is done:
# this NLTK-only block extends the original algorithm, so that
# 'spied'->'spi' but 'died'->'die' etc
if self.mode == self.NLTK_EXTENSIONS:
- if word.endswith("ied"):
+ if word.endswith('ied'):
if len(word) == 4:
- return self._replace_suffix(word, "ied", "ie")
+ return self._replace_suffix(word, 'ied', 'ie')
else:
- return self._replace_suffix(word, "ied", "i")
-
+ return self._replace_suffix(word, 'ied', 'i')
+
# (m>0) EED -> EE
- if word.endswith("eed"):
- stem = self._replace_suffix(word, "eed", "")
+ if word.endswith('eed'):
+ stem = self._replace_suffix(word, 'eed', '')
if self._measure(stem) > 0:
- return stem + "ee"
+ return stem + 'ee'
else:
return word
-
+
rule_2_or_3_succeeded = False
-
- for suffix in ["ed", "ing"]:
+
+ for suffix in ['ed', 'ing']:
if word.endswith(suffix):
- intermediate_stem = self._replace_suffix(word, suffix, "")
+ intermediate_stem = self._replace_suffix(word, suffix, '')
if self._contains_vowel(intermediate_stem):
rule_2_or_3_succeeded = True
break
-
+
if not rule_2_or_3_succeeded:
return word
- return self._apply_rule_list(
- intermediate_stem,
- [
- ("at", "ate", None), # AT -> ATE
- ("bl", "ble", None), # BL -> BLE
- ("iz", "ize", None), # IZ -> IZE
- # (*d and not (*L or *S or *Z))
- # -> single letter
- (
- "*d",
- intermediate_stem[-1],
- lambda stem: intermediate_stem[-1] not in ("l", "s", "z"),
- ),
- # (m=1 and *o) -> E
- (
- "",
- "e",
- lambda stem: (self._measure(stem) == 1 and self._ends_cvc(stem)),
- ),
- ],
- )
-
+ return self._apply_rule_list(intermediate_stem, [
+ ('at', 'ate', None), # AT -> ATE
+ ('bl', 'ble', None), # BL -> BLE
+ ('iz', 'ize', None), # IZ -> IZE
+ # (*d and not (*L or *S or *Z))
+ # -> single letter
+ (
+ '*d',
+ intermediate_stem[-1],
+ lambda stem: intermediate_stem[-1] not in ('l', 's', 'z')
+ ),
+ # (m=1 and *o) -> E
+ (
+ '',
+ 'e',
+ lambda stem: (self._measure(stem) == 1 and
+ self._ends_cvc(stem))
+ ),
+ ])
+
def _step1c(self, word):
"""Implements Step 1c from "An algorithm for suffix stripping"
-
+
From the paper:
-
+
Step 1c
(*v*) Y -> I happy -> happi
sky -> sky
"""
-
def nltk_condition(stem):
"""
This has been modified from the original Porter algorithm so
conflate with 'spied', 'tried', 'flies' ...
"""
return len(stem) > 1 and self._is_consonant(stem, len(stem) - 1)
-
+
def original_condition(stem):
return self._contains_vowel(stem)
-
- return self._apply_rule_list(
- word,
- [
- (
- "y",
- "i",
- nltk_condition
- if self.mode == self.NLTK_EXTENSIONS
- else original_condition,
- )
- ],
- )
+
+ return self._apply_rule_list(word, [
+ (
+ 'y',
+ 'i',
+ nltk_condition if self.mode == self.NLTK_EXTENSIONS
+ else original_condition
+ )
+ ])
def _step2(self, word):
"""Implements Step 2 from "An algorithm for suffix stripping"
-
+
From the paper:
-
+
Step 2
(m>0) ATIONAL -> ATE relational -> relate
# Instead of applying the ALLI -> AL rule after '(a)bli' per
# the published algorithm, instead we apply it first, and,
# if it succeeds, run the result through step2 again.
- if word.endswith("alli") and self._has_positive_measure(
- self._replace_suffix(word, "alli", "")
+ if (
+ word.endswith('alli') and
+ self._has_positive_measure(
+ self._replace_suffix(word, 'alli', '')
+ )
):
- return self._step2(self._replace_suffix(word, "alli", "al"))
-
- bli_rule = ("bli", "ble", self._has_positive_measure)
- abli_rule = ("abli", "able", self._has_positive_measure)
-
+ return self._step2(
+ self._replace_suffix(word, 'alli', 'al')
+ )
+
+ bli_rule = ('bli', 'ble', self._has_positive_measure)
+ abli_rule = ('abli', 'able', self._has_positive_measure)
+
rules = [
- ("ational", "ate", self._has_positive_measure),
- ("tional", "tion", self._has_positive_measure),
- ("enci", "ence", self._has_positive_measure),
- ("anci", "ance", self._has_positive_measure),
- ("izer", "ize", self._has_positive_measure),
+ ('ational', 'ate', self._has_positive_measure),
+ ('tional', 'tion', self._has_positive_measure),
+ ('enci', 'ence', self._has_positive_measure),
+ ('anci', 'ance', self._has_positive_measure),
+ ('izer', 'ize', self._has_positive_measure),
+
abli_rule if self.mode == self.ORIGINAL_ALGORITHM else bli_rule,
- ("alli", "al", self._has_positive_measure),
- ("entli", "ent", self._has_positive_measure),
- ("eli", "e", self._has_positive_measure),
- ("ousli", "ous", self._has_positive_measure),
- ("ization", "ize", self._has_positive_measure),
- ("ation", "ate", self._has_positive_measure),
- ("ator", "ate", self._has_positive_measure),
- ("alism", "al", self._has_positive_measure),
- ("iveness", "ive", self._has_positive_measure),
- ("fulness", "ful", self._has_positive_measure),
- ("ousness", "ous", self._has_positive_measure),
- ("aliti", "al", self._has_positive_measure),
- ("iviti", "ive", self._has_positive_measure),
- ("biliti", "ble", self._has_positive_measure),
+
+ ('alli', 'al', self._has_positive_measure),
+ ('entli', 'ent', self._has_positive_measure),
+ ('eli', 'e', self._has_positive_measure),
+ ('ousli', 'ous', self._has_positive_measure),
+ ('ization', 'ize', self._has_positive_measure),
+ ('ation', 'ate', self._has_positive_measure),
+ ('ator', 'ate', self._has_positive_measure),
+ ('alism', 'al', self._has_positive_measure),
+ ('iveness', 'ive', self._has_positive_measure),
+ ('fulness', 'ful', self._has_positive_measure),
+ ('ousness', 'ous', self._has_positive_measure),
+ ('aliti', 'al', self._has_positive_measure),
+ ('iviti', 'ive', self._has_positive_measure),
+ ('biliti', 'ble', self._has_positive_measure),
]
-
+
if self.mode == self.NLTK_EXTENSIONS:
- rules.append(("fulli", "ful", self._has_positive_measure))
-
+ rules.append(
+ ('fulli', 'ful', self._has_positive_measure)
+ )
+
# The 'l' of the 'logi' -> 'log' rule is put with the stem,
# so that short stems like 'geo' 'theo' etc work like
# 'archaeo' 'philo' etc.
- rules.append(
- ("logi", "log", lambda stem: self._has_positive_measure(word[:-3]))
- )
+ rules.append((
+ "logi",
+ "log",
+ lambda stem: self._has_positive_measure(word[:-3])
+ ))
if self.mode == self.MARTIN_EXTENSIONS:
- rules.append(("logi", "log", self._has_positive_measure))
-
+ rules.append(
+ ("logi", "log", self._has_positive_measure)
+ )
+
return self._apply_rule_list(word, rules)
def _step3(self, word):
"""Implements Step 3 from "An algorithm for suffix stripping"
-
+
From the paper:
-
+
Step 3
(m>0) ICATE -> IC triplicate -> triplic
(m>0) FUL -> hopeful -> hope
(m>0) NESS -> goodness -> good
"""
- return self._apply_rule_list(
- word,
- [
- ("icate", "ic", self._has_positive_measure),
- ("ative", "", self._has_positive_measure),
- ("alize", "al", self._has_positive_measure),
- ("iciti", "ic", self._has_positive_measure),
- ("ical", "ic", self._has_positive_measure),
- ("ful", "", self._has_positive_measure),
- ("ness", "", self._has_positive_measure),
- ],
- )
+ return self._apply_rule_list(word, [
+ ('icate', 'ic', self._has_positive_measure),
+ ('ative', '', self._has_positive_measure),
+ ('alize', 'al', self._has_positive_measure),
+ ('iciti', 'ic', self._has_positive_measure),
+ ('ical', 'ic', self._has_positive_measure),
+ ('ful', '', self._has_positive_measure),
+ ('ness', '', self._has_positive_measure),
+ ])
def _step4(self, word):
"""Implements Step 4 from "An algorithm for suffix stripping"
-
+
Step 4
(m>1) AL -> revival -> reviv
tidying up.
"""
measure_gt_1 = lambda stem: self._measure(stem) > 1
-
- return self._apply_rule_list(
- word,
- [
- ("al", "", measure_gt_1),
- ("ance", "", measure_gt_1),
- ("ence", "", measure_gt_1),
- ("er", "", measure_gt_1),
- ("ic", "", measure_gt_1),
- ("able", "", measure_gt_1),
- ("ible", "", measure_gt_1),
- ("ant", "", measure_gt_1),
- ("ement", "", measure_gt_1),
- ("ment", "", measure_gt_1),
- ("ent", "", measure_gt_1),
- # (m>1 and (*S or *T)) ION ->
- (
- "ion",
- "",
- lambda stem: self._measure(stem) > 1 and stem[-1] in ("s", "t"),
- ),
- ("ou", "", measure_gt_1),
- ("ism", "", measure_gt_1),
- ("ate", "", measure_gt_1),
- ("iti", "", measure_gt_1),
- ("ous", "", measure_gt_1),
- ("ive", "", measure_gt_1),
- ("ize", "", measure_gt_1),
- ],
- )
-
+
+ return self._apply_rule_list(word, [
+ ('al', '', measure_gt_1),
+ ('ance', '', measure_gt_1),
+ ('ence', '', measure_gt_1),
+ ('er', '', measure_gt_1),
+ ('ic', '', measure_gt_1),
+ ('able', '', measure_gt_1),
+ ('ible', '', measure_gt_1),
+ ('ant', '', measure_gt_1),
+ ('ement', '', measure_gt_1),
+ ('ment', '', measure_gt_1),
+ ('ent', '', measure_gt_1),
+
+ # (m>1 and (*S or *T)) ION ->
+ (
+ 'ion',
+ '',
+ lambda stem: self._measure(stem) > 1 and stem[-1] in ('s', 't')
+ ),
+
+ ('ou', '', measure_gt_1),
+ ('ism', '', measure_gt_1),
+ ('ate', '', measure_gt_1),
+ ('iti', '', measure_gt_1),
+ ('ous', '', measure_gt_1),
+ ('ive', '', measure_gt_1),
+ ('ize', '', measure_gt_1),
+ ])
+
def _step5a(self, word):
"""Implements Step 5a from "An algorithm for suffix stripping"
-
+
From the paper:
-
+
Step 5a
(m>1) E -> probate -> probat
# no explicit mention of the inconsistency; you have to infer it
# from the examples.
# For this reason, we can't use _apply_rule_list here.
- if word.endswith("e"):
- stem = self._replace_suffix(word, "e", "")
+ if word.endswith('e'):
+ stem = self._replace_suffix(word, 'e', '')
if self._measure(stem) > 1:
return stem
if self._measure(stem) == 1 and not self._ends_cvc(stem):
def _step5b(self, word):
"""Implements Step 5a from "An algorithm for suffix stripping"
-
+
From the paper:
-
+
Step 5b
(m > 1 and *d and *L) -> single letter
controll -> control
roll -> roll
"""
- return self._apply_rule_list(
- word, [("ll", "l", lambda stem: self._measure(word[:-1]) > 1)]
- )
+ return self._apply_rule_list(word, [
+ ('ll', 'l', lambda stem: self._measure(word[:-1]) > 1)
+ ])
def stem(self, word):
stem = word.lower()
-
+
if self.mode == self.NLTK_EXTENSIONS and word in self.pool:
return self.pool[word]
stem = self._step4(stem)
stem = self._step5a(stem)
stem = self._step5b(stem)
-
+
return stem
def __repr__(self):
- return "<PorterStemmer>"
-
+ return '<PorterStemmer>'
def demo():
"""
stemmed.append(stemmer.stem(word))
# Convert the results to a string, and word-wrap them.
- results = " ".join(stemmed)
- results = re.sub(r"(.{,70})\s", r"\1\n", results + " ").rstrip()
+ results = ' '.join(stemmed)
+ results = re.sub(r"(.{,70})\s", r'\1\n', results+' ').rstrip()
# Convert the original to a string, and word wrap it.
- original = " ".join(orig)
- original = re.sub(r"(.{,70})\s", r"\1\n", original + " ").rstrip()
+ original = ' '.join(orig)
+ original = re.sub(r"(.{,70})\s", r'\1\n', original+' ').rstrip()
# Print the results.
- print("-Original-".center(70).replace(" ", "*").replace("-", " "))
+ print('-Original-'.center(70).replace(' ', '*').replace('-', ' '))
print(original)
- print("-Results-".center(70).replace(" ", "*").replace("-", " "))
+ print('-Results-'.center(70).replace(' ', '*').replace('-', ' '))
print(results)
- print("*" * 70)
+ print('*'*70)
# Natural Language Toolkit: Stemmers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import unicode_literals
import re
from nltk.stem.api import StemmerI
+from nltk.compat import python_2_unicode_compatible
-
+@python_2_unicode_compatible
class RegexpStemmer(StemmerI):
"""
A stemmer that uses regular expressions to identify morphological
:type min: int
:param min: The minimum length of string to stem
"""
-
def __init__(self, regexp, min=0):
- if not hasattr(regexp, "pattern"):
+ if not hasattr(regexp, 'pattern'):
regexp = re.compile(regexp)
self._regexp = regexp
self._min = min
if len(word) < self._min:
return word
else:
- return self._regexp.sub("", word)
+ return self._regexp.sub('', word)
def __repr__(self):
- return "<RegexpStemmer: {!r}>".format(self._regexp.pattern)
+ return '<RegexpStemmer: {!r}>'.format(self._regexp.pattern)
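# Usage sketch (mirrors the class docstring's doctest):
# >>> st = RegexpStemmer('ing$|s$|e$', min=4)
# >>> [st.stem(w) for w in ('cars', 'mass', 'was', 'compute')]
# ['car', 'mas', 'was', 'comput']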
+
+
+
+
# Natural Language Toolkit: RSLP Stemmer
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Tiago Tresoldi <tresoldi@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
# comment, including on the development of a different and/or better
# stemmer for Portuguese. I also suggest using the NLTK discussion list
# for Portuguese for any debate.
-
+from __future__ import print_function, unicode_literals
from nltk.data import load
from nltk.stem.api import StemmerI
-
class RSLPStemmer(StemmerI):
"""
A stemmer for Portuguese.
uma cas de port e janel , em cim dum coxilh .
"""
- def __init__(self):
+ def __init__ (self):
self._model = []
- self._model.append(self.read_rule("step0.pt"))
- self._model.append(self.read_rule("step1.pt"))
- self._model.append(self.read_rule("step2.pt"))
- self._model.append(self.read_rule("step3.pt"))
- self._model.append(self.read_rule("step4.pt"))
- self._model.append(self.read_rule("step5.pt"))
- self._model.append(self.read_rule("step6.pt"))
+ self._model.append( self.read_rule("step0.pt") )
+ self._model.append( self.read_rule("step1.pt") )
+ self._model.append( self.read_rule("step2.pt") )
+ self._model.append( self.read_rule("step3.pt") )
+ self._model.append( self.read_rule("step4.pt") )
+ self._model.append( self.read_rule("step5.pt") )
+ self._model.append( self.read_rule("step6.pt") )
- def read_rule(self, filename):
- rules = load("nltk:stemmers/rslp/" + filename, format="raw").decode("utf8")
+ def read_rule (self, filename):
+ rules = load('nltk:stemmers/rslp/' + filename, format='raw').decode("utf8")
lines = rules.split("\n")
- lines = [line for line in lines if line != ""] # remove blank lines
+ lines = [line for line in lines if line != ""] # remove blank lines
lines = [line for line in lines if line[0] != "#"] # remove comments
# NOTE: a simple but ugly hack to make this parser happy with double '\t's
tokens = line.split("\t")
# text to be searched for at the end of the string
- rule.append(tokens[0][1:-1]) # remove quotes
+ rule.append( tokens[0][1:-1] ) # remove quotes
# minimum stem size to perform the replacement
- rule.append(int(tokens[1]))
+ rule.append( int(tokens[1]) )
# text to be replaced into
- rule.append(tokens[2][1:-1]) # remove quotes
+ rule.append( tokens[2][1:-1] ) # remove quotes
# exceptions to this rule
- rule.append([token[1:-1] for token in tokens[3].split(",")])
+ rule.append( [token[1:-1] for token in tokens[3].split(",")] )
# append to the results
rules.append(rule)
rules = self._model[rule_index]
for rule in rules:
suffix_length = len(rule[0])
- if word[-suffix_length:] == rule[0]: # if suffix matches
- if len(word) >= suffix_length + rule[1]: # if we have minimum size
- if word not in rule[3]: # if not an exception
+ if word[-suffix_length:] == rule[0]: # if suffix matches
+ if len(word) >= suffix_length + rule[1]: # if we have minimum size
+ if word not in rule[3]: # if not an exception
word = word[:-suffix_length] + rule[2]
break
return word
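# Illustrative application of one parsed rule (field values hypothetical):
# rule = ["inho", 3, "", []]   # suffix, min stem size, replacement, exceptions
# "livrinho" ends with "inho", len("livrinho") >= 4 + 3, and it is not
# listed as an exception, so it reduces to "livr".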
+
+
+
#
# Natural Language Toolkit: Snowball Stemmer
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Peter Michael Stahl <pemistahl@gmail.com>
# Peter Ljunglof <peter.ljunglof@heatherleaf.se> (revisions)
# Lakhdar Benzahia <lakhdar.benzahia@gmail.com> (co-writer)
There is also a demo function: `snowball.demo()`.
"""
+from __future__ import unicode_literals, print_function
+from six.moves import input
import re
+from nltk import compat
from nltk.corpus import stopwords
from nltk.stem import porter
from nltk.stem.util import suffix_replace, prefix_replace
language, a ValueError is raised.
"""
- languages = (
- "arabic",
- "danish",
- "dutch",
- "english",
- "finnish",
- "french",
- "german",
- "hungarian",
- "italian",
- "norwegian",
- "porter",
- "portuguese",
- "romanian",
- "russian",
- "spanish",
- "swedish",
- )
+ languages = ("arabic", "danish", "dutch", "english", "finnish", "french", "german",
+ "hungarian", "italian", "norwegian", "porter", "portuguese",
+ "romanian", "russian", "spanish", "swedish")
def __init__(self, language, ignore_stopwords=False):
if language not in self.languages:
self.stemmer = stemmerclass(ignore_stopwords)
self.stem = self.stemmer.stem
self.stopwords = self.stemmer.stopwords
-
+
def stem(self, token):
return self.stemmer.stem(self, token)
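# Usage sketch:
# >>> SnowballStemmer("english").stem("running")
# 'run'
# >>> SnowballStemmer("german").stem("Autobahnen")
# 'autobahn'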
+@compat.python_2_unicode_compatible
class _LanguageSpecificStemmer(StemmerI):
"""
for word in stopwords.words(language):
self.stopwords.add(word)
except IOError:
- raise ValueError(
- "{!r} has no list of stopwords. Please set"
- " 'ignore_stopwords' to 'False'.".format(self)
- )
+ raise ValueError("{!r} has no list of stopwords. Please set"
+ " 'ignore_stopwords' to 'False'.".format(self))
def __repr__(self):
"""
nltk.stem.porter for more information.
"""
-
def __init__(self, ignore_stopwords=False):
_LanguageSpecificStemmer.__init__(self, ignore_stopwords)
porter.PorterStemmer.__init__(self)
"""
r1 = ""
for i in range(1, len(word)):
- if word[i] not in vowels and word[i - 1] in vowels:
- if 3 > len(word[: i + 1]) > 0:
+ if word[i] not in vowels and word[i-1] in vowels:
+ if len(word[:i+1]) < 3 and len(word[:i+1]) > 0:
r1 = word[3:]
- elif len(word[: i + 1]) >= 3:
- r1 = word[i + 1 :]
+ elif len(word[:i+1]) >= 3:
+ r1 = word[i+1:]
else:
return word
break
r1 = ""
r2 = ""
for i in range(1, len(word)):
- if word[i] not in vowels and word[i - 1] in vowels:
- r1 = word[i + 1 :]
+ if word[i] not in vowels and word[i-1] in vowels:
+ r1 = word[i+1:]
break
for i in range(1, len(r1)):
- if r1[i] not in vowels and r1[i - 1] in vowels:
- r2 = r1[i + 1 :]
+ if r1[i] not in vowels and r1[i-1] in vowels:
+ r2 = r1[i+1:]
break
return (r1, r2)
+
+
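# e.g. for "beautiful" with vowels "aeiouy", this yields R1 = "iful" and
# R2 = "ul", matching the standard Snowball definition of the regions.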
def _rv_standard(self, word, vowels):
"""
Return the standard interpretation of the string region RV.
if word[1] not in vowels:
for i in range(2, len(word)):
if word[i] in vowels:
- rv = word[i + 1 :]
+ rv = word[i+1:]
break
elif word[0] in vowels and word[1] in vowels:
for i in range(2, len(word)):
if word[i] not in vowels:
- rv = word[i + 1 :]
+ rv = word[i+1:]
break
else:
rv = word[3:]
return rv
-
-class ArabicStemmer(_StandardStemmer):
+class ArabicStemmer(_LanguageSpecificStemmer):
"""
https://github.com/snowballstem/snowball/blob/master/algorithms/arabic/stem_Unicode.sbl (Original Algorithm)
The Snowball Arabic light Stemmer
Lakhdar Benzahia
Nltk Version Author : Lakhdar Benzahia
"""
-
# Normalize_pre steps
- __vocalization = re.compile(
- r"[\u064b-\u064c-\u064d-\u064e-\u064f-\u0650-\u0651-\u0652]"
- ) # ً، ٌ، ٍ، َ، ُ، ِ، ّ، ْ
+ __vocalization = re.compile(r'[\u064b-\u064c-\u064d-\u064e-\u064f-\u0650-\u0651-\u0652]') # ً، ٌ، ٍ، َ، ُ، ِ، ّ، ْ
- __kasheeda = re.compile(r"[\u0640]") # ـ tatweel/kasheeda
+ __kasheeda = re.compile(r'[\u0640]') # ـ tatweel/kasheeda
- __arabic_punctuation_marks = re.compile(r"[\u060C-\u061B-\u061F]") # ؛ ، ؟
+ __arabic_punctuation_marks = re.compile(r'[\u060C-\u061B-\u061F]') # ؛ ، ؟
# Normalize_post
- __last_hamzat = ("\u0623", "\u0625", "\u0622", "\u0624", "\u0626") # أ، إ، آ، ؤ، ئ
+ __last_hamzat = ('\u0623', '\u0625', '\u0622', '\u0624', '\u0626') # أ، إ، آ، ؤ، ئ
# normalize other hamza's
- __initial_hamzat = re.compile(r"^[\u0622\u0623\u0625]") # أ، إ، آ
+ __initial_hamzat = re.compile(r'^[\u0622\u0623\u0625]') # أ، إ، آ
- __waw_hamza = re.compile(r"[\u0624]") # ؤ
+ __waw_hamza = re.compile(r'[\u0624]') # ؤ
- __yeh_hamza = re.compile(r"[\u0626]") # ئ
+ __yeh_hamza = re.compile(r'[\u0626]') # ئ
- __alefat = re.compile(r"[\u0623\u0622\u0625]") # أ، إ، آ
+ __alefat = re.compile(r'[\u0623\u0622\u0625]') # أ، إ، آ
# Checks
- __checks1 = (
- "\u0643\u0627\u0644",
- "\u0628\u0627\u0644", # بال، كال
- "\u0627\u0644",
- "\u0644\u0644", # لل، ال
- )
+ __checks1 = ('\u0643\u0627\u0644', '\u0628\u0627\u0644', # بال، كال
+ '\u0627\u0644', '\u0644\u0644' # لل، ال
+ )
- __checks2 = ("\u0629", "\u0627\u062a") # ة # female plural ات
+ __checks2 = ('\u0629', # ة
+ '\u0627\u062a' # female plural ات
+ )
# Suffixes
- __suffix_noun_step1a = (
- "\u064a",
- "\u0643",
- "\u0647", # ي، ك، ه
- "\u0646\u0627",
- "\u0643\u0645",
- "\u0647\u0627",
- "\u0647\u0646",
- "\u0647\u0645", # نا، كم، ها، هن، هم
- "\u0643\u0645\u0627",
- "\u0647\u0645\u0627", # كما، هما
- )
-
- __suffix_noun_step1b = "\u0646" # ن
-
- __suffix_noun_step2a = ("\u0627", "\u064a", "\u0648") # ا، ي، و
-
- __suffix_noun_step2b = "\u0627\u062a" # ات
-
- __suffix_noun_step2c1 = "\u062a" # ت
-
- __suffix_noun_step2c2 = "\u0629" # ة
-
- __suffix_noun_step3 = "\u064a" # ي
-
- __suffix_verb_step1 = (
- "\u0647",
- "\u0643", # ه، ك
- "\u0646\u064a",
- "\u0646\u0627",
- "\u0647\u0627",
- "\u0647\u0645", # ني، نا، ها، هم
- "\u0647\u0646",
- "\u0643\u0645",
- "\u0643\u0646", # هن، كم، كن
- "\u0647\u0645\u0627",
- "\u0643\u0645\u0627",
- "\u0643\u0645\u0648", # هما، كما، كمو
- )
-
- __suffix_verb_step2a = (
- "\u062a",
- "\u0627",
- "\u0646",
- "\u064a", # ت، ا، ن، ي
- "\u0646\u0627",
- "\u062a\u0627",
- "\u062a\u0646", # نا، تا، تن Past
- "\u0627\u0646",
- "\u0648\u0646",
- "\u064a\u0646", # ان، هن، ين Present
- "\u062a\u0645\u0627", # تما
- )
-
- __suffix_verb_step2b = ("\u0648\u0627", "\u062a\u0645") # وا، تم
-
- __suffix_verb_step2c = ("\u0648", "\u062a\u0645\u0648") # و # تمو
-
- __suffix_all_alef_maqsura = "\u0649" # ى
+ __suffix_noun_step1a = ('\u064a', '\u0643', '\u0647', # ي، ك، ه
+ '\u0646\u0627', '\u0643\u0645', '\u0647\u0627', '\u0647\u0646', '\u0647\u0645', # نا، كم، ها، هن، هم
+ '\u0643\u0645\u0627', '\u0647\u0645\u0627' # كما، هما
+ )
+
+ __suffix_noun_step1b = ('\u0646') # ن
+
+ __suffix_noun_step2a = ('\u0627', '\u064a', '\u0648') # ا، ي، و
+
+ __suffix_noun_step2b = ('\u0627\u062a') # ات
+
+ __suffix_noun_step2c1 = ('\u062a') # ت
+
+ __suffix_noun_step2c2 = ('\u0629') # ة
+
+ __suffix_noun_step3 = ('\u064a') # ي
+
+ __suffix_verb_step1 = ('\u0647', '\u0643', # ه، ك
+ '\u0646\u064a', '\u0646\u0627', '\u0647\u0627', '\u0647\u0645', # ني، نا، ها، هم
+ '\u0647\u0646', '\u0643\u0645', '\u0643\u0646', # هن، كم، كن
+ '\u0647\u0645\u0627', '\u0643\u0645\u0627', '\u0643\u0645\u0648' # هما، كما، كمو
+ )
+
+ __suffix_verb_step2a = ( '\u062a', '\u0627', '\u0646' , '\u064a', # ت، ا، ن، ي
+ '\u0646\u0627', '\u062a\u0627', '\u062a\u0646', # نا، تا، تن Past
+ '\u0627\u0646', '\u0648\u0646', '\u064a\u0646', # ان، هن، ين Present
+ '\u062a\u0645\u0627' # تما
+ )
+
+ __suffix_verb_step2b = ('\u0648\u0627','\u062a\u0645') # وا، تم
+
+ __suffix_verb_step2c = ('\u0648', # و
+ '\u062a\u0645\u0648' # تمو
+ )
+
+ __suffix_all_alef_maqsura = ('\u0649') # ى
# Prefixes
- __prefix_step1 = (
- "\u0623", # أ
- "\u0623\u0623",
- "\u0623\u0622",
- "\u0623\u0624",
- "\u0623\u0627",
- "\u0623\u0625", # أأ، أآ، أؤ، أا، أإ
- )
-
- __prefix_step2a = ("\u0641\u0627\u0644", "\u0648\u0627\u0644") # فال، وال
-
- __prefix_step2b = ("\u0641", "\u0648") # ف، و
-
- __prefix_step3a_noun = (
- "\u0627\u0644",
- "\u0644\u0644", # لل، ال
- "\u0643\u0627\u0644",
- "\u0628\u0627\u0644", # بال، كال
- )
-
- __prefix_step3b_noun = (
- "\u0628",
- "\u0643",
- "\u0644", # ب، ك، ل
- "\u0628\u0628",
- "\u0643\u0643", # بب، كك
- )
-
- __prefix_step3_verb = (
- "\u0633\u064a",
- "\u0633\u062a",
- "\u0633\u0646",
- "\u0633\u0623",
- ) # سي، ست، سن، سأ
-
- __prefix_step4_verb = (
- "\u064a\u0633\u062a",
- "\u0646\u0633\u062a",
- "\u062a\u0633\u062a",
- ) # يست، نست، تست
+ __prefix_step1 = ('\u0623', # أ
+ '\u0623\u0623', '\u0623\u0622', '\u0623\u0624', '\u0623\u0627', '\u0623\u0625', # أأ، أآ، أؤ، أا، أإ
+ )
+
+ __prefix_step2a = ('\u0641\u0627\u0644', '\u0648\u0627\u0644') # فال، وال
+
+ __prefix_step2b = ('\u0641', '\u0648') # ف، و
+
+ __prefix_step3a_noun = ('\u0627\u0644', '\u0644\u0644', # لل، ال
+ '\u0643\u0627\u0644', '\u0628\u0627\u0644', # بال، كال
+ )
+
+ __prefix_step3b_noun = ('\u0628', '\u0643', '\u0644', # ب، ك، ل
+ '\u0628\u0628', '\u0643\u0643' # بب، كك
+ )
+
+ __prefix_step3_verb = ('\u0633\u064a', '\u0633\u062a', '\u0633\u0646', '\u0633\u0623') # سي، ست، سن، سأ
+
+ __prefix_step4_verb = ('\u064a\u0633\u062a', '\u0646\u0633\u062a', '\u062a\u0633\u062a') # يست، نست، تست
# Suffixes added due to Conjugation Verbs
- __conjugation_suffix_verb_1 = ("\u0647", "\u0643") # ه، ك
-
- __conjugation_suffix_verb_2 = (
- "\u0646\u064a",
- "\u0646\u0627",
- "\u0647\u0627", # ني، نا، ها
- "\u0647\u0645",
- "\u0647\u0646",
- "\u0643\u0645", # هم، هن، كم
- "\u0643\u0646", # كن
- )
- __conjugation_suffix_verb_3 = (
- "\u0647\u0645\u0627",
- "\u0643\u0645\u0627",
- "\u0643\u0645\u0648",
- ) # هما، كما، كمو
-
- __conjugation_suffix_verb_4 = ("\u0627", "\u0646", "\u064a") # ا، ن، ي
-
- __conjugation_suffix_verb_past = (
- "\u0646\u0627",
- "\u062a\u0627",
- "\u062a\u0646",
- ) # نا، تا، تن
-
- __conjugation_suffix_verb_present = (
- "\u0627\u0646",
- "\u0648\u0646",
- "\u064a\u0646",
- ) # ان، ون، ين
+ __conjugation_suffix_verb_1 = ('\u0647', '\u0643') # ه، ك
+
+ __conjugation_suffix_verb_2 = ('\u0646\u064a', '\u0646\u0627','\u0647\u0627', # ني، نا، ها
+ '\u0647\u0645', '\u0647\u0646', '\u0643\u0645', # هم، هن، كم
+ '\u0643\u0646' # كن
+ )
+ __conjugation_suffix_verb_3 = ('\u0647\u0645\u0627', '\u0643\u0645\u0627', '\u0643\u0645\u0648') # هما، كما، كمو
+
+ __conjugation_suffix_verb_4 = ('\u0627', '\u0646', '\u064a') # ا، ن، ي
+
+ __conjugation_suffix_verb_past = ('\u0646\u0627', '\u062a\u0627', '\u062a\u0646') # نا، تا، تن
+
+ __conjugation_suffix_verb_presnet = ('\u0627\u0646', '\u0648\u0646', '\u064a\u0646') # ان، ون، ين
# Suffixes added due to derivation Names
- __conjugation_suffix_noun_1 = ("\u064a", "\u0643", "\u0647") # ي، ك، ه
+ __conjugation_suffix_noun_1 = ('\u064a', '\u0643', '\u0647') # ي، ك، ه
- __conjugation_suffix_noun_2 = (
- "\u0646\u0627",
- "\u0643\u0645", # نا، كم
- "\u0647\u0627",
- "\u0647\u0646",
- "\u0647\u0645", # ها، هن، هم
- )
+ __conjugation_suffix_noun_2 = ('\u0646\u0627', '\u0643\u0645', # نا، كم
+ '\u0647\u0627', '\u0647\u0646', '\u0647\u0645' # ها، هن، هم
+ )
- __conjugation_suffix_noun_3 = (
- "\u0643\u0645\u0627",
- "\u0647\u0645\u0627",
- ) # كما، هما
+ __conjugation_suffix_noun_3 = ('\u0643\u0645\u0627', '\u0647\u0645\u0627') # كما، هما
# Prefixes added due to derivation Names
- __prefixes1 = ("\u0648\u0627", "\u0641\u0627") # فا، وا
+ __prefixes1 = ('\u0648\u0627', '\u0641\u0627') # فا، وا
- __articles_3len = ("\u0643\u0627\u0644", "\u0628\u0627\u0644") # بال كال
+ __articles_3len = ('\u0643\u0627\u0644', '\u0628\u0627\u0644') # بال كال
- __articles_2len = ("\u0627\u0644", "\u0644\u0644") # ال لل
+ __articles_2len = ('\u0627\u0644', '\u0644\u0644') # ال لل
# Prepositions letters
- __prepositions1 = ("\u0643", "\u0644") # ك، ل
- __prepositions2 = ("\u0628\u0628", "\u0643\u0643") # بب، كك
+ __prepositions1 = ('\u0643', '\u0644') # ك، ل
+ __prepositions2 = ('\u0628\u0628', '\u0643\u0643') # بب، كك
is_verb = True
is_noun = True
:return: normalized token type string
"""
# strip diacritics
- token = self.__vocalization.sub("", token)
- # strip kasheeda
- token = self.__kasheeda.sub("", token)
+ token = self.__vocalization.sub('', token)
+ # strip kasheeda
+ token = self.__kasheeda.sub('', token)
# strip punctuation marks
- token = self.__arabic_punctuation_marks.sub("", token)
+ token = self.__arabic_punctuation_marks.sub('', token)
return token
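# Equivalent sketch with plain re (assuming the same diacritics range;
# output shown with escapes for clarity):
# >>> import re
# >>> re.sub(r'[\u064b-\u0652\u0640]', '', '\u0643\u064e\u062a\u064e\u0628\u064e')  # كَتَبَ
# '\u0643\u062a\u0628'    # كتب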
def __normalize_post(self, token):
# normalize last hamza
for hamza in self.__last_hamzat:
if token.endswith(hamza):
- token = suffix_replace(token, hamza, "\u0621")
+ token = suffix_replace(token, hamza, '\u0621')
break
# normalize other hamzat
- token = self.__initial_hamzat.sub("\u0627", token)
- token = self.__waw_hamza.sub("\u0648", token)
- token = self.__yeh_hamza.sub("\u064a", token)
- token = self.__alefat.sub("\u0627", token)
- return token
+ token = self.__initial_hamzat.sub('\u0627', token)
+ token = self.__waw_hamza.sub('\u0648', token)
+ token = self.__yeh_hamza.sub('\u064a', token)
+ token = self.__alefat.sub('\u0627', token)
+ return token
def __checks_1(self, token):
- for prefix in self.__checks1:
+ for prefix in self.__checks1 :
if token.startswith(prefix):
- if prefix in self.__articles_3len and len(token) > 4:
+ if prefix in self.__articles_3len and len(token) > 4 :
self.is_noun = True
self.is_verb = False
self.is_defined = True
break
- if prefix in self.__articles_2len and len(token) > 3:
+ if prefix in self.__articles_2len and len(token) > 3 :
self.is_noun = True
self.is_verb = False
self.is_defined = True
def __checks_2(self, token):
for suffix in self.__checks2:
if token.endswith(suffix):
- if suffix == "\u0629" and len(token) > 2:
+ if suffix == '\u0629' and len(token) > 2:
self.is_noun = True
self.is_verb = False
break
- if suffix == "\u0627\u062a" and len(token) > 3:
+ if suffix == '\u0627\u062a' and len(token) > 3:
self.is_noun = True
self.is_verb = False
break
def __Suffix_Verb_Step2a(self, token):
for suffix in self.__suffix_verb_step2a:
- if token.endswith(suffix) and len(token) > 3:
- if suffix == "\u062a" and len(token) >= 4:
+ if token.endswith(suffix):
+ if suffix == '\u062a' and len(token) >= 4:
token = token[:-1]
self.suffix_verb_step2a_success = True
break
self.suffix_verb_step2a_success = True
break
- if suffix == "\u062a\u0645\u0627" and len(token) >= 6:
+ if suffix == '\u062a\u0645\u0627' and len(token) >= 6:
token = token[:-3]
self.suffix_verb_step2a_success = True
break
- return token
+ return token
def __Suffix_Verb_Step2c(self, token):
for suffix in self.__suffix_verb_step2c:
if token.endswith(suffix):
- if suffix == "\u062a\u0645\u0648" and len(token) >= 6:
+ if suffix == '\u062a\u0645\u0648' and len(token) >= 6:
token = token[:-3]
break
- if suffix == "\u0648" and len(token) >= 4:
+ if suffix == '\u0648' and len(token) >= 4:
token = token[:-1]
break
return token
token = token[:-2]
self.suffix_verb_step2b_success = True
break
- return token
+ return token
def __Suffix_Noun_Step2c2(self, token):
for suffix in self.__suffix_noun_step2c2:
token = token[:-2]
self.suffix_noun_step2b_success = True
break
- return token
+ return token
def __Suffix_Noun_Step2c1(self, token):
for suffix in self.__suffix_noun_step2c1:
def __Suffix_All_alef_maqsura(self, token):
for suffix in self.__suffix_all_alef_maqsura:
if token.endswith(suffix):
- token = suffix_replace(token, suffix, "\u064a")
- return token
+ token = suffix_replace(token, suffix, '\u064a')
+ return token
def __Prefix_Step1(self, token):
for prefix in self.__prefix_step1:
if token.startswith(prefix) and len(token) > 3:
- if prefix == "\u0623\u0623":
- token = prefix_replace(token, prefix, "\u0623")
+ if prefix == '\u0623\u0623':
+ token = prefix_replace(token, prefix, '\u0623')
break
- elif prefix == "\u0623\u0622":
- token = prefix_replace(token, prefix, "\u0622")
+ elif prefix == '\u0623\u0622':
+ token = prefix_replace(token, prefix, '\u0622')
break
- elif prefix == "\u0623\u0624":
- token = prefix_replace(token, prefix, "\u0624")
+ elif prefix == '\u0623\u0624':
+ token = prefix_replace(token, prefix, '\u0624')
break
- elif prefix == "\u0623\u0627":
- token = prefix_replace(token, prefix, "\u0627")
+ elif prefix == '\u0623\u0627' :
+ token = prefix_replace(token, prefix, '\u0627')
break
- elif prefix == "\u0623\u0625":
- token = prefix_replace(token, prefix, "\u0625")
+ elif prefix == '\u0623\u0625' :
+ token = prefix_replace(token, prefix, '\u0625')
break
return token
def __Prefix_Step2a(self, token):
for prefix in self.__prefix_step2a:
if token.startswith(prefix) and len(token) > 5:
- token = token[len(prefix) :]
+ token = token[len(prefix):]
self.prefix_step2a_success = True
break
- return token
+ return token
def __Prefix_Step2b(self, token):
for prefix in self.__prefix_step2b:
- if token.startswith(prefix) and len(token) > 3:
+ if token.startswith(prefix) and len(token) > 3 :
if token[:2] not in self.__prefixes1:
- token = token[len(prefix) :]
+ token = token[len(prefix):]
break
return token
for prefix in self.__prefix_step3a_noun:
if token.startswith(prefix):
if prefix in self.__articles_2len and len(token) > 4:
- token = token[len(prefix) :]
+ token = token[len(prefix):]
self.prefix_step3a_noun_success = True
break
- if prefix in self.__articles_3len and len(token) > 5:
- token = token[len(prefix) :]
+ if prefix in self.__articles_3len and len(token) > 5:
+ token = token[len(prefix):]
break
return token
for prefix in self.__prefix_step3b_noun:
if token.startswith(prefix):
if len(token) > 3:
- if prefix == "\u0628":
- token = token[len(prefix) :]
+ if prefix == '\u0628':
+ token = token[len(prefix):]
self.prefix_step3b_noun_success = True
break
break
if prefix in self.__prepositions1 and len(token) > 4:
- token = token[len(prefix) :] # BUG: cause confusion
+ token = token[len(prefix):] # BUG: causes confusion
self.prefix_step3b_noun_success = True
break
return token
def __Prefix_Step4_Verb(self, token):
for prefix in self.__prefix_step4_verb:
if token.startswith(prefix) and len(token) > 4:
- token = prefix_replace(token, prefix, "\u0627\u0633\u062a")
+ token = prefix_replace(token, prefix, '\u0627\u0633\u062a')
self.is_verb = True
self.is_noun = False
break
self.__checks_1(modified_word)
# checks2
self.__checks_2(modified_word)
- # Pre_Normalization
modified_word = self.__normalize_pre(modified_word)
- # Avoid stopwords
- if modified_word in self.stopwords or len(modified_word) <= 2:
- return modified_word
- # Start stemming
if self.is_verb:
modified_word = self.__Suffix_Verb_Step1(modified_word)
- if self.suffixes_verb_step1_success:
+ if self.suffixes_verb_step1_success:
modified_word = self.__Suffix_Verb_Step2a(modified_word)
- if not self.suffix_verb_step2a_success:
+ if not self.suffix_verb_step2a_success :
modified_word = self.__Suffix_Verb_Step2c(modified_word)
- # or next TODO: How to deal with or next instruction
+ # or next
else:
modified_word = self.__Suffix_Verb_Step2b(modified_word)
if not self.suffix_verb_step2b_success:
if not self.suffix_noun_step2c2_success:
if not self.is_defined:
modified_word = self.__Suffix_Noun_Step1a(modified_word)
- # if self.suffix_noun_step1a_success:
+ #if self.suffix_noun_step1a_success:
modified_word = self.__Suffix_Noun_Step2a(modified_word)
if not self.suffix_noun_step2a_success:
- modified_word = self.__Suffix_Noun_Step2b(modified_word)
- if (
- not self.suffix_noun_step2b_success
- and not self.suffix_noun_step2a_success
- ):
+ modified_word = self.__Suffix_Noun_Step2b(modified_word)
+ if not self.suffix_noun_step2b_success and not self.suffix_noun_step2a_success:
modified_word = self.__Suffix_Noun_Step2c1(modified_word)
# or next ? todo : how to deal with or next
else:
- modified_word = self.__Suffix_Noun_Step1b(modified_word)
+ modified_word = self.__Suffix_Noun_Step1b(modified_word)
if self.suffixe_noun_step1b_success:
modified_word = self.__Suffix_Noun_Step2a(modified_word)
if not self.suffix_noun_step2a_success:
modified_word = self.__Suffix_Noun_Step2b(modified_word)
- if (
- not self.suffix_noun_step2b_success
- and not self.suffix_noun_step2a_success
- ):
+ if not self.suffix_noun_step2b_success and not self.suffix_noun_step2a_success:
modified_word = self.__Suffix_Noun_Step2c1(modified_word)
else:
if not self.is_defined:
stemmed_word = modified_word
return stemmed_word
-
class DanishStemmer(_ScandinavianStemmer):
"""
# The language's vowels and other important characters are defined.
__vowels = "aeiouy\xE6\xE5\xF8"
__consonants = "bcdfghjklmnpqrstvwxz"
- __double_consonants = (
- "bb",
- "cc",
- "dd",
- "ff",
- "gg",
- "hh",
- "jj",
- "kk",
- "ll",
- "mm",
- "nn",
- "pp",
- "qq",
- "rr",
- "ss",
- "tt",
- "vv",
- "ww",
- "xx",
- "zz",
- )
+ __double_consonants = ("bb", "cc", "dd", "ff", "gg", "hh", "jj",
+ "kk", "ll", "mm", "nn", "pp", "qq", "rr",
+ "ss", "tt", "vv", "ww", "xx", "zz")
__s_ending = "abcdfghjklmnoprtvyz\xE5"
# The different suffixes, divided into the algorithm's steps
# and organized by length, are listed in tuples.
- __step1_suffixes = (
- "erendes",
- "erende",
- "hedens",
- "ethed",
- "erede",
- "heden",
- "heder",
- "endes",
- "ernes",
- "erens",
- "erets",
- "ered",
- "ende",
- "erne",
- "eren",
- "erer",
- "heds",
- "enes",
- "eres",
- "eret",
- "hed",
- "ene",
- "ere",
- "ens",
- "ers",
- "ets",
- "en",
- "er",
- "es",
- "et",
- "e",
- "s",
- )
+ __step1_suffixes = ("erendes", "erende", "hedens", "ethed",
+ "erede", "heden", "heder", "endes",
+ "ernes", "erens", "erets", "ered",
+ "ende", "erne", "eren", "erer", "heds",
+ "enes", "eres", "eret", "hed", "ene", "ere",
+ "ens", "ers", "ets", "en", "er", "es", "et",
+ "e", "s")
__step2_suffixes = ("gd", "dt", "gt", "kt")
__step3_suffixes = ("elig", "l\xF8st", "lig", "els", "ig")
word = word[:-1]
r1 = r1[:-1]
else:
- word = word[: -len(suffix)]
- r1 = r1[: -len(suffix)]
+ word = word[:-len(suffix)]
+ r1 = r1[:-len(suffix)]
break
# STEP 2
word = word[:-1]
r1 = r1[:-1]
else:
- word = word[: -len(suffix)]
- r1 = r1[: -len(suffix)]
+ word = word[:-len(suffix)]
+ r1 = r1[:-len(suffix)]
if r1.endswith(self.__step2_suffixes):
word = word[:-1]
word = word[:-1]
break
+
return word
step2_success = False
# Vowel accents are removed.
- word = (
- word.replace("\xE4", "a")
- .replace("\xE1", "a")
- .replace("\xEB", "e")
- .replace("\xE9", "e")
- .replace("\xED", "i")
- .replace("\xEF", "i")
- .replace("\xF6", "o")
- .replace("\xF3", "o")
- .replace("\xFC", "u")
- .replace("\xFA", "u")
- )
+ word = (word.replace("\xE4", "a").replace("\xE1", "a")
+ .replace("\xEB", "e").replace("\xE9", "e")
+ .replace("\xED", "i").replace("\xEF", "i")
+ .replace("\xF6", "o").replace("\xF3", "o")
+ .replace("\xFC", "u").replace("\xFA", "u"))
# An initial 'y', a 'y' after a vowel,
# and an 'i' between self.__vowels are put into upper case.
word = "".join(("Y", word[1:]))
for i in range(1, len(word)):
- if word[i - 1] in self.__vowels and word[i] == "y":
- word = "".join((word[:i], "Y", word[i + 1 :]))
+ if word[i-1] in self.__vowels and word[i] == "y":
+ word = "".join((word[:i], "Y", word[i+1:]))
- for i in range(1, len(word) - 1):
- if (
- word[i - 1] in self.__vowels
- and word[i] == "i"
- and word[i + 1] in self.__vowels
- ):
- word = "".join((word[:i], "I", word[i + 1 :]))
+ for i in range(1, len(word)-1):
+ if (word[i-1] in self.__vowels and word[i] == "i" and
+ word[i+1] in self.__vowels):
+ word = "".join((word[:i], "I", word[i+1:]))
r1, r2 = self._r1r2_standard(word, self.__vowels)
# R1 is adjusted so that the region before it
# contains at least 3 letters.
for i in range(1, len(word)):
- if word[i] not in self.__vowels and word[i - 1] in self.__vowels:
- if 3 > len(word[: i + 1]) > 0:
+ if word[i] not in self.__vowels and word[i-1] in self.__vowels:
+ if len(word[:i+1]) < 3 and len(word[:i+1]) > 0:
r1 = word[3:]
- elif len(word[: i + 1]) == 0:
+ elif len(word[:i+1]) == 0:
return word
break
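# A minimal sketch of the R1 adjustment above (r1_min3 is a hypothetical
# helper; plain R1 is the region after the first non-vowel that follows a
# vowel, and the adjustment keeps at least 3 letters before it):
#
#     def r1_min3(word, vowels):
#         for i in range(1, len(word)):
#             if word[i] not in vowels and word[i-1] in vowels:
#                 return word[max(i+1, 3):]
#         return ""
#
#     r1_min3("akte", "aeiouy")  # plain R1 would be "te"; adjusted -> "e"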
if r2.endswith("heden"):
r2 = suffix_replace(r2, suffix, "heid")
- elif (
- suffix in ("ene", "en")
- and not word.endswith("heden")
- and word[-len(suffix) - 1] not in self.__vowels
- and word[-len(suffix) - 3 : -len(suffix)] != "gem"
- ):
- word = word[: -len(suffix)]
- r1 = r1[: -len(suffix)]
- r2 = r2[: -len(suffix)]
+ elif (suffix in ("ene", "en") and
+ not word.endswith("heden") and
+ word[-len(suffix)-1] not in self.__vowels and
+ word[-len(suffix)-3:-len(suffix)] != "gem"):
+ word = word[:-len(suffix)]
+ r1 = r1[:-len(suffix)]
+ r2 = r2[:-len(suffix)]
if word.endswith(("kk", "dd", "tt")):
word = word[:-1]
r1 = r1[:-1]
r2 = r2[:-1]
- elif (
- suffix in ("se", "s")
- and word[-len(suffix) - 1] not in self.__vowels
- and word[-len(suffix) - 1] != "j"
- ):
- word = word[: -len(suffix)]
- r1 = r1[: -len(suffix)]
- r2 = r2[: -len(suffix)]
+ elif (suffix in ("se", "s") and
+ word[-len(suffix)-1] not in self.__vowels and
+ word[-len(suffix)-1] != "j"):
+ word = word[:-len(suffix)]
+ r1 = r1[:-len(suffix)]
+ r2 = r2[:-len(suffix)]
break
# STEP 2
r1 = r1[:-4]
r2 = r2[:-4]
- if (
- r1.endswith("en")
- and word[-3] not in self.__vowels
- and word[-5:-2] != "gem"
- ):
+ if (r1.endswith("en") and word[-3] not in self.__vowels and
+ word[-5:-2] != "gem"):
word = word[:-2]
r1 = r1[:-2]
r2 = r2[:-2]
# All occurrences of 'I' and 'Y' are put back into lower case.
word = word.replace("I", "i").replace("Y", "y")
+
return word
+
class EnglishStemmer(_StandardStemmer):
"""
"""
__vowels = "aeiouy"
- __double_consonants = ("bb", "dd", "ff", "gg", "mm", "nn", "pp", "rr", "tt")
+ __double_consonants = ("bb", "dd", "ff", "gg", "mm", "nn",
+ "pp", "rr", "tt")
__li_ending = "cdeghkmnrt"
__step0_suffixes = ("'s'", "'s", "'")
__step1a_suffixes = ("sses", "ied", "ies", "us", "ss", "s")
__step1b_suffixes = ("eedly", "ingly", "edly", "eed", "ing", "ed")
- __step2_suffixes = (
- "ization",
- "ational",
- "fulness",
- "ousness",
- "iveness",
- "tional",
- "biliti",
- "lessli",
- "entli",
- "ation",
- "alism",
- "aliti",
- "ousli",
- "iviti",
- "fulli",
- "enci",
- "anci",
- "abli",
- "izer",
- "ator",
- "alli",
- "bli",
- "ogi",
- "li",
- )
- __step3_suffixes = (
- "ational",
- "tional",
- "alize",
- "icate",
- "iciti",
- "ative",
- "ical",
- "ness",
- "ful",
- )
- __step4_suffixes = (
- "ement",
- "ance",
- "ence",
- "able",
- "ible",
- "ment",
- "ant",
- "ent",
- "ism",
- "ate",
- "iti",
- "ous",
- "ive",
- "ize",
- "ion",
- "al",
- "er",
- "ic",
- )
+ __step2_suffixes = ('ization', 'ational', 'fulness', 'ousness',
+ 'iveness', 'tional', 'biliti', 'lessli',
+ 'entli', 'ation', 'alism', 'aliti', 'ousli',
+ 'iviti', 'fulli', 'enci', 'anci', 'abli',
+ 'izer', 'ator', 'alli', 'bli', 'ogi', 'li')
+ __step3_suffixes = ('ational', 'tional', 'alize', 'icate', 'iciti',
+ 'ative', 'ical', 'ness', 'ful')
+ __step4_suffixes = ('ement', 'ance', 'ence', 'able', 'ible', 'ment',
+ 'ant', 'ent', 'ism', 'ate', 'iti', 'ous',
+ 'ive', 'ize', 'ion', 'al', 'er', 'ic')
__step5_suffixes = ("e", "l")
- __special_words = {
- "skis": "ski",
- "skies": "sky",
- "dying": "die",
- "lying": "lie",
- "tying": "tie",
- "idly": "idl",
- "gently": "gentl",
- "ugly": "ugli",
- "early": "earli",
- "only": "onli",
- "singly": "singl",
- "sky": "sky",
- "news": "news",
- "howe": "howe",
- "atlas": "atlas",
- "cosmos": "cosmos",
- "bias": "bias",
- "andes": "andes",
- "inning": "inning",
- "innings": "inning",
- "outing": "outing",
- "outings": "outing",
- "canning": "canning",
- "cannings": "canning",
- "herring": "herring",
- "herrings": "herring",
- "earring": "earring",
- "earrings": "earring",
- "proceed": "proceed",
- "proceeds": "proceed",
- "proceeded": "proceed",
- "proceeding": "proceed",
- "exceed": "exceed",
- "exceeds": "exceed",
- "exceeded": "exceed",
- "exceeding": "exceed",
- "succeed": "succeed",
- "succeeds": "succeed",
- "succeeded": "succeed",
- "succeeding": "succeed",
- }
+ __special_words = {"skis" : "ski",
+ "skies" : "sky",
+ "dying" : "die",
+ "lying" : "lie",
+ "tying" : "tie",
+ "idly" : "idl",
+ "gently" : "gentl",
+ "ugly" : "ugli",
+ "early" : "earli",
+ "only" : "onli",
+ "singly" : "singl",
+ "sky" : "sky",
+ "news" : "news",
+ "howe" : "howe",
+ "atlas" : "atlas",
+ "cosmos" : "cosmos",
+ "bias" : "bias",
+ "andes" : "andes",
+ "inning" : "inning",
+ "innings" : "inning",
+ "outing" : "outing",
+ "outings" : "outing",
+ "canning" : "canning",
+ "cannings" : "canning",
+ "herring" : "herring",
+ "herrings" : "herring",
+ "earring" : "earring",
+ "earrings" : "earring",
+ "proceed" : "proceed",
+ "proceeds" : "proceed",
+ "proceeded" : "proceed",
+ "proceeding" : "proceed",
+ "exceed" : "exceed",
+ "exceeds" : "exceed",
+ "exceeded" : "exceed",
+ "exceeding" : "exceed",
+ "succeed" : "succeed",
+ "succeeds" : "succeed",
+ "succeeded" : "succeed",
+ "succeeding" : "succeed"}
def stem(self, word):
return self.__special_words[word]
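# The special-word table short-circuits the algorithm for irregular and
# protected forms; e.g. (illustrative):
#
#     EnglishStemmer().stem("dying")  # -> "die", looked up directly
#     EnglishStemmer().stem("news")   # -> "news", shielded from the step rules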
# Map the different apostrophe characters to a single consistent one
- word = (
- word.replace("\u2019", "\x27")
- .replace("\u2018", "\x27")
- .replace("\u201B", "\x27")
- )
+ word = (word.replace("\u2019", "\x27")
+ .replace("\u2018", "\x27")
+ .replace("\u201B", "\x27"))
if word.startswith("\x27"):
word = word[1:]
word = "".join(("Y", word[1:]))
for i in range(1, len(word)):
- if word[i - 1] in self.__vowels and word[i] == "y":
- word = "".join((word[:i], "Y", word[i + 1 :]))
+ if word[i-1] in self.__vowels and word[i] == "y":
+ word = "".join((word[:i], "Y", word[i+1:]))
step1a_vowel_found = False
step1b_vowel_found = False
r1 = word[6:]
for i in range(1, len(r1)):
- if r1[i] not in self.__vowels and r1[i - 1] in self.__vowels:
- r2 = r1[i + 1 :]
+ if r1[i] not in self.__vowels and r1[i-1] in self.__vowels:
+ r2 = r1[i+1:]
break
else:
r1, r2 = self._r1r2_standard(word, self.__vowels)
+
# STEP 0
for suffix in self.__step0_suffixes:
if word.endswith(suffix):
- word = word[: -len(suffix)]
- r1 = r1[: -len(suffix)]
- r2 = r2[: -len(suffix)]
+ word = word[:-len(suffix)]
+ r1 = r1[:-len(suffix)]
+ r2 = r2[:-len(suffix)]
break
# STEP 1a
r2 = r2[:-2]
elif suffix in ("ied", "ies"):
- if len(word[: -len(suffix)]) > 1:
+ if len(word[:-len(suffix)]) > 1:
word = word[:-2]
r1 = r1[:-2]
r2 = r2[:-2]
else:
r2 = ""
else:
- for letter in word[: -len(suffix)]:
+ for letter in word[:-len(suffix)]:
if letter in self.__vowels:
step1b_vowel_found = True
break
if step1b_vowel_found:
- word = word[: -len(suffix)]
- r1 = r1[: -len(suffix)]
- r2 = r2[: -len(suffix)]
+ word = word[:-len(suffix)]
+ r1 = r1[:-len(suffix)]
+ r2 = r2[:-len(suffix)]
if word.endswith(("at", "bl", "iz")):
word = "".join((word, "e"))
r1 = "".join((r1, "e"))
- if len(word) > 5 or len(r1) >= 3:
+ if len(word) > 5 or len(r1) >=3:
r2 = "".join((r2, "e"))
elif word.endswith(self.__double_consonants):
r1 = r1[:-1]
r2 = r2[:-1]
- elif (
- r1 == ""
- and len(word) >= 3
- and word[-1] not in self.__vowels
- and word[-1] not in "wxY"
- and word[-2] in self.__vowels
- and word[-3] not in self.__vowels
- ) or (
- r1 == ""
- and len(word) == 2
- and word[0] in self.__vowels
- and word[1] not in self.__vowels
- ):
+ elif ((r1 == "" and len(word) >= 3 and
+ word[-1] not in self.__vowels and
+ word[-1] not in "wxY" and
+ word[-2] in self.__vowels and
+ word[-3] not in self.__vowels)
+ or
+ (r1 == "" and len(word) == 2 and
+ word[0] in self.__vowels and
+ word[1] not in self.__vowels)):
word = "".join((word, "e"))
r2 = ""
elif suffix in ("ful", "ness"):
- word = word[: -len(suffix)]
- r1 = r1[: -len(suffix)]
- r2 = r2[: -len(suffix)]
+ word = word[:-len(suffix)]
+ r1 = r1[:-len(suffix)]
+ r2 = r2[:-len(suffix)]
elif suffix == "ative" and r2.endswith(suffix):
word = word[:-5]
r1 = r1[:-3]
r2 = r2[:-3]
else:
- word = word[: -len(suffix)]
- r1 = r1[: -len(suffix)]
- r2 = r2[: -len(suffix)]
+ word = word[:-len(suffix)]
+ r1 = r1[:-len(suffix)]
+ r2 = r2[:-len(suffix)]
break
# STEP 5
elif r2.endswith("e"):
word = word[:-1]
elif r1.endswith("e"):
- if len(word) >= 4 and (
- word[-2] in self.__vowels
- or word[-2] in "wxY"
- or word[-3] not in self.__vowels
- or word[-4] in self.__vowels
- ):
+ if len(word) >= 4 and (word[-2] in self.__vowels or
+ word[-2] in "wxY" or
+ word[-3] not in self.__vowels or
+ word[-4] in self.__vowels):
word = word[:-1]
+
word = word.replace("Y", "y")
+
return word
+
class FinnishStemmer(_StandardStemmer):
"""
__vowels = "aeiouy\xE4\xF6"
__restricted_vowels = "aeiou\xE4\xF6"
- __long_vowels = ("aa", "ee", "ii", "oo", "uu", "\xE4\xE4", "\xF6\xF6")
+ __long_vowels = ("aa", "ee", "ii", "oo", "uu", "\xE4\xE4",
+ "\xF6\xF6")
__consonants = "bcdfghjklmnpqrstvwxz"
- __double_consonants = (
- "bb",
- "cc",
- "dd",
- "ff",
- "gg",
- "hh",
- "jj",
- "kk",
- "ll",
- "mm",
- "nn",
- "pp",
- "qq",
- "rr",
- "ss",
- "tt",
- "vv",
- "ww",
- "xx",
- "zz",
- )
- __step1_suffixes = (
- "kaan",
- "k\xE4\xE4n",
- "sti",
- "kin",
- "han",
- "h\xE4n",
- "ko",
- "k\xF6",
- "pa",
- "p\xE4",
- )
- __step2_suffixes = ("nsa", "ns\xE4", "mme", "nne", "si", "ni", "an", "\xE4n", "en")
- __step3_suffixes = (
- "siin",
- "tten",
- "seen",
- "han",
- "hen",
- "hin",
- "hon",
- "h\xE4n",
- "h\xF6n",
- "den",
- "tta",
- "tt\xE4",
- "ssa",
- "ss\xE4",
- "sta",
- "st\xE4",
- "lla",
- "ll\xE4",
- "lta",
- "lt\xE4",
- "lle",
- "ksi",
- "ine",
- "ta",
- "t\xE4",
- "na",
- "n\xE4",
- "a",
- "\xE4",
- "n",
- )
- __step4_suffixes = (
- "impi",
- "impa",
- "imp\xE4",
- "immi",
- "imma",
- "imm\xE4",
- "mpi",
- "mpa",
- "mp\xE4",
- "mmi",
- "mma",
- "mm\xE4",
- "eja",
- "ej\xE4",
- )
+ __double_consonants = ("bb", "cc", "dd", "ff", "gg", "hh", "jj",
+ "kk", "ll", "mm", "nn", "pp", "qq", "rr",
+ "ss", "tt", "vv", "ww", "xx", "zz")
+ __step1_suffixes = ('kaan', 'k\xE4\xE4n', 'sti', 'kin', 'han',
+ 'h\xE4n', 'ko', 'k\xF6', 'pa', 'p\xE4')
+ __step2_suffixes = ('nsa', 'ns\xE4', 'mme', 'nne', 'si', 'ni',
+ 'an', '\xE4n', 'en')
+ __step3_suffixes = ('siin', 'tten', 'seen', 'han', 'hen', 'hin',
+ 'hon', 'h\xE4n', 'h\xF6n', 'den', 'tta',
+ 'tt\xE4', 'ssa', 'ss\xE4', 'sta',
+ 'st\xE4', 'lla', 'll\xE4', 'lta',
+ 'lt\xE4', 'lle', 'ksi', 'ine', 'ta',
+ 't\xE4', 'na', 'n\xE4', 'a', '\xE4',
+ 'n')
+ __step4_suffixes = ('impi', 'impa', 'imp\xE4', 'immi', 'imma',
+ 'imm\xE4', 'mpi', 'mpa', 'mp\xE4', 'mmi',
+ 'mma', 'mm\xE4', 'eja', 'ej\xE4')
def stem(self, word):
"""
r1 = r1[:-3]
r2 = r2[:-3]
else:
- if word[-len(suffix) - 1] in "ntaeiouy\xE4\xF6":
- word = word[: -len(suffix)]
- r1 = r1[: -len(suffix)]
- r2 = r2[: -len(suffix)]
+ if word[-len(suffix)-1] in "ntaeiouy\xE4\xF6":
+ word = word[:-len(suffix)]
+ r1 = r1[:-len(suffix)]
+ r2 = r2[:-len(suffix)]
break
# STEP 2: Possessives
r2 = suffix_replace(r2, "kse", "ksi")
elif suffix == "an":
- if word[-4:-2] in ("ta", "na") or word[-5:-2] in (
- "ssa",
- "sta",
- "lla",
- "lta",
- ):
+ if (word[-4:-2] in ("ta", "na") or
+ word[-5:-2] in ("ssa", "sta", "lla", "lta")):
word = word[:-2]
r1 = r1[:-2]
r2 = r2[:-2]
elif suffix == "\xE4n":
- if word[-4:-2] in ("t\xE4", "n\xE4") or word[-5:-2] in (
- "ss\xE4",
- "st\xE4",
- "ll\xE4",
- "lt\xE4",
- ):
+ if (word[-4:-2] in ("t\xE4", "n\xE4") or
+ word[-5:-2] in ("ss\xE4", "st\xE4",
+ "ll\xE4", "lt\xE4")):
word = word[:-2]
r1 = r1[:-2]
r2 = r2[:-2]
# STEP 3: Cases
for suffix in self.__step3_suffixes:
if r1.endswith(suffix):
- if suffix in ("han", "hen", "hin", "hon", "h\xE4n", "h\xF6n"):
- if (
- (suffix == "han" and word[-4] == "a")
- or (suffix == "hen" and word[-4] == "e")
- or (suffix == "hin" and word[-4] == "i")
- or (suffix == "hon" and word[-4] == "o")
- or (suffix == "h\xE4n" and word[-4] == "\xE4")
- or (suffix == "h\xF6n" and word[-4] == "\xF6")
- ):
+ if suffix in ("han", "hen", "hin", "hon", "h\xE4n",
+ "h\xF6n"):
+ if ((suffix == "han" and word[-4] == "a") or
+ (suffix == "hen" and word[-4] == "e") or
+ (suffix == "hin" and word[-4] == "i") or
+ (suffix == "hon" and word[-4] == "o") or
+ (suffix == "h\xE4n" and word[-4] == "\xE4") or
+ (suffix == "h\xF6n" and word[-4] == "\xF6")):
word = word[:-3]
r1 = r1[:-3]
r2 = r2[:-3]
step3_success = True
elif suffix in ("siin", "den", "tten"):
- if (
- word[-len(suffix) - 1] == "i"
- and word[-len(suffix) - 2] in self.__restricted_vowels
- ):
- word = word[: -len(suffix)]
- r1 = r1[: -len(suffix)]
- r2 = r2[: -len(suffix)]
+ if (word[-len(suffix)-1] == "i" and
+ word[-len(suffix)-2] in self.__restricted_vowels):
+ word = word[:-len(suffix)]
+ r1 = r1[:-len(suffix)]
+ r2 = r2[:-len(suffix)]
step3_success = True
else:
continue
r1 = r1[:-1]
r2 = r2[:-1]
else:
- word = word[: -len(suffix)]
- r1 = r1[: -len(suffix)]
- r2 = r2[: -len(suffix)]
+ word = word[:-len(suffix)]
+ r1 = r1[:-len(suffix)]
+ r2 = r2[:-len(suffix)]
step3_success = True
break
# STEP 4: Other endings
for suffix in self.__step4_suffixes:
if r2.endswith(suffix):
- if suffix in ("mpi", "mpa", "mp\xE4", "mmi", "mma", "mm\xE4"):
+ if suffix in ("mpi", "mpa", "mp\xE4", "mmi", "mma",
+ "mm\xE4"):
if word[-5:-3] != "po":
word = word[:-3]
r1 = r1[:-3]
r2 = r2[:-3]
else:
- word = word[: -len(suffix)]
- r1 = r1[: -len(suffix)]
- r2 = r2[: -len(suffix)]
+ word = word[:-len(suffix)]
+ r1 = r1[:-len(suffix)]
+ r2 = r2[:-len(suffix)]
break
# STEP 5: Plurals
word = word[:-1]
r1 = r1[:-1]
- elif (
- not step3_success
- and len(r1) >= 2
- and r1[-1] == "t"
- and r1[-2] in self.__vowels
- ):
+ elif (not step3_success and len(r1) >= 2 and
+ r1[-1] == "t" and r1[-2] in self.__vowels):
word = word[:-1]
r1 = r1[:-1]
r2 = r2[:-1]
word = word[:-1]
r1 = r1[:-1]
- if len(r1) >= 2 and r1[-2] in self.__consonants and r1[-1] in "a\xE4ei":
+ if (len(r1) >= 2 and r1[-2] in self.__consonants and
+ r1[-1] in "a\xE4ei"):
word = word[:-1]
r1 = r1[:-1]
continue
else:
if i == 1:
- if word[-i - 1 :] in self.__double_consonants:
+ if word[-i-1:] in self.__double_consonants:
word = word[:-1]
else:
- if word[-i - 1 : -i + 1] in self.__double_consonants:
- word = "".join((word[:-i], word[-i + 1 :]))
+ if word[-i-1:-i+1] in self.__double_consonants:
+ word = "".join((word[:-i], word[-i+1:]))
break
+
return word
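# The final clean-up shortens a trailing double consonant to a single one --
# a minimal sketch of the i == 1 case above:
#
#     word = "hmm"              # toy example, not a Finnish word
#     if word[-2:] in ("mm",):  # word[-i-1:] with i == 1
#         word = word[:-1]      # -> "hm"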
+
class FrenchStemmer(_StandardStemmer):
"""
"""
__vowels = "aeiouy\xE2\xE0\xEB\xE9\xEA\xE8\xEF\xEE\xF4\xFB\xF9"
- __step1_suffixes = (
- "issements",
- "issement",
- "atrices",
- "atrice",
- "ateurs",
- "ations",
- "logies",
- "usions",
- "utions",
- "ements",
- "amment",
- "emment",
- "ances",
- "iqUes",
- "ismes",
- "ables",
- "istes",
- "ateur",
- "ation",
- "logie",
- "usion",
- "ution",
- "ences",
- "ement",
- "euses",
- "ments",
- "ance",
- "iqUe",
- "isme",
- "able",
- "iste",
- "ence",
- "it\xE9s",
- "ives",
- "eaux",
- "euse",
- "ment",
- "eux",
- "it\xE9",
- "ive",
- "ifs",
- "aux",
- "if",
- )
- __step2a_suffixes = (
- "issaIent",
- "issantes",
- "iraIent",
- "issante",
- "issants",
- "issions",
- "irions",
- "issais",
- "issait",
- "issant",
- "issent",
- "issiez",
- "issons",
- "irais",
- "irait",
- "irent",
- "iriez",
- "irons",
- "iront",
- "isses",
- "issez",
- "\xEEmes",
- "\xEEtes",
- "irai",
- "iras",
- "irez",
- "isse",
- "ies",
- "ira",
- "\xEEt",
- "ie",
- "ir",
- "is",
- "it",
- "i",
- )
- __step2b_suffixes = (
- "eraIent",
- "assions",
- "erions",
- "assent",
- "assiez",
- "\xE8rent",
- "erais",
- "erait",
- "eriez",
- "erons",
- "eront",
- "aIent",
- "antes",
- "asses",
- "ions",
- "erai",
- "eras",
- "erez",
- "\xE2mes",
- "\xE2tes",
- "ante",
- "ants",
- "asse",
- "\xE9es",
- "era",
- "iez",
- "ais",
- "ait",
- "ant",
- "\xE9e",
- "\xE9s",
- "er",
- "ez",
- "\xE2t",
- "ai",
- "as",
- "\xE9",
- "a",
- )
- __step4_suffixes = ("i\xE8re", "I\xE8re", "ion", "ier", "Ier", "e", "\xEB")
+ __step1_suffixes = ('issements', 'issement', 'atrices', 'atrice',
+ 'ateurs', 'ations', 'logies', 'usions',
+ 'utions', 'ements', 'amment', 'emment',
+ 'ances', 'iqUes', 'ismes', 'ables', 'istes',
+ 'ateur', 'ation', 'logie', 'usion', 'ution',
+ 'ences', 'ement', 'euses', 'ments', 'ance',
+ 'iqUe', 'isme', 'able', 'iste', 'ence',
+ 'it\xE9s', 'ives', 'eaux', 'euse', 'ment',
+ 'eux', 'it\xE9', 'ive', 'ifs', 'aux', 'if')
+ __step2a_suffixes = ('issaIent', 'issantes', 'iraIent', 'issante',
+ 'issants', 'issions', 'irions', 'issais',
+ 'issait', 'issant', 'issent', 'issiez', 'issons',
+ 'irais', 'irait', 'irent', 'iriez', 'irons',
+ 'iront', 'isses', 'issez', '\xEEmes',
+ '\xEEtes', 'irai', 'iras', 'irez', 'isse',
+ 'ies', 'ira', '\xEEt', 'ie', 'ir', 'is',
+ 'it', 'i')
+ __step2b_suffixes = ('eraIent', 'assions', 'erions', 'assent',
+ 'assiez', '\xE8rent', 'erais', 'erait',
+ 'eriez', 'erons', 'eront', 'aIent', 'antes',
+ 'asses', 'ions', 'erai', 'eras', 'erez',
+ '\xE2mes', '\xE2tes', 'ante', 'ants',
+ 'asse', '\xE9es', 'era', 'iez', 'ais',
+ 'ait', 'ant', '\xE9e', '\xE9s', 'er',
+ 'ez', '\xE2t', 'ai', 'as', '\xE9', 'a')
+ __step4_suffixes = ('i\xE8re', 'I\xE8re', 'ion', 'ier', 'Ier',
+ 'e', '\xEB')
def stem(self, word):
"""
# Every occurrence of 'u' after 'q' is put into upper case.
for i in range(1, len(word)):
- if word[i - 1] == "q" and word[i] == "u":
- word = "".join((word[:i], "U", word[i + 1 :]))
+ if word[i-1] == "q" and word[i] == "u":
+ word = "".join((word[:i], "U", word[i+1:]))
# Every occurrence of 'u' and 'i'
# between vowels is put into upper case.
# Every occurrence of 'y' preceded or
# followed by a vowel is also put into upper case.
- for i in range(1, len(word) - 1):
- if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels:
+ for i in range(1, len(word)-1):
+ if word[i-1] in self.__vowels and word[i+1] in self.__vowels:
if word[i] == "u":
- word = "".join((word[:i], "U", word[i + 1 :]))
+ word = "".join((word[:i], "U", word[i+1:]))
elif word[i] == "i":
- word = "".join((word[:i], "I", word[i + 1 :]))
+ word = "".join((word[:i], "I", word[i+1:]))
- if word[i - 1] in self.__vowels or word[i + 1] in self.__vowels:
+ if word[i-1] in self.__vowels or word[i+1] in self.__vowels:
if word[i] == "y":
- word = "".join((word[:i], "Y", word[i + 1 :]))
+ word = "".join((word[:i], "Y", word[i+1:]))
r1, r2 = self._r1r2_standard(word, self.__vowels)
rv = self.__rv_french(word, self.__vowels)
elif suffix in ("euse", "euses"):
if suffix in r2:
- word = word[: -len(suffix)]
+ word = word[:-len(suffix)]
step1_success = True
elif suffix in r1:
step1_success = True
elif suffix in ("ement", "ements") and suffix in rv:
- word = word[: -len(suffix)]
+ word = word[:-len(suffix)]
step1_success = True
if word[-2:] == "iv" and "iv" in r2:
word = suffix_replace(word, "emment", "ent")
rv_ending_found = True
- elif (
- suffix in ("ment", "ments")
- and suffix in rv
- and not rv.startswith(suffix)
- and rv[rv.rindex(suffix) - 1] in self.__vowels
- ):
- word = word[: -len(suffix)]
- rv = rv[: -len(suffix)]
+ elif (suffix in ("ment", "ments") and suffix in rv and
+ not rv.startswith(suffix) and
+ rv[rv.rindex(suffix)-1] in self.__vowels):
+ word = word[:-len(suffix)]
+ rv = rv[:-len(suffix)]
rv_ending_found = True
elif suffix == "aux" and suffix in r1:
word = "".join((word[:-2], "l"))
step1_success = True
- elif (
- suffix in ("issement", "issements")
- and suffix in r1
- and word[-len(suffix) - 1] not in self.__vowels
- ):
- word = word[: -len(suffix)]
+ elif (suffix in ("issement", "issements") and suffix in r1
+ and word[-len(suffix)-1] not in self.__vowels):
+ word = word[:-len(suffix)]
step1_success = True
- elif (
- suffix
- in (
- "ance",
- "iqUe",
- "isme",
- "able",
- "iste",
- "eux",
- "ances",
- "iqUes",
- "ismes",
- "ables",
- "istes",
- )
- and suffix in r2
- ):
- word = word[: -len(suffix)]
+ elif suffix in ("ance", "iqUe", "isme", "able", "iste",
+ "eux", "ances", "iqUes", "ismes",
+ "ables", "istes") and suffix in r2:
+ word = word[:-len(suffix)]
step1_success = True
- elif (
- suffix
- in ("atrice", "ateur", "ation", "atrices", "ateurs", "ations")
- and suffix in r2
- ):
- word = word[: -len(suffix)]
+ elif suffix in ("atrice", "ateur", "ation", "atrices",
+ "ateurs", "ations") and suffix in r2:
+ word = word[:-len(suffix)]
step1_success = True
if word[-2:] == "ic":
word = suffix_replace(word, suffix, "log")
step1_success = True
- elif suffix in ("usion", "ution", "usions", "utions") and suffix in r2:
+ elif (suffix in ("usion", "ution", "usions", "utions") and
+ suffix in r2):
word = suffix_replace(word, suffix, "u")
step1_success = True
step1_success = True
elif suffix in ("it\xE9", "it\xE9s") and suffix in r2:
- word = word[: -len(suffix)]
+ word = word[:-len(suffix)]
step1_success = True
if word[-4:] == "abil":
if "iv" in r2:
word = word[:-2]
- elif suffix in ("if", "ive", "ifs", "ives") and suffix in r2:
- word = word[: -len(suffix)]
+ elif (suffix in ("if", "ive", "ifs", "ives") and
+ suffix in r2):
+ word = word[:-len(suffix)]
step1_success = True
if word[-2:] == "at" and "at" in r2:
if not step1_success or rv_ending_found:
for suffix in self.__step2a_suffixes:
if word.endswith(suffix):
- if (
- suffix in rv
- and len(rv) > len(suffix)
- and rv[rv.rindex(suffix) - 1] not in self.__vowels
- ):
- word = word[: -len(suffix)]
+ if (suffix in rv and len(rv) > len(suffix) and
+ rv[rv.rindex(suffix)-1] not in self.__vowels):
+ word = word[:-len(suffix)]
step2a_success = True
break
- # STEP 2b: Other verb suffixes
+ # STEP 2b: Other verb suffixes
if not step2a_success:
for suffix in self.__step2b_suffixes:
if rv.endswith(suffix):
word = word[:-4]
step2b_success = True
- elif suffix in (
- "eraIent",
- "erions",
- "\xE8rent",
- "erais",
- "erait",
- "eriez",
- "erons",
- "eront",
- "erai",
- "eras",
- "erez",
- "\xE9es",
- "era",
- "iez",
- "\xE9e",
- "\xE9s",
- "er",
- "ez",
- "\xE9",
- ):
- word = word[: -len(suffix)]
+ elif suffix in ('eraIent', 'erions', '\xE8rent',
+ 'erais', 'erait', 'eriez',
+ 'erons', 'eront', 'erai', 'eras',
+ 'erez', '\xE9es', 'era', 'iez',
+ '\xE9e', '\xE9s', 'er', 'ez',
+ '\xE9'):
+ word = word[:-len(suffix)]
step2b_success = True
- elif suffix in (
- "assions",
- "assent",
- "assiez",
- "aIent",
- "antes",
- "asses",
- "\xE2mes",
- "\xE2tes",
- "ante",
- "ants",
- "asse",
- "ais",
- "ait",
- "ant",
- "\xE2t",
- "ai",
- "as",
- "a",
- ):
- word = word[: -len(suffix)]
- rv = rv[: -len(suffix)]
+ elif suffix in ('assions', 'assent', 'assiez',
+ 'aIent', 'antes', 'asses',
+ '\xE2mes', '\xE2tes', 'ante',
+ 'ants', 'asse', 'ais', 'ait',
+ 'ant', '\xE2t', 'ai', 'as',
+ 'a'):
+ word = word[:-len(suffix)]
+ rv = rv[:-len(suffix)]
step2b_success = True
if rv.endswith("e"):
word = word[:-1]
# STEP 4: Residual suffixes
else:
- if len(word) >= 2 and word[-1] == "s" and word[-2] not in "aiou\xE8s":
+ if (len(word) >= 2 and word[-1] == "s" and
+ word[-2] not in "aiou\xE8s"):
word = word[:-1]
for suffix in self.__step4_suffixes:
if word.endswith(suffix):
if suffix in rv:
- if suffix == "ion" and suffix in r2 and rv[-4] in "st":
+ if (suffix == "ion" and suffix in r2 and
+ rv[-4] in "st"):
word = word[:-3]
- elif suffix in ("ier", "i\xE8re", "Ier", "I\xE8re"):
+ elif suffix in ("ier", "i\xE8re", "Ier",
+ "I\xE8re"):
word = suffix_replace(word, suffix, "i")
elif suffix == "e":
i += 1
else:
if i != 1 and word[-i] in ("\xE9", "\xE8"):
- word = "".join((word[:-i], "e", word[-i + 1 :]))
+ word = "".join((word[:-i], "e", word[-i+1:]))
break
- word = word.replace("I", "i").replace("U", "u").replace("Y", "y")
+ word = (word.replace("I", "i")
+ .replace("U", "u")
+ .replace("Y", "y"))
+
return word
+
+
def __rv_french(self, word, vowels):
"""
Return the region RV that is used by the French stemmer.
"""
rv = ""
if len(word) >= 2:
- if word.startswith(("par", "col", "tap")) or (
- word[0] in vowels and word[1] in vowels
- ):
+ if (word.startswith(("par", "col", "tap")) or
+ (word[0] in vowels and word[1] in vowels)):
rv = word[3:]
else:
for i in range(1, len(word)):
if word[i] in vowels:
- rv = word[i + 1 :]
+ rv = word[i+1:]
break
return rv
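# Illustrative values under the definition implemented above:
#
#     # starts with "par"/"col"/"tap", or with two vowels -> RV = word[3:]
#     __rv_french("parler", vowels)  # -> "ler"
#     __rv_french("aimer", vowels)   # -> "er"
#     # otherwise RV starts right after the first vowel
#     __rv_french("donner", vowels)  # -> "nner"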
+
class GermanStemmer(_StandardStemmer):
"""
__step1_suffixes = ("ern", "em", "er", "en", "es", "e", "s")
__step2_suffixes = ("est", "en", "er", "st")
- __step3_suffixes = ("isch", "lich", "heit", "keit", "end", "ung", "ig", "ik")
+ __step3_suffixes = ("isch", "lich", "heit", "keit",
+ "end", "ung", "ig", "ik")
def stem(self, word):
"""
# Every occurrence of 'u' and 'y'
# between vowels is put into upper case.
- for i in range(1, len(word) - 1):
- if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels:
+ for i in range(1, len(word)-1):
+ if word[i-1] in self.__vowels and word[i+1] in self.__vowels:
if word[i] == "u":
- word = "".join((word[:i], "U", word[i + 1 :]))
+ word = "".join((word[:i], "U", word[i+1:]))
elif word[i] == "y":
- word = "".join((word[:i], "Y", word[i + 1 :]))
+ word = "".join((word[:i], "Y", word[i+1:]))
r1, r2 = self._r1r2_standard(word, self.__vowels)
# R1 is adjusted so that the region before it
# contains at least 3 letters.
for i in range(1, len(word)):
- if word[i] not in self.__vowels and word[i - 1] in self.__vowels:
- if 3 > len(word[: i + 1]) > 0:
+ if word[i] not in self.__vowels and word[i-1] in self.__vowels:
+ if len(word[:i+1]) < 3 and len(word[:i+1]) > 0:
r1 = word[3:]
- elif len(word[: i + 1]) == 0:
+ elif len(word[:i+1]) == 0:
return word
break
# STEP 1
for suffix in self.__step1_suffixes:
if r1.endswith(suffix):
- if (
- suffix in ("en", "es", "e")
- and word[-len(suffix) - 4 : -len(suffix)] == "niss"
- ):
- word = word[: -len(suffix) - 1]
- r1 = r1[: -len(suffix) - 1]
- r2 = r2[: -len(suffix) - 1]
+ if (suffix in ("en", "es", "e") and
+ word[-len(suffix)-4:-len(suffix)] == "niss"):
+ word = word[:-len(suffix)-1]
+ r1 = r1[:-len(suffix)-1]
+ r2 = r2[:-len(suffix)-1]
elif suffix == "s":
if word[-2] in self.__s_ending:
r1 = r1[:-1]
r2 = r2[:-1]
else:
- word = word[: -len(suffix)]
- r1 = r1[: -len(suffix)]
- r2 = r2[: -len(suffix)]
+ word = word[:-len(suffix)]
+ r1 = r1[:-len(suffix)]
+ r2 = r2[:-len(suffix)]
break
# STEP 2
r1 = r1[:-2]
r2 = r2[:-2]
else:
- word = word[: -len(suffix)]
- r1 = r1[: -len(suffix)]
- r2 = r2[: -len(suffix)]
+ word = word[:-len(suffix)]
+ r1 = r1[:-len(suffix)]
+ r2 = r2[:-len(suffix)]
break
# STEP 3: Derivational suffixes
for suffix in self.__step3_suffixes:
if r2.endswith(suffix):
if suffix in ("end", "ung"):
- if (
- "ig" in r2[-len(suffix) - 2 : -len(suffix)]
- and "e" not in r2[-len(suffix) - 3 : -len(suffix) - 2]
- ):
- word = word[: -len(suffix) - 2]
+ if ("ig" in r2[-len(suffix)-2:-len(suffix)] and
+ "e" not in r2[-len(suffix)-3:-len(suffix)-2]):
+ word = word[:-len(suffix)-2]
else:
- word = word[: -len(suffix)]
+ word = word[:-len(suffix)]
- elif (
- suffix in ("ig", "ik", "isch")
- and "e" not in r2[-len(suffix) - 1 : -len(suffix)]
- ):
- word = word[: -len(suffix)]
+ elif (suffix in ("ig", "ik", "isch") and
+ "e" not in r2[-len(suffix)-1:-len(suffix)]):
+ word = word[:-len(suffix)]
elif suffix in ("lich", "heit"):
- if (
- "er" in r1[-len(suffix) - 2 : -len(suffix)]
- or "en" in r1[-len(suffix) - 2 : -len(suffix)]
- ):
- word = word[: -len(suffix) - 2]
+ if ("er" in r1[-len(suffix)-2:-len(suffix)] or
+ "en" in r1[-len(suffix)-2:-len(suffix)]):
+ word = word[:-len(suffix)-2]
else:
- word = word[: -len(suffix)]
+ word = word[:-len(suffix)]
elif suffix == "keit":
- if "lich" in r2[-len(suffix) - 4 : -len(suffix)]:
- word = word[: -len(suffix) - 4]
+ if "lich" in r2[-len(suffix)-4:-len(suffix)]:
+ word = word[:-len(suffix)-4]
- elif "ig" in r2[-len(suffix) - 2 : -len(suffix)]:
- word = word[: -len(suffix) - 2]
+ elif "ig" in r2[-len(suffix)-2:-len(suffix)]:
+ word = word[:-len(suffix)-2]
else:
- word = word[: -len(suffix)]
+ word = word[:-len(suffix)]
break
# Umlaut accents are removed and
# 'u' and 'y' are put back into lower case.
- word = (
- word.replace("\xE4", "a")
- .replace("\xF6", "o")
- .replace("\xFC", "u")
- .replace("U", "u")
- .replace("Y", "y")
- )
+ word = (word.replace("\xE4", "a").replace("\xF6", "o")
+ .replace("\xFC", "u").replace("U", "u")
+ .replace("Y", "y"))
+
return word
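# E.g. (illustrative): the final normalization maps "h\xE4user" -> "hauser"
# and lowers the temporary 'U'/'Y' consonant markers introduced earlier:
#
#     "h\xE4user".replace("\xE4", "a")  # -> "hauser"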
+
class HungarianStemmer(_LanguageSpecificStemmer):
"""
__vowels = "aeiou\xF6\xFC\xE1\xE9\xED\xF3\xF5\xFA\xFB"
__digraphs = ("cs", "dz", "dzs", "gy", "ly", "ny", "ty", "zs")
- __double_consonants = (
- "bb",
- "cc",
- "ccs",
- "dd",
- "ff",
- "gg",
- "ggy",
- "jj",
- "kk",
- "ll",
- "lly",
- "mm",
- "nn",
- "nny",
- "pp",
- "rr",
- "ss",
- "ssz",
- "tt",
- "tty",
- "vv",
- "zz",
- "zzs",
- )
+ __double_consonants = ("bb", "cc", "ccs", "dd", "ff", "gg",
+ "ggy", "jj", "kk", "ll", "lly", "mm",
+ "nn", "nny", "pp", "rr", "ss", "ssz",
+ "tt", "tty", "vv", "zz", "zzs")
__step1_suffixes = ("al", "el")
- __step2_suffixes = (
- "k\xE9ppen",
- "onk\xE9nt",
- "enk\xE9nt",
- "ank\xE9nt",
- "k\xE9pp",
- "k\xE9nt",
- "ban",
- "ben",
- "nak",
- "nek",
- "val",
- "vel",
- "t\xF3l",
- "t\xF5l",
- "r\xF3l",
- "r\xF5l",
- "b\xF3l",
- "b\xF5l",
- "hoz",
- "hez",
- "h\xF6z",
- "n\xE1l",
- "n\xE9l",
- "\xE9rt",
- "kor",
- "ba",
- "be",
- "ra",
- "re",
- "ig",
- "at",
- "et",
- "ot",
- "\xF6t",
- "ul",
- "\xFCl",
- "v\xE1",
- "v\xE9",
- "en",
- "on",
- "an",
- "\xF6n",
- "n",
- "t",
- )
+ __step2_suffixes = ('k\xE9ppen', 'onk\xE9nt', 'enk\xE9nt',
+ 'ank\xE9nt', 'k\xE9pp', 'k\xE9nt', 'ban',
+ 'ben', 'nak', 'nek', 'val', 'vel', 't\xF3l',
+ 't\xF5l', 'r\xF3l', 'r\xF5l', 'b\xF3l',
+ 'b\xF5l', 'hoz', 'hez', 'h\xF6z',
+ 'n\xE1l', 'n\xE9l', '\xE9rt', 'kor',
+ 'ba', 'be', 'ra', 're', 'ig', 'at', 'et',
+ 'ot', '\xF6t', 'ul', '\xFCl', 'v\xE1',
+ 'v\xE9', 'en', 'on', 'an', '\xF6n',
+ 'n', 't')
__step3_suffixes = ("\xE1nk\xE9nt", "\xE1n", "\xE9n")
- __step4_suffixes = (
- "astul",
- "est\xFCl",
- "\xE1stul",
- "\xE9st\xFCl",
- "stul",
- "st\xFCl",
- )
+ __step4_suffixes = ('astul', 'est\xFCl', '\xE1stul',
+ '\xE9st\xFCl', 'stul', 'st\xFCl')
__step5_suffixes = ("\xE1", "\xE9")
- __step6_suffixes = (
- "ok\xE9",
- "\xF6k\xE9",
- "ak\xE9",
- "ek\xE9",
- "\xE1k\xE9",
- "\xE1\xE9i",
- "\xE9k\xE9",
- "\xE9\xE9i",
- "k\xE9",
- "\xE9i",
- "\xE9\xE9",
- "\xE9",
- )
- __step7_suffixes = (
- "\xE1juk",
- "\xE9j\xFCk",
- "\xFCnk",
- "unk",
- "juk",
- "j\xFCk",
- "\xE1nk",
- "\xE9nk",
- "nk",
- "uk",
- "\xFCk",
- "em",
- "om",
- "am",
- "od",
- "ed",
- "ad",
- "\xF6d",
- "ja",
- "je",
- "\xE1m",
- "\xE1d",
- "\xE9m",
- "\xE9d",
- "m",
- "d",
- "a",
- "e",
- "o",
- "\xE1",
- "\xE9",
- )
- __step8_suffixes = (
- "jaitok",
- "jeitek",
- "jaink",
- "jeink",
- "aitok",
- "eitek",
- "\xE1itok",
- "\xE9itek",
- "jaim",
- "jeim",
- "jaid",
- "jeid",
- "eink",
- "aink",
- "itek",
- "jeik",
- "jaik",
- "\xE1ink",
- "\xE9ink",
- "aim",
- "eim",
- "aid",
- "eid",
- "jai",
- "jei",
- "ink",
- "aik",
- "eik",
- "\xE1im",
- "\xE1id",
- "\xE1ik",
- "\xE9im",
- "\xE9id",
- "\xE9ik",
- "im",
- "id",
- "ai",
- "ei",
- "ik",
- "\xE1i",
- "\xE9i",
- "i",
- )
- __step9_suffixes = ("\xE1k", "\xE9k", "\xF6k", "ok", "ek", "ak", "k")
+ __step6_suffixes = ('ok\xE9', '\xF6k\xE9', 'ak\xE9',
+ 'ek\xE9', '\xE1k\xE9', '\xE1\xE9i',
+ '\xE9k\xE9', '\xE9\xE9i', 'k\xE9',
+ '\xE9i', '\xE9\xE9', '\xE9')
+ __step7_suffixes = ('\xE1juk', '\xE9j\xFCk', '\xFCnk',
+ 'unk', 'juk', 'j\xFCk', '\xE1nk',
+ '\xE9nk', 'nk', 'uk', '\xFCk', 'em',
+ 'om', 'am', 'od', 'ed', 'ad', '\xF6d',
+ 'ja', 'je', '\xE1m', '\xE1d', '\xE9m',
+ '\xE9d', 'm', 'd', 'a', 'e', 'o',
+ '\xE1', '\xE9')
+ __step8_suffixes = ('jaitok', 'jeitek', 'jaink', 'jeink', 'aitok',
+ 'eitek', '\xE1itok', '\xE9itek', 'jaim',
+ 'jeim', 'jaid', 'jeid', 'eink', 'aink',
+ 'itek', 'jeik', 'jaik', '\xE1ink',
+ '\xE9ink', 'aim', 'eim', 'aid', 'eid',
+ 'jai', 'jei', 'ink', 'aik', 'eik',
+ '\xE1im', '\xE1id', '\xE1ik', '\xE9im',
+ '\xE9id', '\xE9ik', 'im', 'id', 'ai',
+ 'ei', 'ik', '\xE1i', '\xE9i', 'i')
+ __step9_suffixes = ("\xE1k", "\xE9k", "\xF6k", "ok",
+ "ek", "ak", "k")
def stem(self, word):
"""
# STEP 1: Remove instrumental case
if r1.endswith(self.__step1_suffixes):
for double_cons in self.__double_consonants:
- if word[-2 - len(double_cons) : -2] == double_cons:
+ if word[-2-len(double_cons):-2] == double_cons:
word = "".join((word[:-4], word[-3]))
- if r1[-2 - len(double_cons) : -2] == double_cons:
+ if r1[-2-len(double_cons):-2] == double_cons:
r1 = "".join((r1[:-4], r1[-3]))
break
for suffix in self.__step2_suffixes:
if word.endswith(suffix):
if r1.endswith(suffix):
- word = word[: -len(suffix)]
- r1 = r1[: -len(suffix)]
+ word = word[:-len(suffix)]
+ r1 = r1[:-len(suffix)]
if r1.endswith("\xE1"):
word = "".join((word[:-1], "a"))
word = suffix_replace(word, suffix, "e")
r1 = suffix_replace(r1, suffix, "e")
else:
- word = word[: -len(suffix)]
- r1 = r1[: -len(suffix)]
+ word = word[:-len(suffix)]
+ r1 = r1[:-len(suffix)]
break
# STEP 5: Remove factive case
for suffix in self.__step5_suffixes:
if r1.endswith(suffix):
for double_cons in self.__double_consonants:
- if word[-1 - len(double_cons) : -1] == double_cons:
+ if word[-1-len(double_cons):-1] == double_cons:
word = "".join((word[:-3], word[-2]))
- if r1[-1 - len(double_cons) : -1] == double_cons:
+ if r1[-1-len(double_cons):-1] == double_cons:
r1 = "".join((r1[:-3], r1[-2]))
break
word = suffix_replace(word, suffix, "a")
r1 = suffix_replace(r1, suffix, "a")
- elif suffix in ("\xE9k\xE9", "\xE9\xE9i", "\xE9\xE9"):
+ elif suffix in ("\xE9k\xE9", "\xE9\xE9i",
+ "\xE9\xE9"):
word = suffix_replace(word, suffix, "e")
r1 = suffix_replace(r1, suffix, "e")
else:
- word = word[: -len(suffix)]
- r1 = r1[: -len(suffix)]
+ word = word[:-len(suffix)]
+ r1 = r1[:-len(suffix)]
break
# STEP 7: Remove singular owner suffixes
for suffix in self.__step7_suffixes:
if word.endswith(suffix):
if r1.endswith(suffix):
- if suffix in ("\xE1nk", "\xE1juk", "\xE1m", "\xE1d", "\xE1"):
+ if suffix in ("\xE1nk", "\xE1juk", "\xE1m",
+ "\xE1d", "\xE1"):
word = suffix_replace(word, suffix, "a")
r1 = suffix_replace(r1, suffix, "a")
- elif suffix in ("\xE9nk", "\xE9j\xFCk", "\xE9m", "\xE9d", "\xE9"):
+ elif suffix in ("\xE9nk", "\xE9j\xFCk",
+ "\xE9m", "\xE9d", "\xE9"):
word = suffix_replace(word, suffix, "e")
r1 = suffix_replace(r1, suffix, "e")
else:
- word = word[: -len(suffix)]
- r1 = r1[: -len(suffix)]
+ word = word[:-len(suffix)]
+ r1 = r1[:-len(suffix)]
break
# STEP 8: Remove plural owner suffixes
for suffix in self.__step8_suffixes:
if word.endswith(suffix):
if r1.endswith(suffix):
- if suffix in (
- "\xE1im",
- "\xE1id",
- "\xE1i",
- "\xE1ink",
- "\xE1itok",
- "\xE1ik",
- ):
+ if suffix in ("\xE1im", "\xE1id", "\xE1i",
+ "\xE1ink", "\xE1itok", "\xE1ik"):
word = suffix_replace(word, suffix, "a")
r1 = suffix_replace(r1, suffix, "a")
- elif suffix in (
- "\xE9im",
- "\xE9id",
- "\xE9i",
- "\xE9ink",
- "\xE9itek",
- "\xE9ik",
- ):
+ elif suffix in ("\xE9im", "\xE9id", "\xE9i",
+ "\xE9ink", "\xE9itek", "\xE9ik"):
word = suffix_replace(word, suffix, "e")
r1 = suffix_replace(r1, suffix, "e")
else:
- word = word[: -len(suffix)]
- r1 = r1[: -len(suffix)]
+ word = word[:-len(suffix)]
+ r1 = r1[:-len(suffix)]
break
# STEP 9: Remove plural suffixes
elif suffix == "\xE9k":
word = suffix_replace(word, suffix, "e")
else:
- word = word[: -len(suffix)]
+ word = word[:-len(suffix)]
break
+
return word
+
+
def __r1_hungarian(self, word, vowels, digraphs):
"""
Return the region R1 that is used by the Hungarian stemmer.
if word[0] in vowels:
for digraph in digraphs:
if digraph in word[1:]:
- r1 = word[word.index(digraph[-1]) + 1 :]
+ r1 = word[word.index(digraph[-1])+1:]
return r1
for i in range(1, len(word)):
if word[i] not in vowels:
- r1 = word[i + 1 :]
+ r1 = word[i+1:]
break
else:
for i in range(1, len(word)):
if word[i] in vowels:
- r1 = word[i + 1 :]
+ r1 = word[i+1:]
break
return r1
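# Illustrative values under the definition implemented above:
#
#     # vowel-initial word containing a digraph: R1 starts after the digraph
#     __r1_hungarian("anya", vowels, digraphs)  # "ny" matches -> "a"
#     # consonant-initial word: R1 starts after the first vowel
#     __r1_hungarian("baba", vowels, digraphs)  # -> "ba"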
+
class ItalianStemmer(_StandardStemmer):
"""
"""
__vowels = "aeiou\xE0\xE8\xEC\xF2\xF9"
- __step0_suffixes = (
- "gliela",
- "gliele",
- "glieli",
- "glielo",
- "gliene",
- "sene",
- "mela",
- "mele",
- "meli",
- "melo",
- "mene",
- "tela",
- "tele",
- "teli",
- "telo",
- "tene",
- "cela",
- "cele",
- "celi",
- "celo",
- "cene",
- "vela",
- "vele",
- "veli",
- "velo",
- "vene",
- "gli",
- "ci",
- "la",
- "le",
- "li",
- "lo",
- "mi",
- "ne",
- "si",
- "ti",
- "vi",
- )
- __step1_suffixes = (
- "atrice",
- "atrici",
- "azione",
- "azioni",
- "uzione",
- "uzioni",
- "usione",
- "usioni",
- "amento",
- "amenti",
- "imento",
- "imenti",
- "amente",
- "abile",
- "abili",
- "ibile",
- "ibili",
- "mente",
- "atore",
- "atori",
- "logia",
- "logie",
- "anza",
- "anze",
- "iche",
- "ichi",
- "ismo",
- "ismi",
- "ista",
- "iste",
- "isti",
- "ist\xE0",
- "ist\xE8",
- "ist\xEC",
- "ante",
- "anti",
- "enza",
- "enze",
- "ico",
- "ici",
- "ica",
- "ice",
- "oso",
- "osi",
- "osa",
- "ose",
- "it\xE0",
- "ivo",
- "ivi",
- "iva",
- "ive",
- )
- __step2_suffixes = (
- "erebbero",
- "irebbero",
- "assero",
- "assimo",
- "eranno",
- "erebbe",
- "eremmo",
- "ereste",
- "eresti",
- "essero",
- "iranno",
- "irebbe",
- "iremmo",
- "ireste",
- "iresti",
- "iscano",
- "iscono",
- "issero",
- "arono",
- "avamo",
- "avano",
- "avate",
- "eremo",
- "erete",
- "erono",
- "evamo",
- "evano",
- "evate",
- "iremo",
- "irete",
- "irono",
- "ivamo",
- "ivano",
- "ivate",
- "ammo",
- "ando",
- "asse",
- "assi",
- "emmo",
- "enda",
- "ende",
- "endi",
- "endo",
- "erai",
- "erei",
- "Yamo",
- "iamo",
- "immo",
- "irai",
- "irei",
- "isca",
- "isce",
- "isci",
- "isco",
- "ano",
- "are",
- "ata",
- "ate",
- "ati",
- "ato",
- "ava",
- "avi",
- "avo",
- "er\xE0",
- "ere",
- "er\xF2",
- "ete",
- "eva",
- "evi",
- "evo",
- "ir\xE0",
- "ire",
- "ir\xF2",
- "ita",
- "ite",
- "iti",
- "ito",
- "iva",
- "ivi",
- "ivo",
- "ono",
- "uta",
- "ute",
- "uti",
- "uto",
- "ar",
- "ir",
- )
+ __step0_suffixes = ('gliela', 'gliele', 'glieli', 'glielo',
+ 'gliene', 'sene', 'mela', 'mele', 'meli',
+ 'melo', 'mene', 'tela', 'tele', 'teli',
+ 'telo', 'tene', 'cela', 'cele', 'celi',
+ 'celo', 'cene', 'vela', 'vele', 'veli',
+ 'velo', 'vene', 'gli', 'ci', 'la', 'le',
+ 'li', 'lo', 'mi', 'ne', 'si', 'ti', 'vi')
+ __step1_suffixes = ('atrice', 'atrici', 'azione', 'azioni',
+ 'uzione', 'uzioni', 'usione', 'usioni',
+ 'amento', 'amenti', 'imento', 'imenti',
+ 'amente', 'abile', 'abili', 'ibile', 'ibili',
+ 'mente', 'atore', 'atori', 'logia', 'logie',
+ 'anza', 'anze', 'iche', 'ichi', 'ismo',
+ 'ismi', 'ista', 'iste', 'isti', 'ist\xE0',
+ 'ist\xE8', 'ist\xEC', 'ante', 'anti',
+ 'enza', 'enze', 'ico', 'ici', 'ica', 'ice',
+ 'oso', 'osi', 'osa', 'ose', 'it\xE0',
+ 'ivo', 'ivi', 'iva', 'ive')
+ __step2_suffixes = ('erebbero', 'irebbero', 'assero', 'assimo',
+ 'eranno', 'erebbe', 'eremmo', 'ereste',
+ 'eresti', 'essero', 'iranno', 'irebbe',
+ 'iremmo', 'ireste', 'iresti', 'iscano',
+ 'iscono', 'issero', 'arono', 'avamo', 'avano',
+ 'avate', 'eremo', 'erete', 'erono', 'evamo',
+ 'evano', 'evate', 'iremo', 'irete', 'irono',
+ 'ivamo', 'ivano', 'ivate', 'ammo', 'ando',
+ 'asse', 'assi', 'emmo', 'enda', 'ende',
+ 'endi', 'endo', 'erai', 'erei', 'Yamo',
+ 'iamo', 'immo', 'irai', 'irei', 'isca',
+ 'isce', 'isci', 'isco', 'ano', 'are', 'ata',
+ 'ate', 'ati', 'ato', 'ava', 'avi', 'avo',
+ 'er\xE0', 'ere', 'er\xF2', 'ete', 'eva',
+ 'evi', 'evo', 'ir\xE0', 'ire', 'ir\xF2',
+ 'ita', 'ite', 'iti', 'ito', 'iva', 'ivi',
+ 'ivo', 'ono', 'uta', 'ute', 'uti', 'uto',
+ 'ar', 'ir')
def stem(self, word):
"""
step1_success = False
# All acute accents are replaced by grave accents.
- word = (
- word.replace("\xE1", "\xE0")
- .replace("\xE9", "\xE8")
- .replace("\xED", "\xEC")
- .replace("\xF3", "\xF2")
- .replace("\xFA", "\xF9")
- )
+ word = (word.replace("\xE1", "\xE0")
+ .replace("\xE9", "\xE8")
+ .replace("\xED", "\xEC")
+ .replace("\xF3", "\xF2")
+ .replace("\xFA", "\xF9"))
# Every occurrence of 'u' after 'q'
# is put into upper case.
for i in range(1, len(word)):
- if word[i - 1] == "q" and word[i] == "u":
- word = "".join((word[:i], "U", word[i + 1 :]))
+ if word[i-1] == "q" and word[i] == "u":
+ word = "".join((word[:i], "U", word[i+1:]))
# Every occurrence of 'u' and 'i'
# between vowels is put into upper case.
- for i in range(1, len(word) - 1):
- if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels:
+ for i in range(1, len(word)-1):
+ if word[i-1] in self.__vowels and word[i+1] in self.__vowels:
if word[i] == "u":
- word = "".join((word[:i], "U", word[i + 1 :]))
+ word = "".join((word[:i], "U", word[i+1:]))
- elif word[i] == "i":
- word = "".join((word[:i], "I", word[i + 1 :]))
+ elif word [i] == "i":
+ word = "".join((word[:i], "I", word[i+1:]))
r1, r2 = self._r1r2_standard(word, self.__vowels)
rv = self._rv_standard(word, self.__vowels)
# STEP 0: Attached pronoun
for suffix in self.__step0_suffixes:
if rv.endswith(suffix):
- if rv[-len(suffix) - 4 : -len(suffix)] in ("ando", "endo"):
- word = word[: -len(suffix)]
- r1 = r1[: -len(suffix)]
- r2 = r2[: -len(suffix)]
- rv = rv[: -len(suffix)]
-
- elif rv[-len(suffix) - 2 : -len(suffix)] in ("ar", "er", "ir"):
+ if rv[-len(suffix)-4:-len(suffix)] in ("ando", "endo"):
+ word = word[:-len(suffix)]
+ r1 = r1[:-len(suffix)]
+ r2 = r2[:-len(suffix)]
+ rv = rv[:-len(suffix)]
+
+ elif (rv[-len(suffix)-2:-len(suffix)] in
+ ("ar", "er", "ir")):
word = suffix_replace(word, suffix, "e")
r1 = suffix_replace(r1, suffix, "e")
r2 = suffix_replace(r2, suffix, "e")
word = word[:-2]
rv = rv[:-2]
- elif r2.endswith("abil"):
+ elif r2 .endswith("abil"):
word = word[:-4]
rv = rv[:-4]
- elif suffix in ("amento", "amenti", "imento", "imenti") and rv.endswith(
- suffix
- ):
+ elif (suffix in ("amento", "amenti",
+ "imento", "imenti") and
+ rv.endswith(suffix)):
step1_success = True
word = word[:-6]
rv = rv[:-6]
elif r2.endswith(suffix):
step1_success = True
if suffix in ("azione", "azioni", "atore", "atori"):
- word = word[: -len(suffix)]
- r2 = r2[: -len(suffix)]
- rv = rv[: -len(suffix)]
+ word = word[:-len(suffix)]
+ r2 = r2[:-len(suffix)]
+ rv = rv[:-len(suffix)]
if r2.endswith("ic"):
word = word[:-2]
word = word[:-2]
rv = word[:-2]
- elif suffix in ("uzione", "uzioni", "usione", "usioni"):
+ elif suffix in ("uzione", "uzioni",
+ "usione", "usioni"):
word = word[:-5]
rv = rv[:-5]
word = word[:-2]
rv = rv[:-2]
else:
- word = word[: -len(suffix)]
- rv = rv[: -len(suffix)]
+ word = word[:-len(suffix)]
+ rv = rv[:-len(suffix)]
break
# STEP 2: Verb suffixes
if not step1_success:
for suffix in self.__step2_suffixes:
if rv.endswith(suffix):
- word = word[: -len(suffix)]
- rv = rv[: -len(suffix)]
+ word = word[:-len(suffix)]
+ rv = rv[:-len(suffix)]
break
# STEP 3a
- if rv.endswith(("a", "e", "i", "o", "\xE0", "\xE8", "\xEC", "\xF2")):
+ if rv.endswith(("a", "e", "i", "o", "\xE0", "\xE8",
+ "\xEC", "\xF2")):
word = word[:-1]
rv = rv[:-1]
word = word.replace("I", "i").replace("U", "u")
+
return word
+
class NorwegianStemmer(_ScandinavianStemmer):
"""
__vowels = "aeiouy\xE6\xE5\xF8"
__s_ending = "bcdfghjlmnoprtvyz"
- __step1_suffixes = (
- "hetenes",
- "hetene",
- "hetens",
- "heter",
- "heten",
- "endes",
- "ande",
- "ende",
- "edes",
- "enes",
- "erte",
- "ede",
- "ane",
- "ene",
- "ens",
- "ers",
- "ets",
- "het",
- "ast",
- "ert",
- "en",
- "ar",
- "er",
- "as",
- "es",
- "et",
- "a",
- "e",
- "s",
- )
+ __step1_suffixes = ("hetenes", "hetene", "hetens", "heter",
+ "heten", "endes", "ande", "ende", "edes",
+ "enes", "erte", "ede", "ane", "ene", "ens",
+ "ers", "ets", "het", "ast", "ert", "en",
+ "ar", "er", "as", "es", "et", "a", "e", "s")
__step2_suffixes = ("dt", "vt")
- __step3_suffixes = (
- "hetslov",
- "eleg",
- "elig",
- "elov",
- "slov",
- "leg",
- "eig",
- "lig",
- "els",
- "lov",
- "ig",
- )
+ __step3_suffixes = ("hetslov", "eleg", "elig", "elov", "slov",
+ "leg", "eig", "lig", "els", "lov", "ig")
def stem(self, word):
"""
r1 = suffix_replace(r1, suffix, "er")
elif suffix == "s":
- if word[-2] in self.__s_ending or (
- word[-2] == "k" and word[-3] not in self.__vowels
- ):
+ if (word[-2] in self.__s_ending or
+ (word[-2] == "k" and word[-3] not in self.__vowels)):
word = word[:-1]
r1 = r1[:-1]
else:
- word = word[: -len(suffix)]
- r1 = r1[: -len(suffix)]
+ word = word[:-len(suffix)]
+ r1 = r1[:-len(suffix)]
break
# STEP 2
# STEP 3
for suffix in self.__step3_suffixes:
if r1.endswith(suffix):
- word = word[: -len(suffix)]
+ word = word[:-len(suffix)]
break
+
return word
+
class PortugueseStemmer(_StandardStemmer):
"""
"""
__vowels = "aeiou\xE1\xE9\xED\xF3\xFA\xE2\xEA\xF4"
- __step1_suffixes = (
- "amentos",
- "imentos",
- "uço~es",
- "amento",
- "imento",
- "adoras",
- "adores",
- "a\xE7o~es",
- "logias",
- "\xEAncias",
- "amente",
- "idades",
- "an\xE7as",
- "ismos",
- "istas",
- "adora",
- "a\xE7a~o",
- "antes",
- "\xE2ncia",
- "logia",
- "uça~o",
- "\xEAncia",
- "mente",
- "idade",
- "an\xE7a",
- "ezas",
- "icos",
- "icas",
- "ismo",
- "\xE1vel",
- "\xEDvel",
- "ista",
- "osos",
- "osas",
- "ador",
- "ante",
- "ivas",
- "ivos",
- "iras",
- "eza",
- "ico",
- "ica",
- "oso",
- "osa",
- "iva",
- "ivo",
- "ira",
- )
- __step2_suffixes = (
- "ar\xEDamos",
- "er\xEDamos",
- "ir\xEDamos",
- "\xE1ssemos",
- "\xEAssemos",
- "\xEDssemos",
- "ar\xEDeis",
- "er\xEDeis",
- "ir\xEDeis",
- "\xE1sseis",
- "\xE9sseis",
- "\xEDsseis",
- "\xE1ramos",
- "\xE9ramos",
- "\xEDramos",
- "\xE1vamos",
- "aremos",
- "eremos",
- "iremos",
- "ariam",
- "eriam",
- "iriam",
- "assem",
- "essem",
- "issem",
- "ara~o",
- "era~o",
- "ira~o",
- "arias",
- "erias",
- "irias",
- "ardes",
- "erdes",
- "irdes",
- "asses",
- "esses",
- "isses",
- "astes",
- "estes",
- "istes",
- "\xE1reis",
- "areis",
- "\xE9reis",
- "ereis",
- "\xEDreis",
- "ireis",
- "\xE1veis",
- "\xEDamos",
- "armos",
- "ermos",
- "irmos",
- "aria",
- "eria",
- "iria",
- "asse",
- "esse",
- "isse",
- "aste",
- "este",
- "iste",
- "arei",
- "erei",
- "irei",
- "aram",
- "eram",
- "iram",
- "avam",
- "arem",
- "erem",
- "irem",
- "ando",
- "endo",
- "indo",
- "adas",
- "idas",
- "ar\xE1s",
- "aras",
- "er\xE1s",
- "eras",
- "ir\xE1s",
- "avas",
- "ares",
- "eres",
- "ires",
- "\xEDeis",
- "ados",
- "idos",
- "\xE1mos",
- "amos",
- "emos",
- "imos",
- "iras",
- "ada",
- "ida",
- "ar\xE1",
- "ara",
- "er\xE1",
- "era",
- "ir\xE1",
- "ava",
- "iam",
- "ado",
- "ido",
- "ias",
- "ais",
- "eis",
- "ira",
- "ia",
- "ei",
- "am",
- "em",
- "ar",
- "er",
- "ir",
- "as",
- "es",
- "is",
- "eu",
- "iu",
- "ou",
- )
- __step4_suffixes = ("os", "a", "i", "o", "\xE1", "\xED", "\xF3")
+ __step1_suffixes = ('amentos', 'imentos', 'uço~es', 'amento',
+ 'imento', 'adoras', 'adores', 'a\xE7o~es',
+ 'logias', '\xEAncias', 'amente',
+ 'idades', 'an\xE7as', 'ismos', 'istas', 'adora',
+ 'a\xE7a~o', 'antes', '\xE2ncia',
+ 'logia', 'uça~o', '\xEAncia',
+ 'mente', 'idade', 'an\xE7a', 'ezas', 'icos', 'icas',
+ 'ismo', '\xE1vel', '\xEDvel', 'ista',
+ 'osos', 'osas', 'ador', 'ante', 'ivas',
+ 'ivos', 'iras', 'eza', 'ico', 'ica',
+ 'oso', 'osa', 'iva', 'ivo', 'ira')
+ __step2_suffixes = ('ar\xEDamos', 'er\xEDamos', 'ir\xEDamos',
+ '\xE1ssemos', '\xEAssemos', '\xEDssemos',
+ 'ar\xEDeis', 'er\xEDeis', 'ir\xEDeis',
+ '\xE1sseis', '\xE9sseis', '\xEDsseis',
+ '\xE1ramos', '\xE9ramos', '\xEDramos',
+ '\xE1vamos', 'aremos', 'eremos', 'iremos',
+ 'ariam', 'eriam', 'iriam', 'assem', 'essem',
+ 'issem', 'ara~o', 'era~o', 'ira~o', 'arias',
+ 'erias', 'irias', 'ardes', 'erdes', 'irdes',
+ 'asses', 'esses', 'isses', 'astes', 'estes',
+ 'istes', '\xE1reis', 'areis', '\xE9reis',
+ 'ereis', '\xEDreis', 'ireis', '\xE1veis',
+ '\xEDamos', 'armos', 'ermos', 'irmos',
+ 'aria', 'eria', 'iria', 'asse', 'esse',
+ 'isse', 'aste', 'este', 'iste', 'arei',
+ 'erei', 'irei', 'aram', 'eram', 'iram',
+ 'avam', 'arem', 'erem', 'irem',
+ 'ando', 'endo', 'indo', 'adas', 'idas',
+ 'ar\xE1s', 'aras', 'er\xE1s', 'eras',
+ 'ir\xE1s', 'avas', 'ares', 'eres', 'ires',
+ '\xEDeis', 'ados', 'idos', '\xE1mos',
+ 'amos', 'emos', 'imos', 'iras', 'ada', 'ida',
+ 'ar\xE1', 'ara', 'er\xE1', 'era',
+ 'ir\xE1', 'ava', 'iam', 'ado', 'ido',
+ 'ias', 'ais', 'eis', 'ira', 'ia', 'ei', 'am',
+ 'em', 'ar', 'er', 'ir', 'as',
+ 'es', 'is', 'eu', 'iu', 'ou')
+ __step4_suffixes = ("os", "a", "i", "o", "\xE1",
+ "\xED", "\xF3")
def stem(self, word):
"""
step1_success = False
step2_success = False
- word = (
- word.replace("\xE3", "a~")
- .replace("\xF5", "o~")
- .replace("q\xFC", "qu")
- .replace("g\xFC", "gu")
- )
+ word = (word.replace("\xE3", "a~")
+ .replace("\xF5", "o~")
+ .replace("q\xFC", "qu")
+ .replace("g\xFC", "gu"))
r1, r2 = self._r1r2_standard(word, self.__vowels)
rv = self._rv_standard(word, self.__vowels)
word = word[:-2]
rv = rv[:-2]
- elif (
- suffix in ("ira", "iras")
- and rv.endswith(suffix)
- and word[-len(suffix) - 1 : -len(suffix)] == "e"
- ):
+ elif (suffix in ("ira", "iras") and rv.endswith(suffix) and
+ word[-len(suffix)-1:-len(suffix)] == "e"):
step1_success = True
word = suffix_replace(word, suffix, "ir")
rv = rv[:-4]
elif suffix in ("idade", "idades"):
- word = word[: -len(suffix)]
- r2 = r2[: -len(suffix)]
- rv = rv[: -len(suffix)]
+ word = word[:-len(suffix)]
+ r2 = r2[:-len(suffix)]
+ rv = rv[:-len(suffix)]
if r2.endswith(("ic", "iv")):
word = word[:-2]
rv = rv[:-4]
elif suffix in ("iva", "ivo", "ivas", "ivos"):
- word = word[: -len(suffix)]
- r2 = r2[: -len(suffix)]
- rv = rv[: -len(suffix)]
+ word = word[:-len(suffix)]
+ r2 = r2[:-len(suffix)]
+ rv = rv[:-len(suffix)]
if r2.endswith("at"):
word = word[:-2]
rv = rv[:-2]
else:
- word = word[: -len(suffix)]
- rv = rv[: -len(suffix)]
+ word = word[:-len(suffix)]
+ rv = rv[:-len(suffix)]
break
# STEP 2: Verb suffixes
if rv.endswith(suffix):
step2_success = True
- word = word[: -len(suffix)]
- rv = rv[: -len(suffix)]
+ word = word[:-len(suffix)]
+ rv = rv[:-len(suffix)]
break
# STEP 3
if not step1_success and not step2_success:
for suffix in self.__step4_suffixes:
if rv.endswith(suffix):
- word = word[: -len(suffix)]
- rv = rv[: -len(suffix)]
+ word = word[:-len(suffix)]
+ rv = rv[:-len(suffix)]
break
# STEP 5
word = word[:-1]
rv = rv[:-1]
- if (word.endswith("gu") and rv.endswith("u")) or (
- word.endswith("ci") and rv.endswith("i")
- ):
+ if ((word.endswith("gu") and rv.endswith("u")) or
+ (word.endswith("ci") and rv.endswith("i"))):
word = word[:-1]
elif word.endswith("\xE7"):
word = word.replace("a~", "\xE3").replace("o~", "\xF5")
+
return word
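# A note on the tilde placeholders (illustrative): "\xE3"/"\xF5" are encoded
# as the two-character sequences "a~"/"o~" on entry so that suffixes such as
# 'a\xE7a~o' can be matched with plain str.endswith(), then restored just
# before returning:
#
#     "cora\xE7\xE3o".replace("\xE3", "a~")  # -> "cora\xE7a~o", ends with 'a\xE7a~o'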
+
class RomanianStemmer(_StandardStemmer):
"""
"""
__vowels = "aeiou\u0103\xE2\xEE"
- __step0_suffixes = (
- "iilor",
- "ului",
- "elor",
- "iile",
- "ilor",
- "atei",
- "a\u0163ie",
- "a\u0163ia",
- "aua",
- "ele",
- "iua",
- "iei",
- "ile",
- "ul",
- "ea",
- "ii",
- )
- __step1_suffixes = (
- "abilitate",
- "abilitati",
- "abilit\u0103\u0163i",
- "ibilitate",
- "abilit\u0103i",
- "ivitate",
- "ivitati",
- "ivit\u0103\u0163i",
- "icitate",
- "icitati",
- "icit\u0103\u0163i",
- "icatori",
- "ivit\u0103i",
- "icit\u0103i",
- "icator",
- "a\u0163iune",
- "atoare",
- "\u0103toare",
- "i\u0163iune",
- "itoare",
- "iciva",
- "icive",
- "icivi",
- "iciv\u0103",
- "icala",
- "icale",
- "icali",
- "ical\u0103",
- "ativa",
- "ative",
- "ativi",
- "ativ\u0103",
- "atori",
- "\u0103tori",
- "itiva",
- "itive",
- "itivi",
- "itiv\u0103",
- "itori",
- "iciv",
- "ical",
- "ativ",
- "ator",
- "\u0103tor",
- "itiv",
- "itor",
- )
- __step2_suffixes = (
- "abila",
- "abile",
- "abili",
- "abil\u0103",
- "ibila",
- "ibile",
- "ibili",
- "ibil\u0103",
- "atori",
- "itate",
- "itati",
- "it\u0103\u0163i",
- "abil",
- "ibil",
- "oasa",
- "oas\u0103",
- "oase",
- "anta",
- "ante",
- "anti",
- "ant\u0103",
- "ator",
- "it\u0103i",
- "iune",
- "iuni",
- "isme",
- "ista",
- "iste",
- "isti",
- "ist\u0103",
- "i\u015Fti",
- "ata",
- "at\u0103",
- "ati",
- "ate",
- "uta",
- "ut\u0103",
- "uti",
- "ute",
- "ita",
- "it\u0103",
- "iti",
- "ite",
- "ica",
- "ice",
- "ici",
- "ic\u0103",
- "osi",
- "o\u015Fi",
- "ant",
- "iva",
- "ive",
- "ivi",
- "iv\u0103",
- "ism",
- "ist",
- "at",
- "ut",
- "it",
- "ic",
- "os",
- "iv",
- )
- __step3_suffixes = (
- "seser\u0103\u0163i",
- "aser\u0103\u0163i",
- "iser\u0103\u0163i",
- "\xE2ser\u0103\u0163i",
- "user\u0103\u0163i",
- "seser\u0103m",
- "aser\u0103m",
- "iser\u0103m",
- "\xE2ser\u0103m",
- "user\u0103m",
- "ser\u0103\u0163i",
- "sese\u015Fi",
- "seser\u0103",
- "easc\u0103",
- "ar\u0103\u0163i",
- "ur\u0103\u0163i",
- "ir\u0103\u0163i",
- "\xE2r\u0103\u0163i",
- "ase\u015Fi",
- "aser\u0103",
- "ise\u015Fi",
- "iser\u0103",
- "\xe2se\u015Fi",
- "\xE2ser\u0103",
- "use\u015Fi",
- "user\u0103",
- "ser\u0103m",
- "sesem",
- "indu",
- "\xE2ndu",
- "eaz\u0103",
- "e\u015Fti",
- "e\u015Fte",
- "\u0103\u015Fti",
- "\u0103\u015Fte",
- "ea\u0163i",
- "ia\u0163i",
- "ar\u0103m",
- "ur\u0103m",
- "ir\u0103m",
- "\xE2r\u0103m",
- "asem",
- "isem",
- "\xE2sem",
- "usem",
- "se\u015Fi",
- "ser\u0103",
- "sese",
- "are",
- "ere",
- "ire",
- "\xE2re",
- "ind",
- "\xE2nd",
- "eze",
- "ezi",
- "esc",
- "\u0103sc",
- "eam",
- "eai",
- "eau",
- "iam",
- "iai",
- "iau",
- "a\u015Fi",
- "ar\u0103",
- "u\u015Fi",
- "ur\u0103",
- "i\u015Fi",
- "ir\u0103",
- "\xE2\u015Fi",
- "\xe2r\u0103",
- "ase",
- "ise",
- "\xE2se",
- "use",
- "a\u0163i",
- "e\u0163i",
- "i\u0163i",
- "\xe2\u0163i",
- "sei",
- "ez",
- "am",
- "ai",
- "au",
- "ea",
- "ia",
- "ui",
- "\xE2i",
- "\u0103m",
- "em",
- "im",
- "\xE2m",
- "se",
- )
+ __step0_suffixes = ('iilor', 'ului', 'elor', 'iile', 'ilor',
+ 'atei', 'a\u0163ie', 'a\u0163ia', 'aua',
+ 'ele', 'iua', 'iei', 'ile', 'ul', 'ea',
+ 'ii')
+ __step1_suffixes = ('abilitate', 'abilitati', 'abilit\u0103\u0163i',
+ 'ibilitate', 'abilit\u0103i', 'ivitate',
+ 'ivitati', 'ivit\u0103\u0163i', 'icitate',
+ 'icitati', 'icit\u0103\u0163i', 'icatori',
+ 'ivit\u0103i', 'icit\u0103i', 'icator',
+ 'a\u0163iune', 'atoare', '\u0103toare',
+ 'i\u0163iune', 'itoare', 'iciva', 'icive',
+ 'icivi', 'iciv\u0103', 'icala', 'icale',
+ 'icali', 'ical\u0103', 'ativa', 'ative',
+ 'ativi', 'ativ\u0103', 'atori', '\u0103tori',
+ 'itiva', 'itive', 'itivi', 'itiv\u0103',
+ 'itori', 'iciv', 'ical', 'ativ', 'ator',
+ '\u0103tor', 'itiv', 'itor')
+ __step2_suffixes = ('abila', 'abile', 'abili', 'abil\u0103',
+ 'ibila', 'ibile', 'ibili', 'ibil\u0103',
+ 'atori', 'itate', 'itati', 'it\u0103\u0163i',
+ 'abil', 'ibil', 'oasa', 'oas\u0103', 'oase',
+ 'anta', 'ante', 'anti', 'ant\u0103', 'ator',
+ 'it\u0103i', 'iune', 'iuni', 'isme', 'ista',
+ 'iste', 'isti', 'ist\u0103', 'i\u015Fti',
+ 'ata', 'at\u0103', 'ati', 'ate', 'uta',
+ 'ut\u0103', 'uti', 'ute', 'ita', 'it\u0103',
+ 'iti', 'ite', 'ica', 'ice', 'ici', 'ic\u0103',
+ 'osi', 'o\u015Fi', 'ant', 'iva', 'ive', 'ivi',
+ 'iv\u0103', 'ism', 'ist', 'at', 'ut', 'it',
+ 'ic', 'os', 'iv')
+ __step3_suffixes = ('seser\u0103\u0163i', 'aser\u0103\u0163i',
+ 'iser\u0103\u0163i', '\xE2ser\u0103\u0163i',
+ 'user\u0103\u0163i', 'seser\u0103m',
+ 'aser\u0103m', 'iser\u0103m', '\xE2ser\u0103m',
+ 'user\u0103m', 'ser\u0103\u0163i', 'sese\u015Fi',
+ 'seser\u0103', 'easc\u0103', 'ar\u0103\u0163i',
+ 'ur\u0103\u0163i', 'ir\u0103\u0163i',
+ '\xE2r\u0103\u0163i', 'ase\u015Fi',
+ 'aser\u0103', 'ise\u015Fi', 'iser\u0103',
+ '\xe2se\u015Fi', '\xE2ser\u0103',
+ 'use\u015Fi', 'user\u0103', 'ser\u0103m',
+ 'sesem', 'indu', '\xE2ndu', 'eaz\u0103',
+ 'e\u015Fti', 'e\u015Fte', '\u0103\u015Fti',
+ '\u0103\u015Fte', 'ea\u0163i', 'ia\u0163i',
+ 'ar\u0103m', 'ur\u0103m', 'ir\u0103m',
+ '\xE2r\u0103m', 'asem', 'isem',
+ '\xE2sem', 'usem', 'se\u015Fi', 'ser\u0103',
+ 'sese', 'are', 'ere', 'ire', '\xE2re',
+ 'ind', '\xE2nd', 'eze', 'ezi', 'esc',
+ '\u0103sc', 'eam', 'eai', 'eau', 'iam',
+ 'iai', 'iau', 'a\u015Fi', 'ar\u0103',
+ 'u\u015Fi', 'ur\u0103', 'i\u015Fi', 'ir\u0103',
+ '\xE2\u015Fi', '\xe2r\u0103', 'ase',
+ 'ise', '\xE2se', 'use', 'a\u0163i',
+ 'e\u0163i', 'i\u0163i', '\xe2\u0163i', 'sei',
+ 'ez', 'am', 'ai', 'au', 'ea', 'ia', 'ui',
+ '\xE2i', '\u0103m', 'em', 'im', '\xE2m',
+ 'se')
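+ # (Each suffix tuple above is ordered longest-first; stem() below takes
+ #  the first endswith() match and breaks, so this ordering is load-bearing.)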
def stem(self, word):
"""
step1_success = False
step2_success = False
- for i in range(1, len(word) - 1):
- if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels:
+ for i in range(1, len(word)-1):
+ if word[i-1] in self.__vowels and word[i+1] in self.__vowels:
if word[i] == "u":
- word = "".join((word[:i], "U", word[i + 1 :]))
+ word = "".join((word[:i], "U", word[i+1:]))
elif word[i] == "i":
- word = "".join((word[:i], "I", word[i + 1 :]))
+ word = "".join((word[:i], "I", word[i+1:]))
r1, r2 = self._r1r2_standard(word, self.__vowels)
rv = self._rv_standard(word, self.__vowels)
if word.endswith(suffix):
if suffix in r1:
if suffix in ("ul", "ului"):
- word = word[: -len(suffix)]
+ word = word[:-len(suffix)]
if suffix in rv:
- rv = rv[: -len(suffix)]
+ rv = rv[:-len(suffix)]
else:
rv = ""
- elif (
- suffix == "aua"
- or suffix == "atei"
- or (suffix == "ile" and word[-5:-3] != "ab")
- ):
+ elif (suffix == "aua" or suffix == "atei" or
+ (suffix == "ile" and word[-5:-3] != "ab")):
word = word[:-2]
elif suffix in ("ea", "ele", "elor"):
else:
rv = ""
- elif suffix in ("ii", "iua", "iei", "iile", "iilor", "ilor"):
+ elif suffix in ("ii", "iua", "iei",
+ "iile", "iilor", "ilor"):
word = suffix_replace(word, suffix, "i")
if suffix in rv:
step1_success = True
replacement_done = True
- if suffix in (
- "abilitate",
- "abilitati",
- "abilit\u0103i",
- "abilit\u0103\u0163i",
- ):
+ if suffix in ("abilitate", "abilitati",
+ "abilit\u0103i",
+ "abilit\u0103\u0163i"):
word = suffix_replace(word, suffix, "abil")
elif suffix == "ibilitate":
word = word[:-5]
- elif suffix in (
- "ivitate",
- "ivitati",
- "ivit\u0103i",
- "ivit\u0103\u0163i",
- ):
+ elif suffix in ("ivitate", "ivitati",
+ "ivit\u0103i",
+ "ivit\u0103\u0163i"):
word = suffix_replace(word, suffix, "iv")
- elif suffix in (
- "icitate",
- "icitati",
- "icit\u0103i",
- "icit\u0103\u0163i",
- "icator",
- "icatori",
- "iciv",
- "iciva",
- "icive",
- "icivi",
- "iciv\u0103",
- "ical",
- "icala",
- "icale",
- "icali",
- "ical\u0103",
- ):
+ elif suffix in ("icitate", "icitati", "icit\u0103i",
+ "icit\u0103\u0163i", "icator",
+ "icatori", "iciv", "iciva",
+ "icive", "icivi", "iciv\u0103",
+ "ical", "icala", "icale", "icali",
+ "ical\u0103"):
word = suffix_replace(word, suffix, "ic")
- elif suffix in (
- "ativ",
- "ativa",
- "ative",
- "ativi",
- "ativ\u0103",
- "a\u0163iune",
- "atoare",
- "ator",
- "atori",
- "\u0103toare",
- "\u0103tor",
- "\u0103tori",
- ):
+ elif suffix in ("ativ", "ativa", "ative", "ativi",
+ "ativ\u0103", "a\u0163iune",
+ "atoare", "ator", "atori",
+ "\u0103toare",
+ "\u0103tor", "\u0103tori"):
word = suffix_replace(word, suffix, "at")
if suffix in r2:
r2 = suffix_replace(r2, suffix, "at")
- elif suffix in (
- "itiv",
- "itiva",
- "itive",
- "itivi",
- "itiv\u0103",
- "i\u0163iune",
- "itoare",
- "itor",
- "itori",
- ):
+ elif suffix in ("itiv", "itiva", "itive", "itivi",
+ "itiv\u0103", "i\u0163iune",
+ "itoare", "itor", "itori"):
word = suffix_replace(word, suffix, "it")
if suffix in r2:
if word[-5] == "\u0163":
word = "".join((word[:-5], "t"))
- elif suffix in (
- "ism",
- "isme",
- "ist",
- "ista",
- "iste",
- "isti",
- "ist\u0103",
- "i\u015Fti",
- ):
+ elif suffix in ("ism", "isme", "ist", "ista", "iste",
+ "isti", "ist\u0103", "i\u015Fti"):
word = suffix_replace(word, suffix, "ist")
else:
- word = word[: -len(suffix)]
+ word = word[:-len(suffix)]
break
# STEP 3: Removal of verb suffixes
for suffix in self.__step3_suffixes:
if word.endswith(suffix):
if suffix in rv:
- if suffix in (
- "seser\u0103\u0163i",
- "seser\u0103m",
- "ser\u0103\u0163i",
- "sese\u015Fi",
- "seser\u0103",
- "ser\u0103m",
- "sesem",
- "se\u015Fi",
- "ser\u0103",
- "sese",
- "a\u0163i",
- "e\u0163i",
- "i\u0163i",
- "\xE2\u0163i",
- "sei",
- "\u0103m",
- "em",
- "im",
- "\xE2m",
- "se",
- ):
- word = word[: -len(suffix)]
- rv = rv[: -len(suffix)]
+ if suffix in ('seser\u0103\u0163i', 'seser\u0103m',
+ 'ser\u0103\u0163i', 'sese\u015Fi',
+ 'seser\u0103', 'ser\u0103m', 'sesem',
+ 'se\u015Fi', 'ser\u0103', 'sese',
+ 'a\u0163i', 'e\u0163i', 'i\u0163i',
+ '\xE2\u0163i', 'sei', '\u0103m',
+ 'em', 'im', '\xE2m', 'se'):
+ word = word[:-len(suffix)]
+ rv = rv[:-len(suffix)]
else:
- if (
- not rv.startswith(suffix)
- and rv[rv.index(suffix) - 1] not in "aeio\u0103\xE2\xEE"
- ):
- word = word[: -len(suffix)]
+ if (not rv.startswith(suffix) and
+ rv[rv.index(suffix)-1] not in
+ "aeio\u0103\xE2\xEE"):
+ word = word[:-len(suffix)]
break
# STEP 4: Removal of final vowel
for suffix in ("ie", "a", "e", "i", "\u0103"):
if word.endswith(suffix):
if suffix in rv:
- word = word[: -len(suffix)]
+ word = word[:-len(suffix)]
break
word = word.replace("I", "i").replace("U", "u")
+
return word
+
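# A minimal usage sketch for the Romanian stemmer assembled above, assuming
# NLTK and its Snowball data are installed (the sample word is illustrative):
from nltk.stem.snowball import SnowballStemmer
ro_stemmer = SnowballStemmer("romanian")
print(ro_stemmer.stem("copiilor"))  # step 0 strips plural/genitive endings like "iilor"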
class RussianStemmer(_LanguageSpecificStemmer):
"""
"""
- __perfective_gerund_suffixes = (
- "ivshis'",
- "yvshis'",
- "vshis'",
- "ivshi",
- "yvshi",
- "vshi",
- "iv",
- "yv",
- "v",
- )
- __adjectival_suffixes = (
- "ui^ushchi^ui^u",
- "ui^ushchi^ai^a",
- "ui^ushchimi",
- "ui^ushchymi",
- "ui^ushchego",
- "ui^ushchogo",
- "ui^ushchemu",
- "ui^ushchomu",
- "ui^ushchikh",
- "ui^ushchykh",
- "ui^ushchui^u",
- "ui^ushchaia",
- "ui^ushchoi^u",
- "ui^ushchei^u",
- "i^ushchi^ui^u",
- "i^ushchi^ai^a",
- "ui^ushchee",
- "ui^ushchie",
- "ui^ushchye",
- "ui^ushchoe",
- "ui^ushchei`",
- "ui^ushchii`",
- "ui^ushchyi`",
- "ui^ushchoi`",
- "ui^ushchem",
- "ui^ushchim",
- "ui^ushchym",
- "ui^ushchom",
- "i^ushchimi",
- "i^ushchymi",
- "i^ushchego",
- "i^ushchogo",
- "i^ushchemu",
- "i^ushchomu",
- "i^ushchikh",
- "i^ushchykh",
- "i^ushchui^u",
- "i^ushchai^a",
- "i^ushchoi^u",
- "i^ushchei^u",
- "i^ushchee",
- "i^ushchie",
- "i^ushchye",
- "i^ushchoe",
- "i^ushchei`",
- "i^ushchii`",
- "i^ushchyi`",
- "i^ushchoi`",
- "i^ushchem",
- "i^ushchim",
- "i^ushchym",
- "i^ushchom",
- "shchi^ui^u",
- "shchi^ai^a",
- "ivshi^ui^u",
- "ivshi^ai^a",
- "yvshi^ui^u",
- "yvshi^ai^a",
- "shchimi",
- "shchymi",
- "shchego",
- "shchogo",
- "shchemu",
- "shchomu",
- "shchikh",
- "shchykh",
- "shchui^u",
- "shchai^a",
- "shchoi^u",
- "shchei^u",
- "ivshimi",
- "ivshymi",
- "ivshego",
- "ivshogo",
- "ivshemu",
- "ivshomu",
- "ivshikh",
- "ivshykh",
- "ivshui^u",
- "ivshai^a",
- "ivshoi^u",
- "ivshei^u",
- "yvshimi",
- "yvshymi",
- "yvshego",
- "yvshogo",
- "yvshemu",
- "yvshomu",
- "yvshikh",
- "yvshykh",
- "yvshui^u",
- "yvshai^a",
- "yvshoi^u",
- "yvshei^u",
- "vshi^ui^u",
- "vshi^ai^a",
- "shchee",
- "shchie",
- "shchye",
- "shchoe",
- "shchei`",
- "shchii`",
- "shchyi`",
- "shchoi`",
- "shchem",
- "shchim",
- "shchym",
- "shchom",
- "ivshee",
- "ivshie",
- "ivshye",
- "ivshoe",
- "ivshei`",
- "ivshii`",
- "ivshyi`",
- "ivshoi`",
- "ivshem",
- "ivshim",
- "ivshym",
- "ivshom",
- "yvshee",
- "yvshie",
- "yvshye",
- "yvshoe",
- "yvshei`",
- "yvshii`",
- "yvshyi`",
- "yvshoi`",
- "yvshem",
- "yvshim",
- "yvshym",
- "yvshom",
- "vshimi",
- "vshymi",
- "vshego",
- "vshogo",
- "vshemu",
- "vshomu",
- "vshikh",
- "vshykh",
- "vshui^u",
- "vshai^a",
- "vshoi^u",
- "vshei^u",
- "emi^ui^u",
- "emi^ai^a",
- "nni^ui^u",
- "nni^ai^a",
- "vshee",
- "vshie",
- "vshye",
- "vshoe",
- "vshei`",
- "vshii`",
- "vshyi`",
- "vshoi`",
- "vshem",
- "vshim",
- "vshym",
- "vshom",
- "emimi",
- "emymi",
- "emego",
- "emogo",
- "ememu",
- "emomu",
- "emikh",
- "emykh",
- "emui^u",
- "emai^a",
- "emoi^u",
- "emei^u",
- "nnimi",
- "nnymi",
- "nnego",
- "nnogo",
- "nnemu",
- "nnomu",
- "nnikh",
- "nnykh",
- "nnui^u",
- "nnai^a",
- "nnoi^u",
- "nnei^u",
- "emee",
- "emie",
- "emye",
- "emoe",
- "emei`",
- "emii`",
- "emyi`",
- "emoi`",
- "emem",
- "emim",
- "emym",
- "emom",
- "nnee",
- "nnie",
- "nnye",
- "nnoe",
- "nnei`",
- "nnii`",
- "nnyi`",
- "nnoi`",
- "nnem",
- "nnim",
- "nnym",
- "nnom",
- "i^ui^u",
- "i^ai^a",
- "imi",
- "ymi",
- "ego",
- "ogo",
- "emu",
- "omu",
- "ikh",
- "ykh",
- "ui^u",
- "ai^a",
- "oi^u",
- "ei^u",
- "ee",
- "ie",
- "ye",
- "oe",
- "ei`",
- "ii`",
- "yi`",
- "oi`",
- "em",
- "im",
- "ym",
- "om",
- )
+ __perfective_gerund_suffixes = ("ivshis'", "yvshis'", "vshis'",
+ "ivshi", "yvshi", "vshi", "iv",
+ "yv", "v")
+ __adjectival_suffixes = ('ui^ushchi^ui^u', 'ui^ushchi^ai^a',
+ 'ui^ushchimi', 'ui^ushchymi', 'ui^ushchego',
+ 'ui^ushchogo', 'ui^ushchemu', 'ui^ushchomu',
+ 'ui^ushchikh', 'ui^ushchykh',
+ 'ui^ushchui^u', 'ui^ushchaia',
+ 'ui^ushchoi^u', 'ui^ushchei^u',
+ 'i^ushchi^ui^u', 'i^ushchi^ai^a',
+ 'ui^ushchee', 'ui^ushchie',
+ 'ui^ushchye', 'ui^ushchoe', 'ui^ushchei`',
+ 'ui^ushchii`', 'ui^ushchyi`',
+ 'ui^ushchoi`', 'ui^ushchem', 'ui^ushchim',
+ 'ui^ushchym', 'ui^ushchom', 'i^ushchimi',
+ 'i^ushchymi', 'i^ushchego', 'i^ushchogo',
+ 'i^ushchemu', 'i^ushchomu', 'i^ushchikh',
+ 'i^ushchykh', 'i^ushchui^u', 'i^ushchai^a',
+ 'i^ushchoi^u', 'i^ushchei^u', 'i^ushchee',
+ 'i^ushchie', 'i^ushchye', 'i^ushchoe',
+ 'i^ushchei`', 'i^ushchii`',
+ 'i^ushchyi`', 'i^ushchoi`', 'i^ushchem',
+ 'i^ushchim', 'i^ushchym', 'i^ushchom',
+ 'shchi^ui^u', 'shchi^ai^a', 'ivshi^ui^u',
+ 'ivshi^ai^a', 'yvshi^ui^u', 'yvshi^ai^a',
+ 'shchimi', 'shchymi', 'shchego', 'shchogo',
+ 'shchemu', 'shchomu', 'shchikh', 'shchykh',
+ 'shchui^u', 'shchai^a', 'shchoi^u',
+ 'shchei^u', 'ivshimi', 'ivshymi',
+ 'ivshego', 'ivshogo', 'ivshemu', 'ivshomu',
+ 'ivshikh', 'ivshykh', 'ivshui^u',
+ 'ivshai^a', 'ivshoi^u', 'ivshei^u',
+ 'yvshimi', 'yvshymi', 'yvshego', 'yvshogo',
+ 'yvshemu', 'yvshomu', 'yvshikh', 'yvshykh',
+ 'yvshui^u', 'yvshai^a', 'yvshoi^u',
+ 'yvshei^u', 'vshi^ui^u', 'vshi^ai^a',
+ 'shchee', 'shchie', 'shchye', 'shchoe',
+ 'shchei`', 'shchii`', 'shchyi`', 'shchoi`',
+ 'shchem', 'shchim', 'shchym', 'shchom',
+ 'ivshee', 'ivshie', 'ivshye', 'ivshoe',
+ 'ivshei`', 'ivshii`', 'ivshyi`',
+ 'ivshoi`', 'ivshem', 'ivshim', 'ivshym',
+ 'ivshom', 'yvshee', 'yvshie', 'yvshye',
+ 'yvshoe', 'yvshei`', 'yvshii`',
+ 'yvshyi`', 'yvshoi`', 'yvshem',
+ 'yvshim', 'yvshym', 'yvshom', 'vshimi',
+ 'vshymi', 'vshego', 'vshogo', 'vshemu',
+ 'vshomu', 'vshikh', 'vshykh', 'vshui^u',
+ 'vshai^a', 'vshoi^u', 'vshei^u',
+ 'emi^ui^u', 'emi^ai^a', 'nni^ui^u',
+ 'nni^ai^a', 'vshee',
+ 'vshie', 'vshye', 'vshoe', 'vshei`',
+ 'vshii`', 'vshyi`', 'vshoi`',
+ 'vshem', 'vshim', 'vshym', 'vshom',
+ 'emimi', 'emymi', 'emego', 'emogo',
+ 'ememu', 'emomu', 'emikh', 'emykh',
+ 'emui^u', 'emai^a', 'emoi^u', 'emei^u',
+ 'nnimi', 'nnymi', 'nnego', 'nnogo',
+ 'nnemu', 'nnomu', 'nnikh', 'nnykh',
+ 'nnui^u', 'nnai^a', 'nnoi^u', 'nnei^u',
+ 'emee', 'emie', 'emye', 'emoe',
+ 'emei`', 'emii`', 'emyi`',
+ 'emoi`', 'emem', 'emim', 'emym',
+ 'emom', 'nnee', 'nnie', 'nnye', 'nnoe',
+ 'nnei`', 'nnii`', 'nnyi`',
+ 'nnoi`', 'nnem', 'nnim', 'nnym',
+ 'nnom', 'i^ui^u', 'i^ai^a', 'imi', 'ymi',
+ 'ego', 'ogo', 'emu', 'omu', 'ikh',
+ 'ykh', 'ui^u', 'ai^a', 'oi^u', 'ei^u',
+ 'ee', 'ie', 'ye', 'oe', 'ei`',
+ 'ii`', 'yi`', 'oi`', 'em',
+ 'im', 'ym', 'om')
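+ # (The "i^u", "i^a", "i`" and "e`" sequences above are the romanized forms
+ #  of ю, я, й and э produced by __cyrillic_to_roman() below; all suffix
+ #  matching runs on the transliterated word.)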
__reflexive_suffixes = ("si^a", "s'")
- __verb_suffixes = (
- "esh'",
- "ei`te",
- "ui`te",
- "ui^ut",
- "ish'",
- "ete",
- "i`te",
- "i^ut",
- "nno",
- "ila",
- "yla",
- "ena",
- "ite",
- "ili",
- "yli",
- "ilo",
- "ylo",
- "eno",
- "i^at",
- "uet",
- "eny",
- "it'",
- "yt'",
- "ui^u",
- "la",
- "na",
- "li",
- "em",
- "lo",
- "no",
- "et",
- "ny",
- "t'",
- "ei`",
- "ui`",
- "il",
- "yl",
- "im",
- "ym",
- "en",
- "it",
- "yt",
- "i^u",
- "i`",
- "l",
- "n",
- )
- __noun_suffixes = (
- "ii^ami",
- "ii^akh",
- "i^ami",
- "ii^am",
- "i^akh",
- "ami",
- "iei`",
- "i^am",
- "iem",
- "akh",
- "ii^u",
- "'i^u",
- "ii^a",
- "'i^a",
- "ev",
- "ov",
- "ie",
- "'e",
- "ei",
- "ii",
- "ei`",
- "oi`",
- "ii`",
- "em",
- "am",
- "om",
- "i^u",
- "i^a",
- "a",
- "e",
- "i",
- "i`",
- "o",
- "u",
- "y",
- "'",
- )
+ __verb_suffixes = ("esh'", 'ei`te', 'ui`te', 'ui^ut',
+ "ish'", 'ete', 'i`te', 'i^ut', 'nno',
+ 'ila', 'yla', 'ena', 'ite', 'ili', 'yli',
+ 'ilo', 'ylo', 'eno', 'i^at', 'uet', 'eny',
+ "it'", "yt'", 'ui^u', 'la', 'na', 'li',
+ 'em', 'lo', 'no', 'et', 'ny', "t'",
+ 'ei`', 'ui`', 'il', 'yl', 'im',
+ 'ym', 'en', 'it', 'yt', 'i^u', 'i`',
+ 'l', 'n')
+ __noun_suffixes = ('ii^ami', 'ii^akh', 'i^ami', 'ii^am', 'i^akh',
+ 'ami', 'iei`', 'i^am', 'iem', 'akh',
+ 'ii^u', "'i^u", 'ii^a', "'i^a", 'ev', 'ov',
+ 'ie', "'e", 'ei', 'ii', 'ei`',
+ 'oi`', 'ii`', 'em', 'am', 'om',
+ 'i^u', 'i^a', 'a', 'e', 'i', 'i`',
+ 'o', 'u', 'y', "'")
__superlative_suffixes = ("ei`she", "ei`sh")
__derivational_suffixes = ("ost'", "ost")
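# (Per the Snowball Russian algorithm these implement: step 1 tries the
#  perfective-gerund endings, else strips a reflexive ending and then the
#  adjectival/verb/noun endings; step 2 drops a final "i"; step 3 removes a
#  derivational "ost"/"ost'" inside R2; step 4 undoubles "nn", strips the
#  superlative endings and a trailing "'".)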
chr_exceeded = True
break
- if not chr_exceeded:
- return word
-
- word = self.__cyrillic_to_roman(word)
+ if chr_exceeded:
+ word = self.__cyrillic_to_roman(word)
step1_success = False
adjectival_removed = False
for suffix in self.__perfective_gerund_suffixes:
if rv.endswith(suffix):
if suffix in ("v", "vshi", "vshis'"):
- if (
- rv[-len(suffix) - 3 : -len(suffix)] == "i^a"
- or rv[-len(suffix) - 1 : -len(suffix)] == "a"
- ):
- word = word[: -len(suffix)]
- r2 = r2[: -len(suffix)]
- rv = rv[: -len(suffix)]
+ if (rv[-len(suffix)-3:-len(suffix)] == "i^a" or
+ rv[-len(suffix)-1:-len(suffix)] == "a"):
+ word = word[:-len(suffix)]
+ r2 = r2[:-len(suffix)]
+ rv = rv[:-len(suffix)]
step1_success = True
break
else:
- word = word[: -len(suffix)]
- r2 = r2[: -len(suffix)]
- rv = rv[: -len(suffix)]
+ word = word[:-len(suffix)]
+ r2 = r2[:-len(suffix)]
+ rv = rv[:-len(suffix)]
step1_success = True
break
if not step1_success:
for suffix in self.__reflexive_suffixes:
if rv.endswith(suffix):
- word = word[: -len(suffix)]
- r2 = r2[: -len(suffix)]
- rv = rv[: -len(suffix)]
+ word = word[:-len(suffix)]
+ r2 = r2[:-len(suffix)]
+ rv = rv[:-len(suffix)]
break
for suffix in self.__adjectival_suffixes:
if rv.endswith(suffix):
- if suffix in (
- "i^ushchi^ui^u",
- "i^ushchi^ai^a",
- "i^ushchui^u",
- "i^ushchai^a",
- "i^ushchoi^u",
- "i^ushchei^u",
- "i^ushchimi",
- "i^ushchymi",
- "i^ushchego",
- "i^ushchogo",
- "i^ushchemu",
- "i^ushchomu",
- "i^ushchikh",
- "i^ushchykh",
- "shchi^ui^u",
- "shchi^ai^a",
- "i^ushchee",
- "i^ushchie",
- "i^ushchye",
- "i^ushchoe",
- "i^ushchei`",
- "i^ushchii`",
- "i^ushchyi`",
- "i^ushchoi`",
- "i^ushchem",
- "i^ushchim",
- "i^ushchym",
- "i^ushchom",
- "vshi^ui^u",
- "vshi^ai^a",
- "shchui^u",
- "shchai^a",
- "shchoi^u",
- "shchei^u",
- "emi^ui^u",
- "emi^ai^a",
- "nni^ui^u",
- "nni^ai^a",
- "shchimi",
- "shchymi",
- "shchego",
- "shchogo",
- "shchemu",
- "shchomu",
- "shchikh",
- "shchykh",
- "vshui^u",
- "vshai^a",
- "vshoi^u",
- "vshei^u",
- "shchee",
- "shchie",
- "shchye",
- "shchoe",
- "shchei`",
- "shchii`",
- "shchyi`",
- "shchoi`",
- "shchem",
- "shchim",
- "shchym",
- "shchom",
- "vshimi",
- "vshymi",
- "vshego",
- "vshogo",
- "vshemu",
- "vshomu",
- "vshikh",
- "vshykh",
- "emui^u",
- "emai^a",
- "emoi^u",
- "emei^u",
- "nnui^u",
- "nnai^a",
- "nnoi^u",
- "nnei^u",
- "vshee",
- "vshie",
- "vshye",
- "vshoe",
- "vshei`",
- "vshii`",
- "vshyi`",
- "vshoi`",
- "vshem",
- "vshim",
- "vshym",
- "vshom",
- "emimi",
- "emymi",
- "emego",
- "emogo",
- "ememu",
- "emomu",
- "emikh",
- "emykh",
- "nnimi",
- "nnymi",
- "nnego",
- "nnogo",
- "nnemu",
- "nnomu",
- "nnikh",
- "nnykh",
- "emee",
- "emie",
- "emye",
- "emoe",
- "emei`",
- "emii`",
- "emyi`",
- "emoi`",
- "emem",
- "emim",
- "emym",
- "emom",
- "nnee",
- "nnie",
- "nnye",
- "nnoe",
- "nnei`",
- "nnii`",
- "nnyi`",
- "nnoi`",
- "nnem",
- "nnim",
- "nnym",
- "nnom",
- ):
- if (
- rv[-len(suffix) - 3 : -len(suffix)] == "i^a"
- or rv[-len(suffix) - 1 : -len(suffix)] == "a"
- ):
- word = word[: -len(suffix)]
- r2 = r2[: -len(suffix)]
- rv = rv[: -len(suffix)]
+ if suffix in ('i^ushchi^ui^u', 'i^ushchi^ai^a',
+ 'i^ushchui^u', 'i^ushchai^a', 'i^ushchoi^u',
+ 'i^ushchei^u', 'i^ushchimi', 'i^ushchymi',
+ 'i^ushchego', 'i^ushchogo', 'i^ushchemu',
+ 'i^ushchomu', 'i^ushchikh', 'i^ushchykh',
+ 'shchi^ui^u', 'shchi^ai^a', 'i^ushchee',
+ 'i^ushchie', 'i^ushchye', 'i^ushchoe',
+ 'i^ushchei`', 'i^ushchii`', 'i^ushchyi`',
+ 'i^ushchoi`', 'i^ushchem', 'i^ushchim',
+ 'i^ushchym', 'i^ushchom', 'vshi^ui^u',
+ 'vshi^ai^a', 'shchui^u', 'shchai^a',
+ 'shchoi^u', 'shchei^u', 'emi^ui^u',
+ 'emi^ai^a', 'nni^ui^u', 'nni^ai^a',
+ 'shchimi', 'shchymi', 'shchego', 'shchogo',
+ 'shchemu', 'shchomu', 'shchikh', 'shchykh',
+ 'vshui^u', 'vshai^a', 'vshoi^u', 'vshei^u',
+ 'shchee', 'shchie', 'shchye', 'shchoe',
+ 'shchei`', 'shchii`', 'shchyi`', 'shchoi`',
+ 'shchem', 'shchim', 'shchym', 'shchom',
+ 'vshimi', 'vshymi', 'vshego', 'vshogo',
+ 'vshemu', 'vshomu', 'vshikh', 'vshykh',
+ 'emui^u', 'emai^a', 'emoi^u', 'emei^u',
+ 'nnui^u', 'nnai^a', 'nnoi^u', 'nnei^u',
+ 'vshee', 'vshie', 'vshye', 'vshoe',
+ 'vshei`', 'vshii`', 'vshyi`', 'vshoi`',
+ 'vshem', 'vshim', 'vshym', 'vshom',
+ 'emimi', 'emymi', 'emego', 'emogo',
+ 'ememu', 'emomu', 'emikh', 'emykh',
+ 'nnimi', 'nnymi', 'nnego', 'nnogo',
+ 'nnemu', 'nnomu', 'nnikh', 'nnykh',
+ 'emee', 'emie', 'emye', 'emoe', 'emei`',
+ 'emii`', 'emyi`', 'emoi`', 'emem', 'emim',
+ 'emym', 'emom', 'nnee', 'nnie', 'nnye',
+ 'nnoe', 'nnei`', 'nnii`', 'nnyi`', 'nnoi`',
+ 'nnem', 'nnim', 'nnym', 'nnom'):
+ if (rv[-len(suffix)-3:-len(suffix)] == "i^a" or
+ rv[-len(suffix)-1:-len(suffix)] == "a"):
+ word = word[:-len(suffix)]
+ r2 = r2[:-len(suffix)]
+ rv = rv[:-len(suffix)]
adjectival_removed = True
break
else:
- word = word[: -len(suffix)]
- r2 = r2[: -len(suffix)]
- rv = rv[: -len(suffix)]
+ word = word[:-len(suffix)]
+ r2 = r2[:-len(suffix)]
+ rv = rv[:-len(suffix)]
adjectival_removed = True
break
if not adjectival_removed:
for suffix in self.__verb_suffixes:
if rv.endswith(suffix):
- if suffix in (
- "la",
- "na",
- "ete",
- "i`te",
- "li",
- "i`",
- "l",
- "em",
- "n",
- "lo",
- "no",
- "et",
- "i^ut",
- "ny",
- "t'",
- "esh'",
- "nno",
- ):
- if (
- rv[-len(suffix) - 3 : -len(suffix)] == "i^a"
- or rv[-len(suffix) - 1 : -len(suffix)] == "a"
- ):
- word = word[: -len(suffix)]
- r2 = r2[: -len(suffix)]
- rv = rv[: -len(suffix)]
+ if suffix in ("la", "na", "ete", "i`te", "li",
+ "i`", "l", "em", "n", "lo", "no",
+ "et", "i^ut", "ny", "t'", "esh'",
+ "nno"):
+ if (rv[-len(suffix)-3:-len(suffix)] == "i^a" or
+ rv[-len(suffix)-1:-len(suffix)] == "a"):
+ word = word[:-len(suffix)]
+ r2 = r2[:-len(suffix)]
+ rv = rv[:-len(suffix)]
verb_removed = True
break
else:
- word = word[: -len(suffix)]
- r2 = r2[: -len(suffix)]
- rv = rv[: -len(suffix)]
+ word = word[:-len(suffix)]
+ r2 = r2[:-len(suffix)]
+ rv = rv[:-len(suffix)]
verb_removed = True
break
if not adjectival_removed and not verb_removed:
for suffix in self.__noun_suffixes:
if rv.endswith(suffix):
- word = word[: -len(suffix)]
- r2 = r2[: -len(suffix)]
- rv = rv[: -len(suffix)]
+ word = word[:-len(suffix)]
+ r2 = r2[:-len(suffix)]
+ rv = rv[:-len(suffix)]
break
# Step 2
# Step 3
for suffix in self.__derivational_suffixes:
if r2.endswith(suffix):
- word = word[: -len(suffix)]
+ word = word[:-len(suffix)]
break
# Step 4
if not undouble_success:
for suffix in self.__superlative_suffixes:
if word.endswith(suffix):
- word = word[: -len(suffix)]
+ word = word[:-len(suffix)]
superlative_removed = True
break
if word.endswith("nn"):
if word.endswith("'"):
word = word[:-1]
- word = self.__roman_to_cyrillic(word)
+ if chr_exceeded:
+ word = self.__roman_to_cyrillic(word)
+
return word
+
+
def __regions_russian(self, word):
"""
Return the regions RV and R2 which are used by the Russian stemmer.
rv = ""
vowels = ("A", "U", "E", "a", "e", "i", "o", "u", "y")
- word = word.replace("i^a", "A").replace("i^u", "U").replace("e`", "E")
+ word = (word.replace("i^a", "A")
+ .replace("i^u", "U")
+ .replace("e`", "E"))
for i in range(1, len(word)):
- if word[i] not in vowels and word[i - 1] in vowels:
- r1 = word[i + 1 :]
+ if word[i] not in vowels and word[i-1] in vowels:
+ r1 = word[i+1:]
break
for i in range(1, len(r1)):
- if r1[i] not in vowels and r1[i - 1] in vowels:
- r2 = r1[i + 1 :]
+ if r1[i] not in vowels and r1[i-1] in vowels:
+ r2 = r1[i+1:]
break
for i in range(len(word)):
if word[i] in vowels:
- rv = word[i + 1 :]
+ rv = word[i+1:]
break
- r2 = r2.replace("A", "i^a").replace("U", "i^u").replace("E", "e`")
- rv = rv.replace("A", "i^a").replace("U", "i^u").replace("E", "e`")
+ r2 = (r2.replace("A", "i^a")
+ .replace("U", "i^u")
+ .replace("E", "e`"))
+ rv = (rv.replace("A", "i^a")
+ .replace("U", "i^u")
+ .replace("E", "e`"))
+
return (rv, r2)
+
+
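# A standalone sketch of the region logic in __regions_russian above, with
# the i^a/i^u/e` digraph handling left out and the vowel set simplified
# (the input "kniga" is illustrative):
def _regions_sketch(word, vowels="aeiouy"):
    rv = r1 = r2 = ""
    for i, ch in enumerate(word):        # RV: everything after the first vowel
        if ch in vowels:
            rv = word[i + 1:]
            break
    for i in range(1, len(word)):        # R1: after the first vowel-consonant pair
        if word[i] not in vowels and word[i - 1] in vowels:
            r1 = word[i + 1:]
            break
    for i in range(1, len(r1)):          # R2: the same rule, applied inside R1
        if r1[i] not in vowels and r1[i - 1] in vowels:
            r2 = r1[i + 1:]
            break
    return rv, r1, r2

print(_regions_sketch("kniga"))  # ('ga', 'a', '')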
def __cyrillic_to_roman(self, word):
"""
Transliterate a Russian word into the Roman alphabet.
RussianStemmer. It is not to be invoked directly!
"""
- word = (
- word.replace("\u0410", "a")
- .replace("\u0430", "a")
- .replace("\u0411", "b")
- .replace("\u0431", "b")
- .replace("\u0412", "v")
- .replace("\u0432", "v")
- .replace("\u0413", "g")
- .replace("\u0433", "g")
- .replace("\u0414", "d")
- .replace("\u0434", "d")
- .replace("\u0415", "e")
- .replace("\u0435", "e")
- .replace("\u0401", "e")
- .replace("\u0451", "e")
- .replace("\u0416", "zh")
- .replace("\u0436", "zh")
- .replace("\u0417", "z")
- .replace("\u0437", "z")
- .replace("\u0418", "i")
- .replace("\u0438", "i")
- .replace("\u0419", "i`")
- .replace("\u0439", "i`")
- .replace("\u041A", "k")
- .replace("\u043A", "k")
- .replace("\u041B", "l")
- .replace("\u043B", "l")
- .replace("\u041C", "m")
- .replace("\u043C", "m")
- .replace("\u041D", "n")
- .replace("\u043D", "n")
- .replace("\u041E", "o")
- .replace("\u043E", "o")
- .replace("\u041F", "p")
- .replace("\u043F", "p")
- .replace("\u0420", "r")
- .replace("\u0440", "r")
- .replace("\u0421", "s")
- .replace("\u0441", "s")
- .replace("\u0422", "t")
- .replace("\u0442", "t")
- .replace("\u0423", "u")
- .replace("\u0443", "u")
- .replace("\u0424", "f")
- .replace("\u0444", "f")
- .replace("\u0425", "kh")
- .replace("\u0445", "kh")
- .replace("\u0426", "t^s")
- .replace("\u0446", "t^s")
- .replace("\u0427", "ch")
- .replace("\u0447", "ch")
- .replace("\u0428", "sh")
- .replace("\u0448", "sh")
- .replace("\u0429", "shch")
- .replace("\u0449", "shch")
- .replace("\u042A", "''")
- .replace("\u044A", "''")
- .replace("\u042B", "y")
- .replace("\u044B", "y")
- .replace("\u042C", "'")
- .replace("\u044C", "'")
- .replace("\u042D", "e`")
- .replace("\u044D", "e`")
- .replace("\u042E", "i^u")
- .replace("\u044E", "i^u")
- .replace("\u042F", "i^a")
- .replace("\u044F", "i^a")
- )
+ word = (word.replace("\u0410", "a").replace("\u0430", "a")
+ .replace("\u0411", "b").replace("\u0431", "b")
+ .replace("\u0412", "v").replace("\u0432", "v")
+ .replace("\u0413", "g").replace("\u0433", "g")
+ .replace("\u0414", "d").replace("\u0434", "d")
+ .replace("\u0415", "e").replace("\u0435", "e")
+ .replace("\u0401", "e").replace("\u0451", "e")
+ .replace("\u0416", "zh").replace("\u0436", "zh")
+ .replace("\u0417", "z").replace("\u0437", "z")
+ .replace("\u0418", "i").replace("\u0438", "i")
+ .replace("\u0419", "i`").replace("\u0439", "i`")
+ .replace("\u041A", "k").replace("\u043A", "k")
+ .replace("\u041B", "l").replace("\u043B", "l")
+ .replace("\u041C", "m").replace("\u043C", "m")
+ .replace("\u041D", "n").replace("\u043D", "n")
+ .replace("\u041E", "o").replace("\u043E", "o")
+ .replace("\u041F", "p").replace("\u043F", "p")
+ .replace("\u0420", "r").replace("\u0440", "r")
+ .replace("\u0421", "s").replace("\u0441", "s")
+ .replace("\u0422", "t").replace("\u0442", "t")
+ .replace("\u0423", "u").replace("\u0443", "u")
+ .replace("\u0424", "f").replace("\u0444", "f")
+ .replace("\u0425", "kh").replace("\u0445", "kh")
+ .replace("\u0426", "t^s").replace("\u0446", "t^s")
+ .replace("\u0427", "ch").replace("\u0447", "ch")
+ .replace("\u0428", "sh").replace("\u0448", "sh")
+ .replace("\u0429", "shch").replace("\u0449", "shch")
+ .replace("\u042A", "''").replace("\u044A", "''")
+ .replace("\u042B", "y").replace("\u044B", "y")
+ .replace("\u042C", "'").replace("\u044C", "'")
+ .replace("\u042D", "e`").replace("\u044D", "e`")
+ .replace("\u042E", "i^u").replace("\u044E", "i^u")
+ .replace("\u042F", "i^a").replace("\u044F", "i^a"))
+
return word
+
+
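# In __roman_to_cyrillic below, the multi-character sequences ("shch",
# "i^u", "t^s", "ch", ...) must be mapped back before the single letters,
# or "shch" would be rebuilt as ш + ч instead of щ. A tiny one-way check
# (the word is illustrative):
roman = "\u0449\u0443\u043A\u0430".replace("\u0449", "shch") \
    .replace("\u0443", "u").replace("\u043A", "k").replace("\u0430", "a")
assert roman == "shchuka"  # "щука"; the table below restores the Cyrillic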
def __roman_to_cyrillic(self, word):
"""
Transliterate a Russian word back into the Cyrillic alphabet.
RussianStemmer. It is not to be invoked directly!
"""
- word = (
- word.replace("i^u", "\u044E")
- .replace("i^a", "\u044F")
- .replace("shch", "\u0449")
- .replace("kh", "\u0445")
- .replace("t^s", "\u0446")
- .replace("ch", "\u0447")
- .replace("e`", "\u044D")
- .replace("i`", "\u0439")
- .replace("sh", "\u0448")
- .replace("k", "\u043A")
- .replace("e", "\u0435")
- .replace("zh", "\u0436")
- .replace("a", "\u0430")
- .replace("b", "\u0431")
- .replace("v", "\u0432")
- .replace("g", "\u0433")
- .replace("d", "\u0434")
- .replace("e", "\u0435")
- .replace("z", "\u0437")
- .replace("i", "\u0438")
- .replace("l", "\u043B")
- .replace("m", "\u043C")
- .replace("n", "\u043D")
- .replace("o", "\u043E")
- .replace("p", "\u043F")
- .replace("r", "\u0440")
- .replace("s", "\u0441")
- .replace("t", "\u0442")
- .replace("u", "\u0443")
- .replace("f", "\u0444")
- .replace("''", "\u044A")
- .replace("y", "\u044B")
- .replace("'", "\u044C")
- )
+ word = (word.replace("i^u", "\u044E").replace("i^a", "\u044F")
+ .replace("shch", "\u0449").replace("kh", "\u0445")
+ .replace("t^s", "\u0446").replace("ch", "\u0447")
+ .replace("e`", "\u044D").replace("i`", "\u0439")
+ .replace("sh", "\u0448").replace("k", "\u043A")
+ .replace("e", "\u0435").replace("zh", "\u0436")
+ .replace("a", "\u0430").replace("b", "\u0431")
+ .replace("v", "\u0432").replace("g", "\u0433")
+ .replace("d", "\u0434").replace("e", "\u0435")
+ .replace("z", "\u0437").replace("i", "\u0438")
+ .replace("l", "\u043B").replace("m", "\u043C")
+ .replace("n", "\u043D").replace("o", "\u043E")
+ .replace("p", "\u043F").replace("r", "\u0440")
+ .replace("s", "\u0441").replace("t", "\u0442")
+ .replace("u", "\u0443").replace("f", "\u0444")
+ .replace("''", "\u044A").replace("y", "\u044B")
+ .replace("'", "\u044C"))
+
return word
"""
__vowels = "aeiou\xE1\xE9\xED\xF3\xFA\xFC"
- __step0_suffixes = (
- "selas",
- "selos",
- "sela",
- "selo",
- "las",
- "les",
- "los",
- "nos",
- "me",
- "se",
- "la",
- "le",
- "lo",
- )
- __step1_suffixes = (
- "amientos",
- "imientos",
- "amiento",
- "imiento",
- "aciones",
- "uciones",
- "adoras",
- "adores",
- "ancias",
- "log\xEDas",
- "encias",
- "amente",
- "idades",
- "anzas",
- "ismos",
- "ables",
- "ibles",
- "istas",
- "adora",
- "aci\xF3n",
- "antes",
- "ancia",
- "log\xEDa",
- "uci\xf3n",
- "encia",
- "mente",
- "anza",
- "icos",
- "icas",
- "ismo",
- "able",
- "ible",
- "ista",
- "osos",
- "osas",
- "ador",
- "ante",
- "idad",
- "ivas",
- "ivos",
- "ico",
- "ica",
- "oso",
- "osa",
- "iva",
- "ivo",
- )
- __step2a_suffixes = (
- "yeron",
- "yendo",
- "yamos",
- "yais",
- "yan",
- "yen",
- "yas",
- "yes",
- "ya",
- "ye",
- "yo",
- "y\xF3",
- )
- __step2b_suffixes = (
- "ar\xEDamos",
- "er\xEDamos",
- "ir\xEDamos",
- "i\xE9ramos",
- "i\xE9semos",
- "ar\xEDais",
- "aremos",
- "er\xEDais",
- "eremos",
- "ir\xEDais",
- "iremos",
- "ierais",
- "ieseis",
- "asteis",
- "isteis",
- "\xE1bamos",
- "\xE1ramos",
- "\xE1semos",
- "ar\xEDan",
- "ar\xEDas",
- "ar\xE9is",
- "er\xEDan",
- "er\xEDas",
- "er\xE9is",
- "ir\xEDan",
- "ir\xEDas",
- "ir\xE9is",
- "ieran",
- "iesen",
- "ieron",
- "iendo",
- "ieras",
- "ieses",
- "abais",
- "arais",
- "aseis",
- "\xE9amos",
- "ar\xE1n",
- "ar\xE1s",
- "ar\xEDa",
- "er\xE1n",
- "er\xE1s",
- "er\xEDa",
- "ir\xE1n",
- "ir\xE1s",
- "ir\xEDa",
- "iera",
- "iese",
- "aste",
- "iste",
- "aban",
- "aran",
- "asen",
- "aron",
- "ando",
- "abas",
- "adas",
- "idas",
- "aras",
- "ases",
- "\xEDais",
- "ados",
- "idos",
- "amos",
- "imos",
- "emos",
- "ar\xE1",
- "ar\xE9",
- "er\xE1",
- "er\xE9",
- "ir\xE1",
- "ir\xE9",
- "aba",
- "ada",
- "ida",
- "ara",
- "ase",
- "\xEDan",
- "ado",
- "ido",
- "\xEDas",
- "\xE1is",
- "\xE9is",
- "\xEDa",
- "ad",
- "ed",
- "id",
- "an",
- "i\xF3",
- "ar",
- "er",
- "ir",
- "as",
- "\xEDs",
- "en",
- "es",
- )
- __step3_suffixes = ("os", "a", "e", "o", "\xE1", "\xE9", "\xED", "\xF3")
+ __step0_suffixes = ("selas", "selos", "sela", "selo", "las",
+ "les", "los", "nos", "me", "se", "la", "le",
+ "lo")
+ __step1_suffixes = ('amientos', 'imientos', 'amiento', 'imiento',
+ 'aciones', 'uciones', 'adoras', 'adores',
+ 'ancias', 'log\xEDas', 'encias', 'amente',
+ 'idades', 'anzas', 'ismos', 'ables', 'ibles',
+ 'istas', 'adora', 'aci\xF3n', 'antes',
+ 'ancia', 'log\xEDa', 'uci\xf3n', 'encia',
+ 'mente', 'anza', 'icos', 'icas', 'ismo',
+ 'able', 'ible', 'ista', 'osos', 'osas',
+ 'ador', 'ante', 'idad', 'ivas', 'ivos',
+ 'ico',
+ 'ica', 'oso', 'osa', 'iva', 'ivo')
+ __step2a_suffixes = ('yeron', 'yendo', 'yamos', 'yais', 'yan',
+ 'yen', 'yas', 'yes', 'ya', 'ye', 'yo',
+ 'y\xF3')
+ __step2b_suffixes = ('ar\xEDamos', 'er\xEDamos', 'ir\xEDamos',
+ 'i\xE9ramos', 'i\xE9semos', 'ar\xEDais',
+ 'aremos', 'er\xEDais', 'eremos',
+ 'ir\xEDais', 'iremos', 'ierais', 'ieseis',
+ 'asteis', 'isteis', '\xE1bamos',
+ '\xE1ramos', '\xE1semos', 'ar\xEDan',
+ 'ar\xEDas', 'ar\xE9is', 'er\xEDan',
+ 'er\xEDas', 'er\xE9is', 'ir\xEDan',
+ 'ir\xEDas', 'ir\xE9is',
+ 'ieran', 'iesen', 'ieron', 'iendo', 'ieras',
+ 'ieses', 'abais', 'arais', 'aseis',
+ '\xE9amos', 'ar\xE1n', 'ar\xE1s',
+ 'ar\xEDa', 'er\xE1n', 'er\xE1s',
+ 'er\xEDa', 'ir\xE1n', 'ir\xE1s',
+ 'ir\xEDa', 'iera', 'iese', 'aste', 'iste',
+ 'aban', 'aran', 'asen', 'aron', 'ando',
+ 'abas', 'adas', 'idas', 'aras', 'ases',
+ '\xEDais', 'ados', 'idos', 'amos', 'imos',
+ 'emos', 'ar\xE1', 'ar\xE9', 'er\xE1',
+ 'er\xE9', 'ir\xE1', 'ir\xE9', 'aba',
+ 'ada', 'ida', 'ara', 'ase', '\xEDan',
+ 'ado', 'ido', '\xEDas', '\xE1is',
+ '\xE9is', '\xEDa', 'ad', 'ed', 'id',
+ 'an', 'i\xF3', 'ar', 'er', 'ir', 'as',
+ '\xEDs', 'en', 'es')
+ __step3_suffixes = ("os", "a", "e", "o", "\xE1",
+ "\xE9", "\xED", "\xF3")
def stem(self, word):
"""
if not (word.endswith(suffix) and rv.endswith(suffix)):
continue
- if (
- rv[: -len(suffix)].endswith(
- (
- "ando",
- "\xE1ndo",
- "ar",
- "\xE1r",
- "er",
- "\xE9r",
- "iendo",
- "i\xE9ndo",
- "ir",
- "\xEDr",
- )
- )
- ) or (
- rv[: -len(suffix)].endswith("yendo")
- and word[: -len(suffix)].endswith("uyendo")
- ):
-
- word = self.__replace_accented(word[: -len(suffix)])
- r1 = self.__replace_accented(r1[: -len(suffix)])
- r2 = self.__replace_accented(r2[: -len(suffix)])
- rv = self.__replace_accented(rv[: -len(suffix)])
+ if ((rv[:-len(suffix)].endswith(("ando", "\xE1ndo",
+ "ar", "\xE1r",
+ "er", "\xE9r",
+ "iendo", "i\xE9ndo",
+ "ir", "\xEDr"))) or
+ (rv[:-len(suffix)].endswith("yendo") and
+ word[:-len(suffix)].endswith("uyendo"))):
+
+ word = self.__replace_accented(word[:-len(suffix)])
+ r1 = self.__replace_accented(r1[:-len(suffix)])
+ r2 = self.__replace_accented(r2[:-len(suffix)])
+ rv = self.__replace_accented(rv[:-len(suffix)])
break
# STEP 1: Standard suffix removal
elif r2.endswith(suffix):
step1_success = True
- if suffix in (
- "adora",
- "ador",
- "aci\xF3n",
- "adoras",
- "adores",
- "aciones",
- "ante",
- "antes",
- "ancia",
- "ancias",
- ):
- word = word[: -len(suffix)]
- r2 = r2[: -len(suffix)]
- rv = rv[: -len(suffix)]
+ if suffix in ("adora", "ador", "aci\xF3n", "adoras",
+ "adores", "aciones", "ante", "antes",
+ "ancia", "ancias"):
+ word = word[:-len(suffix)]
+ r2 = r2[:-len(suffix)]
+ rv = rv[:-len(suffix)]
if r2.endswith("ic"):
word = word[:-2]
rv = suffix_replace(rv, suffix, "ente")
elif suffix == "mente":
- word = word[: -len(suffix)]
- r2 = r2[: -len(suffix)]
- rv = rv[: -len(suffix)]
+ word = word[:-len(suffix)]
+ r2 = r2[:-len(suffix)]
+ rv = rv[:-len(suffix)]
if r2.endswith(("ante", "able", "ible")):
word = word[:-4]
rv = rv[:-4]
elif suffix in ("idad", "idades"):
- word = word[: -len(suffix)]
- r2 = r2[: -len(suffix)]
- rv = rv[: -len(suffix)]
+ word = word[:-len(suffix)]
+ r2 = r2[:-len(suffix)]
+ rv = rv[:-len(suffix)]
for pre_suff in ("abil", "ic", "iv"):
if r2.endswith(pre_suff):
- word = word[: -len(pre_suff)]
- rv = rv[: -len(pre_suff)]
+ word = word[:-len(pre_suff)]
+ rv = rv[:-len(pre_suff)]
elif suffix in ("ivo", "iva", "ivos", "ivas"):
- word = word[: -len(suffix)]
- r2 = r2[: -len(suffix)]
- rv = rv[: -len(suffix)]
+ word = word[:-len(suffix)]
+ r2 = r2[:-len(suffix)]
+ rv = rv[:-len(suffix)]
if r2.endswith("at"):
word = word[:-2]
rv = rv[:-2]
else:
- word = word[: -len(suffix)]
- rv = rv[: -len(suffix)]
+ word = word[:-len(suffix)]
+ rv = rv[:-len(suffix)]
break
# STEP 2a: Verb suffixes beginning 'y'
if not step1_success:
for suffix in self.__step2a_suffixes:
- if rv.endswith(suffix) and word[-len(suffix) - 1 : -len(suffix)] == "u":
- word = word[: -len(suffix)]
- rv = rv[: -len(suffix)]
+ if (rv.endswith(suffix) and
+ word[-len(suffix)-1:-len(suffix)] == "u"):
+ word = word[:-len(suffix)]
+ rv = rv[:-len(suffix)]
break
# STEP 2b: Other verb suffixes
for suffix in self.__step2b_suffixes:
if rv.endswith(suffix):
- word = word[: -len(suffix)]
- rv = rv[: -len(suffix)]
+ word = word[:-len(suffix)]
+ rv = rv[:-len(suffix)]
if suffix in ("en", "es", "\xE9is", "emos"):
if word.endswith("gu"):
word = word[:-1]
# STEP 3: Residual suffix
for suffix in self.__step3_suffixes:
if rv.endswith(suffix):
- word = word[: -len(suffix)]
+ word = word[:-len(suffix)]
if suffix in ("e", "\xE9"):
- rv = rv[: -len(suffix)]
+ rv = rv[:-len(suffix)]
if word[-2:] == "gu" and rv.endswith("u"):
word = word[:-1]
their non-accented counterparts (a, e, i, o, u)
:rtype: str or unicode
"""
- return (
- word.replace("\xE1", "a")
- .replace("\xE9", "e")
- .replace("\xED", "i")
- .replace("\xF3", "o")
- .replace("\xFA", "u")
- )
+ return (word.replace("\xE1", "a")
+ .replace("\xE9", "e")
+ .replace("\xED", "i")
+ .replace("\xF3", "o")
+ .replace("\xFA", "u"))
class SwedishStemmer(_ScandinavianStemmer):
__vowels = "aeiouy\xE4\xE5\xF6"
__s_ending = "bcdfghjklmnoprtvy"
- __step1_suffixes = (
- "heterna",
- "hetens",
- "heter",
- "heten",
- "anden",
- "arnas",
- "ernas",
- "ornas",
- "andes",
- "andet",
- "arens",
- "arna",
- "erna",
- "orna",
- "ande",
- "arne",
- "aste",
- "aren",
- "ades",
- "erns",
- "ade",
- "are",
- "ern",
- "ens",
- "het",
- "ast",
- "ad",
- "en",
- "ar",
- "er",
- "or",
- "as",
- "es",
- "at",
- "a",
- "e",
- "s",
- )
+ __step1_suffixes = ("heterna", "hetens", "heter", "heten",
+ "anden", "arnas", "ernas", "ornas", "andes",
+ "andet", "arens", "arna", "erna", "orna",
+ "ande", "arne", "aste", "aren", "ades",
+ "erns", "ade", "are", "ern", "ens", "het",
+ "ast", "ad", "en", "ar", "er", "or", "as",
+ "es", "at", "a", "e", "s")
__step2_suffixes = ("dd", "gd", "nn", "dt", "gt", "kt", "tt")
__step3_suffixes = ("fullt", "l\xF6st", "els", "lig", "ig")
word = word[:-1]
r1 = r1[:-1]
else:
- word = word[: -len(suffix)]
- r1 = r1[: -len(suffix)]
+ word = word[:-len(suffix)]
+ r1 = r1[:-len(suffix)]
break
# STEP 2
for suffix in self.__step3_suffixes:
if r1.endswith(suffix):
if suffix in ("els", "lig", "ig"):
- word = word[: -len(suffix)]
+ word = word[:-len(suffix)]
elif suffix in ("fullt", "l\xF6st"):
word = word[:-1]
break
"""
+ import re
from nltk.corpus import udhr
- udhr_corpus = {
- "arabic": "Arabic_Alarabia-Arabic",
- "danish": "Danish_Dansk-Latin1",
- "dutch": "Dutch_Nederlands-Latin1",
- "english": "English-Latin1",
- "finnish": "Finnish_Suomi-Latin1",
- "french": "French_Francais-Latin1",
- "german": "German_Deutsch-Latin1",
- "hungarian": "Hungarian_Magyar-UTF8",
- "italian": "Italian_Italiano-Latin1",
- "norwegian": "Norwegian-Latin1",
- "porter": "English-Latin1",
- "portuguese": "Portuguese_Portugues-Latin1",
- "romanian": "Romanian_Romana-Latin2",
- "russian": "Russian-UTF8",
- "spanish": "Spanish-Latin1",
- "swedish": "Swedish_Svenska-Latin1",
- }
+ udhr_corpus = {"arabic": "Arabic_Alarabia-Arabic",
+ "danish": "Danish_Dansk-Latin1",
+ "dutch": "Dutch_Nederlands-Latin1",
+ "english": "English-Latin1",
+ "finnish": "Finnish_Suomi-Latin1",
+ "french": "French_Francais-Latin1",
+ "german": "German_Deutsch-Latin1",
+ "hungarian": "Hungarian_Magyar-UTF8",
+ "italian": "Italian_Italiano-Latin1",
+ "norwegian": "Norwegian-Latin1",
+ "porter": "English-Latin1",
+ "portuguese": "Portuguese_Portugues-Latin1",
+ "romanian": "Romanian_Romana-Latin2",
+ "russian": "Russian-UTF8",
+ "spanish": "Spanish-Latin1",
+ "swedish": "Swedish_Svenska-Latin1",
+ }
print("\n")
print("******************************")
while True:
- language = input(
- "Please enter the name of the language "
- + "to be demonstrated\n"
- + "/".join(SnowballStemmer.languages)
- + "\n"
- + "(enter 'exit' in order to leave): "
- )
+ language = input("Please enter the name of the language " +
+ "to be demonstrated\n" +
+ "/".join(SnowballStemmer.languages) +
+ "\n" +
+ "(enter 'exit' in order to leave): ")
if language == "exit":
break
if language not in SnowballStemmer.languages:
- print(
- (
- "\nOops, there is no stemmer for this language. "
- + "Please try again.\n"
- )
- )
+ print(("\nOops, there is no stemmer for this language. " +
+ "Please try again.\n"))
continue
stemmer = SnowballStemmer(language)
- excerpt = udhr.words(udhr_corpus[language])[:300]
+ excerpt = udhr.words(udhr_corpus[language])[:300]
stemmed = " ".join(stemmer.stem(word) for word in excerpt)
- stemmed = re.sub(r"(.{,70})\s", r"\1\n", stemmed + " ").rstrip()
+ stemmed = re.sub(r"(.{,70})\s", r'\1\n', stemmed+' ').rstrip()
excerpt = " ".join(excerpt)
- excerpt = re.sub(r"(.{,70})\s", r"\1\n", excerpt + " ").rstrip()
+ excerpt = re.sub(r"(.{,70})\s", r'\1\n', excerpt+' ').rstrip()
print("\n")
- print("-" * 70)
- print("ORIGINAL".center(70))
+ print('-' * 70)
+ print('ORIGINAL'.center(70))
print(excerpt)
print("\n\n")
- print("STEMMED RESULTS".center(70))
+ print('STEMMED RESULTS'.center(70))
print(stemmed)
- print("-" * 70)
+ print('-' * 70)
print("\n")
# Natural Language Toolkit: Stemmer Utilities
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Helder <he7d3r@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
-
def suffix_replace(original, old, new):
"""
Replaces the old suffix of the original string by a new suffix
"""
- return original[: -len(old)] + new
-
+ return original[:-len(old)] + new
def prefix_replace(original, old, new):
"""
:param new: string
:return: string
"""
- return new + original[len(old) :]
+ return new + original[len(old):]
\ No newline at end of file
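# The two helpers above on illustrative strings:
assert suffix_replace("running", "ing", "") == "runn"    # "running"[:-3] + ""
assert prefix_replace("undone", "un", "re") == "redone"  # "re" + "undone"[2:]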
# Natural Language Toolkit: WordNet stemmer interface
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import unicode_literals
from nltk.corpus.reader.wordnet import NOUN
from nltk.corpus import wordnet
+from nltk.compat import python_2_unicode_compatible
-
+@python_2_unicode_compatible
class WordNetLemmatizer(object):
"""
WordNet Lemmatizer
return min(lemmas, key=len) if lemmas else word
def __repr__(self):
- return "<WordNetLemmatizer>"
+ return '<WordNetLemmatizer>'
# unload wordnet
def teardown_module(module=None):
from nltk.corpus import wordnet
-
wordnet._unload()
+
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Taggers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com> (minor additions)
# URL: <http://nltk.org/>
[('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'),
("n't", 'RB'), ('all', 'PDT'), ('that', 'DT'), ('bad', 'JJ'), ('.', '.')]
A Russian tagger is also available if you specify lang="rus". It uses
the Russian National Corpus tagset:
>>> pos_tag(word_tokenize("Илья оторопел и дважды перечитал бумажку."), lang='rus') # doctest: +SKIP
For more information, please consult chapter 5 of the NLTK Book.
"""
-
-from nltk.tag.api import TaggerI
-from nltk.tag.util import str2tuple, tuple2str, untag
-from nltk.tag.sequential import (
- SequentialBackoffTagger,
- ContextTagger,
- DefaultTagger,
- NgramTagger,
- UnigramTagger,
- BigramTagger,
- TrigramTagger,
- AffixTagger,
- RegexpTagger,
- ClassifierBasedTagger,
- ClassifierBasedPOSTagger,
-)
-from nltk.tag.brill import BrillTagger
+from __future__ import print_function
+
+from nltk.tag.api import TaggerI
+from nltk.tag.util import str2tuple, tuple2str, untag
+from nltk.tag.sequential import (SequentialBackoffTagger, ContextTagger,
+ DefaultTagger, NgramTagger, UnigramTagger,
+ BigramTagger, TrigramTagger, AffixTagger,
+ RegexpTagger, ClassifierBasedTagger,
+ ClassifierBasedPOSTagger)
+from nltk.tag.brill import BrillTagger
from nltk.tag.brill_trainer import BrillTaggerTrainer
-from nltk.tag.tnt import TnT
-from nltk.tag.hunpos import HunposTagger
-from nltk.tag.stanford import StanfordTagger, StanfordPOSTagger, StanfordNERTagger
-from nltk.tag.hmm import HiddenMarkovModelTagger, HiddenMarkovModelTrainer
-from nltk.tag.senna import SennaTagger, SennaChunkTagger, SennaNERTagger
-from nltk.tag.mapping import tagset_mapping, map_tag
-from nltk.tag.crf import CRFTagger
-from nltk.tag.perceptron import PerceptronTagger
+from nltk.tag.tnt import TnT
+from nltk.tag.hunpos import HunposTagger
+from nltk.tag.stanford import StanfordTagger, StanfordPOSTagger, StanfordNERTagger
+from nltk.tag.hmm import HiddenMarkovModelTagger, HiddenMarkovModelTrainer
+from nltk.tag.senna import SennaTagger, SennaChunkTagger, SennaNERTagger
+from nltk.tag.mapping import tagset_mapping, map_tag
+from nltk.tag.crf import CRFTagger
+from nltk.tag.perceptron import PerceptronTagger
from nltk.data import load, find
-RUS_PICKLE = (
- "taggers/averaged_perceptron_tagger_ru/averaged_perceptron_tagger_ru.pickle"
-)
+RUS_PICKLE = 'taggers/averaged_perceptron_tagger_ru/averaged_perceptron_tagger_ru.pickle'
def _get_tagger(lang=None):
- if lang == "rus":
+ if lang == 'rus':
tagger = PerceptronTagger(False)
- ap_russian_model_loc = "file:" + str(find(RUS_PICKLE))
+ ap_russian_model_loc = 'file:' + str(find(RUS_PICKLE))
tagger.load(ap_russian_model_loc)
else:
tagger = PerceptronTagger()
return tagger
-def _pos_tag(tokens, tagset=None, tagger=None, lang=None):
- # Currently only supports English and Russian.
- if lang not in ["eng", "rus"]:
- raise NotImplementedError(
- "Currently, NLTK pos_tag only supports English and Russian "
- "(i.e. lang='eng' or lang='rus')"
- )
- else:
- tagged_tokens = tagger.tag(tokens)
- if tagset: # Maps to the specified tagset.
- if lang == "eng":
- tagged_tokens = [
- (token, map_tag("en-ptb", tagset, tag))
- for (token, tag) in tagged_tokens
- ]
- elif lang == "rus":
- # Note that the new Russian pos tags from the model contain suffixes,
- # see https://github.com/nltk/nltk/issues/2151#issuecomment-430709018
- tagged_tokens = [
- (token, map_tag("ru-rnc-new", tagset, tag.partition("=")[0]))
- for (token, tag) in tagged_tokens
- ]
- return tagged_tokens
-
-
-def pos_tag(tokens, tagset=None, lang="eng"):
+def _pos_tag(tokens, tagset, tagger):
+ tagged_tokens = tagger.tag(tokens)
+ if tagset:
+ tagged_tokens = [(token, map_tag('en-ptb', tagset, tag)) for (token, tag) in tagged_tokens]
+ return tagged_tokens
+
+
+def pos_tag(tokens, tagset=None, lang='eng'):
"""
Use NLTK's currently recommended part of speech tagger to
tag the given list of tokens.
:rtype: list(tuple(str, str))
"""
tagger = _get_tagger(lang)
- return _pos_tag(tokens, tagset, tagger, lang)
+ return _pos_tag(tokens, tagset, tagger)
-def pos_tag_sents(sentences, tagset=None, lang="eng"):
+def pos_tag_sents(sentences, tagset=None, lang='eng'):
"""
Use NLTK's currently recommended part of speech tagger to tag the
given list of sentences, each consisting of a list of tokens.
:param sentences: List of sentences to be tagged
:type sentences: list(list(str))
:param tagset: the tagset to be used, e.g. universal, wsj, brown
:type tagset: str
:param lang: the ISO 639 code of the language, e.g. 'eng' for English, 'rus' for Russian
:rtype: list(list(tuple(str, str)))
"""
tagger = _get_tagger(lang)
- return [_pos_tag(sent, tagset, tagger, lang) for sent in sentences]
+ return [_pos_tag(sent, tagset, tagger) for sent in sentences]
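# Typical call pattern for the two functions above (assumes nltk with the
# averaged_perceptron_tagger model downloaded; the sentence is illustrative):
from nltk import pos_tag, word_tokenize
print(pos_tag(word_tokenize("John's big idea isn't all that bad.")))
# pos_tag_sents() builds the tagger once via _get_tagger() and reuses it
# across sentences, which is cheaper than calling pos_tag() in a loop.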
# Natural Language Toolkit: Tagger Interface
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com> (minor additions)
# URL: <http://nltk.org/>
information, such as its part of speech.
"""
from abc import ABCMeta, abstractmethod
+from six import add_metaclass
from itertools import chain
-
from nltk.internals import overridden
from nltk.metrics import accuracy
+
from nltk.tag.util import untag
-class TaggerI(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class TaggerI(object):
"""
A processing interface for assigning a tag to each token in a list.
Tags are case sensitive strings that identify some property of each
Subclasses must define:
- either ``tag()`` or ``tag_sents()`` (or both)
"""
-
@abstractmethod
def tag(self, tokens):
"""
def _check_params(self, train, model):
if (train and model) or (not train and not model):
- raise ValueError("Must specify either training data or trained model.")
+ raise ValueError(
+ 'Must specify either training data or trained model.')
class FeaturesetTaggerI(TaggerI):
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Transformation-based learning
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Marcus Uneson <marcus.uneson@gmail.com>
# based on previous (nltk2) version by
# Christopher Maloof, Edward Loper, Steven Bird
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function, division
+
from collections import defaultdict, Counter
from nltk.tag import TaggerI
# Brill Templates
######################################################################
-
@jsontags.register_tag
class Word(Feature):
"""
Feature which examines the text (word) of nearby tokens.
"""
- json_tag = "nltk.tag.brill.Word"
+ json_tag = 'nltk.tag.brill.Word'
@staticmethod
def extract_property(tokens, index):
Feature which examines the tags of nearby tokens.
"""
- json_tag = "nltk.tag.brill.Pos"
+ json_tag = 'nltk.tag.brill.Pos'
@staticmethod
def extract_property(tokens, index):
Template(Pos([-1]), Word([0]), Word([1])),
Template(Pos([-2]), Pos([-1])),
Template(Pos([1]), Pos([2])),
- Template(Pos([1]), Pos([2]), Word([1])),
+ Template(Pos([1]), Pos([2]), Word([1]))
]
# The Brill Tagger
######################################################################
-
@jsontags.register_tag
class BrillTagger(TaggerI):
"""
of the TaggerTrainers available.
"""
- json_tag = "nltk.tag.BrillTagger"
+ json_tag = 'nltk.tag.BrillTagger'
def __init__(self, initial_tagger, rules, training_stats=None):
"""
tids = [r.templateid for r in self._rules]
train_stats = self.train_stats()
- trainscores = train_stats["rulescores"]
- assert len(trainscores) == len(tids), (
- "corrupt statistics: "
+ trainscores = train_stats['rulescores']
+ assert len(trainscores) == len(tids), "corrupt statistics: " \
"{0} train scores for {1} rules".format(trainscores, tids)
- )
template_counts = Counter(tids)
weighted_traincounts = Counter()
for (tid, score) in zip(tids, trainscores):
return (tpl_value[1], repr(tpl_value[0]))
def print_train_stats():
- print(
- "TEMPLATE STATISTICS (TRAIN) {0} templates, {1} rules)".format(
- len(template_counts), len(tids)
- )
- )
- print(
- "TRAIN ({tokencount:7d} tokens) initial {initialerrors:5d} {initialacc:.4f} "
- "final: {finalerrors:5d} {finalacc:.4f} ".format(**train_stats)
+ print("TEMPLATE STATISTICS (TRAIN) {0} templates, {1} rules)".format(
+ len(template_counts),
+ len(tids))
)
+ print("TRAIN ({tokencount:7d} tokens) initial {initialerrors:5d} {initialacc:.4f} "
+ "final: {finalerrors:5d} {finalacc:.4f} ".format(**train_stats))
head = "#ID | Score (train) | #Rules | Template"
print(head, "\n", "-" * len(head), sep="")
- train_tplscores = sorted(
- weighted_traincounts.items(), key=det_tplsort, reverse=True
- )
+ train_tplscores = sorted(weighted_traincounts.items(), key=det_tplsort, reverse=True)
for (tid, trainscore) in train_tplscores:
s = "{0} | {1:5d} {2:5.3f} |{3:4d} {4:.3f} | {5}".format(
tid,
trainscore,
- trainscore / tottrainscores,
+ trainscore/tottrainscores,
template_counts[tid],
- template_counts[tid] / len(tids),
+ template_counts[tid]/len(tids),
Template.ALLTEMPLATES[int(tid)],
)
print(s)
def print_testtrain_stats():
- testscores = test_stats["rulescores"]
- print(
- "TEMPLATE STATISTICS (TEST AND TRAIN) ({0} templates, {1} rules)".format(
- len(template_counts), len(tids)
- )
- )
- print(
- "TEST ({tokencount:7d} tokens) initial {initialerrors:5d} {initialacc:.4f} "
- "final: {finalerrors:5d} {finalacc:.4f} ".format(**test_stats)
- )
- print(
- "TRAIN ({tokencount:7d} tokens) initial {initialerrors:5d} {initialacc:.4f} "
- "final: {finalerrors:5d} {finalacc:.4f} ".format(**train_stats)
+ testscores = test_stats['rulescores']
+ print("TEMPLATE STATISTICS (TEST AND TRAIN) ({0} templates, {1} rules)".format(
+ len(template_counts),
+ len(tids)),
)
+ print("TEST ({tokencount:7d} tokens) initial {initialerrors:5d} {initialacc:.4f} "
+ "final: {finalerrors:5d} {finalacc:.4f} ".format(**test_stats))
+ print("TRAIN ({tokencount:7d} tokens) initial {initialerrors:5d} {initialacc:.4f} "
+ "final: {finalerrors:5d} {finalacc:.4f} ".format(**train_stats))
weighted_testcounts = Counter()
for (tid, score) in zip(tids, testscores):
weighted_testcounts[tid] += score
tottestscores = sum(testscores)
head = "#ID | Score (test) | Score (train) | #Rules | Template"
print(head, "\n", "-" * len(head), sep="")
- test_tplscores = sorted(
- weighted_testcounts.items(), key=det_tplsort, reverse=True
- )
+ test_tplscores = sorted(weighted_testcounts.items(), key=det_tplsort, reverse=True)
for (tid, testscore) in test_tplscores:
s = "{0:s} |{1:5d} {2:6.3f} | {3:4d} {4:.3f} |{5:4d} {6:.3f} | {7:s}".format(
tid,
testscore,
- testscore / tottestscores,
+ testscore/tottestscores,
weighted_traincounts[tid],
- weighted_traincounts[tid] / tottrainscores,
+ weighted_traincounts[tid]/tottrainscores,
template_counts[tid],
- template_counts[tid] / len(tids),
+ template_counts[tid]/len(tids),
Template.ALLTEMPLATES[int(tid)],
)
print(s)
def print_unused_templates():
- usedtpls = set(int(tid) for tid in tids)
- unused = [
- (tid, tpl)
- for (tid, tpl) in enumerate(Template.ALLTEMPLATES)
- if tid not in usedtpls
- ]
+ usedtpls = set([int(tid) for tid in tids])
+ unused = [(tid, tpl) for (tid, tpl) in enumerate(Template.ALLTEMPLATES) if tid not in usedtpls]
print("UNUSED TEMPLATES ({0})".format(len(unused)))
for (tid, tpl) in unused:
:type gold: list of list of strings
:returns: tuple of (tagged_sequences, ordered list of rule scores (one for each rule))
"""
-
def counterrors(xs):
return sum(t[1] != g[1] for pair in zip(xs, gold) for (t, g) in zip(*pair))
-
testing_stats = {}
- testing_stats["tokencount"] = sum(len(t) for t in sequences)
- testing_stats["sequencecount"] = len(sequences)
+ testing_stats['tokencount'] = sum(len(t) for t in sequences)
+ testing_stats['sequencecount'] = len(sequences)
tagged_tokenses = [self._initial_tagger.tag(tokens) for tokens in sequences]
- testing_stats["initialerrors"] = counterrors(tagged_tokenses)
- testing_stats["initialacc"] = (
- 1 - testing_stats["initialerrors"] / testing_stats["tokencount"]
- )
+ testing_stats['initialerrors'] = counterrors(tagged_tokenses)
+ testing_stats['initialacc'] = 1 - testing_stats['initialerrors']/testing_stats['tokencount']
# Apply each rule to the entire corpus, in order
- errors = [testing_stats["initialerrors"]]
+ errors = [testing_stats['initialerrors']]
for rule in self._rules:
for tagged_tokens in tagged_tokenses:
rule.apply(tagged_tokens)
errors.append(counterrors(tagged_tokenses))
- testing_stats["rulescores"] = [
- err0 - err1 for (err0, err1) in zip(errors, errors[1:])
- ]
- testing_stats["finalerrors"] = errors[-1]
- testing_stats["finalacc"] = (
- 1 - testing_stats["finalerrors"] / testing_stats["tokencount"]
- )
+ testing_stats['rulescores'] = [err0 - err1 for (err0, err1) in zip(errors, errors[1:])]
+ testing_stats['finalerrors'] = errors[-1]
+ testing_stats['finalacc'] = 1 - testing_stats['finalerrors']/testing_stats['tokencount']
return (tagged_tokenses, testing_stats)
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function, division
+
import bisect
import textwrap
from collections import defaultdict
"""
A trainer for tbl taggers.
"""
-
- def __init__(
- self, initial_tagger, templates, trace=0, deterministic=None, ruleformat="str"
- ):
+ def __init__(self, initial_tagger, templates, trace=0,
+ deterministic=None, ruleformat="str"):
"""
Construct a Brill tagger from a baseline tagger and a
set of templates
"""
if deterministic is None:
- deterministic = trace > 0
+ deterministic = (trace > 0)
self._initial_tagger = initial_tagger
self._templates = templates
self._trace = trace
# Create a new copy of the training corpus, and run the
# initial tagger on it. We will progressively update this
# test corpus to look more like the training corpus.
- test_sents = [
- list(self._initial_tagger.tag(untag(sent))) for sent in train_sents
- ]
+ test_sents = [list(self._initial_tagger.tag(untag(sent)))
+ for sent in train_sents]
# Collect some statistics on the training process
trainstats = {}
- trainstats["min_acc"] = min_acc
- trainstats["min_score"] = min_score
- trainstats["tokencount"] = sum(len(t) for t in test_sents)
- trainstats["sequencecount"] = len(test_sents)
- trainstats["templatecount"] = len(self._templates)
- trainstats["rulescores"] = []
- trainstats["initialerrors"] = sum(
+ trainstats['min_acc'] = min_acc
+ trainstats['min_score'] = min_score
+ trainstats['tokencount'] = sum(len(t) for t in test_sents)
+ trainstats['sequencecount'] = len(test_sents)
+ trainstats['templatecount'] = len(self._templates)
+ trainstats['rulescores'] = []
+ trainstats['initialerrors'] = sum(
tag[1] != truth[1]
for paired in zip(test_sents, train_sents)
for (tag, truth) in zip(*paired)
)
- trainstats["initialacc"] = (
- 1 - trainstats["initialerrors"] / trainstats["tokencount"]
- )
+ trainstats['initialacc'] = 1 - trainstats['initialerrors']/trainstats['tokencount']
if self._trace > 0:
- print(
- "TBL train (fast) (seqs: {sequencecount}; tokens: {tokencount}; "
- "tpls: {templatecount}; min score: {min_score}; min acc: {min_acc})".format(
- **trainstats
- )
- )
+ print("TBL train (fast) (seqs: {sequencecount}; tokens: {tokencount}; "
+ "tpls: {templatecount}; min score: {min_score}; min acc: {min_acc})".format(**trainstats))
# Initialize our mappings. This will find any errors made
# by the initial tagger, and use those to generate repair
print("Finding initial useful rules...")
self._init_mappings(test_sents, train_sents)
if self._trace:
- print((" Found {} useful rules.".format(len(self._rule_scores))))
+ print((" Found %d useful rules." % len(self._rule_scores)))
# Let the user know what we're up to.
if self._trace > 2:
# Repeatedly select the best rule, and add it to `rules`.
rules = []
try:
- while len(rules) < max_rules:
+ while (len(rules) < max_rules):
# Find the best rule, and add it to our rule list.
rule = self._best_rule(train_sents, test_sents, min_score, min_acc)
if rule:
rules.append(rule)
score = self._rule_scores[rule]
- trainstats["rulescores"].append(score)
+ trainstats['rulescores'].append(score)
else:
break # No more good rules left!
# The user can cancel training manually:
except KeyboardInterrupt:
- print("Training stopped manually -- {} rules found".format(len(rules)))
+ print("Training stopped manually -- %d rules found" % len(rules))
# Discard our tag position mapping & rule mappings.
self._clean()
- trainstats["finalerrors"] = trainstats["initialerrors"] - sum(
- trainstats["rulescores"]
- )
- trainstats["finalacc"] = (
- 1 - trainstats["finalerrors"] / trainstats["tokencount"]
- )
+ trainstats['finalerrors'] = trainstats['initialerrors'] - sum(trainstats['rulescores'])
+ trainstats['finalacc'] = 1 - trainstats['finalerrors']/trainstats['tokencount']
# Create and return a tagger from the rules we found.
return BrillTagger(self._initial_tagger, rules, trainstats)
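# A minimal end-to-end sketch of the trainer above (assumes nltk plus the
# brown corpus are available; sizes kept small for illustration):
from nltk.corpus import brown
from nltk.tag import UnigramTagger, BrillTaggerTrainer
from nltk.tag.brill import brill24

train_sents = brown.tagged_sents(categories="news")[:500]
trainer = BrillTaggerTrainer(UnigramTagger(train_sents), brill24(), trace=1)
brill_tagger = trainer.train(train_sents, max_rules=10)
print(brill_tagger.evaluate(brown.tagged_sents(categories="news")[500:600]))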
correct_tag = train_sents[sentnum][wordnum][1]
if tag != correct_tag:
for rule in self._find_rules(sent, wordnum, correct_tag):
- self._update_rule_applies(rule, sentnum, wordnum, train_sents)
+ self._update_rule_applies(rule, sentnum, wordnum,
+ train_sents)
def _clean(self):
self._tag_positions = None
for i in range(start, len(positions)):
sentnum, wordnum = positions[i]
if rule.applies(test_sents[sentnum], wordnum):
- self._update_rule_applies(rule, sentnum, wordnum, train_sents)
+ self._update_rule_applies(rule, sentnum, wordnum,
+ train_sents)
if self._rule_scores[rule] < max_score:
- self._first_unknown_position[rule] = (sentnum, wordnum + 1)
+ self._first_unknown_position[rule] = (sentnum,
+ wordnum+1)
break # The update demoted the rule.
if self._rule_scores[rule] == max_score:
num_broken = len([c for c in changes if c == -1])
# acc here is fixed/(fixed+broken); could also be
# fixed/(fixed+broken+other) == num_fixed/len(changes)
- acc = num_fixed / (num_fixed + num_broken)
+ acc = num_fixed/(num_fixed+num_broken)
if acc >= min_acc:
return rule
# else: rule too inaccurate, discard and try next
# Check if the change causes our templates to propose any
# new rules for this position.
for template in self._templates:
- for new_rule in template.applicable_rules(
- test_sent, wordnum, correct_tag
- ):
+ for new_rule in template.applicable_rules(test_sent, wordnum,
+ correct_tag):
if new_rule not in old_rules:
num_new += 1
if new_rule not in self._rule_scores:
num_unseen += 1
old_rules.add(new_rule)
- self._update_rule_applies(
- new_rule, sentnum, wordnum, train_sents
- )
+ self._update_rule_applies(new_rule, sentnum,
+ wordnum, train_sents)
# We may have caused other rules to match here, that are
# not proposed by our templates -- in particular, rules
if new_rule not in old_rules:
num_new += 1
if new_rule.applies(test_sent, wordnum):
- self._update_rule_applies(
- new_rule, sentnum, wordnum, train_sents
- )
+ self._update_rule_applies(new_rule, sentnum,
+ wordnum, train_sents)
if self._trace > 3:
self._trace_update_rules(num_obsolete, num_new, num_unseen)
# Tracing
def _trace_header(self):
- print(
- """
+ print("""
B |
S F r O | Score = Fixed - Broken
c i o t | R Fixed = num tags changed incorrect -> correct
r e e e | l Other = num tags changed incorrect -> incorrect
e d n r | e
------------------+-------------------------------------------------------
- """.rstrip()
- )
+ """.rstrip())
def _trace_rule(self, rule):
assert self._rule_scores[rule] == sum(self._positions_by_rule[rule].values())
rulestr = rule.format(self._ruleformat)
if self._trace > 2:
- print(
- "{:4d}{:4d}{:4d}{:4d} |".format(score, num_fixed, num_broken, num_other), end=" "
- )
- print(
- textwrap.fill(
- rulestr,
- initial_indent=" " * 20,
- width=79,
- subsequent_indent=" " * 18 + "| ",
- ).strip()
- )
+ print('%4d%4d%4d%4d |' % (score, num_fixed, num_broken, num_other), end=' ')
+ print(textwrap.fill(rulestr, initial_indent=' '*20, width=79,
+ subsequent_indent=' '*18+'| ').strip())
else:
print(rulestr)
def _trace_apply(self, num_updates):
- prefix = " " * 18 + "|"
+ prefix = ' '*18+'|'
print(prefix)
- print(prefix, "Applying rule to {} positions.".format(num_updates))
+ print(prefix, 'Applying rule to %d positions.' % num_updates)
def _trace_update_rules(self, num_obsolete, num_new, num_unseen):
- prefix = " " * 18 + "|"
- print(prefix, "Updated rule tables:")
- print(prefix, (" - {} rule applications removed".format(num_obsolete)))
- print(
- prefix,
- (" - {} rule applications added ({} novel)".format(num_new, num_unseen)),
- )
+ prefix = ' '*18+'|'
+ print(prefix, 'Updated rule tables:')
+ print(prefix, (' - %d rule applications removed' % num_obsolete))
+ print(prefix, (' - %d rule applications added (%d novel)' %
+ (num_new, num_unseen)))
print(prefix)
+
+
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to the CRFSuite Tagger
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Long Duong <longdt219@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
A module for POS tagging using CRFSuite
"""
-
+from __future__ import absolute_import
+from __future__ import unicode_literals
import unicodedata
-import re
+import re
from nltk.tag.api import TaggerI
try:
import pycrfsuite
except ImportError:
pass
-
class CRFTagger(TaggerI):
"""
A module for POS tagging using CRFSuite https://pypi.python.org/pypi/python-crfsuite
-
+
>>> from nltk.tag import CRFTagger
>>> ct = CRFTagger()
-
+
>>> train_data = [[('University','Noun'), ('is','Verb'), ('a','Det'), ('good','Adj'), ('place','Noun')],
... [('dog','Noun'),('eat','Verb'),('meat','Noun')]]
-
+
>>> ct.train(train_data,'model.crf.tagger')
>>> ct.tag_sents([['dog','is','good'], ['Cat','eat','meat']])
[[('dog', 'Noun'), ('is', 'Verb'), ('good', 'Adj')], [('Cat', 'Noun'), ('eat', 'Verb'), ('meat', 'Noun')]]
-
- >>> gold_sentences = [[('dog','Noun'),('is','Verb'),('good','Adj')] , [('Cat','Noun'),('eat','Verb'), ('meat','Noun')]]
- >>> ct.evaluate(gold_sentences)
+
+ >>> gold_sentences = [[('dog','Noun'),('is','Verb'),('good','Adj')] , [('Cat','Noun'),('eat','Verb'), ('meat','Noun')]]
+ >>> ct.evaluate(gold_sentences)
1.0
-
- Setting learned model file
- >>> ct = CRFTagger()
+
+ Setting learned model file
+ >>> ct = CRFTagger()
>>> ct.set_model_file('model.crf.tagger')
>>> ct.evaluate(gold_sentences)
1.0
-
+
"""
-
- def __init__(self, feature_func=None, verbose=False, training_opt={}):
+
+
+ def __init__(self, feature_func = None, verbose = False, training_opt = {}):
"""
- Initialize the CRFSuite tagger
- :param feature_func: The function that extracts features for each token of a sentence. This function should take
- 2 parameters: tokens and index which extract features at index position from tokens list. See the build in
- _get_features function for more detail.
+ Initialize the CRFSuite tagger
+ :param feature_func: The function that extracts features for each token of a sentence. This function should take
+ two parameters, tokens and index, and extract features at the index position of the tokens list. See the built-in
+ _get_features function for more detail.
:param verbose: output the debugging messages during training.
- :type verbose: boolean
+ :type verbose: boolean
:param training_opt: python-crfsuite training options
- :type training_opt : dictionary
-
- Set of possible training options (using LBFGS training algorithm).
+ :type training_opt : dictionary
+
+ Set of possible training options (using LBFGS training algorithm).
'feature.minfreq' : The minimum frequency of features.
'feature.possible_states' : Force to generate possible state features.
'feature.possible_transitions' : Force to generate possible transition features.
{ 'MoreThuente': More and Thuente's method,
'Backtracking': Backtracking method with regular Wolfe condition,
'StrongBacktracking': Backtracking method with strong Wolfe condition
- }
+ }
'max_linesearch' : The maximum number of trials for the line search algorithm.
-
+
"""
-
- self._model_file = ""
+
+ self._model_file = ''
self._tagger = pycrfsuite.Tagger()
-
+
if feature_func is None:
- self._feature_func = self._get_features
+ self._feature_func = self._get_features
else:
- self._feature_func = feature_func
-
- self._verbose = verbose
+ self._feature_func = feature_func
+
+ self._verbose = verbose
self._training_options = training_opt
- self._pattern = re.compile(r"\d")
-
+ self._pattern = re.compile(r'\d')
+
def set_model_file(self, model_file):
self._model_file = model_file
self._tagger.open(self._model_file)
-
+
def _get_features(self, tokens, idx):
"""
- Extract basic features about this word including
- - Current Word
+ Extract basic features about this word including
+ - Current Word
- Is Capitalized ?
- Has Punctuation ?
- Has Number ?
- Suffixes up to length 3
- Note that : we might include feature over previous word, next word ect.
-
+ Note that we might also include features over the previous word, the next word, etc.
+
:return : a list which contains the features
- :rtype : list(str)
-
- """
+ :rtype : list(str)
+
+ """
token = tokens[idx]
-
+
feature_list = []
-
+
if not token:
return feature_list
-
- # Capitalization
+
+ # Capitalization
if token[0].isupper():
- feature_list.append("CAPITALIZATION")
-
- # Number
+ feature_list.append('CAPITALIZATION')
+
+ # Number
if re.search(self._pattern, token) is not None:
- feature_list.append("HAS_NUM")
-
+ feature_list.append('HAS_NUM')
+
# Punctuation
punc_cat = set(["Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po"])
- if all(unicodedata.category(x) in punc_cat for x in token):
- feature_list.append("PUNCTUATION")
-
+ if all(unicodedata.category(x) in punc_cat for x in token):
+ feature_list.append('PUNCTUATION')
+
# Suffix up to length 3
if len(token) > 1:
- feature_list.append("SUF_" + token[-1:])
- if len(token) > 2:
- feature_list.append("SUF_" + token[-2:])
- if len(token) > 3:
- feature_list.append("SUF_" + token[-3:])
-
- feature_list.append("WORD_" + token)
-
+ feature_list.append('SUF_' + token[-1:])
+ if len(token) > 2:
+ feature_list.append('SUF_' + token[-2:])
+ if len(token) > 3:
+ feature_list.append('SUF_' + token[-3:])
+
+ feature_list.append('WORD_' + token)
+
return feature_list
-
+
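The feature_func hook documented in __init__ above can replace _get_features entirely. A minimal sketch of a user-supplied function with the documented (tokens, idx) -> list(str) signature; the feature names are invented for illustration, and pycrfsuite must be installed for the constructor call:

def shape_features(tokens, idx):
    token = tokens[idx]
    feats = ['WORD_' + token, 'SUF_' + token[-2:]]
    if token.istitle():
        feats.append('TITLECASE')
    return feats

ct = CRFTagger(feature_func=shape_features)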
def tag_sents(self, sents):
- """
- Tag a list of sentences. NB before using this function, user should specify the mode_file either by
- - Train a new model using ``train'' function
- - Use the pre-trained model which is set via ``set_model_file'' function
- :params sentences : list of sentences needed to tag.
+ '''
+ Tag a list of sentences. NB: before using this function, the user should specify the model_file either by
+ - training a new model using the ``train'' function, or
+ - using a pre-trained model set via the ``set_model_file'' function
+ :params sentences : list of sentences needed to tag.
:type sentences : list(list(str))
- :return : list of tagged sentences.
- :rtype : list (list (tuple(str,str)))
- """
- if self._model_file == "":
- raise Exception(
- " No model file is found !! Please use train or set_model_file function"
- )
-
+ :return : list of tagged sentences.
+ :rtype : list (list (tuple(str,str)))
+ '''
+ if self._model_file == '':
+ raise Exception(' No model file is found !! Please use train or set_model_file function')
+
# We need the list of sentences instead of the list generator for matching the input and output
- result = []
+ result = []
for tokens in sents:
- features = [self._feature_func(tokens, i) for i in range(len(tokens))]
+ features = [self._feature_func(tokens,i) for i in range(len(tokens))]
labels = self._tagger.tag(features)
-
+
if len(labels) != len(tokens):
- raise Exception(" Predicted Length Not Matched, Expect Errors !")
-
- tagged_sent = list(zip(tokens, labels))
+ raise Exception(' Predicted Length Not Matched, Expect Errors !')
+
+ tagged_sent = list(zip(tokens,labels))
result.append(tagged_sent)
-
- return result
-
+
+ return result
+
def train(self, train_data, model_file):
- """
- Train the CRF tagger using CRFSuite
- :params train_data : is the list of annotated sentences.
+ '''
+ Train the CRF tagger using CRFSuite
+ :params train_data : the list of annotated sentences.
:type train_data : list (list(tuple(str,str)))
- :params model_file : the model will be saved to this file.
-
- """
+ :params model_file : the model will be saved to this file.
+
+ '''
trainer = pycrfsuite.Trainer(verbose=self._verbose)
trainer.set_params(self._training_options)
-
+
for sent in train_data:
- tokens, labels = zip(*sent)
- features = [self._feature_func(tokens, i) for i in range(len(tokens))]
- trainer.append(features, labels)
-
+ tokens,labels = zip(*sent)
+ features = [self._feature_func(tokens,i) for i in range(len(tokens))]
+ trainer.append(features,labels)
+
# Now train the model, the output should be model_file
trainer.train(model_file)
# Save the model file
- self.set_model_file(model_file)
+ self.set_model_file(model_file)
def tag(self, tokens):
- """
- Tag a sentence using Python CRFSuite Tagger. NB before using this function, user should specify the mode_file either by
- - Train a new model using ``train'' function
- - Use the pre-trained model which is set via ``set_model_file'' function
- :params tokens : list of tokens needed to tag.
+ '''
+ Tag a sentence using the Python CRFSuite tagger. NB: before using this function, the user should specify the model_file either by
+ - training a new model using the ``train'' function, or
+ - using a pre-trained model set via the ``set_model_file'' function
+ :params tokens : list of tokens needed to tag.
:type tokens : list(str)
- :return : list of tagged tokens.
- :rtype : list (tuple(str,str))
- """
-
+ :return : list of tagged tokens.
+ :rtype : list (tuple(str,str))
+ '''
+
return self.tag_sents([tokens])[0]
+
# Natural Language Toolkit: Hidden Markov Model
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Trevor Cohn <tacohn@csse.unimelb.edu.au>
# Philip Blunsom <pcbl@csse.unimelb.edu.au>
# Tiago Tresoldi <tiago@tresoldi.pro.br> (fixes)
For more information, please consult the source code for this module,
which includes extensive demonstration code.
"""
+from __future__ import print_function, unicode_literals, division
import re
import itertools
+from six.moves import map, zip
+
try:
import numpy as np
except ImportError:
pass
-from nltk.probability import (
- FreqDist,
- ConditionalFreqDist,
- ConditionalProbDist,
- DictionaryProbDist,
- DictionaryConditionalProbDist,
- LidstoneProbDist,
- MutableProbDist,
- MLEProbDist,
- RandomProbDist,
-)
+from nltk.probability import (FreqDist, ConditionalFreqDist,
+ ConditionalProbDist, DictionaryProbDist,
+ DictionaryConditionalProbDist,
+ LidstoneProbDist, MutableProbDist,
+ MLEProbDist, RandomProbDist)
from nltk.metrics import accuracy
from nltk.util import LazyMap, unique_list
+from nltk.compat import python_2_unicode_compatible
from nltk.tag.api import TaggerI
_TEXT = 0 # index of text in a tuple
-_TAG = 1 # index of tag in a tuple
-
+_TAG = 1 # index of tag in a tuple
def _identity(labeled_symbols):
return labeled_symbols
-
+@python_2_unicode_compatible
class HiddenMarkovModelTagger(TaggerI):
"""
Hidden Markov model class, a generative model for labelling sequence data.
instances, defaults to the identity function.
:type transform: callable
"""
-
- def __init__(
- self, symbols, states, transitions, outputs, priors, transform=_identity
- ):
+ def __init__(self, symbols, states, transitions, outputs, priors,
+ transform=_identity):
self._symbols = unique_list(symbols)
self._states = unique_list(states)
self._transitions = transitions
self._transform = transform
@classmethod
- def _train(
- cls,
- labeled_sequence,
- test_sequence=None,
- unlabeled_sequence=None,
- transform=_identity,
- estimator=None,
- **kwargs
- ):
+ def _train(cls, labeled_sequence, test_sequence=None,
+ unlabeled_sequence=None, transform=_identity,
+ estimator=None, **kwargs):
if estimator is None:
-
def estimator(fd, bins):
return LidstoneProbDist(fd, 0.1, bins)
labeled_sequence = LazyMap(transform, labeled_sequence)
- symbols = unique_list(word for sent in labeled_sequence for word, tag in sent)
- tag_set = unique_list(tag for sent in labeled_sequence for word, tag in sent)
+ symbols = unique_list(word for sent in labeled_sequence
+ for word, tag in sent)
+ tag_set = unique_list(tag for sent in labeled_sequence
+ for word, tag in sent)
trainer = HiddenMarkovModelTrainer(tag_set, symbols)
hmm = trainer.train_supervised(labeled_sequence, estimator=estimator)
- hmm = cls(
- hmm._symbols,
- hmm._states,
- hmm._transitions,
- hmm._outputs,
- hmm._priors,
- transform=transform,
- )
+ hmm = cls(hmm._symbols, hmm._states, hmm._transitions, hmm._outputs,
+ hmm._priors, transform=transform)
if test_sequence:
- hmm.test(test_sequence, verbose=kwargs.get("verbose", False))
+ hmm.test(test_sequence, verbose=kwargs.get('verbose', False))
if unlabeled_sequence:
- max_iterations = kwargs.get("max_iterations", 5)
- hmm = trainer.train_unsupervised(
- unlabeled_sequence, model=hmm, max_iterations=max_iterations
- )
+ max_iterations = kwargs.get('max_iterations', 5)
+ hmm = trainer.train_unsupervised(unlabeled_sequence, model=hmm,
+ max_iterations=max_iterations)
if test_sequence:
- hmm.test(test_sequence, verbose=kwargs.get("verbose", False))
+ hmm.test(test_sequence, verbose=kwargs.get('verbose', False))
return hmm
@classmethod
- def train(
- cls, labeled_sequence, test_sequence=None, unlabeled_sequence=None, **kwargs
- ):
+ def train(cls, labeled_sequence, test_sequence=None,
+ unlabeled_sequence=None, **kwargs):
"""
Train a new HiddenMarkovModelTagger using the given labeled and
unlabeled training instances. Testing will be performed if test
:param max_iterations: number of Baum-Welch iterations to perform
:type max_iterations: int
"""
- return cls._train(labeled_sequence, test_sequence, unlabeled_sequence, **kwargs)
+ return cls._train(labeled_sequence, test_sequence,
+ unlabeled_sequence, **kwargs)
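A minimal usage sketch of this classmethod (assuming the brown corpus is downloaded; the 500-sentence slice is arbitrary):

from nltk.corpus import brown
from nltk.tag.hmm import HiddenMarkovModelTagger

train = brown.tagged_sents(categories='news')[:500]
hmm = HiddenMarkovModelTagger.train(train)
print(hmm.tag('the men attended the meeting'.split()))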
def probability(self, sequence):
"""
property, and optionally the TAG property
:type sequence: Token
"""
- return 2 ** (self.log_probability(self._transform(sequence)))
+ return 2**(self.log_probability(self._transform(sequence)))
def log_probability(self, sequence):
"""
if T > 0 and sequence[0][_TAG]:
last_state = sequence[0][_TAG]
- p = self._priors.logprob(last_state) + self._output_logprob(
- last_state, sequence[0][_TEXT]
- )
+ p = self._priors.logprob(last_state) + \
+ self._output_logprob(last_state, sequence[0][_TEXT])
for t in range(1, T):
state = sequence[t][_TAG]
- p += self._transitions[last_state].logprob(
- state
- ) + self._output_logprob(state, sequence[t][_TEXT])
+ p += self._transitions[last_state].logprob(state) + \
+ self._output_logprob(state, sequence[t][_TEXT])
last_state = state
return p
else:
alpha = self._forward_probability(sequence)
- p = logsumexp2(alpha[T - 1])
+ p = logsumexp2(alpha[T-1])
return p
def tag(self, unlabeled_sequence):
V[0] = P + O[:, S[unlabeled_sequence[0]]]
for t in range(1, T):
for j in range(N):
- vs = V[t - 1, :] + X[:, j]
+ vs = V[t-1, :] + X[:, j]
best = np.argmax(vs)
V[t, j] = vs[best] + O[j, S[unlabeled_sequence[t]]]
B[t, j] = best
- current = np.argmax(V[T - 1, :])
+ current = np.argmax(V[T-1,:])
sequence = [current]
- for t in range(T - 1, 0, -1):
+ for t in range(T-1, 0, -1):
last = B[t, current]
sequence.append(last)
current = last
# find the starting log probabilities for each state
symbol = unlabeled_sequence[0]
for i, state in enumerate(self._states):
- V[0, i] = self._priors.logprob(state) + self._output_logprob(state, symbol)
+ V[0, i] = self._priors.logprob(state) + \
+ self._output_logprob(state, symbol)
B[0, state] = None
# find the maximum log probabilities for reaching each state at time t
best = None
for i in range(N):
si = self._states[i]
- va = V[t - 1, i] + self._transitions[si].logprob(sj)
+ va = V[t-1, i] + self._transitions[si].logprob(sj)
if not best or va > best[0]:
best = (va, si)
V[t, j] = best[0] + self._output_logprob(sj, symbol)
# find the highest probability final state
best = None
for i in range(N):
- val = V[T - 1, i]
+ val = V[T-1, i]
if not best or val > best[0]:
best = (val, self._states[i])
# traverse the back-pointers B to find the state sequence
current = best[1]
sequence = [current]
- for t in range(T - 1, 0, -1):
+ for t in range(T-1, 0, -1):
last = B[t, current]
sequence.append(last)
current = last
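Both backtrace loops above implement the standard Viterbi recurrence. A self-contained sketch of the same recurrence in log2 space, with made-up 2-state transition and emission tables rather than the model's own distributions:

import numpy as np

X = np.log2([[0.7, 0.3], [0.4, 0.6]])   # X[i, j] = log2 p(state j | state i)
O = np.log2([[0.9, 0.1], [0.2, 0.8]])   # O[j, k] = log2 p(symbol k | state j)
P = np.log2([0.6, 0.4])                 # prior log probabilities
obs = [0, 1, 1]                         # observed symbol indices
T, N = len(obs), 2
V = np.zeros((T, N))
B = np.zeros((T, N), int)
V[0] = P + O[:, obs[0]]
for t in range(1, T):
    for j in range(N):
        vs = V[t - 1] + X[:, j]         # best way to arrive in state j
        B[t, j] = np.argmax(vs)
        V[t, j] = vs.max() + O[j, obs[t]]
state = int(np.argmax(V[T - 1]))        # best final state, then backtrace
path = [state]
for t in range(T - 1, 0, -1):
    state = int(B[t, state])
    path.append(state)
path.reverse()
print(path)                             # [0, 1, 1] for these tables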
# sample the starting state and symbol prob dists
tokens = []
state = self._sample_probdist(self._priors, rng.random(), self._states)
- symbol = self._sample_probdist(
- self._outputs[state], rng.random(), self._symbols
- )
+ symbol = self._sample_probdist(self._outputs[state],
+ rng.random(), self._symbols)
tokens.append((symbol, state))
for i in range(1, length):
# sample the state transition and symbol prob dists
- state = self._sample_probdist(
- self._transitions[state], rng.random(), self._states
- )
- symbol = self._sample_probdist(
- self._outputs[state], rng.random(), self._symbols
- )
+ state = self._sample_probdist(self._transitions[state],
+ rng.random(), self._states)
+ symbol = self._sample_probdist(self._outputs[state],
+ rng.random(), self._symbols)
tokens.append((symbol, state))
return tokens
if cum_p <= p <= cum_p + add_p:
return sample
cum_p += add_p
- raise Exception("Invalid probability distribution - " "does not sum to one")
+ raise Exception('Invalid probability distribution - '
+ 'does not sum to one')
def entropy(self, unlabeled_sequence):
"""
alpha = self._forward_probability(unlabeled_sequence)
beta = self._backward_probability(unlabeled_sequence)
- normalisation = logsumexp2(alpha[T - 1])
+ normalisation = logsumexp2(alpha[T-1])
entropy = normalisation
# starting state, t = 0
for i, state in enumerate(self._states):
- p = 2 ** (alpha[0, i] + beta[0, i] - normalisation)
+ p = 2**(alpha[0, i] + beta[0, i] - normalisation)
entropy -= p * self._priors.logprob(state)
- # print('p(s_0 = %s) =' % state, p)
+ #print 'p(s_0 = %s) =' % state, p
# state transitions
for t0 in range(T - 1):
t1 = t0 + 1
for i0, s0 in enumerate(self._states):
for i1, s1 in enumerate(self._states):
- p = 2 ** (
- alpha[t0, i0]
- + self._transitions[s0].logprob(s1)
- + self._outputs[s1].logprob(unlabeled_sequence[t1][_TEXT])
- + beta[t1, i1]
- - normalisation
- )
+ p = 2**(alpha[t0, i0] + self._transitions[s0].logprob(s1) +
+ self._outputs[s1].logprob(
+ unlabeled_sequence[t1][_TEXT]) +
+ beta[t1, i1] - normalisation)
entropy -= p * self._transitions[s0].logprob(s1)
- # print('p(s_%d = %s, s_%d = %s) =' % (t0, s0, t1, s1), p)
+ #print 'p(s_%d = %s, s_%d = %s) =' % (t0, s0, t1, s1), p
# symbol emissions
for t in range(T):
for i, state in enumerate(self._states):
- p = 2 ** (alpha[t, i] + beta[t, i] - normalisation)
+ p = 2**(alpha[t, i] + beta[t, i] - normalisation)
entropy -= p * self._outputs[state].logprob(
- unlabeled_sequence[t][_TEXT]
- )
- # print('p(s_%d = %s) =' % (t, state), p)
+ unlabeled_sequence[t][_TEXT])
+ #print 'p(s_%d = %s) =' % (t, state), p
return entropy
alpha = self._forward_probability(unlabeled_sequence)
beta = self._backward_probability(unlabeled_sequence)
- normalisation = logsumexp2(alpha[T - 1])
+ normalisation = logsumexp2(alpha[T-1])
entropies = np.zeros(T, np.float64)
probs = np.zeros(N, np.float64)
probs[s] = alpha[t, s] + beta[t, s] - normalisation
for s in range(N):
- entropies[t] -= 2 ** (probs[s]) * probs[s]
+ entropies[t] -= 2**(probs[s]) * probs[s]
return entropies
log_probs.append(lp)
normalisation = _log_add(*log_probs)
+ #ps = zeros((T, N), float64)
+ #for labelling, lp in zip(labellings, log_probs):
+ #for t in range(T):
+ #ps[t, self._states.index(labelling[t])] += \
+ # 2**(lp - normalisation)
+
+ #for t in range(T):
+ #print 'prob[%d] =' % t, ps[t]
+
entropy = 0
for lp in log_probs:
lp -= normalisation
- entropy -= 2 ** (lp) * lp
+ entropy -= 2**(lp) * lp
return entropy
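The loop above is the textbook -sum p * log2 p, applied to labelling probabilities normalised in log space. The same idea on toy numbers (the log2 scores here are invented, not from any model):

import numpy as np

log_probs = np.array([-1.0, -2.0, -2.0, -3.0])      # unnormalised log2 scores
normalisation = np.log2(np.sum(2.0 ** log_probs))   # log2 of the partition sum
entropy = 0.0
for lp in log_probs - normalisation:
    entropy -= 2.0 ** lp * lp                       # -sum p * log2 p
print(entropy)                                      # about 1.84 bits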
normalisation = _log_add(*log_probs)
- probabilities = _ninf_array((T, N))
+ probabilities = _ninf_array((T,N))
for labelling, lp in zip(labellings, log_probs):
lp -= normalisation
entropies = np.zeros(T, np.float64)
for t in range(T):
for s in range(N):
- entropies[t] -= 2 ** (probabilities[t, s]) * probabilities[t, s]
+ entropies[t] -= 2**(probabilities[t, s]) * probabilities[t, s]
return entropies
def _transitions_matrix(self):
""" Return a matrix of transition log probabilities. """
- trans_iter = (
- self._transitions[sj].logprob(si)
- for sj in self._states
- for si in self._states
- )
+ trans_iter = (self._transitions[sj].logprob(si)
+ for sj in self._states
+ for si in self._states)
transitions_logprob = np.fromiter(trans_iter, dtype=np.float64)
N = len(self._states)
# Initialization
symbol = unlabeled_sequence[0][_TEXT]
for i, state in enumerate(self._states):
- alpha[0, i] = self._priors.logprob(state) + self._output_logprob(
- state, symbol
- )
+ alpha[0, i] = self._priors.logprob(state) + \
+ self._output_logprob(state, symbol)
# Induction
for t in range(1, T):
output_logprob = self._outputs_vector(symbol)
for i in range(N):
- summand = alpha[t - 1] + transitions_logprob[i]
+ summand = alpha[t-1] + transitions_logprob[i]
alpha[t, i] = logsumexp2(summand) + output_logprob[i]
return alpha
# initialise the backward values;
# "1" is an arbitrarily chosen value from Rabiner tutorial
- beta[T - 1, :] = np.log2(1)
+ beta[T-1, :] = np.log2(1)
# inductively calculate remaining backward values
- for t in range(T - 2, -1, -1):
- symbol = unlabeled_sequence[t + 1][_TEXT]
+ for t in range(T-2, -1, -1):
+ symbol = unlabeled_sequence[t+1][_TEXT]
outputs = self._outputs_vector(symbol)
for i in range(N):
- summand = transitions_logprob[i] + beta[t + 1] + outputs
+ summand = transitions_logprob[i] + beta[t+1] + outputs
beta[t, i] = logsumexp2(summand)
return beta
if verbose:
for test_sent, predicted_sent in zip(test_sequence, predicted_sequence):
- print(
- "Test:",
- " ".join("%s/%s" % (token, tag) for (token, tag) in test_sent),
- )
+ print('Test:',
+ ' '.join('%s/%s' % (token, tag)
+ for (token, tag) in test_sent))
print()
- print("Untagged:", " ".join("%s" % token for (token, tag) in test_sent))
+ print('Untagged:',
+ ' '.join("%s" % token for (token, tag) in test_sent))
print()
- print(
- "HMM-tagged:",
- " ".join("%s/%s" % (token, tag) for (token, tag) in predicted_sent),
- )
+ print('HMM-tagged:',
+ ' '.join('%s/%s' % (token, tag)
+ for (token, tag) in predicted_sent))
print()
- print(
- "Entropy:",
- self.entropy([(token, None) for (token, tag) in predicted_sent]),
- )
+ print('Entropy:',
+ self.entropy([(token, None) for
+ (token, tag) in predicted_sent]))
print()
- print("-" * 60)
+ print('-' * 60)
test_tags = flatten(map(tags, test_sequence))
predicted_tags = flatten(map(tags, predicted_sequence))
acc = accuracy(test_tags, predicted_tags)
count = sum(len(sent) for sent in test_sequence)
- print("accuracy over %d tokens: %.2f" % (count, acc * 100))
+ print('accuracy over %d tokens: %.2f' % (count, acc * 100))
def __repr__(self):
- return "<HiddenMarkovModelTagger %d states and %d output symbols>" % (
- len(self._states),
- len(self._symbols),
- )
+ return ('<HiddenMarkovModelTagger %d states and %d output symbols>'
+ % (len(self._states), len(self._symbols)))
class HiddenMarkovModelTrainer(object):
:param symbols: the set of observation symbols
:type symbols: sequence of any
"""
-
def __init__(self, states=None, symbols=None):
- self._states = states if states else []
- self._symbols = symbols if symbols else []
+ self._states = (states if states else [])
+ self._symbols = (symbols if symbols else [])
- def train(self, labeled_sequences=None, unlabeled_sequences=None, **kwargs):
+ def train(self, labeled_sequences=None, unlabeled_sequences=None,
+ **kwargs):
"""
Trains the HMM using both (or either of) supervised and unsupervised
techniques.
:rtype: HiddenMarkovModelTagger
:param labelled_sequences: the supervised training data, a set of
labelled sequences of observations
- ex: [ (word_1, tag_1),...,(word_n,tag_n) ]
:type labelled_sequences: list
:param unlabeled_sequences: the unsupervised training data, a set of
sequences of observations
- ex: [ word_1, ..., word_n ]
:type unlabeled_sequences: list
:param kwargs: additional arguments to pass to the training methods
"""
if labeled_sequences:
model = self.train_supervised(labeled_sequences, **kwargs)
if unlabeled_sequences:
- if model:
- kwargs["model"] = model
+ if model: kwargs['model'] = model
model = self.train_unsupervised(unlabeled_sequences, **kwargs)
return model
+
def _baum_welch_step(self, sequence, model, symbol_to_number):
N = len(model._states)
beta = model._backward_probability(sequence)
# find the log probability of the sequence
- lpk = logsumexp2(alpha[T - 1])
+ lpk = logsumexp2(alpha[T-1])
A_numer = _ninf_array((N, N))
B_numer = _ninf_array((N, M))
symbol = sequence[t][_TEXT] # not found? FIXME
next_symbol = None
if t < T - 1:
- next_symbol = sequence[t + 1][_TEXT] # not found? FIXME
+ next_symbol = sequence[t+1][_TEXT] # not found? FIXME
xi = symbol_to_number[symbol]
next_outputs_logprob = model._outputs_vector(next_symbol)
alpha_plus_beta = alpha[t] + beta[t]
if t < T - 1:
- numer_add = (
- transitions_logprob
- + next_outputs_logprob
- + beta[t + 1]
- + alpha[t].reshape(N, 1)
- )
+ numer_add = transitions_logprob + next_outputs_logprob + \
+ beta[t+1] + alpha[t].reshape(N, 1)
A_numer = np.logaddexp2(A_numer, numer_add)
A_denom = np.logaddexp2(A_denom, alpha_plus_beta)
else:
B_denom = np.logaddexp2(A_denom, alpha_plus_beta)
- B_numer[:, xi] = np.logaddexp2(B_numer[:, xi], alpha_plus_beta)
+ B_numer[:,xi] = np.logaddexp2(B_numer[:,xi], alpha_plus_beta)
return lpk, A_numer, A_denom, B_numer, B_denom
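The numerators and denominators returned above are expected counts accumulated in log2 space: starting from -inf (a count of zero) and folding in each timestep's contribution with np.logaddexp2 is equivalent to summing in linear space. A small self-contained check with made-up values:

import numpy as np

counts = np.full(3, -np.inf)                        # log2 of zero counts
for contribution in ([0.0, -1.0, -2.0], [-1.0, -1.0, -1.0]):
    counts = np.logaddexp2(counts, contribution)
print(2 ** counts)                                  # back in linear space: [1.5, 1.0, 0.75]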
- def train_unsupervised(self, unlabeled_sequences, update_outputs=True, **kwargs):
+ def train_unsupervised(self, unlabeled_sequences, update_outputs=True,
+ **kwargs):
"""
Trains the HMM using the Baum-Welch algorithm to maximise the
probability of the data sequence. This is a variant of the EM
# create a uniform HMM, which will be iteratively refined, unless
# given an existing model
- model = kwargs.get("model")
+ model = kwargs.get('model')
if not model:
priors = RandomProbDist(self._states)
transitions = DictionaryConditionalProbDist(
- dict((state, RandomProbDist(self._states)) for state in self._states)
- )
+ dict((state, RandomProbDist(self._states))
+ for state in self._states))
outputs = DictionaryConditionalProbDist(
- dict((state, RandomProbDist(self._symbols)) for state in self._states)
- )
- model = HiddenMarkovModelTagger(
- self._symbols, self._states, transitions, outputs, priors
- )
+ dict((state, RandomProbDist(self._symbols))
+ for state in self._states))
+ model = HiddenMarkovModelTagger(self._symbols, self._states,
+ transitions, outputs, priors)
self._states = model._states
self._symbols = model._symbols
# model._priors = MutableProbDist(model._priors, self._states)
model._transitions = DictionaryConditionalProbDist(
- dict(
- (s, MutableProbDist(model._transitions[s], self._states))
- for s in self._states
- )
- )
+ dict((s, MutableProbDist(model._transitions[s], self._states))
+ for s in self._states))
if update_outputs:
model._outputs = DictionaryConditionalProbDist(
- dict(
- (s, MutableProbDist(model._outputs[s], self._symbols))
- for s in self._states
- )
- )
+ dict((s, MutableProbDist(model._outputs[s], self._symbols))
+ for s in self._states))
model.reset_cache()
converged = False
last_logprob = None
iteration = 0
- max_iterations = kwargs.get("max_iterations", 1000)
- epsilon = kwargs.get("convergence_logprob", 1e-6)
+ max_iterations = kwargs.get('max_iterations', 1000)
+ epsilon = kwargs.get('convergence_logprob', 1e-6)
while not converged and iteration < max_iterations:
A_numer = _ninf_array((N, N))
if not sequence:
continue
- (
- lpk,
- seq_A_numer,
- seq_A_denom,
- seq_B_numer,
- seq_B_denom,
- ) = self._baum_welch_step(sequence, model, symbol_numbers)
+ (lpk, seq_A_numer, seq_A_denom,
+ seq_B_numer, seq_B_denom) = self._baum_welch_step(sequence, model, symbol_numbers)
# add these sums to the global A and B values
for i in range(N):
- A_numer[i] = np.logaddexp2(A_numer[i], seq_A_numer[i] - lpk)
- B_numer[i] = np.logaddexp2(B_numer[i], seq_B_numer[i] - lpk)
+ A_numer[i] = np.logaddexp2(A_numer[i], seq_A_numer[i]-lpk)
+ B_numer[i] = np.logaddexp2(B_numer[i], seq_B_numer[i]-lpk)
- A_denom = np.logaddexp2(A_denom, seq_A_denom - lpk)
- B_denom = np.logaddexp2(B_denom, seq_B_denom - lpk)
+ A_denom = np.logaddexp2(A_denom, seq_A_denom-lpk)
+ B_denom = np.logaddexp2(B_denom, seq_B_denom-lpk)
logprob += lpk
if iteration > 0 and abs(logprob - last_logprob) < epsilon:
converged = True
- print("iteration", iteration, "logprob", logprob)
+ print('iteration', iteration, 'logprob', logprob)
iteration += 1
last_logprob = logprob
def logsumexp2(arr):
max_ = arr.max()
- return np.log2(np.sum(2 ** (arr - max_))) + max_
+ return np.log2(np.sum(2**(arr - max_))) + max_
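Subtracting the maximum before exponentiating is what keeps this numerically stable; a naive sum underflows for large negative log probabilities:

import numpy as np

arr = np.array([-10000.0, -10001.0])
naive = np.log2(np.sum(2.0 ** arr))     # 2.0**-10000 underflows to 0.0 -> -inf
stable = np.log2(np.sum(2.0 ** (arr - arr.max()))) + arr.max()
print(naive, stable)                    # -inf  vs  about -9999.415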
def _log_add(*values):
if x > -np.inf:
sum_diffs = 0
for value in values:
- sum_diffs += 2 ** (value - x)
+ sum_diffs += 2**(value - x)
return x + np.log2(sum_diffs)
else:
return x
A = cpd(A, states, states)
B = cpd(B, states, symbols)
pi = pd(pi, states)
- return HiddenMarkovModelTagger(
- symbols=symbols, states=states, transitions=A, outputs=B, priors=pi
- )
+ return HiddenMarkovModelTagger(symbols=symbols, states=states,
+ transitions=A, outputs=B, priors=pi)
def _market_hmm_example():
"""
Return an example HMM (described on page 381 of Huang et al.)
"""
- states = ["bull", "bear", "static"]
- symbols = ["up", "down", "unchanged"]
+ states = ['bull', 'bear', 'static']
+ symbols = ['up', 'down', 'unchanged']
A = np.array([[0.6, 0.2, 0.2], [0.5, 0.3, 0.2], [0.4, 0.1, 0.5]], np.float64)
B = np.array([[0.7, 0.1, 0.2], [0.1, 0.6, 0.3], [0.3, 0.3, 0.4]], np.float64)
pi = np.array([0.5, 0.2, 0.3], np.float64)
model, states, symbols = _market_hmm_example()
- print("Testing", model)
+ print('Testing', model)
- for test in [
- ["up", "up"],
- ["up", "down", "up"],
- ["down"] * 5,
- ["unchanged"] * 5 + ["up"],
- ]:
+ for test in [['up', 'up'], ['up', 'down', 'up'],
+ ['down'] * 5, ['unchanged'] * 5 + ['up']]:
sequence = [(t, None) for t in test]
- print("Testing with state sequence", test)
- print("probability =", model.probability(sequence))
- print("tagging = ", model.tag([word for (word, tag) in sequence]))
- print("p(tagged) = ", model.probability(sequence))
- print("H = ", model.entropy(sequence))
- print("H_exh = ", model._exhaustive_entropy(sequence))
- print("H(point) = ", model.point_entropy(sequence))
- print("H_exh(point)=", model._exhaustive_point_entropy(sequence))
+ print('Testing with state sequence', test)
+ print('probability =', model.probability(sequence))
+ print('tagging = ', model.tag([word for (word,tag) in sequence]))
+ print('p(tagged) = ', model.probability(sequence))
+ print('H = ', model.entropy(sequence))
+ print('H_exh = ', model._exhaustive_entropy(sequence))
+ print('H(point) = ', model.point_entropy(sequence))
+ print('H_exh(point)=', model._exhaustive_point_entropy(sequence))
print()
-
def load_pos(num_sents):
from nltk.corpus import brown
- sentences = brown.tagged_sents(categories="news")[:num_sents]
+ sentences = brown.tagged_sents(categories='news')[:num_sents]
- tag_re = re.compile(r"[*]|--|[^+*-]+")
+ tag_re = re.compile(r'[*]|--|[^+*-]+')
tag_set = set()
symbols = set()
for i in range(len(sentence)):
word, tag = sentence[i]
word = word.lower() # normalize
- symbols.add(word) # log this word
+ symbols.add(word) # log this word
# Clean up the tag.
tag = tag_re.match(tag).group()
tag_set.add(tag)
return cleaned_sentences, list(tag_set), list(symbols)
-
def demo_pos():
# demonstrates POS tagging using supervised training
print("HMM POS tagging demo")
print()
- print("Training HMM...")
+ print('Training HMM...')
labelled_sequences, tag_set, symbols = load_pos(20000)
trainer = HiddenMarkovModelTrainer(tag_set, symbols)
- hmm = trainer.train_supervised(
- labelled_sequences[10:],
- estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins),
- )
+ hmm = trainer.train_supervised(labelled_sequences[10:],
+ estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))
- print("Testing...")
+ print('Testing...')
hmm.test(labelled_sequences[:10], verbose=True)
-
def _untag(sentences):
unlabeled = []
for sentence in sentences:
unlabeled.append([(token[_TEXT], None) for token in sentence])
return unlabeled
-
-def demo_pos_bw(
- test=10, supervised=20, unsupervised=10, verbose=True, max_iterations=5
-):
+def demo_pos_bw(test=10, supervised=20, unsupervised=10, verbose=True,
+ max_iterations=5):
# demonstrates the Baum-Welch algorithm in POS tagging
print()
print("Baum-Welch demo for POS tagging")
print()
- print("Training HMM (supervised, %d sentences)..." % supervised)
+ print('Training HMM (supervised, %d sentences)...' % supervised)
sentences, tag_set, symbols = load_pos(test + supervised + unsupervised)
symbols.add(token[_TEXT])
trainer = HiddenMarkovModelTrainer(tag_set, list(symbols))
- hmm = trainer.train_supervised(
- sentences[test : test + supervised],
- estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins),
- )
+ hmm = trainer.train_supervised(sentences[test:test+supervised],
+ estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))
hmm.test(sentences[:test], verbose=verbose)
- print("Training (unsupervised, %d sentences)..." % unsupervised)
+ print('Training (unsupervised, %d sentences)...' % unsupervised)
# it's rather slow - so only use 10 samples by default
- unlabeled = _untag(sentences[test + supervised :])
- hmm = trainer.train_unsupervised(
- unlabeled, model=hmm, max_iterations=max_iterations
- )
+ unlabeled = _untag(sentences[test+supervised:])
+ hmm = trainer.train_unsupervised(unlabeled, model=hmm,
+ max_iterations=max_iterations)
hmm.test(sentences[:test], verbose=verbose)
-
def demo_bw():
# demo Baum Welch by generating some sequences and then performing
# unsupervised training on them
# generate some random sequences
training = []
import random
-
rng = random.Random()
rng.seed(0)
for i in range(10):
# train on those examples, starting with the model that generated them
trainer = HiddenMarkovModelTrainer(states, symbols)
- hmm = trainer.train_unsupervised(training, model=model, max_iterations=1000)
+ hmm = trainer.train_unsupervised(training, model=model,
+ max_iterations=1000)
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to the HunPos POS-tagger
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
# Dávid Márk Nemeskey <nemeskeyd@gmail.com> (modifications)
# Attila Zséder <zseder@gmail.com> (modifications)
import os
from subprocess import Popen, PIPE
+from six import text_type
+
from nltk.internals import find_binary, find_file
from nltk.tag.api import TaggerI
-_hunpos_url = "http://code.google.com/p/hunpos/"
+_hunpos_url = 'http://code.google.com/p/hunpos/'
-_hunpos_charset = "ISO-8859-1"
+_hunpos_charset = 'ISO-8859-1'
"""The default encoding used by hunpos: ISO-8859-1."""
-
class HunposTagger(TaggerI):
"""
A class for pos tagging with HunPos. The input is the paths to:
[('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'NN'), ('swallow', 'VB'), ('?', '.')]
"""
- def __init__(
- self, path_to_model, path_to_bin=None, encoding=_hunpos_charset, verbose=False
- ):
+ def __init__(self, path_to_model, path_to_bin=None,
+ encoding=_hunpos_charset, verbose=False):
"""
Starts the hunpos-tag executable and establishes a connection with it.
The caller must ensure that tokens are encoded in the right charset.
"""
self._closed = True
- hunpos_paths = [
- ".",
- "/usr/bin",
- "/usr/local/bin",
- "/opt/local/bin",
- "/Applications/bin",
- "~/bin",
- "~/Applications/bin",
- ]
+ hunpos_paths = ['.', '/usr/bin', '/usr/local/bin', '/opt/local/bin',
+ '/Applications/bin', '~/bin', '~/Applications/bin']
hunpos_paths = list(map(os.path.expanduser, hunpos_paths))
self._hunpos_bin = find_binary(
- "hunpos-tag",
- path_to_bin,
- env_vars=("HUNPOS_TAGGER",),
+ 'hunpos-tag', path_to_bin,
+ env_vars=('HUNPOS_TAGGER',),
searchpath=hunpos_paths,
url=_hunpos_url,
- verbose=verbose,
+ verbose=verbose
)
self._hunpos_model = find_file(
- path_to_model, env_vars=("HUNPOS_TAGGER",), verbose=verbose
- )
+ path_to_model, env_vars=('HUNPOS_TAGGER',), verbose=verbose)
self._encoding = encoding
- self._hunpos = Popen(
- [self._hunpos_bin, self._hunpos_model],
- shell=False,
- stdin=PIPE,
- stdout=PIPE,
- stderr=PIPE,
- )
+ self._hunpos = Popen([self._hunpos_bin, self._hunpos_model],
+ shell=False, stdin=PIPE, stdout=PIPE, stderr=PIPE)
self._closed = False
def __del__(self):
def __enter__(self):
return self
-
def __exit__(self, exc_type, exc_value, traceback):
self.close()
"""
for token in tokens:
assert "\n" not in token, "Tokens should not contain newlines"
- if isinstance(token, str):
+ if isinstance(token, text_type):
token = token.encode(self._encoding)
self._hunpos.stdin.write(token + b"\n")
# We write a final empty line to tell hunpos that the sentence is finished:
tagged_tokens = []
for token in tokens:
tagged = self._hunpos.stdout.readline().strip().split(b"\t")
- tag = tagged[1] if len(tagged) > 1 else None
+ tag = (tagged[1] if len(tagged) > 1 else None)
tagged_tokens.append((token, tag))
# We have to read (and dismiss) the final empty line:
self._hunpos.stdout.readline()
return tagged_tokens
-
# skip doctests if Hunpos tagger is not installed
def setup_module(module):
from nose import SkipTest
-
try:
- HunposTagger("en_wsj.model")
+ HunposTagger('en_wsj.model')
except LookupError:
raise SkipTest("HunposTagger is not available")
# Natural Language Toolkit: Tagset Mapping
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Nathan Schneider <nathan@cmu.edu>
# Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
"""
+from __future__ import print_function, unicode_literals, division
from collections import defaultdict
from os.path import join
from nltk.data import load
_UNIVERSAL_DATA = "taggers/universal_tagset"
-_UNIVERSAL_TAGS = (
- "VERB",
- "NOUN",
- "PRON",
- "ADJ",
- "ADV",
- "ADP",
- "CONJ",
- "DET",
- "NUM",
- "PRT",
- "X",
- ".",
-)
+_UNIVERSAL_TAGS = ('VERB','NOUN','PRON','ADJ','ADV','ADP','CONJ','DET','NUM','PRT','X','.')
# _MAPPINGS = defaultdict(lambda: defaultdict(dict))
# the mapping between tagset T1 and T2 returns UNK if applied to an unrecognized tag
-_MAPPINGS = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: "UNK")))
+_MAPPINGS = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 'UNK')))
def _load_universal_map(fileid):
- contents = load(join(_UNIVERSAL_DATA, fileid + ".map"), format="text")
+ contents = load(join(_UNIVERSAL_DATA, fileid+'.map'), format="text")
# When mapping to the Universal Tagset,
# map unknown inputs to 'X' not 'UNK'
- _MAPPINGS[fileid]["universal"].default_factory = lambda: "X"
+ _MAPPINGS[fileid]['universal'].default_factory = lambda: 'X'
for line in contents.splitlines():
line = line.strip()
- if line == "":
+ if line == '':
continue
- fine, coarse = line.split("\t")
+ fine, coarse = line.split('\t')
- assert coarse in _UNIVERSAL_TAGS, "Unexpected coarse tag: {}".format(coarse)
- assert (
- fine not in _MAPPINGS[fileid]["universal"]
- ), "Multiple entries for original tag: {}".format(fine)
+ assert coarse in _UNIVERSAL_TAGS, 'Unexpected coarse tag: {}'.format(coarse)
+ assert fine not in _MAPPINGS[fileid]['universal'], 'Multiple entries for original tag: {}'.format(fine)
- _MAPPINGS[fileid]["universal"][fine] = coarse
+ _MAPPINGS[fileid]['universal'][fine] = coarse
def tagset_mapping(source, target):
"""
if source not in _MAPPINGS or target not in _MAPPINGS[source]:
- if target == "universal":
+ if target == 'universal':
_load_universal_map(source)
- # Added the new Russian National Corpus mappings because the
- # Russian model for nltk.pos_tag() uses it.
- _MAPPINGS["ru-rnc-new"]["universal"] = {
- "A": "ADJ",
- "A-PRO": "PRON",
- "ADV": "ADV",
- "ADV-PRO": "PRON",
- "ANUM": "ADJ",
- "CONJ": "CONJ",
- "INTJ": "X",
- "NONLEX": ".",
- "NUM": "NUM",
- "PARENTH": "PRT",
- "PART": "PRT",
- "PR": "ADP",
- "PRAEDIC": "PRT",
- "PRAEDIC-PRO": "PRON",
- "S": "NOUN",
- "S-PRO": "PRON",
- "V": "VERB",
- }
-
return _MAPPINGS[source][target]
-
def map_tag(source, target, source_tag):
"""
Maps the tag from the source tagset to the target tagset.
"""
# we need a systematic approach to naming
- if target == "universal":
- if source == "wsj":
- source = "en-ptb"
- if source == "brown":
- source = "en-brown"
+ if target == 'universal':
+ if source == 'wsj':
+ source = 'en-ptb'
+ if source == 'brown':
+ source = 'en-brown'
return tagset_mapping(source, target)[source_tag]
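A minimal usage sketch (assuming the universal_tagset resource has been downloaded):

from nltk.tag import map_tag

print(map_tag('en-ptb', 'universal', 'NNS'))   # 'NOUN'
print(map_tag('wsj', 'universal', 'VBZ'))      # 'VERB' -- alias resolved above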
+
+
# -*- coding: utf-8 -*-
# This module is a port of the Textblob Averaged Perceptron Tagger
-# Author: Matthew Honnibal <honnibal+gh@gmail.com>,
+# Author: Matthew Honnibal <honnibal+gh@gmail.com>,
# Long Duong <longdt219@gmail.com> (NLTK port)
# URL: <https://github.com/sloria/textblob-aptagger>
# <http://nltk.org/>
#
# This module is provided under the terms of the MIT License.
+from __future__ import absolute_import
+from __future__ import print_function, division
+
import random
from collections import defaultdict
import pickle
from nltk.tag.api import TaggerI
from nltk.data import find, load
-
-from nltk import jsontags
-
-try:
- import numpy as np
-except ImportError:
- pass
+from nltk.compat import python_2_unicode_compatible
PICKLE = "averaged_perceptron_tagger.pickle"
-@jsontags.register_tag
-class AveragedPerceptron:
+class AveragedPerceptron(object):
- """An averaged perceptron, as implemented by Matthew Honnibal.
+ '''An averaged perceptron, as implemented by Matthew Honnibal.
See more implementation details here:
https://explosion.ai/blog/part-of-speech-pos-tagger-in-python
- """
+ '''
- json_tag = "nltk.tag.perceptron.AveragedPerceptron"
-
- def __init__(self, weights=None):
+ def __init__(self):
# Each feature gets its own weight vector, so weights is a dict-of-dicts
- self.weights = weights if weights else {}
+ self.weights = {}
self.classes = set()
# The accumulated values, for the averaging. These will be keyed by
# feature/class tuples
# Number of instances seen
self.i = 0
- def _softmax(self, scores):
- s = np.fromiter(scores.values(), dtype=float)
- exps = np.exp(s)
- return exps / np.sum(exps)
-
- def predict(self, features, return_conf=False):
- """Dot-product the features and current weights and return the best label."""
+ def predict(self, features):
+ '''Dot-product the features and current weights and return the best label.'''
scores = defaultdict(float)
for feat, value in features.items():
if feat not in self.weights or value == 0:
weights = self.weights[feat]
for label, weight in weights.items():
scores[label] += value * weight
-
# Do a secondary alphabetic sort, for stability
- best_label = max(self.classes, key=lambda label: (scores[label], label))
- # compute the confidence
- conf = max(self._softmax(scores)) if return_conf == True else None
-
- return best_label, conf
+ return max(self.classes, key=lambda label: (scores[label], label))
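A toy walk-through of the scoring loop above, with invented weights and features (two candidate labels, two active features):

weights = {'bias': {'NN': 0.3, 'VB': -0.1},
           'i suffix ing': {'NN': 0.2, 'VB': 0.8}}
features = {'bias': 1, 'i suffix ing': 1}
scores = {}
for feat, value in features.items():
    for label, weight in weights[feat].items():
        scores[label] = scores.get(label, 0.0) + value * weight
print(max(scores, key=lambda label: (scores[label], label)))  # 'VB': 0.7 beats 0.5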
def update(self, truth, guess, features):
- """Update the feature weights."""
-
+ '''Update the feature weights.'''
def upd_feat(c, f, w, v):
param = (f, c)
self._totals[param] += (self.i - self._tstamps[param]) * w
upd_feat(guess, f, weights.get(guess, 0.0), -1.0)
def average_weights(self):
- """Average weights from all iterations."""
+ '''Average weights from all iterations.'''
for feat, weights in self.weights.items():
new_feat_weights = {}
for clas, weight in weights.items():
self.weights[feat] = new_feat_weights
def save(self, path):
- """Save the pickled model weights."""
- with open(path, "wb") as fout:
+ '''Save the pickled model weights.'''
+ with open(path, 'wb') as fout:
return pickle.dump(dict(self.weights), fout)
def load(self, path):
- """Load the pickled model weights."""
+ '''Load the pickled model weights.'''
self.weights = load(path)
- def encode_json_obj(self):
- return self.weights
-
- @classmethod
- def decode_json_obj(cls, obj):
- return cls(obj)
-
-
-@jsontags.register_tag
+@python_2_unicode_compatible
class PerceptronTagger(TaggerI):
- """
+ '''
Greedy Averaged Perceptron tagger, as implemented by Matthew Honnibal.
See more implementation details here:
https://explosion.ai/blog/part-of-speech-pos-tagger-in-python
-
+
>>> from nltk.tag.perceptron import PerceptronTagger
- Train the model
-
+ Train the model
+
>>> tagger = PerceptronTagger(load=False)
-
+
>>> tagger.train([[('today','NN'),('is','VBZ'),('good','JJ'),('day','NN')],
... [('yes','NNS'),('it','PRP'),('beautiful','JJ')]])
-
+
>>> tagger.tag(['today','is','a','beautiful','day'])
[('today', 'NN'), ('is', 'PRP'), ('a', 'PRP'), ('beautiful', 'JJ'), ('day', 'NN')]
-
- Use the pretrain model (the default constructor)
-
+
+ Use the pretrained model (the default constructor)
+
>>> pretrain = PerceptronTagger()
-
+
>>> pretrain.tag('The quick brown fox jumps over the lazy dog'.split())
[('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN')]
-
+
>>> pretrain.tag("The red cat".split())
[('The', 'DT'), ('red', 'JJ'), ('cat', 'NN')]
- """
-
- json_tag = "nltk.tag.sequential.PerceptronTagger"
-
- START = ["-START-", "-START2-"]
- END = ["-END-", "-END2-"]
+ '''
+ START = ['-START-', '-START2-']
+ END = ['-END-', '-END2-']
+
def __init__(self, load=True):
- """
+ '''
:param load: Load the pickled model upon instantiation.
- """
+ '''
self.model = AveragedPerceptron()
self.tagdict = {}
self.classes = set()
if load:
- AP_MODEL_LOC = "file:" + str(
- find("taggers/averaged_perceptron_tagger/" + PICKLE)
- )
+ AP_MODEL_LOC = 'file:'+str(find('taggers/averaged_perceptron_tagger/'+PICKLE))
self.load(AP_MODEL_LOC)
- def tag(self, tokens, return_conf=False, use_tagdict=True):
- """
+ def tag(self, tokens):
+ '''
Tag tokenized sentences.
:params tokens: list of word
:type tokens: list(str)
- """
+ '''
prev, prev2 = self.START
output = []
-
+
context = self.START + [self.normalize(w) for w in tokens] + self.END
for i, word in enumerate(tokens):
- tag, conf = (
- (self.tagdict.get(word), 1.0) if use_tagdict == True else (None, None)
- )
+ tag = self.tagdict.get(word)
if not tag:
features = self._get_features(i, word, context, prev, prev2)
- tag, conf = self.model.predict(features, return_conf)
- output.append((word, tag, conf) if return_conf == True else (word, tag))
-
+ tag = self.model.predict(features)
+ output.append((word, tag))
prev2 = prev
prev = tag
return output
def train(self, sentences, save_loc=None, nr_iter=5):
- """Train a model from sentences, and save it at ``save_loc``. ``nr_iter``
+ '''Train a model from sentences, and save it at ``save_loc``. ``nr_iter``
controls the number of Perceptron training iterations.
:param sentences: A list or iterator of sentences, where each sentence
is a list of (words, tags) tuples.
:param save_loc: If not ``None``, saves a pickled model in this location.
:param nr_iter: Number of training iterations.
- """
+ '''
# We'd like to allow ``sentences`` to be either a list or an iterator,
# the latter being especially important for a large training dataset.
# Because ``self._make_tagdict(sentences)`` runs regardless, we make
# it populate ``self._sentences`` (a list) with all the sentences.
n = 0
for sentence in self._sentences:
words, tags = zip(*sentence)
-
+
prev, prev2 = self.START
- context = self.START + [self.normalize(w) for w in words] + self.END
+ context = self.START + [self.normalize(w) for w in words] \
+ + self.END
for i, word in enumerate(words):
guess = self.tagdict.get(word)
if not guess:
feats = self._get_features(i, word, context, prev, prev2)
- guess, _ = self.model.predict(feats)
+ guess = self.model.predict(feats)
self.model.update(tags[i], guess, feats)
prev2 = prev
prev = guess
self.model.average_weights()
# Pickle as a binary file
if save_loc is not None:
- with open(save_loc, "wb") as fout:
+ with open(save_loc, 'wb') as fout:
# changed protocol from -1 to 2 to make pickling Python 2 compatible
pickle.dump((self.model.weights, self.tagdict, self.classes), fout, 2)
+
def load(self, loc):
- """
+ '''
:param loc: Load a pickled model at location.
- :type loc: str
- """
+ :type loc: str
+ '''
self.model.weights, self.tagdict, self.classes = load(loc)
self.model.classes = self.classes
-
- def encode_json_obj(self):
- return self.model.weights, self.tagdict, list(self.classes)
-
- @classmethod
- def decode_json_obj(cls, obj):
- tagger = cls(load=False)
- tagger.model.weights, tagger.tagdict, tagger.classes = obj
- tagger.classes = set(tagger.classes)
- tagger.model.classes = tagger.classes
- return tagger
+
def normalize(self, word):
- """
+ '''
Normalization used in pre-processing.
- All words are lower cased
- Groups of digits of length 4 are represented as !YEAR;
- Other digits are represented as !DIGITS
:rtype: str
- """
- if "-" in word and word[0] != "-":
- return "!HYPHEN"
+ '''
+ if '-' in word and word[0] != '-':
+ return '!HYPHEN'
elif word.isdigit() and len(word) == 4:
- return "!YEAR"
+ return '!YEAR'
elif word[0].isdigit():
- return "!DIGITS"
+ return '!DIGITS'
else:
return word.lower()
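The four branches above, exercised directly (PerceptronTagger(load=False) skips loading the pickled model):

t = PerceptronTagger(load=False)
print(t.normalize('co-operate'))   # '!HYPHEN'
print(t.normalize('1984'))         # '!YEAR'
print(t.normalize('35mm'))         # '!DIGITS'
print(t.normalize('NLTK'))         # 'nltk'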
def _get_features(self, i, word, context, prev, prev2):
- """Map tokens into a feature representation, implemented as a
+ '''Map tokens into a feature representation, implemented as a
{hashable: int} dict. If the features change, a new model must be
trained.
- """
-
+ '''
def add(name, *args):
- features[" ".join((name,) + tuple(args))] += 1
+ features[' '.join((name,) + tuple(args))] += 1
i += len(self.START)
features = defaultdict(int)
# It's useful to have a constant feature, which acts sort of like a prior
- add("bias")
- add("i suffix", word[-3:])
- add("i pref1", word[0])
- add("i-1 tag", prev)
- add("i-2 tag", prev2)
- add("i tag+i-2 tag", prev, prev2)
- add("i word", context[i])
- add("i-1 tag+i word", prev, context[i])
- add("i-1 word", context[i - 1])
- add("i-1 suffix", context[i - 1][-3:])
- add("i-2 word", context[i - 2])
- add("i+1 word", context[i + 1])
- add("i+1 suffix", context[i + 1][-3:])
- add("i+2 word", context[i + 2])
+ add('bias')
+ add('i suffix', word[-3:])
+ add('i pref1', word[0])
+ add('i-1 tag', prev)
+ add('i-2 tag', prev2)
+ add('i tag+i-2 tag', prev, prev2)
+ add('i word', context[i])
+ add('i-1 tag+i word', prev, context[i])
+ add('i-1 word', context[i-1])
+ add('i-1 suffix', context[i-1][-3:])
+ add('i-2 word', context[i-2])
+ add('i+1 word', context[i+1])
+ add('i+1 suffix', context[i+1][-3:])
+ add('i+2 word', context[i+2])
return features
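A quick way to inspect the feature dict this method builds (the context list must be padded with START/END exactly as tag() and train() do above; _get_features is private, so this is for exploration only):

t = PerceptronTagger(load=False)
context = t.START + [t.normalize(w) for w in ['the', 'running', 'dog']] + t.END
feats = t._get_features(1, 'running', context, 'DT', '-START-')
print(sorted(feats))   # e.g. 'bias', 'i pref1 r', 'i suffix ing', 'i-1 tag DT', ...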
def _make_tagdict(self, sentences):
- """
+ '''
Make a tag dictionary for single-tag words.
:param sentences: A list of list of (word, tag) tuples.
- """
+ '''
counts = defaultdict(lambda: defaultdict(int))
for sentence in sentences:
self._sentences.append(sentence)
def _pc(n, d):
return (n / d) * 100
-
def _load_data_conll_format(filename):
- print("Read from file: ", filename)
- with open(filename, "rb") as fin:
+ print ('Read from file: ', filename)
+ with open(filename,'rb') as fin:
sentences = []
sentence = []
for line in fin.readlines():
line = line.strip()
- # print line
- if len(line) == 0:
+ #print line
+ if len(line) ==0:
sentences.append(sentence)
sentence = []
continue
- tokens = line.split("\t")
+ tokens = line.split('\t')
word = tokens[1]
tag = tokens[4]
- sentence.append((word, tag))
+ sentence.append((word,tag))
return sentences
-
def _get_pretrain_model():
# Train and test on English part of CoNLL data (WSJ part of Penn Treebank)
- # Train: section 2-11
+ # Train: section 2-11
# Test : section 23
tagger = PerceptronTagger()
- training = _load_data_conll_format("english_ptb_train.conll")
- testing = _load_data_conll_format("english_ptb_test.conll")
- print("Size of training and testing (sentence)", len(training), len(testing))
- # Train and save the model
- tagger.train(training, PICKLE)
- print("Accuracy : ", tagger.evaluate(testing))
-
-
-if __name__ == "__main__":
- # _get_pretrain_model()
+ training = _load_data_conll_format('english_ptb_train.conll')
+ testing = _load_data_conll_format('english_ptb_test.conll')
+ print ('Size of training and testing (sentence)', len(training), len(testing))
+ # Train and save the model
+ tagger.train(training, PICKLE)
+ print ('Accuracy : ',tagger.evaluate(testing))
+
+if __name__ == '__main__':
+ #_get_pretrain_model()
pass
# encoding: utf-8
# Natural Language Toolkit: Senna POS Tagger
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Rami Al-Rfou' <ralrfou@cs.stonybrook.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
('NY', 'B-LOC'), (',', 'O'), ('USA', 'B-LOC'), ('.', 'O')]
"""
+from nltk.compat import python_2_unicode_compatible
from nltk.classify import Senna
-
-
+@python_2_unicode_compatible
class SennaTagger(Senna):
- def __init__(self, path, encoding="utf-8"):
- super(SennaTagger, self).__init__(path, ["pos"], encoding)
+ def __init__(self, path, encoding='utf-8'):
+ super(SennaTagger, self).__init__(path, ['pos'], encoding)
def tag_sents(self, sentences):
"""
for i in range(len(tagged_sents)):
for j in range(len(tagged_sents[i])):
annotations = tagged_sents[i][j]
- tagged_sents[i][j] = (annotations["word"], annotations["pos"])
+ tagged_sents[i][j] = (annotations['word'], annotations['pos'])
return tagged_sents
-
-
+@python_2_unicode_compatible
class SennaChunkTagger(Senna):
- def __init__(self, path, encoding="utf-8"):
- super(SennaChunkTagger, self).__init__(path, ["chk"], encoding)
+ def __init__(self, path, encoding='utf-8'):
+ super(SennaChunkTagger, self).__init__(path, ['chk'], encoding)
def tag_sents(self, sentences):
"""
for i in range(len(tagged_sents)):
for j in range(len(tagged_sents[i])):
annotations = tagged_sents[i][j]
- tagged_sents[i][j] = (annotations["word"], annotations["chk"])
+ tagged_sents[i][j] = (annotations['word'], annotations['chk'])
return tagged_sents
def bio_to_chunks(self, tagged_sent, chunk_type):
current_chunk_position = []
for idx, word_pos in enumerate(tagged_sent):
word, pos = word_pos
- if "-" + chunk_type in pos: # Append the word to the current_chunk.
+ if '-'+chunk_type in pos: # Append the word to the current_chunk.
current_chunk.append((word))
current_chunk_position.append((idx))
else:
- if current_chunk: # Flush the full chunk when out of an NP.
- _chunk_str = " ".join(current_chunk)
- _chunk_pos_str = "-".join(map(str, current_chunk_position))
+ if current_chunk: # Flush the full chunk when out of an NP.
+ _chunk_str = ' '.join(current_chunk)
+ _chunk_pos_str = '-'.join(map(str, current_chunk_position))
yield _chunk_str, _chunk_pos_str
current_chunk = []
current_chunk_position = []
- if current_chunk: # Flush the last chunk.
- yield " ".join(current_chunk), "-".join(map(str, current_chunk_position))
-
+ if current_chunk: # Flush the last chunk.
+ yield ' '.join(current_chunk), '-'.join(map(str, current_chunk_position))
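# Usage sketch (illustrative, not part of the patch): with BIO output from
# the chunk tagger, e.g. tagged_sent = [('the', 'B-NP'), ('cat', 'I-NP'),
# ('sat', 'B-VP')], list(tagger.bio_to_chunks(tagged_sent, chunk_type='NP'))
# yields [('the cat', '0-1')] -- the chunk string plus the '-'-joined token
# positions it spans.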
+@python_2_unicode_compatible
class SennaNERTagger(Senna):
- def __init__(self, path, encoding="utf-8"):
- super(SennaNERTagger, self).__init__(path, ["ner"], encoding)
+ def __init__(self, path, encoding='utf-8'):
+ super(SennaNERTagger, self).__init__(path, ['ner'], encoding)
def tag_sents(self, sentences):
"""
for i in range(len(tagged_sents)):
for j in range(len(tagged_sents[i])):
annotations = tagged_sents[i][j]
- tagged_sents[i][j] = (annotations["word"], annotations["ner"])
+ tagged_sents[i][j] = (annotations['word'], annotations['ner'])
return tagged_sents
+
# skip doctests if Senna is not installed
def setup_module(module):
from nose import SkipTest
-
try:
- tagger = Senna("/usr/share/senna-v3.0", ["pos", "chk", "ner"])
+ tagger = Senna('/usr/share/senna-v3.0', ['pos', 'chk', 'ner'])
except OSError:
raise SkipTest("Senna executable not found")
+
# Natural Language Toolkit: Sequential Backoff Taggers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com> (minor additions)
# Tiago Tresoldi <tresoldi@users.sf.net> (original affix tagger)
consulted instead. Any SequentialBackoffTagger may serve as a
backoff tagger for any other SequentialBackoffTagger.
"""
-import ast
+from __future__ import print_function, unicode_literals
from abc import abstractmethod
import re
from nltk.probability import ConditionalFreqDist
from nltk.classify import NaiveBayesClassifier
+from nltk.compat import python_2_unicode_compatible
from nltk.tag.api import TaggerI, FeaturesetTaggerI
:ivar _taggers: A list of all the taggers that should be tried to
tag a token (i.e., self and its backoff taggers).
"""
-
def __init__(self, backoff=None):
if backoff is None:
self._taggers = [self]
"""
+@python_2_unicode_compatible
class ContextTagger(SequentialBackoffTagger):
"""
An abstract base class for sequential backoff taggers that choose
:ivar _context_to_tag: Dictionary mapping contexts to tags.
"""
-
def __init__(self, context_to_tag, backoff=None):
"""
:param context_to_tag: A dictionary mapping contexts to tags.
:param backoff: The backoff tagger that should be used for this tagger.
"""
- super().__init__(backoff)
- self._context_to_tag = context_to_tag if context_to_tag else {}
+ SequentialBackoffTagger.__init__(self, backoff)
+ self._context_to_tag = (context_to_tag if context_to_tag else {})
@abstractmethod
def context(self, tokens, index, history):
return len(self._context_to_tag)
def __repr__(self):
- return "<{}: size={}>".format(self.__class__.__name__, self.size())
+ return '<%s: size=%d>' % (self.__class__.__name__, self.size())
def _train(self, tagged_corpus, cutoff=0, verbose=False):
"""
continue
fd[context][tag] += 1
# If the backoff got it wrong, this context is useful:
- if self.backoff is None or tag != self.backoff.tag_one(
- tokens, index, tags[:index]
- ):
+ if (self.backoff is None or
+ tag != self.backoff.tag_one(
+ tokens, index, tags[:index])):
useful_contexts.add(context)
# Build the context_to_tag table -- for each context, figure
size = len(self._context_to_tag)
backoff = 100 - (hit_count * 100.0) / token_count
pruning = 100 - (size * 100.0) / len(fd.conditions())
- print("[Trained Unigram tagger:", end=" ")
- print("size={}, backoff={:.2f}%, pruning={:.2f}%]".format(size, backoff, pruning))
+ print("[Trained Unigram tagger:", end=' ')
+ print("size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
+ size, backoff, pruning))
######################################################################
# Tagger Classes
######################################################################
-
-
+@python_2_unicode_compatible
@jsontags.register_tag
class DefaultTagger(SequentialBackoffTagger):
"""
:type tag: str
"""
- json_tag = "nltk.tag.sequential.DefaultTagger"
+ json_tag = 'nltk.tag.sequential.DefaultTagger'
def __init__(self, tag):
self._tag = tag
- super().__init__(None)
+ SequentialBackoffTagger.__init__(self, None)
def encode_json_obj(self):
return self._tag
return self._tag # ignore token and history
def __repr__(self):
- return "<DefaultTagger: tag={}>".format(self._tag)
+ return '<DefaultTagger: tag=%s>' % self._tag
@jsontags.register_tag
fewer than *cutoff* times, then exclude it from the
context-to-tag table for the new tagger.
"""
- json_tag = "nltk.tag.sequential.NgramTagger"
+ json_tag = 'nltk.tag.sequential.NgramTagger'
-
- def __init__(
- self, n, train=None, model=None, backoff=None, cutoff=0, verbose=False
- ):
+ def __init__(self, n, train=None, model=None,
+ backoff=None, cutoff=0, verbose=False):
self._n = n
self._check_params(train, model)
- super().__init__(model, backoff)
+ ContextTagger.__init__(self, model, backoff)
if train:
self._train(train, cutoff, verbose)
def encode_json_obj(self):
- _context_to_tag = {repr(k): v for k, v in self._context_to_tag.items()}
- if "NgramTagger" in self.__class__.__name__:
- return self._n, _context_to_tag, self.backoff
- else:
- return _context_to_tag, self.backoff
+ return self._n, self._context_to_tag, self.backoff
@classmethod
def decode_json_obj(cls, obj):
- try:
- _n, _context_to_tag, backoff = obj
- except ValueError:
- _context_to_tag, backoff = obj
-
- if not _context_to_tag:
- return backoff
-
- _context_to_tag = {ast.literal_eval(k): v for k, v in _context_to_tag.items()}
-
- if "NgramTagger" in cls.__name__:
- return cls(_n, model=_context_to_tag, backoff=backoff)
- else:
- return cls(model=_context_to_tag, backoff=backoff)
+ _n, _context_to_tag, backoff = obj
+ return cls(_n, model=_context_to_tag, backoff=backoff)
def context(self, tokens, index, history):
- tag_context = tuple(history[max(0, index - self._n + 1) : index])
+ tag_context = tuple(history[max(0, index-self._n+1):index])
return tag_context, tokens[index]
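# Illustrative sketch (not part of the patch): for a trigram tagger
# (self._n == 3) at index 4 with history ['DT', 'NN', 'VBD', 'IN'], the
# context is (('VBD', 'IN'), tokens[4]) -- the previous n-1 tags plus the
# current word -- so the table lookup conditions on both.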
>>> test_sent = brown.sents(categories='news')[0]
>>> unigram_tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500])
>>> for tok, tag in unigram_tagger.tag(test_sent):
- ... print("({}, {}), ".format(tok, tag))
+ ... print("(%s, %s), " % (tok, tag))
(The, AT), (Fulton, NP-TL), (County, NN-TL), (Grand, JJ-TL),
(Jury, NN-TL), (said, VBD), (Friday, NR), (an, AT),
(investigation, NN), (of, IN), (Atlanta's, NP$), (recent, JJ),
:type cutoff: int
"""
- json_tag = "nltk.tag.sequential.UnigramTagger"
+ json_tag = 'nltk.tag.sequential.UnigramTagger'
- def __init__(self, train=None, model=None, backoff=None, cutoff=0, verbose=False):
- super().__init__(1, train, model, backoff, cutoff, verbose)
+
+ def __init__(self, train=None, model=None,
+ backoff=None, cutoff=0, verbose=False):
+ NgramTagger.__init__(self, 1, train, model,
+ backoff, cutoff, verbose)
+
+ def encode_json_obj(self):
+ return self._context_to_tag, self.backoff
+ @classmethod
+ def decode_json_obj(cls, obj):
+ _context_to_tag, backoff = obj
+ return cls(model=_context_to_tag, backoff=backoff)
def context(self, tokens, index, history):
return tokens[index]
in order not to use the backoff tagger
:type cutoff: int
"""
- json_tag = "nltk.tag.sequential.BigramTagger"
+ json_tag = 'nltk.tag.sequential.BigramTagger'
- def __init__(self, train=None, model=None, backoff=None, cutoff=0, verbose=False):
- super().__init__(2, train, model, backoff, cutoff, verbose)
+ def __init__(self, train=None, model=None,
+ backoff=None, cutoff=0, verbose=False):
+ NgramTagger.__init__(self, 2, train, model,
+ backoff, cutoff, verbose)
+ def encode_json_obj(self):
+ return self._context_to_tag, self.backoff
+
+ @classmethod
+ def decode_json_obj(cls, obj):
+ _context_to_tag, backoff = obj
+ return cls(model=_context_to_tag, backoff=backoff)
@jsontags.register_tag
in order not to use the backoff tagger
:type cutoff: int
"""
- json_tag = "nltk.tag.sequential.TrigramTagger"
+ json_tag = 'nltk.tag.sequential.TrigramTagger'
- def __init__(self, train=None, model=None, backoff=None, cutoff=0, verbose=False):
- super().__init__(3, train, model, backoff, cutoff, verbose)
+ def __init__(self, train=None, model=None,
+ backoff=None, cutoff=0, verbose=False):
+ NgramTagger.__init__(self, 3, train, model,
+ backoff, cutoff, verbose)
+ def encode_json_obj(self):
+ return self._context_to_tag, self.backoff
+
+ @classmethod
+ def decode_json_obj(cls, obj):
+ _context_to_tag, backoff = obj
+ return cls(model=_context_to_tag, backoff=backoff)
@jsontags.register_tag
tag of None by this tagger.
"""
- json_tag = "nltk.tag.sequential.AffixTagger"
+ json_tag = 'nltk.tag.sequential.AffixTagger'
- def __init__(
- self,
- train=None,
- model=None,
- affix_length=-3,
- min_stem_length=2,
- backoff=None,
- cutoff=0,
- verbose=False,
- ):
+ def __init__(self, train=None, model=None, affix_length=-3,
+ min_stem_length=2, backoff=None, cutoff=0, verbose=False):
self._check_params(train, model)
- super().__init__(model, backoff)
+ ContextTagger.__init__(self, model, backoff)
self._affix_length = affix_length
self._min_word_length = min_stem_length + abs(affix_length)
self._train(train, cutoff, verbose)
def encode_json_obj(self):
- return (
- self._affix_length,
- self._min_word_length,
- self._context_to_tag,
- self.backoff,
- )
+ return self._affix_length, self._min_word_length, self._context_to_tag, self.backoff
@classmethod
def decode_json_obj(cls, obj):
affix_length=_affix_length,
min_stem_length=_min_word_length - abs(_affix_length),
model=_context_to_tag,
- backoff=backoff,
+ backoff=backoff
)
def context(self, tokens, index, history):
if len(token) < self._min_word_length:
return None
elif self._affix_length > 0:
- return token[: self._affix_length]
+ return token[:self._affix_length]
else:
- return token[self._affix_length :]
+ return token[self._affix_length:]
+@python_2_unicode_compatible
@jsontags.register_tag
class RegexpTagger(SequentialBackoffTagger):
"""
assigned the tag None.
"""
- json_tag = "nltk.tag.sequential.RegexpTagger"
+ json_tag = 'nltk.tag.sequential.RegexpTagger'
def __init__(self, regexps, backoff=None):
"""
"""
- super().__init__(backoff)
- try:
- self._regexps = [(re.compile(regexp), tag,) for regexp, tag in regexps]
- except Exception as e:
- raise Exception(
- 'Invalid RegexpTagger regexp:', str(e), 'regexp:', regexp, 'tag:', tag)
+ SequentialBackoffTagger.__init__(self, backoff)
+ self._regexs = [(re.compile(regexp), tag,) for regexp, tag in regexps]
def encode_json_obj(self):
- return [(regexp.pattern, tag) for regexp, tag in self._regexps], self.backoff
+ return [(regexp.pattern, tag,) for regexp, tag in self._regexs], self.backoff
@classmethod
def decode_json_obj(cls, obj):
regexps, backoff = obj
- return cls(regexps, backoff)
+ self = cls(())
+ self._regexs = [(re.compile(regexp), tag,) for regexp, tag in regexps]
+ SequentialBackoffTagger.__init__(self, backoff)
+ return self
def choose_tag(self, tokens, index, history):
- for regexp, tag in self._regexps:
+ for regexp, tag in self._regexs:
if re.match(regexp, tokens[index]):
return tag
return None
def __repr__(self):
- return "<Regexp Tagger: size={}>".format(len(self._regexps))
+ return '<Regexp Tagger: size=%d>' % len(self._regexs)
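# Usage sketch (illustrative, not part of the patch): patterns are tried in
# order, so a catch-all belongs last:
#
#   tagger = RegexpTagger([(r'.*ing$', 'VBG'), (r'.*ed$', 'VBD'), (r'.*', 'NN')])
#   tagger.tag('running dogs barked'.split())
#   # [('running', 'VBG'), ('dogs', 'NN'), ('barked', 'VBD')]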
+@python_2_unicode_compatible
class ClassifierBasedTagger(SequentialBackoffTagger, FeaturesetTaggerI):
"""
A sequential tagger that uses a classifier to choose the tag for
back on its backoff tagger if the probability of the most
likely tag is less than *cutoff_prob*.
"""
-
- def __init__(
- self,
- feature_detector=None,
- train=None,
- classifier_builder=NaiveBayesClassifier.train,
- classifier=None,
- backoff=None,
- cutoff_prob=None,
- verbose=False,
- ):
+ def __init__(self, feature_detector=None, train=None,
+ classifier_builder=NaiveBayesClassifier.train,
+ classifier=None, backoff=None,
+ cutoff_prob=None, verbose=False):
self._check_params(train, classifier)
- super().__init__(backoff)
+ SequentialBackoffTagger.__init__(self, backoff)
if (train and classifier) or (not train and not classifier):
- raise ValueError(
- "Must specify either training data or " "trained classifier."
- )
+ raise ValueError('Must specify either training data or '
+ 'trained classifier.')
if feature_detector is not None:
self._feature_detector = feature_detector
classifier_corpus = []
if verbose:
- print("Constructing training corpus for classifier.")
+ print('Constructing training corpus for classifier.')
for sentence in tagged_corpus:
history = []
untagged_sentence, tags = zip(*sentence)
for index in range(len(sentence)):
- featureset = self.feature_detector(untagged_sentence, index, history)
+ featureset = self.feature_detector(untagged_sentence,
+ index, history)
classifier_corpus.append((featureset, tags[index]))
history.append(tags[index])
if verbose:
- print("Training classifier ({} instances)".format(len(classifier_corpus)))
+ print('Training classifier (%d instances)' % len(classifier_corpus))
self._classifier = classifier_builder(classifier_corpus)
def __repr__(self):
- return "<ClassifierBasedTagger: {}>".format(self._classifier)
+ return '<ClassifierBasedTagger: %r>' % self._classifier
def feature_detector(self, tokens, index, history):
"""
"""
A classifier-based part-of-speech tagger.
"""
-
def feature_detector(self, tokens, index, history):
word = tokens[index]
if index == 0:
prevword = prevprevword = None
prevtag = prevprevtag = None
elif index == 1:
- prevword = tokens[index - 1].lower()
+ prevword = tokens[index-1].lower()
prevprevword = None
- prevtag = history[index - 1]
+ prevtag = history[index-1]
prevprevtag = None
else:
- prevword = tokens[index - 1].lower()
- prevprevword = tokens[index - 2].lower()
- prevtag = history[index - 1]
- prevprevtag = history[index - 2]
-
- if re.match("[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$", word):
- shape = "number"
- elif re.match("\W+$", word):
- shape = "punct"
- elif re.match("[A-Z][a-z]+$", word):
- shape = "upcase"
- elif re.match("[a-z]+$", word):
- shape = "downcase"
- elif re.match("\w+$", word):
- shape = "mixedcase"
+ prevword = tokens[index-1].lower()
+ prevprevword = tokens[index-2].lower()
+ prevtag = history[index-1]
+ prevprevtag = history[index-2]
+
+ if re.match('[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$', word):
+ shape = 'number'
+ elif re.match('\W+$', word):
+ shape = 'punct'
+ elif re.match('[A-Z][a-z]+$', word):
+ shape = 'upcase'
+ elif re.match('[a-z]+$', word):
+ shape = 'downcase'
+ elif re.match('\w+$', word):
+ shape = 'mixedcase'
else:
- shape = "other"
+ shape = 'other'
features = {
- "prevtag": prevtag,
- "prevprevtag": prevprevtag,
- "word": word,
- "word.lower": word.lower(),
- "suffix3": word.lower()[-3:],
- "suffix2": word.lower()[-2:],
- "suffix1": word.lower()[-1:],
- "prevprevword": prevprevword,
- "prevword": prevword,
- "prevtag+word": "{}+{}".format(prevtag, word.lower()),
- "prevprevtag+word": "{}+{}".format(prevprevtag, word.lower()),
- "prevword+word": "{}+{}".format(prevword, word.lower()),
- "shape": shape,
- }
+ 'prevtag': prevtag,
+ 'prevprevtag': prevprevtag,
+ 'word': word,
+ 'word.lower': word.lower(),
+ 'suffix3': word.lower()[-3:],
+ 'suffix2': word.lower()[-2:],
+ 'suffix1': word.lower()[-1:],
+ 'prevprevword': prevprevword,
+ 'prevword': prevword,
+ 'prevtag+word': '%s+%s' % (prevtag, word.lower()),
+ 'prevprevtag+word': '%s+%s' % (prevprevtag, word.lower()),
+ 'prevword+word': '%s+%s' % (prevword, word.lower()),
+ 'shape': shape,
+ }
return features
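# Illustrative sketch (not part of the patch): for tokens =
# ['The', 'cat', 'sat'] at index 2 with history ['AT', 'NN'], the detector
# returns roughly:
#
#   {'prevtag': 'NN', 'prevprevtag': 'AT', 'word': 'sat',
#    'word.lower': 'sat', 'suffix3': 'sat', 'suffix2': 'at', 'suffix1': 't',
#    'prevprevword': 'the', 'prevword': 'cat', 'prevtag+word': 'NN+sat',
#    'prevprevtag+word': 'AT+sat', 'prevword+word': 'cat+sat',
#    'shape': 'downcase'}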
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to the Stanford Part-of-speech and Named-Entity Taggers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Nitin Madnani <nmadnani@ets.org>
# Rami Al-Rfou' <ralrfou@cs.stonybrook.edu>
# URL: <http://nltk.org/>
from subprocess import PIPE
import warnings
+from six import text_type
+
from nltk.internals import find_file, find_jar, config_java, java, _java_options
from nltk.tag.api import TaggerI
+from nltk.parse.corenlp import CoreNLPParser
-_stanford_url = "https://nlp.stanford.edu/software"
+_stanford_url = 'https://nlp.stanford.edu/software'
class StanfordTagger(TaggerI):
- ``_JAR`` file: Class constant that represents the jar file name.
"""
- _SEPARATOR = ""
- _JAR = ""
-
- def __init__(
- self,
- model_filename,
- path_to_jar=None,
- encoding="utf8",
- verbose=False,
- java_options="-mx1000m",
- ):
- # Raise deprecation warning.
- warnings.warn(
- str(
- "\nThe StanfordTokenizer will "
- "be deprecated in version 3.2.6.\n"
- "Please use \033[91mnltk.parse.corenlp.CoreNLPParser\033[0m instead."
- ),
- DeprecationWarning,
- stacklevel=2,
- )
+ _SEPARATOR = ''
+ _JAR = ''
+ def __init__(self, model_filename, path_to_jar=None, encoding='utf8',
+ verbose=False, java_options='-mx1000m'):
+ # Raise deprecation warning.
+ warnings.simplefilter('always', DeprecationWarning)
+ warnings.warn(str("\nThe StanfordTokenizer will "
+ "be deprecated in version 3.2.5.\n"
+ "Please use \033[91mnltk.tag.stanford.CoreNLPPOSTagger\033[0m "
+ "or \033[91mnltk.tag.stanford.CoreNLPNERTagger\033[0m instead."),
+ DeprecationWarning, stacklevel=2)
+ warnings.simplefilter('ignore', DeprecationWarning)
if not self._JAR:
- warnings.warn(
- "The StanfordTagger class is not meant to be "
- "instantiated directly. Did you mean "
- "StanfordPOSTagger or StanfordNERTagger?"
- )
+ warnings.warn('The StanfordTagger class is not meant to be '
+ 'instantiated directly. Did you mean '
+ 'StanfordPOSTagger or StanfordNERTagger?')
self._stanford_jar = find_jar(
- self._JAR, path_to_jar, searchpath=(), url=_stanford_url, verbose=verbose
- )
+ self._JAR, path_to_jar,
+ searchpath=(), url=_stanford_url,
+ verbose=verbose)
- self._stanford_model = find_file(
- model_filename, env_vars=("STANFORD_MODELS",), verbose=verbose
- )
+ self._stanford_model = find_file(model_filename,
+ env_vars=('STANFORD_MODELS',),
+ verbose=verbose)
self._encoding = encoding
self.java_options = java_options
def tag_sents(self, sentences):
encoding = self._encoding
- default_options = " ".join(_java_options)
+ default_options = ' '.join(_java_options)
config_java(options=self.java_options, verbose=False)
# Create a temporary input file
_input_fh, self._input_file_path = tempfile.mkstemp(text=True)
cmd = list(self._cmd)
- cmd.extend(["-encoding", encoding])
+ cmd.extend(['-encoding', encoding])
# Write the actual sentences to the temporary input file
- _input_fh = os.fdopen(_input_fh, "wb")
- _input = "\n".join((" ".join(x) for x in sentences))
- if isinstance(_input, str) and encoding:
+ _input_fh = os.fdopen(_input_fh, 'wb')
+ _input = '\n'.join((' '.join(x) for x in sentences))
+ if isinstance(_input, text_type) and encoding:
_input = _input.encode(encoding)
_input_fh.write(_input)
_input_fh.close()
# Run the tagger and get the output
- stanpos_output, _stderr = java(
- cmd, classpath=self._stanford_jar, stdout=PIPE, stderr=PIPE
- )
+ stanpos_output, _stderr = java(cmd, classpath=self._stanford_jar,
+ stdout=PIPE, stderr=PIPE)
stanpos_output = stanpos_output.decode(encoding)
# Delete the temporary file
sentence = []
for tagged_word in tagged_sentence.strip().split():
word_tags = tagged_word.strip().split(self._SEPARATOR)
- sentence.append(("".join(word_tags[:-1]), word_tags[-1]))
+ sentence.append((''.join(word_tags[:-1]), word_tags[-1]))
tagged_sentences.append(sentence)
return tagged_sentences
>>> st.tag('What is the airspeed of an unladen swallow ?'.split())
[('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
"""
-
- _SEPARATOR = "_"
- _JAR = "stanford-postagger.jar"
+ _SEPARATOR = '_'
+ _JAR = 'stanford-postagger.jar'
def __init__(self, *args, **kwargs):
super(StanfordPOSTagger, self).__init__(*args, **kwargs)
@property
def _cmd(self):
- return [
- "edu.stanford.nlp.tagger.maxent.MaxentTagger",
- "-model",
- self._stanford_model,
- "-textFile",
- self._input_file_path,
- "-tokenize",
- "false",
- "-outputFormatOptions",
- "keepEmptySentences",
- ]
+ return ['edu.stanford.nlp.tagger.maxent.MaxentTagger',
+ '-model', self._stanford_model, '-textFile',
+ self._input_file_path, '-tokenize', 'false',
+ '-outputFormatOptions', 'keepEmptySentences']
class StanfordNERTagger(StanfordTagger):
('University', 'ORGANIZATION'), ('in', 'O'), ('NY', 'LOCATION')]
"""
- _SEPARATOR = "/"
- _JAR = "stanford-ner.jar"
- _FORMAT = "slashTags"
+ _SEPARATOR = '/'
+ _JAR = 'stanford-ner.jar'
+ _FORMAT = 'slashTags'
def __init__(self, *args, **kwargs):
super(StanfordNERTagger, self).__init__(*args, **kwargs)
@property
def _cmd(self):
# Adding -tokenizerFactory edu.stanford.nlp.process.WhitespaceTokenizer -tokenizerOptions tokenizeNLs=false so that the Stanford tokenizer is not used
- return [
- "edu.stanford.nlp.ie.crf.CRFClassifier",
- "-loadClassifier",
- self._stanford_model,
- "-textFile",
- self._input_file_path,
- "-outputFormat",
- self._FORMAT,
- "-tokenizerFactory",
- "edu.stanford.nlp.process.WhitespaceTokenizer",
- "-tokenizerOptions",
- '"tokenizeNLs=false"',
- ]
+ return ['edu.stanford.nlp.ie.crf.CRFClassifier',
+ '-loadClassifier', self._stanford_model, '-textFile',
+ self._input_file_path, '-outputFormat', self._FORMAT,
+ '-tokenizerFactory',
+ 'edu.stanford.nlp.process.WhitespaceTokenizer',
+ '-tokenizerOptions', '\"tokenizeNLs=false\"']
def parse_output(self, text, sentences):
- if self._FORMAT == "slashTags":
+ if self._FORMAT == 'slashTags':
# Join the output together into one big list
tagged_sentences = []
for tagged_sentence in text.strip().split("\n"):
for tagged_word in tagged_sentence.strip().split():
word_tags = tagged_word.strip().split(self._SEPARATOR)
- tagged_sentences.append(("".join(word_tags[:-1]), word_tags[-1]))
+ tagged_sentences.append((''.join(word_tags[:-1]),
+ word_tags[-1]))
# Separate it according to the input
result = []
start = 0
for sent in sentences:
- result.append(tagged_sentences[start : start + len(sent)])
+ result.append(tagged_sentences[start:start + len(sent)])
start += len(sent)
return result
raise NotImplementedError
+class CoreNLPTagger(CoreNLPParser, TaggerI):
+ def __init__(self, tagtype, url='http://localhost:9000', encoding='utf8'):
+ """
+ An abstract interface to the CoreNLP POS/NER taggers; it returns the
+ POS/NER tags from the Stanford CoreNLP API wrapped by nltk.parse.corenlp.
+ """
+ self.tagtype = tagtype
+ super(CoreNLPTagger, self).__init__(url, encoding)
+
+ def tag_sents(self, sentences):
+ # Converting list(list(str)) -> list(str)
+ sentences = (' '.join(words) for words in sentences)
+ return list(self.raw_tag_sents(sentences))
+
+
+ def tag(self, sentence):
+ return self.tag_sents([sentence])[0]
+
+ def raw_tag_sents(self, sentences):
+ """
+ This function interfaces with `GenericCoreNLPParser.api_call` to
+ retrieve the JSON output and return the required annotations.
+ """
+ default_properties = {'ssplit.isOneSentence': 'true',
+ 'annotators': 'tokenize,ssplit,' }
+ # Supports only 'pos' or 'ner' tags.
+ assert self.tagtype in ['pos', 'ner']
+ default_properties['annotators'] += self.tagtype
+ for sentence in sentences:
+ tagged_data = self.api_call(sentence, properties=default_properties)
+ assert len(tagged_data['sentences']) == 1
+ # Taggers only need to return 1-best sentence.
+ yield [(token['word'], token[self.tagtype]) for token in tagged_data['sentences'][0]['tokens']]
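# Shape sketch (illustrative, not part of the patch): the JSON returned by
# api_call looks roughly like
#
#   {'sentences': [{'tokens': [{'word': 'What', 'pos': 'WP', ...},
#                              {'word': 'is',   'pos': 'VBZ', ...}, ...]}]}
#
# so the generator above picks out (token['word'], token[self.tagtype]).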
+
+
+class CoreNLPPOSTagger(CoreNLPTagger):
+ """
+ This is a subclass of the CoreNLPTagger that wraps around the
+ nltk.parse.CoreNLPParser for Part-of-Speech tagging.
+
+ >>> from nltk.tag.stanford import CoreNLPPOSTagger
+ >>> CoreNLPPOSTagger(url='http://localhost:9000').tag('What is the airspeed of an unladen swallow ?'.split()) # doctest: +SKIP
+ [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
+ """
+ def __init__(self, url='http://localhost:9000', encoding='utf8'):
+ super(CoreNLPPOSTagger, self).__init__('pos', url, encoding)
+
+
+class CoreNLPNERTagger(CoreNLPTagger):
+ """
+ This is a subclass of the CoreNLPTagger that wraps around the
+ nltk.parse.CoreNLPParser for Named-Entity tagging.
+
+ >>> from nltk.tag.stanford import CoreNLPNERTagger
+ >>> CoreNLPNERTagger(url='http://localhost:9000').tag('Rami Eid is studying at Stony Brook University in NY'.split()) # doctest: +SKIP
+ [('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'), ('at', 'O'), ('Stony', 'ORGANIZATION'), ('Brook', 'ORGANIZATION'), ('University', 'ORGANIZATION'), ('in', 'O'), ('NY', 'O')]
+ """
+ def __init__(self, url='http://localhost:9000', encoding='utf8'):
+ super(CoreNLPNERTagger, self).__init__('ner', url, encoding)
+
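# Usage note (illustrative, not part of the patch): both wrappers assume a
# CoreNLP server is already listening at `url`. One common way to start one
# locally (assuming a CoreNLP 3.x download in the current directory) is:
#
#   java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000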
def setup_module(module):
from nose import SkipTest
try:
- StanfordPOSTagger("english-bidirectional-distsim.tagger")
+ StanfordPOSTagger('english-bidirectional-distsim.tagger')
+ except LookupError:
+ raise SkipTest('Doctests from nltk.tag.stanford are skipped because one \
+ of the stanford jars cannot be found.')
+
+ try:
+ CoreNLPPOSTagger()
+ CoreNLPNERTagger()
except LookupError:
- raise SkipTest(
- "Doctests from nltk.tag.stanford are skipped because one \
- of the stanford jars cannot be found."
- )
+ raise SkipTest('Doctests from nltk.tag.stanford.CoreNLPTagger '
+ 'are skipped because the CoreNLP server is not started')
# Natural Language Toolkit: TnT Tagger
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Sam Huston <sjh900@gmail.com>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
-"""
+'''
Implementation of 'TnT - A Statistical Part of Speech Tagger'
by Thorsten Brants
http://acl.ldc.upenn.edu/A/A00/A00-1031.pdf
-"""
-
+'''
+from __future__ import print_function, division
from math import log
from operator import itemgetter
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.tag.api import TaggerI
-
class TnT(TaggerI):
- """
+ '''
TnT - Statistical POS tagger
IMPORTANT NOTES:
It is possible to differentiate the tags which are assigned to
capitalized words. However this does not result in a significant
gain in the accuracy of the results.
- """
+ '''
def __init__(self, unk=None, Trained=False, N=1000, C=False):
- """
+ '''
Construct a TnT statistical tagger. Tagger must be trained
before being used to tag input.
information for tagging.
NOTE: using capitalization may not increase the accuracy
of the tagger
- """
-
- self._uni = FreqDist()
- self._bi = ConditionalFreqDist()
- self._tri = ConditionalFreqDist()
- self._wd = ConditionalFreqDist()
- self._eos = ConditionalFreqDist()
- self._l1 = 0.0
- self._l2 = 0.0
- self._l3 = 0.0
- self._N = N
- self._C = C
- self._T = Trained
+ '''
+
+ self._uni = FreqDist()
+ self._bi = ConditionalFreqDist()
+ self._tri = ConditionalFreqDist()
+ self._wd = ConditionalFreqDist()
+ self._eos = ConditionalFreqDist()
+ self._l1 = 0.0
+ self._l2 = 0.0
+ self._l3 = 0.0
+ self._N = N
+ self._C = C
+ self._T = Trained
self._unk = unk
self.known = 0
def train(self, data):
- """
+ '''
Uses a set of tagged data to train the tagger.
If an unknown word tagger is specified,
it is trained on the same data.
:param data: List of lists of (word, tag) tuples
:type data: tuple(str)
- """
+ '''
# Ensure that local C flag is initialized before use
C = False
self._unk.train(data)
for sent in data:
- history = [("BOS", False), ("BOS", False)]
+ history = [('BOS',False), ('BOS',False)]
for w, t in sent:
# if capitalization is requested,
# and the word begins with a capital
# set local flag C to True
- if self._C and w[0].isupper():
- C = True
+ if self._C and w[0].isupper(): C=True
self._wd[w][t] += 1
- self._uni[(t, C)] += 1
- self._bi[history[1]][(t, C)] += 1
- self._tri[tuple(history)][(t, C)] += 1
+ self._uni[(t,C)] += 1
+ self._bi[history[1]][(t,C)] += 1
+ self._tri[tuple(history)][(t,C)] += 1
- history.append((t, C))
+ history.append((t,C))
history.pop(0)
# set local flag C to false for the next word
C = False
- self._eos[t]["EOS"] += 1
+ self._eos[t]['EOS'] += 1
+
# compute lambda values from the trained frequency distributions
self._compute_lambda()
+ #(debugging -- ignore or delete me)
+ #print "lambdas"
+ #print i, self._l1, i, self._l2, i, self._l3
+
+
def _compute_lambda(self):
- """
+ '''
creates lambda values based upon training data
NOTE: no need to explicitly reference C,
ISSUES -- Resolutions:
if 2 values are equal, increment both lambda values
by (f(t1,t2,t3) / 2)
- """
+ '''
# temporary lambda variables
tl1 = 0.0
# safe_div provides a safe floating point division
# it returns -1 if the denominator is 0
- c3 = self._safe_div(
- (self._tri[history][tag] - 1), (self._tri[history].N() - 1)
- )
- c2 = self._safe_div((self._bi[h2][tag] - 1), (self._bi[h2].N() - 1))
- c1 = self._safe_div((self._uni[tag] - 1), (self._uni.N() - 1))
+ c3 = self._safe_div((self._tri[history][tag]-1), (self._tri[history].N()-1))
+ c2 = self._safe_div((self._bi[h2][tag]-1), (self._bi[h2].N()-1))
+ c1 = self._safe_div((self._uni[tag]-1), (self._uni.N()-1))
+
# if c1 is the maximum value:
if (c1 > c3) and (c1 > c2):
# otherwise there might be a problem
# eg: all values = 0
else:
+ #print "Problem", c1, c2 ,c3
pass
# Lambda normalisation:
# ensures that l1+l2+l3 = 1
- self._l1 = tl1 / (tl1 + tl2 + tl3)
- self._l2 = tl2 / (tl1 + tl2 + tl3)
- self._l3 = tl3 / (tl1 + tl2 + tl3)
+ self._l1 = tl1 / (tl1+tl2+tl3)
+ self._l2 = tl2 / (tl1+tl2+tl3)
+ self._l3 = tl3 / (tl1+tl2+tl3)
+
+
def _safe_div(self, v1, v2):
- """
+ '''
Safe floating point division function, does not allow division by 0
returns -1 if the denominator is 0
- """
+ '''
if v2 == 0:
return -1
else:
return v1 / v2
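# A worked restatement (editor's note, illustrative) of the smoothed
# trigram probability that _tagword below implements, i.e. the standard
# TnT linear interpolation (Brants 2000):
#
#   P(t_i | t_{i-2}, t_{i-1}) = l1*P(t_i) + l2*P(t_i | t_{i-1})
#                             + l3*P(t_i | t_{i-2}, t_{i-1})
#
# with l1 + l2 + l3 = 1, the lambdas being set by the deleted-interpolation
# loop in _compute_lambda above.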
def tagdata(self, data):
- """
+ '''
Tags each sentence in a list of sentences
:param data: list of list of words
Invokes tag(sent) function for each sentence
compiles the results into a list of tagged sentences
each tagged sentence is a list of (word, tag) tuples
- """
+ '''
res = []
for sent in data:
res1 = self.tag(sent)
res.append(res1)
return res
+
def tag(self, data):
- """
+ '''
Tags a single sentence
:param data: list of words
with the correct words in the input sequence
returns a list of (word, tag) tuples
- """
+ '''
- current_state = [(["BOS", "BOS"], 0.0)]
+ current_state = [(['BOS', 'BOS'], 0.0)]
sent = list(data)
res = []
for i in range(len(sent)):
# unpack and discard the C flags
- (t, C) = tags[i + 2]
+ (t,C) = tags[i+2]
res.append((sent[i], t))
return res
+
def _tagword(self, sent, current_states):
- """
+ '''
:param sent : List of words remaining in the sentence
:type sent : [word,]
:param current_states : List of possible tag combinations for
Uses formula specified above to calculate the probability
of a particular tag
- """
+ '''
# if this word marks the end of the sentence,
# return the most probable tag
# if the Capitalisation is requested,
# initialise the flag for this word
C = False
- if self._C and word[0].isupper():
- C = True
+ if self._C and word[0].isupper(): C=True
# if word is known
# compute the set of possible tags
logprobs = []
for t in self._wd[word].keys():
- tC = (t, C)
+ tC = (t,C)
p_uni = self._uni.freq(tC)
p_bi = self._bi[history[-1]].freq(tC)
p_tri = self._tri[tuple(history[-2:])].freq(tC)
p_wd = self._wd[word][t] / self._uni[tC]
- p = self._l1 * p_uni + self._l2 * p_bi + self._l3 * p_tri
+ p = self._l1 *p_uni + self._l2 *p_bi + self._l3 *p_tri
p2 = log(p, 2) + log(p_wd, 2)
# compute the result of appending each tag to this history
- new_states.append((history + [tC], curr_sent_logprob + p2))
+ new_states.append((history + [tC],
+ curr_sent_logprob + p2))
# otherwise a new word, set of possible tags is unknown
else:
# if no unknown word tagger has been specified
# then use the tag 'Unk'
if self._unk is None:
- tag = ("Unk", C)
+ tag = ('Unk',C)
# otherwise apply the unknown word tagger
else:
[(_w, t)] = list(self._unk.tag([word]))
- tag = (t, C)
+ tag = (t,C)
for (history, logprob) in current_states:
history.append(tag)
# del everything after N (threshold)
# this is the beam search cut
if len(new_states) > self._N:
- new_states = new_states[: self._N]
+ new_states = new_states[:self._N]
# compute the tags for the rest of the sentence
# return the best list of tags for the sentence
# helper function -- basic sentence tokenizer
########################################
-
def basic_sent_chop(data, raw=True):
- """
+ '''
Basic method for tokenizing input into sentences
for this tagger:
This is a simple method which enhances the performance of the TnT
tagger. Better sentence tokenization will further enhance the results.
- """
+ '''
new_data = []
curr_sent = []
- sent_mark = [",", ".", "?", "!"]
+ sent_mark = [',','.','?','!']
+
if raw:
for word in data:
curr_sent.append(word)
else:
- for (word, tag) in data:
+ for (word,tag) in data:
if word in sent_mark:
- curr_sent.append((word, tag))
+ curr_sent.append((word,tag))
new_data.append(curr_sent)
curr_sent = []
else:
- curr_sent.append((word, tag))
+ curr_sent.append((word,tag))
return new_data
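# Usage sketch (illustrative, not part of the patch), using the tagged
# branch shown above:
#
#   basic_sent_chop([('Hello', 'UH'), ('.', '.'),
#                    ('Bye', 'UH'), ('.', '.')], raw=False)
#   # [[('Hello', 'UH'), ('.', '.')], [('Bye', 'UH'), ('.', '.')]]
#
# As written, any trailing tokens after the last , . ? ! are dropped.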
+
def demo():
from nltk.corpus import brown
-
sents = list(brown.tagged_sents())
test = list(brown.sents())
+ # create and train the tagger
tagger = TnT()
tagger.train(sents[200:1000])
+ # tag some data
tagged_data = tagger.tagdata(test[100:120])
+ # print results
for j in range(len(tagged_data)):
s = tagged_data[j]
- t = sents[j + 100]
+ t = sents[j+100]
for i in range(len(s)):
- print(s[i], "--", t[i])
+ print(s[i],'--', t[i])
print()
t = TnT(N=1000, C=False)
s = TnT(N=1000, C=True)
- t.train(d[(11) * 100 :])
- s.train(d[(11) * 100 :])
+ t.train(d[(11)*100:])
+ s.train(d[(11)*100:])
for i in range(10):
- tacc = t.evaluate(d[i * 100 : ((i + 1) * 100)])
+ tacc = t.evaluate(d[i*100:((i+1)*100)])
tp_un = t.unknown / (t.known + t.unknown)
tp_kn = t.known / (t.known + t.unknown)
t.unknown = 0
t.known = 0
- print("Capitalization off:")
- print("Accuracy:", tacc)
- print("Percentage known:", tp_kn)
- print("Percentage unknown:", tp_un)
- print("Accuracy over known words:", (tacc / tp_kn))
+ print('Capitalization off:')
+ print('Accuracy:', tacc)
+ print('Percentage known:', tp_kn)
+ print('Percentage unknown:', tp_un)
+ print('Accuracy over known words:', (tacc / tp_kn))
- sacc = s.evaluate(d[i * 100 : ((i + 1) * 100)])
+ sacc = s.evaluate(d[i*100:((i+1)*100)])
sp_un = s.unknown / (s.known + s.unknown)
sp_kn = s.known / (s.known + s.unknown)
s.unknown = 0
s.known = 0
- print("Capitalization on:")
- print("Accuracy:", sacc)
- print("Percentage known:", sp_kn)
- print("Percentage unknown:", sp_un)
- print("Accuracy over known words:", (sacc / sp_kn))
-
+ print('Capitalization on:')
+ print('Accuracy:', sacc)
+ print('Percentage known:', sp_kn)
+ print('Percentage unknown:', sp_un)
+ print('Accuracy over known words:', (sacc / sp_kn))
def demo3():
from nltk.corpus import treebank, brown
d = d[:1000]
e = e[:1000]
- d10 = int(len(d) * 0.1)
- e10 = int(len(e) * 0.1)
+ d10 = int(len(d)*0.1)
+ e10 = int(len(e)*0.1)
tknacc = 0
sknacc = 0
t = TnT(N=1000, C=False)
s = TnT(N=1000, C=False)
- dtest = d[(i * d10) : ((i + 1) * d10)]
- etest = e[(i * e10) : ((i + 1) * e10)]
+ dtest = d[(i*d10):((i+1)*d10)]
+ etest = e[(i*e10):((i+1)*e10)]
- dtrain = d[: (i * d10)] + d[((i + 1) * d10) :]
- etrain = e[: (i * e10)] + e[((i + 1) * e10) :]
+ dtrain = d[:(i*d10)] + d[((i+1)*d10):]
+ etrain = e[:(i*e10)] + e[((i+1)*e10):]
t.train(dtrain)
s.train(etrain)
s.unknown = 0
s.known = 0
- tknacc += tacc / tp_kn
- sknacc += sacc / tp_kn
+ tknacc += (tacc / tp_kn)
+ sknacc += (sacc / tp_kn)
tallacc += tacc
sallacc += sacc
- # print(i+1, (tacc / tp_kn), i+1, (sacc / tp_kn), i+1, tacc, i+1, sacc)
+ #print i+1, (tacc / tp_kn), i+1, (sacc / tp_kn), i+1, tacc, i+1, sacc
+
print("brown: acc over words known:", 10 * tknacc)
print(" : overall accuracy:", 10 * tallacc)
print("treebank: acc over words known:", 10 * sknacc)
print(" : overall accuracy:", 10 * sallacc)
print(" : words known:", 10 * sknown)
+
+
+
+
# Natural Language Toolkit: Tagger Utilities
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
-
-def str2tuple(s, sep="/"):
+def str2tuple(s, sep='/'):
"""
Given the string representation of a tagged token, return the
corresponding tuple representation. The rightmost occurrence of
"""
loc = s.rfind(sep)
if loc >= 0:
- return (s[:loc], s[loc + len(sep) :].upper())
+ return (s[:loc], s[loc+len(sep):].upper())
else:
return (s, None)
-
-def tuple2str(tagged_token, sep="/"):
+def tuple2str(tagged_token, sep='/'):
"""
Given the tuple representation of a tagged token, return the
corresponding string representation. This representation is
if tag is None:
return word
else:
- assert sep not in tag, "tag may not contain sep!"
- return "%s%s%s" % (word, sep, tag)
-
+ assert sep not in tag, 'tag may not contain sep!'
+ return '%s%s%s' % (word, sep, tag)
def untag(tagged_sentence):
"""
"""
return [w for (w, t) in tagged_sentence]
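# Round-trip sketch (illustrative, not part of the patch):
#
#   str2tuple('fly/NN')                      # ('fly', 'NN')
#   tuple2str(('fly', 'NN'))                 # 'fly/NN'
#   untag([('the', 'AT'), ('cat', 'NN')])    # ['the', 'cat']
#
# str2tuple splits on the *rightmost* separator and upcases the tag, so
# 'fly/nn' also yields ('fly', 'NN').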
+
+
+
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Transformation-based learning
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Marcus Uneson <marcus.uneson@gmail.com>
# based on previous (nltk2) version by
# Christopher Maloof, Edward Loper, Steven Bird
"""
from nltk.tbl.template import Template
-
-# API: Template(...), Template.expand(...)
+#API: Template(...), Template.expand(...)
from nltk.tbl.feature import Feature
-
-# API: Feature(...), Feature.expand(...)
+#API: Feature(...), Feature.expand(...)
from nltk.tbl.rule import Rule
-
-# API: Rule.format(...), Rule.templatetid
+#API: Rule.format(...), Rule.templatetid
from nltk.tbl.erroranalysis import error_list
+
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Transformation-based learning
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Marcus Uneson <marcus.uneson@gmail.com>
# based on previous (nltk2) version by
# Christopher Maloof, Edward Loper, Steven Bird
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function, absolute_import, division
import os
import pickle
from nltk.tag.brill import Word, Pos
from nltk.tag import BrillTaggerTrainer, RegexpTagger, UnigramTagger
-
def demo():
"""
Run a demo with defaults. See source comments for details,
"""
postag()
-
def demo_repr_rule_format():
"""
Exemplify repr(Rule) (see also str(Rule) and Rule.format("verbose"))
"""
postag(ruleformat="repr")
-
def demo_str_rule_format():
"""
Exemplify repr(Rule) (see also str(Rule) and Rule.format("verbose"))
"""
postag(ruleformat="str")
-
def demo_verbose_rule_format():
"""
Exemplify Rule.format("verbose")
"""
postag(ruleformat="verbose")
-
def demo_multiposition_feature():
"""
The features of a template take a list of positions
points can also be used: Pos(-3, -1) is the same as the arg
below.
"""
- postag(templates=[Template(Pos([-3, -2, -1]))])
-
+ postag(templates=[Template(Pos([-3,-2,-1]))])
def demo_multifeature_template():
"""
Templates can have more than a single feature.
"""
- postag(templates=[Template(Word([0]), Pos([-2, -1]))])
-
+ postag(templates=[Template(Word([0]), Pos([-2,-1]))])
def demo_template_statistics():
"""
"""
postag(incremental_stats=True, template_stats=True)
-
def demo_generated_templates():
"""
Template.expand and Feature.expand are class methods facilitating
Note: training with 500 templates can easily fill all available memory,
even on relatively small corpora
"""
- wordtpls = Word.expand([-1, 0, 1], [1, 2], excludezero=False)
- tagtpls = Pos.expand([-2, -1, 0, 1], [1, 2], excludezero=True)
- templates = list(Template.expand([wordtpls, tagtpls], combinations=(1, 3)))
- print(
- "Generated {0} templates for transformation-based learning".format(
- len(templates)
- )
- )
+ wordtpls = Word.expand([-1,0,1], [1,2], excludezero=False)
+ tagtpls = Pos.expand([-2,-1,0,1], [1,2], excludezero=True)
+ templates = list(Template.expand([wordtpls, tagtpls], combinations=(1,3)))
+ print("Generated {0} templates for transformation-based learning".format(len(templates)))
postag(templates=templates, incremental_stats=True, template_stats=True)
-
def demo_learning_curve():
"""
Plot a learning curve -- the contribution of the individual rules
to tagging accuracy.
Note: requires matplotlib
"""
- postag(
- incremental_stats=True,
- separate_baseline_data=True,
- learning_curve_output="learningcurve.png",
- )
-
+ postag(incremental_stats=True, separate_baseline_data=True, learning_curve_output="learningcurve.png")
def demo_error_analysis():
"""
"""
postag(error_output="errors.txt")
-
def demo_serialize_tagger():
"""
Serializes the learned tagger to a file in pickle format; reloads it
"""
postag(serialize_output="tagger.pcl")
-
def demo_high_accuracy_rules():
"""
Discard rules with low accuracy. This may hurt performance a bit,
"""
postag(num_sents=3000, min_acc=0.96, min_score=10)
-
def postag(
templates=None,
tagged_data=None,
learning_curve_take=300,
baseline_backoff_tagger=None,
separate_baseline_data=False,
- cache_baseline_tagger=None,
-):
+ cache_baseline_tagger=None):
"""
Brill Tagger Demonstration
:param templates: how many sentences of training and testing data to use
baseline_backoff_tagger = baseline_backoff_tagger or REGEXP_TAGGER
if templates is None:
from nltk.tag.brill import describe_template_sets, brill24
-
# some pre-built template sets taken from typical systems or publications are
# available. Print a list with describe_template_sets()
# for instance:
templates = brill24()
- (training_data, baseline_data, gold_data, testing_data) = _demo_prepare_data(
- tagged_data, train, num_sents, randomize, separate_baseline_data
- )
+ (training_data, baseline_data, gold_data, testing_data) = \
+ _demo_prepare_data(tagged_data, train, num_sents, randomize, separate_baseline_data)
# creating (or reloading from cache) a baseline tagger (unigram tagger)
# this is just a mechanism for getting deterministic output from the baseline between
# python versions
if cache_baseline_tagger:
if not os.path.exists(cache_baseline_tagger):
- baseline_tagger = UnigramTagger(
- baseline_data, backoff=baseline_backoff_tagger
- )
- with open(cache_baseline_tagger, "w") as print_rules:
+ baseline_tagger = UnigramTagger(baseline_data, backoff=baseline_backoff_tagger)
+ with open(cache_baseline_tagger, 'w') as print_rules:
pickle.dump(baseline_tagger, print_rules)
- print(
- "Trained baseline tagger, pickled it to {0}".format(
- cache_baseline_tagger
- )
- )
+ print("Trained baseline tagger, pickled it to {0}".format(cache_baseline_tagger))
with open(cache_baseline_tagger, "r") as print_rules:
- baseline_tagger = pickle.load(print_rules)
+ baseline_tagger= pickle.load(print_rules)
print("Reloaded pickled tagger from {0}".format(cache_baseline_tagger))
else:
baseline_tagger = UnigramTagger(baseline_data, backoff=baseline_backoff_tagger)
print("Trained baseline tagger")
if gold_data:
- print(
- " Accuracy on test set: {0:0.4f}".format(
- baseline_tagger.evaluate(gold_data)
- )
- )
+ print(" Accuracy on test set: {0:0.4f}".format(baseline_tagger.evaluate(gold_data)))
# creating a Brill tagger
tbrill = time.time()
- trainer = BrillTaggerTrainer(
- baseline_tagger, templates, trace, ruleformat=ruleformat
- )
+ trainer = BrillTaggerTrainer(baseline_tagger, templates, trace, ruleformat=ruleformat)
print("Training tbl tagger...")
brill_tagger = trainer.train(training_data, max_rules, min_score, min_acc)
print("Trained tbl tagger in {0:0.2f} seconds".format(time.time() - tbrill))
# printing the learned rules, if learned silently
if trace == 1:
print("\nLearned rules: ")
- for (ruleno, rule) in enumerate(brill_tagger.rules(), 1):
+ for (ruleno, rule) in enumerate(brill_tagger.rules(),1):
print("{0:4d} {1:s}".format(ruleno, rule.format(ruleformat)))
+
# printing template statistics (optionally including comparison with the training data)
# note: if not separate_baseline_data, then baseline accuracy will be artificially high
- if incremental_stats:
- print(
- "Incrementally tagging the test data, collecting individual rule statistics"
- )
- (taggedtest, teststats) = brill_tagger.batch_tag_incremental(
- testing_data, gold_data
- )
+ if incremental_stats:
+ print("Incrementally tagging the test data, collecting individual rule statistics")
+ (taggedtest, teststats) = brill_tagger.batch_tag_incremental(testing_data, gold_data)
print(" Rule statistics collected")
if not separate_baseline_data:
- print(
- "WARNING: train_stats asked for separate_baseline_data=True; the baseline "
- "will be artificially high"
- )
+ print("WARNING: train_stats asked for separate_baseline_data=True; the baseline "
+ "will be artificially high")
trainstats = brill_tagger.train_stats()
if template_stats:
brill_tagger.print_template_statistics(teststats)
if learning_curve_output:
- _demo_plot(
- learning_curve_output, teststats, trainstats, take=learning_curve_take
- )
+ _demo_plot(learning_curve_output, teststats, trainstats, take=learning_curve_take)
print("Wrote plot of learning curve to {0}".format(learning_curve_output))
else:
print("Tagging the test data")
# writing error analysis to file
if error_output is not None:
- with open(error_output, "w") as f:
- f.write("Errors for Brill Tagger %r\n\n" % serialize_output)
- f.write(
- u"\n".join(error_list(gold_data, taggedtest)).encode("utf-8") + "\n"
- )
+ with open(error_output, 'w') as f:
+ f.write('Errors for Brill Tagger %r\n\n' % serialize_output)
+ f.write(u'\n'.join(error_list(gold_data, taggedtest)).encode('utf-8') + '\n')
print("Wrote tagger errors including context to {0}".format(error_output))
# serializing the tagger to a pickle file and reloading (just to see it works)
if serialize_output is not None:
taggedtest = brill_tagger.tag_sents(testing_data)
- with open(serialize_output, "w") as print_rules:
+ with open(serialize_output, 'w') as print_rules:
pickle.dump(brill_tagger, print_rules)
print("Wrote pickled tagger to {0}".format(serialize_output))
with open(serialize_output, "r") as print_rules:
else:
print("PROBLEM: Reloaded tagger gave different results on test set")
-
-def _demo_prepare_data(
- tagged_data, train, num_sents, randomize, separate_baseline_data
-):
+def _demo_prepare_data(tagged_data, train, num_sents, randomize, separate_baseline_data):
# train is the proportion of data used in training; the rest is reserved
# for testing.
if tagged_data is None:
baseline_data = training_data
else:
bl_cutoff = len(training_data) // 3
- (baseline_data, training_data) = (
- training_data[:bl_cutoff],
- training_data[bl_cutoff:],
- )
+ (baseline_data, training_data) = (training_data[:bl_cutoff], training_data[bl_cutoff:])
(trainseqs, traintokens) = corpus_size(training_data)
(testseqs, testtokens) = corpus_size(testing_data)
(bltrainseqs, bltraintokens) = corpus_size(baseline_data)
print("Read testing data ({0:d} sents/{1:d} wds)".format(testseqs, testtokens))
print("Read training data ({0:d} sents/{1:d} wds)".format(trainseqs, traintokens))
- print(
- "Read baseline data ({0:d} sents/{1:d} wds) {2:s}".format(
- bltrainseqs,
- bltraintokens,
- "" if separate_baseline_data else "[reused the training set]",
- )
- )
+ print("Read baseline data ({0:d} sents/{1:d} wds) {2:s}".format(
+ bltrainseqs, bltraintokens, "" if separate_baseline_data else "[reused the training set]"))
return (training_data, baseline_data, gold_data, testing_data)
def _demo_plot(learning_curve_output, teststats, trainstats=None, take=None):
- testcurve = [teststats["initialerrors"]]
- for rulescore in teststats["rulescores"]:
- testcurve.append(testcurve[-1] - rulescore)
- testcurve = [1 - x / teststats["tokencount"] for x in testcurve[:take]]
+ testcurve = [teststats['initialerrors']]
+ for rulescore in teststats['rulescores']:
+ testcurve.append(testcurve[-1] - rulescore)
+ testcurve = [1 - x/teststats['tokencount'] for x in testcurve[:take]]
- traincurve = [trainstats["initialerrors"]]
- for rulescore in trainstats["rulescores"]:
- traincurve.append(traincurve[-1] - rulescore)
- traincurve = [1 - x / trainstats["tokencount"] for x in traincurve[:take]]
+ traincurve = [trainstats['initialerrors']]
+ for rulescore in trainstats['rulescores']:
+ traincurve.append(traincurve[-1] - rulescore)
+ traincurve = [1 - x/trainstats['tokencount'] for x in traincurve[:take]]
- import matplotlib.pyplot as plt
+ import matplotlib.pyplot as plt
+ r = list(range(len(testcurve)))
+ plt.plot(r, testcurve, r, traincurve)
+ plt.axis([None, None, None, 1.0])
+ plt.savefig(learning_curve_output)
- r = list(range(len(testcurve)))
- plt.plot(r, testcurve, r, traincurve)
- plt.axis([None, None, None, 1.0])
- plt.savefig(learning_curve_output)
-
-NN_CD_TAGGER = RegexpTagger([(r"^-?[0-9]+(.[0-9]+)?$", "CD"), (r".*", "NN")])
+NN_CD_TAGGER = RegexpTagger(
+ [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
+ (r'.*', 'NN')])
REGEXP_TAGGER = RegexpTagger(
- [
- (r"^-?[0-9]+(.[0-9]+)?$", "CD"), # cardinal numbers
- (r"(The|the|A|a|An|an)$", "AT"), # articles
- (r".*able$", "JJ"), # adjectives
- (r".*ness$", "NN"), # nouns formed from adjectives
- (r".*ly$", "RB"), # adverbs
- (r".*s$", "NNS"), # plural nouns
- (r".*ing$", "VBG"), # gerunds
- (r".*ed$", "VBD"), # past tense verbs
- (r".*", "NN"), # nouns (default)
- ]
-)
+ [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
+ (r'(The|the|A|a|An|an)$', 'AT'), # articles
+ (r'.*able$', 'JJ'), # adjectives
+ (r'.*ness$', 'NN'), # nouns formed from adjectives
+ (r'.*ly$', 'RB'), # adverbs
+ (r'.*s$', 'NNS'), # plural nouns
+ (r'.*ing$', 'VBG'), # gerunds
+ (r'.*ed$', 'VBD'), # past tense verbs
+ (r'.*', 'NN') # nouns (default)
+])
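# Usage sketch (illustrative, not part of the patch): the patterns are
# tried top to bottom, ending in the NN default:
#
#   REGEXP_TAGGER.tag('The cats ran quickly'.split())
#   # [('The', 'AT'), ('cats', 'NNS'), ('ran', 'NN'), ('quickly', 'RB')]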
def corpus_size(seqs):
return (len(seqs), sum(len(x) for x in seqs))
-
-if __name__ == "__main__":
+if __name__ == '__main__':
demo_learning_curve()
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Transformation-based learning
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Marcus Uneson <marcus.uneson@gmail.com>
# based on previous (nltk2) version by
# Christopher Maloof, Edward Loper, Steven Bird
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
-# returns a list of errors in string format
+from __future__ import print_function
+
+# returns a list of errors in string format
def error_list(train_sents, test_sents):
"""
:param test_sents: The tagged corpus
:type test_sents: list(tuple)
"""
- hdr = ("%25s | %s | %s\n" + "-" * 26 + "+" + "-" * 24 + "+" + "-" * 26) % (
- "left context",
- "word/test->gold".center(22),
- "right context",
- )
+ hdr = (('%25s | %s | %s\n' + '-'*26+'+'+'-'*24+'+'+'-'*26) %
+ ('left context', 'word/test->gold'.center(22), 'right context'))
errors = [hdr]
for (train_sent, test_sent) in zip(train_sents, test_sents):
for wordnum, (word, train_pos) in enumerate(train_sent):
test_pos = test_sent[wordnum][1]
if train_pos != test_pos:
- left = " ".join("%s/%s" % w for w in train_sent[:wordnum])
- right = " ".join("%s/%s" % w for w in train_sent[wordnum + 1 :])
- mid = "%s/%s->%s" % (word, test_pos, train_pos)
- errors.append(
- "%25s | %s | %s" % (left[-25:], mid.center(22), right[:25])
- )
+ left = ' '.join('%s/%s' % w for w in train_sent[:wordnum])
+ right = ' '.join('%s/%s' % w for w in train_sent[wordnum+1:])
+ mid = '%s/%s->%s' % (word, test_pos, train_pos)
+ errors.append('%25s | %s | %s' %
+ (left[-25:], mid.center(22), right[:25]))
return errors
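# Output sketch (illustrative, not part of the patch): each row shows up to
# 25 characters of left context, the offending token as word/test->gold
# centered in the middle column, and up to 25 characters of right context,
# roughly:
#
#   The/AT expert/NN |    lies/NNS->VBZ     | about/IN it/PPO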
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Transformation-based learning
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Marcus Uneson <marcus.uneson@gmail.com>
# based on previous (nltk2) version by
# Christopher Maloof, Edward Loper, Steven Bird
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import division, print_function, unicode_literals
from abc import ABCMeta, abstractmethod
+from six import add_metaclass
-class Feature(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class Feature(object):
"""
An abstract base class for Features. A Feature is a combination of
a specific property-computing method and a list of relative positions
"""
- json_tag = "nltk.tbl.Feature"
+ json_tag = 'nltk.tbl.Feature'
PROPERTY_NAME = None
def __init__(self, positions, end=None):
"""
self.positions = None # to avoid warnings
if end is None:
- self.positions = tuple(sorted(set(int(i) for i in positions)))
- else: # positions was actually not a list, but only the start index
+ self.positions = tuple(sorted(set([int(i) for i in positions])))
+ else: # positions was actually not a list, but only the start index
try:
if positions > end:
raise TypeError
- self.positions = tuple(range(positions, end + 1))
+ self.positions = tuple(range(positions, end+1))
except TypeError:
# let any kind of erroneous spec raise ValueError
- raise ValueError(
- "illegal interval specification: (start={0}, end={1})".format(
- positions, end
- )
- )
+ raise ValueError("illegal interval specification: (start={0}, end={1})".format(positions, end))
# set property name given in subclass, or otherwise name of subclass
self.PROPERTY_NAME = self.__class__.PROPERTY_NAME or self.__class__.__name__
return cls(positions)
def __repr__(self):
- return "%s(%r)" % (self.__class__.__name__, list(self.positions))
+ return "%s(%r)" % (
+ self.__class__.__name__, list(self.positions))
@classmethod
def expand(cls, starts, winlens, excludezero=False):
"""
if not all(x > 0 for x in winlens):
raise ValueError("non-positive window length in {0}".format(winlens))
- xs = (starts[i : i + w] for w in winlens for i in range(len(starts) - w + 1))
+ xs = (starts[i:i+w] for w in winlens for i in range(len(starts)-w+1))
return [cls(x) for x in xs if not (excludezero and 0 in x)]
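# Illustrative sketch (not part of the patch): expand slides windows of the
# given lengths over the start positions; with the Pos feature from
# nltk.tag.brill, e.g.
#
#   Pos.expand([-2, -1, 0], winlens=[1, 2], excludezero=True)
#   # [Pos([-2]), Pos([-1]), Pos([-2, -1])]
#
# -- the windows containing 0 ([0] and [-1, 0]) are excluded.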
def issuperset(self, other):
"""
- return self.__class__ is other.__class__ and set(self.positions) >= set(
- other.positions
- )
+ return self.__class__ is other.__class__ and set(self.positions) >= set(other.positions)
def intersects(self, other):
"""
:rtype: bool
"""
- return bool(
- (
- self.__class__ is other.__class__
- and set(self.positions) & set(other.positions)
- )
- )
+ return bool((self.__class__ is other.__class__ and set(self.positions) & set(other.positions)))
# Rich comparisons for Features. With @functools.total_ordering (Python 2.7+),
# it will be enough to define __lt__ and __eq__
def __eq__(self, other):
- return self.__class__ is other.__class__ and self.positions == other.positions
+ return (self.__class__ is other.__class__ and self.positions == other.positions)
def __lt__(self, other):
return (
- self.__class__.__name__ < other.__class__.__name__
- or
+ self.__class__.__name__ < other.__class__.__name__ or
# self.positions is a sorted tuple of ints
self.positions < other.positions
)
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Transformation-based learning
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Marcus Uneson <marcus.uneson@gmail.com>
# based on previous (nltk2) version by
# Christopher Maloof, Edward Loper, Steven Bird
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function
from abc import ABCMeta, abstractmethod
+from six import add_metaclass
+from nltk.compat import python_2_unicode_compatible, unicode_repr
from nltk import jsontags
######################################################################
# Tag Rules
######################################################################
-class TagRule(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class TagRule(object):
"""
An interface for tag transformations on a tagged corpus, as
performed by tbl taggers. Each transformation finds all tokens
raise TypeError("Rules must implement __hash__()")
+@python_2_unicode_compatible
@jsontags.register_tag
class Rule(TagRule):
"""
"""
- json_tag = "nltk.tbl.Rule"
+ json_tag = 'nltk.tbl.Rule'
def __init__(self, templateid, original_tag, replacement_tag, conditions):
"""
def encode_json_obj(self):
return {
- "templateid": self.templateid,
- "original": self.original_tag,
- "replacement": self.replacement_tag,
- "conditions": self._conditions,
+ 'templateid': self.templateid,
+ 'original': self.original_tag,
+ 'replacement': self.replacement_tag,
+ 'conditions': self._conditions,
}
@classmethod
def decode_json_obj(cls, obj):
- return cls(
- obj["templateid"],
- obj["original"],
- obj["replacement"],
- tuple(tuple(feat) for feat in obj["conditions"])
- )
+ return cls(obj['templateid'], obj['original'], obj['replacement'], tuple(tuple(feat) for feat in obj['conditions']))
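# Round-trip sketch (encoder/decoder names from nltk.jsontags assumed):
#
#     import json
#     from nltk import jsontags
#     s = json.dumps(rule, cls=jsontags.JSONTaggedEncoder)
#     rule2 = json.loads(s, cls=jsontags.JSONTaggedDecoder)
#     assert rule2 == rule  # holds because the conditions are rebuilt as tuples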
def applies(self, tokens, index):
# Inherit docs from TagRule
for pos in feature.positions:
if not (0 <= index + pos < len(tokens)):
continue
- if feature.extract_property(tokens, index + pos) == val:
+ if feature.extract_property(tokens, index+pos) == val:
break
else:
# No token satisfied the condition; return false.
return False
# Every condition checked out, so the rule is applicable.
return True
def __eq__(self, other):
- return self is other or (
- other is not None
- and other.__class__ == self.__class__
- and self.original_tag == other.original_tag
- and self.replacement_tag == other.replacement_tag
- and self._conditions == other._conditions
- )
+ return (self is other or
+ (other is not None and
+ other.__class__ == self.__class__ and
+ self.original_tag == other.original_tag and
+ self.replacement_tag == other.replacement_tag and
+ self._conditions == other._conditions))
def __ne__(self, other):
return not (self == other)
try:
return self.__repr
except AttributeError:
- self.__repr = "{0}('{1}', {2}, {3}, [{4}])".format(
- self.__class__.__name__,
- self.templateid,
- repr(self.original_tag),
- repr(self.replacement_tag),
- # list(self._conditions) would be simpler but will not generate
- # the same Rule.__repr__ in python 2 and 3 and thus break some tests
- ", ".join(
- "({0},{1})".format(f, repr(v))
- for (f, v) in self._conditions
- ),
+ self.__repr = (
+ "{0}('{1}', {2}, {3}, [{4}])".format(
+ self.__class__.__name__,
+ self.templateid,
+ unicode_repr(self.original_tag),
+ unicode_repr(self.replacement_tag),
+
+ # list(self._conditions) would be simpler but will not generate
+ # the same Rule.__repr__ in python 2 and 3 and thus break some tests
+ ', '.join("({0},{1})".format(f, unicode_repr(v)) for (f, v) in self._conditions)
+ )
)
return self.__repr
Return a compact, predicate-logic styled string representation
of the given condition.
"""
- return "{0}:{1}@[{2}]".format(
+ return '{0}:{1}@[{2}]'.format(
feature.PROPERTY_NAME,
value,
- ",".join(str(w) for w in feature.positions),
+ ",".join(str(w) for w in feature.positions)
)
- conditions = " & ".join(
- [_condition_to_logic(f, v) for (f, v) in self._conditions]
- )
- s = "{0}->{1} if {2}".format(
- self.original_tag, self.replacement_tag, conditions
+ conditions = ' & '.join([_condition_to_logic(f, v) for (f, v) in self._conditions])
+ s = '{0}->{1} if {2}'.format(
+ self.original_tag,
+ self.replacement_tag,
+ conditions
)
return s
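# Example output (values hypothetical, derived from the format strings above):
# a rule that retags NN as VB when the preceding Pos is TO prints as
#
#     NN->VB if Pos:TO@[-1]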
Not sure how useful this is.
"""
-
def condition_to_str(feature, value):
- return 'the %s of %s is "%s"' % (
- feature.PROPERTY_NAME,
- range_to_str(feature.positions),
- value,
- )
+ return ('the %s of %s is "%s"' %
+ (feature.PROPERTY_NAME, range_to_str(feature.positions), value))
def range_to_str(positions):
if len(positions) == 1:
p = positions[0]
if p == 0:
- return "this word"
+ return 'this word'
if p == -1:
- return "the preceding word"
+ return 'the preceding word'
elif p == 1:
- return "the following word"
+ return 'the following word'
elif p < 0:
- return "word i-%d" % -p
+ return 'word i-%d' % -p
elif p > 0:
- return "word i+%d" % p
+ return 'word i+%d' % p
else:
# for complete compatibility with the wordy format of nltk2
mx = max(positions)
mn = min(positions)
if mx - mn == len(positions) - 1:
- return "words i%+d...i%+d" % (mn, mx)
+ return 'words i%+d...i%+d' % (mn, mx)
else:
- return "words {%s}" % (",".join("i%+d" % d for d in positions),)
+ return 'words {%s}' % (",".join("i%+d" % d for d in positions),)
- replacement = "%s -> %s" % (self.original_tag, self.replacement_tag)
- conditions = (" if " if self._conditions else "") + ", and ".join(
+ replacement = '%s -> %s' % (self.original_tag, self.replacement_tag)
+ conditions = (' if ' if self._conditions else "") + ', and '.join(
condition_to_str(f, v) for (f, v) in self._conditions
)
return replacement + conditions
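# The same hypothetical rule in the wordy nltk2-compatible format reads:
#
#     NN -> VB if the Pos of the preceding word is "TO"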
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Transformation-based learning
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Marcus Uneson <marcus.uneson@gmail.com>
# based on previous (nltk2) version by
# Christopher Maloof, Edward Loper, Steven Bird
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function
from abc import ABCMeta, abstractmethod
+from six import add_metaclass
import itertools as it
from nltk.tbl.feature import Feature
from nltk.tbl.rule import Rule
-class BrillTemplateI(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class BrillTemplateI(object):
"""
An interface for generating lists of transformational rules that
apply at given sentence positions. ``BrillTemplateI`` is used by
``Brill`` training algorithms to generate candidate rules.
"""
-
@abstractmethod
def applicable_rules(self, tokens, i, correctTag):
"""
- use the given features, each at its own independent position; and
- are applicable to the given token.
"""
-
ALLTEMPLATES = []
# record a unique id of form "001", for each template created
# _ids = it.count(0)
# Template(Feature1(args), Feature2(args), ...)
if all(isinstance(f, Feature) for f in features):
self._features = features
- elif issubclass(features[0], Feature) and all(
- isinstance(a, tuple) for a in features[1:]
- ):
+ elif issubclass(features[0], Feature) and all(isinstance(a, tuple) for a in features[1:]):
self._features = [features[0](*tp) for tp in features[1:]]
else:
raise TypeError(
- "expected either Feature1(args), Feature2(args), ... or Feature, (start1, end1), (start2, end2), ..."
- )
+ "expected either Feature1(args), Feature2(args), ... or Feature, (start1, end1), (start2, end2), ...")
self.id = "{0:03d}".format(len(self.ALLTEMPLATES))
self.ALLTEMPLATES.append(self)
def __repr__(self):
- return "%s(%s)" % (
- self.__class__.__name__,
- ",".join([str(f) for f in self._features]),
- )
+ return "%s(%s)" % (self.__class__.__name__, ",".join([str(f) for f in self._features]))
def applicable_rules(self, tokens, index, correct_tag):
if tokens[index][1] == correct_tag:
for feature in self._features:
conditions.append([])
for pos in feature.positions:
- if not (0 <= index + pos < len(tokens)):
+ if not (0 <= index+pos < len(tokens)):
continue
- value = feature.extract_property(tokens, index + pos)
- conditions[-1].append((feature, value))
+ value = feature.extract_property(tokens, index+pos)
+ conditions[-1].append( (feature, value) )
return conditions
def get_neighborhood(self, tokens, index):
# inherit docs from BrillTemplateI
# applicable_rules(tokens, index, ...) depends on index.
- neighborhood = set([index]) # set literal for python 2.7+
+ neighborhood = set([index]) #set literal for python 2.7+
# applicable_rules(tokens, i, ...) depends on index if
# i+start < index <= i+end.
allpositions = [0] + [p for feat in self._features for p in feat.positions]
start, end = min(allpositions), max(allpositions)
- s = max(0, index + (-end))
- e = min(index + (-start) + 1, len(tokens))
+ s = max(0, index+(-end))
+ e = min(index+(-start)+1, len(tokens))
for i in range(s, e):
neighborhood.add(i)
return neighborhood
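# Worked example (a sketch): for a template whose features use positions [-1, 0, 1],
# allpositions = [0, -1, 0, 1], so start = -1 and end = 1. At index 5 in a long
# sentence, s = max(0, 5 - 1) = 4 and e = min(5 + 1 + 1, len(tokens)) = 7, giving
# neighborhood {4, 5, 6}: the indices whose applicable rules may be affected.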
:returns: generator of Templates
"""
-
- def nonempty_powerset(xs): # xs is a list
+ def nonempty_powerset(xs): #xs is a list
# itertools doc: nonempty_powerset([1,2,3]) --> (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)
# find the correct tuple given combinations, one of {None, k, (k1,k2)}
- k = combinations # for brevity
- combrange = (
- (1, len(xs) + 1)
- if k is None
- else (k, k + 1) # n over 1 .. n over n (all non-empty combinations)
- if isinstance(k, int)
- else (k[0], k[1] + 1) # n over k (only
- ) # n over k1, n over k1+1... n over k2
- return it.chain.from_iterable(
- it.combinations(xs, r) for r in range(*combrange)
- )
-
+ k = combinations #for brevity
+ combrange = ((1, len(xs)+1) if k is None else # n over 1 .. n over n (all non-empty combinations)
+ (k, k+1) if isinstance(k, int) else # n over k (only
+ (k[0], k[1]+1)) # n over k1, n over k1+1... n over k2
+ return it.chain.from_iterable(it.combinations(xs, r)
+ for r in range(*combrange))
seentemplates = set()
for picks in nonempty_powerset(featurelists):
for pick in it.product(*picks):
- if any(
- i != j and x.issuperset(y)
- for (i, x) in enumerate(pick)
- for (j, y) in enumerate(pick)
- ):
+ if any(i != j and x.issuperset(y)
+ for (i, x) in enumerate(pick)
+ for (j, y) in enumerate(pick)):
continue
- if skipintersecting and any(
- i != j and x.intersects(y)
- for (i, x) in enumerate(pick)
- for (j, y) in enumerate(pick)
- ):
+ if skipintersecting and any(i != j and x.intersects(y)
+ for (i, x) in enumerate(pick)
+ for (j, y) in enumerate(pick)):
continue
thistemplate = cls(*sorted(pick))
strpick = str(thistemplate)
#!!FIXME --this is hackish
- if strpick in seentemplates: # already added
+ if strpick in seentemplates: #already added
cls._poptemplate()
continue
seentemplates.add(strpick)
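# Usage sketch (Word/Pos features from nltk.tag.brill assumed):
#
#     from nltk.tag.brill import Word, Pos
#     word_feats = Word.expand([-1, 0, 1], [1, 2])
#     pos_feats = Pos.expand([-2, -1, 0], [1])
#     templates = list(Template.expand([word_feats, pos_feats], combinations=(1, 2)))
#     # -> Templates combining one feature from each chosen list (1 or 2 lists),
#     #    skipping duplicate and redundant picks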
+++ /dev/null
-# Natural Language Toolkit: Unit Tests
-#
-# Copyright (C) 2001-2020 NLTK Project
-# Author: Edward Loper <edloper@gmail.com>
-# URL: <http://nltk.org/>
-# For license information, see LICENSE.TXT
-
-"""
-Unit tests for the NLTK modules. These tests are intended to ensure
-that source code changes don't accidentally introduce bugs.
-For instructions, please see:
-
-../../web/dev/local_testing.rst
-
-https://github.com/nltk/nltk/blob/develop/web/dev/local_testing.rst
-
-
-"""
+++ /dev/null
-"""Test suite that runs all NLTK tests.
-
-This module, `nltk.test.all`, is named as the NLTK ``test_suite`` in the
-project's ``setup-eggs.py`` file. Here, we create a test suite that
-runs all of our doctests, and return it for processing by the setuptools
-test harness.
-
-"""
-import doctest, unittest
-from glob import glob
-import os.path
-
-
-def additional_tests():
- # print("here-000000000000000")
- # print("-----", glob(os.path.join(os.path.dirname(__file__), '*.doctest')))
- dir = os.path.dirname(__file__)
- paths = glob(os.path.join(dir, "*.doctest"))
- files = [os.path.basename(path) for path in paths]
- return unittest.TestSuite([doctest.DocFileSuite(file) for file in files])
-
-
-# if os.path.split(path)[-1] != 'index.rst'
-# skips time-dependent doctest in index.rst
+++ /dev/null
-==========
-BLEU tests
-==========
-
->>> from nltk.translate import bleu
-
-If the candidate has no alignment to any of the references, the BLEU score is 0.
-
->>> bleu(
-... ['The candidate has no alignment to any of the references'.split()],
-... 'John loves Mary'.split(),
-... [1],
-... )
-0
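-
-For contrast, a candidate identical to a reference scores 1 under unigram
-weights (a companion check reasoned from the definition, not part of the
-original suite).
-
->>> bleu(['John loves Mary'.split()], 'John loves Mary'.split(), [1])
-1.0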
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
- >>> import os.path
-
- >>> from nltk.corpus.reader import BNCCorpusReader
- >>> import nltk.test
-
- >>> root = os.path.dirname(nltk.test.__file__)
- >>> bnc = BNCCorpusReader(root=root, fileids='FX8.xml')
-
-Checking the word access.
--------------------------
-
- >>> len(bnc.words())
- 151
-
- >>> bnc.words()[:6]
- ['Ah', 'there', 'we', 'are', ',', '.']
- >>> bnc.words(stem=True)[:6]
- ['ah', 'there', 'we', 'be', ',', '.']
-
- >>> bnc.tagged_words()[:6]
- [('Ah', 'INTERJ'), ('there', 'ADV'), ('we', 'PRON'), ('are', 'VERB'), (',', 'PUN'), ('.', 'PUN')]
-
- >>> bnc.tagged_words(c5=True)[:6]
- [('Ah', 'ITJ'), ('there', 'AV0'), ('we', 'PNP'), ('are', 'VBB'), (',', 'PUN'), ('.', 'PUN')]
-
-Testing access to the sentences.
---------------------------------
-
- >>> len(bnc.sents())
- 15
-
- >>> bnc.sents()[0]
- ['Ah', 'there', 'we', 'are', ',', '.']
- >>> bnc.sents(stem=True)[0]
- ['ah', 'there', 'we', 'be', ',', '.']
-
- >>> bnc.tagged_sents()[0]
- [('Ah', 'INTERJ'), ('there', 'ADV'), ('we', 'PRON'), ('are', 'VERB'), (',', 'PUN'), ('.', 'PUN')]
- >>> bnc.tagged_sents(c5=True)[0]
- [('Ah', 'ITJ'), ('there', 'AV0'), ('we', 'PNP'), ('are', 'VBB'), (',', 'PUN'), ('.', 'PUN')]
-
-A non-lazy loader.
-------------------
-
- >>> eager = BNCCorpusReader(root=root, fileids=r'FX8.xml', lazy=False)
-
- >>> len(eager.words())
- 151
- >>> eager.words(stem=True)[6:17]
- ['right', 'abdominal', 'wound', ',', 'she', 'be', 'a', 'wee', 'bit', 'confuse', '.']
-
- >>> eager.tagged_words()[6:11]
- [('Right', 'ADV'), ('abdominal', 'ADJ'), ('wound', 'SUBST'), (',', 'PUN'), ('she', 'PRON')]
- >>> eager.tagged_words(c5=True)[6:17]
- [('Right', 'AV0'), ('abdominal', 'AJ0'), ('wound', 'NN1'), (',', 'PUN'), ('she', 'PNP'), ("'s", 'VBZ'), ('a', 'AT0'), ('wee', 'AJ0-NN1'), ('bit', 'NN1'), ('confused', 'VVN-AJ0'), ('.', 'PUN')]
- >>> len(eager.sents())
- 15
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-==============================
-Combinatory Categorial Grammar
-==============================
-
-Relative Clauses
-----------------
-
- >>> from nltk.ccg import chart, lexicon
-
-Construct a lexicon:
-
- >>> lex = lexicon.parseLexicon('''
- ... :- S, NP, N, VP
- ...
- ... Det :: NP/N
- ... Pro :: NP
- ... Modal :: S\\NP/VP
- ...
- ... TV :: VP/NP
- ... DTV :: TV/NP
- ...
- ... the => Det
- ...
- ... that => Det
- ... that => NP
- ...
- ... I => Pro
- ... you => Pro
- ... we => Pro
- ...
- ... chef => N
- ... cake => N
- ... children => N
- ... dough => N
- ...
- ... will => Modal
- ... should => Modal
- ... might => Modal
- ... must => Modal
- ...
- ... and => var\\.,var/.,var
- ...
- ... to => VP[to]/VP
- ...
- ... without => (VP\\VP)/VP[ing]
- ...
- ... be => TV
- ... cook => TV
- ... eat => TV
- ...
- ... cooking => VP[ing]/NP
- ...
- ... give => DTV
- ...
- ... is => (S\\NP)/NP
- ... prefer => (S\\NP)/NP
- ...
- ... which => (N\\N)/(S/NP)
- ...
- ... persuade => (VP/VP[to])/NP
- ... ''')
-
- >>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet)
- >>> for parse in parser.parse("you prefer that cake".split()):
- ... chart.printCCGDerivation(parse)
- ... break
- ...
- you prefer that cake
- NP ((S\NP)/NP) (NP/N) N
- -------------->
- NP
- --------------------------->
- (S\NP)
- --------------------------------<
- S
-
- >>> for parse in parser.parse("that is the cake which you prefer".split()):
- ... chart.printCCGDerivation(parse)
- ... break
- ...
- that is the cake which you prefer
- NP ((S\NP)/NP) (NP/N) N ((N\N)/(S/NP)) NP ((S\NP)/NP)
- ----->T
- (S/(S\NP))
- ------------------>B
- (S/NP)
- ---------------------------------->
- (N\N)
- ----------------------------------------<
- N
- ------------------------------------------------>
- NP
- ------------------------------------------------------------->
- (S\NP)
- -------------------------------------------------------------------<
- S
-
-
-Some other sentences to try:
-"that is the cake which we will persuade the chef to cook"
-"that is the cake which we will persuade the chef to give the children"
-
- >>> sent = "that is the dough which you will eat without cooking".split()
- >>> nosub_parser = chart.CCGChartParser(lex, chart.ApplicationRuleSet +
- ... chart.CompositionRuleSet + chart.TypeRaiseRuleSet)
-
-Without Substitution (no output)
-
- >>> for parse in nosub_parser.parse(sent):
- ... chart.printCCGDerivation(parse)
-
-With Substitution:
-
- >>> for parse in parser.parse(sent):
- ... chart.printCCGDerivation(parse)
- ... break
- ...
- that is the dough which you will eat without cooking
- NP ((S\NP)/NP) (NP/N) N ((N\N)/(S/NP)) NP ((S\NP)/VP) (VP/NP) ((VP\VP)/VP['ing']) (VP['ing']/NP)
- ----->T
- (S/(S\NP))
- ------------------------------------->B
- ((VP\VP)/NP)
- ----------------------------------------------<Sx
- (VP/NP)
- ----------------------------------------------------------->B
- ((S\NP)/NP)
- ---------------------------------------------------------------->B
- (S/NP)
- -------------------------------------------------------------------------------->
- (N\N)
- ---------------------------------------------------------------------------------------<
- N
- ----------------------------------------------------------------------------------------------->
- NP
- ------------------------------------------------------------------------------------------------------------>
- (S\NP)
- ------------------------------------------------------------------------------------------------------------------<
- S
-
-
-Conjunction
------------
-
- >>> from nltk.ccg.chart import CCGChartParser, ApplicationRuleSet, CompositionRuleSet
- >>> from nltk.ccg.chart import SubstitutionRuleSet, TypeRaiseRuleSet, printCCGDerivation
- >>> from nltk.ccg import lexicon
-
-Lexicons for the tests:
-
- >>> test1_lex = '''
- ... :- S,N,NP,VP
- ... I => NP
- ... you => NP
- ... will => S\\NP/VP
- ... cook => VP/NP
- ... which => (N\\N)/(S/NP)
- ... and => var\\.,var/.,var
- ... might => S\\NP/VP
- ... eat => VP/NP
- ... the => NP/N
- ... mushrooms => N
- ... parsnips => N'''
- >>> test2_lex = '''
- ... :- N, S, NP, VP
- ... articles => N
- ... the => NP/N
- ... and => var\\.,var/.,var
- ... which => (N\\N)/(S/NP)
- ... I => NP
- ... anyone => NP
- ... will => (S/VP)\\NP
- ... file => VP/NP
- ... without => (VP\\VP)/VP[ing]
- ... forget => VP/NP
- ... reading => VP[ing]/NP
- ... '''
-
-Tests handling of conjunctions.
-Note that while the two derivations are different, they are semantically equivalent.
-
- >>> lex = lexicon.parseLexicon(test1_lex)
- >>> parser = CCGChartParser(lex, ApplicationRuleSet + CompositionRuleSet + SubstitutionRuleSet)
- >>> for parse in parser.parse("I will cook and might eat the mushrooms and parsnips".split()):
- ... printCCGDerivation(parse)
- I will cook and might eat the mushrooms and parsnips
- NP ((S\NP)/VP) (VP/NP) ((_var0\.,_var0)/.,_var0) ((S\NP)/VP) (VP/NP) (NP/N) N ((_var0\.,_var0)/.,_var0) N
- ---------------------->B
- ((S\NP)/NP)
- ---------------------->B
- ((S\NP)/NP)
- ------------------------------------------------->
- (((S\NP)/NP)\.,((S\NP)/NP))
- -----------------------------------------------------------------------<
- ((S\NP)/NP)
- ------------------------------------->
- (N\.,N)
- ------------------------------------------------<
- N
- -------------------------------------------------------->
- NP
- ------------------------------------------------------------------------------------------------------------------------------->
- (S\NP)
- -----------------------------------------------------------------------------------------------------------------------------------<
- S
- I will cook and might eat the mushrooms and parsnips
- NP ((S\NP)/VP) (VP/NP) ((_var0\.,_var0)/.,_var0) ((S\NP)/VP) (VP/NP) (NP/N) N ((_var0\.,_var0)/.,_var0) N
- ---------------------->B
- ((S\NP)/NP)
- ---------------------->B
- ((S\NP)/NP)
- ------------------------------------------------->
- (((S\NP)/NP)\.,((S\NP)/NP))
- -----------------------------------------------------------------------<
- ((S\NP)/NP)
- ------------------------------------------------------------------------------->B
- ((S\NP)/N)
- ------------------------------------->
- (N\.,N)
- ------------------------------------------------<
- N
- ------------------------------------------------------------------------------------------------------------------------------->
- (S\NP)
- -----------------------------------------------------------------------------------------------------------------------------------<
- S
-
-
-Tests handling of subject extraction.
-It is interesting to note that the two parses are clearly semantically different.
-
- >>> lex = lexicon.parseLexicon(test2_lex)
- >>> parser = CCGChartParser(lex, ApplicationRuleSet + CompositionRuleSet + SubstitutionRuleSet)
- >>> for parse in parser.parse("articles which I will file and forget without reading".split()):
- ... printCCGDerivation(parse)
- articles which I will file and forget without reading
- N ((N\N)/(S/NP)) NP ((S/VP)\NP) (VP/NP) ((_var0\.,_var0)/.,_var0) (VP/NP) ((VP\VP)/VP['ing']) (VP['ing']/NP)
- -----------------<
- (S/VP)
- ------------------------------------->B
- ((VP\VP)/NP)
- ----------------------------------------------<Sx
- (VP/NP)
- ------------------------------------------------------------------------->
- ((VP/NP)\.,(VP/NP))
- ----------------------------------------------------------------------------------<
- (VP/NP)
- --------------------------------------------------------------------------------------------------->B
- (S/NP)
- ------------------------------------------------------------------------------------------------------------------->
- (N\N)
- -----------------------------------------------------------------------------------------------------------------------------<
- N
- articles which I will file and forget without reading
- N ((N\N)/(S/NP)) NP ((S/VP)\NP) (VP/NP) ((_var0\.,_var0)/.,_var0) (VP/NP) ((VP\VP)/VP['ing']) (VP['ing']/NP)
- -----------------<
- (S/VP)
- ------------------------------------>
- ((VP/NP)\.,(VP/NP))
- ---------------------------------------------<
- (VP/NP)
- ------------------------------------->B
- ((VP\VP)/NP)
- ----------------------------------------------------------------------------------<Sx
- (VP/NP)
- --------------------------------------------------------------------------------------------------->B
- (S/NP)
- ------------------------------------------------------------------------------------------------------------------->
- (N\N)
- -----------------------------------------------------------------------------------------------------------------------------<
- N
-
-
-Unicode support
----------------
-
-Unicode words are supported.
-
- >>> from nltk.ccg import chart, lexicon
-
-Lexicons for the tests:
-
- >>> lex = lexicon.parseLexicon('''
- ... :- S, N, NP, PP
- ...
- ... AdjI :: N\\N
- ... AdjD :: N/N
- ... AdvD :: S/S
- ... AdvI :: S\\S
- ... Det :: NP/N
- ... PrepNPCompl :: PP/NP
- ... PrepNAdjN :: S\\S/N
- ... PrepNAdjNP :: S\\S/NP
- ... VPNP :: S\\NP/NP
- ... VPPP :: S\\NP/PP
- ... VPser :: S\\NP/AdjI
- ...
- ... auto => N
- ... bebidas => N
- ... cine => N
- ... ley => N
- ... libro => N
- ... ministro => N
- ... panadería => N
- ... presidente => N
- ... super => N
- ...
- ... el => Det
- ... la => Det
- ... las => Det
- ... un => Det
- ...
- ... Ana => NP
- ... Pablo => NP
- ...
- ... y => var\\.,var/.,var
- ...
- ... pero => (S/NP)\\(S/NP)/(S/NP)
- ...
- ... anunció => VPNP
- ... compró => VPNP
- ... cree => S\\NP/S[dep]
- ... desmintió => VPNP
- ... lee => VPNP
- ... fueron => VPPP
- ...
- ... es => VPser
- ...
- ... interesante => AdjD
- ... interesante => AdjI
- ... nueva => AdjD
- ... nueva => AdjI
- ...
- ... a => PrepNPCompl
- ... en => PrepNAdjN
- ... en => PrepNAdjNP
- ...
- ... ayer => AdvI
- ...
- ... que => (NP\\NP)/(S/NP)
- ... que => S[dep]/S
- ... ''')
-
- >>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet)
- >>> for parse in parser.parse(u"el ministro anunció pero el presidente desmintió la nueva ley".split()):
- ... printCCGDerivation(parse) # doctest: +SKIP
- ... # it fails on python2.7 because of the unicode problem explained in https://github.com/nltk/nltk/pull/1354
- ... break
- el ministro anunció pero el presidente desmintió la nueva ley
- (NP/N) N ((S\NP)/NP) (((S/NP)\(S/NP))/(S/NP)) (NP/N) N ((S\NP)/NP) (NP/N) (N/N) N
- ------------------>
- NP
- ------------------>T
- (S/(S\NP))
- -------------------->
- NP
- -------------------->T
- (S/(S\NP))
- --------------------------------->B
- (S/NP)
- ----------------------------------------------------------->
- ((S/NP)\(S/NP))
- ------------>
- N
- -------------------->
- NP
- --------------------<T
- (S\(S/NP))
- -------------------------------------------------------------------------------<B
- (S\(S/NP))
- --------------------------------------------------------------------------------------------<B
- (S/NP)
- -------------------------------------------------------------------------------------------------------------->
- S
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-==============================================
-Combinatory Categorial Grammar with semantics
-==============================================
-
------
-Chart
------
-
-
- >>> from nltk.ccg import chart, lexicon
- >>> from nltk.ccg.chart import printCCGDerivation
-
-No semantics
--------------------
-
- >>> lex = lexicon.fromstring('''
- ... :- S, NP, N
- ... She => NP
- ... has => (S\\NP)/NP
- ... books => NP
- ... ''',
- ... False)
-
- >>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet)
- >>> parses = list(parser.parse("She has books".split()))
- >>> print(str(len(parses)) + " parses")
- 3 parses
-
- >>> printCCGDerivation(parses[0])
- She has books
- NP ((S\NP)/NP) NP
- -------------------->
- (S\NP)
- -------------------------<
- S
-
- >>> printCCGDerivation(parses[1])
- She has books
- NP ((S\NP)/NP) NP
- ----->T
- (S/(S\NP))
- -------------------->
- (S\NP)
- ------------------------->
- S
-
-
- >>> printCCGDerivation(parses[2])
- She has books
- NP ((S\NP)/NP) NP
- ----->T
- (S/(S\NP))
- ------------------>B
- (S/NP)
- ------------------------->
- S
-
-Simple semantics
--------------------
-
- >>> lex = lexicon.fromstring('''
- ... :- S, NP, N
- ... She => NP {she}
- ... has => (S\\NP)/NP {\\x y.have(y, x)}
- ... a => NP/N {\\P.exists z.P(z)}
- ... book => N {book}
- ... ''',
- ... True)
-
- >>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet)
- >>> parses = list(parser.parse("She has a book".split()))
- >>> print(str(len(parses)) + " parses")
- 7 parses
-
- >>> printCCGDerivation(parses[0])
- She has a book
- NP {she} ((S\NP)/NP) {\x y.have(y,x)} (NP/N) {\P.exists z.P(z)} N {book}
- ------------------------------------->
- NP {exists z.book(z)}
- ------------------------------------------------------------------->
- (S\NP) {\y.have(y,exists z.book(z))}
- -----------------------------------------------------------------------------<
- S {have(she,exists z.book(z))}
-
- >>> printCCGDerivation(parses[1])
- She has a book
- NP {she} ((S\NP)/NP) {\x y.have(y,x)} (NP/N) {\P.exists z.P(z)} N {book}
- --------------------------------------------------------->B
- ((S\NP)/N) {\P y.have(y,exists z.P(z))}
- ------------------------------------------------------------------->
- (S\NP) {\y.have(y,exists z.book(z))}
- -----------------------------------------------------------------------------<
- S {have(she,exists z.book(z))}
-
- >>> printCCGDerivation(parses[2])
- She has a book
- NP {she} ((S\NP)/NP) {\x y.have(y,x)} (NP/N) {\P.exists z.P(z)} N {book}
- ---------->T
- (S/(S\NP)) {\F.F(she)}
- ------------------------------------->
- NP {exists z.book(z)}
- ------------------------------------------------------------------->
- (S\NP) {\y.have(y,exists z.book(z))}
- ----------------------------------------------------------------------------->
- S {have(she,exists z.book(z))}
-
- >>> printCCGDerivation(parses[3])
- She has a book
- NP {she} ((S\NP)/NP) {\x y.have(y,x)} (NP/N) {\P.exists z.P(z)} N {book}
- ---------->T
- (S/(S\NP)) {\F.F(she)}
- --------------------------------------------------------->B
- ((S\NP)/N) {\P y.have(y,exists z.P(z))}
- ------------------------------------------------------------------->
- (S\NP) {\y.have(y,exists z.book(z))}
- ----------------------------------------------------------------------------->
- S {have(she,exists z.book(z))}
-
- >>> printCCGDerivation(parses[4])
- She has a book
- NP {she} ((S\NP)/NP) {\x y.have(y,x)} (NP/N) {\P.exists z.P(z)} N {book}
- ---------->T
- (S/(S\NP)) {\F.F(she)}
- ---------------------------------------->B
- (S/NP) {\x.have(she,x)}
- ------------------------------------->
- NP {exists z.book(z)}
- ----------------------------------------------------------------------------->
- S {have(she,exists z.book(z))}
-
- >>> printCCGDerivation(parses[5])
- She has a book
- NP {she} ((S\NP)/NP) {\x y.have(y,x)} (NP/N) {\P.exists z.P(z)} N {book}
- ---------->T
- (S/(S\NP)) {\F.F(she)}
- --------------------------------------------------------->B
- ((S\NP)/N) {\P y.have(y,exists z.P(z))}
- ------------------------------------------------------------------->B
- (S/N) {\P.have(she,exists z.P(z))}
- ----------------------------------------------------------------------------->
- S {have(she,exists z.book(z))}
-
- >>> printCCGDerivation(parses[6])
- She has a book
- NP {she} ((S\NP)/NP) {\x y.have(y,x)} (NP/N) {\P.exists z.P(z)} N {book}
- ---------->T
- (S/(S\NP)) {\F.F(she)}
- ---------------------------------------->B
- (S/NP) {\x.have(she,x)}
- ------------------------------------------------------------------->B
- (S/N) {\P.have(she,exists z.P(z))}
- ----------------------------------------------------------------------------->
- S {have(she,exists z.book(z))}
-
-Complex semantics
--------------------
-
- >>> lex = lexicon.fromstring('''
- ... :- S, NP, N
- ... She => NP {she}
- ... has => (S\\NP)/NP {\\x y.have(y, x)}
- ... a => ((S\\NP)\\((S\\NP)/NP))/N {\\P R x.(exists z.P(z) & R(z,x))}
- ... book => N {book}
- ... ''',
- ... True)
-
- >>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet)
- >>> parses = list(parser.parse("She has a book".split()))
- >>> print(str(len(parses)) + " parses")
- 2 parses
-
- >>> printCCGDerivation(parses[0])
- She has a book
- NP {she} ((S\NP)/NP) {\x y.have(y,x)} (((S\NP)\((S\NP)/NP))/N) {\P R x.(exists z.P(z) & R(z,x))} N {book}
- ---------------------------------------------------------------------->
- ((S\NP)\((S\NP)/NP)) {\R x.(exists z.book(z) & R(z,x))}
- ----------------------------------------------------------------------------------------------------<
- (S\NP) {\x.(exists z.book(z) & have(x,z))}
- --------------------------------------------------------------------------------------------------------------<
- S {(exists z.book(z) & have(she,z))}
-
- >>> printCCGDerivation(parses[1])
- She has a book
- NP {she} ((S\NP)/NP) {\x y.have(y,x)} (((S\NP)\((S\NP)/NP))/N) {\P R x.(exists z.P(z) & R(z,x))} N {book}
- ---------->T
- (S/(S\NP)) {\F.F(she)}
- ---------------------------------------------------------------------->
- ((S\NP)\((S\NP)/NP)) {\R x.(exists z.book(z) & R(z,x))}
- ----------------------------------------------------------------------------------------------------<
- (S\NP) {\x.(exists z.book(z) & have(x,z))}
- -------------------------------------------------------------------------------------------------------------->
- S {(exists z.book(z) & have(she,z))}
-
-Using conjunctions
----------------------
-
- # TODO: The semantics of "and" should be made more flexible
- >>> lex = lexicon.fromstring('''
- ... :- S, NP, N
- ... I => NP {I}
- ... cook => (S\\NP)/NP {\\x y.cook(x,y)}
- ... and => var\\.,var/.,var {\\P Q x y.(P(x,y) & Q(x,y))}
- ... eat => (S\\NP)/NP {\\x y.eat(x,y)}
- ... the => NP/N {\\x.the(x)}
- ... bacon => N {bacon}
- ... ''',
- ... True)
-
- >>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet)
- >>> parses = list(parser.parse("I cook and eat the bacon".split()))
- >>> print(str(len(parses)) + " parses")
- 7 parses
-
- >>> printCCGDerivation(parses[0])
- I cook and eat the bacon
- NP {I} ((S\NP)/NP) {\x y.cook(x,y)} ((_var0\.,_var0)/.,_var0) {\P Q x y.(P(x,y) & Q(x,y))} ((S\NP)/NP) {\x y.eat(x,y)} (NP/N) {\x.the(x)} N {bacon}
- ------------------------------------------------------------------------------------->
- (((S\NP)/NP)\.,((S\NP)/NP)) {\Q x y.(eat(x,y) & Q(x,y))}
- -------------------------------------------------------------------------------------------------------------------<
- ((S\NP)/NP) {\x y.(eat(x,y) & cook(x,y))}
- ------------------------------->
- NP {the(bacon)}
- -------------------------------------------------------------------------------------------------------------------------------------------------->
- (S\NP) {\y.(eat(the(bacon),y) & cook(the(bacon),y))}
- ----------------------------------------------------------------------------------------------------------------------------------------------------------<
- S {(eat(the(bacon),I) & cook(the(bacon),I))}
-
- >>> printCCGDerivation(parses[1])
- I cook and eat the bacon
- NP {I} ((S\NP)/NP) {\x y.cook(x,y)} ((_var0\.,_var0)/.,_var0) {\P Q x y.(P(x,y) & Q(x,y))} ((S\NP)/NP) {\x y.eat(x,y)} (NP/N) {\x.the(x)} N {bacon}
- ------------------------------------------------------------------------------------->
- (((S\NP)/NP)\.,((S\NP)/NP)) {\Q x y.(eat(x,y) & Q(x,y))}
- -------------------------------------------------------------------------------------------------------------------<
- ((S\NP)/NP) {\x y.(eat(x,y) & cook(x,y))}
- --------------------------------------------------------------------------------------------------------------------------------------->B
- ((S\NP)/N) {\x y.(eat(the(x),y) & cook(the(x),y))}
- -------------------------------------------------------------------------------------------------------------------------------------------------->
- (S\NP) {\y.(eat(the(bacon),y) & cook(the(bacon),y))}
- ----------------------------------------------------------------------------------------------------------------------------------------------------------<
- S {(eat(the(bacon),I) & cook(the(bacon),I))}
-
- >>> printCCGDerivation(parses[2])
- I cook and eat the bacon
- NP {I} ((S\NP)/NP) {\x y.cook(x,y)} ((_var0\.,_var0)/.,_var0) {\P Q x y.(P(x,y) & Q(x,y))} ((S\NP)/NP) {\x y.eat(x,y)} (NP/N) {\x.the(x)} N {bacon}
- -------->T
- (S/(S\NP)) {\F.F(I)}
- ------------------------------------------------------------------------------------->
- (((S\NP)/NP)\.,((S\NP)/NP)) {\Q x y.(eat(x,y) & Q(x,y))}
- -------------------------------------------------------------------------------------------------------------------<
- ((S\NP)/NP) {\x y.(eat(x,y) & cook(x,y))}
- ------------------------------->
- NP {the(bacon)}
- -------------------------------------------------------------------------------------------------------------------------------------------------->
- (S\NP) {\y.(eat(the(bacon),y) & cook(the(bacon),y))}
- ---------------------------------------------------------------------------------------------------------------------------------------------------------->
- S {(eat(the(bacon),I) & cook(the(bacon),I))}
-
- >>> printCCGDerivation(parses[3])
- I cook and eat the bacon
- NP {I} ((S\NP)/NP) {\x y.cook(x,y)} ((_var0\.,_var0)/.,_var0) {\P Q x y.(P(x,y) & Q(x,y))} ((S\NP)/NP) {\x y.eat(x,y)} (NP/N) {\x.the(x)} N {bacon}
- -------->T
- (S/(S\NP)) {\F.F(I)}
- ------------------------------------------------------------------------------------->
- (((S\NP)/NP)\.,((S\NP)/NP)) {\Q x y.(eat(x,y) & Q(x,y))}
- -------------------------------------------------------------------------------------------------------------------<
- ((S\NP)/NP) {\x y.(eat(x,y) & cook(x,y))}
- --------------------------------------------------------------------------------------------------------------------------------------->B
- ((S\NP)/N) {\x y.(eat(the(x),y) & cook(the(x),y))}
- -------------------------------------------------------------------------------------------------------------------------------------------------->
- (S\NP) {\y.(eat(the(bacon),y) & cook(the(bacon),y))}
- ---------------------------------------------------------------------------------------------------------------------------------------------------------->
- S {(eat(the(bacon),I) & cook(the(bacon),I))}
-
- >>> printCCGDerivation(parses[4])
- I cook and eat the bacon
- NP {I} ((S\NP)/NP) {\x y.cook(x,y)} ((_var0\.,_var0)/.,_var0) {\P Q x y.(P(x,y) & Q(x,y))} ((S\NP)/NP) {\x y.eat(x,y)} (NP/N) {\x.the(x)} N {bacon}
- -------->T
- (S/(S\NP)) {\F.F(I)}
- ------------------------------------------------------------------------------------->
- (((S\NP)/NP)\.,((S\NP)/NP)) {\Q x y.(eat(x,y) & Q(x,y))}
- -------------------------------------------------------------------------------------------------------------------<
- ((S\NP)/NP) {\x y.(eat(x,y) & cook(x,y))}
- --------------------------------------------------------------------------------------------------------------------------->B
- (S/NP) {\x.(eat(x,I) & cook(x,I))}
- ------------------------------->
- NP {the(bacon)}
- ---------------------------------------------------------------------------------------------------------------------------------------------------------->
- S {(eat(the(bacon),I) & cook(the(bacon),I))}
-
- >>> printCCGDerivation(parses[5])
- I cook and eat the bacon
- NP {I} ((S\NP)/NP) {\x y.cook(x,y)} ((_var0\.,_var0)/.,_var0) {\P Q x y.(P(x,y) & Q(x,y))} ((S\NP)/NP) {\x y.eat(x,y)} (NP/N) {\x.the(x)} N {bacon}
- -------->T
- (S/(S\NP)) {\F.F(I)}
- ------------------------------------------------------------------------------------->
- (((S\NP)/NP)\.,((S\NP)/NP)) {\Q x y.(eat(x,y) & Q(x,y))}
- -------------------------------------------------------------------------------------------------------------------<
- ((S\NP)/NP) {\x y.(eat(x,y) & cook(x,y))}
- --------------------------------------------------------------------------------------------------------------------------------------->B
- ((S\NP)/N) {\x y.(eat(the(x),y) & cook(the(x),y))}
- ----------------------------------------------------------------------------------------------------------------------------------------------->B
- (S/N) {\x.(eat(the(x),I) & cook(the(x),I))}
- ---------------------------------------------------------------------------------------------------------------------------------------------------------->
- S {(eat(the(bacon),I) & cook(the(bacon),I))}
-
- >>> printCCGDerivation(parses[6])
- I cook and eat the bacon
- NP {I} ((S\NP)/NP) {\x y.cook(x,y)} ((_var0\.,_var0)/.,_var0) {\P Q x y.(P(x,y) & Q(x,y))} ((S\NP)/NP) {\x y.eat(x,y)} (NP/N) {\x.the(x)} N {bacon}
- -------->T
- (S/(S\NP)) {\F.F(I)}
- ------------------------------------------------------------------------------------->
- (((S\NP)/NP)\.,((S\NP)/NP)) {\Q x y.(eat(x,y) & Q(x,y))}
- -------------------------------------------------------------------------------------------------------------------<
- ((S\NP)/NP) {\x y.(eat(x,y) & cook(x,y))}
- --------------------------------------------------------------------------------------------------------------------------->B
- (S/NP) {\x.(eat(x,I) & cook(x,I))}
- ----------------------------------------------------------------------------------------------------------------------------------------------->B
- (S/N) {\x.(eat(the(x),I) & cook(the(x),I))}
- ---------------------------------------------------------------------------------------------------------------------------------------------------------->
- S {(eat(the(bacon),I) & cook(the(bacon),I))}
-
-Tests from published papers
-------------------------------
-
-An example from "CCGbank: A Corpus of CCG Derivations and Dependency Structures Extracted from the Penn Treebank", Hockenmaier and Steedman, 2007, Page 359, https://www.aclweb.org/anthology/J/J07/J07-3004.pdf
-
- >>> lex = lexicon.fromstring('''
- ... :- S, NP
- ... I => NP {I}
- ... give => ((S\\NP)/NP)/NP {\\x y z.give(y,x,z)}
- ... them => NP {them}
- ... money => NP {money}
- ... ''',
- ... True)
-
- >>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet)
- >>> parses = list(parser.parse("I give them money".split()))
- >>> print(str(len(parses)) + " parses")
- 3 parses
-
- >>> printCCGDerivation(parses[0])
- I give them money
- NP {I} (((S\NP)/NP)/NP) {\x y z.give(y,x,z)} NP {them} NP {money}
- -------------------------------------------------->
- ((S\NP)/NP) {\y z.give(y,them,z)}
- -------------------------------------------------------------->
- (S\NP) {\z.give(money,them,z)}
- ----------------------------------------------------------------------<
- S {give(money,them,I)}
-
- >>> printCCGDerivation(parses[1])
- I give them money
- NP {I} (((S\NP)/NP)/NP) {\x y z.give(y,x,z)} NP {them} NP {money}
- -------->T
- (S/(S\NP)) {\F.F(I)}
- -------------------------------------------------->
- ((S\NP)/NP) {\y z.give(y,them,z)}
- -------------------------------------------------------------->
- (S\NP) {\z.give(money,them,z)}
- ---------------------------------------------------------------------->
- S {give(money,them,I)}
-
-
- >>> printCCGDerivation(parses[2])
- I give them money
- NP {I} (((S\NP)/NP)/NP) {\x y z.give(y,x,z)} NP {them} NP {money}
- -------->T
- (S/(S\NP)) {\F.F(I)}
- -------------------------------------------------->
- ((S\NP)/NP) {\y z.give(y,them,z)}
- ---------------------------------------------------------->B
- (S/NP) {\y.give(y,them,I)}
- ---------------------------------------------------------------------->
- S {give(money,them,I)}
-
-
-An example from "CCGbank: A Corpus of CCG Derivations and Dependency Structures Extracted from the Penn Treebank", Hockenmaier and Steedman, 2007, Page 359, https://www.aclweb.org/anthology/J/J07/J07-3004.pdf
-
- >>> lex = lexicon.fromstring('''
- ... :- N, NP, S
- ... money => N {money}
- ... that => (N\\N)/(S/NP) {\\P Q x.(P(x) & Q(x))}
- ... I => NP {I}
- ... give => ((S\\NP)/NP)/NP {\\x y z.give(y,x,z)}
- ... them => NP {them}
- ... ''',
- ... True)
-
- >>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet)
- >>> parses = list(parser.parse("money that I give them".split()))
- >>> print(str(len(parses)) + " parses")
- 3 parses
-
- >>> printCCGDerivation(parses[0])
- money that I give them
- N {money} ((N\N)/(S/NP)) {\P Q x.(P(x) & Q(x))} NP {I} (((S\NP)/NP)/NP) {\x y z.give(y,x,z)} NP {them}
- -------->T
- (S/(S\NP)) {\F.F(I)}
- -------------------------------------------------->
- ((S\NP)/NP) {\y z.give(y,them,z)}
- ---------------------------------------------------------->B
- (S/NP) {\y.give(y,them,I)}
- ------------------------------------------------------------------------------------------------->
- (N\N) {\Q x.(give(x,them,I) & Q(x))}
- ------------------------------------------------------------------------------------------------------------<
- N {\x.(give(x,them,I) & money(x))}
-
- >>> printCCGDerivation(parses[1])
- money that I give them
- N {money} ((N\N)/(S/NP)) {\P Q x.(P(x) & Q(x))} NP {I} (((S\NP)/NP)/NP) {\x y z.give(y,x,z)} NP {them}
- ----------->T
- (N/(N\N)) {\F.F(money)}
- -------->T
- (S/(S\NP)) {\F.F(I)}
- -------------------------------------------------->
- ((S\NP)/NP) {\y z.give(y,them,z)}
- ---------------------------------------------------------->B
- (S/NP) {\y.give(y,them,I)}
- ------------------------------------------------------------------------------------------------->
- (N\N) {\Q x.(give(x,them,I) & Q(x))}
- ------------------------------------------------------------------------------------------------------------>
- N {\x.(give(x,them,I) & money(x))}
-
- >>> printCCGDerivation(parses[2])
- money that I give them
- N {money} ((N\N)/(S/NP)) {\P Q x.(P(x) & Q(x))} NP {I} (((S\NP)/NP)/NP) {\x y z.give(y,x,z)} NP {them}
- ----------->T
- (N/(N\N)) {\F.F(money)}
- -------------------------------------------------->B
- (N/(S/NP)) {\P x.(P(x) & money(x))}
- -------->T
- (S/(S\NP)) {\F.F(I)}
- -------------------------------------------------->
- ((S\NP)/NP) {\y z.give(y,them,z)}
- ---------------------------------------------------------->B
- (S/NP) {\y.give(y,them,I)}
- ------------------------------------------------------------------------------------------------------------>
- N {\x.(give(x,them,I) & money(x))}
-
-
--------
-Lexicon
--------
-
- >>> from nltk.ccg import lexicon
-
-Parse lexicon with semantics
-
- >>> print(str(lexicon.fromstring(
- ... '''
- ... :- S,NP
- ...
- ... IntransVsg :: S\\NP[sg]
- ...
- ... sleeps => IntransVsg {\\x.sleep(x)}
- ... eats => S\\NP[sg]/NP {\\x y.eat(x,y)}
- ...
- ... and => var\\var/var {\\x y.x & y}
- ... ''',
- ... True
- ... )))
- and => ((_var0\_var0)/_var0) {(\x y.x & y)}
- eats => ((S\NP['sg'])/NP) {\x y.eat(x,y)}
- sleeps => (S\NP['sg']) {\x.sleep(x)}
-
-Parse lexicon without semantics
-
- >>> print(str(lexicon.fromstring(
- ... '''
- ... :- S,NP
- ...
- ... IntransVsg :: S\\NP[sg]
- ...
- ... sleeps => IntransVsg
- ... eats => S\\NP[sg]/NP {sem=\\x y.eat(x,y)}
- ...
- ... and => var\\var/var
- ... ''',
- ... False
- ... )))
- and => ((_var0\_var0)/_var0)
- eats => ((S\NP['sg'])/NP)
- sleeps => (S\NP['sg'])
-
-Semantics are missing
-
- >>> print(str(lexicon.fromstring(
- ... '''
- ... :- S,NP
- ...
- ... eats => S\\NP[sg]/NP
- ... ''',
- ... True
- ... )))
- Traceback (most recent call last):
- ...
- AssertionError: eats => S\NP[sg]/NP must contain semantics because include_semantics is set to True
-
-
-------------------------------------
-CCG combinator semantics computation
-------------------------------------
-
- >>> from nltk.sem.logic import *
- >>> from nltk.ccg.logic import *
-
- >>> read_expr = Expression.fromstring
-
-Compute semantics from function application
-
- >>> print(str(compute_function_semantics(read_expr(r'\x.P(x)'), read_expr(r'book'))))
- P(book)
-
- >>> print(str(compute_function_semantics(read_expr(r'\P.P(book)'), read_expr(r'read'))))
- read(book)
-
- >>> print(str(compute_function_semantics(read_expr(r'\P.P(book)'), read_expr(r'\x.read(x)'))))
- read(book)
-
-Compute semantics from composition
-
- >>> print(str(compute_composition_semantics(read_expr(r'\x.P(x)'), read_expr(r'\x.Q(x)'))))
- \x.P(Q(x))
-
- >>> print(str(compute_composition_semantics(read_expr(r'\x.P(x)'), read_expr(r'read'))))
- Traceback (most recent call last):
- ...
- AssertionError: `read` must be a lambda expression
-
-Compute semantics from substitution
-
- >>> print(str(compute_substitution_semantics(read_expr(r'\x y.P(x,y)'), read_expr(r'\x.Q(x)'))))
- \x.P(x,Q(x))
-
- >>> print(str(compute_substitution_semantics(read_expr(r'\x.P(x)'), read_expr(r'read'))))
- Traceback (most recent call last):
- ...
- AssertionError: `\x.P(x)` must be a lambda expression with 2 arguments
-
-Compute type-raise semantics
-
- >>> print(str(compute_type_raised_semantics(read_expr(r'\x.P(x)'))))
- \F x.F(P(x))
-
- >>> print(str(compute_type_raised_semantics(read_expr(r'\x.F(x)'))))
- \F1 x.F1(F(x))
-
- >>> print(str(compute_type_raised_semantics(read_expr(r'\x y z.P(x,y,z)'))))
- \F x y z.F(P(x,y,z))
-
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-=======
-Chat-80
-=======
-
-Chat-80 was a natural language system which allowed the user to
-interrogate a Prolog knowledge base in the domain of world
-geography. It was developed in the early '80s by Warren and Pereira; see
-`<http://acl.ldc.upenn.edu/J/J82/J82-3002.pdf>`_ for a description and
-`<http://www.cis.upenn.edu/~pereira/oldies.html>`_ for the source
-files.
-
-The ``chat80`` module contains functions to extract data from the Chat-80
-relation files ('the world database'), and convert them into a format
-that can be incorporated in the FOL models of
-``nltk.sem.evaluate``. The code assumes that the Prolog
-input files are available in the NLTK corpora directory.
-
-The Chat-80 World Database consists of the following files::
-
- world0.pl
- rivers.pl
- cities.pl
- countries.pl
- contain.pl
- borders.pl
-
-This module uses a slightly modified version of ``world0.pl``, in which
-a set of Prolog rules has been omitted. The modified file is named
-``world1.pl``. Currently, the file ``rivers.pl`` is not read in, since
-it uses a list rather than a string in the second field.
-
-Reading Chat-80 Files
-=====================
-
-Chat-80 relations are like tables in a relational database. The
-relation acts as the name of the table; the first argument acts as the
-'primary key'; and subsequent arguments are further fields in the
-table. In general, the name of the table provides a label for a unary
-predicate whose extension is all the primary keys. For example,
-relations in ``cities.pl`` are of the following form::
-
- 'city(athens,greece,1368).'
-
-Here, ``'athens'`` is the key, and will be mapped to a member of the
-unary predicate *city*.
-
-By analogy with NLTK corpora, ``chat80`` defines a number of 'items'
-which correspond to the relations.
-
- >>> from nltk.sem import chat80
- >>> print(chat80.items) # doctest: +ELLIPSIS
- ('borders', 'circle_of_lat', 'circle_of_long', 'city', ...)
-
-The fields in the table are mapped to binary predicates. The first
-argument of the predicate is the primary key, while the second
-argument is the data in the relevant field. Thus, in the above
-example, the third field is mapped to the binary predicate
-*population_of*, whose extension is a set of pairs such as
-``'(athens, 1368)'``.
-
-An exception to this general framework is required by the relations in
-the files ``borders.pl`` and ``contains.pl``. These contain facts of the
-following form::
-
- 'borders(albania,greece).'
-
- 'contains0(africa,central_africa).'
-
-We do not want to form a unary concept out of the element in
-the first field of these records, and we want the label of the binary
-relation just to be ``'border'``/``'contain'`` respectively.
-
-In order to drive the extraction process, we use 'relation metadata bundles'
-which are Python dictionaries such as the following::
-
- city = {'label': 'city',
- 'closures': [],
- 'schema': ['city', 'country', 'population'],
- 'filename': 'cities.pl'}
-
-According to this, the file ``city['filename']`` contains a list of
-relational tuples (or more accurately, the corresponding strings in
-Prolog form) whose predicate symbol is ``city['label']`` and whose
-relational schema is ``city['schema']``. The notion of a ``closure`` is
-discussed in the next section.
-
-Concepts
-========
-In order to encapsulate the results of the extraction, a class of
-``Concept``\ s is introduced. A ``Concept`` object has a number of
-attributes, in particular a ``prefLabel``, an ``arity`` and an ``extension``.
-
- >>> c1 = chat80.Concept('dog', arity=1, extension=set(['d1', 'd2']))
- >>> print(c1)
- Label = 'dog'
- Arity = 1
- Extension = ['d1', 'd2']
-
-
-
-The ``extension`` attribute makes it easier to inspect the output of
-the extraction.
-
- >>> schema = ['city', 'country', 'population']
- >>> concepts = chat80.clause2concepts('cities.pl', 'city', schema)
- >>> concepts
- [Concept('city'), Concept('country_of'), Concept('population_of')]
- >>> for c in concepts: # doctest: +NORMALIZE_WHITESPACE
- ... print("%s:\n\t%s" % (c.prefLabel, c.extension[:4]))
- city:
- ['athens', 'bangkok', 'barcelona', 'berlin']
- country_of:
- [('athens', 'greece'), ('bangkok', 'thailand'), ('barcelona', 'spain'), ('berlin', 'east_germany')]
- population_of:
- [('athens', '1368'), ('bangkok', '1178'), ('barcelona', '1280'), ('berlin', '3481')]
-
-In addition, the ``extension`` can be further
-processed: in the case of the ``'border'`` relation, we check that the
-relation is **symmetric**, and in the case of the ``'contain'``
-relation, we carry out the **transitive closure**. The closure
-properties associated with a concept are indicated in the relation
-metadata, as noted earlier.
-
- >>> borders = set([('a1', 'a2'), ('a2', 'a3')])
- >>> c2 = chat80.Concept('borders', arity=2, extension=borders)
- >>> print(c2)
- Label = 'borders'
- Arity = 2
- Extension = [('a1', 'a2'), ('a2', 'a3')]
- >>> c3 = chat80.Concept('borders', arity=2, closures=['symmetric'], extension=borders)
- >>> c3.close()
- >>> print(c3)
- Label = 'borders'
- Arity = 2
- Extension = [('a1', 'a2'), ('a2', 'a1'), ('a2', 'a3'), ('a3', 'a2')]
-
-The ``extension`` of a ``Concept`` object is then incorporated into a
-``Valuation`` object.
-
-Persistence
-===========
-The functions ``val_dump`` and ``val_load`` are provided to allow a
-valuation to be stored in a persistent database and re-loaded, rather
-than having to be re-computed each time.
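-
-A minimal usage sketch (``rels`` stands for the relation metadata bundles
-described earlier; the database filename is hypothetical)::
-
-    rels = ...                           # relation metadata bundles
-    chat80.val_dump(rels, 'chat80.db')   # compute and store the valuation
-    val = chat80.val_load('chat80.db')   # re-load it in a later session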
-
-Individuals and Lexical Items
-=============================
-As well as deriving relations from the Chat-80 data, we also create a
-set of individual constants, one for each entity in the domain. The
-individual constants are string-identical to the entities. For
-example, given a data item such as ``'zloty'``, we add to the valuation
-a pair ``('zloty', 'zloty')``. In order to parse English sentences that
-refer to these entities, we also create a lexical item such as the
-following for each individual constant::
-
- PropN[num=sg, sem=<\P.(P zloty)>] -> 'Zloty'
-
-The set of rules is written to the file ``chat_pnames.fcfg`` in the
-current directory.
-
-SQL Query
-=========
-
-The ``city`` relation is also available in RDB form and can be queried
-using SQL statements.
-
- >>> import nltk
- >>> q = "SELECT City, Population FROM city_table WHERE Country = 'china' and Population > 1000"
- >>> for answer in chat80.sql_query('corpora/city_database/city.db', q):
- ... print("%-10s %4s" % answer)
- canton 1496
- chungking 1100
- mukden 1551
- peking 2031
- shanghai 5407
- tientsin 1795
-
-The (deliberately naive) grammar ``sql0.fcfg`` translates from English
-to SQL:
-
- >>> nltk.data.show_cfg('grammars/book_grammars/sql0.fcfg')
- % start S
- S[SEM=(?np + WHERE + ?vp)] -> NP[SEM=?np] VP[SEM=?vp]
- VP[SEM=(?v + ?pp)] -> IV[SEM=?v] PP[SEM=?pp]
- VP[SEM=(?v + ?ap)] -> IV[SEM=?v] AP[SEM=?ap]
- NP[SEM=(?det + ?n)] -> Det[SEM=?det] N[SEM=?n]
- PP[SEM=(?p + ?np)] -> P[SEM=?p] NP[SEM=?np]
- AP[SEM=?pp] -> A[SEM=?a] PP[SEM=?pp]
- NP[SEM='Country="greece"'] -> 'Greece'
- NP[SEM='Country="china"'] -> 'China'
- Det[SEM='SELECT'] -> 'Which' | 'What'
- N[SEM='City FROM city_table'] -> 'cities'
- IV[SEM=''] -> 'are'
- A[SEM=''] -> 'located'
- P[SEM=''] -> 'in'
-
-Given this grammar, we can express, and then execute, queries in English.
-
- >>> cp = nltk.parse.load_parser('grammars/book_grammars/sql0.fcfg')
- >>> query = 'What cities are in China'
- >>> for tree in cp.parse(query.split()):
- ... answer = tree.label()['SEM']
- ... q = " ".join(answer)
- ... print(q)
- ...
- SELECT City FROM city_table WHERE Country="china"
-
- >>> rows = chat80.sql_query('corpora/city_database/city.db', q)
- >>> for r in rows: print("%s" % r, end=' ')
- canton chungking dairen harbin kowloon mukden peking shanghai sian tientsin
-
-
-Using Valuations
------------------
-
-In order to convert such an extension into a valuation, we use the
-``make_valuation()`` method; setting ``read=True`` creates and returns
-a new ``Valuation`` object which contains the results.
-
- >>> val = chat80.make_valuation(concepts, read=True)
- >>> 'calcutta' in val['city']
- True
- >>> [town for (town, country) in val['country_of'] if country == 'india']
- ['bombay', 'calcutta', 'delhi', 'hyderabad', 'madras']
- >>> dom = val.domain
- >>> g = nltk.sem.Assignment(dom)
- >>> m = nltk.sem.Model(dom, val)
- >>> m.evaluate(r'population_of(jakarta, 533)', g)
- True
-
-
+++ /dev/null
-=======================
- CHILDES Corpus Readers
-=======================
-
-Read the XML version of the CHILDES corpus.
-
-How to use CHILDESCorpusReader
-==============================
-
-Import the CHILDESCorpusReader class and read the CHILDES corpus saved in
-the nltk_data directory.
-
- >>> import nltk
- >>> from nltk.corpus.reader import CHILDESCorpusReader
- >>> corpus_root = nltk.data.find('corpora/childes/data-xml/Eng-USA-MOR/')
-
-Reading files in the Valian corpus (Valian, 1991).
-
- >>> valian = CHILDESCorpusReader(corpus_root, 'Valian/.*.xml')
- >>> valian.fileids()
- ['Valian/01a.xml', 'Valian/01b.xml', 'Valian/02a.xml', 'Valian/02b.xml',...
-
-Count the number of files
-
- >>> len(valian.fileids())
- 43
-
-Printing properties of the corpus files.
-
- >>> corpus_data = valian.corpus(valian.fileids())
- >>> print(corpus_data[0]['Lang'])
- eng
- >>> for key in sorted(corpus_data[0].keys()):
- ... print(key, ": ", corpus_data[0][key])
- Corpus : valian
- Date : 1986-03-04
- Id : 01a
- Lang : eng
- Version : 2.0.1
- {http://www.w3.org/2001/XMLSchema-instance}schemaLocation : http://www.talkbank.org/ns/talkbank http://talkbank.org/software/talkbank.xsd
-
-Printing information about the participants in the corpus. The most common codes for
-the participants are 'CHI' (target child), 'MOT' (mother), and 'INV' (investigator).
-
- >>> corpus_participants = valian.participants(valian.fileids())
- >>> for this_corpus_participants in corpus_participants[:2]:
- ... for key in sorted(this_corpus_participants.keys()):
- ... dct = this_corpus_participants[key]
- ... print(key, ": ", [(k, dct[k]) for k in sorted(dct.keys())])
- CHI : [('age', 'P2Y1M3D'), ('group', 'normal'), ('id', 'CHI'), ('language', 'eng'), ('role', 'Target_Child'), ('sex', 'female')]
- INV : [('id', 'INV'), ('language', 'eng'), ('role', 'Investigator')]
- MOT : [('id', 'MOT'), ('language', 'eng'), ('role', 'Mother')]
- CHI : [('age', 'P2Y1M12D'), ('group', 'normal'), ('id', 'CHI'), ('language', 'eng'), ('role', 'Target_Child'), ('sex', 'female')]
- INV : [('id', 'INV'), ('language', 'eng'), ('role', 'Investigator')]
- MOT : [('id', 'MOT'), ('language', 'eng'), ('role', 'Mother')]
-
-Printing words.
-
- >>> valian.words('Valian/01a.xml')
- ['at', 'Parent', "Lastname's", 'house', 'with', 'Child', 'Lastname', ...
-
-Printing sentences.
-
- >>> valian.sents('Valian/01a.xml')
- [['at', 'Parent', "Lastname's", 'house', 'with', 'Child', 'Lastname',
- 'and', 'it', 'is', 'March', 'fourth', 'I', 'believe', 'and', 'when',
- 'was', "Parent's", 'birthday'], ["Child's"], ['oh', "I'm", 'sorry'],
- ["that's", 'okay'], ...
-
-You can specify the participants with the argument *speaker*.
-
- >>> valian.words('Valian/01a.xml',speaker=['INV'])
- ['at', 'Parent', "Lastname's", 'house', 'with', 'Child', 'Lastname', ...
- >>> valian.words('Valian/01a.xml',speaker=['MOT'])
- ["Child's", "that's", 'okay', 'February', 'first', 'nineteen', ...
- >>> valian.words('Valian/01a.xml',speaker=['CHI'])
- ['tape', 'it', 'up', 'and', 'two', 'tape', 'players', 'have',...
-
-
-tagged_words() and tagged_sents() return the usual (word,pos) tuple lists.
-POS tags in the CHILDES are automatically assigned by MOR and POST programs
-(MacWhinney, 2000).
-
- >>> valian.tagged_words('Valian/01a.xml')[:30]
- [('at', 'prep'), ('Parent', 'n:prop'), ("Lastname's", 'n:prop'), ('house', 'n'),
- ('with', 'prep'), ('Child', 'n:prop'), ('Lastname', 'n:prop'), ('and', 'coord'),
- ('it', 'pro'), ('is', 'v:cop'), ('March', 'n:prop'), ('fourth', 'adj'),
- ('I', 'pro:sub'), ('believe', 'v'), ('and', 'coord'), ('when', 'adv:wh'),
- ('was', 'v:cop'), ("Parent's", 'n:prop'), ('birthday', 'n'), ("Child's", 'n:prop'),
- ('oh', 'co'), ("I'm", 'pro:sub'), ('sorry', 'adj'), ("that's", 'pro:dem'),
- ('okay', 'adj'), ('February', 'n:prop'), ('first', 'adj'),
- ('nineteen', 'det:num'), ('eighty', 'det:num'), ('four', 'det:num')]
-
- >>> valian.tagged_sents('Valian/01a.xml')[:10]
- [[('at', 'prep'), ('Parent', 'n:prop'), ("Lastname's", 'n:prop'), ('house', 'n'),
- ('with', 'prep'), ('Child', 'n:prop'), ('Lastname', 'n:prop'), ('and', 'coord'),
- ('it', 'pro'), ('is', 'v:cop'), ('March', 'n:prop'), ('fourth', 'adj'),
- ('I', 'pro:sub'), ('believe', 'v'), ('and', 'coord'), ('when', 'adv:wh'),
- ('was', 'v:cop'), ("Parent's", 'n:prop'), ('birthday', 'n')],
- [("Child's", 'n:prop')], [('oh', 'co'), ("I'm", 'pro:sub'), ('sorry', 'adj')],
- [("that's", 'pro:dem'), ('okay', 'adj')],
- [('February', 'n:prop'), ('first', 'adj'), ('nineteen', 'det:num'),
- ('eighty', 'det:num'), ('four', 'det:num')],
- [('great', 'adj')],
- [('and', 'coord'), ("she's", 'pro:sub'), ('two', 'det:num'), ('years', 'n'), ('old', 'adj')],
- [('correct', 'adj')],
- [('okay', 'co')], [('she', 'pro:sub'), ('just', 'adv:int'), ('turned', 'part'), ('two', 'det:num'),
- ('a', 'det'), ('month', 'n'), ('ago', 'adv')]]
-
-When the argument *stem* is true, the word stems (e.g., 'is' -> 'be-3S') are
-used instead of the original words.
-
- >>> valian.words('Valian/01a.xml')[:30]
- ['at', 'Parent', "Lastname's", 'house', 'with', 'Child', 'Lastname', 'and', 'it', 'is', ...
- >>> valian.words('Valian/01a.xml',stem=True)[:30]
- ['at', 'Parent', 'Lastname', 's', 'house', 'with', 'Child', 'Lastname', 'and', 'it', 'be-3S', ...
-
-When the argument *replace* is true, the replaced words are used instead of
-the original words.
-
- >>> valian.words('Valian/01a.xml',speaker='CHI')[247]
- 'tikteat'
- >>> valian.words('Valian/01a.xml',speaker='CHI',replace=True)[247]
- 'trick'
-
-When the argument *relation* is true, the grammatical relations in the
-sentence are returned. See Sagae et al. (2010) for details of the relational
-structure adopted in the CHILDES.
-
- >>> valian.words('Valian/01a.xml',relation=True)[:10]
- [[('at', 'prep', '1|0|ROOT'), ('Parent', 'n', '2|5|VOC'), ('Lastname', 'n', '3|5|MOD'), ('s', 'poss', '4|5|MOD'), ('house', 'n', '5|1|POBJ'), ('with', 'prep', '6|1|JCT'), ('Child', 'n', '7|8|NAME'), ('Lastname', 'n', '8|6|POBJ'), ('and', 'coord', '9|8|COORD'), ('it', 'pro', '10|11|SUBJ'), ('be-3S', 'v', '11|9|COMP'), ('March', 'n', '12|11|PRED'), ('fourth', 'adj', '13|12|MOD'), ('I', 'pro', '15|16|SUBJ'), ('believe', 'v', '16|14|ROOT'), ('and', 'coord', '18|17|ROOT'), ('when', 'adv', '19|20|PRED'), ('be-PAST', 'v', '20|18|COMP'), ('Parent', 'n', '21|23|MOD'), ('s', 'poss', '22|23|MOD'), ('birth', 'n', '23|20|SUBJ')], [('Child', 'n', '1|2|MOD'), ('s', 'poss', '2|0|ROOT')], [('oh', 'co', '1|4|COM'), ('I', 'pro', '3|4|SUBJ'), ('be', 'v', '4|0|ROOT'), ('sorry', 'adj', '5|4|PRED')], [('that', 'pro', '1|2|SUBJ'), ('be', 'v', '2|0|ROOT'), ('okay', 'adj', '3|2|PRED')], [('February', 'n', '1|6|VOC'), ('first', 'adj', '2|6|ENUM'), ('nineteen', 'det', '4|6|ENUM'), ('eighty', 'det', '5|6|ENUM'), ('four', 'det', '6|0|ROOT')], [('great', 'adj', '1|0|ROOT')], [('and', 'coord', '1|0|ROOT'), ('she', 'pro', '2|1|ROOT'), ('be', 'aux', '3|5|AUX'), ('two', 'det', '4|5|QUANT'), ('year-PL', 'n', '5|2|ROOT'), ('old', 'adj', '6|5|MOD')], [('correct', 'adj', '1|0|ROOT')], [('okay', 'co', '1|0|ROOT')], [('she', 'pro', '1|0|ROOT'), ('just', 'adv', '2|3|JCT'), ('turn-PERF', 'part', '3|1|XCOMP'), ('two', 'det', '4|6|QUANT'), ('a', 'det', '5|6|DET'), ('month', 'n', '6|3|OBJ'), ('ago', 'adv', '7|3|JCT')]]
-
-Printing age. When the argument *month* is true, the age information in
-the CHILDES format is converted into the number of months.
-
- >>> valian.age()
- ['P2Y1M3D', 'P2Y1M12D', 'P1Y9M21D', 'P1Y9M28D', 'P2Y1M23D', ...
- >>> valian.age('Valian/01a.xml')
- ['P2Y1M3D']
- >>> valian.age('Valian/01a.xml',month=True)
- [25]
-
-Printing MLU. The criteria for the MLU computation are broadly based on
-Brown (1973).
-
- >>> valian.MLU()
- [2.3574660633484..., 2.292682926829..., 3.492857142857..., 2.961783439490...,
- 2.0842696629213..., 3.169811320754..., 3.137404580152..., 3.0578034682080...,
- 4.090163934426..., 3.488372093023..., 2.8773584905660..., 3.4792899408284...,
- 4.0111940298507..., 3.456790123456..., 4.487603305785..., 4.007936507936...,
- 5.25, 5.154696132596..., ...]
-
- >>> valian.MLU('Valian/01a.xml')
- [2.35746606334...]
-
-
-Basic stuff
-==============================
-
-Count the number of words and sentences of each file.
-
- >>> valian = CHILDESCorpusReader(corpus_root, 'Valian/.*.xml')
- >>> for this_file in valian.fileids()[:6]:
- ... print(valian.corpus(this_file)[0]['Corpus'], valian.corpus(this_file)[0]['Id'])
- ... print("num of words: %i" % len(valian.words(this_file)))
- ... print("num of sents: %i" % len(valian.sents(this_file)))
- valian 01a
- num of words: 3606
- num of sents: 1027
- valian 01b
- num of words: 4376
- num of sents: 1274
- valian 02a
- num of words: 2673
- num of sents: 801
- valian 02b
- num of words: 5020
- num of sents: 1583
- valian 03a
- num of words: 2743
- num of sents: 988
- valian 03b
- num of words: 4409
- num of sents: 1397
+++ /dev/null
-# -*- coding: utf-8 -*-
-
-
-def setup_module(module):
- from nose import SkipTest
- import nltk.data
-
- try:
- nltk.data.find("corpora/childes/data-xml/Eng-USA-MOR/")
- except LookupError as e:
- print(e)
- raise SkipTest(
- "The CHILDES corpus is not found. "
- "It should be manually downloaded and saved/unpacked "
- "to [NLTK_Data_Dir]/corpora/childes/"
- )
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-==========
- Chunking
-==========
-
- >>> from nltk.chunk import *
- >>> from nltk.chunk.util import *
- >>> from nltk.chunk.regexp import *
- >>> from nltk import Tree
-
- >>> tagged_text = "[ The/DT cat/NN ] sat/VBD on/IN [ the/DT mat/NN ] [ the/DT dog/NN ] chewed/VBD ./."
- >>> gold_chunked_text = tagstr2tree(tagged_text)
- >>> unchunked_text = gold_chunked_text.flatten()
-
-Chunking uses a special regexp syntax for rules that delimit the chunks. These
-rules must be converted to 'regular' regular expressions before a sentence can
-be chunked.
-
- >>> tag_pattern = "<DT>?<JJ>*<NN.*>"
- >>> regexp_pattern = tag_pattern2re_pattern(tag_pattern)
- >>> regexp_pattern
- '(<(DT)>)?(<(JJ)>)*(<(NN[^\\{\\}<>]*)>)'
-
-Construct some new chunking rules.
-
- >>> chunk_rule = ChunkRule("<.*>+", "Chunk everything")
- >>> chink_rule = ChinkRule("<VBD|IN|\.>", "Chink on verbs/prepositions")
- >>> split_rule = SplitRule("<DT><NN>", "<DT><NN>",
- ... "Split successive determiner/noun pairs")
-
-
-Create and score a series of chunk parsers, successively more complex.
-
- >>> chunk_parser = RegexpChunkParser([chunk_rule], chunk_label='NP')
- >>> chunked_text = chunk_parser.parse(unchunked_text)
- >>> print(chunked_text)
- (S
- (NP
- The/DT
- cat/NN
- sat/VBD
- on/IN
- the/DT
- mat/NN
- the/DT
- dog/NN
- chewed/VBD
- ./.))
-
- >>> chunkscore = ChunkScore()
- >>> chunkscore.score(gold_chunked_text, chunked_text)
- >>> print(chunkscore.precision())
- 0.0
-
- >>> print(chunkscore.recall())
- 0.0
-
- >>> print(chunkscore.f_measure())
- 0
-
- >>> for chunk in sorted(chunkscore.missed()): print(chunk)
- (NP The/DT cat/NN)
- (NP the/DT dog/NN)
- (NP the/DT mat/NN)
-
- >>> for chunk in chunkscore.incorrect(): print(chunk)
- (NP
- The/DT
- cat/NN
- sat/VBD
- on/IN
- the/DT
- mat/NN
- the/DT
- dog/NN
- chewed/VBD
- ./.)
-
- >>> chunk_parser = RegexpChunkParser([chunk_rule, chink_rule],
- ... chunk_label='NP')
- >>> chunked_text = chunk_parser.parse(unchunked_text)
- >>> print(chunked_text)
- (S
- (NP The/DT cat/NN)
- sat/VBD
- on/IN
- (NP the/DT mat/NN the/DT dog/NN)
- chewed/VBD
- ./.)
- >>> assert chunked_text == chunk_parser.parse(list(unchunked_text))
-
- >>> chunkscore = ChunkScore()
- >>> chunkscore.score(gold_chunked_text, chunked_text)
- >>> chunkscore.precision()
- 0.5
-
- >>> print(chunkscore.recall())
- 0.33333333...
-
- >>> print(chunkscore.f_measure())
- 0.4
-
- >>> for chunk in sorted(chunkscore.missed()): print(chunk)
- (NP the/DT dog/NN)
- (NP the/DT mat/NN)
-
- >>> for chunk in chunkscore.incorrect(): print(chunk)
- (NP the/DT mat/NN the/DT dog/NN)
-
- >>> chunk_parser = RegexpChunkParser([chunk_rule, chink_rule, split_rule],
- ... chunk_label='NP')
- >>> chunked_text = chunk_parser.parse(unchunked_text, trace=True)
- # Input:
- <DT> <NN> <VBD> <IN> <DT> <NN> <DT> <NN> <VBD> <.>
- # Chunk everything:
- {<DT> <NN> <VBD> <IN> <DT> <NN> <DT> <NN> <VBD> <.>}
- # Chink on verbs/prepositions:
- {<DT> <NN>} <VBD> <IN> {<DT> <NN> <DT> <NN>} <VBD> <.>
- # Split successive determiner/noun pairs:
- {<DT> <NN>} <VBD> <IN> {<DT> <NN>}{<DT> <NN>} <VBD> <.>
- >>> print(chunked_text)
- (S
- (NP The/DT cat/NN)
- sat/VBD
- on/IN
- (NP the/DT mat/NN)
- (NP the/DT dog/NN)
- chewed/VBD
- ./.)
-
- >>> chunkscore = ChunkScore()
- >>> chunkscore.score(gold_chunked_text, chunked_text)
- >>> chunkscore.precision()
- 1.0
-
- >>> chunkscore.recall()
- 1.0
-
- >>> chunkscore.f_measure()
- 1.0
-
- >>> chunkscore.missed()
- []
-
- >>> chunkscore.incorrect()
- []
-
- >>> chunk_parser.rules() # doctest: +NORMALIZE_WHITESPACE
- [<ChunkRule: '<.*>+'>, <ChinkRule: '<VBD|IN|\\.>'>,
- <SplitRule: '<DT><NN>', '<DT><NN>'>]
-
-Printing parsers:
-
- >>> print(repr(chunk_parser))
- <RegexpChunkParser with 3 rules>
- >>> print(chunk_parser)
- RegexpChunkParser with 3 rules:
- Chunk everything
- <ChunkRule: '<.*>+'>
- Chink on verbs/prepositions
- <ChinkRule: '<VBD|IN|\\.>'>
- Split successive determiner/noun pairs
- <SplitRule: '<DT><NN>', '<DT><NN>'>
-
-Regression Tests
-~~~~~~~~~~~~~~~~
-ChunkParserI
-------------
-`ChunkParserI` is an abstract interface -- it is not meant to be
-instantiated directly.
-
- >>> ChunkParserI().parse([])
- Traceback (most recent call last):
- . . .
- NotImplementedError
-
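-A minimal concrete subclass only needs to provide a ``parse()`` method (a
-brief illustrative sketch, not part of the original test suite):
-
- >>> class NullChunker(ChunkParserI):
- ...     def parse(self, tokens):
- ...         return Tree('S', tokens)
- >>> NullChunker().parse([('the', 'DT'), ('cat', 'NN')])
- Tree('S', [('the', 'DT'), ('cat', 'NN')])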
-
-ChunkString
------------
-ChunkString can be built from a tree of tagged tuples, a tree of
-trees, or a mixed list of both:
-
- >>> t1 = Tree('S', [('w%d' % i, 't%d' % i) for i in range(10)])
- >>> t2 = Tree('S', [Tree('t0', []), Tree('t1', ['c1'])])
- >>> t3 = Tree('S', [('w0', 't0'), Tree('t1', ['c1'])])
- >>> ChunkString(t1)
- <ChunkString: '<t0><t1><t2><t3><t4><t5><t6><t7><t8><t9>'>
- >>> ChunkString(t2)
- <ChunkString: '<t0><t1>'>
- >>> ChunkString(t3)
- <ChunkString: '<t0><t1>'>
-
-Other values generate an error:
-
- >>> ChunkString(Tree('S', ['x']))
- Traceback (most recent call last):
- . . .
- ValueError: chunk structures must contain tagged tokens or trees
-
-The `str()` for a chunk string adds spaces to it, which makes it line
-up with `str()` output for other chunk strings over the same
-underlying input.
-
- >>> cs = ChunkString(t1)
- >>> print(cs)
- <t0> <t1> <t2> <t3> <t4> <t5> <t6> <t7> <t8> <t9>
- >>> cs.xform('<t3>', '{<t3>}')
- >>> print(cs)
- <t0> <t1> <t2> {<t3>} <t4> <t5> <t6> <t7> <t8> <t9>
-
-The `_verify()` method makes sure that our transforms don't corrupt
-the chunk string. By setting debug_level=2 or higher, `_verify()` will
-be called at the end of every call to `xform` (at level 3, it also
-checks that no tags were modified).
-
- >>> cs = ChunkString(t1, debug_level=3)
-
- >>> # tag not marked with <...>:
- >>> cs.xform('<t3>', 't3')
- Traceback (most recent call last):
- . . .
- ValueError: Transformation generated invalid chunkstring:
- <t0><t1><t2>t3<t4><t5><t6><t7><t8><t9>
-
- >>> # brackets not balanced:
- >>> cs.xform('<t3>', '{<t3>')
- Traceback (most recent call last):
- . . .
- ValueError: Transformation generated invalid chunkstring:
- <t0><t1><t2>{<t3><t4><t5><t6><t7><t8><t9>
-
- >>> # nested brackets:
- >>> cs.xform('<t3><t4><t5>', '{<t3>{<t4>}<t5>}')
- Traceback (most recent call last):
- . . .
- ValueError: Transformation generated invalid chunkstring:
- <t0><t1><t2>{<t3>{<t4>}<t5>}<t6><t7><t8><t9>
-
- >>> # modified tags:
- >>> cs.xform('<t3>', '<t9>')
- Traceback (most recent call last):
- . . .
- ValueError: Transformation generated invalid chunkstring: tag changed
-
- >>> # added tags:
- >>> cs.xform('<t9>', '<t9><t10>')
- Traceback (most recent call last):
- . . .
- ValueError: Transformation generated invalid chunkstring: tag changed
-
-Chunking Rules
---------------
-
-Test the different rule constructors & __repr__ methods:
-
- >>> r1 = RegexpChunkRule('<a|b>'+ChunkString.IN_CHINK_PATTERN,
- ... '{<a|b>}', 'chunk <a> and <b>')
- >>> r2 = RegexpChunkRule(re.compile('<a|b>'+ChunkString.IN_CHINK_PATTERN),
- ... '{<a|b>}', 'chunk <a> and <b>')
- >>> r3 = ChunkRule('<a|b>', 'chunk <a> and <b>')
- >>> r4 = ChinkRule('<a|b>', 'chink <a> and <b>')
- >>> r5 = UnChunkRule('<a|b>', 'unchunk <a> and <b>')
- >>> r6 = MergeRule('<a>', '<b>', 'merge <a> w/ <b>')
- >>> r7 = SplitRule('<a>', '<b>', 'split <a> from <b>')
- >>> r8 = ExpandLeftRule('<a>', '<b>', 'expand left <a> <b>')
- >>> r9 = ExpandRightRule('<a>', '<b>', 'expand right <a> <b>')
- >>> for rule in r1, r2, r3, r4, r5, r6, r7, r8, r9:
- ... print(rule)
- <RegexpChunkRule: '<a|b>(?=[^\\}]*(\\{|$))'->'{<a|b>}'>
- <RegexpChunkRule: '<a|b>(?=[^\\}]*(\\{|$))'->'{<a|b>}'>
- <ChunkRule: '<a|b>'>
- <ChinkRule: '<a|b>'>
- <UnChunkRule: '<a|b>'>
- <MergeRule: '<a>', '<b>'>
- <SplitRule: '<a>', '<b>'>
- <ExpandLeftRule: '<a>', '<b>'>
- <ExpandRightRule: '<a>', '<b>'>
-
-`tag_pattern2re_pattern()` complains if the tag pattern looks problematic:
-
- >>> tag_pattern2re_pattern('{}')
- Traceback (most recent call last):
- . . .
- ValueError: Bad tag pattern: '{}'
-
-RegexpChunkParser
------------------
-
-A warning is printed when parsing an empty sentence:
-
- >>> parser = RegexpChunkParser([ChunkRule('<a>', '')])
- >>> parser.parse(Tree('S', []))
- Warning: parsing empty text
- Tree('S', [])
-
-RegexpParser
-------------
-
- >>> parser = RegexpParser('''
- ... NP: {<DT>? <JJ>* <NN>*} # NP
- ... P: {<IN>} # Preposition
- ... V: {<V.*>} # Verb
- ... PP: {<P> <NP>} # PP -> P NP
- ... VP: {<V> <NP|PP>*} # VP -> V (NP|PP)*
- ... ''')
- >>> print(repr(parser))
- <chunk.RegexpParser with 5 stages>
- >>> print(parser)
- chunk.RegexpParser with 5 stages:
- RegexpChunkParser with 1 rules:
- NP <ChunkRule: '<DT>? <JJ>* <NN>*'>
- RegexpChunkParser with 1 rules:
- Preposition <ChunkRule: '<IN>'>
- RegexpChunkParser with 1 rules:
- Verb <ChunkRule: '<V.*>'>
- RegexpChunkParser with 1 rules:
- PP -> P NP <ChunkRule: '<P> <NP>'>
- RegexpChunkParser with 1 rules:
- VP -> V (NP|PP)* <ChunkRule: '<V> <NP|PP>*'>
- >>> print(parser.parse(unchunked_text, trace=True))
- # Input:
- <DT> <NN> <VBD> <IN> <DT> <NN> <DT> <NN> <VBD> <.>
- # NP:
- {<DT> <NN>} <VBD> <IN> {<DT> <NN>}{<DT> <NN>} <VBD> <.>
- # Input:
- <NP> <VBD> <IN> <NP> <NP> <VBD> <.>
- # Preposition:
- <NP> <VBD> {<IN>} <NP> <NP> <VBD> <.>
- # Input:
- <NP> <VBD> <P> <NP> <NP> <VBD> <.>
- # Verb:
- <NP> {<VBD>} <P> <NP> <NP> {<VBD>} <.>
- # Input:
- <NP> <V> <P> <NP> <NP> <V> <.>
- # PP -> P NP:
- <NP> <V> {<P> <NP>} <NP> <V> <.>
- # Input:
- <NP> <V> <PP> <NP> <V> <.>
- # VP -> V (NP|PP)*:
- <NP> {<V> <PP> <NP>}{<V>} <.>
- (S
- (NP The/DT cat/NN)
- (VP
- (V sat/VBD)
- (PP (P on/IN) (NP the/DT mat/NN))
- (NP the/DT dog/NN))
- (VP (V chewed/VBD))
- ./.)
-
-Test parsing of other rule types:
-
- >>> print(RegexpParser('''
- ... X:
- ... }<a><b>{ # chink rule
- ... <a>}{<b> # split rule
- ... <a>{}<b> # merge rule
- ... <a>{<b>}<c> # chunk rule w/ context
- ... '''))
- chunk.RegexpParser with 1 stages:
- RegexpChunkParser with 4 rules:
- chink rule <ChinkRule: '<a><b>'>
- split rule <SplitRule: '<a>', '<b>'>
- merge rule <MergeRule: '<a>', '<b>'>
- chunk rule w/ context <ChunkRuleWithContext: '<a>', '<b>', '<c>'>
-
-Illegal patterns give an error message:
-
- >>> print(RegexpParser('X: {<foo>} {<bar>}'))
- Traceback (most recent call last):
- . . .
- ValueError: Illegal chunk pattern: {<foo>} {<bar>}
-
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-=============
- Classifiers
-=============
-
-Classifiers label tokens with category labels (or *class labels*).
-Typically, labels are represented with strings (such as ``"health"``
-or ``"sports"``). In NLTK, classifiers are defined using classes that
-implement the `ClassifierI` interface:
-
- >>> import nltk
- >>> nltk.usage(nltk.classify.ClassifierI)
- ClassifierI supports the following operations:
- - self.classify(featureset)
- - self.classify_many(featuresets)
- - self.labels()
- - self.prob_classify(featureset)
- - self.prob_classify_many(featuresets)
-
-NLTK defines several classifier classes:
-
-- `ConditionalExponentialClassifier`
-- `DecisionTreeClassifier`
-- `MaxentClassifier`
-- `NaiveBayesClassifier`
-- `WekaClassifier`
-
-Classifiers are typically created by training them on a training
-corpus.
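-
-For example, a minimal classifier can be trained from a list of
-``(featureset, label)`` pairs (a brief illustrative sketch, not part of the
-original test suite):
-
- >>> toy_train = [(dict(outlook='sunny'), 'play'),
- ...              (dict(outlook='rainy'), 'stay')]
- >>> toy_classifier = nltk.classify.NaiveBayesClassifier.train(toy_train)
- >>> toy_classifier.classify(dict(outlook='sunny'))
- 'play'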
-
-
-Regression Tests
-~~~~~~~~~~~~~~~~
-
-We define a very simple training corpus with 3 binary features: ['a',
-'b', 'c'], and two labels: ['x', 'y']. We use a simple feature set so
-that the correct answers can be calculated analytically (although we
-haven't done this yet for all tests).
-
- >>> train = [
- ... (dict(a=1,b=1,c=1), 'y'),
- ... (dict(a=1,b=1,c=1), 'x'),
- ... (dict(a=1,b=1,c=0), 'y'),
- ... (dict(a=0,b=1,c=1), 'x'),
- ... (dict(a=0,b=1,c=1), 'y'),
- ... (dict(a=0,b=0,c=1), 'y'),
- ... (dict(a=0,b=1,c=0), 'x'),
- ... (dict(a=0,b=0,c=0), 'x'),
- ... (dict(a=0,b=1,c=1), 'y'),
- ... (dict(a=None,b=1,c=0), 'x'),
- ... ]
- >>> test = [
- ... (dict(a=1,b=0,c=1)), # unseen
- ... (dict(a=1,b=0,c=0)), # unseen
- ... (dict(a=0,b=1,c=1)), # seen 3 times, labels=y,y,x
- ... (dict(a=0,b=1,c=0)), # seen 1 time, label=x
- ... ]
-
-Test the Naive Bayes classifier:
-
- >>> classifier = nltk.classify.NaiveBayesClassifier.train(train)
- >>> sorted(classifier.labels())
- ['x', 'y']
- >>> classifier.classify_many(test)
- ['y', 'x', 'y', 'x']
- >>> for pdist in classifier.prob_classify_many(test):
- ... print('%.4f %.4f' % (pdist.prob('x'), pdist.prob('y')))
- 0.2500 0.7500
- 0.5833 0.4167
- 0.3571 0.6429
- 0.7000 0.3000
- >>> classifier.show_most_informative_features()
- Most Informative Features
- c = 0 x : y = 2.3 : 1.0
- c = 1 y : x = 1.8 : 1.0
- a = 1 y : x = 1.7 : 1.0
- a = 0 x : y = 1.0 : 1.0
- b = 0 x : y = 1.0 : 1.0
- b = 1 x : y = 1.0 : 1.0
-
-Test the Decision Tree classifier (without None):
-
- >>> classifier = nltk.classify.DecisionTreeClassifier.train(
- ... train[:-1], entropy_cutoff=0,
- ... support_cutoff=0)
- >>> sorted(classifier.labels())
- ['x', 'y']
- >>> print(classifier)
- c=0? .................................................. x
- a=0? ................................................ x
- a=1? ................................................ y
- c=1? .................................................. y
- <BLANKLINE>
- >>> classifier.classify_many(test)
- ['y', 'y', 'y', 'x']
- >>> for pdist in classifier.prob_classify_many(test):
- ... print('%.4f %.4f' % (pdist.prob('x'), pdist.prob('y')))
- Traceback (most recent call last):
- . . .
- NotImplementedError
-
-
-Test the Decision Tree classifier (with None):
-
- >>> classifier = nltk.classify.DecisionTreeClassifier.train(
- ... train, entropy_cutoff=0,
- ... support_cutoff=0)
- >>> sorted(classifier.labels())
- ['x', 'y']
- >>> print(classifier)
- c=0? .................................................. x
- a=0? ................................................ x
- a=1? ................................................ y
- a=None? ............................................. x
- c=1? .................................................. y
- <BLANKLINE>
-
-
-Test SklearnClassifier, which requires the scikit-learn package.
-
- >>> from nltk.classify import SklearnClassifier
- >>> from sklearn.naive_bayes import BernoulliNB
- >>> from sklearn.svm import SVC
- >>> train_data = [({"a": 4, "b": 1, "c": 0}, "ham"),
- ... ({"a": 5, "b": 2, "c": 1}, "ham"),
- ... ({"a": 0, "b": 3, "c": 4}, "spam"),
- ... ({"a": 5, "b": 1, "c": 1}, "ham"),
- ... ({"a": 1, "b": 4, "c": 3}, "spam")]
- >>> classif = SklearnClassifier(BernoulliNB()).train(train_data)
- >>> test_data = [{"a": 3, "b": 2, "c": 1},
- ... {"a": 0, "b": 3, "c": 7}]
- >>> classif.classify_many(test_data)
- ['ham', 'spam']
- >>> classif = SklearnClassifier(SVC(), sparse=False).train(train_data)
- >>> classif.classify_many(test_data)
- ['ham', 'spam']
-
-Test the Maximum Entropy classifier training algorithms; they should all
-generate the same results.
-
- >>> def print_maxent_test_header():
- ... print(' '*11+''.join([' test[%s] ' % i
- ... for i in range(len(test))]))
- ... print(' '*11+' p(x) p(y)'*len(test))
- ... print('-'*(11+15*len(test)))
-
- >>> def test_maxent(algorithm):
- ... print('%11s' % algorithm, end=' ')
- ... try:
- ... classifier = nltk.classify.MaxentClassifier.train(
- ... train, algorithm, trace=0, max_iter=1000)
- ... except Exception as e:
- ... print('Error: %r' % e)
- ... return
- ...
- ... for featureset in test:
- ... pdist = classifier.prob_classify(featureset)
- ... print('%8.2f%6.2f' % (pdist.prob('x'), pdist.prob('y')), end=' ')
- ... print()
-
- >>> print_maxent_test_header(); test_maxent('GIS'); test_maxent('IIS')
- test[0] test[1] test[2] test[3]
- p(x) p(y) p(x) p(y) p(x) p(y) p(x) p(y)
- -----------------------------------------------------------------------
- GIS 0.16 0.84 0.46 0.54 0.41 0.59 0.76 0.24
- IIS 0.16 0.84 0.46 0.54 0.41 0.59 0.76 0.24
-
- >>> test_maxent('MEGAM'); test_maxent('TADM') # doctest: +SKIP
- MEGAM 0.16 0.84 0.46 0.54 0.41 0.59 0.76 0.24
- TADM 0.16 0.84 0.46 0.54 0.41 0.59 0.76 0.24
-
-
-
-Regression tests for TypedMaxentFeatureEncoding
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
- >>> from nltk.classify import maxent
- >>> train = [
- ... ({'a': 1, 'b': 1, 'c': 1}, 'y'),
- ... ({'a': 5, 'b': 5, 'c': 5}, 'x'),
- ... ({'a': 0.9, 'b': 0.9, 'c': 0.9}, 'y'),
- ... ({'a': 5.5, 'b': 5.4, 'c': 5.3}, 'x'),
- ... ({'a': 0.8, 'b': 1.2, 'c': 1}, 'y'),
- ... ({'a': 5.1, 'b': 4.9, 'c': 5.2}, 'x')
- ... ]
-
- >>> test = [
- ... {'a': 1, 'b': 0.8, 'c': 1.2},
- ... {'a': 5.2, 'b': 5.1, 'c': 5}
- ... ]
-
- >>> encoding = maxent.TypedMaxentFeatureEncoding.train(
- ... train, count_cutoff=3, alwayson_features=True)
-
- >>> classifier = maxent.MaxentClassifier.train(
- ... train, bernoulli=False, encoding=encoding, trace=0)
-
- >>> classifier.classify_many(test)
- ['y', 'x']
+++ /dev/null
-# -*- coding: utf-8 -*-
-
-
-# most of classify.doctest requires numpy
-def setup_module(module):
- from nose import SkipTest
-
- try:
- import numpy
- except ImportError:
- raise SkipTest("classify.doctest requires numpy")
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-===========
-Collections
-===========
-
- >>> import nltk
- >>> from nltk.collections import *
-
-Trie
-----
-
-Trie can be pickled:
-
- >>> import pickle
- >>> trie = nltk.collections.Trie(['a'])
- >>> s = pickle.dumps(trie)
- >>> pickle.loads(s)
- {'a': {True: None}}
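-
-Because a trie is stored as a nested dictionary, strings that share a
-prefix also share their leading nodes (a brief illustrative sketch, not
-part of the original test suite):
-
- >>> nltk.collections.Trie(['cat', 'car'])
- {'c': {'a': {'t': {True: None}, 'r': {True: None}}}}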
\ No newline at end of file
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-==============
- Collocations
-==============
-
-Overview
-~~~~~~~~
-
-Collocations are expressions of multiple words which commonly co-occur. For
-example, the top ten bigram collocations in Genesis are listed below, as
-measured using Pointwise Mutual Information.
-
- >>> import nltk
- >>> from nltk.collocations import *
- >>> bigram_measures = nltk.collocations.BigramAssocMeasures()
- >>> trigram_measures = nltk.collocations.TrigramAssocMeasures()
- >>> fourgram_measures = nltk.collocations.QuadgramAssocMeasures()
- >>> finder = BigramCollocationFinder.from_words(
- ... nltk.corpus.genesis.words('english-web.txt'))
- >>> finder.nbest(bigram_measures.pmi, 10) # doctest: +NORMALIZE_WHITESPACE
- [('Allon', 'Bacuth'), ('Ashteroth', 'Karnaim'), ('Ben', 'Ammi'),
- ('En', 'Mishpat'), ('Jegar', 'Sahadutha'), ('Salt', 'Sea'),
- ('Whoever', 'sheds'), ('appoint', 'overseers'), ('aromatic', 'resin'),
- ('cutting', 'instrument')]
-
-While these words are highly collocated, the expressions are also very
-infrequent. Therefore it is useful to apply filters, such as ignoring all
-bigrams which occur fewer than three times in the corpus:
-
- >>> finder.apply_freq_filter(3)
- >>> finder.nbest(bigram_measures.pmi, 10) # doctest: +NORMALIZE_WHITESPACE
- [('Beer', 'Lahai'), ('Lahai', 'Roi'), ('gray', 'hairs'),
- ('Most', 'High'), ('ewe', 'lambs'), ('many', 'colors'),
- ('burnt', 'offering'), ('Paddan', 'Aram'), ('east', 'wind'),
- ('living', 'creature')]
-
-We may similarly find collocations among tagged words:
-
- >>> finder = BigramCollocationFinder.from_words(
- ... nltk.corpus.brown.tagged_words('ca01', tagset='universal'))
- >>> finder.nbest(bigram_measures.pmi, 5) # doctest: +NORMALIZE_WHITESPACE
- [(('1,119', 'NUM'), ('votes', 'NOUN')),
- (('1962', 'NUM'), ("governor's", 'NOUN')),
- (('637', 'NUM'), ('E.', 'NOUN')),
- (('Alpharetta', 'NOUN'), ('prison', 'NOUN')),
- (('Bar', 'NOUN'), ('Association', 'NOUN'))]
-
-Or tags alone:
-
- >>> finder = BigramCollocationFinder.from_words(t for w, t in
- ... nltk.corpus.brown.tagged_words('ca01', tagset='universal'))
- >>> finder.nbest(bigram_measures.pmi, 10) # doctest: +NORMALIZE_WHITESPACE
- [('PRT', 'VERB'), ('PRON', 'VERB'), ('ADP', 'DET'), ('.', 'PRON'), ('DET', 'ADJ'),
- ('CONJ', 'PRON'), ('ADP', 'NUM'), ('NUM', '.'), ('ADV', 'ADV'), ('VERB', 'ADV')]
-
-Or spanning intervening words:
-
- >>> finder = BigramCollocationFinder.from_words(
- ... nltk.corpus.genesis.words('english-web.txt'),
- ... window_size = 20)
- >>> finder.apply_freq_filter(2)
- >>> ignored_words = nltk.corpus.stopwords.words('english')
- >>> finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
- >>> finder.nbest(bigram_measures.likelihood_ratio, 10) # doctest: +NORMALIZE_WHITESPACE
- [('chief', 'chief'), ('became', 'father'), ('years', 'became'),
- ('hundred', 'years'), ('lived', 'became'), ('king', 'king'),
- ('lived', 'years'), ('became', 'became'), ('chief', 'chiefs'),
- ('hundred', 'became')]
-
-Finders
-~~~~~~~
-
-The collocations package provides collocation finders which by default
-consider all ngrams in a text as candidate collocations:
-
- >>> text = "I do not like green eggs and ham, I do not like them Sam I am!"
- >>> tokens = nltk.wordpunct_tokenize(text)
- >>> finder = BigramCollocationFinder.from_words(tokens)
- >>> scored = finder.score_ngrams(bigram_measures.raw_freq)
- >>> sorted(bigram for bigram, score in scored) # doctest: +NORMALIZE_WHITESPACE
- [(',', 'I'), ('I', 'am'), ('I', 'do'), ('Sam', 'I'), ('am', '!'),
- ('and', 'ham'), ('do', 'not'), ('eggs', 'and'), ('green', 'eggs'),
- ('ham', ','), ('like', 'green'), ('like', 'them'), ('not', 'like'),
- ('them', 'Sam')]
-
-We could alternatively construct the collocation finder from manually-derived
-FreqDists:
-
- >>> word_fd = nltk.FreqDist(tokens)
- >>> bigram_fd = nltk.FreqDist(nltk.bigrams(tokens))
- >>> finder = BigramCollocationFinder(word_fd, bigram_fd)
- >>> scored == finder.score_ngrams(bigram_measures.raw_freq)
- True
-
-A similar interface is provided for trigrams:
-
- >>> finder = TrigramCollocationFinder.from_words(tokens)
- >>> scored = finder.score_ngrams(trigram_measures.raw_freq)
- >>> set(trigram for trigram, score in scored) == set(nltk.trigrams(tokens))
- True
-
-We may want to select only the top n results:
-
- >>> sorted(finder.nbest(trigram_measures.raw_freq, 2))
- [('I', 'do', 'not'), ('do', 'not', 'like')]
-
-Alternatively, we can select those above a minimum score value:
-
- >>> sorted(finder.above_score(trigram_measures.raw_freq,
- ... 1.0 / len(tuple(nltk.trigrams(tokens)))))
- [('I', 'do', 'not'), ('do', 'not', 'like')]
-
-Now spanning intervening words:
-
- >>> finder = TrigramCollocationFinder.from_words(tokens, window_size=4)
- >>> sorted(finder.nbest(trigram_measures.raw_freq, 4))
- [('I', 'do', 'like'), ('I', 'do', 'not'), ('I', 'not', 'like'), ('do', 'not', 'like')]
-
-A closer look at the finder's ngram frequencies:
-
- >>> sorted(finder.ngram_fd.items(), key=lambda t: (-t[1], t[0]))[:10] # doctest: +NORMALIZE_WHITESPACE
- [(('I', 'do', 'like'), 2), (('I', 'do', 'not'), 2), (('I', 'not', 'like'), 2),
- (('do', 'not', 'like'), 2), ((',', 'I', 'do'), 1), ((',', 'I', 'not'), 1),
- ((',', 'do', 'not'), 1), (('I', 'am', '!'), 1), (('Sam', 'I', '!'), 1),
- (('Sam', 'I', 'am'), 1)]
-
-A similar interface is provided for fourgrams:
-
- >>> finder_4grams = QuadgramCollocationFinder.from_words(tokens)
- >>> scored_4grams = finder_4grams.score_ngrams(fourgram_measures.raw_freq)
- >>> set(fourgram for fourgram, score in scored_4grams) == set(nltk.ngrams(tokens, n=4))
- True
-
-Filtering candidates
-~~~~~~~~~~~~~~~~~~~~
-
-The ngrams in a text are often too numerous for all of them to be useful
-when finding collocations. It is generally useful to remove some words or
-punctuation, and to require a minimum frequency for candidate collocations.
-
-Given our sample text above, if we remove all trigrams containing personal
-pronouns from candidature, score_ngrams should return 6 fewer results, and
-'do not like' will be the only candidate which occurs more than once:
-
- >>> finder = TrigramCollocationFinder.from_words(tokens)
- >>> len(finder.score_ngrams(trigram_measures.raw_freq))
- 14
- >>> finder.apply_word_filter(lambda w: w in ('I', 'me'))
- >>> len(finder.score_ngrams(trigram_measures.raw_freq))
- 8
- >>> sorted(finder.above_score(trigram_measures.raw_freq,
- ... 1.0 / len(tuple(nltk.trigrams(tokens)))))
- [('do', 'not', 'like')]
-
-Sometimes a filter is a function on the whole ngram, rather than on each
-word; for example, we may permit 'and' to appear in the middle of a trigram,
-but not on either edge:
-
- >>> finder.apply_ngram_filter(lambda w1, w2, w3: 'and' in (w1, w3))
- >>> len(finder.score_ngrams(trigram_measures.raw_freq))
- 6
-
-Finally, it is often important to remove low frequency candidates, as we
-lack sufficient evidence about their significance as collocations:
-
- >>> finder.apply_freq_filter(2)
- >>> len(finder.score_ngrams(trigram_measures.raw_freq))
- 1
-
-Association measures
-~~~~~~~~~~~~~~~~~~~~
-
-A number of measures are available to score collocations or other associations.
-The arguments to measure functions are marginals of a contingency table, in the
-bigram case (n_ii, (n_ix, n_xi), n_xx)::
-
-            w1     ~w1
-          ------ ------
-     w2  | n_ii | n_oi | = n_xi
-          ------ ------
-    ~w2  | n_io | n_oo |
-          ------ ------
-          = n_ix          TOTAL = n_xx
-
-We test their calculation using some known values presented in Manning and
-Schutze's text and other papers.
-
-Student's t: examples from Manning and Schutze 5.3.2
-
- >>> print('%0.4f' % bigram_measures.student_t(8, (15828, 4675), 14307668))
- 0.9999
- >>> print('%0.4f' % bigram_measures.student_t(20, (42, 20), 14307668))
- 4.4721
-
-Chi-square: examples from Manning and Schutze 5.3.3
-
- >>> print('%0.2f' % bigram_measures.chi_sq(8, (15828, 4675), 14307668))
- 1.55
- >>> print('%0.0f' % bigram_measures.chi_sq(59, (67, 65), 571007))
- 456400
-
-Likelihood ratios: examples from Dunning, CL, 1993
-
- >>> print('%0.2f' % bigram_measures.likelihood_ratio(110, (2552, 221), 31777))
- 270.72
- >>> print('%0.2f' % bigram_measures.likelihood_ratio(8, (13, 32), 31777))
- 95.29
-
-Pointwise Mutual Information: examples from Manning and Schutze 5.4
-
- >>> print('%0.2f' % bigram_measures.pmi(20, (42, 20), 14307668))
- 18.38
- >>> print('%0.2f' % bigram_measures.pmi(20, (15019, 15629), 14307668))
- 0.29
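-
-These values can be reproduced directly from the definitions used above (a
-quick sanity check, not part of the original test suite): PMI is
-log2(n_ii * n_xx / (n_ix * n_xi)), and the Student's t value is approximated
-as (n_ii - n_ix * n_xi / n_xx) / sqrt(n_ii).
-
- >>> from math import log2, sqrt
- >>> print('%0.2f' % log2(20 * 14307668 / (42 * 20)))
- 18.38
- >>> print('%0.4f' % ((20 - 42 * 20 / 14307668) / sqrt(20)))
- 4.4721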
-
-TODO: Find authoritative results for trigrams.
-
-Using contingency table values
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-While frequency counts make marginals readily available for collocation
-finding, it is common to find published contingency table values. The
-collocations package therefore provides a wrapper, ContingencyMeasures, which
-wraps an association measures class, providing association measures which
-take contingency values as arguments ((n_ii, n_io, n_oi, n_oo) in the
-bigram case).
-
- >>> from nltk.metrics import ContingencyMeasures
- >>> cont_bigram_measures = ContingencyMeasures(bigram_measures)
- >>> print('%0.2f' % cont_bigram_measures.likelihood_ratio(8, 5, 24, 31740))
- 95.29
- >>> print('%0.2f' % cont_bigram_measures.chi_sq(8, 15820, 4667, 14287173))
- 1.55
-
-Ranking and correlation
-~~~~~~~~~~~~~~~~~~~~~~~
-
-It is useful to consider the results of finding collocations as a ranking, and
-the rankings output using different association measures can be compared using
-the Spearman correlation coefficient.
-
-Ranks can be assigned to a sorted list of results by giving each result a
-strictly increasing rank:
-
- >>> from nltk.metrics.spearman import *
- >>> results_list = ['item1', 'item2', 'item3', 'item4', 'item5']
- >>> print(list(ranks_from_sequence(results_list)))
- [('item1', 0), ('item2', 1), ('item3', 2), ('item4', 3), ('item5', 4)]
-
-If scores are available for each result, we may allow sufficiently similar
-results (differing by no more than rank_gap) to be assigned the same rank:
-
- >>> results_scored = [('item1', 50.0), ('item2', 40.0), ('item3', 38.0),
- ... ('item4', 35.0), ('item5', 14.0)]
- >>> print(list(ranks_from_scores(results_scored, rank_gap=5)))
- [('item1', 0), ('item2', 1), ('item3', 1), ('item4', 1), ('item5', 4)]
-
-The Spearman correlation coefficient gives a number from -1.0 to 1.0 comparing
-two rankings. A coefficient of 1.0 indicates identical rankings; -1.0 indicates
-exact opposite rankings.
-
- >>> print('%0.1f' % spearman_correlation(
- ... ranks_from_sequence(results_list),
- ... ranks_from_sequence(results_list)))
- 1.0
- >>> print('%0.1f' % spearman_correlation(
- ... ranks_from_sequence(reversed(results_list)),
- ... ranks_from_sequence(results_list)))
- -1.0
- >>> results_list2 = ['item2', 'item3', 'item1', 'item5', 'item4']
- >>> print('%0.1f' % spearman_correlation(
- ... ranks_from_sequence(results_list),
- ... ranks_from_sequence(results_list2)))
- 0.6
- >>> print('%0.1f' % spearman_correlation(
- ... ranks_from_sequence(reversed(results_list)),
- ... ranks_from_sequence(results_list2)))
- -0.6
-
-
+++ /dev/null
-.. Copyright (C) 2001-2016 NLTK Project
-.. For license information, see LICENSE.TXT
-
-==================================
-Concordance Example
-==================================
-
-A concordance view shows us every occurrence of a given word, together
-with some context. Here we look up the word "monstrous" in Moby Dick by
-building a ``Text`` object and calling its ``concordance()`` method with
-"monstrous" as the argument:
-
->>> from nltk.corpus import gutenberg
->>> from nltk.text import Text
->>> corpus = gutenberg.words('melville-moby_dick.txt')
->>> text = Text(corpus)
-
->>> text.concordance("monstrous") # doctest:+NORMALIZE_WHITESPACE
-Displaying 11 of 11 matches:
-ong the former , one was of a most monstrous size . ... This came towards us ,
-ON OF THE PSALMS . " Touching that monstrous bulk of the whale or ork we have r
-ll over with a heathenish array of monstrous clubs and spears . Some were thick
-d as you gazed , and wondered what monstrous cannibal and savage could ever hav
-that has survived the flood ; most monstrous and most mountainous ! That Himmal
-they might scout at Moby Dick as a monstrous fable , or still worse and more de
-th of Radney .'" CHAPTER 55 Of the Monstrous Pictures of Whales . I shall ere l
-ing Scenes . In connexion with the monstrous pictures of whales , I am strongly
-ere to enter upon those still more monstrous stories of them which are to be fo
-ght have been rummaged out of this monstrous cabinet there is no telling . But
-of Whale - Bones ; for Whales of a monstrous size are oftentimes cast up dead u
-
->>> text.concordance("monstrous") # doctest:+ELLIPSIS, +NORMALIZE_WHITESPACE
-Displaying 11 of 11 matches:
-ong the former , one was of a most monstrous size . ... This came towards us ,
-ON OF THE PSALMS . " Touching that monstrous bulk of the whale or ork we have r
-ll over with a heathenish array of monstrous clubs and spears . Some were thick
-...
-
-=================================
-Concordance List
-=================================
-
-Often we need to store the results of a concordance for further use.
-To do so, call the ``concordance_list()`` method, which returns the matches
-as a list instead of printing them:
-
->>> from nltk.corpus import gutenberg
->>> from nltk.text import Text
->>> corpus = gutenberg.words('melville-moby_dick.txt')
->>> text = Text(corpus)
->>> con_list = text.concordance_list("monstrous")
->>> con_list[2].line
-'ll over with a heathenish array of monstrous clubs and spears . Some were thick'
->>> len(con_list)
-11
-
-=================================
-Patching Issue #2088
-=================================
-
-Patching https://github.com/nltk/nltk/issues/2088
-The left slice of the left context should be clipped to 0 if `i - context` < 0.
-
->>> from nltk import Text, word_tokenize
->>> jane_eyre = 'Chapter 1\nTHERE was no possibility of taking a walk that day. We had been wandering, indeed, in the leafless shrubbery an hour in the morning; but since dinner (Mrs. Reed, when there was no company, dined early) the cold winter wind had brought with it clouds so sombre, and a rain so penetrating, that further outdoor exercise was now out of the question.'
->>> text = Text(word_tokenize(jane_eyre))
->>> text.concordance_list('taking')[0].left
-['Chapter', '1', 'THERE', 'was', 'no', 'possibility', 'of']
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-================
- Corpus Readers
-================
-
-The `nltk.corpus` package defines a collection of *corpus reader*
-classes, which can be used to access the contents of a diverse set of
-corpora. The list of available corpora is given at:
-
-http://www.nltk.org/nltk_data/
-
-Each corpus reader class is specialized to handle a specific
-corpus format. In addition, the `nltk.corpus` package automatically
-creates a set of corpus reader instances that can be used to access
-the corpora in the NLTK data package.
-Section `Corpus Reader Objects`_ ("Corpus Reader Objects") describes
-the corpus reader instances that can be used to read the corpora in
-the NLTK data package. Section `Corpus Reader Classes`_ ("Corpus
-Reader Classes") describes the corpus reader classes themselves, and
-discusses the issues involved in creating new corpus reader objects
-and new corpus reader classes. Section `Regression Tests`_
-("Regression Tests") contains regression tests for the corpus readers
-and associated functions and classes.
-
-.. contents:: **Table of Contents**
- :depth: 2
- :backlinks: none
-
----------------------
-Corpus Reader Objects
----------------------
-
-Overview
-========
-
-NLTK includes a diverse set of corpora which can be
-read using the ``nltk.corpus`` package. Each corpus is accessed by
-means of a "corpus reader" object from ``nltk.corpus``:
-
- >>> import nltk.corpus
- >>> # The Brown corpus:
- >>> print(str(nltk.corpus.brown).replace('\\\\','/'))
- <CategorizedTaggedCorpusReader in '.../corpora/brown'...>
- >>> # The Penn Treebank Corpus:
- >>> print(str(nltk.corpus.treebank).replace('\\\\','/'))
- <BracketParseCorpusReader in '.../corpora/treebank/combined'...>
- >>> # The Name Genders Corpus:
- >>> print(str(nltk.corpus.names).replace('\\\\','/'))
- <WordListCorpusReader in '.../corpora/names'...>
- >>> # The Inaugural Address Corpus:
- >>> print(str(nltk.corpus.inaugural).replace('\\\\','/'))
- <PlaintextCorpusReader in '.../corpora/inaugural'...>
-
-Most corpora consist of a set of files, each containing a document (or
-other pieces of text). A list of identifiers for these files is
-accessed via the ``fileids()`` method of the corpus reader:
-
- >>> nltk.corpus.treebank.fileids() # doctest: +ELLIPSIS
- ['wsj_0001.mrg', 'wsj_0002.mrg', 'wsj_0003.mrg', 'wsj_0004.mrg', ...]
- >>> nltk.corpus.inaugural.fileids() # doctest: +ELLIPSIS
- ['1789-Washington.txt', '1793-Washington.txt', '1797-Adams.txt', ...]
-
-Each corpus reader provides a variety of methods to read data from the
-corpus, depending on the format of the corpus. For example, plaintext
-corpora support methods to read the corpus as raw text, a list of
-words, a list of sentences, or a list of paragraphs.
-
- >>> from nltk.corpus import inaugural
- >>> inaugural.raw('1789-Washington.txt') # doctest: +ELLIPSIS
- 'Fellow-Citizens of the Senate ...'
- >>> inaugural.words('1789-Washington.txt')
- ['Fellow', '-', 'Citizens', 'of', 'the', ...]
- >>> inaugural.sents('1789-Washington.txt') # doctest: +ELLIPSIS
- [['Fellow', '-', 'Citizens'...], ['Among', 'the', 'vicissitudes'...]...]
- >>> inaugural.paras('1789-Washington.txt') # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
- [[['Fellow', '-', 'Citizens'...]],
- [['Among', 'the', 'vicissitudes'...],
- ['On', 'the', 'one', 'hand', ',', 'I'...]...]...]
-
-Each of these reader methods may be given a single document's item
-name or a list of document item names. When given a list of document
-item names, the reader methods will concatenate together the contents
-of the individual documents.
-
- >>> l1 = len(inaugural.words('1789-Washington.txt'))
- >>> l2 = len(inaugural.words('1793-Washington.txt'))
- >>> l3 = len(inaugural.words(['1789-Washington.txt', '1793-Washington.txt']))
- >>> print('%s+%s == %s' % (l1, l2, l3))
- 1538+147 == 1685
-
-If the reader methods are called without any arguments, they will
-typically load all documents in the corpus.
-
- >>> len(inaugural.words())
- 149797
-
-If a corpus contains a README file, it can be accessed with a ``readme()`` method:
-
- >>> inaugural.readme()[:32]
- 'C-Span Inaugural Address Corpus\n'
-
-Plaintext Corpora
-=================
-
-Here are the first few words from each of NLTK's plaintext corpora:
-
- >>> nltk.corpus.abc.words()
- ['PM', 'denies', 'knowledge', 'of', 'AWB', ...]
- >>> nltk.corpus.genesis.words()
- ['In', 'the', 'beginning', 'God', 'created', ...]
- >>> nltk.corpus.gutenberg.words(fileids='austen-emma.txt')
- ['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ...]
- >>> nltk.corpus.inaugural.words()
- ['Fellow', '-', 'Citizens', 'of', 'the', ...]
- >>> nltk.corpus.state_union.words()
- ['PRESIDENT', 'HARRY', 'S', '.', 'TRUMAN', "'", ...]
- >>> nltk.corpus.webtext.words()
- ['Cookie', 'Manager', ':', '"', 'Don', "'", 't', ...]
-
-Tagged Corpora
-==============
-
-In addition to the plaintext corpora, NLTK's data package also
-contains a wide variety of annotated corpora. For example, the Brown
-Corpus is annotated with part-of-speech tags, and defines additional
-methods ``tagged_*()`` which words as `(word,tag)` tuples, rather
-than just bare word strings.
-
- >>> from nltk.corpus import brown
- >>> print(brown.words())
- ['The', 'Fulton', 'County', 'Grand', 'Jury', ...]
- >>> print(brown.tagged_words())
- [('The', 'AT'), ('Fulton', 'NP-TL'), ...]
- >>> print(brown.sents()) # doctest: +ELLIPSIS
- [['The', 'Fulton', 'County'...], ['The', 'jury', 'further'...], ...]
- >>> print(brown.tagged_sents()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
- [[('The', 'AT'), ('Fulton', 'NP-TL')...],
- [('The', 'AT'), ('jury', 'NN'), ('further', 'RBR')...]...]
- >>> print(brown.paras(categories='reviews')) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
- [[['It', 'is', 'not', 'news', 'that', 'Nathan', 'Milstein'...],
- ['Certainly', 'not', 'in', 'Orchestra', 'Hall', 'where'...]],
- [['There', 'was', 'about', 'that', 'song', 'something', ...],
- ['Not', 'the', 'noblest', 'performance', 'we', 'have', ...], ...], ...]
- >>> print(brown.tagged_paras(categories='reviews')) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
- [[[('It', 'PPS'), ('is', 'BEZ'), ('not', '*'), ...],
- [('Certainly', 'RB'), ('not', '*'), ('in', 'IN'), ...]],
- [[('There', 'EX'), ('was', 'BEDZ'), ('about', 'IN'), ...],
- [('Not', '*'), ('the', 'AT'), ('noblest', 'JJT'), ...], ...], ...]
-
-Similarly, the Indian Language POS-Tagged Corpus includes samples of
-Indian text annotated with part-of-speech tags:
-
- >>> from nltk.corpus import indian
- >>> print(indian.words()) # doctest: +SKIP
- ['\xe0\xa6\xae\xe0\xa6\xb9\xe0\xa6\xbf\...',
- '\xe0\xa6\xb8\xe0\xa6\xa8\xe0\xa7\x8d\xe0...', ...]
- >>> print(indian.tagged_words()) # doctest: +SKIP
- [('\xe0\xa6\xae\xe0\xa6\xb9\xe0\xa6\xbf...', 'NN'),
- ('\xe0\xa6\xb8\xe0\xa6\xa8\xe0\xa7\x8d\xe0...', 'NN'), ...]
-
-Several tagged corpora support access to a simplified, universal tagset, e.g. where all noun
-tags are collapsed to a single category ``NOUN``:
-
- >>> print(brown.tagged_sents(tagset='universal')) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
- [[('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN'), ('Grand', 'ADJ'), ('Jury', 'NOUN'), ...],
- [('The', 'DET'), ('jury', 'NOUN'), ('further', 'ADV'), ('said', 'VERB'), ('in', 'ADP'), ...]...]
- >>> from nltk.corpus import conll2000, switchboard
- >>> print(conll2000.tagged_words(tagset='universal')) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
- [('Confidence', 'NOUN'), ('in', 'ADP'), ...]
-
-Use ``nltk.app.pos_concordance()`` to access a GUI for searching tagged corpora.
-
-Chunked Corpora
-===============
-
-The CoNLL corpora also provide chunk structures, which are encoded as
-flat trees. The CoNLL 2000 Corpus includes phrasal chunks; and the
-CoNLL 2002 Corpus includes named entity chunks.
-
- >>> from nltk.corpus import conll2000, conll2002
- >>> print(conll2000.sents()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
- [['Confidence', 'in', 'the', 'pound', 'is', 'widely', ...],
- ['Chancellor', 'of', 'the', 'Exchequer', ...], ...]
- >>> for tree in conll2000.chunked_sents()[:2]:
- ... print(tree) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
- (S
- (NP Confidence/NN)
- (PP in/IN)
- (NP the/DT pound/NN)
- (VP is/VBZ widely/RB expected/VBN to/TO take/VB)
- (NP another/DT sharp/JJ dive/NN)
- if/IN
- ...)
- (S
- Chancellor/NNP
- (PP of/IN)
- (NP the/DT Exchequer/NNP)
- ...)
- >>> print(conll2002.sents()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
- [['Sao', 'Paulo', '(', 'Brasil', ')', ',', ...], ['-'], ...]
- >>> for tree in conll2002.chunked_sents()[:2]:
- ... print(tree) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
- (S
- (LOC Sao/NC Paulo/VMI)
- (/Fpa
- (LOC Brasil/NC)
- )/Fpt
- ...)
- (S -/Fg)
-
-.. note:: Since the CoNLL corpora do not contain paragraph break
-   information, these readers do not support the ``paras()`` method.
-
-.. warning:: If you call the conll corpora reader methods without any
-   arguments, they will return the contents of the entire corpus,
-   *including* the 'test' portions of the corpus.
-
-SemCor is a subset of the Brown corpus tagged with WordNet senses and
-named entities. Both kinds of lexical items include multiword units,
-which are encoded as chunks (senses and part-of-speech tags pertain
-to the entire chunk).
-
- >>> from nltk.corpus import semcor
- >>> semcor.words()
- ['The', 'Fulton', 'County', 'Grand', 'Jury', ...]
- >>> semcor.chunks()
- [['The'], ['Fulton', 'County', 'Grand', 'Jury'], ...]
- >>> semcor.sents() # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
- [['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...],
- ['The', 'jury', 'further', 'said', ...], ...]
- >>> semcor.chunk_sents() # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
- [[['The'], ['Fulton', 'County', 'Grand', 'Jury'], ['said'], ...
- ['.']], [['The'], ['jury'], ['further'], ['said'], ... ['.']], ...]
- >>> list(map(str, semcor.tagged_chunks(tag='both')[:3]))
- ['(DT The)', "(Lemma('group.n.01.group') (NE (NNP Fulton County Grand Jury)))", "(Lemma('state.v.01.say') (VB said))"]
- >>> [[str(c) for c in s] for s in semcor.tagged_sents(tag='both')[:2]]
- [['(DT The)', "(Lemma('group.n.01.group') (NE (NNP Fulton County Grand Jury)))", ...
- '(None .)'], ['(DT The)', ... '(None .)']]
-
-
-The IEER corpus is another chunked corpus. This corpus is unusual in
-that each corpus item (file) contains multiple documents. The IEER
-corpus defines the `parsed_docs` method, which returns the documents
-in a given item as `IEERDocument` objects:
-
- >>> from nltk.corpus import ieer
- >>> ieer.fileids() # doctest: +NORMALIZE_WHITESPACE
- ['APW_19980314', 'APW_19980424', 'APW_19980429',
- 'NYT_19980315', 'NYT_19980403', 'NYT_19980407']
- >>> docs = ieer.parsed_docs('APW_19980314')
- >>> print(docs[0])
- <IEERDocument APW19980314.0391: 'Kenyans protest tax hikes'>
- >>> print(docs[0].docno)
- APW19980314.0391
- >>> print(docs[0].doctype)
- NEWS STORY
- >>> print(docs[0].date_time)
- 03/14/1998 10:36:00
- >>> print(docs[0].headline)
- (DOCUMENT Kenyans protest tax hikes)
- >>> print(docs[0].text) # doctest: +ELLIPSIS
- (DOCUMENT
- (LOCATION NAIROBI)
- ,
- (LOCATION Kenya)
- (
- (ORGANIZATION AP)
- )
- _
- (CARDINAL Thousands)
- of
- laborers,
- ...
- on
- (DATE Saturday)
- ...)
-
-Parsed Corpora
-==============
-
-The Treebank corpora provide a syntactic parse for each sentence. The
-NLTK data package includes a 10% sample of the Penn Treebank (in
-``treebank``), as well as the Sinica Treebank (in ``sinica_treebank``).
-
-Reading the Penn Treebank (Wall Street Journal sample):
-
- >>> from nltk.corpus import treebank
- >>> print(treebank.fileids()) # doctest: +ELLIPSIS
- ['wsj_0001.mrg', 'wsj_0002.mrg', 'wsj_0003.mrg', 'wsj_0004.mrg', ...]
- >>> print(treebank.words('wsj_0003.mrg'))
- ['A', 'form', 'of', 'asbestos', 'once', 'used', ...]
- >>> print(treebank.tagged_words('wsj_0003.mrg'))
- [('A', 'DT'), ('form', 'NN'), ('of', 'IN'), ...]
- >>> print(treebank.parsed_sents('wsj_0003.mrg')[0]) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
- (S
- (S-TPC-1
- (NP-SBJ
- (NP (NP (DT A) (NN form)) (PP (IN of) (NP (NN asbestos))))
- (RRC ...)...)...)
- ...
- (VP (VBD reported) (SBAR (-NONE- 0) (S (-NONE- *T*-1))))
- (. .))
-
-If you have access to a full installation of the Penn Treebank, NLTK
-can be configured to load it as well. Download the ``ptb`` package,
-and in the directory ``nltk_data/corpora/ptb`` place the ``BROWN``
-and ``WSJ`` directories of the Treebank installation (symlinks work
-as well). Then use the ``ptb`` module instead of ``treebank``:
-
- >>> from nltk.corpus import ptb
- >>> print(ptb.fileids()) # doctest: +SKIP
- ['BROWN/CF/CF01.MRG', 'BROWN/CF/CF02.MRG', 'BROWN/CF/CF03.MRG', 'BROWN/CF/CF04.MRG', ...]
- >>> print(ptb.words('WSJ/00/WSJ_0003.MRG')) # doctest: +SKIP
- ['A', 'form', 'of', 'asbestos', 'once', 'used', '*', ...]
- >>> print(ptb.tagged_words('WSJ/00/WSJ_0003.MRG')) # doctest: +SKIP
- [('A', 'DT'), ('form', 'NN'), ('of', 'IN'), ...]
-
-...and so forth, like ``treebank`` but with extended fileids. Categories
-specified in ``allcats.txt`` can be used to filter by genre; they consist
-of ``news`` (for WSJ articles) and names of the Brown subcategories
-(``fiction``, ``humor``, ``romance``, etc.):
-
- >>> ptb.categories() # doctest: +SKIP
- ['adventure', 'belles_lettres', 'fiction', 'humor', 'lore', 'mystery', 'news', 'romance', 'science_fiction']
- >>> print(ptb.fileids('news')) # doctest: +SKIP
- ['WSJ/00/WSJ_0001.MRG', 'WSJ/00/WSJ_0002.MRG', 'WSJ/00/WSJ_0003.MRG', ...]
- >>> print(ptb.words(categories=['humor','fiction'])) # doctest: +SKIP
- ['Thirty-three', 'Scotty', 'did', 'not', 'go', 'back', ...]
-
-As PropBank and NomBank depend on the (WSJ portion of the) Penn Treebank,
-the modules ``propbank_ptb`` and ``nombank_ptb`` are provided for access
-to a full PTB installation.
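-
-For example, once the ``WSJ`` directory is in place, PropBank instances can
-be read with extended fileids (an illustrative sketch only; the output shown
-is hypothetical and the lines are marked to be skipped, since they require a
-local PTB installation):
-
- >>> from nltk.corpus import propbank_ptb # doctest: +SKIP
- >>> inst = propbank_ptb.instances()[0] # doctest: +SKIP
- >>> (inst.fileid, inst.sentnum, inst.wordnum) # doctest: +SKIP
- ('WSJ/00/WSJ_0001.MRG', 0, 8)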
-
-Reading the Sinica Treebank:
-
- >>> from nltk.corpus import sinica_treebank
- >>> print(sinica_treebank.sents()) # doctest: +SKIP
- [['\xe4\xb8\x80'], ['\xe5\x8f\x8b\xe6\x83\x85'], ...]
- >>> sinica_treebank.parsed_sents()[25] # doctest: +SKIP
- Tree('S',
- [Tree('NP',
- [Tree('Nba', ['\xe5\x98\x89\xe7\x8f\x8d'])]),
- Tree('V\xe2\x80\xa7\xe5\x9c\xb0',
- [Tree('VA11', ['\xe4\xb8\x8d\xe5\x81\x9c']),
- Tree('DE', ['\xe7\x9a\x84'])]),
- Tree('VA4', ['\xe5\x93\xad\xe6\xb3\xa3'])])
-
-Reading the CoNLL 2007 Dependency Treebanks:
-
- >>> from nltk.corpus import conll2007
- >>> conll2007.sents('esp.train')[0] # doctest: +SKIP
- ['El', 'aumento', 'del', 'índice', 'de', 'desempleo', ...]
- >>> conll2007.parsed_sents('esp.train')[0] # doctest: +SKIP
- <DependencyGraph with 38 nodes>
- >>> print(conll2007.parsed_sents('esp.train')[0].tree()) # doctest: +SKIP
- (fortaleció
- (aumento El (del (índice (de (desempleo estadounidense)))))
- hoy
- considerablemente
- (al
- (euro
- (cotizaba
- ,
- que
- (a (15.35 las GMT))
- se
- (en (mercado el (de divisas) (de Fráncfort)))
- (a 0,9452_dólares)
- (frente_a , (0,9349_dólares los (de (mañana esta)))))))
- .)
-
-Word Lists and Lexicons
-=======================
-
-The NLTK data package also includes a number of lexicons and word
-lists. These are accessed just like text corpora. The following
-examples illustrate the use of the wordlist corpora:
-
- >>> from nltk.corpus import names, stopwords, words
- >>> words.fileids()
- ['en', 'en-basic']
- >>> words.words('en') # doctest: +ELLIPSIS
- ['A', 'a', 'aa', 'aal', 'aalii', 'aam', 'Aani', 'aardvark', 'aardwolf', ...]
-
- >>> stopwords.fileids() # doctest: +ELLIPSIS
- ['arabic', 'azerbaijani', 'danish', 'dutch', 'english', 'finnish', 'french', ...]
- >>> sorted(stopwords.words('portuguese')) # doctest: +ELLIPSIS
- ['a', 'ao', 'aos', 'aquela', 'aquelas', 'aquele', 'aqueles', ...]
- >>> names.fileids()
- ['female.txt', 'male.txt']
- >>> names.words('male.txt') # doctest: +ELLIPSIS
- ['Aamir', 'Aaron', 'Abbey', 'Abbie', 'Abbot', 'Abbott', ...]
- >>> names.words('female.txt') # doctest: +ELLIPSIS
- ['Abagael', 'Abagail', 'Abbe', 'Abbey', 'Abbi', 'Abbie', ...]
-
-The CMU Pronunciation Dictionary corpus contains pronunciation
-transcriptions for over 100,000 words. It can be accessed as a list
-of entries (where each entry consists of a word, an identifier, and a
-transcription) or as a dictionary from words to lists of
-transcriptions. Transcriptions are encoded as lists of phoneme
-strings.
-
- >>> from nltk.corpus import cmudict
- >>> print(cmudict.entries()[653:659]) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
- [('acetate', ['AE1', 'S', 'AH0', 'T', 'EY2', 'T']),
- ('acetic', ['AH0', 'S', 'EH1', 'T', 'IH0', 'K']),
- ('acetic', ['AH0', 'S', 'IY1', 'T', 'IH0', 'K']),
- ('aceto', ['AA0', 'S', 'EH1', 'T', 'OW0']),
- ('acetochlor', ['AA0', 'S', 'EH1', 'T', 'OW0', 'K', 'L', 'AO2', 'R']),
- ('acetone', ['AE1', 'S', 'AH0', 'T', 'OW2', 'N'])]
- >>> # Load the entire cmudict corpus into a Python dictionary:
- >>> transcr = cmudict.dict()
- >>> print([transcr[w][0] for w in 'Natural Language Tool Kit'.lower().split()]) # doctest: +NORMALIZE_WHITESPACE
- [['N', 'AE1', 'CH', 'ER0', 'AH0', 'L'],
- ['L', 'AE1', 'NG', 'G', 'W', 'AH0', 'JH'],
- ['T', 'UW1', 'L'],
- ['K', 'IH1', 'T']]
-
-
-WordNet
-=======
-
-Please see the separate WordNet howto.
-
-FrameNet
-========
-
-Please see the separate FrameNet howto.
-
-PropBank
-========
-
-Please see the separate PropBank howto.
-
-SentiWordNet
-============
-
-Please see the separate SentiWordNet howto.
-
-Categorized Corpora
-===================
-
-Several corpora included with NLTK contain documents that have been categorized for
-topic, genre, polarity, etc. In addition to the standard corpus interface, these
-corpora provide access to the list of categories and the mapping between the documents
-and their categories (in both directions). Access the categories using the ``categories()``
-method, e.g.:
-
- >>> from nltk.corpus import brown, movie_reviews, reuters
- >>> brown.categories() # doctest: +NORMALIZE_WHITESPACE
- ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor',
- 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
- >>> movie_reviews.categories()
- ['neg', 'pos']
- >>> reuters.categories() # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
- ['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa',
- 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn',
- 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', ...]
-
-This method has an optional argument that specifies a document or a list
-of documents, allowing us to map from (one or more) documents to (one or more) categories:
-
- >>> brown.categories('ca01')
- ['news']
- >>> brown.categories(['ca01','cb01'])
- ['editorial', 'news']
- >>> reuters.categories('training/9865')
- ['barley', 'corn', 'grain', 'wheat']
- >>> reuters.categories(['training/9865', 'training/9880'])
- ['barley', 'corn', 'grain', 'money-fx', 'wheat']
-
-We can go back the other way using the optional argument of the ``fileids()`` method:
-
- >>> reuters.fileids('barley') # doctest: +ELLIPSIS
- ['test/15618', 'test/15649', 'test/15676', 'test/15728', 'test/15871', ...]
-
-Both the ``categories()`` and ``fileids()`` methods return a sorted list containing
-no duplicates.
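-
-For example, reusing the query from above, we can check both properties
-directly:
-
- >>> cats = reuters.categories(['training/9865', 'training/9880'])
- >>> cats == sorted(set(cats))
- True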
-
-In addition to mapping between categories and documents, these corpora permit
-direct access to their contents via the categories. Instead of accessing a subset
-of a corpus by specifying one or more fileids, we can identify one or more categories, e.g.:
-
- >>> brown.tagged_words(categories='news')
- [('The', 'AT'), ('Fulton', 'NP-TL'), ...]
- >>> brown.sents(categories=['editorial','reviews']) # doctest: +NORMALIZE_WHITESPACE
- [['Assembly', 'session', 'brought', 'much', 'good'], ['The', 'General',
- 'Assembly', ',', 'which', 'adjourns', 'today', ',', 'has', 'performed',
- 'in', 'an', 'atmosphere', 'of', 'crisis', 'and', 'struggle', 'from',
- 'the', 'day', 'it', 'convened', '.'], ...]
-
-Note that it is an error to specify both documents and categories.
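-
-A minimal sketch of the failure mode (marked skip, since the exact
-exception message may vary across NLTK versions):
-
- >>> brown.words('ca01', categories='news') # doctest: +SKIP
- Traceback (most recent call last):
-   ...
- ValueError: Specify fileids or categories, not both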
-
-In the context of a text categorization system, we can easily test if the
-category assigned to a document is correct as follows:
-
- >>> def classify(doc): return 'news' # Trivial classifier
- >>> doc = 'ca01'
- >>> classify(doc) in brown.categories(doc)
- True
-
-
-Other Corpora
-=============
-
-comparative_sentences
----------------------
-A collection of sentences drawn from various sources, especially reviews
-and articles. Each line contains one sentence; the sentences were split
-using a sentence tokenizer. Comparative sentences are annotated with their
-type, entities, features and keywords.
-
- >>> from nltk.corpus import comparative_sentences
- >>> comparison = comparative_sentences.comparisons()[0]
- >>> comparison.text
- ['its', 'fast-forward', 'and', 'rewind', 'work', 'much', 'more', 'smoothly',
- 'and', 'consistently', 'than', 'those', 'of', 'other', 'models', 'i', "'ve",
- 'had', '.']
- >>> comparison.entity_2
- 'models'
- >>> (comparison.feature, comparison.keyword)
- ('rewind', 'more')
- >>> len(comparative_sentences.comparisons())
- 853
-
-opinion_lexicon
----------------
-A list of positive and negative opinion words or sentiment words for English.
-
- >>> from nltk.corpus import opinion_lexicon
- >>> opinion_lexicon.words()[:4]
- ['2-faced', '2-faces', 'abnormal', 'abolish']
-
-The OpinionLexiconCorpusReader also provides shortcuts to retrieve positive/negative
-words:
-
- >>> opinion_lexicon.negative()[:4]
- ['2-faced', '2-faces', 'abnormal', 'abolish']
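-
-The ``positive()`` shortcut works the same way; the entries shown below
-follow the lexicon file's own ordering (marked skip in case that ordering
-changes):
-
- >>> opinion_lexicon.positive()[:4] # doctest: +SKIP
- ['a+', 'abound', 'abounds', 'abundance']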
-
-Note that words returned by the `words()` method of opinion_lexicon are
-sorted by file id, not alphabetically:
-
- >>> opinion_lexicon.words()[0:10]
- ['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable', 'abominably',
- 'abominate', 'abomination', 'abort', 'aborted']
- >>> sorted(opinion_lexicon.words())[0:10]
- ['2-faced', '2-faces', 'a+', 'abnormal', 'abolish', 'abominable', 'abominably',
- 'abominate', 'abomination', 'abort']
-
-ppattach
---------
-The Prepositional Phrase Attachment corpus is a corpus of
-prepositional phrase attachment decisions. Each instance in the
-corpus is encoded as a ``PPAttachment`` object:
-
- >>> from nltk.corpus import ppattach
- >>> ppattach.attachments('training') # doctest: +NORMALIZE_WHITESPACE
- [PPAttachment(sent='0', verb='join', noun1='board',
- prep='as', noun2='director', attachment='V'),
- PPAttachment(sent='1', verb='is', noun1='chairman',
- prep='of', noun2='N.V.', attachment='N'),
- ...]
- >>> inst = ppattach.attachments('training')[0]
- >>> (inst.sent, inst.verb, inst.noun1, inst.prep, inst.noun2)
- ('0', 'join', 'board', 'as', 'director')
- >>> inst.attachment
- 'V'
-
-product_reviews_1 and product_reviews_2
----------------------------------------
-These two datasets contain annotated customer reviews for 5 and 9
-products, respectively, from amazon.com.
-
- >>> from nltk.corpus import product_reviews_1
- >>> camera_reviews = product_reviews_1.reviews('Canon_G3.txt')
- >>> review = camera_reviews[0]
- >>> review.sents()[0]
- ['i', 'recently', 'purchased', 'the', 'canon', 'powershot', 'g3', 'and', 'am',
- 'extremely', 'satisfied', 'with', 'the', 'purchase', '.']
- >>> review.features()
- [('canon powershot g3', '+3'), ('use', '+2'), ('picture', '+2'),
- ('picture quality', '+1'), ('picture quality', '+1'), ('camera', '+2'),
- ('use', '+2'), ('feature', '+1'), ('picture quality', '+3'), ('use', '+1'),
- ('option', '+1')]
-
-It is also possible to access the same information directly from the corpus reader:
-
- >>> product_reviews_1.features('Canon_G3.txt')
- [('canon powershot g3', '+3'), ('use', '+2'), ...]
-
-We can compute stats for specific product features:
-
- >>> n_reviews = len([(feat,score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
- >>> tot = sum([int(score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
- >>> mean = tot / n_reviews
- >>> print(n_reviews, tot, mean)
- 15 24 1.6
-
-pros_cons
----------
-A collection of pros/cons sentences used to determine context-dependent
-(aspect-dependent) sentiment words, which can then be applied to the
-sentiment analysis of comparative sentences.
-
- >>> from nltk.corpus import pros_cons
- >>> pros_cons.sents(categories='Cons')
- [['East', 'batteries', '!', 'On', '-', 'off', 'switch', 'too', 'easy',
- 'to', 'maneuver', '.'], ['Eats', '...', 'no', ',', 'GULPS', 'batteries'],
- ...]
- >>> pros_cons.words('IntegratedPros.txt')
- ['Easy', 'to', 'use', ',', 'economical', '!', ...]
-
-semcor
-------
-A subset of the Brown Corpus, annotated with WordNet senses.
-
- >>> from nltk.corpus import semcor
- >>> semcor.words('brown2/tagfiles/br-n12.xml') # doctest: +ELLIPSIS
- ['When', 'several', 'minutes', 'had', 'passed', ...]
- >>> sent = semcor.xml('brown2/tagfiles/br-n12.xml').findall('context/p/s')[0]
- >>> for wordform in sent: # iterate over child elements; getchildren() was removed in Python 3.9
- ... print(wordform.text, end=' ')
- ... for key in sorted(wordform.keys()):
- ... print(key + '=' + wordform.get(key), end=' ')
- ... print()
- ...
- When cmd=ignore pos=WRB
- several cmd=done lemma=several lexsn=5:00:00:some(a):00 pos=JJ wnsn=1
- minutes cmd=done lemma=minute lexsn=1:28:00:: pos=NN wnsn=1
- had cmd=done ot=notag pos=VBD
- passed cmd=done lemma=pass lexsn=2:38:03:: pos=VB wnsn=4
- and cmd=ignore pos=CC
- Curt cmd=done lemma=person lexsn=1:03:00:: pn=person pos=NNP rdf=person wnsn=1
- had cmd=done ot=notag pos=VBD
- n't cmd=done lemma=n't lexsn=4:02:00:: pos=RB wnsn=0
- emerged cmd=done lemma=emerge lexsn=2:30:00:: pos=VB wnsn=1
- from cmd=ignore pos=IN
- the cmd=ignore pos=DT
- livery_stable cmd=done lemma=livery_stable lexsn=1:06:00:: pos=NN wnsn=1
- ,
- Brenner cmd=done lemma=person lexsn=1:03:00:: pn=person pos=NNP rdf=person wnsn=1
- re-entered cmd=done lemma=re-enter lexsn=2:38:00:: pos=VB wnsn=1
- the cmd=ignore pos=DT
- hotel cmd=done lemma=hotel lexsn=1:06:00:: pos=NN wnsn=1
- and cmd=ignore pos=CC
- faced cmd=done lemma=face lexsn=2:42:02:: pos=VB wnsn=4
- Summers cmd=done lemma=person lexsn=1:03:00:: pn=person pos=NNP rdf=person wnsn=1
- across cmd=ignore pos=IN
- the cmd=ignore pos=DT
- counter cmd=done lemma=counter lexsn=1:06:00:: pos=NN wnsn=1
- .
-
-senseval
---------
-The Senseval 2 corpus is a word sense disambiguation corpus. Each
-item in the corpus corresponds to a single ambiguous word. For each
-of these words, the corpus contains a list of instances, corresponding
-to occurrences of that word. Each instance provides the word; a list
-of word senses that apply to the word occurrence; and the word's
-context.
-
- >>> from nltk.corpus import senseval
- >>> senseval.fileids()
- ['hard.pos', 'interest.pos', 'line.pos', 'serve.pos']
- >>> senseval.instances('hard.pos')
- ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
- [SensevalInstance(word='hard-a',
- position=20,
- context=[('``', '``'), ('he', 'PRP'), ...('hard', 'JJ'), ...],
- senses=('HARD1',)),
- SensevalInstance(word='hard-a',
- position=10,
- context=[('clever', 'NNP'), ...('hard', 'JJ'), ('time', 'NN'), ...],
- senses=('HARD1',)), ...]
-
-The following code looks at instances of the word 'interest', and
-displays their local context (2 words on each side) and word sense(s):
-
- >>> for inst in senseval.instances('interest.pos')[:10]:
- ... p = inst.position
- ... left = ' '.join(w for (w,t) in inst.context[p-2:p])
- ... word = ' '.join(w for (w,t) in inst.context[p:p+1])
- ... right = ' '.join(w for (w,t) in inst.context[p+1:p+3])
- ... senses = ' '.join(inst.senses)
- ... print('%20s |%10s | %-15s -> %s' % (left, word, right, senses))
- declines in | interest | rates . -> interest_6
- indicate declining | interest | rates because -> interest_6
- in short-term | interest | rates . -> interest_6
- 4 % | interest | in this -> interest_5
- company with | interests | in the -> interest_5
- , plus | interest | . -> interest_6
- set the | interest | rate on -> interest_6
- 's own | interest | , prompted -> interest_4
- principal and | interest | is the -> interest_6
- increase its | interest | to 70 -> interest_5
-
-sentence_polarity
------------------
-The Sentence Polarity dataset contains 5331 positive and 5331 negative processed
-sentences.
-
- >>> from nltk.corpus import sentence_polarity
- >>> sentence_polarity.sents()
- [['simplistic', ',', 'silly', 'and', 'tedious', '.'], ["it's", 'so', 'laddish',
- 'and', 'juvenile', ',', 'only', 'teenage', 'boys', 'could', 'possibly', 'find',
- 'it', 'funny', '.'], ...]
- >>> sentence_polarity.categories()
- ['neg', 'pos']
- >>> sentence_polarity.sents()[1]
- ["it's", 'so', 'laddish', 'and', 'juvenile', ',', 'only', 'teenage', 'boys',
- 'could', 'possibly', 'find', 'it', 'funny', '.']
-
-shakespeare
------------
-The Shakespeare corpus contains a set of Shakespeare plays, formatted
-as XML files. These files are returned as ElementTree objects:
-
- >>> from nltk.corpus import shakespeare
- >>> from xml.etree import ElementTree
- >>> shakespeare.fileids() # doctest: +ELLIPSIS
- ['a_and_c.xml', 'dream.xml', 'hamlet.xml', 'j_caesar.xml', ...]
- >>> play = shakespeare.xml('dream.xml')
- >>> print(play) # doctest: +ELLIPSIS
- <Element 'PLAY' at ...>
- >>> print('%s: %s' % (play[0].tag, play[0].text))
- TITLE: A Midsummer Night's Dream
- >>> personae = [persona.text for persona in
- ... play.findall('PERSONAE/PERSONA')]
- >>> print(personae) # doctest: +ELLIPSIS
- ['THESEUS, Duke of Athens.', 'EGEUS, father to Hermia.', ...]
- >>> # Find and print speakers not listed as personae
- >>> names = [persona.split(',')[0] for persona in personae]
- >>> speakers = set(speaker.text for speaker in
- ... play.findall('*/*/*/SPEAKER'))
- >>> print(sorted(speakers.difference(names))) # doctest: +NORMALIZE_WHITESPACE
- ['ALL', 'COBWEB', 'DEMETRIUS', 'Fairy', 'HERNIA', 'LYSANDER',
- 'Lion', 'MOTH', 'MUSTARDSEED', 'Moonshine', 'PEASEBLOSSOM',
- 'Prologue', 'Pyramus', 'Thisbe', 'Wall']
-
-subjectivity
------------
-The Subjectivity Dataset contains 5000 subjective and 5000 objective processed
-sentences.
-
- >>> from nltk.corpus import subjectivity
- >>> subjectivity.categories()
- ['obj', 'subj']
- >>> subjectivity.sents()[23]
- ['television', 'made', 'him', 'famous', ',', 'but', 'his', 'biggest', 'hits',
- 'happened', 'off', 'screen', '.']
- >>> subjectivity.words(categories='subj')
- ['smart', 'and', 'alert', ',', 'thirteen', ...]
-
-toolbox
--------
-The Toolbox corpus distributed with NLTK contains a sample lexicon and
-several sample texts from the Rotokas language. The Toolbox corpus
-reader returns Toolbox files as XML ElementTree objects. The
-following example loads the Rotokas dictionary, and figures out the
-distribution of part-of-speech tags for reduplicated words.
-
-.. doctest: +SKIP
-
- >>> from nltk.corpus import toolbox
- >>> from nltk.probability import FreqDist
- >>> from xml.etree import ElementTree
- >>> import re
- >>> rotokas = toolbox.xml('rotokas.dic')
- >>> redup_pos_freqdist = FreqDist()
- >>> # Note: we skip over the first record, which is actually
- >>> # the header.
- >>> for record in rotokas[1:]:
- ... lexeme = record.find('lx').text
- ... if re.match(r'(.*)\1$', lexeme):
- ... redup_pos_freqdist[record.find('ps').text] += 1
- >>> for item, count in redup_pos_freqdist.most_common():
- ... print(item, count)
- V 41
- N 14
- ??? 4
-
-This example displays some records from a Rotokas text:
-
-.. doctest: +SKIP
-
- >>> river = toolbox.xml('rotokas/river.txt', key='ref')
- >>> for record in river.findall('record')[:3]:
- ... for piece in record:
- ... if len(piece.text) > 60:
- ... print('%-6s %s...' % (piece.tag, piece.text[:57]))
- ... else:
- ... print('%-6s %s' % (piece.tag, piece.text))
- ref Paragraph 1
- t ``Viapau oisio ra ovaupasi ...
- m viapau oisio ra ovau -pa -si ...
- g NEG this way/like this and forget -PROG -2/3.DL...
- p NEG ??? CONJ V.I -SUFF.V.3 -SUFF.V...
- f ``No ken lus tingting wanema samting papa i bin tok,'' Na...
- fe ``Don't forget what Dad said,'' yelled Naomi.
- ref 2
- t Osa Ira ora Reviti viapau uvupasiva.
- m osa Ira ora Reviti viapau uvu -pa -si ...
- g as/like name and name NEG hear/smell -PROG -2/3...
- p CONJ N.PN CONJ N.PN NEG V.T -SUFF.V.3 -SUF...
- f Tasol Ila na David no bin harim toktok.
- fe But Ila and David took no notice.
- ref 3
- t Ikaupaoro rokosiva ...
- m ikau -pa -oro roko -si -va ...
- g run/hurry -PROG -SIM go down -2/3.DL.M -RP ...
- p V.T -SUFF.V.3 -SUFF.V.4 ADV -SUFF.V.4 -SUFF.VT....
- f Tupela i bin hariap i go long wara .
- fe They raced to the river.
-
-timit
------
-The NLTK data package includes a fragment of the TIMIT
-Acoustic-Phonetic Continuous Speech Corpus. This corpus is broken
-down into small speech samples, each of which is available as a wave
-file, a phonetic transcription, and a tokenized word list.
-
- >>> from nltk.corpus import timit
- >>> print(timit.utteranceids()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
- ['dr1-fvmh0/sa1', 'dr1-fvmh0/sa2', 'dr1-fvmh0/si1466',
- 'dr1-fvmh0/si2096', 'dr1-fvmh0/si836', 'dr1-fvmh0/sx116',
- 'dr1-fvmh0/sx206', 'dr1-fvmh0/sx26', 'dr1-fvmh0/sx296', ...]
-
- >>> item = timit.utteranceids()[5]
- >>> print(timit.phones(item)) # doctest: +NORMALIZE_WHITESPACE
- ['h#', 'k', 'l', 'ae', 's', 'pcl', 'p', 'dh', 'ax',
- 's', 'kcl', 'k', 'r', 'ux', 'ix', 'nx', 'y', 'ax',
- 'l', 'eh', 'f', 'tcl', 't', 'hh', 'ae', 'n', 'dcl',
- 'd', 'h#']
- >>> print(timit.words(item))
- ['clasp', 'the', 'screw', 'in', 'your', 'left', 'hand']
- >>> timit.play(item) # doctest: +SKIP
-
-The corpus reader can combine the word segmentation information with
-the phonemes to produce a single tree structure:
-
- >>> for tree in timit.phone_trees(item):
- ... print(tree)
- (S
- h#
- (clasp k l ae s pcl p)
- (the dh ax)
- (screw s kcl k r ux)
- (in ix nx)
- (your y ax)
- (left l eh f tcl t)
- (hand hh ae n dcl d)
- h#)
-
-The start time and stop time of each phoneme, word, and sentence are
-also available:
-
- >>> print(timit.phone_times(item)) # doctest: +ELLIPSIS
- [('h#', 0, 2190), ('k', 2190, 3430), ('l', 3430, 4326), ...]
- >>> print(timit.word_times(item)) # doctest: +ELLIPSIS
- [('clasp', 2190, 8804), ('the', 8804, 9734), ...]
- >>> print(timit.sent_times(item))
- [('Clasp the screw in your left hand.', 0, 32154)]
-
-We can use these times to play selected pieces of a speech sample:
-
- >>> timit.play(item, 2190, 8804) # 'clasp' # doctest: +SKIP
-
-The corpus reader can also be queried for information about the
-speaker and sentence identifier for a given speech sample:
-
- >>> print(timit.spkrid(item))
- dr1-fvmh0
- >>> print(timit.sentid(item))
- sx116
- >>> print(timit.spkrinfo(timit.spkrid(item))) # doctest: +NORMALIZE_WHITESPACE
- SpeakerInfo(id='VMH0',
- sex='F',
- dr='1',
- use='TRN',
- recdate='03/11/86',
- birthdate='01/08/60',
- ht='5\'05"',
- race='WHT',
- edu='BS',
- comments='BEST NEW ENGLAND ACCENT SO FAR')
-
- >>> # List the speech samples from the same speaker:
- >>> timit.utteranceids(spkrid=timit.spkrid(item)) # doctest: +ELLIPSIS
- ['dr1-fvmh0/sa1', 'dr1-fvmh0/sa2', 'dr1-fvmh0/si1466', ...]
-
-twitter_samples
----------------
-
-Twitter is a well-known microblogging service that allows public data to
-be collected via APIs. NLTK's twitter corpus currently contains a sample
-of 20k Tweets retrieved from the Twitter Streaming API.
-
- >>> from nltk.corpus import twitter_samples
- >>> twitter_samples.fileids()
- ['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']
-
-We follow standard practice in storing full Tweets as line-separated
-JSON. These data structures can be accessed via the `docs()` method.
-In general, however, it is more practical to focus on just the text
-field of the Tweets, which can be accessed via the `strings()` method.
-
- >>> twitter_samples.strings('tweets.20150430-223406.json')
- ['RT @KirkKus: Indirect cost of the UK being in the EU is estimated to be costing Britain \xa3170 billion per year! #BetterOffOut #UKIP', ...]
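-
-The full JSON records can be inspected via the `docs()` method; a brief
-sketch (marked skip, since the field layout follows the Twitter API of
-the time):
-
- >>> tweet = twitter_samples.docs('tweets.20150430-223406.json')[0] # doctest: +SKIP
- >>> tweet['text'] # doctest: +SKIP
- 'RT @KirkKus: Indirect cost of the UK being in the EU is estimated to be costing Britain \xa3170 billion per year! #BetterOffOut #UKIP'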
-
-The default tokenizer for Tweets is specialized for 'casual' text, and
-the `tokenized()` method returns a list of lists of tokens.
-
- >>> twitter_samples.tokenized('tweets.20150430-223406.json')
- [['RT', '@KirkKus', ':', 'Indirect', 'cost', 'of', 'the', 'UK', 'being', 'in', ...],
- ['VIDEO', ':', 'Sturgeon', 'on', 'post-election', 'deals', 'http://t.co/BTJwrpbmOY'], ...]
-
-rte
----
-The RTE (Recognizing Textual Entailment) corpus was derived from the
-RTE1, RTE2 and RTE3 datasets (dev and test data), and consists of a
-list of XML-formatted 'text'/'hypothesis' pairs.
-
- >>> from nltk.corpus import rte
- >>> print(rte.fileids()) # doctest: +ELLIPSIS
- ['rte1_dev.xml', 'rte1_test.xml', 'rte2_dev.xml', ..., 'rte3_test.xml']
- >>> rtepairs = rte.pairs(['rte2_test.xml', 'rte3_test.xml'])
- >>> print(rtepairs) # doctest: +ELLIPSIS
- [<RTEPair: gid=2-8>, <RTEPair: gid=2-9>, <RTEPair: gid=2-15>, ...]
-
-In the gold standard test sets, each pair is labeled according to
-whether or not the text 'entails' the hypothesis; the
-entailment value is mapped to an integer 1 (True) or 0 (False).
-
- >>> rtepairs[5]
- <RTEPair: gid=2-23>
- >>> rtepairs[5].text # doctest: +NORMALIZE_WHITESPACE
- 'His wife Strida won a seat in parliament after forging an alliance
- with the main anti-Syrian coalition in the recent election.'
- >>> rtepairs[5].hyp
- 'Strida elected to parliament.'
- >>> rtepairs[5].value
- 1
-
-The RTE corpus also supports an ``xml()`` method which produces ElementTrees.
-
- >>> xmltree = rte.xml('rte3_dev.xml')
- >>> xmltree # doctest: +SKIP
- <Element entailment-corpus at ...>
- >>> xmltree[7].findtext('t') # doctest: +NORMALIZE_WHITESPACE
- "Mrs. Bush's approval ratings have remained very high, above 80%,
- even as her husband's have recently dropped below 50%."
-
-verbnet
--------
-The VerbNet corpus is a lexicon that divides verbs into classes, based
-on their syntax-semantics linking behavior. The basic elements in the
-lexicon are verb lemmas, such as 'abandon' and 'accept', and verb
-classes, which have identifiers such as 'remove-10.1' and
-'admire-31.2-1'. These class identifiers consist of a representative
-verb selected from the class, followed by a numerical identifier. The
-list of verb lemmas, and the list of class identifiers, can be
-retrieved with the following methods:
-
- >>> from nltk.corpus import verbnet
- >>> verbnet.lemmas()[20:25]
- ['accelerate', 'accept', 'acclaim', 'accompany', 'accrue']
- >>> verbnet.classids()[:5]
- ['accompany-51.7', 'admire-31.2', 'admire-31.2-1', 'admit-65', 'adopt-93']
-
-The `classids()` method may also be used to retrieve the classes that
-a given lemma belongs to:
-
- >>> verbnet.classids('accept')
- ['approve-77', 'characterize-29.2-1-1', 'obtain-13.5.2']
-
-The `classids()` method may additionally be used to retrieve all class
-identifiers within VerbNet, if no argument is passed:
-
- >>> verbnet.classids() # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
- ['accompany-51.7', 'admire-31.2', 'admire-31.2-1', 'admit-65', 'adopt-93',
-  'advise-37.9', 'advise-37.9-1', 'allow-64', 'amalgamate-22.2',
-  'amalgamate-22.2-1', ...]
-
-The primary object in the lexicon is a class record, which is stored
-as an ElementTree xml object. The class record for a given class
-identifier is returned by the `vnclass()` method:
-
- >>> verbnet.vnclass('remove-10.1') # doctest: +ELLIPSIS
- <Element 'VNCLASS' at ...>
-
-The `vnclass()` method also accepts "short" identifiers, such as '10.1':
-
- >>> verbnet.vnclass('10.1') # doctest: +ELLIPSIS
- <Element 'VNCLASS' at ...>
-
-See the Verbnet documentation, or the Verbnet files, for information
-about the structure of this xml. As an example, we can retrieve a
-list of thematic roles for a given Verbnet class:
-
- >>> vn_31_2 = verbnet.vnclass('admire-31.2')
- >>> for themrole in vn_31_2.findall('THEMROLES/THEMROLE'):
- ... print(themrole.attrib['type'], end=' ')
- ... for selrestr in themrole.findall('SELRESTRS/SELRESTR'):
- ... print('[%(Value)s%(type)s]' % selrestr.attrib, end=' ')
- ... print()
- Theme
- Experiencer [+animate]
- Predicate
-
-The Verbnet corpus also provides a variety of pretty printing
-functions that can be used to display the xml contents in a more
-concise form. The simplest such method is `pprint()`:
-
- >>> print(verbnet.pprint('57'))
- weather-57
- Subclasses: (none)
- Members: blow clear drizzle fog freeze gust hail howl lightning mist
- mizzle pelt pour precipitate rain roar shower sleet snow spit spot
- sprinkle storm swelter teem thaw thunder
- Thematic roles:
- * Theme[+concrete +force]
- Frames:
- Intransitive (Expletive Subject)
- Example: It's raining.
- Syntax: LEX[it] LEX[[+be]] VERB
- Semantics:
- * weather(during(E), Weather_type, ?Theme)
- NP (Expletive Subject, Theme Object)
- Example: It's raining cats and dogs.
- Syntax: LEX[it] LEX[[+be]] VERB NP[Theme]
- Semantics:
- * weather(during(E), Weather_type, Theme)
- PP (Expletive Subject, Theme-PP)
- Example: It was pelting with rain.
- Syntax: LEX[it[+be]] VERB PREP[with] NP[Theme]
- Semantics:
- * weather(during(E), Weather_type, Theme)
-
-VerbNet frames link a class's syntax and semantics, and each frame is
-illustrated by an example sentence. These frames are part of the corpus,
-and we can use `frames()` to retrieve the frames for a given VerbNet class.
-
- >>> frame = verbnet.frames('57')
- >>> frame == [{'semantics': [{'arguments': [{'value': 'during(E)', 'type': 'Event'}, {'value': 'Weather_type', 'type': 'VerbSpecific'}, {'value': '?Theme', 'type': 'ThemRole'}], 'predicate_value': 'weather'}], 'example': "It's raining.", 'syntax': [{'pos_tag': 'LEX', 'modifiers': {'value': 'it', 'synrestrs': [], 'selrestrs': []}}, {'pos_tag': 'LEX', 'modifiers': {'value': '[+be]', 'synrestrs': [], 'selrestrs': []}}, {'pos_tag': 'VERB', 'modifiers': {'value': '', 'synrestrs': [], 'selrestrs': []}}], 'description': {'primary': 'Intransitive', 'secondary': 'Expletive Subject'}}, {'semantics': [{'arguments': [{'value': 'during(E)', 'type': 'Event'}, {'value': 'Weather_type', 'type': 'VerbSpecific'}, {'value': 'Theme', 'type': 'ThemRole'}], 'predicate_value': 'weather'}], 'example': "It's raining cats and dogs.", 'syntax': [{'pos_tag': 'LEX', 'modifiers': {'value': 'it', 'synrestrs': [], 'selrestrs': []}}, {'pos_tag': 'LEX', 'modifiers': {'value': '[+be]', 'synrestrs': [], 'selrestrs': []}}, {'pos_tag': 'VERB', 'modifiers': {'value': '', 'synrestrs': [], 'selrestrs': []}}, {'pos_tag': 'NP', 'modifiers': {'value': 'Theme', 'synrestrs': [], 'selrestrs': []}}], 'description': {'primary': 'NP', 'secondary': 'Expletive Subject, Theme Object'}}, {'semantics': [{'arguments': [{'value': 'during(E)', 'type': 'Event'}, {'value': 'Weather_type', 'type': 'VerbSpecific'}, {'value': 'Theme', 'type': 'ThemRole'}], 'predicate_value': 'weather'}], 'example': 'It was pelting with rain.', 'syntax': [{'pos_tag': 'LEX', 'modifiers': {'value': 'it[+be]', 'synrestrs': [], 'selrestrs': []}}, {'pos_tag': 'VERB', 'modifiers': {'value': '', 'synrestrs': [], 'selrestrs': []}}, {'pos_tag': 'PREP', 'modifiers': {'value': 'with', 'synrestrs': [], 'selrestrs': []}}, {'pos_tag': 'NP', 'modifiers': {'value': 'Theme', 'synrestrs': [], 'selrestrs': []}}], 'description': {'primary': 'PP', 'secondary': 'Expletive Subject, Theme-PP'}}]
- True
-
-The VerbNet corpus also lets us access thematic roles individually, using `themroles()`.
-
- >>> themroles = verbnet.themroles('57')
- >>> themroles == [{'modifiers': [{'type': 'concrete', 'value': '+'}, {'type': 'force', 'value': '+'}], 'type': 'Theme'}]
- True
-
-VerbNet classes may also have subclasses, which share the syntactic and
-semantic properties of their superclass while differing from it in certain
-respects. The VerbNet corpus allows us to access these subclasses using
-`subclasses()`.
-
- >>> print(verbnet.subclasses('9.1')) # using '9.1', since '57' has no subclasses
- ['put-9.1-1', 'put-9.1-2']
-
-
-nps_chat
---------
-
-The NPS Chat Corpus, Release 1.0, consists of over 10,000 posts from
-age-specific chat rooms, which have been anonymized, POS-tagged, and
-dialogue-act tagged.
-
- >>> print(nltk.corpus.nps_chat.words())
- ['now', 'im', 'left', 'with', 'this', 'gay', ...]
- >>> print(nltk.corpus.nps_chat.tagged_words())
- [('now', 'RB'), ('im', 'PRP'), ('left', 'VBD'), ...]
- >>> print(nltk.corpus.nps_chat.tagged_posts()) # doctest: +NORMALIZE_WHITESPACE
- [[('now', 'RB'), ('im', 'PRP'), ('left', 'VBD'), ('with', 'IN'),
- ('this', 'DT'), ('gay', 'JJ'), ('name', 'NN')], [(':P', 'UH')], ...]
-
-We can access the XML elements corresponding to individual posts. These elements
-have ``class`` and ``user`` attributes that we can access using ``p.attrib['class']``
-and ``p.attrib['user']``. They also have text content, accessed using ``p.text``.
-
- >>> print(nltk.corpus.nps_chat.xml_posts()) # doctest: +ELLIPSIS
- [<Element 'Post' at 0...>, <Element 'Post' at 0...>, ...]
- >>> posts = nltk.corpus.nps_chat.xml_posts()
- >>> sorted(nltk.FreqDist(p.attrib['class'] for p in posts).keys())
- ['Accept', 'Bye', 'Clarify', 'Continuer', 'Emotion', 'Emphasis',
- 'Greet', 'Other', 'Reject', 'Statement', 'System', 'nAnswer',
- 'whQuestion', 'yAnswer', 'ynQuestion']
- >>> posts[0].text
- 'now im left with this gay name'
-
-In addition to the above methods for accessing tagged text, we can navigate
-the XML structure directly, as follows:
-
- >>> tokens = posts[0].findall('terminals/t')
- >>> [t.attrib['pos'] + "/" + t.attrib['word'] for t in tokens]
- ['RB/now', 'PRP/im', 'VBD/left', 'IN/with', 'DT/this', 'JJ/gay', 'NN/name']
-
-multext_east
-------------
-
-The Multext-East Corpus consists of POS-tagged versions of George Orwell's novel
-1984 in 12 languages: English, Czech, Hungarian, Macedonian, Slovenian, Serbian,
-Slovak, Romanian, Estonian, Farsi, Bulgarian and Polish.
-The corpus can be accessed using the usual methods for tagged corpora. The tagset
-can be transformed from the Multext-East specific MSD tags to the Universal tagset
-using the "tagset" parameter of all functions returning tagged parts of the corpus.
-
- >>> print(nltk.corpus.multext_east.words("oana-en.xml"))
- ['It', 'was', 'a', 'bright', ...]
- >>> print(nltk.corpus.multext_east.tagged_words("oana-en.xml"))
- [('It', '#Pp3ns'), ('was', '#Vmis3s'), ('a', '#Di'), ...]
- >>> print(nltk.corpus.multext_east.tagged_sents("oana-en.xml", "universal"))
- [[('It', 'PRON'), ('was', 'VERB'), ('a', 'DET'), ...]
-
-
-
----------------------
-Corpus Reader Classes
----------------------
-
-NLTK's *corpus reader* classes are used to access the contents of a
-diverse set of corpora. Each corpus reader class is specialized to
-handle a specific corpus format. Examples include the
-`PlaintextCorpusReader`, which handles corpora that consist of a set
-of unannotated text files, and the `BracketParseCorpusReader`, which
-handles corpora that consist of files containing
-parenthesis-delineated parse trees.
-
-Automatically Created Corpus Reader Instances
-=============================================
-
-When the `nltk.corpus` module is imported, it automatically creates a
-set of corpus reader instances that can be used to access the corpora
-in the NLTK data distribution. Here is a small sample of those
-corpus reader instances:
-
- >>> import nltk
- >>> nltk.corpus.brown # doctest: +ELLIPSIS
- <CategorizedTaggedCorpusReader ...>
- >>> nltk.corpus.treebank # doctest: +ELLIPSIS
- <BracketParseCorpusReader ...>
- >>> nltk.corpus.names # doctest: +ELLIPSIS
- <WordListCorpusReader ...>
- >>> nltk.corpus.genesis # doctest: +ELLIPSIS
- <PlaintextCorpusReader ...>
- >>> nltk.corpus.inaugural # doctest: +ELLIPSIS
- <PlaintextCorpusReader ...>
-
-This sample illustrates that different corpus reader classes are used
-to read different corpora, and that the same corpus reader class may
-be used for more than one corpus (e.g., ``genesis`` and ``inaugural``).
-
-Creating New Corpus Reader Instances
-====================================
-
-Although the `nltk.corpus` module automatically creates corpus reader
-instances for the corpora in the NLTK data distribution, you may
-sometimes need to create your own corpus reader. In particular, you
-would need to create your own corpus reader if you want...
-
-- To access a corpus that is not included in the NLTK data
- distribution.
-
-- To access a full copy of a corpus for which the NLTK data
- distribution only provides a sample.
-
-- To access a corpus using a customized corpus reader (e.g., with
- a customized tokenizer).
-
-To create a new corpus reader, you will first need to look up the
-signature for that corpus reader's constructor. Different corpus
-readers have different constructor signatures, but most of the
-constructor signatures have the basic form::
-
- SomeCorpusReader(root, files, ...options...)
-
-Here ``root`` is an absolute path to the directory containing the
-corpus data files; ``files`` is either a list of file names (relative
-to ``root``) or a regexp specifying which files should be included;
-and ``options`` are additional reader-specific options. For example,
-we can create a customized corpus reader for the genesis corpus that
-uses a different sentence tokenizer as follows:
-
- >>> # Find the directory where the corpus lives.
- >>> genesis_dir = nltk.data.find('corpora/genesis')
- >>> # Create our custom sentence tokenizer.
- >>> my_sent_tokenizer = nltk.RegexpTokenizer('[^.!?]+')
- >>> # Create the new corpus reader object.
- >>> my_genesis = nltk.corpus.PlaintextCorpusReader(
- ... genesis_dir, r'.*\.txt', sent_tokenizer=my_sent_tokenizer)
- >>> # Use the new corpus reader object.
- >>> print(my_genesis.sents('english-kjv.txt')[0]) # doctest: +NORMALIZE_WHITESPACE
- ['In', 'the', 'beginning', 'God', 'created', 'the', 'heaven',
- 'and', 'the', 'earth']
-
-If you wish to read your own plaintext corpus, which is stored in the
-directory '/usr/share/some-corpus', then you can create a corpus
-reader for it with::
-
- >>> my_corpus = nltk.corpus.PlaintextCorpusReader(
- ... '/usr/share/some-corpus', r'.*\.txt') # doctest: +SKIP
-
-For a complete list of corpus reader subclasses, see the API
-documentation for `nltk.corpus.reader`.
-
-Corpus Types
-============
-
-Corpora vary widely in the types of content they include. This is
-reflected in the fact that the base class `CorpusReader` only defines
-a few general-purpose methods for listing and accessing the files that
-make up a corpus. It is up to the subclasses to define *data access
-methods* that provide access to the information in the corpus.
-However, corpus reader subclasses should be consistent in their
-definitions of these data access methods wherever possible.
-
-At a high level, corpora can be divided into three basic types:
-
-- A *token corpus* contains information about specific occurrences of
- language use (or linguistic tokens), such as dialogues or written
- texts. Examples of token corpora are collections of written text
- and collections of speech.
-
-- A *type corpus*, or *lexicon*, contains information about a coherent
- set of lexical items (or linguistic types). Examples of lexicons
- are dictionaries and word lists.
-
-- A *language description corpus* contains information about a set of
- non-lexical linguistic constructs, such as grammar rules.
-
-However, many individual corpora blur the distinctions between these
-types. For example, corpora that are primarily lexicons may include
-token data in the form of example sentences; and corpora that are
-primarily token corpora may be accompanied by one or more word lists
-or other lexical data sets.
-
-Because corpora vary so widely in their information content, we have
-decided that it would not be wise to use separate corpus reader base
-classes for different corpus types. Instead, we simply try to make
-the corpus readers consistent wherever possible, but let them differ
-where the underlying data itself differs.
-
-Common Corpus Reader Methods
-============================
-
-As mentioned above, there are only a handful of methods that all
-corpus readers are guaranteed to implement. These methods provide
-access to the files that contain the corpus data. Every corpus is
-assumed to consist of one or more files, all located in a common root
-directory (or in subdirectories of that root directory). The absolute
-path to the root directory is stored in the ``root`` property:
-
- >>> import os
- >>> str(nltk.corpus.genesis.root).replace(os.path.sep,'/') # doctest: +ELLIPSIS
- '.../nltk_data/corpora/genesis'
-
-Each file within the corpus is identified by a platform-independent
-identifier, which is basically a path string that uses ``/`` as the
-path separator. That is, this identifier can be converted to a relative
-path as follows:
-
- >>> some_corpus_file_id = nltk.corpus.reuters.fileids()[0]
- >>> import os.path
- >>> os.path.normpath(some_corpus_file_id).replace(os.path.sep,'/')
- 'test/14826'
-
-To get a list of all data files that make up a corpus, use the
-``fileids()`` method. In some corpora, these files will not all contain
-the same type of data; for example, for the ``nltk.corpus.timit``
-corpus, ``fileids()`` will return a list including text files, word
-segmentation files, phonetic transcription files, sound files, and
-metadata files. For corpora with diverse file types, the ``fileids()``
-method will often take one or more optional arguments, which can be
-used to get a list of the files with a specific file type:
-
- >>> nltk.corpus.timit.fileids() # doctest: +ELLIPSIS
- ['dr1-fvmh0/sa1.phn', 'dr1-fvmh0/sa1.txt', 'dr1-fvmh0/sa1.wav', ...]
- >>> nltk.corpus.timit.fileids('phn') # doctest: +ELLIPSIS
- ['dr1-fvmh0/sa1.phn', 'dr1-fvmh0/sa2.phn', 'dr1-fvmh0/si1466.phn', ...]
-
-In some corpora, the files are divided into distinct categories. For
-these corpora, the ``fileids()`` method takes an optional argument,
-which can be used to get a list of the files within a specific category:
-
- >>> nltk.corpus.brown.fileids('hobbies') # doctest: +ELLIPSIS
- ['ce01', 'ce02', 'ce03', 'ce04', 'ce05', 'ce06', 'ce07', ...]
-
-The ``abspath()`` method can be used to find the absolute path to a
-corpus file, given its file identifier:
-
- >>> str(nltk.corpus.brown.abspath('ce06')).replace(os.path.sep,'/') # doctest: +ELLIPSIS
- '.../corpora/brown/ce06'
-
-The ``abspaths()`` method can be used to find the absolute paths for
-one corpus file, a list of corpus files, or (if no fileids are
-specified) all corpus files.
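-
-For example:
-
- >>> print(str(nltk.corpus.brown.abspaths(['ca01'])).replace('\\\\','/')) # doctest: +ELLIPSIS
- [FileSystemPathPointer('.../corpora/brown/ca01')]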
-
-This method is mainly useful as a helper method when defining corpus
-data access methods, since data access methods can usually be called
-with a string argument (to get a view for a specific file), with a
-list argument (to get a view for a specific list of files), or with no
-argument (to get a view for the whole corpus).
-
-Data Access Methods
-===================
-
-Individual corpus reader subclasses typically extend this basic set of
-file-access methods with one or more *data access methods*, which provide
-easy access to the data contained in the corpus. The signatures for
-data access methods often have the basic form::
-
- corpus_reader.some_data_access(fileids=None, ...options...)
-
-Here ``fileids`` can be a single file identifier string (to get a view
-for a specific file); a list of file identifier strings (to get a view
-for a specific list of files); or None (to get a view for the entire
-corpus). Some of the common data access methods, and their return
-types, are:
-
- - *corpus*.words(): list of str
- - *corpus*.sents(): list of (list of str)
- - *corpus*.paras(): list of (list of (list of str))
- - *corpus*.tagged_words(): list of (str,str) tuple
- - *corpus*.tagged_sents(): list of (list of (str,str))
- - *corpus*.tagged_paras(): list of (list of (list of (str,str)))
- - *corpus*.chunked_sents(): list of (Tree with (str,str) leaves)
- - *corpus*.parsed_sents(): list of (Tree with str leaves)
- - *corpus*.parsed_paras(): list of (list of (Tree with str leaves))
- - *corpus*.xml(): a single XML ElementTree
- - *corpus*.raw(): str (unprocessed corpus contents)
-
-For example, the `words()` method is supported by many different
-corpora, and returns a flat list of word strings:
-
- >>> nltk.corpus.brown.words()
- ['The', 'Fulton', 'County', 'Grand', 'Jury', ...]
- >>> nltk.corpus.treebank.words()
- ['Pierre', 'Vinken', ',', '61', 'years', 'old', ...]
- >>> nltk.corpus.conll2002.words()
- ['Sao', 'Paulo', '(', 'Brasil', ')', ',', '23', ...]
- >>> nltk.corpus.genesis.words()
- ['In', 'the', 'beginning', 'God', 'created', ...]
-
-On the other hand, the `tagged_words()` method is only supported by
-corpora that include part-of-speech annotations:
-
- >>> nltk.corpus.brown.tagged_words()
- [('The', 'AT'), ('Fulton', 'NP-TL'), ...]
- >>> nltk.corpus.treebank.tagged_words()
- [('Pierre', 'NNP'), ('Vinken', 'NNP'), ...]
- >>> nltk.corpus.conll2002.tagged_words()
- [('Sao', 'NC'), ('Paulo', 'VMI'), ('(', 'Fpa'), ...]
- >>> nltk.corpus.genesis.tagged_words()
- Traceback (most recent call last):
- ...
- AttributeError: 'PlaintextCorpusReader' object has no attribute 'tagged_words'
-
-Although most corpus readers use file identifiers to index their
-content, some corpora use different identifiers instead. For example,
-the data access methods for the ``timit`` corpus use *utterance
-identifiers* to select which corpus items should be returned:
-
- >>> nltk.corpus.timit.utteranceids() # doctest: +ELLIPSIS
- ['dr1-fvmh0/sa1', 'dr1-fvmh0/sa2', 'dr1-fvmh0/si1466', ...]
- >>> nltk.corpus.timit.words('dr1-fvmh0/sa2')
- ["don't", 'ask', 'me', 'to', 'carry', 'an', 'oily', 'rag', 'like', 'that']
-
-Attempting to call ``timit``\ 's data access methods with a file
-identifier will result in an exception:
-
- >>> nltk.corpus.timit.fileids() # doctest: +ELLIPSIS
- ['dr1-fvmh0/sa1.phn', 'dr1-fvmh0/sa1.txt', 'dr1-fvmh0/sa1.wav', ...]
- >>> nltk.corpus.timit.words('dr1-fvmh0/sa1.txt') # doctest: +SKIP
- Traceback (most recent call last):
- ...
- IOError: No such file or directory: '.../dr1-fvmh0/sa1.txt.wrd'
-
-As another example, the ``propbank`` corpus defines the ``roleset()``
-method, which expects a roleset identifier, not a file identifier:
-
- >>> roleset = nltk.corpus.propbank.roleset('eat.01')
- >>> from xml.etree import ElementTree as ET
- >>> print(ET.tostring(roleset).decode('utf8')) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
- <roleset id="eat.01" name="consume" vncls="39.1">
- <roles>
- <role descr="consumer, eater" n="0">...</role>...
- </roles>...
- </roleset>...
-
-Stream Backed Corpus Views
-==========================
-An important feature of NLTK's corpus readers is that many of them
-access the underlying data files using "corpus views." A *corpus
-view* is an object that acts like a simple data structure (such as a
-list), but does not store the data elements in memory; instead, data
-elements are read from the underlying data files on an as-needed
-basis.
-
-By only loading items from the file on an as-needed basis, corpus
-views maintain both memory efficiency and responsiveness. The memory
-efficiency of corpus readers is important because some corpora contain
-very large amounts of data, and storing the entire data set in memory
-could overwhelm many machines. The responsiveness is important when
-experimenting with corpora in interactive sessions and in in-class
-demonstrations.
-
-The most common corpus view is the `StreamBackedCorpusView`, which
-acts as a read-only list of tokens. Two additional corpus view
-classes, `ConcatenatedCorpusView` and `LazySubsequence`, make it
-possible to create concatenations and take slices of
-`StreamBackedCorpusView` objects without actually storing the
-resulting list-like object's elements in memory.
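-
-For example, constructing a view reads no tokens, and a slice reads only
-the blocks it needs (marked skip, since the repr of lazy sequences may
-vary across versions):
-
- >>> words = nltk.corpus.brown.words() # a corpus view; nothing is read yet
- >>> words[3:6] # doctest: +SKIP
- ['Grand', 'Jury', 'said']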
-
-In the future, we may add additional corpus views that act like other
-basic data structures, such as dictionaries.
-
-Writing New Corpus Readers
-==========================
-
-In order to add support for new corpus formats, it is necessary to
-define new corpus reader classes. For many corpus formats, writing
-new corpus readers is relatively straightforward. In this section,
-we'll describe what's involved in creating a new corpus reader. If
-you do create a new corpus reader, we encourage you to contribute it
-back to the NLTK project.
-
-Don't Reinvent the Wheel
-------------------------
-Before you start writing a new corpus reader, you should check to be
-sure that the desired format can't be read using an existing corpus
-reader with appropriate constructor arguments. For example, although
-the `TaggedCorpusReader` assumes that words and tags are separated by
-``/`` characters by default, an alternative tag-separation character
-can be specified via the ``sep`` constructor argument. You should
-also check whether the new corpus format can be handled by subclassing
-an existing corpus reader, and tweaking a few methods or variables.
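-
-For instance, a tagged corpus whose words and tags are separated by ``|``
-can be read with the stock reader; a minimal sketch (the path is
-hypothetical):
-
- >>> from nltk.corpus.reader import TaggedCorpusReader
- >>> reader = TaggedCorpusReader('/path/to/corpus', r'.*\.pos', sep='|') # doctest: +SKIP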
-
-Design
-------
-If you decide to write a new corpus reader from scratch, then you
-should first decide which data access methods you want the reader to
-provide, and what their signatures should be. You should look at
-existing corpus readers that process corpora with similar data
-contents, and try to be consistent with those corpus readers whenever
-possible.
-
-You should also consider what sets of identifiers are appropriate for
-the corpus format. Where it's practical, file identifiers should be
-used. However, for some corpora, it may make sense to use additional
-sets of identifiers. Each set of identifiers should have a distinct
-name (e.g., fileids, utteranceids, rolesets); and you should be consistent
-in using that name to refer to that identifier. Do not use parameter
-names like ``id``, which leave it unclear what type of identifier is
-required.
-
-Once you've decided what data access methods and identifiers are
-appropriate for your corpus, you should decide if there are any
-customizable parameters that you'd like the corpus reader to handle.
-These parameters make it possible to use a single corpus reader to
-handle a wider variety of corpora. The ``sep`` argument for
-`TaggedCorpusReader`, mentioned above, is an example of a customizable
-corpus reader parameter.
-
-Implementation
---------------
-
-Constructor
-~~~~~~~~~~~
-If your corpus reader implements any customizable parameters, then
-you'll need to override the constructor. Typically, the new
-constructor will first call its base class's constructor, and then
-store the customizable parameters. For example, the
-`ConllChunkCorpusReader`\ 's constructor is defined as follows:
-
- def __init__(self, root, fileids, chunk_types, encoding='utf8',
- tagset=None, separator=None):
- ConllCorpusReader.__init__(
- self, root, fileids, ('words', 'pos', 'chunk'),
- chunk_types=chunk_types, encoding=encoding,
- tagset=tagset, separator=separator)
-
-If your corpus reader does not implement any customization parameters,
-then you can often just inherit the base class's constructor.
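-
-For example, a subclass that merely presets a default for an existing
-parameter can be sketched as follows (the class name is hypothetical):
-
- >>> from nltk.corpus.reader import TaggedCorpusReader
- >>> class PipeTaggedCorpusReader(TaggedCorpusReader):
- ...     def __init__(self, root, fileids, **kwargs):
- ...         # Preset the separator, then defer to the base constructor.
- ...         kwargs.setdefault('sep', '|')
- ...         TaggedCorpusReader.__init__(self, root, fileids, **kwargs)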
-
-Data Access Methods
-~~~~~~~~~~~~~~~~~~~
-
-The most common type of data access method takes an argument
-identifying which files to access, and returns a view covering those
-files. This argument may be a single file identifier string (to get a
-view for a specific file); a list of file identifier strings (to get a
-view for a specific list of files); or None (to get a view for the
-entire corpus). The method's implementation converts this argument to
-a list of path names using the `abspaths()` method, which handles all
-three value types (string, list, and None):
-
- >>> print(str(nltk.corpus.brown.abspaths()).replace('\\\\','/')) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
- [FileSystemPathPointer('.../corpora/brown/ca01'),
- FileSystemPathPointer('.../corpora/brown/ca02'), ...]
- >>> print(str(nltk.corpus.brown.abspaths('ce06')).replace('\\\\','/')) # doctest: +ELLIPSIS
- [FileSystemPathPointer('.../corpora/brown/ce06')]
- >>> print(str(nltk.corpus.brown.abspaths(['ce06', 'ce07'])).replace('\\\\','/')) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
- [FileSystemPathPointer('.../corpora/brown/ce06'),
- FileSystemPathPointer('.../corpora/brown/ce07')]
-
-An example of this type of method is the `words()` method, defined by
-the `PlaintextCorpusReader` as follows:
-
- >>> def words(self, fileids=None):
- ... return concat([self.CorpusView(fileid, self._read_word_block)
- ... for fileid in self.abspaths(fileids)])
-
-This method first uses `abspaths()` to convert ``fileids`` to a list of
-absolute paths. It then creates a corpus view for each file, using
-the `PlaintextCorpusReader._read_word_block()` method to read elements
-from the data file (see the discussion of corpus views below).
-Finally, it combines these corpus views using the
-`nltk.corpus.reader.util.concat()` function.
-
-When writing a corpus reader for a corpus that is never expected to be
-very large, it can sometimes be appropriate to read the files
-directly, rather than using a corpus view. For example, the
-`WordListCorpusReader` class defines its `words()` method as follows:
-
- >>> def words(self, fileids=None):
- ... return concat([[w for w in open(fileid).read().split('\n') if w]
- ... for fileid in self.abspaths(fileids)])
-
-(This is usually more appropriate for lexicons than for token corpora.)
-
-If the type of data returned by a data access method is one for which
-NLTK has a conventional representation (e.g., words, tagged words, and
-parse trees), then you should use that representation. Otherwise, you
-may find it necessary to define your own representation. For data
-structures that are relatively corpus-specific, it's usually best to
-define new classes for these elements. For example, the ``propbank``
-corpus defines the `PropbankInstance` class to store the semantic role
-labeling instances described by the corpus; and the ``ppattach``
-corpus defines the `PPAttachment` class to store the prepositional
-attachment instances described by the corpus.
-
-Corpus Views
-~~~~~~~~~~~~
-.. (Much of the content for this section is taken from the
- StreamBackedCorpusView docstring.)
-
-The heart of a `StreamBackedCorpusView` is its *block reader*
-function, which reads zero or more tokens from a stream, and returns
-them as a list. A very simple example of a block reader is:
-
- >>> def simple_block_reader(stream):
- ... return stream.readline().split()
-
-This simple block reader reads a single line at a time, and returns a
-single token (consisting of a string) for each whitespace-separated
-substring on the line. A `StreamBackedCorpusView` built from this
-block reader will act like a read-only list of all the
-whitespace-separated tokens in an underlying file.
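-
-Such a view can be constructed by passing the block reader to the
-`StreamBackedCorpusView` constructor; a minimal sketch (the path is
-hypothetical):
-
- >>> from nltk.corpus.reader.util import StreamBackedCorpusView
- >>> view = StreamBackedCorpusView('/path/to/data.txt', simple_block_reader) # doctest: +SKIP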
-
-When deciding how to define the block reader for a given corpus,
-careful consideration should be given to the size of blocks handled by
-the block reader. Smaller block sizes will increase the memory
-requirements of the corpus view's internal data structures (by 2
-integers per block). On the other hand, larger block sizes may
-decrease performance for random access to the corpus. (But note that
-larger block sizes will *not* decrease performance for iteration.)
-
-Internally, the `StreamBackedCorpusView` class maintains a partial
-mapping from token index to file position, with one entry per block.
-When a token with a given index *i* is requested, the corpus view
-constructs it as follows:
-
-1. First, it searches the toknum/filepos mapping for the token index
- closest to (but less than or equal to) *i*.
-
-2. Then, starting at the file position corresponding to that index, it
- reads one block at a time using the block reader until it reaches
- the requested token.
-
-The toknum/filepos mapping is created lazily: it is initially empty,
-but every time a new block is read, the block's initial token is added
-to the mapping. (Thus, the toknum/filepos map has one entry per
-block.)
-
-You can create your own corpus view in one of two ways:
-
-1. Call the `StreamBackedCorpusView` constructor, and provide your
- block reader function via the ``block_reader`` argument.
-
-2. Subclass `StreamBackedCorpusView`, and override the
- `read_block()` method.
-
-The first option is usually easier, but the second option can allow
-you to write a single `read_block` method whose behavior can be
-customized by different parameters to the subclass's constructor. For
-an example of this design pattern, see the `TaggedCorpusView` class,
-which is used by `TaggedCorpusReader`.
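-
-As an illustrative sketch of the second option (the subclass below is
-hypothetical, not part of NLTK), a constructor parameter can change
-what `read_block()` returns:
-
- >>> from nltk.corpus.reader.util import StreamBackedCorpusView
- >>> class UppercaseCorpusView(StreamBackedCorpusView):
- ...     # Hypothetical subclass whose behavior is customized by a
- ...     # constructor flag, in the style of TaggedCorpusView.
- ...     def __init__(self, fileid, uppercase=False, **kwargs):
- ...         self._uppercase = uppercase
- ...         StreamBackedCorpusView.__init__(self, fileid, **kwargs)
- ...     def read_block(self, stream):
- ...         words = stream.readline().split()
- ...         return [w.upper() for w in words] if self._uppercase else words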
-
-----------------
-Regression Tests
-----------------
-
-The following helper functions are used to create and then delete
-testing corpora that are stored in temporary directories. These
-testing corpora are used to make sure the readers work correctly.
-
- >>> import tempfile, os.path, textwrap
- >>> def make_testcorpus(ext='', **fileids):
- ... root = tempfile.mkdtemp()
- ... for fileid, contents in fileids.items():
- ... fileid += ext
- ... f = open(os.path.join(root, fileid), 'w')
- ... f.write(textwrap.dedent(contents))
- ... f.close()
- ... return root
- >>> def del_testcorpus(root):
- ... for fileid in os.listdir(root):
- ... os.remove(os.path.join(root, fileid))
- ... os.rmdir(root)
-
-Plaintext Corpus Reader
-=======================
-The plaintext corpus reader is used to access corpora that consist of
-unprocessed plaintext data. It assumes that paragraph breaks are
-indicated by blank lines. Sentences and words can be tokenized using
-the default tokenizers, or by custom tokenizers specified as
-parameters to the constructor.
-
- >>> root = make_testcorpus(ext='.txt',
- ... a="""\
- ... This is the first sentence. Here is another
- ... sentence! And here's a third sentence.
- ...
- ... This is the second paragraph. Tokenization is currently
- ... fairly simple, so the period in Mr. gets tokenized.
- ... """,
- ... b="""This is the second file.""")
-
- >>> from nltk.corpus.reader.plaintext import PlaintextCorpusReader
-
-The list of documents can be specified explicitly, or implicitly (using a
-regexp). (The ``ext`` argument to ``make_testcorpus`` above specifies a
-file extension.)
-
- >>> corpus = PlaintextCorpusReader(root, ['a.txt', 'b.txt'])
- >>> corpus.fileids()
- ['a.txt', 'b.txt']
- >>> corpus = PlaintextCorpusReader(root, '.*\.txt')
- >>> corpus.fileids()
- ['a.txt', 'b.txt']
-
-The directory containing the corpus is ``corpus.root``:
-
- >>> str(corpus.root) == str(root)
- True
-
-We can get a list of words, or the raw string:
-
- >>> corpus.words()
- ['This', 'is', 'the', 'first', 'sentence', '.', ...]
- >>> corpus.raw()[:40]
- 'This is the first sentence. Here is ano'
-
-Check that reading individual documents works, and reading all documents at
-once works:
-
- >>> len(corpus.words()), [len(corpus.words(d)) for d in corpus.fileids()]
- (46, [40, 6])
- >>> corpus.words('a.txt')
- ['This', 'is', 'the', 'first', 'sentence', '.', ...]
- >>> corpus.words('b.txt')
- ['This', 'is', 'the', 'second', 'file', '.']
- >>> corpus.words()[:4], corpus.words()[-4:]
- (['This', 'is', 'the', 'first'], ['the', 'second', 'file', '.'])
-
-We're done with the test corpus:
-
- >>> del_testcorpus(root)
-
-Test the plaintext corpora that come with nltk:
-
- >>> from nltk.corpus import abc, genesis, inaugural
- >>> from nltk.corpus import state_union, webtext
- >>> for corpus in (abc, genesis, inaugural, state_union,
- ... webtext):
- ... print(str(corpus).replace('\\\\','/'))
- ... print(' ', repr(corpus.fileids())[:60])
- ... print(' ', repr(corpus.words()[:10])[:60])
- <PlaintextCorpusReader in '.../nltk_data/corpora/ab...'>
- ['rural.txt', 'science.txt']
- ['PM', 'denies', 'knowledge', 'of', 'AWB', ...
- <PlaintextCorpusReader in '.../nltk_data/corpora/genesi...'>
- ['english-kjv.txt', 'english-web.txt', 'finnish.txt', ...
- ['In', 'the', 'beginning', 'God', 'created', 'the', ...
- <PlaintextCorpusReader in '.../nltk_data/corpora/inaugura...'>
- ['1789-Washington.txt', '1793-Washington.txt', ...
- ['Fellow', '-', 'Citizens', 'of', 'the', 'Senate', ...
- <PlaintextCorpusReader in '.../nltk_data/corpora/state_unio...'>
- ['1945-Truman.txt', '1946-Truman.txt', ...
- ['PRESIDENT', 'HARRY', 'S', '.', 'TRUMAN', "'", ...
- <PlaintextCorpusReader in '.../nltk_data/corpora/webtex...'>
- ['firefox.txt', 'grail.txt', 'overheard.txt', ...
- ['Cookie', 'Manager', ':', '"', 'Don', "'", 't', ...
-
-
-Tagged Corpus Reader
-====================
-The tagged corpus reader can give us words, sentences, and paragraphs,
-each tagged or untagged. All of the read methods can take one item
-(in which case they return the contents of that file) or a list of
-documents (in which case they concatenate the contents of those files).
-By default, they apply to all documents in the corpus.
-
- >>> root = make_testcorpus(
- ... a="""\
- ... This/det is/verb the/det first/adj sentence/noun ./punc
- ... Here/det is/verb another/adj sentence/noun ./punc
- ... Note/verb that/comp you/pron can/verb use/verb \
- ... any/noun tag/noun set/noun
- ...
- ... This/det is/verb the/det second/adj paragraph/noun ./punc
- ... word/n without/adj a/det tag/noun :/: hello ./punc
- ... """,
- ... b="""\
- ... This/det is/verb the/det second/adj file/noun ./punc
- ... """)
-
- >>> from nltk.corpus.reader.tagged import TaggedCorpusReader
- >>> corpus = TaggedCorpusReader(root, list('ab'))
- >>> corpus.fileids()
- ['a', 'b']
- >>> str(corpus.root) == str(root)
- True
- >>> corpus.words()
- ['This', 'is', 'the', 'first', 'sentence', '.', ...]
- >>> corpus.sents() # doctest: +ELLIPSIS
- [['This', 'is', 'the', 'first', ...], ['Here', 'is', 'another'...], ...]
- >>> corpus.paras() # doctest: +ELLIPSIS
- [[['This', ...], ['Here', ...], ...], [['This', ...], ...], ...]
- >>> corpus.tagged_words() # doctest: +ELLIPSIS
- [('This', 'DET'), ('is', 'VERB'), ('the', 'DET'), ...]
- >>> corpus.tagged_sents() # doctest: +ELLIPSIS
- [[('This', 'DET'), ('is', 'VERB'), ...], [('Here', 'DET'), ...], ...]
- >>> corpus.tagged_paras() # doctest: +ELLIPSIS
- [[[('This', 'DET'), ...], ...], [[('This', 'DET'), ...], ...], ...]
- >>> corpus.raw()[:40]
- 'This/det is/verb the/det first/adj sente'
- >>> len(corpus.words()), [len(corpus.words(d)) for d in corpus.fileids()]
- (38, [32, 6])
- >>> len(corpus.sents()), [len(corpus.sents(d)) for d in corpus.fileids()]
- (6, [5, 1])
- >>> len(corpus.paras()), [len(corpus.paras(d)) for d in corpus.fileids()]
- (3, [2, 1])
- >>> print(corpus.words('a'))
- ['This', 'is', 'the', 'first', 'sentence', '.', ...]
- >>> print(corpus.words('b'))
- ['This', 'is', 'the', 'second', 'file', '.']
- >>> del_testcorpus(root)
-
-The Brown Corpus uses the tagged corpus reader:
-
- >>> from nltk.corpus import brown
- >>> brown.fileids() # doctest: +ELLIPSIS
- ['ca01', 'ca02', 'ca03', 'ca04', 'ca05', 'ca06', 'ca07', ...]
- >>> brown.categories() # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
- ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor',
- 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
- >>> print(repr(brown.root).replace('\\\\','/')) # doctest: +ELLIPSIS
- FileSystemPathPointer('.../corpora/brown')
- >>> brown.words()
- ['The', 'Fulton', 'County', 'Grand', 'Jury', ...]
- >>> brown.sents() # doctest: +ELLIPSIS
- [['The', 'Fulton', 'County', 'Grand', ...], ...]
- >>> brown.paras() # doctest: +ELLIPSIS
- [[['The', 'Fulton', 'County', ...]], [['The', 'jury', ...]], ...]
- >>> brown.tagged_words() # doctest: +ELLIPSIS
- [('The', 'AT'), ('Fulton', 'NP-TL'), ...]
- >>> brown.tagged_sents() # doctest: +ELLIPSIS
- [[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ...], ...]
- >>> brown.tagged_paras() # doctest: +ELLIPSIS
- [[[('The', 'AT'), ...]], [[('The', 'AT'), ...]], ...]
-
-Verbnet Corpus Reader
-=====================
-
-Make sure we're picking up the right number of elements:
-
- >>> from nltk.corpus import verbnet
- >>> len(verbnet.lemmas())
- 3621
- >>> len(verbnet.wordnetids())
- 4953
- >>> len(verbnet.classids())
- 429
-
-Selecting classids based on various selectors:
-
- >>> verbnet.classids(lemma='take') # doctest: +NORMALIZE_WHITESPACE
- ['bring-11.3', 'characterize-29.2', 'convert-26.6.2', 'cost-54.2',
- 'fit-54.3', 'performance-26.7-2', 'steal-10.5']
- >>> verbnet.classids(wordnetid='lead%2:38:01')
- ['accompany-51.7']
- >>> verbnet.classids(fileid='approve-77.xml')
- ['approve-77']
- >>> verbnet.classids(classid='admire-31.2') # subclasses
- ['admire-31.2-1']
-
-vnclass() accepts filenames, long ids, and short ids:
-
- >>> from xml.etree import ElementTree
- >>> a = ElementTree.tostring(verbnet.vnclass('admire-31.2.xml'))
- >>> b = ElementTree.tostring(verbnet.vnclass('admire-31.2'))
- >>> c = ElementTree.tostring(verbnet.vnclass('31.2'))
- >>> a == b == c
- True
-
-fileids() can be used to get files based on verbnet class ids:
-
- >>> verbnet.fileids('admire-31.2')
- ['admire-31.2.xml']
- >>> verbnet.fileids(['admire-31.2', 'obtain-13.5.2'])
- ['admire-31.2.xml', 'obtain-13.5.2.xml']
- >>> verbnet.fileids('badidentifier')
- Traceback (most recent call last):
- . . .
- ValueError: vnclass identifier 'badidentifier' not found
-
-longid() and shortid() can be used to convert identifiers:
-
- >>> verbnet.longid('31.2')
- 'admire-31.2'
- >>> verbnet.longid('admire-31.2')
- 'admire-31.2'
- >>> verbnet.shortid('31.2')
- '31.2'
- >>> verbnet.shortid('admire-31.2')
- '31.2'
- >>> verbnet.longid('badidentifier')
- Traceback (most recent call last):
- . . .
- ValueError: vnclass identifier 'badidentifier' not found
- >>> verbnet.shortid('badidentifier')
- Traceback (most recent call last):
- . . .
- ValueError: vnclass identifier 'badidentifier' not found
-
-Corpus View Regression Tests
-============================
-
-Select some corpus files to play with:
-
- >>> import nltk.data
- >>> # A very short file (160 chars):
- >>> f1 = nltk.data.find('corpora/inaugural/README')
- >>> # A relatively short file (791 chars):
- >>> f2 = nltk.data.find('corpora/inaugural/1793-Washington.txt')
- >>> # A longer file (32k chars):
- >>> f3 = nltk.data.find('corpora/inaugural/1909-Taft.txt')
- >>> fileids = [f1, f2, f3]
-
-
-Concatenation
--------------
-Check that concatenation works as intended.
-
- >>> from nltk.corpus.reader.util import *
-
- >>> c1 = StreamBackedCorpusView(f1, read_whitespace_block, encoding='utf-8')
- >>> c2 = StreamBackedCorpusView(f2, read_whitespace_block, encoding='utf-8')
- >>> c3 = StreamBackedCorpusView(f3, read_whitespace_block, encoding='utf-8')
- >>> c123 = c1+c2+c3
- >>> print(c123)
- ['C-Span', 'Inaugural', 'Address', 'Corpus', 'US', ...]
-
- >>> l1 = f1.open(encoding='utf-8').read().split()
- >>> l2 = f2.open(encoding='utf-8').read().split()
- >>> l3 = f3.open(encoding='utf-8').read().split()
- >>> l123 = l1+l2+l3
-
- >>> list(c123) == l123
- True
-
- >>> (c1+c2+c3)[100] == l123[100]
- True
-
-Slicing
--------
-First, do some tests with fairly small slices. These will all
-generate list values.
-
- >>> from nltk.util import LazySubsequence
- >>> c1 = StreamBackedCorpusView(f1, read_whitespace_block, encoding='utf-8')
- >>> l1 = f1.open(encoding='utf-8').read().split()
- >>> print(len(c1))
- 21
- >>> len(c1) < LazySubsequence.MIN_SIZE
- True
-
-Choose a list of indices, based on the length, that covers the
-important corner cases:
-
- >>> indices = [-60, -30, -22, -21, -20, -1,
- ... 0, 1, 10, 20, 21, 22, 30, 60]
-
-Test slicing with explicit start & stop value:
-
- >>> for s in indices:
- ... for e in indices:
- ... assert list(c1[s:e]) == l1[s:e]
-
-Test slicing with stop=None:
-
- >>> for s in indices:
- ... assert list(c1[s:]) == l1[s:]
-
-Test slicing with start=None:
-
- >>> for e in indices:
- ... assert list(c1[:e]) == l1[:e]
-
-Test slicing with start=stop=None:
-
- >>> list(c1[:]) == list(l1[:])
- True
-
-Next, we'll do some tests with much longer slices. These will
-generate LazySubsequence objects.
-
- >>> c3 = StreamBackedCorpusView(f3, read_whitespace_block, encoding='utf-8')
- >>> l3 = f3.open(encoding='utf-8').read().split()
- >>> print(len(c3))
- 5430
- >>> len(c3) > LazySubsequence.MIN_SIZE*2
- True
-
-Choose a list of indices, based on the length, that covers the
-important corner cases:
-
- >>> indices = [-12000, -6000, -5431, -5430, -5429, -3000, -200, -1,
- ... 0, 1, 200, 3000, 5000, 5429, 5430, 5431, 6000, 12000]
-
-Test slicing with explicit start & stop value:
-
- >>> for s in indices:
- ... for e in indices:
- ... assert list(c3[s:e]) == l3[s:e]
-
-Test slicing with stop=None:
-
- >>> for s in indices:
- ... assert list(c3[s:]) == l3[s:]
-
-Test slicing with start=None:
-
- >>> for e in indices:
- ... assert list(c3[:e]) == l3[:e]
-
-Test slicing with start=stop=None:
-
- >>> list(c3[:]) == list(l3[:])
- True
-
-Multiple Iterators
-------------------
-If multiple iterators are created for the same corpus view, their
-iteration can be interleaved:
-
- >>> c3 = StreamBackedCorpusView(f3, read_whitespace_block)
- >>> iterators = [c3.iterate_from(n) for n in [0,15,30,45]]
- >>> for i in range(15):
- ... for iterator in iterators:
- ... print('%-15s' % next(iterator), end=' ')
- ... print()
- My a duties in
- fellow heavy of a
- citizens: weight the proper
- Anyone of office sense
- who responsibility. upon of
- has If which the
- taken not, he obligation
- the he is which
- oath has about the
- I no to oath
- have conception enter, imposes.
- just of or The
- taken the he office
- must powers is of
- feel and lacking an
-
-SeekableUnicodeStreamReader
-===========================
-
-The file-like objects provided by the ``codecs`` module unfortunately
-suffer from a bug that prevents them from working correctly with
-corpus view objects. In particular, although they expose ``seek()``
-and ``tell()`` methods, those methods do not exhibit the expected
-behavior, because they are not synchronized with the internal buffers
-that are kept by the file-like objects. For example, the ``tell()``
-method will return the file position at the end of the buffers (whose
-contents have not yet been returned by the stream); and therefore this
-file position cannot be used to return to the 'current' location in
-the stream (since ``seek()`` has no way to reconstruct the buffers).
-
-To get around these problems, we define a new class,
-`SeekableUnicodeStreamReader`, to act as a file-like interface to
-files containing encoded unicode data. This class is loosely based on
-the ``codecs.StreamReader`` class. To construct a new reader, we call
-the constructor with an underlying stream and an encoding name:
-
- >>> from io import StringIO, BytesIO
- >>> from nltk.data import SeekableUnicodeStreamReader
- >>> stream = BytesIO(b"""\
- ... This is a test file.
- ... It is encoded in ascii.
- ... """.decode('ascii').encode('ascii'))
- >>> reader = SeekableUnicodeStreamReader(stream, 'ascii')
-
-`SeekableUnicodeStreamReader`\ s support all of the normal operations
-supplied by a read-only stream. Note that all of the read operations
-return ``str`` (unicode) objects, not ``bytes`` objects.
-
- >>> reader.read() # read the entire file.
- 'This is a test file.\nIt is encoded in ascii.\n'
- >>> reader.seek(0) # rewind to the start.
- >>> reader.read(5) # read at most 5 bytes.
- 'This '
- >>> reader.readline() # read to the end of the line.
- 'is a test file.\n'
- >>> reader.seek(0) # rewind to the start.
- >>> for line in reader:
- ... print(repr(line)) # iterate over lines
- 'This is a test file.\n'
- 'It is encoded in ascii.\n'
- >>> reader.seek(0) # rewind to the start.
- >>> reader.readlines() # read a list of line strings
- ['This is a test file.\n', 'It is encoded in ascii.\n']
- >>> reader.close()
-
-Size argument to ``read()``
----------------------------
-The ``size`` argument to ``read()`` specifies the maximum number of
-*bytes* to read, not the maximum number of *characters*. Thus, for
-encodings that use multiple bytes per character, it may return fewer
-characters than the ``size`` argument:
-
- >>> stream = BytesIO(b"""\
- ... This is a test file.
- ... It is encoded in utf-16.
- ... """.decode('ascii').encode('utf-16'))
- >>> reader = SeekableUnicodeStreamReader(stream, 'utf-16')
- >>> reader.read(10)
- 'This '
-
-If a read block ends in the middle of the byte string encoding a
-single character, then that byte string is stored in an internal
-buffer, and re-used on the next call to ``read()``. However, if the
-size argument is too small to read even a single character, even
-though at least one character is available, then the ``read()`` method
-will read additional bytes until it can return a single character.
-This ensures that the ``read()`` method does not return an empty
-string, which could be mistaken for indicating the end of the file.
-
- >>> reader.seek(0) # rewind to the start.
- >>> reader.read(1) # we actually need to read 4 bytes
- 'T'
- >>> int(reader.tell())
- 4
-
-The ``readline()`` method may read more than a single line of text, in
-which case it stores the text that it does not return in a buffer. If
-this buffer is not empty, then its contents will be included in the
-value returned by the next call to ``read()``, regardless of the
-``size`` argument, since they are available without reading any new
-bytes from the stream:
-
- >>> reader.seek(0) # rewind to the start.
- >>> reader.readline() # stores extra text in a buffer
- 'This is a test file.\n'
- >>> print(reader.linebuffer) # examine the buffer contents
- ['It is encoded i']
- >>> reader.read(0) # returns the contents of the buffer
- 'It is encoded i'
- >>> print(reader.linebuffer) # examine the buffer contents
- None
-
-Seek and Tell
--------------
-In addition to these basic read operations,
-`SeekableUnicodeStreamReader` also supports the ``seek()`` and
-``tell()`` operations. However, some care must still be taken when
-using these operations. In particular, the only file offsets that
-should be passed to ``seek()`` are ``0`` and any offset that has been
-returned by ``tell()``.
-
- >>> stream = BytesIO(b"""\
- ... This is a test file.
- ... It is encoded in utf-16.
- ... """.decode('ascii').encode('utf-16'))
- >>> reader = SeekableUnicodeStreamReader(stream, 'utf-16')
- >>> reader.read(20)
- 'This is a '
- >>> pos = reader.tell(); print(pos)
- 22
- >>> reader.read(20)
- 'test file.'
- >>> reader.seek(pos) # rewind to the position from tell.
- >>> reader.read(20)
- 'test file.'
-
-The ``seek()`` and ``tell()`` methods work properly even when
-``readline()`` is used.
-
- >>> stream = BytesIO(b"""\
- ... This is a test file.
- ... It is encoded in utf-16.
- ... """.decode('ascii').encode('utf-16'))
- >>> reader = SeekableUnicodeStreamReader(stream, 'utf-16')
- >>> reader.readline()
- 'This is a test file.\n'
- >>> pos = reader.tell(); print(pos)
- 44
- >>> reader.readline()
- 'It is encoded in utf-16.\n'
- >>> reader.seek(pos) # rewind to the position from tell.
- >>> reader.readline()
- 'It is encoded in utf-16.\n'
-
-
-Squashed Bugs
-=============
-
-svn 5276 fixed a bug in the comment-stripping behavior of
-parse_sexpr_block (now called read_sexpr_block).
-
- >>> from io import StringIO
- >>> from nltk.corpus.reader.util import read_sexpr_block
- >>> f = StringIO(b"""
- ... (a b c)
- ... # This line is a comment.
- ... (d e f\ng h)""".decode('ascii'))
- >>> print(read_sexpr_block(f, block_size=38, comment_char='#'))
- ['(a b c)']
- >>> print(read_sexpr_block(f, block_size=38, comment_char='#'))
- ['(d e f\ng h)']
-
-svn 5277 fixed a bug in parse_sexpr_block, which would cause it to
-enter an infinite loop if a file ended mid-sexpr, or ended with a
-token that was not followed by whitespace. A related bug caused
-an infinite loop if the corpus ended in an unmatched close paren --
-this was fixed in svn 5279.
-
- >>> f = StringIO(b"""
- ... This file ends mid-sexpr
- ... (hello (world""".decode('ascii'))
- >>> for i in range(3): print(read_sexpr_block(f))
- ['This', 'file', 'ends', 'mid-sexpr']
- ['(hello (world']
- []
-
- >>> f = StringIO(b"This file has no trailing whitespace.".decode('ascii'))
- >>> for i in range(3): print(read_sexpr_block(f))
- ['This', 'file', 'has', 'no', 'trailing']
- ['whitespace.']
- []
-
- >>> # Bug fixed in 5279:
- >>> f = StringIO(b"a b c)".decode('ascii'))
- >>> for i in range(3): print(read_sexpr_block(f))
- ['a', 'b']
- ['c)']
- []
-
-
-svn 5624 & 5265 fixed a bug in ConcatenatedCorpusView, which caused it
-to return the wrong items when indexed starting at any index beyond
-the first file.
-
- >>> import nltk
- >>> sents = nltk.corpus.brown.sents()
- >>> print(sents[6000])
- ['Cholesterol', 'and', 'thyroid']
- >>> print(sents[6000])
- ['Cholesterol', 'and', 'thyroid']
-
-svn 5728 fixed a bug in Categorized*CorpusReader, which caused them
-to return words from *all* files when just one file was specified.
-
- >>> from nltk.corpus import reuters
- >>> reuters.words('training/13085')
- ['SNYDER', '&', 'lt', ';', 'SOI', '>', 'MAKES', ...]
- >>> reuters.words('training/5082')
- ['SHEPPARD', 'RESOURCES', 'TO', 'MERGE', 'WITH', ...]
-
-svn 7227 fixed a bug in the qc corpus reader, which prevented
-access to its tuples() method.
-
- >>> from nltk.corpus import qc
- >>> qc.tuples('test.txt')
- [('NUM:dist', 'How far is it from Denver to Aspen ?'), ('LOC:city', 'What county is Modesto , California in ?'), ...]
-
-
-
+++ /dev/null
-# -*- coding: utf-8 -*-
-
-from nltk.corpus import teardown_module
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-Crubadan Corpus Reader
-======================
-
-Crubadan is an NLTK corpus reader for ngram files provided
-by the Crubadan project. It supports several languages.
-
- >>> from nltk.corpus import crubadan
- >>> crubadan.langs() # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
- ['abk', 'abn',..., 'zpa', 'zul']
-
-----------------------------------------
-Language code mapping and helper methods
-----------------------------------------
-
-The web crawler that generates the 3-gram frequencies works at the
-level of "writing systems" rather than languages. Writing systems
-are assigned internal 2-3 letter codes that require mapping to the
-standard ISO 639-3 codes. For more information, please refer to
-the README in the nltk_data/crubadan folder after installing it.
-
-To translate ISO 639-3 codes to "Crubadan Code":
-
- >>> crubadan.iso_to_crubadan('eng')
- 'en'
- >>> crubadan.iso_to_crubadan('fra')
- 'fr'
- >>> crubadan.iso_to_crubadan('aaa')
-
-In reverse, print ISO 639-3 code if we have the Crubadan Code:
-
- >>> crubadan.crubadan_to_iso('en')
- 'eng'
- >>> crubadan.crubadan_to_iso('fr')
- 'fra'
- >>> crubadan.crubadan_to_iso('aa')
-
----------------------------
-Accessing ngram frequencies
----------------------------
-
-On initialization the reader will create a dictionary of every
-language supported by the Crubadan project, mapping the ISO 639-3
-language code to its corresponding ngram frequency.
-
-You can access an individual language's FreqDist, and the ngrams within it, as follows:
-
- >>> english_fd = crubadan.lang_freq('eng')
- >>> english_fd['the']
- 728135
-
-The above accesses the FreqDist for English and returns the frequency of the ngram 'the'.
-An ngram that isn't found within the language will return 0:
-
- >>> english_fd['sometest']
- 0
-
-A language that isn't supported will raise an exception:
-
- >>> crubadan.lang_freq('elvish')
- Traceback (most recent call last):
- ...
- RuntimeError: Unsupported language.
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-=========================================
- Loading Resources From the Data Package
-=========================================
-
- >>> import nltk.data
-
-Overview
-~~~~~~~~
-The `nltk.data` module contains functions that can be used to load
-NLTK resource files, such as corpora, grammars, and saved processing
-objects.
-
-Loading Data Files
-~~~~~~~~~~~~~~~~~~
-Resources are loaded using the function `nltk.data.load()`, which
-takes as its first argument a URL specifying what file should be
-loaded. The ``nltk:`` protocol loads files from the NLTK data
-distribution:
-
- >>> tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')
- >>> tokenizer.tokenize('Hello. This is a test. It works!')
- ['Hello.', 'This is a test.', 'It works!']
-
-It is important to note that there should be no space following the
-colon (':') in the URL; 'nltk: tokenizers/punkt/english.pickle' will
-not work!
-
-The ``nltk:`` protocol is used by default if no protocol is specified:
-
- >>> nltk.data.load('tokenizers/punkt/english.pickle') # doctest: +ELLIPSIS
- <nltk.tokenize.punkt.PunktSentenceTokenizer object at ...>
-
-But it is also possible to load resources from ``http:``, ``ftp:``,
-and ``file:`` URLs, e.g. ``cfg = nltk.data.load('http://example.com/path/to/toy.cfg')``
-
- >>> # Load a grammar using an absolute path.
- >>> url = 'file:%s' % nltk.data.find('grammars/sample_grammars/toy.cfg')
- >>> url.replace('\\', '/') # doctest: +ELLIPSIS
- 'file:...toy.cfg'
- >>> print(nltk.data.load(url)) # doctest: +ELLIPSIS
- Grammar with 14 productions (start state = S)
- S -> NP VP
- PP -> P NP
- ...
- P -> 'on'
- P -> 'in'
-
-The second argument to the `nltk.data.load()` function specifies the
-file format, which determines how the file's contents are processed
-before they are returned by ``load()``. The formats that are
-currently supported by the data module are described by the dictionary
-`nltk.data.FORMATS`:
-
- >>> for format, descr in sorted(nltk.data.FORMATS.items()):
- ... print('{0:<7} {1:}'.format(format, descr)) # doctest: +NORMALIZE_WHITESPACE
- cfg A context free grammar.
- fcfg A feature CFG.
- fol A list of first order logic expressions, parsed with
- nltk.sem.logic.Expression.fromstring.
- json A serialized python object, stored using the json module.
- logic A list of first order logic expressions, parsed with
- nltk.sem.logic.LogicParser. Requires an additional logic_parser
- parameter
- pcfg A probabilistic CFG.
- pickle A serialized python object, stored using the pickle
- module.
- raw The raw (byte string) contents of a file.
- text The raw (unicode string) contents of a file.
- val A semantic valuation, parsed by
- nltk.sem.Valuation.fromstring.
- yaml A serialized python object, stored using the yaml module.
-
-`nltk.data.load()` will raise a ValueError if a bad format name is
-specified:
-
- >>> nltk.data.load('grammars/sample_grammars/toy.cfg', 'bar')
- Traceback (most recent call last):
- . . .
- ValueError: Unknown format type!
-
-By default, the ``"auto"`` format is used, which chooses a format
-based on the filename's extension. The mapping from file extensions
-to format names is specified by `nltk.data.AUTO_FORMATS`:
-
- >>> for ext, format in sorted(nltk.data.AUTO_FORMATS.items()):
- ... print('.%-7s -> %s' % (ext, format))
- .cfg -> cfg
- .fcfg -> fcfg
- .fol -> fol
- .json -> json
- .logic -> logic
- .pcfg -> pcfg
- .pickle -> pickle
- .text -> text
- .txt -> text
- .val -> val
- .yaml -> yaml
-
-If `nltk.data.load()` is unable to determine the format based on the
-filename's extension, it will raise a ValueError:
-
- >>> nltk.data.load('foo.bar')
- Traceback (most recent call last):
- . . .
- ValueError: Could not determine format for foo.bar based on its file
- extension; use the "format" argument to specify the format explicitly.
-
-Note that by explicitly specifying the ``format`` argument, you can
-override the load method's default processing behavior. For example,
-to get the unprocessed contents of any file, simply use ``format="raw"``
-(for a byte string) or ``format="text"`` (for a unicode string):
-
- >>> s = nltk.data.load('grammars/sample_grammars/toy.cfg', 'text')
- >>> print(s) # doctest: +ELLIPSIS
- S -> NP VP
- PP -> P NP
- NP -> Det N | NP PP
- VP -> V NP | VP PP
- ...
-
-Making Local Copies
-~~~~~~~~~~~~~~~~~~~
-.. This will not be visible in the html output: create a tempdir to
- play in.
- >>> import tempfile, os
- >>> tempdir = tempfile.mkdtemp()
- >>> old_dir = os.path.abspath('.')
- >>> os.chdir(tempdir)
-
-The function `nltk.data.retrieve()` copies a given resource to a local
-file. This can be useful, for example, if you want to edit one of the
-sample grammars.
-
- >>> nltk.data.retrieve('grammars/sample_grammars/toy.cfg')
- Retrieving 'nltk:grammars/sample_grammars/toy.cfg', saving to 'toy.cfg'
-
- >>> # Simulate editing the grammar.
- >>> with open('toy.cfg') as inp:
- ... s = inp.read().replace('NP', 'DP')
- >>> with open('toy.cfg', 'w') as out:
- ... _bytes_written = out.write(s)
-
- >>> # Load the edited grammar, & display it.
- >>> cfg = nltk.data.load('file:///' + os.path.abspath('toy.cfg'))
- >>> print(cfg) # doctest: +ELLIPSIS
- Grammar with 14 productions (start state = S)
- S -> DP VP
- PP -> P DP
- ...
- P -> 'on'
- P -> 'in'
-
-The second argument to `nltk.data.retrieve()` specifies the filename
-for the new copy of the file. By default, the source file's filename
-is used.
-
- >>> nltk.data.retrieve('grammars/sample_grammars/toy.cfg', 'mytoy.cfg')
- Retrieving 'nltk:grammars/sample_grammars/toy.cfg', saving to 'mytoy.cfg'
- >>> os.path.isfile('./mytoy.cfg')
- True
- >>> nltk.data.retrieve('grammars/sample_grammars/np.fcfg')
- Retrieving 'nltk:grammars/sample_grammars/np.fcfg', saving to 'np.fcfg'
- >>> os.path.isfile('./np.fcfg')
- True
-
-If a file with the specified (or default) filename already exists in
-the current directory, then `nltk.data.retrieve()` will raise a
-ValueError exception. It will *not* overwrite the file:
-
- >>> os.path.isfile('./toy.cfg')
- True
- >>> nltk.data.retrieve('grammars/sample_grammars/toy.cfg') # doctest: +ELLIPSIS
- Traceback (most recent call last):
- . . .
- ValueError: File '...toy.cfg' already exists!
-
-.. This will not be visible in the html output: clean up the tempdir.
- >>> os.chdir(old_dir)
- >>> for f in os.listdir(tempdir):
- ... os.remove(os.path.join(tempdir, f))
- >>> os.rmdir(tempdir)
-
-Finding Files in the NLTK Data Package
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The `nltk.data.find()` function searches the NLTK data package for a
-given file, and returns a pointer to that file. This pointer can
-either be a `FileSystemPathPointer` (whose `path` attribute gives the
-absolute path of the file); or a `ZipFilePathPointer`, specifying a
-zipfile and the name of an entry within that zipfile. Both pointer
-types define the `open()` method, which can be used to read the string
-contents of the file.
-
- >>> path = nltk.data.find('corpora/abc/rural.txt')
- >>> str(path) # doctest: +ELLIPSIS
- '...rural.txt'
- >>> print(path.open().read(60).decode())
- PM denies knowledge of AWB kickbacks
- The Prime Minister has
-
-Alternatively, the `nltk.data.load()` function can be used with the
-keyword argument ``format="raw"``:
-
- >>> s = nltk.data.load('corpora/abc/rural.txt', format='raw')[:60]
- >>> print(s.decode())
- PM denies knowledge of AWB kickbacks
- The Prime Minister has
-
-Similarly, you can use the keyword argument ``format="text"``:
-
- >>> s = nltk.data.load('corpora/abc/rural.txt', format='text')[:60]
- >>> print(s)
- PM denies knowledge of AWB kickbacks
- The Prime Minister has
-
-Resource Caching
-~~~~~~~~~~~~~~~~
-
-NLTK uses a dictionary to maintain a cache of resources that
-have been loaded. If you load a resource that is already stored in
-the cache, then the cached copy will be returned. This behavior can
-be seen in the trace output generated when ``verbose=True``:
-
- >>> feat0 = nltk.data.load('grammars/book_grammars/feat0.fcfg', verbose=True)
- <<Loading nltk:grammars/book_grammars/feat0.fcfg>>
- >>> feat0 = nltk.data.load('grammars/book_grammars/feat0.fcfg', verbose=True)
- <<Using cached copy of nltk:grammars/book_grammars/feat0.fcfg>>
-
-If you wish to load a resource from its source, bypassing the cache,
-use the ``cache=False`` argument to `nltk.data.load()`. This can be
-useful, for example, if the resource is loaded from a local file, and
-you are actively editing that file:
-
- >>> feat0 = nltk.data.load('grammars/book_grammars/feat0.fcfg',cache=False,verbose=True)
- <<Loading nltk:grammars/book_grammars/feat0.fcfg>>
-
-The cache *no longer* uses weak references. A resource will not be
-automatically expunged from the cache when no more objects are using
-it. In the following example, when we clear the variable ``feat0``,
-the reference count for the feature grammar object drops to zero.
-However, the object remains cached:
-
- >>> del feat0
- >>> feat0 = nltk.data.load('grammars/book_grammars/feat0.fcfg',
- ... verbose=True)
- <<Using cached copy of nltk:grammars/book_grammars/feat0.fcfg>>
-
-You can clear the entire contents of the cache, using
-`nltk.data.clear_cache()`:
-
- >>> nltk.data.clear_cache()
-
-Retrieving other Data Sources
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- >>> formulas = nltk.data.load('grammars/book_grammars/background.fol')
- >>> for f in formulas: print(str(f))
- all x.(boxerdog(x) -> dog(x))
- all x.(boxer(x) -> person(x))
- all x.-(dog(x) & person(x))
- all x.(married(x) <-> exists y.marry(x,y))
- all x.(bark(x) -> dog(x))
- all x y.(marry(x,y) -> (person(x) & person(y)))
- -(Vincent = Mia)
- -(Vincent = Fido)
- -(Mia = Fido)
-
-Regression Tests
-~~~~~~~~~~~~~~~~
-Create a temp dir for tests that write files:
-
- >>> import tempfile, os
- >>> tempdir = tempfile.mkdtemp()
- >>> old_dir = os.path.abspath('.')
- >>> os.chdir(tempdir)
-
-The `retrieve()` function accepts all url types:
-
- >>> urls = ['https://raw.githubusercontent.com/nltk/nltk/develop/nltk/test/toy.cfg',
- ... 'file:%s' % nltk.data.find('grammars/sample_grammars/toy.cfg'),
- ... 'nltk:grammars/sample_grammars/toy.cfg',
- ... 'grammars/sample_grammars/toy.cfg']
- >>> for i, url in enumerate(urls):
- ... nltk.data.retrieve(url, 'toy-%d.cfg' % i) # doctest: +ELLIPSIS
- Retrieving 'https://raw.githubusercontent.com/nltk/nltk/develop/nltk/test/toy.cfg', saving to 'toy-0.cfg'
- Retrieving 'file:...toy.cfg', saving to 'toy-1.cfg'
- Retrieving 'nltk:grammars/sample_grammars/toy.cfg', saving to 'toy-2.cfg'
- Retrieving 'nltk:grammars/sample_grammars/toy.cfg', saving to 'toy-3.cfg'
-
-Clean up the temp dir:
-
- >>> os.chdir(old_dir)
- >>> for f in os.listdir(tempdir):
- ... os.remove(os.path.join(tempdir, f))
- >>> os.rmdir(tempdir)
-
-Lazy Loader
------------
-A lazy loader is a wrapper object that defers loading a resource until
-it is accessed or used in any way. This is mainly intended for
-internal use by NLTK's corpus readers.
-
- >>> # Create a lazy loader for toy.cfg.
- >>> ll = nltk.data.LazyLoader('grammars/sample_grammars/toy.cfg')
-
- >>> # Show that it's not loaded yet:
- >>> object.__repr__(ll) # doctest: +ELLIPSIS
- '<nltk.data.LazyLoader object at ...>'
-
- >>> # printing it is enough to cause it to be loaded:
- >>> print(ll)
- <Grammar with 14 productions>
-
- >>> # Show that it's now been loaded:
- >>> object.__repr__(ll) # doctest: +ELLIPSIS
- '<nltk.grammar.CFG object at ...>'
-
-
- >>> # Test that accessing an attribute also loads it:
- >>> ll = nltk.data.LazyLoader('grammars/sample_grammars/toy.cfg')
- >>> ll.start()
- S
- >>> object.__repr__(ll) # doctest: +ELLIPSIS
- '<nltk.grammar.CFG object at ...>'
-
-Buffered Gzip Reading and Writing
----------------------------------
-Write performance to gzip-compressed files is extremely poor when the files become large.
-File creation can become a bottleneck in those cases.
-
-Read performance from large gzipped pickle files was improved in data.py by
-buffering the reads. A similar fix can be applied to writes by buffering
-the writes to a StringIO object first.
-
-This is mainly intended for internal use. The test below simply checks that
-reading and writing work as intended; it does not measure how much
-improvement buffering provides.
-
- >>> from io import StringIO
- >>> test = nltk.data.BufferedGzipFile('testbuf.gz', 'wb', size=2**10)
- >>> ans = []
- >>> for i in range(10000):
- ... ans.append(str(i).encode('ascii'))
- ... test.write(str(i).encode('ascii'))
- >>> test.close()
- >>> test = nltk.data.BufferedGzipFile('testbuf.gz', 'rb')
- >>> test.read() == b''.join(ans)
- True
- >>> test.close()
- >>> import os
- >>> os.unlink('testbuf.gz')
-
-JSON Encoding and Decoding
---------------------------
-JSON serialization is used instead of pickle for some classes.
-
- >>> from nltk import jsontags
- >>> from nltk.jsontags import JSONTaggedEncoder, JSONTaggedDecoder, register_tag
- >>> @jsontags.register_tag
- ... class JSONSerializable:
- ... json_tag = 'JSONSerializable'
- ...
- ... def __init__(self, n):
- ... self.n = n
- ...
- ... def encode_json_obj(self):
- ... return self.n
- ...
- ... @classmethod
- ... def decode_json_obj(cls, obj):
- ... n = obj
- ... return cls(n)
- ...
- >>> JSONTaggedEncoder().encode(JSONSerializable(1))
- '{"!JSONSerializable": 1}'
- >>> JSONTaggedDecoder().decode('{"!JSONSerializable": 1}').n
- 1
-
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-===================
-Dependency Grammars
-===================
-
- >>> from nltk.grammar import DependencyGrammar
- >>> from nltk.parse import (
- ... DependencyGraph,
- ... ProjectiveDependencyParser,
- ... NonprojectiveDependencyParser,
- ... )
-
-CoNLL Data
-----------
-
- >>> treebank_data = """Pierre NNP 2 NMOD
- ... Vinken NNP 8 SUB
- ... , , 2 P
- ... 61 CD 5 NMOD
- ... years NNS 6 AMOD
- ... old JJ 2 NMOD
- ... , , 2 P
- ... will MD 0 ROOT
- ... join VB 8 VC
- ... the DT 11 NMOD
- ... board NN 9 OBJ
- ... as IN 9 VMOD
- ... a DT 15 NMOD
- ... nonexecutive JJ 15 NMOD
- ... director NN 12 PMOD
- ... Nov. NNP 9 VMOD
- ... 29 CD 16 NMOD
- ... . . 9 VMOD
- ... """
-
- >>> dg = DependencyGraph(treebank_data)
- >>> dg.tree().pprint()
- (will
- (Vinken Pierre , (old (years 61)) ,)
- (join (board the) (as (director a nonexecutive)) (Nov. 29) .))
- >>> for head, rel, dep in dg.triples():
- ... print(
- ... '({h[0]}, {h[1]}), {r}, ({d[0]}, {d[1]})'
- ... .format(h=head, r=rel, d=dep)
- ... )
- (will, MD), SUB, (Vinken, NNP)
- (Vinken, NNP), NMOD, (Pierre, NNP)
- (Vinken, NNP), P, (,, ,)
- (Vinken, NNP), NMOD, (old, JJ)
- (old, JJ), AMOD, (years, NNS)
- (years, NNS), NMOD, (61, CD)
- (Vinken, NNP), P, (,, ,)
- (will, MD), VC, (join, VB)
- (join, VB), OBJ, (board, NN)
- (board, NN), NMOD, (the, DT)
- (join, VB), VMOD, (as, IN)
- (as, IN), PMOD, (director, NN)
- (director, NN), NMOD, (a, DT)
- (director, NN), NMOD, (nonexecutive, JJ)
- (join, VB), VMOD, (Nov., NNP)
- (Nov., NNP), NMOD, (29, CD)
- (join, VB), VMOD, (., .)
-
-Using a custom cell extractor.
-
- >>> def custom_extractor(cells):
- ... _, tag, head, rel = cells
- ... return 'spam', 'spam', tag, tag, '', head, rel
- >>> dg = DependencyGraph(treebank_data, cell_extractor=custom_extractor)
- >>> dg.tree().pprint()
- (spam
- (spam spam spam (spam (spam spam)) spam)
- (spam (spam spam) (spam (spam spam spam)) (spam spam) spam))
-
-Custom cell extractors can take in and return an index.
-
- >>> def custom_extractor(cells, index):
- ... word, tag, head, rel = cells
- ... return (index, '{}-{}'.format(word, index), word,
- ... tag, tag, '', head, rel)
- >>> dg = DependencyGraph(treebank_data, cell_extractor=custom_extractor)
- >>> dg.tree().pprint()
- (will-8
- (Vinken-2 Pierre-1 ,-3 (old-6 (years-5 61-4)) ,-7)
- (join-9
- (board-11 the-10)
- (as-12 (director-15 a-13 nonexecutive-14))
- (Nov.-16 29-17)
- .-18))
-
-Using the dependency-parsed version of the Penn Treebank corpus sample.
-
- >>> from nltk.corpus import dependency_treebank
- >>> t = dependency_treebank.parsed_sents()[0]
- >>> print(t.to_conll(3)) # doctest: +NORMALIZE_WHITESPACE
- Pierre NNP 2
- Vinken NNP 8
- , , 2
- 61 CD 5
- years NNS 6
- old JJ 2
- , , 2
- will MD 0
- join VB 8
- the DT 11
- board NN 9
- as IN 9
- a DT 15
- nonexecutive JJ 15
- director NN 12
- Nov. NNP 9
- 29 CD 16
- . . 8
-
-Using the output of zpar (like Malt-TAB but with zero-based indexing).
-
- >>> zpar_data = """
- ... Pierre NNP 1 NMOD
- ... Vinken NNP 7 SUB
- ... , , 1 P
- ... 61 CD 4 NMOD
- ... years NNS 5 AMOD
- ... old JJ 1 NMOD
- ... , , 1 P
- ... will MD -1 ROOT
- ... join VB 7 VC
- ... the DT 10 NMOD
- ... board NN 8 OBJ
- ... as IN 8 VMOD
- ... a DT 14 NMOD
- ... nonexecutive JJ 14 NMOD
- ... director NN 11 PMOD
- ... Nov. NNP 8 VMOD
- ... 29 CD 15 NMOD
- ... . . 7 P
- ... """
-
- >>> zdg = DependencyGraph(zpar_data, zero_based=True)
- >>> print(zdg.tree())
- (will
- (Vinken Pierre , (old (years 61)) ,)
- (join (board the) (as (director a nonexecutive)) (Nov. 29))
- .)
-
-
-Projective Dependency Parsing
------------------------------
-
- >>> grammar = DependencyGrammar.fromstring("""
- ... 'fell' -> 'price' | 'stock'
- ... 'price' -> 'of' 'the'
- ... 'of' -> 'stock'
- ... 'stock' -> 'the'
- ... """)
- >>> print(grammar)
- Dependency grammar with 5 productions
- 'fell' -> 'price'
- 'fell' -> 'stock'
- 'price' -> 'of' 'the'
- 'of' -> 'stock'
- 'stock' -> 'the'
-
- >>> dp = ProjectiveDependencyParser(grammar)
- >>> for t in sorted(dp.parse(['the', 'price', 'of', 'the', 'stock', 'fell'])):
- ... print(t)
- (fell (price the (of (stock the))))
- (fell (price the of) (stock the))
- (fell (price the of the) stock)
-
-Non-Projective Dependency Parsing
----------------------------------
-
- >>> grammar = DependencyGrammar.fromstring("""
- ... 'taught' -> 'play' | 'man'
- ... 'man' -> 'the'
- ... 'play' -> 'golf' | 'dog' | 'to'
- ... 'dog' -> 'his'
- ... """)
- >>> print(grammar)
- Dependency grammar with 7 productions
- 'taught' -> 'play'
- 'taught' -> 'man'
- 'man' -> 'the'
- 'play' -> 'golf'
- 'play' -> 'dog'
- 'play' -> 'to'
- 'dog' -> 'his'
-
- >>> dp = NonprojectiveDependencyParser(grammar)
- >>> g, = dp.parse(['the', 'man', 'taught', 'his', 'dog', 'to', 'play', 'golf'])
-
- >>> print(g.root['word'])
- taught
-
- >>> for _, node in sorted(g.nodes.items()):
- ... if node['word'] is not None:
- ... print('{address} {word}: {d}'.format(d=node['deps'][''], **node))
- 1 the: []
- 2 man: [1]
- 3 taught: [2, 7]
- 4 his: []
- 5 dog: [4]
- 6 to: []
- 7 play: [5, 6, 8]
- 8 golf: []
-
- >>> print(g.tree())
- (taught (man the) (play (dog his) to golf))
-
-Integration with MALT parser
-============================
-
-If the top relation label is different from the default, we can set it
-explicitly. For the MALT parser, it is set to `'null'`.
-
->>> dg_str = """1 I _ NN NN _ 2 nn _ _
-... 2 shot _ NN NN _ 0 null _ _
-... 3 an _ AT AT _ 2 dep _ _
-... 4 elephant _ NN NN _ 7 nn _ _
-... 5 in _ NN NN _ 7 nn _ _
-... 6 my _ NN NN _ 7 nn _ _
-... 7 pajamas _ NNS NNS _ 3 dobj _ _
-... """
->>> dg = DependencyGraph(dg_str, top_relation_label='null')
-
->>> len(dg.nodes)
-8
-
->>> dg.root['word'], dg.root['address']
-('shot', 2)
-
->>> print(dg.to_conll(10)) # doctest: +NORMALIZE_WHITESPACE
-1 I _ NN NN _ 2 nn _ _
-2 shot _ NN NN _ 0 null _ _
-3 an _ AT AT _ 2 dep _ _
-4 elephant _ NN NN _ 7 nn _ _
-5 in _ NN NN _ 7 nn _ _
-6 my _ NN NN _ 7 nn _ _
-7 pajamas _ NNS NNS _ 3 dobj _ _
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-==================
-Discourse Checking
-==================
-
- >>> from nltk import *
- >>> from nltk.sem import logic
- >>> logic._counter._value = 0
-
-Introduction
-============
-
-The NLTK discourse module makes it possible to test consistency and
-redundancy of simple discourses, using theorem-proving and
-model-building from `nltk.inference`.
-
-The ``DiscourseTester`` constructor takes a list of sentences as a
-parameter.
-
- >>> dt = DiscourseTester(['a boxer walks', 'every boxer chases a girl'])
-
-The ``DiscourseTester`` parses each sentence into a list of logical
-forms. Once we have created a ``DiscourseTester`` object, we can
-inspect various properties of the discourse. First off, we might want
-to double-check what sentences are currently stored as the discourse.
-
- >>> dt.sentences()
- s0: a boxer walks
- s1: every boxer chases a girl
-
-As you will see, each sentence receives an identifier `s`\ :subscript:`i`.
-We might also want to check what grammar the ``DiscourseTester`` is
-using (by default, ``book_grammars/discourse.fcfg``):
-
- >>> dt.grammar() # doctest: +ELLIPSIS
- % start S
- # Grammar Rules
- S[SEM = <app(?subj,?vp)>] -> NP[NUM=?n,SEM=?subj] VP[NUM=?n,SEM=?vp]
- NP[NUM=?n,SEM=<app(?det,?nom)> ] -> Det[NUM=?n,SEM=?det] Nom[NUM=?n,SEM=?nom]
- NP[LOC=?l,NUM=?n,SEM=?np] -> PropN[LOC=?l,NUM=?n,SEM=?np]
- ...
-
-A different grammar can be invoked by using the optional ``gramfile``
-parameter when a ``DiscourseTester`` object is created.
-
-Readings and Threads
-====================
-
-Depending on
-the grammar used, we may find some sentences have more than one
-logical form. To check this, use the ``readings()`` method. Given a
-sentence identifier of the form `s`\ :subscript:`i`, each reading of
-that sentence is given an identifier `s`\ :sub:`i`-`r`\ :sub:`j`.
-
-
- >>> dt.readings()
- <BLANKLINE>
- s0 readings:
- <BLANKLINE>
- s0-r0: exists z1.(boxer(z1) & walk(z1))
- s0-r1: exists z1.(boxerdog(z1) & walk(z1))
- <BLANKLINE>
- s1 readings:
- <BLANKLINE>
- s1-r0: all z2.(boxer(z2) -> exists z3.(girl(z3) & chase(z2,z3)))
- s1-r1: all z1.(boxerdog(z1) -> exists z2.(girl(z2) & chase(z1,z2)))
-
-
-In this case, the only source of ambiguity lies in the word *boxer*,
-which receives two translations: ``boxer`` and ``boxerdog``. The
-intention is that one of these corresponds to the ``person`` sense and
-one to the ``dog`` sense. In principle, we would also expect to see a
-quantifier scope ambiguity in ``s1``. However, the simple grammar we
-are using, namely ``discourse.fcfg``, doesn't support quantifier
-scope ambiguity.
-
-We can also investigate the readings of a specific sentence:
-
- >>> dt.readings('a boxer walks')
- The sentence 'a boxer walks' has these readings:
- exists x.(boxer(x) & walk(x))
- exists x.(boxerdog(x) & walk(x))
-
-Given that each sentence is two-ways ambiguous, we potentially have
-four different discourse 'threads', taking all combinations of
-readings. To see these, specify the ``threaded=True`` parameter on
-the ``readings()`` method. Again, each thread is assigned an
-identifier of the form `d`\ :sub:`i`. Following the identifier is a
-list of the readings that constitute that thread.
-
- >>> dt.readings(threaded=True) # doctest: +NORMALIZE_WHITESPACE
- d0: ['s0-r0', 's1-r0']
- d1: ['s0-r0', 's1-r1']
- d2: ['s0-r1', 's1-r0']
- d3: ['s0-r1', 's1-r1']
-
-Of course, this simple-minded approach doesn't scale: a discourse with, say, three
-sentences, each of which has 3 readings, will generate 27 different
-threads. It is an interesting exercise to consider how to manage
-discourse ambiguity more efficiently.
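-
-To see the combinatorics concretely, threads are just the Cartesian
-product of the per-sentence readings; a minimal sketch (with invented
-reading identifiers):
-
- >>> from itertools import product
- >>> readings = [['s%d-r%d' % (i, j) for j in range(3)] for i in range(3)]
- >>> len(list(product(*readings)))
- 27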
-
-Checking Consistency
-====================
-
-Now, we can check whether some or all of the discourse threads are
-consistent, using the ``models()`` method. With no parameter, this
-method will try to find a model for every discourse thread in the
-current discourse. However, we can also specify just one thread, say ``d1``.
-
- >>> dt.models('d1')
- --------------------------------------------------------------------------------
- Model for Discourse Thread d1
- --------------------------------------------------------------------------------
- % number = 1
- % seconds = 0
- <BLANKLINE>
- % Interpretation of size 2
- <BLANKLINE>
- c1 = 0.
- <BLANKLINE>
- f1(0) = 0.
- f1(1) = 0.
- <BLANKLINE>
- boxer(0).
- - boxer(1).
- <BLANKLINE>
- - boxerdog(0).
- - boxerdog(1).
- <BLANKLINE>
- - girl(0).
- - girl(1).
- <BLANKLINE>
- walk(0).
- - walk(1).
- <BLANKLINE>
- - chase(0,0).
- - chase(0,1).
- - chase(1,0).
- - chase(1,1).
- <BLANKLINE>
- Consistent discourse: d1 ['s0-r0', 's1-r1']:
- s0-r0: exists z1.(boxer(z1) & walk(z1))
- s1-r1: all z1.(boxerdog(z1) -> exists z2.(girl(z2) & chase(z1,z2)))
- <BLANKLINE>
-
-There are various formats for rendering **Mace4** models --- here,
-we have used the 'cooked' format (which is intended to be
-human-readable). There are a number of points to note.
-
-#. The entities in the domain are all treated as non-negative
- integers. In this case, there are only two entities, ``0`` and
- ``1``.
-
-#. The ``-`` symbol indicates negation. So ``0`` is the only
-   ``boxer`` and the only thing that ``walk``\ s. Nothing is a
-   ``boxerdog``, or a ``girl``, or in the ``chase`` relation. Thus the
-   universal sentence is vacuously true.
-
-#. ``c1`` is an introduced constant that denotes ``0``.
-
-#. ``f1`` is a Skolem function, but it plays no significant role in
- this model.
-
-
-We might now want to add another sentence to the discourse, and there
-is a method ``add_sentence()`` for doing just this.
-
- >>> dt.add_sentence('John is a boxer')
- >>> dt.sentences()
- s0: a boxer walks
- s1: every boxer chases a girl
- s2: John is a boxer
-
-We can now test all the properties as before; here, we just show a
-couple of them.
-
- >>> dt.readings()
- <BLANKLINE>
- s0 readings:
- <BLANKLINE>
- s0-r0: exists z1.(boxer(z1) & walk(z1))
- s0-r1: exists z1.(boxerdog(z1) & walk(z1))
- <BLANKLINE>
- s1 readings:
- <BLANKLINE>
- s1-r0: all z1.(boxer(z1) -> exists z2.(girl(z2) & chase(z1,z2)))
- s1-r1: all z1.(boxerdog(z1) -> exists z2.(girl(z2) & chase(z1,z2)))
- <BLANKLINE>
- s2 readings:
- <BLANKLINE>
- s2-r0: boxer(John)
- s2-r1: boxerdog(John)
- >>> dt.readings(threaded=True) # doctest: +NORMALIZE_WHITESPACE
- d0: ['s0-r0', 's1-r0', 's2-r0']
- d1: ['s0-r0', 's1-r0', 's2-r1']
- d2: ['s0-r0', 's1-r1', 's2-r0']
- d3: ['s0-r0', 's1-r1', 's2-r1']
- d4: ['s0-r1', 's1-r0', 's2-r0']
- d5: ['s0-r1', 's1-r0', 's2-r1']
- d6: ['s0-r1', 's1-r1', 's2-r0']
- d7: ['s0-r1', 's1-r1', 's2-r1']
-
-If you are interested in a particular thread, the ``expand_threads()``
-method will remind you of what readings it consists of:
-
- >>> thread = dt.expand_threads('d1')
- >>> for rid, reading in thread:
- ... print(rid, str(reading.normalize()))
- s0-r0 exists z1.(boxer(z1) & walk(z1))
- s1-r0 all z1.(boxer(z1) -> exists z2.(girl(z2) & chase(z1,z2)))
- s2-r1 boxerdog(John)
-
-Suppose we have already defined a discourse, as follows:
-
- >>> dt = DiscourseTester(['A student dances', 'Every student is a person'])
-
-Now, when we add a new sentence, is it consistent with what we already
-have? The ``consistchk=True`` parameter of ``add_sentence()`` allows
-us to check:
-
- >>> dt.add_sentence('No person dances', consistchk=True)
- Inconsistent discourse: d0 ['s0-r0', 's1-r0', 's2-r0']:
- s0-r0: exists z1.(student(z1) & dance(z1))
- s1-r0: all z1.(student(z1) -> person(z1))
- s2-r0: -exists z1.(person(z1) & dance(z1))
- <BLANKLINE>
- >>> dt.readings()
- <BLANKLINE>
- s0 readings:
- <BLANKLINE>
- s0-r0: exists z1.(student(z1) & dance(z1))
- <BLANKLINE>
- s1 readings:
- <BLANKLINE>
- s1-r0: all z1.(student(z1) -> person(z1))
- <BLANKLINE>
- s2 readings:
- <BLANKLINE>
- s2-r0: -exists z1.(person(z1) & dance(z1))
-
-So let's retract the inconsistent sentence:
-
- >>> dt.retract_sentence('No person dances', verbose=True) # doctest: +NORMALIZE_WHITESPACE
- Current sentences are
- s0: A student dances
- s1: Every student is a person
-
-We can now verify that the result is consistent.
-
- >>> dt.models()
- --------------------------------------------------------------------------------
- Model for Discourse Thread d0
- --------------------------------------------------------------------------------
- % number = 1
- % seconds = 0
- <BLANKLINE>
- % Interpretation of size 2
- <BLANKLINE>
- c1 = 0.
- <BLANKLINE>
- dance(0).
- - dance(1).
- <BLANKLINE>
- person(0).
- - person(1).
- <BLANKLINE>
- student(0).
- - student(1).
- <BLANKLINE>
- Consistent discourse: d0 ['s0-r0', 's1-r0']:
- s0-r0: exists z1.(student(z1) & dance(z1))
- s1-r0: all z1.(student(z1) -> person(z1))
- <BLANKLINE>
-
-Checking Informativity
-======================
-
-Let's assume that we are still trying to extend the discourse *A
-student dances.* *Every student is a person.* We add a new sentence,
-but this time, we check whether it is informative with respect to what
-has gone before.
-
- >>> dt.add_sentence('A person dances', informchk=True)
- Sentence 'A person dances' under reading 'exists x.(person(x) & dance(x))':
- Not informative relative to thread 'd0'
-
-In fact, we are just checking whether the new sentence is entailed by
-the preceding discourse.
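-
-The same entailment can be checked directly with a theorem prover.
-Here is a sketch using the pure-Python ``TableauProver``, with the
-relevant readings written out by hand (assuming this simple inference
-is within the prover's reach):
-
- >>> from nltk.sem import Expression
- >>> from nltk.inference import TableauProver
- >>> read_expr = Expression.fromstring
- >>> assumptions = [read_expr('exists x.(student(x) & dance(x))'),
- ...                read_expr('all x.(student(x) -> person(x))')]
- >>> goal = read_expr('exists x.(person(x) & dance(x))')
- >>> TableauProver().prove(goal, assumptions)
- True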
-
- >>> dt.models()
- --------------------------------------------------------------------------------
- Model for Discourse Thread d0
- --------------------------------------------------------------------------------
- % number = 1
- % seconds = 0
- <BLANKLINE>
- % Interpretation of size 2
- <BLANKLINE>
- c1 = 0.
- <BLANKLINE>
- c2 = 0.
- <BLANKLINE>
- dance(0).
- - dance(1).
- <BLANKLINE>
- person(0).
- - person(1).
- <BLANKLINE>
- student(0).
- - student(1).
- <BLANKLINE>
- Consistent discourse: d0 ['s0-r0', 's1-r0', 's2-r0']:
- s0-r0: exists z1.(student(z1) & dance(z1))
- s1-r0: all z1.(student(z1) -> person(z1))
- s2-r0: exists z1.(person(z1) & dance(z1))
- <BLANKLINE>
-
-
-
-Adding Background Knowledge
-===========================
-
-Let's build a new discourse, and look at the readings of the component sentences:
-
- >>> dt = DiscourseTester(['Vincent is a boxer', 'Fido is a boxer', 'Vincent is married', 'Fido barks'])
- >>> dt.readings()
- <BLANKLINE>
- s0 readings:
- <BLANKLINE>
- s0-r0: boxer(Vincent)
- s0-r1: boxerdog(Vincent)
- <BLANKLINE>
- s1 readings:
- <BLANKLINE>
- s1-r0: boxer(Fido)
- s1-r1: boxerdog(Fido)
- <BLANKLINE>
- s2 readings:
- <BLANKLINE>
- s2-r0: married(Vincent)
- <BLANKLINE>
- s3 readings:
- <BLANKLINE>
- s3-r0: bark(Fido)
-
-This gives us a lot of threads:
-
- >>> dt.readings(threaded=True) # doctest: +NORMALIZE_WHITESPACE
- d0: ['s0-r0', 's1-r0', 's2-r0', 's3-r0']
- d1: ['s0-r0', 's1-r1', 's2-r0', 's3-r0']
- d2: ['s0-r1', 's1-r0', 's2-r0', 's3-r0']
- d3: ['s0-r1', 's1-r1', 's2-r0', 's3-r0']
-
-
-We can eliminate some of the readings, and hence some of the threads,
-by adding background information.
-
- >>> import nltk.data
- >>> bg = nltk.data.load('grammars/book_grammars/background.fol')
- >>> dt.add_background(bg)
- >>> dt.background()
- all x.(boxerdog(x) -> dog(x))
- all x.(boxer(x) -> person(x))
- all x.-(dog(x) & person(x))
- all x.(married(x) <-> exists y.marry(x,y))
- all x.(bark(x) -> dog(x))
- all x y.(marry(x,y) -> (person(x) & person(y)))
- -(Vincent = Mia)
- -(Vincent = Fido)
- -(Mia = Fido)
-
-The background information allows us to reject three of the threads as
-inconsistent. To see what remains, use the ``filter=True`` parameter
-on ``readings()``.
-
- >>> dt.readings(filter=True) # doctest: +NORMALIZE_WHITESPACE
- d1: ['s0-r0', 's1-r1', 's2-r0', 's3-r0']
-
-The ``models()`` method gives us more information about the surviving thread.
-
- >>> dt.models()
- --------------------------------------------------------------------------------
- Model for Discourse Thread d0
- --------------------------------------------------------------------------------
- No model found!
- <BLANKLINE>
- --------------------------------------------------------------------------------
- Model for Discourse Thread d1
- --------------------------------------------------------------------------------
- % number = 1
- % seconds = 0
- <BLANKLINE>
- % Interpretation of size 3
- <BLANKLINE>
- Fido = 0.
- <BLANKLINE>
- Mia = 1.
- <BLANKLINE>
- Vincent = 2.
- <BLANKLINE>
- f1(0) = 0.
- f1(1) = 0.
- f1(2) = 2.
- <BLANKLINE>
- bark(0).
- - bark(1).
- - bark(2).
- <BLANKLINE>
- - boxer(0).
- - boxer(1).
- boxer(2).
- <BLANKLINE>
- boxerdog(0).
- - boxerdog(1).
- - boxerdog(2).
- <BLANKLINE>
- dog(0).
- - dog(1).
- - dog(2).
- <BLANKLINE>
- - married(0).
- - married(1).
- married(2).
- <BLANKLINE>
- - person(0).
- - person(1).
- person(2).
- <BLANKLINE>
- - marry(0,0).
- - marry(0,1).
- - marry(0,2).
- - marry(1,0).
- - marry(1,1).
- - marry(1,2).
- - marry(2,0).
- - marry(2,1).
- marry(2,2).
- <BLANKLINE>
- --------------------------------------------------------------------------------
- Model for Discourse Thread d2
- --------------------------------------------------------------------------------
- No model found!
- <BLANKLINE>
- --------------------------------------------------------------------------------
- Model for Discourse Thread d3
- --------------------------------------------------------------------------------
- No model found!
- <BLANKLINE>
- Inconsistent discourse: d0 ['s0-r0', 's1-r0', 's2-r0', 's3-r0']:
- s0-r0: boxer(Vincent)
- s1-r0: boxer(Fido)
- s2-r0: married(Vincent)
- s3-r0: bark(Fido)
- <BLANKLINE>
- Consistent discourse: d1 ['s0-r0', 's1-r1', 's2-r0', 's3-r0']:
- s0-r0: boxer(Vincent)
- s1-r1: boxerdog(Fido)
- s2-r0: married(Vincent)
- s3-r0: bark(Fido)
- <BLANKLINE>
- Inconsistent discourse: d2 ['s0-r1', 's1-r0', 's2-r0', 's3-r0']:
- s0-r1: boxerdog(Vincent)
- s1-r0: boxer(Fido)
- s2-r0: married(Vincent)
- s3-r0: bark(Fido)
- <BLANKLINE>
- Inconsistent discourse: d3 ['s0-r1', 's1-r1', 's2-r0', 's3-r0']:
- s0-r1: boxerdog(Vincent)
- s1-r1: boxerdog(Fido)
- s2-r0: married(Vincent)
- s3-r0: bark(Fido)
- <BLANKLINE>
-
-
-.. This will not be visible in the html output: create a tempdir to
- play in.
- >>> import tempfile, os
- >>> tempdir = tempfile.mkdtemp()
- >>> old_dir = os.path.abspath('.')
- >>> os.chdir(tempdir)
-
-In order to play around with your own version of background knowledge,
-you might want to start off with a local copy of ``background.fol``:
-
- >>> nltk.data.retrieve('grammars/book_grammars/background.fol')
- Retrieving 'nltk:grammars/book_grammars/background.fol', saving to 'background.fol'
-
-After you have modified the file, the ``load_fol()`` function will parse
-the strings in the file into expressions of ``nltk.sem.logic``.
-
- >>> from nltk.inference.discourse import load_fol
- >>> mybg = load_fol(open('background.fol').read())
-
-The result can be loaded as an argument of ``add_background()`` in the
-manner shown earlier.
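-
-For instance, the freshly parsed background can be layered onto the
-tester in exactly the same way (a sketch, assuming the ``dt`` tester and
-the ``mybg`` expressions from above; with an unmodified file this simply
-re-adds the same assumptions):
-
- >>> dt.add_background(mybg) # doctest: +SKIP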
-
-.. This will not be visible in the html output: clean up the tempdir.
- >>> os.chdir(old_dir)
- >>> for f in os.listdir(tempdir):
- ... os.remove(os.path.join(tempdir, f))
- >>> os.rmdir(tempdir)
- >>> nltk.data.clear_cache()
-
-
-Regression Testing from book
-============================
-
- >>> logic._counter._value = 0
-
- >>> from nltk.tag import RegexpTagger
- >>> tagger = RegexpTagger(
- ... [('^(chases|runs)$', 'VB'),
- ... ('^(a)$', 'ex_quant'),
- ... ('^(every)$', 'univ_quant'),
- ... ('^(dog|boy)$', 'NN'),
- ... ('^(He)$', 'PRP')
- ... ])
- >>> rc = DrtGlueReadingCommand(depparser=MaltParser(tagger=tagger))
- >>> dt = DiscourseTester(map(str.split, ['Every dog chases a boy', 'He runs']), rc)
- >>> dt.readings()
- <BLANKLINE>
- s0 readings:
- <BLANKLINE>
- s0-r0: ([z2],[boy(z2), (([z5],[dog(z5)]) -> ([],[chases(z5,z2)]))])
- s0-r1: ([],[(([z1],[dog(z1)]) -> ([z2],[boy(z2), chases(z1,z2)]))])
- <BLANKLINE>
- s1 readings:
- <BLANKLINE>
- s1-r0: ([z1],[PRO(z1), runs(z1)])
- >>> dt.readings(show_thread_readings=True)
- d0: ['s0-r0', 's1-r0'] : ([z1,z2],[boy(z1), (([z3],[dog(z3)]) -> ([],[chases(z3,z1)])), (z2 = z1), runs(z2)])
- d1: ['s0-r1', 's1-r0'] : INVALID: AnaphoraResolutionException
- >>> dt.readings(filter=True, show_thread_readings=True)
- d0: ['s0-r0', 's1-r0'] : ([z1,z3],[boy(z1), (([z2],[dog(z2)]) -> ([],[chases(z2,z1)])), (z3 = z1), runs(z3)])
-
- >>> logic._counter._value = 0
-
- >>> from nltk.parse import FeatureEarleyChartParser
- >>> from nltk.sem.drt import DrtParser
- >>> grammar = nltk.data.load('grammars/book_grammars/drt.fcfg', logic_parser=DrtParser())
- >>> parser = FeatureEarleyChartParser(grammar, trace=0)
- >>> trees = parser.parse('Angus owns a dog'.split())
- >>> print(list(trees)[0].label()['SEM'].simplify().normalize())
- ([z1,z2],[Angus(z1), dog(z2), own(z1,z2)])
+++ /dev/null
-# -*- coding: utf-8 -*-
-
-
-# FIXME: the entire discourse.doctest is skipped if Prover9/Mace4 is
-# not installed, but there are pure-python parts that don't need Prover9.
-def setup_module(module):
- from nose import SkipTest
- from nltk.inference.mace import Mace
-
- try:
- m = Mace()
- m._find_binary("mace4")
- except LookupError:
- raise SkipTest("Mace4/Prover9 is not available so discourse.doctest is skipped")
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-================================
- Discourse Representation Theory
-================================
-
- >>> from nltk.sem import logic
- >>> from nltk.inference import TableauProver
-
-Overview
-========
-
-A DRS can be created with the ``DRS()`` constructor. This takes two arguments: a list of
-discourse referents and a list of conditions.
-
- >>> from nltk.sem.drt import *
- >>> dexpr = DrtExpression.fromstring
- >>> man_x = dexpr('man(x)')
- >>> walk_x = dexpr('walk(x)')
- >>> x = dexpr('x')
- >>> print(DRS([x], [man_x, walk_x]))
- ([x],[man(x), walk(x)])
-
-DRSs can also be parsed directly from strings with
-``DrtExpression.fromstring`` (bound to ``dexpr`` above), which allows
-them to be specified more easily.
-
- >>> drs1 = dexpr('([x],[man(x),walk(x)])')
- >>> print(drs1)
- ([x],[man(x), walk(x)])
-
-DRSs can be *merged* using the ``+`` operator.
-
- >>> drs2 = dexpr('([y],[woman(y),stop(y)])')
- >>> drs3 = drs1 + drs2
- >>> print(drs3)
- (([x],[man(x), walk(x)]) + ([y],[woman(y), stop(y)]))
- >>> print(drs3.simplify())
- ([x,y],[man(x), walk(x), woman(y), stop(y)])
-
-We can embed DRSs as components of an ``implies`` condition.
-
- >>> s = '([], [(%s -> %s)])' % (drs1, drs2)
- >>> print(dexpr(s))
- ([],[(([x],[man(x), walk(x)]) -> ([y],[woman(y), stop(y)]))])
-
-The ``fol()`` method converts DRSs into FOL formulae.
-
- >>> print(dexpr(r'([x],[man(x), walks(x)])').fol())
- exists x.(man(x) & walks(x))
- >>> print(dexpr(r'([],[(([x],[man(x)]) -> ([],[walks(x)]))])').fol())
- all x.(man(x) -> walks(x))
-
-In order to visualize a DRS, the ``pretty_format()`` method can be used.
-
- >>> print(drs3.pretty_format())
- _________ __________
- | x | | y |
- (|---------| + |----------|)
- | man(x) | | woman(y) |
- | walk(x) | | stop(y) |
- |_________| |__________|
-
-
-Parse to semantics
-------------------
-
-..
- >>> logic._counter._value = 0
-
-DRSs can be used for building compositional semantics in a
-feature-based grammar. To specify that we want to use DRSs, the
-appropriate logic parser needs to be passed as a parameter to
-``load_parser()``:
-
- >>> from nltk.parse import load_parser
- >>> from nltk.sem.drt import DrtParser
- >>> parser = load_parser('grammars/book_grammars/drt.fcfg', trace=0, logic_parser=DrtParser())
- >>> for tree in parser.parse('a dog barks'.split()):
- ... print(tree.label()['SEM'].simplify())
- ...
- ([x],[dog(x), bark(x)])
-
-Alternatively, a ``FeatStructReader`` can be passed with the ``logic_parser`` set on it.
-
- >>> from nltk.featstruct import FeatStructReader
- >>> from nltk.grammar import FeatStructNonterminal
- >>> parser = load_parser('grammars/book_grammars/drt.fcfg', trace=0, fstruct_reader=FeatStructReader(fdict_class=FeatStructNonterminal, logic_parser=DrtParser()))
- >>> for tree in parser.parse('every girl chases a dog'.split()):
- ... print(tree.label()['SEM'].simplify().normalize())
- ...
- ([],[(([z1],[girl(z1)]) -> ([z2],[dog(z2), chase(z1,z2)]))])
-
-
-
-Unit Tests
-==========
-
-Parser
-------
-
- >>> print(dexpr(r'([x,y],[sees(x,y)])'))
- ([x,y],[sees(x,y)])
- >>> print(dexpr(r'([x],[man(x), walks(x)])'))
- ([x],[man(x), walks(x)])
- >>> print(dexpr(r'\x.([],[man(x), walks(x)])'))
- \x.([],[man(x), walks(x)])
- >>> print(dexpr(r'\x.\y.([],[sees(x,y)])'))
- \x y.([],[sees(x,y)])
-
- >>> print(dexpr(r'([x,y],[(x = y)])'))
- ([x,y],[(x = y)])
- >>> print(dexpr(r'([x,y],[(x != y)])'))
- ([x,y],[-(x = y)])
-
- >>> print(dexpr(r'\x.([],[walks(x)])(john)'))
- (\x.([],[walks(x)]))(john)
- >>> print(dexpr(r'\R.\x.([],[big(x,R)])(\y.([],[mouse(y)]))'))
- (\R x.([],[big(x,R)]))(\y.([],[mouse(y)]))
-
- >>> print(dexpr(r'(([x],[walks(x)]) + ([y],[runs(y)]))'))
- (([x],[walks(x)]) + ([y],[runs(y)]))
- >>> print(dexpr(r'(([x,y],[walks(x), jumps(y)]) + (([z],[twos(z)]) + ([w],[runs(w)])))'))
- (([x,y],[walks(x), jumps(y)]) + ([z],[twos(z)]) + ([w],[runs(w)]))
- >>> print(dexpr(r'((([],[walks(x)]) + ([],[twos(x)])) + ([],[runs(x)]))'))
- (([],[walks(x)]) + ([],[twos(x)]) + ([],[runs(x)]))
- >>> print(dexpr(r'((([],[walks(x)]) + ([],[runs(x)])) + (([],[threes(x)]) + ([],[fours(x)])))'))
- (([],[walks(x)]) + ([],[runs(x)]) + ([],[threes(x)]) + ([],[fours(x)]))
-
- >>> print(dexpr(r'(([],[walks(x)]) -> ([],[runs(x)]))'))
- (([],[walks(x)]) -> ([],[runs(x)]))
-
- >>> print(dexpr(r'([x],[PRO(x), sees(John,x)])'))
- ([x],[PRO(x), sees(John,x)])
- >>> print(dexpr(r'([x],[man(x), -([],[walks(x)])])'))
- ([x],[man(x), -([],[walks(x)])])
- >>> print(dexpr(r'([],[(([x],[man(x)]) -> ([],[walks(x)]))])'))
- ([],[(([x],[man(x)]) -> ([],[walks(x)]))])
-
- >>> print(dexpr(r'DRS([x],[walk(x)])'))
- ([x],[walk(x)])
- >>> print(dexpr(r'DRS([x][walk(x)])'))
- ([x],[walk(x)])
- >>> print(dexpr(r'([x][walk(x)])'))
- ([x],[walk(x)])
-
-``simplify()``
---------------
-
- >>> print(dexpr(r'\x.([],[man(x), walks(x)])(john)').simplify())
- ([],[man(john), walks(john)])
- >>> print(dexpr(r'\x.\y.([z],[dog(z),sees(x,y)])(john)(mary)').simplify())
- ([z],[dog(z), sees(john,mary)])
- >>> print(dexpr(r'\R x.([],[big(x,R)])(\y.([],[mouse(y)]))').simplify())
- \x.([],[big(x,\y.([],[mouse(y)]))])
-
- >>> print(dexpr(r'(([x],[walks(x)]) + ([y],[runs(y)]))').simplify())
- ([x,y],[walks(x), runs(y)])
- >>> print(dexpr(r'(([x,y],[walks(x), jumps(y)]) + (([z],[twos(z)]) + ([w],[runs(w)])))').simplify())
- ([w,x,y,z],[walks(x), jumps(y), twos(z), runs(w)])
- >>> print(dexpr(r'((([],[walks(x)]) + ([],[runs(x)]) + ([],[threes(x)]) + ([],[fours(x)])))').simplify())
- ([],[walks(x), runs(x), threes(x), fours(x)])
- >>> dexpr(r'([x],[man(x)])+([x],[walks(x)])').simplify() == \
- ... dexpr(r'([x,z1],[man(x), walks(z1)])')
- True
- >>> dexpr(r'([y],[boy(y), (([x],[dog(x)]) -> ([],[chase(x,y)]))])+([x],[run(x)])').simplify() == \
- ... dexpr(r'([y,z1],[boy(y), (([x],[dog(x)]) -> ([],[chase(x,y)])), run(z1)])')
- True
-
- >>> dexpr(r'\Q.(([x],[john(x),walks(x)]) + Q)(([x],[PRO(x),leaves(x)]))').simplify() == \
- ... dexpr(r'([x,z1],[john(x), walks(x), PRO(z1), leaves(z1)])')
- True
-
- >>> logic._counter._value = 0
- >>> print(dexpr('([],[(([x],[dog(x)]) -> ([e,y],[boy(y), chase(e), subj(e,x), obj(e,y)]))])+([e,x],[PRO(x), run(e), subj(e,x)])').simplify().normalize().normalize())
- ([e02,z5],[(([z3],[dog(z3)]) -> ([e01,z4],[boy(z4), chase(e01), subj(e01,z3), obj(e01,z4)])), PRO(z5), run(e02), subj(e02,z5)])
-
-``fol()``
------------
-
- >>> print(dexpr(r'([x,y],[sees(x,y)])').fol())
- exists x y.sees(x,y)
- >>> print(dexpr(r'([x],[man(x), walks(x)])').fol())
- exists x.(man(x) & walks(x))
- >>> print(dexpr(r'\x.([],[man(x), walks(x)])').fol())
- \x.(man(x) & walks(x))
- >>> print(dexpr(r'\x y.([],[sees(x,y)])').fol())
- \x y.sees(x,y)
-
- >>> print(dexpr(r'\x.([],[walks(x)])(john)').fol())
- \x.walks(x)(john)
- >>> print(dexpr(r'\R x.([],[big(x,R)])(\y.([],[mouse(y)]))').fol())
- (\R x.big(x,R))(\y.mouse(y))
-
- >>> print(dexpr(r'(([x],[walks(x)]) + ([y],[runs(y)]))').fol())
- (exists x.walks(x) & exists y.runs(y))
-
- >>> print(dexpr(r'(([],[walks(x)]) -> ([],[runs(x)]))').fol())
- (walks(x) -> runs(x))
-
- >>> print(dexpr(r'([x],[PRO(x), sees(John,x)])').fol())
- exists x.(PRO(x) & sees(John,x))
- >>> print(dexpr(r'([x],[man(x), -([],[walks(x)])])').fol())
- exists x.(man(x) & -walks(x))
- >>> print(dexpr(r'([],[(([x],[man(x)]) -> ([],[walks(x)]))])').fol())
- all x.(man(x) -> walks(x))
-
- >>> print(dexpr(r'([x],[man(x) | walks(x)])').fol())
- exists x.(man(x) | walks(x))
- >>> print(dexpr(r'P(x) + ([x],[walks(x)])').fol())
- (P(x) & exists x.walks(x))
-
-``resolve_anaphora()``
-----------------------
-
- >>> from nltk.sem.drt import AnaphoraResolutionException
-
- >>> print(resolve_anaphora(dexpr(r'([x,y,z],[dog(x), cat(y), walks(z), PRO(z)])')))
- ([x,y,z],[dog(x), cat(y), walks(z), (z = [x,y])])
- >>> print(resolve_anaphora(dexpr(r'([],[(([x],[dog(x)]) -> ([y],[walks(y), PRO(y)]))])')))
- ([],[(([x],[dog(x)]) -> ([y],[walks(y), (y = x)]))])
- >>> print(resolve_anaphora(dexpr(r'(([x,y],[]) + ([],[PRO(x)]))')).simplify())
- ([x,y],[(x = y)])
- >>> try: print(resolve_anaphora(dexpr(r'([x],[walks(x), PRO(x)])')))
- ... except AnaphoraResolutionException as e: print(e)
- Variable 'x' does not resolve to anything.
- >>> print(resolve_anaphora(dexpr('([e01,z6,z7],[boy(z6), PRO(z7), run(e01), subj(e01,z7)])')))
- ([e01,z6,z7],[boy(z6), (z7 = z6), run(e01), subj(e01,z7)])
-
-``equiv()``:
-----------------
-
- >>> a = dexpr(r'([x],[man(x), walks(x)])')
- >>> b = dexpr(r'([x],[walks(x), man(x)])')
- >>> print(a.equiv(b, TableauProver()))
- True
-
-
-``replace()``:
---------------
-
- >>> a = dexpr(r'a')
- >>> w = dexpr(r'w')
- >>> x = dexpr(r'x')
- >>> y = dexpr(r'y')
- >>> z = dexpr(r'z')
-
-
-replace bound
--------------
-
- >>> print(dexpr(r'([x],[give(x,y,z)])').replace(x.variable, a, False))
- ([x],[give(x,y,z)])
- >>> print(dexpr(r'([x],[give(x,y,z)])').replace(x.variable, a, True))
- ([a],[give(a,y,z)])
-
-replace unbound
----------------
-
- >>> print(dexpr(r'([x],[give(x,y,z)])').replace(y.variable, a, False))
- ([x],[give(x,a,z)])
- >>> print(dexpr(r'([x],[give(x,y,z)])').replace(y.variable, a, True))
- ([x],[give(x,a,z)])
-
-replace unbound with bound
---------------------------
-
- >>> dexpr(r'([x],[give(x,y,z)])').replace(y.variable, x, False) == \
- ... dexpr('([z1],[give(z1,x,z)])')
- True
- >>> dexpr(r'([x],[give(x,y,z)])').replace(y.variable, x, True) == \
- ... dexpr('([z1],[give(z1,x,z)])')
- True
-
-replace unbound with unbound
-----------------------------
-
- >>> print(dexpr(r'([x],[give(x,y,z)])').replace(y.variable, z, False))
- ([x],[give(x,z,z)])
- >>> print(dexpr(r'([x],[give(x,y,z)])').replace(y.variable, z, True))
- ([x],[give(x,z,z)])
-
-
-replace unbound
----------------
-
- >>> print(dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,z)])').replace(z.variable, a, False))
- (([x],[P(x,y,a)]) + ([y],[Q(x,y,a)]))
- >>> print(dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,z)])').replace(z.variable, a, True))
- (([x],[P(x,y,a)]) + ([y],[Q(x,y,a)]))
-
-replace bound
--------------
-
- >>> print(dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,z)])').replace(x.variable, a, False))
- (([x],[P(x,y,z)]) + ([y],[Q(x,y,z)]))
- >>> print(dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,z)])').replace(x.variable, a, True))
- (([a],[P(a,y,z)]) + ([y],[Q(a,y,z)]))
-
-replace unbound with unbound
-----------------------------
-
- >>> print(dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,z)])').replace(z.variable, a, False))
- (([x],[P(x,y,a)]) + ([y],[Q(x,y,a)]))
- >>> print(dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,z)])').replace(z.variable, a, True))
- (([x],[P(x,y,a)]) + ([y],[Q(x,y,a)]))
-
-replace unbound with bound on same side
----------------------------------------
-
- >>> dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,w)])').replace(z.variable, x, False) == \
- ... dexpr(r'(([z1],[P(z1,y,x)]) + ([y],[Q(z1,y,w)]))')
- True
- >>> dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,w)])').replace(z.variable, x, True) == \
- ... dexpr(r'(([z1],[P(z1,y,x)]) + ([y],[Q(z1,y,w)]))')
- True
-
-replace unbound with bound on other side
-----------------------------------------
-
- >>> dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,w)])').replace(w.variable, x, False) == \
- ... dexpr(r'(([z1],[P(z1,y,z)]) + ([y],[Q(z1,y,x)]))')
- True
- >>> dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,w)])').replace(w.variable, x, True) == \
- ... dexpr(r'(([z1],[P(z1,y,z)]) + ([y],[Q(z1,y,x)]))')
- True
-
-replace unbound with double bound
----------------------------------
-
- >>> dexpr(r'([x],[P(x,y,z)])+([x],[Q(x,y,w)])').replace(z.variable, x, False) == \
- ... dexpr(r'(([z1],[P(z1,y,x)]) + ([z1],[Q(z1,y,w)]))')
- True
- >>> dexpr(r'([x],[P(x,y,z)])+([x],[Q(x,y,w)])').replace(z.variable, x, True) == \
- ... dexpr(r'(([z1],[P(z1,y,x)]) + ([z1],[Q(z1,y,w)]))')
- True
-
-
-regression tests
-----------------
-
- >>> d = dexpr('([x],[A(c), ([y],[B(x,y,z,a)])->([z],[C(x,y,z,a)])])')
- >>> print(d)
- ([x],[A(c), (([y],[B(x,y,z,a)]) -> ([z],[C(x,y,z,a)]))])
- >>> print(d.pretty_format())
- ____________________________________
- | x |
- |------------------------------------|
- | A(c) |
- | ____________ ____________ |
- | | y | | z | |
- | (|------------| -> |------------|) |
- | | B(x,y,z,a) | | C(x,y,z,a) | |
- | |____________| |____________| |
- |____________________________________|
- >>> print(str(d))
- ([x],[A(c), (([y],[B(x,y,z,a)]) -> ([z],[C(x,y,z,a)]))])
- >>> print(d.fol())
- exists x.(A(c) & all y.(B(x,y,z,a) -> exists z.C(x,y,z,a)))
- >>> print(d.replace(Variable('a'), DrtVariableExpression(Variable('r'))))
- ([x],[A(c), (([y],[B(x,y,z,r)]) -> ([z],[C(x,y,z,r)]))])
- >>> print(d.replace(Variable('x'), DrtVariableExpression(Variable('r'))))
- ([x],[A(c), (([y],[B(x,y,z,a)]) -> ([z],[C(x,y,z,a)]))])
- >>> print(d.replace(Variable('y'), DrtVariableExpression(Variable('r'))))
- ([x],[A(c), (([y],[B(x,y,z,a)]) -> ([z],[C(x,y,z,a)]))])
- >>> print(d.replace(Variable('z'), DrtVariableExpression(Variable('r'))))
- ([x],[A(c), (([y],[B(x,y,r,a)]) -> ([z],[C(x,y,z,a)]))])
- >>> print(d.replace(Variable('x'), DrtVariableExpression(Variable('r')), True))
- ([r],[A(c), (([y],[B(r,y,z,a)]) -> ([z],[C(r,y,z,a)]))])
- >>> print(d.replace(Variable('y'), DrtVariableExpression(Variable('r')), True))
- ([x],[A(c), (([r],[B(x,r,z,a)]) -> ([z],[C(x,r,z,a)]))])
- >>> print(d.replace(Variable('z'), DrtVariableExpression(Variable('r')), True))
- ([x],[A(c), (([y],[B(x,y,r,a)]) -> ([r],[C(x,y,r,a)]))])
- >>> print(d == dexpr('([l],[A(c), ([m],[B(l,m,z,a)])->([n],[C(l,m,n,a)])])'))
- True
- >>> d = dexpr('([],[([x,y],[B(x,y,h), ([a,b],[dee(x,a,g)])])->([z,w],[cee(x,y,f), ([c,d],[E(x,c,d,e)])])])')
- >>> sorted(d.free())
- [Variable('B'), Variable('E'), Variable('e'), Variable('f'), Variable('g'), Variable('h')]
- >>> sorted(d.variables())
- [Variable('B'), Variable('E'), Variable('e'), Variable('f'), Variable('g'), Variable('h')]
- >>> sorted(d.get_refs(True))
- [Variable('a'), Variable('b'), Variable('c'), Variable('d'), Variable('w'), Variable('x'), Variable('y'), Variable('z')]
- >>> sorted(d.conds[0].get_refs(False))
- [Variable('x'), Variable('y')]
- >>> print(dexpr('([x,y],[A(x,y), (x=y), ([],[B(x,y)])->([],[C(x,y)]), ([x,y],[D(x,y)])->([],[E(x,y)]), ([],[F(x,y)])->([x,y],[G(x,y)])])').eliminate_equality())
- ([x],[A(x,x), (([],[B(x,x)]) -> ([],[C(x,x)])), (([x,y],[D(x,y)]) -> ([],[E(x,y)])), (([],[F(x,x)]) -> ([x,y],[G(x,y)]))])
- >>> print(dexpr('([x,y],[A(x,y), (x=y)]) -> ([],[B(x,y)])').eliminate_equality())
- (([x],[A(x,x)]) -> ([],[B(x,x)]))
- >>> print(dexpr('([x,y],[A(x,y)]) -> ([],[B(x,y), (x=y)])').eliminate_equality())
- (([x,y],[A(x,y)]) -> ([],[B(x,x)]))
- >>> print(dexpr('([x,y],[A(x,y), (x=y), ([],[B(x,y)])])').eliminate_equality())
- ([x],[A(x,x), ([],[B(x,x)])])
- >>> print(dexpr('([x,y],[A(x,y), ([],[B(x,y), (x=y)])])').eliminate_equality())
- ([x,y],[A(x,y), ([],[B(x,x)])])
- >>> print(dexpr('([z8 z9 z10],[A(z8), z8=z10, z9=z10, B(z9), C(z10), D(z10)])').eliminate_equality())
- ([z9],[A(z9), B(z9), C(z9), D(z9)])
-
- >>> print(dexpr('([x,y],[A(x,y), (x=y), ([],[B(x,y)]), ([x,y],[C(x,y)])])').eliminate_equality())
- ([x],[A(x,x), ([],[B(x,x)]), ([x,y],[C(x,y)])])
- >>> print(dexpr('([x,y],[A(x,y)]) + ([],[B(x,y), (x=y)]) + ([],[C(x,y)])').eliminate_equality())
- ([x],[A(x,x), B(x,x), C(x,x)])
- >>> print(dexpr('([x,y],[B(x,y)])+([x,y],[C(x,y)])').replace(Variable('y'), DrtVariableExpression(Variable('x'))))
- (([x,y],[B(x,y)]) + ([x,y],[C(x,y)]))
- >>> print(dexpr('(([x,y],[B(x,y)])+([],[C(x,y)]))+([],[D(x,y)])').replace(Variable('y'), DrtVariableExpression(Variable('x'))))
- (([x,y],[B(x,y)]) + ([],[C(x,y)]) + ([],[D(x,y)]))
- >>> print(dexpr('(([],[B(x,y)])+([],[C(x,y)]))+([],[D(x,y)])').replace(Variable('y'), DrtVariableExpression(Variable('x'))))
- (([],[B(x,x)]) + ([],[C(x,x)]) + ([],[D(x,x)]))
- >>> print(dexpr('(([],[B(x,y), ([x,y],[A(x,y)])])+([],[C(x,y)]))+([],[D(x,y)])').replace(Variable('y'), DrtVariableExpression(Variable('x'))).normalize())
- (([],[B(z3,z1), ([z2,z3],[A(z3,z2)])]) + ([],[C(z3,z1)]) + ([],[D(z3,z1)]))
-
-
-Parse errors
-============
-
- >>> def parse_error(drtstring):
- ... try: dexpr(drtstring)
- ... except logic.LogicalExpressionException as e: print(e)
-
- >>> parse_error(r'')
- End of input found. Expression expected.
- <BLANKLINE>
- ^
- >>> parse_error(r'(')
- End of input found. Expression expected.
- (
- ^
- >>> parse_error(r'()')
- Unexpected token: ')'. Expression expected.
- ()
- ^
- >>> parse_error(r'([')
- End of input found. Expected token ']'.
- ([
- ^
- >>> parse_error(r'([,')
- ',' is an illegal variable name. Constants may not be quantified.
- ([,
- ^
- >>> parse_error(r'([x,')
- End of input found. Variable expected.
- ([x,
- ^
- >>> parse_error(r'([]')
- End of input found. Expected token '['.
- ([]
- ^
- >>> parse_error(r'([][')
- End of input found. Expected token ']'.
- ([][
- ^
- >>> parse_error(r'([][,')
- Unexpected token: ','. Expression expected.
- ([][,
- ^
- >>> parse_error(r'([][]')
- End of input found. Expected token ')'.
- ([][]
- ^
- >>> parse_error(r'([x][man(x)]) |')
- End of input found. Expression expected.
- ([x][man(x)]) |
- ^
-
-Pretty Printing
-===============
-
- >>> dexpr(r"([],[])").pretty_print()
- __
- | |
- |--|
- |__|
-
- >>> dexpr(r"([],[([x],[big(x), dog(x)]) -> ([],[bark(x)]) -([x],[walk(x)])])").pretty_print()
- _____________________________
- | |
- |-----------------------------|
- | ________ _________ |
- | | x | | | |
- | (|--------| -> |---------|) |
- | | big(x) | | bark(x) | |
- | | dog(x) | |_________| |
- | |________| |
- | _________ |
- | | x | |
- | __ |---------| |
- | | | walk(x) | |
- | |_________| |
- |_____________________________|
-
- >>> dexpr(r"([x,y],[x=y]) + ([z],[dog(z), walk(z)])").pretty_print()
- _________ _________
- | x y | | z |
- (|---------| + |---------|)
- | (x = y) | | dog(z) |
- |_________| | walk(z) |
- |_________|
-
- >>> dexpr(r"([],[([x],[]) | ([y],[]) | ([z],[dog(z), walk(z)])])").pretty_print()
- _______________________________
- | |
- |-------------------------------|
- | ___ ___ _________ |
- | | x | | y | | z | |
- | (|---| | |---| | |---------|) |
- | |___| |___| | dog(z) | |
- | | walk(z) | |
- | |_________| |
- |_______________________________|
-
- >>> dexpr(r"\P.\Q.(([x],[]) + P(x) + Q(x))(\x.([],[dog(x)]))").pretty_print()
- ___ ________
- \ | x | \ | |
- /\ P Q.(|---| + P(x) + Q(x))( /\ x.|--------|)
- |___| | dog(x) |
- |________|
-
-
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-=========================
- Feature Grammar Parsing
-=========================
-
-.. include:: ../../../nltk_book/definitions.rst
-
-Grammars can be parsed from strings.
-
- >>> import nltk
- >>> from nltk import grammar, parse
- >>> g = """
- ... % start DP
- ... DP[AGR=?a] -> D[AGR=?a] N[AGR=?a]
- ... D[AGR=[NUM='sg', PERS=3]] -> 'this' | 'that'
- ... D[AGR=[NUM='pl', PERS=3]] -> 'these' | 'those'
- ... D[AGR=[NUM='pl', PERS=1]] -> 'we'
- ... D[AGR=[PERS=2]] -> 'you'
- ... N[AGR=[NUM='sg', GND='m']] -> 'boy'
- ... N[AGR=[NUM='pl', GND='m']] -> 'boys'
- ... N[AGR=[NUM='sg', GND='f']] -> 'girl'
- ... N[AGR=[NUM='pl', GND='f']] -> 'girls'
- ... N[AGR=[NUM='sg']] -> 'student'
- ... N[AGR=[NUM='pl']] -> 'students'
- ... """
- >>> grammar = grammar.FeatureGrammar.fromstring(g)
- >>> tokens = 'these girls'.split()
- >>> parser = parse.FeatureEarleyChartParser(grammar)
- >>> trees = parser.parse(tokens)
- >>> for tree in trees: print(tree)
- (DP[AGR=[GND='f', NUM='pl', PERS=3]]
- (D[AGR=[NUM='pl', PERS=3]] these)
- (N[AGR=[GND='f', NUM='pl']] girls))
-
-In general, when we are trying to develop even a very small grammar,
-it is convenient to put the rules in a file where they can be edited,
-tested and revised. Let's assume that we have saved feat0cfg_ as a file named
-``'feat0.fcfg'`` and placed it in the NLTK ``data`` directory. We can
-inspect it as follows:
-
-.. _feat0cfg: http://nltk.svn.sourceforge.net/svnroot/nltk/trunk/nltk/data/grammars/feat0.fcfg
-
- >>> nltk.data.show_cfg('grammars/book_grammars/feat0.fcfg')
- % start S
- # ###################
- # Grammar Productions
- # ###################
- # S expansion productions
- S -> NP[NUM=?n] VP[NUM=?n]
- # NP expansion productions
- NP[NUM=?n] -> N[NUM=?n]
- NP[NUM=?n] -> PropN[NUM=?n]
- NP[NUM=?n] -> Det[NUM=?n] N[NUM=?n]
- NP[NUM=pl] -> N[NUM=pl]
- # VP expansion productions
- VP[TENSE=?t, NUM=?n] -> IV[TENSE=?t, NUM=?n]
- VP[TENSE=?t, NUM=?n] -> TV[TENSE=?t, NUM=?n] NP
- # ###################
- # Lexical Productions
- # ###################
- Det[NUM=sg] -> 'this' | 'every'
- Det[NUM=pl] -> 'these' | 'all'
- Det -> 'the' | 'some' | 'several'
- PropN[NUM=sg]-> 'Kim' | 'Jody'
- N[NUM=sg] -> 'dog' | 'girl' | 'car' | 'child'
- N[NUM=pl] -> 'dogs' | 'girls' | 'cars' | 'children'
- IV[TENSE=pres, NUM=sg] -> 'disappears' | 'walks'
- TV[TENSE=pres, NUM=sg] -> 'sees' | 'likes'
- IV[TENSE=pres, NUM=pl] -> 'disappear' | 'walk'
- TV[TENSE=pres, NUM=pl] -> 'see' | 'like'
- IV[TENSE=past] -> 'disappeared' | 'walked'
- TV[TENSE=past] -> 'saw' | 'liked'
-
-Assuming we have saved feat0cfg_ as a file named
-``'feat0.fcfg'``, the function ``parse.load_parser`` allows us to
-read the grammar into NLTK, ready for use in parsing.
-
-
- >>> cp = parse.load_parser('grammars/book_grammars/feat0.fcfg', trace=1)
- >>> sent = 'Kim likes children'
- >>> tokens = sent.split()
- >>> tokens
- ['Kim', 'likes', 'children']
- >>> trees = cp.parse(tokens)
- |.Kim .like.chil.|
- |[----] . .| [0:1] 'Kim'
- |. [----] .| [1:2] 'likes'
- |. . [----]| [2:3] 'children'
- |[----] . .| [0:1] PropN[NUM='sg'] -> 'Kim' *
- |[----] . .| [0:1] NP[NUM='sg'] -> PropN[NUM='sg'] *
- |[----> . .| [0:1] S[] -> NP[NUM=?n] * VP[NUM=?n] {?n: 'sg'}
- |. [----] .| [1:2] TV[NUM='sg', TENSE='pres'] -> 'likes' *
- |. [----> .| [1:2] VP[NUM=?n, TENSE=?t] -> TV[NUM=?n, TENSE=?t] * NP[] {?n: 'sg', ?t: 'pres'}
- |. . [----]| [2:3] N[NUM='pl'] -> 'children' *
- |. . [----]| [2:3] NP[NUM='pl'] -> N[NUM='pl'] *
- |. . [---->| [2:3] S[] -> NP[NUM=?n] * VP[NUM=?n] {?n: 'pl'}
- |. [---------]| [1:3] VP[NUM='sg', TENSE='pres'] -> TV[NUM='sg', TENSE='pres'] NP[] *
- |[==============]| [0:3] S[] -> NP[NUM='sg'] VP[NUM='sg'] *
- >>> for tree in trees: print(tree)
- (S[]
- (NP[NUM='sg'] (PropN[NUM='sg'] Kim))
- (VP[NUM='sg', TENSE='pres']
- (TV[NUM='sg', TENSE='pres'] likes)
- (NP[NUM='pl'] (N[NUM='pl'] children))))
-
-The parser works directly with
-the underspecified productions given by the grammar. That is, the
-Predictor rule does not attempt to compile out all admissible feature
-combinations before trying to expand the non-terminals on the left hand
-side of a production. However, when the Scanner matches an input word
-against a lexical production that has been predicted, the new edge will
-typically contain fully specified features; e.g., the edge
-[PropN[`num`:feat: = `sg`:fval:] |rarr| 'Kim', (0, 1)]. Recall from
-Chapter 8 that the Fundamental (or Completer) Rule in
-standard CFGs is used to combine an incomplete edge that's expecting a
-nonterminal *B* with a following, complete edge whose left hand side
-matches *B*. In our current setting, rather than checking for a
-complete match, we test whether the expected category *B* will
-`unify`:dt: with the left hand side *B'* of a following complete
-edge. We will explain in more detail in Section 9.2 how
-unification works; for the moment, it is enough to know that as a
-result of unification, any variable values of features in *B* will be
-instantiated by constant values in the corresponding feature structure
-in *B'*, and these instantiated values will be used in the new edge
-added by the Completer. This instantiation can be seen, for example,
-in the edge
-[NP [`num`:feat:\ =\ `sg`:fval:] |rarr| PropN[`num`:feat:\ =\ `sg`:fval:] |dot|, (0, 1)]
-in Example 9.2, where the feature `num`:feat: has been assigned the value `sg`:fval:.
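-
-As a minimal, self-contained illustration of this instantiation step (a
-sketch using ``nltk.unify`` and ``nltk.FeatStruct``, both introduced
-just below, rather than the chart parser's internal edges), unifying an
-underspecified category with a fully specified one binds ``?n``:
-
- >>> import nltk
- >>> print(nltk.unify(nltk.FeatStruct("[NUM=?n]"), nltk.FeatStruct("[NUM='sg']")))
- [ NUM = 'sg' ]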
-
-Feature structures in NLTK are declared with the ``FeatStruct``
-constructor. Atomic feature values can be strings or integers.
-
- >>> fs1 = nltk.FeatStruct(TENSE='past', NUM='sg')
- >>> print(fs1)
- [ NUM = 'sg' ]
- [ TENSE = 'past' ]
-
-We can think of a feature structure as being like a Python dictionary,
-and access its values by indexing in the usual way.
-
- >>> fs1 = nltk.FeatStruct(PER=3, NUM='pl', GND='fem')
- >>> print(fs1['GND'])
- fem
-
-We can also define feature structures which have complex values, as
-discussed earlier.
-
- >>> fs2 = nltk.FeatStruct(POS='N', AGR=fs1)
- >>> print(fs2)
- [ [ GND = 'fem' ] ]
- [ AGR = [ NUM = 'pl' ] ]
- [ [ PER = 3 ] ]
- [ ]
- [ POS = 'N' ]
- >>> print(fs2['AGR'])
- [ GND = 'fem' ]
- [ NUM = 'pl' ]
- [ PER = 3 ]
- >>> print(fs2['AGR']['PER'])
- 3
-
-Feature structures can also be constructed by passing a string
-representation to the ``nltk.FeatStruct`` constructor. Note that in this
-case, atomic feature values do not need to be enclosed in quotes.
-
- >>> f1 = nltk.FeatStruct("[NUMBER = sg]")
- >>> f2 = nltk.FeatStruct("[PERSON = 3]")
- >>> print(nltk.unify(f1, f2))
- [ NUMBER = 'sg' ]
- [ PERSON = 3 ]
-
- >>> f1 = nltk.FeatStruct("[A = [B = b, D = d]]")
- >>> f2 = nltk.FeatStruct("[A = [C = c, D = d]]")
- >>> print(nltk.unify(f1, f2))
- [ [ B = 'b' ] ]
- [ A = [ C = 'c' ] ]
- [ [ D = 'd' ] ]
-
-
-Feature Structures as Graphs
-----------------------------
-
-Feature structures are not inherently tied to linguistic objects; they are
-general purpose structures for representing knowledge. For example, we
-could encode information about a person in a feature structure:
-
- >>> person01 = nltk.FeatStruct("[NAME=Lee, TELNO='01 27 86 42 96',AGE=33]")
- >>> print(person01)
- [ AGE = 33 ]
- [ NAME = 'Lee' ]
- [ TELNO = '01 27 86 42 96' ]
-
-There are a number of notations for representing reentrancy in
-matrix-style representations of feature structures. In NLTK, we adopt
-the following convention: the first occurrence of a shared feature structure
-is prefixed with an integer in parentheses, such as ``(1)``, and any
-subsequent reference to that structure uses the notation
-``->(1)``, as shown below.
-
-
- >>> fs = nltk.FeatStruct("""[NAME=Lee, ADDRESS=(1)[NUMBER=74, STREET='rue Pascal'],
- ... SPOUSE=[NAME=Kim, ADDRESS->(1)]]""")
- >>> print(fs)
- [ ADDRESS = (1) [ NUMBER = 74 ] ]
- [ [ STREET = 'rue Pascal' ] ]
- [ ]
- [ NAME = 'Lee' ]
- [ ]
- [ SPOUSE = [ ADDRESS -> (1) ] ]
- [ [ NAME = 'Kim' ] ]
-
-There can be any number of tags within a single feature structure.
-
- >>> fs3 = nltk.FeatStruct("[A=(1)[B=b], C=(2)[], D->(1), E->(2)]")
- >>> print(fs3)
- [ A = (1) [ B = 'b' ] ]
- [ ]
- [ C = (2) [] ]
- [ ]
- [ D -> (1) ]
- [ E -> (2) ]
- >>> fs1 = nltk.FeatStruct(NUMBER=74, STREET='rue Pascal')
- >>> fs2 = nltk.FeatStruct(CITY='Paris')
- >>> print(nltk.unify(fs1, fs2))
- [ CITY = 'Paris' ]
- [ NUMBER = 74 ]
- [ STREET = 'rue Pascal' ]
-
-Unification is symmetric:
-
- >>> nltk.unify(fs1, fs2) == nltk.unify(fs2, fs1)
- True
-
-Unification is associative:
-
- >>> fs3 = nltk.FeatStruct(TELNO='01 27 86 42 96')
- >>> nltk.unify(nltk.unify(fs1, fs2), fs3) == nltk.unify(fs1, nltk.unify(fs2, fs3))
- True
-
-Unification between `FS`:math:\ :subscript:`0` and `FS`:math:\
-:subscript:`1` will fail if the two feature structures share a path |pi|,
-but the value of |pi| in `FS`:math:\ :subscript:`0` is a distinct
-atom from the value of |pi| in `FS`:math:\ :subscript:`1`. In NLTK,
-this is implemented by setting the result of unification to be
-``None``.
-
- >>> fs0 = nltk.FeatStruct(A='a')
- >>> fs1 = nltk.FeatStruct(A='b')
- >>> print(nltk.unify(fs0, fs1))
- None
-
-Now, if we look at how unification interacts with structure-sharing,
-things become really interesting.
-
-
-
- >>> fs0 = nltk.FeatStruct("""[NAME=Lee,
- ... ADDRESS=[NUMBER=74,
- ... STREET='rue Pascal'],
- ... SPOUSE= [NAME=Kim,
- ... ADDRESS=[NUMBER=74,
- ... STREET='rue Pascal']]]""")
- >>> print(fs0)
- [ ADDRESS = [ NUMBER = 74 ] ]
- [ [ STREET = 'rue Pascal' ] ]
- [ ]
- [ NAME = 'Lee' ]
- [ ]
- [ [ ADDRESS = [ NUMBER = 74 ] ] ]
- [ SPOUSE = [ [ STREET = 'rue Pascal' ] ] ]
- [ [ ] ]
- [ [ NAME = 'Kim' ] ]
-
-
- >>> fs1 = nltk.FeatStruct("[SPOUSE=[ADDRESS=[CITY=Paris]]]")
- >>> print(nltk.unify(fs0, fs1))
- [ ADDRESS = [ NUMBER = 74 ] ]
- [ [ STREET = 'rue Pascal' ] ]
- [ ]
- [ NAME = 'Lee' ]
- [ ]
- [ [ [ CITY = 'Paris' ] ] ]
- [ [ ADDRESS = [ NUMBER = 74 ] ] ]
- [ SPOUSE = [ [ STREET = 'rue Pascal' ] ] ]
- [ [ ] ]
- [ [ NAME = 'Kim' ] ]
-
- >>> fs2 = nltk.FeatStruct("""[NAME=Lee, ADDRESS=(1)[NUMBER=74, STREET='rue Pascal'],
- ... SPOUSE=[NAME=Kim, ADDRESS->(1)]]""")
-
-
- >>> print(fs2)
- [ ADDRESS = (1) [ NUMBER = 74 ] ]
- [ [ STREET = 'rue Pascal' ] ]
- [ ]
- [ NAME = 'Lee' ]
- [ ]
- [ SPOUSE = [ ADDRESS -> (1) ] ]
- [ [ NAME = 'Kim' ] ]
-
-
- >>> print(nltk.unify(fs2, fs1))
- [ [ CITY = 'Paris' ] ]
- [ ADDRESS = (1) [ NUMBER = 74 ] ]
- [ [ STREET = 'rue Pascal' ] ]
- [ ]
- [ NAME = 'Lee' ]
- [ ]
- [ SPOUSE = [ ADDRESS -> (1) ] ]
- [ [ NAME = 'Kim' ] ]
-
-
- >>> fs1 = nltk.FeatStruct("[ADDRESS1=[NUMBER=74, STREET='rue Pascal']]")
- >>> fs2 = nltk.FeatStruct("[ADDRESS1=?x, ADDRESS2=?x]")
- >>> print(fs2)
- [ ADDRESS1 = ?x ]
- [ ADDRESS2 = ?x ]
- >>> print(nltk.unify(fs1, fs2))
- [ ADDRESS1 = (1) [ NUMBER = 74 ] ]
- [ [ STREET = 'rue Pascal' ] ]
- [ ]
- [ ADDRESS2 -> (1) ]
-
-
-
-
- >>> sent = 'who do you claim that you like'
- >>> tokens = sent.split()
- >>> cp = parse.load_parser('grammars/book_grammars/feat1.fcfg', trace=1)
- >>> trees = cp.parse(tokens)
- |.w.d.y.c.t.y.l.|
- |[-] . . . . . .| [0:1] 'who'
- |. [-] . . . . .| [1:2] 'do'
- |. . [-] . . . .| [2:3] 'you'
- |. . . [-] . . .| [3:4] 'claim'
- |. . . . [-] . .| [4:5] 'that'
- |. . . . . [-] .| [5:6] 'you'
- |. . . . . . [-]| [6:7] 'like'
- |# . . . . . . .| [0:0] NP[]/NP[] -> *
- |. # . . . . . .| [1:1] NP[]/NP[] -> *
- |. . # . . . . .| [2:2] NP[]/NP[] -> *
- |. . . # . . . .| [3:3] NP[]/NP[] -> *
- |. . . . # . . .| [4:4] NP[]/NP[] -> *
- |. . . . . # . .| [5:5] NP[]/NP[] -> *
- |. . . . . . # .| [6:6] NP[]/NP[] -> *
- |. . . . . . . #| [7:7] NP[]/NP[] -> *
- |[-] . . . . . .| [0:1] NP[+WH] -> 'who' *
- |[-> . . . . . .| [0:1] S[-INV] -> NP[] * VP[] {}
- |[-> . . . . . .| [0:1] S[-INV]/?x[] -> NP[] * VP[]/?x[] {}
- |[-> . . . . . .| [0:1] S[-INV] -> NP[] * S[]/NP[] {}
- |. [-] . . . . .| [1:2] V[+AUX] -> 'do' *
- |. [-> . . . . .| [1:2] S[+INV] -> V[+AUX] * NP[] VP[] {}
- |. [-> . . . . .| [1:2] S[+INV]/?x[] -> V[+AUX] * NP[] VP[]/?x[] {}
- |. [-> . . . . .| [1:2] VP[] -> V[+AUX] * VP[] {}
- |. [-> . . . . .| [1:2] VP[]/?x[] -> V[+AUX] * VP[]/?x[] {}
- |. . [-] . . . .| [2:3] NP[-WH] -> 'you' *
- |. . [-> . . . .| [2:3] S[-INV] -> NP[] * VP[] {}
- |. . [-> . . . .| [2:3] S[-INV]/?x[] -> NP[] * VP[]/?x[] {}
- |. . [-> . . . .| [2:3] S[-INV] -> NP[] * S[]/NP[] {}
- |. [---> . . . .| [1:3] S[+INV] -> V[+AUX] NP[] * VP[] {}
- |. [---> . . . .| [1:3] S[+INV]/?x[] -> V[+AUX] NP[] * VP[]/?x[] {}
- |. . . [-] . . .| [3:4] V[-AUX, SUBCAT='clause'] -> 'claim' *
- |. . . [-> . . .| [3:4] VP[] -> V[-AUX, SUBCAT='clause'] * SBar[] {}
- |. . . [-> . . .| [3:4] VP[]/?x[] -> V[-AUX, SUBCAT='clause'] * SBar[]/?x[] {}
- |. . . . [-] . .| [4:5] Comp[] -> 'that' *
- |. . . . [-> . .| [4:5] SBar[] -> Comp[] * S[-INV] {}
- |. . . . [-> . .| [4:5] SBar[]/?x[] -> Comp[] * S[-INV]/?x[] {}
- |. . . . . [-] .| [5:6] NP[-WH] -> 'you' *
- |. . . . . [-> .| [5:6] S[-INV] -> NP[] * VP[] {}
- |. . . . . [-> .| [5:6] S[-INV]/?x[] -> NP[] * VP[]/?x[] {}
- |. . . . . [-> .| [5:6] S[-INV] -> NP[] * S[]/NP[] {}
- |. . . . . . [-]| [6:7] V[-AUX, SUBCAT='trans'] -> 'like' *
- |. . . . . . [->| [6:7] VP[] -> V[-AUX, SUBCAT='trans'] * NP[] {}
- |. . . . . . [->| [6:7] VP[]/?x[] -> V[-AUX, SUBCAT='trans'] * NP[]/?x[] {}
- |. . . . . . [-]| [6:7] VP[]/NP[] -> V[-AUX, SUBCAT='trans'] NP[]/NP[] *
- |. . . . . [---]| [5:7] S[-INV]/NP[] -> NP[] VP[]/NP[] *
- |. . . . [-----]| [4:7] SBar[]/NP[] -> Comp[] S[-INV]/NP[] *
- |. . . [-------]| [3:7] VP[]/NP[] -> V[-AUX, SUBCAT='clause'] SBar[]/NP[] *
- |. . [---------]| [2:7] S[-INV]/NP[] -> NP[] VP[]/NP[] *
- |. [-----------]| [1:7] S[+INV]/NP[] -> V[+AUX] NP[] VP[]/NP[] *
- |[=============]| [0:7] S[-INV] -> NP[] S[]/NP[] *
-
- >>> trees = list(trees)
- >>> for tree in trees: print(tree)
- (S[-INV]
- (NP[+WH] who)
- (S[+INV]/NP[]
- (V[+AUX] do)
- (NP[-WH] you)
- (VP[]/NP[]
- (V[-AUX, SUBCAT='clause'] claim)
- (SBar[]/NP[]
- (Comp[] that)
- (S[-INV]/NP[]
- (NP[-WH] you)
- (VP[]/NP[] (V[-AUX, SUBCAT='trans'] like) (NP[]/NP[] )))))))
-
-A different parser should give the same parse trees, but perhaps in a different order:
-
- >>> cp2 = parse.load_parser('grammars/book_grammars/feat1.fcfg', trace=1,
- ... parser=parse.FeatureEarleyChartParser)
- >>> trees2 = cp2.parse(tokens)
- |.w.d.y.c.t.y.l.|
- |[-] . . . . . .| [0:1] 'who'
- |. [-] . . . . .| [1:2] 'do'
- |. . [-] . . . .| [2:3] 'you'
- |. . . [-] . . .| [3:4] 'claim'
- |. . . . [-] . .| [4:5] 'that'
- |. . . . . [-] .| [5:6] 'you'
- |. . . . . . [-]| [6:7] 'like'
- |> . . . . . . .| [0:0] S[-INV] -> * NP[] VP[] {}
- |> . . . . . . .| [0:0] S[-INV]/?x[] -> * NP[] VP[]/?x[] {}
- |> . . . . . . .| [0:0] S[-INV] -> * NP[] S[]/NP[] {}
- |> . . . . . . .| [0:0] S[-INV] -> * Adv[+NEG] S[+INV] {}
- |> . . . . . . .| [0:0] S[+INV] -> * V[+AUX] NP[] VP[] {}
- |> . . . . . . .| [0:0] S[+INV]/?x[] -> * V[+AUX] NP[] VP[]/?x[] {}
- |> . . . . . . .| [0:0] NP[+WH] -> * 'who' {}
- |[-] . . . . . .| [0:1] NP[+WH] -> 'who' *
- |[-> . . . . . .| [0:1] S[-INV] -> NP[] * VP[] {}
- |[-> . . . . . .| [0:1] S[-INV]/?x[] -> NP[] * VP[]/?x[] {}
- |[-> . . . . . .| [0:1] S[-INV] -> NP[] * S[]/NP[] {}
- |. > . . . . . .| [1:1] S[-INV]/?x[] -> * NP[] VP[]/?x[] {}
- |. > . . . . . .| [1:1] S[+INV]/?x[] -> * V[+AUX] NP[] VP[]/?x[] {}
- |. > . . . . . .| [1:1] V[+AUX] -> * 'do' {}
- |. > . . . . . .| [1:1] VP[]/?x[] -> * V[-AUX, SUBCAT='trans'] NP[]/?x[] {}
- |. > . . . . . .| [1:1] VP[]/?x[] -> * V[-AUX, SUBCAT='clause'] SBar[]/?x[] {}
- |. > . . . . . .| [1:1] VP[]/?x[] -> * V[+AUX] VP[]/?x[] {}
- |. > . . . . . .| [1:1] VP[] -> * V[-AUX, SUBCAT='intrans'] {}
- |. > . . . . . .| [1:1] VP[] -> * V[-AUX, SUBCAT='trans'] NP[] {}
- |. > . . . . . .| [1:1] VP[] -> * V[-AUX, SUBCAT='clause'] SBar[] {}
- |. > . . . . . .| [1:1] VP[] -> * V[+AUX] VP[] {}
- |. [-] . . . . .| [1:2] V[+AUX] -> 'do' *
- |. [-> . . . . .| [1:2] S[+INV]/?x[] -> V[+AUX] * NP[] VP[]/?x[] {}
- |. [-> . . . . .| [1:2] VP[]/?x[] -> V[+AUX] * VP[]/?x[] {}
- |. [-> . . . . .| [1:2] VP[] -> V[+AUX] * VP[] {}
- |. . > . . . . .| [2:2] VP[] -> * V[-AUX, SUBCAT='intrans'] {}
- |. . > . . . . .| [2:2] VP[] -> * V[-AUX, SUBCAT='trans'] NP[] {}
- |. . > . . . . .| [2:2] VP[] -> * V[-AUX, SUBCAT='clause'] SBar[] {}
- |. . > . . . . .| [2:2] VP[] -> * V[+AUX] VP[] {}
- |. . > . . . . .| [2:2] VP[]/?x[] -> * V[-AUX, SUBCAT='trans'] NP[]/?x[] {}
- |. . > . . . . .| [2:2] VP[]/?x[] -> * V[-AUX, SUBCAT='clause'] SBar[]/?x[] {}
- |. . > . . . . .| [2:2] VP[]/?x[] -> * V[+AUX] VP[]/?x[] {}
- |. . > . . . . .| [2:2] NP[-WH] -> * 'you' {}
- |. . [-] . . . .| [2:3] NP[-WH] -> 'you' *
- |. [---> . . . .| [1:3] S[+INV]/?x[] -> V[+AUX] NP[] * VP[]/?x[] {}
- |. . . > . . . .| [3:3] VP[]/?x[] -> * V[-AUX, SUBCAT='trans'] NP[]/?x[] {}
- |. . . > . . . .| [3:3] VP[]/?x[] -> * V[-AUX, SUBCAT='clause'] SBar[]/?x[] {}
- |. . . > . . . .| [3:3] VP[]/?x[] -> * V[+AUX] VP[]/?x[] {}
- |. . . > . . . .| [3:3] V[-AUX, SUBCAT='clause'] -> * 'claim' {}
- |. . . [-] . . .| [3:4] V[-AUX, SUBCAT='clause'] -> 'claim' *
- |. . . [-> . . .| [3:4] VP[]/?x[] -> V[-AUX, SUBCAT='clause'] * SBar[]/?x[] {}
- |. . . . > . . .| [4:4] SBar[]/?x[] -> * Comp[] S[-INV]/?x[] {}
- |. . . . > . . .| [4:4] Comp[] -> * 'that' {}
- |. . . . [-] . .| [4:5] Comp[] -> 'that' *
- |. . . . [-> . .| [4:5] SBar[]/?x[] -> Comp[] * S[-INV]/?x[] {}
- |. . . . . > . .| [5:5] S[-INV]/?x[] -> * NP[] VP[]/?x[] {}
- |. . . . . > . .| [5:5] NP[-WH] -> * 'you' {}
- |. . . . . [-] .| [5:6] NP[-WH] -> 'you' *
- |. . . . . [-> .| [5:6] S[-INV]/?x[] -> NP[] * VP[]/?x[] {}
- |. . . . . . > .| [6:6] VP[]/?x[] -> * V[-AUX, SUBCAT='trans'] NP[]/?x[] {}
- |. . . . . . > .| [6:6] VP[]/?x[] -> * V[-AUX, SUBCAT='clause'] SBar[]/?x[] {}
- |. . . . . . > .| [6:6] VP[]/?x[] -> * V[+AUX] VP[]/?x[] {}
- |. . . . . . > .| [6:6] V[-AUX, SUBCAT='trans'] -> * 'like' {}
- |. . . . . . [-]| [6:7] V[-AUX, SUBCAT='trans'] -> 'like' *
- |. . . . . . [->| [6:7] VP[]/?x[] -> V[-AUX, SUBCAT='trans'] * NP[]/?x[] {}
- |. . . . . . . #| [7:7] NP[]/NP[] -> *
- |. . . . . . [-]| [6:7] VP[]/NP[] -> V[-AUX, SUBCAT='trans'] NP[]/NP[] *
- |. . . . . [---]| [5:7] S[-INV]/NP[] -> NP[] VP[]/NP[] *
- |. . . . [-----]| [4:7] SBar[]/NP[] -> Comp[] S[-INV]/NP[] *
- |. . . [-------]| [3:7] VP[]/NP[] -> V[-AUX, SUBCAT='clause'] SBar[]/NP[] *
- |. [-----------]| [1:7] S[+INV]/NP[] -> V[+AUX] NP[] VP[]/NP[] *
- |[=============]| [0:7] S[-INV] -> NP[] S[]/NP[] *
-
- >>> sorted(trees) == sorted(trees2)
- True
-
-
-Let's load a German grammar:
-
- >>> cp = parse.load_parser('grammars/book_grammars/german.fcfg', trace=0)
- >>> sent = 'die Katze sieht den Hund'
- >>> tokens = sent.split()
- >>> trees = cp.parse(tokens)
- >>> for tree in trees: print(tree)
- (S[]
- (NP[AGR=[GND='fem', NUM='sg', PER=3], CASE='nom']
- (Det[AGR=[GND='fem', NUM='sg', PER=3], CASE='nom'] die)
- (N[AGR=[GND='fem', NUM='sg', PER=3]] Katze))
- (VP[AGR=[NUM='sg', PER=3]]
- (TV[AGR=[NUM='sg', PER=3], OBJCASE='acc'] sieht)
- (NP[AGR=[GND='masc', NUM='sg', PER=3], CASE='acc']
- (Det[AGR=[GND='masc', NUM='sg', PER=3], CASE='acc'] den)
- (N[AGR=[GND='masc', NUM='sg', PER=3]] Hund))))
-
-Grammar with Binding Operators
-------------------------------
-The `bindop.fcfg`_ grammar is a semantic grammar that uses lambda
-calculus. Each element has a core semantics, which is a single lambda
-calculus expression, and a set of binding operators, which bind
-variables.
-
-.. _bindop.fcfg: http://nltk.svn.sourceforge.net/svnroot/nltk/trunk/nltk/data/grammars/bindop.fcfg
-
-In order to make the binding operators work right, they need to
-instantiate their bound variable every time they are added to the
-chart. To do this, we use a special subclass of `Chart`, called
-`InstantiateVarsChart`.
-
- >>> from nltk.parse.featurechart import InstantiateVarsChart
- >>> cp = parse.load_parser('grammars/sample_grammars/bindop.fcfg', trace=1,
- ... chart_class=InstantiateVarsChart)
- >>> print(cp.grammar())
- Grammar with 15 productions (start state = S[])
- S[SEM=[BO={?b1+?b2}, CORE=<?vp(?subj)>]] -> NP[SEM=[BO=?b1, CORE=?subj]] VP[SEM=[BO=?b2, CORE=?vp]]
- VP[SEM=[BO={?b1+?b2}, CORE=<?v(?obj)>]] -> TV[SEM=[BO=?b1, CORE=?v]] NP[SEM=[BO=?b2, CORE=?obj]]
- VP[SEM=?s] -> IV[SEM=?s]
- NP[SEM=[BO={?b1+?b2+{bo(?det(?n),@x)}}, CORE=<@x>]] -> Det[SEM=[BO=?b1, CORE=?det]] N[SEM=[BO=?b2, CORE=?n]]
- Det[SEM=[BO={/}, CORE=<\Q P.exists x.(Q(x) & P(x))>]] -> 'a'
- N[SEM=[BO={/}, CORE=<dog>]] -> 'dog'
- N[SEM=[BO={/}, CORE=<dog>]] -> 'cat'
- N[SEM=[BO={/}, CORE=<dog>]] -> 'mouse'
- IV[SEM=[BO={/}, CORE=<\x.bark(x)>]] -> 'barks'
- IV[SEM=[BO={/}, CORE=<\x.bark(x)>]] -> 'eats'
- IV[SEM=[BO={/}, CORE=<\x.bark(x)>]] -> 'walks'
- TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] -> 'feeds'
- TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] -> 'walks'
- NP[SEM=[BO={bo(\P.P(John),@x)}, CORE=<@x>]] -> 'john'
- NP[SEM=[BO={bo(\P.P(John),@x)}, CORE=<@x>]] -> 'alex'
-
-A simple intransitive sentence:
-
- >>> from nltk.sem import logic
- >>> logic._counter._value = 100
-
- >>> trees = cp.parse('john barks'.split())
- |. john.barks.|
- |[-----] .| [0:1] 'john'
- |. [-----]| [1:2] 'barks'
- |[-----] .| [0:1] NP[SEM=[BO={bo(\P.P(John),z101)}, CORE=<z101>]] -> 'john' *
- |[-----> .| [0:1] S[SEM=[BO={?b1+?b2}, CORE=<?vp(?subj)>]] -> NP[SEM=[BO=?b1, CORE=?subj]] * VP[SEM=[BO=?b2, CORE=?vp]] {?b1: {bo(\P.P(John),z2)}, ?subj: <IndividualVariableExpression z2>}
- |. [-----]| [1:2] IV[SEM=[BO={/}, CORE=<\x.bark(x)>]] -> 'barks' *
- |. [-----]| [1:2] VP[SEM=[BO={/}, CORE=<\x.bark(x)>]] -> IV[SEM=[BO={/}, CORE=<\x.bark(x)>]] *
- |[===========]| [0:2] S[SEM=[BO={bo(\P.P(John),z2)}, CORE=<bark(z2)>]] -> NP[SEM=[BO={bo(\P.P(John),z2)}, CORE=<z2>]] VP[SEM=[BO={/}, CORE=<\x.bark(x)>]] *
- >>> for tree in trees: print(tree)
- (S[SEM=[BO={bo(\P.P(John),z2)}, CORE=<bark(z2)>]]
- (NP[SEM=[BO={bo(\P.P(John),z101)}, CORE=<z101>]] john)
- (VP[SEM=[BO={/}, CORE=<\x.bark(x)>]]
- (IV[SEM=[BO={/}, CORE=<\x.bark(x)>]] barks)))
-
-A transitive sentence:
-
- >>> trees = cp.parse('john feeds a dog'.split())
- |.joh.fee. a .dog.|
- |[---] . . .| [0:1] 'john'
- |. [---] . .| [1:2] 'feeds'
- |. . [---] .| [2:3] 'a'
- |. . . [---]| [3:4] 'dog'
- |[---] . . .| [0:1] NP[SEM=[BO={bo(\P.P(John),z102)}, CORE=<z102>]] -> 'john' *
- |[---> . . .| [0:1] S[SEM=[BO={?b1+?b2}, CORE=<?vp(?subj)>]] -> NP[SEM=[BO=?b1, CORE=?subj]] * VP[SEM=[BO=?b2, CORE=?vp]] {?b1: {bo(\P.P(John),z2)}, ?subj: <IndividualVariableExpression z2>}
- |. [---] . .| [1:2] TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] -> 'feeds' *
- |. [---> . .| [1:2] VP[SEM=[BO={?b1+?b2}, CORE=<?v(?obj)>]] -> TV[SEM=[BO=?b1, CORE=?v]] * NP[SEM=[BO=?b2, CORE=?obj]] {?b1: {/}, ?v: <LambdaExpression \x y.feed(y,x)>}
- |. . [---] .| [2:3] Det[SEM=[BO={/}, CORE=<\Q P.exists x.(Q(x) & P(x))>]] -> 'a' *
- |. . [---> .| [2:3] NP[SEM=[BO={?b1+?b2+{bo(?det(?n),@x)}}, CORE=<@x>]] -> Det[SEM=[BO=?b1, CORE=?det]] * N[SEM=[BO=?b2, CORE=?n]] {?b1: {/}, ?det: <LambdaExpression \Q P.exists x.(Q(x) & P(x))>}
- |. . . [---]| [3:4] N[SEM=[BO={/}, CORE=<dog>]] -> 'dog' *
- |. . [-------]| [2:4] NP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z103)}, CORE=<z103>]] -> Det[SEM=[BO={/}, CORE=<\Q P.exists x.(Q(x) & P(x))>]] N[SEM=[BO={/}, CORE=<dog>]] *
- |. . [------->| [2:4] S[SEM=[BO={?b1+?b2}, CORE=<?vp(?subj)>]] -> NP[SEM=[BO=?b1, CORE=?subj]] * VP[SEM=[BO=?b2, CORE=?vp]] {?b1: {bo(\P.exists x.(dog(x) & P(x)),z2)}, ?subj: <IndividualVariableExpression z2>}
- |. [-----------]| [1:4] VP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z2)}, CORE=<\y.feed(y,z2)>]] -> TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] NP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z2)}, CORE=<z2>]] *
- |[===============]| [0:4] S[SEM=[BO={bo(\P.P(John),z2), bo(\P.exists x.(dog(x) & P(x)),z3)}, CORE=<feed(z2,z3)>]] -> NP[SEM=[BO={bo(\P.P(John),z2)}, CORE=<z2>]] VP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z3)}, CORE=<\y.feed(y,z3)>]] *
-
- >>> for tree in trees: print(tree)
- (S[SEM=[BO={bo(\P.P(John),z2), bo(\P.exists x.(dog(x) & P(x)),z3)}, CORE=<feed(z2,z3)>]]
- (NP[SEM=[BO={bo(\P.P(John),z102)}, CORE=<z102>]] john)
- (VP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z2)}, CORE=<\y.feed(y,z2)>]]
- (TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] feeds)
- (NP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z103)}, CORE=<z103>]]
- (Det[SEM=[BO={/}, CORE=<\Q P.exists x.(Q(x) & P(x))>]] a)
- (N[SEM=[BO={/}, CORE=<dog>]] dog))))
-
-Turn down the verbosity:
-
- >>> cp = parse.load_parser('grammars/sample_grammars/bindop.fcfg', trace=0,
- ... chart_class=InstantiateVarsChart)
-
-Reuse the same lexical item twice:
-
- >>> trees = cp.parse('john feeds john'.split())
- >>> for tree in trees: print(tree)
- (S[SEM=[BO={bo(\P.P(John),z2), bo(\P.P(John),z3)}, CORE=<feed(z2,z3)>]]
- (NP[SEM=[BO={bo(\P.P(John),z104)}, CORE=<z104>]] john)
- (VP[SEM=[BO={bo(\P.P(John),z2)}, CORE=<\y.feed(y,z2)>]]
- (TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] feeds)
- (NP[SEM=[BO={bo(\P.P(John),z105)}, CORE=<z105>]] john)))
-
- >>> trees = cp.parse('a dog feeds a dog'.split())
- >>> for tree in trees: print(tree)
- (S[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z2), bo(\P.exists x.(dog(x) & P(x)),z3)}, CORE=<feed(z2,z3)>]]
- (NP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z106)}, CORE=<z106>]]
- (Det[SEM=[BO={/}, CORE=<\Q P.exists x.(Q(x) & P(x))>]] a)
- (N[SEM=[BO={/}, CORE=<dog>]] dog))
- (VP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z2)}, CORE=<\y.feed(y,z2)>]]
- (TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] feeds)
- (NP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z107)}, CORE=<z107>]]
- (Det[SEM=[BO={/}, CORE=<\Q P.exists x.(Q(x) & P(x))>]] a)
- (N[SEM=[BO={/}, CORE=<dog>]] dog))))
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-==================================
- Feature Structures & Unification
-==================================
- >>> from nltk.featstruct import FeatStruct
- >>> from nltk.sem.logic import Variable, VariableExpression, Expression
-
-.. note:: For now, featstruct uses the older lambdalogic semantics
- module. Eventually, it should be updated to use the new first
- order predicate logic module.
-
-Overview
-~~~~~~~~
-A feature structure is a mapping from feature identifiers to feature
-values, where feature values can be simple values (like strings or
-ints), nested feature structures, or variables:
-
- >>> fs1 = FeatStruct(number='singular', person=3)
- >>> print(fs1)
- [ number = 'singular' ]
- [ person = 3 ]
-
-Feature structure may be nested:
-
- >>> fs2 = FeatStruct(type='NP', agr=fs1)
- >>> print(fs2)
- [ agr = [ number = 'singular' ] ]
- [ [ person = 3 ] ]
- [ ]
- [ type = 'NP' ]
-
-Variables are used to indicate that two features should be assigned
-the same value. For example, the following feature structure requires
-that the feature fs3['agr']['number'] be bound to the same value as the
-feature fs3['subj']['number'].
-
- >>> fs3 = FeatStruct(agr=FeatStruct(number=Variable('?n')),
- ... subj=FeatStruct(number=Variable('?n')))
- >>> print(fs3)
- [ agr = [ number = ?n ] ]
- [ ]
- [ subj = [ number = ?n ] ]
-
-Feature structures are typically used to represent partial information
-about objects. A feature name that is not mapped to a value stands
-for a feature whose value is unknown (*not* a feature without a
-value). Two feature structures that represent (potentially
-overlapping) information about the same object can be combined by
-*unification*.
-
- >>> print(fs2.unify(fs3))
- [ agr = [ number = 'singular' ] ]
- [ [ person = 3 ] ]
- [ ]
- [ subj = [ number = 'singular' ] ]
- [ ]
- [ type = 'NP' ]
-
-When two inconsistent feature structures are unified, the unification
-fails and returns ``None``.
-
- >>> fs4 = FeatStruct(agr=FeatStruct(person=1))
- >>> print(fs4.unify(fs2))
- None
- >>> print(fs2.unify(fs4))
- None
-
-..
- >>> del fs1, fs2, fs3, fs4 # clean-up
-
-Feature Structure Types
------------------------
-There are actually two types of feature structure:
-
-- *feature dictionaries*, implemented by `FeatDict`, act like
- Python dictionaries. Feature identifiers may be strings or
- instances of the `Feature` class.
-- *feature lists*, implemented by `FeatList`, act like Python
- lists. Feature identifiers are integers.
-
-When you construct a feature structure using the `FeatStruct`
-constructor, it will automatically decide which type is appropriate:
-
- >>> type(FeatStruct(number='singular'))
- <class 'nltk.featstruct.FeatDict'>
- >>> type(FeatStruct([1,2,3]))
- <class 'nltk.featstruct.FeatList'>
-
-Usually, we will just use feature dictionaries; but sometimes feature
-lists can be useful too. Two feature lists will unify with each other
-only if they have equal lengths, and all of their feature values
-match. If you wish to write a feature list that contains 'unknown'
-values, you must use variables:
-
- >>> fs1 = FeatStruct([1,2,Variable('?y')])
- >>> fs2 = FeatStruct([1,Variable('?x'),3])
- >>> fs1.unify(fs2)
- [1, 2, 3]
-
-..
- >>> del fs1, fs2 # clean-up
-
-Parsing Feature Structure Strings
----------------------------------
-Feature structures can be constructed directly from strings. Often,
-this is more convenient than building them up with nested constructor
-calls. NLTK can
-parse most feature strings to produce the corresponding feature
-structures. (But you must restrict your base feature values to
-strings, ints, logic expressions (`nltk.sem.logic.Expression`), and a
-few other types discussed below).
-
-Feature dictionaries are written like Python dictionaries, except that
-keys are not put in quotes; and square brackets (``[]``) are used
-instead of braces (``{}``):
-
- >>> FeatStruct('[tense="past", agr=[number="sing", person=3]]')
- [agr=[number='sing', person=3], tense='past']
-
-If a feature value is a single alphanumeric word, then it does not
-need to be quoted -- it will be automatically treated as a string:
-
- >>> FeatStruct('[tense=past, agr=[number=sing, person=3]]')
- [agr=[number='sing', person=3], tense='past']
-
-Feature lists are written like python lists:
-
- >>> FeatStruct('[1, 2, 3]')
- [1, 2, 3]
-
-The expression ``[]`` is treated as an empty feature dictionary, not
-an empty feature list:
-
- >>> type(FeatStruct('[]'))
- <class 'nltk.featstruct.FeatDict'>
-
-Feature Paths
--------------
-Features can be specified using *feature paths*, or tuples of feature
-identifiers that specify a path through the nested feature structures to
-a value.
-
- >>> fs1 = FeatStruct('[x=1, y=[1,2,[z=3]]]')
- >>> fs1['y']
- [1, 2, [z=3]]
- >>> fs1['y', 2]
- [z=3]
- >>> fs1['y', 2, 'z']
- 3
-
-..
- >>> del fs1 # clean-up
-
-Reentrance
-----------
-Feature structures may contain reentrant feature values. A *reentrant
-feature value* is a single feature structure that can be accessed via
-multiple feature paths.
-
- >>> fs1 = FeatStruct(x='val')
- >>> fs2 = FeatStruct(a=fs1, b=fs1)
- >>> print(fs2)
- [ a = (1) [ x = 'val' ] ]
- [ ]
- [ b -> (1) ]
- >>> fs2
- [a=(1)[x='val'], b->(1)]
-
-As you can see, reentrance is displayed by marking a feature structure
-with a unique identifier, in this case ``(1)``, the first time it is
-encountered; and then using the special form ``var -> id`` whenever it
-is encountered again. You can use the same notation to directly
-create reentrant feature structures from strings.
-
- >>> FeatStruct('[a=(1)[], b->(1), c=[d->(1)]]')
- [a=(1)[], b->(1), c=[d->(1)]]
-
-Reentrant feature structures may contain cycles:
-
- >>> fs3 = FeatStruct('(1)[a->(1)]')
- >>> fs3['a', 'a', 'a', 'a']
- (1)[a->(1)]
- >>> fs3['a', 'a', 'a', 'a'] is fs3
- True
-
-Unification preserves the reentrance relations imposed by both of the
-unified feature structures. In the feature structure resulting from
-unification, any modifications to a reentrant feature value will be
-visible using any of its feature paths.
-
- >>> fs3.unify(FeatStruct('[a=[b=12], c=33]'))
- (1)[a->(1), b=12, c=33]
-
-..
- >>> del fs1, fs2, fs3 # clean-up
-
-Feature Structure Equality
---------------------------
-Two feature structures are considered equal if they assign the same
-values to all features, *and* they contain the same reentrances.
-
- >>> fs1 = FeatStruct('[a=(1)[x=1], b->(1)]')
- >>> fs2 = FeatStruct('[a=(1)[x=1], b->(1)]')
- >>> fs3 = FeatStruct('[a=[x=1], b=[x=1]]')
- >>> fs1 == fs1, fs1 is fs1
- (True, True)
- >>> fs1 == fs2, fs1 is fs2
- (True, False)
- >>> fs1 == fs3, fs1 is fs3
- (False, False)
-
-Note that this differs from how Python dictionaries and lists define
-equality -- in particular, Python dictionaries and lists ignore
-reentrance relations. To test two feature structures for equality
-while ignoring reentrance relations, use the `equal_values()` method:
-
- >>> fs1.equal_values(fs1)
- True
- >>> fs1.equal_values(fs2)
- True
- >>> fs1.equal_values(fs3)
- True
-
-..
- >>> del fs1, fs2, fs3 # clean-up
-
-Feature Value Sets & Feature Value Tuples
------------------------------------------
-`nltk.featstruct` defines two new data types that are intended to be
-used as feature values: `FeatureValueTuple` and `FeatureValueSet`.
-Both of these types are considered base values -- i.e., unification
-does *not* apply to them. However, variable binding *does* apply to
-any values that they contain.
-
-Feature value tuples are written with parentheses:
-
- >>> fs1 = FeatStruct('[x=(?x, ?y)]')
- >>> fs1
- [x=(?x, ?y)]
- >>> fs1.substitute_bindings({Variable('?x'): 1, Variable('?y'): 2})
- [x=(1, 2)]
-
-Feature sets are written with braces:
-
- >>> fs1 = FeatStruct('[x={?x, ?y}]')
- >>> fs1
- [x={?x, ?y}]
- >>> fs1.substitute_bindings({Variable('?x'): 1, Variable('?y'): 2})
- [x={1, 2}]
-
-In addition to the basic feature value tuple & set classes, nltk
-defines feature value unions (for sets) and feature value
-concatenations (for tuples). These are written using '+', and can be
-used to combine sets & tuples:
-
- >>> fs1 = FeatStruct('[x=((1, 2)+?z), z=?z]')
- >>> fs1
- [x=((1, 2)+?z), z=?z]
- >>> fs1.unify(FeatStruct('[z=(3, 4, 5)]'))
- [x=(1, 2, 3, 4, 5), z=(3, 4, 5)]
-
-Thus, feature value tuples and sets can be used to build up tuples
-and sets of values over the course of unification. For example, when
-parsing sentences using a semantic feature grammar, feature sets or
-feature tuples can be used to build a list of semantic predicates as
-the sentence is parsed.
-
-As was mentioned above, unification does not apply to feature value
-tuples and sets. One reason for this is that it's impossible to define a
-single correct answer for unification when concatenation is used.
-Consider the following example:
-
- >>> fs1 = FeatStruct('[x=(1, 2, 3, 4)]')
- >>> fs2 = FeatStruct('[x=(?a+?b), a=?a, b=?b]')
-
-If unification applied to feature tuples, then the unification
-algorithm would have to arbitrarily choose how to divide the tuple
-(1,2,3,4) into two parts. Instead, the unification algorithm refuses
-to make this decision, and simply unifies based on value. Because
-(1,2,3,4) is not equal to (?a+?b), fs1 and fs2 will not unify:
-
- >>> print(fs1.unify(fs2))
- None
-
-If you need a list-like structure that unification does apply to, use
-`FeatList`.
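-
-For instance, the feature-list example from `Feature Structure Types`_
-above unifies element-wise:
-
- >>> FeatStruct([1, 2, Variable('?y')]).unify(FeatStruct([1, Variable('?x'), 3]))
- [1, 2, 3]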
-
-..
- >>> del fs1, fs2 # clean-up
-
-Light-weight Feature Structures
--------------------------------
-Many of the functions defined by `nltk.featstruct` can be applied
-directly to simple Python dictionaries and lists, rather than to
-full-fledged `FeatDict` and `FeatList` objects. In other words,
-Python ``dicts`` and ``lists`` can be used as "light-weight" feature
-structures.
-
- >>> # Note: pprint prints dicts sorted
- >>> from pprint import pprint
- >>> from nltk.featstruct import unify
- >>> pprint(unify(dict(x=1, y=dict()), dict(a='a', y=dict(b='b'))))
- {'a': 'a', 'x': 1, 'y': {'b': 'b'}}
-
-However, you should keep in mind the following caveats:
-
-- Python dictionaries & lists ignore reentrance when checking for
- equality between values. But two FeatStructs with different
- reentrances are considered nonequal, even if all their base
- values are equal.
-
-- FeatStructs can be easily frozen, allowing them to be used as
- keys in hash tables. Python dictionaries and lists can not.
-
-- FeatStructs display reentrance in their string representations;
- Python dictionaries and lists do not.
-
-- FeatStructs may *not* be mixed with Python dictionaries and lists
- (e.g., when performing unification).
-
-- FeatStructs provide a number of useful methods, such as `walk()`
- and `cyclic()`, which are not available for Python dicts & lists.
-
-In general, if your feature structures will contain any reentrances,
-or if you plan to use them as dictionary keys, it is strongly
-recommended that you use full-fledged `FeatStruct` objects.
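-
-For instance, a minimal sketch of using a frozen feature structure as
-a dictionary key:
-
- >>> fs = FeatStruct(number='singular')
- >>> fs.freeze()
- >>> {fs: 'NP'}[fs]
- 'NP'
-
-..
- >>> del fs # clean-up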
-
-Custom Feature Values
----------------------
-The abstract base class `CustomFeatureValue` can be used to define new
-base value types that have custom unification methods. For example,
-the following feature value type encodes a range, and defines
-unification as taking the intersection on the ranges:
-
- >>> from functools import total_ordering
- >>> from nltk.featstruct import CustomFeatureValue, UnificationFailure
- >>> @total_ordering
- ... class Range(CustomFeatureValue):
- ... def __init__(self, low, high):
- ... assert low <= high
- ... self.low = low
- ... self.high = high
- ... def unify(self, other):
- ... if not isinstance(other, Range):
- ... return UnificationFailure
- ... low = max(self.low, other.low)
- ... high = min(self.high, other.high)
- ... if low <= high: return Range(low, high)
- ... else: return UnificationFailure
- ... def __repr__(self):
- ... return '(%s<x<%s)' % (self.low, self.high)
- ... def __eq__(self, other):
- ... if not isinstance(other, Range):
- ... return False
- ... return (self.low == other.low) and (self.high == other.high)
- ... def __lt__(self, other):
- ... if not isinstance(other, Range):
- ... return True
- ... return (self.low, self.high) < (other.low, other.high)
-
- >>> fs1 = FeatStruct(x=Range(5,8), y=FeatStruct(z=Range(7,22)))
- >>> print(fs1.unify(FeatStruct(x=Range(6, 22))))
- [ x = (6<x<8) ]
- [ ]
- [ y = [ z = (7<x<22) ] ]
- >>> print(fs1.unify(FeatStruct(x=Range(9, 12))))
- None
- >>> print(fs1.unify(FeatStruct(x=12)))
- None
- >>> print(fs1.unify(FeatStruct('[x=?x, y=[z=?x]]')))
- [ x = (7<x<8) ]
- [ ]
- [ y = [ z = (7<x<8) ] ]
-
-Regression Tests
-~~~~~~~~~~~~~~~~
-
-Dictionary access methods (non-mutating)
-----------------------------------------
-
- >>> fs1 = FeatStruct(a=1, b=2, c=3)
- >>> fs2 = FeatStruct(x=fs1, y='x')
-
-Feature structures support all dictionary methods (excluding the class
-method `dict.fromkeys()`). Non-mutating methods:
-
- >>> sorted(fs2.keys()) # keys()
- ['x', 'y']
- >>> sorted(fs2.values()) # values()
- [[a=1, b=2, c=3], 'x']
- >>> sorted(fs2.items()) # items()
- [('x', [a=1, b=2, c=3]), ('y', 'x')]
- >>> sorted(fs2) # __iter__()
- ['x', 'y']
- >>> 'a' in fs2, 'x' in fs2 # __contains__()
- (False, True)
- >>> fs2.has_key('a'), fs2.has_key('x') # has_key()
- (False, True)
- >>> fs2['x'], fs2['y'] # __getitem__()
- ([a=1, b=2, c=3], 'x')
- >>> fs2['a'] # __getitem__()
- Traceback (most recent call last):
- . . .
- KeyError: 'a'
- >>> fs2.get('x'), fs2.get('y'), fs2.get('a') # get()
- ([a=1, b=2, c=3], 'x', None)
- >>> fs2.get('x', 'hello'), fs2.get('a', 'hello') # get()
- ([a=1, b=2, c=3], 'hello')
- >>> len(fs1), len(fs2) # __len__
- (3, 2)
- >>> fs2.copy() # copy()
- [x=[a=1, b=2, c=3], y='x']
- >>> fs2.copy() is fs2 # copy()
- False
-
-Note: by default, `FeatStruct.copy()` does a deep copy. Use
-`FeatStruct.copy(deep=False)` for a shallow copy.
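-
-For instance, a minimal sketch of the difference:
-
- >>> fs2.copy()['x'] is fs1 # deep copy: nested values are copied
- False
- >>> fs2.copy(deep=False)['x'] is fs1 # shallow copy: nested values are shared
- True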
-
-..
- >>> del fs1, fs2 # clean-up.
-
-Dictionary access methods (mutating)
-------------------------------------
- >>> fs1 = FeatStruct(a=1, b=2, c=3)
- >>> fs2 = FeatStruct(x=fs1, y='x')
-
-Setting features (`__setitem__()`)
-
- >>> fs1['c'] = 5
- >>> fs1
- [a=1, b=2, c=5]
- >>> fs1['x'] = 12
- >>> fs1
- [a=1, b=2, c=5, x=12]
- >>> fs2['x', 'a'] = 2
- >>> fs2
- [x=[a=2, b=2, c=5, x=12], y='x']
- >>> fs1
- [a=2, b=2, c=5, x=12]
-
-Deleting features (`__delitem__()`)
-
- >>> del fs1['x']
- >>> fs1
- [a=2, b=2, c=5]
- >>> del fs2['x', 'a']
- >>> fs1
- [b=2, c=5]
-
-`setdefault()`:
-
- >>> fs1.setdefault('b', 99)
- 2
- >>> fs1
- [b=2, c=5]
- >>> fs1.setdefault('x', 99)
- 99
- >>> fs1
- [b=2, c=5, x=99]
-
-`update()`:
-
- >>> fs2.update({'a':'A', 'b':'B'}, c='C')
- >>> fs2
- [a='A', b='B', c='C', x=[b=2, c=5, x=99], y='x']
-
-`pop()`:
-
- >>> fs2.pop('a')
- 'A'
- >>> fs2
- [b='B', c='C', x=[b=2, c=5, x=99], y='x']
- >>> fs2.pop('a')
- Traceback (most recent call last):
- . . .
- KeyError: 'a'
- >>> fs2.pop('a', 'foo')
- 'foo'
- >>> fs2
- [b='B', c='C', x=[b=2, c=5, x=99], y='x']
-
-`clear()`:
-
- >>> fs1.clear()
- >>> fs1
- []
- >>> fs2
- [b='B', c='C', x=[], y='x']
-
-`popitem()`:
-
- >>> sorted([fs2.popitem() for i in range(len(fs2))])
- [('b', 'B'), ('c', 'C'), ('x', []), ('y', 'x')]
- >>> fs2
- []
-
-Once a feature structure has been frozen, it may not be mutated.
-
- >>> fs1 = FeatStruct('[x=1, y=2, z=[a=3]]')
- >>> fs1.freeze()
- >>> fs1.frozen()
- True
- >>> fs1['z'].frozen()
- True
-
- >>> fs1['x'] = 5
- Traceback (most recent call last):
- . . .
- ValueError: Frozen FeatStructs may not be modified.
- >>> del fs1['x']
- Traceback (most recent call last):
- . . .
- ValueError: Frozen FeatStructs may not be modified.
- >>> fs1.clear()
- Traceback (most recent call last):
- . . .
- ValueError: Frozen FeatStructs may not be modified.
- >>> fs1.pop('x')
- Traceback (most recent call last):
- . . .
- ValueError: Frozen FeatStructs may not be modified.
- >>> fs1.popitem()
- Traceback (most recent call last):
- . . .
- ValueError: Frozen FeatStructs may not be modified.
- >>> fs1.setdefault('x')
- Traceback (most recent call last):
- . . .
- ValueError: Frozen FeatStructs may not be modified.
- >>> fs1.update(z=22)
- Traceback (most recent call last):
- . . .
- ValueError: Frozen FeatStructs may not be modified.
-
-..
- >>> del fs1, fs2 # clean-up.
-
-Feature Paths
--------------
-Make sure that __getitem__ with feature paths works as intended:
-
- >>> fs1 = FeatStruct(a=1, b=2,
- ... c=FeatStruct(
- ... d=FeatStruct(e=12),
- ... f=FeatStruct(g=55, h='hello')))
- >>> fs1[()]
- [a=1, b=2, c=[d=[e=12], f=[g=55, h='hello']]]
- >>> fs1['a'], fs1[('a',)]
- (1, 1)
- >>> fs1['c','d','e']
- 12
- >>> fs1['c','f','g']
- 55
-
-Feature paths that select unknown features raise KeyError:
-
- >>> fs1['c', 'f', 'e']
- Traceback (most recent call last):
- . . .
- KeyError: ('c', 'f', 'e')
- >>> fs1['q', 'p']
- Traceback (most recent call last):
- . . .
- KeyError: ('q', 'p')
-
-Feature paths that try to go 'through' a feature that's not a feature
-structure raise KeyError:
-
- >>> fs1['a', 'b']
- Traceback (most recent call last):
- . . .
- KeyError: ('a', 'b')
-
-Feature paths can go through reentrant structures:
-
- >>> fs2 = FeatStruct('(1)[a=[b=[c->(1), d=5], e=11]]')
- >>> fs2['a', 'b', 'c', 'a', 'e']
- 11
- >>> fs2['a', 'b', 'c', 'a', 'b', 'd']
- 5
- >>> fs2[tuple('abcabcabcabcabcabcabcabcabcabca')]
- (1)[b=[c=[a->(1)], d=5], e=11]
-
-Indexing requires strings, `Feature`\s, or tuples; other types raise a
-TypeError:
-
- >>> fs2[12]
- Traceback (most recent call last):
- . . .
- TypeError: Expected feature name or path. Got 12.
- >>> fs2[list('abc')]
- Traceback (most recent call last):
- . . .
- TypeError: Expected feature name or path. Got ['a', 'b', 'c'].
-
-Feature paths can also be used with `get()`, `has_key()`, and
-`__contains__()`.
-
- >>> fpath1 = tuple('abcabc')
- >>> fpath2 = tuple('abcabz')
- >>> fs2.get(fpath1), fs2.get(fpath2)
- ((1)[a=[b=[c->(1), d=5], e=11]], None)
- >>> fpath1 in fs2, fpath2 in fs2
- (True, False)
- >>> fs2.has_key(fpath1), fs2.has_key(fpath2)
- (True, False)
-
-..
- >>> del fs1, fs2 # clean-up
-
-Reading Feature Structures
---------------------------
-
-Empty feature struct:
-
- >>> FeatStruct('[]')
- []
-
-Test features with integer values:
-
- >>> FeatStruct('[a=12, b=-33, c=0]')
- [a=12, b=-33, c=0]
-
-Test features with string values. Either single or double quotes may
-be used. Strings are evaluated just like Python strings -- in
-particular, you can use escape sequences and 'u' and 'r' prefixes, and
-triple-quoted strings.
-
- >>> FeatStruct('[a="", b="hello", c="\'", d=\'\', e=\'"\']')
- [a='', b='hello', c="'", d='', e='"']
- >>> FeatStruct(r'[a="\\", b="\"", c="\x6f\\y", d="12"]')
- [a='\\', b='"', c='o\\y', d='12']
- >>> FeatStruct(r'[b=r"a\b\c"]')
- [b='a\\b\\c']
- >>> FeatStruct('[x="""a"""]')
- [x='a']
-
-Test parsing of reentrant feature structures.
-
- >>> FeatStruct('[a=(1)[], b->(1)]')
- [a=(1)[], b->(1)]
- >>> FeatStruct('[a=(1)[x=1, y=2], b->(1)]')
- [a=(1)[x=1, y=2], b->(1)]
-
-Test parsing of cyclic feature structures.
-
- >>> FeatStruct('[a=(1)[b->(1)]]')
- [a=(1)[b->(1)]]
- >>> FeatStruct('(1)[a=[b=[c->(1)]]]')
- (1)[a=[b=[c->(1)]]]
-
-Strings of the form "+name" and "-name" may be used to specify boolean
-values.
-
- >>> FeatStruct('[-bar, +baz, +foo]')
- [-bar, +baz, +foo]
-
-None, True, and False are recognized as values:
-
- >>> FeatStruct('[bar=True, baz=False, foo=None]')
- [+bar, -baz, foo=None]
-
-Special features:
-
- >>> FeatStruct('NP/VP')
- NP[]/VP[]
- >>> FeatStruct('?x/?x')
- ?x[]/?x[]
- >>> print(FeatStruct('VP[+fin, agr=?x, tense=past]/NP[+pl, agr=?x]'))
- [ *type* = 'VP' ]
- [ ]
- [ [ *type* = 'NP' ] ]
- [ *slash* = [ agr = ?x ] ]
- [ [ pl = True ] ]
- [ ]
- [ agr = ?x ]
- [ fin = True ]
- [ tense = 'past' ]
-
-Here the slash feature gets coerced:
- >>> FeatStruct('[*slash*=a, x=b, *type*="NP"]')
- NP[x='b']/a[]
-
- >>> FeatStruct('NP[sem=<bob>]/NP')
- NP[sem=<bob>]/NP[]
- >>> FeatStruct('S[sem=<walk(bob)>]')
- S[sem=<walk(bob)>]
- >>> print(FeatStruct('NP[sem=<bob>]/NP'))
- [ *type* = 'NP' ]
- [ ]
- [ *slash* = [ *type* = 'NP' ] ]
- [ ]
- [ sem = <bob> ]
-
-Playing with ranges:
-
- >>> from nltk.featstruct import RangeFeature, FeatStructReader
- >>> width = RangeFeature('width')
- >>> reader = FeatStructReader([width])
- >>> fs1 = reader.fromstring('[*width*=-5:12]')
- >>> fs2 = reader.fromstring('[*width*=2:123]')
- >>> fs3 = reader.fromstring('[*width*=-7:-2]')
- >>> fs1.unify(fs2)
- [*width*=(2, 12)]
- >>> fs1.unify(fs3)
- [*width*=(-5, -2)]
- >>> print(fs2.unify(fs3)) # no overlap in width.
- None
-
-The slash feature has a default value of 'False':
-
- >>> print(FeatStruct('NP[]/VP').unify(FeatStruct('NP[]'), trace=1))
- <BLANKLINE>
- Unification trace:
- / NP[]/VP[]
- |\ NP[]
- |
- | Unify feature: *type*
- | / 'NP'
- | |\ 'NP'
- | |
- | +-->'NP'
- |
- | Unify feature: *slash*
- | / VP[]
- | |\ False
- | |
- X X <-- FAIL
- None
-
-The demo structures from category.py. They all parse, but they don't
-quite do the right thing -- note ?x vs x.
-
- >>> FeatStruct(pos='n', agr=FeatStruct(number='pl', gender='f'))
- [agr=[gender='f', number='pl'], pos='n']
- >>> FeatStruct(r'NP[sem=<bob>]/NP')
- NP[sem=<bob>]/NP[]
- >>> FeatStruct(r'S[sem=<app(?x, ?y)>]')
- S[sem=<?x(?y)>]
- >>> FeatStruct('?x/?x')
- ?x[]/?x[]
- >>> FeatStruct('VP[+fin, agr=?x, tense=past]/NP[+pl, agr=?x]')
- VP[agr=?x, +fin, tense='past']/NP[agr=?x, +pl]
- >>> FeatStruct('S[sem = <app(?subj, ?vp)>]')
- S[sem=<?subj(?vp)>]
-
- >>> FeatStruct('S')
- S[]
-
-The parser also includes support for reading sets and tuples.
-
- >>> FeatStruct('[x={1,2,2,2}, y={/}]')
- [x={1, 2}, y={/}]
- >>> FeatStruct('[x=(1,2,2,2), y=()]')
- [x=(1, 2, 2, 2), y=()]
- >>> print(FeatStruct('[x=(1,[z=(1,2,?x)],?z,{/})]'))
- [ x = (1, [ z = (1, 2, ?x) ], ?z, {/}) ]
-
-Note that we can't put a featstruct inside a set, because doing so
-would hash it, and it's not frozen yet:
-
- >>> print(FeatStruct('[x={[]}]'))
- Traceback (most recent call last):
- . . .
- TypeError: FeatStructs must be frozen before they can be hashed.
-
-There's a special syntax for taking the union of sets: "{...+...}".
-The elements should only be variables or sets.
-
- >>> FeatStruct('[x={?a+?b+{1,2,3}}]')
- [x={?a+?b+{1, 2, 3}}]
-
-There's a special syntax for taking the concatenation of tuples:
-"(...+...)". The elements should only be variables or tuples.
-
- >>> FeatStruct('[x=(?a+?b+(1,2,3))]')
- [x=(?a+?b+(1, 2, 3))]
-
-Parsing gives helpful messages if your string contains an error.
-
- >>> FeatStruct('[a=, b=5]]')
- Traceback (most recent call last):
- . . .
- ValueError: Error parsing feature structure
- [a=, b=5]]
- ^ Expected value
- >>> FeatStruct('[a=12 22, b=33]')
- Traceback (most recent call last):
- . . .
- ValueError: Error parsing feature structure
- [a=12 22, b=33]
- ^ Expected comma
- >>> FeatStruct('[a=5] [b=6]')
- Traceback (most recent call last):
- . . .
- ValueError: Error parsing feature structure
- [a=5] [b=6]
- ^ Expected end of string
- >>> FeatStruct(' *++*')
- Traceback (most recent call last):
- . . .
- ValueError: Error parsing feature structure
- *++*
- ^ Expected open bracket or identifier
- >>> FeatStruct('[x->(1)]')
- Traceback (most recent call last):
- . . .
- ValueError: Error parsing feature structure
- [x->(1)]
- ^ Expected bound identifier
- >>> FeatStruct('[x->y]')
- Traceback (most recent call last):
- . . .
- ValueError: Error parsing feature structure
- [x->y]
- ^ Expected identifier
- >>> FeatStruct('')
- Traceback (most recent call last):
- . . .
- ValueError: Error parsing feature structure
- <BLANKLINE>
- ^ Expected open bracket or identifier
-
-
-Unification
------------
-Very simple unifications give the expected results:
-
- >>> FeatStruct().unify(FeatStruct())
- []
- >>> FeatStruct(number='singular').unify(FeatStruct())
- [number='singular']
- >>> FeatStruct().unify(FeatStruct(number='singular'))
- [number='singular']
- >>> FeatStruct(number='singular').unify(FeatStruct(person=3))
- [number='singular', person=3]
-
-Merging nested structures:
-
- >>> fs1 = FeatStruct('[A=[B=b]]')
- >>> fs2 = FeatStruct('[A=[C=c]]')
- >>> fs1.unify(fs2)
- [A=[B='b', C='c']]
- >>> fs2.unify(fs1)
- [A=[B='b', C='c']]
-
-A basic case of reentrant unification
-
- >>> fs4 = FeatStruct('[A=(1)[B=b], E=[F->(1)]]')
- >>> fs5 = FeatStruct("[A=[C='c'], E=[F=[D='d']]]")
- >>> fs4.unify(fs5)
- [A=(1)[B='b', C='c', D='d'], E=[F->(1)]]
- >>> fs5.unify(fs4)
- [A=(1)[B='b', C='c', D='d'], E=[F->(1)]]
-
-More than 2 paths to a value
-
- >>> fs1 = FeatStruct("[a=[],b=[],c=[],d=[]]")
- >>> fs2 = FeatStruct('[a=(1)[], b->(1), c->(1), d->(1)]')
- >>> fs1.unify(fs2)
- [a=(1)[], b->(1), c->(1), d->(1)]
-
-fs1[a] gets unified with itself
-
- >>> fs1 = FeatStruct('[x=(1)[], y->(1)]')
- >>> fs2 = FeatStruct('[x=(1)[], y->(1)]')
- >>> fs1.unify(fs2)
- [x=(1)[], y->(1)]
-
-Bound variables should get forwarded appropriately
-
- >>> fs1 = FeatStruct('[A=(1)[X=x], B->(1), C=?cvar, D=?dvar]')
- >>> fs2 = FeatStruct('[A=(1)[Y=y], B=(2)[Z=z], C->(1), D->(2)]')
- >>> fs1.unify(fs2)
- [A=(1)[X='x', Y='y', Z='z'], B->(1), C->(1), D->(1)]
- >>> fs2.unify(fs1)
- [A=(1)[X='x', Y='y', Z='z'], B->(1), C->(1), D->(1)]
-
-Cyclic structure created by unification.
-
- >>> fs1 = FeatStruct('[F=(1)[], G->(1)]')
- >>> fs2 = FeatStruct('[F=[H=(2)[]], G->(2)]')
- >>> fs3 = fs1.unify(fs2)
- >>> fs3
- [F=(1)[H->(1)], G->(1)]
- >>> fs3['F'] is fs3['G']
- True
- >>> fs3['F'] is fs3['G']['H']
- True
- >>> fs3['F'] is fs3['G']['H']['H']
- True
- >>> fs3['F'] is fs3['F']['H']['H']['H']['H']['H']['H']['H']['H']
- True
-
-Cyclic structure created w/ variables.
-
- >>> fs1 = FeatStruct('[F=[H=?x]]')
- >>> fs2 = FeatStruct('[F=?x]')
- >>> fs3 = fs1.unify(fs2, rename_vars=False)
- >>> fs3
- [F=(1)[H->(1)]]
- >>> fs3['F'] is fs3['F']['H']
- True
- >>> fs3['F'] is fs3['F']['H']['H']
- True
- >>> fs3['F'] is fs3['F']['H']['H']['H']['H']['H']['H']['H']['H']
- True
-
-Unifying w/ a cyclic feature structure.
-
- >>> fs4 = FeatStruct('[F=[H=[H=[H=(1)[]]]], K->(1)]')
- >>> fs3.unify(fs4)
- [F=(1)[H->(1)], K->(1)]
- >>> fs4.unify(fs3)
- [F=(1)[H->(1)], K->(1)]
-
-Variable bindings should preserve reentrance.
-
- >>> bindings = {}
- >>> fs1 = FeatStruct("[a=?x]")
- >>> fs2 = fs1.unify(FeatStruct("[a=[]]"), bindings)
- >>> fs2['a'] is bindings[Variable('?x')]
- True
- >>> fs2.unify(FeatStruct("[b=?x]"), bindings)
- [a=(1)[], b->(1)]
-
-Aliased variable tests
-
- >>> fs1 = FeatStruct("[a=?x, b=?x]")
- >>> fs2 = FeatStruct("[b=?y, c=?y]")
- >>> bindings = {}
- >>> fs3 = fs1.unify(fs2, bindings)
- >>> fs3
- [a=?x, b=?x, c=?x]
- >>> bindings
- {Variable('?y'): Variable('?x')}
- >>> fs3.unify(FeatStruct("[a=1]"))
- [a=1, b=1, c=1]
-
-If we keep track of the bindings, then we can use the same variable
-over multiple calls to unify.
-
- >>> bindings = {}
- >>> fs1 = FeatStruct('[a=?x]')
- >>> fs2 = fs1.unify(FeatStruct('[a=[]]'), bindings)
- >>> fs2.unify(FeatStruct('[b=?x]'), bindings)
- [a=(1)[], b->(1)]
- >>> bindings
- {Variable('?x'): []}
-
-..
- >>> del fs1, fs2, fs3, fs4, fs5 # clean-up
-
-Unification Bindings
---------------------
-
- >>> bindings = {}
- >>> fs1 = FeatStruct('[a=?x]')
- >>> fs2 = FeatStruct('[a=12]')
- >>> fs3 = FeatStruct('[b=?x]')
- >>> fs1.unify(fs2, bindings)
- [a=12]
- >>> bindings
- {Variable('?x'): 12}
- >>> fs3.substitute_bindings(bindings)
- [b=12]
- >>> fs3 # substitute_bindings didn't mutate fs3.
- [b=?x]
- >>> fs2.unify(fs3, bindings)
- [a=12, b=12]
-
- >>> bindings = {}
- >>> fs1 = FeatStruct('[a=?x, b=1]')
- >>> fs2 = FeatStruct('[a=5, b=?x]')
- >>> fs1.unify(fs2, bindings)
- [a=5, b=1]
- >>> sorted(bindings.items())
- [(Variable('?x'), 5), (Variable('?x2'), 1)]
-
-..
- >>> del fs1, fs2, fs3 # clean-up
-
-Expressions
------------
-
- >>> e = Expression.fromstring('\\P y.P(z,y)')
- >>> fs1 = FeatStruct(x=e, y=Variable('z'))
- >>> fs2 = FeatStruct(y=VariableExpression(Variable('John')))
- >>> fs1.unify(fs2)
- [x=<\P y.P(John,y)>, y=<John>]
-
-Remove Variables
-----------------
-
- >>> FeatStruct('[a=?x, b=12, c=[d=?y]]').remove_variables()
- [b=12, c=[]]
- >>> FeatStruct('(1)[a=[b=?x,c->(1)]]').remove_variables()
- (1)[a=[c->(1)]]
-
-Equality & Hashing
-------------------
-The `equal_values` method checks whether two feature structures assign
-the same value to every feature. If the optional argument
-``check_reentrances`` is supplied, then it also returns false if there
-is any difference in the reentrances.
-
- >>> a = FeatStruct('(1)[x->(1)]')
- >>> b = FeatStruct('(1)[x->(1)]')
- >>> c = FeatStruct('(1)[x=[x->(1)]]')
- >>> d = FeatStruct('[x=(1)[x->(1)]]')
- >>> e = FeatStruct('(1)[x=[x->(1), y=1], y=1]')
- >>> def compare(x,y):
- ... assert x.equal_values(y, True) == y.equal_values(x, True)
- ... assert x.equal_values(y, False) == y.equal_values(x, False)
- ... if x.equal_values(y, True):
- ... assert x.equal_values(y, False)
- ... print('equal values, same reentrance')
- ... elif x.equal_values(y, False):
- ... print('equal values, different reentrance')
- ... else:
- ... print('different values')
-
- >>> compare(a, a)
- equal values, same reentrance
- >>> compare(a, b)
- equal values, same reentrance
- >>> compare(a, c)
- equal values, different reentrance
- >>> compare(a, d)
- equal values, different reentrance
- >>> compare(c, d)
- equal values, different reentrance
- >>> compare(a, e)
- different values
- >>> compare(c, e)
- different values
- >>> compare(d, e)
- different values
- >>> compare(e, e)
- equal values, same reentrance
-
-Feature structures may not be hashed until they are frozen:
-
- >>> hash(a)
- Traceback (most recent call last):
- . . .
- TypeError: FeatStructs must be frozen before they can be hashed.
- >>> a.freeze()
- >>> v = hash(a)
-
-Feature structures define hashing consistently with equality. The
-following example looks at the hash value for each (fs1, fs2) pair; if
-the hash values differ, then the structures must not be equal. If the
-hash values are equal, a message is displayed indicating whether the
-structures themselves are equal. Note that c and d currently have the
-same hash value even though they are not equal. That is not a bug --
-unequal objects may share a hash value -- but it would do no harm if
-it changed.
-
- >>> for fstruct in (a, b, c, d, e):
- ... fstruct.freeze()
- >>> for fs1_name in 'abcde':
- ... for fs2_name in 'abcde':
- ... fs1 = locals()[fs1_name]
- ... fs2 = locals()[fs2_name]
- ... if hash(fs1) != hash(fs2):
- ... assert fs1 != fs2
- ... else:
- ... print('%s and %s have the same hash value,' %
- ... (fs1_name, fs2_name))
- ... if fs1 == fs2: print('and are equal')
- ... else: print('and are not equal')
- a and a have the same hash value, and are equal
- a and b have the same hash value, and are equal
- b and a have the same hash value, and are equal
- b and b have the same hash value, and are equal
- c and c have the same hash value, and are equal
- c and d have the same hash value, and are not equal
- d and c have the same hash value, and are not equal
- d and d have the same hash value, and are equal
- e and e have the same hash value, and are equal
-
-..
- >>> del a, b, c, d, e, v # clean-up
-
-Tracing
--------
-
- >>> fs1 = FeatStruct('[a=[b=(1)[], c=?x], d->(1), e=[f=?x]]')
- >>> fs2 = FeatStruct('[a=(1)[c="C"], e=[g->(1)]]')
- >>> fs1.unify(fs2, trace=True)
- <BLANKLINE>
- Unification trace:
- / [a=[b=(1)[], c=?x], d->(1), e=[f=?x]]
- |\ [a=(1)[c='C'], e=[g->(1)]]
- |
- | Unify feature: a
- | / [b=[], c=?x]
- | |\ [c='C']
- | |
- | | Unify feature: a.c
- | | / ?x
- | | |\ 'C'
- | | |
- | | +-->Variable('?x')
- | |
- | +-->[b=[], c=?x]
- | Bindings: {?x: 'C'}
- |
- | Unify feature: e
- | / [f=?x]
- | |\ [g=[c='C']]
- | |
- | +-->[f=?x, g=[b=[], c=?x]]
- | Bindings: {?x: 'C'}
- |
- +-->[a=(1)[b=(2)[], c='C'], d->(2), e=[f='C', g->(1)]]
- Bindings: {?x: 'C'}
- [a=(1)[b=(2)[], c='C'], d->(2), e=[f='C', g->(1)]]
- >>>
- >>> fs1 = FeatStruct('[a=?x, b=?z, c=?z]')
- >>> fs2 = FeatStruct('[a=?y, b=?y, c=?q]')
- >>> #fs1.unify(fs2, trace=True)
- >>>
-
-..
- >>> del fs1, fs2 # clean-up
-
-Unification on Dicts & Lists
-----------------------------
-It's possible to do unification on dictionaries:
-
- >>> from nltk.featstruct import unify
- >>> pprint(unify(dict(x=1, y=dict(z=2)), dict(x=1, q=5)), width=1)
- {'q': 5, 'x': 1, 'y': {'z': 2}}
-
-It's possible to do unification on lists as well:
-
- >>> unify([1, 2, 3], [1, Variable('x'), 3])
- [1, 2, 3]
-
-Mixing dicts and lists is fine:
-
- >>> pprint(unify([dict(x=1, y=dict(z=2)),3], [dict(x=1, q=5),3]),
- ... width=1)
- [{'q': 5, 'x': 1, 'y': {'z': 2}}, 3]
-
-Mixing dicts and FeatStructs is discouraged:
-
- >>> unify(dict(x=1), FeatStruct(x=1))
- Traceback (most recent call last):
- . . .
- ValueError: Mixing FeatStruct objects with Python dicts and lists is not supported.
-
-But you can do it if you really want, by explicitly stating that both
-dictionaries and FeatStructs should be treated as feature structures:
-
- >>> unify(dict(x=1), FeatStruct(x=1), fs_class=(dict, FeatStruct))
- {'x': 1}
-
-Finding Conflicts
------------------
-
- >>> from nltk.featstruct import conflicts
- >>> fs1 = FeatStruct('[a=[b=(1)[c=2], d->(1), e=[f->(1)]]]')
- >>> fs2 = FeatStruct('[a=[b=[c=[x=5]], d=[c=2], e=[f=[c=3]]]]')
- >>> for path in conflicts(fs1, fs2):
- ... print('%-8s: %r vs %r' % ('.'.join(path), fs1[path], fs2[path]))
- a.b.c : 2 vs [x=5]
- a.e.f.c : 2 vs 3
-
-..
- >>> del fs1, fs2 # clean-up
-
-Retracting Bindings
--------------------
-
- >>> from nltk.featstruct import retract_bindings
- >>> bindings = {}
- >>> fs1 = FeatStruct('[a=?x, b=[c=?y]]')
- >>> fs2 = FeatStruct('[a=(1)[c=[d=1]], b->(1)]')
- >>> fs3 = fs1.unify(fs2, bindings)
- >>> print(fs3)
- [ a = (1) [ c = [ d = 1 ] ] ]
- [ ]
- [ b -> (1) ]
- >>> pprint(bindings)
- {Variable('?x'): [c=[d=1]], Variable('?y'): [d=1]}
- >>> retract_bindings(fs3, bindings)
- [a=?x, b=?x]
- >>> pprint(bindings)
- {Variable('?x'): [c=?y], Variable('?y'): [d=1]}
-
-Squashed Bugs
-~~~~~~~~~~~~~
-In svn rev 5167, unifying two feature structures that used the same
-variable would cause those variables to become aliased in the output.
-
- >>> fs1 = FeatStruct('[a=?x]')
- >>> fs2 = FeatStruct('[b=?x]')
- >>> fs1.unify(fs2)
- [a=?x, b=?x2]
-
-There was a bug in svn revision 5172 that caused `rename_variables` to
-rename variables to names that are already used.
-
- >>> FeatStruct('[a=?x, b=?x2]').rename_variables(
- ... vars=[Variable('?x')])
- [a=?x3, b=?x2]
- >>> fs1 = FeatStruct('[a=?x]')
- >>> fs2 = FeatStruct('[a=?x, b=?x2]')
- >>> fs1.unify(fs2)
- [a=?x, b=?x2]
-
-There was a bug in svn rev 5167 that caused the following example to
-give the wrong result. The problem was that 'forward' pointers were
-only followed for `other`, not for `self`, when unifying two feature
-structures. (NB: this test assumes that features are unified in
-alphabetical order -- if they are not, it might pass even when the bug
-is present.)
-
- >>> fs1 = FeatStruct('[a=[x=1], b=?x, c=?x]')
- >>> fs2 = FeatStruct('[a=(1)[], b->(1), c=[x=2]]')
- >>> print(fs1.unify(fs2))
- None
-
-..
- >>> del fs1, fs2 # clean-up
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-========
-FrameNet
-========
-
-The FrameNet corpus is a lexical database of English that is both human-
-and machine-readable, based on annotating examples of how words are used
-in actual texts. FrameNet is based on a theory of meaning called Frame
-Semantics, deriving from the work of Charles J. Fillmore and colleagues.
-The basic idea is straightforward: that the meanings of most words can
-best be understood on the basis of a semantic frame: a description of a
-type of event, relation, or entity and the participants in it. For
-example, the concept of cooking typically involves a person doing the
-cooking (Cook), the food that is to be cooked (Food), something to hold
-the food while cooking (Container) and a source of heat
-(Heating_instrument). In the FrameNet project, this is represented as a
-frame called Apply_heat, and the Cook, Food, Heating_instrument and
-Container are called frame elements (FEs). Words that evoke this frame,
-such as fry, bake, boil, and broil, are called lexical units (LUs) of
-the Apply_heat frame. The job of FrameNet is to define the frames
-and to annotate sentences to show how the FEs fit syntactically around
-the word that evokes the frame.
-
-------
-Frames
-------
-
-A Frame is a script-like conceptual structure that describes a
-particular type of situation, object, or event along with the
-participants and props that are needed for that Frame. For
-example, the "Apply_heat" frame describes a common situation
-involving a Cook, some Food, and a Heating_Instrument, and is
-evoked by words such as bake, blanch, boil, broil, brown,
-simmer, steam, etc.
-
-We call the roles of a Frame "frame elements" (FEs) and the
-frame-evoking words are called "lexical units" (LUs).
-
-FrameNet includes relations between Frames. Several types of
-relations are defined, of which the most important are:
-
-- Inheritance: An IS-A relation. The child frame is a subtype
- of the parent frame, and each FE in the parent is bound to
- a corresponding FE in the child. An example is the
- "Revenge" frame which inherits from the
- "Rewards_and_punishments" frame.
-
-- Using: The child frame presupposes the parent frame as
-  background, e.g. the "Speed" frame "uses" (or presupposes)
-  the "Motion" frame; however, not all parent FEs need to be
-  bound to child FEs.
-
-- Subframe: The child frame is a subevent of a complex event
- represented by the parent, e.g. the "Criminal_process" frame
- has subframes of "Arrest", "Arraignment", "Trial", and
- "Sentencing".
-
-- Perspective_on: The child frame provides a particular
- perspective on an un-perspectivized parent frame. A pair of
- examples consists of the "Hiring" and "Get_a_job" frames,
- which perspectivize the "Employment_start" frame from the
- Employer's and the Employee's point of view, respectively.
-
-To get a list of all of the Frames in FrameNet, you can use the
-`frames()` function. If you supply a regular expression pattern to the
-`frames()` function, you will get a list of all Frames whose names match
-that pattern:
-
- >>> from pprint import pprint
- >>> from operator import itemgetter
- >>> from nltk.corpus import framenet as fn
- >>> from nltk.corpus.reader.framenet import PrettyList
- >>> x = fn.frames(r'(?i)crim')
- >>> x.sort(key=itemgetter('ID'))
- >>> x
- [<frame ID=200 name=Criminal_process>, <frame ID=500 name=Criminal_investigation>, ...]
- >>> PrettyList(sorted(x, key=itemgetter('ID')))
- [<frame ID=200 name=Criminal_process>, <frame ID=500 name=Criminal_investigation>, ...]
-
-To get the details of a particular Frame, you can use the `frame()`
-function, passing in the frame number:
-
- >>> from pprint import pprint
- >>> from nltk.corpus import framenet as fn
- >>> f = fn.frame(202)
- >>> f.ID
- 202
- >>> f.name
- 'Arrest'
- >>> f.definition # doctest: +ELLIPSIS
- "Authorities charge a Suspect, who is under suspicion of having committed a crime..."
- >>> len(f.lexUnit)
- 11
- >>> pprint(sorted([x for x in f.FE]))
- ['Authorities',
- 'Charges',
- 'Co-participant',
- 'Manner',
- 'Means',
- 'Offense',
- 'Place',
- 'Purpose',
- 'Source_of_legal_authority',
- 'Suspect',
- 'Time',
- 'Type']
- >>> pprint(f.frameRelations)
- [<Parent=Intentionally_affect -- Inheritance -> Child=Arrest>, <Complex=Criminal_process -- Subframe -> Component=Arrest>, ...]
-
-The `frame()` function shown above returns a dict object containing
-detailed information about the Frame. See the documentation on the
-`frame()` function for the specifics.
-
-You can also search for Frames by their Lexical Units (LUs). The
-`frames_by_lemma()` function returns a list of all frames that contain
-LUs in which the 'name' attribute of the LU matches the given regular
-expression. Note that LU names are composed of "lemma.POS", where the
-"lemma" part can be made up of either a single lexeme (e.g. 'run') or
-multiple lexemes (e.g. 'a little') (see below).
-
- >>> PrettyList(sorted(fn.frames_by_lemma(r'(?i)a little'), key=itemgetter('ID'))) # doctest: +ELLIPSIS
- [<frame ID=189 name=Quanti...>, <frame ID=2001 name=Degree>]
-
--------------
-Lexical Units
--------------
-
-A lexical unit (LU) is a pairing of a word with a meaning. For
-example, the "Apply_heat" Frame describes a common situation
-involving a Cook, some Food, and a Heating Instrument, and is
-_evoked_ by words such as bake, blanch, boil, broil, brown,
-simmer, steam, etc. These frame-evoking words are the LUs in the
-Apply_heat frame. Each sense of a polysemous word is a different
-LU.
-
-We have used the word "word" in talking about LUs. The reality
-is actually rather complex. When we say that the word "bake" is
-polysemous, we mean that the lemma "bake.v" (which has the
-word-forms "bake", "bakes", "baked", and "baking") is linked to
-three different frames:
-
-- Apply_heat: "Michelle baked the potatoes for 45 minutes."
-
-- Cooking_creation: "Michelle baked her mother a cake for her birthday."
-
-- Absorb_heat: "The potatoes have to bake for more than 30 minutes."
-
-These constitute three different LUs, with different
-definitions.
-
-Multiword expressions such as "given name" and hyphenated words
-like "shut-eye" can also be LUs. Idiomatic phrases such as
-"middle of nowhere" and "give the slip (to)" are also defined as
-LUs in the appropriate frames ("Isolated_places" and "Evading",
-respectively), and their internal structure is not analyzed.
-
-FrameNet provides multiple annotated examples of each sense of a
-word (i.e. each LU). Moreover, the set of examples
-(approximately 20 per LU) illustrates all of the combinatorial
-possibilities of the lexical unit.
-
-Each LU is linked to a Frame, and hence to the other words which
-evoke that Frame. This makes the FrameNet database similar to a
-thesaurus, grouping together semantically similar words.
-
-In the simplest case, frame-evoking words are verbs such as
-"fried" in:
-
- "Matilde fried the catfish in a heavy iron skillet."
-
-Sometimes event nouns may evoke a Frame. For example,
-"reduction" evokes "Cause_change_of_scalar_position" in:
-
- "...the reduction of debt levels to $665 million from $2.6 billion."
-
-Adjectives may also evoke a Frame. For example, "asleep" may
-evoke the "Sleep" frame as in:
-
- "They were asleep for hours."
-
-Many common nouns, such as artifacts like "hat" or "tower",
-typically serve as dependents rather than clearly evoking their
-own frames.
-
-Details for specific lexical units can be obtained using the corpus
-reader's `lus()` function, which takes an optional regular expression
-pattern that will be matched against the name of the lexical unit:
-
- >>> from pprint import pprint
- >>> PrettyList(sorted(fn.lus(r'(?i)a little'), key=itemgetter('ID')))
- [<lu ID=14733 name=a little.n>, <lu ID=14743 name=a little.adv>, ...]
-
-You can obtain detailed information on a particular LU by calling the
-`lu()` function and passing in an LU's 'ID' number:
-
- >>> from pprint import pprint
- >>> from nltk.corpus import framenet as fn
- >>> fn.lu(256).name
- 'foresee.v'
- >>> fn.lu(256).definition
- 'COD: be aware of beforehand; predict.'
- >>> fn.lu(256).frame.name
- 'Expectation'
- >>> fn.lu(256).lexemes[0].name
- 'foresee'
-
-Note that LU names take the form of a dotted string (e.g. "run.v" or "a
-little.adv") in which a lemma precedes the "." and a part of speech
-(POS) follows the dot. The lemma may be composed of a single lexeme
-(e.g. "run") or of multiple lexemes (e.g. "a little"). The list of
-POSs used in the LUs is:
-
-v - verb
-n - noun
-a - adjective
-adv - adverb
-prep - preposition
-num - numbers
-intj - interjection
-art - article
-c - conjunction
-scon - subordinating conjunction
-
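-Since the POS is the final component of the LU name, you can, for
-example, restrict a search to verbal LUs by anchoring a pattern on the
-".v" suffix (a sketch; the output is elided here):
-
- >>> fn.lus(r'\.v$') # doctest: +SKIP
-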
-For more detailed information about the contents of the dict that is
-returned by the `lu()` function, see its documentation.
-
--------------------
-Annotated Documents
--------------------
-
-The FrameNet corpus contains a small set of annotated documents. A list
-of these documents can be obtained by calling the `docs()` function:
-
- >>> from pprint import pprint
- >>> from nltk.corpus import framenet as fn
- >>> d = fn.docs('BellRinging')[0]
- >>> d.corpname
- 'PropBank'
- >>> d.sentence[49] # doctest: +ELLIPSIS
- full-text sentence (...) in BellRinging:
- <BLANKLINE>
- <BLANKLINE>
- [POS] 17 tags
- <BLANKLINE>
- [POS_tagset] PENN
- <BLANKLINE>
- [text] + [annotationSet]
- <BLANKLINE>
- `` I live in hopes that the ringers themselves will be drawn into
- ***** ******* *****
- Desir Cause_t Cause
- [1] [3] [2]
- <BLANKLINE>
- that fuller life .
- ******
- Comple
- [4]
- (Desir=Desiring, Cause_t=Cause_to_make_noise, Cause=Cause_motion, Comple=Completeness)
- <BLANKLINE>
-
- >>> d.sentence[49].annotationSet[1] # doctest: +ELLIPSIS
- annotation set (...):
- <BLANKLINE>
- [status] MANUAL
- <BLANKLINE>
- [LU] (6605) hope.n in Desiring
- <BLANKLINE>
- [frame] (366) Desiring
- <BLANKLINE>
- [GF] 2 relations
- <BLANKLINE>
- [PT] 2 phrases
- <BLANKLINE>
- [text] + [Target] + [FE] + [Noun]
- <BLANKLINE>
- `` I live in hopes that the ringers themselves will be drawn into
- - ^^^^ ^^ ***** ----------------------------------------------
- E supp su Event
- <BLANKLINE>
- that fuller life .
- -----------------
- <BLANKLINE>
- (E=Experiencer, su=supp)
- <BLANKLINE>
- <BLANKLINE>
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-===============================================
-Generating sentences from context-free grammars
-===============================================
-
-An example grammar:
-
- >>> from nltk.parse.generate import generate, demo_grammar
- >>> from nltk import CFG
- >>> grammar = CFG.fromstring(demo_grammar)
- >>> print(grammar)
- Grammar with 13 productions (start state = S)
- S -> NP VP
- NP -> Det N
- PP -> P NP
- VP -> 'slept'
- VP -> 'saw' NP
- VP -> 'walked' PP
- Det -> 'the'
- Det -> 'a'
- N -> 'man'
- N -> 'park'
- N -> 'dog'
- P -> 'in'
- P -> 'with'
-
-The first 10 generated sentences:
-
- >>> for sentence in generate(grammar, n=10):
- ... print(' '.join(sentence))
- the man slept
- the man saw the man
- the man saw the park
- the man saw the dog
- the man saw a man
- the man saw a park
- the man saw a dog
- the man walked in the man
- the man walked in the park
- the man walked in the dog
-
-All sentences of max depth 4:
-
- >>> for sentence in generate(grammar, depth=4):
- ... print(' '.join(sentence))
- the man slept
- the park slept
- the dog slept
- a man slept
- a park slept
- a dog slept
-
-The number of sentences of different max depths:
-
- >>> len(list(generate(grammar, depth=3)))
- 0
- >>> len(list(generate(grammar, depth=4)))
- 6
- >>> len(list(generate(grammar, depth=5)))
- 42
- >>> len(list(generate(grammar, depth=6)))
- 114
- >>> len(list(generate(grammar)))
- 114
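-
-The `start` parameter (part of the `generate()` signature) can be used
-to generate from a nonterminal other than the grammar's start symbol;
-a minimal sketch:
-
- >>> from nltk.grammar import Nonterminal
- >>> for sentence in generate(grammar, start=Nonterminal('NP'), n=3):
- ...     print(' '.join(sentence))
- the man
- the park
- the dog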
-
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-=======================================
-Demonstrate word embedding using Gensim
-=======================================
-
-We demonstrate three tasks:
-
-- training word embeddings on the Brown Corpus;
-- loading a pre-trained model and performing simple tasks; and
-- pruning the pre-trained binary model.
-
- >>> import gensim
-
----------------
-Train the model
----------------
-
-Here we train a word embedding using the Brown Corpus:
-
- >>> from nltk.corpus import brown
- >>> model = gensim.models.Word2Vec(brown.sents())
-
-Training the model might take some time, so once it is trained, it can be saved and reloaded as follows:
-
- >>> model.save('brown.embedding')
- >>> new_model = gensim.models.Word2Vec.load('brown.embedding')
-
-The model holds an embedding vector for each word in its vocabulary.
-We can easily get the vector representation of a word:
-
- >>> len(new_model['university'])
- 100
-
-Gensim provides a number of supporting functions for manipulating word embeddings.
-For example, to compute the cosine similarity between two words:
-
- >>> new_model.similarity('university','school') > 0.3
- True
-
----------------------------
-Using the pre-trained model
----------------------------
-
-NLTK includes a sample of a pre-trained model, trained on 100 billion words from the Google News dataset.
-The full model is available from https://code.google.com/p/word2vec/ (about 3 GB).
-
- >>> from nltk.data import find
- >>> word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
- >>> model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False)
-
-We pruned the model to only include the most common words (~44k words).
-
- >>> len(model.vocab)
- 43981
-
-Each word is represented as a vector in a 300-dimensional space:
-
- >>> len(model['university'])
- 300
-
-Finding the top n words that are similar to a target word is simple. The result is a list of the n most similar words together with their similarity scores.
-
- >>> model.most_similar(positive=['university'], topn = 3)
- [('universities', 0.70039...), ('faculty', 0.67809...), ('undergraduate', 0.65870...)]
-
-Finding the word that does not belong in a list is also supported, although implementing this yourself would be simple.
-
- >>> model.doesnt_match('breakfast cereal dinner lunch'.split())
- 'cereal'
-
-Mikolov et al. (2013) showed that word embeddings capture many syntactic and semantic regularities. For example,
-the vector 'King - Man + Woman' is close to 'Queen', and 'Germany - Berlin + Paris' is close to 'France'.
-
- >>> model.most_similar(positive=['woman','king'], negative=['man'], topn = 1)
- [('queen', 0.71181...)]
-
- >>> model.most_similar(positive=['Paris','Germany'], negative=['Berlin'], topn = 1)
- [('France', 0.78840...)]
-
-We can visualize the word embeddings using t-SNE (http://lvdmaaten.github.io/tsne/). For this demonstration, we plot the first 1000 words.
-
-| import numpy as np
-| labels = []
-| count = 0
-| max_count = 1000
-| X = np.zeros(shape=(max_count,len(model['university'])))
-|
-| for term in model.vocab:
-|     X[count] = model[term]
-|     labels.append(term)
-|     count += 1
-|     if count >= max_count: break
-|
-| # It is recommended to use PCA first to reduce to ~50 dimensions
-| from sklearn.decomposition import PCA
-| pca = PCA(n_components=50)
-| X_50 = pca.fit_transform(X)
-|
-| # Using TSNE to further reduce to 2 dimensions
-| from sklearn.manifold import TSNE
-| model_tsne = TSNE(n_components=2, random_state=0)
-| Y = model_tsne.fit_transform(X_50)
-|
-| # Show the scatter plot
-| import matplotlib.pyplot as plt
-| plt.scatter(Y[:,0], Y[:,1], 20)
-|
-| # Add labels
-| for label, x, y in zip(labels, Y[:, 0], Y[:, 1]):
-|     plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points', size=10)
-|
-| plt.show()
-
-------------------------------
-Prune the trained binary model
-------------------------------
-
-Here is the supporting code to extract part of the binary model (GoogleNews-vectors-negative300.bin.gz) from https://code.google.com/p/word2vec/.
-We used this code to produce the `word2vec_sample` model above.
-
-| import gensim
-| from gensim.models.word2vec import Word2Vec
-| # Load the binary model
-| model = Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)
-|
-| # Only output words that appear in the Brown corpus
-| from nltk.corpus import brown
-| words = set(brown.words())
-| print(len(words))
-|
-| # Write the retained words to a plain-text vector file
-| out_file = 'pruned.word2vec.txt'
-| f = open(out_file, 'w')
-|
-| word_presented = words.intersection(model.vocab.keys())
-| f.write('{} {}\n'.format(len(word_presented), len(model['word'])))
-|
-| for word in word_presented:
-|     f.write('{} {}\n'.format(word, ' '.join(str(value) for value in model[word])))
-|
-| f.close()
+++ /dev/null
-# -*- coding: utf-8 -*-
-
-
-def setup_module(module):
- from nose import SkipTest
-
- try:
- import gensim
- except ImportError:
- raise SkipTest("Gensim doctest requires gensim")
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-==============================================================================
- Glue Semantics
-==============================================================================
-
-.. include:: ../../../nltk_book/definitions.rst
-
-
-======================
-Linear logic
-======================
-
- >>> from nltk.sem import logic
- >>> from nltk.sem.glue import *
- >>> from nltk.sem.linearlogic import *
-
- >>> from nltk.sem.linearlogic import Expression
- >>> read_expr = Expression.fromstring
-
-Parser
-
- >>> print(read_expr(r'f'))
- f
- >>> print(read_expr(r'(g -o f)'))
- (g -o f)
- >>> print(read_expr(r'(g -o (h -o f))'))
- (g -o (h -o f))
- >>> print(read_expr(r'((g -o G) -o G)'))
- ((g -o G) -o G)
- >>> print(read_expr(r'(g -o f)(g)'))
- (g -o f)(g)
- >>> print(read_expr(r'((g -o G) -o G)((g -o f))'))
- ((g -o G) -o G)((g -o f))
-
-Simplify
-
- >>> print(read_expr(r'f').simplify())
- f
- >>> print(read_expr(r'(g -o f)').simplify())
- (g -o f)
- >>> print(read_expr(r'((g -o G) -o G)').simplify())
- ((g -o G) -o G)
- >>> print(read_expr(r'(g -o f)(g)').simplify())
- f
- >>> try: read_expr(r'(g -o f)(f)').simplify()
- ... except LinearLogicApplicationException as e: print(e)
- ...
- Cannot apply (g -o f) to f. Cannot unify g with f given {}
- >>> print(read_expr(r'(G -o f)(g)').simplify())
- f
- >>> print(read_expr(r'((g -o G) -o G)((g -o f))').simplify())
- f
-
-Test BindingDict
-
- >>> h = ConstantExpression('h')
- >>> g = ConstantExpression('g')
- >>> f = ConstantExpression('f')
-
- >>> H = VariableExpression('H')
- >>> G = VariableExpression('G')
- >>> F = VariableExpression('F')
-
- >>> d1 = BindingDict({H: h})
- >>> d2 = BindingDict({F: f, G: F})
- >>> d12 = d1 + d2
- >>> all12 = ['%s: %s' % (v, d12[v]) for v in d12.d]
- >>> all12.sort()
- >>> print(all12)
- ['F: f', 'G: f', 'H: h']
-
- >>> BindingDict([(F,f),(G,g),(H,h)]) == BindingDict({F:f, G:g, H:h})
- True
-
- >>> d4 = BindingDict({F: f})
- >>> try: d4[F] = g
- ... except VariableBindingException as e: print(e)
- Variable F already bound to another value
-
-Test Unify
-
- >>> try: f.unify(g, BindingDict())
- ... except UnificationException as e: print(e)
- ...
- Cannot unify f with g given {}
-
- >>> f.unify(G, BindingDict()) == BindingDict({G: f})
- True
- >>> try: f.unify(G, BindingDict({G: h}))
- ... except UnificationException as e: print(e)
- ...
- Cannot unify f with G given {G: h}
- >>> f.unify(G, BindingDict({G: f})) == BindingDict({G: f})
- True
- >>> f.unify(G, BindingDict({H: f})) == BindingDict({G: f, H: f})
- True
-
- >>> G.unify(f, BindingDict()) == BindingDict({G: f})
- True
- >>> try: G.unify(f, BindingDict({G: h}))
- ... except UnificationException as e: print(e)
- ...
- Cannot unify G with f given {G: h}
- >>> G.unify(f, BindingDict({G: f})) == BindingDict({G: f})
- True
- >>> G.unify(f, BindingDict({H: f})) == BindingDict({G: f, H: f})
- True
-
- >>> G.unify(F, BindingDict()) == BindingDict({G: F})
- True
- >>> try: G.unify(F, BindingDict({G: H}))
- ... except UnificationException as e: print(e)
- ...
- Cannot unify G with F given {G: H}
- >>> G.unify(F, BindingDict({G: F})) == BindingDict({G: F})
- True
- >>> G.unify(F, BindingDict({H: F})) == BindingDict({G: F, H: F})
- True
-
-Test Compile
-
- >>> print(read_expr('g').compile_pos(Counter(), GlueFormula))
- (<ConstantExpression g>, [])
- >>> print(read_expr('(g -o f)').compile_pos(Counter(), GlueFormula))
- (<ImpExpression (g -o f)>, [])
- >>> print(read_expr('(g -o (h -o f))').compile_pos(Counter(), GlueFormula))
- (<ImpExpression (g -o (h -o f))>, [])
-
-
-======================
-Glue
-======================
-
-Demo of "John walks"
---------------------
-
- >>> john = GlueFormula("John", "g")
- >>> print(john)
- John : g
- >>> walks = GlueFormula(r"\x.walks(x)", "(g -o f)")
- >>> print(walks)
- \x.walks(x) : (g -o f)
- >>> print(walks.applyto(john))
- \x.walks(x)(John) : (g -o f)(g)
- >>> print(walks.applyto(john).simplify())
- walks(John) : f
-
-
-Demo of "A dog walks"
----------------------
-
- >>> a = GlueFormula("\P Q.some x.(P(x) and Q(x))", "((gv -o gr) -o ((g -o G) -o G))")
- >>> print(a)
- \P Q.exists x.(P(x) & Q(x)) : ((gv -o gr) -o ((g -o G) -o G))
- >>> man = GlueFormula(r"\x.man(x)", "(gv -o gr)")
- >>> print(man)
- \x.man(x) : (gv -o gr)
- >>> walks = GlueFormula(r"\x.walks(x)", "(g -o f)")
- >>> print(walks)
- \x.walks(x) : (g -o f)
- >>> a_man = a.applyto(man)
- >>> print(a_man.simplify())
- \Q.exists x.(man(x) & Q(x)) : ((g -o G) -o G)
- >>> a_man_walks = a_man.applyto(walks)
- >>> print(a_man_walks.simplify())
- exists x.(man(x) & walks(x)) : f
-
-
-Demo of 'every girl chases a dog'
----------------------------------
-
-Individual words:
-
- >>> every = GlueFormula("\P Q.all x.(P(x) -> Q(x))", "((gv -o gr) -o ((g -o G) -o G))")
- >>> print(every)
- \P Q.all x.(P(x) -> Q(x)) : ((gv -o gr) -o ((g -o G) -o G))
- >>> girl = GlueFormula(r"\x.girl(x)", "(gv -o gr)")
- >>> print(girl)
- \x.girl(x) : (gv -o gr)
- >>> chases = GlueFormula(r"\x y.chases(x,y)", "(g -o (h -o f))")
- >>> print(chases)
- \x y.chases(x,y) : (g -o (h -o f))
- >>> a = GlueFormula("\P Q.some x.(P(x) and Q(x))", "((hv -o hr) -o ((h -o H) -o H))")
- >>> print(a)
- \P Q.exists x.(P(x) & Q(x)) : ((hv -o hr) -o ((h -o H) -o H))
- >>> dog = GlueFormula(r"\x.dog(x)", "(hv -o hr)")
- >>> print(dog)
- \x.dog(x) : (hv -o hr)
-
-Noun Quantification can only be done one way:
-
- >>> every_girl = every.applyto(girl)
- >>> print(every_girl.simplify())
- \Q.all x.(girl(x) -> Q(x)) : ((g -o G) -o G)
- >>> a_dog = a.applyto(dog)
- >>> print(a_dog.simplify())
- \Q.exists x.(dog(x) & Q(x)) : ((h -o H) -o H)
-
-The first reading is achieved by combining 'chases' with 'a dog' first.
-Since 'a dog' requires something of the form '(h -o H)', we must
-get rid of the 'g' in the glue of 'chases'. We will do this with
-the '-o elimination' rule. So, x1 will be our subject placeholder.
-
- >>> xPrime = GlueFormula("x1", "g")
- >>> print(xPrime)
- x1 : g
- >>> xPrime_chases = chases.applyto(xPrime)
- >>> print(xPrime_chases.simplify())
- \y.chases(x1,y) : (h -o f)
- >>> xPrime_chases_a_dog = a_dog.applyto(xPrime_chases)
- >>> print(xPrime_chases_a_dog.simplify())
- exists x.(dog(x) & chases(x1,x)) : f
-
-Now we can retract our subject placeholder using lambda-abstraction and
-combine with the true subject.
-
- >>> chases_a_dog = xPrime_chases_a_dog.lambda_abstract(xPrime)
- >>> print(chases_a_dog.simplify())
- \x1.exists x.(dog(x) & chases(x1,x)) : (g -o f)
- >>> every_girl_chases_a_dog = every_girl.applyto(chases_a_dog)
- >>> r1 = every_girl_chases_a_dog.simplify()
- >>> r2 = GlueFormula(r'all x.(girl(x) -> exists z1.(dog(z1) & chases(x,z1)))', 'f')
- >>> r1 == r2
- True
-
-The second reading is achieved by combining 'every girl' with 'chases' first.
-
- >>> xPrime = GlueFormula("x1", "g")
- >>> print(xPrime)
- x1 : g
- >>> xPrime_chases = chases.applyto(xPrime)
- >>> print(xPrime_chases.simplify())
- \y.chases(x1,y) : (h -o f)
- >>> yPrime = GlueFormula("x2", "h")
- >>> print(yPrime)
- x2 : h
- >>> xPrime_chases_yPrime = xPrime_chases.applyto(yPrime)
- >>> print(xPrime_chases_yPrime.simplify())
- chases(x1,x2) : f
- >>> chases_yPrime = xPrime_chases_yPrime.lambda_abstract(xPrime)
- >>> print(chases_yPrime.simplify())
- \x1.chases(x1,x2) : (g -o f)
- >>> every_girl_chases_yPrime = every_girl.applyto(chases_yPrime)
- >>> print(every_girl_chases_yPrime.simplify())
- all x.(girl(x) -> chases(x,x2)) : f
- >>> every_girl_chases = every_girl_chases_yPrime.lambda_abstract(yPrime)
- >>> print(every_girl_chases.simplify())
- \x2.all x.(girl(x) -> chases(x,x2)) : (h -o f)
- >>> every_girl_chases_a_dog = a_dog.applyto(every_girl_chases)
- >>> r1 = every_girl_chases_a_dog.simplify()
- >>> r2 = GlueFormula(r'exists x.(dog(x) & all z2.(girl(z2) -> chases(z2,x)))', 'f')
- >>> r1 == r2
- True
-
-
-Compilation
------------
-
- >>> for cp in GlueFormula('m', '(b -o a)').compile(Counter()): print(cp)
- m : (b -o a) : {1}
- >>> for cp in GlueFormula('m', '((c -o b) -o a)').compile(Counter()): print(cp)
- v1 : c : {1}
- m : (b[1] -o a) : {2}
- >>> for cp in GlueFormula('m', '((d -o (c -o b)) -o a)').compile(Counter()): print(cp)
- v1 : c : {1}
- v2 : d : {2}
- m : (b[1, 2] -o a) : {3}
- >>> for cp in GlueFormula('m', '((d -o e) -o ((c -o b) -o a))').compile(Counter()): print(cp)
- v1 : d : {1}
- v2 : c : {2}
- m : (e[1] -o (b[2] -o a)) : {3}
- >>> for cp in GlueFormula('m', '(((d -o c) -o b) -o a)').compile(Counter()): print(cp)
- v1 : (d -o c) : {1}
- m : (b[1] -o a) : {2}
- >>> for cp in GlueFormula('m', '((((e -o d) -o c) -o b) -o a)').compile(Counter()): print(cp)
- v1 : e : {1}
- v2 : (d[1] -o c) : {2}
- m : (b[2] -o a) : {3}
-
-
-Demo of 'a man walks' using Compilation
----------------------------------------
-
-Premises
-
- >>> a = GlueFormula('\\P Q.some x.(P(x) and Q(x))', '((gv -o gr) -o ((g -o G) -o G))')
- >>> print(a)
- \P Q.exists x.(P(x) & Q(x)) : ((gv -o gr) -o ((g -o G) -o G))
-
- >>> man = GlueFormula('\\x.man(x)', '(gv -o gr)')
- >>> print(man)
- \x.man(x) : (gv -o gr)
-
- >>> walks = GlueFormula('\\x.walks(x)', '(g -o f)')
- >>> print(walks)
- \x.walks(x) : (g -o f)
-
-Compiled Premises:
-
- >>> counter = Counter()
- >>> ahc = a.compile(counter)
- >>> g1 = ahc[0]
- >>> print(g1)
- v1 : gv : {1}
- >>> g2 = ahc[1]
- >>> print(g2)
- v2 : g : {2}
- >>> g3 = ahc[2]
- >>> print(g3)
- \P Q.exists x.(P(x) & Q(x)) : (gr[1] -o (G[2] -o G)) : {3}
- >>> g4 = man.compile(counter)[0]
- >>> print(g4)
- \x.man(x) : (gv -o gr) : {4}
- >>> g5 = walks.compile(counter)[0]
- >>> print(g5)
- \x.walks(x) : (g -o f) : {5}
-
-Derivation:
-
- >>> g14 = g4.applyto(g1)
- >>> print(g14.simplify())
- man(v1) : gr : {1, 4}
- >>> g134 = g3.applyto(g14)
- >>> print(g134.simplify())
- \Q.exists x.(man(x) & Q(x)) : (G[2] -o G) : {1, 3, 4}
- >>> g25 = g5.applyto(g2)
- >>> print(g25.simplify())
- walks(v2) : f : {2, 5}
- >>> g12345 = g134.applyto(g25)
- >>> print(g12345.simplify())
- exists x.(man(x) & walks(x)) : f : {1, 2, 3, 4, 5}
-
----------------------------------
-Dependency Graph to Glue Formulas
----------------------------------
- >>> from nltk.corpus.reader.dependency import DependencyGraph
-
- >>> depgraph = DependencyGraph("""1 John _ NNP NNP _ 2 SUBJ _ _
- ... 2 sees _ VB VB _ 0 ROOT _ _
- ... 3 a _ ex_quant ex_quant _ 4 SPEC _ _
- ... 4 dog _ NN NN _ 2 OBJ _ _
- ... """)
- >>> gfl = GlueDict('nltk:grammars/sample_grammars/glue.semtype').to_glueformula_list(depgraph)
- >>> print(gfl) # doctest: +SKIP
- [\x y.sees(x,y) : (f -o (i -o g)),
- \x.dog(x) : (iv -o ir),
- \P Q.exists x.(P(x) & Q(x)) : ((iv -o ir) -o ((i -o I3) -o I3)),
- \P Q.exists x.(P(x) & Q(x)) : ((fv -o fr) -o ((f -o F4) -o F4)),
- \x.John(x) : (fv -o fr)]
- >>> glue = Glue()
- >>> for r in sorted([r.simplify().normalize() for r in glue.get_readings(glue.gfl_to_compiled(gfl))], key=str):
- ... print(r)
- exists z1.(John(z1) & exists z2.(dog(z2) & sees(z1,z2)))
- exists z1.(dog(z1) & exists z2.(John(z2) & sees(z2,z1)))
-
------------------------------------
-Dependency Graph to LFG f-structure
------------------------------------
- >>> from nltk.sem.lfg import FStructure
-
- >>> fstruct = FStructure.read_depgraph(depgraph)
-
- >>> print(fstruct) # doctest: +SKIP
- f:[pred 'sees'
- obj h:[pred 'dog'
- spec 'a']
- subj g:[pred 'John']]
-
- >>> fstruct.to_depgraph().tree().pprint()
- (sees (dog a) John)
-
----------------------------------
-LFG f-structure to Glue
----------------------------------
- >>> fstruct.to_glueformula_list(GlueDict('nltk:grammars/sample_grammars/glue.semtype')) # doctest: +SKIP
- [\x y.sees(x,y) : (i -o (g -o f)),
- \x.dog(x) : (gv -o gr),
- \P Q.exists x.(P(x) & Q(x)) : ((gv -o gr) -o ((g -o G3) -o G3)),
- \P Q.exists x.(P(x) & Q(x)) : ((iv -o ir) -o ((i -o I4) -o I4)),
- \x.John(x) : (iv -o ir)]
-
-.. see gluesemantics_malt.doctest for more
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-.. see also: gluesemantics.doctest
-
-==============================================================================
- Glue Semantics
-==============================================================================
-
- >>> from nltk.sem.glue import *
- >>> nltk.sem.logic._counter._value = 0
-
---------------------------------
-Initialize the Dependency Parser
---------------------------------
- >>> from nltk.parse.malt import MaltParser
-
- >>> tagger = RegexpTagger(
- ... [('^(John|Mary)$', 'NNP'),
- ... ('^(sees|chases)$', 'VB'),
- ... ('^(a)$', 'ex_quant'),
- ... ('^(every)$', 'univ_quant'),
- ... ('^(girl|dog)$', 'NN')
- ... ])
- >>> depparser = MaltParser(tagger=tagger)
-
---------------------
-Automated Derivation
---------------------
- >>> glue = Glue(depparser=depparser)
- >>> readings = glue.parse_to_meaning('every girl chases a dog'.split())
- >>> for reading in sorted([r.simplify().normalize() for r in readings], key=str):
- ... print(reading.normalize())
- all z1.(girl(z1) -> exists z2.(dog(z2) & chases(z1,z2)))
- exists z1.(dog(z1) & all z2.(girl(z2) -> chases(z2,z1)))
-
- >>> drtglue = DrtGlue(depparser=depparser)
- >>> readings = drtglue.parse_to_meaning('every girl chases a dog'.split())
- >>> for reading in sorted([r.simplify().normalize() for r in readings], key=str):
- ... print(reading)
- ([],[(([z1],[girl(z1)]) -> ([z2],[dog(z2), chases(z1,z2)]))])
- ([z1],[dog(z1), (([z2],[girl(z2)]) -> ([],[chases(z2,z1)]))])
-
---------------
-With inference
---------------
-
-Checking for equality of two DRSs is very useful when generating readings of a sentence.
-For example, the ``glue`` module generates two readings for the sentence
-*John sees Mary*:
-
- >>> from nltk.sem.glue import DrtGlue
- >>> readings = drtglue.parse_to_meaning('John sees Mary'.split())
- >>> for drs in sorted([r.simplify().normalize() for r in readings], key=str):
- ... print(drs)
- ([z1,z2],[John(z1), Mary(z2), sees(z1,z2)])
- ([z1,z2],[Mary(z1), John(z2), sees(z2,z1)])
-
-However, it is easy to tell that these two readings are logically the
-same, and therefore one of them is superfluous. We can use the theorem prover
-to determine this equivalence, and then delete one of them. A particular
-theorem prover may be specified, or the argument may be left off to use the
-default.
-
- >>> readings[0].equiv(readings[1])
- True
-
-
+++ /dev/null
-# -*- coding: utf-8 -*-
-
-
-def setup_module(module):
- from nose import SkipTest
- from nltk.parse.malt import MaltParser
-
- try:
- depparser = MaltParser("maltparser-1.7.2")
- except LookupError:
- raise SkipTest("MaltParser is not available")
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-===============
-Grammar Parsing
-===============
-
-Grammars can be parsed from strings:
-
- >>> from nltk import CFG
- >>> grammar = CFG.fromstring("""
- ... S -> NP VP
- ... PP -> P NP
- ... NP -> Det N | NP PP
- ... VP -> V NP | VP PP
- ... Det -> 'a' | 'the'
- ... N -> 'dog' | 'cat'
- ... V -> 'chased' | 'sat'
- ... P -> 'on' | 'in'
- ... """)
- >>> grammar
- <Grammar with 14 productions>
- >>> grammar.start()
- S
- >>> grammar.productions() # doctest: +NORMALIZE_WHITESPACE
- [S -> NP VP, PP -> P NP, NP -> Det N, NP -> NP PP, VP -> V NP, VP -> VP PP,
- Det -> 'a', Det -> 'the', N -> 'dog', N -> 'cat', V -> 'chased', V -> 'sat',
- P -> 'on', P -> 'in']
-
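-Such a grammar can be handed directly to one of NLTK's parsers; a minimal
-sketch (not part of the original tests) using ``ChartParser``:
-
- >>> from nltk.parse import ChartParser
- >>> parser = ChartParser(grammar)
- >>> for tree in parser.parse('the dog chased a cat'.split()): # doctest: +SKIP
- ... print(tree)
- (S (NP (Det the) (N dog)) (VP (V chased) (NP (Det a) (N cat))))
-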
-Probabilistic CFGs:
-
- >>> from nltk import PCFG
- >>> toy_pcfg1 = PCFG.fromstring("""
- ... S -> NP VP [1.0]
- ... NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
- ... Det -> 'the' [0.8] | 'my' [0.2]
- ... N -> 'man' [0.5] | 'telescope' [0.5]
- ... VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
- ... V -> 'ate' [0.35] | 'saw' [0.65]
- ... PP -> P NP [1.0]
- ... P -> 'with' [0.61] | 'under' [0.39]
- ... """)
-
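-A PCFG can likewise drive a probabilistic parser; a minimal sketch (not part
-of the original tests) using ``ViterbiParser``, where the probability of the
-best tree is the product of the probabilities of the rules it uses:
-
- >>> from nltk.parse import ViterbiParser
- >>> parser = ViterbiParser(toy_pcfg1)
- >>> for tree in parser.parse('I saw the man'.split()): # doctest: +SKIP
- ... print(tree)
- (S (NP I) (VP (V saw) (NP (Det the) (N man)))) (p=0.01365)
-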
-Chomsky Normal Form grammar (Test for bug 474)
-
- >>> g = CFG.fromstring("VP^<TOP> -> VBP NP^<VP-TOP>")
- >>> g.productions()[0].lhs()
- VP^<TOP>
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-==========================
- Test Suites for Grammars
-==========================
-
-Sentences in the test suite are divided into two classes:
-
-- grammatical (*accept*) and
-- ungrammatical (*reject*).
-
-If a sentence should parse according to the grammar, the value of
-``trees`` will be a non-empty list. If a sentence should be rejected
-according to the grammar, then the value of ``trees`` will be ``None``.
-
- >>> from nltk.parse import TestGrammar
- >>> germantest1 = {}
- >>> germantest1['doc'] = "Tests for person agreement"
- >>> germantest1['accept'] = [
- ... 'ich komme',
- ... 'ich sehe mich',
- ... 'du kommst',
- ... 'du siehst mich',
- ... 'sie kommt',
- ... 'sie sieht mich',
- ... 'ihr kommt',
- ... 'wir kommen',
- ... 'sie kommen',
- ... 'du magst mich',
- ... 'er mag mich',
- ... 'du folgst mir',
- ... 'sie hilft mir',
- ... ]
- >>> germantest1['reject'] = [
- ... 'ich kommt',
- ... 'ich kommst',
- ... 'ich siehst mich',
- ... 'du komme',
- ... 'du sehe mich',
- ... 'du kommt',
- ... 'er komme',
- ... 'er siehst mich',
- ... 'wir komme',
- ... 'wir kommst',
- ... 'die Katzen kommst',
- ... 'sie komme',
- ... 'sie kommst',
- ... 'du mag mich',
- ... 'er magst mich',
- ... 'du folgt mir',
- ... 'sie hilfst mir',
- ... ]
- >>> germantest2 = {}
- >>> germantest2['doc'] = "Tests for number agreement"
- >>> germantest2['accept'] = [
- ... 'der Hund kommt',
- ... 'die Hunde kommen',
- ... 'ich komme',
- ... 'wir kommen',
- ... 'ich sehe die Katzen',
- ... 'ich folge den Katzen',
- ... 'ich sehe die Katzen',
- ... 'ich folge den Katzen',
- ... 'wir sehen die Katzen',
- ... 'wir folgen den Katzen'
- ... ]
- >>> germantest2['reject'] = [
- ... 'ich kommen',
- ... 'wir komme',
- ... 'der Hunde kommt',
- ... 'der Hunde kommen',
- ... 'die Katzen kommt',
- ... 'ich sehe der Hunde',
- ... 'ich folge den Hund',
- ... 'ich sehen der Hunde',
- ... 'ich folgen den Hund',
- ... 'wir sehe die Katzen',
- ... 'wir folge den Katzen'
- ... ]
- >>> germantest3 = {}
- >>> germantest3['doc'] = "Tests for case government and subcategorization"
- >>> germantest3['accept'] = [
- ... 'der Hund sieht mich',
- ... 'der Hund kommt',
- ... 'ich sehe den Hund',
- ... 'ich helfe dem Hund',
- ... ]
- >>> germantest3['reject'] = [
- ... 'ich sehe',
- ... 'ich helfe',
- ... 'ich komme den Hund',
- ... 'ich sehe den Hund die Katzen',
- ... 'du hilfst mich',
- ... 'du siehst mir',
- ... 'du siehst ich',
- ... 'der Hunde kommt mich',
- ... 'die Hunde sehe die Hunde',
- ... 'der Hund sehe die Hunde',
- ... 'ich hilft den Hund',
- ... 'ich hilft der Hund',
- ... 'ich sehe dem Hund',
- ... ]
- >>> germantestsuites = [germantest1, germantest2, germantest3]
- >>> tester = TestGrammar('grammars/book_grammars/german.fcfg', germantestsuites)
- >>> tester.run()
- Tests for person agreement: All tests passed!
- Tests for number agreement: All tests passed!
- Tests for case government and subcategorization: All tests passed!
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-.. _align howto: align.html
-.. _ccg howto: ccg.html
-.. _chat80 howto: chat80.html
-.. _childes howto: childes.html
-.. _chunk howto: chunk.html
-.. _classify howto: classify.html
-.. _collocations howto: collocations.html
-.. _compat howto: compat.html
-.. _corpus howto: corpus.html
-.. _data howto: data.html
-.. _dependency howto: dependency.html
-.. _discourse howto: discourse.html
-.. _drt howto: drt.html
-.. _featgram howto: featgram.html
-.. _featstruct howto: featstruct.html
-.. _framenet howto: framenet.html
-.. _generate howto: generate.html
-.. _gluesemantics howto: gluesemantics.html
-.. _gluesemantics_malt howto: gluesemantics_malt.html
-.. _grammar howto: grammar.html
-.. _grammartestsuites howto: grammartestsuites.html
-.. _index howto: index.html
-.. _inference howto: inference.html
-.. _internals howto: internals.html
-.. _japanese howto: japanese.html
-.. _logic howto: logic.html
-.. _metrics howto: metrics.html
-.. _misc howto: misc.html
-.. _nonmonotonic howto: nonmonotonic.html
-.. _parse howto: parse.html
-.. _portuguese_en howto: portuguese_en.html
-.. _probability howto: probability.html
-.. _propbank howto: propbank.html
-.. _relextract howto: relextract.html
-.. _resolution howto: resolution.html
-.. _semantics howto: semantics.html
-.. _simple howto: simple.html
-.. _stem howto: stem.html
-.. _tag howto: tag.html
-.. _tokenize howto: tokenize.html
-.. _toolbox howto: toolbox.html
-.. _tree howto: tree.html
-.. _treetransforms howto: treetransforms.html
-.. _util howto: util.html
-.. _wordnet howto: wordnet.html
-.. _wordnet_lch howto: wordnet_lch.html
-
-===========
-NLTK HOWTOs
-===========
-
-* `align HOWTO`_
-* `ccg HOWTO`_
-* `chat80 HOWTO`_
-* `childes HOWTO`_
-* `chunk HOWTO`_
-* `classify HOWTO`_
-* `collocations HOWTO`_
-* `compat HOWTO`_
-* `corpus HOWTO`_
-* `data HOWTO`_
-* `dependency HOWTO`_
-* `discourse HOWTO`_
-* `drt HOWTO`_
-* `featgram HOWTO`_
-* `featstruct HOWTO`_
-* `framenet HOWTO`_
-* `generate HOWTO`_
-* `gluesemantics HOWTO`_
-* `gluesemantics_malt HOWTO`_
-* `grammar HOWTO`_
-* `grammartestsuites HOWTO`_
-* `index HOWTO`_
-* `inference HOWTO`_
-* `internals HOWTO`_
-* `japanese HOWTO`_
-* `logic HOWTO`_
-* `metrics HOWTO`_
-* `misc HOWTO`_
-* `nonmonotonic HOWTO`_
-* `parse HOWTO`_
-* `portuguese_en HOWTO`_
-* `probability HOWTO`_
-* `propbank HOWTO`_
-* `relextract HOWTO`_
-* `resolution HOWTO`_
-* `semantics HOWTO`_
-* `simple HOWTO`_
-* `stem HOWTO`_
-* `tag HOWTO`_
-* `tokenize HOWTO`_
-* `toolbox HOWTO`_
-* `tree HOWTO`_
-* `treetransforms HOWTO`_
-* `util HOWTO`_
-* `wordnet HOWTO`_
-* `wordnet_lch HOWTO`_
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-====================================
-Logical Inference and Model Building
-====================================
-
- >>> from nltk import *
- >>> from nltk.sem.drt import DrtParser
- >>> from nltk.sem import logic
- >>> logic._counter._value = 0
-
-------------
-Introduction
-------------
-
-Within the area of automated reasoning, first order theorem proving
-and model building (or model generation) have both received much
-attention, and have given rise to highly sophisticated techniques. We
-focus therefore on providing an NLTK interface to third party tools
-for these tasks. In particular, the module ``nltk.inference`` can be
-used to access both theorem provers and model builders.
-
----------------------------------
-NLTK Interface to Theorem Provers
----------------------------------
-
-The main class used to interface with a theorem prover is the ``Prover``
-class, found in ``nltk.inference.api``. The ``prove()`` method takes three optional
-arguments: a goal, a list of assumptions, and a ``verbose`` boolean to
-indicate whether the proof should be printed to the console. The proof goal
-and any assumptions need to be instances of the ``Expression`` class
-specified by ``nltk.sem.logic``. There are currently three theorem provers
-included with NLTK: ``Prover9``, ``TableauProver``, and
-``ResolutionProver``. The first is an off-the-shelf prover, while the other
-two are written in Python and included in the ``nltk.inference`` package.
-
- >>> from nltk.sem import Expression
- >>> read_expr = Expression.fromstring
- >>> p1 = read_expr('man(socrates)')
- >>> p2 = read_expr('all x.(man(x) -> mortal(x))')
- >>> c = read_expr('mortal(socrates)')
- >>> Prover9().prove(c, [p1,p2])
- True
- >>> TableauProver().prove(c, [p1,p2])
- True
- >>> ResolutionProver().prove(c, [p1,p2], verbose=True)
- [1] {-mortal(socrates)} A
- [2] {man(socrates)} A
- [3] {-man(z2), mortal(z2)} A
- [4] {-man(socrates)} (1, 3)
- [5] {mortal(socrates)} (2, 3)
- [6] {} (1, 5)
- <BLANKLINE>
- True
-
----------------------
-The ``ProverCommand``
----------------------
-
-A ``ProverCommand`` is a stateful holder for a theorem
-prover. The command stores a theorem prover instance (of type ``Prover``),
-a goal, a list of assumptions, the result of the proof, and a string version
-of the entire proof. Corresponding to the three included ``Prover``
-implementations, there are three ``ProverCommand`` implementations:
-``Prover9Command``, ``TableauProverCommand``, and
-``ResolutionProverCommand``.
-
-The ``ProverCommand``'s constructor takes its goal and assumptions. The
-``prove()`` method executes the ``Prover``, and ``proof()`` returns a
-string form of the proof. If the ``prove()`` method has not been called,
-then the prover command will be unable to display a proof.
-
- >>> prover = ResolutionProverCommand(c, [p1,p2])
- >>> print(prover.proof()) # doctest: +ELLIPSIS
- Traceback (most recent call last):
- File "...", line 1212, in __run
- compileflags, 1) in test.globs
- File "<doctest nltk/test/inference.doctest[10]>", line 1, in <module>
- File "...", line ..., in proof
- raise LookupError("You have to call prove() first to get a proof!")
- LookupError: You have to call prove() first to get a proof!
- >>> prover.prove()
- True
- >>> print(prover.proof())
- [1] {-mortal(socrates)} A
- [2] {man(socrates)} A
- [3] {-man(z4), mortal(z4)} A
- [4] {-man(socrates)} (1, 3)
- [5] {mortal(socrates)} (2, 3)
- [6] {} (1, 5)
- <BLANKLINE>
-
-The prover command stores the result of proving so that if ``prove()`` is
-called again, then the command can return the result without executing the
-prover again. This allows the user to access the result of the proof without
-wasting time re-computing what it already knows.
-
- >>> prover.prove()
- True
- >>> prover.prove()
- True
-
-The assumptions and goal may be accessed using the ``assumptions()`` and
-``goal()`` methods, respectively.
-
- >>> prover.assumptions()
- [<ApplicationExpression man(socrates)>, <AllExpression all x.(man(x) -> mortal(x))>]
- >>> prover.goal()
- <ApplicationExpression mortal(socrates)>
-
-The assumptions list may be modified using the ``add_assumptions()`` and
-``retract_assumptions()`` methods. Both methods take a list of ``Expression``
-objects. Since adding or removing assumptions may change the result of the
-proof, the stored result is cleared when either of these methods is called.
-That means that ``proof()`` will be unavailable until ``prove()`` is called and
-a call to ``prove()`` will execute the theorem prover.
-
- >>> prover.retract_assumptions([read_expr('man(socrates)')])
- >>> print(prover.proof()) # doctest: +ELLIPSIS
- Traceback (most recent call last):
- File "...", line 1212, in __run
- compileflags, 1) in test.globs
- File "<doctest nltk/test/inference.doctest[10]>", line 1, in <module>
- File "...", line ..., in proof
- raise LookupError("You have to call prove() first to get a proof!")
- LookupError: You have to call prove() first to get a proof!
- >>> prover.prove()
- False
- >>> print(prover.proof())
- [1] {-mortal(socrates)} A
- [2] {-man(z6), mortal(z6)} A
- [3] {-man(socrates)} (1, 2)
- <BLANKLINE>
- >>> prover.add_assumptions([read_expr('man(socrates)')])
- >>> prover.prove()
- True
-
--------
-Prover9
--------
-
-Prover9 Installation
-~~~~~~~~~~~~~~~~~~~~
-
-You can download Prover9 from http://www.cs.unm.edu/~mccune/prover9/.
-
-Extract the source code into a suitable directory and follow the
-instructions in the Prover9 ``README.make`` file to compile the executables.
-Install these into an appropriate location; the
-``prover9_search`` variable is currently configured to look in the
-following locations:
-
- >>> p = Prover9()
- >>> p.binary_locations() # doctest: +NORMALIZE_WHITESPACE
- ['/usr/local/bin/prover9',
- '/usr/local/bin/prover9/bin',
- '/usr/local/bin',
- '/usr/bin',
- '/usr/local/prover9',
- '/usr/local/share/prover9']
-
-Alternatively, the environment variable ``PROVER9HOME`` may be configured with
-the binary's location.
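-
-For example (the path here is purely hypothetical):
-
- >>> import os
- >>> os.environ['PROVER9HOME'] = '/path/to/prover9/bin' # doctest: +SKIP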
-
-The path to the correct directory can be set manually in the following
-manner:
-
- >>> config_prover9(path='/usr/local/bin') # doctest: +SKIP
- [Found prover9: /usr/local/bin/prover9]
-
-If the executables cannot be found, ``Prover9`` will issue a warning message:
-
- >>> p.prove() # doctest: +SKIP
- Traceback (most recent call last):
- ...
- LookupError:
- ===========================================================================
- NLTK was unable to find the prover9 executable! Use config_prover9() or
- set the PROVER9HOME environment variable.
- <BLANKLINE>
- >> config_prover9('/path/to/prover9')
- <BLANKLINE>
- For more information, on prover9, see:
- <http://www.cs.unm.edu/~mccune/prover9/>
- ===========================================================================
-
-
-Using Prover9
-~~~~~~~~~~~~~
-
-The general case in theorem proving is to determine whether ``S |- g``
-holds, where ``S`` is a possibly empty set of assumptions, and ``g``
-is a proof goal.
-
-As mentioned earlier, NLTK input to ``Prover9`` must be
-``Expression``\ s of ``nltk.sem.logic``. A ``Prover9`` instance is
-initialized with a proof goal and, possibly, some assumptions. The
-``prove()`` method attempts to find a proof of the goal, given the
-list of assumptions (in this case, none).
-
- >>> goal = read_expr('(man(x) <-> --man(x))')
- >>> prover = Prover9Command(goal)
- >>> prover.prove()
- True
-
-Given a ``ProverCommand`` instance ``prover``, the method
-``prover.proof()`` will return a string containing the extensive proof information
-provided by Prover9, shown in abbreviated form here::
-
- ============================== Prover9 ===============================
- Prover9 (32) version ...
- Process ... was started by ... on ...
- ...
- The command was ".../prover9 -f ...".
- ============================== end of head ===========================
-
- ============================== INPUT =================================
-
- % Reading from file /var/...
-
-
- formulas(goals).
- (all x (man(x) -> man(x))).
- end_of_list.
-
- ...
- ============================== end of search =========================
-
- THEOREM PROVED
-
- Exiting with 1 proof.
-
- Process 6317 exit (max_proofs) Mon Jan 21 15:23:28 2008
-
-
-As mentioned earlier, we may want to list some assumptions for
-the proof, as shown here.
-
- >>> g = read_expr('mortal(socrates)')
- >>> a1 = read_expr('all x.(man(x) -> mortal(x))')
- >>> prover = Prover9Command(g, assumptions=[a1])
- >>> prover.print_assumptions()
- all x.(man(x) -> mortal(x))
-
-However, the assumptions are not sufficient to derive the goal:
-
- >>> print(prover.prove())
- False
-
-So let's add another assumption:
-
- >>> a2 = read_expr('man(socrates)')
- >>> prover.add_assumptions([a2])
- >>> prover.print_assumptions()
- all x.(man(x) -> mortal(x))
- man(socrates)
- >>> print(prover.prove())
- True
-
-We can also show the assumptions in ``Prover9`` format.
-
- >>> prover.print_assumptions(output_format='Prover9')
- all x (man(x) -> mortal(x))
- man(socrates)
-
- >>> prover.print_assumptions(output_format='Spass')
- Traceback (most recent call last):
- . . .
- NameError: Unrecognized value for 'output_format': Spass
-
-Assumptions can be retracted from the list of assumptions.
-
- >>> prover.retract_assumptions([a1])
- >>> prover.print_assumptions()
- man(socrates)
- >>> prover.retract_assumptions([a1])
-
-Statements can be loaded from a file and parsed. We can then add these
-statements as new assumptions.
-
- >>> g = read_expr('all x.(boxer(x) -> -boxerdog(x))')
- >>> prover = Prover9Command(g)
- >>> prover.prove()
- False
- >>> import nltk.data
- >>> new = nltk.data.load('grammars/sample_grammars/background0.fol')
- >>> for a in new:
- ... print(a)
- all x.(boxerdog(x) -> dog(x))
- all x.(boxer(x) -> person(x))
- all x.-(dog(x) & person(x))
- exists x.boxer(x)
- exists x.boxerdog(x)
- >>> prover.add_assumptions(new)
- >>> print(prover.prove())
- True
- >>> print(prover.proof()) # doctest: +ELLIPSIS
- ============================== prooftrans ============================
- Prover9 (...) version ...
- Process ... was started by ... on ...
- ...
- The command was ".../prover9".
- ============================== end of head ===========================
- <BLANKLINE>
- ============================== end of input ==========================
- <BLANKLINE>
- ============================== PROOF =================================
- <BLANKLINE>
- % -------- Comments from original proof --------
- % Proof 1 at ... seconds.
- % Length of proof is 13.
- % Level of proof is 4.
- % Maximum clause weight is 0.000.
- % Given clauses 0.
- <BLANKLINE>
- <BLANKLINE>
- 1 (all x (boxerdog(x) -> dog(x))). [assumption].
- 2 (all x (boxer(x) -> person(x))). [assumption].
- 3 (all x -(dog(x) & person(x))). [assumption].
- 6 (all x (boxer(x) -> -boxerdog(x))). [goal].
- 8 -boxerdog(x) | dog(x). [clausify(1)].
- 9 boxerdog(c3). [deny(6)].
- 11 -boxer(x) | person(x). [clausify(2)].
- 12 boxer(c3). [deny(6)].
- 14 -dog(x) | -person(x). [clausify(3)].
- 15 dog(c3). [resolve(9,a,8,a)].
- 18 person(c3). [resolve(12,a,11,a)].
- 19 -person(c3). [resolve(15,a,14,a)].
- 20 $F. [resolve(19,a,18,a)].
- <BLANKLINE>
- ============================== end of proof ==========================
-
-----------------------
-The equiv() method
-----------------------
-
-One application of the theorem prover functionality is to check if
-two Expressions have the same meaning.
-The ``equiv()`` method calls a theorem prover to determine whether two
-Expressions are logically equivalent.
-
- >>> a = read_expr(r'exists x.(man(x) & walks(x))')
- >>> b = read_expr(r'exists x.(walks(x) & man(x))')
- >>> print(a.equiv(b))
- True
-
-The same method can be used on Discourse Representation Structures (DRSs).
-In this case, each DRS is converted to a first order logic form, and then
-passed to the theorem prover.
-
- >>> dp = DrtParser()
- >>> a = dp.parse(r'([x],[man(x), walks(x)])')
- >>> b = dp.parse(r'([x],[walks(x), man(x)])')
- >>> print(a.equiv(b))
- True
-
-
---------------------------------
-NLTK Interface to Model Builders
---------------------------------
-
-The top-level interface to model builders is parallel to that for
-theorem provers. The ``ModelBuilder`` interface is located
-in ``nltk.inference.api``. It is currently only implemented by
-``Mace``, which interfaces with the Mace4 model builder.
-
-Typically we use a model builder to show that some set of formulas has
-a model, and is therefore consistent. One way of doing this is by
-treating our candidate set of sentences as assumptions, and leaving
-the goal unspecified.
-Thus, the following interaction shows that both ``{a, c1}`` and ``{a, c2}``
-are consistent sets, since Mace succeeds in building a
-model for each of them, while ``{c1, c2}`` is inconsistent.
-
- >>> a3 = read_expr('exists x.(man(x) and walks(x))')
- >>> c1 = read_expr('mortal(socrates)')
- >>> c2 = read_expr('-mortal(socrates)')
- >>> mace = Mace()
- >>> print(mace.build_model(None, [a3, c1]))
- True
- >>> print(mace.build_model(None, [a3, c2]))
- True
-
-We can also use the model builder as an adjunct to the theorem prover.
-Let's suppose we are trying to prove ``S |- g``, i.e. that ``g``
-is logically entailed by assumptions ``S = {s1, s2, ..., sn}``.
-We can give this same input to Mace4, and the model builder will try to
-find a counterexample, that is, to show that ``g`` does *not* follow
-from ``S``. So, given this input, Mace4 will try to find a model for
-the set ``S' = {s1, s2, ..., sn, (not g)}``. If ``g`` fails to follow
-from ``S``, then Mace4 may well return with a counterexample faster
-than Prover9 concludes that it cannot find the required proof.
-Conversely, if ``g`` *is* provable from ``S``, Mace4 may take a long
-time unsuccessfully trying to find a counter model, and will eventually give up.
-
-In the following example, we see that the model builder does succeed
-in building a model of the assumptions together with the negation of
-the goal. That is, it succeeds in finding a model
-where there is a woman that every man loves; Adam is a man; Eve is a
-woman; but Adam does not love Eve.
-
- >>> a4 = read_expr('exists y. (woman(y) & all x. (man(x) -> love(x,y)))')
- >>> a5 = read_expr('man(adam)')
- >>> a6 = read_expr('woman(eve)')
- >>> g = read_expr('love(adam,eve)')
- >>> print(mace.build_model(g, [a4, a5, a6]))
- True
-
-The model builder will fail to find a model if the assumptions do entail
-the goal. Mace will continue to look for models of ever-increasing sizes
-until the ``end_size`` limit is reached. By default, ``end_size`` is 500,
-but it can be set lower for a quicker response.
-
- >>> a7 = read_expr('all x.(man(x) -> mortal(x))')
- >>> a8 = read_expr('man(socrates)')
- >>> g2 = read_expr('mortal(socrates)')
- >>> print(Mace(end_size=50).build_model(g2, [a7, a8]))
- False
-
-There is also a ``ModelBuilderCommand`` class that, like ``ProverCommand``,
-stores a ``ModelBuilder``, a goal, assumptions, a result, and a model. The
-only implementation in NLTK is ``MaceCommand``.
-
-
------
-Mace4
------
-
-Mace4 Installation
-~~~~~~~~~~~~~~~~~~
-
-Mace4 is packaged with Prover9, and can be downloaded from the same
-source, namely http://www.cs.unm.edu/~mccune/prover9/. It is installed
-in the same manner as Prover9.
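-
-Since ``Mace`` reuses ``Prover9``'s binary-search machinery, the search
-locations listed earlier for ``prover9`` are also consulted for the
-``mace4`` executable (a sketch, not part of the original tests):
-
- >>> Mace().binary_locations() # doctest: +SKIP
- ['/usr/local/bin/prover9', ...]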
-
-Using Mace4
-~~~~~~~~~~~
-
-Check whether Mace4 can find a model.
-
- >>> a = read_expr('(see(mary,john) & -(mary = john))')
- >>> mb = MaceCommand(assumptions=[a])
- >>> mb.build_model()
- True
-
-Show the model in 'tabular' format.
-
- >>> print(mb.model(format='tabular'))
- % number = 1
- % seconds = 0
- <BLANKLINE>
- % Interpretation of size 2
- <BLANKLINE>
- john : 0
- <BLANKLINE>
- mary : 1
- <BLANKLINE>
- see :
- | 0 1
- ---+----
- 0 | 0 0
- 1 | 1 0
- <BLANKLINE>
-
-Show the model in 'cooked' format.
-
- >>> print(mb.model(format='cooked'))
- % number = 1
- % seconds = 0
- <BLANKLINE>
- % Interpretation of size 2
- <BLANKLINE>
- john = 0.
- <BLANKLINE>
- mary = 1.
- <BLANKLINE>
- - see(0,0).
- - see(0,1).
- see(1,0).
- - see(1,1).
- <BLANKLINE>
-
-The property ``valuation`` accesses the stored ``Valuation``.
-
- >>> print(mb.valuation)
- {'john': 'a', 'mary': 'b', 'see': {('b', 'a')}}
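-
-(Note that Mace4's numeric individuals, ``0`` and ``1`` in the tabular model
-above, are renamed to the letters ``a`` and ``b`` when the model is converted
-to a ``Valuation``.)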
-
-We can return to our earlier example and inspect the model:
-
- >>> mb = MaceCommand(g, assumptions=[a4, a5, a6])
- >>> m = mb.build_model()
- >>> print(mb.model(format='cooked'))
- % number = 1
- % seconds = 0
- <BLANKLINE>
- % Interpretation of size 2
- <BLANKLINE>
- adam = 0.
- <BLANKLINE>
- eve = 0.
- <BLANKLINE>
- c1 = 1.
- <BLANKLINE>
- man(0).
- - man(1).
- <BLANKLINE>
- woman(0).
- woman(1).
- <BLANKLINE>
- - love(0,0).
- love(0,1).
- - love(1,0).
- - love(1,1).
- <BLANKLINE>
-
-Here, we can see that ``adam`` and ``eve`` have been assigned the same
-individual, namely ``0``, as their value; ``0`` is both a man and a woman; a
-second individual, ``1``, is also a woman; and ``0`` loves ``1``. Thus, this is
-an interpretation in which there is a woman that every man loves but
-Adam doesn't love Eve.
-
-Mace can also be used with propositional logic.
-
- >>> p = read_expr('P')
- >>> q = read_expr('Q')
- >>> mb = MaceCommand(q, [p, p>-q])
- >>> mb.build_model()
- True
- >>> mb.valuation['P']
- True
- >>> mb.valuation['Q']
- False
+++ /dev/null
-# -*- coding: utf-8 -*-
-
-
-def setup_module(module):
-    from nose import SkipTest
-    from nltk.inference.mace import Mace
-
-    # Skip these doctests if the Mace4/Prover9 binaries cannot be located.
-    try:
-        m = Mace()
-        m._find_binary("mace4")
-    except LookupError:
-        raise SkipTest(
-            "Mace4/Prover9 is not available so inference.doctest was skipped"
-        )
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-==========================================
- Unit tests for the nltk.internals module
-==========================================
-
-overridden()
-~~~~~~~~~~~~
- >>> from nltk.internals import overridden
-
-The typical use case is in defining methods for an interface or
-abstract base class, in such a way that subclasses don't have to
-implement all of the methods:
-
- >>> class EaterI(object):
- ... '''Subclass must define eat() or batch_eat().'''
- ... def eat(self, food):
- ... if overridden(self.batch_eat):
- ... return self.batch_eat([food])[0]
- ... else:
- ... raise NotImplementedError()
- ... def batch_eat(self, foods):
- ... return [self.eat(food) for food in foods]
-
-As long as a subclass implements one method, it will be used to
-perform the other method:
-
- >>> class GoodEater1(EaterI):
- ... def eat(self, food):
- ... return 'yum'
- >>> GoodEater1().eat('steak')
- 'yum'
- >>> GoodEater1().batch_eat(['steak', 'peas'])
- ['yum', 'yum']
-
- >>> class GoodEater2(EaterI):
- ... def batch_eat(self, foods):
- ... return ['yum' for food in foods]
- >>> GoodEater2().eat('steak')
- 'yum'
- >>> GoodEater2().batch_eat(['steak', 'peas'])
- ['yum', 'yum']
-
-But if a subclass doesn't implement either method, then it will get an
-error when either is called (note that this is better than infinite
-recursion):
-
- >>> class BadEater1(EaterI):
- ... pass
- >>> BadEater1().eat('steak')
- Traceback (most recent call last):
- . . .
- NotImplementedError
- >>> BadEater1().batch_eat(['steak', 'peas'])
- Traceback (most recent call last):
- . . .
- NotImplementedError
-
-Trying to use the abstract base class itself will also result in an
-error:
-
- >>> class EaterI(EaterI):
- ... pass
- >>> EaterI().eat('steak')
- Traceback (most recent call last):
- . . .
- NotImplementedError
- >>> EaterI().batch_eat(['steak', 'peas'])
- Traceback (most recent call last):
- . . .
- NotImplementedError
-
-It's ok to use intermediate abstract classes:
-
- >>> class AbstractEater(EaterI):
- ... pass
-
- >>> class GoodEater3(AbstractEater):
- ... def eat(self, food):
- ... return 'yum'
- ...
- >>> GoodEater3().eat('steak')
- 'yum'
- >>> GoodEater3().batch_eat(['steak', 'peas'])
- ['yum', 'yum']
-
- >>> class GoodEater4(AbstractEater):
- ... def batch_eat(self, foods):
- ... return ['yum' for food in foods]
- >>> GoodEater4().eat('steak')
- 'yum'
- >>> GoodEater4().batch_eat(['steak', 'peas'])
- ['yum', 'yum']
-
- >>> class BadEater2(AbstractEater):
- ... pass
- >>> BadEater2().eat('steak')
- Traceback (most recent call last):
- . . .
- NotImplementedError
- >>> BadEater2().batch_eat(['steak', 'peas'])
- Traceback (most recent call last):
- . . .
- NotImplementedError
-
-Here are some extra tests:
-
- >>> class A(object):
- ... def f(x): pass
- >>> class B(A):
- ... def f(x): pass
- >>> class C(A): pass
- >>> class D(B): pass
-
- >>> overridden(A().f)
- False
- >>> overridden(B().f)
- True
- >>> overridden(C().f)
- False
- >>> overridden(D().f)
- True
-
-It works for classic classes, too:
-
- >>> class A:
- ... def f(x): pass
- >>> class B(A):
- ... def f(x): pass
- >>> class C(A): pass
- >>> class D(B): pass
- >>> overridden(A().f)
- False
- >>> overridden(B().f)
- True
- >>> overridden(C().f)
- False
- >>> overridden(D().f)
- True
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-============================
-Japanese Language Processing
-============================
-
- >>> from nltk import *
-
--------------
-Corpus Access
--------------
-
-KNB Corpus
-----------
-
- >>> from nltk.corpus import knbc
-
-Access the words: this should produce a list of strings:
-
- >>> type(knbc.words()[0]) is not bytes
- True
-
-Access the sentences: this should produce a list of lists of strings:
-
- >>> type(knbc.sents()[0][0]) is not bytes
- True
-
-Access the tagged words: this should produce a list of word, tag pairs:
-
- >>> type(knbc.tagged_words()[0])
- <... 'tuple'>
-
-Access the tagged sentences: this should produce a list of lists of word, tag pairs:
-
- >>> type(knbc.tagged_sents()[0][0])
- <... 'tuple'>
-
-
-JEITA Corpus
-------------
-
- >>> from nltk.corpus import jeita
-
-Access the tagged words: this should produce a list of word, tag pairs, where a tag is a string:
-
- >>> type(jeita.tagged_words()[0][1]) is not bytes
- True
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-.. -*- coding: utf-8 -*-
-
-
-Regression Tests
-================
-
-
-Issue 167
----------
-https://github.com/nltk/nltk/issues/167
-
- >>> from nltk.corpus import brown
- >>> from nltk.lm.preprocessing import padded_everygram_pipeline
- >>> ngram_order = 3
- >>> train_data, vocab_data = padded_everygram_pipeline(
- ... ngram_order,
- ... brown.sents(categories="news")
- ... )
-
- >>> from nltk.lm import WittenBellInterpolated
- >>> lm = WittenBellInterpolated(ngram_order)
- >>> lm.fit(train_data, vocab_data)
-
-A sentence containing an unseen word should result in infinite entropy because
-Witten-Bell is based ultimately on MLE, which cannot handle unseen ngrams.
-Crucially, it shouldn't raise any exceptions for unseen words.
-
- >>> from nltk.util import ngrams
- >>> sent = ngrams("This is a sentence with the word aaddvark".split(), 3)
- >>> lm.entropy(sent)
- inf
-
-If we remove all unseen ngrams from the sentence, we'll get a non-infinite value
-for the entropy.
-
- >>> sent = ngrams("This is a sentence".split(), 3)
- >>> lm.entropy(sent)
- 17.41365588455936
-
-
-Issue 367
----------
-https://github.com/nltk/nltk/issues/367
-
-Reproducing Dan Blanchard's example:
-https://github.com/nltk/nltk/issues/367#issuecomment-14646110
-
- >>> from nltk.lm import Lidstone, Vocabulary
- >>> word_seq = list('aaaababaaccbacb')
- >>> ngram_order = 2
- >>> from nltk.util import everygrams
- >>> train_data = [everygrams(word_seq, max_len=ngram_order)]
- >>> V = Vocabulary(['a', 'b', 'c', ''])
- >>> lm = Lidstone(0.2, ngram_order, vocabulary=V)
- >>> lm.fit(train_data)
-
-For the doctest to work, we have to sort the vocabulary keys.
-
- >>> V_keys = sorted(V)
- >>> round(sum(lm.score(w, ("b",)) for w in V_keys), 6)
- 1.0
- >>> round(sum(lm.score(w, ("a",)) for w in V_keys), 6)
- 1.0
-
- >>> [lm.score(w, ("b",)) for w in V_keys]
- [0.05, 0.05, 0.8, 0.05, 0.05]
- >>> [round(lm.score(w, ("a",)), 4) for w in V_keys]
- [0.0222, 0.0222, 0.4667, 0.2444, 0.2444]
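-
-These scores follow directly from the Lidstone formula
-``(c + gamma) / (N + gamma * |V|)``: the context ``('b',)`` occurs three
-times in the training sequence, each time followed by ``'a'``, so with
-``gamma = 0.2`` and ``|V| = 5`` the score for ``'a'`` is
-``(3 + 0.2) / (3 + 1.0) = 0.8``, while each of the other four keys gets
-``(0 + 0.2) / 4.0 = 0.05``.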
-
-
-Here's reproducing @afourney's comment:
-https://github.com/nltk/nltk/issues/367#issuecomment-15686289
-
- >>> sent = ['foo', 'foo', 'foo', 'foo', 'bar', 'baz']
- >>> ngram_order = 3
- >>> from nltk.lm.preprocessing import padded_everygram_pipeline
- >>> train_data, vocab_data = padded_everygram_pipeline(ngram_order, [sent])
- >>> from nltk.lm import Lidstone
- >>> lm = Lidstone(0.2, ngram_order)
- >>> lm.fit(train_data, vocab_data)
-
-The vocabulary includes the "UNK" symbol as well as two padding symbols.
-
- >>> len(lm.vocab)
- 6
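-
-That is, the three word types ``foo``, ``bar`` and ``baz``, the two padding
-symbols ``<s>`` and ``</s>``, and the "UNK" label itself.
-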
- >>> word = "foo"
- >>> context = ("bar", "baz")
-
-The raw counts.
-
- >>> lm.context_counts(context)[word]
- 0
- >>> lm.context_counts(context).N()
- 1
-
-Counts with Lidstone smoothing.
-
- >>> lm.context_counts(context)[word] + lm.gamma
- 0.2
- >>> lm.context_counts(context).N() + len(lm.vocab) * lm.gamma
- 2.2
-
-Without any backoff, just using Lidstone smoothing, P("foo" | "bar", "baz") should be:
-0.2 / 2.2 ~= 0.090909
-
- >>> round(lm.score(word, context), 6)
- 0.090909
-
-
-Issue 380
----------
-https://github.com/nltk/nltk/issues/380
-
-Reproducing setup akin to this comment:
-https://github.com/nltk/nltk/issues/380#issue-12879030
-
-For speed, take only the first 100 sentences of the Reuters corpus; this
-shouldn't affect the test.
-
- >>> from nltk.corpus import reuters
- >>> sents = reuters.sents()[:100]
- >>> ngram_order = 3
- >>> from nltk.lm.preprocessing import padded_everygram_pipeline
- >>> train_data, vocab_data = padded_everygram_pipeline(ngram_order, sents)
-
- >>> from nltk.lm import Lidstone
- >>> lm = Lidstone(0.2, ngram_order)
- >>> lm.fit(train_data, vocab_data)
- >>> lm.score("said", ("",)) < 1
- True
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-=======================
-Logic & Lambda Calculus
-=======================
-
-The ``nltk.sem.logic`` package allows expressions of First-Order Logic (FOL) to be
-parsed into ``Expression`` objects. In addition to FOL, the parser
-handles lambda-abstraction with variables of higher order.
-
---------
-Overview
---------
-
- >>> from nltk.sem.logic import *
-
-The default inventory of logical constants is the following:
-
- >>> boolean_ops() # doctest: +NORMALIZE_WHITESPACE
- negation -
- conjunction &
- disjunction |
- implication ->
- equivalence <->
- >>> equality_preds() # doctest: +NORMALIZE_WHITESPACE
- equality =
- inequality !=
- >>> binding_ops() # doctest: +NORMALIZE_WHITESPACE
- existential exists
- universal all
- lambda \
-
-----------------
-Regression Tests
-----------------
-
-
-Untyped Logic
-+++++++++++++
-
-Process logical expressions conveniently:
-
- >>> read_expr = Expression.fromstring
-
-Test for equality under alpha-conversion
-========================================
-
- >>> e1 = read_expr('exists x.P(x)')
- >>> print(e1)
- exists x.P(x)
- >>> e2 = e1.alpha_convert(Variable('z'))
- >>> print(e2)
- exists z.P(z)
- >>> e1 == e2
- True
-
-
- >>> l = read_expr(r'\X.\X.X(X)(1)').simplify()
- >>> id = read_expr(r'\X.X(X)')
- >>> l == id
- True
-
-Test numerals
-=============
-
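-In the Church encoding, the numeral *n* is the function that applies ``F``
-*n* times to ``x``; ``succ``, ``plus``, ``mult`` and ``pred`` below are the
-usual lambda-calculus combinators for successor, addition, multiplication
-and predecessor.
-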
- >>> zero = read_expr(r'\F x.x')
- >>> one = read_expr(r'\F x.F(x)')
- >>> two = read_expr(r'\F x.F(F(x))')
- >>> three = read_expr(r'\F x.F(F(F(x)))')
- >>> four = read_expr(r'\F x.F(F(F(F(x))))')
- >>> succ = read_expr(r'\N F x.F(N(F,x))')
- >>> plus = read_expr(r'\M N F x.M(F,N(F,x))')
- >>> mult = read_expr(r'\M N F.M(N(F))')
- >>> pred = read_expr(r'\N F x.(N(\G H.H(G(F)))(\u.x)(\u.u))')
- >>> v1 = ApplicationExpression(succ, zero).simplify()
- >>> v1 == one
- True
- >>> v2 = ApplicationExpression(succ, v1).simplify()
- >>> v2 == two
- True
- >>> v3 = ApplicationExpression(ApplicationExpression(plus, v1), v2).simplify()
- >>> v3 == three
- True
- >>> v4 = ApplicationExpression(ApplicationExpression(mult, v2), v2).simplify()
- >>> v4 == four
- True
- >>> v5 = ApplicationExpression(pred, ApplicationExpression(pred, v4)).simplify()
- >>> v5 == two
- True
-
-Overloaded operators also exist, for convenience.
-
- >>> print(succ(zero).simplify() == one)
- True
- >>> print(plus(one,two).simplify() == three)
- True
- >>> print(mult(two,two).simplify() == four)
- True
- >>> print(pred(pred(four)).simplify() == two)
- True
-
- >>> john = read_expr(r'john')
- >>> man = read_expr(r'\x.man(x)')
- >>> walk = read_expr(r'\x.walk(x)')
- >>> man(john).simplify()
- <ApplicationExpression man(john)>
- >>> print(-walk(john).simplify())
- -walk(john)
- >>> print((man(john) & walk(john)).simplify())
- (man(john) & walk(john))
- >>> print((man(john) | walk(john)).simplify())
- (man(john) | walk(john))
- >>> print((man(john) > walk(john)).simplify())
- (man(john) -> walk(john))
- >>> print((man(john) < walk(john)).simplify())
- (man(john) <-> walk(john))
-
-Python's built-in lambda operator can also be used with ``Expression``\ s:
-
- >>> john = VariableExpression(Variable('john'))
- >>> run_var = VariableExpression(Variable('run'))
- >>> run = lambda x: run_var(x)
- >>> run(john)
- <ApplicationExpression run(john)>
-
-
-``betaConversionTestSuite.pl``
-------------------------------
-
-Tests based on Blackburn & Bos' book, *Representation and Inference
-for Natural Language*.
-
- >>> x1 = read_expr(r'\P.P(mia)(\x.walk(x))').simplify()
- >>> x2 = read_expr(r'walk(mia)').simplify()
- >>> x1 == x2
- True
-
- >>> x1 = read_expr(r'exists x.(man(x) & ((\P.exists x.(woman(x) & P(x)))(\y.love(x,y))))').simplify()
- >>> x2 = read_expr(r'exists x.(man(x) & exists y.(woman(y) & love(x,y)))').simplify()
- >>> x1 == x2
- True
- >>> x1 = read_expr(r'\a.sleep(a)(mia)').simplify()
- >>> x2 = read_expr(r'sleep(mia)').simplify()
- >>> x1 == x2
- True
- >>> x1 = read_expr(r'\a.\b.like(b,a)(mia)').simplify()
- >>> x2 = read_expr(r'\b.like(b,mia)').simplify()
- >>> x1 == x2
- True
- >>> x1 = read_expr(r'\a.(\b.like(b,a)(vincent))').simplify()
- >>> x2 = read_expr(r'\a.like(vincent,a)').simplify()
- >>> x1 == x2
- True
- >>> x1 = read_expr(r'\a.((\b.like(b,a)(vincent)) & sleep(a))').simplify()
- >>> x2 = read_expr(r'\a.(like(vincent,a) & sleep(a))').simplify()
- >>> x1 == x2
- True
-
- >>> x1 = read_expr(r'(\a.\b.like(b,a)(mia)(vincent))').simplify()
- >>> x2 = read_expr(r'like(vincent,mia)').simplify()
- >>> x1 == x2
- True
-
- >>> x1 = read_expr(r'P((\a.sleep(a)(vincent)))').simplify()
- >>> x2 = read_expr(r'P(sleep(vincent))').simplify()
- >>> x1 == x2
- True
-
- >>> x1 = read_expr(r'\A.A((\b.sleep(b)(vincent)))').simplify()
- >>> x2 = read_expr(r'\A.A(sleep(vincent))').simplify()
- >>> x1 == x2
- True
-
- >>> x1 = read_expr(r'\A.A(sleep(vincent))').simplify()
- >>> x2 = read_expr(r'\A.A(sleep(vincent))').simplify()
- >>> x1 == x2
- True
-
- >>> x1 = read_expr(r'(\A.A(vincent)(\b.sleep(b)))').simplify()
- >>> x2 = read_expr(r'sleep(vincent)').simplify()
- >>> x1 == x2
- True
-
- >>> x1 = read_expr(r'\A.believe(mia,A(vincent))(\b.sleep(b))').simplify()
- >>> x2 = read_expr(r'believe(mia,sleep(vincent))').simplify()
- >>> x1 == x2
- True
-
- >>> x1 = read_expr(r'(\A.(A(vincent) & A(mia)))(\b.sleep(b))').simplify()
- >>> x2 = read_expr(r'(sleep(vincent) & sleep(mia))').simplify()
- >>> x1 == x2
- True
-
- >>> x1 = read_expr(r'\A.\B.(\C.C(A(vincent))(\d.probably(d)) & (\C.C(B(mia))(\d.improbably(d))))(\f.walk(f))(\f.talk(f))').simplify()
- >>> x2 = read_expr(r'(probably(walk(vincent)) & improbably(talk(mia)))').simplify()
- >>> x1 == x2
- True
-
- >>> x1 = read_expr(r'(\a.\b.(\C.C(a,b)(\d.\f.love(d,f))))(jules)(mia)').simplify()
- >>> x2 = read_expr(r'love(jules,mia)').simplify()
- >>> x1 == x2
- True
-
- >>> x1 = read_expr(r'(\A.\B.exists c.(A(c) & B(c)))(\d.boxer(d),\d.sleep(d))').simplify()
- >>> x2 = read_expr(r'exists c.(boxer(c) & sleep(c))').simplify()
- >>> x1 == x2
- True
-
- >>> x1 = read_expr(r'\A.Z(A)(\c.\a.like(a,c))').simplify()
- >>> x2 = read_expr(r'Z(\c.\a.like(a,c))').simplify()
- >>> x1 == x2
- True
-
- >>> x1 = read_expr(r'\A.\b.A(b)(\c.\b.like(b,c))').simplify()
- >>> x2 = read_expr(r'\b.(\c.\b.like(b,c)(b))').simplify()
- >>> x1 == x2
- True
-
- >>> x1 = read_expr(r'(\a.\b.(\C.C(a,b)(\b.\a.loves(b,a))))(jules)(mia)').simplify()
- >>> x2 = read_expr(r'loves(jules,mia)').simplify()
- >>> x1 == x2
- True
-
- >>> x1 = read_expr(r'(\A.\b.(exists b.A(b) & A(b)))(\c.boxer(c))(vincent)').simplify()
- >>> x2 = read_expr(r'((exists b.boxer(b)) & boxer(vincent))').simplify()
- >>> x1 == x2
- True
-
-Test Parser
-===========
-
- >>> print(read_expr(r'john'))
- john
- >>> print(read_expr(r'x'))
- x
- >>> print(read_expr(r'-man(x)'))
- -man(x)
- >>> print(read_expr(r'--man(x)'))
- --man(x)
- >>> print(read_expr(r'(man(x))'))
- man(x)
- >>> print(read_expr(r'((man(x)))'))
- man(x)
- >>> print(read_expr(r'man(x) <-> tall(x)'))
- (man(x) <-> tall(x))
- >>> print(read_expr(r'(man(x) <-> tall(x))'))
- (man(x) <-> tall(x))
- >>> print(read_expr(r'(man(x) & tall(x) & walks(x))'))
- (man(x) & tall(x) & walks(x))
- >>> print(read_expr(r'(man(x) & tall(x) & walks(x))').first)
- (man(x) & tall(x))
- >>> print(read_expr(r'man(x) | tall(x) & walks(x)'))
- (man(x) | (tall(x) & walks(x)))
- >>> print(read_expr(r'((man(x) & tall(x)) | walks(x))'))
- ((man(x) & tall(x)) | walks(x))
- >>> print(read_expr(r'man(x) & (tall(x) | walks(x))'))
- (man(x) & (tall(x) | walks(x)))
- >>> print(read_expr(r'(man(x) & (tall(x) | walks(x)))'))
- (man(x) & (tall(x) | walks(x)))
- >>> print(read_expr(r'P(x) -> Q(x) <-> R(x) | S(x) & T(x)'))
- ((P(x) -> Q(x)) <-> (R(x) | (S(x) & T(x))))
- >>> print(read_expr(r'exists x.man(x)'))
- exists x.man(x)
- >>> print(read_expr(r'exists x.(man(x) & tall(x))'))
- exists x.(man(x) & tall(x))
- >>> print(read_expr(r'exists x.(man(x) & tall(x) & walks(x))'))
- exists x.(man(x) & tall(x) & walks(x))
- >>> print(read_expr(r'-P(x) & Q(x)'))
- (-P(x) & Q(x))
- >>> read_expr(r'-P(x) & Q(x)') == read_expr(r'(-P(x)) & Q(x)')
- True
- >>> print(read_expr(r'\x.man(x)'))
- \x.man(x)
- >>> print(read_expr(r'\x.man(x)(john)'))
- \x.man(x)(john)
- >>> print(read_expr(r'\x.man(x)(john) & tall(x)'))
- (\x.man(x)(john) & tall(x))
- >>> print(read_expr(r'\x.\y.sees(x,y)'))
- \x y.sees(x,y)
- >>> print(read_expr(r'\x y.sees(x,y)'))
- \x y.sees(x,y)
- >>> print(read_expr(r'\x.\y.sees(x,y)(a)'))
- (\x y.sees(x,y))(a)
- >>> print(read_expr(r'\x y.sees(x,y)(a)'))
- (\x y.sees(x,y))(a)
- >>> print(read_expr(r'\x.\y.sees(x,y)(a)(b)'))
- ((\x y.sees(x,y))(a))(b)
- >>> print(read_expr(r'\x y.sees(x,y)(a)(b)'))
- ((\x y.sees(x,y))(a))(b)
- >>> print(read_expr(r'\x.\y.sees(x,y)(a,b)'))
- ((\x y.sees(x,y))(a))(b)
- >>> print(read_expr(r'\x y.sees(x,y)(a,b)'))
- ((\x y.sees(x,y))(a))(b)
- >>> print(read_expr(r'((\x.\y.sees(x,y))(a))(b)'))
- ((\x y.sees(x,y))(a))(b)
- >>> print(read_expr(r'P(x)(y)(z)'))
- P(x,y,z)
- >>> print(read_expr(r'P(Q)'))
- P(Q)
- >>> print(read_expr(r'P(Q(x))'))
- P(Q(x))
- >>> print(read_expr(r'(\x.exists y.walks(x,y))(x)'))
- (\x.exists y.walks(x,y))(x)
- >>> print(read_expr(r'exists x.(x = john)'))
- exists x.(x = john)
- >>> print(read_expr(r'((\P.\Q.exists x.(P(x) & Q(x)))(\x.dog(x)))(\x.bark(x))'))
- ((\P Q.exists x.(P(x) & Q(x)))(\x.dog(x)))(\x.bark(x))
- >>> a = read_expr(r'exists c.exists b.A(b,c) & A(b,c)')
- >>> b = read_expr(r'(exists c.(exists b.A(b,c))) & A(b,c)')
- >>> print(a == b)
- True
- >>> a = read_expr(r'exists c.(exists b.A(b,c) & A(b,c))')
- >>> b = read_expr(r'exists c.((exists b.A(b,c)) & A(b,c))')
- >>> print(a == b)
- True
- >>> print(read_expr(r'exists x.x = y'))
- exists x.(x = y)
- >>> print(read_expr('A(B)(C)'))
- A(B,C)
- >>> print(read_expr('(A(B))(C)'))
- A(B,C)
- >>> print(read_expr('A((B)(C))'))
- A(B(C))
- >>> print(read_expr('A(B(C))'))
- A(B(C))
- >>> print(read_expr('(A)(B(C))'))
- A(B(C))
- >>> print(read_expr('(((A)))(((B))(((C))))'))
- A(B(C))
- >>> print(read_expr(r'A != B'))
- -(A = B)
- >>> print(read_expr('P(x) & x=y & P(y)'))
- (P(x) & (x = y) & P(y))
- >>> try: print(read_expr(r'\walk.walk(x)'))
- ... except LogicalExpressionException as e: print(e)
- 'walk' is an illegal variable name. Constants may not be abstracted.
- \walk.walk(x)
- ^
- >>> try: print(read_expr(r'all walk.walk(john)'))
- ... except LogicalExpressionException as e: print(e)
- 'walk' is an illegal variable name. Constants may not be quantified.
- all walk.walk(john)
- ^
- >>> try: print(read_expr(r'x(john)'))
- ... except LogicalExpressionException as e: print(e)
- 'x' is an illegal predicate name. Individual variables may not be used as predicates.
- x(john)
- ^
-
- >>> from nltk.sem.logic import LogicParser # hack to give access to custom quote chars
- >>> lpq = LogicParser()
- >>> lpq.quote_chars = [("'", "'", "\\", False)]
- >>> print(lpq.parse(r"(man(x) & 'tall\'s,' (x) & walks (x) )"))
- (man(x) & tall's,(x) & walks(x))
- >>> lpq.quote_chars = [("'", "'", "\\", True)]
- >>> print(lpq.parse(r"'tall\'s,'"))
- 'tall\'s,'
- >>> print(lpq.parse(r"'spaced name(x)'"))
- 'spaced name(x)'
- >>> print(lpq.parse(r"-'tall\'s,'(x)"))
- -'tall\'s,'(x)
- >>> print(lpq.parse(r"(man(x) & 'tall\'s,' (x) & walks (x) )"))
- (man(x) & 'tall\'s,'(x) & walks(x))
-
-
-Simplify
-========
-
- >>> print(read_expr(r'\x.man(x)(john)').simplify())
- man(john)
- >>> print(read_expr(r'\x.((man(x)))(john)').simplify())
- man(john)
- >>> print(read_expr(r'\x.\y.sees(x,y)(john, mary)').simplify())
- sees(john,mary)
- >>> print(read_expr(r'\x y.sees(x,y)(john, mary)').simplify())
- sees(john,mary)
- >>> print(read_expr(r'\x.\y.sees(x,y)(john)(mary)').simplify())
- sees(john,mary)
- >>> print(read_expr(r'\x y.sees(x,y)(john)(mary)').simplify())
- sees(john,mary)
- >>> print(read_expr(r'\x.\y.sees(x,y)(john)').simplify())
- \y.sees(john,y)
- >>> print(read_expr(r'\x y.sees(x,y)(john)').simplify())
- \y.sees(john,y)
- >>> print(read_expr(r'(\x.\y.sees(x,y)(john))(mary)').simplify())
- sees(john,mary)
- >>> print(read_expr(r'(\x y.sees(x,y)(john))(mary)').simplify())
- sees(john,mary)
- >>> print(read_expr(r'exists x.(man(x) & (\x.exists y.walks(x,y))(x))').simplify())
- exists x.(man(x) & exists y.walks(x,y))
- >>> e1 = read_expr(r'exists x.(man(x) & (\x.exists y.walks(x,y))(y))').simplify()
- >>> e2 = read_expr(r'exists x.(man(x) & exists z1.walks(y,z1))')
- >>> e1 == e2
- True
- >>> print(read_expr(r'(\P Q.exists x.(P(x) & Q(x)))(\x.dog(x))').simplify())
- \Q.exists x.(dog(x) & Q(x))
- >>> print(read_expr(r'((\P.\Q.exists x.(P(x) & Q(x)))(\x.dog(x)))(\x.bark(x))').simplify())
- exists x.(dog(x) & bark(x))
- >>> print(read_expr(r'\P.(P(x)(y))(\a b.Q(a,b))').simplify())
- Q(x,y)
-
-Replace
-=======
-
- >>> a = read_expr(r'a')
- >>> x = read_expr(r'x')
- >>> y = read_expr(r'y')
- >>> z = read_expr(r'z')
-
- >>> print(read_expr(r'man(x)').replace(x.variable, a, False))
- man(a)
- >>> print(read_expr(r'(man(x) & tall(x))').replace(x.variable, a, False))
- (man(a) & tall(a))
- >>> print(read_expr(r'exists x.man(x)').replace(x.variable, a, False))
- exists x.man(x)
- >>> print(read_expr(r'exists x.man(x)').replace(x.variable, a, True))
- exists a.man(a)
- >>> print(read_expr(r'exists x.give(x,y,z)').replace(y.variable, a, False))
- exists x.give(x,a,z)
- >>> print(read_expr(r'exists x.give(x,y,z)').replace(y.variable, a, True))
- exists x.give(x,a,z)
- >>> e1 = read_expr(r'exists x.give(x,y,z)').replace(y.variable, x, False)
- >>> e2 = read_expr(r'exists z1.give(z1,x,z)')
- >>> e1 == e2
- True
- >>> e1 = read_expr(r'exists x.give(x,y,z)').replace(y.variable, x, True)
- >>> e2 = read_expr(r'exists z1.give(z1,x,z)')
- >>> e1 == e2
- True
- >>> print(read_expr(r'\x y z.give(x,y,z)').replace(y.variable, a, False))
- \x y z.give(x,y,z)
- >>> print(read_expr(r'\x y z.give(x,y,z)').replace(y.variable, a, True))
- \x a z.give(x,a,z)
- >>> print(read_expr(r'\x.\y.give(x,y,z)').replace(z.variable, a, False))
- \x y.give(x,y,a)
- >>> print(read_expr(r'\x.\y.give(x,y,z)').replace(z.variable, a, True))
- \x y.give(x,y,a)
- >>> e1 = read_expr(r'\x.\y.give(x,y,z)').replace(z.variable, x, False)
- >>> e2 = read_expr(r'\z1.\y.give(z1,y,x)')
- >>> e1 == e2
- True
- >>> e1 = read_expr(r'\x.\y.give(x,y,z)').replace(z.variable, x, True)
- >>> e2 = read_expr(r'\z1.\y.give(z1,y,x)')
- >>> e1 == e2
- True
- >>> print(read_expr(r'\x.give(x,y,z)').replace(z.variable, y, False))
- \x.give(x,y,y)
- >>> print(read_expr(r'\x.give(x,y,z)').replace(z.variable, y, True))
- \x.give(x,y,y)
-
- >>> from nltk.sem import logic
- >>> logic._counter._value = 0
- >>> e1 = read_expr('e1')
- >>> e2 = read_expr('e2')
- >>> print(read_expr('exists e1 e2.(walk(e1) & talk(e2))').replace(e1.variable, e2, True))
- exists e2 e01.(walk(e2) & talk(e01))
-
-
-Variables / Free
-================
-
- >>> examples = [r'walk(john)',
- ... r'walk(x)',
- ... r'?vp(?np)',
- ... r'see(john,mary)',
- ... r'exists x.walk(x)',
- ... r'\x.see(john,x)',
- ... r'\x.see(john,x)(mary)',
- ... r'P(x)',
- ... r'\P.P(x)',
- ... r'aa(x,bb(y),cc(z),P(w),u)',
- ... r'bo(?det(?n),@x)']
- >>> examples = [read_expr(e) for e in examples]
-
- >>> for e in examples:
- ... print('%-25s' % e, sorted(e.free()))
- walk(john) []
- walk(x) [Variable('x')]
- ?vp(?np) []
- see(john,mary) []
- exists x.walk(x) []
- \x.see(john,x) []
- (\x.see(john,x))(mary) []
- P(x) [Variable('P'), Variable('x')]
- \P.P(x) [Variable('x')]
- aa(x,bb(y),cc(z),P(w),u) [Variable('P'), Variable('u'), Variable('w'), Variable('x'), Variable('y'), Variable('z')]
- bo(?det(?n),@x) []
-
- >>> for e in examples:
- ... print('%-25s' % e, sorted(e.constants()))
- walk(john) [Variable('john')]
- walk(x) []
- ?vp(?np) [Variable('?np')]
- see(john,mary) [Variable('john'), Variable('mary')]
- exists x.walk(x) []
- \x.see(john,x) [Variable('john')]
- (\x.see(john,x))(mary) [Variable('john'), Variable('mary')]
- P(x) []
- \P.P(x) []
- aa(x,bb(y),cc(z),P(w),u) []
- bo(?det(?n),@x) [Variable('?n'), Variable('@x')]
-
- >>> for e in examples:
- ... print('%-25s' % e, sorted(e.predicates()))
- walk(john) [Variable('walk')]
- walk(x) [Variable('walk')]
- ?vp(?np) [Variable('?vp')]
- see(john,mary) [Variable('see')]
- exists x.walk(x) [Variable('walk')]
- \x.see(john,x) [Variable('see')]
- (\x.see(john,x))(mary) [Variable('see')]
- P(x) []
- \P.P(x) []
- aa(x,bb(y),cc(z),P(w),u) [Variable('aa'), Variable('bb'), Variable('cc')]
- bo(?det(?n),@x) [Variable('?det'), Variable('bo')]
-
- >>> for e in examples:
- ... print('%-25s' % e, sorted(e.variables()))
- walk(john) []
- walk(x) [Variable('x')]
- ?vp(?np) [Variable('?np'), Variable('?vp')]
- see(john,mary) []
- exists x.walk(x) []
- \x.see(john,x) []
- (\x.see(john,x))(mary) []
- P(x) [Variable('P'), Variable('x')]
- \P.P(x) [Variable('x')]
- aa(x,bb(y),cc(z),P(w),u) [Variable('P'), Variable('u'), Variable('w'), Variable('x'), Variable('y'), Variable('z')]
- bo(?det(?n),@x) [Variable('?det'), Variable('?n'), Variable('@x')]
-
-
-
-normalize
-=========
-
- >>> print(read_expr(r'\e083.(walk(e083, z472) & talk(e092, z938))').normalize())
- \e01.(walk(e01,z3) & talk(e02,z4))
-
-Typed Logic
-+++++++++++
-
- >>> from nltk.sem.logic import LogicParser
- >>> tlp = LogicParser(True)
- >>> print(tlp.parse(r'man(x)').type)
- ?
- >>> print(tlp.parse(r'walk(angus)').type)
- ?
- >>> print(tlp.parse(r'-man(x)').type)
- t
- >>> print(tlp.parse(r'(man(x) <-> tall(x))').type)
- t
- >>> print(tlp.parse(r'exists x.(man(x) & tall(x))').type)
- t
- >>> print(tlp.parse(r'\x.man(x)').type)
- <e,?>
- >>> print(tlp.parse(r'john').type)
- e
- >>> print(tlp.parse(r'\x y.sees(x,y)').type)
- <e,<e,?>>
- >>> print(tlp.parse(r'\x.man(x)(john)').type)
- ?
- >>> print(tlp.parse(r'\x.\y.sees(x,y)(john)').type)
- <e,?>
- >>> print(tlp.parse(r'\x.\y.sees(x,y)(john)(mary)').type)
- ?
- >>> print(tlp.parse(r'\P.\Q.exists x.(P(x) & Q(x))').type)
- <<e,t>,<<e,t>,t>>
- >>> print(tlp.parse(r'\x.y').type)
- <?,e>
- >>> print(tlp.parse(r'\P.P(x)').type)
- <<e,?>,?>
-
- >>> parsed = tlp.parse('see(john,mary)')
- >>> print(parsed.type)
- ?
- >>> print(parsed.function)
- see(john)
- >>> print(parsed.function.type)
- <e,?>
- >>> print(parsed.function.function)
- see
- >>> print(parsed.function.function.type)
- <e,<e,?>>
-
- >>> parsed = tlp.parse('P(x,y)')
- >>> print(parsed)
- P(x,y)
- >>> print(parsed.type)
- ?
- >>> print(parsed.function)
- P(x)
- >>> print(parsed.function.type)
- <e,?>
- >>> print(parsed.function.function)
- P
- >>> print(parsed.function.function.type)
- <e,<e,?>>
-
- >>> print(tlp.parse(r'P').type)
- ?
-
- >>> print(tlp.parse(r'P', {'P': 't'}).type)
- t
-
- >>> a = tlp.parse(r'P(x)')
- >>> print(a.type)
- ?
- >>> print(a.function.type)
- <e,?>
- >>> print(a.argument.type)
- e
-
- >>> a = tlp.parse(r'-P(x)')
- >>> print(a.type)
- t
- >>> print(a.term.type)
- t
- >>> print(a.term.function.type)
- <e,t>
- >>> print(a.term.argument.type)
- e
-
- >>> a = tlp.parse(r'P & Q')
- >>> print(a.type)
- t
- >>> print(a.first.type)
- t
- >>> print(a.second.type)
- t
-
- >>> a = tlp.parse(r'(P(x) & Q(x))')
- >>> print(a.type)
- t
- >>> print(a.first.type)
- t
- >>> print(a.first.function.type)
- <e,t>
- >>> print(a.first.argument.type)
- e
- >>> print(a.second.type)
- t
- >>> print(a.second.function.type)
- <e,t>
- >>> print(a.second.argument.type)
- e
-
- >>> a = tlp.parse(r'\x.P(x)')
- >>> print(a.type)
- <e,?>
- >>> print(a.term.function.type)
- <e,?>
- >>> print(a.term.argument.type)
- e
-
- >>> a = tlp.parse(r'\P.P(x)')
- >>> print(a.type)
- <<e,?>,?>
- >>> print(a.term.function.type)
- <e,?>
- >>> print(a.term.argument.type)
- e
-
- >>> a = tlp.parse(r'(\x.P(x)(john)) & Q(x)')
- >>> print(a.type)
- t
- >>> print(a.first.type)
- t
- >>> print(a.first.function.type)
- <e,t>
- >>> print(a.first.function.term.function.type)
- <e,t>
- >>> print(a.first.function.term.argument.type)
- e
- >>> print(a.first.argument.type)
- e
-
- >>> a = tlp.parse(r'\x y.P(x,y)(john)(mary) & Q(x)')
- >>> print(a.type)
- t
- >>> print(a.first.type)
- t
- >>> print(a.first.function.type)
- <e,t>
- >>> print(a.first.function.function.type)
- <e,<e,t>>
-
- >>> a = tlp.parse(r'--P')
- >>> print(a.type)
- t
- >>> print(a.term.type)
- t
- >>> print(a.term.term.type)
- t
-
- >>> tlp.parse(r'\x y.P(x,y)').type
- <e,<e,?>>
- >>> tlp.parse(r'\x y.P(x,y)', {'P': '<e,<e,t>>'}).type
- <e,<e,t>>
-
- >>> a = tlp.parse(r'\P y.P(john,y)(\x y.see(x,y))')
- >>> a.type
- <e,?>
- >>> a.function.type
- <<e,<e,?>>,<e,?>>
- >>> a.function.term.term.function.function.type
- <e,<e,?>>
- >>> a.argument.type
- <e,<e,?>>
-
- >>> a = tlp.parse(r'exists c f.(father(c) = f)')
- >>> a.type
- t
- >>> a.term.term.type
- t
- >>> a.term.term.first.type
- e
- >>> a.term.term.first.function.type
- <e,e>
- >>> a.term.term.second.type
- e
-
-typecheck()
-
- >>> a = tlp.parse('P(x)')
- >>> b = tlp.parse('Q(x)')
- >>> a.type
- ?
- >>> c = a & b
- >>> c.first.type
- ?
- >>> c.typecheck() # doctest: +ELLIPSIS
- {...}
- >>> c.first.type
- t
-
- >>> a = tlp.parse('P(x)')
- >>> b = tlp.parse('P(x) & Q(x)')
- >>> a.type
- ?
- >>> typecheck([a,b]) # doctest: +ELLIPSIS
- {...}
- >>> a.type
- t
-
- >>> e = tlp.parse(r'man(x)')
- >>> print(dict((k,str(v)) for k,v in e.typecheck().items()) == {'x': 'e', 'man': '<e,?>'})
- True
- >>> sig = {'man': '<e, t>'}
- >>> e = tlp.parse(r'man(x)', sig)
- >>> print(e.function.type)
- <e,t>
- >>> print(dict((k,str(v)) for k,v in e.typecheck().items()) == {'x': 'e', 'man': '<e,t>'})
- True
- >>> print(e.function.type)
- <e,t>
- >>> print(dict((k,str(v)) for k,v in e.typecheck(sig).items()) == {'x': 'e', 'man': '<e,t>'})
- True
-
-findtype()
-
- >>> print(tlp.parse(r'man(x)').findtype(Variable('man')))
- <e,?>
- >>> print(tlp.parse(r'see(x,y)').findtype(Variable('see')))
- <e,<e,?>>
- >>> print(tlp.parse(r'P(Q(R(x)))').findtype(Variable('Q')))
- ?
-
-reading types from strings
-
- >>> Type.fromstring('e')
- e
- >>> Type.fromstring('<e,t>')
- <e,t>
- >>> Type.fromstring('<<e,t>,<e,t>>')
- <<e,t>,<e,t>>
- >>> Type.fromstring('<<e,?>,?>')
- <<e,?>,?>
-
-alternative type format
-
- >>> Type.fromstring('e').str()
- 'IND'
- >>> Type.fromstring('<e,?>').str()
- '(IND -> ANY)'
- >>> Type.fromstring('<<e,t>,t>').str()
- '((IND -> BOOL) -> BOOL)'
-
-Type.__eq__()
-
- >>> from nltk.sem.logic import *
-
- >>> e = ENTITY_TYPE
- >>> t = TRUTH_TYPE
- >>> a = ANY_TYPE
- >>> et = ComplexType(e,t)
- >>> eet = ComplexType(e,ComplexType(e,t))
- >>> at = ComplexType(a,t)
- >>> ea = ComplexType(e,a)
- >>> aa = ComplexType(a,a)
-
- >>> e == e
- True
- >>> t == t
- True
- >>> e == t
- False
- >>> a == t
- False
- >>> t == a
- False
- >>> a == a
- True
- >>> et == et
- True
- >>> a == et
- False
- >>> et == a
- False
- >>> a == ComplexType(a,aa)
- True
- >>> ComplexType(a,aa) == a
- True
-
-matches()
-
- >>> e.matches(t)
- False
- >>> a.matches(t)
- True
- >>> t.matches(a)
- True
- >>> a.matches(et)
- True
- >>> et.matches(a)
- True
- >>> ea.matches(eet)
- True
- >>> eet.matches(ea)
- True
- >>> aa.matches(et)
- True
- >>> aa.matches(t)
- True
-
-Type error during parsing
-=========================
-
- >>> try: print(tlp.parse(r'exists x y.(P(x) & P(x,y))'))
- ... except InconsistentTypeHierarchyException as e: print(e)
- The variable 'P' was found in multiple places with different types.
- >>> try: tlp.parse(r'\x y.see(x,y)(\x.man(x))')
- ... except TypeException as e: print(e)
- The function '\x y.see(x,y)' is of type '<e,<e,?>>' and cannot be applied to '\x.man(x)' of type '<e,?>'. Its argument must match type 'e'.
- >>> try: tlp.parse(r'\P x y.-P(x,y)(\x.-man(x))')
- ... except TypeException as e: print(e)
- The function '\P x y.-P(x,y)' is of type '<<e,<e,t>>,<e,<e,t>>>' and cannot be applied to '\x.-man(x)' of type '<e,t>'. Its argument must match type '<e,<e,t>>'.
-
- >>> a = tlp.parse(r'-talk(x)')
- >>> signature = a.typecheck()
- >>> try: print(tlp.parse(r'-talk(x,y)', signature))
- ... except InconsistentTypeHierarchyException as e: print(e)
- The variable 'talk' was found in multiple places with different types.
-
- >>> a = tlp.parse(r'-P(x)')
- >>> b = tlp.parse(r'-P(x,y)')
- >>> a.typecheck() # doctest: +ELLIPSIS
- {...}
- >>> b.typecheck() # doctest: +ELLIPSIS
- {...}
- >>> try: typecheck([a,b])
- ... except InconsistentTypeHierarchyException as e: print(e)
- The variable 'P' was found in multiple places with different types.
-
- >>> a = tlp.parse(r'P(x)')
- >>> b = tlp.parse(r'P(x,y)')
- >>> signature = {'P': '<e,t>'}
- >>> a.typecheck(signature) # doctest: +ELLIPSIS
- {...}
- >>> try: typecheck([a,b], signature)
- ... except InconsistentTypeHierarchyException as e: print(e)
- The variable 'P' was found in multiple places with different types.
-
-Parse errors
-============
-
- >>> try: read_expr(r'')
- ... except LogicalExpressionException as e: print(e)
- End of input found. Expression expected.
- <BLANKLINE>
- ^
- >>> try: read_expr(r'(')
- ... except LogicalExpressionException as e: print(e)
- End of input found. Expression expected.
- (
- ^
- >>> try: read_expr(r')')
- ... except LogicalExpressionException as e: print(e)
- Unexpected token: ')'. Expression expected.
- )
- ^
- >>> try: read_expr(r'()')
- ... except LogicalExpressionException as e: print(e)
- Unexpected token: ')'. Expression expected.
- ()
- ^
- >>> try: read_expr(r'(P(x) & Q(x)')
- ... except LogicalExpressionException as e: print(e)
- End of input found. Expected token ')'.
- (P(x) & Q(x)
- ^
- >>> try: read_expr(r'(P(x) &')
- ... except LogicalExpressionException as e: print(e)
- End of input found. Expression expected.
- (P(x) &
- ^
- >>> try: read_expr(r'(P(x) | )')
- ... except LogicalExpressionException as e: print(e)
- Unexpected token: ')'. Expression expected.
- (P(x) | )
- ^
- >>> try: read_expr(r'P(x) ->')
- ... except LogicalExpressionException as e: print(e)
- End of input found. Expression expected.
- P(x) ->
- ^
- >>> try: read_expr(r'P(x')
- ... except LogicalExpressionException as e: print(e)
- End of input found. Expected token ')'.
- P(x
- ^
- >>> try: read_expr(r'P(x,')
- ... except LogicalExpressionException as e: print(e)
- End of input found. Expression expected.
- P(x,
- ^
- >>> try: read_expr(r'P(x,)')
- ... except LogicalExpressionException as e: print(e)
- Unexpected token: ')'. Expression expected.
- P(x,)
- ^
- >>> try: read_expr(r'exists')
- ... except LogicalExpressionException as e: print(e)
- End of input found. Variable and Expression expected following quantifier 'exists'.
- exists
- ^
- >>> try: read_expr(r'exists x')
- ... except LogicalExpressionException as e: print(e)
- End of input found. Expression expected.
- exists x
- ^
- >>> try: read_expr(r'exists x.')
- ... except LogicalExpressionException as e: print(e)
- End of input found. Expression expected.
- exists x.
- ^
- >>> try: read_expr(r'\ ')
- ... except LogicalExpressionException as e: print(e)
- End of input found. Variable and Expression expected following lambda operator.
- \
- ^
- >>> try: read_expr(r'\ x')
- ... except LogicalExpressionException as e: print(e)
- End of input found. Expression expected.
- \ x
- ^
- >>> try: read_expr(r'\ x y')
- ... except LogicalExpressionException as e: print(e)
- End of input found. Expression expected.
- \ x y
- ^
- >>> try: read_expr(r'\ x.')
- ... except LogicalExpressionException as e: print(e)
- End of input found. Expression expected.
- \ x.
- ^
- >>> try: read_expr(r'P(x)Q(x)')
- ... except LogicalExpressionException as e: print(e)
- Unexpected token: 'Q'.
- P(x)Q(x)
- ^
- >>> try: read_expr(r'(P(x)Q(x)')
- ... except LogicalExpressionException as e: print(e)
- Unexpected token: 'Q'. Expected token ')'.
- (P(x)Q(x)
- ^
- >>> try: read_expr(r'exists x y')
- ... except LogicalExpressionException as e: print(e)
- End of input found. Expression expected.
- exists x y
- ^
- >>> try: read_expr(r'exists x y.')
- ... except LogicalExpressionException as e: print(e)
- End of input found. Expression expected.
- exists x y.
- ^
- >>> try: read_expr(r'exists x -> y')
- ... except LogicalExpressionException as e: print(e)
- Unexpected token: '->'. Expression expected.
- exists x -> y
- ^
-
-
- >>> try: read_expr(r'A -> ((P(x) & Q(x)) -> Z')
- ... except LogicalExpressionException as e: print(e)
- End of input found. Expected token ')'.
- A -> ((P(x) & Q(x)) -> Z
- ^
- >>> try: read_expr(r'A -> ((P(x) &) -> Z')
- ... except LogicalExpressionException as e: print(e)
- Unexpected token: ')'. Expression expected.
- A -> ((P(x) &) -> Z
- ^
- >>> try: read_expr(r'A -> ((P(x) | )) -> Z')
- ... except LogicalExpressionException as e: print(e)
- Unexpected token: ')'. Expression expected.
- A -> ((P(x) | )) -> Z
- ^
- >>> try: read_expr(r'A -> (P(x) ->) -> Z')
- ... except LogicalExpressionException as e: print(e)
- Unexpected token: ')'. Expression expected.
- A -> (P(x) ->) -> Z
- ^
- >>> try: read_expr(r'A -> (P(x) -> Z')
- ... except LogicalExpressionException as e: print(e)
- End of input found. Expected token ')'.
- A -> (P(x) -> Z
- ^
- >>> try: read_expr(r'A -> (P(x,) -> Z')
- ... except LogicalExpressionException as e: print(e)
- Unexpected token: ')'. Expression expected.
- A -> (P(x,) -> Z
- ^
- >>> try: read_expr(r'A -> (P(x,)) -> Z')
- ... except LogicalExpressionException as e: print(e)
- Unexpected token: ')'. Expression expected.
- A -> (P(x,)) -> Z
- ^
- >>> try: read_expr(r'A -> (exists) -> Z')
- ... except LogicalExpressionException as e: print(e)
- ')' is an illegal variable name. Constants may not be quantified.
- A -> (exists) -> Z
- ^
- >>> try: read_expr(r'A -> (exists x) -> Z')
- ... except LogicalExpressionException as e: print(e)
- Unexpected token: ')'. Expression expected.
- A -> (exists x) -> Z
- ^
- >>> try: read_expr(r'A -> (exists x.) -> Z')
- ... except LogicalExpressionException as e: print(e)
- Unexpected token: ')'. Expression expected.
- A -> (exists x.) -> Z
- ^
- >>> try: read_expr(r'A -> (\ ) -> Z')
- ... except LogicalExpressionException as e: print(e)
- ')' is an illegal variable name. Constants may not be abstracted.
- A -> (\ ) -> Z
- ^
- >>> try: read_expr(r'A -> (\ x) -> Z')
- ... except LogicalExpressionException as e: print(e)
- Unexpected token: ')'. Expression expected.
- A -> (\ x) -> Z
- ^
- >>> try: read_expr(r'A -> (\ x y) -> Z')
- ... except LogicalExpressionException as e: print(e)
- Unexpected token: ')'. Expression expected.
- A -> (\ x y) -> Z
- ^
- >>> try: read_expr(r'A -> (\ x.) -> Z')
- ... except LogicalExpressionException as e: print(e)
- Unexpected token: ')'. Expression expected.
- A -> (\ x.) -> Z
- ^
- >>> try: read_expr(r'A -> (P(x)Q(x)) -> Z')
- ... except LogicalExpressionException as e: print(e)
- Unexpected token: 'Q'. Expected token ')'.
- A -> (P(x)Q(x)) -> Z
- ^
- >>> try: read_expr(r'A -> ((P(x)Q(x)) -> Z')
- ... except LogicalExpressionException as e: print(e)
- Unexpected token: 'Q'. Expected token ')'.
- A -> ((P(x)Q(x)) -> Z
- ^
- >>> try: read_expr(r'A -> (all x y) -> Z')
- ... except LogicalExpressionException as e: print(e)
- Unexpected token: ')'. Expression expected.
- A -> (all x y) -> Z
- ^
- >>> try: read_expr(r'A -> (exists x y.) -> Z')
- ... except LogicalExpressionException as e: print(e)
- Unexpected token: ')'. Expression expected.
- A -> (exists x y.) -> Z
- ^
- >>> try: read_expr(r'A -> (exists x -> y) -> Z')
- ... except LogicalExpressionException as e: print(e)
- Unexpected token: '->'. Expression expected.
- A -> (exists x -> y) -> Z
- ^
-
-
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-.. -*- coding: utf-8 -*-
-
-=============
-METEOR tests
-=============
-
-No Alignment test
------------------
-
- >>> from nltk.translate import meteor
-
-If the candidate has no alignment to any of the references, the METEOR score is 0.
-
- >>> round(meteor(
- ... ['The candidate has no alignment to any of the references'],
- ... 'John loves Mary'
- ... ),4)
- 0.0
-
-Tests based on Wikipedia examples
----------------------------------
-
-Testing on `Wikipedia examples <https://en.wikipedia.org/wiki/METEOR#Examples>`_
-
- >>> same_res = round(meteor(
- ... ['The cat sat on the mat'],
- ... 'The cat sat on the mat'
- ... ),4)
- >>> abs(same_res - 0.9977) < 1e-2
- True
-
- >>> meteor(
- ... ['The cat sat on the mat'],
- ... 'on the mat sat the cat'
- ... )
- 0.5
-
- >>> round(meteor(
- ... ['The cat sat on the mat'],
- ... 'The cat was sat on the mat'
- ... ),4)
- 0.9654
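-
-With multiple references, the score is taken as the best match against any
-single reference, so adding a second, worse reference should not change a
-perfect match (a sanity check, assuming max-over-references semantics as in
-`nltk.translate.meteor_score`):
-
- >>> single = meteor(['The cat sat on the mat'], 'The cat sat on the mat')
- >>> multi = meteor(['John loves Mary', 'The cat sat on the mat'],
- ...                'The cat sat on the mat')
- >>> multi == single
- True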
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-=======
-Metrics
-=======
-
-The `nltk.metrics` package provides a variety of *evaluation measures*
-which can be used for a wide variety of NLP tasks.
-
- >>> from nltk.metrics import *
-
-------------------
-Standard IR Scores
-------------------
-
-We can use standard scores from information retrieval to test the
-performance of taggers, chunkers, etc.
-
- >>> reference = 'DET NN VB DET JJ NN NN IN DET NN'.split()
- >>> test = 'DET VB VB DET NN NN NN IN DET NN'.split()
- >>> print(accuracy(reference, test))
- 0.8
-
-
-The following measures apply to sets:
-
- >>> reference_set = set(reference)
- >>> test_set = set(test)
- >>> precision(reference_set, test_set)
- 1.0
- >>> print(recall(reference_set, test_set))
- 0.8
- >>> print(f_measure(reference_set, test_set))
- 0.88888888888...
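-
-The default F-measure is the harmonic mean of precision and recall; a quick
-hand check under that standard definition:
-
- >>> p, r = precision(reference_set, test_set), recall(reference_set, test_set)
- >>> abs(f_measure(reference_set, test_set) - 2 * p * r / (p + r)) < 1e-9
- True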
-
-Measuring the likelihood of the data, given probability distributions:
-
- >>> from nltk import FreqDist, MLEProbDist
- >>> pdist1 = MLEProbDist(FreqDist("aldjfalskfjaldsf"))
- >>> pdist2 = MLEProbDist(FreqDist("aldjfalssjjlldss"))
- >>> print(log_likelihood(['a', 'd'], [pdist1, pdist2]))
- -2.7075187496...
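-
-This agrees with averaging the base-2 log probability of each reference value
-under its paired distribution (a hand check, assuming that definition):
-
- >>> from math import log2
- >>> expected = (log2(pdist1.prob('a')) + log2(pdist2.prob('d'))) / 2
- >>> abs(log_likelihood(['a', 'd'], [pdist1, pdist2]) - expected) < 1e-9
- True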
-
-
-----------------
-Distance Metrics
-----------------
-
-String edit distance (Levenshtein):
-
- >>> edit_distance("rain", "shine")
- 3
- >>> edit_distance_align("shine", "shine")
- [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)]
- >>> edit_distance_align("rain", "brainy")
- [(0, 0), (1, 1), (1, 2), (2, 3), (3, 4), (4, 5), (4, 6)]
- >>> edit_distance_align("", "brainy")
- [(0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6)]
- >>> edit_distance_align("", "")
- [(0, 0)]
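-
-For reference, a minimal dynamic-programming sketch of the same Levenshtein
-recurrence (an illustration only, not the NLTK implementation):
-
- >>> def lev(a, b):
- ...     prev = list(range(len(b) + 1))          # distances from "" to b[:j]
- ...     for i, ca in enumerate(a, 1):
- ...         cur = [i]                           # distance from a[:i] to ""
- ...         for j, cb in enumerate(b, 1):
- ...             cur.append(min(prev[j] + 1,     # deletion
- ...                            cur[j - 1] + 1,  # insertion
- ...                            prev[j - 1] + (ca != cb)))  # substitution
- ...         prev = cur
- ...     return prev[-1]
- >>> lev("rain", "shine")
- 3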
-
-Other distance measures:
-
- >>> s1 = set([1,2,3,4])
- >>> s2 = set([3,4,5])
- >>> binary_distance(s1, s2)
- 1.0
- >>> print(jaccard_distance(s1, s2))
- 0.6
- >>> print(masi_distance(s1, s2))
- 0.868
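-
-The Jaccard distance is one minus the ratio of the intersection size to the
-union size, which is easy to confirm by hand:
-
- >>> 1 - len(s1 & s2) / len(s1 | s2)
- 0.6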
-
-----------------------
-Miscellaneous Measures
-----------------------
-
-Rank Correlation works with two dictionaries mapping keys to ranks.
-The dictionaries should have the same set of keys.
-
- >>> spearman_correlation({'e':1, 't':2, 'a':3}, {'e':1, 'a':2, 't':3})
- 0.5
-
-Windowdiff uses a sliding window to compare two segmentations of the same
-input (e.g. tokenizations, chunkings).
-Segmentations are represented using strings of zeros and ones.
-
- >>> s1 = "000100000010"
- >>> s2 = "000010000100"
- >>> s3 = "100000010000"
- >>> s4 = "000000000000"
- >>> s5 = "111111111111"
- >>> windowdiff(s1, s1, 3)
- 0.0
- >>> abs(windowdiff(s1, s2, 3) - 0.3) < 1e-6 # windowdiff(s1, s2, 3) == 0.3
- True
- >>> abs(windowdiff(s2, s3, 3) - 0.8) < 1e-6 # windowdiff(s2, s3, 3) == 0.8
- True
- >>> windowdiff(s1, s4, 3)
- 0.5
- >>> windowdiff(s1, s5, 3)
- 1.0
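-
-Under the standard definition, windowdiff is the fraction of length-k windows
-over which the two segmentations contain a different number of boundaries.
-A minimal sketch of that computation (an illustration, not the NLTK code):
-
- >>> def naive_windowdiff(seg1, seg2, k):
- ...     n = len(seg1) - k + 1
- ...     return sum(seg1[i:i + k].count('1') != seg2[i:i + k].count('1')
- ...                for i in range(n)) / n
- >>> naive_windowdiff(s1, s4, 3)
- 0.5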
-
-----------------
-Confusion Matrix
-----------------
-
- >>> reference = 'This is the reference data. Testing 123. aoaeoeoe'
- >>> test = 'Thos iz_the rifirenci data. Testeng 123. aoaeoeoe'
- >>> print(ConfusionMatrix(reference, test))
- | . 1 2 3 T _ a c d e f g h i n o r s t z |
- --+-------------------------------------------+
- |<8>. . . . . 1 . . . . . . . . . . . . . . |
- . | .<2>. . . . . . . . . . . . . . . . . . . |
- 1 | . .<1>. . . . . . . . . . . . . . . . . . |
- 2 | . . .<1>. . . . . . . . . . . . . . . . . |
- 3 | . . . .<1>. . . . . . . . . . . . . . . . |
- T | . . . . .<2>. . . . . . . . . . . . . . . |
- _ | . . . . . .<.>. . . . . . . . . . . . . . |
- a | . . . . . . .<4>. . . . . . . . . . . . . |
- c | . . . . . . . .<1>. . . . . . . . . . . . |
- d | . . . . . . . . .<1>. . . . . . . . . . . |
- e | . . . . . . . . . .<6>. . . 3 . . . . . . |
- f | . . . . . . . . . . .<1>. . . . . . . . . |
- g | . . . . . . . . . . . .<1>. . . . . . . . |
- h | . . . . . . . . . . . . .<2>. . . . . . . |
- i | . . . . . . . . . . 1 . . .<1>. 1 . . . . |
- n | . . . . . . . . . . . . . . .<2>. . . . . |
- o | . . . . . . . . . . . . . . . .<3>. . . . |
- r | . . . . . . . . . . . . . . . . .<2>. . . |
- s | . . . . . . . . . . . . . . . . . .<2>. 1 |
- t | . . . . . . . . . . . . . . . . . . .<3>. |
- z | . . . . . . . . . . . . . . . . . . . .<.>|
- --+-------------------------------------------+
- (row = reference; col = test)
- <BLANKLINE>
-
- >>> cm = ConfusionMatrix(reference, test)
- >>> print(cm.pretty_format(sort_by_count=True))
- | e a i o s t . T h n r 1 2 3 c d f g _ z |
- --+-------------------------------------------+
- |<8>. . . . . . . . . . . . . . . . . . 1 . |
- e | .<6>. 3 . . . . . . . . . . . . . . . . . |
- a | . .<4>. . . . . . . . . . . . . . . . . . |
- i | . 1 .<1>1 . . . . . . . . . . . . . . . . |
- o | . . . .<3>. . . . . . . . . . . . . . . . |
- s | . . . . .<2>. . . . . . . . . . . . . . 1 |
- t | . . . . . .<3>. . . . . . . . . . . . . . |
- . | . . . . . . .<2>. . . . . . . . . . . . . |
- T | . . . . . . . .<2>. . . . . . . . . . . . |
- h | . . . . . . . . .<2>. . . . . . . . . . . |
- n | . . . . . . . . . .<2>. . . . . . . . . . |
- r | . . . . . . . . . . .<2>. . . . . . . . . |
- 1 | . . . . . . . . . . . .<1>. . . . . . . . |
- 2 | . . . . . . . . . . . . .<1>. . . . . . . |
- 3 | . . . . . . . . . . . . . .<1>. . . . . . |
- c | . . . . . . . . . . . . . . .<1>. . . . . |
- d | . . . . . . . . . . . . . . . .<1>. . . . |
- f | . . . . . . . . . . . . . . . . .<1>. . . |
- g | . . . . . . . . . . . . . . . . . .<1>. . |
- _ | . . . . . . . . . . . . . . . . . . .<.>. |
- z | . . . . . . . . . . . . . . . . . . . .<.>|
- --+-------------------------------------------+
- (row = reference; col = test)
- <BLANKLINE>
-
- >>> print(cm.pretty_format(sort_by_count=True, truncate=10))
- | e a i o s t . T h |
- --+---------------------+
- |<8>. . . . . . . . . |
- e | .<6>. 3 . . . . . . |
- a | . .<4>. . . . . . . |
- i | . 1 .<1>1 . . . . . |
- o | . . . .<3>. . . . . |
- s | . . . . .<2>. . . . |
- t | . . . . . .<3>. . . |
- . | . . . . . . .<2>. . |
- T | . . . . . . . .<2>. |
- h | . . . . . . . . .<2>|
- --+---------------------+
- (row = reference; col = test)
- <BLANKLINE>
-
- >>> print(cm.pretty_format(sort_by_count=True, truncate=10, values_in_chart=False))
- | 1 |
- | 1 2 3 4 5 6 7 8 9 0 |
- ---+---------------------+
- 1 |<8>. . . . . . . . . |
- 2 | .<6>. 3 . . . . . . |
- 3 | . .<4>. . . . . . . |
- 4 | . 1 .<1>1 . . . . . |
- 5 | . . . .<3>. . . . . |
- 6 | . . . . .<2>. . . . |
- 7 | . . . . . .<3>. . . |
- 8 | . . . . . . .<2>. . |
- 9 | . . . . . . . .<2>. |
- 10 | . . . . . . . . .<2>|
- ---+---------------------+
- (row = reference; col = test)
- Value key:
- 1:
- 2: e
- 3: a
- 4: i
- 5: o
- 6: s
- 7: t
- 8: .
- 9: T
- 10: h
- <BLANKLINE>
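-
-Individual cells can be read off by indexing the matrix with a
-(reference, test) pair; here the reference character 'e' was transcribed
-as 'i' three times:
-
- >>> cm['e', 'i']
- 3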
-
-
---------------------
-Association measures
---------------------
-
-These measures are useful for determining whether the co-occurrence of two
-random events is meaningful. They are used, for instance, to distinguish
-collocations from other pairs of adjacent words.
-
-The following examples of bigram association calculations are drawn from
-Manning and Schütze's SNLP, 2nd Ed., chapter 5.
-
- >>> n_new_companies, n_new, n_companies, N = 8, 15828, 4675, 14307668
- >>> bam = BigramAssocMeasures
- >>> bam.raw_freq(20, (42, 20), N) == 20. / N
- True
- >>> bam.student_t(n_new_companies, (n_new, n_companies), N)
- 0.999...
- >>> bam.chi_sq(n_new_companies, (n_new, n_companies), N)
- 1.54...
- >>> bam.likelihood_ratio(150, (12593, 932), N)
- 1291...
-
-For the other association measures, we check that the scores preserve the
-expected ordering:
-
- >>> bam.mi_like(20, (42, 20), N) > bam.mi_like(20, (41, 27), N)
- True
- >>> bam.pmi(20, (42, 20), N) > bam.pmi(20, (41, 27), N)
- True
- >>> bam.phi_sq(20, (42, 20), N) > bam.phi_sq(20, (41, 27), N)
- True
- >>> bam.poisson_stirling(20, (42, 20), N) > bam.poisson_stirling(20, (41, 27), N)
- True
- >>> bam.jaccard(20, (42, 20), N) > bam.jaccard(20, (41, 27), N)
- True
- >>> bam.dice(20, (42, 20), N) > bam.dice(20, (41, 27), N)
- True
- >>> bam.fisher(20, (42, 20), N) > bam.fisher(20, (41, 27), N) # doctest: +SKIP
- False
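-
-Pointwise mutual information, for instance, is just the base-2 log ratio of
-the observed joint count to its expected value; a hand check under that
-standard definition:
-
- >>> from math import log2
- >>> abs(bam.pmi(20, (42, 20), N) - log2(20 * N / (42 * 20))) < 1e-9
- True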
-
-For trigrams, we have to provide more count information:
-
- >>> n_w1_w2_w3 = 20
- >>> n_w1_w2, n_w1_w3, n_w2_w3 = 35, 60, 40
- >>> pair_counts = (n_w1_w2, n_w1_w3, n_w2_w3)
- >>> n_w1, n_w2, n_w3 = 100, 200, 300
- >>> uni_counts = (n_w1, n_w2, n_w3)
- >>> N = 14307668
- >>> tam = TrigramAssocMeasures
- >>> tam.raw_freq(n_w1_w2_w3, pair_counts, uni_counts, N) == 1. * n_w1_w2_w3 / N
- True
- >>> uni_counts2 = (n_w1, n_w2, 100)
- >>> tam.student_t(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.student_t(n_w1_w2_w3, pair_counts, uni_counts, N)
- True
- >>> tam.chi_sq(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.chi_sq(n_w1_w2_w3, pair_counts, uni_counts, N)
- True
- >>> tam.mi_like(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.mi_like(n_w1_w2_w3, pair_counts, uni_counts, N)
- True
- >>> tam.pmi(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.pmi(n_w1_w2_w3, pair_counts, uni_counts, N)
- True
- >>> tam.likelihood_ratio(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.likelihood_ratio(n_w1_w2_w3, pair_counts, uni_counts, N)
- True
- >>> tam.poisson_stirling(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.poisson_stirling(n_w1_w2_w3, pair_counts, uni_counts, N)
- True
- >>> tam.jaccard(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.jaccard(n_w1_w2_w3, pair_counts, uni_counts, N)
- True
-
-
-For fourgrams, we have to provide more count information:
-
- >>> n_w1_w2_w3_w4 = 5
- >>> n_w1_w2, n_w1_w3, n_w2_w3 = 35, 60, 40
- >>> n_w1_w2_w3, n_w2_w3_w4 = 20, 10
- >>> pair_counts = (n_w1_w2, n_w1_w3, n_w2_w3)
- >>> triplet_counts = (n_w1_w2_w3, n_w2_w3_w4)
- >>> n_w1, n_w2, n_w3, n_w4 = 100, 200, 300, 400
- >>> uni_counts = (n_w1, n_w2, n_w3, n_w4)
- >>> N = 14307668
- >>> qam = QuadgramAssocMeasures
- >>> qam.raw_freq(n_w1_w2_w3_w4, pair_counts, triplet_counts, uni_counts, N) == 1. * n_w1_w2_w3_w4 / N
- True
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
---------------------------------------------------------------------------------
-Unit tests for the miscellaneous sort functions.
---------------------------------------------------------------------------------
-
- >>> from copy import deepcopy
- >>> from nltk.misc.sort import *
-
-A (very) small list of unsorted integers.
-
- >>> test_data = [12, 67, 7, 28, 92, 56, 53, 720, 91, 57, 20, 20]
-
-Test each sorting method - each method returns the number of operations
-required to sort the data, and sorts in-place (destructively - hence the need
-for multiple copies).
-
- >>> sorted_data = deepcopy(test_data)
- >>> selection(sorted_data)
- 66
-
- >>> sorted_data
- [7, 12, 20, 20, 28, 53, 56, 57, 67, 91, 92, 720]
-
- >>> sorted_data = deepcopy(test_data)
- >>> bubble(sorted_data)
- 30
-
- >>> sorted_data
- [7, 12, 20, 20, 28, 53, 56, 57, 67, 91, 92, 720]
-
- >>> sorted_data = deepcopy(test_data)
- >>> merge(sorted_data)
- 30
-
- >>> sorted_data
- [7, 12, 20, 20, 28, 53, 56, 57, 67, 91, 92, 720]
-
- >>> sorted_data = deepcopy(test_data)
- >>> quick(sorted_data)
- 13
-
- >>> sorted_data
- [7, 12, 20, 20, 28, 53, 56, 57, 67, 91, 92, 720]
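-
-As a sanity check, every method should agree with Python's built-in sort:
-
- >>> sorted_data == sorted(test_data)
- True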
-
---------------------------------------------------------------------------------
-Unit tests for Wordfinder class
---------------------------------------------------------------------------------
-
- >>> import random
-
- >>> # The following is not enough for reproducibility under Python 2/3
- >>> # (see http://bugs.python.org/issue9025) so this test is skipped.
- >>> random.seed(12345)
-
- >>> from nltk.misc import wordfinder
- >>> wordfinder.word_finder() # doctest: +SKIP
- Word Finder
- <BLANKLINE>
- J V L A I R O T A T I S I V O D E R E T
- H U U B E A R O E P O C S O R E T N E P
- A D A U Z E E S R A P P A L L M E N T R
- C X A D Q S Z T P E O R S N G P J A D E
- I G Y K K T I A A R G F I D T E L C N S
- R E C N B H T R L T N N B W N T A O A I
- A Y I L O E I A M E I A A Y U R P L L D
- G L T V S T S F E A D I P H D O O H N I
- R L S E C I N I L R N N M E C G R U E A
- A A Y G I C E N L L E O I G Q R T A E L
- M R C E T I S T A E T L L E U A E N R L
- O U O T A S E E C S O O N H Y P A T G Y
- E M H O M M D R E S F P U L T H C F N V
- L A C A I M A M A N L B R U T E D O M I
- O R I L N E E E E E U A R S C R Y L I P
- H T R K E S N N M S I L A S R E V I N U
- T X T A A O U T K S E T A R R E S I B J
- A E D L E L J I F O O R P E L K N I R W
- K H A I D E Q O P R I C K T I M B E R P
- Z K D O O H G N I H T U R V E Y D R O P
- <BLANKLINE>
- 1: INTERCHANGER
- 2: TEARLESSNESS
- 3: UNIVERSALISM
- 4: DESENSITIZER
- 5: INTERMENTION
- 6: TRICHOCYSTIC
- 7: EXTRAMURALLY
- 8: VEGETOALKALI
- 9: PALMELLACEAE
- 10: AESTHETICISM
- 11: PETROGRAPHER
- 12: VISITATORIAL
- 13: OLEOMARGARIC
- 14: WRINKLEPROOF
- 15: PRICKTIMBER
- 16: PRESIDIALLY
- 17: SCITAMINEAE
- 18: ENTEROSCOPE
- 19: APPALLMENT
- 20: TURVEYDROP
- 21: THINGHOOD
- 22: BISERRATE
- 23: GREENLAND
- 24: BRUTEDOM
- 25: POLONIAN
- 26: ACOLHUAN
- 27: LAPORTEA
- 28: TENDING
- 29: TEREDO
- 30: MESOLE
- 31: UNLIMP
- 32: OSTARA
- 33: PILY
- 34: DUNT
- 35: ONYX
- 36: KATH
- 37: JUNE
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-======================
-Nonmonotonic Reasoning
-======================
-
- >>> from nltk import *
- >>> from nltk.inference.nonmonotonic import *
- >>> from nltk.sem import logic
- >>> logic._counter._value = 0
- >>> read_expr = logic.Expression.fromstring
-
-------------------------
-Closed Domain Assumption
-------------------------
-
-The only entities in the domain are those found in the assumptions or goal.
-If the domain only contains "A" and "B", then the expression "exists x.P(x)" can
-be replaced with "P(A) | P(B)" and an expression "all x.P(x)" can be replaced
-with "P(A) & P(B)".
-
- >>> p1 = read_expr(r'all x.(man(x) -> mortal(x))')
- >>> p2 = read_expr(r'man(Socrates)')
- >>> c = read_expr(r'mortal(Socrates)')
- >>> prover = Prover9Command(c, [p1,p2])
- >>> prover.prove()
- True
- >>> cdp = ClosedDomainProver(prover)
- >>> for a in cdp.assumptions(): print(a) # doctest: +SKIP
- (man(Socrates) -> mortal(Socrates))
- man(Socrates)
- >>> cdp.prove()
- True
-
- >>> p1 = read_expr(r'exists x.walk(x)')
- >>> p2 = read_expr(r'man(Socrates)')
- >>> c = read_expr(r'walk(Socrates)')
- >>> prover = Prover9Command(c, [p1,p2])
- >>> prover.prove()
- False
- >>> cdp = ClosedDomainProver(prover)
- >>> for a in cdp.assumptions(): print(a) # doctest: +SKIP
- walk(Socrates)
- man(Socrates)
- >>> cdp.prove()
- True
-
- >>> p1 = read_expr(r'exists x.walk(x)')
- >>> p2 = read_expr(r'man(Socrates)')
- >>> p3 = read_expr(r'-walk(Bill)')
- >>> c = read_expr(r'walk(Socrates)')
- >>> prover = Prover9Command(c, [p1,p2,p3])
- >>> prover.prove()
- False
- >>> cdp = ClosedDomainProver(prover)
- >>> for a in cdp.assumptions(): print(a) # doctest: +SKIP
- (walk(Socrates) | walk(Bill))
- man(Socrates)
- -walk(Bill)
- >>> cdp.prove()
- True
-
- >>> p1 = read_expr(r'walk(Socrates)')
- >>> p2 = read_expr(r'walk(Bill)')
- >>> c = read_expr(r'all x.walk(x)')
- >>> prover = Prover9Command(c, [p1,p2])
- >>> prover.prove()
- False
- >>> cdp = ClosedDomainProver(prover)
- >>> for a in cdp.assumptions(): print(a) # doctest: +SKIP
- walk(Socrates)
- walk(Bill)
- >>> print(cdp.goal()) # doctest: +SKIP
- (walk(Socrates) & walk(Bill))
- >>> cdp.prove()
- True
-
- >>> p1 = read_expr(r'girl(mary)')
- >>> p2 = read_expr(r'dog(rover)')
- >>> p3 = read_expr(r'all x.(girl(x) -> -dog(x))')
- >>> p4 = read_expr(r'all x.(dog(x) -> -girl(x))')
- >>> p5 = read_expr(r'chase(mary, rover)')
- >>> c = read_expr(r'exists y.(dog(y) & all x.(girl(x) -> chase(x,y)))')
- >>> prover = Prover9Command(c, [p1,p2,p3,p4,p5])
- >>> print(prover.prove())
- False
- >>> cdp = ClosedDomainProver(prover)
- >>> for a in cdp.assumptions(): print(a) # doctest: +SKIP
- girl(mary)
- dog(rover)
- ((girl(rover) -> -dog(rover)) & (girl(mary) -> -dog(mary)))
- ((dog(rover) -> -girl(rover)) & (dog(mary) -> -girl(mary)))
- chase(mary,rover)
- >>> print(cdp.goal()) # doctest: +SKIP
- ((dog(rover) & (girl(rover) -> chase(rover,rover)) & (girl(mary) -> chase(mary,rover))) | (dog(mary) & (girl(rover) -> chase(rover,mary)) & (girl(mary) -> chase(mary,mary))))
- >>> print(cdp.prove())
- True
-
------------------------
-Unique Names Assumption
------------------------
-
-No two entities in the domain represent the same entity unless it can be
-explicitly proven that they do. Therefore, if the domain contains "A" and "B",
-then add the assumption "-(A = B)" if it is not the case that
-"<assumptions> \|- (A = B)".
-
- >>> p1 = read_expr(r'man(Socrates)')
- >>> p2 = read_expr(r'man(Bill)')
- >>> c = read_expr(r'exists x.exists y.-(x = y)')
- >>> prover = Prover9Command(c, [p1,p2])
- >>> prover.prove()
- False
- >>> unp = UniqueNamesProver(prover)
- >>> for a in unp.assumptions(): print(a) # doctest: +SKIP
- man(Socrates)
- man(Bill)
- -(Socrates = Bill)
- >>> unp.prove()
- True
-
- >>> p1 = read_expr(r'all x.(walk(x) -> (x = Socrates))')
- >>> p2 = read_expr(r'Bill = William')
- >>> p3 = read_expr(r'Bill = Billy')
- >>> c = read_expr(r'-walk(William)')
- >>> prover = Prover9Command(c, [p1,p2,p3])
- >>> prover.prove()
- False
- >>> unp = UniqueNamesProver(prover)
- >>> for a in unp.assumptions(): print(a) # doctest: +SKIP
- all x.(walk(x) -> (x = Socrates))
- (Bill = William)
- (Bill = Billy)
- -(William = Socrates)
- -(Billy = Socrates)
- -(Socrates = Bill)
- >>> unp.prove()
- True
-
------------------------
-Closed World Assumption
------------------------
-
-The only entities that have certain properties are those that are stated to
-have the properties. We implement this assumption by "completing" predicates.
-
-If the assumptions contain "P(A)", then "all x.(P(x) -> (x=A))" is the completion
-of "P". If the assumptions contain "all x.(ostrich(x) -> bird(x))", then
-"all x.(bird(x) -> ostrich(x))" is the completion of "bird". If the
-assumptions don't contain anything that is "P", then "all x.-P(x)" is the
-completion of "P".
-
- >>> p1 = read_expr(r'walk(Socrates)')
- >>> p2 = read_expr(r'-(Socrates = Bill)')
- >>> c = read_expr(r'-walk(Bill)')
- >>> prover = Prover9Command(c, [p1,p2])
- >>> prover.prove()
- False
- >>> cwp = ClosedWorldProver(prover)
- >>> for a in cwp.assumptions(): print(a) # doctest: +SKIP
- walk(Socrates)
- -(Socrates = Bill)
- all z1.(walk(z1) -> (z1 = Socrates))
- >>> cwp.prove()
- True
-
- >>> p1 = read_expr(r'see(Socrates, John)')
- >>> p2 = read_expr(r'see(John, Mary)')
- >>> p3 = read_expr(r'-(Socrates = John)')
- >>> p4 = read_expr(r'-(John = Mary)')
- >>> c = read_expr(r'-see(Socrates, Mary)')
- >>> prover = Prover9Command(c, [p1,p2,p3,p4])
- >>> prover.prove()
- False
- >>> cwp = ClosedWorldProver(prover)
- >>> for a in cwp.assumptions(): print(a) # doctest: +SKIP
- see(Socrates,John)
- see(John,Mary)
- -(Socrates = John)
- -(John = Mary)
- all z3 z4.(see(z3,z4) -> (((z3 = Socrates) & (z4 = John)) | ((z3 = John) & (z4 = Mary))))
- >>> cwp.prove()
- True
-
- >>> p1 = read_expr(r'all x.(ostrich(x) -> bird(x))')
- >>> p2 = read_expr(r'bird(Tweety)')
- >>> p3 = read_expr(r'-ostrich(Sam)')
- >>> p4 = read_expr(r'Sam != Tweety')
- >>> c = read_expr(r'-bird(Sam)')
- >>> prover = Prover9Command(c, [p1,p2,p3,p4])
- >>> prover.prove()
- False
- >>> cwp = ClosedWorldProver(prover)
- >>> for a in cwp.assumptions(): print(a) # doctest: +SKIP
- all x.(ostrich(x) -> bird(x))
- bird(Tweety)
- -ostrich(Sam)
- -(Sam = Tweety)
- all z7.-ostrich(z7)
- all z8.(bird(z8) -> ((z8 = Tweety) | ostrich(z8)))
- >>> print(cwp.prove())
- True
-
------------------------
-Multi-Decorator Example
------------------------
-
-Decorators can be nested to utilize multiple assumptions.
-
- >>> p1 = read_expr(r'see(Socrates, John)')
- >>> p2 = read_expr(r'see(John, Mary)')
- >>> c = read_expr(r'-see(Socrates, Mary)')
- >>> prover = Prover9Command(c, [p1,p2])
- >>> print(prover.prove())
- False
- >>> cmd = ClosedDomainProver(UniqueNamesProver(ClosedWorldProver(prover)))
- >>> print(cmd.prove())
- True
-
------------------
-Default Reasoning
------------------
- >>> logic._counter._value = 0
- >>> premises = []
-
-define the taxonomy
- >>> premises.append(read_expr(r'all x.(elephant(x) -> animal(x))'))
- >>> premises.append(read_expr(r'all x.(bird(x) -> animal(x))'))
- >>> premises.append(read_expr(r'all x.(dove(x) -> bird(x))'))
- >>> premises.append(read_expr(r'all x.(ostrich(x) -> bird(x))'))
- >>> premises.append(read_expr(r'all x.(flying_ostrich(x) -> ostrich(x))'))
-
-default the properties using abnormalities
- >>> premises.append(read_expr(r'all x.((animal(x) & -Ab1(x)) -> -fly(x))')) #normal animals don't fly
- >>> premises.append(read_expr(r'all x.((bird(x) & -Ab2(x)) -> fly(x))')) #normal birds fly
- >>> premises.append(read_expr(r'all x.((ostrich(x) & -Ab3(x)) -> -fly(x))')) #normal ostriches don't fly
-
-specify abnormal entities
- >>> premises.append(read_expr(r'all x.(bird(x) -> Ab1(x))')) #flight
- >>> premises.append(read_expr(r'all x.(ostrich(x) -> Ab2(x))')) #non-flying bird
- >>> premises.append(read_expr(r'all x.(flying_ostrich(x) -> Ab3(x))')) #flying ostrich
-
-define entities
- >>> premises.append(read_expr(r'elephant(el)'))
- >>> premises.append(read_expr(r'dove(do)'))
- >>> premises.append(read_expr(r'ostrich(os)'))
-
-print the augmented assumptions list
- >>> prover = Prover9Command(None, premises)
- >>> command = UniqueNamesProver(ClosedWorldProver(prover))
- >>> for a in command.assumptions(): print(a) # doctest: +SKIP
- all x.(elephant(x) -> animal(x))
- all x.(bird(x) -> animal(x))
- all x.(dove(x) -> bird(x))
- all x.(ostrich(x) -> bird(x))
- all x.(flying_ostrich(x) -> ostrich(x))
- all x.((animal(x) & -Ab1(x)) -> -fly(x))
- all x.((bird(x) & -Ab2(x)) -> fly(x))
- all x.((ostrich(x) & -Ab3(x)) -> -fly(x))
- all x.(bird(x) -> Ab1(x))
- all x.(ostrich(x) -> Ab2(x))
- all x.(flying_ostrich(x) -> Ab3(x))
- elephant(el)
- dove(do)
- ostrich(os)
- all z1.(animal(z1) -> (elephant(z1) | bird(z1)))
- all z2.(Ab1(z2) -> bird(z2))
- all z3.(bird(z3) -> (dove(z3) | ostrich(z3)))
- all z4.(dove(z4) -> (z4 = do))
- all z5.(Ab2(z5) -> ostrich(z5))
- all z6.(Ab3(z6) -> flying_ostrich(z6))
- all z7.(ostrich(z7) -> ((z7 = os) | flying_ostrich(z7)))
- all z8.-flying_ostrich(z8)
- all z9.(elephant(z9) -> (z9 = el))
- -(el = os)
- -(el = do)
- -(os = do)
-
- >>> UniqueNamesProver(ClosedWorldProver(Prover9Command(read_expr('-fly(el)'), premises))).prove()
- True
- >>> UniqueNamesProver(ClosedWorldProver(Prover9Command(read_expr('fly(do)'), premises))).prove()
- True
- >>> UniqueNamesProver(ClosedWorldProver(Prover9Command(read_expr('-fly(os)'), premises))).prove()
- True
-
+++ /dev/null
-# -*- coding: utf-8 -*-
-
-
-def setup_module(module):
- from nose import SkipTest
- from nltk.inference.mace import Mace
-
- try:
- m = Mace()
- m._find_binary("mace4")
- except LookupError:
- raise SkipTest(
- "Mace4/Prover9 is not available so nonmonotonic.doctest was skipped"
- )
+++ /dev/null
-
-=====================================================
-PAICE's evaluation statistics for stemming algorithms
-=====================================================
-
-Given a list of words with their real lemmas, and their stems according to the
-stemming algorithm under evaluation, this module computes the Understemming
-Index (UI), Overstemming Index (OI), Stemming Weight (SW) and Error-Rate
-Relative to Truncation (ERRT).
-
- >>> from nltk.metrics import Paice
-
-
--------------------------------------
-Understemming and Overstemming values
--------------------------------------
-
- >>> lemmas = {'kneel': ['kneel', 'knelt'],
- ... 'range': ['range', 'ranged'],
- ... 'ring': ['ring', 'rang', 'rung']}
- >>> stems = {'kneel': ['kneel'],
- ... 'knelt': ['knelt'],
- ... 'rang': ['rang', 'range', 'ranged'],
- ... 'ring': ['ring'],
- ... 'rung': ['rung']}
- >>> p = Paice(lemmas, stems)
- >>> p.gumt, p.gdmt, p.gwmt, p.gdnt
- (4.0, 5.0, 2.0, 16.0)
-
- >>> p.ui, p.oi, p.sw
- (0.8..., 0.125..., 0.15625...)
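-
-These indexes follow from the global counts above, assuming the standard
-Paice definitions UI = GUMT/GDMT and OI = GWMT/GDNT:
-
- >>> abs(p.ui - p.gumt / p.gdmt) < 1e-9, abs(p.oi - p.gwmt / p.gdnt) < 1e-9
- (True, True)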
-
- >>> p.errt
- 1.0
-
- >>> [('{0:.3f}'.format(a), '{0:.3f}'.format(b)) for a, b in p.coords]
- [('0.000', '1.000'), ('0.000', '0.375'), ('0.600', '0.125'), ('0.800', '0.125')]
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-=========
- Parsing
-=========
-
-Unit tests for the Context Free Grammar class
----------------------------------------------
-
- >>> from nltk import Nonterminal, nonterminals, Production, CFG
-
- >>> nt1 = Nonterminal('NP')
- >>> nt2 = Nonterminal('VP')
-
- >>> nt1.symbol()
- 'NP'
-
- >>> nt1 == Nonterminal('NP')
- True
-
- >>> nt1 == nt2
- False
-
- >>> S, NP, VP, PP = nonterminals('S, NP, VP, PP')
- >>> N, V, P, DT = nonterminals('N, V, P, DT')
-
- >>> prod1 = Production(S, [NP, VP])
- >>> prod2 = Production(NP, [DT, NP])
-
- >>> prod1.lhs()
- S
-
- >>> prod1.rhs()
- (NP, VP)
-
- >>> prod1 == Production(S, [NP, VP])
- True
-
- >>> prod1 == prod2
- False
-
- >>> grammar = CFG.fromstring("""
- ... S -> NP VP
- ... PP -> P NP
- ... NP -> 'the' N | N PP | 'the' N PP
- ... VP -> V NP | V PP | V NP PP
- ... N -> 'cat'
- ... N -> 'dog'
- ... N -> 'rug'
- ... V -> 'chased'
- ... V -> 'sat'
- ... P -> 'in'
- ... P -> 'on'
- ... """)
-
-Unit tests for the rd (Recursive Descent Parser) class
-------------------------------------------------------
-
-Create and run a recursive descent parser over both a syntactically ambiguous
-and unambiguous sentence.
-
- >>> from nltk.parse import RecursiveDescentParser
- >>> rd = RecursiveDescentParser(grammar)
-
- >>> sentence1 = 'the cat chased the dog'.split()
- >>> sentence2 = 'the cat chased the dog on the rug'.split()
-
- >>> for t in rd.parse(sentence1):
- ... print(t)
- (S (NP the (N cat)) (VP (V chased) (NP the (N dog))))
-
- >>> for t in rd.parse(sentence2):
- ... print(t)
- (S
- (NP the (N cat))
- (VP (V chased) (NP the (N dog) (PP (P on) (NP the (N rug))))))
- (S
- (NP the (N cat))
- (VP (V chased) (NP the (N dog)) (PP (P on) (NP the (N rug)))))
-
-
-Unit tests for the sr (Shift Reduce Parser) class
--------------------------------------------------
-
-Create and run a shift reduce parser over both a syntactically ambiguous
-and unambiguous sentence. Note that, unlike the recursive descent parser,
-at most one parse is ever returned.
-
- >>> from nltk.parse import ShiftReduceParser
- >>> sr = ShiftReduceParser(grammar)
-
- >>> sentence1 = 'the cat chased the dog'.split()
- >>> sentence2 = 'the cat chased the dog on the rug'.split()
-
- >>> for t in sr.parse(sentence1):
- ... print(t)
- (S (NP the (N cat)) (VP (V chased) (NP the (N dog))))
-
-
-The shift reduce parser uses heuristics to decide what to do when there are
-multiple possible shift or reduce operations available - for the supplied
-grammar, the wrong operation is clearly selected, and no parse is found.
-
- >>> for t in sr.parse(sentence2):
- ... print(t)
-
-
-Unit tests for the Chart Parser class
--------------------------------------
-
-We use the demo() function for testing.
-We must turn off the display of times.
-
- >>> import nltk
-
-First we test tracing with a short sentence
-
- >>> nltk.parse.chart.demo(2, print_times=False, trace=1,
- ... sent='I saw a dog', numparses=1)
- * Sentence:
- I saw a dog
- ['I', 'saw', 'a', 'dog']
- <BLANKLINE>
- * Strategy: Bottom-up
- <BLANKLINE>
- |. I . saw . a . dog .|
- |[---------] . . .| [0:1] 'I'
- |. [---------] . .| [1:2] 'saw'
- |. . [---------] .| [2:3] 'a'
- |. . . [---------]| [3:4] 'dog'
- |> . . . .| [0:0] NP -> * 'I'
- |[---------] . . .| [0:1] NP -> 'I' *
- |> . . . .| [0:0] S -> * NP VP
- |> . . . .| [0:0] NP -> * NP PP
- |[---------> . . .| [0:1] S -> NP * VP
- |[---------> . . .| [0:1] NP -> NP * PP
- |. > . . .| [1:1] Verb -> * 'saw'
- |. [---------] . .| [1:2] Verb -> 'saw' *
- |. > . . .| [1:1] VP -> * Verb NP
- |. > . . .| [1:1] VP -> * Verb
- |. [---------> . .| [1:2] VP -> Verb * NP
- |. [---------] . .| [1:2] VP -> Verb *
- |. > . . .| [1:1] VP -> * VP PP
- |[-------------------] . .| [0:2] S -> NP VP *
- |. [---------> . .| [1:2] VP -> VP * PP
- |. . > . .| [2:2] Det -> * 'a'
- |. . [---------] .| [2:3] Det -> 'a' *
- |. . > . .| [2:2] NP -> * Det Noun
- |. . [---------> .| [2:3] NP -> Det * Noun
- |. . . > .| [3:3] Noun -> * 'dog'
- |. . . [---------]| [3:4] Noun -> 'dog' *
- |. . [-------------------]| [2:4] NP -> Det Noun *
- |. . > . .| [2:2] S -> * NP VP
- |. . > . .| [2:2] NP -> * NP PP
- |. [-----------------------------]| [1:4] VP -> Verb NP *
- |. . [------------------->| [2:4] S -> NP * VP
- |. . [------------------->| [2:4] NP -> NP * PP
- |[=======================================]| [0:4] S -> NP VP *
- |. [----------------------------->| [1:4] VP -> VP * PP
- Nr edges in chart: 33
- (S (NP I) (VP (Verb saw) (NP (Det a) (Noun dog))))
- <BLANKLINE>
-
-Then we test the different parsing strategies.
-Note that the number of edges differs between the strategies.
-
-Top-down
-
- >>> nltk.parse.chart.demo(1, print_times=False, trace=0,
- ... sent='I saw John with a dog', numparses=2)
- * Sentence:
- I saw John with a dog
- ['I', 'saw', 'John', 'with', 'a', 'dog']
- <BLANKLINE>
- * Strategy: Top-down
- <BLANKLINE>
- Nr edges in chart: 48
- (S
- (NP I)
- (VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog))))))
- (S
- (NP I)
- (VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog)))))
- <BLANKLINE>
-
-Bottom-up
-
- >>> nltk.parse.chart.demo(2, print_times=False, trace=0,
- ... sent='I saw John with a dog', numparses=2)
- * Sentence:
- I saw John with a dog
- ['I', 'saw', 'John', 'with', 'a', 'dog']
- <BLANKLINE>
- * Strategy: Bottom-up
- <BLANKLINE>
- Nr edges in chart: 53
- (S
- (NP I)
- (VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog)))))
- (S
- (NP I)
- (VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog))))))
- <BLANKLINE>
-
-Bottom-up Left-Corner
-
- >>> nltk.parse.chart.demo(3, print_times=False, trace=0,
- ... sent='I saw John with a dog', numparses=2)
- * Sentence:
- I saw John with a dog
- ['I', 'saw', 'John', 'with', 'a', 'dog']
- <BLANKLINE>
- * Strategy: Bottom-up left-corner
- <BLANKLINE>
- Nr edges in chart: 36
- (S
- (NP I)
- (VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog)))))
- (S
- (NP I)
- (VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog))))))
- <BLANKLINE>
-
-Left-Corner with Bottom-Up Filter
-
- >>> nltk.parse.chart.demo(4, print_times=False, trace=0,
- ... sent='I saw John with a dog', numparses=2)
- * Sentence:
- I saw John with a dog
- ['I', 'saw', 'John', 'with', 'a', 'dog']
- <BLANKLINE>
- * Strategy: Filtered left-corner
- <BLANKLINE>
- Nr edges in chart: 28
- (S
- (NP I)
- (VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog)))))
- (S
- (NP I)
- (VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog))))))
- <BLANKLINE>
-
-The stepping chart parser
-
- >>> nltk.parse.chart.demo(5, print_times=False, trace=1,
- ... sent='I saw John with a dog', numparses=2)
- * Sentence:
- I saw John with a dog
- ['I', 'saw', 'John', 'with', 'a', 'dog']
- <BLANKLINE>
- * Strategy: Stepping (top-down vs bottom-up)
- <BLANKLINE>
- *** SWITCH TO TOP DOWN
- |[------] . . . . .| [0:1] 'I'
- |. [------] . . . .| [1:2] 'saw'
- |. . [------] . . .| [2:3] 'John'
- |. . . [------] . .| [3:4] 'with'
- |. . . . [------] .| [4:5] 'a'
- |. . . . . [------]| [5:6] 'dog'
- |> . . . . . .| [0:0] S -> * NP VP
- |> . . . . . .| [0:0] NP -> * NP PP
- |> . . . . . .| [0:0] NP -> * Det Noun
- |> . . . . . .| [0:0] NP -> * 'I'
- |[------] . . . . .| [0:1] NP -> 'I' *
- |[------> . . . . .| [0:1] S -> NP * VP
- |[------> . . . . .| [0:1] NP -> NP * PP
- |. > . . . . .| [1:1] VP -> * VP PP
- |. > . . . . .| [1:1] VP -> * Verb NP
- |. > . . . . .| [1:1] VP -> * Verb
- |. > . . . . .| [1:1] Verb -> * 'saw'
- |. [------] . . . .| [1:2] Verb -> 'saw' *
- |. [------> . . . .| [1:2] VP -> Verb * NP
- |. [------] . . . .| [1:2] VP -> Verb *
- |[-------------] . . . .| [0:2] S -> NP VP *
- |. [------> . . . .| [1:2] VP -> VP * PP
- *** SWITCH TO BOTTOM UP
- |. . > . . . .| [2:2] NP -> * 'John'
- |. . . > . . .| [3:3] PP -> * 'with' NP
- |. . . > . . .| [3:3] Prep -> * 'with'
- |. . . . > . .| [4:4] Det -> * 'a'
- |. . . . . > .| [5:5] Noun -> * 'dog'
- |. . [------] . . .| [2:3] NP -> 'John' *
- |. . . [------> . .| [3:4] PP -> 'with' * NP
- |. . . [------] . .| [3:4] Prep -> 'with' *
- |. . . . [------] .| [4:5] Det -> 'a' *
- |. . . . . [------]| [5:6] Noun -> 'dog' *
- |. [-------------] . . .| [1:3] VP -> Verb NP *
- |[--------------------] . . .| [0:3] S -> NP VP *
- |. [-------------> . . .| [1:3] VP -> VP * PP
- |. . > . . . .| [2:2] S -> * NP VP
- |. . > . . . .| [2:2] NP -> * NP PP
- |. . . . > . .| [4:4] NP -> * Det Noun
- |. . [------> . . .| [2:3] S -> NP * VP
- |. . [------> . . .| [2:3] NP -> NP * PP
- |. . . . [------> .| [4:5] NP -> Det * Noun
- |. . . . [-------------]| [4:6] NP -> Det Noun *
- |. . . [--------------------]| [3:6] PP -> 'with' NP *
- |. [----------------------------------]| [1:6] VP -> VP PP *
- *** SWITCH TO TOP DOWN
- |. . > . . . .| [2:2] NP -> * Det Noun
- |. . . . > . .| [4:4] NP -> * NP PP
- |. . . > . . .| [3:3] VP -> * VP PP
- |. . . > . . .| [3:3] VP -> * Verb NP
- |. . . > . . .| [3:3] VP -> * Verb
- |[=========================================]| [0:6] S -> NP VP *
- |. [---------------------------------->| [1:6] VP -> VP * PP
- |. . [---------------------------]| [2:6] NP -> NP PP *
- |. . . . [------------->| [4:6] NP -> NP * PP
- |. [----------------------------------]| [1:6] VP -> Verb NP *
- |. . [--------------------------->| [2:6] S -> NP * VP
- |. . [--------------------------->| [2:6] NP -> NP * PP
- |[=========================================]| [0:6] S -> NP VP *
- |. [---------------------------------->| [1:6] VP -> VP * PP
- |. . . . . . >| [6:6] VP -> * VP PP
- |. . . . . . >| [6:6] VP -> * Verb NP
- |. . . . . . >| [6:6] VP -> * Verb
- *** SWITCH TO BOTTOM UP
- |. . . . > . .| [4:4] S -> * NP VP
- |. . . . [------------->| [4:6] S -> NP * VP
- *** SWITCH TO TOP DOWN
- *** SWITCH TO BOTTOM UP
- *** SWITCH TO TOP DOWN
- *** SWITCH TO BOTTOM UP
- *** SWITCH TO TOP DOWN
- *** SWITCH TO BOTTOM UP
- Nr edges in chart: 61
- (S
- (NP I)
- (VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog)))))
- (S
- (NP I)
- (VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog))))))
- <BLANKLINE>
-
-
-Unit tests for the Incremental Chart Parser class
--------------------------------------------------
-
-The incremental chart parsers are defined in earleychart.py.
-We use the demo() function for testing. We must turn off the display of times.
-
- >>> import nltk
-
-Earley Chart Parser
-
- >>> nltk.parse.earleychart.demo(print_times=False, trace=1,
- ... sent='I saw John with a dog', numparses=2)
- * Sentence:
- I saw John with a dog
- ['I', 'saw', 'John', 'with', 'a', 'dog']
- <BLANKLINE>
- |. I . saw . John . with . a . dog .|
- |[------] . . . . .| [0:1] 'I'
- |. [------] . . . .| [1:2] 'saw'
- |. . [------] . . .| [2:3] 'John'
- |. . . [------] . .| [3:4] 'with'
- |. . . . [------] .| [4:5] 'a'
- |. . . . . [------]| [5:6] 'dog'
- |> . . . . . .| [0:0] S -> * NP VP
- |> . . . . . .| [0:0] NP -> * NP PP
- |> . . . . . .| [0:0] NP -> * Det Noun
- |> . . . . . .| [0:0] NP -> * 'I'
- |[------] . . . . .| [0:1] NP -> 'I' *
- |[------> . . . . .| [0:1] S -> NP * VP
- |[------> . . . . .| [0:1] NP -> NP * PP
- |. > . . . . .| [1:1] VP -> * VP PP
- |. > . . . . .| [1:1] VP -> * Verb NP
- |. > . . . . .| [1:1] VP -> * Verb
- |. > . . . . .| [1:1] Verb -> * 'saw'
- |. [------] . . . .| [1:2] Verb -> 'saw' *
- |. [------> . . . .| [1:2] VP -> Verb * NP
- |. [------] . . . .| [1:2] VP -> Verb *
- |[-------------] . . . .| [0:2] S -> NP VP *
- |. [------> . . . .| [1:2] VP -> VP * PP
- |. . > . . . .| [2:2] NP -> * NP PP
- |. . > . . . .| [2:2] NP -> * Det Noun
- |. . > . . . .| [2:2] NP -> * 'John'
- |. . [------] . . .| [2:3] NP -> 'John' *
- |. [-------------] . . .| [1:3] VP -> Verb NP *
- |. . [------> . . .| [2:3] NP -> NP * PP
- |. . . > . . .| [3:3] PP -> * 'with' NP
- |[--------------------] . . .| [0:3] S -> NP VP *
- |. [-------------> . . .| [1:3] VP -> VP * PP
- |. . . [------> . .| [3:4] PP -> 'with' * NP
- |. . . . > . .| [4:4] NP -> * NP PP
- |. . . . > . .| [4:4] NP -> * Det Noun
- |. . . . > . .| [4:4] Det -> * 'a'
- |. . . . [------] .| [4:5] Det -> 'a' *
- |. . . . [------> .| [4:5] NP -> Det * Noun
- |. . . . . > .| [5:5] Noun -> * 'dog'
- |. . . . . [------]| [5:6] Noun -> 'dog' *
- |. . . . [-------------]| [4:6] NP -> Det Noun *
- |. . . [--------------------]| [3:6] PP -> 'with' NP *
- |. . . . [------------->| [4:6] NP -> NP * PP
- |. . [---------------------------]| [2:6] NP -> NP PP *
- |. [----------------------------------]| [1:6] VP -> VP PP *
- |[=========================================]| [0:6] S -> NP VP *
- |. [---------------------------------->| [1:6] VP -> VP * PP
- |. [----------------------------------]| [1:6] VP -> Verb NP *
- |. . [--------------------------->| [2:6] NP -> NP * PP
- |[=========================================]| [0:6] S -> NP VP *
- |. [---------------------------------->| [1:6] VP -> VP * PP
- (S
- (NP I)
- (VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog)))))
- (S
- (NP I)
- (VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog))))))
-
-
-Unit tests for LARGE context-free grammars
-------------------------------------------
-
-Reading the ATIS grammar.
-
- >>> grammar = nltk.data.load('grammars/large_grammars/atis.cfg')
- >>> grammar
- <Grammar with 5517 productions>
-
-Reading the test sentences.
-
- >>> sentences = nltk.data.load('grammars/large_grammars/atis_sentences.txt')
- >>> sentences = nltk.parse.util.extract_test_sentences(sentences)
- >>> len(sentences)
- 98
- >>> testsentence = sentences[22]
- >>> testsentence[0]
- ['show', 'me', 'northwest', 'flights', 'to', 'detroit', '.']
- >>> testsentence[1]
- 17
- >>> sentence = testsentence[0]
-
-Now we test all the different parsing strategies.
-Note that the number of edges differs between the strategies.
-
-Bottom-up parsing.
-
- >>> parser = nltk.parse.BottomUpChartParser(grammar)
- >>> chart = parser.chart_parse(sentence)
- >>> print((chart.num_edges()))
- 7661
- >>> print((len(list(chart.parses(grammar.start())))))
- 17
-
-Bottom-up Left-corner parsing.
-
- >>> parser = nltk.parse.BottomUpLeftCornerChartParser(grammar)
- >>> chart = parser.chart_parse(sentence)
- >>> print((chart.num_edges()))
- 4986
- >>> print((len(list(chart.parses(grammar.start())))))
- 17
-
-Left-corner parsing with bottom-up filter.
-
- >>> parser = nltk.parse.LeftCornerChartParser(grammar)
- >>> chart = parser.chart_parse(sentence)
- >>> print((chart.num_edges()))
- 1342
- >>> print((len(list(chart.parses(grammar.start())))))
- 17
-
-Top-down parsing.
-
- >>> parser = nltk.parse.TopDownChartParser(grammar)
- >>> chart = parser.chart_parse(sentence)
- >>> print((chart.num_edges()))
- 28352
- >>> print((len(list(chart.parses(grammar.start())))))
- 17
-
-Incremental Bottom-up parsing.
-
- >>> parser = nltk.parse.IncrementalBottomUpChartParser(grammar)
- >>> chart = parser.chart_parse(sentence)
- >>> print((chart.num_edges()))
- 7661
- >>> print((len(list(chart.parses(grammar.start())))))
- 17
-
-Incremental Bottom-up Left-corner parsing.
-
- >>> parser = nltk.parse.IncrementalBottomUpLeftCornerChartParser(grammar)
- >>> chart = parser.chart_parse(sentence)
- >>> print((chart.num_edges()))
- 4986
- >>> print((len(list(chart.parses(grammar.start())))))
- 17
-
-Incremental Left-corner parsing with bottom-up filter.
-
- >>> parser = nltk.parse.IncrementalLeftCornerChartParser(grammar)
- >>> chart = parser.chart_parse(sentence)
- >>> print((chart.num_edges()))
- 1342
- >>> print((len(list(chart.parses(grammar.start())))))
- 17
-
-Incremental Top-down parsing.
-
- >>> parser = nltk.parse.IncrementalTopDownChartParser(grammar)
- >>> chart = parser.chart_parse(sentence)
- >>> print((chart.num_edges()))
- 28352
- >>> print((len(list(chart.parses(grammar.start())))))
- 17
-
-Earley parsing. This is similar to the incremental top-down algorithm.
-
- >>> parser = nltk.parse.EarleyChartParser(grammar)
- >>> chart = parser.chart_parse(sentence)
- >>> print((chart.num_edges()))
- 28352
- >>> print((len(list(chart.parses(grammar.start())))))
- 17
-
-
-Unit tests for the Probabilistic CFG class
-------------------------------------------
-
- >>> from nltk.corpus import treebank
- >>> from itertools import islice
- >>> from nltk.grammar import PCFG, induce_pcfg, toy_pcfg1, toy_pcfg2
-
-Create a set of PCFG productions.
-
- >>> grammar = PCFG.fromstring("""
- ... A -> B B [.3] | C B C [.7]
- ... B -> B D [.5] | C [.5]
- ... C -> 'a' [.1] | 'b' [0.9]
- ... D -> 'b' [1.0]
- ... """)
- >>> prod = grammar.productions()[0]
- >>> prod
- A -> B B [0.3]
-
- >>> prod.lhs()
- A
-
- >>> prod.rhs()
- (B, B)
-
- >>> print((prod.prob()))
- 0.3
-
- >>> grammar.start()
- A
-
- >>> grammar.productions()
- [A -> B B [0.3], A -> C B C [0.7], B -> B D [0.5], B -> C [0.5], C -> 'a' [0.1], C -> 'b' [0.9], D -> 'b' [1.0]]
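-
-A valid PCFG requires the production probabilities for each left-hand side to
-sum to (approximately) one, which we can confirm directly:
-
- >>> lhss = set(prod.lhs() for prod in grammar.productions())
- >>> all(abs(sum(prod.prob() for prod in grammar.productions(lhs=lhs)) - 1) < 1e-6
- ...     for lhs in lhss)
- True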
-
-Induce some productions using parsed Treebank data.
-
- >>> productions = []
- >>> for fileid in treebank.fileids()[:2]:
- ... for t in treebank.parsed_sents(fileid):
- ... productions += t.productions()
-
- >>> grammar = induce_pcfg(S, productions)
- >>> grammar
- <Grammar with 71 productions>
-
- >>> sorted(grammar.productions(lhs=Nonterminal('PP')))[:2]
- [PP -> IN NP [1.0]]
- >>> sorted(grammar.productions(lhs=Nonterminal('NNP')))[:2]
- [NNP -> 'Agnew' [0.0714286], NNP -> 'Consolidated' [0.0714286]]
- >>> sorted(grammar.productions(lhs=Nonterminal('JJ')))[:2]
- [JJ -> 'British' [0.142857], JJ -> 'former' [0.142857]]
- >>> sorted(grammar.productions(lhs=Nonterminal('NP')))[:2]
- [NP -> CD NNS [0.133333], NP -> DT JJ JJ NN [0.0666667]]
-
-Unit tests for the Probabilistic Chart Parse classes
-----------------------------------------------------
-
- >>> tokens = "Jack saw Bob with my cookie".split()
- >>> grammar = toy_pcfg2
- >>> print(grammar)
- Grammar with 23 productions (start state = S)
- S -> NP VP [1.0]
- VP -> V NP [0.59]
- VP -> V [0.4]
- VP -> VP PP [0.01]
- NP -> Det N [0.41]
- NP -> Name [0.28]
- NP -> NP PP [0.31]
- PP -> P NP [1.0]
- V -> 'saw' [0.21]
- V -> 'ate' [0.51]
- V -> 'ran' [0.28]
- N -> 'boy' [0.11]
- N -> 'cookie' [0.12]
- N -> 'table' [0.13]
- N -> 'telescope' [0.14]
- N -> 'hill' [0.5]
- Name -> 'Jack' [0.52]
- Name -> 'Bob' [0.48]
- P -> 'with' [0.61]
- P -> 'under' [0.39]
- Det -> 'the' [0.41]
- Det -> 'a' [0.31]
- Det -> 'my' [0.28]
-
-Create several parsers using different queuing strategies and show the
-resulting parses.
-
- >>> from nltk.parse import pchart
-
- >>> parser = pchart.InsideChartParser(grammar)
- >>> for t in parser.parse(tokens):
- ... print(t)
- (S
- (NP (Name Jack))
- (VP
- (V saw)
- (NP
- (NP (Name Bob))
- (PP (P with) (NP (Det my) (N cookie)))))) (p=6.31607e-06)
- (S
- (NP (Name Jack))
- (VP
- (VP (V saw) (NP (Name Bob)))
- (PP (P with) (NP (Det my) (N cookie))))) (p=2.03744e-07)
-
- >>> parser = pchart.RandomChartParser(grammar)
- >>> for t in parser.parse(tokens):
- ... print(t)
- (S
- (NP (Name Jack))
- (VP
- (V saw)
- (NP
- (NP (Name Bob))
- (PP (P with) (NP (Det my) (N cookie)))))) (p=6.31607e-06)
- (S
- (NP (Name Jack))
- (VP
- (VP (V saw) (NP (Name Bob)))
- (PP (P with) (NP (Det my) (N cookie))))) (p=2.03744e-07)
-
- >>> parser = pchart.UnsortedChartParser(grammar)
- >>> for t in parser.parse(tokens):
- ... print(t)
- (S
- (NP (Name Jack))
- (VP
- (V saw)
- (NP
- (NP (Name Bob))
- (PP (P with) (NP (Det my) (N cookie)))))) (p=6.31607e-06)
- (S
- (NP (Name Jack))
- (VP
- (VP (V saw) (NP (Name Bob)))
- (PP (P with) (NP (Det my) (N cookie))))) (p=2.03744e-07)
-
- >>> parser = pchart.LongestChartParser(grammar)
- >>> for t in parser.parse(tokens):
- ... print(t)
- (S
- (NP (Name Jack))
- (VP
- (V saw)
- (NP
- (NP (Name Bob))
- (PP (P with) (NP (Det my) (N cookie)))))) (p=6.31607e-06)
- (S
- (NP (Name Jack))
- (VP
- (VP (V saw) (NP (Name Bob)))
- (PP (P with) (NP (Det my) (N cookie))))) (p=2.03744e-07)
-
- >>> parser = pchart.InsideChartParser(grammar, beam_size = len(tokens)+1)
- >>> for t in parser.parse(tokens):
- ... print(t)
-
-
-Unit tests for the Viterbi Parse classes
-----------------------------------------
-
- >>> from nltk.parse import ViterbiParser
- >>> tokens = "Jack saw Bob with my cookie".split()
- >>> grammar = toy_pcfg2
-
-Parse the tokenized sentence.
-
- >>> parser = ViterbiParser(grammar)
- >>> for t in parser.parse(tokens):
- ... print(t)
- (S
- (NP (Name Jack))
- (VP
- (V saw)
- (NP
- (NP (Name Bob))
- (PP (P with) (NP (Det my) (N cookie)))))) (p=6.31607e-06)
-
-
-Unit tests for the FeatStructNonterminal class
-----------------------------------------------
-
- >>> from nltk.grammar import FeatStructNonterminal
- >>> FeatStructNonterminal(
- ... pos='n', agr=FeatStructNonterminal(number='pl', gender='f'))
- [agr=[gender='f', number='pl'], pos='n']
-
- >>> FeatStructNonterminal('VP[+fin]/NP[+pl]')
- VP[+fin]/NP[+pl]
-
-
-Tracing the Feature Chart Parser
---------------------------------
-
-We use the featurechart.demo() function for tracing the Feature Chart Parser.
-
- >>> nltk.parse.featurechart.demo(print_times=False,
- ... print_grammar=True,
- ... parser=nltk.parse.featurechart.FeatureChartParser,
- ... sent='I saw John with a dog')
- <BLANKLINE>
- Grammar with 18 productions (start state = S[])
- S[] -> NP[] VP[]
- PP[] -> Prep[] NP[]
- NP[] -> NP[] PP[]
- VP[] -> VP[] PP[]
- VP[] -> Verb[] NP[]
- VP[] -> Verb[]
- NP[] -> Det[pl=?x] Noun[pl=?x]
- NP[] -> 'John'
- NP[] -> 'I'
- Det[] -> 'the'
- Det[] -> 'my'
- Det[-pl] -> 'a'
- Noun[-pl] -> 'dog'
- Noun[-pl] -> 'cookie'
- Verb[] -> 'ate'
- Verb[] -> 'saw'
- Prep[] -> 'with'
- Prep[] -> 'under'
- <BLANKLINE>
- * FeatureChartParser
- Sentence: I saw John with a dog
- |.I.s.J.w.a.d.|
- |[-] . . . . .| [0:1] 'I'
- |. [-] . . . .| [1:2] 'saw'
- |. . [-] . . .| [2:3] 'John'
- |. . . [-] . .| [3:4] 'with'
- |. . . . [-] .| [4:5] 'a'
- |. . . . . [-]| [5:6] 'dog'
- |[-] . . . . .| [0:1] NP[] -> 'I' *
- |[-> . . . . .| [0:1] S[] -> NP[] * VP[] {}
- |[-> . . . . .| [0:1] NP[] -> NP[] * PP[] {}
- |. [-] . . . .| [1:2] Verb[] -> 'saw' *
- |. [-> . . . .| [1:2] VP[] -> Verb[] * NP[] {}
- |. [-] . . . .| [1:2] VP[] -> Verb[] *
- |. [-> . . . .| [1:2] VP[] -> VP[] * PP[] {}
- |[---] . . . .| [0:2] S[] -> NP[] VP[] *
- |. . [-] . . .| [2:3] NP[] -> 'John' *
- |. . [-> . . .| [2:3] S[] -> NP[] * VP[] {}
- |. . [-> . . .| [2:3] NP[] -> NP[] * PP[] {}
- |. [---] . . .| [1:3] VP[] -> Verb[] NP[] *
- |. [---> . . .| [1:3] VP[] -> VP[] * PP[] {}
- |[-----] . . .| [0:3] S[] -> NP[] VP[] *
- |. . . [-] . .| [3:4] Prep[] -> 'with' *
- |. . . [-> . .| [3:4] PP[] -> Prep[] * NP[] {}
- |. . . . [-] .| [4:5] Det[-pl] -> 'a' *
- |. . . . [-> .| [4:5] NP[] -> Det[pl=?x] * Noun[pl=?x] {?x: False}
- |. . . . . [-]| [5:6] Noun[-pl] -> 'dog' *
- |. . . . [---]| [4:6] NP[] -> Det[-pl] Noun[-pl] *
- |. . . . [--->| [4:6] S[] -> NP[] * VP[] {}
- |. . . . [--->| [4:6] NP[] -> NP[] * PP[] {}
- |. . . [-----]| [3:6] PP[] -> Prep[] NP[] *
- |. . [-------]| [2:6] NP[] -> NP[] PP[] *
- |. [---------]| [1:6] VP[] -> VP[] PP[] *
- |. [--------->| [1:6] VP[] -> VP[] * PP[] {}
- |[===========]| [0:6] S[] -> NP[] VP[] *
- |. . [------->| [2:6] S[] -> NP[] * VP[] {}
- |. . [------->| [2:6] NP[] -> NP[] * PP[] {}
- |. [---------]| [1:6] VP[] -> Verb[] NP[] *
- |. [--------->| [1:6] VP[] -> VP[] * PP[] {}
- |[===========]| [0:6] S[] -> NP[] VP[] *
- (S[]
- (NP[] I)
- (VP[]
- (VP[] (Verb[] saw) (NP[] John))
- (PP[] (Prep[] with) (NP[] (Det[-pl] a) (Noun[-pl] dog)))))
- (S[]
- (NP[] I)
- (VP[]
- (Verb[] saw)
- (NP[]
- (NP[] John)
- (PP[] (Prep[] with) (NP[] (Det[-pl] a) (Noun[-pl] dog))))))
-
-
-Unit tests for the Feature Chart Parser classes
------------------------------------------------
-
-The list of parsers we want to test.
-
- >>> parsers = [nltk.parse.featurechart.FeatureChartParser,
- ... nltk.parse.featurechart.FeatureTopDownChartParser,
- ... nltk.parse.featurechart.FeatureBottomUpChartParser,
- ... nltk.parse.featurechart.FeatureBottomUpLeftCornerChartParser,
- ... nltk.parse.earleychart.FeatureIncrementalChartParser,
- ... nltk.parse.earleychart.FeatureEarleyChartParser,
- ... nltk.parse.earleychart.FeatureIncrementalTopDownChartParser,
- ... nltk.parse.earleychart.FeatureIncrementalBottomUpChartParser,
- ... nltk.parse.earleychart.FeatureIncrementalBottomUpLeftCornerChartParser,
- ... ]
-
-A helper function that tests each parser on the given grammar and sentence.
-We check that the number of trees is correct, and that all parsers
-return the same trees. Otherwise an error is printed.
-
- >>> def unittest(grammar, sentence, nr_trees):
- ... sentence = sentence.split()
- ... trees = None
- ... for P in parsers:
- ... result = P(grammar).parse(sentence)
- ... result = set(tree.freeze() for tree in result)
- ... if len(result) != nr_trees:
- ... print("Wrong nr of trees:", len(result))
- ... elif trees is None:
- ... trees = result
- ... elif result != trees:
- ... print("Trees differ for parser:", P.__name__)
-
-The demo grammar from before, with an ambiguous sentence.
-
- >>> isawjohn = nltk.parse.featurechart.demo_grammar()
- >>> unittest(isawjohn, "I saw John with a dog with my cookie", 5)
-
-This grammar tests that variables in different grammar rules are renamed
-before unification. (The problematic variable in this case is ?X.)
-
- >>> whatwasthat = nltk.grammar.FeatureGrammar.fromstring('''
- ... S[] -> NP[num=?N] VP[num=?N, slash=?X]
- ... NP[num=?X] -> "what"
- ... NP[num=?X] -> "that"
- ... VP[num=?P, slash=none] -> V[num=?P] NP[]
- ... V[num=sg] -> "was"
- ... ''')
- >>> unittest(whatwasthat, "what was that", 1)
-
-This grammar tests that the same rule can be used in different places
-in another rule, and that the variables are properly renamed.
-
- >>> thislovesthat = nltk.grammar.FeatureGrammar.fromstring('''
- ... S[] -> NP[case=nom] V[] NP[case=acc]
- ... NP[case=?X] -> Pron[case=?X]
- ... Pron[] -> "this"
- ... Pron[] -> "that"
- ... V[] -> "loves"
- ... ''')
- >>> unittest(thislovesthat, "this loves that", 1)
-
-
-Tests for loading feature grammar files
----------------------------------------
-
-Alternative 1: first load the grammar, then create the parser.
-
- >>> fcfg = nltk.data.load('grammars/book_grammars/feat0.fcfg')
- >>> fcp1 = nltk.parse.FeatureChartParser(fcfg)
- >>> print((type(fcp1)))
- <class 'nltk.parse.featurechart.FeatureChartParser'>
-
-Alternative 2: directly load the parser.
-
- >>> fcp2 = nltk.parse.load_parser('grammars/book_grammars/feat0.fcfg')
- >>> print((type(fcp2)))
- <class 'nltk.parse.featurechart.FeatureChartParser'>
-
-
-
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-==================================
-Examples for Portuguese Processing
-==================================
-
-This HOWTO contains a variety of examples relating to the Portuguese language.
-It is intended to be read in conjunction with the NLTK book
-(``http://nltk.org/book``). For instructions on running the Python
-interpreter, please see the section *Getting Started with Python*, in Chapter 1.
-
---------------------------------------------
-Python Programming, with Portuguese Examples
---------------------------------------------
-
-Chapter 1 of the NLTK book contains many elementary programming examples, all
-with English texts. In this section, we'll see some corresponding examples
-using Portuguese. Please refer to the chapter for full discussion. *Vamos!*
-
- >>> from nltk.examples.pt import *
- *** Introductory Examples for the NLTK Book ***
- Loading ptext1, ... and psent1, ...
- Type the name of the text or sentence to view it.
- Type: 'texts()' or 'sents()' to list the materials.
- ptext1: Memórias Póstumas de Brás Cubas (1881)
- ptext2: Dom Casmurro (1899)
- ptext3: Gênesis
- ptext4: Folha de Sao Paulo (1994)
-
-
-Any time we want to find out about these texts, we just have
-to enter their names at the Python prompt:
-
- >>> ptext2
- <Text: Dom Casmurro (1899)>
-
-Searching Text
---------------
-
-A concordance permits us to see words in context.
-
- >>> ptext1.concordance('olhos')
- Building index...
- Displaying 25 of 138 matches:
- De pé , à cabeceira da cama , com os olhos estúpidos , a boca entreaberta , a t
- orelhas . Pela minha parte fechei os olhos e deixei - me ir à ventura . Já agor
- xões de cérebro enfermo . Como ia de olhos fechados , não via o caminho ; lembr
- gelos eternos . Com efeito , abri os olhos e vi que o meu animal galopava numa
- me apareceu então , fitando - me uns olhos rutilantes como o sol . Tudo nessa f
- mim mesmo . Então , encarei - a com olhos súplices , e pedi mais alguns anos .
- ...
-
-For a given word, we can find words with a similar text distribution:
-
- >>> ptext1.similar('chegar')
- Building word-context index...
- acabada acudir aludir avistar bramanismo casamento cheguei com contar
- contrário corpo dali deixei desferirem dizer fazer filhos já leitor lhe
- >>> ptext3.similar('chegar')
- Building word-context index...
- achar alumiar arrombar destruir governar guardar ir lavrar passar que
- toda tomar ver vir
-
-We can search for the statistically significant collocations in a text:
-
- >>> ptext1.collocations()
- Building collocations list
- Quincas Borba; Lobo Neves; alguma coisa; Brás Cubas; meu pai; dia
- seguinte; não sei; Meu pai; alguns instantes; outra vez; outra coisa;
- por exemplo; mim mesmo; coisa nenhuma; mesma coisa; não era; dias
- depois; Passeio Público; olhar para; das coisas
-
-We can search for words in context, with the help of *regular expressions*, e.g.:
-
- >>> ptext1.findall("<olhos> (<.*>)")
- estúpidos; e; fechados; rutilantes; súplices; a; do; babavam;
- na; moles; se; da; umas; espraiavam; chamejantes; espetados;
- ...
-
-We can automatically generate random text based on a given text, e.g.:
-
- >>> ptext3.generate() # doctest: +SKIP
- No princípio , criou Deus os abençoou , dizendo : Onde { estão } e até
- à ave dos céus , { que } será . Disse mais Abrão : Dá - me a mulher
- que tomaste ; porque daquele poço Eseque , { tinha .} E disse : Não
- poderemos descer ; mas , do campo ainda não estava na casa do teu
- pescoço . E viveu Serugue , depois Simeão e Levi { são } estes ? E o
- varão , porque habitava na terra de Node , da mão de Esaú : Jeús ,
- Jalão e Corá
-
-Texts as List of Words
-----------------------
-
-A few sentences have been defined for you.
-
- >>> psent1
- ['o', 'amor', 'da', 'gl\xf3ria', 'era', 'a', 'coisa', 'mais',
- 'verdadeiramente', 'humana', 'que', 'h\xe1', 'no', 'homem', ',',
- 'e', ',', 'conseq\xfcentemente', ',', 'a', 'sua', 'mais',
- 'genu\xedna', 'fei\xe7\xe3o', '.']
- >>>
-
-Notice that the sentence has been *tokenized*. Each token is
-represented as a string, displayed using quotes, e.g. ``'coisa'``.
-Some strings contain special characters, e.g. ``\xf3``,
-the internal representation for ó.
-The tokens are combined in the form of a *list*. How long is this list?
-
- >>> len(psent1)
- 25
- >>>
-
-What is the vocabulary of this sentence?
-
- >>> sorted(set(psent1))
- [',', '.', 'a', 'amor', 'coisa', 'conseqüentemente', 'da', 'e', 'era',
- 'feição', 'genuína', 'glória', 'homem', 'humana', 'há', 'mais', 'no',
- 'o', 'que', 'sua', 'verdadeiramente']
- >>>
-
-Let's iterate over each item in ``psent2``, and print information for each:
-
- >>> for w in psent2:
- ... print(w, len(w), w[-1])
- ...
- Não 3 o
- consultes 9 s
- dicionários 11 s
- . 1 .
-
-Notice that we accessed the last character of a string ``w`` using ``w[-1]``.
-
-We just saw a ``for`` loop above. Another useful control structure is a
-*list comprehension*.
-
- >>> [w.upper() for w in psent2]
- ['N\xc3O', 'CONSULTES', 'DICION\xc1RIOS', '.']
- >>> [w for w in psent1 if w.endswith('a')]
- ['da', 'gl\xf3ria', 'era', 'a', 'coisa', 'humana', 'a', 'sua', 'genu\xedna']
- >>> [w for w in ptext4 if len(w) > 15]
- ['norte-irlandeses', 'pan-nacionalismo', 'predominatemente', 'primeiro-ministro',
- 'primeiro-ministro', 'irlandesa-americana', 'responsabilidades', 'significativamente']
-
-We can examine the relative frequency of words in a text, using ``FreqDist``:
-
- >>> fd1 = FreqDist(ptext1)
- >>> fd1
- <FreqDist with 10848 samples and 77098 outcomes>
- >>> fd1['olhos']
- 137
- >>> fd1.max()
- ','
- >>> [w for (w, freq) in fd1.most_common(100)]
- [',', '.', 'a', 'que', 'de', 'e', '-', 'o', ';', 'me', 'um', 'n\xe3o',
- '\x97', 'se', 'do', 'da', 'uma', 'com', 'os', '\xe9', 'era', 'as', 'eu',
- 'lhe', 'ao', 'em', 'para', 'mas', '...', '!', '\xe0', 'na', 'mais', '?',
- 'no', 'como', 'por', 'N\xe3o', 'dos', 'o', 'ele', ':', 'Virg\xedlia',
- 'me', 'disse', 'minha', 'das', 'O', '/', 'A', 'CAP\xcdTULO', 'muito',
- 'depois', 'coisa', 'foi', 'sem', 'olhos', 'ela', 'nos', 'tinha', 'nem',
- 'E', 'outro', 'vida', 'nada', 'tempo', 'menos', 'outra', 'casa', 'homem',
- 'porque', 'quando', 'mim', 'mesmo', 'ser', 'pouco', 'estava', 'dia',
- 't\xe3o', 'tudo', 'Mas', 'at\xe9', 'D', 'ainda', 's\xf3', 'alguma',
- 'la', 'vez', 'anos', 'h\xe1', 'Era', 'pai', 'esse', 'lo', 'dizer', 'assim',
- 'ent\xe3o', 'dizia', 'aos', 'Borba']
-
----------------
-Reading Corpora
----------------
-
-Accessing the Machado Text Corpus
----------------------------------
-
-NLTK includes the complete works of Machado de Assis.
-
- >>> from nltk.corpus import machado
- >>> machado.fileids()
- ['contos/macn001.txt', 'contos/macn002.txt', 'contos/macn003.txt', ...]
-
-Each file corresponds to one of the works of Machado de Assis. To see a complete
-list of works, you can look at the corpus README file: ``print(machado.readme())``.
-Let's access the text of the *Posthumous Memories of Brás Cubas*.
-
-We can access the text as a list of characters, and access 200 characters starting
-from position 10,000.
-
- >>> raw_text = machado.raw('romance/marm05.txt')
- >>> raw_text[10000:10200]
- u', primou no\nEstado, e foi um dos amigos particulares do vice-rei Conde
- da Cunha.\n\nComo este apelido de Cubas lhe\ncheirasse excessivamente a
- tanoaria, alegava meu pai, bisneto de Dami\xe3o, que o\ndito ape'
-
-However, this is not a very useful way to work with a text. We generally think
-of a text as a sequence of words and punctuation, not characters:
-
- >>> text1 = machado.words('romance/marm05.txt')
- >>> text1
- ['Romance', ',', 'Mem\xf3rias', 'P\xf3stumas', 'de', ...]
- >>> len(text1)
- 77098
- >>> len(set(text1))
- 10848
-
-Here's a program that finds the most common ngrams that contain a
-particular target word.
-
- >>> from nltk import ngrams, FreqDist
- >>> target_word = 'olhos'
- >>> fd = FreqDist(ng
- ... for ng in ngrams(text1, 5)
- ... if target_word in ng)
- >>> for (hit, count) in fd.most_common():
- ... print(' '.join(hit))
- ...
- , com os olhos no
- com os olhos no ar
- com os olhos no chão
- e todos com os olhos
- me estar com os olhos
- os olhos estúpidos , a
- os olhos na costura ,
- os olhos no ar ,
- , com os olhos espetados
- , com os olhos estúpidos
- , com os olhos fitos
- , com os olhos naquele
- , com os olhos para
-
-
-Accessing the MacMorpho Tagged Corpus
--------------------------------------
-
-NLTK includes the MAC-MORPHO Brazilian Portuguese POS-tagged news text,
-with over a million words of
-journalistic texts extracted from ten sections of
-the daily newspaper *Folha de Sao Paulo*, 1994.
-
-We can access this corpus as a sequence of words or tagged words as follows:
-
- >>> import nltk.corpus
- >>> nltk.corpus.mac_morpho.words()
- ['Jersei', 'atinge', 'm\xe9dia', 'de', 'Cr$', '1,4', ...]
- >>> nltk.corpus.mac_morpho.sents() # doctest: +NORMALIZE_WHITESPACE
- [['Jersei', 'atinge', 'm\xe9dia', 'de', 'Cr$', '1,4', 'milh\xe3o',
- 'em', 'a', 'venda', 'de', 'a', 'Pinhal', 'em', 'S\xe3o', 'Paulo'],
- ['Programe', 'sua', 'viagem', 'a', 'a', 'Exposi\xe7\xe3o', 'Nacional',
- 'do', 'Zeb', ',', 'que', 'come\xe7a', 'dia', '25'], ...]
- >>> nltk.corpus.mac_morpho.tagged_words()
- [('Jersei', 'N'), ('atinge', 'V'), ('m\xe9dia', 'N'), ...]
-
-We can also access it in sentence chunks.
-
- >>> nltk.corpus.mac_morpho.tagged_sents() # doctest: +NORMALIZE_WHITESPACE
- [[('Jersei', 'N'), ('atinge', 'V'), ('m\xe9dia', 'N'), ('de', 'PREP'),
- ('Cr$', 'CUR'), ('1,4', 'NUM'), ('milh\xe3o', 'N'), ('em', 'PREP|+'),
- ('a', 'ART'), ('venda', 'N'), ('de', 'PREP|+'), ('a', 'ART'),
- ('Pinhal', 'NPROP'), ('em', 'PREP'), ('S\xe3o', 'NPROP'),
- ('Paulo', 'NPROP')],
- [('Programe', 'V'), ('sua', 'PROADJ'), ('viagem', 'N'), ('a', 'PREP|+'),
- ('a', 'ART'), ('Exposi\xe7\xe3o', 'NPROP'), ('Nacional', 'NPROP'),
- ('do', 'NPROP'), ('Zeb', 'NPROP'), (',', ','), ('que', 'PRO-KS-REL'),
- ('come\xe7a', 'V'), ('dia', 'N'), ('25', 'N|AP')], ...]
-
-This data can be used to train taggers (examples below for the Floresta treebank).
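-
-As a quick illustration, here is a sketch of training a unigram tagger on the
-MAC-MORPHO tagged sentences, in the same style as the Floresta examples below
-(the split and variable names are ours, and the score is not checked):
-
- >>> msents = nltk.corpus.mac_morpho.tagged_sents()
- >>> mtrain, mtest = msents[100:], msents[:100]
- >>> mtagger = nltk.UnigramTagger(mtrain, backoff=nltk.DefaultTagger('N'))
- >>> mtagger.evaluate(mtest)  # doctest: +SKIP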
-
-Accessing the Floresta Portuguese Treebank
-------------------------------------------
-
-The NLTK data distribution includes the
-"Floresta Sinta(c)tica Corpus" version 7.4, available from
-``http://www.linguateca.pt/Floresta/``.
-
-We can access this corpus as a sequence of words or tagged words as follows:
-
- >>> from nltk.corpus import floresta
- >>> floresta.words()
- ['Um', 'revivalismo', 'refrescante', 'O', '7_e_Meio', ...]
- >>> floresta.tagged_words()
- [('Um', '>N+art'), ('revivalismo', 'H+n'), ...]
-
-The tags consist of some syntactic information, followed by a plus sign,
-followed by a conventional part-of-speech tag. Let's strip off the material before
-the plus sign:
-
- >>> def simplify_tag(t):
- ... if "+" in t:
- ... return t[t.index("+")+1:]
- ... else:
- ... return t
- >>> twords = floresta.tagged_words()
- >>> twords = [(w.lower(), simplify_tag(t)) for (w,t) in twords]
- >>> twords[:10]
- [('um', 'art'), ('revivalismo', 'n'), ('refrescante', 'adj'), ('o', 'art'), ('7_e_meio', 'prop'),
- ('\xe9', 'v-fin'), ('um', 'art'), ('ex-libris', 'n'), ('de', 'prp'), ('a', 'art')]
-
-Pretty printing the tagged words:
-
- >>> print(' '.join(word + '/' + tag for (word, tag) in twords[:10]))
- um/art revivalismo/n refrescante/adj o/art 7_e_meio/prop é/v-fin um/art ex-libris/n de/prp a/art
-
-Count the word tokens and types, and determine the most common word:
-
- >>> words = floresta.words()
- >>> len(words)
- 211852
- >>> fd = nltk.FreqDist(words)
- >>> len(fd)
- 29421
- >>> fd.max()
- 'de'
-
-List the 20 most frequent tags, in order of decreasing frequency:
-
- >>> tags = [simplify_tag(tag) for (word,tag) in floresta.tagged_words()]
- >>> fd = nltk.FreqDist(tags)
- >>> [t for (t, count) in fd.most_common(20)] # doctest: +NORMALIZE_WHITESPACE
- ['n', 'prp', 'art', 'v-fin', ',', 'prop', 'adj', 'adv', '.',
- 'conj-c', 'v-inf', 'pron-det', 'v-pcp', 'num', 'pron-indp',
- 'pron-pers', '\xab', '\xbb', 'conj-s', '}']
-
-We can also access the corpus grouped by sentence:
-
- >>> floresta.sents() # doctest: +NORMALIZE_WHITESPACE
- [['Um', 'revivalismo', 'refrescante'],
- ['O', '7_e_Meio', '\xe9', 'um', 'ex-libris', 'de', 'a', 'noite',
- 'algarvia', '.'], ...]
- >>> floresta.tagged_sents() # doctest: +NORMALIZE_WHITESPACE
- [[('Um', '>N+art'), ('revivalismo', 'H+n'), ('refrescante', 'N<+adj')],
- [('O', '>N+art'), ('7_e_Meio', 'H+prop'), ('\xe9', 'P+v-fin'),
- ('um', '>N+art'), ('ex-libris', 'H+n'), ('de', 'H+prp'),
- ('a', '>N+art'), ('noite', 'H+n'), ('algarvia', 'N<+adj'), ('.', '.')],
- ...]
- >>> floresta.parsed_sents() # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
- [Tree('UTT+np', [Tree('>N+art', ['Um']), Tree('H+n', ['revivalismo']),
- Tree('N<+adj', ['refrescante'])]),
- Tree('STA+fcl',
- [Tree('SUBJ+np', [Tree('>N+art', ['O']),
- Tree('H+prop', ['7_e_Meio'])]),
- Tree('P+v-fin', ['\xe9']),
- Tree('SC+np',
- [Tree('>N+art', ['um']),
- Tree('H+n', ['ex-libris']),
- Tree('N<+pp', [Tree('H+prp', ['de']),
- Tree('P<+np', [Tree('>N+art', ['a']),
- Tree('H+n', ['noite']),
- Tree('N<+adj', ['algarvia'])])])]),
- Tree('.', ['.'])]), ...]
-
-To view a parse tree, use the ``draw()`` method, e.g.:
-
- >>> psents = floresta.parsed_sents()
- >>> psents[5].draw() # doctest: +SKIP
-
-Character Encodings
--------------------
-
-Python understands the common character encoding used for Portuguese, ISO 8859-1 (ISO Latin 1).
-
- >>> import os, nltk.test
- >>> testdir = os.path.split(nltk.test.__file__)[0]
- >>> text = open(os.path.join(testdir, 'floresta.txt'), 'rb').read().decode('ISO 8859-1')
- >>> text[:60]
- 'O 7 e Meio \xe9 um ex-libris da noite algarvia.\n\xc9 uma das mais '
- >>> print(text[:60])
- O 7 e Meio é um ex-libris da noite algarvia.
- É uma das mais
-
-For more information about character encodings and Python, please see section 3.3 of the book.
-
-----------------
-Processing Tasks
-----------------
-
-
-Simple Concordancing
---------------------
-
-Here's a function that takes a word and a specified amount of context (measured
-in characters), and generates a concordance for that word.
-
- >>> def concordance(word, context=30):
- ... for sent in floresta.sents():
- ... if word in sent:
- ... pos = sent.index(word)
- ... left = ' '.join(sent[:pos])
- ... right = ' '.join(sent[pos+1:])
- ... print('%*s %s %-*s' %
- ... (context, left[-context:], word, context, right[:context]))
-
- >>> concordance("dar") # doctest: +SKIP
- anduru , foi o suficiente para dar a volta a o resultado .
- 1. O PÚBLICO veio dar a a imprensa diária portuguesa
- A fartura de pensamento pode dar maus resultados e nós não quer
- Começa a dar resultados a política de a Uni
- ial começar a incorporar- lo e dar forma a um ' site ' que tem se
- r com Constantino para ele lhe dar também os papéis assinados .
- va a brincar , pois não lhe ia dar procuração nenhuma enquanto nã
- érica como o antídoto capaz de dar sentido a o seu enorme poder .
- . . .
- >>> concordance("vender") # doctest: +SKIP
- er recebido uma encomenda para vender 4000 blindados a o Iraque .
- mérico_Amorim caso conseguisse vender o lote de acções de o empresár
- mpre ter jovens simpáticos a « vender » chega ! }
- Disse que o governo vai vender « desde automóvel até particip
- ndiciou ontem duas pessoas por vender carro com ágio .
- A intenção de Fleury é vender as acções para equilibrar as fi
-
-Part-of-Speech Tagging
-----------------------
-
-Let's begin by getting the tagged sentence data, and simplifying the tags
-as described earlier.
-
- >>> from nltk.corpus import floresta
- >>> tsents = floresta.tagged_sents()
- >>> tsents = [[(w.lower(),simplify_tag(t)) for (w,t) in sent] for sent in tsents if sent]
- >>> train = tsents[100:]
- >>> test = tsents[:100]
-
-We already know that ``n`` is the most common tag, so we can set up a
-default tagger that tags every word as a noun, and see how well it does:
-
- >>> tagger0 = nltk.DefaultTagger('n')
- >>> tagger0.evaluate(test)
- 0.17697228144989338
-
-Evidently, about one in every six words is a noun. Let's improve on this by
-training a unigram tagger:
-
- >>> tagger1 = nltk.UnigramTagger(train, backoff=tagger0)
- >>> tagger1.evaluate(test)
- 0.87029140014214645
-
-Next a bigram tagger:
-
- >>> tagger2 = nltk.BigramTagger(train, backoff=tagger1)
- >>> tagger2.evaluate(test)
- 0.89019189765458417
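-
-A trigram tagger can be stacked on top in exactly the same way (a sketch; the
-score depends on the split, so we don't check it here):
-
- >>> tagger3 = nltk.TrigramTagger(train, backoff=tagger2)
- >>> tagger3.evaluate(test)  # doctest: +SKIP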
-
-
-Sentence Segmentation
----------------------
-
-Punkt is a language-neutral sentence segmentation tool. We load the pretrained
-Portuguese model and use it to split the raw text of a Machado novel into sentences:
-
- >>> sent_tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')
- >>> raw_text = machado.raw('romance/marm05.txt')
- >>> sentences = sent_tokenizer.tokenize(raw_text)
- >>> for sent in sentences[1000:1005]:
- ... print("<<", sent, ">>")
- ...
- << Em verdade, parecia ainda mais mulher do que era;
- seria criança nos seus folgares de moça; mas assim quieta, impassível, tinha a
- compostura da mulher casada. >>
- << Talvez essa circunstância lhe diminuía um pouco da
- graça virginal. >>
- << Depressa nos familiarizamos; a mãe fazia-lhe grandes elogios, eu
- escutava-os de boa sombra, e ela sorria com os olhos fúlgidos, como se lá dentro
- do cérebro lhe estivesse a voar uma borboletinha de asas de ouro e olhos de
- diamante... >>
- << Digo lá dentro, porque cá fora o
- que esvoaçou foi uma borboleta preta, que subitamente penetrou na varanda, e
- começou a bater as asas em derredor de D. Eusébia. >>
- << D. Eusébia deu um grito,
- levantou-se, praguejou umas palavras soltas: - T'esconjuro!... >>
-
-The sentence tokenizer can be trained and evaluated on other text.
-The source text (from the Floresta Portuguese Treebank) contains one sentence per line.
-We read the text, split it into its lines, and then join these lines together using
-spaces. Now the information about sentence breaks has been discarded. We split this
-material into training and testing data:
-
- >>> import os, nltk.test
- >>> testdir = os.path.split(nltk.test.__file__)[0]
- >>> text = open(os.path.join(testdir, 'floresta.txt'), 'rb').read().decode('ISO-8859-1')
- >>> lines = text.split('\n')
- >>> train = ' '.join(lines[10:])
- >>> test = ' '.join(lines[:10])
-
-Now we train the sentence segmenter (or sentence tokenizer) and use it on our test sentences:
-
- >>> stok = nltk.PunktSentenceTokenizer(train)
- >>> print(stok.tokenize(test))
- ['O 7 e Meio \xe9 um ex-libris da noite algarvia.',
- '\xc9 uma das mais antigas discotecas do Algarve, situada em Albufeira,
- que continua a manter os tra\xe7os decorativos e as clientelas de sempre.',
- '\xc9 um pouco a vers\xe3o de uma esp\xe9cie de \xaboutro lado\xbb da noite,
- a meio caminho entre os devaneios de uma fauna perif\xe9rica, seja de Lisboa,
- Londres, Dublin ou Faro e Portim\xe3o, e a postura circunspecta dos fi\xe9is da casa,
- que dela esperam a m\xfasica \xabgeracionista\xbb dos 60 ou dos 70.',
- 'N\xe3o deixa de ser, nos tempos que correm, um certo \xabvery typical\xbb algarvio,
- cabe\xe7a de cartaz para os que querem fugir a algumas movimenta\xe7\xf5es nocturnas
- j\xe1 a caminho da ritualiza\xe7\xe3o de massas, do g\xe9nero \xabvamos todos ao
- Calypso e encontramo-nos na Locomia\xbb.',
- 'E assim, aos 2,5 milh\xf5es que o Minist\xe9rio do Planeamento e Administra\xe7\xe3o
- do Territ\xf3rio j\xe1 gasta no pagamento do pessoal afecto a estes organismos,
- v\xeam juntar-se os montantes das obras propriamente ditas, que os munic\xedpios,
- j\xe1 com projectos na m\xe3o, v\xeam reivindicar junto do Executivo, como salienta
- aquele membro do Governo.',
- 'E o dinheiro \xabn\xe3o falta s\xf3 \xe0s c\xe2maras\xbb, lembra o secret\xe1rio de Estado,
- que considera que a solu\xe7\xe3o para as autarquias \xe9 \xabespecializarem-se em
- fundos comunit\xe1rios\xbb.',
- 'Mas como, se muitas n\xe3o disp\xf5em, nos seus quadros, dos t\xe9cnicos necess\xe1rios?',
- '\xabEncomendem-nos a projectistas de fora\xbb porque, se as obras vierem a ser financiadas,
- eles at\xe9 saem de gra\xe7a, j\xe1 que, nesse caso, \xabos fundos comunit\xe1rios pagam
- os projectos, o mesmo n\xe3o acontecendo quando eles s\xe3o feitos pelos GAT\xbb,
- dado serem organismos do Estado.',
- 'Essa poder\xe1 vir a ser uma hip\xf3tese, at\xe9 porque, no terreno, a capacidade dos GAT
- est\xe1 cada vez mais enfraquecida.',
- 'Alguns at\xe9 j\xe1 desapareceram, como o de Castro Verde, e outros t\xeam vindo a perder quadros.']
-
-NLTK's data collection includes a trained model for Portuguese sentence
-segmentation, which can be loaded as follows. It is faster to load a trained model than
-to retrain it.
-
- >>> stok = nltk.data.load('tokenizers/punkt/portuguese.pickle')
-
-Stemming
---------
-
-NLTK includes the RSLP Portuguese stemmer. Here we use it to stem some Portuguese text:
-
- >>> stemmer = nltk.stem.RSLPStemmer()
- >>> stemmer.stem("copiar")
- 'copi'
- >>> stemmer.stem("paisagem")
- 'pais'
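-
-The stemmer works one token at a time, so we can map it over a tokenized
-sentence (a sketch; the stems are not checked here):
-
- >>> [stemmer.stem(w) for w in ['as', 'paisagens', 'bonitas']]  # doctest: +SKIP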
-
-
-Stopwords
----------
-
-NLTK includes Portuguese stopwords:
-
- >>> stopwords = nltk.corpus.stopwords.words('portuguese')
- >>> stopwords[:10]
- ['a', 'ao', 'aos', 'aquela', 'aquelas', 'aquele', 'aqueles', 'aquilo', 'as', 'at\xe9']
-
-Now we can use these to filter text. Let's find the most frequent words (other than stopwords)
-and print them in descending order of frequency:
-
- >>> fd = nltk.FreqDist(w.lower() for w in floresta.words() if w not in stopwords)
- >>> for (word, freq) in fd.most_common(20):
- ...     print(word, freq)
- , 13444
- . 7725
- « 2369
- » 2310
- é 1305
- o 1086
- } 1047
- { 1044
- a 897
- ; 633
- em 516
- ser 466
- sobre 349
- os 313
- anos 301
- ontem 292
- ainda 279
- segundo 256
- ter 249
- dois 231
-
+++ /dev/null
-# -*- coding: utf-8 -*-
-from nltk.corpus import teardown_module
-
-
-def setup_module(module):
- from nose import SkipTest
-
- raise SkipTest(
- "portuguese_en.doctest imports nltk.examples.pt which doesn't exist!"
- )
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-===========
-Probability
-===========
-
- >>> import nltk
- >>> from nltk.probability import *
-
-FreqDist
---------
-
- >>> text1 = ['no', 'good', 'fish', 'goes', 'anywhere', 'without', 'a', 'porpoise', '!']
- >>> text2 = ['no', 'good', 'porpoise', 'likes', 'to', 'fish', 'fish', 'anywhere', '.']
-
- >>> fd1 = nltk.FreqDist(text1)
- >>> fd1 == nltk.FreqDist(text1)
- True
-
-Note that items are sorted in order of decreasing frequency; two items of the same frequency appear in indeterminate order.
-
- >>> import itertools
- >>> both = nltk.FreqDist(text1 + text2)
- >>> both_most_common = both.most_common()
- >>> list(itertools.chain(*(sorted(ys) for k, ys in itertools.groupby(both_most_common, key=lambda t: t[1]))))
- [('fish', 3), ('anywhere', 2), ('good', 2), ('no', 2), ('porpoise', 2), ('!', 1), ('.', 1), ('a', 1), ('goes', 1), ('likes', 1), ('to', 1), ('without', 1)]
-
- >>> both == fd1 + nltk.FreqDist(text2)
- True
- >>> fd1 == nltk.FreqDist(text1) # But fd1 is unchanged
- True
-
- >>> fd2 = nltk.FreqDist(text2)
- >>> fd1.update(fd2)
- >>> fd1 == both
- True
-
- >>> fd1 = nltk.FreqDist(text1)
- >>> fd1.update(text2)
- >>> fd1 == both
- True
-
- >>> fd1 = nltk.FreqDist(text1)
- >>> fd2 = nltk.FreqDist(fd1)
- >>> fd2 == fd1
- True
-
-``nltk.FreqDist`` can be pickled:
-
- >>> import pickle
- >>> fd1 = nltk.FreqDist(text1)
- >>> pickled = pickle.dumps(fd1)
- >>> fd1 == pickle.loads(pickled)
- True
-
-Mathematical operations:
-
- >>> FreqDist('abbb') + FreqDist('bcc')
- FreqDist({'b': 4, 'c': 2, 'a': 1})
- >>> FreqDist('abbbc') - FreqDist('bccd')
- FreqDist({'b': 2, 'a': 1})
- >>> FreqDist('abbb') | FreqDist('bcc')
- FreqDist({'b': 3, 'c': 2, 'a': 1})
- >>> FreqDist('abbb') & FreqDist('bcc')
- FreqDist({'b': 1})
-
-ConditionalFreqDist
--------------------
-
- >>> cfd1 = ConditionalFreqDist()
- >>> cfd1[1] = FreqDist('abbbb')
- >>> cfd1[2] = FreqDist('xxxxyy')
- >>> cfd1
- <ConditionalFreqDist with 2 conditions>
-
- >>> cfd2 = ConditionalFreqDist()
- >>> cfd2[1] = FreqDist('bbccc')
- >>> cfd2[2] = FreqDist('xxxyyyzz')
- >>> cfd2[3] = FreqDist('m')
- >>> cfd2
- <ConditionalFreqDist with 3 conditions>
-
- >>> r = cfd1 + cfd2
- >>> [(i,r[i]) for i in r.conditions()]
- [(1, FreqDist({'b': 6, 'c': 3, 'a': 1})), (2, FreqDist({'x': 7, 'y': 5, 'z': 2})), (3, FreqDist({'m': 1}))]
-
- >>> r = cfd1 - cfd2
- >>> [(i,r[i]) for i in r.conditions()]
- [(1, FreqDist({'b': 2, 'a': 1})), (2, FreqDist({'x': 1}))]
-
- >>> r = cfd1 | cfd2
- >>> [(i,r[i]) for i in r.conditions()]
- [(1, FreqDist({'b': 4, 'c': 3, 'a': 1})), (2, FreqDist({'x': 4, 'y': 3, 'z': 2})), (3, FreqDist({'m': 1}))]
-
- >>> r = cfd1 & cfd2
- >>> [(i,r[i]) for i in r.conditions()]
- [(1, FreqDist({'b': 2})), (2, FreqDist({'x': 3, 'y': 2}))]
-
-Testing some HMM estimators
----------------------------
-
-We extract a small part (500 sentences) of the Brown corpus
-
- >>> corpus = nltk.corpus.brown.tagged_sents(categories='adventure')[:500]
- >>> print(len(corpus))
- 500
-
-We create an HMM trainer. Note that we need the tags and symbols
-from the whole corpus, not just the training corpus:
-
- >>> from nltk.util import unique_list
- >>> tag_set = unique_list(tag for sent in corpus for (word,tag) in sent)
- >>> print(len(tag_set))
- 92
- >>> symbols = unique_list(word for sent in corpus for (word,tag) in sent)
- >>> print(len(symbols))
- 1464
- >>> trainer = nltk.tag.HiddenMarkovModelTrainer(tag_set, symbols)
-
-We divide the corpus into 90% training and 10% testing:
-
- >>> train_corpus = []
- >>> test_corpus = []
- >>> for i in range(len(corpus)):
- ... if i % 10:
- ... train_corpus += [corpus[i]]
- ... else:
- ... test_corpus += [corpus[i]]
- >>> print(len(train_corpus))
- 450
- >>> print(len(test_corpus))
- 50
-
-Now we can test the estimators:
-
- >>> def train_and_test(est):
- ... hmm = trainer.train_supervised(train_corpus, estimator=est)
- ... print('%.2f%%' % (100 * hmm.evaluate(test_corpus)))
-
-Maximum Likelihood Estimation
------------------------------
-- this resulted in an initialization error before r7209
-
- >>> mle = lambda fd, bins: MLEProbDist(fd)
- >>> train_and_test(mle)
- 22.75%
-
-Laplace (= Lidstone with gamma==1)
-
- >>> train_and_test(LaplaceProbDist)
- 66.04%
-
-Expected Likelihood Estimation (= Lidstone with gamma==0.5)
-
- >>> train_and_test(ELEProbDist)
- 73.01%
-
-Lidstone Estimation, for gamma==0.1, 0.5 and 1
-(the latter two should be exactly equal to ELE and Laplace above)
-
- >>> def lidstone(gamma):
- ... return lambda fd, bins: LidstoneProbDist(fd, gamma, bins)
- >>> train_and_test(lidstone(0.1))
- 82.51%
- >>> train_and_test(lidstone(0.5))
- 73.01%
- >>> train_and_test(lidstone(1.0))
- 66.04%
-
-Witten Bell Estimation
-----------------------
-- This resulted in ZeroDivisionError before r7209
-
- >>> train_and_test(WittenBellProbDist)
- 88.12%
-
-Good Turing Estimation
-----------------------
-
- >>> gt = lambda fd, bins: SimpleGoodTuringProbDist(fd, bins=1e5)
- >>> train_and_test(gt)
- 86.93%
-
-Kneser Ney Estimation
----------------------
-Since the Kneser-Ney distribution is best suited for trigrams, we must adjust
-our testing accordingly.
-
- >>> corpus = [[((x[0],y[0],z[0]),(x[1],y[1],z[1]))
- ... for x, y, z in nltk.trigrams(sent)]
- ... for sent in corpus[:100]]
-
-We will then need to redefine the rest of the training/testing variables:
-
- >>> tag_set = unique_list(tag for sent in corpus for (word,tag) in sent)
- >>> len(tag_set)
- 906
-
- >>> symbols = unique_list(word for sent in corpus for (word,tag) in sent)
- >>> len(symbols)
- 1341
-
- >>> trainer = nltk.tag.HiddenMarkovModelTrainer(tag_set, symbols)
- >>> train_corpus = []
- >>> test_corpus = []
-
- >>> for i in range(len(corpus)):
- ... if i % 10:
- ... train_corpus += [corpus[i]]
- ... else:
- ... test_corpus += [corpus[i]]
-
- >>> len(train_corpus)
- 90
- >>> len(test_corpus)
- 10
-
- >>> kn = lambda fd, bins: KneserNeyProbDist(fd)
- >>> train_and_test(kn)
- 0.86%
-
-Remains to be added:
-- Tests for HeldoutProbDist, CrossValidationProbDist and MutableProbDist
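-
-A sketch of what such tests might look like (the exact probabilities are not
-checked here, since they depend on the smoothing details):
-
- >>> base, heldout = FreqDist('aab'), FreqDist('abb')
- >>> hd = HeldoutProbDist(base, heldout, 2)
- >>> hd.prob('a')  # doctest: +SKIP
- >>> cv = CrossValidationProbDist([base, heldout], 2)
- >>> cv.prob('a')  # doctest: +SKIP
- >>> mp = MutableProbDist(MLEProbDist(base), ['a', 'b'])
- >>> mp.prob('a')  # doctest: +SKIP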
-
-Squashed bugs
--------------
-
-Issue 511: override pop and popitem to invalidate the cache
-
- >>> fd = nltk.FreqDist('a')
- >>> list(fd.keys())
- ['a']
- >>> fd.pop('a')
- 1
- >>> list(fd.keys())
- []
-
-Issue 533: access cumulative frequencies with no arguments
-
- >>> fd = nltk.FreqDist('aab')
- >>> list(fd._cumulative_frequencies(['a']))
- [2.0]
- >>> list(fd._cumulative_frequencies(['a', 'b']))
- [2.0, 3.0]
-
-Issue 579: override clear to reset some variables
-
- >>> fd = FreqDist('aab')
- >>> fd.clear()
- >>> fd.N()
- 0
-
-Issue 351: fix the fileids method of CategorizedCorpusReader so that it
-does not inadvertently add errant categories
-
- >>> from nltk.corpus import brown
- >>> brown.fileids('blah')
- Traceback (most recent call last):
- ...
- ValueError: Category blah not found
- >>> brown.categories()
- ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
-
-Issue 175: add the unseen bin to SimpleGoodTuringProbDist by default
-otherwise any unseen events get a probability of zero, i.e.,
-they don't get smoothed
-
- >>> from nltk import SimpleGoodTuringProbDist, FreqDist
- >>> fd = FreqDist({'a':1, 'b':1, 'c': 2, 'd': 3, 'e': 4, 'f': 4, 'g': 4, 'h': 5, 'i': 5, 'j': 6, 'k': 6, 'l': 6, 'm': 7, 'n': 7, 'o': 8, 'p': 9, 'q': 10})
- >>> p = SimpleGoodTuringProbDist(fd)
- >>> p.prob('a')
- 0.017649766667026317...
- >>> p.prob('o')
- 0.08433050215340411...
- >>> p.prob('z')
- 0.022727272727272728...
- >>> p.prob('foobar')
- 0.022727272727272728...
-
-``MLEProbDist``, ``ConditionalProbDist``, ``DictionaryConditionalProbDist`` and
-``ConditionalFreqDist`` can be pickled:
-
- >>> import pickle
- >>> pd = MLEProbDist(fd)
- >>> sorted(pd.samples()) == sorted(pickle.loads(pickle.dumps(pd)).samples())
- True
- >>> dpd = DictionaryConditionalProbDist({'x': pd})
- >>> unpickled = pickle.loads(pickle.dumps(dpd))
- >>> dpd['x'].prob('a')
- 0.011363636...
- >>> dpd['x'].prob('a') == unpickled['x'].prob('a')
- True
- >>> cfd = nltk.probability.ConditionalFreqDist()
- >>> cfd['foo']['hello'] += 1
- >>> cfd['foo']['hello'] += 1
- >>> cfd['bar']['hello'] += 1
- >>> cfd2 = pickle.loads(pickle.dumps(cfd))
- >>> cfd2 == cfd
- True
- >>> cpd = ConditionalProbDist(cfd, SimpleGoodTuringProbDist)
- >>> cpd2 = pickle.loads(pickle.dumps(cpd))
- >>> cpd['foo'].prob('hello') == cpd2['foo'].prob('hello')
- True
-
-
+++ /dev/null
-# -*- coding: utf-8 -*-
-
-
-# probability.doctest uses HMM which requires numpy;
-# skip probability.doctest if numpy is not available
-
-
-def setup_module(module):
- from nose import SkipTest
-
- try:
- import numpy
- except ImportError:
- raise SkipTest("probability.doctest requires numpy")
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-========
-PropBank
-========
-
-The PropBank Corpus provides predicate-argument annotation for the
-entire Penn Treebank. Each verb in the treebank is annotated by a single
-instance in PropBank, containing information about the location of
-the verb, and the location and identity of its arguments:
-
- >>> from nltk.corpus import propbank
- >>> pb_instances = propbank.instances()
- >>> print(pb_instances) # doctest: +NORMALIZE_WHITESPACE
- [<PropbankInstance: wsj_0001.mrg, sent 0, word 8>,
- <PropbankInstance: wsj_0001.mrg, sent 1, word 10>, ...]
-
-Each propbank instance defines the following member variables:
-
- - Location information: `fileid`, `sentnum`, `wordnum`
- - Annotator information: `tagger`
- - Inflection information: `inflection`
- - Roleset identifier: `roleset`
- - Verb (aka predicate) location: `predicate`
- - Argument locations and types: `arguments`
-
-The following examples show the types of these arguments:
-
- >>> inst = pb_instances[103]
- >>> (inst.fileid, inst.sentnum, inst.wordnum)
- ('wsj_0004.mrg', 8, 16)
- >>> inst.tagger
- 'gold'
- >>> inst.inflection
- <PropbankInflection: vp--a>
- >>> infl = inst.inflection
- >>> infl.form, infl.tense, infl.aspect, infl.person, infl.voice
- ('v', 'p', '-', '-', 'a')
- >>> inst.roleset
- 'rise.01'
- >>> inst.predicate
- PropbankTreePointer(16, 0)
- >>> inst.arguments # doctest: +NORMALIZE_WHITESPACE
- ((PropbankTreePointer(0, 2), 'ARG1'),
- (PropbankTreePointer(13, 1), 'ARGM-DIS'),
- (PropbankTreePointer(17, 1), 'ARG4-to'),
- (PropbankTreePointer(20, 1), 'ARG3-from'))
-
-The location of the predicate and of the arguments are encoded using
-`PropbankTreePointer` objects, as well as `PropbankChainTreePointer`
-objects and `PropbankSplitTreePointer` objects. A
-`PropbankTreePointer` consists of a `wordnum` and a `height`:
-
- >>> print(inst.predicate.wordnum, inst.predicate.height)
- 16 0
-
-This identifies the tree constituent that is headed by the word that
-is the `wordnum`\ 'th token in the sentence, and whose span is found
-by going `height` nodes up in the tree. This type of pointer is only
-useful if we also have the corresponding tree structure, since it
-includes empty elements such as traces in the word number count. The
-trees for 10% of the standard PropBank Corpus are contained in the
-`treebank` corpus:
-
- >>> tree = inst.tree
-
- >>> from nltk.corpus import treebank
- >>> assert tree == treebank.parsed_sents(inst.fileid)[inst.sentnum]
-
- >>> inst.predicate.select(tree)
- Tree('VBD', ['rose'])
- >>> for (argloc, argid) in inst.arguments:
- ... print('%-10s %s' % (argid, argloc.select(tree).pformat(500)[:50]))
- ARG1 (NP-SBJ (NP (DT The) (NN yield)) (PP (IN on) (NP (
- ARGM-DIS (PP (IN for) (NP (NN example)))
- ARG4-to (PP-DIR (TO to) (NP (CD 8.04) (NN %)))
- ARG3-from (PP-DIR (IN from) (NP (CD 7.90) (NN %)))
-
-Propbank tree pointers can be converted to standard tree locations,
-which are usually easier to work with, using the `treepos()` method:
-
- >>> treepos = inst.predicate.treepos(tree)
- >>> print(treepos, tree[treepos])
- (4, 0) (VBD rose)
-
-In some cases, argument locations will be encoded using
-`PropbankChainTreePointer`\ s (for trace chains) or
-`PropbankSplitTreePointer`\ s (for discontinuous constituents). Both
-of these objects contain a single member variable, `pieces`,
-containing a list of the constituent pieces. They also define the
-method `select()`, which will return a tree containing all the
-elements of the argument. (A new head node is created, labeled
-"*CHAIN*" or "*SPLIT*", since the argument is not a single constituent
-in the original tree). Sentence #6 contains an example of an argument
-that is both discontinuous and contains a chain:
-
- >>> inst = pb_instances[6]
- >>> inst.roleset
- 'expose.01'
- >>> argloc, argid = inst.arguments[2]
- >>> argloc
- <PropbankChainTreePointer: 22:1,24:0,25:1*27:0>
- >>> argloc.pieces
- [<PropbankSplitTreePointer: 22:1,24:0,25:1>, PropbankTreePointer(27, 0)]
- >>> argloc.pieces[0].pieces
- ... # doctest: +NORMALIZE_WHITESPACE
- [PropbankTreePointer(22, 1), PropbankTreePointer(24, 0),
- PropbankTreePointer(25, 1)]
- >>> print(argloc.select(inst.tree))
- (*CHAIN*
- (*SPLIT* (NP (DT a) (NN group)) (IN of) (NP (NNS workers)))
- (-NONE- *))
-
-The PropBank Corpus also provides access to the frameset files, which
-define the argument labels used by the annotations, on a per-verb
-basis. Each frameset file contains one or more predicates, such as
-'turn' or 'turn_on', each of which is divided into coarse-grained word
-senses called rolesets. For each roleset, the frameset file provides
-descriptions of the argument roles, along with examples.
-
- >>> expose_01 = propbank.roleset('expose.01')
- >>> turn_01 = propbank.roleset('turn.01')
- >>> print(turn_01) # doctest: +ELLIPSIS
- <Element 'roleset' at ...>
- >>> for role in turn_01.findall("roles/role"):
- ... print(role.attrib['n'], role.attrib['descr'])
- 0 turner
- 1 thing turning
- m direction, location
-
- >>> from xml.etree import ElementTree
- >>> print(ElementTree.tostring(turn_01.find('example')).decode('utf8').strip())
- <example name="transitive agentive">
- <text>
- John turned the key in the lock.
- </text>
- <arg n="0">John</arg>
- <rel>turned</rel>
- <arg n="1">the key</arg>
- <arg f="LOC" n="m">in the lock</arg>
- </example>
-
-Note that the standard corpus distribution only contains 10% of the
-treebank, so the parse trees are not available for instances starting
-at 9353:
-
- >>> inst = pb_instances[9352]
- >>> inst.fileid
- 'wsj_0199.mrg'
- >>> print(inst.tree) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
- (S (NP-SBJ (NNP Trinity)) (VP (VBD said) (SBAR (-NONE- 0) ...))
- >>> print(inst.predicate.select(inst.tree))
- (VB begin)
-
- >>> inst = pb_instances[9353]
- >>> inst.fileid
- 'wsj_0200.mrg'
- >>> print(inst.tree)
- None
- >>> print(inst.predicate.select(inst.tree))
- Traceback (most recent call last):
- . . .
- ValueError: Parse tree not avaialable
-
-However, if you supply your own version of the treebank corpus (by
-putting it before the nltk-provided version on `nltk.data.path`, or
-by creating a `ptb` directory as described above and using the
-`propbank_ptb` module), then you can access the trees for all
-instances.
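-
-For example (the path here is purely hypothetical; point it at your own copy
-of the corpus):
-
- >>> import nltk
- >>> nltk.data.path.insert(0, '/path/to/your/corpora')  # doctest: +SKIP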
-
-A list of the verb lemmas contained in PropBank is returned by the
-`propbank.verbs()` method:
-
- >>> propbank.verbs()
- ['abandon', 'abate', 'abdicate', 'abet', 'abide', ...]
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-======================
-Information Extraction
-======================
-
-Information Extraction standardly consists of three subtasks:
-
-#. Named Entity Recognition
-
-#. Relation Extraction
-
-#. Template Filling
-
-Named Entities
-~~~~~~~~~~~~~~
-
-The IEER corpus is marked up for a variety of Named Entities. A `Named
-Entity`:dt: (more strictly, a Named Entity mention) is a name of an
-entity belonging to a specified class. For example, the Named Entity
-classes in IEER include PERSON, LOCATION, ORGANIZATION, DATE and so
-on. Within NLTK, Named Entities are represented as subtrees within a
-chunk structure: the class name is treated as node label, while the
-entity mention itself appears as the leaves of the subtree. This is
-illustrated below, where we show an extract of the chunk
-representation of document NYT_19980315.064:
-
- >>> from nltk.corpus import ieer
- >>> docs = ieer.parsed_docs('NYT_19980315')
- >>> tree = docs[1].text
- >>> print(tree) # doctest: +ELLIPSIS
- (DOCUMENT
- ...
- ``It's
- a
- chance
- to
- think
- about
- first-level
- questions,''
- said
- Ms.
- (PERSON Cohn)
- ,
- a
- partner
- in
- the
- (ORGANIZATION McGlashan & Sarrail)
- firm
- in
- (LOCATION San Mateo)
- ,
- (LOCATION Calif.)
- ...)
-
-Thus, the Named Entity mentions in this example are *Cohn*, *McGlashan &
-Sarrail*, *San Mateo* and *Calif.*.
-
-The CoNLL2002 Dutch and Spanish data is treated similarly, although in
-this case, the strings are also POS tagged.
-
- >>> from nltk.corpus import conll2002
- >>> for doc in conll2002.chunked_sents('ned.train')[27]:
- ... print(doc)
- ('Het', 'Art')
- (ORG Hof/N van/Prep Cassatie/N)
- ('verbrak', 'V')
- ('het', 'Art')
- ('arrest', 'N')
- ('zodat', 'Conj')
- ('het', 'Pron')
- ('moest', 'V')
- ('worden', 'V')
- ('overgedaan', 'V')
- ('door', 'Prep')
- ('het', 'Art')
- ('hof', 'N')
- ('van', 'Prep')
- ('beroep', 'N')
- ('van', 'Prep')
- (LOC Antwerpen/N)
- ('.', 'Punc')
-
-Relation Extraction
-~~~~~~~~~~~~~~~~~~~
-
-Relation Extraction standardly consists of identifying specified
-relations between Named Entities. For example, assuming that we can
-recognize ORGANIZATIONs and LOCATIONs in text, we might want to also
-recognize pairs *(o, l)* of these kinds of entities such that *o* is
-located in *l*.
-
-The `sem.relextract` module provides some tools to help carry out a
-simple version of this task. The `tree2semi_rel()` function splits a chunk
-document into a list of two-member lists, each of which consists of a
-(possibly empty) string followed by a `Tree` (i.e., a Named Entity):
-
- >>> from nltk.sem import relextract
- >>> pairs = relextract.tree2semi_rel(tree)
- >>> for s, tree in pairs[18:22]:
- ... print('("...%s", %s)' % (" ".join(s[-5:]),tree))
- ("...about first-level questions,'' said Ms.", (PERSON Cohn))
- ("..., a partner in the", (ORGANIZATION McGlashan & Sarrail))
- ("...firm in", (LOCATION San Mateo))
- ("...,", (LOCATION Calif.))
-
-The function `semi_rel2reldict()` processes triples of these pairs, i.e.,
-pairs of the form ``((string1, Tree1), (string2, Tree2), (string3,
-Tree3))`` and outputs a dictionary (a `reldict`) in which ``Tree1`` is
-the subject of the relation, ``string2`` is the filler
-and ``Tree3`` is the object of the relation. ``string1`` and ``string3`` are
-stored as left and right context respectively.
-
- >>> reldicts = relextract.semi_rel2reldict(pairs)
- >>> for k, v in sorted(reldicts[0].items()):
- ... print(k, '=>', v) # doctest: +ELLIPSIS
- filler => of messages to their own ``Cyberia'' ...
- lcon => transactions.'' Each week, they post
- objclass => ORGANIZATION
- objsym => white_house
- objtext => White House
- rcon => for access to its planned
- subjclass => CARDINAL
- subjsym => hundreds
- subjtext => hundreds
- untagged_filler => of messages to their own ``Cyberia'' ...
-
-The next example shows some of the values for two `reldict`\ s
-corresponding to the ``'NYT_19980315'`` text extract shown earlier.
-
- >>> for r in reldicts[18:20]:
- ... print('=' * 20)
- ... print(r['subjtext'])
- ... print(r['filler'])
- ... print(r['objtext'])
- ====================
- Cohn
- , a partner in the
- McGlashan & Sarrail
- ====================
- McGlashan & Sarrail
- firm in
- San Mateo
-
-The function `relextract()` allows us to filter the `reldict`\ s
-according to the classes of the subject and object named entities. In
-addition, we can specify that the filler text has to match a given
-regular expression, as illustrated in the next example. Here, we are
-looking for pairs of entities in the IN relation, where IN has
-signature <ORG, LOC>.
-
- >>> import re
- >>> IN = re.compile(r'.*\bin\b(?!\b.+ing\b)')
- >>> for fileid in ieer.fileids():
- ... for doc in ieer.parsed_docs(fileid):
- ... for rel in relextract.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern = IN):
- ... print(relextract.rtuple(rel)) # doctest: +ELLIPSIS
- [ORG: 'Christian Democrats'] ', the leading political forces in' [LOC: 'Italy']
- [ORG: 'AP'] ') _ Lebanese guerrillas attacked Israeli forces in southern' [LOC: 'Lebanon']
- [ORG: 'Security Council'] 'adopted Resolution 425. Huge yellow banners hung across intersections in' [LOC: 'Beirut']
- [ORG: 'U.N.'] 'failures in' [LOC: 'Africa']
- [ORG: 'U.N.'] 'peacekeeping operation in' [LOC: 'Somalia']
- [ORG: 'U.N.'] 'partners on a more effective role in' [LOC: 'Africa']
- [ORG: 'AP'] ') _ A bomb exploded in a mosque in central' [LOC: 'San`a']
- [ORG: 'Krasnoye Sormovo'] 'shipyard in the Soviet city of' [LOC: 'Gorky']
- [ORG: 'Kelab Golf Darul Ridzuan'] 'in' [LOC: 'Perak']
- [ORG: 'U.N.'] 'peacekeeping operation in' [LOC: 'Somalia']
- [ORG: 'WHYY'] 'in' [LOC: 'Philadelphia']
- [ORG: 'McGlashan & Sarrail'] 'firm in' [LOC: 'San Mateo']
- [ORG: 'Freedom Forum'] 'in' [LOC: 'Arlington']
- [ORG: 'Brookings Institution'] ', the research group in' [LOC: 'Washington']
- [ORG: 'Idealab'] ', a self-described business incubator based in' [LOC: 'Los Angeles']
- [ORG: 'Open Text'] ', based in' [LOC: 'Waterloo']
- ...
-
-The next example illustrates a case where the pattern is a disjunction
-of roles that a PERSON can occupy in an ORGANIZATION.
-
- >>> roles = """
- ... (.*(
- ... analyst|
- ... chair(wo)?man|
- ... commissioner|
- ... counsel|
- ... director|
- ... economist|
- ... editor|
- ... executive|
- ... foreman|
- ... governor|
- ... head|
- ... lawyer|
- ... leader|
- ... librarian).*)|
- ... manager|
- ... partner|
- ... president|
- ... producer|
- ... professor|
- ... researcher|
- ... spokes(wo)?man|
- ... writer|
- ... ,\sof\sthe?\s* # "X, of (the) Y"
- ... """
- >>> ROLES = re.compile(roles, re.VERBOSE)
- >>> for fileid in ieer.fileids():
- ... for doc in ieer.parsed_docs(fileid):
- ... for rel in relextract.extract_rels('PER', 'ORG', doc, corpus='ieer', pattern=ROLES):
- ... print(relextract.rtuple(rel)) # doctest: +ELLIPSIS
- [PER: 'Kivutha Kibwana'] ', of the' [ORG: 'National Convention Assembly']
- [PER: 'Boban Boskovic'] ', chief executive of the' [ORG: 'Plastika']
- [PER: 'Annan'] ', the first sub-Saharan African to head the' [ORG: 'United Nations']
- [PER: 'Kiriyenko'] 'became a foreman at the' [ORG: 'Krasnoye Sormovo']
- [PER: 'Annan'] ', the first sub-Saharan African to head the' [ORG: 'United Nations']
- [PER: 'Mike Godwin'] ', chief counsel for the' [ORG: 'Electronic Frontier Foundation']
- ...
-
-In the case of the CoNLL2002 data, we can include POS tags in the
-query pattern. This example also illustrates how the output can be
-presented as something that looks more like a clause in a logical language.
-
- >>> de = """
- ... .*
- ... (
- ... de/SP|
- ... del/SP
- ... )
- ... """
- >>> DE = re.compile(de, re.VERBOSE)
- >>> rels = [rel for doc in conll2002.chunked_sents('esp.train')
- ... for rel in relextract.extract_rels('ORG', 'LOC', doc, corpus='conll2002', pattern = DE)]
- >>> for r in rels[:10]:
- ... print(relextract.clause(r, relsym='DE')) # doctest: +NORMALIZE_WHITESPACE
- DE('tribunal_supremo', 'victoria')
- DE('museo_de_arte', 'alcorc\xf3n')
- DE('museo_de_bellas_artes', 'a_coru\xf1a')
- DE('siria', 'l\xedbano')
- DE('uni\xf3n_europea', 'pek\xedn')
- DE('ej\xe9rcito', 'rogberi')
- DE('juzgado_de_instrucci\xf3n_n\xfamero_1', 'san_sebasti\xe1n')
- DE('psoe', 'villanueva_de_la_serena')
- DE('ej\xe9rcito', 'l\xedbano')
- DE('juzgado_de_lo_penal_n\xfamero_2', 'ceuta')
- >>> vnv = """
- ... (
- ... is/V|
- ... was/V|
- ... werd/V|
- ... wordt/V
- ... )
- ... .*
- ... van/Prep
- ... """
- >>> VAN = re.compile(vnv, re.VERBOSE)
- >>> for doc in conll2002.chunked_sents('ned.train'):
- ... for r in relextract.extract_rels('PER', 'ORG', doc, corpus='conll2002', pattern=VAN):
- ... print(relextract.clause(r, relsym="VAN"))
- VAN("cornet_d'elzius", 'buitenlandse_handel')
- VAN('johan_rottiers', 'kardinaal_van_roey_instituut')
- VAN('annie_lennox', 'eurythmics')
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-=========================
-Resolution Theorem Prover
-=========================
-
- >>> from nltk.inference.resolution import *
- >>> from nltk.sem import logic
- >>> from nltk.sem.logic import *
- >>> logic._counter._value = 0
- >>> read_expr = logic.Expression.fromstring
-
- >>> P = read_expr('P')
- >>> Q = read_expr('Q')
- >>> R = read_expr('R')
- >>> A = read_expr('A')
- >>> B = read_expr('B')
- >>> x = read_expr('x')
- >>> y = read_expr('y')
- >>> z = read_expr('z')
-
--------------------------------
-Test most_general_unification()
--------------------------------
- >>> print(most_general_unification(x, x))
- {}
- >>> print(most_general_unification(A, A))
- {}
- >>> print(most_general_unification(A, x))
- {x: A}
- >>> print(most_general_unification(x, A))
- {x: A}
- >>> print(most_general_unification(x, y))
- {x: y}
- >>> print(most_general_unification(P(x), P(A)))
- {x: A}
- >>> print(most_general_unification(P(x,B), P(A,y)))
- {x: A, y: B}
- >>> print(most_general_unification(P(x,B), P(B,x)))
- {x: B}
- >>> print(most_general_unification(P(x,y), P(A,x)))
- {x: A, y: x}
- >>> print(most_general_unification(P(Q(x)), P(y)))
- {y: Q(x)}
-
-------------
-Test unify()
-------------
- >>> print(Clause([]).unify(Clause([])))
- []
- >>> print(Clause([P(x)]).unify(Clause([-P(A)])))
- [{}]
- >>> print(Clause([P(A), Q(x)]).unify(Clause([-P(x), R(x)])))
- [{R(A), Q(A)}]
- >>> print(Clause([P(A), Q(x), R(x,y)]).unify(Clause([-P(x), Q(y)])))
- [{Q(y), Q(A), R(A,y)}]
- >>> print(Clause([P(A), -Q(y)]).unify(Clause([-P(x), Q(B)])))
- [{}]
- >>> print(Clause([P(x), Q(x)]).unify(Clause([-P(A), -Q(B)])))
- [{-Q(B), Q(A)}, {-P(A), P(B)}]
- >>> print(Clause([P(x,x), Q(x), R(x)]).unify(Clause([-P(A,z), -Q(B)])))
- [{-Q(B), Q(A), R(A)}, {-P(A,z), R(B), P(B,B)}]
-
- >>> a = clausify(read_expr('P(A)'))
- >>> b = clausify(read_expr('A=B'))
- >>> print(a[0].unify(b[0]))
- [{P(B)}]
-
--------------------------
-Test is_tautology()
--------------------------
- >>> print(Clause([P(A), -P(A)]).is_tautology())
- True
- >>> print(Clause([-P(A), P(A)]).is_tautology())
- True
- >>> print(Clause([P(x), -P(A)]).is_tautology())
- False
- >>> print(Clause([Q(B), -P(A), P(A)]).is_tautology())
- True
- >>> print(Clause([-Q(A), P(R(A)), -P(R(A)), Q(x), -R(y)]).is_tautology())
- True
- >>> print(Clause([P(x), -Q(A)]).is_tautology())
- False
-
--------------------------
-Test subsumes()
--------------------------
- >>> print(Clause([P(A), Q(B)]).subsumes(Clause([P(A), Q(B)])))
- True
- >>> print(Clause([-P(A)]).subsumes(Clause([P(A)])))
- False
- >>> print(Clause([P(A), Q(B)]).subsumes(Clause([Q(B), P(A)])))
- True
- >>> print(Clause([P(A), Q(B)]).subsumes(Clause([Q(B), R(A), P(A)])))
- True
- >>> print(Clause([P(A), R(A), Q(B)]).subsumes(Clause([Q(B), P(A)])))
- False
- >>> print(Clause([P(x)]).subsumes(Clause([P(A)])))
- True
- >>> print(Clause([P(A)]).subsumes(Clause([P(x)])))
- True
-
-------------
-Test prove()
-------------
- >>> print(ResolutionProverCommand(read_expr('man(x)')).prove())
- False
- >>> print(ResolutionProverCommand(read_expr('(man(x) -> man(x))')).prove())
- True
- >>> print(ResolutionProverCommand(read_expr('(man(x) -> --man(x))')).prove())
- True
- >>> print(ResolutionProverCommand(read_expr('-(man(x) & -man(x))')).prove())
- True
- >>> print(ResolutionProverCommand(read_expr('(man(x) | -man(x))')).prove())
- True
- >>> print(ResolutionProverCommand(read_expr('(man(x) <-> man(x))')).prove())
- True
- >>> print(ResolutionProverCommand(read_expr('-(man(x) <-> -man(x))')).prove())
- True
- >>> print(ResolutionProverCommand(read_expr('all x.man(x)')).prove())
- False
- >>> print(ResolutionProverCommand(read_expr('-all x.some y.F(x,y) & some x.all y.(-F(x,y))')).prove())
- False
- >>> print(ResolutionProverCommand(read_expr('some x.all y.sees(x,y)')).prove())
- False
-
- >>> p1 = read_expr('all x.(man(x) -> mortal(x))')
- >>> p2 = read_expr('man(Socrates)')
- >>> c = read_expr('mortal(Socrates)')
- >>> ResolutionProverCommand(c, [p1,p2]).prove()
- True
-
- >>> p1 = read_expr('all x.(man(x) -> walks(x))')
- >>> p2 = read_expr('man(John)')
- >>> c = read_expr('some y.walks(y)')
- >>> ResolutionProverCommand(c, [p1,p2]).prove()
- True
-
- >>> p = read_expr('some e1.some e2.(believe(e1,john,e2) & walk(e2,mary))')
- >>> c = read_expr('some e0.walk(e0,mary)')
- >>> ResolutionProverCommand(c, [p]).prove()
- True
-
-------------
-Test proof()
-------------
- >>> p1 = read_expr('all x.(man(x) -> mortal(x))')
- >>> p2 = read_expr('man(Socrates)')
- >>> c = read_expr('mortal(Socrates)')
- >>> logic._counter._value = 0
- >>> tp = ResolutionProverCommand(c, [p1,p2])
- >>> tp.prove()
- True
- >>> print(tp.proof())
- [1] {-mortal(Socrates)} A
- [2] {-man(z2), mortal(z2)} A
- [3] {man(Socrates)} A
- [4] {-man(Socrates)} (1, 2)
- [5] {mortal(Socrates)} (2, 3)
- [6] {} (1, 5)
- <BLANKLINE>
-
-------------------
-Question Answering
-------------------
-One answer
- >>> p1 = read_expr('father_of(art,john)')
- >>> p2 = read_expr('father_of(bob,kim)')
- >>> p3 = read_expr('all x.all y.(father_of(x,y) -> parent_of(x,y))')
- >>> c = read_expr('all x.(parent_of(x,john) -> ANSWER(x))')
- >>> logic._counter._value = 0
- >>> tp = ResolutionProverCommand(None, [p1,p2,p3,c])
- >>> sorted(tp.find_answers())
- [<ConstantExpression art>]
- >>> print(tp.proof()) # doctest: +SKIP
- [1] {father_of(art,john)} A
- [2] {father_of(bob,kim)} A
- [3] {-father_of(z3,z4), parent_of(z3,z4)} A
- [4] {-parent_of(z6,john), ANSWER(z6)} A
- [5] {parent_of(art,john)} (1, 3)
- [6] {parent_of(bob,kim)} (2, 3)
- [7] {ANSWER(z6), -father_of(z6,john)} (3, 4)
- [8] {ANSWER(art)} (1, 7)
- [9] {ANSWER(art)} (4, 5)
- <BLANKLINE>
-
-Multiple answers
- >>> p1 = read_expr('father_of(art,john)')
- >>> p2 = read_expr('mother_of(ann,john)')
- >>> p3 = read_expr('all x.all y.(father_of(x,y) -> parent_of(x,y))')
- >>> p4 = read_expr('all x.all y.(mother_of(x,y) -> parent_of(x,y))')
- >>> c = read_expr('all x.(parent_of(x,john) -> ANSWER(x))')
- >>> logic._counter._value = 0
- >>> tp = ResolutionProverCommand(None, [p1,p2,p3,p4,c])
- >>> sorted(tp.find_answers())
- [<ConstantExpression ann>, <ConstantExpression art>]
- >>> print(tp.proof()) # doctest: +SKIP
- [ 1] {father_of(art,john)} A
- [ 2] {mother_of(ann,john)} A
- [ 3] {-father_of(z3,z4), parent_of(z3,z4)} A
- [ 4] {-mother_of(z7,z8), parent_of(z7,z8)} A
- [ 5] {-parent_of(z10,john), ANSWER(z10)} A
- [ 6] {parent_of(art,john)} (1, 3)
- [ 7] {parent_of(ann,john)} (2, 4)
- [ 8] {ANSWER(z10), -father_of(z10,john)} (3, 5)
- [ 9] {ANSWER(art)} (1, 8)
- [10] {ANSWER(z10), -mother_of(z10,john)} (4, 5)
- [11] {ANSWER(ann)} (2, 10)
- [12] {ANSWER(art)} (5, 6)
- [13] {ANSWER(ann)} (5, 7)
- <BLANKLINE>
-
+++ /dev/null
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-import sys
-import os
-import nose
-from nose.plugins.manager import PluginManager
-from nose.plugins.doctests import Doctest
-from nose.plugins import builtin
-
-NLTK_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
-sys.path.insert(0, NLTK_ROOT)
-
-NLTK_TEST_DIR = os.path.join(NLTK_ROOT, "nltk")
-
-if __name__ == "__main__":
- # there shouldn't be any imports from NLTK here, for coverage to work properly
- try:
- # Import RedNose plugin for colored test output
- from rednose import RedNose
-
- rednose_available = True
- except ImportError:
- rednose_available = False
-
- class NltkPluginManager(PluginManager):
- """
- Nose plugin manager that loads the builtin nose plugins and, when
- available, adds the RedNose plugin for colored test output.
- """
-
- def loadPlugins(self):
- for plug in builtin.plugins:
- self.addPlugin(plug())
- if rednose_available:
- self.addPlugin(RedNose())
-
- super(NltkPluginManager, self).loadPlugins()
-
- manager = NltkPluginManager()
- manager.loadPlugins()
-
- # allow passing extra options and running individual tests
- # Examples:
- #
- # python runtests.py semantics.doctest
- # python runtests.py --with-id -v
- # python runtests.py --with-id -v nltk.featstruct
-
- args = sys.argv[1:]
- if not args:
- args = [NLTK_TEST_DIR]
-
- if all(arg.startswith("-") for arg in args):
- # only extra options were passed
- args += [NLTK_TEST_DIR]
-
- # Activate RedNose and hide skipped test messages from output
- if rednose_available:
- args += ["--rednose", "--hide-skips"]
-
- arguments = [
- "--exclude=", # why is this needed?
- # '--with-xunit',
- # '--xunit-file=$WORKSPACE/nosetests.xml',
- # '--nocapture',
- "--with-doctest",
- # '--doctest-tests',
- # '--debug=nose,nose.importer,nose.inspector,nose.plugins,nose.result,nose.selector',
- "--doctest-extension=.doctest",
- "--doctest-fixtures=_fixt",
- "--doctest-options=+ELLIPSIS,+NORMALIZE_WHITESPACE,+IGNORE_EXCEPTION_DETAIL",
- # '--verbosity=3',
- ] + args
-
- nose.main(argv=arguments, plugins=manager.plugins)
+++ /dev/null
-# -*- coding: utf-8 -*-
-
-
-# skip segmentation.doctest if numpy is not available
-def setup_module(module):
- from nose import SkipTest
-
- try:
- import numpy
- except ImportError:
- raise SkipTest("segmentation.doctest requires numpy")
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-=========
-Semantics
-=========
-
- >>> import nltk
- >>> from nltk.sem import Valuation, Model
- >>> v = [('adam', 'b1'), ('betty', 'g1'), ('fido', 'd1'),
- ... ('girl', set(['g1', 'g2'])), ('boy', set(['b1', 'b2'])),
- ... ('dog', set(['d1'])),
- ... ('love', set([('b1', 'g1'), ('b2', 'g2'), ('g1', 'b1'), ('g2', 'b1')]))]
- >>> val = Valuation(v)
- >>> dom = val.domain
- >>> m = Model(dom, val)
-
-Evaluation
-----------
-
-The top-level method of a ``Model`` instance is ``evaluate()``, which
-assigns a semantic value to expressions of the ``logic`` module, under
-an assignment ``g``:
-
- >>> dom = val.domain
- >>> g = nltk.sem.Assignment(dom)
- >>> m.evaluate('all x.(boy(x) -> - girl(x))', g)
- True
-
-
-``evaluate()`` calls a recursive function ``satisfy()``, which in turn
-calls a function ``i()`` to interpret non-logical constants and
-individual variables. ``i()`` delegates the interpretation of these to
-the model's ``Valuation`` and the variable assignment ``g``
-respectively. Any atomic expression which cannot be assigned a value
-by ``i`` raises an ``Undefined`` exception; this is caught by
-``evaluate``, which returns the string ``'Undefined'``.
-
- >>> m.evaluate('walk(adam)', g, trace=2)
- <BLANKLINE>
- 'walk(adam)' is undefined under M, g
- 'Undefined'
-
-Batch Processing
-----------------
-
-The utility functions ``interpret_sents()`` and ``evaluate_sents()`` are intended to
-help with processing multiple sentences. Here's an example of the first of these:
-
- >>> sents = ['Mary walks']
- >>> results = nltk.sem.util.interpret_sents(sents, 'grammars/sample_grammars/sem2.fcfg')
- >>> for result in results:
- ... for (synrep, semrep) in result:
- ... print(synrep)
- (S[SEM=<walk(mary)>]
- (NP[-LOC, NUM='sg', SEM=<\P.P(mary)>]
- (PropN[-LOC, NUM='sg', SEM=<\P.P(mary)>] Mary))
- (VP[NUM='sg', SEM=<\x.walk(x)>]
- (IV[NUM='sg', SEM=<\x.walk(x)>, TNS='pres'] walks)))
-
-For backwards compatibility with 'legacy' grammars, where the semantic value
-is specified with a lowercase ``sem`` feature, the relevant feature name can
-be passed to the function via the ``semkey`` parameter, as shown here:
-
- >>> sents = ['raining']
- >>> g = nltk.grammar.FeatureGrammar.fromstring("""
- ... % start S
- ... S[sem=<raining>] -> 'raining'
- ... """)
- >>> results = nltk.sem.util.interpret_sents(sents, g, semkey='sem')
- >>> for result in results:
- ... for (synrep, semrep) in result:
- ... print(semrep)
- raining
-
-The function ``evaluate_sents()`` works in a similar manner, but also needs to be
-passed a ``Model`` against which the semantic representations are evaluated.
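-
-As a minimal sketch (assuming a suitable ``Model`` ``m`` and ``Assignment`` ``g``
-are in scope; the truth values depend on the model supplied, so the output is
-not checked here):
-
- >>> results = nltk.sem.util.evaluate_sents(['Mary walks'], 'grammars/sample_grammars/sem2.fcfg', m, g)  # doctest: +SKIP
- >>> for result in results:  # doctest: +SKIP
- ...     for (synrep, semrep, value) in result:
- ...         print('%s: %s' % (semrep, value))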
-
-Unit Tests
-==========
-
-
-Unit tests for relations and valuations
----------------------------------------
-
- >>> from nltk.sem import *
-
-Relations are sets of tuples, all of the same length.
-
- >>> s1 = set([('d1', 'd2'), ('d1', 'd1'), ('d2', 'd1')])
- >>> is_rel(s1)
- True
- >>> s2 = set([('d1', 'd2'), ('d1', 'd2'), ('d1',)])
- >>> is_rel(s2)
- Traceback (most recent call last):
- . . .
- ValueError: Set set([('d1', 'd2'), ('d1',)]) contains sequences of different lengths
- >>> s3 = set(['d1', 'd2'])
- >>> is_rel(s3)
- Traceback (most recent call last):
- . . .
- ValueError: Set set(['d2', 'd1']) contains sequences of different lengths
- >>> s4 = set2rel(s3)
- >>> is_rel(s4)
- True
- >>> is_rel(set())
- True
- >>> null_binary_rel = set([(None, None)])
- >>> is_rel(null_binary_rel)
- True
-
-Sets of entities are converted into sets of singleton tuples
-(containing strings).
-
- >>> sorted(set2rel(s3))
- [('d1',), ('d2',)]
- >>> sorted(set2rel(set([1,3,5,])))
- ['1', '3', '5']
- >>> set2rel(set()) == set()
- True
- >>> set2rel(set2rel(s3)) == set2rel(s3)
- True
-
-Predication is evaluated by set membership.
-
- >>> ('d1', 'd2') in s1
- True
- >>> ('d2', 'd2') in s1
- False
- >>> ('d1',) in s1
- False
- >>> 'd2' in s1
- False
- >>> ('d1',) in s4
- True
- >>> ('d1',) in set()
- False
- >>> 'd1' in null_binary_rel
- False
-
-
- >>> val = Valuation([('Fido', 'd1'), ('dog', set(['d1', 'd2'])), ('walk', set())])
- >>> sorted(val['dog'])
- [('d1',), ('d2',)]
- >>> val.domain == set(['d1', 'd2'])
- True
- >>> print(val.symbols)
- ['Fido', 'dog', 'walk']
-
-
-Parse a valuation from a string.
-
- >>> v = """
- ... john => b1
- ... mary => g1
- ... suzie => g2
- ... fido => d1
- ... tess => d2
- ... noosa => n
- ... girl => {g1, g2}
- ... boy => {b1, b2}
- ... dog => {d1, d2}
- ... bark => {d1, d2}
- ... walk => {b1, g2, d1}
- ... chase => {(b1, g1), (b2, g1), (g1, d1), (g2, d2)}
- ... see => {(b1, g1), (b2, d2), (g1, b1),(d2, b1), (g2, n)}
- ... in => {(b1, n), (b2, n), (d2, n)}
- ... with => {(b1, g1), (g1, b1), (d1, b1), (b1, d1)}
- ... """
- >>> val = Valuation.fromstring(v)
-
- >>> print(val) # doctest: +SKIP
- {'bark': set([('d1',), ('d2',)]),
- 'boy': set([('b1',), ('b2',)]),
- 'chase': set([('b1', 'g1'), ('g2', 'd2'), ('g1', 'd1'), ('b2', 'g1')]),
- 'dog': set([('d1',), ('d2',)]),
- 'fido': 'd1',
- 'girl': set([('g2',), ('g1',)]),
- 'in': set([('d2', 'n'), ('b1', 'n'), ('b2', 'n')]),
- 'john': 'b1',
- 'mary': 'g1',
- 'noosa': 'n',
- 'see': set([('b1', 'g1'), ('b2', 'd2'), ('d2', 'b1'), ('g2', 'n'), ('g1', 'b1')]),
- 'suzie': 'g2',
- 'tess': 'd2',
- 'walk': set([('d1',), ('b1',), ('g2',)]),
- 'with': set([('b1', 'g1'), ('d1', 'b1'), ('b1', 'd1'), ('g1', 'b1')])}
-
-
-Unit tests for function argument application in a Model
--------------------------------------------------------
-
- >>> v = [('adam', 'b1'), ('betty', 'g1'), ('fido', 'd1'),\
- ... ('girl', set(['g1', 'g2'])), ('boy', set(['b1', 'b2'])), ('dog', set(['d1'])),
- ... ('love', set([('b1', 'g1'), ('b2', 'g2'), ('g1', 'b1'), ('g2', 'b1')])),
- ... ('kiss', null_binary_rel)]
- >>> val = Valuation(v)
- >>> dom = val.domain
- >>> m = Model(dom, val)
- >>> g = Assignment(dom)
- >>> sorted(val['boy'])
- [('b1',), ('b2',)]
- >>> ('b1',) in val['boy']
- True
- >>> ('g1',) in val['boy']
- False
- >>> ('foo',) in val['boy']
- False
- >>> ('b1', 'g1') in val['love']
- True
- >>> ('b1', 'b1') in val['kiss']
- False
- >>> sorted(val.domain)
- ['b1', 'b2', 'd1', 'g1', 'g2']
-
-
-Model Tests
-===========
-
-Extension of Lambda expressions
-
- >>> v0 = [('adam', 'b1'), ('betty', 'g1'), ('fido', 'd1'),\
- ... ('girl', set(['g1', 'g2'])), ('boy', set(['b1', 'b2'])),
- ... ('dog', set(['d1'])),
- ... ('love', set([('b1', 'g1'), ('b2', 'g2'), ('g1', 'b1'), ('g2', 'b1')]))]
-
- >>> val0 = Valuation(v0)
- >>> dom0 = val0.domain
- >>> m0 = Model(dom0, val0)
- >>> g0 = Assignment(dom0)
-
- >>> print(m0.evaluate(r'\x. \y. love(x, y)', g0) == {'g2': {'g2': False, 'b2': False, 'b1': True, 'g1': False, 'd1': False}, 'b2': {'g2': True, 'b2': False, 'b1': False, 'g1': False, 'd1': False}, 'b1': {'g2': False, 'b2': False, 'b1': False, 'g1': True, 'd1': False}, 'g1': {'g2': False, 'b2': False, 'b1': True, 'g1': False, 'd1': False}, 'd1': {'g2': False, 'b2': False, 'b1': False, 'g1': False, 'd1': False}})
- True
- >>> print(m0.evaluate(r'\x. dog(x) (adam)', g0))
- False
- >>> print(m0.evaluate(r'\x. (dog(x) | boy(x)) (adam)', g0))
- True
- >>> print(m0.evaluate(r'\x. \y. love(x, y)(fido)', g0) == {'g2': False, 'b2': False, 'b1': False, 'g1': False, 'd1': False})
- True
- >>> print(m0.evaluate(r'\x. \y. love(x, y)(adam)', g0) == {'g2': False, 'b2': False, 'b1': False, 'g1': True, 'd1': False})
- True
- >>> print(m0.evaluate(r'\x. \y. love(x, y)(betty)', g0) == {'g2': False, 'b2': False, 'b1': True, 'g1': False, 'd1': False})
- True
- >>> print(m0.evaluate(r'\x. \y. love(x, y)(betty)(adam)', g0))
- True
- >>> print(m0.evaluate(r'\x. \y. love(x, y)(betty, adam)', g0))
- True
- >>> print(m0.evaluate(r'\y. \x. love(x, y)(fido)(adam)', g0))
- False
- >>> print(m0.evaluate(r'\y. \x. love(x, y)(betty, adam)', g0))
- True
- >>> print(m0.evaluate(r'\x. exists y. love(x, y)', g0) == {'g2': True, 'b2': True, 'b1': True, 'g1': True, 'd1': False})
- True
- >>> print(m0.evaluate(r'\z. adam', g0) == {'g2': 'b1', 'b2': 'b1', 'b1': 'b1', 'g1': 'b1', 'd1': 'b1'})
- True
- >>> print(m0.evaluate(r'\z. love(x, y)', g0) == {'g2': False, 'b2': False, 'b1': False, 'g1': False, 'd1': False})
- True
-
-
-Propositional Model Test
-------------------------
-
- >>> tests = [
- ... ('P & Q', True),
- ... ('P & R', False),
- ... ('- P', False),
- ... ('- R', True),
- ... ('- - P', True),
- ... ('- (P & R)', True),
- ... ('P | R', True),
- ... ('R | P', True),
- ... ('R | R', False),
- ... ('- P | R', False),
- ... ('P | - P', True),
- ... ('P -> Q', True),
- ... ('P -> R', False),
- ... ('R -> P', True),
- ... ('P <-> P', True),
- ... ('R <-> R', True),
- ... ('P <-> R', False),
- ... ]
- >>> val1 = Valuation([('P', True), ('Q', True), ('R', False)])
- >>> dom = set([])
- >>> m = Model(dom, val1)
- >>> g = Assignment(dom)
- >>> for (sent, testvalue) in tests:
- ... semvalue = m.evaluate(sent, g)
- ... if semvalue == testvalue:
- ... print('*', end=' ')
- * * * * * * * * * * * * * * * * *
-
-
-Test of i Function
-------------------
-
- >>> from nltk.sem import Expression
- >>> v = [('adam', 'b1'), ('betty', 'g1'), ('fido', 'd1'),
- ... ('girl', set(['g1', 'g2'])), ('boy', set(['b1', 'b2'])), ('dog', set(['d1'])),
- ... ('love', set([('b1', 'g1'), ('b2', 'g2'), ('g1', 'b1'), ('g2', 'b1')]))]
- >>> val = Valuation(v)
- >>> dom = val.domain
- >>> m = Model(dom, val)
- >>> g = Assignment(dom, [('x', 'b1'), ('y', 'g2')])
- >>> exprs = ['adam', 'girl', 'love', 'walks', 'x', 'y', 'z']
- >>> parsed_exprs = [Expression.fromstring(e) for e in exprs]
- >>> sorted_set = lambda x: sorted(x) if isinstance(x, set) else x
- >>> for parsed in parsed_exprs:
- ... try:
- ... print("'%s' gets value %s" % (parsed, sorted_set(m.i(parsed, g))))
- ... except Undefined:
- ... print("'%s' is Undefined" % parsed)
- 'adam' gets value b1
- 'girl' gets value [('g1',), ('g2',)]
- 'love' gets value [('b1', 'g1'), ('b2', 'g2'), ('g1', 'b1'), ('g2', 'b1')]
- 'walks' is Undefined
- 'x' gets value b1
- 'y' gets value g2
- 'z' is Undefined
-
-Test for formulas in Model
---------------------------
-
- >>> tests = [
- ... ('love(adam, betty)', True),
- ... ('love(adam, sue)', 'Undefined'),
- ... ('dog(fido)', True),
- ... ('- dog(fido)', False),
- ... ('- - dog(fido)', True),
- ... ('- dog(sue)', 'Undefined'),
- ... ('dog(fido) & boy(adam)', True),
- ... ('- (dog(fido) & boy(adam))', False),
- ... ('- dog(fido) & boy(adam)', False),
- ... ('dog(fido) | boy(adam)', True),
- ... ('- (dog(fido) | boy(adam))', False),
- ... ('- dog(fido) | boy(adam)', True),
- ... ('- dog(fido) | - boy(adam)', False),
- ... ('dog(fido) -> boy(adam)', True),
- ... ('- (dog(fido) -> boy(adam))', False),
- ... ('- dog(fido) -> boy(adam)', True),
- ... ('exists x . love(adam, x)', True),
- ... ('all x . love(adam, x)', False),
- ... ('fido = fido', True),
- ... ('exists x . all y. love(x, y)', False),
- ... ('exists x . (x = fido)', True),
- ... ('all x . (dog(x) | - dog(x))', True),
- ... ('adam = mia', 'Undefined'),
- ... ('\\x. (boy(x) | girl(x))', {'g2': True, 'b2': True, 'b1': True, 'g1': True, 'd1': False}),
- ... ('\\x. exists y. (boy(x) & love(x, y))', {'g2': False, 'b2': True, 'b1': True, 'g1': False, 'd1': False}),
- ... ('exists z1. boy(z1)', True),
- ... ('exists x. (boy(x) & - (x = adam))', True),
- ... ('exists x. (boy(x) & all y. love(y, x))', False),
- ... ('all x. (boy(x) | girl(x))', False),
- ... ('all x. (girl(x) -> exists y. boy(y) & love(x, y))', False),
- ... ('exists x. (boy(x) & all y. (girl(y) -> love(y, x)))', True),
- ... ('exists x. (boy(x) & all y. (girl(y) -> love(x, y)))', False),
- ... ('all x. (dog(x) -> - girl(x))', True),
- ... ('exists x. exists y. (love(x, y) & love(x, y))', True),
- ... ]
- >>> for (sent, testvalue) in tests:
- ... semvalue = m.evaluate(sent, g)
- ... if semvalue == testvalue:
- ... print('*', end=' ')
- ... else:
- ... print(sent, semvalue)
- * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
-
-
-
-Satisfier Tests
----------------
-
- >>> formulas = [
- ... 'boy(x)',
- ... '(x = x)',
- ... '(boy(x) | girl(x))',
- ... '(boy(x) & girl(x))',
- ... 'love(adam, x)',
- ... 'love(x, adam)',
- ... '- (x = adam)',
- ... 'exists z22. love(x, z22)',
- ... 'exists y. love(y, x)',
- ... 'all y. (girl(y) -> love(x, y))',
- ... 'all y. (girl(y) -> love(y, x))',
- ... 'all y. (girl(y) -> (boy(x) & love(y, x)))',
- ... 'boy(x) & all y. (girl(y) -> love(x, y))',
- ... 'boy(x) & all y. (girl(y) -> love(y, x))',
- ... 'boy(x) & exists y. (girl(y) & love(y, x))',
- ... 'girl(x) -> dog(x)',
- ... 'all y. (dog(y) -> (x = y))',
- ... '- exists y. love(y, x)',
- ... 'exists y. (love(adam, y) & love(y, x))'
- ... ]
- >>> g.purge()
- >>> g.add('x', 'b1')
- {'x': 'b1'}
- >>> for f in formulas: # doctest: +NORMALIZE_WHITESPACE
- ... try:
- ... print("'%s' gets value: %s" % (f, m.evaluate(f, g)))
- ... except Undefined:
- ... print("'%s' is Undefined" % f)
- 'boy(x)' gets value: True
- '(x = x)' gets value: True
- '(boy(x) | girl(x))' gets value: True
- '(boy(x) & girl(x))' gets value: False
- 'love(adam, x)' gets value: False
- 'love(x, adam)' gets value: False
- '- (x = adam)' gets value: False
- 'exists z22. love(x, z22)' gets value: True
- 'exists y. love(y, x)' gets value: True
- 'all y. (girl(y) -> love(x, y))' gets value: False
- 'all y. (girl(y) -> love(y, x))' gets value: True
- 'all y. (girl(y) -> (boy(x) & love(y, x)))' gets value: True
- 'boy(x) & all y. (girl(y) -> love(x, y))' gets value: False
- 'boy(x) & all y. (girl(y) -> love(y, x))' gets value: True
- 'boy(x) & exists y. (girl(y) & love(y, x))' gets value: True
- 'girl(x) -> dog(x)' gets value: True
- 'all y. (dog(y) -> (x = y))' gets value: False
- '- exists y. love(y, x)' gets value: False
- 'exists y. (love(adam, y) & love(y, x))' gets value: True
-
- >>> from nltk.sem import Expression
- >>> for fmla in formulas: # doctest: +NORMALIZE_WHITESPACE
- ... p = Expression.fromstring(fmla)
- ... g.purge()
- ... print("Satisfiers of '%s':\n\t%s" % (p, sorted(m.satisfiers(p, 'x', g))))
- Satisfiers of 'boy(x)':
- ['b1', 'b2']
- Satisfiers of '(x = x)':
- ['b1', 'b2', 'd1', 'g1', 'g2']
- Satisfiers of '(boy(x) | girl(x))':
- ['b1', 'b2', 'g1', 'g2']
- Satisfiers of '(boy(x) & girl(x))':
- []
- Satisfiers of 'love(adam,x)':
- ['g1']
- Satisfiers of 'love(x,adam)':
- ['g1', 'g2']
- Satisfiers of '-(x = adam)':
- ['b2', 'd1', 'g1', 'g2']
- Satisfiers of 'exists z22.love(x,z22)':
- ['b1', 'b2', 'g1', 'g2']
- Satisfiers of 'exists y.love(y,x)':
- ['b1', 'g1', 'g2']
- Satisfiers of 'all y.(girl(y) -> love(x,y))':
- []
- Satisfiers of 'all y.(girl(y) -> love(y,x))':
- ['b1']
- Satisfiers of 'all y.(girl(y) -> (boy(x) & love(y,x)))':
- ['b1']
- Satisfiers of '(boy(x) & all y.(girl(y) -> love(x,y)))':
- []
- Satisfiers of '(boy(x) & all y.(girl(y) -> love(y,x)))':
- ['b1']
- Satisfiers of '(boy(x) & exists y.(girl(y) & love(y,x)))':
- ['b1']
- Satisfiers of '(girl(x) -> dog(x))':
- ['b1', 'b2', 'd1']
- Satisfiers of 'all y.(dog(y) -> (x = y))':
- ['d1']
- Satisfiers of '-exists y.love(y,x)':
- ['b2', 'd1']
- Satisfiers of 'exists y.(love(adam,y) & love(y,x))':
- ['b1']
-
-
-Tests based on the Blackburn & Bos testsuite
---------------------------------------------
-
- >>> v1 = [('jules', 'd1'), ('vincent', 'd2'), ('pumpkin', 'd3'),
- ... ('honey_bunny', 'd4'), ('yolanda', 'd5'),
- ... ('customer', set(['d1', 'd2'])),
- ... ('robber', set(['d3', 'd4'])),
- ... ('love', set([('d3', 'd4')]))]
- >>> val1 = Valuation(v1)
- >>> dom1 = val1.domain
- >>> m1 = Model(dom1, val1)
- >>> g1 = Assignment(dom1)
-
- >>> v2 = [('jules', 'd1'), ('vincent', 'd2'), ('pumpkin', 'd3'),
- ... ('honey_bunny', 'd4'), ('yolanda', 'd4'),
- ... ('customer', set(['d1', 'd2', 'd5', 'd6'])),
- ... ('robber', set(['d3', 'd4'])),
- ... ('love', set([(None, None)]))]
- >>> val2 = Valuation(v2)
- >>> dom2 = set(['d1', 'd2', 'd3', 'd4', 'd5', 'd6'])
- >>> m2 = Model(dom2, val2)
- >>> g2 = Assignment(dom2)
- >>> g21 = Assignment(dom2)
- >>> g21.add('y', 'd3')
- {'y': 'd3'}
-
- >>> v3 = [('mia', 'd1'), ('jody', 'd2'), ('jules', 'd3'),
- ... ('vincent', 'd4'),
- ... ('woman', set(['d1', 'd2'])), ('man', set(['d3', 'd4'])),
- ... ('joke', set(['d5', 'd6'])), ('episode', set(['d7', 'd8'])),
- ... ('in', set([('d5', 'd7'), ('d5', 'd8')])),
- ... ('tell', set([('d1', 'd5'), ('d2', 'd6')]))]
- >>> val3 = Valuation(v3)
- >>> dom3 = set(['d1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8'])
- >>> m3 = Model(dom3, val3)
- >>> g3 = Assignment(dom3)
-
- >>> tests = [
- ... ('exists x. robber(x)', m1, g1, True),
- ... ('exists x. exists y. love(y, x)', m1, g1, True),
- ... ('exists x0. exists x1. love(x1, x0)', m2, g2, False),
- ... ('all x. all y. love(y, x)', m2, g2, False),
- ... ('- (all x. all y. love(y, x))', m2, g2, True),
- ... ('all x. all y. - love(y, x)', m2, g2, True),
- ... ('yolanda = honey_bunny', m2, g2, True),
- ... ('mia = honey_bunny', m2, g2, 'Undefined'),
- ... ('- (yolanda = honey_bunny)', m2, g2, False),
- ... ('- (mia = honey_bunny)', m2, g2, 'Undefined'),
- ... ('all x. (robber(x) | customer(x))', m2, g2, True),
- ... ('- (all x. (robber(x) | customer(x)))', m2, g2, False),
- ... ('(robber(x) | customer(x))', m2, g2, 'Undefined'),
- ... ('(robber(y) | customer(y))', m2, g21, True),
- ... ('exists x. (man(x) & exists x. woman(x))', m3, g3, True),
- ... ('exists x. (man(x) & exists x. woman(x))', m3, g3, True),
- ... ('- exists x. woman(x)', m3, g3, False),
- ... ('exists x. (tasty(x) & burger(x))', m3, g3, 'Undefined'),
- ... ('- exists x. (tasty(x) & burger(x))', m3, g3, 'Undefined'),
- ... ('exists x. (man(x) & - exists y. woman(y))', m3, g3, False),
- ... ('exists x. (man(x) & - exists x. woman(x))', m3, g3, False),
- ... ('exists x. (woman(x) & - exists x. customer(x))', m2, g2, 'Undefined'),
- ... ]
-
- >>> for item in tests:
- ... sentence, model, g, testvalue = item
- ... semvalue = model.evaluate(sentence, g)
- ... if semvalue == testvalue:
- ... print('*', end=' ')
- ... g.purge()
- * * * * * * * * * * * * * * * * * * * * * *
-
-
-Tests for mapping from syntax to semantics
-------------------------------------------
-
-Load a valuation from a file.
-
- >>> import nltk.data
- >>> from nltk.sem.util import parse_sents
- >>> val = nltk.data.load('grammars/sample_grammars/valuation1.val')
- >>> dom = val.domain
- >>> m = Model(dom, val)
- >>> g = Assignment(dom)
- >>> gramfile = 'grammars/sample_grammars/sem2.fcfg'
- >>> inputs = ['John sees a girl', 'every dog barks']
- >>> parses = parse_sents(inputs, gramfile)
- >>> for sent, trees in zip(inputs, parses):
- ... print()
- ... print("Sentence: %s" % sent)
- ... for tree in trees:
- ... print("Parse:\n %s" %tree)
- ... print("Semantics: %s" % root_semrep(tree))
- <BLANKLINE>
- Sentence: John sees a girl
- Parse:
- (S[SEM=<exists x.(girl(x) & see(john,x))>]
- (NP[-LOC, NUM='sg', SEM=<\P.P(john)>]
- (PropN[-LOC, NUM='sg', SEM=<\P.P(john)>] John))
- (VP[NUM='sg', SEM=<\y.exists x.(girl(x) & see(y,x))>]
- (TV[NUM='sg', SEM=<\X y.X(\x.see(y,x))>, TNS='pres'] sees)
- (NP[NUM='sg', SEM=<\Q.exists x.(girl(x) & Q(x))>]
- (Det[NUM='sg', SEM=<\P Q.exists x.(P(x) & Q(x))>] a)
- (Nom[NUM='sg', SEM=<\x.girl(x)>]
- (N[NUM='sg', SEM=<\x.girl(x)>] girl)))))
- Semantics: exists x.(girl(x) & see(john,x))
- <BLANKLINE>
- Sentence: every dog barks
- Parse:
- (S[SEM=<all x.(dog(x) -> bark(x))>]
- (NP[NUM='sg', SEM=<\Q.all x.(dog(x) -> Q(x))>]
- (Det[NUM='sg', SEM=<\P Q.all x.(P(x) -> Q(x))>] every)
- (Nom[NUM='sg', SEM=<\x.dog(x)>]
- (N[NUM='sg', SEM=<\x.dog(x)>] dog)))
- (VP[NUM='sg', SEM=<\x.bark(x)>]
- (IV[NUM='sg', SEM=<\x.bark(x)>, TNS='pres'] barks)))
- Semantics: all x.(dog(x) -> bark(x))
-
- >>> sent = "every dog barks"
- >>> result = nltk.sem.util.interpret_sents([sent], gramfile)[0]
- >>> for (syntree, semrep) in result:
- ... print(syntree)
- ... print()
- ... print(semrep)
- (S[SEM=<all x.(dog(x) -> bark(x))>]
- (NP[NUM='sg', SEM=<\Q.all x.(dog(x) -> Q(x))>]
- (Det[NUM='sg', SEM=<\P Q.all x.(P(x) -> Q(x))>] every)
- (Nom[NUM='sg', SEM=<\x.dog(x)>]
- (N[NUM='sg', SEM=<\x.dog(x)>] dog)))
- (VP[NUM='sg', SEM=<\x.bark(x)>]
- (IV[NUM='sg', SEM=<\x.bark(x)>, TNS='pres'] barks)))
- <BLANKLINE>
- all x.(dog(x) -> bark(x))
-
- >>> result = nltk.sem.util.evaluate_sents([sent], gramfile, m, g)[0]
- >>> for (syntree, semrep, value) in result:
- ... print(syntree)
- ... print()
- ... print(semrep)
- ... print()
- ... print(value)
- (S[SEM=<all x.(dog(x) -> bark(x))>]
- (NP[NUM='sg', SEM=<\Q.all x.(dog(x) -> Q(x))>]
- (Det[NUM='sg', SEM=<\P Q.all x.(P(x) -> Q(x))>] every)
- (Nom[NUM='sg', SEM=<\x.dog(x)>]
- (N[NUM='sg', SEM=<\x.dog(x)>] dog)))
- (VP[NUM='sg', SEM=<\x.bark(x)>]
- (IV[NUM='sg', SEM=<\x.bark(x)>, TNS='pres'] barks)))
- <BLANKLINE>
- all x.(dog(x) -> bark(x))
- <BLANKLINE>
- True
-
- >>> sents = ['Mary walks', 'John sees a dog']
- >>> results = nltk.sem.util.interpret_sents(sents, 'grammars/sample_grammars/sem2.fcfg')
- >>> for result in results:
- ... for (synrep, semrep) in result:
- ... print(synrep)
- (S[SEM=<walk(mary)>]
- (NP[-LOC, NUM='sg', SEM=<\P.P(mary)>]
- (PropN[-LOC, NUM='sg', SEM=<\P.P(mary)>] Mary))
- (VP[NUM='sg', SEM=<\x.walk(x)>]
- (IV[NUM='sg', SEM=<\x.walk(x)>, TNS='pres'] walks)))
- (S[SEM=<exists x.(dog(x) & see(john,x))>]
- (NP[-LOC, NUM='sg', SEM=<\P.P(john)>]
- (PropN[-LOC, NUM='sg', SEM=<\P.P(john)>] John))
- (VP[NUM='sg', SEM=<\y.exists x.(dog(x) & see(y,x))>]
- (TV[NUM='sg', SEM=<\X y.X(\x.see(y,x))>, TNS='pres'] sees)
- (NP[NUM='sg', SEM=<\Q.exists x.(dog(x) & Q(x))>]
- (Det[NUM='sg', SEM=<\P Q.exists x.(P(x) & Q(x))>] a)
- (Nom[NUM='sg', SEM=<\x.dog(x)>]
- (N[NUM='sg', SEM=<\x.dog(x)>] dog)))))
-
-Cooper Storage
---------------
-
- >>> from nltk.sem import cooper_storage as cs
- >>> sentence = 'every girl chases a dog'
- >>> trees = cs.parse_with_bindops(sentence, grammar='grammars/book_grammars/storage.fcfg')
- >>> semrep = trees[0].label()['SEM']
- >>> cs_semrep = cs.CooperStore(semrep)
- >>> print(cs_semrep.core)
- chase(z2,z4)
- >>> for bo in cs_semrep.store:
- ... print(bo)
- bo(\P.all x.(girl(x) -> P(x)),z2)
- bo(\P.exists x.(dog(x) & P(x)),z4)
- >>> cs_semrep.s_retrieve(trace=True)
- Permutation 1
- (\P.all x.(girl(x) -> P(x)))(\z2.chase(z2,z4))
- (\P.exists x.(dog(x) & P(x)))(\z4.all x.(girl(x) -> chase(x,z4)))
- Permutation 2
- (\P.exists x.(dog(x) & P(x)))(\z4.chase(z2,z4))
- (\P.all x.(girl(x) -> P(x)))(\z2.exists x.(dog(x) & chase(z2,x)))
-
- >>> for reading in cs_semrep.readings:
- ... print(reading)
- exists x.(dog(x) & all z3.(girl(z3) -> chase(z3,x)))
- all x.(girl(x) -> exists z4.(dog(z4) & chase(x,z4)))
-
-
+++ /dev/null
-# -*- coding: utf-8 -*-
-
-# reset the variables counter before running tests
-def setup_module(module):
- from nltk.sem import logic
-
- logic._counter._value = 0
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-===================
-Sentiment Analysis
-===================
-
- >>> from nltk.classify import NaiveBayesClassifier
- >>> from nltk.corpus import subjectivity
- >>> from nltk.sentiment import SentimentAnalyzer
- >>> from nltk.sentiment.util import *
-
- >>> n_instances = 100
- >>> subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]]
- >>> obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]]
- >>> len(subj_docs), len(obj_docs)
- (100, 100)
-
-Each document is represented by a tuple (sentence, label). The sentence is tokenized,
-so it is represented by a list of strings:
-
- >>> subj_docs[0]
- (['smart', 'and', 'alert', ',', 'thirteen', 'conversations', 'about', 'one',
- 'thing', 'is', 'a', 'small', 'gem', '.'], 'subj')
-
-We split the subjective and objective instances separately, so that the class
-distribution stays balanced in both the train and test sets.
-
- >>> train_subj_docs = subj_docs[:80]
- >>> test_subj_docs = subj_docs[80:100]
- >>> train_obj_docs = obj_docs[:80]
- >>> test_obj_docs = obj_docs[80:100]
- >>> training_docs = train_subj_docs+train_obj_docs
- >>> testing_docs = test_subj_docs+test_obj_docs
-
- >>> sentim_analyzer = SentimentAnalyzer()
- >>> all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])
-
-We use simple unigram word features, handling negation:
-
- >>> unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
- >>> len(unigram_feats)
- 83
- >>> sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
-
-We apply features to obtain a feature-value representation of our datasets:
-
- >>> training_set = sentim_analyzer.apply_features(training_docs)
- >>> test_set = sentim_analyzer.apply_features(testing_docs)
-
-We can now train our classifier on the training set, and subsequently output the
-evaluation results:
-
- >>> trainer = NaiveBayesClassifier.train
- >>> classifier = sentim_analyzer.train(trainer, training_set)
- Training classifier
- >>> for key,value in sorted(sentim_analyzer.evaluate(test_set).items()):
- ... print('{0}: {1}'.format(key, value))
- Evaluating NaiveBayesClassifier results...
- Accuracy: 0.8
- F-measure [obj]: 0.8
- F-measure [subj]: 0.8
- Precision [obj]: 0.8
- Precision [subj]: 0.8
- Recall [obj]: 0.8
- Recall [subj]: 0.8
-
-
-Vader
-------
-
- >>> from nltk.sentiment.vader import SentimentIntensityAnalyzer
- >>> sentences = ["VADER is smart, handsome, and funny.", # positive sentence example
- ... "VADER is smart, handsome, and funny!", # punctuation emphasis handled correctly (sentiment intensity adjusted)
- ... "VADER is very smart, handsome, and funny.", # booster words handled correctly (sentiment intensity adjusted)
- ... "VADER is VERY SMART, handsome, and FUNNY.", # emphasis for ALLCAPS handled
- ... "VADER is VERY SMART, handsome, and FUNNY!!!",# combination of signals - VADER appropriately adjusts intensity
- ... "VADER is VERY SMART, really handsome, and INCREDIBLY FUNNY!!!",# booster words & punctuation make this close to ceiling for score
- ... "The book was good.", # positive sentence
- ... "The book was kind of good.", # qualified positive sentence is handled correctly (intensity adjusted)
- ... "The plot was good, but the characters are uncompelling and the dialog is not great.", # mixed negation sentence
- ... "A really bad, horrible book.", # negative sentence with booster words
- ... "At least it isn't a horrible book.", # negated negative sentence with contraction
- ... ":) and :D", # emoticons handled
- ... "", # an empty string is correctly handled
- ... "Today sux", # negative slang handled
- ... "Today sux!", # negative slang with punctuation emphasis handled
- ... "Today SUX!", # negative slang with capitalization emphasis
- ... "Today kinda sux! But I'll get by, lol" # mixed sentiment example with slang and constrastive conjunction "but"
- ... ]
- >>> paragraph = "It was one of the worst movies I've seen, despite good reviews. \
- ... Unbelievably bad acting!! Poor direction. VERY poor production. \
- ... The movie was bad. Very bad movie. VERY bad movie. VERY BAD movie. VERY BAD movie!"
-
- >>> from nltk import tokenize
- >>> lines_list = tokenize.sent_tokenize(paragraph)
- >>> sentences.extend(lines_list)
-
- >>> tricky_sentences = [
- ... "Most automated sentiment analysis tools are shit.",
- ... "VADER sentiment analysis is the shit.",
- ... "Sentiment analysis has never been good.",
- ... "Sentiment analysis with VADER has never been this good.",
- ... "Warren Beatty has never been so entertaining.",
- ... "I won't say that the movie is astounding and I wouldn't claim that \
- ... the movie is too banal either.",
- ... "I like to hate Michael Bay films, but I couldn't fault this one",
- ... "It's one thing to watch an Uwe Boll film, but another thing entirely \
- ... to pay for it",
- ... "The movie was too good",
- ... "This movie was actually neither that funny, nor super witty.",
- ... "This movie doesn't care about cleverness, wit or any other kind of \
- ... intelligent humor.",
- ... "Those who find ugly meanings in beautiful things are corrupt without \
- ... being charming.",
- ... "There are slow and repetitive parts, BUT it has just enough spice to \
- ... keep it interesting.",
- ... "The script is not fantastic, but the acting is decent and the cinematography \
- ... is EXCELLENT!",
- ... "Roger Dodger is one of the most compelling variations on this theme.",
- ... "Roger Dodger is one of the least compelling variations on this theme.",
- ... "Roger Dodger is at least compelling as a variation on the theme.",
- ... "they fall in love with the product",
- ... "but then it breaks",
- ... "usually around the time the 90 day warranty expires",
- ... "the twin towers collapsed today",
- ... "However, Mr. Carter solemnly argues, his client carried out the kidnapping \
- ... under orders and in the ''least offensive way possible.''"
- ... ]
- >>> sentences.extend(tricky_sentences)
- >>> sid = SentimentIntensityAnalyzer()
- >>> for sentence in sentences:
- ...     print(sentence)
- ...     ss = sid.polarity_scores(sentence)
- ...     for k in sorted(ss):
- ...         print('{0}: {1}, '.format(k, ss[k]), end='')
- ...     print()
- VADER is smart, handsome, and funny.
- compound: 0.8316, neg: 0.0, neu: 0.254, pos: 0.746,
- VADER is smart, handsome, and funny!
- compound: 0.8439, neg: 0.0, neu: 0.248, pos: 0.752,
- VADER is very smart, handsome, and funny.
- compound: 0.8545, neg: 0.0, neu: 0.299, pos: 0.701,
- VADER is VERY SMART, handsome, and FUNNY.
- compound: 0.9227, neg: 0.0, neu: 0.246, pos: 0.754,
- VADER is VERY SMART, handsome, and FUNNY!!!
- compound: 0.9342, neg: 0.0, neu: 0.233, pos: 0.767,
- VADER is VERY SMART, really handsome, and INCREDIBLY FUNNY!!!
- compound: 0.9469, neg: 0.0, neu: 0.294, pos: 0.706,
- The book was good.
- compound: 0.4404, neg: 0.0, neu: 0.508, pos: 0.492,
- The book was kind of good.
- compound: 0.3832, neg: 0.0, neu: 0.657, pos: 0.343,
- The plot was good, but the characters are uncompelling and the dialog is not great.
- compound: -0.7042, neg: 0.327, neu: 0.579, pos: 0.094,
- A really bad, horrible book.
- compound: -0.8211, neg: 0.791, neu: 0.209, pos: 0.0,
- At least it isn't a horrible book.
- compound: 0.431, neg: 0.0, neu: 0.637, pos: 0.363,
- :) and :D
- compound: 0.7925, neg: 0.0, neu: 0.124, pos: 0.876,
- <BLANKLINE>
- compound: 0.0, neg: 0.0, neu: 0.0, pos: 0.0,
- Today sux
- compound: -0.3612, neg: 0.714, neu: 0.286, pos: 0.0,
- Today sux!
- compound: -0.4199, neg: 0.736, neu: 0.264, pos: 0.0,
- Today SUX!
- compound: -0.5461, neg: 0.779, neu: 0.221, pos: 0.0,
- Today kinda sux! But I'll get by, lol
- compound: 0.2228, neg: 0.195, neu: 0.531, pos: 0.274,
- It was one of the worst movies I've seen, despite good reviews.
- compound: -0.7584, neg: 0.394, neu: 0.606, pos: 0.0,
- Unbelievably bad acting!!
- compound: -0.6572, neg: 0.686, neu: 0.314, pos: 0.0,
- Poor direction.
- compound: -0.4767, neg: 0.756, neu: 0.244, pos: 0.0,
- VERY poor production.
- compound: -0.6281, neg: 0.674, neu: 0.326, pos: 0.0,
- The movie was bad.
- compound: -0.5423, neg: 0.538, neu: 0.462, pos: 0.0,
- Very bad movie.
- compound: -0.5849, neg: 0.655, neu: 0.345, pos: 0.0,
- VERY bad movie.
- compound: -0.6732, neg: 0.694, neu: 0.306, pos: 0.0,
- VERY BAD movie.
- compound: -0.7398, neg: 0.724, neu: 0.276, pos: 0.0,
- VERY BAD movie!
- compound: -0.7616, neg: 0.735, neu: 0.265, pos: 0.0,
- Most automated sentiment analysis tools are shit.
- compound: -0.5574, neg: 0.375, neu: 0.625, pos: 0.0,
- VADER sentiment analysis is the shit.
- compound: 0.6124, neg: 0.0, neu: 0.556, pos: 0.444,
- Sentiment analysis has never been good.
- compound: -0.3412, neg: 0.325, neu: 0.675, pos: 0.0,
- Sentiment analysis with VADER has never been this good.
- compound: 0.5228, neg: 0.0, neu: 0.703, pos: 0.297,
- Warren Beatty has never been so entertaining.
- compound: 0.5777, neg: 0.0, neu: 0.616, pos: 0.384,
- I won't say that the movie is astounding and I wouldn't claim that the movie is too banal either.
- compound: 0.4215, neg: 0.0, neu: 0.851, pos: 0.149,
- I like to hate Michael Bay films, but I couldn't fault this one
- compound: 0.3153, neg: 0.157, neu: 0.534, pos: 0.309,
- It's one thing to watch an Uwe Boll film, but another thing entirely to pay for it
- compound: -0.2541, neg: 0.112, neu: 0.888, pos: 0.0,
- The movie was too good
- compound: 0.4404, neg: 0.0, neu: 0.58, pos: 0.42,
- This movie was actually neither that funny, nor super witty.
- compound: -0.6759, neg: 0.41, neu: 0.59, pos: 0.0,
- This movie doesn't care about cleverness, wit or any other kind of intelligent humor.
- compound: -0.1338, neg: 0.265, neu: 0.497, pos: 0.239,
- Those who find ugly meanings in beautiful things are corrupt without being charming.
- compound: -0.3553, neg: 0.314, neu: 0.493, pos: 0.192,
- There are slow and repetitive parts, BUT it has just enough spice to keep it interesting.
- compound: 0.4678, neg: 0.079, neu: 0.735, pos: 0.186,
- The script is not fantastic, but the acting is decent and the cinematography is EXCELLENT!
- compound: 0.7565, neg: 0.092, neu: 0.607, pos: 0.301,
- Roger Dodger is one of the most compelling variations on this theme.
- compound: 0.2944, neg: 0.0, neu: 0.834, pos: 0.166,
- Roger Dodger is one of the least compelling variations on this theme.
- compound: -0.1695, neg: 0.132, neu: 0.868, pos: 0.0,
- Roger Dodger is at least compelling as a variation on the theme.
- compound: 0.2263, neg: 0.0, neu: 0.84, pos: 0.16,
- they fall in love with the product
- compound: 0.6369, neg: 0.0, neu: 0.588, pos: 0.412,
- but then it breaks
- compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0,
- usually around the time the 90 day warranty expires
- compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0,
- the twin towers collapsed today
- compound: -0.2732, neg: 0.344, neu: 0.656, pos: 0.0,
- However, Mr. Carter solemnly argues, his client carried out the kidnapping under orders and in the ''least offensive way possible.''
- compound: -0.5859, neg: 0.23, neu: 0.697, pos: 0.074,
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-======================
-SentiWordNet Interface
-======================
-
-SentiWordNet can be imported like this:
-
- >>> from nltk.corpus import sentiwordnet as swn
-
-------------
-SentiSynsets
-------------
-
- >>> breakdown = swn.senti_synset('breakdown.n.03')
- >>> print(breakdown)
- <breakdown.n.03: PosScore=0.0 NegScore=0.25>
- >>> breakdown.pos_score()
- 0.0
- >>> breakdown.neg_score()
- 0.25
- >>> breakdown.obj_score()
- 0.75
-
-
-------
-Lookup
-------
-
- >>> list(swn.senti_synsets('slow')) # doctest: +NORMALIZE_WHITESPACE
- [SentiSynset('decelerate.v.01'), SentiSynset('slow.v.02'),
- SentiSynset('slow.v.03'), SentiSynset('slow.a.01'),
- SentiSynset('slow.a.02'), SentiSynset('dense.s.04'),
- SentiSynset('slow.a.04'), SentiSynset('boring.s.01'),
- SentiSynset('dull.s.08'), SentiSynset('slowly.r.01'),
- SentiSynset('behind.r.03')]
-
- >>> happy = swn.senti_synsets('happy', 'a')
-
- >>> all = swn.all_senti_synsets()
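-
-Both of these calls return generators. One way to inspect the results (a
-sketch; the scores depend on the installed SentiWordNet data, so the output
-is not checked here):
-
- >>> for s in happy:  # doctest: +SKIP
- ...     print(s.synset.name(), s.pos_score(), s.neg_score(), s.obj_score())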
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-=================
-EasyInstall Tests
-=================
-
-This file contains some simple tests that will be run by EasyInstall in
-order to test the installation when NLTK-Data is absent.
-
-
-------------
-Tokenization
-------------
-
- >>> from nltk.tokenize import wordpunct_tokenize
- >>> s = ("Good muffins cost $3.88\nin New York. Please buy me\n"
- ... "two of them.\n\nThanks.")
- >>> wordpunct_tokenize(s) # doctest: +NORMALIZE_WHITESPACE
- ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.',
- 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
-
--------
-Metrics
--------
-
- >>> from nltk.metrics import precision, recall, f_measure
- >>> reference = 'DET NN VB DET JJ NN NN IN DET NN'.split()
- >>> test = 'DET VB VB DET NN NN NN IN DET NN'.split()
- >>> reference_set = set(reference)
- >>> test_set = set(test)
- >>> precision(reference_set, test_set)
- 1.0
- >>> print(recall(reference_set, test_set))
- 0.8
- >>> print(f_measure(reference_set, test_set))
- 0.88888888888...
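-
-These values are consistent with the harmonic-mean definition of the
-F-measure: with precision P = 1.0 and recall R = 0.8,
-F = 2*P*R/(P+R) = 1.6/1.8 = 0.888... .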
-
-------------------
-Feature Structures
-------------------
-
- >>> from nltk import FeatStruct
- >>> fs1 = FeatStruct(PER=3, NUM='pl', GND='fem')
- >>> fs2 = FeatStruct(POS='N', AGR=fs1)
- >>> print(fs2)
- [ [ GND = 'fem' ] ]
- [ AGR = [ NUM = 'pl' ] ]
- [ [ PER = 3 ] ]
- [ ]
- [ POS = 'N' ]
- >>> print(fs2['AGR'])
- [ GND = 'fem' ]
- [ NUM = 'pl' ]
- [ PER = 3 ]
- >>> print(fs2['AGR']['PER'])
- 3
-
--------
-Parsing
--------
-
- >>> from nltk.parse.recursivedescent import RecursiveDescentParser
- >>> from nltk.grammar import CFG
- >>> grammar = CFG.fromstring("""
- ... S -> NP VP
- ... PP -> P NP
- ... NP -> 'the' N | N PP | 'the' N PP
- ... VP -> V NP | V PP | V NP PP
- ... N -> 'cat' | 'dog' | 'rug'
- ... V -> 'chased'
- ... P -> 'on'
- ... """)
- >>> rd = RecursiveDescentParser(grammar)
- >>> sent = 'the cat chased the dog on the rug'.split()
- >>> for t in rd.parse(sent):
- ... print(t)
- (S
- (NP the (N cat))
- (VP (V chased) (NP the (N dog) (PP (P on) (NP the (N rug))))))
- (S
- (NP the (N cat))
- (VP (V chased) (NP the (N dog)) (PP (P on) (NP the (N rug)))))
-
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-==========
- Stemmers
-==========
-
-Overview
-~~~~~~~~
-
-Stemmers remove morphological affixes from words, leaving only the
-word stem.
-
- >>> from nltk.stem import *
-
-Unit tests for the Porter stemmer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
- >>> from nltk.stem.porter import *
-
-Create a new Porter stemmer.
-
- >>> stemmer = PorterStemmer()
-
-Test the stemmer on various pluralised words.
-
- >>> plurals = ['caresses', 'flies', 'dies', 'mules', 'denied',
- ... 'died', 'agreed', 'owned', 'humbled', 'sized',
- ... 'meeting', 'stating', 'siezing', 'itemization',
- ... 'sensational', 'traditional', 'reference', 'colonizer',
- ... 'plotted']
-
- >>> singles = [stemmer.stem(plural) for plural in plurals]
-
- >>> print(' '.join(singles)) # doctest: +NORMALIZE_WHITESPACE
- caress fli die mule deni die agre own humbl size meet
- state siez item sensat tradit refer colon plot
-
-
-Unit tests for Snowball stemmer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
- >>> from nltk.stem.snowball import SnowballStemmer
-
-See which languages are supported.
-
- >>> print(" ".join(SnowballStemmer.languages))
- arabic danish dutch english finnish french german hungarian italian
- norwegian porter portuguese romanian russian spanish swedish
-
-Create a new instance of a language specific subclass.
-
- >>> stemmer = SnowballStemmer("english")
-
-Stem a word.
-
- >>> print(stemmer.stem("running"))
- run
-
-Decide not to stem stopwords.
-
- >>> stemmer2 = SnowballStemmer("english", ignore_stopwords=True)
- >>> print(stemmer.stem("having"))
- have
- >>> print(stemmer2.stem("having"))
- having
-
-The 'english' stemmer is better than the original 'porter' stemmer.
-
- >>> print(SnowballStemmer("english").stem("generously"))
- generous
- >>> print(SnowballStemmer("porter").stem("generously"))
- gener
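-
-The same API works for the other supported languages (a sketch; the stem
-shown is illustrative and not checked here):
-
- >>> print(SnowballStemmer("german").stem("Autobahnen"))  # doctest: +SKIP
- autobahn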
-
-.. note::
-
- Extra stemmer tests can be found in `nltk.test.unit.test_stem`.
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-Regression Tests
-~~~~~~~~~~~~~~~~
-
-Sequential Taggers
-------------------
-
-Add tests for:
- - make sure backoff is being done correctly (see the sketch after this list).
- - make sure ngram taggers don't use previous sentences for context.
- - make sure ngram taggers see 'beginning of the sentence' as a
- unique context
- - make sure regexp tagger's regexps are tried in order
- - train on some simple examples, & make sure that the size & the
- generated models are correct.
- - make sure cutoff works as intended
- - make sure that ngram models only exclude contexts covered by the
- backoff tagger if the backoff tagger gets that context correct at
- *all* locations.
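-
-As a minimal sketch of the first item (assuming the standard ``UnigramTagger``
-and ``DefaultTagger`` APIs; the tiny training set is made up for illustration):
-
- >>> from nltk.tag import UnigramTagger, DefaultTagger
- >>> train = [[('the', 'DT'), ('dog', 'NN')]]
- >>> tagger = UnigramTagger(train, backoff=DefaultTagger('NN'))
- >>> tagger.tag(['the', 'platypus'])  # 'platypus' is unseen, so the backoff supplies 'NN'
- [('the', 'DT'), ('platypus', 'NN')]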
-
-
-Regression Testing for issue #1025
-==================================
-
-We want to ensure that a RegexpTagger can be created with more than 100 patterns
-and does not fail with:
- "AssertionError: sorry, but this version only supports 100 named groups"
-
- >>> from nltk.tag import RegexpTagger
- >>> patterns = [(str(i), 'NNP',) for i in range(200)]
- >>> tagger = RegexpTagger(patterns)
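-
-As a quick sanity check that the resulting tagger also works (a sketch; both
-tokens below exactly equal one of the patterns, so each should be tagged 'NNP'):
-
- >>> tagger.tag(['0', '199'])
- [('0', 'NNP'), ('199', 'NNP')]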
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
- >>> from nltk.tokenize import *
-
-Regression Tests: Treebank Tokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Some test strings.
-
- >>> s1 = "On a $50,000 mortgage of 30 years at 8 percent, the monthly payment would be $366.88."
- >>> word_tokenize(s1)
- ['On', 'a', '$', '50,000', 'mortgage', 'of', '30', 'years', 'at', '8', 'percent', ',', 'the', 'monthly', 'payment', 'would', 'be', '$', '366.88', '.']
- >>> s2 = "\"We beat some pretty good teams to get here,\" Slocum said."
- >>> word_tokenize(s2)
- ['``', 'We', 'beat', 'some', 'pretty', 'good', 'teams', 'to', 'get', 'here', ',', "''", 'Slocum', 'said', '.']
- >>> s3 = "Well, we couldn't have this predictable, cliche-ridden, \"Touched by an Angel\" (a show creator John Masius worked on) wanna-be if she didn't."
- >>> word_tokenize(s3)
- ['Well', ',', 'we', 'could', "n't", 'have', 'this', 'predictable', ',', 'cliche-ridden', ',', '``', 'Touched', 'by', 'an', 'Angel', "''", '(', 'a', 'show', 'creator', 'John', 'Masius', 'worked', 'on', ')', 'wanna-be', 'if', 'she', 'did', "n't", '.']
- >>> s4 = "I cannot cannot work under these conditions!"
- >>> word_tokenize(s4)
- ['I', 'can', 'not', 'can', 'not', 'work', 'under', 'these', 'conditions', '!']
- >>> s5 = "The company spent $30,000,000 last year."
- >>> word_tokenize(s5)
- ['The', 'company', 'spent', '$', '30,000,000', 'last', 'year', '.']
- >>> s6 = "The company spent 40.75% of its income last year."
- >>> word_tokenize(s6)
- ['The', 'company', 'spent', '40.75', '%', 'of', 'its', 'income', 'last', 'year', '.']
- >>> s7 = "He arrived at 3:00 pm."
- >>> word_tokenize(s7)
- ['He', 'arrived', 'at', '3:00', 'pm', '.']
- >>> s8 = "I bought these items: books, pencils, and pens."
- >>> word_tokenize(s8)
- ['I', 'bought', 'these', 'items', ':', 'books', ',', 'pencils', ',', 'and', 'pens', '.']
- >>> s9 = "Though there were 150, 100 of them were old."
- >>> word_tokenize(s9)
- ['Though', 'there', 'were', '150', ',', '100', 'of', 'them', 'were', 'old', '.']
- >>> s10 = "There were 300,000, but that wasn't enough."
- >>> word_tokenize(s10)
- ['There', 'were', '300,000', ',', 'but', 'that', 'was', "n't", 'enough', '.']
-
-
-Testing improvements made to the TreebankWordTokenizer:
-
- >>> sx1 = '\xabNow that I can do.\xbb'
- >>> expected = ['\xab', 'Now', 'that', 'I', 'can', 'do', '.', '\xbb']
- >>> word_tokenize(sx1) == expected
- True
- >>> sx2 = 'The unicode 201C and 201D \u201cLEFT(RIGHT) DOUBLE QUOTATION MARK\u201d is also OPEN_PUNCT and CLOSE_PUNCT.'
- >>> expected = ['The', 'unicode', '201C', 'and', '201D', '\u201c', 'LEFT', '(', 'RIGHT', ')', 'DOUBLE', 'QUOTATION', 'MARK', '\u201d', 'is', 'also', 'OPEN_PUNCT', 'and', 'CLOSE_PUNCT', '.']
- >>> word_tokenize(sx2) == expected
- True
-
-
-Sentence tokenization in word_tokenize:
-
- >>> s11 = "I called Dr. Jones. I called Dr. Jones."
- >>> word_tokenize(s11)
- ['I', 'called', 'Dr.', 'Jones', '.', 'I', 'called', 'Dr.', 'Jones', '.']
- >>> s12 = ("Ich muss unbedingt daran denken, Mehl, usw. fur einen "
- ... "Kuchen einzukaufen. Ich muss.")
- >>> word_tokenize(s12)
- ['Ich', 'muss', 'unbedingt', 'daran', 'denken', ',', 'Mehl', ',', 'usw',
- '.', 'fur', 'einen', 'Kuchen', 'einzukaufen', '.', 'Ich', 'muss', '.']
- >>> word_tokenize(s12, 'german')
- ['Ich', 'muss', 'unbedingt', 'daran', 'denken', ',', 'Mehl', ',', 'usw.',
- 'fur', 'einen', 'Kuchen', 'einzukaufen', '.', 'Ich', 'muss', '.']
-
-
-Regression Tests: Regexp Tokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Some additional test strings.
-
- >>> s = ("Good muffins cost $3.88\nin New York. Please buy me\n"
- ... "two of them.\n\nThanks.")
- >>> s2 = ("Alas, it has not rained today. When, do you think, "
- ... "will it rain again?")
- >>> s3 = ("<p>Although this is <b>not</b> the case here, we must "
- ... "not relax our vigilance!</p>")
-
- >>> regexp_tokenize(s2, r'[,\.\?!"]\s*', gaps=False)
- [', ', '. ', ', ', ', ', '?']
- >>> regexp_tokenize(s2, r'[,\.\?!"]\s*', gaps=True)
- ['Alas', 'it has not rained today', 'When', 'do you think',
- 'will it rain again']
-
-Take care to avoid using capturing groups:
-
- >>> regexp_tokenize(s3, r'</?[bp]>', gaps=False)
- ['<p>', '<b>', '</b>', '</p>']
- >>> regexp_tokenize(s3, r'</?(?:b|p)>', gaps=False)
- ['<p>', '<b>', '</b>', '</p>']
- >>> regexp_tokenize(s3, r'</?(?:b|p)>', gaps=True)
- ['Although this is ', 'not',
- ' the case here, we must not relax our vigilance!']
-
-Named groups are capturing groups, and confuse the tokenizer:
-
- >>> regexp_tokenize(s3, r'</?(?P<named>b|p)>', gaps=False)
- ['p', 'b', 'b', 'p']
- >>> regexp_tokenize(s3, r'</?(?P<named>b|p)>', gaps=True)
- ['p', 'Although this is ', 'b', 'not', 'b',
- ' the case here, we must not relax our vigilance!', 'p']
-
-Make sure that nested groups don't confuse the tokenizer:
-
- >>> regexp_tokenize(s2, r'(?:h|r|l)a(?:s|(?:i|n0))', gaps=False)
- ['las', 'has', 'rai', 'rai']
- >>> regexp_tokenize(s2, r'(?:h|r|l)a(?:s|(?:i|n0))', gaps=True)
- ['A', ', it ', ' not ', 'ned today. When, do you think, will it ',
- 'n again?']
-
-Back-references require capturing groups, and these are not supported:
-
- >>> regexp_tokenize("aabbbcccc", r'(.)\1')
- ['a', 'b', 'c', 'c']
-
-A simple sentence tokenizer, using the pattern '\.(\s+|$)':
-
- >>> regexp_tokenize(s, pattern=r'\.(?:\s+|$)', gaps=True)
- ['Good muffins cost $3.88\nin New York',
- 'Please buy me\ntwo of them', 'Thanks']
-
-
-Regression Tests: TweetTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-TweetTokenizer is a tokenizer designed specifically for micro-blogging text, such as tweets.
-
- >>> from nltk.tokenize import TweetTokenizer
- >>> tknzr = TweetTokenizer()
- >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
- >>> tknzr.tokenize(s0)
- ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--']
- >>> s1 = "@Joyster2012 @CathStaincliffe Good for you, girl!! Best wishes :-)"
- >>> tknzr.tokenize(s1)
- ['@Joyster2012', '@CathStaincliffe', 'Good', 'for', 'you', ',', 'girl', '!', '!', 'Best', 'wishes', ':-)']
- >>> s2 = "3Points for #DreamTeam Gooo BAILEY! :) #PBB737Gold @PBBabscbn"
- >>> tknzr.tokenize(s2)
- ['3Points', 'for', '#DreamTeam', 'Gooo', 'BAILEY', '!', ':)', '#PBB737Gold', '@PBBabscbn']
- >>> s3 = "@Insanomania They do... Their mentality doesn't :("
- >>> tknzr.tokenize(s3)
- ['@Insanomania', 'They', 'do', '...', 'Their', 'mentality', "doesn't", ':(']
- >>> s4 = "RT @facugambande: Ya por arrancar a grabar !!! #TirenTirenTiren vamoo !!"
- >>> tknzr.tokenize(s4)
- ['RT', '@facugambande', ':', 'Ya', 'por', 'arrancar', 'a', 'grabar', '!', '!', '!', '#TirenTirenTiren', 'vamoo', '!', '!']
- >>> tknzr = TweetTokenizer(reduce_len=True)
- >>> s5 = "@crushinghes the summer holidays are great but I'm so bored already :("
- >>> tknzr.tokenize(s5)
- ['@crushinghes', 'the', 'summer', 'holidays', 'are', 'great', 'but', "I'm", 'so', 'bored', 'already', ':(']
-
-It is possible to specify `strip_handles` and `reduce_len` parameters for a TweetTokenizer instance. When `strip_handles` is set to True, the tokenizer removes Twitter handles (e.g. usernames). When `reduce_len` is set to True, repeated character sequences of length 3 or greater are replaced with sequences of length 3.
-
- >>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
- >>> s6 = '@remy: This is waaaaayyyy too much for you!!!!!!'
- >>> tknzr.tokenize(s6)
- [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']
- >>> s7 = '@_willy65: No place for @chuck tonight. Sorry.'
- >>> tknzr.tokenize(s7)
- [':', 'No', 'place', 'for', 'tonight', '.', 'Sorry', '.']
- >>> s8 = '@mar_tin is a great developer. Contact him at mar_tin@email.com.'
- >>> tknzr.tokenize(s8)
- ['is', 'a', 'great', 'developer', '.', 'Contact', 'him', 'at', 'mar_tin@email.com', '.']
-
-The `preserve_case` parameter (default: True) controls whether token case is preserved; when it is set to False, uppercase tokens are converted to lowercase. Emoticons are not affected:
-
- >>> tknzr = TweetTokenizer(preserve_case=False)
- >>> s9 = "@jrmy: I'm REALLY HAPPYYY about that! NICEEEE :D :P"
- >>> tknzr.tokenize(s9)
- ['@jrmy', ':', "i'm", 'really', 'happyyy', 'about', 'that', '!', 'niceeee', ':D', ':P']
-
-It should not hang on long sequences of the same punctuation character.
-
- >>> tknzr = TweetTokenizer()
- >>> s10 = "Photo: Aujourd'hui sur http://t.co/0gebOFDUzn Projet... http://t.co/bKfIUbydz2.............................. http://fb.me/3b6uXpz0L"
- >>> tknzr.tokenize(s10)
- ['Photo', ':', "Aujourd'hui", 'sur', 'http://t.co/0gebOFDUzn', 'Projet', '...', 'http://t.co/bKfIUbydz2', '...', 'http://fb.me/3b6uXpz0L']
-
-
-Regression Tests: PunktSentenceTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The sentence splitter should remove whitespace following the sentence boundary.
-
- >>> pst = PunktSentenceTokenizer()
- >>> pst.tokenize('See Section 3). Or Section 2). ')
- ['See Section 3).', 'Or Section 2).']
- >>> pst.tokenize('See Section 3.) Or Section 2.) ')
- ['See Section 3.)', 'Or Section 2.)']
- >>> pst.tokenize('See Section 3.) Or Section 2.) ', realign_boundaries=False)
- ['See Section 3.', ') Or Section 2.', ')']
-
-
-Two instances of PunktSentenceTokenizer should not share PunktParameters.
-
- >>> pst = PunktSentenceTokenizer()
- >>> pst2 = PunktSentenceTokenizer()
- >>> pst._params is pst2._params
- False
-
-Testing mutable default arguments for https://github.com/nltk/nltk/pull/2067
-
- >>> from nltk.tokenize.punkt import PunktBaseClass, PunktTrainer, PunktSentenceTokenizer
- >>> from nltk.tokenize.punkt import PunktLanguageVars, PunktParameters
- >>> pbc = PunktBaseClass(lang_vars=None, params=None)
- >>> type(pbc._params)
- <class 'nltk.tokenize.punkt.PunktParameters'>
- >>> type(pbc._lang_vars)
- <class 'nltk.tokenize.punkt.PunktLanguageVars'>
- >>> pt = PunktTrainer(lang_vars=None)
- >>> type(pt._lang_vars)
- <class 'nltk.tokenize.punkt.PunktLanguageVars'>
- >>> pst = PunktSentenceTokenizer(lang_vars=None)
- >>> type(pst._lang_vars)
- <class 'nltk.tokenize.punkt.PunktLanguageVars'>
-
-
-Regression Tests: align_tokens
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Post-hoc alignment of tokens with a source string
-
- >>> from nltk.tokenize.util import align_tokens
- >>> list(align_tokens([''], ""))
- [(0, 0)]
- >>> list(align_tokens([''], " "))
- [(0, 0)]
- >>> list(align_tokens([], ""))
- []
- >>> list(align_tokens([], " "))
- []
- >>> list(align_tokens(['a'], "a"))
- [(0, 1)]
- >>> list(align_tokens(['abc', 'def'], "abcdef"))
- [(0, 3), (3, 6)]
- >>> list(align_tokens(['abc', 'def'], "abc def"))
- [(0, 3), (4, 7)]
- >>> list(align_tokens(['ab', 'cd'], "ab cd ef"))
- [(0, 2), (3, 5)]
- >>> list(align_tokens(['ab', 'cd', 'ef'], "ab cd ef"))
- [(0, 2), (3, 5), (6, 8)]
- >>> list(align_tokens(['ab', 'cd', 'efg'], "ab cd ef"))
- Traceback (most recent call last):
- ....
- ValueError: substring "efg" not found in "ab cd ef"
- >>> list(align_tokens(['ab', 'cd', 'ef', 'gh'], "ab cd ef"))
- Traceback (most recent call last):
- ....
- ValueError: substring "gh" not found in "ab cd ef"
- >>> list(align_tokens(['The', 'plane', ',', 'bound', 'for', 'St', 'Petersburg', ',', 'crashed', 'in', 'Egypt', "'s", 'Sinai', 'desert', 'just', '23', 'minutes', 'after', 'take-off', 'from', 'Sharm', 'el-Sheikh', 'on', 'Saturday', '.'], "The plane, bound for St Petersburg, crashed in Egypt's Sinai desert just 23 minutes after take-off from Sharm el-Sheikh on Saturday."))
- [(0, 3), (4, 9), (9, 10), (11, 16), (17, 20), (21, 23), (24, 34), (34, 35), (36, 43), (44, 46), (47, 52), (52, 54), (55, 60), (61, 67), (68, 72), (73, 75), (76, 83), (84, 89), (90, 98), (99, 103), (104, 109), (110, 119), (120, 122), (123, 131), (131, 132)]
-
-
-Regression Tests: MWETokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Pickle an MWETokenizer
-
- >>> from nltk.tokenize import MWETokenizer
- >>> import pickle
-
- >>> tokenizer = MWETokenizer([('hors', "d'oeuvre")], separator='+')
- >>> p = pickle.dumps(tokenizer)
- >>> unpickled = pickle.loads(p)
- >>> unpickled.tokenize("An hors d'oeuvre tonight, sir?".split())
- ['An', "hors+d'oeuvre", 'tonight,', 'sir?']
-
-
-Regression Tests: TextTilingTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-TextTilingTokenizer tokenizes text into coherent subtopic chunks using Hearst's TextTiling algorithm.
-
- >>> from nltk.tokenize import TextTilingTokenizer
- >>> from nltk.corpus import brown
- >>> tt = TextTilingTokenizer()
- >>> tt.tokenize(brown.raw()[0:1000])
- ["\n\n\tThe/at Fulton/np-tl County/nn-tl Grand/jj-tl Jury/nn-tl said/vbd Friday/nr an/at investigation/nn of/in Atlanta's/np$ recent/jj primary/nn election/nn produced/vbd ``/`` no/at evidence/nn ''/'' that/cs any/dti irregularities/nns took/vbd place/nn ./.\n\n\n\tThe/at jury/nn further/rbr said/vbd in/in term-end/nn presentments/nns that/cs the/at City/nn-tl Executive/jj-tl Committee/nn-tl ,/, which/wdt had/hvd over-all/jj charge/nn of/in the/at election/nn ,/, ``/`` deserves/vbz the/at praise/nn and/cc thanks/nns of/in the/at City/nn-tl of/in-tl Atlanta/np-tl ''/'' for/in the/at manner/nn in/in which/wdt the/at election/nn was/bedz conducted/vbn ./.\n\n\n\tThe/at September-October/np term/nn jury/nn had/hvd been/ben charged/vbn by/in Fulton/np-tl Superior/jj-tl Court/nn-tl Judge/nn-tl Durwood/np Pye/np to/to investigate/vb reports/nns of/in possible/jj ``/`` irregularities/nns ''/'' in/in the/at hard-fought/jj primary/nn which/wdt was/bedz won/vbn by/in Mayor-nominate/nn-tl Ivan/np Allen/np Jr./"]
-
-Test that `ValueError` exceptions are raised when illegal arguments are used.
-
- >>> TextTilingTokenizer(similarity_method='foo').tokenize(brown.raw()[0:1000])
- Traceback (most recent call last):
- ...
- ValueError: Similarity method foo not recognized
- >>> TextTilingTokenizer(smoothing_method='bar').tokenize(brown.raw()[0:1000])
- Traceback (most recent call last):
- ...
- ValueError: Smoothing method bar not recognized
-
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-===============================
-Unit test cases for ``toolbox``
-===============================
-
- >>> from nltk import toolbox
-
---------------------------
-``toolbox.StandardFormat``
---------------------------
-
- >>> f = toolbox.StandardFormat()
-
-``toolbox.StandardFormat.open()``
----------------------------------
- >>> import os, tempfile
- >>> (fd, fname) = tempfile.mkstemp()
- >>> tf = os.fdopen(fd, "w")
- >>> _ = tf.write('\\lx a value\n\\lx another value\n')
- >>> tf.close()
- >>> f = toolbox.StandardFormat()
- >>> f.open(fname)
- >>> list(f.fields())
- [('lx', 'a value'), ('lx', 'another value')]
- >>> f.close()
- >>> os.unlink(fname)
-
-``toolbox.StandardFormat.open_string()``
-----------------------------------------
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\\lx a value\n\\lx another value\n')
- >>> list(f.fields())
- [('lx', 'a value'), ('lx', 'another value')]
- >>> f.close()
-
-``toolbox.StandardFormat.close()``
-----------------------------------
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\\lx a value\n\\lx another value\n')
- >>> list(f.fields())
- [('lx', 'a value'), ('lx', 'another value')]
- >>> f.close()
-
-``toolbox.StandardFormat.line_num``
----------------------------------------
-
-``StandardFormat.line_num`` contains the line number of the last line returned:
-
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\\lx a value\n\\lx another value\n\\lx a third value\n')
- >>> line_nums = []
- >>> for l in f.raw_fields():
- ... line_nums.append(f.line_num)
- >>> line_nums
- [1, 2, 3]
-
-With multiline fields, ``StandardFormat.line_num`` gives the number of the last line of each field returned:
-
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\\lx two\nlines\n\\lx three\nlines\n\n\\lx two\nlines\n')
- >>> line_nums = []
- >>> for l in f.raw_fields():
- ... line_nums.append(f.line_num)
- >>> line_nums
- [2, 5, 7]
-
-``StandardFormat.line_num`` doesn't exist before opening or after closing
-a file or string:
-
- >>> f = toolbox.StandardFormat()
- >>> f.line_num
- Traceback (most recent call last):
- ...
- AttributeError: 'StandardFormat' object has no attribute 'line_num'
- >>> f.open_string('\\lx two\nlines\n\\lx three\nlines\n\n\\lx two\nlines\n')
- >>> line_nums = []
- >>> for l in f.raw_fields():
- ... line_nums.append(f.line_num)
- >>> line_nums
- [2, 5, 7]
- >>> f.close()
- >>> f.line_num
- Traceback (most recent call last):
- ...
- AttributeError: 'StandardFormat' object has no attribute 'line_num'
-
-``toolbox.StandardFormat.raw_fields()``
----------------------------------------
-``raw_fields()`` returns an iterator over tuples of two strings representing the
-marker and its value. The marker is given without the backslash and the value
-without its trailing newline:
-
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\\lx a value\n\\lx another value\n')
- >>> list(f.raw_fields())
- [('lx', 'a value'), ('lx', 'another value')]
-
-an empty file returns nothing:
-
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('')
- >>> list(f.raw_fields())
- []
-
-a file containing only a newline returns a single field with no marker and an empty value:
-
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\n')
- >>> list(f.raw_fields())
- [(None, '')]
-
-file with only one field should be parsed ok:
-
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\\lx one value\n')
- >>> list(f.raw_fields())
- [('lx', 'one value')]
-
-file without a trailing newline should be parsed ok:
-
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\\lx a value\n\\lx another value')
- >>> list(f.raw_fields())
- [('lx', 'a value'), ('lx', 'another value')]
-
-trailing white space is preserved except for the final newline:
-
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\\lx trailing space \n\\lx trailing tab\t\n\\lx extra newline\n\n')
- >>> list(f.raw_fields())
- [('lx', 'trailing space '), ('lx', 'trailing tab\t'), ('lx', 'extra newline\n')]
-
-line wrapping is preserved:
-
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\\lx a value\nmore of the value\nand still more\n\\lc another val\n')
- >>> list(f.raw_fields())
- [('lx', 'a value\nmore of the value\nand still more'), ('lc', 'another val')]
-
-file beginning with a multiline record should be parsed ok:
-
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\\lx a value\nmore of the value\nand still more\n\\lc another val\n')
- >>> list(f.raw_fields())
- [('lx', 'a value\nmore of the value\nand still more'), ('lc', 'another val')]
-
-file ending with a multiline record should be parsed ok:
-
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\\lc a value\n\\lx another value\nmore of the value\nand still more\n')
- >>> list(f.raw_fields())
- [('lc', 'a value'), ('lx', 'another value\nmore of the value\nand still more')]
-
-file beginning with a BOM should be parsed ok:
-
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\xef\xbb\xbf\\lx a value\n\\lx another value\n')
- >>> list(f.raw_fields())
- [('lx', 'a value'), ('lx', 'another value')]
-
-file beginning with two BOMs should ignore only the first one:
-
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\xef\xbb\xbf\xef\xbb\xbf\\lx a value\n\\lx another value\n')
- >>> list(f.raw_fields())
- [(None, '\xef\xbb\xbf\\lx a value'), ('lx', 'another value')]
-
-a BOM that is not at the beginning of the file is not ignored:
-
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\\lx a value\n\xef\xbb\xbf\\lx another value\n')
- >>> list(f.raw_fields())
- [('lx', 'a value\n\xef\xbb\xbf\\lx another value')]
-
-``toolbox.StandardFormat.fields()``
------------------------------------
-trailing white space is not preserved:
-
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\\lx trailing space \n\\lx trailing tab\t\n\\lx extra newline\n\n')
- >>> list(f.fields())
- [('lx', 'trailing space'), ('lx', 'trailing tab'), ('lx', 'extra newline')]
-
-multiline fields are unwrapped:
-
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\\lx a value\nmore of the value\nand still more\n\\lc another val\n')
- >>> list(f.fields())
- [('lx', 'a value more of the value and still more'), ('lc', 'another val')]
-
-markers
--------
-A backslash in the first position on a new line indicates the start of a
-marker. The backslash is not part of the marker:
-
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\\mk a value\n')
- >>> list(f.fields())
- [('mk', 'a value')]
-
-If the backslash occurs later in the line, it does not indicate the start
-of a marker:
-
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\\mk a value\n \\mk another one\n')
- >>> list(f.raw_fields())
- [('mk', 'a value\n \\mk another one')]
-
-There is no specific limit to the length of a marker:
-
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\\this_is_an_extremely_long_marker value\n')
- >>> list(f.fields())
- [('this_is_an_extremely_long_marker', 'value')]
-
-A marker can contain any non-whitespace character:
-
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\\`~!@#$%^&*()_-=+[{]}\|,<.>/?;:"0123456789 value\n')
- >>> list(f.fields())
- [('`~!@#$%^&*()_-=+[{]}\\|,<.>/?;:"0123456789', 'value')]
-
-A marker is terminated by any white space character:
-
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\\mk a value\n\\mk\tanother one\n\\mk\rthird one\n\\mk\ffourth one')
- >>> list(f.fields())
- [('mk', 'a value'), ('mk', 'another one'), ('mk', 'third one'), ('mk', 'fourth one')]
-
-Consecutive whitespace characters (except newline) are treated the same as one:
-
- >>> f = toolbox.StandardFormat()
- >>> f.open_string('\\mk \t\r\fa value\n')
- >>> list(f.fields())
- [('mk', 'a value')]
-
------------------------
-``toolbox.ToolboxData``
------------------------
-
- >>> db = toolbox.ToolboxData()
-
-``toolbox.ToolboxData.parse()``
--------------------------------
-check that normal parsing works:
-
- >>> from xml.etree import ElementTree
- >>> td = toolbox.ToolboxData()
- >>> s = """\\_sh v3.0 400 Rotokas Dictionary
- ... \\_DateStampHasFourDigitYear
- ...
- ... \\lx kaa
- ... \\ps V.A
- ... \\ge gag
- ... \\gp nek i pas
- ...
- ... \\lx kaa
- ... \\ps V.B
- ... \\ge strangle
- ... \\gp pasim nek
- ... """
- >>> td.open_string(s)
- >>> tree = td.parse(key='lx')
- >>> tree.tag
- 'toolbox_data'
- >>> ElementTree.tostring(list(tree)[0]).decode('utf8')
- '<header><_sh>v3.0 400 Rotokas Dictionary</_sh><_DateStampHasFourDigitYear /></header>'
- >>> ElementTree.tostring(list(tree)[1]).decode('utf8')
- '<record><lx>kaa</lx><ps>V.A</ps><ge>gag</ge><gp>nek i pas</gp></record>'
- >>> ElementTree.tostring(list(tree)[2]).decode('utf8')
- '<record><lx>kaa</lx><ps>V.B</ps><ge>strangle</ge><gp>pasim nek</gp></record>'
-
-check that guessing the key marker works:
-
- >>> from xml.etree import ElementTree
- >>> td = toolbox.ToolboxData()
- >>> s = """\\_sh v3.0 400 Rotokas Dictionary
- ... \\_DateStampHasFourDigitYear
- ...
- ... \\lx kaa
- ... \\ps V.A
- ... \\ge gag
- ... \\gp nek i pas
- ...
- ... \\lx kaa
- ... \\ps V.B
- ... \\ge strangle
- ... \\gp pasim nek
- ... """
- >>> td.open_string(s)
- >>> tree = td.parse()
- >>> ElementTree.tostring(list(tree)[0]).decode('utf8')
- '<header><_sh>v3.0 400 Rotokas Dictionary</_sh><_DateStampHasFourDigitYear /></header>'
- >>> ElementTree.tostring(list(tree)[1]).decode('utf8')
- '<record><lx>kaa</lx><ps>V.A</ps><ge>gag</ge><gp>nek i pas</gp></record>'
- >>> ElementTree.tostring(list(tree)[2]).decode('utf8')
- '<record><lx>kaa</lx><ps>V.B</ps><ge>strangle</ge><gp>pasim nek</gp></record>'
-
------------------------
-``toolbox`` functions
------------------------
-
-``toolbox.to_sfm_string()``
--------------------------------
-
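-``to_sfm_string()`` converts an ``ElementTree``, such as the one produced by
-``ToolboxData.parse()`` above, back into standard format marker text. As a
-minimal sketch (left unchecked here, since the exact output formatting is
-not asserted):
-
- >>> print(toolbox.to_sfm_string(tree)) # doctest: +SKIP
-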
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-.. -*- coding: utf-8 -*-
-
-=========
-Alignment
-=========
-
-Corpus Reader
--------------
-
- >>> from nltk.corpus import comtrans
- >>> words = comtrans.words('alignment-en-fr.txt')
- >>> for word in words[:6]:
- ... print(word)
- Resumption
- of
- the
- session
- I
- declare
- >>> als = comtrans.aligned_sents('alignment-en-fr.txt')[0]
- >>> als # doctest: +NORMALIZE_WHITESPACE
- AlignedSent(['Resumption', 'of', 'the', 'session'],
- ['Reprise', 'de', 'la', 'session'],
- Alignment([(0, 0), (1, 1), (2, 2), (3, 3)]))
-
-
-Alignment Objects
------------------
-
-Aligned sentences are simply a mapping between the words of a sentence pair:
-
- >>> print(" ".join(als.words))
- Resumption of the session
- >>> print(" ".join(als.mots))
- Reprise de la session
- >>> als.alignment
- Alignment([(0, 0), (1, 1), (2, 2), (3, 3)])
-
-
-Usually we look at alignments from the perspective of a source language to a
-target language, but they are easily inverted:
-
- >>> als.invert() # doctest: +NORMALIZE_WHITESPACE
- AlignedSent(['Reprise', 'de', 'la', 'session'],
- ['Resumption', 'of', 'the', 'session'],
- Alignment([(0, 0), (1, 1), (2, 2), (3, 3)]))
-
-
-We can create new alignments, but these need to be in the correct range of
-the corresponding sentences:
-
- >>> from nltk.translate import Alignment, AlignedSent
- >>> als = AlignedSent(['Reprise', 'de', 'la', 'session'],
- ... ['Resumption', 'of', 'the', 'session'],
- ... Alignment([(0, 0), (1, 4), (2, 1), (3, 3)]))
- Traceback (most recent call last):
- ...
- IndexError: Alignment is outside boundary of mots
-
-
-You can set alignments with any sequence of tuples, so long as the first two
-items of each tuple are the alignment indices:
-
- >>> als.alignment = Alignment([(0, 0), (1, 1), (2, 2, "boat"), (3, 3, False, (1,2))])
-
- >>> Alignment([(0, 0), (1, 1), (2, 2, "boat"), (3, 3, False, (1,2))])
- Alignment([(0, 0), (1, 1), (2, 2, 'boat'), (3, 3, False, (1, 2))])
-
-
-Alignment Algorithms
---------------------
-
-EM for IBM Model 1
-~~~~~~~~~~~~~~~~~~
-
-Here is an example from Koehn, 2010:
-
- >>> from nltk.translate import IBMModel1
- >>> corpus = [AlignedSent(['the', 'house'], ['das', 'Haus']),
- ... AlignedSent(['the', 'book'], ['das', 'Buch']),
- ... AlignedSent(['a', 'book'], ['ein', 'Buch'])]
- >>> em_ibm1 = IBMModel1(corpus, 20)
- >>> print(round(em_ibm1.translation_table['the']['das'], 1))
- 1.0
- >>> print(round(em_ibm1.translation_table['book']['das'], 1))
- 0.0
- >>> print(round(em_ibm1.translation_table['house']['das'], 1))
- 0.0
- >>> print(round(em_ibm1.translation_table['the']['Buch'], 1))
- 0.0
- >>> print(round(em_ibm1.translation_table['book']['Buch'], 1))
- 1.0
- >>> print(round(em_ibm1.translation_table['a']['Buch'], 1))
- 0.0
- >>> print(round(em_ibm1.translation_table['book']['ein'], 1))
- 0.0
- >>> print(round(em_ibm1.translation_table['a']['ein'], 1))
- 1.0
- >>> print(round(em_ibm1.translation_table['the']['Haus'], 1))
- 0.0
- >>> print(round(em_ibm1.translation_table['house']['Haus'], 1))
- 1.0
- >>> print(round(em_ibm1.translation_table['book'][None], 1))
- 0.5
-
-And using an NLTK corpus. We train on only 10 sentences, since training is slow:
-
- >>> from nltk.corpus import comtrans
- >>> com_ibm1 = IBMModel1(comtrans.aligned_sents()[:10], 20)
- >>> print(round(com_ibm1.translation_table['bitte']['Please'], 1))
- 0.2
- >>> print(round(com_ibm1.translation_table['Sitzungsperiode']['session'], 1))
- 1.0
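-
-In recent NLTK versions the IBM models also decode a best alignment for each
-training pair when training finishes (via ``align_all``), so the corpus
-sentences carry an ``alignment`` attribute afterwards. A sketch, left
-unverified here:
-
- >>> corpus[0].alignment # doctest: +SKIP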
-
-
-Evaluation
-----------
-Evaluation metrics for alignments are usually not concerned with the contents
-of the alignments themselves, but with how they compare to a "gold standard"
-alignment constructed by human experts. For this reason we often want to work
-just with raw set operations against the alignment points; this gives a very
-clean form for defining our evaluation metrics.
-
-.. Note::
- The AlignedSent class has no distinction of "possible" or "sure"
- alignments. Thus all alignments are treated as "sure".
-
-Consider the following aligned sentence for evaluation:
-
- >>> my_als = AlignedSent(['Resumption', 'of', 'the', 'session'],
- ... ['Reprise', 'de', 'la', 'session'],
- ... Alignment([(0, 0), (3, 3), (1, 2), (1, 1), (1, 3)]))
-
-Precision
-~~~~~~~~~
-``precision = |A∩P| / |A|``
-
-**Precision** is probably the best-known evaluation metric, and it is implemented
-in `nltk.metrics.scores.precision`_. Since precision is simply interested in the
-proportion of correct alignments, we calculate the ratio of the number of our
-test alignments (*A*) that match a possible alignment (*P*), over the number of
-test alignments provided. There is no penalty for missing a possible alignment
-in our test alignments. An easy way to game this metric is to provide just one
-test alignment that is in *P* [OCH2000]_.
-
-Here are some examples:
-
- >>> from nltk.metrics import precision
- >>> als.alignment = Alignment([(0,0), (1,1), (2,2), (3,3)])
- >>> precision(Alignment([]), als.alignment)
- 0.0
- >>> precision(Alignment([(0,0), (1,1), (2,2), (3,3)]), als.alignment)
- 1.0
- >>> precision(Alignment([(0,0), (3,3)]), als.alignment)
- 0.5
- >>> precision(Alignment.fromstring('0-0 3-3'), als.alignment)
- 0.5
- >>> precision(Alignment([(0,0), (1,1), (2,2), (3,3), (1,2), (2,1)]), als.alignment)
- 1.0
- >>> precision(als.alignment, my_als.alignment)
- 0.6
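-
-For the last call, the test alignment ``my_als`` has five links, of which
-three ((0, 0), (1, 1) and (3, 3)) also occur in ``als.alignment``, so
-``precision = 3/5 = 0.6``.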
-
-
-.. _nltk.metrics.scores.precision:
- http://www.nltk.org/api/nltk.metrics.html#nltk.metrics.scores.precision
-
-
-Recall
-~~~~~~
-``recall = |A∩S| / |S|``
-
-**Recall** is another well known evaluation metric that has a set based
-implementation in NLTK as `nltk.metrics.scores.recall`_. Since recall is
-simply interested in the proportion of found alignments, we calculate the
-ratio of the number of our test alignments (*A*) that match a sure alignment
-(*S*) over the number of sure alignments. There is no penalty for producing
-a lot of test alignments. An easy way to game this metric is to include every
-possible alignment in our test alignments, regardless of whether they are
-correct [OCH2000]_.
-
-Here are some examples:
-
- >>> from nltk.metrics import recall
- >>> print(recall(Alignment([]), als.alignment))
- None
- >>> recall(Alignment([(0,0), (1,1), (2,2), (3,3)]), als.alignment)
- 1.0
- >>> recall(Alignment.fromstring('0-0 3-3'), als.alignment)
- 1.0
- >>> recall(Alignment([(0,0), (3,3)]), als.alignment)
- 1.0
- >>> recall(Alignment([(0,0), (1,1), (2,2), (3,3), (1,2), (2,1)]), als.alignment)
- 0.66666...
- >>> recall(als.alignment, my_als.alignment)
- 0.75
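-
-The empty alignment yields ``None`` rather than a number, since the ratio
-would divide by zero. For the last call, three of the four sure links in
-``als.alignment`` occur among the five links of ``my_als.alignment``, so
-``recall = 3/4 = 0.75``.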
-
-
-.. _nltk.metrics.scores.recall:
- http://www.nltk.org/api/nltk.metrics.html#nltk.metrics.scores.recall
-
-
-Alignment Error Rate (AER)
-~~~~~~~~~~~~~~~~~~~~~~~~~~
-``AER = 1 - (|A∩S| + |A∩P|) / (|A| + |S|)``
-
-**Alignment Error Rate** is a commonly used metric for assessing sentence
-alignments. It combines the precision and recall metrics such that a
-perfect alignment must have all of the sure alignments and may have some
-possible alignments [MIHALCEA2003]_ [KOEHN2010]_.
-
-.. Note::
- [KOEHN2010]_ defines the AER as ``AER = (|A∩S| + |A∩P|) / (|A| + |S|)``
- in his book, but corrects it to the above in his online errata. This is
- in line with [MIHALCEA2003]_.
-
-Here are some examples:
-
- >>> from nltk.translate import alignment_error_rate
- >>> alignment_error_rate(Alignment([]), als.alignment)
- 1.0
- >>> alignment_error_rate(Alignment([(0,0), (1,1), (2,2), (3,3)]), als.alignment)
- 0.0
- >>> alignment_error_rate(als.alignment, my_als.alignment)
- 0.333333...
- >>> alignment_error_rate(als.alignment, my_als.alignment,
- ... als.alignment | Alignment([(1,2), (2,1)]))
- 0.222222...
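-
-For ``alignment_error_rate(als.alignment, my_als.alignment)``, *A* has five
-links and *S* = *P* has four, three of which are shared, so
-``AER = 1 - (3 + 3)/(5 + 4) = 1/3``. Enlarging *P* in the final call raises
-``|A∩P|`` to four, giving ``AER = 1 - (3 + 4)/(5 + 4) = 2/9``.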
-
-
-.. [OCH2000] Och, F. and Ney, H. (2000)
- *Statistical Machine Translation*, EAMT Workshop
-
-.. [MIHALCEA2003] Mihalcea, R. and Pedersen, T. (2003)
- *An evaluation exercise for word alignment*, HLT-NAACL 2003
-
-.. [KOEHN2010] Koehn, P. (2010)
- *Statistical Machine Translation*, Cambridge University Press
-
-
+++ /dev/null
-# -*- coding: utf-8 -*-
-
-from nltk.corpus import teardown_module
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-===============================
- Unit tests for nltk.tree.Tree
-===============================
-
- >>> from nltk.tree import *
-
-Some trees to run tests on:
-
- >>> dp1 = Tree('dp', [Tree('d', ['the']), Tree('np', ['dog'])])
- >>> dp2 = Tree('dp', [Tree('d', ['the']), Tree('np', ['cat'])])
- >>> vp = Tree('vp', [Tree('v', ['chased']), dp2])
- >>> tree = Tree('s', [dp1, vp])
- >>> print(tree)
- (s (dp (d the) (np dog)) (vp (v chased) (dp (d the) (np cat))))
-
-The node label is accessed using the `label()` method:
-
- >>> dp1.label(), dp2.label(), vp.label(), tree.label()
- ('dp', 'dp', 'vp', 's')
-
- >>> print(tree[1,1,1,0])
- cat
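-
-Tuples of indices address nested children one level at a time; for
-illustration:
-
- >>> print(tree[1, 1])
- (dp (d the) (np cat))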
-
-The `treepositions` method returns a list of the tree positions of
-subtrees and leaves in a tree. By default, it gives the position of
-every tree, subtree, and leaf, in prefix order:
-
- >>> print(tree.treepositions())
- [(), (0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0), (1,), (1, 0), (1, 0, 0), (1, 1), (1, 1, 0), (1, 1, 0, 0), (1, 1, 1), (1, 1, 1, 0)]
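-
-The leaves themselves can be read off in left-to-right order with
-``leaves()``, and ``height()`` gives the depth of the tree:
-
- >>> tree.leaves()
- ['the', 'dog', 'chased', 'the', 'cat']
- >>> tree.height()
- 5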
-
-In addition to `str` and `repr`, several methods exist to convert a
-tree object to one of several standard tree encodings:
-
- >>> print(tree.pformat_latex_qtree())
- \Tree [.s
- [.dp [.d the ] [.np dog ] ]
- [.vp [.v chased ] [.dp [.d the ] [.np cat ] ] ] ]
-
-There is also a fancy ASCII art representation:
-
- >>> tree.pretty_print()
- s
- ________|_____
- | vp
- | _____|___
- dp | dp
- ___|___ | ___|___
- d np v d np
- | | | | |
- the dog chased the cat
-
- >>> tree.pretty_print(unicodelines=True, nodedist=4)
- s
- ┌──────────────┴────────┐
- │ vp
- │ ┌────────┴──────┐
- dp │ dp
- ┌──────┴──────┐ │ ┌──────┴──────┐
- d np v d np
- │ │ │ │ │
- the dog chased the cat
-
-Trees can be initialized from treebank strings:
-
- >>> tree2 = Tree.fromstring('(S (NP I) (VP (V enjoyed) (NP my cookie)))')
- >>> print(tree2)
- (S (NP I) (VP (V enjoyed) (NP my cookie)))
-
-Trees can be compared for equality:
-
- >>> tree == Tree.fromstring(str(tree))
- True
- >>> tree2 == Tree.fromstring(str(tree2))
- True
- >>> tree == tree2
- False
- >>> tree == Tree.fromstring(str(tree2))
- False
- >>> tree2 == Tree.fromstring(str(tree))
- False
-
- >>> tree != Tree.fromstring(str(tree))
- False
- >>> tree2 != Tree.fromstring(str(tree2))
- False
- >>> tree != tree2
- True
- >>> tree != Tree.fromstring(str(tree2))
- True
- >>> tree2 != Tree.fromstring(str(tree))
- True
-
- >>> tree < tree2 or tree > tree2
- True
-
-Tree Parsing
-============
-
-The class method `Tree.fromstring()` can be used to parse trees, and it
-provides some additional options.
-
- >>> tree = Tree.fromstring('(S (NP I) (VP (V enjoyed) (NP my cookie)))')
- >>> print(tree)
- (S (NP I) (VP (V enjoyed) (NP my cookie)))
-
-When called on a subclass of `Tree`, it will create trees of that
-type:
-
- >>> tree = ImmutableTree.fromstring('(VP (V enjoyed) (NP my cookie))')
- >>> print(tree)
- (VP (V enjoyed) (NP my cookie))
- >>> print(type(tree))
- <class 'nltk.tree.ImmutableTree'>
- >>> tree[1] = 'x'
- Traceback (most recent call last):
- . . .
- ValueError: ImmutableTree may not be modified
- >>> del tree[0]
- Traceback (most recent call last):
- . . .
- ValueError: ImmutableTree may not be modified
-
-The ``brackets`` parameter can be used to specify two characters that
-should be used as brackets:
-
- >>> print(Tree.fromstring('[S [NP I] [VP [V enjoyed] [NP my cookie]]]',
- ... brackets='[]'))
- (S (NP I) (VP (V enjoyed) (NP my cookie)))
- >>> print(Tree.fromstring('<S <NP I> <VP <V enjoyed> <NP my cookie>>>',
- ... brackets='<>'))
- (S (NP I) (VP (V enjoyed) (NP my cookie)))
-
-If ``brackets`` is not a string, or is not exactly two characters,
-then `Tree.fromstring` raises an exception:
-
- >>> Tree.fromstring('<VP <V enjoyed> <NP my cookie>>', brackets='')
- Traceback (most recent call last):
- . . .
- TypeError: brackets must be a length-2 string
- >>> Tree.fromstring('<VP <V enjoyed> <NP my cookie>>', brackets='<<>>')
- Traceback (most recent call last):
- . . .
- TypeError: brackets must be a length-2 string
- >>> Tree.fromstring('<VP <V enjoyed> <NP my cookie>>', brackets=12)
- Traceback (most recent call last):
- . . .
- TypeError: brackets must be a length-2 string
- >>> Tree.fromstring('<<NP my cookie>>', brackets=('<<','>>'))
- Traceback (most recent call last):
- . . .
- TypeError: brackets must be a length-2 string
-
-(We may add support for multi-character brackets in the future, in
-which case the ``brackets=('<<','>>')`` example would start working.)
-
-Whitespace brackets are not permitted:
-
- >>> Tree.fromstring('(NP my cookie\n', brackets='(\n')
- Traceback (most recent call last):
- . . .
- TypeError: whitespace brackets not allowed
-
-If an invalid tree is given to Tree.fromstring, then it raises a
-ValueError, with a description of the problem:
-
- >>> Tree.fromstring('(NP my cookie) (NP my milk)')
- Traceback (most recent call last):
- . . .
- ValueError: Tree.fromstring(): expected 'end-of-string' but got '(NP'
- at index 15.
- "...y cookie) (NP my mil..."
- ^
- >>> Tree.fromstring(')NP my cookie(')
- Traceback (most recent call last):
- . . .
- ValueError: Tree.fromstring(): expected '(' but got ')'
- at index 0.
- ")NP my coo..."
- ^
- >>> Tree.fromstring('(NP my cookie))')
- Traceback (most recent call last):
- . . .
- ValueError: Tree.fromstring(): expected 'end-of-string' but got ')'
- at index 14.
- "...my cookie))"
- ^
- >>> Tree.fromstring('my cookie)')
- Traceback (most recent call last):
- . . .
- ValueError: Tree.fromstring(): expected '(' but got 'my'
- at index 0.
- "my cookie)"
- ^
- >>> Tree.fromstring('(NP my cookie')
- Traceback (most recent call last):
- . . .
- ValueError: Tree.fromstring(): expected ')' but got 'end-of-string'
- at index 13.
- "... my cookie"
- ^
- >>> Tree.fromstring('')
- Traceback (most recent call last):
- . . .
- ValueError: Tree.fromstring(): expected '(' but got 'end-of-string'
- at index 0.
- ""
- ^
-
-Trees with no children are supported:
-
- >>> print(Tree.fromstring('(S)'))
- (S )
- >>> print(Tree.fromstring('(X (Y) (Z))'))
- (X (Y ) (Z ))
-
-Trees with an empty node label and no children are supported:
-
- >>> print(Tree.fromstring('()'))
- ( )
- >>> print(Tree.fromstring('(X () ())'))
- (X ( ) ( ))
-
-Trees with an empty node label and children are supported, but only if the
-first child is not a leaf (otherwise, it will be treated as the node label).
-
- >>> print(Tree.fromstring('((A) (B) (C))'))
- ( (A ) (B ) (C ))
- >>> print(Tree.fromstring('((A) leaf)'))
- ( (A ) leaf)
- >>> print(Tree.fromstring('(((())))'))
- ( ( ( ( ))))
-
-The optional arguments `read_node` and `read_leaf` may be used to
-transform the string values of nodes or leaves.
-
- >>> print(Tree.fromstring('(A b (C d e) (F (G h i)))',
- ... read_node=lambda s: '<%s>' % s,
- ... read_leaf=lambda s: '"%s"' % s))
- (<A> "b" (<C> "d" "e") (<F> (<G> "h" "i")))
-
-These transformation functions are typically used when the node or
-leaf labels should be parsed to a non-string value (such as a feature
-structure). If node and leaf labels need to be able to include
-whitespace, then you must also use the optional `node_pattern` and
-`leaf_pattern` arguments.
-
- >>> from nltk.featstruct import FeatStruct
- >>> tree = Tree.fromstring('([cat=NP] [lex=the] [lex=dog])',
- ... read_node=FeatStruct, read_leaf=FeatStruct)
- >>> tree.set_label(tree.label().unify(FeatStruct('[num=singular]')))
- >>> print(tree)
- ([cat='NP', num='singular'] [lex='the'] [lex='dog'])
-
-The optional argument ``remove_empty_top_bracketing`` can be used to
-remove any top-level empty bracketing that occurs.
-
- >>> print(Tree.fromstring('((S (NP I) (VP (V enjoyed) (NP my cookie))))',
- ... remove_empty_top_bracketing=True))
- (S (NP I) (VP (V enjoyed) (NP my cookie)))
-
-It will not remove a top-level empty bracketing with multiple children:
-
- >>> print(Tree.fromstring('((A a) (B b))'))
- ( (A a) (B b))
-
-Parented Trees
-==============
-`ParentedTree` is a subclass of `Tree` that automatically maintains
-parent pointers for single-parented trees. Parented trees can be
-created directly from a node label and a list of children:
-
- >>> ptree = (
- ... ParentedTree('VP', [
- ... ParentedTree('VERB', ['saw']),
- ... ParentedTree('NP', [
- ... ParentedTree('DET', ['the']),
- ... ParentedTree('NOUN', ['dog'])])]))
- >>> print(ptree)
- (VP (VERB saw) (NP (DET the) (NOUN dog)))
-
-Parented trees can be created from strings using the classmethod
-`ParentedTree.fromstring`:
-
- >>> ptree = ParentedTree.fromstring('(VP (VERB saw) (NP (DET the) (NOUN dog)))')
- >>> print(ptree)
- (VP (VERB saw) (NP (DET the) (NOUN dog)))
- >>> print(type(ptree))
- <class 'nltk.tree.ParentedTree'>
-
-Parented trees can also be created by using the classmethod
-`ParentedTree.convert` to convert another type of tree to a parented
-tree:
-
- >>> tree = Tree.fromstring('(VP (VERB saw) (NP (DET the) (NOUN dog)))')
- >>> ptree = ParentedTree.convert(tree)
- >>> print(ptree)
- (VP (VERB saw) (NP (DET the) (NOUN dog)))
- >>> print(type(ptree))
- <class 'nltk.tree.ParentedTree'>
-
-.. clean-up:
-
- >>> del tree
-
-`ParentedTree`\ s should never be used in the same tree as `Tree`\ s
-or `MultiParentedTree`\ s. Mixing tree implementations may result in
-incorrect parent pointers and in `TypeError` exceptions:
-
- >>> # Inserting a Tree in a ParentedTree gives an exception:
- >>> ParentedTree('NP', [
- ... Tree('DET', ['the']), Tree('NOUN', ['dog'])])
- Traceback (most recent call last):
- . . .
- TypeError: Can not insert a non-ParentedTree into a ParentedTree
-
- >>> # inserting a ParentedTree in a Tree gives incorrect parent pointers:
- >>> broken_tree = Tree('NP', [
- ... ParentedTree('DET', ['the']), ParentedTree('NOUN', ['dog'])])
- >>> print(broken_tree[0].parent())
- None
-
-Parented Tree Methods
-------------------------
-In addition to all the methods defined by the `Tree` class, the
-`ParentedTree` class adds six new methods whose values are
-automatically updated whenever a parented tree is modified: `parent()`,
-`parent_index()`, `left_sibling()`, `right_sibling()`, `root()`, and
-`treeposition()`.
-
-The `parent()` method contains a `ParentedTree`\ 's parent, if it has
-one; and ``None`` otherwise. `ParentedTree`\ s that do not have
-parents are known as "root trees."
-
- >>> for subtree in ptree.subtrees():
- ... print(subtree)
- ... print(' Parent = %s' % subtree.parent())
- (VP (VERB saw) (NP (DET the) (NOUN dog)))
- Parent = None
- (VERB saw)
- Parent = (VP (VERB saw) (NP (DET the) (NOUN dog)))
- (NP (DET the) (NOUN dog))
- Parent = (VP (VERB saw) (NP (DET the) (NOUN dog)))
- (DET the)
- Parent = (NP (DET the) (NOUN dog))
- (NOUN dog)
- Parent = (NP (DET the) (NOUN dog))
-
-The `parent_index()` method stores the index of a tree in its parent's
-child list. If a tree does not have a parent, then its `parent_index`
-is ``None``.
-
- >>> for subtree in ptree.subtrees():
- ... print(subtree)
- ... print(' Parent Index = %s' % subtree.parent_index())
- ... assert (subtree.parent() is None or
- ... subtree.parent()[subtree.parent_index()] is subtree)
- (VP (VERB saw) (NP (DET the) (NOUN dog)))
- Parent Index = None
- (VERB saw)
- Parent Index = 0
- (NP (DET the) (NOUN dog))
- Parent Index = 1
- (DET the)
- Parent Index = 0
- (NOUN dog)
- Parent Index = 1
-
-Note that ``ptree.parent().index(ptree)`` is *not* equivalent to
-``ptree.parent_index()``. In particular, ``ptree.parent().index(ptree)``
-will return the index of the first child of ``ptree.parent()`` that is
-equal to ``ptree`` (using ``==``); and that child may not be
-``ptree``:
-
- >>> on_and_on = ParentedTree('CONJP', [
- ... ParentedTree('PREP', ['on']),
- ... ParentedTree('COJN', ['and']),
- ... ParentedTree('PREP', ['on'])])
- >>> second_on = on_and_on[2]
- >>> print(second_on.parent_index())
- 2
- >>> print(second_on.parent().index(second_on))
- 0
-
-The methods `left_sibling()` and `right_sibling()` can be used to get a
-parented tree's siblings. If a tree does not have a left or right
-sibling, then the corresponding method's value is ``None``:
-
- >>> for subtree in ptree.subtrees():
- ... print(subtree)
- ... print(' Left Sibling = %s' % subtree.left_sibling())
- ... print(' Right Sibling = %s' % subtree.right_sibling())
- (VP (VERB saw) (NP (DET the) (NOUN dog)))
- Left Sibling = None
- Right Sibling = None
- (VERB saw)
- Left Sibling = None
- Right Sibling = (NP (DET the) (NOUN dog))
- (NP (DET the) (NOUN dog))
- Left Sibling = (VERB saw)
- Right Sibling = None
- (DET the)
- Left Sibling = None
- Right Sibling = (NOUN dog)
- (NOUN dog)
- Left Sibling = (DET the)
- Right Sibling = None
-
-A parented tree's root tree can be accessed using the `root()`
-method. This method follows the tree's parent pointers until it
-finds a tree without a parent. If a tree does not have a parent, then
-it is its own root:
-
- >>> for subtree in ptree.subtrees():
- ... print(subtree)
- ... print(' Root = %s' % subtree.root())
- (VP (VERB saw) (NP (DET the) (NOUN dog)))
- Root = (VP (VERB saw) (NP (DET the) (NOUN dog)))
- (VERB saw)
- Root = (VP (VERB saw) (NP (DET the) (NOUN dog)))
- (NP (DET the) (NOUN dog))
- Root = (VP (VERB saw) (NP (DET the) (NOUN dog)))
- (DET the)
- Root = (VP (VERB saw) (NP (DET the) (NOUN dog)))
- (NOUN dog)
- Root = (VP (VERB saw) (NP (DET the) (NOUN dog)))
-
-The `treeposition()` method can be used to find a tree's treeposition
-relative to its root:
-
- >>> for subtree in ptree.subtrees():
- ... print(subtree)
- ... print(' Tree Position = %s' % (subtree.treeposition(),))
- ... assert subtree.root()[subtree.treeposition()] is subtree
- (VP (VERB saw) (NP (DET the) (NOUN dog)))
- Tree Position = ()
- (VERB saw)
- Tree Position = (0,)
- (NP (DET the) (NOUN dog))
- Tree Position = (1,)
- (DET the)
- Tree Position = (1, 0)
- (NOUN dog)
- Tree Position = (1, 1)
-
-Whenever a parented tree is modified, all of the methods described
-above (`parent()`, `parent_index()`, `left_sibling()`, `right_sibling()`,
-`root()`, and `treeposition()`) are automatically updated. For example,
-if we replace ``ptree``\ 's subtree for the word "dog" with a new
-subtree for "cat," the method values for both the "dog" subtree and the
-"cat" subtree get automatically updated:
-
- >>> # Replace the dog with a cat
- >>> dog = ptree[1,1]
- >>> cat = ParentedTree('NOUN', ['cat'])
- >>> ptree[1,1] = cat
-
- >>> # the noun phrase is no longer the dog's parent:
- >>> print(dog.parent(), dog.parent_index(), dog.left_sibling())
- None None None
- >>> # dog is now its own root.
- >>> print(dog.root())
- (NOUN dog)
- >>> print(dog.treeposition())
- ()
-
- >>> # the cat's parent is now the noun phrase:
- >>> print(cat.parent())
- (NP (DET the) (NOUN cat))
- >>> print(cat.parent_index())
- 1
- >>> print(cat.left_sibling())
- (DET the)
- >>> print(cat.root())
- (VP (VERB saw) (NP (DET the) (NOUN cat)))
- >>> print(cat.treeposition())
- (1, 1)
-
-ParentedTree Regression Tests
------------------------------
-Keep track of all trees that we create (including subtrees) using this
-variable:
-
- >>> all_ptrees = []
-
-Define a helper function to create new parented trees:
-
- >>> def make_ptree(s):
- ... ptree = ParentedTree.convert(Tree.fromstring(s))
- ... all_ptrees.extend(t for t in ptree.subtrees()
- ... if isinstance(t, Tree))
- ... return ptree
-
-Define a test function that examines every subtree in ``all_ptrees`` and
-checks that all six of its methods are defined correctly. If any ptrees are
-passed as arguments, then they are printed.
-
- >>> def pcheck(*print_ptrees):
- ... for ptree in all_ptrees:
- ... # Check ptree's methods.
- ... if ptree.parent() is not None:
- ... i = ptree.parent_index()
- ... assert ptree.parent()[i] is ptree
- ... if i > 0:
- ... assert ptree.left_sibling() is ptree.parent()[i-1]
- ... if i < (len(ptree.parent())-1):
- ... assert ptree.right_sibling() is ptree.parent()[i+1]
- ... assert len(ptree.treeposition()) > 0
- ... assert (ptree.treeposition() ==
- ... ptree.parent().treeposition() + (ptree.parent_index(),))
- ... assert ptree.root() is not ptree
- ... assert ptree.root() is not None
- ... assert ptree.root() is ptree.parent().root()
- ... assert ptree.root()[ptree.treeposition()] is ptree
- ... else:
- ... assert ptree.parent_index() is None
- ... assert ptree.left_sibling() is None
- ... assert ptree.right_sibling() is None
- ... assert ptree.root() is ptree
- ... assert ptree.treeposition() == ()
- ... # Check ptree's children's methods:
- ... for i, child in enumerate(ptree):
- ... if isinstance(child, Tree):
- ... # pcheck parent() & parent_index() methods
- ... assert child.parent() is ptree
- ... assert child.parent_index() == i
- ... # pcheck sibling methods
- ... if i == 0:
- ... assert child.left_sibling() is None
- ... else:
- ... assert child.left_sibling() is ptree[i-1]
- ... if i == len(ptree)-1:
- ... assert child.right_sibling() is None
- ... else:
- ... assert child.right_sibling() is ptree[i+1]
- ... if print_ptrees:
- ... print('ok!', end=' ')
- ... for ptree in print_ptrees: print(ptree)
- ... else:
- ... print('ok!')
-
-Run our test function on a variety of newly-created trees:
-
- >>> pcheck(make_ptree('(A)'))
- ok! (A )
- >>> pcheck(make_ptree('(A (B (C (D) (E f)) g) h)'))
- ok! (A (B (C (D ) (E f)) g) h)
- >>> pcheck(make_ptree('(A (B) (C c) (D d d) (E e e e))'))
- ok! (A (B ) (C c) (D d d) (E e e e))
- >>> pcheck(make_ptree('(A (B) (C (c)) (D (d) (d)) (E (e) (e) (e)))'))
- ok! (A (B ) (C (c )) (D (d ) (d )) (E (e ) (e ) (e )))
-
-Run our test function after performing various tree-modification
-operations:
-
-**__delitem__()**
-
- >>> ptree = make_ptree('(A (B (C (D) (E f) (Q p)) g) h)')
- >>> e = ptree[0,0,1]
- >>> del ptree[0,0,1]; pcheck(ptree); pcheck(e)
- ok! (A (B (C (D ) (Q p)) g) h)
- ok! (E f)
- >>> del ptree[0,0,0]; pcheck(ptree)
- ok! (A (B (C (Q p)) g) h)
- >>> del ptree[0,1]; pcheck(ptree)
- ok! (A (B (C (Q p))) h)
- >>> del ptree[-1]; pcheck(ptree)
- ok! (A (B (C (Q p))))
- >>> del ptree[-100]
- Traceback (most recent call last):
- . . .
- IndexError: index out of range
- >>> del ptree[()]
- Traceback (most recent call last):
- . . .
- IndexError: The tree position () may not be deleted.
-
- >>> # With slices:
- >>> ptree = make_ptree('(A (B c) (D e) f g (H i) j (K l))')
- >>> b = ptree[0]
- >>> del ptree[0:0]; pcheck(ptree)
- ok! (A (B c) (D e) f g (H i) j (K l))
- >>> del ptree[:1]; pcheck(ptree); pcheck(b)
- ok! (A (D e) f g (H i) j (K l))
- ok! (B c)
- >>> del ptree[-2:]; pcheck(ptree)
- ok! (A (D e) f g (H i))
- >>> del ptree[1:3]; pcheck(ptree)
- ok! (A (D e) (H i))
- >>> ptree = make_ptree('(A (B c) (D e) f g (H i) j (K l))')
- >>> del ptree[5:1000]; pcheck(ptree)
- ok! (A (B c) (D e) f g (H i))
- >>> del ptree[-2:1000]; pcheck(ptree)
- ok! (A (B c) (D e) f)
- >>> del ptree[-100:1]; pcheck(ptree)
- ok! (A (D e) f)
- >>> ptree = make_ptree('(A (B c) (D e) f g (H i) j (K l))')
- >>> del ptree[1:-2:2]; pcheck(ptree)
- ok! (A (B c) f (H i) j (K l))
-
-**__setitem__()**
-
- >>> ptree = make_ptree('(A (B (C (D) (E f) (Q p)) g) h)')
- >>> d, e, q = ptree[0,0]
- >>> ptree[0,0,0] = 'x'; pcheck(ptree); pcheck(d)
- ok! (A (B (C x (E f) (Q p)) g) h)
- ok! (D )
- >>> ptree[0,0,1] = make_ptree('(X (Y z))'); pcheck(ptree); pcheck(e)
- ok! (A (B (C x (X (Y z)) (Q p)) g) h)
- ok! (E f)
- >>> ptree[1] = d; pcheck(ptree)
- ok! (A (B (C x (X (Y z)) (Q p)) g) (D ))
- >>> ptree[-1] = 'x'; pcheck(ptree)
- ok! (A (B (C x (X (Y z)) (Q p)) g) x)
- >>> ptree[-100] = 'y'
- Traceback (most recent call last):
- . . .
- IndexError: index out of range
- >>> ptree[()] = make_ptree('(X y)')
- Traceback (most recent call last):
- . . .
- IndexError: The tree position () may not be assigned to.
-
- >>> # With slices:
- >>> ptree = make_ptree('(A (B c) (D e) f g (H i) j (K l))')
- >>> b = ptree[0]
- >>> ptree[0:0] = ('x', make_ptree('(Y)')); pcheck(ptree)
- ok! (A x (Y ) (B c) (D e) f g (H i) j (K l))
- >>> ptree[2:6] = (); pcheck(ptree); pcheck(b)
- ok! (A x (Y ) (H i) j (K l))
- ok! (B c)
- >>> ptree[-2:] = ('z', 'p'); pcheck(ptree)
- ok! (A x (Y ) (H i) z p)
- >>> ptree[1:3] = [make_ptree('(X)') for x in range(10)]; pcheck(ptree)
- ok! (A x (X ) (X ) (X ) (X ) (X ) (X ) (X ) (X ) (X ) (X ) z p)
- >>> ptree[5:1000] = []; pcheck(ptree)
- ok! (A x (X ) (X ) (X ) (X ))
- >>> ptree[-2:1000] = ['n']; pcheck(ptree)
- ok! (A x (X ) (X ) n)
- >>> ptree[-100:1] = [make_ptree('(U v)')]; pcheck(ptree)
- ok! (A (U v) (X ) (X ) n)
- >>> ptree[-1:] = (make_ptree('(X)') for x in range(3)); pcheck(ptree)
- ok! (A (U v) (X ) (X ) (X ) (X ) (X ))
- >>> ptree[1:-2:2] = ['x', 'y']; pcheck(ptree)
- ok! (A (U v) x (X ) y (X ) (X ))
-
-**append()**
-
- >>> ptree = make_ptree('(A (B (C (D) (E f) (Q p)) g) h)')
- >>> ptree.append('x'); pcheck(ptree)
- ok! (A (B (C (D ) (E f) (Q p)) g) h x)
- >>> ptree.append(make_ptree('(X (Y z))')); pcheck(ptree)
- ok! (A (B (C (D ) (E f) (Q p)) g) h x (X (Y z)))
-
-**extend()**
-
- >>> ptree = make_ptree('(A (B (C (D) (E f) (Q p)) g) h)')
- >>> ptree.extend(['x', 'y', make_ptree('(X (Y z))')]); pcheck(ptree)
- ok! (A (B (C (D ) (E f) (Q p)) g) h x y (X (Y z)))
- >>> ptree.extend([]); pcheck(ptree)
- ok! (A (B (C (D ) (E f) (Q p)) g) h x y (X (Y z)))
- >>> ptree.extend(make_ptree('(X)') for x in range(3)); pcheck(ptree)
- ok! (A (B (C (D ) (E f) (Q p)) g) h x y (X (Y z)) (X ) (X ) (X ))
-
-**insert()**
-
- >>> ptree = make_ptree('(A (B (C (D) (E f) (Q p)) g) h)')
- >>> ptree.insert(0, make_ptree('(X (Y z))')); pcheck(ptree)
- ok! (A (X (Y z)) (B (C (D ) (E f) (Q p)) g) h)
- >>> ptree.insert(-1, make_ptree('(X (Y z))')); pcheck(ptree)
- ok! (A (X (Y z)) (B (C (D ) (E f) (Q p)) g) (X (Y z)) h)
- >>> ptree.insert(-4, make_ptree('(X (Y z))')); pcheck(ptree)
- ok! (A (X (Y z)) (X (Y z)) (B (C (D ) (E f) (Q p)) g) (X (Y z)) h)
- >>> # Note: as with ``list``, inserting at a negative index that
- >>> # gives a position before the start of the list does *not*
- >>> # raise an IndexError exception; it just inserts at 0.
- >>> ptree.insert(-400, make_ptree('(X (Y z))')); pcheck(ptree)
- ok! (A
- (X (Y z))
- (X (Y z))
- (X (Y z))
- (B (C (D ) (E f) (Q p)) g)
- (X (Y z))
- h)
-
-**pop()**
-
- >>> ptree = make_ptree('(A (B (C (D) (E f) (Q p)) g) h)')
- >>> ptree[0,0].pop(1); pcheck(ptree)
- ParentedTree('E', ['f'])
- ok! (A (B (C (D ) (Q p)) g) h)
- >>> ptree[0].pop(-1); pcheck(ptree)
- 'g'
- ok! (A (B (C (D ) (Q p))) h)
- >>> ptree.pop(); pcheck(ptree)
- 'h'
- ok! (A (B (C (D ) (Q p))))
- >>> ptree.pop(-100)
- Traceback (most recent call last):
- . . .
- IndexError: index out of range
-
-**remove()**
-
- >>> ptree = make_ptree('(A (B (C (D) (E f) (Q p)) g) h)')
- >>> e = ptree[0,0,1]
- >>> ptree[0,0].remove(ptree[0,0,1]); pcheck(ptree); pcheck(e)
- ok! (A (B (C (D ) (Q p)) g) h)
- ok! (E f)
- >>> ptree[0,0].remove(make_ptree('(Q p)')); pcheck(ptree)
- ok! (A (B (C (D )) g) h)
- >>> ptree[0,0].remove(make_ptree('(Q p)'))
- Traceback (most recent call last):
- . . .
- ValueError: ParentedTree('Q', ['p']) is not in list
- >>> ptree.remove('h'); pcheck(ptree)
- ok! (A (B (C (D )) g))
- >>> ptree.remove('h');
- Traceback (most recent call last):
- . . .
- ValueError: 'h' is not in list
- >>> # remove() removes the first subtree that is equal (==) to the
- >>> # given tree, which may not be the identical tree we give it:
- >>> ptree = make_ptree('(A (X x) (Y y) (X x))')
- >>> x1, y, x2 = ptree
- >>> ptree.remove(ptree[-1]); pcheck(ptree)
- ok! (A (Y y) (X x))
- >>> print(x1.parent()); pcheck(x1)
- None
- ok! (X x)
- >>> print(x2.parent())
- (A (Y y) (X x))
-
-Test that a tree cannot be given multiple parents:
-
- >>> ptree = make_ptree('(A (X x) (Y y) (Z z))')
- >>> ptree[0] = ptree[1]
- Traceback (most recent call last):
- . . .
- ValueError: Can not insert a subtree that already has a parent.
- >>> pcheck()
- ok!
-
-[more to be written]
-
-
-ImmutableParentedTree Regression Tests
---------------------------------------
-
- >>> iptree = ImmutableParentedTree.convert(ptree)
- >>> type(iptree)
- <class 'nltk.tree.ImmutableParentedTree'>
- >>> del iptree[0]
- Traceback (most recent call last):
- . . .
- ValueError: ImmutableParentedTree may not be modified
- >>> iptree.set_label('newnode')
- Traceback (most recent call last):
- . . .
- ValueError: ImmutableParentedTree may not be modified
-
-
-MultiParentedTree Regression Tests
-----------------------------------
-Keep track of all trees that we create (including subtrees) using this
-variable:
-
- >>> all_mptrees = []
-
-Define a helper function to create new multi-parented trees:
-
- >>> def make_mptree(s):
- ... mptree = MultiParentedTree.convert(Tree.fromstring(s))
- ... all_mptrees.extend(t for t in mptree.subtrees()
- ... if isinstance(t, Tree))
- ... return mptree
-
-Define a test function that examines every subtree in ``all_mptrees`` and
-checks that all six of its methods are defined correctly. If any mptrees are
-passed as arguments, then they are printed.
-
- >>> def mpcheck(*print_mptrees):
- ... def has(seq, val): # uses identity comparison
- ... for item in seq:
- ... if item is val: return True
- ... return False
- ... for mptree in all_mptrees:
- ... # Check mptree's methods.
- ... if len(mptree.parents()) == 0:
- ... assert len(mptree.left_siblings()) == 0
- ... assert len(mptree.right_siblings()) == 0
- ... assert len(mptree.roots()) == 1
- ... assert mptree.roots()[0] is mptree
- ... assert mptree.treepositions(mptree) == [()]
- ... left_siblings = right_siblings = ()
- ... roots = {id(mptree): 1}
- ... else:
- ... roots = dict((id(r), 0) for r in mptree.roots())
- ... left_siblings = mptree.left_siblings()
- ... right_siblings = mptree.right_siblings()
- ... for parent in mptree.parents():
- ... for i in mptree.parent_indices(parent):
- ... assert parent[i] is mptree
- ... # check left siblings
- ... if i > 0:
- ... for j in range(len(left_siblings)):
- ... if left_siblings[j] is parent[i-1]:
- ... del left_siblings[j]
- ... break
- ... else:
- ... assert 0, 'sibling not found!'
- ... # check right siblings
- ... if i < (len(parent)-1):
- ... for j in range(len(right_siblings)):
- ... if right_siblings[j] is parent[i+1]:
- ... del right_siblings[j]
- ... break
- ... else:
- ... assert 0, 'sibling not found!'
- ... # check roots
- ... for root in parent.roots():
- ... assert id(root) in roots, 'missing root'
- ... roots[id(root)] += 1
- ... # check that we don't have any unexplained values
- ... assert len(left_siblings)==0, 'unexpected sibling'
- ... assert len(right_siblings)==0, 'unexpected sibling'
- ... for v in roots.values(): assert v>0, roots #'unexpected root'
- ... # check treepositions
- ... for root in mptree.roots():
- ... for treepos in mptree.treepositions(root):
- ... assert root[treepos] is mptree
- ... # Check mptree's children's methods:
- ... for i, child in enumerate(mptree):
- ... if isinstance(child, Tree):
- ... # mpcheck parent() & parent_index() methods
- ... assert has(child.parents(), mptree)
- ... assert i in child.parent_indices(mptree)
- ... # mpcheck sibling methods
- ... if i > 0:
- ... assert has(child.left_siblings(), mptree[i-1])
- ... if i < len(mptree)-1:
- ... assert has(child.right_siblings(), mptree[i+1])
- ... if print_mptrees:
- ... print('ok!', end=' ')
- ... for mptree in print_mptrees: print(mptree)
- ... else:
- ... print('ok!')
-
-Run our test function on a variety of newly-created trees:
-
- >>> mpcheck(make_mptree('(A)'))
- ok! (A )
- >>> mpcheck(make_mptree('(A (B (C (D) (E f)) g) h)'))
- ok! (A (B (C (D ) (E f)) g) h)
- >>> mpcheck(make_mptree('(A (B) (C c) (D d d) (E e e e))'))
- ok! (A (B ) (C c) (D d d) (E e e e))
- >>> mpcheck(make_mptree('(A (B) (C (c)) (D (d) (d)) (E (e) (e) (e)))'))
- ok! (A (B ) (C (c )) (D (d ) (d )) (E (e ) (e ) (e )))
- >>> subtree = make_mptree('(A (B (C (D) (E f)) g) h)')
-
-Including some trees that contain multiple parents:
-
- >>> mpcheck(MultiParentedTree('Z', [subtree, subtree]))
- ok! (Z (A (B (C (D ) (E f)) g) h) (A (B (C (D ) (E f)) g) h))
-
-Run our test function after performing various tree-modification
-operations (n.b., these are the same tests that we ran for
-`ParentedTree`, above; thus, none of these trees actually *uses*
-multiple parents.)
-
-**__delitem__()**
-
- >>> mptree = make_mptree('(A (B (C (D) (E f) (Q p)) g) h)')
- >>> e = mptree[0,0,1]
- >>> del mptree[0,0,1]; mpcheck(mptree); mpcheck(e)
- ok! (A (B (C (D ) (Q p)) g) h)
- ok! (E f)
- >>> del mptree[0,0,0]; mpcheck(mptree)
- ok! (A (B (C (Q p)) g) h)
- >>> del mptree[0,1]; mpcheck(mptree)
- ok! (A (B (C (Q p))) h)
- >>> del mptree[-1]; mpcheck(mptree)
- ok! (A (B (C (Q p))))
- >>> del mptree[-100]
- Traceback (most recent call last):
- . . .
- IndexError: index out of range
- >>> del mptree[()]
- Traceback (most recent call last):
- . . .
- IndexError: The tree position () may not be deleted.
-
- >>> # With slices:
- >>> mptree = make_mptree('(A (B c) (D e) f g (H i) j (K l))')
- >>> b = mptree[0]
- >>> del mptree[0:0]; mpcheck(mptree)
- ok! (A (B c) (D e) f g (H i) j (K l))
- >>> del mptree[:1]; mpcheck(mptree); mpcheck(b)
- ok! (A (D e) f g (H i) j (K l))
- ok! (B c)
- >>> del mptree[-2:]; mpcheck(mptree)
- ok! (A (D e) f g (H i))
- >>> del mptree[1:3]; mpcheck(mptree)
- ok! (A (D e) (H i))
- >>> mptree = make_mptree('(A (B c) (D e) f g (H i) j (K l))')
- >>> del mptree[5:1000]; mpcheck(mptree)
- ok! (A (B c) (D e) f g (H i))
- >>> del mptree[-2:1000]; mpcheck(mptree)
- ok! (A (B c) (D e) f)
- >>> del mptree[-100:1]; mpcheck(mptree)
- ok! (A (D e) f)
- >>> mptree = make_mptree('(A (B c) (D e) f g (H i) j (K l))')
- >>> del mptree[1:-2:2]; mpcheck(mptree)
- ok! (A (B c) f (H i) j (K l))
-
-**__setitem__()**
-
- >>> mptree = make_mptree('(A (B (C (D) (E f) (Q p)) g) h)')
- >>> d, e, q = mptree[0,0]
- >>> mptree[0,0,0] = 'x'; mpcheck(mptree); mpcheck(d)
- ok! (A (B (C x (E f) (Q p)) g) h)
- ok! (D )
- >>> mptree[0,0,1] = make_mptree('(X (Y z))'); mpcheck(mptree); mpcheck(e)
- ok! (A (B (C x (X (Y z)) (Q p)) g) h)
- ok! (E f)
- >>> mptree[1] = d; mpcheck(mptree)
- ok! (A (B (C x (X (Y z)) (Q p)) g) (D ))
- >>> mptree[-1] = 'x'; mpcheck(mptree)
- ok! (A (B (C x (X (Y z)) (Q p)) g) x)
- >>> mptree[-100] = 'y'
- Traceback (most recent call last):
- . . .
- IndexError: index out of range
- >>> mptree[()] = make_mptree('(X y)')
- Traceback (most recent call last):
- . . .
- IndexError: The tree position () may not be assigned to.
-
- >>> # With slices:
- >>> mptree = make_mptree('(A (B c) (D e) f g (H i) j (K l))')
- >>> b = mptree[0]
- >>> mptree[0:0] = ('x', make_mptree('(Y)')); mpcheck(mptree)
- ok! (A x (Y ) (B c) (D e) f g (H i) j (K l))
- >>> mptree[2:6] = (); mpcheck(mptree); mpcheck(b)
- ok! (A x (Y ) (H i) j (K l))
- ok! (B c)
- >>> mptree[-2:] = ('z', 'p'); mpcheck(mptree)
- ok! (A x (Y ) (H i) z p)
- >>> mptree[1:3] = [make_mptree('(X)') for x in range(10)]; mpcheck(mptree)
- ok! (A x (X ) (X ) (X ) (X ) (X ) (X ) (X ) (X ) (X ) (X ) z p)
- >>> mptree[5:1000] = []; mpcheck(mptree)
- ok! (A x (X ) (X ) (X ) (X ))
- >>> mptree[-2:1000] = ['n']; mpcheck(mptree)
- ok! (A x (X ) (X ) n)
- >>> mptree[-100:1] = [make_mptree('(U v)')]; mpcheck(mptree)
- ok! (A (U v) (X ) (X ) n)
- >>> mptree[-1:] = (make_mptree('(X)') for x in range(3)); mpcheck(mptree)
- ok! (A (U v) (X ) (X ) (X ) (X ) (X ))
- >>> mptree[1:-2:2] = ['x', 'y']; mpcheck(mptree)
- ok! (A (U v) x (X ) y (X ) (X ))
-
-**append()**
-
- >>> mptree = make_mptree('(A (B (C (D) (E f) (Q p)) g) h)')
- >>> mptree.append('x'); mpcheck(mptree)
- ok! (A (B (C (D ) (E f) (Q p)) g) h x)
- >>> mptree.append(make_mptree('(X (Y z))')); mpcheck(mptree)
- ok! (A (B (C (D ) (E f) (Q p)) g) h x (X (Y z)))
-
-**extend()**
-
- >>> mptree = make_mptree('(A (B (C (D) (E f) (Q p)) g) h)')
- >>> mptree.extend(['x', 'y', make_mptree('(X (Y z))')]); mpcheck(mptree)
- ok! (A (B (C (D ) (E f) (Q p)) g) h x y (X (Y z)))
- >>> mptree.extend([]); mpcheck(mptree)
- ok! (A (B (C (D ) (E f) (Q p)) g) h x y (X (Y z)))
- >>> mptree.extend(make_mptree('(X)') for x in range(3)); mpcheck(mptree)
- ok! (A (B (C (D ) (E f) (Q p)) g) h x y (X (Y z)) (X ) (X ) (X ))
-
-**insert()**
-
- >>> mptree = make_mptree('(A (B (C (D) (E f) (Q p)) g) h)')
- >>> mptree.insert(0, make_mptree('(X (Y z))')); mpcheck(mptree)
- ok! (A (X (Y z)) (B (C (D ) (E f) (Q p)) g) h)
- >>> mptree.insert(-1, make_mptree('(X (Y z))')); mpcheck(mptree)
- ok! (A (X (Y z)) (B (C (D ) (E f) (Q p)) g) (X (Y z)) h)
- >>> mptree.insert(-4, make_mptree('(X (Y z))')); mpcheck(mptree)
- ok! (A (X (Y z)) (X (Y z)) (B (C (D ) (E f) (Q p)) g) (X (Y z)) h)
- >>> # Note: as with ``list``, inserting at a negative index that
- >>> # gives a position before the start of the list does *not*
- >>> # raise an IndexError exception; it just inserts at 0.
- >>> mptree.insert(-400, make_mptree('(X (Y z))')); mpcheck(mptree)
- ok! (A
- (X (Y z))
- (X (Y z))
- (X (Y z))
- (B (C (D ) (E f) (Q p)) g)
- (X (Y z))
- h)
-
-**pop()**
-
- >>> mptree = make_mptree('(A (B (C (D) (E f) (Q p)) g) h)')
- >>> mptree[0,0].pop(1); mpcheck(mptree)
- MultiParentedTree('E', ['f'])
- ok! (A (B (C (D ) (Q p)) g) h)
- >>> mptree[0].pop(-1); mpcheck(mptree)
- 'g'
- ok! (A (B (C (D ) (Q p))) h)
- >>> mptree.pop(); mpcheck(mptree)
- 'h'
- ok! (A (B (C (D ) (Q p))))
- >>> mptree.pop(-100)
- Traceback (most recent call last):
- . . .
- IndexError: index out of range
-
-**remove()**
-
- >>> mptree = make_mptree('(A (B (C (D) (E f) (Q p)) g) h)')
- >>> e = mptree[0,0,1]
- >>> mptree[0,0].remove(mptree[0,0,1]); mpcheck(mptree); mpcheck(e)
- ok! (A (B (C (D ) (Q p)) g) h)
- ok! (E f)
- >>> mptree[0,0].remove(make_mptree('(Q p)')); mpcheck(mptree)
- ok! (A (B (C (D )) g) h)
- >>> mptree[0,0].remove(make_mptree('(Q p)'))
- Traceback (most recent call last):
- . . .
- ValueError: MultiParentedTree('Q', ['p']) is not in list
- >>> mptree.remove('h'); mpcheck(mptree)
- ok! (A (B (C (D )) g))
- >>> mptree.remove('h');
- Traceback (most recent call last):
- . . .
- ValueError: 'h' is not in list
- >>> # remove() removes the first subtree that is equal (==) to the
- >>> # given tree, which may not be the identical tree we give it:
- >>> mptree = make_mptree('(A (X x) (Y y) (X x))')
- >>> x1, y, x2 = mptree
- >>> mptree.remove(mptree[-1]); mpcheck(mptree)
- ok! (A (Y y) (X x))
- >>> print([str(p) for p in x1.parents()])
- []
- >>> print([str(p) for p in x2.parents()])
- ['(A (Y y) (X x))']
-
-
-ImmutableMultiParentedTree Regression Tests
--------------------------------------------
-
- >>> imptree = ImmutableMultiParentedTree.convert(mptree)
- >>> type(imptree)
- <class 'nltk.tree.ImmutableMultiParentedTree'>
- >>> del imptree[0]
- Traceback (most recent call last):
- . . .
- ValueError: ImmutableMultiParentedTree may not be modified
- >>> imptree.set_label('newnode')
- Traceback (most recent call last):
- . . .
- ValueError: ImmutableMultiParentedTree may not be modified
-
-
-ProbabilisticTree Regression Tests
-----------------------------------
-
- >>> prtree = ProbabilisticTree("S", [ProbabilisticTree("NP", ["N"], prob=0.3)], prob=0.6)
- >>> print(prtree)
- (S (NP N)) (p=0.6)
- >>> import copy
- >>> prtree == copy.deepcopy(prtree) == prtree.copy(deep=True) == prtree.copy()
- True
- >>> prtree[0] is prtree.copy()[0]
- True
- >>> prtree[0] is prtree.copy(deep=True)[0]
- False
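-
-That is, ``copy()`` is shallow (the child subtrees are shared with the
-original), while ``copy(deep=True)`` and ``copy.deepcopy`` rebuild the
-children.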
-
- >>> imprtree = ImmutableProbabilisticTree.convert(prtree)
- >>> type(imprtree)
- <class 'nltk.tree.ImmutableProbabilisticTree'>
- >>> del imprtree[0]
- Traceback (most recent call last):
- . . .
- ValueError: ImmutableProbabilisticTree may not be modified
- >>> imprtree.set_label('newnode')
- Traceback (most recent call last):
- . . .
- ValueError: ImmutableProbabilisticTree may not be modified
-
-
-Squashed Bugs
-=============
-
-This used to discard the ``(B b)`` subtree (fixed in svn 6270):
-
- >>> print(Tree.fromstring('((A a) (B b))'))
- ( (A a) (B b))
-
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-========================================================
- Unit tests for nltk.treeprettyprinter.TreePrettyPrinter
-========================================================
-
- >>> from nltk.tree import Tree
- >>> from nltk.treeprettyprinter import TreePrettyPrinter
-
-Tree no. 2170 from ``nltk.corpus.treebank``:
-
- >>> tree = Tree.fromstring(
- ... '(S (NP-SBJ (PRP I)) (VP (VBP feel) (ADJP-PRD (RB pretty) '
- ... '(JJ good)) (PP-CLR (IN about) (NP (PRP it)))) (. .))')
- >>> tpp = TreePrettyPrinter(tree)
- >>> print(tpp.text())
- S
- __________________________|_____________________
- | VP |
- | ____________________|___________ |
- | | | PP-CLR |
- | | | _____|_____ |
- NP-SBJ | ADJP-PRD | NP |
- | | _______|______ | | |
- PRP VBP RB JJ IN PRP .
- | | | | | | |
- I feel pretty good about it .
-
- >>> print(tpp.text(unicodelines=True))
- S
- ┌──────────────────────────┼─────────────────────┐
- │ VP │
- │ ┌─────────────┬──────┴───────────┐ │
- │ │ │ PP-CLR │
- │ │ │ ┌─────┴─────┐ │
- NP-SBJ │ ADJP-PRD │ NP │
- │ │ ┌───────┴──────┐ │ │ │
- PRP VBP RB JJ IN PRP .
- │ │ │ │ │ │ │
- I feel pretty good about it .
-
-A tree with long labels:
-
- >>> tree = Tree.fromstring(
- ... '(sentence (plural-noun-phrase (plural-noun Superconductors)) '
- ... '(verb-phrase (plural-verb conduct) '
- ... '(noun-phrase (singular-noun electricity))))')
- >>> tpp = TreePrettyPrinter(tree)
- >>> print(tpp.text(abbreviate=8, nodedist=2))
- sentence
- __________|__________
- | verb-phr.
- | __________|__________
- plural-n. | noun-phr.
- | | |
- plural-n. plural-v. singular.
- | | |
- Supercon. conduct electric.
-
- >>> print(tpp.text(maxwidth=8, nodedist=2))
- sentence
- _________|________
- | verb-
- | phrase
- | ________|_________
- plural- | noun-
- noun- | phrase
- phrase | |
- | | |
- plural- plural- singular-
- noun verb noun
- | | |
- Supercon conduct electric
- ductors ity
-
-A discontinuous tree:
-
- >>> tree = Tree.fromstring(
- ... '(top (punct 8) (smain (noun 0) (verb 1) (inf (verb 5) (inf (verb 6) '
- ... '(conj (inf (pp (prep 2) (np (det 3) (noun 4))) (verb 7)) (inf (verb 9)) '
- ... '(vg 10) (inf (verb 11)))))) (punct 12))', read_leaf=int)
- >>> sentence = ('Ze had met haar moeder kunnen gaan winkelen ,'
- ... ' zwemmen of terrassen .'.split())
- >>> tpp = TreePrettyPrinter(tree, sentence)
- >>> print(tpp.text())
- top
- _____|______________________________________________
- smain | |
- _______________________________|_____ | |
- | | inf | |
- | | _____|____ | |
- | | | inf | |
- | | | ____|_____ | |
- | | | | conj | |
- | | _____ | ___ | _________|______ | __________________ |
- | | inf | | | | | | |
- | | _________|_____ | ___ | _________ | | | | |
- | | pp | | | | | | | |
- | | ____|____ | | | | | | | |
- | | | np | | | | inf | inf |
- | | | ____|____ | | | | | | | |
- noun verb prep det noun verb verb verb punct verb vg verb punct
- | | | | | | | | | | | | |
- Ze had met haar moeder kunnen gaan winkelen , zwemmen of terrassen .
-
- >>> print(tpp.text(unicodelines=True))
- top
- ┌─────┴──────────────────┬───────────────────────────┐
- smain │ │
- ┌────┬──────────────────────────┴─────┐ │ │
- │ │ inf │ │
- │ │ ┌─────┴────┐ │ │
- │ │ │ inf │ │
- │ │ │ ┌────┴─────┐ │ │
- │ │ │ │ conj │ │
- │ │ ┌───── │ ─── │ ─────────┴────── │ ─────┬─────┬──────┐ │
- │ │ inf │ │ │ │ │ │ │
- │ │ ┌─────────┴───── │ ─── │ ─────────┐ │ │ │ │ │
- │ │ pp │ │ │ │ │ │ │ │
- │ │ ┌────┴────┐ │ │ │ │ │ │ │ │
- │ │ │ np │ │ │ │ inf │ inf │
- │ │ │ ┌────┴────┐ │ │ │ │ │ │ │ │
- noun verb prep det noun verb verb verb punct verb vg verb punct
- │ │ │ │ │ │ │ │ │ │ │ │ │
- Ze had met haar moeder kunnen gaan winkelen , zwemmen of terrassen .
-
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
--------------------------------------------
-Unit tests for the TreeTransformation class
--------------------------------------------
-
- >>> from copy import deepcopy
- >>> from nltk.tree import *
- >>> from nltk.treetransforms import *
-
- >>> tree_string = "(TOP (S (S (VP (VBN Turned) (ADVP (RB loose)) (PP (IN in) (NP (NP (NNP Shane) (NNP Longman) (POS 's)) (NN trading) (NN room))))) (, ,) (NP (DT the) (NN yuppie) (NNS dealers)) (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right)))) (. .)))"
-
- >>> tree = Tree.fromstring(tree_string)
- >>> print(tree)
- (TOP
- (S
- (S
- (VP
- (VBN Turned)
- (ADVP (RB loose))
- (PP
- (IN in)
- (NP
- (NP (NNP Shane) (NNP Longman) (POS 's))
- (NN trading)
- (NN room)))))
- (, ,)
- (NP (DT the) (NN yuppie) (NNS dealers))
- (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right))))
- (. .)))
-
-Make a copy of the original tree and collapse the subtrees with only one child:
-
- >>> collapsedTree = deepcopy(tree)
- >>> collapse_unary(collapsedTree)
- >>> print(collapsedTree)
- (TOP
- (S
- (S+VP
- (VBN Turned)
- (ADVP (RB loose))
- (PP
- (IN in)
- (NP
- (NP (NNP Shane) (NNP Longman) (POS 's))
- (NN trading)
- (NN room))))
- (, ,)
- (NP (DT the) (NN yuppie) (NNS dealers))
- (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right))))
- (. .)))
-
- >>> collapsedTree2 = deepcopy(tree)
- >>> collapse_unary(collapsedTree2, collapsePOS=True, collapseRoot=True)
- >>> print(collapsedTree2)
- (TOP+S
- (S+VP
- (VBN Turned)
- (ADVP+RB loose)
- (PP
- (IN in)
- (NP
- (NP (NNP Shane) (NNP Longman) (POS 's))
- (NN trading)
- (NN room))))
- (, ,)
- (NP (DT the) (NN yuppie) (NNS dealers))
- (VP (AUX do) (NP (NP+RB little) (ADJP+RB right)))
- (. .))
-
-Convert the tree to Chomsky Normal Form, i.e. each subtree has either two
-subtree children or a single leaf value. The conversion can be performed
-using either left- or right-factoring.
-
- >>> cnfTree = deepcopy(collapsedTree)
- >>> chomsky_normal_form(cnfTree, factor='left')
- >>> print(cnfTree)
- (TOP
- (S
- (S|<S+VP-,-NP-VP>
- (S|<S+VP-,-NP>
- (S|<S+VP-,>
- (S+VP
- (S+VP|<VBN-ADVP> (VBN Turned) (ADVP (RB loose)))
- (PP
- (IN in)
- (NP
- (NP|<NP-NN>
- (NP
- (NP|<NNP-NNP> (NNP Shane) (NNP Longman))
- (POS 's))
- (NN trading))
- (NN room))))
- (, ,))
- (NP (NP|<DT-NN> (DT the) (NN yuppie)) (NNS dealers)))
- (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right)))))
- (. .)))
-
- >>> cnfTree = deepcopy(collapsedTree)
- >>> chomsky_normal_form(cnfTree, factor='right')
- >>> print(cnfTree)
- (TOP
- (S
- (S+VP
- (VBN Turned)
- (S+VP|<ADVP-PP>
- (ADVP (RB loose))
- (PP
- (IN in)
- (NP
- (NP (NNP Shane) (NP|<NNP-POS> (NNP Longman) (POS 's)))
- (NP|<NN-NN> (NN trading) (NN room))))))
- (S|<,-NP-VP-.>
- (, ,)
- (S|<NP-VP-.>
- (NP (DT the) (NP|<NN-NNS> (NN yuppie) (NNS dealers)))
- (S|<VP-.>
- (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right))))
- (. .))))))
-
-Employ some Markov smoothing to make the artificial node labels a bit more
-readable. See the treetransforms.py documentation for more details.
-
- >>> markovTree = deepcopy(collapsedTree)
- >>> chomsky_normal_form(markovTree, horzMarkov=2, vertMarkov=1)
- >>> print(markovTree)
- (TOP
- (S^<TOP>
- (S+VP^<S>
- (VBN Turned)
- (S+VP|<ADVP-PP>^<S>
- (ADVP^<S+VP> (RB loose))
- (PP^<S+VP>
- (IN in)
- (NP^<PP>
- (NP^<NP>
- (NNP Shane)
- (NP|<NNP-POS>^<NP> (NNP Longman) (POS 's)))
- (NP|<NN-NN>^<PP> (NN trading) (NN room))))))
- (S|<,-NP>^<TOP>
- (, ,)
- (S|<NP-VP>^<TOP>
- (NP^<S> (DT the) (NP|<NN-NNS>^<S> (NN yuppie) (NNS dealers)))
- (S|<VP-.>^<TOP>
- (VP^<S>
- (AUX do)
- (NP^<VP> (NP^<NP> (RB little)) (ADJP^<NP> (RB right))))
- (. .))))))
-
-Convert the transformed tree back to its original form
-
- >>> un_chomsky_normal_form(markovTree)
- >>> tree == markovTree
- True
-
+++ /dev/null
-# Natural Language Toolkit: Language Model Unit Tests
-#
-# Copyright (C) 2001-2020 NLTK Project
-# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
-# URL: <http://nltk.org/>
-# For license information, see LICENSE.TXT
-
-import unittest
-
-from nltk import FreqDist
-from nltk.lm import NgramCounter
-from nltk.util import everygrams
-
-
-class NgramCounterTests(unittest.TestCase):
- """Tests for NgramCounter that only involve lookup, no modification."""
-
- @classmethod
- def setUpClass(cls):
-
- text = [list("abcd"), list("egdbe")]
- cls.trigram_counter = NgramCounter(
- (everygrams(sent, max_len=3) for sent in text)
- )
- cls.bigram_counter = NgramCounter(
- (everygrams(sent, max_len=2) for sent in text)
- )
-
- def test_N(self):
- self.assertEqual(self.bigram_counter.N(), 16)
- self.assertEqual(self.trigram_counter.N(), 21)
-
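The totals asserted in test_N follow directly from how everygrams counts: a sentence of length n contributes n unigrams and n - 1 bigrams with max_len=2, plus n - 2 trigrams with max_len=3. A quick self-contained check (editorial sketch, not part of the deleted file):

    from nltk.lm import NgramCounter
    from nltk.util import everygrams

    text = [list("abcd"), list("egdbe")]
    # "abcd": 4 + 3 = 7; "egdbe": 5 + 4 = 9; total 16 events for the bigram counter
    assert NgramCounter(everygrams(s, max_len=2) for s in text).N() == 16
    # adding trigrams contributes 2 + 3 = 5 more, giving 21
    assert NgramCounter(everygrams(s, max_len=3) for s in text).N() == 21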
- def test_counter_len_changes_with_lookup(self):
- self.assertEqual(len(self.bigram_counter), 2)
- _ = self.bigram_counter[50]  # looking up an unseen ngram order creates it on the fly, so len() grows
- self.assertEqual(len(self.bigram_counter), 3)
-
- def test_ngram_order_access_unigrams(self):
- self.assertEqual(self.bigram_counter[1], self.bigram_counter.unigrams)
-
- def test_ngram_conditional_freqdist(self):
- expected_trigram_contexts = [
- ("a", "b"),
- ("b", "c"),
- ("e", "g"),
- ("g", "d"),
- ("d", "b"),
- ]
- expected_bigram_contexts = [("a",), ("b",), ("d",), ("e",), ("c",), ("g",)]
-
- bigrams = self.trigram_counter[2]
- trigrams = self.trigram_counter[3]
-
- self.assertCountEqual(expected_bigram_contexts, bigrams.conditions())
- self.assertCountEqual(expected_trigram_contexts, trigrams.conditions())
-
- def test_bigram_counts_seen_ngrams(self):
- b_given_a_count = 1
- c_given_b_count = 1
-
- self.assertEqual(b_given_a_count, self.bigram_counter[["a"]]["b"])
- self.assertEqual(c_given_b_count, self.bigram_counter[["b"]]["c"])
-
- def test_bigram_counts_unseen_ngrams(self):
- z_given_b_count = 0
-
- self.assertEqual(z_given_b_count, self.bigram_counter[["b"]]["z"])
-
- def test_unigram_counts_seen_words(self):
- expected_count_b = 2
-
- self.assertEqual(expected_count_b, self.bigram_counter["b"])
-
- def test_unigram_counts_completely_unseen_words(self):
- unseen_count = 0
-
- self.assertEqual(unseen_count, self.bigram_counter["z"])
-
-
-class NgramCounterTrainingTests(unittest.TestCase):
- def setUp(self):
- self.counter = NgramCounter()
-
- def test_empty_string(self):
- test = NgramCounter("")
- self.assertNotIn(2, test)
- self.assertEqual(test[1], FreqDist())
-
- def test_empty_list(self):
- test = NgramCounter([])
- self.assertNotIn(2, test)
- self.assertEqual(test[1], FreqDist())
-
- def test_None(self):
- test = NgramCounter(None)
- self.assertNotIn(2, test)
- self.assertEqual(test[1], FreqDist())
-
- def test_train_on_unigrams(self):
- words = list("abcd")
- counter = NgramCounter([[(w,) for w in words]])
-
- self.assertFalse(bool(counter[3]))
- self.assertFalse(bool(counter[2]))
- self.assertCountEqual(words, counter[1].keys())
-
- def test_train_on_illegal_sentences(self):
- str_sent = ["Check", "this", "out", "!"]
- list_sent = [["Check", "this"], ["this", "out"], ["out", "!"]]
-
- with self.assertRaises(TypeError):
- NgramCounter([str_sent])
-
- with self.assertRaises(TypeError):
- NgramCounter([list_sent])
-
- def test_train_on_bigrams(self):
- bigram_sent = [("a", "b"), ("c", "d")]
- counter = NgramCounter([bigram_sent])
-
- self.assertFalse(bool(counter[3]))
-
- def test_train_on_mix(self):
- mixed_sent = [("a", "b"), ("c", "d"), ("e", "f", "g"), ("h",)]
- counter = NgramCounter([mixed_sent])
- unigrams = ["h"]
- bigram_contexts = [("a",), ("c",)]
- trigram_contexts = [("e", "f")]
-
- self.assertCountEqual(unigrams, counter[1].keys())
- self.assertCountEqual(bigram_contexts, counter[2].keys())
- self.assertCountEqual(trigram_contexts, counter[3].keys())
+++ /dev/null
-# Natural Language Toolkit: Language Model Unit Tests
-#
-# Copyright (C) 2001-2020 NLTK Project
-# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
-# URL: <http://nltk.org/>
-# For license information, see LICENSE.TXT
-
-
-import math
-import unittest
-
-
-from nltk.lm import (
- Vocabulary,
- MLE,
- Lidstone,
- Laplace,
- WittenBellInterpolated,
- KneserNeyInterpolated,
-)
-from nltk.lm.preprocessing import padded_everygrams
-
-
-def _prepare_test_data(ngram_order):
- return (
- Vocabulary(["a", "b", "c", "d", "z", "<s>", "</s>"], unk_cutoff=1),
- [
- list(padded_everygrams(ngram_order, sent))
- for sent in (list("abcd"), list("egadbe"))
- ],
- )
-
-
-class ParametrizeTestsMeta(type):
- """Metaclass for generating parametrized tests."""
-
- def __new__(cls, name, bases, dct):
- contexts = (
- ("a",),
- ("c",),
- (u"<s>",),
- ("b",),
- (u"<UNK>",),
- ("d",),
- ("e",),
- ("r",),
- ("w",),
- )
- for i, c in enumerate(contexts):
- dct["test_sumto1_{0}".format(i)] = cls.add_sum_to_1_test(c)
- scores = dct.get("score_tests", [])
- for i, (word, context, expected_score) in enumerate(scores):
- dct["test_score_{0}".format(i)] = cls.add_score_test(
- word, context, expected_score
- )
- return super().__new__(cls, name, bases, dct)
-
- @classmethod
- def add_score_test(cls, word, context, expected_score):
- message = "word='{word}', context={context}"
-
- def test_method(self):
- score = self.model.score(word, context)
- self.assertAlmostEqual(
- score, expected_score, msg=message.format(**locals()), places=4
- )
-
- return test_method
-
- @classmethod
- def add_sum_to_1_test(cls, context):
- def test(self):
- s = sum(self.model.score(w, context) for w in self.model.vocab)
- self.assertAlmostEqual(s, 1.0, msg="The context is {}".format(context))
-
- return test
-
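ParametrizeTestsMeta attaches one test method per entry of a subclass's score_tests list, plus one sum-to-one test per context, at class-creation time. A minimal sketch of the resulting surface (Demo is hypothetical, and assumes a model fitted in setUp as the real subclasses below do):

    class Demo(unittest.TestCase, metaclass=ParametrizeTestsMeta):
        score_tests = [("a", None, 0.5)]
        # The metaclass has now defined Demo.test_score_0, which checks
        # self.model.score("a", None) == 0.5 to four places, and nine
        # Demo.test_sumto1_<i> methods, one per context in `contexts`.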
-
-class MleBigramTests(unittest.TestCase, metaclass=ParametrizeTestsMeta):
- """Unit tests for MLE ngram model."""
-
- score_tests = [
- ("d", ["c"], 1),
- # Unseen ngrams should yield 0
- ("d", ["e"], 0),
- # In-vocabulary but unseen unigrams should also score 0
- ("z", None, 0),
- # N unigrams = 14
- # count('a') = 2
- ("a", None, 2.0 / 14),
- # 'y' is out of vocabulary, so it is scored as <UNK>: count('<UNK>') = 3
- ("y", None, 3.0 / 14),
- ]
-
- def setUp(self):
- vocab, training_text = _prepare_test_data(2)
- self.model = MLE(2, vocabulary=vocab)
- self.model.fit(training_text)
-
- def test_logscore_zero_score(self):
- # logscore of unseen ngrams should be -inf
- logscore = self.model.logscore("d", ["e"])
-
- self.assertTrue(math.isinf(logscore))
-
- def test_entropy_perplexity_seen(self):
- # ngrams seen during training
- trained = [
- ("<s>", "a"),
- ("a", "b"),
- ("b", "<UNK>"),
- ("<UNK>", "a"),
- ("a", "d"),
- ("d", "</s>"),
- ]
- # Ngram = Log score
- # <s>, a = -1
- # a, b = -1
- # b, UNK = -1
- # UNK, a = -1.585
- # a, d = -1
- # d, </s> = -1
- # TOTAL logscores = -6.585
- # - AVG logscores = 1.0975
- H = 1.0975
- perplexity = 2.1398
-
- self.assertAlmostEqual(H, self.model.entropy(trained), places=4)
- self.assertAlmostEqual(perplexity, self.model.perplexity(trained), places=4)
-
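For reference, the arithmetic in the comments above uses the standard base-2 cross-entropy and perplexity definitions over the N scored ngrams:

    H(T) = -\frac{1}{N} \sum_{i=1}^{N} \log_2 p(w_i \mid c_i), \qquad PP(T) = 2^{H(T)}

so a total log score of -6.585 over 6 bigrams gives H = 6.585 / 6 = 1.0975 and PP = 2^{1.0975} ≈ 2.1398, matching the expected values.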
- def test_entropy_perplexity_unseen(self):
- # In MLE, even one unseen ngram should make entropy and perplexity infinite
- untrained = [("<s>", "a"), ("a", "c"), ("c", "d"), ("d", "</s>")]
-
- self.assertTrue(math.isinf(self.model.entropy(untrained)))
- self.assertTrue(math.isinf(self.model.perplexity(untrained)))
-
- def test_entropy_perplexity_unigrams(self):
- # word = score, log score
- # <s> = 0.1429, -2.8074
- # a = 0.1429, -2.8074
- # c = 0.0714, -3.8073
- # UNK = 0.2143, -2.2224
- # d = 0.1429, -2.8074
- # c = 0.0714, -3.8073
- # </s> = 0.1429, -2.8074
- # TOTAL logscores = -21.6243
- # - AVG logscores = 3.0095
- H = 3.0095
- perplexity = 8.0529
-
- text = [("<s>",), ("a",), ("c",), ("-",), ("d",), ("c",), ("</s>",)]
-
- self.assertAlmostEqual(H, self.model.entropy(text), places=4)
- self.assertAlmostEqual(perplexity, self.model.perplexity(text), places=4)
-
-
-class MleTrigramTests(unittest.TestCase, metaclass=ParametrizeTestsMeta):
- """MLE trigram model tests"""
-
- score_tests = [
- # count(d | b, c) = 1
- # count(b, c) = 1
- ("d", ("b", "c"), 1),
- # count(d | c) = 1
- # count(c) = 1
- ("d", ["c"], 1),
- # total number of tokens is 18, of which "a" occurred 2 times
- ("a", None, 2.0 / 18),
- # in vocabulary but unseen
- ("z", None, 0),
- # out of vocabulary should use "UNK" score
- ("y", None, 3.0 / 18),
- ]
-
- def setUp(self):
- vocab, training_text = _prepare_test_data(3)
- self.model = MLE(3, vocabulary=vocab)
- self.model.fit(training_text)
-
-
-class LidstoneBigramTests(unittest.TestCase, metaclass=ParametrizeTestsMeta):
- """Unit tests for Lidstone class"""
-
- score_tests = [
- # count(d | c) = 1
- # *count(d | c) = 1.1
- # Count(w | c for w in vocab) = 1
- # *Count(w | c for w in vocab) = 1.8
- ("d", ["c"], 1.1 / 1.8),
- # Total unigrams: 14
- # Vocab size: 8
- # Denominator: 14 + 0.8 = 14.8
- # count("a") = 2
- # *count("a") = 2.1
- ("a", None, 2.1 / 14.8),
- # in vocabulary but unseen
- # count("z") = 0
- # *count("z") = 0.1
- ("z", None, 0.1 / 14.8),
- # out of vocabulary should use "UNK" score
- # count("<UNK>") = 3
- # *count("<UNK>") = 3.1
- ("y", None, 3.1 / 14.8),
- ]
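All of the expectations above instantiate the Lidstone estimate with gamma = 0.1 and vocabulary size V = 8 (the seven listed tokens plus <UNK>):

    P_{\mathrm{Lid}}(w \mid c) = \frac{C(c, w) + \gamma}{C(c) + \gamma V}

e.g. P(d | c) = (1 + 0.1) / (1 + 0.1 * 8) = 1.1 / 1.8; the unigram cases use the 14 training tokens as the context count, giving denominators of 14 + 0.8.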
-
- def setUp(self):
- vocab, training_text = _prepare_test_data(2)
- self.model = Lidstone(0.1, 2, vocabulary=vocab)
- self.model.fit(training_text)
-
- def test_gamma(self):
- self.assertEqual(0.1, self.model.gamma)
-
- def test_entropy_perplexity(self):
- text = [
- ("<s>", "a"),
- ("a", "c"),
- ("c", "<UNK>"),
- ("<UNK>", "d"),
- ("d", "c"),
- ("c", "</s>"),
- ]
- # Unlike MLE this should be able to handle completely novel ngrams
- # Ngram = score, log score
- # <s>, a = 0.3929, -1.3479
- # a, c = 0.0357, -4.8074
- # c, UNK = 0.0(5), -4.1699
- # UNK, d = 0.0263, -5.2479
- # d, c = 0.0357, -4.8074
- # c, </s> = 0.0(5), -4.1699
- # TOTAL logscore: −24.5504
- # - AVG logscore: 4.0917
- H = 4.0917
- perplexity = 17.0504
- self.assertAlmostEqual(H, self.model.entropy(text), places=4)
- self.assertAlmostEqual(perplexity, self.model.perplexity(text), places=4)
-
-
-class LidstoneTrigramTests(unittest.TestCase, metaclass=ParametrizeTestsMeta):
- score_tests = [
- # Logic behind this is the same as for bigram model
- ("d", ["c"], 1.1 / 1.8),
- # if we choose a word that hasn't appeared after (b, c)
- ("e", ["c"], 0.1 / 1.8),
- # Trigram score now
- ("d", ["b", "c"], 1.1 / 1.8),
- ("e", ["b", "c"], 0.1 / 1.8),
- ]
-
- def setUp(self):
- vocab, training_text = _prepare_test_data(3)
- self.model = Lidstone(0.1, 3, vocabulary=vocab)
- self.model.fit(training_text)
-
-
-class LaplaceBigramTests(unittest.TestCase, metaclass=ParametrizeTestsMeta):
- """Unit tests for Laplace class"""
-
- score_tests = [
- # basic sanity-check:
- # count(d | c) = 1
- # *count(d | c) = 2
- # Count(w | c for w in vocab) = 1
- # *Count(w | c for w in vocab) = 9
- ("d", ["c"], 2.0 / 9),
- # Total unigrams: 14
- # Vocab size: 8
- # Denominator: 14 + 8 = 22
- # count("a") = 2
- # *count("a") = 3
- ("a", None, 3.0 / 22),
- # in vocabulary but unseen
- # count("z") = 0
- # *count("z") = 1
- ("z", None, 1.0 / 22),
- # out of vocabulary should use "UNK" score
- # count("<UNK>") = 3
- # *count("<UNK>") = 4
- ("y", None, 4.0 / 22),
- ]
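Laplace smoothing is the gamma = 1 special case of the Lidstone formula above, which is what test_gamma below pins down:

    P_{\mathrm{Lap}}(w \mid c) = \frac{C(c, w) + 1}{C(c) + V}

hence P(d | c) = (1 + 1) / (1 + 8) = 2/9, and the unigram denominator becomes 14 + 8 = 22.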
-
- def setUp(self):
- vocab, training_text = _prepare_test_data(2)
- self.model = Laplace(2, vocabulary=vocab)
- self.model.fit(training_text)
-
- def test_gamma(self):
- # Make sure the gamma is set to 1
- self.assertEqual(1, self.model.gamma)
-
- def test_entropy_perplexity(self):
- text = [
- ("<s>", "a"),
- ("a", "c"),
- ("c", "<UNK>"),
- ("<UNK>", "d"),
- ("d", "c"),
- ("c", "</s>"),
- ]
- # Unlike MLE this should be able to handle completely novel ngrams
- # Ngram = score, log score
- # <s>, a = 0.2, -2.3219
- # a, c = 0.1, -3.3219
- # c, UNK = 0.(1), -3.1699
- # UNK, d = 0.(09), -3.4594
- # d, c = 0.1, -3.3219
- # c, </s> = 0.(1), -3.1699
- # Total logscores: −18.7651
- # - AVG logscores: 3.1275
- H = 3.1275
- perplexity = 8.7393
- self.assertAlmostEqual(H, self.model.entropy(text), places=4)
- self.assertAlmostEqual(perplexity, self.model.perplexity(text), places=4)
-
-
-class WittenBellInterpolatedTrigramTests(unittest.TestCase, metaclass=ParametrizeTestsMeta):
- def setUp(self):
- vocab, training_text = _prepare_test_data(3)
- self.model = WittenBellInterpolated(3, vocabulary=vocab)
- self.model.fit(training_text)
-
- score_tests = [
- # For unigram scores by default revert to MLE
- # Total unigrams: 18
- # count('c'): 1
- ("c", None, 1.0 / 18),
- # in vocabulary but unseen
- # count("z") = 0
- ("z", None, 0.0 / 18),
- # out of vocabulary should use "UNK" score
- # count("<UNK>") = 3
- ("y", None, 3.0 / 18),
- # gamma(['b']) = 0.1111
- # mle.score('c', ['b']) = 0.5
- # (1 - gamma) * mle('c'|'b') + gamma * unigram('c') ~= 0.8889 * 0.5 + 0.1111 / 18 ~= 0.4506
- ("c", ["b"], (1 - 0.1111) * 0.5 + 0.1111 * 1 / 18),
- # building on that, let's try 'a b c' as the trigram
- # gamma(['a', 'b']) = 0.0667
- # mle("c", ["a", "b"]) = 1
- ("c", ["a", "b"], (1 - 0.0667) + 0.0667 * ((1 - 0.1111) * 0.5 + 0.1111 / 18)),
- # The ngram 'z b c' was not seen, so we should simply revert to
- # the score of the ngram 'b c'. See issue #2332.
- ("c", ["z", "b"], ((1 - 0.1111) * 0.5 + 0.1111 / 18)),
- ]
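The recursion sketched in these comments is standard Witten-Bell interpolation; the gamma values (0.1111 for ['b'], 0.0667 for ['a', 'b']) are taken from the comments rather than re-derived here:

    P_{\mathrm{WB}}(w \mid c) = (1 - \gamma(c)) \, P_{\mathrm{MLE}}(w \mid c) + \gamma(c) \, P_{\mathrm{WB}}(w \mid c')

where c' drops the leftmost word of the context. Plugging gamma(['b']) = 0.1111 and P_MLE('c' | 'b') = 0.5 into the bigram case reproduces the expected value, and the trigram case nests one more level of the same formula.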
-
-
-class KneserNeyInterpolatedTrigramTests(unittest.TestCase, metaclass=ParametrizeTestsMeta):
- def setUp(self):
- vocab, training_text = _prepare_test_data(3)
- self.model = KneserNeyInterpolated(3, vocabulary=vocab)
- self.model.fit(training_text)
-
- score_tests = [
- # For unigram scores revert to uniform
- # Vocab size: 8
- # count('c'): 1
- ("c", None, 1.0 / 8),
- # in vocabulary but unseen, still uses uniform
- ("z", None, 1 / 8),
- # out of vocabulary should use "UNK" score, i.e. again uniform
- ("y", None, 1.0 / 8),
- # alpha = count('bc') - discount = 1 - 0.1 = 0.9
- # gamma(['b']) = discount * number of unique words that follow ['b'] = 0.1 * 2
- # normalizer = total number of bigrams with this context = 2
- # the final should be: (alpha + gamma * unigram_score("c"))
- ("c", ["b"], (0.9 + 0.2 * (1 / 8)) / 2),
- # building on that, let's try 'a b c' as the trigram
- # alpha = count('abc') - discount = 1 - 0.1 = 0.9
- # gamma(['a', 'b']) = 0.1 * 1
- # normalizer = total number of trigrams with prefix "ab" = 1 => we can ignore it!
- ("c", ["a", "b"], 0.9 + 0.1 * ((0.9 + 0.2 * (1 / 8)) / 2)),
- # The ngram 'z b c' was not seen, so we should simply revert to
- # the score of the ngram 'b c'. See issue #2332.
- ("c", ["z", "b"], ((0.9 + 0.2 * (1 / 8)) / 2)),
- ]
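Written out, the alpha/gamma/normalizer comments implement absolute discounting with continuation back-off (discount d = 0.1, uniform 1/V at the lowest order, per the unigram cases above):

    P_{\mathrm{KN}}(w \mid c) = \frac{\max(C(c, w) - d,\, 0) + d \, N_{1+}(c\,\bullet) \, P_{\mathrm{KN}}(w \mid c')}{C(c)}

For ('c', ['b']) this is ((1 - 0.1) + 0.1 * 2 * (1/8)) / 2 = (0.9 + 0.2 * (1/8)) / 2, and in the trigram case C(['a', 'b']) = 1, so the division disappears, exactly as the comments note.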
-
-
-class NgramModelTextGenerationTests(unittest.TestCase):
- """Using MLE model, generate some text."""
-
- def setUp(self):
- vocab, training_text = _prepare_test_data(3)
- self.model = MLE(3, vocabulary=vocab)
- self.model.fit(training_text)
-
- def test_generate_one_no_context(self):
- self.assertEqual(self.model.generate(random_seed=3), "<UNK>")
-
- def test_generate_one_limiting_context(self):
- # We don't need random_seed for contexts with only one continuation
- self.assertEqual(self.model.generate(text_seed=["c"]), "d")
- self.assertEqual(self.model.generate(text_seed=["b", "c"]), "d")
- self.assertEqual(self.model.generate(text_seed=["a", "c"]), "d")
-
- def test_generate_one_varied_context(self):
- # When context doesn't limit our options enough, seed the random choice
- self.assertEqual(
- self.model.generate(text_seed=("a", "<s>"), random_seed=2), "a"
- )
-
- def test_generate_cycle(self):
- # Add a cycle to the model: bd -> b, db -> d
- more_training_text = [list(padded_everygrams(self.model.order, list("bdbdbd")))]
- self.model.fit(more_training_text)
- # Test that we can escape the cycle
- self.assertEqual(
- self.model.generate(7, text_seed=("b", "d"), random_seed=5),
- ["b", "d", "b", "d", "b", "d", "</s>"],
- )
-
- def test_generate_with_text_seed(self):
- self.assertEqual(
- self.model.generate(5, text_seed=("<s>", "e"), random_seed=3),
- ["<UNK>", "a", "d", "b", "<UNK>"],
- )
-
- def test_generate_oov_text_seed(self):
- self.assertEqual(
- self.model.generate(text_seed=("aliens",), random_seed=3),
- self.model.generate(text_seed=("<UNK>",), random_seed=3),
- )
-
- def test_generate_None_text_seed(self):
- # should crash with type error when we try to look it up in vocabulary
- with self.assertRaises(TypeError):
- self.model.generate(text_seed=(None,))
-
- # This will work
- self.assertEqual(
- self.model.generate(text_seed=None, random_seed=3),
- self.model.generate(random_seed=3),
- )
+++ /dev/null
-# Natural Language Toolkit: Language Model Unit Tests
-#
-# Copyright (C) 2001-2020 NLTK Project
-# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
-# URL: <http://nltk.org/>
-# For license information, see LICENSE.TXT
-import unittest
-
-from nltk.lm.preprocessing import padded_everygram_pipeline
-
-
-class TestPreprocessing(unittest.TestCase):
- def test_padded_everygram_pipeline(self):
- expected_train = [
- [
- ("<s>",),
- ("a",),
- ("b",),
- ("c",),
- ("</s>",),
- ("<s>", "a"),
- ("a", "b"),
- ("b", "c"),
- ("c", "</s>"),
- ]
- ]
- expected_vocab = ["<s>", "a", "b", "c", "</s>"]
- train_data, vocab_data = padded_everygram_pipeline(2, [["a", "b", "c"]])
- self.assertEqual([list(sent) for sent in train_data], expected_train)
- self.assertEqual(list(vocab_data), expected_vocab)
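The pipeline under test composes sentence padding with everygram extraction; the expected lists can be reproduced piecewise with the underlying nltk.lm.preprocessing helpers (a sketch):

    from nltk.lm.preprocessing import pad_both_ends, padded_everygrams

    sent = ["a", "b", "c"]
    list(pad_both_ends(sent, n=2))    # ['<s>', 'a', 'b', 'c', '</s>']
    list(padded_everygrams(2, sent))  # the nine 1- and 2-gram tuples in expected_train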
+++ /dev/null
-# Natural Language Toolkit: Language Model Unit Tests
-#
-# Copyright (C) 2001-2020 NLTK Project
-# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
-# URL: <http://nltk.org/>
-# For license information, see LICENSE.TXT
-
-import unittest
-from collections import Counter
-
-from nltk.lm import Vocabulary
-
-
-class NgramModelVocabularyTests(unittest.TestCase):
- """tests Vocabulary Class"""
-
- @classmethod
- def setUpClass(cls):
- cls.vocab = Vocabulary(
- ["z", "a", "b", "c", "f", "d", "e", "g", "a", "d", "b", "e", "w"],
- unk_cutoff=2,
- )
-
- def test_truthiness(self):
- self.assertTrue(self.vocab)
-
- def test_cutoff_value_set_correctly(self):
- self.assertEqual(self.vocab.cutoff, 2)
-
- def test_unable_to_change_cutoff(self):
- with self.assertRaises(AttributeError):
- self.vocab.cutoff = 3
-
- def test_cutoff_setter_checks_value(self):
- with self.assertRaises(ValueError) as exc_info:
- Vocabulary("abc", unk_cutoff=0)
- expected_error_msg = "Cutoff value cannot be less than 1. Got: 0"
- self.assertEqual(expected_error_msg, str(exc_info.exception))
-
- def test_counts_set_correctly(self):
- self.assertEqual(self.vocab.counts["a"], 2)
- self.assertEqual(self.vocab.counts["b"], 2)
- self.assertEqual(self.vocab.counts["c"], 1)
-
- def test_membership_check_respects_cutoff(self):
- # a was seen 2 times, so it should be considered part of the vocabulary
- self.assertTrue("a" in self.vocab)
- # "c" was seen once, it shouldn't be considered part of the vocab
- self.assertFalse("c" in self.vocab)
- # "z" was never seen at all, also shouldn't be considered in the vocab
- self.assertFalse("z" in self.vocab)
-
- def test_vocab_len_respects_cutoff(self):
- # Vocab size is the number of unique tokens that occur at least as often
- # as the cutoff value, plus 1 to account for unknown words.
- self.assertEqual(5, len(self.vocab))
-
- def test_vocab_iter_respects_cutoff(self):
- vocab_counts = ["a", "b", "c", "d", "e", "f", "g", "w", "z"]
- vocab_items = ["a", "b", "d", "e", "<UNK>"]
-
- self.assertCountEqual(vocab_counts, list(self.vocab.counts.keys()))
- self.assertCountEqual(vocab_items, list(self.vocab))
-
- def test_update_empty_vocab(self):
- empty = Vocabulary(unk_cutoff=2)
- self.assertEqual(len(empty), 0)
- self.assertFalse(empty)
- self.assertIn(empty.unk_label, empty)
-
- empty.update(list("abcde"))
- self.assertIn(empty.unk_label, empty)
-
- def test_lookup(self):
- self.assertEqual(self.vocab.lookup("a"), "a")
- self.assertEqual(self.vocab.lookup("c"), "<UNK>")
-
- def test_lookup_iterables(self):
- self.assertEqual(self.vocab.lookup(["a", "b"]), ("a", "b"))
- self.assertEqual(self.vocab.lookup(("a", "b")), ("a", "b"))
- self.assertEqual(self.vocab.lookup(("a", "c")), ("a", "<UNK>"))
- self.assertEqual(
- self.vocab.lookup(map(str, range(3))), ("<UNK>", "<UNK>", "<UNK>")
- )
-
- def test_lookup_empty_iterables(self):
- self.assertEqual(self.vocab.lookup(()), ())
- self.assertEqual(self.vocab.lookup([]), ())
- self.assertEqual(self.vocab.lookup(iter([])), ())
- self.assertEqual(self.vocab.lookup(n for n in range(0, 0)), ())
-
- def test_lookup_recursive(self):
- self.assertEqual(
- self.vocab.lookup([["a", "b"], ["a", "c"]]), (("a", "b"), ("a", "<UNK>"))
- )
- self.assertEqual(self.vocab.lookup([["a", "b"], "c"]), (("a", "b"), "<UNK>"))
- self.assertEqual(self.vocab.lookup([[[[["a", "b"]]]]]), ((((("a", "b"),),),),))
-
- def test_lookup_None(self):
- with self.assertRaises(TypeError):
- self.vocab.lookup(None)
- with self.assertRaises(TypeError):
- list(self.vocab.lookup([None, None]))
-
- def test_lookup_int(self):
- with self.assertRaises(TypeError):
- self.vocab.lookup(1)
- with self.assertRaises(TypeError):
- list(self.vocab.lookup([1, 2]))
-
- def test_lookup_empty_str(self):
- self.assertEqual(self.vocab.lookup(""), "<UNK>")
-
- def test_equality(self):
- v1 = Vocabulary(["a", "b", "c"], unk_cutoff=1)
- v2 = Vocabulary(["a", "b", "c"], unk_cutoff=1)
- v3 = Vocabulary(["a", "b", "c"], unk_cutoff=1, unk_label="blah")
- v4 = Vocabulary(["a", "b"], unk_cutoff=1)
-
- self.assertEqual(v1, v2)
- self.assertNotEqual(v1, v3)
- self.assertNotEqual(v1, v4)
-
- def test_str(self):
- self.assertEqual(
- str(self.vocab), "<Vocabulary with cutoff=2 unk_label='<UNK>' and 5 items>"
- )
-
- def test_creation_with_counter(self):
- self.assertEqual(
- self.vocab,
- Vocabulary(
- Counter(
- ["z", "a", "b", "c", "f", "d", "e", "g", "a", "d", "b", "e", "w"]
- ),
- unk_cutoff=2,
- ),
- )
+++ /dev/null
-# -*- coding: utf-8 -*-
-"""
-Unit tests for nltk.metrics.aline
-"""
-
-
-import unittest
-
-from nltk.metrics import aline
-
-
-class TestAline(unittest.TestCase):
- """
- Test Aline algorithm for aligning phonetic sequences
- """
-
- def test_aline(self):
- result = aline.align('θin', 'tenwis')
- expected = [
- [('θ', 't'), ('i', 'e'), ('n', 'n'), ('-', 'w'), ('-', 'i'), ('-', 's')]
- ]
-
- self.assertEqual(result, expected)
-
- result = aline.align('jo', 'ʒə')
- expected = [[('j', 'ʒ'), ('o', 'ə')]]
-
- self.assertEqual(result, expected)
-
- result = aline.align('pematesiweni', 'pematesewen')
- expected = [
- [
- ('p', 'p'),
- ('e', 'e'),
- ('m', 'm'),
- ('a', 'a'),
- ('t', 't'),
- ('e', 'e'),
- ('s', 's'),
- ('i', 'e'),
- ('w', 'w'),
- ('e', 'e'),
- ('n', 'n'),
- ('i', '-'),
- ]
- ]
-
- self.assertEqual(result, expected)
-
- result = aline.align('tuwθ', 'dentis')
- expected = [
- [
- ('t', 'd'),
- ('u', 'e'),
- ('w', '-'),
- ('-', 'n'),
- ('-', 't'),
- ('-', 'i'),
- ('θ', 's'),
- ]
- ]
-
- self.assertEqual(result, expected)
-
- def test_aline_delta(self):
- """
- Test aline for computing the difference between two segments
- """
- result = aline.delta('p', 'q')
- expected = 20.0
-
- self.assertEqual(result, expected)
-
- result = aline.delta('a', 'A')
- expected = 0.0
-
- self.assertEqual(result, expected)
+++ /dev/null
-# -*- coding: utf-8 -*-
-"""
-Tests for Brill tagger.
-"""
-
-import unittest
-
-from nltk.tag import UnigramTagger, brill, brill_trainer
-from nltk.tbl import Template
-from nltk.corpus import treebank
-
-from nltk.tbl import demo
-
-
-class TestBrill(unittest.TestCase):
- def test_pos_template(self):
- train_sents = treebank.tagged_sents()[:1000]
- tagger = UnigramTagger(train_sents)
- trainer = brill_trainer.BrillTaggerTrainer(
- tagger, [brill.Template(brill.Pos([-1]))]
- )
- brill_tagger = trainer.train(train_sents)
- # Example from https://github.com/nltk/nltk/issues/769
- result = brill_tagger.tag('This is a foo bar sentence'.split())
- expected = [
- ('This', 'DT'),
- ('is', 'VBZ'),
- ('a', 'DT'),
- ('foo', None),
- ('bar', 'NN'),
- ('sentence', None),
- ]
- self.assertEqual(result, expected)
-
- @unittest.skip("Should be tested in __main__ of nltk.tbl.demo")
- def test_brill_demo(self):
- demo()
+++ /dev/null
-import unittest
-from nltk import ConditionalFreqDist, tokenize
-
-class TestEmptyCondFreq(unittest.TestCase):
- def test_tabulate(self):
- empty = ConditionalFreqDist()
- self.assertEqual(empty.conditions(), [])
- try:
- empty.tabulate(conditions="BUG") # nonexistent keys shouldn't be added
- except Exception: # a bare except would also swallow SystemExit/KeyboardInterrupt
- pass
- self.assertEqual(empty.conditions(), [])
-
-
- def test_plot(self):
- empty = ConditionalFreqDist()
- self.assertEqual(empty.conditions(), [])
- try:
- empty.plot(conditions=["BUG"]) # nonexistent keys shouldn't be added
- except Exception:
- pass
- self.assertEqual(empty.conditions(), [])
-
- def test_increment(self):
- # make sure that we can still mutate cfd normally
- text = "cow cat mouse cat tiger"
- cfd = ConditionalFreqDist()
-
- # create cfd with word length as condition
- for word in tokenize.word_tokenize(text):
- condition = len(word)
- cfd[condition][word] += 1
-
- self.assertEqual(cfd.conditions(), [3,5])
-
- # incrementing previously unseen key is still possible
- cfd[2]['hi'] += 1
- self.assertEqual(set(cfd.conditions()),set([3,5,2])) # new condition added
- self.assertEqual(cfd[2]['hi'], 1) # key's frequency incremented from 0 (unseen) to 1
+++ /dev/null
-# -*- coding: utf-8 -*-
-import unittest
-import nltk
-from nltk.grammar import CFG
-
-
-class ChomskyNormalFormForCFGTest(unittest.TestCase):
- def test_simple(self):
- grammar = CFG.fromstring(
- """
- S -> NP VP
- PP -> P NP
- NP -> Det N | NP PP P
- VP -> V NP | VP PP
- VP -> Det
- Det -> 'a' | 'the'
- N -> 'dog' | 'cat'
- V -> 'chased' | 'sat'
- P -> 'on' | 'in'
- """
- )
- self.assertFalse(grammar.is_flexible_chomsky_normal_form())
- self.assertFalse(grammar.is_chomsky_normal_form())
- grammar = grammar.chomsky_normal_form(flexible=True)
- self.assertTrue(grammar.is_flexible_chomsky_normal_form())
- self.assertFalse(grammar.is_chomsky_normal_form())
-
- grammar2 = CFG.fromstring(
- """
- S -> NP VP
- NP -> VP N P
- VP -> P
- N -> 'dog' | 'cat'
- P -> 'on' | 'in'
- """
- )
- self.assertFalse(grammar2.is_flexible_chomsky_normal_form())
- self.assertFalse(grammar2.is_chomsky_normal_form())
- grammar2 = grammar2.chomsky_normal_form()
- self.assertTrue(grammar2.is_flexible_chomsky_normal_form())
- self.assertTrue(grammar2.is_chomsky_normal_form())
-
- def test_complex(self):
- grammar = nltk.data.load('grammars/large_grammars/atis.cfg')
- self.assertFalse(grammar.is_flexible_chomsky_normal_form())
- self.assertFalse(grammar.is_chomsky_normal_form())
- grammar = grammar.chomsky_normal_form(flexible=True)
- self.assertTrue(grammar.is_flexible_chomsky_normal_form())
- self.assertFalse(grammar.is_chomsky_normal_form())
+++ /dev/null
-# -*- coding: utf-8 -*-
-import unittest
-
-from nltk import RegexpParser
-
-
-class TestChunkRule(unittest.TestCase):
- def test_tag_pattern2re_pattern_quantifier(self):
- """Test for bug https://github.com/nltk/nltk/issues/1597
-
- Ensures that curly bracket quantifiers can be used inside a chunk rule.
- This type of quantifier has been used for the supplementary example
- in http://www.nltk.org/book/ch07.html#exploring-text-corpora.
- """
- sent = [
- ('The', 'AT'),
- ('September-October', 'NP'),
- ('term', 'NN'),
- ('jury', 'NN'),
- ('had', 'HVD'),
- ('been', 'BEN'),
- ('charged', 'VBN'),
- ('by', 'IN'),
- ('Fulton', 'NP-TL'),
- ('Superior', 'JJ-TL'),
- ('Court', 'NN-TL'),
- ('Judge', 'NN-TL'),
- ('Durwood', 'NP'),
- ('Pye', 'NP'),
- ('to', 'TO'),
- ('investigate', 'VB'),
- ('reports', 'NNS'),
- ('of', 'IN'),
- ('possible', 'JJ'),
- ('``', '``'),
- ('irregularities', 'NNS'),
- ("''", "''"),
- ('in', 'IN'),
- ('the', 'AT'),
- ('hard-fought', 'JJ'),
- ('primary', 'NN'),
- ('which', 'WDT'),
- ('was', 'BEDZ'),
- ('won', 'VBN'),
- ('by', 'IN'),
- ('Mayor-nominate', 'NN-TL'),
- ('Ivan', 'NP'),
- ('Allen', 'NP'),
- ('Jr.', 'NP'),
- ('.', '.'),
- ] # source: brown corpus
- cp = RegexpParser('CHUNK: {<N.*>{4,}}')
- tree = cp.parse(sent)
- assert (
- tree.pformat()
- == """(S
- The/AT
- September-October/NP
- term/NN
- jury/NN
- had/HVD
- been/BEN
- charged/VBN
- by/IN
- Fulton/NP-TL
- Superior/JJ-TL
- (CHUNK Court/NN-TL Judge/NN-TL Durwood/NP Pye/NP)
- to/TO
- investigate/VB
- reports/NNS
- of/IN
- possible/JJ
- ``/``
- irregularities/NNS
- ''/''
- in/IN
- the/AT
- hard-fought/JJ
- primary/NN
- which/WDT
- was/BEDZ
- won/VBN
- by/IN
- (CHUNK Mayor-nominate/NN-TL Ivan/NP Allen/NP Jr./NP)
- ./.)"""
- )
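The feature being exercised is ordinary regex repetition inside a tag pattern: {<N.*>{4,}} chunks any run of four or more tokens whose tags start with N. A minimal sketch on synthetic input:

    from nltk import RegexpParser

    cp = RegexpParser('CHUNK: {<N.*>{4,}}')
    toy = [('a', 'NN'), ('b', 'NNS'), ('c', 'NNP'), ('d', 'NN'), ('e', 'VB')]
    print(cp.parse(toy))
    # (S (CHUNK a/NN b/NNS c/NNP d/NN) e/VB)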
+++ /dev/null
-# -*- coding: utf-8 -*-
-"""
-Unit tests for nltk.classify. See also: nltk/test/classify.doctest
-"""
-from nose import SkipTest
-from nltk import classify
-
-TRAIN = [
- (dict(a=1, b=1, c=1), 'y'),
- (dict(a=1, b=1, c=1), 'x'),
- (dict(a=1, b=1, c=0), 'y'),
- (dict(a=0, b=1, c=1), 'x'),
- (dict(a=0, b=1, c=1), 'y'),
- (dict(a=0, b=0, c=1), 'y'),
- (dict(a=0, b=1, c=0), 'x'),
- (dict(a=0, b=0, c=0), 'x'),
- (dict(a=0, b=1, c=1), 'y'),
-]
-
-TEST = [
- (dict(a=1, b=0, c=1)), # unseen
- (dict(a=1, b=0, c=0)), # unseen
- (dict(a=0, b=1, c=1)), # seen 3 times, labels=y,y,x
- (dict(a=0, b=1, c=0)), # seen 1 time, label=x
-]
-
-RESULTS = [(0.16, 0.84), (0.46, 0.54), (0.41, 0.59), (0.76, 0.24)]
-
-
-def assert_classifier_correct(algorithm):
- try:
- classifier = classify.MaxentClassifier.train(
- TRAIN, algorithm, trace=0, max_iter=1000
- )
- except (LookupError, AttributeError) as e:
- raise SkipTest(str(e))
-
- for (px, py), featureset in zip(RESULTS, TEST):
- pdist = classifier.prob_classify(featureset)
- assert abs(pdist.prob('x') - px) < 1e-2, (pdist.prob('x'), px)
- assert abs(pdist.prob('y') - py) < 1e-2, (pdist.prob('y'), py)
-
-
-def test_megam():
- assert_classifier_correct('MEGAM')
-
-
-def test_tadm():
- assert_classifier_correct('TADM')
+++ /dev/null
-# -*- coding: utf-8 -*-
-import unittest
-
-from nltk.collocations import BigramCollocationFinder
-from nltk.metrics import BigramAssocMeasures
-
-## Test bigram counters with discontinuous bigrams and repeated words
-
-_EPSILON = 1e-8
-
-
-def close_enough(x, y):
- """Verify that two sequences of n-gram association values are within
- _EPSILON of each other.
- """
-
- for (x1, y1) in zip(x, y):
- if x1[0] != y1[0] or abs(x1[1] - y1[1]) > _EPSILON:
- return False
- return True
-
-
-class TestBigram(unittest.TestCase):
- def test_bigram2(self):
- sent = 'this this is is a a test test'.split()
-
- b = BigramCollocationFinder.from_words(sent)
-
- # python 2.6 does not have assertItemsEqual or assertListEqual
- self.assertEqual(
- sorted(b.ngram_fd.items()),
- sorted(
- [
- (('a', 'a'), 1),
- (('a', 'test'), 1),
- (('is', 'a'), 1),
- (('is', 'is'), 1),
- (('test', 'test'), 1),
- (('this', 'is'), 1),
- (('this', 'this'), 1),
- ]
- ),
- )
- self.assertEqual(
- sorted(b.word_fd.items()),
- sorted([('a', 2), ('is', 2), ('test', 2), ('this', 2)]),
- )
- self.assertTrue(
- len(sent) == sum(b.word_fd.values()) == sum(b.ngram_fd.values()) + 1
- )
- self.assertTrue(
- close_enough(
- sorted(b.score_ngrams(BigramAssocMeasures.pmi)),
- sorted(
- [
- (('a', 'a'), 1.0),
- (('a', 'test'), 1.0),
- (('is', 'a'), 1.0),
- (('is', 'is'), 1.0),
- (('test', 'test'), 1.0),
- (('this', 'is'), 1.0),
- (('this', 'this'), 1.0),
- ]
- ),
- )
- )
-
- def test_bigram3(self):
- sent = 'this this is is a a test test'.split()
-
- b = BigramCollocationFinder.from_words(sent, window_size=3)
- self.assertEqual(
- sorted(b.ngram_fd.items()),
- sorted(
- [
- (('a', 'test'), 3),
- (('is', 'a'), 3),
- (('this', 'is'), 3),
- (('a', 'a'), 1),
- (('is', 'is'), 1),
- (('test', 'test'), 1),
- (('this', 'this'), 1),
- ]
- ),
- )
- self.assertEqual(
- sorted(b.word_fd.items()),
- sorted([('a', 2), ('is', 2), ('test', 2), ('this', 2)]),
- )
- self.assertTrue(
- len(sent)
- == sum(b.word_fd.values())
- == (sum(b.ngram_fd.values()) + 2 + 1) / 2.0
- )
- self.assertTrue(
- close_enough(
- sorted(b.score_ngrams(BigramAssocMeasures.pmi)),
- sorted(
- [
- (('a', 'test'), 1.584962500721156),
- (('is', 'a'), 1.584962500721156),
- (('this', 'is'), 1.584962500721156),
- (('a', 'a'), 0.0),
- (('is', 'is'), 0.0),
- (('test', 'test'), 0.0),
- (('this', 'this'), 0.0),
- ]
- ),
- )
- )
-
- def test_bigram5(self):
- sent = 'this this is is a a test test'.split()
-
- b = BigramCollocationFinder.from_words(sent, window_size=5)
- self.assertEqual(
- sorted(b.ngram_fd.items()),
- sorted(
- [
- (('a', 'test'), 4),
- (('is', 'a'), 4),
- (('this', 'is'), 4),
- (('is', 'test'), 3),
- (('this', 'a'), 3),
- (('a', 'a'), 1),
- (('is', 'is'), 1),
- (('test', 'test'), 1),
- (('this', 'this'), 1),
- ]
- ),
- )
- self.assertEqual(
- sorted(b.word_fd.items()),
- sorted([('a', 2), ('is', 2), ('test', 2), ('this', 2)]),
- )
- self.assertTrue(
- len(sent)
- == sum(b.word_fd.values())
- == (sum(b.ngram_fd.values()) + 4 + 3 + 2 + 1) / 4.0
- )
- self.assertTrue(
- close_enough(
- sorted(b.score_ngrams(BigramAssocMeasures.pmi)),
- sorted(
- [
- (('a', 'test'), 1.0),
- (('is', 'a'), 1.0),
- (('this', 'is'), 1.0),
- (('is', 'test'), 0.5849625007211562),
- (('this', 'a'), 0.5849625007211562),
- (('a', 'a'), -1.0),
- (('is', 'is'), -1.0),
- (('test', 'test'), -1.0),
- (('this', 'this'), -1.0),
- ]
- ),
- )
- )
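The length identities in the assertTrue calls follow from window-based counting: over a sentence of n tokens, window_size = w scores \sum_{k=1}^{w-1} (n - k) = (w - 1)n - \binom{w}{2} pairs, so

    n = \frac{\sum_{g} C(g) + (w - 1) + (w - 2) + \dots + 1}{w - 1}

which is exactly the (sum + 2 + 1) / 2 and (sum + 4 + 3 + 2 + 1) / 4 forms used above for window sizes 3 and 5.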
+++ /dev/null
-# -*- coding: utf-8 -*-
-
-import unittest
-import contextlib
-import sys
-from io import StringIO
-
-from nose import with_setup
-
-from nltk.corpus import gutenberg
-from nltk.text import Text
-
-
-@contextlib.contextmanager
-def stdout_redirect(where):
- sys.stdout = where
- try:
- yield where
- finally:
- sys.stdout = sys.__stdout__
-
-
-class TestConcordance(unittest.TestCase):
- """Text constructed using: http://www.nltk.org/book/ch01.html"""
-
- @classmethod
- def setup_class(cls):
- cls.corpus = gutenberg.words('melville-moby_dick.txt')
-
- @classmethod
- def teardown_class(cls):
- pass
-
- def setUp(self):
- self.text = Text(TestConcordance.corpus)
- self.query = "monstrous"
- self.maxDiff = None
- self.list_out = [
- 'ong the former , one was of a most monstrous size . ... This came towards us , ',
- 'ON OF THE PSALMS . " Touching that monstrous bulk of the whale or ork we have r',
- 'll over with a heathenish array of monstrous clubs and spears . Some were thick',
- 'd as you gazed , and wondered what monstrous cannibal and savage could ever hav',
- 'that has survived the flood ; most monstrous and most mountainous ! That Himmal',
- 'they might scout at Moby Dick as a monstrous fable , or still worse and more de',
- 'th of Radney .\'" CHAPTER 55 Of the Monstrous Pictures of Whales . I shall ere l',
- 'ing Scenes . In connexion with the monstrous pictures of whales , I am strongly',
- 'ere to enter upon those still more monstrous stories of them which are to be fo',
- 'ght have been rummaged out of this monstrous cabinet there is no telling . But ',
- 'of Whale - Bones ; for Whales of a monstrous size are oftentimes cast up dead u',
- ]
-
- def tearDown(self):
- pass
-
- def test_concordance_list(self):
- concordance_out = self.text.concordance_list(self.query)
- self.assertEqual(self.list_out, [c.line for c in concordance_out])
-
- def test_concordance_width(self):
- list_out = [
- "monstrous",
- "monstrous",
- "monstrous",
- "monstrous",
- "monstrous",
- "monstrous",
- "Monstrous",
- "monstrous",
- "monstrous",
- "monstrous",
- "monstrous",
- ]
-
- concordance_out = self.text.concordance_list(self.query, width=0)
- self.assertEqual(list_out, [c.query for c in concordance_out])
-
- def test_concordance_lines(self):
- concordance_out = self.text.concordance_list(self.query, lines=3)
- self.assertEqual(self.list_out[:3], [c.line for c in concordance_out])
-
- def test_concordance_print(self):
- print_out = """Displaying 11 of 11 matches:
- ong the former , one was of a most monstrous size . ... This came towards us ,
- ON OF THE PSALMS . " Touching that monstrous bulk of the whale or ork we have r
- ll over with a heathenish array of monstrous clubs and spears . Some were thick
- d as you gazed , and wondered what monstrous cannibal and savage could ever hav
- that has survived the flood ; most monstrous and most mountainous ! That Himmal
- they might scout at Moby Dick as a monstrous fable , or still worse and more de
- th of Radney .'" CHAPTER 55 Of the Monstrous Pictures of Whales . I shall ere l
- ing Scenes . In connexion with the monstrous pictures of whales , I am strongly
- ere to enter upon those still more monstrous stories of them which are to be fo
- ght have been rummaged out of this monstrous cabinet there is no telling . But
- of Whale - Bones ; for Whales of a monstrous size are oftentimes cast up dead u
- """
-
- with stdout_redirect(StringIO()) as stdout:
- self.text.concordance(self.query)
-
- def strip_space(raw_str):
- return raw_str.replace(" ", "")
-
- self.assertEqual(strip_space(print_out), strip_space(stdout.getvalue()))
+++ /dev/null
-# -*- coding: utf-8 -*-
-
-"""
-Mock test for Stanford CoreNLP wrappers.
-"""
-
-import sys
-from itertools import chain
-from unittest import TestCase, SkipTest
-from unittest.mock import MagicMock
-
-from nltk.tree import Tree
-from nltk.parse import corenlp
-
-
-class TestTokenizerAPI(TestCase):
- def test_tokenize(self):
- corenlp_tokenizer = corenlp.CoreNLPParser()
-
- api_return_value = {
- u'sentences': [
- {
- u'index': 0,
- u'tokens': [
- {
- u'after': u' ',
- u'before': u'',
- u'characterOffsetBegin': 0,
- u'characterOffsetEnd': 4,
- u'index': 1,
- u'originalText': u'Good',
- u'word': u'Good',
- },
- {
- u'after': u' ',
- u'before': u' ',
- u'characterOffsetBegin': 5,
- u'characterOffsetEnd': 12,
- u'index': 2,
- u'originalText': u'muffins',
- u'word': u'muffins',
- },
- {
- u'after': u' ',
- u'before': u' ',
- u'characterOffsetBegin': 13,
- u'characterOffsetEnd': 17,
- u'index': 3,
- u'originalText': u'cost',
- u'word': u'cost',
- },
- {
- u'after': u'',
- u'before': u' ',
- u'characterOffsetBegin': 18,
- u'characterOffsetEnd': 19,
- u'index': 4,
- u'originalText': u'$',
- u'word': u'$',
- },
- {
- u'after': u'\n',
- u'before': u'',
- u'characterOffsetBegin': 19,
- u'characterOffsetEnd': 23,
- u'index': 5,
- u'originalText': u'3.88',
- u'word': u'3.88',
- },
- {
- u'after': u' ',
- u'before': u'\n',
- u'characterOffsetBegin': 24,
- u'characterOffsetEnd': 26,
- u'index': 6,
- u'originalText': u'in',
- u'word': u'in',
- },
- {
- u'after': u' ',
- u'before': u' ',
- u'characterOffsetBegin': 27,
- u'characterOffsetEnd': 30,
- u'index': 7,
- u'originalText': u'New',
- u'word': u'New',
- },
- {
- u'after': u'',
- u'before': u' ',
- u'characterOffsetBegin': 31,
- u'characterOffsetEnd': 35,
- u'index': 8,
- u'originalText': u'York',
- u'word': u'York',
- },
- {
- u'after': u' ',
- u'before': u'',
- u'characterOffsetBegin': 35,
- u'characterOffsetEnd': 36,
- u'index': 9,
- u'originalText': u'.',
- u'word': u'.',
- },
- ],
- },
- {
- u'index': 1,
- u'tokens': [
- {
- u'after': u' ',
- u'before': u' ',
- u'characterOffsetBegin': 38,
- u'characterOffsetEnd': 44,
- u'index': 1,
- u'originalText': u'Please',
- u'word': u'Please',
- },
- {
- u'after': u' ',
- u'before': u' ',
- u'characterOffsetBegin': 45,
- u'characterOffsetEnd': 48,
- u'index': 2,
- u'originalText': u'buy',
- u'word': u'buy',
- },
- {
- u'after': u'\n',
- u'before': u' ',
- u'characterOffsetBegin': 49,
- u'characterOffsetEnd': 51,
- u'index': 3,
- u'originalText': u'me',
- u'word': u'me',
- },
- {
- u'after': u' ',
- u'before': u'\n',
- u'characterOffsetBegin': 52,
- u'characterOffsetEnd': 55,
- u'index': 4,
- u'originalText': u'two',
- u'word': u'two',
- },
- {
- u'after': u' ',
- u'before': u' ',
- u'characterOffsetBegin': 56,
- u'characterOffsetEnd': 58,
- u'index': 5,
- u'originalText': u'of',
- u'word': u'of',
- },
- {
- u'after': u'',
- u'before': u' ',
- u'characterOffsetBegin': 59,
- u'characterOffsetEnd': 63,
- u'index': 6,
- u'originalText': u'them',
- u'word': u'them',
- },
- {
- u'after': u'\n',
- u'before': u'',
- u'characterOffsetBegin': 63,
- u'characterOffsetEnd': 64,
- u'index': 7,
- u'originalText': u'.',
- u'word': u'.',
- },
- ],
- },
- {
- u'index': 2,
- u'tokens': [
- {
- u'after': u'',
- u'before': u'\n',
- u'characterOffsetBegin': 65,
- u'characterOffsetEnd': 71,
- u'index': 1,
- u'originalText': u'Thanks',
- u'word': u'Thanks',
- },
- {
- u'after': u'',
- u'before': u'',
- u'characterOffsetBegin': 71,
- u'characterOffsetEnd': 72,
- u'index': 2,
- u'originalText': u'.',
- u'word': u'.',
- },
- ],
- },
- ]
- }
- corenlp_tokenizer.api_call = MagicMock(return_value=api_return_value)
-
- input_string = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks."
-
- expected_output = [
- u'Good',
- u'muffins',
- u'cost',
- u'$',
- u'3.88',
- u'in',
- u'New',
- u'York',
- u'.',
- u'Please',
- u'buy',
- u'me',
- u'two',
- u'of',
- u'them',
- u'.',
- u'Thanks',
- u'.',
- ]
-
- tokenized_output = list(corenlp_tokenizer.tokenize(input_string))
-
- corenlp_tokenizer.api_call.assert_called_once_with(
- 'Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks.',
- properties={'annotators': 'tokenize,ssplit'},
- )
- self.assertEqual(expected_output, tokenized_output)
-
-
-class TestTaggerAPI(TestCase):
- def test_pos_tagger(self):
- corenlp_tagger = corenlp.CoreNLPParser(tagtype='pos')
-
- api_return_value = {
- u'sentences': [
- {
- u'basicDependencies': [
- {
- u'dep': u'ROOT',
- u'dependent': 1,
- u'dependentGloss': u'What',
- u'governor': 0,
- u'governorGloss': u'ROOT',
- },
- {
- u'dep': u'cop',
- u'dependent': 2,
- u'dependentGloss': u'is',
- u'governor': 1,
- u'governorGloss': u'What',
- },
- {
- u'dep': u'det',
- u'dependent': 3,
- u'dependentGloss': u'the',
- u'governor': 4,
- u'governorGloss': u'airspeed',
- },
- {
- u'dep': u'nsubj',
- u'dependent': 4,
- u'dependentGloss': u'airspeed',
- u'governor': 1,
- u'governorGloss': u'What',
- },
- {
- u'dep': u'case',
- u'dependent': 5,
- u'dependentGloss': u'of',
- u'governor': 8,
- u'governorGloss': u'swallow',
- },
- {
- u'dep': u'det',
- u'dependent': 6,
- u'dependentGloss': u'an',
- u'governor': 8,
- u'governorGloss': u'swallow',
- },
- {
- u'dep': u'compound',
- u'dependent': 7,
- u'dependentGloss': u'unladen',
- u'governor': 8,
- u'governorGloss': u'swallow',
- },
- {
- u'dep': u'nmod',
- u'dependent': 8,
- u'dependentGloss': u'swallow',
- u'governor': 4,
- u'governorGloss': u'airspeed',
- },
- {
- u'dep': u'punct',
- u'dependent': 9,
- u'dependentGloss': u'?',
- u'governor': 1,
- u'governorGloss': u'What',
- },
- ],
- u'enhancedDependencies': [
- {
- u'dep': u'ROOT',
- u'dependent': 1,
- u'dependentGloss': u'What',
- u'governor': 0,
- u'governorGloss': u'ROOT',
- },
- {
- u'dep': u'cop',
- u'dependent': 2,
- u'dependentGloss': u'is',
- u'governor': 1,
- u'governorGloss': u'What',
- },
- {
- u'dep': u'det',
- u'dependent': 3,
- u'dependentGloss': u'the',
- u'governor': 4,
- u'governorGloss': u'airspeed',
- },
- {
- u'dep': u'nsubj',
- u'dependent': 4,
- u'dependentGloss': u'airspeed',
- u'governor': 1,
- u'governorGloss': u'What',
- },
- {
- u'dep': u'case',
- u'dependent': 5,
- u'dependentGloss': u'of',
- u'governor': 8,
- u'governorGloss': u'swallow',
- },
- {
- u'dep': u'det',
- u'dependent': 6,
- u'dependentGloss': u'an',
- u'governor': 8,
- u'governorGloss': u'swallow',
- },
- {
- u'dep': u'compound',
- u'dependent': 7,
- u'dependentGloss': u'unladen',
- u'governor': 8,
- u'governorGloss': u'swallow',
- },
- {
- u'dep': u'nmod:of',
- u'dependent': 8,
- u'dependentGloss': u'swallow',
- u'governor': 4,
- u'governorGloss': u'airspeed',
- },
- {
- u'dep': u'punct',
- u'dependent': 9,
- u'dependentGloss': u'?',
- u'governor': 1,
- u'governorGloss': u'What',
- },
- ],
- u'enhancedPlusPlusDependencies': [
- {
- u'dep': u'ROOT',
- u'dependent': 1,
- u'dependentGloss': u'What',
- u'governor': 0,
- u'governorGloss': u'ROOT',
- },
- {
- u'dep': u'cop',
- u'dependent': 2,
- u'dependentGloss': u'is',
- u'governor': 1,
- u'governorGloss': u'What',
- },
- {
- u'dep': u'det',
- u'dependent': 3,
- u'dependentGloss': u'the',
- u'governor': 4,
- u'governorGloss': u'airspeed',
- },
- {
- u'dep': u'nsubj',
- u'dependent': 4,
- u'dependentGloss': u'airspeed',
- u'governor': 1,
- u'governorGloss': u'What',
- },
- {
- u'dep': u'case',
- u'dependent': 5,
- u'dependentGloss': u'of',
- u'governor': 8,
- u'governorGloss': u'swallow',
- },
- {
- u'dep': u'det',
- u'dependent': 6,
- u'dependentGloss': u'an',
- u'governor': 8,
- u'governorGloss': u'swallow',
- },
- {
- u'dep': u'compound',
- u'dependent': 7,
- u'dependentGloss': u'unladen',
- u'governor': 8,
- u'governorGloss': u'swallow',
- },
- {
- u'dep': u'nmod:of',
- u'dependent': 8,
- u'dependentGloss': u'swallow',
- u'governor': 4,
- u'governorGloss': u'airspeed',
- },
- {
- u'dep': u'punct',
- u'dependent': 9,
- u'dependentGloss': u'?',
- u'governor': 1,
- u'governorGloss': u'What',
- },
- ],
- u'index': 0,
- u'parse': u'(ROOT\n (SBARQ\n (WHNP (WP What))\n (SQ (VBZ is)\n (NP\n (NP (DT the) (NN airspeed))\n (PP (IN of)\n (NP (DT an) (NN unladen) (NN swallow)))))\n (. ?)))',
- u'tokens': [
- {
- u'after': u' ',
- u'before': u'',
- u'characterOffsetBegin': 0,
- u'characterOffsetEnd': 4,
- u'index': 1,
- u'lemma': u'what',
- u'originalText': u'What',
- u'pos': u'WP',
- u'word': u'What',
- },
- {
- u'after': u' ',
- u'before': u' ',
- u'characterOffsetBegin': 5,
- u'characterOffsetEnd': 7,
- u'index': 2,
- u'lemma': u'be',
- u'originalText': u'is',
- u'pos': u'VBZ',
- u'word': u'is',
- },
- {
- u'after': u' ',
- u'before': u' ',
- u'characterOffsetBegin': 8,
- u'characterOffsetEnd': 11,
- u'index': 3,
- u'lemma': u'the',
- u'originalText': u'the',
- u'pos': u'DT',
- u'word': u'the',
- },
- {
- u'after': u' ',
- u'before': u' ',
- u'characterOffsetBegin': 12,
- u'characterOffsetEnd': 20,
- u'index': 4,
- u'lemma': u'airspeed',
- u'originalText': u'airspeed',
- u'pos': u'NN',
- u'word': u'airspeed',
- },
- {
- u'after': u' ',
- u'before': u' ',
- u'characterOffsetBegin': 21,
- u'characterOffsetEnd': 23,
- u'index': 5,
- u'lemma': u'of',
- u'originalText': u'of',
- u'pos': u'IN',
- u'word': u'of',
- },
- {
- u'after': u' ',
- u'before': u' ',
- u'characterOffsetBegin': 24,
- u'characterOffsetEnd': 26,
- u'index': 6,
- u'lemma': u'a',
- u'originalText': u'an',
- u'pos': u'DT',
- u'word': u'an',
- },
- {
- u'after': u' ',
- u'before': u' ',
- u'characterOffsetBegin': 27,
- u'characterOffsetEnd': 34,
- u'index': 7,
- u'lemma': u'unladen',
- u'originalText': u'unladen',
- u'pos': u'JJ',
- u'word': u'unladen',
- },
- {
- u'after': u' ',
- u'before': u' ',
- u'characterOffsetBegin': 35,
- u'characterOffsetEnd': 42,
- u'index': 8,
- u'lemma': u'swallow',
- u'originalText': u'swallow',
- u'pos': u'VB',
- u'word': u'swallow',
- },
- {
- u'after': u'',
- u'before': u' ',
- u'characterOffsetBegin': 43,
- u'characterOffsetEnd': 44,
- u'index': 9,
- u'lemma': u'?',
- u'originalText': u'?',
- u'pos': u'.',
- u'word': u'?',
- },
- ],
- }
- ]
- }
- corenlp_tagger.api_call = MagicMock(return_value=api_return_value)
-
- input_tokens = 'What is the airspeed of an unladen swallow ?'.split()
- expected_output = [
- ('What', 'WP'),
- ('is', 'VBZ'),
- ('the', 'DT'),
- ('airspeed', 'NN'),
- ('of', 'IN'),
- ('an', 'DT'),
- ('unladen', 'JJ'),
- ('swallow', 'VB'),
- ('?', '.'),
- ]
- tagged_output = corenlp_tagger.tag(input_tokens)
-
- corenlp_tagger.api_call.assert_called_once_with(
- 'What is the airspeed of an unladen swallow ?',
- properties={
- 'ssplit.isOneSentence': 'true',
- 'annotators': 'tokenize,ssplit,pos',
- },
- )
- self.assertEqual(expected_output, tagged_output)
-
- def test_ner_tagger(self):
- corenlp_tagger = corenlp.CoreNLPParser(tagtype='ner')
-
- api_return_value = {
- 'sentences': [
- {
- 'index': 0,
- 'tokens': [
- {
- 'after': ' ',
- 'before': '',
- 'characterOffsetBegin': 0,
- 'characterOffsetEnd': 4,
- 'index': 1,
- 'lemma': 'Rami',
- 'ner': 'PERSON',
- 'originalText': 'Rami',
- 'pos': 'NNP',
- 'word': 'Rami',
- },
- {
- 'after': ' ',
- 'before': ' ',
- 'characterOffsetBegin': 5,
- 'characterOffsetEnd': 8,
- 'index': 2,
- 'lemma': 'Eid',
- 'ner': 'PERSON',
- 'originalText': 'Eid',
- 'pos': 'NNP',
- 'word': 'Eid',
- },
- {
- 'after': ' ',
- 'before': ' ',
- 'characterOffsetBegin': 9,
- 'characterOffsetEnd': 11,
- 'index': 3,
- 'lemma': 'be',
- 'ner': 'O',
- 'originalText': 'is',
- 'pos': 'VBZ',
- 'word': 'is',
- },
- {
- 'after': ' ',
- 'before': ' ',
- 'characterOffsetBegin': 12,
- 'characterOffsetEnd': 20,
- 'index': 4,
- 'lemma': 'study',
- 'ner': 'O',
- 'originalText': 'studying',
- 'pos': 'VBG',
- 'word': 'studying',
- },
- {
- 'after': ' ',
- 'before': ' ',
- 'characterOffsetBegin': 21,
- 'characterOffsetEnd': 23,
- 'index': 5,
- 'lemma': 'at',
- 'ner': 'O',
- 'originalText': 'at',
- 'pos': 'IN',
- 'word': 'at',
- },
- {
- 'after': ' ',
- 'before': ' ',
- 'characterOffsetBegin': 24,
- 'characterOffsetEnd': 29,
- 'index': 6,
- 'lemma': 'Stony',
- 'ner': 'ORGANIZATION',
- 'originalText': 'Stony',
- 'pos': 'NNP',
- 'word': 'Stony',
- },
- {
- 'after': ' ',
- 'before': ' ',
- 'characterOffsetBegin': 30,
- 'characterOffsetEnd': 35,
- 'index': 7,
- 'lemma': 'Brook',
- 'ner': 'ORGANIZATION',
- 'originalText': 'Brook',
- 'pos': 'NNP',
- 'word': 'Brook',
- },
- {
- 'after': ' ',
- 'before': ' ',
- 'characterOffsetBegin': 36,
- 'characterOffsetEnd': 46,
- 'index': 8,
- 'lemma': 'University',
- 'ner': 'ORGANIZATION',
- 'originalText': 'University',
- 'pos': 'NNP',
- 'word': 'University',
- },
- {
- 'after': ' ',
- 'before': ' ',
- 'characterOffsetBegin': 47,
- 'characterOffsetEnd': 49,
- 'index': 9,
- 'lemma': 'in',
- 'ner': 'O',
- 'originalText': 'in',
- 'pos': 'IN',
- 'word': 'in',
- },
- {
- 'after': '',
- 'before': ' ',
- 'characterOffsetBegin': 50,
- 'characterOffsetEnd': 52,
- 'index': 10,
- 'lemma': 'NY',
- 'ner': 'O',
- 'originalText': 'NY',
- 'pos': 'NNP',
- 'word': 'NY',
- },
- ],
- }
- ]
- }
-
- corenlp_tagger.api_call = MagicMock(return_value=api_return_value)
-
- input_tokens = 'Rami Eid is studying at Stony Brook University in NY'.split()
- expected_output = [
- ('Rami', 'PERSON'),
- ('Eid', 'PERSON'),
- ('is', 'O'),
- ('studying', 'O'),
- ('at', 'O'),
- ('Stony', 'ORGANIZATION'),
- ('Brook', 'ORGANIZATION'),
- ('University', 'ORGANIZATION'),
- ('in', 'O'),
- ('NY', 'O'),
- ]
- tagged_output = corenlp_tagger.tag(input_tokens)
-
- corenlp_tagger.api_call.assert_called_once_with(
- 'Rami Eid is studying at Stony Brook University in NY',
- properties={
- 'ssplit.isOneSentence': 'true',
- 'annotators': 'tokenize,ssplit,ner',
- },
- )
- self.assertEqual(expected_output, tagged_output)
-
- def test_unexpected_tagtype(self):
- with self.assertRaises(ValueError):
- corenlp.CoreNLPParser(tagtype='test')  # the assignment is never reached, so none is kept
-
-
-class TestParserAPI(TestCase):
- def test_parse(self):
- corenlp_parser = corenlp.CoreNLPParser()
-
- api_return_value = {
- 'sentences': [
- {
- 'basicDependencies': [
- {
- 'dep': 'ROOT',
- 'dependent': 4,
- 'dependentGloss': 'fox',
- 'governor': 0,
- 'governorGloss': 'ROOT',
- },
- {
- 'dep': 'det',
- 'dependent': 1,
- 'dependentGloss': 'The',
- 'governor': 4,
- 'governorGloss': 'fox',
- },
- {
- 'dep': 'amod',
- 'dependent': 2,
- 'dependentGloss': 'quick',
- 'governor': 4,
- 'governorGloss': 'fox',
- },
- {
- 'dep': 'amod',
- 'dependent': 3,
- 'dependentGloss': 'brown',
- 'governor': 4,
- 'governorGloss': 'fox',
- },
- {
- 'dep': 'dep',
- 'dependent': 5,
- 'dependentGloss': 'jumps',
- 'governor': 4,
- 'governorGloss': 'fox',
- },
- {
- 'dep': 'case',
- 'dependent': 6,
- 'dependentGloss': 'over',
- 'governor': 9,
- 'governorGloss': 'dog',
- },
- {
- 'dep': 'det',
- 'dependent': 7,
- 'dependentGloss': 'the',
- 'governor': 9,
- 'governorGloss': 'dog',
- },
- {
- 'dep': 'amod',
- 'dependent': 8,
- 'dependentGloss': 'lazy',
- 'governor': 9,
- 'governorGloss': 'dog',
- },
- {
- 'dep': 'nmod',
- 'dependent': 9,
- 'dependentGloss': 'dog',
- 'governor': 5,
- 'governorGloss': 'jumps',
- },
- ],
- 'enhancedDependencies': [
- {
- 'dep': 'ROOT',
- 'dependent': 4,
- 'dependentGloss': 'fox',
- 'governor': 0,
- 'governorGloss': 'ROOT',
- },
- {
- 'dep': 'det',
- 'dependent': 1,
- 'dependentGloss': 'The',
- 'governor': 4,
- 'governorGloss': 'fox',
- },
- {
- 'dep': 'amod',
- 'dependent': 2,
- 'dependentGloss': 'quick',
- 'governor': 4,
- 'governorGloss': 'fox',
- },
- {
- 'dep': 'amod',
- 'dependent': 3,
- 'dependentGloss': 'brown',
- 'governor': 4,
- 'governorGloss': 'fox',
- },
- {
- 'dep': 'dep',
- 'dependent': 5,
- 'dependentGloss': 'jumps',
- 'governor': 4,
- 'governorGloss': 'fox',
- },
- {
- 'dep': 'case',
- 'dependent': 6,
- 'dependentGloss': 'over',
- 'governor': 9,
- 'governorGloss': 'dog',
- },
- {
- 'dep': 'det',
- 'dependent': 7,
- 'dependentGloss': 'the',
- 'governor': 9,
- 'governorGloss': 'dog',
- },
- {
- 'dep': 'amod',
- 'dependent': 8,
- 'dependentGloss': 'lazy',
- 'governor': 9,
- 'governorGloss': 'dog',
- },
- {
- 'dep': 'nmod:over',
- 'dependent': 9,
- 'dependentGloss': 'dog',
- 'governor': 5,
- 'governorGloss': 'jumps',
- },
- ],
- 'enhancedPlusPlusDependencies': [
- {
- 'dep': 'ROOT',
- 'dependent': 4,
- 'dependentGloss': 'fox',
- 'governor': 0,
- 'governorGloss': 'ROOT',
- },
- {
- 'dep': 'det',
- 'dependent': 1,
- 'dependentGloss': 'The',
- 'governor': 4,
- 'governorGloss': 'fox',
- },
- {
- 'dep': 'amod',
- 'dependent': 2,
- 'dependentGloss': 'quick',
- 'governor': 4,
- 'governorGloss': 'fox',
- },
- {
- 'dep': 'amod',
- 'dependent': 3,
- 'dependentGloss': 'brown',
- 'governor': 4,
- 'governorGloss': 'fox',
- },
- {
- 'dep': 'dep',
- 'dependent': 5,
- 'dependentGloss': 'jumps',
- 'governor': 4,
- 'governorGloss': 'fox',
- },
- {
- 'dep': 'case',
- 'dependent': 6,
- 'dependentGloss': 'over',
- 'governor': 9,
- 'governorGloss': 'dog',
- },
- {
- 'dep': 'det',
- 'dependent': 7,
- 'dependentGloss': 'the',
- 'governor': 9,
- 'governorGloss': 'dog',
- },
- {
- 'dep': 'amod',
- 'dependent': 8,
- 'dependentGloss': 'lazy',
- 'governor': 9,
- 'governorGloss': 'dog',
- },
- {
- 'dep': 'nmod:over',
- 'dependent': 9,
- 'dependentGloss': 'dog',
- 'governor': 5,
- 'governorGloss': 'jumps',
- },
- ],
- 'index': 0,
- 'parse': '(ROOT\n (NP\n (NP (DT The) (JJ quick) (JJ brown) (NN fox))\n (NP\n (NP (NNS jumps))\n (PP (IN over)\n (NP (DT the) (JJ lazy) (NN dog))))))',
- 'tokens': [
- {
- 'after': ' ',
- 'before': '',
- 'characterOffsetBegin': 0,
- 'characterOffsetEnd': 3,
- 'index': 1,
- 'lemma': 'the',
- 'originalText': 'The',
- 'pos': 'DT',
- 'word': 'The',
- },
- {
- 'after': ' ',
- 'before': ' ',
- 'characterOffsetBegin': 4,
- 'characterOffsetEnd': 9,
- 'index': 2,
- 'lemma': 'quick',
- 'originalText': 'quick',
- 'pos': 'JJ',
- 'word': 'quick',
- },
- {
- 'after': ' ',
- 'before': ' ',
- 'characterOffsetBegin': 10,
- 'characterOffsetEnd': 15,
- 'index': 3,
- 'lemma': 'brown',
- 'originalText': 'brown',
- 'pos': 'JJ',
- 'word': 'brown',
- },
- {
- 'after': ' ',
- 'before': ' ',
- 'characterOffsetBegin': 16,
- 'characterOffsetEnd': 19,
- 'index': 4,
- 'lemma': 'fox',
- 'originalText': 'fox',
- 'pos': 'NN',
- 'word': 'fox',
- },
- {
- 'after': ' ',
- 'before': ' ',
- 'characterOffsetBegin': 20,
- 'characterOffsetEnd': 25,
- 'index': 5,
- 'lemma': 'jump',
- 'originalText': 'jumps',
- 'pos': 'VBZ',
- 'word': 'jumps',
- },
- {
- 'after': ' ',
- 'before': ' ',
- 'characterOffsetBegin': 26,
- 'characterOffsetEnd': 30,
- 'index': 6,
- 'lemma': 'over',
- 'originalText': 'over',
- 'pos': 'IN',
- 'word': 'over',
- },
- {
- 'after': ' ',
- 'before': ' ',
- 'characterOffsetBegin': 31,
- 'characterOffsetEnd': 34,
- 'index': 7,
- 'lemma': 'the',
- 'originalText': 'the',
- 'pos': 'DT',
- 'word': 'the',
- },
- {
- 'after': ' ',
- 'before': ' ',
- 'characterOffsetBegin': 35,
- 'characterOffsetEnd': 39,
- 'index': 8,
- 'lemma': 'lazy',
- 'originalText': 'lazy',
- 'pos': 'JJ',
- 'word': 'lazy',
- },
- {
- 'after': '',
- 'before': ' ',
- 'characterOffsetBegin': 40,
- 'characterOffsetEnd': 43,
- 'index': 9,
- 'lemma': 'dog',
- 'originalText': 'dog',
- 'pos': 'NN',
- 'word': 'dog',
- },
- ],
- }
- ]
- }
-
- corenlp_parser.api_call = MagicMock(return_value=api_return_value)
-
- input_string = "The quick brown fox jumps over the lazy dog".split()
- expected_output = Tree(
- 'ROOT',
- [
- Tree(
- 'NP',
- [
- Tree(
- 'NP',
- [
- Tree('DT', ['The']),
- Tree('JJ', ['quick']),
- Tree('JJ', ['brown']),
- Tree('NN', ['fox']),
- ],
- ),
- Tree(
- 'NP',
- [
- Tree('NP', [Tree('NNS', ['jumps'])]),
- Tree(
- 'PP',
- [
- Tree('IN', ['over']),
- Tree(
- 'NP',
- [
- Tree('DT', ['the']),
- Tree('JJ', ['lazy']),
- Tree('NN', ['dog']),
- ],
- ),
- ],
- ),
- ],
- ),
- ],
- )
- ],
- )
-
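- # parse() re-joins the pre-tokenized input into a single string and makes
- # one API call with ssplit.eolonly set, as the assertion below verifies.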
- parsed_data = next(corenlp_parser.parse(input_string))
-
- corenlp_parser.api_call.assert_called_once_with(
- "The quick brown fox jumps over the lazy dog",
- properties={'ssplit.eolonly': 'true'},
- )
- self.assertEqual(expected_output, parsed_data)
-
- def test_dependency_parser(self):
- corenlp_parser = corenlp.CoreNLPDependencyParser()
-
- api_return_value = {
- 'sentences': [
- {
- 'basicDependencies': [
- {
- 'dep': 'ROOT',
- 'dependent': 5,
- 'dependentGloss': 'jumps',
- 'governor': 0,
- 'governorGloss': 'ROOT',
- },
- {
- 'dep': 'det',
- 'dependent': 1,
- 'dependentGloss': 'The',
- 'governor': 4,
- 'governorGloss': 'fox',
- },
- {
- 'dep': 'amod',
- 'dependent': 2,
- 'dependentGloss': 'quick',
- 'governor': 4,
- 'governorGloss': 'fox',
- },
- {
- 'dep': 'amod',
- 'dependent': 3,
- 'dependentGloss': 'brown',
- 'governor': 4,
- 'governorGloss': 'fox',
- },
- {
- 'dep': 'nsubj',
- 'dependent': 4,
- 'dependentGloss': 'fox',
- 'governor': 5,
- 'governorGloss': 'jumps',
- },
- {
- 'dep': 'case',
- 'dependent': 6,
- 'dependentGloss': 'over',
- 'governor': 9,
- 'governorGloss': 'dog',
- },
- {
- 'dep': 'det',
- 'dependent': 7,
- 'dependentGloss': 'the',
- 'governor': 9,
- 'governorGloss': 'dog',
- },
- {
- 'dep': 'amod',
- 'dependent': 8,
- 'dependentGloss': 'lazy',
- 'governor': 9,
- 'governorGloss': 'dog',
- },
- {
- 'dep': 'nmod',
- 'dependent': 9,
- 'dependentGloss': 'dog',
- 'governor': 5,
- 'governorGloss': 'jumps',
- },
- ],
- 'enhancedDependencies': [
- {
- 'dep': 'ROOT',
- 'dependent': 5,
- 'dependentGloss': 'jumps',
- 'governor': 0,
- 'governorGloss': 'ROOT',
- },
- {
- 'dep': 'det',
- 'dependent': 1,
- 'dependentGloss': 'The',
- 'governor': 4,
- 'governorGloss': 'fox',
- },
- {
- 'dep': 'amod',
- 'dependent': 2,
- 'dependentGloss': 'quick',
- 'governor': 4,
- 'governorGloss': 'fox',
- },
- {
- 'dep': 'amod',
- 'dependent': 3,
- 'dependentGloss': 'brown',
- 'governor': 4,
- 'governorGloss': 'fox',
- },
- {
- 'dep': 'nsubj',
- 'dependent': 4,
- 'dependentGloss': 'fox',
- 'governor': 5,
- 'governorGloss': 'jumps',
- },
- {
- 'dep': 'case',
- 'dependent': 6,
- 'dependentGloss': 'over',
- 'governor': 9,
- 'governorGloss': 'dog',
- },
- {
- 'dep': 'det',
- 'dependent': 7,
- 'dependentGloss': 'the',
- 'governor': 9,
- 'governorGloss': 'dog',
- },
- {
- 'dep': 'amod',
- 'dependent': 8,
- 'dependentGloss': 'lazy',
- 'governor': 9,
- 'governorGloss': 'dog',
- },
- {
- 'dep': 'nmod:over',
- 'dependent': 9,
- 'dependentGloss': 'dog',
- 'governor': 5,
- 'governorGloss': 'jumps',
- },
- ],
- 'enhancedPlusPlusDependencies': [
- {
- 'dep': 'ROOT',
- 'dependent': 5,
- 'dependentGloss': 'jumps',
- 'governor': 0,
- 'governorGloss': 'ROOT',
- },
- {
- 'dep': 'det',
- 'dependent': 1,
- 'dependentGloss': 'The',
- 'governor': 4,
- 'governorGloss': 'fox',
- },
- {
- 'dep': 'amod',
- 'dependent': 2,
- 'dependentGloss': 'quick',
- 'governor': 4,
- 'governorGloss': 'fox',
- },
- {
- 'dep': 'amod',
- 'dependent': 3,
- 'dependentGloss': 'brown',
- 'governor': 4,
- 'governorGloss': 'fox',
- },
- {
- 'dep': 'nsubj',
- 'dependent': 4,
- 'dependentGloss': 'fox',
- 'governor': 5,
- 'governorGloss': 'jumps',
- },
- {
- 'dep': 'case',
- 'dependent': 6,
- 'dependentGloss': 'over',
- 'governor': 9,
- 'governorGloss': 'dog',
- },
- {
- 'dep': 'det',
- 'dependent': 7,
- 'dependentGloss': 'the',
- 'governor': 9,
- 'governorGloss': 'dog',
- },
- {
- 'dep': 'amod',
- 'dependent': 8,
- 'dependentGloss': 'lazy',
- 'governor': 9,
- 'governorGloss': 'dog',
- },
- {
- 'dep': 'nmod:over',
- 'dependent': 9,
- 'dependentGloss': 'dog',
- 'governor': 5,
- 'governorGloss': 'jumps',
- },
- ],
- 'index': 0,
- 'tokens': [
- {
- 'after': ' ',
- 'before': '',
- 'characterOffsetBegin': 0,
- 'characterOffsetEnd': 3,
- 'index': 1,
- 'lemma': 'the',
- 'originalText': 'The',
- 'pos': 'DT',
- 'word': 'The',
- },
- {
- 'after': ' ',
- 'before': ' ',
- 'characterOffsetBegin': 4,
- 'characterOffsetEnd': 9,
- 'index': 2,
- 'lemma': 'quick',
- 'originalText': 'quick',
- 'pos': 'JJ',
- 'word': 'quick',
- },
- {
- 'after': ' ',
- 'before': ' ',
- 'characterOffsetBegin': 10,
- 'characterOffsetEnd': 15,
- 'index': 3,
- 'lemma': 'brown',
- 'originalText': 'brown',
- 'pos': 'JJ',
- 'word': 'brown',
- },
- {
- 'after': ' ',
- 'before': ' ',
- 'characterOffsetBegin': 16,
- 'characterOffsetEnd': 19,
- 'index': 4,
- 'lemma': 'fox',
- 'originalText': 'fox',
- 'pos': 'NN',
- 'word': 'fox',
- },
- {
- 'after': ' ',
- 'before': ' ',
- 'characterOffsetBegin': 20,
- 'characterOffsetEnd': 25,
- 'index': 5,
- 'lemma': 'jump',
- 'originalText': 'jumps',
- 'pos': 'VBZ',
- 'word': 'jumps',
- },
- {
- 'after': ' ',
- 'before': ' ',
- 'characterOffsetBegin': 26,
- 'characterOffsetEnd': 30,
- 'index': 6,
- 'lemma': 'over',
- 'originalText': 'over',
- 'pos': 'IN',
- 'word': 'over',
- },
- {
- 'after': ' ',
- 'before': ' ',
- 'characterOffsetBegin': 31,
- 'characterOffsetEnd': 34,
- 'index': 7,
- 'lemma': 'the',
- 'originalText': 'the',
- 'pos': 'DT',
- 'word': 'the',
- },
- {
- 'after': ' ',
- 'before': ' ',
- 'characterOffsetBegin': 35,
- 'characterOffsetEnd': 39,
- 'index': 8,
- 'lemma': 'lazy',
- 'originalText': 'lazy',
- 'pos': 'JJ',
- 'word': 'lazy',
- },
- {
- 'after': '',
- 'before': ' ',
- 'characterOffsetBegin': 40,
- 'characterOffsetEnd': 43,
- 'index': 9,
- 'lemma': 'dog',
- 'originalText': 'dog',
- 'pos': 'NN',
- 'word': 'dog',
- },
- ],
- }
- ]
- }
-
- corenlp_parser.api_call = MagicMock(return_value=api_return_value)
-
- input_string = "The quick brown fox jumps over the lazy dog".split()
- expected_output = Tree(
- 'jumps',
- [
- Tree('fox', ['The', 'quick', 'brown']),
- Tree('dog', ['over', 'the', 'lazy']),
- ],
- )
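- # (The expected tree mirrors basicDependencies above: 'jumps' is the ROOT,
- # 'fox' heads its det/amod dependents as the nsubj of 'jumps', and 'dog'
- # heads its case/det/amod dependents as the nmod of 'jumps'.)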
-
- parsed_data = next(corenlp_parser.parse(input_string))
-
- corenlp_parser.api_call.assert_called_once_with(
- "The quick brown fox jumps over the lazy dog",
- properties={'ssplit.eolonly': 'true'},
- )
- self.assertEqual(expected_output, parsed_data.tree())
+++ /dev/null
-# -*- coding: utf-8 -*-
-import unittest
-
-from nltk.corpus import (
- sinica_treebank,
- conll2007,
- indian,
- cess_cat,
- cess_esp,
- floresta,
- ptb,
- udhr,
-) # mwa_ppdb deliberately left out of the import; see the skipped TestMWAPPDB below
-
-from nltk.tree import Tree
-from nltk.test.unit.utils import skipIf
-
-
-class TestUdhr(unittest.TestCase):
- def test_words(self):
- for name in udhr.fileids():
- try:
- words = list(udhr.words(name))
- except AssertionError:
- print(name)
- raise
- self.assertTrue(words)
-
- def test_raw_unicode(self):
- for name in udhr.fileids():
- txt = udhr.raw(name)
- assert not isinstance(txt, bytes), name
-
-
-class TestIndian(unittest.TestCase):
- def test_words(self):
- words = indian.words()[:3]
- self.assertEqual(words, ['মহিষের', 'সন্তান', ':'])
-
- def test_tagged_words(self):
- tagged_words = indian.tagged_words()[:3]
- self.assertEqual(
- tagged_words, [('মহিষের', 'NN'), ('সন্তান', 'NN'), (':', 'SYM')]
- )
-
-
-class TestCess(unittest.TestCase):
- def test_catalan(self):
- words = cess_cat.words()[:15]
- txt = "El Tribunal_Suprem -Fpa- TS -Fpt- ha confirmat la condemna a quatre anys d' inhabilitació especial"
- self.assertEqual(words, txt.split())
- self.assertEqual(cess_cat.tagged_sents()[0][34][0], "càrrecs")
-
- def test_esp(self):
- words = cess_esp.words()[:15]
- txt = "El grupo estatal Electricité_de_France -Fpa- EDF -Fpt- anunció hoy , jueves , la compra del"
- self.assertEqual(words, txt.split())
- self.assertEqual(cess_esp.words()[115], "años")
-
-
-class TestFloresta(unittest.TestCase):
- def test_words(self):
- words = floresta.words()[:10]
- txt = "Um revivalismo refrescante O 7_e_Meio é um ex-libris de a"
- self.assertEqual(words, txt.split())
-
-
-class TestSinicaTreebank(unittest.TestCase):
- def test_sents(self):
- first_3_sents = sinica_treebank.sents()[:3]
- self.assertEqual(
- first_3_sents, [['一'], ['友情'], ['嘉珍', '和', '我', '住在', '同一條', '巷子']]
- )
-
- def test_parsed_sents(self):
- parsed_sents = sinica_treebank.parsed_sents()[25]
- self.assertEqual(
- parsed_sents,
- Tree(
- 'S',
- [
- Tree('NP', [Tree('Nba', ['嘉珍'])]),
- Tree('V‧地', [Tree('VA11', ['不停']), Tree('DE', ['的'])]),
- Tree('VA4', ['哭泣']),
- ],
- ),
- )
-
-
-class TestCoNLL2007(unittest.TestCase):
- # Reading the CoNLL 2007 Dependency Treebanks
-
- def test_sents(self):
- sents = conll2007.sents('esp.train')[0]
- self.assertEqual(
- sents[:6], ['El', 'aumento', 'del', 'índice', 'de', 'desempleo']
- )
-
- def test_parsed_sents(self):
-
- parsed_sents = conll2007.parsed_sents('esp.train')[0]
-
- self.assertEqual(
- parsed_sents.tree(),
- Tree(
- 'fortaleció',
- [
- Tree(
- 'aumento',
- [
- 'El',
- Tree(
- 'del',
- [
- Tree(
- 'índice',
- [
- Tree(
- 'de',
- [Tree('desempleo', ['estadounidense'])],
- )
- ],
- )
- ],
- ),
- ],
- ),
- 'hoy',
- 'considerablemente',
- Tree(
- 'al',
- [
- Tree(
- 'euro',
- [
- Tree(
- 'cotizaba',
- [
- ',',
- 'que',
- Tree('a', [Tree('15.35', ['las', 'GMT'])]),
- 'se',
- Tree(
- 'en',
- [
- Tree(
- 'mercado',
- [
- 'el',
- Tree('de', ['divisas']),
- Tree('de', ['Fráncfort']),
- ],
- )
- ],
- ),
- Tree('a', ['0,9452_dólares']),
- Tree(
- 'frente_a',
- [
- ',',
- Tree(
- '0,9349_dólares',
- [
- 'los',
- Tree(
- 'de',
- [
- Tree(
- 'mañana',
- ['esta'],
- )
- ],
- ),
- ],
- ),
- ],
- ),
- ],
- )
- ],
- )
- ],
- ),
- '.',
- ],
- ),
- )
-
-
-@skipIf(not ptb.fileids(), "A full installation of the Penn Treebank is not available")
-class TestPTB(unittest.TestCase):
- def test_fileids(self):
- self.assertEqual(
- ptb.fileids()[:4],
- [
- 'BROWN/CF/CF01.MRG',
- 'BROWN/CF/CF02.MRG',
- 'BROWN/CF/CF03.MRG',
- 'BROWN/CF/CF04.MRG',
- ],
- )
-
- def test_words(self):
- self.assertEqual(
- ptb.words('WSJ/00/WSJ_0003.MRG')[:7],
- ['A', 'form', 'of', 'asbestos', 'once', 'used', '*'],
- )
-
- def test_tagged_words(self):
- self.assertEqual(
- ptb.tagged_words('WSJ/00/WSJ_0003.MRG')[:3],
- [('A', 'DT'), ('form', 'NN'), ('of', 'IN')],
- )
-
- def test_categories(self):
- self.assertEqual(
- ptb.categories(),
- [
- 'adventure',
- 'belles_lettres',
- 'fiction',
- 'humor',
- 'lore',
- 'mystery',
- 'news',
- 'romance',
- 'science_fiction',
- ],
- )
-
- def test_news_fileids(self):
- self.assertEqual(
- ptb.fileids('news')[:3],
- ['WSJ/00/WSJ_0001.MRG', 'WSJ/00/WSJ_0002.MRG', 'WSJ/00/WSJ_0003.MRG'],
- )
-
- def test_category_words(self):
- self.assertEqual(
- ptb.words(categories=['humor', 'fiction'])[:6],
- ['Thirty-three', 'Scotty', 'did', 'not', 'go', 'back'],
- )
-
-
-@unittest.skip("Skipping test for mwa_ppdb.")
-class TestMWAPPDB(unittest.TestCase):
- def test_fileids(self):
- self.assertEqual(
- mwa_ppdb.fileids(), ['ppdb-1.0-xxxl-lexical.extended.synonyms.uniquepairs']
- )
-
- def test_entries(self):
- self.assertEqual(
- mwa_ppdb.entries()[:10],
- [
- ('10/17/01', '17/10/2001'),
- ('102,70', '102.70'),
- ('13,53', '13.53'),
- ('3.2.5.3.2.1', '3.2.5.3.2.1.'),
- ('53,76', '53.76'),
- ('6.9.5', '6.9.5.'),
- ('7.7.6.3', '7.7.6.3.'),
- ('76,20', '76.20'),
- ('79,85', '79.85'),
- ('93,65', '93.65'),
- ],
- )
-
-
-# unload corpora
-from nltk.corpus import teardown_module
+++ /dev/null
-# -*- coding: utf-8 -*-
-"""
-Corpus View Regression Tests
-"""
-import unittest
-import nltk.data
-from nltk.corpus.reader.util import (
- StreamBackedCorpusView,
- read_whitespace_block,
- read_line_block,
-)
-
-
-class TestCorpusViews(unittest.TestCase):
-
- linetok = nltk.LineTokenizer(blanklines='keep')
- names = [
- 'corpora/inaugural/README', # A very short file (160 chars)
- 'corpora/inaugural/1793-Washington.txt', # A relatively short file (791 chars)
- 'corpora/inaugural/1909-Taft.txt', # A longer file (32k chars)
- ]
-
- def data(self):
- for name in self.names:
- f = nltk.data.find(name)
- with f.open() as fp:
- file_data = fp.read().decode('utf8')
- yield f, file_data
-
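- # StreamBackedCorpusView reads its file lazily, block by block, using the
- # block-reader function it is given, so iterating a view should agree
- # with tokenizing the whole file eagerly.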
- def test_correct_values(self):
- # Check that corpus views produce the correct sequence of values.
-
- for f, file_data in self.data():
- v = StreamBackedCorpusView(f, read_whitespace_block)
- self.assertEqual(list(v), file_data.split())
-
- v = StreamBackedCorpusView(f, read_line_block)
- self.assertEqual(list(v), self.linetok.tokenize(file_data))
-
- def test_correct_length(self):
- # Check that the corpus views report the correct lengths:
-
- for f, file_data in self.data():
- v = StreamBackedCorpusView(f, read_whitespace_block)
- self.assertEqual(len(v), len(file_data.split()))
-
- v = StreamBackedCorpusView(f, read_line_block)
- self.assertEqual(len(v), len(self.linetok.tokenize(file_data)))
+++ /dev/null
-import unittest
-import nltk.data
-from nose.tools import assert_raises
-
-
-class TestData(unittest.TestCase):
- def test_find_raises_exception(self):
-
- with assert_raises(LookupError) as context:
- nltk.data.find('no_such_resource/foo')
-
- assert type(context.exception) == LookupError, 'Unexpected exception raised'
-
- def test_find_raises_exception_with_full_resource_name(self):
- no_such_thing = 'no_such_thing/bar'
-
- with assert_raises(LookupError) as context:
- nltk.data.find(no_such_thing)
-
- assert no_such_thing in str(
- context.exception
- ), 'Exception message does not include full resource name'
+++ /dev/null
-# -*- coding: utf-8 -*-
-import unittest
-
-from nltk.metrics.agreement import AnnotationTask
-
-class TestDisagreement(unittest.TestCase):
-
- '''
- Unit tests for disagreement measures in nltk.metrics.agreement
- (via AnnotationTask).
- '''
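-
- # Krippendorff's alpha is 1 - D_o / D_e (observed vs. chance-expected
- # disagreement); items carrying a single rating contribute to neither
- # term, which the *2 variants of the tests below rely on.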
-
- def test_easy(self):
- '''
- Simple test, based on
- https://github.com/foolswood/krippendorffs_alpha/raw/master/krippendorff.pdf.
- '''
- data = [('coder1', 'dress1', 'YES'),
- ('coder2', 'dress1', 'NO'),
- ('coder3', 'dress1', 'NO'),
- ('coder1', 'dress2', 'YES'),
- ('coder2', 'dress2', 'NO'),
- ('coder3', 'dress3', 'NO'),
- ]
- annotation_task = AnnotationTask(data)
- self.assertAlmostEqual(annotation_task.alpha(), -0.3333333)
-
- def test_easy2(self):
- '''
- Same simple test with 1 rating removed.
- Removing that rating should not matter: Krippendorff's alpha ignores
- items with only one rating.
- '''
- data = [('coder1', 'dress1', 'YES'),
- ('coder2', 'dress1', 'NO'),
- ('coder3', 'dress1', 'NO'),
- ('coder1', 'dress2', 'YES'),
- ('coder2', 'dress2', 'NO'),
- ]
- annotation_task = AnnotationTask(data)
- self.assertAlmostEqual(annotation_task.alpha(), -0.3333333)
-
- def test_advanced(self):
- '''
- More advanced test, based on
- http://www.agreestat.com/research_papers/onkrippendorffalpha.pdf
- '''
- data = [('A', '1', '1'),
- ('B', '1', '1'),
- ('D', '1', '1'),
- ('A', '2', '2'),
- ('B', '2', '2'),
- ('C', '2', '3'),
- ('D', '2', '2'),
- ('A', '3', '3'),
- ('B', '3', '3'),
- ('C', '3', '3'),
- ('D', '3', '3'),
- ('A', '4', '3'),
- ('B', '4', '3'),
- ('C', '4', '3'),
- ('D', '4', '3'),
- ('A', '5', '2'),
- ('B', '5', '2'),
- ('C', '5', '2'),
- ('D', '5', '2'),
- ('A', '6', '1'),
- ('B', '6', '2'),
- ('C', '6', '3'),
- ('D', '6', '4'),
- ('A', '7', '4'),
- ('B', '7', '4'),
- ('C', '7', '4'),
- ('D', '7', '4'),
- ('A', '8', '1'),
- ('B', '8', '1'),
- ('C', '8', '2'),
- ('D', '8', '1'),
- ('A', '9', '2'),
- ('B', '9', '2'),
- ('C', '9', '2'),
- ('D', '9', '2'),
- ('B', '10', '5'),
- ('C', '10', '5'),
- ('D', '10', '5'),
- ('C', '11', '1'),
- ('D', '11', '1'),
- ('C', '12', '3'),
- ]
- annotation_task = AnnotationTask(data)
- self.assertAlmostEqual(annotation_task.alpha(), 0.743421052632)
-
- def test_advanced2(self):
- '''
- Same more advanced example, but with 1 rating removed.
- Again, removing that single rating should not matter.
- '''
- data = [('A', '1', '1'),
- ('B', '1', '1'),
- ('D', '1', '1'),
- ('A', '2', '2'),
- ('B', '2', '2'),
- ('C', '2', '3'),
- ('D', '2', '2'),
- ('A', '3', '3'),
- ('B', '3', '3'),
- ('C', '3', '3'),
- ('D', '3', '3'),
- ('A', '4', '3'),
- ('B', '4', '3'),
- ('C', '4', '3'),
- ('D', '4', '3'),
- ('A', '5', '2'),
- ('B', '5', '2'),
- ('C', '5', '2'),
- ('D', '5', '2'),
- ('A', '6', '1'),
- ('B', '6', '2'),
- ('C', '6', '3'),
- ('D', '6', '4'),
- ('A', '7', '4'),
- ('B', '7', '4'),
- ('C', '7', '4'),
- ('D', '7', '4'),
- ('A', '8', '1'),
- ('B', '8', '1'),
- ('C', '8', '2'),
- ('D', '8', '1'),
- ('A', '9', '2'),
- ('B', '9', '2'),
- ('C', '9', '2'),
- ('D', '9', '2'),
- ('B', '10', '5'),
- ('C', '10', '5'),
- ('D', '10', '5'),
- ('C', '11', '1'),
- ('D', '11', '1'),
- ('C', '12', '3'),
- ]
- annotation_task = AnnotationTask(data)
- self.assertAlmostEqual(annotation_task.alpha(), 0.743421052632)
-
+++ /dev/null
-import unittest
-import nltk
-
-
-class TestFreqDist(unittest.TestCase):
-
- def test_iterating_returns_an_iterator_ordered_by_frequency(self):
-
- samples = ['one', 'two', 'two']
-
- distribution = nltk.FreqDist(samples)
-
- most_frequent, least_frequent = list(distribution)
-
- self.assertEqual(most_frequent, 'two')
- self.assertEqual(least_frequent, 'one')
+++ /dev/null
-# -*- coding: utf-8 -*-
-from nltk.tag import hmm
-
-
-def _wikipedia_example_hmm():
- # Example from wikipedia
- # (http://en.wikipedia.org/wiki/Forward%E2%80%93backward_algorithm)
-
- states = ['rain', 'no rain']
- symbols = ['umbrella', 'no umbrella']
-
- A = [[0.7, 0.3], [0.3, 0.7]] # transition probabilities
- B = [[0.9, 0.1], [0.2, 0.8]] # emission probabilities
- pi = [0.5, 0.5] # initial probabilities
-
- seq = ['umbrella', 'umbrella', 'no umbrella', 'umbrella', 'umbrella']
- seq = list(zip(seq, [None] * len(seq)))
-
- model = hmm._create_hmm_tagger(states, symbols, A, B, pi)
- return model, states, symbols, seq
-
-
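-# _forward_probability (and _backward_probability) return T x N matrices of
-# log2 probabilities, hence the `2 **` in the tests below; the forward pass
-# implements the standard recursion
-#   alpha_1(j) = pi_j * b_j(o_1)
-#   alpha_t(j) = sum_i alpha_{t-1}(i) * a_ij * b_j(o_t)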
-def test_forward_probability():
- from numpy.testing import assert_array_almost_equal
-
- # example from p. 385, Huang et al
- model, states, symbols = hmm._market_hmm_example()
- seq = [('up', None), ('up', None)]
- expected = [[0.35, 0.02, 0.09], [0.1792, 0.0085, 0.0357]]
-
- fp = 2 ** model._forward_probability(seq)
-
- assert_array_almost_equal(fp, expected)
-
-
-def test_forward_probability2():
- from numpy.testing import assert_array_almost_equal
-
- model, states, symbols, seq = _wikipedia_example_hmm()
- fp = 2 ** model._forward_probability(seq)
-
- # examples in wikipedia are normalized
- fp = (fp.T / fp.sum(axis=1)).T
-
- wikipedia_results = [
- [0.8182, 0.1818],
- [0.8834, 0.1166],
- [0.1907, 0.8093],
- [0.7308, 0.2692],
- [0.8673, 0.1327],
- ]
-
- assert_array_almost_equal(wikipedia_results, fp, 4)
-
-
-def test_backward_probability():
- from numpy.testing import assert_array_almost_equal
-
- model, states, symbols, seq = _wikipedia_example_hmm()
-
- bp = 2 ** model._backward_probability(seq)
- # examples in wikipedia are normalized
-
- bp = (bp.T / bp.sum(axis=1)).T
-
- wikipedia_results = [
- # Forward-backward algorithm doesn't need b0_5,
- # so .backward_probability doesn't compute it.
- # [0.6469, 0.3531],
- [0.5923, 0.4077],
- [0.3763, 0.6237],
- [0.6533, 0.3467],
- [0.6273, 0.3727],
- [0.5, 0.5],
- ]
-
- assert_array_almost_equal(wikipedia_results, bp, 4)
-
-
-def setup_module(module):
- from nose import SkipTest
-
- try:
- import numpy
- except ImportError:
- raise SkipTest("numpy is required for nltk.test.test_hmm")
+++ /dev/null
-# -*- coding: utf-8 -*-
-# Natural Language Toolkit: Twitter client
-#
-# Copyright (C) 2001-2020 NLTK Project
-# Author: Lorenzo Rubio <lrnzcig@gmail.com>
-# URL: <http://nltk.org/>
-# For license information, see LICENSE.TXT
-
-"""
-Regression tests for `json2csv()` and `json2csv_entities()` in Twitter
-package.
-
-"""
-
-import os
-import unittest
-from tempfile import TemporaryDirectory
-
-from nltk.corpus import twitter_samples
-from nltk.twitter.common import json2csv, json2csv_entities
-
-
-def are_files_identical(filename1, filename2, debug=False):
- """
- Compare two files line by line after sorting their lines, ignoring
- line order and leading/trailing whitespace (including carriage returns).
- """
- with open(filename1, "rb") as fileA:
- with open(filename2, "rb") as fileB:
- result = True
- for lineA, lineB in zip(
- sorted(fileA.readlines()), sorted(fileB.readlines())
- ):
- if lineA.strip() != lineB.strip():
- if debug:
- print(
- "Error while comparing files. "
- + "First difference at line below."
- )
- print("=> Output file line: {0}".format(lineA))
- print("=> Refer. file line: {0}".format(lineB))
- result = False
- break
- return result
-
-
-class TestJSON2CSV(unittest.TestCase):
- def setUp(self):
- with open(twitter_samples.abspath("tweets.20150430-223406.json")) as infile:
- self.infile = [next(infile) for _ in range(100)]
- self.msg = "Test and reference files are not the same"
- self.subdir = os.path.join(os.path.dirname(__file__), 'files')
-
- def tearDown(self):
- return
-
- def test_textoutput(self):
- ref_fn = os.path.join(self.subdir, 'tweets.20150430-223406.text.csv.ref')
- with TemporaryDirectory() as tempdir:
- outfn = os.path.join(tempdir, 'tweets.20150430-223406.text.csv')
- json2csv(self.infile, outfn, ['text'], gzip_compress=False)
- self.assertTrue(are_files_identical(outfn, ref_fn), msg=self.msg)
-
- def test_tweet_metadata(self):
- ref_fn = os.path.join(self.subdir, 'tweets.20150430-223406.tweet.csv.ref')
- fields = [
- 'created_at',
- 'favorite_count',
- 'id',
- 'in_reply_to_status_id',
- 'in_reply_to_user_id',
- 'retweet_count',
- 'retweeted',
- 'text',
- 'truncated',
- 'user.id',
- ]
-
- with TemporaryDirectory() as tempdir:
- outfn = os.path.join(tempdir, 'tweets.20150430-223406.tweet.csv')
- json2csv(self.infile, outfn, fields, gzip_compress=False)
- self.assertTrue(are_files_identical(outfn, ref_fn), msg=self.msg)
-
- def test_user_metadata(self):
- ref_fn = os.path.join(self.subdir, 'tweets.20150430-223406.user.csv.ref')
- fields = ['id', 'text', 'user.id', 'user.followers_count', 'user.friends_count']
-
- with TemporaryDirectory() as tempdir:
- outfn = os.path.join(tempdir, 'tweets.20150430-223406.user.csv')
- json2csv(self.infile, outfn, fields, gzip_compress=False)
- self.assertTrue(are_files_identical(outfn, ref_fn), msg=self.msg)
-
- def test_tweet_hashtag(self):
- ref_fn = os.path.join(self.subdir, 'tweets.20150430-223406.hashtag.csv.ref')
- with TemporaryDirectory() as tempdir:
- outfn = os.path.join(tempdir, 'tweets.20150430-223406.hashtag.csv')
- json2csv_entities(
- self.infile,
- outfn,
- ['id', 'text'],
- 'hashtags',
- ['text'],
- gzip_compress=False,
- )
- self.assertTrue(are_files_identical(outfn, ref_fn), msg=self.msg)
-
- def test_tweet_usermention(self):
- ref_fn = os.path.join(self.subdir, 'tweets.20150430-223406.usermention.csv.ref')
- with TemporaryDirectory() as tempdir:
- outfn = os.path.join(tempdir, 'tweets.20150430-223406.usermention.csv')
- json2csv_entities(
- self.infile,
- outfn,
- ['id', 'text'],
- 'user_mentions',
- ['id', 'screen_name'],
- gzip_compress=False,
- )
- self.assertTrue(are_files_identical(outfn, ref_fn), msg=self.msg)
-
- def test_tweet_media(self):
- ref_fn = os.path.join(self.subdir, 'tweets.20150430-223406.media.csv.ref')
- with TemporaryDirectory() as tempdir:
- outfn = os.path.join(tempdir, 'tweets.20150430-223406.media.csv')
- json2csv_entities(
- self.infile,
- outfn,
- ['id'],
- 'media',
- ['media_url', 'url'],
- gzip_compress=False,
- )
-
- self.assertTrue(are_files_identical(outfn, ref_fn), msg=self.msg)
-
- def test_tweet_url(self):
- ref_fn = os.path.join(self.subdir, 'tweets.20150430-223406.url.csv.ref')
- with TemporaryDirectory() as tempdir:
- outfn = os.path.join(tempdir, 'tweets.20150430-223406.url.csv')
- json2csv_entities(
- self.infile,
- outfn,
- ['id'],
- 'urls',
- ['url', 'expanded_url'],
- gzip_compress=False,
- )
-
- self.assertTrue(are_files_identical(outfn, ref_fn), msg=self.msg)
-
- def test_userurl(self):
- ref_fn = os.path.join(self.subdir, 'tweets.20150430-223406.userurl.csv.ref')
- with TemporaryDirectory() as tempdir:
- outfn = os.path.join(tempdir, 'tweets.20150430-223406.userurl.csv')
- json2csv_entities(
- self.infile,
- outfn,
- ['id', 'screen_name'],
- 'user.urls',
- ['url', 'expanded_url'],
- gzip_compress=False,
- )
-
- self.assertTrue(are_files_identical(outfn, ref_fn), msg=self.msg)
-
- def test_tweet_place(self):
- ref_fn = os.path.join(self.subdir, 'tweets.20150430-223406.place.csv.ref')
- with TemporaryDirectory() as tempdir:
- outfn = os.path.join(tempdir, 'tweets.20150430-223406.place.csv')
- json2csv_entities(
- self.infile,
- outfn,
- ['id', 'text'],
- 'place',
- ['name', 'country'],
- gzip_compress=False,
- )
-
- self.assertTrue(are_files_identical(outfn, ref_fn), msg=self.msg)
-
- def test_tweet_place_boundingbox(self):
- ref_fn = os.path.join(
- self.subdir, 'tweets.20150430-223406.placeboundingbox.csv.ref'
- )
- with TemporaryDirectory() as tempdir:
- outfn = os.path.join(tempdir, 'tweets.20150430-223406.placeboundingbox.csv')
- json2csv_entities(
- self.infile,
- outfn,
- ['id', 'name'],
- 'place.bounding_box',
- ['coordinates'],
- gzip_compress=False,
- )
-
- self.assertTrue(are_files_identical(outfn, ref_fn), msg=self.msg)
-
- def test_retweet_original_tweet(self):
- ref_fn = os.path.join(self.subdir, 'tweets.20150430-223406.retweet.csv.ref')
- with TemporaryDirectory() as tempdir:
- outfn = os.path.join(tempdir, 'tweets.20150430-223406.retweet.csv')
- json2csv_entities(
- self.infile,
- outfn,
- ['id'],
- 'retweeted_status',
- [
- 'created_at',
- 'favorite_count',
- 'id',
- 'in_reply_to_status_id',
- 'in_reply_to_user_id',
- 'retweet_count',
- 'text',
- 'truncated',
- 'user.id',
- ],
- gzip_compress=False,
- )
-
- self.assertTrue(are_files_identical(outfn, ref_fn), msg=self.msg)
-
- def test_file_is_wrong(self):
- """
- Sanity check that file comparison is not giving false positives.
- """
- ref_fn = os.path.join(self.subdir, 'tweets.20150430-223406.retweet.csv.ref')
- with TemporaryDirectory() as tempdir:
- outfn = os.path.join(tempdir, 'tweets.20150430-223406.text.csv')
- json2csv(self.infile, outfn, ['text'], gzip_compress=False)
- self.assertFalse(are_files_identical(outfn, ref_fn), msg=self.msg)
-
-
-if __name__ == "__main__":
- unittest.main()
+++ /dev/null
-import unittest
-
-from nltk.corpus import brown
-from nltk.jsontags import JSONTaggedDecoder, JSONTaggedEncoder
-from nltk.tag import DefaultTagger, RegexpTagger, AffixTagger
-from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger, NgramTagger
-from nltk.tag import PerceptronTagger
-from nltk.tag import BrillTaggerTrainer, BrillTagger
-from nltk.tag.brill import nltkdemo18
-
-
-class TestJSONSerialization(unittest.TestCase):
- def setUp(self):
- self.corpus = brown.tagged_sents()[:35]
- self.decoder = JSONTaggedDecoder()
- self.encoder = JSONTaggedEncoder()
- self.default_tagger = DefaultTagger("NN")
-
- def test_default_tagger(self):
- encoded = self.encoder.encode(self.default_tagger)
- decoded = self.decoder.decode(encoded)
-
- self.assertEqual(repr(self.default_tagger), repr(decoded))
- self.assertEqual(self.default_tagger._tag, decoded._tag)
-
- def test_regexp_tagger(self):
- tagger = RegexpTagger([(r".*", "NN")], backoff=self.default_tagger)
-
- encoded = self.encoder.encode(tagger)
- decoded = self.decoder.decode(encoded)
-
- self.assertEqual(repr(tagger), repr(decoded))
- self.assertEqual(repr(tagger.backoff), repr(decoded.backoff))
- self.assertEqual(tagger._regexps, decoded._regexps)
-
- def test_affix_tagger(self):
- tagger = AffixTagger(self.corpus, backoff=self.default_tagger)
-
- encoded = self.encoder.encode(tagger)
- decoded = self.decoder.decode(encoded)
-
- self.assertEqual(repr(tagger), repr(decoded))
- self.assertEqual(repr(tagger.backoff), repr(decoded.backoff))
- self.assertEqual(tagger._affix_length, decoded._affix_length)
- self.assertEqual(tagger._min_word_length, decoded._min_word_length)
- self.assertEqual(tagger._context_to_tag, decoded._context_to_tag)
-
- def test_ngram_taggers(self):
- unitagger = UnigramTagger(self.corpus, backoff=self.default_tagger)
- bitagger = BigramTagger(self.corpus, backoff=unitagger)
- tritagger = TrigramTagger(self.corpus, backoff=bitagger)
- ntagger = NgramTagger(4, self.corpus, backoff=tritagger)
-
- encoded = self.encoder.encode(ntagger)
- decoded = self.decoder.decode(encoded)
-
- self.assertEqual(repr(ntagger), repr(decoded))
- self.assertEqual(repr(tritagger), repr(decoded.backoff))
- self.assertEqual(repr(bitagger), repr(decoded.backoff.backoff))
- self.assertEqual(repr(unitagger), repr(decoded.backoff.backoff.backoff))
- self.assertEqual(repr(self.default_tagger),
- repr(decoded.backoff.backoff.backoff.backoff))
-
- def test_perceptron_tagger(self):
- tagger = PerceptronTagger(load=False)
- tagger.train(self.corpus)
-
- encoded = self.encoder.encode(tagger)
- decoded = self.decoder.decode(encoded)
-
- self.assertEqual(tagger.model.weights, decoded.model.weights)
- self.assertEqual(tagger.tagdict, decoded.tagdict)
- self.assertEqual(tagger.classes, decoded.classes)
-
- def test_brill_tagger(self):
- trainer = BrillTaggerTrainer(self.default_tagger, nltkdemo18(),
- deterministic=True)
- tagger = trainer.train(self.corpus, max_rules=30)
-
- encoded = self.encoder.encode(tagger)
- decoded = self.decoder.decode(encoded)
-
- self.assertEqual(repr(tagger._initial_tagger),
- repr(decoded._initial_tagger))
- self.assertEqual(tagger._rules, decoded._rules)
- self.assertEqual(tagger._training_stats, decoded._training_stats)
-
+++ /dev/null
-# -*- coding: utf-8 -*-
-
-
-import unittest
-from nltk.classify.naivebayes import NaiveBayesClassifier
-
-
-class NaiveBayesClassifierTest(unittest.TestCase):
- def test_simple(self):
- training_features = [
- ({'nice': True, 'good': True}, 'positive'),
- ({'bad': True, 'mean': True}, 'negative'),
- ]
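-
- # NaiveBayesClassifier picks the label maximizing
- # P(label) * prod_f P(f | label),
- # so a feature seen with only one label in training pulls strongly that way.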
-
- classifier = NaiveBayesClassifier.train(training_features)
-
- result = classifier.prob_classify({'nice': True})
- self.assertTrue(result.prob('positive') > result.prob('negative'))
- self.assertEqual(result.max(), 'positive')
-
- result = classifier.prob_classify({'bad': True})
- self.assertTrue(result.prob('positive') < result.prob('negative'))
- self.assertEqual(result.max(), 'negative')
+++ /dev/null
-# -*- coding: utf-8 -*-
-"""
-Unit tests for nltk.corpus.nombank
-"""
-
-import unittest
-
-from nltk.corpus import nombank
-# Load the nombank once.
-nombank.nouns()
-
-class NombankDemo(unittest.TestCase):
- def test_numbers(self):
- # No. of instances.
- self.assertEqual(len(nombank.instances()), 114574)
- # No. of rolesets
- self.assertEqual(len(nombank.rolesets()), 5577)
- # No. of nouns.
- self.assertEqual(len(nombank.nouns()), 4704)
-
-
- def test_instance(self):
- self.assertEqual(nombank.instances()[0].roleset, 'perc-sign.01')
-
- def test_framefiles_fileids(self):
- self.assertEqual(len(nombank.fileids()), 4705)
- self.assertTrue(all(fileid.endswith('.xml') for fileid in nombank.fileids()))
+++ /dev/null
-import unittest
-
-import nltk
-from nltk.corpus.reader import pl196x
-
-
-class TestCorpusViews(unittest.TestCase):
-
- def test_corpus_reader(self):
- pl196x_dir = nltk.data.find('corpora/pl196x')
- pl = pl196x.Pl196xCorpusReader(pl196x_dir, r'.*\.xml',
- textids='textids.txt',
- cat_file='cats.txt')
- pl.tagged_words(fileids=pl.fileids(), categories='cats.txt')
+++ /dev/null
-# -*- coding: utf-8 -*-
-"""
-Tests for nltk.pos_tag
-"""
-
-
-import unittest
-
-from nltk import word_tokenize, pos_tag
-
-
-class TestPosTag(unittest.TestCase):
- def test_pos_tag_eng(self):
- text = "John's big idea isn't all that bad."
- expected_tagged = [
- ('John', 'NNP'),
- ("'s", 'POS'),
- ('big', 'JJ'),
- ('idea', 'NN'),
- ('is', 'VBZ'),
- ("n't", 'RB'),
- ('all', 'PDT'),
- ('that', 'DT'),
- ('bad', 'JJ'),
- ('.', '.'),
- ]
- assert pos_tag(word_tokenize(text)) == expected_tagged
-
- def test_pos_tag_eng_universal(self):
- text = "John's big idea isn't all that bad."
- expected_tagged = [
- ('John', 'NOUN'),
- ("'s", 'PRT'),
- ('big', 'ADJ'),
- ('idea', 'NOUN'),
- ('is', 'VERB'),
- ("n't", 'ADV'),
- ('all', 'DET'),
- ('that', 'DET'),
- ('bad', 'ADJ'),
- ('.', '.'),
- ]
- assert pos_tag(word_tokenize(text), tagset='universal') == expected_tagged
-
- def test_pos_tag_rus(self):
- text = u"Илья оторопел и дважды перечитал бумажку."
- expected_tagged = [
- ('Илья', 'S'),
- ('оторопел', 'V'),
- ('и', 'CONJ'),
- ('дважды', 'ADV'),
- ('перечитал', 'V'),
- ('бумажку', 'S'),
- ('.', 'NONLEX'),
- ]
- assert pos_tag(word_tokenize(text), lang='rus') == expected_tagged
-
- def test_pos_tag_rus_universal(self):
- text = u"Илья оторопел и дважды перечитал бумажку."
- expected_tagged = [
- ('Илья', 'NOUN'),
- ('оторопел', 'VERB'),
- ('и', 'CONJ'),
- ('дважды', 'ADV'),
- ('перечитал', 'VERB'),
- ('бумажку', 'NOUN'),
- ('.', '.'),
- ]
- assert (
- pos_tag(word_tokenize(text), tagset='universal', lang='rus')
- == expected_tagged
- )
-
- def test_pos_tag_unknown_lang(self):
- text = u"모르겠 습니 다"
- self.assertRaises(NotImplementedError, pos_tag, word_tokenize(text), lang='kor')
- # Test for default kwarg, `lang=None`
- self.assertRaises(NotImplementedError, pos_tag, word_tokenize(text), lang=None)
-
- def test_unspecified_lang(self):
- # With no lang given, pos_tag defaults to lang='eng', which mis-tags Korean.
- text = u"모르겠 습니 다"
- expected_but_wrong = [('모르겠', 'JJ'), ('습니', 'NNP'), ('다', 'NN')]
- assert pos_tag(word_tokenize(text)) == expected_but_wrong
+++ /dev/null
-# -*- coding: utf-8 -*-
-
-import unittest
-
-import nltk # needed by the skipped megam test below
-from nltk.corpus import rte as rte_corpus
-from nltk.classify.rte_classify import RTEFeatureExtractor, rte_features, rte_classifier
-
-expected_from_rte_feature_extraction = """
-alwayson => True
-ne_hyp_extra => 0
-ne_overlap => 1
-neg_hyp => 0
-neg_txt => 0
-word_hyp_extra => 3
-word_overlap => 3
-
-alwayson => True
-ne_hyp_extra => 0
-ne_overlap => 1
-neg_hyp => 0
-neg_txt => 0
-word_hyp_extra => 2
-word_overlap => 1
-
-alwayson => True
-ne_hyp_extra => 1
-ne_overlap => 1
-neg_hyp => 0
-neg_txt => 0
-word_hyp_extra => 1
-word_overlap => 2
-
-alwayson => True
-ne_hyp_extra => 1
-ne_overlap => 0
-neg_hyp => 0
-neg_txt => 0
-word_hyp_extra => 6
-word_overlap => 2
-
-alwayson => True
-ne_hyp_extra => 1
-ne_overlap => 0
-neg_hyp => 0
-neg_txt => 0
-word_hyp_extra => 4
-word_overlap => 0
-
-alwayson => True
-ne_hyp_extra => 1
-ne_overlap => 0
-neg_hyp => 0
-neg_txt => 0
-word_hyp_extra => 3
-word_overlap => 1
-"""
-
-
-class RTEClassifierTest(unittest.TestCase):
- # Test the feature extraction method.
- def test_rte_feature_extraction(self):
- pairs = rte_corpus.pairs(['rte1_dev.xml'])[:6]
- test_output = [
- "%-15s => %s" % (key, rte_features(pair)[key])
- for pair in pairs
- for key in sorted(rte_features(pair))
- ]
- expected_output = expected_from_rte_feature_extraction.strip().split('\n')
- # Remove null strings.
- expected_output = list(filter(None, expected_output))
- self.assertEqual(test_output, expected_output)
-
- # Test the RTEFeatureExtractor object.
- def test_feature_extractor_object(self):
- rtepair = rte_corpus.pairs(['rte3_dev.xml'])[33]
- extractor = RTEFeatureExtractor(rtepair)
- self.assertEqual(extractor.hyp_words, {'member', 'China', 'SCO.'})
- self.assertEqual(extractor.overlap('word'), set())
- self.assertEqual(extractor.overlap('ne'), {'China'})
- self.assertEqual(extractor.hyp_extra('word'), {'member'})
-
- # Test the RTE classifier training.
- def test_rte_classification_without_megam(self):
- clf = rte_classifier('IIS')
- clf = rte_classifier('GIS')
-
- @unittest.skip("Skipping tests with dependencies on MEGAM")
- def test_rte_classification_with_megam(self):
- nltk.config_megam('/usr/local/bin/megam')
- clf = rte_classifier('megam')
- clf = rte_classifier('BFGS')
+++ /dev/null
-# -*- coding: utf-8 -*-
-"""
-The following test performs a random series of reads, seeks, and
-tells, and checks that the results are consistent.
-"""
-import random
-import functools
-from io import BytesIO
-from nltk.corpus.reader import SeekableUnicodeStreamReader
-
-
-def check_reader(unicode_string, encoding, n=1000):
- bytestr = unicode_string.encode(encoding)
- strlen = len(unicode_string)
- stream = BytesIO(bytestr)
- reader = SeekableUnicodeStreamReader(stream, encoding)
- # Find all character positions
- chars = []
- while True:
- pos = reader.tell()
- chars.append((pos, reader.read(1)))
- if chars[-1][1] == '':
- break
- # For each position, build the remainder of the string from that point on
- strings = dict((pos, '') for (pos, c) in chars)
- for pos1, char in chars:
- for pos2, _ in chars:
- if pos2 <= pos1:
- strings[pos2] += char
- while True:
- op = random.choice('tsrr')
- # Check our position?
- if op == 't': # tell
- reader.tell()
- # Perform a seek?
- if op == 's': # seek
- new_pos = random.choice([p for (p, c) in chars])
- reader.seek(new_pos)
- # Perform a read?
- if op == 'r': # read
- if random.random() < 0.3:
- pos = reader.tell()
- else:
- pos = None
- if random.random() < 0.2:
- size = None
- elif random.random() < 0.8:
- size = random.randint(0, int(strlen / 6))
- else:
- size = random.randint(0, strlen + 20)
- if random.random() < 0.8:
- s = reader.read(size)
- else:
- s = reader.readline(size)
- # check that everything's consistent
- if pos is not None:
- assert pos in strings
- assert strings[pos].startswith(s)
- n -= 1
- if n == 0:
- return 'passed'
-
-
-# Call the randomized test function `check_reader` with a variety of
-# input strings and encodings.
-
-ENCODINGS = ['ascii', 'latin1', 'greek', 'hebrew', 'utf-16', 'utf-8']
-
-STRINGS = [
- """
- This is a test file.
- It is fairly short.
- """,
- "This file can be encoded with latin1. \x83",
- """\
- This is a test file.
- Here's a blank line:
-
- And here's some unicode: \xee \u0123 \uffe3
- """,
- """\
- This is a test file.
- Unicode characters: \xf3 \u2222 \u3333\u4444 \u5555
- """,
-]
-
-
-def test_reader():
- for string in STRINGS:
- for encoding in ENCODINGS:
- try:
- # skip strings that can't be encoded with the current encoding
- string.encode(encoding)
- yield check_reader, string, encoding
- except UnicodeEncodeError:
- pass
-
-
-# nose shows the whole string arguments in a verbose mode; this is annoying,
-# so large string test is separated.
-
-LARGE_STRING = (
- """\
-This is a larger file. It has some lines that are longer \
-than 72 characters. It's got lots of repetition. Here's \
-some unicode chars: \xee \u0123 \uffe3 \ueeee \u2345
-
-How fun! Let's repeat it twenty times.
-"""
- * 10
-)
-
-
-def test_reader_on_large_string():
- for encoding in ENCODINGS:
- try:
- # skip strings that can't be encoded with the current encoding
- LARGE_STRING.encode(encoding)
-
- def _check(encoding, n=1000):
- check_reader(LARGE_STRING, encoding, n)
-
- yield _check, encoding
-
- except UnicodeEncodeError:
- pass
-
-
-def test_reader_stream_is_closed():
- reader = SeekableUnicodeStreamReader(BytesIO(b''), 'ascii')
- assert reader.stream.closed is False
- reader.__del__()
- assert reader.stream.closed is True
-
-
-def teardown_module(module=None):
- import gc
-
- gc.collect()
+++ /dev/null
-# -*- coding: utf-8 -*-
-"""
-Unit tests for Senna
-"""
-
-from os import environ, path, sep
-
-import logging
-import unittest
-
-from nltk.classify import Senna
-from nltk.tag import SennaTagger, SennaChunkTagger, SennaNERTagger
-
-# Use the SENNA environment variable as the executable path if set;
-# otherwise fall back to a default install location.
-if 'SENNA' in environ:
- SENNA_EXECUTABLE_PATH = path.normpath(environ['SENNA']) + sep
-else:
- SENNA_EXECUTABLE_PATH = '/usr/share/senna-v3.0'
-
-senna_is_installed = path.exists(SENNA_EXECUTABLE_PATH)
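-# Example invocation with a custom install (the path is illustrative only):
-# SENNA=/opt/senna-v3.0 nosetests nltk/test/unit/test_senna.py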
-
-
-@unittest.skipUnless(senna_is_installed, "Requires Senna executable")
-class TestSennaPipeline(unittest.TestCase):
- """Unittest for nltk.classify.senna"""
-
- def test_senna_pipeline(self):
- """Senna pipeline interface"""
-
- pipeline = Senna(SENNA_EXECUTABLE_PATH, ['pos', 'chk', 'ner'])
- sent = 'Dusseldorf is an international business center'.split()
- result = [
- (token['word'], token['chk'], token['ner'], token['pos'])
- for token in pipeline.tag(sent)
- ]
- expected = [
- ('Dusseldorf', 'B-NP', 'B-LOC', 'NNP'),
- ('is', 'B-VP', 'O', 'VBZ'),
- ('an', 'B-NP', 'O', 'DT'),
- ('international', 'I-NP', 'O', 'JJ'),
- ('business', 'I-NP', 'O', 'NN'),
- ('center', 'I-NP', 'O', 'NN'),
- ]
- self.assertEqual(result, expected)
-
-
-@unittest.skipUnless(senna_is_installed, "Requires Senna executable")
-class TestSennaTagger(unittest.TestCase):
- """Unittest for nltk.tag.senna"""
-
- def test_senna_tagger(self):
- tagger = SennaTagger(SENNA_EXECUTABLE_PATH)
- result = tagger.tag('What is the airspeed of an unladen swallow ?'.split())
- expected = [
- ('What', 'WP'),
- ('is', 'VBZ'),
- ('the', 'DT'),
- ('airspeed', 'NN'),
- ('of', 'IN'),
- ('an', 'DT'),
- ('unladen', 'NN'),
- ('swallow', 'NN'),
- ('?', '.'),
- ]
- self.assertEqual(result, expected)
-
- def test_senna_chunk_tagger(self):
- chktagger = SennaChunkTagger(SENNA_EXECUTABLE_PATH)
- result_1 = chktagger.tag('What is the airspeed of an unladen swallow ?'.split())
- expected_1 = [
- ('What', 'B-NP'),
- ('is', 'B-VP'),
- ('the', 'B-NP'),
- ('airspeed', 'I-NP'),
- ('of', 'B-PP'),
- ('an', 'B-NP'),
- ('unladen', 'I-NP'),
- ('swallow', 'I-NP'),
- ('?', 'O'),
- ]
-
- result_2 = list(chktagger.bio_to_chunks(result_1, chunk_type='NP'))
- expected_2 = [
- ('What', '0'),
- ('the airspeed', '2-3'),
- ('an unladen swallow', '5-6-7'),
- ]
- self.assertEqual(result_1, expected_1)
- self.assertEqual(result_2, expected_2)
-
- def test_senna_ner_tagger(self):
- nertagger = SennaNERTagger(SENNA_EXECUTABLE_PATH)
- result_1 = nertagger.tag('Shakespeare theatre was in London .'.split())
- expected_1 = [
- ('Shakespeare', 'B-PER'),
- ('theatre', 'O'),
- ('was', 'O'),
- ('in', 'O'),
- ('London', 'B-LOC'),
- ('.', 'O'),
- ]
-
- result_2 = nertagger.tag('UN headquarters are in NY , USA .'.split())
- expected_2 = [
- ('UN', 'B-ORG'),
- ('headquarters', 'O'),
- ('are', 'O'),
- ('in', 'O'),
- ('NY', 'B-LOC'),
- (',', 'O'),
- ('USA', 'B-LOC'),
- ('.', 'O'),
- ]
- self.assertEqual(result_1, expected_1)
- self.assertEqual(result_2, expected_2)
+++ /dev/null
-# -*- coding: utf-8 -*-
-
-import os
-import unittest
-from contextlib import closing
-
-from nltk import data
-from nltk.stem.snowball import SnowballStemmer
-from nltk.stem.porter import PorterStemmer
-
-
-class SnowballTest(unittest.TestCase):
- def test_arabic(self):
- """
- Unit tests for the Snowball Arabic light stemmer, which strips
- prefixes and suffixes.
- """
- # Test where the ignore_stopwords=True.
- ar_stemmer = SnowballStemmer("arabic", True)
- assert ar_stemmer.stem('الْعَرَبِــــــيَّة') == "عرب"
- assert ar_stemmer.stem("العربية") == "عرب"
- assert ar_stemmer.stem("فقالوا") == "قال"
- assert ar_stemmer.stem("الطالبات") == "طالب"
- assert ar_stemmer.stem("فالطالبات") == "طالب"
- assert ar_stemmer.stem("والطالبات") == "طالب"
- assert ar_stemmer.stem("الطالبون") == "طالب"
- assert ar_stemmer.stem("اللذان") == "اللذان" # stop word, kept intact
- assert ar_stemmer.stem("من") == "من" # stop word, kept intact
- # Test where the ignore_stopwords=False.
- ar_stemmer = SnowballStemmer("arabic", False)
- assert ar_stemmer.stem("اللذان") == "اللذ" # this is a stop word
- assert ar_stemmer.stem("الطالبات") == "طالب"
- assert ar_stemmer.stem("الكلمات") == "كلم"
- # Test creating the Arabic stemmer without passing ignore_stopwords.
- ar_stemmer = SnowballStemmer("arabic")
- assert ar_stemmer.stem('الْعَرَبِــــــيَّة') == "عرب"
- assert ar_stemmer.stem("العربية") == "عرب"
- assert ar_stemmer.stem("فقالوا") == "قال"
- assert ar_stemmer.stem("الطالبات") == "طالب"
- assert ar_stemmer.stem("الكلمات") == "كلم"
-
- def test_russian(self):
- stemmer_russian = SnowballStemmer("russian")
- assert stemmer_russian.stem("авантненькая") == "авантненьк"
-
- def test_german(self):
- stemmer_german = SnowballStemmer("german")
- stemmer_german2 = SnowballStemmer("german", ignore_stopwords=True)
-
- assert stemmer_german.stem("Schr\xe4nke") == 'schrank'
- assert stemmer_german2.stem("Schr\xe4nke") == 'schrank'
-
- assert stemmer_german.stem("keinen") == 'kein'
- assert stemmer_german2.stem("keinen") == 'keinen'
-
- def test_spanish(self):
- stemmer = SnowballStemmer('spanish')
-
- assert stemmer.stem("Visionado") == 'vision'
-
- # The word 'algue' was raising an IndexError
- assert stemmer.stem("algue") == 'algu'
-
- def test_short_strings_bug(self):
- stemmer = SnowballStemmer('english')
- assert stemmer.stem("y's") == 'y'
-
-
-class PorterTest(unittest.TestCase):
- def _vocabulary(self):
- with closing(
- data.find('stemmers/porter_test/porter_vocabulary.txt').open(
- encoding='utf-8'
- )
- ) as fp:
- return fp.read().splitlines()
-
- def _test_against_expected_output(self, stemmer_mode, expected_stems):
- stemmer = PorterStemmer(mode=stemmer_mode)
- for word, true_stem in zip(self._vocabulary(), expected_stems):
- our_stem = stemmer.stem(word)
- assert our_stem == true_stem, (
- "%s should stem to %s in %s mode but got %s"
- % (word, true_stem, stemmer_mode, our_stem)
- )
-
- def test_vocabulary_martin_mode(self):
- """Tests all words from the test vocabulary provided by M Porter
-
- The sample vocabulary and output were sourced from:
- http://tartarus.org/martin/PorterStemmer/voc.txt
- http://tartarus.org/martin/PorterStemmer/output.txt
- and are linked to from the Porter Stemmer algorithm's homepage
- at
- http://tartarus.org/martin/PorterStemmer/
- """
- with closing(
- data.find('stemmers/porter_test/porter_martin_output.txt').open(
- encoding='utf-8'
- )
- ) as fp:
- self._test_against_expected_output(
- PorterStemmer.MARTIN_EXTENSIONS, fp.read().splitlines()
- )
-
- def test_vocabulary_nltk_mode(self):
- with closing(
- data.find('stemmers/porter_test/porter_nltk_output.txt').open(
- encoding='utf-8'
- )
- ) as fp:
- self._test_against_expected_output(
- PorterStemmer.NLTK_EXTENSIONS, fp.read().splitlines()
- )
-
- def test_vocabulary_original_mode(self):
- # The list of stems for this test was generated by taking the
- # Martin-blessed stemmer from
- # http://tartarus.org/martin/PorterStemmer/c.txt
- # and removing all the --DEPARTURE-- sections from it and
- # running it against Martin's test vocabulary.
-
- with closing(
- data.find('stemmers/porter_test/porter_original_output.txt').open(
- encoding='utf-8'
- )
- ) as fp:
- self._test_against_expected_output(
- PorterStemmer.ORIGINAL_ALGORITHM, fp.read().splitlines()
- )
-
- def test_oed_bug(self):
- """Test for bug https://github.com/nltk/nltk/issues/1581
-
- Ensures that 'oed' can be stemmed without throwing an error.
- """
- assert PorterStemmer().stem('oed') == 'o'
+++ /dev/null
-# -*- coding: utf-8 -*-
-
-
-def test_basic():
- from nltk.tag import pos_tag
- from nltk.tokenize import word_tokenize
-
- result = pos_tag(word_tokenize("John's big idea isn't all that bad."))
- assert result == [
- ('John', 'NNP'),
- ("'s", 'POS'),
- ('big', 'JJ'),
- ('idea', 'NN'),
- ('is', 'VBZ'),
- ("n't", 'RB'),
- ('all', 'PDT'),
- ('that', 'DT'),
- ('bad', 'JJ'),
- ('.', '.'),
- ]
-
-
-def setup_module(module):
- from nose import SkipTest
-
- try:
- import numpy
- except ImportError:
- raise SkipTest("numpy is required for nltk.test.test_tag")
+++ /dev/null
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-#
-# Natural Language Toolkit: TGrep search
-#
-# Copyright (C) 2001-2020 NLTK Project
-# Author: Will Roberts <wildwilhelm@gmail.com>
-# URL: <http://nltk.org/>
-# For license information, see LICENSE.TXT
-
-'''
-Unit tests for nltk.tgrep.
-'''
-
-
-import unittest
-
-from nltk.tree import ParentedTree
-from nltk import tgrep
-
-
-class TestSequenceFunctions(unittest.TestCase):
-
- '''
- Class containing unit tests for nltk.tgrep.
- '''
-
- def test_tokenize_simple(self):
- '''
- Simple test of tokenization.
- '''
- tokens = tgrep.tgrep_tokenize('A .. (B !< C . D) | ![<< (E , F) $ G]')
- self.assertEqual(
- tokens,
- [
- 'A',
- '..',
- '(',
- 'B',
- '!',
- '<',
- 'C',
- '.',
- 'D',
- ')',
- '|',
- '!',
- '[',
- '<<',
- '(',
- 'E',
- ',',
- 'F',
- ')',
- '$',
- 'G',
- ']',
- ],
- )
-
- def test_tokenize_encoding(self):
- '''
- Test that tokenization handles bytes and strs the same way.
- '''
- self.assertEqual(
- tgrep.tgrep_tokenize(b'A .. (B !< C . D) | ![<< (E , F) $ G]'),
- tgrep.tgrep_tokenize('A .. (B !< C . D) | ![<< (E , F) $ G]'),
- )
-
- def test_tokenize_link_types(self):
- '''
- Test tokenization of basic link types.
- '''
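- # A quick key to the TGrep2 relations below: A < B (A is the parent of
- # B), A > B (A is a child of B), A << B (A dominates B), A . B (A
- # immediately precedes B), A $ B (A and B are sisters); a leading !
- # negates a relation and tokenizes as a separate symbol.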
- self.assertEqual(tgrep.tgrep_tokenize('A<B'), ['A', '<', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A>B'), ['A', '>', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A<3B'), ['A', '<3', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A>3B'), ['A', '>3', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A<,B'), ['A', '<,', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A>,B'), ['A', '>,', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A<-3B'), ['A', '<-3', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A>-3B'), ['A', '>-3', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A<-B'), ['A', '<-', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A>-B'), ['A', '>-', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A<\'B'), ['A', '<\'', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A>\'B'), ['A', '>\'', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A<:B'), ['A', '<:', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A>:B'), ['A', '>:', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A<<B'), ['A', '<<', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A>>B'), ['A', '>>', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A<<,B'), ['A', '<<,', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A>>,B'), ['A', '>>,', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A<<\'B'), ['A', '<<\'', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A>>\'B'), ['A', '>>\'', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A<<:B'), ['A', '<<:', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A>>:B'), ['A', '>>:', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A.B'), ['A', '.', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A,B'), ['A', ',', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A..B'), ['A', '..', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A,,B'), ['A', ',,', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A$B'), ['A', '$', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A$.B'), ['A', '$.', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A$,B'), ['A', '$,', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A$..B'), ['A', '$..', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A$,,B'), ['A', '$,,', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A!<B'), ['A', '!', '<', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A!>B'), ['A', '!', '>', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A!<3B'), ['A', '!', '<3', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A!>3B'), ['A', '!', '>3', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A!<,B'), ['A', '!', '<,', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A!>,B'), ['A', '!', '>,', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A!<-3B'), ['A', '!', '<-3', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A!>-3B'), ['A', '!', '>-3', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A!<-B'), ['A', '!', '<-', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A!>-B'), ['A', '!', '>-', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A!<\'B'), ['A', '!', '<\'', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A!>\'B'), ['A', '!', '>\'', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A!<:B'), ['A', '!', '<:', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A!>:B'), ['A', '!', '>:', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A!<<B'), ['A', '!', '<<', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A!>>B'), ['A', '!', '>>', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A!<<,B'), ['A', '!', '<<,', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A!>>,B'), ['A', '!', '>>,', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A!<<\'B'), ['A', '!', '<<\'', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A!>>\'B'), ['A', '!', '>>\'', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A!<<:B'), ['A', '!', '<<:', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A!>>:B'), ['A', '!', '>>:', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A!.B'), ['A', '!', '.', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A!,B'), ['A', '!', ',', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A!..B'), ['A', '!', '..', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A!,,B'), ['A', '!', ',,', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A!$B'), ['A', '!', '$', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A!$.B'), ['A', '!', '$.', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A!$,B'), ['A', '!', '$,', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A!$..B'), ['A', '!', '$..', 'B'])
- self.assertEqual(tgrep.tgrep_tokenize('A!$,,B'), ['A', '!', '$,,', 'B'])
-
- def test_tokenize_examples(self):
- '''
- Test tokenization of the TGrep2 manual example patterns.
- '''
- self.assertEqual(tgrep.tgrep_tokenize('NP < PP'), ['NP', '<', 'PP'])
- self.assertEqual(tgrep.tgrep_tokenize('/^NP/'), ['/^NP/'])
- self.assertEqual(
- tgrep.tgrep_tokenize('NP << PP . VP'), ['NP', '<<', 'PP', '.', 'VP']
- )
- self.assertEqual(
- tgrep.tgrep_tokenize('NP << PP | . VP'), ['NP', '<<', 'PP', '|', '.', 'VP']
- )
- self.assertEqual(
- tgrep.tgrep_tokenize('NP !<< PP [> NP | >> VP]'),
- ['NP', '!', '<<', 'PP', '[', '>', 'NP', '|', '>>', 'VP', ']'],
- )
- self.assertEqual(
- tgrep.tgrep_tokenize('NP << (PP . VP)'),
- ['NP', '<<', '(', 'PP', '.', 'VP', ')'],
- )
- self.assertEqual(
- tgrep.tgrep_tokenize('NP <\' (PP <, (IN < on))'),
- ['NP', '<\'', '(', 'PP', '<,', '(', 'IN', '<', 'on', ')', ')'],
- )
- self.assertEqual(
- tgrep.tgrep_tokenize('S < (A < B) < C'),
- ['S', '<', '(', 'A', '<', 'B', ')', '<', 'C'],
- )
- self.assertEqual(
- tgrep.tgrep_tokenize('S < ((A < B) < C)'),
- ['S', '<', '(', '(', 'A', '<', 'B', ')', '<', 'C', ')'],
- )
- self.assertEqual(
- tgrep.tgrep_tokenize('S < (A < B < C)'),
- ['S', '<', '(', 'A', '<', 'B', '<', 'C', ')'],
- )
- self.assertEqual(tgrep.tgrep_tokenize('A<B&.C'), ['A', '<', 'B', '&', '.', 'C'])
-
- def test_tokenize_quoting(self):
- '''
- Test tokenization of quoting.
- '''
- self.assertEqual(
- tgrep.tgrep_tokenize('"A<<:B"<<:"A $.. B"<"A>3B"<C'),
- ['"A<<:B"', '<<:', '"A $.. B"', '<', '"A>3B"', '<', 'C'],
- )
-
- def test_tokenize_nodenames(self):
- '''
- Test tokenization of node names.
- '''
- self.assertEqual(tgrep.tgrep_tokenize('Robert'), ['Robert'])
- self.assertEqual(tgrep.tgrep_tokenize('/^[Bb]ob/'), ['/^[Bb]ob/'])
- self.assertEqual(tgrep.tgrep_tokenize('*'), ['*'])
- self.assertEqual(tgrep.tgrep_tokenize('__'), ['__'])
- # test tokenization of NLTK tree position syntax
- self.assertEqual(tgrep.tgrep_tokenize('N()'), ['N(', ')'])
- self.assertEqual(tgrep.tgrep_tokenize('N(0,)'), ['N(', '0', ',', ')'])
- self.assertEqual(tgrep.tgrep_tokenize('N(0,0)'), ['N(', '0', ',', '0', ')'])
- self.assertEqual(
- tgrep.tgrep_tokenize('N(0,0,)'), ['N(', '0', ',', '0', ',', ')']
- )
-
- def test_tokenize_macros(self):
- '''
- Test tokenization of macro definitions.
- '''
- self.assertEqual(
- tgrep.tgrep_tokenize(
- '@ NP /^NP/;\n@ NN /^NN/;\n@NP [!< NP | < @NN] !$.. @NN'
- ),
- [
- '@',
- 'NP',
- '/^NP/',
- ';',
- '@',
- 'NN',
- '/^NN/',
- ';',
- '@NP',
- '[',
- '!',
- '<',
- 'NP',
- '|',
- '<',
- '@NN',
- ']',
- '!',
- '$..',
- '@NN',
- ],
- )
-
- def test_node_simple(self):
- '''
- Test a simple use of tgrep for finding nodes matching a given
- pattern.
- '''
- tree = ParentedTree.fromstring(
- '(S (NP (DT the) (JJ big) (NN dog)) ' '(VP bit) (NP (DT a) (NN cat)))'
- )
- self.assertEqual(list(tgrep.tgrep_positions('NN', [tree])), [[(0, 2), (2, 1)]])
- self.assertEqual(
- list(tgrep.tgrep_nodes('NN', [tree])), [[tree[0, 2], tree[2, 1]]]
- )
- self.assertEqual(
- list(tgrep.tgrep_positions('NN|JJ', [tree])), [[(0, 1), (0, 2), (2, 1)]]
- )
-
- def test_node_printing(self):
- '''Test that the tgrep print operator ' is properly ignored.'''
- tree = ParentedTree.fromstring('(S (n x) (N x))')
- self.assertEqual(
- list(tgrep.tgrep_positions('N', [tree])),
- list(tgrep.tgrep_positions('\'N', [tree])),
- )
- self.assertEqual(
- list(tgrep.tgrep_positions('/[Nn]/', [tree])),
- list(tgrep.tgrep_positions('\'/[Nn]/', [tree])),
- )
-
- def test_node_encoding(self):
- '''
-        Test that tgrep search strings handle bytes and strs the same
-        way.
- '''
- tree = ParentedTree.fromstring(
- '(S (NP (DT the) (JJ big) (NN dog)) ' '(VP bit) (NP (DT a) (NN cat)))'
- )
- self.assertEqual(
- list(tgrep.tgrep_positions(b'NN', [tree])),
-            list(tgrep.tgrep_positions('NN', [tree])),
- )
- self.assertEqual(
- list(tgrep.tgrep_nodes(b'NN', [tree])),
- list(tgrep.tgrep_nodes('NN', [tree])),
- )
- self.assertEqual(
- list(tgrep.tgrep_positions(b'NN|JJ', [tree])),
- list(tgrep.tgrep_positions('NN|JJ', [tree])),
- )
-
- def test_node_nocase(self):
- '''
- Test selecting nodes using case insensitive node names.
- '''
- tree = ParentedTree.fromstring('(S (n x) (N x))')
- self.assertEqual(list(tgrep.tgrep_positions('"N"', [tree])), [[(1,)]])
- self.assertEqual(list(tgrep.tgrep_positions('i@"N"', [tree])), [[(0,), (1,)]])
-
- def test_node_quoted(self):
- '''
- Test selecting nodes using quoted node names.
- '''
- tree = ParentedTree.fromstring('(N ("N" x) (N" x) ("\\" x))')
- self.assertEqual(list(tgrep.tgrep_positions('"N"', [tree])), [[()]])
- self.assertEqual(list(tgrep.tgrep_positions('"\\"N\\""', [tree])), [[(0,)]])
- self.assertEqual(list(tgrep.tgrep_positions('"N\\""', [tree])), [[(1,)]])
- self.assertEqual(list(tgrep.tgrep_positions('"\\"\\\\\\""', [tree])), [[(2,)]])
-
- def test_node_regex(self):
- '''
- Test regex matching on nodes.
- '''
- tree = ParentedTree.fromstring('(S (NP-SBJ x) (NP x) (NNP x) (VP x))')
- # This is a regular expression that matches any node whose
- # name starts with NP, including NP-SBJ:
- self.assertEqual(list(tgrep.tgrep_positions('/^NP/', [tree])), [[(0,), (1,)]])
-
- def test_node_regex_2(self):
- '''
- Test regex matching on nodes.
- '''
- tree = ParentedTree.fromstring('(S (SBJ x) (SBJ1 x) (NP-SBJ x))')
- self.assertEqual(list(tgrep.tgrep_positions('/^SBJ/', [tree])), [[(0,), (1,)]])
- # This is a regular expression that matches any node whose
- # name includes SBJ, including NP-SBJ:
- self.assertEqual(
- list(tgrep.tgrep_positions('/SBJ/', [tree])), [[(0,), (1,), (2,)]]
- )
-
- def test_node_tree_position(self):
- '''
- Test matching on nodes based on NLTK tree position.
- '''
- tree = ParentedTree.fromstring('(S (NP-SBJ x) (NP x) (NNP x) (VP x))')
- # test all tree positions that are not leaves
- leaf_positions = set(
- tree.leaf_treeposition(x) for x in range(len(tree.leaves()))
- )
- tree_positions = [x for x in tree.treepositions() if x not in leaf_positions]
- for position in tree_positions:
- node_id = 'N{0}'.format(position)
- tgrep_positions = list(tgrep.tgrep_positions(node_id, [tree]))
- self.assertEqual(len(tgrep_positions[0]), 1)
- self.assertEqual(tgrep_positions[0][0], position)
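-        # For reference: 'N{0}'.format(position) renders a tree position
-        # tuple in the N(...) pattern syntax tested earlier, e.g. the root
-        # position () becomes 'N()' and position (0, 1) becomes 'N(0, 1)'.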
-
- def test_node_noleaves(self):
- '''
- Test node name matching with the search_leaves flag set to False.
- '''
- tree = ParentedTree.fromstring('(S (A (T x)) (B (N x)))')
- self.assertEqual(
- list(tgrep.tgrep_positions('x', [tree])), [[(0, 0, 0), (1, 0, 0)]]
- )
- self.assertEqual(list(tgrep.tgrep_positions('x', [tree], False)), [[]])
-
- def tests_rel_dominance(self):
- '''
- Test matching nodes based on dominance relations.
- '''
- tree = ParentedTree.fromstring('(S (A (T x)) (B (N x)))')
- self.assertEqual(list(tgrep.tgrep_positions('* < T', [tree])), [[(0,)]])
- self.assertEqual(list(tgrep.tgrep_positions('* < T > S', [tree])), [[(0,)]])
- self.assertEqual(
- list(tgrep.tgrep_positions('* !< T', [tree])),
- [[(), (0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0)]],
- )
- self.assertEqual(list(tgrep.tgrep_positions('* !< T > S', [tree])), [[(1,)]])
- self.assertEqual(list(tgrep.tgrep_positions('* > A', [tree])), [[(0, 0)]])
- self.assertEqual(list(tgrep.tgrep_positions('* > B', [tree])), [[(1, 0)]])
- self.assertEqual(
- list(tgrep.tgrep_positions('* !> B', [tree])),
- [[(), (0,), (0, 0), (0, 0, 0), (1,), (1, 0, 0)]],
- )
- self.assertEqual(
- list(tgrep.tgrep_positions('* !> B >> S', [tree])), [[(0,), (0, 0), (1,)]]
- )
- self.assertEqual(
- list(tgrep.tgrep_positions('* >> S', [tree])),
- [[(0,), (0, 0), (1,), (1, 0)]],
- )
- self.assertEqual(
- list(tgrep.tgrep_positions('* >>, S', [tree])), [[(0,), (0, 0)]]
- )
- self.assertEqual(
- list(tgrep.tgrep_positions('* >>\' S', [tree])), [[(1,), (1, 0)]]
- )
- # Known issue:
- # self.assertEqual(list(tgrep.tgrep_positions('* !>> S', [tree])),
- # [[()]])
- self.assertEqual(list(tgrep.tgrep_positions('* << T', [tree])), [[(), (0,)]])
- self.assertEqual(list(tgrep.tgrep_positions('* <<\' T', [tree])), [[(0,)]])
- self.assertEqual(list(tgrep.tgrep_positions('* <<1 N', [tree])), [[(1,)]])
- self.assertEqual(
- list(tgrep.tgrep_positions('* !<< T', [tree])),
- [[(0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0)]],
- )
- tree = ParentedTree.fromstring('(S (A (T x)) (B (T x) (N x )))')
- self.assertEqual(list(tgrep.tgrep_positions('* <: T', [tree])), [[(0,)]])
- self.assertEqual(list(tgrep.tgrep_positions('* < T', [tree])), [[(0,), (1,)]])
- self.assertEqual(
- list(tgrep.tgrep_positions('* !<: T', [tree])),
- [[(), (0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0), (1, 1), (1, 1, 0)]],
- )
- self.assertEqual(list(tgrep.tgrep_positions('* !<: T > S', [tree])), [[(1,)]])
- tree = ParentedTree.fromstring('(S (T (A x) (B x)) (T (C x)))')
- self.assertEqual(list(tgrep.tgrep_positions('* >: T', [tree])), [[(1, 0)]])
- self.assertEqual(
- list(tgrep.tgrep_positions('* !>: T', [tree])),
- [[(), (0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0), (1,), (1, 0, 0)]],
- )
- tree = ParentedTree.fromstring(
- '(S (A (B (C (D (E (T x))))))' ' (A (B (C (D (E (T x))) (N x)))))'
- )
- self.assertEqual(
- list(tgrep.tgrep_positions('* <<: T', [tree])),
- [
- [
- (0,),
- (0, 0),
- (0, 0, 0),
- (0, 0, 0, 0),
- (0, 0, 0, 0, 0),
- (1, 0, 0, 0),
- (1, 0, 0, 0, 0),
- ]
- ],
- )
- self.assertEqual(
- list(tgrep.tgrep_positions('* >>: A', [tree])),
- [
- [
- (0, 0),
- (0, 0, 0),
- (0, 0, 0, 0),
- (0, 0, 0, 0, 0),
- (0, 0, 0, 0, 0, 0),
- (1, 0),
- (1, 0, 0),
- ]
- ],
- )
-
- def test_bad_operator(self):
- '''
- Test error handling of undefined tgrep operators.
- '''
- tree = ParentedTree.fromstring('(S (A (T x)) (B (N x)))')
- self.assertRaises(
- tgrep.TgrepException, list, tgrep.tgrep_positions('* >>> S', [tree])
- )
-
- def test_comments(self):
- '''
- Test that comments are correctly filtered out of tgrep search
- strings.
- '''
- tree = ParentedTree.fromstring('(S (NN x) (NP x) (NN x))')
- search1 = '''
- @ NP /^NP/;
- @ NN /^NN/;
- @NN
- '''
- self.assertEqual(list(tgrep.tgrep_positions(search1, [tree])), [[(0,), (2,)]])
- search2 = '''
- # macros
- @ NP /^NP/;
- @ NN /^NN/;
-
- # search string
- @NN
- '''
- self.assertEqual(list(tgrep.tgrep_positions(search2, [tree])), [[(0,), (2,)]])
-
- def test_rel_sister_nodes(self):
- '''
- Test matching sister nodes in a tree.
- '''
- tree = ParentedTree.fromstring('(S (A x) (B x) (C x))')
- self.assertEqual(list(tgrep.tgrep_positions('* $. B', [tree])), [[(0,)]])
- self.assertEqual(list(tgrep.tgrep_positions('* $.. B', [tree])), [[(0,)]])
- self.assertEqual(list(tgrep.tgrep_positions('* $, B', [tree])), [[(2,)]])
- self.assertEqual(list(tgrep.tgrep_positions('* $,, B', [tree])), [[(2,)]])
- self.assertEqual(list(tgrep.tgrep_positions('* $ B', [tree])), [[(0,), (2,)]])
-
- def tests_rel_indexed_children(self):
- '''
- Test matching nodes based on their index in their parent node.
- '''
- tree = ParentedTree.fromstring('(S (A x) (B x) (C x))')
- self.assertEqual(list(tgrep.tgrep_positions('* >, S', [tree])), [[(0,)]])
- self.assertEqual(list(tgrep.tgrep_positions('* >1 S', [tree])), [[(0,)]])
- self.assertEqual(list(tgrep.tgrep_positions('* >2 S', [tree])), [[(1,)]])
- self.assertEqual(list(tgrep.tgrep_positions('* >3 S', [tree])), [[(2,)]])
- self.assertEqual(list(tgrep.tgrep_positions('* >\' S', [tree])), [[(2,)]])
- self.assertEqual(list(tgrep.tgrep_positions('* >-1 S', [tree])), [[(2,)]])
- self.assertEqual(list(tgrep.tgrep_positions('* >-2 S', [tree])), [[(1,)]])
- self.assertEqual(list(tgrep.tgrep_positions('* >-3 S', [tree])), [[(0,)]])
- tree = ParentedTree.fromstring(
- '(S (D (A x) (B x) (C x)) (E (B x) (C x) (A x)) ' '(F (C x) (A x) (B x)))'
- )
- self.assertEqual(list(tgrep.tgrep_positions('* <, A', [tree])), [[(0,)]])
- self.assertEqual(list(tgrep.tgrep_positions('* <1 A', [tree])), [[(0,)]])
- self.assertEqual(list(tgrep.tgrep_positions('* <2 A', [tree])), [[(2,)]])
- self.assertEqual(list(tgrep.tgrep_positions('* <3 A', [tree])), [[(1,)]])
- self.assertEqual(list(tgrep.tgrep_positions('* <\' A', [tree])), [[(1,)]])
- self.assertEqual(list(tgrep.tgrep_positions('* <-1 A', [tree])), [[(1,)]])
- self.assertEqual(list(tgrep.tgrep_positions('* <-2 A', [tree])), [[(2,)]])
- self.assertEqual(list(tgrep.tgrep_positions('* <-3 A', [tree])), [[(0,)]])
-
- def test_rel_precedence(self):
- '''
- Test matching nodes based on precedence relations.
- '''
- tree = ParentedTree.fromstring(
- '(S (NP (NP (PP x)) (NP (AP x)))'
- ' (VP (AP (X (PP x)) (Y (AP x))))'
- ' (NP (RC (NP (AP x)))))'
- )
- self.assertEqual(
- list(tgrep.tgrep_positions('* . X', [tree])), [[(0,), (0, 1), (0, 1, 0)]]
- )
- self.assertEqual(
- list(tgrep.tgrep_positions('* . Y', [tree])), [[(1, 0, 0), (1, 0, 0, 0)]]
- )
- self.assertEqual(
- list(tgrep.tgrep_positions('* .. X', [tree])),
- [[(0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0)]],
- )
- self.assertEqual(
- list(tgrep.tgrep_positions('* .. Y', [tree])),
- [[(0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0), (1, 0, 0), (1, 0, 0, 0)]],
- )
- self.assertEqual(
- list(tgrep.tgrep_positions('* , X', [tree])), [[(1, 0, 1), (1, 0, 1, 0)]]
- )
- self.assertEqual(
- list(tgrep.tgrep_positions('* , Y', [tree])),
- [[(2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]],
- )
- self.assertEqual(
- list(tgrep.tgrep_positions('* ,, X', [tree])),
- [[(1, 0, 1), (1, 0, 1, 0), (2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]],
- )
- self.assertEqual(
- list(tgrep.tgrep_positions('* ,, Y', [tree])),
- [[(2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]],
- )
-
- def test_examples(self):
- '''
- Test the Basic Examples from the TGrep2 manual.
- '''
- tree = ParentedTree.fromstring('(S (NP (AP x)) (NP (PP x)))')
- # This matches any NP node that immediately dominates a PP:
- self.assertEqual(list(tgrep.tgrep_positions('NP < PP', [tree])), [[(1,)]])
-
- tree = ParentedTree.fromstring('(S (NP x) (VP x) (NP (PP x)) (VP x))')
- # This matches an NP that dominates a PP and is immediately
- # followed by a VP:
- self.assertEqual(list(tgrep.tgrep_positions('NP << PP . VP', [tree])), [[(2,)]])
-
- tree = ParentedTree.fromstring(
- '(S (NP (AP x)) (NP (PP x)) ' '(NP (DET x) (NN x)) (VP x))'
- )
- # This matches an NP that dominates a PP or is immediately
- # followed by a VP:
- self.assertEqual(
- list(tgrep.tgrep_positions('NP << PP | . VP', [tree])), [[(1,), (2,)]]
- )
-
- tree = ParentedTree.fromstring(
- '(S (NP (NP (PP x)) (NP (AP x)))'
- ' (VP (AP (NP (PP x)) (NP (AP x))))'
- ' (NP (RC (NP (AP x)))))'
- )
- # This matches an NP that does not dominate a PP. Also, the NP
- # must either have a parent that is an NP or be dominated by a
- # VP:
- self.assertEqual(
- list(tgrep.tgrep_positions('NP !<< PP [> NP | >> VP]', [tree])),
- [[(0, 1), (1, 0, 1)]],
- )
-
- tree = ParentedTree.fromstring(
- '(S (NP (AP (PP x) (VP x))) ' '(NP (AP (PP x) (NP x))) (NP x))'
- )
- # This matches an NP that dominates a PP which itself is
- # immediately followed by a VP. Note the use of parentheses to
- # group ". VP" with the PP rather than with the NP:
- self.assertEqual(
- list(tgrep.tgrep_positions('NP << (PP . VP)', [tree])), [[(0,)]]
- )
-
- tree = ParentedTree.fromstring(
- '(S (NP (DET a) (NN cat) (PP (IN on) (NP x)))'
- ' (NP (DET a) (NN cat) (PP (IN on) (NP x)) (PP x))'
- ' (NP x))'
- )
- # This matches an NP whose last child is a PP that begins with
- # the preposition "on":
- self.assertEqual(
- list(tgrep.tgrep_positions('NP <\' (PP <, (IN < on))', [tree])), [[(0,)]]
- )
-
- tree = ParentedTree.fromstring(
- '(S (S (C x) (A (B x))) (S (C x) (A x)) ' '(S (D x) (A (B x))))'
- )
- # The following pattern matches an S which has a child A and
- # another child that is a C and that the A has a child B:
- self.assertEqual(
- list(tgrep.tgrep_positions('S < (A < B) < C', [tree])), [[(0,)]]
- )
-
- tree = ParentedTree.fromstring(
- '(S (S (A (B x) (C x))) (S (S (C x) (A (B x)))))'
- )
- # However, this pattern means that S has child A and that A
- # has children B and C:
- self.assertEqual(
- list(tgrep.tgrep_positions('S < ((A < B) < C)', [tree])), [[(0,)]]
- )
-
- # It is equivalent to this:
- self.assertEqual(
- list(tgrep.tgrep_positions('S < (A < B < C)', [tree])), [[(0,)]]
- )
-
- def test_use_macros(self):
- '''
- Test defining and using tgrep2 macros.
- '''
- tree = ParentedTree.fromstring(
- '(VP (VB sold) (NP (DET the) '
- '(NN heiress)) (NP (NN deed) (PREP to) '
- '(NP (DET the) (NN school) (NN house))))'
- )
- self.assertEqual(
- list(
- tgrep.tgrep_positions(
- '@ NP /^NP/;\n@ NN /^NN/;\n@NP !< @NP !$.. @NN', [tree]
- )
- ),
- [[(1,), (2, 2)]],
- )
- # use undefined macro @CNP
- self.assertRaises(
- tgrep.TgrepException,
- list,
- tgrep.tgrep_positions(
- '@ NP /^NP/;\n@ NN /^NN/;\n@CNP !< @NP !$.. @NN', [tree]
- ),
- )
-
- def test_tokenize_node_labels(self):
- '''Test tokenization of labeled nodes.'''
- self.assertEqual(
- tgrep.tgrep_tokenize('S < @SBJ < (@VP < (@VB $.. @OBJ))'),
- [
- 'S',
- '<',
- '@SBJ',
- '<',
- '(',
- '@VP',
- '<',
- '(',
- '@VB',
- '$..',
- '@OBJ',
- ')',
- ')',
- ],
- )
- self.assertEqual(
- tgrep.tgrep_tokenize('S < @SBJ=s < (@VP=v < (@VB $.. @OBJ))'),
- [
- 'S',
- '<',
- '@SBJ',
- '=',
- 's',
- '<',
- '(',
- '@VP',
- '=',
- 'v',
- '<',
- '(',
- '@VB',
- '$..',
- '@OBJ',
- ')',
- ')',
- ],
- )
-
- def test_tokenize_segmented_patterns(self):
- '''Test tokenization of segmented patterns.'''
- self.assertEqual(
- tgrep.tgrep_tokenize('S < @SBJ=s < (@VP=v < (@VB $.. @OBJ)) : =s .. =v'),
- [
- 'S',
- '<',
- '@SBJ',
- '=',
- 's',
- '<',
- '(',
- '@VP',
- '=',
- 'v',
- '<',
- '(',
- '@VB',
- '$..',
- '@OBJ',
- ')',
- ')',
- ':',
- '=s',
- '..',
- '=v',
- ],
- )
-
- def test_labeled_nodes(self):
- '''
- Test labeled nodes.
-
- Test case from Emily M. Bender.
- '''
- search = '''
- # macros
- @ SBJ /SBJ/;
- @ VP /VP/;
- @ VB /VB/;
- @ VPoB /V[PB]/;
- @ OBJ /OBJ/;
-
- # 1 svo
- S < @SBJ=s < (@VP=v < (@VB $.. @OBJ)) : =s .. =v'''
- sent1 = ParentedTree.fromstring(
- '(S (NP-SBJ I) (VP (VB eat) (NP-OBJ (NNS apples))))'
- )
- sent2 = ParentedTree.fromstring(
- '(S (VP (VB eat) (NP-OBJ (NNS apples))) (NP-SBJ I))'
- )
- search_firsthalf = search.split('\n\n')[0] + 'S < @SBJ < (@VP < (@VB $.. @OBJ))'
- search_rewrite = 'S < (/.*SBJ/ $.. (/VP/ < (/VB/ $.. /.*OBJ/)))'
-
- self.assertTrue(list(tgrep.tgrep_positions(search_firsthalf, [sent1]))[0])
- self.assertTrue(list(tgrep.tgrep_positions(search, [sent1]))[0])
- self.assertTrue(list(tgrep.tgrep_positions(search_rewrite, [sent1]))[0])
- self.assertEqual(
- list(tgrep.tgrep_positions(search, [sent1])),
- list(tgrep.tgrep_positions(search_rewrite, [sent1])),
- )
- self.assertTrue(list(tgrep.tgrep_positions(search_firsthalf, [sent2]))[0])
- self.assertFalse(list(tgrep.tgrep_positions(search, [sent2]))[0])
- self.assertFalse(list(tgrep.tgrep_positions(search_rewrite, [sent2]))[0])
- self.assertEqual(
- list(tgrep.tgrep_positions(search, [sent2])),
- list(tgrep.tgrep_positions(search_rewrite, [sent2])),
- )
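-        # For reference: '=s' and '=v' label the matched @SBJ and @VP nodes,
-        # and the trailing segment ': =s .. =v' additionally requires the
-        # node labeled s to precede the node labeled v; that is why sent1
-        # (SBJ before VP) matches the full pattern while sent2 (VP before
-        # SBJ) only matches the unlabeled first half.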
-
- def test_multiple_conjs(self):
- '''
- Test that multiple (3 or more) conjunctions of node relations are
- handled properly.
- '''
- sent = ParentedTree.fromstring('((A (B b) (C c)) (A (B b) (C c) (D d)))')
- # search = '(A < B < C < D)'
- # search_tworels = '(A < B < C)'
- self.assertEqual(
- list(tgrep.tgrep_positions('(A < B < C < D)', [sent])), [[(1,)]]
- )
- self.assertEqual(
- list(tgrep.tgrep_positions('(A < B < C)', [sent])), [[(0,), (1,)]]
- )
-
- def test_trailing_semicolon(self):
- '''
- Test that semicolons at the end of a tgrep2 search string won't
- cause a parse failure.
- '''
- tree = ParentedTree.fromstring(
- '(S (NP (DT the) (JJ big) (NN dog)) ' '(VP bit) (NP (DT a) (NN cat)))'
- )
- self.assertEqual(list(tgrep.tgrep_positions('NN', [tree])), [[(0, 2), (2, 1)]])
- self.assertEqual(list(tgrep.tgrep_positions('NN;', [tree])), [[(0, 2), (2, 1)]])
- self.assertEqual(
- list(tgrep.tgrep_positions('NN;;', [tree])), [[(0, 2), (2, 1)]]
- )
-
-
-if __name__ == '__main__':
- unittest.main()
+++ /dev/null
-# -*- coding: utf-8 -*-
-"""
-Unit tests for nltk.tokenize.
-See also nltk/test/tokenize.doctest
-"""
-
-
-import unittest
-
-from nose import SkipTest
-from nose.tools import assert_equal
-
-from nltk.tokenize import (
- punkt,
- word_tokenize,
- TweetTokenizer,
- StanfordSegmenter,
- TreebankWordTokenizer,
- SyllableTokenizer,
-)
-
-
-class TestTokenize(unittest.TestCase):
- def test_tweet_tokenizer(self):
- """
- Test TweetTokenizer using words with special and accented characters.
- """
-
- tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
- s9 = "@myke: Let's test these words: resumé España München français"
- tokens = tokenizer.tokenize(s9)
- expected = [
- ':',
- "Let's",
- 'test',
- 'these',
- 'words',
- ':',
- 'resumé',
- 'España',
- 'München',
- 'français',
- ]
- self.assertEqual(tokens, expected)
-
- def test_sonority_sequencing_syllable_tokenizer(self):
- """
-        Test the SyllableTokenizer.
- """
- tokenizer = SyllableTokenizer()
- tokens = tokenizer.tokenize('justification')
- self.assertEqual(tokens, ['jus', 'ti', 'fi', 'ca', 'tion'])
-
- def test_stanford_segmenter_arabic(self):
- """
- Test the Stanford Word Segmenter for Arabic (default config)
- """
- try:
- seg = StanfordSegmenter()
- seg.default_config('ar')
- sent = u'يبحث علم الحاسوب استخدام الحوسبة بجميع اشكالها لحل المشكلات'
- segmented_sent = seg.segment(sent.split())
- assert segmented_sent.split() == [
- 'يبحث',
- 'علم',
- 'الحاسوب',
- 'استخدام',
- 'الحوسبة',
- 'ب',
- 'جميع',
- 'اشكال',
- 'ها',
- 'ل',
- 'حل',
- 'المشكلات',
- ]
- except LookupError as e:
- raise SkipTest(str(e))
-
- def test_stanford_segmenter_chinese(self):
- """
- Test the Stanford Word Segmenter for Chinese (default config)
- """
- try:
- seg = StanfordSegmenter()
- seg.default_config('zh')
- sent = u"这是斯坦福中文分词器测试"
- segmented_sent = seg.segment(sent.split())
- assert segmented_sent.split() == ['这', '是', '斯坦福', '中文', '分词器', '测试']
- except LookupError as e:
- raise SkipTest(str(e))
-
- def test_phone_tokenizer(self):
- """
- Test a string that resembles a phone number but contains a newline
- """
-
- # Should be recognized as a phone number, albeit one with multiple spaces
- tokenizer = TweetTokenizer()
- test1 = "(393) 928 -3010"
- expected = ['(393) 928 -3010']
- result = tokenizer.tokenize(test1)
- self.assertEqual(result, expected)
-
- # Due to newline, first three elements aren't part of a phone number;
- # fourth is
- test2 = "(393)\n928 -3010"
- expected = ['(', '393', ')', "928 -3010"]
- result = tokenizer.tokenize(test2)
- self.assertEqual(result, expected)
-
- def test_pad_asterisk(self):
- """
- Test padding of asterisk for word tokenization.
- """
- text = "This is a, *weird sentence with *asterisks in it."
- expected = ['This', 'is', 'a', ',', '*', 'weird', 'sentence',
- 'with', '*', 'asterisks', 'in', 'it', '.']
- self.assertEqual(word_tokenize(text), expected)
-
- def test_pad_dotdot(self):
- """
- Test padding of dotdot* for word tokenization.
- """
- text = "Why did dotdot.. not get tokenized but dotdotdot... did? How about manydots....."
- expected = ['Why', 'did', 'dotdot', '..', 'not', 'get',
- 'tokenized', 'but', 'dotdotdot', '...', 'did', '?',
- 'How', 'about', 'manydots', '.....']
- self.assertEqual(word_tokenize(text), expected)
-
- def test_remove_handle(self):
- """
- Test remove_handle() from casual.py with specially crafted edge cases
- """
-
- tokenizer = TweetTokenizer(strip_handles=True)
-
- # Simple example. Handles with just numbers should be allowed
- test1 = "@twitter hello @twi_tter_. hi @12345 @123news"
- expected = ['hello', '.', 'hi']
- result = tokenizer.tokenize(test1)
- self.assertEqual(result, expected)
-
- # Handles are allowed to follow any of the following characters
- test2 = "@n`@n~@n(@n)@n-@n=@n+@n\\@n|@n[@n]@n{@n}@n;@n:@n'@n\"@n/@n?@n.@n,@n<@n>@n @n\n@n ñ@n.ü@n.ç@n."
- expected = [
- '`',
- '~',
- '(',
- ')',
- '-',
- '=',
- '+',
- '\\',
- '|',
- '[',
- ']',
- '{',
- '}',
- ';',
- ':',
- "'",
- '"',
- '/',
- '?',
- '.',
- ',',
- '<',
- '>',
- 'ñ',
- '.',
- 'ü',
- '.',
- 'ç',
- '.',
- ]
- result = tokenizer.tokenize(test2)
- self.assertEqual(result, expected)
-
- # Handles are NOT allowed to follow any of the following characters
- test3 = "a@n j@n z@n A@n L@n Z@n 1@n 4@n 7@n 9@n 0@n _@n !@n @@n #@n $@n %@n &@n *@n"
- expected = [
- 'a',
- '@n',
- 'j',
- '@n',
- 'z',
- '@n',
- 'A',
- '@n',
- 'L',
- '@n',
- 'Z',
- '@n',
- '1',
- '@n',
- '4',
- '@n',
- '7',
- '@n',
- '9',
- '@n',
- '0',
- '@n',
- '_',
- '@n',
- '!',
- '@n',
- '@',
- '@n',
- '#',
- '@n',
- '$',
- '@n',
- '%',
- '@n',
- '&',
- '@n',
- '*',
- '@n',
- ]
- result = tokenizer.tokenize(test3)
- self.assertEqual(result, expected)
-
- # Handles are allowed to precede the following characters
- test4 = "@n!a @n#a @n$a @n%a @n&a @n*a"
- expected = ['!', 'a', '#', 'a', '$', 'a', '%', 'a', '&', 'a', '*', 'a']
- result = tokenizer.tokenize(test4)
- self.assertEqual(result, expected)
-
- # Tests interactions with special symbols and multiple @
- test5 = "@n!@n @n#@n @n$@n @n%@n @n&@n @n*@n @n@n @@n @n@@n @n_@n @n7@n @nj@n"
- expected = [
- '!',
- '@n',
- '#',
- '@n',
- '$',
- '@n',
- '%',
- '@n',
- '&',
- '@n',
- '*',
- '@n',
- '@n',
- '@n',
- '@',
- '@n',
- '@n',
- '@',
- '@n',
- '@n_',
- '@n',
- '@n7',
- '@n',
- '@nj',
- '@n',
- ]
- result = tokenizer.tokenize(test5)
- self.assertEqual(result, expected)
-
- # Tests that handles can have a max length of 20
- test6 = "@abcdefghijklmnopqrstuvwxyz @abcdefghijklmnopqrst1234 @abcdefghijklmnopqrst_ @abcdefghijklmnopqrstendofhandle"
- expected = ['uvwxyz', '1234', '_', 'endofhandle']
- result = tokenizer.tokenize(test6)
- self.assertEqual(result, expected)
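-        # For reference: '@' plus the 20 characters 'abcdefghijklmnopqrst'
-        # is stripped as the handle in each case above, and whatever follows
-        # ('uvwxyz', '1234', '_', 'endofhandle') survives as an ordinary
-        # token.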
-
- # Edge case where an @ comes directly after a long handle
- test7 = "@abcdefghijklmnopqrstu@abcde @abcdefghijklmnopqrst@abcde @abcdefghijklmnopqrst_@abcde @abcdefghijklmnopqrst5@abcde"
- expected = [
- 'u',
- '@abcde',
- '@abcdefghijklmnopqrst',
- '@abcde',
- '_',
- '@abcde',
- '5',
- '@abcde',
- ]
- result = tokenizer.tokenize(test7)
- self.assertEqual(result, expected)
-
- def test_treebank_span_tokenizer(self):
- """
- Test TreebankWordTokenizer.span_tokenize function
- """
-
- tokenizer = TreebankWordTokenizer()
-
- # Test case in the docstring
- test1 = "Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks)."
- expected = [
- (0, 4),
- (5, 12),
- (13, 17),
- (18, 19),
- (19, 23),
- (24, 26),
- (27, 30),
- (31, 32),
- (32, 36),
- (36, 37),
- (37, 38),
- (40, 46),
- (47, 48),
- (48, 51),
- (51, 52),
- (53, 55),
- (56, 59),
- (60, 62),
- (63, 68),
- (69, 70),
- (70, 76),
- (76, 77),
- (77, 78),
- ]
- result = list(tokenizer.span_tokenize(test1))
- self.assertEqual(result, expected)
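-        # For reference: each span is a (start, end) character-offset pair
-        # into the input, so the token strings can be recovered by slicing,
-        # e.g. [test1[start:end] for (start, end) in result] yields 'Good'
-        # from (0, 4), '$' from (18, 19), '3.88' from (19, 23), and so on.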
-
- # Test case with double quotation
- test2 = "The DUP is similar to the \"religious right\" in the United States and takes a hardline stance on social issues"
- expected = [
- (0, 3),
- (4, 7),
- (8, 10),
- (11, 18),
- (19, 21),
- (22, 25),
- (26, 27),
- (27, 36),
- (37, 42),
- (42, 43),
- (44, 46),
- (47, 50),
- (51, 57),
- (58, 64),
- (65, 68),
- (69, 74),
- (75, 76),
- (77, 85),
- (86, 92),
- (93, 95),
- (96, 102),
- (103, 109),
- ]
- result = list(tokenizer.span_tokenize(test2))
- self.assertEqual(result, expected)
-
-        # Test case with double quotation as well as converted quotations
- test3 = "The DUP is similar to the \"religious right\" in the United States and takes a ``hardline'' stance on social issues"
- expected = [
- (0, 3),
- (4, 7),
- (8, 10),
- (11, 18),
- (19, 21),
- (22, 25),
- (26, 27),
- (27, 36),
- (37, 42),
- (42, 43),
- (44, 46),
- (47, 50),
- (51, 57),
- (58, 64),
- (65, 68),
- (69, 74),
- (75, 76),
- (77, 79),
- (79, 87),
- (87, 89),
- (90, 96),
- (97, 99),
- (100, 106),
- (107, 113),
- ]
- result = list(tokenizer.span_tokenize(test3))
- self.assertEqual(result, expected)
-
- def test_word_tokenize(self):
- """
- Test word_tokenize function
- """
-
- sentence = "The 'v', I've been fooled but I'll seek revenge."
- expected = ['The', "'", 'v', "'", ',', 'I', "'ve", 'been', 'fooled',
- 'but', 'I', "'ll", 'seek', 'revenge', '.']
- self.assertEqual(word_tokenize(sentence), expected)
-
- sentence = "'v' 're'"
- expected = ["'", 'v', "'", "'re", "'"]
- self.assertEqual(word_tokenize(sentence), expected)
-
- def test_punkt_pair_iter(self):
-
- test_cases = [
- ('12', [('1', '2'), ('2', None)]),
- ('123', [('1', '2'), ('2', '3'), ('3', None)]),
- ('1234', [('1', '2'), ('2', '3'), ('3', '4'), ('4', None)]),
- ]
-
- for (test_input, expected_output) in test_cases:
- actual_output = [x for x in punkt._pair_iter(test_input)]
-
- assert_equal(actual_output, expected_output)
-
- def test_punkt_pair_iter_handles_stop_iteration_exception(self):
- # test input to trigger StopIteration from next()
- it = iter([])
- # call method under test and produce a generator
- gen = punkt._pair_iter(it)
- # unpack generator, ensure that no error is raised
- list(gen)
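-        # A minimal sketch of the pairing contract exercised by the two
-        # tests above (an assumption about _pair_iter's behaviour, not the
-        # actual implementation): yield (item, next_item) pairs, pair the
-        # last item with None, and yield nothing for an empty iterator.
-        #
-        #     def pair_iter(items):
-        #         items = iter(items)
-        #         try:
-        #             prev = next(items)
-        #         except StopIteration:
-        #             return  # empty input: no pairs, no error
-        #         for item in items:
-        #             yield (prev, item)
-        #             prev = item
-        #         yield (prev, None)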
-
- def test_punkt_tokenize_words_handles_stop_iteration_exception(self):
- obj = punkt.PunktBaseClass()
-
- class TestPunktTokenizeWordsMock:
- def word_tokenize(self, s):
- return iter([])
-
- obj._lang_vars = TestPunktTokenizeWordsMock()
- # unpack generator, ensure that no error is raised
- list(obj._tokenize_words('test'))
+++ /dev/null
-# -*- coding: utf-8 -*-
-"""
-Tests for static parts of Twitter package
-"""
-
-import os
-import unittest
-from nose import SkipTest
-
-try:
- import twython
-except ImportError as e:
- raise SkipTest("The twython library has not been installed.")
-
-from nltk.twitter import Authenticate
-
-
-class TestCredentials(unittest.TestCase):
- """
- Tests that Twitter credentials information from file is handled correctly.
- """
-
- def setUp(self):
- self.subdir = os.path.join(os.path.dirname(__file__), 'files')
- self.auth = Authenticate()
- os.environ['TWITTER'] = 'twitter-files'
-
- def test_environment(self):
- """
- Test that environment variable has been read correctly.
- """
- fn = os.path.basename(self.auth.creds_subdir)
- self.assertEqual(fn, os.environ['TWITTER'])
-
- def test_empty_subdir1(self):
- """
- Setting subdir to empty path should raise an error.
- """
- try:
- self.auth.load_creds(subdir='')
- # raises ValueError (zero length field name in format) for python 2.6
- # OSError for the rest
- except OSError:
- pass
- except ValueError:
- pass
- except Exception as e:
- self.fail('Unexpected exception thrown: %s' % e)
- else:
- self.fail('OSError exception not thrown.')
-
- def test_empty_subdir2(self):
- """
- Setting subdir to `None` should raise an error.
- """
- self.auth.creds_subdir = None
- try:
- self.auth.load_creds()
- except ValueError:
- pass
- except Exception as e:
- self.fail('Unexpected exception thrown: %s' % e)
- else:
- self.fail('ValueError exception not thrown.')
-
- def test_missingdir(self):
- """
- Setting subdir to nonexistent directory should raise an error.
- """
- try:
- self.auth.load_creds(subdir='/nosuchdir')
- # raises ValueError (zero length field name in format) for python 2.6
- # OSError for the rest
- except OSError:
- pass
- except ValueError:
- pass
- except Exception as e:
- self.fail('Unexpected exception thrown: %s' % e)
- else:
- self.fail('OSError exception not thrown.')
-
- def test_missingfile1(self):
- """
-        Defaults for authentication will fail since 'credentials.txt' is not
-        present in the default subdir, as read from `os.environ['TWITTER']`.
- """
- try:
- self.auth.load_creds()
- # raises ValueError (zero length field name in format) for python 2.6
- # OSError for the rest
- except OSError:
- pass
- except ValueError:
- pass
- except Exception as e:
- self.fail('Unexpected exception thrown: %s' % e)
- else:
- self.fail('OSError exception not thrown.')
-
- def test_missingfile2(self):
- """
- Credentials file 'foobar' cannot be found in default subdir.
- """
- try:
- self.auth.load_creds(creds_file='foobar')
- # raises ValueError (zero length field name in format) for python 2.6
- # OSError for the rest
- except OSError:
- pass
- except ValueError:
- pass
- except Exception as e:
- self.fail('Unexpected exception thrown: %s' % e)
- else:
- self.fail('OSError exception not thrown.')
-
- def test_incomplete_file(self):
- """
- Credentials file 'bad_oauth1-1.txt' is incomplete
- """
- try:
- self.auth.load_creds(creds_file='bad_oauth1-1.txt', subdir=self.subdir)
- except ValueError:
- pass
- except Exception as e:
- self.fail('Unexpected exception thrown: %s' % e)
- else:
- self.fail('ValueError exception not thrown.')
-
- def test_malformed_file1(self):
- """
- First key in credentials file 'bad_oauth1-2.txt' is ill-formed
- """
- try:
- self.auth.load_creds(creds_file='bad_oauth1-2.txt', subdir=self.subdir)
- except ValueError:
- pass
- except Exception as e:
- self.fail('Unexpected exception thrown: %s' % e)
- else:
- self.fail('ValueError exception not thrown.')
-
- def test_malformed_file2(self):
- """
-        First key in credentials file 'bad_oauth1-3.txt' is ill-formed
- """
- try:
- self.auth.load_creds(creds_file='bad_oauth1-3.txt', subdir=self.subdir)
- except ValueError:
- pass
- except Exception as e:
- self.fail('Unexpected exception thrown: %s' % e)
- else:
- self.fail('ValueError exception not thrown.')
-
- def test_correct_path(self):
- """
- Path to default credentials file is well-formed, given specified
- subdir.
- """
- self.auth.load_creds(subdir=self.subdir)
- self.auth.creds_fullpath = os.path.join(self.subdir, self.auth.creds_file)
-
- def test_correct_file1(self):
- """
- Default credentials file is identified
- """
- self.auth.load_creds(subdir=self.subdir)
- self.assertEqual(self.auth.creds_file, 'credentials.txt')
-
- def test_correct_file2(self):
- """
-        Default credentials file has been read correctly
- """
- oauth = self.auth.load_creds(subdir=self.subdir)
- self.assertEqual(oauth['app_key'], 'a')
-
-
-if __name__ == '__main__':
- unittest.main()
+++ /dev/null
-# -*- coding: utf-8 -*-
-"""
-Unit tests for nltk.corpus.wordnet
-See also nltk/test/wordnet.doctest
-"""
-
-
-import collections
-import os
-import unittest
-
-from nose import SkipTest
-
-from nltk.corpus.reader.wordnet import WordNetCorpusReader
-from nltk.corpus import wordnet as wn
-from nltk.corpus import wordnet_ic as wnic
-from nltk.data import find as find_data
-
-
-wn.ensure_loaded()
-S = wn.synset
-L = wn.lemma
-
-
-class WordNetDemo(unittest.TestCase):
- def test_retrieve_synset(self):
- move_synset = S('go.v.21')
- self.assertEqual(move_synset.name(), "move.v.15")
- self.assertEqual(move_synset.lemma_names(), ['move', 'go'])
- self.assertEqual(
- move_synset.definition(), "have a turn; make one's move in a game"
- )
- self.assertEqual(move_synset.examples(), ['Can I go now?'])
-
- def test_retrieve_synsets(self):
- self.assertEqual(sorted(wn.synsets('zap', pos='n')), [S('zap.n.01')])
- self.assertEqual(
- sorted(wn.synsets('zap', pos='v')),
- [S('microwave.v.01'), S('nuke.v.01'), S('zap.v.01'), S('zap.v.02')],
- )
-
- def test_hyperhyponyms(self):
-        # Not every synset has hypernyms().
- self.assertEqual(S('travel.v.01').hypernyms(), [])
- self.assertEqual(S('travel.v.02').hypernyms(), [S('travel.v.03')])
- self.assertEqual(S('travel.v.03').hypernyms(), [])
-
- # Test hyper-/hyponyms.
- self.assertEqual(S('breakfast.n.1').hypernyms(), [S('meal.n.01')])
- first_five_meal_hypo = [
- S('banquet.n.02'),
- S('bite.n.04'),
- S('breakfast.n.01'),
- S('brunch.n.01'),
- S('buffet.n.02'),
- ]
- self.assertEqual(sorted(S('meal.n.1').hyponyms()[:5]), first_five_meal_hypo)
- self.assertEqual(S('Austen.n.1').instance_hypernyms(), [S('writer.n.01')])
- first_five_composer_hypo = [
- S('ambrose.n.01'),
- S('bach.n.01'),
- S('barber.n.01'),
- S('bartok.n.01'),
- S('beethoven.n.01'),
- ]
- self.assertEqual(
- S('composer.n.1').instance_hyponyms()[:5], first_five_composer_hypo
- )
-
- # Test root hyper-/hyponyms
- self.assertEqual(S('person.n.01').root_hypernyms(), [S('entity.n.01')])
- self.assertEqual(S('sail.v.01').root_hypernyms(), [S('travel.v.01')])
- self.assertEqual(
- S('fall.v.12').root_hypernyms(), [S('act.v.01'), S('fall.v.17')]
- )
-
- def test_derivationally_related_forms(self):
- # Test `derivationally_related_forms()`
- self.assertEqual(
- L('zap.v.03.nuke').derivationally_related_forms(),
- [L('atomic_warhead.n.01.nuke')],
- )
- self.assertEqual(
- L('zap.v.03.atomize').derivationally_related_forms(),
- [L('atomization.n.02.atomization')],
- )
- self.assertEqual(
- L('zap.v.03.atomise').derivationally_related_forms(),
- [L('atomization.n.02.atomisation')],
- )
- self.assertEqual(L('zap.v.03.zap').derivationally_related_forms(), [])
-
- def test_meronyms_holonyms(self):
- # Test meronyms, holonyms.
- self.assertEqual(
- S('dog.n.01').member_holonyms(), [S('canis.n.01'), S('pack.n.06')]
- )
- self.assertEqual(S('dog.n.01').part_meronyms(), [S('flag.n.07')])
-
- self.assertEqual(S('faculty.n.2').member_meronyms(), [S('professor.n.01')])
- self.assertEqual(S('copilot.n.1').member_holonyms(), [S('crew.n.01')])
-
- self.assertEqual(
- S('table.n.2').part_meronyms(),
- [S('leg.n.03'), S('tabletop.n.01'), S('tableware.n.01')],
- )
- self.assertEqual(S('course.n.7').part_holonyms(), [S('meal.n.01')])
-
- self.assertEqual(
- S('water.n.1').substance_meronyms(), [S('hydrogen.n.01'), S('oxygen.n.01')]
- )
- self.assertEqual(
- S('gin.n.1').substance_holonyms(),
- [
- S('gin_and_it.n.01'),
- S('gin_and_tonic.n.01'),
- S('martini.n.01'),
- S('pink_lady.n.01'),
- ],
- )
-
- def test_antonyms(self):
- # Test antonyms.
- self.assertEqual(
- L('leader.n.1.leader').antonyms(), [L('follower.n.01.follower')]
- )
- self.assertEqual(
- L('increase.v.1.increase').antonyms(), [L('decrease.v.01.decrease')]
- )
-
- def test_misc_relations(self):
- # Test misc relations.
- self.assertEqual(S('snore.v.1').entailments(), [S('sleep.v.01')])
- self.assertEqual(
- S('heavy.a.1').similar_tos(),
- [
- S('dense.s.03'),
- S('doughy.s.01'),
- S('heavier-than-air.s.01'),
- S('hefty.s.02'),
- S('massive.s.04'),
- S('non-buoyant.s.01'),
- S('ponderous.s.02'),
- ],
- )
- self.assertEqual(S('light.a.1').attributes(), [S('weight.n.01')])
- self.assertEqual(S('heavy.a.1').attributes(), [S('weight.n.01')])
-
- # Test pertainyms.
- self.assertEqual(
- L('English.a.1.English').pertainyms(), [L('england.n.01.England')]
- )
-
- def test_lch(self):
- # Test LCH.
- self.assertEqual(
- S('person.n.01').lowest_common_hypernyms(S('dog.n.01')),
- [S('organism.n.01')],
- )
- self.assertEqual(
- S('woman.n.01').lowest_common_hypernyms(S('girlfriend.n.02')),
- [S('woman.n.01')],
- )
-
- def test_domains(self):
- # Test domains.
- self.assertEqual(S('code.n.03').topic_domains(), [S('computer_science.n.01')])
- self.assertEqual(S('pukka.a.01').region_domains(), [S('india.n.01')])
- self.assertEqual(S('freaky.a.01').usage_domains(), [S('slang.n.02')])
-
- def test_in_topic_domains(self):
- # Test in domains.
- self.assertEqual(
- S('computer_science.n.01').in_topic_domains()[0], S('access.n.05')
- )
- self.assertEqual(S('germany.n.01').in_region_domains()[23], S('trillion.n.02'))
- self.assertEqual(S('slang.n.02').in_usage_domains()[1], S('airhead.n.01'))
-
- def test_wordnet_similarities(self):
- # Path based similarities.
- self.assertAlmostEqual(S('cat.n.01').path_similarity(S('cat.n.01')), 1.0)
- self.assertAlmostEqual(S('dog.n.01').path_similarity(S('cat.n.01')), 0.2)
- self.assertAlmostEqual(
- S('dog.n.01').lch_similarity(S('cat.n.01')), 2.028, places=3
- )
- self.assertAlmostEqual(
- S('dog.n.01').wup_similarity(S('cat.n.01')), 0.8571, places=3
- )
- # Information Content similarities.
- brown_ic = wnic.ic('ic-brown.dat')
- self.assertAlmostEqual(
- S('dog.n.01').jcn_similarity(S('cat.n.01'), brown_ic), 0.4497, places=3
- )
- semcor_ic = wnic.ic('ic-semcor.dat')
- self.assertAlmostEqual(
- S('dog.n.01').lin_similarity(S('cat.n.01'), semcor_ic), 0.8863, places=3
- )
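-        # For reference (assuming NLTK's standard definitions): with a
-        # shortest hypernym-path distance d = 4 between dog.n.01 and
-        # cat.n.01, path_similarity = 1 / (d + 1) = 0.2 and lch_similarity =
-        # -log((d + 1) / (2 * D)) = -log(5 / 38) ~= 2.028, where D = 19 is
-        # the maximum noun taxonomy depth.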
-
- def test_omw_lemma_no_trailing_underscore(self):
- expected = sorted([
- u'popolna_sprememba_v_mišljenju',
- u'popoln_obrat',
- u'preobrat',
- u'preobrat_v_mišljenju'
- ])
- self.assertEqual(sorted(S('about-face.n.02').lemma_names(lang='slv')), expected)
-
- def test_iterable_type_for_all_lemma_names(self):
- # Duck-test for iterables.
- # See https://stackoverflow.com/a/36230057/610569
- cat_lemmas = wn.all_lemma_names(lang='cat')
- eng_lemmas = wn.all_lemma_names(lang='eng')
-
- self.assertTrue(hasattr(eng_lemmas, '__iter__'))
- self.assertTrue(hasattr(eng_lemmas, '__next__') or hasattr(eng_lemmas, 'next'))
- self.assertTrue(eng_lemmas.__iter__() is eng_lemmas)
-
- self.assertTrue(hasattr(cat_lemmas, '__iter__'))
-        self.assertTrue(hasattr(cat_lemmas, '__next__') or hasattr(cat_lemmas, 'next'))
- self.assertTrue(cat_lemmas.__iter__() is cat_lemmas)
+++ /dev/null
-# -*- coding: utf-8 -*-
-"""
-Tests for BLEU translation evaluation metric
-"""
-
-import io
-import unittest
-
-from nltk.data import find
-from nltk.translate.bleu_score import (
- modified_precision,
- brevity_penalty,
- closest_ref_length,
-)
-from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction
-
-
-class TestBLEU(unittest.TestCase):
- def test_modified_precision(self):
- """
- Examples from the original BLEU paper
- http://www.aclweb.org/anthology/P02-1040.pdf
- """
- # Example 1: the "the*" example.
- # Reference sentences.
- ref1 = 'the cat is on the mat'.split()
- ref2 = 'there is a cat on the mat'.split()
- # Hypothesis sentence(s).
- hyp1 = 'the the the the the the the'.split()
-
- references = [ref1, ref2]
-
- # Testing modified unigram precision.
- hyp1_unigram_precision = float(modified_precision(references, hyp1, n=1))
- assert round(hyp1_unigram_precision, 4) == 0.2857
- # With assertAlmostEqual at 4 place precision.
- self.assertAlmostEqual(hyp1_unigram_precision, 0.28571428, places=4)
-
- # Testing modified bigram precision.
- assert float(modified_precision(references, hyp1, n=2)) == 0.0
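-        # For reference: hyp1 contains 'the' 7 times, but the count is
-        # clipped to the maximum count in any single reference (2, from
-        # ref1), so the modified unigram precision is 2 / 7 ~= 0.2857; no
-        # hypothesis bigram ('the the') occurs in either reference, hence
-        # the bigram precision of 0.0.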
-
- # Example 2: the "of the" example.
- # Reference sentences
- ref1 = str(
- 'It is a guide to action that ensures that the military '
- 'will forever heed Party commands'
- ).split()
- ref2 = str(
- 'It is the guiding principle which guarantees the military '
- 'forces always being under the command of the Party'
- ).split()
- ref3 = str(
- 'It is the practical guide for the army always to heed '
- 'the directions of the party'
- ).split()
- # Hypothesis sentence(s).
- hyp1 = 'of the'.split()
-
- references = [ref1, ref2, ref3]
- # Testing modified unigram precision.
- assert float(modified_precision(references, hyp1, n=1)) == 1.0
-
- # Testing modified bigram precision.
- assert float(modified_precision(references, hyp1, n=2)) == 1.0
-
- # Example 3: Proper MT outputs.
- hyp1 = str(
- 'It is a guide to action which ensures that the military '
- 'always obeys the commands of the party'
- ).split()
- hyp2 = str(
- 'It is to insure the troops forever hearing the activity '
- 'guidebook that party direct'
- ).split()
-
- references = [ref1, ref2, ref3]
-
- # Unigram precision.
- hyp1_unigram_precision = float(modified_precision(references, hyp1, n=1))
- hyp2_unigram_precision = float(modified_precision(references, hyp2, n=1))
- # Test unigram precision with assertAlmostEqual at 4 place precision.
- self.assertAlmostEqual(hyp1_unigram_precision, 0.94444444, places=4)
- self.assertAlmostEqual(hyp2_unigram_precision, 0.57142857, places=4)
- # Test unigram precision with rounding.
- assert round(hyp1_unigram_precision, 4) == 0.9444
- assert round(hyp2_unigram_precision, 4) == 0.5714
-
- # Bigram precision
- hyp1_bigram_precision = float(modified_precision(references, hyp1, n=2))
- hyp2_bigram_precision = float(modified_precision(references, hyp2, n=2))
- # Test bigram precision with assertAlmostEqual at 4 place precision.
- self.assertAlmostEqual(hyp1_bigram_precision, 0.58823529, places=4)
- self.assertAlmostEqual(hyp2_bigram_precision, 0.07692307, places=4)
- # Test bigram precision with rounding.
- assert round(hyp1_bigram_precision, 4) == 0.5882
- assert round(hyp2_bigram_precision, 4) == 0.0769
-
- def test_brevity_penalty(self):
- # Test case from brevity_penalty_closest function in mteval-v13a.pl.
- # Same test cases as in the doctest in nltk.translate.bleu_score.py
- references = [['a'] * 11, ['a'] * 8]
- hypothesis = ['a'] * 7
- hyp_len = len(hypothesis)
- closest_ref_len = closest_ref_length(references, hyp_len)
- self.assertAlmostEqual(
- brevity_penalty(closest_ref_len, hyp_len), 0.8669, places=4
- )
-
- references = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7]
- hypothesis = ['a'] * 7
- hyp_len = len(hypothesis)
- closest_ref_len = closest_ref_length(references, hyp_len)
- assert brevity_penalty(closest_ref_len, hyp_len) == 1.0
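-        # For reference (assuming the usual definition BP = exp(1 - r / c)
-        # for c <= r): in the first case the reference length closest to
-        # the hypothesis length c = 7 is r = 8, so BP = exp(1 - 8/7) ~=
-        # 0.8669; in the second case a reference of length exactly 7
-        # exists, so r = c and BP = 1.0.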
-
- def test_zero_matches(self):
-        # Test case where there are 0 matches
- references = ['The candidate has no alignment to any of the references'.split()]
- hypothesis = 'John loves Mary'.split()
-
-        # Test BLEU at n-gram orders n = 1, ..., len(hypothesis) - 1.
- for n in range(1, len(hypothesis)):
- weights = [1.0 / n] * n # Uniform weights.
- assert sentence_bleu(references, hypothesis, weights) == 0
-
- def test_full_matches(self):
-        # Test case where there are 100% matches
- references = ['John loves Mary'.split()]
- hypothesis = 'John loves Mary'.split()
-
-        # Test BLEU at n-gram orders n = 1, ..., len(hypothesis) - 1.
- for n in range(1, len(hypothesis)):
- weights = [1.0 / n] * n # Uniform weights.
- assert sentence_bleu(references, hypothesis, weights) == 1.0
-
- def test_partial_matches_hypothesis_longer_than_reference(self):
- references = ['John loves Mary'.split()]
- hypothesis = 'John loves Mary who loves Mike'.split()
-        # Since no 4-gram matches were found, the result should be zero:
-        # exp(w_1*log(p_1) + w_2*log(p_2) + w_3*log(p_3) + w_4*log(0)) = exp(-inf) = 0
- self.assertAlmostEqual(sentence_bleu(references, hypothesis), 0.0, places=4)
- # Checks that the warning has been raised because len(reference) < 4.
- try:
- self.assertWarns(UserWarning, sentence_bleu, references, hypothesis)
- except AttributeError:
- pass # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
-
-
-# @unittest.skip("Skipping fringe cases for BLEU.")
-class TestBLEUFringeCases(unittest.TestCase):
- def test_case_where_n_is_bigger_than_hypothesis_length(self):
- # Test BLEU to nth order of n-grams, where n > len(hypothesis).
- references = ['John loves Mary ?'.split()]
- hypothesis = 'John loves Mary'.split()
-        n = len(hypothesis) + 1  # deliberately greater than len(hypothesis)
- weights = [1.0 / n] * n # Uniform weights.
-        # Since no n-gram matches were found, the result should be zero:
-        # exp(w_1*log(p_1) + ... + w_n*log(0)) = exp(-inf) = 0
- self.assertAlmostEqual(
- sentence_bleu(references, hypothesis, weights), 0.0, places=4
- )
- # Checks that the warning has been raised because len(hypothesis) < 4.
- try:
- self.assertWarns(UserWarning, sentence_bleu, references, hypothesis)
- except AttributeError:
- pass # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
-
-        # Test case where n > len(hypothesis) and also n > len(reference),
-        # a special case where reference == hypothesis.
- references = ['John loves Mary'.split()]
- hypothesis = 'John loves Mary'.split()
-        # Since no 4-gram matches were found, the result should be zero:
-        # exp(w_1*log(p_1) + w_2*log(p_2) + w_3*log(p_3) + w_4*log(0)) = exp(-inf) = 0
- self.assertAlmostEqual(
- sentence_bleu(references, hypothesis, weights), 0.0, places=4
- )
-
- def test_empty_hypothesis(self):
-        # Test case where the hypothesis is empty.
- references = ['The candidate has no alignment to any of the references'.split()]
- hypothesis = []
- assert sentence_bleu(references, hypothesis) == 0
-
- def test_empty_references(self):
-        # Test case where the reference is empty.
- references = [[]]
- hypothesis = 'John loves Mary'.split()
- assert sentence_bleu(references, hypothesis) == 0
-
- def test_empty_references_and_hypothesis(self):
-        # Test case where both the references and the hypothesis are empty.
- references = [[]]
- hypothesis = []
- assert sentence_bleu(references, hypothesis) == 0
-
- def test_reference_or_hypothesis_shorter_than_fourgrams(self):
-        # Test case where the length of the reference or the hypothesis
-        # is shorter than 4.
- references = ['let it go'.split()]
- hypothesis = 'let go it'.split()
-        # Checks that the BLEU score for this hypothesis/reference pair is 0.0:
-        # exp(w_1*log(p_1) + w_2*log(p_2) + w_3*log(p_3) + w_4*log(0)) = exp(-inf) = 0
- self.assertAlmostEqual(sentence_bleu(references, hypothesis), 0.0, places=4)
- # Checks that the warning has been raised.
- try:
- self.assertWarns(UserWarning, sentence_bleu, references, hypothesis)
- except AttributeError:
- pass # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
-
-
-class TestBLEUvsMteval13a(unittest.TestCase):
- def test_corpus_bleu(self):
- ref_file = find('models/wmt15_eval/ref.ru')
- hyp_file = find('models/wmt15_eval/google.ru')
- mteval_output_file = find('models/wmt15_eval/mteval-13a.output')
-
- # Reads the BLEU scores from the `mteval-13a.output` file.
- # The order of the list corresponds to the order of the ngrams.
- with open(mteval_output_file, 'r') as mteval_fin:
-            # The numbers are located in the second-to-last line of the file.
-            # The first and last items on that line are the score and system
-            # names, hence the [1:-1] slice.
- mteval_bleu_scores = map(float, mteval_fin.readlines()[-2].split()[1:-1])
-
- with io.open(ref_file, 'r', encoding='utf8') as ref_fin:
- with io.open(hyp_file, 'r', encoding='utf8') as hyp_fin:
- # Whitespace tokenize the file.
-                # Note: split() automatically strips surrounding whitespace.
- hypothesis = list(map(lambda x: x.split(), hyp_fin))
-                # Note that the corpus_bleu input is a list of lists of references.
- references = list(map(lambda x: [x.split()], ref_fin))
- # Without smoothing.
- for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores):
- nltk_bleu = corpus_bleu(
- references, hypothesis, weights=(1.0 / i,) * i
- )
-                    # Check that the BLEU score difference is less than 0.005.
- # Note: This is an approximate comparison; as much as
- # +/- 0.01 BLEU might be "statistically significant",
- # the actual translation quality might not be.
- assert abs(mteval_bleu - nltk_bleu) < 0.005
-
- # With the same smoothing method used in mteval-v13a.pl
- chencherry = SmoothingFunction()
- for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores):
- nltk_bleu = corpus_bleu(
- references,
- hypothesis,
- weights=(1.0 / i,) * i,
- smoothing_function=chencherry.method3,
- )
- assert abs(mteval_bleu - nltk_bleu) < 0.005
-
-
-class TestBLEUWithBadSentence(unittest.TestCase):
- def test_corpus_bleu_with_bad_sentence(self):
- hyp = "Teo S yb , oe uNb , R , T t , , t Tue Ar saln S , , 5istsi l , 5oe R ulO sae oR R"
- ref = str(
- "Their tasks include changing a pump on the faulty stokehold ."
- "Likewise , two species that are very similar in morphology "
- "were distinguished using genetics ."
- )
- references = [[ref.split()]]
- hypotheses = [hyp.split()]
-        try:  # Check that the warning is raised since the 2-gram overlap count is 0.
- with self.assertWarns(UserWarning):
-                # Verify that the BLEU score is 0.0 since the 2-gram overlap count is 0.
- self.assertAlmostEqual(
- corpus_bleu(references, hypotheses), 0.0, places=4
- )
- except AttributeError: # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
- self.assertAlmostEqual(corpus_bleu(references, hypotheses), 0.0, places=4)
+++ /dev/null
-# -*- coding: utf-8 -*-
-"""
-Tests GDFA alignments
-"""
-
-import unittest
-
-from nltk.translate.gdfa import grow_diag_final_and
-
-
-class TestGDFA(unittest.TestCase):
- def test_from_eflomal_outputs(self):
- """
-        Testing GDFA with the first 10 eflomal outputs from issue #1829
- https://github.com/nltk/nltk/issues/1829
- """
- # Input.
- forwards = [
- '0-0 1-2',
- '0-0 1-1',
- '0-0 2-1 3-2 4-3 5-4 6-5 7-6 8-7 7-8 9-9 10-10 9-11 11-12 12-13 13-14',
- '0-0 1-1 1-2 2-3 3-4 4-5 4-6 5-7 6-8 8-9 9-10',
- '0-0 14-1 15-2 16-3 20-5 21-6 22-7 5-8 6-9 7-10 8-11 9-12 10-13 11-14 12-15 13-16 14-17 17-18 18-19 19-20 20-21 23-22 24-23 25-24 26-25 27-27 28-28 29-29 30-30 31-31',
- '0-0 1-1 0-2 2-3',
- '0-0 2-2 4-4',
- '0-0 1-1 2-3 3-4 5-5 7-6 8-7 9-8 10-9 11-10 12-11 13-12 14-13 15-14 16-16 17-17 18-18 19-19 20-20',
- '3-0 4-1 6-2 5-3 6-4 7-5 8-6 9-7 10-8 11-9 16-10 9-12 10-13 12-14',
- '1-0',
- ]
- backwards = [
- '0-0 1-2',
- '0-0 1-1',
- '0-0 2-1 3-2 4-3 5-4 6-5 7-6 8-7 9-8 10-10 11-12 12-11 13-13',
- '0-0 1-2 2-3 3-4 4-6 6-8 7-5 8-7 9-8',
- '0-0 1-8 2-9 3-10 4-11 5-12 6-11 8-13 9-14 10-15 11-16 12-17 13-18 14-19 15-20 16-21 17-22 18-23 19-24 20-29 21-30 22-31 23-2 24-3 25-4 26-5 27-5 28-6 29-7 30-28 31-31',
- '0-0 1-1 2-3',
- '0-0 1-1 2-3 4-4',
- '0-0 1-1 2-3 3-4 5-5 7-6 8-7 9-8 10-9 11-10 12-11 13-12 14-13 15-14 16-16 17-17 18-18 19-19 20-16 21-18',
- '0-0 1-1 3-2 4-1 5-3 6-4 7-5 8-6 9-7 10-8 11-9 12-8 13-9 14-8 15-9 16-10',
- '1-0',
- ]
- source_lens = [2, 3, 3, 15, 11, 33, 4, 6, 23, 18]
- target_lens = [2, 4, 3, 16, 12, 33, 5, 6, 22, 16]
- # Expected Output.
- expected = [
- [(0, 0), (1, 2)],
- [(0, 0), (1, 1)],
- [
- (0, 0),
- (2, 1),
- (3, 2),
- (4, 3),
- (5, 4),
- (6, 5),
- (7, 6),
- (8, 7),
- (10, 10),
- (11, 12),
- ],
- [
- (0, 0),
- (1, 1),
- (1, 2),
- (2, 3),
- (3, 4),
- (4, 5),
- (4, 6),
- (5, 7),
- (6, 8),
- (7, 5),
- (8, 7),
- (8, 9),
- (9, 8),
- (9, 10),
- ],
- [
- (0, 0),
- (1, 8),
- (2, 9),
- (3, 10),
- (4, 11),
- (5, 8),
- (6, 9),
- (6, 11),
- (7, 10),
- (8, 11),
- (31, 31),
- ],
- [(0, 0), (0, 2), (1, 1), (2, 3)],
- [(0, 0), (1, 1), (2, 2), (2, 3), (4, 4)],
- [
- (0, 0),
- (1, 1),
- (2, 3),
- (3, 4),
- (5, 5),
- (7, 6),
- (8, 7),
- (9, 8),
- (10, 9),
- (11, 10),
- (12, 11),
- (13, 12),
- (14, 13),
- (15, 14),
- (16, 16),
- (17, 17),
- (18, 18),
- (19, 19),
- ],
- [
- (0, 0),
- (1, 1),
- (3, 0),
- (3, 2),
- (4, 1),
- (5, 3),
- (6, 2),
- (6, 4),
- (7, 5),
- (8, 6),
- (9, 7),
- (9, 12),
- (10, 8),
- (10, 13),
- (11, 9),
- (12, 8),
- (12, 14),
- (13, 9),
- (14, 8),
- (15, 9),
- (16, 10),
- ],
- [(1, 0)],
- [
- (0, 0),
- (1, 1),
- (3, 2),
- (4, 3),
- (5, 4),
- (6, 5),
- (7, 6),
- (9, 10),
- (10, 12),
- (11, 13),
- (12, 14),
- (13, 15),
- ],
- ]
-
- # Iterate through all 10 examples and check for expected outputs.
- for fw, bw, src_len, trg_len, expect in zip(
- forwards, backwards, source_lens, target_lens, expected
- ):
- self.assertListEqual(expect, grow_diag_final_and(src_len, trg_len, fw, bw))
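-        # For reference: each alignment string links source index i to
-        # target index j as 'i-j', and grow_diag_final_and symmetrizes the
-        # forward and backward directions; in the first example both
-        # directions already agree on {0-0, 1-2}, so the output is simply
-        # [(0, 0), (1, 2)].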
+++ /dev/null
-# -*- coding: utf-8 -*-
-"""
-Tests for IBM Model 1 training methods
-"""
-
-import unittest
-
-from collections import defaultdict
-from nltk.translate import AlignedSent
-from nltk.translate import IBMModel
-from nltk.translate import IBMModel1
-from nltk.translate.ibm_model import AlignmentInfo
-
-
-class TestIBMModel1(unittest.TestCase):
- def test_set_uniform_translation_probabilities(self):
- # arrange
- corpus = [
- AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
- AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
- ]
- model1 = IBMModel1(corpus, 0)
-
- # act
- model1.set_uniform_probabilities(corpus)
-
- # assert
- # expected_prob = 1.0 / (target vocab size + 1)
- self.assertEqual(model1.translation_table['ham']['eier'], 1.0 / 3)
- self.assertEqual(model1.translation_table['eggs'][None], 1.0 / 3)
-
- def test_set_uniform_translation_probabilities_of_non_domain_values(self):
- # arrange
- corpus = [
- AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
- AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
- ]
- model1 = IBMModel1(corpus, 0)
-
- # act
- model1.set_uniform_probabilities(corpus)
-
- # assert
- # examine target words that are not in the training data domain
- self.assertEqual(model1.translation_table['parrot']['eier'], IBMModel.MIN_PROB)
-
- def test_prob_t_a_given_s(self):
- # arrange
- src_sentence = ["ich", 'esse', 'ja', 'gern', 'räucherschinken']
- trg_sentence = ['i', 'love', 'to', 'eat', 'smoked', 'ham']
- corpus = [AlignedSent(trg_sentence, src_sentence)]
- alignment_info = AlignmentInfo(
- (0, 1, 4, 0, 2, 5, 5),
- [None] + src_sentence,
- ['UNUSED'] + trg_sentence,
- None,
- )
-
- translation_table = defaultdict(lambda: defaultdict(float))
- translation_table['i']['ich'] = 0.98
- translation_table['love']['gern'] = 0.98
- translation_table['to'][None] = 0.98
- translation_table['eat']['esse'] = 0.98
- translation_table['smoked']['räucherschinken'] = 0.98
- translation_table['ham']['räucherschinken'] = 0.98
-
- model1 = IBMModel1(corpus, 0)
- model1.translation_table = translation_table
-
- # act
- probability = model1.prob_t_a_given_s(alignment_info)
-
- # assert
- lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98
- expected_probability = lexical_translation
- self.assertEqual(round(probability, 4), round(expected_probability, 4))
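-        # For reference: the alignment tuple (0, 1, 4, 0, 2, 5, 5) maps each
-        # target position j (position 0 being the 'UNUSED' placeholder) to a
-        # source position, with source position 0 standing for NULL: i->ich,
-        # love->gern, to->NULL, eat->esse, and smoked and ham both ->
-        # räucherschinken; each link contributes its translation probability,
-        # giving the 0.98 ** 6 product asserted above.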
+++ /dev/null
-# -*- coding: utf-8 -*-
-"""
-Tests for IBM Model 2 training methods
-"""
-
-import unittest
-
-from collections import defaultdict
-from nltk.translate import AlignedSent
-from nltk.translate import IBMModel
-from nltk.translate import IBMModel2
-from nltk.translate.ibm_model import AlignmentInfo
-
-
-class TestIBMModel2(unittest.TestCase):
- def test_set_uniform_alignment_probabilities(self):
- # arrange
- corpus = [
- AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
- AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
- ]
- model2 = IBMModel2(corpus, 0)
-
- # act
- model2.set_uniform_probabilities(corpus)
-
- # assert
- # expected_prob = 1.0 / (length of source sentence + 1)
- self.assertEqual(model2.alignment_table[0][1][3][2], 1.0 / 4)
- self.assertEqual(model2.alignment_table[2][4][2][4], 1.0 / 3)
-
- def test_set_uniform_alignment_probabilities_of_non_domain_values(self):
- # arrange
- corpus = [
- AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
- AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
- ]
- model2 = IBMModel2(corpus, 0)
-
- # act
- model2.set_uniform_probabilities(corpus)
-
- # assert
- # examine i and j values that are not in the training data domain
- self.assertEqual(model2.alignment_table[99][1][3][2], IBMModel.MIN_PROB)
- self.assertEqual(model2.alignment_table[2][99][2][4], IBMModel.MIN_PROB)
-
- def test_prob_t_a_given_s(self):
- # arrange
- src_sentence = ["ich", 'esse', 'ja', 'gern', 'räucherschinken']
- trg_sentence = ['i', 'love', 'to', 'eat', 'smoked', 'ham']
- corpus = [AlignedSent(trg_sentence, src_sentence)]
- alignment_info = AlignmentInfo(
- (0, 1, 4, 0, 2, 5, 5),
- [None] + src_sentence,
- ['UNUSED'] + trg_sentence,
- None,
- )
-
- translation_table = defaultdict(lambda: defaultdict(float))
- translation_table['i']['ich'] = 0.98
- translation_table['love']['gern'] = 0.98
- translation_table['to'][None] = 0.98
- translation_table['eat']['esse'] = 0.98
- translation_table['smoked']['räucherschinken'] = 0.98
- translation_table['ham']['räucherschinken'] = 0.98
-
- alignment_table = defaultdict(
- lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
- )
- alignment_table[0][3][5][6] = 0.97 # None -> to
- alignment_table[1][1][5][6] = 0.97 # ich -> i
- alignment_table[2][4][5][6] = 0.97 # esse -> eat
- alignment_table[4][2][5][6] = 0.97 # gern -> love
- alignment_table[5][5][5][6] = 0.96 # räucherschinken -> smoked
- alignment_table[5][6][5][6] = 0.96 # räucherschinken -> ham
-
- model2 = IBMModel2(corpus, 0)
- model2.translation_table = translation_table
- model2.alignment_table = alignment_table
-
- # act
- probability = model2.prob_t_a_given_s(alignment_info)
-
- # assert
- lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98
- alignment = 0.97 * 0.97 * 0.97 * 0.97 * 0.96 * 0.96
- expected_probability = lexical_translation * alignment
- self.assertEqual(round(probability, 4), round(expected_probability, 4))
+++ /dev/null
-# -*- coding: utf-8 -*-
-"""
-Tests for IBM Model 3 training methods
-"""
-
-import unittest
-
-from collections import defaultdict
-from nltk.translate import AlignedSent
-from nltk.translate import IBMModel
-from nltk.translate import IBMModel3
-from nltk.translate.ibm_model import AlignmentInfo
-
-
-class TestIBMModel3(unittest.TestCase):
- def test_set_uniform_distortion_probabilities(self):
- # arrange
- corpus = [
- AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
- AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
- ]
- model3 = IBMModel3(corpus, 0)
-
- # act
- model3.set_uniform_probabilities(corpus)
-
- # assert
- # expected_prob = 1.0 / length of target sentence
- self.assertEqual(model3.distortion_table[1][0][3][2], 1.0 / 2)
- self.assertEqual(model3.distortion_table[4][2][2][4], 1.0 / 4)
-
- def test_set_uniform_distortion_probabilities_of_non_domain_values(self):
- # arrange
- corpus = [
- AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
- AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
- ]
- model3 = IBMModel3(corpus, 0)
-
- # act
- model3.set_uniform_probabilities(corpus)
-
- # assert
- # examine i and j values that are not in the training data domain
- self.assertEqual(model3.distortion_table[0][0][3][2], IBMModel.MIN_PROB)
- self.assertEqual(model3.distortion_table[9][2][2][4], IBMModel.MIN_PROB)
- self.assertEqual(model3.distortion_table[2][9][2][4], IBMModel.MIN_PROB)
-
- def test_prob_t_a_given_s(self):
- # arrange
- src_sentence = ["ich", 'esse', 'ja', 'gern', 'räucherschinken']
- trg_sentence = ['i', 'love', 'to', 'eat', 'smoked', 'ham']
- corpus = [AlignedSent(trg_sentence, src_sentence)]
- alignment_info = AlignmentInfo(
- (0, 1, 4, 0, 2, 5, 5),
- [None] + src_sentence,
- ['UNUSED'] + trg_sentence,
- [[3], [1], [4], [], [2], [5, 6]],
- )
-
- distortion_table = defaultdict(
- lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
- )
- distortion_table[1][1][5][6] = 0.97 # i -> ich
- distortion_table[2][4][5][6] = 0.97 # love -> gern
- distortion_table[3][0][5][6] = 0.97 # to -> NULL
- distortion_table[4][2][5][6] = 0.97 # eat -> esse
- distortion_table[5][5][5][6] = 0.97 # smoked -> räucherschinken
- distortion_table[6][5][5][6] = 0.97 # ham -> räucherschinken
-
- translation_table = defaultdict(lambda: defaultdict(float))
- translation_table['i']['ich'] = 0.98
- translation_table['love']['gern'] = 0.98
- translation_table['to'][None] = 0.98
- translation_table['eat']['esse'] = 0.98
- translation_table['smoked']['räucherschinken'] = 0.98
- translation_table['ham']['räucherschinken'] = 0.98
-
- fertility_table = defaultdict(lambda: defaultdict(float))
- fertility_table[1]['ich'] = 0.99
- fertility_table[1]['esse'] = 0.99
- fertility_table[0]['ja'] = 0.99
- fertility_table[1]['gern'] = 0.99
- fertility_table[2]['räucherschinken'] = 0.999
- fertility_table[1][None] = 0.99
-
- probabilities = {
- 'p1': 0.167,
- 'translation_table': translation_table,
- 'distortion_table': distortion_table,
- 'fertility_table': fertility_table,
- 'alignment_table': None,
- }
-
- model3 = IBMModel3(corpus, 0, probabilities)
-
- # act
- probability = model3.prob_t_a_given_s(alignment_info)
-
- # assert
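- # null generation term: comb(m - phi0, phi0) * p1^phi0 * p0^(m - 2*phi0),
- # with m = 6 target words and phi0 = 1 NULL-aligned word, so comb(5, 1) = 5
- # (assuming the standard IBM Model 3 formulation)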
- null_generation = 5 * pow(0.167, 1) * pow(0.833, 4)
- fertility = 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 2 * 0.999
- lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98
- distortion = 0.97 * 0.97 * 0.97 * 0.97 * 0.97 * 0.97
- expected_probability = (
- null_generation * fertility * lexical_translation * distortion
- )
- self.assertEqual(round(probability, 4), round(expected_probability, 4))
+++ /dev/null
-# -*- coding: utf-8 -*-
-"""
-Tests for IBM Model 4 training methods
-"""
-
-import unittest
-
-from collections import defaultdict
-from nltk.translate import AlignedSent
-from nltk.translate import IBMModel
-from nltk.translate import IBMModel4
-from nltk.translate.ibm_model import AlignmentInfo
-
-
-class TestIBMModel4(unittest.TestCase):
- def test_set_uniform_distortion_probabilities_of_max_displacements(self):
- # arrange
- src_classes = {'schinken': 0, 'eier': 0, 'spam': 1}
- trg_classes = {'ham': 0, 'eggs': 1, 'spam': 2}
- corpus = [
- AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
- AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
- ]
- model4 = IBMModel4(corpus, 0, src_classes, trg_classes)
-
- # act
- model4.set_uniform_probabilities(corpus)
-
- # assert
- # number of displacement values =
- # 2 * (number of words in longest target sentence - 1)
- expected_prob = 1.0 / (2 * (4 - 1))
-
- # examine the boundary values for (displacement, src_class, trg_class)
- self.assertEqual(model4.head_distortion_table[3][0][0], expected_prob)
- self.assertEqual(model4.head_distortion_table[-3][1][2], expected_prob)
- self.assertEqual(model4.non_head_distortion_table[3][0], expected_prob)
- self.assertEqual(model4.non_head_distortion_table[-3][2], expected_prob)
-
- def test_set_uniform_distortion_probabilities_of_non_domain_values(self):
- # arrange
- src_classes = {'schinken': 0, 'eier': 0, 'spam': 1}
- trg_classes = {'ham': 0, 'eggs': 1, 'spam': 2}
- corpus = [
- AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
- AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
- ]
- model4 = IBMModel4(corpus, 0, src_classes, trg_classes)
-
- # act
- model4.set_uniform_probabilities(corpus)
-
- # assert
- # examine displacement values that are not in the training data domain
- self.assertEqual(model4.head_distortion_table[4][0][0], IBMModel.MIN_PROB)
- self.assertEqual(model4.head_distortion_table[100][1][2], IBMModel.MIN_PROB)
- self.assertEqual(model4.non_head_distortion_table[4][0], IBMModel.MIN_PROB)
- self.assertEqual(model4.non_head_distortion_table[100][2], IBMModel.MIN_PROB)
-
- def test_prob_t_a_given_s(self):
- # arrange
- src_sentence = ["ich", 'esse', 'ja', 'gern', 'räucherschinken']
- trg_sentence = ['i', 'love', 'to', 'eat', 'smoked', 'ham']
- src_classes = {'räucherschinken': 0, 'ja': 1, 'ich': 2, 'esse': 3, 'gern': 4}
- trg_classes = {'ham': 0, 'smoked': 1, 'i': 3, 'love': 4, 'to': 2, 'eat': 4}
- corpus = [AlignedSent(trg_sentence, src_sentence)]
- alignment_info = AlignmentInfo(
- (0, 1, 4, 0, 2, 5, 5),
- [None] + src_sentence,
- ['UNUSED'] + trg_sentence,
- [[3], [1], [4], [], [2], [5, 6]],
- )
-
- head_distortion_table = defaultdict(
- lambda: defaultdict(lambda: defaultdict(float))
- )
- head_distortion_table[1][None][3] = 0.97 # None, i
- head_distortion_table[3][2][4] = 0.97 # ich, eat
- head_distortion_table[-2][3][4] = 0.97 # esse, love
- head_distortion_table[3][4][1] = 0.97 # gern, smoked
-
- non_head_distortion_table = defaultdict(lambda: defaultdict(float))
- non_head_distortion_table[1][0] = 0.96 # ham
-
- translation_table = defaultdict(lambda: defaultdict(float))
- translation_table['i']['ich'] = 0.98
- translation_table['love']['gern'] = 0.98
- translation_table['to'][None] = 0.98
- translation_table['eat']['esse'] = 0.98
- translation_table['smoked']['räucherschinken'] = 0.98
- translation_table['ham']['räucherschinken'] = 0.98
-
- fertility_table = defaultdict(lambda: defaultdict(float))
- fertility_table[1]['ich'] = 0.99
- fertility_table[1]['esse'] = 0.99
- fertility_table[0]['ja'] = 0.99
- fertility_table[1]['gern'] = 0.99
- fertility_table[2]['räucherschinken'] = 0.999
- fertility_table[1][None] = 0.99
-
- probabilities = {
- 'p1': 0.167,
- 'translation_table': translation_table,
- 'head_distortion_table': head_distortion_table,
- 'non_head_distortion_table': non_head_distortion_table,
- 'fertility_table': fertility_table,
- 'alignment_table': None,
- }
-
- model4 = IBMModel4(corpus, 0, src_classes, trg_classes, probabilities)
-
- # act
- probability = model4.prob_t_a_given_s(alignment_info)
-
- # assert
- null_generation = 5 * pow(0.167, 1) * pow(0.833, 4)
- fertility = 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 2 * 0.999
- lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98
- distortion = 0.97 * 0.97 * 1 * 0.97 * 0.97 * 0.96
- expected_probability = (
- null_generation * fertility * lexical_translation * distortion
- )
- self.assertEqual(round(probability, 4), round(expected_probability, 4))
+++ /dev/null
-# -*- coding: utf-8 -*-
-"""
-Tests for IBM Model 5 training methods
-"""
-
-import unittest
-
-from collections import defaultdict
-from nltk.translate import AlignedSent
-from nltk.translate import IBMModel
-from nltk.translate import IBMModel4
-from nltk.translate import IBMModel5
-from nltk.translate.ibm_model import AlignmentInfo
-
-
-class TestIBMModel5(unittest.TestCase):
- def test_set_uniform_vacancy_probabilities_of_max_displacements(self):
- # arrange
- src_classes = {'schinken': 0, 'eier': 0, 'spam': 1}
- trg_classes = {'ham': 0, 'eggs': 1, 'spam': 2}
- corpus = [
- AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
- AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
- ]
- model5 = IBMModel5(corpus, 0, src_classes, trg_classes)
-
- # act
- model5.set_uniform_probabilities(corpus)
-
- # assert
- # number of vacancy difference values =
- # 2 * number of words in longest target sentence
- expected_prob = 1.0 / (2 * 4)
-
- # examine the boundary values for (dv, max_v, trg_class)
- self.assertEqual(model5.head_vacancy_table[4][4][0], expected_prob)
- self.assertEqual(model5.head_vacancy_table[-3][1][2], expected_prob)
- self.assertEqual(model5.non_head_vacancy_table[4][4][0], expected_prob)
- self.assertEqual(model5.non_head_vacancy_table[-3][1][2], expected_prob)
-
- def test_set_uniform_vacancy_probabilities_of_non_domain_values(self):
- # arrange
- src_classes = {'schinken': 0, 'eier': 0, 'spam': 1}
- trg_classes = {'ham': 0, 'eggs': 1, 'spam': 2}
- corpus = [
- AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
- AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
- ]
- model5 = IBMModel5(corpus, 0, src_classes, trg_classes)
-
- # act
- model5.set_uniform_probabilities(corpus)
-
- # assert
- # examine dv and max_v values that are not in the training data domain
- self.assertEqual(model5.head_vacancy_table[5][4][0], IBMModel.MIN_PROB)
- self.assertEqual(model5.head_vacancy_table[-4][1][2], IBMModel.MIN_PROB)
- self.assertEqual(model5.head_vacancy_table[4][0][0], IBMModel.MIN_PROB)
- self.assertEqual(model5.non_head_vacancy_table[5][4][0], IBMModel.MIN_PROB)
- self.assertEqual(model5.non_head_vacancy_table[-4][1][2], IBMModel.MIN_PROB)
-
- def test_prob_t_a_given_s(self):
- # arrange
- src_sentence = ["ich", 'esse', 'ja', 'gern', 'räucherschinken']
- trg_sentence = ['i', 'love', 'to', 'eat', 'smoked', 'ham']
- src_classes = {'räucherschinken': 0, 'ja': 1, 'ich': 2, 'esse': 3, 'gern': 4}
- trg_classes = {'ham': 0, 'smoked': 1, 'i': 3, 'love': 4, 'to': 2, 'eat': 4}
- corpus = [AlignedSent(trg_sentence, src_sentence)]
- alignment_info = AlignmentInfo(
- (0, 1, 4, 0, 2, 5, 5),
- [None] + src_sentence,
- ['UNUSED'] + trg_sentence,
- [[3], [1], [4], [], [2], [5, 6]],
- )
-
- head_vacancy_table = defaultdict(
- lambda: defaultdict(lambda: defaultdict(float))
- )
- head_vacancy_table[1 - 0][6][3] = 0.97 # ich -> i
- head_vacancy_table[3 - 0][5][4] = 0.97 # esse -> eat
- head_vacancy_table[1 - 2][4][4] = 0.97 # gern -> love
- head_vacancy_table[2 - 0][2][1] = 0.97 # räucherschinken -> smoked
-
- non_head_vacancy_table = defaultdict(
- lambda: defaultdict(lambda: defaultdict(float))
- )
- non_head_vacancy_table[1 - 0][1][0] = 0.96 # räucherschinken -> ham
-
- translation_table = defaultdict(lambda: defaultdict(float))
- translation_table['i']['ich'] = 0.98
- translation_table['love']['gern'] = 0.98
- translation_table['to'][None] = 0.98
- translation_table['eat']['esse'] = 0.98
- translation_table['smoked']['räucherschinken'] = 0.98
- translation_table['ham']['räucherschinken'] = 0.98
-
- fertility_table = defaultdict(lambda: defaultdict(float))
- fertility_table[1]['ich'] = 0.99
- fertility_table[1]['esse'] = 0.99
- fertility_table[0]['ja'] = 0.99
- fertility_table[1]['gern'] = 0.99
- fertility_table[2]['räucherschinken'] = 0.999
- fertility_table[1][None] = 0.99
-
- probabilities = {
- 'p1': 0.167,
- 'translation_table': translation_table,
- 'fertility_table': fertility_table,
- 'head_vacancy_table': head_vacancy_table,
- 'non_head_vacancy_table': non_head_vacancy_table,
- 'head_distortion_table': None,
- 'non_head_distortion_table': None,
- 'alignment_table': None,
- }
-
- model5 = IBMModel5(corpus, 0, src_classes, trg_classes, probabilities)
-
- # act
- probability = model5.prob_t_a_given_s(alignment_info)
-
- # assert
- null_generation = 5 * pow(0.167, 1) * pow(0.833, 4)
- fertility = 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 2 * 0.999
- lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98
- vacancy = 0.97 * 0.97 * 1 * 0.97 * 0.97 * 0.96
- expected_probability = (
- null_generation * fertility * lexical_translation * vacancy
- )
- self.assertEqual(round(probability, 4), round(expected_probability, 4))
-
- def test_prune(self):
- # arrange
- alignment_infos = [
- AlignmentInfo((1, 1), None, None, None),
- AlignmentInfo((1, 2), None, None, None),
- AlignmentInfo((2, 1), None, None, None),
- AlignmentInfo((2, 2), None, None, None),
- AlignmentInfo((0, 0), None, None, None),
- ]
- min_factor = IBMModel5.MIN_SCORE_FACTOR
- best_score = 0.9
- scores = {
- (1, 1): min(min_factor * 1.5, 1) * best_score, # above threshold
- (1, 2): best_score,
- (2, 1): min_factor * best_score, # at threshold
- (2, 2): min_factor * best_score * 0.5, # low score
- (0, 0): min(min_factor * 1.1, 1) * 1.2, # above threshold
- }
- corpus = [AlignedSent(['a'], ['b'])]
- original_prob_function = IBMModel4.model4_prob_t_a_given_s
- # mock static method
- IBMModel4.model4_prob_t_a_given_s = staticmethod(
- lambda a, model: scores[a.alignment]
- )
- model5 = IBMModel5(corpus, 0, None, None)
-
- # act
- pruned_alignments = model5.prune(alignment_infos)
-
- # assert
- self.assertEqual(len(pruned_alignments), 3)
-
- # restore static method
- IBMModel4.model4_prob_t_a_given_s = original_prob_function
+++ /dev/null
-# -*- coding: utf-8 -*-
-"""
-Tests for common methods of IBM translation models
-"""
-
-import unittest
-
-from collections import defaultdict
-from nltk.translate import AlignedSent
-from nltk.translate import IBMModel
-from nltk.translate.ibm_model import AlignmentInfo
-
-
-class TestIBMModel(unittest.TestCase):
- __TEST_SRC_SENTENCE = ["j'", 'aime', 'bien', 'jambon']
- __TEST_TRG_SENTENCE = ['i', 'love', 'ham']
-
- def test_vocabularies_are_initialized(self):
- parallel_corpora = [
- AlignedSent(['one', 'two', 'three', 'four'], ['un', 'deux', 'trois']),
- AlignedSent(['five', 'one', 'six'], ['quatre', 'cinq', 'six']),
- AlignedSent([], ['sept']),
- ]
-
- ibm_model = IBMModel(parallel_corpora)
- self.assertEqual(len(ibm_model.src_vocab), 8)
- self.assertEqual(len(ibm_model.trg_vocab), 6)
-
- def test_vocabularies_are_initialized_even_with_empty_corpora(self):
- parallel_corpora = []
-
- ibm_model = IBMModel(parallel_corpora)
- self.assertEqual(len(ibm_model.src_vocab), 1) # addition of NULL token
- self.assertEqual(len(ibm_model.trg_vocab), 0)
-
- def test_best_model2_alignment(self):
- # arrange
- sentence_pair = AlignedSent(
- TestIBMModel.__TEST_TRG_SENTENCE, TestIBMModel.__TEST_SRC_SENTENCE
- )
- # None and 'bien' have zero fertility
- translation_table = {
- 'i': {"j'": 0.9, 'aime': 0.05, 'bien': 0.02, 'jambon': 0.03, None: 0},
- 'love': {"j'": 0.05, 'aime': 0.9, 'bien': 0.01, 'jambon': 0.01, None: 0.03},
- 'ham': {"j'": 0, 'aime': 0.01, 'bien': 0, 'jambon': 0.99, None: 0},
- }
- alignment_table = defaultdict(
- lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.2)))
- )
-
- ibm_model = IBMModel([])
- ibm_model.translation_table = translation_table
- ibm_model.alignment_table = alignment_table
-
- # act
- a_info = ibm_model.best_model2_alignment(sentence_pair)
-
- # assert
- self.assertEqual(a_info.alignment[1:], (1, 2, 4)) # 0th element unused
- self.assertEqual(a_info.cepts, [[], [1], [2], [], [3]])
-
- def test_best_model2_alignment_does_not_change_pegged_alignment(self):
- # arrange
- sentence_pair = AlignedSent(
- TestIBMModel.__TEST_TRG_SENTENCE, TestIBMModel.__TEST_SRC_SENTENCE
- )
- translation_table = {
- 'i': {"j'": 0.9, 'aime': 0.05, 'bien': 0.02, 'jambon': 0.03, None: 0},
- 'love': {"j'": 0.05, 'aime': 0.9, 'bien': 0.01, 'jambon': 0.01, None: 0.03},
- 'ham': {"j'": 0, 'aime': 0.01, 'bien': 0, 'jambon': 0.99, None: 0},
- }
- alignment_table = defaultdict(
- lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.2)))
- )
-
- ibm_model = IBMModel([])
- ibm_model.translation_table = translation_table
- ibm_model.alignment_table = alignment_table
-
- # act: force 'love' to be pegged to 'jambon'
- a_info = ibm_model.best_model2_alignment(sentence_pair, 2, 4)
- # assert
- self.assertEqual(a_info.alignment[1:], (1, 4, 4))
- self.assertEqual(a_info.cepts, [[], [1], [], [], [2, 3]])
-
- def test_best_model2_alignment_handles_fertile_words(self):
- # arrange
- sentence_pair = AlignedSent(
- ['i', 'really', ',', 'really', 'love', 'ham'],
- TestIBMModel.__TEST_SRC_SENTENCE,
- )
- # 'bien' produces 2 target words: 'really' and another 'really'
- translation_table = {
- 'i': {"j'": 0.9, 'aime': 0.05, 'bien': 0.02, 'jambon': 0.03, None: 0},
- 'really': {"j'": 0, 'aime': 0, 'bien': 0.9, 'jambon': 0.01, None: 0.09},
- ',': {"j'": 0, 'aime': 0, 'bien': 0.3, 'jambon': 0, None: 0.7},
- 'love': {"j'": 0.05, 'aime': 0.9, 'bien': 0.01, 'jambon': 0.01, None: 0.03},
- 'ham': {"j'": 0, 'aime': 0.01, 'bien': 0, 'jambon': 0.99, None: 0},
- }
- alignment_table = defaultdict(
- lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.2)))
- )
-
- ibm_model = IBMModel([])
- ibm_model.translation_table = translation_table
- ibm_model.alignment_table = alignment_table
-
- # act
- a_info = ibm_model.best_model2_alignment(sentence_pair)
-
- # assert
- self.assertEqual(a_info.alignment[1:], (1, 3, 0, 3, 2, 4))
- self.assertEqual(a_info.cepts, [[3], [1], [5], [2, 4], [6]])
-
- def test_best_model2_alignment_handles_empty_src_sentence(self):
- # arrange
- sentence_pair = AlignedSent(TestIBMModel.__TEST_TRG_SENTENCE, [])
- ibm_model = IBMModel([])
-
- # act
- a_info = ibm_model.best_model2_alignment(sentence_pair)
-
- # assert
- self.assertEqual(a_info.alignment[1:], (0, 0, 0))
- self.assertEqual(a_info.cepts, [[1, 2, 3]])
-
- def test_best_model2_alignment_handles_empty_trg_sentence(self):
- # arrange
- sentence_pair = AlignedSent([], TestIBMModel.__TEST_SRC_SENTENCE)
- ibm_model = IBMModel([])
-
- # act
- a_info = ibm_model.best_model2_alignment(sentence_pair)
-
- # assert
- self.assertEqual(a_info.alignment[1:], ())
- self.assertEqual(a_info.cepts, [[], [], [], [], []])
-
- def test_neighboring_finds_neighbor_alignments(self):
- # arrange
- a_info = AlignmentInfo(
- (0, 3, 2),
- (None, 'des', 'œufs', 'verts'),
- ('UNUSED', 'green', 'eggs'),
- [[], [], [2], [1]],
- )
- ibm_model = IBMModel([])
-
- # act
- neighbors = ibm_model.neighboring(a_info)
-
- # assert
- neighbor_alignments = set()
- for neighbor in neighbors:
- neighbor_alignments.add(neighbor.alignment)
- expected_alignments = set(
- [
- # moves
- (0, 0, 2),
- (0, 1, 2),
- (0, 2, 2),
- (0, 3, 0),
- (0, 3, 1),
- (0, 3, 3),
- # swaps
- (0, 2, 3),
- # original alignment
- (0, 3, 2),
- ]
- )
- self.assertEqual(neighbor_alignments, expected_alignments)
-
- def test_neighboring_sets_neighbor_alignment_info(self):
- # arrange
- a_info = AlignmentInfo(
- (0, 3, 2),
- (None, 'des', 'œufs', 'verts'),
- ('UNUSED', 'green', 'eggs'),
- [[], [], [2], [1]],
- )
- ibm_model = IBMModel([])
-
- # act
- neighbors = ibm_model.neighboring(a_info)
-
- # assert: select a few particular alignments
- for neighbor in neighbors:
- if neighbor.alignment == (0, 2, 2):
- moved_alignment = neighbor
- elif neighbor.alignment == (0, 3, 2):
- swapped_alignment = neighbor
-
- self.assertEqual(moved_alignment.cepts, [[], [], [1, 2], []])
- self.assertEqual(swapped_alignment.cepts, [[], [], [2], [1]])
-
- def test_neighboring_returns_neighbors_with_pegged_alignment(self):
- # arrange
- a_info = AlignmentInfo(
- (0, 3, 2),
- (None, 'des', 'œufs', 'verts'),
- ('UNUSED', 'green', 'eggs'),
- [[], [], [2], [1]],
- )
- ibm_model = IBMModel([])
-
- # act: peg 'eggs' to align with 'œufs'
- neighbors = ibm_model.neighboring(a_info, 2)
-
- # assert
- neighbor_alignments = set()
- for neighbor in neighbors:
- neighbor_alignments.add(neighbor.alignment)
- expected_alignments = set(
- [
- # moves
- (0, 0, 2),
- (0, 1, 2),
- (0, 2, 2),
- # no swaps
- # original alignment
- (0, 3, 2),
- ]
- )
- self.assertEqual(neighbor_alignments, expected_alignments)
-
- def test_hillclimb(self):
- # arrange
- initial_alignment = AlignmentInfo((0, 3, 2), None, None, None)
-
- def neighboring_mock(a, j):
- if a.alignment == (0, 3, 2):
- return set(
- [
- AlignmentInfo((0, 2, 2), None, None, None),
- AlignmentInfo((0, 1, 1), None, None, None),
- ]
- )
- elif a.alignment == (0, 2, 2):
- return set(
- [
- AlignmentInfo((0, 3, 3), None, None, None),
- AlignmentInfo((0, 4, 4), None, None, None),
- ]
- )
- return set()
-
- def prob_t_a_given_s_mock(a):
- prob_values = {
- (0, 3, 2): 0.5,
- (0, 2, 2): 0.6,
- (0, 1, 1): 0.4,
- (0, 3, 3): 0.6,
- (0, 4, 4): 0.7,
- }
- return prob_values.get(a.alignment, 0.01)
-
- ibm_model = IBMModel([])
- ibm_model.neighboring = neighboring_mock
- ibm_model.prob_t_a_given_s = prob_t_a_given_s_mock
-
- # act
- best_alignment = ibm_model.hillclimb(initial_alignment)
-
- # assert: hill climbing goes from (0, 3, 2) -> (0, 2, 2) -> (0, 4, 4)
- self.assertEqual(best_alignment.alignment, (0, 4, 4))
-
- def test_sample(self):
- # arrange
- sentence_pair = AlignedSent(
- TestIBMModel.__TEST_TRG_SENTENCE, TestIBMModel.__TEST_SRC_SENTENCE
- )
- ibm_model = IBMModel([])
- ibm_model.prob_t_a_given_s = lambda x: 0.001
-
- # act
- samples, best_alignment = ibm_model.sample(sentence_pair)
-
- # assert
- self.assertEqual(len(samples), 61)
+++ /dev/null
-# -*- coding: utf-8 -*-
-"""
-Tests for NIST translation evaluation metric
-"""
-
-import io
-import unittest
-
-from nltk.data import find
-from nltk.translate.nist_score import sentence_nist, corpus_nist
-
-
-class TestNIST(unittest.TestCase):
- def test_sentence_nist(self):
- ref_file = find('models/wmt15_eval/ref.ru')
- hyp_file = find('models/wmt15_eval/google.ru')
- mteval_output_file = find('models/wmt15_eval/mteval-13a.output')
-
- # Reads the NIST scores from the `mteval-13a.output` file.
- # The order of the list corresponds to the order of the ngrams.
- with open(mteval_output_file, 'r') as mteval_fin:
- # The scores are located on the fourth line from the end of the file.
- # The first and second items in the list are the score and system names.
- mteval_nist_scores = map(float, mteval_fin.readlines()[-4].split()[1:-1])
-
- with io.open(ref_file, 'r', encoding='utf8') as ref_fin:
- with io.open(hyp_file, 'r', encoding='utf8') as hyp_fin:
- # Whitespace tokenize the file.
- # Note: split() automatically strips surrounding whitespace.
- hypotheses = list(map(lambda x: x.split(), hyp_fin))
- # Note that the corpus_nist input is a list of lists of references.
- references = list(map(lambda x: [x.split()], ref_fin))
- # Without smoothing.
- for i, mteval_nist in zip(range(1, 10), mteval_nist_scores):
- nltk_nist = corpus_nist(references, hypotheses, i)
- # Check that the NIST score difference is less than 0.05
- assert abs(mteval_nist - nltk_nist) < 0.05
+++ /dev/null
-# -*- coding: utf-8 -*-
-# Natural Language Toolkit: Stack decoder
-#
-# Copyright (C) 2001-2020 NLTK Project
-# Author: Tah Wei Hoon <hoon.tw@gmail.com>
-# URL: <http://nltk.org/>
-# For license information, see LICENSE.TXT
-
-"""
-Tests for stack decoder
-"""
-
-import unittest
-from collections import defaultdict
-from math import log
-from nltk.translate import PhraseTable
-from nltk.translate import StackDecoder
-from nltk.translate.stack_decoder import _Hypothesis, _Stack
-
-
-class TestStackDecoder(unittest.TestCase):
- def test_find_all_src_phrases(self):
- # arrange
- phrase_table = TestStackDecoder.create_fake_phrase_table()
- stack_decoder = StackDecoder(phrase_table, None)
- sentence = ('my', 'hovercraft', 'is', 'full', 'of', 'eels')
-
- # act
- src_phrase_spans = stack_decoder.find_all_src_phrases(sentence)
-
- # assert
- self.assertEqual(src_phrase_spans[0], [2]) # 'my hovercraft'
- self.assertEqual(src_phrase_spans[1], [2]) # 'hovercraft'
- self.assertEqual(src_phrase_spans[2], [3]) # 'is'
- self.assertEqual(src_phrase_spans[3], [5, 6]) # 'full of', 'full of eels'
- self.assertFalse(src_phrase_spans[4]) # no entry starting with 'of'
- self.assertEqual(src_phrase_spans[5], [6]) # 'eels'
-
- def test_distortion_score(self):
- # arrange
- stack_decoder = StackDecoder(None, None)
- stack_decoder.distortion_factor = 0.5
- hypothesis = _Hypothesis()
- hypothesis.src_phrase_span = (3, 5)
-
- # act
- score = stack_decoder.distortion_score(hypothesis, (8, 10))
-
- # assert
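- # the penalty grows with the gap between the end of the previously
- # translated source span (3, 5) and the start of the new span (8, 10),
- # i.e. 8 - 5 = 3 skipped words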
- expected_score = log(stack_decoder.distortion_factor) * (8 - 5)
- self.assertEqual(score, expected_score)
-
- def test_distortion_score_of_first_expansion(self):
- # arrange
- stack_decoder = StackDecoder(None, None)
- stack_decoder.distortion_factor = 0.5
- hypothesis = _Hypothesis()
-
- # act
- score = stack_decoder.distortion_score(hypothesis, (8, 10))
-
- # assert
- # expansion from empty hypothesis always has zero distortion cost
- self.assertEqual(score, 0.0)
-
- def test_compute_future_costs(self):
- # arrange
- phrase_table = TestStackDecoder.create_fake_phrase_table()
- language_model = TestStackDecoder.create_fake_language_model()
- stack_decoder = StackDecoder(phrase_table, language_model)
- sentence = ('my', 'hovercraft', 'is', 'full', 'of', 'eels')
-
- # act
- future_scores = stack_decoder.compute_future_scores(sentence)
-
- # assert
- self.assertEqual(
- future_scores[1][2],
- (
- phrase_table.translations_for(('hovercraft',))[0].log_prob
- + language_model.probability(('hovercraft',))
- ),
- )
- self.assertEqual(
- future_scores[0][2],
- (
- phrase_table.translations_for(('my', 'hovercraft'))[0].log_prob
- + language_model.probability(('my', 'hovercraft'))
- ),
- )
-
- def test_compute_future_costs_for_phrases_not_in_phrase_table(self):
- # arrange
- phrase_table = TestStackDecoder.create_fake_phrase_table()
- language_model = TestStackDecoder.create_fake_language_model()
- stack_decoder = StackDecoder(phrase_table, language_model)
- sentence = ('my', 'hovercraft', 'is', 'full', 'of', 'eels')
-
- # act
- future_scores = stack_decoder.compute_future_scores(sentence)
-
- # assert
- self.assertEqual(
- future_scores[1][3], # 'hovercraft is' is not in phrase table
- future_scores[1][2] + future_scores[2][3],
- ) # backoff
-
- def test_future_score(self):
- # arrange: sentence with 8 words; words 2, 3, 4 already translated
- hypothesis = _Hypothesis()
- hypothesis.untranslated_spans = lambda _: [(0, 2), (5, 8)] # mock
- future_score_table = defaultdict(lambda: defaultdict(float))
- future_score_table[0][2] = 0.4
- future_score_table[5][8] = 0.5
- stack_decoder = StackDecoder(None, None)
-
- # act
- future_score = stack_decoder.future_score(hypothesis, future_score_table, 8)
-
- # assert
- self.assertEqual(future_score, 0.4 + 0.5)
-
- def test_valid_phrases(self):
- # arrange
- hypothesis = _Hypothesis()
- # mock untranslated_spans method
- hypothesis.untranslated_spans = lambda _: [(0, 2), (3, 6)]
- all_phrases_from = [[1, 4], [2], [], [5], [5, 6, 7], [], [7]]
-
- # act
- phrase_spans = StackDecoder.valid_phrases(all_phrases_from, hypothesis)
-
- # assert
- self.assertEqual(phrase_spans, [(0, 1), (1, 2), (3, 5), (4, 5), (4, 6)])
-
- @staticmethod
- def create_fake_phrase_table():
- phrase_table = PhraseTable()
- phrase_table.add(('hovercraft',), ('',), 0.8)
- phrase_table.add(('my', 'hovercraft'), ('', ''), 0.7)
- phrase_table.add(('my', 'cheese'), ('', ''), 0.7)
- phrase_table.add(('is',), ('',), 0.8)
- phrase_table.add(('is',), ('',), 0.5)
- phrase_table.add(('full', 'of'), ('', ''), 0.01)
- phrase_table.add(('full', 'of', 'eels'), ('', '', ''), 0.5)
- phrase_table.add(('full', 'of', 'spam'), ('', ''), 0.5)
- phrase_table.add(('eels',), ('',), 0.5)
- phrase_table.add(('spam',), ('',), 0.5)
- return phrase_table
-
- @staticmethod
- def create_fake_language_model():
- # nltk.model should be used here once it is implemented
- language_prob = defaultdict(lambda: -999.0)
- language_prob[('my',)] = log(0.1)
- language_prob[('hovercraft',)] = log(0.1)
- language_prob[('is',)] = log(0.1)
- language_prob[('full',)] = log(0.1)
- language_prob[('of',)] = log(0.1)
- language_prob[('eels',)] = log(0.1)
- language_prob[('my', 'hovercraft')] = log(0.3)
- language_model = type(
- '', (object,), {'probability': lambda _, phrase: language_prob[phrase]}
- )()
- return language_model
-
-
-class TestHypothesis(unittest.TestCase):
- def setUp(self):
- root = _Hypothesis()
- child = _Hypothesis(
- raw_score=0.5,
- src_phrase_span=(3, 7),
- trg_phrase=('hello', 'world'),
- previous=root,
- )
- grandchild = _Hypothesis(
- raw_score=0.4,
- src_phrase_span=(1, 2),
- trg_phrase=('and', 'goodbye'),
- previous=child,
- )
- self.hypothesis_chain = grandchild
-
- def test_translation_so_far(self):
- # act
- translation = self.hypothesis_chain.translation_so_far()
-
- # assert
- self.assertEqual(translation, ['hello', 'world', 'and', 'goodbye'])
-
- def test_translation_so_far_for_empty_hypothesis(self):
- # arrange
- hypothesis = _Hypothesis()
-
- # act
- translation = hypothesis.translation_so_far()
-
- # assert
- self.assertEqual(translation, [])
-
- def test_total_translated_words(self):
- # act
- total_translated_words = self.hypothesis_chain.total_translated_words()
-
- # assert
- self.assertEqual(total_translated_words, 5)
-
- def test_translated_positions(self):
- # act
- translated_positions = self.hypothesis_chain.translated_positions()
-
- # assert
- translated_positions.sort()
- self.assertEqual(translated_positions, [1, 3, 4, 5, 6])
-
- def test_untranslated_spans(self):
- # act
- untranslated_spans = self.hypothesis_chain.untranslated_spans(10)
-
- # assert
- self.assertEqual(untranslated_spans, [(0, 1), (2, 3), (7, 10)])
-
- def test_untranslated_spans_for_empty_hypothesis(self):
- # arrange
- hypothesis = _Hypothesis()
-
- # act
- untranslated_spans = hypothesis.untranslated_spans(10)
-
- # assert
- self.assertEqual(untranslated_spans, [(0, 10)])
-
-
-class TestStack(unittest.TestCase):
- def test_push_bumps_off_worst_hypothesis_when_stack_is_full(self):
- # arrange
- stack = _Stack(3)
- poor_hypothesis = _Hypothesis(0.01)
-
- # act
- stack.push(_Hypothesis(0.2))
- stack.push(poor_hypothesis)
- stack.push(_Hypothesis(0.1))
- stack.push(_Hypothesis(0.3))
-
- # assert
- self.assertFalse(poor_hypothesis in stack)
-
- def test_push_removes_hypotheses_that_fall_below_beam_threshold(self):
- # arrange
- stack = _Stack(3, 0.5)
- poor_hypothesis = _Hypothesis(0.01)
- worse_hypothesis = _Hypothesis(0.009)
-
- # act
- stack.push(poor_hypothesis)
- stack.push(worse_hypothesis)
- stack.push(_Hypothesis(0.9)) # greatly superior hypothesis
-
- # assert
- self.assertFalse(poor_hypothesis in stack)
- self.assertFalse(worse_hypothesis in stack)
-
- def test_push_does_not_add_hypothesis_that_falls_below_beam_threshold(self):
- # arrange
- stack = _Stack(3, 0.5)
- poor_hypothesis = _Hypothesis(0.01)
-
- # act
- stack.push(_Hypothesis(0.9)) # greatly superior hypothesis
- stack.push(poor_hypothesis)
-
- # assert
- self.assertFalse(poor_hypothesis in stack)
-
- def test_best_returns_the_best_hypothesis(self):
- # arrange
- stack = _Stack(3)
- best_hypothesis = _Hypothesis(0.99)
-
- # act
- stack.push(_Hypothesis(0.0))
- stack.push(best_hypothesis)
- stack.push(_Hypothesis(0.5))
-
- # assert
- self.assertEqual(stack.best(), best_hypothesis)
-
- def test_best_returns_none_when_stack_is_empty(self):
- # arrange
- stack = _Stack(3)
-
- # assert
- self.assertEqual(stack.best(), None)
+++ /dev/null
-# -*- coding: utf-8 -*-
-from unittest import TestCase
-from functools import wraps
-from nose.plugins.skip import SkipTest
-from nltk.util import py26
-
-
-def skip(reason):
- """
- Unconditionally skip a test.
- """
-
- def decorator(test_item):
- is_test_class = isinstance(test_item, type) and issubclass(test_item, TestCase)
-
- if is_test_class and py26():
- # Patch all test_ methods to raise a SkipTest exception.
- # This is necessary for Python 2.6 because its unittest
- # doesn't understand __unittest_skip__.
- for meth_name in (m for m in dir(test_item) if m.startswith('test_')):
- patched_method = skip(reason)(getattr(test_item, meth_name))
- setattr(test_item, meth_name, patched_method)
-
- if not is_test_class:
-
- @wraps(test_item)
- def skip_wrapper(*args, **kwargs):
- raise SkipTest(reason)
-
- skip_wrapper.__name__ = test_item.__name__
- test_item = skip_wrapper
-
- test_item.__unittest_skip__ = True
- test_item.__unittest_skip_why__ = reason
- return test_item
-
- return decorator
-
-
-def skipIf(condition, reason):
- """
- Skip a test if the condition is true.
- """
- if condition:
- return skip(reason)
- return lambda obj: obj
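-
-
-# Example usage (hypothetical test, for illustration only):
-#
-# class UtilTest(TestCase):
-# @skipIf(py26(), 'requires Python 2.7 or later')
-# def test_feature(self):
-# ...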
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-=================
-Utility functions
-=================
-
- >>> from nltk.util import *
- >>> from nltk.tree import Tree
-
- >>> print_string("This is a long string, therefore it should break", 25)
- This is a long string,
- therefore it should break
-
- >>> re_show("[a-z]+", "sdf123")
- {sdf}123
-
- >>> tree = Tree(5,
- ... [Tree(4, [Tree(2, [1, 3])]),
- ... Tree(8, [Tree(6, [7]), 9])])
- >>> for x in breadth_first(tree):
- ... if isinstance(x, int): print(x)
- ... else: print(x.label())
- 5
- 4
- 8
- 2
- 6
- 9
- 1
- 3
- 7
- >>> for x in breadth_first(tree, maxdepth=2):
- ... if isinstance(x, int): print(x)
- ... else: print(x.label())
- 5
- 4
- 8
- 2
- 6
- 9
-
- >>> invert_dict({1: 2})
- defaultdict(<... 'list'>, {2: 1})
-
- >>> invert_dict({1: [3, 4, 5]})
- defaultdict(<... 'list'>, {3: [1], 4: [1], 5: [1]})
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-=================
-WordNet Interface
-=================
-
-WordNet is just another NLTK corpus reader, and can be imported like this:
- >>> from nltk.corpus import wordnet
-
-For more compact code, we recommend:
-
- >>> from nltk.corpus import wordnet as wn
-
------
-Words
------
-
-Look up a word using ``synsets()``; this function has an optional ``pos`` argument
-which lets you constrain the part of speech of the word:
-
- >>> wn.synsets('dog') # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
- [Synset('dog.n.01'), Synset('frump.n.01'), Synset('dog.n.03'), Synset('cad.n.01'),
- Synset('frank.n.02'), Synset('pawl.n.01'), Synset('andiron.n.01'), Synset('chase.v.01')]
- >>> wn.synsets('dog', pos=wn.VERB)
- [Synset('chase.v.01')]
-
-The other parts of speech are ``NOUN``, ``ADJ`` and ``ADV``.
-A synset is identified with a 3-part name of the form: word.pos.nn:
-
- >>> wn.synset('dog.n.01')
- Synset('dog.n.01')
- >>> print(wn.synset('dog.n.01').definition())
- a member of the genus Canis (probably descended from the common wolf) that has been domesticated by man since prehistoric times; occurs in many breeds
- >>> len(wn.synset('dog.n.01').examples())
- 1
- >>> print(wn.synset('dog.n.01').examples()[0])
- the dog barked all night
- >>> wn.synset('dog.n.01').lemmas()
- [Lemma('dog.n.01.dog'), Lemma('dog.n.01.domestic_dog'), Lemma('dog.n.01.Canis_familiaris')]
- >>> [str(lemma.name()) for lemma in wn.synset('dog.n.01').lemmas()]
- ['dog', 'domestic_dog', 'Canis_familiaris']
- >>> wn.lemma('dog.n.01.dog').synset()
- Synset('dog.n.01')
-
-The WordNet corpus reader gives access to the Open Multilingual
-WordNet, using ISO-639 language codes.
-
- >>> sorted(wn.langs()) # doctest: +NORMALIZE_WHITESPACE
- ['als', 'arb', 'bul', 'cat', 'cmn', 'dan', 'ell', 'eng', 'eus', 'fas',
- 'fin', 'fra', 'glg', 'heb', 'hrv', 'ind', 'ita', 'jpn', 'nld', 'nno',
- 'nob', 'pol', 'por', 'qcn', 'slv', 'spa', 'swe', 'tha', 'zsm']
- >>> wn.synsets(b'\xe7\x8a\xac'.decode('utf-8'), lang='jpn')
- [Synset('dog.n.01'), Synset('spy.n.01')]
-
- wn.synset('spy.n.01').lemma_names('jpn') # doctest: +NORMALIZE_WHITESPACE
- ['\u3044\u306c', '\u307e\u308f\u3057\u8005', '\u30b9\u30d1\u30a4', '\u56de\u3057\u8005',
- '\u56de\u8005', '\u5bc6\u5075', '\u5de5\u4f5c\u54e1', '\u5efb\u3057\u8005',
- '\u5efb\u8005', '\u63a2', '\u63a2\u308a', '\u72ac', '\u79d8\u5bc6\u635c\u67fb\u54e1',
- '\u8adc\u5831\u54e1', '\u8adc\u8005', '\u9593\u8005', '\u9593\u8adc', '\u96a0\u5bc6']
-
- >>> wn.synset('dog.n.01').lemma_names('ita')
- ['cane', 'Canis_familiaris']
- >>> wn.lemmas('cane', lang='ita') # doctest: +NORMALIZE_WHITESPACE
- [Lemma('dog.n.01.cane'), Lemma('cramp.n.02.cane'), Lemma('hammer.n.01.cane'), Lemma('bad_person.n.01.cane'),
- Lemma('incompetent.n.01.cane')]
- >>> sorted(wn.synset('dog.n.01').lemmas('dan')) # doctest: +NORMALIZE_WHITESPACE
- [Lemma('dog.n.01.hund'), Lemma('dog.n.01.k\xf8ter'),
- Lemma('dog.n.01.vovhund'), Lemma('dog.n.01.vovse')]
-
- sorted(wn.synset('dog.n.01').lemmas('por'))
- [Lemma('dog.n.01.cachorra'), Lemma('dog.n.01.cachorro'), Lemma('dog.n.01.cadela'), Lemma('dog.n.01.c\xe3o')]
-
- >>> dog_lemma = wn.lemma(b'dog.n.01.c\xc3\xa3o'.decode('utf-8'), lang='por')
- >>> dog_lemma
- Lemma('dog.n.01.c\xe3o')
- >>> dog_lemma.lang()
- 'por'
- >>> len(list(wordnet.all_lemma_names(pos='n', lang='jpn')))
- 64797
-
--------
-Synsets
--------
-
-`Synset`: a set of synonyms that share a common meaning.
-
- >>> dog = wn.synset('dog.n.01')
- >>> dog.hypernyms()
- [Synset('canine.n.02'), Synset('domestic_animal.n.01')]
- >>> dog.hyponyms() # doctest: +ELLIPSIS
- [Synset('basenji.n.01'), Synset('corgi.n.01'), Synset('cur.n.01'), Synset('dalmatian.n.02'), ...]
- >>> dog.member_holonyms()
- [Synset('canis.n.01'), Synset('pack.n.06')]
- >>> dog.root_hypernyms()
- [Synset('entity.n.01')]
- >>> wn.synset('dog.n.01').lowest_common_hypernyms(wn.synset('cat.n.01'))
- [Synset('carnivore.n.01')]
-
-Each synset contains one or more lemmas, which represent a specific
-sense of a specific word.
-
-Note that some relations are defined by WordNet only over Lemmas:
-
- >>> good = wn.synset('good.a.01')
- >>> good.antonyms()
- Traceback (most recent call last):
- File "<stdin>", line 1, in <module>
- AttributeError: 'Synset' object has no attribute 'antonyms'
- >>> good.lemmas()[0].antonyms()
- [Lemma('bad.a.01.bad')]
-
-The relations that are currently defined in this way are `antonyms`,
-`derivationally_related_forms` and `pertainyms`.
-
-If you know the byte offset used to identify a synset in the original
-Princeton WordNet data file, you can use that to instantiate the synset
-in NLTK:
-
- >>> wn.synset_from_pos_and_offset('n', 4543158)
- Synset('wagon.n.01')
-
-------
-Lemmas
-------
-
- >>> eat = wn.lemma('eat.v.03.eat')
- >>> eat
- Lemma('feed.v.06.eat')
- >>> print(eat.key())
- eat%2:34:02::
- >>> eat.count()
- 4
- >>> wn.lemma_from_key(eat.key())
- Lemma('feed.v.06.eat')
- >>> wn.lemma_from_key(eat.key()).synset()
- Synset('feed.v.06')
- >>> wn.lemma_from_key('feebleminded%5:00:00:retarded:00')
- Lemma('backward.s.03.feebleminded')
- >>> for lemma in wn.synset('eat.v.03').lemmas():
- ... print(lemma, lemma.count())
- ...
- Lemma('feed.v.06.feed') 3
- Lemma('feed.v.06.eat') 4
- >>> for lemma in wn.lemmas('eat', 'v'):
- ... print(lemma, lemma.count())
- ...
- Lemma('eat.v.01.eat') 61
- Lemma('eat.v.02.eat') 13
- Lemma('feed.v.06.eat') 4
- Lemma('eat.v.04.eat') 0
- Lemma('consume.v.05.eat') 0
- Lemma('corrode.v.01.eat') 0
- >>> wn.lemma('jump.v.11.jump')
- Lemma('jump.v.11.jump')
-
-Lemmas can also have relations between them:
-
- >>> vocal = wn.lemma('vocal.a.01.vocal')
- >>> vocal.derivationally_related_forms()
- [Lemma('vocalize.v.02.vocalize')]
- >>> vocal.pertainyms()
- [Lemma('voice.n.02.voice')]
- >>> vocal.antonyms()
- [Lemma('instrumental.a.01.instrumental')]
-
-The three relations above exist only on lemmas, not on synsets.
-
------------
-Verb Frames
------------
-
- >>> wn.synset('think.v.01').frame_ids()
- [5, 9]
- >>> for lemma in wn.synset('think.v.01').lemmas():
- ... print(lemma, lemma.frame_ids())
- ... print(" | ".join(lemma.frame_strings()))
- ...
- Lemma('think.v.01.think') [5, 9]
- Something think something Adjective/Noun | Somebody think somebody
- Lemma('think.v.01.believe') [5, 9]
- Something believe something Adjective/Noun | Somebody believe somebody
- Lemma('think.v.01.consider') [5, 9]
- Something consider something Adjective/Noun | Somebody consider somebody
- Lemma('think.v.01.conceive') [5, 9]
- Something conceive something Adjective/Noun | Somebody conceive somebody
- >>> wn.synset('stretch.v.02').frame_ids()
- [8]
- >>> for lemma in wn.synset('stretch.v.02').lemmas():
- ... print(lemma, lemma.frame_ids())
- ... print(" | ".join(lemma.frame_strings()))
- ...
- Lemma('stretch.v.02.stretch') [8, 2]
- Somebody stretch something | Somebody stretch
- Lemma('stretch.v.02.extend') [8]
- Somebody extend something
-
-
-----------
-Similarity
-----------
-
- >>> dog = wn.synset('dog.n.01')
- >>> cat = wn.synset('cat.n.01')
-
- >>> hit = wn.synset('hit.v.01')
- >>> slap = wn.synset('slap.v.01')
-
-
-``synset1.path_similarity(synset2):``
-Return a score denoting how similar two word senses are, based on the
-shortest path that connects the senses in the is-a (hypernym/hyponym)
-taxonomy. The score is in the range 0 to 1. By default, a fake root node
-is now added for verbs, so in cases where previously no path could be
-found (and None was returned) a value is now returned. The old behavior
-can be restored by setting simulate_root to False. A score of 1
-represents identity, i.e. comparing a sense with itself returns 1.
-
- >>> dog.path_similarity(cat) # doctest: +ELLIPSIS
- 0.2...
-
- >>> hit.path_similarity(slap) # doctest: +ELLIPSIS
- 0.142...
-
- >>> wn.path_similarity(hit, slap) # doctest: +ELLIPSIS
- 0.142...
-
- >>> print(hit.path_similarity(slap, simulate_root=False))
- None
-
- >>> print(wn.path_similarity(hit, slap, simulate_root=False))
- None
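-
-The 0.2 for dog/cat is consistent with computing the score as
-1 / (shortest_path_distance + 1); the following is a sketch of that
-relationship, not the library's internal implementation:
-
- 1.0 / (dog.shortest_path_distance(cat) + 1) # 4 edges between dog and cat -> 0.2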
-
-``synset1.lch_similarity(synset2):``
-Leacock-Chodorow Similarity:
-Return a score denoting how similar two word senses are, based on the
-shortest path that connects the senses (as above) and the maximum depth
-of the taxonomy in which the senses occur. The relationship is given
-as -log(p/2d) where p is the shortest path length and d the taxonomy
-depth.
-
- >>> dog.lch_similarity(cat) # doctest: +ELLIPSIS
- 2.028...
-
- >>> hit.lch_similarity(slap) # doctest: +ELLIPSIS
- 1.312...
-
- >>> wn.lch_similarity(hit, slap) # doctest: +ELLIPSIS
- 1.312...
-
- >>> print(hit.lch_similarity(slap, simulate_root=False))
- None
-
- >>> print(wn.lch_similarity(hit, slap, simulate_root=False))
- None
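-
-The dog/cat value can be reproduced by hand; a sketch, assuming the path
-length p is counted in nodes (4 edges + 1) and the WordNet 3.0 noun
-taxonomy has maximum depth d = 19:
-
- from math import log
- -log((dog.shortest_path_distance(cat) + 1) / (2.0 * 19)) # ~2.028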
-
-``synset1.wup_similarity(synset2):``
-Wu-Palmer Similarity:
-Return a score denoting how similar two word senses are, based on the
-depth of the two senses in the taxonomy and that of their Least Common
-Subsumer (most specific ancestor node). Note that at this time the
-scores given do _not_ always agree with those given by Pedersen's Perl
-implementation of Wordnet Similarity.
-
-The LCS does not necessarily feature in the shortest path connecting the
-two senses, as it is by definition the common ancestor deepest in the
-taxonomy, not closest to the two senses. Typically, however, it does
-feature in that path. Where multiple candidates for the LCS exist, the
-one whose shortest path to the root node is longest is selected. Where
-the LCS has multiple paths to the root, the longer path is used for
-the purposes of the calculation.
-
- >>> dog.wup_similarity(cat) # doctest: +ELLIPSIS
- 0.857...
-
- >>> hit.wup_similarity(slap)
- 0.25
-
- >>> wn.wup_similarity(hit, slap)
- 0.25
-
- >>> print(hit.wup_similarity(slap, simulate_root=False))
- None
-
- >>> print(wn.wup_similarity(hit, slap, simulate_root=False))
- None
-
-``wordnet_ic``
-Information Content:
-Load an information content file from the wordnet_ic corpus.
-
- >>> from nltk.corpus import wordnet_ic
- >>> brown_ic = wordnet_ic.ic('ic-brown.dat')
- >>> semcor_ic = wordnet_ic.ic('ic-semcor.dat')
-
-Or you can create an information content dictionary from a corpus (or
-anything that has a words() method).
-
- >>> from nltk.corpus import genesis
- >>> genesis_ic = wn.ic(genesis, False, 0.0)
-
-``synset1.res_similarity(synset2, ic):``
-Resnik Similarity:
-Return a score denoting how similar two word senses are, based on the
-Information Content (IC) of the Least Common Subsumer (most specific
-ancestor node). Note that for any similarity measure that uses
-information content, the result is dependent on the corpus used to
-generate the information content and the specifics of how the
-information content was created.
-
- >>> dog.res_similarity(cat, brown_ic) # doctest: +ELLIPSIS
- 7.911...
- >>> dog.res_similarity(cat, genesis_ic) # doctest: +ELLIPSIS
- 7.204...
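-
-Equivalently, the score is the IC of the subsumer itself; a sketch,
-assuming carnivore.n.01 is the most informative common subsumer of dog
-and cat:
-
- from nltk.corpus.reader.wordnet import information_content
- information_content(wn.synset('carnivore.n.01'), brown_ic) # ~7.911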
-
-``synset1.jcn_similarity(synset2, ic):``
-Jiang-Conrath Similarity
-Return a score denoting how similar two word senses are, based on the
-Information Content (IC) of the Least Common Subsumer (most specific
-ancestor node) and that of the two input Synsets. The relationship is
-given by the equation 1 / (IC(s1) + IC(s2) - 2 * IC(lcs)).
-
- >>> dog.jcn_similarity(cat, brown_ic) # doctest: +ELLIPSIS
- 0.449...
- >>> dog.jcn_similarity(cat, genesis_ic) # doctest: +ELLIPSIS
- 0.285...
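-
-The same value can be reproduced from the ICs directly; a sketch using
-information_content as above:
-
- ic_dog = information_content(dog, brown_ic)
- ic_cat = information_content(cat, brown_ic)
- ic_lcs = information_content(wn.synset('carnivore.n.01'), brown_ic)
- 1.0 / (ic_dog + ic_cat - 2 * ic_lcs) # ~0.449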
-
-``synset1.lin_similarity(synset2, ic):``
-Lin Similarity:
-Return a score denoting how similar two word senses are, based on the
-Information Content (IC) of the Least Common Subsumer (most specific
-ancestor node) and that of the two input Synsets. The relationship is
-given by the equation 2 * IC(lcs) / (IC(s1) + IC(s2)).
-
- >>> dog.lin_similarity(cat, semcor_ic) # doctest: +ELLIPSIS
- 0.886...
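-
-Using the quantities from the Jiang-Conrath sketch above (with the ICs
-taken from the same corpus), this is simply:
-
- 2.0 * ic_lcs / (ic_dog + ic_cat)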
-
-
----------------------
-Access to all Synsets
----------------------
-
-Iterate over all the noun synsets:
-
- >>> for synset in list(wn.all_synsets('n'))[:10]:
- ... print(synset)
- ...
- Synset('entity.n.01')
- Synset('physical_entity.n.01')
- Synset('abstraction.n.06')
- Synset('thing.n.12')
- Synset('object.n.01')
- Synset('whole.n.02')
- Synset('congener.n.03')
- Synset('living_thing.n.01')
- Synset('organism.n.01')
- Synset('benthos.n.02')
-
-Get all synsets for this word, possibly restricted by POS:
-
- >>> wn.synsets('dog') # doctest: +ELLIPSIS
- [Synset('dog.n.01'), Synset('frump.n.01'), Synset('dog.n.03'), Synset('cad.n.01'), ...]
- >>> wn.synsets('dog', pos='v')
- [Synset('chase.v.01')]
-
-Walk through the noun synsets looking at their hypernyms:
-
- >>> from itertools import islice
- >>> for synset in islice(wn.all_synsets('n'), 5):
- ... print(synset, synset.hypernyms())
- ...
- Synset('entity.n.01') []
- Synset('physical_entity.n.01') [Synset('entity.n.01')]
- Synset('abstraction.n.06') [Synset('entity.n.01')]
- Synset('thing.n.12') [Synset('physical_entity.n.01')]
- Synset('object.n.01') [Synset('physical_entity.n.01')]
-
-
-------
-Morphy
-------
-
-Look up forms not in WordNet, with the help of Morphy:
-
- >>> wn.morphy('denied', wn.NOUN)
- >>> print(wn.morphy('denied', wn.VERB))
- deny
- >>> wn.synsets('denied', wn.NOUN)
- []
- >>> wn.synsets('denied', wn.VERB) # doctest: +NORMALIZE_WHITESPACE
- [Synset('deny.v.01'), Synset('deny.v.02'), Synset('deny.v.03'), Synset('deny.v.04'),
- Synset('deny.v.05'), Synset('traverse.v.03'), Synset('deny.v.07')]
-
-Morphy uses a combination of inflectional ending rules and exception
-lists to handle a variety of different possibilities:
-
- >>> print(wn.morphy('dogs'))
- dog
- >>> print(wn.morphy('churches'))
- church
- >>> print(wn.morphy('aardwolves'))
- aardwolf
- >>> print(wn.morphy('abaci'))
- abacus
- >>> print(wn.morphy('book', wn.NOUN))
- book
- >>> wn.morphy('hardrock', wn.ADV)
- >>> wn.morphy('book', wn.ADJ)
- >>> wn.morphy('his', wn.NOUN)
- >>>
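-
-The inflectional rules are simple suffix substitutions; an illustrative
-subset of the noun rules (irregular forms such as 'aardwolves' and
-'abaci' above come from the exception lists instead):
-
- 's' -> '' # dogs -> dog
- 'ches' -> 'ch' # churches -> church
- 'ies' -> 'y' # ponies -> pony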
-
----------------
-Synset Closures
----------------
-
-Compute transitive closures of synsets
-
- >>> dog = wn.synset('dog.n.01')
- >>> hypo = lambda s: s.hyponyms()
- >>> hyper = lambda s: s.hypernyms()
- >>> list(dog.closure(hypo, depth=1)) == dog.hyponyms()
- True
- >>> list(dog.closure(hyper, depth=1)) == dog.hypernyms()
- True
- >>> list(dog.closure(hypo)) # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
- [Synset('basenji.n.01'), Synset('corgi.n.01'), Synset('cur.n.01'),
- Synset('dalmatian.n.02'), Synset('great_pyrenees.n.01'),
- Synset('griffon.n.02'), Synset('hunting_dog.n.01'), Synset('lapdog.n.01'),
- Synset('leonberg.n.01'), Synset('mexican_hairless.n.01'),
- Synset('newfoundland.n.01'), Synset('pooch.n.01'), Synset('poodle.n.01'), ...]
- >>> list(dog.closure(hyper)) # doctest: +NORMALIZE_WHITESPACE
- [Synset('canine.n.02'), Synset('domestic_animal.n.01'), Synset('carnivore.n.01'), Synset('animal.n.01'),
- Synset('placental.n.01'), Synset('organism.n.01'), Synset('mammal.n.01'), Synset('living_thing.n.01'),
- Synset('vertebrate.n.01'), Synset('whole.n.02'), Synset('chordate.n.01'), Synset('object.n.01'),
- Synset('physical_entity.n.01'), Synset('entity.n.01')]
-
-
-----------------
-Regression Tests
-----------------
-
-Bug 85: morphy returns the base form of a word if its input is given
-as a base form for a POS for which that word is not defined:
-
- >>> wn.synsets('book', wn.NOUN)
- [Synset('book.n.01'), Synset('book.n.02'), Synset('record.n.05'), Synset('script.n.01'), Synset('ledger.n.01'), Synset('book.n.06'), Synset('book.n.07'), Synset('koran.n.01'), Synset('bible.n.01'), Synset('book.n.10'), Synset('book.n.11')]
- >>> wn.synsets('book', wn.ADJ)
- []
- >>> wn.morphy('book', wn.NOUN)
- 'book'
- >>> wn.morphy('book', wn.ADJ)
-
-Bug 160: wup_similarity breaks when the two synsets have no common hypernym
-
- >>> t = wn.synsets('picasso')[0]
- >>> m = wn.synsets('male')[1]
- >>> t.wup_similarity(m) # doctest: +ELLIPSIS
- 0.631...
-
- >>> t = wn.synsets('titan')[1]
- >>> s = wn.synsets('say', wn.VERB)[0]
- >>> print(t.wup_similarity(s))
- None
-
-Bug 21: "instance of" not included in LCS (very similar to bug 160)
-
- >>> a = wn.synsets("writings")[0]
- >>> b = wn.synsets("scripture")[0]
- >>> brown_ic = wordnet_ic.ic('ic-brown.dat')
- >>> a.jcn_similarity(b, brown_ic) # doctest: +ELLIPSIS
- 0.175...
-
-Bug 221: Verb root IC is zero
-
- >>> from nltk.corpus.reader.wordnet import information_content
- >>> s = wn.synsets('say', wn.VERB)[0]
- >>> information_content(s, brown_ic) # doctest: +ELLIPSIS
- 4.623...
-
-Bug 161: Comparison between WN keys/lemmas should not be case sensitive
-
- >>> k = wn.synsets("jefferson")[0].lemmas()[0].key()
- >>> wn.lemma_from_key(k)
- Lemma('jefferson.n.01.Jefferson')
- >>> wn.lemma_from_key(k.upper())
- Lemma('jefferson.n.01.Jefferson')
-
-Bug 99: WordNet root_hypernyms gives incorrect results
-
- >>> from nltk.corpus import wordnet as wn
- >>> for s in wn.all_synsets(wn.NOUN):
- ... if s.root_hypernyms()[0] != wn.synset('entity.n.01'):
- ... print(s, s.root_hypernyms())
- ...
- >>>
-
-Bug 382: JCN Division by zero error
-
- >>> tow = wn.synset('tow.v.01')
- >>> shlep = wn.synset('shlep.v.02')
- >>> from nltk.corpus import wordnet_ic
- >>> brown_ic = wordnet_ic.ic('ic-brown.dat')
- >>> tow.jcn_similarity(shlep, brown_ic) # doctest: +ELLIPSIS
- 1...e+300
-
-Bug 428: Depth is zero for instance nouns
-
- >>> s = wn.synset("lincoln.n.01")
- >>> s.max_depth() > 0
- True
-
-Bug 429: Information content smoothing used old reference to all_synsets
-
- >>> genesis_ic = wn.ic(genesis, True, 1.0)
-
-Bug 430: all_synsets used wrong pos lookup when synsets were cached
-
- >>> for ii in wn.all_synsets(): pass
- >>> for ii in wn.all_synsets(): pass
-
-Bug 470: shortest_path_distance ignored instance hypernyms
-
- >>> google = wordnet.synsets("google")[0]
- >>> earth = wordnet.synsets("earth")[0]
- >>> google.wup_similarity(earth) # doctest: +ELLIPSIS
- 0.1...
-
-Bug 484: similarity metrics returned -1 instead of None for no LCS
-
- >>> t = wn.synsets('fly', wn.VERB)[0]
- >>> s = wn.synsets('say', wn.VERB)[0]
- >>> print(s.shortest_path_distance(t))
- None
- >>> print(s.path_similarity(t, simulate_root=False))
- None
- >>> print(s.lch_similarity(t, simulate_root=False))
- None
- >>> print(s.wup_similarity(t, simulate_root=False))
- None
-
-Bug 427: "pants" does not return all the senses it should
-
- >>> from nltk.corpus import wordnet
- >>> wordnet.synsets("pants",'n')
- [Synset('bloomers.n.01'), Synset('pant.n.01'), Synset('trouser.n.01'), Synset('gasp.n.01')]
-
-Bug 482: Some nouns not being lemmatised by WordNetLemmatizer().lemmatize
-
- >>> from nltk.stem.wordnet import WordNetLemmatizer
- >>> WordNetLemmatizer().lemmatize("eggs", pos="n")
- 'egg'
- >>> WordNetLemmatizer().lemmatize("legs", pos="n")
- 'leg'
-
-Bug 284: instance hypernyms not used in similarity calculations
-
- >>> wn.synset('john.n.02').lch_similarity(wn.synset('dog.n.01')) # doctest: +ELLIPSIS
- 1.335...
- >>> wn.synset('john.n.02').wup_similarity(wn.synset('dog.n.01')) # doctest: +ELLIPSIS
- 0.571...
- >>> wn.synset('john.n.02').res_similarity(wn.synset('dog.n.01'), brown_ic) # doctest: +ELLIPSIS
- 2.224...
- >>> wn.synset('john.n.02').jcn_similarity(wn.synset('dog.n.01'), brown_ic) # doctest: +ELLIPSIS
- 0.075...
- >>> wn.synset('john.n.02').lin_similarity(wn.synset('dog.n.01'), brown_ic) # doctest: +ELLIPSIS
- 0.252...
- >>> wn.synset('john.n.02').hypernym_paths() # doctest: +ELLIPSIS
- [[Synset('entity.n.01'), ..., Synset('john.n.02')]]
-
-Issue 541: add domains to wordnet
-
- >>> wn.synset('code.n.03').topic_domains()
- [Synset('computer_science.n.01')]
- >>> wn.synset('pukka.a.01').region_domains()
- [Synset('india.n.01')]
- >>> wn.synset('freaky.a.01').usage_domains()
- [Synset('slang.n.02')]
-
-Issue 629: wordnet failures when python run with -O optimizations
-
- >>> # Run the test suite with python -O to check this
- >>> wn.synsets("brunch")
- [Synset('brunch.n.01'), Synset('brunch.v.01')]
-
-Issue 395: wordnet returns incorrect result for lowest_common_hypernyms of chef and policeman
-
- >>> wn.synset('policeman.n.01').lowest_common_hypernyms(wn.synset('chef.n.01'))
- [Synset('person.n.01')]
-
-Bug https://github.com/nltk/nltk/issues/1641: Non-English lemmas containing capital letters cannot be looked up using wordnet.lemmas() or wordnet.synsets()
-
- >>> wn.lemmas('Londres', lang='fra')
- [Lemma('united_kingdom.n.01.Londres'), Lemma('london.n.01.Londres'), Lemma('london.n.02.Londres')]
- >>> wn.lemmas('londres', lang='fra')
- [Lemma('united_kingdom.n.01.Londres'), Lemma('london.n.01.Londres'), Lemma('london.n.02.Londres')]
-
-Patch-1 https://github.com/nltk/nltk/pull/2065: adding 3 functions (relations) to the WordNet class
-
- >>> wn.synsets("computer_science")[0].in_topic_domains()[2]
- Synset('access_time.n.01')
- >>> wn.synsets("France")[0].in_region_domains()[18]
- Synset('french.n.01')
- >>> wn.synsets("slang")[1].in_usage_domains()[18]
- Synset('can-do.s.01')
+++ /dev/null
-# -*- coding: utf-8 -*-
-
-
-def teardown_module(module=None):
- from nltk.corpus import wordnet
-
- wordnet._unload()
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-===============================
-WordNet Lowest Common Hypernyms
-===============================
-
-Wordnet's lowest_common_hypernyms() method is used to locate the lowest
-single hypernym shared by two given synsets:
-
- >>> from nltk.corpus import wordnet as wn
- >>> wn.synset('kin.n.01').lowest_common_hypernyms(wn.synset('mother.n.01'))
- [Synset('relative.n.01')]
-
- >>> wn.synset('policeman.n.01').lowest_common_hypernyms(wn.synset('chef.n.01'))
- [Synset('person.n.01')]
-
-This method generally returns a single result, but in some cases, more than one
-valid LCH is possible:
-
- >>> wn.synset('body.n.09').lowest_common_hypernyms(wn.synset('sidereal_day.n.01'))
- [Synset('attribute.n.02'), Synset('measure.n.02')]
-
-In some cases, lowest_common_hypernyms() can return one of the synsets which was
-passed to it as an argument:
-
- >>> wn.synset('woman.n.01').lowest_common_hypernyms(wn.synset('girlfriend.n.02'))
- [Synset('woman.n.01')]
-
-In NLTK 3.0a2 the behavior of lowest_common_hypernyms() was changed to give more
-accurate results in a small set of cases, generally when dealing with nouns describing
-social roles or jobs. To emulate the pre v3.0a2 behavior, you can set the use_min_depth=True
-flag:
-
- >>> wn.synset('policeman.n.01').lowest_common_hypernyms(wn.synset('chef.n.01'))
- [Synset('person.n.01')]
- >>> wn.synset('policeman.n.01').lowest_common_hypernyms(wn.synset('chef.n.01'), use_min_depth=True)
- [Synset('organism.n.01')]
-
-In some cases use_min_depth=True may return more or fewer results than the default
-behavior:
-
- >>> wn.synset('woman.n.01').lowest_common_hypernyms(wn.synset('girlfriend.n.02'))
- [Synset('woman.n.01')]
- >>> wn.synset('woman.n.01').lowest_common_hypernyms(wn.synset('girlfriend.n.02'), use_min_depth=True)
- [Synset('organism.n.01'), Synset('woman.n.01')]
-
-In the general case, however, they tend to return the same results:
-
- >>> wn.synset('body.n.09').lowest_common_hypernyms(wn.synset('sidereal_day.n.01'))
- [Synset('attribute.n.02'), Synset('measure.n.02')]
- >>> wn.synset('body.n.09').lowest_common_hypernyms(wn.synset('sidereal_day.n.01'), use_min_depth=True)
- [Synset('attribute.n.02'), Synset('measure.n.02')]
+++ /dev/null
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-.. -*- coding: utf-8 -*-
-
-=========================
-Word Sense Disambiguation
-=========================
-
-
-Lesk Algorithm
---------------
-
-
-Performs the classic Lesk algorithm for Word Sense Disambiguation (WSD) using
-the definitions of the ambiguous word.
-
-Given an ambiguous word and the context in which the word occurs, Lesk returns
-a Synset with the highest number of overlapping words between the context
-sentence and different definitions from each Synset.
-
- >>> from nltk.wsd import lesk
- >>> sent = ['I', 'went', 'to', 'the', 'bank', 'to', 'deposit', 'money', '.']
-
- >>> print(lesk(sent, 'bank', 'n'))
- Synset('savings_bank.n.02')
-
- >>> print(lesk(sent, 'bank'))
- Synset('savings_bank.n.02')
-
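-The overlap count is all there is to it; lesk() is essentially the following
-sketch, in which ties between equally overlapping senses are broken by
-comparing (score, synset) pairs:
-
- >>> from nltk.corpus import wordnet as wn
- >>> def simple_lesk(context_sentence, ambiguous_word):
- ...     context = set(context_sentence)
- ...     scored = ((len(context & set(ss.definition().split())), ss)
- ...               for ss in wn.synsets(ambiguous_word))
- ...     return max(scored)[1]
- >>> simple_lesk(sent, 'bank')
- Synset('savings_bank.n.02')
-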
-The definitions for "bank" are:
-
- >>> from nltk.corpus import wordnet as wn
- >>> for ss in wn.synsets('bank'):
- ... print(ss, ss.definition())
- ...
- Synset('bank.n.01') sloping land (especially the slope beside a body of water)
- Synset('depository_financial_institution.n.01') a financial institution that accepts deposits and channels the money into lending activities
- Synset('bank.n.03') a long ridge or pile
- Synset('bank.n.04') an arrangement of similar objects in a row or in tiers
- Synset('bank.n.05') a supply or stock held in reserve for future use (especially in emergencies)
- Synset('bank.n.06') the funds held by a gambling house or the dealer in some gambling games
- Synset('bank.n.07') a slope in the turn of a road or track; the outside is higher than the inside in order to reduce the effects of centrifugal force
- Synset('savings_bank.n.02') a container (usually with a slot in the top) for keeping money at home
- Synset('bank.n.09') a building in which the business of banking transacted
- Synset('bank.n.10') a flight maneuver; aircraft tips laterally about its longitudinal axis (especially in turning)
- Synset('bank.v.01') tip laterally
- Synset('bank.v.02') enclose with a bank
- Synset('bank.v.03') do business with a bank or keep an account at a bank
- Synset('bank.v.04') act as the banker in a game or in gambling
- Synset('bank.v.05') be in the banking business
- Synset('deposit.v.02') put into a bank account
- Synset('bank.v.07') cover with ashes so to control the rate of burning
- Synset('trust.v.01') have confidence or faith in
-
-Test disambiguation of POS tagged `able`.
-
- >>> [(s, s.pos()) for s in wn.synsets('able')]
- [(Synset('able.a.01'), 'a'), (Synset('able.s.02'), 's'), (Synset('able.s.03'), 's'), (Synset('able.s.04'), 's')]
- >>> sent = 'people should be able to marry a person of their choice'.split()
- >>> lesk(sent, 'able')
- Synset('able.s.04')
- >>> lesk(sent, 'able', pos='a')
- Synset('able.a.01')
-
-Test behavior if there are no matching senses.
-
- >>> lesk('John loves Mary'.split(), 'loves', synsets=[])
# Natural Language Toolkit: Texts
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
regular expression search over tokenized strings, and
distributional similarity.
"""
+from __future__ import print_function, division, unicode_literals
from math import log
-from collections import defaultdict, Counter, namedtuple
+from collections import defaultdict, Counter
from functools import reduce
+from itertools import islice
import re
-import sys
-from nltk.lm import MLE
-from nltk.lm.preprocessing import padded_everygram_pipeline
-from nltk.probability import FreqDist
+from six import text_type
+
+from nltk.probability import FreqDist, LidstoneProbDist
from nltk.probability import ConditionalFreqDist as CFD
from nltk.util import tokenwrap, LazyConcatenation
from nltk.metrics import f_measure, BigramAssocMeasures
from nltk.collocations import BigramCollocationFinder
-from nltk.tokenize import sent_tokenize
-
-ConcordanceLine = namedtuple(
- "ConcordanceLine",
- ["left", "query", "right", "offset", "left_print", "right_print", "line"],
-)
+from nltk.compat import python_2_unicode_compatible
class ContextIndex(object):
in a fixed window around the word; but other definitions may also
be used by providing a custom context function.
"""
-
@staticmethod
def _default_context(tokens, i):
"""One left token and one right token, normalized to lowercase"""
- left = tokens[i - 1].lower() if i != 0 else "*START*"
- right = tokens[i + 1].lower() if i != len(tokens) - 1 else "*END*"
+ left = (tokens[i-1].lower() if i != 0 else '*START*')
+ right = (tokens[i+1].lower() if i != len(tokens) - 1 else '*END*')
return (left, right)
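
For instance, the private helper maps interior positions to a lowercased
(left, right) pair and the edges to sentinels (illustration only):

    from nltk.text import ContextIndex

    print(ContextIndex._default_context(['A', 'dog', 'barks'], 1))  # ('a', 'barks')
    print(ContextIndex._default_context(['A', 'dog', 'barks'], 0))  # ('*START*', 'dog')
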
- def __init__(self, tokens, context_func=None, filter=None, key=lambda x: x):
+ def __init__(self, tokens, context_func=None, filter=None, key=lambda x:x):
self._key = key
self._tokens = tokens
if context_func:
self._context_func = self._default_context
if filter:
tokens = [t for t in tokens if filter(t)]
- self._word_to_contexts = CFD(
- (self._key(w), self._context_func(tokens, i)) for i, w in enumerate(tokens)
- )
- self._context_to_words = CFD(
- (self._context_func(tokens, i), self._key(w)) for i, w in enumerate(tokens)
- )
+ self._word_to_contexts = CFD((self._key(w), self._context_func(tokens, i))
+ for i, w in enumerate(tokens))
+ self._context_to_words = CFD((self._context_func(tokens, i), self._key(w))
+ for i, w in enumerate(tokens))
def tokens(self):
"""
for c in self._word_to_contexts[self._key(word)]:
for w in self._context_to_words[c]:
if w != word:
- scores[w] += (
- self._context_to_words[c][word] * self._context_to_words[c][w]
- )
+ scores[w] += self._context_to_words[c][word] * self._context_to_words[c][w]
return sorted(scores, key=scores.get, reverse=True)[:n]
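
similar_words() ranks candidates by summing, over every shared context, the
product of the query word's and the candidate's counts in that context. A
minimal usage sketch (corpus choice arbitrary):

    from nltk.corpus import brown
    from nltk.text import ContextIndex

    # Index lowercased alphabetic tokens; contexts are (left, right) pairs.
    idx = ContextIndex(brown.words(categories='news'),
                       filter=lambda w: w.isalpha(),
                       key=lambda w: w.lower())
    print(idx.similar_words('money', 10))
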
def common_contexts(self, words, fail_on_unknown=False):
empty = [words[i] for i in range(len(words)) if not contexts[i]]
common = reduce(set.intersection, contexts)
if empty and fail_on_unknown:
- raise ValueError("The following word(s) were not found:", " ".join(words))
+ raise ValueError("The following word(s) were not found:",
+ " ".join(words))
elif not common:
# nothing in common -- just return an empty freqdist.
return FreqDist()
else:
- fd = FreqDist(
- c for w in words for c in self._word_to_contexts[w] if c in common
- )
+ fd = FreqDist(c for w in words
+ for c in self._word_to_contexts[w]
+ if c in common)
return fd
-
-
+@python_2_unicode_compatible
class ConcordanceIndex(object):
"""
An index that can be used to look up the offset locations at which
a given word occurs in a document.
"""
-
- def __init__(self, tokens, key=lambda x: x):
+ def __init__(self, tokens, key=lambda x:x):
"""
Construct a new concordance index.
"""Function mapping each token to an index key (or None)."""
self._offsets = defaultdict(list)
- """Dictionary mapping words (or keys) to lists of offset indices."""
+ """Dictionary mapping words (or keys) to lists of offset
+ indices."""
+
# Initialize the index (self._offsets)
for index, word in enumerate(tokens):
word = self._key(word)
return self._offsets[word]
def __repr__(self):
- return "<ConcordanceIndex for %d tokens (%d types)>" % (
- len(self._tokens),
- len(self._offsets),
- )
+ return '<ConcordanceIndex for %d tokens (%d types)>' % (
+ len(self._tokens), len(self._offsets))
- def find_concordance(self, word, width=80):
- """
- Find all concordance lines given the query word.
+ def print_concordance(self, word, width=75, lines=25):
"""
- half_width = (width - len(word) - 2) // 2
- context = width // 4 # approx number of words of context
+ Print a concordance for ``word`` with the specified context window.
- # Find the instances of the word to create the ConcordanceLine
- concordance_list = []
- offsets = self.offsets(word)
- if offsets:
- for i in offsets:
- query_word = self._tokens[i]
- # Find the context of query word.
- left_context = self._tokens[max(0, i - context) : i]
- right_context = self._tokens[i + 1 : i + context]
- # Create the pretty lines with the query_word in the middle.
- left_print = " ".join(left_context)[-half_width:]
- right_print = " ".join(right_context)[:half_width]
- # The WYSIWYG line of the concordance.
- line_print = " ".join([left_print, query_word, right_print])
- # Create the ConcordanceLine
- concordance_line = ConcordanceLine(
- left_context,
- query_word,
- right_context,
- i,
- left_print,
- right_print,
- line_print,
- )
- concordance_list.append(concordance_line)
- return concordance_list
-
- def print_concordance(self, word, width=80, lines=25):
- """
- Print concordance lines given the query word.
:param word: The target word
:type word: str
- :param lines: The number of lines to display (default=25)
- :type lines: int
:param width: The width of each line, in characters (default=80)
:type width: int
- :param save: The option to save the concordance.
- :type save: bool
+ :param lines: The number of lines to display (default=25)
+ :type lines: int
"""
- concordance_list = self.find_concordance(word, width=width)
+ half_width = (width - len(word) - 2) // 2
+ context = width // 4 # approx number of words of context
- if not concordance_list:
- print("no matches")
+ offsets = self.offsets(word)
+ if offsets:
+ lines = min(lines, len(offsets))
+ print("Displaying %s of %s matches:" % (lines, len(offsets)))
+ for i in offsets:
+ if lines <= 0:
+ break
+ left = (' ' * half_width +
+ ' '.join(self._tokens[i-context:i]))
+ right = ' '.join(self._tokens[i+1:i+context])
+ left = left[-half_width:]
+ right = right[:half_width]
+ print(left, self._tokens[i], right)
+ lines -= 1
else:
- lines = min(lines, len(concordance_list))
- print("Displaying {} of {} matches:".format(lines, len(concordance_list)))
- for i, concordance_line in enumerate(concordance_list[:lines]):
- print(concordance_line.line)
-
+ print("No matches")
class TokenSearcher(object):
"""
brackets as non-capturing parentheses, in addition to matching the
token boundaries; and to have ``'.'`` not match the angle brackets.
"""
-
def __init__(self, tokens):
- self._raw = "".join("<" + w + ">" for w in tokens)
+ self._raw = ''.join('<'+w+'>' for w in tokens)
def findall(self, regexp):
"""
:type regexp: str
"""
# preprocess the regular expression
- regexp = re.sub(r"\s", "", regexp)
- regexp = re.sub(r"<", "(?:<(?:", regexp)
- regexp = re.sub(r">", ")>)", regexp)
- regexp = re.sub(r"(?<!\\)\.", "[^>]", regexp)
+ regexp = re.sub(r'\s', '', regexp)
+ regexp = re.sub(r'<', '(?:<(?:', regexp)
+ regexp = re.sub(r'>', ')>)', regexp)
+ regexp = re.sub(r'(?<!\\)\.', '[^>]', regexp)
# perform the search
hits = re.findall(regexp, self._raw)
# Sanity check
for h in hits:
- if not h.startswith("<") and h.endswith(">"):
- raise ValueError("Bad regexp for TokenSearcher.findall")
+ if not h.startswith('<') and h.endswith('>'):
+ raise ValueError('Bad regexp for TokenSearcher.findall')
# postprocess the output
- hits = [h[1:-1].split("><") for h in hits]
+ hits = [h[1:-1].split('><') for h in hits]
return hits
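
The bracket rewriting above makes each <...> match exactly one token and keeps
'.' from crossing token boundaries; for example (sketch):

    from nltk.corpus import gutenberg
    from nltk.text import TokenSearcher

    searcher = TokenSearcher(gutenberg.words('melville-moby_dick.txt'))
    # Each hit is the list of tokens captured between <a> and <man>.
    print(searcher.findall(r'<a> (<.*>) <man>')[:5])
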
-
+@python_2_unicode_compatible
class Text(object):
"""
A wrapper around a sequence of simple (string) tokens, which is
>>> moby = Text(nltk.corpus.gutenberg.words('melville-moby_dick.txt'))
"""
-
# This defeats lazy loading, but makes things faster. This
# *shouldn't* be necessary because the corpus view *should* be
# doing intelligent caching, but without this it's running slow.
if name:
self.name = name
- elif "]" in tokens[:20]:
- end = tokens[:20].index("]")
- self.name = " ".join(str(tok) for tok in tokens[1:end])
+ elif ']' in tokens[:20]:
+ end = tokens[:20].index(']')
+ self.name = " ".join(text_type(tok) for tok in tokens[1:end])
else:
- self.name = " ".join(str(tok) for tok in tokens[:8]) + "..."
+ self.name = " ".join(text_type(tok) for tok in tokens[:8]) + "..."
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Support item & slice access
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
def __getitem__(self, i):
return self.tokens[i]
def __len__(self):
return len(self.tokens)
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Interactive console methods
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
def concordance(self, word, width=79, lines=25):
"""
- Prints a concordance for ``word`` with the specified context window.
+ Print a concordance for ``word`` with the specified context window.
Word matching is not case-sensitive.
-
- :param word: The target word
- :type word: str
- :param width: The width of each line, in characters (default=80)
- :type width: int
- :param lines: The number of lines to display (default=25)
- :type lines: int
-
:seealso: ``ConcordanceIndex``
"""
- if "_concordance_index" not in self.__dict__:
- self._concordance_index = ConcordanceIndex(
- self.tokens, key=lambda s: s.lower()
- )
+ if '_concordance_index' not in self.__dict__:
+ #print("Building index...")
+ self._concordance_index = ConcordanceIndex(self.tokens,
+ key=lambda s:s.lower())
- return self._concordance_index.print_concordance(word, width, lines)
+ self._concordance_index.print_concordance(word, width, lines)
- def concordance_list(self, word, width=79, lines=25):
- """
- Generate a concordance for ``word`` with the specified context window.
- Word matching is not case-sensitive.
-
- :param word: The target word
- :type word: str
- :param width: The width of each line, in characters (default=80)
- :type width: int
- :param lines: The number of lines to display (default=25)
- :type lines: int
-
- :seealso: ``ConcordanceIndex``
- """
- if "_concordance_index" not in self.__dict__:
- self._concordance_index = ConcordanceIndex(
- self.tokens, key=lambda s: s.lower()
- )
- return self._concordance_index.find_concordance(word, width)[:lines]
-
- def collocation_list(self, num=20, window_size=2):
+ def collocations(self, num=20, window_size=2):
"""
- Return collocations derived from the text, ignoring stopwords.
-
- >>> from nltk.book import text4
- >>> text4.collocation_list()[:2]
- [('United', 'States'), ('fellow', 'citizens')]
+ Print collocations derived from the text, ignoring stopwords.
- :param num: The maximum number of collocations to return.
+ :seealso: find_collocations
+ :param num: The maximum number of collocations to print.
:type num: int
:param window_size: The number of tokens spanned by a collocation (default=2)
:type window_size: int
- :rtype: list(tuple(str, str))
"""
- if not (
- "_collocations" in self.__dict__
- and self._num == num
- and self._window_size == window_size
- ):
+ if not ('_collocations' in self.__dict__ and self._num == num and self._window_size == window_size):
self._num = num
self._window_size = window_size
- # print("Building collocations list")
+ #print("Building collocations list")
from nltk.corpus import stopwords
-
- ignored_words = stopwords.words("english")
+ ignored_words = stopwords.words('english')
finder = BigramCollocationFinder.from_words(self.tokens, window_size)
finder.apply_freq_filter(2)
finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
bigram_measures = BigramAssocMeasures()
- self._collocations = list(finder.nbest(bigram_measures.likelihood_ratio, num))
- return self._collocations
-
- def collocations(self, num=20, window_size=2):
- """
- Print collocations derived from the text, ignoring stopwords.
-
- >>> from nltk.book import text4
- >>> text4.collocations() # doctest: +ELLIPSIS
- United States; fellow citizens; four years; ...
-
- :param num: The maximum number of collocations to print.
- :type num: int
- :param window_size: The number of tokens spanned by a collocation (default=2)
- :type window_size: int
- """
-
- collocation_strings = [
- w1 + " " + w2 for w1, w2 in self.collocation_list(num, window_size)
- ]
- print(tokenwrap(collocation_strings, separator="; "))
+ self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
+ colloc_strings = [w1+' '+w2 for w1, w2 in self._collocations]
+ print(tokenwrap(colloc_strings, separator="; "))
def count(self, word):
"""
:type num: int
:seealso: ContextIndex.similar_words()
"""
- if "_word_context_index" not in self.__dict__:
- # print('Building word-context index...')
- self._word_context_index = ContextIndex(
- self.tokens, filter=lambda x: x.isalpha(), key=lambda s: s.lower()
- )
+ if '_word_context_index' not in self.__dict__:
+ #print('Building word-context index...')
+ self._word_context_index = ContextIndex(self.tokens,
+ filter=lambda x:x.isalpha(),
+ key=lambda s:s.lower())
- # words = self._word_context_index.similar_words(word, num)
+# words = self._word_context_index.similar_words(word, num)
word = word.lower()
wci = self._word_context_index._word_to_contexts
if word in wci.conditions():
contexts = set(wci[word])
- fd = Counter(
- w
- for w in wci.conditions()
- for c in wci[w]
- if c in contexts and not w == word
- )
+ fd = Counter(w for w in wci.conditions() for c in wci[w]
+ if c in contexts and not w == word)
words = [w for w, _ in fd.most_common(num)]
print(tokenwrap(words))
else:
print("No matches")
+
def common_contexts(self, words, num=20):
"""
Find contexts where the specified words appear; list
most frequent common contexts first.
- :param words: The words used to seed the similarity search
- :type words: str
+ :param word: The word used to seed the similarity search
+ :type word: str
:param num: The number of words to generate (default=20)
:type num: int
:seealso: ContextIndex.common_contexts()
"""
- if "_word_context_index" not in self.__dict__:
- # print('Building word-context index...')
- self._word_context_index = ContextIndex(
- self.tokens, key=lambda s: s.lower()
- )
+ if '_word_context_index' not in self.__dict__:
+ #print('Building word-context index...')
+ self._word_context_index = ContextIndex(self.tokens,
+ key=lambda s:s.lower())
try:
fd = self._word_context_index.common_contexts(words, True)
print("No common contexts were found")
else:
ranked_contexts = [w for w, _ in fd.most_common(num)]
- print(tokenwrap(w1 + "_" + w2 for w1, w2 in ranked_contexts))
+ print(tokenwrap(w1+"_"+w2 for w1,w2 in ranked_contexts))
except ValueError as e:
print(e)
:seealso: nltk.draw.dispersion_plot()
"""
from nltk.draw import dispersion_plot
-
dispersion_plot(self, words)
- def _train_default_ngram_lm(self, tokenized_sents, n=3):
- train_data, padded_sents = padded_everygram_pipeline(n, tokenized_sents)
- model = MLE(order=n)
- model.fit(train_data, padded_sents)
- return model
-
- def generate(self, length=100, text_seed=None, random_seed=42):
- """
- Print random text, generated using a trigram language model.
- See also `help(nltk.lm)`.
-
- :param length: The length of text to generate (default=100)
- :type length: int
-
- :param text_seed: Generation can be conditioned on preceding context.
- :type text_seed: list(str)
-
- :param random_seed: A random seed or an instance of `random.Random`. If provided,
- makes the random sampling part of generation reproducible. (default=42)
- :type random_seed: int
-
- """
- # Create the model when using it the first time.
- self._tokenized_sents = [
- sent.split(" ") for sent in sent_tokenize(" ".join(self.tokens))
- ]
- if not hasattr(self, "trigram_model"):
- print("Building ngram index...", file=sys.stderr)
- self._trigram_model = self._train_default_ngram_lm(
- self._tokenized_sents, n=3
- )
-
- generated_tokens = []
-
- assert length > 0, "The `length` must be more than 0."
- while len(generated_tokens) < length:
- for idx, token in enumerate(
- self._trigram_model.generate(
- length, text_seed=text_seed, random_seed=random_seed
- )
- ):
- if token == "<s>":
- continue
- if token == "</s>":
- break
- generated_tokens.append(token)
- random_seed += 1
-
- prefix = " ".join(text_seed) + " " if text_seed else ""
- output_str = prefix + tokenwrap(generated_tokens[:length])
- print(output_str)
- return output_str
+ def generate(self, words):
+ """
+ Issues a reminder to users following the book online
+ """
+ import warnings
+ warnings.warn('The generate() method is no longer available.', DeprecationWarning)
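
The removed implementation trains a padded trigram MLE model from nltk.lm; in
isolation that pipeline looks like this (toy sentences, sketch only):

    from nltk.lm import MLE
    from nltk.lm.preprocessing import padded_everygram_pipeline

    sents = [['the', 'dog', 'barks'], ['the', 'cat', 'meows']]
    # Pads each sentence with <s>/</s> and yields everygrams up to order 3.
    train, vocab = padded_everygram_pipeline(3, sents)
    lm = MLE(order=3)
    lm.fit(train, vocab)
    print(lm.generate(4, random_seed=42))
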
def plot(self, *args):
"""
:seealso: nltk.prob.FreqDist
"""
if "_vocab" not in self.__dict__:
- # print("Building vocabulary index...")
+ #print("Building vocabulary index...")
self._vocab = FreqDist(self)
return self._vocab
self._token_searcher = TokenSearcher(self)
hits = self._token_searcher.findall(regexp)
- hits = [" ".join(h) for h in hits]
+ hits = [' '.join(h) for h in hits]
print(tokenwrap(hits, "; "))
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Helper Methods
- # ////////////////////////////////////////////////////////////
-
- _CONTEXT_RE = re.compile("\w+|[\.\!\?]")
+ #////////////////////////////////////////////////////////////
+ _CONTEXT_RE = re.compile('\w+|[\.\!\?]')
def _context(self, tokens, i):
"""
One left & one right token, both case-normalized. Skip over
that is created for ``similar()`` and ``common_contexts()``.
"""
# Left context
- j = i - 1
- while j >= 0 and not self._CONTEXT_RE.match(tokens[j]):
+ j = i-1
+ while j>=0 and not self._CONTEXT_RE.match(tokens[j]):
j -= 1
- left = tokens[j] if j != 0 else "*START*"
+ left = (tokens[j] if j != 0 else '*START*')
# Right context
- j = i + 1
- while j < len(tokens) and not self._CONTEXT_RE.match(tokens[j]):
+ j = i+1
+ while j<len(tokens) and not self._CONTEXT_RE.match(tokens[j]):
j += 1
- right = tokens[j] if j != len(tokens) else "*END*"
+ right = (tokens[j] if j != len(tokens) else '*END*')
return (left, right)
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# String Display
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
def __str__(self):
- return "<Text: %s>" % self.name
+ return '<Text: %s>' % self.name
def __repr__(self):
- return "<Text: %s>" % self.name
+ return '<Text: %s>' % self.name
# Prototype only; this approach will be slow to load
Iterating over a TextCollection produces all the tokens of all the
texts in order.
"""
-
def __init__(self, source):
- if hasattr(source, "words"): # bridge to the text corpus reader
+ if hasattr(source, 'words'): # bridge to the text corpus reader
source = [source.words(f) for f in source.fileids()]
self._texts = source
idf = self._idf_cache.get(term)
if idf is None:
matches = len([True for text in self._texts if term in text])
- if len(self._texts) == 0:
- raise ValueError("IDF undefined for empty document collection")
- idf = log(len(self._texts) / matches) if matches else 0.0
+ # FIXME Should this raise some kind of error instead?
+ idf = (log(len(self._texts) / matches) if matches else 0.0)
self._idf_cache[term] = idf
return idf
def tf_idf(self, term, text):
return self.tf(term, text) * self.idf(term)
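
With the natural-log idf above, a tiny worked example (toy collection):

    from nltk.text import TextCollection

    docs = [['a', 'b', 'a'], ['a', 'c'], ['b', 'c']]
    coll = TextCollection(docs)
    print(coll.tf('a', docs[0]))      # 2/3: term count over document length
    print(coll.idf('a'))              # log(3/2): 'a' occurs in 2 of 3 texts
    print(coll.tf_idf('a', docs[0]))  # product of the two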
-
def demo():
from nltk.corpus import brown
-
- text = Text(brown.words(categories="news"))
+ text = Text(brown.words(categories='news'))
print(text)
print()
print("Concordance:")
- text.concordance("news")
+ text.concordance('news')
print()
print("Distributionally similar words:")
- text.similar("news")
+ text.similar('news')
print()
print("Collocations:")
text.collocations()
print()
- # print("Automatically generated text:")
- # text.generate()
- # print()
+ #print("Automatically generated text:")
+ #text.generate()
+ #print()
print("Dispersion plot:")
- text.dispersion_plot(["news", "report", "said", "announced"])
+ text.dispersion_plot(['news', 'report', 'said', 'announced'])
print()
print("Vocabulary plot:")
text.plot(50)
print("Indexing:")
print("text[3]:", text[3])
print("text[3:5]:", text[3:5])
- print("text.vocab()['news']:", text.vocab()["news"])
-
+ print("text.vocab()['news']:", text.vocab()['news'])
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
-__all__ = [
- "ContextIndex",
- "ConcordanceIndex",
- "TokenSearcher",
- "Text",
- "TextCollection",
-]
+__all__ = ["ContextIndex",
+ "ConcordanceIndex",
+ "TokenSearcher",
+ "Text",
+ "TextCollection"]
#
# Natural Language Toolkit: TGrep search
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Will Roberts <wildwilhelm@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
-"""
+'''
============================================
TGrep search implementation for NLTK trees
============================================
predicates must always pass the value of these arguments on. The
top-level predicate (constructed by ``_tgrep_exprs_action``) binds the
macro definitions to ``m`` and initialises ``l`` to an empty dictionary.
-"""
+'''
+
+from __future__ import absolute_import, print_function, unicode_literals
import functools
import re
+from six import binary_type, text_type
+
try:
import pyparsing
except ImportError:
- print("Warning: nltk.tgrep will not work without the `pyparsing` package")
- print("installed.")
+ print('Warning: nltk.tgrep will not work without the `pyparsing` package')
+ print('installed.')
import nltk.tree
-
class TgrepException(Exception):
- """Tgrep exception type."""
-
+ '''Tgrep exception type.'''
pass
-
def ancestors(node):
- """
+ '''
Returns the list of all nodes dominating the given tree node.
This method will not work with leaf nodes, since there is no way
to recover the parent.
- """
+ '''
results = []
try:
current = node.parent()
current = current.parent()
return results
-
def unique_ancestors(node):
- """
+ '''
Returns the list of all nodes dominating the given node, where
there is only a single path of descent.
- """
+ '''
results = []
try:
current = node.parent()
current = current.parent()
return results
-
def _descendants(node):
- """
+ '''
Returns the list of all nodes which are descended from the given
tree node in some way.
- """
+ '''
try:
treepos = node.treepositions()
except AttributeError:
return []
return [node[x] for x in treepos[1:]]
-
def _leftmost_descendants(node):
- """
+ '''
Returns the set of all nodes descended in some way through
left branches from this node.
- """
+ '''
try:
treepos = node.treepositions()
except AttributeError:
return []
return [node[x] for x in treepos[1:] if all(y == 0 for y in x)]
-
def _rightmost_descendants(node):
- """
+ '''
Returns the set of all nodes descended in some way through
right branches from this node.
- """
+ '''
try:
rightmost_leaf = max(node.treepositions())
except AttributeError:
return []
return [node[rightmost_leaf[:i]] for i in range(1, len(rightmost_leaf) + 1)]
-
def _istree(obj):
- """Predicate to check whether `obj` is a nltk.tree.Tree."""
+ '''Predicate to check whether `obj` is a nltk.tree.Tree.'''
return isinstance(obj, nltk.tree.Tree)
-
def _unique_descendants(node):
- """
+ '''
Returns the list of all nodes descended from the given node, where
there is only a single path of descent.
- """
+ '''
results = []
current = node
while current and _istree(current) and len(current) == 1:
results.append(current)
return results
-
def _before(node):
- """
+ '''
Returns the set of all nodes that are before the given node.
- """
+ '''
try:
pos = node.treeposition()
tree = node.root()
except AttributeError:
return []
- return [tree[x] for x in tree.treepositions() if x[: len(pos)] < pos[: len(x)]]
-
+ return [tree[x] for x in tree.treepositions()
+ if x[:len(pos)] < pos[:len(x)]]
def _immediately_before(node):
- """
+ '''
Returns the set of all nodes that are immediately before the given
node.
Tree node A immediately precedes node B if the last terminal
symbol (word) produced by A immediately precedes the first
terminal symbol produced by B.
- """
+ '''
try:
pos = node.treeposition()
tree = node.root()
idx -= 1
if idx < 0:
return []
- pos = list(pos[: idx + 1])
+ pos = list(pos[:idx + 1])
pos[-1] -= 1
before = tree[pos]
return [before] + _rightmost_descendants(before)
-
def _after(node):
- """
+ '''
Returns the set of all nodes that are after the given node.
- """
+ '''
try:
pos = node.treeposition()
tree = node.root()
except AttributeError:
return []
- return [tree[x] for x in tree.treepositions() if x[: len(pos)] > pos[: len(x)]]
-
+ return [tree[x] for x in tree.treepositions()
+ if x[:len(pos)] > pos[:len(x)]]
def _immediately_after(node):
- """
+ '''
Returns the set of all nodes that are immediately after the given
node.
Tree node A immediately follows node B if the first terminal
symbol (word) produced by A immediately follows the last
terminal symbol produced by B.
- """
+ '''
try:
pos = node.treeposition()
tree = node.root()
current = current.parent()
if idx < 0:
return []
- pos = list(pos[: idx + 1])
+ pos = list(pos[:idx + 1])
pos[-1] += 1
after = tree[pos]
return [after] + _leftmost_descendants(after)
-
def _tgrep_node_literal_value(node):
- """
+ '''
Gets the string value of a given parse tree node, for comparison
using the tgrep node literal predicates.
- """
- return node.label() if _istree(node) else str(node)
-
+ '''
+ return (node.label() if _istree(node) else text_type(node))
def _tgrep_macro_use_action(_s, _l, tokens):
- """
+ '''
Builds a lambda function which looks up the macro name used.
- """
+ '''
assert len(tokens) == 1
- assert tokens[0][0] == "@"
+ assert tokens[0][0] == '@'
macro_name = tokens[0][1:]
-
def macro_use(n, m=None, l=None):
if m is None or macro_name not in m:
- raise TgrepException("macro {0} not defined".format(macro_name))
+ raise TgrepException('macro {0} not defined'.format(macro_name))
return m[macro_name](n, m, l)
-
return macro_use
-
def _tgrep_node_action(_s, _l, tokens):
- """
+ '''
Builds a lambda function representing a predicate on a tree node
depending on the name of its node.
- """
+ '''
+ # print 'node tokens: ', tokens
if tokens[0] == "'":
# strip initial apostrophe (tgrep2 print command)
tokens = tokens[1:]
if len(tokens) > 1:
# disjunctive definition of a node name
- assert list(set(tokens[1::2])) == ["|"]
+ assert list(set(tokens[1::2])) == ['|']
# recursively call self to interpret each node name definition
- tokens = [_tgrep_node_action(None, None, [node]) for node in tokens[::2]]
+ tokens = [_tgrep_node_action(None, None, [node])
+ for node in tokens[::2]]
# capture tokens and return the disjunction
return (lambda t: lambda n, m=None, l=None: any(f(n, m, l) for f in t))(tokens)
else:
- if hasattr(tokens[0], "__call__"):
+ if hasattr(tokens[0], '__call__'):
# this is a previously interpreted parenthetical node
# definition (lambda function)
return tokens[0]
- elif tokens[0] == "*" or tokens[0] == "__":
+ elif tokens[0] == '*' or tokens[0] == '__':
return lambda n, m=None, l=None: True
elif tokens[0].startswith('"'):
assert tokens[0].endswith('"')
- node_lit = tokens[0][1:-1].replace('\\"', '"').replace("\\\\", "\\")
- return (
- lambda s: lambda n, m=None, l=None: _tgrep_node_literal_value(n) == s
- )(node_lit)
- elif tokens[0].startswith("/"):
- assert tokens[0].endswith("/")
+ node_lit = tokens[0][1:-1].replace('\\"', '"').replace('\\\\', '\\')
+ return (lambda s: lambda n, m=None, l=None: _tgrep_node_literal_value(n) == s)(node_lit)
+ elif tokens[0].startswith('/'):
+ assert tokens[0].endswith('/')
node_lit = tokens[0][1:-1]
- return (
- lambda r: lambda n, m=None, l=None: r.search(
- _tgrep_node_literal_value(n)
- )
- )(re.compile(node_lit))
- elif tokens[0].startswith("i@"):
+ return (lambda r: lambda n, m=None, l=None:
+ r.search(_tgrep_node_literal_value(n)))(re.compile(node_lit))
+ elif tokens[0].startswith('i@'):
node_func = _tgrep_node_action(_s, _l, [tokens[0][2:].lower()])
- return (
- lambda f: lambda n, m=None, l=None: f(
- _tgrep_node_literal_value(n).lower()
- )
- )(node_func)
+ return (lambda f: lambda n, m=None, l=None:
+ f(_tgrep_node_literal_value(n).lower()))(node_func)
else:
- return (
- lambda s: lambda n, m=None, l=None: _tgrep_node_literal_value(n) == s
- )(tokens[0])
-
+ return (lambda s: lambda n, m=None, l=None:
+ _tgrep_node_literal_value(n) == s)(tokens[0])
def _tgrep_parens_action(_s, _l, tokens):
- """
+ '''
Builds a lambda function representing a predicate on a tree node
from a parenthetical notation.
- """
+ '''
+ # print 'parenthetical tokens: ', tokens
assert len(tokens) == 3
- assert tokens[0] == "("
- assert tokens[2] == ")"
+ assert tokens[0] == '('
+ assert tokens[2] == ')'
return tokens[1]
-
def _tgrep_nltk_tree_pos_action(_s, _l, tokens):
- """
+ '''
Builds a lambda function representing a predicate on a tree node
which returns true if the node is located at a specific tree
position.
- """
+ '''
# recover the tuple from the parsed string
node_tree_position = tuple(int(x) for x in tokens if x.isdigit())
# capture the node's tree position
- return (
- lambda i: lambda n, m=None, l=None: (
- hasattr(n, "treeposition") and n.treeposition() == i
- )
- )(node_tree_position)
-
+ return (lambda i: lambda n, m=None, l=None: (hasattr(n, 'treeposition') and
+ n.treeposition() == i))(node_tree_position)
def _tgrep_relation_action(_s, _l, tokens):
- """
+ '''
Builds a lambda function representing a predicate on a tree node
depending on its relation to other nodes in the tree.
- """
+ '''
+ # print 'relation tokens: ', tokens
# process negation first if needed
negated = False
- if tokens[0] == "!":
+ if tokens[0] == '!':
negated = True
tokens = tokens[1:]
- if tokens[0] == "[":
+ if tokens[0] == '[':
# process square-bracketed relation expressions
assert len(tokens) == 3
- assert tokens[2] == "]"
+ assert tokens[2] == ']'
retval = tokens[1]
else:
# process operator-node relation expressions
assert len(tokens) == 2
operator, predicate = tokens
# A < B A is the parent of (immediately dominates) B.
- if operator == "<":
- retval = lambda n, m=None, l=None: (
- _istree(n) and any(predicate(x, m, l) for x in n)
- )
+ if operator == '<':
+ retval = lambda n, m=None, l=None: (_istree(n) and
+ any(predicate(x, m, l) for x in n))
# A > B A is the child of B.
- elif operator == ">":
- retval = lambda n, m=None, l=None: (
- hasattr(n, "parent")
- and bool(n.parent())
- and predicate(n.parent(), m, l)
- )
+ elif operator == '>':
+ retval = lambda n, m=None, l=None: (hasattr(n, 'parent') and
+ bool(n.parent()) and
+ predicate(n.parent(), m, l))
# A <, B Synonymous with A <1 B.
- elif operator == "<," or operator == "<1":
- retval = lambda n, m=None, l=None: (
- _istree(n) and bool(list(n)) and predicate(n[0], m, l)
- )
+ elif operator == '<,' or operator == '<1':
+ retval = lambda n, m=None, l=None: (_istree(n) and
+ bool(list(n)) and
+ predicate(n[0], m, l))
# A >, B Synonymous with A >1 B.
- elif operator == ">," or operator == ">1":
- retval = lambda n, m=None, l=None: (
- hasattr(n, "parent")
- and bool(n.parent())
- and (n is n.parent()[0])
- and predicate(n.parent(), m, l)
- )
+ elif operator == '>,' or operator == '>1':
+ retval = lambda n, m=None, l=None: (hasattr(n, 'parent') and
+ bool(n.parent()) and
+ (n is n.parent()[0]) and
+ predicate(n.parent(), m, l))
# A <N B B is the Nth child of A (the first child is <1).
- elif operator[0] == "<" and operator[1:].isdigit():
+ elif operator[0] == '<' and operator[1:].isdigit():
idx = int(operator[1:])
# capture the index parameter
- retval = (
- lambda i: lambda n, m=None, l=None: (
- _istree(n)
- and bool(list(n))
- and 0 <= i < len(n)
- and predicate(n[i], m, l)
- )
- )(idx - 1)
+ retval = (lambda i: lambda n, m=None, l=None: (_istree(n) and
+ bool(list(n)) and
+ 0 <= i < len(n) and
+ predicate(n[i], m, l)))(idx - 1)
# A >N B A is the Nth child of B (the first child is >1).
- elif operator[0] == ">" and operator[1:].isdigit():
+ elif operator[0] == '>' and operator[1:].isdigit():
idx = int(operator[1:])
# capture the index parameter
- retval = (
- lambda i: lambda n, m=None, l=None: (
- hasattr(n, "parent")
- and bool(n.parent())
- and 0 <= i < len(n.parent())
- and (n is n.parent()[i])
- and predicate(n.parent(), m, l)
- )
- )(idx - 1)
+ retval = (lambda i: lambda n, m=None, l=None: (hasattr(n, 'parent') and
+ bool(n.parent()) and
+ 0 <= i < len(n.parent()) and
+ (n is n.parent()[i]) and
+ predicate(n.parent(), m, l)))(idx - 1)
# A <' B B is the last child of A (also synonymous with A <-1 B).
# A <- B B is the last child of A (synonymous with A <-1 B).
- elif operator == "<'" or operator == "<-" or operator == "<-1":
- retval = lambda n, m=None, l=None: (
- _istree(n) and bool(list(n)) and predicate(n[-1], m, l)
- )
+ elif operator == '<\'' or operator == '<-' or operator == '<-1':
+ retval = lambda n, m=None, l=None: (_istree(n) and bool(list(n))
+ and predicate(n[-1], m, l))
# A >' B A is the last child of B (also synonymous with A >-1 B).
# A >- B A is the last child of B (synonymous with A >-1 B).
- elif operator == ">'" or operator == ">-" or operator == ">-1":
- retval = lambda n, m=None, l=None: (
- hasattr(n, "parent")
- and bool(n.parent())
- and (n is n.parent()[-1])
- and predicate(n.parent(), m, l)
- )
+ elif operator == '>\'' or operator == '>-' or operator == '>-1':
+ retval = lambda n, m=None, l=None: (hasattr(n, 'parent') and
+ bool(n.parent()) and
+ (n is n.parent()[-1]) and
+ predicate(n.parent(), m, l))
# A <-N B B is the N th-to-last child of A (the last child is <-1).
- elif operator[:2] == "<-" and operator[2:].isdigit():
+ elif operator[:2] == '<-' and operator[2:].isdigit():
idx = -int(operator[2:])
# capture the index parameter
- retval = (
- lambda i: lambda n, m=None, l=None: (
- _istree(n)
- and bool(list(n))
- and 0 <= (i + len(n)) < len(n)
- and predicate(n[i + len(n)], m, l)
- )
- )(idx)
+ retval = (lambda i: lambda n, m=None, l=None: (_istree(n) and
+ bool(list(n)) and
+ 0 <= (i + len(n)) < len(n) and
+ predicate(n[i + len(n)], m, l)))(idx)
# A >-N B A is the N th-to-last child of B (the last child is >-1).
- elif operator[:2] == ">-" and operator[2:].isdigit():
+ elif operator[:2] == '>-' and operator[2:].isdigit():
idx = -int(operator[2:])
# capture the index parameter
- retval = (
- lambda i: lambda n, m=None, l=None: (
- hasattr(n, "parent")
- and bool(n.parent())
- and 0 <= (i + len(n.parent())) < len(n.parent())
- and (n is n.parent()[i + len(n.parent())])
- and predicate(n.parent(), m, l)
- )
- )(idx)
+ retval = (lambda i: lambda n, m=None, l=None:
+ (hasattr(n, 'parent') and
+ bool(n.parent()) and
+ 0 <= (i + len(n.parent())) < len(n.parent()) and
+ (n is n.parent()[i + len(n.parent())]) and
+ predicate(n.parent(), m, l)))(idx)
# A <: B B is the only child of A
- elif operator == "<:":
- retval = lambda n, m=None, l=None: (
- _istree(n) and len(n) == 1 and predicate(n[0], m, l)
- )
+ elif operator == '<:':
+ retval = lambda n, m=None, l=None: (_istree(n) and
+ len(n) == 1 and
+ predicate(n[0], m, l))
# A >: B A is the only child of B.
- elif operator == ">:":
- retval = lambda n, m=None, l=None: (
- hasattr(n, "parent")
- and bool(n.parent())
- and len(n.parent()) == 1
- and predicate(n.parent(), m, l)
- )
+ elif operator == '>:':
+ retval = lambda n, m=None, l=None: (hasattr(n, 'parent') and
+ bool(n.parent()) and
+ len(n.parent()) == 1 and
+ predicate(n.parent(), m, l))
# A << B A dominates B (A is an ancestor of B).
- elif operator == "<<":
- retval = lambda n, m=None, l=None: (
- _istree(n) and any(predicate(x, m, l) for x in _descendants(n))
- )
+ elif operator == '<<':
+ retval = lambda n, m=None, l=None: (_istree(n) and
+ any(predicate(x, m, l) for x in _descendants(n)))
# A >> B A is dominated by B (A is a descendant of B).
- elif operator == ">>":
- retval = lambda n, m=None, l=None: any(
- predicate(x, m, l) for x in ancestors(n)
- )
+ elif operator == '>>':
+ retval = lambda n, m=None, l=None: any(predicate(x, m, l) for x in ancestors(n))
# A <<, B B is a left-most descendant of A.
- elif operator == "<<," or operator == "<<1":
- retval = lambda n, m=None, l=None: (
- _istree(n) and any(predicate(x, m, l) for x in _leftmost_descendants(n))
- )
+ elif operator == '<<,' or operator == '<<1':
+ retval = lambda n, m=None, l=None: (_istree(n) and
+ any(predicate(x, m, l)
+ for x in _leftmost_descendants(n)))
# A >>, B A is a left-most descendant of B.
- elif operator == ">>,":
- retval = lambda n, m=None, l=None: any(
- (predicate(x, m, l) and n in _leftmost_descendants(x))
- for x in ancestors(n)
- )
+ elif operator == '>>,':
+ retval = lambda n, m=None, l=None: any((predicate(x, m, l) and
+ n in _leftmost_descendants(x))
+ for x in ancestors(n))
# A <<' B B is a right-most descendant of A.
- elif operator == "<<'":
- retval = lambda n, m=None, l=None: (
- _istree(n)
- and any(predicate(x, m, l) for x in _rightmost_descendants(n))
- )
+ elif operator == '<<\'':
+ retval = lambda n, m=None, l=None: (_istree(n) and
+ any(predicate(x, m, l)
+ for x in _rightmost_descendants(n)))
# A >>' B A is a right-most descendant of B.
- elif operator == ">>'":
- retval = lambda n, m=None, l=None: any(
- (predicate(x, m, l) and n in _rightmost_descendants(x))
- for x in ancestors(n)
- )
+ elif operator == '>>\'':
+ retval = lambda n, m=None, l=None: any((predicate(x, m, l) and
+ n in _rightmost_descendants(x))
+ for x in ancestors(n))
# A <<: B There is a single path of descent from A and B is on it.
- elif operator == "<<:":
- retval = lambda n, m=None, l=None: (
- _istree(n) and any(predicate(x, m, l) for x in _unique_descendants(n))
- )
+ elif operator == '<<:':
+ retval = lambda n, m=None, l=None: (_istree(n) and
+ any(predicate(x, m, l)
+ for x in _unique_descendants(n)))
# A >>: B There is a single path of descent from B and A is on it.
- elif operator == ">>:":
- retval = lambda n, m=None, l=None: any(
- predicate(x, m, l) for x in unique_ancestors(n)
- )
+ elif operator == '>>:':
+ retval = lambda n, m=None, l=None: any(predicate(x, m, l) for x in unique_ancestors(n))
# A . B A immediately precedes B.
- elif operator == ".":
- retval = lambda n, m=None, l=None: any(
- predicate(x, m, l) for x in _immediately_after(n)
- )
+ elif operator == '.':
+ retval = lambda n, m=None, l=None: any(predicate(x, m, l)
+ for x in _immediately_after(n))
# A , B A immediately follows B.
- elif operator == ",":
- retval = lambda n, m=None, l=None: any(
- predicate(x, m, l) for x in _immediately_before(n)
- )
+ elif operator == ',':
+ retval = lambda n, m=None, l=None: any(predicate(x, m, l)
+ for x in _immediately_before(n))
# A .. B A precedes B.
- elif operator == "..":
- retval = lambda n, m=None, l=None: any(
- predicate(x, m, l) for x in _after(n)
- )
+ elif operator == '..':
+ retval = lambda n, m=None, l=None: any(predicate(x, m, l) for x in _after(n))
# A ,, B A follows B.
- elif operator == ",,":
- retval = lambda n, m=None, l=None: any(
- predicate(x, m, l) for x in _before(n)
- )
+ elif operator == ',,':
+ retval = lambda n, m=None, l=None: any(predicate(x, m, l) for x in _before(n))
# A $ B A is a sister of B (and A != B).
- elif operator == "$" or operator == "%":
- retval = lambda n, m=None, l=None: (
- hasattr(n, "parent")
- and bool(n.parent())
- and any(predicate(x, m, l) for x in n.parent() if x is not n)
- )
+ elif operator == '$' or operator == '%':
+ retval = lambda n, m=None, l=None: (hasattr(n, 'parent') and
+ bool(n.parent()) and
+ any(predicate(x, m, l)
+ for x in n.parent() if x is not n))
# A $. B A is a sister of and immediately precedes B.
- elif operator == "$." or operator == "%.":
- retval = lambda n, m=None, l=None: (
- hasattr(n, "right_sibling")
- and bool(n.right_sibling())
- and predicate(n.right_sibling(), m, l)
- )
+ elif operator == '$.' or operator == '%.':
+ retval = lambda n, m=None, l=None: (hasattr(n, 'right_sibling') and
+ bool(n.right_sibling()) and
+ predicate(n.right_sibling(), m, l))
# A $, B A is a sister of and immediately follows B.
- elif operator == "$," or operator == "%,":
- retval = lambda n, m=None, l=None: (
- hasattr(n, "left_sibling")
- and bool(n.left_sibling())
- and predicate(n.left_sibling(), m, l)
- )
+ elif operator == '$,' or operator == '%,':
+ retval = lambda n, m=None, l=None: (hasattr(n, 'left_sibling') and
+ bool(n.left_sibling()) and
+ predicate(n.left_sibling(), m, l))
# A $.. B A is a sister of and precedes B.
- elif operator == "$.." or operator == "%..":
- retval = lambda n, m=None, l=None: (
- hasattr(n, "parent")
- and hasattr(n, "parent_index")
- and bool(n.parent())
- and any(predicate(x, m, l) for x in n.parent()[n.parent_index() + 1 :])
- )
+ elif operator == '$..' or operator == '%..':
+ retval = lambda n, m=None, l=None: (hasattr(n, 'parent') and
+ hasattr(n, 'parent_index') and
+ bool(n.parent()) and
+ any(predicate(x, m, l) for x in
+ n.parent()[n.parent_index() + 1:]))
# A $,, B A is a sister of and follows B.
- elif operator == "$,," or operator == "%,,":
- retval = lambda n, m=None, l=None: (
- hasattr(n, "parent")
- and hasattr(n, "parent_index")
- and bool(n.parent())
- and any(predicate(x, m, l) for x in n.parent()[: n.parent_index()])
- )
+ elif operator == '$,,' or operator == '%,,':
+ retval = lambda n, m=None, l=None: (hasattr(n, 'parent') and
+ hasattr(n, 'parent_index') and
+ bool(n.parent()) and
+ any(predicate(x, m, l) for x in
+ n.parent()[:n.parent_index()]))
else:
raise TgrepException(
- 'cannot interpret tgrep operator "{0}"'.format(operator)
- )
+ 'cannot interpret tgrep operator "{0}"'.format(operator))
# now return the built function
if negated:
return (lambda r: (lambda n, m=None, l=None: not r(n, m, l)))(retval)
else:
return retval
-
-def _tgrep_conjunction_action(_s, _l, tokens, join_char="&"):
- """
+def _tgrep_conjunction_action(_s, _l, tokens, join_char = '&'):
+ '''
Builds a lambda function representing a predicate on a tree node
from the conjunction of several other such lambda functions.
tokens[0] is a tgrep_expr predicate; tokens[1:] are an (optional)
list of segmented patterns (`tgrep_expr_labeled`, processed by
`_tgrep_segmented_pattern_action`).
- """
+ '''
# filter out the ampersand
tokens = [x for x in tokens if x != join_char]
+ # print 'relation conjunction tokens: ', tokens
if len(tokens) == 1:
return tokens[0]
else:
- return (
- lambda ts: lambda n, m=None, l=None: all(
- predicate(n, m, l) for predicate in ts
- )
- )(tokens)
-
+ return (lambda ts: lambda n, m=None, l=None: all(predicate(n, m, l)
+ for predicate in ts))(tokens)
def _tgrep_segmented_pattern_action(_s, _l, tokens):
- """
+ '''
Builds a lambda function representing a segmented pattern.
Called for expressions like (`tgrep_expr_labeled`)::
parse action to the pred use inside a node_expr. See
`_tgrep_node_label_use_action` and
`_tgrep_node_label_pred_use_action`.
- """
+ '''
# tokens[0] is a string containing the node label
node_label = tokens[0]
# tokens[1:] is an (optional) list of predicates which must all
# hold of the bound node
reln_preds = tokens[1:]
-
def pattern_segment_pred(n, m=None, l=None):
- """This predicate function ignores its node argument."""
+ '''This predicate function ignores its node argument.'''
# look up the bound node using its label
if l is None or node_label not in l:
- raise TgrepException(
- "node_label ={0} not bound in pattern".format(node_label)
- )
+ raise TgrepException('node_label ={0} not bound in pattern'.format(
+ node_label))
node = l[node_label]
# match the relation predicates against the node
return all(pred(node, m, l) for pred in reln_preds)
-
return pattern_segment_pred
-
def _tgrep_node_label_use_action(_s, _l, tokens):
- """
+ '''
Returns the node label used to begin a tgrep_expr_labeled. See
`_tgrep_segmented_pattern_action`.
expression (see `_tgrep_segmented_pattern_action`).
It returns the node label.
- """
+ '''
assert len(tokens) == 1
- assert tokens[0].startswith("=")
+ assert tokens[0].startswith('=')
return tokens[0][1:]
-
def _tgrep_node_label_pred_use_action(_s, _l, tokens):
- """
+ '''
Builds a lambda function representing a predicate on a tree node
which describes the use of a previously bound node label.
relation). The predicate returns true if and only if its node
argument is identical to the node looked up in the node label
dictionary using the node's label.
- """
+ '''
assert len(tokens) == 1
- assert tokens[0].startswith("=")
+ assert tokens[0].startswith('=')
node_label = tokens[0][1:]
-
def node_label_use_pred(n, m=None, l=None):
# look up the bound node using its label
if l is None or node_label not in l:
- raise TgrepException(
- "node_label ={0} not bound in pattern".format(node_label)
- )
+ raise TgrepException('node_label ={0} not bound in pattern'.format(
+ node_label))
node = l[node_label]
# truth means the given node is this node
return n is node
-
return node_label_use_pred
-
def _tgrep_bind_node_label_action(_s, _l, tokens):
- """
+ '''
Builds a lambda function representing a predicate on a tree node
which can optionally bind a matching node into the tgrep2 string's
label_dict.
/NP/
@NP=n
- """
+ '''
# tokens[0] is a tgrep_node_expr
if len(tokens) == 1:
return tokens[0]
# if present, tokens[1] is the character '=', and tokens[2] is
# a tgrep_node_label, a string value containing the node label
assert len(tokens) == 3
- assert tokens[1] == "="
+ assert tokens[1] == '='
node_pred = tokens[0]
node_label = tokens[2]
-
def node_label_bind_pred(n, m=None, l=None):
if node_pred(n, m, l):
# bind `n` into the dictionary `l`
if l is None:
raise TgrepException(
- "cannot bind node_label {0}: label_dict is None".format(
- node_label
- )
- )
+ 'cannot bind node_label {0}: label_dict is None'.format(
+ node_label))
l[node_label] = n
return True
else:
return False
-
return node_label_bind_pred
-
def _tgrep_rel_disjunction_action(_s, _l, tokens):
- """
+ '''
Builds a lambda function representing a predicate on a tree node
from the disjunction of several other such lambda functions.
- """
+ '''
# filter out the pipe
- tokens = [x for x in tokens if x != "|"]
+ tokens = [x for x in tokens if x != '|']
+ # print 'relation disjunction tokens: ', tokens
if len(tokens) == 1:
return tokens[0]
elif len(tokens) == 2:
- return (lambda a, b: lambda n, m=None, l=None: a(n, m, l) or b(n, m, l))(
- tokens[0], tokens[1]
- )
-
+ return (lambda a, b: lambda n, m=None, l=None:
+ a(n, m, l) or b(n, m, l))(tokens[0], tokens[1])
def _macro_defn_action(_s, _l, tokens):
- """
+ '''
Builds a dictionary structure which defines the given macro.
- """
+ '''
assert len(tokens) == 3
- assert tokens[0] == "@"
+ assert tokens[0] == '@'
return {tokens[1]: tokens[2]}
-
def _tgrep_exprs_action(_s, _l, tokens):
- """
+ '''
This is the top-level node in a tgrep2 search string; the
predicate function it returns binds together all the state of a
tgrep2 search string.
from the disjunction of several tgrep expressions. Also handles
macro definitions and macro name binding, and node label
definitions and node label binding.
- """
+ '''
if len(tokens) == 1:
return lambda n, m=None, l=None: tokens[0](n, None, {})
# filter out all the semicolons
- tokens = [x for x in tokens if x != ";"]
+ tokens = [x for x in tokens if x != ';']
# collect all macro definitions
macro_dict = {}
macro_defs = [tok for tok in tokens if isinstance(tok, dict)]
label_dict = {}
# bind macro definitions and OR together all tgrep_exprs
return any(predicate(n, m, label_dict) for predicate in tgrep_exprs)
-
return top_level_pred
-
-def _build_tgrep_parser(set_parse_actions=True):
- """
+def _build_tgrep_parser(set_parse_actions = True):
+ '''
Builds a pyparsing-based parser object for tokenizing and
interpreting tgrep search strings.
- """
- tgrep_op = pyparsing.Optional("!") + pyparsing.Regex("[$%,.<>][%,.<>0-9-':]*")
- tgrep_qstring = pyparsing.QuotedString(
- quoteChar='"', escChar="\\", unquoteResults=False
- )
- tgrep_node_regex = pyparsing.QuotedString(
- quoteChar="/", escChar="\\", unquoteResults=False
- )
- tgrep_qstring_icase = pyparsing.Regex('i@\\"(?:[^"\\n\\r\\\\]|(?:\\\\.))*\\"')
- tgrep_node_regex_icase = pyparsing.Regex("i@\\/(?:[^/\\n\\r\\\\]|(?:\\\\.))*\\/")
- tgrep_node_literal = pyparsing.Regex("[^][ \r\t\n;:.,&|<>()$!@%'^=]+")
+ '''
+ tgrep_op = (pyparsing.Optional('!') +
+ pyparsing.Regex('[$%,.<>][%,.<>0-9-\':]*'))
+ tgrep_qstring = pyparsing.QuotedString(quoteChar='"', escChar='\\',
+ unquoteResults=False)
+ tgrep_node_regex = pyparsing.QuotedString(quoteChar='/', escChar='\\',
+ unquoteResults=False)
+ tgrep_qstring_icase = pyparsing.Regex(
+ 'i@\\"(?:[^"\\n\\r\\\\]|(?:\\\\.))*\\"')
+ tgrep_node_regex_icase = pyparsing.Regex(
+ 'i@\\/(?:[^/\\n\\r\\\\]|(?:\\\\.))*\\/')
+ tgrep_node_literal = pyparsing.Regex('[^][ \r\t\n;:.,&|<>()$!@%\'^=]+')
tgrep_expr = pyparsing.Forward()
tgrep_relations = pyparsing.Forward()
- tgrep_parens = pyparsing.Literal("(") + tgrep_expr + ")"
+ tgrep_parens = pyparsing.Literal('(') + tgrep_expr + ')'
tgrep_nltk_tree_pos = (
- pyparsing.Literal("N(")
- + pyparsing.Optional(
- pyparsing.Word(pyparsing.nums)
- + ","
- + pyparsing.Optional(
- pyparsing.delimitedList(pyparsing.Word(pyparsing.nums), delim=",")
- + pyparsing.Optional(",")
- )
- )
- + ")"
- )
- tgrep_node_label = pyparsing.Regex("[A-Za-z0-9]+")
- tgrep_node_label_use = pyparsing.Combine("=" + tgrep_node_label)
+ pyparsing.Literal('N(') +
+ pyparsing.Optional(pyparsing.Word(pyparsing.nums) + ',' +
+ pyparsing.Optional(pyparsing.delimitedList(
+ pyparsing.Word(pyparsing.nums), delim=',') +
+ pyparsing.Optional(','))) + ')')
+ tgrep_node_label = pyparsing.Regex('[A-Za-z0-9]+')
+ tgrep_node_label_use = pyparsing.Combine('=' + tgrep_node_label)
# see _tgrep_segmented_pattern_action
tgrep_node_label_use_pred = tgrep_node_label_use.copy()
- macro_name = pyparsing.Regex("[^];:.,&|<>()[$!@%'^=\r\t\n ]+")
- macro_name.setWhitespaceChars("")
- macro_use = pyparsing.Combine("@" + macro_name)
- tgrep_node_expr = (
- tgrep_node_label_use_pred
- | macro_use
- | tgrep_nltk_tree_pos
- | tgrep_qstring_icase
- | tgrep_node_regex_icase
- | tgrep_qstring
- | tgrep_node_regex
- | "*"
- | tgrep_node_literal
- )
- tgrep_node_expr2 = (
- tgrep_node_expr
- + pyparsing.Literal("=").setWhitespaceChars("")
- + tgrep_node_label.copy().setWhitespaceChars("")
- ) | tgrep_node_expr
- tgrep_node = tgrep_parens | (
- pyparsing.Optional("'")
- + tgrep_node_expr2
- + pyparsing.ZeroOrMore("|" + tgrep_node_expr)
- )
- tgrep_brackets = pyparsing.Optional("!") + "[" + tgrep_relations + "]"
+ macro_name = pyparsing.Regex('[^];:.,&|<>()[$!@%\'^=\r\t\n ]+')
+ macro_name.setWhitespaceChars('')
+ macro_use = pyparsing.Combine('@' + macro_name)
+ tgrep_node_expr = (tgrep_node_label_use_pred |
+ macro_use |
+ tgrep_nltk_tree_pos |
+ tgrep_qstring_icase |
+ tgrep_node_regex_icase |
+ tgrep_qstring |
+ tgrep_node_regex |
+ '*' |
+ tgrep_node_literal)
+ tgrep_node_expr2 = ((tgrep_node_expr +
+ pyparsing.Literal('=').setWhitespaceChars('') +
+ tgrep_node_label.copy().setWhitespaceChars('')) |
+ tgrep_node_expr)
+ tgrep_node = (tgrep_parens |
+ (pyparsing.Optional("'") +
+ tgrep_node_expr2 +
+ pyparsing.ZeroOrMore("|" + tgrep_node_expr)))
+ tgrep_brackets = pyparsing.Optional('!') + '[' + tgrep_relations + ']'
tgrep_relation = tgrep_brackets | (tgrep_op + tgrep_node)
tgrep_rel_conjunction = pyparsing.Forward()
- tgrep_rel_conjunction << (
- tgrep_relation
- + pyparsing.ZeroOrMore(pyparsing.Optional("&") + tgrep_rel_conjunction)
- )
+ tgrep_rel_conjunction << (tgrep_relation +
+ pyparsing.ZeroOrMore(pyparsing.Optional('&') +
+ tgrep_rel_conjunction))
tgrep_relations << tgrep_rel_conjunction + pyparsing.ZeroOrMore(
- "|" + tgrep_relations
- )
+ "|" + tgrep_relations)
tgrep_expr << tgrep_node + pyparsing.Optional(tgrep_relations)
tgrep_expr_labeled = tgrep_node_label_use + pyparsing.Optional(tgrep_relations)
- tgrep_expr2 = tgrep_expr + pyparsing.ZeroOrMore(":" + tgrep_expr_labeled)
- macro_defn = (
- pyparsing.Literal("@") + pyparsing.White().suppress() + macro_name + tgrep_expr2
- )
- tgrep_exprs = (
- pyparsing.Optional(macro_defn + pyparsing.ZeroOrMore(";" + macro_defn) + ";")
- + tgrep_expr2
- + pyparsing.ZeroOrMore(";" + (macro_defn | tgrep_expr2))
- + pyparsing.ZeroOrMore(";").suppress()
- )
+ tgrep_expr2 = tgrep_expr + pyparsing.ZeroOrMore(':' + tgrep_expr_labeled)
+ macro_defn = (pyparsing.Literal('@') +
+ pyparsing.White().suppress() +
+ macro_name +
+ tgrep_expr2)
+ tgrep_exprs = (pyparsing.Optional(macro_defn + pyparsing.ZeroOrMore(';' + macro_defn) + ';') +
+ tgrep_expr2 +
+ pyparsing.ZeroOrMore(';' + (macro_defn | tgrep_expr2)) +
+ pyparsing.ZeroOrMore(';').suppress())
if set_parse_actions:
tgrep_node_label_use.setParseAction(_tgrep_node_label_use_action)
tgrep_node_label_use_pred.setParseAction(_tgrep_node_label_pred_use_action)
# relation predicates
tgrep_expr.setParseAction(_tgrep_conjunction_action)
tgrep_expr_labeled.setParseAction(_tgrep_segmented_pattern_action)
- tgrep_expr2.setParseAction(
- functools.partial(_tgrep_conjunction_action, join_char=":")
- )
+ tgrep_expr2.setParseAction(functools.partial(_tgrep_conjunction_action,
+ join_char = ':'))
tgrep_exprs.setParseAction(_tgrep_exprs_action)
- return tgrep_exprs.ignore("#" + pyparsing.restOfLine)
-
+ return tgrep_exprs.ignore('#' + pyparsing.restOfLine)
def tgrep_tokenize(tgrep_string):
- """
+ '''
Tokenizes a TGrep search string into separate tokens.
- """
+ '''
parser = _build_tgrep_parser(False)
- if isinstance(tgrep_string, bytes):
+ if isinstance(tgrep_string, binary_type):
tgrep_string = tgrep_string.decode()
return list(parser.parseString(tgrep_string))
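# A quick sketch of what the tokenizer produces (illustrative values; the
# parser is built without parse actions, so raw tokens come back):
#
#     >>> tgrep_tokenize('NN < NP')
#     ['NN', '<', 'NP']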
-
def tgrep_compile(tgrep_string):
- """
+ '''
Parses (and tokenizes, if necessary) a TGrep search string into a
lambda function.
- """
+ '''
parser = _build_tgrep_parser(True)
- if isinstance(tgrep_string, bytes):
+ if isinstance(tgrep_string, binary_type):
tgrep_string = tgrep_string.decode()
return list(parser.parseString(tgrep_string, parseAll=True))[0]
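# Sketch: the compiled result is a predicate over nodes of a
# nltk.tree.ParentedTree (a plain Tree lacks the parent pointers some
# relations need). Names and values below are illustrative:
#
#     >>> from nltk.tree import ParentedTree
#     >>> t = ParentedTree.fromstring('(S (NP (DT the) (NN dog)) (VP (VBD barked)))')
#     >>> pred = tgrep_compile('NN')
#     >>> [pos for pos in t.treepositions() if pred(t[pos])]
#     [(0, 1)]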
-
def treepositions_no_leaves(tree):
- """
+ '''
Returns all the tree positions in the given tree which are not
leaf nodes.
- """
+ '''
treepositions = tree.treepositions()
# leaves are treeposition tuples that are not prefixes of any
# other treeposition
prefixes = set()
for pos in treepositions:
    for length in range(len(pos)):
        prefixes.add(pos[:length])
return [pos for pos in treepositions if pos in prefixes]
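# Sketch of the prefix trick: every internal position is a proper prefix of
# some other position, so collecting all proper prefixes keeps exactly the
# non-leaves:
#
#     >>> from nltk.tree import Tree
#     >>> t = Tree.fromstring('(S (NP x) (VP y))')
#     >>> t.treepositions()
#     [(), (0,), (0, 0), (1,), (1, 0)]
#     >>> treepositions_no_leaves(t)
#     [(), (0,), (1,)]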
-
def tgrep_positions(pattern, trees, search_leaves=True):
"""
Return the tree positions in the trees which match the given pattern.
:rtype: iter(tree positions)
"""
- if isinstance(pattern, (bytes, str)):
+ if isinstance(pattern, (binary_type, text_type)):
pattern = tgrep_compile(pattern)
for tree in trees:
try:
    if search_leaves:
        positions = tree.treepositions()
else:
positions = treepositions_no_leaves(tree)
- yield [position for position in positions if pattern(tree[position])]
+ yield [position for position in positions
+ if pattern(tree[position])]
except AttributeError:
yield []
-
def tgrep_nodes(pattern, trees, search_leaves=True):
"""
Return the tree nodes in the trees which match the given pattern.
:rtype: iter(tree nodes)
"""
- if isinstance(pattern, (bytes, str)):
+ if isinstance(pattern, (binary_type, text_type)):
pattern = tgrep_compile(pattern)
for tree in trees:
try:
    if search_leaves:
        positions = tree.treepositions()
else:
positions = treepositions_no_leaves(tree)
- yield [tree[position] for position in positions if pattern(tree[position])]
+ yield [tree[position] for position in positions
+ if pattern(tree[position])]
except AttributeError:
yield []
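# Sketch: both helpers are generators yielding one result list per input
# tree (illustrative values):
#
#     >>> from nltk.tree import ParentedTree
#     >>> trees = [ParentedTree.fromstring('(S (NP (NN dog)) (VP (VBD ran)))')]
#     >>> list(tgrep_positions('NN', trees))
#     [[(0, 0)]]
#     >>> list(tgrep_nodes('NN', trees))
#     [[ParentedTree('NN', ['dog'])]]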
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Tokenizers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com> (minor additions)
# Contributors: matthewmc, clouds56
import re
-from nltk.data import load
-from nltk.tokenize.casual import TweetTokenizer, casual_tokenize
-from nltk.tokenize.mwe import MWETokenizer
-from nltk.tokenize.destructive import NLTKWordTokenizer
-from nltk.tokenize.punkt import PunktSentenceTokenizer
-from nltk.tokenize.regexp import (
- RegexpTokenizer,
- WhitespaceTokenizer,
- BlanklineTokenizer,
- WordPunctTokenizer,
- wordpunct_tokenize,
- regexp_tokenize,
- blankline_tokenize,
-)
-from nltk.tokenize.repp import ReppTokenizer
-from nltk.tokenize.sexpr import SExprTokenizer, sexpr_tokenize
-from nltk.tokenize.simple import (
- SpaceTokenizer,
- TabTokenizer,
- LineTokenizer,
- line_tokenize,
-)
+from nltk.data import load
+from nltk.tokenize.casual import (TweetTokenizer, casual_tokenize)
+from nltk.tokenize.mwe import MWETokenizer
+from nltk.tokenize.punkt import PunktSentenceTokenizer
+from nltk.tokenize.regexp import (RegexpTokenizer, WhitespaceTokenizer,
+ BlanklineTokenizer, WordPunctTokenizer,
+ wordpunct_tokenize, regexp_tokenize,
+ blankline_tokenize)
+from nltk.tokenize.repp import ReppTokenizer
+from nltk.tokenize.sexpr import SExprTokenizer, sexpr_tokenize
+from nltk.tokenize.simple import (SpaceTokenizer, TabTokenizer, LineTokenizer,
+ line_tokenize)
from nltk.tokenize.texttiling import TextTilingTokenizer
-from nltk.tokenize.toktok import ToktokTokenizer
+from nltk.tokenize.toktok import ToktokTokenizer
from nltk.tokenize.treebank import TreebankWordTokenizer
-from nltk.tokenize.util import string_span_tokenize, regexp_span_tokenize
+from nltk.tokenize.util import string_span_tokenize, regexp_span_tokenize
from nltk.tokenize.stanford_segmenter import StanfordSegmenter
-from nltk.tokenize.sonority_sequencing import SyllableTokenizer
-
# Standard sentence tokenizer.
-def sent_tokenize(text, language="english"):
+def sent_tokenize(text, language='english'):
"""
Return a sentence-tokenized copy of *text*,
using NLTK's recommended sentence tokenizer
:param text: text to split into sentences
:param language: the model name in the Punkt corpus
"""
- tokenizer = load("tokenizers/punkt/{0}.pickle".format(language))
+ tokenizer = load('tokenizers/punkt/{0}.pickle'.format(language))
return tokenizer.tokenize(text)
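# Usage sketch (requires the Punkt model, e.g. via nltk.download('punkt')):
#
#     >>> sent_tokenize("Good muffins cost $3.88. Please buy me two of them.")
#     ['Good muffins cost $3.88.', 'Please buy me two of them.']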
-
# Standard word tokenizer.
-_treebank_word_tokenizer = NLTKWordTokenizer()
+_treebank_word_tokenizer = TreebankWordTokenizer()
+
+# See discussion on https://github.com/nltk/nltk/pull/1437
+# Adding to TreebankWordTokenizer, the splits on
+# - chevron quotes u'\xab' and u'\xbb'.
+# - unicode quotes u'\u2018', u'\u2019', u'\u201c' and u'\u201d'
+
+improved_open_quote_regex = re.compile(u'([«“‘])', re.U)
+improved_close_quote_regex = re.compile(u'([»”’])', re.U)
+improved_punct_regex = re.compile(r'([^\.])(\.)([\]\)}>"\'' u'»”’ ' r']*)\s*$', re.U)
+_treebank_word_tokenizer.STARTING_QUOTES.insert(0, (improved_open_quote_regex, r' \1 '))
+_treebank_word_tokenizer.ENDING_QUOTES.insert(0, (improved_close_quote_regex, r' \1 '))
+_treebank_word_tokenizer.PUNCTUATION.insert(0, (improved_punct_regex, r'\1 \2 \3 '))
-def word_tokenize(text, language="english", preserve_line=False):
+def word_tokenize(text, language='english', preserve_line=False):
"""
Return a tokenized copy of *text*,
using NLTK's recommended word tokenizer
:param language: the model name in the Punkt corpus
:type language: str
:param preserve_line: An option to keep the sentence intact and not sentence-tokenize it.
- :type preserve_line: bool
+ :type preserve_line: bool
"""
sentences = [text] if preserve_line else sent_tokenize(text, language)
- return [
- token for sent in sentences for token in _treebank_word_tokenizer.tokenize(sent)
- ]
+ return [token for sent in sentences
+ for token in _treebank_word_tokenizer.tokenize(sent)]
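# Usage sketch; output follows the Treebank conventions (contractions
# split, punctuation separated):
#
#     >>> word_tokenize("They'll save and invest more.")
#     ['They', "'ll", 'save', 'and', 'invest', 'more', '.']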
# Natural Language Toolkit: Tokenizer Interface
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
Tokenizer Interface
"""
-from abc import ABC, abstractmethod
+from abc import ABCMeta, abstractmethod
+from six import add_metaclass
from nltk.internals import overridden
from nltk.tokenize.util import string_span_tokenize
-class TokenizerI(ABC):
+@add_metaclass(ABCMeta)
+class TokenizerI(object):
"""
A processing interface for tokenizing a string.
Subclasses must define ``tokenize()`` or ``tokenize_sents()`` (or both).
"""
-
@abstractmethod
def tokenize(self, s):
"""
on the specified string (defined in subclasses).
"""
- @property
- @abstractmethod
- def _string(self):
- raise NotImplementedError
-
def tokenize(self, s):
return s.split(self._string)
#
# Natural Language Toolkit: Twitter Tokenizer
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Christopher Potts <cgpotts@stanford.edu>
# Ewan Klein <ewan@inf.ed.ac.uk> (modifications)
# Pierpaolo Pantone <> (modifications)
"""
+
######################################################################
-import regex # https://github.com/nltk/nltk/issues/2409
-import html
+from __future__ import unicode_literals
+import re
+
+from six import int2byte, unichr
+from six.moves import html_entities
######################################################################
# The following strings are components in the regular expression
(?:
(?: # (international)
\+?[01]
- [ *\-.\)]*
+ [\-\s.]*
)?
(?: # (area code)
[\(]?
\d{3}
- [ *\-.\)]*
+ [\-\s.\)]*
)?
\d{3} # exchange
- [ *\-.\)]*
+ [\-\s.]*
\d{4} # base
- )""",
+ )"""
+ ,
# ASCII Emoticons
- EMOTICONS,
+ EMOTICONS
+ ,
# HTML tags:
- r"""<[^>\s]+>""",
+ r"""<[^>\s]+>"""
+ ,
# ASCII Arrows
- r"""[\-]+>|<[\-]+""",
+ r"""[\-]+>|<[\-]+"""
+ ,
# Twitter username:
- r"""(?:@[\w_]+)""",
+ r"""(?:@[\w_]+)"""
+ ,
# Twitter hashtags:
- r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""",
+ r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)"""
+ ,
# email addresses
- r"""[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]""",
+ r"""[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]"""
+ ,
# Remaining word types:
r"""
(?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_]) # Words with apostrophes or dashes.
(?:\.(?:\s*\.){1,}) # Ellipsis dots.
|
(?:\S) # Everything else that isn't whitespace.
- """,
-)
+ """
+ )
######################################################################
# This is the core tokenizing regex:
-WORD_RE = regex.compile(r"""(%s)""" % "|".join(REGEXPS), regex.VERBOSE | regex.I | regex.UNICODE)
+WORD_RE = re.compile(r"""(%s)""" % "|".join(REGEXPS), re.VERBOSE | re.I
+ | re.UNICODE)
# WORD_RE performs poorly on these patterns:
-HANG_RE = regex.compile(r"([^a-zA-Z0-9])\1{3,}")
+HANG_RE = re.compile(r'([^a-zA-Z0-9])\1{3,}')
# The emoticon string gets its own regex so that we can preserve case for
# them as needed:
-EMOTICON_RE = regex.compile(EMOTICONS, regex.VERBOSE | regex.I | regex.UNICODE)
+EMOTICON_RE = re.compile(EMOTICONS, re.VERBOSE | re.I | re.UNICODE)
# These are for regularizing HTML entities to Unicode:
-ENT_RE = regex.compile(r"&(#?(x?))([^&;\s]+);")
+ENT_RE = re.compile(r'&(#?(x?))([^&;\s]+);')
######################################################################
# Functions for converting html entities
######################################################################
-
-def _str_to_unicode(text, encoding=None, errors="strict"):
+def _str_to_unicode(text, encoding=None, errors='strict'):
if encoding is None:
- encoding = "utf-8"
+ encoding = 'utf-8'
if isinstance(text, bytes):
return text.decode(encoding, errors)
return text
-
-def _replace_html_entities(text, keep=(), remove_illegal=True, encoding="utf-8"):
+def _replace_html_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):
"""
Remove entities from text by converting them to their
corresponding unicode character.
# Numeric character references in the 80-9F range are typically
# interpreted by browsers as representing the characters mapped
# to bytes 80-9F in the Windows-1252 encoding. For more info
- # see: https://en.wikipedia.org/wiki/ISO/IEC_8859-1#Similar_character_sets
- if 0x80 <= number <= 0x9F:
- return bytes((number,)).decode("cp1252")
+ # see: http://en.wikipedia.org/wiki/Character_encodings_in_HTML
+ if 0x80 <= number <= 0x9f:
+ return int2byte(number).decode('cp1252')
except ValueError:
number = None
else:
if entity_body in keep:
return match.group(0)
else:
- number = html.entities.name2codepoint.get(entity_body)
+ number = html_entities.name2codepoint.get(entity_body)
if number is not None:
try:
- return chr(number)
+ return unichr(number)
except ValueError:
pass
######################################################################
-
class TweetTokenizer:
r"""
Tokenizer for tweets.
if self.reduce_len:
text = reduce_lengthening(text)
# Shorten problematic sequences of characters
- safe_text = HANG_RE.sub(r"\1\1\1", text)
+ safe_text = HANG_RE.sub(r'\1\1\1', text)
# Tokenize:
words = WORD_RE.findall(safe_text)
# Possibly alter the case, but avoid changing emoticons like :D into :d:
if not self.preserve_case:
- words = list(
- map((lambda x: x if EMOTICON_RE.search(x) else x.lower()), words)
- )
+ words = list(map((lambda x : x if EMOTICON_RE.search(x) else
+ x.lower()), words))
return words
-
######################################################################
# Normalization Functions
######################################################################
-
def reduce_lengthening(text):
"""
Replace repeated character sequences of length 3 or greater with sequences
of length 3.
"""
- pattern = regex.compile(r"(.)\1{2,}")
+ pattern = re.compile(r"(.)\1{2,}")
return pattern.sub(r"\1\1\1", text)
-
def remove_handles(text):
"""
Remove Twitter username handles from text.
"""
- pattern = regex.compile(
- r"(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){20}(?!@))|(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){1,19})(?![A-Za-z0-9_]*@)"
- )
- # Substitute handles with ' ' to ensure that text on either side of removed handles are tokenized correctly
- return pattern.sub(" ", text)
-
+ pattern = re.compile(r"(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){20}(?!@))|(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){1,19})(?![A-Za-z0-9_]*@)")
+ # Substitute handles with ' ' to ensure that text on either side of removed handles is tokenized correctly
+ return pattern.sub(' ', text)
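# Sketch: the handle is replaced by a space rather than deleted, so the
# surrounding tokens stay separated (illustrative value):
#
#     >>> remove_handles('@remy: This is waaaaayyyy too much')
#     ' : This is waaaaayyyy too much'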
######################################################################
# Tokenization Function
######################################################################
-
def casual_tokenize(text, preserve_case=True, reduce_len=False, strip_handles=False):
"""
Convenience function for wrapping the tokenizer.
"""
- return TweetTokenizer(
- preserve_case=preserve_case, reduce_len=reduce_len, strip_handles=strip_handles
- ).tokenize(text)
-
+ return TweetTokenizer(preserve_case=preserve_case, reduce_len=reduce_len,
+ strip_handles=strip_handles).tokenize(text)
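# Usage sketch (mirrors the TweetTokenizer class doctest):
#
#     >>> tknzr = TweetTokenizer()
#     >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
#     >>> tknzr.tokenize(s0)
#     ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--']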
###############################################################################
+++ /dev/null
-# Natural Language Toolkit: NLTK's very own tokenizer.
-#
-# Copyright (C) 2001-2020 NLTK Project
-# Author:
-# URL: <http://nltk.sourceforge.net>
-# For license information, see LICENSE.TXT
-
-
-import re
-from nltk.tokenize.api import TokenizerI
-
-
-class MacIntyreContractions:
- """
- List of contractions adapted from Robert MacIntyre's tokenizer.
- """
-
- CONTRACTIONS2 = [
- r"(?i)\b(can)(?#X)(not)\b",
- r"(?i)\b(d)(?#X)('ye)\b",
- r"(?i)\b(gim)(?#X)(me)\b",
- r"(?i)\b(gon)(?#X)(na)\b",
- r"(?i)\b(got)(?#X)(ta)\b",
- r"(?i)\b(lem)(?#X)(me)\b",
- r"(?i)\b(mor)(?#X)('n)\b",
- r"(?i)\b(wan)(?#X)(na)\s",
- ]
- CONTRACTIONS3 = [r"(?i) ('t)(?#X)(is)\b", r"(?i) ('t)(?#X)(was)\b"]
- CONTRACTIONS4 = [r"(?i)\b(whad)(dd)(ya)\b", r"(?i)\b(wha)(t)(cha)\b"]
-
-
-class NLTKWordTokenizer(TokenizerI):
- """
- The NLTK tokenizer that has improved upon the TreebankWordTokenizer.
-
- The tokenizer is "destructive" such that the regexes applied will munge the
- input string to a state beyond re-construction. It is possible to apply
- `TreebankWordDetokenizer.detokenize` to the tokenized outputs of
- `NLTKDestructiveWordTokenizer.tokenize` but there's no guarantees to
- revert to the original string.
- """
-
- # Starting quotes.
- STARTING_QUOTES = [
- (re.compile(u"([«“‘„]|[`]+)", re.U), r" \1 "),
- (re.compile(r"^\""), r"``"),
- (re.compile(r"(``)"), r" \1 "),
- (re.compile(r"([ \(\[{<])(\"|\'{2})"), r"\1 `` "),
- (re.compile(r"(?i)(\')(?!re|ve|ll|m|t|s|d)(\w)\b", re.U), r"\1 \2"),
- ]
-
- # Ending quotes.
- ENDING_QUOTES = [
- (re.compile(u"([»”’])", re.U), r" \1 "),
- (re.compile(r'"'), " '' "),
- (re.compile(r"(\S)(\'\')"), r"\1 \2 "),
- (re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "),
- (re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "),
- ]
-
- # For improvements for starting/closing quotes from TreebankWordTokenizer,
- # see discussion on https://github.com/nltk/nltk/pull/1437
- # Adding to TreebankWordTokenizer, nltk.word_tokenize now splits on
- # - chervon quotes u'\xab' and u'\xbb' .
- # - unicode quotes u'\u2018', u'\u2019', u'\u201c' and u'\u201d'
- # See https://github.com/nltk/nltk/issues/1995#issuecomment-376741608
- # Also, behavior of splitting on clitics now follows Stanford CoreNLP
- # - clitics covered (?!re|ve|ll|m|t|s|d)(\w)\b
-
- # Punctuation.
- PUNCTUATION = [
- (re.compile(r'([^\.])(\.)([\]\)}>"\'' u"»”’ " r"]*)\s*$", re.U), r"\1 \2 \3 "),
- (re.compile(r"([:,])([^\d])"), r" \1 \2"),
- (re.compile(r"([:,])$"), r" \1 "),
- (re.compile(r"\.{2,}", re.U), r" \g<0> "), # See https://github.com/nltk/nltk/pull/2322
- (re.compile(r"[;@#$%&]"), r" \g<0> "),
- (
- re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'),
- r"\1 \2\3 ",
- ), # Handles the final period.
- (re.compile(r"[?!]"), r" \g<0> "),
- (re.compile(r"([^'])' "), r"\1 ' "),
- (re.compile(r"[*]", re.U), r" \g<0> "), # See https://github.com/nltk/nltk/pull/2322
- ]
-
- # Pads parentheses
- PARENS_BRACKETS = (re.compile(r"[\]\[\(\)\{\}\<\>]"), r" \g<0> ")
-
- # Optionally: Convert parentheses, brackets and converts them to PTB symbols.
- CONVERT_PARENTHESES = [
- (re.compile(r"\("), "-LRB-"),
- (re.compile(r"\)"), "-RRB-"),
- (re.compile(r"\["), "-LSB-"),
- (re.compile(r"\]"), "-RSB-"),
- (re.compile(r"\{"), "-LCB-"),
- (re.compile(r"\}"), "-RCB-"),
- ]
-
- DOUBLE_DASHES = (re.compile(r"--"), r" -- ")
-
- # List of contractions adapted from Robert MacIntyre's tokenizer.
- _contractions = MacIntyreContractions()
- CONTRACTIONS2 = list(map(re.compile, _contractions.CONTRACTIONS2))
- CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3))
-
- def tokenize(self, text, convert_parentheses=False, return_str=False):
- for regexp, substitution in self.STARTING_QUOTES:
- text = regexp.sub(substitution, text)
-
- for regexp, substitution in self.PUNCTUATION:
- text = regexp.sub(substitution, text)
-
- # Handles parentheses.
- regexp, substitution = self.PARENS_BRACKETS
- text = regexp.sub(substitution, text)
- # Optionally convert parentheses
- if convert_parentheses:
- for regexp, substitution in self.CONVERT_PARENTHESES:
- text = regexp.sub(substitution, text)
-
- # Handles double dash.
- regexp, substitution = self.DOUBLE_DASHES
- text = regexp.sub(substitution, text)
-
- # add extra space to make things easier
- text = " " + text + " "
-
- for regexp, substitution in self.ENDING_QUOTES:
- text = regexp.sub(substitution, text)
-
- for regexp in self.CONTRACTIONS2:
- text = regexp.sub(r" \1 \2 ", text)
- for regexp in self.CONTRACTIONS3:
- text = regexp.sub(r" \1 \2 ", text)
-
- # We are not using CONTRACTIONS4 since
- # they are also commented out in the SED scripts
- # for regexp in self._contractions.CONTRACTIONS4:
- # text = regexp.sub(r' \1 \2 \3 ', text)
-
- return text if return_str else text.split()
--- /dev/null
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Moses Tokenizer and Detokenizer
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Pidong Wang, Josh Schroeder, Ondrej Bojar, based on code by Philipp Koehn
+# Contributors: Liling Tan, Martijn Pieters, Wiktor Stribizew
+#
+# URL: <http://nltk.sourceforge.net>
+# For license information, see LICENSE.TXT
+
+from __future__ import print_function
+import re
+from six import text_type
+
+from nltk.tokenize.api import TokenizerI
+from nltk.tokenize.util import is_cjk
+from nltk.corpus import perluniprops, nonbreaking_prefixes
+
+
+class MosesTokenizer(TokenizerI):
+ """
+ This is a Python port of the Moses Tokenizer from
+ https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
+
+ >>> tokenizer = MosesTokenizer()
+ >>> text = u'This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf'
+ >>> expected_tokenized = u'This , is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf'
+ >>> tokenized_text = tokenizer.tokenize(text, return_str=True)
+ >>> tokenized_text == expected_tokenized
+ True
+ >>> tokenizer.tokenize(text) == [u'This', u',', u'is', u'a', u'sentence', u'with', u'weird', u'\xbb', u'symbols', u'\u2026', u'appearing', u'everywhere', u'\xbf']
+ True
+
+ The nonbreaking prefixes should tokenize the final fullstop.
+
+ >>> m = MosesTokenizer()
+ >>> m.tokenize('abc def.')
+ [u'abc', u'def', u'.']
+
+ The nonbreaking prefixes should deal with the situation when a numeric-only prefix is the last token.
+ In below example, "pp" is the last element, and there is no digit after it.
+
+ >>> m = MosesTokenizer()
+ >>> m.tokenize('2016, pp.')
+ [u'2016', u',', u'pp', u'.']
+
+ >>> sent = "This ain't funny. It's actually hillarious, yet double Ls. | [] < > [ ] & You're gonna shake it off? Don't?"
+ >>> m.tokenize(sent, escape=True)
+ ['This', 'ain', ''t', 'funny', '.', 'It', ''s', 'actually', 'hillarious', ',', 'yet', 'double', 'Ls', '.', '|', '[', ']', '<', '>', '[', ']', '&', 'You', ''re', 'gonna', 'shake', 'it', 'off', '?', 'Don', ''t', '?']
+ >>> m.tokenize(sent, escape=False)
+ ['This', 'ain', "'t", 'funny', '.', 'It', "'s", 'actually', 'hillarious', ',', 'yet', 'double', 'Ls', '.', '|', '[', ']', '<', '>', '[', ']', '&', 'You', "'re", 'gonna', 'shake', 'it', 'off', '?', 'Don', "'t", '?']
+ """
+
+ # Perl Unicode Properties character sets.
+ IsN = text_type(''.join(perluniprops.chars('IsN')))
+ IsAlnum = text_type(''.join(perluniprops.chars('IsAlnum')))
+ IsSc = text_type(''.join(perluniprops.chars('IsSc')))
+ IsSo = text_type(''.join(perluniprops.chars('IsSo')))
+ IsAlpha = text_type(''.join(perluniprops.chars('IsAlpha')))
+ IsLower = text_type(''.join(perluniprops.chars('IsLower')))
+
+ # Remove ASCII junk.
+ DEDUPLICATE_SPACE = r'\s+', r' '
+ ASCII_JUNK = r'[\000-\037]', r''
+
+ # Neurotic Perl heading space, multi-space and trailing space chomp.
+ # These regexes are kept for reference purposes and shouldn't be used!!
+ MID_STRIP = r" +", r" " # Use DEDUPLICATE_SPACE instead.
+ LEFT_STRIP = r"^ ", r"" # Uses text.lstrip() instead.
+ RIGHT_STRIP = r" $", r"" # Uses text.rstrip() instead.
+
+ # Pad all "other" special characters not in IsAlnum.
+ PAD_NOT_ISALNUM = u'([^{}\s\.\'\`\,\-])'.format(IsAlnum), r' \1 '
+
+ # Splits all hyphens (regardless of circumstances), e.g.
+ # 'foo -- bar' -> 'foo @-@ @-@ bar' , 'foo-bar' -> 'foo @-@ bar'
+ AGGRESSIVE_HYPHEN_SPLIT = u'([{alphanum}])\-(?=[{alphanum}])'.format(alphanum=IsAlnum), r'\1 \@-\@ '
+
+ # Make multi-dots stay together.
+ REPLACE_DOT_WITH_LITERALSTRING_1 = r'\.([\.]+)', r' DOTMULTI\1'
+ REPLACE_DOT_WITH_LITERALSTRING_2 = r'DOTMULTI\.([^\.])', r'DOTDOTMULTI \1'
+ REPLACE_DOT_WITH_LITERALSTRING_3 = r'DOTMULTI\.', r'DOTDOTMULTI'
+
+ # Separate out "," except if within numbers (5,300)
+ # e.g. A,B,C,D,E > A , B,C , D,E
+ # First application uses up B so rule can't see B,C
+ # two-step version here may create extra spaces but these are removed later
+ # will also space digit,letter or letter,digit forms (redundant with next section)
+ COMMA_SEPARATE_1 = u'([^{}])[,]'.format(IsN), r'\1 , '
+ COMMA_SEPARATE_2 = u'[,]([^{}])'.format(IsN), r' , \1'
+
+ # Attempt to get correct directional quotes.
+ DIRECTIONAL_QUOTE_1 = r'^``', r'`` '
+ DIRECTIONAL_QUOTE_2 = r'^"', r'`` '
+ DIRECTIONAL_QUOTE_3 = r'^`([^`])', r'` \1'
+ DIRECTIONAL_QUOTE_4 = r"^'", r'` '
+ DIRECTIONAL_QUOTE_5 = r'([ ([{<])"', r'\1 `` '
+ DIRECTIONAL_QUOTE_6 = r'([ ([{<])``', r'\1 `` '
+ DIRECTIONAL_QUOTE_7 = r'([ ([{<])`([^`])', r'\1 ` \2'
+ DIRECTIONAL_QUOTE_8 = r"([ ([{<])'", r'\1 ` '
+
+ # Replace ... with _ELLIPSIS_
+ REPLACE_ELLIPSIS = r'\.\.\.', r' _ELLIPSIS_ '
+ # Restore _ELLIPSIS_ with ...
+ RESTORE_ELLIPSIS = r'_ELLIPSIS_', r'\.\.\.'
+
+ # Pad , with tailing space except if within numbers, e.g. 5,300
+ # These are used in nltk.tokenize.moses.penn_tokenize()
+ COMMA_1 = u'([^{numbers}])[,]([^{numbers}])'.format(numbers=IsN), r'\1 , \2'
+ COMMA_2 = u'([{numbers}])[,]([^{numbers}])'.format(numbers=IsN), r'\1 , \2'
+ COMMA_3 = u'([^{numbers}])[,]([{numbers}])'.format(numbers=IsN), r'\1 , \2'
+
+ # Pad unicode symbols with spaces.
+ SYMBOLS = u'([;:@#\$%&{}{}])'.format(IsSc, IsSo), r' \1 '
+
+ # Separate out intra-token slashes. PTB tokenization doesn't do this, so
+ # the tokens should be merged prior to parsing with a PTB-trained parser.
+ # e.g. "and/or" -> "and @/@ or"
+ INTRATOKEN_SLASHES = u'([{alphanum}])\/([{alphanum}])'.format(alphanum=IsAlnum), r'\1 \@\/\@ \2'
+
+ # Splits final period at end of string.
+ FINAL_PERIOD = r"""([^.])([.])([\]\)}>"']*) ?$""", r'\1 \2\3'
+ # Pad all question marks and exclamation marks with spaces.
+ PAD_QUESTION_EXCLAMATION_MARK = r'([?!])', r' \1 '
+
+ # Handles parentheses, brackets and converts them to PTB symbols.
+ PAD_PARENTHESIS = r'([\]\[\(\){}<>])', r' \1 '
+ CONVERT_PARENTHESIS_1 = r'\(', '-LRB-'
+ CONVERT_PARENTHESIS_2 = r'\)', '-RRB-'
+ CONVERT_PARENTHESIS_3 = r'\[', '-LSB-'
+ CONVERT_PARENTHESIS_4 = r'\]', '-RSB-'
+ CONVERT_PARENTHESIS_5 = r'\{', '-LCB-'
+ CONVERT_PARENTHESIS_6 = r'\}', '-RCB-'
+
+ # Pads double dashes with spaces.
+ PAD_DOUBLE_DASHES = r'--', ' -- '
+
+ # Adds spaces to start and end of string to simplify further regexps.
+ PAD_START_OF_STR = r'^', ' '
+ PAD_END_OF_STR = r'$', ' '
+
+ # Converts double quotes to two single quotes and pad with spaces.
+ CONVERT_DOUBLE_TO_SINGLE_QUOTES = r'"', " '' "
+ # Handles single quote in possessives or close-single-quote.
+ HANDLES_SINGLE_QUOTES = r"([^'])' ", r"\1 ' "
+
+ # Pad apostrophe in possessive or close-single-quote.
+ APOSTROPHE = r"([^'])'", r"\1 ' "
+
+ # Prepend space on contraction apostrophe.
+ CONTRACTION_1 = r"'([sSmMdD]) ", r" '\1 "
+ CONTRACTION_2 = r"'ll ", r" 'll "
+ CONTRACTION_3 = r"'re ", r" 're "
+ CONTRACTION_4 = r"'ve ", r" 've "
+ CONTRACTION_5 = r"n't ", r" n't "
+ CONTRACTION_6 = r"'LL ", r" 'LL "
+ CONTRACTION_7 = r"'RE ", r" 'RE "
+ CONTRACTION_8 = r"'VE ", r" 'VE "
+ CONTRACTION_9 = r"N'T ", r" N'T "
+
+ # Informal Contractions.
+ CONTRACTION_10 = r" ([Cc])annot ", r" \1an not "
+ CONTRACTION_11 = r" ([Dd])'ye ", r" \1' ye "
+ CONTRACTION_12 = r" ([Gg])imme ", r" \1im me "
+ CONTRACTION_13 = r" ([Gg])onna ", r" \1on na "
+ CONTRACTION_14 = r" ([Gg])otta ", r" \1ot ta "
+ CONTRACTION_15 = r" ([Ll])emme ", r" \1em me "
+ CONTRACTION_16 = r" ([Mm])ore$text =~ s='n ", r" \1ore 'n "
+ CONTRACTION_17 = r" '([Tt])is ", r" '\1 is "
+ CONTRACTION_18 = r" '([Tt])was ", r" '\1 was "
+ CONTRACTION_19 = r" ([Ww])anna ", r" \1an na "
+
+ # Clean out extra spaces
+ CLEAN_EXTRA_SPACE_1 = r' *', r' '
+ CLEAN_EXTRA_SPACE_2 = r'^ *', r''
+ CLEAN_EXTRA_SPACE_3 = r' *$', r''
+
+ # Neurotic Perl regexes to escape special characters.
+ # These XML escaping regexes are kept so that tokens generated by
+ # NLTK's implementation are consistent with the Moses tokenizer's output.
+ # Outside of MosesTokenizer, it's strongly encouraged to use the
+ # nltk.tokenize.util.xml_escape() function instead.
+ ESCAPE_AMPERSAND = r'&', r'&'
+ ESCAPE_PIPE = r'\|', r'|'
+ ESCAPE_LEFT_ANGLE_BRACKET = r'<', r'<'
+ ESCAPE_RIGHT_ANGLE_BRACKET = r'>', r'>'
+ ESCAPE_SINGLE_QUOTE = r"\'", r"'"
+ ESCAPE_DOUBLE_QUOTE = r'\"', r'"'
+ ESCAPE_LEFT_SQUARE_BRACKET = r"\[", r"["
+ ESCAPE_RIGHT_SQUARE_BRACKET = r"]", r"]"
+
+ EN_SPECIFIC_1 = u"([^{alpha}])[']([^{alpha}])".format(alpha=IsAlpha), r"\1 ' \2"
+ EN_SPECIFIC_2 = u"([^{alpha}{isn}])[']([{alpha}])".format(alpha=IsAlpha, isn=IsN), r"\1 ' \2"
+ EN_SPECIFIC_3 = u"([{alpha}])[']([^{alpha}])".format(alpha=IsAlpha), r"\1 ' \2"
+ EN_SPECIFIC_4 = u"([{alpha}])[']([{alpha}])".format(alpha=IsAlpha), r"\1 '\2"
+ EN_SPECIFIC_5 = u"([{isn}])[']([s])".format(isn=IsN), r"\1 '\2"
+
+ ENGLISH_SPECIFIC_APOSTROPHE = [EN_SPECIFIC_1, EN_SPECIFIC_2, EN_SPECIFIC_3,
+ EN_SPECIFIC_4, EN_SPECIFIC_5]
+
+ FR_IT_SPECIFIC_1 = u"([^{alpha}])[']([^{alpha}])".format(alpha=IsAlpha), r"\1 ' \2"
+ FR_IT_SPECIFIC_2 = u"([^{alpha}])[']([{alpha}])".format(alpha=IsAlpha), r"\1 ' \2"
+ FR_IT_SPECIFIC_3 = u"([{alpha}])[']([^{alpha}])".format(alpha=IsAlpha), r"\1 ' \2"
+ FR_IT_SPECIFIC_4 = u"([{alpha}])[']([{alpha}])".format(alpha=IsAlpha), r"\1' \2"
+
+ FR_IT_SPECIFIC_APOSTROPHE = [FR_IT_SPECIFIC_1, FR_IT_SPECIFIC_2,
+ FR_IT_SPECIFIC_3, FR_IT_SPECIFIC_4]
+
+ NON_SPECIFIC_APOSTROPHE = r"\'", r" \' "
+
+ MOSES_PENN_REGEXES_1 = [DEDUPLICATE_SPACE, ASCII_JUNK, DIRECTIONAL_QUOTE_1,
+ DIRECTIONAL_QUOTE_2, DIRECTIONAL_QUOTE_3,
+ DIRECTIONAL_QUOTE_4, DIRECTIONAL_QUOTE_5,
+ DIRECTIONAL_QUOTE_6, DIRECTIONAL_QUOTE_7,
+ DIRECTIONAL_QUOTE_8, REPLACE_ELLIPSIS, COMMA_1,
+ COMMA_2, COMMA_3, SYMBOLS, INTRATOKEN_SLASHES,
+ FINAL_PERIOD, PAD_QUESTION_EXCLAMATION_MARK,
+ PAD_PARENTHESIS, CONVERT_PARENTHESIS_1,
+ CONVERT_PARENTHESIS_2, CONVERT_PARENTHESIS_3,
+ CONVERT_PARENTHESIS_4, CONVERT_PARENTHESIS_5,
+ CONVERT_PARENTHESIS_6, PAD_DOUBLE_DASHES,
+ PAD_START_OF_STR, PAD_END_OF_STR,
+ CONVERT_DOUBLE_TO_SINGLE_QUOTES,
+ HANDLES_SINGLE_QUOTES, APOSTROPHE, CONTRACTION_1,
+ CONTRACTION_2, CONTRACTION_3, CONTRACTION_4,
+ CONTRACTION_5, CONTRACTION_6, CONTRACTION_7,
+ CONTRACTION_8, CONTRACTION_9, CONTRACTION_10,
+ CONTRACTION_11, CONTRACTION_12, CONTRACTION_13,
+ CONTRACTION_14, CONTRACTION_15, CONTRACTION_16,
+ CONTRACTION_17, CONTRACTION_18, CONTRACTION_19]
+
+ MOSES_PENN_REGEXES_2 = [RESTORE_ELLIPSIS, CLEAN_EXTRA_SPACE_1,
+ CLEAN_EXTRA_SPACE_2, CLEAN_EXTRA_SPACE_3,
+ ESCAPE_AMPERSAND, ESCAPE_PIPE,
+ ESCAPE_LEFT_ANGLE_BRACKET, ESCAPE_RIGHT_ANGLE_BRACKET,
+ ESCAPE_SINGLE_QUOTE, ESCAPE_DOUBLE_QUOTE]
+
+ MOSES_ESCAPE_XML_REGEXES = [ESCAPE_AMPERSAND, ESCAPE_PIPE,
+ ESCAPE_LEFT_ANGLE_BRACKET,
+ ESCAPE_RIGHT_ANGLE_BRACKET,
+ ESCAPE_SINGLE_QUOTE, ESCAPE_DOUBLE_QUOTE,
+ ESCAPE_LEFT_SQUARE_BRACKET,
+ ESCAPE_RIGHT_SQUARE_BRACKET]
+
+ def __init__(self, lang='en'):
+ # Initialize the object.
+ super(MosesTokenizer, self).__init__()
+ self.lang = lang
+ # Initialize the language specific nonbreaking prefixes.
+ self.NONBREAKING_PREFIXES = [_nbp.strip() for _nbp in nonbreaking_prefixes.words(lang)]
+ self.NUMERIC_ONLY_PREFIXES = [w.rpartition(' ')[0] for w in
+ self.NONBREAKING_PREFIXES if
+ self.has_numeric_only(w)]
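# Sketch of the data convention assumed here: each nonbreaking-prefix entry
# is either a bare prefix ("Mr") or one tagged "#NUMERIC_ONLY#"
# ("No #NUMERIC_ONLY#"), meaning it only blocks a split when a digit
# follows; rpartition(' ')[0] strips the tag:
#
#     >>> 'No #NUMERIC_ONLY#'.rpartition(' ')[0]
#     'No'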
+
+
+
+ def replace_multidots(self, text):
+ text = re.sub(r'\.([\.]+)', r' DOTMULTI\1', text)
+ while re.search(r'DOTMULTI\.', text):
+ text = re.sub(r'DOTMULTI\.([^\.])', r'DOTDOTMULTI \1', text)
+ text = re.sub(r'DOTMULTI\.', 'DOTDOTMULTI', text)
+ return text
+
+ def restore_multidots(self, text):
+ while re.search(r'DOTDOTMULTI', text):
+ text = re.sub(r'DOTDOTMULTI', r'DOTMULTI.', text)
+ return re.sub(r'DOTMULTI', r'.', text)
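# Sketch of the round trip: runs of dots are protected as DOT...MULTI
# literals during tokenization and restored afterwards as a single token.
#
#     >>> m = MosesTokenizer()
#     >>> m.restore_multidots(m.replace_multidots(u'Wait... what'))
#     u'Wait ...  what'
#
# (The stray double space is collapsed later by DEDUPLICATE_SPACE inside
# tokenize().)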
+
+ def islower(self, text):
+ return not set(text).difference(set(self.IsLower))
+
+ def isalpha(self, text):
+ return not set(text).difference(set(self.IsAlpha))
+
+ def has_numeric_only(self, text):
+ return bool(re.search(r'(.*)[\s]+(\#NUMERIC_ONLY\#)', text))
+
+ def handles_nonbreaking_prefixes(self, text):
+ # Splits the text into tokens to check for nonbreaking prefixes.
+ tokens = text.split()
+ num_tokens = len(tokens)
+ for i, token in enumerate(tokens):
+ # Checks if token ends with a fullstop.
+ token_ends_with_period = re.search(r'^(\S+)\.$', token)
+ if token_ends_with_period:
+ prefix = token_ends_with_period.group(1)
+ # Checks for 3 conditions if
+ # i. the prefix contains a fullstop and
+ # any char in the prefix is within the IsAlpha charset
+ # ii. the prefix is in the list of nonbreaking prefixes and
+ # does not contain #NUMERIC_ONLY#
+ # iii. the token is not the last token and that the
+ # next token contains all lowercase.
+ if ( ('.' in prefix and self.isalpha(prefix)) or
+ (prefix in self.NONBREAKING_PREFIXES and
+ prefix not in self.NUMERIC_ONLY_PREFIXES) or
+ (i != num_tokens-1 and self.islower(tokens[i+1])) ):
+ pass # No change to the token.
+ # Checks if the prefix is in NUMERIC_ONLY_PREFIXES
+ # and ensures that the next word is a digit.
+ elif (prefix in self.NUMERIC_ONLY_PREFIXES and
+ (i + 1) < num_tokens and
+ re.search(r'^[0-9]+', tokens[i+1])):
+ pass # No change to the token.
+ else: # Otherwise, adds a space after the tokens before a dot.
+ tokens[i] = prefix + ' .'
+ return " ".join(tokens) # Stitch the tokens back.
+
+ def escape_xml(self, text):
+ for regexp, substitution in self.MOSES_ESCAPE_XML_REGEXES:
+ text = re.sub(regexp, substitution, text)
+ return text
+
+ def penn_tokenize(self, text, return_str=False):
+ """
+ This is a Python port of the Penn treebank tokenizer adapted by the Moses
+ machine translation community. It's a little different from the
+ version in nltk.tokenize.treebank.
+ """
+ # Converts input string into unicode.
+ text = text_type(text)
+ # Perform a chain of regex substitutions using MOSES_PENN_REGEXES_1
+ for regexp, substitution in self.MOSES_PENN_REGEXES_1:
+ text = re.sub(regexp, substitution, text)
+ # Handles nonbreaking prefixes.
+ text = self.handles_nonbreaking_prefixes(text)
+ # Restore ellipsis, clean extra spaces, escape XML symbols.
+ for regexp, substitution in self.MOSES_PENN_REGEXES_2:
+ text = re.sub(regexp, substitution, text)
+ return text if return_str else text.split()
+
+ def tokenize(self, text, agressive_dash_splits=False, return_str=False, escape=True):
+ """
+ Python port of the Moses tokenizer.
+
+ >>> mtokenizer = MosesTokenizer()
+ >>> text = u'Is 9.5 or 525,600 my favorite number?'
+ >>> print (mtokenizer.tokenize(text, return_str=True))
+ Is 9.5 or 525,600 my favorite number ?
+ >>> text = u'The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things'
+ >>> print (mtokenizer.tokenize(text, return_str=True))
+ The https : / / github.com / jonsafari / tok-tok / blob / master / tok-tok.pl is a website with / and / or slashes and sort of weird : things
+ >>> text = u'This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf'
+ >>> expected = u'This , is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf'
+ >>> assert mtokenizer.tokenize(text, return_str=True) == expected
+
+ :param tokens: A single string, i.e. sentence text.
+ :type tokens: str
+ :param agressive_dash_splits: Option to trigger dash split rules.
+ :type agressive_dash_splits: bool
+ """
+ # Converts input string into unicode.
+ text = text_type(text)
+
+ # De-duplicate spaces and clean ASCII junk
+ for regexp, substitution in [self.DEDUPLICATE_SPACE, self.ASCII_JUNK]:
+ text = re.sub(regexp, substitution, text)
+ # Strips leading and trailing spaces.
+ text = text.strip()
+ # Separate special characters outside of IsAlnum character set.
+ regexp, substitution = self.PAD_NOT_ISALNUM
+ text = re.sub(regexp, substitution, text)
+ # Aggressively splits dashes
+ if agressive_dash_splits:
+ regexp, substitution = self.AGGRESSIVE_HYPHEN_SPLIT
+ text = re.sub(regexp, substitution, text)
+ # Replaces multidots with "DOTDOTMULTI" literal strings.
+ text = self.replace_multidots(text)
+ # Separate out "," except if within numbers e.g. 5,300
+ for regexp, substitution in [self.COMMA_SEPARATE_1, self.COMMA_SEPARATE_2]:
+ text = re.sub(regexp, substitution, text)
+
+ # (Language-specific) apostrophe tokenization.
+ if self.lang == 'en':
+ for regexp, substitution in self.ENGLISH_SPECIFIC_APOSTROPHE:
+ text = re.sub(regexp, substitution, text)
+ elif self.lang in ['fr', 'it']:
+ for regexp, substitution in self.FR_IT_SPECIFIC_APOSTROPHE:
+ text = re.sub(regexp, substitution, text)
+ else:
+ regexp, substitution = self.NON_SPECIFIC_APOSTROPHE
+ text = re.sub(regexp, substitution, text)
+
+ # Handles nonbreaking prefixes.
+ text = self.handles_nonbreaking_prefixes(text)
+ # Cleans up extraneous spaces.
+ regexp, substitution = self.DEDUPLICATE_SPACE
+ text = re.sub(regexp,substitution, text).strip()
+ # Restore multidots.
+ text = self.restore_multidots(text)
+ if escape:
+ # Escape XML symbols.
+ text = self.escape_xml(text)
+
+ return text if return_str else text.split()
+
+
+class MosesDetokenizer(TokenizerI):
+ """
+ This is a Python port of the Moses Detokenizer from
+ https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/detokenizer.perl
+
+ >>> tokenizer = MosesTokenizer()
+ >>> text = u'This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf'
+ >>> expected_tokenized = u'This , is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf'
+ >>> tokenized_text = tokenizer.tokenize(text, return_str=True)
+ >>> tokenized_text == expected_tokenized
+ True
+ >>> detokenizer = MosesDetokenizer()
+ >>> expected_detokenized = u'This, is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf'
+ >>> detokenized_text = detokenizer.detokenize(tokenized_text.split(), return_str=True)
+ >>> detokenized_text == expected_detokenized
+ True
+
+ >>> from nltk.tokenize.moses import MosesTokenizer, MosesDetokenizer
+ >>> t, d = MosesTokenizer(), MosesDetokenizer()
+ >>> sent = "This ain't funny. It's actually hillarious, yet double Ls. | [] < > [ ] & You're gonna shake it off? Don't?"
+ >>> expected_tokens = [u'This', u'ain', u''t', u'funny', u'.', u'It', u''s', u'actually', u'hillarious', u',', u'yet', u'double', u'Ls', u'.', u'|', u'[', u']', u'<', u'>', u'[', u']', u'&', u'You', u''re', u'gonna', u'shake', u'it', u'off', u'?', u'Don', u''t', u'?']
+ >>> expected_detokens = "This ain't funny. It's actually hillarious, yet double Ls. | [] < > [] & You're gonna shake it off? Don't?"
+ >>> tokens = t.tokenize(sent)
+ >>> tokens == expected_tokens
+ True
+ >>> detokens = d.detokenize(tokens)
+ >>> " ".join(detokens) == expected_detokens
+ True
+
+ >>> d.detokenize(expected_tokens, unescape=True)
+ ['This', "ain't", 'funny.', "It's", 'actually', 'hillarious,', 'yet', 'double', 'Ls.', '|', '[]', '<', '>', '[]', '&', "You're", 'gonna', 'shake', 'it', 'off?', "Don't?"]
+ >>> d.detokenize(expected_tokens, unescape=False)
+ ['This', 'ain', '&apos;t', 'funny.', 'It', '&apos;s', 'actually', 'hilarious,', 'yet', 'double', 'Ls.', '&#124;', '&#91;', '&#93;', '&lt;', '&gt;', '&#91;', '&#93;', '&amp;', 'You', '&apos;re', 'gonna', 'shake', 'it', 'off?', 'Don', '&apos;t?']
+ """
+ # Currency Symbols.
+ IsAlnum = text_type(''.join(perluniprops.chars('IsAlnum')))
+ IsAlpha = text_type(''.join(perluniprops.chars('IsAlpha')))
+ IsSc = text_type(''.join(perluniprops.chars('IsSc')))
+
+ AGGRESSIVE_HYPHEN_SPLIT = r' \@\-\@ ', r'-'
+
+ # Merge multiple spaces.
+ ONE_SPACE = re.compile(r' {2,}'), ' '
+
+ # Unescape special characters.
+ UNESCAPE_FACTOR_SEPARATOR = r'&#124;', r'|'
+ UNESCAPE_LEFT_ANGLE_BRACKET = r'&lt;', r'<'
+ UNESCAPE_RIGHT_ANGLE_BRACKET = r'&gt;', r'>'
+ UNESCAPE_DOUBLE_QUOTE = r'&quot;', r'"'
+ UNESCAPE_SINGLE_QUOTE = r"&apos;", r"'"
+ UNESCAPE_SYNTAX_NONTERMINAL_LEFT = r'&#91;', r'['
+ UNESCAPE_SYNTAX_NONTERMINAL_RIGHT = r'&#93;', r']'
+ UNESCAPE_AMPERSAND = r'&amp;', r'&'
+ # The legacy regexes are used to support outputs from older Moses versions.
+ UNESCAPE_FACTOR_SEPARATOR_LEGACY = r'&bar;', r'|'
+ UNESCAPE_SYNTAX_NONTERMINAL_LEFT_LEGACY = r'&bra;', r'['
+ UNESCAPE_SYNTAX_NONTERMINAL_RIGHT_LEGACY = r'&ket;', r']'
+
+
+ MOSES_UNESCAPE_XML_REGEXES = [UNESCAPE_FACTOR_SEPARATOR_LEGACY,
+ UNESCAPE_FACTOR_SEPARATOR, UNESCAPE_LEFT_ANGLE_BRACKET,
+ UNESCAPE_RIGHT_ANGLE_BRACKET,
+ UNESCAPE_SYNTAX_NONTERMINAL_LEFT_LEGACY,
+ UNESCAPE_SYNTAX_NONTERMINAL_RIGHT_LEGACY,
+ UNESCAPE_DOUBLE_QUOTE, UNESCAPE_SINGLE_QUOTE,
+ UNESCAPE_SYNTAX_NONTERMINAL_LEFT,
+ UNESCAPE_SYNTAX_NONTERMINAL_RIGHT, UNESCAPE_AMPERSAND]
+
+ FINNISH_MORPHSET_1 = [u'N', u'n', u'A', u'a', u'\xc4', u'\xe4', u'ssa',
+ u'Ssa', u'ss\xe4', u'Ss\xe4', u'sta', u'st\xe4',
+ u'Sta', u'St\xe4', u'hun', u'Hun', u'hyn', u'Hyn',
+ u'han', u'Han', u'h\xe4n', u'H\xe4n', u'h\xf6n',
+ u'H\xf6n', u'un', u'Un', u'yn', u'Yn', u'an', u'An',
+ u'\xe4n', u'\xc4n', u'\xf6n', u'\xd6n', u'seen',
+ u'Seen', u'lla', u'Lla', u'll\xe4', u'Ll\xe4', u'lta',
+ u'Lta', u'lt\xe4', u'Lt\xe4', u'lle', u'Lle', u'ksi',
+ u'Ksi', u'kse', u'Kse', u'tta', u'Tta', u'ine', u'Ine']
+
+ FINNISH_MORPHSET_2 = [u'ni', u'si', u'mme', u'nne', u'nsa']
+
+ FINNISH_MORPHSET_3 = [u'ko', u'k\xf6', u'han', u'h\xe4n', u'pa', u'p\xe4',
+ u'kaan', u'k\xe4\xe4n', u'kin']
+
+ FINNISH_REGEX = u'^({})({})?({})$'.format(text_type('|'.join(FINNISH_MORPHSET_1)),
+ text_type('|'.join(FINNISH_MORPHSET_2)),
+ text_type('|'.join(FINNISH_MORPHSET_3)))
+
+
+ def __init__(self, lang='en'):
+ super(MosesDetokenizer, self).__init__()
+ self.lang = lang
+
+
+ def unescape_xml(self, text):
+ for regexp, substitution in self.MOSES_UNESCAPE_XML_REGEXES:
+ text = re.sub(regexp, substitution, text)
+ return text
+
+
+ def tokenize(self, tokens, return_str=False, unescape=True):
+ """
+ Python port of the Moses detokenizer.
+
+ :param tokens: A list of strings, i.e. tokenized text.
+ :type tokens: list(str)
+ :return: str
+ """
+ # Convert the list of tokens into a string and pad it with spaces.
+ text = u" {} ".format(" ".join(tokens))
+ # Converts input string into unicode.
+ text = text_type(text)
+ # Detokenize the agressive hyphen split.
+ regexp, substitution = self.AGGRESSIVE_HYPHEN_SPLIT
+ text = re.sub(regexp, substitution, text)
+ if unescape:
+ # Unescape the XML symbols.
+ text = self.unescape_xml(text)
+ # Keep track of no. of quotation marks.
+ quote_counts = {u"'":0 , u'"':0, u"``":0, u"`":0, u"''":0}
+
+ # The *prepend_space* variable is used to control the "effects" of
+ # detokenization as the function loops through the list of tokens and
+ # changes the *prepend_space* accordingly as it sequentially checks
+ # through the language specific and language independent conditions.
+ prepend_space = " "
+ detokenized_text = ""
+ tokens = text.split()
+ # Iterate through every token and apply language specific detokenization rule(s).
+ for i, token in enumerate(iter(tokens)):
+ # Check if the first char is CJK.
+ if is_cjk(token[0]):
+ # Perform left shift if this is a second consecutive CJK word.
+ if i > 0 and is_cjk(token[-1]):
+ detokenized_text += token
+ # But do nothing special if this is a CJK word that doesn't follow a CJK word
+ else:
+ detokenized_text += prepend_space + token
+ prepend_space = " "
+
+ # If it's a currency symbol.
+ elif token in self.IsSc:
+ # Perform right shift on currency and other random punctuation items
+ detokenized_text += prepend_space + token
+ prepend_space = ""
+
+ elif re.search(r'^[\,\.\?\!\:\;\\\%\}\]\)]+$', token):
+ # In French, these punctuations are prefixed with a non-breakable space.
+ if self.lang == 'fr' and re.search(r'^[\?\!\:\;\\\%]$', token):
+ detokenized_text += " "
+ # Perform left shift on punctuation items.
+ detokenized_text += token
+ prepend_space = " "
+
+ elif (self.lang == 'en' and i > 0
+ and re.search(u"^[\'][{}]".format(self.IsAlpha), token)):
+ #and re.search(u'[{}]$'.format(self.IsAlnum), tokens[i-1])):
+ # For English, left-shift the contraction.
+ detokenized_text += token
+ prepend_space = " "
+
+ elif (self.lang == 'cs' and i > 1
+ and re.search(r'^[0-9]+$', tokens[i-2]) # If the token before last is a number.
+ and re.search(r'^[.,]$', tokens[i-1]) # If the previous token is a dot or comma.
+ and re.search(r'^[0-9]+$', token)): # If the current token is a number.
+ # In Czech, left-shift floats that are decimal numbers.
+ detokenized_text += token
+ prepend_space = " "
+
+ elif (self.lang in ['fr', 'it'] and i <= len(tokens)-2
+ and re.search(u'[{}][\']$'.format(self.IsAlpha), token)
+ and re.search(u'^[{}]$'.format(self.IsAlpha), tokens[i+1])): # If the next token is alpha.
+ # For French and Italian, right-shift the contraction.
+ detokenized_text += prepend_space + token
+ prepend_space = ""
+
+ elif (self.lang == 'cs' and i <= len(tokens)-3
+ and re.search(u'[{}][\']$'.format(self.IsAlpha), token)
+ and re.search(u'^[-–]$', tokens[i+1])
+ and re.search(u'^li$|^mail.*', tokens[i+2], re.IGNORECASE)): # In Perl, ($words[$i+2] =~ /^li$|^mail.*/i)
+ # In Czech, right-shift "-li" and a few Czech dashed words (e.g. e-mail)
+ detokenized_text += prepend_space + token + tokens[i+1]
+ next(tokens, None) # Advance over the dash
+ prepend_space = ""
+
+ # Combine punctuation smartly.
+ elif re.search(r'''^[\'\"„“`]+$''', token):
+ normalized_quo = token
+ if re.search(r'^[„“”]+$', token):
+ normalized_quo = '"'
+ quote_counts[normalized_quo] = quote_counts.get(normalized_quo, 0)
+
+ if self.lang == 'cs' and token == u"„":
+ quote_counts[normalized_quo] = 0
+ if self.lang == 'cs' and token == u"“":
+ quote_counts[normalized_quo] = 1
+
+
+ if quote_counts[normalized_quo] % 2 == 0:
+ if (self.lang == 'en' and token == u"'" and i > 0
+ and re.search(r'[s]$', tokens[i-1]) ):
+ # Left shift on single quote for possessives ending
+ # in "s", e.g. "The Jones' house"
+ detokenized_text += token
+ prepend_space = " "
+ else:
+ # Right shift.
+ detokenized_text += prepend_space + token
+ prepend_space = ""
+ quote_counts[normalized_quo] += 1
+ else:
+ # Left shift.
+ detokenized_text += token
+ prepend_space = " "
+ quote_counts[normalized_quo] += 1
+
+ elif (self.lang == 'fi' and re.search(r':$', tokens[i-1])
+ and re.search(self.FINNISH_REGEX, token)):
+ # Finnish : without intervening space if followed by case suffix
+ # EU:N EU:n EU:ssa EU:sta EU:hun EU:iin ...
+ detokenized_text += prepend_space + token
+ prepend_space = " "
+
+ else:
+ detokenized_text += prepend_space + token
+ prepend_space = " "
+
+ # Merge multiple spaces.
+ regexp, substitution = self.ONE_SPACE
+ detokenized_text = re.sub(regexp, substitution, detokenized_text)
+ # Removes leading and trailing spaces.
+ detokenized_text = detokenized_text.strip()
+
+ return detokenized_text if return_str else detokenized_text.split()
+
+ def detokenize(self, tokens, return_str=False, unescape=True):
+ """ Duck-typing the abstract *tokenize()*."""
+ return self.tokenize(tokens, return_str, unescape)
# Multi-Word Expression tokenizer
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Rob Malouf <rmalouf@mail.sdsu.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
into single tokens.
"""
- def __init__(self, mwes=None, separator="_"):
+ def __init__(self, mwes=None, separator='_'):
"""Initialize the multi-word tokenizer with a list of expressions and a
separator
>>> tokenizer.add_mwe(('a', 'b', 'c'))
>>> tokenizer.add_mwe(('a', 'x'))
>>> expected = {'a': {'x': {True: None}, 'b': {True: None, 'c': {True: None}}}}
- >>> tokenizer._mwes == expected
+ >>> tokenizer._mwes.as_dict() == expected
True
"""
https://github.com/lium-lst/nmtpy/blob/master/nmtpy/metrics/mtevalbleu.py#L162
"""
+from __future__ import unicode_literals
import io
import re
+from six import text_type
from nltk.corpus import perluniprops
from nltk.tokenize.api import TokenizerI
paragraph-based tokenization from mteval-14.pl; The sentence-based
tokenization is consistent with the other tokenizers available in NLTK.
+ >>> from six import text_type
>>> from nltk.tokenize.nist import NISTTokenizer
>>> nist = NISTTokenizer()
>>> s = "Good muffins cost $3.88 in New York."
True
>>> nist.international_tokenize(rkt)[:10] == expected_rkt
True
-
- # Doctest for patching issue #1926
- >>> sent = u'this is a foo\u2604sentence.'
- >>> expected_sent = [u'this', u'is', u'a', u'foo', u'\u2604', u'sentence', u'.']
- >>> nist.international_tokenize(sent) == expected_sent
- True
"""
-
# Strip "skipped" tags
- STRIP_SKIP = re.compile("<skipped>"), ""
+ STRIP_SKIP = re.compile('<skipped>'), ''
# Strip end-of-line hyphenation and join lines
- STRIP_EOL_HYPHEN = re.compile("\u2028"), " "
+ STRIP_EOL_HYPHEN = re.compile(u'\u2028'), ' '
# Tokenize punctuation.
- PUNCT = re.compile("([\{-\~\[-\` -\&\(-\+\:-\@\/])"), " \\1 "
+ PUNCT = re.compile('([\{-\~\[-\` -\&\(-\+\:-\@\/])'), ' \\1 '
# Tokenize period and comma unless preceded by a digit.
- PERIOD_COMMA_PRECEED = re.compile("([^0-9])([\.,])"), "\\1 \\2 "
+ PERIOD_COMMA_PRECEED = re.compile('([^0-9])([\.,])'), '\\1 \\2 '
# Tokenize period and comma unless followed by a digit.
- PERIOD_COMMA_FOLLOW = re.compile("([\.,])([^0-9])"), " \\1 \\2"
+ PERIOD_COMMA_FOLLOW = re.compile('([\.,])([^0-9])'), ' \\1 \\2'
# Tokenize dash when preceded by a digit
- DASH_PRECEED_DIGIT = re.compile("([0-9])(-)"), "\\1 \\2 "
+ DASH_PRECEED_DIGIT = re.compile('([0-9])(-)'), '\\1 \\2 '
- LANG_DEPENDENT_REGEXES = [
- PUNCT,
- PERIOD_COMMA_PRECEED,
- PERIOD_COMMA_FOLLOW,
- DASH_PRECEED_DIGIT,
- ]
+ LANG_DEPENDENT_REGEXES = [PUNCT, PERIOD_COMMA_PRECEED,
+ PERIOD_COMMA_FOLLOW, DASH_PRECEED_DIGIT]
# Perluniprops characters used in NIST tokenizer.
- pup_number = str("".join(set(perluniprops.chars("Number")))) # i.e. \p{N}
- pup_punct = str("".join(set(perluniprops.chars("Punctuation")))) # i.e. \p{P}
- pup_symbol = str("".join(set(perluniprops.chars("Symbol")))) # i.e. \p{S}
+ pup_number = text_type(''.join(set(perluniprops.chars('Number')))) # i.e. \p{N}
+ pup_punct = text_type(''.join(set(perluniprops.chars('Punctuation')))) # i.e. \p{P}
+ pup_symbol = text_type(''.join(set(perluniprops.chars('Symbol')))) # i.e. \p{S}
# Python regexes needs to escape some special symbols, see
# see https://stackoverflow.com/q/45670950/610569
- number_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_number)
- punct_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_punct)
- symbol_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_symbol)
+ number_regex = re.sub(r'[]^\\-]', r'\\\g<0>', pup_number)
+ punct_regex = re.sub(r'[]^\\-]', r'\\\g<0>', pup_punct)
+ symbol_regex = re.sub(r'[]^\\-]', r'\\\g<0>', pup_symbol)
# Note: In the original perl implementation, \p{Z} and \p{Zl} were used to
# (i) strip trailing and heading spaces and
# (ii) de-deuplicate spaces.
# In Python, this would do: ' '.join(str.strip().split())
# Thus, the next two lines were commented out.
- # Line_Separator = str(''.join(perluniprops.chars('Line_Separator'))) # i.e. \p{Zl}
- # Separator = str(''.join(perluniprops.chars('Separator'))) # i.e. \p{Z}
+ #Line_Separator = text_type(''.join(perluniprops.chars('Line_Separator'))) # i.e. \p{Zl}
+ #Separator = text_type(''.join(perluniprops.chars('Separator'))) # i.e. \p{Z}
# Pads non-ascii strings with space.
- NONASCII = re.compile("([\x00-\x7f]+)"), r" \1 "
+ NONASCII = re.compile('([\x00-\x7f]+)'), r' \1 '
# Tokenize any punctuation unless followed AND preceded by a digit.
- PUNCT_1 = (
- re.compile("([{n}])([{p}])".format(n=number_regex, p=punct_regex)),
- "\\1 \\2 ",
- )
- PUNCT_2 = (
- re.compile("([{p}])([{n}])".format(n=number_regex, p=punct_regex)),
- " \\1 \\2",
- )
+ PUNCT_1 = re.compile(u"([{n}])([{p}])".format(n=number_regex, p=punct_regex)), '\\1 \\2 '
+ PUNCT_2 = re.compile(u"([{p}])([{n}])".format(n=number_regex, p=punct_regex)), ' \\1 \\2'
# Tokenize symbols
- SYMBOLS = re.compile("([{s}])".format(s=symbol_regex)), " \\1 "
+ SYMBOLS = re.compile(u"({s})".format(s=symbol_regex)), ' \\1 '
INTERNATIONAL_REGEXES = [NONASCII, PUNCT_1, PUNCT_2, SYMBOLS]
text = regexp.sub(substitution, text)
return text
- def tokenize(self, text, lowercase=False, western_lang=True, return_str=False):
- text = str(text)
+ def tokenize(self, text, lowercase=False,
+ western_lang=True, return_str=False):
+ text = text_type(text)
# Language independent regex.
text = self.lang_independent_sub(text)
# Language dependent regex.
if western_lang:
# Pad string with whitespace.
- text = " " + text + " "
+ text = ' ' + text + ' '
if lowercase:
text = text.lower()
for regexp, substitution in self.LANG_DEPENDENT_REGEXES:
text = regexp.sub(substitution, text)
# Remove contiguous whitespaces.
- text = " ".join(text.split())
+ text = ' '.join(text.split())
# Finally, strips heading and trailing spaces
# and converts output string into unicode.
- text = str(text.strip())
+ text = text_type(text.strip())
return text if return_str else text.split()
- def international_tokenize(
- self, text, lowercase=False, split_non_ascii=True, return_str=False
- ):
- text = str(text)
+ def international_tokenize(self, text, lowercase=False,
+ split_non_ascii=True,
+ return_str=False):
+ text = text_type(text)
# Different from the 'normal' tokenize(), STRIP_EOL_HYPHEN is applied
# first before unescaping.
regexp, substitution = self.STRIP_SKIP
# Make sure that there's only one space only between words.
# Strip leading and trailing spaces.
- text = " ".join(text.strip().split())
+ text = ' '.join(text.strip().split())
return text if return_str else text.split()
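# Usage sketch (illustrative values): periods stay attached inside numbers
# but are split off at the end of a word.
#
#     >>> nist = NISTTokenizer()
#     >>> nist.tokenize('Good muffins cost 3.88 in New York.')
#     ['Good', 'muffins', 'cost', '3.88', 'in', 'New', 'York', '.']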
# Natural Language Toolkit: Punkt sentence tokenizer
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Algorithm: Kiss & Strunk (2006)
# Author: Willy <willy@csse.unimelb.edu.au> (original Python port)
# Steven Bird <stevenbird1@gmail.com> (additions)
r"""
Punkt Sentence Tokenizer
-This tokenizer divides a text into a list of sentences
+This tokenizer divides a text into a list of sentences,
by using an unsupervised algorithm to build a model for abbreviation
words, collocations, and words that start sentences. It must be
trained on a large collection of plaintext in the target language
Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence
Boundary Detection. Computational Linguistics 32: 485-525.
"""
+from __future__ import print_function, unicode_literals, division
# TODO: Make orthographic heuristic less susceptible to overtraining
# TODO: Frequent sentence starters optionally exclude always-capitalised words
import math
from collections import defaultdict
+from six import string_types
+
+from nltk.compat import unicode_repr, python_2_unicode_compatible
from nltk.probability import FreqDist
from nltk.tokenize.api import TokenizerI
######################################################################
-# { Orthographic Context Constants
+#{ Orthographic Context Constants
######################################################################
# The following constants are used to describe the orthographic
# contexts in which a word can occur. BEG=beginning, MID=middle,
# UNK=unknown, UC=uppercase, LC=lowercase, NC=no case.
-_ORTHO_BEG_UC = 1 << 1
+_ORTHO_BEG_UC = 1 << 1
"""Orthographic context: beginning of a sentence with upper case."""
-_ORTHO_MID_UC = 1 << 2
+_ORTHO_MID_UC = 1 << 2
"""Orthographic context: middle of a sentence with upper case."""
-_ORTHO_UNK_UC = 1 << 3
+_ORTHO_UNK_UC = 1 << 3
"""Orthographic context: unknown position in a sentence with upper case."""
-_ORTHO_BEG_LC = 1 << 4
+_ORTHO_BEG_LC = 1 << 4
"""Orthographic context: beginning of a sentence with lower case."""
-_ORTHO_MID_LC = 1 << 5
+_ORTHO_MID_LC = 1 << 5
"""Orthographic context: middle of a sentence with lower case."""
-_ORTHO_UNK_LC = 1 << 6
+_ORTHO_UNK_LC = 1 << 6
"""Orthographic context: unknown position in a sentence with lower case."""
_ORTHO_UC = _ORTHO_BEG_UC + _ORTHO_MID_UC + _ORTHO_UNK_UC
"""Orthographic context: occurs with upper case."""
_ORTHO_LC = _ORTHO_BEG_LC + _ORTHO_MID_LC + _ORTHO_UNK_LC
"""Orthographic context: occurs with lower case."""
_ORTHO_MAP = {
- ("initial", "upper"): _ORTHO_BEG_UC,
- ("internal", "upper"): _ORTHO_MID_UC,
- ("unknown", "upper"): _ORTHO_UNK_UC,
- ("initial", "lower"): _ORTHO_BEG_LC,
- ("internal", "lower"): _ORTHO_MID_LC,
- ("unknown", "lower"): _ORTHO_UNK_LC,
+ ('initial', 'upper'): _ORTHO_BEG_UC,
+ ('internal', 'upper'): _ORTHO_MID_UC,
+ ('unknown', 'upper'): _ORTHO_UNK_UC,
+ ('initial', 'lower'): _ORTHO_BEG_LC,
+ ('internal', 'lower'): _ORTHO_MID_LC,
+ ('unknown', 'lower'): _ORTHO_UNK_LC,
}
"""A map from context position and first-letter case to the
appropriate orthographic context flag."""
-# } (end orthographic context constants)
+#} (end orthographic context constants)
######################################################################
######################################################################
-# { Decision reasons for debugging
+#{ Decision reasons for debugging
######################################################################
-REASON_DEFAULT_DECISION = "default decision"
-REASON_KNOWN_COLLOCATION = "known collocation (both words)"
-REASON_ABBR_WITH_ORTHOGRAPHIC_HEURISTIC = "abbreviation + orthographic heuristic"
-REASON_ABBR_WITH_SENTENCE_STARTER = "abbreviation + frequent sentence starter"
-REASON_INITIAL_WITH_ORTHOGRAPHIC_HEURISTIC = "initial + orthographic heuristic"
-REASON_NUMBER_WITH_ORTHOGRAPHIC_HEURISTIC = "initial + orthographic heuristic"
-REASON_INITIAL_WITH_SPECIAL_ORTHOGRAPHIC_HEURISTIC = (
- "initial + special orthographic heuristic"
-)
-
+REASON_DEFAULT_DECISION = 'default decision'
+REASON_KNOWN_COLLOCATION = 'known collocation (both words)'
+REASON_ABBR_WITH_ORTHOGRAPHIC_HEURISTIC = 'abbreviation + orthographic heuristic'
+REASON_ABBR_WITH_SENTENCE_STARTER = 'abbreviation + frequent sentence starter'
+REASON_INITIAL_WITH_ORTHOGRAPHIC_HEURISTIC = 'initial + orthographic heuristic'
+REASON_NUMBER_WITH_ORTHOGRAPHIC_HEURISTIC = 'initial + orthographic heuristic'
+REASON_INITIAL_WITH_SPECIAL_ORTHOGRAPHIC_HEURISTIC = 'initial + special orthographic heuristic'
-# } (end decision reasons for debugging)
+#} (end decision reasons for debugging)
######################################################################
######################################################################
-# { Language-dependent variables
+#{ Language-dependent variables
######################################################################
-
class PunktLanguageVars(object):
"""
Stores variables, mostly regular expressions, which may be
constructors.
"""
- __slots__ = ("_re_period_context", "_re_word_tokenizer")
+ __slots__ = ('_re_period_context', '_re_word_tokenizer')
def __getstate__(self):
# All modifications to the class are performed by inheritance.
def __setstate__(self, state):
return 1
- sent_end_chars = (".", "?", "!")
+ sent_end_chars = ('.', '?', '!')
"""Characters which are candidates for sentence boundaries"""
@property
def _re_sent_end_chars(self):
- return "[%s]" % re.escape("".join(self.sent_end_chars))
+ return '[%s]' % re.escape(''.join(self.sent_end_chars))
- internal_punctuation = ",:;" # might want to extend this..
+ internal_punctuation = ',:;' # might want to extend this..
"""sentence internal punctuation, which indicates an abbreviation if
preceded by a period-final token."""
- re_boundary_realignment = re.compile(r'["\')\]}]+?(?:\s+|(?=--)|$)', re.MULTILINE)
+ re_boundary_realignment = re.compile(r'["\')\]}]+?(?:\s+|(?=--)|$)',
+ re.MULTILINE)
"""Used to realign punctuation that should be included in a sentence
although it follows the period (or ?, !)."""
- _re_word_start = r"[^\(\"\`{\[:;&\#\*@\)}\]\-,]"
+ _re_word_start = r"[^\(\"\`{\[:;&\#\*@\)}\]\-,]"
"""Excludes some characters from starting word tokens"""
- _re_non_word_chars = r"(?:[?!)\";}\]\*:@\'\({\[])"
+ _re_non_word_chars = r"(?:[?!)\";}\]\*:@\'\({\[])"
"""Characters that cannot appear within words"""
_re_multi_char_punct = r"(?:\-{2,}|\.{2,}|(?:\.\s){2,}\.)"
"""Hyphen and ellipsis are multi-character punctuation"""
- _word_tokenize_fmt = r"""(
+ _word_tokenize_fmt = r'''(
%(MultiChar)s
|
(?=%(WordStart)s)\S+? # Accept word characters until end is found
)
|
\S
- )"""
+ )'''
"""Format of a regular expression to split punctuation from words,
excluding period."""
return self._re_word_tokenizer
except AttributeError:
self._re_word_tokenizer = re.compile(
- self._word_tokenize_fmt
- % {
- "NonWord": self._re_non_word_chars,
- "MultiChar": self._re_multi_char_punct,
- "WordStart": self._re_word_start,
+ self._word_tokenize_fmt %
+ {
+ 'NonWord': self._re_non_word_chars,
+ 'MultiChar': self._re_multi_char_punct,
+ 'WordStart': self._re_word_start,
},
- re.UNICODE | re.VERBOSE,
+ re.UNICODE | re.VERBOSE
)
return self._re_word_tokenizer
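# Editor's note on the %-template mechanism above (a much-reduced sketch; the
# real fragments are the class attributes _re_non_word_chars etc.):
#   template = r'(?:%(MultiChar)s|%(WordStart)s\S*)'   # hypothetical, simplified
#   template % {'MultiChar': r'\.{2,}', 'WordStart': r'\w'}
#   # -> r'(?:\.{2,}|\w\S*)'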
return self._re_period_context
except AttributeError:
self._re_period_context = re.compile(
- self._period_context_fmt
- % {
- "NonWord": self._re_non_word_chars,
- "SentEndChars": self._re_sent_end_chars,
+ self._period_context_fmt %
+ {
+ 'NonWord': self._re_non_word_chars,
+ 'SentEndChars': self._re_sent_end_chars,
},
- re.UNICODE | re.VERBOSE,
- )
+ re.UNICODE | re.VERBOSE)
return self._re_period_context
-_re_non_punct = re.compile(r"[^\W\d]", re.UNICODE)
+_re_non_punct = re.compile(r'[^\W\d]', re.UNICODE)
"""Matches token types that are not merely punctuation. (Types for
numeric tokens are changed to ##number## and hence contain alpha.)"""
-
-# }
+#}
######################################################################
-# ////////////////////////////////////////////////////////////
-# { Helper Functions
-# ////////////////////////////////////////////////////////////
+#////////////////////////////////////////////////////////////
+#{ Helper Functions
+#////////////////////////////////////////////////////////////
def _pair_iter(it):
"""
Yields pairs of tokens from the given iterator such that each input
token will appear as the first element in a yielded tuple. The last
pair will have None as its second element.
"""
it = iter(it)
- try:
- prev = next(it)
- except StopIteration:
- return
+ prev = next(it)
for el in it:
yield (prev, el)
prev = el
yield (prev, None)
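# Editor's sketch of _pair_iter's contract (not part of the patch):
#   list(_pair_iter(['a', 'b', 'c']))  # -> [('a', 'b'), ('b', 'c'), ('c', None)]
# Note the behavioural change in this hunk: the '+' version calls next(it)
# unguarded, so an empty iterator now raises StopIteration where the removed
# '-' version returned silently.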
-
######################################################################
-# { Punkt Parameters
+#{ Punkt Parameters
######################################################################
-
class PunktParameters(object):
"""Stores data used to perform sentence boundary detection with Punkt."""
def _debug_ortho_context(self, typ):
c = self.ortho_context[typ]
if c & _ORTHO_BEG_UC:
- yield "BEG-UC"
+ yield 'BEG-UC'
if c & _ORTHO_MID_UC:
- yield "MID-UC"
+ yield 'MID-UC'
if c & _ORTHO_UNK_UC:
- yield "UNK-UC"
+ yield 'UNK-UC'
if c & _ORTHO_BEG_LC:
- yield "BEG-LC"
+ yield 'BEG-LC'
if c & _ORTHO_MID_LC:
- yield "MID-LC"
+ yield 'MID-LC'
if c & _ORTHO_UNK_LC:
- yield "UNK-LC"
-
+ yield 'UNK-LC'
######################################################################
-# { PunktToken
+#{ PunktToken
######################################################################
-
+@python_2_unicode_compatible
class PunktToken(object):
"""Stores a token of text with annotations produced during
sentence boundary detection."""
- _properties = ["parastart", "linestart", "sentbreak", "abbr", "ellipsis"]
- __slots__ = ["tok", "type", "period_final"] + _properties
+ _properties = [
+ 'parastart', 'linestart',
+ 'sentbreak', 'abbr', 'ellipsis'
+ ]
+ __slots__ = ['tok', 'type', 'period_final'] + _properties
def __init__(self, tok, **params):
self.tok = tok
self.type = self._get_type(tok)
- self.period_final = tok.endswith(".")
+ self.period_final = tok.endswith('.')
for p in self._properties:
setattr(self, p, None)
for k in params:
setattr(self, k, params[k])
- # ////////////////////////////////////////////////////////////
- # { Regular expressions for properties
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
+ #{ Regular expressions for properties
+ #////////////////////////////////////////////////////////////
# Note: [A-Za-z] is approximated by [^\W\d] in the general case.
- _RE_ELLIPSIS = re.compile(r"\.\.+$")
- _RE_NUMERIC = re.compile(r"^-?[\.,]?\d[\d,\.-]*\.?$")
- _RE_INITIAL = re.compile(r"[^\W\d]\.$", re.UNICODE)
- _RE_ALPHA = re.compile(r"[^\W\d]+$", re.UNICODE)
+ _RE_ELLIPSIS = re.compile(r'\.\.+$')
+ _RE_NUMERIC = re.compile(r'^-?[\.,]?\d[\d,\.-]*\.?$')
+ _RE_INITIAL = re.compile(r'[^\W\d]\.$', re.UNICODE)
+ _RE_ALPHA = re.compile(r'[^\W\d]+$', re.UNICODE)
- # ////////////////////////////////////////////////////////////
- # { Derived properties
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
+ #{ Derived properties
+ #////////////////////////////////////////////////////////////
def _get_type(self, tok):
"""Returns a case-normalized representation of the token."""
- return self._RE_NUMERIC.sub("##number##", tok.lower())
+ return self._RE_NUMERIC.sub('##number##', tok.lower())
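# Editor's sketch of the normalisation above (hypothetical tokens):
#   PunktToken('Mrs.').type     # -> 'mrs.'        (lower-cased, period kept)
#   PunktToken('1,234.5').type  # -> '##number##'  (full match of _RE_NUMERIC)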
@property
def type_no_period(self):
"""
The type with its final period removed if it has one.
"""
- if len(self.type) > 1 and self.type[-1] == ".":
+ if len(self.type) > 1 and self.type[-1] == '.':
return self.type[:-1]
return self.type
@property
def first_case(self):
if self.first_lower:
- return "lower"
+ return 'lower'
elif self.first_upper:
- return "upper"
- return "none"
+ return 'upper'
+ return 'none'
@property
def is_ellipsis(self):
@property
def is_number(self):
"""True if the token text is that of a number."""
- return self.type.startswith("##number##")
+ return self.type.startswith('##number##')
@property
def is_initial(self):
"""True if the token is either a number or is alphabetic."""
return _re_non_punct.search(self.type)
- # ////////////////////////////////////////////////////////////
- # { String representation
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
+ #{ String representation
+ #////////////////////////////////////////////////////////////
def __repr__(self):
"""
with eval(), which lists all the token's non-default
annotations.
"""
- typestr = " type=%s," % repr(self.type) if self.type != self.tok else ""
+ typestr = (' type=%s,' % unicode_repr(self.type)
+ if self.type != self.tok else '')
- propvals = ", ".join(
- "%s=%s" % (p, repr(getattr(self, p)))
+ propvals = ', '.join(
+ '%s=%s' % (p, unicode_repr(getattr(self, p)))
for p in self._properties
if getattr(self, p)
)
- return "%s(%s,%s %s)" % (
- self.__class__.__name__,
- repr(self.tok),
- typestr,
- propvals,
- )
+ return '%s(%s,%s %s)' % (self.__class__.__name__,
+ unicode_repr(self.tok), typestr, propvals)
def __str__(self):
"""
"""
res = self.tok
if self.abbr:
- res += "<A>"
+ res += '<A>'
if self.ellipsis:
- res += "<E>"
+ res += '<E>'
if self.sentbreak:
- res += "<S>"
+ res += '<S>'
return res
-
######################################################################
-# { Punkt base class
+#{ Punkt base class
######################################################################
-
class PunktBaseClass(object):
"""
Includes common components of PunktTrainer and PunktSentenceTokenizer.
"""
- def __init__(self, lang_vars=None, token_cls=PunktToken, params=None):
- if lang_vars is None:
- lang_vars = PunktLanguageVars()
+ def __init__(self, lang_vars=PunktLanguageVars(), token_cls=PunktToken,
+ params=None):
if params is None:
- params = PunktParameters()
+ params = PunktParameters()
self._params = params
self._lang_vars = lang_vars
self._Token = token_cls
"""The collection of parameters that determines the behavior
of the punkt tokenizer."""
- # ////////////////////////////////////////////////////////////
- # { Word tokenization
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
+ #{ Word tokenization
+ #////////////////////////////////////////////////////////////
def _tokenize_words(self, plaintext):
"""
respectively.
"""
parastart = False
- for line in plaintext.split("\n"):
+ for line in plaintext.split('\n'):
if line.strip():
line_toks = iter(self._lang_vars.word_tokenize(line))
- try:
- tok = next(line_toks)
- except StopIteration:
- continue
-
- yield self._Token(tok, parastart=parastart, linestart=True)
+ yield self._Token(next(line_toks),
+ parastart=parastart, linestart=True)
parastart = False
for t in line_toks:
else:
parastart = True
- # ////////////////////////////////////////////////////////////
- # { Annotation Procedures
- # ////////////////////////////////////////////////////////////
+
+ #////////////////////////////////////////////////////////////
+ #{ Annotation Procedures
+ #////////////////////////////////////////////////////////////
def _annotate_first_pass(self, tokens):
"""
aug_tok.sentbreak = True
elif aug_tok.is_ellipsis:
aug_tok.ellipsis = True
- elif aug_tok.period_final and not tok.endswith(".."):
- if (
- tok[:-1].lower() in self._params.abbrev_types
- or tok[:-1].lower().split("-")[-1] in self._params.abbrev_types
- ):
+ elif aug_tok.period_final and not tok.endswith('..'):
+ if (tok[:-1].lower() in self._params.abbrev_types or
+ tok[:-1].lower().split('-')[-1] in self._params.abbrev_types):
aug_tok.abbr = True
else:
return
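# Editor's sketch of the dash-component check above (hypothetical data): for
# tok = 'co-op.' the expression tok[:-1].lower().split('-')[-1] yields 'op',
# so the token is marked as an abbreviation whenever 'op' is in abbrev_types.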
-
######################################################################
-# { Punkt Trainer
+#{ Punkt Trainer
######################################################################
class PunktTrainer(PunktBaseClass):
"""Learns parameters used in Punkt sentence boundary detection."""
- def __init__(
- self, train_text=None, verbose=False, lang_vars=None, token_cls=PunktToken
- ):
+ def __init__(self, train_text=None, verbose=False,
+ lang_vars=PunktLanguageVars(), token_cls=PunktToken):
- PunktBaseClass.__init__(self, lang_vars=lang_vars, token_cls=token_cls)
+ PunktBaseClass.__init__(self, lang_vars=lang_vars,
+ token_cls=token_cls)
self._type_fdist = FreqDist()
"""A frequency distribution giving the frequency of each
self.finalize_training()
return self._params
- # ////////////////////////////////////////////////////////////
- # { Customization Variables
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
+ #{ Customization Variables
+ #////////////////////////////////////////////////////////////
ABBREV = 0.3
"""cut-off value whether a 'token' is an abbreviation"""
appear before it can be considered a collocation, in addition to log
likelihood statistics. This is useful when INCLUDE_ALL_COLLOCS is True."""
- # ////////////////////////////////////////////////////////////
- # { Training..
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
+ #{ Training..
+ #////////////////////////////////////////////////////////////
def train(self, text, verbose=False, finalize=True):
"""
if is_add:
self._params.abbrev_types.add(abbr)
if verbose:
- print((" Abbreviation: [%6.4f] %s" % (score, abbr)))
+ print((' Abbreviation: [%6.4f] %s' %
+ (score, abbr)))
else:
if not is_add:
self._params.abbrev_types.remove(abbr)
if verbose:
- print((" Removed abbreviation: [%6.4f] %s" % (score, abbr)))
+ print((' Removed abbreviation: [%6.4f] %s' %
+ (score, abbr)))
# Make a preliminary pass through the document, marking likely
# sentence breaks, abbreviations, and ellipsis tokens.
if self._is_rare_abbrev_type(aug_tok1, aug_tok2):
self._params.abbrev_types.add(aug_tok1.type_no_period)
if verbose:
- print((" Rare Abbrev: %s" % aug_tok1.type))
+ print((' Rare Abbrev: %s' % aug_tok1.type))
# Does second token have a high likelihood of starting a sentence?
if self._is_potential_sent_starter(aug_tok2, aug_tok1):
# Is this bigram a potential collocation?
if self._is_potential_collocation(aug_tok1, aug_tok2):
self._collocation_fdist[
- (aug_tok1.type_no_period, aug_tok2.type_no_sentperiod)
- ] += 1
+ (aug_tok1.type_no_period, aug_tok2.type_no_sentperiod)] += 1
def _unique_types(self, tokens):
return set(aug_tok.type for aug_tok in tokens)
for typ, ll in self._find_sent_starters():
self._params.sent_starters.add(typ)
if verbose:
- print((" Sent Starter: [%6.4f] %r" % (ll, typ)))
+ print((' Sent Starter: [%6.4f] %r' % (ll, typ)))
self._params.clear_collocations()
for (typ1, typ2), ll in self._find_collocations():
- self._params.collocations.add((typ1, typ2))
+ self._params.collocations.add( (typ1,typ2) )
if verbose:
- print((" Collocation: [%6.4f] %r+%r" % (ll, typ1, typ2)))
+ print((' Collocation: [%6.4f] %r+%r' %
+ (ll, typ1, typ2)))
self._finalized = True
- # ////////////////////////////////////////////////////////////
- # { Overhead reduction
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
+ #{ Overhead reduction
+ #////////////////////////////////////////////////////////////
- def freq_threshold(
- self, ortho_thresh=2, type_thresh=2, colloc_thres=2, sentstart_thresh=2
- ):
+ def freq_threshold(self, ortho_thresh=2, type_thresh=2, colloc_thres=2,
+ sentstart_thresh=2):
"""
Allows memory use to be reduced after much training by removing data
about rare tokens that are unlikely to have a statistical effect with
self._type_fdist = self._freq_threshold(self._type_fdist, type_thresh)
self._collocation_fdist = self._freq_threshold(
- self._collocation_fdist, colloc_thres
- )
+ self._collocation_fdist, colloc_thres)
self._sent_starter_fdist = self._freq_threshold(
- self._sent_starter_fdist, sentstart_thresh
- )
+ self._sent_starter_fdist, sentstart_thresh)
def _freq_threshold(self, fdist, threshold):
"""
res[None] += num_removed
return res
- # ////////////////////////////////////////////////////////////
- # { Orthographic data
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
+ #{ Orthographic data
+ #////////////////////////////////////////////////////////////
def _get_orthography_data(self, tokens):
"""
positions.
"""
# 'initial' or 'internal' or 'unknown'
- context = "internal"
+ context = 'internal'
tokens = list(tokens)
for aug_tok in tokens:
# that it's a sentence break. But err on the side of
# caution (by not positing a sentence break) if we just
# saw an abbreviation.
- if aug_tok.parastart and context != "unknown":
- context = "initial"
+ if aug_tok.parastart and context != 'unknown':
+ context = 'initial'
# If we're at the beginning of a line, then we can't decide
# between 'internal' and 'initial'.
- if aug_tok.linestart and context == "internal":
- context = "unknown"
+ if aug_tok.linestart and context == 'internal':
+ context = 'unknown'
# Find the case-normalized type of the token. If it's a
# sentence-final token, strip off the period.
# Decide whether the next word is at a sentence boundary.
if aug_tok.sentbreak:
if not (aug_tok.is_number or aug_tok.is_initial):
- context = "initial"
+ context = 'initial'
else:
- context = "unknown"
+ context = 'unknown'
elif aug_tok.ellipsis or aug_tok.abbr:
- context = "unknown"
+ context = 'unknown'
else:
- context = "internal"
+ context = 'internal'
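# Editor's trace sketch (hypothetical tokens, following the branches above):
#   'runs.'  sentbreak, not a number/initial -> next context = 'initial'
#   'etc.'   abbreviation                    -> next context = 'unknown'
#   'dog'    plain token                     -> next context = 'internal'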
- # ////////////////////////////////////////////////////////////
- # { Abbreviations
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
+ #{ Abbreviations
+ #////////////////////////////////////////////////////////////
def _reclassify_abbrev_types(self, types):
"""
for typ in types:
# Check some basic conditions, to rule out words that are
# clearly not abbrev_types.
- if not _re_non_punct.search(typ) or typ == "##number##":
+ if not _re_non_punct.search(typ) or typ == '##number##':
continue
- if typ.endswith("."):
+ if typ.endswith('.'):
if typ in self._params.abbrev_types:
continue
typ = typ[:-1]
# Count how many periods & nonperiods are in the
# candidate.
- num_periods = typ.count(".") + 1
+ num_periods = typ.count('.') + 1
num_nonperiods = len(typ) - num_periods + 1
# Let <a> be the candidate without the period, and <b>
# indicates whether <ab> occurs as a single unit (high
# value of ll), or as two independent units <a> and
# <b> (low value of ll).
- count_with_period = self._type_fdist[typ + "."]
+ count_with_period = self._type_fdist[typ + '.']
count_without_period = self._type_fdist[typ]
ll = self._dunning_log_likelihood(
count_with_period + count_without_period,
- self._num_period_toks,
- count_with_period,
- self._type_fdist.N(),
- )
+ self._num_period_toks, count_with_period,
+ self._type_fdist.N())
# Apply three scaling factors to 'tweak' the basic log
# likelihood ratio:
# F_penalty: penalize occurrences w/o a period
f_length = math.exp(-num_nonperiods)
f_periods = num_periods
- f_penalty = int(self.IGNORE_ABBREV_PENALTY) or math.pow(
- num_nonperiods, -count_without_period
- )
+ f_penalty = (int(self.IGNORE_ABBREV_PENALTY)
+ or math.pow(num_nonperiods, -count_without_period))
score = ll * f_length * f_periods * f_penalty
yield typ, score, is_add
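# Editor's worked sketch of the scaling above (made-up candidate 'e.g', ll=30):
#   num_periods = 2, num_nonperiods = 2, count_without_period = 0
#   f_length = exp(-2) ~ 0.135; f_periods = 2; f_penalty = 2**-0 = 1
#   score ~ 30 * 0.135 * 2 * 1 ~ 8.1  (>= the ABBREV cut-off of 0.3, so kept)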
This fails to include abbreviations otherwise found as "rare".
"""
self._params.clear_abbrevs()
- tokens = (typ for typ in self._type_fdist if typ and typ.endswith("."))
+ tokens = (typ for typ in self._type_fdist if typ and typ.endswith('.'))
for abbr, score, is_add in self._reclassify_abbrev_types(tokens):
if score >= self.ABBREV:
self._params.abbrev_types.add(abbr)
# Proceed only if the type hasn't been categorized as an
# abbreviation already, and is sufficiently rare...
count = self._type_fdist[typ] + self._type_fdist[typ[:-1]]
- if typ in self._params.abbrev_types or count >= self.ABBREV_BACKOFF:
+ if (typ in self._params.abbrev_types or count >= self.ABBREV_BACKOFF):
return False
# Record this token as an abbreviation if the next
elif next_tok.first_lower:
typ2 = next_tok.type_no_sentperiod
typ2ortho_context = self._params.ortho_context[typ2]
- if (typ2ortho_context & _ORTHO_BEG_UC) and not (
- typ2ortho_context & _ORTHO_MID_UC
- ):
+ if ( (typ2ortho_context & _ORTHO_BEG_UC) and
+ not (typ2ortho_context & _ORTHO_MID_UC) ):
return True
- # ////////////////////////////////////////////////////////////
- # { Log Likelihoods
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
+ #{ Log Likelihoods
+ #////////////////////////////////////////////////////////////
# helper for _reclassify_abbrev_types:
@staticmethod
p1 = count_b / N
p2 = 0.99
- null_hypo = count_ab * math.log(p1) + (count_a - count_ab) * math.log(1.0 - p1)
- alt_hypo = count_ab * math.log(p2) + (count_a - count_ab) * math.log(1.0 - p2)
+ null_hypo = (count_ab * math.log(p1) +
+ (count_a - count_ab) * math.log(1.0 - p1))
+ alt_hypo = (count_ab * math.log(p2) +
+ (count_a - count_ab) * math.log(1.0 - p2))
likelihood = null_hypo - alt_hypo
- return -2.0 * likelihood
+ return (-2.0 * likelihood)
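# Editor's worked sketch of the ratio above (made-up counts):
#   count_a = 20 occurrences of the candidate, count_ab = 18 of them period-final,
#   count_b = 1000 period-final tokens overall, N = 10000 tokens
#   p1 = 1000/10000 = 0.1; p2 = 0.99 (the abbreviation hypothesis)
#   null_hypo = 18*log(0.1) + 2*log(0.9)   ~ -41.7
#   alt_hypo  = 18*log(0.99) + 2*log(0.01) ~ -9.4
#   -2.0 * (null_hypo - alt_hypo)          ~ 64.5, strongly favouring p2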
@staticmethod
def _col_log_likelihood(count_a, count_b, count_ab, N):
p2 = 1
try:
- summand1 = count_ab * math.log(p) + (count_a - count_ab) * math.log(1.0 - p)
+ summand1 = (count_ab * math.log(p) +
+ (count_a - count_ab) * math.log(1.0 - p))
except ValueError as e:
summand1 = 0
try:
- summand2 = (count_b - count_ab) * math.log(p) + (
- N - count_a - count_b + count_ab
- ) * math.log(1.0 - p)
+ summand2 = ((count_b - count_ab) * math.log(p) +
+ (N - count_a - count_b + count_ab) * math.log(1.0 - p))
except ValueError as e:
summand2 = 0
if count_a == count_ab or p1 <= 0 or p1 >= 1:
summand3 = 0
else:
- summand3 = count_ab * math.log(p1) + (count_a - count_ab) * math.log(
- 1.0 - p1
- )
+ summand3 = (count_ab * math.log(p1) +
+ (count_a - count_ab) * math.log(1.0 - p1))
if count_b == count_ab or p2 <= 0 or p2 >= 1:
summand4 = 0
else:
- summand4 = (count_b - count_ab) * math.log(p2) + (
- N - count_a - count_b + count_ab
- ) * math.log(1.0 - p2)
+ summand4 = ((count_b - count_ab) * math.log(p2) +
+ (N - count_a - count_b + count_ab) * math.log(1.0 - p2))
likelihood = summand1 + summand2 - summand3 - summand4
- return -2.0 * likelihood
+ return (-2.0 * likelihood)
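# Editor's note: unlike _dunning_log_likelihood's fixed p2 = 0.99, the ratio
# above is Dunning's (1993) two-sample form: it compares one shared rate
# p = count_b / N against the separate rates p1 = count_ab / count_a and
# p2 = (count_b - count_ab) / (N - count_a) estimated from the data (the
# p2 = 1 fallback visible above guards the zero-division case).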
- # ////////////////////////////////////////////////////////////
- # { Collocation Finder
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
+ #{ Collocation Finder
+ #////////////////////////////////////////////////////////////
def _is_potential_collocation(self, aug_tok1, aug_tok2):
"""
Returns True if the pair of tokens may form a collocation given
log-likelihood statistics.
"""
- return (
- (
- self.INCLUDE_ALL_COLLOCS
- or (self.INCLUDE_ABBREV_COLLOCS and aug_tok1.abbr)
- or (aug_tok1.sentbreak and (aug_tok1.is_number or aug_tok1.is_initial))
- )
- and aug_tok1.is_non_punct
- and aug_tok2.is_non_punct
- )
+ return ((self.INCLUDE_ALL_COLLOCS or
+ (self.INCLUDE_ABBREV_COLLOCS and aug_tok1.abbr) or
+ (aug_tok1.sentbreak and
+ (aug_tok1.is_number or aug_tok1.is_initial)))
+ and aug_tok1.is_non_punct
+ and aug_tok2.is_non_punct)
def _find_collocations(self):
"""
continue
col_count = self._collocation_fdist[types]
- typ1_count = self._type_fdist[typ1] + self._type_fdist[typ1 + "."]
- typ2_count = self._type_fdist[typ2] + self._type_fdist[typ2 + "."]
- if (
- typ1_count > 1
- and typ2_count > 1
- and self.MIN_COLLOC_FREQ < col_count <= min(typ1_count, typ2_count)
- ):
-
- ll = self._col_log_likelihood(
- typ1_count, typ2_count, col_count, self._type_fdist.N()
- )
+ typ1_count = self._type_fdist[typ1]+self._type_fdist[typ1+'.']
+ typ2_count = self._type_fdist[typ2]+self._type_fdist[typ2+'.']
+ if (typ1_count > 1 and typ2_count > 1
+ and self.MIN_COLLOC_FREQ <
+ col_count <= min(typ1_count, typ2_count)):
+
+ ll = self._col_log_likelihood(typ1_count, typ2_count,
+ col_count, self._type_fdist.N())
# Filter out the not-so-collocative
- if ll >= self.COLLOCATION and (
- self._type_fdist.N() / typ1_count > typ2_count / col_count
- ):
+ if (ll >= self.COLLOCATION and
+ (self._type_fdist.N()/typ1_count >
+ typ2_count/col_count)):
yield (typ1, typ2), ll
- # ////////////////////////////////////////////////////////////
- # { Sentence-Starter Finder
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
+ #{ Sentence-Starter Finder
+ #////////////////////////////////////////////////////////////
def _is_potential_sent_starter(self, cur_tok, prev_tok):
"""
# If a token (i) is preceded by a sentence break that is
# not a potential ordinal number or initial, and (ii) is
# alphabetic, then it is a sentence-starter.
- return (
- prev_tok.sentbreak
- and not (prev_tok.is_number or prev_tok.is_initial)
- and cur_tok.is_alpha
- )
+ return ( prev_tok.sentbreak and
+ not (prev_tok.is_number or prev_tok.is_initial) and
+ cur_tok.is_alpha )
def _find_sent_starters(self):
"""
continue
typ_at_break_count = self._sent_starter_fdist[typ]
- typ_count = self._type_fdist[typ] + self._type_fdist[typ + "."]
+ typ_count = self._type_fdist[typ]+self._type_fdist[typ+'.']
if typ_count < typ_at_break_count:
# needed after freq_threshold
continue
- ll = self._col_log_likelihood(
- self._sentbreak_count,
- typ_count,
- typ_at_break_count,
- self._type_fdist.N(),
- )
+ ll = self._col_log_likelihood(self._sentbreak_count, typ_count,
+ typ_at_break_count,
+ self._type_fdist.N())
+
+ if (ll >= self.SENT_STARTER and
+ self._type_fdist.N()/self._sentbreak_count >
+ typ_count/typ_at_break_count):
- if (
- ll >= self.SENT_STARTER
- and self._type_fdist.N() / self._sentbreak_count
- > typ_count / typ_at_break_count
- ):
yield typ, ll
def _get_sentbreak_count(self, tokens):
######################################################################
-# { Punkt Sentence Tokenizer
+#{ Punkt Sentence Tokenizer
######################################################################
-class PunktSentenceTokenizer(PunktBaseClass, TokenizerI):
+class PunktSentenceTokenizer(PunktBaseClass,TokenizerI):
"""
A sentence tokenizer which uses an unsupervised algorithm to build
a model for abbreviation words, collocations, and words that start
This approach has been shown to work well for many European
languages.
"""
-
- def __init__(
- self, train_text=None, verbose=False, lang_vars=None, token_cls=PunktToken
- ):
+ def __init__(self, train_text=None, verbose=False,
+ lang_vars=PunktLanguageVars(), token_cls=PunktToken):
"""
train_text can either be the sole training text for this sentence
boundary detector, or can be a PunktParameters object.
"""
- PunktBaseClass.__init__(self, lang_vars=lang_vars, token_cls=token_cls)
+ PunktBaseClass.__init__(self, lang_vars=lang_vars,
+ token_cls=token_cls)
if train_text:
self._params = self.train(train_text, verbose)
given. Repeated calls to this method destroy previous parameters. For
incremental training, instantiate a separate PunktTrainer instance.
"""
- if not isinstance(train_text, str):
+ if not isinstance(train_text, string_types):
return train_text
- return PunktTrainer(
- train_text, lang_vars=self._lang_vars, token_cls=self._Token
- ).get_params()
+ return PunktTrainer(train_text, lang_vars=self._lang_vars,
+ token_cls=self._Token).get_params()
- # ////////////////////////////////////////////////////////////
- # { Tokenization
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
+ #{ Tokenization
+ #////////////////////////////////////////////////////////////
def tokenize(self, text, realign_boundaries=True):
"""
"""
for match in self._lang_vars.period_context_re().finditer(text):
- decision_text = match.group() + match.group("after_tok")
+ decision_text = match.group() + match.group('after_tok')
tokens = self._tokenize_words(decision_text)
tokens = list(self._annotate_first_pass(tokens))
while not tokens[0].period_final:
tokens.pop(0)
- yield dict(
- period_index=match.end() - 1,
+ yield dict(period_index=match.end() - 1,
text=decision_text,
type1=tokens[0].type,
type2=tokens[1].type,
type1_in_abbrs=bool(tokens[0].abbr),
type1_is_initial=bool(tokens[0].is_initial),
- type2_is_sent_starter=tokens[1].type_no_sentperiod
- in self._params.sent_starters,
+ type2_is_sent_starter=tokens[1].type_no_sentperiod in self._params.sent_starters,
type2_ortho_heuristic=self._ortho_heuristic(tokens[1]),
- type2_ortho_contexts=set(
- self._params._debug_ortho_context(tokens[1].type_no_sentperiod)
- ),
- collocation=(tokens[0].type_no_sentperiod, tokens[1].type_no_sentperiod)
- in self._params.collocations,
- reason=self._second_pass_annotation(tokens[0], tokens[1])
- or REASON_DEFAULT_DECISION,
+ type2_ortho_contexts=set(self._params._debug_ortho_context(tokens[1].type_no_sentperiod)),
+ collocation=(tokens[0].type_no_sentperiod, tokens[1].type_no_sentperiod) in self._params.collocations,
+
+ reason=self._second_pass_annotation(tokens[0], tokens[1]) or REASON_DEFAULT_DECISION,
break_decision=tokens[0].sentbreak,
)
def span_tokenize(self, text, realign_boundaries=True):
"""
- Given a text, generates (start, end) spans of sentences
+ Given a text, returns a list of the (start, end) spans of sentences
in the text.
"""
slices = self._slices_from_text(text)
if realign_boundaries:
slices = self._realign_boundaries(text, slices)
- for sl in slices:
- yield (sl.start, sl.stop)
+ return [(sl.start, sl.stop) for sl in slices]
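# Editor's usage sketch (untrained defaults; output illustrative):
#   pst = PunktSentenceTokenizer()
#   pst.span_tokenize('One. Two.')   # -> [(0, 4), (5, 9)]
# Note this hunk also changes span_tokenize from lazily yielding spans to
# returning a fully built list.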
def sentences_from_text(self, text, realign_boundaries=True):
"""
def _slices_from_text(self, text):
last_break = 0
for match in self._lang_vars.period_context_re().finditer(text):
- context = match.group() + match.group("after_tok")
+ context = match.group() + match.group('after_tok')
if self.text_contains_sentbreak(context):
yield slice(last_break, match.end())
- if match.group("next_tok"):
+ if match.group('next_tok'):
# next sentence starts after whitespace
- last_break = match.start("next_tok")
+ last_break = match.start('next_tok')
else:
# next sentence starts at following punctuation
last_break = match.end()
"""
Returns True if the given text includes a sentence break.
"""
- found = False # used to ignore last token
+ found = False # used to ignore last token
for t in self._annotate_tokens(self._tokenize_words(text)):
if found:
return True
tokens = self._annotate_second_pass(tokens)
## [XX] TESTING
- # tokens = list(tokens)
- # self.dump(tokens)
+ #tokens = list(tokens)
+ #self.dump(tokens)
return tokens
pos = 0
# A regular expression that finds pieces of whitespace:
- WS_REGEXP = re.compile(r"\s*")
+ WS_REGEXP = re.compile(r'\s*')
- sentence = ""
+ sentence = ''
for aug_tok in tokens:
tok = aug_tok.tok
# that contain whitespace in the source text. If our
# token doesn't match, see if adding whitespace helps.
# If so, then use the version with whitespace.
- if text[pos : pos + len(tok)] != tok:
- pat = "\s*".join(re.escape(c) for c in tok)
- m = re.compile(pat).match(text, pos)
- if m:
- tok = m.group()
+ if text[pos:pos+len(tok)] != tok:
+ pat = '\s*'.join(re.escape(c) for c in tok)
+ m = re.compile(pat).match(text,pos)
+ if m: tok = m.group()
# Move our position pointer to the end of the token.
- assert text[pos : pos + len(tok)] == tok
+ assert text[pos:pos+len(tok)] == tok
pos += len(tok)
# Add this token. If it's not at the beginning of the
# If we're at a sentence break, then start a new sentence.
if aug_tok.sentbreak:
yield sentence
- sentence = ""
+ sentence = ''
# If the last sentence is empty, discard it.
if sentence:
# [XX] TESTING
def dump(self, tokens):
- print("writing to /tmp/punkt.new...")
- with open("/tmp/punkt.new", "w") as outfile:
+ print('writing to /tmp/punkt.new...')
+ with open('/tmp/punkt.new', 'w') as outfile:
for aug_tok in tokens:
if aug_tok.parastart:
- outfile.write("\n\n")
+ outfile.write('\n\n')
elif aug_tok.linestart:
- outfile.write("\n")
+ outfile.write('\n')
else:
- outfile.write(" ")
+ outfile.write(' ')
outfile.write(str(aug_tok))
- # ////////////////////////////////////////////////////////////
- # { Customization Variables
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
+ #{ Customization Variables
+ #////////////////////////////////////////////////////////////
- PUNCTUATION = tuple(";:,.!?")
+ PUNCTUATION = tuple(';:,.!?')
- # ////////////////////////////////////////////////////////////
- # { Annotation Procedures
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
+ #{ Annotation Procedures
+ #////////////////////////////////////////////////////////////
def _annotate_second_pass(self, tokens):
"""
# [4.2. Token-Based Reclassification of Abbreviations] If
# the token is an abbreviation or an ellipsis, then decide
# whether we should *also* classify it as a sentbreak.
- if (aug_tok1.abbr or aug_tok1.ellipsis) and (not tok_is_initial):
+ if ( (aug_tok1.abbr or aug_tok1.ellipsis) and
+ (not tok_is_initial) ):
# [4.1.1. Orthographic Heuristic] Check if there's
# orthographic evidence about whether the next word
# starts a sentence or not.
# next word is capitalized, and is a member of the
# frequent-sentence-starters list, then label tok as a
# sentence break.
- if aug_tok2.first_upper and next_typ in self._params.sent_starters:
+ if ( aug_tok2.first_upper and
+ next_typ in self._params.sent_starters):
aug_tok1.sentbreak = True
return REASON_ABBR_WITH_SENTENCE_STARTER
# [4.3. Token-Based Detection of Initials and Ordinals]
# Check if any initials or ordinals tokens that are marked
# as sentbreaks should be reclassified as abbreviations.
- if tok_is_initial or typ == "##number##":
+ if tok_is_initial or typ == '##number##':
# [4.1.1. Orthographic Heuristic] Check if there's
# orthographic evidence about whether the next word
# Special heuristic for initials: if the orthographic
# heuristic is unknown, and the next word is always
# capitalized, then mark as an abbreviation (e.g. J. Bach).
- if (
- is_sent_starter == "unknown"
- and tok_is_initial
- and aug_tok2.first_upper
- and not (self._params.ortho_context[next_typ] & _ORTHO_LC)
- ):
+ if ( is_sent_starter == 'unknown' and tok_is_initial and
+ aug_tok2.first_upper and
+ not (self._params.ortho_context[next_typ] & _ORTHO_LC) ):
aug_tok1.sentbreak = False
aug_tok1.abbr = True
return REASON_INITIAL_WITH_SPECIAL_ORTHOGRAPHIC_HEURISTIC
# If the word is capitalized, occurs at least once with a
# lower case first letter, and never occurs with an upper case
# first letter sentence-internally, then it's a sentence starter.
- if (
- aug_tok.first_upper
- and (ortho_context & _ORTHO_LC)
- and not (ortho_context & _ORTHO_MID_UC)
- ):
+ if ( aug_tok.first_upper and
+ (ortho_context & _ORTHO_LC) and
+ not (ortho_context & _ORTHO_MID_UC) ):
return True
# If the word is lower case, and either (a) we've seen it used
# with upper case, or (b) we've never seen it used
# sentence-initially with lower case, then it's not a sentence
# starter.
- if aug_tok.first_lower and (
- (ortho_context & _ORTHO_UC) or not (ortho_context & _ORTHO_BEG_LC)
- ):
+ if ( aug_tok.first_lower and
+ ((ortho_context & _ORTHO_UC) or
+ not (ortho_context & _ORTHO_BEG_LC)) ):
return False
# Otherwise, we're not sure.
- return "unknown"
+ return 'unknown'
-DEBUG_DECISION_FMT = """Text: %(text)r (at offset %(period_index)d)
+DEBUG_DECISION_FMT = '''Text: %(text)r (at offset %(period_index)d)
Sentence break? %(break_decision)s (%(reason)s)
Collocation? %(collocation)s
%(type1)r:
known sentence starter: %(type2_is_sent_starter)s
orthographic heuristic suggests is a sentence starter? %(type2_ortho_heuristic)s
orthographic contexts in training: %(type2_ortho_contexts)s
-"""
-
-
+'''
def format_debug_decision(d):
return DEBUG_DECISION_FMT % d
-
def demo(text, tok_cls=PunktSentenceTokenizer, train_cls=PunktTrainer):
"""Builds a punkt model and applies it to the same text"""
- cleanup = (
- lambda s: re.compile(r"(?:\r|^\s+)", re.MULTILINE).sub("", s).replace("\n", " ")
- )
+ cleanup = lambda s: re.compile(r'(?:\r|^\s+)', re.MULTILINE).sub('', s).replace('\n', ' ')
trainer = train_cls()
trainer.INCLUDE_ALL_COLLOCS = True
trainer.train(text)
# Natural Language Toolkit: Tokenizers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# Trevor Cohn <tacohn@csse.unimelb.edu.au>
``re`` functions, where the pattern is always the first argument.
(This is for consistency with the other NLTK tokenizers.)
"""
+from __future__ import unicode_literals
import re
from nltk.tokenize.api import TokenizerI
from nltk.tokenize.util import regexp_span_tokenize
+from nltk.compat import python_2_unicode_compatible
-
+@python_2_unicode_compatible
class RegexpTokenizer(TokenizerI):
"""
A tokenizer that splits a string using a regular expression, which
used: `re.UNICODE | re.MULTILINE | re.DOTALL`.
"""
-
- def __init__(
- self,
- pattern,
- gaps=False,
- discard_empty=True,
- flags=re.UNICODE | re.MULTILINE | re.DOTALL,
- ):
+ def __init__(self, pattern, gaps=False, discard_empty=True,
+ flags=re.UNICODE | re.MULTILINE | re.DOTALL):
# If they gave us a regexp object, extract the pattern.
- pattern = getattr(pattern, "pattern", pattern)
+ pattern = getattr(pattern, 'pattern', pattern)
self._pattern = pattern
self._gaps = gaps
self._discard_empty = discard_empty
self._flags = flags
self._regexp = None
-
+
def _check_regexp(self):
if self._regexp is None:
self._regexp = re.compile(self._pattern, self._flags)
-
+
def tokenize(self, text):
self._check_regexp()
# If our regexp matches gaps, use re.split:
yield m.span()
def __repr__(self):
- return "%s(pattern=%r, gaps=%r, discard_empty=%r, flags=%r)" % (
- self.__class__.__name__,
- self._pattern,
- self._gaps,
- self._discard_empty,
- self._flags,
- )
-
+ return ('%s(pattern=%r, gaps=%r, discard_empty=%r, flags=%r)' %
+ (self.__class__.__name__, self._pattern, self._gaps,
+ self._discard_empty, self._flags))
class WhitespaceTokenizer(RegexpTokenizer):
r"""
"""
def __init__(self):
- RegexpTokenizer.__init__(self, r"\s+", gaps=True)
-
+ RegexpTokenizer.__init__(self, r'\s+', gaps=True)
class BlanklineTokenizer(RegexpTokenizer):
"""
Blank lines are defined as lines containing no characters, except for
space or tab characters.
"""
-
def __init__(self):
- RegexpTokenizer.__init__(self, r"\s*\n\s*\n\s*", gaps=True)
-
+ RegexpTokenizer.__init__(self, r'\s*\n\s*\n\s*', gaps=True)
class WordPunctTokenizer(RegexpTokenizer):
"""
['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York',
'.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
"""
-
def __init__(self):
- RegexpTokenizer.__init__(self, r"\w+|[^\w\s]+")
-
+ RegexpTokenizer.__init__(self, r'\w+|[^\w\s]+')
######################################################################
-# { Tokenization Functions
+#{ Tokenization Functions
######################################################################
-
-def regexp_tokenize(
- text,
- pattern,
- gaps=False,
- discard_empty=True,
- flags=re.UNICODE | re.MULTILINE | re.DOTALL,
-):
+def regexp_tokenize(text, pattern, gaps=False, discard_empty=True,
+ flags=re.UNICODE | re.MULTILINE | re.DOTALL):
"""
Return a tokenized copy of *text*. See :class:`.RegexpTokenizer`
for descriptions of the arguments.
tokenizer = RegexpTokenizer(pattern, gaps, discard_empty, flags)
return tokenizer.tokenize(text)
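# Editor's usage sketch for the function above (pattern and text chosen to
# mirror the style of the doctests elsewhere in this file):
#   regexp_tokenize("Good muffins cost $3.88\nin New York.",
#                   pattern=r'\w+|\$[\d\.]+|\S+')
#   # -> ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.']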
-
blankline_tokenize = BlanklineTokenizer().tokenize
wordpunct_tokenize = WordPunctTokenizer().tokenize
+
+
+
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import unicode_literals, print_function
+from six import text_type
+
import os
import re
import sys
import subprocess
import tempfile
+
from nltk.data import ZipFilePathPointer
from nltk.internals import find_dir
from nltk.tokenize.api import TokenizerI
-
class ReppTokenizer(TokenizerI):
"""
A class for word tokenization using the REPP parser described in
- Rebecca Dridan and Stephan Oepen (2012) Tokenization: Returning to a
- Long Solved Problem - A Survey, Contrastive Experiment, Recommendations,
+ Rebecca Dridan and Stephan Oepen (2012) Tokenization: Returning to a
+ Long Solved Problem - A Survey, Contrastive Experiment, Recommendations,
and Toolkit. In ACL. http://anthology.aclweb.org/P/P12/P12-2.pdf#page=406
>>> sents = ['Tokenization is widely regarded as a solved problem due to the high accuracy that rulebased tokenizers achieve.' ,
>>> tokenizer = ReppTokenizer('/home/alvas/repp/') # doctest: +SKIP
>>> for sent in sents: # doctest: +SKIP
... tokenizer.tokenize(sent) # doctest: +SKIP
- ...
+ ...
(u'Tokenization', u'is', u'widely', u'regarded', u'as', u'a', u'solved', u'problem', u'due', u'to', u'the', u'high', u'accuracy', u'that', u'rulebased', u'tokenizers', u'achieve', u'.')
(u'But', u'rule-based', u'tokenizers', u'are', u'hard', u'to', u'maintain', u'and', u'their', u'rules', u'language', u'specific', u'.')
(u'We', u'evaluated', u'our', u'method', u'on', u'three', u'languages', u'and', u'obtained', u'error', u'rates', u'of', u'0.27', u'%', u'(', u'English', u')', u',', u'0.35', u'%', u'(', u'Dutch', u')', u'and', u'0.76', u'%', u'(', u'Italian', u')', u'for', u'our', u'best', u'models', u'.')
>>> for sent in tokenizer.tokenize_sents(sents): # doctest: +SKIP
- ... print(sent) # doctest: +SKIP
- ...
+ ... print sent # doctest: +SKIP
+ ...
(u'Tokenization', u'is', u'widely', u'regarded', u'as', u'a', u'solved', u'problem', u'due', u'to', u'the', u'high', u'accuracy', u'that', u'rulebased', u'tokenizers', u'achieve', u'.')
(u'But', u'rule-based', u'tokenizers', u'are', u'hard', u'to', u'maintain', u'and', u'their', u'rules', u'language', u'specific', u'.')
(u'We', u'evaluated', u'our', u'method', u'on', u'three', u'languages', u'and', u'obtained', u'error', u'rates', u'of', u'0.27', u'%', u'(', u'English', u')', u',', u'0.35', u'%', u'(', u'Dutch', u')', u'and', u'0.76', u'%', u'(', u'Italian', u')', u'for', u'our', u'best', u'models', u'.')
>>> for sent in tokenizer.tokenize_sents(sents, keep_token_positions=True): # doctest: +SKIP
- ... print(sent) # doctest: +SKIP
- ...
+ ... print sent # doctest: +SKIP
+ ...
[(u'Tokenization', 0, 12), (u'is', 13, 15), (u'widely', 16, 22), (u'regarded', 23, 31), (u'as', 32, 34), (u'a', 35, 36), (u'solved', 37, 43), (u'problem', 44, 51), (u'due', 52, 55), (u'to', 56, 58), (u'the', 59, 62), (u'high', 63, 67), (u'accuracy', 68, 76), (u'that', 77, 81), (u'rulebased', 82, 91), (u'tokenizers', 92, 102), (u'achieve', 103, 110), (u'.', 110, 111)]
[(u'But', 0, 3), (u'rule-based', 4, 14), (u'tokenizers', 15, 25), (u'are', 26, 29), (u'hard', 30, 34), (u'to', 35, 37), (u'maintain', 38, 46), (u'and', 47, 50), (u'their', 51, 56), (u'rules', 57, 62), (u'language', 63, 71), (u'specific', 72, 80), (u'.', 80, 81)]
[(u'We', 0, 2), (u'evaluated', 3, 12), (u'our', 13, 16), (u'method', 17, 23), (u'on', 24, 26), (u'three', 27, 32), (u'languages', 33, 42), (u'and', 43, 46), (u'obtained', 47, 55), (u'error', 56, 61), (u'rates', 62, 67), (u'of', 68, 70), (u'0.27', 71, 75), (u'%', 75, 76), (u'(', 77, 78), (u'English', 78, 85), (u')', 85, 86), (u',', 86, 87), (u'0.35', 88, 92), (u'%', 92, 93), (u'(', 94, 95), (u'Dutch', 95, 100), (u')', 100, 101), (u'and', 102, 105), (u'0.76', 106, 110), (u'%', 110, 111), (u'(', 112, 113), (u'Italian', 113, 120), (u')', 120, 121), (u'for', 122, 125), (u'our', 126, 129), (u'best', 130, 134), (u'models', 135, 141), (u'.', 141, 142)]
"""
-
- def __init__(self, repp_dir, encoding="utf8"):
+ def __init__(self, repp_dir, encoding='utf8'):
self.repp_dir = self.find_repptokenizer(repp_dir)
- # Set a directory to store the temporary files.
+ # Set a directory to store the temporary files.
self.working_dir = tempfile.gettempdir()
# Set an encoding for the input strings.
self.encoding = encoding
-
+
def tokenize(self, sentence):
"""
- Use Repp to tokenize a single sentence.
-
+ Use Repp to tokenize a single sentence.
+
:param sentence: A single sentence string.
:type sentence: str
- :return: A tuple of tokens.
+ :return: A tuple of tokens.
:rtype: tuple(str)
"""
return next(self.tokenize_sents([sentence]))
-
+
def tokenize_sents(self, sentences, keep_token_positions=False):
"""
Tokenize multiple sentences using Repp.
-
+
:param sentences: A list of sentence strings.
:type sentences: list(str)
:return: A list of tuples of tokens
:rtype: iter(tuple(str))
"""
- with tempfile.NamedTemporaryFile(
- prefix="repp_input.", dir=self.working_dir, mode="w", delete=False
- ) as input_file:
+ with tempfile.NamedTemporaryFile(prefix='repp_input.',
+ dir=self.working_dir, mode='w', delete=False) as input_file:
# Write sentences to temporary input file.
for sent in sentences:
- input_file.write(str(sent) + "\n")
+ input_file.write(text_type(sent) + '\n')
input_file.close()
- # Generate command to run REPP.
- cmd = self.generate_repp_command(input_file.name)
+ # Generate command to run REPP.
+ cmd = self.generate_repp_command(input_file.name)
# Decode the stdout and strips the ending newline.
repp_output = self._execute(cmd).decode(self.encoding).strip()
for tokenized_sent in self.parse_repp_outputs(repp_output):
if not keep_token_positions:
# Removes token position information.
tokenized_sent, starts, ends = zip(*tokenized_sent)
- yield tokenized_sent
-
+ yield tokenized_sent
+
def generate_repp_command(self, inputfilename):
"""
This method generates the REPP command line that is executed via subprocess.
-
+
:param inputfilename: path to the input file
:type inputfilename: str
"""
- cmd = [self.repp_dir + "/src/repp"]
- cmd += ["-c", self.repp_dir + "/erg/repp.set"]
- cmd += ["--format", "triple"]
- cmd += [inputfilename]
- return cmd
+ cmd = [self.repp_dir + '/src/repp']
+ cmd+= ['-c', self.repp_dir + '/erg/repp.set']
+ cmd+= ['--format', 'triple']
+ cmd+= [inputfilename]
+ return cmd
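# Editor's sketch: with repp_dir = '/home/alvas/repp' (as in the class
# docstring) and a hypothetical input file 'input.txt', the list built
# above is
#   ['/home/alvas/repp/src/repp', '-c', '/home/alvas/repp/erg/repp.set',
#    '--format', 'triple', 'input.txt']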
@staticmethod
def _execute(cmd):
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = p.communicate()
return stdout
-
- @staticmethod
+
+ @staticmethod
def parse_repp_outputs(repp_output):
"""
This method parses the tri-tuple format that REPP outputs when the
"--format triple" option is used, and returns a generator of tuples
of string tokens.
-
+
:param repp_output: the raw output of the REPP subprocess
:type repp_output: str
:return: an iterable of the tokenized sentences as tuples of strings
:rtype: iter(tuple)
"""
- line_regex = re.compile("^\((\d+), (\d+), (.+)\)$", re.MULTILINE)
- for section in repp_output.split("\n\n"):
- words_with_positions = [
- (token, int(start), int(end))
- for start, end, token in line_regex.findall(section)
- ]
+ line_regex = re.compile('^\((\d+), (\d+), (.+)\)$', re.MULTILINE)
+ for section in repp_output.split('\n\n'):
+ words_with_positions = [(token, int(start), int(end))
+ for start, end, token in
+ line_regex.findall(section)]
words = tuple(t[0] for t in words_with_positions)
yield words_with_positions
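# Editor's sketch of the "--format triple" lines parsed above (hypothetical
# REPP output for one section):
#   (0, 12, Tokenization)
#   (13, 15, is)
# yields words_with_positions = [('Tokenization', 0, 12), ('is', 13, 15)]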
-
+
def find_repptokenizer(self, repp_dirname):
"""
A helper to find the REPP tokenizer binary and its *repp.set* config file.
"""
- if os.path.exists(repp_dirname): # If a full path is given.
+ if os.path.exists(repp_dirname): # If a full path is given.
_repp_dir = repp_dirname
- else: # Try to find path to REPP directory in environment variables.
- _repp_dir = find_dir(repp_dirname, env_vars=("REPP_TOKENIZER",))
+ else: # Try to find path to REPP directory in environment variables.
+ _repp_dir = find_dir(repp_dirname, env_vars=('REPP_TOKENIZER',))
# Checks for the REPP binary and erg/repp.set config file.
- assert os.path.exists(_repp_dir + "/src/repp")
- assert os.path.exists(_repp_dir + "/erg/repp.set")
+ assert os.path.exists(_repp_dir+'/src/repp')
+ assert os.path.exists(_repp_dir+'/erg/repp.set')
return _repp_dir
# Natural Language Toolkit: Tokenizers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Yoav Goldberg <yoavg@cs.bgu.ac.il>
# Steven Bird <stevenbird1@gmail.com> (minor edits)
# URL: <http://nltk.sourceforge.net>
from nltk.tokenize.api import TokenizerI
-
class SExprTokenizer(TokenizerI):
"""
A tokenizer that divides strings into s-expressions.
:param strict: If true, then raise an exception when tokenizing an ill-formed sexpr.
"""
- def __init__(self, parens="()", strict=True):
+ def __init__(self, parens='()', strict=True):
if len(parens) != 2:
- raise ValueError("parens must contain exactly two strings")
+ raise ValueError('parens must contain exactly two strings')
self._strict = strict
self._open_paren = parens[0]
self._close_paren = parens[1]
- self._paren_regexp = re.compile(
- "%s|%s" % (re.escape(parens[0]), re.escape(parens[1]))
- )
+ self._paren_regexp = re.compile('%s|%s' % (re.escape(parens[0]),
+ re.escape(parens[1])))
def tokenize(self, text):
"""
for m in self._paren_regexp.finditer(text):
paren = m.group()
if depth == 0:
- result += text[pos : m.start()].split()
+ result += text[pos:m.start()].split()
pos = m.start()
if paren == self._open_paren:
depth += 1
if paren == self._close_paren:
if self._strict and depth == 0:
- raise ValueError("Un-matched close paren at char %d" % m.start())
- depth = max(0, depth - 1)
+ raise ValueError('Un-matched close paren at char %d'
+ % m.start())
+ depth = max(0, depth-1)
if depth == 0:
- result.append(text[pos : m.end()])
+ result.append(text[pos:m.end()])
pos = m.end()
if self._strict and depth > 0:
- raise ValueError("Un-matched open paren at char %d" % pos)
+ raise ValueError('Un-matched open paren at char %d' % pos)
if pos < len(text):
result.append(text[pos:])
return result
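# Editor's usage sketch for the method above (output checked by hand):
#   SExprTokenizer().tokenize('(a b (c d)) e f (g)')
#   # -> ['(a b (c d))', 'e', 'f', '(g)']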
-
sexpr_tokenize = SExprTokenizer().tokenize
+
+
+
+
# Natural Language Toolkit: Simple Tokenizers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.sourceforge.net>
to specify the tokenization conventions when building a `CorpusReader`.
"""
-
+from __future__ import unicode_literals
from nltk.tokenize.api import TokenizerI, StringTokenizer
from nltk.tokenize.util import string_span_tokenize, regexp_span_tokenize
-
class SpaceTokenizer(StringTokenizer):
r"""Tokenize a string using the space character as a delimiter,
which is the same as ``s.split(' ')``.
'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
"""
- _string = " "
-
+ _string = ' '
class TabTokenizer(StringTokenizer):
r"""Tokenize a string use the tab character as a delimiter,
['a', 'b c\n', ' d']
"""
- _string = "\t"
-
+ _string = '\t'
class CharTokenizer(StringTokenizer):
"""Tokenize a string into individual characters. If this functionality
for i, j in enumerate(range(1, len(s) + 1)):
yield i, j
-
class LineTokenizer(TokenizerI):
r"""Tokenize a string into its lines, optionally discarding blank lines.
This is similar to ``s.split('\n')``.
a corresponding token ``''`` after that newline.
"""
- def __init__(self, blanklines="discard"):
- valid_blanklines = ("discard", "keep", "discard-eof")
+ def __init__(self, blanklines='discard'):
+ valid_blanklines = ('discard', 'keep', 'discard-eof')
if blanklines not in valid_blanklines:
- raise ValueError(
- "Blank lines must be one of: %s" % " ".join(valid_blanklines)
- )
+ raise ValueError('Blank lines must be one of: %s' %
+ ' '.join(valid_blanklines))
self._blanklines = blanklines
def tokenize(self, s):
lines = s.splitlines()
# If requested, strip off blank lines.
- if self._blanklines == "discard":
+ if self._blanklines == 'discard':
lines = [l for l in lines if l.rstrip()]
- elif self._blanklines == "discard-eof":
+ elif self._blanklines == 'discard-eof':
if lines and not lines[-1].strip():
lines.pop()
return lines
# discard-eof not implemented
def span_tokenize(self, s):
- if self._blanklines == "keep":
- for span in string_span_tokenize(s, r"\n"):
+ if self._blanklines == 'keep':
+ for span in string_span_tokenize(s, r'\n'):
yield span
else:
- for span in regexp_span_tokenize(s, r"\n(\s+\n)*"):
+ for span in regexp_span_tokenize(s, r'\n(\s+\n)*'):
yield span
-
######################################################################
-# { Tokenization Functions
+#{ Tokenization Functions
######################################################################
# XXX: it is stated in the module docs that there are no function versions
-
-def line_tokenize(text, blanklines="discard"):
+def line_tokenize(text, blanklines='discard'):
return LineTokenizer(blanklines).tokenize(text)
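# Editor's usage sketch for the blanklines modes above:
#   line_tokenize('One\n\nTwo\nThree')            # -> ['One', 'Two', 'Three']
#   LineTokenizer('keep').tokenize('One\n\nTwo')  # -> ['One', '', 'Two']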
+
+
+
+++ /dev/null
-# Natural Language Toolkit: Tokenizers
-#
-# Copyright (C) 2001-2020 NLTK Project
-# Author: Christopher Hench <chris.l.hench@gmail.com>
-# Alex Estes
-# URL: <http://nltk.sourceforge.net>
-# For license information, see LICENSE.TXT
-
-"""
-The Sonority Sequencing Principle (SSP) is a language agnostic algorithm proposed
-by Otto Jespersen in 1904. The sonorous quality of a phoneme is judged by the
-openness of the lips. Syllable breaks occur before troughs in sonority. For more
-on the SSP see Selkirk (1984).
-
-The default implementation uses the English alphabet, but the `sonority_hierarchy`
-can be modified to IPA or any other alphabet for the use-case. The SSP is a
-universal syllabification algorithm, but that does not mean it performs equally
-across languages. Bartlett et al. (2009) is a good benchmark for English accuracy
-if utilizing IPA (pg. 311).
-
-Importantly, if a custom hierarchy is supplied and vowels span across more than
-one level, they should be given separately to the `vowels` class attribute.
-
-References:
-- Otto Jespersen. 1904. Lehrbuch der Phonetik.
- Leipzig, Teubner. Chapter 13, Silbe, pp. 185-203.
-- Elisabeth Selkirk. 1984. On the major class features and syllable theory.
- In Aronoff & Oehrle (eds.) Language Sound Structure: Studies in Phonology.
- Cambridge, MIT Press. pp. 107-136.
-- Susan Bartlett, et al. 2009. On the Syllabification of Phonemes.
- In HLT-NAACL. pp. 308-316.
-"""
-
-import warnings
-
-import re
-from string import punctuation
-
-from nltk.tokenize.api import TokenizerI
-from nltk.util import ngrams
-
-
-class SyllableTokenizer(TokenizerI):
- """
- Syllabifies words based on the Sonority Sequencing Principle (SSP).
-
- >>> from nltk.tokenize import SyllableTokenizer
- >>> from nltk import word_tokenize
- >>> SSP = SyllableTokenizer()
- >>> SSP.tokenize('justification')
- ['jus', 'ti', 'fi', 'ca', 'tion']
- >>> text = "This is a foobar-like sentence."
- >>> [SSP.tokenize(token) for token in word_tokenize(text)]
- [['This'], ['is'], ['a'], ['foo', 'bar', '-', 'li', 'ke'], ['sen', 'ten', 'ce'], ['.']]
- """
-
- def __init__(self, lang="en", sonority_hierarchy=False):
- """
- :param lang: Language parameter, default is English, 'en'
- :type lang: str
- :param sonority_hierarchy: Sonority hierarchy according to the
- Sonority Sequencing Principle.
- :type sonority_hierarchy: list(str)
- """
- # Sonority hierarchy should be provided in descending order.
- # If vowels are spread across multiple levels, they should be
- # assigned to the self.vowels attribute together; otherwise they
- # should be placed in the first index of the hierarchy.
- if not sonority_hierarchy and lang == "en":
- sonority_hierarchy = [
- "aeiouy", # vowels.
- "lmnrw", # nasals.
- "zvsf", # fricatives.
- "bcdgtkpqxhj", # stops.
- ]
-
- self.vowels = sonority_hierarchy[0]
- self.phoneme_map = {}
- for i, level in enumerate(sonority_hierarchy):
- for c in level:
- sonority_level = len(sonority_hierarchy) - i
- self.phoneme_map[c] = sonority_level
- self.phoneme_map[c.upper()] = sonority_level
-
- def assign_values(self, token):
- """
- Assigns each phoneme its value from the sonority hierarchy.
- Note: Sentence/text has to be tokenized first.
-
- :param token: Single word or token
- :type token: str
- :return: List of tuples, first element is character/phoneme and
- second is the sonority value.
- :rtype: list(tuple(str, int))
- """
- syllables_values = []
- for c in token:
- try:
- syllables_values.append((c, self.phoneme_map[c]))
- except KeyError:
- if c not in punctuation:
- warnings.warn(
- "Character not defined in sonority_hierarchy,"
- " assigning as vowel: '{}'".format(c)
- )
- syllables_values.append((c, max(self.phoneme_map.values())))
- self.vowels += c
- else: # If it's punctuation, assign -1.
- syllables_values.append((c, -1))
- return syllables_values
-
- def validate_syllables(self, syllable_list):
- """
- Ensures each syllable has at least one vowel.
- If the following syllable doesn't have vowel, add it to the current one.
-
- :param syllable_list: Single word or token broken up into syllables.
- :type syllable_list: list(str)
- :return: Single word or token broken up into syllables
- (with added syllables if necessary)
- :rtype: list(str)
- """
- valid_syllables = []
- front = ""
- for i, syllable in enumerate(syllable_list):
- if syllable in punctuation:
- valid_syllables.append(syllable)
- continue
- if not re.search("|".join(self.vowels), syllable):
- if len(valid_syllables) == 0:
- front += syllable
- else:
- valid_syllables = valid_syllables[:-1] + [
- valid_syllables[-1] + syllable
- ]
- else:
- if len(valid_syllables) == 0:
- valid_syllables.append(front + syllable)
- else:
- valid_syllables.append(syllable)
-
- return valid_syllables
-
- def tokenize(self, token):
- """
- Apply the SSP to return a list of syllables.
- Note: Sentence/text has to be tokenized first.
-
- :param token: Single word or token
- :type token: str
- :return syllable_list: Single word or token broken up into syllables.
- :rtype: list(str)
- """
- # assign values from hierarchy
- syllables_values = self.assign_values(token)
-
- # if only one vowel return word
- if sum(token.count(x) for x in self.vowels) <= 1:
- return [token]
-
- syllable_list = []
- syllable = syllables_values[0][0] # start syllable with first phoneme
- for trigram in ngrams(syllables_values, n=3):
- phonemes, values = zip(*trigram)
- # Sonority of previous, focal and following phoneme
- prev_value, focal_value, next_value = values
- # Focal phoneme.
- focal_phoneme = phonemes[1]
-
- # These cases trigger syllable break.
- if focal_value == -1: # If it's punctuation, just break.
- syllable_list.append(syllable)
- syllable_list.append(focal_phoneme)
- syllable = ""
- elif prev_value >= focal_value == next_value:
- syllable += focal_phoneme
- syllable_list.append(syllable)
- syllable = ""
-
- elif prev_value > focal_value < next_value:
- syllable_list.append(syllable)
- syllable = ""
- syllable += focal_phoneme
-
- # no syllable break
- else:
- syllable += focal_phoneme
-
- syllable += syllables_values[-1][0] # append last phoneme
- syllable_list.append(syllable)
-
- return self.validate_syllables(syllable_list)
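# --- Editor's illustrative sketch (not part of the patch) ---
# The class removed above ships in newer NLTK releases as
# nltk.tokenize.sonority_sequencing.SyllableTokenizer; a minimal usage
# example, assuming such a release is installed:
#
#     >>> from nltk.tokenize.sonority_sequencing import SyllableTokenizer
#     >>> SyllableTokenizer().tokenize('justification')
#     ['jus', 'ti', 'fi', 'ca', 'tion']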
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to the Stanford Tokenizer
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Xu <xxu@student.unimelb.edu.au>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import unicode_literals, print_function
+
import tempfile
import os
import json
from subprocess import PIPE
import warnings
+from six import text_type
+
from nltk.internals import find_jar, config_java, java, _java_options
from nltk.tokenize.api import TokenizerI
from nltk.parse.corenlp import CoreNLPParser
-_stanford_url = "https://nlp.stanford.edu/software/tokenizer.shtml"
-
+_stanford_url = 'https://nlp.stanford.edu/software/tokenizer.shtml'
class StanfordTokenizer(TokenizerI):
r"""
['The', 'color', 'of', 'the', 'wall', 'is', 'blue', '.']
"""
- _JAR = "stanford-postagger.jar"
+ _JAR = 'stanford-postagger.jar'
- def __init__(
- self,
- path_to_jar=None,
- encoding="utf8",
- options=None,
- verbose=False,
- java_options="-mx1000m",
- ):
+ def __init__(self, path_to_jar=None, encoding='utf8', options=None, verbose=False, java_options='-mx1000m'):
# Raise deprecation warning.
- warnings.warn(
- str(
- "\nThe StanfordTokenizer will "
- "be deprecated in version 3.2.5.\n"
- "Please use \033[91mnltk.parse.corenlp.CoreNLPParser\033[0m instead.'"
- ),
- DeprecationWarning,
- stacklevel=2,
- )
-
+ warnings.simplefilter('always', DeprecationWarning)
+ warnings.warn(str("\nThe StanfordTokenizer will "
+ "be deprecated in version 3.2.5.\n"
+ "Please use \033[91mnltk.parse.corenlp.CoreNLPTokenizer\033[0m instead.'"),
+ DeprecationWarning, stacklevel=2)
+ warnings.simplefilter('ignore', DeprecationWarning)
self._stanford_jar = find_jar(
- self._JAR,
- path_to_jar,
- env_vars=("STANFORD_POSTAGGER",),
- searchpath=(),
- url=_stanford_url,
- verbose=verbose,
+ self._JAR, path_to_jar,
+ env_vars=('STANFORD_POSTAGGER',),
+ searchpath=(), url=_stanford_url,
+ verbose=verbose
)
self._encoding = encoding
self.java_options = java_options
options = {} if options is None else options
- self._options_cmd = ",".join(
- "{0}={1}".format(key, val) for key, val in options.items()
- )
+ self._options_cmd = ','.join('{0}={1}'.format(key, val) for key, val in options.items())
@staticmethod
def _parse_tokenized_output(s):
"""
Use the Stanford Tokenizer's PTBTokenizer to tokenize multiple sentences.
"""
- cmd = ["edu.stanford.nlp.process.PTBTokenizer"]
+ cmd = [
+ 'edu.stanford.nlp.process.PTBTokenizer',
+ ]
return self._parse_tokenized_output(self._execute(cmd, s))
def _execute(self, cmd, input_, verbose=False):
encoding = self._encoding
- cmd.extend(["-charset", encoding])
+ cmd.extend(['-charset', encoding])
_options_cmd = self._options_cmd
if _options_cmd:
- cmd.extend(["-options", self._options_cmd])
+ cmd.extend(['-options', self._options_cmd])
- default_options = " ".join(_java_options)
+ default_options = ' '.join(_java_options)
# Configure java.
config_java(options=self.java_options, verbose=verbose)
# Windows is incompatible with NamedTemporaryFile() without passing in delete=False.
- with tempfile.NamedTemporaryFile(mode="wb", delete=False) as input_file:
+ with tempfile.NamedTemporaryFile(mode='wb', delete=False) as input_file:
# Write the actual sentences to the temporary input file
- if isinstance(input_, str) and encoding:
+ if isinstance(input_, text_type) and encoding:
input_ = input_.encode(encoding)
input_file.write(input_)
input_file.flush()
cmd.append(input_file.name)
# Run the tagger and get the output.
- stdout, stderr = java(
- cmd, classpath=self._stanford_jar, stdout=PIPE, stderr=PIPE
- )
+ stdout, stderr = java(cmd, classpath=self._stanford_jar,
+ stdout=PIPE, stderr=PIPE)
stdout = stdout.decode(encoding)
os.unlink(input_file.name)
return stdout
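# --- Editor's illustrative sketch (not part of the patch) ---
# Minimal usage of the (deprecated) StanfordTokenizer; the jar path below is
# a placeholder for a local stanford-postagger download:
#
#     >>> from nltk.tokenize.stanford import StanfordTokenizer
#     >>> st = StanfordTokenizer(path_to_jar='/path/to/stanford-postagger.jar')
#     >>> st.tokenize('The color of the wall is blue.')  # doctest: +SKIP
#     ['The', 'color', 'of', 'the', 'wall', 'is', 'blue', '.']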
+class CoreNLPTokenizer(CoreNLPParser):
+ def __init__(self, url='http://localhost:9000', encoding='utf8'):
+ r"""
+ This is a duck-type of CoreNLPParser that provides tokenizing
+ functionality similar to the original Stanford POS tagger.
+
+ >>> from nltk.tokenize.stanford import CoreNLPTokenizer
+ >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks."
+ >>> CoreNLPTokenizer(url='http://localhost:9000').tokenize(s) # doctest: +SKIP
+ [u'Good', u'muffins', u'cost', u'$', u'3.88', u'in', u'New', u'York', u'.', u'Please', u'buy', u'me', u'two', u'of', u'them', u'.', u'Thanks', u'.']
+ """
+ super(CoreNLPTokenizer, self).__init__(url, encoding)
+
+ def tokenize(self, text, properties=None):
+ """
+ Tokenize a string of text. Consistent with the StanfordTokenizer, this
+ function returns a list of strings. The original CoreNLPParser.tokenize()
+ returns a generator of strings.
+ """
+ return list(super(CoreNLPTokenizer, self).tokenize(text, properties))
+
+
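# --- Editor's illustrative sketch (not part of the patch) ---
# CoreNLPTokenizer talks to a running CoreNLP server; one way to start it
# (assuming the CoreNLP jars are in the current directory):
#
#     java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000
#
#     >>> from nltk.tokenize.stanford import CoreNLPTokenizer
#     >>> CoreNLPTokenizer(url='http://localhost:9000').tokenize('Hello world.')  # doctest: +SKIP
#     [u'Hello', u'world', u'.']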
def setup_module(module):
from nose import SkipTest
try:
StanfordTokenizer()
except LookupError:
- raise SkipTest(
- "doctests from nltk.tokenize.stanford are skipped because the stanford postagger jar doesn't exist"
- )
+ raise SkipTest('doctests from nltk.tokenize.stanford are skipped because the stanford postagger jar doesn\'t exist')
+
+ try:
+ CoreNLPTokenizer()
+ except LookupError:
+ raise SkipTest('doctests from nltk.tokenize.stanford.CoreNLPTokenizer are skipped because the '
+ 'Stanford CoreNLP server is not started')
# Natural Language Toolkit: Interface to the Stanford Segmenter
# for Chinese and Arabic
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: 52nlp <52nlpcn@gmail.com>
# Casper Lehmann-Strøm <casperlehmann@gmail.com>
# Alex Constantin <alex@keyworder.ch>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import unicode_literals, print_function
+
import tempfile
import os
import json
-import warnings
from subprocess import PIPE
+import warnings
-from nltk.internals import (
- find_jar,
- find_file,
- find_dir,
- config_java,
- java,
- _java_options,
-)
+from nltk import compat
+from nltk.internals import find_jar, find_file, find_dir, \
+ config_java, java, _java_options
from nltk.tokenize.api import TokenizerI
+from six import text_type
-_stanford_url = "https://nlp.stanford.edu/software"
+_stanford_url = 'https://nlp.stanford.edu/software'
class StanfordSegmenter(TokenizerI):
<BLANKLINE>
"""
- _JAR = "stanford-segmenter.jar"
-
- def __init__(
- self,
- path_to_jar=None,
- path_to_slf4j=None,
- java_class=None,
- path_to_model=None,
- path_to_dict=None,
- path_to_sihan_corpora_dict=None,
- sihan_post_processing="false",
- keep_whitespaces="false",
- encoding="UTF-8",
- options=None,
- verbose=False,
- java_options="-mx2g",
- ):
+ _JAR = 'stanford-segmenter.jar'
+
+ def __init__(self,
+ path_to_jar=None,
+ path_to_slf4j=None,
+ java_class=None,
+ path_to_model=None,
+ path_to_dict=None,
+ path_to_sihan_corpora_dict=None,
+ sihan_post_processing='false',
+ keep_whitespaces='false',
+ encoding='UTF-8', options=None,
+ verbose=False, java_options='-mx2g'):
# Raise deprecation warning.
- warnings.simplefilter("always", DeprecationWarning)
- warnings.warn(
- str(
- "\nThe StanfordTokenizer will "
- "be deprecated in version 3.2.5.\n"
- "Please use \033[91mnltk.parse.corenlp.CoreNLPTokenizer\033[0m instead.'"
- ),
- DeprecationWarning,
- stacklevel=2,
- )
- warnings.simplefilter("ignore", DeprecationWarning)
+ warnings.simplefilter('always', DeprecationWarning)
+ warnings.warn(str("\nThe StanfordTokenizer will "
+ "be deprecated in version 3.2.5.\n"
+ "Please use \033[91mnltk.parse.corenlp.CoreNLPTokenizer\033[0m instead.'"),
+ DeprecationWarning, stacklevel=2)
+ warnings.simplefilter('ignore', DeprecationWarning)
stanford_segmenter = find_jar(
- self._JAR,
- path_to_jar,
- env_vars=("STANFORD_SEGMENTER",),
- searchpath=(),
- url=_stanford_url,
- verbose=verbose,
- )
+ self._JAR, path_to_jar,
+ env_vars=('STANFORD_SEGMENTER',),
+ searchpath=(), url=_stanford_url,
+ verbose=verbose)
if path_to_slf4j is not None:
slf4j = find_jar(
- "slf4j-api.jar",
- path_to_slf4j,
- env_vars=("SLF4J", "STANFORD_SEGMENTER"),
- searchpath=(),
- url=_stanford_url,
- verbose=verbose,
- )
+ 'slf4j-api.jar', path_to_slf4j,
+ env_vars=('SLF4J', 'STANFORD_SEGMENTER',),
+ searchpath=(), url=_stanford_url,
+ verbose=verbose)
else:
slf4j = None
self._encoding = encoding
self.java_options = java_options
options = {} if options is None else options
- self._options_cmd = ",".join(
- "{0}={1}".format(key, json.dumps(val)) for key, val in options.items()
- )
+ self._options_cmd = ','.join('{0}={1}'.format(key, json.dumps(val)) for key, val in options.items())
def default_config(self, lang):
"""
"""
search_path = ()
- if os.environ.get("STANFORD_SEGMENTER"):
- search_path = {os.path.join(os.environ.get("STANFORD_SEGMENTER"), "data")}
+ if os.environ.get('STANFORD_SEGMENTER'):
+ search_path = {os.path.join(os.environ.get('STANFORD_SEGMENTER'), 'data')}
# init for Chinese-specific files
self._dict = None
self._sihan_corpora_dict = None
- self._sihan_post_processing = "false"
+ self._sihan_post_processing = 'false'
- if lang == "ar":
- self._java_class = (
- "edu.stanford.nlp.international.arabic.process.ArabicSegmenter"
- )
- model = "arabic-segmenter-atb+bn+arztrain.ser.gz"
+ if lang == 'ar':
+ self._java_class = 'edu.stanford.nlp.international.arabic.process.ArabicSegmenter'
+ model = 'arabic-segmenter-atb+bn+arztrain.ser.gz'
- elif lang == "zh":
- self._java_class = "edu.stanford.nlp.ie.crf.CRFClassifier"
- model = "pku.gz"
- self._sihan_post_processing = "true"
+ elif lang == 'zh':
+ self._java_class = 'edu.stanford.nlp.ie.crf.CRFClassifier'
+ model = 'pku.gz'
+ self._sihan_post_processing = 'true'
- path_to_dict = "dict-chris6.ser.gz"
+ path_to_dict = 'dict-chris6.ser.gz'
try:
- self._dict = find_file(
- path_to_dict,
- searchpath=search_path,
- url=_stanford_url,
- verbose=False,
- env_vars=("STANFORD_MODELS",),
- )
+ self._dict = find_file(path_to_dict, searchpath=search_path,
+ url=_stanford_url, verbose=False,
+ env_vars=('STANFORD_MODELS',))
except LookupError:
- raise LookupError(
- "Could not find '%s' (tried using env. "
- "variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)"
- % path_to_dict
- )
+ raise LookupError("Could not find '%s' (tried using env. "
+ "variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)" % path_to_dict)
- sihan_dir = "./data/"
+ sihan_dir = './data/'
try:
- path_to_sihan_dir = find_dir(
- sihan_dir,
- url=_stanford_url,
- verbose=False,
- env_vars=("STANFORD_SEGMENTER",),
- )
+ path_to_sihan_dir = find_dir(sihan_dir,
+ url=_stanford_url, verbose=False,
+ env_vars=('STANFORD_SEGMENTER',))
self._sihan_corpora_dict = os.path.join(path_to_sihan_dir, sihan_dir)
except LookupError:
- raise LookupError(
- "Could not find '%s' (tried using the "
- "STANFORD_SEGMENTER environment variable)" % sihan_dir
- )
+ raise LookupError("Could not find '%s' (tried using the "
+ "STANFORD_SEGMENTER environment variable)" % sihan_dir)
else:
- raise LookupError("Unsupported language {}".format(lang))
+ raise LookupError("Unsupported language '%'" % lang)
try:
- self._model = find_file(
- model,
- searchpath=search_path,
- url=_stanford_url,
- verbose=False,
- env_vars=("STANFORD_MODELS", "STANFORD_SEGMENTER"),
- )
+ self._model = find_file(model, searchpath=search_path,
+ url=_stanford_url, verbose=False,
+ env_vars=('STANFORD_MODELS', 'STANFORD_SEGMENTER',))
except LookupError:
- raise LookupError(
- "Could not find '%s' (tried using env. "
- "variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)" % model
- )
+ raise LookupError("Could not find '%s' (tried using env. "
+ "variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)" % model)
def tokenize(self, s):
super().tokenize(s)
"""
cmd = [
self._java_class,
- "-loadClassifier",
- self._model,
- "-keepAllWhitespaces",
- self._keep_whitespaces,
- "-textFile",
- input_file_path,
+ '-loadClassifier', self._model,
+ '-keepAllWhitespaces', self._keep_whitespaces,
+ '-textFile', input_file_path
]
if self._sihan_corpora_dict is not None:
- cmd.extend(
- [
- "-serDictionary",
- self._dict,
- "-sighanCorporaDict",
- self._sihan_corpora_dict,
- "-sighanPostProcessing",
- self._sihan_post_processing,
- ]
- )
+ cmd.extend(['-serDictionary', self._dict,
+ '-sighanCorporaDict', self._sihan_corpora_dict,
+ '-sighanPostProcessing', self._sihan_post_processing])
stdout = self._execute(cmd)
_input_fh, self._input_file_path = tempfile.mkstemp(text=True)
# Write the actual sentences to the temporary input file
- _input_fh = os.fdopen(_input_fh, "wb")
- _input = "\n".join((" ".join(x) for x in sentences))
- if isinstance(_input, str) and encoding:
+ _input_fh = os.fdopen(_input_fh, 'wb')
+ _input = '\n'.join((' '.join(x) for x in sentences))
+ if isinstance(_input, text_type) and encoding:
_input = _input.encode(encoding)
_input_fh.write(_input)
_input_fh.close()
cmd = [
self._java_class,
- "-loadClassifier",
- self._model,
- "-keepAllWhitespaces",
- self._keep_whitespaces,
- "-textFile",
- self._input_file_path,
+ '-loadClassifier', self._model,
+ '-keepAllWhitespaces', self._keep_whitespaces,
+ '-textFile', self._input_file_path
]
if self._sihan_corpora_dict is not None:
- cmd.extend(
- [
- "-serDictionary",
- self._dict,
- "-sighanCorporaDict",
- self._sihan_corpora_dict,
- "-sighanPostProcessing",
- self._sihan_post_processing,
- ]
- )
+ cmd.extend(['-serDictionary', self._dict,
+ '-sighanCorporaDict', self._sihan_corpora_dict,
+ '-sighanPostProcessing', self._sihan_post_processing])
stdout = self._execute(cmd)
def _execute(self, cmd, verbose=False):
encoding = self._encoding
- cmd.extend(["-inputEncoding", encoding])
+ cmd.extend(['-inputEncoding', encoding])
_options_cmd = self._options_cmd
if _options_cmd:
- cmd.extend(["-options", self._options_cmd])
+ cmd.extend(['-options', self._options_cmd])
- default_options = " ".join(_java_options)
+ default_options = ' '.join(_java_options)
# Configure java.
config_java(options=self.java_options, verbose=verbose)
- stdout, _stderr = java(
- cmd, classpath=self._stanford_jar, stdout=PIPE, stderr=PIPE
- )
+ stdout, _stderr = java(cmd, classpath=self._stanford_jar, stdout=PIPE, stderr=PIPE)
stdout = stdout.decode(encoding)
# Return java configurations to their default values.
try:
seg = StanfordSegmenter()
- seg.default_config("ar")
- seg.default_config("zh")
+ seg.default_config('ar')
+ seg.default_config('zh')
except LookupError as e:
- raise SkipTest(
- "Tests for nltk.tokenize.stanford_segmenter skipped: %s" % str(e)
- )
+ raise SkipTest('Tests for nltk.tokenize.stanford_segmenter skipped: %s' % str(e))
# Natural Language Toolkit: TextTiling
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: George Boutsioukis
#
# URL: <http://nltk.org/>
[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0]
"""
- def __init__(
- self,
- w=20,
- k=10,
- similarity_method=BLOCK_COMPARISON,
- stopwords=None,
- smoothing_method=DEFAULT_SMOOTHING,
- smoothing_width=2,
- smoothing_rounds=1,
- cutoff_policy=HC,
- demo_mode=False,
- ):
+ def __init__(self,
+ w=20,
+ k=10,
+ similarity_method=BLOCK_COMPARISON,
+ stopwords=None,
+ smoothing_method=DEFAULT_SMOOTHING,
+ smoothing_width=2,
+ smoothing_rounds=1,
+ cutoff_policy=HC,
+ demo_mode=False):
+
if stopwords is None:
from nltk.corpus import stopwords
-
- stopwords = stopwords.words("english")
+ stopwords = stopwords.words('english')
self.__dict__.update(locals())
- del self.__dict__["self"]
+ del self.__dict__['self']
def tokenize(self, text):
"""Return a tokenized copy of *text*, where each "token" represents
# Tokenization step starts here
# Remove punctuation
- nopunct_text = "".join(
- c for c in lowercase_text if re.match("[a-z\-' \n\t]", c)
- )
+ nopunct_text = ''.join(c for c in lowercase_text
+ if re.match("[a-z\-\' \n\t]", c))
nopunct_par_breaks = self._mark_paragraph_breaks(nopunct_text)
tokseqs = self._divide_to_tokensequences(nopunct_text)
# implementation states that it offers no benefit to the
# process. It might be interesting to test the existing
# stemmers though.
- # words = _stem_words(words)
+ #words = _stem_words(words)
# Filter stopwords
for ts in tokseqs:
- ts.wrdindex_list = [
- wi for wi in ts.wrdindex_list if wi[0] not in self.stopwords
- ]
+ ts.wrdindex_list = [wi for wi in ts.wrdindex_list
+ if wi[0] not in self.stopwords]
token_table = self._create_token_table(tokseqs, nopunct_par_breaks)
# End of the Tokenization step
gap_scores = self._block_comparison(tokseqs, token_table)
elif self.similarity_method == VOCABULARY_INTRODUCTION:
raise NotImplementedError("Vocabulary introduction not implemented")
- else:
- raise ValueError(
- "Similarity method {} not recognized".format(self.similarity_method)
- )
if self.smoothing_method == DEFAULT_SMOOTHING:
smooth_scores = self._smooth_scores(gap_scores)
- else:
- raise ValueError(
- "Smoothing method {} not recognized".format(self.smoothing_method)
- )
# End of Lexical score Determination
# Boundary identification
depth_scores = self._depth_scores(smooth_scores)
segment_boundaries = self._identify_boundaries(depth_scores)
- normalized_boundaries = self._normalize_boundaries(
- text, segment_boundaries, paragraph_breaks
- )
+ normalized_boundaries = self._normalize_boundaries(text,
+ segment_boundaries,
+ paragraph_breaks)
# End of Boundary Identification
segmented_text = []
prevb = 0
segmented_text.append(text[prevb:b])
prevb = b
- if prevb < text_length: # append any text that may be remaining
+ if prevb < text_length: # append any text that may be remaining
segmented_text.append(text[prevb:])
if not segmented_text:
return segmented_text
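# --- Editor's illustrative sketch (not part of the patch) ---
# TextTiling expects running text with blank-line paragraph breaks, as in
# the demo() at the end of this file:
#
#     >>> from nltk.corpus import brown
#     >>> from nltk.tokenize import TextTilingTokenizer
#     >>> segments = TextTilingTokenizer().tokenize(brown.raw()[:4000])  # doctest: +SKIP
#     >>> len(segments) > 1  # doctest: +SKIP
#     True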
def _block_comparison(self, tokseqs, token_table):
- """Implements the block comparison method"""
-
+ "Implements the block comparison method"
def blk_frq(tok, block):
- ts_occs = filter(lambda o: o[0] in block, token_table[tok].ts_occurences)
+ ts_occs = filter(lambda o: o[0] in block,
+ token_table[tok].ts_occurences)
freq = sum([tsocc[1] for tsocc in ts_occs])
return freq
gap_scores = []
- numgaps = len(tokseqs) - 1
+ numgaps = len(tokseqs)-1
for curr_gap in range(numgaps):
score_dividend, score_divisor_b1, score_divisor_b2 = 0.0, 0.0, 0.0
score = 0.0
- # adjust window size for boundary conditions
- if curr_gap < self.k - 1:
+ #adjust window size for boundary conditions
+ if curr_gap < self.k-1:
window_size = curr_gap + 1
- elif curr_gap > numgaps - self.k:
+ elif curr_gap > numgaps-self.k:
window_size = numgaps - curr_gap
else:
window_size = self.k
- b1 = [ts.index for ts in tokseqs[curr_gap - window_size + 1 : curr_gap + 1]]
- b2 = [ts.index for ts in tokseqs[curr_gap + 1 : curr_gap + window_size + 1]]
+ b1 = [ts.index
+ for ts in tokseqs[curr_gap-window_size+1 : curr_gap+1]]
+ b2 = [ts.index
+ for ts in tokseqs[curr_gap+1 : curr_gap+window_size+1]]
for t in token_table:
- score_dividend += blk_frq(t, b1) * blk_frq(t, b2)
- score_divisor_b1 += blk_frq(t, b1) ** 2
- score_divisor_b2 += blk_frq(t, b2) ** 2
+ score_dividend += blk_frq(t, b1)*blk_frq(t, b2)
+ score_divisor_b1 += blk_frq(t, b1)**2
+ score_divisor_b2 += blk_frq(t, b2)**2
try:
- score = score_dividend / math.sqrt(score_divisor_b1 * score_divisor_b2)
+ score = score_dividend/math.sqrt(score_divisor_b1*
+ score_divisor_b2)
except ZeroDivisionError:
- pass # score += 0.0
+ pass # score += 0.0
gap_scores.append(score)
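# --- Editor's note (illustrative) ---
# The score computed above is the cosine similarity of the two blocks'
# term-frequency vectors: sum(f1*f2) / sqrt(sum(f1**2) * sum(f2**2)).
# A standalone check of the same formula:
#
#     >>> import math
#     >>> f1, f2 = [2, 0, 1], [1, 1, 1]
#     >>> num = sum(a * b for a, b in zip(f1, f2))
#     >>> round(num / math.sqrt(sum(a * a for a in f1) * sum(b * b for b in f2)), 3)
#     0.775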
def _smooth_scores(self, gap_scores):
"Wraps the smooth function from the SciPy Cookbook"
- return list(
- smooth(numpy.array(gap_scores[:]), window_len=self.smoothing_width + 1)
- )
+ return list(smooth(numpy.array(gap_scores[:]),
+ window_len = self.smoothing_width+1))
def _mark_paragraph_breaks(self, text):
"""Identifies indented text or line breaks as the beginning of
last_break = 0
pbreaks = [0]
for pb in matches:
- if pb.start() - last_break < MIN_PARAGRAPH:
+ if pb.start()-last_break < MIN_PARAGRAPH:
continue
else:
pbreaks.append(pb.start())
matches = re.finditer("\w+", text)
for match in matches:
wrdindex_list.append((match.group(), match.start()))
- return [
- TokenSequence(i / w, wrdindex_list[i : i + w])
- for i in range(0, len(wrdindex_list), w)
- ]
+ return [TokenSequence(i/w, wrdindex_list[i:i+w])
+ for i in range(0, len(wrdindex_list), w)]
def _create_token_table(self, token_sequences, par_breaks):
"Creates a table of TokenTableFields"
current_par_break = next(pb_iter)
if current_par_break == 0:
try:
- current_par_break = next(pb_iter) # skip break at 0
+ current_par_break = next(pb_iter) #skip break at 0
except StopIteration:
raise ValueError(
"No paragraph breaks were found(text too short perhaps?)"
- )
+ )
for ts in token_sequences:
for word, index in ts.wrdindex_list:
try:
current_par_break = next(pb_iter)
current_par += 1
except StopIteration:
- # hit bottom
+ #hit bottom
pass
if word in token_table:
if token_table[word].last_tok_seq != current_tok_seq:
token_table[word].last_tok_seq = current_tok_seq
- token_table[word].ts_occurences.append([current_tok_seq, 1])
+ token_table[word]\
+ .ts_occurences.append([current_tok_seq,1])
else:
token_table[word].ts_occurences[-1][1] += 1
- else: # new word
- token_table[word] = TokenTableField(
- first_pos=index,
- ts_occurences=[[current_tok_seq, 1]],
- total_count=1,
- par_count=1,
- last_par=current_par,
- last_tok_seq=current_tok_seq,
- )
+ else: #new word
+ token_table[word] = TokenTableField(first_pos=index,
+ ts_occurences= \
+ [[current_tok_seq,1]],
+ total_count=1,
+ par_count=1,
+ last_par=current_par,
+ last_tok_seq= \
+ current_tok_seq)
current_tok_seq += 1
boundaries = [0 for x in depth_scores]
- avg = sum(depth_scores) / len(depth_scores)
+ avg = sum(depth_scores)/len(depth_scores)
stdev = numpy.std(depth_scores)
- # SB: what is the purpose of this conditional?
+ #SB: what is the purpose of this conditional?
if self.cutoff_policy == LC:
- cutoff = avg - stdev / 2.0
+ cutoff = avg-stdev/2.0
else:
- cutoff = avg - stdev / 2.0
+ cutoff = avg-stdev/2.0
depth_tuples = sorted(zip(depth_scores, range(len(depth_scores))))
depth_tuples.reverse()
- hp = list(filter(lambda x: x[0] > cutoff, depth_tuples))
+ hp = list(filter(lambda x:x[0]>cutoff, depth_tuples))
for dt in hp:
boundaries[dt[1]] = 1
- for dt2 in hp: # undo if there is a boundary close already
- if (
- dt[1] != dt2[1]
- and abs(dt2[1] - dt[1]) < 4
- and boundaries[dt2[1]] == 1
- ):
+ for dt2 in hp: #undo if there is a boundary close already
+ if dt[1] != dt2[1] and abs(dt2[1]-dt[1]) < 4 \
+ and boundaries[dt2[1]] == 1:
boundaries[dt[1]] = 0
return boundaries
between the left and right peaks and the gap's score"""
depth_scores = [0 for x in scores]
- # clip boundaries: this rests on the rule of thumb (my thumb)
- # that a section shouldn't be smaller than at least 2
- # pseudosentences for small texts and around 5 for larger ones.
+ #clip boundaries: this rests on the rule of thumb (my thumb)
+ #that a section shouldn't be smaller than at least 2
+ #pseudosentences for small texts and around 5 for larger ones.
clip = min(max(len(scores) // 10, 2), 5)
index = clip
seen_word = False
word_count += 1
if char not in " \t\n" and not seen_word:
- seen_word = True
- if gaps_seen < len(boundaries) and word_count > (
- max(gaps_seen * self.w, self.w)
- ):
+ seen_word=True
+ if gaps_seen < len(boundaries) and word_count > \
+ (max(gaps_seen*self.w, self.w)):
if boundaries[gaps_seen] == 1:
- # find closest paragraph break
+ #find closest paragraph break
best_fit = len(text)
for br in paragraph_breaks:
- if best_fit > abs(br - char_count):
- best_fit = abs(br - char_count)
+ if best_fit > abs(br-char_count):
+ best_fit = abs(br-char_count)
bestbr = br
else:
break
- if bestbr not in norm_boundaries: # avoid duplicates
+ if bestbr not in norm_boundaries: #avoid duplicates
norm_boundaries.append(bestbr)
gaps_seen += 1
class TokenTableField(object):
"""A field in the token table holding parameters for each token,
used later in the process"""
-
- def __init__(
- self,
- first_pos,
- ts_occurences,
- total_count=1,
- par_count=1,
- last_par=0,
- last_tok_seq=None,
- ):
+ def __init__(self,
+ first_pos,
+ ts_occurences,
+ total_count=1,
+ par_count=1,
+ last_par=0,
+ last_tok_seq=None):
self.__dict__.update(locals())
- del self.__dict__["self"]
-
+ del self.__dict__['self']
class TokenSequence(object):
"A token list with its original length and its index"
-
- def __init__(self, index, wrdindex_list, original_length=None):
- original_length = original_length or len(wrdindex_list)
+ def __init__(self,
+ index,
+ wrdindex_list,
+ original_length=None):
+ original_length=original_length or len(wrdindex_list)
self.__dict__.update(locals())
- del self.__dict__["self"]
+ del self.__dict__['self']
-# Pasted from the SciPy cookbook: http://www.scipy.org/Cookbook/SignalSmooth
-def smooth(x, window_len=11, window="flat"):
+#Pasted from the SciPy cookbook: http://www.scipy.org/Cookbook/SignalSmooth
+def smooth(x,window_len=11,window='flat'):
"""smooth the data using a window with requested size.
This method is based on the convolution of a scaled window with the signal.
if window_len < 3:
return x
- if window not in ["flat", "hanning", "hamming", "bartlett", "blackman"]:
- raise ValueError(
- "Window is on of 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'"
- )
+ if not window in ['flat', 'hanning', 'hamming', 'bartlett', 'blackman']:
+ raise ValueError("Window is on of 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'")
- s = numpy.r_[2 * x[0] - x[window_len:1:-1], x, 2 * x[-1] - x[-1:-window_len:-1]]
+ s=numpy.r_[2*x[0]-x[window_len:1:-1],x,2*x[-1]-x[-1:-window_len:-1]]
- # print(len(s))
- if window == "flat": # moving average
- w = numpy.ones(window_len, "d")
+ #print(len(s))
+ if window == 'flat': #moving average
+ w = numpy.ones(window_len,'d')
else:
- w = eval("numpy." + window + "(window_len)")
+ w = eval('numpy.' + window + '(window_len)')
- y = numpy.convolve(w / w.sum(), s, mode="same")
+ y = numpy.convolve(w/w.sum(), s, mode='same')
- return y[window_len - 1 : -window_len + 1]
+ return y[window_len-1:-window_len+1]
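# --- Editor's illustrative sketch (not part of the patch) ---
# With the default 'flat' window the function is a moving average; a unit
# impulse gets spread over window_len neighbouring gaps:
#
#     >>> import numpy
#     >>> x = numpy.array([0., 0., 0., 0., 1., 0., 0., 0., 0.])
#     >>> [round(v, 2) for v in smooth(x, window_len=3)]
#     [0.0, 0.0, 0.0, 0.33, 0.33, 0.33, 0.0, 0.0, 0.0]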
def demo(text=None):
from nltk.corpus import brown
from matplotlib import pylab
-
tt = TextTilingTokenizer(demo_mode=True)
- if text is None:
- text = brown.raw()[:10000]
+ if text is None: text = brown.raw()[:10000]
s, ss, d, b = tt.tokenize(text)
pylab.xlabel("Sentence Gap index")
pylab.ylabel("Gap Scores")
pylab.stem(range(len(b)), b)
pylab.legend()
pylab.show()
+
+
# For license information, see LICENSE.TXT
"""
-The tok-tok tokenizer is a simple, general tokenizer, where the input has one
+The tok-tok tokenizer is a simple, general tokenizer, where the input has one
sentence per line; thus only the final period is tokenized.
-Tok-tok has been tested on, and gives reasonably good results for English,
-Persian, Russian, Czech, French, German, Vietnamese, Tajik, and a few others.
+Tok-tok has been tested on, and gives reasonably good results for English,
+Persian, Russian, Czech, French, German, Vietnamese, Tajik, and a few others.
The input should be in UTF-8 encoding.
Reference:
-Jon Dehdari. 2014. A Neurophysiologically-Inspired Statistical Language
-Model (Doctoral dissertation). Columbus, OH, USA: The Ohio State University.
+Jon Dehdari. 2014. A Neurophysiologically-Inspired Statistical Language
+Model (Doctoral dissertation). Columbus, OH, USA: The Ohio State University.
"""
import re
+from six import text_type
from nltk.tokenize.api import TokenizerI
-
class ToktokTokenizer(TokenizerI):
"""
This is a Python port of the tok-tok.pl from
https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl
-
+
>>> toktok = ToktokTokenizer()
>>> text = u'Is 9.5 or 525,600 my favorite number?'
- >>> print(toktok.tokenize(text, return_str=True))
+ >>> print (toktok.tokenize(text, return_str=True))
Is 9.5 or 525,600 my favorite number ?
>>> text = u'The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things'
- >>> print(toktok.tokenize(text, return_str=True))
+ >>> print (toktok.tokenize(text, return_str=True))
The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things
>>> text = u'\xa1This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf'
>>> expected = u'\xa1 This , is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf'
>>> toktok.tokenize(text) == [u'\xa1', u'This', u',', u'is', u'a', u'sentence', u'with', u'weird', u'\xbb', u'symbols', u'\u2026', u'appearing', u'everywhere', u'\xbf']
True
"""
-
# Replace non-breaking spaces with normal spaces.
NON_BREAKING = re.compile(u"\u00A0"), " "
-
+
# Pad some funky punctuation.
FUNKY_PUNCT_1 = re.compile(u'([،;؛¿!"\])}»›”؟¡%٪°±©®।॥…])'), r" \1 "
# Pad more funky punctuation.
- FUNKY_PUNCT_2 = re.compile(u"([({\[“‘„‚«‹「『])"), r" \1 "
+ FUNKY_PUNCT_2 = re.compile(u'([({\[“‘„‚«‹「『])'), r" \1 "
# Pad En dash and em dash
- EN_EM_DASHES = re.compile(u"([–—])"), r" \1 "
-
+ EN_EM_DASHES = re.compile(u'([–—])'), r" \1 "
+
# Replace problematic character with numeric character reference.
- AMPERCENT = re.compile("& "), "& "
- TAB = re.compile("\t"), " 	 "
- PIPE = re.compile("\|"), " | "
-
- # Pad numbers with commas to keep them from further tokenization.
- COMMA_IN_NUM = re.compile(r"(?<!,)([,،])(?![,\d])"), r" \1 "
-
+ AMPERCENT = re.compile('& '), '&amp; '
+ TAB = re.compile('\t'), ' &#9; '
+ PIPE = re.compile('\|'), ' &#124; '
+
+ # Pad numbers with commas to keep them from further tokenization.
+ COMMA_IN_NUM = re.compile(r'(?<!,)([,،])(?![,\d])'), r' \1 '
+
# Just pad problematic (often neurotic) hyphen/single quote, etc.
- PROB_SINGLE_QUOTES = re.compile(r"(['’`])"), r" \1 "
+ PROB_SINGLE_QUOTES = re.compile(r"(['’`])"), r' \1 '
# Group ` ` stupid quotes ' ' into a single token.
STUPID_QUOTES_1 = re.compile(r" ` ` "), r" `` "
STUPID_QUOTES_2 = re.compile(r" ' ' "), r" '' "
-
- # Don't tokenize period unless it ends the line and isn't
- # preceded by another period, e.g.
- # "something ..." -> "something ..."
- # "something." -> "something ."
+
+ # Don't tokenize period unless it ends the line and isn't
+ # preceded by another period, e.g.
+ # "something ..." -> "something ..."
+ # "something." -> "something ."
FINAL_PERIOD_1 = re.compile(r"(?<!\.)\.$"), r" ."
- # Don't tokenize period unless it ends the line, e.g.
+ # Don't tokenize period unless it ends the line, e.g.
# " ... stuff." -> "... stuff ."
FINAL_PERIOD_2 = re.compile(r"""(?<!\.)\.\s*(["'’»›”]) *$"""), r" . \1"
# Treat continuous commas as fake German, Czech, etc.: „
- MULTI_COMMAS = re.compile(r"(,{2,})"), r" \1 "
+ MULTI_COMMAS = re.compile(r'(,{2,})'), r' \1 '
# Treat continuous dashes as fake en-dash, etc.
- MULTI_DASHES = re.compile(r"(-{2,})"), r" \1 "
+ MULTI_DASHES = re.compile(r'(-{2,})'), r' \1 '
# Treat multiple periods as a thing (eg. ellipsis)
- MULTI_DOTS = re.compile(r"(\.{2,})"), r" \1 "
+ MULTI_DOTS = re.compile(r'(\.{2,})'), r' \1 '
# This is the \p{Open_Punctuation} from Perl's perluniprops
# see http://perldoc.perl.org/perluniprops.html
- OPEN_PUNCT = str(
- u"([{\u0f3a\u0f3c\u169b\u201a\u201e\u2045\u207d"
- u"\u208d\u2329\u2768\u276a\u276c\u276e\u2770\u2772"
- u"\u2774\u27c5\u27e6\u27e8\u27ea\u27ec\u27ee\u2983"
- u"\u2985\u2987\u2989\u298b\u298d\u298f\u2991\u2993"
- u"\u2995\u2997\u29d8\u29da\u29fc\u2e22\u2e24\u2e26"
- u"\u2e28\u3008\u300a\u300c\u300e\u3010\u3014\u3016"
- u"\u3018\u301a\u301d\ufd3e\ufe17\ufe35\ufe37\ufe39"
- u"\ufe3b\ufe3d\ufe3f\ufe41\ufe43\ufe47\ufe59\ufe5b"
- u"\ufe5d\uff08\uff3b\uff5b\uff5f\uff62"
- )
+ OPEN_PUNCT = text_type(u'([{\u0f3a\u0f3c\u169b\u201a\u201e\u2045\u207d'
+ u'\u208d\u2329\u2768\u276a\u276c\u276e\u2770\u2772'
+ u'\u2774\u27c5\u27e6\u27e8\u27ea\u27ec\u27ee\u2983'
+ u'\u2985\u2987\u2989\u298b\u298d\u298f\u2991\u2993'
+ u'\u2995\u2997\u29d8\u29da\u29fc\u2e22\u2e24\u2e26'
+ u'\u2e28\u3008\u300a\u300c\u300e\u3010\u3014\u3016'
+ u'\u3018\u301a\u301d\ufd3e\ufe17\ufe35\ufe37\ufe39'
+ u'\ufe3b\ufe3d\ufe3f\ufe41\ufe43\ufe47\ufe59\ufe5b'
+ u'\ufe5d\uff08\uff3b\uff5b\uff5f\uff62')
# This is the \p{Close_Punctuation} from Perl's perluniprops
- CLOSE_PUNCT = str(
- u")]}\u0f3b\u0f3d\u169c\u2046\u207e\u208e\u232a"
- u"\u2769\u276b\u276d\u276f\u2771\u2773\u2775\u27c6"
- u"\u27e7\u27e9\u27eb\u27ed\u27ef\u2984\u2986\u2988"
- u"\u298a\u298c\u298e\u2990\u2992\u2994\u2996\u2998"
- u"\u29d9\u29db\u29fd\u2e23\u2e25\u2e27\u2e29\u3009"
- u"\u300b\u300d\u300f\u3011\u3015\u3017\u3019\u301b"
- u"\u301e\u301f\ufd3f\ufe18\ufe36\ufe38\ufe3a\ufe3c"
- u"\ufe3e\ufe40\ufe42\ufe44\ufe48\ufe5a\ufe5c\ufe5e"
- u"\uff09\uff3d\uff5d\uff60\uff63"
- )
+ CLOSE_PUNCT = text_type(u')]}\u0f3b\u0f3d\u169c\u2046\u207e\u208e\u232a'
+ u'\u2769\u276b\u276d\u276f\u2771\u2773\u2775\u27c6'
+ u'\u27e7\u27e9\u27eb\u27ed\u27ef\u2984\u2986\u2988'
+ u'\u298a\u298c\u298e\u2990\u2992\u2994\u2996\u2998'
+ u'\u29d9\u29db\u29fd\u2e23\u2e25\u2e27\u2e29\u3009'
+ u'\u300b\u300d\u300f\u3011\u3015\u3017\u3019\u301b'
+ u'\u301e\u301f\ufd3f\ufe18\ufe36\ufe38\ufe3a\ufe3c'
+ u'\ufe3e\ufe40\ufe42\ufe44\ufe48\ufe5a\ufe5c\ufe5e'
+ u'\uff09\uff3d\uff5d\uff60\uff63')
# This is the \p{Close_Punctuation} from Perl's perluniprops
- CURRENCY_SYM = str(
- u"$\xa2\xa3\xa4\xa5\u058f\u060b\u09f2\u09f3\u09fb"
- u"\u0af1\u0bf9\u0e3f\u17db\u20a0\u20a1\u20a2\u20a3"
- u"\u20a4\u20a5\u20a6\u20a7\u20a8\u20a9\u20aa\u20ab"
- u"\u20ac\u20ad\u20ae\u20af\u20b0\u20b1\u20b2\u20b3"
- u"\u20b4\u20b5\u20b6\u20b7\u20b8\u20b9\u20ba\ua838"
- u"\ufdfc\ufe69\uff04\uffe0\uffe1\uffe5\uffe6"
- )
-
+ CURRENCY_SYM = text_type(u'$\xa2\xa3\xa4\xa5\u058f\u060b\u09f2\u09f3\u09fb'
+ u'\u0af1\u0bf9\u0e3f\u17db\u20a0\u20a1\u20a2\u20a3'
+ u'\u20a4\u20a5\u20a6\u20a7\u20a8\u20a9\u20aa\u20ab'
+ u'\u20ac\u20ad\u20ae\u20af\u20b0\u20b1\u20b2\u20b3'
+ u'\u20b4\u20b5\u20b6\u20b7\u20b8\u20b9\u20ba\ua838'
+ u'\ufdfc\ufe69\uff04\uffe0\uffe1\uffe5\uffe6')
+
# Pad spaces after opening punctuations.
- OPEN_PUNCT_RE = re.compile(u"([{}])".format(OPEN_PUNCT)), r"\1 "
+ OPEN_PUNCT_RE = re.compile(u'([{}])'.format(OPEN_PUNCT)), r'\1 '
# Pad spaces before closing punctuations.
- CLOSE_PUNCT_RE = re.compile(u"([{}])".format(CLOSE_PUNCT)), r"\1 "
+ CLOSE_PUNCT_RE = re.compile(u'([{}])'.format(CLOSE_PUNCT)), r'\1 '
# Pad spaces after currency symbols.
- CURRENCY_SYM_RE = re.compile(u"([{}])".format(CURRENCY_SYM)), r"\1 "
-
+ CURRENCY_SYM_RE = re.compile(u'([{}])'.format(CURRENCY_SYM)), r'\1 '
+
# Use for tokenizing URL-unfriendly characters: [:/?#]
- URL_FOE_1 = re.compile(r":(?!//)"), r" : " # in perl s{:(?!//)}{ : }g;
- URL_FOE_2 = re.compile(r"\?(?!\S)"), r" ? " # in perl s{\?(?!\S)}{ ? }g;
+ URL_FOE_1 = re.compile(r':(?!//)'), r' : ' # in perl s{:(?!//)}{ : }g;
+ URL_FOE_2 = re.compile(r'\?(?!\S)'), r' ? ' # in perl s{\?(?!\S)}{ ? }g;
# in perl: m{://} or m{\S+\.\S+/\S+} or s{/}{ / }g;
- URL_FOE_3 = re.compile(r"(:\/\/)[\S+\.\S+\/\S+][\/]"), " / "
- URL_FOE_4 = re.compile(r" /"), r" / " # s{ /}{ / }g;
-
+ URL_FOE_3 = re.compile(r'(:\/\/)[\S+\.\S+\/\S+][\/]'), ' / '
+ URL_FOE_4 = re.compile(r' /'), r' / ' # s{ /}{ / }g;
+
# Left/Right strip, i.e. remove heading/trailing spaces.
# These strip regexes should NOT be used,
- # instead use str.lstrip(), str.rstrip() or str.strip()
- # (They are kept for reference purposes to the original toktok.pl code)
- LSTRIP = re.compile(r"^ +"), ""
- RSTRIP = re.compile(r"\s+$"), "\n"
+ # instead use str.lstrip(), str.rstrip() or str.strip()
+ # (They are kept for reference purposes to the original toktok.pl code)
+ LSTRIP = re.compile(r'^ +'), ''
+ RSTRIP = re.compile(r'\s+$'),'\n'
# Merge multiple spaces.
- ONE_SPACE = re.compile(r" {2,}"), " "
-
- TOKTOK_REGEXES = [
- NON_BREAKING,
- FUNKY_PUNCT_1,
- URL_FOE_1,
- URL_FOE_2,
- URL_FOE_3,
- URL_FOE_4,
- AMPERCENT,
- TAB,
- PIPE,
- OPEN_PUNCT_RE,
- CLOSE_PUNCT_RE,
- MULTI_COMMAS,
- COMMA_IN_NUM,
- FINAL_PERIOD_2,
- PROB_SINGLE_QUOTES,
- STUPID_QUOTES_1,
- STUPID_QUOTES_2,
- CURRENCY_SYM_RE,
- EN_EM_DASHES,
- MULTI_DASHES,
- MULTI_DOTS,
- FINAL_PERIOD_1,
- FINAL_PERIOD_2,
- ONE_SPACE,
- ]
-
+ ONE_SPACE = re.compile(r' {2,}'), ' '
+
+ TOKTOK_REGEXES = [NON_BREAKING, FUNKY_PUNCT_1,
+ URL_FOE_1, URL_FOE_2, URL_FOE_3, URL_FOE_4,
+ AMPERCENT, TAB, PIPE,
+ OPEN_PUNCT_RE, CLOSE_PUNCT_RE,
+ MULTI_COMMAS, COMMA_IN_NUM, FINAL_PERIOD_2,
+ PROB_SINGLE_QUOTES, STUPID_QUOTES_1, STUPID_QUOTES_2,
+ CURRENCY_SYM_RE, EN_EM_DASHES, MULTI_DASHES, MULTI_DOTS,
+ FINAL_PERIOD_1, FINAL_PERIOD_2, ONE_SPACE]
+
def tokenize(self, text, return_str=False):
- text = str(text) # Converts input string into unicode.
+ text = text_type(text) # Converts input string into unicode.
for regexp, substitution in self.TOKTOK_REGEXES:
text = regexp.sub(substitution, text)
# Finally, strips heading and trailing spaces
# and converts output string into unicode.
- text = str(text.strip())
- return text if return_str else text.split()
+ text = text_type(text.strip())
+ return text if return_str else text.split()
\ No newline at end of file
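# --- Editor's illustrative sketch (not part of the patch) ---
# End-to-end run of the regex cascade above:
#
#     >>> from nltk.tokenize.toktok import ToktokTokenizer
#     >>> ToktokTokenizer().tokenize(u'\xa1This, is a sentence with weird\xbb symbols\u2026')
#     [u'\xa1', u'This', u',', u'is', u'a', u'sentence', u'with', u'weird', u'\xbb', u'symbols', u'\u2026']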
# Natural Language Toolkit: Tokenizers
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Michael Heilman <mheilman@cmu.edu> (re-port from http://www.cis.upenn.edu/~treebank/tokenizer.sed)
#
import re
from nltk.tokenize.api import TokenizerI
from nltk.tokenize.util import align_tokens
-from nltk.tokenize.destructive import MacIntyreContractions
+
+
+class MacIntyreContractions:
+ """
+ List of contractions adapted from Robert MacIntyre's tokenizer.
+ """
+ CONTRACTIONS2 = [r"(?i)\b(can)(?#X)(not)\b",
+ r"(?i)\b(d)(?#X)('ye)\b",
+ r"(?i)\b(gim)(?#X)(me)\b",
+ r"(?i)\b(gon)(?#X)(na)\b",
+ r"(?i)\b(got)(?#X)(ta)\b",
+ r"(?i)\b(lem)(?#X)(me)\b",
+ r"(?i)\b(mor)(?#X)('n)\b",
+ r"(?i)\b(wan)(?#X)(na)\s"]
+ CONTRACTIONS3 = [r"(?i) ('t)(?#X)(is)\b", r"(?i) ('t)(?#X)(was)\b"]
+ CONTRACTIONS4 = [r"(?i)\b(whad)(dd)(ya)\b",
+ r"(?i)\b(wha)(t)(cha)\b"]
class TreebankWordTokenizer(TokenizerI):
['hi', ',', 'my', 'name', 'ca', "n't", 'hello', ',']
"""
- # starting quotes
+ #starting quotes
STARTING_QUOTES = [
- (re.compile(r"^\""), r"``"),
- (re.compile(r"(``)"), r" \1 "),
- (re.compile(r"([ \(\[{<])(\"|\'{2})"), r"\1 `` "),
+ (re.compile(r'^\"'), r'``'),
+ (re.compile(r'(``)'), r' \1 '),
+ (re.compile(r'([ (\[{<])"'), r'\1 `` '),
]
- # punctuation
+ #punctuation
PUNCTUATION = [
- (re.compile(r"([:,])([^\d])"), r" \1 \2"),
- (re.compile(r"([:,])$"), r" \1 "),
- (re.compile(r"\.\.\."), r" ... "),
- (re.compile(r"[;@#$%&]"), r" \g<0> "),
- (
- re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'),
- r"\1 \2\3 ",
- ), # Handles the final period.
- (re.compile(r"[?!]"), r" \g<0> "),
+ (re.compile(r'([:,])([^\d])'), r' \1 \2'),
+ (re.compile(r'([:,])$'), r' \1 '),
+ (re.compile(r'\.\.\.'), r' ... '),
+ (re.compile(r'[;@#$%&]'), r' \g<0> '),
+ (re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'), r'\1 \2\3 '), # Handles the final period.
+ (re.compile(r'[?!]'), r' \g<0> '),
+
(re.compile(r"([^'])' "), r"\1 ' "),
]
# Pads parentheses
- PARENS_BRACKETS = (re.compile(r"[\]\[\(\)\{\}\<\>]"), r" \g<0> ")
+ PARENS_BRACKETS = (re.compile(r'[\]\[\(\)\{\}\<\>]'), r' \g<0> ')
# Optionally: Convert parentheses, brackets and converts them to PTB symbols.
CONVERT_PARENTHESES = [
- (re.compile(r"\("), "-LRB-"),
- (re.compile(r"\)"), "-RRB-"),
- (re.compile(r"\["), "-LSB-"),
- (re.compile(r"\]"), "-RSB-"),
- (re.compile(r"\{"), "-LCB-"),
- (re.compile(r"\}"), "-RCB-"),
+ (re.compile(r'\('), '-LRB-'), (re.compile(r'\)'), '-RRB-'),
+ (re.compile(r'\['), '-LSB-'), (re.compile(r'\]'), '-RSB-'),
+ (re.compile(r'\{'), '-LCB-'), (re.compile(r'\}'), '-RCB-')
]
- DOUBLE_DASHES = (re.compile(r"--"), r" -- ")
+ DOUBLE_DASHES = (re.compile(r'--'), r' -- ')
- # ending quotes
+ #ending quotes
ENDING_QUOTES = [
(re.compile(r'"'), " '' "),
- (re.compile(r"(\S)(\'\')"), r"\1 \2 "),
+ (re.compile(r'(\S)(\'\')'), r'\1 \2 '),
(re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "),
(re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "),
]
regexp, substitution = self.DOUBLE_DASHES
text = regexp.sub(substitution, text)
- # add extra space to make things easier
+ #add extra space to make things easier
text = " " + text + " "
for regexp, substitution in self.ENDING_QUOTES:
text = regexp.sub(substitution, text)
for regexp in self.CONTRACTIONS2:
- text = regexp.sub(r" \1 \2 ", text)
+ text = regexp.sub(r' \1 \2 ', text)
for regexp in self.CONTRACTIONS3:
- text = regexp.sub(r" \1 \2 ", text)
+ text = regexp.sub(r' \1 \2 ', text)
# We are not using CONTRACTIONS4 since
# they are also commented out in the SED scripts
... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
- >>> list(TreebankWordTokenizer().span_tokenize(s)) == expected
+ >>> TreebankWordTokenizer().span_tokenize(s) == expected
True
>>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
>>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
True
- Additional example
- >>> from nltk.tokenize import TreebankWordTokenizer
- >>> s = '''I said, "I'd like to buy some ''good muffins" which cost $3.88\\n each in New (York)."'''
- >>> expected = [(0, 1), (2, 6), (6, 7), (8, 9), (9, 10), (10, 12),
- ... (13, 17), (18, 20), (21, 24), (25, 29), (30, 32), (32, 36),
- ... (37, 44), (44, 45), (46, 51), (52, 56), (57, 58), (58, 62),
- ... (64, 68), (69, 71), (72, 75), (76, 77), (77, 81), (81, 82),
- ... (82, 83), (83, 84)]
- >>> list(TreebankWordTokenizer().span_tokenize(s)) == expected
- True
- >>> expected = ['I', 'said', ',', '"', 'I', "'d", 'like', 'to',
- ... 'buy', 'some', "''", "good", 'muffins', '"', 'which', 'cost',
- ... '$', '3.88', 'each', 'in', 'New', '(', 'York', ')', '.', '"']
- >>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
- True
-
"""
raw_tokens = self.tokenize(text)
# Convert converted quotes back to original double quotes
- # Do this only if original text contains double quote(s) or double
- # single-quotes (because '' might be transformed to `` if it is
- # treated as starting quotes).
- if ('"' in text) or ("''" in text):
+ # Do this only if original text contains double quote(s)
+ if '"' in text:
# Find double quotes and converted quotes
- matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)]
-
+ matched = [m.group() for m in re.finditer(r'[(``)(\'\')(")]+', text)]
+
# Replace converted quotes back to double quotes
- tokens = [
- matched.pop(0) if tok in ['"', "``", "''"] else tok
- for tok in raw_tokens
- ]
+ tokens = [matched.pop(0) if tok in ['"', "``", "''"] else tok for tok in raw_tokens]
else:
tokens = raw_tokens
- for tok in align_tokens(tokens, text):
- yield tok
+ return align_tokens(tokens, text)
class TreebankWordDetokenizer(TokenizerI):
True
During tokenization it's safe to add more spaces but during detokenization,
- simply undoing the padding doesn't really help.
+ simply undoing the padding doesn't really help.
- During tokenization, left and right pads are added to [!?]; when
detokenizing, only a left shift of the [!?] is needed.
>>> twd.detokenize(toks)
"hello, i can't feel; my feet! Help!! He said: Help, help?!"
"""
-
_contractions = MacIntyreContractions()
- CONTRACTIONS2 = [
- re.compile(pattern.replace("(?#X)", "\s"))
- for pattern in _contractions.CONTRACTIONS2
- ]
- CONTRACTIONS3 = [
- re.compile(pattern.replace("(?#X)", "\s"))
- for pattern in _contractions.CONTRACTIONS3
- ]
+ CONTRACTIONS2 = [re.compile(pattern.replace('(?#X)', '\s'))
+ for pattern in _contractions.CONTRACTIONS2]
+ CONTRACTIONS3 = [re.compile(pattern.replace('(?#X)', '\s'))
+ for pattern in _contractions.CONTRACTIONS3]
- # ending quotes
+ #ending quotes
ENDING_QUOTES = [
(re.compile(r"([^' ])\s('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1\2 "),
(re.compile(r"([^' ])\s('[sS]|'[mM]|'[dD]|') "), r"\1\2 "),
- (re.compile(r"(\S)(\'\')"), r"\1\2 "),
- (re.compile(r" '' "), '"'),
- ]
+ (re.compile(r'(\S)(\'\')'), r'\1\2 '),
+ (re.compile(r" '' "), '"')
+ ]
# Handles double dashes
- DOUBLE_DASHES = (re.compile(r" -- "), r"--")
+ DOUBLE_DASHES = (re.compile(r' -- '), r'--')
# Optionally: Convert parentheses, brackets and converts them from PTB symbols.
CONVERT_PARENTHESES = [
- (re.compile("-LRB-"), "("),
- (re.compile("-RRB-"), ")"),
- (re.compile("-LSB-"), "["),
- (re.compile("-RSB-"), "]"),
- (re.compile("-LCB-"), "{"),
- (re.compile("-RCB-"), "}"),
+ (re.compile('-LRB-'), '('), (re.compile('-RRB-'), ')'),
+ (re.compile('-LSB-'), '['), (re.compile('-RSB-'), ']'),
+ (re.compile('-LCB-'), '{'), (re.compile('-RCB-'), '}')
]
# Undo padding on parentheses.
- PARENS_BRACKETS = [
- (re.compile(r"\s([\[\(\{\<])\s"), r" \g<1>"),
- (re.compile(r"\s([\]\)\}\>])\s"), r"\g<1> "),
- (re.compile(r"([\]\)\}\>])\s([:;,.])"), r"\1\2"),
- ]
+ PARENS_BRACKETS = [(re.compile(r'\s([\[\(\{\<])\s'), r' \g<1>'),
+ (re.compile(r'\s([\]\)\}\>])\s'), r'\g<1> '),
+ (re.compile(r'([\]\)\}\>])\s([:;,.])'), r'\1\2')]
- # punctuation
+ #punctuation
PUNCTUATION = [
(re.compile(r"([^'])\s'\s"), r"\1' "),
- (re.compile(r"\s([?!])"), r"\g<1>"), # Strip left pad for [?!]
- # (re.compile(r'\s([?!])\s'), r'\g<1>'),
- (re.compile(r'([^\.])\s(\.)([\]\)}>"\']*)\s*$'), r"\1\2\3"),
+ (re.compile(r'\s([?!])'), r'\g<1>'), # Strip left pad for [?!]
+ #(re.compile(r'\s([?!])\s'), r'\g<1>'),
+ (re.compile(r'([^\.])\s(\.)([\]\)}>"\']*)\s*$'), r'\1\2\3'),
# When tokenizing, [;@#$%&] are padded with whitespace regardless of
# whether there are spaces before or after them.
# But during detokenization, we need to distinguish between left/right
# pad, so we split this up.
- (re.compile(r"\s([#$])\s"), r" \g<1>"), # Left pad.
- (re.compile(r"\s([;%])\s"), r"\g<1> "), # Right pad.
- (re.compile(r"\s([&*])\s"), r" \g<1> "), # Unknown pad.
- (re.compile(r"\s\.\.\.\s"), r"..."),
- (re.compile(r"\s([:,])\s$"), r"\1"),
- (
- re.compile(r"\s([:,])\s([^\d])"),
- r"\1 \2",
- ) # Keep right pad after comma/colon before non-digits.
- # (re.compile(r'\s([:,])\s([^\d])'), r'\1\2')
- ]
-
- # starting quotes
+ (re.compile(r'\s([#$])\s'), r' \g<1>'), # Left pad.
+ (re.compile(r'\s([;%])\s'), r'\g<1> '), # Right pad.
+ (re.compile(r'\s([&])\s'), r' \g<1> '), # Unknown pad.
+ (re.compile(r'\s\.\.\.\s'), r'...'),
+ (re.compile(r'\s([:,])\s$'), r'\1'),
+ (re.compile(r'\s([:,])\s([^\d])'), r'\1 \2') # Keep right pad after comma/colon before non-digits.
+ #(re.compile(r'\s([:,])\s([^\d])'), r'\1\2')
+ ]
+
+ #starting quotes
STARTING_QUOTES = [
- (re.compile(r"([ (\[{<])\s``"), r'\1"'),
- (re.compile(r"\s(``)\s"), r"\1"),
- (re.compile(r"^``"), r"\""),
+ (re.compile(r'([ (\[{<])\s``'), r'\1"'),
+ (re.compile(r'\s(``)\s'), r'\1'),
+ (re.compile(r'^``'), r'\"'),
]
def tokenize(self, tokens, convert_parentheses=False):
"""
- Treebank detokenizer, created by undoing the regexes from
- the TreebankWordTokenizer.tokenize.
+ Python port of the Moses detokenizer.
:param tokens: A list of strings, i.e. tokenized text.
:type tokens: list(str)
:return: str
"""
- text = " ".join(tokens)
+ text = ' '.join(tokens)
# Reverse the contractions regexes.
# Note: CONTRACTIONS4 are not used in tokenization.
for regexp in self.CONTRACTIONS3:
- text = regexp.sub(r"\1\2", text)
+ text = regexp.sub(r'\1\2', text)
for regexp in self.CONTRACTIONS2:
- text = regexp.sub(r"\1\2", text)
+ text = regexp.sub(r'\1\2', text)
# Reverse the regexes applied for ending quotes.
for regexp, substitution in self.ENDING_QUOTES:
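# --- Editor's illustrative sketch (not part of the patch) ---
# Round trip through the tokenizer and detokenizer defined in this file:
#
#     >>> from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer
#     >>> toks = TreebankWordTokenizer().tokenize("I can't feel my feet!")
#     >>> TreebankWordDetokenizer().detokenize(toks)
#     "I can't feel my feet!"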
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Tokenizer Utilities
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.sourceforge.net>
# For license information, see LICENSE.TXT
from re import finditer
from xml.sax.saxutils import escape, unescape
-
def string_span_tokenize(s, sep):
r"""
Return the offsets of the tokens in *s*, as a sequence of ``(start, end)``
left = right + len(sep)
-
def regexp_span_tokenize(s, regexp):
r"""
Return the offsets of the tokens in *s*, as a sequence of ``(start, end)``
left = next
yield left, len(s)
-
def spans_to_relative(spans):
r"""
Return a sequence of relative spans, given a sequence of spans.
This is a Python port of the CJK code point enumerations of Moses tokenizer:
https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/detokenizer.perl#L309
"""
-
# Hangul Jamo (1100–11FF)
- Hangul_Jamo = (4352, 4607) # (ord(u"\u1100"), ord(u"\u11ff"))
+ Hangul_Jamo = (4352, 4607) # (ord(u"\u1100"), ord(u"\u11ff"))
# CJK Radicals Supplement (2E80–2EFF)
# Kangxi Radicals (2F00–2FDF)
# CJK Unified Ideographs (4E00–9FFF)
# Yi Syllables (A000–A48F)
# Yi Radicals (A490–A4CF)
- CJK_Radicals = (11904, 42191) # (ord(u"\u2e80"), ord(u"\ua4cf"))
+ CJK_Radicals = (11904, 42191) # (ord(u"\u2e80"), ord(u"\ua4cf"))
# Phags-pa (A840–A87F)
- Phags_Pa = (43072, 43135) # (ord(u"\ua840"), ord(u"\ua87f"))
+ Phags_Pa = (43072, 43135) # (ord(u"\ua840"), ord(u"\ua87f"))
# Hangul Syllables (AC00–D7AF)
- Hangul_Syllables = (44032, 55215) # (ord(u"\uAC00"), ord(u"\uD7AF"))
+ Hangul_Syllables = (44032, 55215) # (ord(u"\uAC00"), ord(u"\uD7AF"))
# CJK Compatibility Ideographs (F900–FAFF)
- CJK_Compatibility_Ideographs = (63744, 64255) # (ord(u"\uF900"), ord(u"\uFAFF"))
+ CJK_Compatibility_Ideographs = (63744, 64255) # (ord(u"\uF900"), ord(u"\uFAFF"))
# CJK Compatibility Forms (FE30–FE4F)
- CJK_Compatibility_Forms = (65072, 65103) # (ord(u"\uFE30"), ord(u"\uFE4F"))
+ CJK_Compatibility_Forms = (65072, 65103) # (ord(u"\uFE30"), ord(u"\uFE4F"))
# Range U+FF65–FFDC encodes halfwidth forms, of Katakana and Hangul characters
- Katakana_Hangul_Halfwidth = (65381, 65500) # (ord(u"\uFF65"), ord(u"\uFFDC"))
+ Katakana_Hangul_Halfwidth = (65381, 65500) # (ord(u"\uFF65"), ord(u"\uFFDC"))
# Supplementary Ideographic Plane 20000–2FFFF
- Supplementary_Ideographic_Plane = (
- 131072,
- 196607,
- ) # (ord(u"\U00020000"), ord(u"\U0002FFFF"))
-
- ranges = [
- Hangul_Jamo,
- CJK_Radicals,
- Phags_Pa,
- Hangul_Syllables,
- CJK_Compatibility_Ideographs,
- CJK_Compatibility_Forms,
- Katakana_Hangul_Halfwidth,
- Supplementary_Ideographic_Plane,
- ]
+ Supplementary_Ideographic_Plane = (131072, 196607) # (ord(u"\U00020000"), ord(u"\U0002FFFF"))
+
+ ranges = [Hangul_Jamo, CJK_Radicals, Phags_Pa, Hangul_Syllables,
+ CJK_Compatibility_Ideographs, CJK_Compatibility_Forms,
+ Katakana_Hangul_Halfwidth, Supplementary_Ideographic_Plane]
+
def is_cjk(character):
:type character: char
:return: bool
"""
- return any(
- [
- start <= ord(character) <= end
- for start, end in [
- (4352, 4607),
- (11904, 42191),
- (43072, 43135),
- (44032, 55215),
- (63744, 64255),
- (65072, 65103),
- (65381, 65500),
- (131072, 196607),
- ]
- ]
- )
+ return any([start <= ord(character) <= end for start, end in
+ [(4352, 4607), (11904, 42191), (43072, 43135), (44032, 55215),
+ (63744, 64255), (65072, 65103), (65381, 65500),
+ (131072, 196607)]
+ ])
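# --- Editor's illustrative sketch (not part of the patch) ---
# A quick check of the CJK ranges listed above:
#
#     >>> from nltk.tokenize.util import is_cjk
#     >>> is_cjk(u'\u4e2d'), is_cjk(u'a')
#     (True, False)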
def xml_escape(text):
:type text: str
:rtype: str
"""
- return escape(
- text,
- entities={
- r"'": r"&apos;",
- r'"': r"&quot;",
- r"|": r"&#124;",
- r"[": r"&#91;",
- r"]": r"&#93;",
- },
- )
+ return escape(text, entities={ r"'": r"&apos;", r'"': r"&quot;",
+ r"|": r"&#124;",
+ r"[": r"&#91;", r"]": r"&#93;", })
def xml_unescape(text):
:type text: str
:rtype: str
"""
- return unescape(
- text,
- entities={
- r"&apos;": r"'",
- r"&quot;": r'"',
- r"&#124;": r"|",
- r"&#91;": r"[",
- r"&#93;": r"]",
- },
- )
+ return unescape(text, entities={ r"&apos;":r"'", r"&quot;":r'"',
+ r"&#124;":r"|",
+ r"&#91;":r"[", r"&#93;":r"]", })
def align_tokens(tokens, sentence):
# coding: utf-8
# Natural Language Toolkit: Toolbox Reader
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Greg Aumann <greg_aumann@sil.org>
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT
Module for reading, writing and manipulating
Toolbox databases and settings files.
"""
+from __future__ import print_function
-import re, codecs
+import os, re, codecs
from xml.etree.ElementTree import ElementTree, TreeBuilder, Element, SubElement
-from io import StringIO
-from nltk.data import PathPointer, find
+from six import u
+
+from nltk.compat import StringIO, PY3
+from nltk.data import PathPointer, ZipFilePathPointer, find
class StandardFormat(object):
"""
Class for reading and processing standard format marker files and strings.
"""
-
def __init__(self, filename=None, encoding=None):
self._encoding = encoding
if filename is not None:
# (PathPointer.open doesn't take a mode option)
self._file = sfm_file.open(self._encoding)
else:
- self._file = codecs.open(sfm_file, "rU", self._encoding)
+ self._file = codecs.open(sfm_file, 'rU', self._encoding)
def open_string(self, s):
"""
:rtype: iter(tuple(str, str))
"""
- join_string = "\n"
- line_regexp = r"^%s(?:\\(\S+)\s*)?(.*)$"
+ join_string = '\n'
+ line_regexp = r'^%s(?:\\(\S+)\s*)?(.*)$'
# discard a BOM in the first line
- first_line_pat = re.compile(line_regexp % "(?:\xef\xbb\xbf)?")
- line_pat = re.compile(line_regexp % "")
+ first_line_pat = re.compile(line_regexp % '(?:\xef\xbb\xbf)?')
+ line_pat = re.compile(line_regexp % '')
# need to get first line outside the loop for correct handling
# of the first marker if it spans multiple lines
file_iter = iter(self._file)
- # PEP 479, prevent RuntimeError when StopIteration is raised inside generator
- try:
- line = next(file_iter)
- except StopIteration:
- # no more data is available, terminate the generator
- return
+ line = next(file_iter)
mobj = re.match(first_line_pat, line)
mkr, line_value = mobj.groups()
- value_lines = [line_value]
+ value_lines = [line_value,]
self.line_num = 0
for line in file_iter:
self.line_num += 1
if line_mkr:
yield (mkr, join_string.join(value_lines))
mkr = line_mkr
- value_lines = [line_value]
+ value_lines = [line_value,]
else:
value_lines.append(line_value)
self.line_num += 1
yield (mkr, join_string.join(value_lines))
- def fields(
- self,
- strip=True,
- unwrap=True,
- encoding=None,
- errors="strict",
- unicode_fields=None,
- ):
+ def fields(self, strip=True, unwrap=True, encoding=None, errors='strict', unicode_fields=None):
"""
Return an iterator that returns the next field in a ``(marker, value)``
tuple, where ``marker`` and ``value`` are unicode strings if an ``encoding``
:rtype: iter(tuple(str, str))
"""
if encoding is None and unicode_fields is not None:
- raise ValueError("unicode_fields is set but not encoding.")
- unwrap_pat = re.compile(r"\n+")
+ raise ValueError('unicode_fields is set but not encoding.')
+ unwrap_pat = re.compile(r'\n+')
for mkr, val in self.raw_fields():
+ if encoding and not PY3: # kludge - already decoded in PY3?
+ if unicode_fields is not None and mkr in unicode_fields:
+ val = val.decode('utf8', errors)
+ else:
+ val = val.decode(encoding, errors)
+ mkr = mkr.decode(encoding, errors)
if unwrap:
- val = unwrap_pat.sub(" ", val)
+ val = unwrap_pat.sub(' ', val)
if strip:
val = val.rstrip()
yield (mkr, val)
except AttributeError:
pass
-
class ToolboxData(StandardFormat):
- def parse(self, grammar=None, **kwargs):
+ def parse(self, grammar=None, **kwargs):
if grammar:
- return self._chunk_parse(grammar=grammar, **kwargs)
+ return self._chunk_parse(grammar=grammar, **kwargs)
else:
return self._record_parse(**kwargs)
:return: contents of toolbox data divided into header and records
"""
builder = TreeBuilder()
- builder.start("toolbox_data", {})
- builder.start("header", {})
+ builder.start('toolbox_data', {})
+ builder.start('header', {})
in_records = False
for mkr, value in self.fields(**kwargs):
- if key is None and not in_records and mkr[0] != "_":
+ if key is None and not in_records and mkr[0] != '_':
key = mkr
if mkr == key:
if in_records:
- builder.end("record")
+ builder.end('record')
else:
- builder.end("header")
+ builder.end('header')
in_records = True
- builder.start("record", {})
+ builder.start('record', {})
builder.start(mkr, {})
builder.data(value)
builder.end(mkr)
if in_records:
- builder.end("record")
+ builder.end('record')
else:
- builder.end("header")
- builder.end("toolbox_data")
+ builder.end('header')
+ builder.end('toolbox_data')
return builder.close()
def _tree2etree(self, parent):
e.text = text
return root
- def _chunk_parse(self, grammar=None, root_label="record", trace=0, **kwargs):
+ def _chunk_parse(self, grammar=None, root_label='record', trace=0, **kwargs):
"""
Returns an element tree structure corresponding to a toolbox data file
parsed according to the chunk grammar.
cp = chunk.RegexpParser(grammar, root_label=root_label, trace=trace)
db = self.parse(**kwargs)
- tb_etree = Element("toolbox_data")
- header = db.find("header")
+ tb_etree = Element('toolbox_data')
+ header = db.find('header')
tb_etree.append(header)
- for record in db.findall("record"):
+ for record in db.findall('record'):
parsed = cp.parse([(elem.text, elem.tag) for elem in record])
tb_etree.append(self._tree2etree(parsed))
return tb_etree
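A hedged end-to-end sketch of the record parser above (it mirrors the demo() at the end of this module and requires the toolbox sample corpus):

from nltk.data import find
from nltk.toolbox import ToolboxData

file_path = find('corpora/toolbox/rotokas.dic')
lexicon = ToolboxData(file_path).parse()
print(lexicon[3][0].tag, lexicon[3][0].text)  # first field of the fourth record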
-
_is_value = re.compile(r"\S")
-
-def to_sfm_string(tree, encoding=None, errors="strict", unicode_fields=None):
+def to_sfm_string(tree, encoding=None, errors='strict', unicode_fields=None):
"""
Return a string with a standard format representation of the toolbox
data in tree (tree can be a toolbox database or a single record).
:type unicode_fields: dict(str) or set(str)
:rtype: str
"""
- if tree.tag == "record":
- root = Element("toolbox_data")
+ if tree.tag == 'record':
+ root = Element('toolbox_data')
root.append(tree)
tree = root
- if tree.tag != "toolbox_data":
+ if tree.tag != 'toolbox_data':
raise ValueError("not a toolbox_data element structure")
if encoding is None and unicode_fields is not None:
- raise ValueError(
- "if encoding is not specified then neither should unicode_fields"
- )
+ raise ValueError("if encoding is not specified then neither should unicode_fields")
l = []
for rec in tree:
- l.append("\n")
+ l.append('\n')
for field in rec:
mkr = field.tag
value = field.text
if encoding is not None:
if unicode_fields is not None and mkr in unicode_fields:
- cur_encoding = "utf8"
+ cur_encoding = 'utf8'
else:
cur_encoding = encoding
if re.search(_is_value, value):
- l.append(
- ("\\%s %s\n" % (mkr, value)).encode(cur_encoding, errors)
- )
+ l.append((u("\\%s %s\n") % (mkr, value)).encode(cur_encoding, errors))
else:
- l.append(
- ("\\%s%s\n" % (mkr, value)).encode(cur_encoding, errors)
- )
+ l.append((u("\\%s%s\n") % (mkr, value)).encode(cur_encoding, errors))
else:
if re.search(_is_value, value):
l.append("\\%s %s\n" % (mkr, value))
else:
l.append("\\%s%s\n" % (mkr, value))
- return "".join(l[1:])
-
+ return ''.join(l[1:])
class ToolboxSettings(StandardFormat):
"""This class is the base class for settings files."""
def __init__(self):
super(ToolboxSettings, self).__init__()
- def parse(self, encoding=None, errors="strict", **kwargs):
+ def parse(self, encoding=None, errors='strict', **kwargs):
"""
Return the contents of toolbox settings file with a nested structure.
for mkr, value in self.fields(encoding=encoding, errors=errors, **kwargs):
# Check whether the first char of the field marker
# indicates a block start (+) or end (-)
block = mkr[0]
if block in ("+", "-"):
mkr = mkr[1:]
else:
block = None
# Build tree on the basis of block char
if block == "+":
builder.start(mkr, {})
builder.data(value)
- elif block == "-":
+ elif block == '-':
builder.end(mkr)
else:
builder.start(mkr, {})
builder.end(mkr)
return builder.close()
-
-def to_settings_string(tree, encoding=None, errors="strict", unicode_fields=None):
+def to_settings_string(tree, encoding=None, errors='strict', unicode_fields=None):
# write XML to file
l = list()
- _to_settings_string(
- tree.getroot(),
- l,
- encoding=encoding,
- errors=errors,
- unicode_fields=unicode_fields,
- )
- return "".join(l)
-
+ _to_settings_string(tree.getroot(), l, encoding=encoding, errors=errors, unicode_fields=unicode_fields)
+ return ''.join(l)
def _to_settings_string(node, l, **kwargs):
# write XML to file
text = node.text
if len(node) == 0:
if text:
- l.append("\\%s %s\n" % (tag, text))
+ l.append('\\%s %s\n' % (tag, text))
else:
- l.append("\\%s\n" % tag)
+ l.append('\\%s\n' % tag)
else:
if text:
- l.append("\\+%s %s\n" % (tag, text))
+ l.append('\\+%s %s\n' % (tag, text))
else:
- l.append("\\+%s\n" % tag)
+ l.append('\\+%s\n' % tag)
for n in node:
_to_settings_string(n, l, **kwargs)
- l.append("\\-%s\n" % tag)
+ l.append('\\-%s\n' % tag)
return
-
def remove_blanks(elem):
"""
Remove all elements and subelements with no text and no child elements.
out.append(child)
elem[:] = out
-
def add_default_fields(elem, default_fields):
"""
Add blank elements and subelements specified in default_fields.
:param default_fields: fields to add to each type of element and subelement
:type default_fields: dict(tuple)
"""
- for field in default_fields.get(elem.tag, []):
+ for field in default_fields.get(elem.tag, []):
if elem.find(field) is None:
SubElement(elem, field)
for child in elem:
add_default_fields(child, default_fields)
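A short hedged example of add_default_fields: guarantee that every record element carries (possibly empty) subelements for the hypothetical marker names 'ps' and 'ge', given a parsed lexicon as in the sketch above:

add_default_fields(lexicon, {'record': ('ps', 'ge')})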
-
def sort_fields(elem, field_orders):
"""
Sort the elements and subelements in order specified in field_orders.
order_key[subfield] = i
_sort_fields(elem, order_dicts)
-
def _sort_fields(elem, orders_dicts):
"""sort the children of elem"""
try:
except KeyError:
pass
else:
- tmp = sorted(
- [((order.get(child.tag, 1e9), i), child) for i, child in enumerate(elem)]
- )
+ tmp = sorted([((order.get(child.tag, 1e9), i), child) for i, child in enumerate(elem)])
elem[:] = [child for key, child in tmp]
for child in elem:
if len(child):
_sort_fields(child, orders_dicts)
-
def add_blank_lines(tree, blanks_before, blanks_between):
"""
Add blank lines before all elements and subelements specified in blank_before.
add_blank_lines(elem, blanks_before, blanks_between)
last_elem = elem
-
def demo():
from itertools import islice
- # zip_path = find('corpora/toolbox.zip')
- # lexicon = ToolboxData(ZipFilePathPointer(zip_path, 'toolbox/rotokas.dic')).parse()
- file_path = find("corpora/toolbox/rotokas.dic")
+# zip_path = find('corpora/toolbox.zip')
+# lexicon = ToolboxData(ZipFilePathPointer(zip_path, 'toolbox/rotokas.dic')).parse()
+ file_path = find('corpora/toolbox/rotokas.dic')
lexicon = ToolboxData(file_path).parse()
- print("first field in fourth record:")
+ print('first field in fourth record:')
print(lexicon[3][0].tag)
print(lexicon[3][0].text)
- print("\nfields in sequential order:")
- for field in islice(lexicon.find("record"), 10):
+ print('\nfields in sequential order:')
+ for field in islice(lexicon.find('record'), 10):
print(field.tag, field.text)
- print("\nlx fields:")
- for field in islice(lexicon.findall("record/lx"), 10):
+ print('\nlx fields:')
+ for field in islice(lexicon.findall('record/lx'), 10):
print(field.text)
settings = ToolboxSettings()
- file_path = find("corpora/toolbox/MDF/MDF_AltH.typ")
+ file_path = find('corpora/toolbox/MDF/MDF_AltH.typ')
settings.open(file_path)
- # settings.open(ZipFilePathPointer(zip_path, entry='toolbox/MDF/MDF_AltH.typ'))
- tree = settings.parse(unwrap=False, encoding="cp1252")
- print(tree.find("expset/expMDF/rtfPageSetup/paperSize").text)
+# settings.open(ZipFilePathPointer(zip_path, entry='toolbox/MDF/MDF_AltH.typ'))
+ tree = settings.parse(unwrap=False, encoding='cp1252')
+ print(tree.find('expset/expMDF/rtfPageSetup/paperSize').text)
settings_tree = ElementTree(tree)
- print(to_settings_string(settings_tree).encode("utf8"))
-
+ print(to_settings_string(settings_tree).encode('utf8'))
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Machine Translation
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>, Tah Wei Hoon <hoon.tw@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from nltk.translate.ibm5 import IBMModel5
from nltk.translate.bleu_score import sentence_bleu as bleu
from nltk.translate.ribes_score import sentence_ribes as ribes
-from nltk.translate.meteor_score import meteor_score as meteor
from nltk.translate.metrics import alignment_error_rate
from nltk.translate.stack_decoder import StackDecoder
# Natural Language Toolkit: API for alignment and translation objects
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Will Zhang <wilzzha@gmail.com>
# Guan Gui <ggui@student.unimelb.edu.au>
# Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals
import subprocess
from collections import namedtuple
+from nltk.compat import python_2_unicode_compatible
+@python_2_unicode_compatible
class AlignedSent(object):
"""
Return an aligned sentence object, which encapsulates two sentences
along with an ``Alignment`` between them.
- Typically used in machine translation to represent a sentence and
- its translation.
-
>>> from nltk.translate import AlignedSent, Alignment
>>> algnsent = AlignedSent(['klein', 'ist', 'das', 'Haus'],
- ... ['the', 'house', 'is', 'small'], Alignment.fromstring('0-3 1-2 2-0 3-1'))
+ ... ['the', 'house', 'is', 'small'], Alignment.fromstring('0-2 1-3 2-1 3-0'))
>>> algnsent.words
['klein', 'ist', 'das', 'Haus']
>>> algnsent.mots
['the', 'house', 'is', 'small']
>>> algnsent.alignment
- Alignment([(0, 3), (1, 2), (2, 0), (3, 1)])
+ Alignment([(0, 2), (1, 3), (2, 1), (3, 0)])
>>> from nltk.corpus import comtrans
>>> print(comtrans.aligned_sents()[54])
<AlignedSent: 'Weshalb also sollten...' -> 'So why should EU arm...'>
>>> print(comtrans.aligned_sents()[54].alignment)
0-0 0-1 1-0 2-2 3-4 3-5 4-7 5-8 6-3 7-9 8-9 9-10 9-11 10-12 11-6 12-6 13-13
- :param words: Words in the target language sentence
+ :param words: source language words
:type words: list(str)
- :param mots: Words in the source language sentence
+ :param mots: target language words
:type mots: list(str)
- :param alignment: Word-level alignments between ``words`` and ``mots``.
- Each alignment is represented as a 2-tuple (words_index, mots_index).
+ :param alignment: the word-level alignments between the source
+ and target language
:type alignment: Alignment
"""
def _set_alignment(self, alignment):
_check_alignment(len(self.words), len(self.mots), alignment)
self._alignment = alignment
-
alignment = property(_get_alignment, _set_alignment)
def __repr__(self):
"""
Dot representation of the aligned sentence
"""
- s = "graph align {\n"
- s += "node[shape=plaintext]\n"
+ s = 'graph align {\n'
+ s += 'node[shape=plaintext]\n'
# Declare node
for w in self._words:
s += '"%s_target" [label="%s"] \n' % (w, w)
# Alignment
- for u, v in self._alignment:
- s += '"%s_source" -- "%s_target" \n' % (self._words[u], self._mots[v])
+ for u,v in self._alignment:
+ s += '"%s_source" -- "%s_target" \n' % (self._words[u] , self._mots[v] )
# Connect the source words
- for i in range(len(self._words) - 1):
- s += '"%s_source" -- "%s_source" [style=invis]\n' % (
- self._words[i],
- self._words[i + 1],
- )
+ for i in range(len(self._words)-1) :
+ s += '"%s_source" -- "%s_source" [style=invis]\n' % (self._words[i] , self._words[i+1])
# Connect the target words
- for i in range(len(self._mots) - 1):
- s += '"%s_target" -- "%s_target" [style=invis]\n' % (
- self._mots[i],
- self._mots[i + 1],
- )
+ for i in range(len(self._mots)-1) :
+ s += '"%s_target" -- "%s_target" [style=invis]\n' % (self._mots[i] , self._mots[i+1])
# Put it in the same rank
- s += "{rank = same; %s}\n" % (" ".join('"%s_source"' % w for w in self._words))
- s += "{rank = same; %s}\n" % (" ".join('"%s_target"' % w for w in self._mots))
+ s += '{rank = same; %s}\n' % (' '.join('"%s_source"' % w for w in self._words))
+ s += '{rank = same; %s}\n' % (' '.join('"%s_target"' % w for w in self._mots))
- s += "}"
+ s += '}'
return s
"""
IPython magic: show SVG representation of this ``AlignedSent``.
"""
- dot_string = self._to_dot().encode("utf8")
- output_format = "svg"
+ dot_string = self._to_dot().encode('utf8')
+ output_format = 'svg'
try:
- process = subprocess.Popen(
- ["dot", "-T%s" % output_format],
- stdin=subprocess.PIPE,
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE,
- )
+ process = subprocess.Popen(['dot', '-T%s' % output_format], stdin=subprocess.PIPE,
+ stdout=subprocess.PIPE, stderr=subprocess.PIPE)
except OSError:
- raise Exception("Cannot find the dot binary from Graphviz package")
+ raise Exception('Cannot find the dot binary from Graphviz package')
out, err = process.communicate(dot_string)
- return out.decode("utf8")
+ return out.decode('utf8')
+
def __str__(self):
"""
:rtype: AlignedSent
"""
- return AlignedSent(self._mots, self._words, self._alignment.invert())
-
+ return AlignedSent(self._mots, self._words,
+ self._alignment.invert())
+@python_2_unicode_compatible
class Alignment(frozenset):
"""
A storage class for representing alignment between two sequences, s1, s2.
def __new__(cls, pairs):
self = frozenset.__new__(cls, pairs)
- self._len = max(p[0] for p in self) if self != frozenset([]) else 0
+ self._len = (max(p[0] for p in self) if self != frozenset([]) else 0)
self._index = None
return self
if not positions:
positions = list(range(len(self._index)))
for p in positions:
- image.update(f for _, f in self._index[p])
+ image.update(f for _,f in self._index[p])
return sorted(image)
def __repr__(self):
i, j = pair_string.split("-")
return int(i), int(j)
-
def _naacl2pair(pair_string):
i, j, p = pair_string.split("-")
return int(i), int(j)
-
def _check_alignment(num_words, num_mots, alignment):
"""
Check whether the alignments are legal.
raise IndexError("Alignment is outside boundary of mots")
-PhraseTableEntry = namedtuple("PhraseTableEntry", ["trg_phrase", "log_prob"])
-
-
+PhraseTableEntry = namedtuple('PhraseTableEntry', ['trg_phrase', 'log_prob'])
class PhraseTable(object):
"""
In-memory store of translations for a given phrase, and the log
probabilities of those translations
"""
-
def __init__(self):
self.src_phrases = dict()
if src_phrase not in self.src_phrases:
self.src_phrases[src_phrase] = []
self.src_phrases[src_phrase].append(entry)
- self.src_phrases[src_phrase].sort(key=lambda e: e.log_prob, reverse=True)
+ self.src_phrases[src_phrase].sort(key=lambda e: e.log_prob,
+ reverse=True)
def __contains__(self, src_phrase):
return src_phrase in self.src_phrases
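A hedged usage sketch for PhraseTable (the log probabilities below are illustrative):

phrase_table = PhraseTable()
phrase_table.add(('niemand',), ('nobody',), -1.2)
phrase_table.add(('niemand',), ('no', 'one'), -2.3)
# translations_for() returns entries sorted by descending log probability
print(phrase_table.translations_for(('niemand',))[0].trg_phrase)  # ('nobody',)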
# -*- coding: utf-8 -*-
# Natural Language Toolkit: BLEU Score
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim
-# Contributors: Björn Mattsson, Dmitrijs Milajevs, Liling Tan
+# Contributors: Dmitrijs Milajevs, Liling Tan
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""BLEU score implementation."""
+from __future__ import division
import math
import sys
-from fractions import Fraction
+import fractions
import warnings
from collections import Counter
from nltk.util import ngrams
+try:
+ fractions.Fraction(0, 1000, _normalize=False)
+ from fractions import Fraction
+except TypeError:
+ from nltk.compat import Fraction
-def sentence_bleu(
- references,
- hypothesis,
- weights=(0.25, 0.25, 0.25, 0.25),
- smoothing_function=None,
- auto_reweigh=False,
-):
+
+def sentence_bleu(references, hypothesis, weights=(0.25, 0.25, 0.25, 0.25),
+ smoothing_function=None, auto_reweigh=False,
+ emulate_multibleu=False):
"""
Calculate BLEU score (Bilingual Evaluation Understudy) from
Papineni, Kishore, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002.
>>> sentence_bleu([reference1, reference2, reference3], hypothesis1) # doctest: +ELLIPSIS
0.5045...
- If there is no ngrams overlap for any order of n-grams, BLEU returns the
- value 0. This is because the precision for the order of n-grams without
- overlap is 0, and the geometric mean in the final BLEU score computation
- multiplies the 0 with the precision of other n-grams. This results in 0
- (independently of the precision of the othe n-gram orders). The following
- example has zero 3-gram and 4-gram overlaps:
-
- >>> round(sentence_bleu([reference1, reference2, reference3], hypothesis2),4) # doctest: +ELLIPSIS
- 0.0
-
- To avoid this harsh behaviour when no ngram overlaps are found a smoothing
- function can be used.
-
- >>> chencherry = SmoothingFunction()
- >>> sentence_bleu([reference1, reference2, reference3], hypothesis2,
- ... smoothing_function=chencherry.method1) # doctest: +ELLIPSIS
- 0.0370...
+ >>> sentence_bleu([reference1, reference2, reference3], hypothesis2) # doctest: +ELLIPSIS
+ 0.3969...
- The default BLEU calculates a score for up to 4-grams using uniform
- weights (this is called BLEU-4). To evaluate your translations with
- higher/lower order ngrams, use customized weights. E.g. when accounting
- for up to 5-grams with uniform weights (this is called BLEU-5) use:
+ The default BLEU calculates a score for up to 4-grams using uniform
+ weights. To evaluate your translations with higher/lower order ngrams,
+ use customized weights. E.g. when accounting for up to 5-grams with
+ (near-)uniform weights:
- >>> weights = (1./5., 1./5., 1./5., 1./5., 1./5.)
+ >>> weights = (0.1666, 0.1666, 0.1666, 0.1666, 0.1666)
>>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS
- 0.3920...
+ 0.4583...
:param references: reference sentences
:type references: list(list(str))
:type weights: list(float)
:param smoothing_function:
:type smoothing_function: SmoothingFunction
:param auto_reweigh: Option to re-normalize the weights uniformly.
:type auto_reweigh: bool
+ :param emulate_multibleu: bool
:return: The sentence-level BLEU score.
:rtype: float
"""
- return corpus_bleu(
- [references], [hypothesis], weights, smoothing_function, auto_reweigh
- )
-
-
-def corpus_bleu(
- list_of_references,
- hypotheses,
- weights=(0.25, 0.25, 0.25, 0.25),
- smoothing_function=None,
- auto_reweigh=False,
-):
+ return corpus_bleu([references], [hypothesis],
+ weights, smoothing_function, auto_reweigh,
+ emulate_multibleu)
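A hedged sanity-check sketch for sentence_bleu (the printed value is not asserted here; it depends on the weights and smoothing chosen):

from nltk.translate.bleu_score import sentence_bleu

reference = 'the cat is on the mat'.split()
hypothesis = 'the cat sat on the mat'.split()
print(sentence_bleu([reference], hypothesis, weights=(0.5, 0.5)))  # BLEU-2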
+
+
+def corpus_bleu(list_of_references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25),
+ smoothing_function=None, auto_reweigh=False,
+ emulate_multibleu=False):
"""
Calculate a single corpus-level BLEU score (aka. system-level BLEU) for all
the hypotheses and their respective references.
>>> (score1 + score2) / 2 # doctest: +ELLIPSIS
0.6223...
:param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses
:type list_of_references: list(list(list(str)))
:param hypotheses: a list of hypothesis sentences
:type hypotheses: list(list(str))
:param weights: weights for unigrams, bigrams, trigrams and so on
:type weights: list(float)
:param smoothing_function:
:type smoothing_function: SmoothingFunction
:param auto_reweigh: Option to re-normalize the weights uniformly.
:type auto_reweigh: bool
+ :param emulate_multibleu: bool
:return: The corpus-level BLEU score.
:rtype: float
"""
# Before proceeding to compute BLEU, perform sanity checks.
- p_numerators = Counter() # Key = ngram order, and value = no. of ngram matches.
- p_denominators = Counter() # Key = ngram order, and value = no. of ngram in ref.
+ p_numerators = Counter() # Key = ngram order, and value = no. of ngram matches.
+ p_denominators = Counter() # Key = ngram order, and value = no. of ngram in ref.
hyp_lengths, ref_lengths = 0, 0
- assert len(list_of_references) == len(hypotheses), (
- "The number of hypotheses and their reference(s) should be the " "same "
- )
+ assert len(list_of_references) == len(hypotheses), "The number of hypotheses and their reference(s) should be the same"
# Iterate through each hypothesis and their corresponding references.
for references, hypothesis in zip(list_of_references, hypotheses):
# Calculate the hypothesis length and the closest reference length.
# Adds them to the corpus-level hypothesis and reference counts.
- hyp_len = len(hypothesis)
+ hyp_len = len(hypothesis)
hyp_lengths += hyp_len
ref_lengths += closest_ref_length(references, hyp_len)
# order of n-grams < 4 and weights is set at default.
if auto_reweigh:
if hyp_lengths < 4 and weights == (0.25, 0.25, 0.25, 0.25):
weights = (1 / hyp_lengths,) * hyp_lengths
# Collects the various precision values for the different ngram orders.
- p_n = [
- Fraction(p_numerators[i], p_denominators[i], _normalize=False)
- for i, _ in enumerate(weights, start=1)
- ]
+ p_n = [Fraction(p_numerators[i], p_denominators[i], _normalize=False)
+ for i, _ in enumerate(weights, start=1)]
# Returns 0 if there's no matching n-grams
# We only need to check for p_numerators[1] == 0, since if there's
# Note: smoothing_function() may convert values into floats;
# it tries to retain the Fraction object as much as the
# smoothing method allows.
- p_n = smoothing_function(
- p_n, references=references, hypothesis=hypothesis, hyp_len=hyp_lengths
- )
- s = (w_i * math.log(p_i) for w_i, p_i in zip(weights, p_n))
- s = bp * math.exp(math.fsum(s))
- return s
+ p_n = smoothing_function(p_n, references=references, hypothesis=hypothesis,
+ hyp_len=hyp_len, emulate_multibleu=emulate_multibleu)
+ s = (w * math.log(p_i) for i, (w, p_i) in enumerate(zip(weights, p_n)))
+ s = bp * math.exp(math.fsum(s))
+ return round(s, 4) if emulate_multibleu else s
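A hedged corpus-level sketch: corpus_bleu pools n-gram counts over all sentence pairs before taking the geometric mean, so it is generally not the average of per-sentence BLEU scores:

from nltk.translate.bleu_score import corpus_bleu, sentence_bleu

refs1, hyp1 = ['the cat is on the mat'.split()], 'the cat sat on the mat'.split()
refs2, hyp2 = ['there is a cat on the mat'.split()], 'a cat is on the mat'.split()
print(corpus_bleu([refs1, refs2], [hyp1, hyp2], weights=(0.5, 0.5)))
print((sentence_bleu(refs1, hyp1, weights=(0.5, 0.5)) +
       sentence_bleu(refs2, hyp2, weights=(0.5, 0.5))) / 2)  # usually differs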
def modified_precision(references, hypothesis, n):
# Set an empty Counter if hypothesis is empty.
counts = Counter(ngrams(hypothesis, n)) if len(hypothesis) >= n else Counter()
# Extract a union of references' counts.
- # max_counts = reduce(or_, [Counter(ngrams(ref, n)) for ref in references])
+ ## max_counts = reduce(or_, [Counter(ngrams(ref, n)) for ref in references])
max_counts = {}
for reference in references:
- reference_counts = (
- Counter(ngrams(reference, n)) if len(reference) >= n else Counter()
- )
+ reference_counts = Counter(ngrams(reference, n)) if len(reference) >= n else Counter()
for ngram in counts:
- max_counts[ngram] = max(max_counts.get(ngram, 0), reference_counts[ngram])
+ max_counts[ngram] = max(max_counts.get(ngram, 0),
+ reference_counts[ngram])
# Assigns the intersection between hypothesis and references' counts.
- clipped_counts = {
- ngram: min(count, max_counts[ngram]) for ngram, count in counts.items()
- }
+ clipped_counts = {ngram: min(count, max_counts[ngram])
+ for ngram, count in counts.items()}
numerator = sum(clipped_counts.values())
# Ensures that denominator is minimum 1 to avoid ZeroDivisionError.
:param references: A list of reference translations.
:type references: list(list(str))
:param hyp_len: The length of the hypothesis.
:type hyp_len: int
:return: The length of the reference that's closest to the hypothesis.
:rtype: int
"""
ref_lens = (len(reference) for reference in references)
- closest_ref_len = min(
- ref_lens, key=lambda ref_len: (abs(ref_len - hyp_len), ref_len)
- )
+ closest_ref_len = min(ref_lens, key=lambda ref_len:
+ (abs(ref_len - hyp_len), ref_len))
return closest_ref_len
:type hyp_len: int
:param closest_ref_len: The length of the closest reference for a single
hypothesis OR the sum of all the closest references for every hypotheses.
:type closest_ref_len: int
:return: BLEU's brevity penalty.
:rtype: float
"""
Smoothing Techniques for Sentence-Level BLEU. In WMT14.
http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf
"""
-
def __init__(self, epsilon=0.1, alpha=5, k=5):
"""
This will initialize the parameters required for the various smoothing
... 'Party', 'commands']
>>> chencherry = SmoothingFunction()
- >>> print(sentence_bleu([reference1], hypothesis1)) # doctest: +ELLIPSIS
+ >>> print (sentence_bleu([reference1], hypothesis1)) # doctest: +ELLIPSIS
0.4118...
- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method0)) # doctest: +ELLIPSIS
+ >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method0)) # doctest: +ELLIPSIS
0.4118...
- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method1)) # doctest: +ELLIPSIS
+ >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method1)) # doctest: +ELLIPSIS
0.4118...
- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method2)) # doctest: +ELLIPSIS
+ >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method2)) # doctest: +ELLIPSIS
0.4489...
- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method3)) # doctest: +ELLIPSIS
+ >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method3)) # doctest: +ELLIPSIS
0.4118...
- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method4)) # doctest: +ELLIPSIS
+ >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method4)) # doctest: +ELLIPSIS
0.4118...
- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method5)) # doctest: +ELLIPSIS
+ >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method5)) # doctest: +ELLIPSIS
0.4905...
- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method6)) # doctest: +ELLIPSIS
+ >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method6)) # doctest: +ELLIPSIS
0.4135...
- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method7)) # doctest: +ELLIPSIS
+ >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method7)) # doctest: +ELLIPSIS
0.4905...
:param epsilon: the epsilon value use in method 1
self.k = k
def method0(self, p_n, *args, **kwargs):
- """
- No smoothing.
- """
+ """ No smoothing. """
p_n_new = []
+ _emulate_multibleu = kwargs.get('emulate_multibleu', False)
for i, p_i in enumerate(p_n):
if p_i.numerator != 0:
p_n_new.append(p_i)
+ elif _emulate_multibleu and i < 5:
+ return [sys.float_info.min]
else:
- _msg = str(
- "\nThe hypothesis contains 0 counts of {}-gram overlaps.\n"
- "Therefore the BLEU score evaluates to 0, independently of\n"
- "how many N-gram overlaps of lower order it contains.\n"
- "Consider using lower n-gram order or use "
- "SmoothingFunction()"
- ).format(i + 1)
+ _msg = str("\nCorpus/Sentence contains 0 counts of {}-gram overlaps.\n"
+ "BLEU scores might be undesirable; "
+ "use SmoothingFunction().").format(i+1)
warnings.warn(_msg)
- # When numerator==0 where denonminator==0 or !=0, the result
- # for the precision score should be equal to 0 or undefined.
- # Due to BLEU geometric mean computation in logarithm space,
- # we we need to take the return sys.float_info.min such that
- # math.log(sys.float_info.min) returns a 0 precision score.
- p_n_new.append(sys.float_info.min)
+ # If this order of n-gram returns 0 counts, the higher order
+ # n-gram would also return 0, thus breaking the loop here.
+ break
return p_n_new
def method1(self, p_n, *args, **kwargs):
"""
Smoothing method 1: Add *epsilon* counts to precision with 0 counts.
"""
- return [
- (p_i.numerator + self.epsilon) / p_i.denominator
- if p_i.numerator == 0
- else p_i
- for p_i in p_n
- ]
+ return [(p_i.numerator + self.epsilon)/ p_i.denominator
+ if p_i.numerator == 0 else p_i for p_i in p_n]
def method2(self, p_n, *args, **kwargs):
"""
machine translation quality using longest common subsequence and
skip-bigram statistics. In ACL04.
"""
- return [
- Fraction(p_i.numerator + 1, p_i.denominator + 1, _normalize=False)
- for p_i in p_n
- ]
+ return [Fraction(p_i.numerator + 1, p_i.denominator + 1, _normalize=False) for p_i in p_n]
def method3(self, p_n, *args, **kwargs):
"""
- n=3 => prec_count = 1/2 (no trigram, taking 'smoothed' value of 1 / ( 2^k ), with k=1)
- n=4 => prec_count = 1/4 (no fourgram, taking 'smoothed' value of 1 / ( 2^k ), with k=2)
"""
- incvnt = 1 # From the mteval-v13a.pl, it's referred to as k.
+ incvnt = 1 # From the mteval-v13a.pl, it's referred to as k.
for i, p_i in enumerate(p_n):
if p_i.numerator == 0:
- p_n[i] = 1 / (2 ** incvnt * p_i.denominator)
- incvnt += 1
+ p_n[i] = 1 / (2**incvnt * p_i.denominator)
+ incvnt+=1
return p_n
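A worked arithmetic sketch of method3 above: each successive zero precision is replaced by a geometrically decaying pseudo-count, e.g. with p_3 = 0/5 and p_4 = 0/4:

# p_3 -> 1 / (2**1 * 5) = 0.1      (first zero, incvnt = 1)
# p_4 -> 1 / (2**2 * 4) = 0.0625   (second zero, incvnt = 2)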
- def method4(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
+ def method4(self, p_n, references, hypothesis, hyp_len, *args, **kwargs):
"""
Smoothing method 4:
Shorter translations may have inflated precision values due to having
smaller smoothed counts. Instead of scaling to 1/(2^k), Chen and Cherry
suggests dividing by 1/ln(len(T)), where T is the length of the translation.
"""
- hyp_len = hyp_len if hyp_len else len(hypothesis)
for i, p_i in enumerate(p_n):
if p_i.numerator == 0 and hyp_len != 0:
- incvnt = i + 1 * self.k / math.log(
- hyp_len
- ) # Note that this K is different from the K from NIST.
- p_n[i] = incvnt / p_i.denominator
+ incvnt = i+1 * self.k / math.log(hyp_len) # Note that this K is different from the K from NIST.
+ p_n[i] = 1 / incvnt
return p_n
- def method5(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
+
+ def method5(self, p_n, references, hypothesis, hyp_len, *args, **kwargs):
"""
Smoothing method 5:
The matched counts for similar values of n should be similar. To
calculate the n-gram matched count, it averages the n−1, n and n+1 gram
matched counts.
"""
- hyp_len = hyp_len if hyp_len else len(hypothesis)
m = {}
# Requires a precision value for an additional ngram order.
p_n_plus1 = p_n + [modified_precision(references, hypothesis, 5)]
m[-1] = p_n[0] + 1
for i, p_i in enumerate(p_n):
- p_n[i] = (m[i - 1] + p_i + p_n_plus1[i + 1]) / 3
+ p_n[i] = (m[i-1] + p_i + p_n_plus1[i+1]) / 3
m[i] = p_n[i]
return p_n
- def method6(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
+ def method6(self, p_n, references, hypothesis, hyp_len, *args, **kwargs):
"""
Smoothing method 6:
Interpolates the maximum likelihood estimate of the precision *p_n* with
Gao and He (2013) Training MRF-Based Phrase Translation Models using
Gradient Ascent. In NAACL.
"""
- hyp_len = hyp_len if hyp_len else len(hypothesis)
# This smoothing only works when p_1 and p_2 are non-zero.
# Raise an error with an appropriate message when the input is too short
# to use this smoothing technique.
assert p_n[2], "This smoothing method requires non-zero precision for bigrams."
for i, p_i in enumerate(p_n):
- if i in [0, 1]: # Skips the first 2 orders of ngrams.
+ if i in [0,1]: # Skips the first 2 orders of ngrams.
continue
else:
- pi0 = 0 if p_n[i - 2] == 0 else p_n[i - 1] ** 2 / p_n[i - 2]
+ pi0 = 0 if p_n[i-2] == 0 else p_n[i-1]**2 / p_n[i-2]
# No. of ngrams in translation that matches the reference.
m = p_i.numerator
# No. of ngrams in translation.
- l = sum(1 for _ in ngrams(hypothesis, i + 1))
+ l = sum(1 for _ in ngrams(hypothesis, i+1))
# Calculates the interpolated precision.
p_n[i] = (m + self.alpha * pi0) / (l + self.alpha)
return p_n
- def method7(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
+ def method7(self, p_n, references, hypothesis, hyp_len, *args, **kwargs):
"""
- Smoothing method 7:
- Interpolates methods 5 and 6.
+ Smoothing method 7:
+ Interpolates methods 4 and 5.
"""
- hyp_len = hyp_len if hyp_len else len(hypothesis)
p_n = self.method4(p_n, references, hypothesis, hyp_len)
p_n = self.method5(p_n, references, hypothesis, hyp_len)
return p_n
# -*- coding: utf-8 -*-
# Natural Language Toolkit: ChrF score
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Authors: Maja Popovic
-# Contributors: Liling Tan, Aleš Tamchyna (Memsource)
+# Contributors: Liling Tan
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
""" ChrF score implementation """
-from collections import Counter, defaultdict
-import re
+from __future__ import division
+from collections import Counter
-from nltk.util import ngrams
+from nltk.util import ngrams, everygrams
-
-def sentence_chrf(
- reference, hypothesis, min_len=1, max_len=6, beta=3.0, ignore_whitespace=True
-):
+def sentence_chrf(reference, hypothesis, min_len=1, max_len=6, beta=3.0):
"""
Calculates the sentence level CHRF (Character n-gram F-score) described in
Maja Popovic. 2015. CHRF: Character n-gram F-score for Automatic MT Evaluation.
In Proceedings of the 1st Conference on Machine Translation.
http://www.statmt.org/wmt16/pdf/W16-2341.pdf
- This implementation of CHRF only supports a single reference at the moment.
-
- For details not reported in the paper, consult Maja Popovic's original
- implementation: https://github.com/m-popovic/chrF
-
- The code should output results equivalent to running CHRF++ with the
- following options: -nw 0 -b 3
+ Unlike multi-reference BLEU, CHRF only supports a single reference.
An example from the original BLEU paper
http://www.aclweb.org/anthology/P02-1040.pdf
>>> hyp2 = str('It is to insure the troops forever hearing the activity '
... 'guidebook that party direct').split()
>>> sentence_chrf(ref1, hyp1) # doctest: +ELLIPSIS
- 0.6349...
+ 0.6768...
>>> sentence_chrf(ref1, hyp2) # doctest: +ELLIPSIS
- 0.3330...
+ 0.4201...
The infamous "the the the ... " example
>>> ref = 'the cat is on the mat'.split()
>>> hyp = 'the the the the the the the'.split()
>>> sentence_chrf(ref, hyp) # doctest: +ELLIPSIS
- 0.1468...
+ 0.2530...
An example to show that this function allows users to use strings instead of
tokens, i.e. list(str) as inputs.
>>> hyp1 = str('It is a guide to action which ensures that the military '
... 'always obeys the commands of the party')
>>> sentence_chrf(ref1, hyp1) # doctest: +ELLIPSIS
- 0.6349...
+ 0.6768...
>>> type(ref1) == type(hyp1) == str
True
>>> sentence_chrf(ref1.split(), hyp1.split()) # doctest: +ELLIPSIS
- 0.6349...
+ 0.6768...
To skip the unigrams and only use 2- to 3-grams:
>>> sentence_chrf(ref1, hyp1, min_len=2, max_len=3) # doctest: +ELLIPSIS
- 0.6617...
+ 0.7018...
:param references: reference sentence
:type references: list(str) / str
:type max_len: int
:param beta: the parameter to assign more importance to recall over precision
:type beta: float
- :param ignore_whitespace: ignore whitespace characters in scoring
- :type ignore_whitespace: bool
:return: the sentence level CHRF score.
:rtype: float
"""
- return corpus_chrf(
- [reference],
- [hypothesis],
- min_len,
- max_len,
- beta=beta,
- ignore_whitespace=ignore_whitespace,
- )
-
+ return corpus_chrf([reference], [hypothesis], min_len, max_len, beta=beta)
-def _preprocess(sent, ignore_whitespace):
- if type(sent) != str:
- # turn list of tokens into a string
- sent = " ".join(sent)
- if ignore_whitespace:
- sent = re.sub(r"\s+", "", sent)
- return sent
-
-
-def chrf_precision_recall_fscore_support(
- reference, hypothesis, n, beta=3.0, epsilon=1e-16
-):
- """
- This function computes the precision, recall and fscore from the ngram
- overlaps. It returns the `support` which is the true positive score.
-
- By underspecifying the input type, the function will be agnostic as to how
- it computes the ngrams and simply take the whichever element in the list;
- it could be either token or character.
-
- :param reference: The reference sentence.
- :type reference: list
- :param hypothesis: The hypothesis sentence.
- :type hypothesis: list
- :param n: Extract up to the n-th order ngrams
- :type n: int
- :param beta: The parameter to assign more importance to recall over precision.
- :type beta: float
- :param epsilon: The fallback value if the hypothesis or reference is empty.
- :type epsilon: float
- :return: Returns the precision, recall and f-score and support (true positive).
- :rtype: tuple(float)
- """
- ref_ngrams = Counter(ngrams(reference, n))
- hyp_ngrams = Counter(ngrams(hypothesis, n))
-
- # calculate the number of ngram matches
- overlap_ngrams = ref_ngrams & hyp_ngrams
- tp = sum(overlap_ngrams.values()) # True positives.
- tpfp = sum(hyp_ngrams.values()) # True positives + False positives.
- tpfn = sum(ref_ngrams.values()) # True positives + False negatives.
-
- try:
- prec = tp / tpfp # precision
- rec = tp / tpfn # recall
- factor = beta ** 2
- fscore = (1 + factor) * (prec * rec) / (factor * prec + rec)
- except ZeroDivisionError:
- prec = rec = fscore = epsilon
- return prec, rec, fscore, tp
-
-
-def corpus_chrf(
- references, hypotheses, min_len=1, max_len=6, beta=3.0, ignore_whitespace=True
-):
+def corpus_chrf(list_of_references, hypotheses, min_len=1, max_len=6, beta=3.0):
"""
Calculates the corpus level CHRF (Character n-gram F-score), it is the
- macro-averaged value of the sentence/segment level CHRF score.
+ micro-averaged value of the sentence/segment level CHRF score.
- This implementation of CHRF only supports a single reference at the moment.
+ CHRF only supports a single reference.
>>> ref1 = str('It is a guide to action that ensures that the military '
... 'will forever heed Party commands').split()
>>> hyp2 = str('It is to insure the troops forever hearing the activity '
... 'guidebook that party direct')
>>> corpus_chrf([ref1, ref2, ref1, ref2], [hyp1, hyp2, hyp2, hyp1]) # doctest: +ELLIPSIS
- 0.3910...
+ 0.4915...
:param references: a corpus of list of reference sentences, w.r.t. hypotheses
- :type references: list(list(str))
+ :type references: list(list(str)) / list(str)
:param hypotheses: a list of hypothesis sentences
- :type hypotheses: list(list(str))
+ :type hypotheses: list(list(str)) / list(str)
:param min_len: The minimum order of n-gram this function should extract.
:type min_len: int
:param max_len: The maximum order of n-gram this function should extract.
:type max_len: int
:param beta: the parameter to assign more importance to recall over precision
:type beta: float
- :param ignore_whitespace: ignore whitespace characters in scoring
- :type ignore_whitespace: bool
:return: the corpus level CHRF score.
:rtype: float
"""
- assert len(references) == len(
- hypotheses
- ), "The number of hypotheses and their references should be the same"
- num_sents = len(hypotheses)
-
- # Keep f-scores for each n-gram order separate
- ngram_fscores = defaultdict(lambda: list())
+ assert len(list_of_references) == len(hypotheses), "The number of hypotheses and their references should be the same"
# Iterate through each hypothesis and their corresponding references.
- for reference, hypothesis in zip(references, hypotheses):
-
- # preprocess both reference and hypothesis
- reference = _preprocess(reference, ignore_whitespace)
- hypothesis = _preprocess(hypothesis, ignore_whitespace)
-
- # Calculate f-scores for each sentence and for each n-gram order
- # separately.
- for n in range(min_len, max_len + 1):
- # Compute the precision, recall, fscore and support.
- prec, rec, fscore, tp = chrf_precision_recall_fscore_support(
- reference, hypothesis, n, beta=beta
- )
- ngram_fscores[n].append(fscore)
-
- # how many n-gram sizes
- num_ngram_sizes = len(ngram_fscores)
-
- # sum of f-scores over all sentences for each n-gram order
- total_scores = [sum(fscores) for n, fscores in ngram_fscores.items()]
-
- # macro-average over n-gram orders and over all sentences
- return (sum(total_scores) / num_ngram_sizes) / num_sents
+ for reference, hypothesis in zip(list_of_references, hypotheses):
+ # Cheating condition to allow users to input strings instead of tokens.
+ if type(reference) != str and type(hypothesis) != str:
+ reference, hypothesis = ' '.join(reference), ' '.join(hypothesis)
+ # For each order of ngram, calculate the no. of ngram matches and
+ # keep track of no. of ngram in references.
+ ref_ngrams = Counter(everygrams(reference, min_len, max_len))
+ hyp_ngrams = Counter(everygrams(hypothesis, min_len, max_len))
+ overlap_ngrams = ref_ngrams & hyp_ngrams
+ tp = sum(overlap_ngrams.values()) # True positives.
+ tpfp = sum(hyp_ngrams.values()) # True positives + False positives.
+ tffn = sum(ref_ngrams.values()) # True positives + False negatives.
+
+ precision = tp / tpfp
+ recall = tp / tffn
+ factor = beta**2
+ score = (1 + factor) * (precision * recall) / (factor * precision + recall)
+ return score
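A hedged usage sketch for the CHRF scorers above (the printed score depends on which implementation variant is active, so no value is asserted):

from nltk.translate.chrf_score import sentence_chrf

ref = 'the cat is on the mat'.split()
hyp = 'the cat sat on the mat'.split()
print(sentence_chrf(ref, hyp))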
# Natural Language Toolkit: Gale-Church Aligner
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Torsten Marek <marek@ifi.uzh.ch>
# Contributor: Cassidy Laidlaw, Liling Tan
# URL: <http://nltk.org/>
"""
+from __future__ import division
import math
try:
from scipy.stats import norm
from norm import logsf as norm_logsf
except ImportError:
-
def erfcc(x):
"""Complementary error function."""
z = abs(x)
t = 1 / (1 + 0.5 * z)
- r = t * math.exp(
- -z * z
- - 1.26551223
- + t
- * (
- 1.00002368
- + t
- * (
- 0.37409196
- + t
- * (
- 0.09678418
- + t
- * (
- -0.18628806
- + t
- * (
- 0.27886807
- + t
- * (
- -1.13520398
- + t
- * (1.48851587 + t * (-0.82215223 + t * 0.17087277))
- )
- )
- )
- )
- )
- )
- )
- if x >= 0.0:
+ r = t * math.exp(-z * z -
+ 1.26551223 + t *
+ (1.00002368 + t *
+ (.37409196 + t *
+ (.09678418 + t *
+ (-.18628806 + t *
+ (.27886807 + t *
+ (-1.13520398 + t *
+ (1.48851587 + t *
+ (-.82215223 + t * .17087277)))))))))
+ if x >= 0.:
return r
else:
- return 2.0 - r
+ return 2. - r
+
def norm_cdf(x):
"""Return the area under the normal distribution from M{-∞..x}."""
return 1 - 0.5 * erfcc(x / math.sqrt(2))
+
def norm_logsf(x):
try:
return math.log(1 - norm_cdf(x))
except ValueError:
- return float("-inf")
+ return float('-inf')
LOG2 = math.log(2)
def trace(backlinks, source_sents_lens, target_sents_lens):
"""
Traverses the alignment costs from the backlinks and retrieves the
- appropriate sentence pairs.
-
+ appropriate sentence pairs.
+
:param backlinks: A dictionary where the keys are alignment points and the values are the costs (referencing the LanguageIndependent.PRIORS)
:type backlinks: dict
:param source_sents_lens: A list of source sentences' lengths
"""
links = []
position = (len(source_sents_lens), len(target_sents_lens))
- while position != (0, 0) and all(p >= 0 for p in position):
+ while position != (0, 0) and all(p >=0 for p in position):
try:
s, t = backlinks[position]
except TypeError:
- position = (position[0] - 1, position[1] - 1)
+ position = (position[0]-1 , position[1]-1)
continue
for i in range(s):
for j in range(t):
# actually, the paper says l_s * params.VARIANCE_CHARACTERS, this is based on the C
# reference implementation. With l_s in the denominator, insertions are impossible.
m = (l_s + l_t / params.AVERAGE_CHARACTERS) / 2
- delta = (l_s * params.AVERAGE_CHARACTERS - l_t) / math.sqrt(
- m * params.VARIANCE_CHARACTERS
- )
+ delta = (l_s * params.AVERAGE_CHARACTERS - l_t) / math.sqrt(m * params.VARIANCE_CHARACTERS)
except ZeroDivisionError:
- return float("-inf")
+ return float('-inf')
- return -(LOG2 + norm_logsf(abs(delta)) + math.log(params.PRIORS[alignment]))
+ return - (LOG2 + norm_logsf(abs(delta)) + math.log(params.PRIORS[alignment]))
-def align_blocks(source_sents_lens, target_sents_lens, params=LanguageIndependent):
+def align_blocks(source_sents_lens, target_sents_lens, params = LanguageIndependent):
"""Return the sentence alignment of two text blocks (usually paragraphs).
>>> align_blocks([5,5,5], [7,7,7])
backlinks = {}
- for i in range(len(source_sents_lens) + 1):
+ for i in range(len(source_sents_lens) + 1):
for j in range(len(target_sents_lens) + 1):
- min_dist = float("inf")
+ min_dist = float('inf')
min_align = None
for a in alignment_types:
- prev_i = -1 - a[0]
+ prev_i = - 1 - a[0]
prev_j = j - a[1]
if prev_i < -len(D) or prev_j < 0:
continue
- p = D[prev_i][prev_j] + align_log_prob(
- i, j, source_sents_lens, target_sents_lens, a, params
- )
+ p = D[prev_i][prev_j] + align_log_prob(i, j, source_sents_lens,
+ target_sents_lens, a, params)
if p < min_dist:
min_dist = p
min_align = a
- if min_dist == float("inf"):
+ if min_dist == float('inf'):
min_dist = 0
backlinks[(i, j)] = min_align
if len(D) > 2:
D.pop(0)
D.append([])
-
+
return trace(backlinks, source_sents_lens, target_sents_lens)
-def align_texts(source_blocks, target_blocks, params=LanguageIndependent):
+def align_texts(source_blocks, target_blocks, params = LanguageIndependent):
"""Creates the sentence alignment of two texts.
- Texts can consist of several blocks. Block boundaries cannot be crossed by sentence
- alignment links.
+ Texts can consist of several blocks. Block boundaries cannot be crossed by sentence
+ alignment links.
Each block consists of a list that contains the lengths (in characters) of the sentences
in this block.
-
+
@param source_blocks: The list of blocks in the source text.
@param target_blocks: The list of blocks in the target text.
@param params: the sentence alignment parameters.
@returns: A list of sentence alignment lists
"""
if len(source_blocks) != len(target_blocks):
- raise ValueError(
- "Source and target texts do not have the same number of blocks."
- )
-
- return [
- align_blocks(source_block, target_block, params)
- for source_block, target_block in zip(source_blocks, target_blocks)
- ]
+ raise ValueError("Source and target texts do not have the same number of blocks.")
+
+ return [align_blocks(source_block, target_block, params)
+ for source_block, target_block in zip(source_blocks, target_blocks)]
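A hedged usage sketch: align two single-block texts by their sentence lengths in characters (the exact output depends on the parameters; a monotone pairing such as [[(0, 0), (1, 1), (2, 2)]] is the expected shape):

source = [[110, 95, 63]]  # one block containing three source sentences
target = [[120, 90, 70]]
print(align_texts(source, target))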
# File I/O functions; may belong in a corpus reader
-
def split_at(it, split_value):
- """Splits an iterator C{it} at values of C{split_value}.
+ """Splits an iterator C{it} at values of C{split_value}.
Each instance of C{split_value} is swallowed. The iterator produces
subiterators which need to be consumed fully before the next subiterator
can be used.
"""
-
def _chunk_iterator(first):
v = first
while v != split_value:
yield v
v = it.next()
-
+
while True:
yield _chunk_iterator(it.next())
-
+
def parse_token_stream(stream, soft_delimiter, hard_delimiter):
- """Parses a stream of tokens and splits it into sentences (using C{soft_delimiter} tokens)
+ """Parses a stream of tokens and splits it into sentences (using C{soft_delimiter} tokens)
and blocks (using C{hard_delimiter} tokens) for use with the L{align_texts} function.
"""
return [
- [
- sum(len(token) for token in sentence_it)
- for sentence_it in split_at(block_it, soft_delimiter)
- ]
- for block_it in split_at(stream, hard_delimiter)
- ]
+ [sum(len(token) for token in sentence_it)
+ for sentence_it in split_at(block_it, soft_delimiter)]
+ for block_it in split_at(stream, hard_delimiter)]
+
+
+
+
+# Code for test files in nltk_contrib/align/data/*.tok
+# import sys
+# from contextlib import nested
+# with nested(open(sys.argv[1], "r"), open(sys.argv[2], "r")) as (s, t):
+# source = parse_token_stream((l.strip() for l in s), ".EOS", ".EOP")
+# target = parse_token_stream((l.strip() for l in t), ".EOS", ".EOP")
+# print align_texts(source, target)
+
# -*- coding: utf-8 -*-
# Natural Language Toolkit: GDFA word alignment symmetrization
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Authors: Liling Tan
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+import codecs
from collections import defaultdict
-
def grow_diag_final_and(srclen, trglen, e2f, f2e):
"""
This function symmetrizes the source-to-target and target-to-source
word alignment outputs using the GDFA (grow-diag-final-and) algorithm (Koehn, 2005).
-
+
Step 1: Find the intersection of the bidirectional alignment.
-
+
Step 2: Search for additional neighbor alignment points to be added, given
these criteria: (i) neighbor alignments points are not in the
intersection and (ii) neighbor alignments are in the union.
-
+
Step 3: Add all other alignment points that are not in the intersection, not in
the neighboring alignments that met the criteria, but are in the original
forward/backward alignment outputs.
-
+
>>> forw = ('0-0 2-1 9-2 21-3 10-4 7-5 11-6 9-7 12-8 1-9 3-10 '
... '4-11 17-12 17-13 25-14 13-15 24-16 11-17 28-18')
>>> back = ('0-0 1-9 2-9 3-10 4-11 5-12 6-6 7-5 8-6 9-7 10-4 '
>>> trglen = len(trgtext.split())
>>>
>>> gdfa = grow_diag_final_and(srclen, trglen, forw, back)
- >>> gdfa == sorted(set([(28, 18), (6, 6), (24, 17), (2, 1), (15, 12), (13, 12),
+ >>> gdfa == set([(28, 18), (6, 6), (24, 17), (2, 1), (15, 12), (13, 12),
... (2, 9), (3, 10), (26, 17), (25, 15), (8, 6), (9, 7), (20,
... 13), (18, 13), (0, 0), (10, 4), (13, 15), (23, 14), (7, 5),
... (25, 14), (1, 9), (17, 13), (4, 11), (11, 17), (9, 2), (22,
... 12), (27, 18), (24, 16), (21, 3), (19, 12), (17, 12), (5,
- ... 12), (11, 6), (12, 8)]))
+ ... 12), (11, 6), (12, 8)])
True
-
+
References:
- Koehn, P., A. Axelrod, A. Birch, C. Callison, M. Osborne, and D. Talbot.
- 2005. Edinburgh System Description for the 2005 IWSLT Speech
+ Koehn, P., A. Axelrod, A. Birch, C. Callison, M. Osborne, and D. Talbot.
+ 2005. Edinburgh System Description for the 2005 IWSLT Speech
Translation Evaluation. In MT Eval Workshop.
:type srclen: int
"""
# Converts pharaoh text format into list of tuples.
- e2f = [tuple(map(int, a.split("-"))) for a in e2f.split()]
- f2e = [tuple(map(int, a.split("-"))) for a in f2e.split()]
-
- neighbors = [(-1, 0), (0, -1), (1, 0), (0, 1), (-1, -1), (-1, 1), (1, -1), (1, 1)]
- alignment = set(e2f).intersection(set(f2e)) # Find the intersection.
+ e2f = [tuple(map(int,a.split('-'))) for a in e2f.split()]
+ f2e = [tuple(map(int,a.split('-'))) for a in f2e.split()]
+
+ neighbors = [(-1,0),(0,-1),(1,0),(0,1),(-1,-1),(-1,1),(1,-1),(1,1)]
+ alignment = set(e2f).intersection(set(f2e)) # Find the intersection.
union = set(e2f).union(set(f2e))
# *aligned* is used to check if neighbors are aligned in grow_diag()
aligned = defaultdict(set)
- for i, j in alignment:
- aligned["e"].add(i)
- aligned["f"].add(j)
-
+ for i,j in alignment:
+ aligned['e'].add(i)
+ aligned['f'].add(j)
+
def grow_diag():
"""
Search for the neighbor points and add them to the intersected alignment
prev_len = len(alignment) - 1
# iterate until no new points added
while prev_len < len(alignment):
- no_new_points = True
# for english word e = 0 ... en
for e in range(srclen):
# for foreign word f = 0 ... fn
- for f in range(trglen):
+ for f in range(trglen):
# if ( e aligned with f)
- if (e, f) in alignment:
+ if (e,f) in alignment:
# for each neighboring point (e-new, f-new)
for neighbor in neighbors:
- neighbor = tuple(i + j for i, j in zip((e, f), neighbor))
+ neighbor = tuple(i+j for i,j in zip((e,f),neighbor))
e_new, f_new = neighbor
- # if ( ( e-new not aligned and f-new not aligned)
+ # if ( ( e-new not aligned and f-new not aligned)
# and (e-new, f-new in union(e2f, f2e) )
- if (
- e_new not in aligned and f_new not in aligned
- ) and neighbor in union:
+ if (e_new not in aligned and f_new not in aligned)\
+ and neighbor in union:
alignment.add(neighbor)
- aligned["e"].add(e_new)
- aligned["f"].add(f_new)
- prev_len += 1
- no_new_points = False
- # iterate until no new points added
- if no_new_points:
- break
-
+ aligned['e'].add(e_new); aligned['f'].add(f_new)
+ prev_len+=1
+
def final_and(a):
"""
- Adds remaining points that are not in the intersection, not in the
+ Adds remaining points that are not in the intersection, not in the
neighboring alignments but in the original *e2f* and *f2e* alignments
"""
# for english word e = 0 ... en
for e_new in range(srclen):
# for foreign word f = 0 ... fn
for f_new in range(trglen):
- # if ( ( e-new not aligned and f-new not aligned)
+ # if ( ( e-new not aligned and f-new not aligned)
# and (e-new, f-new in union(e2f, f2e) )
- if (
- e_new not in aligned
+ if (e_new not in aligned
and f_new not in aligned
- and (e_new, f_new) in union
- ):
+ and (e_new, f_new) in a):
+
alignment.add((e_new, f_new))
- aligned["e"].add(e_new)
- aligned["f"].add(f_new)
+ aligned['e'].add(e_new); aligned['f'].add(f_new)
grow_diag()
final_and(e2f)
final_and(f2e)
- return sorted(alignment)
+ return alignment
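A minimal hedged sketch: when the forward and backward alignments agree, the intersection already covers every point and GDFA returns exactly those pairs:

forw = back = '0-0 1-1 2-2'
print(grow_diag_final_and(3, 3, forw, back))  # the three pairs (set ordering may vary)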
+
# -*- coding: utf-8 -*-
# Natural Language Toolkit: GLEU Score
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Authors:
# Contributors: Mike Schuster, Michael Wayne Goodman, Liling Tan
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
""" GLEU score implementation. """
-
+from __future__ import division
from collections import Counter
from nltk.util import ngrams, everygrams
:return: the sentence level GLEU score.
:rtype: float
"""
- return corpus_gleu([references], [hypothesis], min_len=min_len, max_len=max_len)
-
+ return corpus_gleu(
+ [references],
+ [hypothesis],
+ min_len=min_len,
+ max_len=max_len
+ )
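A hedged usage sketch for sentence_gleu (GLEU is roughly the minimum of n-gram precision and recall against the best-matching reference):

from nltk.translate.gleu_score import sentence_gleu

ref = 'the cat is on the mat'.split()
hyp = 'the cat sat on the mat'.split()
print(sentence_gleu([ref], hyp))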
def corpus_gleu(list_of_references, hypotheses, min_len=1, max_len=4):
"""
:rtype: float
"""
# sanity check
- assert len(list_of_references) == len(
- hypotheses
- ), "The number of hypotheses and their reference(s) should be the same"
+ assert len(list_of_references) == len(hypotheses), "The number of hypotheses and their reference(s) should be the same"
# sum matches and max-token-lengths over all sentences
corpus_n_match = 0
for references, hypothesis in zip(list_of_references, hypotheses):
hyp_ngrams = Counter(everygrams(hypothesis, min_len, max_len))
tpfp = sum(hyp_ngrams.values()) # True positives + False positives.
-
+
hyp_counts = []
for reference in references:
ref_ngrams = Counter(everygrams(reference, min_len, max_len))
# use the reference yielding the highest score
if hyp_counts:
- n_match, n_all = max(hyp_counts, key=lambda hc: hc[0] / hc[1])
+ n_match, n_all = max(hyp_counts, key=lambda hc: hc[0]/hc[1])
corpus_n_match += n_match
corpus_n_all += n_all
"""
Lexical translation model that ignores word order.
-In IBM Model 1, word order is ignored for simplicity. As long as the
-word alignments are equivalent, it doesn't matter where the word occurs
-in the source or target sentence. Thus, the following three alignments
-are equally likely.
+In IBM Model 1, word order is ignored for simplicity. Thus, the
+following three alignments are equally likely. As long as the word
+alignments are equivalent, it doesn't matter where the word
+occurs in the source or target sentence.
Source: je mange du jambon
Target: i eat some ham
-Alignment: (0,0) (1,1) (2,2) (3,3)
+Alignment: (1,1) (2,2) (3,3) (4,4)
Source: je mange du jambon
Target: some ham eat i
-Alignment: (0,2) (1,3) (2,1) (3,1)
+Alignment: (1,4) (2,3) (3,1) (4,2)
Source: du jambon je mange
Target: eat i some ham
-Alignment: (0,3) (1,2) (2,0) (3,1)
-
-Note that an alignment is represented here as
-(word_index_in_target, word_index_in_source).
+Alignment: (1,3) (2,4) (3,2) (4,1)
The EM algorithm used in Model 1 is:
E step - In the training data, count how many times a source language
263-311.
"""
+from __future__ import division
from collections import defaultdict
from nltk.translate import AlignedSent
from nltk.translate import Alignment
"""
- def __init__(self, sentence_aligned_corpus, iterations, probability_tables=None):
+ def __init__(self, sentence_aligned_corpus, iterations,
+ probability_tables=None):
"""
Train on ``sentence_aligned_corpus`` and create a lexical
translation model.
self.set_uniform_probabilities(sentence_aligned_corpus)
else:
# Set user-defined probabilities
- self.translation_table = probability_tables["translation_table"]
+ self.translation_table = probability_tables['translation_table']
for n in range(0, iterations):
self.train(sentence_aligned_corpus)
- self.align_all(sentence_aligned_corpus)
+ self.__align_all(sentence_aligned_corpus)
def set_uniform_probabilities(self, sentence_aligned_corpus):
initial_prob = 1 / len(self.trg_vocab)
if initial_prob < IBMModel.MIN_PROB:
- warnings.warn(
- "Target language vocabulary is too large ("
- + str(len(self.trg_vocab))
- + " words). "
- "Results may be less accurate."
- )
+ warnings.warn("Target language vocabulary is too large (" +
+ str(len(self.trg_vocab)) + " words). "
+ "Results may be less accurate.")
for t in self.trg_vocab:
self.translation_table[t] = defaultdict(lambda: initial_prob)
return max(prob, IBMModel.MIN_PROB)
- def align_all(self, parallel_corpus):
+ def __align_all(self, parallel_corpus):
for sentence_pair in parallel_corpus:
- self.align(sentence_pair)
+ self.__align(sentence_pair)
- def align(self, sentence_pair):
+ def __align(self, sentence_pair):
"""
Determines the best word alignment for one sentence pair from
the corpus that the model was trained on.
for j, trg_word in enumerate(sentence_pair.words):
# Initialize trg_word to align with the NULL token
- best_prob = max(self.translation_table[trg_word][None], IBMModel.MIN_PROB)
+ best_prob = max(self.translation_table[trg_word][None],
+ IBMModel.MIN_PROB)
best_alignment_point = None
for i, src_word in enumerate(sentence_pair.mots):
align_prob = self.translation_table[trg_word][src_word]
263-311.
"""
-import warnings
+from __future__ import division
from collections import defaultdict
-
from nltk.translate import AlignedSent
from nltk.translate import Alignment
from nltk.translate import IBMModel
from nltk.translate import IBMModel1
from nltk.translate.ibm_model import Counts
+import warnings
class IBMModel2(IBMModel):
"""
- def __init__(self, sentence_aligned_corpus, iterations, probability_tables=None):
+ def __init__(self, sentence_aligned_corpus, iterations,
+ probability_tables=None):
"""
Train on ``sentence_aligned_corpus`` and create a lexical
translation model and an alignment model.
self.set_uniform_probabilities(sentence_aligned_corpus)
else:
# Set user-defined probabilities
- self.translation_table = probability_tables["translation_table"]
- self.alignment_table = probability_tables["alignment_table"]
+ self.translation_table = probability_tables['translation_table']
+ self.alignment_table = probability_tables['alignment_table']
for n in range(0, iterations):
self.train(sentence_aligned_corpus)
- self.align_all(sentence_aligned_corpus)
+ self.__align_all(sentence_aligned_corpus)
def set_uniform_probabilities(self, sentence_aligned_corpus):
# a(i | j,l,m) = 1 / (l+1) for all i, j, l, m
l_m_combinations.add((l, m))
initial_prob = 1 / (l + 1)
if initial_prob < IBMModel.MIN_PROB:
- warnings.warn(
- "A source sentence is too long ("
- + str(l)
- + " words). Results may be less accurate."
- )
+ warnings.warn("A source sentence is too long (" + str(l) +
+ " words). Results may be less accurate.")
for i in range(0, l + 1):
for j in range(1, m + 1):
counts = Model2Counts()
for aligned_sentence in parallel_corpus:
src_sentence = [None] + aligned_sentence.mots
- trg_sentence = ["UNUSED"] + aligned_sentence.words # 1-indexed
+ trg_sentence = ['UNUSED'] + aligned_sentence.words # 1-indexed
l = len(aligned_sentence.mots)
m = len(aligned_sentence.words)
t = trg_sentence[j]
for i in range(0, l + 1):
s = src_sentence[i]
- count = self.prob_alignment_point(i, j, src_sentence, trg_sentence)
+ count = self.prob_alignment_point(
+ i, j, src_sentence, trg_sentence)
normalized_count = count / total_count[t]
counts.update_lexical_translation(normalized_count, s, t)
for j, src_sentence_lengths in j_s.items():
for l, trg_sentence_lengths in src_sentence_lengths.items():
for m in trg_sentence_lengths:
- estimate = (
- counts.alignment[i][j][l][m]
- / counts.alignment_for_any_i[j][l][m]
- )
- self.alignment_table[i][j][l][m] = max(estimate, MIN_PROB)
+ estimate = (counts.alignment[i][j][l][m] /
+ counts.alignment_for_any_i[j][l][m])
+ self.alignment_table[i][j][l][m] = max(estimate,
+ MIN_PROB)
def prob_all_alignments(self, src_sentence, trg_sentence):
"""
t = trg_sentence[j]
for i in range(0, len(src_sentence)):
alignment_prob_for_t[t] += self.prob_alignment_point(
- i, j, src_sentence, trg_sentence
- )
+ i, j, src_sentence, trg_sentence)
return alignment_prob_for_t
def prob_alignment_point(self, i, j, src_sentence, trg_sentence):
continue  # skip the dummy zeroth element
trg_word = alignment_info.trg_sentence[j]
src_word = alignment_info.src_sentence[i]
- prob *= (
- self.translation_table[trg_word][src_word]
- * self.alignment_table[i][j][l][m]
- )
+ prob *= (self.translation_table[trg_word][src_word] *
+ self.alignment_table[i][j][l][m])
return max(prob, IBMModel.MIN_PROB)
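Taken together, the method multiplies one lexical factor and one positional factor per target word. A compact restatement under the same table layout (a sketch, not the class's exact code):

def model2_score_sketch(translation_table, alignment_table, src, trg, alignment):
    # src/trg are 1-indexed, with src[0] = None (NULL) and trg[0] unused;
    # alignment[j] is the source position aligned to target position j.
    l, m = len(src) - 1, len(trg) - 1
    prob = 1.0
    for j in range(1, m + 1):
        i = alignment[j]
        prob *= (translation_table[trg[j]][src[i]] *
                 alignment_table[i][j][l][m])
    return prob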
- def align_all(self, parallel_corpus):
+ def __align_all(self, parallel_corpus):
for sentence_pair in parallel_corpus:
- self.align(sentence_pair)
+ self.__align(sentence_pair)
- def align(self, sentence_pair):
+ def __align(self, sentence_pair):
"""
Determines the best word alignment for one sentence pair from
the corpus that the model was trained on.
for j, trg_word in enumerate(sentence_pair.words):
# Initialize trg_word to align with the NULL token
- best_prob = (
- self.translation_table[trg_word][None]
- * self.alignment_table[0][j + 1][l][m]
- )
+ best_prob = (self.translation_table[trg_word][None] *
+ self.alignment_table[0][j + 1][l][m])
best_prob = max(best_prob, IBMModel.MIN_PROB)
best_alignment_point = None
for i, src_word in enumerate(sentence_pair.mots):
- align_prob = (
- self.translation_table[trg_word][src_word]
- * self.alignment_table[i + 1][j + 1][l][m]
- )
+ align_prob = (self.translation_table[trg_word][src_word] *
+ self.alignment_table[i + 1][j + 1][l][m])
if align_prob >= best_prob:
best_prob = align_prob
best_alignment_point = i
Data object to store counts of various parameters during training.
Includes counts for alignment.
"""
-
def __init__(self):
super(Model2Counts, self).__init__()
self.alignment = defaultdict(
- lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
- )
+ lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(
+ lambda: 0.0))))
self.alignment_for_any_i = defaultdict(
- lambda: defaultdict(lambda: defaultdict(lambda: 0.0))
- )
+ lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
def update_lexical_translation(self, count, s, t):
self.t_given_s[t][s] += count
263-311.
"""
-import warnings
+from __future__ import division
from collections import defaultdict
from math import factorial
-
from nltk.translate import AlignedSent
from nltk.translate import Alignment
from nltk.translate import IBMModel
from nltk.translate import IBMModel2
from nltk.translate.ibm_model import Counts
+import warnings
class IBMModel3(IBMModel):
"""
- def __init__(self, sentence_aligned_corpus, iterations, probability_tables=None):
+ def __init__(self, sentence_aligned_corpus, iterations,
+ probability_tables=None):
"""
Train on ``sentence_aligned_corpus`` and create a lexical
translation model, a distortion model, a fertility model, and a
self.set_uniform_probabilities(sentence_aligned_corpus)
else:
# Set user-defined probabilities
- self.translation_table = probability_tables["translation_table"]
- self.alignment_table = probability_tables["alignment_table"]
- self.fertility_table = probability_tables["fertility_table"]
- self.p1 = probability_tables["p1"]
- self.distortion_table = probability_tables["distortion_table"]
+ self.translation_table = probability_tables['translation_table']
+ self.alignment_table = probability_tables['alignment_table']
+ self.fertility_table = probability_tables['fertility_table']
+ self.p1 = probability_tables['p1']
+ self.distortion_table = probability_tables['distortion_table']
for n in range(0, iterations):
self.train(sentence_aligned_corpus)
def reset_probabilities(self):
super(IBMModel3, self).reset_probabilities()
self.distortion_table = defaultdict(
- lambda: defaultdict(
- lambda: defaultdict(lambda: defaultdict(lambda: self.MIN_PROB))
- )
- )
+ lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(
+ lambda: self.MIN_PROB))))
"""
dict[int][int][int][int]: float. Probability(j | i,l,m).
Values accessed as ``distortion_table[j][i][l][m]``.
l_m_combinations.add((l, m))
initial_prob = 1 / m
if initial_prob < IBMModel.MIN_PROB:
- warnings.warn(
- "A target sentence is too long ("
- + str(m)
- + " words). Results may be less accurate."
- )
+ warnings.warn("A target sentence is too long (" + str(m) +
+ " words). Results may be less accurate.")
for j in range(1, m + 1):
for i in range(0, l + 1):
self.distortion_table[j][i][l][m] = initial_prob
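The uniform starting points of Models 2 and 3 condition in opposite directions, which a toy comparison makes explicit (sentence lengths invented):

l, m = 4, 5                      # source and target lengths, invented
init_alignment = 1.0 / (l + 1)   # Model 2: a(i | j,l,m); i ranges over 0..l, with i = 0 for NULL
init_distortion = 1.0 / m        # Model 3: d(j | i,l,m); j ranges over 1..m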
sampled_alignments, best_alignment = self.sample(aligned_sentence)
# Record the most probable alignment
aligned_sentence.alignment = Alignment(
- best_alignment.zero_indexed_alignment()
- )
+ best_alignment.zero_indexed_alignment())
# E step (a): Compute normalization factors to weigh counts
total_count = self.prob_of_alignments(sampled_alignments)
for j in range(1, m + 1):
counts.update_lexical_translation(
- normalized_count, alignment_info, j
- )
- counts.update_distortion(normalized_count, alignment_info, j, l, m)
+ normalized_count, alignment_info, j)
+ counts.update_distortion(
+ normalized_count, alignment_info, j, l, m)
counts.update_null_generation(normalized_count, alignment_info)
counts.update_fertility(normalized_count, alignment_info)
for i, src_sentence_lengths in i_s.items():
for l, trg_sentence_lengths in src_sentence_lengths.items():
for m in trg_sentence_lengths:
- estimate = (
- counts.distortion[j][i][l][m]
- / counts.distortion_for_any_j[i][l][m]
- )
- self.distortion_table[j][i][l][m] = max(estimate, MIN_PROB)
+ estimate = (counts.distortion[j][i][l][m] /
+ counts.distortion_for_any_j[i][l][m])
+ self.distortion_table[j][i][l][m] = max(estimate,
+ MIN_PROB)
def prob_t_a_given_s(self, alignment_info):
"""
# Combine NULL insertion probability
null_fertility = alignment_info.fertility_of_i(0)
- probability *= pow(p1, null_fertility) * pow(p0, m - 2 * null_fertility)
+ probability *= (pow(p1, null_fertility) *
+ pow(p0, m - 2 * null_fertility))
if probability < MIN_PROB:
return MIN_PROB
# Combine fertility probabilities
for i in range(1, l + 1):
fertility = alignment_info.fertility_of_i(i)
- probability *= (
- factorial(fertility) * self.fertility_table[fertility][src_sentence[i]]
- )
+ probability *= (factorial(fertility) *
+ self.fertility_table[fertility][src_sentence[i]])
if probability < MIN_PROB:
return MIN_PROB
i = alignment_info.alignment[j]
s = src_sentence[i]
- probability *= (
- self.translation_table[t][s] * self.distortion_table[j][i][l][m]
- )
+ probability *= (self.translation_table[t][s] *
+ self.distortion_table[j][i][l][m])
if probability < MIN_PROB:
return MIN_PROB
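Reading the factors above together, the score is a product of a NULL-insertion term, a fertility term per source word, and a lexical-distortion term per target word. A toy recomputation with invented numbers:

from math import factorial

m, phi0, p1 = 6, 1, 0.2                       # invented values
p0 = 1 - p1
prob = (p1 ** phi0) * (p0 ** (m - 2 * phi0))  # NULL-insertion term
prob *= factorial(2) * 0.1                    # phi_i! * n(phi_i | s_i), one source word
prob *= 0.8 * 0.25                            # t(t_j | s_i) * d(j | i,l,m), one target word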
Data object to store counts of various parameters during training.
Includes counts for distortion.
"""
-
def __init__(self):
super(Model3Counts, self).__init__()
self.distortion = defaultdict(
- lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
- )
+ lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(
+ lambda: 0.0))))
self.distortion_for_any_j = defaultdict(
- lambda: defaultdict(lambda: defaultdict(lambda: 0.0))
- )
+ lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
def update_distortion(self, count, alignment_info, j, l, m):
i = alignment_info.alignment[j]
# -*- coding: utf-8 -*-
# Natural Language Toolkit: IBM Model 4
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Tah Wei Hoon <hoon.tw@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
263-311.
"""
-import warnings
+from __future__ import division
from collections import defaultdict
from math import factorial
-
from nltk.translate import AlignedSent
from nltk.translate import Alignment
from nltk.translate import IBMModel
from nltk.translate import IBMModel3
from nltk.translate.ibm_model import Counts
from nltk.translate.ibm_model import longest_target_sentence_length
+import warnings
class IBMModel4(IBMModel):
"""
- def __init__(
- self,
- sentence_aligned_corpus,
- iterations,
- source_word_classes,
- target_word_classes,
- probability_tables=None,
- ):
+ def __init__(self, sentence_aligned_corpus, iterations,
+ source_word_classes, target_word_classes,
+ probability_tables=None):
"""
Train on ``sentence_aligned_corpus`` and create a lexical
translation model, distortion models, a fertility model, and a
self.set_uniform_probabilities(sentence_aligned_corpus)
else:
# Set user-defined probabilities
- self.translation_table = probability_tables["translation_table"]
- self.alignment_table = probability_tables["alignment_table"]
- self.fertility_table = probability_tables["fertility_table"]
- self.p1 = probability_tables["p1"]
- self.head_distortion_table = probability_tables["head_distortion_table"]
+ self.translation_table = probability_tables['translation_table']
+ self.alignment_table = probability_tables['alignment_table']
+ self.fertility_table = probability_tables['fertility_table']
+ self.p1 = probability_tables['p1']
+ self.head_distortion_table = probability_tables[
+ 'head_distortion_table']
self.non_head_distortion_table = probability_tables[
- "non_head_distortion_table"
- ]
+ 'non_head_distortion_table']
for n in range(0, iterations):
self.train(sentence_aligned_corpus)
def reset_probabilities(self):
super(IBMModel4, self).reset_probabilities()
self.head_distortion_table = defaultdict(
- lambda: defaultdict(lambda: defaultdict(lambda: self.MIN_PROB))
- )
+ lambda: defaultdict(lambda: defaultdict(lambda: self.MIN_PROB)))
"""
dict[int][int][int]: float. Probability(displacement of head
word | word class of previous cept,target word class).
"""
self.non_head_distortion_table = defaultdict(
- lambda: defaultdict(lambda: self.MIN_PROB)
- )
+ lambda: defaultdict(lambda: self.MIN_PROB))
"""
dict[int][int]: float. Probability(displacement of non-head
word | target word class).
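Both tables are keyed on displacements rather than absolute positions. A toy displacement computation in this spirit (positions invented; the centre of a cept is taken to be the ceiling of the average of its target positions, as in the ibm_model helpers):

from math import ceil

previous_cept_positions = [2, 3]    # target positions covered by the previous cept
center = int(ceil(sum(previous_cept_positions) /
                  float(len(previous_cept_positions))))   # -> 3
dj_head = 5 - center                # head word at j = 5: displacement +2
dj_non_head = 5 - 4                 # non-head word at j = 5, previous word of the cept at j = 4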
else:
initial_prob = 1 / (2 * (max_m - 1))
if initial_prob < IBMModel.MIN_PROB:
- warnings.warn(
- "A target sentence is too long ("
- + str(max_m)
- + " words). Results may be less accurate."
- )
+ warnings.warn("A target sentence is too long (" + str(max_m) +
+ " words). Results may be less accurate.")
for dj in range(1, max_m):
self.head_distortion_table[dj] = defaultdict(
- lambda: defaultdict(lambda: initial_prob)
- )
+ lambda: defaultdict(lambda: initial_prob))
self.head_distortion_table[-dj] = defaultdict(
- lambda: defaultdict(lambda: initial_prob)
- )
- self.non_head_distortion_table[dj] = defaultdict(lambda: initial_prob)
- self.non_head_distortion_table[-dj] = defaultdict(lambda: initial_prob)
+ lambda: defaultdict(lambda: initial_prob))
+ self.non_head_distortion_table[dj] = defaultdict(
+ lambda: initial_prob)
+ self.non_head_distortion_table[-dj] = defaultdict(
+ lambda: initial_prob)
def train(self, parallel_corpus):
counts = Model4Counts()
sampled_alignments, best_alignment = self.sample(aligned_sentence)
# Record the most probable alignment
aligned_sentence.alignment = Alignment(
- best_alignment.zero_indexed_alignment()
- )
+ best_alignment.zero_indexed_alignment())
# E step (a): Compute normalization factors to weigh counts
total_count = self.prob_of_alignments(sampled_alignments)
for j in range(1, m + 1):
counts.update_lexical_translation(
- normalized_count, alignment_info, j
- )
+ normalized_count, alignment_info, j)
counts.update_distortion(
- normalized_count,
- alignment_info,
- j,
- self.src_classes,
- self.trg_classes,
- )
+ normalized_count, alignment_info, j,
+ self.src_classes, self.trg_classes)
counts.update_null_generation(normalized_count, alignment_info)
counts.update_fertility(normalized_count, alignment_info)
for dj, src_classes in counts.head_distortion.items():
for s_cls, trg_classes in src_classes.items():
for t_cls in trg_classes:
- estimate = (
- counts.head_distortion[dj][s_cls][t_cls]
- / counts.head_distortion_for_any_dj[s_cls][t_cls]
- )
- head_d_table[dj][s_cls][t_cls] = max(estimate, IBMModel.MIN_PROB)
+ estimate = (counts.head_distortion[dj][s_cls][t_cls] /
+ counts.head_distortion_for_any_dj[s_cls][t_cls])
+ head_d_table[dj][s_cls][t_cls] = max(estimate,
+ IBMModel.MIN_PROB)
non_head_d_table = self.non_head_distortion_table
for dj, trg_classes in counts.non_head_distortion.items():
for t_cls in trg_classes:
- estimate = (
- counts.non_head_distortion[dj][t_cls]
- / counts.non_head_distortion_for_any_dj[t_cls]
- )
+ estimate = (counts.non_head_distortion[dj][t_cls] /
+ counts.non_head_distortion_for_any_dj[t_cls])
non_head_d_table[dj][t_cls] = max(estimate, IBMModel.MIN_PROB)
def prob_t_a_given_s(self, alignment_info):
p0 = 1 - p1
null_fertility = alignment_info.fertility_of_i(0)
m = len(alignment_info.trg_sentence) - 1
- value *= pow(p1, null_fertility) * pow(p0, m - 2 * null_fertility)
+ value *= (pow(p1, null_fertility) * pow(p0, m - 2 * null_fertility))
if value < MIN_PROB:
return MIN_PROB
src_sentence = alignment_info.src_sentence
for i in range(1, len(src_sentence)):
fertility = alignment_info.fertility_of_i(i)
- value *= (
- factorial(fertility)
- * ibm_model.fertility_table[fertility][src_sentence[i]]
- )
+ value *= (factorial(fertility) *
+ ibm_model.fertility_table[fertility][src_sentence[i]])
if value < MIN_PROB:
return MIN_PROB
return value
trg_class = ibm_model.trg_classes[t]
dj = j - previous_position
return ibm_model.non_head_distortion_table[dj][trg_class]
-
# end nested functions
# Abort computation whenever probability falls below MIN_PROB at
Data object to store counts of various parameters during training.
Includes counts for distortion.
"""
-
def __init__(self):
super(Model4Counts, self).__init__()
self.head_distortion = defaultdict(
- lambda: defaultdict(lambda: defaultdict(lambda: 0.0))
- )
- self.head_distortion_for_any_dj = defaultdict(lambda: defaultdict(lambda: 0.0))
- self.non_head_distortion = defaultdict(lambda: defaultdict(lambda: 0.0))
+ lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
+ self.head_distortion_for_any_dj = defaultdict(
+ lambda: defaultdict(lambda: 0.0))
+ self.non_head_distortion = defaultdict(
+ lambda: defaultdict(lambda: 0.0))
self.non_head_distortion_for_any_dj = defaultdict(lambda: 0.0)
- def update_distortion(self, count, alignment_info, j, src_classes, trg_classes):
+ def update_distortion(self, count, alignment_info, j,
+ src_classes, trg_classes):
i = alignment_info.alignment[j]
t = alignment_info.trg_sentence[j]
if i == 0:
# -*- coding: utf-8 -*-
# Natural Language Toolkit: IBM Model 5
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Tah Wei Hoon <hoon.tw@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
263-311.
"""
-import warnings
+from __future__ import division
from collections import defaultdict
from math import factorial
-
from nltk.translate import AlignedSent
from nltk.translate import Alignment
from nltk.translate import IBMModel
from nltk.translate import IBMModel4
from nltk.translate.ibm_model import Counts
from nltk.translate.ibm_model import longest_target_sentence_length
+import warnings
class IBMModel5(IBMModel):
Alignment([(0, 0), (1, 1), (2, 2), (3, None), (4, 3)])
"""
-
MIN_SCORE_FACTOR = 0.2
"""
Alignments with scores below this factor are pruned during sampling
"""
- def __init__(
- self,
- sentence_aligned_corpus,
- iterations,
- source_word_classes,
- target_word_classes,
- probability_tables=None,
- ):
+ def __init__(self, sentence_aligned_corpus, iterations,
+ source_word_classes, target_word_classes,
+ probability_tables=None):
"""
Train on ``sentence_aligned_corpus`` and create a lexical
translation model, vacancy models, a fertility model, and a
if probability_tables is None:
# Get probabilities from IBM model 4
- ibm4 = IBMModel4(
- sentence_aligned_corpus,
- iterations,
- source_word_classes,
- target_word_classes,
- )
+ ibm4 = IBMModel4(sentence_aligned_corpus, iterations,
+ source_word_classes, target_word_classes)
self.translation_table = ibm4.translation_table
self.alignment_table = ibm4.alignment_table
self.fertility_table = ibm4.fertility_table
self.set_uniform_probabilities(sentence_aligned_corpus)
else:
# Set user-defined probabilities
- self.translation_table = probability_tables["translation_table"]
- self.alignment_table = probability_tables["alignment_table"]
- self.fertility_table = probability_tables["fertility_table"]
- self.p1 = probability_tables["p1"]
- self.head_distortion_table = probability_tables["head_distortion_table"]
+ self.translation_table = probability_tables['translation_table']
+ self.alignment_table = probability_tables['alignment_table']
+ self.fertility_table = probability_tables['fertility_table']
+ self.p1 = probability_tables['p1']
+ self.head_distortion_table = probability_tables[
+ 'head_distortion_table']
self.non_head_distortion_table = probability_tables[
- "non_head_distortion_table"
- ]
- self.head_vacancy_table = probability_tables["head_vacancy_table"]
- self.non_head_vacancy_table = probability_tables["non_head_vacancy_table"]
+ 'non_head_distortion_table']
+ self.head_vacancy_table = probability_tables[
+ 'head_vacancy_table']
+ self.non_head_vacancy_table = probability_tables[
+ 'non_head_vacancy_table']
for n in range(0, iterations):
self.train(sentence_aligned_corpus)
def reset_probabilities(self):
super(IBMModel5, self).reset_probabilities()
self.head_vacancy_table = defaultdict(
- lambda: defaultdict(lambda: defaultdict(lambda: self.MIN_PROB))
- )
+ lambda: defaultdict(lambda: defaultdict(lambda: self.MIN_PROB)))
"""
dict[int][int][int]: float. Probability(vacancy difference |
number of remaining valid positions,target word class).
"""
self.non_head_vacancy_table = defaultdict(
- lambda: defaultdict(lambda: defaultdict(lambda: self.MIN_PROB))
- )
+ lambda: defaultdict(lambda: defaultdict(lambda: self.MIN_PROB)))
"""
dict[int][int][int]: float. Probability(vacancy difference |
number of remaining valid positions,target word class).
# Thus, the number of possible vacancy difference values is
# (max_v) - (1-max_v) + 1 = 2 * max_v.
if max_m > 0 and (1 / (2 * max_m)) < IBMModel.MIN_PROB:
- warnings.warn(
- "A target sentence is too long ("
- + str(max_m)
- + " words). Results may be less accurate."
- )
+ warnings.warn("A target sentence is too long (" + str(max_m) +
+ " words). Results may be less accurate.")
for max_v in range(1, max_m + 1):
for dv in range(1, max_m + 1):
initial_prob = 1 / (2 * max_v)
- self.head_vacancy_table[dv][max_v] = defaultdict(lambda: initial_prob)
- self.head_vacancy_table[-(dv - 1)][max_v] = defaultdict(
- lambda: initial_prob
- )
+ self.head_vacancy_table[dv][max_v] = defaultdict(
+ lambda: initial_prob)
+ self.head_vacancy_table[-(dv-1)][max_v] = defaultdict(
+ lambda: initial_prob)
self.non_head_vacancy_table[dv][max_v] = defaultdict(
- lambda: initial_prob
- )
- self.non_head_vacancy_table[-(dv - 1)][max_v] = defaultdict(
- lambda: initial_prob
- )
+ lambda: initial_prob)
+ self.non_head_vacancy_table[-(dv-1)][max_v] = defaultdict(
+ lambda: initial_prob)
def train(self, parallel_corpus):
counts = Model5Counts()
sampled_alignments, best_alignment = self.sample(aligned_sentence)
# Record the most probable alignment
aligned_sentence.alignment = Alignment(
- best_alignment.zero_indexed_alignment()
- )
+ best_alignment.zero_indexed_alignment())
# E step (a): Compute normalization factors to weigh counts
total_count = self.prob_of_alignments(sampled_alignments)
for j in range(1, m + 1):
counts.update_lexical_translation(
- normalized_count, alignment_info, j
- )
+ normalized_count, alignment_info, j)
slots = Slots(m)
for i in range(1, l + 1):
counts.update_vacancy(
- normalized_count, alignment_info, i, self.trg_classes, slots
- )
+ normalized_count, alignment_info, i,
+ self.trg_classes, slots)
counts.update_null_generation(normalized_count, alignment_info)
counts.update_fertility(normalized_count, alignment_info)
and the best alignment of the set for convenience
:rtype: set(AlignmentInfo), AlignmentInfo
"""
- sampled_alignments, best_alignment = super(IBMModel5, self).sample(
- sentence_pair
- )
+ sampled_alignments, best_alignment = super(
+ IBMModel5, self).sample(sentence_pair)
return self.prune(sampled_alignments), best_alignment
def prune(self, alignment_infos):
old_alignment = alignment
for neighbor_alignment in self.neighboring(alignment, j_pegged):
neighbor_probability = IBMModel4.model4_prob_t_a_given_s(
- neighbor_alignment, self
- )
+ neighbor_alignment, self)
if neighbor_probability > max_probability:
alignment = neighbor_alignment
p0 = 1 - p1
null_fertility = alignment_info.fertility_of_i(0)
m = len(alignment_info.trg_sentence) - 1
- value *= pow(p1, null_fertility) * pow(p0, m - 2 * null_fertility)
+ value *= (pow(p1, null_fertility) * pow(p0, m - 2 * null_fertility))
if value < MIN_PROB:
return MIN_PROB
src_sentence = alignment_info.src_sentence
for i in range(1, len(src_sentence)):
fertility = alignment_info.fertility_of_i(i)
- value *= (
- factorial(fertility)
- * self.fertility_table[fertility][src_sentence[i]]
- )
+ value *= (factorial(fertility) *
+ self.fertility_table[fertility][src_sentence[i]])
if value < MIN_PROB:
return MIN_PROB
return value
previous_vacancies = slots.vacancies_at(previous_position)
j = tablet[k]
dv = slots.vacancies_at(j) - previous_vacancies
- max_v = total_vacancies - tablet_length + k + 1 - previous_vacancies
+ max_v = (total_vacancies - tablet_length + k + 1 -
+ previous_vacancies)
trg_class = self.trg_classes[alignment_info.trg_sentence[j]]
value *= self.non_head_vacancy_table[dv][max_v][trg_class]
slots.occupy(j) # mark position as occupied
return MIN_PROB
return value
-
# end nested functions
# Abort computation whenever probability falls below MIN_PROB at
for dv, max_vs in counts.head_vacancy.items():
for max_v, trg_classes in max_vs.items():
for t_cls in trg_classes:
- estimate = (
- counts.head_vacancy[dv][max_v][t_cls]
- / counts.head_vacancy_for_any_dv[max_v][t_cls]
- )
- head_vacancy_table[dv][max_v][t_cls] = max(estimate, MIN_PROB)
+ estimate = (counts.head_vacancy[dv][max_v][t_cls] /
+ counts.head_vacancy_for_any_dv[max_v][t_cls])
+ head_vacancy_table[dv][max_v][t_cls] = max(estimate,
+ MIN_PROB)
non_head_vacancy_table = self.non_head_vacancy_table
for dv, max_vs in counts.non_head_vacancy.items():
for max_v, trg_classes in max_vs.items():
for t_cls in trg_classes:
estimate = (
- counts.non_head_vacancy[dv][max_v][t_cls]
- / counts.non_head_vacancy_for_any_dv[max_v][t_cls]
- )
- non_head_vacancy_table[dv][max_v][t_cls] = max(estimate, MIN_PROB)
+ counts.non_head_vacancy[dv][max_v][t_cls] /
+ counts.non_head_vacancy_for_any_dv[max_v][t_cls])
+ non_head_vacancy_table[dv][max_v][t_cls] = max(estimate,
+ MIN_PROB)
class Model5Counts(Counts):
Data object to store counts of various parameters during training.
Includes counts for vacancies.
"""
-
def __init__(self):
super(Model5Counts, self).__init__()
self.head_vacancy = defaultdict(
- lambda: defaultdict(lambda: defaultdict(lambda: 0.0))
- )
- self.head_vacancy_for_any_dv = defaultdict(lambda: defaultdict(lambda: 0.0))
+ lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
+ self.head_vacancy_for_any_dv = defaultdict(
+ lambda: defaultdict(lambda: 0.0))
self.non_head_vacancy = defaultdict(
- lambda: defaultdict(lambda: defaultdict(lambda: 0.0))
- )
- self.non_head_vacancy_for_any_dv = defaultdict(lambda: defaultdict(lambda: 0.0))
+ lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
+ self.non_head_vacancy_for_any_dv = defaultdict(
+ lambda: defaultdict(lambda: 0.0))
def update_vacancy(self, count, alignment_info, i, trg_classes, slots):
"""
previous_vacancies = slots.vacancies_at(previous_position)
j = tablet[k]
dv = slots.vacancies_at(j) - previous_vacancies
- max_v = total_vacancies - tablet_length + k + 1 - previous_vacancies
+ max_v = (total_vacancies - tablet_length + k + 1 -
+ previous_vacancies)
trg_class = trg_classes[alignment_info.trg_sentence[j]]
self.non_head_vacancy[dv][max_v][trg_class] += count
self.non_head_vacancy_for_any_dv[max_v][trg_class] += count
Represents positions in a target sentence. Used to keep track of
which slot (position) is occupied.
"""
-
def __init__(self, target_sentence_length):
self._slots = [False] * (target_sentence_length + 1) # 1-indexed
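A toy illustration of the vacancy bookkeeping this class supports (values invented): a slot stays vacant until some cept occupies it, and the vacancy difference dv used as a table key compares counts of free slots:

slots = [None, True, False, True, False, False]   # 1-indexed; True = occupied

def vacancies_at(j):
    # number of vacant slots at positions 1..j
    return sum(1 for k in range(1, j + 1) if not slots[k])

dv = vacancies_at(4) - vacancies_at(1)   # -> 2, a vacancy difference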
# -*- coding: utf-8 -*-
# Natural Language Toolkit: IBM Model Core
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Tah Wei Hoon <hoon.tw@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
Translation: Parameter Estimation. Computational Linguistics, 19 (2),
263-311.
"""
-
+from __future__ import division
from bisect import insort_left
from collections import defaultdict
from copy import deepcopy
"""
Abstract base class for all IBM models
"""
-
# Avoid division by zero and precision errors by imposing a minimum
# value for probabilities. Note that this approach is theoretically
# incorrect, since it may create probabilities that sum to more
def reset_probabilities(self):
self.translation_table = defaultdict(
- lambda: defaultdict(lambda: IBMModel.MIN_PROB)
- )
+ lambda: defaultdict(lambda: IBMModel.MIN_PROB))
"""
dict[str][str]: float. Probability(target word | source word).
Values accessed as ``translation_table[target_word][source_word]``.
"""
self.alignment_table = defaultdict(
- lambda: defaultdict(
- lambda: defaultdict(lambda: defaultdict(lambda: IBMModel.MIN_PROB))
- )
- )
+ lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(
+ lambda: IBMModel.MIN_PROB))))
"""
dict[int][int][int][int]: float. Probability(i | j,l,m).
Values accessed as ``alignment_table[i][j][l][m]``.
Used in model 2 and hill climbing in models 3 and above
"""
- self.fertility_table = defaultdict(lambda: defaultdict(lambda: self.MIN_PROB))
+ self.fertility_table = defaultdict(
+ lambda: defaultdict(lambda: self.MIN_PROB))
"""
dict[int][str]: float. Probability(fertility | source word).
Values accessed as ``fertility_table[fertility][source_word]``.
# with the constraint that j is aligned (pegged) to i
for j in range(1, m + 1):
for i in range(0, l + 1):
- initial_alignment = self.best_model2_alignment(sentence_pair, j, i)
+ initial_alignment = self.best_model2_alignment(
+ sentence_pair, j, i)
potential_alignment = self.hillclimb(initial_alignment, j)
neighbors = self.neighboring(potential_alignment, j)
sampled_alignments.update(neighbors)
:type i_pegged: int
"""
src_sentence = [None] + sentence_pair.mots
- trg_sentence = ["UNUSED"] + sentence_pair.words # 1-indexed
+ trg_sentence = ['UNUSED'] + sentence_pair.words # 1-indexed
l = len(src_sentence) - 1 # exclude NULL
m = len(trg_sentence) - 1
for i in range(0, l + 1):
s = src_sentence[i]
- alignment_prob = (
- self.translation_table[t][s] * self.alignment_table[i][j][l][m]
- )
+ alignment_prob = (self.translation_table[t][s] *
+ self.alignment_table[i][j][l][m])
if alignment_prob >= max_alignment_prob:
max_alignment_prob = alignment_prob
alignment[j] = best_i
cepts[best_i].append(j)
- return AlignmentInfo(
- tuple(alignment), tuple(src_sentence), tuple(trg_sentence), cepts
- )
+ return AlignmentInfo(tuple(alignment), tuple(src_sentence),
+ tuple(trg_sentence), cepts)
def hillclimb(self, alignment_info, j_pegged=None):
"""
new_cepts[old_i].remove(j)
new_alignment_info = AlignmentInfo(
- tuple(new_alignment),
- alignment_info.src_sentence,
- alignment_info.trg_sentence,
- new_cepts,
- )
+ tuple(new_alignment), alignment_info.src_sentence,
+ alignment_info.trg_sentence, new_cepts)
neighbors.add(new_alignment_info)
for j in range(1, m + 1):
insort_left(new_cepts[i], other_j)
new_alignment_info = AlignmentInfo(
- tuple(new_alignment),
- alignment_info.src_sentence,
- alignment_info.trg_sentence,
- new_cepts,
- )
+ tuple(new_alignment), alignment_info.src_sentence,
+ alignment_info.trg_sentence, new_cepts)
neighbors.add(new_alignment_info)
return neighbors
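The hill-climbing search that consumes these neighbours can be summarised as a generic local search (a simplified sketch, not the exact implementation):

def hillclimb_sketch(alignment, score, neighbors):
    # Move to the best-scoring neighbour until no neighbour improves on
    # the current alignment, i.e. until a local optimum is reached.
    while True:
        candidates = list(neighbors(alignment))
        if not candidates:
            return alignment
        best = max(candidates, key=score)
        if score(best) <= score(alignment):
            return alignment
        alignment = best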
def maximize_fertility_probabilities(self, counts):
for phi, src_words in counts.fertility.items():
for s in src_words:
- estimate = counts.fertility[phi][s] / counts.fertility_for_any_phi[s]
+ estimate = (counts.fertility[phi][s] /
+ counts.fertility_for_any_phi[s])
self.fertility_table[phi][s] = max(estimate, IBMModel.MIN_PROB)
def maximize_null_generation_probabilities(self, counts):
def __init__(self, alignment, src_sentence, trg_sentence, cepts):
if not isinstance(alignment, tuple):
- raise TypeError(
- "The alignment must be a tuple because it is used "
- "to uniquely identify AlignmentInfo objects."
- )
+ raise TypeError("The alignment must be a tuple because it is used "
+ "to uniquely identify AlignmentInfo objects.")
self.alignment = alignment
"""
"""
i = self.alignment[j]
if i == 0:
- raise ValueError(
- "Words aligned to NULL cannot have a previous "
- "cept because NULL has no position"
- )
+ raise ValueError("Words aligned to NULL cannot have a previous "
+ "cept because NULL has no position")
previous_cept = i - 1
while previous_cept > 0 and self.fertility_of_i(previous_cept) == 0:
previous_cept -= 1
"""
Data object to store counts of various parameters during training
"""
-
def __init__(self):
self.t_given_s = defaultdict(lambda: defaultdict(lambda: 0.0))
self.any_t_given_s = defaultdict(lambda: 0.0)
+++ /dev/null
-# -*- coding: utf-8 -*-
-# Natural Language Toolkit: Machine Translation
-#
-# Copyright (C) 2001-2020 NLTK Project
-# Author: Uday Krishna <udaykrishna5@gmail.com>
-# URL: <http://nltk.org/>
-# For license information, see LICENSE.TXT
-
-
-from nltk.stem.porter import PorterStemmer
-from nltk.corpus import wordnet
-from itertools import chain, product
-
-
-def _generate_enums(hypothesis, reference, preprocess=str.lower):
- """
- Takes in string inputs for hypothesis and reference and returns
- enumerated word lists for each of them
-
- :param hypothesis: hypothesis string
- :type hypothesis: str
- :param reference: reference string
- :type reference: str
-    :param preprocess: preprocessing method (default str.lower)
-    :type preprocess: method
-    :return: enumerated word lists
- :rtype: list of 2D tuples, list of 2D tuples
- """
- hypothesis_list = list(enumerate(preprocess(hypothesis).split()))
- reference_list = list(enumerate(preprocess(reference).split()))
- return hypothesis_list, reference_list
-
-
-def exact_match(hypothesis, reference):
- """
- matches exact words in hypothesis and reference
- and returns a word mapping based on the enumerated
- word id between hypothesis and reference
-
- :param hypothesis: hypothesis string
- :type hypothesis: str
- :param reference: reference string
- :type reference: str
- :return: enumerated matched tuples, enumerated unmatched hypothesis tuples,
- enumerated unmatched reference tuples
- :rtype: list of 2D tuples, list of 2D tuples, list of 2D tuples
- """
- hypothesis_list, reference_list = _generate_enums(hypothesis, reference)
- return _match_enums(hypothesis_list, reference_list)
-
-
-def _match_enums(enum_hypothesis_list, enum_reference_list):
- """
- matches exact words in hypothesis and reference and returns
- a word mapping between enum_hypothesis_list and enum_reference_list
- based on the enumerated word id.
-
- :param enum_hypothesis_list: enumerated hypothesis list
- :type enum_hypothesis_list: list of tuples
- :param enum_reference_list: enumerated reference list
- :type enum_reference_list: list of 2D tuples
- :return: enumerated matched tuples, enumerated unmatched hypothesis tuples,
- enumerated unmatched reference tuples
- :rtype: list of 2D tuples, list of 2D tuples, list of 2D tuples
- """
- word_match = []
- for i in range(len(enum_hypothesis_list))[::-1]:
- for j in range(len(enum_reference_list))[::-1]:
- if enum_hypothesis_list[i][1] == enum_reference_list[j][1]:
- word_match.append(
- (enum_hypothesis_list[i][0], enum_reference_list[j][0])
- )
- (enum_hypothesis_list.pop(i)[1], enum_reference_list.pop(j)[1])
- break
- return word_match, enum_hypothesis_list, enum_reference_list
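For reference, the helper's behaviour on a toy pair (hypothetical sentences):

hyp, ref = _generate_enums('the cat sat', 'the cat is sitting')
word_match, hyp_left, ref_left = _match_enums(hyp, ref)
# word_match -> [(1, 1), (0, 0)]; 'sat', 'is' and 'sitting' stay unmatched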
-
-
-def _enum_stem_match(
- enum_hypothesis_list, enum_reference_list, stemmer=PorterStemmer()
-):
- """
- Stems each word and matches them in hypothesis and reference
- and returns a word mapping between enum_hypothesis_list and
- enum_reference_list based on the enumerated word id. The function also
-    returns an enumerated list of unmatched words for hypothesis and reference.
-
- :param enum_hypothesis_list:
- :type enum_hypothesis_list:
- :param enum_reference_list:
- :type enum_reference_list:
- :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
- :type stemmer: nltk.stem.api.StemmerI or any class that implements a stem method
- :return: enumerated matched tuples, enumerated unmatched hypothesis tuples,
- enumerated unmatched reference tuples
- :rtype: list of 2D tuples, list of 2D tuples, list of 2D tuples
- """
- stemmed_enum_list1 = [
- (word_pair[0], stemmer.stem(word_pair[1])) for word_pair in enum_hypothesis_list
- ]
-
- stemmed_enum_list2 = [
- (word_pair[0], stemmer.stem(word_pair[1])) for word_pair in enum_reference_list
- ]
-
- word_match, enum_unmat_hypo_list, enum_unmat_ref_list = _match_enums(
- stemmed_enum_list1, stemmed_enum_list2
- )
-
- enum_unmat_hypo_list = (
- list(zip(*enum_unmat_hypo_list)) if len(enum_unmat_hypo_list) > 0 else []
- )
-
- enum_unmat_ref_list = (
- list(zip(*enum_unmat_ref_list)) if len(enum_unmat_ref_list) > 0 else []
- )
-
- enum_hypothesis_list = list(
- filter(lambda x: x[0] not in enum_unmat_hypo_list, enum_hypothesis_list)
- )
-
- enum_reference_list = list(
- filter(lambda x: x[0] not in enum_unmat_ref_list, enum_reference_list)
- )
-
- return word_match, enum_hypothesis_list, enum_reference_list
-
-
-def stem_match(hypothesis, reference, stemmer=PorterStemmer()):
- """
- Stems each word and matches them in hypothesis and reference
- and returns a word mapping between hypothesis and reference
-
- :param hypothesis:
- :type hypothesis:
- :param reference:
- :type reference:
- :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
- :type stemmer: nltk.stem.api.StemmerI or any class that
- implements a stem method
- :return: enumerated matched tuples, enumerated unmatched hypothesis tuples,
- enumerated unmatched reference tuples
- :rtype: list of 2D tuples, list of 2D tuples, list of 2D tuples
- """
- enum_hypothesis_list, enum_reference_list = _generate_enums(hypothesis, reference)
- return _enum_stem_match(enum_hypothesis_list, enum_reference_list, stemmer=stemmer)
-
-
-def _enum_wordnetsyn_match(enum_hypothesis_list, enum_reference_list, wordnet=wordnet):
- """
- Matches each word in reference to a word in hypothesis
- if any synonym of a hypothesis word is the exact match
- to the reference word.
-
- :param enum_hypothesis_list: enumerated hypothesis list
- :param enum_reference_list: enumerated reference list
- :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
- :type wordnet: WordNetCorpusReader
- :return: list of matched tuples, unmatched hypothesis list, unmatched reference list
- :rtype: list of tuples, list of tuples, list of tuples
-
- """
- word_match = []
- for i in range(len(enum_hypothesis_list))[::-1]:
- hypothesis_syns = set(
- chain(
- *[
- [
- lemma.name()
- for lemma in synset.lemmas()
- if lemma.name().find("_") < 0
- ]
- for synset in wordnet.synsets(enum_hypothesis_list[i][1])
- ]
- )
- ).union({enum_hypothesis_list[i][1]})
- for j in range(len(enum_reference_list))[::-1]:
- if enum_reference_list[j][1] in hypothesis_syns:
- word_match.append(
- (enum_hypothesis_list[i][0], enum_reference_list[j][0])
- )
- enum_hypothesis_list.pop(i), enum_reference_list.pop(j)
- break
- return word_match, enum_hypothesis_list, enum_reference_list
-
-
-def wordnetsyn_match(hypothesis, reference, wordnet=wordnet):
- """
- Matches each word in reference to a word in hypothesis if any synonym
- of a hypothesis word is the exact match to the reference word.
-
- :param hypothesis: hypothesis string
- :param reference: reference string
- :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
- :type wordnet: WordNetCorpusReader
- :return: list of mapped tuples
- :rtype: list of tuples
- """
- enum_hypothesis_list, enum_reference_list = _generate_enums(hypothesis, reference)
- return _enum_wordnetsyn_match(
- enum_hypothesis_list, enum_reference_list, wordnet=wordnet
- )
-
-
-def _enum_allign_words(
- enum_hypothesis_list, enum_reference_list, stemmer=PorterStemmer(), wordnet=wordnet
-):
- """
- Aligns/matches words in the hypothesis to reference by sequentially
- applying exact match, stemmed match and wordnet based synonym match.
-    In case there are multiple matches, the match which has the least number
-    of crossings is chosen. Takes enumerated lists as input instead of
-    string input.
-
- :param enum_hypothesis_list: enumerated hypothesis list
- :param enum_reference_list: enumerated reference list
- :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
- :type stemmer: nltk.stem.api.StemmerI or any class that implements a stem method
- :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
- :type wordnet: WordNetCorpusReader
- :return: sorted list of matched tuples, unmatched hypothesis list,
- unmatched reference list
- :rtype: list of tuples, list of tuples, list of tuples
- """
- exact_matches, enum_hypothesis_list, enum_reference_list = _match_enums(
- enum_hypothesis_list, enum_reference_list
- )
-
- stem_matches, enum_hypothesis_list, enum_reference_list = _enum_stem_match(
- enum_hypothesis_list, enum_reference_list, stemmer=stemmer
- )
-
- wns_matches, enum_hypothesis_list, enum_reference_list = _enum_wordnetsyn_match(
- enum_hypothesis_list, enum_reference_list, wordnet=wordnet
- )
-
- return (
- sorted(
- exact_matches + stem_matches + wns_matches, key=lambda wordpair: wordpair[0]
- ),
- enum_hypothesis_list,
- enum_reference_list,
- )
-
-
-def allign_words(hypothesis, reference, stemmer=PorterStemmer(), wordnet=wordnet):
- """
- Aligns/matches words in the hypothesis to reference by sequentially
- applying exact match, stemmed match and wordnet based synonym match.
-    In case there are multiple matches, the match which has the least number
-    of crossings is chosen.
-
- :param hypothesis: hypothesis string
- :param reference: reference string
- :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
- :type stemmer: nltk.stem.api.StemmerI or any class that implements a stem method
- :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
- :type wordnet: WordNetCorpusReader
- :return: sorted list of matched tuples, unmatched hypothesis list, unmatched reference list
- :rtype: list of tuples, list of tuples, list of tuples
- """
- enum_hypothesis_list, enum_reference_list = _generate_enums(hypothesis, reference)
- return _enum_allign_words(
- enum_hypothesis_list, enum_reference_list, stemmer=stemmer, wordnet=wordnet
- )
-
-
-def _count_chunks(matches):
- """
- Counts the fewest possible number of chunks such that matched unigrams
-    of each chunk are adjacent to each other. This is used to calculate the
- fragmentation part of the metric.
-
- :param matches: list containing a mapping of matched words (output of allign_words)
-    :return: Number of chunks a sentence is divided into post alignment
- :rtype: int
- """
- i = 0
- chunks = 1
- while i < len(matches) - 1:
- if (matches[i + 1][0] == matches[i][0] + 1) and (
- matches[i + 1][1] == matches[i][1] + 1
- ):
- i += 1
- continue
- i += 1
- chunks += 1
- return chunks
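A toy call illustrating the chunk count (invented matches): (0, 0) and (1, 1) are adjacent in both sentences and form one chunk, while (3, 5) is contiguous with neither and starts a second chunk.

_count_chunks([(0, 0), (1, 1), (3, 5)])   # -> 2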
-
-
-def single_meteor_score(
- reference,
- hypothesis,
- preprocess=str.lower,
- stemmer=PorterStemmer(),
- wordnet=wordnet,
- alpha=0.9,
- beta=3,
- gamma=0.5,
-):
- """
-    Calculates METEOR score for single hypothesis and reference as per
-    "Meteor: An Automatic Metric for MT Evaluation with High Levels of
- Correlation with Human Judgments" by Alon Lavie and Abhaya Agarwal,
- in Proceedings of ACL.
- http://www.cs.cmu.edu/~alavie/METEOR/pdf/Lavie-Agarwal-2007-METEOR.pdf
-
-
- >>> hypothesis1 = 'It is a guide to action which ensures that the military always obeys the commands of the party'
-
- >>> reference1 = 'It is a guide to action that ensures that the military will forever heed Party commands'
-
-
- >>> round(single_meteor_score(reference1, hypothesis1),4)
- 0.7398
-
-    If no words match during the alignment, the method returns the
-    score as 0. We can safely return a zero instead of raising a
-    division by zero error, as no match usually implies a bad translation.
-
-    >>> round(single_meteor_score('this is a cat', 'non matching hypothesis'),4)
- 0.0
-
-    :param reference: a reference sentence
-    :type reference: str
- :param hypothesis: a hypothesis sentence
- :type hypothesis: str
- :param preprocess: preprocessing function (default str.lower)
- :type preprocess: method
- :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
- :type stemmer: nltk.stem.api.StemmerI or any class that implements a stem method
- :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
- :type wordnet: WordNetCorpusReader
- :param alpha: parameter for controlling relative weights of precision and recall.
- :type alpha: float
-    :param beta: parameter for controlling shape of penalty as a
-                 function of fragmentation.
-    :type beta: float
-    :param gamma: relative weight assigned to fragmentation penalty.
- :type gamma: float
- :return: The sentence-level METEOR score.
- :rtype: float
- """
- enum_hypothesis, enum_reference = _generate_enums(
- hypothesis, reference, preprocess=preprocess
- )
- translation_length = len(enum_hypothesis)
- reference_length = len(enum_reference)
- matches, _, _ = _enum_allign_words(enum_hypothesis, enum_reference, stemmer=stemmer)
- matches_count = len(matches)
- try:
- precision = float(matches_count) / translation_length
- recall = float(matches_count) / reference_length
- fmean = (precision * recall) / (alpha * precision + (1 - alpha) * recall)
- chunk_count = float(_count_chunks(matches))
- frag_frac = chunk_count / matches_count
- except ZeroDivisionError:
- return 0.0
- penalty = gamma * frag_frac ** beta
- return (1 - penalty) * fmean
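A worked recomputation of the combination above, with invented counts:

matches_count = 6.0
translation_length, reference_length = 8.0, 7.0
alpha, beta, gamma = 0.9, 3, 0.5                 # the defaults above
precision = matches_count / translation_length   # 0.75
recall = matches_count / reference_length        # ~0.857
fmean = (precision * recall) / (alpha * precision + (1 - alpha) * recall)
penalty = gamma * (2.0 / matches_count) ** beta  # assuming 2 chunks
score = (1 - penalty) * fmean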
-
-
-def meteor_score(
- references,
- hypothesis,
- preprocess=str.lower,
- stemmer=PorterStemmer(),
- wordnet=wordnet,
- alpha=0.9,
- beta=3,
- gamma=0.5,
-):
- """
- Calculates METEOR score for hypothesis with multiple references as
- described in "Meteor: An Automatic Metric for MT Evaluation with
-    High Levels of Correlation with Human Judgments" by Alon Lavie and
- Abhaya Agarwal, in Proceedings of ACL.
- http://www.cs.cmu.edu/~alavie/METEOR/pdf/Lavie-Agarwal-2007-METEOR.pdf
-
-
- In case of multiple references the best score is chosen. This method
- iterates over single_meteor_score and picks the best pair among all
- the references for a given hypothesis
-
- >>> hypothesis1 = 'It is a guide to action which ensures that the military always obeys the commands of the party'
- >>> hypothesis2 = 'It is to insure the troops forever hearing the activity guidebook that party direct'
-
- >>> reference1 = 'It is a guide to action that ensures that the military will forever heed Party commands'
- >>> reference2 = 'It is the guiding principle which guarantees the military forces always being under the command of the Party'
- >>> reference3 = 'It is the practical guide for the army always to heed the directions of the party'
-
- >>> round(meteor_score([reference1, reference2, reference3], hypothesis1),4)
- 0.7398
-
-    If no words match during the alignment, the method returns the
-    score as 0. We can safely return a zero instead of raising a
-    division by zero error, as no match usually implies a bad translation.
-
- >>> round(meteor_score(['this is a cat'], 'non matching hypothesis'),4)
- 0.0
-
- :param references: reference sentences
- :type references: list(str)
- :param hypothesis: a hypothesis sentence
- :type hypothesis: str
- :param preprocess: preprocessing function (default str.lower)
- :type preprocess: method
- :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
- :type stemmer: nltk.stem.api.StemmerI or any class that implements a stem method
- :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
- :type wordnet: WordNetCorpusReader
- :param alpha: parameter for controlling relative weights of precision and recall.
- :type alpha: float
-    :param beta: parameter for controlling shape of penalty as a function
-                 of fragmentation.
-    :type beta: float
-    :param gamma: relative weight assigned to fragmentation penalty.
- :type gamma: float
- :return: The sentence-level METEOR score.
- :rtype: float
- """
- return max(
- [
- single_meteor_score(
- reference,
- hypothesis,
- stemmer=stemmer,
- wordnet=wordnet,
- alpha=alpha,
- beta=beta,
- gamma=gamma,
- )
- for reference in references
- ]
- )
# Natural Language Toolkit: Translation metrics
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Will Zhang <wilzzha@gmail.com>
# Guan Gui <ggui@student.unimelb.edu.au>
# Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
-
+from __future__ import division
def alignment_error_rate(reference, hypothesis, possible=None):
"""
if possible is None:
possible = reference
else:
- assert reference.issubset(possible) # sanity check
+ assert(reference.issubset(possible)) # sanity check
- return 1.0 - (len(hypothesis & reference) + len(hypothesis & possible)) / float(
- len(hypothesis) + len(reference)
- )
+ return (1.0 - (len(hypothesis & reference) + len(hypothesis & possible)) /
+ float(len(hypothesis) + len(reference)))
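A quick check of the formula with hypothetical gold and predicted alignments (when possible is omitted it defaults to reference):

from nltk.translate import Alignment

reference = Alignment([(0, 0), (1, 1), (2, 2)])
hypothesis = Alignment([(0, 0), (1, 2), (2, 2)])
# 1.0 - (|A∩S| + |A∩P|) / (|A| + |S|) = 1.0 - (2 + 2) / (3 + 3) ≈ 0.333
alignment_error_rate(reference, hypothesis)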
# -*- coding: utf-8 -*-
# Natural Language Toolkit: NIST Score
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Authors:
# Contributors:
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""NIST score implementation."""
+from __future__ import division
import math
import fractions
from collections import Counter
from nltk.util import ngrams
+from nltk.translate.bleu_score import modified_precision, closest_ref_length
+
+try:
+ fractions.Fraction(0, 1000, _normalize=False)
+ from fractions import Fraction
+except TypeError:
+ from nltk.compat import Fraction
def sentence_nist(references, hypothesis, n=5):
... 'of', 'the', 'party']
>>> sentence_nist([reference1, reference2, reference3], hypothesis1) # doctest: +ELLIPSIS
- 3.3709...
+ 0.0854...
>>> sentence_nist([reference1, reference2, reference3], hypothesis2) # doctest: +ELLIPSIS
- 1.4619...
+ 0.1485...
:param references: reference sentences
:type references: list(list(str))
"""
return corpus_nist([references], [hypothesis], n)
-
def corpus_nist(list_of_references, hypotheses, n=5):
"""
Calculate a single corpus-level NIST score (aka. system-level NIST) for all
:type n: int
"""
# Before proceeding to compute NIST, perform sanity checks.
- assert len(list_of_references) == len(
- hypotheses
- ), "The number of hypotheses and their reference(s) should be the same"
-
-    # Collect the ngram counts from the reference sentences.
- ngram_freq = Counter()
- total_reference_words = 0
- for (
- references
- ) in list_of_references: # For each source sent, there's a list of reference sents.
- for reference in references:
- # For each order of ngram, count the ngram occurrences.
- for i in range(1, n + 1):
- ngram_freq.update(ngrams(reference, i))
- total_reference_words += len(reference)
-
- # Compute the information weights based on the reference sentences.
+ assert len(list_of_references) == len(hypotheses), "The number of hypotheses and their reference(s) should be the same"
+
+ p_numerators = Counter() # Key = ngram order, and value = no. of ngram matches.
+ p_denominators = Counter() # Key = ngram order, and value = no. of ngram in ref.
+ sysoutput_lengths = Counter() # Key = ngram order, and value = no. of ngram in hyp.
+ hyp_lengths, ref_lengths = 0, 0
+
+ # Iterate through each hypothesis and their corresponding references.
+ for references, hypothesis in zip(list_of_references, hypotheses):
+ # For each order of ngram, calculate the numerator and
+ # denominator for the corpus-level modified precision.
+ for i, _ in enumerate(range(1,n+1)):
+ p_i = modified_precision(references, hypothesis, i)
+ p_numerators[i] += p_i.numerator
+ p_denominators[i] += p_i.denominator
+ # Adds the no. of ngrams in the hypothesis.
+ sysoutput_lengths[i] += len(hypothesis) - (i - 1)
+
+ # Calculate the hypothesis length and the closest reference length.
+ # Adds them to the corpus-level hypothesis and reference counts.
+ hyp_len = len(hypothesis)
+ hyp_lengths += hyp_len
+ ref_lengths += closest_ref_length(references, hyp_len)
+
+ # Calculate corpus-level brevity penalty.
+ bp = nist_length_penalty(ref_lengths, hyp_lengths)
+
+ # Collects the various precision values for the different ngram orders.
+ p_n = [Fraction(p_numerators[i], p_denominators[i], _normalize=False)
+ for i, _ in enumerate(range(1,n+1))]
+
# Eqn 2 in Doddington (2002):
# Info(w_1 ... w_n) = log_2 [ (# of occurrences of w_1 ... w_n-1) / (# of occurrences of w_1 ... w_n) ]
- information_weights = {}
- for _ngram in ngram_freq: # w_1 ... w_n
- _mgram = _ngram[:-1] # w_1 ... w_n-1
- # From https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v13a.pl#L546
- # it's computed as such:
- # denominator = ngram_freq[_mgram] if _mgram and _mgram in ngram_freq else denominator = total_reference_words
- # information_weights[_ngram] = -1 * math.log(ngram_freq[_ngram]/denominator) / math.log(2)
- #
-        # Mathematically, it's equivalent to our implementation:
- if _mgram and _mgram in ngram_freq:
- numerator = ngram_freq[_mgram]
- else:
- numerator = total_reference_words
- information_weights[_ngram] = math.log(numerator / ngram_freq[_ngram], 2)
-
- # Micro-average.
- nist_precision_numerator_per_ngram = Counter()
- nist_precision_denominator_per_ngram = Counter()
- l_ref, l_sys = 0, 0
- # For each order of ngram.
- for i in range(1, n + 1):
- # Iterate through each hypothesis and their corresponding references.
- for references, hypothesis in zip(list_of_references, hypotheses):
- hyp_len = len(hypothesis)
-
- # Find reference with the best NIST score.
- nist_score_per_ref = []
- for reference in references:
- _ref_len = len(reference)
- # Counter of ngrams in hypothesis.
- hyp_ngrams = (
- Counter(ngrams(hypothesis, i))
- if len(hypothesis) >= i
- else Counter()
- )
- ref_ngrams = (
- Counter(ngrams(reference, i)) if len(reference) >= i else Counter()
- )
- ngram_overlaps = hyp_ngrams & ref_ngrams
- # Precision part of the score in Eqn 3
- _numerator = sum(
- information_weights[_ngram] * count
- for _ngram, count in ngram_overlaps.items()
- )
- _denominator = sum(hyp_ngrams.values())
- _precision = 0 if _denominator == 0 else _numerator / _denominator
- nist_score_per_ref.append(
- (_precision, _numerator, _denominator, _ref_len)
- )
- # Best reference.
- precision, numerator, denominator, ref_len = max(nist_score_per_ref)
- nist_precision_numerator_per_ngram[i] += numerator
- nist_precision_denominator_per_ngram[i] += denominator
- l_ref += ref_len
- l_sys += hyp_len
-
- # Final NIST micro-average mean aggregation.
- nist_precision = 0
- for i in nist_precision_numerator_per_ngram:
- precision = (
- nist_precision_numerator_per_ngram[i]
- / nist_precision_denominator_per_ngram[i]
- )
- nist_precision += precision
- # Eqn 3 in Doddington(2002)
- return nist_precision * nist_length_penalty(l_ref, l_sys)
-
-
-def nist_length_penalty(ref_len, hyp_len):
+ info = [0 if p_n[i].numerator == 0 or p_n[i+1].numerator == 0 # Handles math domain and zero division errors.
+ else math.log(p_n[i].numerator / p_n[i+1].numerator)
+ for i in range(len(p_n)-1)]
+ return sum(info_i/sysoutput_lengths[i] for i, info_i in enumerate(info)) * bp
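To make Eqn 2 above concrete with invented reference counts: if the (n-1)-gram 'of' occurs 8 times in the references and the n-gram 'of the' occurs 2 times, the information weight of 'of the' is:

import math

info_weight = math.log(8 / 2, 2)   # log2(count('of') / count('of the')) = 2.0 bits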
+
+
+def nist_length_penalty(closest_ref_len, hyp_len):
"""
Calculates the NIST length penalty, from Eq. 3 in Doddington (2002)
of the score of small variations in the length of a translation.
See Fig. 4 in Doddington (2002)
"""
- ratio = hyp_len / ref_len
+ ratio = closest_ref_len / hyp_len
if 0 < ratio < 1:
ratio_x, score_x = 1.5, 0.5
- beta = math.log(score_x) / math.log(ratio_x) ** 2
- return math.exp(beta * math.log(ratio) ** 2)
- else: # ratio <= 0 or ratio >= 1
+        beta = math.log(score_x) / math.log(ratio_x)**2
+ return math.exp(beta * math.log(ratio)**2)
+ else: # ratio <= 0 or ratio >= 1
return max(min(ratio, 1.0), 0.0)
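Two hand-checkable boundary cases of the penalty defined above:

nist_length_penalty(10, 10)   # ratio = 1.0 -> else branch, returns 1.0
nist_length_penalty(8, 10)    # ratio = 0.8 -> exp(beta * log(0.8)**2), below 1.0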
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Phrase Extraction Algorithm
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Authors: Liling Tan, Fredrik Hedman, Petra Barancikova
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
-
-def extract(
- f_start,
- f_end,
- e_start,
- e_end,
- alignment,
- f_aligned,
- srctext,
- trgtext,
- srclen,
- trglen,
- max_phrase_length,
-):
+def extract(f_start, f_end, e_start, e_end,
+ alignment, f_aligned,
+ srctext, trgtext, srclen, trglen, max_phrase_length):
"""
    This function checks for alignment point consistency and extracts
    phrases using the chunk of consistent phrases.

    A phrase pair (e, f) is consistent with an alignment A if and only if:

    (i) No English words in the phrase pair are aligned to words outside it.

        ∀ e_i ∈ ē : (e_i, f_j) ∈ A ⇒ f_j ∈ f̄

    (ii) No Foreign words in the phrase pair are aligned to words outside it.

        ∀ f_j ∈ f̄ : (e_i, f_j) ∈ A ⇒ e_i ∈ ē

    (iii) The phrase pair contains at least one alignment point.

        ∃ e_i ∈ ē, f_j ∈ f̄ s.t. (e_i, f_j) ∈ A

:type f_start: int
:param f_start: Starting index of the possible foreign language phrases
:type f_end: int
    :param f_end: End index of the possible foreign language phrases
:type e_start: int
:param e_start: Starting index of the possible source language phrases
:type e_end: int
    :param e_end: End index of the possible source language phrases
:type srctext: list
    :param srctext: The source language tokens, a list of strings.
:type trgtext: list
if f_end < 0: # 0-based indexing.
return {}
# Check if alignment points are consistent.
- for e, f in alignment:
- if (f_start <= f <= f_end) and (e < e_start or e > e_end):
+ for e,f in alignment:
+ if ((f_start <= f <= f_end) and (e < e_start or e > e_end)):
return {}
# Add phrase pairs (incl. additional unaligned f)
while True:
# add phrase pair ([e_start, e_end], [fs, fe]) to set E
# Need to +1 in range to include the end-point.
- src_phrase = " ".join(srctext[e_start : e_end + 1])
- trg_phrase = " ".join(trgtext[fs : fe + 1])
+ src_phrase = " ".join(srctext[e_start:e_end+1])
+ trg_phrase = " ".join(trgtext[fs:fe+1])
# Include more data for later ordering.
- phrases.add(
- ((e_start, e_end + 1), (fs, fe + 1), src_phrase, trg_phrase)
- )
+ phrases.add(((e_start, e_end+1), (f_start, f_end+1),
+ src_phrase, trg_phrase))
fe += 1
- if fe in f_aligned or fe >= trglen:
+ if fe in f_aligned or fe == trglen:
break
- fs -= 1
+ fs -=1
if fs in f_aligned or fs < 0:
break
return phrases
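# A concrete trace of the expansion loop above (a sketch based on the
# doctest below): for trgtext "michael geht davon aus , dass er im haus
# bleibt", the comma (index 4) is unaligned, so for the source phrase
# "michael assumes" the minimal span (fs, fe) = (0, 3) is emitted first and
# then re-emitted extended to fe = 4, since an unaligned boundary word may
# attach to either neighbouring phrase. The loop stops once fe hits an
# aligned word (index 5, "dass") or the end of the sentence.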
-
def phrase_extraction(srctext, trgtext, alignment, max_phrase_length=0):
"""
- Phrase extraction algorithm extracts all consistent phrase pairs from
+ Phrase extraction algorithm extracts all consistent phrase pairs from
a word-aligned sentence pair.
- The idea is to loop over all possible source language (e) phrases and find
- the minimal foreign phrase (f) that matches each of them. Matching is done
- by identifying all alignment points for the source phrase and finding the
- shortest foreign phrase that includes all the foreign counterparts for the
+ The idea is to loop over all possible source language (e) phrases and find
+ the minimal foreign phrase (f) that matches each of them. Matching is done
+ by identifying all alignment points for the source phrase and finding the
+ shortest foreign phrase that includes all the foreign counterparts for the
source words.
- In short, a phrase alignment has to
+ In short, a phrase alignment has to
(a) contain all alignment points for all covered words
(b) contain at least one alignment point
-
+
>>> srctext = "michael assumes that he will stay in the house"
>>> trgtext = "michael geht davon aus , dass er im haus bleibt"
- >>> alignment = [(0,0), (1,1), (1,2), (1,3), (2,5), (3,6), (4,9),
+ >>> alignment = [(0,0), (1,1), (1,2), (1,3), (2,5), (3,6), (4,9),
... (5,9), (6,7), (7,7), (8,8)]
>>> phrases = phrase_extraction(srctext, trgtext, alignment)
>>> for i in sorted(phrases):
...    print(i)
((0, 1), (0, 1), 'michael', 'michael')
((0, 2), (0, 4), 'michael assumes', 'michael geht davon aus')
- ((0, 2), (0, 5), 'michael assumes', 'michael geht davon aus ,')
+ ((0, 2), (0, 4), 'michael assumes', 'michael geht davon aus ,')
((0, 3), (0, 6), 'michael assumes that', 'michael geht davon aus , dass')
((0, 4), (0, 7), 'michael assumes that he', 'michael geht davon aus , dass er')
((0, 9), (0, 10), 'michael assumes that he will stay in the house', 'michael geht davon aus , dass er im haus bleibt')
((1, 2), (1, 4), 'assumes', 'geht davon aus')
- ((1, 2), (1, 5), 'assumes', 'geht davon aus ,')
+ ((1, 2), (1, 4), 'assumes', 'geht davon aus ,')
((1, 3), (1, 6), 'assumes that', 'geht davon aus , dass')
((1, 4), (1, 7), 'assumes that he', 'geht davon aus , dass er')
((1, 9), (1, 10), 'assumes that he will stay in the house', 'geht davon aus , dass er im haus bleibt')
- ((2, 3), (4, 6), 'that', ', dass')
+ ((2, 3), (5, 6), 'that', ', dass')
((2, 3), (5, 6), 'that', 'dass')
- ((2, 4), (4, 7), 'that he', ', dass er')
+ ((2, 4), (5, 7), 'that he', ', dass er')
((2, 4), (5, 7), 'that he', 'dass er')
- ((2, 9), (4, 10), 'that he will stay in the house', ', dass er im haus bleibt')
+ ((2, 9), (5, 10), 'that he will stay in the house', ', dass er im haus bleibt')
((2, 9), (5, 10), 'that he will stay in the house', 'dass er im haus bleibt')
((3, 4), (6, 7), 'he', 'er')
((3, 9), (6, 10), 'he will stay in the house', 'er im haus bleibt')
((6, 8), (7, 8), 'in the', 'im')
((6, 9), (7, 9), 'in the house', 'im haus')
((8, 9), (8, 9), 'house', 'haus')
-
+
:type srctext: str
:param srctext: The sentence string from the source language.
:type trgtext: str
:param trgtext: The sentence string from the target language.
- :type alignment: list(tuple)
+ :type alignment: list(tuple)
:param alignment: The word alignment outputs as list of tuples, where
the first elements of tuples are the source words' indices and
second elements are the target words' indices. This is also the output
format of nltk.translate.ibm1
:rtype: list(tuple)
- :return: A list of tuples, each element in a list is a phrase and each
- phrase is a tuple made up of (i) its source location, (ii) its target
+ :return: A list of tuples, each element in a list is a phrase and each
+ phrase is a tuple made up of (i) its source location, (ii) its target
location, (iii) the source phrase and (iv) the target phrase. The phrase
- list of tuples represents all the possible phrases extracted from the
- word alignments.
+ list of tuples represents all the possible phrases extracted from the
+ word alignments.
:type max_phrase_length: int
:param max_phrase_length: maximal phrase length, if 0 or not specified
it is set to the length of the longer sentence (srctext or trgtext).
"""
- srctext = srctext.split() # e
- trgtext = trgtext.split() # f
- srclen = len(srctext) # len(e)
- trglen = len(trgtext) # len(f)
+ srctext = srctext.split() # e
+ trgtext = trgtext.split() # f
+ srclen = len(srctext) # len(e)
+ trglen = len(trgtext) # len(f)
# Keeps an index of which source/target words that are aligned.
- f_aligned = [j for _, j in alignment]
- max_phrase_length = max_phrase_length or max(srclen, trglen)
+ f_aligned = [j for _,j in alignment]
+ max_phrase_length = max_phrase_length or max(srclen,trglen)
# set of phrase pairs BP
bp = set()
# // find the minimally matching foreign phrase
# (f start , f end ) = ( length(f), 0 )
# f_start ∈ [0, len(f) - 1]; f_end ∈ [0, len(f) - 1]
- f_start, f_end = trglen - 1, -1 # 0-based indexing
-
- for e, f in alignment:
+ f_start, f_end = trglen-1 , -1 # 0-based indexing
+
+ for e,f in alignment:
if e_start <= e <= e_end:
f_start = min(f, f_start)
f_end = max(f, f_end)
# add extract (f start , f end , e start , e end ) to set BP
- phrases = extract(
- f_start,
- f_end,
- e_start,
- e_end,
- alignment,
- f_aligned,
- srctext,
- trgtext,
- srclen,
- trglen,
- max_phrase_length,
- )
+ phrases = extract(f_start, f_end, e_start, e_end,
+ alignment, f_aligned,
+ srctext, trgtext, srclen, trglen,
+ max_phrase_length)
if phrases:
bp.update(phrases)
return bp
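# Minimal usage sketch (this module ships as nltk.translate.phrase_based):
#
#     >>> from nltk.translate.phrase_based import phrase_extraction
#     >>> srctext = "michael assumes that he will stay in the house"
#     >>> trgtext = "michael geht davon aus , dass er im haus bleibt"
#     >>> alignment = [(0,0), (1,1), (1,2), (1,3), (2,5), (3,6), (4,9),
#     ...              (5,9), (6,7), (7,7), (8,8)]
#     >>> ('he', 'er') in {(s, t) for _, _, s, t in
#     ...                  phrase_extraction(srctext, trgtext, alignment)}
#     True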
+
# -*- coding: utf-8 -*-
# Natural Language Toolkit: RIBES Score
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Contributors: Katsuhito Sudoh, Liling Tan, Kasramvd, J.F.Sebastian
# Mark Byers, ekhumoro, P. Ortiz
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
""" RIBES score implementation """
-
+from __future__ import division
from itertools import islice
import math
from nltk.util import ngrams, choose
def sentence_ribes(references, hypothesis, alpha=0.25, beta=0.10):
"""
- The RIBES (Rank-based Intuitive Bilingual Evaluation Score) from
- Hideki Isozaki, Tsutomu Hirao, Kevin Duh, Katsuhito Sudoh and
- Hajime Tsukada. 2010. "Automatic Evaluation of Translation Quality for
- Distant Language Pairs". In Proceedings of EMNLP.
- http://www.aclweb.org/anthology/D/D10/D10-1092.pdf
-
- The generic RIBES scores used in shared task, e.g. Workshop for
+ The RIBES (Rank-based Intuitive Bilingual Evaluation Score) from
+ Hideki Isozaki, Tsutomu Hirao, Kevin Duh, Katsuhito Sudoh and
+ Hajime Tsukada. 2010. "Automatic Evaluation of Translation Quality for
+ Distant Language Pairs". In Proceedings of EMNLP.
+ http://www.aclweb.org/anthology/D/D10/D10-1092.pdf
+
+ The generic RIBES scores used in shared task, e.g. Workshop for
Asian Translation (WAT) uses the following RIBES calculations:
-
+
RIBES = kendall_tau * (p1**alpha) * (bp**beta)
-
+
Please note that this re-implementation differs from the official
RIBES implementation and though it emulates the results as described
- in the original paper, there are further optimization implemented
+ in the original paper, there are further optimizations implemented
in the official RIBES script.
-
- Users are encouraged to use the official RIBES script instead of this
+
+ Users are encouraged to use the official RIBES script instead of this
implementation when evaluating their machine translation system. Refer
to http://www.kecl.ntt.co.jp/icl/lirg/ribes/ for the official script.
-
+
:param references: a list of reference sentences
:type references: list(list(str))
:param hypothesis: a hypothesis sentence
# Collects the *worder* from the ranked correlation alignments.
worder = word_rank_alignment(reference, hypothesis)
nkt = kendall_tau(worder)
-
+
# Calculates the brevity penalty
- bp = min(1.0, math.exp(1.0 - len(reference) / len(hypothesis)))
-
+ bp = min(1.0, math.exp(1.0 - len(reference)/len(hypothesis)))
+
# Calculates the unigram precision, *p1*
p1 = len(worder) / len(hypothesis)
-
- _ribes = nkt * (p1 ** alpha) * (bp ** beta)
-
- if _ribes > best_ribes: # Keeps the best score.
+
+ _ribes = nkt * (p1 ** alpha) * (bp ** beta)
+
+ if _ribes > best_ribes: # Keeps the best score.
best_ribes = _ribes
-
+
return best_ribes
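# A quick worked check (a sketch, reusing the (H0, R0) pair from the paper;
# here p1 and bp are both 1.0, so the score reduces to the normalised
# Kendall's tau of 21/55):
#
#     >>> ref = str('he was interested in world history because he '
#     ...           'read the book').split()
#     >>> hyp = str('he read the book because he was interested in world '
#     ...           'history').split()
#     >>> round(sentence_ribes([ref], hyp), 4)
#     0.3818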
def corpus_ribes(list_of_references, hypotheses, alpha=0.25, beta=0.10):
"""
- This function "calculates RIBES for a system output (hypothesis) with
- multiple references, and returns "best" score among multi-references and
- individual scores. The scores are corpus-wise, i.e., averaged by the number
+ This function "calculates RIBES for a system output (hypothesis) with
+ multiple references, and returns "best" score among multi-references and
+ individual scores. The scores are corpus-wise, i.e., averaged by the number
of sentences." (c.f. RIBES version 1.03.1 code).
-
- Different from BLEU's micro-average precision, RIBES calculates the
- macro-average precision by averaging the best RIBES score for each pair of
- hypothesis and its corresponding references
+
+ Different from BLEU's micro-average precision, RIBES calculates the
+ macro-average precision by averaging the best RIBES score for each pair of
+ hypothesis and its corresponding references
>>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
... 'ensures', 'that', 'the', 'military', 'always',
>>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
... 'army', 'always', 'to', 'heed', 'the', 'directions',
... 'of', 'the', 'party']
-
- >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
+
+ >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
... 'interested', 'in', 'world', 'history']
- >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
+ >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
... 'because', 'he', 'read', 'the', 'book']
-
+
>>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
>>> hypotheses = [hyp1, hyp2]
>>> round(corpus_ribes(list_of_references, hypotheses),4)
0.3597
-
+
:param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses
:type list_of_references: list(list(list(str)))
:param hypotheses: a list of hypothesis sentences
for references, hypothesis in zip(list_of_references, hypotheses):
corpus_best_ribes += sentence_ribes(references, hypothesis, alpha, beta)
return corpus_best_ribes / len(hypotheses)
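# In formula form (a sketch): corpus RIBES is the macro-average
#
#     RIBES_corpus = (1/N) * sum_i max_r RIBES(hyp_i, ref_{i,r})
#
# whereas corpus BLEU pools n-gram counts across all sentence pairs before
# taking precisions (micro-average), so the two aggregate very differently.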
-
-
+
+
def position_of_ngram(ngram, sentence):
"""
- This function returns the position of the first instance of the ngram
+ This function returns the position of the first instance of the ngram
appearing in a sentence.
-
+
Note that one could also operate on the string directly, as follows, but
the code is a little convoluted with type casting back and forth:
-
+
char_pos = ' '.join(sent)[:' '.join(sent).index(' '.join(ngram))]
word_pos = char_pos.count(' ')
-
+
Another way to conceive this is:
-
- return next(i for i, ng in enumerate(ngrams(sentence, len(ngram)))
+
+ return next(i for i, ng in enumerate(ngrams(sentence, len(ngram)))
if ng == ngram)
-
+
:param ngram: The ngram that needs to be searched
:type ngram: tuple
:param sentence: The list of tokens to search from.
:type sentence: list(str)
"""
# Iterates through the ngrams in sentence.
- for i, sublist in enumerate(ngrams(sentence, len(ngram))):
+ for i,sublist in enumerate(ngrams(sentence, len(ngram))):
# Returns the index of the word when ngram matches.
if ngram == sublist:
return i
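# e.g. (a sketch; `ngrams` is nltk.util.ngrams):
#
#     >>> position_of_ngram(('davon', 'aus'),
#     ...                   'michael geht davon aus , dass er'.split())
#     2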
def word_rank_alignment(reference, hypothesis, character_based=False):
- """
+ """
This is the word rank alignment algorithm described in the paper to produce
- the *worder* list, i.e. a list of word indices of the hypothesis word orders
+ the *worder* list, i.e. a list of word indices of the hypothesis word orders
w.r.t. the list of reference words.
-
- Below is (H0, R0) example from the Isozaki et al. 2010 paper,
+
+ Below is (H0, R0) example from the Isozaki et al. 2010 paper,
note the examples are indexed from 1 but the results here are indexed from 0:
-
+
>>> ref = str('he was interested in world history because he '
... 'read the book').split()
>>> hyp = str('he read the book because he was interested in world '
... 'history').split()
>>> word_rank_alignment(ref, hyp)
[7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
-
+
The (H1, R1) example from the paper, note the 0th index:
-
+
>>> ref = 'John hit Bob yesterday'.split()
>>> hyp = 'Bob hit John yesterday'.split()
>>> word_rank_alignment(ref, hyp)
[2, 1, 0, 3]
Here is the (H2, R2) example from the paper, note the 0th index here too:
-
+
>>> ref = 'the boy read the book'.split()
>>> hyp = 'the book was read by the boy'.split()
>>> word_rank_alignment(ref, hyp)
[3, 4, 2, 0, 1]
-
+
:param reference: a reference sentence
:type reference: list(str)
:param hypothesis: a hypothesis sentence
# This is used for matching context window later in the algorithm.
ref_ngrams = []
hyp_ngrams = []
- for n in range(1, len(reference) + 1):
+ for n in range(1, len(reference)+1):
for ng in ngrams(reference, n):
ref_ngrams.append(ng)
for ng in ngrams(hypothesis, n):
# If word is not in the reference, continue.
if h_word not in reference:
continue
- # If we can determine one-to-one word correspondence for unigrams that
+ # If we can determine one-to-one word correspondence for unigrams that
# only appear once in both the reference and hypothesis.
elif hypothesis.count(h_word) == reference.count(h_word) == 1:
worder.append(reference.index(h_word))
else:
- max_window_size = max(i, hyp_len - i + 1)
+ max_window_size = max(i, hyp_len-i+1)
for window in range(1, max_window_size):
- if i + window < hyp_len: # If searching the right context is possible.
+ if i+window < hyp_len: # If searching the right context is possible.
# Retrieve the right context window.
- right_context_ngram = tuple(islice(hypothesis, i, i + window + 1))
+ right_context_ngram = tuple(islice(hypothesis, i, i+window+1))
num_times_in_ref = ref_ngrams.count(right_context_ngram)
- num_times_in_hyp = hyp_ngrams.count(right_context_ngram)
+ num_times_in_hyp = hyp_ngrams.count(right_context_ngram)
# If ngram appears only once in both ref and hyp.
if num_times_in_ref == num_times_in_hyp == 1:
# Find the position of ngram that matched the reference.
pos = position_of_ngram(right_context_ngram, reference)
worder.append(pos) # Add the positions of the ngram.
break
- if window <= i: # If searching the left context is possible.
+ if window <= i: # If searching the left context is possible.
# Retrieve the left context window.
- left_context_ngram = tuple(islice(hypothesis, i - window, i + 1))
+ left_context_ngram = tuple(islice(hypothesis, i-window, i+1))
num_times_in_ref = ref_ngrams.count(left_context_ngram)
num_times_in_hyp = hyp_ngrams.count(left_context_ngram)
if num_times_in_ref == num_times_in_hyp == 1:
# Find the position of ngram that matched the reference.
pos = position_of_ngram(left_context_ngram, reference)
# Add the positions of the ngram.
- worder.append(pos + len(left_context_ngram) - 1)
+ worder.append(pos+ len(left_context_ngram) -1)
break
return worder
-
+
def find_increasing_sequences(worder):
"""
- Given the *worder* list, this function groups monotonic +1 sequences.
-
+ Given the *worder* list, this function groups monotonic +1 sequences.
+
>>> worder = [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
>>> list(find_increasing_sequences(worder))
[(7, 8, 9, 10), (0, 1, 2, 3, 4, 5)]
-
+
:param worder: The worder list output from word_rank_alignment
:type worder: list(int)
"""
"""
Calculates the Kendall's Tau correlation coefficient given the *worder*
list of word alignments from word_rank_alignment(), using the formula:
-
+
tau = 2 * num_increasing_pairs / num_possible_pairs - 1
-
+
Note that the no. of increasing pairs can be discontinuous in the *worder*
- list and each each increasing sequence can be tabulated as choose(len(seq), 2)
+ list and each increasing sequence can be tabulated as choose(len(seq), 2)
no. of increasing pairs, e.g.
-
+
>>> worder = [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
>>> number_possible_pairs = choose(len(worder), 2)
>>> round(kendall_tau(worder, normalize=False),3)
-0.236
>>> round(kendall_tau(worder),3)
0.382
-
+
:param worder: The worder list output from word_rank_alignment
:type worder: list(int)
:param normalize: Flag to indicate normalization
# Extract the groups of increasing/monotonic sequences.
increasing_sequences = find_increasing_sequences(worder)
# Calculate no. of increasing_pairs in *worder* list.
- num_increasing_pairs = sum(choose(len(seq), 2) for seq in increasing_sequences)
+ num_increasing_pairs = sum(choose(len(seq),2) for seq in increasing_sequences)
# Calculate no. of possible pairs.
num_possible_pairs = choose(worder_len, 2)
# Kendall's Tau computation.
- tau = 2 * num_increasing_pairs / num_possible_pairs - 1
- if normalize: # If normalized, the tau output falls between 0.0 to 1.0
- return (tau + 1) / 2
- else: # Otherwise, the tau outputs falls between -1.0 to +1.0
+ tau = 2 * num_increasing_pairs / num_possible_pairs -1
+ if normalize: # If normalized, the tau output falls between 0.0 and 1.0
+ return (tau + 1) /2
+ else: # Otherwise, the tau output falls between -1.0 and +1.0
return tau
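# Worked numbers for the doctest above (a sketch): the increasing runs
# (7, 8, 9, 10) and (0, 1, 2, 3, 4, 5) contribute choose(4, 2) + choose(6, 2)
# = 6 + 15 = 21 increasing pairs out of choose(11, 2) = 55 possible pairs,
# so tau = 2*21/55 - 1 = -13/55 ≈ -0.236, and normalised (tau + 1)/2
# = 21/55 ≈ 0.382.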
def spearman_rho(worder, normalize=True):
"""
- Calculates the Spearman's Rho correlation coefficient given the *worder*
+ Calculates the Spearman's Rho correlation coefficient given the *worder*
list of word alignment from word_rank_alignment(), using the formula:
-
- rho = 1 - sum(d**2) / choose(len(worder)+1, 3)
-
+
+ rho = 1 - sum(d**2) / choose(len(worder)+1, 3)
+
where d is the difference between each index in the *worder* list
and the corresponding original word index from the reference sentence.
-
+
Using the (H0,R0) and (H5, R5) example from the paper
-
+
>>> worder = [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
>>> round(spearman_rho(worder, normalize=False), 3)
-0.591
>>> round(spearman_rho(worder), 3)
0.205
-
+
:param worder: The worder list output from word_rank_alignment
:type worder: list(int)
"""
worder_len = len(worder)
- sum_d_square = sum((wi - i) ** 2 for wi, i in zip(worder, range(worder_len)))
- rho = 1 - sum_d_square / choose(worder_len + 1, 3)
-
- if normalize: # If normalized, the rho output falls between 0.0 to 1.0
- return (rho + 1) / 2
- else: # Otherwise, the rho outputs falls between -1.0 to +1.0
+ sum_d_square = sum((wi - i)**2 for wi, i in zip(worder, range(worder_len)))
+ rho = 1 - sum_d_square / choose(worder_len+1, 3)
+
+ if normalize: # If normalized, the rho output falls between 0.0 and 1.0
+ return (rho + 1) /2
+ else: # Otherwise, the rho output falls between -1.0 and +1.0
return rho
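# Worked numbers for the doctest above (a sketch): for
# worder = [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5] the displacements wi - i are
# [7, 7, 7, 7, 2, -5, -5, -5, -5, -5, -5], so sum(d**2) = 4*49 + 4 + 6*25
# = 350, rho = 1 - 350/choose(12, 3) = 1 - 350/220 ≈ -0.591, and normalised
# (rho + 1)/2 ≈ 0.205.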
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Stack decoder
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Tah Wei Hoon <hoon.tw@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
['nobody', 'expects', 'the', 'spanish', 'inquisition', '!']
"""
-
def __init__(self, phrase_table, language_model):
"""
:param phrase_table: Table of translations for source language
"""
sentence = tuple(src_sentence) # prevent accidental modification
sentence_length = len(sentence)
- stacks = [
- _Stack(self.stack_size, self.beam_threshold)
- for _ in range(0, sentence_length + 1)
- ]
+ stacks = [_Stack(self.stack_size, self.beam_threshold)
+ for _ in range(0, sentence_length + 1)]
empty_hypothesis = _Hypothesis()
stacks[0].push(empty_hypothesis)
future_score_table = self.compute_future_scores(sentence)
for stack in stacks:
for hypothesis in stack:
- possible_expansions = StackDecoder.valid_phrases(
- all_phrases, hypothesis
- )
+ possible_expansions = StackDecoder.valid_phrases(all_phrases,
+ hypothesis)
for src_phrase_span in possible_expansions:
- src_phrase = sentence[src_phrase_span[0] : src_phrase_span[1]]
- for translation_option in self.phrase_table.translations_for(
- src_phrase
- ):
+ src_phrase = sentence[src_phrase_span[0]:src_phrase_span[1]]
+ for translation_option in (self.phrase_table.
+ translations_for(src_phrase)):
raw_score = self.expansion_score(
- hypothesis, translation_option, src_phrase_span
- )
+ hypothesis, translation_option, src_phrase_span)
new_hypothesis = _Hypothesis(
raw_score=raw_score,
src_phrase_span=src_phrase_span,
trg_phrase=translation_option.trg_phrase,
- previous=hypothesis,
+ previous=hypothesis
)
new_hypothesis.future_score = self.future_score(
- new_hypothesis, future_score_table, sentence_length
- )
+ new_hypothesis, future_score_table, sentence_length)
total_words = new_hypothesis.total_translated_words()
stacks[total_words].push(new_hypothesis)
if not stacks[sentence_length]:
- warnings.warn(
- "Unable to translate all words. "
- "The source sentence contains words not in "
- "the phrase table"
- )
+ warnings.warn('Unable to translate all words. '
+ 'The source sentence contains words not in '
+ 'the phrase table')
# Instead of returning empty output, perhaps a partial
# translation could be returned
return []
subsequence covering positions 2, 3, and 4.
:rtype: dict(int: (dict(int): float))
"""
- scores = defaultdict(lambda: defaultdict(lambda: float("-inf")))
+ scores = defaultdict(lambda: defaultdict(lambda: float('-inf')))
for seq_length in range(1, len(src_sentence) + 1):
for start in range(0, len(src_sentence) - seq_length + 1):
end = start + seq_length
phrase = src_sentence[start:end]
if phrase in self.phrase_table:
- score = self.phrase_table.translations_for(phrase)[
- 0
- ].log_prob # pick best (first) translation
+ score = self.phrase_table.translations_for(
+ phrase)[0].log_prob # pick best (first) translation
# Warning: API of language_model is subject to change
score += self.language_model.probability(phrase)
scores[start][end] = score
# check if a better score can be obtained by combining
# two child subsequences
for mid in range(start + 1, end):
- combined_score = scores[start][mid] + scores[mid][end]
+ combined_score = (scores[start][mid] +
+ scores[mid][end])
if combined_score > scores[start][end]:
scores[start][end] = combined_score
return scores
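# The table above is a future-cost estimate in the style of phrase-based
# stack decoding (cf. Koehn 2010, ch. 6): for every span [start, end) keep
# the better of (a) translating the span as a single phrase and (b) splitting
# it at some mid point,
#
#     scores[start][end] = max(scores[start][end],
#                              scores[start][mid] + scores[mid][end])
#
# so the O(n^2) table is filled bottom-up by increasing span length.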
# The API of language_model is subject to change; it could accept
# a string, a list of words, and/or some other type
score += self.language_model.probability_change(
- hypothesis, translation_option.trg_phrase
- )
+ hypothesis, translation_option.trg_phrase)
score += self.distortion_score(hypothesis, src_phrase_span)
score -= self.word_penalty * len(translation_option.trg_phrase)
return score
cover untranslated positions.
:rtype: list(tuple(int, int))
"""
- untranslated_spans = hypothesis.untranslated_spans(len(all_phrases_from))
+ untranslated_spans = hypothesis.untranslated_spans(
+ len(all_phrases_from))
valid_phrases = []
for available_span in untranslated_spans:
start = available_span[0]
``src_phrase_span`` in the hypothesis chain. Similarly, the
translation output can be found by traversing up the chain.
"""
-
- def __init__(
- self,
- raw_score=0.0,
- src_phrase_span=(),
- trg_phrase=(),
- previous=None,
- future_score=0.0,
- ):
+ def __init__(self, raw_score=0.0, src_phrase_span=(), trg_phrase=(),
+ previous=None, future_score=0.0):
"""
:param raw_score: Likelihood of hypothesis so far.
Higher is better. Does not account for untranslated words.
current_hypothesis = self
while current_hypothesis.previous is not None:
translated_span = current_hypothesis.src_phrase_span
- translated_positions.extend(range(translated_span[0], translated_span[1]))
+ translated_positions.extend(range(translated_span[0],
+ translated_span[1]))
current_hypothesis = current_hypothesis.previous
return translated_positions
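# A minimal sketch of the chain walk described in the class docstring (the
# shipped class may expose this differently; `_translation_so_far` is a
# hypothetical helper):
#
#     def _translation_so_far(hypothesis):
#         phrases = []
#         while hypothesis.previous is not None:
#             phrases.append(hypothesis.trg_phrase)
#             hypothesis = hypothesis.previous
#         return [word for phrase in reversed(phrases) for word in phrase]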
"""
Collection of _Hypothesis objects
"""
-
def __init__(self, max_size=100, beam_threshold=0.0):
"""
:param beam_threshold: Hypotheses that score less than this
self.items = []
if beam_threshold == 0.0:
- self.__log_beam_threshold = float("-inf")
+ self.__log_beam_threshold = float('-inf')
else:
self.__log_beam_threshold = log(beam_threshold)
def __bool__(self):
return len(self.items) != 0
-
- __nonzero__ = __bool__
+ __nonzero__=__bool__
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Text Trees
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# Peter Ljunglöf <peter.ljunglof@gu.se>
Class for representing hierarchical language structures, such as
syntax trees and morphological trees.
"""
+from __future__ import print_function, unicode_literals
+from abc import ABCMeta, abstractmethod
+from six import add_metaclass
+
+# TODO: add LabelledTree (can be used for dependency trees)
import re
-import sys
-from abc import ABCMeta, abstractmethod
+from six import string_types
from nltk.grammar import Production, Nonterminal
from nltk.probability import ProbabilisticMixIn
from nltk.util import slice_bounds
+from nltk.compat import python_2_unicode_compatible, unicode_repr
from nltk.internals import raise_unorderable_types
-# TODO: add LabelledTree (can be used for dependency trees)
-
######################################################################
## Trees
######################################################################
-
-
+@python_2_unicode_compatible
class Tree(list):
"""
A Tree represents a hierarchical grouping of leaves and subtrees.
- ``Tree.fromstring(s)`` constructs a new tree by parsing the string ``s``.
"""
-
def __init__(self, node, children=None):
if children is None:
- raise TypeError(
- "%s: Expected a node value and child list " % type(self).__name__
- )
- elif isinstance(children, str):
- raise TypeError(
- "%s() argument 2 should be a list, not a "
- "string" % type(self).__name__
- )
+ raise TypeError("%s: Expected a node value and child list "
+ % type(self).__name__)
+ elif isinstance(children, string_types):
+ raise TypeError("%s() argument 2 should be a list, not a "
+ "string" % type(self).__name__)
else:
list.__init__(self, children)
self._label = node
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Comparison operators
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
def __eq__(self, other):
- return self.__class__ is other.__class__ and (self._label, list(self)) == (
- other._label,
- list(other),
- )
+ return (self.__class__ is other.__class__ and
+ (self._label, list(self)) == (other._label, list(other)))
def __lt__(self, other):
if not isinstance(other, Tree):
__le__ = lambda self, other: self < other or self == other
__ge__ = lambda self, other: not self < other
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Disabled list operations
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
def __mul__(self, v):
- raise TypeError("Tree does not support multiplication")
-
+ raise TypeError('Tree does not support multiplication')
def __rmul__(self, v):
- raise TypeError("Tree does not support multiplication")
-
+ raise TypeError('Tree does not support multiplication')
def __add__(self, v):
- raise TypeError("Tree does not support addition")
-
+ raise TypeError('Tree does not support addition')
def __radd__(self, v):
- raise TypeError("Tree does not support addition")
+ raise TypeError('Tree does not support addition')
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Indexing (with support for tree positions)
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
def __getitem__(self, index):
if isinstance(index, (int, slice)):
else:
return self[index[0]][index[1:]]
else:
- raise TypeError(
- "%s indices must be integers, not %s"
- % (type(self).__name__, type(index).__name__)
- )
+ raise TypeError("%s indices must be integers, not %s" %
+ (type(self).__name__, type(index).__name__))
def __setitem__(self, index, value):
if isinstance(index, (int, slice)):
return list.__setitem__(self, index, value)
elif isinstance(index, (list, tuple)):
if len(index) == 0:
- raise IndexError("The tree position () may not be " "assigned to.")
+ raise IndexError('The tree position () may not be '
+ 'assigned to.')
elif len(index) == 1:
self[index[0]] = value
else:
self[index[0]][index[1:]] = value
else:
- raise TypeError(
- "%s indices must be integers, not %s"
- % (type(self).__name__, type(index).__name__)
- )
+ raise TypeError("%s indices must be integers, not %s" %
+ (type(self).__name__, type(index).__name__))
def __delitem__(self, index):
if isinstance(index, (int, slice)):
return list.__delitem__(self, index)
elif isinstance(index, (list, tuple)):
if len(index) == 0:
- raise IndexError("The tree position () may not be deleted.")
+ raise IndexError('The tree position () may not be deleted.')
elif len(index) == 1:
del self[index[0]]
else:
del self[index[0]][index[1:]]
else:
- raise TypeError(
- "%s indices must be integers, not %s"
- % (type(self).__name__, type(index).__name__)
- )
+ raise TypeError("%s indices must be integers, not %s" %
+ (type(self).__name__, type(index).__name__))
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Basic tree operations
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
def _get_node(self):
"""Outdated method to access the node value; use the label() method instead."""
raise NotImplementedError("Use label() to access a node label.")
-
def _set_node(self, value):
"""Outdated method to set the node value; use the set_label() method instead."""
raise NotImplementedError("Use set_label() method to set a node label.")
-
node = property(_get_node, _set_node)
def label(self):
max_child_height = max(max_child_height, 1)
return 1 + max_child_height
- def treepositions(self, order="preorder"):
+ def treepositions(self, order='preorder'):
"""
>>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
>>> t.treepositions() # doctest: +ELLIPSIS
``leaves``.
"""
positions = []
- if order in ("preorder", "bothorder"):
- positions.append(())
+ if order in ('preorder', 'bothorder'): positions.append( () )
for i, child in enumerate(self):
if isinstance(child, Tree):
childpos = child.treepositions(order)
- positions.extend((i,) + p for p in childpos)
+ positions.extend((i,)+p for p in childpos)
else:
- positions.append((i,))
- if order in ("postorder", "bothorder"):
- positions.append(())
+ positions.append( (i,) )
+ if order in ('postorder', 'bothorder'): positions.append( () )
return positions
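# e.g. (a sketch, using the doctest tree above):
#
#     >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
#     >>> t.treepositions('leaves')
#     [(0, 0, 0), (0, 1, 0), (1, 0, 0), (1, 1, 0, 0), (1, 1, 1, 0)]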
def subtrees(self, filter=None):
:rtype: list(Production)
"""
- if not isinstance(self._label, str):
- raise TypeError(
- "Productions can only be generated from trees having node labels that are strings"
- )
+ if not isinstance(self._label, string_types):
+ raise TypeError('Productions can only be generated from trees having node labels that are strings')
prods = [Production(Nonterminal(self._label), _child_names(self))]
for child in self:
:raise IndexError: If this tree contains fewer than ``index+1``
leaves, or if ``index<0``.
"""
- if index < 0:
- raise IndexError("index must be non-negative")
+ if index < 0: raise IndexError('index must be non-negative')
stack = [(self, ())]
while stack:
value, treepos = stack.pop()
if not isinstance(value, Tree):
- if index == 0:
- return treepos
- else:
- index -= 1
+ if index == 0: return treepos
+ else: index -= 1
else:
- for i in range(len(value) - 1, -1, -1):
- stack.append((value[i], treepos + (i,)))
+ for i in range(len(value)-1, -1, -1):
+ stack.append( (value[i], treepos+(i,)) )
- raise IndexError("index must be less than or equal to len(self)")
+ raise IndexError('index must be less than or equal to len(self)')
def treeposition_spanning_leaves(self, start, end):
"""
:raise ValueError: if ``end <= start``
"""
if end <= start:
- raise ValueError("end must be greater than start")
+ raise ValueError('end must be greater than start')
# Find the tree positions of the start & end leaves, and
# take the longest common subsequence.
start_treepos = self.leaf_treeposition(start)
- end_treepos = self.leaf_treeposition(end - 1)
+ end_treepos = self.leaf_treeposition(end-1)
# Find the first index where they mismatch:
for i in range(len(start_treepos)):
if i == len(end_treepos) or start_treepos[i] != end_treepos[i]:
return start_treepos[:i]
return start_treepos
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Transforms
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
- def chomsky_normal_form(
- self,
- factor="right",
- horzMarkov=None,
- vertMarkov=0,
- childChar="|",
- parentChar="^",
- ):
+ def chomsky_normal_form(self, factor="right", horzMarkov=None, vertMarkov=0, childChar="|", parentChar="^"):
"""
This method can modify a tree in three ways:
:type parentChar: str
"""
from nltk.treetransforms import chomsky_normal_form
-
chomsky_normal_form(self, factor, horzMarkov, vertMarkov, childChar, parentChar)
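# A small before/after sketch (factor='right' is the default; with three
# children the siblings B and C are grouped under an artificial node):
#
#     >>> t = Tree.fromstring('(S (A a) (B b) (C c))')
#     >>> t.chomsky_normal_form()
#     >>> print(t)
#     (S (A a) (S|<B-C> (B b) (C c)))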
- def un_chomsky_normal_form(
- self, expandUnary=True, childChar="|", parentChar="^", unaryChar="+"
- ):
+ def un_chomsky_normal_form(self, expandUnary = True, childChar = "|", parentChar = "^", unaryChar = "+"):
"""
This method modifies the tree in three ways:
:type unaryChar: str
"""
from nltk.treetransforms import un_chomsky_normal_form
-
un_chomsky_normal_form(self, expandUnary, childChar, parentChar, unaryChar)
- def collapse_unary(self, collapsePOS=False, collapseRoot=False, joinChar="+"):
+ def collapse_unary(self, collapsePOS = False, collapseRoot = False, joinChar = "+"):
"""
Collapse subtrees with a single child (ie. unary productions)
into a new non-terminal (Tree node) joined by 'joinChar'.
:type joinChar: str
"""
from nltk.treetransforms import collapse_unary
-
collapse_unary(self, collapsePOS, collapseRoot, joinChar)
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Convert, copy
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
@classmethod
def convert(cls, tree):
else:
return tree
- def __copy__(self):
- return self.copy()
-
- def __deepcopy__(self, memo):
- return self.copy(deep=True)
-
def copy(self, deep=False):
- if not deep:
- return type(self)(self._label, self)
- else:
- return type(self).convert(self)
-
- def _frozen_class(self):
- return ImmutableTree
+ if not deep: return type(self)(self._label, self)
+ else: return type(self).convert(self)
+ def _frozen_class(self): return ImmutableTree
def freeze(self, leaf_freezer=None):
frozen_class = self._frozen_class()
if leaf_freezer is None:
newcopy = frozen_class.convert(self)
else:
newcopy = self.copy(deep=True)
- for pos in newcopy.treepositions("leaves"):
+ for pos in newcopy.treepositions('leaves'):
newcopy[pos] = leaf_freezer(newcopy[pos])
newcopy = frozen_class.convert(newcopy)
- hash(newcopy) # Make sure the leaves are hashable.
+ hash(newcopy) # Make sure the leaves are hashable.
return newcopy
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Parsing
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
@classmethod
- def fromstring(
- cls,
- s,
- brackets="()",
- read_node=None,
- read_leaf=None,
- node_pattern=None,
- leaf_pattern=None,
- remove_empty_top_bracketing=False,
- ):
+ def fromstring(cls, s, brackets='()', read_node=None, read_leaf=None,
+ node_pattern=None, leaf_pattern=None,
+ remove_empty_top_bracketing=False):
"""
Read a bracketed tree string and return the resulting tree.
Trees are represented as nested bracketings, such as::
then it will return a tree of that type.
:rtype: Tree
"""
- if not isinstance(brackets, str) or len(brackets) != 2:
- raise TypeError("brackets must be a length-2 string")
- if re.search("\s", brackets):
- raise TypeError("whitespace brackets not allowed")
+ if not isinstance(brackets, string_types) or len(brackets) != 2:
+ raise TypeError('brackets must be a length-2 string')
+ if re.search('\s', brackets):
+ raise TypeError('whitespace brackets not allowed')
# Construct a regexp that will tokenize the string.
open_b, close_b = brackets
open_pattern, close_pattern = (re.escape(open_b), re.escape(close_b))
if node_pattern is None:
- node_pattern = "[^\s%s%s]+" % (open_pattern, close_pattern)
+ node_pattern = '[^\s%s%s]+' % (open_pattern, close_pattern)
if leaf_pattern is None:
- leaf_pattern = "[^\s%s%s]+" % (open_pattern, close_pattern)
- token_re = re.compile(
- "%s\s*(%s)?|%s|(%s)"
- % (open_pattern, node_pattern, close_pattern, leaf_pattern)
- )
+ leaf_pattern = '[^\s%s%s]+' % (open_pattern, close_pattern)
+ token_re = re.compile('%s\s*(%s)?|%s|(%s)' % (
+ open_pattern, node_pattern, close_pattern, leaf_pattern))
# Walk through each token, updating a stack of trees.
- stack = [(None, [])] # list of (node, children) tuples
+ stack = [(None, [])] # list of (node, children) tuples
for match in token_re.finditer(s):
token = match.group()
# Beginning of a tree/subtree
if token[0] == open_b:
if len(stack) == 1 and len(stack[0][1]) > 0:
- cls._parse_error(s, match, "end-of-string")
+ cls._parse_error(s, match, 'end-of-string')
label = token[1:].lstrip()
- if read_node is not None:
- label = read_node(label)
+ if read_node is not None: label = read_node(label)
stack.append((label, []))
# End of a tree/subtree
elif token == close_b:
if len(stack[0][1]) == 0:
cls._parse_error(s, match, open_b)
else:
- cls._parse_error(s, match, "end-of-string")
+ cls._parse_error(s, match, 'end-of-string')
label, children = stack.pop()
stack[-1][1].append(cls(label, children))
# Leaf node
else:
if len(stack) == 1:
cls._parse_error(s, match, open_b)
- if read_leaf is not None:
- token = read_leaf(token)
+ if read_leaf is not None: token = read_leaf(token)
stack[-1][1].append(token)
# check that we got exactly one complete tree.
if len(stack) > 1:
- cls._parse_error(s, "end-of-string", close_b)
+ cls._parse_error(s, 'end-of-string', close_b)
elif len(stack[0][1]) == 0:
- cls._parse_error(s, "end-of-string", open_b)
+ cls._parse_error(s, 'end-of-string', open_b)
else:
assert stack[0][0] is None
assert len(stack[0][1]) == 1
# If the tree has an extra level with node='', then get rid of
# it. E.g.: "((S (NP ...) (VP ...)))"
- if remove_empty_top_bracketing and tree._label == "" and len(tree) == 1:
+ if remove_empty_top_bracketing and tree._label == '' and len(tree) == 1:
tree = tree[0]
# return the tree.
return tree
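# Minimal usage sketch of the parser above:
#
#     >>> t = Tree.fromstring('(S (NP I) (VP (V saw) (NP him)))')
#     >>> t.label(), t.leaves()
#     ('S', ['I', 'saw', 'him'])
#     >>> t[1][0]  # the V subtree inside the VP
#     Tree('V', ['saw'])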
:param expecting: what we expected to see instead.
"""
# Construct a basic error message
- if match == "end-of-string":
- pos, token = len(s), "end-of-string"
+ if match == 'end-of-string':
+ pos, token = len(s), 'end-of-string'
else:
pos, token = match.start(), match.group()
- msg = "%s.read(): expected %r but got %r\n%sat index %d." % (
- cls.__name__,
- expecting,
- token,
- " " * 12,
- pos,
- )
+ msg = '%s.read(): expected %r but got %r\n%sat index %d.' % (
+ cls.__name__, expecting, token, ' '*12, pos)
# Add a display showing the error token itself:
- s = s.replace("\n", " ").replace("\t", " ")
+ s = s.replace('\n', ' ').replace('\t', ' ')
offset = pos
- if len(s) > pos + 10:
- s = s[: pos + 10] + "..."
+ if len(s) > pos+10:
+ s = s[:pos+10]+'...'
if pos > 10:
- s = "..." + s[pos - 10 :]
+ s = '...'+s[pos-10:]
offset = 13
- msg += '\n%s"%s"\n%s^' % (" " * 16, s, " " * (17 + offset))
+ msg += '\n%s"%s"\n%s^' % (' '*16, s, ' '*(17+offset))
raise ValueError(msg)
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Visualization & String Representation
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
def draw(self):
"""
Open a new window containing a graphical diagram of this tree.
"""
from nltk.draw.tree import draw_trees
-
draw_trees(self)
def pretty_print(self, sentence=None, highlight=(), stream=None, **kwargs):
`nltk.treeprettyprinter.TreePrettyPrinter`.
"""
from nltk.treeprettyprinter import TreePrettyPrinter
-
- print(TreePrettyPrinter(self, sentence, highlight).text(**kwargs), file=stream)
+ print(TreePrettyPrinter(self, sentence, highlight).text(**kwargs),
+ file=stream)
def __repr__(self):
- childstr = ", ".join(repr(c) for c in self)
- return "%s(%s, [%s])" % (
- type(self).__name__,
- repr(self._label),
- childstr,
- )
+ childstr = ", ".join(unicode_repr(c) for c in self)
+ return '%s(%s, [%s])' % (type(self).__name__, unicode_repr(self._label), childstr)
def _repr_png_(self):
"""
from nltk.draw.tree import tree_to_treesegment
from nltk.draw.util import CanvasFrame
from nltk.internals import find_binary
-
_canvas_frame = CanvasFrame()
widget = tree_to_treesegment(_canvas_frame.canvas(), self)
_canvas_frame.add_widget(widget)
x, y, w, h = widget.bbox()
# print_to_file uses scrollregion to set the width and height of the pdf.
- _canvas_frame.canvas()["scrollregion"] = (0, 0, w, h)
+ _canvas_frame.canvas()['scrollregion'] = (0, 0, w, h)
with tempfile.NamedTemporaryFile() as file:
- in_path = "{0:}.ps".format(file.name)
- out_path = "{0:}.png".format(file.name)
+ in_path = '{0:}.ps'.format(file.name)
+ out_path = '{0:}.png'.format(file.name)
_canvas_frame.print_to_file(in_path)
_canvas_frame.destroy_widget(widget)
- try:
- subprocess.call(
- [
- find_binary(
- "gs",
- binary_names=["gswin32c.exe", "gswin64c.exe"],
- env_vars=["PATH"],
- verbose=False,
- )
- ]
- + "-q -dEPSCrop -sDEVICE=png16m -r90 -dTextAlphaBits=4 -dGraphicsAlphaBits=4 -dSAFER -dBATCH -dNOPAUSE -sOutputFile={0:} {1:}".format(
- out_path, in_path
- ).split()
- )
- except LookupError:
- pre_error_message = str(
- "The Ghostscript executable isn't found.\n"
- "See http://web.mit.edu/ghostscript/www/Install.htm\n"
- "If you're using a Mac, you can try installing\n"
- "https://docs.brew.sh/Installation then `brew install ghostscript`"
- )
- print(pre_error_message, file=sys.stderr)
- raise LookupError
-
- with open(out_path, "rb") as sr:
+ subprocess.call([find_binary('gs', binary_names=['gswin32c.exe', 'gswin64c.exe'], env_vars=['PATH'], verbose=False)] +
+ '-q -dEPSCrop -sDEVICE=png16m -r90 -dTextAlphaBits=4 -dGraphicsAlphaBits=4 -dSAFER -dBATCH -dNOPAUSE -sOutputFile={0:} {1:}'
+ .format(out_path, in_path).split())
+ with open(out_path, 'rb') as sr:
res = sr.read()
os.remove(in_path)
os.remove(out_path)
stream = None
print(self.pformat(**kwargs), file=stream)
- def pformat(self, margin=70, indent=0, nodesep="", parens="()", quotes=False):
+ def pformat(self, margin=70, indent=0, nodesep='', parens='()', quotes=False):
"""
:return: A pretty-printed string representation of this tree.
:rtype: str
return s
# If it doesn't fit on one line, then write it on multi-lines.
- if isinstance(self._label, str):
- s = "%s%s%s" % (parens[0], self._label, nodesep)
+ if isinstance(self._label, string_types):
+ s = '%s%s%s' % (parens[0], self._label, nodesep)
else:
- s = "%s%s%s" % (parens[0], repr(self._label), nodesep)
+ s = '%s%s%s' % (parens[0], unicode_repr(self._label), nodesep)
for child in self:
if isinstance(child, Tree):
- s += (
- "\n"
- + " " * (indent + 2)
- + child.pformat(margin, indent + 2, nodesep, parens, quotes)
- )
+ s += '\n'+' '*(indent+2)+child.pformat(margin, indent+2,
+ nodesep, parens, quotes)
elif isinstance(child, tuple):
- s += "\n" + " " * (indent + 2) + "/".join(child)
- elif isinstance(child, str) and not quotes:
- s += "\n" + " " * (indent + 2) + "%s" % child
+ s += '\n'+' '*(indent+2)+ "/".join(child)
+ elif isinstance(child, string_types) and not quotes:
+ s += '\n'+' '*(indent+2)+ '%s' % child
else:
- s += "\n" + " " * (indent + 2) + repr(child)
- return s + parens[1]
+ s += '\n'+' '*(indent+2)+ unicode_repr(child)
+ return s+parens[1]
def pformat_latex_qtree(self):
r"""
:return: A latex qtree representation of this tree.
:rtype: str
"""
- reserved_chars = re.compile("([#\$%&~_\{\}])")
+ reserved_chars = re.compile('([#\$%&~_\{\}])')
- pformat = self.pformat(indent=6, nodesep="", parens=("[.", " ]"))
- return r"\Tree " + re.sub(reserved_chars, r"\\\1", pformat)
+ pformat = self.pformat(indent=6, nodesep='', parens=('[.', ' ]'))
+ return r'\Tree ' + re.sub(reserved_chars, r'\\\1', pformat)
def _pformat_flat(self, nodesep, parens, quotes):
childstrs = []
childstrs.append(child._pformat_flat(nodesep, parens, quotes))
elif isinstance(child, tuple):
childstrs.append("/".join(child))
- elif isinstance(child, str) and not quotes:
- childstrs.append("%s" % child)
+ elif isinstance(child, string_types) and not quotes:
+ childstrs.append('%s' % child)
else:
- childstrs.append(repr(child))
- if isinstance(self._label, str):
- return "%s%s%s %s%s" % (
- parens[0],
- self._label,
- nodesep,
- " ".join(childstrs),
- parens[1],
- )
+ childstrs.append(unicode_repr(child))
+ if isinstance(self._label, string_types):
+ return '%s%s%s %s%s' % (parens[0], self._label, nodesep,
+ " ".join(childstrs), parens[1])
else:
- return "%s%s%s %s%s" % (
- parens[0],
- repr(self._label),
- nodesep,
- " ".join(childstrs),
- parens[1],
- )
+ return '%s%s%s %s%s' % (parens[0], unicode_repr(self._label), nodesep,
+ " ".join(childstrs), parens[1])
class ImmutableTree(Tree):
try:
self._hash = hash((self._label, tuple(self)))
except (TypeError, ValueError):
- raise ValueError(
- "%s: node value and children " "must be immutable" % type(self).__name__
- )
+ raise ValueError("%s: node value and children "
+ "must be immutable" % type(self).__name__)
def __setitem__(self, index, value):
- raise ValueError("%s may not be modified" % type(self).__name__)
-
+ raise ValueError('%s may not be modified' % type(self).__name__)
def __setslice__(self, i, j, value):
- raise ValueError("%s may not be modified" % type(self).__name__)
-
+ raise ValueError('%s may not be modified' % type(self).__name__)
def __delitem__(self, index):
- raise ValueError("%s may not be modified" % type(self).__name__)
-
+ raise ValueError('%s may not be modified' % type(self).__name__)
def __delslice__(self, i, j):
- raise ValueError("%s may not be modified" % type(self).__name__)
-
+ raise ValueError('%s may not be modified' % type(self).__name__)
def __iadd__(self, other):
- raise ValueError("%s may not be modified" % type(self).__name__)
-
+ raise ValueError('%s may not be modified' % type(self).__name__)
def __imul__(self, other):
- raise ValueError("%s may not be modified" % type(self).__name__)
-
+ raise ValueError('%s may not be modified' % type(self).__name__)
def append(self, v):
- raise ValueError("%s may not be modified" % type(self).__name__)
-
+ raise ValueError('%s may not be modified' % type(self).__name__)
def extend(self, v):
- raise ValueError("%s may not be modified" % type(self).__name__)
-
+ raise ValueError('%s may not be modified' % type(self).__name__)
def pop(self, v=None):
- raise ValueError("%s may not be modified" % type(self).__name__)
-
+ raise ValueError('%s may not be modified' % type(self).__name__)
def remove(self, v):
- raise ValueError("%s may not be modified" % type(self).__name__)
-
+ raise ValueError('%s may not be modified' % type(self).__name__)
def reverse(self):
- raise ValueError("%s may not be modified" % type(self).__name__)
-
+ raise ValueError('%s may not be modified' % type(self).__name__)
def sort(self):
- raise ValueError("%s may not be modified" % type(self).__name__)
-
+ raise ValueError('%s may not be modified' % type(self).__name__)
def __hash__(self):
return self._hash
Set the node label. This will only succeed the first time the
node label is set, which should occur in ImmutableTree.__init__().
"""
- if hasattr(self, "_label"):
- raise ValueError("%s may not be modified" % type(self).__name__)
+ if hasattr(self, '_label'):
+ raise ValueError('%s may not be modified' % type(self).__name__)
self._label = value
######################################################################
## Parented trees
######################################################################
-class AbstractParentedTree(Tree, metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class AbstractParentedTree(Tree):
"""
An abstract base class for a ``Tree`` that automatically maintains
pointers to parent nodes. These parent pointers are updated
if isinstance(child, Tree):
self._setparent(child, i)
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Parent management
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
@abstractmethod
def _setparent(self, child, index, dry_run=False):
"""
:param index: The index of ``child`` in ``self``.
"""
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Methods that add/remove children
- # ////////////////////////////////////////////////////////////
+ #////////////////////////////////////////////////////////////
# Every method that adds or removes a child must make
# appropriate calls to _setparent() and _delparent().
# del ptree[i]
elif isinstance(index, int):
- if index < 0:
- index += len(self)
- if index < 0:
- raise IndexError("index out of range")
+ if index < 0: index += len(self)
+ if index < 0: raise IndexError('index out of range')
# Clear the child's parent pointer.
if isinstance(self[index], Tree):
self._delparent(self[index], index)
elif isinstance(index, (list, tuple)):
# del ptree[()]
if len(index) == 0:
- raise IndexError("The tree position () may not be deleted.")
+ raise IndexError('The tree position () may not be deleted.')
# del ptree[(i,)]
elif len(index) == 1:
del self[index[0]]
del self[index[0]][index[1:]]
else:
- raise TypeError(
- "%s indices must be integers, not %s"
- % (type(self).__name__, type(index).__name__)
- )
+ raise TypeError("%s indices must be integers, not %s" %
+ (type(self).__name__, type(index).__name__))
def __setitem__(self, index, value):
# ptree[start:stop] = value
# up in an inconsistent state if an error does occur.
for i, child in enumerate(value):
if isinstance(child, Tree):
- self._setparent(child, start + i * step, dry_run=True)
+ self._setparent(child, start + i*step, dry_run=True)
# clear the child pointers of all parents we're removing
for i in range(start, stop, step):
if isinstance(self[i], Tree):
# reversing the elements in a tree.
for i, child in enumerate(value):
if isinstance(child, Tree):
- self._setparent(child, start + i * step)
+ self._setparent(child, start + i*step)
# finally, update the content of the child list itself.
super(AbstractParentedTree, self).__setitem__(index, value)
# ptree[i] = value
elif isinstance(index, int):
- if index < 0:
- index += len(self)
- if index < 0:
- raise IndexError("index out of range")
+ if index < 0: index += len(self)
+ if index < 0: raise IndexError('index out of range')
# if the value is not changing, do nothing.
if value is self[index]:
return
elif isinstance(index, (list, tuple)):
# ptree[()] = value
if len(index) == 0:
- raise IndexError("The tree position () may not be assigned to.")
+ raise IndexError('The tree position () may not be assigned to.')
# ptree[(i,)] = value
elif len(index) == 1:
self[index[0]] = value
self[index[0]][index[1:]] = value
else:
- raise TypeError(
- "%s indices must be integers, not %s"
- % (type(self).__name__, type(index).__name__)
- )
+ raise TypeError("%s indices must be integers, not %s" %
+ (type(self).__name__, type(index).__name__))
def append(self, child):
if isinstance(child, Tree):
# Handle negative indexes. Note that if index < -len(self),
# we do *not* raise an IndexError, unlike __getitem__. This
# is done for consistency with list.__getitem__ and list.index.
- if index < 0:
- index += len(self)
- if index < 0:
- index = 0
+ if index < 0: index += len(self)
+ if index < 0: index = 0
# Set the child's parent, and update our child list.
if isinstance(child, Tree):
self._setparent(child, index)
super(AbstractParentedTree, self).insert(index, child)
def pop(self, index=-1):
- if index < 0:
- index += len(self)
- if index < 0:
- raise IndexError("index out of range")
+ if index < 0: index += len(self)
+ if index < 0: raise IndexError('index out of range')
if isinstance(self[index], Tree):
self._delparent(self[index], index)
return super(AbstractParentedTree, self).pop(index)
# __getitem__ etc., but use max(0, start) and max(0, stop) because
# negative indices are already handled *before*
# __getslice__ is called; and we don't want to double-count them.
- if hasattr(list, "__getslice__"):
-
+ if hasattr(list, '__getslice__'):
def __getslice__(self, start, stop):
return self.__getitem__(slice(max(0, start), max(0, stop)))
-
def __delslice__(self, start, stop):
return self.__delitem__(slice(max(0, start), max(0, stop)))
-
def __setslice__(self, start, stop, value):
return self.__setitem__(slice(max(0, start), max(0, stop)), value)
-
class ParentedTree(AbstractParentedTree):
"""
A ``Tree`` that automatically maintains parent pointers for
or ``MultiParentedTrees``. Mixing tree implementations may result
in incorrect parent pointers and in ``TypeError`` exceptions.
"""
-
def __init__(self, node, children=None):
self._parent = None
"""The parent of this Tree, or None if it has no parent."""
child._parent = None
self._setparent(child, i)
- def _frozen_class(self):
- return ImmutableParentedTree
+ def _frozen_class(self): return ImmutableParentedTree
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# Methods
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
def parent(self):
"""The parent of this tree, or None if it has no parent."""
``ptree.parent.index(ptree)``, since the ``index()`` method
returns the first child that is equal to its argument.
"""
- if self._parent is None:
- return None
+ if self._parent is None: return None
for i, child in enumerate(self._parent):
- if child is self:
- return i
- assert False, "expected to find self in self._parent!"
+ if child is self: return i
+ assert False, 'expected to find self in self._parent!'
def left_sibling(self):
"""The left sibling of this tree, or None if it has none."""
parent_index = self.parent_index()
if self._parent and parent_index > 0:
- return self._parent[parent_index - 1]
- return None # no left sibling
+ return self._parent[parent_index-1]
+ return None # no left sibling
def right_sibling(self):
"""The right sibling of this tree, or None if it has none."""
parent_index = self.parent_index()
- if self._parent and parent_index < (len(self._parent) - 1):
- return self._parent[parent_index + 1]
- return None # no right sibling
+ if self._parent and parent_index < (len(self._parent)-1):
+ return self._parent[parent_index+1]
+ return None # no right sibling
def root(self):
"""
else:
return self.parent().treeposition() + (self.parent_index(),)
- # /////////////////////////////////////////////////////////////////
+
+ #/////////////////////////////////////////////////////////////////
# Parent Management
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
def _delparent(self, child, index):
# Sanity checks
def _setparent(self, child, index, dry_run=False):
# If the child's type is incorrect, then complain.
if not isinstance(child, ParentedTree):
- raise TypeError(
- "Can not insert a non-ParentedTree " + "into a ParentedTree"
- )
+ raise TypeError('Can not insert a non-ParentedTree '+
+ 'into a ParentedTree')
# If child already has a parent, then complain.
if child._parent is not None:
- raise ValueError("Can not insert a subtree that already " "has a parent.")
+ raise ValueError('Can not insert a subtree that already '
+ 'has a parent.')
# Set child's parent pointer & index.
if not dry_run:
``Trees`` or ``ParentedTrees``. Mixing tree implementations may
result in incorrect parent pointers and in ``TypeError`` exceptions.
"""
-
def __init__(self, node, children=None):
self._parents = []
"""A list of this tree's parents. This list should not
child._parents = []
self._setparent(child, i)
- def _frozen_class(self):
- return ImmutableMultiParentedTree
+ def _frozen_class(self): return ImmutableMultiParentedTree
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# Methods
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
def parents(self):
"""
:type: list(MultiParentedTree)
"""
- return [
- parent[index - 1]
- for (parent, index) in self._get_parent_indices()
- if index > 0
- ]
+ return [parent[index-1]
+ for (parent, index) in self._get_parent_indices()
+ if index > 0]
def right_siblings(self):
"""
:type: list(MultiParentedTree)
"""
- return [
- parent[index + 1]
- for (parent, index) in self._get_parent_indices()
- if index < (len(parent) - 1)
- ]
+ return [parent[index+1]
+ for (parent, index) in self._get_parent_indices()
+ if index < (len(parent)-1)]
def _get_parent_indices(self):
- return [
- (parent, index)
- for parent in self._parents
- for index, child in enumerate(parent)
- if child is self
- ]
+ return [(parent, index)
+ for parent in self._parents
+ for index, child in enumerate(parent)
+ if child is self]
def roots(self):
"""
for parent_index in ptree.parent_indices(parent):
parent[parent_index] is ptree
"""
- if parent not in self._parents:
- return []
- else:
- return [index for (index, child) in enumerate(parent) if child is self]
+ if parent not in self._parents: return []
+ else: return [index for (index, child) in enumerate(parent)
+ if child is self]
def treepositions(self, root):
"""
if self is root:
return [()]
else:
- return [
- treepos + (index,)
- for parent in self._parents
- for treepos in parent.treepositions(root)
- for (index, child) in enumerate(parent)
- if child is self
- ]
+ return [treepos+(index,)
+ for parent in self._parents
+ for treepos in parent.treepositions(root)
+ for (index, child) in enumerate(parent) if child is self]
+
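A short sketch of the multi-parent behaviour these methods expose (the trees are invented for illustration):

from nltk.tree import MultiParentedTree

shared = MultiParentedTree.fromstring('(NP (DT the) (NN dog))')
s1 = MultiParentedTree('S1', [shared])      # the same subtree may sit under
s2 = MultiParentedTree('S2', [shared])      # several parents at once
print(len(shared.parents()))                # 2
print([r.label() for r in shared.roots()])  # ['S1', 'S2'] (order not guaranteed)
print(shared.treepositions(s1))             # [(0,)]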
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
# Parent Management
- # /////////////////////////////////////////////////////////////////
+ #/////////////////////////////////////////////////////////////////
def _delparent(self, child, index):
# Sanity checks
# If the only copy of child in self is at index, then delete
# self from child's parent list.
for i, c in enumerate(self):
- if c is child and i != index:
- break
+ if c is child and i != index: break
else:
child._parents.remove(self)
def _setparent(self, child, index, dry_run=False):
# If the child's type is incorrect, then complain.
if not isinstance(child, MultiParentedTree):
- raise TypeError(
- "Can not insert a non-MultiParentedTree " + "into a MultiParentedTree"
- )
+ raise TypeError('Can not insert a non-MultiParentedTree '+
+ 'into a MultiParentedTree')
# Add self as a parent pointer if it's not already listed.
if not dry_run:
for parent in child._parents:
- if parent is self:
- break
+ if parent is self: break
else:
child._parents.append(self)
-
class ImmutableParentedTree(ImmutableTree, ParentedTree):
pass
-
class ImmutableMultiParentedTree(ImmutableTree, MultiParentedTree):
pass
## Probabilistic trees
######################################################################
-
-
+@python_2_unicode_compatible
class ProbabilisticTree(Tree, ProbabilisticMixIn):
def __init__(self, node, children=None, **prob_kwargs):
Tree.__init__(self, node, children)
ProbabilisticMixIn.__init__(self, **prob_kwargs)
# We have to patch up these methods to make them work right:
- def _frozen_class(self):
- return ImmutableProbabilisticTree
-
+ def _frozen_class(self): return ImmutableProbabilisticTree
def __repr__(self):
- return "%s (p=%r)" % (Tree.__repr__(self), self.prob())
-
+ return '%s (p=%r)' % (Tree.unicode_repr(self), self.prob())
def __str__(self):
- return "%s (p=%.6g)" % (self.pformat(margin=60), self.prob())
-
+ return '%s (p=%.6g)' % (self.pformat(margin=60), self.prob())
def copy(self, deep=False):
- if not deep:
- return type(self)(self._label, self, prob=self.prob())
- else:
- return type(self).convert(self)
-
+ if not deep: return type(self)(self._label, self, prob=self.prob())
+ else: return type(self).convert(self)
@classmethod
def convert(cls, val):
if isinstance(val, Tree):
return val
def __eq__(self, other):
- return self.__class__ is other.__class__ and (
- self._label,
- list(self),
- self.prob(),
- ) == (other._label, list(other), other.prob())
+ return (self.__class__ is other.__class__ and
+ (self._label, list(self), self.prob()) ==
+ (other._label, list(other), other.prob()))
def __lt__(self, other):
if not isinstance(other, Tree):
raise_unorderable_types("<", self, other)
if self.__class__ is other.__class__:
- return (self._label, list(self), self.prob()) < (
- other._label,
- list(other),
- other.prob(),
- )
+ return ((self._label, list(self), self.prob()) <
+ (other._label, list(other), other.prob()))
else:
return self.__class__.__name__ < other.__class__.__name__
-
+@python_2_unicode_compatible
class ImmutableProbabilisticTree(ImmutableTree, ProbabilisticMixIn):
def __init__(self, node, children=None, **prob_kwargs):
ImmutableTree.__init__(self, node, children)
self._hash = hash((self._label, tuple(self), self.prob()))
# We have to patch up these methods to make them work right:
- def _frozen_class(self):
- return ImmutableProbabilisticTree
-
+ def _frozen_class(self): return ImmutableProbabilisticTree
def __repr__(self):
- return "%s [%s]" % (Tree.__repr__(self), self.prob())
-
+ return '%s [%s]' % (Tree.unicode_repr(self), self.prob())
def __str__(self):
- return "%s [%s]" % (self.pformat(margin=60), self.prob())
-
+ return '%s [%s]' % (self.pformat(margin=60), self.prob())
def copy(self, deep=False):
- if not deep:
- return type(self)(self._label, self, prob=self.prob())
- else:
- return type(self).convert(self)
-
+ if not deep: return type(self)(self._label, self, prob=self.prob())
+ else: return type(self).convert(self)
@classmethod
def convert(cls, val):
if isinstance(val, Tree):
names.append(child)
return names
-
######################################################################
## Parsing
######################################################################
-
def bracket_parse(s):
"""
Use Tree.read(s, remove_empty_top_bracketing=True) instead.
"""
raise NameError("Use Tree.read(s, remove_empty_top_bracketing=True) instead.")
-
def sinica_parse(s):
"""
Parse a Sinica Treebank string and return a tree. Trees are represented as nested bracketings,
:param s: The string to be converted
:type s: str
"""
- tokens = re.split(r"([()| ])", s)
+ tokens = re.split(r'([()| ])', s)
for i in range(len(tokens)):
- if tokens[i] == "(":
- tokens[i - 1], tokens[i] = (
- tokens[i],
- tokens[i - 1],
- ) # pull nonterminal inside parens
- elif ":" in tokens[i]:
- fields = tokens[i].split(":")
- if len(fields) == 2: # non-terminal
+ if tokens[i] == '(':
+ tokens[i-1], tokens[i] = tokens[i], tokens[i-1] # pull nonterminal inside parens
+ elif ':' in tokens[i]:
+ fields = tokens[i].split(':')
+ if len(fields) == 2: # non-terminal
tokens[i] = fields[1]
else:
tokens[i] = "(" + fields[-2] + " " + fields[-1] + ")"
- elif tokens[i] == "|":
- tokens[i] = ""
+ elif tokens[i] == '|':
+ tokens[i] = ''
treebank_string = " ".join(tokens)
return Tree.fromstring(treebank_string, remove_empty_top_bracketing=True)
-
# s = re.sub(r'^#[^\s]*\s', '', s) # remove leading identifier
# s = re.sub(r'\w+:', '', s) # remove role tags
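A sketch of the conversion; the input below is a hypothetical Sinica-style bracketing in 'role:tag:word' notation, not drawn from the corpus:

from nltk.tree import sinica_parse

# 'role:tag:word' triples become '(tag word)'; '|' separators are dropped
t = sinica_parse('S(NP(Head:Nhaa:我)|Head:VJ3:想)')
print(t)  # (S (NP (Nhaa 我)) (VJ3 想))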
## Demonstration
######################################################################
-
def demo():
"""
A demonstration showing how Trees can be used.
from nltk import Tree, ProbabilisticTree
# Demonstrate tree parsing.
- s = "(S (NP (DT the) (NN cat)) (VP (VBD ate) (NP (DT a) (NN cookie))))"
+ s = '(S (NP (DT the) (NN cat)) (VP (VBD ate) (NP (DT a) (NN cookie))))'
t = Tree.fromstring(s)
print("Convert bracketed string into tree:")
print(t)
print(t.__repr__())
print("Display tree properties:")
- print(t.label()) # tree's constituent type
- print(t[0]) # tree's first child
- print(t[1]) # tree's second child
+ print(t.label()) # tree's constituent type
+ print(t[0]) # tree's first child
+ print(t[1]) # tree's second child
print(t.height())
print(t.leaves())
print(t[1])
- print(t[1, 1])
- print(t[1, 1, 0])
+ print(t[1,1])
+ print(t[1,1,0])
# Demonstrate tree modification.
the_cat = t[0]
- the_cat.insert(1, Tree.fromstring("(JJ big)"))
+ the_cat.insert(1, Tree.fromstring('(JJ big)'))
print("Tree modification:")
print(t)
- t[1, 1, 1] = Tree.fromstring("(NN cake)")
+ t[1,1,1] = Tree.fromstring('(NN cake)')
print(t)
print()
print()
# Demonstrate probabilistic trees.
- pt = ProbabilisticTree("x", ["y", "z"], prob=0.5)
+ pt = ProbabilisticTree('x', ['y', 'z'], prob=0.5)
print("Probabilistic Tree:")
print(pt)
print()
print()
# Demonstrate tree nodes containing objects other than strings
- t.set_label(("test", 3))
+ t.set_label(('test', 3))
print(t)
-
-__all__ = [
- "ImmutableProbabilisticTree",
- "ImmutableTree",
- "ProbabilisticMixIn",
- "ProbabilisticTree",
- "Tree",
- "bracket_parse",
- "sinica_parse",
- "ParentedTree",
- "MultiParentedTree",
- "ImmutableParentedTree",
- "ImmutableMultiParentedTree",
-]
+__all__ = ['ImmutableProbabilisticTree', 'ImmutableTree', 'ProbabilisticMixIn',
+ 'ProbabilisticTree', 'Tree', 'bracket_parse',
+ 'sinica_parse', 'ParentedTree', 'MultiParentedTree',
+ 'ImmutableParentedTree', 'ImmutableMultiParentedTree']
# -*- coding: utf-8 -*-
# Natural Language Toolkit: ASCII visualization of NLTK trees
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Andreas van Cranenburgh <A.W.vanCranenburgh@uva.nl>
# Peter Ljunglöf <peter.ljunglof@gu.se>
# URL: <http://nltk.org/>
http://jgaa.info/accepted/2006/EschbachGuentherBecker2006.10.2.pdf
"""
+from __future__ import division, print_function, unicode_literals
+
+from nltk.util import slice_bounds, OrderedDict
+from nltk.compat import python_2_unicode_compatible, unicode_repr
+from nltk.internals import raise_unorderable_types
+from nltk.tree import Tree
+
import re
-try:
- from html import escape
-except ImportError:
- from cgi import escape
+import sys
+import codecs
+from cgi import escape
from collections import defaultdict
from operator import itemgetter
+from itertools import chain, islice
-from nltk.util import OrderedDict
-from nltk.tree import Tree
ANSICOLOR = {
- "black": 30,
- "red": 31,
- "green": 32,
- "yellow": 33,
- "blue": 34,
- "magenta": 35,
- "cyan": 36,
- "white": 37,
+ 'black': 30,
+ 'red': 31,
+ 'green': 32,
+ 'yellow': 33,
+ 'blue': 34,
+ 'magenta': 35,
+ 'cyan': 36,
+ 'white': 37,
}
+@python_2_unicode_compatible
class TreePrettyPrinter(object):
"""
Pretty-print a tree in text format, either as ASCII or Unicode.
def __init__(self, tree, sentence=None, highlight=()):
if sentence is None:
leaves = tree.leaves()
- if (
- leaves
- and not any(len(a) == 0 for a in tree.subtrees())
- and all(isinstance(a, int) for a in leaves)
- ):
+ if (leaves and not any(len(a) == 0 for a in tree.subtrees())
+ and all(isinstance(a, int) for a in leaves)):
sentence = [str(a) for a in leaves]
else:
# this deals with empty nodes (frontier non-terminals)
for n, b in enumerate(a):
if not isinstance(b, Tree):
a[n] = len(sentence)
- if type(b) == tuple:
- b = "/".join(b)
- sentence.append("%s" % b)
+ sentence.append('%s' % b)
self.nodes, self.coords, self.edges, self.highlight = self.nodecoords(
- tree, sentence, highlight
- )
+ tree, sentence, highlight)
def __str__(self):
return self.text()
def __repr__(self):
- return "<TreePrettyPrinter with %d nodes>" % len(self.nodes)
+ return '<TreePrettyPrinter with %d nodes>' % len(self.nodes)
+
@staticmethod
def nodecoords(tree, sentence, highlight):
- edges[id]: parent id of node with this id (ordered dictionary)
- highlighted: set of ids that should be highlighted
"""
-
def findcell(m, matrix, startoflevel, children):
"""
Find vacant row, column index for node ``m``.
startoflevel = len(matrix)
for rowidx in range(startoflevel, len(matrix) + 1):
if rowidx == len(matrix): # need to add a new row
- matrix.append(
- [
- vertline if a not in (corner, None) else None
- for a in matrix[-1]
- ]
- )
+ matrix.append([vertline if a not in (corner, None)
+ else None for a in matrix[-1]])
row = matrix[rowidx]
i = j = center
if len(children[m]) == 1: # place unaries directly above child
return rowidx, next(iter(children[m]))[1]
- elif all(
- a is None or a == vertline
- for a in row[min(candidates) : max(candidates) + 1]
- ):
+ elif all(a is None or a == vertline for a
+ in row[min(candidates):max(candidates) + 1]):
# find free column
for n in range(scale):
i = j = center + n
while j > minidx or i < maxidx:
- if i < maxidx and (
- matrix[rowidx][i] is None or i in candidates
- ):
+ if i < maxidx and (matrix[rowidx][i] is None
+ or i in candidates):
return rowidx, i
- elif j > minidx and (
- matrix[rowidx][j] is None or j in candidates
- ):
+ elif j > minidx and (matrix[rowidx][j] is None
+ or j in candidates):
return rowidx, j
i += scale
j -= scale
- raise ValueError(
- "could not find a free cell for:\n%s\n%s"
- "min=%d; max=%d" % (tree[m], dumpmatrix(), minidx, maxidx)
- )
+ raise ValueError('could not find a free cell for:\n%s\n%s'
+ 'min=%d; max=%d' % (tree[m], dumpmatrix(), minidx, maxidx))
def dumpmatrix():
"""Dump matrix contents for debugging purposes."""
- return "\n".join(
- "%2d: %s" % (n, " ".join(("%2r" % i)[:2] for i in row))
- for n, row in enumerate(matrix)
- )
+ return '\n'.join(
+ '%2d: %s' % (n, ' '.join(('%2r' % i)[:2] for i in row))
+ for n, row in enumerate(matrix))
leaves = tree.leaves()
if not all(isinstance(n, int) for n in leaves):
- raise ValueError("All leaves must be integer indices.")
+ raise ValueError('All leaves must be integer indices.')
if len(leaves) != len(set(leaves)):
- raise ValueError("Indices must occur at most once.")
+ raise ValueError('Indices must occur at most once.')
if not all(0 <= n < len(sentence) for n in leaves):
- raise ValueError(
- "All leaves must be in the interval 0..n "
- "with n=len(sentence)\ntokens: %d indices: "
- "%r\nsentence: %s" % (len(sentence), tree.leaves(), sentence)
- )
+ raise ValueError('All leaves must be in the interval 0..n '
+ 'with n=len(sentence)\ntokens: %d indices: '
+ '%r\nsentence: %s' % (len(sentence), tree.leaves(), sentence))
vertline, corner = -1, -2 # constants
tree = tree.copy(True)
for a in tree.subtrees():
matrix = [[None] * (len(sentence) * scale)]
nodes = {}
ids = dict((a, n) for n, a in enumerate(positions))
- highlighted_nodes = set(
- n for a, n in ids.items() if not highlight or tree[a] in highlight
- )
+ highlighted_nodes = set(n for a, n in ids.items()
+ if not highlight or tree[a] in highlight)
levels = dict((n, []) for n in range(maxdepth - 1))
terminals = []
for a in positions:
terminals.append(a)
for n in levels:
- levels[n].sort(key=lambda n: max(tree[n].leaves()) - min(tree[n].leaves()))
+ levels[n].sort(key=lambda n: max(tree[n].leaves())
+ - min(tree[n].leaves()))
terminals.sort()
positions = set(positions)
matrix[0][i] = ids[m]
nodes[ids[m]] = sentence[tree[m]]
if nodes[ids[m]] is None:
- nodes[ids[m]] = "..."
+ nodes[ids[m]] = '...'
highlighted_nodes.discard(ids[m])
positions.remove(m)
childcols[m[:-1]].add((0, i))
for n in sorted(levels, reverse=True):
nodesatdepth = levels[n]
startoflevel = len(matrix)
- matrix.append(
- [vertline if a not in (corner, None) else None for a in matrix[-1]]
- )
+ matrix.append([vertline if a not in (corner, None) else None
+ for a in matrix[-1]])
for m in nodesatdepth: # [::-1]:
if n < maxdepth - 1 and childcols[m]:
_, pivot = min(childcols[m], key=itemgetter(1))
- if set(
- a[:-1]
- for row in matrix[:-1]
- for a in row[:pivot]
- if isinstance(a, tuple)
- ) & set(
- a[:-1]
- for row in matrix[:-1]
- for a in row[pivot:]
- if isinstance(a, tuple)
- ):
+ if (set(a[:-1] for row in matrix[:-1] for a in row[:pivot]
+ if isinstance(a, tuple)) &
+ set(a[:-1] for row in matrix[:-1] for a in row[pivot:]
+ if isinstance(a, tuple))):
crossed.add(m)
rowidx, i = findcell(m, matrix, startoflevel, childcols)
# remove unused columns, right to left
for m in range(scale * len(sentence) - 1, -1, -1):
- if not any(isinstance(row[m], (Tree, int)) for row in matrix):
+ if not any(isinstance(row[m], (Tree, int))
+ for row in matrix):
for row in matrix:
del row[m]
# remove unused rows, reverse
- matrix = [
- row
- for row in reversed(matrix)
- if not all(a is None or a == vertline for a in row)
- ]
+ matrix = [row for row in reversed(matrix)
+ if not all(a is None or a == vertline for a in row)]
# collect coordinates of nodes
coords = {}
coords[i] = n, m
# move crossed edges last
- positions = sorted(
- [a for level in levels.values() for a in level],
- key=lambda a: a[:-1] in crossed,
- )
+ positions = sorted([a for level in levels.values()
+ for a in level], key=lambda a: a[:-1] in crossed)
# collect edges from node to node
edges = OrderedDict()
for i in reversed(positions):
for j, _ in enumerate(tree[i]):
- edges[ids[i + (j,)]] = ids[i]
+ edges[ids[i + (j, )]] = ids[i]
return nodes, coords, edges, highlighted_nodes
- def text(
- self,
- nodedist=1,
- unicodelines=False,
- html=False,
- ansi=False,
- nodecolor="blue",
- leafcolor="red",
- funccolor="green",
- abbreviate=None,
- maxwidth=16,
- ):
+
+ def text(self, nodedist=1, unicodelines=False, html=False, ansi=False,
+ nodecolor='blue', leafcolor='red', funccolor='green',
+ abbreviate=None, maxwidth=16):
"""
:return: ASCII art for a discontinuous tree.
if abbreviate == True:
abbreviate = 5
if unicodelines:
- horzline = "\u2500"
- leftcorner = "\u250c"
- rightcorner = "\u2510"
- vertline = " \u2502 "
- tee = horzline + "\u252C" + horzline
- bottom = horzline + "\u2534" + horzline
- cross = horzline + "\u253c" + horzline
- ellipsis = "\u2026"
+ horzline = '\u2500'
+ leftcorner = '\u250c'
+ rightcorner = '\u2510'
+ vertline = ' \u2502 '
+ tee = horzline + '\u252C' + horzline
+ bottom = horzline + '\u2534' + horzline
+ cross = horzline + '\u253c' + horzline
+ ellipsis = '\u2026'
else:
- horzline = "_"
- leftcorner = rightcorner = " "
- vertline = " | "
+ horzline = '_'
+ leftcorner = rightcorner = ' '
+ vertline = ' | '
tee = 3 * horzline
- cross = bottom = "_|_"
- ellipsis = "."
+ cross = bottom = '_|_'
+ ellipsis = '.'
def crosscell(cur, x=vertline):
"""Overwrite center of this cell with a vertical branch."""
splitl = len(cur) - len(cur) // 2 - len(x) // 2 - 1
lst = list(cur)
- lst[splitl : splitl + len(x)] = list(x)
- return "".join(lst)
+ lst[splitl:splitl + len(x)] = list(x)
+ return ''.join(lst)
result = []
matrix = defaultdict(dict)
maxchildcol = {}
childcols = defaultdict(set)
labels = {}
- wrapre = re.compile(
- "(.{%d,%d}\\b\\W*|.{%d})" % (maxwidth - 4, maxwidth, maxwidth)
- )
+ wrapre = re.compile('(.{%d,%d}\\b\\W*|.{%d})' % (
+ maxwidth - 4, maxwidth, maxwidth))
# collect labels and coordinates
for a in self.nodes:
row, column = self.coords[a]
matrix[row][column] = a
maxcol = max(maxcol, column)
- label = (
- self.nodes[a].label()
- if isinstance(self.nodes[a], Tree)
- else self.nodes[a]
- )
+ label = (self.nodes[a].label() if isinstance(self.nodes[a], Tree)
+ else self.nodes[a])
if abbreviate and len(label) > abbreviate:
label = label[:abbreviate] + ellipsis
if maxwidth and len(label) > maxwidth:
- label = wrapre.sub(r"\1\n", label).strip()
- label = label.split("\n")
+ label = wrapre.sub(r'\1\n', label).strip()
+ label = label.split('\n')
maxnodeheight[row] = max(maxnodeheight[row], len(label))
maxnodewith[column] = max(maxnodewith[column], max(map(len, label)))
labels[a] = label
maxchildcol[parent] = max(maxchildcol.get(parent, column), column)
# bottom up level order traversal
for row in sorted(matrix, reverse=True):
- noderows = [
- ["".center(maxnodewith[col]) for col in range(maxcol + 1)]
- for _ in range(maxnodeheight[row])
- ]
- branchrow = ["".center(maxnodewith[col]) for col in range(maxcol + 1)]
+ noderows = [[''.center(maxnodewith[col]) for col in range(maxcol + 1)]
+ for _ in range(maxnodeheight[row])]
+ branchrow = [''.center(maxnodewith[col]) for col in range(maxcol + 1)]
for col in matrix[row]:
n = matrix[row][col]
node = self.nodes[n]
if n in minchildcol and minchildcol[n] < maxchildcol[n]:
i, j = minchildcol[n], maxchildcol[n]
a, b = (maxnodewith[i] + 1) // 2 - 1, maxnodewith[j] // 2
- branchrow[i] = ((" " * a) + leftcorner).ljust(
- maxnodewith[i], horzline
- )
- branchrow[j] = (rightcorner + (" " * b)).rjust(
- maxnodewith[j], horzline
- )
+ branchrow[i] = ((' ' * a) + leftcorner).ljust(
+ maxnodewith[i], horzline)
+ branchrow[j] = (rightcorner + (' ' * b)).rjust(
+ maxnodewith[j], horzline)
for i in range(minchildcol[n] + 1, maxchildcol[n]):
- if i == col and any(a == i for _, a in childcols[n]):
+ if i == col and any(
+ a == i for _, a in childcols[n]):
line = cross
elif i == col:
line = bottom
branchrow[col] = crosscell(branchrow[col])
text = [a.center(maxnodewith[col]) for a in text]
color = nodecolor if isinstance(node, Tree) else leafcolor
- if isinstance(node, Tree) and node.label().startswith("-"):
+ if isinstance(node, Tree) and node.label().startswith('-'):
color = funccolor
if html:
- text = [escape(a, quote=False) for a in text]
+ text = [escape(a) for a in text]
if n in self.highlight:
- text = ["<font color=%s>%s</font>" % (color, a) for a in text]
+ text = ['<font color=%s>%s</font>' % (
+ color, a) for a in text]
elif ansi and n in self.highlight:
- text = ["\x1b[%d;1m%s\x1b[0m" % (ANSICOLOR[color], a) for a in text]
+ text = ['\x1b[%d;1m%s\x1b[0m' % (
+ ANSICOLOR[color], a) for a in text]
for x in range(maxnodeheight[row]):
# draw vertical lines in partially filled multiline node
# labels, but only if it's not a frontier node.
- noderows[x][col] = (
- text[x]
- if x < len(text)
- else (vertline if childcols[n] else " ").center(
- maxnodewith[col], " "
- )
- )
+ noderows[x][col] = (text[x] if x < len(text)
+ else (vertline if childcols[n] else ' ').center(
+ maxnodewith[col], ' '))
# for each column, if there is a node below us which has a parent
# above us, draw a vertical branch in that column.
if row != max(matrix):
for n, (childrow, col) in self.coords.items():
- if n > 0 and self.coords[self.edges[n]][0] < row < childrow:
+ if (n > 0 and
+ self.coords[self.edges[n]][0] < row < childrow):
branchrow[col] = crosscell(branchrow[col])
if col not in matrix[row]:
for noderow in noderows:
noderow[col] = crosscell(noderow[col])
- branchrow = [
- a + ((a[-1] if a[-1] != " " else b[0]) * nodedist)
- for a, b in zip(branchrow, branchrow[1:] + [" "])
- ]
- result.append("".join(branchrow))
- result.extend(
- (" " * nodedist).join(noderow) for noderow in reversed(noderows)
- )
- return "\n".join(reversed(result)) + "\n"
+ branchrow = [a + ((a[-1] if a[-1] != ' ' else b[0]) * nodedist)
+ for a, b in zip(branchrow, branchrow[1:] + [' '])]
+ result.append(''.join(branchrow))
+ result.extend((' ' * nodedist).join(noderow)
+ for noderow in reversed(noderows))
+ return '\n'.join(reversed(result)) + '\n'
+
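A quick usage sketch for the text renderer (tree string arbitrary):

from nltk.tree import Tree
from nltk.treeprettyprinter import TreePrettyPrinter

t = Tree.fromstring('(S (NP (DT the) (NN dog)) (VP (VBD barked)))')
print(TreePrettyPrinter(t).text(unicodelines=True, nodedist=2))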
- def svg(self, nodecolor="blue", leafcolor="red", funccolor="green"):
+ def svg(self, nodecolor='blue', leafcolor='red', funccolor='green'):
"""
:return: SVG representation of a tree.
"""
hstart = vstart = 20
width = max(col for _, col in self.coords.values())
height = max(row for row, _ in self.coords.values())
- result = [
- '<svg version="1.1" xmlns="http://www.w3.org/2000/svg" '
- 'width="%dem" height="%dem" viewBox="%d %d %d %d">'
- % (
- width * 3,
- height * 2.5,
- -hstart,
- -vstart,
- width * hscale + 3 * hstart,
- height * vscale + 3 * vstart,
- )
- ]
+ result = ['<svg version="1.1" xmlns="http://www.w3.org/2000/svg" '
+ 'width="%dem" height="%dem" viewBox="%d %d %d %d">' % (
+ width * 3,
+ height * 2.5,
+ -hstart, -vstart,
+ width * hscale + 3 * hstart,
+ height * vscale + 3 * vstart)
+ ]
children = defaultdict(set)
for n in self.nodes:
xmax = hstart + hscale * max(childx)
result.append(
'\t<polyline style="stroke:black; stroke-width:1; fill:none;" '
- 'points="%g,%g %g,%g" />' % (xmin, y, xmax, y)
- )
+ 'points="%g,%g %g,%g" />' % (xmin, y, xmax, y))
result.append(
'\t<polyline style="stroke:black; stroke-width:1; fill:none;" '
- 'points="%g,%g %g,%g" />' % (x, y, x, y - fontsize // 3)
- )
+ 'points="%g,%g %g,%g" />' % (x, y, x, y - fontsize // 3))
# vertical branches from children to parents
for child, parent in self.edges.items():
' points="%g,%g %g,%g" />' % (childx, childy, childx, y + 5),
'\t<polyline style="stroke:black; stroke-width:1; fill:none;"'
' points="%g,%g %g,%g" />' % (childx, childy, childx, y),
- ]
+ ]
# write nodes with coordinates
for n, (row, column) in self.coords.items():
y = row * vscale + vstart
if n in self.highlight:
color = nodecolor if isinstance(node, Tree) else leafcolor
- if isinstance(node, Tree) and node.label().startswith("-"):
+ if isinstance(node, Tree) and node.label().startswith('-'):
color = funccolor
else:
- color = "black"
- result += [
- '\t<text style="text-anchor: middle; fill: %s; '
- 'font-size: %dpx;" x="%g" y="%g">%s</text>'
- % (
- color,
- fontsize,
- x,
- y,
- escape(node.label() if isinstance(node, Tree) else node, quote=False),
- )
- ]
+ color = 'black'
+ result += ['\t<text style="text-anchor: middle; fill: %s; '
+ 'font-size: %dpx;" x="%g" y="%g">%s</text>' % (
+ color, fontsize, x, y,
+ escape(node.label() if isinstance(node, Tree)
+ else node))]
- result += ["</svg>"]
- return "\n".join(result)
+ result += ['</svg>']
+ return '\n'.join(result)
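The SVG output can be written straight to a file; a minimal sketch (output path hypothetical):

from nltk.tree import Tree
from nltk.treeprettyprinter import TreePrettyPrinter

t = Tree.fromstring('(S (NP Mary) (VP walks))')
with open('tree.svg', 'w', encoding='utf8') as out:  # hypothetical output file
    out.write(TreePrettyPrinter(t).svg())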
def test():
"""Do some tree drawing tests."""
-
def print_tree(n, tree, sentence=None, ansi=True, **xargs):
print()
- print('{0}: "{1}"'.format(n, " ".join(sentence or tree.leaves())))
+ print('{0}: "{1}"'.format(n, ' '.join(sentence or tree.leaves())))
print(tree)
print()
drawtree = TreePrettyPrinter(tree, sentence)
print(drawtree.text(unicodelines=False, ansi=False, **xargs))
from nltk.corpus import treebank
-
for n in [0, 1440, 1591, 2771, 2170]:
tree = treebank.parsed_sents()[n]
print_tree(n, tree, nodedist=2, maxwidth=8)
print()
- print("ASCII version:")
+ print('ASCII version:')
print(TreePrettyPrinter(tree).text(nodedist=2))
tree = Tree.fromstring(
- "(top (punct 8) (smain (noun 0) (verb 1) (inf (verb 5) (inf (verb 6) "
- "(conj (inf (pp (prep 2) (np (det 3) (noun 4))) (verb 7)) (inf (verb 9)) "
- "(vg 10) (inf (verb 11)))))) (punct 12))",
- read_leaf=int,
- )
- sentence = (
- "Ze had met haar moeder kunnen gaan winkelen ,"
- " zwemmen of terrassen .".split()
- )
- print_tree("Discontinuous tree", tree, sentence, nodedist=2)
+ '(top (punct 8) (smain (noun 0) (verb 1) (inf (verb 5) (inf (verb 6) '
+ '(conj (inf (pp (prep 2) (np (det 3) (noun 4))) (verb 7)) (inf (verb 9)) '
+ '(vg 10) (inf (verb 11)))))) (punct 12))', read_leaf=int)
+ sentence = ('Ze had met haar moeder kunnen gaan winkelen ,'
+ ' zwemmen of terrassen .'.split())
+ print_tree('Discontinuous tree', tree, sentence, nodedist=2)
-__all__ = ["TreePrettyPrinter"]
+__all__ = ['TreePrettyPrinter']
-if __name__ == "__main__":
+if __name__ == '__main__':
test()
C D C D
"""
+from __future__ import print_function
from nltk.tree import Tree
-
-def chomsky_normal_form(
- tree, factor="right", horzMarkov=None, vertMarkov=0, childChar="|", parentChar="^"
-):
+def chomsky_normal_form(tree, factor="right", horzMarkov=None, vertMarkov=0, childChar="|", parentChar="^"):
# assume all subtrees have homogeneous children
# assume all terminals have no siblings
# A semi-hack to have elegant looking code below. As a result,
# any subtree with a branching factor greater than 999 will be incorrectly truncated.
- if horzMarkov is None:
- horzMarkov = 999
+ if horzMarkov is None: horzMarkov = 999
# Traverse the tree depth-first keeping a list of ancestor nodes to the root.
# I chose not to use the tree.treepositions() method since it requires
nodeList = [(tree, [tree.label()])]
while nodeList != []:
node, parent = nodeList.pop()
- if isinstance(node, Tree):
+ if isinstance(node,Tree):
# parent annotation
parentString = ""
originalNode = node.label()
- if vertMarkov != 0 and node != tree and isinstance(node[0], Tree):
+ if vertMarkov != 0 and node != tree and isinstance(node[0],Tree):
parentString = "%s<%s>" % (parentChar, "-".join(parent))
node.set_label(node.label() + parentString)
- parent = [originalNode] + parent[: vertMarkov - 1]
+ parent = [originalNode] + parent[:vertMarkov - 1]
# add children to the agenda before we mess with them
for child in node:
if len(node) > 2:
childNodes = [child.label() for child in node]
nodeCopy = node.copy()
- node[0:] = [] # delete the children
+ node[0:] = [] # delete the children
curNode = node
numChildren = len(nodeCopy)
- for i in range(1, numChildren - 1):
+ for i in range(1,numChildren - 1):
if factor == "right":
- newHead = "%s%s<%s>%s" % (
- originalNode,
- childChar,
- "-".join(
- childNodes[i : min([i + horzMarkov, numChildren])]
- ),
- parentString,
- ) # create new head
+ newHead = "%s%s<%s>%s" % (originalNode, childChar, "-".join(childNodes[i:min([i+horzMarkov,numChildren])]),parentString) # create new head
newNode = Tree(newHead, [])
curNode[0:] = [nodeCopy.pop(0), newNode]
else:
- newHead = "%s%s<%s>%s" % (
- originalNode,
- childChar,
- "-".join(
- childNodes[max([numChildren - i - horzMarkov, 0]) : -i]
- ),
- parentString,
- )
+ newHead = "%s%s<%s>%s" % (originalNode, childChar, "-".join(childNodes[max([numChildren-i-horzMarkov,0]):-i]),parentString)
newNode = Tree(newHead, [])
curNode[0:] = [newNode, nodeCopy.pop()]
curNode[0:] = [child for child in nodeCopy]
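A small sketch of the binarization this loop performs (tree and markovization settings arbitrary):

from nltk.tree import Tree
from nltk.treetransforms import chomsky_normal_form

t = Tree.fromstring('(S (A a) (B b) (C c) (D d))')
chomsky_normal_form(t, factor='right', horzMarkov=2)  # binarizes in place
print(t)
# (S (A a) (S|<B-C> (B b) (S|<C-D> (C c) (D d))))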
-def un_chomsky_normal_form(
- tree, expandUnary=True, childChar="|", parentChar="^", unaryChar="+"
-):
+def un_chomsky_normal_form(tree, expandUnary = True, childChar = "|", parentChar = "^", unaryChar = "+"):
# Traverse the tree-depth first keeping a pointer to the parent for modification purposes.
- nodeList = [(tree, [])]
+ nodeList = [(tree,[])]
while nodeList != []:
- node, parent = nodeList.pop()
- if isinstance(node, Tree):
+ node,parent = nodeList.pop()
+ if isinstance(node,Tree):
# if the node contains the 'childChar' character it means that
# it is an artificial node and can be removed, although we still need
# to move its children to its parent
# means the grammar was left factored. We must insert the children
# at the beginning of the parent's children
if nodeIndex == 0:
- parent.insert(0, node[0])
- parent.insert(1, node[1])
+ parent.insert(0,node[0])
+ parent.insert(1,node[1])
else:
- parent.extend([node[0], node[1]])
+ parent.extend([node[0],node[1]])
# parent is now the current node so the children of parent will be added to the agenda
node = parent
if expandUnary == True:
unaryIndex = node.label().find(unaryChar)
if unaryIndex != -1:
- newNode = Tree(
- node.label()[unaryIndex + 1 :], [i for i in node]
- )
+ newNode = Tree(node.label()[unaryIndex + 1:], [i for i in node])
node.set_label(node.label()[:unaryIndex])
node[0:] = [newNode]
for child in node:
- nodeList.append((child, node))
+ nodeList.append((child,node))
-def collapse_unary(tree, collapsePOS=False, collapseRoot=False, joinChar="+"):
+def collapse_unary(tree, collapsePOS = False, collapseRoot = False, joinChar = "+"):
"""
Collapse subtrees with a single child (i.e. unary productions)
into a new non-terminal (Tree node) joined by 'joinChar'.
# depth-first traversal of tree
while nodeList != []:
node = nodeList.pop()
- if isinstance(node, Tree):
- if (
- len(node) == 1
- and isinstance(node[0], Tree)
- and (collapsePOS == True or isinstance(node[0, 0], Tree))
- ):
+ if isinstance(node,Tree):
+ if len(node) == 1 and isinstance(node[0], Tree) and (collapsePOS == True or isinstance(node[0,0], Tree)):
node.set_label(node.label() + joinChar + node[0].label())
node[0:] = [child for child in node[0]]
# since we assigned the child's children to the current node,
for child in node:
nodeList.append(child)
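A matching sketch for unary collapsing (example tree invented):

from nltk.tree import Tree
from nltk.treetransforms import collapse_unary

t = Tree.fromstring('(S (NP (NP (DT the) (NN dog))) (VP (VBD barked)))')
collapse_unary(t)  # modifies t in place; POS chains such as VP->VBD are kept
print(t)
# (S (NP+NP (DT the) (NN dog)) (VP (VBD barked)))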
-
#################################################################
# Demonstration
#################################################################
-
def demo():
"""
A demonstration showing how each tree transform can be used.
draw_trees(t, collapsedTree, cnfTree, parentTree, original)
-
-if __name__ == "__main__":
+if __name__ == '__main__':
demo()
__all__ = ["chomsky_normal_form", "un_chomsky_normal_form", "collapse_unary"]
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Twitter
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
import twython
except ImportError:
import warnings
-
- warnings.warn(
- "The twython library has not been installed. "
- "Some functionality from the twitter package will not be available."
- )
+ warnings.warn("The twython library has not been installed. "
+ "Some functionality from the twitter package will not be available.")
else:
from nltk.twitter.util import Authenticate, credsfromfile
- from nltk.twitter.twitterclient import (
- Streamer,
- Query,
- Twitter,
- TweetViewer,
- TweetWriter,
- )
+ from nltk.twitter.twitterclient import Streamer, Query, Twitter,\
+ TweetViewer, TweetWriter
from nltk.twitter.common import json2csv
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Twitter API
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# Lorenzo Rubio <lrnzcig@gmail.com>
# URL: <http://nltk.org/>
handling.
"""
-import time as _time
from abc import ABCMeta, abstractmethod
-from datetime import tzinfo, timedelta, timezone, datetime
+from six import add_metaclass
+from datetime import tzinfo, timedelta, datetime
+from nltk.compat import UTC
+import time as _time
class LocalTimezoneOffsetWithUTC(tzinfo):
Reference: https://docs.python.org/3/library/datetime.html
"""
-
STDOFFSET = timedelta(seconds=-_time.timezone)
if _time.daylight:
LOCAL = LocalTimezoneOffsetWithUTC()
-class BasicTweetHandler(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class BasicTweetHandler(object):
"""
Minimal implementation of `TweetHandler`.
Counts the number of Tweets and decides when the client should stop
fetching them.
"""
-
def __init__(self, limit=20):
self.limit = limit
self.counter = 0
"""
return self.counter < self.limit and not self.do_stop
-
class TweetHandlerI(BasicTweetHandler):
"""
Interface class whose subclasses should implement a handle method that
Twitter clients can delegate to.
"""
-
def __init__(self, limit=20, upper_date_limit=None, lower_date_limit=None):
"""
:param int limit: The number of data items to process in the current\
Validate date limits.
"""
if self.upper_date_limit or self.lower_date_limit:
- date_fmt = "%a %b %d %H:%M:%S +0000 %Y"
- tweet_date = datetime.strptime(data["created_at"], date_fmt).replace(
- tzinfo=timezone.utc
- )
- if (self.upper_date_limit and tweet_date > self.upper_date_limit) or (
- self.lower_date_limit and tweet_date < self.lower_date_limit
- ):
+ date_fmt = '%a %b %d %H:%M:%S +0000 %Y'
+ tweet_date = \
+ datetime.strptime(data['created_at'],
+ date_fmt).replace(tzinfo=UTC)
+ if (self.upper_date_limit and tweet_date > self.upper_date_limit) or \
+ (self.lower_date_limit and tweet_date < self.lower_date_limit):
if self.upper_date_limit:
message = "earlier"
date_limit = self.upper_date_limit
message = "later"
date_limit = self.lower_date_limit
if verbose:
- print(
- "Date limit {0} is {1} than date of current tweet {2}".format(
- date_limit, message, tweet_date
- )
- )
+ print("Date limit {0} is {1} than date of current tweet {2}".\
+ format(date_limit, message, tweet_date))
self.do_stop = True
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Twitter client
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# Lorenzo Rubio <lrnzcig@gmail.com>
# URL: <http://nltk.org/>
Utility functions for the :module:`twitterclient` module which do not require
the `twython` library to have been installed.
"""
+from __future__ import print_function
+
import csv
import gzip
import json
-from nltk.internals import deprecated
+import nltk.compat as compat
HIER_SEPARATOR = "."
def extract_fields(tweet, fields):
"""
try:
_add_field_to_out(tweet, field, out)
except TypeError:
- raise RuntimeError(
- "Fatal error when extracting fields. Cannot find field ", field
- )
+ raise RuntimeError('Fatal error when extracting fields. Cannot find field ', field)
return out
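A minimal sketch of the dotted-field lookup (the tweet dict is invented):

from nltk.twitter.common import extract_fields

tweet = {'id_str': '123', 'user': {'screen_name': 'nltk_org'}}
print(extract_fields(tweet, ['id_str', 'user.screen_name']))
# ['123', 'nltk_org']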
-
def _add_field_to_out(json, field, out):
if _is_composed_key(field):
key, value = _get_key_value_composed(field)
else:
out += [json[field]]
-
def _is_composed_key(field):
if HIER_SEPARATOR in field:
return True
return False
-
def _get_key_value_composed(field):
out = field.split(HIER_SEPARATOR)
# there could be up to 3 levels
value = HIER_SEPARATOR.join(out[1:])
return key, value
-
def _get_entity_recursive(json, entity):
if not json:
return None
# structure that contain other Twitter objects. See:
# https://dev.twitter.com/overview/api/entities-in-twitter-objects
- if key == "entities" or key == "extended_entities":
+ if key == 'entities' or key == 'extended_entities':
candidate = _get_entity_recursive(value, entity)
if candidate is not None:
return candidate
else:
return None
-
-def json2csv(
- fp, outfile, fields, encoding="utf8", errors="replace", gzip_compress=False
-):
+def json2csv(fp, outfile, fields, encoding='utf8', errors='replace',
+ gzip_compress=False):
"""
Extract selected fields from a file of line-separated JSON tweets and
write to a file in CSV format.
are 'id_str' for the tweetID and 'text' for the text of the tweet. See\
<https://dev.twitter.com/overview/api/tweets> for a full list of fields.\
e. g.: ['id_str'], ['id', 'text', 'favorite_count', 'retweet_count']\
Additionally, it allows IDs from other Twitter objects, e. g.,\
['id', 'text', 'user.id', 'user.followers_count', 'user.friends_count']
:param error: Behaviour for encoding errors, see\
:param gzip_compress: if `True`, output files are compressed with gzip
"""
- (writer, outf) = _outf_writer(outfile, encoding, errors, gzip_compress)
+ (writer, outf) = outf_writer_compat(outfile, encoding, errors, gzip_compress)
# write the list of fields as header
writer.writerow(fields)
# process the file
outf.close()
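For example, assuming 'tweets.json' holds one JSON tweet per line (file names hypothetical):

from nltk.twitter.common import json2csv

with open('tweets.json') as fp:  # hypothetical input file
    json2csv(fp, 'tweets_text.csv', ['id_str', 'text'])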
-@deprecated("Use open() and csv.writer() directly instead.")
def outf_writer_compat(outfile, encoding, errors, gzip_compress=False):
- """Get a CSV writer with optional compression."""
- return _outf_writer(outfile, encoding, errors, gzip_compress)
-
-
-def _outf_writer(outfile, encoding, errors, gzip_compress=False):
- if gzip_compress:
- outf = gzip.open(outfile, "wt", encoding=encoding, errors=errors)
+ """
+ Identify appropriate CSV writer given the Python version
+ """
+ if compat.PY3:
+ if gzip_compress:
+ outf = gzip.open(outfile, 'wt', encoding=encoding, errors=errors)
+ else:
+ outf = open(outfile, 'w', encoding=encoding, errors=errors)
+ writer = csv.writer(outf)
else:
- outf = open(outfile, "w", encoding=encoding, errors=errors)
- writer = csv.writer(outf)
+ if gzip_compress:
+ outf = gzip.open(outfile, 'wb')
+ else:
+ outf = open(outfile, 'wb')
+ writer = compat.UnicodeWriter(outf, encoding=encoding, errors=errors)
return (writer, outf)
-def json2csv_entities(
- tweets_file,
- outfile,
- main_fields,
- entity_type,
- entity_fields,
- encoding="utf8",
- errors="replace",
- gzip_compress=False,
-):
+def json2csv_entities(tweets_file, outfile, main_fields, entity_type, entity_fields,
+ encoding='utf8', errors='replace', gzip_compress=False):
"""
Extract selected fields from a file of line-separated JSON tweets and
write to a file in CSV format.
:param gzip_compress: if `True`, output files are compressed with gzip
"""
- (writer, outf) = _outf_writer(outfile, encoding, errors, gzip_compress)
+ (writer, outf) = outf_writer_compat(outfile, encoding, errors, gzip_compress)
header = get_header_field_list(main_fields, entity_type, entity_fields)
writer.writerow(header)
for line in tweets_file:
_write_to_file(tweet_fields, items, entity_fields, writer)
outf.close()
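Analogously for entities; this sketch pulls the hashtag text alongside each tweet ID (file names hypothetical):

from nltk.twitter.common import json2csv_entities

with open('tweets.json') as fp:  # hypothetical input file
    json2csv_entities(fp, 'tweets_hashtags.csv',
                      ['id_str'], 'hashtags', ['text'])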
-
def get_header_field_list(main_fields, entity_type, entity_fields):
if _is_composed_key(entity_type):
key, value = _get_key_value_composed(entity_type)
output2 = [HIER_SEPARATOR.join([sub_entity, x]) for x in entity_fields]
return output1 + output2
-
def _write_to_file(object_fields, items, entity_fields, writer):
if not items:
# it could be that the entity is just not present for the tweet
kd, vd = _get_key_value_composed(d)
json_dict = items[kd]
if not isinstance(json_dict, dict):
- raise RuntimeError(
- """Key {0} does not contain a dictionary
- in the json file""".format(
- kd
- )
- )
+ raise RuntimeError("""Key {0} does not contain a dictionary
+ in the json file""".format(kd))
row += [json_dict[vd]]
writer.writerow(row)
return
for item in items:
row = object_fields + extract_fields(item, entity_fields)
writer.writerow(row)
+
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Twitter client
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# Lorenzo Rubio <lrnzcig@gmail.com>
# URL: <http://nltk.org/>
For error codes see Twitter's
`Error Codes and Responses <https://dev.twitter.com/overview/api/response-codes>`
"""
+from __future__ import print_function
import datetime
from functools import wraps
import json
-from io import StringIO
-from nltk.twitter import (
- Query,
- Streamer,
- Twitter,
- TweetViewer,
- TweetWriter,
- credsfromfile,
-)
+from nltk.compat import StringIO
+from nltk.twitter import Query, Streamer, Twitter, TweetViewer, TweetWriter,\
+ credsfromfile
-SPACER = "###################################"
+SPACER = '###################################'
def verbose(func):
"""Decorator for demo functions"""
-
@wraps(func)
def with_formatting(*args, **kwargs):
print()
print("Using %s" % (func.__name__))
print(SPACER)
return func(*args, **kwargs)
-
return with_formatting
-
def yesterday():
"""
Get yesterday's datetime as a 5-tuple.
"""
date = datetime.datetime.now()
date -= datetime.timedelta(days=1)
date_tuple = date.timetuple()[:6]
return date_tuple
-
def setup():
"""
Initialize global variables for the demos.
"""
global USERIDS, FIELDS
- USERIDS = ["759251", "612473", "15108702", "6017542", "2673523800"]
+ USERIDS = ['759251', '612473', '15108702', '6017542', '2673523800']
# UserIDs corresponding to\
# @CNN, @BBCNews, @ReutersLive, @BreakingNews, @AJELive
- FIELDS = ["id_str"]
+ FIELDS = ['id_str']
@verbose
"""
tw = Twitter()
print("Track from the public stream\n")
- tw.tweets(keywords="love, hate", limit=10) # public stream
+ tw.tweets(keywords='love, hate', limit=10) #public stream
print(SPACER)
print("Search past Tweets\n")
tw = Twitter()
- tw.tweets(keywords="love, hate", stream=False, limit=10) # search past tweets
+ tw.tweets(keywords='love, hate', stream=False, limit=10) # search past tweets
print(SPACER)
- print(
- "Follow two accounts in the public stream"
- + " -- be prepared to wait a few minutes\n"
- )
+ print("Follow two accounts in the public stream" +
+ " -- be prepared to wait a few minutes\n")
tw = Twitter()
- tw.tweets(follow=["759251", "6017542"], stream=True, limit=5) # public stream
+ tw.tweets(follow=['759251', '6017542'], stream=True, limit=5) #public stream
@verbose
@verbose
-def search_demo(keywords="nltk"):
+def search_demo(keywords='nltk'):
"""
Use the REST API to search for past tweets containing a given keyword.
"""
oauth = credsfromfile()
client = Query(**oauth)
for tweet in client.search_tweets(keywords=keywords, limit=10):
- print(tweet["text"])
+ print(tweet['text'])
@verbose
-def tweets_by_user_demo(user="NLTK_org", count=200):
+def tweets_by_user_demo(user='NLTK_org', count=200):
"""
Use the REST API to search for past tweets by a given user.
"""
client = Query(**oauth)
user_info = client.user_info_from_id(USERIDS)
for info in user_info:
- name = info["screen_name"]
- followers = info["followers_count"]
- following = info["friends_count"]
+ name = info['screen_name']
+ followers = info['followers_count']
+ following = info['friends_count']
print("{0}, followers: {1}, following: {2}".format(name, followers, following))
print("Cutoff date: {}\n".format(dt_date))
for tweet in client.search_tweets(keywords=keywords):
- print("{} ".format(tweet["created_at"]), end="")
+ print("{} ".format(tweet['created_at']), end='')
client.handler.handle(tweet)
corresponding full Tweets, if available.
"""
- ids_f = StringIO(
- """\
+ ids_f =\
+ StringIO("""\
588665495492124672
588665495487909888
588665495508766721
588665495525588992
588665495487844352
588665495492014081
- 588665495512948737"""
- )
+ 588665495512948737""")
oauth = credsfromfile()
client = Query(**oauth)
hydrated = client.expand_tweetids(ids_f)
for tweet in hydrated:
- id_str = tweet["id_str"]
- print("id: {}".format(id_str))
- text = tweet["text"]
- if text.startswith("@null"):
- text = "[Tweet not available]"
- print(text + "\n")
-
-
-ALL = [
- twitterclass_demo,
- sampletoscreen_demo,
- tracktoscreen_demo,
- search_demo,
- tweets_by_user_demo,
- lookup_by_userid_demo,
- followtoscreen_demo,
- streamtofile_demo,
- limit_by_time_demo,
- corpusreader_demo,
- expand_tweetids_demo,
-]
+ id_str = tweet['id_str']
+ print('id: {}'.format(id_str))
+ text = tweet['text']
+ if text.startswith('@null'):
+ text = "[Tweet not available]"
+ print(text + '\n')
+
+
+
+ALL = [twitterclass_demo, sampletoscreen_demo, tracktoscreen_demo,
+ search_demo, tweets_by_user_demo, lookup_by_userid_demo, followtoscreen_demo,
+ streamtofile_demo, limit_by_time_demo, corpusreader_demo, expand_tweetids_demo]
"""
Select demo functions to run. E.g. replace the following line with "DEMOS =
print("\n" + SPACER)
print("All demos completed")
print(SPACER)
+
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Twitter client
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# Lorenzo Rubio <lrnzcig@gmail.com>
# URL: <http://nltk.org/>
import itertools
import json
import os
+import requests
import time
import gzip
-import requests
from twython import Twython, TwythonStreamer
from twython.exceptions import TwythonRateLimitError, TwythonError
from nltk.twitter.api import TweetHandlerI, BasicTweetHandler
+
class Streamer(TwythonStreamer):
"""
Retrieve data from the Twitter Streaming API.
The streaming API requires
`OAuth 1.0 <http://en.wikipedia.org/wiki/OAuth>`_ authentication.
"""
-
def __init__(self, app_key, app_secret, oauth_token, oauth_token_secret):
self.handler = None
self.do_continue = True
- TwythonStreamer.__init__(
- self, app_key, app_secret, oauth_token, oauth_token_secret
- )
+ TwythonStreamer.__init__(self, app_key, app_secret, oauth_token,
+ oauth_token_secret)
def register(self, handler):
"""
"""
if self.do_continue:
if self.handler is not None:
- if "text" in data:
+ if 'text' in data:
self.handler.counter += 1
self.handler.handle(data)
self.do_continue = self.handler.do_continue()
self.disconnect()
self.handler.on_finish()
+
def on_error(self, status_code, data):
"""
:param status_code: The status code returned by the Twitter API
print("Error (stream will continue): {0}".format(e))
continue
- def filter(self, track="", follow="", lang="en"):
+ def filter(self, track='', follow='', lang='en'):
"""
Wrapper for 'statuses / filter' API call
"""
while self.do_continue:
- # Stream in an endless loop until limit is reached
+ #Stream in an endless loop until limit is reached
try:
- if track == "" and follow == "":
+ if track == '' and follow == '':
msg = "Please supply a value for 'track', 'follow'"
raise ValueError(msg)
self.statuses.filter(track=track, follow=follow, lang=lang)
"""
Retrieve data from the Twitter REST API.
"""
-
- def __init__(self, app_key, app_secret, oauth_token, oauth_token_secret):
+ def __init__(self, app_key, app_secret, oauth_token,
+ oauth_token_secret):
self.handler = None
self.do_continue = True
Twython.__init__(self, app_key, app_secret, oauth_token, oauth_token_secret)
# The Twitter endpoint takes lists of up to 100 ids, so we chunk the
# ids.
- id_chunks = [ids[i : i + 100] for i in range(0, len(ids), 100)]
+ id_chunks = [ids[i:i+100] for i in range(0, len(ids), 100)]
- chunked_tweets = (self.lookup_status(id=chunk) for chunk in id_chunks)
+ chunked_tweets = (self.lookup_status(id=chunk) for chunk in
+ id_chunks)
return itertools.chain.from_iterable(chunked_tweets)
- def _search_tweets(self, keywords, limit=100, lang="en"):
+
+
+ def _search_tweets(self, keywords, limit=100, lang='en'):
"""
Assumes that the handler has been informed. Fetches Tweets from
search_tweets generator output and passes them to the handler
:param str lang: language
"""
while True:
- tweets = self.search_tweets(
- keywords=keywords, limit=limit, lang=lang, max_id=self.handler.max_id
- )
+ tweets = self.search_tweets(keywords=keywords, limit=limit, lang=lang,
+ max_id=self.handler.max_id)
for tweet in tweets:
self.handler.handle(tweet)
if not (self.handler.do_continue() and self.handler.repeat):
break
self.handler.on_finish()
- def search_tweets(
- self,
- keywords,
- limit=100,
- lang="en",
- max_id=None,
- retries_after_twython_exception=0,
- ):
+ def search_tweets(self, keywords, limit=100, lang='en', max_id=None,
+ retries_after_twython_exception=0):
"""
Call the REST API ``'search/tweets'`` endpoint with some plausible
defaults. See `the Twitter search documentation
<https://dev.twitter.com/rest/public/search>`_ for more information
about admissible search parameters.
:param str keywords: A list of query terms to search for, written as\
a comma-separated string
if max_id:
self.handler.max_id = max_id
else:
- results = self.search(
- q=keywords, count=min(100, limit), lang=lang, result_type="recent"
- )
- count = len(results["statuses"])
+ results = self.search(q=keywords, count=min(100, limit), lang=lang,
+ result_type='recent')
+ count = len(results['statuses'])
if count == 0:
print("No Tweets available through REST API for those keywords")
return
count_from_query = count
- self.handler.max_id = results["statuses"][count - 1]["id"] - 1
+ self.handler.max_id = results['statuses'][count - 1]['id'] - 1
- for result in results["statuses"]:
+ for result in results['statuses']:
yield result
self.handler.counter += 1
if self.handler.do_continue() == False:
return
+
# Pagination loop: keep fetching Tweets until the desired count is
# reached while dealing with Twitter rate limits.
retries = 0
while count_from_query < limit:
try:
- mcount = min(100, limit - count_from_query)
- results = self.search(
- q=keywords,
- count=mcount,
- lang=lang,
- max_id=self.handler.max_id,
- result_type="recent",
- )
+ mcount = min(100, limit-count_from_query)
+ results = self.search(q=keywords, count=mcount, lang=lang,
+ max_id=self.handler.max_id, result_type='recent')
except TwythonRateLimitError as e:
print("Waiting for 15 minutes -{0}".format(e))
- time.sleep(15 * 60) # wait 15 minutes
+ time.sleep(15*60) # wait 15 minutes
continue
except TwythonError as e:
print("Fatal error in Twython request -{0}".format(e))
raise e
retries += 1
- count = len(results["statuses"])
+ count = len(results['statuses'])
if count == 0:
print("No more Tweets available through rest api")
return
# results['search_metadata']['next_results'], but as part of a
# query and difficult to fetch. This is doing the equivalent
# (last tweet id minus one)
- self.handler.max_id = results["statuses"][count - 1]["id"] - 1
+ self.handler.max_id = results['statuses'][count - 1]['id'] - 1
- for result in results["statuses"]:
+ for result in results['statuses']:
yield result
self.handler.counter += 1
if self.handler.do_continue() == False:
"""
return [self.show_user(user_id=userid) for userid in userids]
- def user_tweets(self, screen_name, limit, include_rts="false"):
+ def user_tweets(self, screen_name, limit, include_rts='false'):
"""
Return a collection of the most recent Tweets posted by the user
:param str include_rts: Whether to include statuses which have been\
retweeted by the user; possible values are 'true' and 'false'
"""
- data = self.get_user_timeline(
- screen_name=screen_name, count=limit, include_rts=include_rts
- )
+ data = self.get_user_timeline(screen_name=screen_name, count=limit,
+ include_rts=include_rts)
for item in data:
self.handler.handle(item)
+
+
class Twitter(object):
"""
Wrapper class with restricted functionality and fewer options.
"""
-
def __init__(self):
self._oauth = credsfromfile()
self.streamer = Streamer(**self._oauth)
self.query = Query(**self._oauth)
- def tweets(
- self,
- keywords="",
- follow="",
- to_screen=True,
- stream=True,
- limit=100,
- date_limit=None,
- lang="en",
- repeat=False,
- gzip_compress=False,
- ):
+
+ def tweets(self, keywords='', follow='', to_screen=True, stream=True,
+ limit=100, date_limit=None, lang='en', repeat=False,
+ gzip_compress=False):
"""
Process some Tweets in a simple manner.
lower_date_limit = date_limit
if to_screen:
- handler = TweetViewer(
- limit=limit,
- upper_date_limit=upper_date_limit,
- lower_date_limit=lower_date_limit,
- )
+ handler = TweetViewer(limit=limit,
+ upper_date_limit=upper_date_limit,
+ lower_date_limit=lower_date_limit)
else:
- handler = TweetWriter(
- limit=limit,
- upper_date_limit=upper_date_limit,
- lower_date_limit=lower_date_limit,
- repeat=repeat,
- gzip_compress=gzip_compress,
- )
+ handler = TweetWriter(limit=limit,
+ upper_date_limit=upper_date_limit,
+ lower_date_limit=lower_date_limit, repeat=repeat,
+ gzip_compress=gzip_compress)
+
+
if to_screen:
handler = TweetViewer(limit=limit)
upper_date_limit = None
lower_date_limit = date_limit
- handler = TweetWriter(
- limit=limit,
- upper_date_limit=upper_date_limit,
- lower_date_limit=lower_date_limit,
- repeat=repeat,
- gzip_compress=gzip_compress,
- )
+ handler = TweetWriter(limit=limit, upper_date_limit=upper_date_limit,
+ lower_date_limit=lower_date_limit, repeat=repeat,
+ gzip_compress=gzip_compress)
if stream:
self.streamer.register(handler)
- if keywords == "" and follow == "":
+ if keywords == '' and follow == '':
self.streamer.sample()
else:
self.streamer.filter(track=keywords, follow=follow, lang=lang)
else:
self.query.register(handler)
- if keywords == "":
+ if keywords == '':
raise ValueError("Please supply at least one keyword to search for.")
else:
self.query._search_tweets(keywords, limit=limit, lang=lang)
+
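In practice the wrapper reduces to calls like these (keywords arbitrary; valid credentials are assumed):

from nltk.twitter import Twitter

tw = Twitter()
tw.tweets(keywords='nltk', stream=False, limit=10)  # REST search
tw.tweets(follow=['759251'], stream=True, limit=5)  # streaming API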
class TweetViewer(TweetHandlerI):
"""
Handle data by sending it to the terminal.
:rtype: bool
:param data: Tweet object returned by Twitter API
"""
- text = data["text"]
+ text = data['text']
print(text)
self.check_date_limit(data)
return
def on_finish(self):
- print("Written {0} Tweets".format(self.counter))
+ print('Written {0} Tweets'.format(self.counter))
class TweetWriter(TweetHandlerI):
"""
Handle data by writing it to a file.
"""
-
- def __init__(
- self,
- limit=2000,
- upper_date_limit=None,
- lower_date_limit=None,
- fprefix="tweets",
- subdir="twitter-files",
- repeat=False,
- gzip_compress=False,
- ):
+ def __init__(self, limit=2000, upper_date_limit=None, lower_date_limit=None,
+ fprefix='tweets', subdir='twitter-files', repeat=False,
+ gzip_compress=False):
"""
The difference between the upper and lower date limits depends on
whether Tweets are coming in an ascending date order (i.e. when
self.output = None
TweetHandlerI.__init__(self, limit, upper_date_limit, lower_date_limit)
+
def timestamped_file(self):
"""
:return: timestamped file name
os.mkdir(subdir)
fname = os.path.join(subdir, fprefix)
- fmt = "%Y%m%d-%H%M%S"
+ fmt = '%Y%m%d-%H%M%S'
timestamp = datetime.datetime.now().strftime(fmt)
if self.gzip_compress:
- suffix = ".gz"
+ suffix = '.gz'
else:
- suffix = ""
- outfile = "{0}.{1}.json{2}".format(fname, timestamp, suffix)
+ suffix = ''
+ outfile = '{0}.{1}.json{2}'.format(fname, timestamp, suffix)
return outfile
+
def handle(self, data):
"""
Write Twitter data as line-delimited JSON into one or more files.
"""
if self.startingup:
if self.gzip_compress:
- self.output = gzip.open(self.fname, "w")
+ self.output = gzip.open(self.fname, 'w')
else:
- self.output = open(self.fname, "w")
- print("Writing to {0}".format(self.fname))
+ self.output = open(self.fname, 'w')
+ print('Writing to {0}'.format(self.fname))
json_data = json.dumps(data)
if self.gzip_compress:
- self.output.write((json_data + "\n").encode("utf-8"))
+ self.output.write((json_data + "\n").encode('utf-8'))
else:
self.output.write(json_data + "\n")
self.startingup = False
def on_finish(self):
- print("Written {0} Tweets".format(self.counter))
+ print('Written {0} Tweets'.format(self.counter))
if self.output:
self.output.close()
self._restart_file()
return True
+
def _restart_file(self):
self.on_finish()
self.fname = self.timestamped_file()
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Twitter client
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# Lorenzo Rubio <lrnzcig@gmail.com>
# URL: <http://nltk.org/>
Authentication utilities to accompany :module:`twitterclient`.
"""
+from __future__ import print_function
+
import os
import pprint
from twython import Twython
-
def credsfromfile(creds_file=None, subdir=None, verbose=False):
"""
Convenience function for authentication
"""
- return Authenticate().load_creds(
- creds_file=creds_file, subdir=subdir, verbose=verbose
- )
+ return Authenticate().load_creds(creds_file=creds_file, subdir=subdir, verbose=verbose)
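A minimal usage sketch, assuming an OAuth 1 credentials file sits in the given directory (the path is hypothetical):

    >>> from nltk.twitter import credsfromfile
    >>> oauth = credsfromfile(subdir='/path/to/twitter-files')
    >>> sorted(oauth.keys())
    ['app_key', 'app_secret', 'oauth_token', 'oauth_token_secret']
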
class Authenticate(object):
"""
Methods for authenticating with Twitter.
"""
-
def __init__(self):
- self.creds_file = "credentials.txt"
+ self.creds_file = 'credentials.txt'
self.creds_fullpath = None
self.oauth = {}
try:
- self.twitter_dir = os.environ["TWITTER"]
+ self.twitter_dir = os.environ['TWITTER']
self.creds_subdir = self.twitter_dir
except KeyError:
self.twitter_dir = None
self.creds_subdir = None
+
def load_creds(self, creds_file=None, subdir=None, verbose=False):
"""
Read OAuth credentials from a text file.
if subdir is None:
if self.creds_subdir is None:
- msg = (
- "Supply a value to the 'subdir' parameter or"
- + " set the TWITTER environment variable."
- )
+ msg = "Supply a value to the 'subdir' parameter or" +\
+ " set the TWITTER environment variable."
raise ValueError(msg)
else:
self.creds_subdir = subdir
- self.creds_fullpath = os.path.normpath(
- os.path.join(self.creds_subdir, self.creds_file)
- )
+ self.creds_fullpath =\
+ os.path.normpath(os.path.join(self.creds_subdir, self.creds_file))
if not os.path.isfile(self.creds_fullpath):
- raise OSError("Cannot find file {}".format(self.creds_fullpath))
+ raise OSError('Cannot find file {}'.format(self.creds_fullpath))
with open(self.creds_fullpath) as infile:
if verbose:
- print("Reading credentials file {}".format(self.creds_fullpath))
+ print('Reading credentials file {}'.format(self.creds_fullpath))
for line in infile:
- if "=" in line:
- name, value = line.split("=", 1)
+ if '=' in line:
+ name, value = line.split('=', 1)
self.oauth[name.strip()] = value.strip()
self._validate_creds_file(verbose=verbose)
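The loop above accepts simple name=value lines, so a minimal OAuth 1 credentials.txt looks like this (values are placeholders):

    app_key=YOUR_APP_KEY
    app_secret=YOUR_APP_SECRET
    oauth_token=YOUR_ACCESS_TOKEN
    oauth_token_secret=YOUR_ACCESS_TOKEN_SECRET
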
def _validate_creds_file(self, verbose=False):
"""Check validity of a credentials file."""
oauth1 = False
- oauth1_keys = ["app_key", "app_secret", "oauth_token", "oauth_token_secret"]
+ oauth1_keys = ['app_key', 'app_secret', 'oauth_token', 'oauth_token_secret']
oauth2 = False
- oauth2_keys = ["app_key", "app_secret", "access_token"]
+ oauth2_keys = ['app_key', 'app_secret', 'access_token']
if all(k in self.oauth for k in oauth1_keys):
oauth1 = True
elif all(k in self.oauth for k in oauth2_keys):
oauth2 = True
if not (oauth1 or oauth2):
- msg = "Missing or incorrect entries in {}\n".format(self.creds_file)
+ msg = 'Missing or incorrect entries in {}\n'.format(self.creds_file)
msg += pprint.pformat(self.oauth)
raise ValueError(msg)
elif verbose:
"""
if creds_file is None:
path = os.path.dirname(__file__)
- creds_file = os.path.join(path, "credentials2.txt")
+ creds_file = os.path.join(path, 'credentials2.txt')
oauth2 = credsfromfile(creds_file=creds_file)
- app_key = oauth2["app_key"]
- app_secret = oauth2["app_secret"]
+ app_key = oauth2['app_key']
+ app_secret = oauth2['app_secret']
twitter = Twython(app_key, app_secret, oauth_version=2)
access_token = twitter.obtain_access_token()
- tok = "access_token={}\n".format(access_token)
- with open(creds_file, "a") as infile:
+ tok = 'access_token={}\n'.format(access_token)
+ with open(creds_file, 'a') as infile:
print(tok, file=infile)
# Natural Language Toolkit: Utility functions
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from __future__ import print_function
import sys
import inspect
import bisect
import os
-from itertools import islice, chain, combinations, tee
+from itertools import islice, chain, combinations
from pprint import pprint
from collections import defaultdict, deque
from sys import version_info
-from urllib.request import (
- build_opener,
- install_opener,
- getproxies,
- ProxyHandler,
- ProxyBasicAuthHandler,
- ProxyDigestAuthHandler,
- HTTPPasswordMgrWithDefaultRealm,
-)
+from six import class_types, string_types, text_type
+from six.moves.urllib.request import (build_opener, install_opener, getproxies,
+ ProxyHandler, ProxyBasicAuthHandler,
+ ProxyDigestAuthHandler,
+ HTTPPasswordMgrWithDefaultRealm)
from nltk.internals import slice_bounds, raise_unorderable_types
from nltk.collections import *
+from nltk.compat import python_2_unicode_compatible
+
######################################################################
# Short usage message
######################################################################
-def usage(obj, selfname="self"):
- str(obj) # In case it's lazy, this will load it.
-
+def usage(obj, selfname='self'):
+ str(obj) # In case it's lazy, this will load it.
- if not isinstance(obj, type):
+ if not isinstance(obj, class_types):
obj = obj.__class__
- print("%s supports the following operations:" % obj.__name__)
+ print('%s supports the following operations:' % obj.__name__)
for (name, method) in sorted(pydoc.allmethods(obj).items()):
- if name.startswith("_"):
- continue
- if getattr(method, "__deprecated__", False):
- continue
+ if name.startswith('_'): continue
+ if getattr(method, '__deprecated__', False): continue
- getargspec = inspect.getfullargspec
+ if sys.version_info[0] >= 3:
+ getargspec = inspect.getfullargspec
+ else:
+ getargspec = inspect.getargspec
args, varargs, varkw, defaults = getargspec(method)[:4]
- if (
- args
- and args[0] == "self"
- and (defaults is None or len(args) > len(defaults))
- ):
+ if (args and args[0]=='self' and
+ (defaults is None or len(args)>len(defaults))):
args = args[1:]
- name = "%s.%s" % (selfname, name)
- argspec = inspect.formatargspec(args, varargs, varkw, defaults)
- print(
- textwrap.fill(
- "%s%s" % (name, argspec),
- initial_indent=" - ",
- subsequent_indent=" " * (len(name) + 5),
- )
- )
-
+ name = '%s.%s' % (selfname, name)
+ argspec = inspect.formatargspec(
+ args, varargs, varkw, defaults)
+ print(textwrap.fill('%s%s' % (name, argspec),
+ initial_indent=' - ',
+ subsequent_indent=' '*(len(name)+5)))
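A small sketch of what usage() prints; the Greeter class here is made up for the example:

    >>> from nltk.util import usage
    >>> class Greeter(object):
    ...     def greet(self, name, polite=True):
    ...         pass
    >>> usage(Greeter)
    Greeter supports the following operations:
     - self.greet(name, polite=True)
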
##########################################################################
# IDLE
##########################################################################
-
def in_idle():
"""
Return True if this function is run within idle. Tkinter
:rtype: bool
"""
import sys
-
- return sys.stdin.__class__.__name__ in ("PyShell", "RPCProxy")
-
+ return sys.stdin.__class__.__name__ in ('PyShell', 'RPCProxy')
##########################################################################
# PRETTY PRINTING
##########################################################################
-
def pr(data, start=0, end=None):
"""
Pretty print a sequence of data items
"""
pprint(list(islice(data, start, end)))
-
def print_string(s, width=70):
"""
Pretty print a string, breaking lines on whitespace
:param width: the display width
:type width: int
"""
- print("\n".join(textwrap.wrap(s, width=width)))
-
+ print('\n'.join(textwrap.wrap(s, width=width)))
def tokenwrap(tokens, separator=" ", width=70):
"""
:param width: the display width (default=70)
:type width: int
"""
- return "\n".join(textwrap.wrap(separator.join(tokens), width=width))
+ return '\n'.join(textwrap.wrap(separator.join(tokens), width=width))
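For example:

    >>> from nltk.util import tokenwrap
    >>> print(tokenwrap(['the', 'quick', 'brown', 'fox'], width=10))
    the quick
    brown fox
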
##########################################################################
# Python version
##########################################################################
-
def py25():
return version_info[0] == 2 and version_info[1] == 5
-
-
def py26():
return version_info[0] == 2 and version_info[1] == 6
-
-
def py27():
return version_info[0] == 2 and version_info[1] == 7
# Indexing
##########################################################################
-
class Index(defaultdict):
+
def __init__(self, pairs):
defaultdict.__init__(self, list)
for key, value in pairs:
## Regexp display (thanks to David Mertz)
######################################################################
-
def re_show(regexp, string, left="{", right="}"):
"""
Return a string with markers surrounding the matched substrings.
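Illustrative behaviour (note that in the implementation, elided here, the marked-up string is printed rather than returned):

    >>> from nltk.util import re_show
    >>> re_show('[0-9]+', 'abc123def456')
    abc{123}def{456}
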
# recipe from David Mertz
def filestring(f):
- if hasattr(f, "read"):
+ if hasattr(f, 'read'):
return f.read()
- elif isinstance(f, str):
- with open(f, "r") as infile:
+ elif isinstance(f, string_types):
+ with open(f, 'r') as infile:
return infile.read()
else:
raise ValueError("Must be called with a filename or file-like object")
-
##########################################################################
# Breadth-First Search
##########################################################################
-
def breadth_first(tree, children=iter, maxdepth=-1):
"""Traverse the nodes of a tree in breadth-first order.
(No need to check for cycles.)
except TypeError:
pass
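Given the queue-based traversal (body elided above), nested lists illustrate the order in which nodes are yielded:

    >>> from nltk.util import breadth_first
    >>> list(breadth_first([[1, 2], [3]]))
    [[[1, 2], [3]], [1, 2], [3], 1, 2, 3]
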
-
##########################################################################
# Guess Character Encoding
##########################################################################
# adapted from io.py in the docutils extension module (http://docutils.sourceforge.net)
# http://www.pyzine.com/Issue008/Section_Articles/article_Encodings.html
-
def guess_encoding(data):
"""
Given a byte string, attempt to decode it.
"""
successful_encoding = None
# we make 'utf-8' the first encoding
- encodings = ["utf-8"]
+ encodings = ['utf-8']
#
# next we add anything we can learn from the locale
try:
pass
#
# we try 'latin-1' last
- encodings.append("latin-1")
+ encodings.append('latin-1')
for enc in encodings:
# some of the locale calls
# may have returned None
if not enc:
continue
try:
- decoded = str(data, enc)
+ decoded = text_type(data, enc)
successful_encoding = enc
except (UnicodeError, LookupError):
else:
break
if not successful_encoding:
- raise UnicodeError(
- "Unable to decode input data. "
- "Tried the following encodings: %s."
- % ", ".join([repr(enc) for enc in encodings if enc])
- )
+ raise UnicodeError(
+ 'Unable to decode input data. Tried the following encodings: %s.'
+ % ', '.join([repr(enc) for enc in encodings if enc]))
else:
- return (decoded, successful_encoding)
+ return (decoded, successful_encoding)
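A small sketch of the fallback chain, using the UTF-8 bytes for "café":

    >>> from nltk.util import guess_encoding
    >>> text, enc = guess_encoding(b'caf\xc3\xa9')
    >>> enc
    'utf-8'
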
##########################################################################
# Remove repeated elements from a list deterministically
##########################################################################
-
def unique_list(xs):
seen = set()
# seen.add(x) always returns None, so "not seen.add(x)" is always True; this updates the seen set inside the comprehension without a separate if statement.
return [x for x in xs if x not in seen and not seen.add(x)]
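For example:

    >>> unique_list([1, 3, 1, 2, 3])
    [1, 3, 2]
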
-
##########################################################################
# Invert a dictionary
##########################################################################
-
def invert_dict(d):
inverted_dict = defaultdict(list)
for key in d:
- if hasattr(d[key], "__iter__"):
+ if hasattr(d[key], '__iter__'):
for term in d[key]:
inverted_dict[term].append(key)
else:
# The graph is represented as a dictionary of sets
##########################################################################
-
def transitive_closure(graph, reflexive=False):
"""
Calculate the transitive closure of a directed graph,
return inverted
+
##########################################################################
# HTML Cleaning
##########################################################################
-
def clean_html(html):
- raise NotImplementedError(
- "To remove HTML markup, use BeautifulSoup's get_text() function"
- )
-
+ raise NotImplementedError ("To remove HTML markup, use BeautifulSoup's get_text() function")
def clean_url(url):
- raise NotImplementedError(
- "To remove HTML markup, use BeautifulSoup's get_text() function"
- )
-
+ raise NotImplementedError ("To remove HTML markup, use BeautifulSoup's get_text() function")
##########################################################################
# FLATTEN LISTS
##########################################################################
-
def flatten(*args):
"""
Flatten a list.
x = []
for l in args:
- if not isinstance(l, (list, tuple)):
- l = [l]
+ if not isinstance(l, (list, tuple)): l = [l]
for item in l:
if isinstance(item, (list, tuple)):
x.extend(flatten(item))
x.append(item)
return x
-
##########################################################################
# Ngram iteration
##########################################################################
-
-def pad_sequence(
- sequence,
- n,
- pad_left=False,
- pad_right=False,
- left_pad_symbol=None,
- right_pad_symbol=None,
-):
+def pad_sequence(sequence, n, pad_left=False, pad_right=False,
+ left_pad_symbol=None, right_pad_symbol=None):
"""
Returns a padded sequence of items before ngram extraction.
"""
sequence = iter(sequence)
if pad_left:
- sequence = chain((left_pad_symbol,) * (n - 1), sequence)
+ sequence = chain((left_pad_symbol,) * (n-1), sequence)
if pad_right:
- sequence = chain(sequence, (right_pad_symbol,) * (n - 1))
+ sequence = chain(sequence, (right_pad_symbol,) * (n-1))
return sequence
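For example, padding a sequence for bigram extraction (n=2 adds one symbol on each requested side):

    >>> list(pad_sequence([1, 2, 3], 2, pad_left=True, pad_right=True,
    ...                   left_pad_symbol='<s>', right_pad_symbol='</s>'))
    ['<s>', 1, 2, 3, '</s>']
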
-
# add a flag to pad the sequence so we get peripheral ngrams?
-
-def ngrams(
- sequence,
- n,
- pad_left=False,
- pad_right=False,
- left_pad_symbol=None,
- right_pad_symbol=None,
-):
+def ngrams(sequence, n, pad_left=False, pad_right=False,
+ left_pad_symbol=None, right_pad_symbol=None):
"""
Return the ngrams generated from a sequence of items, as an iterator.
For example:
:type right_pad_symbol: any
:rtype: sequence or iter
"""
- sequence = pad_sequence(
- sequence, n, pad_left, pad_right, left_pad_symbol, right_pad_symbol
- )
+ sequence = pad_sequence(sequence, n, pad_left, pad_right,
+ left_pad_symbol, right_pad_symbol)
history = []
while n > 1:
- # PEP 479, prevent RuntimeError from being raised when StopIteration bubbles out of generator
- try:
- next_item = next(sequence)
- except StopIteration:
- # no more data, terminate the generator
- return
- history.append(next_item)
+ history.append(next(sequence))
n -= 1
for item in sequence:
history.append(item)
yield tuple(history)
del history[0]
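For example:

    >>> list(ngrams([1, 2, 3, 4, 5], 3))
    [(1, 2, 3), (2, 3, 4), (3, 4, 5)]
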
-
def bigrams(sequence, **kwargs):
"""
Return the bigrams generated from a sequence of items, as an iterator.
for item in ngrams(sequence, 2, **kwargs):
yield item
-
def trigrams(sequence, **kwargs):
"""
Return the trigrams generated from a sequence of items, as an iterator.
for item in ngrams(sequence, 3, **kwargs):
yield item
-
def everygrams(sequence, min_len=1, max_len=-1, **kwargs):
"""
Returns all possible ngrams generated from a sequence of items, as an iterator.
if max_len == -1:
max_len = len(sequence)
- for n in range(min_len, max_len + 1):
+ for n in range(min_len, max_len+1):
for ng in ngrams(sequence, n, **kwargs):
yield ng
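With the outer loop over n shown above, the shorter ngrams come first:

    >>> list(everygrams([1, 2, 3], max_len=2))
    [(1,), (2,), (3,), (1, 2), (2, 3)]
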
-
def skipgrams(sequence, n, k, **kwargs):
"""
Returns all possible skipgrams generated from a sequence of items, as an iterator.
"""
# Pads the sequence as desired by **kwargs.
- if "pad_left" in kwargs or "pad_right" in kwargs:
+ if 'pad_left' in kwargs or 'pad_right' in kwargs:
sequence = pad_sequence(sequence, n, **kwargs)
# Note when iterating through the ngrams, the pad_right here is not
continue
yield head + skip_tail
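For example, bigrams with up to one skipped item (n=2, k=1):

    >>> list(skipgrams([1, 2, 3, 4], 2, 1))
    [(1, 2), (1, 3), (2, 3), (2, 4), (3, 4)]
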
-
######################################################################
# Binary Search in a File
######################################################################
:param key: the identifier we are searching for.
"""
- key = key + " "
+ key = key + ' '
keylen = len(key)
start = 0
currentDepth = 0
- if hasattr(file, "name"):
+ if hasattr(file, 'name'):
end = os.stat(file.name).st_size - 1
else:
file.seek(0, 2)
while True:
file.seek(max(0, middle - 1))
if middle > 0:
- file.discard_line()
+ file.readline()
offset = file.tell()
line = file.readline()
- if line != "":
- break
+ if line != "": break
# at EOF; try to find start of the last line
- middle = (start + middle) // 2
- if middle == end - 1:
+ middle = (start + middle)//2
+ if middle == end -1:
return None
if currentDepth < cacheDepth:
cache[middle] = (offset, line)
return None
-
######################################################################
# Proxy configuration
######################################################################
-
-def set_proxy(proxy, user=None, password=""):
+def set_proxy(proxy, user=None, password=''):
"""
Set the HTTP proxy for Python to download through.
authentication.
:param password: The password to authenticate with.
"""
+ from nltk import compat
+
if proxy is None:
# Try and find the system proxy settings
try:
- proxy = getproxies()["http"]
+ proxy = getproxies()['http']
except KeyError:
- raise ValueError("Could not detect default proxy settings")
+ raise ValueError('Could not detect default proxy settings')
# Set up the proxy handler
- proxy_handler = ProxyHandler({"https": proxy, "http": proxy})
+ proxy_handler = ProxyHandler({'https': proxy, 'http': proxy})
opener = build_opener(proxy_handler)
if user is not None:
# Set up basic proxy authentication if provided
password_manager = HTTPPasswordMgrWithDefaultRealm()
- password_manager.add_password(realm=None, uri=proxy, user=user, passwd=password)
+ password_manager.add_password(realm=None, uri=proxy, user=user,
+ passwd=password)
opener.add_handler(ProxyBasicAuthHandler(password_manager))
opener.add_handler(ProxyDigestAuthHandler(password_manager))
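A usage sketch (the proxy host and credentials are hypothetical); once the opener is installed, subsequent urllib-based downloads such as nltk.download() go through the proxy:

    >>> from nltk import set_proxy
    >>> set_proxy('http://proxy.example.com:3128', user='jdoe', password='s3cret')
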
:return: Contents of elem indented to reflect its structure
"""
- i = "\n" + level * " "
+ i = "\n" + level*" "
if len(elem):
if not elem.text or not elem.text.strip():
elem.text = i + " "
for elem in elem:
- elementtree_indent(elem, level + 1)
+ elementtree_indent(elem, level+1)
if not elem.tail or not elem.tail.strip():
elem.tail = i
else:
if level and (not elem.tail or not elem.tail.strip()):
elem.tail = i
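The recursion above mutates the tree in place, inserting newline-and-indent text so that serialization is human-readable. A sketch under Python 2, where ElementTree.tostring() returns a str:

    >>> from xml.etree import ElementTree
    >>> from nltk.util import elementtree_indent
    >>> elem = ElementTree.fromstring('<a><b>text</b></a>')
    >>> elementtree_indent(elem)
    >>> print(ElementTree.tostring(elem))
    <a>
      <b>text</b>
    </a>
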
-
######################################################################
# Mathematical approximations
######################################################################
-
def choose(n, k):
"""
This function is a fast way to calculate binomial coefficients, commonly
return ntok // ktok
else:
return 0
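For example:

    >>> from nltk.util import choose
    >>> choose(4, 2)
    6
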
-
-
-######################################################################
-# Iteration utilities
-######################################################################
-
-
-def pairwise(iterable):
- """s -> (s0,s1), (s1,s2), (s2, s3), ..."""
- a, b = tee(iterable)
- next(b, None)
- return zip(a, b)
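For reference, the helper removed above behaves as its docstring describes:

    >>> list(pairwise([1, 2, 3, 4]))
    [(1, 2), (2, 3), (3, 4)]
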
-
-######################################################################
-# Parallelization.
-######################################################################
-
-
-def parallelize_preprocess(func, iterator, processes, progress_bar=False):
- from tqdm import tqdm
- from joblib import Parallel, delayed
-
- iterator = tqdm(iterator) if progress_bar else iterator
- if processes <= 1:
- return map(func, iterator)
- return Parallel(n_jobs=processes)(delayed(func)(line) for line in iterator)
# Authors: Liling Tan <alvations@gmail.com>,
# Dmitrijs Milajevs <dimazest@gmail.com>
#
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""Return a synset for an ambiguous word in a context.
:param iter context_sentence: The context sentence where the ambiguous word
- occurs, passed as an iterable of words.
+ occurs, passed as an iterable of words.
:param str ambiguous_word: The ambiguous word that requires WSD.
:param str pos: A specified Part-of-Speech (POS).
:param iter synsets: Possible synsets of the ambiguous word.
)
return sense
+
+
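A usage sketch for the disambiguation function above (in the upstream source this is nltk.wsd.lesk; the example requires the WordNet data to be installed):

    >>> from nltk.wsd import lesk
    >>> sent = ['I', 'went', 'to', 'the', 'bank', 'to', 'deposit', 'money', '.']
    >>> print(lesk(sent, 'bank', 'n'))
    Synset('savings_bank.n.02')
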
%attr(755,root,root) %{_app_bin_dir}/org.tizen.nlp.service
%{_app_bin_dir}/*
%{TZ_SYS_RO_PACKAGES}/org.tizen.nlp.service.xml
-%{_libdir}/python3.7/site-packages/langdetect/*
-%{_libdir}/python3.7/site-packages/nltk/*
+%{_libdir}/python2.7/site-packages/langdetect/*
+%{_libdir}/python2.7/site-packages/nltk/*
%license LICENSE
%files data-en