Essa página contém links, códigos de referência e arquivos de dados utilizados no projeto de monografia homônimo. Dúvidas, críticas ou quaisquer sugestões podem ser encaminhadas para Caio Begotti, e-mail caio1982 arroba gmail ponto com. Pretende-se criar algum tipo de interface online para consultas e manipulações dos corpora, porém no momento eles estão disponíveis somente em forma estática (todos em domínio público), abaixo.
Todos os corpora de Cícero abaixo estão em formato XML com codificação em UTF-8. Embora tenham sido utilizados com o NLTK, eles precisam de um carregamento específico pois são XML categorizados. Uma classe para carregamento deles no NLTK está na seção de códigos.
Download dos 75 corpora de Cícero em XML, categorizados (3.0M).
Download dos arquivos de stopwords, individuais e combinados (4.0K).
Todos os códigos estão sob domínio público e podem ser baixados em um pacote único (12K).
# http://stackoverflow.com/questions/6849600/does-anyone-have-a-categorized-xml-corpus-reader-for-nltk
# standard nltk classes
from nltk.corpus.reader import CategorizedCorpusReader
from nltk.corpus.reader import XMLCorpusReader
# stopwords (i.e. latin ones)
from nltk.corpus import stopwords
# for CategorizedCorpusReader's init
from nltk.compat import defaultdict
# punctuations load
import string
class MyCategorizedCorpusReader(CategorizedCorpusReader):
    """Categorized corpus reader whose category-mapping-file branch skips
    nltk's fileids() sanity check (https://github.com/nltk/nltk/issues/250).

    This is a copy of nltk's CategorizedCorpusReader._init with the
    ValueError on unknown file ids commented out, so XML corpora whose ids
    differ from the mapping file can still be loaded.
    """

    def _init(self):
        # needed by the self._pattern branch below; the original script
        # never imported it, which raised NameError as soon as a pattern
        # was used
        import re

        # file-id -> categories and category -> file-ids indexes
        self._f2c = defaultdict(set)
        self._c2f = defaultdict(set)
        if self._pattern is not None:
            # derive each file's category from a regexp group over its id
            for file_id in self._fileids:
                category = re.match(self._pattern, file_id).group(1)
                self._add(file_id, category)
        elif self._map is not None:
            # explicit {file_id: [categories]} mapping
            for (file_id, categories) in self._map.items():
                for category in categories:
                    self._add(file_id, category)
        elif self._file is not None:
            # one "file_id<delim>cat1<delim>cat2..." line per corpus file
            for line in self.open(self._file).readlines():
                line = line.strip()
                file_id, categories = line.split(self._delimiter, 1)
                # https://github.com/nltk/nltk/issues/250
                #if file_id not in self.fileids():
                #    raise ValueError('In category mapping file %s: %s '
                #                     'not found' % (self._file, file_id))
                for category in categories.split(self._delimiter):
                    self._add(file_id, category)
class CategorizedXMLCorpusReader(MyCategorizedCorpusReader, XMLCorpusReader):
    """XML corpus reader with category support, mirroring nltk's other
    Categorized*CorpusReader classes."""

    def __init__(self, *args, **kwargs):
        # CategorizedCorpusReader pops its own options (cat_file, ...)
        # out of the kwargs dict before XMLCorpusReader sees them
        MyCategorizedCorpusReader.__init__(self, kwargs)
        XMLCorpusReader.__init__(self, *args, **kwargs)

    def _resolve(self, fileids, categories):
        # exactly one of fileids/categories may be given; a category
        # selection is translated into its list of file ids
        if fileids is not None and categories is not None:
            raise ValueError('Specify fileids or categories, not both')
        if categories is not None:
            return self.fileids(categories)
        else:
            return fileids

    def raw(self, fileids=None, categories=None):
        return XMLCorpusReader.raw(self, self._resolve(fileids, categories))

    def words(self, fileids=None, categories=None):
        # concatenation of the word lists of every selected file
        words = []
        fileids = self._resolve(fileids, categories)
        for fileid in fileids:
            words += XMLCorpusReader.words(self, fileid)
        return words

    def text(self, fileids=None, categories=None):
        # text content of every XML node of every selected file, joined
        fileids = self._resolve(fileids, categories)
        text = ""
        for fileid in fileids:
            for i in self.xml(fileid).getiterator():
                if i.text:
                    text += i.text
        return text

    def sents(self, fileids=None, categories=None):
        # the original referenced the 'nltk' module without importing it,
        # raising NameError on the first sents() call
        import nltk
        # NOTE(review): words() returns a list while
        # PunktSentenceTokenizer.tokenize expects a string -- confirm
        # whether text() was intended here
        text = self.words(fileids, categories)
        sents = nltk.PunktSentenceTokenizer().tokenize(text)
        return sents

    def paras(self, fileids=None, categories=None):
        # NOTE(review): CategorizedCorpusReader does not define paras();
        # this delegation may raise AttributeError -- confirm
        return CategorizedCorpusReader.paras(self, self._resolve(fileids, categories))
def stopless(wordslist):
    """Return wordslist with every latin stopword filtered out."""
    stoplist = stopwords.words('latin')
    return [word for word in wordslist if word not in stoplist]
def punctless(wordslist):
    """Keep only fully alphabetic tokens from wordslist, dropping any
    punctuation marks, and return the survivors encoded as UTF-8."""
    # ASCII punctuation plus typographic marks found in the PHI texts
    extra = (u'\u00a7'   # SECTION SIGN
             u'\u00b3'   # SUPERSCRIPT THREE
             u'\u00b2'   # SUPERSCRIPT TWO
             u'\u00b7'   # MIDDLE DOT
             u'\u00b9'   # SUPERSCRIPT ONE
             u'\u2014'   # EM DASH
             u'\u2019'   # RIGHT SINGLE QUOTATION MARK
             u'\u2020'   # DAGGER
             u'\u2184'   # LATIN SMALL LETTER REVERSED C
             u'\u221e'   # INFINITY
             u'\u23d1')  # METRICAL BREVE
    punctuation = list(string.punctuation + extra)
    # first pass: only purely alphabetic tokens survive
    alphabetic = [token for token in wordslist if token.isalpha()]
    # second pass: drop punctuation characters and encode the rest
    return [token.encode('utf-8', 'replace')
            for token in alphabetic if token not in punctuation]
# built once at import time so ciceroabbr() does not rebuild the
# 75-entry dict literal on every call
_CICERO_ABBREVIATIONS = {
    'cicero_academica.xml': 'Ac',
    'cicero_arati_phaenomena.xml': 'AratPhaen',
    'cicero_arati_prognostica.xml': 'AratProgn',
    'cicero_brutus.xml': 'Brut',
    'cicero_carmina_fragmenta.xml': 'CarFrr',
    'cicero_cato_maior_de_senectute.xml': 'Sen',
    'cicero_commentarii_causarum.xml': 'CommCaus',
    'cicero_de_divinatione.xml': 'Div',
    'cicero_de_domo_sua.xml': 'Dom',
    'cicero_de_fato.xml': 'Fat',
    'cicero_de_finibus.xml': 'Fin',
    'cicero_de_haruspicum_responso.xml': 'Har',
    'cicero_de_inventione.xml': 'Inv',
    'cicero_de_iure_civ_in_artem_redig.xml': 'IurCiv',
    'cicero_de_lege_agraria.xml': 'Agr',
    'cicero_de_legibus.xml': 'Leg',
    'cicero_de_natura_deorum.xml': 'ND',
    'cicero_de_officiis.xml': 'Off',
    'cicero_de_optimo_genere_oratorum.xml': 'OptGen',
    'cicero_de_oratore.xml': 'DeOrat',
    'cicero_de_partitione_oratoria.xml': 'Part',
    'cicero_de_provinciis_consularibus.xml': 'Prov',
    'cicero_de_republica.xml': 'Rep',
    'cicero_epistula_ad_octavianum_sp.xml': 'EpOct',
    'cicero_epistulae_ad_atticum.xml': 'Att',
    'cicero_epistulae_ad_brutum.xml': 'AdBrut',
    'cicero_epistulae_ad_familiares.xml': 'Fam',
    'cicero_epistulae_ad_quintum_fratrem.xml': 'Qfr',
    'cicero_epistulae_fragmenta.xml': 'EpFrr',
    'cicero_facete_dicta.xml': 'Facet',
    'cicero_hortensius.xml': 'Hort',
    'cicero_in_catilinam.xml': 'Catil',
    'cicero_in_pisonem.xml': 'Pis',
    'cicero_in_q_caecilium.xml': 'DivCaec',
    'cicero_in_sallustium_sp.xml': 'Sal',
    'cicero_in_vatinium.xml': 'Vat',
    'cicero_in_verrem.xml': 'Ver',
    'cicero_incertorum_librorum_fragmenta.xml': 'LibFrr',
    'cicero_laelius_de_amicitia.xml': 'Amic',
    'cicero_lucullus.xml': 'Luc',
    'cicero_orationum_deperditarum_frr.xml': 'DepFrr',
    'cicero_orationum_incertarum_frr.xml': 'IncFrr',
    'cicero_orator.xml': 'Orat',
    'cicero_paradoxa_stoicorum.xml': 'Parad',
    'cicero_philippicae.xml': 'Phil',
    'cicero_philosophicorum_librorum_frr.xml': 'PhilFrr',
    'cicero_post_reditum_ad_populum.xml': 'RedPop',
    'cicero_post_reditum_in_senatu.xml': 'RedSen',
    'cicero_pro_archia.xml': 'Arch',
    'cicero_pro_balbo.xml': 'Balb',
    'cicero_pro_caecina.xml': 'Caec',
    'cicero_pro_caelio.xml': 'Cael',
    'cicero_pro_cluentio.xml': 'Clu',
    'cicero_pro_flacco.xml': 'Flac',
    'cicero_pro_fonteio.xml': 'Font',
    'cicero_pro_lege_manilia.xml': 'Man',
    'cicero_pro_ligario.xml': 'Lig',
    'cicero_pro_marcello.xml': 'Marc',
    'cicero_pro_milone.xml': 'Mil',
    'cicero_pro_murena.xml': 'Mur',
    'cicero_pro_plancio.xml': 'Planc',
    'cicero_pro_q_roscio_comoedo.xml': 'QRosc',
    'cicero_pro_quinctio.xml': 'Quinct',
    'cicero_pro_rabirio_perduellionis_reo.xml': 'RabPerd',
    'cicero_pro_rabirio_postumo.xml': 'RabPost',
    'cicero_pro_rege_deiotaro.xml': 'Deiot',
    'cicero_pro_s_roscio_amerino.xml': 'SRosc',
    'cicero_pro_scauro.xml': 'Scaur',
    'cicero_pro_sestio.xml': 'Sest',
    'cicero_pro_sulla.xml': 'Sul',
    'cicero_pro_tullio.xml': 'Tul',
    'cicero_rhetorica_ad_herennium_sp.xml': 'RhetHer',
    'cicero_timaeus.xml': 'Tim',
    'cicero_topica.xml': 'Top',
    'cicero_tusculanae_disputationes.xml': 'Tusc',
}

def ciceroabbr(filename):
    """Return the conventional abbreviation for one of Cicero's works
    given its corpus filename, or the generic 'Cic' when unknown."""
    return _CICERO_ABBREVIATIONS.get(filename, 'Cic')
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# caio begotti <caio1982@gmail.com>
# this is under public domain
import glob
import optparse
import string
from CatXMLReader import CategorizedXMLCorpusReader
from CatXMLReader import ciceroabbr
from nltk.corpus import cicero
from nltk import ConcordanceIndex
from nltk import Text
# command line interface: term lookup, context width, match count, and
# whether the spurious (non-ciceronian) texts should be searched as well
parser = optparse.OptionParser("Usage: %prog [options]")
parser.add_option("-l", "--lookup", type="string", dest="term",
                  help="look up concordances for a word")
parser.add_option("-f", "--fake", action="store_true", dest="fake",
                  default=False, help="considers non-ciceronian texts")
parser.add_option("-w", "--width", type="int", dest="width",
                  default=150, help="width of the context data")
parser.add_option("-c", "--count", type="int", dest="count",
                  default=1, help="how many matches to display")
parser.add_option("-v", "--verbose", action="store_true", dest="verbose",
                  default=False, help="print headers or stats")
(options, args) = parser.parse_args()
# a search term is mandatory
if options.term is None:
    parser.print_help()
    exit(-1)
# ANSI escape sequences used to colorize the concordance output
reset = '\033[1;m'
red = '\033[1;31m'
green = '\033[1;32m'
yellow = '\033[1;33m'
blue = '\033[1;34m'
class MyText(Text):
    # nltk.Text subclass that prints concordances through a customized
    # index (MyConcordanceIndex) instead of Text's own formatting
    def search(self, corpus, word, width, lines):
        # NOTE(review): concordance() below is declared as
        # (corpus, word, width, lines) but is called here as
        # (word, width, lines, corpus) -- one of the two orders is
        # wrong; confirm against the caller at the bottom of the script
        res = self.concordance(word, width, lines, corpus)
        # NOTE(review): concordance() has no return statement, so res
        # is always None and this print never runs
        if res is not None:
            print res
    def concordance(self, corpus, word, width=150, lines=1):
        # the concordance index is built lazily on first use
        if '_concordance_index' not in self.__dict__:
            if options.verbose is True:
                print "\nBuilding index..."
            # case-insensitive index over the text's tokens
            self._concordance_index = MyConcordanceIndex(self.tokens, key=lambda s:s.lower())
        # NOTE(review): print_concordance is declared as
        # (corpus, word, width, lines) but called as
        # (width, lines, corpus, word) -- confirm intended order
        self._concordance_index.print_concordance(width, lines, corpus, word)
class MyConcordanceIndex(ConcordanceIndex):
    # nltk.ConcordanceIndex subclass that prefixes every match with the
    # colorized abbreviation of the ciceronian work it came from
    def print_concordance(self, corpus, word, width=150, lines=1):
        # NOTE(review): the call site in MyText.concordance passes
        # (width, lines, corpus, word) positionally, which does not match
        # this signature -- confirm which order is intended
        # space available on each side of the matched word
        # (integer division under python 2)
        half_width = (width - len(word) - 2) / 2
        # number of context tokens taken from each side
        context = width/4
        offsets = self.offsets(word)
        if offsets:
            lines = min(lines, len(offsets))
            if options.verbose is True:
                print "Displaying %s of %s matches:" % (lines, len(offsets))
            for i in offsets:
                if lines <= 0:
                    break
                # left context is padded then right-trimmed so the match
                # column lines up across rows
                left = (' ' * half_width +
                        ' '.join(self._tokens[i-context:i]))
                right = ' '.join(self._tokens[i+1:i+context])
                left = left[-half_width:]
                right = right[:half_width]
                # colorized [Abbr] tag for the work the match belongs to
                abbr = ciceroabbr(corpus)
                abbrinfo = '[' + abbr + ']'
                abbrinfo = abbrinfo.center(12, ' ').replace(abbr, green + abbr + reset)
                print abbrinfo + '[' + left, yellow + self._tokens[i] + reset, right + ']'
                lines -= 1
        else:
            if options.verbose is True:
                print "No matches found for " + word + " in " + corpus
                #exit(-1)
def corpora_loader(corpus, fake):
    # loads a single corpus file as an nltk Text
    # NOTE(review): the fake parameter is accepted but never used here;
    # spurious texts are filtered out by the caller instead
    reader = CategorizedXMLCorpusReader(cicero.root,
                                        cicero.abspaths(),
                                        cat_file='categories.txt')
    data = Text(reader.words([corpus]))
    return data
if __name__ == "__main__":
    # searches every ciceronian corpus for the requested term, skipping
    # the spurious texts unless -f/--fake was given
    for corpus in cicero.fileids():
        if corpus in cicero.fileids(['spurious']) and options.fake is False:
            continue
        content = corpora_loader(corpus, fake=options.fake)
        text = MyText(content)
        # NOTE(review): MyText.search is declared as
        # (corpus, word, width, lines) but is called here as
        # (term, width, count, corpus) -- confirm intended order
        res = text.search(options.term,
                          options.width,
                          options.count,
                          corpus)
        # NOTE(review): search() never returns a value, so res is
        # always None and this print never runs
        if res is not None:
            print res
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# caio begotti <caio1982@gmail.com>
# this is under public domain
from CatXMLReader import CategorizedXMLCorpusReader
from CatXMLReader import stopless
from CatXMLReader import punctless
from nltk.corpus import cicero
from nltk import FreqDist
from nltk import Text
# experimental
from latin_lemmatizer import lemmatize
# fancy dictionary
from collections import defaultdict
# accumulated frequency of every lemma across all corpora
total = defaultdict(int)
for corpus in cicero.fileids():
    print corpus
    reader = CategorizedXMLCorpusReader(cicero.root,
                                        cicero.abspaths(),
                                        cat_file='categories.txt')
    try:
        # frequency distribution over the corpus without punctuation
        # or latin stopwords
        dist = FreqDist(Text(punctless(stopless(reader.words([corpus])))))
    except UnicodeEncodeError as e:
        # NOTE(review): break aborts ALL remaining corpora on the first
        # encoding error, not just the offending one -- confirm intent
        print str(e)
        break
    # surface form -> lemma, for this corpus only
    definitions = {}
    stat = reader.words([corpus])
    # lemmatize only the 1000 most frequent tokens (the lemmatizer is a
    # remote web service, so this bounds the number of requests)
    for item in dist.items()[:1000]:
        entry = item[0]
        # skip single-letter tokens
        if len(entry) >= 2:
            lemma = lemmatize(item[0])
            if lemma is not None:
                # NOTE(review): indentation was lost in this copy; the
                # reading below records the first surface form seen for
                # a lemma and accumulates counts for every form -- confirm
                if lemma not in total:
                    definitions[entry] = lemma
                num = dist[entry]
                total[lemma] += num
    #print sum(total.values()), len(stat)
    #print corpus + ': ' + ', '.join(sorted(definitions.values()))
# report lemmas by descending accumulated frequency, then the grand total
res = sorted(total.items(), key=lambda x: x[1], reverse=True)
for r in res:
    print str(r[1]) + ':' + r[0]
print sum(total.values())
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# caio begotti <caio1982@gmail.com>
# this is under public domain
# this is a python query script to interface with
# CHLT LEMLAT's web lemmatizer for latin:
# http://www.ilc.cnr.it/lemlat/lemlat/index.html
from sys import argv
from sys import exit
from lxml import etree
def lemmatize(term):
    """Query the CHLT LEMLAT web lemmatizer and return the first lemma
    found for the given latin term, or None when there is no match."""
    normalized = term.lower().strip()
    url = ('http://www.ilc.cnr.it/lemlat/cgi-bin/LemLat_cgi.cgi?World+Form='
           + normalized)
    # LEMLAT answers an HTML page; the lemma is the first <u> element
    tree = etree.parse(url, etree.HTMLParser())
    matches = tree.xpath('//u//text()')
    if matches and matches[0] is not None:
        return matches[0]
if __name__ == "__main__":
    # expects exactly one argument: the word to look up
    if len(argv) != 2:
        exit('Usage: ' + argv[0] + " 'latin word to lemmatize'")
    else:
        lemma = lemmatize(argv[1])
        if lemma is not None:
            print(lemma)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# caio begotti <caio1982@gmail.com>
# this is under public domain
import codecs
import glob
def normalizer():
    """Normalize every raw corpus file into ready/: lowercase the text,
    rejoin words hyphenated across line breaks, and fold the
    non-classical letters v and j into their latin ones u and i."""
    for path in glob.glob('raw/*.txt'):
        # open() instead of the python-2-only file() builtin; same
        # behavior, also works under python 3
        with open(path, 'r') as content:
            text = content.read().lower()
        # fixes the linebreaking of corpora
        text = text.replace("- ", "")
        # fixes unused letters for their real latin ones
        text = text.replace("v", "u")
        text = text.replace("j", "i")
        with open(path.replace('raw/', 'ready/'), 'w') as content:
            content.write(text)
if __name__ == "__main__":
    # batch-normalize every file under raw/ into ready/
    normalizer()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# caio begotti <caio1982@gmail.com>
# this is under public domain
# to add some sleep time between fetches
# and not hammer down the server
import time
# to avoid charsetting mess with UTF-8 strings
import codecs
import string
# to check if a text was already fetched
from os.path import exists
# document parser
from lxml import etree
# output generation
from elementtree.SimpleXMLWriter import XMLWriter
# (url, title) pairs discovered on the PHI author page
refs = []
# NOTE(review): urls is never filled or read in this script
urls = []
# the root directory of all latin texts in PHI
base = 'http://latin.packhum.org/'
# marcus tullius cicero entry in PHI
browse = base + 'author/474'
try:
    page = etree.parse(browse, etree.HTMLParser(encoding='utf-8'))
except Exception, err:
    # NOTE(review): page stays unbound if parsing fails, so the xpath
    # below would raise NameError -- confirm this is acceptable
    print 'Browse Error: ' + str(err)
# gets the list of texts by cicero currently in PHI
matches = page.xpath("//span[@class='wnam']//text()")
counter = 1
for entry in matches:
    # creates a reference list with download addresses for every text;
    # PHI numbers the texts sequentially in page order
    refs.append((base + 'dx/text/474/%s/' % str(counter), entry.lower()))
    counter += 1
for param in refs:
    source = param[0]
    title = param[1]
    # debug
    print '[%s]' % title
    # output filename derived from the title: spaces to underscores,
    # punctuation (except underscores) stripped
    filename = title.replace(' ', '_')
    for p in string.punctuation.replace('_',''):
        filename = filename.replace(p, '')
    w = XMLWriter('cicero_' + filename + '.xml', encoding='utf-8')
    xml = w.start("document")
    # metadata entries of the output files
    w.element("meta", name="author", value="marcus tullius cicero")
    w.element("meta", name="title", value=title)
    w.element("meta", name="source", value=source + '0')
    # upon checking it no text in PHI attributed to cicero
    # has more than 500 pages, so this is a safe download limit
    for x in range(0, 500):
        lines = []
        entry = []
        section = source + str(x)
        # NOTE(review): reference is computed but never used below
        reference = base + 'loc/474/' + str(x) + '/0'
        # debug
        print '\t<%s>' % section
        # output filename for this page's cached plain text
        path = 'ready/' + filename + '-' + str(x) + '.txt'
        if not exists(path):
            # fetches the current page
            try:
                page = etree.parse(section, etree.HTMLParser(encoding='utf-8'))
            except Exception, err:
                print 'Text Error: ' + str(err)
            # parses the page paragraphs
            try:
                entry = page.xpath("//tr/td[1]//text() | //h3//text()")
            except Exception, err:
                print 'Match Error: ' + str(err)
                # a priori this is not needed but it is helpful for debugging
                f = codecs.open("log.txt", "a", "utf8")
                f.write('\nMatch Error: ' + str(err) + ' [missing] ' + section)
                f.close()
            # checks if the end of text has been reached
            if 'No text' in entry:
                print 'EOF: ' + str(x)
                break
            # apparently PHI texts have double blank spaces indicating
            # new paragraphs
            empty = u'\xa0\xa0'
            if len(entry) > 0:
                for e in entry:
                    if e.startswith(empty):
                        lines.append(''.join(e.replace(empty,'')))
                    else:
                        lines.append(''.join(e))
                paragraph = ' '.join(lines)
                y = codecs.open(path, "w", "utf8")
                y.write(paragraph)
                # NOTE(review): bare attribute access, a no-op --
                # probably meant y.close(); as written the file is only
                # closed when the object is garbage-collected
                y.write
        else:
            # if text has been fetched ok, process it
            paragraph = codecs.open(path, "r", "utf8")
            strings = paragraph.read()
            # finally writes the new content to the corpus file
            # NOTE(review): indentation was lost in this copy; these
            # three writes are assumed to belong to the else branch
            # (pages reach the XML only on a re-run over cached files)
            # -- confirm against the original script
            w.start("page", id=str(x))
            w.element("paragraph", strings)
            w.end("page")
        # give the PHI server some time until the next fetch
        # time.sleep(5)
    # generates the output file
    w.close(xml)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# caio begotti <caio1982@gmail.com>
# this is under public domain
# reference: https://gist.github.com/2307114
# double-check: http://en.wiktionary.org/wiki/Appendix:Roman_praenomina
import codecs
import glob
import re
def parser():
    """Scan the raw corpus files for personal names introduced by a roman
    praenomen abbreviation (e.g. 'M. Tullius Cicero') and return them
    sorted and deduplicated."""
    # praenomen abbreviation, optional apostrophe, dot, then up to two
    # capitalized words
    regex = re.compile("(?:A|Ap|D|C|Cn|K|L|M|Mam|N|O|P|Q|Qu|S|Sp|Ser|Sex|Sec|Seq|Sept|T|Ti|Tit|Vel|Vo)'?\. [A-Z]{0,}\w{0,} [A-Z]{0,}\w{0,}")
    praenomina = []
    for path in glob.glob('raw/*.txt'):
        # open() instead of the python-2-only file() builtin
        with open(path, 'r') as content:
            text = content.read()
        for entry in regex.findall(text):
            praenomina.append(entry)
    return sorted(set(praenomina))
def replacer():
    """Print (and, once enabled, apply) the rewriting of every praenomen
    found by parser() into its parenthesized dot-free form, e.g.
    'M. Tullius Cicero' -> '(M) Tullius Cicero'."""
    # renamed from 'list', which shadowed the builtin
    names = parser()
    # captures the praenomen abbreviation before the first '. '
    regex = re.compile("^(.*)\. ")
    for path in glob.glob('ready/*.txt'):
        # open() instead of the python-2-only file() builtin
        with open(path, 'r') as content:
            text = content.read()
        # accumulate the substitutions; the original restarted from the
        # unmodified text on every iteration, so only the last name's
        # replacement survived
        replaced = text
        for entry in names:
            match = regex.search(entry).group(1)
            name = re.sub('^' + match, '(' + match + ')', entry)
            name = name.replace('.', '')
            replaced = re.sub(entry, name, replaced)
            # writing back is still disabled while the replacements are
            # being reviewed:
            #with open(path, 'w') as content:
            #    content.write(replaced)
            print(entry + ' -> ' + name)
if __name__ == "__main__":
    # print the planned praenomina replacements (writing is disabled)
    replacer()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# caio begotti <caio1982@gmail.com>
# this is under public domain
# reference: the schinke latin stemming algorithm in python
# http://snowball.tartarus.org/otherapps/schinke/intro.html
import sys
# latin words ending in -que that are complete words in their own right,
# whose enclitic must not be stripped (schinke stemmer, step 3)
que = ['atque', 'quoque', 'neque', 'itaque', 'absque', 'apsque', 'abusque',
       'adaeque', 'adusque', 'deniquep', 'deque', 'susque', 'oblique', 'peraeque',
       'plenisque', 'quandoque', 'quisque', 'quaequep', 'cuiusque', 'cuique',
       'quemque', 'quamque', 'quaque', 'quique', 'quorumque', 'quarumque',
       'quibusque', 'quosque', 'quasque', 'quotusquisque', 'quousque', 'ubique',
       'undique', 'usque', 'uterque', 'utique', 'utroque', 'utribique', 'torque',
       'coque', 'concoque', 'contorque', 'detorque', 'decoque', 'excoque',
       'extorque', 'obtorque', 'optorque', 'retorque', 'recoque', 'attorque',
       'incoque', 'intorque', 'praetorque']
# suffixes removed when producing the noun form (step 4)
noun_suffix = ['ibus', 'ius', 'ae', 'am', 'as', 'em', 'es', 'ia', 'is',
               'nt', 'os', 'ud', 'um', 'us', 'a', 'e', 'i', 'o', 'u']
# suffixes removed when producing the verb form (step 6)
verb_suffix = ['iuntur', 'beris', 'erunt', 'untur', 'iunt', 'mini', 'ntur',
               'stis', 'bor', 'ero', 'mur', 'mus', 'ris', 'sti', 'tis', 'tur', 'unt',
               'bo', 'ns', 'nt', 'ri', 'm', 'r', 's', 't']
# accumulators filled by stemmer() and zipped together on return:
# original tokens and their noun/verb stems
orig = []
nouns = []
verbs = []
# http://stackoverflow.com/questions/3411006/fastest-implementation-to-do-multiple-string-substitutions-in-python
# this is the multiple replacing algorithm proposed by matt anderson at stackoverflow in 2010
# it should perform faster than python's native replace method on huge corpora
def multi_replace(pairs, text):
    """Apply several (old, new) substitutions to text in one logical
    pass: text produced by an earlier substitution is never re-scanned
    by a later one (unlike chained str.replace calls)."""
    def substitute(remaining, fragments):
        # base case: no substitutions left, the fragments are final
        if not remaining:
            return fragments
        (old, new), rest = remaining[0], remaining[1:]
        # split each fragment on the current pattern, recurse into the
        # sub-fragments with the remaining pairs only, then rejoin with
        # the replacement text -- so 'new' is never matched again
        return [new.join(substitute(rest, fragment.split(old)))
                for fragment in fragments]
    return substitute(list(pairs), [text])[0]
def stemmer():
    # schinke latin stemmer over one token per stdin line; fills the
    # module-level orig/nouns/verbs lists and returns them zipped as
    # (original, noun stem, verb stem) triples
    # NOTE(review): indentation was lost in this copy of the code; the
    # bindings of the else branches below were reconstructed from the
    # schinke algorithm description -- confirm against the original
    for entry in sys.stdin.readlines():
        # step 2
        entry = multi_replace([('j', 'i'), ('v', 'u')], entry.replace('\n',''))
        # hackish buffer
        buffer = entry
        orig.append(buffer)
        # step 3
        if entry not in que:
            if entry.endswith('que'):
                entry = entry[:-3]
        else:
            # -que words from the exception list keep their form for
            # both the noun and the verb stem
            nouns.append(entry)
            verbs.append(entry)
        # step 4
        for s in noun_suffix:
            if entry.endswith(s):
                entry = entry[:-len(s)]
                break
        # step 5
        if len(entry) >= 2:
            nouns.append(entry)
        # step 6: verb ending groups; the last element of each group is
        # the normalized replacement for the others
        i = ['iuntur', 'erunt', 'untur', 'iunt', 'unt', 'i']
        bi = ['beris', 'bor', 'bo', 'bi']
        eri = ['ero', 'eri']
        # repeat removal of que for verbs
        if buffer not in que:
            if buffer.endswith('que'):
                buffer = buffer[:-3]
        else:
            nouns.append(buffer)
            verbs.append(buffer)
        endings = [i, bi, eri]
        # NOTE(review): 'list' shadows the builtin of the same name
        for list in endings:
            for item in list[:-1]:
                if buffer.endswith(item):
                    buffer = buffer.replace(item, list[-1])
                    break
            else:
                # no ending of this group matched: fall back to plain
                # verb suffix removal
                for v in verb_suffix:
                    if buffer.endswith(v):
                        buffer = buffer[:-len(v)]
                        break
        # step 7
        if len(buffer) >= 2:
            verbs.append(buffer)
    # NOTE(review): nouns/verbs can get out of step with orig (tokens may
    # be appended twice or not at all), so the zip may misalign -- confirm
    return zip(orig, nouns, verbs)
if __name__ == "__main__":
    # step 1
    results = stemmer()
    if results is not None:
        # one original:noun-stem:verb-stem line per input token
        for original, noun, verb in results:
            print("%s:%s:%s" % (original, noun, verb))
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# caio begotti <caio1982@gmail.com>
# this is under public domain
import glob
from nltk import Text
from nltk.tokenize import word_tokenize
from nltk.corpus import cicero
from CatXMLReader import CategorizedXMLCorpusReader
def stopwords():
    """Read the combined latin stopword list (one word per line) and
    return it with the newlines stripped."""
    words = []
    # open() instead of the python-2-only file() builtin
    with open('../../stopwords/latin', 'r') as content:
        for line in content.readlines():
            words.append(line.replace('\n', ''))
    return words
def tokenizer():
    """Load every ciceronian corpus file and return all its words
    wrapped in an nltk Text."""
    paths = cicero.abspaths()
    corpus_reader = CategorizedXMLCorpusReader('/', paths, cat_file='categories.txt')
    return Text(corpus_reader.words(paths))
# per-stopword occurrence counts over the whole ciceronian corpus,
# followed by the total share of stopwords in the text
matches = []
tokens = tokenizer()
for s in stopwords():
    counter = tokens.count(s)
    matches.append(counter)
    # share of this single stopword in the whole token stream
    percentage = (float(counter)/float(len(tokens)))*100
    print "%d\t%f\t%s" % (counter, percentage, s)
total_stat = (float(sum(matches))/float(len(tokens)))*100
print "stopwords: %d (%f percent)" % (sum(matches), total_stat)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# caio begotti <caio1982@gmail.com>
# this is under public domain
import string
import optparse
import pylab
from itertools import islice
from CatXMLReader import CategorizedXMLCorpusReader
from CatXMLReader import stopless
from CatXMLReader import punctless
from nltk.corpus import stopwords
from nltk.corpus import cicero
from nltk import FreqDist
from nltk import Text
# command line interface: stopword inclusion, plotting modes and limits
parser = optparse.OptionParser("Usage: %prog [options]")
parser.add_option("-s", "--stopwords", action="store_true", dest="stopwords",
                  default=False, help="include stopwords in the calculations")
parser.add_option("-p", "--plot", action="store_true", dest="plot",
                  default=False, help="plot the frequency distribution of terms")
parser.add_option("-z", "--zipf", action="store_true", dest="zipf",
                  default=False, help="plots a zipf's law log.log graph")
parser.add_option("-l", "--limit", type="int", dest="limit",
                  default=100, help="prints calculation of first (default: 100) terms")
parser.add_option("-c", "--count", type="int", dest="count",
                  default=100, help="shows only counts higher than (default: 100)")
(options, args) = parser.parse_args()
# all options have defaults, so no extra validation is needed here
#if options is None:
#    parser.print_help()
#    exit(-1)
class MyFreqDist(FreqDist):
    # FreqDist subclass whose plot() draws on an explicit matplotlib
    # figure so the chart can be labelled, saved as EPS and displayed
    def plot(self, *args, **kwargs):
        # no positional argument means "plot every sample"
        if len(args) == 0:
            args = [len(self)]
        samples = list(islice(self, *args))
        # consume our own keyword options so only matplotlib options are
        # forwarded to ax.plot below
        cumulative = _get_kwarg(kwargs, 'cumulative', False)
        if cumulative:
            freqs = list(self._cumulative_frequencies(samples))
        else:
            freqs = [self[sample] for sample in samples]
        fig = pylab.figure(figsize=(12.5, 5))
        ax = fig.add_subplot(1, 1, 1)
        if "title" in kwargs:
            ax.set_title(kwargs["title"])
            del kwargs["title"]
        if "xlabel" in kwargs:
            ax.set_xlabel(kwargs["xlabel"])
            del kwargs["xlabel"]
        if "ylabel" in kwargs:
            ax.set_ylabel(kwargs["ylabel"])
            del kwargs["ylabel"]
        # black crosses joined by a line
        ax.plot(freqs, 'k+-', **kwargs)
        ax.grid(True, color="silver")
        # one tick per sample, labelled with the term itself
        pylab.xticks(range(len(samples)), [str(s) for s in samples], rotation=90)
        pylab.tight_layout()
        pylab.savefig('word_frequency.eps', dpi=300)
        pylab.show()
def _get_kwarg(kwargs, key, default):
if key in kwargs:
arg = kwargs[key]
del kwargs[key]
else:
arg = default
return arg
# load every ciceronian corpus and strip punctuation (and, unless -s was
# given, the latin stopwords) before computing the distribution
categories = 'categories.txt'
reader = CategorizedXMLCorpusReader(cicero.root,
                                    cicero.abspaths(),
                                    cat_file=categories)
data = reader.words(cicero.fileids())
if options.stopwords is True:
    filtered = punctless(data)
else:
    filtered = punctless(stopless(data))
dist = MyFreqDist(Text(filtered))
if options.plot is True:
    # frequency chart of the first --limit terms
    dist.plot(options.limit,
              cumulative=False,
              title= u'Gráfico de frequência (' + str(options.limit) + ' termos)',
              ylabel=u'Ocorrências',
              xlabel=u'Termos')
else:
    # textual report instead of a chart
    # NOTE(review): 'lenght' is a typo in the output label
    print 'Data lenght: ' + str(len(data))
    print 'Filtered data: ' + str(len(filtered))
    print 'Distribution of: ' + str(len(dist))
    print '\nCOUNT\tP(%)\tTERM'
    total = len(dist.items())
    limit = options.limit
    # --limit 0 means "print everything"
    if limit == 0:
        limit = total
    for item in dist.items()[:limit]:
        # only terms with at least --count occurrences are shown
        if len(item[0]) >= 1 and item[1] >= options.count:
            percentage = dist.freq(item[0]) * 100
            percentage = '{0:.3}'.format(percentage)
            print '%d\t%s\t%s' % (item[1], percentage + '%', item[0])
if options.zipf is True:
    # rank/frequency log-log plot against the ideal zipf curve (dashed)
    ranks = []
    freqs = []
    for rank, word in enumerate(dist):
        ranks.append(rank+1)
        freqs.append(dist[word])
    fig = pylab.figure(figsize=(7.5, 5))
    ax = fig.add_subplot(1, 1, 1)
    ax.loglog(ranks, freqs, 'k-')
    ax.loglog(range(1, len(freqs)+1), [len(freqs)/x for x in range(1, len(freqs)+1)], 'k--')
    ax.grid(True, color="silver")
    ax.set_title(u'Lei de Zipf (' + str(len(dist.items())) + ' termos)')
    ax.set_ylabel(u'Frequência')
    ax.set_xlabel(u'Ordem')
    pylab.tight_layout()
    pylab.savefig('word_frequency_zipf.eps', dpi=300)
    pylab.show()
As ferramentas online para a manipulação dos corpora serão desenvolvidas em breve.
Léxico de latim clássico para aprendizado, de acordo com os métodos apresentados no projeto:
dico | uerbum | quantus | oportet | lex | anima |
possum | populus | suis | auctoritas | modius | liber |
uideo | uita | ciuis | quaero | ars | exercitus |
homo | animus | suo | facilis | arbitror | reliquus |
facio | tuus | suus | prouincia | quin | praetor |
causa | scribo | sententia | studium | genus | honor |
habeo | multus | semper | loco | uirus | diligo |
uolo | lego | nullus | intellego | plus | bene |
iudex | consilium | ceterus | corpus | periculum | bellum |
tantus | ius | saepe | dignitas | optimus | sentio |
bonus | oratio | maior | pater | soleo | domus |
ratio | inquam | uir | dolor | scriba | sapio |
primus | dies | totus | scio | opus | rego |
publica | gener | pars | do | dica | diu |
maximus | uirtus | iudicium | ago | modus | imperium |
tempus | pono | ueneo | necesse | proficiscor | plures |
littera | itaque | pecunia | audio | mors | accipio |
nunc | deus | minor | umquam | spes | praesertim |
senatus | locus | debeo | fortuna | urbs | uno |
uis | ciuitas | numquam | ullus | pario | malis |
multa | consul | fors | omnino | duo | uoluntas |
natura | satis | summo | orator | gratia | amicus |
puto | nomen | animo | salus | memoria | moueo |
magnus | publico | potis | utor | credo | mens |
alter | solus | fero | uoluptas | populor | postea |
fides | antea | paene | amicio | prior | contineo |
crimen | terra | statim | exter | laudo | iter |
paullus | testis | crassus | manus | exspecto | consto |
sermo | bello | peto | constituo | nolo | frumentum |
brutus | melior | nosco | officio | usque | finis |
patria | aio | plurimus | socius | conscribo | cumque |
iniuria | nego | pertineo | uer | contrarius | dicis |
sal | loquor | uerres | sol | philosophia | quaeso |
gloria | morior | tandem | numero | brutes | absum |
potestas | honestus | dedo | accusator | spero | quirito |
epistula | defendo | consulo | accedo | summa | concedo |
sanus | mitto | publicus | commodus | foro | accuso |
aetas | ualde | uix | puteo | praesidium | audacia |
ager | aliquando | domio | intersum | eloquentia | mare |
malo | existimo | communio | magistratus | castrum | disciplina |
filius | immortalis | breuis | sex | contio | diuino |
fere | annus | libertas | amplus | arator | alienus |
cognosco | uiuo | iudico | nonne | libet | sicilis |
amicitia | bellis | familiaris | iubeo | adulesco | legio |
frater | minimus | casus | falsum | mundus | inuenio |
efficio | summus | liberi | tribunus | ostendo | quaestio |
grauis | italia | affero | praeclarus | pridie | miles |
appello | quintus | unde | reus | uenio | consequor |
uehemens | placeo | censeo | caput | cottidie | laus |
planus | princeps | genero | paco | mehercule | defensio |
ingenius | scelerus | sequor | consulatus | tabula | turpis |
licet | gero | cupiditas | ciuilis | scientia | dubito |
negotium | beneficium | testimonium | opinor | diligentia | decimus |
beo | adhuc | dolabella | uera | caelum | communis |
tamquam | metus | gratus | filia | nimis | praeceptum |
opera | rex | orbis | legatus | asia | clarus |
ualeo | consuetudo | conuenio | miser | similis | multitudo |
hostis | certus | accido | auctor | labor | magnitudo |
arma | praeterea | talis | amor | dignus | conficio |
nescio | religio | philosophus | plebs | odeum | praesto |
nascor | longus | ordo | imperator | prudentia | duco |
simul | tot | agero | opinio | uaco | metellus |
cura | uox | inimicus | cogito | adsum | fortis |
officium | oculus | utilitas | sapientia | actio | honestas |
copia | partim | facia | doceo | poena | aegritudo |
exemplum | numerus | dato | quod-si | puer | occido |
suspicio | ordino | edico | teneo | consularis | uictoria |
ulciscor | contentio | adeo | superior | demonstro | cognitio |
argumentum | inuidia | relinquo | constitutio | catulus | singula |
probo | exemplo | libido | rogus | argumentatio | senectus |
ibi | par | pauci | caedes | instituo | fructus |
forma | acta | acte | insum | hortens | hortus |
lentulus | similitudo | nauis | subeo | platon | nouo |
facultas | familia | otion | adhibeo | curro | singularis |
inde | nimius | scilicet | considero | mos | datus |
aequus | culpa | necessarius | gens | redeo | reddo |
domo | societas | utinam | uerso | adno | cerno |
aduersarius | scipio | simile | hinc | apio | grauitas |
dea | sisto | nuper | uia | diuinatio | contendo |
controuersia | exeo | hir | perspicuus | species | humanitas |
tribus | plerusque | quoad | percipio | templum | popularis |
utilis | uitio | uereor | amo | aedes | necessitudo |
cupio | auspicium | marcus | dubius | armatus | quattuor |
assentio | difficilis | dis | balbus | collega | mater |
totus | iustitia | aperio | signo | careo | pando |
nondum | mulier | postulo | iuro | cado | seruus |
beneuolentia | coepio | istic | calamitas | auris | coniungo |
sumo | graecor | improbus | fides | absoluo | ferrum |
pompeum | doctrina | fama | molestus | condemno | forum |
cieo | parum | cato | capio | eques | facinus |