> # Display settings: prefer fixed to scientific notation, and print numbers
> # with two significant digits (the R2HTML.format.digits option gets the
> # same value as the console "digits" option)
> options(scipen=5, digits=2, R2HTML.format.digits=2)
> 
> # Set a nice color palette for plots. "Set1" is requested with all 9 colors
> # because the selection below reaches index 9 (it only skips color 6); taken
> # from an 8-color palette, that last entry would be NA. The vector is built
> # once inside local() so that no helper variable is left in the workspace.
> local({
+   qualitative <- RColorBrewer::brewer.pal(9, "Set1")[c(2:1, 3:5, 7:9)]
+   lattice.options(default.theme=latticeExtra::custom.theme(symbol=qualitative,
+     fill=qualitative, region=RColorBrewer::brewer.pal(n=11, name="Spectral")))
+ })
> # Drop "dtm" only if it is already defined in the workspace; an unguarded
> # rm() raises an "object not found" warning when it is not
> if (exists("dtm", inherits=FALSE)) rm(dtm)
> # Build the corpus from the source directory, read as UTF-8 and tagged as
> # Italian
> corpus <- VCorpus(
+   DirSource("/home/daniele/CORPUS", encoding="UTF-8"),
+   readerControl=list(language="it")
+ )
> # One row per document, holding a single empty placeholder factor
> corpusVars <- data.frame(
+   row.names=names(corpus),
+   var1=factor(character(length(corpus)))
+ )
> # Make the variables table the active data set, then register it
> # (setCorpusVariables() presumably copies it onto the corpus -- confirm)
> activeDataSet("corpusVars")
> setCorpusVariables()
> # Work on a cleaned copy of the corpus: lower-case the text, blank out
> # punctuation, symbols, separators and control characters, drop numbers
> dtmCorpus <- tm_map(corpus, content_transformer(tolower))
> dtmCorpus <- tm_map(dtmCorpus, content_transformer(function(x) {
+   gsub("\\p{P}|\\p{S}|\\p{Z}|\\p{C}", " ", x, perl=TRUE)
+ }))
> dtmCorpus <- tm_map(dtmCorpus, removeNumbers)
> # Text is already lower-cased above, so tolower is switched off here;
> # only terms of at least two characters are kept
> dtm <- DocumentTermMatrix(dtmCorpus,
+   control=list(tolower=FALSE, wordLengths=c(2, Inf)))
> # The cleaned copy is no longer needed
> rm(dtmCorpus)
> # SnowballC supplies wordStem()
> library(SnowballC)
> # One row per term: total occurrences, Italian stem and stopword flag
> dictionary <- data.frame(
+   row.names=colnames(dtm),
+   "Occurrences"=col_sums(dtm),
+   "Stemmed.Term"=wordStem(colnames(dtm), "it"),
+   "Stopword"=ifelse(colnames(dtm) %in% stopwords("it"), "Stopword", ""),
+   stringsAsFactors=FALSE
+ )
> # Merge the columns of terms that share the same stem
> dtm <- rollup(dtm, 2, dictionary[["Stemmed.Term"]])
> # Keep the term dictionary with the matrix, then drop the standalone copy
> attr(dtm, "dictionary") <- dictionary
> rm(dictionary)
> # Record the language on the matrix, then mirror it in the corpus metadata
> attr(dtm, "language") <- "it"
> meta(corpus, type="corpus", tag="language") <- attr(dtm, "language")
> # Record the processing flags the same way, on both objects
> attr(dtm, "processing") <- c(
+   lowercase=TRUE, punctuation=TRUE, digits=TRUE, stopwords=FALSE,
+   stemming=TRUE, customStemming=FALSE, twitter=FALSE,
+   removeHashtags=NA, removeNames=NA
+ )
> meta(corpus, type="corpus", tag="processing") <- attr(dtm, "processing")
> corpus  # auto-print the corpus summary
<<VCorpus>>
Metadata:  corpus specific: 2, document level (indexed): 0
Content:  documents: 2
> dtm  # auto-print the summary of the stem-level document-term matrix
<<DocumentTermMatrix (documents: 2, terms: 233)>>
Non-/sparse entries: 274/192
Sparsity           : 41%
Maximal term length: 18
> # Same cleaning as for the stemmed matrix: lower-case the text, blank out
> # punctuation, symbols, separators and control characters, drop numbers
> dtmCorpus <- tm_map(corpus, content_transformer(tolower))
> dtmCorpus <- tm_map(dtmCorpus, content_transformer(function(x) {
+   gsub("\\p{P}|\\p{S}|\\p{Z}|\\p{C}", " ", x, perl=TRUE)
+ }))
> dtmCorpus <- tm_map(dtmCorpus, removeNumbers)
> # Word-level matrix (no stem roll-up), terms of at least two characters
> wordsDtm <- DocumentTermMatrix(dtmCorpus,
+   control=list(wordLengths=c(2, Inf)))
> # The cleaned copy is no longer needed
> rm(dtmCorpus)
> # Vocabulary statistics from the stemmed and the word-level matrices
> voc <- vocabularyTable(dtm, wordsDtm)
> # Bar chart of the third row of the table only, kept as a one-row matrix
> barchart(t(voc[3, , drop=FALSE]), main="Vocabulary summary by document",
+   ylab="Percent of unique terms", scales=list(rot=90), horizontal=FALSE,
+   stack=FALSE)

> attr(voc, "title") <- "Vocabulary summary by document (per-document mean)"  # label stored as an attribute of the table
> voc  # print the vocabulary table
                            
                             silvia silvia1 Corpus mean Corpus total
  Number of terms             188.0   161.0       174.5        349.0
  Number of unique terms      143.0   131.0       137.0        233.0
  Percent of unique terms      76.1    81.4        78.7         66.8
  Number of hapax legomena    116.0   107.0       111.5        169.0
  Percent of hapax legomena    61.7    66.5        64.1         48.4
  Number of words             188.0   161.0       174.5        349.0
  Number of long words         84.0    86.0        85.0        170.0
  Percent of long words        44.7    53.4        49.0         48.7
  Number of very long words    18.0    20.0        19.0         38.0
  Percent of very long words    9.6    12.4        11.0         10.9
  Average word length           6.5     7.1         6.8          6.7
> specTerms <- specificTerms(dtm, NULL, p=0.1, min.occ=2, n.max=25)  # NULL presumably means "no grouping variable" (one level per document) -- confirm
> attr(specTerms, "title") <- "Specific terms by document"  # label stored as an attribute of the result
> specTerms  # print the result: one table per document, then the title attribute
$silvia
      % Term/Level % Level/Term Global % Level Global t value Prob.
islam          2.1          100      1.1     4      4     1.4 0.083
torn           2.1          100      1.1     4      4     1.4 0.083

$silvia1
      % Term/Level % Level/Term Global % Level Global t value Prob.
-----            0            0      1.1     0      4    -1.4 0.083
                NA           NA       NA    NA     NA      NA    NA
islam            0            0      1.1     0      4    -1.4 0.083
torn             0            0      1.1     0      4    -1.4 0.083

attr(,"title")
[1] "Specific terms by document"
> specTerms <- specificTerms(dtm, NULL, p=0.1, min.occ=2, n.max=25)  # same call, same arguments as the previous specificTerms() run
> attr(specTerms, "title") <- "Specific terms by document"  # label stored as an attribute of the result
> specTerms  # print the result again; the output matches the previous run
$silvia
      % Term/Level % Level/Term Global % Level Global t value Prob.
islam          2.1          100      1.1     4      4     1.4 0.083
torn           2.1          100      1.1     4      4     1.4 0.083

$silvia1
      % Term/Level % Level/Term Global % Level Global t value Prob.
-----            0            0      1.1     0      4    -1.4 0.083
                NA           NA       NA    NA     NA      NA    NA
islam            0            0      1.1     0      4    -1.4 0.083
torn             0            0      1.1     0      4    -1.4 0.083

attr(,"title")
[1] "Specific terms by document"