-
Notifications
You must be signed in to change notification settings - Fork 0
/
gallicagram.R
89 lines (73 loc) · 4.4 KB
/
gallicagram.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
library(ggplot2)
library(lubridate)
library(stringr)
library(utils)
library(xml2)
library(dplyr)
# The figures saved further down use relative file names, so they land in the
# working directory. Switch to the author's project folder only when it exists
# on this machine; otherwise keep the current working directory instead of
# aborting the whole script on a missing path.
output_dir <- "C:/Users/Benjamin/Downloads/gallicagram"
if (dir.exists(output_dir)) {
  setwd(output_dir)
}
########## EXTRACTION (yearly resolution)
# For every year from 1900 to 1950, query the Gallica SRU endpoint twice:
#   * `url`      -- records of type "fascicule" matching the exact phrase `mot`
#   * `url_base` -- all records of type "fascicule" published that year
# and keep the two counts side by side in `tab5`.
# NOTE(review): both URLs contain "page=1collapsing=false" with no "&" between
# the two parameters. It is kept unchanged because altering the query could
# alter the response -- confirm against the Gallica API documentation.
mot <- "revolution%20nationale" # a space between two words must be written "%20"

# Download `url` and return, as an integer, the number that follows
# "numberOfRecordsDecollapser>" in the response text. Returns NA (with a
# warning) when the request fails or the field is absent, so that one bad
# request does not discard the years already collected.
count_records <- function(url) {
  response <- tryCatch(
    as.character(read_xml(url)),
    error = function(e) {
      warning("Gallica request failed: ", conditionMessage(e), call. = FALSE)
      NA_character_
    }
  )
  as.integer(
    str_extract(
      str_extract(response, "numberOfRecordsDecollapser>+[:digit:]+"),
      "[:digit:]+"
    )
  )
}

yearly_rows <- lapply(1900:1950, function(i) {
  y <- as.character(i)
  url <- str_c(
    "https://gallica.bnf.fr/SRU?operation=searchRetrieve&version=1.2&startRecord=1&maximumRecords=1&page=1collapsing=false&exactSearch=true&query=text%20adj%20%22",
    mot,
    "%22%20%20and%20(dc.type%20all%20%22fascicule%22)%20and%20(gallicapublication_date%3E=%22",
    y,
    "/01/01%22%20and%20gallicapublication_date%3C=%22",
    y,
    "/12/31%22)%20sortby%20dc.date/sort.ascending&suggest=10&keywords=",
    mot
  )
  url_base <- str_c(
    "https://gallica.bnf.fr/SRU?operation=searchRetrieve&version=1.2&startRecord=1&maximumRecords=1&page=1collapsing=false&version=1.2&query=(dc.type%20all%20%22fascicule%22)%20and%20(gallicapublication_date%3E=%22",
    y,
    "/01/01%22%20and%20gallicapublication_date%3C=%22",
    y,
    "/12/31%22)&suggest=10&keywords="
  )
  row <- data.frame(
    date = i,
    nb_temp = count_records(url),
    base_temp = count_records(url_base)
  )
  print(i) # progress indicator
  row
})

# Bind once at the end instead of growing the data frame inside the loop.
tab5 <- bind_rows(yearly_rows)
##### INDICATOR COMPUTATION
# Label the three extracted columns, coerce each of them to integer, then
# derive the share of matching records among all records of the year.
names(tab5) <- c("date", "nb_temp", "base_temp")
tab5[] <- lapply(tab5, as.integer)
tab5[["ratio_temp"]] <- tab5[["nb_temp"]] / tab5[["base_temp"]]
##### PLOT
# Line chart of the yearly ratio. The plot is stored in a variable so it can
# be both displayed and passed explicitly to ggsave(): appending ggsave() to
# the chain with `+` is rejected by ggplot2 >= 3.3.4.
# NOTE(review): recent ggplot2 releases prefer `linewidth` over `size` for
# lines; `size` is kept here so older installations keep working.
yearly_plot <- ggplot(tab5, aes(date, ratio_temp)) +
  geom_line(size = 1) +
  scale_x_continuous(breaks = seq(1900, 1950, 2)) +
  theme(axis.text.x = element_text(angle = 45)) +
  xlab("Date") +
  ylab("Part des numéros faisant mention de l'expression \n 'Révolution nationale' dans le corpus Gallica-Presse") +
  ggtitle("Fréquence d'usage de l'expression 'Révolution nationale' (Gallica-Presse)")

# Display the plot, then write it to the working directory.
print(yearly_plot)
ggsave(
  "Fréquence d'usage de l'expression Révolution nationale.png",
  plot = yearly_plot,
  scale = 2
)
########## MONTHLY RESOLUTION ##########
########## EXTRACTION
# Same two queries as the yearly extraction, but issued once per month from
# January 1940 to December 1944. The upper bound of each date range is the
# real last day of the month (28-31) rather than a fixed "31", which produced
# impossible dates such as 1940/02/31.
# NOTE(review): both URLs contain "page=1collapsing=false" with no "&" between
# the two parameters. It is kept unchanged because altering the query could
# alter the response -- confirm against the Gallica API documentation.
mot <- "revolution%20nationale"

# Download `url` and return, as an integer, the number that follows
# "numberOfRecordsDecollapser>" in the response text. Returns NA (with a
# warning) when the request fails or the field is absent, so that one bad
# request does not discard the months already collected.
count_records <- function(url) {
  response <- tryCatch(
    as.character(read_xml(url)),
    error = function(e) {
      warning("Gallica request failed: ", conditionMessage(e), call. = FALSE)
      NA_character_
    }
  )
  as.integer(
    str_extract(
      str_extract(response, "numberOfRecordsDecollapser>+[:digit:]+"),
      "[:digit:]+"
    )
  )
}

# First day of every month in the studied period, in chronological order.
month_starts <- seq(ymd("1940-01-01"), ymd("1944-12-01"), by = "month")

monthly_rows <- lapply(seq_along(month_starts), function(k) {
  first_day <- month_starts[k]
  y <- format(first_day, "%Y")
  z <- format(first_day, "%m") # zero-padded month number
  last_day <- sprintf("%02d", as.integer(days_in_month(first_day)))
  url <- str_c(
    "https://gallica.bnf.fr/SRU?operation=searchRetrieve&version=1.2&startRecord=1&maximumRecords=1&page=1collapsing=false&exactSearch=true&query=text%20adj%20%22",
    mot,
    "%22%20%20and%20(dc.type%20all%20%22fascicule%22)%20and%20(gallicapublication_date%3E=%22",
    y, "/", z,
    "/01%22%20and%20gallicapublication_date%3C=%22",
    y, "/", z, "/", last_day,
    "%22)%20sortby%20dc.date/sort.ascending&suggest=10&keywords=",
    mot
  )
  url_base <- str_c(
    "https://gallica.bnf.fr/SRU?operation=searchRetrieve&version=1.2&startRecord=1&maximumRecords=1&page=1collapsing=false&version=1.2&query=(dc.type%20all%20%22fascicule%22)%20and%20(gallicapublication_date%3E=%22",
    y, "/", z,
    "/01%22%20and%20gallicapublication_date%3C=%22",
    y, "/", z, "/", last_day,
    "%22)&suggest=10&keywords="
  )
  row <- data.frame(
    date = first_day,
    nb_temp = count_records(url),
    base_temp = count_records(url_base)
  )
  print(str_c(year(first_day), "-", month(first_day))) # progress indicator
  row
})

##### INDICATOR COMPUTATION
# Bind once at the end (no placeholder row to drop afterwards), then derive
# the share of matching records among all records of the month.
tab6 <- bind_rows(monthly_rows)
tab6$ratio_temp <- tab6$nb_temp / tab6$base_temp
##### PLOT
# Line chart of the monthly ratio between 1940-01-01 and 1945-01-01. The plot
# is stored in a variable so it can be both displayed and passed explicitly to
# ggsave(): appending ggsave() to the chain with `+` is rejected by
# ggplot2 >= 3.3.4.
# NOTE(review): recent ggplot2 releases prefer `linewidth` over `size` for
# lines; `size` is kept here so older installations keep working.
occupation_plot <- tab6 %>%
  filter(date >= ymd(19400101) & date <= ymd(19450101)) %>%
  ggplot(aes(date, ratio_temp)) +
  geom_line(size = 1) +
  scale_x_date(
    breaks = seq(as.Date("1940/1/1"), as.Date("1945/1/1"), "2 months"),
    date_labels = "%b %Y"
  ) +
  theme(axis.text.x = element_text(angle = 45)) +
  xlab("Date") +
  ylab("Part des numéros faisant mention de l'expression \n 'Révolution nationale' dans le corpus Gallica-Presse") +
  ggtitle("Fréquence d'usage de l'expression 'Révolution nationale' durant l'Occupation (Gallica-Presse)")

# Display the plot, then write it to the working directory.
print(occupation_plot)
ggsave(
  "Fréquence d'usage de l'expression Révolution nationale durant l'Occupation.png",
  plot = occupation_plot,
  scale = 2
)
#####