#staticPages.py
# -*- coding: utf-8 -*-
from lxml import etree as ET
from openpyxl import load_workbook
from operator import itemgetter
import os
import copy
import requests
#lxml parser for parsing XML files from strings
parser = ET.XMLParser(remove_blank_text=True)
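#remove_blank_text strips ignorable whitespace, which lets pretty_print cleanly re-indent the XML when trees are re-serialized below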
if os.name == "nt":
#Windows Directory Names
#Finding Aid Directory
faDir = "\\\\romeo\\Collect\\spe\\Greg\\EAD_Validator"
#Collection and Subject spreadsheets directory
spreadDir = "\\\\romeo\\Collect\spe\\Tools\\CollectionList"
#static pages directory on public webserver
staticDir = "\\\\romeo\\wwwroot\\eresources\\static"
else:
#Unix directory names
#Finding Aid Directory
faDir = "/media/bcadmin/Collect/spe/Greg/EAD_Validator"
#Collection and Subject spreadsheets directory
spreadDir = "/media/bcadmin/Collect/spe/Tools/CollectionList"
#static pages directory on public webserver
staticDir = "/media/bcadmin/wwwroot/eresources/static"
#parse Collection List spreadsheet
collectionListFile = os.path.join(spreadDir, "collectionList.xlsx")
collectionWorkbook = load_workbook(filename=collectionListFile, read_only=True)
collectionList = collectionWorkbook.get_sheet_by_name('collectionList')
#parse Local Subject Lists spreadsheet
subjectGuidesFile = os.path.join(spreadDir, "subjectGuides.xlsx")
subjectWorkbook = load_workbook(filename=subjectGuidesFile, read_only=True)
subjectGuides = subjectWorkbook.get_sheet_by_name('subjectGuides')
#load basic HTML template file
templateFile = os.path.join(staticDir, "templates", "template.html")
templateInput = ET.parse(templateFile, parser)
template = templateInput.getroot()
#add additional JS file for fixed header
affixHead = ET.Element("script")
affixHead.set("type", "text/javascript")
affixHead.set("src", "http://library.albany.edu/speccoll/findaids/eresources/static/js/headerAffix.js")
template.find("head").append(affixHead)
#############################################################
#load and parse specific templates for each static page
alpha = copy.copy(template)
alpha.find("head/title").text = "A-Z Complete List of Collections"
alphaContent = alpha.find("body/div[@id='mainContent']")
alphaTemplateFile = os.path.join(staticDir, "templates", "browseAlpha.xml")
alphaTemplateInput = ET.parse(alphaTemplateFile, parser)
alphaTemplate = alphaTemplateInput.getroot()
alphaContent.append(alphaTemplate)
apap = copy.copy(template)
apap.find("head/title").text = "New York State Modern Political Archive"
apapContent = apap.find("body/div[@id='mainContent']")
apapTemplateFile = os.path.join(staticDir, "templates", "browseAPAP.xml")
apapTemplateInput = ET.parse(apapTemplateFile, parser)
apapTemplate = apapTemplateInput.getroot()
apapContent.append(apapTemplate)
ger = copy.copy(template)
ger.find("head/title").text = u"German and Jewish Intellectual Émigré"
gerContent = ger.find("body/div[@id='mainContent']")
gerTemplateFile = os.path.join(staticDir, "templates", "browseGER.xml")
gerTemplateInput = ET.parse(gerTemplateFile, parser)
gerTemplate = gerTemplateInput.getroot()
gerContent.append(gerTemplate)
mss = copy.copy(template)
mss.find("head/title").text = u"Business, Literary, and Miscellaneous Manuscripts"
mssContent = mss.find("body/div[@id='mainContent']")
mssTemplateFile = os.path.join(staticDir, "templates", "browseMSS.xml")
mssTemplateInput = ET.parse(mssTemplateFile, parser)
mssTemplate = mssTemplateInput.getroot()
mssContent.append(mssTemplate)
ua = copy.copy(template)
ua.find("head/title").text = u"University Archives"
uaContent = ua.find("body/div[@id='mainContent']")
uaTemplateFile = os.path.join(staticDir, "templates", "browseUA.xml")
uaTemplateInput = ET.parse(uaTemplateFile, parser)
uaTemplate = uaTemplateInput.getroot()
uaContent.append(uaTemplate)
subjects = copy.copy(template)
subjects.find("head/title").text = u"Subjects"
subjectsContent = subjects.find("body/div[@id='mainContent']")
subjectsTemplateFile = os.path.join(staticDir, "templates", "browseSubjects.xml")
subjectsTemplateInput = ET.parse(subjectsTemplateFile, parser)
subjectsTemplate = subjectsTemplateInput.getroot()
subjectsContent.append(subjectsTemplate)
#############################################################
#############################################################
#Functions to make each component of static pages
#############################################################
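#makePanel assembles a Bootstrap 3 panel for one collection; roughly, an
#illustrative sketch of the markup the code below produces:
#  <div class="panel panel-default">
#    <div class="panel-heading">
#      [<a href="{link}">]<div class="panel-title pull-left"><h3>{title}</h3><h5>{type}, {date}</h5></div>[</a>]
#      <button class="btn btn-primary requestModel pull-right" id="nam_{id}: {title}" ...><i class="glyphicon glyphicon-folder-close"></i> Request</button>
#      <div style="clear:both;"></div>
#    </div>
#    <div class="panel-body">
#      <p><strong>Quantity:</strong> {extent} {units} (about {extent} boxes)</p>
#      <p>{abstract}</p>
#    </div>
#  </div>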
def makePanel(id, title, type, date, extent, units, abstract, link):
panel = ET.Element("div")
panel.set("class", "panel panel-default")
panelHead = ET.Element("div")
panelHead.set("class", "panel-heading")
panelTitle = ET.Element("div")
panelTitle.set("class", "panel-title pull-left")
panelBody = ET.Element("div")
panelBody.set("class", "panel-body")
h3 = ET.Element("h3")
h5 = ET.Element("h5")
quantity = ET.Element("p")
strong = ET.Element("strong")
strong.text = "Quantity:"
p = ET.Element("p")
h3.text = title
h5.text = str(type) + ", " + str(date)
strong.tail = " " + str(extent) + " " + units + " (about " + str(extent) + " boxes)"
p.text = abstract
if len(link) > 0:
collectionLink = ET.Element("a")
collectionLink.set("href", link)
collectionLink.append(panelTitle)
panelHead.append(collectionLink)
else:
panelHead.append(panelTitle)
requestButton = ET.Element("button")
requestButton.set("class", "btn btn-primary requestModel pull-right")
requestButton.set("id", "nam_" + id + ": " + title)
requestButton.set("type", "button")
requestButton.set("data-toggle", "modal")
requestButton.set("data-target", "#requestBrowse")
icon = ET.Element("i")
icon.set("class", "glyphicon glyphicon-folder-close")
panelHead.append(requestButton)
requestButton.append(icon)
icon.tail = " Request"
clearFloat = ET.Element("div")
clearFloat.set("style", "clear:both;")
panelHead.append(clearFloat)
panel.append(panelHead)
panelTitle.append(h3)
panelTitle.append(h5)
quantity.append(strong)
panelBody.append(quantity)
panelBody.append(p)
panel.append(panelBody)
return panel
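#hypothetical usage (values are placeholders, not real collection data):
#  panel = makePanel("apap000", "Example Papers", "Personal Papers", "1950-1990", 10.5, "cubic ft.", "A short abstract.", "")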
def makeAbstract(collection, pageContent):
if collection[3] == "ead":
id = collection[1]
title = collection[4]
type = collection[5]
eadFile = os.path.join(faDir, id + ".xml")
eadInput = ET.parse(eadFile, parser)
ead = eadInput.getroot()
date = ead.find("archdesc/did/unittitle/unitdate").text
if ead.find("archdesc/did/physdesc/extent") is None:
try:
extent = ead.find("archdesc/did/physdesc/physfacet").text
units = ""
except AttributeError:
extent = ead.find("archdesc/did/physdesc/dimensions").text
units = ead.find("archdesc/did/physdesc/dimensions").attrib["unit"]
else:
extent = ead.find("archdesc/did/physdesc/extent").text
units = ead.find("archdesc/did/physdesc/extent").attrib["unit"]
abstract = ead.find("archdesc/did/abstract").text
link = "http://meg.library.albany.edu:8080/archive/view?docId=" + id + ".xml"
panel = makePanel(id, title, type, date, extent, units, abstract, link)
collectionAnchor = ET.Element("a")
collectionAnchor.set("name", id)
collectionAnchor.set("class", "anchor")
pageContent.append(collectionAnchor)
pageContent.append(panel)
return pageContent
elif collection[3] == "html":
id = collection[1]
title = collection[4]
type = collection[5]
date = collection[6]
extent = collection[7]
units = collection[8]
abstract = collection[9]
link = "http://library.albany.edu/speccoll/findaids/" + id + ".htm"
panel = makePanel(id, title, type, date, extent, units, abstract, link)
collectionAnchor = ET.Element("a")
collectionAnchor.set("name", id)
collectionAnchor.set("class", "anchor")
pageContent.append(collectionAnchor)
pageContent.append(panel)
return pageContent
elif collection[3] == "null":
id = collection[1]
title = collection[4]
type = collection[5]
date = collection[6]
extent = collection[7]
units = collection[8]
abstract = collection[9]
panel = makePanel(id, title, type, date, extent, units, abstract, "")
collectionAnchor = ET.Element("a")
collectionAnchor.set("name", id)
collectionAnchor.set("class", "anchor")
pageContent.append(collectionAnchor)
pageContent.append(panel)
return pageContent
else:
print "error with " + collection[3] + ", line " + str(rowIndex)
return pageContent
def makeLink(collection, pageContent):
if collection[3] == "ead":
id = collection[1]
title = collection[4]
type = collection[5]
eadFile = os.path.join(faDir, id + ".xml")
eadInput = ET.parse(eadFile, parser)
ead = eadInput.getroot()
date = ead.find("archdesc/did/unittitle/unitdate").text
abstract = ead.find("archdesc/did/abstract").text
link = "http://meg.library.albany.edu:8080/archive/view?docId=" + id + ".xml"
elif collection[3] == "html":
id = collection[1]
title = collection[4]
type = collection[5]
date = collection[6]
link = "http://library.albany.edu/speccoll/findaids/" + id + ".htm"
elif collection[3] == "null":
id = collection[1]
title = collection[4]
type = collection[5]
date = collection[6]
if id.startswith("apap"):
link = "http://library.albany.edu/speccoll/findaids/eresources/static/apap.html#" + id
elif id.startswith("ger"):
link = "http://library.albany.edu/speccoll/findaids/eresources/static/ger.html#" + id
elif id.startswith("mss"):
link = "http://library.albany.edu/speccoll/findaids/eresources/static/mss.html#" + id
elif id.startswith("ua"):
link = "http://library.albany.edu/speccoll/findaids/eresources/static/ua.html#" + id
alphaLink = ET.Element("a")
alphaLink.set("href", link)
alphaLink.set("class", "alphaLink")
alphaLink.text = str(title) + "; " + str(type) + ", " + str(date)
pageContent.append(alphaLink)
br = ET.Element("br")
pageContent.append(br)
return pageContent
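#alphaNav adds a letter to the left-hand nav (once per initial letter) plus an anchor
#div for that letter, then renders the collection under it: templates containing a
#div[@title='alphaList'] marker get one-line links (makeLink), all others get full
#abstract panels (makeAbstract)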
def alphaNav(contentDiv, collection):
leftNav = contentDiv.find("div[@class='row no-gutter']/div[@id='browseNav']/div[@class='panel panel-default']/div[@class='nav list-group']")
alphaContent = contentDiv.find("div[@class='row no-gutter']/div[@class='col-md-9 col-md-offset-3 alphaContent']")
letter = collection[4][:1]
if leftNav.find("li/a[@id='link-" + letter + "']") is None:
navLi = ET.Element("li")
navLink = ET.Element("a")
navLink.set("id", "link-" + letter)
navLi.set("class", "list-group-item")
navLink.set("href", "#" + letter)
navLink.text = letter.upper()
navLi.append(navLink)
leftNav.append(navLi)
if alphaContent.find("div[@id='" + letter + "']") is None:
anchor = ET.Element("div")
anchor.set("id", letter)
anchor.set("class", "anchor")
alphaContent.append(anchor)
else:
anchor = alphaContent.find("div[@id='" + letter + "']")
if contentDiv.find("div[@title='alphaList']") is None:
anchor = makeAbstract(collection, anchor)
else:
alphaContent = makeLink(collection, alphaContent)
return contentDiv
#############################################################
#end functions to make components for static pages
#############################################################
#Parse List of Collections to list of lists
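#each row becomes a list; the indices below are inferred from how they are used later:
#  [0] spreadsheet row number, [1] collection ID, [2] non-public flag (None = public),
#  [3] finding aid type ("ead", "html", or "null"), [4] title, [5] material type,
#  [6] date, [7] extent, [8] extent units, [9] abstract,
#  [10] web archives series number, [11] Archive-It collection ID, [12] seed URL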
rowIndex = 0
collections = []
for row in collectionList.rows:
rowIndex = rowIndex + 1
if rowIndex > 1:
collection = [str(rowIndex), row[0].value, row[1].value, row[2].value, row[3].value, row[4].value, row[5].value, row[6].value, row[7].value, row[8].value, row[9].value, row[10].value, row[11].value]
collections.append(collection)
#Function to make DACS and Normal (ISO) dates from CDX timestamp
def makeDate(stamp):
calendar = {"01": "January", "02": "February", "03": "March", "04": "April", "05": "May", "06": "June", "07": "July", "08": "August", "09": "September", "10": "October", "11": "November", "12": "December"}
year = stamp[:4]
month = stamp[4:6]
day = stamp[-2:]
normal = year + "-" + month + "-" + day
if day.startswith("0"):
day = day[-1:]
dacs = year + " " + calendar[month] + " " + day
return dacs, normal
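#for example: makeDate("20150307") returns ("2015 March 7", "2015-03-07")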
# Iterate through each collection
sortedCollections = sorted(collections, key=itemgetter(4))
for collection in sortedCollections:
if collection[2] is not None:
#skip non-public collections
pass
else:
#sort collections into categories by ID prefix
alphaContent = alphaNav(alphaContent, collection)
if collection[1].startswith("apap"):
apapContent = alphaNav(apapContent, collection)
elif collection[1].startswith("ger"):
gerContent = alphaNav(gerContent, collection)
elif collection[1].startswith("mss"):
mssContent = alphaNav(mssContent, collection)
elif collection[1].startswith("ua"):
uaContent = alphaNav(uaContent, collection)
###################################################################################
#Web Archives Section
###################################################################################
# if collection has EAD file and a Web Archives series listed
if collection[10] is not None and str(collection[3]).lower() == "ead":
try:
#Archive-It CDX API request
webUrl = str(collection[12])
archiveItCollection = str(collection[11])
print "looking for web archives captures for collection " + str(collection[1])
archiveIt = False
wayback = False
requestURL = "http://wayback.archive-it.org/" + archiveItCollection + "/timemap/cdx?url=" + webUrl
#for debugging:
#print requestURL
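#each CDX response line is space-separated and its second field is a 14-digit
#timestamp; an illustrative (not real) line:
#  edu,albany)/ 20150307123456 http://www.albany.edu/ text/html 200 <digest> - - 123 456 FILE.warc.gz
#the parsing below keeps only the first 8 digits (YYYYMMDD) of that field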
response = requests.get(requestURL)
responseText = response.text
#variable to count number of captures:
aiCount = 0
#an HTTP response longer than 5 characters (arbitrary threshold) indicates at least one capture
if len(responseText) > 5:
archiveIt = True
responseLines = responseText.split("\n")
firstPage = responseLines[0]
for textLine in responseLines:
if len(textLine) > 5:
aiCount = aiCount + 1
lastPage = textLine
#get date range of captures
firstDate = firstPage.split(" ")[1][:8]
lastDate = lastPage.split(" ")[1][:8]
#general Wayback CDX API request
wayRequestURL = "http://web.archive.org/cdx/search/cdx?url=" + webUrl
#for debugging:
#print wayRequestURL
wayResponse = requests.get(wayRequestURL)
wayResponseText = wayResponse.text
waybackCount = 0
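#as with Archive-It above, an arbitrary length threshold checks for any captures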
if len(wayResponseText) > 120:
wayback = True
wayResponseLines = wayResponseText.split("\n")
wayFirstPage = wayResponseLines[0]
for wayLine in wayResponseLines:
if len(wayLine) > 5:
waybackCount = waybackCount + 1
wayLastPage = wayLine
#get date range of captures
wayFirstDate = wayFirstPage.split(" ")[1][:8]
wayLastDate = wayLastPage.split(" ")[1][:8]
#check if actually found any captures and convert to DACS and Normal (ISO) dates
if archiveIt or wayback:
#combine the date ranges from whichever services returned captures
dateRange = []
if archiveIt:
dateRange.extend([firstDate, lastDate])
if wayback:
dateRange.extend([wayFirstDate, wayLastDate])
#get DACS and normal dates
seriesMax = max(dateRange)
seriesMin = min(dateRange)
seriesMaxDacs, seriesMaxNormal = makeDate(seriesMax)
seriesMinDacs, seriesMinNormal = makeDate(seriesMin)
seriesDacs = seriesMinDacs + "-" + seriesMaxDacs
seriesNormal = seriesMinNormal + "/" + seriesMaxNormal
#show feedback for web archives
print "found Web Archives for " + str(collection[1])
#parse EAD file for collection
eadFile = os.path.join(faDir, str(collection[1]) + ".xml")
faInput = ET.parse(eadFile, parser)
fa = faInput.getroot()
#get Web Archives Series semantic ID
webArchSeries = "nam_" + str(collection[1]) + "-" + str(collection[10])
#add a Web Archives <phystech> note to the collection if one is missing
if fa.find("archdesc/phystech") is None:
phystech = ET.Element("phystech")
phystechP = ET.SubElement(phystech, "p")
phystechP.text = "Web Archives"
fa.find("archdesc").insert(1, phystech)
#for debugging:
#print "Series " + webArchSeries
#find or create Web Archives Series
if fa.find("archdesc/dsc") is None:
print "creating web archives series"
dsc = ET.Element("dsc")
fa.find("archdesc").append(dsc)
series = ET.Element("c01")
series.set("id", webArchSeries)
dsc.append(series)
else:
match = False
for series in fa.find("archdesc/dsc"):
if series.tag == "c01":
if series.attrib["id"] == webArchSeries:
match = True
if not match:
newSeries = ET.Element("c01")
newSeries.set("id", webArchSeries)
fa.find("archdesc/dsc").append(newSeries)
#iterate through EAD and find matching series
for series in fa.find("archdesc/dsc"):
if series.tag == "c01":
if series.attrib["id"] == webArchSeries:
#for debugging:
#print "found series"
series.set("level", "series")\
#find or create <did>
if series.find("did") is None:
did = ET.Element("did")
series.insert(0, did)
#find or create <unitid>
if series.find("did/unitid") is None:
unitid = ET.Element("unitid")
unitid.text = str(collection[10])
series.find("did").insert(0, unitid)
#update <unittitle>
if series.find("did/unittitle") is None:
unittitle = ET.Element("unittitle")
unittitle.text = "Web Archives"
series.find("did").insert(1, unittitle)
#remove any existing <unitdate>
if series.find("did/unitdate") is not None:
series.find("did").remove(series.find("did/unitdate"))
#Add new <unitdate>
unitdate = ET.Element("unitdate")
unitdate.set("type", "inclusive")
unitdate.set("era", "ce")
unitdate.set("calendar", "gregorian")
unitdate.set("normal", seriesNormal)
unitdate.text = seriesDacs
series.find("did").insert(2, unitdate)
#remove any existing <physdesc>
if series.find("did/physdesc") is not None:
series.find("did").remove(series.find("did/physdesc"))
#Add new <physdesc> with count of captures
physdescElement = ET.Element("physdesc")
extentElement = ET.Element("extent")
extentElement.text = str(aiCount + waybackCount)
extentElement.set("unit", "captures")
physdescElement.append(extentElement)
series.find("did").append(physdescElement)
#replace any existing <phystech>
if series.find("phystech") is not None:
series.remove(series.find("phystech"))
phystech = ET.Element("phystech")
phystechP = ET.SubElement(phystech, "p")
phystechP.text = "Web Archives"
series.insert(1, phystech)
#replace any existing <acqinfo>
if series.find("acqinfo") is not None:
series.remove(series.find("acqinfo"))
acqinfo = ET.Element("acqinfo")
acqP1 = ET.SubElement(acqinfo, "p")
acqP2 = ET.SubElement(acqinfo, "p")
#default <acqinfo> text
acqP1.text = "Web crawling is managed through the Internet Archive's Archive-It service. This page includes links to both the university's collection and the Internet Archive's public collection."
#Albany.edu <acqinfo> text
if archiveItCollection == "3308":
acqP2.text = "Surface-level crawling of www.albany.edu is performed daily which should includes most top-level webpages. Separate targeted crawls of every albany.edu subdomain are performed monthly to attempt to gather all content. This includes: www.albany.edu, www.rna.albany.edu, www.ctg.albany.edu, www.ualbanysports.com, www.albany.edu/rockefeller, www.albany.edu/cela, www.albany.edu/asrc, m.albany.edu, library.albany.edu, events.albany.edu, cstar.cestm.albany.edu, csda.albany.edu, and alumni.albany.edu"
#NYCLU <acqinfo> text
elif archiveItCollection == "7081":
acqP2.text = "The following seeds are crawled monthly: www.nyclu.org, www.facebook.com/NYCLU, www.twitter.com/NYCLU, and en.wikipedia.org/wiki/New_York_Civil_Liberties_Union. The domains en.wikipedia.org, twitter.com, and upload.wikimedia.org are subjected to a 1,000 document limit."
series.insert(1, acqinfo)
#remove existing web archives links (iterate over a copy, since we remove children)
for oldc02 in list(series):
if oldc02.tag == "c02":
series.remove(oldc02)
#variable to make new semantic IDs
idCount = 0
#Make Archive-it <c02>
if archiveIt:
idCount = idCount + 1
aiFile = ET.Element("c02")
aiFile.set("id", webArchSeries + "_" + str(idCount))
aiDid = ET.SubElement(aiFile, "did")
aiContainer = ET.SubElement(aiDid, "container")
aiContainer.set("type", "Web-Archive")
aiContainer.text = "1"
aiUnittitle = ET.SubElement(aiDid, "unittitle")
aiUnittitle.text = webUrl + " - University Archives collection"
aiUnitdate = ET.SubElement(aiDid, "unitdate")
aiUnitdate.set("calendar", "gregorian")
aiUnitdate.set("era", "ce")
firstDacs, firstNormal = makeDate(firstDate)
lastDacs, lastNormal = makeDate(lastDate)
aiUnitdate.set("normal", firstNormal + "/" + lastNormal)
aiUnitdate.text = firstDacs + "-" + lastDacs
aiDao = ET.SubElement(aiDid, "dao")
aiDao.set("actuate", "onrequest")
aiDao.set("linktype", "simple")
aiDao.set("show", "new")
aiDao.set("href", "http://wayback.archive-it.org/" + archiveItCollection + "/*/" + webUrl)
series.append(aiFile)
#make WARC <c02>
idCount = idCount + 1
warcFile = ET.Element("c02")
warcFile.set("id", webArchSeries + "_" + str(idCount))
warcDid = ET.SubElement(warcFile, "did")
warcContainer = ET.SubElement(warcDid, "container")
warcContainer.set("type", "WARC")
warcContainer.text = "1"
warcUnittitle = ET.SubElement(warcDid, "unittitle")
warcUnittitle.text = "WARC file for " + webUrl + ", (please see https://en.wikipedia.org/wiki/Web_ARChive for more details)"
warcUnitdate = ET.SubElement(warcDid, "unitdate")
warcUnitdate.set("calendar", "gregorian")
warcUnitdate.set("era", "ce")
warcUnitdate.set("normal", firstNormal + "/" + lastNormal)
warcUnitdate.text = firstDacs + "-" + lastDacs
accessRestrict = ET.Element("accessrestrict")
restrictP = ET.SubElement(accessRestrict, "p")
#WARC restriction Text
restrictP.text = "WARC files are very large and difficult to work with. Your request may take time to process, and we may be unable to deliver your request remotely. Please consult an archivist if you are interested in researching with web archives."
warcFile.append(accessRestrict)
series.append(warcFile)
#add general Wayback <c02>
if wayback:
idCount = idCount + 1
wayFile = ET.Element("c02")
wayFile.set("id", webArchSeries + "_" + str(idCount))
wayDid = ET.SubElement(wayFile, "did")
wayContainer = ET.SubElement(wayDid, "container")
wayContainer.set("type", "Web-Archive")
wayContainer.text = "2"
wayUnittitle = ET.SubElement(wayDid, "unittitle")
wayUnittitle.text = webUrl + " - Internet Archive collection"
wayUnitdate = ET.SubElement(wayDid, "unitdate")
wayUnitdate.set("calendar", "gregorian")
wayUnitdate.set("era", "ce")
firstDacs, firstNormal = makeDate(wayFirstDate)
lastDacs, lastNormal = makeDate(wayLastDate)
wayUnitdate.set("normal", firstNormal + "/" + lastNormal)
wayUnitdate.text = firstDacs + "-" + lastDacs
wayDao = ET.SubElement(wayDid, "dao")
wayDao.set("actuate", "onrequest")
wayDao.set("linktype", "simple")
wayDao.set("show", "new")
wayDao.set("href", "https://web.archive.org/web/*/" + webUrl)
series.append(wayFile)
#count existing Web Archives access headings, creating <controlaccess> if missing
accessHeadCount = 0
if fa.find("archdesc/controlaccess") is None:
controlaccess = ET.Element("controlaccess")
fa.find("archdesc").insert(1, controlaccess)
for heading in fa.find("archdesc/controlaccess"):
if heading.tag == "genreform":
if heading.attrib.get("source") == "meg":
if heading.text == "Web Archives":
accessHeadCount = accessHeadCount + 1
#add new Web Archives access headings
if accessHeadCount < 1:
genreform = ET.Element("genreform")
genreform.text = "Web Archives"
genreform.set("source", "meg")
genreform.set("encodinganalog", "655")
fa.find("archdesc/controlaccess").append(genreform)
faString = ET.tostring(fa, pretty_print=True, xml_declaration=True, encoding="utf-8")
faFile = open(eadFile, "w")
faFile.write(faString)
faFile.close()
except Exception as problem:
#most likely a failed Archive-It/Wayback request or an unexpected EAD structure
print "Issue with Archive-it API? (" + str(problem) + ")"
continue
###################################################################################
#End Web Archives Section
###################################################################################
###################################################
#Create Static Subject Pages
###################################################
#iterate through subjects spreadsheet
subjectRowNumber = 0
for subjectRow in subjectGuides.rows:
subjectRowNumber = subjectRowNumber + 1
if subjectRowNumber == 1:
subjectIndex = 0
for subject in subjectRow:
leftNav = subjectsContent.find("div[@class='row no-gutter']/div[@id='browseNav']/div[@class='panel panel-default']/div[@class='nav list-group']")
if subjectIndex == 0:
leftNav.remove(leftNav[0])
rightContent = subjectsContent.find("div[@class='row no-gutter']/div[@class='col-md-9 col-md-offset-3 alphaContent']")
if subject.value is None:
#skip empty header cells but keep the column indexing aligned
subjectIndex = subjectIndex + 1
continue
letter = subject.value[:1]
subjectName = subject.value
subjectColumn = subjectGuides.columns[subjectIndex]
subjectIndex = subjectIndex + 1
subjectLinkName = subjectColumn[1].value
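#subjectGuides layout (inferred from use): row 1 holds subject display names, row 2
#holds each subject page's filename stub, and rows 3+ hold collection IDs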
if leftNav.find("li/a[@id='link-" + letter + "']") is None:
navLi = ET.Element("li")
navLink = ET.Element("a")
navLink.set("id", "link-" + letter)
navLi.set("class", "list-group-item")
navLink.set("href", "#" + letter)
navLink.text = letter.upper()
navLi.append(navLink)
leftNav.append(navLi)
if rightContent.find("div[@id='" + letter + "']") is None:
anchor = ET.Element("div")
anchor.set("id", letter)
anchor.set("class", "anchor")
rightContent.append(anchor)
else:
anchor = rightContent.find("div[@id='" + letter + "']")
subjectPara = ET.Element("p")
subjectLink = ET.Element("a")
subjectLink.text = subjectName
subjectLink.set("href", "http://library.albany.edu/speccoll/findaids/eresources/static/" + subjectLinkName + ".html")
subjectPara.append(subjectLink)
anchor.append(subjectPara)
print "creating subject page for " + subjectName
if subjectLinkName == "ndpa":
page = copy.copy(template)
page.find("head/title").text = u"National Death Penalty Archive"
pageContent = page.find("body/div[@id='mainContent']")
pageTemplateFile = os.path.join(staticDir, "templates", "browseNDPA.xml")
pageTemplateInput = ET.parse(pageTemplateFile, parser)
pageTemplate = pageTemplateInput.getroot()
pageContent.append(pageTemplate)
else:
page = copy.copy(template)
page.find("head/title").text = subjectName
pageContent = page.find("body/div[@id='mainContent']")
pageTemplateFile = os.path.join(staticDir, "templates", "browseSubjects.xml")
pageTemplateInput = ET.parse(pageTemplateFile, parser)
pageTemplate = pageTemplateInput.getroot()
pageTemplate.find("div/div[@class='jumbotron']/h2").text = subjectName
pageContent.append(pageTemplate)
#for debugging:
#print subjectName
columnIndex = 0
for collectionIdCell in subjectColumn:
collectionId = collectionIdCell.value
if collectionId is not None:
columnIndex = columnIndex + 1
if columnIndex > 2:
for collection in sortedCollections:
if collection[1] == collectionId:
#add Local Subject Heading to EAD if missing
#for debugging:
#print collectionId
eadFile = os.path.join(faDir, str(collectionId) + ".xml")
if os.path.isfile(eadFile):
faInput = ET.parse(eadFile, parser)
fa = faInput.getroot()
subjectHeadCount = 0
if fa.find("archdesc/controlaccess") is None:
controlaccess = ET.Element("controlaccess")
fa.find("archdesc").insert(1, controlaccess)
for heading in fa.find("archdesc/controlaccess"):
if heading.tag == "subject":
if not "source" in heading.attrib:
pass
elif heading.attrib["source"] == "meg":
if heading.text == subjectName:
subjectHeadCount = subjectHeadCount + 1
if subjectHeadCount < 1:
megSubject = ET.Element("subject")
megSubject.text = subjectName
megSubject.set("source", "meg")
megSubject.set("encodinganalog", "650")
fa.find("archdesc/controlaccess").insert(1, megSubject)
#write Updated EAD to file
faString = ET.tostring(fa, pretty_print=True, xml_declaration=True, encoding="utf-8")
faFile = open(eadFile, "w")
faFile.write(faString)
faFile.close()
leftNav = pageContent.find("div[@class='row no-gutter']/div[@id='browseNav']/div[@class='panel panel-default']/div[@class='nav list-group']")
rightContent = pageContent.find("div[@class='row no-gutter']/div[@class='col-md-9 col-md-offset-3 alphaContent']")
letter = collection[4][:1]
if leftNav.find("li/a[@id='link-" + letter + "']") is None:
navLi = ET.Element("li")
navLink = ET.Element("a")
navLink.set("id", "link-" + letter)
navLi.set("class", "list-group-item")
navLink.set("href", "#" + letter)
navLink.text = letter.upper()
navLi.append(navLink)
leftNav.append(navLi)
if rightContent.find("div[@id='" + letter + "']") is None:
anchor = ET.Element("div")
anchor.set("id", letter)
anchor.set("class", "anchor")
rightContent.append(anchor)
else:
anchor = rightContent.find("div[@id='" + letter + "']")
anchor = makeAbstract(collection, anchor)
#write static subject pages
pageString = ET.tostring(page, pretty_print=True, method='html', xml_declaration=False, doctype="<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">")
pageFile = open(os.path.join(staticDir, subjectLinkName + ".html"), "w")
pageFile.write(pageString)
pageFile.close()
#Write static pages to web server
apapString = ET.tostring(apap, method='html', xml_declaration=False, doctype="<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">")
apapFile = open(os.path.join(staticDir, "apap.html"), "w")
apapFile.write(apapString)
apapFile.close()
gerString = ET.tostring(ger, method='html', xml_declaration=False, doctype="<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">")
gerFile = open(os.path.join(staticDir, "ger.html"), "w")
gerFile.write(gerString)
gerFile.close()
mssString = ET.tostring(mss, method='html', xml_declaration=False, doctype="<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">")
mssFile = open(os.path.join(staticDir, "mss.html"), "w")
mssFile.write(mssString)
mssFile.close()
uaString = ET.tostring(ua, method='html', xml_declaration=False, doctype="<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">")
uaFile = open(os.path.join(staticDir, "ua.html"), "w")
uaFile.write(uaString)
uaFile.close()
subjectsString = ET.tostring(subjects, method='html', xml_declaration=False, doctype="<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">")
subjectsFile = open(os.path.join(staticDir, "subjects.html"), "w")
subjectsFile.write(subjectsString)
subjectsFile.close()
alphaString = ET.tostring(alpha, method='html', xml_declaration=False, doctype="<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">")
alphaFile = open(os.path.join(staticDir, "alpha.html"), "w")
alphaFile.write(alphaString)
alphaFile.close()