-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathreadAll.sh~
executable file
·43 lines (34 loc) · 1.19 KB
/
readAll.sh~
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#!/bin/bash
# Ricky Savjani
# Readability Scores
# 1/15/2019
#
# Reads URLs (one per line) from $urlsFile, converts each page to plain body
# text, strips leftover markup characters, and writes the result to
# $outputDir/<index>_<baseName>.txt, where <index> is the 0-based position of
# the URL in the list.
#
# package dependencies
# readability: https://pypi.org/project/readability-lxml/
# html2text: https://pypi.org/project/html2text/

# Make a pipeline report failure when ANY stage fails, so a failed download
# is not hidden behind the successful exit status of the final filter.
set -o pipefail

# define params
urlsFile="cancer_immunotherapy2.txt"
baseName=${urlsFile%.txt}
outputDir="autoCleanTxt"
# NOTE(review): outCSV is not referenced anywhere in this script; presumably
# it is consumed by a later scoring step -- confirm before removing.
outCSV="cancerImmunotherapyReadScoresTokenized_Auto.csv"

#######################################
# Filter that strips leftover markup characters from converted text.
# Implemented as a stream filter (no `sed -i`) so it behaves the same with
# GNU and BSD sed and leaves no "*-e" backup files behind.
# Arguments: none
# Inputs:    text on stdin
# Outputs:   cleaned text on stdout
#######################################
clean_text() {
  # Expression order matters: "## " must be removed before the bare "#".
  sed \
    -e 's/Title://g' \
    -e 's/## //g' \
    -e 's/#//g' \
    -e 's/ - //g' \
    -e 's/\[[^][]*\]//g'
}

#######################################
# Loop through the URL list and save the auto-cleaned text of every page.
# Globals:   urlsFile, baseName, outputDir (read)
# Outputs:   "<index>: <url>" progress lines on stdout, warnings on stderr,
#            one text file per URL under $outputDir
# Returns:   1 if the URL list is unreadable or $outputDir cannot be
#            created; otherwise the status of the read loop
#######################################
main() {
  local url outFile
  local count=0

  if [[ ! -r "$urlsFile" ]]; then
    printf 'error: cannot read URL list: %s\n' "$urlsFile" >&2
    return 1
  fi

  # The output redirection below fails for every URL if the directory is
  # missing, so create it up front.
  mkdir -p -- "$outputDir" || return 1

  # `|| [[ -n "$url" ]]` keeps a final line that lacks a trailing newline.
  while IFS='' read -r url || [[ -n "$url" ]]; do
    # define output name
    outFile="${outputDir}/${count}_${baseName}.txt"

    # echo url
    printf '%s: %s\n' "$count" "$url"

    # use python readability to convert HTML from URL to just body text of
    # URL | html2text to plain text | clean up characters | save to file.
    # stdin of the fetch is /dev/null so it can never consume the URL list
    # that feeds this loop.
    if ! python -m readability.readability -u "$url" </dev/null \
      | html2text --ignore-emphasis --ignore-links --ignore-images \
        --dash-unordered-list \
      | clean_text >"$outFile"; then
      # Best effort: report the failure and carry on with the next URL.
      printf 'warning: failed to process URL %s: %s\n' "$count" "$url" >&2
    fi

    # Always advance so the file index matches the URL's line position.
    count=$((count + 1))
  done <"$urlsFile"
}

main "$@"