#!/bin/bash
#
# Audits the corpora listing rooted at the current working directory.
#
# For every entry found exactly two levels below each domain directory the
# script prints the entry's name and disk usage, reports a missing README,
# docs folder or data folder on stderr, and finally prints a summary of the
# counters. No archive is created yet; see the note at the end of the file.
#
# Usage: run from the root of the corpora tree
#        (e.g. /shared/corpora/corporaWeb/).

domain_dirs=("written" "transcribed" "multimode" "spoken" "treebanks" "misc")
absolute_url=$PWD # or /shared/corpora/corporaWeb/
num_zipped=0
num_errors=0
num_total=0

#######################################
# Checks that a corpus entry has a data folder, a docs folder and a
# README.txt or index.html.
# Arguments: $1 - path to the corpus entry
# Outputs:   on stderr, the helper's ERROR line followed by a hint naming
#            the first missing requirement
# Returns:   0 when all three are present, 1 otherwise
#######################################
valid_corpora_listing() {
  # The helpers' 1/0 stdout flag is discarded; only their status is used.
  if ! is_data_folder "$1" >/dev/null; then
    echo "Add data folder in the corpora listing" >&2
    return 1
  fi
  if ! is_docs_folder "$1" >/dev/null; then
    echo "Add docs folder in the corpora listing" >&2
    return 1
  fi
  if ! is_readme "$1" >/dev/null; then
    echo "Add a README.txt or index.html in the corpora listing" >&2
    return 1
  fi
  return 0
}

#######################################
# Checks whether the entry contains a directory named "data".
# Arguments: $1 - path to the corpus entry
# Outputs:   "1" or "0" on stdout; an ERROR line on stderr when missing
# Returns:   0 when the directory exists, 1 otherwise
#######################################
is_data_folder() {
  # -d (is a directory) rather than -s (exists with non-zero size).
  if [[ -d "$1/data" ]]; then
    echo 1
    return 0
  fi
  echo "ERROR: No data folder found" >&2
  echo 0
  return 1
}

#######################################
# Checks whether the entry contains a directory named "docs".
# Arguments: $1 - path to the corpus entry
# Outputs:   "1" or "0" on stdout; an ERROR line on stderr when missing
# Returns:   0 when the directory exists, 1 otherwise
#######################################
is_docs_folder() {
  if [[ -d "$1/docs" ]]; then
    echo 1
    return 0
  fi
  echo "ERROR: No docs folder found" >&2
  echo 0
  return 1
}

#######################################
# Checks whether the entry has a non-empty README.txt or index.html.
# Arguments: $1 - path to the corpus entry
# Outputs:   "1" or "0" on stdout; an ERROR line on stderr when missing
# Returns:   0 when either file exists and is non-empty, 1 otherwise
#######################################
is_readme() {
  if [[ -s "$1/README.txt" || -s "$1/index.html" ]]; then
    echo 1
    return 0
  fi
  echo "ERROR: No README found" >&2
  echo 0
  return 1
}

#######################################
# Prints the path followed by its human-readable disk usage.
# Arguments: $1 - path to measure
# Outputs:   the path and the size reported by `du -sh`, on stdout
#######################################
print_file_size() {
  local size
  # du separates size and path with a tab, so take the first field instead
  # of a fixed character range.
  size=$(du -sh -- "$1" | cut -f1)
  echo "Full Directory $1"
  printf "\n"
  echo "File Size: $size"
  printf "\n"
}

for dir in "${domain_dirs[@]}"; do
  printf '###################################################### BEGIN %s\n\n' "$dir"

  # Collect entries NUL-delimited so names containing whitespace or glob
  # characters stay intact.
  corpora_dirs=()
  while IFS= read -r -d '' entry; do
    corpora_dirs+=("$entry")
  done < <(find "$absolute_url/$dir" -mindepth 2 -maxdepth 2 -print0)

  echo "${#corpora_dirs[@]} corpora entries found"

  for corp_dir in "${corpora_dirs[@]}"; do
    printf '##### %s ##### \n' "$(basename -- "$corp_dir")"
    print_file_size "$corp_dir"

    # The README and data checks are run for their stderr diagnostics only.
    # NOTE(review): only the docs check decides the outcome below - confirm
    # whether a missing README or data folder should count as an error too.
    is_readme "$corp_dir" >/dev/null
    docs=$(is_docs_folder "$corp_dir")
    is_data_folder "$corp_dir" >/dev/null

    if [[ "$docs" -eq 1 ]]; then
      # NOTE(review): the message names the domain, not the corpus entry,
      # and nothing is archived yet, so num_zipped is left untouched.
      echo "Zipping files for $dir"
    else
      num_errors=$((num_errors + 1))
    fi
    printf "\n"
    num_total=$((num_total + 1))
  done
done

printf "######################################################## END\n\n"
printf "Summary:\n"
echo "$num_total total corpora entries found"
echo "$num_zipped corpora entries zipped"
echo "$num_errors corpora entries could not be zipped"

exit 0

# Zip all files now
# Use something like this:
# for i in */; do zip -r "${i%/}.zip" "$i"; done