#!/bin/bash

# Category folders that are scanned for corpora entries.
domain_dirs=(written transcribed multimode spoken treebanks misc)

# Root directory the category folders are looked up under.
# Alternative location: /shared/corpora/corporaWeb/
absolute_url="$PWD"

# Value that the output of a folder check is compared against.
one=1

# Running tallies printed in the final summary.
num_total=0
num_errors=0
num_zipped=0
# Validate that a corpus listing has a data folder, a docs folder and a README.
# Arguments: $1 - path to the corpus listing directory
# Outputs:   on the first failed check, a one-line hint on stdout describing
#            what to add; nothing on stdout when every check passes.
#            Diagnostics written to stderr by the is_* helpers pass through.
# Returns:   0 when all checks pass, 1 on the first failed check
valid_corpora_listing() {
  local listing=$1

  # The is_* helpers report their result by printing 1 or 0 on stdout, so the
  # printed value is compared here rather than the helper's exit status.
  if [[ "$(is_data_folder "$listing")" != 1 ]]; then
    echo "Add data folder in the corpora listing"
    return 1
  fi
  if [[ "$(is_docs_folder "$listing")" != 1 ]]; then
    echo "Add docs folder in the corpora listing"
    return 1
  fi
  if [[ "$(is_readme "$listing")" != 1 ]]; then
    echo "Add a README.txt or index.html in the corpora listing"
    return 1
  fi
  return 0
}

# Report whether a corpus listing contains a directory named "data".
# Arguments: $1 - path to the corpus listing directory
# Outputs:   "1" on stdout when $1/data is a directory; otherwise "0" on
#            stdout and an error message on stderr.
# Callers read the result from stdout (e.g. result=$(is_data_folder "$dir")).
is_data_folder() {
  # -d tests for a directory (-s only tests for a non-empty file); the path is
  # quoted so listings whose names contain spaces are handled correctly.
  if [[ -d "$1/data" ]]; then
    echo 1
  else
    echo "ERROR: No data folder found" >&2
    echo 0
  fi
}

# Report whether a corpus listing contains a directory named "docs".
# Arguments: $1 - path to the corpus listing directory
# Outputs:   "1" on stdout when $1/docs is a directory; otherwise "0" on
#            stdout and an error message on stderr.
# Callers read the result from stdout (e.g. result=$(is_docs_folder "$dir")).
is_docs_folder() {
  # -d tests for a directory (-s only tests for a non-empty file); the path is
  # quoted so listings whose names contain spaces are handled correctly.
  if [[ -d "$1/docs" ]]; then
    echo 1
  else
    echo "ERROR: No docs folder found" >&2
    echo 0
  fi
}

# Report whether a corpus listing ships a non-empty README.txt or index.html.
# Arguments: $1 - path to the corpus listing directory
# Outputs:   "1" on stdout when either file exists with a size above zero;
#            otherwise an error message on stderr and "0" on stdout.
is_readme() {
  local listing=$1

  # Bail out early when neither candidate page is present and non-empty.
  if [[ ! -s "$listing/README.txt" && ! -s "$listing/index.html" ]]; then
    echo "ERROR: No README found" >&2
    echo 0
    return 0
  fi

  echo 1
}

# Print a path followed by its total human-readable disk usage.
# Arguments: $1 - directory (or file) to measure
# Outputs:   "Full Directory <path>", a blank line, "File Size: <size>" and
#            another blank line on stdout.
print_file_size() {
  echo "Full Directory $1"
  printf "\n"
  # du prints "<size><TAB><path>". Taking the first tab-separated field keeps
  # the whole size whatever its width and never leaks the tab or path
  # characters into the report (a fixed 4-character cut does both).
  echo "File Size: $(du -sh -- "$1" | cut -f1)"
  printf "\n"
}

# Walk every domain directory, report on each entry found exactly two levels
# below it, and tally how many entries pass the README/docs/data checks.
# Nothing is archived here: a passing entry is only announced and counted.
for dir in "${domain_dirs[@]}"
do
  printf '###################################################### BEGIN %s\n\n' "$dir"
  # Read the find results NUL-delimited so that paths containing whitespace or
  # glob characters stay intact as single array elements.
  corpora_dirs=()
  while IFS= read -r -d '' corp_entry
  do
    corpora_dirs+=("$corp_entry")
  done < <(find "$absolute_url/$dir" -mindepth 2 -maxdepth 2 -print0)
  echo "${#corpora_dirs[@]} corpora entries found"
  for corp_dir in "${corpora_dirs[@]}"
  do
    # Pass the name as an argument so a '%' or '\' in it is printed literally.
    printf '##### %s ##### \n' "$(basename -- "$corp_dir")"
    print_file_size "$corp_dir"
    readme=$(is_readme "$corp_dir")
    docs=$(is_docs_folder "$corp_dir")
    data=$(is_data_folder "$corp_dir")
    # An entry counts as zippable only when all three checks printed 1; any
    # failed check (already reported on stderr by its helper) counts as an error.
    if [[ "$readme" -eq "$one" && "$docs" -eq "$one" && "$data" -eq "$one" ]]; then
      echo "Zipping files for $dir"
      num_zipped=$((num_zipped + 1))
    else
      num_errors=$((num_errors + 1))
    fi
    printf "\n"
    num_total=$((num_total + 1))
  done
done
printf "######################################################## END\n\n"
printf "Summary:\n"
echo "$num_total total corpora entries found"
echo "$num_zipped corpora entries zipped"
echo "$num_errors corpora entries could not be zipped"
exit 0

# Zip all files now
# Use something like this:
# for i in */; do zip -r "${i%/}.zip" "$i"; done