Skip to content
Snippets Groups Projects

zipitup.sh

  • Clone with SSH
  • Clone with HTTPS
  • Embed
  • Share
    The snippet can be accessed without any authentication.
    Authored by ssapra2
    snippetfile1.txt 2.13 KiB
    #!/bin/bash

    # Top-level domain folders that are scanned for corpora entries.
    domain_dirs=(
      "written"
      "transcribed"
      "multimode"
      "spoken"
      "treebanks"
      "misc"
    )

    # Root of the corpora tree: the current working directory
    # (or /shared/corpora/corporaWeb/).
    absolute_url=$PWD

    # Numeric value the scan loop compares check results against.
    one=1

    # Running tallies reported in the final summary.
    num_total=0
    num_zipped=0
    num_errors=0
    # Validate that a corpus directory has the required layout.
    # Arguments: $1 - path to the corpus directory
    # Outputs:   on the first failed check, a one-line hint on stderr (in
    #            addition to whatever the individual check itself prints)
    # Returns:   0 when data/, docs/ and a README/index page are all present,
    #            1 as soon as one of them is missing
    valid_corpora_listing() {
      # The is_* helpers report their result by printing 1 or 0 (their exit
      # status is always 0), so the printed value is what must be tested.
      # `return` only accepts a numeric status, hence the hint goes to stderr.
      if [[ "$(is_data_folder "$1")" != 1 ]]; then
        echo "Add data folder in the corpora listing" >&2
        return 1
      fi
      if [[ "$(is_docs_folder "$1")" != 1 ]]; then
        echo "Add docs folder in the corpora listing" >&2
        return 1
      fi
      if [[ "$(is_readme "$1")" != 1 ]]; then
        echo "Add a README.txt or index.html in the corpora listing" >&2
        return 1
      fi
      return 0
    }
    
    # Report whether a corpus directory contains a "data" sub-directory.
    # Arguments: $1 - path to the corpus directory
    # Outputs:   prints 1 on stdout when $1/data is a directory, otherwise
    #            prints 0 on stdout and an error message on stderr
    is_data_folder() {
      # -d (not -s) so that only a real directory counts; the path is quoted
      # so corpus names containing whitespace are handled.
      if [[ -d "$1/data" ]]; then
        echo 1
      else
        echo "ERROR: No data folder found" >&2
        echo 0
      fi
    }
    
    # Report whether a corpus directory contains a "docs" sub-directory.
    # Arguments: $1 - path to the corpus directory
    # Outputs:   prints 1 on stdout when $1/docs is a directory, otherwise
    #            prints 0 on stdout and an error message on stderr
    is_docs_folder() {
      # -d (not -s) so that only a real directory counts; the path is quoted
      # so corpus names containing whitespace are handled.
      if [[ -d "$1/docs" ]]; then
        echo 1
      else
        echo "ERROR: No docs folder found" >&2
        echo 0
      fi
    }
    
    # Report whether a corpus directory has a landing document.
    # Arguments: $1 - path to the corpus directory
    # Outputs:   prints 1 on stdout when a non-empty README.txt or index.html
    #            exists under $1, otherwise prints 0 on stdout and an error
    #            message on stderr
    is_readme() {
      local candidate
      for candidate in "$1/README.txt" "$1/index.html"; do
        # The first non-empty candidate settles the answer.
        if [[ -s $candidate ]]; then
          echo 1
          return
        fi
      done
      echo "ERROR: No README found" >&2
      echo 0
    }
    
    # Print a path followed by its total human-readable disk usage.
    # Arguments: $1 - file or directory to measure
    # Outputs:   "Full Directory <path>", a blank line, "File Size: <size>"
    #            and another blank line, all on stdout
    print_file_size() {
      local size
      # du prints "<size><TAB><path>"; take the first tab-separated field so
      # the size is complete whatever its width (e.g. "0", "12K", "1020K")
      # and no part of the path leaks into the output.
      size=$(du -sh -- "$1" | cut -f1)
      echo "Full Directory $1"
      printf "\n"
      echo "File Size: $size"
      printf "\n"
    }
    
    # Walk every domain directory, report each corpora entry found two levels
    # below it, run the layout checks and print a summary at the end.
    for dir in "${domain_dirs[@]}"; do
      printf '###################################################### BEGIN %s\n\n' "$dir"

      # Read find's output NUL-delimited so entries whose names contain
      # whitespace or glob characters stay intact as single array elements.
      corpora_dirs=()
      while IFS= read -r -d '' corp_dir; do
        corpora_dirs+=("$corp_dir")
      done < <(find "$absolute_url/$dir" -mindepth 2 -maxdepth 2 -print0)
      echo "${#corpora_dirs[@]} corpora entries found"

      for corp_dir in "${corpora_dirs[@]}"; do
        # Entry names are passed as printf arguments, never as the format
        # string, so a '%' or backslash in a name is printed literally.
        printf '##### %s ##### \n' "$(basename -- "$corp_dir")"
        print_file_size "$corp_dir"

        # All three checks run so that each one reports its own ERROR line on
        # stderr. Only the docs result decides the outcome below.
        # NOTE(review): readme and data are not consulted - confirm intended.
        readme=$(is_readme "$corp_dir")
        docs=$(is_docs_folder "$corp_dir")
        data=$(is_data_folder "$corp_dir")

        if [[ "$docs" -eq "$one" ]]; then
          # NOTE(review): no archive is created yet and num_zipped is never
          # incremented, so the summary always reports 0 zipped entries.
          echo "Zipping files for $dir"
        else
          ((num_errors++))
        fi
        printf "\n"
        ((num_total++))
      done
    done

    printf "######################################################## END\n\n"
    printf "Summary:\n"
    echo "$num_total total corpora entries found"
    echo "$num_zipped corpora entries zipped"
    echo "$num_errors corpora entries could not be zipped"
    exit 0
    
    # Zip all files now
    # Use something like this:
    # for i in */; do zip -r "${i%/}.zip" "$i"; done
    0% Loading or .
    You are about to add 0 people to the discussion. Proceed with caution.
    Finish editing this message first!
    Please register or to comment