title | author | date |
---|---|---|
DNA Sequencing Examination | Quynh Nhu Nguyen | March 26th 2023 |
Download the sequences listed in the accession number (ACN) list "cytochrom_plasmid.txt"
# create the directory you will work in
$ mkdir NGS_Examination_26_03
# change into the directory
$ cd NGS_Examination_26_03
# install NCBI Entrez Direct, the package that provides the esearch and efetch commands used by the download script
$ sudo apt install ncbi-entrez-direct
# NOTE(review): acedb-other appears to ship its own, unrelated efetch binary - confirm this package is really needed
$ sudo apt install acedb-other
#!/bin/bash
#
# Download one FASTA file per accession listed in cytochrom_plasmid.txt.
#
# Every non-blank line of the list is treated as one nucleotide accession and
# is saved as "<accession>.fasta" in the current directory. Each accession is
# tried up to max_attempts times. The script exits non-zero when the list
# cannot be read or when at least one accession could not be downloaded.
#
# Requires: esearch and efetch on PATH.

file="cytochrom_plasmid.txt"
max_attempts=2 # first try plus one retry

if [[ ! -r "$file" ]]; then
  printf 'Cannot read accession list: %s\n' "$file" >&2
  exit 1
fi

# Number of files to download = number of non-blank lines in the list.
# Unlike `wc -l`, this also counts a last line without a trailing newline.
num_files=$(grep -c '[^[:space:]]' "$file")

#######################################
# Fetch a single accession into "<accession>.fasta".
# Arguments: $1 - accession to query in the nucleotide database
# Outputs:   writes <accession>.fasta on success; nothing on stdout
# Returns:   0 if esearch and efetch both succeeded and produced data,
#            1 otherwise
#######################################
fetch_fasta() {
  local acc=$1
  local out="${acc}.fasta"
  local tmp="${out}.part"
  local -a status

  # esearch gets /dev/null as stdin so it cannot consume the accession list
  # that the calling loop is reading from its own stdin.
  esearch -db nucleotide -query "$acc" </dev/null \
    | efetch -format fasta >"$tmp"
  # Look at every pipeline stage: plain $? would only report efetch.
  status=("${PIPESTATUS[@]}")

  # Download into a temporary name first, so a failed attempt never leaves an
  # empty or truncated .fasta file that later steps would pick up.
  if ((status[0] == 0 && status[1] == 0)) && [[ -s "$tmp" ]]; then
    mv -- "$tmp" "$out"
    return 0
  fi
  rm -f -- "$tmp"
  return 1
}

downloaded=0
failed=0

# Read the accession from each line of the list.
while IFS= read -r line || [[ -n "$line" ]]; do
  # Trim surrounding whitespace (this also drops a trailing CR from files
  # saved with Windows line endings) and skip blank lines.
  filename="${line#"${line%%[![:space:]]*}"}"
  filename="${filename%"${filename##*[![:space:]]}"}"
  [[ -n "$filename" ]] || continue

  ok=0
  for ((attempt = 1; attempt <= max_attempts; attempt++)); do
    if fetch_fasta "$filename"; then
      if ((attempt == 1)); then
        echo "Downloaded $filename"
      else
        echo "Downloaded $filename on retry"
      fi
      ok=1
      break
    fi
    if ((attempt < max_attempts)); then
      echo "Failed to download $filename. Retrying..." >&2
    else
      echo "Failed to download $filename on retry" >&2
    fi
  done

  if ((ok)); then
    downloaded=$((downloaded + 1))
  else
    failed=$((failed + 1))
  fi
done <"$file"

# Report what this run fetched, rather than counting every *.fasta in the
# directory (which would include merged or derived files from earlier runs).
echo "Downloaded ${downloaded} files"

if ((failed > 0)); then
  printf '%d of %d accessions could not be downloaded\n' \
    "$failed" "$num_files" >&2
  exit 1
fi
Merge them into a big fasta format file
# concatenate the per-accession FASTA files named in cytochrom_plasmid.txt into one big fasta format file
# (the file names are built from the accession list instead of using *.fasta, because that glob
#  would also match big.fasta itself and the new_*.fasta files whenever this step is re-run)
$ sed -n '/[^[:space:]]/s/[[:space:]]*$/.fasta/p' cytochrom_plasmid.txt | xargs cat -- > big.fasta
# inspect the beginning and the end of big.fasta
$ head -20 big.fasta
$ tail -20 big.fasta
# check that the number of FASTA records in big.fasta (header lines starting with ">")
# equals the number of accessions (non-blank lines) in cytochrom_plasmid.txt;
# the expected count is read from the list itself, so this works in any shell session
$ if [ "$(grep -c '^>' big.fasta)" -eq "$(grep -c '[^[:space:]]' cytochrom_plasmid.txt)" ]; then
echo "True"
else
echo "False"
fi
Remove the first 12 bases of all the sequences in the fasta and write it to the new file new_5_12.fasta
# FASTA sequences are usually wrapped over several lines, so the lines of each record are joined
# first and the first 12 bases are removed once per sequence (not once per line);
# header lines are copied unchanged and every trimmed sequence is written on a single line
$ awk 'function emit() {if (seq != "") print substr(seq, 13); seq = ""} /^>/ {emit(); print; next} {seq = seq $0} END {emit()}' big.fasta > new_5_12.fasta
Remove the last 12 bases of all the sequences in the fasta and write it to the new file new_3_12.fasta
# the lines of each record are joined first and the last 12 bases are removed once per sequence;
# (a pattern anchored with both ^ and $ would only blank lines that are exactly 12 letters long)
# header lines are copied unchanged and every trimmed sequence is written on a single line
$ awk 'function emit() {if (seq != "") print substr(seq, 1, length(seq) - 12); seq = ""} /^>/ {emit(); print; next} {seq = seq $0} END {emit()}' big.fasta > new_3_12.fasta
Remove the middle 12 bases of all the sequences in the fasta and write it to the new file new_mid_12.fasta
# the lines of each record are joined first and the 12 bases around the midpoint are removed once
# per sequence (not once per line); L is the full sequence length;
# header lines are copied unchanged and every trimmed sequence is written on a single line
$ awk 'function emit(L) {if (seq != "") {L = length(seq); print substr(seq, 1, int((L - 12) / 2)) substr(seq, int((L + 12) / 2) + 1)} seq = ""} /^>/ {emit(); print; next} {seq = seq $0} END {emit()}' big.fasta > new_mid_12.fasta