$ awk '/^>/ {if (sqlen){print sqlen}; print ;sqlen=0;next; } { sqlen = sqlen +length($0)}END{print sqlen}' db4_cdc_NDM_only.fasta | sed 's/>//g' | paste - - ref|NZ_CP029245.1| Escherichia_coli_strain_ECCRA-119_plasmid_pTB203,_complete_sequence 46161 ref|NZ_CP029386.1| Klebsiella_pneumoniae_subsp._pneumoniae_strain_SCKP040074_plasmid_pNDM6_040074,_complete_sequence 52989 ref|NZ_CP029731.1| Citrobacter_sp._CRE-46_strain_AR_0157_plasmid_unnamed3,_complete_sequence 108148 ref|NZ_CP029737.1| Providencia_rettgeri_strain_AR_0082_plasmid_unnamed,_complete_sequence 144970 ref|NZ_CP029978.1| Escherichia_coli_strain_51008369SK1_plasmid_p51008369SK1_E,_complete_sequence 99465 ref|NZ_KM923969.1| Acinetobacter_dijkshoorniae_strain_JVAP01_plasmid_pNDM-JVAP01,_complete_sequence 47268
Showing posts with label One-liner. Show all posts
Showing posts with label One-liner. Show all posts
Thursday, August 9, 2018
Sequence lengths from AWK one-liner
Thursday, February 1, 2018
One liner to convert a long fasta-file into many separate single fasta sequences
Taken from the Biostar post: https://www.biostars.org/p/105388/
while read line; do if [[ ${line:0:1} == '>' ]]; then outfile=${line#>}.fa; echo $line > $outfile; else echo $line >> $outfile; fi; done < combined_kpneumoniae.tfa
while read line; do if [[ ${line:0:1} == '>' ]]; then outfile=${line#>}.fa; echo $line > $outfile; else echo $line >> $outfile; fi; done < combined_kpneumoniae.tfa
Tuesday, January 9, 2018
Converting FastQ to FastA - comparison of one-liners and toolkits
Just out of curiosity, was checking time it takes to convert fastq to fasta by different tools and one-liners. I took the commands/one-liners from this biostar post.
The input file is gz file and contains 6 M reads of 301 bp.
The input file is gz file and contains 6 M reads of 301 bp.
Sunday, November 12, 2017
Replace a list of patterns with another list
$ while IFS= read -r aline; do filename=`echo "$aline" \|
cut -f1 -d " " `; phyloID=`echo "$aline" \|
cut -f2 -d " " `;
sed -i "s/$filename/$phyloID/" file_with_pattern_to_be_replaced.txt ;
echo "$aline"; done < id_values.txt
## "id_values.txt" - contains pattern and replacement, separated by tab
## "file_with_pattern_to_be_replaced.txt" - replace patterns in this file
Sunday, September 3, 2017
Copying multiple files from a list to another directory
I have a list of filenames in a file "list.txt".
- I want to find out the absolute location of these files
- and then want to copy to another folder.
$ time for d in $(cat list.txt); do cp `find /media/My_Book/DATA/Raw/ -name "$d"` /media/F3F9/files/ ; done
Thursday, September 8, 2016
Reapr - Sampling fragment coverage - didn't get any coverage
While running Reapr on my genome using for the long insert libraries I received the
following error.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 | $ Reapr_1.0.18/reapr scaffolds.fasta LIMP_8kb_long_mapped.bam LIMP_8kb_Reapr_out perfect [REAPR pipeline] Running facheck [REAPR pipeline] Running preprocess [REAPR preprocess] sampling region scaffold1_size3169106:5458-6874. Already sampled 0 bases [REAPR preprocess] sampling region scaffold1_size3169106:6895-18716. Already sampled 1417 bases [REAPR preprocess] sampling region scaffold1_size3169106:30623-57811. Already sampled 13239 bases [REAPR preprocess] sampling region scaffold1_size3169106:57813-98842. Already sampled 40428 bases [REAPR preprocess] sampling region scaffold1_size3169106:98863-108371. Already sampled 81458 bases [REAPR preprocess] sampling region scaffold1_size3169106:108392-108786. Already sampled 90967 bases [REAPR preprocess] sampling region scaffold1_size3169106:108807-132036. Already sampled 91362 bases [REAPR preprocess] sampling region scaffold1_size3169106:153000-153621. Already sampled 114592 bases [REAPR preprocess] sampling region scaffold1_size3169106:153642-254312. Already sampled 115214 bases [REAPR preprocess] sampling region scaffold1_size3169106:254314-257386. Already sampled 215885 bases [REAPR preprocess] sampling region scaffold1_size3169106:257407-308464. Already sampled 218958 bases [REAPR preprocess] sampling region scaffold1_size3169106:309627-314663. Already sampled 270016 bases [REAPR preprocess] sampling region scaffold1_size3169106:314684-355810. Already sampled 275053 bases [REAPR preprocess] sampling region scaffold1_size3169106:355831-356388. Already sampled 316180 bases [REAPR preprocess] sampling region scaffold1_size3169106:356424-360553. Already sampled 316738 bases [REAPR preprocess] sampling region scaffold1_size3169106:362353-365556. Already sampled 320868 bases [REAPR preprocess] sampling region scaffold1_size3169106:365596-383718. Already sampled 324072 bases [REAPR preprocess] sampling region scaffold1_size3169106:393560-401076. Already sampled 342195 bases [REAPR preprocess] sampling region scaffold1_size3169106:401097-405358. Already sampled 349712 bases [REAPR preprocess] sampling region scaffold1_size3169106:405379-423388. Already sampled 353974 bases [REAPR preprocess] sampling region scaffold1_size3169106:423445-426298. Already sampled 371984 bases [REAPR preprocess] sampling region scaffold1_size3169106:426319-435419. Already sampled 374838 bases [REAPR preprocess] sampling region scaffold1_size3169106:435440-439265. Already sampled 383939 bases [REAPR preprocess] sampling region scaffold1_size3169106:439286-440463. Already sampled 387765 bases [REAPR preprocess] sampling region scaffold1_size3169106:440484-443034. Already sampled 388943 bases [REAPR preprocess] sampling region scaffold1_size3169106:443036-506052. Already sampled 391494 bases [REAPR preprocess] sampling region scaffold1_size3169106:506073-512313. Already sampled 454511 bases [REAPR preprocess] sampling region scaffold1_size3169106:512315-521665. Already sampled 460752 bases [REAPR preprocess] sampling region scaffold1_size3169106:526492-538394. Already sampled 470103 bases [REAPR preprocess] sampling region scaffold1_size3169106:538415-571424. Already sampled 482006 bases [REAPR preprocess] sampling region scaffold1_size3169106:571426-583083. Already sampled 515016 bases [REAPR preprocess] sampling region scaffold1_size3169106:583085-585679. Already sampled 526674 bases [REAPR preprocess] sampling region scaffold1_size3169106:585700-640937. Already sampled 529269 bases [REAPR preprocess] sampling region scaffold1_size3169106:640982-662781. Already sampled 584507 bases [REAPR preprocess] sampling region scaffold1_size3169106:662783-742983. Already sampled 606307 bases [REAPR preprocess] sampling region scaffold1_size3169106:743047-760078. Already sampled 686508 bases [REAPR preprocess] sampling region scaffold1_size3169106:765101-815949. Already sampled 703540 bases [REAPR preprocess] sampling region scaffold1_size3169106:815971-820741. Already sampled 754389 bases [REAPR preprocess] sampling region scaffold1_size3169106:820912-821293. Already sampled 759160 bases [REAPR preprocess] sampling region scaffold1_size3169106:821314-865486. Already sampled 759542 bases [REAPR preprocess] sampling region scaffold1_size3169106:865488-908573. Already sampled 803715 bases [REAPR preprocess] sampling region scaffold1_size3169106:909261-942195. Already sampled 846801 bases [REAPR preprocess] sampling region scaffold1_size3169106:942216-986026. Already sampled 879736 bases [REAPR preprocess] sampling region scaffold1_size3169106:986047-991833. Already sampled 923547 bases [REAPR preprocess] sampling region scaffold1_size3169106:991854-1003751. Already sampled 929334 bases [REAPR preprocess] sampling region scaffold1_size3169106:1003772-1027843. Already sampled 941232 bases [REAPR preprocess] sampling region scaffold1_size3169106:1029659-1034131. Already sampled 965304 bases [REAPR preprocess] sampling region scaffold1_size3169106:1034152-1034594. Already sampled 969777 bases [REAPR preprocess] sampling region scaffold1_size3169106:1034615-1057662. Already sampled 970220 bases [REAPR preprocess] sampling region scaffold1_size3169106:1057683-1063521. Already sampled 993268 bases [REAPR preprocess] sampling region scaffold1_size3169106:1063542-1064434. Already sampled 999107 bases [REAPR preprocess] Something went wrong sampling fragment coverage - didn't get any coverage. Mon Jul 18 11:42:50 SGT 2016 |
Solution that worked for me:
I changed the orientation of the mate pairs reads by reverse complementing before using REAPR since by default Reapr assumes that the orientation of the reads is FR. To perform RC on fastq files, I used fastx_reverse_complement. Check this page for more details.
Monday, August 1, 2016
Categorizing the mapped reads
The mapped reads primarily fall in above four categories. 1) Properly paired, 2) Improperly paired, Broken reads [3) mates map on different scaffolds, 4) only one of the mate maps]
For my convenience, I have used the samtools commands in a script to categorize the reads mapped in pairs (PE reads) on the reference. Please find it here.
Running the script will list out the number of reads. But it takes more time with very large BAM files.
./SAM_stats.sh file_coordinate_sorted.bam Properly paired with correct insert distances: 191012 Wrong insert Distance or mates inverted: 1858 Broken Reads: 6284 Out of 6284 broken reads 6100 does not have a mate mapped Out of 6284 broken reads, 92 pairs fall on different contigs
Tuesday, July 19, 2016
Reverse complement fastq to fastq/a
When dealing with long insert mate pairs, some tools like Reapr might need the reads to be reverse complemented fastq in order to run the tool. So, I started searching for some softwares which can actually do what I wanted that.
One of the tool, I most of the times used was fastx_toolkit. It has a utility called fastx_reverse_complement by which I could convert all fastq to rc fastq files.
FASTQ to REV-COMP FASTQ:
FASTQ to REV-COMP FASTA:
Sometimes, we might have to reverse complement fastq and need to convert to fasta file. This could be achieved by running revseq tool in EMBOSS with the following command:
* If there is a following error
fastx_reverse_complement: Invalid quality score value (char '0' ord 48 quality value -16) on line 4
Try pasting "-Q33" (without quotes) after fastx_reverse_complement.
One of the tool, I most of the times used was fastx_toolkit. It has a utility called fastx_reverse_complement by which I could convert all fastq to rc fastq files.
FASTQ to REV-COMP FASTQ:
$ fastx_reverse_complement -i sample.fastq -o rc_sample_fastx_tk.fastq
FASTQ to REV-COMP FASTA:
Sometimes, we might have to reverse complement fastq and need to convert to fasta file. This could be achieved by running revseq tool in EMBOSS with the following command:
$ revseq -sequence sample.fastq -outseq rc_sample.fasta -notag
* If there is a following error
fastx_reverse_complement: Invalid quality score value (char '0' ord 48 quality value -16) on line 4
Try pasting "-Q33" (without quotes) after fastx_reverse_complement.
Monday, July 18, 2016
Grep for first occurrence for multiple strings
Inspired by a post from stackexchange. I posted this as an answer there too.
Using "for" loop and "grep" we can do the following:
Explanation:
$ cat somelog.log ADWN 1259 11:00 B23 ADWN 3009 12:00 B19 DDWN 723 11:30 B04 ADWN 1589 14:20 B12 ADWN 1259 11:10 B23 DDWN 2534 13:00 B16 ADWN 3009 11:50 B14
Using "for" loop and "grep" we can do the following:
for i in $ (cut -d " " -f1 somelog.log | sort -u); do LC_ALL=C fgrep -m1 "$i" somelog.log; done
Explanation:
$ cut -d " " -f1 somelog.log | sort -u # will result in unique identifiers
ADWN DDWN
$ LC_ALL=C fgrep -m1 "$i" somelog.log # "m" option in grep will match the first pattern
1) for loop matches each pattern using grep and picks the first pattern.
2) LC_ALL=C is used to make search faster
3) fgrep is to match fixed strings instead of regular expressions.
fastest way to count reads in fastq file - use parallel
Unzipped fastq files with 16 cores, took very minute time to count number of reads.
500bp_insert_lane1_2_read2PE_single_appended.Genome.completely.cleaned.fastq_ORPHAN.fastq
408600
500bp_insert_lane1_2_read1PE_single_appended.Genome.completely.cleaned.fastq_ORPHAN.fastq
2668085
500bp_insert_lane1_2_read2PE_single_appended.Genome.completely.cleaned.fastq_matched.fastq
148713855
500bp_insert_lane1_2_read1PE_single_appended.Genome.completely.cleaned.fastq_matched.fastq
148713855
But the zipped files of size around ~ 23 Gb took a bit of around ~5 min.
500bp_insert_lane1_read1.fastq.gz
75640549
500bp_insert_lane2_read1.fastq.gz
76849862
500bp_insert_lane2_read2.fastq.gz
76849862
500bp_insert_lane1_read2.fastq.gz
75640549
$ time parallel "echo {} && cat {} | wc -l | awk '{d=\$1; print d/4;}'" ::: 500bp_insert*.fastq
500bp_insert_lane1_2_read2PE_single_appended.Genome.completely.cleaned.fastq_ORPHAN.fastq
408600
500bp_insert_lane1_2_read1PE_single_appended.Genome.completely.cleaned.fastq_ORPHAN.fastq
2668085
500bp_insert_lane1_2_read2PE_single_appended.Genome.completely.cleaned.fastq_matched.fastq
148713855
500bp_insert_lane1_2_read1PE_single_appended.Genome.completely.cleaned.fastq_matched.fastq
148713855
real 0m19.850s user 0m13.144s sys 0m56.604s
But the zipped files of size around ~ 23 Gb took a bit of around ~5 min.
$time parallel "echo {} && gunzip -c {} | wc -l | awk '{d=\$1; print d/4;}'" ::: 500bp_insert_lane*_read1.fastq.gz
500bp_insert_lane1_read1.fastq.gz
75640549
500bp_insert_lane2_read1.fastq.gz
76849862
real 2m40.358s user 5m6.051s sys 0m24.710s
$ time parallel "echo {} && gunzip -c {} | wc -l | awk '{d=\$1; print d/4;}'" ::: 500bp_insert_lane*_read2.fastq.gz
500bp_insert_lane2_read2.fastq.gz
76849862
500bp_insert_lane1_read2.fastq.gz
75640549
real 2m49.668s user 5m17.894s sys 0m28.807s
Thursday, July 7, 2016
Rename the fasta file with the first sequence header
Suppose there are multiple fasta files with single sequence. We want to rename the fasta file with the header of the single sequence present in the fasta file. We can run the following command.
Place all the fasta files you want to change the file names in one directory and run the following command.
Place all the fasta files you want to change the file names in one directory and run the following command.
for i in *; do if [ ! -f $i ]; then echo "skipping $i"; else newname=`head -1 $i | sed 's/^\s*\([a-zA-Z0-9]\+\).*$/\1/'`; [ -n "$newname" ] ; mv -i $i $newname.fasta || echo "error at: $i"; fi; done | rename s/ // *.fasta
Subscribe to:
Comments (Atom)

