/*
   this file reads input.fna and 
   1)finds is size using stat
   2)finds the position of start codon
   */
#include<cmath>
#include<cstdio>
#include<cstdlib>
#include<sys/types.h>
#include<sys/stat.h>
#include<unistd.h>
#include<cstring>
char input_file[100];
int input_file_size=0;
int taa_count=0,aug_count=0,gtg_count=0,aug_gtg_count=0,selected_orf_count=0;
int *aug_gtg_position,*taa_position,**orf_details;
float mean_orf_size_for_all_orfs=0.0,standard_deviation_for_all_orfs=0.0;
float mean_orf_size_for_selected_orfs=0.0,standard_deviation_for_selected_orfs=0.0;
struct stat s1;
/*
   orf_details is a 6 dimentional array
   0th element will have the start of orf
   1st element will have the end of orf
   2nd element will have the length of orf
   3rd element will have the reading frame of orf ( 0 or 1 or 2)
   4th element will have the type of start codon (atg/gtg<==>0/1)
   5th element is a selection marker for orf's with same end codon but different 
       start codon.the one with the longest size will have 1 others will have 0
       
   */
void getfileinfo(void)
{

	char *older;
	
	if (stat(input_file, &s1) < 0) 
	{
		fprintf(stderr, "cannot stat %s\n",input_file);
		exit(1);
	}
	printf("s1.st_mtime = %u\n", s1.st_mtime);
	input_file_size=s1.st_size;
	printf("the size of file 1 is %d kb\n",input_file_size);
	
}
void readfile(void)
{
	FILE *fna_ptr;
	FILE *aug_register_ptr,*taa_register_ptr,*gtg_register_ptr,*aug_gtg_register_ptr;
	char *current_area,*bridge;
	bridge=new char[4];
	bridge[0]='X';
	bridge[1]='X';
	bridge[2]='X';
	bridge[3]='X';
	int no_of_4kbs,i1,i2,i3,i4,current_orf_length;
	int global_counter=0;
	current_area=new char [4096];
	fna_ptr=fopen(input_file,"r");
	if(fna_ptr==NULL)
	{
		printf("cannot open file \n");
		exit(1);
	}
	aug_register_ptr=fopen("aug_register.gi","w");
	if(aug_register_ptr==NULL)
	{
		printf("cannot open file aug_register.gi\n");
		exit(1);
	}
	aug_gtg_register_ptr=fopen("aug_gtg_register.gi","w");
	if(aug_register_ptr==NULL)
	{
		printf("cannot open file aug_gtg_register.gi\n");
		exit(1);
	}
	gtg_register_ptr=fopen("gtg_register.gi","w");
	if(gtg_register_ptr==NULL)
	{
		printf("cannot open file gtg_register.gi\n");
		exit(1);
	}
	taa_register_ptr=fopen("taa_register.gi","w");
	if(taa_register_ptr==NULL)
	{
		printf("cannot open file taa_register.gi\n");
		exit(1);
	}
	printf("***%c%c%c%c***\n",bridge[0],bridge[1],bridge[2],bridge[3]);
	no_of_4kbs=input_file_size/4096;
	printf("the size of file in kb %d kb\n",no_of_4kbs);
	for(i1=0;i1<=no_of_4kbs;i1++)
	{
		memset(current_area,'X',4096);
	//	printf("i1=%d\n",i1);
		fread(current_area,4,1024,fna_ptr);

		bridge[2]=current_area[0];
		bridge[3]=current_area[1];
//		printf("***%c%c%c%c***\n",bridge[0],bridge[1],bridge[2],bridge[3]);
		for(i2=0;i2<=1;i2++,global_counter++)
		{
			if((bridge[i2]=='A')&&(bridge[i2+1]=='T')&&(bridge[i2+2]=='G'))
			{
	//			printf("one AUG spotted at %d\n",i2+4096*i1-1);
				fprintf(aug_register_ptr,"%d\n",global_counter);
				aug_count++;
				fprintf(aug_gtg_register_ptr,"%d\n",global_counter);
			}
//			printf("%c",current_area[i2]);
			if((bridge[i2]=='G')&&(bridge[i2+1]=='T')&&(bridge[i2+2]=='G'))
			{
	//			printf("one AUG spotted at %d\n",i2+4096*i1-1);
				fprintf(gtg_register_ptr,"%d\n",global_counter);
				gtg_count++;
				fprintf(aug_gtg_register_ptr,"%d\n",global_counter);
			}

			if((bridge[i2]=='T')&&(bridge[i2+1]=='A')&&(bridge[i2+2]=='A'))
			{
	//			printf("one TAA spotted at %d\n",i2+4096*i1-1);
				fprintf(taa_register_ptr,"%d\n",global_counter);
				taa_count++;
			}

		}


		for(i2=0;i2<4094;i2++,global_counter++)
		{
			if((current_area[i2]=='A')&&(current_area[i2+1]=='T')&&(current_area[i2+2]=='G'))
			{
	//			printf("one AUG spotted at %d\n",i2+4096*i1-1);
				fprintf(aug_register_ptr,"%d\n",global_counter);
				aug_count++;
				fprintf(aug_gtg_register_ptr,"%d\n",global_counter);
			}
//			printf("%c",current_area[i2]);
			if((current_area[i2]=='G')&&(current_area[i2+1]=='T')&&(current_area[i2+2]=='G'))
			{
	//			printf("one AUG spotted at %d\n",i2+4096*i1-1);
				fprintf(gtg_register_ptr,"%d\n",global_counter);
				gtg_count++;
				fprintf(aug_gtg_register_ptr,"%d\n",global_counter);
			}

			if((current_area[i2]=='T')&&(current_area[i2+1]=='A')&&(current_area[i2+2]=='A'))
			{
	//			printf("one TAA spotted at %d\n",i2+4096*i1-1);
				fprintf(taa_register_ptr,"%d\n",global_counter);
				taa_count++;
			}

		}
		bridge[0]=current_area[4094];
		bridge[1]=current_area[4095];
	}
	aug_gtg_count=aug_count+gtg_count;
	fclose(aug_register_ptr);
	fclose(gtg_register_ptr);
	fclose(taa_register_ptr);
	fclose(aug_gtg_register_ptr);
	delete [] current_area;
	delete [] bridge;
}
void write_to_logfile(void)
{
	FILE *logfile_ptr;
	logfile_ptr=fopen("all_orf_logfile.txt","w");
	if(logfile_ptr==NULL)
	{
		printf("cannot open logfile for writing\n");
		exit(1);
	}
	fprintf(logfile_ptr,"general details about genome\n");
	fprintf(logfile_ptr,"aug_cout=%d\n",aug_count);
	fprintf(logfile_ptr,"gtg_count=%d\n",gtg_count);
	fprintf(logfile_ptr,"taa_count=%d\n\n",taa_count);
	fprintf(logfile_ptr,"statastics about all the orfs available\n");
	fprintf(logfile_ptr,"orf count=%d\n",aug_gtg_count);
	fprintf(logfile_ptr,"mean of orf size=%f\n",mean_orf_size_for_all_orfs);
	fprintf(logfile_ptr,"variance=%f\n",standard_deviation_for_all_orfs);
	standard_deviation_for_all_orfs=sqrtf(standard_deviation_for_all_orfs);
	fprintf(logfile_ptr,"standard deviation =%f\n\n",standard_deviation_for_all_orfs);
	fprintf(logfile_ptr,"details about those orfs that do not have the same stop codon\n");
	fprintf(logfile_ptr,"orf count=%d\n",selected_orf_count);
	fprintf(logfile_ptr,"mean of orf size=%f\n",mean_orf_size_for_selected_orfs);
	fprintf(logfile_ptr,"variance=%f\n",standard_deviation_for_selected_orfs);
	standard_deviation_for_selected_orfs=sqrtf(standard_deviation_for_selected_orfs);
	fprintf(logfile_ptr,"standard deviation=%f\n",standard_deviation_for_selected_orfs);
fclose(logfile_ptr);
}
void load_codon_position(void)
{
	FILE *aug_gtg_ptr,*taa_ptr;
	int i1=0,i2=0;
	aug_gtg_ptr = fopen("aug_gtg_register.gi","r");
	taa_ptr=fopen("taa_register.gi","r");
	aug_gtg_position=new int [aug_count+gtg_count];
	orf_details=new int * [aug_count+gtg_count];
	for(i1=0;i1<aug_gtg_count;i1++)
	{
		orf_details[i1]=new int [6];
		for(i2=0;i2<6;i2++)
		{
			orf_details[i1][i2]=-2;
		}
	}
	taa_position=new int [taa_count];
	if(aug_gtg_ptr==NULL)
	{
	
		printf("cannot open file aug_gtg_register.gi for reading\n");
		exit(1);
	}
	if(taa_ptr==NULL)
	{
		printf("cannot open file taa_register.gi for reading\n");
		exit(1);

	}
	for(i1=0;i1<aug_count+gtg_count;i1++)
	{
		fscanf(aug_gtg_ptr,"%d",&aug_gtg_position[i1]);
	}
	for(i1=0;i1<aug_count+gtg_count;i1++)
	{
	//	printf("%d,%d\n",i1,aug_gtg_position[i1]);
	}
	for(i1=0;i1<taa_count;i1++)
	{
		fscanf(taa_ptr,"%d",&taa_position[i1]);
	}
	for(i1=0;i1<taa_count;i1++)
	{
	//	printf("%d,%d\n",i1,taa_position[i1]);
	}
	fclose(taa_ptr);
	fclose(aug_gtg_ptr);
}
void find_and_write_all_orfs(void)
{
	int i1,i2;
	int sum_of_all_orf_sizes=0,stop_flag=0;
	/*
algorithm:
1)read the first start,look for the stop after that in frame with it,write it.
2)read the next start ,it can be inside the first orf..even then read it and look for the next stop after in frame with it,write it
3)do this for all start codons.
4)in the end we will get all orfs in the sequence given, there might be overlaps.
5)this over laps have to eliminated.

	   */
	FILE *orf_ptr;
	orf_ptr=fopen("all_orfs_positions.gi","w");
	if(orf_ptr==NULL)
	{
		printf("cannot open file all_orfs_positions.gi for writing\n");
		exit(1);

	}
	for(i1=0;i1<aug_gtg_count;i1++)
	{
		stop_flag=0;
		for(i2=0;i2<taa_count;i2++)
		{
			if((taa_position[i2]>aug_gtg_position[i1])&&(aug_gtg_position[i1]%3==taa_position[i2]%3))
			{
				orf_details[i1][0]=aug_gtg_position[i1];
				orf_details[i1][1]=taa_position[i2];
				orf_details[i1][2]=taa_position[i2]-aug_gtg_position[i1];
				sum_of_all_orf_sizes=sum_of_all_orf_sizes+orf_details[i1][2];
				stop_flag=1;
				break;
			}
		}
		if(stop_flag==0)
		{
			orf_details[i1][0]=aug_gtg_position[i1];
			orf_details[i1][1]=-1;
			orf_details[i1][5]=-1;
			orf_details[i1][2]=0;
		}
	}
	mean_orf_size_for_all_orfs=(float)sum_of_all_orf_sizes/(float)aug_gtg_count;
	for(i1=0;i1<aug_gtg_count;i1++)
	{
		if(orf_details[i1][5]!=-1)
		{
			orf_details[i1][5]=1;
		}
		standard_deviation_for_all_orfs=powf((mean_orf_size_for_all_orfs-orf_details[i1][2]),2)+standard_deviation_for_all_orfs;
	}
	standard_deviation_for_all_orfs=standard_deviation_for_all_orfs/(float)aug_gtg_count;
	/*
	   the following for is used to find those orfs which have the same end codon but different start codon and mark the 5th element(orf_details[][5]) as -1.
	   */
	for(i1=0;i1<aug_gtg_count;i1++)
	{
		
		for(i2=i1+1;i2<aug_gtg_count;i2++)
		{
			if((orf_details[i1][1]==orf_details[i2][1])&&(orf_details[i2][5]!=-1))
			{
				orf_details[i2][5]=-1;
			}
		}
	}
	

	for(i1=0;i1<aug_gtg_count;i1++)
	{
		if(orf_details[i1][5]==1)
		{
			selected_orf_count++;
		}
		fprintf(orf_ptr,"%d\t%d\t%d\t%d\n",orf_details[i1][0],orf_details[i1][1],orf_details[i1][2],orf_details[i1][5]);

//		fprintf(orf_ptr,"%d\t%d\n",aug_gtg_position[i1]%3,taa_position[i2]%3);
	}
	fclose(orf_ptr);	

}
void find_statastics_for_selected_orfs(void)
{
	int i1,sum_of_selected_orf_sizes=0;
	for(i1=0;i1<aug_gtg_count;i1++)
	{
		if(orf_details[i1][5]==1)
		{
			sum_of_selected_orf_sizes=sum_of_selected_orf_sizes+orf_details[i1][2];
		}
	}
	mean_orf_size_for_selected_orfs=(float)sum_of_selected_orf_sizes/(float)selected_orf_count;
	for(i1=0;i1<aug_gtg_count;i1++)
	{
		if(orf_details[i1][5]==1)
		{
			standard_deviation_for_selected_orfs=powf((mean_orf_size_for_selected_orfs-orf_details[i1][2]),2)+standard_deviation_for_selected_orfs;
	
		}
	}
	standard_deviation_for_selected_orfs=standard_deviation_for_selected_orfs/(float)selected_orf_count;

	
}
/*
   this file will read upstream hundred bases of all selected orfs
( those that satisfy the condition (orf_details[][5]==1))
   algo
   1)two variables are used 
   	a)current_file_pointer_position
	b)required_file_pointer_position
   2)required_file_pointer_position will contain the start point of the upstream hundred      bases.
   3)whole file is read in a single fread into a array
   */
void read_100_bases_upstream(void)
{
	int i1,i2,i3,i4=0,current_file_pointer_position=0,required_file_pointer_position;
	FILE *seq_file_ptr,*upstream_100base_ptr;
	char current_upstream[100],**upstream;
	upstream=new char * [selected_orf_count];
	for(i1=0;i1<selected_orf_count;i1++)
	{
		upstream[i1]=new char [100];
	}
	if(upstream==NULL)
	{
		printf("cannot allocate memory for upstream\n");
		exit(0);
	}

	seq_file_ptr=fopen(input_file,"r");
	upstream_100base_ptr=fopen("upstream_100base.gi","w");
	if(seq_file_ptr==NULL)
	{
		printf("cannot open file %s for reading\n",input_file);
		exit(1);
	}
	if(upstream_100base_ptr==NULL)
	{
		printf("cannot open file upstream_100base.gi for writing\n");
		exit(1);
	}

	
	for(i1=0;i1<aug_gtg_count;i1++)
	{
		if(orf_details[i1][5]==1)
		{
			for(i4=0,i3=orf_details[i1][0]-100;i3<orf_details[i1][0];i3++,i4++)
			{
				current_upstream[i4]=current_upstream[i3];
			}
			fprintf(upstream_100base_ptr,"%s\n",current_upstream);
		}
	}
	fclose(upstream_100base_ptr);
	fclose(seq_file_ptr);
}

void memory_release(void)
{
	int i1;
	for(i1=0;i1<aug_gtg_count;i1++)
	{
		delete [] orf_details[i1];
	}
	delete [] orf_details;
	delete [] aug_gtg_position;

}
int main(void)
{
	printf("enter the input file name\n");
	gets(input_file);
	printf("entered value is %s\n",input_file);
	getfileinfo();
	readfile();
	load_codon_position();
	find_and_write_all_orfs();
	/*
	   this file also calculates statistics about all orfs that are available
	   */
	find_statastics_for_selected_orfs();
	write_to_logfile();
	read_100_bases_upstream();
	memory_release();
	return 0;
}
	
	
		

