172 lines
4.5 KiB
Mathematica
172 lines
4.5 KiB
Mathematica
|
|
function data = readfasta(varargin)
%READFASTA Load an aligned FASTA file picked by the user and return it encoded.
%
%   data = readfasta()
%   data = readfasta(gene)
%   data = readfasta(gene, chosen_index)
%
%   The file is always chosen interactively through uigetfile; no input
%   argument is interpreted as a file name.
%
%   Inputs
%     gene          Text inserted into the dialog title. When it is the ONLY
%                   argument, every FASTA header line must contain it,
%                   otherwise an error message is displayed and [] is
%                   returned.
%     chosen_index  Index used to select which records (rows of the
%                   alignment and their names) are kept before encoding.
%
%   Output
%     data          The 'seq' field of the structure returned by encodealn,
%                   or [] when the dialog is cancelled or the file content
%                   is rejected (no records, records of different lengths,
%                   header not matching GENE).
%
%   Errors
%     BAPS:InvalidInput is raised when more than two arguments are given,
%     when the selected name is not a character array, when it does not
%     exist, or when the file cannot be opened.

if nargin > 2
    % Previously this case crashed later on an undefined variable.
    error('BAPS:InvalidInput','readfasta accepts at most two input arguments')
end

% ---- choose the file ----------------------------------------------------
if nargin == 0
    dialogtitle = 'Pick a FASTA file';
else
    dialogtitle = sprintf('Pick the FASTA file for gene %s',varargin{1});
end

[fname, pname] = uigetfile( ...
    {'*.fasta;*.fas;*.txt', 'FASTA Format Files (*.fasta, *.fas, *.txt)';
    '*.*', 'All Files (*.*)'}, ...
    dialogtitle);

% uigetfile returns 0 when the user cancels the dialog.
if isequal(fname, 0)
    data = [];
    return;
end
filename = [pname, fname];

if nargin == 2
    chosen_index = varargin{2};
end

% Fixed settings handed to encodealn. They used to come from
%   [seqtype, geneticcode] = selectSeqTypeAndGeneticCode;
% NOTE(review): the meaning of these codes is defined by encodealn, which is
% not part of this file.
seqtype = 2;
geneticcode = 1;

% NOTE(review): presumably gives the GUI a chance to redraw after the
% dialog closes - confirm before removing.
pause(0.0001);

% ---- validate and open --------------------------------------------------
if ~ischar(filename)
    error('BAPS:InvalidInput','Input must be a character array')
end

if exist(filename,'file') == 0 && exist(fullfile(cd,filename),'file') == 0
    error('BAPS:InvalidInput','Input must be a valid file')
end

file = fopen(filename, 'r');
if file == -1
    error('BAPS:InvalidInput','Unable to open file %s',filename)
end

display('---------------------------------------------------');
display(['Reading fasta sequence from: ',filename,'...']);
display('---------------------------------------------------');

% ---- read all records in a single pass ----------------------------------
% A '>' anywhere on a line starts a new record and the remainder of that
% line is the record name. Every other character is a sequence symbol when
% it is a letter, '-' or '?'; anything else (digits, blanks, line ends) is
% ignored. Symbols found before the first header are discarded.
names = {};
seqs = {};
count = 0;
while true
    textline = fgetl(file);
    if ~ischar(textline)        % fgetl returns -1 at end of file
        break;
    end

    pos = find(textline == '>', 1);
    if isempty(pos)
        symbols = textline;
    else
        symbols = textline(1:pos-1);
    end

    keep = isletter(symbols) | symbols == '-' | symbols == '?';
    if count > 0 && any(keep)
        seqs{count} = [seqs{count}, upper(symbols(keep))];
    end

    if ~isempty(pos)
        count = count + 1;
        names{count} = textline(pos+1:end);   % header text without the '>'
        seqs{count} = '';
    end
end

% Everything needed is in memory now; close before any early return so the
% file handle is never leaked.
fclose(file);

% ---- validate the alignment ---------------------------------------------
lens = zeros(1, count);
for k = 1:count
    lens(k) = length(seqs{k});
end

% Every record, including the last one, must have the length of the first.
bad = find(lens ~= lens(1:min(1,count)) , 1);
if count > 0
    bad = find(lens ~= lens(1), 1);
end
if ~isempty(bad)
    fprintf('*** ERROR: Different sequence length found in allelic %d.\n', bad);
    data = [];
    return;
end

if count == 0 || lens(1) == 0
    if nargin >= 1
        display(['*** ERROR: Unmatched data for gene ' varargin{1}]);
    else
        % No gene name is available when called without arguments.
        display('*** ERROR: No sequence data found in the FASTA file');
    end
    data = [];
    return;
end

% With a single argument every header has to mention the requested gene.
if nargin == 1
    for k = 1:count
        if isempty(strfind(names{k}, varargin{1}))
            display(['*** ERROR: Unmatched data for gene ' varargin{1}]);
            data = [];
            return
        end
    end
end

% ---- assemble and encode ------------------------------------------------
S = char(seqs{:});              % one row per record, all rows equally long

if exist('chosen_index','var')
    S = S(chosen_index,:);
    names = names(chosen_index);
end

aln.seqtype = seqtype;
aln.geneticcode = geneticcode;
aln.seqnames = names;
aln.seq = S;
aln = encodealn(aln);
data = aln.seq;

display(['# of allelic types: ' num2str(size(aln.seq,1))]);
display(['# of nucleotides: ' num2str(size(aln.seq,2))]);
|
||
|
|
|