2012-07-26 33 views
1

我有6个结构相同的SAS数据集,它们只在观测值上有所不同。(本帖标题:根据其他数据集有条件地创建SAS数据集)

如何创建一个输出数据集,使其中每个单元格都取所有6个数据集中对应单元格的最大值?

`update` 语句似乎是一个不错的选择,但它无法设置条件。

数据1

v1 v2 v3 
1 1 1 
1 2 3 

数据2

v1 v2 v3 
1 2 3 
1 1 1 

结果

v1 v2 v3 
1 2 3 
1 2 3 
+0

没有任何键可以唯一标识每条记录吗?还是只能依靠观测序号(行的顺序)? – vasja 2012-07-26 08:01:38

+0

是的,有一个标识符,它的顺序。 – Rico 2012-07-26 08:21:05

回答

1
If need be the following could be automated by "PUT" statements or variable arrays. 
/* Collapse the interleaved rows of A, B and C into one row per ID that holds
   the largest v1, v2 and v3 seen for that ID.
   All three input data sets must already be sorted by ID. */
data test (drop=v1-v3 rename=(updv1=v1 updv2=v2 updv3=v3));
    /* One pass of this loop consumes a whole BY group; the updv accumulators
       are reset to missing at the top of each DATA step iteration. */
    do until (last.id);
        set a b c;
        by id;
        /* A missing accumulator compares lower than any number, so the first
           non-missing value always replaces it. */
        if updv1 < v1 then updv1 = v1;
        if updv2 < v2 then updv2 = v2;
        if updv3 < v3 then updv3 = v3;
    end;
    /* Implicit OUTPUT here writes a single row per ID. */
run;

为了给 Rico 的问题提供一个更完整的解决方案(假设6个数据集名为 d1-d6),可以这样做:

/* Build TEST with one row per ID containing, for each of v1-v3, the largest
   value found across data sets d1-d6. All inputs must be sorted by ID.
   To handle more variables, widen both array definitions and the
   DROP/RENAME lists accordingly. */
Data test;
    array v(*) v1-v3;             /* incoming values read by SET          */
    array updv(*) updv1-updv3;    /* running maxima for the current ID    */
    /* The loop reads every row of one BY group before the implicit OUTPUT,
       so the step writes exactly one row per ID. */
    do until(last.id);
        set d1-d6;
        by id;
        do i = 1 to dim(v);
            /* updv(i) starts missing, which compares lower than any number. */
            if v(i) > updv(i) then updv(i) = v(i);
        end;
    end;
    /* i is only the array index; if it is not dropped it is written to TEST
       as an extra column. */
    drop i v1-v3;
    rename updv1-updv3 = v1-v3;
    run;


    /* Print ID and v1-v3 from the most recently created data set
       (_LAST_ is what PROC PRINT uses when DATA= is omitted). */
    proc print data=_last_;
        var id v1 v2 v3;
    run;
+0

这太好了。我试图循环变量名称,但它不工作。 – Rico 2012-08-06 09:23:07

+0

不适用于数组。用户编写的'%do_over'宏的解决方案似乎可行。 – Rico 2012-08-06 09:29:28

0

这是另一种尝试,它可以扩展到任意数量的数据集和变量。这次我还添加了一个ID变量。和@vasja的回答一样,这里用到了一些高级技巧。这两种解决方案其实非常相似,只是我用 `call execute` 而不是宏来创建视图。我的解决方案还需要把各数据集的名称存放在一个数据集中。

/* Control table: one row per input data set that should be merged. */
data datasets;
    length ds_name $ 8;   /* same length list input with $ would assign */
    input ds_name $;
    datalines;
data1
data2
;
run;

/* Sample input #1: two IDs with three numeric measures each. */
data data1;
    input id v1-v3;
    datalines;
10 1 1 1
20 1 2 3
;
run;

/* Sample input #2: same IDs and layout as DATA1, with different values. */
data data2;
    input id v1-v3;
    datalines;
10 1 2 3
20 1 1 1
;
run;

/* From the column metadata of WORK.DATA1 (ID excluded) build:
     - table VARIABLES, one row per column name (v_name)
     - macro variable KEEPVAR, the names as a blank-separated list
     - macro variable NUMVAR, the number of such columns            */
proc sql noprint;
    create table variables as
        select name as v_name
        from dictionary.columns
        where libname='WORK'
          and upcase(memname)='DATA1'
          and upcase(name) ne 'ID';

    /* count(*) is remerged onto every row; NUMVAR takes it from the first row. */
    select name,
           count(*)
        into :keepvar separated by ' ',
             :numvar
        from dictionary.columns
        where libname='WORK'
          and upcase(memname)='DATA1'
          and upcase(name) ne 'ID';
quit;

/* Generate, via CALL EXECUTE, a DATA step view DATA_ALL that
     - merges every data set named in DATASETS by ID,
     - renames each variable listed in VARIABLES to <var>_<k>, where k is the
       position of the data set in DATASETS, and
     - recreates <var> as the maximum of all its <var>_<k> copies.
   Requires macro variable NUMVAR (number of rows in VARIABLES). The generated
   code runs after this DATA _NULL_ step has finished. */
data _null_;
    set datasets end=last;

    /* Open the generated step once, before the first data set reference. */
    if _n_=1 then call execute('data data_all/view=data_all; merge');

    /* Emit  <ds>(rename=(<var>=<var>_<k> ...))  for the current data set. */
    call execute(trim(ds_name)||'(rename=(');
    do i=1 to &numvar.;
        set variables point=i;
        call execute(trim(v_name)||'='||catx('_',v_name,_n_));
    end;
    call execute('))');

    /* After the last data set: close MERGE, add BY, then the MAX assignments. */
    if last then do;
        call execute('; by id;');
        do i=1 to &numvar.;
            set variables point=i;
            /* The prefix must include the underscore: a bare "v1:" list would
               also match v10_1, v11_1, ... and give a wrong maximum as soon as
               one variable name is a prefix of another. */
            call execute(trim(v_name)||'='||'max(of '||trim(v_name)||'_:);');
        end;
        call execute('run;');
    end;
run;

/* Materialise the view, keeping only ID and the per-ID maxima
   (the variable names listed in macro variable KEEPVAR). */
data result;
    set data_all;
    keep id &keepvar.;
run;
+0

我有大量的变量,针对这种情况有什么编码建议吗? – Rico 2012-07-26 09:11:09

0

见下文。对于SAS初学者来说可能太复杂了。我希望评论能够解释一下。

/* macro rename_cols_opt(n)
   Creates global macro variable cols_opt&n holding the body of a RENAME=
   data set option for input data set number n: every column of WORK.DATA1
   except MY_ID_COLUMN is mapped to <name>_<n>.
   - The column list is always taken from DATA1; the other data sets are
     expected to have the same structure.
   - The underscore separator keeps the generated names unambiguous. Without
     it, v1 from data set 1 becomes v11, and a "v1:" prefix list would also
     pick up v10, v11, ... copies.
   - <name>_<n> must still fit in a 32-character SAS name.
   - max&n is declared global as in the original code; nothing here assigns it.
*/
%macro rename_cols_opt(n);
    %global cols_opt&n max&n;

    proc sql noprint;
    select catt(name, '=', name, "_&n") into: cols_opt&n separated by ' '
    from dictionary.columns
     where libname='WORK' and memname='DATA1'
      and upcase(name) ne 'MY_ID_COLUMN'
    ;
    quit;

%mend;


/* prepare macro variables = pre-generate the code */
%rename_cols_opt(1)
%rename_cols_opt(2)
%rename_cols_opt(3)
%rename_cols_opt(4)
%rename_cols_opt(5)
%rename_cols_opt(6)

/* create macro variable keep_list containing the names of the output variables
   to keep (based on the DATA1 structure; the code expects the same variables
   in the other tables as well). MY_ID_COLUMN is deliberately included. */
proc sql noprint;
select trim(name) into: keep_list separated by ' '
from dictionary.columns
    where libname='WORK' and memname='DATA1'
;
quit;
%put &keep_list;


/* macro variable maxcode contains generated code for calculating all MAX values.
   The prefix list is <name>_: (with the underscore) so that it matches exactly
   the <name>_<n> copies produced by rename_cols_opt and nothing else. */
proc sql noprint;
select cat(trim(name), ' = max(of ', trim(name), '_:)') into: maxcode separated by '; '
from dictionary.columns
    where libname='WORK' and memname='DATA1'
     and upcase(name) ne 'MY_ID_COLUMN'
;
quit;
%put "&maxcode";

/* View RESULT1: merge the six renamed inputs by MY_ID_COLUMN, rebuild every
   original variable as the maximum of its six copies, and keep only the
   original variable names. */
data result1/view =result1;
merge
    data1 (rename=(&cols_opt1))
    data2 (rename=(&cols_opt2))
    data3 (rename=(&cols_opt3))
    data4 (rename=(&cols_opt4))
    data5 (rename=(&cols_opt5))
    data6 (rename=(&cols_opt6))
;
by MY_ID_COLUMN;
&maxcode;
keep &keep_list;
run;

/* Write the stored source of the DATA step view RESULT1 to the log so the
   macro-generated code can be inspected. */
data view=result1;
    describe;
run;