1

I have a list of strings which has four components:

a_b_c_d where:

  • a has 3 patterns of strings: str, jtp and mdl
  • b has 5 patterns of strings: HBW, HBS,HBO,NHBB and NHBO
  • c has 4 patterns of string: L, M, H and ALL
  • d has 4 patterns of strings: NMT, MC, CAR and PT
  • a, b, c and d are connected each other by _ in order
  • Some records do not have b, c and d

I need to extract c from the lists as a new field income. If c does not exist, it should be replaced by NA. Following is actual data frame that I use:

df <- c(
"str_HBW_L_NMT" ,"str_HBW_M_NMT" ,"str_HBW_H_NMT" ,"str_HBW_L_MC" ,"str_HBW_M_MC" ,"str_HBW_H_MC" ,
"str_HBW_L_CAR" ,"str_HBW_M_CAR" ,"str_HBW_H_CAR" ,"str_HBW_L_PT" ,"str_HBW_M_PT" ,"str_HBW_H_PT" ,
"str_HBS_L_NMT" ,"str_HBS_M_NMT" ,"str_HBS_H_NMT" ,"str_HBS_L_MC" ,"str_HBS_M_MC" ,"str_HBS_H_MC" ,
"str_HBS_L_CAR" ,"str_HBS_M_CAR" ,"str_HBS_H_CAR" ,"str_HBS_L_PT" ,"str_HBS_M_PT" ,"str_HBS_H_PT" ,
"str_HBO_L_NMT" ,"str_HBO_M_NMT" ,"str_HBO_H_NMT" ,"str_HBO_L_MC" ,"str_HBO_M_MC" ,"str_HBO_H_MC" ,
"str_HBO_L_CAR" ,"str_HBO_M_CAR" ,"str_HBO_H_CAR" ,"str_HBO_L_PT" ,"str_HBO_M_PT" ,"str_HBO_H_PT" ,
"str_NHBB_L_NMT","str_NHBB_M_NMT","str_NHBB_H_NMT","str_NHBB_L_MC","str_NHBB_M_MC","str_NHBB_H_MC",
"str_NHBB_L_CAR","str_NHBB_M_CAR","str_NHBB_H_CAR","str_NHBB_L_PT","str_NHBB_M_PT","str_NHBB_H_PT",
"str_NHBO_L_NMT","str_NHBO_M_NMT","str_NHBO_H_NMT","str_NHBO_L_MC","str_NHBO_M_MC","str_NHBO_H_MC",
"str_NHBO_L_CAR","str_NHBO_M_CAR","str_NHBO_H_CAR","str_NHBO_L_PT","str_NHBO_M_PT","str_NHBO_H_PT",
"str_HBW_L"     ,"str_HBW_M"     ,"str_HBW_H"     ,"str_HBS_L"    ,"str_HBS_M"    ,"str_HBS_H"    ,
"str_HBO_L"     ,"str_HBO_M"     ,"str_HBO_H"     ,"str_NHBB_L"   ,"str_NHBB_M"   ,"str_NHBB_H"   ,
"str_NHBO_L"    ,"str_NHBO_M"    ,"str_NHBO_H"    ,"str_HBW"      ,"str_HBS"      ,"str_HBO"      ,
"str_NHBB"      ,"str_NHBO"      ,"str_L"         ,"str_M"        ,"str_H"        ,"str_ALL"      ,
"jtp_HBW_L_NMT" ,"jtp_HBW_M_NMT" ,"jtp_HBW_H_NMT" ,"jtp_HBW_L_MC" ,"jtp_HBW_M_MC" ,"jtp_HBW_H_MC" ,
"jtp_HBW_L_CAR" ,"jtp_HBW_M_CAR" ,"jtp_HBW_H_CAR" ,"jtp_HBW_L_PT" ,"jtp_HBW_M_PT" ,"jtp_HBW_H_PT" ,
"jtp_HBS_L_NMT" ,"jtp_HBS_M_NMT" ,"jtp_HBS_H_NMT" ,"jtp_HBS_L_MC" ,"jtp_HBS_M_MC" ,"jtp_HBS_H_MC" ,
"jtp_HBS_L_CAR" ,"jtp_HBS_M_CAR" ,"jtp_HBS_H_CAR" ,"jtp_HBS_L_PT" ,"jtp_HBS_M_PT" ,"jtp_HBS_H_PT" ,
"jtp_HBW_L"     ,"jtp_HBW_M"     ,"jtp_HBW_H"     ,"jtp_HBS_L"    ,"jtp_HBS_M"    ,"jtp_HBS_H"    ,
"jtp_HBW"       ,"jtp_HBS"       ,"jtp_L"         ,"jtp_M"        ,"jtp_H"        ,"jtp_ALL"      ,
"mdl_HBW_L_NMT" ,"mdl_HBW_M_NMT" ,"mdl_HBW_H_NMT" ,"mdl_HBW_L_MC" ,"mdl_HBW_M_MC" ,"mdl_HBW_H_MC" ,
"mdl_HBW_L_CAR" ,"mdl_HBW_M_CAR" ,"mdl_HBW_H_CAR" ,"mdl_HBW_L_PT" ,"mdl_HBW_M_PT" ,"mdl_HBW_H_PT" ,
"mdl_HBS_L_NMT" ,"mdl_HBS_M_NMT" ,"mdl_HBS_H_NMT" ,"mdl_HBS_L_MC" ,"mdl_HBS_M_MC" ,"mdl_HBS_H_MC" ,
"mdl_HBS_L_CAR" ,"mdl_HBS_M_CAR" ,"mdl_HBS_H_CAR" ,"mdl_HBS_L_PT" ,"mdl_HBS_M_PT" ,"mdl_HBS_H_PT" ,
"mdl_HBO_L_NMT" ,"mdl_HBO_M_NMT" ,"mdl_HBO_H_NMT" ,"mdl_HBO_L_MC" ,"mdl_HBO_M_MC" ,"mdl_HBO_H_MC" ,
"mdl_HBO_L_CAR" ,"mdl_HBO_M_CAR" ,"mdl_HBO_H_CAR" ,"mdl_HBO_L_PT" ,"mdl_HBO_M_PT" ,"mdl_HBO_H_PT" ,
"mdl_NHBB_L_NMT","mdl_NHBB_M_NMT","mdl_NHBB_H_NMT","mdl_NHBB_L_MC","mdl_NHBB_M_MC","mdl_NHBB_H_MC",
"mdl_NHBB_L_CAR","mdl_NHBB_M_CAR","mdl_NHBB_H_CAR","mdl_NHBB_L_PT","mdl_NHBB_M_PT","mdl_NHBB_H_PT",
"mdl_NHBO_L_NMT","mdl_NHBO_M_NMT","mdl_NHBO_H_NMT","mdl_NHBO_L_MC","mdl_NHBO_M_MC","mdl_NHBO_H_MC",
"mdl_NHBO_L_CAR","mdl_NHBO_M_CAR","mdl_NHBO_H_CAR","mdl_NHBO_L_PT","mdl_NHBO_M_PT","mdl_NHBO_H_PT",
"mdl_HBW_L"     ,"mdl_HBW_M"     ,"mdl_HBW_H"     ,"mdl_HBS_L"    ,"mdl_HBS_M"    ,"mdl_HBS_H"    ,
"mdl_HBO_L"     ,"mdl_HBO_M"     ,"mdl_HBO_H"     ,"mdl_NHBB_L"   ,"mdl_NHBB_M"   ,"mdl_NHBB_H"   ,
"mdl_NHBO_L"    ,"mdl_NHBO_M"    ,"mdl_NHBO_H"    ,"mdl_HBW"      ,"mdl_HBS"      ,"mdl_HBO"      ,
"mdl_NHBB"      ,"mdl_NHBO"      ,"mdl_L"         ,"mdl_M"        ,"mdl_H"        ,"mdl_ALL"
)

I did lots of trials but could not extract it correctly. Following is an example of my script:

df %>% mutate(income=str_extract_all(string=name,
        pattern="(?!str|jtp|mdl|HBW|HBS|HBO|NHBB|NHBO|_)[L|M|H|(ALL)](?!NMT|MC|CAR|PT|_)"))

Do you have any suggestion to extract desired output as show below? I prefer to use tidyverse and stringr but base function is also applicable instead of stringr.

      name          income
1    str_HBW_L_NMT  L
2    str_HBW_M_NMT  M
3    str_HBW_H_NMT  H
4     str_HBW_L_MC  L
5     str_HBW_M_MC  M
6     str_HBW_H_MC  H
7    str_HBW_L_CAR  L
8    str_HBW_M_CAR  M
9    str_HBW_H_CAR  H
10    str_HBW_L_PT  L
11    str_HBW_M_PT  M
12    str_HBW_H_PT  H
13   str_HBS_L_NMT  L
14   str_HBS_M_NMT  M
15   str_HBS_H_NMT  H
16    str_HBS_L_MC  L
17    str_HBS_M_MC  M
18    str_HBS_H_MC  H
19   str_HBS_L_CAR  L
20   str_HBS_M_CAR  M
21   str_HBS_H_CAR  H
22    str_HBS_L_PT  L
23    str_HBS_M_PT  M
24    str_HBS_H_PT  H
25   str_HBO_L_NMT  L
26   str_HBO_M_NMT  M
27   str_HBO_H_NMT  H
28    str_HBO_L_MC  L
29    str_HBO_M_MC  M
30    str_HBO_H_MC  H
31   str_HBO_L_CAR  L
32   str_HBO_M_CAR  M
33   str_HBO_H_CAR  H
34    str_HBO_L_PT  L
35    str_HBO_M_PT  M
36    str_HBO_H_PT  H
37  str_NHBB_L_NMT  L
38  str_NHBB_M_NMT  M
39  str_NHBB_H_NMT  H
40   str_NHBB_L_MC  L
41   str_NHBB_M_MC  M
42   str_NHBB_H_MC  H
43  str_NHBB_L_CAR  L
44  str_NHBB_M_CAR  M
45  str_NHBB_H_CAR  H
46   str_NHBB_L_PT  L
47   str_NHBB_M_PT  M
48   str_NHBB_H_PT  H
49  str_NHBO_L_NMT  L
50  str_NHBO_M_NMT  M
51  str_NHBO_H_NMT  H
52   str_NHBO_L_MC  L
53   str_NHBO_M_MC  M
54   str_NHBO_H_MC  H
55  str_NHBO_L_CAR  L
56  str_NHBO_M_CAR  M
57  str_NHBO_H_CAR  H
58   str_NHBO_L_PT  L
59   str_NHBO_M_PT  M
60   str_NHBO_H_PT  H
61       str_HBW_L  L
62       str_HBW_M  M
63       str_HBW_H  H
64       str_HBS_L  L
65       str_HBS_M  M
66       str_HBS_H  H
67       str_HBO_L  L
68       str_HBO_M  M
69       str_HBO_H  H
70      str_NHBB_L  L
71      str_NHBB_M  M
72      str_NHBB_H  H
73      str_NHBO_L  L
74      str_NHBO_M  M
75      str_NHBO_H  H
76         str_HBW  <N/A>
77         str_HBS  <N/A>
78         str_HBO  <N/A>
79        str_NHBB  <N/A>
80        str_NHBO  <N/A>
81           str_L  L
82           str_M  M
83           str_H  H
84         str_ALL  ALL
85   jtp_HBW_L_NMT  L
86   jtp_HBW_M_NMT  M
87   jtp_HBW_H_NMT  H
88    jtp_HBW_L_MC  L
89    jtp_HBW_M_MC  M
90    jtp_HBW_H_MC  H
91   jtp_HBW_L_CAR  L
92   jtp_HBW_M_CAR  M
93   jtp_HBW_H_CAR  H
94    jtp_HBW_L_PT  L
95    jtp_HBW_M_PT  M
96    jtp_HBW_H_PT  H
97   jtp_HBS_L_NMT  L
98   jtp_HBS_M_NMT  M
99   jtp_HBS_H_NMT  H
100   jtp_HBS_L_MC  L
101   jtp_HBS_M_MC  M
102   jtp_HBS_H_MC  H
103  jtp_HBS_L_CAR  L
104  jtp_HBS_M_CAR  M
105  jtp_HBS_H_CAR  H
106   jtp_HBS_L_PT  L
107   jtp_HBS_M_PT  M
108   jtp_HBS_H_PT  H
109      jtp_HBW_L  L
110      jtp_HBW_M  M
111      jtp_HBW_H  H
112      jtp_HBS_L  L
113      jtp_HBS_M  M
114      jtp_HBS_H  H
115        jtp_HBW  <N/A>
116        jtp_HBS  <N/A>
117          jtp_L  L
118          jtp_M  M
119          jtp_H  H
120        jtp_ALL  ALL
121  mdl_HBW_L_NMT  L
122  mdl_HBW_M_NMT  M
123  mdl_HBW_H_NMT  H
124   mdl_HBW_L_MC  L
125   mdl_HBW_M_MC  M
126   mdl_HBW_H_MC  H
127  mdl_HBW_L_CAR  L
128  mdl_HBW_M_CAR  M
129  mdl_HBW_H_CAR  H
130   mdl_HBW_L_PT  L
131   mdl_HBW_M_PT  M
132   mdl_HBW_H_PT  H
133  mdl_HBS_L_NMT  L
134  mdl_HBS_M_NMT  M
135  mdl_HBS_H_NMT  H
136   mdl_HBS_L_MC  L
137   mdl_HBS_M_MC  M
138   mdl_HBS_H_MC  H
139  mdl_HBS_L_CAR  L
140  mdl_HBS_M_CAR  M
141  mdl_HBS_H_CAR  H
142   mdl_HBS_L_PT  L
143   mdl_HBS_M_PT  M
144   mdl_HBS_H_PT  H
145  mdl_HBO_L_NMT  L
146  mdl_HBO_M_NMT  M
147  mdl_HBO_H_NMT  H
148   mdl_HBO_L_MC  L
149   mdl_HBO_M_MC  M
150   mdl_HBO_H_MC  H
151  mdl_HBO_L_CAR  L
152  mdl_HBO_M_CAR  M
153  mdl_HBO_H_CAR  H
154   mdl_HBO_L_PT  L
155   mdl_HBO_M_PT  M
156   mdl_HBO_H_PT  H
157 mdl_NHBB_L_NMT  L
158 mdl_NHBB_M_NMT  M
159 mdl_NHBB_H_NMT  H
160  mdl_NHBB_L_MC  L
161  mdl_NHBB_M_MC  M
162  mdl_NHBB_H_MC  H
163 mdl_NHBB_L_CAR  L
164 mdl_NHBB_M_CAR  M
165 mdl_NHBB_H_CAR  H
166  mdl_NHBB_L_PT  L
167  mdl_NHBB_M_PT  M
168  mdl_NHBB_H_PT  H
169 mdl_NHBO_L_NMT  L
170 mdl_NHBO_M_NMT  M
171 mdl_NHBO_H_NMT  H
172  mdl_NHBO_L_MC  L
173  mdl_NHBO_M_MC  M
174  mdl_NHBO_H_MC  H
175 mdl_NHBO_L_CAR  L
176 mdl_NHBO_M_CAR  M
177 mdl_NHBO_H_CAR  H
178  mdl_NHBO_L_PT  L
179  mdl_NHBO_M_PT  M
180  mdl_NHBO_H_PT  H
181      mdl_HBW_L  L
182      mdl_HBW_M  M
183      mdl_HBW_H  H
184      mdl_HBS_L  L
185      mdl_HBS_M  M
186      mdl_HBS_H  H
187      mdl_HBO_L  L
188      mdl_HBO_M  M
189      mdl_HBO_H  H
190     mdl_NHBB_L  L
191     mdl_NHBB_M  M
192     mdl_NHBB_H  H
193     mdl_NHBO_L  L
194     mdl_NHBO_M  M
195     mdl_NHBO_H  H
196        mdl_HBW  <N/A>
197        mdl_HBS  <N/A>
198        mdl_HBO  <N/A>
199       mdl_NHBB  <N/A>
200       mdl_NHBO  <N/A>
201          mdl_L  L
202          mdl_M  M
203          mdl_H  H
204        mdl_ALL  ALL

============== NEW EXAMPLE OF DATA FRAME ========== Records containing only c or d were added at the top of original df.

df <- c(
"NMT","MC","CAR","PT","L","M","H","ALL",
"str_HBW_L_NMT" ,"str_HBW_M_NMT" ,"str_HBW_H_NMT" ,"str_HBW_L_MC" ,"str_HBW_M_MC" ,"str_HBW_H_MC" ,
"str_HBW_L_CAR" ,"str_HBW_M_CAR" ,"str_HBW_H_CAR" ,"str_HBW_L_PT" ,"str_HBW_M_PT" ,"str_HBW_H_PT" ,
"str_HBS_L_NMT" ,"str_HBS_M_NMT" ,"str_HBS_H_NMT" ,"str_HBS_L_MC" ,"str_HBS_M_MC" ,"str_HBS_H_MC" ,
"str_HBS_L_CAR" ,"str_HBS_M_CAR" ,"str_HBS_H_CAR" ,"str_HBS_L_PT" ,"str_HBS_M_PT" ,"str_HBS_H_PT" ,
"str_HBO_L_NMT" ,"str_HBO_M_NMT" ,"str_HBO_H_NMT" ,"str_HBO_L_MC" ,"str_HBO_M_MC" ,"str_HBO_H_MC" ,
"str_HBO_L_CAR" ,"str_HBO_M_CAR" ,"str_HBO_H_CAR" ,"str_HBO_L_PT" ,"str_HBO_M_PT" ,"str_HBO_H_PT" ,
"str_NHBB_L_NMT","str_NHBB_M_NMT","str_NHBB_H_NMT","str_NHBB_L_MC","str_NHBB_M_MC","str_NHBB_H_MC",
"str_NHBB_L_CAR","str_NHBB_M_CAR","str_NHBB_H_CAR","str_NHBB_L_PT","str_NHBB_M_PT","str_NHBB_H_PT",
"str_NHBO_L_NMT","str_NHBO_M_NMT","str_NHBO_H_NMT","str_NHBO_L_MC","str_NHBO_M_MC","str_NHBO_H_MC",
"str_NHBO_L_CAR","str_NHBO_M_CAR","str_NHBO_H_CAR","str_NHBO_L_PT","str_NHBO_M_PT","str_NHBO_H_PT",
"str_HBW_L"     ,"str_HBW_M"     ,"str_HBW_H"     ,"str_HBS_L"    ,"str_HBS_M"    ,"str_HBS_H"    ,
"str_HBO_L"     ,"str_HBO_M"     ,"str_HBO_H"     ,"str_NHBB_L"   ,"str_NHBB_M"   ,"str_NHBB_H"   ,
"str_NHBO_L"    ,"str_NHBO_M"    ,"str_NHBO_H"    ,"str_HBW"      ,"str_HBS"      ,"str_HBO"      ,
"str_NHBB"      ,"str_NHBO"      ,"str_L"         ,"str_M"        ,"str_H"        ,"str_ALL"      ,
"jtp_HBW_L_NMT" ,"jtp_HBW_M_NMT" ,"jtp_HBW_H_NMT" ,"jtp_HBW_L_MC" ,"jtp_HBW_M_MC" ,"jtp_HBW_H_MC" ,
"jtp_HBW_L_CAR" ,"jtp_HBW_M_CAR" ,"jtp_HBW_H_CAR" ,"jtp_HBW_L_PT" ,"jtp_HBW_M_PT" ,"jtp_HBW_H_PT" ,
"jtp_HBS_L_NMT" ,"jtp_HBS_M_NMT" ,"jtp_HBS_H_NMT" ,"jtp_HBS_L_MC" ,"jtp_HBS_M_MC" ,"jtp_HBS_H_MC" ,
"jtp_HBS_L_CAR" ,"jtp_HBS_M_CAR" ,"jtp_HBS_H_CAR" ,"jtp_HBS_L_PT" ,"jtp_HBS_M_PT" ,"jtp_HBS_H_PT" ,
"jtp_HBW_L"     ,"jtp_HBW_M"     ,"jtp_HBW_H"     ,"jtp_HBS_L"    ,"jtp_HBS_M"    ,"jtp_HBS_H"    ,
"jtp_HBW"       ,"jtp_HBS"       ,"jtp_L"         ,"jtp_M"        ,"jtp_H"        ,"jtp_ALL"      ,
"mdl_HBW_L_NMT" ,"mdl_HBW_M_NMT" ,"mdl_HBW_H_NMT" ,"mdl_HBW_L_MC" ,"mdl_HBW_M_MC" ,"mdl_HBW_H_MC" ,
"mdl_HBW_L_CAR" ,"mdl_HBW_M_CAR" ,"mdl_HBW_H_CAR" ,"mdl_HBW_L_PT" ,"mdl_HBW_M_PT" ,"mdl_HBW_H_PT" ,
"mdl_HBS_L_NMT" ,"mdl_HBS_M_NMT" ,"mdl_HBS_H_NMT" ,"mdl_HBS_L_MC" ,"mdl_HBS_M_MC" ,"mdl_HBS_H_MC" ,
"mdl_HBS_L_CAR" ,"mdl_HBS_M_CAR" ,"mdl_HBS_H_CAR" ,"mdl_HBS_L_PT" ,"mdl_HBS_M_PT" ,"mdl_HBS_H_PT" ,
"mdl_HBO_L_NMT" ,"mdl_HBO_M_NMT" ,"mdl_HBO_H_NMT" ,"mdl_HBO_L_MC" ,"mdl_HBO_M_MC" ,"mdl_HBO_H_MC" ,
"mdl_HBO_L_CAR" ,"mdl_HBO_M_CAR" ,"mdl_HBO_H_CAR" ,"mdl_HBO_L_PT" ,"mdl_HBO_M_PT" ,"mdl_HBO_H_PT" ,
"mdl_NHBB_L_NMT","mdl_NHBB_M_NMT","mdl_NHBB_H_NMT","mdl_NHBB_L_MC","mdl_NHBB_M_MC","mdl_NHBB_H_MC",
"mdl_NHBB_L_CAR","mdl_NHBB_M_CAR","mdl_NHBB_H_CAR","mdl_NHBB_L_PT","mdl_NHBB_M_PT","mdl_NHBB_H_PT",
"mdl_NHBO_L_NMT","mdl_NHBO_M_NMT","mdl_NHBO_H_NMT","mdl_NHBO_L_MC","mdl_NHBO_M_MC","mdl_NHBO_H_MC",
"mdl_NHBO_L_CAR","mdl_NHBO_M_CAR","mdl_NHBO_H_CAR","mdl_NHBO_L_PT","mdl_NHBO_M_PT","mdl_NHBO_H_PT",
"mdl_HBW_L"     ,"mdl_HBW_M"     ,"mdl_HBW_H"     ,"mdl_HBS_L"    ,"mdl_HBS_M"    ,"mdl_HBS_H"    ,
"mdl_HBO_L"     ,"mdl_HBO_M"     ,"mdl_HBO_H"     ,"mdl_NHBB_L"   ,"mdl_NHBB_M"   ,"mdl_NHBB_H"   ,
"mdl_NHBO_L"    ,"mdl_NHBO_M"    ,"mdl_NHBO_H"    ,"mdl_HBW"      ,"mdl_HBS"      ,"mdl_HBO"      ,
"mdl_NHBB"      ,"mdl_NHBO"      ,"mdl_L"         ,"mdl_M"        ,"mdl_H"        ,"mdl_ALL"
Hideo.S
  • 601
  • 4
  • 14

3 Answers3

2

You may create a regex that will match all your inputs with optional groupings for b, c and d, remove all but the c part from the string and then replace empty items with NAs:

res <- sub("^(?:(?:str|jtp|mdl)(?:_|$))?(?:(?:HB[WSO]|NHB[BO])(?:_|$))?(?:([LMH]|ALL)(?:_|$))?(?:NMT|MC|CAR|PT)?$", "\\1", df)
res[nchar(res)==0] <- NA
> res
  [1] NA    NA    NA    NA    "L"   "M"   "H"   "ALL" "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"  
 [29] "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"  
 [57] "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   NA   
 [85] NA    NA    NA    NA    "L"   "M"   "H"   "ALL" "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"  
[113] "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   NA    NA    "L"   "M"   "H"   "ALL" "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"  
[141] "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"  
[169] "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"  
[197] "H"   "L"   "M"   "H"   "L"   "M"   "H"   NA    NA    NA    NA    NA    "L"   "M"   "H"   "ALL"

See the regex demo.

As you see, you do not need any additional package here.

Pattern details

  • ^ - start of string
  • (?:(?:str|jtp|mdl)(?:_|$))? - a part (an optional group): str, jtp or mdl followed with _ or end of string ((?:_|$))
  • (?:(?:HB[WSO]|NHB[BO])(?:_|$))? - a b part (an optional group): an optional non-capturing group matching _ and then HBW, HBS, HBO, NHBB or NHBO followed with _ or end of string ((?:_|$))
  • (?:_([LMH]|ALL))? - a c part: an optional non-capturing group matching _ and then L, M, H or ALL followed with _ or end of string ((?:_|$))
  • (?:NMT|MC|CAR|PT)? - a d part: an optional non-capturing group matching NMT, MC, CAR or PT
  • $ - end of string.

The \\1 (a \1 string) placeholder in the replacement pattern restores the Group 1 (c pattern).

Wiktor Stribiżew
  • 484,719
  • 26
  • 302
  • 397
  • 1
    Thank you, it worked out! Detailed explanation also helps me a lot as I am new to user regular expression. – Hideo.S Jun 04 '18 at 10:59
  • I have one additional question. If some records have only `c` or `d` (such as NMT, CAR, H, ALL, etc.), it returns those values in your script. How should it be modified? – Hideo.S Jun 08 '18 at 06:54
  • @Hideo [These are not matched](https://regex101.com/r/QCMa9K/1). Please share a fiddle. Also, please use `res[!grepl("^(?:[LMH]|ALL)$",res)] – Wiktor Stribiżew Jun 08 '18 at 07:03
  • Thank you for your response. I added new example of data frame. – Hideo.S Jun 11 '18 at 03:13
  • 1
    @Hideo Ok, I see. I updated the solution. Just add `|.+` at the end of the pattern. – Wiktor Stribiżew Jun 11 '18 at 08:25
  • Thank you for your response. I tested it but found that those newly added records `"NMT","MC","CAR","PT","L","M","H","ALL"` returns blank. As I need to extract group `c` from those records, `"NMT","MC","CAR","PT"`, those are all `b` group, should return `"ALL"` and `"L","M","H","ALL"`, those are all `c` group, should be returned as it is. Sorry not to give enough information but if group `c` is missing in each record, those records are regarded as it contains `"ALL"` income group. – Hideo.S Jun 11 '18 at 08:46
  • 1
    @Hideo See an update. I had to make all parts of the string optional, and it required a bit more tweaking related to the separator char. – Wiktor Stribiżew Jun 11 '18 at 08:56
  • Thank you for your continuous suggestion. It worked completely the same as I expected. I will try to understand it completely. – Hideo.S Jun 11 '18 at 17:40
  • 1
    @Hideo Well, I actually explained everything, but you may wonder what `(?:...)` is. It is a [non-capturing group](https://stackoverflow.com/questions/3512471/what-is-a-non-capturing-group-what-does-do) that does not create any submatch (it does not create any `\1` or `\2`) and is only used to group sequences of patterns or alternations. – Wiktor Stribiżew Jun 11 '18 at 18:10
0

I would make this data into a dataframe by splitting the string into columns. That is much easier to work with.

require(stringr)

l <- str_split(df, pattern = "_") 

df2 <- as.data.frame(do.call(rbind, lapply(l, function(x) {length(x) <- 4
x})))

str(df2)

results in:

'data.frame':   204 obs. of  4 variables:
$ V1: Factor w/ 3 levels "jtp","mdl","str": 3 3 3 3 3 3 3 3 3 3 ...
$ V2: Factor w/ 9 levels "ALL","H","HBO",..: 5 5 5 5 5 5 5 5 5 5 ...
$ V3: Factor w/ 3 levels "H","L","M": 2 3 1 2 3 1 2 3 1 2 ...
$ V4: Factor w/ 4 levels "CAR","MC","NMT",..: 3 3 3 2 2 2 1 1 1 4 ...
Wietze314
  • 5,675
  • 1
  • 17
  • 35
  • Thank you for your suggestion! I would prefer to keep original columns to keep consistency between actual data frame as it contains other columns. – Hideo.S Jun 04 '18 at 11:03
0

Somewhat simpler regular expressions

library(stringr)
library(dplyr)

# Create pattern to look for
patt <- paste0("_", c("L", "M", "H", "ALL"), c(rep("$", 4), rep("_", 4)))
patt <- paste(patt, collapse = "|")
patt
[1] "_L$|_M$|_H$|_ALL$|_L_|_M_|_H_|_ALL_"

# Extract pattern and clean
df$income <- str_extract(df$name, patt) %>% 
  str_replace_all("_", "")

head(df)
         name income
str_HBW_L_NMT      L
str_HBW_M_NMT      M
str_HBW_H_NMT      H
 str_HBW_L_MC      L
 str_HBW_M_MC      M
 str_HBW_H_MC      H

Or avoid regex/dplyr/stringr altogether:

df$income <- 
  strsplit(df$name, split = "_", fixed = TRUE) %>% 
  lapply(
    function(x) {
      our_el <- x %in% c("L", "M", "H", "ALL")
      ifelse(!any(our_el), NA, x[our_el])
    }
  )
sindri_baldur
  • 22,360
  • 2
  • 25
  • 48
  • Thank you for your suggestion. Though combination pattern of each group got a bit complicated than the original, this solution works for fixed format. – Hideo.S Jun 11 '18 at 19:08