1 |
|
|
/* $OpenBSD: rde_decide.c,v 1.62 2012/07/04 20:43:26 claudio Exp $ */ |
2 |
|
|
|
3 |
|
|
/* |
4 |
|
|
* Copyright (c) 2003, 2004 Claudio Jeker <claudio@openbsd.org> |
5 |
|
|
* Copyright (c) 2003, 2004 Henning Brauer <henning@openbsd.org> |
6 |
|
|
* |
7 |
|
|
* Permission to use, copy, modify, and distribute this software for any |
8 |
|
|
* purpose with or without fee is hereby granted, provided that the above |
9 |
|
|
* copyright notice and this permission notice appear in all copies. |
10 |
|
|
* |
11 |
|
|
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
12 |
|
|
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
13 |
|
|
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR |
14 |
|
|
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
15 |
|
|
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
16 |
|
|
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF |
17 |
|
|
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
18 |
|
|
*/ |
19 |
|
|
|
20 |
|
|
#include <sys/types.h> |
21 |
|
|
#include <sys/queue.h> |
22 |
|
|
|
23 |
|
|
#include <string.h> |
24 |
|
|
|
25 |
|
|
#include "bgpd.h" |
26 |
|
|
#include "rde.h" |
27 |
|
|
|
28 |
|
|
int prefix_cmp(struct prefix *, struct prefix *); |
29 |
|
|
/* |
30 |
|
|
* Decision Engine RFC implementation: |
31 |
|
|
* Phase 1: |
32 |
|
|
* - calculate LOCAL_PREF if needed -- EBGP or IGP learnt routes |
33 |
|
|
* - IBGP routes may either use LOCAL_PREF or the local system computes |
34 |
|
|
* the degree of preference |
35 |
|
|
* - If the route is ineligible, the route MAY NOT serve as an input to |
36 |
|
|
* the next phase of route selection |
37 |
|
|
* - if the route is eligible the computed value MUST be used as the |
38 |
|
|
* LOCAL_PREF value in any IBGP readvertisement |
39 |
|
|
* |
40 |
|
|
* Phase 2: |
41 |
|
|
* - If the NEXT_HOP attribute of a BGP route depicts an address that is |
42 |
|
|
* not resolvable the BGP route MUST be excluded from the Phase 2 decision |
43 |
|
|
* function. |
44 |
|
|
* - If the AS_PATH attribute of a BGP route contains an AS loop, the BGP |
45 |
|
|
* route should be excluded from the Phase 2 decision function. |
46 |
|
|
* - The local BGP speaker identifies the route that has: |
47 |
|
|
* a) the highest degree of preference of any route to the same set |
48 |
|
|
* of destinations |
49 |
|
|
* b) is the only route to that destination |
50 |
|
|
* c) is selected as a result of the Phase 2 tie breaking rules |
51 |
|
|
* - The local speaker MUST determine the immediate next-hop address from |
52 |
|
|
* the NEXT_HOP attribute of the selected route. |
53 |
|
|
* - If either the immediate next hop or the IGP cost to the NEXT_HOP changes, |
54 |
|
|
* Phase 2 Route Selection MUST be performed again. |
55 |
|
|
* |
56 |
|
|
* Route Resolvability Condition |
57 |
|
|
* - A route Rte1, referencing only the intermediate network address, is |
58 |
|
|
* considered resolvable if the Routing Table contains at least one |
59 |
|
|
* resolvable route Rte2 that matches Rte1's intermediate network address |
60 |
|
|
* and is not recursively resolved through Rte1. |
61 |
|
|
* - Routes referencing interfaces are considered resolvable if the state of |
62 |
|
|
* the referenced interface is up and IP processing is enabled. |
63 |
|
|
* |
64 |
|
|
* Breaking Ties (Phase 2) |
65 |
|
|
* 1. Remove from consideration all routes which are not tied for having the |
66 |
|
|
* smallest number of AS numbers present in their AS_PATH attributes. |
67 |
|
|
* Note, that when counting this number, an AS_SET counts as 1 |
68 |
|
|
* 2. Remove from consideration all routes which are not tied for having the |
69 |
|
|
* lowest Origin number in their Origin attribute. |
70 |
|
|
* 3. Remove from consideration routes with less-preferred MULTI_EXIT_DISC |
71 |
|
|
* attributes. MULTI_EXIT_DISC is only comparable between routes learned |
72 |
|
|
* from the same neighboring AS. |
73 |
|
|
* 4. If at least one of the candidate routes was received via EBGP, |
74 |
|
|
* remove from consideration all routes which were received via IBGP. |
75 |
|
|
* 5. Remove from consideration any routes with less-preferred interior cost. |
76 |
|
|
* If the NEXT_HOP hop for a route is reachable, but no cost can be |
77 |
|
|
* determined, then this step should be skipped. |
78 |
|
|
* 6. Remove from consideration all routes other than the route that was |
79 |
|
|
* advertised by the BGP speaker whose BGP Identifier has the lowest value. |
80 |
|
|
* 7. Prefer the route received from the lowest peer address. |
81 |
|
|
* |
82 |
|
|
* Phase 3: Route Dissemination |
83 |
|
|
* - All routes in the Loc-RIB are processed into Adj-RIBs-Out according |
84 |
|
|
* to configured policy. A route SHALL NOT be installed in the Adj-Rib-Out |
85 |
|
|
* unless the destination and NEXT_HOP described by this route may be |
86 |
|
|
* forwarded appropriately by the Routing Table. |
87 |
|
|
*/ |
88 |
|
|
|
89 |
|
|
/* |
90 |
|
|
* Decision Engine OUR implementation: |
91 |
|
|
* Our implementation has only one RIB. The filtering is done first. The |
92 |
|
|
* filtering calculates the preference and stores it in LOCAL_PREF (Phase 1). |
93 |
|
|
* Ineligible routes are flagged as ineligible via nexthop_add(). |
94 |
|
|
* Phase 3 is done together with Phase 2. |
95 |
|
|
* In following cases a prefix needs to be reevaluated: |
96 |
|
|
* - update of a prefix (path_update) |
97 |
|
|
* - withdraw of a prefix (prefix_remove) |
98 |
|
|
* - state change of the nexthop (nexthop-{in}validate) |
99 |
|
|
* - state change of session (session down) |
100 |
|
|
*/ |
101 |
|
|
|
102 |
|
|
/* |
103 |
|
|
* Compare two prefixes with equal pt_entry. Returns an integer greater than or |
104 |
|
|
* less than 0, according to whether the prefix p1 is more or less preferred |
105 |
|
|
* than the prefix p2. p1 should be used for the new prefix and p2 for a |
106 |
|
|
* already added prefix. |
107 |
|
|
*/ |
108 |
|
|
int |
109 |
|
|
prefix_cmp(struct prefix *p1, struct prefix *p2) |
110 |
|
|
{ |
111 |
|
|
struct rde_aspath *asp1, *asp2; |
112 |
|
|
struct attr *a; |
113 |
|
|
u_int32_t p1id, p2id; |
114 |
|
|
int p1cnt, p2cnt; |
115 |
|
|
|
116 |
|
|
if (p1 == NULL) |
117 |
|
|
return (-1); |
118 |
|
|
if (p2 == NULL) |
119 |
|
|
return (1); |
120 |
|
|
|
121 |
|
|
asp1 = p1->aspath; |
122 |
|
|
asp2 = p2->aspath; |
123 |
|
|
|
124 |
|
|
/* pathes with errors are not eligible */ |
125 |
|
|
if (asp1->flags & F_ATTR_PARSE_ERR) |
126 |
|
|
return (-1); |
127 |
|
|
if (asp2->flags & F_ATTR_PARSE_ERR) |
128 |
|
|
return (1); |
129 |
|
|
|
130 |
|
|
/* only loop free pathes are eligible */ |
131 |
|
|
if (asp1->flags & F_ATTR_LOOP) |
132 |
|
|
return (-1); |
133 |
|
|
if (asp2->flags & F_ATTR_LOOP) |
134 |
|
|
return (1); |
135 |
|
|
|
136 |
|
|
/* 1. check if prefix is eligible a.k.a reachable */ |
137 |
|
|
if (asp2->nexthop != NULL && asp2->nexthop->state != NEXTHOP_REACH) |
138 |
|
|
return (1); |
139 |
|
|
if (asp1->nexthop != NULL && asp1->nexthop->state != NEXTHOP_REACH) |
140 |
|
|
return (-1); |
141 |
|
|
|
142 |
|
|
/* 2. local preference of prefix, bigger is better */ |
143 |
|
|
if ((asp1->lpref - asp2->lpref) != 0) |
144 |
|
|
return (asp1->lpref - asp2->lpref); |
145 |
|
|
|
146 |
|
|
/* 3. aspath count, the shorter the better */ |
147 |
|
|
if ((asp2->aspath->ascnt - asp1->aspath->ascnt) != 0) |
148 |
|
|
return (asp2->aspath->ascnt - asp1->aspath->ascnt); |
149 |
|
|
|
150 |
|
|
/* 4. origin, the lower the better */ |
151 |
|
|
if ((asp2->origin - asp1->origin) != 0) |
152 |
|
|
return (asp2->origin - asp1->origin); |
153 |
|
|
|
154 |
|
|
/* 5. MED decision, only comparable between the same neighboring AS */ |
155 |
|
|
if (rde_decisionflags() & BGPD_FLAG_DECISION_MED_ALWAYS || |
156 |
|
|
aspath_neighbor(asp1->aspath) == aspath_neighbor(asp2->aspath)) |
157 |
|
|
/* lowest value wins */ |
158 |
|
|
if ((asp2->med - asp1->med) != 0) |
159 |
|
|
return (asp2->med - asp1->med); |
160 |
|
|
|
161 |
|
|
/* |
162 |
|
|
* 6. EBGP is cooler than IBGP |
163 |
|
|
* It is absolutely important that the ebgp value in peer_config.ebgp |
164 |
|
|
* is bigger than all other ones (IBGP, confederations) |
165 |
|
|
*/ |
166 |
|
|
if (asp1->peer->conf.ebgp != asp2->peer->conf.ebgp) { |
167 |
|
|
if (asp1->peer->conf.ebgp) /* p1 is EBGP other is lower */ |
168 |
|
|
return 1; |
169 |
|
|
else if (asp2->peer->conf.ebgp) /* p2 is EBGP */ |
170 |
|
|
return -1; |
171 |
|
|
} |
172 |
|
|
|
173 |
|
|
/* |
174 |
|
|
* 7. local tie-breaker, this weight is here to tip equal long AS |
175 |
|
|
* paths in one or the other direction. It happens more and more |
176 |
|
|
* that AS paths are equally long and so traffic engineering needs |
177 |
|
|
* a metric that weights a prefix at a very late stage in the |
178 |
|
|
* decision process. |
179 |
|
|
*/ |
180 |
|
|
if ((asp1->weight - asp2->weight) != 0) |
181 |
|
|
return (asp1->weight - asp2->weight); |
182 |
|
|
|
183 |
|
|
/* 8. nexthop costs. NOT YET -> IGNORE */ |
184 |
|
|
|
185 |
|
|
/* |
186 |
|
|
* 9. older route (more stable) wins but only if route-age |
187 |
|
|
* evaluation is enabled. |
188 |
|
|
*/ |
189 |
|
|
if (rde_decisionflags() & BGPD_FLAG_DECISION_ROUTEAGE) |
190 |
|
|
if ((p2->lastchange - p1->lastchange) != 0) |
191 |
|
|
return (p2->lastchange - p1->lastchange); |
192 |
|
|
|
193 |
|
|
/* 10. lowest BGP Id wins, use ORIGINATOR_ID if present */ |
194 |
|
|
if ((a = attr_optget(asp1, ATTR_ORIGINATOR_ID)) != NULL) { |
195 |
|
|
memcpy(&p1id, a->data, sizeof(p1id)); |
196 |
|
|
p1id = ntohl(p1id); |
197 |
|
|
} else |
198 |
|
|
p1id = asp1->peer->remote_bgpid; |
199 |
|
|
if ((a = attr_optget(asp2, ATTR_ORIGINATOR_ID)) != NULL) { |
200 |
|
|
memcpy(&p2id, a->data, sizeof(p2id)); |
201 |
|
|
p2id = ntohl(p2id); |
202 |
|
|
} else |
203 |
|
|
p2id = asp2->peer->remote_bgpid; |
204 |
|
|
if ((p2id - p1id) != 0) |
205 |
|
|
return (p2id - p1id); |
206 |
|
|
|
207 |
|
|
/* 11. compare CLUSTER_LIST length, shorter is better */ |
208 |
|
|
p1cnt = p2cnt = 0; |
209 |
|
|
if ((a = attr_optget(asp1, ATTR_CLUSTER_LIST)) != NULL) |
210 |
|
|
p1cnt = a->len / sizeof(u_int32_t); |
211 |
|
|
if ((a = attr_optget(asp2, ATTR_CLUSTER_LIST)) != NULL) |
212 |
|
|
p2cnt = a->len / sizeof(u_int32_t); |
213 |
|
|
if ((p2cnt - p1cnt) != 0) |
214 |
|
|
return (p2cnt - p1cnt); |
215 |
|
|
|
216 |
|
|
/* 12. lowest peer address wins (IPv4 is better than IPv6) */ |
217 |
|
|
if (memcmp(&p1->aspath->peer->remote_addr, |
218 |
|
|
&p2->aspath->peer->remote_addr, |
219 |
|
|
sizeof(p1->aspath->peer->remote_addr)) != 0) |
220 |
|
|
return (-memcmp(&p1->aspath->peer->remote_addr, |
221 |
|
|
&p2->aspath->peer->remote_addr, |
222 |
|
|
sizeof(p1->aspath->peer->remote_addr))); |
223 |
|
|
|
224 |
|
|
/* 13. for announced prefixes prefer dynamic routes */ |
225 |
|
|
if ((asp1->flags & F_ANN_DYNAMIC) != (asp2->flags & F_ANN_DYNAMIC)) { |
226 |
|
|
if (asp1->flags & F_ANN_DYNAMIC) |
227 |
|
|
return (1); |
228 |
|
|
else |
229 |
|
|
return (-1); |
230 |
|
|
} |
231 |
|
|
|
232 |
|
|
fatalx("Uh, oh a politician in the decision process"); |
233 |
|
|
/* NOTREACHED */ |
234 |
|
|
} |
235 |
|
|
|
236 |
|
|
/* |
237 |
|
|
* Find the correct place to insert the prefix in the prefix list. |
238 |
|
|
* If the active prefix has changed we need to send an update. |
239 |
|
|
* The to evaluate prefix must not be in the prefix list. |
240 |
|
|
*/ |
241 |
|
|
void |
242 |
|
|
prefix_evaluate(struct prefix *p, struct rib_entry *re) |
243 |
|
|
{ |
244 |
|
|
struct prefix *xp; |
245 |
|
|
|
246 |
|
|
if (re->flags & F_RIB_NOEVALUATE || rde_noevaluate()) { |
247 |
|
|
/* decision process is turned off */ |
248 |
|
|
if (p != NULL) |
249 |
|
|
LIST_INSERT_HEAD(&re->prefix_h, p, rib_l); |
250 |
|
|
if (re->active != NULL) { |
251 |
|
|
re->active->aspath->active_cnt--; |
252 |
|
|
re->active = NULL; |
253 |
|
|
} |
254 |
|
|
return; |
255 |
|
|
} |
256 |
|
|
|
257 |
|
|
if (p != NULL) { |
258 |
|
|
if (LIST_EMPTY(&re->prefix_h)) |
259 |
|
|
LIST_INSERT_HEAD(&re->prefix_h, p, rib_l); |
260 |
|
|
else { |
261 |
|
|
LIST_FOREACH(xp, &re->prefix_h, rib_l) |
262 |
|
|
if (prefix_cmp(p, xp) > 0) { |
263 |
|
|
LIST_INSERT_BEFORE(xp, p, rib_l); |
264 |
|
|
break; |
265 |
|
|
} else if (LIST_NEXT(xp, rib_l) == NULL) { |
266 |
|
|
/* if xp last element ... */ |
267 |
|
|
LIST_INSERT_AFTER(xp, p, rib_l); |
268 |
|
|
break; |
269 |
|
|
} |
270 |
|
|
} |
271 |
|
|
} |
272 |
|
|
|
273 |
|
|
xp = LIST_FIRST(&re->prefix_h); |
274 |
|
|
if (xp == NULL || xp->aspath->flags & (F_ATTR_LOOP|F_ATTR_PARSE_ERR) || |
275 |
|
|
(xp->aspath->nexthop != NULL && |
276 |
|
|
xp->aspath->nexthop->state != NEXTHOP_REACH)) |
277 |
|
|
/* xp is ineligible */ |
278 |
|
|
xp = NULL; |
279 |
|
|
|
280 |
|
|
if (re->active != xp) { |
281 |
|
|
/* need to generate an update */ |
282 |
|
|
if (re->active != NULL) |
283 |
|
|
re->active->aspath->active_cnt--; |
284 |
|
|
|
285 |
|
|
/* |
286 |
|
|
* Send update with remove for re->active and add for xp |
287 |
|
|
* but remember that xp may be NULL aka ineligible. |
288 |
|
|
* Additional decision may be made by the called functions. |
289 |
|
|
*/ |
290 |
|
|
rde_generate_updates(re->ribid, xp, re->active); |
291 |
|
|
if ((re->flags & F_RIB_NOFIB) == 0) |
292 |
|
|
rde_send_kroute(xp, re->active, re->ribid); |
293 |
|
|
|
294 |
|
|
re->active = xp; |
295 |
|
|
if (xp != NULL) |
296 |
|
|
xp->aspath->active_cnt++; |
297 |
|
|
} |
298 |
|
|
} |