From adf08d481b52824c87af028fc74dc692d179f7f4 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 13 Oct 2016 19:07:54 +0900 Subject: regmap: include from include/linux/regmap.h The readx_poll_timeout() macro calls usleep_range(), which is declared in . Make include/linux/regmap.h include , like include/linux/iopoll.h does. Otherwise, users of the macro will see "implicit declaration of function 'usleep_range'" error. Signed-off-by: Masahiro Yamada Signed-off-by: Mark Brown --- include/linux/regmap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index 9adc7b21903d..e5157e39ff22 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -15,6 +15,7 @@ #include #include +#include #include #include #include -- cgit From 0189efb8f4f830b9ac7a7c56c0c6e260859e950d Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Thu, 13 Oct 2016 22:57:02 +0300 Subject: qed*: Fix Kconfig dependencies with INFINIBAND_QEDR The qedr driver would require a tristate Kconfig option [to allow it to compile as a module], and toward that end we've added the INFINIBAND_QEDR option. But as we've made the compilation of the qed/qede infrastructure required for RoCE dependent on the option we'd be facing linking difficulties in case that QED=y or QEDE=y, and INFINIBAND_QEDR=m. To resolve this, we seperate between the INFINIBAND_QEDR option and the infrastructure support in qed/qede by introducing a new QED_RDMA option which would be selected by INFINIBAND_QEDR but would be a boolean instead of a tristate; Following that, the qed/qede is fixed based on this new option so that all config combinations would be supported. Fixes: cee9fbd8e2e9 ("qede: add qedr framework") Reported-by: Arnd Bergmann Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/Kconfig | 4 ++ drivers/net/ethernet/qlogic/qed/Makefile | 2 +- drivers/net/ethernet/qlogic/qed/qed_cxt.c | 7 +-- drivers/net/ethernet/qlogic/qed/qed_dev.c | 14 ++--- drivers/net/ethernet/qlogic/qed/qed_ll2.c | 1 + drivers/net/ethernet/qlogic/qed/qed_ll2.h | 20 ------- drivers/net/ethernet/qlogic/qed/qed_main.c | 28 +++++---- drivers/net/ethernet/qlogic/qed/qed_roce.c | 91 +++++++++++++++--------------- drivers/net/ethernet/qlogic/qed/qed_roce.h | 75 +++++++++++++++--------- drivers/net/ethernet/qlogic/qed/qed_spq.c | 4 -- drivers/net/ethernet/qlogic/qede/Makefile | 2 +- include/linux/qed/qede_roce.h | 2 +- 12 files changed, 120 insertions(+), 130 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/qlogic/Kconfig b/drivers/net/ethernet/qlogic/Kconfig index 0df1391f9663..77567727528a 100644 --- a/drivers/net/ethernet/qlogic/Kconfig +++ b/drivers/net/ethernet/qlogic/Kconfig @@ -107,10 +107,14 @@ config QEDE ---help--- This enables the support for ... +config QED_RDMA + bool + config INFINIBAND_QEDR tristate "QLogic qede RoCE sources [debug]" depends on QEDE && 64BIT select QED_LL2 + select QED_RDMA default n ---help--- This provides a temporary node that allows the compilation diff --git a/drivers/net/ethernet/qlogic/qed/Makefile b/drivers/net/ethernet/qlogic/qed/Makefile index cda0af7fbc20..967acf322c09 100644 --- a/drivers/net/ethernet/qlogic/qed/Makefile +++ b/drivers/net/ethernet/qlogic/qed/Makefile @@ -5,4 +5,4 @@ qed-y := qed_cxt.o qed_dev.o qed_hw.o qed_init_fw_funcs.o qed_init_ops.o \ qed_selftest.o qed_dcbx.o qed_debug.o qed-$(CONFIG_QED_SRIOV) += qed_sriov.o qed_vf.o qed-$(CONFIG_QED_LL2) += qed_ll2.o -qed-$(CONFIG_INFINIBAND_QEDR) += qed_roce.o +qed-$(CONFIG_QED_RDMA) += qed_roce.o diff --git a/drivers/net/ethernet/qlogic/qed/qed_cxt.c b/drivers/net/ethernet/qlogic/qed/qed_cxt.c index 82370a1a59ad..277db7831f7b 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_cxt.c +++ b/drivers/net/ethernet/qlogic/qed/qed_cxt.c @@ -47,13 +47,8 @@ #define TM_ALIGN BIT(TM_SHIFT) #define TM_ELEM_SIZE 4 -/* ILT constants */ -#if IS_ENABLED(CONFIG_INFINIBAND_QEDR) /* For RoCE we configure to 64K to cover for RoCE max tasks 256K purpose. */ -#define ILT_DEFAULT_HW_P_SIZE 4 -#else -#define ILT_DEFAULT_HW_P_SIZE 3 -#endif +#define ILT_DEFAULT_HW_P_SIZE (IS_ENABLED(CONFIG_QED_RDMA) ? 4 : 3) #define ILT_PAGE_IN_BYTES(hw_p_size) (1U << ((hw_p_size) + 12)) #define ILT_CFG_REG(cli, reg) PSWRQ2_REG_ ## cli ## _ ## reg ## _RT_OFFSET diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c index 754f6a908858..21adf5208320 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dev.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c @@ -1422,19 +1422,19 @@ static void qed_hw_set_feat(struct qed_hwfn *p_hwfn) u32 *feat_num = p_hwfn->hw_info.feat_num; int num_features = 1; -#if IS_ENABLED(CONFIG_INFINIBAND_QEDR) - /* Roce CNQ each requires: 1 status block + 1 CNQ. We divide the - * status blocks equally between L2 / RoCE but with consideration as - * to how many l2 queues / cnqs we have - */ - if (p_hwfn->hw_info.personality == QED_PCI_ETH_ROCE) { + if (IS_ENABLED(CONFIG_QED_RDMA) && + p_hwfn->hw_info.personality == QED_PCI_ETH_ROCE) { + /* Roce CNQ each requires: 1 status block + 1 CNQ. We divide + * the status blocks equally between L2 / RoCE but with + * consideration as to how many l2 queues / cnqs we have. + */ num_features++; feat_num[QED_RDMA_CNQ] = min_t(u32, RESC_NUM(p_hwfn, QED_SB) / num_features, RESC_NUM(p_hwfn, QED_RDMA_CNQ_RAM)); } -#endif + feat_num[QED_PF_L2_QUE] = min_t(u32, RESC_NUM(p_hwfn, QED_SB) / num_features, RESC_NUM(p_hwfn, QED_L2_QUEUE)); diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c index 02a8be2faed7..7856e5241b52 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c +++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c @@ -38,6 +38,7 @@ #include "qed_mcp.h" #include "qed_reg_addr.h" #include "qed_sp.h" +#include "qed_roce.h" #define QED_LL2_RX_REGISTERED(ll2) ((ll2)->rx_queue.b_cb_registred) #define QED_LL2_TX_REGISTERED(ll2) ((ll2)->tx_queue.b_cb_registred) diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.h b/drivers/net/ethernet/qlogic/qed/qed_ll2.h index 80a5dc2d652d..4e3d62a16cab 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_ll2.h +++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.h @@ -293,24 +293,4 @@ void qed_ll2_setup(struct qed_hwfn *p_hwfn, */ void qed_ll2_free(struct qed_hwfn *p_hwfn, struct qed_ll2_info *p_ll2_connections); -void qed_ll2b_complete_rx_gsi_packet(struct qed_hwfn *p_hwfn, - u8 connection_handle, - void *cookie, - dma_addr_t rx_buf_addr, - u16 data_length, - u8 data_length_error, - u16 parse_flags, - u16 vlan, - u32 src_mac_addr_hi, - u16 src_mac_addr_lo, bool b_last_packet); -void qed_ll2b_complete_tx_gsi_packet(struct qed_hwfn *p_hwfn, - u8 connection_handle, - void *cookie, - dma_addr_t first_frag_addr, - bool b_last_fragment, bool b_last_packet); -void qed_ll2b_release_tx_gsi_packet(struct qed_hwfn *p_hwfn, - u8 connection_handle, - void *cookie, - dma_addr_t first_frag_addr, - bool b_last_fragment, bool b_last_packet); #endif diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index 4ee3151e80c2..6eb2401b9e22 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -33,10 +33,8 @@ #include "qed_hw.h" #include "qed_selftest.h" -#if IS_ENABLED(CONFIG_INFINIBAND_QEDR) #define QED_ROCE_QPS (8192) #define QED_ROCE_DPIS (8) -#endif static char version[] = "QLogic FastLinQ 4xxxx Core Module qed " DRV_MODULE_VERSION "\n"; @@ -682,9 +680,7 @@ static int qed_slowpath_setup_int(struct qed_dev *cdev, enum qed_int_mode int_mode) { struct qed_sb_cnt_info sb_cnt_info; -#if IS_ENABLED(CONFIG_INFINIBAND_QEDR) - int num_l2_queues; -#endif + int num_l2_queues = 0; int rc; int i; @@ -715,8 +711,9 @@ static int qed_slowpath_setup_int(struct qed_dev *cdev, cdev->int_params.fp_msix_cnt = cdev->int_params.out.num_vectors - cdev->num_hwfns; -#if IS_ENABLED(CONFIG_INFINIBAND_QEDR) - num_l2_queues = 0; + if (!IS_ENABLED(CONFIG_QED_RDMA)) + return 0; + for_each_hwfn(cdev, i) num_l2_queues += FEAT_NUM(&cdev->hwfns[i], QED_PF_L2_QUE); @@ -738,7 +735,6 @@ static int qed_slowpath_setup_int(struct qed_dev *cdev, DP_VERBOSE(cdev, QED_MSG_RDMA, "roce_msix_cnt=%d roce_msix_base=%d\n", cdev->int_params.rdma_msix_cnt, cdev->int_params.rdma_msix_base); -#endif return 0; } @@ -843,18 +839,20 @@ static void qed_update_pf_params(struct qed_dev *cdev, { int i; -#if IS_ENABLED(CONFIG_INFINIBAND_QEDR) - params->rdma_pf_params.num_qps = QED_ROCE_QPS; - params->rdma_pf_params.min_dpis = QED_ROCE_DPIS; - /* divide by 3 the MRs to avoid MF ILT overflow */ - params->rdma_pf_params.num_mrs = RDMA_MAX_TIDS; - params->rdma_pf_params.gl_pi = QED_ROCE_PROTOCOL_INDEX; -#endif for (i = 0; i < cdev->num_hwfns; i++) { struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; p_hwfn->pf_params = *params; } + + if (!IS_ENABLED(CONFIG_QED_RDMA)) + return; + + params->rdma_pf_params.num_qps = QED_ROCE_QPS; + params->rdma_pf_params.min_dpis = QED_ROCE_DPIS; + /* divide by 3 the MRs to avoid MF ILT overflow */ + params->rdma_pf_params.num_mrs = RDMA_MAX_TIDS; + params->rdma_pf_params.gl_pi = QED_ROCE_PROTOCOL_INDEX; } static int qed_slowpath_start(struct qed_dev *cdev, diff --git a/drivers/net/ethernet/qlogic/qed/qed_roce.c b/drivers/net/ethernet/qlogic/qed/qed_roce.c index c73638ceb1ad..187df38542f7 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_roce.c +++ b/drivers/net/ethernet/qlogic/qed/qed_roce.c @@ -129,17 +129,12 @@ static void qed_bmap_release_id(struct qed_hwfn *p_hwfn, } } -u32 qed_rdma_get_sb_id(void *p_hwfn, u32 rel_sb_id) +static u32 qed_rdma_get_sb_id(void *p_hwfn, u32 rel_sb_id) { /* First sb id for RoCE is after all the l2 sb */ return FEAT_NUM((struct qed_hwfn *)p_hwfn, QED_PF_L2_QUE) + rel_sb_id; } -u32 qed_rdma_query_cau_timer_res(void *rdma_cxt) -{ - return QED_CAU_DEF_RX_TIMER_RES; -} - static int qed_rdma_alloc(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, struct qed_rdma_start_in_params *params) @@ -275,7 +270,7 @@ free_rdma_info: return rc; } -void qed_rdma_resc_free(struct qed_hwfn *p_hwfn) +static void qed_rdma_resc_free(struct qed_hwfn *p_hwfn) { struct qed_rdma_info *p_rdma_info = p_hwfn->p_rdma_info; @@ -527,6 +522,26 @@ static int qed_rdma_start_fw(struct qed_hwfn *p_hwfn, return qed_spq_post(p_hwfn, p_ent, NULL); } +static int qed_rdma_alloc_tid(void *rdma_cxt, u32 *itid) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + int rc; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Allocate TID\n"); + + spin_lock_bh(&p_hwfn->p_rdma_info->lock); + rc = qed_rdma_bmap_alloc_id(p_hwfn, + &p_hwfn->p_rdma_info->tid_map, itid); + spin_unlock_bh(&p_hwfn->p_rdma_info->lock); + if (rc) + goto out; + + rc = qed_cxt_dynamic_ilt_alloc(p_hwfn, QED_ELEM_TASK, *itid); +out: + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Allocate TID - done, rc = %d\n", rc); + return rc; +} + static int qed_rdma_reserve_lkey(struct qed_hwfn *p_hwfn) { struct qed_rdma_device *dev = p_hwfn->p_rdma_info->dev; @@ -573,7 +588,7 @@ static int qed_rdma_setup(struct qed_hwfn *p_hwfn, return qed_rdma_start_fw(p_hwfn, params, p_ptt); } -int qed_rdma_stop(void *rdma_cxt) +static int qed_rdma_stop(void *rdma_cxt) { struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; struct rdma_close_func_ramrod_data *p_ramrod; @@ -629,8 +644,8 @@ out: return rc; } -int qed_rdma_add_user(void *rdma_cxt, - struct qed_rdma_add_user_out_params *out_params) +static int qed_rdma_add_user(void *rdma_cxt, + struct qed_rdma_add_user_out_params *out_params) { struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; u32 dpi_start_offset; @@ -664,7 +679,7 @@ int qed_rdma_add_user(void *rdma_cxt, return rc; } -struct qed_rdma_port *qed_rdma_query_port(void *rdma_cxt) +static struct qed_rdma_port *qed_rdma_query_port(void *rdma_cxt) { struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; struct qed_rdma_port *p_port = p_hwfn->p_rdma_info->port; @@ -680,7 +695,7 @@ struct qed_rdma_port *qed_rdma_query_port(void *rdma_cxt) return p_port; } -struct qed_rdma_device *qed_rdma_query_device(void *rdma_cxt) +static struct qed_rdma_device *qed_rdma_query_device(void *rdma_cxt) { struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; @@ -690,7 +705,7 @@ struct qed_rdma_device *qed_rdma_query_device(void *rdma_cxt) return p_hwfn->p_rdma_info->dev; } -void qed_rdma_free_tid(void *rdma_cxt, u32 itid) +static void qed_rdma_free_tid(void *rdma_cxt, u32 itid) { struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; @@ -701,27 +716,7 @@ void qed_rdma_free_tid(void *rdma_cxt, u32 itid) spin_unlock_bh(&p_hwfn->p_rdma_info->lock); } -int qed_rdma_alloc_tid(void *rdma_cxt, u32 *itid) -{ - struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; - int rc; - - DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Allocate TID\n"); - - spin_lock_bh(&p_hwfn->p_rdma_info->lock); - rc = qed_rdma_bmap_alloc_id(p_hwfn, - &p_hwfn->p_rdma_info->tid_map, itid); - spin_unlock_bh(&p_hwfn->p_rdma_info->lock); - if (rc) - goto out; - - rc = qed_cxt_dynamic_ilt_alloc(p_hwfn, QED_ELEM_TASK, *itid); -out: - DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Allocate TID - done, rc = %d\n", rc); - return rc; -} - -void qed_rdma_cnq_prod_update(void *rdma_cxt, u8 qz_offset, u16 prod) +static void qed_rdma_cnq_prod_update(void *rdma_cxt, u8 qz_offset, u16 prod) { struct qed_hwfn *p_hwfn; u16 qz_num; @@ -816,7 +811,7 @@ static int qed_rdma_get_int(struct qed_dev *cdev, struct qed_int_info *info) return 0; } -int qed_rdma_alloc_pd(void *rdma_cxt, u16 *pd) +static int qed_rdma_alloc_pd(void *rdma_cxt, u16 *pd) { struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; u32 returned_id; @@ -1985,9 +1980,9 @@ int qed_roce_destroy_qp(struct qed_hwfn *p_hwfn, struct qed_rdma_qp *qp) return 0; } -int qed_rdma_query_qp(void *rdma_cxt, - struct qed_rdma_qp *qp, - struct qed_rdma_query_qp_out_params *out_params) +static int qed_rdma_query_qp(void *rdma_cxt, + struct qed_rdma_qp *qp, + struct qed_rdma_query_qp_out_params *out_params) { struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; int rc; @@ -2022,7 +2017,7 @@ int qed_rdma_query_qp(void *rdma_cxt, return rc; } -int qed_rdma_destroy_qp(void *rdma_cxt, struct qed_rdma_qp *qp) +static int qed_rdma_destroy_qp(void *rdma_cxt, struct qed_rdma_qp *qp) { struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; int rc = 0; @@ -2215,9 +2210,9 @@ static int qed_roce_modify_qp(struct qed_hwfn *p_hwfn, return rc; } -int qed_rdma_modify_qp(void *rdma_cxt, - struct qed_rdma_qp *qp, - struct qed_rdma_modify_qp_in_params *params) +static int qed_rdma_modify_qp(void *rdma_cxt, + struct qed_rdma_qp *qp, + struct qed_rdma_modify_qp_in_params *params) { struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; enum qed_roce_qp_state prev_state; @@ -2312,8 +2307,9 @@ int qed_rdma_modify_qp(void *rdma_cxt, return rc; } -int qed_rdma_register_tid(void *rdma_cxt, - struct qed_rdma_register_tid_in_params *params) +static int +qed_rdma_register_tid(void *rdma_cxt, + struct qed_rdma_register_tid_in_params *params) { struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; struct rdma_register_tid_ramrod_data *p_ramrod; @@ -2450,7 +2446,7 @@ int qed_rdma_register_tid(void *rdma_cxt, return rc; } -int qed_rdma_deregister_tid(void *rdma_cxt, u32 itid) +static int qed_rdma_deregister_tid(void *rdma_cxt, u32 itid) { struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; struct rdma_deregister_tid_ramrod_data *p_ramrod; @@ -2561,7 +2557,8 @@ void qed_rdma_dpm_bar(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) qed_rdma_dpm_conf(p_hwfn, p_ptt); } -int qed_rdma_start(void *rdma_cxt, struct qed_rdma_start_in_params *params) +static int qed_rdma_start(void *rdma_cxt, + struct qed_rdma_start_in_params *params) { struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; struct qed_ptt *p_ptt; @@ -2601,7 +2598,7 @@ static int qed_rdma_init(struct qed_dev *cdev, return qed_rdma_start(QED_LEADING_HWFN(cdev), params); } -void qed_rdma_remove_user(void *rdma_cxt, u16 dpi) +static void qed_rdma_remove_user(void *rdma_cxt, u16 dpi) { struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; diff --git a/drivers/net/ethernet/qlogic/qed/qed_roce.h b/drivers/net/ethernet/qlogic/qed/qed_roce.h index 2f091e8a0f40..691413176734 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_roce.h +++ b/drivers/net/ethernet/qlogic/qed/qed_roce.h @@ -181,36 +181,55 @@ struct qed_rdma_qp { dma_addr_t shared_queue_phys_addr; }; -int -qed_rdma_add_user(void *rdma_cxt, - struct qed_rdma_add_user_out_params *out_params); -int qed_rdma_alloc_pd(void *rdma_cxt, u16 *pd); -int qed_rdma_alloc_tid(void *rdma_cxt, u32 *tid); -int qed_rdma_deregister_tid(void *rdma_cxt, u32 tid); -void qed_rdma_free_tid(void *rdma_cxt, u32 tid); -struct qed_rdma_device *qed_rdma_query_device(void *rdma_cxt); -struct qed_rdma_port *qed_rdma_query_port(void *rdma_cxt); -int -qed_rdma_register_tid(void *rdma_cxt, - struct qed_rdma_register_tid_in_params *params); -void qed_rdma_remove_user(void *rdma_cxt, u16 dpi); -int qed_rdma_start(void *p_hwfn, struct qed_rdma_start_in_params *params); -int qed_rdma_stop(void *rdma_cxt); -u32 qed_rdma_get_sb_id(void *p_hwfn, u32 rel_sb_id); -u32 qed_rdma_query_cau_timer_res(void *p_hwfn); -void qed_rdma_cnq_prod_update(void *rdma_cxt, u8 cnq_index, u16 prod); -void qed_rdma_resc_free(struct qed_hwfn *p_hwfn); +#if IS_ENABLED(CONFIG_QED_RDMA) +void qed_rdma_dpm_bar(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt); void qed_async_roce_event(struct qed_hwfn *p_hwfn, struct event_ring_entry *p_eqe); -int qed_rdma_destroy_qp(void *rdma_cxt, struct qed_rdma_qp *qp); -int qed_rdma_modify_qp(void *rdma_cxt, struct qed_rdma_qp *qp, - struct qed_rdma_modify_qp_in_params *params); -int qed_rdma_query_qp(void *rdma_cxt, struct qed_rdma_qp *qp, - struct qed_rdma_query_qp_out_params *out_params); - -#if IS_ENABLED(CONFIG_INFINIBAND_QEDR) -void qed_rdma_dpm_bar(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt); +void qed_ll2b_complete_tx_gsi_packet(struct qed_hwfn *p_hwfn, + u8 connection_handle, + void *cookie, + dma_addr_t first_frag_addr, + bool b_last_fragment, bool b_last_packet); +void qed_ll2b_release_tx_gsi_packet(struct qed_hwfn *p_hwfn, + u8 connection_handle, + void *cookie, + dma_addr_t first_frag_addr, + bool b_last_fragment, bool b_last_packet); +void qed_ll2b_complete_rx_gsi_packet(struct qed_hwfn *p_hwfn, + u8 connection_handle, + void *cookie, + dma_addr_t rx_buf_addr, + u16 data_length, + u8 data_length_error, + u16 parse_flags, + u16 vlan, + u32 src_mac_addr_hi, + u16 src_mac_addr_lo, bool b_last_packet); #else -void qed_rdma_dpm_bar(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) {} +static inline void qed_rdma_dpm_bar(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) {} +static inline void qed_async_roce_event(struct qed_hwfn *p_hwfn, struct event_ring_entry *p_eqe) {} +static inline void qed_ll2b_complete_tx_gsi_packet(struct qed_hwfn *p_hwfn, + u8 connection_handle, + void *cookie, + dma_addr_t first_frag_addr, + bool b_last_fragment, + bool b_last_packet) {} +static inline void qed_ll2b_release_tx_gsi_packet(struct qed_hwfn *p_hwfn, + u8 connection_handle, + void *cookie, + dma_addr_t first_frag_addr, + bool b_last_fragment, + bool b_last_packet) {} +static inline void qed_ll2b_complete_rx_gsi_packet(struct qed_hwfn *p_hwfn, + u8 connection_handle, + void *cookie, + dma_addr_t rx_buf_addr, + u16 data_length, + u8 data_length_error, + u16 parse_flags, + u16 vlan, + u32 src_mac_addr_hi, + u16 src_mac_addr_lo, + bool b_last_packet) {} #endif #endif diff --git a/drivers/net/ethernet/qlogic/qed/qed_spq.c b/drivers/net/ethernet/qlogic/qed/qed_spq.c index caff41544898..9fbaf9429fd0 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_spq.c +++ b/drivers/net/ethernet/qlogic/qed/qed_spq.c @@ -28,9 +28,7 @@ #include "qed_reg_addr.h" #include "qed_sp.h" #include "qed_sriov.h" -#if IS_ENABLED(CONFIG_INFINIBAND_QEDR) #include "qed_roce.h" -#endif /*************************************************************************** * Structures & Definitions @@ -240,11 +238,9 @@ qed_async_event_completion(struct qed_hwfn *p_hwfn, struct event_ring_entry *p_eqe) { switch (p_eqe->protocol_id) { -#if IS_ENABLED(CONFIG_INFINIBAND_QEDR) case PROTOCOLID_ROCE: qed_async_roce_event(p_hwfn, p_eqe); return 0; -#endif case PROTOCOLID_COMMON: return qed_sriov_eqe_event(p_hwfn, p_eqe->opcode, diff --git a/drivers/net/ethernet/qlogic/qede/Makefile b/drivers/net/ethernet/qlogic/qede/Makefile index 28dc58919c85..048a230c3ce0 100644 --- a/drivers/net/ethernet/qlogic/qede/Makefile +++ b/drivers/net/ethernet/qlogic/qede/Makefile @@ -2,4 +2,4 @@ obj-$(CONFIG_QEDE) := qede.o qede-y := qede_main.o qede_ethtool.o qede-$(CONFIG_DCB) += qede_dcbnl.o -qede-$(CONFIG_INFINIBAND_QEDR) += qede_roce.o +qede-$(CONFIG_QED_RDMA) += qede_roce.o diff --git a/include/linux/qed/qede_roce.h b/include/linux/qed/qede_roce.h index 99fbe6d55acb..f48d64b0e2fb 100644 --- a/include/linux/qed/qede_roce.h +++ b/include/linux/qed/qede_roce.h @@ -68,7 +68,7 @@ void qede_roce_unregister_driver(struct qedr_driver *drv); bool qede_roce_supported(struct qede_dev *dev); -#if IS_ENABLED(CONFIG_INFINIBAND_QEDR) +#if IS_ENABLED(CONFIG_QED_RDMA) int qede_roce_dev_add(struct qede_dev *dev); void qede_roce_dev_event_open(struct qede_dev *dev); void qede_roce_dev_event_close(struct qede_dev *dev); -- cgit From 9f7d416c36124667c406978bcb39746589c35d7f Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Fri, 14 Oct 2016 16:07:23 +0200 Subject: kprobes: Unpoison stack in jprobe_return() for KASAN I observed false KSAN positives in the sctp code, when sctp uses jprobe_return() in jsctp_sf_eat_sack(). The stray 0xf4 in shadow memory are stack redzones: [ ] ================================================================== [ ] BUG: KASAN: stack-out-of-bounds in memcmp+0xe9/0x150 at addr ffff88005e48f480 [ ] Read of size 1 by task syz-executor/18535 [ ] page:ffffea00017923c0 count:0 mapcount:0 mapping: (null) index:0x0 [ ] flags: 0x1fffc0000000000() [ ] page dumped because: kasan: bad access detected [ ] CPU: 1 PID: 18535 Comm: syz-executor Not tainted 4.8.0+ #28 [ ] Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 [ ] ffff88005e48f2d0 ffffffff82d2b849 ffffffff0bc91e90 fffffbfff10971e8 [ ] ffffed000bc91e90 ffffed000bc91e90 0000000000000001 0000000000000000 [ ] ffff88005e48f480 ffff88005e48f350 ffffffff817d3169 ffff88005e48f370 [ ] Call Trace: [ ] [] dump_stack+0x12e/0x185 [ ] [] kasan_report+0x489/0x4b0 [ ] [] __asan_report_load1_noabort+0x19/0x20 [ ] [] memcmp+0xe9/0x150 [ ] [] depot_save_stack+0x176/0x5c0 [ ] [] save_stack+0xb1/0xd0 [ ] [] kasan_slab_free+0x72/0xc0 [ ] [] kfree+0xc8/0x2a0 [ ] [] skb_free_head+0x79/0xb0 [ ] [] skb_release_data+0x37a/0x420 [ ] [] skb_release_all+0x4f/0x60 [ ] [] consume_skb+0x138/0x370 [ ] [] sctp_chunk_put+0xcb/0x180 [ ] [] sctp_chunk_free+0x58/0x70 [ ] [] sctp_inq_pop+0x68f/0xef0 [ ] [] sctp_assoc_bh_rcv+0xd6/0x4b0 [ ] [] sctp_inq_push+0x131/0x190 [ ] [] sctp_backlog_rcv+0xe9/0xa20 [ ... ] [ ] Memory state around the buggy address: [ ] ffff88005e48f380: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ ] ffff88005e48f400: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ ] >ffff88005e48f480: f4 f4 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ ] ^ [ ] ffff88005e48f500: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ ] ffff88005e48f580: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ ] ================================================================== KASAN stack instrumentation poisons stack redzones on function entry and unpoisons them on function exit. If a function exits abnormally (e.g. with a longjmp like jprobe_return()), stack redzones are left poisoned. Later this leads to random KASAN false reports. Unpoison stack redzones in the frames we are going to jump over before doing actual longjmp in jprobe_return(). Signed-off-by: Dmitry Vyukov Acked-by: Masami Hiramatsu Reviewed-by: Mark Rutland Cc: Mark Rutland Cc: Catalin Marinas Cc: Andrey Ryabinin Cc: Lorenzo Pieralisi Cc: Alexander Potapenko Cc: Will Deacon Cc: Andrew Morton Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: "David S. Miller" Cc: Masami Hiramatsu Cc: kasan-dev@googlegroups.com Cc: surovegin@google.com Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/1476454043-101898-1-git-send-email-dvyukov@google.com Signed-off-by: Ingo Molnar --- arch/arm64/kernel/sleep.S | 2 +- arch/x86/kernel/kprobes/core.c | 4 ++++ include/linux/kasan.h | 2 ++ mm/kasan/kasan.c | 22 +++++++++++++++++++--- 4 files changed, 26 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index b8799e7c79de..1bec41b5fda3 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -135,7 +135,7 @@ ENTRY(_cpu_resume) #ifdef CONFIG_KASAN mov x0, sp - bl kasan_unpoison_remaining_stack + bl kasan_unpoison_task_stack_below #endif ldp x19, x20, [x29, #16] diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index f423b0ef23a7..d9d8d16b69db 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include @@ -1081,6 +1082,9 @@ void jprobe_return(void) { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + /* Unpoison stack redzones in the frames we are going to jump over. */ + kasan_unpoison_stack_above_sp_to(kcb->jprobe_saved_sp); + asm volatile ( #ifdef CONFIG_X86_64 " xchg %%rbx,%%rsp \n" diff --git a/include/linux/kasan.h b/include/linux/kasan.h index d600303306eb..820c0ad54a01 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -44,6 +44,7 @@ static inline void kasan_disable_current(void) void kasan_unpoison_shadow(const void *address, size_t size); void kasan_unpoison_task_stack(struct task_struct *task); +void kasan_unpoison_stack_above_sp_to(const void *watermark); void kasan_alloc_pages(struct page *page, unsigned int order); void kasan_free_pages(struct page *page, unsigned int order); @@ -85,6 +86,7 @@ size_t kasan_metadata_size(struct kmem_cache *cache); static inline void kasan_unpoison_shadow(const void *address, size_t size) {} static inline void kasan_unpoison_task_stack(struct task_struct *task) {} +static inline void kasan_unpoison_stack_above_sp_to(const void *watermark) {} static inline void kasan_enable_current(void) {} static inline void kasan_disable_current(void) {} diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 88af13c00d3c..70c009741aab 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -34,6 +34,7 @@ #include #include #include +#include #include "kasan.h" #include "../slab.h" @@ -62,7 +63,7 @@ void kasan_unpoison_shadow(const void *address, size_t size) } } -static void __kasan_unpoison_stack(struct task_struct *task, void *sp) +static void __kasan_unpoison_stack(struct task_struct *task, const void *sp) { void *base = task_stack_page(task); size_t size = sp - base; @@ -77,9 +78,24 @@ void kasan_unpoison_task_stack(struct task_struct *task) } /* Unpoison the stack for the current task beyond a watermark sp value. */ -asmlinkage void kasan_unpoison_remaining_stack(void *sp) +asmlinkage void kasan_unpoison_task_stack_below(const void *watermark) { - __kasan_unpoison_stack(current, sp); + __kasan_unpoison_stack(current, watermark); +} + +/* + * Clear all poison for the region between the current SP and a provided + * watermark value, as is sometimes required prior to hand-crafted asm function + * returns in the middle of functions. + */ +void kasan_unpoison_stack_above_sp_to(const void *watermark) +{ + const void *sp = __builtin_frame_address(0); + size_t size = watermark - sp; + + if (WARN_ON(sp > watermark)) + return; + kasan_unpoison_shadow(sp, size); } /* -- cgit From a04a480d4392ea6efd117be2de564117b2a009c0 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sun, 16 Oct 2016 20:02:52 -0700 Subject: net: Require exact match for TCP socket lookups if dif is l3mdev Currently, socket lookups for l3mdev (vrf) use cases can match a socket that is bound to a port but not a device (ie., a global socket). If the sysctl tcp_l3mdev_accept is not set this leads to ack packets going out based on the main table even though the packet came in from an L3 domain. The end result is that the connection does not establish creating confusion for users since the service is running and a socket shows in ss output. Fix by requiring an exact dif to sk_bound_dev_if match if the skb came through an interface enslaved to an l3mdev device and the tcp_l3mdev_accept is not set. skb's through an l3mdev interface are marked by setting a flag in inet{6}_skb_parm. The IPv6 variant is already set; this patch adds the flag for IPv4. Using an skb flag avoids a device lookup on the dif. The flag is set in the VRF driver using the IP{6}CB macros. For IPv4, the inet_skb_parm struct is moved in the cb per commit 971f10eca186, so the match function in the TCP stack needs to use TCP_SKB_CB. For IPv6, the move is done after the socket lookup, so IP6CB is used. The flags field in inet_skb_parm struct needs to be increased to add another flag. There is currently a 1-byte hole following the flags, so it can be expanded to u16 without increasing the size of the struct. Fixes: 193125dbd8eb ("net: Introduce VRF device driver") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/vrf.c | 2 ++ include/linux/ipv6.h | 17 ++++++++++++++--- include/net/ip.h | 8 +++++++- include/net/tcp.h | 13 ++++++++++++- net/ipv4/inet_hashtables.c | 8 +++++--- net/ipv6/inet6_hashtables.c | 7 ++++--- 6 files changed, 44 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 85c271c70d42..820de6a9ddde 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -956,6 +956,7 @@ static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, if (skb->pkt_type == PACKET_LOOPBACK) { skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; + IP6CB(skb)->flags |= IP6SKB_L3SLAVE; skb->pkt_type = PACKET_HOST; goto out; } @@ -996,6 +997,7 @@ static struct sk_buff *vrf_ip_rcv(struct net_device *vrf_dev, { skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; + IPCB(skb)->flags |= IPSKB_L3SLAVE; /* loopback traffic; do not push through packet taps again. * Reset pkt_type for upper layers to process skb diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 7e9a789be5e0..ca1ad9ebbc92 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -123,12 +123,12 @@ struct inet6_skb_parm { }; #if defined(CONFIG_NET_L3_MASTER_DEV) -static inline bool skb_l3mdev_slave(__u16 flags) +static inline bool ipv6_l3mdev_skb(__u16 flags) { return flags & IP6SKB_L3SLAVE; } #else -static inline bool skb_l3mdev_slave(__u16 flags) +static inline bool ipv6_l3mdev_skb(__u16 flags) { return false; } @@ -139,11 +139,22 @@ static inline bool skb_l3mdev_slave(__u16 flags) static inline int inet6_iif(const struct sk_buff *skb) { - bool l3_slave = skb_l3mdev_slave(IP6CB(skb)->flags); + bool l3_slave = ipv6_l3mdev_skb(IP6CB(skb)->flags); return l3_slave ? skb->skb_iif : IP6CB(skb)->iif; } +/* can not be used in TCP layer after tcp_v6_fill_cb */ +static inline bool inet6_exact_dif_match(struct net *net, struct sk_buff *skb) +{ +#if defined(CONFIG_NET_L3_MASTER_DEV) + if (!net->ipv4.sysctl_tcp_l3mdev_accept && + ipv6_l3mdev_skb(IP6CB(skb)->flags)) + return true; +#endif + return false; +} + struct tcp6_request_sock { struct tcp_request_sock tcp6rsk_tcp; }; diff --git a/include/net/ip.h b/include/net/ip.h index bc43c0fcae12..c9d07988911e 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -38,7 +38,7 @@ struct sock; struct inet_skb_parm { int iif; struct ip_options opt; /* Compiled IP options */ - unsigned char flags; + u16 flags; #define IPSKB_FORWARDED BIT(0) #define IPSKB_XFRM_TUNNEL_SIZE BIT(1) @@ -48,10 +48,16 @@ struct inet_skb_parm { #define IPSKB_DOREDIRECT BIT(5) #define IPSKB_FRAG_PMTU BIT(6) #define IPSKB_FRAG_SEGS BIT(7) +#define IPSKB_L3SLAVE BIT(8) u16 frag_max_size; }; +static inline bool ipv4_l3mdev_skb(u16 flags) +{ + return !!(flags & IPSKB_L3SLAVE); +} + static inline unsigned int ip_hdrlen(const struct sk_buff *skb) { return ip_hdr(skb)->ihl * 4; diff --git a/include/net/tcp.h b/include/net/tcp.h index f83b7f220a65..5b82d4d94834 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -794,12 +794,23 @@ struct tcp_skb_cb { */ static inline int tcp_v6_iif(const struct sk_buff *skb) { - bool l3_slave = skb_l3mdev_slave(TCP_SKB_CB(skb)->header.h6.flags); + bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags); return l3_slave ? skb->skb_iif : TCP_SKB_CB(skb)->header.h6.iif; } #endif +/* TCP_SKB_CB reference means this can not be used from early demux */ +static inline bool inet_exact_dif_match(struct net *net, struct sk_buff *skb) +{ +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + if (!net->ipv4.sysctl_tcp_l3mdev_accept && + ipv4_l3mdev_skb(TCP_SKB_CB(skb)->header.h4.flags)) + return true; +#endif + return false; +} + /* Due to TSO, an SKB can be composed of multiple actual * packets. To keep these tracked properly, we use this. */ diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 77c20a489218..ca97835bfec4 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -25,6 +25,7 @@ #include #include #include +#include #include static u32 inet_ehashfn(const struct net *net, const __be32 laddr, @@ -172,7 +173,7 @@ EXPORT_SYMBOL_GPL(__inet_inherit_port); static inline int compute_score(struct sock *sk, struct net *net, const unsigned short hnum, const __be32 daddr, - const int dif) + const int dif, bool exact_dif) { int score = -1; struct inet_sock *inet = inet_sk(sk); @@ -186,7 +187,7 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; score += 4; } - if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if || exact_dif) { if (sk->sk_bound_dev_if != dif) return -1; score += 4; @@ -215,11 +216,12 @@ struct sock *__inet_lookup_listener(struct net *net, unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; int score, hiscore = 0, matches = 0, reuseport = 0; + bool exact_dif = inet_exact_dif_match(net, skb); struct sock *sk, *result = NULL; u32 phash = 0; sk_for_each_rcu(sk, &ilb->head) { - score = compute_score(sk, net, hnum, daddr, dif); + score = compute_score(sk, net, hnum, daddr, dif, exact_dif); if (score > hiscore) { reuseport = sk->sk_reuseport; if (reuseport) { diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 00cf28ad4565..2fd0374a35b1 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -96,7 +96,7 @@ EXPORT_SYMBOL(__inet6_lookup_established); static inline int compute_score(struct sock *sk, struct net *net, const unsigned short hnum, const struct in6_addr *daddr, - const int dif) + const int dif, bool exact_dif) { int score = -1; @@ -109,7 +109,7 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; score++; } - if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if || exact_dif) { if (sk->sk_bound_dev_if != dif) return -1; score++; @@ -131,11 +131,12 @@ struct sock *inet6_lookup_listener(struct net *net, unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; int score, hiscore = 0, matches = 0, reuseport = 0; + bool exact_dif = inet6_exact_dif_match(net, skb); struct sock *sk, *result = NULL; u32 phash = 0; sk_for_each(sk, &ilb->head) { - score = compute_score(sk, net, hnum, daddr, dif); + score = compute_score(sk, net, hnum, daddr, dif, exact_dif); if (score > hiscore) { reuseport = sk->sk_reuseport; if (reuseport) { -- cgit From 9224eb77e63f70f16c0b6b7a20ca7d395f3bc077 Mon Sep 17 00:00:00 2001 From: Vladimir Murzin Date: Mon, 17 Oct 2016 16:00:46 +0100 Subject: irqchip/gic-v3-its: Fix entry size mask for GITS_BASER Entry Size in GITS_BASER occupies 5 bits [52:48], but we mask out 8 bits. Fixes: cc2d3216f53c ("irqchip: GICv3: ITS command queue") Cc: stable@vger.kernel.org Signed-off-by: Vladimir Murzin Signed-off-by: Marc Zyngier --- include/linux/irqchip/arm-gic-v3.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 8361c8d3edd1..b7e34313cdfe 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -290,7 +290,7 @@ #define GITS_BASER_TYPE_SHIFT (56) #define GITS_BASER_TYPE(r) (((r) >> GITS_BASER_TYPE_SHIFT) & 7) #define GITS_BASER_ENTRY_SIZE_SHIFT (48) -#define GITS_BASER_ENTRY_SIZE(r) ((((r) >> GITS_BASER_ENTRY_SIZE_SHIFT) & 0xff) + 1) +#define GITS_BASER_ENTRY_SIZE(r) ((((r) >> GITS_BASER_ENTRY_SIZE_SHIFT) & 0x1f) + 1) #define GITS_BASER_SHAREABILITY_SHIFT (10) #define GITS_BASER_InnerShareable \ GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable) -- cgit From 71757904efadefdf5505712f675218ce59483c5d Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 17 Oct 2016 08:18:15 -0700 Subject: generic syscalls: kill cruft from removed pkey syscalls pkey_set() and pkey_get() were syscalls present in older versions of the protection keys patches. They were fully excised from the x86 code, but some cruft was left in the generic syscall code. The C++ comments were intended to help to make it more glaring to me to fix them before actually submitting them. That technique worked, but later than I would have liked. I test-compiled this for arm64. Fixes: a60f7b69d92c0 ("generic syscalls: Wire up memory protection keys syscalls") Signed-off-by: Dave Hansen Acked-by: Arnd Bergmann Cc: Thomas Gleixner Cc: x86@kernel.org Cc: linux-arch@vger.kernel.org Cc: mgorman@techsingularity.net Cc: linux-api@vger.kernel.org Cc: linux-mm@kvack.org Cc: luto@kernel.org Cc: akpm@linux-foundation.org Signed-off-by: Linus Torvalds --- include/linux/syscalls.h | 3 --- include/uapi/asm-generic/unistd.h | 4 ---- 2 files changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 0d7abb8b7315..91a740f6b884 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -902,8 +902,5 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len, unsigned long prot, int pkey); asmlinkage long sys_pkey_alloc(unsigned long flags, unsigned long init_val); asmlinkage long sys_pkey_free(int pkey); -//asmlinkage long sys_pkey_get(int pkey, unsigned long flags); -//asmlinkage long sys_pkey_set(int pkey, unsigned long access_rights, -// unsigned long flags); #endif diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index dbfee7e86ba6..9b1462e38b82 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -730,10 +730,6 @@ __SYSCALL(__NR_pkey_mprotect, sys_pkey_mprotect) __SYSCALL(__NR_pkey_alloc, sys_pkey_alloc) #define __NR_pkey_free 290 __SYSCALL(__NR_pkey_free, sys_pkey_free) -#define __NR_pkey_get 291 -//__SYSCALL(__NR_pkey_get, sys_pkey_get) -#define __NR_pkey_set 292 -//__SYSCALL(__NR_pkey_set, sys_pkey_set) #undef __NR_syscalls #define __NR_syscalls 291 -- cgit From 339e1e54891c339b30023c9cc8a005cbf65a3c0c Mon Sep 17 00:00:00 2001 From: Shawn Guo Date: Sat, 8 Oct 2016 16:59:38 +0800 Subject: clk: core: add __init decoration for CLK_OF_DECLARE_DRIVER function The new introduced macro CLK_OF_DECLARE_DRIVER is usually used to declare clock driver init functions, which are mostly decorated with __init. Add __init decoration for CLK_OF_DECLARE_DRIVER function to avoid causing section mismatch warnings on client clock drivers. Signed-off-by: Shawn Guo Fixes: c7296c51ce5d ("clk: core: New macro CLK_OF_DECLARE_DRIVER") Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index af596381fa0f..a428aec36ace 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -785,7 +785,7 @@ extern struct of_device_id __clk_of_table; * routines, one at of_clk_init(), and one at platform device probe */ #define CLK_OF_DECLARE_DRIVER(name, compat, fn) \ - static void name##_of_clk_init_driver(struct device_node *np) \ + static void __init name##_of_clk_init_driver(struct device_node *np) \ { \ of_node_clear_flag(np, OF_POPULATED); \ fn(np); \ -- cgit From 19be0eaffa3ac7d8eb6784ad9bdbc7d67ed8e619 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 13 Oct 2016 13:07:36 -0700 Subject: mm: remove gup_flags FOLL_WRITE games from __get_user_pages() This is an ancient bug that was actually attempted to be fixed once (badly) by me eleven years ago in commit 4ceb5db9757a ("Fix get_user_pages() race for write access") but that was then undone due to problems on s390 by commit f33ea7f404e5 ("fix get_user_pages bug"). In the meantime, the s390 situation has long been fixed, and we can now fix it by checking the pte_dirty() bit properly (and do it better). The s390 dirty bit was implemented in abf09bed3cce ("s390/mm: implement software dirty bits") which made it into v3.9. Earlier kernels will have to look at the page state itself. Also, the VM has become more scalable, and what used a purely theoretical race back then has become easier to trigger. To fix it, we introduce a new internal FOLL_COW flag to mark the "yes, we already did a COW" rather than play racy games with FOLL_WRITE that is very fundamental, and then use the pte dirty flag to validate that the FOLL_COW flag is still valid. Reported-and-tested-by: Phil "not Paul" Oester Acked-by: Hugh Dickins Reviewed-by: Michal Hocko Cc: Andy Lutomirski Cc: Kees Cook Cc: Oleg Nesterov Cc: Willy Tarreau Cc: Nick Piggin Cc: Greg Thelen Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + mm/gup.c | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index e9caec6a51e9..ed85879f47f5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2232,6 +2232,7 @@ static inline struct page *follow_page(struct vm_area_struct *vma, #define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */ #define FOLL_MLOCK 0x1000 /* lock present pages */ #define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ +#define FOLL_COW 0x4000 /* internal GUP flag */ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, void *data); diff --git a/mm/gup.c b/mm/gup.c index 96b2b2fd0fbd..22cc22e7432f 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -60,6 +60,16 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, return -EEXIST; } +/* + * FOLL_FORCE can write to even unwritable pte's, but only + * after we've gone through a COW cycle and they are dirty. + */ +static inline bool can_follow_write_pte(pte_t pte, unsigned int flags) +{ + return pte_write(pte) || + ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte)); +} + static struct page *follow_page_pte(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags) { @@ -95,7 +105,7 @@ retry: } if ((flags & FOLL_NUMA) && pte_protnone(pte)) goto no_page; - if ((flags & FOLL_WRITE) && !pte_write(pte)) { + if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) { pte_unmap_unlock(ptep, ptl); return NULL; } @@ -412,7 +422,7 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, * reCOWed by userspace write). */ if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE)) - *flags &= ~FOLL_WRITE; + *flags |= FOLL_COW; return 0; } -- cgit From d4944b0ecec0af882483fe44b66729316e575208 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 13 Oct 2016 01:20:12 +0100 Subject: mm: remove write/force parameters from __get_user_pages_unlocked() This removes the redundant 'write' and 'force' parameters from __get_user_pages_unlocked() to make the use of FOLL_FORCE explicit in callers as use of this flag can result in surprising behaviour (and hence bugs) within the mm subsystem. Signed-off-by: Lorenzo Stoakes Acked-by: Paolo Bonzini Reviewed-by: Jan Kara Acked-by: Michal Hocko Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 +-- mm/gup.c | 17 +++++++++-------- mm/nommu.c | 12 +++++++++--- mm/process_vm_access.c | 7 +++++-- virt/kvm/async_pf.c | 3 ++- virt/kvm/kvm_main.c | 11 ++++++++--- 6 files changed, 34 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index ed85879f47f5..bcdea1f4e98c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1285,8 +1285,7 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages, int *locked); long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, - unsigned int gup_flags); + struct page **pages, unsigned int gup_flags); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages); int get_user_pages_fast(unsigned long start, int nr_pages, int write, diff --git a/mm/gup.c b/mm/gup.c index 720eb9385204..e997f545b059 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -875,17 +875,11 @@ EXPORT_SYMBOL(get_user_pages_locked); */ __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, - unsigned int gup_flags) + struct page **pages, unsigned int gup_flags) { long ret; int locked = 1; - if (write) - gup_flags |= FOLL_WRITE; - if (force) - gup_flags |= FOLL_FORCE; - down_read(&mm->mmap_sem); ret = __get_user_pages_locked(tsk, mm, start, nr_pages, pages, NULL, &locked, false, gup_flags); @@ -915,8 +909,15 @@ EXPORT_SYMBOL(__get_user_pages_unlocked); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages) { + unsigned int flags = FOLL_TOUCH; + + if (write) + flags |= FOLL_WRITE; + if (force) + flags |= FOLL_FORCE; + return __get_user_pages_unlocked(current, current->mm, start, nr_pages, - write, force, pages, FOLL_TOUCH); + pages, flags); } EXPORT_SYMBOL(get_user_pages_unlocked); diff --git a/mm/nommu.c b/mm/nommu.c index 95daf81a4855..925dcc1fa2f3 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -185,8 +185,7 @@ EXPORT_SYMBOL(get_user_pages_locked); long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, - unsigned int gup_flags) + struct page **pages, unsigned int gup_flags) { long ret; down_read(&mm->mmap_sem); @@ -200,8 +199,15 @@ EXPORT_SYMBOL(__get_user_pages_unlocked); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages) { + unsigned int flags = 0; + + if (write) + flags |= FOLL_WRITE; + if (force) + flags |= FOLL_FORCE; + return __get_user_pages_unlocked(current, current->mm, start, nr_pages, - write, force, pages, 0); + pages, flags); } EXPORT_SYMBOL(get_user_pages_unlocked); diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 07514d41ebcc..be8dc8d1edb9 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -88,12 +88,16 @@ static int process_vm_rw_single_vec(unsigned long addr, ssize_t rc = 0; unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES / sizeof(struct pages *); + unsigned int flags = FOLL_REMOTE; /* Work out address and page range required */ if (len == 0) return 0; nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1; + if (vm_write) + flags |= FOLL_WRITE; + while (!rc && nr_pages && iov_iter_count(iter)) { int pages = min(nr_pages, max_pages_per_loop); size_t bytes; @@ -104,8 +108,7 @@ static int process_vm_rw_single_vec(unsigned long addr, * current/current->mm */ pages = __get_user_pages_unlocked(task, mm, pa, pages, - vm_write, 0, process_pages, - FOLL_REMOTE); + process_pages, flags); if (pages <= 0) return -EFAULT; diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index db9668869f6f..8035cc1eb955 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -84,7 +84,8 @@ static void async_pf_execute(struct work_struct *work) * mm and might be done in another context, so we must * use FOLL_REMOTE. */ - __get_user_pages_unlocked(NULL, mm, addr, 1, 1, 0, NULL, FOLL_REMOTE); + __get_user_pages_unlocked(NULL, mm, addr, 1, NULL, + FOLL_WRITE | FOLL_REMOTE); kvm_async_page_present_sync(vcpu, apf); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 81dfc73d3df3..28510e72618a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1416,10 +1416,15 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, down_read(¤t->mm->mmap_sem); npages = get_user_page_nowait(addr, write_fault, page); up_read(¤t->mm->mmap_sem); - } else + } else { + unsigned int flags = FOLL_TOUCH | FOLL_HWPOISON; + + if (write_fault) + flags |= FOLL_WRITE; + npages = __get_user_pages_unlocked(current, current->mm, addr, 1, - write_fault, 0, page, - FOLL_TOUCH|FOLL_HWPOISON); + page, flags); + } if (npages != 1) return npages; -- cgit From c164154f66f0c9b02673f07aa4f044f1d9c70274 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 13 Oct 2016 01:20:13 +0100 Subject: mm: replace get_user_pages_unlocked() write/force parameters with gup_flags This removes the 'write' and 'force' use from get_user_pages_unlocked() and replaces them with 'gup_flags' to make the use of FOLL_FORCE explicit in callers as use of this flag can result in surprising behaviour (and hence bugs) within the mm subsystem. Signed-off-by: Lorenzo Stoakes Reviewed-by: Jan Kara Acked-by: Michal Hocko Signed-off-by: Linus Torvalds --- arch/mips/mm/gup.c | 2 +- arch/s390/mm/gup.c | 3 ++- arch/sh/mm/gup.c | 3 ++- arch/sparc/mm/gup.c | 3 ++- arch/x86/mm/gup.c | 2 +- drivers/media/pci/ivtv/ivtv-udma.c | 4 ++-- drivers/media/pci/ivtv/ivtv-yuv.c | 5 +++-- drivers/scsi/st.c | 5 ++--- drivers/video/fbdev/pvr2fb.c | 4 ++-- include/linux/mm.h | 2 +- mm/gup.c | 14 ++++---------- mm/nommu.c | 11 ++--------- mm/util.c | 3 ++- net/ceph/pagevec.c | 2 +- 14 files changed, 27 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/arch/mips/mm/gup.c b/arch/mips/mm/gup.c index 42d124fb6474..d8c3c159289a 100644 --- a/arch/mips/mm/gup.c +++ b/arch/mips/mm/gup.c @@ -287,7 +287,7 @@ slow_irqon: pages += nr; ret = get_user_pages_unlocked(start, (end - start) >> PAGE_SHIFT, - write, 0, pages); + pages, write ? FOLL_WRITE : 0); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c index adb0c34bf431..18d4107e10ee 100644 --- a/arch/s390/mm/gup.c +++ b/arch/s390/mm/gup.c @@ -266,7 +266,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, /* Try to get the remaining pages with get_user_pages */ start += nr << PAGE_SHIFT; pages += nr; - ret = get_user_pages_unlocked(start, nr_pages - nr, write, 0, pages); + ret = get_user_pages_unlocked(start, nr_pages - nr, pages, + write ? FOLL_WRITE : 0); /* Have to be a bit careful with return values */ if (nr > 0) ret = (ret < 0) ? nr : ret + nr; diff --git a/arch/sh/mm/gup.c b/arch/sh/mm/gup.c index 40fa6c8adc43..063c298ba56c 100644 --- a/arch/sh/mm/gup.c +++ b/arch/sh/mm/gup.c @@ -258,7 +258,8 @@ slow_irqon: pages += nr; ret = get_user_pages_unlocked(start, - (end - start) >> PAGE_SHIFT, write, 0, pages); + (end - start) >> PAGE_SHIFT, pages, + write ? FOLL_WRITE : 0); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c index 4e06750a5d29..cd0e32bbcb1d 100644 --- a/arch/sparc/mm/gup.c +++ b/arch/sparc/mm/gup.c @@ -238,7 +238,8 @@ slow: pages += nr; ret = get_user_pages_unlocked(start, - (end - start) >> PAGE_SHIFT, write, 0, pages); + (end - start) >> PAGE_SHIFT, pages, + write ? FOLL_WRITE : 0); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index b8b6a60b32cf..0d4fb3ebbbac 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -435,7 +435,7 @@ slow_irqon: ret = get_user_pages_unlocked(start, (end - start) >> PAGE_SHIFT, - write, 0, pages); + pages, write ? FOLL_WRITE : 0); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/drivers/media/pci/ivtv/ivtv-udma.c b/drivers/media/pci/ivtv/ivtv-udma.c index 4769469fe842..2c9232ef7baa 100644 --- a/drivers/media/pci/ivtv/ivtv-udma.c +++ b/drivers/media/pci/ivtv/ivtv-udma.c @@ -124,8 +124,8 @@ int ivtv_udma_setup(struct ivtv *itv, unsigned long ivtv_dest_addr, } /* Get user pages for DMA Xfer */ - err = get_user_pages_unlocked(user_dma.uaddr, user_dma.page_count, 0, - 1, dma->map); + err = get_user_pages_unlocked(user_dma.uaddr, user_dma.page_count, + dma->map, FOLL_FORCE); if (user_dma.page_count != err) { IVTV_DEBUG_WARN("failed to map user pages, returned %d instead of %d\n", diff --git a/drivers/media/pci/ivtv/ivtv-yuv.c b/drivers/media/pci/ivtv/ivtv-yuv.c index b094054cda6e..f7299d3d8244 100644 --- a/drivers/media/pci/ivtv/ivtv-yuv.c +++ b/drivers/media/pci/ivtv/ivtv-yuv.c @@ -76,11 +76,12 @@ static int ivtv_yuv_prep_user_dma(struct ivtv *itv, struct ivtv_user_dma *dma, /* Get user pages for DMA Xfer */ y_pages = get_user_pages_unlocked(y_dma.uaddr, - y_dma.page_count, 0, 1, &dma->map[0]); + y_dma.page_count, &dma->map[0], FOLL_FORCE); uv_pages = 0; /* silence gcc. value is set and consumed only if: */ if (y_pages == y_dma.page_count) { uv_pages = get_user_pages_unlocked(uv_dma.uaddr, - uv_dma.page_count, 0, 1, &dma->map[y_pages]); + uv_dma.page_count, &dma->map[y_pages], + FOLL_FORCE); } if (y_pages != y_dma.page_count || uv_pages != uv_dma.page_count) { diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 7af5226aa55b..618422ea3a41 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -4922,9 +4922,8 @@ static int sgl_map_user_pages(struct st_buffer *STbp, res = get_user_pages_unlocked( uaddr, nr_pages, - rw == READ, - 0, /* don't force */ - pages); + pages, + rw == READ ? FOLL_WRITE : 0); /* don't force */ /* Errors and no page mapped should return here */ if (res < nr_pages) diff --git a/drivers/video/fbdev/pvr2fb.c b/drivers/video/fbdev/pvr2fb.c index 3b1ca4411073..a2564ab91e62 100644 --- a/drivers/video/fbdev/pvr2fb.c +++ b/drivers/video/fbdev/pvr2fb.c @@ -686,8 +686,8 @@ static ssize_t pvr2fb_write(struct fb_info *info, const char *buf, if (!pages) return -ENOMEM; - ret = get_user_pages_unlocked((unsigned long)buf, nr_pages, WRITE, - 0, pages); + ret = get_user_pages_unlocked((unsigned long)buf, nr_pages, pages, + FOLL_WRITE); if (ret < nr_pages) { nr_pages = ret; diff --git a/include/linux/mm.h b/include/linux/mm.h index bcdea1f4e98c..abd53f2eb74e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1287,7 +1287,7 @@ long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages); + struct page **pages, unsigned int gup_flags); int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages); diff --git a/mm/gup.c b/mm/gup.c index e997f545b059..373d1ec006e4 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -907,17 +907,10 @@ EXPORT_SYMBOL(__get_user_pages_unlocked); * "force" parameter). */ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages) + struct page **pages, unsigned int gup_flags) { - unsigned int flags = FOLL_TOUCH; - - if (write) - flags |= FOLL_WRITE; - if (force) - flags |= FOLL_FORCE; - return __get_user_pages_unlocked(current, current->mm, start, nr_pages, - pages, flags); + pages, gup_flags | FOLL_TOUCH); } EXPORT_SYMBOL(get_user_pages_unlocked); @@ -1535,7 +1528,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, start += nr << PAGE_SHIFT; pages += nr; - ret = get_user_pages_unlocked(start, nr_pages - nr, write, 0, pages); + ret = get_user_pages_unlocked(start, nr_pages - nr, pages, + write ? FOLL_WRITE : 0); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/mm/nommu.c b/mm/nommu.c index 925dcc1fa2f3..7e27add39f7e 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -197,17 +197,10 @@ long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, EXPORT_SYMBOL(__get_user_pages_unlocked); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages) + struct page **pages, unsigned int gup_flags) { - unsigned int flags = 0; - - if (write) - flags |= FOLL_WRITE; - if (force) - flags |= FOLL_FORCE; - return __get_user_pages_unlocked(current, current->mm, start, nr_pages, - pages, flags); + pages, gup_flags); } EXPORT_SYMBOL(get_user_pages_unlocked); diff --git a/mm/util.c b/mm/util.c index 662cddf914af..4c685bde5ebc 100644 --- a/mm/util.c +++ b/mm/util.c @@ -283,7 +283,8 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast); int __weak get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { - return get_user_pages_unlocked(start, nr_pages, write, 0, pages); + return get_user_pages_unlocked(start, nr_pages, pages, + write ? FOLL_WRITE : 0); } EXPORT_SYMBOL_GPL(get_user_pages_fast); diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c index 00d2601407c5..1a7c9a79a53c 100644 --- a/net/ceph/pagevec.c +++ b/net/ceph/pagevec.c @@ -26,7 +26,7 @@ struct page **ceph_get_direct_page_vector(const void __user *data, while (got < num_pages) { rc = get_user_pages_unlocked( (unsigned long)data + ((unsigned long)got * PAGE_SIZE), - num_pages - got, write_page, 0, pages + got); + num_pages - got, pages + got, write_page ? FOLL_WRITE : 0); if (rc < 0) break; BUG_ON(rc == 0); -- cgit From e4961b0768852d9eb7383e1a5df178eacb714656 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 19 Oct 2016 16:57:08 +0300 Subject: net: core: Correctly iterate over lower adjacency list Tamir reported the following trace when processing ARP requests received via a vlan device on top of a VLAN-aware bridge: NMI watchdog: BUG: soft lockup - CPU#1 stuck for 22s! [swapper/1:0] [...] CPU: 1 PID: 0 Comm: swapper/1 Tainted: G W 4.8.0-rc7 #1 Hardware name: Mellanox Technologies Ltd. "MSN2100-CB2F"/"SA001017", BIOS 5.6.5 06/07/2016 task: ffff88017edfea40 task.stack: ffff88017ee10000 RIP: 0010:[] [] netdev_all_lower_get_next_rcu+0x33/0x60 [...] Call Trace: [] mlxsw_sp_port_lower_dev_hold+0x5a/0xa0 [mlxsw_spectrum] [] mlxsw_sp_router_netevent_event+0x80/0x150 [mlxsw_spectrum] [] notifier_call_chain+0x4a/0x70 [] atomic_notifier_call_chain+0x1a/0x20 [] call_netevent_notifiers+0x1b/0x20 [] neigh_update+0x306/0x740 [] neigh_event_ns+0x4e/0xb0 [] arp_process+0x66f/0x700 [] ? common_interrupt+0x8c/0x8c [] arp_rcv+0x139/0x1d0 [] ? vlan_do_receive+0xda/0x320 [] __netif_receive_skb_core+0x524/0xab0 [] ? dev_queue_xmit+0x10/0x20 [] ? br_forward_finish+0x3d/0xc0 [bridge] [] ? br_handle_vlan+0xf6/0x1b0 [bridge] [] __netif_receive_skb+0x18/0x60 [] netif_receive_skb_internal+0x40/0xb0 [] netif_receive_skb+0x1c/0x70 [] br_pass_frame_up+0xc6/0x160 [bridge] [] ? deliver_clone+0x37/0x50 [bridge] [] ? br_flood+0xcc/0x160 [bridge] [] br_handle_frame_finish+0x224/0x4f0 [bridge] [] br_handle_frame+0x174/0x300 [bridge] [] __netif_receive_skb_core+0x329/0xab0 [] ? find_next_bit+0x15/0x20 [] ? cpumask_next_and+0x32/0x50 [] ? load_balance+0x178/0x9b0 [] __netif_receive_skb+0x18/0x60 [] netif_receive_skb_internal+0x40/0xb0 [] netif_receive_skb+0x1c/0x70 [] mlxsw_sp_rx_listener_func+0x61/0xb0 [mlxsw_spectrum] [] mlxsw_core_skb_receive+0x187/0x200 [mlxsw_core] [] mlxsw_pci_cq_tasklet+0x63a/0x9b0 [mlxsw_pci] [] tasklet_action+0xf6/0x110 [] __do_softirq+0xf6/0x280 [] irq_exit+0xdf/0xf0 [] do_IRQ+0x54/0xd0 [] common_interrupt+0x8c/0x8c The problem is that netdev_all_lower_get_next_rcu() never advances the iterator, thereby causing the loop over the lower adjacency list to run forever. Fix this by advancing the iterator and avoid the infinite loop. Fixes: 7ce856aaaf13 ("mlxsw: spectrum: Add couple of lower device helper functions") Signed-off-by: Ido Schimmel Reported-by: Tamir Winetroub Reviewed-by: Jiri Pirko Acked-by: David Ahern Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- net/core/dev.c | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 136ae6bbe81e..465e128699a1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3877,7 +3877,7 @@ struct net_device *netdev_all_lower_get_next_rcu(struct net_device *dev, ldev = netdev_all_lower_get_next(dev, &(iter))) #define netdev_for_each_all_lower_dev_rcu(dev, ldev, iter) \ - for (iter = (dev)->all_adj_list.lower.next, \ + for (iter = &(dev)->all_adj_list.lower, \ ldev = netdev_all_lower_get_next_rcu(dev, &(iter)); \ ldev; \ ldev = netdev_all_lower_get_next_rcu(dev, &(iter))) diff --git a/net/core/dev.c b/net/core/dev.c index f1fe26f66458..b09ac57f4348 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5511,10 +5511,14 @@ struct net_device *netdev_all_lower_get_next_rcu(struct net_device *dev, { struct netdev_adjacent *lower; - lower = list_first_or_null_rcu(&dev->all_adj_list.lower, - struct netdev_adjacent, list); + lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + + if (&lower->list == &dev->all_adj_list.lower) + return NULL; + + *iter = &lower->list; - return lower ? lower->dev : NULL; + return lower->dev; } EXPORT_SYMBOL(netdev_all_lower_get_next_rcu); -- cgit From 3b913179c3fa89dd0e304193fa0c746fc0481447 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 13 Oct 2016 01:20:14 +0100 Subject: mm: replace get_user_pages_locked() write/force parameters with gup_flags This removes the 'write' and 'force' use from get_user_pages_locked() and replaces them with 'gup_flags' to make the use of FOLL_FORCE explicit in callers as use of this flag can result in surprising behaviour (and hence bugs) within the mm subsystem. Signed-off-by: Lorenzo Stoakes Acked-by: Michal Hocko Reviewed-by: Jan Kara Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- mm/frame_vector.c | 8 +++++++- mm/gup.c | 12 +++--------- mm/nommu.c | 5 ++++- 4 files changed, 15 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index abd53f2eb74e..9fe9b0438169 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1282,7 +1282,7 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages, struct vm_area_struct **vmas); long get_user_pages_locked(unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, int *locked); + unsigned int gup_flags, struct page **pages, int *locked); long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); diff --git a/mm/frame_vector.c b/mm/frame_vector.c index 381bb07ed14f..81b67498bb9c 100644 --- a/mm/frame_vector.c +++ b/mm/frame_vector.c @@ -41,10 +41,16 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, int ret = 0; int err; int locked; + unsigned int gup_flags = 0; if (nr_frames == 0) return 0; + if (write) + gup_flags |= FOLL_WRITE; + if (force) + gup_flags |= FOLL_FORCE; + if (WARN_ON_ONCE(nr_frames > vec->nr_allocated)) nr_frames = vec->nr_allocated; @@ -59,7 +65,7 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, vec->got_ref = true; vec->is_pfns = false; ret = get_user_pages_locked(start, nr_frames, - write, force, (struct page **)(vec->ptrs), &locked); + gup_flags, (struct page **)(vec->ptrs), &locked); goto out; } diff --git a/mm/gup.c b/mm/gup.c index 373d1ec006e4..3cfb55ef0cf4 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -848,18 +848,12 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, * up_read(&mm->mmap_sem); */ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, + unsigned int gup_flags, struct page **pages, int *locked) { - unsigned int flags = FOLL_TOUCH; - - if (write) - flags |= FOLL_WRITE; - if (force) - flags |= FOLL_FORCE; - return __get_user_pages_locked(current, current->mm, start, nr_pages, - pages, NULL, locked, true, flags); + pages, NULL, locked, true, + gup_flags | FOLL_TOUCH); } EXPORT_SYMBOL(get_user_pages_locked); diff --git a/mm/nommu.c b/mm/nommu.c index 7e27add39f7e..842cfdd1a31e 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -176,9 +176,12 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, EXPORT_SYMBOL(get_user_pages); long get_user_pages_locked(unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, + unsigned int gup_flags, struct page **pages, int *locked) { + int write = gup_flags & FOLL_WRITE; + int force = gup_flags & FOLL_FORCE; + return get_user_pages(start, nr_pages, write, force, pages, NULL); } EXPORT_SYMBOL(get_user_pages_locked); -- cgit From 7f23b3504a0df63b724180262c5f3f117f21bcae Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 13 Oct 2016 01:20:15 +0100 Subject: mm: replace get_vaddr_frames() write/force parameters with gup_flags This removes the 'write' and 'force' from get_vaddr_frames() and replaces them with 'gup_flags' to make the use of FOLL_FORCE explicit in callers as use of this flag can result in surprising behaviour (and hence bugs) within the mm subsystem. Signed-off-by: Lorenzo Stoakes Acked-by: Michal Hocko Reviewed-by: Jan Kara Signed-off-by: Linus Torvalds --- drivers/gpu/drm/exynos/exynos_drm_g2d.c | 3 ++- drivers/media/platform/omap/omap_vout.c | 2 +- drivers/media/v4l2-core/videobuf2-memops.c | 6 +++++- include/linux/mm.h | 2 +- mm/frame_vector.c | 13 ++----------- 5 files changed, 11 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/exynos/exynos_drm_g2d.c b/drivers/gpu/drm/exynos/exynos_drm_g2d.c index aa92decf4233..fbd13fabdf2d 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_g2d.c +++ b/drivers/gpu/drm/exynos/exynos_drm_g2d.c @@ -488,7 +488,8 @@ static dma_addr_t *g2d_userptr_get_dma_addr(struct drm_device *drm_dev, goto err_free; } - ret = get_vaddr_frames(start, npages, true, true, g2d_userptr->vec); + ret = get_vaddr_frames(start, npages, FOLL_FORCE | FOLL_WRITE, + g2d_userptr->vec); if (ret != npages) { DRM_ERROR("failed to get user pages from userptr.\n"); if (ret < 0) diff --git a/drivers/media/platform/omap/omap_vout.c b/drivers/media/platform/omap/omap_vout.c index e668dde6d857..a31b95cb3b09 100644 --- a/drivers/media/platform/omap/omap_vout.c +++ b/drivers/media/platform/omap/omap_vout.c @@ -214,7 +214,7 @@ static int omap_vout_get_userptr(struct videobuf_buffer *vb, u32 virtp, if (!vec) return -ENOMEM; - ret = get_vaddr_frames(virtp, 1, true, false, vec); + ret = get_vaddr_frames(virtp, 1, FOLL_WRITE, vec); if (ret != 1) { frame_vector_destroy(vec); return -EINVAL; diff --git a/drivers/media/v4l2-core/videobuf2-memops.c b/drivers/media/v4l2-core/videobuf2-memops.c index 3c3b517f1d1c..1cd322e939c7 100644 --- a/drivers/media/v4l2-core/videobuf2-memops.c +++ b/drivers/media/v4l2-core/videobuf2-memops.c @@ -42,6 +42,10 @@ struct frame_vector *vb2_create_framevec(unsigned long start, unsigned long first, last; unsigned long nr; struct frame_vector *vec; + unsigned int flags = FOLL_FORCE; + + if (write) + flags |= FOLL_WRITE; first = start >> PAGE_SHIFT; last = (start + length - 1) >> PAGE_SHIFT; @@ -49,7 +53,7 @@ struct frame_vector *vb2_create_framevec(unsigned long start, vec = frame_vector_create(nr); if (!vec) return ERR_PTR(-ENOMEM); - ret = get_vaddr_frames(start & PAGE_MASK, nr, write, true, vec); + ret = get_vaddr_frames(start & PAGE_MASK, nr, flags, vec); if (ret < 0) goto out_destroy; /* We accept only complete set of PFNs */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 9fe9b0438169..91cc923ce985 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1305,7 +1305,7 @@ struct frame_vector { struct frame_vector *frame_vector_create(unsigned int nr_frames); void frame_vector_destroy(struct frame_vector *vec); int get_vaddr_frames(unsigned long start, unsigned int nr_pfns, - bool write, bool force, struct frame_vector *vec); + unsigned int gup_flags, struct frame_vector *vec); void put_vaddr_frames(struct frame_vector *vec); int frame_vector_to_pages(struct frame_vector *vec); void frame_vector_to_pfns(struct frame_vector *vec); diff --git a/mm/frame_vector.c b/mm/frame_vector.c index 81b67498bb9c..db77dcb38afd 100644 --- a/mm/frame_vector.c +++ b/mm/frame_vector.c @@ -11,10 +11,7 @@ * get_vaddr_frames() - map virtual addresses to pfns * @start: starting user address * @nr_frames: number of pages / pfns from start to map - * @write: whether pages will be written to by the caller - * @force: whether to force write access even if user mapping is - * readonly. See description of the same argument of - get_user_pages(). + * @gup_flags: flags modifying lookup behaviour * @vec: structure which receives pages / pfns of the addresses mapped. * It should have space for at least nr_frames entries. * @@ -34,23 +31,17 @@ * This function takes care of grabbing mmap_sem as necessary. */ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, - bool write, bool force, struct frame_vector *vec) + unsigned int gup_flags, struct frame_vector *vec) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; int ret = 0; int err; int locked; - unsigned int gup_flags = 0; if (nr_frames == 0) return 0; - if (write) - gup_flags |= FOLL_WRITE; - if (force) - gup_flags |= FOLL_FORCE; - if (WARN_ON_ONCE(nr_frames > vec->nr_allocated)) nr_frames = vec->nr_allocated; -- cgit From 768ae309a96103ed02eb1e111e838c87854d8b51 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 13 Oct 2016 01:20:16 +0100 Subject: mm: replace get_user_pages() write/force parameters with gup_flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This removes the 'write' and 'force' from get_user_pages() and replaces them with 'gup_flags' to make the use of FOLL_FORCE explicit in callers as use of this flag can result in surprising behaviour (and hence bugs) within the mm subsystem. Signed-off-by: Lorenzo Stoakes Acked-by: Christian König Acked-by: Jesper Nilsson Acked-by: Michal Hocko Reviewed-by: Jan Kara Signed-off-by: Linus Torvalds --- arch/cris/arch-v32/drivers/cryptocop.c | 4 +--- arch/ia64/kernel/err_inject.c | 2 +- arch/x86/mm/mpx.c | 5 ++--- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 7 +++++-- drivers/gpu/drm/radeon/radeon_ttm.c | 3 ++- drivers/gpu/drm/via/via_dmablit.c | 4 ++-- drivers/infiniband/core/umem.c | 6 +++++- drivers/infiniband/hw/mthca/mthca_memfree.c | 2 +- drivers/infiniband/hw/qib/qib_user_pages.c | 3 ++- drivers/infiniband/hw/usnic/usnic_uiom.c | 5 ++++- drivers/media/v4l2-core/videobuf-dma-sg.c | 7 +++++-- drivers/misc/mic/scif/scif_rma.c | 3 +-- drivers/misc/sgi-gru/grufault.c | 2 +- drivers/platform/goldfish/goldfish_pipe.c | 3 ++- drivers/rapidio/devices/rio_mport_cdev.c | 3 ++- .../vc04_services/interface/vchiq_arm/vchiq_2835_arm.c | 3 +-- .../vc04_services/interface/vchiq_arm/vchiq_arm.c | 3 +-- drivers/virt/fsl_hypervisor.c | 4 ++-- include/linux/mm.h | 2 +- mm/gup.c | 12 +++--------- mm/mempolicy.c | 2 +- mm/nommu.c | 18 ++++-------------- 22 files changed, 49 insertions(+), 54 deletions(-) (limited to 'include/linux') diff --git a/arch/cris/arch-v32/drivers/cryptocop.c b/arch/cris/arch-v32/drivers/cryptocop.c index b5698c876fcc..099e170a93ee 100644 --- a/arch/cris/arch-v32/drivers/cryptocop.c +++ b/arch/cris/arch-v32/drivers/cryptocop.c @@ -2722,7 +2722,6 @@ static int cryptocop_ioctl_process(struct inode *inode, struct file *filp, unsig err = get_user_pages((unsigned long int)(oper.indata + prev_ix), noinpages, 0, /* read access only for in data */ - 0, /* no force */ inpages, NULL); @@ -2736,8 +2735,7 @@ static int cryptocop_ioctl_process(struct inode *inode, struct file *filp, unsig if (oper.do_cipher){ err = get_user_pages((unsigned long int)oper.cipher_outdata, nooutpages, - 1, /* write access for out data */ - 0, /* no force */ + FOLL_WRITE, /* write access for out data */ outpages, NULL); up_read(¤t->mm->mmap_sem); diff --git a/arch/ia64/kernel/err_inject.c b/arch/ia64/kernel/err_inject.c index 09f845793d12..5ed0ea92c5bf 100644 --- a/arch/ia64/kernel/err_inject.c +++ b/arch/ia64/kernel/err_inject.c @@ -142,7 +142,7 @@ store_virtual_to_phys(struct device *dev, struct device_attribute *attr, u64 virt_addr=simple_strtoull(buf, NULL, 16); int ret; - ret = get_user_pages(virt_addr, 1, VM_READ, 0, NULL, NULL); + ret = get_user_pages(virt_addr, 1, FOLL_WRITE, NULL, NULL); if (ret<=0) { #ifdef ERR_INJ_DEBUG printk("Virtual address %lx is not existing.\n",virt_addr); diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 80476878eb4c..e4f800999b32 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -544,10 +544,9 @@ static int mpx_resolve_fault(long __user *addr, int write) { long gup_ret; int nr_pages = 1; - int force = 0; - gup_ret = get_user_pages((unsigned long)addr, nr_pages, write, - force, NULL, NULL); + gup_ret = get_user_pages((unsigned long)addr, nr_pages, + write ? FOLL_WRITE : 0, NULL, NULL); /* * get_user_pages() returns number of pages gotten. * 0 means we failed to fault in and get anything, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 887483b8b818..dcaf691f56b5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -555,10 +555,13 @@ struct amdgpu_ttm_tt { int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages) { struct amdgpu_ttm_tt *gtt = (void *)ttm; - int write = !(gtt->userflags & AMDGPU_GEM_USERPTR_READONLY); + unsigned int flags = 0; unsigned pinned = 0; int r; + if (!(gtt->userflags & AMDGPU_GEM_USERPTR_READONLY)) + flags |= FOLL_WRITE; + if (gtt->userflags & AMDGPU_GEM_USERPTR_ANONONLY) { /* check that we only use anonymous memory to prevent problems with writeback */ @@ -581,7 +584,7 @@ int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages) list_add(&guptask.list, >t->guptasks); spin_unlock(>t->guptasklock); - r = get_user_pages(userptr, num_pages, write, 0, p, NULL); + r = get_user_pages(userptr, num_pages, flags, p, NULL); spin_lock(>t->guptasklock); list_del(&guptask.list); diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c index 455268214b89..3de5e6e21662 100644 --- a/drivers/gpu/drm/radeon/radeon_ttm.c +++ b/drivers/gpu/drm/radeon/radeon_ttm.c @@ -566,7 +566,8 @@ static int radeon_ttm_tt_pin_userptr(struct ttm_tt *ttm) uint64_t userptr = gtt->userptr + pinned * PAGE_SIZE; struct page **pages = ttm->pages + pinned; - r = get_user_pages(userptr, num_pages, write, 0, pages, NULL); + r = get_user_pages(userptr, num_pages, write ? FOLL_WRITE : 0, + pages, NULL); if (r < 0) goto release_pages; diff --git a/drivers/gpu/drm/via/via_dmablit.c b/drivers/gpu/drm/via/via_dmablit.c index 7e2a12c4fed2..1a3ad769f8c8 100644 --- a/drivers/gpu/drm/via/via_dmablit.c +++ b/drivers/gpu/drm/via/via_dmablit.c @@ -241,8 +241,8 @@ via_lock_all_dma_pages(drm_via_sg_info_t *vsg, drm_via_dmablit_t *xfer) down_read(¤t->mm->mmap_sem); ret = get_user_pages((unsigned long)xfer->mem_addr, vsg->num_pages, - (vsg->direction == DMA_FROM_DEVICE), - 0, vsg->pages, NULL); + (vsg->direction == DMA_FROM_DEVICE) ? FOLL_WRITE : 0, + vsg->pages, NULL); up_read(¤t->mm->mmap_sem); if (ret != vsg->num_pages) { diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index c68746ce6624..224ad274ea0b 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -94,6 +94,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, unsigned long dma_attrs = 0; struct scatterlist *sg, *sg_list_start; int need_release = 0; + unsigned int gup_flags = FOLL_WRITE; if (dmasync) dma_attrs |= DMA_ATTR_WRITE_BARRIER; @@ -183,6 +184,9 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, if (ret) goto out; + if (!umem->writable) + gup_flags |= FOLL_FORCE; + need_release = 1; sg_list_start = umem->sg_head.sgl; @@ -190,7 +194,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, ret = get_user_pages(cur_base, min_t(unsigned long, npages, PAGE_SIZE / sizeof (struct page *)), - 1, !umem->writable, page_list, vma_list); + gup_flags, page_list, vma_list); if (ret < 0) goto out; diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c index 6c00d04b8b28..c6fe89d79248 100644 --- a/drivers/infiniband/hw/mthca/mthca_memfree.c +++ b/drivers/infiniband/hw/mthca/mthca_memfree.c @@ -472,7 +472,7 @@ int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar, goto out; } - ret = get_user_pages(uaddr & PAGE_MASK, 1, 1, 0, pages, NULL); + ret = get_user_pages(uaddr & PAGE_MASK, 1, FOLL_WRITE, pages, NULL); if (ret < 0) goto out; diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c index 2d2b94fd3633..75f08624ac05 100644 --- a/drivers/infiniband/hw/qib/qib_user_pages.c +++ b/drivers/infiniband/hw/qib/qib_user_pages.c @@ -67,7 +67,8 @@ static int __qib_get_user_pages(unsigned long start_page, size_t num_pages, for (got = 0; got < num_pages; got += ret) { ret = get_user_pages(start_page + got * PAGE_SIZE, - num_pages - got, 1, 1, + num_pages - got, + FOLL_WRITE | FOLL_FORCE, p + got, NULL); if (ret < 0) goto bail_release; diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index a0b6ebee4d8a..1ccee6ea5bc3 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -111,6 +111,7 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, int i; int flags; dma_addr_t pa; + unsigned int gup_flags; if (!can_do_mlock()) return -EPERM; @@ -135,6 +136,8 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, flags = IOMMU_READ | IOMMU_CACHE; flags |= (writable) ? IOMMU_WRITE : 0; + gup_flags = FOLL_WRITE; + gup_flags |= (writable) ? 0 : FOLL_FORCE; cur_base = addr & PAGE_MASK; ret = 0; @@ -142,7 +145,7 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, ret = get_user_pages(cur_base, min_t(unsigned long, npages, PAGE_SIZE / sizeof(struct page *)), - 1, !writable, page_list, NULL); + gup_flags, page_list, NULL); if (ret < 0) goto out; diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c index f300f060b3f3..1db0af6c7f94 100644 --- a/drivers/media/v4l2-core/videobuf-dma-sg.c +++ b/drivers/media/v4l2-core/videobuf-dma-sg.c @@ -156,6 +156,7 @@ static int videobuf_dma_init_user_locked(struct videobuf_dmabuf *dma, { unsigned long first, last; int err, rw = 0; + unsigned int flags = FOLL_FORCE; dma->direction = direction; switch (dma->direction) { @@ -178,12 +179,14 @@ static int videobuf_dma_init_user_locked(struct videobuf_dmabuf *dma, if (NULL == dma->pages) return -ENOMEM; + if (rw == READ) + flags |= FOLL_WRITE; + dprintk(1, "init user [0x%lx+0x%lx => %d pages]\n", data, size, dma->nr_pages); err = get_user_pages(data & PAGE_MASK, dma->nr_pages, - rw == READ, 1, /* force */ - dma->pages, NULL); + flags, dma->pages, NULL); if (err != dma->nr_pages) { dma->nr_pages = (err >= 0) ? err : 0; diff --git a/drivers/misc/mic/scif/scif_rma.c b/drivers/misc/mic/scif/scif_rma.c index e0203b1a20fd..f806a4471eb9 100644 --- a/drivers/misc/mic/scif/scif_rma.c +++ b/drivers/misc/mic/scif/scif_rma.c @@ -1396,8 +1396,7 @@ retry: pinned_pages->nr_pages = get_user_pages( (u64)addr, nr_pages, - !!(prot & SCIF_PROT_WRITE), - 0, + (prot & SCIF_PROT_WRITE) ? FOLL_WRITE : 0, pinned_pages->pages, NULL); up_write(&mm->mmap_sem); diff --git a/drivers/misc/sgi-gru/grufault.c b/drivers/misc/sgi-gru/grufault.c index a2d97b9b17e3..6fb773dbcd0c 100644 --- a/drivers/misc/sgi-gru/grufault.c +++ b/drivers/misc/sgi-gru/grufault.c @@ -198,7 +198,7 @@ static int non_atomic_pte_lookup(struct vm_area_struct *vma, #else *pageshift = PAGE_SHIFT; #endif - if (get_user_pages(vaddr, 1, write, 0, &page, NULL) <= 0) + if (get_user_pages(vaddr, 1, write ? FOLL_WRITE : 0, &page, NULL) <= 0) return -EFAULT; *paddr = page_to_phys(page); put_page(page); diff --git a/drivers/platform/goldfish/goldfish_pipe.c b/drivers/platform/goldfish/goldfish_pipe.c index 07462d79d040..1aba2c74160e 100644 --- a/drivers/platform/goldfish/goldfish_pipe.c +++ b/drivers/platform/goldfish/goldfish_pipe.c @@ -309,7 +309,8 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer, * much memory to the process. */ down_read(¤t->mm->mmap_sem); - ret = get_user_pages(address, 1, !is_write, 0, &page, NULL); + ret = get_user_pages(address, 1, is_write ? 0 : FOLL_WRITE, + &page, NULL); up_read(¤t->mm->mmap_sem); if (ret < 0) break; diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c index 436dfe871d32..9013a585507e 100644 --- a/drivers/rapidio/devices/rio_mport_cdev.c +++ b/drivers/rapidio/devices/rio_mport_cdev.c @@ -892,7 +892,8 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode, down_read(¤t->mm->mmap_sem); pinned = get_user_pages( (unsigned long)xfer->loc_addr & PAGE_MASK, - nr_pages, dir == DMA_FROM_DEVICE, 0, + nr_pages, + dir == DMA_FROM_DEVICE ? FOLL_WRITE : 0, page_list, NULL); up_read(¤t->mm->mmap_sem); diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c index c29040fdf9a7..1091b9f1dd07 100644 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c +++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c @@ -423,8 +423,7 @@ create_pagelist(char __user *buf, size_t count, unsigned short type, actual_pages = get_user_pages(task, task->mm, (unsigned long)buf & ~(PAGE_SIZE - 1), num_pages, - (type == PAGELIST_READ) /*Write */ , - 0 /*Force */ , + (type == PAGELIST_READ) ? FOLL_WRITE : 0, pages, NULL /*vmas */); up_read(&task->mm->mmap_sem); diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c index e11c0e07471b..7b6cd4d80621 100644 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c +++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c @@ -1477,8 +1477,7 @@ dump_phys_mem(void *virt_addr, uint32_t num_bytes) current->mm, /* mm */ (unsigned long)virt_addr, /* start */ num_pages, /* len */ - 0, /* write */ - 0, /* force */ + 0, /* gup_flags */ pages, /* pages (array of page pointers) */ NULL); /* vmas */ up_read(¤t->mm->mmap_sem); diff --git a/drivers/virt/fsl_hypervisor.c b/drivers/virt/fsl_hypervisor.c index 60bdad3a689b..150ce2abf6c8 100644 --- a/drivers/virt/fsl_hypervisor.c +++ b/drivers/virt/fsl_hypervisor.c @@ -245,8 +245,8 @@ static long ioctl_memcpy(struct fsl_hv_ioctl_memcpy __user *p) /* Get the physical addresses of the source buffer */ down_read(¤t->mm->mmap_sem); num_pinned = get_user_pages(param.local_vaddr - lb_offset, - num_pages, (param.source == -1) ? READ : WRITE, - 0, pages, NULL); + num_pages, (param.source == -1) ? 0 : FOLL_WRITE, + pages, NULL); up_read(¤t->mm->mmap_sem); if (num_pinned != num_pages) { diff --git a/include/linux/mm.h b/include/linux/mm.h index 91cc923ce985..30bb5d9631bb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1279,7 +1279,7 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, int write, int force, struct page **pages, struct vm_area_struct **vmas); long get_user_pages(unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, + unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas); long get_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked); diff --git a/mm/gup.c b/mm/gup.c index 3cfb55ef0cf4..bbc2e76cdd77 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -987,18 +987,12 @@ EXPORT_SYMBOL(get_user_pages_remote); * obviously don't pass FOLL_REMOTE in here. */ long get_user_pages(unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, + unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas) { - unsigned int flags = FOLL_TOUCH; - - if (write) - flags |= FOLL_WRITE; - if (force) - flags |= FOLL_FORCE; - return __get_user_pages_locked(current, current->mm, start, nr_pages, - pages, vmas, NULL, false, flags); + pages, vmas, NULL, false, + gup_flags | FOLL_TOUCH); } EXPORT_SYMBOL(get_user_pages); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index ad1c96ac313c..0b859af06b87 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -850,7 +850,7 @@ static int lookup_node(unsigned long addr) struct page *p; int err; - err = get_user_pages(addr & PAGE_MASK, 1, 0, 0, &p, NULL); + err = get_user_pages(addr & PAGE_MASK, 1, 0, &p, NULL); if (err >= 0) { err = page_to_nid(p); put_page(p); diff --git a/mm/nommu.c b/mm/nommu.c index 842cfdd1a31e..70cb844dfd95 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -160,18 +160,11 @@ finish_or_fault: * - don't permit access to VMAs that don't support it, such as I/O mappings */ long get_user_pages(unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, + unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas) { - int flags = 0; - - if (write) - flags |= FOLL_WRITE; - if (force) - flags |= FOLL_FORCE; - - return __get_user_pages(current, current->mm, start, nr_pages, flags, - pages, vmas, NULL); + return __get_user_pages(current, current->mm, start, nr_pages, + gup_flags, pages, vmas, NULL); } EXPORT_SYMBOL(get_user_pages); @@ -179,10 +172,7 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked) { - int write = gup_flags & FOLL_WRITE; - int force = gup_flags & FOLL_FORCE; - - return get_user_pages(start, nr_pages, write, force, pages, NULL); + return get_user_pages(start, nr_pages, gup_flags, pages, NULL); } EXPORT_SYMBOL(get_user_pages_locked); -- cgit From 9beae1ea89305a9667ceaab6d0bf46a045ad71e7 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 13 Oct 2016 01:20:17 +0100 Subject: mm: replace get_user_pages_remote() write/force parameters with gup_flags This removes the 'write' and 'force' from get_user_pages_remote() and replaces them with 'gup_flags' to make the use of FOLL_FORCE explicit in callers as use of this flag can result in surprising behaviour (and hence bugs) within the mm subsystem. Signed-off-by: Lorenzo Stoakes Acked-by: Michal Hocko Reviewed-by: Jan Kara Signed-off-by: Linus Torvalds --- drivers/gpu/drm/etnaviv/etnaviv_gem.c | 7 +++++-- drivers/gpu/drm/i915/i915_gem_userptr.c | 6 +++++- drivers/infiniband/core/umem_odp.c | 7 +++++-- fs/exec.c | 9 +++++++-- include/linux/mm.h | 2 +- kernel/events/uprobes.c | 6 ++++-- mm/gup.c | 22 +++++++--------------- mm/memory.c | 6 +++++- security/tomoyo/domain.c | 2 +- 9 files changed, 40 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c index 5ce3603e6eac..0370b842d9cc 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c @@ -748,19 +748,22 @@ static struct page **etnaviv_gem_userptr_do_get_pages( int ret = 0, pinned, npages = etnaviv_obj->base.size >> PAGE_SHIFT; struct page **pvec; uintptr_t ptr; + unsigned int flags = 0; pvec = drm_malloc_ab(npages, sizeof(struct page *)); if (!pvec) return ERR_PTR(-ENOMEM); + if (!etnaviv_obj->userptr.ro) + flags |= FOLL_WRITE; + pinned = 0; ptr = etnaviv_obj->userptr.ptr; down_read(&mm->mmap_sem); while (pinned < npages) { ret = get_user_pages_remote(task, mm, ptr, npages - pinned, - !etnaviv_obj->userptr.ro, 0, - pvec + pinned, NULL); + flags, pvec + pinned, NULL); if (ret < 0) break; diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c index e537930c64b5..c6f780f5abc9 100644 --- a/drivers/gpu/drm/i915/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c @@ -508,6 +508,10 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work) pvec = drm_malloc_gfp(npages, sizeof(struct page *), GFP_TEMPORARY); if (pvec != NULL) { struct mm_struct *mm = obj->userptr.mm->mm; + unsigned int flags = 0; + + if (!obj->userptr.read_only) + flags |= FOLL_WRITE; ret = -EFAULT; if (atomic_inc_not_zero(&mm->mm_users)) { @@ -517,7 +521,7 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work) (work->task, mm, obj->userptr.ptr + pinned * PAGE_SIZE, npages - pinned, - !obj->userptr.read_only, 0, + flags, pvec + pinned, NULL); if (ret < 0) break; diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 75077a018675..1f0fe3217f23 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -527,6 +527,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, u64 off; int j, k, ret = 0, start_idx, npages = 0; u64 base_virt_addr; + unsigned int flags = 0; if (access_mask == 0) return -EINVAL; @@ -556,6 +557,9 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, goto out_put_task; } + if (access_mask & ODP_WRITE_ALLOWED_BIT) + flags |= FOLL_WRITE; + start_idx = (user_virt - ib_umem_start(umem)) >> PAGE_SHIFT; k = start_idx; @@ -574,8 +578,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, */ npages = get_user_pages_remote(owning_process, owning_mm, user_virt, gup_num_pages, - access_mask & ODP_WRITE_ALLOWED_BIT, - 0, local_page_list, NULL); + flags, local_page_list, NULL); up_read(&owning_mm->mmap_sem); if (npages < 0) diff --git a/fs/exec.c b/fs/exec.c index 6fcfb3f7b137..4e497b9ee71e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -191,6 +191,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, { struct page *page; int ret; + unsigned int gup_flags = FOLL_FORCE; #ifdef CONFIG_STACK_GROWSUP if (write) { @@ -199,12 +200,16 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, return NULL; } #endif + + if (write) + gup_flags |= FOLL_WRITE; + /* * We are doing an exec(). 'current' is the process * doing the exec and bprm->mm is the new process's mm. */ - ret = get_user_pages_remote(current, bprm->mm, pos, 1, write, - 1, &page, NULL); + ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags, + &page, NULL); if (ret <= 0) return NULL; diff --git a/include/linux/mm.h b/include/linux/mm.h index 30bb5d9631bb..ecc4be7b67e0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1276,7 +1276,7 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, struct vm_area_struct **vmas, int *nonblocking); long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, + unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas); long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d4129bb05e5d..f9ec9add2164 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -300,7 +300,8 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, retry: /* Read the page with vaddr into memory */ - ret = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma); + ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page, + &vma); if (ret <= 0) return ret; @@ -1710,7 +1711,8 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) * but we treat this as a 'remote' access since it is * essentially a kernel access to the memory. */ - result = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &page, NULL); + result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page, + NULL); if (result < 0) return result; diff --git a/mm/gup.c b/mm/gup.c index bbc2e76cdd77..7aa113c2d373 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -915,9 +915,7 @@ EXPORT_SYMBOL(get_user_pages_unlocked); * @mm: mm_struct of target mm * @start: starting user address * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to by the caller - * @force: whether to force access even when user mapping is currently - * protected (but never forces write access to shared mapping). + * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. Or NULL, if caller * only intends to ensure the pages are faulted in. @@ -946,9 +944,9 @@ EXPORT_SYMBOL(get_user_pages_unlocked); * or similar operation cannot guarantee anything stronger anyway because * locks can't be held over the syscall boundary. * - * If write=0, the page must not be written to. If the page is written to, - * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called - * after the page is finished with, and before put_page is called. + * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page + * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must + * be called after the page is finished with, and before put_page is called. * * get_user_pages is typically used for fewer-copy IO operations, to get a * handle on the memory by some means other than accesses via the user virtual @@ -965,18 +963,12 @@ EXPORT_SYMBOL(get_user_pages_unlocked); */ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, + unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas) { - unsigned int flags = FOLL_TOUCH | FOLL_REMOTE; - - if (write) - flags |= FOLL_WRITE; - if (force) - flags |= FOLL_FORCE; - return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, - NULL, false, flags); + NULL, false, + gup_flags | FOLL_TOUCH | FOLL_REMOTE); } EXPORT_SYMBOL(get_user_pages_remote); diff --git a/mm/memory.c b/mm/memory.c index fc1987dfd8cc..20a9adb7b36e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3873,6 +3873,10 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, { struct vm_area_struct *vma; void *old_buf = buf; + unsigned int flags = FOLL_FORCE; + + if (write) + flags |= FOLL_WRITE; down_read(&mm->mmap_sem); /* ignore errors, just check how much was successfully transferred */ @@ -3882,7 +3886,7 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, struct page *page = NULL; ret = get_user_pages_remote(tsk, mm, addr, 1, - write, 1, &page, &vma); + flags, &page, &vma); if (ret <= 0) { #ifndef CONFIG_HAVE_IOREMAP_PROT break; diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c index ade7c6cad172..682b73af7766 100644 --- a/security/tomoyo/domain.c +++ b/security/tomoyo/domain.c @@ -881,7 +881,7 @@ bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos, * the execve(). */ if (get_user_pages_remote(current, bprm->mm, pos, 1, - 0, 1, &page, NULL) <= 0) + FOLL_FORCE, &page, NULL) <= 0) return false; #else page = bprm->page[pos / PAGE_SIZE]; -- cgit From 6347e8d5bcce33fc36e651901efefbe2c93a43ef Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 13 Oct 2016 01:20:19 +0100 Subject: mm: replace access_remote_vm() write parameter with gup_flags This removes the 'write' argument from access_remote_vm() and replaces it with 'gup_flags' as use of this function previously silently implied FOLL_FORCE, whereas after this patch callers explicitly pass this flag. We make this explicit as use of FOLL_FORCE can result in surprising behaviour (and hence bugs) within the mm subsystem. Signed-off-by: Lorenzo Stoakes Acked-by: Michal Hocko Signed-off-by: Linus Torvalds --- fs/proc/base.c | 19 +++++++++++++------ include/linux/mm.h | 2 +- mm/memory.c | 11 +++-------- mm/nommu.c | 7 +++---- 4 files changed, 20 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/base.c b/fs/proc/base.c index c2964d890c9a..8e654468ab67 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -252,7 +252,7 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, * Inherently racy -- command line shares address space * with code and data. */ - rv = access_remote_vm(mm, arg_end - 1, &c, 1, 0); + rv = access_remote_vm(mm, arg_end - 1, &c, 1, FOLL_FORCE); if (rv <= 0) goto out_free_page; @@ -270,7 +270,8 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, int nr_read; _count = min3(count, len, PAGE_SIZE); - nr_read = access_remote_vm(mm, p, page, _count, 0); + nr_read = access_remote_vm(mm, p, page, _count, + FOLL_FORCE); if (nr_read < 0) rv = nr_read; if (nr_read <= 0) @@ -305,7 +306,8 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, bool final; _count = min3(count, len, PAGE_SIZE); - nr_read = access_remote_vm(mm, p, page, _count, 0); + nr_read = access_remote_vm(mm, p, page, _count, + FOLL_FORCE); if (nr_read < 0) rv = nr_read; if (nr_read <= 0) @@ -354,7 +356,8 @@ skip_argv: bool final; _count = min3(count, len, PAGE_SIZE); - nr_read = access_remote_vm(mm, p, page, _count, 0); + nr_read = access_remote_vm(mm, p, page, _count, + FOLL_FORCE); if (nr_read < 0) rv = nr_read; if (nr_read <= 0) @@ -832,6 +835,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf, unsigned long addr = *ppos; ssize_t copied; char *page; + unsigned int flags = FOLL_FORCE; if (!mm) return 0; @@ -844,6 +848,9 @@ static ssize_t mem_rw(struct file *file, char __user *buf, if (!atomic_inc_not_zero(&mm->mm_users)) goto free; + if (write) + flags |= FOLL_WRITE; + while (count > 0) { int this_len = min_t(int, count, PAGE_SIZE); @@ -852,7 +859,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf, break; } - this_len = access_remote_vm(mm, addr, page, this_len, write); + this_len = access_remote_vm(mm, addr, page, this_len, flags); if (!this_len) { if (!copied) copied = -EIO; @@ -965,7 +972,7 @@ static ssize_t environ_read(struct file *file, char __user *buf, this_len = min(max_len, this_len); retval = access_remote_vm(mm, (env_start + src), - page, this_len, 0); + page, this_len, FOLL_FORCE); if (retval <= 0) { ret = retval; diff --git a/include/linux/mm.h b/include/linux/mm.h index ecc4be7b67e0..f31bf9058587 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1268,7 +1268,7 @@ static inline int fixup_user_fault(struct task_struct *tsk, extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, - void *buf, int len, int write); + void *buf, int len, unsigned int gup_flags); long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, diff --git a/mm/memory.c b/mm/memory.c index 79ebed3a4c2b..bac2d994850e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3935,19 +3935,14 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, * @addr: start address to access * @buf: source or destination buffer * @len: number of bytes to transfer - * @write: whether the access is a write + * @gup_flags: flags modifying lookup behaviour * * The caller must hold a reference on @mm. */ int access_remote_vm(struct mm_struct *mm, unsigned long addr, - void *buf, int len, int write) + void *buf, int len, unsigned int gup_flags) { - unsigned int flags = FOLL_FORCE; - - if (write) - flags |= FOLL_WRITE; - - return __access_remote_vm(NULL, mm, addr, buf, len, flags); + return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags); } /* diff --git a/mm/nommu.c b/mm/nommu.c index bde7df35118b..93d5bb53fc63 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1847,15 +1847,14 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, * @addr: start address to access * @buf: source or destination buffer * @len: number of bytes to transfer - * @write: whether the access is a write + * @gup_flags: flags modifying lookup behaviour * * The caller must hold a reference on @mm. */ int access_remote_vm(struct mm_struct *mm, unsigned long addr, - void *buf, int len, int write) + void *buf, int len, unsigned int gup_flags) { - return __access_remote_vm(NULL, mm, addr, buf, len, - write ? FOLL_WRITE : 0); + return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags); } /* -- cgit From f307ab6dcea03f9d8e4d70508fd7d1ca57cfa7f9 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 13 Oct 2016 01:20:20 +0100 Subject: mm: replace access_process_vm() write parameter with gup_flags This removes the 'write' argument from access_process_vm() and replaces it with 'gup_flags' as use of this function previously silently implied FOLL_FORCE, whereas after this patch callers explicitly pass this flag. We make this explicit as use of FOLL_FORCE can result in surprising behaviour (and hence bugs) within the mm subsystem. Signed-off-by: Lorenzo Stoakes Acked-by: Jesper Nilsson Acked-by: Michal Hocko Acked-by: Michael Ellerman Signed-off-by: Linus Torvalds --- arch/alpha/kernel/ptrace.c | 9 ++++++--- arch/blackfin/kernel/ptrace.c | 5 +++-- arch/cris/arch-v32/kernel/ptrace.c | 4 ++-- arch/ia64/kernel/ptrace.c | 14 +++++++++----- arch/m32r/kernel/ptrace.c | 15 ++++++++++----- arch/mips/kernel/ptrace32.c | 5 +++-- arch/powerpc/kernel/ptrace32.c | 5 +++-- arch/score/kernel/ptrace.c | 10 ++++++---- arch/sparc/kernel/ptrace_64.c | 24 ++++++++++++++++-------- arch/x86/kernel/step.c | 3 ++- arch/x86/um/ptrace_32.c | 3 ++- arch/x86/um/ptrace_64.c | 3 ++- include/linux/mm.h | 3 ++- kernel/ptrace.c | 16 ++++++++++------ mm/memory.c | 8 ++------ mm/nommu.c | 6 +++--- mm/util.c | 5 +++-- 17 files changed, 84 insertions(+), 54 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/kernel/ptrace.c b/arch/alpha/kernel/ptrace.c index d9ee81769899..940dfb406591 100644 --- a/arch/alpha/kernel/ptrace.c +++ b/arch/alpha/kernel/ptrace.c @@ -157,14 +157,16 @@ put_reg(struct task_struct *task, unsigned long regno, unsigned long data) static inline int read_int(struct task_struct *task, unsigned long addr, int * data) { - int copied = access_process_vm(task, addr, data, sizeof(int), 0); + int copied = access_process_vm(task, addr, data, sizeof(int), + FOLL_FORCE); return (copied == sizeof(int)) ? 0 : -EIO; } static inline int write_int(struct task_struct *task, unsigned long addr, int data) { - int copied = access_process_vm(task, addr, &data, sizeof(int), 1); + int copied = access_process_vm(task, addr, &data, sizeof(int), + FOLL_FORCE | FOLL_WRITE); return (copied == sizeof(int)) ? 0 : -EIO; } @@ -281,7 +283,8 @@ long arch_ptrace(struct task_struct *child, long request, /* When I and D space are separate, these will need to be fixed. */ case PTRACE_PEEKTEXT: /* read word at location addr. */ case PTRACE_PEEKDATA: - copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0); + copied = access_process_vm(child, addr, &tmp, sizeof(tmp), + FOLL_FORCE); ret = -EIO; if (copied != sizeof(tmp)) break; diff --git a/arch/blackfin/kernel/ptrace.c b/arch/blackfin/kernel/ptrace.c index 8b8fe671b1a6..8d79286ee4e8 100644 --- a/arch/blackfin/kernel/ptrace.c +++ b/arch/blackfin/kernel/ptrace.c @@ -271,7 +271,7 @@ long arch_ptrace(struct task_struct *child, long request, case BFIN_MEM_ACCESS_CORE: case BFIN_MEM_ACCESS_CORE_ONLY: copied = access_process_vm(child, addr, &tmp, - to_copy, 0); + to_copy, FOLL_FORCE); if (copied) break; @@ -324,7 +324,8 @@ long arch_ptrace(struct task_struct *child, long request, case BFIN_MEM_ACCESS_CORE: case BFIN_MEM_ACCESS_CORE_ONLY: copied = access_process_vm(child, addr, &data, - to_copy, 1); + to_copy, + FOLL_FORCE | FOLL_WRITE); break; case BFIN_MEM_ACCESS_DMA: if (safe_dma_memcpy(paddr, &data, to_copy)) diff --git a/arch/cris/arch-v32/kernel/ptrace.c b/arch/cris/arch-v32/kernel/ptrace.c index f085229cf870..f0df654ac6fc 100644 --- a/arch/cris/arch-v32/kernel/ptrace.c +++ b/arch/cris/arch-v32/kernel/ptrace.c @@ -147,7 +147,7 @@ long arch_ptrace(struct task_struct *child, long request, /* The trampoline page is globally mapped, no page table to traverse.*/ tmp = *(unsigned long*)addr; } else { - copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0); + copied = access_process_vm(child, addr, &tmp, sizeof(tmp), FOLL_FORCE); if (copied != sizeof(tmp)) break; @@ -279,7 +279,7 @@ static int insn_size(struct task_struct *child, unsigned long pc) int opsize = 0; /* Read the opcode at pc (do what PTRACE_PEEKTEXT would do). */ - copied = access_process_vm(child, pc, &opcode, sizeof(opcode), 0); + copied = access_process_vm(child, pc, &opcode, sizeof(opcode), FOLL_FORCE); if (copied != sizeof(opcode)) return 0; diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index 6f54d511cc50..31aa8c0f68e1 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -453,7 +453,7 @@ ia64_peek (struct task_struct *child, struct switch_stack *child_stack, return 0; } } - copied = access_process_vm(child, addr, &ret, sizeof(ret), 0); + copied = access_process_vm(child, addr, &ret, sizeof(ret), FOLL_FORCE); if (copied != sizeof(ret)) return -EIO; *val = ret; @@ -489,7 +489,8 @@ ia64_poke (struct task_struct *child, struct switch_stack *child_stack, *ia64_rse_skip_regs(krbs, regnum) = val; } } - } else if (access_process_vm(child, addr, &val, sizeof(val), 1) + } else if (access_process_vm(child, addr, &val, sizeof(val), + FOLL_FORCE | FOLL_WRITE) != sizeof(val)) return -EIO; return 0; @@ -543,7 +544,8 @@ ia64_sync_user_rbs (struct task_struct *child, struct switch_stack *sw, ret = ia64_peek(child, sw, user_rbs_end, addr, &val); if (ret < 0) return ret; - if (access_process_vm(child, addr, &val, sizeof(val), 1) + if (access_process_vm(child, addr, &val, sizeof(val), + FOLL_FORCE | FOLL_WRITE) != sizeof(val)) return -EIO; } @@ -559,7 +561,8 @@ ia64_sync_kernel_rbs (struct task_struct *child, struct switch_stack *sw, /* now copy word for word from user rbs to kernel rbs: */ for (addr = user_rbs_start; addr < user_rbs_end; addr += 8) { - if (access_process_vm(child, addr, &val, sizeof(val), 0) + if (access_process_vm(child, addr, &val, sizeof(val), + FOLL_FORCE) != sizeof(val)) return -EIO; @@ -1156,7 +1159,8 @@ arch_ptrace (struct task_struct *child, long request, case PTRACE_PEEKTEXT: case PTRACE_PEEKDATA: /* read word at location addr */ - if (access_process_vm(child, addr, &data, sizeof(data), 0) + if (access_process_vm(child, addr, &data, sizeof(data), + FOLL_FORCE) != sizeof(data)) return -EIO; /* ensure return value is not mistaken for error code */ diff --git a/arch/m32r/kernel/ptrace.c b/arch/m32r/kernel/ptrace.c index 51f5e9aa4901..c145605a981f 100644 --- a/arch/m32r/kernel/ptrace.c +++ b/arch/m32r/kernel/ptrace.c @@ -493,7 +493,8 @@ unregister_all_debug_traps(struct task_struct *child) int i; for (i = 0; i < p->nr_trap; i++) - access_process_vm(child, p->addr[i], &p->insn[i], sizeof(p->insn[i]), 1); + access_process_vm(child, p->addr[i], &p->insn[i], sizeof(p->insn[i]), + FOLL_FORCE | FOLL_WRITE); p->nr_trap = 0; } @@ -537,7 +538,8 @@ embed_debug_trap(struct task_struct *child, unsigned long next_pc) unsigned long next_insn, code; unsigned long addr = next_pc & ~3; - if (access_process_vm(child, addr, &next_insn, sizeof(next_insn), 0) + if (access_process_vm(child, addr, &next_insn, sizeof(next_insn), + FOLL_FORCE) != sizeof(next_insn)) { return -1; /* error */ } @@ -546,7 +548,8 @@ embed_debug_trap(struct task_struct *child, unsigned long next_pc) if (register_debug_trap(child, next_pc, next_insn, &code)) { return -1; /* error */ } - if (access_process_vm(child, addr, &code, sizeof(code), 1) + if (access_process_vm(child, addr, &code, sizeof(code), + FOLL_FORCE | FOLL_WRITE) != sizeof(code)) { return -1; /* error */ } @@ -562,7 +565,8 @@ withdraw_debug_trap(struct pt_regs *regs) addr = (regs->bpc - 2) & ~3; regs->bpc -= 2; if (unregister_debug_trap(current, addr, &code)) { - access_process_vm(current, addr, &code, sizeof(code), 1); + access_process_vm(current, addr, &code, sizeof(code), + FOLL_FORCE | FOLL_WRITE); invalidate_cache(); } } @@ -589,7 +593,8 @@ void user_enable_single_step(struct task_struct *child) /* Compute next pc. */ pc = get_stack_long(child, PT_BPC); - if (access_process_vm(child, pc&~3, &insn, sizeof(insn), 0) + if (access_process_vm(child, pc&~3, &insn, sizeof(insn), + FOLL_FORCE) != sizeof(insn)) return; diff --git a/arch/mips/kernel/ptrace32.c b/arch/mips/kernel/ptrace32.c index 283b5a1967d1..7e71a4e0281b 100644 --- a/arch/mips/kernel/ptrace32.c +++ b/arch/mips/kernel/ptrace32.c @@ -70,7 +70,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, break; copied = access_process_vm(child, (u64)addrOthers, &tmp, - sizeof(tmp), 0); + sizeof(tmp), FOLL_FORCE); if (copied != sizeof(tmp)) break; ret = put_user(tmp, (u32 __user *) (unsigned long) data); @@ -179,7 +179,8 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, break; ret = 0; if (access_process_vm(child, (u64)addrOthers, &data, - sizeof(data), 1) == sizeof(data)) + sizeof(data), + FOLL_FORCE | FOLL_WRITE) == sizeof(data)) break; ret = -EIO; break; diff --git a/arch/powerpc/kernel/ptrace32.c b/arch/powerpc/kernel/ptrace32.c index f52b7db327c8..010b7b310237 100644 --- a/arch/powerpc/kernel/ptrace32.c +++ b/arch/powerpc/kernel/ptrace32.c @@ -74,7 +74,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, break; copied = access_process_vm(child, (u64)addrOthers, &tmp, - sizeof(tmp), 0); + sizeof(tmp), FOLL_FORCE); if (copied != sizeof(tmp)) break; ret = put_user(tmp, (u32 __user *)data); @@ -179,7 +179,8 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, break; ret = 0; if (access_process_vm(child, (u64)addrOthers, &tmp, - sizeof(tmp), 1) == sizeof(tmp)) + sizeof(tmp), + FOLL_FORCE | FOLL_WRITE) == sizeof(tmp)) break; ret = -EIO; break; diff --git a/arch/score/kernel/ptrace.c b/arch/score/kernel/ptrace.c index 55836188b217..4f7314d5f334 100644 --- a/arch/score/kernel/ptrace.c +++ b/arch/score/kernel/ptrace.c @@ -131,7 +131,7 @@ read_tsk_long(struct task_struct *child, { int copied; - copied = access_process_vm(child, addr, res, sizeof(*res), 0); + copied = access_process_vm(child, addr, res, sizeof(*res), FOLL_FORCE); return copied != sizeof(*res) ? -EIO : 0; } @@ -142,7 +142,7 @@ read_tsk_short(struct task_struct *child, { int copied; - copied = access_process_vm(child, addr, res, sizeof(*res), 0); + copied = access_process_vm(child, addr, res, sizeof(*res), FOLL_FORCE); return copied != sizeof(*res) ? -EIO : 0; } @@ -153,7 +153,8 @@ write_tsk_short(struct task_struct *child, { int copied; - copied = access_process_vm(child, addr, &val, sizeof(val), 1); + copied = access_process_vm(child, addr, &val, sizeof(val), + FOLL_FORCE | FOLL_WRITE); return copied != sizeof(val) ? -EIO : 0; } @@ -164,7 +165,8 @@ write_tsk_long(struct task_struct *child, { int copied; - copied = access_process_vm(child, addr, &val, sizeof(val), 1); + copied = access_process_vm(child, addr, &val, sizeof(val), + FOLL_FORCE | FOLL_WRITE); return copied != sizeof(val) ? -EIO : 0; } diff --git a/arch/sparc/kernel/ptrace_64.c b/arch/sparc/kernel/ptrace_64.c index 9ddc4928a089..ac082dd8c67d 100644 --- a/arch/sparc/kernel/ptrace_64.c +++ b/arch/sparc/kernel/ptrace_64.c @@ -127,7 +127,8 @@ static int get_from_target(struct task_struct *target, unsigned long uaddr, if (copy_from_user(kbuf, (void __user *) uaddr, len)) return -EFAULT; } else { - int len2 = access_process_vm(target, uaddr, kbuf, len, 0); + int len2 = access_process_vm(target, uaddr, kbuf, len, + FOLL_FORCE); if (len2 != len) return -EFAULT; } @@ -141,7 +142,8 @@ static int set_to_target(struct task_struct *target, unsigned long uaddr, if (copy_to_user((void __user *) uaddr, kbuf, len)) return -EFAULT; } else { - int len2 = access_process_vm(target, uaddr, kbuf, len, 1); + int len2 = access_process_vm(target, uaddr, kbuf, len, + FOLL_FORCE | FOLL_WRITE); if (len2 != len) return -EFAULT; } @@ -505,7 +507,8 @@ static int genregs32_get(struct task_struct *target, if (access_process_vm(target, (unsigned long) ®_window[pos], - k, sizeof(*k), 0) + k, sizeof(*k), + FOLL_FORCE) != sizeof(*k)) return -EFAULT; k++; @@ -531,12 +534,14 @@ static int genregs32_get(struct task_struct *target, if (access_process_vm(target, (unsigned long) ®_window[pos], - ®, sizeof(reg), 0) + ®, sizeof(reg), + FOLL_FORCE) != sizeof(reg)) return -EFAULT; if (access_process_vm(target, (unsigned long) u, - ®, sizeof(reg), 1) + ®, sizeof(reg), + FOLL_FORCE | FOLL_WRITE) != sizeof(reg)) return -EFAULT; pos++; @@ -615,7 +620,8 @@ static int genregs32_set(struct task_struct *target, (unsigned long) ®_window[pos], (void *) k, - sizeof(*k), 1) + sizeof(*k), + FOLL_FORCE | FOLL_WRITE) != sizeof(*k)) return -EFAULT; k++; @@ -642,13 +648,15 @@ static int genregs32_set(struct task_struct *target, if (access_process_vm(target, (unsigned long) u, - ®, sizeof(reg), 0) + ®, sizeof(reg), + FOLL_FORCE) != sizeof(reg)) return -EFAULT; if (access_process_vm(target, (unsigned long) ®_window[pos], - ®, sizeof(reg), 1) + ®, sizeof(reg), + FOLL_FORCE | FOLL_WRITE) != sizeof(reg)) return -EFAULT; pos++; diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index c9a073866ca7..a23ce84a3f6c 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -57,7 +57,8 @@ static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs) unsigned char opcode[15]; unsigned long addr = convert_ip_to_linear(child, regs); - copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0); + copied = access_process_vm(child, addr, opcode, sizeof(opcode), + FOLL_FORCE); for (i = 0; i < copied; i++) { switch (opcode[i]) { /* popf and iret */ diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c index 5766ead6fdb9..60a5a5a85505 100644 --- a/arch/x86/um/ptrace_32.c +++ b/arch/x86/um/ptrace_32.c @@ -36,7 +36,8 @@ int is_syscall(unsigned long addr) * slow, but that doesn't matter, since it will be called only * in case of singlestepping, if copy_from_user failed. */ - n = access_process_vm(current, addr, &instr, sizeof(instr), 0); + n = access_process_vm(current, addr, &instr, sizeof(instr), + FOLL_FORCE); if (n != sizeof(instr)) { printk(KERN_ERR "is_syscall : failed to read " "instruction from 0x%lx\n", addr); diff --git a/arch/x86/um/ptrace_64.c b/arch/x86/um/ptrace_64.c index 0b5c184dd5b3..e30202b1716e 100644 --- a/arch/x86/um/ptrace_64.c +++ b/arch/x86/um/ptrace_64.c @@ -212,7 +212,8 @@ int is_syscall(unsigned long addr) * slow, but that doesn't matter, since it will be called only * in case of singlestepping, if copy_from_user failed. */ - n = access_process_vm(current, addr, &instr, sizeof(instr), 0); + n = access_process_vm(current, addr, &instr, sizeof(instr), + FOLL_FORCE); if (n != sizeof(instr)) { printk("is_syscall : failed to read instruction from " "0x%lx\n", addr); diff --git a/include/linux/mm.h b/include/linux/mm.h index f31bf9058587..ffbd72979ee7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1266,7 +1266,8 @@ static inline int fixup_user_fault(struct task_struct *tsk, } #endif -extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); +extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, + unsigned int gup_flags); extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 2a99027312a6..e6474f7272ec 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -537,7 +537,7 @@ int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst int this_len, retval; this_len = (len > sizeof(buf)) ? sizeof(buf) : len; - retval = access_process_vm(tsk, src, buf, this_len, 0); + retval = access_process_vm(tsk, src, buf, this_len, FOLL_FORCE); if (!retval) { if (copied) break; @@ -564,7 +564,8 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds this_len = (len > sizeof(buf)) ? sizeof(buf) : len; if (copy_from_user(buf, src, this_len)) return -EFAULT; - retval = access_process_vm(tsk, dst, buf, this_len, 1); + retval = access_process_vm(tsk, dst, buf, this_len, + FOLL_FORCE | FOLL_WRITE); if (!retval) { if (copied) break; @@ -1127,7 +1128,7 @@ int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr, unsigned long tmp; int copied; - copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), 0); + copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), FOLL_FORCE); if (copied != sizeof(tmp)) return -EIO; return put_user(tmp, (unsigned long __user *)data); @@ -1138,7 +1139,8 @@ int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr, { int copied; - copied = access_process_vm(tsk, addr, &data, sizeof(data), 1); + copied = access_process_vm(tsk, addr, &data, sizeof(data), + FOLL_FORCE | FOLL_WRITE); return (copied == sizeof(data)) ? 0 : -EIO; } @@ -1155,7 +1157,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, switch (request) { case PTRACE_PEEKTEXT: case PTRACE_PEEKDATA: - ret = access_process_vm(child, addr, &word, sizeof(word), 0); + ret = access_process_vm(child, addr, &word, sizeof(word), + FOLL_FORCE); if (ret != sizeof(word)) ret = -EIO; else @@ -1164,7 +1167,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, case PTRACE_POKETEXT: case PTRACE_POKEDATA: - ret = access_process_vm(child, addr, &data, sizeof(data), 1); + ret = access_process_vm(child, addr, &data, sizeof(data), + FOLL_FORCE | FOLL_WRITE); ret = (ret != sizeof(data) ? -EIO : 0); break; diff --git a/mm/memory.c b/mm/memory.c index bac2d994850e..e18c57bdc75c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3951,20 +3951,16 @@ int access_remote_vm(struct mm_struct *mm, unsigned long addr, * Do not walk the page table directly, use get_user_pages */ int access_process_vm(struct task_struct *tsk, unsigned long addr, - void *buf, int len, int write) + void *buf, int len, unsigned int gup_flags) { struct mm_struct *mm; int ret; - unsigned int flags = FOLL_FORCE; mm = get_task_mm(tsk); if (!mm) return 0; - if (write) - flags |= FOLL_WRITE; - - ret = __access_remote_vm(tsk, mm, addr, buf, len, flags); + ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags); mmput(mm); diff --git a/mm/nommu.c b/mm/nommu.c index 93d5bb53fc63..db5fd1795298 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1861,7 +1861,8 @@ int access_remote_vm(struct mm_struct *mm, unsigned long addr, * Access another process' address space. * - source/target buffer must be kernel space */ -int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) +int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, + unsigned int gup_flags) { struct mm_struct *mm; @@ -1872,8 +1873,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in if (!mm) return 0; - len = __access_remote_vm(tsk, mm, addr, buf, len, - write ? FOLL_WRITE : 0); + len = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags); mmput(mm); return len; diff --git a/mm/util.c b/mm/util.c index 4c685bde5ebc..952cbe7ad7b7 100644 --- a/mm/util.c +++ b/mm/util.c @@ -624,7 +624,7 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen) if (len > buflen) len = buflen; - res = access_process_vm(task, arg_start, buffer, len, 0); + res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE); /* * If the nul at the end of args has been overwritten, then @@ -639,7 +639,8 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen) if (len > buflen - res) len = buflen - res; res += access_process_vm(task, env_start, - buffer+res, len, 0); + buffer+res, len, + FOLL_FORCE); res = strnlen(buffer, res); } } -- cgit From 8ef2074d28373014d05e92b5f13364ef51075b6e Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Wed, 19 Oct 2016 09:51:05 -0600 Subject: nvme: Add tertiary number to NVME_VS NVMe 1.2.1 specification adds a tertiary element to the version number. This updates the macro and its callers to include the final number and fixup a single place in nvmet where the version was generated manually. Signed-off-by: Gabriel Krisman Bertazi Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 8 ++++---- drivers/nvme/host/pci.c | 4 ++-- drivers/nvme/host/scsi.c | 4 ++-- drivers/nvme/target/core.c | 2 +- include/linux/nvme.h | 3 ++- 5 files changed, 11 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 2a57f5ede386..bb168b71048d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -900,9 +900,9 @@ static int nvme_revalidate_ns(struct nvme_ns *ns, struct nvme_id_ns **id) return -ENODEV; } - if (ns->ctrl->vs >= NVME_VS(1, 1)) + if (ns->ctrl->vs >= NVME_VS(1, 1, 0)) memcpy(ns->eui, (*id)->eui64, sizeof(ns->eui)); - if (ns->ctrl->vs >= NVME_VS(1, 2)) + if (ns->ctrl->vs >= NVME_VS(1, 2, 0)) memcpy(ns->uuid, (*id)->nguid, sizeof(ns->uuid)); return 0; @@ -1242,7 +1242,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) } page_shift = NVME_CAP_MPSMIN(cap) + 12; - if (ctrl->vs >= NVME_VS(1, 1)) + if (ctrl->vs >= NVME_VS(1, 1, 0)) ctrl->subsystem = NVME_CAP_NSSRC(cap); ret = nvme_identify_ctrl(ctrl, &id); @@ -1842,7 +1842,7 @@ static void nvme_scan_work(struct work_struct *work) return; nn = le32_to_cpu(id->nn); - if (ctrl->vs >= NVME_VS(1, 1) && + if (ctrl->vs >= NVME_VS(1, 1, 0) && !(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)) { if (!nvme_scan_ns_list(ctrl, nn)) goto done; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index a7c6e9d74943..26a8d31b291d 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1214,7 +1214,7 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) u64 cap = lo_hi_readq(dev->bar + NVME_REG_CAP); struct nvme_queue *nvmeq; - dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1) ? + dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1, 0) ? NVME_CAP_NSSRC(cap) : 0; if (dev->subsystem && @@ -1633,7 +1633,7 @@ static int nvme_pci_enable(struct nvme_dev *dev) * NULL as final argument to sysfs_add_file_to_group. */ - if (readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 2)) { + if (readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 2, 0)) { dev->cmb = nvme_map_cmb(dev); if (dev->cmbsz) { diff --git a/drivers/nvme/host/scsi.c b/drivers/nvme/host/scsi.c index c2a0a1c7d05d..3eaa4d27801e 100644 --- a/drivers/nvme/host/scsi.c +++ b/drivers/nvme/host/scsi.c @@ -606,7 +606,7 @@ static int nvme_fill_device_id_eui64(struct nvme_ns *ns, struct sg_io_hdr *hdr, eui = id_ns->eui64; len = sizeof(id_ns->eui64); - if (ns->ctrl->vs >= NVME_VS(1, 2)) { + if (ns->ctrl->vs >= NVME_VS(1, 2, 0)) { if (bitmap_empty(eui, len * 8)) { eui = id_ns->nguid; len = sizeof(id_ns->nguid); @@ -679,7 +679,7 @@ static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, { int res; - if (ns->ctrl->vs >= NVME_VS(1, 1)) { + if (ns->ctrl->vs >= NVME_VS(1, 1, 0)) { res = nvme_fill_device_id_eui64(ns, hdr, resp, alloc_len); if (res != -EOPNOTSUPP) return res; diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 6559d5afa7bf..b4cacb6f0258 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -882,7 +882,7 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, if (!subsys) return NULL; - subsys->ver = (1 << 16) | (2 << 8) | 1; /* NVMe 1.2.1 */ + subsys->ver = NVME_VS(1, 2, 1); /* NVMe 1.2.1 */ switch (type) { case NVME_NQN_NVME: diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 7676557ce357..086d196e68f7 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -960,6 +960,7 @@ struct nvme_completion { __le16 status; /* did the command fail, and if so, why? */ }; -#define NVME_VS(major, minor) (((major) << 16) | ((minor) << 8)) +#define NVME_VS(major, minor, tertiary) \ + (((major) << 16) | ((minor) << 8) | (tertiary)) #endif /* _LINUX_NVME_H */ -- cgit From a446c0840e244f34c22cc13b3a62d50aa51fb4c6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 30 Sep 2016 13:51:06 +0200 Subject: nvme.h: resync with nvme-cli Import a few updates to nvme.h from nvme-cli. This mostly includes a few new fields and error codes, but also a few renames that so far are only used in user space. Also one field is moved from an array of two le64 values to one of 16 u8 values so that we can more easily access it. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- drivers/nvme/target/admin-cmd.c | 2 +- drivers/nvme/target/discovery.c | 2 +- include/linux/nvme.h | 33 +++++++++++++++++++++++++++------ 3 files changed, 29 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 7ab9c9381b98..6944f246e562 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -199,7 +199,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) */ /* we support multiple ports and multiples hosts: */ - id->mic = (1 << 0) | (1 << 1); + id->cmic = (1 << 0) | (1 << 1); /* no limit on data transfer sizes for now */ id->mdts = 0; diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index 6f65646e89cf..a6c25fbcff3f 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -54,7 +54,7 @@ static void nvmet_format_discovery_entry(struct nvmf_disc_rsp_page_hdr *hdr, /* we support only dynamic controllers */ e->cntlid = cpu_to_le16(NVME_CNTLID_DYNAMIC); e->asqsz = cpu_to_le16(NVMF_AQ_DEPTH); - e->nqntype = type; + e->subtype = type; memcpy(e->trsvcid, port->disc_addr.trsvcid, NVMF_TRSVCID_SIZE); memcpy(e->traddr, port->disc_addr.traddr, NVMF_TRADDR_SIZE); memcpy(e->tsas.common, port->disc_addr.tsas.common, NVMF_TSAS_SIZE); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 086d196e68f7..989699641e10 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -182,7 +182,7 @@ struct nvme_id_ctrl { char fr[8]; __u8 rab; __u8 ieee[3]; - __u8 mic; + __u8 cmic; __u8 mdts; __le16 cntlid; __le32 ver; @@ -202,7 +202,13 @@ struct nvme_id_ctrl { __u8 apsta; __le16 wctemp; __le16 cctemp; - __u8 rsvd270[50]; + __le16 mtfa; + __le32 hmpre; + __le32 hmmin; + __u8 tnvmcap[16]; + __u8 unvmcap[16]; + __le32 rpmbs; + __u8 rsvd316[4]; __le16 kas; __u8 rsvd322[190]; __u8 sqes; @@ -267,7 +273,7 @@ struct nvme_id_ns { __le16 nabo; __le16 nabspf; __u16 rsvd46; - __le64 nvmcap[2]; + __u8 nvmcap[16]; __u8 rsvd64[40]; __u8 nguid[16]; __u8 eui64[8]; @@ -556,8 +562,10 @@ enum nvme_admin_opcode { nvme_admin_set_features = 0x09, nvme_admin_get_features = 0x0a, nvme_admin_async_event = 0x0c, + nvme_admin_ns_mgmt = 0x0d, nvme_admin_activate_fw = 0x10, nvme_admin_download_fw = 0x11, + nvme_admin_ns_attach = 0x15, nvme_admin_keep_alive = 0x18, nvme_admin_format_nvm = 0x80, nvme_admin_security_send = 0x81, @@ -583,6 +591,7 @@ enum { NVME_FEAT_WRITE_ATOMIC = 0x0a, NVME_FEAT_ASYNC_EVENT = 0x0b, NVME_FEAT_AUTO_PST = 0x0c, + NVME_FEAT_HOST_MEM_BUF = 0x0d, NVME_FEAT_KATO = 0x0f, NVME_FEAT_SW_PROGRESS = 0x80, NVME_FEAT_HOST_ID = 0x81, @@ -745,7 +754,7 @@ struct nvmf_common_command { struct nvmf_disc_rsp_page_entry { __u8 trtype; __u8 adrfam; - __u8 nqntype; + __u8 subtype; __u8 treq; __le16 portid; __le16 cntlid; @@ -905,12 +914,23 @@ enum { NVME_SC_INVALID_VECTOR = 0x108, NVME_SC_INVALID_LOG_PAGE = 0x109, NVME_SC_INVALID_FORMAT = 0x10a, - NVME_SC_FIRMWARE_NEEDS_RESET = 0x10b, + NVME_SC_FW_NEEDS_CONV_RESET = 0x10b, NVME_SC_INVALID_QUEUE = 0x10c, NVME_SC_FEATURE_NOT_SAVEABLE = 0x10d, NVME_SC_FEATURE_NOT_CHANGEABLE = 0x10e, NVME_SC_FEATURE_NOT_PER_NS = 0x10f, - NVME_SC_FW_NEEDS_RESET_SUBSYS = 0x110, + NVME_SC_FW_NEEDS_SUBSYS_RESET = 0x110, + NVME_SC_FW_NEEDS_RESET = 0x111, + NVME_SC_FW_NEEDS_MAX_TIME = 0x112, + NVME_SC_FW_ACIVATE_PROHIBITED = 0x113, + NVME_SC_OVERLAPPING_RANGE = 0x114, + NVME_SC_NS_INSUFFICENT_CAP = 0x115, + NVME_SC_NS_ID_UNAVAILABLE = 0x116, + NVME_SC_NS_ALREADY_ATTACHED = 0x118, + NVME_SC_NS_IS_PRIVATE = 0x119, + NVME_SC_NS_NOT_ATTACHED = 0x11a, + NVME_SC_THIN_PROV_NOT_SUPP = 0x11b, + NVME_SC_CTRL_LIST_INVALID = 0x11c, /* * I/O Command Set Specific - NVM commands: @@ -941,6 +961,7 @@ enum { NVME_SC_REFTAG_CHECK = 0x284, NVME_SC_COMPARE_FAILED = 0x285, NVME_SC_ACCESS_DENIED = 0x286, + NVME_SC_UNWRITTEN_BLOCK = 0x287, NVME_SC_DNR = 0x4000, }; -- cgit From 8d63687afda019a6e037bf9c4ceb3e514c26a35d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 30 Sep 2016 13:51:07 +0200 Subject: nvme.h: don't use uuid_be This makes life easier for nvme-cli and we don't really need the uuid type anyway to start with. Signed-off-by: Christoph Hellwig Reviewed-by: Gabriel Krisman Bertazi Reviewed-by: Jay Freyensee Signed-off-by: Jens Axboe --- include/linux/nvme.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 989699641e10..d31ff2dd1d51 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -16,7 +16,6 @@ #define _LINUX_NVME_H #include -#include /* NQN names in commands fields specified one size */ #define NVMF_NQN_FIELD_LEN 256 @@ -803,7 +802,7 @@ struct nvmf_connect_command { }; struct nvmf_connect_data { - uuid_be hostid; + __u8 hostid[16]; __le16 cntlid; char resv4[238]; char subsysnqn[NVMF_NQN_FIELD_LEN]; -- cgit From 329dd7681c5af84e8ea9f4494c1a304389cdfc6e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 30 Sep 2016 13:51:08 +0200 Subject: nvme.h: add an enum for cns values Ported over from nvme-cli. Signed-off-by: Christoph Hellwig Reviewed-by: Gabriel Krisman Bertazi Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- include/linux/nvme.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index d31ff2dd1d51..fc3c24206593 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -281,6 +281,16 @@ struct nvme_id_ns { __u8 vs[3712]; }; +enum { + NVME_ID_CNS_NS = 0x00, + NVME_ID_CNS_CTRL = 0x01, + NVME_ID_CNS_NS_ACTIVE_LIST = 0x02, + NVME_ID_CNS_NS_PRESENT_LIST = 0x10, + NVME_ID_CNS_NS_PRESENT = 0x11, + NVME_ID_CNS_CTRL_NS_LIST = 0x12, + NVME_ID_CNS_CTRL_LIST = 0x13, +}; + enum { NVME_NS_FEAT_THIN = 1 << 0, NVME_NS_FLBAS_LBA_MASK = 0xf, -- cgit From d33fd776f992332be110f6ceca6dad60cb5d513f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Oct 2016 15:51:28 +1100 Subject: iomap: add IOMAP_REPORT This allows the file system to tell a FIEMAP from a read operation, and thus avoids the need to report flags that aren't actually used in the read path. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/iomap.c | 2 +- include/linux/iomap.h | 17 +++++++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/fs/iomap.c b/fs/iomap.c index 013d1d36fbbf..a92204012e2d 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -561,7 +561,7 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, } while (len > 0) { - ret = iomap_apply(inode, start, len, 0, ops, &ctx, + ret = iomap_apply(inode, start, len, IOMAP_REPORT, ops, &ctx, iomap_fiemap_actor); /* inode with no (attribute) mapping will give ENOENT */ if (ret == -ENOENT) diff --git a/include/linux/iomap.h b/include/linux/iomap.h index e63e288dee83..7892f55a1866 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -19,11 +19,15 @@ struct vm_fault; #define IOMAP_UNWRITTEN 0x04 /* blocks allocated @blkno in unwritten state */ /* - * Flags for iomap mappings: + * Flags for all iomap mappings: */ -#define IOMAP_F_MERGED 0x01 /* contains multiple blocks/extents */ -#define IOMAP_F_SHARED 0x02 /* block shared with another file */ -#define IOMAP_F_NEW 0x04 /* blocks have been newly allocated */ +#define IOMAP_F_NEW 0x01 /* blocks have been newly allocated */ + +/* + * Flags that only need to be reported for IOMAP_REPORT requests: + */ +#define IOMAP_F_MERGED 0x10 /* contains multiple blocks/extents */ +#define IOMAP_F_SHARED 0x20 /* block shared with another file */ /* * Magic value for blkno: @@ -42,8 +46,9 @@ struct iomap { /* * Flags for iomap_begin / iomap_end. No flag implies a read. */ -#define IOMAP_WRITE (1 << 0) -#define IOMAP_ZERO (1 << 1) +#define IOMAP_WRITE (1 << 0) /* writing, must allocate blocks */ +#define IOMAP_ZERO (1 << 1) /* zeroing operation, may skip holes */ +#define IOMAP_REPORT (1 << 2) /* report extent status, e.g. FIEMAP */ struct iomap_ops { /* -- cgit From d17af5056cf9e9fc05e68832f7c15687fcc12281 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 30 Sep 2016 10:58:58 -0700 Subject: mm: Change vm_is_stack_for_task() to vm_is_stack_for_current() Asking for a non-current task's stack can't be done without races unless the task is frozen in kernel mode. As far as I know, vm_is_stack_for_task() never had a safe non-current use case. The __unused annotation is because some KSTK_ESP implementations ignore their parameter, which IMO is further justification for this patch. Signed-off-by: Andy Lutomirski Acked-by: Thomas Gleixner Cc: Al Viro Cc: Andrew Morton Cc: Borislav Petkov Cc: Brian Gerst Cc: Jann Horn Cc: Kees Cook Cc: Linus Torvalds Cc: Linux API Cc: Peter Zijlstra Cc: Tycho Andersen Link: http://lkml.kernel.org/r/4c3f68f426e6c061ca98b4fc7ef85ffbb0a25b0c.1475257877.git.luto@kernel.org Signed-off-by: Ingo Molnar --- include/linux/mm.h | 2 +- mm/util.c | 4 +++- security/selinux/hooks.c | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index e9caec6a51e9..a658a5167bce 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1391,7 +1391,7 @@ static inline int stack_guard_page_end(struct vm_area_struct *vma, !vma_growsup(vma->vm_next, addr); } -int vma_is_stack_for_task(struct vm_area_struct *vma, struct task_struct *t); +int vma_is_stack_for_current(struct vm_area_struct *vma); extern unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, diff --git a/mm/util.c b/mm/util.c index 662cddf914af..c174e8921995 100644 --- a/mm/util.c +++ b/mm/util.c @@ -230,8 +230,10 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, } /* Check if the vma is being used as a stack by this task */ -int vma_is_stack_for_task(struct vm_area_struct *vma, struct task_struct *t) +int vma_is_stack_for_current(struct vm_area_struct *vma) { + struct task_struct * __maybe_unused t = current; + return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t)); } diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 085057936287..09fd6108e421 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3557,7 +3557,7 @@ static int selinux_file_mprotect(struct vm_area_struct *vma, } else if (!vma->vm_file && ((vma->vm_start <= vma->vm_mm->start_stack && vma->vm_end >= vma->vm_mm->start_stack) || - vma_is_stack_for_task(vma, current))) { + vma_is_stack_for_current(vma))) { rc = current_has_perm(current, PROCESS__EXECSTACK); } else if (vma->vm_file && vma->anon_vma) { /* -- cgit From c8061485a0d7569a865a3cc3c63347b0f42b3765 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 19 Oct 2016 19:28:11 +0100 Subject: sched/core, x86: Make struct thread_info arch specific again The following commit: c65eacbe290b ("sched/core: Allow putting thread_info into task_struct") ... made 'struct thread_info' a generic struct with only a single ::flags member, if CONFIG_THREAD_INFO_IN_TASK_STRUCT=y is selected. This change however seems to be quite x86 centric, since at least the generic preemption code (asm-generic/preempt.h) assumes that struct thread_info also has a preempt_count member, which apparently was not true for x86. We could add a bit more #ifdefs to solve this problem too, but it seems to be much simpler to make struct thread_info arch specific again. This also makes the conversion to THREAD_INFO_IN_TASK_STRUCT a bit easier for architectures that have a couple of arch specific stuff in their thread_info definition. The arch specific stuff _could_ be moved to thread_struct. However keeping them in thread_info makes it easier: accessing thread_info members is simple, since it is at the beginning of the task_struct, while the thread_struct is at the end. At least on s390 the offsets needed to access members of the thread_struct (with task_struct as base) are too large for various asm instructions. This is not a problem when keeping these members within thread_info. Signed-off-by: Heiko Carstens Signed-off-by: Mark Rutland Acked-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: keescook@chromium.org Cc: linux-arch@vger.kernel.org Link: http://lkml.kernel.org/r/1476901693-8492-2-git-send-email-mark.rutland@arm.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/thread_info.h | 9 +++++++++ include/linux/thread_info.h | 11 ----------- 2 files changed, 9 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 2aaca53c0974..ad6f5eb07a95 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -52,6 +52,15 @@ struct task_struct; #include #include +struct thread_info { + unsigned long flags; /* low level flags */ +}; + +#define INIT_THREAD_INFO(tsk) \ +{ \ + .flags = 0, \ +} + #define init_stack (init_thread_union.stack) #else /* !__ASSEMBLY__ */ diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 45f004e9cc59..2873baf5372a 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -13,17 +13,6 @@ struct timespec; struct compat_timespec; -#ifdef CONFIG_THREAD_INFO_IN_TASK -struct thread_info { - unsigned long flags; /* low level flags */ -}; - -#define INIT_THREAD_INFO(tsk) \ -{ \ - .flags = 0, \ -} -#endif - #ifdef CONFIG_THREAD_INFO_IN_TASK #define current_thread_info() ((struct thread_info *)current) #endif -- cgit From c6fe46a79ecd79606bb96fada4515f6b23f87b62 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 18 Oct 2016 00:41:12 +0900 Subject: cpufreq: fix overflow in cpufreq_table_find_index_dl() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 'best' is always less or equals to 'pos', so `best - pos' returns a negative value which is then getting casted to `unsigned int' and passed to __cpufreq_driver_target()->acpi_cpufreq_target() for policy->freq_table selection. This results in BUG: unable to handle kernel paging request at ffff881019b469f8 IP: [] acpi_cpufreq_target+0x4f/0x190 [acpi_cpufreq] PGD 267f067 PUD 0 Oops: 0000 [#1] PREEMPT SMP CPU: 6 PID: 70 Comm: kworker/6:1 Not tainted 4.9.0-rc1-next-20161017-dbg-dirty Workqueue: events dbs_work_handler task: ffff88041b808000 task.stack: ffff88041b810000 RIP: 0010:[] [] acpi_cpufreq_target+0x4f/0x190 [acpi_cpufreq] RSP: 0018:ffff88041b813c60 EFLAGS: 00010282 RAX: ffff880419b46a00 RBX: ffff88041b848400 RCX: ffff880419b20f80 RDX: 00000000001dff38 RSI: 00000000ffffffff RDI: ffff88041b848400 RBP: ffff88041b813cb0 R08: 0000000000000006 R09: 0000000000000040 R10: ffffffff8207f9e0 R11: ffffffff8173595b R12: 0000000000000000 R13: ffff88041f1dff38 R14: 0000000000262900 R15: 0000000bfffffff4 FS: 0000000000000000(0000) GS:ffff88041f000000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffff881019b469f8 CR3: 000000041a2d3000 CR4: 00000000001406e0 Stack: ffff88041b813cb0 ffffffff813347f9 ffff88041b813ca0 ffffffff81334663 ffff88041f1d4bc0 ffff88041b848400 0000000000000000 0000000000000000 0000000000262900 0000000000000000 ffff88041b813d00 ffffffff813355dc Call Trace: [] ? cpufreq_freq_transition_begin+0xf1/0xfc [] ? get_cpu_idle_time+0x97/0xa6 [] __cpufreq_driver_target+0x3b6/0x44e [] cs_dbs_timer+0x11a/0x135 [] dbs_work_handler+0x39/0x62 [] process_one_work+0x280/0x4a5 [] worker_thread+0x24f/0x397 [] ? rescuer_thread+0x30b/0x30b [] ? nl80211_get_key+0x29/0x36a [] kthread+0xfc/0x104 [] ? put_lock_stats.isra.9+0xe/0x20 [] ? kthread_create_on_node+0x3f/0x3f [] ret_from_fork+0x22/0x30 Code: 56 4d 6b ff 0c 41 55 41 54 53 48 83 ec 28 48 8b 15 ad 1e 00 00 44 8b 41 08 48 8b 87 c8 00 00 00 49 89 d5 4e 03 2c c5 80 b2 78 81 <46> 8b 74 38 04 45 3b 75 00 75 11 31 c0 83 39 00 0f 84 1c 01 00 RIP [] acpi_cpufreq_target+0x4f/0x190 [acpi_cpufreq] RSP CR2: ffff881019b469f8 ---[ end trace 16d9fc7a17897d37 ]--- [ rjw: In some cases this bug may also cause incorrect frequencies to be selected by cpufreq governors. ] Fixes: 899bb6642f2a (cpufreq: skip invalid entries when searching the frequency) Link: http://marc.info/?l=linux-kernel&m=147672030714331&w=2 Reported-and-tested-by: Sedat Dilek Reported-and-tested-by: Jörg Otte Signed-off-by: Sergey Senozhatsky Acked-by: Viresh Kumar Cc: 4.8+ # 4.8+ Signed-off-by: Rafael J. Wysocki --- include/linux/cpufreq.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 5fa55fc56e18..32dc0cbd51ca 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -677,10 +677,10 @@ static inline int cpufreq_table_find_index_dl(struct cpufreq_policy *policy, if (best == table - 1) return pos - table; - return best - pos; + return best - table; } - return best - pos; + return best - table; } /* Works only on sorted freq-tables */ -- cgit From 9995f4f184613fb02ee73092b03545520a72b104 Mon Sep 17 00:00:00 2001 From: Rich Felker Date: Thu, 13 Oct 2016 21:51:06 +0000 Subject: clocksource: Add J-Core timer/clocksource driver At the hardware level, the J-Core PIT is integrated with the interrupt controller, but it is represented as its own device and has an independent programming interface. It provides a 12-bit countdown timer, which is not presently used, and a periodic timer. The interval length for the latter is programmable via a 32-bit throttle register whose units are determined by a bus-period register. The periodic timer is used to implement both periodic and oneshot clock event modes; in oneshot mode the interrupt handler simply disables the timer as soon as it fires. Despite its device tree node representing an interrupt for the PIT, the actual irq generated is programmable, not hard-wired. The driver is responsible for programming the PIT to generate the hardware irq number that the DT assigns to it. On SMP configurations, J-Core provides cpu-local instances of the PIT; no broadcast timer is needed. This driver supports the creation of the necessary per-cpu clock_event_device instances. A nanosecond-resolution clocksource is provided using the J-Core "RTC" registers, which give a 64-bit seconds count and 32-bit nanoseconds that wrap every second. The driver converts these to a full-range 32-bit nanoseconds count. Signed-off-by: Rich Felker Cc: Mark Rutland Cc: devicetree@vger.kernel.org Cc: linux-sh@vger.kernel.org Cc: Daniel Lezcano Cc: Rob Herring Link: http://lkml.kernel.org/r/b591ff12cc5ebf63d1edc98da26046f95a233814.1476393790.git.dalias@libc.org Signed-off-by: Thomas Gleixner --- drivers/clocksource/Kconfig | 10 ++ drivers/clocksource/Makefile | 1 + drivers/clocksource/jcore-pit.c | 249 ++++++++++++++++++++++++++++++++++++++++ include/linux/cpuhotplug.h | 1 + 4 files changed, 261 insertions(+) create mode 100644 drivers/clocksource/jcore-pit.c (limited to 'include/linux') diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig index 245190839359..e2c6e43cf8ca 100644 --- a/drivers/clocksource/Kconfig +++ b/drivers/clocksource/Kconfig @@ -417,6 +417,16 @@ config SYS_SUPPORTS_SH_TMU config SYS_SUPPORTS_EM_STI bool +config CLKSRC_JCORE_PIT + bool "J-Core PIT timer driver" if COMPILE_TEST + depends on OF + depends on GENERIC_CLOCKEVENTS + depends on HAS_IOMEM + select CLKSRC_MMIO + help + This enables build of clocksource and clockevent driver for + the integrated PIT in the J-Core synthesizable, open source SoC. + config SH_TIMER_CMT bool "Renesas CMT timer driver" if COMPILE_TEST depends on GENERIC_CLOCKEVENTS diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile index fd9d6df0bbc0..cf87f407f1ad 100644 --- a/drivers/clocksource/Makefile +++ b/drivers/clocksource/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_ATMEL_TCB_CLKSRC) += tcb_clksrc.o obj-$(CONFIG_X86_PM_TIMER) += acpi_pm.o obj-$(CONFIG_SCx200HR_TIMER) += scx200_hrt.o obj-$(CONFIG_CS5535_CLOCK_EVENT_SRC) += cs5535-clockevt.o +obj-$(CONFIG_CLKSRC_JCORE_PIT) += jcore-pit.o obj-$(CONFIG_SH_TIMER_CMT) += sh_cmt.o obj-$(CONFIG_SH_TIMER_MTU2) += sh_mtu2.o obj-$(CONFIG_SH_TIMER_TMU) += sh_tmu.o diff --git a/drivers/clocksource/jcore-pit.c b/drivers/clocksource/jcore-pit.c new file mode 100644 index 000000000000..54e1665aa03c --- /dev/null +++ b/drivers/clocksource/jcore-pit.c @@ -0,0 +1,249 @@ +/* + * J-Core SoC PIT/clocksource driver + * + * Copyright (C) 2015-2016 Smart Energy Instruments, Inc. + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PIT_IRQ_SHIFT 12 +#define PIT_PRIO_SHIFT 20 +#define PIT_ENABLE_SHIFT 26 +#define PIT_PRIO_MASK 0xf + +#define REG_PITEN 0x00 +#define REG_THROT 0x10 +#define REG_COUNT 0x14 +#define REG_BUSPD 0x18 +#define REG_SECHI 0x20 +#define REG_SECLO 0x24 +#define REG_NSEC 0x28 + +struct jcore_pit { + struct clock_event_device ced; + void __iomem *base; + unsigned long periodic_delta; + u32 enable_val; +}; + +static void __iomem *jcore_pit_base; +static struct jcore_pit __percpu *jcore_pit_percpu; + +static notrace u64 jcore_sched_clock_read(void) +{ + u32 seclo, nsec, seclo0; + __iomem void *base = jcore_pit_base; + + seclo = readl(base + REG_SECLO); + do { + seclo0 = seclo; + nsec = readl(base + REG_NSEC); + seclo = readl(base + REG_SECLO); + } while (seclo0 != seclo); + + return seclo * NSEC_PER_SEC + nsec; +} + +static cycle_t jcore_clocksource_read(struct clocksource *cs) +{ + return jcore_sched_clock_read(); +} + +static int jcore_pit_disable(struct jcore_pit *pit) +{ + writel(0, pit->base + REG_PITEN); + return 0; +} + +static int jcore_pit_set(unsigned long delta, struct jcore_pit *pit) +{ + jcore_pit_disable(pit); + writel(delta, pit->base + REG_THROT); + writel(pit->enable_val, pit->base + REG_PITEN); + return 0; +} + +static int jcore_pit_set_state_shutdown(struct clock_event_device *ced) +{ + struct jcore_pit *pit = container_of(ced, struct jcore_pit, ced); + + return jcore_pit_disable(pit); +} + +static int jcore_pit_set_state_oneshot(struct clock_event_device *ced) +{ + struct jcore_pit *pit = container_of(ced, struct jcore_pit, ced); + + return jcore_pit_disable(pit); +} + +static int jcore_pit_set_state_periodic(struct clock_event_device *ced) +{ + struct jcore_pit *pit = container_of(ced, struct jcore_pit, ced); + + return jcore_pit_set(pit->periodic_delta, pit); +} + +static int jcore_pit_set_next_event(unsigned long delta, + struct clock_event_device *ced) +{ + struct jcore_pit *pit = container_of(ced, struct jcore_pit, ced); + + return jcore_pit_set(delta, pit); +} + +static int jcore_pit_local_init(unsigned cpu) +{ + struct jcore_pit *pit = this_cpu_ptr(jcore_pit_percpu); + unsigned buspd, freq; + + pr_info("Local J-Core PIT init on cpu %u\n", cpu); + + buspd = readl(pit->base + REG_BUSPD); + freq = DIV_ROUND_CLOSEST(NSEC_PER_SEC, buspd); + pit->periodic_delta = DIV_ROUND_CLOSEST(NSEC_PER_SEC, HZ * buspd); + + clockevents_config_and_register(&pit->ced, freq, 1, ULONG_MAX); + + return 0; +} + +static irqreturn_t jcore_timer_interrupt(int irq, void *dev_id) +{ + struct jcore_pit *pit = this_cpu_ptr(dev_id); + + if (clockevent_state_oneshot(&pit->ced)) + jcore_pit_disable(pit); + + pit->ced.event_handler(&pit->ced); + + return IRQ_HANDLED; +} + +static int __init jcore_pit_init(struct device_node *node) +{ + int err; + unsigned pit_irq, cpu; + unsigned long hwirq; + u32 irqprio, enable_val; + + jcore_pit_base = of_iomap(node, 0); + if (!jcore_pit_base) { + pr_err("Error: Cannot map base address for J-Core PIT\n"); + return -ENXIO; + } + + pit_irq = irq_of_parse_and_map(node, 0); + if (!pit_irq) { + pr_err("Error: J-Core PIT has no IRQ\n"); + return -ENXIO; + } + + pr_info("Initializing J-Core PIT at %p IRQ %d\n", + jcore_pit_base, pit_irq); + + err = clocksource_mmio_init(jcore_pit_base, "jcore_pit_cs", + NSEC_PER_SEC, 400, 32, + jcore_clocksource_read); + if (err) { + pr_err("Error registering clocksource device: %d\n", err); + return err; + } + + sched_clock_register(jcore_sched_clock_read, 32, NSEC_PER_SEC); + + jcore_pit_percpu = alloc_percpu(struct jcore_pit); + if (!jcore_pit_percpu) { + pr_err("Failed to allocate memory for clock event device\n"); + return -ENOMEM; + } + + err = request_irq(pit_irq, jcore_timer_interrupt, + IRQF_TIMER | IRQF_PERCPU, + "jcore_pit", jcore_pit_percpu); + if (err) { + pr_err("pit irq request failed: %d\n", err); + free_percpu(jcore_pit_percpu); + return err; + } + + /* + * The J-Core PIT is not hard-wired to a particular IRQ, but + * integrated with the interrupt controller such that the IRQ it + * generates is programmable, as follows: + * + * The bit layout of the PIT enable register is: + * + * .....e..ppppiiiiiiii............ + * + * where the .'s indicate unrelated/unused bits, e is enable, + * p is priority, and i is hard irq number. + * + * For the PIT included in AIC1 (obsolete but still in use), + * any hard irq (trap number) can be programmed via the 8 + * iiiiiiii bits, and a priority (0-15) is programmable + * separately in the pppp bits. + * + * For the PIT included in AIC2 (current), the programming + * interface is equivalent modulo interrupt mapping. This is + * why a different compatible tag was not used. However only + * traps 64-127 (the ones actually intended to be used for + * interrupts, rather than syscalls/exceptions/etc.) can be + * programmed (the high 2 bits of i are ignored) and the + * priority pppp is <<2'd and or'd onto the irq number. This + * choice seems to have been made on the hardware engineering + * side under an assumption that preserving old AIC1 priority + * mappings was important. Future models will likely ignore + * the pppp field. + */ + hwirq = irq_get_irq_data(pit_irq)->hwirq; + irqprio = (hwirq >> 2) & PIT_PRIO_MASK; + enable_val = (1U << PIT_ENABLE_SHIFT) + | (hwirq << PIT_IRQ_SHIFT) + | (irqprio << PIT_PRIO_SHIFT); + + for_each_present_cpu(cpu) { + struct jcore_pit *pit = per_cpu_ptr(jcore_pit_percpu, cpu); + + pit->base = of_iomap(node, cpu); + if (!pit->base) { + pr_err("Unable to map PIT for cpu %u\n", cpu); + continue; + } + + pit->ced.name = "jcore_pit"; + pit->ced.features = CLOCK_EVT_FEAT_PERIODIC + | CLOCK_EVT_FEAT_ONESHOT + | CLOCK_EVT_FEAT_PERCPU; + pit->ced.cpumask = cpumask_of(cpu); + pit->ced.rating = 400; + pit->ced.irq = pit_irq; + pit->ced.set_state_shutdown = jcore_pit_set_state_shutdown; + pit->ced.set_state_periodic = jcore_pit_set_state_periodic; + pit->ced.set_state_oneshot = jcore_pit_set_state_oneshot; + pit->ced.set_next_event = jcore_pit_set_next_event; + + pit->enable_val = enable_val; + } + + cpuhp_setup_state(CPUHP_AP_JCORE_TIMER_STARTING, + "AP_JCORE_TIMER_STARTING", + jcore_pit_local_init, NULL); + + return 0; +} + +CLOCKSOURCE_OF_DECLARE(jcore_pit, "jcore,pit", jcore_pit_init); diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 9b207a8c5af3..afe641c02dca 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -81,6 +81,7 @@ enum cpuhp_state { CPUHP_AP_ARM_ARCH_TIMER_STARTING, CPUHP_AP_ARM_GLOBAL_TIMER_STARTING, CPUHP_AP_DUMMY_TIMER_STARTING, + CPUHP_AP_JCORE_TIMER_STARTING, CPUHP_AP_EXYNOS4_MCT_TIMER_STARTING, CPUHP_AP_ARM_TWD_STARTING, CPUHP_AP_METAG_TIMER_STARTING, -- cgit From fcd91dd449867c6bfe56a81cabba76b829fd05cd Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 20 Oct 2016 15:58:02 +0200 Subject: net: add recursion limit to GRO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, GRO can do unlimited recursion through the gro_receive handlers. This was fixed for tunneling protocols by limiting tunnel GRO to one level with encap_mark, but both VLAN and TEB still have this problem. Thus, the kernel is vulnerable to a stack overflow, if we receive a packet composed entirely of VLAN headers. This patch adds a recursion counter to the GRO layer to prevent stack overflow. When a gro_receive function hits the recursion limit, GRO is aborted for this skb and it is processed normally. This recursion counter is put in the GRO CB, but could be turned into a percpu counter if we run out of space in the CB. Thanks to Vladimír Beneš for the initial bug report. Fixes: CVE-2016-7039 Fixes: 9b174d88c257 ("net: Add Transparent Ethernet Bridging GRO support.") Fixes: 66e5133f19e9 ("vlan: Add GRO support for non hardware accelerated vlan") Signed-off-by: Sabrina Dubroca Reviewed-by: Jiri Benc Acked-by: Hannes Frederic Sowa Acked-by: Tom Herbert Signed-off-by: David S. Miller --- drivers/net/geneve.c | 2 +- drivers/net/vxlan.c | 2 +- include/linux/netdevice.h | 39 ++++++++++++++++++++++++++++++++++++++- net/8021q/vlan.c | 2 +- net/core/dev.c | 1 + net/ethernet/eth.c | 2 +- net/ipv4/af_inet.c | 2 +- net/ipv4/fou.c | 4 ++-- net/ipv4/gre_offload.c | 2 +- net/ipv4/udp_offload.c | 2 +- net/ipv6/ip6_offload.c | 2 +- 11 files changed, 49 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 3c20e87bb761..16af1ce99233 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -453,7 +453,7 @@ static struct sk_buff **geneve_gro_receive(struct sock *sk, skb_gro_pull(skb, gh_len); skb_gro_postpull_rcsum(skb, gh, gh_len); - pp = ptype->callbacks.gro_receive(head, skb); + pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb); flush = 0; out_unlock: diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index e7d16687538b..c1639a3e95a4 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -583,7 +583,7 @@ static struct sk_buff **vxlan_gro_receive(struct sock *sk, } } - pp = eth_gro_receive(head, skb); + pp = call_gro_receive(eth_gro_receive, head, skb); flush = 0; out: diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 465e128699a1..91ee3643ccc8 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2169,7 +2169,10 @@ struct napi_gro_cb { /* Used to determine if flush_id can be ignored */ u8 is_atomic:1; - /* 5 bit hole */ + /* Number of gro_receive callbacks this packet already went through */ + u8 recursion_counter:4; + + /* 1 bit hole */ /* used to support CHECKSUM_COMPLETE for tunneling protocols */ __wsum csum; @@ -2180,6 +2183,40 @@ struct napi_gro_cb { #define NAPI_GRO_CB(skb) ((struct napi_gro_cb *)(skb)->cb) +#define GRO_RECURSION_LIMIT 15 +static inline int gro_recursion_inc_test(struct sk_buff *skb) +{ + return ++NAPI_GRO_CB(skb)->recursion_counter == GRO_RECURSION_LIMIT; +} + +typedef struct sk_buff **(*gro_receive_t)(struct sk_buff **, struct sk_buff *); +static inline struct sk_buff **call_gro_receive(gro_receive_t cb, + struct sk_buff **head, + struct sk_buff *skb) +{ + if (unlikely(gro_recursion_inc_test(skb))) { + NAPI_GRO_CB(skb)->flush |= 1; + return NULL; + } + + return cb(head, skb); +} + +typedef struct sk_buff **(*gro_receive_sk_t)(struct sock *, struct sk_buff **, + struct sk_buff *); +static inline struct sk_buff **call_gro_receive_sk(gro_receive_sk_t cb, + struct sock *sk, + struct sk_buff **head, + struct sk_buff *skb) +{ + if (unlikely(gro_recursion_inc_test(skb))) { + NAPI_GRO_CB(skb)->flush |= 1; + return NULL; + } + + return cb(sk, head, skb); +} + struct packet_type { __be16 type; /* This is really htons(ether_type). */ struct net_device *dev; /* NULL is wildcarded here */ diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 8de138d3306b..f2531ad66b68 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -664,7 +664,7 @@ static struct sk_buff **vlan_gro_receive(struct sk_buff **head, skb_gro_pull(skb, sizeof(*vhdr)); skb_gro_postpull_rcsum(skb, vhdr, sizeof(*vhdr)); - pp = ptype->callbacks.gro_receive(head, skb); + pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb); out_unlock: rcu_read_unlock(); diff --git a/net/core/dev.c b/net/core/dev.c index b09ac57f4348..dbc871306910 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4511,6 +4511,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff NAPI_GRO_CB(skb)->flush = 0; NAPI_GRO_CB(skb)->free = 0; NAPI_GRO_CB(skb)->encap_mark = 0; + NAPI_GRO_CB(skb)->recursion_counter = 0; NAPI_GRO_CB(skb)->is_fou = 0; NAPI_GRO_CB(skb)->is_atomic = 1; NAPI_GRO_CB(skb)->gro_remcsum_start = 0; diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 66dff5e3d772..02acfff36028 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -439,7 +439,7 @@ struct sk_buff **eth_gro_receive(struct sk_buff **head, skb_gro_pull(skb, sizeof(*eh)); skb_gro_postpull_rcsum(skb, eh, sizeof(*eh)); - pp = ptype->callbacks.gro_receive(head, skb); + pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb); out_unlock: rcu_read_unlock(); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 1effc986739e..9648c97e541f 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1391,7 +1391,7 @@ struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb) skb_gro_pull(skb, sizeof(*iph)); skb_set_transport_header(skb, skb_gro_offset(skb)); - pp = ops->callbacks.gro_receive(head, skb); + pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); out_unlock: rcu_read_unlock(); diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index cf50f7e2b012..030d1531e897 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -249,7 +249,7 @@ static struct sk_buff **fou_gro_receive(struct sock *sk, if (!ops || !ops->callbacks.gro_receive) goto out_unlock; - pp = ops->callbacks.gro_receive(head, skb); + pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); out_unlock: rcu_read_unlock(); @@ -441,7 +441,7 @@ next_proto: if (WARN_ON_ONCE(!ops || !ops->callbacks.gro_receive)) goto out_unlock; - pp = ops->callbacks.gro_receive(head, skb); + pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); flush = 0; out_unlock: diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 96e0efecefa6..d5cac99170b1 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -229,7 +229,7 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head, /* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/ skb_gro_postpull_rcsum(skb, greh, grehlen); - pp = ptype->callbacks.gro_receive(head, skb); + pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb); flush = 0; out_unlock: diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index f9333c963607..b2be1d9757ef 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -295,7 +295,7 @@ unflush: skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */ skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); - pp = udp_sk(sk)->gro_receive(sk, head, skb); + pp = call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb); out_unlock: rcu_read_unlock(); diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index e7bfd55899a3..1fcf61f1cbc3 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -246,7 +246,7 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, skb_gro_postpull_rcsum(skb, iph, nlen); - pp = ops->callbacks.gro_receive(head, skb); + pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); out_unlock: rcu_read_unlock(); -- cgit From 0e191827383a6503a3bc547e63c74ff093f450f5 Mon Sep 17 00:00:00 2001 From: Sudarsana Reddy Kalluru Date: Fri, 21 Oct 2016 04:43:42 -0400 Subject: qed*: Reduce the memory footprint for Rx path With the current default values for Rx path i.e., 8 queues of 8Kb entries each with 4Kb size, interface will consume 256Mb for Rx. The default values causing the driver probe to fail when the system memory is low. Based on the perforamnce results, rx-ring count value of 1Kb gives the comparable performance with Rx coalesce timeout of 12 seconds. Updating the default values. Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_main.c | 1 + drivers/net/ethernet/qlogic/qede/qede.h | 2 +- include/linux/qed/qed_if.h | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index 8dc3f4670f64..c418360ba02a 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -878,6 +878,7 @@ static int qed_slowpath_start(struct qed_dev *cdev, } } + cdev->rx_coalesce_usecs = QED_DEFAULT_RX_USECS; rc = qed_nic_setup(cdev); if (rc) goto err; diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h index d6a4a9a5eb4d..974689a13337 100644 --- a/drivers/net/ethernet/qlogic/qede/qede.h +++ b/drivers/net/ethernet/qlogic/qede/qede.h @@ -354,7 +354,7 @@ void qede_update_rx_prod(struct qede_dev *edev, struct qede_rx_queue *rxq); #define RX_RING_SIZE ((u16)BIT(RX_RING_SIZE_POW)) #define NUM_RX_BDS_MAX (RX_RING_SIZE - 1) #define NUM_RX_BDS_MIN 128 -#define NUM_RX_BDS_DEF NUM_RX_BDS_MAX +#define NUM_RX_BDS_DEF ((u16)BIT(10) - 1) #define TX_RING_SIZE_POW 13 #define TX_RING_SIZE ((u16)BIT(TX_RING_SIZE_POW)) diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index f9ae903bbb84..8978a60371f4 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -146,6 +146,7 @@ enum qed_led_mode { #define DIRECT_REG_RD(reg_addr) readl((void __iomem *)(reg_addr)) #define QED_COALESCE_MAX 0xFF +#define QED_DEFAULT_RX_USECS 12 /* forward */ struct qed_dev; -- cgit From f1caa61df2a3dc4c58316295c5dc5edba4c68d85 Mon Sep 17 00:00:00 2001 From: Sinan Kaya Date: Mon, 24 Oct 2016 00:31:31 -0400 Subject: ACPI/PCI: pci_link: penalize SCI correctly Ondrej reported that IRQs stopped working in v4.7 on several platforms. A typical scenario, from Ondrej's VT82C694X/694X, is: ACPI: Using PIC for interrupt routing ACPI: PCI Interrupt Link [LNKA] (IRQs 1 3 4 5 6 7 10 *11 12 14 15) ACPI: No IRQ available for PCI Interrupt Link [LNKA] 8139too 0000:00:0f.0: PCI INT A: no GSI We're using PIC routing, so acpi_irq_balance == 0, and LNKA is already active at IRQ 11. In that case, acpi_pci_link_allocate() only tries to use the active IRQ (IRQ 11) which also happens to be the SCI. We should penalize the SCI by PIRQ_PENALTY_PCI_USING, but irq_get_trigger_type(11) returns something other than IRQ_TYPE_LEVEL_LOW, so we penalize it by PIRQ_PENALTY_ISA_ALWAYS instead, which makes acpi_pci_link_allocate() assume the IRQ isn't available and give up. Add acpi_penalize_sci_irq() so platforms can tell us the SCI IRQ, trigger, and polarity directly and we don't have to depend on irq_get_trigger_type(). Fixes: 103544d86976 (ACPI,PCI,IRQ: reduce resource requirements) Link: http://lkml.kernel.org/r/201609251512.05657.linux@rainbow-software.org Reported-by: Ondrej Zary Acked-by: Bjorn Helgaas Signed-off-by: Sinan Kaya Tested-by: Jonathan Liu Signed-off-by: Rafael J. Wysocki --- arch/x86/kernel/acpi/boot.c | 1 + drivers/acpi/pci_link.c | 30 +++++++++++++++--------------- include/linux/acpi.h | 1 + 3 files changed, 17 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 8a5abaa7d453..931ced8ca345 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -454,6 +454,7 @@ static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger, polarity = acpi_sci_flags & ACPI_MADT_POLARITY_MASK; mp_override_legacy_irq(bus_irq, polarity, trigger, gsi); + acpi_penalize_sci_irq(bus_irq, trigger, polarity); /* * stash over-ride to indicate we've been here diff --git a/drivers/acpi/pci_link.c b/drivers/acpi/pci_link.c index 6229b022a5d4..74bf96efae95 100644 --- a/drivers/acpi/pci_link.c +++ b/drivers/acpi/pci_link.c @@ -87,6 +87,7 @@ struct acpi_pci_link { static LIST_HEAD(acpi_link_list); static DEFINE_MUTEX(acpi_link_lock); +static int sci_irq = -1, sci_penalty; /* -------------------------------------------------------------------------- PCI Link Device Management @@ -496,25 +497,13 @@ static int acpi_irq_get_penalty(int irq) { int penalty = 0; - /* - * Penalize IRQ used by ACPI SCI. If ACPI SCI pin attributes conflict - * with PCI IRQ attributes, mark ACPI SCI as ISA_ALWAYS so it won't be - * use for PCI IRQs. - */ - if (irq == acpi_gbl_FADT.sci_interrupt) { - u32 type = irq_get_trigger_type(irq) & IRQ_TYPE_SENSE_MASK; - - if (type != IRQ_TYPE_LEVEL_LOW) - penalty += PIRQ_PENALTY_ISA_ALWAYS; - else - penalty += PIRQ_PENALTY_PCI_USING; - } + if (irq == sci_irq) + penalty += sci_penalty; if (irq < ACPI_MAX_ISA_IRQS) return penalty + acpi_isa_irq_penalty[irq]; - penalty += acpi_irq_pci_sharing_penalty(irq); - return penalty; + return penalty + acpi_irq_pci_sharing_penalty(irq); } int __init acpi_irq_penalty_init(void) @@ -881,6 +870,17 @@ bool acpi_isa_irq_available(int irq) acpi_irq_get_penalty(irq) < PIRQ_PENALTY_ISA_ALWAYS); } +void acpi_penalize_sci_irq(int irq, int trigger, int polarity) +{ + sci_irq = irq; + + if (trigger == ACPI_MADT_TRIGGER_LEVEL && + polarity == ACPI_MADT_POLARITY_ACTIVE_LOW) + sci_penalty = PIRQ_PENALTY_PCI_USING; + else + sci_penalty = PIRQ_PENALTY_ISA_ALWAYS; +} + /* * Over-ride default table to reserve additional IRQs for use by ISA * e.g. acpi_irq_isa=5 diff --git a/include/linux/acpi.h b/include/linux/acpi.h index ddbeda6dbdc8..689a8b9b9c8f 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -326,6 +326,7 @@ struct pci_dev; int acpi_pci_irq_enable (struct pci_dev *dev); void acpi_penalize_isa_irq(int irq, int active); bool acpi_isa_irq_available(int irq); +void acpi_penalize_sci_irq(int irq, int trigger, int polarity); void acpi_pci_irq_disable (struct pci_dev *dev); extern int ec_read(u8 addr, u8 *val); -- cgit From 0d7317598214134d73da59990b846481a9527a00 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 24 Oct 2016 10:57:25 +0100 Subject: mm: unexport __get_user_pages() This patch unexports the low-level __get_user_pages() function. Recent refactoring of the get_user_pages* functions allow flags to be passed through get_user_pages() which eliminates the need for access to this function from its one user, kvm. We can see that the two calls to get_user_pages() which replace __get_user_pages() in kvm_main.c are equivalent by examining their call stacks: get_user_page_nowait(): get_user_pages(start, 1, flags, page, NULL) __get_user_pages_locked(current, current->mm, start, 1, page, NULL, NULL, false, flags | FOLL_TOUCH) __get_user_pages(current, current->mm, start, 1, flags | FOLL_TOUCH | FOLL_GET, page, NULL, NULL) check_user_page_hwpoison(): get_user_pages(addr, 1, flags, NULL, NULL) __get_user_pages_locked(current, current->mm, addr, 1, NULL, NULL, NULL, false, flags | FOLL_TOUCH) __get_user_pages(current, current->mm, addr, 1, flags | FOLL_TOUCH, NULL, NULL, NULL) Signed-off-by: Lorenzo Stoakes Acked-by: Paolo Bonzini Acked-by: Michal Hocko Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ---- mm/gup.c | 3 +-- mm/nommu.c | 2 +- virt/kvm/kvm_main.c | 10 ++++------ 4 files changed, 6 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3a191853faaa..a92c8d73aeaf 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1271,10 +1271,6 @@ extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void * extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags); -long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - unsigned int foll_flags, struct page **pages, - struct vm_area_struct **vmas, int *nonblocking); long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, diff --git a/mm/gup.c b/mm/gup.c index 7aa113c2d373..ec4f82704b6f 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -526,7 +526,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * instead of __get_user_pages. __get_user_pages should be used only if * you need some special @gup_flags. */ -long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, +static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *nonblocking) @@ -631,7 +631,6 @@ next_page: } while (nr_pages); return i; } -EXPORT_SYMBOL(__get_user_pages); bool vma_permits_fault(struct vm_area_struct *vma, unsigned int fault_flags) { diff --git a/mm/nommu.c b/mm/nommu.c index db5fd1795298..8b8faaf2a9e9 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -109,7 +109,7 @@ unsigned int kobjsize(const void *objp) return PAGE_SIZE << compound_order(page); } -long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, +static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int foll_flags, struct page **pages, struct vm_area_struct **vmas, int *nonblocking) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 28510e72618a..2907b7b78654 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1346,21 +1346,19 @@ unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *w static int get_user_page_nowait(unsigned long start, int write, struct page **page) { - int flags = FOLL_TOUCH | FOLL_NOWAIT | FOLL_HWPOISON | FOLL_GET; + int flags = FOLL_NOWAIT | FOLL_HWPOISON; if (write) flags |= FOLL_WRITE; - return __get_user_pages(current, current->mm, start, 1, flags, page, - NULL, NULL); + return get_user_pages(start, 1, flags, page, NULL); } static inline int check_user_page_hwpoison(unsigned long addr) { - int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE; + int rc, flags = FOLL_HWPOISON | FOLL_WRITE; - rc = __get_user_pages(current, current->mm, addr, 1, - flags, NULL, NULL, NULL); + rc = get_user_pages(addr, 1, flags, NULL, NULL); return rc == -EHWPOISON; } -- cgit From 8ef4227615e158faa4ee85a1d6466782f7e22f2f Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Mon, 24 Oct 2016 15:27:59 +1000 Subject: x86/io: add interface to reserve io memtype for a resource range. (v1.1) A recent change to the mm code in: 87744ab3832b mm: fix cache mode tracking in vm_insert_mixed() started enforcing checking the memory type against the registered list for amixed pfn insertion mappings. It happens that the drm drivers for a number of gpus relied on this being broken. Currently the driver only inserted VRAM mappings into the tracking table when they came from the kernel, and userspace mappings never landed in the table. This led to a regression where all the mapping end up as UC instead of WC now. I've considered a number of solutions but since this needs to be fixed in fixes and not next, and some of the solutions were going to introduce overhead that hadn't been there before I didn't consider them viable at this stage. These mainly concerned hooking into the TTM io reserve APIs, but these API have a bunch of fast paths I didn't want to unwind to add this to. The solution I've decided on is to add a new API like the arch_phys_wc APIs (these would have worked but wc_del didn't take a range), and use them from the drivers to add a WC compatible mapping to the table for all VRAM on those GPUs. This means we can then create userspace mapping that won't get degraded to UC. v1.1: use CONFIG_X86_PAT + add some comments in io.h Cc: Toshi Kani Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Brian Gerst Cc: x86@kernel.org Cc: mcgrof@suse.com Cc: Dan Williams Acked-by: Ingo Molnar Reviewed-by: Thomas Gleixner Signed-off-by: Dave Airlie --- arch/x86/include/asm/io.h | 6 ++++++ arch/x86/mm/pat.c | 14 ++++++++++++++ include/linux/io.h | 22 ++++++++++++++++++++++ 3 files changed, 42 insertions(+) (limited to 'include/linux') diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index de25aad07853..d34bd370074b 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -351,4 +351,10 @@ extern void arch_phys_wc_del(int handle); #define arch_phys_wc_add arch_phys_wc_add #endif +#ifdef CONFIG_X86_PAT +extern int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size); +extern void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size); +#define arch_io_reserve_memtype_wc arch_io_reserve_memtype_wc +#endif + #endif /* _ASM_X86_IO_H */ diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 170cc4ff057b..83e701f160a9 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -730,6 +730,20 @@ void io_free_memtype(resource_size_t start, resource_size_t end) free_memtype(start, end); } +int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size) +{ + enum page_cache_mode type = _PAGE_CACHE_MODE_WC; + + return io_reserve_memtype(start, start + size, &type); +} +EXPORT_SYMBOL(arch_io_reserve_memtype_wc); + +void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size) +{ + io_free_memtype(start, start + size); +} +EXPORT_SYMBOL(arch_io_free_memtype_wc); + pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) { diff --git a/include/linux/io.h b/include/linux/io.h index e2c8419278c1..82ef36eac8a1 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -141,4 +141,26 @@ enum { void *memremap(resource_size_t offset, size_t size, unsigned long flags); void memunmap(void *addr); +/* + * On x86 PAT systems we have memory tracking that keeps track of + * the allowed mappings on memory ranges. This tracking works for + * all the in-kernel mapping APIs (ioremap*), but where the user + * wishes to map a range from a physical device into user memory + * the tracking won't be updated. This API is to be used by + * drivers which remap physical device pages into userspace, + * and wants to make sure they are mapped WC and not UC. + */ +#ifndef arch_io_reserve_memtype_wc +static inline int arch_io_reserve_memtype_wc(resource_size_t base, + resource_size_t size) +{ + return 0; +} + +static inline void arch_io_free_memtype_wc(resource_size_t base, + resource_size_t size) +{ +} +#endif + #endif /* _LINUX_IO_H */ -- cgit From 293de7dee4f9602676846dbeb84b1580123306b4 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sun, 23 Oct 2016 09:28:29 -0700 Subject: doc: update docbook annotations for socket and skb The skbuff and sock structure both had missing parameter annotation values. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + include/net/sock.h | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 601258f6e621..32810f279f8e 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -936,6 +936,7 @@ struct sk_buff_fclones { /** * skb_fclone_busy - check if fclone is busy + * @sk: socket * @skb: buffer * * Returns true if skb is a fast clone, and its clone is not freed. diff --git a/include/net/sock.h b/include/net/sock.h index ebf75db08e06..73c6b008f1b7 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -252,6 +252,7 @@ struct sock_common { * @sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler) * @sk_max_pacing_rate: Maximum pacing rate (%SO_MAX_PACING_RATE) * @sk_sndbuf: size of send buffer in bytes + * @sk_padding: unused element for alignment * @sk_no_check_tx: %SO_NO_CHECK setting, set checksum in TX packets * @sk_no_check_rx: allow zero checksum in RX packets * @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO) @@ -302,7 +303,8 @@ struct sock_common { * @sk_backlog_rcv: callback to process the backlog * @sk_destruct: called at sock freeing time, i.e. when all refcnt == 0 * @sk_reuseport_cb: reuseport group container - */ + * @sk_rcu: used during RCU grace period + */ struct sock { /* * Now struct inet_timewait_sock also uses sock_common, so please just -- cgit From 9dcb8b685fc30813b35ab4b4bf39244430753190 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 26 Oct 2016 10:15:30 -0700 Subject: mm: remove per-zone hashtable of bitlock waitqueues The per-zone waitqueues exist because of a scalability issue with the page waitqueues on some NUMA machines, but it turns out that they hurt normal loads, and now with the vmalloced stacks they also end up breaking gfs2 that uses a bit_wait on a stack object: wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE) where 'gh' can be a reference to the local variable 'mount_gh' on the stack of fill_super(). The reason the per-zone hash table breaks for this case is that there is no "zone" for virtual allocations, and trying to look up the physical page to get at it will fail (with a BUG_ON()). It turns out that I actually complained to the mm people about the per-zone hash table for another reason just a month ago: the zone lookup also hurts the regular use of "unlock_page()" a lot, because the zone lookup ends up forcing several unnecessary cache misses and generates horrible code. As part of that earlier discussion, we had a much better solution for the NUMA scalability issue - by just making the page lock have a separate contention bit, the waitqueue doesn't even have to be looked at for the normal case. Peter Zijlstra already has a patch for that, but let's see if anybody even notices. In the meantime, let's fix the actual gfs2 breakage by simplifying the bitlock waitqueues and removing the per-zone issue. Reported-by: Andreas Gruenbacher Tested-by: Bob Peterson Acked-by: Mel Gorman Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Steven Whitehouse Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 30 +------------ kernel/sched/core.c | 16 +++++++ kernel/sched/wait.c | 10 ----- mm/filemap.c | 4 +- mm/memory_hotplug.c | 28 ------------ mm/page_alloc.c | 115 +------------------------------------------------ 6 files changed, 21 insertions(+), 182 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7f2ae99e5daf..0f088f3a2fed 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -440,33 +440,7 @@ struct zone { seqlock_t span_seqlock; #endif - /* - * wait_table -- the array holding the hash table - * wait_table_hash_nr_entries -- the size of the hash table array - * wait_table_bits -- wait_table_size == (1 << wait_table_bits) - * - * The purpose of all these is to keep track of the people - * waiting for a page to become available and make them - * runnable again when possible. The trouble is that this - * consumes a lot of space, especially when so few things - * wait on pages at a given time. So instead of using - * per-page waitqueues, we use a waitqueue hash table. - * - * The bucket discipline is to sleep on the same queue when - * colliding and wake all in that wait queue when removing. - * When something wakes, it must check to be sure its page is - * truly available, a la thundering herd. The cost of a - * collision is great, but given the expected load of the - * table, they should be so rare as to be outweighed by the - * benefits from the saved space. - * - * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the - * primary users of these fields, and in mm/page_alloc.c - * free_area_init_core() performs the initialization of them. - */ - wait_queue_head_t *wait_table; - unsigned long wait_table_hash_nr_entries; - unsigned long wait_table_bits; + int initialized; /* Write-intensive fields used from the page allocator */ ZONE_PADDING(_pad1_) @@ -546,7 +520,7 @@ static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn) static inline bool zone_is_initialized(struct zone *zone) { - return !!zone->wait_table; + return zone->initialized; } static inline bool zone_is_empty(struct zone *zone) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 94732d1ab00a..42d4027f9e26 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7515,11 +7515,27 @@ static struct kmem_cache *task_group_cache __read_mostly; DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); +#define WAIT_TABLE_BITS 8 +#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) +static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned; + +wait_queue_head_t *bit_waitqueue(void *word, int bit) +{ + const int shift = BITS_PER_LONG == 32 ? 5 : 6; + unsigned long val = (unsigned long)word << shift | bit; + + return bit_wait_table + hash_long(val, WAIT_TABLE_BITS); +} +EXPORT_SYMBOL(bit_waitqueue); + void __init sched_init(void) { int i, j; unsigned long alloc_size = 0, ptr; + for (i = 0; i < WAIT_TABLE_SIZE; i++) + init_waitqueue_head(bit_wait_table + i); + #ifdef CONFIG_FAIR_GROUP_SCHED alloc_size += 2 * nr_cpu_ids * sizeof(void **); #endif diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 4f7053579fe3..9453efe9b25a 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -480,16 +480,6 @@ void wake_up_bit(void *word, int bit) } EXPORT_SYMBOL(wake_up_bit); -wait_queue_head_t *bit_waitqueue(void *word, int bit) -{ - const int shift = BITS_PER_LONG == 32 ? 5 : 6; - const struct zone *zone = page_zone(virt_to_page(word)); - unsigned long val = (unsigned long)word << shift | bit; - - return &zone->wait_table[hash_long(val, zone->wait_table_bits)]; -} -EXPORT_SYMBOL(bit_waitqueue); - /* * Manipulate the atomic_t address to produce a better bit waitqueue table hash * index (we're keying off bit -1, but that would produce a horrible hash diff --git a/mm/filemap.c b/mm/filemap.c index 849f459ad078..c7fe2f16503f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -790,9 +790,7 @@ EXPORT_SYMBOL(__page_cache_alloc); */ wait_queue_head_t *page_waitqueue(struct page *page) { - const struct zone *zone = page_zone(page); - - return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)]; + return bit_waitqueue(page, 0); } EXPORT_SYMBOL(page_waitqueue); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 962927309b6e..b18dab401be6 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -268,7 +268,6 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat) unsigned long i, pfn, end_pfn, nr_pages; int node = pgdat->node_id; struct page *page; - struct zone *zone; nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT; page = virt_to_page(pgdat); @@ -276,19 +275,6 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat) for (i = 0; i < nr_pages; i++, page++) get_page_bootmem(node, page, NODE_INFO); - zone = &pgdat->node_zones[0]; - for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) { - if (zone_is_initialized(zone)) { - nr_pages = zone->wait_table_hash_nr_entries - * sizeof(wait_queue_head_t); - nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT; - page = virt_to_page(zone->wait_table); - - for (i = 0; i < nr_pages; i++, page++) - get_page_bootmem(node, page, NODE_INFO); - } - } - pfn = pgdat->node_start_pfn; end_pfn = pgdat_end_pfn(pgdat); @@ -2158,20 +2144,6 @@ void try_offline_node(int nid) */ node_set_offline(nid); unregister_one_node(nid); - - /* free waittable in each zone */ - for (i = 0; i < MAX_NR_ZONES; i++) { - struct zone *zone = pgdat->node_zones + i; - - /* - * wait_table may be allocated from boot memory, - * here only free if it's allocated by vmalloc. - */ - if (is_vmalloc_addr(zone->wait_table)) { - vfree(zone->wait_table); - zone->wait_table = NULL; - } - } } EXPORT_SYMBOL(try_offline_node); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2b3bf6767d54..de7c6e43b1c9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4976,72 +4976,6 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) #endif } -/* - * Helper functions to size the waitqueue hash table. - * Essentially these want to choose hash table sizes sufficiently - * large so that collisions trying to wait on pages are rare. - * But in fact, the number of active page waitqueues on typical - * systems is ridiculously low, less than 200. So this is even - * conservative, even though it seems large. - * - * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to - * waitqueues, i.e. the size of the waitq table given the number of pages. - */ -#define PAGES_PER_WAITQUEUE 256 - -#ifndef CONFIG_MEMORY_HOTPLUG -static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) -{ - unsigned long size = 1; - - pages /= PAGES_PER_WAITQUEUE; - - while (size < pages) - size <<= 1; - - /* - * Once we have dozens or even hundreds of threads sleeping - * on IO we've got bigger problems than wait queue collision. - * Limit the size of the wait table to a reasonable size. - */ - size = min(size, 4096UL); - - return max(size, 4UL); -} -#else -/* - * A zone's size might be changed by hot-add, so it is not possible to determine - * a suitable size for its wait_table. So we use the maximum size now. - * - * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: - * - * i386 (preemption config) : 4096 x 16 = 64Kbyte. - * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. - * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. - * - * The maximum entries are prepared when a zone's memory is (512K + 256) pages - * or more by the traditional way. (See above). It equals: - * - * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. - * ia64(16K page size) : = ( 8G + 4M)byte. - * powerpc (64K page size) : = (32G +16M)byte. - */ -static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) -{ - return 4096UL; -} -#endif - -/* - * This is an integer logarithm so that shifts can be used later - * to extract the more random high bits from the multiplicative - * hash function before the remainder is taken. - */ -static inline unsigned long wait_table_bits(unsigned long size) -{ - return ffz(~size); -} - /* * Initially all pages are reserved - free ones are freed * up by free_all_bootmem() once the early boot process is @@ -5304,49 +5238,6 @@ void __init setup_per_cpu_pageset(void) alloc_percpu(struct per_cpu_nodestat); } -static noinline __ref -int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) -{ - int i; - size_t alloc_size; - - /* - * The per-page waitqueue mechanism uses hashed waitqueues - * per zone. - */ - zone->wait_table_hash_nr_entries = - wait_table_hash_nr_entries(zone_size_pages); - zone->wait_table_bits = - wait_table_bits(zone->wait_table_hash_nr_entries); - alloc_size = zone->wait_table_hash_nr_entries - * sizeof(wait_queue_head_t); - - if (!slab_is_available()) { - zone->wait_table = (wait_queue_head_t *) - memblock_virt_alloc_node_nopanic( - alloc_size, zone->zone_pgdat->node_id); - } else { - /* - * This case means that a zone whose size was 0 gets new memory - * via memory hot-add. - * But it may be the case that a new node was hot-added. In - * this case vmalloc() will not be able to use this new node's - * memory - this wait_table must be initialized to use this new - * node itself as well. - * To use this new node's memory, further consideration will be - * necessary. - */ - zone->wait_table = vmalloc(alloc_size); - } - if (!zone->wait_table) - return -ENOMEM; - - for (i = 0; i < zone->wait_table_hash_nr_entries; ++i) - init_waitqueue_head(zone->wait_table + i); - - return 0; -} - static __meminit void zone_pcp_init(struct zone *zone) { /* @@ -5367,10 +5258,7 @@ int __meminit init_currently_empty_zone(struct zone *zone, unsigned long size) { struct pglist_data *pgdat = zone->zone_pgdat; - int ret; - ret = zone_wait_table_init(zone, size); - if (ret) - return ret; + pgdat->nr_zones = zone_idx(zone) + 1; zone->zone_start_pfn = zone_start_pfn; @@ -5382,6 +5270,7 @@ int __meminit init_currently_empty_zone(struct zone *zone, zone_start_pfn, (zone_start_pfn + size)); zone_init_free_lists(zone); + zone->initialized = 1; return 0; } -- cgit From c0a0aba8e478229b2f0956918542152fbad3f794 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 27 Oct 2016 17:46:38 -0700 Subject: kconfig.h: remove config_enabled() macro The use of config_enabled() is ambiguous. For config options, IS_ENABLED(), IS_REACHABLE(), etc. will make intention clearer. Sometimes config_enabled() has been used for non-config options because it is useful to check whether the given symbol is defined or not. I have been tackling on deprecating config_enabled(), and now is the time to finish this work. Some new users have appeared for v4.9-rc1, but it is trivial to replace them: - arch/x86/mm/kaslr.c replace config_enabled() with IS_ENABLED() because CONFIG_X86_ESPFIX64 and CONFIG_EFI are boolean. - include/asm-generic/export.h replace config_enabled() with __is_defined(). Then, config_enabled() can be removed now. Going forward, please use IS_ENABLED(), IS_REACHABLE(), etc. for config options, and __is_defined() for non-config symbols. Link: http://lkml.kernel.org/r/1476616078-32252-1-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Acked-by: Ingo Molnar Acked-by: Nicolas Pitre Cc: Peter Oberparleiter Cc: Arnd Bergmann Cc: Kees Cook Cc: Michal Marek Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: Thomas Garnier Cc: Paul Bolle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/kaslr.c | 6 +++--- include/asm-generic/export.h | 2 +- include/linux/kconfig.h | 5 ++--- 3 files changed, 6 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index ddd2661c4502..887e57182716 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -104,10 +104,10 @@ void __init kernel_randomize_memory(void) * consistent with the vaddr_start/vaddr_end variables. */ BUILD_BUG_ON(vaddr_start >= vaddr_end); - BUILD_BUG_ON(config_enabled(CONFIG_X86_ESPFIX64) && + BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_ESPFIX64) && vaddr_end >= EFI_VA_START); - BUILD_BUG_ON((config_enabled(CONFIG_X86_ESPFIX64) || - config_enabled(CONFIG_EFI)) && + BUILD_BUG_ON((IS_ENABLED(CONFIG_X86_ESPFIX64) || + IS_ENABLED(CONFIG_EFI)) && vaddr_end >= __START_KERNEL_map); BUILD_BUG_ON(vaddr_end > __START_KERNEL_map); diff --git a/include/asm-generic/export.h b/include/asm-generic/export.h index 43199a049da5..63554e9f6e0c 100644 --- a/include/asm-generic/export.h +++ b/include/asm-generic/export.h @@ -70,7 +70,7 @@ KSYM(__kcrctab_\name): #include #define __EXPORT_SYMBOL(sym, val, sec) \ - __cond_export_sym(sym, val, sec, config_enabled(__KSYM_##sym)) + __cond_export_sym(sym, val, sec, __is_defined(__KSYM_##sym)) #define __cond_export_sym(sym, val, sec, conf) \ ___cond_export_sym(sym, val, sec, conf) #define ___cond_export_sym(sym, val, sec, enabled) \ diff --git a/include/linux/kconfig.h b/include/linux/kconfig.h index 15ec117ec537..8f2e059e4d45 100644 --- a/include/linux/kconfig.h +++ b/include/linux/kconfig.h @@ -31,7 +31,6 @@ * When CONFIG_BOOGER is not defined, we generate a (... 1, 0) pair, and when * the last step cherry picks the 2nd arg, we get a zero. */ -#define config_enabled(cfg) ___is_defined(cfg) #define __is_defined(x) ___is_defined(x) #define ___is_defined(val) ____is_defined(__ARG_PLACEHOLDER_##val) #define ____is_defined(arg1_or_junk) __take_second_arg(arg1_or_junk 1, 0) @@ -41,13 +40,13 @@ * otherwise. For boolean options, this is equivalent to * IS_ENABLED(CONFIG_FOO). */ -#define IS_BUILTIN(option) config_enabled(option) +#define IS_BUILTIN(option) __is_defined(option) /* * IS_MODULE(CONFIG_FOO) evaluates to 1 if CONFIG_FOO is set to 'm', 0 * otherwise. */ -#define IS_MODULE(option) config_enabled(option##_MODULE) +#define IS_MODULE(option) __is_defined(option##_MODULE) /* * IS_REACHABLE(CONFIG_FOO) evaluates to 1 if the currently compiled -- cgit From 73f907fd5fa56b0066d199bdd7126bbd04f6cd7b Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Mon, 24 Oct 2016 16:46:20 +0200 Subject: mtd: nand: Fix data interface configuration logic When changing from one data interface setting to another, one has to ensure a specific sequence which is described in the ONFI spec. One of these constraints is that the CE line has go high after a reset before a command can be sent with the new data interface setting, which is not guaranteed by the current implementation. Rework the nand_reset() function and all the call sites to make sure the CE line is asserted and released when required. Also make sure to actually apply the new data interface setting on the first die. Signed-off-by: Boris Brezillon Fixes: d8e725dd8311 ("mtd: nand: automate NAND timings selection") Reviewed-by: Sascha Hauer Tested-by: Marc Gonzalez --- drivers/mtd/nand/nand_base.c | 60 +++++++++++++++++++++++++++++++------------- include/linux/mtd/nand.h | 2 +- 2 files changed, 43 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index e5718e5ecf92..3bde96a3f7bf 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -1095,10 +1095,11 @@ static void nand_release_data_interface(struct nand_chip *chip) /** * nand_reset - Reset and initialize a NAND device * @chip: The NAND chip + * @chipnr: Internal die id * * Returns 0 for success or negative error code otherwise */ -int nand_reset(struct nand_chip *chip) +int nand_reset(struct nand_chip *chip, int chipnr) { struct mtd_info *mtd = nand_to_mtd(chip); int ret; @@ -1107,9 +1108,17 @@ int nand_reset(struct nand_chip *chip) if (ret) return ret; + /* + * The CS line has to be released before we can apply the new NAND + * interface settings, hence this weird ->select_chip() dance. + */ + chip->select_chip(mtd, chipnr); chip->cmdfunc(mtd, NAND_CMD_RESET, -1, -1); + chip->select_chip(mtd, -1); + chip->select_chip(mtd, chipnr); ret = nand_setup_data_interface(chip); + chip->select_chip(mtd, -1); if (ret) return ret; @@ -1185,8 +1194,6 @@ int nand_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len) /* Shift to get chip number */ chipnr = ofs >> chip->chip_shift; - chip->select_chip(mtd, chipnr); - /* * Reset the chip. * If we want to check the WP through READ STATUS and check the bit 7 @@ -1194,7 +1201,9 @@ int nand_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len) * some operation can also clear the bit 7 of status register * eg. erase/program a locked block */ - nand_reset(chip); + nand_reset(chip, chipnr); + + chip->select_chip(mtd, chipnr); /* Check, if it is write protected */ if (nand_check_wp(mtd)) { @@ -1244,8 +1253,6 @@ int nand_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len) /* Shift to get chip number */ chipnr = ofs >> chip->chip_shift; - chip->select_chip(mtd, chipnr); - /* * Reset the chip. * If we want to check the WP through READ STATUS and check the bit 7 @@ -1253,7 +1260,9 @@ int nand_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len) * some operation can also clear the bit 7 of status register * eg. erase/program a locked block */ - nand_reset(chip); + nand_reset(chip, chipnr); + + chip->select_chip(mtd, chipnr); /* Check, if it is write protected */ if (nand_check_wp(mtd)) { @@ -2940,10 +2949,6 @@ static int nand_do_write_oob(struct mtd_info *mtd, loff_t to, } chipnr = (int)(to >> chip->chip_shift); - chip->select_chip(mtd, chipnr); - - /* Shift to get page */ - page = (int)(to >> chip->page_shift); /* * Reset the chip. Some chips (like the Toshiba TC5832DC found in one @@ -2951,7 +2956,12 @@ static int nand_do_write_oob(struct mtd_info *mtd, loff_t to, * if we don't do this. I have no clue why, but I seem to have 'fixed' * it in the doc2000 driver in August 1999. dwmw2. */ - nand_reset(chip); + nand_reset(chip, chipnr); + + chip->select_chip(mtd, chipnr); + + /* Shift to get page */ + page = (int)(to >> chip->page_shift); /* Check, if it is write protected */ if (nand_check_wp(mtd)) { @@ -3984,14 +3994,14 @@ static struct nand_flash_dev *nand_get_flash_type(struct mtd_info *mtd, int i, maf_idx; u8 id_data[8]; - /* Select the device */ - chip->select_chip(mtd, 0); - /* * Reset the chip, required by some chips (e.g. Micron MT29FxGxxxxx) * after power-up. */ - nand_reset(chip); + nand_reset(chip, 0); + + /* Select the device */ + chip->select_chip(mtd, 0); /* Send the command for reading device ID */ chip->cmdfunc(mtd, NAND_CMD_READID, 0x00, -1); @@ -4329,17 +4339,31 @@ int nand_scan_ident(struct mtd_info *mtd, int maxchips, return PTR_ERR(type); } + /* Initialize the ->data_interface field. */ ret = nand_init_data_interface(chip); if (ret) return ret; + /* + * Setup the data interface correctly on the chip and controller side. + * This explicit call to nand_setup_data_interface() is only required + * for the first die, because nand_reset() has been called before + * ->data_interface and ->default_onfi_timing_mode were set. + * For the other dies, nand_reset() will automatically switch to the + * best mode for us. + */ + ret = nand_setup_data_interface(chip); + if (ret) + return ret; + chip->select_chip(mtd, -1); /* Check for a chip array */ for (i = 1; i < maxchips; i++) { - chip->select_chip(mtd, i); /* See comment in nand_get_flash_type for reset */ - nand_reset(chip); + nand_reset(chip, i); + + chip->select_chip(mtd, i); /* Send the command for reading device ID */ chip->cmdfunc(mtd, NAND_CMD_READID, 0x00, -1); /* Read manufacturer and device IDs */ diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index c5d3d5024fc8..d8905a229f34 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -1184,7 +1184,7 @@ int nand_read_oob_syndrome(struct mtd_info *mtd, struct nand_chip *chip, int page); /* Reset and initialize a NAND device */ -int nand_reset(struct nand_chip *chip); +int nand_reset(struct nand_chip *chip, int chipnr); /* Free resources held by the NAND device */ void nand_cleanup(struct nand_chip *chip); -- cgit From 5aab90ce1ec449912a2ebc4d45e0c85dac29e9dd Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 26 Oct 2016 11:48:24 +0200 Subject: perf/powerpc: Don't call perf_event_disable() from atomic context The trinity syscall fuzzer triggered following WARN() on powerpc: WARNING: CPU: 9 PID: 2998 at arch/powerpc/kernel/hw_breakpoint.c:278 ... NIP [c00000000093aedc] .hw_breakpoint_handler+0x28c/0x2b0 LR [c00000000093aed8] .hw_breakpoint_handler+0x288/0x2b0 Call Trace: [c0000002f7933580] [c00000000093aed8] .hw_breakpoint_handler+0x288/0x2b0 (unreliable) [c0000002f7933630] [c0000000000f671c] .notifier_call_chain+0x7c/0xf0 [c0000002f79336d0] [c0000000000f6abc] .__atomic_notifier_call_chain+0xbc/0x1c0 [c0000002f7933780] [c0000000000f6c40] .notify_die+0x70/0xd0 [c0000002f7933820] [c00000000001a74c] .do_break+0x4c/0x100 [c0000002f7933920] [c0000000000089fc] handle_dabr_fault+0x14/0x48 Followed by a lockdep warning: =============================== [ INFO: suspicious RCU usage. ] 4.8.0-rc5+ #7 Tainted: G W ------------------------------- ./include/linux/rcupdate.h:556 Illegal context switch in RCU read-side critical section! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 0 2 locks held by ls/2998: #0: (rcu_read_lock){......}, at: [] .__atomic_notifier_call_chain+0x0/0x1c0 #1: (rcu_read_lock){......}, at: [] .hw_breakpoint_handler+0x0/0x2b0 stack backtrace: CPU: 9 PID: 2998 Comm: ls Tainted: G W 4.8.0-rc5+ #7 Call Trace: [c0000002f7933150] [c00000000094b1f8] .dump_stack+0xe0/0x14c (unreliable) [c0000002f79331e0] [c00000000013c468] .lockdep_rcu_suspicious+0x138/0x180 [c0000002f7933270] [c0000000001005d8] .___might_sleep+0x278/0x2e0 [c0000002f7933300] [c000000000935584] .mutex_lock_nested+0x64/0x5a0 [c0000002f7933410] [c00000000023084c] .perf_event_ctx_lock_nested+0x16c/0x380 [c0000002f7933500] [c000000000230a80] .perf_event_disable+0x20/0x60 [c0000002f7933580] [c00000000093aeec] .hw_breakpoint_handler+0x29c/0x2b0 [c0000002f7933630] [c0000000000f671c] .notifier_call_chain+0x7c/0xf0 [c0000002f79336d0] [c0000000000f6abc] .__atomic_notifier_call_chain+0xbc/0x1c0 [c0000002f7933780] [c0000000000f6c40] .notify_die+0x70/0xd0 [c0000002f7933820] [c00000000001a74c] .do_break+0x4c/0x100 [c0000002f7933920] [c0000000000089fc] handle_dabr_fault+0x14/0x48 While it looks like the first WARN() is probably valid, the other one is triggered by disabling event via perf_event_disable() from atomic context. The event is disabled here in case we were not able to emulate the instruction that hit the breakpoint. By disabling the event we unschedule the event and make sure it's not scheduled back. But we can't call perf_event_disable() from atomic context, instead we need to use the event's pending_disable irq_work method to disable it. Reported-by: Jan Stancek Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Huang Ying Cc: Jiri Olsa Cc: Linus Torvalds Cc: Michael Neuling Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20161026094824.GA21397@krava Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/hw_breakpoint.c | 2 +- include/linux/perf_event.h | 1 + kernel/events/core.c | 10 ++++++++-- 3 files changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index 9781c69eae57..03d089b3ed72 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -275,7 +275,7 @@ int hw_breakpoint_handler(struct die_args *args) if (!stepped) { WARN(1, "Unable to handle hardware breakpoint. Breakpoint at " "0x%lx will be disabled.", info->address); - perf_event_disable(bp); + perf_event_disable_inatomic(bp); goto out; } /* diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 060d0ede88df..4741ecdb9817 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1257,6 +1257,7 @@ extern u64 perf_swevent_set_period(struct perf_event *event); extern void perf_event_enable(struct perf_event *event); extern void perf_event_disable(struct perf_event *event); extern void perf_event_disable_local(struct perf_event *event); +extern void perf_event_disable_inatomic(struct perf_event *event); extern void perf_event_task_tick(void); #else /* !CONFIG_PERF_EVENTS: */ static inline void * diff --git a/kernel/events/core.c b/kernel/events/core.c index a5d2e62faf7e..0e292132efac 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1960,6 +1960,12 @@ void perf_event_disable(struct perf_event *event) } EXPORT_SYMBOL_GPL(perf_event_disable); +void perf_event_disable_inatomic(struct perf_event *event) +{ + event->pending_disable = 1; + irq_work_queue(&event->pending); +} + static void perf_set_shadow_time(struct perf_event *event, struct perf_event_context *ctx, u64 tstamp) @@ -7075,8 +7081,8 @@ static int __perf_event_overflow(struct perf_event *event, if (events && atomic_dec_and_test(&event->event_limit)) { ret = 1; event->pending_kill = POLL_HUP; - event->pending_disable = 1; - irq_work_queue(&event->pending); + + perf_event_disable_inatomic(event); } READ_ONCE(event->overflow_handler)(event, data, regs); -- cgit From 72193a953ada9058e46970838cc42cbd18bf4eba Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Fri, 28 Oct 2016 11:38:53 +0100 Subject: regmap: Rename ret variable in regmap_read_poll_timeout As almost all of the callers of the regmap_read_poll_timeout macro will include a local ret variable we will always get a Sparse warning about the duplication of the ret variable: warning: symbol 'ret' shadows an earlier one Simply rename the ret variable in the marco to pollret to make this significantly less likely to happen. Signed-off-by: Charles Keepax Signed-off-by: Mark Brown --- include/linux/regmap.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index 9adc7b21903d..fc9e3c576f9c 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -116,22 +116,22 @@ struct reg_sequence { #define regmap_read_poll_timeout(map, addr, val, cond, sleep_us, timeout_us) \ ({ \ ktime_t timeout = ktime_add_us(ktime_get(), timeout_us); \ - int ret; \ + int pollret; \ might_sleep_if(sleep_us); \ for (;;) { \ - ret = regmap_read((map), (addr), &(val)); \ - if (ret) \ + pollret = regmap_read((map), (addr), &(val)); \ + if (pollret) \ break; \ if (cond) \ break; \ if (timeout_us && ktime_compare(ktime_get(), timeout) > 0) { \ - ret = regmap_read((map), (addr), &(val)); \ + pollret = regmap_read((map), (addr), &(val)); \ break; \ } \ if (sleep_us) \ usleep_range((sleep_us >> 2) + 1, sleep_us); \ } \ - ret ?: ((cond) ? 0 : -ETIMEDOUT); \ + pollret ?: ((cond) ? 0 : -ETIMEDOUT); \ }) #ifdef CONFIG_REGMAP -- cgit From b47bd6ea40636362a8b6605de51207cc387ba0b8 Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Tue, 25 Oct 2016 18:36:24 +0300 Subject: {net, ib}/mlx5: Make cache line size determination at runtime. ARM 64B cache line systems have L1_CACHE_BYTES set to 128. cache_line_size() will return the correct size. Fixes: cf50b5efa2fe('net/mlx5_core/ib: New device capabilities handling.') Signed-off-by: Daniel Jurgens Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/infiniband/hw/mlx5/main.c | 2 +- drivers/infiniband/hw/mlx5/qp.c | 1 - drivers/net/ethernet/mellanox/mlx5/core/alloc.c | 31 +++++++++++++++++++++---- include/linux/mlx5/driver.h | 11 --------- 4 files changed, 27 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 22174774dbb8..63036c731626 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1019,7 +1019,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp); if (mlx5_core_is_pf(dev->mdev) && MLX5_CAP_GEN(dev->mdev, bf)) resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size); - resp.cache_line_size = L1_CACHE_BYTES; + resp.cache_line_size = cache_line_size(); resp.max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq); resp.max_rq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq); resp.max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz); diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 41f4c2afbcdd..7ce97daf26c6 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -52,7 +52,6 @@ enum { enum { MLX5_IB_SQ_STRIDE = 6, - MLX5_IB_CACHE_LINE_SIZE = 64, }; static const u32 mlx5_ib_opcode[] = { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c index 6cb38304669f..2c6e3c7b7417 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c @@ -41,6 +41,13 @@ #include "mlx5_core.h" +struct mlx5_db_pgdir { + struct list_head list; + unsigned long *bitmap; + __be32 *db_page; + dma_addr_t db_dma; +}; + /* Handling for queue buffers -- we allocate a bunch of memory and * register it in a memory region at HCA virtual address 0. */ @@ -102,17 +109,28 @@ EXPORT_SYMBOL_GPL(mlx5_buf_free); static struct mlx5_db_pgdir *mlx5_alloc_db_pgdir(struct mlx5_core_dev *dev, int node) { + u32 db_per_page = PAGE_SIZE / cache_line_size(); struct mlx5_db_pgdir *pgdir; pgdir = kzalloc(sizeof(*pgdir), GFP_KERNEL); if (!pgdir) return NULL; - bitmap_fill(pgdir->bitmap, MLX5_DB_PER_PAGE); + pgdir->bitmap = kcalloc(BITS_TO_LONGS(db_per_page), + sizeof(unsigned long), + GFP_KERNEL); + + if (!pgdir->bitmap) { + kfree(pgdir); + return NULL; + } + + bitmap_fill(pgdir->bitmap, db_per_page); pgdir->db_page = mlx5_dma_zalloc_coherent_node(dev, PAGE_SIZE, &pgdir->db_dma, node); if (!pgdir->db_page) { + kfree(pgdir->bitmap); kfree(pgdir); return NULL; } @@ -123,18 +141,19 @@ static struct mlx5_db_pgdir *mlx5_alloc_db_pgdir(struct mlx5_core_dev *dev, static int mlx5_alloc_db_from_pgdir(struct mlx5_db_pgdir *pgdir, struct mlx5_db *db) { + u32 db_per_page = PAGE_SIZE / cache_line_size(); int offset; int i; - i = find_first_bit(pgdir->bitmap, MLX5_DB_PER_PAGE); - if (i >= MLX5_DB_PER_PAGE) + i = find_first_bit(pgdir->bitmap, db_per_page); + if (i >= db_per_page) return -ENOMEM; __clear_bit(i, pgdir->bitmap); db->u.pgdir = pgdir; db->index = i; - offset = db->index * L1_CACHE_BYTES; + offset = db->index * cache_line_size(); db->db = pgdir->db_page + offset / sizeof(*pgdir->db_page); db->dma = pgdir->db_dma + offset; @@ -181,14 +200,16 @@ EXPORT_SYMBOL_GPL(mlx5_db_alloc); void mlx5_db_free(struct mlx5_core_dev *dev, struct mlx5_db *db) { + u32 db_per_page = PAGE_SIZE / cache_line_size(); mutex_lock(&dev->priv.pgdir_mutex); __set_bit(db->index, db->u.pgdir->bitmap); - if (bitmap_full(db->u.pgdir->bitmap, MLX5_DB_PER_PAGE)) { + if (bitmap_full(db->u.pgdir->bitmap, db_per_page)) { dma_free_coherent(&(dev->pdev->dev), PAGE_SIZE, db->u.pgdir->db_page, db->u.pgdir->db_dma); list_del(&db->u.pgdir->list); + kfree(db->u.pgdir->bitmap); kfree(db->u.pgdir); } diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 85c4786427e4..5dbda60a09f4 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -625,10 +625,6 @@ struct mlx5_db { int index; }; -enum { - MLX5_DB_PER_PAGE = PAGE_SIZE / L1_CACHE_BYTES, -}; - enum { MLX5_COMP_EQ_SIZE = 1024, }; @@ -638,13 +634,6 @@ enum { MLX5_PTYS_EN = 1 << 2, }; -struct mlx5_db_pgdir { - struct list_head list; - DECLARE_BITMAP(bitmap, MLX5_DB_PER_PAGE); - __be32 *db_page; - dma_addr_t db_dma; -}; - typedef void (*mlx5_cmd_cbk_t)(int status, void *context); struct mlx5_cmd_work_ent { -- cgit From 05ac2c0b7438ea08c5d54b48797acf9b22cb2f6f Mon Sep 17 00:00:00 2001 From: Mohamad Haj Yahia Date: Tue, 25 Oct 2016 18:36:33 +0300 Subject: net/mlx5: Fix race between PCI error handlers and health work Currently there is a race between the health care work and the kernel pci error handlers because both of them detect the error, the first one to be called will do the error handling. There is a chance that health care will disable the pci after resuming pci slot. Also create a separate WQ because now we will have two types of health works, one for the error detection and one for the recovery. Fixes: 89d44f0a6c73 ('net/mlx5_core: Add pci error handlers to mlx5_core driver') Signed-off-by: Mohamad Haj Yahia Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/health.c | 30 +++++++++++++++++++++--- drivers/net/ethernet/mellanox/mlx5/core/main.c | 9 +++++-- include/linux/mlx5/driver.h | 4 ++++ 3 files changed, 38 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index 2cb4094c9c49..2d00022c92d8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -64,6 +64,10 @@ enum { MLX5_NIC_IFC_NO_DRAM_NIC = 2 }; +enum { + MLX5_DROP_NEW_HEALTH_WORK, +}; + static u8 get_nic_interface(struct mlx5_core_dev *dev) { return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 3; @@ -272,7 +276,13 @@ static void poll_health(unsigned long data) if (in_fatal(dev) && !health->sick) { health->sick = true; print_health_info(dev); - schedule_work(&health->work); + spin_lock(&health->wq_lock); + if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags)) + queue_work(health->wq, &health->work); + else + dev_err(&dev->pdev->dev, + "new health works are not permitted at this stage\n"); + spin_unlock(&health->wq_lock); } } @@ -282,6 +292,7 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev) init_timer(&health->timer); health->sick = 0; + clear_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); health->health = &dev->iseg->health; health->health_counter = &dev->iseg->health_counter; @@ -298,11 +309,21 @@ void mlx5_stop_health_poll(struct mlx5_core_dev *dev) del_timer_sync(&health->timer); } +void mlx5_drain_health_wq(struct mlx5_core_dev *dev) +{ + struct mlx5_core_health *health = &dev->priv.health; + + spin_lock(&health->wq_lock); + set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); + spin_unlock(&health->wq_lock); + cancel_work_sync(&health->work); +} + void mlx5_health_cleanup(struct mlx5_core_dev *dev) { struct mlx5_core_health *health = &dev->priv.health; - flush_work(&health->work); + destroy_workqueue(health->wq); } int mlx5_health_init(struct mlx5_core_dev *dev) @@ -317,8 +338,11 @@ int mlx5_health_init(struct mlx5_core_dev *dev) strcpy(name, "mlx5_health"); strcat(name, dev_name(&dev->pdev->dev)); + health->wq = create_singlethread_workqueue(name); kfree(name); - + if (!health->wq) + return -ENOMEM; + spin_lock_init(&health->wq_lock); INIT_WORK(&health->work, health_care); return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 8a63910bfccf..9f90226bc120 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1315,8 +1315,13 @@ static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev, dev_info(&pdev->dev, "%s was called\n", __func__); mlx5_enter_error_state(dev); mlx5_unload_one(dev, priv, false); - pci_save_state(pdev); - mlx5_pci_disable_device(dev); + /* In case of kernel call save the pci state and drain health wq */ + if (state) { + pci_save_state(pdev); + mlx5_drain_health_wq(dev); + mlx5_pci_disable_device(dev); + } + return state == pci_channel_io_perm_failure ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET; } diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 5dbda60a09f4..7d9a5d08eb59 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -418,7 +418,10 @@ struct mlx5_core_health { u32 prev; int miss_counter; bool sick; + /* wq spinlock to synchronize draining */ + spinlock_t wq_lock; struct workqueue_struct *wq; + unsigned long flags; struct work_struct work; }; @@ -778,6 +781,7 @@ void mlx5_health_cleanup(struct mlx5_core_dev *dev); int mlx5_health_init(struct mlx5_core_dev *dev); void mlx5_start_health_poll(struct mlx5_core_dev *dev); void mlx5_stop_health_poll(struct mlx5_core_dev *dev); +void mlx5_drain_health_wq(struct mlx5_core_dev *dev); int mlx5_buf_alloc_node(struct mlx5_core_dev *dev, int size, struct mlx5_buf *buf, int node); int mlx5_buf_alloc(struct mlx5_core_dev *dev, int size, struct mlx5_buf *buf); -- cgit From 04c0c1ab38e95105d950db5b84e727637e149ce7 Mon Sep 17 00:00:00 2001 From: Mohamad Haj Yahia Date: Tue, 25 Oct 2016 18:36:34 +0300 Subject: net/mlx5: PCI error recovery health care simulation In case that the kernel PCI error handlers are not called, we will trigger our own recovery flow. The health work will give priority to the kernel pci error handlers to recover the PCI by waiting for a small period, if the pci error handlers are not triggered the manual recovery flow will be executed. We don't save pci state in case of manual recovery because it will ruin the pci configuration space and we will lose dma sync. Fixes: 89d44f0a6c73 ('net/mlx5_core: Add pci error handlers to mlx5_core driver') Signed-off-by: Mohamad Haj Yahia Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/health.c | 45 ++++++++++++++++++++-- drivers/net/ethernet/mellanox/mlx5/core/main.c | 18 ++++++--- .../net/ethernet/mellanox/mlx5/core/mlx5_core.h | 1 + include/linux/mlx5/driver.h | 1 + 4 files changed, 56 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index 2d00022c92d8..5bcf93422ee0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -61,14 +61,15 @@ enum { enum { MLX5_NIC_IFC_FULL = 0, MLX5_NIC_IFC_DISABLED = 1, - MLX5_NIC_IFC_NO_DRAM_NIC = 2 + MLX5_NIC_IFC_NO_DRAM_NIC = 2, + MLX5_NIC_IFC_INVALID = 3 }; enum { MLX5_DROP_NEW_HEALTH_WORK, }; -static u8 get_nic_interface(struct mlx5_core_dev *dev) +static u8 get_nic_state(struct mlx5_core_dev *dev) { return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 3; } @@ -101,7 +102,7 @@ static int in_fatal(struct mlx5_core_dev *dev) struct mlx5_core_health *health = &dev->priv.health; struct health_buffer __iomem *h = health->health; - if (get_nic_interface(dev) == MLX5_NIC_IFC_DISABLED) + if (get_nic_state(dev) == MLX5_NIC_IFC_DISABLED) return 1; if (ioread32be(&h->fw_ver) == 0xffffffff) @@ -131,7 +132,7 @@ unlock: static void mlx5_handle_bad_state(struct mlx5_core_dev *dev) { - u8 nic_interface = get_nic_interface(dev); + u8 nic_interface = get_nic_state(dev); switch (nic_interface) { case MLX5_NIC_IFC_FULL: @@ -153,8 +154,34 @@ static void mlx5_handle_bad_state(struct mlx5_core_dev *dev) mlx5_disable_device(dev); } +static void health_recover(struct work_struct *work) +{ + struct mlx5_core_health *health; + struct delayed_work *dwork; + struct mlx5_core_dev *dev; + struct mlx5_priv *priv; + u8 nic_state; + + dwork = container_of(work, struct delayed_work, work); + health = container_of(dwork, struct mlx5_core_health, recover_work); + priv = container_of(health, struct mlx5_priv, health); + dev = container_of(priv, struct mlx5_core_dev, priv); + + nic_state = get_nic_state(dev); + if (nic_state == MLX5_NIC_IFC_INVALID) { + dev_err(&dev->pdev->dev, "health recovery flow aborted since the nic state is invalid\n"); + return; + } + + dev_err(&dev->pdev->dev, "starting health recovery flow\n"); + mlx5_recover_device(dev); +} + +/* How much time to wait until health resetting the driver (in msecs) */ +#define MLX5_RECOVERY_DELAY_MSECS 60000 static void health_care(struct work_struct *work) { + unsigned long recover_delay = msecs_to_jiffies(MLX5_RECOVERY_DELAY_MSECS); struct mlx5_core_health *health; struct mlx5_core_dev *dev; struct mlx5_priv *priv; @@ -164,6 +191,14 @@ static void health_care(struct work_struct *work) dev = container_of(priv, struct mlx5_core_dev, priv); mlx5_core_warn(dev, "handling bad device here\n"); mlx5_handle_bad_state(dev); + + spin_lock(&health->wq_lock); + if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags)) + schedule_delayed_work(&health->recover_work, recover_delay); + else + dev_err(&dev->pdev->dev, + "new health works are not permitted at this stage\n"); + spin_unlock(&health->wq_lock); } static const char *hsynd_str(u8 synd) @@ -316,6 +351,7 @@ void mlx5_drain_health_wq(struct mlx5_core_dev *dev) spin_lock(&health->wq_lock); set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); spin_unlock(&health->wq_lock); + cancel_delayed_work_sync(&health->recover_work); cancel_work_sync(&health->work); } @@ -344,6 +380,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev) return -ENOMEM; spin_lock_init(&health->wq_lock); INIT_WORK(&health->work, health_care); + INIT_DELAYED_WORK(&health->recover_work, health_recover); return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 9f90226bc120..d5433c49b2b0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1313,6 +1313,7 @@ static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev, struct mlx5_priv *priv = &dev->priv; dev_info(&pdev->dev, "%s was called\n", __func__); + mlx5_enter_error_state(dev); mlx5_unload_one(dev, priv, false); /* In case of kernel call save the pci state and drain health wq */ @@ -1378,11 +1379,6 @@ static pci_ers_result_t mlx5_pci_slot_reset(struct pci_dev *pdev) return PCI_ERS_RESULT_RECOVERED; } -void mlx5_disable_device(struct mlx5_core_dev *dev) -{ - mlx5_pci_err_detected(dev->pdev, 0); -} - static void mlx5_pci_resume(struct pci_dev *pdev) { struct mlx5_core_dev *dev = pci_get_drvdata(pdev); @@ -1432,6 +1428,18 @@ static const struct pci_device_id mlx5_core_pci_table[] = { MODULE_DEVICE_TABLE(pci, mlx5_core_pci_table); +void mlx5_disable_device(struct mlx5_core_dev *dev) +{ + mlx5_pci_err_detected(dev->pdev, 0); +} + +void mlx5_recover_device(struct mlx5_core_dev *dev) +{ + mlx5_pci_disable_device(dev); + if (mlx5_pci_slot_reset(dev->pdev) == PCI_ERS_RESULT_RECOVERED) + mlx5_pci_resume(dev->pdev); +} + static struct pci_driver mlx5_core_driver = { .name = DRIVER_NAME, .id_table = mlx5_core_pci_table, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index 3d0cfb9f18f9..187662c8ea96 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -83,6 +83,7 @@ void mlx5_core_event(struct mlx5_core_dev *dev, enum mlx5_dev_event event, unsigned long param); void mlx5_enter_error_state(struct mlx5_core_dev *dev); void mlx5_disable_device(struct mlx5_core_dev *dev); +void mlx5_recover_device(struct mlx5_core_dev *dev); int mlx5_sriov_init(struct mlx5_core_dev *dev); void mlx5_sriov_cleanup(struct mlx5_core_dev *dev); int mlx5_sriov_attach(struct mlx5_core_dev *dev); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 7d9a5d08eb59..ecc451d89ccd 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -423,6 +423,7 @@ struct mlx5_core_health { struct workqueue_struct *wq; unsigned long flags; struct work_struct work; + struct delayed_work recover_work; }; struct mlx5_cq_table { -- cgit From e934f684856393a0b2aecc7fdd8357a48b79c535 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 26 Oct 2016 08:30:29 -0700 Subject: Revert "hv_netvsc: report vmbus name in ethtool" This reverts commit e3f74b841d48 ("hv_netvsc: report vmbus name in ethtool")' because of problem introduced by commit f9a56e5d6a0ba ("Drivers: hv: make VMBus bus ids persistent"). This changed the format of the vmbus name and this new format is too long to fit in the bus_info field of ethtool. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/net/hyperv/netvsc_drv.c | 4 ---- include/linux/hyperv.h | 7 ------- 2 files changed, 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index c71d966d905f..f6382150b16a 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -699,12 +699,8 @@ int netvsc_recv_callback(struct hv_device *device_obj, static void netvsc_get_drvinfo(struct net_device *net, struct ethtool_drvinfo *info) { - struct net_device_context *net_device_ctx = netdev_priv(net); - struct hv_device *dev = net_device_ctx->device_ctx; - strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver)); strlcpy(info->fw_version, "N/A", sizeof(info->fw_version)); - strlcpy(info->bus_info, vmbus_dev_name(dev), sizeof(info->bus_info)); } static void netvsc_get_channels(struct net_device *net, diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 6824556d37ed..cd184bdca58f 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1169,13 +1169,6 @@ int __must_check __vmbus_driver_register(struct hv_driver *hv_driver, const char *mod_name); void vmbus_driver_unregister(struct hv_driver *hv_driver); -static inline const char *vmbus_dev_name(const struct hv_device *device_obj) -{ - const struct kobject *kobj = &device_obj->device.kobj; - - return kobj->name; -} - void vmbus_hvsock_device_unregister(struct vmbus_channel *channel); int vmbus_allocate_mmio(struct resource **new, struct hv_device *device_obj, -- cgit From 6f2e0d2c3bf0f8d322ab7516c57340c7189cca02 Mon Sep 17 00:00:00 2001 From: Eugenia Emantayev Date: Thu, 27 Oct 2016 16:27:20 +0300 Subject: net/mlx4: Fix firmware command timeout during interrupt test Currently interrupt test that is part of ethtool selftest runs the check over all interrupt vectors of the device. In mlx4_en package part of interrupt vectors are uninitialized since mlx4_ib doesn't exist. This causes NOP FW command to time out. Change logic to test current port interrupt vectors only. Signed-off-by: Eugenia Emantayev Signed-off-by: Tariq Toukan Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/en_selftest.c | 26 +++++++++- drivers/net/ethernet/mellanox/mlx4/eq.c | 62 +++++++++++------------- include/linux/mlx4/device.h | 3 +- 3 files changed, 55 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/en_selftest.c b/drivers/net/ethernet/mellanox/mlx4/en_selftest.c index b66e03d9711f..c06346a82496 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_selftest.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_selftest.c @@ -118,6 +118,29 @@ mlx4_en_test_loopback_exit: return !loopback_ok; } +static int mlx4_en_test_interrupts(struct mlx4_en_priv *priv) +{ + struct mlx4_en_dev *mdev = priv->mdev; + int err = 0; + int i = 0; + + err = mlx4_test_async(mdev->dev); + /* When not in MSI_X or slave, test only async */ + if (!(mdev->dev->flags & MLX4_FLAG_MSI_X) || mlx4_is_slave(mdev->dev)) + return err; + + /* A loop over all completion vectors of current port, + * for each vector check whether it works by mapping command + * completions to that vector and performing a NOP command + */ + for (i = 0; i < priv->rx_ring_num; i++) { + err = mlx4_test_interrupt(mdev->dev, priv->rx_cq[i]->vector); + if (err) + break; + } + + return err; +} static int mlx4_en_test_link(struct mlx4_en_priv *priv) { @@ -151,7 +174,6 @@ static int mlx4_en_test_speed(struct mlx4_en_priv *priv) void mlx4_en_ex_selftest(struct net_device *dev, u32 *flags, u64 *buf) { struct mlx4_en_priv *priv = netdev_priv(dev); - struct mlx4_en_dev *mdev = priv->mdev; int i, carrier_ok; memset(buf, 0, sizeof(u64) * MLX4_EN_NUM_SELF_TEST); @@ -177,7 +199,7 @@ void mlx4_en_ex_selftest(struct net_device *dev, u32 *flags, u64 *buf) netif_carrier_on(dev); } - buf[0] = mlx4_test_interrupts(mdev->dev); + buf[0] = mlx4_en_test_interrupts(priv); buf[1] = mlx4_en_test_link(priv); buf[2] = mlx4_en_test_speed(priv); diff --git a/drivers/net/ethernet/mellanox/mlx4/eq.c b/drivers/net/ethernet/mellanox/mlx4/eq.c index cf8f8a72a801..cd3638e6fe25 100644 --- a/drivers/net/ethernet/mellanox/mlx4/eq.c +++ b/drivers/net/ethernet/mellanox/mlx4/eq.c @@ -1361,53 +1361,49 @@ void mlx4_cleanup_eq_table(struct mlx4_dev *dev) kfree(priv->eq_table.uar_map); } -/* A test that verifies that we can accept interrupts on all - * the irq vectors of the device. +/* A test that verifies that we can accept interrupts + * on the vector allocated for asynchronous events + */ +int mlx4_test_async(struct mlx4_dev *dev) +{ + return mlx4_NOP(dev); +} +EXPORT_SYMBOL(mlx4_test_async); + +/* A test that verifies that we can accept interrupts + * on the given irq vector of the tested port. * Interrupts are checked using the NOP command. */ -int mlx4_test_interrupts(struct mlx4_dev *dev) +int mlx4_test_interrupt(struct mlx4_dev *dev, int vector) { struct mlx4_priv *priv = mlx4_priv(dev); - int i; int err; - err = mlx4_NOP(dev); - /* When not in MSI_X, there is only one irq to check */ - if (!(dev->flags & MLX4_FLAG_MSI_X) || mlx4_is_slave(dev)) - return err; - - /* A loop over all completion vectors, for each vector we will check - * whether it works by mapping command completions to that vector - * and performing a NOP command - */ - for(i = 0; !err && (i < dev->caps.num_comp_vectors); ++i) { - /* Make sure request_irq was called */ - if (!priv->eq_table.eq[i].have_irq) - continue; - - /* Temporary use polling for command completions */ - mlx4_cmd_use_polling(dev); - - /* Map the new eq to handle all asynchronous events */ - err = mlx4_MAP_EQ(dev, get_async_ev_mask(dev), 0, - priv->eq_table.eq[i].eqn); - if (err) { - mlx4_warn(dev, "Failed mapping eq for interrupt test\n"); - mlx4_cmd_use_events(dev); - break; - } + /* Temporary use polling for command completions */ + mlx4_cmd_use_polling(dev); - /* Go back to using events */ - mlx4_cmd_use_events(dev); - err = mlx4_NOP(dev); + /* Map the new eq to handle all asynchronous events */ + err = mlx4_MAP_EQ(dev, get_async_ev_mask(dev), 0, + priv->eq_table.eq[MLX4_CQ_TO_EQ_VECTOR(vector)].eqn); + if (err) { + mlx4_warn(dev, "Failed mapping eq for interrupt test\n"); + goto out; } + /* Go back to using events */ + mlx4_cmd_use_events(dev); + err = mlx4_NOP(dev); + /* Return to default */ + mlx4_cmd_use_polling(dev); +out: mlx4_MAP_EQ(dev, get_async_ev_mask(dev), 0, priv->eq_table.eq[MLX4_EQ_ASYNC].eqn); + mlx4_cmd_use_events(dev); + return err; } -EXPORT_SYMBOL(mlx4_test_interrupts); +EXPORT_SYMBOL(mlx4_test_interrupt); bool mlx4_is_eq_vector_valid(struct mlx4_dev *dev, u8 port, int vector) { diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index f6a164297358..3be7abd6e722 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -1399,7 +1399,8 @@ void mlx4_fmr_unmap(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u32 *lkey, u32 *rkey); int mlx4_fmr_free(struct mlx4_dev *dev, struct mlx4_fmr *fmr); int mlx4_SYNC_TPT(struct mlx4_dev *dev); -int mlx4_test_interrupts(struct mlx4_dev *dev); +int mlx4_test_interrupt(struct mlx4_dev *dev, int vector); +int mlx4_test_async(struct mlx4_dev *dev); int mlx4_query_diag_counters(struct mlx4_dev *dev, u8 op_modifier, const u32 offset[], u32 value[], size_t array_len, u8 port); -- cgit From 723c038475b78edc9327eb952f95f9881cc9d79d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 30 Oct 2016 11:42:02 -0500 Subject: fs: remove the never implemented aio_fsync file operation Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- Documentation/filesystems/Locking | 1 - Documentation/filesystems/vfs.txt | 1 - fs/aio.c | 14 -------------- fs/ntfs/dir.c | 2 -- include/linux/fs.h | 1 - 5 files changed, 19 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 14cdc101d165..1b5f15653b1b 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -447,7 +447,6 @@ prototypes: int (*flush) (struct file *); int (*release) (struct inode *, struct file *); int (*fsync) (struct file *, loff_t start, loff_t end, int datasync); - int (*aio_fsync) (struct kiocb *, int datasync); int (*fasync) (int, struct file *, int); int (*lock) (struct file *, int, struct file_lock *); ssize_t (*readv) (struct file *, const struct iovec *, unsigned long, diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index d619c8d71966..b5039a00caaf 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -828,7 +828,6 @@ struct file_operations { int (*flush) (struct file *, fl_owner_t id); int (*release) (struct inode *, struct file *); int (*fsync) (struct file *, loff_t, loff_t, int datasync); - int (*aio_fsync) (struct kiocb *, int datasync); int (*fasync) (int, struct file *, int); int (*lock) (struct file *, int, struct file_lock *); ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); diff --git a/fs/aio.c b/fs/aio.c index 0aa71d338c04..2a6030af6ba5 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1472,20 +1472,6 @@ rw_common: kfree(iovec); break; - case IOCB_CMD_FDSYNC: - if (!file->f_op->aio_fsync) - return -EINVAL; - - ret = file->f_op->aio_fsync(req, 1); - break; - - case IOCB_CMD_FSYNC: - if (!file->f_op->aio_fsync) - return -EINVAL; - - ret = file->f_op->aio_fsync(req, 0); - break; - default: pr_debug("EINVAL: no operation provided\n"); return -EINVAL; diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index a18613579001..0ee19ecc982d 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c @@ -1544,8 +1544,6 @@ const struct file_operations ntfs_dir_ops = { .iterate = ntfs_readdir, /* Read directory contents. */ #ifdef NTFS_RW .fsync = ntfs_dir_fsync, /* Sync a directory to disk. */ - /*.aio_fsync = ,*/ /* Sync all outstanding async - i/o operations on a kiocb. */ #endif /* NTFS_RW */ /*.ioctl = ,*/ /* Perform function on the mounted filesystem. */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 16d2b6e874d6..ff7bcd9e8398 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1709,7 +1709,6 @@ struct file_operations { int (*flush) (struct file *, fl_owner_t id); int (*release) (struct inode *, struct file *); int (*fsync) (struct file *, loff_t, loff_t, int datasync); - int (*aio_fsync) (struct kiocb *, int datasync); int (*fasync) (int, struct file *, int); int (*lock) (struct file *, int, struct file_lock *); ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); -- cgit From 70fe2f48152e60664809e2fed76bbb50c9fa2aa3 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 30 Oct 2016 11:42:04 -0500 Subject: aio: fix freeze protection of aio writes Currently we dropped freeze protection of aio writes just after IO was submitted. Thus aio write could be in flight while the filesystem was frozen and that could result in unexpected situation like aio completion wanting to convert extent type on frozen filesystem. Testcase from Dmitry triggering this is like: for ((i=0;i<60;i++));do fsfreeze -f /mnt ;sleep 1;fsfreeze -u /mnt;done & fio --bs=4k --ioengine=libaio --iodepth=128 --size=1g --direct=1 \ --runtime=60 --filename=/mnt/file --name=rand-write --rw=randwrite Fix the problem by dropping freeze protection only once IO is completed in aio_complete(). Reported-by: Dmitry Monakhov Signed-off-by: Jan Kara [hch: forward ported on top of various VFS and aio changes] Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/aio.c | 19 ++++++++++++++++++- include/linux/fs.h | 1 + 2 files changed, 19 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/aio.c b/fs/aio.c index c19755187ca5..428484f2f841 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1078,6 +1078,17 @@ static void aio_complete(struct kiocb *kiocb, long res, long res2) unsigned tail, pos, head; unsigned long flags; + if (kiocb->ki_flags & IOCB_WRITE) { + struct file *file = kiocb->ki_filp; + + /* + * Tell lockdep we inherited freeze protection from submission + * thread. + */ + __sb_writers_acquired(file_inode(file)->i_sb, SB_FREEZE_WRITE); + file_end_write(file); + } + /* * Special case handling for sync iocbs: * - events go directly into the iocb for fast handling @@ -1473,9 +1484,15 @@ static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored, return ret; ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter)); if (!ret) { + req->ki_flags |= IOCB_WRITE; file_start_write(file); ret = aio_ret(req, file->f_op->write_iter(req, &iter)); - file_end_write(file); + /* + * We release freeze protection in aio_complete(). Fool lockdep + * by telling it the lock got released so that it doesn't + * complain about held lock when we return to userspace. + */ + __sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE); } kfree(iovec); return ret; diff --git a/include/linux/fs.h b/include/linux/fs.h index ff7bcd9e8398..dc0478c07b2a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -321,6 +321,7 @@ struct writeback_control; #define IOCB_HIPRI (1 << 3) #define IOCB_DSYNC (1 << 4) #define IOCB_SYNC (1 << 5) +#define IOCB_WRITE (1 << 6) struct kiocb { struct file *ki_filp; -- cgit From da96786e26c3ae47316db2b92046b11268c4379c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 2 Nov 2016 12:08:25 -0700 Subject: net: tcp: check skb is non-NULL for exact match on lookups Andrey reported the following error report while running the syzkaller fuzzer: general protection fault: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 0 PID: 648 Comm: syz-executor Not tainted 4.9.0-rc3+ #333 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 task: ffff8800398c4480 task.stack: ffff88003b468000 RIP: 0010:[] [< inline >] inet_exact_dif_match include/net/tcp.h:808 RIP: 0010:[] [] __inet_lookup_listener+0xb6/0x500 net/ipv4/inet_hashtables.c:219 RSP: 0018:ffff88003b46f270 EFLAGS: 00010202 RAX: 0000000000000004 RBX: 0000000000004242 RCX: 0000000000000001 RDX: 0000000000000000 RSI: ffffc90000e3c000 RDI: 0000000000000054 RBP: ffff88003b46f2d8 R08: 0000000000004000 R09: ffffffff830910e7 R10: 0000000000000000 R11: 000000000000000a R12: ffffffff867fa0c0 R13: 0000000000004242 R14: 0000000000000003 R15: dffffc0000000000 FS: 00007fb135881700(0000) GS:ffff88003ec00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020cc3000 CR3: 000000006d56a000 CR4: 00000000000006f0 Stack: 0000000000000000 000000000601a8c0 0000000000000000 ffffffff00004242 424200003b9083c2 ffff88003def4041 ffffffff84e7e040 0000000000000246 ffff88003a0911c0 0000000000000000 ffff88003a091298 ffff88003b9083ae Call Trace: [] tcp_v4_send_reset+0x584/0x1700 net/ipv4/tcp_ipv4.c:643 [] tcp_v4_rcv+0x198b/0x2e50 net/ipv4/tcp_ipv4.c:1718 [] ip_local_deliver_finish+0x332/0xad0 net/ipv4/ip_input.c:216 ... MD5 has a code path that calls __inet_lookup_listener with a null skb, so inet{6}_exact_dif_match needs to check skb against null before pulling the flag. Fixes: a04a480d4392 ("net: Require exact match for TCP socket lookups if dif is l3mdev") Reported-by: Andrey Konovalov Signed-off-by: David Ahern Tested-by: Andrey Konovalov Signed-off-by: David S. Miller --- include/linux/ipv6.h | 2 +- include/net/tcp.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index ca1ad9ebbc92..a0649973ee5b 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -149,7 +149,7 @@ static inline bool inet6_exact_dif_match(struct net *net, struct sk_buff *skb) { #if defined(CONFIG_NET_L3_MASTER_DEV) if (!net->ipv4.sysctl_tcp_l3mdev_accept && - ipv6_l3mdev_skb(IP6CB(skb)->flags)) + skb && ipv6_l3mdev_skb(IP6CB(skb)->flags)) return true; #endif return false; diff --git a/include/net/tcp.h b/include/net/tcp.h index 5b82d4d94834..304a8e17bc87 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -805,7 +805,7 @@ static inline bool inet_exact_dif_match(struct net *net, struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (!net->ipv4.sysctl_tcp_l3mdev_accept && - ipv4_l3mdev_skb(TCP_SKB_CB(skb)->header.h4.flags)) + skb && ipv4_l3mdev_skb(TCP_SKB_CB(skb)->header.h4.flags)) return true; #endif return false; -- cgit From 98430c7aad6a3fdedcc78a0d6780dabb6580dc38 Mon Sep 17 00:00:00 2001 From: Randy Li Date: Tue, 25 Oct 2016 22:15:34 +0800 Subject: phy: Add reset callback for not generic phy Add a dummy function for phy_reset in case the CONFIG_GENERIC_PHY is disabled. Signed-off-by: Randy Li Signed-off-by: Kishon Vijay Abraham I --- include/linux/phy/phy.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy/phy.h b/include/linux/phy/phy.h index ee1bed7dbfc6..78bb0d7f6b11 100644 --- a/include/linux/phy/phy.h +++ b/include/linux/phy/phy.h @@ -253,6 +253,13 @@ static inline int phy_set_mode(struct phy *phy, enum phy_mode mode) return -ENOSYS; } +static inline int phy_reset(struct phy *phy) +{ + if (!phy) + return 0; + return -ENOSYS; +} + static inline int phy_get_bus_width(struct phy *phy) { return -ENOSYS; -- cgit From 1571875beecd5de9657f73931449bda1b1329b6f Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Thu, 3 Nov 2016 16:21:26 +0200 Subject: ACPI / platform: Add support for build-in properties MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We have a couple of drivers, acpi_apd.c and acpi_lpss.c, that need to pass extra build-in properties to the devices they create. Previously the drivers added those properties to the struct device which is member of the struct acpi_device, but that does not work. Those properties need to be assigned to the struct device of the platform device instead in order for them to become available to the drivers. To fix this, this patch changes acpi_create_platform_device function to take struct property_entry pointer as parameter. Fixes: 20a875e2e86e (serial: 8250_dw: Add quirk for APM X-Gene SoC) Signed-off-by: Heikki Krogerus Tested-by: Yazen Ghannam Tested-by: Jérôme de Bretagne Reviewed-by: Andy Shevchenko Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpi_apd.c | 10 ++-------- drivers/acpi/acpi_lpss.c | 10 ++-------- drivers/acpi/acpi_platform.c | 5 ++++- drivers/acpi/dptf/int340x_thermal.c | 4 ++-- drivers/acpi/scan.c | 2 +- drivers/platform/x86/intel-hid.c | 2 +- drivers/platform/x86/intel-vbtn.c | 2 +- include/linux/acpi.h | 3 ++- 8 files changed, 15 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/acpi_apd.c b/drivers/acpi/acpi_apd.c index 5f112d811e42..5959ed996a75 100644 --- a/drivers/acpi/acpi_apd.c +++ b/drivers/acpi/acpi_apd.c @@ -117,7 +117,7 @@ static int acpi_apd_create_device(struct acpi_device *adev, int ret; if (!dev_desc) { - pdev = acpi_create_platform_device(adev); + pdev = acpi_create_platform_device(adev, NULL); return IS_ERR_OR_NULL(pdev) ? PTR_ERR(pdev) : 1; } @@ -134,14 +134,8 @@ static int acpi_apd_create_device(struct acpi_device *adev, goto err_out; } - if (dev_desc->properties) { - ret = device_add_properties(&adev->dev, dev_desc->properties); - if (ret) - goto err_out; - } - adev->driver_data = pdata; - pdev = acpi_create_platform_device(adev); + pdev = acpi_create_platform_device(adev, dev_desc->properties); if (!IS_ERR_OR_NULL(pdev)) return 1; diff --git a/drivers/acpi/acpi_lpss.c b/drivers/acpi/acpi_lpss.c index 552010288135..373657f7e35a 100644 --- a/drivers/acpi/acpi_lpss.c +++ b/drivers/acpi/acpi_lpss.c @@ -395,7 +395,7 @@ static int acpi_lpss_create_device(struct acpi_device *adev, dev_desc = (const struct lpss_device_desc *)id->driver_data; if (!dev_desc) { - pdev = acpi_create_platform_device(adev); + pdev = acpi_create_platform_device(adev, NULL); return IS_ERR_OR_NULL(pdev) ? PTR_ERR(pdev) : 1; } pdata = kzalloc(sizeof(*pdata), GFP_KERNEL); @@ -451,14 +451,8 @@ static int acpi_lpss_create_device(struct acpi_device *adev, goto err_out; } - if (dev_desc->properties) { - ret = device_add_properties(&adev->dev, dev_desc->properties); - if (ret) - goto err_out; - } - adev->driver_data = pdata; - pdev = acpi_create_platform_device(adev); + pdev = acpi_create_platform_device(adev, dev_desc->properties); if (!IS_ERR_OR_NULL(pdev)) { return 1; } diff --git a/drivers/acpi/acpi_platform.c b/drivers/acpi/acpi_platform.c index 159f7f19abce..56e70da83c6c 100644 --- a/drivers/acpi/acpi_platform.c +++ b/drivers/acpi/acpi_platform.c @@ -33,6 +33,7 @@ static const struct acpi_device_id forbidden_id_list[] = { /** * acpi_create_platform_device - Create platform device for ACPI device node * @adev: ACPI device node to create a platform device for. + * @properties: Optional collection of build-in properties. * * Check if the given @adev can be represented as a platform device and, if * that's the case, create and register a platform device, populate its common @@ -40,7 +41,8 @@ static const struct acpi_device_id forbidden_id_list[] = { * * Name of the platform device will be the same as @adev's. */ -struct platform_device *acpi_create_platform_device(struct acpi_device *adev) +struct platform_device *acpi_create_platform_device(struct acpi_device *adev, + struct property_entry *properties) { struct platform_device *pdev = NULL; struct platform_device_info pdevinfo; @@ -88,6 +90,7 @@ struct platform_device *acpi_create_platform_device(struct acpi_device *adev) pdevinfo.res = resources; pdevinfo.num_res = count; pdevinfo.fwnode = acpi_fwnode_handle(adev); + pdevinfo.properties = properties; if (acpi_dma_supported(adev)) pdevinfo.dma_mask = DMA_BIT_MASK(32); diff --git a/drivers/acpi/dptf/int340x_thermal.c b/drivers/acpi/dptf/int340x_thermal.c index 33505c651f62..86364097e236 100644 --- a/drivers/acpi/dptf/int340x_thermal.c +++ b/drivers/acpi/dptf/int340x_thermal.c @@ -34,11 +34,11 @@ static int int340x_thermal_handler_attach(struct acpi_device *adev, const struct acpi_device_id *id) { if (IS_ENABLED(CONFIG_INT340X_THERMAL)) - acpi_create_platform_device(adev); + acpi_create_platform_device(adev, NULL); /* Intel SoC DTS thermal driver needs INT3401 to set IRQ descriptor */ else if (IS_ENABLED(CONFIG_INTEL_SOC_DTS_THERMAL) && id->driver_data == INT3401_DEVICE) - acpi_create_platform_device(adev); + acpi_create_platform_device(adev, NULL); return 1; } diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index ad9fc84a8601..3d31ae3a482d 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -1734,7 +1734,7 @@ static void acpi_default_enumeration(struct acpi_device *device) &is_spi_i2c_slave); acpi_dev_free_resource_list(&resource_list); if (!is_spi_i2c_slave) { - acpi_create_platform_device(device); + acpi_create_platform_device(device, NULL); acpi_device_set_enumerated(device); } else { blocking_notifier_call_chain(&acpi_reconfig_chain, diff --git a/drivers/platform/x86/intel-hid.c b/drivers/platform/x86/intel-hid.c index ed5874217ee7..12dbb5063376 100644 --- a/drivers/platform/x86/intel-hid.c +++ b/drivers/platform/x86/intel-hid.c @@ -264,7 +264,7 @@ check_acpi_dev(acpi_handle handle, u32 lvl, void *context, void **rv) return AE_OK; if (acpi_match_device_ids(dev, ids) == 0) - if (acpi_create_platform_device(dev)) + if (acpi_create_platform_device(dev, NULL)) dev_info(&dev->dev, "intel-hid: created platform device\n"); diff --git a/drivers/platform/x86/intel-vbtn.c b/drivers/platform/x86/intel-vbtn.c index 146d02f8c9bc..78080763df51 100644 --- a/drivers/platform/x86/intel-vbtn.c +++ b/drivers/platform/x86/intel-vbtn.c @@ -164,7 +164,7 @@ check_acpi_dev(acpi_handle handle, u32 lvl, void *context, void **rv) return AE_OK; if (acpi_match_device_ids(dev, ids) == 0) - if (acpi_create_platform_device(dev)) + if (acpi_create_platform_device(dev, NULL)) dev_info(&dev->dev, "intel-vbtn: created platform device\n"); diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 632ec16a855e..c09936f55166 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -546,7 +546,8 @@ int acpi_device_uevent_modalias(struct device *, struct kobj_uevent_env *); int acpi_device_modalias(struct device *, char *, int); void acpi_walk_dep_device_list(acpi_handle handle); -struct platform_device *acpi_create_platform_device(struct acpi_device *); +struct platform_device *acpi_create_platform_device(struct acpi_device *, + struct property_entry *); #define ACPI_PTR(_ptr) (_ptr) static inline void acpi_device_set_enumerated(struct acpi_device *adev) -- cgit From 264048afab27d7c27eedf5394714e0b396d787f7 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 8 Nov 2016 15:15:24 +0100 Subject: libceph: initialize last_linger_id with a large integer osdc->last_linger_id is a counter for lreq->linger_id, which is used for watch cookies. Starting with a large integer should ease the task of telling apart kernel and userspace clients. Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 2 ++ net/ceph/osd_client.c | 1 + 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 96337b15a60d..a8e66344bacc 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -258,6 +258,8 @@ struct ceph_watch_item { struct ceph_entity_addr addr; }; +#define CEPH_LINGER_ID_START 0xffff000000000000ULL + struct ceph_osd_client { struct ceph_client *client; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index d9bf7a1d0a58..e6ae15bc41b7 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -4094,6 +4094,7 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) osd_init(&osdc->homeless_osd); osdc->homeless_osd.o_osdc = osdc; osdc->homeless_osd.o_osd = CEPH_HOMELESS_OSD; + osdc->last_linger_id = CEPH_LINGER_ID_START; osdc->linger_requests = RB_ROOT; osdc->map_checks = RB_ROOT; osdc->linger_map_checks = RB_ROOT; -- cgit From 5e322beefc8699b5747cfb35539a9496034e4296 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 10 Nov 2016 10:46:07 -0800 Subject: mm, frontswap: make sure allocated frontswap map is assigned Christian Borntraeger reports: With commit 8ea1d2a1985a ("mm, frontswap: convert frontswap_enabled to static key") kmemleak complains about a memory leak in swapon unreferenced object 0x3e09ba56000 (size 32112640): comm "swapon", pid 7852, jiffies 4294968787 (age 1490.770s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: __vmalloc_node_range+0x194/0x2d8 vzalloc+0x58/0x68 SyS_swapon+0xd60/0x12f8 system_call+0xd6/0x270 Turns out kmemleak is right. We now allocate the frontswap map depending on the kernel config (and no longer on the enablement) swapfile.c: [...] if (IS_ENABLED(CONFIG_FRONTSWAP)) frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long)); but later on this is passed along --> enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map); and ignored if frontswap is disabled --> frontswap_init(p->type, frontswap_map); static inline void frontswap_init(unsigned type, unsigned long *map) { if (frontswap_enabled()) __frontswap_init(type, map); } Thing is, that frontswap map is never freed. The leakage is relatively not that bad, because swapon is an infrequent and privileged operation. However, if the first frontswap backend is registered after a swap type has been already enabled, it will WARN_ON in frontswap_register_ops() and frontswap will not be available for the swap type. Fix this by making sure the map is assigned by frontswap_init() as long as CONFIG_FRONTSWAP is enabled. Fixes: 8ea1d2a1985a ("mm, frontswap: convert frontswap_enabled to static key") Link: http://lkml.kernel.org/r/20161026134220.2566-1-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reported-by: Christian Borntraeger Cc: Konrad Rzeszutek Wilk Cc: Boris Ostrovsky Cc: David Vrabel Cc: Juergen Gross Cc: "Kirill A. Shutemov" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/frontswap.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h index c46d2aa16d81..1d18af034554 100644 --- a/include/linux/frontswap.h +++ b/include/linux/frontswap.h @@ -106,8 +106,9 @@ static inline void frontswap_invalidate_area(unsigned type) static inline void frontswap_init(unsigned type, unsigned long *map) { - if (frontswap_enabled()) - __frontswap_init(type, map); +#ifdef CONFIG_FRONTSWAP + __frontswap_init(type, map); +#endif } #endif /* _LINUX_FRONTSWAP_H */ -- cgit From c6c7d83b9c9e6a8b3e6d84c820ac61fbffc9e396 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 10 Nov 2016 10:46:26 -0800 Subject: Revert "console: don't prefer first registered if DT specifies stdout-path" This reverts commit 05fd007e4629 ("console: don't prefer first registered if DT specifies stdout-path"). The reverted commit changes existing behavior on which many ARM boards rely. Many ARM small-board-computers, like e.g. the Raspberry Pi have both a video output and a serial console. Depending on whether the user is using the device as a more regular computer; or as a headless device we need to have the console on either one or the other. Many users rely on the kernel behavior of the console being present on both outputs, before the reverted commit the console setup with no console= kernel arguments on an ARM board which sets stdout-path in dt would look like this: [root@localhost ~]# cat /proc/consoles ttyS0 -W- (EC p a) 4:64 tty0 -WU (E p ) 4:1 Where as after the reverted commit, it looks like this: [root@localhost ~]# cat /proc/consoles ttyS0 -W- (EC p a) 4:64 This commit reverts commit 05fd007e4629 ("console: don't prefer first registered if DT specifies stdout-path") restoring the original behavior. Fixes: 05fd007e4629 ("console: don't prefer first registered if DT specifies stdout-path") Link: http://lkml.kernel.org/r/20161104121135.4780-2-hdegoede@redhat.com Signed-off-by: Hans de Goede Cc: Paul Burton Cc: Rob Herring Cc: Frank Rowand Cc: Thorsten Leemhuis Cc: Greg Kroah-Hartman Cc: Tejun Heo Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/of/base.c | 2 -- include/linux/console.h | 6 ------ kernel/printk/printk.c | 13 +------------ 3 files changed, 1 insertion(+), 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/of/base.c b/drivers/of/base.c index d687e6de24a0..a0bccb54a9bd 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -2077,8 +2077,6 @@ void of_alias_scan(void * (*dt_alloc)(u64 size, u64 align)) name = of_get_property(of_aliases, "stdout", NULL); if (name) of_stdout = of_find_node_opts_by_path(name, &of_stdout_options); - if (of_stdout) - console_set_by_of(); } if (!of_aliases) diff --git a/include/linux/console.h b/include/linux/console.h index 3672809234a7..d530c4627e54 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -173,12 +173,6 @@ static inline void console_sysfs_notify(void) #endif extern bool console_suspend_enabled; -#ifdef CONFIG_OF -extern void console_set_by_of(void); -#else -static inline void console_set_by_of(void) {} -#endif - /* Suspend and resume console messages over PM events */ extern void suspend_console(void); extern void resume_console(void); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index de08fc90baaf..5028f4fd504a 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -253,17 +253,6 @@ static int preferred_console = -1; int console_set_on_cmdline; EXPORT_SYMBOL(console_set_on_cmdline); -#ifdef CONFIG_OF -static bool of_specified_console; - -void console_set_by_of(void) -{ - of_specified_console = true; -} -#else -# define of_specified_console false -#endif - /* Flag: console code may call schedule() */ static int console_may_schedule; @@ -2657,7 +2646,7 @@ void register_console(struct console *newcon) * didn't select a console we take the first one * that registers here. */ - if (preferred_console < 0 && !of_specified_console) { + if (preferred_console < 0) { if (newcon->index < 0) newcon->index = 0; if (newcon->setup == NULL || -- cgit From 4e3264d21b90984c2165e8fe5a7b64cf25bc2c2d Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 9 Nov 2016 15:36:33 -0800 Subject: bpf: Fix bpf_redirect to an ipip/ip6tnl dev If the bpf program calls bpf_redirect(dev, 0) and dev is an ipip/ip6tnl, it currently includes the mac header. e.g. If dev is ipip, the end result is IP-EthHdr-IP instead of IP-IP. The fix is to pull the mac header. At ingress, skb_postpull_rcsum() is not needed because the ethhdr should have been pulled once already and then got pushed back just before calling the bpf_prog. At egress, this patch calls skb_postpull_rcsum(). If bpf_redirect(dev, BPF_F_INGRESS) is called, it also fails now because it calls dev_forward_skb() which eventually calls eth_type_trans(skb, dev). The eth_type_trans() will set skb->type = PACKET_OTHERHOST because the mac address does not match the redirecting dev->dev_addr. The PACKET_OTHERHOST will eventually cause the ip_rcv() errors out. To fix this, ____dev_forward_skb() is added. Joint work with Daniel Borkmann. Fixes: cfc7381b3002 ("ip_tunnel: add collect_md mode to IPIP tunnel") Fixes: 8d79266bc48c ("ip6_tunnel: add collect_md mode to IPv6 tunnels") Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/linux/netdevice.h | 15 +++++++++++ net/core/dev.c | 17 +++++------- net/core/filter.c | 68 +++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 81 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 91ee3643ccc8..bf04a46f6d5b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3354,6 +3354,21 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb); +static __always_inline int ____dev_forward_skb(struct net_device *dev, + struct sk_buff *skb) +{ + if (skb_orphan_frags(skb, GFP_ATOMIC) || + unlikely(!is_skb_forwardable(dev, skb))) { + atomic_long_inc(&dev->rx_dropped); + kfree_skb(skb); + return NET_RX_DROP; + } + + skb_scrub_packet(skb, true); + skb->priority = 0; + return 0; +} + void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev); extern int netdev_budget; diff --git a/net/core/dev.c b/net/core/dev.c index eaad4c28069f..6666b28b6815 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1766,19 +1766,14 @@ EXPORT_SYMBOL_GPL(is_skb_forwardable); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) { - if (skb_orphan_frags(skb, GFP_ATOMIC) || - unlikely(!is_skb_forwardable(dev, skb))) { - atomic_long_inc(&dev->rx_dropped); - kfree_skb(skb); - return NET_RX_DROP; - } + int ret = ____dev_forward_skb(dev, skb); - skb_scrub_packet(skb, true); - skb->priority = 0; - skb->protocol = eth_type_trans(skb, dev); - skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); + if (likely(!ret)) { + skb->protocol = eth_type_trans(skb, dev); + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); + } - return 0; + return ret; } EXPORT_SYMBOL_GPL(__dev_forward_skb); diff --git a/net/core/filter.c b/net/core/filter.c index 00351cdf7d0c..b391209838ef 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1628,6 +1628,19 @@ static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) return dev_forward_skb(dev, skb); } +static inline int __bpf_rx_skb_no_mac(struct net_device *dev, + struct sk_buff *skb) +{ + int ret = ____dev_forward_skb(dev, skb); + + if (likely(!ret)) { + skb->dev = dev; + ret = netif_rx(skb); + } + + return ret; +} + static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) { int ret; @@ -1647,6 +1660,51 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) return ret; } +static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev, + u32 flags) +{ + /* skb->mac_len is not set on normal egress */ + unsigned int mlen = skb->network_header - skb->mac_header; + + __skb_pull(skb, mlen); + + /* At ingress, the mac header has already been pulled once. + * At egress, skb_pospull_rcsum has to be done in case that + * the skb is originated from ingress (i.e. a forwarded skb) + * to ensure that rcsum starts at net header. + */ + if (!skb_at_tc_ingress(skb)) + skb_postpull_rcsum(skb, skb_mac_header(skb), mlen); + skb_pop_mac_header(skb); + skb_reset_mac_len(skb); + return flags & BPF_F_INGRESS ? + __bpf_rx_skb_no_mac(dev, skb) : __bpf_tx_skb(dev, skb); +} + +static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev, + u32 flags) +{ + bpf_push_mac_rcsum(skb); + return flags & BPF_F_INGRESS ? + __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); +} + +static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev, + u32 flags) +{ + switch (dev->type) { + case ARPHRD_TUNNEL: + case ARPHRD_TUNNEL6: + case ARPHRD_SIT: + case ARPHRD_IPGRE: + case ARPHRD_VOID: + case ARPHRD_NONE: + return __bpf_redirect_no_mac(skb, dev, flags); + default: + return __bpf_redirect_common(skb, dev, flags); + } +} + BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) { struct net_device *dev; @@ -1675,10 +1733,7 @@ BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) return -ENOMEM; } - bpf_push_mac_rcsum(clone); - - return flags & BPF_F_INGRESS ? - __bpf_rx_skb(dev, clone) : __bpf_tx_skb(dev, clone); + return __bpf_redirect(clone, dev, flags); } static const struct bpf_func_proto bpf_clone_redirect_proto = { @@ -1722,10 +1777,7 @@ int skb_do_redirect(struct sk_buff *skb) return -EINVAL; } - bpf_push_mac_rcsum(skb); - - return ri->flags & BPF_F_INGRESS ? - __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); + return __bpf_redirect(skb, dev, ri->flags); } static const struct bpf_func_proto bpf_redirect_proto = { -- cgit From ea08e39230e898844d9de5b60cdbb30067cebfe7 Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Fri, 11 Nov 2016 13:16:22 -0500 Subject: sunrpc: svc_age_temp_xprts_now should not call setsockopt non-tcp transports This fixes the following panic that can occur with NFSoRDMA. general protection fault: 0000 [#1] SMP Modules linked in: rpcrdma ib_isert iscsi_target_mod ib_iser libiscsi scsi_transport_iscsi ib_srpt target_core_mod ib_srp scsi_transport_srp scsi_tgt ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm mlx5_ib ib_core intel_powerclamp coretemp kvm_intel kvm sg ioatdma ipmi_devintf ipmi_ssif dcdbas iTCO_wdt iTCO_vendor_support pcspkr irqbypass sb_edac shpchp dca crc32_pclmul ghash_clmulni_intel edac_core lpc_ich aesni_intel lrw gf128mul glue_helper ablk_helper mei_me mei ipmi_si cryptd wmi ipmi_msghandler acpi_pad acpi_power_meter nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c sd_mod crc_t10dif crct10dif_generic mgag200 i2c_algo_bit drm_kms_helper syscopyarea sysfillrect sysimgblt ahci fb_sys_fops ttm libahci mlx5_core tg3 crct10dif_pclmul drm crct10dif_common ptp i2c_core libata crc32c_intel pps_core fjes dm_mirror dm_region_hash dm_log dm_mod CPU: 1 PID: 120 Comm: kworker/1:1 Not tainted 3.10.0-514.el7.x86_64 #1 Hardware name: Dell Inc. PowerEdge R320/0KM5PX, BIOS 2.4.2 01/29/2015 Workqueue: events check_lifetime task: ffff88031f506dd0 ti: ffff88031f584000 task.ti: ffff88031f584000 RIP: 0010:[] [] _raw_spin_lock_bh+0x17/0x50 RSP: 0018:ffff88031f587ba8 EFLAGS: 00010206 RAX: 0000000000020000 RBX: 20041fac02080072 RCX: ffff88031f587fd8 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 20041fac02080072 RBP: ffff88031f587bb0 R08: 0000000000000008 R09: ffffffff8155be77 R10: ffff880322a59b00 R11: ffffea000bf39f00 R12: 20041fac02080072 R13: 000000000000000d R14: ffff8800c4fbd800 R15: 0000000000000001 FS: 0000000000000000(0000) GS:ffff880322a40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f3c52d4547e CR3: 00000000019ba000 CR4: 00000000001407e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Stack: 20041fac02080002 ffff88031f587bd0 ffffffff81557830 20041fac02080002 ffff88031f587c78 ffff88031f587c40 ffffffff8155ae08 000000010157df32 0000000800000001 ffff88031f587c20 ffffffff81096acb ffffffff81aa37d0 Call Trace: [] lock_sock_nested+0x20/0x50 [] sock_setsockopt+0x78/0x940 [] ? lock_timer_base.isra.33+0x2b/0x50 [] kernel_setsockopt+0x4d/0x50 [] svc_age_temp_xprts_now+0x174/0x1e0 [sunrpc] [] nfsd_inetaddr_event+0x9d/0xd0 [nfsd] [] notifier_call_chain+0x4c/0x70 [] __blocking_notifier_call_chain+0x4d/0x70 [] blocking_notifier_call_chain+0x16/0x20 [] __inet_del_ifa+0x168/0x2d0 [] check_lifetime+0x25f/0x270 [] process_one_work+0x17b/0x470 [] worker_thread+0x126/0x410 [] ? rescuer_thread+0x460/0x460 [] kthread+0xcf/0xe0 [] ? kthread_create_on_node+0x140/0x140 [] ret_from_fork+0x58/0x90 [] ? kthread_create_on_node+0x140/0x140 Code: ca 75 f1 5d c3 0f 1f 80 00 00 00 00 eb d9 66 0f 1f 44 00 00 0f 1f 44 00 00 55 48 89 e5 53 48 89 fb e8 7e 04 a0 ff b8 00 00 02 00 0f c1 03 89 c2 c1 ea 10 66 39 c2 75 03 5b 5d c3 83 e2 fe 0f RIP [] _raw_spin_lock_bh+0x17/0x50 RSP Signed-off-by: Scott Mayhew Fixes: c3d4879e ("sunrpc: Add a function to close temporary transports immediately") Reviewed-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_xprt.h | 1 + net/sunrpc/svc_xprt.c | 11 +---------- net/sunrpc/svcsock.c | 21 +++++++++++++++++++++ net/sunrpc/xprtrdma/svc_rdma_transport.c | 6 ++++++ 4 files changed, 29 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index ab02a457da1f..e5d193440374 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -25,6 +25,7 @@ struct svc_xprt_ops { void (*xpo_detach)(struct svc_xprt *); void (*xpo_free)(struct svc_xprt *); int (*xpo_secure_port)(struct svc_rqst *); + void (*xpo_kill_temp_xprt)(struct svc_xprt *); }; struct svc_xprt_class { diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index c3f652395a80..3bc1d61694cb 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -1002,14 +1002,8 @@ static void svc_age_temp_xprts(unsigned long closure) void svc_age_temp_xprts_now(struct svc_serv *serv, struct sockaddr *server_addr) { struct svc_xprt *xprt; - struct svc_sock *svsk; - struct socket *sock; struct list_head *le, *next; LIST_HEAD(to_be_closed); - struct linger no_linger = { - .l_onoff = 1, - .l_linger = 0, - }; spin_lock_bh(&serv->sv_lock); list_for_each_safe(le, next, &serv->sv_tempsocks) { @@ -1027,10 +1021,7 @@ void svc_age_temp_xprts_now(struct svc_serv *serv, struct sockaddr *server_addr) list_del_init(le); xprt = list_entry(le, struct svc_xprt, xpt_list); dprintk("svc_age_temp_xprts_now: closing %p\n", xprt); - svsk = container_of(xprt, struct svc_sock, sk_xprt); - sock = svsk->sk_sock; - kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, - (char *)&no_linger, sizeof(no_linger)); + xprt->xpt_ops->xpo_kill_temp_xprt(xprt); svc_close_xprt(xprt); } } diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 57625f64efd5..a4bc98265d88 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -438,6 +438,21 @@ static int svc_tcp_has_wspace(struct svc_xprt *xprt) return !test_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); } +static void svc_tcp_kill_temp_xprt(struct svc_xprt *xprt) +{ + struct svc_sock *svsk; + struct socket *sock; + struct linger no_linger = { + .l_onoff = 1, + .l_linger = 0, + }; + + svsk = container_of(xprt, struct svc_sock, sk_xprt); + sock = svsk->sk_sock; + kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, + (char *)&no_linger, sizeof(no_linger)); +} + /* * See net/ipv6/ip_sockglue.c : ip_cmsg_recv_pktinfo */ @@ -648,6 +663,10 @@ static struct svc_xprt *svc_udp_accept(struct svc_xprt *xprt) return NULL; } +static void svc_udp_kill_temp_xprt(struct svc_xprt *xprt) +{ +} + static struct svc_xprt *svc_udp_create(struct svc_serv *serv, struct net *net, struct sockaddr *sa, int salen, @@ -667,6 +686,7 @@ static struct svc_xprt_ops svc_udp_ops = { .xpo_has_wspace = svc_udp_has_wspace, .xpo_accept = svc_udp_accept, .xpo_secure_port = svc_sock_secure_port, + .xpo_kill_temp_xprt = svc_udp_kill_temp_xprt, }; static struct svc_xprt_class svc_udp_class = { @@ -1242,6 +1262,7 @@ static struct svc_xprt_ops svc_tcp_ops = { .xpo_has_wspace = svc_tcp_has_wspace, .xpo_accept = svc_tcp_accept, .xpo_secure_port = svc_sock_secure_port, + .xpo_kill_temp_xprt = svc_tcp_kill_temp_xprt, }; static struct svc_xprt_class svc_tcp_class = { diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 6864fb967038..1334de2715c2 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -67,6 +67,7 @@ static void svc_rdma_detach(struct svc_xprt *xprt); static void svc_rdma_free(struct svc_xprt *xprt); static int svc_rdma_has_wspace(struct svc_xprt *xprt); static int svc_rdma_secure_port(struct svc_rqst *); +static void svc_rdma_kill_temp_xprt(struct svc_xprt *); static struct svc_xprt_ops svc_rdma_ops = { .xpo_create = svc_rdma_create, @@ -79,6 +80,7 @@ static struct svc_xprt_ops svc_rdma_ops = { .xpo_has_wspace = svc_rdma_has_wspace, .xpo_accept = svc_rdma_accept, .xpo_secure_port = svc_rdma_secure_port, + .xpo_kill_temp_xprt = svc_rdma_kill_temp_xprt, }; struct svc_xprt_class svc_rdma_class = { @@ -1317,6 +1319,10 @@ static int svc_rdma_secure_port(struct svc_rqst *rqstp) return 1; } +static void svc_rdma_kill_temp_xprt(struct svc_xprt *xprt) +{ +} + int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) { struct ib_send_wr *bad_wr, *n_wr; -- cgit From f23cc643f9baec7f71f2b74692da3cf03abbbfda Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 14 Nov 2016 15:45:36 -0500 Subject: bpf: fix range arithmetic for bpf map access I made some invalid assumptions with BPF_AND and BPF_MOD that could result in invalid accesses to bpf map entries. Fix this up by doing a few things 1) Kill BPF_MOD support. This doesn't actually get used by the compiler in real life and just adds extra complexity. 2) Fix the logic for BPF_AND, don't allow AND of negative numbers and set the minimum value to 0 for positive AND's. 3) Don't do operations on the ranges if they are set to the limits, as they are by definition undefined, and allowing arithmetic operations on those values could make them appear valid when they really aren't. This fixes the testcase provided by Jann as well as a few other theoretical problems. Reported-by: Jann Horn Signed-off-by: Josef Bacik Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf_verifier.h | 5 ++-- kernel/bpf/verifier.c | 70 +++++++++++++++++++++++++++++--------------- 2 files changed, 50 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 7035b997aaa5..6aaf425cebc3 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -14,7 +14,7 @@ * are obviously wrong for any sort of memory access. */ #define BPF_REGISTER_MAX_RANGE (1024 * 1024 * 1024) -#define BPF_REGISTER_MIN_RANGE -(1024 * 1024 * 1024) +#define BPF_REGISTER_MIN_RANGE -1 struct bpf_reg_state { enum bpf_reg_type type; @@ -22,7 +22,8 @@ struct bpf_reg_state { * Used to determine if any memory access using this register will * result in a bad access. */ - u64 min_value, max_value; + s64 min_value; + u64 max_value; union { /* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */ s64 imm; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 99a7e5b388f2..6a936159c6e0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -216,8 +216,8 @@ static void print_verifier_state(struct bpf_verifier_state *state) reg->map_ptr->key_size, reg->map_ptr->value_size); if (reg->min_value != BPF_REGISTER_MIN_RANGE) - verbose(",min_value=%llu", - (unsigned long long)reg->min_value); + verbose(",min_value=%lld", + (long long)reg->min_value); if (reg->max_value != BPF_REGISTER_MAX_RANGE) verbose(",max_value=%llu", (unsigned long long)reg->max_value); @@ -758,7 +758,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, * index'es we need to make sure that whatever we use * will have a set floor within our range. */ - if ((s64)reg->min_value < 0) { + if (reg->min_value < 0) { verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", regno); return -EACCES; @@ -1468,7 +1468,8 @@ static void check_reg_overflow(struct bpf_reg_state *reg) { if (reg->max_value > BPF_REGISTER_MAX_RANGE) reg->max_value = BPF_REGISTER_MAX_RANGE; - if ((s64)reg->min_value < BPF_REGISTER_MIN_RANGE) + if (reg->min_value < BPF_REGISTER_MIN_RANGE || + reg->min_value > BPF_REGISTER_MAX_RANGE) reg->min_value = BPF_REGISTER_MIN_RANGE; } @@ -1476,7 +1477,8 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, struct bpf_insn *insn) { struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg; - u64 min_val = BPF_REGISTER_MIN_RANGE, max_val = BPF_REGISTER_MAX_RANGE; + s64 min_val = BPF_REGISTER_MIN_RANGE; + u64 max_val = BPF_REGISTER_MAX_RANGE; bool min_set = false, max_set = false; u8 opcode = BPF_OP(insn->code); @@ -1512,22 +1514,43 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, return; } + /* If one of our values was at the end of our ranges then we can't just + * do our normal operations to the register, we need to set the values + * to the min/max since they are undefined. + */ + if (min_val == BPF_REGISTER_MIN_RANGE) + dst_reg->min_value = BPF_REGISTER_MIN_RANGE; + if (max_val == BPF_REGISTER_MAX_RANGE) + dst_reg->max_value = BPF_REGISTER_MAX_RANGE; + switch (opcode) { case BPF_ADD: - dst_reg->min_value += min_val; - dst_reg->max_value += max_val; + if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) + dst_reg->min_value += min_val; + if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) + dst_reg->max_value += max_val; break; case BPF_SUB: - dst_reg->min_value -= min_val; - dst_reg->max_value -= max_val; + if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) + dst_reg->min_value -= min_val; + if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) + dst_reg->max_value -= max_val; break; case BPF_MUL: - dst_reg->min_value *= min_val; - dst_reg->max_value *= max_val; + if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) + dst_reg->min_value *= min_val; + if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) + dst_reg->max_value *= max_val; break; case BPF_AND: - /* & is special since it could end up with 0 bits set. */ - dst_reg->min_value &= min_val; + /* Disallow AND'ing of negative numbers, ain't nobody got time + * for that. Otherwise the minimum is 0 and the max is the max + * value we could AND against. + */ + if (min_val < 0) + dst_reg->min_value = BPF_REGISTER_MIN_RANGE; + else + dst_reg->min_value = 0; dst_reg->max_value = max_val; break; case BPF_LSH: @@ -1537,24 +1560,25 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, */ if (min_val > ilog2(BPF_REGISTER_MAX_RANGE)) dst_reg->min_value = BPF_REGISTER_MIN_RANGE; - else + else if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) dst_reg->min_value <<= min_val; if (max_val > ilog2(BPF_REGISTER_MAX_RANGE)) dst_reg->max_value = BPF_REGISTER_MAX_RANGE; - else + else if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) dst_reg->max_value <<= max_val; break; case BPF_RSH: - dst_reg->min_value >>= min_val; - dst_reg->max_value >>= max_val; - break; - case BPF_MOD: - /* % is special since it is an unsigned modulus, so the floor - * will always be 0. + /* RSH by a negative number is undefined, and the BPF_RSH is an + * unsigned shift, so make the appropriate casts. */ - dst_reg->min_value = 0; - dst_reg->max_value = max_val - 1; + if (min_val < 0 || dst_reg->min_value < 0) + dst_reg->min_value = BPF_REGISTER_MIN_RANGE; + else + dst_reg->min_value = + (u64)(dst_reg->min_value) >> min_val; + if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) + dst_reg->max_value >>= max_val; break; default: reset_reg_range_values(regs, insn->dst_reg); -- cgit From 5d1904204c99596b50a700f092fe49d78edba400 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Thu, 10 Nov 2016 17:16:33 +0800 Subject: mremap: fix race between mremap() and page cleanning Prior to 3.15, there was a race between zap_pte_range() and page_mkclean() where writes to a page could be lost. Dave Hansen discovered by inspection that there is a similar race between move_ptes() and page_mkclean(). We've been able to reproduce the issue by enlarging the race window with a msleep(), but have not been able to hit it without modifying the code. So, we think it's a real issue, but is difficult or impossible to hit in practice. The zap_pte_range() issue is fixed by commit 1cf35d47712d("mm: split 'tlb_flush_mmu()' into tlb flushing and memory freeing parts"). And this patch is to fix the race between page_mkclean() and mremap(). Here is one possible way to hit the race: suppose a process mmapped a file with READ | WRITE and SHARED, it has two threads and they are bound to 2 different CPUs, e.g. CPU1 and CPU2. mmap returned X, then thread 1 did a write to addr X so that CPU1 now has a writable TLB for addr X on it. Thread 2 starts mremaping from addr X to Y while thread 1 cleaned the page and then did another write to the old addr X again. The 2nd write from thread 1 could succeed but the value will get lost. thread 1 thread 2 (bound to CPU1) (bound to CPU2) 1: write 1 to addr X to get a writeable TLB on this CPU 2: mremap starts 3: move_ptes emptied PTE for addr X and setup new PTE for addr Y and then dropped PTL for X and Y 4: page laundering for N by doing fadvise FADV_DONTNEED. When done, pageframe N is deemed clean. 5: *write 2 to addr X 6: tlb flush for addr X 7: munmap (Y, pagesize) to make the page unmapped 8: fadvise with FADV_DONTNEED again to kick the page off the pagecache 9: pread the page from file to verify the value. If 1 is there, it means we have lost the written 2. *the write may or may not cause segmentation fault, it depends on if the TLB is still on the CPU. Please note that this is only one specific way of how the race could occur, it didn't mean that the race could only occur in exact the above config, e.g. more than 2 threads could be involved and fadvise() could be done in another thread, etc. For anonymous pages, they could race between mremap() and page reclaim: THP: a huge PMD is moved by mremap to a new huge PMD, then the new huge PMD gets unmapped/splitted/pagedout before the flush tlb happened for the old huge PMD in move_page_tables() and we could still write data to it. The normal anonymous page has similar situation. To fix this, check for any dirty PTE in move_ptes()/move_huge_pmd() and if any, did the flush before dropping the PTL. If we did the flush for every move_ptes()/move_huge_pmd() call then we do not need to do the flush in move_pages_tables() for the whole range. But if we didn't, we still need to do the whole range flush. Alternatively, we can track which part of the range is flushed in move_ptes()/move_huge_pmd() and which didn't to avoid flushing the whole range in move_page_tables(). But that would require multiple tlb flushes for the different sub-ranges and should be less efficient than the single whole range flush. KBuild test on my Sandybridge desktop doesn't show any noticeable change. v4.9-rc4: real 5m14.048s user 32m19.800s sys 4m50.320s With this commit: real 5m13.888s user 32m19.330s sys 4m51.200s Reported-by: Dave Hansen Signed-off-by: Aaron Lu Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 2 +- mm/huge_memory.c | 9 ++++++++- mm/mremap.c | 30 +++++++++++++++++++++--------- 3 files changed, 30 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 9b9f65d99873..e35e6de633b9 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -22,7 +22,7 @@ extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned char *vec); extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, unsigned long old_end, - pmd_t *old_pmd, pmd_t *new_pmd); + pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush); extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, pgprot_t newprot, int prot_numa); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index cdcd25cb30fe..eff3de359d50 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1426,11 +1426,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, unsigned long old_end, - pmd_t *old_pmd, pmd_t *new_pmd) + pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush) { spinlock_t *old_ptl, *new_ptl; pmd_t pmd; struct mm_struct *mm = vma->vm_mm; + bool force_flush = false; if ((old_addr & ~HPAGE_PMD_MASK) || (new_addr & ~HPAGE_PMD_MASK) || @@ -1455,6 +1456,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, new_ptl = pmd_lockptr(mm, new_pmd); if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); + if (pmd_present(*old_pmd) && pmd_dirty(*old_pmd)) + force_flush = true; pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd); VM_BUG_ON(!pmd_none(*new_pmd)); @@ -1467,6 +1470,10 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); if (new_ptl != old_ptl) spin_unlock(new_ptl); + if (force_flush) + flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE); + else + *need_flush = true; spin_unlock(old_ptl); return true; } diff --git a/mm/mremap.c b/mm/mremap.c index da22ad2a5678..6ccecc03f56a 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -104,11 +104,13 @@ static pte_t move_soft_dirty_pte(pte_t pte) static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, unsigned long old_addr, unsigned long old_end, struct vm_area_struct *new_vma, pmd_t *new_pmd, - unsigned long new_addr, bool need_rmap_locks) + unsigned long new_addr, bool need_rmap_locks, bool *need_flush) { struct mm_struct *mm = vma->vm_mm; pte_t *old_pte, *new_pte, pte; spinlock_t *old_ptl, *new_ptl; + bool force_flush = false; + unsigned long len = old_end - old_addr; /* * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma @@ -146,6 +148,14 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, new_pte++, new_addr += PAGE_SIZE) { if (pte_none(*old_pte)) continue; + + /* + * We are remapping a dirty PTE, make sure to + * flush TLB before we drop the PTL for the + * old PTE or we may race with page_mkclean(). + */ + if (pte_present(*old_pte) && pte_dirty(*old_pte)) + force_flush = true; pte = ptep_get_and_clear(mm, old_addr, old_pte); pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); pte = move_soft_dirty_pte(pte); @@ -156,6 +166,10 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, if (new_ptl != old_ptl) spin_unlock(new_ptl); pte_unmap(new_pte - 1); + if (force_flush) + flush_tlb_range(vma, old_end - len, old_end); + else + *need_flush = true; pte_unmap_unlock(old_pte - 1, old_ptl); if (need_rmap_locks) drop_rmap_locks(vma); @@ -201,13 +215,12 @@ unsigned long move_page_tables(struct vm_area_struct *vma, if (need_rmap_locks) take_rmap_locks(vma); moved = move_huge_pmd(vma, old_addr, new_addr, - old_end, old_pmd, new_pmd); + old_end, old_pmd, new_pmd, + &need_flush); if (need_rmap_locks) drop_rmap_locks(vma); - if (moved) { - need_flush = true; + if (moved) continue; - } } split_huge_pmd(vma, old_pmd, old_addr); if (pmd_trans_unstable(old_pmd)) @@ -220,11 +233,10 @@ unsigned long move_page_tables(struct vm_area_struct *vma, extent = next - new_addr; if (extent > LATENCY_LIMIT) extent = LATENCY_LIMIT; - move_ptes(vma, old_pmd, old_addr, old_addr + extent, - new_vma, new_pmd, new_addr, need_rmap_locks); - need_flush = true; + move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma, + new_pmd, new_addr, need_rmap_locks, &need_flush); } - if (likely(need_flush)) + if (need_flush) flush_tlb_range(vma, old_end-len, old_addr); mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); -- cgit From 910170442944e1f8674fd5ddbeeb8ccd1877ea98 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 12 Sep 2016 10:49:11 +0800 Subject: iommu/vt-d: Fix PASID table allocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Somehow I ended up with an off-by-three error in calculating the size of the PASID and PASID State tables, which triggers allocations failures as those tables unfortunately have to be physically contiguous. In fact, even the *correct* maximum size of 8MiB is problematic and is wont to lead to allocation failures. Since I have extracted a promise that this *will* be fixed in hardware, I'm happy to limit it on the current hardware to a maximum of 0x20000 PASIDs, which gives us 1MiB tables — still not ideal, but better than before. Reported by Mika Kuoppala and also by Xunlei Pang who submitted a simpler patch to fix only the allocation (and not the free) to the "correct" limit... which was still problematic. Signed-off-by: David Woodhouse Cc: stable@vger.kernel.org --- drivers/iommu/intel-svm.c | 28 +++++++++++++++++----------- include/linux/intel-iommu.h | 1 + 2 files changed, 18 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 8ebb3530afa7..cb72e0011310 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -39,10 +39,18 @@ int intel_svm_alloc_pasid_tables(struct intel_iommu *iommu) struct page *pages; int order; - order = ecap_pss(iommu->ecap) + 7 - PAGE_SHIFT; - if (order < 0) - order = 0; - + /* Start at 2 because it's defined as 2^(1+PSS) */ + iommu->pasid_max = 2 << ecap_pss(iommu->ecap); + + /* Eventually I'm promised we will get a multi-level PASID table + * and it won't have to be physically contiguous. Until then, + * limit the size because 8MiB contiguous allocations can be hard + * to come by. The limit of 0x20000, which is 1MiB for each of + * the PASID and PASID-state tables, is somewhat arbitrary. */ + if (iommu->pasid_max > 0x20000) + iommu->pasid_max = 0x20000; + + order = get_order(sizeof(struct pasid_entry) * iommu->pasid_max); pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order); if (!pages) { pr_warn("IOMMU: %s: Failed to allocate PASID table\n", @@ -53,6 +61,8 @@ int intel_svm_alloc_pasid_tables(struct intel_iommu *iommu) pr_info("%s: Allocated order %d PASID table.\n", iommu->name, order); if (ecap_dis(iommu->ecap)) { + /* Just making it explicit... */ + BUILD_BUG_ON(sizeof(struct pasid_entry) != sizeof(struct pasid_state_entry)); pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order); if (pages) iommu->pasid_state_table = page_address(pages); @@ -68,11 +78,7 @@ int intel_svm_alloc_pasid_tables(struct intel_iommu *iommu) int intel_svm_free_pasid_tables(struct intel_iommu *iommu) { - int order; - - order = ecap_pss(iommu->ecap) + 7 - PAGE_SHIFT; - if (order < 0) - order = 0; + int order = get_order(sizeof(struct pasid_entry) * iommu->pasid_max); if (iommu->pasid_table) { free_pages((unsigned long)iommu->pasid_table, order); @@ -371,8 +377,8 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ } svm->iommu = iommu; - if (pasid_max > 2 << ecap_pss(iommu->ecap)) - pasid_max = 2 << ecap_pss(iommu->ecap); + if (pasid_max > iommu->pasid_max) + pasid_max = iommu->pasid_max; /* Do not use PASID 0 in caching mode (virtualised IOMMU) */ ret = idr_alloc(&iommu->pasid_idr, svm, diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 2d9b650047a5..d49e26c6cdc7 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -429,6 +429,7 @@ struct intel_iommu { struct page_req_dsc *prq; unsigned char prq_name[16]; /* Name for PRQ interrupt */ struct idr pasid_idr; + u32 pasid_max; #endif struct q_inval *qi; /* Queued invalidation info */ u32 *iommu_state; /* Store iommu states between suspend and resume.*/ -- cgit From 8e5bfa8c1f8471aa4a2d30be631ef2b50e10abaf Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 14 Nov 2016 19:46:12 +0100 Subject: sched/autogroup: Do not use autogroup->tg in zombie threads Exactly because for_each_thread() in autogroup_move_group() can't see it and update its ->sched_task_group before _put() and possibly free(). So the exiting task needs another sched_move_task() before exit_notify() and we need to re-introduce the PF_EXITING (or similar) check removed by the previous change for another reason. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: hartsjc@redhat.com Cc: vbendel@redhat.com Cc: vlovejoy@redhat.com Link: http://lkml.kernel.org/r/20161114184612.GA15968@redhat.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ kernel/exit.c | 1 + kernel/sched/auto_group.c | 19 +++++++++++++++++++ 3 files changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 348f51b0ec92..e9c009dc3a4a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2567,6 +2567,7 @@ extern void sched_autogroup_create_attach(struct task_struct *p); extern void sched_autogroup_detach(struct task_struct *p); extern void sched_autogroup_fork(struct signal_struct *sig); extern void sched_autogroup_exit(struct signal_struct *sig); +extern void sched_autogroup_exit_task(struct task_struct *p); #ifdef CONFIG_PROC_FS extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m); extern int proc_sched_autogroup_set_nice(struct task_struct *p, int nice); @@ -2576,6 +2577,7 @@ static inline void sched_autogroup_create_attach(struct task_struct *p) { } static inline void sched_autogroup_detach(struct task_struct *p) { } static inline void sched_autogroup_fork(struct signal_struct *sig) { } static inline void sched_autogroup_exit(struct signal_struct *sig) { } +static inline void sched_autogroup_exit_task(struct task_struct *p) { } #endif extern int yield_to(struct task_struct *p, bool preempt); diff --git a/kernel/exit.c b/kernel/exit.c index 9d68c45ebbe3..3076f3089919 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -836,6 +836,7 @@ void __noreturn do_exit(long code) */ perf_event_exit_task(tsk); + sched_autogroup_exit_task(tsk); cgroup_exit(tsk); /* diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index ad2b19ad6ca0..f1c8fd566246 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -115,10 +115,26 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) * If we race with autogroup_move_group() the caller can use the old * value of signal->autogroup but in this case sched_move_task() will * be called again before autogroup_kref_put(). + * + * However, there is no way sched_autogroup_exit_task() could tell us + * to avoid autogroup->tg, so we abuse PF_EXITING flag for this case. */ + if (p->flags & PF_EXITING) + return false; + return true; } +void sched_autogroup_exit_task(struct task_struct *p) +{ + /* + * We are going to call exit_notify() and autogroup_move_group() can't + * see this thread after that: we can no longer use signal->autogroup. + * See the PF_EXITING check in task_wants_autogroup(). + */ + sched_move_task(p); +} + static void autogroup_move_group(struct task_struct *p, struct autogroup *ag) { @@ -142,6 +158,9 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) * In the latter case for_each_thread() can not miss a migrating thread, * cpu_cgroup_attach() must not be possible after cgroup_exit() and it * can't be removed from thread list, we hold ->siglock. + * + * If an exiting thread was already removed from thread list we rely on + * sched_autogroup_exit_task(). */ for_each_thread(p, t) sched_move_task(t); -- cgit From e784930bd645e7df78c66e7872fec282b0620075 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 2 Nov 2016 16:35:51 -0600 Subject: PCI: Export pcie_find_root_port Export pcie_find_root_port() so we can use it outside of PCIe-AER error injection. Signed-off-by: Johannes Thumshirn Signed-off-by: Bjorn Helgaas --- drivers/pci/pcie/aer/aer_inject.c | 14 -------------- include/linux/pci.h | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/pcie/aer/aer_inject.c b/drivers/pci/pcie/aer/aer_inject.c index db553dc22c8e..2b6a59266689 100644 --- a/drivers/pci/pcie/aer/aer_inject.c +++ b/drivers/pci/pcie/aer/aer_inject.c @@ -307,20 +307,6 @@ out: return 0; } -static struct pci_dev *pcie_find_root_port(struct pci_dev *dev) -{ - while (1) { - if (!pci_is_pcie(dev)) - break; - if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT) - return dev; - if (!dev->bus->self) - break; - dev = dev->bus->self; - } - return NULL; -} - static int find_aer_device_iter(struct device *device, void *data) { struct pcie_device **result = data; diff --git a/include/linux/pci.h b/include/linux/pci.h index 0e49f70dbd9b..a38772a85588 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1928,6 +1928,20 @@ static inline int pci_pcie_type(const struct pci_dev *dev) return (pcie_caps_reg(dev) & PCI_EXP_FLAGS_TYPE) >> 4; } +static inline struct pci_dev *pcie_find_root_port(struct pci_dev *dev) +{ + while (1) { + if (!pci_is_pcie(dev)) + break; + if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT) + return dev; + if (!dev->bus->self) + break; + dev = dev->bus->self; + } + return NULL; +} + void pci_request_acs(void); bool pci_acs_enabled(struct pci_dev *pdev, u16 acs_flags); bool pci_acs_path_enabled(struct pci_dev *start, -- cgit From 920c1cd36642ac21a7b2fdc47ab44b9634d570f9 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 21 Nov 2016 18:28:36 -0800 Subject: netdevice.h: fix kernel-doc warning Fix kernel-doc warning in (missing ':'): ..//include/linux/netdevice.h:1904: warning: No description found for parameter 'prio_tc_map[TC_BITMASK + 1]' Signed-off-by: Randy Dunlap Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index bf04a46f6d5b..e16a2a980ea8 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1619,7 +1619,7 @@ enum netdev_priv_flags { * @dcbnl_ops: Data Center Bridging netlink ops * @num_tc: Number of traffic classes in the net device * @tc_to_txq: XXX: need comments on this one - * @prio_tc_map XXX: need comments on this one + * @prio_tc_map: XXX: need comments on this one * * @fcoe_ddp_xid: Max exchange id for FCoE LRO by ddp * -- cgit From b4353708f5a1c084fd73f1b6fd243b142157b173 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Sun, 27 Nov 2016 19:20:51 +0200 Subject: Revert "net/mlx4_en: Avoid unregister_netdev at shutdown flow" This reverts commit 9d76931180557270796f9631e2c79b9c7bb3c9fb. Using unregister_netdev at shutdown flow prevents calling the netdev's ndos or trying to access its freed resources. This fixes crashes like the following: Call Trace: [] dev_get_phys_port_id+0x1e/0x30 [] rtnl_fill_ifinfo+0x4be/0xff0 [] rtmsg_ifinfo_build_skb+0x73/0xe0 [] rtmsg_ifinfo.part.27+0x16/0x50 [] rtmsg_ifinfo+0x18/0x20 [] netdev_state_change+0x46/0x50 [] linkwatch_do_dev+0x38/0x50 [] __linkwatch_run_queue+0xf5/0x170 [] linkwatch_event+0x25/0x30 [] process_one_work+0x152/0x400 [] worker_thread+0x125/0x4b0 [] ? rescuer_thread+0x350/0x350 [] kthread+0xca/0xe0 [] ? kthread_park+0x60/0x60 [] ret_from_fork+0x25/0x30 Fixes: 9d7693118055 ("net/mlx4_en: Avoid unregister_netdev at shutdown flow") Signed-off-by: Tariq Toukan Reported-by: Sebastian Ott Reported-by: Steve Wise Cc: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 17 ++--------------- drivers/net/ethernet/mellanox/mlx4/main.c | 5 +---- include/linux/mlx4/device.h | 1 - 3 files changed, 3 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index a60f635da78b..fb8bb027b69c 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2079,13 +2079,6 @@ err: return -ENOMEM; } -static void mlx4_en_shutdown(struct net_device *dev) -{ - rtnl_lock(); - netif_device_detach(dev); - mlx4_en_close(dev); - rtnl_unlock(); -} static int mlx4_en_copy_priv(struct mlx4_en_priv *dst, struct mlx4_en_priv *src, @@ -2162,8 +2155,6 @@ void mlx4_en_destroy_netdev(struct net_device *dev) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_dev *mdev = priv->mdev; - bool shutdown = mdev->dev->persist->interface_state & - MLX4_INTERFACE_STATE_SHUTDOWN; en_dbg(DRV, priv, "Destroying netdev on port:%d\n", priv->port); @@ -2171,10 +2162,7 @@ void mlx4_en_destroy_netdev(struct net_device *dev) if (priv->registered) { devlink_port_type_clear(mlx4_get_devlink_port(mdev->dev, priv->port)); - if (shutdown) - mlx4_en_shutdown(dev); - else - unregister_netdev(dev); + unregister_netdev(dev); } if (priv->allocated) @@ -2203,8 +2191,7 @@ void mlx4_en_destroy_netdev(struct net_device *dev) kfree(priv->tx_ring); kfree(priv->tx_cq); - if (!shutdown) - free_netdev(dev); + free_netdev(dev); } static int mlx4_en_change_mtu(struct net_device *dev, int new_mtu) diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 6f4e67bc3538..75d07fa9d0b1 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -4147,11 +4147,8 @@ static void mlx4_shutdown(struct pci_dev *pdev) mlx4_info(persist->dev, "mlx4_shutdown was called\n"); mutex_lock(&persist->interface_state_mutex); - if (persist->interface_state & MLX4_INTERFACE_STATE_UP) { - /* Notify mlx4 clients that the kernel is being shut down */ - persist->interface_state |= MLX4_INTERFACE_STATE_SHUTDOWN; + if (persist->interface_state & MLX4_INTERFACE_STATE_UP) mlx4_unload_one(pdev); - } mutex_unlock(&persist->interface_state_mutex); } diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 3be7abd6e722..c9f379689dd0 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -476,7 +476,6 @@ enum { enum { MLX4_INTERFACE_STATE_UP = 1 << 0, MLX4_INTERFACE_STATE_DELETION = 1 << 1, - MLX4_INTERFACE_STATE_SHUTDOWN = 1 << 2, }; #define MSTR_SM_CHANGE_MASK (MLX4_EQ_PORT_INFO_MSTR_SM_SL_CHANGE_MASK | \ -- cgit From 3f65047c853a2a5abcd8ac1984af3452b5df4ada Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 28 Nov 2016 19:24:55 +0100 Subject: of_mdio: add helper to deregister fixed-link PHYs Add helper to deregister fixed-link PHYs registered using of_phy_register_fixed_link(). Convert the two drivers that care to deregister their fixed-link PHYs to use the new helper, but note that most drivers currently fail to do so. Signed-off-by: Johan Hovold Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpsw.c | 16 ++-------------- drivers/of/of_mdio.c | 15 +++++++++++++++ include/linux/of_mdio.h | 4 ++++ net/dsa/dsa.c | 12 ++---------- 4 files changed, 23 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index 58947aae31c7..9f0646512624 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -2459,20 +2459,8 @@ static void cpsw_remove_dt(struct platform_device *pdev) if (strcmp(slave_node->name, "slave")) continue; - if (of_phy_is_fixed_link(slave_node)) { - struct phy_device *phydev; - - phydev = of_phy_find_device(slave_node); - if (phydev) { - fixed_phy_unregister(phydev); - /* Put references taken by - * of_phy_find_device() and - * of_phy_register_fixed_link(). - */ - phy_device_free(phydev); - phy_device_free(phydev); - } - } + if (of_phy_is_fixed_link(slave_node)) + of_phy_deregister_fixed_link(slave_node); of_node_put(slave_data->phy_node); diff --git a/drivers/of/of_mdio.c b/drivers/of/of_mdio.c index 5a3145a02547..262281bd68fa 100644 --- a/drivers/of/of_mdio.c +++ b/drivers/of/of_mdio.c @@ -490,3 +490,18 @@ int of_phy_register_fixed_link(struct device_node *np) return -ENODEV; } EXPORT_SYMBOL(of_phy_register_fixed_link); + +void of_phy_deregister_fixed_link(struct device_node *np) +{ + struct phy_device *phydev; + + phydev = of_phy_find_device(np); + if (!phydev) + return; + + fixed_phy_unregister(phydev); + + put_device(&phydev->mdio.dev); /* of_phy_find_device() */ + phy_device_free(phydev); /* fixed_phy_register() */ +} +EXPORT_SYMBOL(of_phy_deregister_fixed_link); diff --git a/include/linux/of_mdio.h b/include/linux/of_mdio.h index 2ab233661ae5..a58cca8bcb29 100644 --- a/include/linux/of_mdio.h +++ b/include/linux/of_mdio.h @@ -29,6 +29,7 @@ struct phy_device *of_phy_attach(struct net_device *dev, extern struct mii_bus *of_mdio_find_bus(struct device_node *mdio_np); extern int of_mdio_parse_addr(struct device *dev, const struct device_node *np); extern int of_phy_register_fixed_link(struct device_node *np); +extern void of_phy_deregister_fixed_link(struct device_node *np); extern bool of_phy_is_fixed_link(struct device_node *np); #else /* CONFIG_OF */ @@ -83,6 +84,9 @@ static inline int of_phy_register_fixed_link(struct device_node *np) { return -ENOSYS; } +static inline void of_phy_deregister_fixed_link(struct device_node *np) +{ +} static inline bool of_phy_is_fixed_link(struct device_node *np) { return false; diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index cb0091b99592..7899919cd9f0 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -506,16 +506,8 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, void dsa_cpu_dsa_destroy(struct device_node *port_dn) { - struct phy_device *phydev; - - if (of_phy_is_fixed_link(port_dn)) { - phydev = of_phy_find_device(port_dn); - if (phydev) { - fixed_phy_unregister(phydev); - put_device(&phydev->mdio.dev); - phy_device_free(phydev); - } - } + if (of_phy_is_fixed_link(port_dn)) + of_phy_deregister_fixed_link(port_dn); } static void dsa_switch_destroy(struct dsa_switch *ds) -- cgit From 045d599a286bc01daa3510d59272440a17b23c2e Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Wed, 30 Nov 2016 15:54:13 -0800 Subject: kasan: update kasan_global for gcc 7 kasan_global struct is part of compiler/runtime ABI. gcc revision 241983 has added a new field to kasan_global struct. Update kernel definition of kasan_global struct to include the new field. Without this patch KASAN is broken with gcc 7. Link: http://lkml.kernel.org/r/1479219743-28682-1-git-send-email-dvyukov@google.com Signed-off-by: Dmitry Vyukov Acked-by: Andrey Ryabinin Cc: Alexander Potapenko Cc: [4.0+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compiler-gcc.h | 4 +++- mm/kasan/kasan.h | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 432f5c97e18f..928e5ca0caee 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -263,7 +263,9 @@ #endif #endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP && !__CHECKER__ */ -#if GCC_VERSION >= 50000 +#if GCC_VERSION >= 70000 +#define KASAN_ABI_VERSION 5 +#elif GCC_VERSION >= 50000 #define KASAN_ABI_VERSION 4 #elif GCC_VERSION >= 40902 #define KASAN_ABI_VERSION 3 diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index e5c2181fee6f..03f4545b103d 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -53,6 +53,9 @@ struct kasan_global { #if KASAN_ABI_VERSION >= 4 struct kasan_source_location *location; #endif +#if KASAN_ABI_VERSION >= 5 + char *odr_indicator; +#endif }; /** -- cgit From 5cbc198ae08d84bd416b672ad8bd1222acd0855c Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 30 Nov 2016 15:54:19 -0800 Subject: mm: fix false-positive WARN_ON() in truncate/invalidate for hugetlb Hugetlb pages have ->index in size of the huge pages (PMD_SIZE or PUD_SIZE), not in PAGE_SIZE as other types of pages. This means we cannot user page_to_pgoff() to check whether we've got the right page for the radix-tree index. Let's introduce page_to_index() which would return radix-tree index for given page. We will be able to get rid of this once hugetlb will be switched to multi-order entries. Fixes: fc127da085c2 ("truncate: handle file thp") Link: http://lkml.kernel.org/r/20161123093053.mjbnvn5zwxw5e6lk@black.fi.intel.com Signed-off-by: Kirill A. Shutemov Reported-by: Doug Nelson Tested-by: Doug Nelson Reviewed-by: Naoya Horiguchi Cc: [4.8+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 21 +++++++++++++++------ mm/truncate.c | 8 ++++---- 2 files changed, 19 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index dd15d39e1985..7dbe9148b2f8 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -374,16 +374,13 @@ static inline struct page *read_mapping_page(struct address_space *mapping, } /* - * Get the offset in PAGE_SIZE. - * (TODO: hugepage should have ->index in PAGE_SIZE) + * Get index of the page with in radix-tree + * (TODO: remove once hugetlb pages will have ->index in PAGE_SIZE) */ -static inline pgoff_t page_to_pgoff(struct page *page) +static inline pgoff_t page_to_index(struct page *page) { pgoff_t pgoff; - if (unlikely(PageHeadHuge(page))) - return page->index << compound_order(page); - if (likely(!PageTransTail(page))) return page->index; @@ -396,6 +393,18 @@ static inline pgoff_t page_to_pgoff(struct page *page) return pgoff; } +/* + * Get the offset in PAGE_SIZE. + * (TODO: hugepage should have ->index in PAGE_SIZE) + */ +static inline pgoff_t page_to_pgoff(struct page *page) +{ + if (unlikely(PageHeadHuge(page))) + return page->index << compound_order(page); + + return page_to_index(page); +} + /* * Return byte-offset into filesystem object for page. */ diff --git a/mm/truncate.c b/mm/truncate.c index a01cce450a26..8d8c62d89e6d 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -283,7 +283,7 @@ void truncate_inode_pages_range(struct address_space *mapping, if (!trylock_page(page)) continue; - WARN_ON(page_to_pgoff(page) != index); + WARN_ON(page_to_index(page) != index); if (PageWriteback(page)) { unlock_page(page); continue; @@ -371,7 +371,7 @@ void truncate_inode_pages_range(struct address_space *mapping, } lock_page(page); - WARN_ON(page_to_pgoff(page) != index); + WARN_ON(page_to_index(page) != index); wait_on_page_writeback(page); truncate_inode_page(mapping, page); unlock_page(page); @@ -492,7 +492,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, if (!trylock_page(page)) continue; - WARN_ON(page_to_pgoff(page) != index); + WARN_ON(page_to_index(page) != index); /* Middle of THP: skip */ if (PageTransTail(page)) { @@ -612,7 +612,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, } lock_page(page); - WARN_ON(page_to_pgoff(page) != index); + WARN_ON(page_to_index(page) != index); if (page->mapping != mapping) { unlock_page(page); continue; -- cgit From efda1b5d87cbc3d8816f94a3815b413f1868e10d Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 6 Dec 2016 09:10:12 -0800 Subject: acpi, nfit, libnvdimm: fix / harden ars_status output length handling Given ambiguities in the ACPI 6.1 definition of the "Output (Size)" field of the ARS (Address Range Scrub) Status command, a firmware implementation may in practice return 0, 4, or 8 to indicate that there is no output payload to process. The specification states "Size of Output Buffer in bytes, including this field.". However, 'Output Buffer' is also the name of the entire payload, and earlier in the specification it states "Max Query ARS Status Output Buffer Size: Maximum size of buffer (including the Status and Extended Status fields)". Without this fix if the BIOS happens to return 0 it causes memory corruption as evidenced by this result from the acpi_nfit_ctl() unit test. ars_status00000000: 00020000 00000000 ........ BUG: stack guard page was hit at ffffc90001750000 (stack is ffffc9000174c000..ffffc9000174ffff) kernel stack overflow (page fault): 0000 [#1] SMP DEBUG_PAGEALLOC task: ffff8803332d2ec0 task.stack: ffffc9000174c000 RIP: 0010:[] [] __memcpy+0x12/0x20 RSP: 0018:ffffc9000174f9a8 EFLAGS: 00010246 RAX: ffffc9000174fab8 RBX: 0000000000000000 RCX: 000000001fffff56 RDX: 0000000000000000 RSI: ffff8803231f5a08 RDI: ffffc90001750000 RBP: ffffc9000174fa88 R08: ffffc9000174fab0 R09: ffff8803231f54b8 R10: 0000000000000008 R11: 0000000000000001 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000003 R15: ffff8803231f54a0 FS: 00007f3a611af640(0000) GS:ffff88033ed00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffc90001750000 CR3: 0000000325b20000 CR4: 00000000000406e0 Stack: ffffffffa00bc60d 0000000000000008 ffffc90000000001 ffffc9000174faac 0000000000000292 ffffffffa00c24e4 ffffffffa00c2914 0000000000000000 0000000000000000 ffffffff00000003 ffff880331ae8ad0 0000000800000246 Call Trace: [] ? acpi_nfit_ctl+0x49d/0x750 [nfit] [] nfit_test_probe+0x670/0xb1b [nfit_test] Cc: Fixes: 747ffe11b440 ("libnvdimm, tools/testing/nvdimm: fix 'ars_status' output buffer sizing") Signed-off-by: Dan Williams --- drivers/acpi/nfit/core.c | 3 ++- drivers/nvdimm/bus.c | 25 ++++++++++++++++++++----- include/linux/libnvdimm.h | 2 +- 3 files changed, 23 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index 60acbb1d159c..e58ec32393b7 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -298,7 +298,8 @@ static int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, for (i = 0, offset = 0; i < desc->out_num; i++) { u32 out_size = nd_cmd_out_size(nvdimm, cmd, desc, i, buf, - (u32 *) out_obj->buffer.pointer); + (u32 *) out_obj->buffer.pointer, + out_obj->buffer.length - offset); if (offset + out_size > out_obj->buffer.length) { dev_dbg(dev, "%s:%s output object underflow cmd: %s field: %d\n", diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index a8b6949a8778..23d4a1728cdf 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -715,7 +715,7 @@ EXPORT_SYMBOL_GPL(nd_cmd_in_size); u32 nd_cmd_out_size(struct nvdimm *nvdimm, int cmd, const struct nd_cmd_desc *desc, int idx, const u32 *in_field, - const u32 *out_field) + const u32 *out_field, unsigned long remainder) { if (idx >= desc->out_num) return UINT_MAX; @@ -727,9 +727,24 @@ u32 nd_cmd_out_size(struct nvdimm *nvdimm, int cmd, return in_field[1]; else if (nvdimm && cmd == ND_CMD_VENDOR && idx == 2) return out_field[1]; - else if (!nvdimm && cmd == ND_CMD_ARS_STATUS && idx == 2) - return out_field[1] - 8; - else if (cmd == ND_CMD_CALL) { + else if (!nvdimm && cmd == ND_CMD_ARS_STATUS && idx == 2) { + /* + * Per table 9-276 ARS Data in ACPI 6.1, out_field[1] is + * "Size of Output Buffer in bytes, including this + * field." + */ + if (out_field[1] < 4) + return 0; + /* + * ACPI 6.1 is ambiguous if 'status' is included in the + * output size. If we encounter an output size that + * overshoots the remainder by 4 bytes, assume it was + * including 'status'. + */ + if (out_field[1] - 8 == remainder) + return remainder; + return out_field[1] - 4; + } else if (cmd == ND_CMD_CALL) { struct nd_cmd_pkg *pkg = (struct nd_cmd_pkg *) in_field; return pkg->nd_size_out; @@ -876,7 +891,7 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm, /* process an output envelope */ for (i = 0; i < desc->out_num; i++) { u32 out_size = nd_cmd_out_size(nvdimm, cmd, desc, i, - (u32 *) in_env, (u32 *) out_env); + (u32 *) in_env, (u32 *) out_env, 0); u32 copy; if (out_size == UINT_MAX) { diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index f4947fda11e7..8458c5351e56 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -143,7 +143,7 @@ u32 nd_cmd_in_size(struct nvdimm *nvdimm, int cmd, const struct nd_cmd_desc *desc, int idx, void *buf); u32 nd_cmd_out_size(struct nvdimm *nvdimm, int cmd, const struct nd_cmd_desc *desc, int idx, const u32 *in_field, - const u32 *out_field); + const u32 *out_field, unsigned long remainder); int nvdimm_bus_check_dimm_count(struct nvdimm_bus *nvdimm_bus, int dimm_count); struct nd_region *nvdimm_pmem_region_create(struct nvdimm_bus *nvdimm_bus, struct nd_region_desc *ndr_desc); -- cgit From 7b8076ce8a00d553ae9d3b7eb5f0cc3e63cb16f1 Mon Sep 17 00:00:00 2001 From: Daniele Palmas Date: Wed, 7 Dec 2016 14:07:48 +0100 Subject: NET: usb: cdc_mbim: add quirk for supporting Telit LE922A MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Telit LE922A MBIM based composition does not work properly with altsetting toggle done in cdc_ncm_bind_common. This patch adds CDC_MBIM_FLAG_AVOID_ALTSETTING_TOGGLE quirk to avoid this procedure that, instead, is mandatory for other modems. Signed-off-by: Daniele Palmas Reviewed-by: Bjørn Mork Signed-off-by: David S. Miller --- drivers/net/usb/cdc_mbim.c | 21 +++++++++++++++++++++ drivers/net/usb/cdc_ncm.c | 14 +++++++++----- include/linux/usb/cdc_ncm.h | 3 ++- 3 files changed, 32 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/usb/cdc_mbim.c b/drivers/net/usb/cdc_mbim.c index 96a5028621c8..3a98f3762a4c 100644 --- a/drivers/net/usb/cdc_mbim.c +++ b/drivers/net/usb/cdc_mbim.c @@ -602,6 +602,21 @@ static const struct driver_info cdc_mbim_info_ndp_to_end = { .data = CDC_NCM_FLAG_NDP_TO_END, }; +/* Some modems (e.g. Telit LE922A6) do not work properly with altsetting + * toggle done in cdc_ncm_bind_common. CDC_MBIM_FLAG_AVOID_ALTSETTING_TOGGLE + * flag is used to avoid this procedure. + */ +static const struct driver_info cdc_mbim_info_avoid_altsetting_toggle = { + .description = "CDC MBIM", + .flags = FLAG_NO_SETINT | FLAG_MULTI_PACKET | FLAG_WWAN, + .bind = cdc_mbim_bind, + .unbind = cdc_mbim_unbind, + .manage_power = cdc_mbim_manage_power, + .rx_fixup = cdc_mbim_rx_fixup, + .tx_fixup = cdc_mbim_tx_fixup, + .data = CDC_MBIM_FLAG_AVOID_ALTSETTING_TOGGLE, +}; + static const struct usb_device_id mbim_devs[] = { /* This duplicate NCM entry is intentional. MBIM devices can * be disguised as NCM by default, and this is necessary to @@ -626,6 +641,12 @@ static const struct usb_device_id mbim_devs[] = { { USB_VENDOR_AND_INTERFACE_INFO(0x12d1, USB_CLASS_COMM, USB_CDC_SUBCLASS_MBIM, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&cdc_mbim_info_ndp_to_end, }, + + /* Telit LE922A6 in MBIM composition */ + { USB_DEVICE_AND_INTERFACE_INFO(0x1bc7, 0x1041, USB_CLASS_COMM, USB_CDC_SUBCLASS_MBIM, USB_CDC_PROTO_NONE), + .driver_info = (unsigned long)&cdc_mbim_info_avoid_altsetting_toggle, + }, + /* default entry */ { USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_MBIM, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&cdc_mbim_info_zlp, diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index 877c9516e781..afbfc0f656f3 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -839,11 +839,18 @@ int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_ iface_no = ctx->data->cur_altsetting->desc.bInterfaceNumber; + /* Device-specific flags */ + ctx->drvflags = drvflags; + /* Reset data interface. Some devices will not reset properly * unless they are configured first. Toggle the altsetting to - * force a reset + * force a reset. + * Some other devices do not work properly with this procedure + * that can be avoided using quirk CDC_MBIM_FLAG_AVOID_ALTSETTING_TOGGLE */ - usb_set_interface(dev->udev, iface_no, data_altsetting); + if (!(ctx->drvflags & CDC_MBIM_FLAG_AVOID_ALTSETTING_TOGGLE)) + usb_set_interface(dev->udev, iface_no, data_altsetting); + temp = usb_set_interface(dev->udev, iface_no, 0); if (temp) { dev_dbg(&intf->dev, "set interface failed\n"); @@ -890,9 +897,6 @@ int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_ /* finish setting up the device specific data */ cdc_ncm_setup(dev); - /* Device-specific flags */ - ctx->drvflags = drvflags; - /* Allocate the delayed NDP if needed. */ if (ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END) { ctx->delayed_ndp16 = kzalloc(ctx->max_ndp_size, GFP_KERNEL); diff --git a/include/linux/usb/cdc_ncm.h b/include/linux/usb/cdc_ncm.h index 3a375d07d0dc..00d232406f18 100644 --- a/include/linux/usb/cdc_ncm.h +++ b/include/linux/usb/cdc_ncm.h @@ -81,7 +81,8 @@ #define CDC_NCM_TIMER_INTERVAL_MAX (U32_MAX / NSEC_PER_USEC) /* Driver flags */ -#define CDC_NCM_FLAG_NDP_TO_END 0x02 /* NDP is placed at end of frame */ +#define CDC_NCM_FLAG_NDP_TO_END 0x02 /* NDP is placed at end of frame */ +#define CDC_MBIM_FLAG_AVOID_ALTSETTING_TOGGLE 0x04 /* Avoid altsetting toggle during init */ #define cdc_ncm_comm_intf_is_mbim(x) ((x)->desc.bInterfaceSubClass == USB_CDC_SUBCLASS_MBIM && \ (x)->desc.bInterfaceProtocol == USB_CDC_PROTO_NONE) -- cgit