diff options
415 files changed, 14661 insertions, 5478 deletions
diff --git a/.clang-format b/.clang-format new file mode 100644 index 000000000000..faffc0d5af4e --- /dev/null +++ b/.clang-format @@ -0,0 +1,428 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# clang-format configuration file. Intended for clang-format >= 4. +# +# For more information, see: +# +# Documentation/process/clang-format.rst +# https://clang.llvm.org/docs/ClangFormat.html +# https://clang.llvm.org/docs/ClangFormatStyleOptions.html +# +--- +AccessModifierOffset: -4 +AlignAfterOpenBracket: Align +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +#AlignEscapedNewlines: Left # Unknown to clang-format-4.0 +AlignOperands: true +AlignTrailingComments: false +AllowAllParametersOfDeclarationOnNextLine: false +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: None +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: false +BinPackArguments: true +BinPackParameters: true +BraceWrapping: + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: true + AfterNamespace: true + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + #AfterExternBlock: false # Unknown to clang-format-5.0 + BeforeCatch: false + BeforeElse: false + IndentBraces: false + #SplitEmptyFunction: true # Unknown to clang-format-4.0 + #SplitEmptyRecord: true # Unknown to clang-format-4.0 + #SplitEmptyNamespace: true # Unknown to clang-format-4.0 +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Custom +#BreakBeforeInheritanceComma: false # Unknown to clang-format-4.0 +BreakBeforeTernaryOperators: false +BreakConstructorInitializersBeforeComma: false +#BreakConstructorInitializers: BeforeComma # Unknown to clang-format-4.0 +BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: false 
+ColumnLimit: 80 +CommentPragmas: '^ IWYU pragma:' +#CompactNamespaces: false # Unknown to clang-format-4.0 +ConstructorInitializerAllOnOneLineOrOnePerLine: false +ConstructorInitializerIndentWidth: 8 +ContinuationIndentWidth: 8 +Cpp11BracedListStyle: false +DerivePointerAlignment: false +DisableFormat: false +ExperimentalAutoDetectBinPacking: false +#FixNamespaceComments: false # Unknown to clang-format-4.0 + +# Taken from: +# git grep -h '^#define [^[:space:]]*for_each[^[:space:]]*(' include/ \ +# | sed "s,^#define \([^[:space:]]*for_each[^[:space:]]*\)(.*$, - '\1'," \ +# | sort | uniq +ForEachMacros: + - 'apei_estatus_for_each_section' + - 'ata_for_each_dev' + - 'ata_for_each_link' + - 'ax25_for_each' + - 'ax25_uid_for_each' + - 'bio_for_each_integrity_vec' + - '__bio_for_each_segment' + - 'bio_for_each_segment' + - 'bio_for_each_segment_all' + - 'bio_list_for_each' + - 'bip_for_each_vec' + - 'blkg_for_each_descendant_post' + - 'blkg_for_each_descendant_pre' + - 'blk_queue_for_each_rl' + - 'bond_for_each_slave' + - 'bond_for_each_slave_rcu' + - 'btree_for_each_safe128' + - 'btree_for_each_safe32' + - 'btree_for_each_safe64' + - 'btree_for_each_safel' + - 'card_for_each_dev' + - 'cgroup_taskset_for_each' + - 'cgroup_taskset_for_each_leader' + - 'cpufreq_for_each_entry' + - 'cpufreq_for_each_entry_idx' + - 'cpufreq_for_each_valid_entry' + - 'cpufreq_for_each_valid_entry_idx' + - 'css_for_each_child' + - 'css_for_each_descendant_post' + - 'css_for_each_descendant_pre' + - 'device_for_each_child_node' + - 'drm_atomic_crtc_for_each_plane' + - 'drm_atomic_crtc_state_for_each_plane' + - 'drm_atomic_crtc_state_for_each_plane_state' + - 'drm_for_each_connector_iter' + - 'drm_for_each_crtc' + - 'drm_for_each_encoder' + - 'drm_for_each_encoder_mask' + - 'drm_for_each_fb' + - 'drm_for_each_legacy_plane' + - 'drm_for_each_plane' + - 'drm_for_each_plane_mask' + - 'drm_mm_for_each_hole' + - 'drm_mm_for_each_node' + - 'drm_mm_for_each_node_in_range' + - 
'drm_mm_for_each_node_safe' + - 'for_each_active_drhd_unit' + - 'for_each_active_iommu' + - 'for_each_available_child_of_node' + - 'for_each_bio' + - 'for_each_board_func_rsrc' + - 'for_each_bvec' + - 'for_each_child_of_node' + - 'for_each_clear_bit' + - 'for_each_clear_bit_from' + - 'for_each_cmsghdr' + - 'for_each_compatible_node' + - 'for_each_console' + - 'for_each_cpu' + - 'for_each_cpu_and' + - 'for_each_cpu_not' + - 'for_each_cpu_wrap' + - 'for_each_dev_addr' + - 'for_each_dma_cap_mask' + - 'for_each_drhd_unit' + - 'for_each_dss_dev' + - 'for_each_efi_memory_desc' + - 'for_each_efi_memory_desc_in_map' + - 'for_each_endpoint_of_node' + - 'for_each_evictable_lru' + - 'for_each_fib6_node_rt_rcu' + - 'for_each_fib6_walker_rt' + - 'for_each_free_mem_range' + - 'for_each_free_mem_range_reverse' + - 'for_each_func_rsrc' + - 'for_each_hstate' + - 'for_each_if' + - 'for_each_iommu' + - 'for_each_ip_tunnel_rcu' + - 'for_each_irq_nr' + - 'for_each_lru' + - 'for_each_matching_node' + - 'for_each_matching_node_and_match' + - 'for_each_memblock' + - 'for_each_memblock_type' + - 'for_each_memcg_cache_index' + - 'for_each_mem_pfn_range' + - 'for_each_mem_range' + - 'for_each_mem_range_rev' + - 'for_each_migratetype_order' + - 'for_each_msi_entry' + - 'for_each_net' + - 'for_each_netdev' + - 'for_each_netdev_continue' + - 'for_each_netdev_continue_rcu' + - 'for_each_netdev_feature' + - 'for_each_netdev_in_bond_rcu' + - 'for_each_netdev_rcu' + - 'for_each_netdev_reverse' + - 'for_each_netdev_safe' + - 'for_each_net_rcu' + - 'for_each_new_connector_in_state' + - 'for_each_new_crtc_in_state' + - 'for_each_new_plane_in_state' + - 'for_each_new_private_obj_in_state' + - 'for_each_node' + - 'for_each_node_by_name' + - 'for_each_node_by_type' + - 'for_each_node_mask' + - 'for_each_node_state' + - 'for_each_node_with_cpus' + - 'for_each_node_with_property' + - 'for_each_of_allnodes' + - 'for_each_of_allnodes_from' + - 'for_each_of_pci_range' + - 'for_each_old_connector_in_state' + - 
'for_each_old_crtc_in_state' + - 'for_each_oldnew_connector_in_state' + - 'for_each_oldnew_crtc_in_state' + - 'for_each_oldnew_plane_in_state' + - 'for_each_oldnew_private_obj_in_state' + - 'for_each_old_plane_in_state' + - 'for_each_old_private_obj_in_state' + - 'for_each_online_cpu' + - 'for_each_online_node' + - 'for_each_online_pgdat' + - 'for_each_pci_bridge' + - 'for_each_pci_dev' + - 'for_each_pci_msi_entry' + - 'for_each_populated_zone' + - 'for_each_possible_cpu' + - 'for_each_present_cpu' + - 'for_each_prime_number' + - 'for_each_prime_number_from' + - 'for_each_process' + - 'for_each_process_thread' + - 'for_each_property_of_node' + - 'for_each_reserved_mem_region' + - 'for_each_resv_unavail_range' + - 'for_each_rtdcom' + - 'for_each_rtdcom_safe' + - 'for_each_set_bit' + - 'for_each_set_bit_from' + - 'for_each_sg' + - 'for_each_sg_page' + - '__for_each_thread' + - 'for_each_thread' + - 'for_each_zone' + - 'for_each_zone_zonelist' + - 'for_each_zone_zonelist_nodemask' + - 'fwnode_for_each_available_child_node' + - 'fwnode_for_each_child_node' + - 'fwnode_graph_for_each_endpoint' + - 'gadget_for_each_ep' + - 'hash_for_each' + - 'hash_for_each_possible' + - 'hash_for_each_possible_rcu' + - 'hash_for_each_possible_rcu_notrace' + - 'hash_for_each_possible_safe' + - 'hash_for_each_rcu' + - 'hash_for_each_safe' + - 'hctx_for_each_ctx' + - 'hlist_bl_for_each_entry' + - 'hlist_bl_for_each_entry_rcu' + - 'hlist_bl_for_each_entry_safe' + - 'hlist_for_each' + - 'hlist_for_each_entry' + - 'hlist_for_each_entry_continue' + - 'hlist_for_each_entry_continue_rcu' + - 'hlist_for_each_entry_continue_rcu_bh' + - 'hlist_for_each_entry_from' + - 'hlist_for_each_entry_from_rcu' + - 'hlist_for_each_entry_rcu' + - 'hlist_for_each_entry_rcu_bh' + - 'hlist_for_each_entry_rcu_notrace' + - 'hlist_for_each_entry_safe' + - '__hlist_for_each_rcu' + - 'hlist_for_each_safe' + - 'hlist_nulls_for_each_entry' + - 'hlist_nulls_for_each_entry_from' + - 'hlist_nulls_for_each_entry_rcu' + - 
'hlist_nulls_for_each_entry_safe' + - 'ide_host_for_each_port' + - 'ide_port_for_each_dev' + - 'ide_port_for_each_present_dev' + - 'idr_for_each_entry' + - 'idr_for_each_entry_continue' + - 'idr_for_each_entry_ul' + - 'inet_bind_bucket_for_each' + - 'inet_lhash2_for_each_icsk_rcu' + - 'iov_for_each' + - 'key_for_each' + - 'key_for_each_safe' + - 'klp_for_each_func' + - 'klp_for_each_object' + - 'kvm_for_each_memslot' + - 'kvm_for_each_vcpu' + - 'list_for_each' + - 'list_for_each_entry' + - 'list_for_each_entry_continue' + - 'list_for_each_entry_continue_rcu' + - 'list_for_each_entry_continue_reverse' + - 'list_for_each_entry_from' + - 'list_for_each_entry_from_reverse' + - 'list_for_each_entry_lockless' + - 'list_for_each_entry_rcu' + - 'list_for_each_entry_reverse' + - 'list_for_each_entry_safe' + - 'list_for_each_entry_safe_continue' + - 'list_for_each_entry_safe_from' + - 'list_for_each_entry_safe_reverse' + - 'list_for_each_prev' + - 'list_for_each_prev_safe' + - 'list_for_each_safe' + - 'llist_for_each' + - 'llist_for_each_entry' + - 'llist_for_each_entry_safe' + - 'llist_for_each_safe' + - 'media_device_for_each_entity' + - 'media_device_for_each_intf' + - 'media_device_for_each_link' + - 'media_device_for_each_pad' + - 'netdev_for_each_lower_dev' + - 'netdev_for_each_lower_private' + - 'netdev_for_each_lower_private_rcu' + - 'netdev_for_each_mc_addr' + - 'netdev_for_each_uc_addr' + - 'netdev_for_each_upper_dev_rcu' + - 'netdev_hw_addr_list_for_each' + - 'nft_rule_for_each_expr' + - 'nla_for_each_attr' + - 'nla_for_each_nested' + - 'nlmsg_for_each_attr' + - 'nlmsg_for_each_msg' + - 'nr_neigh_for_each' + - 'nr_neigh_for_each_safe' + - 'nr_node_for_each' + - 'nr_node_for_each_safe' + - 'of_for_each_phandle' + - 'of_property_for_each_string' + - 'of_property_for_each_u32' + - 'pci_bus_for_each_resource' + - 'ping_portaddr_for_each_entry' + - 'plist_for_each' + - 'plist_for_each_continue' + - 'plist_for_each_entry' + - 'plist_for_each_entry_continue' + - 
'plist_for_each_entry_safe' + - 'plist_for_each_safe' + - 'pnp_for_each_card' + - 'pnp_for_each_dev' + - 'protocol_for_each_card' + - 'protocol_for_each_dev' + - 'queue_for_each_hw_ctx' + - 'radix_tree_for_each_contig' + - 'radix_tree_for_each_slot' + - 'radix_tree_for_each_tagged' + - 'rbtree_postorder_for_each_entry_safe' + - 'resource_list_for_each_entry' + - 'resource_list_for_each_entry_safe' + - 'rhl_for_each_entry_rcu' + - 'rhl_for_each_rcu' + - 'rht_for_each' + - 'rht_for_each_continue' + - 'rht_for_each_entry' + - 'rht_for_each_entry_continue' + - 'rht_for_each_entry_rcu' + - 'rht_for_each_entry_rcu_continue' + - 'rht_for_each_entry_safe' + - 'rht_for_each_rcu' + - 'rht_for_each_rcu_continue' + - '__rq_for_each_bio' + - 'rq_for_each_segment' + - 'scsi_for_each_prot_sg' + - 'scsi_for_each_sg' + - 'sctp_for_each_hentry' + - 'sctp_skb_for_each' + - 'shdma_for_each_chan' + - '__shost_for_each_device' + - 'shost_for_each_device' + - 'sk_for_each' + - 'sk_for_each_bound' + - 'sk_for_each_entry_offset_rcu' + - 'sk_for_each_from' + - 'sk_for_each_rcu' + - 'sk_for_each_safe' + - 'sk_nulls_for_each' + - 'sk_nulls_for_each_from' + - 'sk_nulls_for_each_rcu' + - 'snd_pcm_group_for_each_entry' + - 'snd_soc_dapm_widget_for_each_path' + - 'snd_soc_dapm_widget_for_each_path_safe' + - 'snd_soc_dapm_widget_for_each_sink_path' + - 'snd_soc_dapm_widget_for_each_source_path' + - 'tb_property_for_each' + - 'udp_portaddr_for_each_entry' + - 'udp_portaddr_for_each_entry_rcu' + - 'usb_hub_for_each_child' + - 'v4l2_device_for_each_subdev' + - 'v4l2_m2m_for_each_dst_buf' + - 'v4l2_m2m_for_each_dst_buf_safe' + - 'v4l2_m2m_for_each_src_buf' + - 'v4l2_m2m_for_each_src_buf_safe' + - 'zorro_for_each_dev' + +#IncludeBlocks: Preserve # Unknown to clang-format-5.0 +IncludeCategories: + - Regex: '.*' + Priority: 1 +IncludeIsMainRegex: '(Test)?$' +IndentCaseLabels: false +#IndentPPDirectives: None # Unknown to clang-format-5.0 +IndentWidth: 8 +IndentWrappedFunctionNames: true 
+JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: false +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: Inner +#ObjCBinPackProtocolList: Auto # Unknown to clang-format-5.0 +ObjCBlockIndentWidth: 8 +ObjCSpaceAfterProperty: true +ObjCSpaceBeforeProtocolList: true + +# Taken from git's rules +#PenaltyBreakAssignment: 10 # Unknown to clang-format-4.0 +PenaltyBreakBeforeFirstCallParameter: 30 +PenaltyBreakComment: 10 +PenaltyBreakFirstLessLess: 0 +PenaltyBreakString: 10 +PenaltyExcessCharacter: 100 +PenaltyReturnTypeOnItsOwnLine: 60 + +PointerAlignment: Right +ReflowComments: false +SortIncludes: false +#SortUsingDeclarations: false # Unknown to clang-format-4.0 +SpaceAfterCStyleCast: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +#SpaceBeforeCtorInitializerColon: true # Unknown to clang-format-5.0 +#SpaceBeforeInheritanceColon: true # Unknown to clang-format-5.0 +SpaceBeforeParens: ControlStatements +#SpaceBeforeRangeBasedForLoopColon: true # Unknown to clang-format-5.0 +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: false +SpacesInContainerLiterals: false +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Cpp03 +TabWidth: 8 +UseTab: Always +... diff --git a/.gitignore b/.gitignore index 85bcc2696442..a1dfd2acd9c3 100644 --- a/.gitignore +++ b/.gitignore @@ -81,6 +81,7 @@ modules.builtin !.gitignore !.mailmap !.cocciconfig +!.clang-format # # Generated include files diff --git a/Documentation/cgroup-v1/memory.txt b/Documentation/cgroup-v1/memory.txt index a4af2e124e24..3682e99234c2 100644 --- a/Documentation/cgroup-v1/memory.txt +++ b/Documentation/cgroup-v1/memory.txt @@ -262,7 +262,7 @@ When oom event notifier is registered, event will be delivered. 2.6 Locking lock_page_cgroup()/unlock_page_cgroup() should not be called under - mapping->tree_lock. + the i_pages lock. 
Other lock order is following: PG_locked. diff --git a/Documentation/cpu-freq/core.txt b/Documentation/cpu-freq/core.txt index 978463a7c81e..073f128af5a7 100644 --- a/Documentation/cpu-freq/core.txt +++ b/Documentation/cpu-freq/core.txt @@ -97,12 +97,10 @@ flags - flags of the cpufreq driver ================================================================== For details about OPP, see Documentation/power/opp.txt -dev_pm_opp_init_cpufreq_table - cpufreq framework typically is initialized with - cpufreq_table_validate_and_show() which is provided with the list of - frequencies that are available for operation. This function provides - a ready to use conversion routine to translate the OPP layer's internal - information about the available frequencies into a format readily - providable to cpufreq. +dev_pm_opp_init_cpufreq_table - + This function provides a ready to use conversion routine to translate + the OPP layer's internal information about the available frequencies + into a format readily providable to cpufreq. WARNING: Do not use this function in interrupt context. @@ -112,7 +110,7 @@ dev_pm_opp_init_cpufreq_table - cpufreq framework typically is initialized with /* Do things */ r = dev_pm_opp_init_cpufreq_table(dev, &freq_table); if (!r) - cpufreq_table_validate_and_show(policy, freq_table); + policy->freq_table = freq_table; /* Do other things */ } diff --git a/Documentation/cpu-freq/cpu-drivers.txt b/Documentation/cpu-freq/cpu-drivers.txt index 61546ac578d6..6e353d00cdc6 100644 --- a/Documentation/cpu-freq/cpu-drivers.txt +++ b/Documentation/cpu-freq/cpu-drivers.txt @@ -259,10 +259,8 @@ CPUFREQ_ENTRY_INVALID. The entries don't need to be in sorted in any particular order, but if they are cpufreq core will do DVFS a bit quickly for them as search for best match is faster. -By calling cpufreq_table_validate_and_show(), the cpuinfo.min_freq and -cpuinfo.max_freq values are detected, and policy->min and policy->max -are set to the same values. 
This is helpful for the per-CPU -initialization stage. +The cpufreq table is verified automatically by the core if the policy contains a +valid pointer in its policy->freq_table field. cpufreq_frequency_table_verify() assures that at least one valid frequency is within policy->min and policy->max, and all other criteria diff --git a/Documentation/cpuidle/sysfs.txt b/Documentation/cpuidle/sysfs.txt index b6f44f490ed7..d1587f434e7b 100644 --- a/Documentation/cpuidle/sysfs.txt +++ b/Documentation/cpuidle/sysfs.txt @@ -40,6 +40,7 @@ total 0 -r--r--r-- 1 root root 4096 Feb 8 10:42 latency -r--r--r-- 1 root root 4096 Feb 8 10:42 name -r--r--r-- 1 root root 4096 Feb 8 10:42 power +-r--r--r-- 1 root root 4096 Feb 8 10:42 residency -r--r--r-- 1 root root 4096 Feb 8 10:42 time -r--r--r-- 1 root root 4096 Feb 8 10:42 usage @@ -50,6 +51,7 @@ total 0 -r--r--r-- 1 root root 4096 Feb 8 10:42 latency -r--r--r-- 1 root root 4096 Feb 8 10:42 name -r--r--r-- 1 root root 4096 Feb 8 10:42 power +-r--r--r-- 1 root root 4096 Feb 8 10:42 residency -r--r--r-- 1 root root 4096 Feb 8 10:42 time -r--r--r-- 1 root root 4096 Feb 8 10:42 usage @@ -60,6 +62,7 @@ total 0 -r--r--r-- 1 root root 4096 Feb 8 10:42 latency -r--r--r-- 1 root root 4096 Feb 8 10:42 name -r--r--r-- 1 root root 4096 Feb 8 10:42 power +-r--r--r-- 1 root root 4096 Feb 8 10:42 residency -r--r--r-- 1 root root 4096 Feb 8 10:42 time -r--r--r-- 1 root root 4096 Feb 8 10:42 usage @@ -70,6 +73,7 @@ total 0 -r--r--r-- 1 root root 4096 Feb 8 10:42 latency -r--r--r-- 1 root root 4096 Feb 8 10:42 name -r--r--r-- 1 root root 4096 Feb 8 10:42 power +-r--r--r-- 1 root root 4096 Feb 8 10:42 residency -r--r--r-- 1 root root 4096 Feb 8 10:42 time -r--r--r-- 1 root root 4096 Feb 8 10:42 usage -------------------------------------------------------------------------------- @@ -78,6 +82,8 @@ total 0 * desc : Small description about the idle state (string) * disable : Option to disable this idle state (bool) -> see note below * latency : Latency 
to exit out of this idle state (in microseconds) +* residency : Time after which a state becomes more efficient than any + shallower state (in microseconds) * name : Name of the idle state (string) * power : Power consumed while in this idle state (in milliwatts) * time : Total time spent in this idle state (in microseconds) diff --git a/Documentation/devicetree/bindings/iommu/renesas,ipmmu-vmsa.txt b/Documentation/devicetree/bindings/iommu/renesas,ipmmu-vmsa.txt index 1fd5d69647ca..ffadb7c6f1f3 100644 --- a/Documentation/devicetree/bindings/iommu/renesas,ipmmu-vmsa.txt +++ b/Documentation/devicetree/bindings/iommu/renesas,ipmmu-vmsa.txt @@ -11,6 +11,8 @@ Required Properties: the device is compatible with the R-Car Gen2 VMSA-compatible IPMMU. - "renesas,ipmmu-r8a73a4" for the R8A73A4 (R-Mobile APE6) IPMMU. + - "renesas,ipmmu-r8a7743" for the R8A7743 (RZ/G1M) IPMMU. + - "renesas,ipmmu-r8a7745" for the R8A7745 (RZ/G1E) IPMMU. - "renesas,ipmmu-r8a7790" for the R8A7790 (R-Car H2) IPMMU. - "renesas,ipmmu-r8a7791" for the R8A7791 (R-Car M2-W) IPMMU. - "renesas,ipmmu-r8a7793" for the R8A7793 (R-Car M2-N) IPMMU. @@ -19,7 +21,8 @@ Required Properties: - "renesas,ipmmu-r8a7796" for the R8A7796 (R-Car M3-W) IPMMU. - "renesas,ipmmu-r8a77970" for the R8A77970 (R-Car V3M) IPMMU. - "renesas,ipmmu-r8a77995" for the R8A77995 (R-Car D3) IPMMU. - - "renesas,ipmmu-vmsa" for generic R-Car Gen2 VMSA-compatible IPMMU. + - "renesas,ipmmu-vmsa" for generic R-Car Gen2 or RZ/G1 VMSA-compatible + IPMMU. - reg: Base address and size of the IPMMU registers. - interrupts: Specifiers for the MMU fault interrupts. 
For instances that diff --git a/Documentation/devicetree/bindings/iommu/rockchip,iommu.txt b/Documentation/devicetree/bindings/iommu/rockchip,iommu.txt index 2098f7732264..6ecefea1c6f9 100644 --- a/Documentation/devicetree/bindings/iommu/rockchip,iommu.txt +++ b/Documentation/devicetree/bindings/iommu/rockchip,iommu.txt @@ -14,6 +14,11 @@ Required properties: "single-master" device, and needs no additional information to associate with its master device. See: Documentation/devicetree/bindings/iommu/iommu.txt +- clocks : A list of clocks required for the IOMMU to be accessible by + the host CPU. +- clock-names : Should contain the following: + "iface" - Main peripheral bus clock (PCLK/HCLK) (required) + "aclk" - AXI bus clock (required) Optional properties: - rockchip,disable-mmu-reset : Don't use the mmu reset operation. @@ -27,5 +32,7 @@ Example: reg = <0xff940300 0x100>; interrupts = <GIC_SPI 16 IRQ_TYPE_LEVEL_HIGH>; interrupt-names = "vopl_mmu"; + clocks = <&cru ACLK_VOP1>, <&cru HCLK_VOP1>; + clock-names = "aclk", "iface"; #iommu-cells = <0>; }; diff --git a/Documentation/filesystems/afs.txt b/Documentation/filesystems/afs.txt index c5254f6d234d..8c6ea7b41048 100644 --- a/Documentation/filesystems/afs.txt +++ b/Documentation/filesystems/afs.txt @@ -11,7 +11,7 @@ Contents: - Proc filesystem. - The cell database. - Security. - - Examples. + - The @sys substitution. ======== @@ -230,3 +230,29 @@ If a file is opened with a particular key and then the file descriptor is passed to a process that doesn't have that key (perhaps over an AF_UNIX socket), then the operations on the file will be made with key that was used to open the file. 
+ + +===================== +THE @SYS SUBSTITUTION +===================== + +The list of up to 16 @sys substitutions for the current network namespace can +be configured by writing a list to /proc/fs/afs/sysname: + + [root@andromeda ~]# echo foo amd64_linux_26 >/proc/fs/afs/sysname + +or cleared entirely by writing an empty list: + + [root@andromeda ~]# echo >/proc/fs/afs/sysname + +The current list for the current network namespace can be retrieved by: + + [root@andromeda ~]# cat /proc/fs/afs/sysname + foo + amd64_linux_26 + +When @sys is being substituted for, each element of the list is tried in the +order given. + +By default, the list will contain one item that conforms to the pattern +"<arch>_linux_26", amd64 being the name for x86_64. diff --git a/Documentation/filesystems/gfs2-glocks.txt b/Documentation/filesystems/gfs2-glocks.txt index 1fb12f9dfe48..7059623635b2 100644 --- a/Documentation/filesystems/gfs2-glocks.txt +++ b/Documentation/filesystems/gfs2-glocks.txt @@ -100,14 +100,15 @@ indicates that it is caching uptodate data. Glock locking order within GFS2: - 1. i_mutex (if required) + 1. i_rwsem (if required) 2. Rename glock (for rename only) 3. Inode glock(s) (Parents before children, inodes at "same level" with same parent in lock number order) 4. Rgrp glock(s) (for (de)allocation operations) 5. Transaction glock (via gfs2_trans_begin) for non-read operations - 6. Page lock (always last, very important!) + 6. i_rw_mutex (if required) + 7. Page lock (always last, very important!) There are two glocks per inode. One deals with access to the inode itself (locking order as above), and the other, known as the iopen diff --git a/Documentation/process/4.Coding.rst b/Documentation/process/4.Coding.rst index 26b106071364..eb4b185d168c 100644 --- a/Documentation/process/4.Coding.rst +++ b/Documentation/process/4.Coding.rst @@ -58,6 +58,14 @@ can never be transgressed. 
If there is a good reason to go against the style (a line which becomes far less readable if split to fit within the 80-column limit, for example), just do it. +Note that you can also use the ``clang-format`` tool to help you with +these rules, to quickly re-format parts of your code automatically, +and to review full files in order to spot coding style mistakes, +typos and possible improvements. It is also handy for sorting ``#includes``, +for aligning variables/macros, for reflowing text and other similar tasks. +See the file :ref:`Documentation/process/clang-format.rst <clangformat>` +for more details. + Abstraction layers ****************** diff --git a/Documentation/process/clang-format.rst b/Documentation/process/clang-format.rst new file mode 100644 index 000000000000..6710c0707721 --- /dev/null +++ b/Documentation/process/clang-format.rst @@ -0,0 +1,184 @@ +.. _clangformat: + +clang-format +============ + +``clang-format`` is a tool to format C/C++/... code according to +a set of rules and heuristics. Like most tools, it is not perfect +nor covers every single case, but it is good enough to be helpful. + +``clang-format`` can be used for several purposes: + + - Quickly reformat a block of code to the kernel style. Specially useful + when moving code around and aligning/sorting. See clangformatreformat_. + + - Spot style mistakes, typos and possible improvements in files + you maintain, patches you review, diffs, etc. See clangformatreview_. + + - Help you follow the coding style rules, specially useful for those + new to kernel development or working at the same time in several + projects with different coding styles. + +Its configuration file is ``.clang-format`` in the root of the kernel tree. +The rules contained there try to approximate the most common kernel +coding style. They also try to follow :ref:`Documentation/process/coding-style.rst <codingstyle>` +as much as possible. 
Since not all the kernel follows the same style, +it is possible that you may want to tweak the defaults for a particular +subsystem or folder. To do so, you can override the defaults by writing +another ``.clang-format`` file in a subfolder. + +The tool itself has already been included in the repositories of popular +Linux distributions for a long time. Search for ``clang-format`` in +your repositories. Otherwise, you can either download pre-built +LLVM/clang binaries or build the source code from: + + http://releases.llvm.org/download.html + +See more information about the tool at: + + https://clang.llvm.org/docs/ClangFormat.html + + https://clang.llvm.org/docs/ClangFormatStyleOptions.html + + +.. _clangformatreview: + +Review files and patches for coding style +----------------------------------------- + +By running the tool in its inline mode, you can review full subsystems, +folders or individual files for code style mistakes, typos or improvements. + +To do so, you can run something like:: + + # Make sure your working directory is clean! + clang-format -i kernel/*.[ch] + +And then take a look at the git diff. + +Counting the lines of such a diff is also useful for improving/tweaking +the style options in the configuration file; as well as testing new +``clang-format`` features/versions. + +``clang-format`` also supports reading unified diffs, so you can review +patches and git diffs easily. See the documentation at: + + https://clang.llvm.org/docs/ClangFormat.html#script-for-patch-reformatting + +To avoid ``clang-format`` formatting some portion of a file, you can do:: + + int formatted_code; + // clang-format off + void unformatted_code ; + // clang-format on + void formatted_code_again; + +While it might be tempting to use this to keep a file always in sync with +``clang-format``, specially if you are writing new files or if you are +a maintainer, please note that people might be running different +``clang-format`` versions or not have it available at all. 
Therefore, +you should probably refrain from using this in kernel sources; +at least until we see if ``clang-format`` becomes commonplace. + + +.. _clangformatreformat: + +Reformatting blocks of code +--------------------------- + +By using an integration with your text editor, you can reformat arbitrary +blocks (selections) of code with a single keystroke. This is especially +useful when moving code around, for complex code that is deeply indented, +for multi-line macros (and aligning their backslashes), etc. + +Remember that you can always tweak the changes afterwards in those cases +where the tool did not do an optimal job. But as a first approximation, +it can be very useful. + +There are integrations for many popular text editors. For some of them, +like vim, emacs, BBEdit and Visual Studio you can find support built-in. +For instructions, read the appropriate section at: + + https://clang.llvm.org/docs/ClangFormat.html + +For Atom, Eclipse, Sublime Text, Visual Studio Code, XCode and other +editors and IDEs you should be able to find ready-to-use plugins. + +For this use case, consider using a secondary ``.clang-format`` +so that you can tweak a few options. See clangformatextra_. + + +.. _clangformatmissing: + +Missing support +--------------- + +``clang-format`` is missing support for some things that are common +in kernel code. They are easy to remember, so if you use the tool +regularly, you will quickly learn to avoid/ignore those. 
+ +In particular, some very common ones you will notice are: + + - Aligned blocks of one-line ``#defines``, e.g.:: + + #define TRACING_MAP_BITS_DEFAULT 11 + #define TRACING_MAP_BITS_MAX 17 + #define TRACING_MAP_BITS_MIN 7 + + vs.:: + + #define TRACING_MAP_BITS_DEFAULT 11 + #define TRACING_MAP_BITS_MAX 17 + #define TRACING_MAP_BITS_MIN 7 + + - Aligned designated initializers, e.g.:: + + static const struct file_operations uprobe_events_ops = { + .owner = THIS_MODULE, + .open = probes_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = probes_write, + }; + + vs.:: + + static const struct file_operations uprobe_events_ops = { + .owner = THIS_MODULE, + .open = probes_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = probes_write, + }; + + +.. _clangformatextra: + +Extra features/options +---------------------- + +Some features/style options are not enabled by default in the configuration +file in order to minimize the differences between the output and the current +code. In other words, to make the difference as small as possible, +which makes reviewing full-file style, as well as diffs and patches, as easy +as possible. + +In other cases (e.g. particular subsystems/folders/files), the kernel style +might be different and enabling some of these options may approximate +better the style there. + +For instance: + + - Aligning assignments (``AlignConsecutiveAssignments``). + + - Aligning declarations (``AlignConsecutiveDeclarations``). + + - Reflowing text in comments (``ReflowComments``). + + - Sorting ``#includes`` (``SortIncludes``). + +They are typically useful for block re-formatting, rather than full-file. +You might want to create another ``.clang-format`` file and use that one +from your editor/IDE instead. 
diff --git a/Documentation/process/coding-style.rst b/Documentation/process/coding-style.rst index d98deb62c400..4e7c0a1c427a 100644 --- a/Documentation/process/coding-style.rst +++ b/Documentation/process/coding-style.rst @@ -631,6 +631,14 @@ options ``-kr -i8`` (stands for ``K&R, 8 character indents``), or use re-formatting you may want to take a look at the man page. But remember: ``indent`` is not a fix for bad programming. +Note that you can also use the ``clang-format`` tool to help you with +these rules, to quickly re-format parts of your code automatically, +and to review full files in order to spot coding style mistakes, +typos and possible improvements. It is also handy for sorting ``#includes``, +for aligning variables/macros, for reflowing text and other similar tasks. +See the file :ref:`Documentation/process/clang-format.rst <clangformat>` +for more details. + 10) Kconfig configuration files ------------------------------- diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 412314eebda6..eded671d55eb 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -964,32 +964,34 @@ detect a hard lockup condition. tainted: -Non-zero if the kernel has been tainted. Numeric values, which -can be ORed together: - - 1 - A module with a non-GPL license has been loaded, this - includes modules with no license. - Set by modutils >= 2.4.9 and module-init-tools. - 2 - A module was force loaded by insmod -f. - Set by modutils >= 2.4.9 and module-init-tools. - 4 - Unsafe SMP processors: SMP with CPUs not designed for SMP. - 8 - A module was forcibly unloaded from the system by rmmod -f. - 16 - A hardware machine check error occurred on the system. - 32 - A bad page was discovered on the system. - 64 - The user has asked that the system be marked "tainted". This - could be because they are running software that directly modifies - the hardware, or for other reasons. - 128 - The system has died. 
- 256 - The ACPI DSDT has been overridden with one supplied by the user - instead of using the one provided by the hardware. - 512 - A kernel warning has occurred. -1024 - A module from drivers/staging was loaded. -2048 - The system is working around a severe firmware bug. -4096 - An out-of-tree module has been loaded. -8192 - An unsigned module has been loaded in a kernel supporting module - signature. -16384 - A soft lockup has previously occurred on the system. -32768 - The kernel has been live patched. +Non-zero if the kernel has been tainted. Numeric values, which can be +ORed together. The letters are seen in the "Tainted" line of Oops reports. + + 1 (P): A module with a non-GPL license has been loaded, this + includes modules with no license. + Set by modutils >= 2.4.9 and module-init-tools. + 2 (F): A module was force loaded by insmod -f. + Set by modutils >= 2.4.9 and module-init-tools. + 4 (S): Unsafe SMP processors: SMP with CPUs not designed for SMP. + 8 (R): A module was forcibly unloaded from the system by rmmod -f. + 16 (M): A hardware machine check error occurred on the system. + 32 (B): A bad page was discovered on the system. + 64 (U): The user has asked that the system be marked "tainted". This + could be because they are running software that directly modifies + the hardware, or for other reasons. + 128 (D): The system has died. + 256 (A): The ACPI DSDT has been overridden with one supplied by the user + instead of using the one provided by the hardware. + 512 (W): A kernel warning has occurred. + 1024 (C): A module from drivers/staging was loaded. + 2048 (I): The system is working around a severe firmware bug. + 4096 (O): An out-of-tree module has been loaded. + 8192 (E): An unsigned module has been loaded in a kernel supporting module + signature. + 16384 (L): A soft lockup has previously occurred on the system. + 32768 (K): The kernel has been live patched. + 65536 (X): Auxiliary taint, defined for and used by distros. 
+131072 (T): The kernel was built with the struct randomization plugin. ============================================================== diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index ff234d229cbb..17256f2ad919 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -312,8 +312,6 @@ The lowmem_reserve_ratio is an array. You can see them by reading this file. % cat /proc/sys/vm/lowmem_reserve_ratio 256 256 32 - -Note: # of this elements is one fewer than number of zones. Because the highest - zone's value is not necessary for following calculation. But, these values are not used directly. The kernel calculates # of protection pages for each zones from them. These are shown as array of protection pages @@ -364,7 +362,8 @@ As above expression, they are reciprocal number of ratio. pages of higher zones on the node. If you would like to protect more pages, smaller values are effective. -The minimum value is 1 (1/1 -> 100%). +The minimum value is 1 (1/1 -> 100%). A value less than 1 completely +disables protection of the pages. ============================================================== diff --git a/Documentation/vm/hmm.txt b/Documentation/vm/hmm.txt index 4d3aac9f4a5d..2d1d6f69e91b 100644 --- a/Documentation/vm/hmm.txt +++ b/Documentation/vm/hmm.txt @@ -1,152 +1,160 @@ Heterogeneous Memory Management (HMM) -Transparently allow any component of a program to use any memory region of said -program with a device without using device specific memory allocator. This is -becoming a requirement to simplify the use of advance heterogeneous computing -where GPU, DSP or FPGA are use to perform various computations. - -This document is divided as follow, in the first section i expose the problems -related to the use of a device specific allocator. The second section i expose -the hardware limitations that are inherent to many platforms. The third section -gives an overview of HMM designs. 
The fourth section explains how CPU page- -table mirroring works and what is HMM purpose in this context. Fifth section -deals with how device memory is represented inside the kernel. Finaly the last -section present the new migration helper that allow to leverage the device DMA -engine. - - -1) Problems of using device specific memory allocator: -2) System bus, device memory characteristics -3) Share address space and migration +Provide infrastructure and helpers to integrate non-conventional memory (device +memory like GPU on board memory) into regular kernel path, with the cornerstone +of this being specialized struct page for such memory (see sections 5 to 7 of +this document). + +HMM also provides optional helpers for SVM (Share Virtual Memory), i.e., +allowing a device to transparently access program address coherently with the +CPU meaning that any valid pointer on the CPU is also a valid pointer for the +device. This is becoming mandatory to simplify the use of advanced hetero- +geneous computing where GPU, DSP, or FPGA are used to perform various +computations on behalf of a process. + +This document is divided as follows: in the first section I expose the problems +related to using device specific memory allocators. In the second section, I +expose the hardware limitations that are inherent to many platforms. The third +section gives an overview of the HMM design. The fourth section explains how +CPU page-table mirroring works and the purpose of HMM in this context. The +fifth section deals with how device memory is represented inside the kernel. +Finally, the last section presents a new migration helper that allows lever- +aging the device DMA engine. 
+ + +1) Problems of using a device specific memory allocator: +2) I/O bus, device memory characteristics +3) Shared address space and migration 4) Address space mirroring implementation and API 5) Represent and manage device memory from core kernel point of view -6) Migrate to and from device memory +6) Migration to and from device memory 7) Memory cgroup (memcg) and rss accounting ------------------------------------------------------------------------------- -1) Problems of using device specific memory allocator: - -Device with large amount of on board memory (several giga bytes) like GPU have -historically manage their memory through dedicated driver specific API. This -creates a disconnect between memory allocated and managed by device driver and -regular application memory (private anonymous, share memory or regular file -back memory). From here on i will refer to this aspect as split address space. -I use share address space to refer to the opposite situation ie one in which -any memory region can be use by device transparently. - -Split address space because device can only access memory allocated through the -device specific API. This imply that all memory object in a program are not -equal from device point of view which complicate large program that rely on a -wide set of libraries. - -Concretly this means that code that wants to leverage device like GPU need to -copy object between genericly allocated memory (malloc, mmap private/share/) -and memory allocated through the device driver API (this still end up with an -mmap but of the device file). - -For flat dataset (array, grid, image, ...) this isn't too hard to achieve but -complex data-set (list, tree, ...) are hard to get right. Duplicating a complex -data-set need to re-map all the pointer relations between each of its elements. -This is error prone and program gets harder to debug because of the duplicate -data-set. 
- -Split address space also means that library can not transparently use data they -are getting from core program or other library and thus each library might have -to duplicate its input data-set using specific memory allocator. Large project -suffer from this and waste resources because of the various memory copy. - -Duplicating each library API to accept as input or output memory allocted by +1) Problems of using a device specific memory allocator: + +Devices with a large amount of on board memory (several gigabytes) like GPUs +have historically managed their memory through dedicated driver specific APIs. +This creates a disconnect between memory allocated and managed by a device +driver and regular application memory (private anonymous, shared memory, or +regular file backed memory). From here on I will refer to this aspect as split +address space. I use shared address space to refer to the opposite situation: +i.e., one in which any application memory region can be used by a device +transparently. + +Split address space happens because a device can only access memory allocated +through a device specific API. This implies that all memory objects in a program +are not equal from the device point of view which complicates large programs +that rely on a wide set of libraries. + +Concretely this means that code that wants to leverage devices like GPUs needs +to copy objects between generically allocated memory (malloc, mmap private, mmap +shared) and memory allocated through the device driver API (this still ends up +with an mmap but of the device file). + +For flat data sets (array, grid, image, ...) this isn't too hard to achieve but +complex data sets (list, tree, ...) are hard to get right. Duplicating a +complex data set needs to re-map all the pointer relations between each of its +elements. This is error prone and programs get harder to debug because of the +duplicate data set and addresses. 
+ +Split address space also means that libraries cannot transparently use data +they are getting from the core program or another library and thus each library +might have to duplicate its input data set using the device specific memory +allocator. Large projects suffer from this and waste resources because of the +various memory copies. + +Duplicating each library API to accept as input or output memory allocated by each device specific allocator is not a viable option. It would lead to a -combinatorial explosions in the library entry points. +combinatorial explosion in the library entry points. -Finaly with the advance of high level language constructs (in C++ but in other -language too) it is now possible for compiler to leverage GPU or other devices -without even the programmer knowledge. Some of compiler identified patterns are -only do-able with a share address. It is as well more reasonable to use a share -address space for all the other patterns. +Finally, with the advance of high level language constructs (in C++ but in +other languages too) it is now possible for the compiler to leverage GPUs and +other devices without programmer knowledge. Some compiler identified patterns +are only do-able with a shared address space. It is also more reasonable to use +a shared address space for all other patterns. ------------------------------------------------------------------------------- -2) System bus, device memory characteristics +2) I/O bus, device memory characteristics -System bus cripple share address due to few limitations. Most system bus only -allow basic memory access from device to main memory, even cache coherency is -often optional. Access to device memory from CPU is even more limited, most -often than not it is not cache coherent. +I/O buses cripple shared address spaces due to a few limitations. Most I/O +buses only allow basic memory access from device to main memory; even cache +coherency is often optional. 
Access to device memory from CPU is even more +limited. More often than not, it is not cache coherent. -If we only consider the PCIE bus than device can access main memory (often -through an IOMMU) and be cache coherent with the CPUs. However it only allows -a limited set of atomic operation from device on main memory. This is worse -in the other direction the CPUs can only access a limited range of the device -memory and can not perform atomic operations on it. Thus device memory can not -be consider like regular memory from kernel point of view. +If we only consider the PCIE bus, then a device can access main memory (often +through an IOMMU) and be cache coherent with the CPUs. However, it only allows +a limited set of atomic operations from device on main memory. This is worse +in the other direction: the CPU can only access a limited range of the device +memory and cannot perform atomic operations on it. Thus device memory cannot +be considered the same as regular memory from the kernel point of view. Another crippling factor is the limited bandwidth (~32GBytes/s with PCIE 4.0 -and 16 lanes). This is 33 times less that fastest GPU memory (1 TBytes/s). -The final limitation is latency, access to main memory from the device has an -order of magnitude higher latency than when the device access its own memory. +and 16 lanes). This is 33 times less than the fastest GPU memory (1 TBytes/s). +The final limitation is latency. Access to main memory from the device has an +order of magnitude higher latency than when the device accesses its own memory. -Some platform are developing new system bus or additions/modifications to PCIE -to address some of those limitations (OpenCAPI, CCIX). They mainly allow two +Some platforms are developing new I/O buses or additions/modifications to PCIE +to address some of these limitations (OpenCAPI, CCIX). They mainly allow two- way cache coherency between CPU and device and allow all atomic operations the -architecture supports. 
Saddly not all platform are following this trends and -some major architecture are left without hardware solutions to those problems. +architecture supports. Sadly, not all platforms are following this trend and +some major architectures are left without hardware solutions to these problems. -So for share address space to make sense not only we must allow device to -access any memory memory but we must also permit any memory to be migrated to -device memory while device is using it (blocking CPU access while it happens). +So for shared address space to make sense, not only must we allow devices to +access any memory but we must also permit any memory to be migrated to device +memory while device is using it (blocking CPU access while it happens). ------------------------------------------------------------------------------- -3) Share address space and migration +3) Shared address space and migration HMM intends to provide two main features. First one is to share the address -space by duplication the CPU page table into the device page table so same -address point to same memory and this for any valid main memory address in +space by duplicating the CPU page table in the device page table so the same +address points to the same physical memory for any valid main memory address in the process address space. -To achieve this, HMM offer a set of helpers to populate the device page table +To achieve this, HMM offers a set of helpers to populate the device page table while keeping track of CPU page table updates. Device page table updates are -not as easy as CPU page table updates. To update the device page table you must -allow a buffer (or use a pool of pre-allocated buffer) and write GPU specifics -commands in it to perform the update (unmap, cache invalidations and flush, -...). This can not be done through common code for all device. Hence why HMM -provides helpers to factor out everything that can be while leaving the gory -details to the device driver. 
- -The second mechanism HMM provide is a new kind of ZONE_DEVICE memory that does -allow to allocate a struct page for each page of the device memory. Those page -are special because the CPU can not map them. They however allow to migrate -main memory to device memory using exhisting migration mechanism and everything -looks like if page was swap out to disk from CPU point of view. Using a struct -page gives the easiest and cleanest integration with existing mm mechanisms. -Again here HMM only provide helpers, first to hotplug new ZONE_DEVICE memory -for the device memory and second to perform migration. Policy decision of what -and when to migrate things is left to the device driver. - -Note that any CPU access to a device page trigger a page fault and a migration -back to main memory ie when a page backing an given address A is migrated from -a main memory page to a device page then any CPU access to address A trigger a -page fault and initiate a migration back to main memory. - - -With this two features, HMM not only allow a device to mirror a process address -space and keeps both CPU and device page table synchronize, but also allow to -leverage device memory by migrating part of data-set that is actively use by a -device. +not as easy as CPU page table updates. To update the device page table, you must +allocate a buffer (or use a pool of pre-allocated buffers) and write GPU +specific commands in it to perform the update (unmap, cache invalidations, and +flush, ...). This cannot be done through common code for all devices. Hence +why HMM provides helpers to factor out everything that can be while leaving the +hardware specific details to the device driver. + +The second mechanism HMM provides is a new kind of ZONE_DEVICE memory that +allows allocating a struct page for each page of the device memory. Those pages +are special because the CPU cannot map them. 
However, they allow migrating +main memory to device memory using existing migration mechanisms and everything +looks like a page is swapped out to disk from the CPU point of view. Using a +struct page gives the easiest and cleanest integration with existing mm mech- +anisms. Here again, HMM only provides helpers, first to hotplug new ZONE_DEVICE +memory for the device memory and second to perform migration. Policy decisions +of what and when to migrate things are left to the device driver. + +Note that any CPU access to a device page triggers a page fault and a migration +back to main memory. For example, when a page backing a given CPU address A is +migrated from a main memory page to a device page, then any CPU access to +address A triggers a page fault and initiates a migration back to main memory. + +With these two features, HMM not only allows a device to mirror process address +space and keep both CPU and device page tables synchronized, but also lever- +ages device memory by migrating the part of the data set that is actively being +used by the device. ------------------------------------------------------------------------------- 4) Address space mirroring implementation and API -Address space mirroring main objective is to allow to duplicate range of CPU -page table into a device page table and HMM helps keeping both synchronize. A -device driver that want to mirror a process address space must start with the +Address space mirroring's main objective is to allow duplication of a range of +CPU page table into a device page table; HMM helps keep both synchronized. 
A +device driver that wants to mirror a process address space must start with the registration of an hmm_mirror struct: int hmm_mirror_register(struct hmm_mirror *mirror, @@ -154,9 +162,9 @@ registration of an hmm_mirror struct: int hmm_mirror_register_locked(struct hmm_mirror *mirror, struct mm_struct *mm); -The locked variant is to be use when the driver is already holding the mmap_sem -of the mm in write mode. The mirror struct has a set of callback that are use -to propagate CPU page table: +The locked variant is to be used when the driver is already holding mmap_sem +of the mm in write mode. The mirror struct has a set of callbacks that are used +to propagate CPU page tables: struct hmm_mirror_ops { /* sync_cpu_device_pagetables() - synchronize page tables @@ -181,13 +189,13 @@ to propagate CPU page table: unsigned long end); }; -Device driver must perform update to the range following action (turn range -read only, or fully unmap, ...). Once driver callback returns the device must -be done with the update. +The device driver must perform the update action to the range (mark range +read only, or fully unmap, ...). The device must be done with the update before +the driver callback returns. -When device driver wants to populate a range of virtual address it can use -either: +When the device driver wants to populate a range of virtual addresses, it can +use either: int hmm_vma_get_pfns(struct vm_area_struct *vma, struct hmm_range *range, unsigned long start, @@ -201,17 +209,19 @@ either: bool write, bool block); -First one (hmm_vma_get_pfns()) will only fetch present CPU page table entry and -will not trigger a page fault on missing or non present entry. The second one -do trigger page fault on missing or read only entry if write parameter is true. -Page fault use the generic mm page fault code path just like a CPU page fault. 
+The first one (hmm_vma_get_pfns()) will only fetch present CPU page table +entries and will not trigger a page fault on missing or non-present entries. +The second one does trigger a page fault on missing or read-only entry if the +write parameter is true. Page faults use the generic mm page fault code path +just like a CPU page fault. -Both function copy CPU page table into their pfns array argument. Each entry in -that array correspond to an address in the virtual range. HMM provide a set of -flags to help driver identify special CPU page table entries. +Both functions copy CPU page table entries into their pfns array argument. Each +entry in that array corresponds to an address in the virtual range. HMM +provides a set of flags to help the driver identify special CPU page table +entries. Locking with the update() callback is the most important aspect the driver must -respect in order to keep things properly synchronize. The usage pattern is : +respect in order to keep things properly synchronized. The usage pattern is: int driver_populate_range(...) { @@ -233,43 +243,44 @@ respect in order to keep things properly synchronize. The usage pattern is : return 0; } -The driver->update lock is the same lock that driver takes inside its update() -callback. That lock must be call before hmm_vma_range_done() to avoid any race -with a concurrent CPU page table update. +The driver->update lock is the same lock that the driver takes inside its +update() callback. That lock must be held before hmm_vma_range_done() to avoid +any race with a concurrent CPU page table update. -HMM implements all this on top of the mmu_notifier API because we wanted to a -simpler API and also to be able to perform optimization latter own like doing -concurrent device update in multi-devices scenario. 
+HMM implements all this on top of the mmu_notifier API because we wanted a +simpler API and also to be able to perform optimizations later on like doing +concurrent device updates in multi-device scenarios. -HMM also serve as an impedence missmatch between how CPU page table update are -done (by CPU write to the page table and TLB flushes) from how device update -their own page table. Device update is a multi-step process, first appropriate -commands are write to a buffer, then this buffer is schedule for execution on -the device. It is only once the device has executed commands in the buffer that -the update is done. Creating and scheduling update command buffer can happen -concurrently for multiple devices. Waiting for each device to report commands -as executed is serialize (there is no point in doing this concurrently). +HMM also bridges the impedance mismatch between how CPU page table updates +are done (by CPU write to the page table and TLB flushes) and how devices +update their own page table. Device updates are a multi-step process. First, +appropriate commands are written to a buffer, then this buffer is scheduled for +execution on the device. It is only once the device has executed commands in +the buffer that the update is done. Creating and scheduling the update command +buffer can happen concurrently for multiple devices. Waiting for each device to +report commands as executed is serialized (there is no point in doing this +concurrently). ------------------------------------------------------------------------------- 5) Represent and manage device memory from core kernel point of view -Several differents design were try to support device memory. First one use -device specific data structure to keep information about migrated memory and -HMM hooked itself in various place of mm code to handle any access to address -that were back by device memory. 
It turns out that this ended up replicating -most of the fields of struct page and also needed many kernel code path to be -updated to understand this new kind of memory. +Several different designs were tried to support device memory. First one used +a device specific data structure to keep information about migrated memory and +HMM hooked itself in various places of mm code to handle any access to +addresses that were backed by device memory. It turns out that this ended up +replicating most of the fields of struct page and also needed many kernel code +paths to be updated to understand this new kind of memory. -Thing is most kernel code path never try to access the memory behind a page -but only care about struct page contents. Because of this HMM switchted to -directly using struct page for device memory which left most kernel code path -un-aware of the difference. We only need to make sure that no one ever try to -map those page from the CPU side. +Most kernel code paths never try to access the memory behind a page +but only care about struct page contents. Because of this, HMM switched to +directly using struct page for device memory which left most kernel code paths +unaware of the difference. We only need to make sure that no one ever tries to +map those pages from the CPU side. -HMM provide a set of helpers to register and hotplug device memory as a new -region needing struct page. This is offer through a very simple API: +HMM provides a set of helpers to register and hotplug device memory as a new +region needing a struct page. This is offered through a very simple API: struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, struct device *device, @@ -289,18 +300,19 @@ The hmm_devmem_ops is where most of the important things are: }; The first callback (free()) happens when the last reference on a device page is -drop. This means the device page is now free and no longer use by anyone. 
The -second callback happens whenever CPU try to access a device page which it can -not do. This second callback must trigger a migration back to system memory. +dropped. This means the device page is now free and no longer used by anyone. +The second callback happens whenever the CPU tries to access a device page +which it cannot do. This second callback must trigger a migration back to +system memory. ------------------------------------------------------------------------------- -6) Migrate to and from device memory +6) Migration to and from device memory -Because CPU can not access device memory, migration must use device DMA engine -to perform copy from and to device memory. For this we need a new migration -helper: +Because the CPU cannot access device memory, migration must use the device DMA +engine to perform copy from and to device memory. For this we need a new +migration helper: int migrate_vma(const struct migrate_vma_ops *ops, struct vm_area_struct *vma, @@ -311,15 +323,15 @@ helper: unsigned long *dst, void *private); -Unlike other migration function it works on a range of virtual address, there -is two reasons for that. First device DMA copy has a high setup overhead cost +Unlike other migration functions it works on a range of virtual address, there +are two reasons for that. First, device DMA copy has a high setup overhead cost and thus batching multiple pages is needed as otherwise the migration overhead -make the whole excersie pointless. The second reason is because driver trigger -such migration base on range of address the device is actively accessing. +makes the whole exercise pointless. The second reason is because the +migration might be for a range of addresses the device is actively accessing. -The migrate_vma_ops struct define two callbacks. First one (alloc_and_copy()) -control destination memory allocation and copy operation. Second one is there -to allow device driver to perform cleanup operation after migration. 
+The migrate_vma_ops struct defines two callbacks. First one (alloc_and_copy()) +controls destination memory allocation and copy operation. Second one is there +to allow the device driver to perform cleanup operations after migration. struct migrate_vma_ops { void (*alloc_and_copy)(struct vm_area_struct *vma, @@ -336,19 +348,19 @@ to allow device driver to perform cleanup operation after migration. void *private); }; -It is important to stress that this migration helpers allow for hole in the +It is important to stress that these migration helpers allow for holes in the virtual address range. Some pages in the range might not be migrated for all -the usual reasons (page is pin, page is lock, ...). This helper does not fail -but just skip over those pages. +the usual reasons (page is pinned, page is locked, ...). This helper does not +fail but just skips over those pages. -The alloc_and_copy() might as well decide to not migrate all pages in the -range (for reasons under the callback control). For those the callback just -have to leave the corresponding dst entry empty. +The alloc_and_copy() might decide to not migrate all pages in the +range (for reasons under the callback control). For those, the callback just +has to leave the corresponding dst entry empty. -Finaly the migration of the struct page might fails (for file back page) for +Finally, the migration of the struct page might fail (for file backed page) for various reasons (failure to freeze reference, or update page cache, ...). If -that happens then the finalize_and_map() can catch any pages that was not -migrated. Note those page were still copied to new page and thus we wasted +that happens, then the finalize_and_map() can catch any pages that were not +migrated. Note those pages were still copied to a new page and thus we wasted bandwidth but this is considered as a rare event and a price that we are willing to pay to keep all the code simpler. 
@@ -358,27 +370,27 @@ willing to pay to keep all the code simpler. 7) Memory cgroup (memcg) and rss accounting For now device memory is accounted as any regular page in rss counters (either -anonymous if device page is use for anonymous, file if device page is use for -file back page or shmem if device page is use for share memory). This is a -deliberate choice to keep existing application that might start using device -memory without knowing about it to keep runing unimpacted. - -Drawbacks is that OOM killer might kill an application using a lot of device -memory and not a lot of regular system memory and thus not freeing much system -memory. We want to gather more real world experience on how application and -system react under memory pressure in the presence of device memory before +anonymous if device page is used for anonymous, file if device page is used for +file backed page or shmem if device page is used for shared memory). This is a +deliberate choice to keep existing applications, that might start using device +memory without knowing about it, running unimpacted. + +A drawback is that the OOM killer might kill an application using a lot of +device memory and not a lot of regular system memory and thus not freeing much +system memory. We want to gather more real world experience on how applications +and system react under memory pressure in the presence of device memory before deciding to account device memory differently. -Same decision was made for memory cgroup. Device memory page are accounted +Same decision was made for memory cgroup. Device memory pages are accounted against same memory cgroup a regular page would be accounted to. This does simplify migration to and from device memory. This also means that migration -back from device memory to regular memory can not fail because it would +back from device memory to regular memory cannot fail because it would go above memory cgroup limit. 
We might revisit this choice latter on once we -get more experience in how device memory is use and its impact on memory +get more experience in how device memory is used and its impact on memory resource control. -Note that device memory can never be pin nor by device driver nor through GUP +Note that device memory can never be pinned by device driver nor through GUP and thus such memory is always free upon process exit. Or when last reference -is drop in case of share memory or file back memory. +is dropped in case of shared memory or file backed memory. diff --git a/Documentation/vm/page_migration b/Documentation/vm/page_migration index 0478ae2ad44a..496868072e24 100644 --- a/Documentation/vm/page_migration +++ b/Documentation/vm/page_migration @@ -90,7 +90,7 @@ Steps: 1. Lock the page to be migrated -2. Insure that writeback is complete. +2. Ensure that writeback is complete. 3. Lock the new page that we want to move to. It is locked so that accesses to this (not yet uptodate) page immediately lock while the move is in progress. @@ -100,8 +100,8 @@ Steps: mapcount is not zero then we do not migrate the page. All user space processes that attempt to access the page will now wait on the page lock. -5. The radix tree lock is taken. This will cause all processes trying - to access the page via the mapping to block on the radix tree spinlock. +5. The i_pages lock is taken. This will cause all processes trying + to access the page via the mapping to block on the spinlock. 6. The refcount of the page is examined and we back out if references remain otherwise we know that we are the only one referencing this page. @@ -114,12 +114,12 @@ Steps: 9. The radix tree is changed to point to the new page. -10. The reference count of the old page is dropped because the radix tree +10. The reference count of the old page is dropped because the address space reference is gone. A reference to the new page is established because - the new page is referenced to by the radix tree. 
+ the new page is referenced by the address space. -11. The radix tree lock is dropped. With that lookups in the mapping - become possible again. Processes will move from spinning on the tree_lock +11. The i_pages lock is dropped. With that lookups in the mapping + become possible again. Processes will move from spinning on the lock to sleeping on the locked new page. 12. The page contents are copied to the new page. diff --git a/MAINTAINERS b/MAINTAINERS index 7bb2e9595f14..b60179d948bb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1232,10 +1232,15 @@ F: Documentation/devicetree/bindings/i2c/i2c-aspeed.txt ARM/ASPEED MACHINE SUPPORT M: Joel Stanley <[email protected]> -S: Maintained +R: Andrew Jeffery <[email protected]> +L: [email protected] (moderated for non-subscribers) +L: [email protected] (moderated for non-subscribers) +Q: https://patchwork.ozlabs.org/project/linux-aspeed/list/ +S: Supported +T: git git://git.kernel.org/pub/scm/linux/kernel/git/joel/aspeed.git F: arch/arm/mach-aspeed/ F: arch/arm/boot/dts/aspeed-* -F: drivers/*/*aspeed* +N: aspeed ARM/ATMEL AT91 Clock Support M: Boris Brezillon <[email protected]> @@ -1743,7 +1748,7 @@ F: arch/arm/mach-orion5x/ts78xx-* ARM/OXNAS platform support M: Neil Armstrong <[email protected]> L: [email protected] (moderated for non-subscribers) -L: [email protected] (moderated for non-subscribers) +L: [email protected] (moderated for non-subscribers) S: Maintained F: arch/arm/mach-oxnas/ F: arch/arm/boot/dts/ox8*.dts* @@ -4392,7 +4397,7 @@ S: Maintained F: drivers/staging/fsl-dpaa2/ethsw DPT_I2O SCSI RAID DRIVER -M: Adaptec OEM Raid Solutions <[email protected]> +M: Adaptec OEM Raid Solutions <[email protected]> W: http://www.adaptec.com/ S: Maintained @@ -6410,6 +6415,7 @@ L: [email protected] S: Maintained F: mm/hmm* F: include/linux/hmm* +F: Documentation/vm/hmm.txt HOST AP DRIVER M: Jouni Malinen <[email protected]> @@ -7344,7 +7350,7 @@ F: include/linux/ipmi* F: include/uapi/linux/ipmi* IPS SCSI RAID DRIVER 
-M: Adaptec OEM Raid Solutions <[email protected]> +M: Adaptec OEM Raid Solutions <[email protected]> W: http://www.adaptec.com/ S: Maintained @@ -11762,7 +11768,7 @@ F: drivers/char/random.c RAPIDIO SUBSYSTEM M: Matt Porter <[email protected]> -M: Alexandre Bounine <[email protected]> +M: Alexandre Bounine <[email protected]> S: Maintained F: drivers/rapidio/ diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index 2dbdf59258d9..f9d4e6b6d4bd 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h @@ -32,6 +32,7 @@ #define MAP_NONBLOCK 0x40000 /* do not block on IO */ #define MAP_STACK 0x80000 /* give out an address that is best suited for process/thread stacks */ #define MAP_HUGETLB 0x100000 /* create a huge page mapping */ +#define MAP_FIXED_NOREPLACE 0x200000/* MAP_FIXED which doesn't unmap underlying mapping */ #define MS_ASYNC 1 /* sync memory asynchronously */ #define MS_SYNC 2 /* synchronous memory sync */ diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h index 74504b154256..869080bedb89 100644 --- a/arch/arm/include/asm/cacheflush.h +++ b/arch/arm/include/asm/cacheflush.h @@ -318,10 +318,8 @@ static inline void flush_anon_page(struct vm_area_struct *vma, #define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE extern void flush_kernel_dcache_page(struct page *); -#define flush_dcache_mmap_lock(mapping) \ - spin_lock_irq(&(mapping)->tree_lock) -#define flush_dcache_mmap_unlock(mapping) \ - spin_unlock_irq(&(mapping)->tree_lock) +#define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages) +#define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages) #define flush_icache_user_range(vma,page,addr,len) \ flush_dcache_page(page) diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h index 496667703693..ed8fd0d19a3e 100644 --- a/arch/arm/include/asm/memory.h +++ b/arch/arm/include/asm/memory.h @@ -22,12 +22,6 @@ #include <mach/memory.h> 
#endif -/* - * Allow for constants defined here to be used from assembly code - * by prepending the UL suffix only with actual C code compilation. - */ -#define UL(x) _AC(x, UL) - /* PAGE_OFFSET - the virtual address of the start of the kernel image */ #define PAGE_OFFSET UL(CONFIG_PAGE_OFFSET) diff --git a/arch/arm/mach-npcm/npcm7xx.c b/arch/arm/mach-npcm/npcm7xx.c index 5f7cd88103ef..c5f77d854c4f 100644 --- a/arch/arm/mach-npcm/npcm7xx.c +++ b/arch/arm/mach-npcm/npcm7xx.c @@ -17,4 +17,6 @@ static const char *const npcm7xx_dt_match[] = { DT_MACHINE_START(NPCM7XX_DT, "NPCM7XX Chip family") .atag_offset = 0x100, .dt_compat = npcm7xx_dt_match, + .l2c_aux_val = 0x0, + .l2c_aux_mask = ~0x0, MACHINE_END diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index ada8eb206a90..8c398fedbbb6 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -466,6 +466,12 @@ void __init dma_contiguous_early_fixup(phys_addr_t base, unsigned long size) void __init dma_contiguous_remap(void) { int i; + + if (!dma_mmu_remap_num) + return; + + /* call flush_cache_all() since CMA area would be large enough */ + flush_cache_all(); for (i = 0; i < dma_mmu_remap_num; i++) { phys_addr_t start = dma_mmu_remap[i].base; phys_addr_t end = start + dma_mmu_remap[i].size; @@ -498,7 +504,15 @@ void __init dma_contiguous_remap(void) flush_tlb_kernel_range(__phys_to_virt(start), __phys_to_virt(end)); - iotable_init(&map, 1); + /* + * All the memory in CMA region will be on ZONE_MOVABLE. + * If that zone is considered as highmem, the memory in CMA + * region is also considered as highmem even if it's + * physical address belong to lowmem. In this case, + * re-mapping isn't required. 
+ */ + if (!is_highmem_idx(ZONE_MOVABLE)) + iotable_init(&map, 1); } } diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c index eb1de66517d5..f866870db749 100644 --- a/arch/arm/mm/mmap.c +++ b/arch/arm/mm/mmap.c @@ -21,20 +21,20 @@ #define MIN_GAP (128*1024*1024UL) #define MAX_GAP ((TASK_SIZE)/6*5) -static int mmap_is_legacy(void) +static int mmap_is_legacy(struct rlimit *rlim_stack) { if (current->personality & ADDR_COMPAT_LAYOUT) return 1; - if (rlimit(RLIMIT_STACK) == RLIM_INFINITY) + if (rlim_stack->rlim_cur == RLIM_INFINITY) return 1; return sysctl_legacy_va_layout; } -static unsigned long mmap_base(unsigned long rnd) +static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) { - unsigned long gap = rlimit(RLIMIT_STACK); + unsigned long gap = rlim_stack->rlim_cur; if (gap < MIN_GAP) gap = MIN_GAP; @@ -180,18 +180,18 @@ unsigned long arch_mmap_rnd(void) return rnd << PAGE_SHIFT; } -void arch_pick_mmap_layout(struct mm_struct *mm) +void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { unsigned long random_factor = 0UL; if (current->flags & PF_RANDOMIZE) random_factor = arch_mmap_rnd(); - if (mmap_is_legacy()) { + if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; mm->get_unmapped_area = arch_get_unmapped_area; } else { - mm->mmap_base = mmap_base(random_factor); + mm->mmap_base = mmap_base(random_factor, rlim_stack); mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index 7dfcec4700fe..0094c6653b06 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -140,10 +140,8 @@ static inline void __flush_icache_all(void) dsb(ish); } -#define flush_dcache_mmap_lock(mapping) \ - spin_lock_irq(&(mapping)->tree_lock) -#define flush_dcache_mmap_unlock(mapping) \ - spin_unlock_irq(&(mapping)->tree_lock) +#define flush_dcache_mmap_lock(mapping) do { } while (0) 
+#define flush_dcache_mmap_unlock(mapping) do { } while (0) /* * We don't appear to need to do anything here. In fact, if we did, we'd diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 50fa96a49792..49d99214f43c 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -29,12 +29,6 @@ #include <asm/sizes.h> /* - * Allow for constants defined here to be used from assembly code - * by prepending the UL suffix only with actual C code compilation. - */ -#define UL(x) _AC(x, UL) - -/* * Size of the PCI I/O space. This must remain a power of two so that * IO_SPACE_LIMIT acts as a mask for the low bits of I/O addresses. */ diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index decccffb03ca..842c8a5fcd53 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -38,12 +38,12 @@ #define MIN_GAP (SZ_128M) #define MAX_GAP (STACK_TOP/6*5) -static int mmap_is_legacy(void) +static int mmap_is_legacy(struct rlimit *rlim_stack) { if (current->personality & ADDR_COMPAT_LAYOUT) return 1; - if (rlimit(RLIMIT_STACK) == RLIM_INFINITY) + if (rlim_stack->rlim_cur == RLIM_INFINITY) return 1; return sysctl_legacy_va_layout; @@ -62,9 +62,9 @@ unsigned long arch_mmap_rnd(void) return rnd << PAGE_SHIFT; } -static unsigned long mmap_base(unsigned long rnd) +static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) { - unsigned long gap = rlimit(RLIMIT_STACK); + unsigned long gap = rlim_stack->rlim_cur; unsigned long pad = (STACK_RND_MASK << PAGE_SHIFT) + stack_guard_gap; /* Values close to RLIM_INFINITY can overflow. 
*/ @@ -83,7 +83,7 @@ static unsigned long mmap_base(unsigned long rnd) * This function, called very early during the creation of a new process VM * image, sets up which VM layout function to use: */ -void arch_pick_mmap_layout(struct mm_struct *mm) +void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { unsigned long random_factor = 0UL; @@ -94,11 +94,11 @@ void arch_pick_mmap_layout(struct mm_struct *mm) * Fall back to the standard layout if the personality bit is set, or * if the expected stack growth is unlimited: */ - if (mmap_is_legacy()) { + if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; mm->get_unmapped_area = arch_get_unmapped_area; } else { - mm->mmap_base = mmap_base(random_factor); + mm->mmap_base = mmap_base(random_factor, rlim_stack); mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } diff --git a/arch/microblaze/include/asm/pci.h b/arch/microblaze/include/asm/pci.h index 114b93488193..5de871eb4a59 100644 --- a/arch/microblaze/include/asm/pci.h +++ b/arch/microblaze/include/asm/pci.h @@ -47,9 +47,10 @@ extern int pci_proc_domain(struct pci_bus *bus); struct vm_area_struct; -/* Tell drivers/pci/proc.c that we have pci_mmap_page_range() */ -#define HAVE_PCI_MMAP 1 -#define arch_can_pci_mmap_io() 1 +/* Tell PCI code what kind of PCI resource mappings we support */ +#define HAVE_PCI_MMAP 1 +#define ARCH_GENERIC_PCI_MMAP_RESOURCE 1 +#define arch_can_pci_mmap_io() 1 extern int pci_legacy_read(struct pci_bus *bus, loff_t port, u32 *val, size_t count); diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h index e53b8532353c..db8b1fa83452 100644 --- a/arch/microblaze/include/asm/pgtable.h +++ b/arch/microblaze/include/asm/pgtable.h @@ -33,6 +33,8 @@ extern int mem_init_done; #define PAGE_KERNEL __pgprot(0) /* these mean nothing to non MMU */ #define pgprot_noncached(x) (x) +#define pgprot_writecombine pgprot_noncached +#define pgprot_device 
pgprot_noncached #define __swp_type(x) (0) #define __swp_offset(x) (0) diff --git a/arch/microblaze/pci/pci-common.c b/arch/microblaze/pci/pci-common.c index ae79e8638d50..161f9758c631 100644 --- a/arch/microblaze/pci/pci-common.c +++ b/arch/microblaze/pci/pci-common.c @@ -151,72 +151,22 @@ void pcibios_set_master(struct pci_dev *dev) } /* - * Platform support for /proc/bus/pci/X/Y mmap()s, - * modelled on the sparc64 implementation by Dave Miller. - * -- paulus. + * Platform support for /proc/bus/pci/X/Y mmap()s. */ -/* - * Adjust vm_pgoff of VMA such that it is the physical page offset - * corresponding to the 32-bit pci bus offset for DEV requested by the user. - * - * Basically, the user finds the base address for his device which he wishes - * to mmap. They read the 32-bit value from the config space base register, - * add whatever PAGE_SIZE multiple offset they wish, and feed this into the - * offset parameter of mmap on /proc/bus/pci/XXX for that device. - * - * Returns negative error code on failure, zero on success. - */ -static struct resource *__pci_mmap_make_offset(struct pci_dev *dev, - resource_size_t *offset, - enum pci_mmap_state mmap_state) +int pci_iobar_pfn(struct pci_dev *pdev, int bar, struct vm_area_struct *vma) { - struct pci_controller *hose = pci_bus_to_host(dev->bus); - unsigned long io_offset = 0; - int i, res_bit; + struct pci_controller *hose = pci_bus_to_host(pdev->bus); + resource_size_t ioaddr = pci_resource_start(pdev, bar); if (!hose) - return NULL; /* should never happen */ - - /* If memory, add on the PCI bridge address offset */ - if (mmap_state == pci_mmap_mem) { -#if 0 /* See comment in pci_resource_to_user() for why this is disabled */ - *offset += hose->pci_mem_offset; -#endif - res_bit = IORESOURCE_MEM; - } else { - io_offset = (unsigned long)hose->io_base_virt - _IO_BASE; - *offset += io_offset; - res_bit = IORESOURCE_IO; - } - - /* - * Check that the offset requested corresponds to one of the - * resources of the device. 
- */ - for (i = 0; i <= PCI_ROM_RESOURCE; i++) { - struct resource *rp = &dev->resource[i]; - int flags = rp->flags; + return -EINVAL; /* should never happen */ - /* treat ROM as memory (should be already) */ - if (i == PCI_ROM_RESOURCE) - flags |= IORESOURCE_MEM; - - /* Active and same type? */ - if ((flags & res_bit) == 0) - continue; - - /* In the range of this resource? */ - if (*offset < (rp->start & PAGE_MASK) || *offset > rp->end) - continue; - - /* found it! construct the final physical address */ - if (mmap_state == pci_mmap_io) - *offset += hose->io_base_phys - io_offset; - return rp; - } + /* Convert to an offset within this PCI controller */ + ioaddr -= (unsigned long)hose->io_base_virt - _IO_BASE; - return NULL; + vma->vm_pgoff += (ioaddr + hose->io_base_phys) >> PAGE_SHIFT; + return 0; } /* @@ -268,37 +218,6 @@ pgprot_t pci_phys_mem_access_prot(struct file *file, return prot; } -/* - * Perform the actual remap of the pages for a PCI device mapping, as - * appropriate for this architecture. The region in the process to map - * is described by vm_start and vm_end members of VMA, the base physical - * address is found in vm_pgoff. - * The pci device structure is provided so that architectures may make mapping - * decisions on a per-device or per-bus basis. - * - * Returns a negative error code on failure, zero on success. 
- */ -int pci_mmap_page_range(struct pci_dev *dev, int bar, struct vm_area_struct *vma, - enum pci_mmap_state mmap_state, int write_combine) -{ - resource_size_t offset = - ((resource_size_t)vma->vm_pgoff) << PAGE_SHIFT; - struct resource *rp; - int ret; - - rp = __pci_mmap_make_offset(dev, &offset, mmap_state); - if (rp == NULL) - return -EINVAL; - - vma->vm_pgoff = offset >> PAGE_SHIFT; - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - - ret = remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, - vma->vm_end - vma->vm_start, vma->vm_page_prot); - - return ret; -} - /* This provides legacy IO read access on a bus */ int pci_legacy_read(struct pci_bus *bus, loff_t port, u32 *val, size_t size) { diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index 606e02ca4b6c..3035ca499cd8 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h @@ -50,6 +50,7 @@ #define MAP_NONBLOCK 0x20000 /* do not block on IO */ #define MAP_STACK 0x40000 /* give out an address that is best suited for process/thread stacks */ #define MAP_HUGETLB 0x80000 /* create a huge page mapping */ +#define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */ /* * Flags for msync diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c index 33d3251ecd37..2f616ebeb7e0 100644 --- a/arch/mips/mm/mmap.c +++ b/arch/mips/mm/mmap.c @@ -24,20 +24,20 @@ EXPORT_SYMBOL(shm_align_mask); #define MIN_GAP (128*1024*1024UL) #define MAX_GAP ((TASK_SIZE)/6*5) -static int mmap_is_legacy(void) +static int mmap_is_legacy(struct rlimit *rlim_stack) { if (current->personality & ADDR_COMPAT_LAYOUT) return 1; - if (rlimit(RLIMIT_STACK) == RLIM_INFINITY) + if (rlim_stack->rlim_cur == RLIM_INFINITY) return 1; return sysctl_legacy_va_layout; } -static unsigned long mmap_base(unsigned long rnd) +static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) { - unsigned long gap = rlimit(RLIMIT_STACK); + unsigned long 
gap = rlim_stack->rlim_cur; if (gap < MIN_GAP) gap = MIN_GAP; @@ -158,18 +158,18 @@ unsigned long arch_mmap_rnd(void) return rnd << PAGE_SHIFT; } -void arch_pick_mmap_layout(struct mm_struct *mm) +void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { unsigned long random_factor = 0UL; if (current->flags & PF_RANDOMIZE) random_factor = arch_mmap_rnd(); - if (mmap_is_legacy()) { + if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; mm->get_unmapped_area = arch_get_unmapped_area; } else { - mm->mmap_base = mmap_base(random_factor); + mm->mmap_base = mmap_base(random_factor, rlim_stack); mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } diff --git a/arch/nds32/include/asm/cacheflush.h b/arch/nds32/include/asm/cacheflush.h index 7b9b20a381cb..1240f148ec0f 100644 --- a/arch/nds32/include/asm/cacheflush.h +++ b/arch/nds32/include/asm/cacheflush.h @@ -34,8 +34,8 @@ void flush_anon_page(struct vm_area_struct *vma, void flush_kernel_dcache_page(struct page *page); void flush_icache_range(unsigned long start, unsigned long end); void flush_icache_page(struct vm_area_struct *vma, struct page *page); -#define flush_dcache_mmap_lock(mapping) spin_lock_irq(&(mapping)->tree_lock) -#define flush_dcache_mmap_unlock(mapping) spin_unlock_irq(&(mapping)->tree_lock) +#define flush_dcache_mmap_lock(mapping) xa_lock_irq(&(mapping)->i_pages) +#define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&(mapping)->i_pages) #else #include <asm-generic/cacheflush.h> diff --git a/arch/nios2/include/asm/cacheflush.h b/arch/nios2/include/asm/cacheflush.h index 55e383c173f7..18eb9f69f806 100644 --- a/arch/nios2/include/asm/cacheflush.h +++ b/arch/nios2/include/asm/cacheflush.h @@ -46,9 +46,7 @@ extern void copy_from_user_page(struct vm_area_struct *vma, struct page *page, extern void flush_dcache_range(unsigned long start, unsigned long end); extern void invalidate_dcache_range(unsigned long start, unsigned long end); -#define 
flush_dcache_mmap_lock(mapping) \ - spin_lock_irq(&(mapping)->tree_lock) -#define flush_dcache_mmap_unlock(mapping) \ - spin_unlock_irq(&(mapping)->tree_lock) +#define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages) +#define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages) #endif /* _ASM_NIOS2_CACHEFLUSH_H */ diff --git a/arch/nios2/kernel/time.c b/arch/nios2/kernel/time.c index 20e86209ef2e..ab88b6dd4679 100644 --- a/arch/nios2/kernel/time.c +++ b/arch/nios2/kernel/time.c @@ -336,9 +336,9 @@ static int __init nios2_time_init(struct device_node *timer) return ret; } -void read_persistent_clock(struct timespec *ts) +void read_persistent_clock64(struct timespec64 *ts) { - ts->tv_sec = mktime(2007, 1, 1, 0, 0, 0); + ts->tv_sec = mktime64(2007, 1, 1, 0, 0, 0); ts->tv_nsec = 0; } diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h index bd5ce31936f5..0c83644bfa5c 100644 --- a/arch/parisc/include/asm/cacheflush.h +++ b/arch/parisc/include/asm/cacheflush.h @@ -55,10 +55,8 @@ void invalidate_kernel_vmap_range(void *vaddr, int size); #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 extern void flush_dcache_page(struct page *page); -#define flush_dcache_mmap_lock(mapping) \ - spin_lock_irq(&(mapping)->tree_lock) -#define flush_dcache_mmap_unlock(mapping) \ - spin_unlock_irq(&(mapping)->tree_lock) +#define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages) +#define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages) #define flush_icache_page(vma,page) do { \ flush_kernel_dcache_page(page); \ diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index a056a642bb31..870fbf8c7088 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -26,6 +26,7 @@ #define MAP_NONBLOCK 0x20000 /* do not block on IO */ #define MAP_STACK 0x40000 /* give out an address that is best suited for process/thread stacks */ #define MAP_HUGETLB 
0x80000 /* create a huge page mapping */ +#define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */ #define MS_SYNC 1 /* synchronous memory sync */ #define MS_ASYNC 2 /* sync memory asynchronously */ diff --git a/arch/parisc/kernel/sys_parisc.c b/arch/parisc/kernel/sys_parisc.c index 8c99ebbe2bac..43b308cfdf53 100644 --- a/arch/parisc/kernel/sys_parisc.c +++ b/arch/parisc/kernel/sys_parisc.c @@ -70,12 +70,18 @@ static inline unsigned long COLOR_ALIGN(unsigned long addr, * Top of mmap area (just below the process stack). */ -static unsigned long mmap_upper_limit(void) +/* + * When called from arch_get_unmapped_area(), rlim_stack will be NULL, + * indicating that "current" should be used instead of a passed-in + * value from the exec bprm as done with arch_pick_mmap_layout(). + */ +static unsigned long mmap_upper_limit(struct rlimit *rlim_stack) { unsigned long stack_base; /* Limit stack size - see setup_arg_pages() in fs/exec.c */ - stack_base = rlimit_max(RLIMIT_STACK); + stack_base = rlim_stack ? rlim_stack->rlim_max + : rlimit_max(RLIMIT_STACK); if (stack_base > STACK_SIZE_MAX) stack_base = STACK_SIZE_MAX; @@ -127,7 +133,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, info.flags = 0; info.length = len; info.low_limit = mm->mmap_legacy_base; - info.high_limit = mmap_upper_limit(); + info.high_limit = mmap_upper_limit(NULL); info.align_mask = last_mmap ? 
(PAGE_MASK & (SHM_COLOUR - 1)) : 0; info.align_offset = shared_align_offset(last_mmap, pgoff); addr = vm_unmapped_area(&info); @@ -250,10 +256,10 @@ static unsigned long mmap_legacy_base(void) * This function, called very early during the creation of a new * process VM image, sets up which VM layout function to use: */ -void arch_pick_mmap_layout(struct mm_struct *mm) +void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { mm->mmap_legacy_base = mmap_legacy_base(); - mm->mmap_base = mmap_upper_limit(); + mm->mmap_base = mmap_upper_limit(rlim_stack); if (mmap_is_legacy()) { mm->mmap_base = mm->mmap_legacy_base; diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c index d503f344e476..b24ce40acd47 100644 --- a/arch/powerpc/mm/mmap.c +++ b/arch/powerpc/mm/mmap.c @@ -39,12 +39,12 @@ #define MIN_GAP (128*1024*1024) #define MAX_GAP (TASK_SIZE/6*5) -static inline int mmap_is_legacy(void) +static inline int mmap_is_legacy(struct rlimit *rlim_stack) { if (current->personality & ADDR_COMPAT_LAYOUT) return 1; - if (rlimit(RLIMIT_STACK) == RLIM_INFINITY) + if (rlim_stack->rlim_cur == RLIM_INFINITY) return 1; return sysctl_legacy_va_layout; @@ -76,9 +76,10 @@ static inline unsigned long stack_maxrandom_size(void) return (1<<30); } -static inline unsigned long mmap_base(unsigned long rnd) +static inline unsigned long mmap_base(unsigned long rnd, + struct rlimit *rlim_stack) { - unsigned long gap = rlimit(RLIMIT_STACK); + unsigned long gap = rlim_stack->rlim_cur; unsigned long pad = stack_maxrandom_size() + stack_guard_gap; /* Values close to RLIM_INFINITY can overflow. 
*/ @@ -196,26 +197,28 @@ radix__arch_get_unmapped_area_topdown(struct file *filp, } static void radix__arch_pick_mmap_layout(struct mm_struct *mm, - unsigned long random_factor) + unsigned long random_factor, + struct rlimit *rlim_stack) { - if (mmap_is_legacy()) { + if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = radix__arch_get_unmapped_area; } else { - mm->mmap_base = mmap_base(random_factor); + mm->mmap_base = mmap_base(random_factor, rlim_stack); mm->get_unmapped_area = radix__arch_get_unmapped_area_topdown; } } #else /* dummy */ extern void radix__arch_pick_mmap_layout(struct mm_struct *mm, - unsigned long random_factor); + unsigned long random_factor, + struct rlimit *rlim_stack); #endif /* * This function, called very early during the creation of a new * process VM image, sets up which VM layout function to use: */ -void arch_pick_mmap_layout(struct mm_struct *mm) +void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { unsigned long random_factor = 0UL; @@ -223,16 +226,17 @@ void arch_pick_mmap_layout(struct mm_struct *mm) random_factor = arch_mmap_rnd(); if (radix_enabled()) - return radix__arch_pick_mmap_layout(mm, random_factor); + return radix__arch_pick_mmap_layout(mm, random_factor, + rlim_stack); /* * Fall back to the standard layout if the personality * bit is set, or if the expected stack growth is unlimited: */ - if (mmap_is_legacy()) { + if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = arch_get_unmapped_area; } else { - mm->mmap_base = mmap_base(random_factor); + mm->mmap_base = mmap_base(random_factor, rlim_stack); mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c index 9a8a084e4aba..4c615fcb0cf0 100644 --- a/arch/powerpc/mm/mmu_context_iommu.c +++ b/arch/powerpc/mm/mmu_context_iommu.c @@ -75,8 +75,7 @@ 
EXPORT_SYMBOL_GPL(mm_iommu_preregistered); /* * Taken from alloc_migrate_target with changes to remove CMA allocations */ -struct page *new_iommu_non_cma_page(struct page *page, unsigned long private, - int **resultp) +struct page *new_iommu_non_cma_page(struct page *page, unsigned long private) { gfp_t gfp_mask = GFP_USER; struct page *new_page; diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index 831bdcf407bb..0a7627cdb34e 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -37,11 +37,11 @@ static unsigned long stack_maxrandom_size(void) #define MIN_GAP (32*1024*1024) #define MAX_GAP (STACK_TOP/6*5) -static inline int mmap_is_legacy(void) +static inline int mmap_is_legacy(struct rlimit *rlim_stack) { if (current->personality & ADDR_COMPAT_LAYOUT) return 1; - if (rlimit(RLIMIT_STACK) == RLIM_INFINITY) + if (rlim_stack->rlim_cur == RLIM_INFINITY) return 1; return sysctl_legacy_va_layout; } @@ -56,9 +56,10 @@ static unsigned long mmap_base_legacy(unsigned long rnd) return TASK_UNMAPPED_BASE + rnd; } -static inline unsigned long mmap_base(unsigned long rnd) +static inline unsigned long mmap_base(unsigned long rnd, + struct rlimit *rlim_stack) { - unsigned long gap = rlimit(RLIMIT_STACK); + unsigned long gap = rlim_stack->rlim_cur; if (gap < MIN_GAP) gap = MIN_GAP; @@ -184,7 +185,7 @@ check_asce_limit: * This function, called very early during the creation of a new * process VM image, sets up which VM layout function to use: */ -void arch_pick_mmap_layout(struct mm_struct *mm) +void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { unsigned long random_factor = 0UL; @@ -195,11 +196,11 @@ void arch_pick_mmap_layout(struct mm_struct *mm) * Fall back to the standard layout if the personality * bit is set, or if the expected stack growth is unlimited: */ - if (mmap_is_legacy()) { + if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = mmap_base_legacy(random_factor); mm->get_unmapped_area = arch_get_unmapped_area; } else { - 
mm->mmap_base = mmap_base(random_factor); + mm->mmap_base = mmap_base(random_factor, rlim_stack); mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index 348a17ecdf66..9ef8de63f28b 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -276,7 +276,7 @@ static unsigned long mmap_rnd(void) return rnd << PAGE_SHIFT; } -void arch_pick_mmap_layout(struct mm_struct *mm) +void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { unsigned long random_factor = mmap_rnd(); unsigned long gap; @@ -285,7 +285,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) * Fall back to the standard layout if the personality * bit is set, or if the expected stack growth is unlimited: */ - gap = rlimit(RLIMIT_STACK); + gap = rlim_stack->rlim_cur; if (!test_thread_flag(TIF_32BIT) || (current->personality & ADDR_COMPAT_LAYOUT) || gap == RLIM_INFINITY || diff --git a/arch/um/Kconfig.net b/arch/um/Kconfig.net index e871af24d9cd..c390f3deb0dc 100644 --- a/arch/um/Kconfig.net +++ b/arch/um/Kconfig.net @@ -109,6 +109,17 @@ config UML_NET_DAEMON more than one without conflict. If you don't need UML networking, say N. +config UML_NET_VECTOR + bool "Vector I/O high performance network devices" + depends on UML_NET + help + This User-Mode Linux network driver uses multi-message send + and receive functions. The host running the UML guest must have + a linux kernel version above 3.0 and a libc version > 2.13. + This driver provides tap, raw, gre and l2tpv3 network transports + with up to 4 times higher network throughput than the UML network + drivers. 
+ config UML_NET_VDE bool "VDE transport" depends on UML_NET diff --git a/arch/um/drivers/Makefile b/arch/um/drivers/Makefile index e7582e1d248c..16b3cebddafb 100644 --- a/arch/um/drivers/Makefile +++ b/arch/um/drivers/Makefile @@ -9,6 +9,7 @@ slip-objs := slip_kern.o slip_user.o slirp-objs := slirp_kern.o slirp_user.o daemon-objs := daemon_kern.o daemon_user.o +vector-objs := vector_kern.o vector_user.o vector_transports.o umcast-objs := umcast_kern.o umcast_user.o net-objs := net_kern.o net_user.o mconsole-objs := mconsole_kern.o mconsole_user.o @@ -43,6 +44,7 @@ obj-$(CONFIG_STDERR_CONSOLE) += stderr_console.o obj-$(CONFIG_UML_NET_SLIP) += slip.o slip_common.o obj-$(CONFIG_UML_NET_SLIRP) += slirp.o slip_common.o obj-$(CONFIG_UML_NET_DAEMON) += daemon.o +obj-$(CONFIG_UML_NET_VECTOR) += vector.o obj-$(CONFIG_UML_NET_VDE) += vde.o obj-$(CONFIG_UML_NET_MCAST) += umcast.o obj-$(CONFIG_UML_NET_PCAP) += pcap.o @@ -61,7 +63,7 @@ obj-$(CONFIG_BLK_DEV_COW_COMMON) += cow_user.o obj-$(CONFIG_UML_RANDOM) += random.o # pcap_user.o must be added explicitly. -USER_OBJS := fd.o null.o pty.o tty.o xterm.o slip_common.o pcap_user.o vde_user.o +USER_OBJS := fd.o null.o pty.o tty.o xterm.o slip_common.o pcap_user.o vde_user.o vector_user.o CFLAGS_null.o = -DDEV_NULL=$(DEV_NULL_PATH) include arch/um/scripts/Makefile.rules diff --git a/arch/um/drivers/chan_kern.c b/arch/um/drivers/chan_kern.c index acbe6c67afba..05588f9466c7 100644 --- a/arch/um/drivers/chan_kern.c +++ b/arch/um/drivers/chan_kern.c @@ -171,56 +171,19 @@ int enable_chan(struct line *line) return err; } -/* Items are added in IRQ context, when free_irq can't be called, and - * removed in process context, when it can. - * This handles interrupt sources which disappear, and which need to - * be permanently disabled. This is discovered in IRQ context, but - * the freeing of the IRQ must be done later. 
- */ -static DEFINE_SPINLOCK(irqs_to_free_lock); -static LIST_HEAD(irqs_to_free); - -void free_irqs(void) -{ - struct chan *chan; - LIST_HEAD(list); - struct list_head *ele; - unsigned long flags; - - spin_lock_irqsave(&irqs_to_free_lock, flags); - list_splice_init(&irqs_to_free, &list); - spin_unlock_irqrestore(&irqs_to_free_lock, flags); - - list_for_each(ele, &list) { - chan = list_entry(ele, struct chan, free_list); - - if (chan->input && chan->enabled) - um_free_irq(chan->line->driver->read_irq, chan); - if (chan->output && chan->enabled) - um_free_irq(chan->line->driver->write_irq, chan); - chan->enabled = 0; - } -} - static void close_one_chan(struct chan *chan, int delay_free_irq) { - unsigned long flags; - if (!chan->opened) return; - if (delay_free_irq) { - spin_lock_irqsave(&irqs_to_free_lock, flags); - list_add(&chan->free_list, &irqs_to_free); - spin_unlock_irqrestore(&irqs_to_free_lock, flags); - } - else { - if (chan->input && chan->enabled) - um_free_irq(chan->line->driver->read_irq, chan); - if (chan->output && chan->enabled) - um_free_irq(chan->line->driver->write_irq, chan); - chan->enabled = 0; - } + /* we can safely call free now - it will be marked + * as free and freed once the IRQ stopped processing + */ + if (chan->input && chan->enabled) + um_free_irq(chan->line->driver->read_irq, chan); + if (chan->output && chan->enabled) + um_free_irq(chan->line->driver->write_irq, chan); + chan->enabled = 0; if (chan->ops->close != NULL) (*chan->ops->close)(chan->fd, chan->data); diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c index 366e57f5e8d6..8d80b27502e6 100644 --- a/arch/um/drivers/line.c +++ b/arch/um/drivers/line.c @@ -284,7 +284,7 @@ int line_setup_irq(int fd, int input, int output, struct line *line, void *data) if (err) return err; if (output) - err = um_request_irq(driver->write_irq, fd, IRQ_WRITE, + err = um_request_irq(driver->write_irq, fd, IRQ_NONE, line_write_interrupt, IRQF_SHARED, driver->write_irq_name, data); return 
err; diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c index b305f8247909..3ef1b48e064a 100644 --- a/arch/um/drivers/net_kern.c +++ b/arch/um/drivers/net_kern.c @@ -288,7 +288,7 @@ static void uml_net_user_timer_expire(struct timer_list *t) #endif } -static void setup_etheraddr(struct net_device *dev, char *str) +void uml_net_setup_etheraddr(struct net_device *dev, char *str) { unsigned char *addr = dev->dev_addr; char *end; @@ -412,7 +412,7 @@ static void eth_configure(int n, void *init, char *mac, */ snprintf(dev->name, sizeof(dev->name), "eth%d", n); - setup_etheraddr(dev, mac); + uml_net_setup_etheraddr(dev, mac); printk(KERN_INFO "Netdevice %d (%pM) : ", n, dev->dev_addr); diff --git a/arch/um/drivers/random.c b/arch/um/drivers/random.c index 37c51a6be690..778a0e52d5a5 100644 --- a/arch/um/drivers/random.c +++ b/arch/um/drivers/random.c @@ -13,6 +13,7 @@ #include <linux/miscdevice.h> #include <linux/delay.h> #include <linux/uaccess.h> +#include <init.h> #include <irq_kern.h> #include <os.h> @@ -154,7 +155,14 @@ err_out_cleanup_hw: /* * rng_cleanup - shutdown RNG module */ -static void __exit rng_cleanup (void) + +static void cleanup(void) +{ + free_irq_by_fd(random_fd); + os_close_file(random_fd); +} + +static void __exit rng_cleanup(void) { os_close_file(random_fd); misc_deregister (&rng_miscdev); @@ -162,6 +170,7 @@ static void __exit rng_cleanup (void) module_init (rng_init); module_exit (rng_cleanup); +__uml_exitcall(cleanup); MODULE_DESCRIPTION("UML Host Random Number Generator (RNG) driver"); MODULE_LICENSE("GPL"); diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index b55fe9bf5d3e..d4e8c497ae86 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -1587,11 +1587,11 @@ int io_thread(void *arg) do { res = os_write_file(kernel_fd, ((char *) io_req_buffer) + written, n); - if (res > 0) { + if (res >= 0) { written += res; } else { if (res != -EAGAIN) { - printk("io_thread - read failed, fd = %d, " 
+ printk("io_thread - write failed, fd = %d, " "err = %d\n", kernel_fd, -n); } } diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c new file mode 100644 index 000000000000..02168fe25105 --- /dev/null +++ b/arch/um/drivers/vector_kern.c @@ -0,0 +1,1633 @@ +/* + * Copyright (C) 2017 - Cambridge Greys Limited + * Copyright (C) 2011 - 2014 Cisco Systems Inc + * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Copyright (C) 2001 Lennert Buytenhek ([email protected]) and + * James Leu ([email protected]). + * Copyright (C) 2001 by various other people who didn't put their name here. + * Licensed under the GPL. + */ + +#include <linux/version.h> +#include <linux/bootmem.h> +#include <linux/etherdevice.h> +#include <linux/ethtool.h> +#include <linux/inetdevice.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/netdevice.h> +#include <linux/platform_device.h> +#include <linux/rtnetlink.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/interrupt.h> +#include <init.h> +#include <irq_kern.h> +#include <irq_user.h> +#include <net_kern.h> +#include <os.h> +#include "mconsole_kern.h" +#include "vector_user.h" +#include "vector_kern.h" + +/* + * Adapted from network devices with the following major changes: + * All transports are static - simplifies the code significantly + * Multiple FDs/IRQs per device + * Vector IO optionally used for read/write, falling back to legacy + * based on configuration and/or availability + * Configuration is no longer positional - L2TPv3 and GRE require up to + * 10 parameters, passing this as positional is not fit for purpose. 
+ * Only socket transports are supported + */ + + +#define DRIVER_NAME "uml-vector" +#define DRIVER_VERSION "01" +struct vector_cmd_line_arg { + struct list_head list; + int unit; + char *arguments; +}; + +struct vector_device { + struct list_head list; + struct net_device *dev; + struct platform_device pdev; + int unit; + int opened; +}; + +static LIST_HEAD(vec_cmd_line); + +static DEFINE_SPINLOCK(vector_devices_lock); +static LIST_HEAD(vector_devices); + +static int driver_registered; + +static void vector_eth_configure(int n, struct arglist *def); + +/* Argument accessors to set variables (and/or set default values) + * mtu, buffer sizing, default headroom, etc + */ + +#define DEFAULT_HEADROOM 2 +#define SAFETY_MARGIN 32 +#define DEFAULT_VECTOR_SIZE 64 +#define TX_SMALL_PACKET 128 +#define MAX_IOV_SIZE (MAX_SKB_FRAGS + 1) + +static const struct { + const char string[ETH_GSTRING_LEN]; +} ethtool_stats_keys[] = { + { "rx_queue_max" }, + { "rx_queue_running_average" }, + { "tx_queue_max" }, + { "tx_queue_running_average" }, + { "rx_encaps_errors" }, + { "tx_timeout_count" }, + { "tx_restart_queue" }, + { "tx_kicks" }, + { "tx_flow_control_xon" }, + { "tx_flow_control_xoff" }, + { "rx_csum_offload_good" }, + { "rx_csum_offload_errors"}, + { "sg_ok"}, + { "sg_linearized"}, +}; + +#define VECTOR_NUM_STATS ARRAY_SIZE(ethtool_stats_keys) + +static void vector_reset_stats(struct vector_private *vp) +{ + vp->estats.rx_queue_max = 0; + vp->estats.rx_queue_running_average = 0; + vp->estats.tx_queue_max = 0; + vp->estats.tx_queue_running_average = 0; + vp->estats.rx_encaps_errors = 0; + vp->estats.tx_timeout_count = 0; + vp->estats.tx_restart_queue = 0; + vp->estats.tx_kicks = 0; + vp->estats.tx_flow_control_xon = 0; + vp->estats.tx_flow_control_xoff = 0; + vp->estats.sg_ok = 0; + vp->estats.sg_linearized = 0; +} + +static int get_mtu(struct arglist *def) +{ + char *mtu = uml_vector_fetch_arg(def, "mtu"); + long result; + + if (mtu != NULL) { + if (kstrtoul(mtu, 10, &result) 
== 0) + return result; + } + return ETH_MAX_PACKET; +} + +static int get_depth(struct arglist *def) +{ + char *mtu = uml_vector_fetch_arg(def, "depth"); + long result; + + if (mtu != NULL) { + if (kstrtoul(mtu, 10, &result) == 0) + return result; + } + return DEFAULT_VECTOR_SIZE; +} + +static int get_headroom(struct arglist *def) +{ + char *mtu = uml_vector_fetch_arg(def, "headroom"); + long result; + + if (mtu != NULL) { + if (kstrtoul(mtu, 10, &result) == 0) + return result; + } + return DEFAULT_HEADROOM; +} + +static int get_req_size(struct arglist *def) +{ + char *gro = uml_vector_fetch_arg(def, "gro"); + long result; + + if (gro != NULL) { + if (kstrtoul(gro, 10, &result) == 0) { + if (result > 0) + return 65536; + } + } + return get_mtu(def) + ETH_HEADER_OTHER + + get_headroom(def) + SAFETY_MARGIN; +} + + +static int get_transport_options(struct arglist *def) +{ + char *transport = uml_vector_fetch_arg(def, "transport"); + char *vector = uml_vector_fetch_arg(def, "vec"); + + int vec_rx = VECTOR_RX; + int vec_tx = VECTOR_TX; + long parsed; + + if (vector != NULL) { + if (kstrtoul(vector, 10, &parsed) == 0) { + if (parsed == 0) { + vec_rx = 0; + vec_tx = 0; + } + } + } + + + if (strncmp(transport, TRANS_TAP, TRANS_TAP_LEN) == 0) + return (vec_rx | VECTOR_BPF); + if (strncmp(transport, TRANS_RAW, TRANS_RAW_LEN) == 0) + return (vec_rx | vec_tx); + return (vec_rx | vec_tx); +} + + +/* A mini-buffer for packet drop read + * All of our supported transports are datagram oriented and we always + * read using recvmsg or recvmmsg. If we pass a buffer which is smaller + * than the packet size it still counts as full packet read and will + * clean the incoming stream to keep sigio/epoll happy + */ + +#define DROP_BUFFER_SIZE 32 + +static char *drop_buffer; + +/* Array backed queues optimized for bulk enqueue/dequeue and + * 1:N (small values of N) or 1:1 enqueuer/dequeuer ratios. 
+ * For more details and full design rationale see + * http://foswiki.cambridgegreys.com/Main/EatYourTailAndEnjoyIt + */ + + +/* + * Advance the mmsg queue head by n = advance. Resets the queue to + * maximum enqueue/dequeue-at-once capacity if possible. Called by + * dequeuers. Caller must hold the head_lock! + */ + +static int vector_advancehead(struct vector_queue *qi, int advance) +{ + int queue_depth; + + qi->head = + (qi->head + advance) + % qi->max_depth; + + + spin_lock(&qi->tail_lock); + qi->queue_depth -= advance; + + /* we are at 0, use this to + * reset head and tail so we can use max size vectors + */ + + if (qi->queue_depth == 0) { + qi->head = 0; + qi->tail = 0; + } + queue_depth = qi->queue_depth; + spin_unlock(&qi->tail_lock); + return queue_depth; +} + +/* Advance the queue tail by n = advance. + * This is called by enqueuers which should hold the + * head lock already + */ + +static int vector_advancetail(struct vector_queue *qi, int advance) +{ + int queue_depth; + + qi->tail = + (qi->tail + advance) + % qi->max_depth; + spin_lock(&qi->head_lock); + qi->queue_depth += advance; + queue_depth = qi->queue_depth; + spin_unlock(&qi->head_lock); + return queue_depth; +} + +static int prep_msg(struct vector_private *vp, + struct sk_buff *skb, + struct iovec *iov) +{ + int iov_index = 0; + int nr_frags, frag; + skb_frag_t *skb_frag; + + nr_frags = skb_shinfo(skb)->nr_frags; + if (nr_frags > MAX_IOV_SIZE) { + if (skb_linearize(skb) != 0) + goto drop; + } + if (vp->header_size > 0) { + iov[iov_index].iov_len = vp->header_size; + vp->form_header(iov[iov_index].iov_base, skb, vp); + iov_index++; + } + iov[iov_index].iov_base = skb->data; + if (nr_frags > 0) { + iov[iov_index].iov_len = skb->len - skb->data_len; + vp->estats.sg_ok++; + } else + iov[iov_index].iov_len = skb->len; + iov_index++; + for (frag = 0; frag < nr_frags; frag++) { + skb_frag = &skb_shinfo(skb)->frags[frag]; + iov[iov_index].iov_base = skb_frag_address_safe(skb_frag); + 
iov[iov_index].iov_len = skb_frag_size(skb_frag); + iov_index++; + } + return iov_index; +drop: + return -1; +} +/* + * Generic vector enqueue with support for forming headers using transport + * specific callback. Allows GRE, L2TPv3, RAW and other transports + * to use a common enqueue procedure in vector mode + */ + +static int vector_enqueue(struct vector_queue *qi, struct sk_buff *skb) +{ + struct vector_private *vp = netdev_priv(qi->dev); + int queue_depth; + int packet_len; + struct mmsghdr *mmsg_vector = qi->mmsg_vector; + int iov_count; + + spin_lock(&qi->tail_lock); + spin_lock(&qi->head_lock); + queue_depth = qi->queue_depth; + spin_unlock(&qi->head_lock); + + if (skb) + packet_len = skb->len; + + if (queue_depth < qi->max_depth) { + + *(qi->skbuff_vector + qi->tail) = skb; + mmsg_vector += qi->tail; + iov_count = prep_msg( + vp, + skb, + mmsg_vector->msg_hdr.msg_iov + ); + if (iov_count < 1) + goto drop; + mmsg_vector->msg_hdr.msg_iovlen = iov_count; + mmsg_vector->msg_hdr.msg_name = vp->fds->remote_addr; + mmsg_vector->msg_hdr.msg_namelen = vp->fds->remote_addr_size; + queue_depth = vector_advancetail(qi, 1); + } else + goto drop; + spin_unlock(&qi->tail_lock); + return queue_depth; +drop: + qi->dev->stats.tx_dropped++; + if (skb != NULL) { + packet_len = skb->len; + dev_consume_skb_any(skb); + netdev_completed_queue(qi->dev, 1, packet_len); + } + spin_unlock(&qi->tail_lock); + return queue_depth; +} + +static int consume_vector_skbs(struct vector_queue *qi, int count) +{ + struct sk_buff *skb; + int skb_index; + int bytes_compl = 0; + + for (skb_index = qi->head; skb_index < qi->head + count; skb_index++) { + skb = *(qi->skbuff_vector + skb_index); + /* mark as empty to ensure correct destruction if + * needed + */ + bytes_compl += skb->len; + *(qi->skbuff_vector + skb_index) = NULL; + dev_consume_skb_any(skb); + } + qi->dev->stats.tx_bytes += bytes_compl; + qi->dev->stats.tx_packets += count; + netdev_completed_queue(qi->dev, count, bytes_compl); + 
return vector_advancehead(qi, count); +} + +/* + * Generic vector deque via sendmmsg with support for forming headers + * using transport specific callback. Allows GRE, L2TPv3, RAW and + * other transports to use a common dequeue procedure in vector mode + */ + + +static int vector_send(struct vector_queue *qi) +{ + struct vector_private *vp = netdev_priv(qi->dev); + struct mmsghdr *send_from; + int result = 0, send_len, queue_depth = qi->max_depth; + + if (spin_trylock(&qi->head_lock)) { + if (spin_trylock(&qi->tail_lock)) { + /* update queue_depth to current value */ + queue_depth = qi->queue_depth; + spin_unlock(&qi->tail_lock); + while (queue_depth > 0) { + /* Calculate the start of the vector */ + send_len = queue_depth; + send_from = qi->mmsg_vector; + send_from += qi->head; + /* Adjust vector size if wraparound */ + if (send_len + qi->head > qi->max_depth) + send_len = qi->max_depth - qi->head; + /* Try to TX as many packets as possible */ + if (send_len > 0) { + result = uml_vector_sendmmsg( + vp->fds->tx_fd, + send_from, + send_len, + 0 + ); + vp->in_write_poll = + (result != send_len); + } + /* For some of the sendmmsg error scenarios + * we may end being unsure in the TX success + * for all packets. It is safer to declare + * them all TX-ed and blame the network. + */ + if (result < 0) { + if (net_ratelimit()) + netdev_err(vp->dev, "sendmmsg err=%i\n", + result); + result = send_len; + } + if (result > 0) { + queue_depth = + consume_vector_skbs(qi, result); + /* This is equivalent to an TX IRQ. + * Restart the upper layers to feed us + * more packets. 
+ */ + if (result > vp->estats.tx_queue_max) + vp->estats.tx_queue_max = result; + vp->estats.tx_queue_running_average = + (vp->estats.tx_queue_running_average + result) >> 1; + } + netif_trans_update(qi->dev); + netif_wake_queue(qi->dev); + /* if TX is busy, break out of the send loop, + * poll write IRQ will reschedule xmit for us + */ + if (result != send_len) { + vp->estats.tx_restart_queue++; + break; + } + } + } + spin_unlock(&qi->head_lock); + } else { + tasklet_schedule(&vp->tx_poll); + } + return queue_depth; +} + +/* Queue destructor. Deliberately stateless so we can use + * it in queue cleanup if initialization fails. + */ + +static void destroy_queue(struct vector_queue *qi) +{ + int i; + struct iovec *iov; + struct vector_private *vp = netdev_priv(qi->dev); + struct mmsghdr *mmsg_vector; + + if (qi == NULL) + return; + /* deallocate any skbuffs - we rely on any unused to be + * set to NULL. + */ + if (qi->skbuff_vector != NULL) { + for (i = 0; i < qi->max_depth; i++) { + if (*(qi->skbuff_vector + i) != NULL) + dev_kfree_skb_any(*(qi->skbuff_vector + i)); + } + kfree(qi->skbuff_vector); + } + /* deallocate matching IOV structures including header buffs */ + if (qi->mmsg_vector != NULL) { + mmsg_vector = qi->mmsg_vector; + for (i = 0; i < qi->max_depth; i++) { + iov = mmsg_vector->msg_hdr.msg_iov; + if (iov != NULL) { + if ((vp->header_size > 0) && + (iov->iov_base != NULL)) + kfree(iov->iov_base); + kfree(iov); + } + mmsg_vector++; + } + kfree(qi->mmsg_vector); + } + kfree(qi); +} + +/* + * Queue constructor. Create a queue with a given side. 
+ */ +static struct vector_queue *create_queue( + struct vector_private *vp, + int max_size, + int header_size, + int num_extra_frags) +{ + struct vector_queue *result; + int i; + struct iovec *iov; + struct mmsghdr *mmsg_vector; + + result = kmalloc(sizeof(struct vector_queue), GFP_KERNEL); + if (result == NULL) + goto out_fail; + result->max_depth = max_size; + result->dev = vp->dev; + result->mmsg_vector = kmalloc( + (sizeof(struct mmsghdr) * max_size), GFP_KERNEL); + result->skbuff_vector = kmalloc( + (sizeof(void *) * max_size), GFP_KERNEL); + if (result->mmsg_vector == NULL || result->skbuff_vector == NULL) + goto out_fail; + + mmsg_vector = result->mmsg_vector; + for (i = 0; i < max_size; i++) { + /* Clear all pointers - we use non-NULL as marking on + * what to free on destruction + */ + *(result->skbuff_vector + i) = NULL; + mmsg_vector->msg_hdr.msg_iov = NULL; + mmsg_vector++; + } + mmsg_vector = result->mmsg_vector; + result->max_iov_frags = num_extra_frags; + for (i = 0; i < max_size; i++) { + if (vp->header_size > 0) + iov = kmalloc( + sizeof(struct iovec) * (3 + num_extra_frags), + GFP_KERNEL + ); + else + iov = kmalloc( + sizeof(struct iovec) * (2 + num_extra_frags), + GFP_KERNEL + ); + if (iov == NULL) + goto out_fail; + mmsg_vector->msg_hdr.msg_iov = iov; + mmsg_vector->msg_hdr.msg_iovlen = 1; + mmsg_vector->msg_hdr.msg_control = NULL; + mmsg_vector->msg_hdr.msg_controllen = 0; + mmsg_vector->msg_hdr.msg_flags = MSG_DONTWAIT; + mmsg_vector->msg_hdr.msg_name = NULL; + mmsg_vector->msg_hdr.msg_namelen = 0; + if (vp->header_size > 0) { + iov->iov_base = kmalloc(header_size, GFP_KERNEL); + if (iov->iov_base == NULL) + goto out_fail; + iov->iov_len = header_size; + mmsg_vector->msg_hdr.msg_iovlen = 2; + iov++; + } + iov->iov_base = NULL; + iov->iov_len = 0; + mmsg_vector++; + } + spin_lock_init(&result->head_lock); + spin_lock_init(&result->tail_lock); + result->queue_depth = 0; + result->head = 0; + result->tail = 0; + return result; +out_fail: + 
destroy_queue(result); + return NULL; +} + +/* + * We do not use the RX queue as a proper wraparound queue for now + * This is not necessary because the consumption via netif_rx() + * happens in-line. While we can try using the return code of + * netif_rx() for flow control there are no drivers doing this today. + * For this RX specific use we ignore the tail/head locks and + * just read into a prepared queue filled with skbuffs. + */ + +static struct sk_buff *prep_skb( + struct vector_private *vp, + struct user_msghdr *msg) +{ + int linear = vp->max_packet + vp->headroom + SAFETY_MARGIN; + struct sk_buff *result; + int iov_index = 0, len; + struct iovec *iov = msg->msg_iov; + int err, nr_frags, frag; + skb_frag_t *skb_frag; + + if (vp->req_size <= linear) + len = linear; + else + len = vp->req_size; + result = alloc_skb_with_frags( + linear, + len - vp->max_packet, + 3, + &err, + GFP_ATOMIC + ); + if (vp->header_size > 0) + iov_index++; + if (result == NULL) { + iov[iov_index].iov_base = NULL; + iov[iov_index].iov_len = 0; + goto done; + } + skb_reserve(result, vp->headroom); + result->dev = vp->dev; + skb_put(result, vp->max_packet); + result->data_len = len - vp->max_packet; + result->len += len - vp->max_packet; + skb_reset_mac_header(result); + result->ip_summed = CHECKSUM_NONE; + iov[iov_index].iov_base = result->data; + iov[iov_index].iov_len = vp->max_packet; + iov_index++; + + nr_frags = skb_shinfo(result)->nr_frags; + for (frag = 0; frag < nr_frags; frag++) { + skb_frag = &skb_shinfo(result)->frags[frag]; + iov[iov_index].iov_base = skb_frag_address_safe(skb_frag); + if (iov[iov_index].iov_base != NULL) + iov[iov_index].iov_len = skb_frag_size(skb_frag); + else + iov[iov_index].iov_len = 0; + iov_index++; + } +done: + msg->msg_iovlen = iov_index; + return result; +} + + +/* Prepare queue for recvmmsg one-shot rx - fill with fresh sk_buffs*/ + +static void prep_queue_for_rx(struct vector_queue *qi) +{ + struct vector_private *vp = netdev_priv(qi->dev); + 
struct mmsghdr *mmsg_vector = qi->mmsg_vector; + void **skbuff_vector = qi->skbuff_vector; + int i; + + if (qi->queue_depth == 0) + return; + for (i = 0; i < qi->queue_depth; i++) { + /* it is OK if allocation fails - recvmmsg with NULL data in + * iov argument still performs an RX, just drops the packet + * This allows us stop faffing around with a "drop buffer" + */ + + *skbuff_vector = prep_skb(vp, &mmsg_vector->msg_hdr); + skbuff_vector++; + mmsg_vector++; + } + qi->queue_depth = 0; +} + +static struct vector_device *find_device(int n) +{ + struct vector_device *device; + struct list_head *ele; + + spin_lock(&vector_devices_lock); + list_for_each(ele, &vector_devices) { + device = list_entry(ele, struct vector_device, list); + if (device->unit == n) + goto out; + } + device = NULL; + out: + spin_unlock(&vector_devices_lock); + return device; +} + +static int vector_parse(char *str, int *index_out, char **str_out, + char **error_out) +{ + int n, len, err; + char *start = str; + + len = strlen(str); + + while ((*str != ':') && (strlen(str) > 1)) + str++; + if (*str != ':') { + *error_out = "Expected ':' after device number"; + return -EINVAL; + } + *str = '\0'; + + err = kstrtouint(start, 0, &n); + if (err < 0) { + *error_out = "Bad device number"; + return err; + } + + str++; + if (find_device(n)) { + *error_out = "Device already configured"; + return -EINVAL; + } + + *index_out = n; + *str_out = str; + return 0; +} + +static int vector_config(char *str, char **error_out) +{ + int err, n; + char *params; + struct arglist *parsed; + + err = vector_parse(str, &n, ¶ms, error_out); + if (err != 0) + return err; + + /* This string is broken up and the pieces used by the underlying + * driver. We should copy it to make sure things do not go wrong + * later. 
+ */ + + params = kstrdup(params, GFP_KERNEL); + if (params == NULL) { + *error_out = "vector_config failed to strdup string"; + return -ENOMEM; + } + + parsed = uml_parse_vector_ifspec(params); + + if (parsed == NULL) { + *error_out = "vector_config failed to parse parameters"; + return -EINVAL; + } + + vector_eth_configure(n, parsed); + return 0; +} + +static int vector_id(char **str, int *start_out, int *end_out) +{ + char *end; + int n; + + n = simple_strtoul(*str, &end, 0); + if ((*end != '\0') || (end == *str)) + return -1; + + *start_out = n; + *end_out = n; + *str = end; + return n; +} + +static int vector_remove(int n, char **error_out) +{ + struct vector_device *vec_d; + struct net_device *dev; + struct vector_private *vp; + + vec_d = find_device(n); + if (vec_d == NULL) + return -ENODEV; + dev = vec_d->dev; + vp = netdev_priv(dev); + if (vp->fds != NULL) + return -EBUSY; + unregister_netdev(dev); + platform_device_unregister(&vec_d->pdev); + return 0; +} + +/* + * There is no shared per-transport initialization code, so + * we will just initialize each interface one by one and + * add them to a list + */ + +static struct platform_driver uml_net_driver = { + .driver = { + .name = DRIVER_NAME, + }, +}; + + +static void vector_device_release(struct device *dev) +{ + struct vector_device *device = dev_get_drvdata(dev); + struct net_device *netdev = device->dev; + + list_del(&device->list); + kfree(device); + free_netdev(netdev); +} + +/* Bog standard recv using recvmsg - not used normally unless the user + * explicitly specifies not to use recvmmsg vector RX. 
+ */ + +static int vector_legacy_rx(struct vector_private *vp) +{ + int pkt_len; + struct user_msghdr hdr; + struct iovec iov[2 + MAX_IOV_SIZE]; /* header + data use case only */ + int iovpos = 0; + struct sk_buff *skb; + int header_check; + + hdr.msg_name = NULL; + hdr.msg_namelen = 0; + hdr.msg_iov = (struct iovec *) &iov; + hdr.msg_control = NULL; + hdr.msg_controllen = 0; + hdr.msg_flags = 0; + + if (vp->header_size > 0) { + iov[0].iov_base = vp->header_rxbuffer; + iov[0].iov_len = vp->header_size; + } + + skb = prep_skb(vp, &hdr); + + if (skb == NULL) { + /* Read a packet into drop_buffer and don't do + * anything with it. + */ + iov[iovpos].iov_base = drop_buffer; + iov[iovpos].iov_len = DROP_BUFFER_SIZE; + hdr.msg_iovlen = 1; + vp->dev->stats.rx_dropped++; + } + + pkt_len = uml_vector_recvmsg(vp->fds->rx_fd, &hdr, 0); + + if (skb != NULL) { + if (pkt_len > vp->header_size) { + if (vp->header_size > 0) { + header_check = vp->verify_header( + vp->header_rxbuffer, skb, vp); + if (header_check < 0) { + dev_kfree_skb_irq(skb); + vp->dev->stats.rx_dropped++; + vp->estats.rx_encaps_errors++; + return 0; + } + if (header_check > 0) { + vp->estats.rx_csum_offload_good++; + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + } + pskb_trim(skb, pkt_len - vp->rx_header_size); + skb->protocol = eth_type_trans(skb, skb->dev); + vp->dev->stats.rx_bytes += skb->len; + vp->dev->stats.rx_packets++; + netif_rx(skb); + } else { + dev_kfree_skb_irq(skb); + } + } + return pkt_len; +} + +/* + * Packet at a time TX which falls back to vector TX if the + * underlying transport is busy. 
+ */ + + + +static int writev_tx(struct vector_private *vp, struct sk_buff *skb) +{ + struct iovec iov[3 + MAX_IOV_SIZE]; + int iov_count, pkt_len = 0; + + iov[0].iov_base = vp->header_txbuffer; + iov_count = prep_msg(vp, skb, (struct iovec *) &iov); + + if (iov_count < 1) + goto drop; + pkt_len = uml_vector_writev( + vp->fds->tx_fd, + (struct iovec *) &iov, + iov_count + ); + + netif_trans_update(vp->dev); + netif_wake_queue(vp->dev); + + if (pkt_len > 0) { + vp->dev->stats.tx_bytes += skb->len; + vp->dev->stats.tx_packets++; + } else { + vp->dev->stats.tx_dropped++; + } + consume_skb(skb); + return pkt_len; +drop: + vp->dev->stats.tx_dropped++; + consume_skb(skb); + return pkt_len; +} + +/* + * Receive as many messages as we can in one call using the special + * mmsg vector matched to an skb vector which we prepared earlier. + */ + +static int vector_mmsg_rx(struct vector_private *vp) +{ + int packet_count, i; + struct vector_queue *qi = vp->rx_queue; + struct sk_buff *skb; + struct mmsghdr *mmsg_vector = qi->mmsg_vector; + void **skbuff_vector = qi->skbuff_vector; + int header_check; + + /* Refresh the vector and make sure it is with new skbs and the + * iovs are updated to point to them. + */ + + prep_queue_for_rx(qi); + + /* Fire the Lazy Gun - get as many packets as we can in one go. */ + + packet_count = uml_vector_recvmmsg( + vp->fds->rx_fd, qi->mmsg_vector, qi->max_depth, 0); + + if (packet_count <= 0) + return packet_count; + + /* We treat packet processing as enqueue, buffer refresh as dequeue + * The queue_depth tells us how many buffers have been used and how + * many do we need to prep the next time prep_queue_for_rx() is called. 
+ */ + + qi->queue_depth = packet_count; + + for (i = 0; i < packet_count; i++) { + skb = (*skbuff_vector); + if (mmsg_vector->msg_len > vp->header_size) { + if (vp->header_size > 0) { + header_check = vp->verify_header( + mmsg_vector->msg_hdr.msg_iov->iov_base, + skb, + vp + ); + if (header_check < 0) { + /* Overlay header failed to verify - discard. + * We can actually keep this skb and reuse it, + * but that will make the prep logic too + * complex. + */ + dev_kfree_skb_irq(skb); + vp->estats.rx_encaps_errors++; + continue; + } + if (header_check > 0) { + vp->estats.rx_csum_offload_good++; + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + } + pskb_trim(skb, + mmsg_vector->msg_len - vp->rx_header_size); + skb->protocol = eth_type_trans(skb, skb->dev); + /* + * We do not need to lock on updating stats here + * The interrupt loop is non-reentrant. + */ + vp->dev->stats.rx_bytes += skb->len; + vp->dev->stats.rx_packets++; + netif_rx(skb); + } else { + /* Overlay header too short to do anything - discard. + * We can actually keep this skb and reuse it, + * but that will make the prep logic too complex. 
+ */ + if (skb != NULL) + dev_kfree_skb_irq(skb); + } + (*skbuff_vector) = NULL; + /* Move to the next buffer element */ + mmsg_vector++; + skbuff_vector++; + } + if (packet_count > 0) { + if (vp->estats.rx_queue_max < packet_count) + vp->estats.rx_queue_max = packet_count; + vp->estats.rx_queue_running_average = + (vp->estats.rx_queue_running_average + packet_count) >> 1; + } + return packet_count; +} + +static void vector_rx(struct vector_private *vp) +{ + int err; + + if ((vp->options & VECTOR_RX) > 0) + while ((err = vector_mmsg_rx(vp)) > 0) + ; + else + while ((err = vector_legacy_rx(vp)) > 0) + ; + if ((err != 0) && net_ratelimit()) + netdev_err(vp->dev, "vector_rx: error(%d)\n", err); +} + +static int vector_net_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct vector_private *vp = netdev_priv(dev); + int queue_depth = 0; + + if ((vp->options & VECTOR_TX) == 0) { + writev_tx(vp, skb); + return NETDEV_TX_OK; + } + + /* We do BQL only in the vector path, no point doing it in + * packet at a time mode as there is no device queue + */ + + netdev_sent_queue(vp->dev, skb->len); + queue_depth = vector_enqueue(vp->tx_queue, skb); + + /* if the device queue is full, stop the upper layers and + * flush it. 
+ */ + + if (queue_depth >= vp->tx_queue->max_depth - 1) { + vp->estats.tx_kicks++; + netif_stop_queue(dev); + vector_send(vp->tx_queue); + return NETDEV_TX_OK; + } + if (skb->xmit_more) { + mod_timer(&vp->tl, vp->coalesce); + return NETDEV_TX_OK; + } + if (skb->len < TX_SMALL_PACKET) { + vp->estats.tx_kicks++; + vector_send(vp->tx_queue); + } else + tasklet_schedule(&vp->tx_poll); + return NETDEV_TX_OK; +} + +static irqreturn_t vector_rx_interrupt(int irq, void *dev_id) +{ + struct net_device *dev = dev_id; + struct vector_private *vp = netdev_priv(dev); + + if (!netif_running(dev)) + return IRQ_NONE; + vector_rx(vp); + return IRQ_HANDLED; + +} + +static irqreturn_t vector_tx_interrupt(int irq, void *dev_id) +{ + struct net_device *dev = dev_id; + struct vector_private *vp = netdev_priv(dev); + + if (!netif_running(dev)) + return IRQ_NONE; + /* We need to pay attention to it only if we got + * -EAGAIN or -ENOBUFFS from sendmmsg. Otherwise + * we ignore it. In the future, it may be worth + * it to improve the IRQ controller a bit to make + * tweaking the IRQ mask less costly + */ + + if (vp->in_write_poll) + tasklet_schedule(&vp->tx_poll); + return IRQ_HANDLED; + +} + +static int irq_rr; + +static int vector_net_close(struct net_device *dev) +{ + struct vector_private *vp = netdev_priv(dev); + unsigned long flags; + + netif_stop_queue(dev); + del_timer(&vp->tl); + + if (vp->fds == NULL) + return 0; + + /* Disable and free all IRQS */ + if (vp->rx_irq > 0) { + um_free_irq(vp->rx_irq, dev); + vp->rx_irq = 0; + } + if (vp->tx_irq > 0) { + um_free_irq(vp->tx_irq, dev); + vp->tx_irq = 0; + } + tasklet_kill(&vp->tx_poll); + if (vp->fds->rx_fd > 0) { + os_close_file(vp->fds->rx_fd); + vp->fds->rx_fd = -1; + } + if (vp->fds->tx_fd > 0) { + os_close_file(vp->fds->tx_fd); + vp->fds->tx_fd = -1; + } + if (vp->bpf != NULL) + kfree(vp->bpf); + if (vp->fds->remote_addr != NULL) + kfree(vp->fds->remote_addr); + if (vp->transport_data != NULL) + kfree(vp->transport_data); + if 
(vp->header_rxbuffer != NULL) + kfree(vp->header_rxbuffer); + if (vp->header_txbuffer != NULL) + kfree(vp->header_txbuffer); + if (vp->rx_queue != NULL) + destroy_queue(vp->rx_queue); + if (vp->tx_queue != NULL) + destroy_queue(vp->tx_queue); + kfree(vp->fds); + vp->fds = NULL; + spin_lock_irqsave(&vp->lock, flags); + vp->opened = false; + spin_unlock_irqrestore(&vp->lock, flags); + return 0; +} + +/* TX tasklet */ + +static void vector_tx_poll(unsigned long data) +{ + struct vector_private *vp = (struct vector_private *)data; + + vp->estats.tx_kicks++; + vector_send(vp->tx_queue); +} +static void vector_reset_tx(struct work_struct *work) +{ + struct vector_private *vp = + container_of(work, struct vector_private, reset_tx); + netdev_reset_queue(vp->dev); + netif_start_queue(vp->dev); + netif_wake_queue(vp->dev); +} +static int vector_net_open(struct net_device *dev) +{ + struct vector_private *vp = netdev_priv(dev); + unsigned long flags; + int err = -EINVAL; + struct vector_device *vdevice; + + spin_lock_irqsave(&vp->lock, flags); + if (vp->opened) { + spin_unlock_irqrestore(&vp->lock, flags); + return -ENXIO; + } + vp->opened = true; + spin_unlock_irqrestore(&vp->lock, flags); + + vp->fds = uml_vector_user_open(vp->unit, vp->parsed); + + if (vp->fds == NULL) + goto out_close; + + if (build_transport_data(vp) < 0) + goto out_close; + + if ((vp->options & VECTOR_RX) > 0) { + vp->rx_queue = create_queue( + vp, + get_depth(vp->parsed), + vp->rx_header_size, + MAX_IOV_SIZE + ); + vp->rx_queue->queue_depth = get_depth(vp->parsed); + } else { + vp->header_rxbuffer = kmalloc( + vp->rx_header_size, + GFP_KERNEL + ); + if (vp->header_rxbuffer == NULL) + goto out_close; + } + if ((vp->options & VECTOR_TX) > 0) { + vp->tx_queue = create_queue( + vp, + get_depth(vp->parsed), + vp->header_size, + MAX_IOV_SIZE + ); + } else { + vp->header_txbuffer = kmalloc(vp->header_size, GFP_KERNEL); + if (vp->header_txbuffer == NULL) + goto out_close; + } + + /* READ IRQ */ + err = 
um_request_irq( + irq_rr + VECTOR_BASE_IRQ, vp->fds->rx_fd, + IRQ_READ, vector_rx_interrupt, + IRQF_SHARED, dev->name, dev); + if (err != 0) { + netdev_err(dev, "vector_open: failed to get rx irq(%d)\n", err); + err = -ENETUNREACH; + goto out_close; + } + vp->rx_irq = irq_rr + VECTOR_BASE_IRQ; + dev->irq = irq_rr + VECTOR_BASE_IRQ; + irq_rr = (irq_rr + 1) % VECTOR_IRQ_SPACE; + + /* WRITE IRQ - we need it only if we have vector TX */ + if ((vp->options & VECTOR_TX) > 0) { + err = um_request_irq( + irq_rr + VECTOR_BASE_IRQ, vp->fds->tx_fd, + IRQ_WRITE, vector_tx_interrupt, + IRQF_SHARED, dev->name, dev); + if (err != 0) { + netdev_err(dev, + "vector_open: failed to get tx irq(%d)\n", err); + err = -ENETUNREACH; + goto out_close; + } + vp->tx_irq = irq_rr + VECTOR_BASE_IRQ; + irq_rr = (irq_rr + 1) % VECTOR_IRQ_SPACE; + } + + if ((vp->options & VECTOR_QDISC_BYPASS) != 0) { + if (!uml_raw_enable_qdisc_bypass(vp->fds->rx_fd)) + vp->options = vp->options | VECTOR_BPF; + } + + if ((vp->options & VECTOR_BPF) != 0) + vp->bpf = uml_vector_default_bpf(vp->fds->rx_fd, dev->dev_addr); + + netif_start_queue(dev); + + /* clear buffer - it can happen that the host side of the interface + * is full when we get here. In this case, new data is never queued, + * SIGIOs never arrive, and the net never works. 
+ */ + + vector_rx(vp); + + vector_reset_stats(vp); + vdevice = find_device(vp->unit); + vdevice->opened = 1; + + if ((vp->options & VECTOR_TX) != 0) + add_timer(&vp->tl); + return 0; +out_close: + vector_net_close(dev); + return err; +} + + +static void vector_net_set_multicast_list(struct net_device *dev) +{ + /* TODO: - we can do some BPF games here */ + return; +} + +static void vector_net_tx_timeout(struct net_device *dev) +{ + struct vector_private *vp = netdev_priv(dev); + + vp->estats.tx_timeout_count++; + netif_trans_update(dev); + schedule_work(&vp->reset_tx); +} + +static netdev_features_t vector_fix_features(struct net_device *dev, + netdev_features_t features) +{ + features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); + return features; +} + +static int vector_set_features(struct net_device *dev, + netdev_features_t features) +{ + struct vector_private *vp = netdev_priv(dev); + /* Adjust buffer sizes for GSO/GRO. Unfortunately, there is + * no way to negotiate it on raw sockets, so we can change + * only our side. 
+	 */
+	if (features & NETIF_F_GRO)
+		/* All new frame buffers will be GRO-sized */
+		vp->req_size = 65536;
+	else
+		/* All new frame buffers will be normal sized */
+		vp->req_size = vp->max_packet + vp->headroom + SAFETY_MARGIN;
+	return 0;
+}
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+/* Poll hook: invoke the RX interrupt handler directly while the
+ * device IRQ is disabled.
+ */
+static void vector_net_poll_controller(struct net_device *dev)
+{
+	disable_irq(dev->irq);
+	vector_rx_interrupt(dev->irq, dev);
+	enable_irq(dev->irq);
+}
+#endif
+
+/* ethtool: report the driver name and version strings */
+static void vector_net_get_drvinfo(struct net_device *dev,
+				struct ethtool_drvinfo *info)
+{
+	strlcpy(info->driver, DRIVER_NAME, sizeof(info->driver));
+	strlcpy(info->version, DRIVER_VERSION, sizeof(info->version));
+}
+
+/* ethtool: report ring sizes. The current and the maximum value are
+ * both the max_depth of the respective queue. vector_net_open() only
+ * creates a queue for a direction that has VECTOR_RX / VECTOR_TX set,
+ * so the queue pointer may be NULL here; report 0 for such a
+ * direction instead of dereferencing it.
+ */
+static void vector_get_ringparam(struct net_device *netdev,
+				struct ethtool_ringparam *ring)
+{
+	struct vector_private *vp = netdev_priv(netdev);
+	int rx_depth = 0;
+	int tx_depth = 0;
+
+	if (vp->rx_queue != NULL)
+		rx_depth = vp->rx_queue->max_depth;
+	if (vp->tx_queue != NULL)
+		tx_depth = vp->tx_queue->max_depth;
+
+	ring->rx_max_pending = rx_depth;
+	ring->tx_max_pending = tx_depth;
+	ring->rx_pending = rx_depth;
+	ring->tx_pending = tx_depth;
+}
+
+/* ethtool: copy out the string set selected by stringset.
+ * ETH_SS_TEST yields an empty string, ETH_SS_STATS the whole
+ * ethtool_stats_keys table; any other set triggers a WARN.
+ */
+static void vector_get_strings(struct net_device *dev, u32 stringset, u8 *buf)
+{
+	switch (stringset) {
+	case ETH_SS_TEST:
+		*buf = '\0';
+		break;
+	case ETH_SS_STATS:
+		memcpy(buf, &ethtool_stats_keys, sizeof(ethtool_stats_keys));
+		break;
+	default:
+		WARN_ON(1);
+		break;
+	}
+}
+
+/* ethtool: number of entries in a string set: 0 for ETH_SS_TEST,
+ * VECTOR_NUM_STATS for ETH_SS_STATS, -EOPNOTSUPP for anything else.
+ */
+static int vector_get_sset_count(struct net_device *dev, int sset)
+{
+	switch (sset) {
+	case ETH_SS_TEST:
+		return 0;
+	case ETH_SS_STATS:
+		return VECTOR_NUM_STATS;
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+/* ethtool: copy the driver's struct vector_estats verbatim into the
+ * caller's u64 array.
+ */
+static void vector_get_ethtool_stats(struct net_device *dev,
+	struct ethtool_stats *estats,
+	u64 *tmp_stats)
+{
+	struct vector_private *vp = netdev_priv(dev);
+
+	memcpy(tmp_stats, &vp->estats, sizeof(struct vector_estats));
+}
+
+/* ethtool: report the TX coalesce interval in microseconds;
+ * vp->coalesce is scaled by 1000000 / HZ, i.e. it is kept in ticks.
+ */
+static int vector_get_coalesce(struct net_device *netdev,
+					struct ethtool_coalesce *ec)
+{
+	struct vector_private *vp = netdev_priv(netdev);
+
+	ec->tx_coalesce_usecs = (vp->coalesce * 1000000) / HZ;
+	return 0;
+}
+
+static int
vector_set_coalesce(struct net_device *netdev, + struct ethtool_coalesce *ec) +{ + struct vector_private *vp = netdev_priv(netdev); + + vp->coalesce = (ec->tx_coalesce_usecs * HZ) / 1000000; + if (vp->coalesce == 0) + vp->coalesce = 1; + return 0; +} + +static const struct ethtool_ops vector_net_ethtool_ops = { + .get_drvinfo = vector_net_get_drvinfo, + .get_link = ethtool_op_get_link, + .get_ts_info = ethtool_op_get_ts_info, + .get_ringparam = vector_get_ringparam, + .get_strings = vector_get_strings, + .get_sset_count = vector_get_sset_count, + .get_ethtool_stats = vector_get_ethtool_stats, + .get_coalesce = vector_get_coalesce, + .set_coalesce = vector_set_coalesce, +}; + + +static const struct net_device_ops vector_netdev_ops = { + .ndo_open = vector_net_open, + .ndo_stop = vector_net_close, + .ndo_start_xmit = vector_net_start_xmit, + .ndo_set_rx_mode = vector_net_set_multicast_list, + .ndo_tx_timeout = vector_net_tx_timeout, + .ndo_set_mac_address = eth_mac_addr, + .ndo_validate_addr = eth_validate_addr, + .ndo_fix_features = vector_fix_features, + .ndo_set_features = vector_set_features, +#ifdef CONFIG_NET_POLL_CONTROLLER + .ndo_poll_controller = vector_net_poll_controller, +#endif +}; + + +static void vector_timer_expire(struct timer_list *t) +{ + struct vector_private *vp = from_timer(vp, t, tl); + + vp->estats.tx_kicks++; + vector_send(vp->tx_queue); +} + +static void vector_eth_configure( + int n, + struct arglist *def + ) +{ + struct vector_device *device; + struct net_device *dev; + struct vector_private *vp; + int err; + + device = kzalloc(sizeof(*device), GFP_KERNEL); + if (device == NULL) { + printk(KERN_ERR "eth_configure failed to allocate struct " + "vector_device\n"); + return; + } + dev = alloc_etherdev(sizeof(struct vector_private)); + if (dev == NULL) { + printk(KERN_ERR "eth_configure: failed to allocate struct " + "net_device for vec%d\n", n); + goto out_free_device; + } + + dev->mtu = get_mtu(def); + + INIT_LIST_HEAD(&device->list); + 
device->unit = n; + + /* If this name ends up conflicting with an existing registered + * netdevice, that is OK, register_netdev{,ice}() will notice this + * and fail. + */ + snprintf(dev->name, sizeof(dev->name), "vec%d", n); + uml_net_setup_etheraddr(dev, uml_vector_fetch_arg(def, "mac")); + vp = netdev_priv(dev); + + /* sysfs register */ + if (!driver_registered) { + platform_driver_register(&uml_net_driver); + driver_registered = 1; + } + device->pdev.id = n; + device->pdev.name = DRIVER_NAME; + device->pdev.dev.release = vector_device_release; + dev_set_drvdata(&device->pdev.dev, device); + if (platform_device_register(&device->pdev)) + goto out_free_netdev; + SET_NETDEV_DEV(dev, &device->pdev.dev); + + device->dev = dev; + + *vp = ((struct vector_private) + { + .list = LIST_HEAD_INIT(vp->list), + .dev = dev, + .unit = n, + .options = get_transport_options(def), + .rx_irq = 0, + .tx_irq = 0, + .parsed = def, + .max_packet = get_mtu(def) + ETH_HEADER_OTHER, + /* TODO - we need to calculate headroom so that ip header + * is 16 byte aligned all the time + */ + .headroom = get_headroom(def), + .form_header = NULL, + .verify_header = NULL, + .header_rxbuffer = NULL, + .header_txbuffer = NULL, + .header_size = 0, + .rx_header_size = 0, + .rexmit_scheduled = false, + .opened = false, + .transport_data = NULL, + .in_write_poll = false, + .coalesce = 2, + .req_size = get_req_size(def) + }); + + dev->features = dev->hw_features = (NETIF_F_SG | NETIF_F_FRAGLIST); + tasklet_init(&vp->tx_poll, vector_tx_poll, (unsigned long)vp); + INIT_WORK(&vp->reset_tx, vector_reset_tx); + + timer_setup(&vp->tl, vector_timer_expire, 0); + spin_lock_init(&vp->lock); + + /* FIXME */ + dev->netdev_ops = &vector_netdev_ops; + dev->ethtool_ops = &vector_net_ethtool_ops; + dev->watchdog_timeo = (HZ >> 1); + /* primary IRQ - fixme */ + dev->irq = 0; /* we will adjust this once opened */ + + rtnl_lock(); + err = register_netdevice(dev); + rtnl_unlock(); + if (err) + goto out_undo_user_init; + + 
spin_lock(&vector_devices_lock); + list_add(&device->list, &vector_devices); + spin_unlock(&vector_devices_lock); + + return; + +out_undo_user_init: + return; +out_free_netdev: + free_netdev(dev); +out_free_device: + kfree(device); +} + + + + +/* + * Invoked late in the init + */ + +static int __init vector_init(void) +{ + struct list_head *ele; + struct vector_cmd_line_arg *def; + struct arglist *parsed; + + list_for_each(ele, &vec_cmd_line) { + def = list_entry(ele, struct vector_cmd_line_arg, list); + parsed = uml_parse_vector_ifspec(def->arguments); + if (parsed != NULL) + vector_eth_configure(def->unit, parsed); + } + return 0; +} + + +/* Invoked at initial argument parsing, only stores + * arguments until a proper vector_init is called + * later + */ + +static int __init vector_setup(char *str) +{ + char *error; + int n, err; + struct vector_cmd_line_arg *new; + + err = vector_parse(str, &n, &str, &error); + if (err) { + printk(KERN_ERR "vector_setup - Couldn't parse '%s' : %s\n", + str, error); + return 1; + } + new = alloc_bootmem(sizeof(*new)); + INIT_LIST_HEAD(&new->list); + new->unit = n; + new->arguments = str; + list_add_tail(&new->list, &vec_cmd_line); + return 1; +} + +__setup("vec", vector_setup); +__uml_help(vector_setup, +"vec[0-9]+:<option>=<value>,<option>=<value>\n" +" Configure a vector io network device.\n\n" +); + +late_initcall(vector_init); + +static struct mc_device vector_mc = { + .list = LIST_HEAD_INIT(vector_mc.list), + .name = "vec", + .config = vector_config, + .get_config = NULL, + .id = vector_id, + .remove = vector_remove, +}; + +#ifdef CONFIG_INET +static int vector_inetaddr_event( + struct notifier_block *this, + unsigned long event, + void *ptr) +{ + return NOTIFY_DONE; +} + +static struct notifier_block vector_inetaddr_notifier = { + .notifier_call = vector_inetaddr_event, +}; + +static void inet_register(void) +{ + register_inetaddr_notifier(&vector_inetaddr_notifier); +} +#else +static inline void inet_register(void) +{ +} 
+#endif
+
+/* Initcall: register the "vec" device type with the mconsole and
+ * hook up the inetaddr notifier (a no-op without CONFIG_INET).
+ * Always returns 0.
+ */
+static int vector_net_init(void)
+{
+	mconsole_register_dev(&vector_mc);
+	inet_register();
+	return 0;
+}
+
+__initcall(vector_net_init);
+
+
+
diff --git a/arch/um/drivers/vector_kern.h b/arch/um/drivers/vector_kern.h
new file mode 100644
index 000000000000..0b0a767b9076
--- /dev/null
+++ b/arch/um/drivers/vector_kern.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright (C) 2002 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Licensed under the GPL
+ */
+
+#ifndef __UM_VECTOR_KERN_H
+#define __UM_VECTOR_KERN_H
+
+#include <linux/netdevice.h>
+#include <linux/platform_device.h>
+#include <linux/skbuff.h>
+#include <linux/socket.h>
+#include <linux/list.h>
+#include <linux/ctype.h>
+#include <linux/workqueue.h>
+#include <linux/interrupt.h>
+#include "vector_user.h"
+
+/* Queue structure specially adapted for multiple enqueue/dequeue
+ * in a mmsgrecv/mmsgsend context
+ */
+
+/* Dequeue method */
+
+#define QUEUE_SENDMSG 0
+#define QUEUE_SENDMMSG 1
+
+/* Bit flags kept in vector_private.options. VECTOR_RX / VECTOR_TX
+ * select the vector (queue based) path for that direction; the
+ * driver falls back to its packet-at-a-time path when the bit is
+ * clear. VECTOR_QDISC_BYPASS and VECTOR_BPF are acted on when the
+ * device is opened.
+ */
+#define VECTOR_RX 1
+#define VECTOR_TX (1 << 1)
+#define VECTOR_BPF (1 << 2)
+#define VECTOR_QDISC_BYPASS (1 << 3)
+
+#define ETH_MAX_PACKET 1500
+/* Added on top of the MTU to obtain vector_private.max_packet */
+#define ETH_HEADER_OTHER 32 /* just in case someone decides to go mad on QnQ */
+
+struct vector_queue {
+	/* one mmsghdr per queue slot */
+	struct mmsghdr *mmsg_vector;
+	/* per-slot buffer pointers; the RX path treats them as skbs */
+	void **skbuff_vector;
+	/* backlink to device which owns us */
+	struct net_device *dev;
+	/* NOTE(review): presumably head_lock guards head and tail_lock
+	 * guards tail - the queue code is not part of this excerpt
+	 */
+	spinlock_t head_lock;
+	spinlock_t tail_lock;
+	/* queue_depth is the current fill level, max_depth the capacity;
+	 * TX is stopped and flushed once queue_depth reaches
+	 * max_depth - 1
+	 */
+	int queue_depth, head, tail, max_depth, max_iov_frags;
+	short options;
+};
+
+/* Driver specific counters exported through ethtool. The structure
+ * is copied verbatim into the ethtool u64 array, so it must contain
+ * nothing but uint64_t members.
+ * NOTE(review): the member order presumably has to match
+ * ethtool_stats_keys - confirm against that table.
+ */
+struct vector_estats {
+	/* largest number of packets received in one vector RX pass */
+	uint64_t rx_queue_max;
+	/* (previous average + packets in this pass) / 2 */
+	uint64_t rx_queue_running_average;
+	uint64_t tx_queue_max;
+	uint64_t tx_queue_running_average;
+	uint64_t rx_encaps_errors;
+	/* bumped by the ndo_tx_timeout handler */
+	uint64_t tx_timeout_count;
+	uint64_t tx_restart_queue;
+	/* bumped each time the driver flushes the TX queue itself */
+	uint64_t tx_kicks;
+	uint64_t tx_flow_control_xon;
+	uint64_t tx_flow_control_xoff;
+	uint64_t rx_csum_offload_good;
+	uint64_t rx_csum_offload_errors;
+	uint64_t sg_ok;
+	uint64_t sg_linearized;
+};
+
+/* Result codes for the verify_header() callbacks. The transports
+ * return the bare numbers: -1 on a header mismatch, 0 on success and
+ * 1 when the virtio header flags the data as already validated.
+ */
+#define VERIFY_HEADER_NOK -1
+#define VERIFY_HEADER_OK 0
+#define VERIFY_CSUM_OK 1
+
+/* Per-device private state, stored in the netdev private area
+ * (obtained with netdev_priv()).
+ */
+struct vector_private {
+	struct list_head list;
+	/* taken around updates of the opened flag in open/close */
+	spinlock_t lock;
+	/* backlink to the owning net_device */
+	struct net_device *dev;
+
+	/* device number, the N in "vecN" */
+	int unit;
+
+	/* Timeout timer in TX - its expiry handler flushes the TX queue */
+
+	struct timer_list tl;
+
+	/* Work item scheduled from the TX timeout handler; it resets
+	 * the BQL state and restarts the TX queue
+	 */
+	struct work_struct reset_tx;
+	/* rx/tx descriptors and remote address, allocated on open and
+	 * released on close
+	 */
+	struct vector_fds *fds;
+
+	/* Only created for a direction which has VECTOR_RX / VECTOR_TX
+	 * set in options; NULL otherwise
+	 */
+	struct vector_queue *rx_queue;
+	struct vector_queue *tx_queue;
+
+	/* 0 while no IRQ is registered */
+	int rx_irq;
+	int tx_irq;
+
+	/* parsed command line options of this device */
+	struct arglist *parsed;
+
+	void *transport_data; /* transport specific params if needed */
+
+	/* MTU plus ETH_HEADER_OTHER */
+	int max_packet;
+	int req_size; /* different from max packet - used for TSO */
+	int headroom;
+
+	/* bitmask of VECTOR_RX, VECTOR_TX, VECTOR_BPF, VECTOR_QDISC_BYPASS */
+	int options;
+
+	/* NOTE(review): there is no remote address member here - it is
+	 * kept in fds->remote_addr and some transports leave it NULL.
+	 * header_size / rx_header_size are the encapsulation header
+	 * sizes for TX / RX as set up by the transport; the GRE and
+	 * L2TPv3 transports add sizeof(struct iphdr) on the RX side for
+	 * raw IPv4. coalesce is the TX coalesce interval in ticks.
+	 */
+
+	int header_size;
+	int rx_header_size;
+	int coalesce;
+
+	/* Scratch header buffers, allocated on open only for a
+	 * direction that does not use a vector queue
+	 */
+	void *header_rxbuffer;
+	void *header_txbuffer;
+
+	/* Transport callbacks, NULL when the transport has no header.
+	 * form_header() fills in the encapsulation header for a frame
+	 * to be sent; verify_header() checks a received one and returns
+	 * a negative value on mismatch.
+	 */
+	int (*form_header)(uint8_t *header,
+		struct sk_buff *skb, struct vector_private *vp);
+	int (*verify_header)(uint8_t *header,
+		struct sk_buff *skb, struct vector_private *vp);
+
+	/* NOTE(review): no initialisation or user of stats_lock is
+	 * visible in this part of the driver - confirm it is needed
+	 */
+	spinlock_t stats_lock;
+
+	/* tasklet which flushes the TX queue */
+	struct tasklet_struct tx_poll;
+	bool rexmit_scheduled;
+	/* set for the whole time between open and close */
+	bool opened;
+	/* the TX interrupt schedules tx_poll only while this is set */
+	bool in_write_poll;
+
+	/* ethtool stats */
+
+	struct vector_estats estats;
+	/* filter returned by uml_vector_default_bpf(), freed on close */
+	void *bpf;
+
+	/* zero length trailing array; must stay the last member */
+	char user[0];
+};
+
+extern int build_transport_data(struct vector_private *vp);
+
+#endif
diff --git a/arch/um/drivers/vector_transports.c b/arch/um/drivers/vector_transports.c
new file mode 100644
index 000000000000..9065047f844b
--- /dev/null
+++ b/arch/um/drivers/vector_transports.c
@@ -0,0 +1,458 @@
+/*
+ * Copyright (C) 2017 - Cambridge Greys Limited
+ * Copyright (C) 2011 - 2014 Cisco Systems Inc
+ * Licensed under the GPL.
+ */ + +#include <linux/etherdevice.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <asm/byteorder.h> +#include <uapi/linux/ip.h> +#include <uapi/linux/virtio_net.h> +#include <linux/virtio_net.h> +#include <linux/virtio_byteorder.h> +#include <linux/netdev_features.h> +#include "vector_user.h" +#include "vector_kern.h" + +#define GOOD_LINEAR 512 +#define GSO_ERROR "Incoming GSO frames and GRO disabled on the interface" + +struct gre_minimal_header { + uint16_t header; + uint16_t arptype; +}; + + +struct uml_gre_data { + uint32_t rx_key; + uint32_t tx_key; + uint32_t sequence; + + bool ipv6; + bool has_sequence; + bool pin_sequence; + bool checksum; + bool key; + struct gre_minimal_header expected_header; + + uint32_t checksum_offset; + uint32_t key_offset; + uint32_t sequence_offset; + +}; + +struct uml_l2tpv3_data { + uint64_t rx_cookie; + uint64_t tx_cookie; + uint64_t rx_session; + uint64_t tx_session; + uint32_t counter; + + bool udp; + bool ipv6; + bool has_counter; + bool pin_counter; + bool cookie; + bool cookie_is_64; + + uint32_t cookie_offset; + uint32_t session_offset; + uint32_t counter_offset; +}; + +static int l2tpv3_form_header(uint8_t *header, + struct sk_buff *skb, struct vector_private *vp) +{ + struct uml_l2tpv3_data *td = vp->transport_data; + uint32_t *counter; + + if (td->udp) + *(uint32_t *) header = cpu_to_be32(L2TPV3_DATA_PACKET); + (*(uint32_t *) (header + td->session_offset)) = td->tx_session; + + if (td->cookie) { + if (td->cookie_is_64) + (*(uint64_t *)(header + td->cookie_offset)) = + td->tx_cookie; + else + (*(uint32_t *)(header + td->cookie_offset)) = + td->tx_cookie; + } + if (td->has_counter) { + counter = (uint32_t *)(header + td->counter_offset); + if (td->pin_counter) { + *counter = 0; + } else { + td->counter++; + *counter = cpu_to_be32(td->counter); + } + } + return 0; +} + +static int gre_form_header(uint8_t *header, + struct sk_buff *skb, struct vector_private *vp) +{ + struct 
uml_gre_data *td = vp->transport_data; + uint32_t *sequence; + *((uint32_t *) header) = *((uint32_t *) &td->expected_header); + if (td->key) + (*(uint32_t *) (header + td->key_offset)) = td->tx_key; + if (td->has_sequence) { + sequence = (uint32_t *)(header + td->sequence_offset); + if (td->pin_sequence) + *sequence = 0; + else + *sequence = cpu_to_be32(++td->sequence); + } + return 0; +} + +static int raw_form_header(uint8_t *header, + struct sk_buff *skb, struct vector_private *vp) +{ + struct virtio_net_hdr *vheader = (struct virtio_net_hdr *) header; + + virtio_net_hdr_from_skb( + skb, + vheader, + virtio_legacy_is_little_endian(), + false + ); + + return 0; +} + +static int l2tpv3_verify_header( + uint8_t *header, struct sk_buff *skb, struct vector_private *vp) +{ + struct uml_l2tpv3_data *td = vp->transport_data; + uint32_t *session; + uint64_t cookie; + + if ((!td->udp) && (!td->ipv6)) + header += sizeof(struct iphdr) /* fix for ipv4 raw */; + + /* we do not do a strict check for "data" packets as per + * the RFC spec because the pure IP spec does not have + * that anyway. 
+	 */
+
+	if (td->cookie) {
+		if (td->cookie_is_64)
+			cookie = *(uint64_t *)(header + td->cookie_offset);
+		else
+			cookie = *(uint32_t *)(header + td->cookie_offset);
+		if (cookie != td->rx_cookie) {
+			if (net_ratelimit())
+				netdev_err(vp->dev, "uml_l2tpv3: unknown cookie id");
+			return -1;
+		}
+	}
+	session = (uint32_t *) (header + td->session_offset);
+	if (*session != td->rx_session) {
+		if (net_ratelimit())
+			netdev_err(vp->dev, "uml_l2tpv3: session mismatch");
+		return -1;
+	}
+	return 0;
+}
+
+/* Check the GRE header of a received frame.
+ *
+ * For IPv4 the buffer starts with the IP header, which is skipped
+ * first. The leading 32 bits must equal td->expected_header and, if
+ * keys are in use, the key field must equal td->rx_key.
+ *
+ * Returns 0 if the header is acceptable, -1 (with a rate limited
+ * log message) if it is not.
+ */
+static int gre_verify_header(
+	uint8_t *header, struct sk_buff *skb, struct vector_private *vp)
+{
+
+	uint32_t key;
+	struct uml_gre_data *td = vp->transport_data;
+
+	if (!td->ipv6)
+		header += sizeof(struct iphdr) /* fix for ipv4 raw */;
+
+	if (*((uint32_t *) header) != *((uint32_t *) &td->expected_header)) {
+		if (net_ratelimit())
+			netdev_err(vp->dev, "header type disagreement, expecting %0x, got %0x",
+				*((uint32_t *) &td->expected_header),
+				*((uint32_t *) header)
+			);
+		return -1;
+	}
+
+	if (td->key) {
+		key = (*(uint32_t *)(header + td->key_offset));
+		if (key != td->rx_key) {
+			if (net_ratelimit())
+				netdev_err(vp->dev, "unknown key id %0x, expecting %0x",
+						key, td->rx_key);
+			return -1;
+		}
+	}
+	return 0;
+}
+
+/* Process the virtio_net header in front of a raw/tap frame.
+ *
+ * A GSO frame arriving while the buffers are not GRO sized
+ * (req_size != 65536) is only reported, not rejected.
+ *
+ * Returns 1 if the header carries VIRTIO_NET_HDR_F_DATA_VALID;
+ * otherwise the header is applied to the skb with
+ * virtio_net_hdr_to_skb() and 0 is returned.
+ */
+static int raw_verify_header(
+	uint8_t *header, struct sk_buff *skb, struct vector_private *vp)
+{
+	struct virtio_net_hdr *vheader = (struct virtio_net_hdr *) header;
+
+	if ((vheader->gso_type != VIRTIO_NET_HDR_GSO_NONE) &&
+		(vp->req_size != 65536)) {
+		if (net_ratelimit())
+			netdev_err(
+				vp->dev,
+				GSO_ERROR
+			);
+	}
+	if ((vheader->flags & VIRTIO_NET_HDR_F_DATA_VALID) > 0)
+		return 1;
+
+	virtio_net_hdr_to_skb(skb, vheader, virtio_legacy_is_little_endian());
+	return 0;
+}
+
+/* Look up option param in def and parse it as an unsigned int.
+ *
+ * Returns true and stores the value in *result only if the option is
+ * present and parses cleanly; otherwise returns false.
+ *
+ * The parser matches the type of result (kstrtouint), so the full
+ * 32 bit range needed for keys and session ids is accepted and
+ * negative input is rejected instead of being stored through a
+ * mismatched pointer.
+ */
+static bool get_uint_param(
+	struct arglist *def, char *param, unsigned int *result)
+{
+	char *arg = uml_vector_fetch_arg(def, param);
+
+	if (arg == NULL)
+		return false;
+	return kstrtouint(arg, 0, result) == 0;
+}
+
+/* Look up option param in def and parse it as an unsigned long.
+ *
+ * Returns true and stores the value in *result only if the option is
+ * present and parses cleanly. A malformed value yields false, so the
+ * caller never consumes an unset *result.
+ */
+static bool get_ulong_param(
+	struct arglist *def, char *param, unsigned long *result)
+{
+	char *arg = uml_vector_fetch_arg(def, param);
+
+	if (arg == NULL)
+		return false;
+	return kstrtoul(arg, 0, result) == 0;
+}
+
+/* Allocate and fill in the GRE transport state from the parsed
+ * command line (options v6, rx_key, tx_key, sequence, pin_sequence)
+ * and install the GRE header callbacks.
+ *
+ * Returns 0 on success, -ENOMEM if the allocation fails and -EINVAL
+ * if rx_key is given without tx_key. On -EINVAL vp->transport_data
+ * stays allocated and is left for the caller to release.
+ */
+static int build_gre_transport_data(struct vector_private *vp)
+{
+	struct uml_gre_data *td;
+	unsigned int temp_int;
+	unsigned int temp_rx;
+	unsigned int temp_tx;
+
+	/* zeroed allocation - no flag is ever read uninitialised */
+	vp->transport_data = kzalloc(sizeof(struct uml_gre_data), GFP_KERNEL);
+	if (vp->transport_data == NULL)
+		return -ENOMEM;
+	td = vp->transport_data;
+	td->sequence = 0;
+
+	td->expected_header.arptype = GRE_IRB;
+	td->expected_header.header = 0;
+
+	vp->form_header = &gre_form_header;
+	vp->verify_header = &gre_verify_header;
+	vp->header_size = 4;
+	td->key_offset = 4;
+	td->sequence_offset = 4;
+	td->checksum_offset = 4;
+
+	td->ipv6 = false;
+	if (get_uint_param(vp->parsed, "v6", &temp_int)) {
+		if (temp_int > 0)
+			td->ipv6 = true;
+	}
+	td->key = false;
+	if (get_uint_param(vp->parsed, "rx_key", &temp_rx)) {
+		if (get_uint_param(vp->parsed, "tx_key", &temp_tx)) {
+			td->key = true;
+			td->expected_header.header |= GRE_MODE_KEY;
+			td->rx_key = cpu_to_be32(temp_rx);
+			td->tx_key = cpu_to_be32(temp_tx);
+			vp->header_size += 4;
+			/* the sequence number follows the key */
+			td->sequence_offset += 4;
+		} else {
+			return -EINVAL;
+		}
+	}
+
+	/* sequence numbers are off unless requested; this has to clear
+	 * the has_sequence flag, not the sequence counter
+	 */
+	td->has_sequence = false;
+	td->pin_sequence = false;
+	if (get_uint_param(vp->parsed, "sequence", &temp_int)) {
+		if (temp_int > 0) {
+			vp->header_size += 4;
+			td->has_sequence = true;
+			td->expected_header.header |= GRE_MODE_SEQUENCE;
+			if (get_uint_param(
+				vp->parsed, "pin_sequence", &temp_int)) {
+				if (temp_int > 0)
+					td->pin_sequence = true;
+			}
+		}
+	}
+	vp->rx_header_size = vp->header_size;
+	/* raw IPv4 sockets hand us the IP header as well */
+	if (!td->ipv6)
+		vp->rx_header_size += sizeof(struct iphdr);
+	return 0;
+}
+
+static int build_l2tpv3_transport_data(struct vector_private *vp)
+{
+
+	struct uml_l2tpv3_data *td;
+	int temp_int, temp_rxs, temp_txs;
+	unsigned long temp_rx;
+	unsigned long temp_tx;
+
+	/* zeroed, because udp and pin_counter are only ever set to
+	 * true below and must start out false
+	 */
+	vp->transport_data = kzalloc(
+		sizeof(struct uml_l2tpv3_data), GFP_KERNEL);
+
+	
if (vp->transport_data == NULL) + return -ENOMEM; + + td = vp->transport_data; + + vp->form_header = &l2tpv3_form_header; + vp->verify_header = &l2tpv3_verify_header; + td->counter = 0; + + vp->header_size = 4; + td->session_offset = 0; + td->cookie_offset = 4; + td->counter_offset = 4; + + + td->ipv6 = false; + if (get_uint_param(vp->parsed, "v6", &temp_int)) { + if (temp_int > 0) + td->ipv6 = true; + } + + if (get_uint_param(vp->parsed, "rx_session", &temp_rxs)) { + if (get_uint_param(vp->parsed, "tx_session", &temp_txs)) { + td->tx_session = cpu_to_be32(temp_txs); + td->rx_session = cpu_to_be32(temp_rxs); + } else { + return -EINVAL; + } + } else { + return -EINVAL; + } + + td->cookie_is_64 = false; + if (get_uint_param(vp->parsed, "cookie64", &temp_int)) { + if (temp_int > 0) + td->cookie_is_64 = true; + } + td->cookie = false; + if (get_ulong_param(vp->parsed, "rx_cookie", &temp_rx)) { + if (get_ulong_param(vp->parsed, "tx_cookie", &temp_tx)) { + td->cookie = true; + if (td->cookie_is_64) { + td->rx_cookie = cpu_to_be64(temp_rx); + td->tx_cookie = cpu_to_be64(temp_tx); + vp->header_size += 8; + td->counter_offset += 8; + } else { + td->rx_cookie = cpu_to_be32(temp_rx); + td->tx_cookie = cpu_to_be32(temp_tx); + vp->header_size += 4; + td->counter_offset += 4; + } + } else { + return -EINVAL; + } + } + + td->has_counter = false; + if (get_uint_param(vp->parsed, "counter", &temp_int)) { + if (temp_int > 0) { + td->has_counter = true; + vp->header_size += 4; + if (get_uint_param( + vp->parsed, "pin_counter", &temp_int)) { + if (temp_int > 0) + td->pin_counter = true; + } + } + } + + if (get_uint_param(vp->parsed, "udp", &temp_int)) { + if (temp_int > 0) { + td->udp = true; + vp->header_size += 4; + td->counter_offset += 4; + td->session_offset += 4; + td->cookie_offset += 4; + } + } + + vp->rx_header_size = vp->header_size; + if ((!td->ipv6) && (!td->udp)) + vp->rx_header_size += sizeof(struct iphdr); + + return 0; +} + +static int build_raw_transport_data(struct 
vector_private *vp) +{ + if (uml_raw_enable_vnet_headers(vp->fds->rx_fd)) { + if (!uml_raw_enable_vnet_headers(vp->fds->tx_fd)) + return -1; + vp->form_header = &raw_form_header; + vp->verify_header = &raw_verify_header; + vp->header_size = sizeof(struct virtio_net_hdr); + vp->rx_header_size = sizeof(struct virtio_net_hdr); + vp->dev->hw_features |= (NETIF_F_TSO | NETIF_F_GRO); + vp->dev->features |= + (NETIF_F_RXCSUM | NETIF_F_HW_CSUM | + NETIF_F_TSO | NETIF_F_GRO); + netdev_info( + vp->dev, + "raw: using vnet headers for tso and tx/rx checksum" + ); + } + return 0; +} + +static int build_tap_transport_data(struct vector_private *vp) +{ + if (uml_raw_enable_vnet_headers(vp->fds->rx_fd)) { + vp->form_header = &raw_form_header; + vp->verify_header = &raw_verify_header; + vp->header_size = sizeof(struct virtio_net_hdr); + vp->rx_header_size = sizeof(struct virtio_net_hdr); + vp->dev->hw_features |= + (NETIF_F_TSO | NETIF_F_GSO | NETIF_F_GRO); + vp->dev->features |= + (NETIF_F_RXCSUM | NETIF_F_HW_CSUM | + NETIF_F_TSO | NETIF_F_GSO | NETIF_F_GRO); + netdev_info( + vp->dev, + "tap/raw: using vnet headers for tso and tx/rx checksum" + ); + } else { + return 0; /* do not try to enable tap too if raw failed */ + } + if (uml_tap_enable_vnet_headers(vp->fds->tx_fd)) + return 0; + return -1; +} + +int build_transport_data(struct vector_private *vp) +{ + char *transport = uml_vector_fetch_arg(vp->parsed, "transport"); + + if (strncmp(transport, TRANS_GRE, TRANS_GRE_LEN) == 0) + return build_gre_transport_data(vp); + if (strncmp(transport, TRANS_L2TPV3, TRANS_L2TPV3_LEN) == 0) + return build_l2tpv3_transport_data(vp); + if (strncmp(transport, TRANS_RAW, TRANS_RAW_LEN) == 0) + return build_raw_transport_data(vp); + if (strncmp(transport, TRANS_TAP, TRANS_TAP_LEN) == 0) + return build_tap_transport_data(vp); + return 0; +} + diff --git a/arch/um/drivers/vector_user.c b/arch/um/drivers/vector_user.c new file mode 100644 index 000000000000..4d6a78e31089 --- /dev/null +++ 
b/arch/um/drivers/vector_user.c @@ -0,0 +1,590 @@ +/* + * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Licensed under the GPL + */ + +#include <stdio.h> +#include <unistd.h> +#include <stdarg.h> +#include <errno.h> +#include <stddef.h> +#include <string.h> +#include <sys/ioctl.h> +#include <net/if.h> +#include <linux/if_tun.h> +#include <arpa/inet.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <net/ethernet.h> +#include <netinet/ip.h> +#include <netinet/ether.h> +#include <linux/if_ether.h> +#include <linux/if_packet.h> +#include <sys/socket.h> +#include <sys/wait.h> +#include <linux/virtio_net.h> +#include <netdb.h> +#include <stdlib.h> +#include <os.h> +#include <um_malloc.h> +#include "vector_user.h" + +#define ID_GRE 0 +#define ID_L2TPV3 1 +#define ID_MAX 1 + +#define TOKEN_IFNAME "ifname" + +#define TRANS_RAW "raw" +#define TRANS_RAW_LEN strlen(TRANS_RAW) + +#define VNET_HDR_FAIL "could not enable vnet headers on fd %d" +#define TUN_GET_F_FAIL "tapraw: TUNGETFEATURES failed: %s" +#define L2TPV3_BIND_FAIL "l2tpv3_open : could not bind socket err=%i" +#define BPF_ATTACH_FAIL "Failed to attach filter size %d to %d, err %d\n" + +/* This is very ugly and brute force lookup, but it is done + * only once at initialization so not worth doing hashes or + * anything more intelligent + */ + +char *uml_vector_fetch_arg(struct arglist *ifspec, char *token) +{ + int i; + + for (i = 0; i < ifspec->numargs; i++) { + if (strcmp(ifspec->tokens[i], token) == 0) + return ifspec->values[i]; + } + return NULL; + +} + +struct arglist *uml_parse_vector_ifspec(char *arg) +{ + struct arglist *result; + int pos, len; + bool parsing_token = true, next_starts = true; + + if (arg == NULL) + return NULL; + result = uml_kmalloc(sizeof(struct arglist), UM_GFP_KERNEL); + if (result == NULL) + return NULL; + result->numargs = 0; + len = strlen(arg); + for (pos = 0; pos < len; pos++) { 
+ if (next_starts) { + if (parsing_token) { + result->tokens[result->numargs] = arg + pos; + } else { + result->values[result->numargs] = arg + pos; + result->numargs++; + } + next_starts = false; + } + if (*(arg + pos) == '=') { + if (parsing_token) + parsing_token = false; + else + goto cleanup; + next_starts = true; + (*(arg + pos)) = '\0'; + } + if (*(arg + pos) == ',') { + parsing_token = true; + next_starts = true; + (*(arg + pos)) = '\0'; + } + } + return result; +cleanup: + printk(UM_KERN_ERR "vector_setup - Couldn't parse '%s'\n", arg); + kfree(result); + return NULL; +} + +/* + * Socket/FD configuration functions. These return an structure + * of rx and tx descriptors to cover cases where these are not + * the same (f.e. read via raw socket and write via tap). + */ + +#define PATH_NET_TUN "/dev/net/tun" + +static struct vector_fds *user_init_tap_fds(struct arglist *ifspec) +{ + struct ifreq ifr; + int fd = -1; + struct sockaddr_ll sock; + int err = -ENOMEM, offload; + char *iface; + struct vector_fds *result = NULL; + + iface = uml_vector_fetch_arg(ifspec, TOKEN_IFNAME); + if (iface == NULL) { + printk(UM_KERN_ERR "uml_tap: failed to parse interface spec\n"); + goto tap_cleanup; + } + + result = uml_kmalloc(sizeof(struct vector_fds), UM_GFP_KERNEL); + if (result == NULL) { + printk(UM_KERN_ERR "uml_tap: failed to allocate file descriptors\n"); + goto tap_cleanup; + } + result->rx_fd = -1; + result->tx_fd = -1; + result->remote_addr = NULL; + result->remote_addr_size = 0; + + /* TAP */ + + fd = open(PATH_NET_TUN, O_RDWR); + if (fd < 0) { + printk(UM_KERN_ERR "uml_tap: failed to open tun device\n"); + goto tap_cleanup; + } + result->tx_fd = fd; + memset(&ifr, 0, sizeof(ifr)); + ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR; + strncpy((char *)&ifr.ifr_name, iface, sizeof(ifr.ifr_name) - 1); + + err = ioctl(fd, TUNSETIFF, (void *) &ifr); + if (err != 0) { + printk(UM_KERN_ERR "uml_tap: failed to select tap interface\n"); + goto tap_cleanup; + } + + 
offload = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6; + ioctl(fd, TUNSETOFFLOAD, offload); + + /* RAW */ + + fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); + if (fd == -1) { + printk(UM_KERN_ERR + "uml_tap: failed to create socket: %i\n", -errno); + goto tap_cleanup; + } + result->rx_fd = fd; + memset(&ifr, 0, sizeof(ifr)); + strncpy((char *)&ifr.ifr_name, iface, sizeof(ifr.ifr_name) - 1); + if (ioctl(fd, SIOCGIFINDEX, (void *) &ifr) < 0) { + printk(UM_KERN_ERR + "uml_tap: failed to set interface: %i\n", -errno); + goto tap_cleanup; + } + + sock.sll_family = AF_PACKET; + sock.sll_protocol = htons(ETH_P_ALL); + sock.sll_ifindex = ifr.ifr_ifindex; + + if (bind(fd, + (struct sockaddr *) &sock, sizeof(struct sockaddr_ll)) < 0) { + printk(UM_KERN_ERR + "user_init_tap: failed to bind raw pair, err %d\n", + -errno); + goto tap_cleanup; + } + return result; +tap_cleanup: + printk(UM_KERN_ERR "user_init_tap: init failed, error %d", err); + if (result != NULL) { + if (result->rx_fd >= 0) + os_close_file(result->rx_fd); + if (result->tx_fd >= 0) + os_close_file(result->tx_fd); + kfree(result); + } + return NULL; +} + + +static struct vector_fds *user_init_raw_fds(struct arglist *ifspec) +{ + struct ifreq ifr; + int rxfd = -1, txfd = -1; + struct sockaddr_ll sock; + int err = -ENOMEM; + char *iface; + struct vector_fds *result = NULL; + + iface = uml_vector_fetch_arg(ifspec, TOKEN_IFNAME); + if (iface == NULL) + goto cleanup; + + rxfd = socket(AF_PACKET, SOCK_RAW, ETH_P_ALL); + if (rxfd == -1) { + err = -errno; + goto cleanup; + } + txfd = socket(AF_PACKET, SOCK_RAW, 0); /* Turn off RX on this fd */ + if (txfd == -1) { + err = -errno; + goto cleanup; + } + memset(&ifr, 0, sizeof(ifr)); + strncpy((char *)&ifr.ifr_name, iface, sizeof(ifr.ifr_name) - 1); + if (ioctl(rxfd, SIOCGIFINDEX, (void *) &ifr) < 0) { + err = -errno; + goto cleanup; + } + + sock.sll_family = AF_PACKET; + sock.sll_protocol = htons(ETH_P_ALL); + sock.sll_ifindex = ifr.ifr_ifindex; + + if (bind(rxfd, + (struct 
sockaddr *) &sock, sizeof(struct sockaddr_ll)) < 0) { + err = -errno; + goto cleanup; + } + + sock.sll_family = AF_PACKET; + sock.sll_protocol = htons(ETH_P_IP); + sock.sll_ifindex = ifr.ifr_ifindex; + + if (bind(txfd, + (struct sockaddr *) &sock, sizeof(struct sockaddr_ll)) < 0) { + err = -errno; + goto cleanup; + } + + result = uml_kmalloc(sizeof(struct vector_fds), UM_GFP_KERNEL); + if (result != NULL) { + result->rx_fd = rxfd; + result->tx_fd = txfd; + result->remote_addr = NULL; + result->remote_addr_size = 0; + } + return result; +cleanup: + printk(UM_KERN_ERR "user_init_raw: init failed, error %d", err); + if (rxfd >= 0) + os_close_file(rxfd); + if (txfd >= 0) + os_close_file(txfd); + if (result != NULL) + kfree(result); + return NULL; +} + + +bool uml_raw_enable_qdisc_bypass(int fd) +{ + int optval = 1; + + if (setsockopt(fd, + SOL_PACKET, PACKET_QDISC_BYPASS, + &optval, sizeof(optval)) != 0) { + return false; + } + return true; +} + +bool uml_raw_enable_vnet_headers(int fd) +{ + int optval = 1; + + if (setsockopt(fd, + SOL_PACKET, PACKET_VNET_HDR, + &optval, sizeof(optval)) != 0) { + printk(UM_KERN_INFO VNET_HDR_FAIL, fd); + return false; + } + return true; +} +bool uml_tap_enable_vnet_headers(int fd) +{ + unsigned int features; + int len = sizeof(struct virtio_net_hdr); + + if (ioctl(fd, TUNGETFEATURES, &features) == -1) { + printk(UM_KERN_INFO TUN_GET_F_FAIL, strerror(errno)); + return false; + } + if ((features & IFF_VNET_HDR) == 0) { + printk(UM_KERN_INFO "tapraw: No VNET HEADER support"); + return false; + } + ioctl(fd, TUNSETVNETHDRSZ, &len); + return true; +} + +static struct vector_fds *user_init_socket_fds(struct arglist *ifspec, int id) +{ + int err = -ENOMEM; + int fd = -1, gairet; + struct addrinfo srchints; + struct addrinfo dsthints; + bool v6, udp; + char *value; + char *src, *dst, *srcport, *dstport; + struct addrinfo *gairesult = NULL; + struct vector_fds *result = NULL; + + + value = uml_vector_fetch_arg(ifspec, "v6"); + v6 = false; + udp 
= false; + if (value != NULL) { + if (strtol((const char *) value, NULL, 10) > 0) + v6 = true; + } + + value = uml_vector_fetch_arg(ifspec, "udp"); + if (value != NULL) { + if (strtol((const char *) value, NULL, 10) > 0) + udp = true; + } + src = uml_vector_fetch_arg(ifspec, "src"); + dst = uml_vector_fetch_arg(ifspec, "dst"); + srcport = uml_vector_fetch_arg(ifspec, "srcport"); + dstport = uml_vector_fetch_arg(ifspec, "dstport"); + + memset(&dsthints, 0, sizeof(dsthints)); + + if (v6) + dsthints.ai_family = AF_INET6; + else + dsthints.ai_family = AF_INET; + + switch (id) { + case ID_GRE: + dsthints.ai_socktype = SOCK_RAW; + dsthints.ai_protocol = IPPROTO_GRE; + break; + case ID_L2TPV3: + if (udp) { + dsthints.ai_socktype = SOCK_DGRAM; + dsthints.ai_protocol = 0; + } else { + dsthints.ai_socktype = SOCK_RAW; + dsthints.ai_protocol = IPPROTO_L2TP; + } + break; + default: + printk(KERN_ERR "Unsupported socket type\n"); + return NULL; + } + memcpy(&srchints, &dsthints, sizeof(struct addrinfo)); + + gairet = getaddrinfo(src, srcport, &dsthints, &gairesult); + if ((gairet != 0) || (gairesult == NULL)) { + printk(UM_KERN_ERR + "socket_open : could not resolve src, error = %s", + gai_strerror(gairet) + ); + return NULL; + } + fd = socket(gairesult->ai_family, + gairesult->ai_socktype, gairesult->ai_protocol); + if (fd == -1) { + printk(UM_KERN_ERR + "socket_open : could not open socket, error = %d", + -errno + ); + goto cleanup; + } + if (bind(fd, + (struct sockaddr *) gairesult->ai_addr, + gairesult->ai_addrlen)) { + printk(UM_KERN_ERR L2TPV3_BIND_FAIL, errno); + goto cleanup; + } + + if (gairesult != NULL) + freeaddrinfo(gairesult); + + gairesult = NULL; + + gairet = getaddrinfo(dst, dstport, &dsthints, &gairesult); + if ((gairet != 0) || (gairesult == NULL)) { + printk(UM_KERN_ERR + "socket_open : could not resolve dst, error = %s", + gai_strerror(gairet) + ); + goto cleanup; /* fd is open and bound by now - do not leak it */ + } + + result = uml_kmalloc(sizeof(struct vector_fds), UM_GFP_KERNEL); + if (result != NULL) { 
+ result->rx_fd = fd; + result->tx_fd = fd; + result->remote_addr = uml_kmalloc( + gairesult->ai_addrlen, UM_GFP_KERNEL); + if (result->remote_addr == NULL) + goto cleanup; + result->remote_addr_size = gairesult->ai_addrlen; + memcpy( + result->remote_addr, + gairesult->ai_addr, + gairesult->ai_addrlen + ); + } + freeaddrinfo(gairesult); + return result; +cleanup: + if (gairesult != NULL) + freeaddrinfo(gairesult); + printk(UM_KERN_ERR "user_init_socket: init failed, error %d", err); + if (fd >= 0) + os_close_file(fd); + if (result != NULL) { + if (result->remote_addr != NULL) + kfree(result->remote_addr); + kfree(result); + } + return NULL; +} + +struct vector_fds *uml_vector_user_open( + int unit, + struct arglist *parsed +) +{ + char *transport; + + if (parsed == NULL) { + printk(UM_KERN_ERR "no parsed config for unit %d\n", unit); + return NULL; + } + transport = uml_vector_fetch_arg(parsed, "transport"); + if (transport == NULL) { + printk(UM_KERN_ERR "missing transport for unit %d\n", unit); + return NULL; + } + if (strncmp(transport, TRANS_RAW, TRANS_RAW_LEN) == 0) + return user_init_raw_fds(parsed); + if (strncmp(transport, TRANS_TAP, TRANS_TAP_LEN) == 0) + return user_init_tap_fds(parsed); + if (strncmp(transport, TRANS_GRE, TRANS_GRE_LEN) == 0) + return user_init_socket_fds(parsed, ID_GRE); + if (strncmp(transport, TRANS_L2TPV3, TRANS_L2TPV3_LEN) == 0) + return user_init_socket_fds(parsed, ID_L2TPV3); + return NULL; +} + + +int uml_vector_sendmsg(int fd, void *hdr, int flags) +{ + int n; + + CATCH_EINTR(n = sendmsg(fd, (struct msghdr *) hdr, flags)); + if ((n < 0) && (errno == EAGAIN)) + return 0; + if (n >= 0) + return n; + else + return -errno; +} + +int uml_vector_recvmsg(int fd, void *hdr, int flags) +{ + int n; + + CATCH_EINTR(n = recvmsg(fd, (struct msghdr *) hdr, flags)); + if ((n < 0) && (errno == EAGAIN)) + return 0; + if (n >= 0) + return n; + else + return -errno; +} + +int uml_vector_writev(int fd, void *hdr, int iovcount) +{ + int n; + + 
CATCH_EINTR(n = writev(fd, (struct iovec *) hdr, iovcount)); + if ((n < 0) && (errno == EAGAIN)) + return 0; + if (n >= 0) + return n; + else + return -errno; +} + +int uml_vector_sendmmsg( + int fd, + void *msgvec, + unsigned int vlen, + unsigned int flags) +{ + int n; + + CATCH_EINTR(n = sendmmsg(fd, (struct mmsghdr *) msgvec, vlen, flags)); + if ((n < 0) && (errno == EAGAIN)) + return 0; + if (n >= 0) + return n; + else + return -errno; +} + +int uml_vector_recvmmsg( + int fd, + void *msgvec, + unsigned int vlen, + unsigned int flags) +{ + int n; + + CATCH_EINTR( + n = recvmmsg(fd, (struct mmsghdr *) msgvec, vlen, flags, 0)); + if ((n < 0) && (errno == EAGAIN)) + return 0; + if (n >= 0) + return n; + else + return -errno; +} +int uml_vector_attach_bpf(int fd, void *bpf, int bpf_len) +{ + int err = setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, bpf, bpf_len); + + if (err < 0) + printk(KERN_ERR BPF_ATTACH_FAIL, bpf_len, fd, -errno); + return err; +} + +#define DEFAULT_BPF_LEN 6 + +void *uml_vector_default_bpf(int fd, void *mac) +{ + struct sock_filter *bpf; + uint32_t *mac1 = (uint32_t *)(mac + 2); + uint16_t *mac2 = (uint16_t *) mac; + struct sock_fprog bpf_prog = { + .len = 6, + .filter = NULL, + }; + + bpf = uml_kmalloc( + sizeof(struct sock_filter) * DEFAULT_BPF_LEN, UM_GFP_KERNEL); + if (bpf != NULL) { + bpf_prog.filter = bpf; + /* ld [8] */ + bpf[0] = (struct sock_filter){ 0x20, 0, 0, 0x00000008 }; + /* jeq #0xMAC[2-6] jt 2 jf 5*/ + bpf[1] = (struct sock_filter){ 0x15, 0, 3, ntohl(*mac1)}; + /* ldh [6] */ + bpf[2] = (struct sock_filter){ 0x28, 0, 0, 0x00000006 }; + /* jeq #0xMAC[0-1] jt 4 jf 5 */ + bpf[3] = (struct sock_filter){ 0x15, 0, 1, ntohs(*mac2)}; + /* ret #0 */ + bpf[4] = (struct sock_filter){ 0x6, 0, 0, 0x00000000 }; + /* ret #0x40000 */ + bpf[5] = (struct sock_filter){ 0x6, 0, 0, 0x00040000 }; + if (uml_vector_attach_bpf( + fd, &bpf_prog, sizeof(struct sock_fprog)) < 0) { + kfree(bpf); + bpf = NULL; + } + } + return bpf; +} + diff --git 
a/arch/um/drivers/vector_user.h b/arch/um/drivers/vector_user.h new file mode 100644 index 000000000000..d7cbff73b7ff --- /dev/null +++ b/arch/um/drivers/vector_user.h @@ -0,0 +1,100 @@ +/* + * Copyright (C) 2002 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Licensed under the GPL + */ + +#ifndef __UM_VECTOR_USER_H +#define __UM_VECTOR_USER_H + +#define MAXVARGS 20 + +#define TOKEN_IFNAME "ifname" + +#define TRANS_RAW "raw" +#define TRANS_RAW_LEN strlen(TRANS_RAW) + +#define TRANS_TAP "tap" +#define TRANS_TAP_LEN strlen(TRANS_TAP) + + +#define TRANS_GRE "gre" +#define TRANS_GRE_LEN strlen(TRANS_GRE) + +#define TRANS_L2TPV3 "l2tpv3" +#define TRANS_L2TPV3_LEN strlen(TRANS_L2TPV3) + +#ifndef IPPROTO_GRE +#define IPPROTO_GRE 0x2F +#endif + +#define GRE_MODE_CHECKSUM cpu_to_be16(8 << 12) /* checksum */ +#define GRE_MODE_RESERVED cpu_to_be16(4 << 12) /* unused */ +#define GRE_MODE_KEY cpu_to_be16(2 << 12) /* KEY present */ +#define GRE_MODE_SEQUENCE cpu_to_be16(1 << 12) /* sequence */ + +#define GRE_IRB cpu_to_be16(0x6558) + +#define L2TPV3_DATA_PACKET 0x30000 + +/* IANA-assigned IP protocol ID for L2TPv3 */ + +#ifndef IPPROTO_L2TP +#define IPPROTO_L2TP 0x73 +#endif + +struct arglist { + int numargs; + char *tokens[MAXVARGS]; + char *values[MAXVARGS]; +}; + +/* Separating read and write FDs allows us to have different + * rx and tx method. 
Example - read tap via raw socket using + * recvmmsg, write using legacy tap write calls + */ + +struct vector_fds { + int rx_fd; + int tx_fd; + void *remote_addr; + int remote_addr_size; +}; + +#define VECTOR_READ 1 +#define VECTOR_WRITE (1 << 1) /* bit flag, distinct from VECTOR_READ */ +#define VECTOR_HEADERS (1 << 2) /* bit flag, distinct from VECTOR_READ */ + +extern struct arglist *uml_parse_vector_ifspec(char *arg); + +extern struct vector_fds *uml_vector_user_open( + int unit, + struct arglist *parsed +); + +extern char *uml_vector_fetch_arg( + struct arglist *ifspec, + char *token +); + +extern int uml_vector_recvmsg(int fd, void *hdr, int flags); +extern int uml_vector_sendmsg(int fd, void *hdr, int flags); +extern int uml_vector_writev(int fd, void *hdr, int iovcount); +extern int uml_vector_sendmmsg( + int fd, void *msgvec, + unsigned int vlen, + unsigned int flags +); +extern int uml_vector_recvmmsg( + int fd, + void *msgvec, + unsigned int vlen, + unsigned int flags +); +extern void *uml_vector_default_bpf(int fd, void *mac); +extern int uml_vector_attach_bpf(int fd, void *bpf, int bpf_len); +extern bool uml_raw_enable_qdisc_bypass(int fd); +extern bool uml_raw_enable_vnet_headers(int fd); +extern bool uml_tap_enable_vnet_headers(int fd); + + +#endif diff --git a/arch/um/include/asm/asm-prototypes.h b/arch/um/include/asm/asm-prototypes.h new file mode 100644 index 000000000000..5898a26daa0d --- /dev/null +++ b/arch/um/include/asm/asm-prototypes.h @@ -0,0 +1 @@ +#include <asm-generic/asm-prototypes.h> diff --git a/arch/um/include/asm/irq.h b/arch/um/include/asm/irq.h index b5cdd3f91157..49ed3e35b35a 100644 --- a/arch/um/include/asm/irq.h +++ b/arch/um/include/asm/irq.h @@ -18,7 +18,19 @@ #define XTERM_IRQ 13 #define RANDOM_IRQ 14 +#ifdef CONFIG_UML_NET_VECTOR + +#define VECTOR_BASE_IRQ 15 +#define VECTOR_IRQ_SPACE 8 + +#define LAST_IRQ (VECTOR_IRQ_SPACE + VECTOR_BASE_IRQ) + +#else + #define LAST_IRQ RANDOM_IRQ + +#endif + #define NR_IRQS (LAST_IRQ + 1) #endif diff --git a/arch/um/include/shared/irq_user.h 
b/arch/um/include/shared/irq_user.h index df5633053957..a7a6120f19d5 100644 --- a/arch/um/include/shared/irq_user.h +++ b/arch/um/include/shared/irq_user.h @@ -7,6 +7,7 @@ #define __IRQ_USER_H__ #include <sysdep/ptrace.h> +#include <stdbool.h> struct irq_fd { struct irq_fd *next; @@ -15,10 +16,17 @@ struct irq_fd { int type; int irq; int events; - int current_events; + bool active; + bool pending; + bool purge; }; -enum { IRQ_READ, IRQ_WRITE }; +#define IRQ_READ 0 +#define IRQ_WRITE 1 +#define IRQ_NONE 2 +#define MAX_IRQ_TYPE (IRQ_NONE + 1) + + struct siginfo; extern void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs); diff --git a/arch/um/include/shared/net_kern.h b/arch/um/include/shared/net_kern.h index 012ac87d4900..40442b98b173 100644 --- a/arch/um/include/shared/net_kern.h +++ b/arch/um/include/shared/net_kern.h @@ -65,5 +65,7 @@ extern int tap_setup_common(char *str, char *type, char **dev_name, char **mac_out, char **gate_addr); extern void register_transport(struct transport *new); extern unsigned short eth_protocol(struct sk_buff *skb); +extern void uml_net_setup_etheraddr(struct net_device *dev, char *str); + #endif diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index d8ddaf9790d2..048ae37eb5aa 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -290,15 +290,16 @@ extern void halt_skas(void); extern void reboot_skas(void); /* irq.c */ -extern int os_waiting_for_events(struct irq_fd *active_fds); -extern int os_create_pollfd(int fd, int events, void *tmp_pfd, int size_tmpfds); -extern void os_free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg, - struct irq_fd *active_fds, struct irq_fd ***last_irq_ptr2); -extern void os_free_irq_later(struct irq_fd *active_fds, - int irq, void *dev_id); -extern int os_get_pollfd(int i); -extern void os_set_pollfd(int i, int fd); +extern int os_waiting_for_events_epoll(void); +extern void *os_epoll_get_data_pointer(int index); 
+extern int os_epoll_triggered(int index, int events); +extern int os_event_mask(int irq_type); +extern int os_setup_epoll(void); +extern int os_add_epoll_fd(int events, int fd, void *data); +extern int os_mod_epoll_fd(int events, int fd, void *data); +extern int os_del_epoll_fd(int fd); extern void os_set_ioignore(void); +extern void os_close_epoll_fd(void); /* sigio.c */ extern int add_sigio_fd(int fd); diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index 23cb9350d47e..6b7f3827d6e4 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -1,4 +1,6 @@ /* + * Copyright (C) 2017 - Cambridge Greys Ltd + * Copyright (C) 2011 - 2014 Cisco Systems Inc * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) * Licensed under the GPL * Derived (i.e. mostly copied) from arch/i386/kernel/irq.c: @@ -16,243 +18,362 @@ #include <as-layout.h> #include <kern_util.h> #include <os.h> +#include <irq_user.h> -/* - * This list is accessed under irq_lock, except in sigio_handler, - * where it is safe from being modified. IRQ handlers won't change it - - * if an IRQ source has vanished, it will be freed by free_irqs just - * before returning from sigio_handler. That will process a separate - * list of irqs to free, with its own locking, coming back here to - * remove list elements, taking the irq_lock to do so. + +/* When epoll triggers we do not know why it did so + * we can also have different IRQs for read and write. 
+ * This is why we keep a small irq_fd array for each fd - + * one entry per IRQ type */ -static struct irq_fd *active_fds = NULL; -static struct irq_fd **last_irq_ptr = &active_fds; -extern void free_irqs(void); +struct irq_entry { + struct irq_entry *next; + int fd; + struct irq_fd *irq_array[MAX_IRQ_TYPE + 1]; +}; + +static struct irq_entry *active_fds; + +static DEFINE_SPINLOCK(irq_lock); + +static void irq_io_loop(struct irq_fd *irq, struct uml_pt_regs *regs) +{ +/* + * irq->active guards against reentry + * irq->pending accumulates pending requests + * if pending is raised the irq_handler is re-run + * until pending is cleared + */ + if (irq->active) { + irq->active = false; + do { + irq->pending = false; + do_IRQ(irq->irq, regs); + } while (irq->pending && (!irq->purge)); + if (!irq->purge) + irq->active = true; + } else { + irq->pending = true; + } +} void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) { - struct irq_fd *irq_fd; - int n; + struct irq_entry *irq_entry; + struct irq_fd *irq; + + int n, i, j; while (1) { - n = os_waiting_for_events(active_fds); + /* This is now lockless - epoll keeps back-references to the irqs + * which have triggered it so there is no need to walk the irq + * list and lock it every time. We avoid locking by turning off + * IO for a specific fd by executing os_del_epoll_fd(fd) before + * we do any changes to the actual data structures + */ + n = os_waiting_for_events_epoll(); + if (n <= 0) { if (n == -EINTR) continue; - else break; + else + break; } - for (irq_fd = active_fds; irq_fd != NULL; - irq_fd = irq_fd->next) { - if (irq_fd->current_events != 0) { - irq_fd->current_events = 0; - do_IRQ(irq_fd->irq, regs); + for (i = 0; i < n ; i++) { + /* Epoll back reference is the entry with 3 irq_fd + * leaves - one for each irq type. 
+ */ + irq_entry = (struct irq_entry *) + os_epoll_get_data_pointer(i); + for (j = 0; j < MAX_IRQ_TYPE ; j++) { + irq = irq_entry->irq_array[j]; + if (irq == NULL) + continue; + if (os_epoll_triggered(i, irq->events) > 0) + irq_io_loop(irq, regs); + if (irq->purge) { + irq_entry->irq_array[j] = NULL; + kfree(irq); + } } } } +} + +static int assign_epoll_events_to_irq(struct irq_entry *irq_entry) +{ + int i; + int events = 0; + struct irq_fd *irq; - free_irqs(); + for (i = 0; i < MAX_IRQ_TYPE ; i++) { + irq = irq_entry->irq_array[i]; + if (irq != NULL) + events = irq->events | events; + } + if (events > 0) { + /* os_add_epoll will call os_mod_epoll if this already exists */ + return os_add_epoll_fd(events, irq_entry->fd, irq_entry); + } + /* No events - delete */ + return os_del_epoll_fd(irq_entry->fd); } -static DEFINE_SPINLOCK(irq_lock); + static int activate_fd(int irq, int fd, int type, void *dev_id) { - struct pollfd *tmp_pfd; - struct irq_fd *new_fd, *irq_fd; + struct irq_fd *new_fd; + struct irq_entry *irq_entry; + int i, err, events; unsigned long flags; - int events, err, n; err = os_set_fd_async(fd); if (err < 0) goto out; - err = -ENOMEM; - new_fd = kmalloc(sizeof(struct irq_fd), GFP_KERNEL); - if (new_fd == NULL) - goto out; + spin_lock_irqsave(&irq_lock, flags); - if (type == IRQ_READ) - events = UM_POLLIN | UM_POLLPRI; - else events = UM_POLLOUT; - *new_fd = ((struct irq_fd) { .next = NULL, - .id = dev_id, - .fd = fd, - .type = type, - .irq = irq, - .events = events, - .current_events = 0 } ); + /* Check if we have an entry for this fd */ err = -EBUSY; - spin_lock_irqsave(&irq_lock, flags); - for (irq_fd = active_fds; irq_fd != NULL; irq_fd = irq_fd->next) { - if ((irq_fd->fd == fd) && (irq_fd->type == type)) { - printk(KERN_ERR "Registering fd %d twice\n", fd); - printk(KERN_ERR "Irqs : %d, %d\n", irq_fd->irq, irq); - printk(KERN_ERR "Ids : 0x%p, 0x%p\n", irq_fd->id, - dev_id); + for (irq_entry = active_fds; + irq_entry != NULL; irq_entry = 
irq_entry->next) { + if (irq_entry->fd == fd) + break; + } + + if (irq_entry == NULL) { + /* This needs to be atomic as it may be called from an + * IRQ context. + */ + irq_entry = kmalloc(sizeof(struct irq_entry), GFP_ATOMIC); + if (irq_entry == NULL) { + printk(KERN_ERR + "Failed to allocate new IRQ entry\n"); goto out_unlock; } + irq_entry->fd = fd; + for (i = 0; i < MAX_IRQ_TYPE; i++) + irq_entry->irq_array[i] = NULL; + irq_entry->next = active_fds; + active_fds = irq_entry; } - if (type == IRQ_WRITE) - fd = -1; - - tmp_pfd = NULL; - n = 0; + /* Check if we are trying to re-register an interrupt for a + * particular fd + */ - while (1) { - n = os_create_pollfd(fd, events, tmp_pfd, n); - if (n == 0) - break; + if (irq_entry->irq_array[type] != NULL) { + printk(KERN_ERR + "Trying to reregister IRQ %d FD %d TYPE %d ID %p\n", + irq, fd, type, dev_id + ); + goto out_unlock; + } else { + /* New entry for this fd */ + + err = -ENOMEM; + new_fd = kmalloc(sizeof(struct irq_fd), GFP_ATOMIC); + if (new_fd == NULL) + goto out_unlock; - /* - * n > 0 - * It means we couldn't put new pollfd to current pollfds - * and tmp_fds is NULL or too small for new pollfds array. - * Needed size is equal to n as minimum. - * - * Here we have to drop the lock in order to call - * kmalloc, which might sleep. - * If something else came in and changed the pollfds array - * so we will not be able to put new pollfd struct to pollfds - * then we free the buffer tmp_fds and try again. 
+ events = os_event_mask(type); + + *new_fd = ((struct irq_fd) { + .id = dev_id, + .irq = irq, + .type = type, + .events = events, + .active = true, + .pending = false, + .purge = false + }); + /* Turn off any IO on this fd - allows us to + * avoid locking the IRQ loop */ - spin_unlock_irqrestore(&irq_lock, flags); - kfree(tmp_pfd); - - tmp_pfd = kmalloc(n, GFP_KERNEL); - if (tmp_pfd == NULL) - goto out_kfree; - - spin_lock_irqsave(&irq_lock, flags); + os_del_epoll_fd(irq_entry->fd); + irq_entry->irq_array[type] = new_fd; } - *last_irq_ptr = new_fd; - last_irq_ptr = &new_fd->next; - + /* Turn back IO on with the correct (new) IO event mask */ + assign_epoll_events_to_irq(irq_entry); spin_unlock_irqrestore(&irq_lock, flags); - - /* - * This calls activate_fd, so it has to be outside the critical - * section. - */ - maybe_sigio_broken(fd, (type == IRQ_READ)); + maybe_sigio_broken(fd, (type != IRQ_NONE)); return 0; - - out_unlock: +out_unlock: spin_unlock_irqrestore(&irq_lock, flags); - out_kfree: - kfree(new_fd); - out: +out: return err; } -static void free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg) +/* + * Walk the IRQ list and dispose of any unused entries. + * Should be done under irq_lock. 
+ */ + +static void garbage_collect_irq_entries(void) { - unsigned long flags; + int i; + bool reap; + struct irq_entry *walk; + struct irq_entry *previous = NULL; + struct irq_entry *to_free; - spin_lock_irqsave(&irq_lock, flags); - os_free_irq_by_cb(test, arg, active_fds, &last_irq_ptr); - spin_unlock_irqrestore(&irq_lock, flags); + if (active_fds == NULL) + return; + walk = active_fds; + while (walk != NULL) { + reap = true; + for (i = 0; i < MAX_IRQ_TYPE ; i++) { + if (walk->irq_array[i] != NULL) { + reap = false; + break; + } + } + if (reap) { + if (previous == NULL) + active_fds = walk->next; + else + previous->next = walk->next; + to_free = walk; + } else { + previous = walk; /* last entry kept in the list */ + to_free = NULL; + } + walk = walk->next; + kfree(to_free); /* kfree(NULL) is a no-op */ + } } -struct irq_and_dev { - int irq; - void *dev; -}; +/* + * Walk the IRQ list and get the descriptor for our FD + */ -static int same_irq_and_dev(struct irq_fd *irq, void *d) +static struct irq_entry *get_irq_entry_by_fd(int fd) { - struct irq_and_dev *data = d; + struct irq_entry *walk = active_fds; - return ((irq->irq == data->irq) && (irq->id == data->dev)); + while (walk != NULL) { + if (walk->fd == fd) + return walk; + walk = walk->next; + } + return NULL; } -static void free_irq_by_irq_and_dev(unsigned int irq, void *dev) -{ - struct irq_and_dev data = ((struct irq_and_dev) { .irq = irq, - .dev = dev }); - free_irq_by_cb(same_irq_and_dev, &data); -} +/* + * Walk the IRQ list and dispose of an entry for a specific + * device, fd and number. Note - if sharing an IRQ for read + * and write for the same FD it will be disposed in either case. + * If this behaviour is undesirable use different IRQ ids. 
+ */ -static int same_fd(struct irq_fd *irq, void *fd) -{ - return (irq->fd == *((int *)fd)); -} +#define IGNORE_IRQ 1 +#define IGNORE_DEV (1<<1) -void free_irq_by_fd(int fd) +static void do_free_by_irq_and_dev( + struct irq_entry *irq_entry, + unsigned int irq, + void *dev, + int flags +) { - free_irq_by_cb(same_fd, &fd); + int i; + struct irq_fd *to_free; + + for (i = 0; i < MAX_IRQ_TYPE ; i++) { + if (irq_entry->irq_array[i] != NULL) { + if ( + ((flags & IGNORE_IRQ) || + (irq_entry->irq_array[i]->irq == irq)) && + ((flags & IGNORE_DEV) || + (irq_entry->irq_array[i]->id == dev)) + ) { + /* Turn off any IO on this fd - allows us to + * avoid locking the IRQ loop + */ + os_del_epoll_fd(irq_entry->fd); + to_free = irq_entry->irq_array[i]; + irq_entry->irq_array[i] = NULL; + assign_epoll_events_to_irq(irq_entry); + if (to_free->active) + to_free->purge = true; + else + kfree(to_free); + } + } + } } -/* Must be called with irq_lock held */ -static struct irq_fd *find_irq_by_fd(int fd, int irqnum, int *index_out) +void free_irq_by_fd(int fd) { - struct irq_fd *irq; - int i = 0; - int fdi; + struct irq_entry *to_free; + unsigned long flags; - for (irq = active_fds; irq != NULL; irq = irq->next) { - if ((irq->fd == fd) && (irq->irq == irqnum)) - break; - i++; - } - if (irq == NULL) { - printk(KERN_ERR "find_irq_by_fd doesn't have descriptor %d\n", - fd); - goto out; - } - fdi = os_get_pollfd(i); - if ((fdi != -1) && (fdi != fd)) { - printk(KERN_ERR "find_irq_by_fd - mismatch between active_fds " - "and pollfds, fd %d vs %d, need %d\n", irq->fd, - fdi, fd); - irq = NULL; - goto out; + spin_lock_irqsave(&irq_lock, flags); + to_free = get_irq_entry_by_fd(fd); + if (to_free != NULL) { + do_free_by_irq_and_dev( + to_free, + -1, + NULL, + IGNORE_IRQ | IGNORE_DEV + ); } - *index_out = i; - out: - return irq; + garbage_collect_irq_entries(); + spin_unlock_irqrestore(&irq_lock, flags); } +EXPORT_SYMBOL(free_irq_by_fd); -void reactivate_fd(int fd, int irqnum) +static void 
free_irq_by_irq_and_dev(unsigned int irq, void *dev) { - struct irq_fd *irq; + struct irq_entry *to_free; unsigned long flags; - int i; spin_lock_irqsave(&irq_lock, flags); - irq = find_irq_by_fd(fd, irqnum, &i); - if (irq == NULL) { - spin_unlock_irqrestore(&irq_lock, flags); - return; + to_free = active_fds; + while (to_free != NULL) { + do_free_by_irq_and_dev( + to_free, + irq, + dev, + 0 + ); + to_free = to_free->next; } - os_set_pollfd(i, irq->fd); + garbage_collect_irq_entries(); spin_unlock_irqrestore(&irq_lock, flags); +} - add_sigio_fd(fd); + +void reactivate_fd(int fd, int irqnum) +{ + /** NOP - we do auto-EOI now **/ } void deactivate_fd(int fd, int irqnum) { - struct irq_fd *irq; + struct irq_entry *to_free; unsigned long flags; - int i; + os_del_epoll_fd(fd); spin_lock_irqsave(&irq_lock, flags); - irq = find_irq_by_fd(fd, irqnum, &i); - if (irq == NULL) { - spin_unlock_irqrestore(&irq_lock, flags); - return; + to_free = get_irq_entry_by_fd(fd); + if (to_free != NULL) { + do_free_by_irq_and_dev( + to_free, + irqnum, + NULL, + IGNORE_DEV + ); } - - os_set_pollfd(i, -1); + garbage_collect_irq_entries(); spin_unlock_irqrestore(&irq_lock, flags); - ignore_sigio_fd(fd); } EXPORT_SYMBOL(deactivate_fd); @@ -265,17 +386,28 @@ EXPORT_SYMBOL(deactivate_fd); */ int deactivate_all_fds(void) { - struct irq_fd *irq; - int err; + unsigned long flags; + struct irq_entry *to_free; - for (irq = active_fds; irq != NULL; irq = irq->next) { - err = os_clear_fd_async(irq->fd); - if (err) - return err; - } - /* If there is a signal already queued, after unblocking ignore it */ + spin_lock_irqsave(&irq_lock, flags); + /* Stop IO. 
The IRQ loop has no lock so this is our + * only way of making sure we are safe to dispose + * of all IRQ handlers + */ os_set_ioignore(); - + to_free = active_fds; + while (to_free != NULL) { + do_free_by_irq_and_dev( + to_free, + -1, + NULL, + IGNORE_IRQ | IGNORE_DEV + ); + to_free = to_free->next; + } + garbage_collect_irq_entries(); + spin_unlock_irqrestore(&irq_lock, flags); + os_close_epoll_fd(); return 0; } @@ -353,8 +485,11 @@ void __init init_IRQ(void) irq_set_chip_and_handler(TIMER_IRQ, &SIGVTALRM_irq_type, handle_edge_irq); + for (i = 1; i < NR_IRQS; i++) irq_set_chip_and_handler(i, &normal_irq_type, handle_edge_irq); + /* Initialize EPOLL Loop */ + os_setup_epoll(); } /* diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c index 7f69d17de354..052de4c8acb2 100644 --- a/arch/um/kernel/time.c +++ b/arch/um/kernel/time.c @@ -121,12 +121,12 @@ static void __init um_timer_setup(void) clockevents_register_device(&timer_clockevent); } -void read_persistent_clock(struct timespec *ts) +void read_persistent_clock64(struct timespec64 *ts) { long long nsecs = os_persistent_clock_emulation(); - set_normalized_timespec(ts, nsecs / NSEC_PER_SEC, - nsecs % NSEC_PER_SEC); + set_normalized_timespec64(ts, nsecs / NSEC_PER_SEC, + nsecs % NSEC_PER_SEC); } void __init time_init(void) diff --git a/arch/um/os-Linux/file.c b/arch/um/os-Linux/file.c index 2db18cbbb0ea..c0197097c86e 100644 --- a/arch/um/os-Linux/file.c +++ b/arch/um/os-Linux/file.c @@ -12,6 +12,7 @@ #include <sys/mount.h> #include <sys/socket.h> #include <sys/stat.h> +#include <sys/sysmacros.h> #include <sys/un.h> #include <sys/types.h> #include <os.h> diff --git a/arch/um/os-Linux/irq.c b/arch/um/os-Linux/irq.c index b9afb74b79ad..365823010346 100644 --- a/arch/um/os-Linux/irq.c +++ b/arch/um/os-Linux/irq.c @@ -1,135 +1,147 @@ /* + * Copyright (C) 2017 - Cambridge Greys Ltd + * Copyright (C) 2011 - 2014 Cisco Systems Inc * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) * Licensed 
under the GPL */ #include <stdlib.h> #include <errno.h> -#include <poll.h> +#include <sys/epoll.h> #include <signal.h> #include <string.h> #include <irq_user.h> #include <os.h> #include <um_malloc.h> +/* Epoll support */ + +static int epollfd = -1; + +#define MAX_EPOLL_EVENTS 64 + +static struct epoll_event epoll_events[MAX_EPOLL_EVENTS]; + +/* Helper to return an Epoll data pointer from an epoll event structure. + * We need to keep this one on the userspace side to keep includes separate + */ + +void *os_epoll_get_data_pointer(int index) +{ + return epoll_events[index].data.ptr; +} + +/* Helper to compare events versus the events in the epoll structure. + * Same as above - needs to be on the userspace side + */ + + +int os_epoll_triggered(int index, int events) +{ + return epoll_events[index].events & events; +} +/* Helper to set the event mask. + * The event mask is opaque to the kernel side, because it does not have + * access to the right includes/defines for EPOLL constants. + */ + +int os_event_mask(int irq_type) +{ + if (irq_type == IRQ_READ) + return EPOLLIN | EPOLLPRI; + if (irq_type == IRQ_WRITE) + return EPOLLOUT; + return 0; +} + /* - * Locked by irq_lock in arch/um/kernel/irq.c. Changed by os_create_pollfd - * and os_free_irq_by_cb, which are called under irq_lock. 
+ * Initial Epoll Setup */ -static struct pollfd *pollfds = NULL; -static int pollfds_num = 0; -static int pollfds_size = 0; +int os_setup_epoll(void) +{ + epollfd = epoll_create(MAX_EPOLL_EVENTS); + return epollfd; +} -int os_waiting_for_events(struct irq_fd *active_fds) +/* + * Helper to run the actual epoll_wait + */ +int os_waiting_for_events_epoll(void) { - struct irq_fd *irq_fd; - int i, n, err; + int n, err; - n = poll(pollfds, pollfds_num, 0); + n = epoll_wait(epollfd, + (struct epoll_event *) &epoll_events, MAX_EPOLL_EVENTS, 0); if (n < 0) { err = -errno; if (errno != EINTR) - printk(UM_KERN_ERR "os_waiting_for_events:" - " poll returned %d, errno = %d\n", n, errno); + printk( + UM_KERN_ERR "os_waiting_for_events:" + " epoll returned %d, error = %s\n", n, + strerror(errno) + ); return err; } - - if (n == 0) - return 0; - - irq_fd = active_fds; - - for (i = 0; i < pollfds_num; i++) { - if (pollfds[i].revents != 0) { - irq_fd->current_events = pollfds[i].revents; - pollfds[i].fd = -1; - } - irq_fd = irq_fd->next; - } return n; } -int os_create_pollfd(int fd, int events, void *tmp_pfd, int size_tmpfds) -{ - if (pollfds_num == pollfds_size) { - if (size_tmpfds <= pollfds_size * sizeof(pollfds[0])) { - /* return min size needed for new pollfds area */ - return (pollfds_size + 1) * sizeof(pollfds[0]); - } - - if (pollfds != NULL) { - memcpy(tmp_pfd, pollfds, - sizeof(pollfds[0]) * pollfds_size); - /* remove old pollfds */ - kfree(pollfds); - } - pollfds = tmp_pfd; - pollfds_size++; - } else - kfree(tmp_pfd); /* remove not used tmp_pfd */ - - pollfds[pollfds_num] = ((struct pollfd) { .fd = fd, - .events = events, - .revents = 0 }); - pollfds_num++; - - return 0; -} -void os_free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg, - struct irq_fd *active_fds, struct irq_fd ***last_irq_ptr2) +/* + * Helper to add a fd to epoll + */ +int os_add_epoll_fd(int events, int fd, void *data) { - struct irq_fd **prev; - int i = 0; - - prev = &active_fds; - while 
(*prev != NULL) { - if ((*test)(*prev, arg)) { - struct irq_fd *old_fd = *prev; - if ((pollfds[i].fd != -1) && - (pollfds[i].fd != (*prev)->fd)) { - printk(UM_KERN_ERR "os_free_irq_by_cb - " - "mismatch between active_fds and " - "pollfds, fd %d vs %d\n", - (*prev)->fd, pollfds[i].fd); - goto out; - } - - pollfds_num--; - - /* - * This moves the *whole* array after pollfds[i] - * (though it doesn't spot as such)! - */ - memmove(&pollfds[i], &pollfds[i + 1], - (pollfds_num - i) * sizeof(pollfds[0])); - if (*last_irq_ptr2 == &old_fd->next) - *last_irq_ptr2 = prev; - - *prev = (*prev)->next; - if (old_fd->type == IRQ_WRITE) - ignore_sigio_fd(old_fd->fd); - kfree(old_fd); - continue; - } - prev = &(*prev)->next; - i++; - } - out: - return; + struct epoll_event event; + int result; + + event.data.ptr = data; + event.events = events | EPOLLET; + result = epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &event); + if ((result) && (errno == EEXIST)) + result = os_mod_epoll_fd(events, fd, data); + if (result) + printk("epollctl add err fd %d, %s\n", fd, strerror(errno)); + return result; } -int os_get_pollfd(int i) +/* + * Helper to mod the fd event mask and/or data backreference + */ +int os_mod_epoll_fd(int events, int fd, void *data) { - return pollfds[i].fd; + struct epoll_event event; + int result; + + event.data.ptr = data; + event.events = events; + result = epoll_ctl(epollfd, EPOLL_CTL_MOD, fd, &event); + if (result) + printk(UM_KERN_ERR + "epollctl mod err fd %d, %s\n", fd, strerror(errno)); + return result; } -void os_set_pollfd(int i, int fd) +/* + * Helper to delete the epoll fd + */ +int os_del_epoll_fd(int fd) { - pollfds[i].fd = fd; + struct epoll_event event; + int result; + /* This is quiet as we use this as IO ON/OFF - so it is often + * invoked on a non-existent fd + */ + result = epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, &event); + return result; } void os_set_ioignore(void) { signal(SIGIO, SIG_IGN); } + +void os_close_epoll_fd(void) +{ + /* Needed so we do not leak an 
fd when rebooting */ + os_close_file(epollfd); +} diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c index a86d7cc2c2d8..bf0acb8aad8b 100644 --- a/arch/um/os-Linux/signal.c +++ b/arch/um/os-Linux/signal.c @@ -16,6 +16,7 @@ #include <os.h> #include <sysdep/mcontext.h> #include <um_malloc.h> +#include <sys/ucontext.h> void (*sig_info[NSIG])(int, struct siginfo *, struct uml_pt_regs *) = { [SIGTRAP] = relay_signal, @@ -159,7 +160,7 @@ static void (*handlers[_NSIG])(int sig, struct siginfo *si, mcontext_t *mc) = { static void hard_handler(int sig, siginfo_t *si, void *p) { - struct ucontext *uc = p; + ucontext_t *uc = p; mcontext_t *mc = &uc->uc_mcontext; unsigned long pending = 1UL << sig; diff --git a/arch/unicore32/include/asm/cacheflush.h b/arch/unicore32/include/asm/cacheflush.h index a5e08e2d5d6d..1d9132b66039 100644 --- a/arch/unicore32/include/asm/cacheflush.h +++ b/arch/unicore32/include/asm/cacheflush.h @@ -170,10 +170,8 @@ extern void flush_cache_page(struct vm_area_struct *vma, #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 extern void flush_dcache_page(struct page *); -#define flush_dcache_mmap_lock(mapping) \ - spin_lock_irq(&(mapping)->tree_lock) -#define flush_dcache_mmap_unlock(mapping) \ - spin_unlock_irq(&(mapping)->tree_lock) +#define flush_dcache_mmap_lock(mapping) do { } while (0) +#define flush_dcache_mmap_unlock(mapping) do { } while (0) #define flush_icache_user_range(vma, page, addr, len) \ flush_dcache_page(page) diff --git a/arch/unicore32/include/asm/memory.h b/arch/unicore32/include/asm/memory.h index 3bb0a29fd2d7..66bb9f6525c0 100644 --- a/arch/unicore32/include/asm/memory.h +++ b/arch/unicore32/include/asm/memory.h @@ -20,12 +20,6 @@ #include <mach/memory.h> /* - * Allow for constants defined here to be used from assembly code - * by prepending the UL suffix only with actual C code compilation. 
- */ -#define UL(x) _AC(x, UL) - -/* * PAGE_OFFSET - the virtual address of the start of the kernel image * TASK_SIZE - the maximum size of a user space task. * TASK_UNMAPPED_BASE - the lower boundary of the mmap VM area diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 199e15bd3ec5..ce8b4da07e35 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -122,12 +122,14 @@ struct x86_init_pci { * @guest_late_init: guest late init * @x2apic_available: X2APIC detection * @init_mem_mapping: setup early mappings during init_mem_mapping() + * @init_after_bootmem: guest init after boot allocator is finished */ struct x86_hyper_init { void (*init_platform)(void); void (*guest_late_init)(void); bool (*x2apic_available)(void); void (*init_mem_mapping)(void); + void (*init_after_bootmem)(void); }; /** diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index ebda84a91510..3ab867603e81 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -92,6 +92,7 @@ struct x86_init_ops x86_init __initdata = { .guest_late_init = x86_init_noop, .x2apic_available = bool_x86_init_noop, .init_mem_mapping = x86_init_noop, + .init_after_bootmem = x86_init_noop, }, .acpi = { diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 396e1f0151ac..8008db2bddb3 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -778,6 +778,7 @@ void __init mem_init(void) free_all_bootmem(); after_bootmem = 1; + x86_init.hyper.init_after_bootmem(); mem_init_print_info(NULL); printk(KERN_INFO "virtual kernel memory layout:\n" diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index dca9abf2b85c..66de40e45f58 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1185,6 +1185,7 @@ void __init mem_init(void) /* this will put all memory onto the freelists */ free_all_bootmem(); after_bootmem = 1; + x86_init.hyper.init_after_bootmem(); /* * Must be done after boot memory is put on 
freelist, because here we diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 155ecbac9e28..48c591251600 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -90,9 +90,10 @@ unsigned long arch_mmap_rnd(void) return arch_rnd(mmap_is_ia32() ? mmap32_rnd_bits : mmap64_rnd_bits); } -static unsigned long mmap_base(unsigned long rnd, unsigned long task_size) +static unsigned long mmap_base(unsigned long rnd, unsigned long task_size, + struct rlimit *rlim_stack) { - unsigned long gap = rlimit(RLIMIT_STACK); + unsigned long gap = rlim_stack->rlim_cur; unsigned long pad = stack_maxrandom_size(task_size) + stack_guard_gap; unsigned long gap_min, gap_max; @@ -126,16 +127,17 @@ static unsigned long mmap_legacy_base(unsigned long rnd, * process VM image, sets up which VM layout function to use: */ static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base, - unsigned long random_factor, unsigned long task_size) + unsigned long random_factor, unsigned long task_size, + struct rlimit *rlim_stack) { *legacy_base = mmap_legacy_base(random_factor, task_size); if (mmap_is_legacy()) *base = *legacy_base; else - *base = mmap_base(random_factor, task_size); + *base = mmap_base(random_factor, task_size, rlim_stack); } -void arch_pick_mmap_layout(struct mm_struct *mm) +void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { if (mmap_is_legacy()) mm->get_unmapped_area = arch_get_unmapped_area; @@ -143,7 +145,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm) mm->get_unmapped_area = arch_get_unmapped_area_topdown; arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base, - arch_rnd(mmap64_rnd_bits), task_size_64bit(0)); + arch_rnd(mmap64_rnd_bits), task_size_64bit(0), + rlim_stack); #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES /* @@ -153,7 +156,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm) * mmap_base, the compat syscall uses mmap_compat_base. 
*/ arch_pick_mmap_base(&mm->mmap_compat_base, &mm->mmap_compat_legacy_base, - arch_rnd(mmap32_rnd_bits), task_size_32bit()); + arch_rnd(mmap32_rnd_bits), task_size_32bit(), + rlim_stack); #endif } diff --git a/arch/x86/um/stub_segv.c b/arch/x86/um/stub_segv.c index 1518d2805ae8..27361cbb7ca9 100644 --- a/arch/x86/um/stub_segv.c +++ b/arch/x86/um/stub_segv.c @@ -6,11 +6,12 @@ #include <sysdep/stub.h> #include <sysdep/faultinfo.h> #include <sysdep/mcontext.h> +#include <sys/ucontext.h> void __attribute__ ((__section__ (".__syscall_stub"))) stub_segv_handler(int sig, siginfo_t *info, void *p) { - struct ucontext *uc = p; + ucontext_t *uc = p; GET_FAULTINFO_FROM_MC(*((struct faultinfo *) STUB_DATA), &uc->uc_mcontext); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 3c2c2530737e..c36d23aa6c35 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -1259,10 +1259,6 @@ asmlinkage __visible void __init xen_start_kernel(void) */ __userpte_alloc_gfp &= ~__GFP_HIGHMEM; - /* Work out if we support NX */ - get_cpu_cap(&boot_cpu_data); - x86_configure_nx(); - /* Get mfn list */ xen_build_dynamic_phys_to_machine(); @@ -1272,6 +1268,10 @@ asmlinkage __visible void __init xen_start_kernel(void) */ xen_setup_gdt(0); + /* Work out if we support NX */ + get_cpu_cap(&boot_cpu_data); + x86_configure_nx(); + xen_init_irq_ops(); /* Let's presume PV guests always boot on vCPU with id 0. */ diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index d20763472920..486c0a34d00b 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -116,6 +116,8 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ static phys_addr_t xen_pt_base, xen_pt_size __initdata; +static DEFINE_STATIC_KEY_FALSE(xen_struct_pages_ready); + /* * Just beyond the highest usermode address. STACK_TOP_MAX has a * redzone above it, so round it up to a PGD boundary. 
@@ -155,11 +157,18 @@ void make_lowmem_page_readwrite(void *vaddr) } +/* + * During early boot all page table pages are pinned, but we do not have struct + * pages, so return true until struct pages are ready. + */ static bool xen_page_pinned(void *ptr) { - struct page *page = virt_to_page(ptr); + if (static_branch_likely(&xen_struct_pages_ready)) { + struct page *page = virt_to_page(ptr); - return PagePinned(page); + return PagePinned(page); + } + return true; } static void xen_extend_mmu_update(const struct mmu_update *update) @@ -836,11 +845,6 @@ void xen_mm_pin_all(void) spin_unlock(&pgd_lock); } -/* - * The init_mm pagetable is really pinned as soon as its created, but - * that's before we have page structures to store the bits. So do all - * the book-keeping now. - */ static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page, enum pt_level level) { @@ -848,8 +852,18 @@ static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page, return 0; } -static void __init xen_mark_init_mm_pinned(void) +/* + * The init_mm pagetable is really pinned as soon as its created, but + * that's before we have page structures to store the bits. So do all + * the book-keeping now once struct pages for allocated pages are + * initialized. This happens only after free_all_bootmem() is called. 
+ */ +static void __init xen_after_bootmem(void) { + static_branch_enable(&xen_struct_pages_ready); +#ifdef CONFIG_X86_64 + SetPagePinned(virt_to_page(level3_user_vsyscall)); +#endif xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP); } @@ -1623,14 +1637,15 @@ static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot) static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level) { - bool pinned = PagePinned(virt_to_page(mm->pgd)); + bool pinned = xen_page_pinned(mm->pgd); trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned); if (pinned) { struct page *page = pfn_to_page(pfn); - SetPagePinned(page); + if (static_branch_likely(&xen_struct_pages_ready)) + SetPagePinned(page); if (!PageHighMem(page)) { xen_mc_batch(); @@ -2364,9 +2379,7 @@ static void __init xen_post_allocator_init(void) #ifdef CONFIG_X86_64 pv_mmu_ops.write_cr3 = &xen_write_cr3; - SetPagePinned(virt_to_page(level3_user_vsyscall)); #endif - xen_mark_init_mm_pinned(); } static void xen_leave_lazy_mmu(void) @@ -2450,6 +2463,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { void __init xen_init_mmu_ops(void) { x86_init.paging.pagetable_init = xen_pagetable_init; + x86_init.hyper.init_after_bootmem = xen_after_bootmem; pv_mmu_ops = xen_mmu_ops; diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index c0c756c76afe..2e20ae2fa2d6 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -425,6 +425,7 @@ static void xen_pv_play_dead(void) /* used only with HOTPLUG_CPU */ * data back is to call: */ tick_nohz_idle_enter(); + tick_nohz_idle_stop_tick_protected(); cpuhp_online_idle(CPUHP_AP_ONLINE_IDLE); } diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 96f26e026783..5077ead5e59c 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -89,7 +89,9 @@ END(hypercall_page) ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .ascii "!writable_page_tables|pae_pgdir_above_4gb") ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, - .long (1 
<< XENFEAT_writable_page_tables) | (1 << XENFEAT_dom0)) + .long (1 << XENFEAT_writable_page_tables) | \ + (1 << XENFEAT_dom0) | \ + (1 << XENFEAT_linux_rsdp_unrestricted)) ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index 3e9d01ada81f..58f29a9d895d 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h @@ -57,6 +57,7 @@ #define MAP_NONBLOCK 0x20000 /* do not block on IO */ #define MAP_STACK 0x40000 /* give out an address that is best suited for process/thread stacks */ #define MAP_HUGETLB 0x80000 /* create a huge page mapping */ +#define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */ #ifdef CONFIG_MMAP_ALLOW_UNINITIALIZED # define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be * uninitialized */ diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index 9e702bc4960f..7a3a541046ed 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -34,6 +34,7 @@ struct iort_its_msi_chip { struct list_head list; struct fwnode_handle *fw_node; + phys_addr_t base_addr; u32 translation_id; }; @@ -156,14 +157,16 @@ static LIST_HEAD(iort_msi_chip_list); static DEFINE_SPINLOCK(iort_msi_chip_lock); /** - * iort_register_domain_token() - register domain token and related ITS ID - * to the list from where we can get it back later on. + * iort_register_domain_token() - register domain token along with related + * ITS ID and base address to the list from where we can get it back later on. * @trans_id: ITS ID. + * @base: ITS base address. * @fw_node: Domain token. 
* * Returns: 0 on success, -ENOMEM if no memory when allocating list element */ -int iort_register_domain_token(int trans_id, struct fwnode_handle *fw_node) +int iort_register_domain_token(int trans_id, phys_addr_t base, + struct fwnode_handle *fw_node) { struct iort_its_msi_chip *its_msi_chip; @@ -173,6 +176,7 @@ int iort_register_domain_token(int trans_id, struct fwnode_handle *fw_node) its_msi_chip->fw_node = fw_node; its_msi_chip->translation_id = trans_id; + its_msi_chip->base_addr = base; spin_lock(&iort_msi_chip_lock); list_add(&its_msi_chip->list, &iort_msi_chip_list); @@ -569,6 +573,24 @@ int iort_pmsi_get_dev_id(struct device *dev, u32 *dev_id) return -ENODEV; } +static int __maybe_unused iort_find_its_base(u32 its_id, phys_addr_t *base) +{ + struct iort_its_msi_chip *its_msi_chip; + int ret = -ENODEV; + + spin_lock(&iort_msi_chip_lock); + list_for_each_entry(its_msi_chip, &iort_msi_chip_list, list) { + if (its_msi_chip->translation_id == its_id) { + *base = its_msi_chip->base_addr; + ret = 0; + break; + } + } + spin_unlock(&iort_msi_chip_lock); + + return ret; +} + /** * iort_dev_find_its_id() - Find the ITS identifier for a device * @dev: The device. 
@@ -754,6 +776,24 @@ static inline bool iort_iommu_driver_enabled(u8 type) } #ifdef CONFIG_IOMMU_API +static struct acpi_iort_node *iort_get_msi_resv_iommu(struct device *dev) +{ + struct acpi_iort_node *iommu; + struct iommu_fwspec *fwspec = dev->iommu_fwspec; + + iommu = iort_get_iort_node(fwspec->iommu_fwnode); + + if (iommu && (iommu->type == ACPI_IORT_NODE_SMMU_V3)) { + struct acpi_iort_smmu_v3 *smmu; + + smmu = (struct acpi_iort_smmu_v3 *)iommu->node_data; + if (smmu->model == ACPI_IORT_SMMU_V3_HISILICON_HI161X) + return iommu; + } + + return NULL; +} + static inline const struct iommu_ops *iort_fwspec_iommu_ops( struct iommu_fwspec *fwspec) { @@ -770,6 +810,69 @@ static inline int iort_add_device_replay(const struct iommu_ops *ops, return err; } + +/** + * iort_iommu_msi_get_resv_regions - Reserved region driver helper + * @dev: Device from iommu_get_resv_regions() + * @head: Reserved region list from iommu_get_resv_regions() + * + * Returns: Number of msi reserved regions on success (0 if platform + * doesn't require the reservation or no associated msi regions), + * appropriate error value otherwise. The ITS interrupt translation + * spaces (ITS_base + SZ_64K, SZ_64K) associated with the device + * are the msi reserved regions. + */ +int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head) +{ + struct acpi_iort_its_group *its; + struct acpi_iort_node *iommu_node, *its_node = NULL; + int i, resv = 0; + + iommu_node = iort_get_msi_resv_iommu(dev); + if (!iommu_node) + return 0; + + /* + * Current logic to reserve ITS regions relies on HW topologies + * where a given PCI or named component maps its IDs to only one + * ITS group; if a PCI or named component can map its IDs to + * different ITS groups through IORT mappings this function has + * to be reworked to ensure we reserve regions for all ITS groups + * a given PCI or named component may map IDs to. 
+ */ + + for (i = 0; i < dev->iommu_fwspec->num_ids; i++) { + its_node = iort_node_map_id(iommu_node, + dev->iommu_fwspec->ids[i], + NULL, IORT_MSI_TYPE); + if (its_node) + break; + } + + if (!its_node) + return 0; + + /* Move to ITS specific data */ + its = (struct acpi_iort_its_group *)its_node->node_data; + + for (i = 0; i < its->its_count; i++) { + phys_addr_t base; + + if (!iort_find_its_base(its->identifiers[i], &base)) { + int prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO; + struct iommu_resv_region *region; + + region = iommu_alloc_resv_region(base + SZ_64K, SZ_64K, + prot, IOMMU_RESV_MSI); + if (region) { + list_add_tail(&region->list, head); + resv++; + } + } + } + + return (resv == its->its_count) ? resv : -ENODEV; +} #else static inline const struct iommu_ops *iort_fwspec_iommu_ops( struct iommu_fwspec *fwspec) @@ -777,6 +880,8 @@ static inline const struct iommu_ops *iort_fwspec_iommu_ops( static inline int iort_add_device_replay(const struct iommu_ops *ops, struct device *dev) { return 0; } +int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head) +{ return 0; } #endif static int iort_iommu_xlate(struct device *dev, struct acpi_iort_node *node, diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c index c7cf48ad5cb9..a651ab3490d8 100644 --- a/drivers/acpi/processor_perflib.c +++ b/drivers/acpi/processor_perflib.c @@ -533,7 +533,7 @@ int acpi_processor_notify_smm(struct module *calling_module) EXPORT_SYMBOL(acpi_processor_notify_smm); -static int acpi_processor_get_psd(struct acpi_processor *pr) +int acpi_processor_get_psd(acpi_handle handle, struct acpi_psd_package *pdomain) { int result = 0; acpi_status status = AE_OK; @@ -541,9 +541,8 @@ static int acpi_processor_get_psd(struct acpi_processor *pr) struct acpi_buffer format = {sizeof("NNNNN"), "NNNNN"}; struct acpi_buffer state = {0, NULL}; union acpi_object *psd = NULL; - struct acpi_psd_package *pdomain; - status = acpi_evaluate_object(pr->handle, 
"_PSD", NULL, &buffer); + status = acpi_evaluate_object(handle, "_PSD", NULL, &buffer); if (ACPI_FAILURE(status)) { return -ENODEV; } @@ -561,8 +560,6 @@ static int acpi_processor_get_psd(struct acpi_processor *pr) goto end; } - pdomain = &(pr->performance->domain_info); - state.length = sizeof(struct acpi_psd_package); state.pointer = pdomain; @@ -597,6 +594,7 @@ end: kfree(buffer.pointer); return result; } +EXPORT_SYMBOL(acpi_processor_get_psd); int acpi_processor_preregister_performance( struct acpi_processor_performance __percpu *performance) @@ -645,7 +643,8 @@ int acpi_processor_preregister_performance( pr->performance = per_cpu_ptr(performance, i); cpumask_set_cpu(i, pr->performance->shared_cpu_map); - if (acpi_processor_get_psd(pr)) { + pdomain = &(pr->performance->domain_info); + if (acpi_processor_get_psd(pr->handle, pdomain)) { retval = -EINVAL; continue; } diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 79fcd2bae96b..bffe8616bd55 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -837,11 +837,8 @@ int __init memory_dev_init(void) * during boot and have been initialized */ mutex_lock(&mem_sysfs_mutex); - for (i = 0; i < NR_MEM_SECTIONS; i += sections_per_block) { - /* Don't iterate over sections we know are !present: */ - if (i > __highest_present_section_nr) - break; - + for (i = 0; i <= __highest_present_section_nr; + i += sections_per_block) { err = add_memory_block(i); if (!ret) ret = err; diff --git a/drivers/cpufreq/armada-37xx-cpufreq.c b/drivers/cpufreq/armada-37xx-cpufreq.c index c6ebc88a7d8d..72a2975499db 100644 --- a/drivers/cpufreq/armada-37xx-cpufreq.c +++ b/drivers/cpufreq/armada-37xx-cpufreq.c @@ -202,6 +202,7 @@ static int __init armada37xx_cpufreq_driver_init(void) cur_frequency = clk_get_rate(clk); if (!cur_frequency) { dev_err(cpu_dev, "Failed to get clock rate for CPU\n"); + clk_put(clk); return -EINVAL; } @@ -210,6 +211,7 @@ static int __init armada37xx_cpufreq_driver_init(void) return -EINVAL; 
armada37xx_cpufreq_dvfs_setup(nb_pm_base, clk, dvfs->divider); + clk_put(clk); for (load_lvl = ARMADA_37XX_DVFS_LOAD_0; load_lvl < LOAD_LEVEL_NR; load_lvl++) { diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 8300a9fcb80c..bc5fc1630876 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -162,14 +162,23 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) cpu->perf_caps.highest_perf; policy->cpuinfo.max_freq = cppc_dmi_max_khz; - policy->cpuinfo.transition_latency = cppc_get_transition_latency(cpu_num); policy->transition_delay_us = cppc_get_transition_latency(cpu_num) / NSEC_PER_USEC; policy->shared_type = cpu->shared_type; - if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) + if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) { + int i; + cpumask_copy(policy->cpus, cpu->shared_cpu_map); - else if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL) { + + for_each_cpu(i, policy->cpus) { + if (unlikely(i == policy->cpu)) + continue; + + memcpy(&all_cpu_data[i]->perf_caps, &cpu->perf_caps, + sizeof(cpu->perf_caps)); + } + } else if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL) { /* Support only SW_ANY for now. 
*/ pr_debug("Unsupported CPU co-ord type\n"); return -EFAULT; diff --git a/drivers/cpufreq/freq_table.c b/drivers/cpufreq/freq_table.c index 10e119ae66dd..3a8cc99e6815 100644 --- a/drivers/cpufreq/freq_table.c +++ b/drivers/cpufreq/freq_table.c @@ -352,20 +352,6 @@ static int set_freq_table_sorted(struct cpufreq_policy *policy) return 0; } -int cpufreq_table_validate_and_show(struct cpufreq_policy *policy, - struct cpufreq_frequency_table *table) -{ - int ret; - - ret = cpufreq_frequency_table_cpuinfo(policy, table); - if (ret) - return ret; - - policy->freq_table = table; - return 0; -} -EXPORT_SYMBOL_GPL(cpufreq_table_validate_and_show); - int cpufreq_table_validate_and_sort(struct cpufreq_policy *policy) { int ret; diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 6d084c61ee25..17e566afbb41 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -26,7 +26,6 @@ #include <linux/sysfs.h> #include <linux/types.h> #include <linux/fs.h> -#include <linux/debugfs.h> #include <linux/acpi.h> #include <linux/vmalloc.h> #include <trace/events/power.h> diff --git a/drivers/cpufreq/scmi-cpufreq.c b/drivers/cpufreq/scmi-cpufreq.c index 959a1dbe3835..b4dbc77459b6 100644 --- a/drivers/cpufreq/scmi-cpufreq.c +++ b/drivers/cpufreq/scmi-cpufreq.c @@ -159,13 +159,7 @@ static int scmi_cpufreq_init(struct cpufreq_policy *policy) priv->domain_id = handle->perf_ops->device_domain_id(cpu_dev); policy->driver_data = priv; - - ret = cpufreq_table_validate_and_show(policy, freq_table); - if (ret) { - dev_err(cpu_dev, "%s: invalid frequency table: %d\n", __func__, - ret); - goto out_free_cpufreq_table; - } + policy->freq_table = freq_table; /* SCMI allows DVFS request for any domain from any CPU */ policy->dvfs_possible_from_any_cpu = true; @@ -179,8 +173,6 @@ static int scmi_cpufreq_init(struct cpufreq_policy *policy) policy->fast_switch_possible = true; return 0; -out_free_cpufreq_table: - dev_pm_opp_free_cpufreq_table(cpu_dev, 
&freq_table); out_free_priv: kfree(priv); out_free_opp: diff --git a/drivers/cpufreq/ti-cpufreq.c b/drivers/cpufreq/ti-cpufreq.c index a099b7bf74cd..6ba709b6f095 100644 --- a/drivers/cpufreq/ti-cpufreq.c +++ b/drivers/cpufreq/ti-cpufreq.c @@ -304,7 +304,7 @@ static struct platform_driver ti_cpufreq_driver = { .name = "ti-cpufreq", }, }; -module_platform_driver(ti_cpufreq_driver); +builtin_platform_driver(ti_cpufreq_driver); MODULE_DESCRIPTION("TI CPUFreq/OPP hw-supported driver"); MODULE_AUTHOR("Dave Gerlach <[email protected]>"); diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 0003e9a02637..6df894d65d9e 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -272,12 +272,18 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, * * @drv: the cpuidle driver * @dev: the cpuidle device + * @stop_tick: indication on whether or not to stop the tick * * Returns the index of the idle state. The return value must not be negative. + * + * The memory location pointed to by @stop_tick is expected to be written the + * 'false' boolean value if the scheduler tick should not be stopped before + * entering the returned state. 
*/ -int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) +int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, + bool *stop_tick) { - return cpuidle_curr_governor->select(drv, dev); + return cpuidle_curr_governor->select(drv, dev, stop_tick); } /** diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c index 1ad8745fd6d6..b24883f85c99 100644 --- a/drivers/cpuidle/governors/ladder.c +++ b/drivers/cpuidle/governors/ladder.c @@ -63,9 +63,10 @@ static inline void ladder_do_selection(struct ladder_device *ldev, * ladder_select_state - selects the next state to enter * @drv: cpuidle driver * @dev: the CPU + * @dummy: not used */ static int ladder_select_state(struct cpuidle_driver *drv, - struct cpuidle_device *dev) + struct cpuidle_device *dev, bool *dummy) { struct ladder_device *ldev = this_cpu_ptr(&ladder_devices); struct device *device = get_cpu_device(dev->cpu); diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index aa390404e85f..1bfe03ceb236 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -123,6 +123,7 @@ struct menu_device { int last_state_idx; int needs_update; + int tick_wakeup; unsigned int next_timer_us; unsigned int predicted_us; @@ -279,8 +280,10 @@ again: * menu_select - selects the next idle state to enter * @drv: cpuidle driver containing state data * @dev: the CPU + * @stop_tick: indication on whether or not to stop the tick */ -static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) +static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, + bool *stop_tick) { struct menu_device *data = this_cpu_ptr(&menu_devices); struct device *device = get_cpu_device(dev->cpu); @@ -292,6 +295,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) unsigned int expected_interval; unsigned long nr_iowaiters, cpu_load; int resume_latency = 
dev_pm_qos_raw_read_value(device); + ktime_t delta_next; if (data->needs_update) { menu_update(drv, dev); @@ -303,11 +307,13 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) latency_req = resume_latency; /* Special case when user has set very strict latency requirement */ - if (unlikely(latency_req == 0)) + if (unlikely(latency_req == 0)) { + *stop_tick = false; return 0; + } /* determine the expected residency time, round up */ - data->next_timer_us = ktime_to_us(tick_nohz_get_sleep_length()); + data->next_timer_us = ktime_to_us(tick_nohz_get_sleep_length(&delta_next)); get_iowait_load(&nr_iowaiters, &cpu_load); data->bucket = which_bucket(data->next_timer_us, nr_iowaiters); @@ -346,14 +352,30 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) */ data->predicted_us = min(data->predicted_us, expected_interval); - /* - * Use the performance multiplier and the user-configurable - * latency_req to determine the maximum exit latency. - */ - interactivity_req = data->predicted_us / performance_multiplier(nr_iowaiters, cpu_load); - if (latency_req > interactivity_req) - latency_req = interactivity_req; + if (tick_nohz_tick_stopped()) { + /* + * If the tick is already stopped, the cost of possible short + * idle duration misprediction is much higher, because the CPU + * may be stuck in a shallow idle state for a long time as a + * result of it. In that case say we might mispredict and try + * to force the CPU into a state for which we would have stopped + * the tick, unless a timer is going to expire really soon + * anyway. + */ + if (data->predicted_us < TICK_USEC) + data->predicted_us = min_t(unsigned int, TICK_USEC, + ktime_to_us(delta_next)); + } else { + /* + * Use the performance multiplier and the user-configurable + * latency_req to determine the maximum exit latency. 
+ */ + interactivity_req = data->predicted_us / performance_multiplier(nr_iowaiters, cpu_load); + if (latency_req > interactivity_req) + latency_req = interactivity_req; + } + expected_interval = data->predicted_us; /* * Find the idle state with the lowest power while satisfying * our constraints. @@ -369,15 +391,52 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) idx = i; /* first enabled state */ if (s->target_residency > data->predicted_us) break; - if (s->exit_latency > latency_req) + if (s->exit_latency > latency_req) { + /* + * If we break out of the loop for latency reasons, use + * the target residency of the selected state as the + * expected idle duration so that the tick is retained + * as long as that target residency is low enough. + */ + expected_interval = drv->states[idx].target_residency; break; - + } idx = i; } if (idx == -1) idx = 0; /* No states enabled. Must use 0. */ + /* + * Don't stop the tick if the selected state is a polling one or if the + * expected idle duration is shorter than the tick period length. + */ + if ((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) || + expected_interval < TICK_USEC) { + unsigned int delta_next_us = ktime_to_us(delta_next); + + *stop_tick = false; + + if (!tick_nohz_tick_stopped() && idx > 0 && + drv->states[idx].target_residency > delta_next_us) { + /* + * The tick is not going to be stopped and the target + * residency of the state to be returned is not within + * the time until the next timer event including the + * tick, so try to correct that. 
+ */ + for (i = idx - 1; i >= 0; i--) { + if (drv->states[i].disabled || + dev->states_usage[i].disable) + continue; + + idx = i; + if (drv->states[i].target_residency <= delta_next_us) + break; + } + } + } + data->last_state_idx = idx; return data->last_state_idx; @@ -397,6 +456,7 @@ static void menu_reflect(struct cpuidle_device *dev, int index) data->last_state_idx = index; data->needs_update = 1; + data->tick_wakeup = tick_nohz_idle_got_tick(); } /** @@ -427,14 +487,27 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) * assume the state was never reached and the exit latency is 0. */ - /* measured value */ - measured_us = cpuidle_get_last_residency(dev); - - /* Deduct exit latency */ - if (measured_us > 2 * target->exit_latency) - measured_us -= target->exit_latency; - else - measured_us /= 2; + if (data->tick_wakeup && data->next_timer_us > TICK_USEC) { + /* + * The nohz code said that there wouldn't be any events within + * the tick boundary (if the tick was stopped), but the idle + * duration predictor had a differing opinion. Since the CPU + * was woken up by a tick (that wasn't stopped after all), the + * predictor was not quite right, so assume that the CPU could + * have been idle long (but not forever) to help the idle + * duration predictor do a better job next time. 
+ */ + measured_us = 9 * MAX_INTERESTING / 10; + } else { + /* measured value */ + measured_us = cpuidle_get_last_residency(dev); + + /* Deduct exit latency */ + if (measured_us > 2 * target->exit_latency) + measured_us -= target->exit_latency; + else + measured_us /= 2; + } /* Make sure our coefficients do not exceed unity */ if (measured_us > data->next_timer_us) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 83819d0cbf90..2a99f0f14795 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -81,11 +81,12 @@ */ #define AMD_IOMMU_PGSIZES ((~0xFFFUL) & ~(2ULL << 38)) -static DEFINE_RWLOCK(amd_iommu_devtable_lock); +static DEFINE_SPINLOCK(amd_iommu_devtable_lock); +static DEFINE_SPINLOCK(pd_bitmap_lock); +static DEFINE_SPINLOCK(iommu_table_lock); /* List of all available dev_data structures */ -static LIST_HEAD(dev_data_list); -static DEFINE_SPINLOCK(dev_data_list_lock); +static LLIST_HEAD(dev_data_list); LIST_HEAD(ioapic_map); LIST_HEAD(hpet_map); @@ -204,40 +205,33 @@ static struct dma_ops_domain* to_dma_ops_domain(struct protection_domain *domain static struct iommu_dev_data *alloc_dev_data(u16 devid) { struct iommu_dev_data *dev_data; - unsigned long flags; dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL); if (!dev_data) return NULL; dev_data->devid = devid; - - spin_lock_irqsave(&dev_data_list_lock, flags); - list_add_tail(&dev_data->dev_data_list, &dev_data_list); - spin_unlock_irqrestore(&dev_data_list_lock, flags); - ratelimit_default_init(&dev_data->rs); + llist_add(&dev_data->dev_data_list, &dev_data_list); return dev_data; } static struct iommu_dev_data *search_dev_data(u16 devid) { struct iommu_dev_data *dev_data; - unsigned long flags; + struct llist_node *node; - spin_lock_irqsave(&dev_data_list_lock, flags); - list_for_each_entry(dev_data, &dev_data_list, dev_data_list) { + if (llist_empty(&dev_data_list)) + return NULL; + + node = dev_data_list.first; + llist_for_each_entry(dev_data, node, dev_data_list) { 
if (dev_data->devid == devid) - goto out_unlock; + return dev_data; } - dev_data = NULL; - -out_unlock: - spin_unlock_irqrestore(&dev_data_list_lock, flags); - - return dev_data; + return NULL; } static int __last_alias(struct pci_dev *pdev, u16 alias, void *data) @@ -311,6 +305,8 @@ static struct iommu_dev_data *find_dev_data(u16 devid) if (dev_data == NULL) { dev_data = alloc_dev_data(devid); + if (!dev_data) + return NULL; if (translation_pre_enabled(iommu)) dev_data->defer_attach = true; @@ -548,6 +544,7 @@ static void amd_iommu_report_page_fault(u16 devid, u16 domain_id, static void iommu_print_event(struct amd_iommu *iommu, void *__evt) { + struct device *dev = iommu->iommu.dev; int type, devid, domid, flags; volatile u32 *event = __evt; int count = 0; @@ -574,53 +571,53 @@ retry: amd_iommu_report_page_fault(devid, domid, address, flags); return; } else { - printk(KERN_ERR "AMD-Vi: Event logged ["); + dev_err(dev, "AMD-Vi: Event logged ["); } switch (type) { case EVENT_TYPE_ILL_DEV: - printk("ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x " - "address=0x%016llx flags=0x%04x]\n", - PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), - address, flags); + dev_err(dev, "ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x " + "address=0x%016llx flags=0x%04x]\n", + PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), + address, flags); dump_dte_entry(devid); break; case EVENT_TYPE_DEV_TAB_ERR: - printk("DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x " - "address=0x%016llx flags=0x%04x]\n", - PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), - address, flags); + dev_err(dev, "DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x " + "address=0x%016llx flags=0x%04x]\n", + PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), + address, flags); break; case EVENT_TYPE_PAGE_TAB_ERR: - printk("PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x " - "domain=0x%04x address=0x%016llx flags=0x%04x]\n", - PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), - domid, address, flags); + dev_err(dev, 
"PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x " + "domain=0x%04x address=0x%016llx flags=0x%04x]\n", + PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), + domid, address, flags); break; case EVENT_TYPE_ILL_CMD: - printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address); + dev_err(dev, "ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address); dump_command(address); break; case EVENT_TYPE_CMD_HARD_ERR: - printk("COMMAND_HARDWARE_ERROR address=0x%016llx " - "flags=0x%04x]\n", address, flags); + dev_err(dev, "COMMAND_HARDWARE_ERROR address=0x%016llx " + "flags=0x%04x]\n", address, flags); break; case EVENT_TYPE_IOTLB_INV_TO: - printk("IOTLB_INV_TIMEOUT device=%02x:%02x.%x " - "address=0x%016llx]\n", - PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), - address); + dev_err(dev, "IOTLB_INV_TIMEOUT device=%02x:%02x.%x " + "address=0x%016llx]\n", + PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), + address); break; case EVENT_TYPE_INV_DEV_REQ: - printk("INVALID_DEVICE_REQUEST device=%02x:%02x.%x " - "address=0x%016llx flags=0x%04x]\n", - PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), - address, flags); + dev_err(dev, "INVALID_DEVICE_REQUEST device=%02x:%02x.%x " + "address=0x%016llx flags=0x%04x]\n", + PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), + address, flags); break; default: - printk(KERN_ERR "UNKNOWN type=0x%02x event[0]=0x%08x " - "event[1]=0x%08x event[2]=0x%08x event[3]=0x%08x\n", - type, event[0], event[1], event[2], event[3]); + dev_err(dev, KERN_ERR "UNKNOWN event[0]=0x%08x event[1]=0x%08x " + "event[2]=0x%08x event[3]=0x%08x\n", + event[0], event[1], event[2], event[3]); } memset(__evt, 0, 4 * sizeof(u32)); @@ -1057,9 +1054,9 @@ static int iommu_queue_command_sync(struct amd_iommu *iommu, unsigned long flags; int ret; - spin_lock_irqsave(&iommu->lock, flags); + raw_spin_lock_irqsave(&iommu->lock, flags); ret = __iommu_queue_command_sync(iommu, cmd, sync); - spin_unlock_irqrestore(&iommu->lock, flags); + 
raw_spin_unlock_irqrestore(&iommu->lock, flags); return ret; } @@ -1085,7 +1082,7 @@ static int iommu_completion_wait(struct amd_iommu *iommu) build_completion_wait(&cmd, (u64)&iommu->cmd_sem); - spin_lock_irqsave(&iommu->lock, flags); + raw_spin_lock_irqsave(&iommu->lock, flags); iommu->cmd_sem = 0; @@ -1096,7 +1093,7 @@ static int iommu_completion_wait(struct amd_iommu *iommu) ret = wait_on_sem(&iommu->cmd_sem); out_unlock: - spin_unlock_irqrestore(&iommu->lock, flags); + raw_spin_unlock_irqrestore(&iommu->lock, flags); return ret; } @@ -1606,29 +1603,26 @@ static void del_domain_from_list(struct protection_domain *domain) static u16 domain_id_alloc(void) { - unsigned long flags; int id; - write_lock_irqsave(&amd_iommu_devtable_lock, flags); + spin_lock(&pd_bitmap_lock); id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID); BUG_ON(id == 0); if (id > 0 && id < MAX_DOMAIN_ID) __set_bit(id, amd_iommu_pd_alloc_bitmap); else id = 0; - write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + spin_unlock(&pd_bitmap_lock); return id; } static void domain_id_free(int id) { - unsigned long flags; - - write_lock_irqsave(&amd_iommu_devtable_lock, flags); + spin_lock(&pd_bitmap_lock); if (id > 0 && id < MAX_DOMAIN_ID) __clear_bit(id, amd_iommu_pd_alloc_bitmap); - write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + spin_unlock(&pd_bitmap_lock); } #define DEFINE_FREE_PT_FN(LVL, FN) \ @@ -2104,9 +2098,9 @@ static int attach_device(struct device *dev, } skip_ats_check: - write_lock_irqsave(&amd_iommu_devtable_lock, flags); + spin_lock_irqsave(&amd_iommu_devtable_lock, flags); ret = __attach_device(dev_data, domain); - write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags); /* * We might boot into a crash-kernel here. 
The crashed kernel @@ -2156,9 +2150,9 @@ static void detach_device(struct device *dev) domain = dev_data->domain; /* lock device table */ - write_lock_irqsave(&amd_iommu_devtable_lock, flags); + spin_lock_irqsave(&amd_iommu_devtable_lock, flags); __detach_device(dev_data); - write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags); if (!dev_is_pci(dev)) return; @@ -2795,7 +2789,7 @@ static void cleanup_domain(struct protection_domain *domain) struct iommu_dev_data *entry; unsigned long flags; - write_lock_irqsave(&amd_iommu_devtable_lock, flags); + spin_lock_irqsave(&amd_iommu_devtable_lock, flags); while (!list_empty(&domain->dev_list)) { entry = list_first_entry(&domain->dev_list, @@ -2803,7 +2797,7 @@ static void cleanup_domain(struct protection_domain *domain) __detach_device(entry); } - write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags); } static void protection_domain_free(struct protection_domain *domain) @@ -3025,15 +3019,12 @@ static size_t amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova, size_t unmap_size; if (domain->mode == PAGE_MODE_NONE) - return -EINVAL; + return 0; mutex_lock(&domain->api_lock); unmap_size = iommu_unmap_page(domain, iova, page_size); mutex_unlock(&domain->api_lock); - domain_flush_tlb_pde(domain); - domain_flush_complete(domain); - return unmap_size; } @@ -3151,6 +3142,19 @@ static bool amd_iommu_is_attach_deferred(struct iommu_domain *domain, return dev_data->defer_attach; } +static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain) +{ + struct protection_domain *dom = to_pdomain(domain); + + domain_flush_tlb_pde(dom); + domain_flush_complete(dom); +} + +static void amd_iommu_iotlb_range_add(struct iommu_domain *domain, + unsigned long iova, size_t size) +{ +} + const struct iommu_ops amd_iommu_ops = { .capable = amd_iommu_capable, .domain_alloc = amd_iommu_domain_alloc, @@ -3169,6 
+3173,9 @@ const struct iommu_ops amd_iommu_ops = { .apply_resv_region = amd_iommu_apply_resv_region, .is_attach_deferred = amd_iommu_is_attach_deferred, .pgsize_bitmap = AMD_IOMMU_PGSIZES, + .flush_iotlb_all = amd_iommu_flush_iotlb_all, + .iotlb_range_add = amd_iommu_iotlb_range_add, + .iotlb_sync = amd_iommu_flush_iotlb_all, }; /***************************************************************************** @@ -3570,14 +3577,62 @@ static void set_dte_irq_entry(u16 devid, struct irq_remap_table *table) amd_iommu_dev_table[devid].data[2] = dte; } -static struct irq_remap_table *get_irq_table(u16 devid, bool ioapic) +static struct irq_remap_table *get_irq_table(u16 devid) +{ + struct irq_remap_table *table; + + if (WARN_ONCE(!amd_iommu_rlookup_table[devid], + "%s: no iommu for devid %x\n", __func__, devid)) + return NULL; + + table = irq_lookup_table[devid]; + if (WARN_ONCE(!table, "%s: no table for devid %x\n", __func__, devid)) + return NULL; + + return table; +} + +static struct irq_remap_table *__alloc_irq_table(void) +{ + struct irq_remap_table *table; + + table = kzalloc(sizeof(*table), GFP_KERNEL); + if (!table) + return NULL; + + table->table = kmem_cache_alloc(amd_iommu_irq_cache, GFP_KERNEL); + if (!table->table) { + kfree(table); + return NULL; + } + raw_spin_lock_init(&table->lock); + + if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir)) + memset(table->table, 0, + MAX_IRQS_PER_TABLE * sizeof(u32)); + else + memset(table->table, 0, + (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2))); + return table; +} + +static void set_remap_table_entry(struct amd_iommu *iommu, u16 devid, + struct irq_remap_table *table) +{ + irq_lookup_table[devid] = table; + set_dte_irq_entry(devid, table); + iommu_flush_dte(iommu, devid); +} + +static struct irq_remap_table *alloc_irq_table(u16 devid) { struct irq_remap_table *table = NULL; + struct irq_remap_table *new_table = NULL; struct amd_iommu *iommu; unsigned long flags; u16 alias; - write_lock_irqsave(&amd_iommu_devtable_lock, flags); + 
spin_lock_irqsave(&iommu_table_lock, flags); iommu = amd_iommu_rlookup_table[devid]; if (!iommu) @@ -3590,60 +3645,45 @@ static struct irq_remap_table *get_irq_table(u16 devid, bool ioapic) alias = amd_iommu_alias_table[devid]; table = irq_lookup_table[alias]; if (table) { - irq_lookup_table[devid] = table; - set_dte_irq_entry(devid, table); - iommu_flush_dte(iommu, devid); - goto out; + set_remap_table_entry(iommu, devid, table); + goto out_wait; } + spin_unlock_irqrestore(&iommu_table_lock, flags); /* Nothing there yet, allocate new irq remapping table */ - table = kzalloc(sizeof(*table), GFP_ATOMIC); - if (!table) - goto out_unlock; - - /* Initialize table spin-lock */ - spin_lock_init(&table->lock); + new_table = __alloc_irq_table(); + if (!new_table) + return NULL; - if (ioapic) - /* Keep the first 32 indexes free for IOAPIC interrupts */ - table->min_index = 32; + spin_lock_irqsave(&iommu_table_lock, flags); - table->table = kmem_cache_alloc(amd_iommu_irq_cache, GFP_ATOMIC); - if (!table->table) { - kfree(table); - table = NULL; + table = irq_lookup_table[devid]; + if (table) goto out_unlock; - } - if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir)) - memset(table->table, 0, - MAX_IRQS_PER_TABLE * sizeof(u32)); - else - memset(table->table, 0, - (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2))); - - if (ioapic) { - int i; - - for (i = 0; i < 32; ++i) - iommu->irte_ops->set_allocated(table, i); + table = irq_lookup_table[alias]; + if (table) { + set_remap_table_entry(iommu, devid, table); + goto out_wait; } - irq_lookup_table[devid] = table; - set_dte_irq_entry(devid, table); - iommu_flush_dte(iommu, devid); - if (devid != alias) { - irq_lookup_table[alias] = table; - set_dte_irq_entry(alias, table); - iommu_flush_dte(iommu, alias); - } + table = new_table; + new_table = NULL; -out: + set_remap_table_entry(iommu, devid, table); + if (devid != alias) + set_remap_table_entry(iommu, alias, table); + +out_wait: iommu_completion_wait(iommu); out_unlock: - 
write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + spin_unlock_irqrestore(&iommu_table_lock, flags); + if (new_table) { + kmem_cache_free(amd_iommu_irq_cache, new_table->table); + kfree(new_table); + } return table; } @@ -3657,14 +3697,14 @@ static int alloc_irq_index(u16 devid, int count, bool align) if (!iommu) return -ENODEV; - table = get_irq_table(devid, false); + table = alloc_irq_table(devid); if (!table) return -ENODEV; if (align) alignment = roundup_pow_of_two(count); - spin_lock_irqsave(&table->lock, flags); + raw_spin_lock_irqsave(&table->lock, flags); /* Scan table for free entries */ for (index = ALIGN(table->min_index, alignment), c = 0; @@ -3691,7 +3731,7 @@ static int alloc_irq_index(u16 devid, int count, bool align) index = -ENOSPC; out: - spin_unlock_irqrestore(&table->lock, flags); + raw_spin_unlock_irqrestore(&table->lock, flags); return index; } @@ -3708,11 +3748,11 @@ static int modify_irte_ga(u16 devid, int index, struct irte_ga *irte, if (iommu == NULL) return -EINVAL; - table = get_irq_table(devid, false); + table = get_irq_table(devid); if (!table) return -ENOMEM; - spin_lock_irqsave(&table->lock, flags); + raw_spin_lock_irqsave(&table->lock, flags); entry = (struct irte_ga *)table->table; entry = &entry[index]; @@ -3723,7 +3763,7 @@ static int modify_irte_ga(u16 devid, int index, struct irte_ga *irte, if (data) data->ref = entry; - spin_unlock_irqrestore(&table->lock, flags); + raw_spin_unlock_irqrestore(&table->lock, flags); iommu_flush_irt(iommu, devid); iommu_completion_wait(iommu); @@ -3741,13 +3781,13 @@ static int modify_irte(u16 devid, int index, union irte *irte) if (iommu == NULL) return -EINVAL; - table = get_irq_table(devid, false); + table = get_irq_table(devid); if (!table) return -ENOMEM; - spin_lock_irqsave(&table->lock, flags); + raw_spin_lock_irqsave(&table->lock, flags); table->table[index] = irte->val; - spin_unlock_irqrestore(&table->lock, flags); + raw_spin_unlock_irqrestore(&table->lock, flags); 
iommu_flush_irt(iommu, devid); iommu_completion_wait(iommu); @@ -3765,13 +3805,13 @@ static void free_irte(u16 devid, int index) if (iommu == NULL) return; - table = get_irq_table(devid, false); + table = get_irq_table(devid); if (!table) return; - spin_lock_irqsave(&table->lock, flags); + raw_spin_lock_irqsave(&table->lock, flags); iommu->irte_ops->clear_allocated(table, index); - spin_unlock_irqrestore(&table->lock, flags); + raw_spin_unlock_irqrestore(&table->lock, flags); iommu_flush_irt(iommu, devid); iommu_completion_wait(iommu); @@ -3852,10 +3892,8 @@ static void irte_ga_set_affinity(void *entry, u16 devid, u16 index, u8 vector, u32 dest_apicid) { struct irte_ga *irte = (struct irte_ga *) entry; - struct iommu_dev_data *dev_data = search_dev_data(devid); - if (!dev_data || !dev_data->use_vapic || - !irte->lo.fields_remap.guest_mode) { + if (!irte->lo.fields_remap.guest_mode) { irte->hi.fields.vector = vector; irte->lo.fields_remap.destination = dest_apicid; modify_irte_ga(devid, index, irte, NULL); @@ -4061,7 +4099,7 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq, struct amd_ir_data *data = NULL; struct irq_cfg *cfg; int i, ret, devid; - int index = -1; + int index; if (!info) return -EINVAL; @@ -4085,10 +4123,26 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq, return ret; if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC) { - if (get_irq_table(devid, true)) + struct irq_remap_table *table; + struct amd_iommu *iommu; + + table = alloc_irq_table(devid); + if (table) { + if (!table->min_index) { + /* + * Keep the first 32 indexes free for IOAPIC + * interrupts. 
+ */ + table->min_index = 32; + iommu = amd_iommu_rlookup_table[devid]; + for (i = 0; i < 32; ++i) + iommu->irte_ops->set_allocated(table, i); + } + WARN_ON(table->min_index != 32); index = info->ioapic_pin; - else - ret = -ENOMEM; + } else { + index = -ENOMEM; + } } else { bool align = (info->type == X86_IRQ_ALLOC_TYPE_MSI); @@ -4354,7 +4408,7 @@ int amd_iommu_update_ga(int cpu, bool is_run, void *data) { unsigned long flags; struct amd_iommu *iommu; - struct irq_remap_table *irt; + struct irq_remap_table *table; struct amd_ir_data *ir_data = (struct amd_ir_data *)data; int devid = ir_data->irq_2_irte.devid; struct irte_ga *entry = (struct irte_ga *) ir_data->entry; @@ -4368,11 +4422,11 @@ int amd_iommu_update_ga(int cpu, bool is_run, void *data) if (!iommu) return -ENODEV; - irt = get_irq_table(devid, false); - if (!irt) + table = get_irq_table(devid); + if (!table) return -ENODEV; - spin_lock_irqsave(&irt->lock, flags); + raw_spin_lock_irqsave(&table->lock, flags); if (ref->lo.fields_vapic.guest_mode) { if (cpu >= 0) @@ -4381,7 +4435,7 @@ int amd_iommu_update_ga(int cpu, bool is_run, void *data) barrier(); } - spin_unlock_irqrestore(&irt->lock, flags); + raw_spin_unlock_irqrestore(&table->lock, flags); iommu_flush_irt(iommu, devid); iommu_completion_wait(iommu); diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index 4e4a615bf13f..904c575d1677 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -1474,7 +1474,7 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) { int ret; - spin_lock_init(&iommu->lock); + raw_spin_lock_init(&iommu->lock); /* Add IOMMU to internal data structures */ list_add_tail(&iommu->list, &amd_iommu_list); diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index 6a877ebd058b..1c9b080276c9 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -408,7 +408,7 @@ extern bool amd_iommu_iotlb_sup; 
#define IRQ_TABLE_ALIGNMENT 128 struct irq_remap_table { - spinlock_t lock; + raw_spinlock_t lock; unsigned min_index; u32 *table; }; @@ -490,7 +490,7 @@ struct amd_iommu { int index; /* locks the accesses to the hardware */ - spinlock_t lock; + raw_spinlock_t lock; /* Pointer to PCI device of this IOMMU */ struct pci_dev *dev; @@ -627,7 +627,7 @@ struct devid_map { */ struct iommu_dev_data { struct list_head list; /* For domain->dev_list */ - struct list_head dev_data_list; /* For global dev_data_list */ + struct llist_node dev_data_list; /* For global dev_data_list */ struct protection_domain *domain; /* Domain the device is bound to */ u16 devid; /* PCI Device ID */ u16 alias; /* Alias Device ID */ diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c index 3f2f1fc68b52..1d647104bccc 100644 --- a/drivers/iommu/arm-smmu-v3.c +++ b/drivers/iommu/arm-smmu-v3.c @@ -22,6 +22,8 @@ #include <linux/acpi.h> #include <linux/acpi_iort.h> +#include <linux/bitfield.h> +#include <linux/bitops.h> #include <linux/delay.h> #include <linux/dma-iommu.h> #include <linux/err.h> @@ -43,18 +45,15 @@ /* MMIO registers */ #define ARM_SMMU_IDR0 0x0 -#define IDR0_ST_LVL_SHIFT 27 -#define IDR0_ST_LVL_MASK 0x3 -#define IDR0_ST_LVL_2LVL (1 << IDR0_ST_LVL_SHIFT) -#define IDR0_STALL_MODEL_SHIFT 24 -#define IDR0_STALL_MODEL_MASK 0x3 -#define IDR0_STALL_MODEL_STALL (0 << IDR0_STALL_MODEL_SHIFT) -#define IDR0_STALL_MODEL_FORCE (2 << IDR0_STALL_MODEL_SHIFT) -#define IDR0_TTENDIAN_SHIFT 21 -#define IDR0_TTENDIAN_MASK 0x3 -#define IDR0_TTENDIAN_LE (2 << IDR0_TTENDIAN_SHIFT) -#define IDR0_TTENDIAN_BE (3 << IDR0_TTENDIAN_SHIFT) -#define IDR0_TTENDIAN_MIXED (0 << IDR0_TTENDIAN_SHIFT) +#define IDR0_ST_LVL GENMASK(28, 27) +#define IDR0_ST_LVL_2LVL 1 +#define IDR0_STALL_MODEL GENMASK(25, 24) +#define IDR0_STALL_MODEL_STALL 0 +#define IDR0_STALL_MODEL_FORCE 2 +#define IDR0_TTENDIAN GENMASK(22, 21) +#define IDR0_TTENDIAN_MIXED 0 +#define IDR0_TTENDIAN_LE 2 +#define IDR0_TTENDIAN_BE 3 
#define IDR0_CD2L (1 << 19) #define IDR0_VMID16 (1 << 18) #define IDR0_PRI (1 << 16) @@ -64,10 +63,9 @@ #define IDR0_ATS (1 << 10) #define IDR0_HYP (1 << 9) #define IDR0_COHACC (1 << 4) -#define IDR0_TTF_SHIFT 2 -#define IDR0_TTF_MASK 0x3 -#define IDR0_TTF_AARCH64 (2 << IDR0_TTF_SHIFT) -#define IDR0_TTF_AARCH32_64 (3 << IDR0_TTF_SHIFT) +#define IDR0_TTF GENMASK(3, 2) +#define IDR0_TTF_AARCH64 2 +#define IDR0_TTF_AARCH32_64 3 #define IDR0_S1P (1 << 1) #define IDR0_S2P (1 << 0) @@ -75,31 +73,27 @@ #define IDR1_TABLES_PRESET (1 << 30) #define IDR1_QUEUES_PRESET (1 << 29) #define IDR1_REL (1 << 28) -#define IDR1_CMDQ_SHIFT 21 -#define IDR1_CMDQ_MASK 0x1f -#define IDR1_EVTQ_SHIFT 16 -#define IDR1_EVTQ_MASK 0x1f -#define IDR1_PRIQ_SHIFT 11 -#define IDR1_PRIQ_MASK 0x1f -#define IDR1_SSID_SHIFT 6 -#define IDR1_SSID_MASK 0x1f -#define IDR1_SID_SHIFT 0 -#define IDR1_SID_MASK 0x3f +#define IDR1_CMDQS GENMASK(25, 21) +#define IDR1_EVTQS GENMASK(20, 16) +#define IDR1_PRIQS GENMASK(15, 11) +#define IDR1_SSIDSIZE GENMASK(10, 6) +#define IDR1_SIDSIZE GENMASK(5, 0) #define ARM_SMMU_IDR5 0x14 -#define IDR5_STALL_MAX_SHIFT 16 -#define IDR5_STALL_MAX_MASK 0xffff +#define IDR5_STALL_MAX GENMASK(31, 16) #define IDR5_GRAN64K (1 << 6) #define IDR5_GRAN16K (1 << 5) #define IDR5_GRAN4K (1 << 4) -#define IDR5_OAS_SHIFT 0 -#define IDR5_OAS_MASK 0x7 -#define IDR5_OAS_32_BIT (0 << IDR5_OAS_SHIFT) -#define IDR5_OAS_36_BIT (1 << IDR5_OAS_SHIFT) -#define IDR5_OAS_40_BIT (2 << IDR5_OAS_SHIFT) -#define IDR5_OAS_42_BIT (3 << IDR5_OAS_SHIFT) -#define IDR5_OAS_44_BIT (4 << IDR5_OAS_SHIFT) -#define IDR5_OAS_48_BIT (5 << IDR5_OAS_SHIFT) +#define IDR5_OAS GENMASK(2, 0) +#define IDR5_OAS_32_BIT 0 +#define IDR5_OAS_36_BIT 1 +#define IDR5_OAS_40_BIT 2 +#define IDR5_OAS_42_BIT 3 +#define IDR5_OAS_44_BIT 4 +#define IDR5_OAS_48_BIT 5 +#define IDR5_OAS_52_BIT 6 +#define IDR5_VAX GENMASK(11, 10) +#define IDR5_VAX_52_BIT 1 #define ARM_SMMU_CR0 0x20 #define CR0_CMDQEN (1 << 3) @@ -110,18 +104,16 @@ #define 
ARM_SMMU_CR0ACK 0x24 #define ARM_SMMU_CR1 0x28 -#define CR1_SH_NSH 0 -#define CR1_SH_OSH 2 -#define CR1_SH_ISH 3 +#define CR1_TABLE_SH GENMASK(11, 10) +#define CR1_TABLE_OC GENMASK(9, 8) +#define CR1_TABLE_IC GENMASK(7, 6) +#define CR1_QUEUE_SH GENMASK(5, 4) +#define CR1_QUEUE_OC GENMASK(3, 2) +#define CR1_QUEUE_IC GENMASK(1, 0) +/* CR1 cacheability fields don't quite follow the usual TCR-style encoding */ #define CR1_CACHE_NC 0 #define CR1_CACHE_WB 1 #define CR1_CACHE_WT 2 -#define CR1_TABLE_SH_SHIFT 10 -#define CR1_TABLE_OC_SHIFT 8 -#define CR1_TABLE_IC_SHIFT 6 -#define CR1_QUEUE_SH_SHIFT 4 -#define CR1_QUEUE_OC_SHIFT 2 -#define CR1_QUEUE_IC_SHIFT 0 #define ARM_SMMU_CR2 0x2c #define CR2_PTM (1 << 2) @@ -129,8 +121,8 @@ #define CR2_E2H (1 << 0) #define ARM_SMMU_GBPA 0x44 -#define GBPA_ABORT (1 << 20) #define GBPA_UPDATE (1 << 31) +#define GBPA_ABORT (1 << 20) #define ARM_SMMU_IRQ_CTRL 0x50 #define IRQ_CTRL_EVTQ_IRQEN (1 << 2) @@ -158,18 +150,14 @@ #define ARM_SMMU_STRTAB_BASE 0x80 #define STRTAB_BASE_RA (1UL << 62) -#define STRTAB_BASE_ADDR_SHIFT 6 -#define STRTAB_BASE_ADDR_MASK 0x3ffffffffffUL +#define STRTAB_BASE_ADDR_MASK GENMASK_ULL(51, 6) #define ARM_SMMU_STRTAB_BASE_CFG 0x88 -#define STRTAB_BASE_CFG_LOG2SIZE_SHIFT 0 -#define STRTAB_BASE_CFG_LOG2SIZE_MASK 0x3f -#define STRTAB_BASE_CFG_SPLIT_SHIFT 6 -#define STRTAB_BASE_CFG_SPLIT_MASK 0x1f -#define STRTAB_BASE_CFG_FMT_SHIFT 16 -#define STRTAB_BASE_CFG_FMT_MASK 0x3 -#define STRTAB_BASE_CFG_FMT_LINEAR (0 << STRTAB_BASE_CFG_FMT_SHIFT) -#define STRTAB_BASE_CFG_FMT_2LVL (1 << STRTAB_BASE_CFG_FMT_SHIFT) +#define STRTAB_BASE_CFG_FMT GENMASK(17, 16) +#define STRTAB_BASE_CFG_FMT_LINEAR 0 +#define STRTAB_BASE_CFG_FMT_2LVL 1 +#define STRTAB_BASE_CFG_SPLIT GENMASK(10, 6) +#define STRTAB_BASE_CFG_LOG2SIZE GENMASK(5, 0) #define ARM_SMMU_CMDQ_BASE 0x90 #define ARM_SMMU_CMDQ_PROD 0x98 @@ -190,14 +178,16 @@ #define ARM_SMMU_PRIQ_IRQ_CFG2 0xdc /* Common MSI config fields */ -#define MSI_CFG0_ADDR_SHIFT 2 -#define 
MSI_CFG0_ADDR_MASK 0x3fffffffffffUL -#define MSI_CFG2_SH_SHIFT 4 -#define MSI_CFG2_SH_NSH (0UL << MSI_CFG2_SH_SHIFT) -#define MSI_CFG2_SH_OSH (2UL << MSI_CFG2_SH_SHIFT) -#define MSI_CFG2_SH_ISH (3UL << MSI_CFG2_SH_SHIFT) -#define MSI_CFG2_MEMATTR_SHIFT 0 -#define MSI_CFG2_MEMATTR_DEVICE_nGnRE (0x1 << MSI_CFG2_MEMATTR_SHIFT) +#define MSI_CFG0_ADDR_MASK GENMASK_ULL(51, 2) +#define MSI_CFG2_SH GENMASK(5, 4) +#define MSI_CFG2_MEMATTR GENMASK(3, 0) + +/* Common memory attribute values */ +#define ARM_SMMU_SH_NSH 0 +#define ARM_SMMU_SH_OSH 2 +#define ARM_SMMU_SH_ISH 3 +#define ARM_SMMU_MEMATTR_DEVICE_nGnRE 0x1 +#define ARM_SMMU_MEMATTR_OIWB 0xf #define Q_IDX(q, p) ((p) & ((1 << (q)->max_n_shift) - 1)) #define Q_WRP(q, p) ((p) & (1 << (q)->max_n_shift)) @@ -207,10 +197,8 @@ Q_IDX(q, p) * (q)->ent_dwords) #define Q_BASE_RWA (1UL << 62) -#define Q_BASE_ADDR_SHIFT 5 -#define Q_BASE_ADDR_MASK 0xfffffffffffUL -#define Q_BASE_LOG2SIZE_SHIFT 0 -#define Q_BASE_LOG2SIZE_MASK 0x1fUL +#define Q_BASE_ADDR_MASK GENMASK_ULL(51, 5) +#define Q_BASE_LOG2SIZE GENMASK(4, 0) /* * Stream table. 
@@ -223,187 +211,143 @@ #define STRTAB_SPLIT 8 #define STRTAB_L1_DESC_DWORDS 1 -#define STRTAB_L1_DESC_SPAN_SHIFT 0 -#define STRTAB_L1_DESC_SPAN_MASK 0x1fUL -#define STRTAB_L1_DESC_L2PTR_SHIFT 6 -#define STRTAB_L1_DESC_L2PTR_MASK 0x3ffffffffffUL +#define STRTAB_L1_DESC_SPAN GENMASK_ULL(4, 0) +#define STRTAB_L1_DESC_L2PTR_MASK GENMASK_ULL(51, 6) #define STRTAB_STE_DWORDS 8 #define STRTAB_STE_0_V (1UL << 0) -#define STRTAB_STE_0_CFG_SHIFT 1 -#define STRTAB_STE_0_CFG_MASK 0x7UL -#define STRTAB_STE_0_CFG_ABORT (0UL << STRTAB_STE_0_CFG_SHIFT) -#define STRTAB_STE_0_CFG_BYPASS (4UL << STRTAB_STE_0_CFG_SHIFT) -#define STRTAB_STE_0_CFG_S1_TRANS (5UL << STRTAB_STE_0_CFG_SHIFT) -#define STRTAB_STE_0_CFG_S2_TRANS (6UL << STRTAB_STE_0_CFG_SHIFT) - -#define STRTAB_STE_0_S1FMT_SHIFT 4 -#define STRTAB_STE_0_S1FMT_LINEAR (0UL << STRTAB_STE_0_S1FMT_SHIFT) -#define STRTAB_STE_0_S1CTXPTR_SHIFT 6 -#define STRTAB_STE_0_S1CTXPTR_MASK 0x3ffffffffffUL -#define STRTAB_STE_0_S1CDMAX_SHIFT 59 -#define STRTAB_STE_0_S1CDMAX_MASK 0x1fUL +#define STRTAB_STE_0_CFG GENMASK_ULL(3, 1) +#define STRTAB_STE_0_CFG_ABORT 0 +#define STRTAB_STE_0_CFG_BYPASS 4 +#define STRTAB_STE_0_CFG_S1_TRANS 5 +#define STRTAB_STE_0_CFG_S2_TRANS 6 + +#define STRTAB_STE_0_S1FMT GENMASK_ULL(5, 4) +#define STRTAB_STE_0_S1FMT_LINEAR 0 +#define STRTAB_STE_0_S1CTXPTR_MASK GENMASK_ULL(51, 6) +#define STRTAB_STE_0_S1CDMAX GENMASK_ULL(63, 59) #define STRTAB_STE_1_S1C_CACHE_NC 0UL #define STRTAB_STE_1_S1C_CACHE_WBRA 1UL #define STRTAB_STE_1_S1C_CACHE_WT 2UL #define STRTAB_STE_1_S1C_CACHE_WB 3UL -#define STRTAB_STE_1_S1C_SH_NSH 0UL -#define STRTAB_STE_1_S1C_SH_OSH 2UL -#define STRTAB_STE_1_S1C_SH_ISH 3UL -#define STRTAB_STE_1_S1CIR_SHIFT 2 -#define STRTAB_STE_1_S1COR_SHIFT 4 -#define STRTAB_STE_1_S1CSH_SHIFT 6 +#define STRTAB_STE_1_S1CIR GENMASK_ULL(3, 2) +#define STRTAB_STE_1_S1COR GENMASK_ULL(5, 4) +#define STRTAB_STE_1_S1CSH GENMASK_ULL(7, 6) #define STRTAB_STE_1_S1STALLD (1UL << 27) +#define STRTAB_STE_1_EATS GENMASK_ULL(29, 28) 
#define STRTAB_STE_1_EATS_ABT 0UL #define STRTAB_STE_1_EATS_TRANS 1UL #define STRTAB_STE_1_EATS_S1CHK 2UL -#define STRTAB_STE_1_EATS_SHIFT 28 +#define STRTAB_STE_1_STRW GENMASK_ULL(31, 30) #define STRTAB_STE_1_STRW_NSEL1 0UL #define STRTAB_STE_1_STRW_EL2 2UL -#define STRTAB_STE_1_STRW_SHIFT 30 +#define STRTAB_STE_1_SHCFG GENMASK_ULL(45, 44) #define STRTAB_STE_1_SHCFG_INCOMING 1UL -#define STRTAB_STE_1_SHCFG_SHIFT 44 -#define STRTAB_STE_2_S2VMID_SHIFT 0 -#define STRTAB_STE_2_S2VMID_MASK 0xffffUL -#define STRTAB_STE_2_VTCR_SHIFT 32 -#define STRTAB_STE_2_VTCR_MASK 0x7ffffUL +#define STRTAB_STE_2_S2VMID GENMASK_ULL(15, 0) +#define STRTAB_STE_2_VTCR GENMASK_ULL(50, 32) #define STRTAB_STE_2_S2AA64 (1UL << 51) #define STRTAB_STE_2_S2ENDI (1UL << 52) #define STRTAB_STE_2_S2PTW (1UL << 54) #define STRTAB_STE_2_S2R (1UL << 58) -#define STRTAB_STE_3_S2TTB_SHIFT 4 -#define STRTAB_STE_3_S2TTB_MASK 0xfffffffffffUL +#define STRTAB_STE_3_S2TTB_MASK GENMASK_ULL(51, 4) /* Context descriptor (stage-1 only) */ #define CTXDESC_CD_DWORDS 8 -#define CTXDESC_CD_0_TCR_T0SZ_SHIFT 0 -#define ARM64_TCR_T0SZ_SHIFT 0 -#define ARM64_TCR_T0SZ_MASK 0x1fUL -#define CTXDESC_CD_0_TCR_TG0_SHIFT 6 -#define ARM64_TCR_TG0_SHIFT 14 -#define ARM64_TCR_TG0_MASK 0x3UL -#define CTXDESC_CD_0_TCR_IRGN0_SHIFT 8 -#define ARM64_TCR_IRGN0_SHIFT 8 -#define ARM64_TCR_IRGN0_MASK 0x3UL -#define CTXDESC_CD_0_TCR_ORGN0_SHIFT 10 -#define ARM64_TCR_ORGN0_SHIFT 10 -#define ARM64_TCR_ORGN0_MASK 0x3UL -#define CTXDESC_CD_0_TCR_SH0_SHIFT 12 -#define ARM64_TCR_SH0_SHIFT 12 -#define ARM64_TCR_SH0_MASK 0x3UL -#define CTXDESC_CD_0_TCR_EPD0_SHIFT 14 -#define ARM64_TCR_EPD0_SHIFT 7 -#define ARM64_TCR_EPD0_MASK 0x1UL -#define CTXDESC_CD_0_TCR_EPD1_SHIFT 30 -#define ARM64_TCR_EPD1_SHIFT 23 -#define ARM64_TCR_EPD1_MASK 0x1UL +#define CTXDESC_CD_0_TCR_T0SZ GENMASK_ULL(5, 0) +#define ARM64_TCR_T0SZ GENMASK_ULL(5, 0) +#define CTXDESC_CD_0_TCR_TG0 GENMASK_ULL(7, 6) +#define ARM64_TCR_TG0 GENMASK_ULL(15, 14) +#define CTXDESC_CD_0_TCR_IRGN0 
GENMASK_ULL(9, 8) +#define ARM64_TCR_IRGN0 GENMASK_ULL(9, 8) +#define CTXDESC_CD_0_TCR_ORGN0 GENMASK_ULL(11, 10) +#define ARM64_TCR_ORGN0 GENMASK_ULL(11, 10) +#define CTXDESC_CD_0_TCR_SH0 GENMASK_ULL(13, 12) +#define ARM64_TCR_SH0 GENMASK_ULL(13, 12) +#define CTXDESC_CD_0_TCR_EPD0 (1ULL << 14) +#define ARM64_TCR_EPD0 (1ULL << 7) +#define CTXDESC_CD_0_TCR_EPD1 (1ULL << 30) +#define ARM64_TCR_EPD1 (1ULL << 23) #define CTXDESC_CD_0_ENDI (1UL << 15) #define CTXDESC_CD_0_V (1UL << 31) -#define CTXDESC_CD_0_TCR_IPS_SHIFT 32 -#define ARM64_TCR_IPS_SHIFT 32 -#define ARM64_TCR_IPS_MASK 0x7UL -#define CTXDESC_CD_0_TCR_TBI0_SHIFT 38 -#define ARM64_TCR_TBI0_SHIFT 37 -#define ARM64_TCR_TBI0_MASK 0x1UL +#define CTXDESC_CD_0_TCR_IPS GENMASK_ULL(34, 32) +#define ARM64_TCR_IPS GENMASK_ULL(34, 32) +#define CTXDESC_CD_0_TCR_TBI0 (1ULL << 38) +#define ARM64_TCR_TBI0 (1ULL << 37) #define CTXDESC_CD_0_AA64 (1UL << 41) #define CTXDESC_CD_0_S (1UL << 44) #define CTXDESC_CD_0_R (1UL << 45) #define CTXDESC_CD_0_A (1UL << 46) -#define CTXDESC_CD_0_ASET_SHIFT 47 -#define CTXDESC_CD_0_ASET_SHARED (0UL << CTXDESC_CD_0_ASET_SHIFT) -#define CTXDESC_CD_0_ASET_PRIVATE (1UL << CTXDESC_CD_0_ASET_SHIFT) -#define CTXDESC_CD_0_ASID_SHIFT 48 -#define CTXDESC_CD_0_ASID_MASK 0xffffUL - -#define CTXDESC_CD_1_TTB0_SHIFT 4 -#define CTXDESC_CD_1_TTB0_MASK 0xfffffffffffUL +#define CTXDESC_CD_0_ASET (1UL << 47) +#define CTXDESC_CD_0_ASID GENMASK_ULL(63, 48) -#define CTXDESC_CD_3_MAIR_SHIFT 0 +#define CTXDESC_CD_1_TTB0_MASK GENMASK_ULL(51, 4) /* Convert between AArch64 (CPU) TCR format and SMMU CD format */ -#define ARM_SMMU_TCR2CD(tcr, fld) \ - (((tcr) >> ARM64_TCR_##fld##_SHIFT & ARM64_TCR_##fld##_MASK) \ - << CTXDESC_CD_0_TCR_##fld##_SHIFT) +#define ARM_SMMU_TCR2CD(tcr, fld) FIELD_PREP(CTXDESC_CD_0_TCR_##fld, \ + FIELD_GET(ARM64_TCR_##fld, tcr)) /* Command queue */ #define CMDQ_ENT_DWORDS 2 #define CMDQ_MAX_SZ_SHIFT 8 -#define CMDQ_ERR_SHIFT 24 -#define CMDQ_ERR_MASK 0x7f +#define CMDQ_CONS_ERR GENMASK(30, 24) 
#define CMDQ_ERR_CERROR_NONE_IDX 0 #define CMDQ_ERR_CERROR_ILL_IDX 1 #define CMDQ_ERR_CERROR_ABT_IDX 2 -#define CMDQ_0_OP_SHIFT 0 -#define CMDQ_0_OP_MASK 0xffUL +#define CMDQ_0_OP GENMASK_ULL(7, 0) #define CMDQ_0_SSV (1UL << 11) -#define CMDQ_PREFETCH_0_SID_SHIFT 32 -#define CMDQ_PREFETCH_1_SIZE_SHIFT 0 -#define CMDQ_PREFETCH_1_ADDR_MASK ~0xfffUL +#define CMDQ_PREFETCH_0_SID GENMASK_ULL(63, 32) +#define CMDQ_PREFETCH_1_SIZE GENMASK_ULL(4, 0) +#define CMDQ_PREFETCH_1_ADDR_MASK GENMASK_ULL(63, 12) -#define CMDQ_CFGI_0_SID_SHIFT 32 -#define CMDQ_CFGI_0_SID_MASK 0xffffffffUL +#define CMDQ_CFGI_0_SID GENMASK_ULL(63, 32) #define CMDQ_CFGI_1_LEAF (1UL << 0) -#define CMDQ_CFGI_1_RANGE_SHIFT 0 -#define CMDQ_CFGI_1_RANGE_MASK 0x1fUL +#define CMDQ_CFGI_1_RANGE GENMASK_ULL(4, 0) -#define CMDQ_TLBI_0_VMID_SHIFT 32 -#define CMDQ_TLBI_0_ASID_SHIFT 48 +#define CMDQ_TLBI_0_VMID GENMASK_ULL(47, 32) +#define CMDQ_TLBI_0_ASID GENMASK_ULL(63, 48) #define CMDQ_TLBI_1_LEAF (1UL << 0) -#define CMDQ_TLBI_1_VA_MASK ~0xfffUL -#define CMDQ_TLBI_1_IPA_MASK 0xfffffffff000UL - -#define CMDQ_PRI_0_SSID_SHIFT 12 -#define CMDQ_PRI_0_SSID_MASK 0xfffffUL -#define CMDQ_PRI_0_SID_SHIFT 32 -#define CMDQ_PRI_0_SID_MASK 0xffffffffUL -#define CMDQ_PRI_1_GRPID_SHIFT 0 -#define CMDQ_PRI_1_GRPID_MASK 0x1ffUL -#define CMDQ_PRI_1_RESP_SHIFT 12 -#define CMDQ_PRI_1_RESP_DENY (0UL << CMDQ_PRI_1_RESP_SHIFT) -#define CMDQ_PRI_1_RESP_FAIL (1UL << CMDQ_PRI_1_RESP_SHIFT) -#define CMDQ_PRI_1_RESP_SUCC (2UL << CMDQ_PRI_1_RESP_SHIFT) - -#define CMDQ_SYNC_0_CS_SHIFT 12 -#define CMDQ_SYNC_0_CS_NONE (0UL << CMDQ_SYNC_0_CS_SHIFT) -#define CMDQ_SYNC_0_CS_IRQ (1UL << CMDQ_SYNC_0_CS_SHIFT) -#define CMDQ_SYNC_0_CS_SEV (2UL << CMDQ_SYNC_0_CS_SHIFT) -#define CMDQ_SYNC_0_MSH_SHIFT 22 -#define CMDQ_SYNC_0_MSH_ISH (3UL << CMDQ_SYNC_0_MSH_SHIFT) -#define CMDQ_SYNC_0_MSIATTR_SHIFT 24 -#define CMDQ_SYNC_0_MSIATTR_OIWB (0xfUL << CMDQ_SYNC_0_MSIATTR_SHIFT) -#define CMDQ_SYNC_0_MSIDATA_SHIFT 32 -#define CMDQ_SYNC_0_MSIDATA_MASK 0xffffffffUL 
-#define CMDQ_SYNC_1_MSIADDR_SHIFT 0 -#define CMDQ_SYNC_1_MSIADDR_MASK 0xffffffffffffcUL +#define CMDQ_TLBI_1_VA_MASK GENMASK_ULL(63, 12) +#define CMDQ_TLBI_1_IPA_MASK GENMASK_ULL(51, 12) + +#define CMDQ_PRI_0_SSID GENMASK_ULL(31, 12) +#define CMDQ_PRI_0_SID GENMASK_ULL(63, 32) +#define CMDQ_PRI_1_GRPID GENMASK_ULL(8, 0) +#define CMDQ_PRI_1_RESP GENMASK_ULL(13, 12) + +#define CMDQ_SYNC_0_CS GENMASK_ULL(13, 12) +#define CMDQ_SYNC_0_CS_NONE 0 +#define CMDQ_SYNC_0_CS_IRQ 1 +#define CMDQ_SYNC_0_CS_SEV 2 +#define CMDQ_SYNC_0_MSH GENMASK_ULL(23, 22) +#define CMDQ_SYNC_0_MSIATTR GENMASK_ULL(27, 24) +#define CMDQ_SYNC_0_MSIDATA GENMASK_ULL(63, 32) +#define CMDQ_SYNC_1_MSIADDR_MASK GENMASK_ULL(51, 2) /* Event queue */ #define EVTQ_ENT_DWORDS 4 #define EVTQ_MAX_SZ_SHIFT 7 -#define EVTQ_0_ID_SHIFT 0 -#define EVTQ_0_ID_MASK 0xffUL +#define EVTQ_0_ID GENMASK_ULL(7, 0) /* PRI queue */ #define PRIQ_ENT_DWORDS 2 #define PRIQ_MAX_SZ_SHIFT 8 -#define PRIQ_0_SID_SHIFT 0 -#define PRIQ_0_SID_MASK 0xffffffffUL -#define PRIQ_0_SSID_SHIFT 32 -#define PRIQ_0_SSID_MASK 0xfffffUL +#define PRIQ_0_SID GENMASK_ULL(31, 0) +#define PRIQ_0_SSID GENMASK_ULL(51, 32) #define PRIQ_0_PERM_PRIV (1UL << 58) #define PRIQ_0_PERM_EXEC (1UL << 59) #define PRIQ_0_PERM_READ (1UL << 60) @@ -411,10 +355,8 @@ #define PRIQ_0_PRG_LAST (1UL << 62) #define PRIQ_0_SSID_V (1UL << 63) -#define PRIQ_1_PRG_IDX_SHIFT 0 -#define PRIQ_1_PRG_IDX_MASK 0x1ffUL -#define PRIQ_1_ADDR_SHIFT 12 -#define PRIQ_1_ADDR_MASK 0xfffffffffffffUL +#define PRIQ_1_PRG_IDX GENMASK_ULL(8, 0) +#define PRIQ_1_ADDR_MASK GENMASK_ULL(63, 12) /* High-level queue structures */ #define ARM_SMMU_POLL_TIMEOUT_US 100 @@ -430,9 +372,9 @@ MODULE_PARM_DESC(disable_bypass, "Disable bypass streams such that incoming transactions from devices that are not attached to an iommu domain will report an abort back to the device and will not be allowed to pass through the SMMU."); enum pri_resp { - PRI_RESP_DENY, - PRI_RESP_FAIL, - PRI_RESP_SUCC, + PRI_RESP_DENY = 0, + 
PRI_RESP_FAIL = 1, + PRI_RESP_SUCC = 2, }; enum arm_smmu_msi_index { @@ -611,6 +553,7 @@ struct arm_smmu_device { #define ARM_SMMU_FEAT_STALLS (1 << 11) #define ARM_SMMU_FEAT_HYP (1 << 12) #define ARM_SMMU_FEAT_STALL_FORCE (1 << 13) +#define ARM_SMMU_FEAT_VAX (1 << 14) u32 features; #define ARM_SMMU_OPT_SKIP_PREFETCH (1 << 0) @@ -836,67 +779,64 @@ static int queue_remove_raw(struct arm_smmu_queue *q, u64 *ent) static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent) { memset(cmd, 0, CMDQ_ENT_DWORDS << 3); - cmd[0] |= (ent->opcode & CMDQ_0_OP_MASK) << CMDQ_0_OP_SHIFT; + cmd[0] |= FIELD_PREP(CMDQ_0_OP, ent->opcode); switch (ent->opcode) { case CMDQ_OP_TLBI_EL2_ALL: case CMDQ_OP_TLBI_NSNH_ALL: break; case CMDQ_OP_PREFETCH_CFG: - cmd[0] |= (u64)ent->prefetch.sid << CMDQ_PREFETCH_0_SID_SHIFT; - cmd[1] |= ent->prefetch.size << CMDQ_PREFETCH_1_SIZE_SHIFT; + cmd[0] |= FIELD_PREP(CMDQ_PREFETCH_0_SID, ent->prefetch.sid); + cmd[1] |= FIELD_PREP(CMDQ_PREFETCH_1_SIZE, ent->prefetch.size); cmd[1] |= ent->prefetch.addr & CMDQ_PREFETCH_1_ADDR_MASK; break; case CMDQ_OP_CFGI_STE: - cmd[0] |= (u64)ent->cfgi.sid << CMDQ_CFGI_0_SID_SHIFT; - cmd[1] |= ent->cfgi.leaf ? CMDQ_CFGI_1_LEAF : 0; + cmd[0] |= FIELD_PREP(CMDQ_CFGI_0_SID, ent->cfgi.sid); + cmd[1] |= FIELD_PREP(CMDQ_CFGI_1_LEAF, ent->cfgi.leaf); break; case CMDQ_OP_CFGI_ALL: /* Cover the entire SID range */ - cmd[1] |= CMDQ_CFGI_1_RANGE_MASK << CMDQ_CFGI_1_RANGE_SHIFT; + cmd[1] |= FIELD_PREP(CMDQ_CFGI_1_RANGE, 31); break; case CMDQ_OP_TLBI_NH_VA: - cmd[0] |= (u64)ent->tlbi.asid << CMDQ_TLBI_0_ASID_SHIFT; - cmd[1] |= ent->tlbi.leaf ? CMDQ_TLBI_1_LEAF : 0; + cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_ASID, ent->tlbi.asid); + cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_LEAF, ent->tlbi.leaf); cmd[1] |= ent->tlbi.addr & CMDQ_TLBI_1_VA_MASK; break; case CMDQ_OP_TLBI_S2_IPA: - cmd[0] |= (u64)ent->tlbi.vmid << CMDQ_TLBI_0_VMID_SHIFT; - cmd[1] |= ent->tlbi.leaf ? 
CMDQ_TLBI_1_LEAF : 0; + cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, ent->tlbi.vmid); + cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_LEAF, ent->tlbi.leaf); cmd[1] |= ent->tlbi.addr & CMDQ_TLBI_1_IPA_MASK; break; case CMDQ_OP_TLBI_NH_ASID: - cmd[0] |= (u64)ent->tlbi.asid << CMDQ_TLBI_0_ASID_SHIFT; + cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_ASID, ent->tlbi.asid); /* Fallthrough */ case CMDQ_OP_TLBI_S12_VMALL: - cmd[0] |= (u64)ent->tlbi.vmid << CMDQ_TLBI_0_VMID_SHIFT; + cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, ent->tlbi.vmid); break; case CMDQ_OP_PRI_RESP: - cmd[0] |= ent->substream_valid ? CMDQ_0_SSV : 0; - cmd[0] |= ent->pri.ssid << CMDQ_PRI_0_SSID_SHIFT; - cmd[0] |= (u64)ent->pri.sid << CMDQ_PRI_0_SID_SHIFT; - cmd[1] |= ent->pri.grpid << CMDQ_PRI_1_GRPID_SHIFT; + cmd[0] |= FIELD_PREP(CMDQ_0_SSV, ent->substream_valid); + cmd[0] |= FIELD_PREP(CMDQ_PRI_0_SSID, ent->pri.ssid); + cmd[0] |= FIELD_PREP(CMDQ_PRI_0_SID, ent->pri.sid); + cmd[1] |= FIELD_PREP(CMDQ_PRI_1_GRPID, ent->pri.grpid); switch (ent->pri.resp) { case PRI_RESP_DENY: - cmd[1] |= CMDQ_PRI_1_RESP_DENY; - break; case PRI_RESP_FAIL: - cmd[1] |= CMDQ_PRI_1_RESP_FAIL; - break; case PRI_RESP_SUCC: - cmd[1] |= CMDQ_PRI_1_RESP_SUCC; break; default: return -EINVAL; } + cmd[1] |= FIELD_PREP(CMDQ_PRI_1_RESP, ent->pri.resp); break; case CMDQ_OP_CMD_SYNC: if (ent->sync.msiaddr) - cmd[0] |= CMDQ_SYNC_0_CS_IRQ; + cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_IRQ); else - cmd[0] |= CMDQ_SYNC_0_CS_SEV; - cmd[0] |= CMDQ_SYNC_0_MSH_ISH | CMDQ_SYNC_0_MSIATTR_OIWB; - cmd[0] |= (u64)ent->sync.msidata << CMDQ_SYNC_0_MSIDATA_SHIFT; + cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_SEV); + cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSH, ARM_SMMU_SH_ISH); + cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSIATTR, ARM_SMMU_MEMATTR_OIWB); + cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSIDATA, ent->sync.msidata); cmd[1] |= ent->sync.msiaddr & CMDQ_SYNC_1_MSIADDR_MASK; break; default: @@ -918,7 +858,7 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu) u64 
cmd[CMDQ_ENT_DWORDS]; struct arm_smmu_queue *q = &smmu->cmdq.q; u32 cons = readl_relaxed(q->cons_reg); - u32 idx = cons >> CMDQ_ERR_SHIFT & CMDQ_ERR_MASK; + u32 idx = FIELD_GET(CMDQ_CONS_ERR, cons); struct arm_smmu_cmdq_ent cmd_sync = { .opcode = CMDQ_OP_CMD_SYNC, }; @@ -1083,8 +1023,8 @@ static void arm_smmu_write_ctx_desc(struct arm_smmu_device *smmu, #ifdef __BIG_ENDIAN CTXDESC_CD_0_ENDI | #endif - CTXDESC_CD_0_R | CTXDESC_CD_0_A | CTXDESC_CD_0_ASET_PRIVATE | - CTXDESC_CD_0_AA64 | (u64)cfg->cd.asid << CTXDESC_CD_0_ASID_SHIFT | + CTXDESC_CD_0_R | CTXDESC_CD_0_A | CTXDESC_CD_0_ASET | + CTXDESC_CD_0_AA64 | FIELD_PREP(CTXDESC_CD_0_ASID, cfg->cd.asid) | CTXDESC_CD_0_V; /* STALL_MODEL==0b10 && CD.S==0 is ILLEGAL */ @@ -1093,10 +1033,10 @@ static void arm_smmu_write_ctx_desc(struct arm_smmu_device *smmu, cfg->cdptr[0] = cpu_to_le64(val); - val = cfg->cd.ttbr & CTXDESC_CD_1_TTB0_MASK << CTXDESC_CD_1_TTB0_SHIFT; + val = cfg->cd.ttbr & CTXDESC_CD_1_TTB0_MASK; cfg->cdptr[1] = cpu_to_le64(val); - cfg->cdptr[3] = cpu_to_le64(cfg->cd.mair << CTXDESC_CD_3_MAIR_SHIFT); + cfg->cdptr[3] = cpu_to_le64(cfg->cd.mair); } /* Stream table manipulation functions */ @@ -1105,10 +1045,8 @@ arm_smmu_write_strtab_l1_desc(__le64 *dst, struct arm_smmu_strtab_l1_desc *desc) { u64 val = 0; - val |= (desc->span & STRTAB_L1_DESC_SPAN_MASK) - << STRTAB_L1_DESC_SPAN_SHIFT; - val |= desc->l2ptr_dma & - STRTAB_L1_DESC_L2PTR_MASK << STRTAB_L1_DESC_L2PTR_SHIFT; + val |= FIELD_PREP(STRTAB_L1_DESC_SPAN, desc->span); + val |= desc->l2ptr_dma & STRTAB_L1_DESC_L2PTR_MASK; *dst = cpu_to_le64(val); } @@ -1156,10 +1094,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_device *smmu, u32 sid, }; if (val & STRTAB_STE_0_V) { - u64 cfg; - - cfg = val & STRTAB_STE_0_CFG_MASK << STRTAB_STE_0_CFG_SHIFT; - switch (cfg) { + switch (FIELD_GET(STRTAB_STE_0_CFG, val)) { case STRTAB_STE_0_CFG_BYPASS: break; case STRTAB_STE_0_CFG_S1_TRANS: @@ -1180,13 +1115,13 @@ static void arm_smmu_write_strtab_ent(struct 
arm_smmu_device *smmu, u32 sid, /* Bypass/fault */ if (!ste->assigned || !(ste->s1_cfg || ste->s2_cfg)) { if (!ste->assigned && disable_bypass) - val |= STRTAB_STE_0_CFG_ABORT; + val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_ABORT); else - val |= STRTAB_STE_0_CFG_BYPASS; + val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_BYPASS); dst[0] = cpu_to_le64(val); - dst[1] = cpu_to_le64(STRTAB_STE_1_SHCFG_INCOMING - << STRTAB_STE_1_SHCFG_SHIFT); + dst[1] = cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG, + STRTAB_STE_1_SHCFG_INCOMING)); dst[2] = 0; /* Nuke the VMID */ /* * The SMMU can perform negative caching, so we must sync @@ -1200,41 +1135,36 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_device *smmu, u32 sid, if (ste->s1_cfg) { BUG_ON(ste_live); dst[1] = cpu_to_le64( - STRTAB_STE_1_S1C_CACHE_WBRA - << STRTAB_STE_1_S1CIR_SHIFT | - STRTAB_STE_1_S1C_CACHE_WBRA - << STRTAB_STE_1_S1COR_SHIFT | - STRTAB_STE_1_S1C_SH_ISH << STRTAB_STE_1_S1CSH_SHIFT | + FIELD_PREP(STRTAB_STE_1_S1CIR, STRTAB_STE_1_S1C_CACHE_WBRA) | + FIELD_PREP(STRTAB_STE_1_S1COR, STRTAB_STE_1_S1C_CACHE_WBRA) | + FIELD_PREP(STRTAB_STE_1_S1CSH, ARM_SMMU_SH_ISH) | #ifdef CONFIG_PCI_ATS - STRTAB_STE_1_EATS_TRANS << STRTAB_STE_1_EATS_SHIFT | + FIELD_PREP(STRTAB_STE_1_EATS, STRTAB_STE_1_EATS_TRANS) | #endif - STRTAB_STE_1_STRW_NSEL1 << STRTAB_STE_1_STRW_SHIFT); + FIELD_PREP(STRTAB_STE_1_STRW, STRTAB_STE_1_STRW_NSEL1)); if (smmu->features & ARM_SMMU_FEAT_STALLS && !(smmu->features & ARM_SMMU_FEAT_STALL_FORCE)) dst[1] |= cpu_to_le64(STRTAB_STE_1_S1STALLD); - val |= (ste->s1_cfg->cdptr_dma & STRTAB_STE_0_S1CTXPTR_MASK - << STRTAB_STE_0_S1CTXPTR_SHIFT) | - STRTAB_STE_0_CFG_S1_TRANS; + val |= (ste->s1_cfg->cdptr_dma & STRTAB_STE_0_S1CTXPTR_MASK) | + FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_S1_TRANS); } if (ste->s2_cfg) { BUG_ON(ste_live); dst[2] = cpu_to_le64( - ste->s2_cfg->vmid << STRTAB_STE_2_S2VMID_SHIFT | - (ste->s2_cfg->vtcr & STRTAB_STE_2_VTCR_MASK) - << STRTAB_STE_2_VTCR_SHIFT | + 
FIELD_PREP(STRTAB_STE_2_S2VMID, ste->s2_cfg->vmid) | + FIELD_PREP(STRTAB_STE_2_VTCR, ste->s2_cfg->vtcr) | #ifdef __BIG_ENDIAN STRTAB_STE_2_S2ENDI | #endif STRTAB_STE_2_S2PTW | STRTAB_STE_2_S2AA64 | STRTAB_STE_2_S2R); - dst[3] = cpu_to_le64(ste->s2_cfg->vttbr & - STRTAB_STE_3_S2TTB_MASK << STRTAB_STE_3_S2TTB_SHIFT); + dst[3] = cpu_to_le64(ste->s2_cfg->vttbr & STRTAB_STE_3_S2TTB_MASK); - val |= STRTAB_STE_0_CFG_S2_TRANS; + val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_S2_TRANS); } arm_smmu_sync_ste_for_sid(smmu, sid); @@ -1295,7 +1225,7 @@ static irqreturn_t arm_smmu_evtq_thread(int irq, void *dev) do { while (!queue_remove_raw(q, evt)) { - u8 id = evt[0] >> EVTQ_0_ID_SHIFT & EVTQ_0_ID_MASK; + u8 id = FIELD_GET(EVTQ_0_ID, evt[0]); dev_info(smmu->dev, "event 0x%02x received:\n", id); for (i = 0; i < ARRAY_SIZE(evt); ++i) @@ -1323,11 +1253,11 @@ static void arm_smmu_handle_ppr(struct arm_smmu_device *smmu, u64 *evt) u16 grpid; bool ssv, last; - sid = evt[0] >> PRIQ_0_SID_SHIFT & PRIQ_0_SID_MASK; - ssv = evt[0] & PRIQ_0_SSID_V; - ssid = ssv ? evt[0] >> PRIQ_0_SSID_SHIFT & PRIQ_0_SSID_MASK : 0; - last = evt[0] & PRIQ_0_PRG_LAST; - grpid = evt[1] >> PRIQ_1_PRG_IDX_SHIFT & PRIQ_1_PRG_IDX_MASK; + sid = FIELD_GET(PRIQ_0_SID, evt[0]); + ssv = FIELD_GET(PRIQ_0_SSID_V, evt[0]); + ssid = ssv ? FIELD_GET(PRIQ_0_SSID, evt[0]) : 0; + last = FIELD_GET(PRIQ_0_PRG_LAST, evt[0]); + grpid = FIELD_GET(PRIQ_1_PRG_IDX, evt[1]); dev_info(smmu->dev, "unexpected PRI request received:\n"); dev_info(smmu->dev, @@ -1337,7 +1267,7 @@ static void arm_smmu_handle_ppr(struct arm_smmu_device *smmu, u64 *evt) evt[0] & PRIQ_0_PERM_READ ? "R" : "", evt[0] & PRIQ_0_PERM_WRITE ? "W" : "", evt[0] & PRIQ_0_PERM_EXEC ? 
"X" : "", - evt[1] & PRIQ_1_ADDR_MASK << PRIQ_1_ADDR_SHIFT); + evt[1] & PRIQ_1_ADDR_MASK); if (last) { struct arm_smmu_cmdq_ent cmd = { @@ -1664,7 +1594,8 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain) switch (smmu_domain->stage) { case ARM_SMMU_DOMAIN_S1: - ias = VA_BITS; + ias = (smmu->features & ARM_SMMU_FEAT_VAX) ? 52 : 48; + ias = min_t(unsigned long, ias, VA_BITS); oas = smmu->ias; fmt = ARM_64_LPAE_S1; finalise_stage_fn = arm_smmu_domain_finalise_s1; @@ -1696,7 +1627,7 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain) return -ENOMEM; domain->pgsize_bitmap = pgtbl_cfg.pgsize_bitmap; - domain->geometry.aperture_end = (1UL << ias) - 1; + domain->geometry.aperture_end = (1UL << pgtbl_cfg.ias) - 1; domain->geometry.force_aperture = true; ret = finalise_stage_fn(smmu_domain, &pgtbl_cfg); @@ -2102,9 +2033,8 @@ static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu, q->ent_dwords = dwords; q->q_base = Q_BASE_RWA; - q->q_base |= q->base_dma & Q_BASE_ADDR_MASK << Q_BASE_ADDR_SHIFT; - q->q_base |= (q->max_n_shift & Q_BASE_LOG2SIZE_MASK) - << Q_BASE_LOG2SIZE_SHIFT; + q->q_base |= q->base_dma & Q_BASE_ADDR_MASK; + q->q_base |= FIELD_PREP(Q_BASE_LOG2SIZE, q->max_n_shift); q->prod = q->cons = 0; return 0; @@ -2186,11 +2116,9 @@ static int arm_smmu_init_strtab_2lvl(struct arm_smmu_device *smmu) cfg->strtab = strtab; /* Configure strtab_base_cfg for 2 levels */ - reg = STRTAB_BASE_CFG_FMT_2LVL; - reg |= (size & STRTAB_BASE_CFG_LOG2SIZE_MASK) - << STRTAB_BASE_CFG_LOG2SIZE_SHIFT; - reg |= (STRTAB_SPLIT & STRTAB_BASE_CFG_SPLIT_MASK) - << STRTAB_BASE_CFG_SPLIT_SHIFT; + reg = FIELD_PREP(STRTAB_BASE_CFG_FMT, STRTAB_BASE_CFG_FMT_2LVL); + reg |= FIELD_PREP(STRTAB_BASE_CFG_LOG2SIZE, size); + reg |= FIELD_PREP(STRTAB_BASE_CFG_SPLIT, STRTAB_SPLIT); cfg->strtab_base_cfg = reg; return arm_smmu_init_l1_strtab(smmu); @@ -2216,9 +2144,8 @@ static int arm_smmu_init_strtab_linear(struct arm_smmu_device *smmu) cfg->num_l1_ents = 1 << 
smmu->sid_bits; /* Configure strtab_base_cfg for a linear table covering all SIDs */ - reg = STRTAB_BASE_CFG_FMT_LINEAR; - reg |= (smmu->sid_bits & STRTAB_BASE_CFG_LOG2SIZE_MASK) - << STRTAB_BASE_CFG_LOG2SIZE_SHIFT; + reg = FIELD_PREP(STRTAB_BASE_CFG_FMT, STRTAB_BASE_CFG_FMT_LINEAR); + reg |= FIELD_PREP(STRTAB_BASE_CFG_LOG2SIZE, smmu->sid_bits); cfg->strtab_base_cfg = reg; arm_smmu_init_bypass_stes(strtab, cfg->num_l1_ents); @@ -2239,8 +2166,7 @@ static int arm_smmu_init_strtab(struct arm_smmu_device *smmu) return ret; /* Set the strtab base address */ - reg = smmu->strtab_cfg.strtab_dma & - STRTAB_BASE_ADDR_MASK << STRTAB_BASE_ADDR_SHIFT; + reg = smmu->strtab_cfg.strtab_dma & STRTAB_BASE_ADDR_MASK; reg |= STRTAB_BASE_RA; smmu->strtab_cfg.strtab_base = reg; @@ -2303,11 +2229,11 @@ static void arm_smmu_write_msi_msg(struct msi_desc *desc, struct msi_msg *msg) phys_addr_t *cfg = arm_smmu_msi_cfg[desc->platform.msi_index]; doorbell = (((u64)msg->address_hi) << 32) | msg->address_lo; - doorbell &= MSI_CFG0_ADDR_MASK << MSI_CFG0_ADDR_SHIFT; + doorbell &= MSI_CFG0_ADDR_MASK; writeq_relaxed(doorbell, smmu->base + cfg[0]); writel_relaxed(msg->data, smmu->base + cfg[1]); - writel_relaxed(MSI_CFG2_MEMATTR_DEVICE_nGnRE, smmu->base + cfg[2]); + writel_relaxed(ARM_SMMU_MEMATTR_DEVICE_nGnRE, smmu->base + cfg[2]); } static void arm_smmu_setup_msis(struct arm_smmu_device *smmu) @@ -2328,10 +2254,15 @@ static void arm_smmu_setup_msis(struct arm_smmu_device *smmu) if (!(smmu->features & ARM_SMMU_FEAT_MSI)) return; + if (!dev->msi_domain) { + dev_info(smmu->dev, "msi_domain absent - falling back to wired irqs\n"); + return; + } + /* Allocate MSIs for evtq, gerror and priq. 
Ignore cmdq */ ret = platform_msi_domain_alloc_irqs(dev, nvec, arm_smmu_write_msi_msg); if (ret) { - dev_warn(dev, "failed to allocate MSIs\n"); + dev_warn(dev, "failed to allocate MSIs - falling back to wired irqs\n"); return; } @@ -2370,6 +2301,8 @@ static void arm_smmu_setup_unique_irqs(struct arm_smmu_device *smmu) "arm-smmu-v3-evtq", smmu); if (ret < 0) dev_warn(smmu->dev, "failed to enable evtq irq\n"); + } else { + dev_warn(smmu->dev, "no evtq irq - events will not be reported!\n"); } irq = smmu->gerr_irq; @@ -2378,6 +2311,8 @@ static void arm_smmu_setup_unique_irqs(struct arm_smmu_device *smmu) 0, "arm-smmu-v3-gerror", smmu); if (ret < 0) dev_warn(smmu->dev, "failed to enable gerror irq\n"); + } else { + dev_warn(smmu->dev, "no gerr irq - errors will not be reported!\n"); } if (smmu->features & ARM_SMMU_FEAT_PRI) { @@ -2391,6 +2326,8 @@ static void arm_smmu_setup_unique_irqs(struct arm_smmu_device *smmu) if (ret < 0) dev_warn(smmu->dev, "failed to enable priq irq\n"); + } else { + dev_warn(smmu->dev, "no priq irq - PRI will be broken\n"); } } } @@ -2463,12 +2400,12 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool bypass) return ret; /* CR1 (table and queue memory attributes) */ - reg = (CR1_SH_ISH << CR1_TABLE_SH_SHIFT) | - (CR1_CACHE_WB << CR1_TABLE_OC_SHIFT) | - (CR1_CACHE_WB << CR1_TABLE_IC_SHIFT) | - (CR1_SH_ISH << CR1_QUEUE_SH_SHIFT) | - (CR1_CACHE_WB << CR1_QUEUE_OC_SHIFT) | - (CR1_CACHE_WB << CR1_QUEUE_IC_SHIFT); + reg = FIELD_PREP(CR1_TABLE_SH, ARM_SMMU_SH_ISH) | + FIELD_PREP(CR1_TABLE_OC, CR1_CACHE_WB) | + FIELD_PREP(CR1_TABLE_IC, CR1_CACHE_WB) | + FIELD_PREP(CR1_QUEUE_SH, ARM_SMMU_SH_ISH) | + FIELD_PREP(CR1_QUEUE_OC, CR1_CACHE_WB) | + FIELD_PREP(CR1_QUEUE_IC, CR1_CACHE_WB); writel_relaxed(reg, smmu->base + ARM_SMMU_CR1); /* CR2 (random crap) */ @@ -2578,7 +2515,7 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) reg = readl_relaxed(smmu->base + ARM_SMMU_IDR0); /* 2-level structures */ - if ((reg & 
IDR0_ST_LVL_MASK << IDR0_ST_LVL_SHIFT) == IDR0_ST_LVL_2LVL) + if (FIELD_GET(IDR0_ST_LVL, reg) == IDR0_ST_LVL_2LVL) smmu->features |= ARM_SMMU_FEAT_2_LVL_STRTAB; if (reg & IDR0_CD2L) @@ -2589,7 +2526,7 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) * We currently require the same endianness as the CPU, but this * could be changed later by adding a new IO_PGTABLE_QUIRK. */ - switch (reg & IDR0_TTENDIAN_MASK << IDR0_TTENDIAN_SHIFT) { + switch (FIELD_GET(IDR0_TTENDIAN, reg)) { case IDR0_TTENDIAN_MIXED: smmu->features |= ARM_SMMU_FEAT_TT_LE | ARM_SMMU_FEAT_TT_BE; break; @@ -2631,7 +2568,7 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) dev_warn(smmu->dev, "IDR0.COHACC overridden by FW configuration (%s)\n", coherent ? "true" : "false"); - switch (reg & IDR0_STALL_MODEL_MASK << IDR0_STALL_MODEL_SHIFT) { + switch (FIELD_GET(IDR0_STALL_MODEL, reg)) { case IDR0_STALL_MODEL_FORCE: smmu->features |= ARM_SMMU_FEAT_STALL_FORCE; /* Fallthrough */ @@ -2651,7 +2588,7 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) } /* We only support the AArch64 table format at present */ - switch (reg & IDR0_TTF_MASK << IDR0_TTF_SHIFT) { + switch (FIELD_GET(IDR0_TTF, reg)) { case IDR0_TTF_AARCH32_64: smmu->ias = 40; /* Fallthrough */ @@ -2674,22 +2611,22 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) } /* Queue sizes, capped at 4k */ - smmu->cmdq.q.max_n_shift = min((u32)CMDQ_MAX_SZ_SHIFT, - reg >> IDR1_CMDQ_SHIFT & IDR1_CMDQ_MASK); + smmu->cmdq.q.max_n_shift = min_t(u32, CMDQ_MAX_SZ_SHIFT, + FIELD_GET(IDR1_CMDQS, reg)); if (!smmu->cmdq.q.max_n_shift) { /* Odd alignment restrictions on the base, so ignore for now */ dev_err(smmu->dev, "unit-length command queue not supported\n"); return -ENXIO; } - smmu->evtq.q.max_n_shift = min((u32)EVTQ_MAX_SZ_SHIFT, - reg >> IDR1_EVTQ_SHIFT & IDR1_EVTQ_MASK); - smmu->priq.q.max_n_shift = min((u32)PRIQ_MAX_SZ_SHIFT, - reg >> IDR1_PRIQ_SHIFT & IDR1_PRIQ_MASK); + 
smmu->evtq.q.max_n_shift = min_t(u32, EVTQ_MAX_SZ_SHIFT, + FIELD_GET(IDR1_EVTQS, reg)); + smmu->priq.q.max_n_shift = min_t(u32, PRIQ_MAX_SZ_SHIFT, + FIELD_GET(IDR1_PRIQS, reg)); /* SID/SSID sizes */ - smmu->ssid_bits = reg >> IDR1_SSID_SHIFT & IDR1_SSID_MASK; - smmu->sid_bits = reg >> IDR1_SID_SHIFT & IDR1_SID_MASK; + smmu->ssid_bits = FIELD_GET(IDR1_SSIDSIZE, reg); + smmu->sid_bits = FIELD_GET(IDR1_SIDSIZE, reg); /* * If the SMMU supports fewer bits than would fill a single L2 stream @@ -2702,8 +2639,7 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) reg = readl_relaxed(smmu->base + ARM_SMMU_IDR5); /* Maximum number of outstanding stalls */ - smmu->evtq.max_stalls = reg >> IDR5_STALL_MAX_SHIFT - & IDR5_STALL_MAX_MASK; + smmu->evtq.max_stalls = FIELD_GET(IDR5_STALL_MAX, reg); /* Page sizes */ if (reg & IDR5_GRAN64K) @@ -2713,13 +2649,12 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) if (reg & IDR5_GRAN4K) smmu->pgsize_bitmap |= SZ_4K | SZ_2M | SZ_1G; - if (arm_smmu_ops.pgsize_bitmap == -1UL) - arm_smmu_ops.pgsize_bitmap = smmu->pgsize_bitmap; - else - arm_smmu_ops.pgsize_bitmap |= smmu->pgsize_bitmap; + /* Input address size */ + if (FIELD_GET(IDR5_VAX, reg) == IDR5_VAX_52_BIT) + smmu->features |= ARM_SMMU_FEAT_VAX; /* Output address size */ - switch (reg & IDR5_OAS_MASK << IDR5_OAS_SHIFT) { + switch (FIELD_GET(IDR5_OAS, reg)) { case IDR5_OAS_32_BIT: smmu->oas = 32; break; @@ -2735,6 +2670,10 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) case IDR5_OAS_44_BIT: smmu->oas = 44; break; + case IDR5_OAS_52_BIT: + smmu->oas = 52; + smmu->pgsize_bitmap |= 1ULL << 42; /* 4TB */ + break; default: dev_info(smmu->dev, "unknown output address size. 
Truncating to 48-bit\n"); @@ -2743,6 +2682,11 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) smmu->oas = 48; } + if (arm_smmu_ops.pgsize_bitmap == -1UL) + arm_smmu_ops.pgsize_bitmap = smmu->pgsize_bitmap; + else + arm_smmu_ops.pgsize_bitmap |= smmu->pgsize_bitmap; + /* Set the DMA mask for our table walker */ if (dma_set_mask_and_coherent(smmu->dev, DMA_BIT_MASK(smmu->oas))) dev_warn(smmu->dev, diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 25914d36c5ac..f05f3cf90756 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -19,6 +19,7 @@ * along with this program. If not, see <http://www.gnu.org/licenses/>. */ +#include <linux/acpi_iort.h> #include <linux/device.h> #include <linux/dma-iommu.h> #include <linux/gfp.h> @@ -167,13 +168,18 @@ EXPORT_SYMBOL(iommu_put_dma_cookie); * * IOMMU drivers can use this to implement their .get_resv_regions callback * for general non-IOMMU-specific reservations. Currently, this covers host - * bridge windows for PCI devices. + * bridge windows for PCI devices and GICv3 ITS region reservation on ACPI + * based ARM platforms that may require HW MSI reservation. 
*/ void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list) { struct pci_host_bridge *bridge; struct resource_entry *window; + if (!is_of_node(dev->iommu_fwspec->iommu_fwnode) && + iort_iommu_msi_get_resv_regions(dev, list) < 0) + return; + if (!dev_is_pci(dev)) return; diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index 9a7ffd13c7f0..accf58388bdb 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -806,7 +806,7 @@ int __init dmar_dev_scope_init(void) return dmar_dev_scope_status; } -void dmar_register_bus_notifier(void) +void __init dmar_register_bus_notifier(void) { bus_register_notifier(&pci_bus_type, &dmar_pci_bus_nb); } diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c index c5f4f7691b57..85879cfec52f 100644 --- a/drivers/iommu/exynos-iommu.c +++ b/drivers/iommu/exynos-iommu.c @@ -1239,17 +1239,6 @@ static phys_addr_t exynos_iommu_iova_to_phys(struct iommu_domain *iommu_domain, return phys; } -static struct iommu_group *get_device_iommu_group(struct device *dev) -{ - struct iommu_group *group; - - group = iommu_group_get(dev); - if (!group) - group = iommu_group_alloc(); - - return group; -} - static int exynos_iommu_add_device(struct device *dev) { struct exynos_iommu_owner *owner = dev->archdata.iommu; @@ -1345,7 +1334,7 @@ static const struct iommu_ops exynos_iommu_ops = { .unmap = exynos_iommu_unmap, .map_sg = default_iommu_map_sg, .iova_to_phys = exynos_iommu_iova_to_phys, - .device_group = get_device_iommu_group, + .device_group = generic_device_group, .add_device = exynos_iommu_add_device, .remove_device = exynos_iommu_remove_device, .pgsize_bitmap = SECT_SIZE | LPAGE_SIZE | SPAGE_SIZE, diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 24d1b1b42013..749d8f235346 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -5043,7 +5043,6 @@ static size_t intel_iommu_unmap(struct iommu_domain *domain, { struct dmar_domain *dmar_domain = 
to_dmar_domain(domain); struct page *freelist = NULL; - struct intel_iommu *iommu; unsigned long start_pfn, last_pfn; unsigned int npages; int iommu_id, level = 0; @@ -5062,12 +5061,9 @@ static size_t intel_iommu_unmap(struct iommu_domain *domain, npages = last_pfn - start_pfn + 1; - for_each_domain_iommu(iommu_id, dmar_domain) { - iommu = g_iommus[iommu_id]; - + for_each_domain_iommu(iommu_id, dmar_domain) iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain, start_pfn, npages, !freelist, 0); - } dma_free_pagelist(freelist); diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 99bc9bd64b9e..e8cd984cf9c8 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -396,6 +396,7 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ pasid_max - 1, GFP_KERNEL); if (ret < 0) { kfree(svm); + kfree(sdev); goto out; } svm->pasid = ret; @@ -422,17 +423,13 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ iommu->pasid_table[svm->pasid].val = pasid_entry_val; wmb(); - /* In caching mode, we still have to flush with PASID 0 when - * a PASID table entry becomes present. Not entirely clear - * *why* that would be the case — surely we could just issue - * a flush with the PASID value that we've changed? The PASID - * is the index into the table, after all. It's not like domain - * IDs in the case of the equivalent context-entry change in - * caching mode. And for that matter it's not entirely clear why - * a VMM would be in the business of caching the PASID table - * anyway. Surely that can be left entirely to the guest? */ + + /* + * Flush PASID cache when a PASID table entry becomes + * present. 
+ */ if (cap_caching_mode(iommu->cap)) - intel_flush_pasid_dev(svm, sdev, 0); + intel_flush_pasid_dev(svm, sdev, svm->pasid); } list_add_rcu(&sdev->list, &svm->devs); diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c index 2ca08dc9331c..10e4a3d11c02 100644 --- a/drivers/iommu/io-pgtable-arm-v7s.c +++ b/drivers/iommu/io-pgtable-arm-v7s.c @@ -357,8 +357,8 @@ static bool arm_v7s_pte_is_cont(arm_v7s_iopte pte, int lvl) return false; } -static int __arm_v7s_unmap(struct arm_v7s_io_pgtable *, unsigned long, - size_t, int, arm_v7s_iopte *); +static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *, unsigned long, + size_t, int, arm_v7s_iopte *); static int arm_v7s_init_pte(struct arm_v7s_io_pgtable *data, unsigned long iova, phys_addr_t paddr, int prot, @@ -541,9 +541,10 @@ static arm_v7s_iopte arm_v7s_split_cont(struct arm_v7s_io_pgtable *data, return pte; } -static int arm_v7s_split_blk_unmap(struct arm_v7s_io_pgtable *data, - unsigned long iova, size_t size, - arm_v7s_iopte blk_pte, arm_v7s_iopte *ptep) +static size_t arm_v7s_split_blk_unmap(struct arm_v7s_io_pgtable *data, + unsigned long iova, size_t size, + arm_v7s_iopte blk_pte, + arm_v7s_iopte *ptep) { struct io_pgtable_cfg *cfg = &data->iop.cfg; arm_v7s_iopte pte, *tablep; @@ -584,9 +585,9 @@ static int arm_v7s_split_blk_unmap(struct arm_v7s_io_pgtable *data, return size; } -static int __arm_v7s_unmap(struct arm_v7s_io_pgtable *data, - unsigned long iova, size_t size, int lvl, - arm_v7s_iopte *ptep) +static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *data, + unsigned long iova, size_t size, int lvl, + arm_v7s_iopte *ptep) { arm_v7s_iopte pte[ARM_V7S_CONT_PAGES]; struct io_pgtable *iop = &data->iop; @@ -656,8 +657,8 @@ static int __arm_v7s_unmap(struct arm_v7s_io_pgtable *data, return __arm_v7s_unmap(data, iova, size, lvl + 1, ptep); } -static int arm_v7s_unmap(struct io_pgtable_ops *ops, unsigned long iova, - size_t size) +static size_t arm_v7s_unmap(struct io_pgtable_ops 
*ops, unsigned long iova, + size_t size) { struct arm_v7s_io_pgtable *data = io_pgtable_ops_to_data(ops); diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 51e5c43caed1..39c2a056da21 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -21,6 +21,7 @@ #define pr_fmt(fmt) "arm-lpae io-pgtable: " fmt #include <linux/atomic.h> +#include <linux/bitops.h> #include <linux/iommu.h> #include <linux/kernel.h> #include <linux/sizes.h> @@ -32,7 +33,7 @@ #include "io-pgtable.h" -#define ARM_LPAE_MAX_ADDR_BITS 48 +#define ARM_LPAE_MAX_ADDR_BITS 52 #define ARM_LPAE_S2_MAX_CONCAT_PAGES 16 #define ARM_LPAE_MAX_LEVELS 4 @@ -86,6 +87,8 @@ #define ARM_LPAE_PTE_TYPE_TABLE 3 #define ARM_LPAE_PTE_TYPE_PAGE 3 +#define ARM_LPAE_PTE_ADDR_MASK GENMASK_ULL(47,12) + #define ARM_LPAE_PTE_NSTABLE (((arm_lpae_iopte)1) << 63) #define ARM_LPAE_PTE_XN (((arm_lpae_iopte)3) << 53) #define ARM_LPAE_PTE_AF (((arm_lpae_iopte)1) << 10) @@ -159,6 +162,7 @@ #define ARM_LPAE_TCR_PS_42_BIT 0x3ULL #define ARM_LPAE_TCR_PS_44_BIT 0x4ULL #define ARM_LPAE_TCR_PS_48_BIT 0x5ULL +#define ARM_LPAE_TCR_PS_52_BIT 0x6ULL #define ARM_LPAE_MAIR_ATTR_SHIFT(n) ((n) << 3) #define ARM_LPAE_MAIR_ATTR_MASK 0xff @@ -170,9 +174,7 @@ #define ARM_LPAE_MAIR_ATTR_IDX_DEV 2 /* IOPTE accessors */ -#define iopte_deref(pte,d) \ - (__va((pte) & ((1ULL << ARM_LPAE_MAX_ADDR_BITS) - 1) \ - & ~(ARM_LPAE_GRANULE(d) - 1ULL))) +#define iopte_deref(pte,d) __va(iopte_to_paddr(pte, d)) #define iopte_type(pte,l) \ (((pte) >> ARM_LPAE_PTE_TYPE_SHIFT) & ARM_LPAE_PTE_TYPE_MASK) @@ -184,12 +186,6 @@ (iopte_type(pte,l) == ARM_LPAE_PTE_TYPE_PAGE) : \ (iopte_type(pte,l) == ARM_LPAE_PTE_TYPE_BLOCK)) -#define iopte_to_pfn(pte,d) \ - (((pte) & ((1ULL << ARM_LPAE_MAX_ADDR_BITS) - 1)) >> (d)->pg_shift) - -#define pfn_to_iopte(pfn,d) \ - (((pfn) << (d)->pg_shift) & ((1ULL << ARM_LPAE_MAX_ADDR_BITS) - 1)) - struct arm_lpae_io_pgtable { struct io_pgtable iop; @@ -203,6 +199,27 @@ struct 
arm_lpae_io_pgtable { typedef u64 arm_lpae_iopte; +static arm_lpae_iopte paddr_to_iopte(phys_addr_t paddr, + struct arm_lpae_io_pgtable *data) +{ + arm_lpae_iopte pte = paddr; + + /* Of the bits which overlap, either 51:48 or 15:12 are always RES0 */ + return (pte | (pte >> (48 - 12))) & ARM_LPAE_PTE_ADDR_MASK; +} + +static phys_addr_t iopte_to_paddr(arm_lpae_iopte pte, + struct arm_lpae_io_pgtable *data) +{ + u64 paddr = pte & ARM_LPAE_PTE_ADDR_MASK; + + if (data->pg_shift < 16) + return paddr; + + /* Rotate the packed high-order bits back to the top */ + return (paddr | (paddr << (48 - 12))) & (ARM_LPAE_PTE_ADDR_MASK << 4); +} + static bool selftest_running = false; static dma_addr_t __arm_lpae_dma_addr(void *pages) @@ -268,9 +285,9 @@ static void __arm_lpae_set_pte(arm_lpae_iopte *ptep, arm_lpae_iopte pte, __arm_lpae_sync_pte(ptep, cfg); } -static int __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, - unsigned long iova, size_t size, int lvl, - arm_lpae_iopte *ptep); +static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, + unsigned long iova, size_t size, int lvl, + arm_lpae_iopte *ptep); static void __arm_lpae_init_pte(struct arm_lpae_io_pgtable *data, phys_addr_t paddr, arm_lpae_iopte prot, @@ -287,7 +304,7 @@ static void __arm_lpae_init_pte(struct arm_lpae_io_pgtable *data, pte |= ARM_LPAE_PTE_TYPE_BLOCK; pte |= ARM_LPAE_PTE_AF | ARM_LPAE_PTE_SH_IS; - pte |= pfn_to_iopte(paddr >> data->pg_shift, data); + pte |= paddr_to_iopte(paddr, data); __arm_lpae_set_pte(ptep, pte, &data->iop.cfg); } @@ -506,10 +523,10 @@ static void arm_lpae_free_pgtable(struct io_pgtable *iop) kfree(data); } -static int arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data, - unsigned long iova, size_t size, - arm_lpae_iopte blk_pte, int lvl, - arm_lpae_iopte *ptep) +static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data, + unsigned long iova, size_t size, + arm_lpae_iopte blk_pte, int lvl, + arm_lpae_iopte *ptep) { struct io_pgtable_cfg *cfg = 
&data->iop.cfg; arm_lpae_iopte pte, *tablep; @@ -528,7 +545,7 @@ static int arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data, if (size == split_sz) unmap_idx = ARM_LPAE_LVL_IDX(iova, lvl, data); - blk_paddr = iopte_to_pfn(blk_pte, data) << data->pg_shift; + blk_paddr = iopte_to_paddr(blk_pte, data); pte = iopte_prot(blk_pte); for (i = 0; i < tablesz / sizeof(pte); i++, blk_paddr += split_sz) { @@ -560,9 +577,9 @@ static int arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data, return size; } -static int __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, - unsigned long iova, size_t size, int lvl, - arm_lpae_iopte *ptep) +static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, + unsigned long iova, size_t size, int lvl, + arm_lpae_iopte *ptep) { arm_lpae_iopte pte; struct io_pgtable *iop = &data->iop; @@ -606,8 +623,8 @@ static int __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, return __arm_lpae_unmap(data, iova, size, lvl + 1, ptep); } -static int arm_lpae_unmap(struct io_pgtable_ops *ops, unsigned long iova, - size_t size) +static size_t arm_lpae_unmap(struct io_pgtable_ops *ops, unsigned long iova, + size_t size) { struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops); arm_lpae_iopte *ptep = data->pgd; @@ -652,12 +669,13 @@ static phys_addr_t arm_lpae_iova_to_phys(struct io_pgtable_ops *ops, found_translation: iova &= (ARM_LPAE_BLOCK_SIZE(lvl, data) - 1); - return ((phys_addr_t)iopte_to_pfn(pte,data) << data->pg_shift) | iova; + return iopte_to_paddr(pte, data) | iova; } static void arm_lpae_restrict_pgsizes(struct io_pgtable_cfg *cfg) { - unsigned long granule; + unsigned long granule, page_sizes; + unsigned int max_addr_bits = 48; /* * We need to restrict the supported page sizes to match the @@ -677,17 +695,24 @@ static void arm_lpae_restrict_pgsizes(struct io_pgtable_cfg *cfg) switch (granule) { case SZ_4K: - cfg->pgsize_bitmap &= (SZ_4K | SZ_2M | SZ_1G); + page_sizes = (SZ_4K | SZ_2M | SZ_1G); break; case SZ_16K: - 
cfg->pgsize_bitmap &= (SZ_16K | SZ_32M); + page_sizes = (SZ_16K | SZ_32M); break; case SZ_64K: - cfg->pgsize_bitmap &= (SZ_64K | SZ_512M); + max_addr_bits = 52; + page_sizes = (SZ_64K | SZ_512M); + if (cfg->oas > 48) + page_sizes |= 1ULL << 42; /* 4TB */ break; default: - cfg->pgsize_bitmap = 0; + page_sizes = 0; } + + cfg->pgsize_bitmap &= page_sizes; + cfg->ias = min(cfg->ias, max_addr_bits); + cfg->oas = min(cfg->oas, max_addr_bits); } static struct arm_lpae_io_pgtable * @@ -784,6 +809,9 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie) case 48: reg |= (ARM_LPAE_TCR_PS_48_BIT << ARM_LPAE_TCR_IPS_SHIFT); break; + case 52: + reg |= (ARM_LPAE_TCR_PS_52_BIT << ARM_LPAE_TCR_IPS_SHIFT); + break; default: goto out_free_data; } @@ -891,6 +919,9 @@ arm_64_lpae_alloc_pgtable_s2(struct io_pgtable_cfg *cfg, void *cookie) case 48: reg |= (ARM_LPAE_TCR_PS_48_BIT << ARM_LPAE_TCR_PS_SHIFT); break; + case 52: + reg |= (ARM_LPAE_TCR_PS_52_BIT << ARM_LPAE_TCR_PS_SHIFT); + break; default: goto out_free_data; } diff --git a/drivers/iommu/io-pgtable.h b/drivers/iommu/io-pgtable.h index cd2e1eafffe6..2df79093cad9 100644 --- a/drivers/iommu/io-pgtable.h +++ b/drivers/iommu/io-pgtable.h @@ -119,8 +119,8 @@ struct io_pgtable_cfg { struct io_pgtable_ops { int (*map)(struct io_pgtable_ops *ops, unsigned long iova, phys_addr_t paddr, size_t size, int prot); - int (*unmap)(struct io_pgtable_ops *ops, unsigned long iova, - size_t size); + size_t (*unmap)(struct io_pgtable_ops *ops, unsigned long iova, + size_t size); phys_addr_t (*iova_to_phys)(struct io_pgtable_ops *ops, unsigned long iova); }; diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 69fef991c651..d2aa23202bb9 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1573,10 +1573,10 @@ static size_t __iommu_unmap(struct iommu_domain *domain, if (unlikely(ops->unmap == NULL || domain->pgsize_bitmap == 0UL)) - return -ENODEV; + return 0; if (unlikely(!(domain->type & 
__IOMMU_DOMAIN_PAGING))) - return -EINVAL; + return 0; /* find out the minimum page size supported */ min_pagesz = 1 << __ffs(domain->pgsize_bitmap); @@ -1589,7 +1589,7 @@ static size_t __iommu_unmap(struct iommu_domain *domain, if (!IS_ALIGNED(iova | size, min_pagesz)) { pr_err("unaligned: iova 0x%lx size 0x%zx min_pagesz 0x%x\n", iova, size, min_pagesz); - return -EINVAL; + return 0; } pr_debug("unmap this: iova 0x%lx size 0x%zx\n", iova, size); diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index f227d73e7bf6..f2832a10fcea 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -60,7 +60,7 @@ (((prot) & 0x3) << F_MMU_TF_PROTECT_SEL_SHIFT(data)) #define REG_MMU_IVRP_PADDR 0x114 -#define F_MMU_IVRP_PA_SET(pa, ext) (((pa) >> 1) | ((!!(ext)) << 31)) + #define REG_MMU_VLD_PA_RNG 0x118 #define F_MMU_VLD_PA_RNG(EA, SA) (((EA) << 8) | (SA)) @@ -539,8 +539,13 @@ static int mtk_iommu_hw_init(const struct mtk_iommu_data *data) F_INT_PRETETCH_TRANSATION_FIFO_FAULT; writel_relaxed(regval, data->base + REG_MMU_INT_MAIN_CONTROL); - writel_relaxed(F_MMU_IVRP_PA_SET(data->protect_base, data->enable_4GB), - data->base + REG_MMU_IVRP_PADDR); + if (data->m4u_plat == M4U_MT8173) + regval = (data->protect_base >> 1) | (data->enable_4GB << 31); + else + regval = lower_32_bits(data->protect_base) | + upper_32_bits(data->protect_base); + writel_relaxed(regval, data->base + REG_MMU_IVRP_PADDR); + if (data->enable_4GB && data->m4u_plat != M4U_MT8173) { /* * If 4GB mode is enabled, the validate PA range is from @@ -695,6 +700,7 @@ static int __maybe_unused mtk_iommu_suspend(struct device *dev) reg->ctrl_reg = readl_relaxed(base + REG_MMU_CTRL_REG); reg->int_control0 = readl_relaxed(base + REG_MMU_INT_CONTROL0); reg->int_main_control = readl_relaxed(base + REG_MMU_INT_MAIN_CONTROL); + reg->ivrp_paddr = readl_relaxed(base + REG_MMU_IVRP_PADDR); clk_disable_unprepare(data->bclk); return 0; } @@ -717,8 +723,7 @@ static int __maybe_unused 
mtk_iommu_resume(struct device *dev) writel_relaxed(reg->ctrl_reg, base + REG_MMU_CTRL_REG); writel_relaxed(reg->int_control0, base + REG_MMU_INT_CONTROL0); writel_relaxed(reg->int_main_control, base + REG_MMU_INT_MAIN_CONTROL); - writel_relaxed(F_MMU_IVRP_PA_SET(data->protect_base, data->enable_4GB), - base + REG_MMU_IVRP_PADDR); + writel_relaxed(reg->ivrp_paddr, base + REG_MMU_IVRP_PADDR); if (data->m4u_dom) writel(data->m4u_dom->cfg.arm_v7s_cfg.ttbr[0], base + REG_MMU_PT_BASE_ADDR); diff --git a/drivers/iommu/mtk_iommu.h b/drivers/iommu/mtk_iommu.h index b4451a1c7c2f..778498b8633f 100644 --- a/drivers/iommu/mtk_iommu.h +++ b/drivers/iommu/mtk_iommu.h @@ -32,6 +32,7 @@ struct mtk_iommu_suspend_reg { u32 ctrl_reg; u32 int_control0; u32 int_main_control; + u32 ivrp_paddr; }; enum mtk_iommu_plat { diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c index 5a96fd14ac22..a7c2a973784f 100644 --- a/drivers/iommu/mtk_iommu_v1.c +++ b/drivers/iommu/mtk_iommu_v1.c @@ -417,20 +417,12 @@ static int mtk_iommu_create_mapping(struct device *dev, m4udev->archdata.iommu = mtk_mapping; } - ret = arm_iommu_attach_device(dev, mtk_mapping); - if (ret) - goto err_release_mapping; - return 0; - -err_release_mapping: - arm_iommu_release_mapping(mtk_mapping); - m4udev->archdata.iommu = NULL; - return ret; } static int mtk_iommu_add_device(struct device *dev) { + struct dma_iommu_mapping *mtk_mapping; struct of_phandle_args iommu_spec; struct of_phandle_iterator it; struct mtk_iommu_data *data; @@ -451,15 +443,30 @@ static int mtk_iommu_add_device(struct device *dev) if (!dev->iommu_fwspec || dev->iommu_fwspec->ops != &mtk_iommu_ops) return -ENODEV; /* Not a iommu client device */ - data = dev->iommu_fwspec->iommu_priv; - iommu_device_link(&data->iommu, dev); - - group = iommu_group_get_for_dev(dev); + /* + * This is a short-term bodge because the ARM DMA code doesn't + * understand multi-device groups, but we have to call into it + * successfully (and not just rely on 
a normal IOMMU API attach + * here) in order to set the correct DMA API ops on @dev. + */ + group = iommu_group_alloc(); if (IS_ERR(group)) return PTR_ERR(group); + err = iommu_group_add_device(group, dev); iommu_group_put(group); - return 0; + if (err) + return err; + + data = dev->iommu_fwspec->iommu_priv; + mtk_mapping = data->dev->archdata.iommu; + err = arm_iommu_attach_device(dev, mtk_mapping); + if (err) { + iommu_group_remove_device(dev); + return err; + } + + return iommu_device_link(&data->iommu, dev); } static void mtk_iommu_remove_device(struct device *dev) @@ -476,24 +483,6 @@ static void mtk_iommu_remove_device(struct device *dev) iommu_fwspec_free(dev); } -static struct iommu_group *mtk_iommu_device_group(struct device *dev) -{ - struct mtk_iommu_data *data = dev->iommu_fwspec->iommu_priv; - - if (!data) - return ERR_PTR(-ENODEV); - - /* All the client devices are in the same m4u iommu-group */ - if (!data->m4u_group) { - data->m4u_group = iommu_group_alloc(); - if (IS_ERR(data->m4u_group)) - dev_err(dev, "Failed to allocate M4U IOMMU group\n"); - } else { - iommu_group_ref_get(data->m4u_group); - } - return data->m4u_group; -} - static int mtk_iommu_hw_init(const struct mtk_iommu_data *data) { u32 regval; @@ -546,7 +535,6 @@ static struct iommu_ops mtk_iommu_ops = { .iova_to_phys = mtk_iommu_iova_to_phys, .add_device = mtk_iommu_add_device, .remove_device = mtk_iommu_remove_device, - .device_group = mtk_iommu_device_group, .pgsize_bitmap = ~0UL << MT2701_IOMMU_PAGE_SHIFT, }; diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c index e135ab830ebf..c33b7b104e72 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -1536,7 +1536,7 @@ static struct iommu_group *omap_iommu_device_group(struct device *dev) struct iommu_group *group = ERR_PTR(-EINVAL); if (arch_data->iommu_dev) - group = arch_data->iommu_dev->group; + group = iommu_group_ref_get(arch_data->iommu_dev->group); return group; } diff --git 
a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c index 9d991c2d8767..5fc8656c60f9 100644 --- a/drivers/iommu/rockchip-iommu.c +++ b/drivers/iommu/rockchip-iommu.c @@ -4,6 +4,7 @@ * published by the Free Software Foundation. */ +#include <linux/clk.h> #include <linux/compiler.h> #include <linux/delay.h> #include <linux/device.h> @@ -13,13 +14,15 @@ #include <linux/interrupt.h> #include <linux/io.h> #include <linux/iommu.h> -#include <linux/jiffies.h> +#include <linux/iopoll.h> #include <linux/list.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/of.h> +#include <linux/of_iommu.h> #include <linux/of_platform.h> #include <linux/platform_device.h> +#include <linux/pm_runtime.h> #include <linux/slab.h> #include <linux/spinlock.h> @@ -36,7 +39,10 @@ #define RK_MMU_AUTO_GATING 0x24 #define DTE_ADDR_DUMMY 0xCAFEBABE -#define FORCE_RESET_TIMEOUT 100 /* ms */ + +#define RK_MMU_POLL_PERIOD_US 100 +#define RK_MMU_FORCE_RESET_TIMEOUT_US 100000 +#define RK_MMU_POLL_TIMEOUT_US 1000 /* RK_MMU_STATUS fields */ #define RK_MMU_STATUS_PAGING_ENABLED BIT(0) @@ -73,11 +79,8 @@ */ #define RK_IOMMU_PGSIZE_BITMAP 0x007ff000 -#define IOMMU_REG_POLL_COUNT_FAST 1000 - struct rk_iommu_domain { struct list_head iommus; - struct platform_device *pdev; u32 *dt; /* page directory table */ dma_addr_t dt_dma; spinlock_t iommus_lock; /* lock for iommus list */ @@ -86,24 +89,37 @@ struct rk_iommu_domain { struct iommu_domain domain; }; +/* list of clocks required by IOMMU */ +static const char * const rk_iommu_clocks[] = { + "aclk", "iface", +}; + struct rk_iommu { struct device *dev; void __iomem **bases; int num_mmu; - int *irq; - int num_irq; + struct clk_bulk_data *clocks; + int num_clocks; bool reset_disabled; struct iommu_device iommu; struct list_head node; /* entry in rk_iommu_domain.iommus */ struct iommu_domain *domain; /* domain to which iommu is attached */ + struct iommu_group *group; +}; + +struct rk_iommudata { + struct device_link *link; /* runtime PM 
link from IOMMU to master */ + struct rk_iommu *iommu; }; +static struct device *dma_dev; + static inline void rk_table_flush(struct rk_iommu_domain *dom, dma_addr_t dma, unsigned int count) { size_t size = count * sizeof(u32); /* count of u32 entry */ - dma_sync_single_for_device(&dom->pdev->dev, dma, size, DMA_TO_DEVICE); + dma_sync_single_for_device(dma_dev, dma, size, DMA_TO_DEVICE); } static struct rk_iommu_domain *to_rk_domain(struct iommu_domain *dom) @@ -111,27 +127,6 @@ static struct rk_iommu_domain *to_rk_domain(struct iommu_domain *dom) return container_of(dom, struct rk_iommu_domain, domain); } -/** - * Inspired by _wait_for in intel_drv.h - * This is NOT safe for use in interrupt context. - * - * Note that it's important that we check the condition again after having - * timed out, since the timeout could be due to preemption or similar and - * we've never had a chance to check the condition before the timeout. - */ -#define rk_wait_for(COND, MS) ({ \ - unsigned long timeout__ = jiffies + msecs_to_jiffies(MS) + 1; \ - int ret__ = 0; \ - while (!(COND)) { \ - if (time_after(jiffies, timeout__)) { \ - ret__ = (COND) ? 0 : -ETIMEDOUT; \ - break; \ - } \ - usleep_range(50, 100); \ - } \ - ret__; \ -}) - /* * The Rockchip rk3288 iommu uses a 2-level page table. * The first level is the "Directory Table" (DT). @@ -296,19 +291,21 @@ static void rk_iommu_base_command(void __iomem *base, u32 command) { writel(command, base + RK_MMU_COMMAND); } -static void rk_iommu_zap_lines(struct rk_iommu *iommu, dma_addr_t iova, +static void rk_iommu_zap_lines(struct rk_iommu *iommu, dma_addr_t iova_start, size_t size) { int i; - - dma_addr_t iova_end = iova + size; + dma_addr_t iova_end = iova_start + size; /* * TODO(djkurtz): Figure out when it is more efficient to shootdown the * entire iotlb rather than iterate over individual iovas. 
*/ - for (i = 0; i < iommu->num_mmu; i++) - for (; iova < iova_end; iova += SPAGE_SIZE) + for (i = 0; i < iommu->num_mmu; i++) { + dma_addr_t iova; + + for (iova = iova_start; iova < iova_end; iova += SPAGE_SIZE) rk_iommu_write(iommu->bases[i], RK_MMU_ZAP_ONE_LINE, iova); + } } static bool rk_iommu_is_stall_active(struct rk_iommu *iommu) @@ -335,9 +332,21 @@ static bool rk_iommu_is_paging_enabled(struct rk_iommu *iommu) return enable; } +static bool rk_iommu_is_reset_done(struct rk_iommu *iommu) +{ + bool done = true; + int i; + + for (i = 0; i < iommu->num_mmu; i++) + done &= rk_iommu_read(iommu->bases[i], RK_MMU_DTE_ADDR) == 0; + + return done; +} + static int rk_iommu_enable_stall(struct rk_iommu *iommu) { int ret, i; + bool val; if (rk_iommu_is_stall_active(iommu)) return 0; @@ -348,7 +357,9 @@ static int rk_iommu_enable_stall(struct rk_iommu *iommu) rk_iommu_command(iommu, RK_MMU_CMD_ENABLE_STALL); - ret = rk_wait_for(rk_iommu_is_stall_active(iommu), 1); + ret = readx_poll_timeout(rk_iommu_is_stall_active, iommu, val, + val, RK_MMU_POLL_PERIOD_US, + RK_MMU_POLL_TIMEOUT_US); if (ret) for (i = 0; i < iommu->num_mmu; i++) dev_err(iommu->dev, "Enable stall request timed out, status: %#08x\n", @@ -360,13 +371,16 @@ static int rk_iommu_enable_stall(struct rk_iommu *iommu) static int rk_iommu_disable_stall(struct rk_iommu *iommu) { int ret, i; + bool val; if (!rk_iommu_is_stall_active(iommu)) return 0; rk_iommu_command(iommu, RK_MMU_CMD_DISABLE_STALL); - ret = rk_wait_for(!rk_iommu_is_stall_active(iommu), 1); + ret = readx_poll_timeout(rk_iommu_is_stall_active, iommu, val, + !val, RK_MMU_POLL_PERIOD_US, + RK_MMU_POLL_TIMEOUT_US); if (ret) for (i = 0; i < iommu->num_mmu; i++) dev_err(iommu->dev, "Disable stall request timed out, status: %#08x\n", @@ -378,13 +392,16 @@ static int rk_iommu_disable_stall(struct rk_iommu *iommu) static int rk_iommu_enable_paging(struct rk_iommu *iommu) { int ret, i; + bool val; if (rk_iommu_is_paging_enabled(iommu)) return 0; 
rk_iommu_command(iommu, RK_MMU_CMD_ENABLE_PAGING); - ret = rk_wait_for(rk_iommu_is_paging_enabled(iommu), 1); + ret = readx_poll_timeout(rk_iommu_is_paging_enabled, iommu, val, + val, RK_MMU_POLL_PERIOD_US, + RK_MMU_POLL_TIMEOUT_US); if (ret) for (i = 0; i < iommu->num_mmu; i++) dev_err(iommu->dev, "Enable paging request timed out, status: %#08x\n", @@ -396,13 +413,16 @@ static int rk_iommu_enable_paging(struct rk_iommu *iommu) static int rk_iommu_disable_paging(struct rk_iommu *iommu) { int ret, i; + bool val; if (!rk_iommu_is_paging_enabled(iommu)) return 0; rk_iommu_command(iommu, RK_MMU_CMD_DISABLE_PAGING); - ret = rk_wait_for(!rk_iommu_is_paging_enabled(iommu), 1); + ret = readx_poll_timeout(rk_iommu_is_paging_enabled, iommu, val, + !val, RK_MMU_POLL_PERIOD_US, + RK_MMU_POLL_TIMEOUT_US); if (ret) for (i = 0; i < iommu->num_mmu; i++) dev_err(iommu->dev, "Disable paging request timed out, status: %#08x\n", @@ -415,6 +435,7 @@ static int rk_iommu_force_reset(struct rk_iommu *iommu) { int ret, i; u32 dte_addr; + bool val; if (iommu->reset_disabled) return 0; @@ -435,13 +456,12 @@ static int rk_iommu_force_reset(struct rk_iommu *iommu) rk_iommu_command(iommu, RK_MMU_CMD_FORCE_RESET); - for (i = 0; i < iommu->num_mmu; i++) { - ret = rk_wait_for(rk_iommu_read(iommu->bases[i], RK_MMU_DTE_ADDR) == 0x00000000, - FORCE_RESET_TIMEOUT); - if (ret) { - dev_err(iommu->dev, "FORCE_RESET command timed out\n"); - return ret; - } + ret = readx_poll_timeout(rk_iommu_is_reset_done, iommu, val, + val, RK_MMU_POLL_PERIOD_US, + RK_MMU_FORCE_RESET_TIMEOUT_US); + if (ret) { + dev_err(iommu->dev, "FORCE_RESET command timed out\n"); + return ret; } return 0; @@ -503,6 +523,12 @@ static irqreturn_t rk_iommu_irq(int irq, void *dev_id) irqreturn_t ret = IRQ_NONE; int i; + if (WARN_ON(!pm_runtime_get_if_in_use(iommu->dev))) + return IRQ_NONE; + + if (WARN_ON(clk_bulk_enable(iommu->num_clocks, iommu->clocks))) + goto out; + for (i = 0; i < iommu->num_mmu; i++) { int_status = 
rk_iommu_read(iommu->bases[i], RK_MMU_INT_STATUS); if (int_status == 0) @@ -549,6 +575,10 @@ static irqreturn_t rk_iommu_irq(int irq, void *dev_id) rk_iommu_write(iommu->bases[i], RK_MMU_INT_CLEAR, int_status); } + clk_bulk_disable(iommu->num_clocks, iommu->clocks); + +out: + pm_runtime_put(iommu->dev); return ret; } @@ -590,8 +620,17 @@ static void rk_iommu_zap_iova(struct rk_iommu_domain *rk_domain, spin_lock_irqsave(&rk_domain->iommus_lock, flags); list_for_each(pos, &rk_domain->iommus) { struct rk_iommu *iommu; + iommu = list_entry(pos, struct rk_iommu, node); - rk_iommu_zap_lines(iommu, iova, size); + + /* Only zap TLBs of IOMMUs that are powered on. */ + if (pm_runtime_get_if_in_use(iommu->dev)) { + WARN_ON(clk_bulk_enable(iommu->num_clocks, + iommu->clocks)); + rk_iommu_zap_lines(iommu, iova, size); + clk_bulk_disable(iommu->num_clocks, iommu->clocks); + pm_runtime_put(iommu->dev); + } } spin_unlock_irqrestore(&rk_domain->iommus_lock, flags); } @@ -608,7 +647,6 @@ static void rk_iommu_zap_iova_first_last(struct rk_iommu_domain *rk_domain, static u32 *rk_dte_get_page_table(struct rk_iommu_domain *rk_domain, dma_addr_t iova) { - struct device *dev = &rk_domain->pdev->dev; u32 *page_table, *dte_addr; u32 dte_index, dte; phys_addr_t pt_phys; @@ -626,9 +664,9 @@ static u32 *rk_dte_get_page_table(struct rk_iommu_domain *rk_domain, if (!page_table) return ERR_PTR(-ENOMEM); - pt_dma = dma_map_single(dev, page_table, SPAGE_SIZE, DMA_TO_DEVICE); - if (dma_mapping_error(dev, pt_dma)) { - dev_err(dev, "DMA mapping error while allocating page table\n"); + pt_dma = dma_map_single(dma_dev, page_table, SPAGE_SIZE, DMA_TO_DEVICE); + if (dma_mapping_error(dma_dev, pt_dma)) { + dev_err(dma_dev, "DMA mapping error while allocating page table\n"); free_page((unsigned long)page_table); return ERR_PTR(-ENOMEM); } @@ -790,52 +828,46 @@ static size_t rk_iommu_unmap(struct iommu_domain *domain, unsigned long _iova, static struct rk_iommu *rk_iommu_from_dev(struct device *dev) { - 
struct iommu_group *group; - struct device *iommu_dev; - struct rk_iommu *rk_iommu; + struct rk_iommudata *data = dev->archdata.iommu; - group = iommu_group_get(dev); - if (!group) - return NULL; - iommu_dev = iommu_group_get_iommudata(group); - rk_iommu = dev_get_drvdata(iommu_dev); - iommu_group_put(group); + return data ? data->iommu : NULL; +} + +/* Must be called with iommu powered on and attached */ +static void rk_iommu_disable(struct rk_iommu *iommu) +{ + int i; - return rk_iommu; + /* Ignore error while disabling, just keep going */ + WARN_ON(clk_bulk_enable(iommu->num_clocks, iommu->clocks)); + rk_iommu_enable_stall(iommu); + rk_iommu_disable_paging(iommu); + for (i = 0; i < iommu->num_mmu; i++) { + rk_iommu_write(iommu->bases[i], RK_MMU_INT_MASK, 0); + rk_iommu_write(iommu->bases[i], RK_MMU_DTE_ADDR, 0); + } + rk_iommu_disable_stall(iommu); + clk_bulk_disable(iommu->num_clocks, iommu->clocks); } -static int rk_iommu_attach_device(struct iommu_domain *domain, - struct device *dev) +/* Must be called with iommu powered on and attached */ +static int rk_iommu_enable(struct rk_iommu *iommu) { - struct rk_iommu *iommu; + struct iommu_domain *domain = iommu->domain; struct rk_iommu_domain *rk_domain = to_rk_domain(domain); - unsigned long flags; int ret, i; - /* - * Allow 'virtual devices' (e.g., drm) to attach to domain. - * Such a device does not belong to an iommu group. 
- */ - iommu = rk_iommu_from_dev(dev); - if (!iommu) - return 0; + ret = clk_bulk_enable(iommu->num_clocks, iommu->clocks); + if (ret) + return ret; ret = rk_iommu_enable_stall(iommu); if (ret) - return ret; + goto out_disable_clocks; ret = rk_iommu_force_reset(iommu); if (ret) - return ret; - - iommu->domain = domain; - - for (i = 0; i < iommu->num_irq; i++) { - ret = devm_request_irq(iommu->dev, iommu->irq[i], rk_iommu_irq, - IRQF_SHARED, dev_name(dev), iommu); - if (ret) - return ret; - } + goto out_disable_stall; for (i = 0; i < iommu->num_mmu; i++) { rk_iommu_write(iommu->bases[i], RK_MMU_DTE_ADDR, @@ -845,18 +877,12 @@ static int rk_iommu_attach_device(struct iommu_domain *domain, } ret = rk_iommu_enable_paging(iommu); - if (ret) - return ret; - - spin_lock_irqsave(&rk_domain->iommus_lock, flags); - list_add_tail(&iommu->node, &rk_domain->iommus); - spin_unlock_irqrestore(&rk_domain->iommus_lock, flags); - - dev_dbg(dev, "Attached to iommu domain\n"); +out_disable_stall: rk_iommu_disable_stall(iommu); - - return 0; +out_disable_clocks: + clk_bulk_disable(iommu->num_clocks, iommu->clocks); + return ret; } static void rk_iommu_detach_device(struct iommu_domain *domain, @@ -865,60 +891,90 @@ static void rk_iommu_detach_device(struct iommu_domain *domain, struct rk_iommu *iommu; struct rk_iommu_domain *rk_domain = to_rk_domain(domain); unsigned long flags; - int i; /* Allow 'virtual devices' (eg drm) to detach from domain */ iommu = rk_iommu_from_dev(dev); if (!iommu) return; + dev_dbg(dev, "Detaching from iommu domain\n"); + + /* iommu already detached */ + if (iommu->domain != domain) + return; + + iommu->domain = NULL; + spin_lock_irqsave(&rk_domain->iommus_lock, flags); list_del_init(&iommu->node); spin_unlock_irqrestore(&rk_domain->iommus_lock, flags); - /* Ignore error while disabling, just keep going */ - rk_iommu_enable_stall(iommu); - rk_iommu_disable_paging(iommu); - for (i = 0; i < iommu->num_mmu; i++) { - rk_iommu_write(iommu->bases[i], 
RK_MMU_INT_MASK, 0); - rk_iommu_write(iommu->bases[i], RK_MMU_DTE_ADDR, 0); + if (pm_runtime_get_if_in_use(iommu->dev)) { + rk_iommu_disable(iommu); + pm_runtime_put(iommu->dev); } - rk_iommu_disable_stall(iommu); +} - for (i = 0; i < iommu->num_irq; i++) - devm_free_irq(iommu->dev, iommu->irq[i], iommu); +static int rk_iommu_attach_device(struct iommu_domain *domain, + struct device *dev) +{ + struct rk_iommu *iommu; + struct rk_iommu_domain *rk_domain = to_rk_domain(domain); + unsigned long flags; + int ret; - iommu->domain = NULL; + /* + * Allow 'virtual devices' (e.g., drm) to attach to domain. + * Such a device does not belong to an iommu group. + */ + iommu = rk_iommu_from_dev(dev); + if (!iommu) + return 0; + + dev_dbg(dev, "Attaching to iommu domain\n"); + + /* iommu already attached */ + if (iommu->domain == domain) + return 0; - dev_dbg(dev, "Detached from iommu domain\n"); + if (iommu->domain) + rk_iommu_detach_device(iommu->domain, dev); + + iommu->domain = domain; + + spin_lock_irqsave(&rk_domain->iommus_lock, flags); + list_add_tail(&iommu->node, &rk_domain->iommus); + spin_unlock_irqrestore(&rk_domain->iommus_lock, flags); + + if (!pm_runtime_get_if_in_use(iommu->dev)) + return 0; + + ret = rk_iommu_enable(iommu); + if (ret) + rk_iommu_detach_device(iommu->domain, dev); + + pm_runtime_put(iommu->dev); + + return ret; } static struct iommu_domain *rk_iommu_domain_alloc(unsigned type) { struct rk_iommu_domain *rk_domain; - struct platform_device *pdev; - struct device *iommu_dev; if (type != IOMMU_DOMAIN_UNMANAGED && type != IOMMU_DOMAIN_DMA) return NULL; - /* Register a pdev per domain, so DMA API can base on this *dev - * even some virtual master doesn't have an iommu slave - */ - pdev = platform_device_register_simple("rk_iommu_domain", - PLATFORM_DEVID_AUTO, NULL, 0); - if (IS_ERR(pdev)) + if (!dma_dev) return NULL; - rk_domain = devm_kzalloc(&pdev->dev, sizeof(*rk_domain), GFP_KERNEL); + rk_domain = devm_kzalloc(dma_dev, sizeof(*rk_domain), 
GFP_KERNEL); if (!rk_domain) - goto err_unreg_pdev; - - rk_domain->pdev = pdev; + return NULL; if (type == IOMMU_DOMAIN_DMA && iommu_get_dma_cookie(&rk_domain->domain)) - goto err_unreg_pdev; + return NULL; /* * rk32xx iommus use a 2 level pagetable. @@ -929,11 +985,10 @@ static struct iommu_domain *rk_iommu_domain_alloc(unsigned type) if (!rk_domain->dt) goto err_put_cookie; - iommu_dev = &pdev->dev; - rk_domain->dt_dma = dma_map_single(iommu_dev, rk_domain->dt, + rk_domain->dt_dma = dma_map_single(dma_dev, rk_domain->dt, SPAGE_SIZE, DMA_TO_DEVICE); - if (dma_mapping_error(iommu_dev, rk_domain->dt_dma)) { - dev_err(iommu_dev, "DMA map error for DT\n"); + if (dma_mapping_error(dma_dev, rk_domain->dt_dma)) { + dev_err(dma_dev, "DMA map error for DT\n"); goto err_free_dt; } @@ -954,8 +1009,6 @@ err_free_dt: err_put_cookie: if (type == IOMMU_DOMAIN_DMA) iommu_put_dma_cookie(&rk_domain->domain); -err_unreg_pdev: - platform_device_unregister(pdev); return NULL; } @@ -972,126 +1025,82 @@ static void rk_iommu_domain_free(struct iommu_domain *domain) if (rk_dte_is_pt_valid(dte)) { phys_addr_t pt_phys = rk_dte_pt_address(dte); u32 *page_table = phys_to_virt(pt_phys); - dma_unmap_single(&rk_domain->pdev->dev, pt_phys, + dma_unmap_single(dma_dev, pt_phys, SPAGE_SIZE, DMA_TO_DEVICE); free_page((unsigned long)page_table); } } - dma_unmap_single(&rk_domain->pdev->dev, rk_domain->dt_dma, + dma_unmap_single(dma_dev, rk_domain->dt_dma, SPAGE_SIZE, DMA_TO_DEVICE); free_page((unsigned long)rk_domain->dt); if (domain->type == IOMMU_DOMAIN_DMA) iommu_put_dma_cookie(&rk_domain->domain); - - platform_device_unregister(rk_domain->pdev); } -static bool rk_iommu_is_dev_iommu_master(struct device *dev) +static int rk_iommu_add_device(struct device *dev) { - struct device_node *np = dev->of_node; - int ret; - - /* - * An iommu master has an iommus property containing a list of phandles - * to iommu nodes, each with an #iommu-cells property with value 0. 
- */ - ret = of_count_phandle_with_args(np, "iommus", "#iommu-cells"); - return (ret > 0); -} + struct iommu_group *group; + struct rk_iommu *iommu; + struct rk_iommudata *data; -static int rk_iommu_group_set_iommudata(struct iommu_group *group, - struct device *dev) -{ - struct device_node *np = dev->of_node; - struct platform_device *pd; - int ret; - struct of_phandle_args args; + data = dev->archdata.iommu; + if (!data) + return -ENODEV; - /* - * An iommu master has an iommus property containing a list of phandles - * to iommu nodes, each with an #iommu-cells property with value 0. - */ - ret = of_parse_phandle_with_args(np, "iommus", "#iommu-cells", 0, - &args); - if (ret) { - dev_err(dev, "of_parse_phandle_with_args(%pOF) => %d\n", - np, ret); - return ret; - } - if (args.args_count != 0) { - dev_err(dev, "incorrect number of iommu params found for %pOF (found %d, expected 0)\n", - args.np, args.args_count); - return -EINVAL; - } + iommu = rk_iommu_from_dev(dev); - pd = of_find_device_by_node(args.np); - of_node_put(args.np); - if (!pd) { - dev_err(dev, "iommu %pOF not found\n", args.np); - return -EPROBE_DEFER; - } + group = iommu_group_get_for_dev(dev); + if (IS_ERR(group)) + return PTR_ERR(group); + iommu_group_put(group); - /* TODO(djkurtz): handle multiple slave iommus for a single master */ - iommu_group_set_iommudata(group, &pd->dev, NULL); + iommu_device_link(&iommu->iommu, dev); + data->link = device_link_add(dev, iommu->dev, DL_FLAG_PM_RUNTIME); return 0; } -static int rk_iommu_add_device(struct device *dev) +static void rk_iommu_remove_device(struct device *dev) { - struct iommu_group *group; struct rk_iommu *iommu; - int ret; - - if (!rk_iommu_is_dev_iommu_master(dev)) - return -ENODEV; + struct rk_iommudata *data = dev->archdata.iommu; - group = iommu_group_get(dev); - if (!group) { - group = iommu_group_alloc(); - if (IS_ERR(group)) { - dev_err(dev, "Failed to allocate IOMMU group\n"); - return PTR_ERR(group); - } - } + iommu = 
rk_iommu_from_dev(dev); - ret = iommu_group_add_device(group, dev); - if (ret) - goto err_put_group; + device_link_del(data->link); + iommu_device_unlink(&iommu->iommu, dev); + iommu_group_remove_device(dev); +} - ret = rk_iommu_group_set_iommudata(group, dev); - if (ret) - goto err_remove_device; +static struct iommu_group *rk_iommu_device_group(struct device *dev) +{ + struct rk_iommu *iommu; iommu = rk_iommu_from_dev(dev); - if (iommu) - iommu_device_link(&iommu->iommu, dev); - - iommu_group_put(group); - - return 0; -err_remove_device: - iommu_group_remove_device(dev); -err_put_group: - iommu_group_put(group); - return ret; + return iommu_group_ref_get(iommu->group); } -static void rk_iommu_remove_device(struct device *dev) +static int rk_iommu_of_xlate(struct device *dev, + struct of_phandle_args *args) { - struct rk_iommu *iommu; + struct platform_device *iommu_dev; + struct rk_iommudata *data; - if (!rk_iommu_is_dev_iommu_master(dev)) - return; + data = devm_kzalloc(dma_dev, sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; - iommu = rk_iommu_from_dev(dev); - if (iommu) - iommu_device_unlink(&iommu->iommu, dev); + iommu_dev = of_find_device_by_node(args->np); - iommu_group_remove_device(dev); + data->iommu = platform_get_drvdata(iommu_dev); + dev->archdata.iommu = data; + + of_dev_put(iommu_dev); + + return 0; } static const struct iommu_ops rk_iommu_ops = { @@ -1105,31 +1114,9 @@ static const struct iommu_ops rk_iommu_ops = { .add_device = rk_iommu_add_device, .remove_device = rk_iommu_remove_device, .iova_to_phys = rk_iommu_iova_to_phys, + .device_group = rk_iommu_device_group, .pgsize_bitmap = RK_IOMMU_PGSIZE_BITMAP, -}; - -static int rk_iommu_domain_probe(struct platform_device *pdev) -{ - struct device *dev = &pdev->dev; - - dev->dma_parms = devm_kzalloc(dev, sizeof(*dev->dma_parms), GFP_KERNEL); - if (!dev->dma_parms) - return -ENOMEM; - - /* Set dma_ops for dev, otherwise it would be dummy_dma_ops */ - arch_setup_dma_ops(dev, 0, 
DMA_BIT_MASK(32), NULL, false); - - dma_set_max_seg_size(dev, DMA_BIT_MASK(32)); - dma_coerce_mask_and_coherent(dev, DMA_BIT_MASK(32)); - - return 0; -} - -static struct platform_driver rk_iommu_domain_driver = { - .probe = rk_iommu_domain_probe, - .driver = { - .name = "rk_iommu_domain", - }, + .of_xlate = rk_iommu_of_xlate, }; static int rk_iommu_probe(struct platform_device *pdev) @@ -1138,7 +1125,7 @@ static int rk_iommu_probe(struct platform_device *pdev) struct rk_iommu *iommu; struct resource *res; int num_res = pdev->num_resources; - int err, i; + int err, i, irq; iommu = devm_kzalloc(dev, sizeof(*iommu), GFP_KERNEL); if (!iommu) @@ -1165,50 +1152,108 @@ static int rk_iommu_probe(struct platform_device *pdev) if (iommu->num_mmu == 0) return PTR_ERR(iommu->bases[0]); - iommu->num_irq = platform_irq_count(pdev); - if (iommu->num_irq < 0) - return iommu->num_irq; - if (iommu->num_irq == 0) - return -ENXIO; - - iommu->irq = devm_kcalloc(dev, iommu->num_irq, sizeof(*iommu->irq), - GFP_KERNEL); - if (!iommu->irq) - return -ENOMEM; + i = 0; + while ((irq = platform_get_irq(pdev, i++)) != -ENXIO) { + if (irq < 0) + return irq; - for (i = 0; i < iommu->num_irq; i++) { - iommu->irq[i] = platform_get_irq(pdev, i); - if (iommu->irq[i] < 0) { - dev_err(dev, "Failed to get IRQ, %d\n", iommu->irq[i]); - return -ENXIO; - } + err = devm_request_irq(iommu->dev, irq, rk_iommu_irq, + IRQF_SHARED, dev_name(dev), iommu); + if (err) + return err; } iommu->reset_disabled = device_property_read_bool(dev, "rockchip,disable-mmu-reset"); - err = iommu_device_sysfs_add(&iommu->iommu, dev, NULL, dev_name(dev)); + iommu->num_clocks = ARRAY_SIZE(rk_iommu_clocks); + iommu->clocks = devm_kcalloc(iommu->dev, iommu->num_clocks, + sizeof(*iommu->clocks), GFP_KERNEL); + if (!iommu->clocks) + return -ENOMEM; + + for (i = 0; i < iommu->num_clocks; ++i) + iommu->clocks[i].id = rk_iommu_clocks[i]; + + err = devm_clk_bulk_get(iommu->dev, iommu->num_clocks, iommu->clocks); + if (err) + return err; + 
+ err = clk_bulk_prepare(iommu->num_clocks, iommu->clocks); if (err) return err; + iommu->group = iommu_group_alloc(); + if (IS_ERR(iommu->group)) { + err = PTR_ERR(iommu->group); + goto err_unprepare_clocks; + } + + err = iommu_device_sysfs_add(&iommu->iommu, dev, NULL, dev_name(dev)); + if (err) + goto err_put_group; + iommu_device_set_ops(&iommu->iommu, &rk_iommu_ops); + iommu_device_set_fwnode(&iommu->iommu, &dev->of_node->fwnode); + err = iommu_device_register(&iommu->iommu); + if (err) + goto err_remove_sysfs; + + /* + * Use the first registered IOMMU device for domain to use with DMA + * API, since a domain might not physically correspond to a single + * IOMMU device.. + */ + if (!dma_dev) + dma_dev = &pdev->dev; + + bus_set_iommu(&platform_bus_type, &rk_iommu_ops); + pm_runtime_enable(dev); + + return 0; +err_remove_sysfs: + iommu_device_sysfs_remove(&iommu->iommu); +err_put_group: + iommu_group_put(iommu->group); +err_unprepare_clocks: + clk_bulk_unprepare(iommu->num_clocks, iommu->clocks); return err; } -static int rk_iommu_remove(struct platform_device *pdev) +static void rk_iommu_shutdown(struct platform_device *pdev) { - struct rk_iommu *iommu = platform_get_drvdata(pdev); + pm_runtime_force_suspend(&pdev->dev); +} - if (iommu) { - iommu_device_sysfs_remove(&iommu->iommu); - iommu_device_unregister(&iommu->iommu); - } +static int __maybe_unused rk_iommu_suspend(struct device *dev) +{ + struct rk_iommu *iommu = dev_get_drvdata(dev); + if (!iommu->domain) + return 0; + + rk_iommu_disable(iommu); return 0; } +static int __maybe_unused rk_iommu_resume(struct device *dev) +{ + struct rk_iommu *iommu = dev_get_drvdata(dev); + + if (!iommu->domain) + return 0; + + return rk_iommu_enable(iommu); +} + +static const struct dev_pm_ops rk_iommu_pm_ops = { + SET_RUNTIME_PM_OPS(rk_iommu_suspend, rk_iommu_resume, NULL) + SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, + pm_runtime_force_resume) +}; + static const struct of_device_id rk_iommu_dt_ids[] = { { 
.compatible = "rockchip,iommu" }, { /* sentinel */ } @@ -1217,45 +1262,22 @@ MODULE_DEVICE_TABLE(of, rk_iommu_dt_ids); static struct platform_driver rk_iommu_driver = { .probe = rk_iommu_probe, - .remove = rk_iommu_remove, + .shutdown = rk_iommu_shutdown, .driver = { .name = "rk_iommu", .of_match_table = rk_iommu_dt_ids, + .pm = &rk_iommu_pm_ops, + .suppress_bind_attrs = true, }, }; static int __init rk_iommu_init(void) { - struct device_node *np; - int ret; - - np = of_find_matching_node(NULL, rk_iommu_dt_ids); - if (!np) - return 0; - - of_node_put(np); - - ret = bus_set_iommu(&platform_bus_type, &rk_iommu_ops); - if (ret) - return ret; - - ret = platform_driver_register(&rk_iommu_domain_driver); - if (ret) - return ret; - - ret = platform_driver_register(&rk_iommu_driver); - if (ret) - platform_driver_unregister(&rk_iommu_domain_driver); - return ret; + return platform_driver_register(&rk_iommu_driver); } -static void __exit rk_iommu_exit(void) -{ - platform_driver_unregister(&rk_iommu_driver); - platform_driver_unregister(&rk_iommu_domain_driver); -} - subsys_initcall(rk_iommu_init); -module_exit(rk_iommu_exit); + +IOMMU_OF_DECLARE(rk_iommu_of, "rockchip,iommu"); MODULE_DESCRIPTION("IOMMU API for Rockchip"); MODULE_AUTHOR("Simon Xue <[email protected]> and Daniel Kurtz <[email protected]>"); diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 2982e93d2369..5416f2b2ac21 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -3612,7 +3612,8 @@ static int __init gic_acpi_parse_madt_its(struct acpi_subtable_header *header, return -ENOMEM; } - err = iort_register_domain_token(its_entry->translation_id, dom_handle); + err = iort_register_domain_token(its_entry->translation_id, res.start, + dom_handle); if (err) { pr_err("ITS@%pa: Unable to register GICv3 ITS domain token (ITS ID %d) to IORT\n", &res.start, its_entry->translation_id); diff --git a/drivers/misc/kgdbts.c b/drivers/misc/kgdbts.c index 
24108bfad889..6193270e7b3d 100644 --- a/drivers/misc/kgdbts.c +++ b/drivers/misc/kgdbts.c @@ -400,10 +400,14 @@ static void skip_back_repeat_test(char *arg) int go_back = simple_strtol(arg, NULL, 10); repeat_test--; - if (repeat_test <= 0) + if (repeat_test <= 0) { ts.idx++; - else + } else { + if (repeat_test % 100 == 0) + v1printk("kgdbts:RUN ... %d remaining\n", repeat_test); + ts.idx -= go_back; + } fill_get_buf(ts.tst[ts.idx].get); } diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 02485e310c81..9e923cd1d80e 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -3080,6 +3080,7 @@ static void __exit mmc_blk_exit(void) mmc_unregister_driver(&mmc_driver); unregister_blkdev(MMC_BLOCK_MAJOR, "mmc"); unregister_chrdev_region(mmc_rpmb_devt, MAX_DEVICES); + bus_unregister(&mmc_rpmb_bus_type); } module_init(mmc_blk_init); diff --git a/drivers/mmc/host/jz4740_mmc.c b/drivers/mmc/host/jz4740_mmc.c index 712e08d9a45e..a0168e9e4fce 100644 --- a/drivers/mmc/host/jz4740_mmc.c +++ b/drivers/mmc/host/jz4740_mmc.c @@ -362,9 +362,9 @@ static void jz4740_mmc_set_irq_enabled(struct jz4740_mmc_host *host, host->irq_mask &= ~irq; else host->irq_mask |= irq; - spin_unlock_irqrestore(&host->lock, flags); writew(host->irq_mask, host->base + JZ_REG_MMC_IMASK); + spin_unlock_irqrestore(&host->lock, flags); } static void jz4740_mmc_clock_enable(struct jz4740_mmc_host *host, diff --git a/drivers/mmc/host/tmio_mmc_core.c b/drivers/mmc/host/tmio_mmc_core.c index e30df9ad8197..308029930304 100644 --- a/drivers/mmc/host/tmio_mmc_core.c +++ b/drivers/mmc/host/tmio_mmc_core.c @@ -913,7 +913,7 @@ static void tmio_mmc_finish_request(struct tmio_mmc_host *host) host->check_scc_error(host); /* If SET_BLOCK_COUNT, continue with main command */ - if (host->mrq) { + if (host->mrq && !mrq->cmd->error) { tmio_process_mrq(host, mrq); return; } diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c index b1fc28f63882..d0b63bbf46a7 100644 --- 
a/drivers/mtd/ubi/block.c +++ b/drivers/mtd/ubi/block.c @@ -244,7 +244,7 @@ static int ubiblock_open(struct block_device *bdev, fmode_t mode) * in any case. */ if (mode & FMODE_WRITE) { - ret = -EPERM; + ret = -EROFS; goto out_unlock; } diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c index e941395de3ae..753494e042d5 100644 --- a/drivers/mtd/ubi/build.c +++ b/drivers/mtd/ubi/build.c @@ -854,6 +854,17 @@ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, return -EINVAL; } + /* + * Both UBI and UBIFS have been designed for SLC NAND and NOR flashes. + * MLC NAND is different and needs special care, otherwise UBI or UBIFS + * will die soon and you will lose all your data. + */ + if (mtd->type == MTD_MLCNANDFLASH) { + pr_err("ubi: refuse attaching mtd%d - MLC NAND is not supported\n", + mtd->index); + return -EINVAL; + } + if (ubi_num == UBI_DEV_NUM_AUTO) { /* Search for an empty slot in the @ubi_devices array */ for (ubi_num = 0; ubi_num < UBI_MAX_DEVICES; ubi_num++) diff --git a/drivers/mtd/ubi/fastmap-wl.c b/drivers/mtd/ubi/fastmap-wl.c index 590d967011bb..98f7d6be8d1f 100644 --- a/drivers/mtd/ubi/fastmap-wl.c +++ b/drivers/mtd/ubi/fastmap-wl.c @@ -362,7 +362,6 @@ static void ubi_fastmap_close(struct ubi_device *ubi) { int i; - flush_work(&ubi->fm_work); return_unused_pool_pebs(ubi, &ubi->fm_pool); return_unused_pool_pebs(ubi, &ubi->fm_wl_pool); diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c index c96a92118b8b..32f6d2e24d66 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c @@ -951,9 +951,11 @@ void aq_nic_shutdown(struct aq_nic_s *self) netif_device_detach(self->ndev); - err = aq_nic_stop(self); - if (err < 0) - goto err_exit; + if (netif_running(self->ndev)) { + err = aq_nic_stop(self); + if (err < 0) + goto err_exit; + } aq_nic_deinit(self); err_exit: diff --git 
a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c index 84d7f4dd4ce1..e652d86b87d4 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c @@ -48,6 +48,8 @@ #define FORCE_FLASHLESS 0 static int hw_atl_utils_ver_match(u32 ver_expected, u32 ver_actual); +static int hw_atl_utils_mpi_set_state(struct aq_hw_s *self, + enum hal_atl_utils_fw_state_e state); int hw_atl_utils_initfw(struct aq_hw_s *self, const struct aq_fw_ops **fw_ops) { @@ -247,6 +249,20 @@ int hw_atl_utils_soft_reset(struct aq_hw_s *self) self->rbl_enabled = (boot_exit_code != 0); + /* FW 1.x may bootup in an invalid POWER state (WOL feature). + * We should work around this by forcing its state back to DEINIT + */ + if (!hw_atl_utils_ver_match(HW_ATL_FW_VER_1X, + aq_hw_read_reg(self, + HW_ATL_MPI_FW_VERSION))) { + int err = 0; + + hw_atl_utils_mpi_set_state(self, MPI_DEINIT); + AQ_HW_WAIT_FOR((aq_hw_read_reg(self, HW_ATL_MPI_STATE_ADR) & + HW_ATL_MPI_STATE_MSK) == MPI_DEINIT, + 10, 1000U); + } + if (self->rbl_enabled) return hw_atl_utils_soft_reset_rbl(self); else diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 1991f0c7bc0e..f83769d8047b 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -6090,7 +6090,7 @@ static void bnxt_free_irq(struct bnxt *bp) free_irq_cpu_rmap(bp->dev->rx_cpu_rmap); bp->dev->rx_cpu_rmap = NULL; #endif - if (!bp->irq_tbl) + if (!bp->irq_tbl || !bp->bnapi) return; for (i = 0; i < bp->cp_nr_rings; i++) { @@ -7686,6 +7686,8 @@ int bnxt_check_rings(struct bnxt *bp, int tx, int rx, bool sh, int tcs, if (bp->flags & BNXT_FLAG_AGG_RINGS) rx_rings <<= 1; cp = sh ? 
max_t(int, tx_rings_needed, rx) : tx_rings_needed + rx; + if (bp->flags & BNXT_FLAG_NEW_RM) + cp += bnxt_get_ulp_msix_num(bp); return bnxt_hwrm_check_rings(bp, tx_rings_needed, rx_rings, rx, cp, vnics); } diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 8d8ccd67e0e2..1f622ca2a64f 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -870,17 +870,22 @@ static int bnxt_get_rxfh(struct net_device *dev, u32 *indir, u8 *key, u8 *hfunc) { struct bnxt *bp = netdev_priv(dev); - struct bnxt_vnic_info *vnic = &bp->vnic_info[0]; + struct bnxt_vnic_info *vnic; int i = 0; if (hfunc) *hfunc = ETH_RSS_HASH_TOP; - if (indir) + if (!bp->vnic_info) + return 0; + + vnic = &bp->vnic_info[0]; + if (indir && vnic->rss_table) { for (i = 0; i < HW_HASH_INDEX_SIZE; i++) indir[i] = le16_to_cpu(vnic->rss_table[i]); + } - if (key) + if (key && vnic->rss_hash_key) memcpy(key, vnic->rss_hash_key, HW_HASH_KEY_SIZE); return 0; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c index 65c2cee35766..795f45024c20 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c @@ -377,6 +377,30 @@ static bool is_wildcard(void *mask, int len) return true; } +static bool is_exactmatch(void *mask, int len) +{ + const u8 *p = mask; + int i; + + for (i = 0; i < len; i++) + if (p[i] != 0xff) + return false; + + return true; +} + +static bool bits_set(void *key, int len) +{ + const u8 *p = key; + int i; + + for (i = 0; i < len; i++) + if (p[i] != 0) + return true; + + return false; +} + static int bnxt_hwrm_cfa_flow_alloc(struct bnxt *bp, struct bnxt_tc_flow *flow, __le16 ref_flow_handle, __le32 tunnel_handle, __le16 *flow_handle) @@ -764,6 +788,41 @@ static bool bnxt_tc_can_offload(struct bnxt *bp, struct bnxt_tc_flow *flow) return false; } + /* Currently source/dest MAC 
cannot be partial wildcard */ + if (bits_set(&flow->l2_key.smac, sizeof(flow->l2_key.smac)) && + !is_exactmatch(flow->l2_mask.smac, sizeof(flow->l2_mask.smac))) { + netdev_info(bp->dev, "Wildcard match unsupported for Source MAC\n"); + return false; + } + if (bits_set(&flow->l2_key.dmac, sizeof(flow->l2_key.dmac)) && + !is_exactmatch(&flow->l2_mask.dmac, sizeof(flow->l2_mask.dmac))) { + netdev_info(bp->dev, "Wildcard match unsupported for Dest MAC\n"); + return false; + } + + /* Currently VLAN fields cannot be partial wildcard */ + if (bits_set(&flow->l2_key.inner_vlan_tci, + sizeof(flow->l2_key.inner_vlan_tci)) && + !is_exactmatch(&flow->l2_mask.inner_vlan_tci, + sizeof(flow->l2_mask.inner_vlan_tci))) { + netdev_info(bp->dev, "Wildcard match unsupported for VLAN TCI\n"); + return false; + } + if (bits_set(&flow->l2_key.inner_vlan_tpid, + sizeof(flow->l2_key.inner_vlan_tpid)) && + !is_exactmatch(&flow->l2_mask.inner_vlan_tpid, + sizeof(flow->l2_mask.inner_vlan_tpid))) { + netdev_info(bp->dev, "Wildcard match unsupported for VLAN TPID\n"); + return false; + } + + /* Currently Ethertype must be set */ + if (!is_exactmatch(&flow->l2_mask.ether_type, + sizeof(flow->l2_mask.ether_type))) { + netdev_info(bp->dev, "Wildcard match unsupported for Ethertype\n"); + return false; + } + return true; } @@ -992,8 +1051,10 @@ static int bnxt_tc_get_decap_handle(struct bnxt *bp, struct bnxt_tc_flow *flow, /* Check if there's another flow using the same tunnel decap. * If not, add this tunnel to the table and resolve the other - * tunnel header fileds + * tunnel header fileds. Ignore src_port in the tunnel_key, + * since it is not required for decap filters. 
*/ + decap_key->tp_src = 0; decap_node = bnxt_tc_get_tunnel_node(bp, &tc_info->decap_table, &tc_info->decap_ht_params, decap_key); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c index 26290403f38f..38f635cf8408 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c @@ -64,6 +64,31 @@ static int hwrm_cfa_vfr_free(struct bnxt *bp, u16 vf_idx) return rc; } +static int bnxt_hwrm_vfr_qcfg(struct bnxt *bp, struct bnxt_vf_rep *vf_rep, + u16 *max_mtu) +{ + struct hwrm_func_qcfg_output *resp = bp->hwrm_cmd_resp_addr; + struct hwrm_func_qcfg_input req = {0}; + u16 mtu; + int rc; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_QCFG, -1, -1); + req.fid = cpu_to_le16(bp->pf.vf[vf_rep->vf_idx].fw_fid); + + mutex_lock(&bp->hwrm_cmd_lock); + + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (!rc) { + mtu = le16_to_cpu(resp->max_mtu_configured); + if (!mtu) + *max_mtu = BNXT_MAX_MTU; + else + *max_mtu = mtu; + } + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; +} + static int bnxt_vf_rep_open(struct net_device *dev) { struct bnxt_vf_rep *vf_rep = netdev_priv(dev); @@ -365,6 +390,7 @@ static void bnxt_vf_rep_netdev_init(struct bnxt *bp, struct bnxt_vf_rep *vf_rep, struct net_device *dev) { struct net_device *pf_dev = bp->dev; + u16 max_mtu; dev->netdev_ops = &bnxt_vf_rep_netdev_ops; dev->ethtool_ops = &bnxt_vf_rep_ethtool_ops; @@ -380,6 +406,10 @@ static void bnxt_vf_rep_netdev_init(struct bnxt *bp, struct bnxt_vf_rep *vf_rep, bnxt_vf_rep_eth_addr_gen(bp->pf.mac_addr, vf_rep->vf_idx, dev->perm_addr); ether_addr_copy(dev->dev_addr, dev->perm_addr); + /* Set VF-Rep's max-mtu to the corresponding VF's max-mtu */ + if (!bnxt_hwrm_vfr_qcfg(bp, vf_rep, &max_mtu)) + dev->max_mtu = max_mtu; + dev->min_mtu = ETH_ZLEN; } static int bnxt_pcie_dsn_get(struct bnxt *bp, u8 dsn[]) diff --git a/drivers/net/ethernet/sfc/mcdi.c b/drivers/net/ethernet/sfc/mcdi.c 
index 9c2567b0d93e..dfad93fca0a6 100644 --- a/drivers/net/ethernet/sfc/mcdi.c +++ b/drivers/net/ethernet/sfc/mcdi.c @@ -375,7 +375,7 @@ static int efx_mcdi_poll(struct efx_nic *efx) * because generally mcdi responses are fast. After that, back off * and poll once a jiffy (approximately) */ - spins = TICK_USEC; + spins = USER_TICK_USEC; finish = jiffies + MCDI_RPC_TIMEOUT; while (1) { diff --git a/drivers/net/slip/slhc.c b/drivers/net/slip/slhc.c index 5782733959f0..f4e93f5fc204 100644 --- a/drivers/net/slip/slhc.c +++ b/drivers/net/slip/slhc.c @@ -509,6 +509,10 @@ slhc_uncompress(struct slcompress *comp, unsigned char *icp, int isize) if(x < 0 || x > comp->rslot_limit) goto bad; + /* Check if the cstate is initialized */ + if (!comp->rstate[x].initialized) + goto bad; + comp->flags &=~ SLF_TOSS; comp->recv_current = x; } else { @@ -673,6 +677,7 @@ slhc_remember(struct slcompress *comp, unsigned char *icp, int isize) if (cs->cs_tcp.doff > 5) memcpy(cs->cs_tcpopt, icp + ihl*4 + sizeof(struct tcphdr), (cs->cs_tcp.doff - 5) * 4); cs->cs_hsize = ihl*2 + cs->cs_tcp.doff*2; + cs->initialized = true; /* Put headers back on packet * Neither header checksum is recalculated */ diff --git a/drivers/net/tun.c b/drivers/net/tun.c index a1ba262f40ad..28583aa0c17d 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -743,8 +743,15 @@ static void __tun_detach(struct tun_file *tfile, bool clean) static void tun_detach(struct tun_file *tfile, bool clean) { + struct tun_struct *tun; + struct net_device *dev; + rtnl_lock(); + tun = rtnl_dereference(tfile->tun); + dev = tun ? tun->dev : NULL; __tun_detach(tfile, clean); + if (dev) + netdev_state_change(dev); rtnl_unlock(); } @@ -2562,10 +2569,15 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) /* One or more queue has already been attached, no need * to initialize the device again. 
*/ + netdev_state_change(dev); return 0; } - } - else { + + tun->flags = (tun->flags & ~TUN_FEATURES) | + (ifr->ifr_flags & TUN_FEATURES); + + netdev_state_change(dev); + } else { char *name; unsigned long flags = 0; int queues = ifr->ifr_flags & IFF_MULTI_QUEUE ? @@ -2642,6 +2654,9 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) ~(NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); + tun->flags = (tun->flags & ~TUN_FEATURES) | + (ifr->ifr_flags & TUN_FEATURES); + INIT_LIST_HEAD(&tun->disabled); err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI); if (err < 0) @@ -2656,9 +2671,6 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) tun_debug(KERN_INFO, tun, "tun_set_iff\n"); - tun->flags = (tun->flags & ~TUN_FEATURES) | - (ifr->ifr_flags & TUN_FEATURES); - /* Make sure persistent devices do not get stuck in * xoff state. */ @@ -2805,6 +2817,9 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr) } else ret = -EINVAL; + if (ret >= 0) + netdev_state_change(tun->dev); + unlock: rtnl_unlock(); return ret; @@ -2845,6 +2860,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, unsigned int ifindex; int le; int ret; + bool do_notify = false; if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || (_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) { @@ -2941,10 +2957,12 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, if (arg && !(tun->flags & IFF_PERSIST)) { tun->flags |= IFF_PERSIST; __module_get(THIS_MODULE); + do_notify = true; } if (!arg && (tun->flags & IFF_PERSIST)) { tun->flags &= ~IFF_PERSIST; module_put(THIS_MODULE); + do_notify = true; } tun_debug(KERN_INFO, tun, "persist %s\n", @@ -2959,6 +2977,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, break; } tun->owner = owner; + do_notify = true; tun_debug(KERN_INFO, tun, "owner set to %u\n", from_kuid(&init_user_ns, tun->owner)); break; @@ -2971,6 +2990,7 @@ static long 
__tun_chr_ioctl(struct file *file, unsigned int cmd, break; } tun->group = group; + do_notify = true; tun_debug(KERN_INFO, tun, "group set to %u\n", from_kgid(&init_user_ns, tun->group)); break; @@ -3130,6 +3150,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, break; } + if (do_notify) + netdev_state_change(tun->dev); + unlock: rtnl_unlock(); if (tun) diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c index fff4b13eece2..5c42cf81a08b 100644 --- a/drivers/net/usb/cdc_ether.c +++ b/drivers/net/usb/cdc_ether.c @@ -902,6 +902,12 @@ static const struct usb_device_id products[] = { USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&wwan_info, }, { + /* Cinterion AHS3 modem by GEMALTO */ + USB_DEVICE_AND_INTERFACE_INFO(0x1e2d, 0x0055, USB_CLASS_COMM, + USB_CDC_SUBCLASS_ETHERNET, + USB_CDC_PROTO_NONE), + .driver_info = (unsigned long)&wwan_info, +}, { USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long) &cdc_info, diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index aff105f5f58c..0867f7275852 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -928,7 +928,8 @@ static int lan78xx_read_otp(struct lan78xx_net *dev, u32 offset, offset += 0x100; else ret = -EINVAL; - ret = lan78xx_read_raw_otp(dev, offset, length, data); + if (!ret) + ret = lan78xx_read_raw_otp(dev, offset, length, data); } return ret; @@ -2502,7 +2503,7 @@ static void lan78xx_init_stats(struct lan78xx_net *dev) dev->stats.rollover_max.eee_tx_lpi_transitions = 0xFFFFFFFF; dev->stats.rollover_max.eee_tx_lpi_time = 0xFFFFFFFF; - lan78xx_defer_kevent(dev, EVENT_STAT_UPDATE); + set_bit(EVENT_STAT_UPDATE, &dev->flags); } static int lan78xx_open(struct net_device *net) @@ -2514,10 +2515,6 @@ static int lan78xx_open(struct net_device *net) if (ret < 0) goto out; - ret = lan78xx_reset(dev); - if (ret < 0) - goto done; - phy_start(net->phydev); netif_dbg(dev, ifup, dev->net, 
"phy initialised successfully"); diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 6afe896e5cb8..96d26cfae90b 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -253,7 +253,7 @@ static inline void hwsim_clear_chanctx_magic(struct ieee80211_chanctx_conf *c) static unsigned int hwsim_net_id; -static struct ida hwsim_netgroup_ida = IDA_INIT; +static DEFINE_IDA(hwsim_netgroup_ida); struct hwsim_net { int netgroup; diff --git a/drivers/of/unittest.c b/drivers/of/unittest.c index 02c5984ab09b..6bb37c18292a 100644 --- a/drivers/of/unittest.c +++ b/drivers/of/unittest.c @@ -295,7 +295,7 @@ static void __init of_unittest_printf(void) return; } - num_to_str(phandle_str, sizeof(phandle_str), np->phandle); + num_to_str(phandle_str, sizeof(phandle_str), np->phandle, 0); of_unittest_printf_one(np, "%pOF", full_name); of_unittest_printf_one(np, "%pOFf", full_name); diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c index cfb54e01d758..9d27016c899e 100644 --- a/drivers/rapidio/devices/rio_mport_cdev.c +++ b/drivers/rapidio/devices/rio_mport_cdev.c @@ -212,7 +212,6 @@ struct mport_cdev_priv { #ifdef CONFIG_RAPIDIO_DMA_ENGINE struct dma_chan *dmach; struct list_head async_list; - struct list_head pend_list; spinlock_t req_lock; struct mutex dma_lock; struct kref dma_ref; @@ -258,8 +257,6 @@ static DECLARE_WAIT_QUEUE_HEAD(mport_cdev_wait); static struct class *dev_class; static dev_t dev_number; -static struct workqueue_struct *dma_wq; - static void mport_release_mapping(struct kref *ref); static int rio_mport_maint_rd(struct mport_cdev_priv *priv, void __user *arg, @@ -539,6 +536,7 @@ static int maint_comptag_set(struct mport_cdev_priv *priv, void __user *arg) #ifdef CONFIG_RAPIDIO_DMA_ENGINE struct mport_dma_req { + struct kref refcount; struct list_head node; struct file *filp; struct mport_cdev_priv *priv; @@ -554,11 +552,6 @@ struct 
mport_dma_req { struct completion req_comp; }; -struct mport_faf_work { - struct work_struct work; - struct mport_dma_req *req; -}; - static void mport_release_def_dma(struct kref *dma_ref) { struct mport_dev *md = @@ -578,8 +571,10 @@ static void mport_release_dma(struct kref *dma_ref) complete(&priv->comp); } -static void dma_req_free(struct mport_dma_req *req) +static void dma_req_free(struct kref *ref) { + struct mport_dma_req *req = container_of(ref, struct mport_dma_req, + refcount); struct mport_cdev_priv *priv = req->priv; unsigned int i; @@ -611,30 +606,7 @@ static void dma_xfer_callback(void *param) req->status = dma_async_is_tx_complete(priv->dmach, req->cookie, NULL, NULL); complete(&req->req_comp); -} - -static void dma_faf_cleanup(struct work_struct *_work) -{ - struct mport_faf_work *work = container_of(_work, - struct mport_faf_work, work); - struct mport_dma_req *req = work->req; - - dma_req_free(req); - kfree(work); -} - -static void dma_faf_callback(void *param) -{ - struct mport_dma_req *req = (struct mport_dma_req *)param; - struct mport_faf_work *work; - - work = kmalloc(sizeof(*work), GFP_ATOMIC); - if (!work) - return; - - INIT_WORK(&work->work, dma_faf_cleanup); - work->req = req; - queue_work(dma_wq, &work->work); + kref_put(&req->refcount, dma_req_free); } /* @@ -765,16 +737,14 @@ static int do_dma_request(struct mport_dma_req *req, goto err_out; } - if (sync == RIO_TRANSFER_FAF) - tx->callback = dma_faf_callback; - else - tx->callback = dma_xfer_callback; + tx->callback = dma_xfer_callback; tx->callback_param = req; req->dmach = chan; req->sync = sync; req->status = DMA_IN_PROGRESS; init_completion(&req->req_comp); + kref_get(&req->refcount); cookie = dmaengine_submit(tx); req->cookie = cookie; @@ -785,6 +755,7 @@ static int do_dma_request(struct mport_dma_req *req, if (dma_submit_error(cookie)) { rmcd_error("submit err=%d (addr:0x%llx len:0x%llx)", cookie, xfer->rio_addr, xfer->length); + kref_put(&req->refcount, dma_req_free); ret = 
-EIO; goto err_out; } @@ -860,6 +831,8 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode, if (!req) return -ENOMEM; + kref_init(&req->refcount); + ret = get_dma_channel(priv); if (ret) { kfree(req); @@ -968,42 +941,20 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode, ret = do_dma_request(req, xfer, sync, nents); if (ret >= 0) { - if (sync == RIO_TRANSFER_SYNC) - goto sync_out; - return ret; /* return ASYNC cookie */ - } - - if (ret == -ETIMEDOUT || ret == -EINTR) { - /* - * This can happen only in case of SYNC transfer. - * Do not free unfinished request structure immediately. - * Place it into pending list and deal with it later - */ - spin_lock(&priv->req_lock); - list_add_tail(&req->node, &priv->pend_list); - spin_unlock(&priv->req_lock); - return ret; + if (sync == RIO_TRANSFER_ASYNC) + return ret; /* return ASYNC cookie */ + } else { + rmcd_debug(DMA, "do_dma_request failed with err=%d", ret); } - - rmcd_debug(DMA, "do_dma_request failed with err=%d", ret); -sync_out: - dma_unmap_sg(chan->device->dev, req->sgt.sgl, req->sgt.nents, dir); - sg_free_table(&req->sgt); err_pg: - if (page_list) { + if (!req->page_list) { for (i = 0; i < nr_pages; i++) put_page(page_list[i]); kfree(page_list); } err_req: - if (req->map) { - mutex_lock(&md->buf_mutex); - kref_put(&req->map->ref, mport_release_mapping); - mutex_unlock(&md->buf_mutex); - } - put_dma_channel(priv); - kfree(req); + kref_put(&req->refcount, dma_req_free); return ret; } @@ -1121,7 +1072,7 @@ static int rio_mport_wait_for_async_dma(struct file *filp, void __user *arg) ret = 0; if (req->status != DMA_IN_PROGRESS && req->status != DMA_PAUSED) - dma_req_free(req); + kref_put(&req->refcount, dma_req_free); return ret; @@ -1966,7 +1917,6 @@ static int mport_cdev_open(struct inode *inode, struct file *filp) #ifdef CONFIG_RAPIDIO_DMA_ENGINE INIT_LIST_HEAD(&priv->async_list); - INIT_LIST_HEAD(&priv->pend_list); spin_lock_init(&priv->req_lock); mutex_init(&priv->dma_lock); #endif @@ -2006,8 +1956,6 @@ 
static void mport_cdev_release_dma(struct file *filp) md = priv->md; - flush_workqueue(dma_wq); - spin_lock(&priv->req_lock); if (!list_empty(&priv->async_list)) { rmcd_debug(EXIT, "async list not empty filp=%p %s(%d)", @@ -2023,20 +1971,7 @@ static void mport_cdev_release_dma(struct file *filp) req->filp, req->cookie, completion_done(&req->req_comp)?"yes":"no"); list_del(&req->node); - dma_req_free(req); - } - } - - if (!list_empty(&priv->pend_list)) { - rmcd_debug(EXIT, "Free pending DMA requests for filp=%p %s(%d)", - filp, current->comm, task_pid_nr(current)); - list_for_each_entry_safe(req, - req_next, &priv->pend_list, node) { - rmcd_debug(EXIT, "free req->filp=%p cookie=%d compl=%s", - req->filp, req->cookie, - completion_done(&req->req_comp)?"yes":"no"); - list_del(&req->node); - dma_req_free(req); + kref_put(&req->refcount, dma_req_free); } } @@ -2048,15 +1983,6 @@ static void mport_cdev_release_dma(struct file *filp) current->comm, task_pid_nr(current), wret); } - spin_lock(&priv->req_lock); - - if (!list_empty(&priv->pend_list)) { - rmcd_debug(EXIT, "ATTN: pending DMA requests, filp=%p %s(%d)", - filp, current->comm, task_pid_nr(current)); - } - - spin_unlock(&priv->req_lock); - if (priv->dmach != priv->md->dma_chan) { rmcd_debug(EXIT, "Release DMA channel for filp=%p %s(%d)", filp, current->comm, task_pid_nr(current)); @@ -2573,8 +2499,6 @@ static void mport_cdev_remove(struct mport_dev *md) cdev_device_del(&md->cdev, &md->dev); mport_cdev_kill_fasync(md); - flush_workqueue(dma_wq); - /* TODO: do we need to give clients some time to close file * descriptors? Simple wait for XX, or kref? 
*/ @@ -2691,17 +2615,8 @@ static int __init mport_init(void) goto err_cli; } - dma_wq = create_singlethread_workqueue("dma_wq"); - if (!dma_wq) { - rmcd_error("failed to create DMA work queue"); - ret = -ENOMEM; - goto err_wq; - } - return 0; -err_wq: - class_interface_unregister(&rio_mport_interface); err_cli: unregister_chrdev_region(dev_number, RIO_MAX_MPORTS); err_chr: @@ -2717,7 +2632,6 @@ static void __exit mport_exit(void) class_interface_unregister(&rio_mport_interface); class_destroy(dev_class); unregister_chrdev_region(dev_number, RIO_MAX_MPORTS); - destroy_workqueue(dma_wq); } module_init(mport_init); diff --git a/drivers/rapidio/rio-scan.c b/drivers/rapidio/rio-scan.c index 23429bdaca84..161b927d9de1 100644 --- a/drivers/rapidio/rio-scan.c +++ b/drivers/rapidio/rio-scan.c @@ -76,7 +76,7 @@ static u16 rio_destid_alloc(struct rio_net *net) } /** - * rio_destid_reserve - Reserve the specivied destID + * rio_destid_reserve - Reserve the specified destID * @net: RIO network * @destid: destID to reserve * @@ -885,7 +885,7 @@ static struct rio_net *rio_scan_alloc_net(struct rio_mport *mport, * * For each enumerated device, ensure that each switch in a system * has correct routing entries. Add routes for devices that where - * unknown dirung the first enumeration pass through the switch. + * unknown during the first enumeration pass through the switch. 
*/ static void rio_update_route_tables(struct rio_net *net) { @@ -983,7 +983,7 @@ static int rio_enum_mport(struct rio_mport *mport, u32 flags) /* reserve mport destID in new net */ rio_destid_reserve(net, mport->host_deviceid); - /* Enable Input Output Port (transmitter reviever) */ + /* Enable Input Output Port (transmitter receiver) */ rio_enable_rx_tx_port(mport, 1, 0, 0, 0); /* Set component tag for host */ diff --git a/drivers/staging/lustre/lustre/llite/glimpse.c b/drivers/staging/lustre/lustre/llite/glimpse.c index c43ac574274c..3075358f3f08 100644 --- a/drivers/staging/lustre/lustre/llite/glimpse.c +++ b/drivers/staging/lustre/lustre/llite/glimpse.c @@ -69,7 +69,7 @@ blkcnt_t dirty_cnt(struct inode *inode) void *results[1]; if (inode->i_mapping) - cnt += radix_tree_gang_lookup_tag(&inode->i_mapping->page_tree, + cnt += radix_tree_gang_lookup_tag(&inode->i_mapping->i_pages, results, 0, 1, PAGECACHE_TAG_DIRTY); if (cnt == 0 && atomic_read(&vob->vob_mmap_cnt) > 0) diff --git a/drivers/staging/lustre/lustre/mdc/mdc_request.c b/drivers/staging/lustre/lustre/mdc/mdc_request.c index 3b1c8e5a3053..8ee7b4d273b2 100644 --- a/drivers/staging/lustre/lustre/mdc/mdc_request.c +++ b/drivers/staging/lustre/lustre/mdc/mdc_request.c @@ -934,14 +934,14 @@ static struct page *mdc_page_locate(struct address_space *mapping, __u64 *hash, struct page *page; int found; - spin_lock_irq(&mapping->tree_lock); - found = radix_tree_gang_lookup(&mapping->page_tree, + xa_lock_irq(&mapping->i_pages); + found = radix_tree_gang_lookup(&mapping->i_pages, (void **)&page, offset, 1); if (found > 0 && !radix_tree_exceptional_entry(page)) { struct lu_dirpage *dp; get_page(page); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); /* * In contrast to find_lock_page() we are sure that directory * page cannot be truncated (while DLM lock is held) and, @@ -989,7 +989,7 @@ static struct page *mdc_page_locate(struct address_space *mapping, __u64 *hash, page = ERR_PTR(-EIO); } } 
else { - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); page = NULL; } return page; diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index bec722e41f58..f3bd8e941224 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -641,14 +641,14 @@ void vhost_dev_cleanup(struct vhost_dev *dev) } EXPORT_SYMBOL_GPL(vhost_dev_cleanup); -static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz) +static bool log_access_ok(void __user *log_base, u64 addr, unsigned long sz) { u64 a = addr / VHOST_PAGE_SIZE / 8; /* Make sure 64 bit math will not overflow. */ if (a > ULONG_MAX - (unsigned long)log_base || a + (unsigned long)log_base > ULONG_MAX) - return 0; + return false; return access_ok(VERIFY_WRITE, log_base + a, (sz + VHOST_PAGE_SIZE * 8 - 1) / VHOST_PAGE_SIZE / 8); @@ -661,30 +661,30 @@ static bool vhost_overflow(u64 uaddr, u64 size) } /* Caller should have vq mutex and device mutex. */ -static int vq_memory_access_ok(void __user *log_base, struct vhost_umem *umem, - int log_all) +static bool vq_memory_access_ok(void __user *log_base, struct vhost_umem *umem, + int log_all) { struct vhost_umem_node *node; if (!umem) - return 0; + return false; list_for_each_entry(node, &umem->umem_list, link) { unsigned long a = node->userspace_addr; if (vhost_overflow(node->userspace_addr, node->size)) - return 0; + return false; if (!access_ok(VERIFY_WRITE, (void __user *)a, node->size)) - return 0; + return false; else if (log_all && !log_access_ok(log_base, node->start, node->size)) - return 0; + return false; } - return 1; + return true; } static inline void __user *vhost_vq_meta_fetch(struct vhost_virtqueue *vq, @@ -701,13 +701,13 @@ static inline void __user *vhost_vq_meta_fetch(struct vhost_virtqueue *vq, /* Can we switch to this memory table? 
*/ /* Caller should have device mutex but not vq mutex */ -static int memory_access_ok(struct vhost_dev *d, struct vhost_umem *umem, - int log_all) +static bool memory_access_ok(struct vhost_dev *d, struct vhost_umem *umem, + int log_all) { int i; for (i = 0; i < d->nvqs; ++i) { - int ok; + bool ok; bool log; mutex_lock(&d->vqs[i]->mutex); @@ -717,12 +717,12 @@ static int memory_access_ok(struct vhost_dev *d, struct vhost_umem *umem, ok = vq_memory_access_ok(d->vqs[i]->log_base, umem, log); else - ok = 1; + ok = true; mutex_unlock(&d->vqs[i]->mutex); if (!ok) - return 0; + return false; } - return 1; + return true; } static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len, @@ -744,7 +744,7 @@ static int vhost_copy_to_user(struct vhost_virtqueue *vq, void __user *to, struct iov_iter t; void __user *uaddr = vhost_vq_meta_fetch(vq, (u64)(uintptr_t)to, size, - VHOST_ADDR_DESC); + VHOST_ADDR_USED); if (uaddr) return __copy_to_user(uaddr, from, size); @@ -959,21 +959,21 @@ static void vhost_iotlb_notify_vq(struct vhost_dev *d, spin_unlock(&d->iotlb_lock); } -static int umem_access_ok(u64 uaddr, u64 size, int access) +static bool umem_access_ok(u64 uaddr, u64 size, int access) { unsigned long a = uaddr; /* Make sure 64 bit math will not overflow. 
*/ if (vhost_overflow(uaddr, size)) - return -EFAULT; + return false; if ((access & VHOST_ACCESS_RO) && !access_ok(VERIFY_READ, (void __user *)a, size)) - return -EFAULT; + return false; if ((access & VHOST_ACCESS_WO) && !access_ok(VERIFY_WRITE, (void __user *)a, size)) - return -EFAULT; - return 0; + return false; + return true; } static int vhost_process_iotlb_msg(struct vhost_dev *dev, @@ -988,7 +988,7 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev, ret = -EFAULT; break; } - if (umem_access_ok(msg->uaddr, msg->size, msg->perm)) { + if (!umem_access_ok(msg->uaddr, msg->size, msg->perm)) { ret = -EFAULT; break; } @@ -1135,10 +1135,10 @@ static int vhost_iotlb_miss(struct vhost_virtqueue *vq, u64 iova, int access) return 0; } -static int vq_access_ok(struct vhost_virtqueue *vq, unsigned int num, - struct vring_desc __user *desc, - struct vring_avail __user *avail, - struct vring_used __user *used) +static bool vq_access_ok(struct vhost_virtqueue *vq, unsigned int num, + struct vring_desc __user *desc, + struct vring_avail __user *avail, + struct vring_used __user *used) { size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0; @@ -1161,8 +1161,8 @@ static void vhost_vq_meta_update(struct vhost_virtqueue *vq, vq->meta_iotlb[type] = node; } -static int iotlb_access_ok(struct vhost_virtqueue *vq, - int access, u64 addr, u64 len, int type) +static bool iotlb_access_ok(struct vhost_virtqueue *vq, + int access, u64 addr, u64 len, int type) { const struct vhost_umem_node *node; struct vhost_umem *umem = vq->iotlb; @@ -1220,7 +1220,7 @@ EXPORT_SYMBOL_GPL(vq_iotlb_prefetch); /* Can we log writes? */ /* Caller should have device mutex but not vq mutex */ -int vhost_log_access_ok(struct vhost_dev *dev) +bool vhost_log_access_ok(struct vhost_dev *dev) { return memory_access_ok(dev, dev->umem, 1); } @@ -1228,8 +1228,8 @@ EXPORT_SYMBOL_GPL(vhost_log_access_ok); /* Verify access for write logging. 
*/ /* Caller should have vq mutex and device mutex */ -static int vq_log_access_ok(struct vhost_virtqueue *vq, - void __user *log_base) +static bool vq_log_access_ok(struct vhost_virtqueue *vq, + void __user *log_base) { size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0; @@ -1242,12 +1242,14 @@ static int vq_log_access_ok(struct vhost_virtqueue *vq, /* Can we start vq? */ /* Caller should have vq mutex and device mutex */ -int vhost_vq_access_ok(struct vhost_virtqueue *vq) +bool vhost_vq_access_ok(struct vhost_virtqueue *vq) { - int ret = vq_log_access_ok(vq, vq->log_base); + if (!vq_log_access_ok(vq, vq->log_base)) + return false; - if (ret || vq->iotlb) - return ret; + /* Access validation occurs at prefetch time with IOTLB */ + if (vq->iotlb) + return true; return vq_access_ok(vq, vq->num, vq->desc, vq->avail, vq->used); } diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index d8ee85ae8fdc..6c844b90a168 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -178,8 +178,8 @@ void vhost_dev_cleanup(struct vhost_dev *); void vhost_dev_stop(struct vhost_dev *); long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, void __user *argp); long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp); -int vhost_vq_access_ok(struct vhost_virtqueue *vq); -int vhost_log_access_ok(struct vhost_dev *); +bool vhost_vq_access_ok(struct vhost_virtqueue *vq); +bool vhost_log_access_ok(struct vhost_dev *); int vhost_get_vq_desc(struct vhost_virtqueue *, struct iovec iov[], unsigned int iov_count, diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index dfe5684000be..6b237e3f4983 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -272,6 +272,12 @@ static unsigned int update_balloon_stats(struct virtio_balloon *vb) pages_to_bytes(events[PSWPOUT])); update_stat(vb, idx++, VIRTIO_BALLOON_S_MAJFLT, events[PGMAJFAULT]); update_stat(vb, idx++, 
VIRTIO_BALLOON_S_MINFLT, events[PGFAULT]); +#ifdef CONFIG_HUGETLB_PAGE + update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGALLOC, + events[HTLB_BUDDY_PGALLOC]); + update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGFAIL, + events[HTLB_BUDDY_PGALLOC_FAIL]); +#endif #endif update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMFREE, pages_to_bytes(i.freeram)); diff --git a/drivers/xen/xen-acpi-processor.c b/drivers/xen/xen-acpi-processor.c index 23e391d3ec01..b29f4e40851f 100644 --- a/drivers/xen/xen-acpi-processor.c +++ b/drivers/xen/xen-acpi-processor.c @@ -53,6 +53,8 @@ static unsigned long *acpi_ids_done; static unsigned long *acpi_id_present; /* And if there is an _CST definition (or a PBLK) for the ACPI IDs */ static unsigned long *acpi_id_cst_present; +/* Which ACPI P-State dependencies for a enumerated processor */ +static struct acpi_psd_package *acpi_psd; static int push_cxx_to_hypervisor(struct acpi_processor *_pr) { @@ -362,9 +364,9 @@ read_acpi_id(acpi_handle handle, u32 lvl, void *context, void **rv) } /* There are more ACPI Processor objects than in x2APIC or MADT. * This can happen with incorrect ACPI SSDT declerations. 
*/ - if (acpi_id > nr_acpi_bits) { - pr_debug("We only have %u, trying to set %u\n", - nr_acpi_bits, acpi_id); + if (acpi_id >= nr_acpi_bits) { + pr_debug("max acpi id %u, trying to set %u\n", + nr_acpi_bits - 1, acpi_id); return AE_OK; } /* OK, There is a ACPI Processor object */ @@ -372,6 +374,13 @@ read_acpi_id(acpi_handle handle, u32 lvl, void *context, void **rv) pr_debug("ACPI CPU%u w/ PBLK:0x%lx\n", acpi_id, (unsigned long)pblk); + /* It has P-state dependencies */ + if (!acpi_processor_get_psd(handle, &acpi_psd[acpi_id])) { + pr_debug("ACPI CPU%u w/ PST:coord_type = %llu domain = %llu\n", + acpi_id, acpi_psd[acpi_id].coord_type, + acpi_psd[acpi_id].domain); + } + status = acpi_evaluate_object(handle, "_CST", NULL, &buffer); if (ACPI_FAILURE(status)) { if (!pblk) @@ -405,6 +414,14 @@ static int check_acpi_ids(struct acpi_processor *pr_backup) return -ENOMEM; } + acpi_psd = kcalloc(nr_acpi_bits, sizeof(struct acpi_psd_package), + GFP_KERNEL); + if (!acpi_psd) { + kfree(acpi_id_present); + kfree(acpi_id_cst_present); + return -ENOMEM; + } + acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, ACPI_UINT32_MAX, read_acpi_id, NULL, NULL, NULL); @@ -417,6 +434,12 @@ upload: pr_backup->acpi_id = i; /* Mask out C-states if there are no _CST or PBLK */ pr_backup->flags.power = test_bit(i, acpi_id_cst_present); + /* num_entries is non-zero if we evaluated _PSD */ + if (acpi_psd[i].num_entries) { + memcpy(&pr_backup->performance->domain_info, + &acpi_psd[i], + sizeof(struct acpi_psd_package)); + } (void)upload_pm_data(pr_backup); } } @@ -566,6 +589,7 @@ static void __exit xen_acpi_processor_exit(void) kfree(acpi_ids_done); kfree(acpi_id_present); kfree(acpi_id_cst_present); + kfree(acpi_psd); for_each_possible_cpu(i) acpi_processor_unregister_performance(i); diff --git a/drivers/xen/xenbus/xenbus_dev_frontend.c b/drivers/xen/xenbus/xenbus_dev_frontend.c index a493e99bed21..0d6d9264d6a9 100644 --- a/drivers/xen/xenbus/xenbus_dev_frontend.c +++ 
b/drivers/xen/xenbus/xenbus_dev_frontend.c @@ -365,7 +365,7 @@ void xenbus_dev_queue_reply(struct xb_req_data *req) if (WARN_ON(rc)) goto out; } - } else if (req->msg.type == XS_TRANSACTION_END) { + } else if (req->type == XS_TRANSACTION_END) { trans = xenbus_get_transaction(u, req->msg.tx_id); if (WARN_ON(!trans)) goto out; @@ -429,6 +429,10 @@ static int xenbus_write_transaction(unsigned msg_type, { int rc; struct xenbus_transaction_holder *trans = NULL; + struct { + struct xsd_sockmsg hdr; + char body[]; + } *msg = (void *)u->u.buffer; if (msg_type == XS_TRANSACTION_START) { trans = kzalloc(sizeof(*trans), GFP_KERNEL); @@ -437,11 +441,15 @@ static int xenbus_write_transaction(unsigned msg_type, goto out; } list_add(&trans->list, &u->transactions); - } else if (u->u.msg.tx_id != 0 && - !xenbus_get_transaction(u, u->u.msg.tx_id)) + } else if (msg->hdr.tx_id != 0 && + !xenbus_get_transaction(u, msg->hdr.tx_id)) return xenbus_command_reply(u, XS_ERROR, "ENOENT"); + else if (msg_type == XS_TRANSACTION_END && + !(msg->hdr.len == 2 && + (!strcmp(msg->body, "T") || !strcmp(msg->body, "F")))) + return xenbus_command_reply(u, XS_ERROR, "EINVAL"); - rc = xenbus_dev_request_and_reply(&u->u.msg, u); + rc = xenbus_dev_request_and_reply(&msg->hdr, u); if (rc && trans) { list_del(&trans->list); kfree(trans); diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index 3f3b29398ab8..49a3874ae6bb 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -140,7 +140,9 @@ void xs_request_exit(struct xb_req_data *req) spin_lock(&xs_state_lock); xs_state_users--; if ((req->type == XS_TRANSACTION_START && req->msg.type == XS_ERROR) || - req->type == XS_TRANSACTION_END) + (req->type == XS_TRANSACTION_END && + !WARN_ON_ONCE(req->msg.type == XS_ERROR && + !strcmp(req->body, "ENOENT")))) xs_state_users--; spin_unlock(&xs_state_lock); diff --git a/fs/afs/Makefile b/fs/afs/Makefile index 45b7fc405fa6..532acae25453 100644 --- a/fs/afs/Makefile 
+++ b/fs/afs/Makefile @@ -12,6 +12,8 @@ kafs-objs := \ cell.o \ cmservice.o \ dir.o \ + dir_edit.o \ + dynroot.o \ file.o \ flock.o \ fsclient.o \ diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c index fd9f28b8a933..3bedfed608a2 100644 --- a/fs/afs/addr_list.c +++ b/fs/afs/addr_list.c @@ -243,9 +243,9 @@ void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port) xport == a->sin6_port) return; if (xdr == a->sin6_addr.s6_addr32[3] && - xport < a->sin6_port) + (u16 __force)xport < (u16 __force)a->sin6_port) break; - if (xdr < a->sin6_addr.s6_addr32[3]) + if ((u32 __force)xdr < (u32 __force)a->sin6_addr.s6_addr32[3]) break; } @@ -280,7 +280,7 @@ void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port) xport == a->sin6_port) return; if (diff == 0 && - xport < a->sin6_port) + (u16 __force)xport < (u16 __force)a->sin6_port) break; if (diff < 0) break; diff --git a/fs/afs/afs.h b/fs/afs/afs.h index b94d0edc2b78..b4ff1f7ae4ab 100644 --- a/fs/afs/afs.h +++ b/fs/afs/afs.h @@ -67,10 +67,14 @@ typedef enum { } afs_callback_type_t; struct afs_callback { - struct afs_fid fid; /* file identifier */ - unsigned version; /* callback version */ - unsigned expiry; /* time at which expires */ - afs_callback_type_t type; /* type of callback */ + unsigned version; /* Callback version */ + unsigned expiry; /* Time at which expires */ + afs_callback_type_t type; /* Type of callback */ +}; + +struct afs_callback_break { + struct afs_fid fid; /* File identifier */ + struct afs_callback cb; /* Callback details */ }; #define AFSCBMAX 50 /* maximum callbacks transferred per bulk op */ @@ -123,21 +127,20 @@ typedef u32 afs_access_t; * AFS file status information */ struct afs_file_status { - unsigned if_version; /* interface version */ -#define AFS_FSTATUS_VERSION 1 + u64 size; /* file size */ + afs_dataversion_t data_version; /* current data version */ + time_t mtime_client; /* last time client changed data */ + time_t mtime_server; /* last time server 
changed data */ + unsigned abort_code; /* Abort if bulk-fetching this failed */ afs_file_type_t type; /* file type */ unsigned nlink; /* link count */ - u64 size; /* file size */ - afs_dataversion_t data_version; /* current data version */ u32 author; /* author ID */ - kuid_t owner; /* owner ID */ - kgid_t group; /* group ID */ + u32 owner; /* owner ID */ + u32 group; /* group ID */ afs_access_t caller_access; /* access rights for authenticated caller */ afs_access_t anon_access; /* access rights for unauthenticated caller */ umode_t mode; /* UNIX mode */ - time_t mtime_client; /* last time client changed data */ - time_t mtime_server; /* last time server changed data */ s32 lock_count; /* file lock count (0=UNLK -1=WRLCK +ve=#RDLCK */ }; diff --git a/fs/afs/afs_fs.h b/fs/afs/afs_fs.h index d47b6d01e4c0..ddfa88a7a9c0 100644 --- a/fs/afs/afs_fs.h +++ b/fs/afs/afs_fs.h @@ -31,10 +31,12 @@ enum AFS_FS_Operations { FSGETVOLUMEINFO = 148, /* AFS Get information about a volume */ FSGETVOLUMESTATUS = 149, /* AFS Get volume status information */ FSGETROOTVOLUME = 151, /* AFS Get root volume name */ + FSBULKSTATUS = 155, /* AFS Fetch multiple file statuses */ FSSETLOCK = 156, /* AFS Request a file lock */ FSEXTENDLOCK = 157, /* AFS Extend a file lock */ FSRELEASELOCK = 158, /* AFS Release a file lock */ FSLOOKUP = 161, /* AFS lookup file in directory */ + FSINLINEBULKSTATUS = 65536, /* AFS Fetch multiple file statuses with inline errors */ FSFETCHDATA64 = 65537, /* AFS Fetch file data */ FSSTOREDATA64 = 65538, /* AFS Store file data */ FSGIVEUPALLCALLBACKS = 65539, /* AFS Give up all outstanding callbacks on a server */ diff --git a/fs/afs/callback.c b/fs/afs/callback.c index f4291b576054..abd9a84f4e88 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -97,26 +97,6 @@ again: } /* - * Set a vnode's interest on a server. 
- */ -void afs_set_cb_interest(struct afs_vnode *vnode, struct afs_cb_interest *cbi) -{ - struct afs_cb_interest *old_cbi = NULL; - - if (vnode->cb_interest == cbi) - return; - - write_seqlock(&vnode->cb_lock); - if (vnode->cb_interest != cbi) { - afs_get_cb_interest(cbi); - old_cbi = vnode->cb_interest; - vnode->cb_interest = cbi; - } - write_sequnlock(&vnode->cb_lock); - afs_put_cb_interest(afs_v2net(vnode), cbi); -} - -/* * Remove an interest on a server. */ void afs_put_cb_interest(struct afs_net *net, struct afs_cb_interest *cbi) @@ -150,6 +130,7 @@ void afs_break_callback(struct afs_vnode *vnode) write_seqlock(&vnode->cb_lock); + clear_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { vnode->cb_break++; afs_clear_permits(vnode); @@ -207,7 +188,7 @@ static void afs_break_one_callback(struct afs_server *server, * allow the fileserver to break callback promises */ void afs_break_callbacks(struct afs_server *server, size_t count, - struct afs_callback callbacks[]) + struct afs_callback_break *callbacks) { _enter("%p,%zu,", server, count); @@ -219,9 +200,9 @@ void afs_break_callbacks(struct afs_server *server, size_t count, callbacks->fid.vid, callbacks->fid.vnode, callbacks->fid.unique, - callbacks->version, - callbacks->expiry, - callbacks->type + callbacks->cb.version, + callbacks->cb.expiry, + callbacks->cb.type ); afs_break_one_callback(server, &callbacks->fid); } diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 4235a05afc76..fdf4c36cff79 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -18,7 +18,7 @@ #include <keys/rxrpc-type.h> #include "internal.h" -unsigned __read_mostly afs_cell_gc_delay = 10; +static unsigned __read_mostly afs_cell_gc_delay = 10; static void afs_manage_cell(struct work_struct *); @@ -75,7 +75,7 @@ struct afs_cell *afs_lookup_cell_rcu(struct afs_net *net, cell = rcu_dereference_raw(net->ws_cell); if (cell) { afs_get_cell(cell); - continue; + break; } ret = -EDESTADDRREQ; continue; 
@@ -130,6 +130,8 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, _leave(" = -ENAMETOOLONG"); return ERR_PTR(-ENAMETOOLONG); } + if (namelen == 5 && memcmp(name, "@cell", 5) == 0) + return ERR_PTR(-EINVAL); _enter("%*.*s,%s", namelen, namelen, name, vllist); @@ -334,8 +336,8 @@ int afs_cell_init(struct afs_net *net, const char *rootcell) return PTR_ERR(new_root); } - set_bit(AFS_CELL_FL_NO_GC, &new_root->flags); - afs_get_cell(new_root); + if (!test_and_set_bit(AFS_CELL_FL_NO_GC, &new_root->flags)) + afs_get_cell(new_root); /* install the new cell */ write_seqlock(&net->cells_lock); @@ -411,7 +413,7 @@ static void afs_cell_destroy(struct rcu_head *rcu) ASSERTCMP(atomic_read(&cell->usage), ==, 0); - afs_put_addrlist(cell->vl_addrs); + afs_put_addrlist(rcu_access_pointer(cell->vl_addrs)); key_put(cell->anonymous_key); kfree(cell); diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 41e277f57b20..357de908df3a 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -178,8 +178,8 @@ static void SRXAFSCB_CallBack(struct work_struct *work) */ static int afs_deliver_cb_callback(struct afs_call *call) { + struct afs_callback_break *cb; struct sockaddr_rxrpc srx; - struct afs_callback *cb; struct afs_server *server; __be32 *bp; int ret, loop; @@ -201,7 +201,7 @@ static int afs_deliver_cb_callback(struct afs_call *call) call->count = ntohl(call->tmp); _debug("FID count: %u", call->count); if (call->count > AFSCBMAX) - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); call->buffer = kmalloc(call->count * 3 * 4, GFP_KERNEL); if (!call->buffer) @@ -218,7 +218,7 @@ static int afs_deliver_cb_callback(struct afs_call *call) _debug("unmarshall FID array"); call->request = kcalloc(call->count, - sizeof(struct afs_callback), + sizeof(struct afs_callback_break), GFP_KERNEL); if (!call->request) return -ENOMEM; @@ -229,7 +229,7 @@ static int afs_deliver_cb_callback(struct afs_call *call) cb->fid.vid = ntohl(*bp++); cb->fid.vnode = ntohl(*bp++); 
cb->fid.unique = ntohl(*bp++); - cb->type = AFSCM_CB_UNTYPED; + cb->cb.type = AFSCM_CB_UNTYPED; } call->offset = 0; @@ -245,7 +245,7 @@ static int afs_deliver_cb_callback(struct afs_call *call) call->count2 = ntohl(call->tmp); _debug("CB count: %u", call->count2); if (call->count2 != call->count && call->count2 != 0) - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); call->offset = 0; call->unmarshall++; @@ -260,9 +260,9 @@ static int afs_deliver_cb_callback(struct afs_call *call) cb = call->request; bp = call->buffer; for (loop = call->count2; loop > 0; loop--, cb++) { - cb->version = ntohl(*bp++); - cb->expiry = ntohl(*bp++); - cb->type = ntohl(*bp++); + cb->cb.version = ntohl(*bp++); + cb->cb.expiry = ntohl(*bp++); + cb->cb.type = ntohl(*bp++); } call->offset = 0; @@ -500,9 +500,9 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call) b = call->buffer; r = call->request; - r->time_low = ntohl(b[0]); - r->time_mid = ntohl(b[1]); - r->time_hi_and_version = ntohl(b[2]); + r->time_low = b[0]; + r->time_mid = htons(ntohl(b[1])); + r->time_hi_and_version = htons(ntohl(b[2])); r->clock_seq_hi_and_reserved = ntohl(b[3]); r->clock_seq_low = ntohl(b[4]); diff --git a/fs/afs/dir.c b/fs/afs/dir.c index ba2b458b36d1..5889f70d4d27 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -1,6 +1,6 @@ /* dir.c: AFS filesystem directory handling * - * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2002, 2018 Red Hat, Inc. All Rights Reserved. 
* Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or @@ -10,27 +10,26 @@ */ #include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> #include <linux/fs.h> #include <linux/namei.h> #include <linux/pagemap.h> +#include <linux/swap.h> #include <linux/ctype.h> #include <linux/sched.h> -#include <linux/dns_resolver.h> +#include <linux/task_io_accounting_ops.h> #include "internal.h" +#include "xdr_fs.h" static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); -static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentry, - unsigned int flags); static int afs_dir_open(struct inode *inode, struct file *file); static int afs_readdir(struct file *file, struct dir_context *ctx); static int afs_d_revalidate(struct dentry *dentry, unsigned int flags); static int afs_d_delete(const struct dentry *dentry); -static void afs_d_release(struct dentry *dentry); -static int afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen, +static int afs_lookup_one_filldir(struct dir_context *ctx, const char *name, int nlen, loff_t fpos, u64 ino, unsigned dtype); +static int afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen, + loff_t fpos, u64 ino, unsigned dtype); static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl); static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); @@ -43,6 +42,14 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry, static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); +static int afs_dir_releasepage(struct page *page, gfp_t gfp_flags); +static void afs_dir_invalidatepage(struct page *page, unsigned int offset, + unsigned int length); + +static int afs_dir_set_page_dirty(struct page *page) +{ + BUG(); /* This should never happen. 
*/ +} const struct file_operations afs_dir_file_operations = { .open = afs_dir_open, @@ -67,15 +74,10 @@ const struct inode_operations afs_dir_inode_operations = { .listxattr = afs_listxattr, }; -const struct file_operations afs_dynroot_file_operations = { - .open = dcache_dir_open, - .release = dcache_dir_close, - .iterate_shared = dcache_readdir, - .llseek = dcache_dir_lseek, -}; - -const struct inode_operations afs_dynroot_inode_operations = { - .lookup = afs_dynroot_lookup, +const struct address_space_operations afs_dir_aops = { + .set_page_dirty = afs_dir_set_page_dirty, + .releasepage = afs_dir_releasepage, + .invalidatepage = afs_dir_invalidatepage, }; const struct dentry_operations afs_fs_dentry_operations = { @@ -85,91 +87,38 @@ const struct dentry_operations afs_fs_dentry_operations = { .d_automount = afs_d_automount, }; -#define AFS_DIR_HASHTBL_SIZE 128 -#define AFS_DIR_DIRENT_SIZE 32 -#define AFS_DIRENT_PER_BLOCK 64 - -union afs_dirent { - struct { - uint8_t valid; - uint8_t unused[1]; - __be16 hash_next; - __be32 vnode; - __be32 unique; - uint8_t name[16]; - uint8_t overflow[4]; /* if any char of the name (inc - * NUL) reaches here, consume - * the next dirent too */ - } u; - uint8_t extended_name[32]; -}; - -/* AFS directory page header (one at the beginning of every 2048-byte chunk) */ -struct afs_dir_pagehdr { - __be16 npages; - __be16 magic; -#define AFS_DIR_MAGIC htons(1234) - uint8_t nentries; - uint8_t bitmap[8]; - uint8_t pad[19]; -}; - -/* directory block layout */ -union afs_dir_block { - - struct afs_dir_pagehdr pagehdr; - - struct { - struct afs_dir_pagehdr pagehdr; - uint8_t alloc_ctrs[128]; - /* dir hash table */ - uint16_t hashtable[AFS_DIR_HASHTBL_SIZE]; - } hdr; - - union afs_dirent dirents[AFS_DIRENT_PER_BLOCK]; -}; - -/* layout on a linux VM page */ -struct afs_dir_page { - union afs_dir_block blocks[PAGE_SIZE / sizeof(union afs_dir_block)]; +struct afs_lookup_one_cookie { + struct dir_context ctx; + struct qstr name; + bool found; + 
struct afs_fid fid; }; struct afs_lookup_cookie { - struct dir_context ctx; - struct afs_fid fid; - struct qstr name; - int found; + struct dir_context ctx; + struct qstr name; + bool found; + bool one_only; + unsigned short nr_fids; + struct afs_file_status *statuses; + struct afs_callback *callbacks; + struct afs_fid fids[50]; }; /* * check that a directory page is valid */ -bool afs_dir_check_page(struct inode *dir, struct page *page) +static bool afs_dir_check_page(struct afs_vnode *dvnode, struct page *page, + loff_t i_size) { - struct afs_dir_page *dbuf; - struct afs_vnode *vnode = AFS_FS_I(dir); - loff_t latter, i_size, off; + struct afs_xdr_dir_page *dbuf; + loff_t latter, off; int tmp, qty; -#if 0 - /* check the page count */ - qty = desc.size / sizeof(dbuf->blocks[0]); - if (qty == 0) - goto error; - - if (page->index == 0 && qty != ntohs(dbuf->blocks[0].pagehdr.npages)) { - printk("kAFS: %s(%lu): wrong number of dir blocks %d!=%hu\n", - __func__, dir->i_ino, qty, - ntohs(dbuf->blocks[0].pagehdr.npages)); - goto error; - } -#endif - /* Determine how many magic numbers there should be in this page, but * we must take care because the directory may change size under us. 
*/ off = page_offset(page); - i_size = i_size_read(dir); if (i_size <= off) goto checked; @@ -178,112 +127,225 @@ bool afs_dir_check_page(struct inode *dir, struct page *page) qty = PAGE_SIZE; else qty = latter; - qty /= sizeof(union afs_dir_block); + qty /= sizeof(union afs_xdr_dir_block); /* check them */ - dbuf = page_address(page); + dbuf = kmap(page); for (tmp = 0; tmp < qty; tmp++) { - if (dbuf->blocks[tmp].pagehdr.magic != AFS_DIR_MAGIC) { + if (dbuf->blocks[tmp].hdr.magic != AFS_DIR_MAGIC) { printk("kAFS: %s(%lx): bad magic %d/%d is %04hx\n", - __func__, dir->i_ino, tmp, qty, - ntohs(dbuf->blocks[tmp].pagehdr.magic)); - trace_afs_dir_check_failed(vnode, off, i_size); + __func__, dvnode->vfs_inode.i_ino, tmp, qty, + ntohs(dbuf->blocks[tmp].hdr.magic)); + trace_afs_dir_check_failed(dvnode, off, i_size); + kunmap(page); goto error; } + + /* Make sure each block is NUL terminated so we can reasonably + * use string functions on it. The filenames in the page + * *should* be NUL-terminated anyway. + */ + ((u8 *)&dbuf->blocks[tmp])[AFS_DIR_BLOCK_SIZE - 1] = 0; } + kunmap(page); + checked: - SetPageChecked(page); + afs_stat_v(dvnode, n_read_dir); return true; error: - SetPageError(page); return false; } /* - * discard a page cached in the pagecache + * open an AFS directory file */ -static inline void afs_dir_put_page(struct page *page) +static int afs_dir_open(struct inode *inode, struct file *file) { - kunmap(page); - unlock_page(page); - put_page(page); + _enter("{%lu}", inode->i_ino); + + BUILD_BUG_ON(sizeof(union afs_xdr_dir_block) != 2048); + BUILD_BUG_ON(sizeof(union afs_xdr_dirent) != 32); + + if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(inode)->flags)) + return -ENOENT; + + return afs_open(inode, file); } /* - * get a page into the pagecache + * Read the directory into the pagecache in one go, scrubbing the previous + * contents. The list of pages is returned, pinning them so that they don't + * get reclaimed during the iteration. 
*/ -static struct page *afs_dir_get_page(struct inode *dir, unsigned long index, - struct key *key) +static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key) { - struct page *page; - _enter("{%lu},%lu", dir->i_ino, index); - - page = read_cache_page(dir->i_mapping, index, afs_page_filler, key); - if (!IS_ERR(page)) { - lock_page(page); - kmap(page); - if (unlikely(!PageChecked(page))) { - if (PageError(page)) - goto fail; - } + struct afs_read *req; + loff_t i_size; + int nr_pages, nr_inline, i, n; + int ret = -ENOMEM; + +retry: + i_size = i_size_read(&dvnode->vfs_inode); + if (i_size < 2048) + return ERR_PTR(-EIO); + if (i_size > 2048 * 1024) + return ERR_PTR(-EFBIG); + + _enter("%llu", i_size); + + /* Get a request record to hold the page list. We want to hold it + * inline if we can, but we don't want to make an order 1 allocation. + */ + nr_pages = (i_size + PAGE_SIZE - 1) / PAGE_SIZE; + nr_inline = nr_pages; + if (nr_inline > (PAGE_SIZE - sizeof(*req)) / sizeof(struct page *)) + nr_inline = 0; + + req = kzalloc(sizeof(*req) + sizeof(struct page *) * nr_inline, + GFP_KERNEL); + if (!req) + return ERR_PTR(-ENOMEM); + + refcount_set(&req->usage, 1); + req->nr_pages = nr_pages; + req->actual_len = i_size; /* May change */ + req->len = nr_pages * PAGE_SIZE; /* We can ask for more than there is */ + req->data_version = dvnode->status.data_version; /* May change */ + if (nr_inline > 0) { + req->pages = req->array; + } else { + req->pages = kcalloc(nr_pages, sizeof(struct page *), + GFP_KERNEL); + if (!req->pages) + goto error; } - return page; -fail: - afs_dir_put_page(page); - _leave(" = -EIO"); - return ERR_PTR(-EIO); -} + /* Get a list of all the pages that hold or will hold the directory + * content. We need to fill in any gaps that we might find where the + * memory reclaimer has been at work. If there are any gaps, we will + * need to reread the entire directory contents. 
+ */ + i = 0; + do { + n = find_get_pages_contig(dvnode->vfs_inode.i_mapping, i, + req->nr_pages - i, + req->pages + i); + _debug("find %u at %u/%u", n, i, req->nr_pages); + if (n == 0) { + gfp_t gfp = dvnode->vfs_inode.i_mapping->gfp_mask; + + if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) + afs_stat_v(dvnode, n_inval); + + ret = -ENOMEM; + req->pages[i] = __page_cache_alloc(gfp); + if (!req->pages[i]) + goto error; + ret = add_to_page_cache_lru(req->pages[i], + dvnode->vfs_inode.i_mapping, + i, gfp); + if (ret < 0) + goto error; + + set_page_private(req->pages[i], 1); + SetPagePrivate(req->pages[i]); + unlock_page(req->pages[i]); + i++; + } else { + i += n; + } + } while (i < req->nr_pages); -/* - * open an AFS directory file - */ -static int afs_dir_open(struct inode *inode, struct file *file) -{ - _enter("{%lu}", inode->i_ino); + /* If we're going to reload, we need to lock all the pages to prevent + * races. + */ + if (!test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) { + ret = -ERESTARTSYS; + for (i = 0; i < req->nr_pages; i++) + if (lock_page_killable(req->pages[i]) < 0) + goto error_unlock; - BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048); - BUILD_BUG_ON(sizeof(union afs_dirent) != 32); + if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) + goto success; - if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(inode)->flags)) - return -ENOENT; + ret = afs_fetch_data(dvnode, key, req); + if (ret < 0) + goto error_unlock_all; - return afs_open(inode, file); + task_io_account_read(PAGE_SIZE * req->nr_pages); + + if (req->len < req->file_size) + goto content_has_grown; + + /* Validate the data we just read. 
*/ + ret = -EIO; + for (i = 0; i < req->nr_pages; i++) + if (!afs_dir_check_page(dvnode, req->pages[i], + req->actual_len)) + goto error_unlock_all; + + // TODO: Trim excess pages + + set_bit(AFS_VNODE_DIR_VALID, &dvnode->flags); + } + +success: + i = req->nr_pages; + while (i > 0) + unlock_page(req->pages[--i]); + return req; + +error_unlock_all: + i = req->nr_pages; +error_unlock: + while (i > 0) + unlock_page(req->pages[--i]); +error: + afs_put_read(req); + _leave(" = %d", ret); + return ERR_PTR(ret); + +content_has_grown: + i = req->nr_pages; + while (i > 0) + unlock_page(req->pages[--i]); + afs_put_read(req); + goto retry; } /* * deal with one block in an AFS directory */ static int afs_dir_iterate_block(struct dir_context *ctx, - union afs_dir_block *block, + union afs_xdr_dir_block *block, unsigned blkoff) { - union afs_dirent *dire; + union afs_xdr_dirent *dire; unsigned offset, next, curr; size_t nlen; int tmp; _enter("%u,%x,%p,,",(unsigned)ctx->pos,blkoff,block); - curr = (ctx->pos - blkoff) / sizeof(union afs_dirent); + curr = (ctx->pos - blkoff) / sizeof(union afs_xdr_dirent); /* walk through the block, an entry at a time */ - for (offset = AFS_DIRENT_PER_BLOCK - block->pagehdr.nentries; - offset < AFS_DIRENT_PER_BLOCK; + for (offset = (blkoff == 0 ? 
AFS_DIR_RESV_BLOCKS0 : AFS_DIR_RESV_BLOCKS); + offset < AFS_DIR_SLOTS_PER_BLOCK; offset = next ) { next = offset + 1; /* skip entries marked unused in the bitmap */ - if (!(block->pagehdr.bitmap[offset / 8] & + if (!(block->hdr.bitmap[offset / 8] & (1 << (offset % 8)))) { _debug("ENT[%zu.%u]: unused", - blkoff / sizeof(union afs_dir_block), offset); + blkoff / sizeof(union afs_xdr_dir_block), offset); if (offset >= curr) ctx->pos = blkoff + - next * sizeof(union afs_dirent); + next * sizeof(union afs_xdr_dirent); continue; } @@ -291,34 +353,34 @@ static int afs_dir_iterate_block(struct dir_context *ctx, dire = &block->dirents[offset]; nlen = strnlen(dire->u.name, sizeof(*block) - - offset * sizeof(union afs_dirent)); + offset * sizeof(union afs_xdr_dirent)); _debug("ENT[%zu.%u]: %s %zu \"%s\"", - blkoff / sizeof(union afs_dir_block), offset, + blkoff / sizeof(union afs_xdr_dir_block), offset, (offset < curr ? "skip" : "fill"), nlen, dire->u.name); /* work out where the next possible entry is */ - for (tmp = nlen; tmp > 15; tmp -= sizeof(union afs_dirent)) { - if (next >= AFS_DIRENT_PER_BLOCK) { + for (tmp = nlen; tmp > 15; tmp -= sizeof(union afs_xdr_dirent)) { + if (next >= AFS_DIR_SLOTS_PER_BLOCK) { _debug("ENT[%zu.%u]:" " %u travelled beyond end dir block" " (len %u/%zu)", - blkoff / sizeof(union afs_dir_block), + blkoff / sizeof(union afs_xdr_dir_block), offset, next, tmp, nlen); return -EIO; } - if (!(block->pagehdr.bitmap[next / 8] & + if (!(block->hdr.bitmap[next / 8] & (1 << (next % 8)))) { _debug("ENT[%zu.%u]:" " %u unmarked extension (len %u/%zu)", - blkoff / sizeof(union afs_dir_block), + blkoff / sizeof(union afs_xdr_dir_block), offset, next, tmp, nlen); return -EIO; } _debug("ENT[%zu.%u]: ext %u/%zu", - blkoff / sizeof(union afs_dir_block), + blkoff / sizeof(union afs_xdr_dir_block), next, tmp, nlen); next++; } @@ -330,13 +392,14 @@ static int afs_dir_iterate_block(struct dir_context *ctx, /* found the next entry */ if (!dir_emit(ctx, dire->u.name, 
nlen, ntohl(dire->u.vnode), - ctx->actor == afs_lookup_filldir ? + (ctx->actor == afs_lookup_filldir || + ctx->actor == afs_lookup_one_filldir)? ntohl(dire->u.unique) : DT_UNKNOWN)) { _leave(" = 0 [full]"); return 0; } - ctx->pos = blkoff + next * sizeof(union afs_dirent); + ctx->pos = blkoff + next * sizeof(union afs_xdr_dirent); } _leave(" = 1 [more]"); @@ -349,8 +412,10 @@ static int afs_dir_iterate_block(struct dir_context *ctx, static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, struct key *key) { - union afs_dir_block *dblock; - struct afs_dir_page *dbuf; + struct afs_vnode *dvnode = AFS_FS_I(dir); + struct afs_xdr_dir_page *dbuf; + union afs_xdr_dir_block *dblock; + struct afs_read *req; struct page *page; unsigned blkoff, limit; int ret; @@ -362,45 +427,53 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, return -ESTALE; } + req = afs_read_dir(dvnode, key); + if (IS_ERR(req)) + return PTR_ERR(req); + /* round the file position up to the next entry boundary */ - ctx->pos += sizeof(union afs_dirent) - 1; - ctx->pos &= ~(sizeof(union afs_dirent) - 1); + ctx->pos += sizeof(union afs_xdr_dirent) - 1; + ctx->pos &= ~(sizeof(union afs_xdr_dirent) - 1); /* walk through the blocks in sequence */ ret = 0; - while (ctx->pos < dir->i_size) { - blkoff = ctx->pos & ~(sizeof(union afs_dir_block) - 1); + while (ctx->pos < req->actual_len) { + blkoff = ctx->pos & ~(sizeof(union afs_xdr_dir_block) - 1); - /* fetch the appropriate page from the directory */ - page = afs_dir_get_page(dir, blkoff / PAGE_SIZE, key); - if (IS_ERR(page)) { - ret = PTR_ERR(page); + /* Fetch the appropriate page from the directory and re-add it + * to the LRU. 
+ */ + page = req->pages[blkoff / PAGE_SIZE]; + if (!page) { + ret = -EIO; break; } + mark_page_accessed(page); limit = blkoff & ~(PAGE_SIZE - 1); - dbuf = page_address(page); + dbuf = kmap(page); /* deal with the individual blocks stashed on this page */ do { dblock = &dbuf->blocks[(blkoff % PAGE_SIZE) / - sizeof(union afs_dir_block)]; + sizeof(union afs_xdr_dir_block)]; ret = afs_dir_iterate_block(ctx, dblock, blkoff); if (ret != 1) { - afs_dir_put_page(page); + kunmap(page); goto out; } - blkoff += sizeof(union afs_dir_block); + blkoff += sizeof(union afs_xdr_dir_block); } while (ctx->pos < dir->i_size && blkoff < limit); - afs_dir_put_page(page); + kunmap(page); ret = 0; } out: + afs_put_read(req); _leave(" = %d", ret); return ret; } @@ -414,23 +487,23 @@ static int afs_readdir(struct file *file, struct dir_context *ctx) } /* - * search the directory for a name + * Search the directory for a single name * - if afs_dir_iterate_block() spots this function, it'll pass the FID * uniquifier through dtype */ -static int afs_lookup_filldir(struct dir_context *ctx, const char *name, - int nlen, loff_t fpos, u64 ino, unsigned dtype) +static int afs_lookup_one_filldir(struct dir_context *ctx, const char *name, + int nlen, loff_t fpos, u64 ino, unsigned dtype) { - struct afs_lookup_cookie *cookie = - container_of(ctx, struct afs_lookup_cookie, ctx); + struct afs_lookup_one_cookie *cookie = + container_of(ctx, struct afs_lookup_one_cookie, ctx); _enter("{%s,%u},%s,%u,,%llu,%u", cookie->name.name, cookie->name.len, name, nlen, (unsigned long long) ino, dtype); /* insanity checks first */ - BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048); - BUILD_BUG_ON(sizeof(union afs_dirent) != 32); + BUILD_BUG_ON(sizeof(union afs_xdr_dir_block) != 2048); + BUILD_BUG_ON(sizeof(union afs_xdr_dirent) != 32); if (cookie->name.len != nlen || memcmp(cookie->name.name, name, nlen) != 0) { @@ -447,15 +520,15 @@ static int afs_lookup_filldir(struct dir_context *ctx, const char *name, } /* - * 
do a lookup in a directory + * Do a lookup of a single name in a directory * - just returns the FID the dentry name maps to if found */ -static int afs_do_lookup(struct inode *dir, struct dentry *dentry, - struct afs_fid *fid, struct key *key) +static int afs_do_lookup_one(struct inode *dir, struct dentry *dentry, + struct afs_fid *fid, struct key *key) { struct afs_super_info *as = dir->i_sb->s_fs_info; - struct afs_lookup_cookie cookie = { - .ctx.actor = afs_lookup_filldir, + struct afs_lookup_one_cookie cookie = { + .ctx.actor = afs_lookup_one_filldir, .name = dentry->d_name, .fid.vid = as->volume->vid }; @@ -482,70 +555,265 @@ static int afs_do_lookup(struct inode *dir, struct dentry *dentry, } /* - * Probe to see if a cell may exist. This prevents positive dentries from - * being created unnecessarily. + * search the directory for a name + * - if afs_dir_iterate_block() spots this function, it'll pass the FID + * uniquifier through dtype */ -static int afs_probe_cell_name(struct dentry *dentry) +static int afs_lookup_filldir(struct dir_context *ctx, const char *name, + int nlen, loff_t fpos, u64 ino, unsigned dtype) { - struct afs_cell *cell; - const char *name = dentry->d_name.name; - size_t len = dentry->d_name.len; + struct afs_lookup_cookie *cookie = + container_of(ctx, struct afs_lookup_cookie, ctx); int ret; - /* Names prefixed with a dot are R/W mounts. 
*/ - if (name[0] == '.') { - if (len == 1) - return -EINVAL; - name++; - len--; - } + _enter("{%s,%u},%s,%u,,%llu,%u", + cookie->name.name, cookie->name.len, name, nlen, + (unsigned long long) ino, dtype); - cell = afs_lookup_cell_rcu(afs_d2net(dentry), name, len); - if (!IS_ERR(cell)) { - afs_put_cell(afs_d2net(dentry), cell); - return 0; + /* insanity checks first */ + BUILD_BUG_ON(sizeof(union afs_xdr_dir_block) != 2048); + BUILD_BUG_ON(sizeof(union afs_xdr_dirent) != 32); + + if (cookie->found) { + if (cookie->nr_fids < 50) { + cookie->fids[cookie->nr_fids].vnode = ino; + cookie->fids[cookie->nr_fids].unique = dtype; + cookie->nr_fids++; + } + } else if (cookie->name.len == nlen && + memcmp(cookie->name.name, name, nlen) == 0) { + cookie->fids[0].vnode = ino; + cookie->fids[0].unique = dtype; + cookie->found = 1; + if (cookie->one_only) + return -1; } - ret = dns_query("afsdb", name, len, "ipv4", NULL, NULL); - if (ret == -ENODATA) - ret = -EDESTADDRREQ; + ret = cookie->nr_fids >= 50 ? -1 : 0; + _leave(" = %d", ret); return ret; } /* - * Try to auto mount the mountpoint with pseudo directory, if the autocell - * operation is setted. + * Do a lookup in a directory. We make use of bulk lookup to query a slew of + * files in one go and create inodes for them. The inode of the file we were + * asked for is returned. 
*/ -static struct inode *afs_try_auto_mntpt(struct dentry *dentry, - struct inode *dir, struct afs_fid *fid) +static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, + struct key *key) { - struct afs_vnode *vnode = AFS_FS_I(dir); - struct inode *inode; - int ret = -ENOENT; + struct afs_lookup_cookie *cookie; + struct afs_cb_interest *cbi = NULL; + struct afs_super_info *as = dir->i_sb->s_fs_info; + struct afs_iget_data data; + struct afs_fs_cursor fc; + struct afs_vnode *dvnode = AFS_FS_I(dir); + struct inode *inode = NULL; + int ret, i; - _enter("%p{%pd}, {%x:%u}", - dentry, dentry, vnode->fid.vid, vnode->fid.vnode); + _enter("{%lu},%p{%pd},", dir->i_ino, dentry, dentry); + + cookie = kzalloc(sizeof(struct afs_lookup_cookie), GFP_KERNEL); + if (!cookie) + return ERR_PTR(-ENOMEM); + + cookie->ctx.actor = afs_lookup_filldir; + cookie->name = dentry->d_name; + cookie->nr_fids = 1; /* slot 0 is saved for the fid we actually want */ + + read_seqlock_excl(&dvnode->cb_lock); + if (dvnode->cb_interest && + dvnode->cb_interest->server && + test_bit(AFS_SERVER_FL_NO_IBULK, &dvnode->cb_interest->server->flags)) + cookie->one_only = true; + read_sequnlock_excl(&dvnode->cb_lock); + + for (i = 0; i < 50; i++) + cookie->fids[i].vid = as->volume->vid; + + /* search the directory */ + ret = afs_dir_iterate(dir, &cookie->ctx, key); + if (ret < 0) { + inode = ERR_PTR(ret); + goto out; + } - if (!test_bit(AFS_VNODE_AUTOCELL, &vnode->flags)) + inode = ERR_PTR(-ENOENT); + if (!cookie->found) goto out; - ret = afs_probe_cell_name(dentry); - if (ret < 0) + /* Check to see if we already have an inode for the primary fid. 
*/ + data.volume = dvnode->volume; + data.fid = cookie->fids[0]; + inode = ilookup5(dir->i_sb, cookie->fids[0].vnode, afs_iget5_test, &data); + if (inode) goto out; - inode = afs_iget_pseudo_dir(dir->i_sb, false); - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); + /* Need space for examining all the selected files */ + inode = ERR_PTR(-ENOMEM); + cookie->statuses = kcalloc(cookie->nr_fids, sizeof(struct afs_file_status), + GFP_KERNEL); + if (!cookie->statuses) goto out; + + cookie->callbacks = kcalloc(cookie->nr_fids, sizeof(struct afs_callback), + GFP_KERNEL); + if (!cookie->callbacks) + goto out_s; + + /* Try FS.InlineBulkStatus first. Abort codes for the individual + * lookups contained therein are stored in the reply without aborting + * the whole operation. + */ + if (cookie->one_only) + goto no_inline_bulk_status; + + inode = ERR_PTR(-ERESTARTSYS); + if (afs_begin_vnode_operation(&fc, dvnode, key)) { + while (afs_select_fileserver(&fc)) { + if (test_bit(AFS_SERVER_FL_NO_IBULK, + &fc.cbi->server->flags)) { + fc.ac.abort_code = RX_INVALID_OPERATION; + fc.ac.error = -ECONNABORTED; + break; + } + afs_fs_inline_bulk_status(&fc, + afs_v2net(dvnode), + cookie->fids, + cookie->statuses, + cookie->callbacks, + cookie->nr_fids, NULL); + } + + if (fc.ac.error == 0) + cbi = afs_get_cb_interest(fc.cbi); + if (fc.ac.abort_code == RX_INVALID_OPERATION) + set_bit(AFS_SERVER_FL_NO_IBULK, &fc.cbi->server->flags); + inode = ERR_PTR(afs_end_vnode_operation(&fc)); } - *fid = AFS_FS_I(inode)->fid; - _leave("= %p", inode); - return inode; + if (!IS_ERR(inode)) + goto success; + if (fc.ac.abort_code != RX_INVALID_OPERATION) + goto out_c; + +no_inline_bulk_status: + /* We could try FS.BulkStatus next, but this aborts the entire op if + * any of the lookups fails - so, for the moment, revert to + * FS.FetchStatus for just the primary fid. 
+ */ + cookie->nr_fids = 1; + inode = ERR_PTR(-ERESTARTSYS); + if (afs_begin_vnode_operation(&fc, dvnode, key)) { + while (afs_select_fileserver(&fc)) { + afs_fs_fetch_status(&fc, + afs_v2net(dvnode), + cookie->fids, + cookie->statuses, + cookie->callbacks, + NULL); + } + + if (fc.ac.error == 0) + cbi = afs_get_cb_interest(fc.cbi); + inode = ERR_PTR(afs_end_vnode_operation(&fc)); + } + if (IS_ERR(inode)) + goto out_c; + + for (i = 0; i < cookie->nr_fids; i++) + cookie->statuses[i].abort_code = 0; + +success: + /* Turn all the files into inodes and save the first one - which is the + * one we actually want. + */ + if (cookie->statuses[0].abort_code != 0) + inode = ERR_PTR(afs_abort_to_error(cookie->statuses[0].abort_code)); + + for (i = 0; i < cookie->nr_fids; i++) { + struct inode *ti; + + if (cookie->statuses[i].abort_code != 0) + continue; + + ti = afs_iget(dir->i_sb, key, &cookie->fids[i], + &cookie->statuses[i], + &cookie->callbacks[i], + cbi); + if (i == 0) { + inode = ti; + } else { + if (!IS_ERR(ti)) + iput(ti); + } + } + +out_c: + afs_put_cb_interest(afs_v2net(dvnode), cbi); + kfree(cookie->callbacks); +out_s: + kfree(cookie->statuses); out: - _leave("= %d", ret); - return ERR_PTR(ret); + kfree(cookie); + return inode; +} + +/* + * Look up an entry in a directory with @sys substitution. + */ +static struct dentry *afs_lookup_atsys(struct inode *dir, struct dentry *dentry, + struct key *key) +{ + struct afs_sysnames *subs; + struct afs_net *net = afs_i2net(dir); + struct dentry *ret; + char *buf, *p, *name; + int len, i; + + _enter(""); + + ret = ERR_PTR(-ENOMEM); + p = buf = kmalloc(AFSNAMEMAX, GFP_KERNEL); + if (!buf) + goto out_p; + if (dentry->d_name.len > 4) { + memcpy(p, dentry->d_name.name, dentry->d_name.len - 4); + p += dentry->d_name.len - 4; + } + + /* There is an ordered list of substitutes that we have to try. 
*/ + read_lock(&net->sysnames_lock); + subs = net->sysnames; + refcount_inc(&subs->usage); + read_unlock(&net->sysnames_lock); + + for (i = 0; i < subs->nr; i++) { + name = subs->subs[i]; + len = dentry->d_name.len - 4 + strlen(name); + if (len >= AFSNAMEMAX) { + ret = ERR_PTR(-ENAMETOOLONG); + goto out_s; + } + + strcpy(p, name); + ret = lookup_one_len(buf, dentry->d_parent, len); + if (IS_ERR(ret) || d_is_positive(ret)) + goto out_s; + dput(ret); + } + + /* We don't want to d_add() the @sys dentry here as we don't want to + * the cached dentry to hide changes to the sysnames list. + */ + ret = NULL; +out_s: + afs_put_sysnames(subs); + kfree(buf); +out_p: + key_put(key); + return ret; } /* @@ -554,16 +822,13 @@ out: static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { - struct afs_vnode *vnode; - struct afs_fid fid; + struct afs_vnode *dvnode = AFS_FS_I(dir); struct inode *inode; struct key *key; int ret; - vnode = AFS_FS_I(dir); - _enter("{%x:%u},%p{%pd},", - vnode->fid.vid, vnode->fid.vnode, dentry, dentry); + dvnode->fid.vid, dvnode->fid.vnode, dentry, dentry); ASSERTCMP(d_inode(dentry), ==, NULL); @@ -572,28 +837,37 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, return ERR_PTR(-ENAMETOOLONG); } - if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { + if (test_bit(AFS_VNODE_DELETED, &dvnode->flags)) { _leave(" = -ESTALE"); return ERR_PTR(-ESTALE); } - key = afs_request_key(vnode->volume->cell); + key = afs_request_key(dvnode->volume->cell); if (IS_ERR(key)) { _leave(" = %ld [key]", PTR_ERR(key)); return ERR_CAST(key); } - ret = afs_validate(vnode, key); + ret = afs_validate(dvnode, key); if (ret < 0) { key_put(key); _leave(" = %d [val]", ret); return ERR_PTR(ret); } - ret = afs_do_lookup(dir, dentry, &fid, key); - if (ret < 0) { + if (dentry->d_name.len >= 4 && + dentry->d_name.name[dentry->d_name.len - 4] == '@' && + dentry->d_name.name[dentry->d_name.len - 3] == 's' && + 
dentry->d_name.name[dentry->d_name.len - 2] == 'y' && + dentry->d_name.name[dentry->d_name.len - 1] == 's') + return afs_lookup_atsys(dir, dentry, key); + + afs_stat_v(dvnode, n_lookup); + inode = afs_do_lookup(dir, dentry, key); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); if (ret == -ENOENT) { - inode = afs_try_auto_mntpt(dentry, dir, &fid); + inode = afs_try_auto_mntpt(dentry, dir); if (!IS_ERR(inode)) { key_put(key); goto success; @@ -611,10 +885,9 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, _leave(" = %d [do]", ret); return ERR_PTR(ret); } - dentry->d_fsdata = (void *)(unsigned long) vnode->status.data_version; + dentry->d_fsdata = (void *)(unsigned long)dvnode->status.data_version; /* instantiate the dentry */ - inode = afs_iget(dir->i_sb, key, &fid, NULL, NULL, NULL); key_put(key); if (IS_ERR(inode)) { _leave(" = %ld", PTR_ERR(inode)); @@ -623,9 +896,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, success: d_add(dentry, inode); - _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%u }", - fid.vnode, - fid.unique, + _leave(" = 0 { ino=%lu v=%u }", d_inode(dentry)->i_ino, d_inode(dentry)->i_generation); @@ -633,67 +904,23 @@ success: } /* - * Look up an entry in a dynroot directory. 
- */ -static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentry, - unsigned int flags) -{ - struct afs_vnode *vnode; - struct afs_fid fid; - struct inode *inode; - int ret; - - vnode = AFS_FS_I(dir); - - _enter("%pd", dentry); - - ASSERTCMP(d_inode(dentry), ==, NULL); - - if (dentry->d_name.len >= AFSNAMEMAX) { - _leave(" = -ENAMETOOLONG"); - return ERR_PTR(-ENAMETOOLONG); - } - - inode = afs_try_auto_mntpt(dentry, dir, &fid); - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); - if (ret == -ENOENT) { - d_add(dentry, NULL); - _leave(" = NULL [negative]"); - return NULL; - } - _leave(" = %d [do]", ret); - return ERR_PTR(ret); - } - - d_add(dentry, inode); - _leave(" = 0 { ino=%lu v=%u }", - d_inode(dentry)->i_ino, d_inode(dentry)->i_generation); - return NULL; -} - -/* * check that a dentry lookup hit has found a valid entry * - NOTE! the hit can be a negative hit too, so we can't assume we have an * inode */ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) { - struct afs_super_info *as = dentry->d_sb->s_fs_info; struct afs_vnode *vnode, *dir; struct afs_fid uninitialized_var(fid); struct dentry *parent; struct inode *inode; struct key *key; - void *dir_version; + long dir_version, de_version; int ret; if (flags & LOOKUP_RCU) return -ECHILD; - if (as->dyn_root) - return 1; - if (d_really_is_positive(dentry)) { vnode = AFS_FS_I(d_inode(dentry)); _enter("{v={%x:%u} n=%pd fl=%lx},", @@ -729,14 +956,25 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) goto out_bad_parent; } - dir_version = (void *) (unsigned long) dir->status.data_version; - if (dentry->d_fsdata == dir_version) - goto out_valid; /* the dir contents are unchanged */ + /* We only need to invalidate a dentry if the server's copy changed + * behind our back. If we made the change, it's no problem. Note that + * on a 32-bit system, we only have 32 bits in the dentry to store the + * version. 
+ */ + dir_version = (long)dir->status.data_version; + de_version = (long)dentry->d_fsdata; + if (de_version == dir_version) + goto out_valid; + + dir_version = (long)dir->invalid_before; + if (de_version - dir_version >= 0) + goto out_valid; _debug("dir modified"); + afs_stat_v(dir, n_reval); /* search the directory for this vnode */ - ret = afs_do_lookup(&dir->vfs_inode, dentry, &fid, key); + ret = afs_do_lookup_one(&dir->vfs_inode, dentry, &fid, key); switch (ret) { case 0: /* the filename maps to something */ @@ -789,7 +1027,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) } out_valid: - dentry->d_fsdata = dir_version; + dentry->d_fsdata = (void *)dir_version; dput(parent); key_put(key); _leave(" = 1 [valid]"); @@ -840,7 +1078,7 @@ zap: /* * handle dentry release */ -static void afs_d_release(struct dentry *dentry) +void afs_d_release(struct dentry *dentry) { _enter("%pd", dentry); } @@ -854,6 +1092,7 @@ static void afs_vnode_new_inode(struct afs_fs_cursor *fc, struct afs_file_status *newstatus, struct afs_callback *newcb) { + struct afs_vnode *vnode; struct inode *inode; if (fc->ac.error < 0) @@ -871,6 +1110,8 @@ static void afs_vnode_new_inode(struct afs_fs_cursor *fc, return; } + vnode = AFS_FS_I(inode); + set_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); d_add(new_dentry, inode); } @@ -885,6 +1126,7 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct afs_vnode *dvnode = AFS_FS_I(dir); struct afs_fid newfid; struct key *key; + u64 data_version = dvnode->status.data_version; int ret; mode |= S_IFDIR; @@ -902,7 +1144,7 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (afs_begin_vnode_operation(&fc, dvnode, key)) { while (afs_select_fileserver(&fc)) { fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; - afs_fs_create(&fc, dentry->d_name.name, mode, + afs_fs_create(&fc, dentry->d_name.name, mode, data_version, &newfid, &newstatus, &newcb); } @@ -916,6 +1158,11 @@ 
static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) goto error_key; } + if (ret == 0 && + test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) + afs_edit_dir_add(dvnode, &dentry->d_name, &newfid, + afs_edit_dir_for_create); + key_put(key); _leave(" = 0"); return 0; @@ -939,6 +1186,7 @@ static void afs_dir_remove_subdir(struct dentry *dentry) clear_nlink(&vnode->vfs_inode); set_bit(AFS_VNODE_DELETED, &vnode->flags); clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); + clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); } } @@ -950,6 +1198,7 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry) struct afs_fs_cursor fc; struct afs_vnode *dvnode = AFS_FS_I(dir); struct key *key; + u64 data_version = dvnode->status.data_version; int ret; _enter("{%x:%u},{%pd}", @@ -965,13 +1214,18 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry) if (afs_begin_vnode_operation(&fc, dvnode, key)) { while (afs_select_fileserver(&fc)) { fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; - afs_fs_remove(&fc, dentry->d_name.name, true); + afs_fs_remove(&fc, dentry->d_name.name, true, + data_version); } afs_vnode_commit_status(&fc, dvnode, fc.cb_break); ret = afs_end_vnode_operation(&fc); - if (ret == 0) + if (ret == 0) { afs_dir_remove_subdir(dentry); + if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) + afs_edit_dir_remove(dvnode, &dentry->d_name, + afs_edit_dir_for_rmdir); + } } key_put(key); @@ -1036,6 +1290,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode; struct key *key; unsigned long d_version = (unsigned long)dentry->d_fsdata; + u64 data_version = dvnode->status.data_version; int ret; _enter("{%x:%u},{%pd}", @@ -1062,7 +1317,8 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) if (afs_begin_vnode_operation(&fc, dvnode, key)) { while (afs_select_fileserver(&fc)) { fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; - afs_fs_remove(&fc, dentry->d_name.name, 
false); + afs_fs_remove(&fc, dentry->d_name.name, false, + data_version); } afs_vnode_commit_status(&fc, dvnode, fc.cb_break); @@ -1071,6 +1327,10 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) ret = afs_dir_remove_link( dentry, key, d_version, (unsigned long)dvnode->status.data_version); + if (ret == 0 && + test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) + afs_edit_dir_remove(dvnode, &dentry->d_name, + afs_edit_dir_for_unlink); } error_key: @@ -1092,6 +1352,7 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct afs_vnode *dvnode = AFS_FS_I(dir); struct afs_fid newfid; struct key *key; + u64 data_version = dvnode->status.data_version; int ret; mode |= S_IFREG; @@ -1113,7 +1374,7 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, if (afs_begin_vnode_operation(&fc, dvnode, key)) { while (afs_select_fileserver(&fc)) { fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; - afs_fs_create(&fc, dentry->d_name.name, mode, + afs_fs_create(&fc, dentry->d_name.name, mode, data_version, &newfid, &newstatus, &newcb); } @@ -1127,6 +1388,10 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, goto error_key; } + if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) + afs_edit_dir_add(dvnode, &dentry->d_name, &newfid, + afs_edit_dir_for_create); + key_put(key); _leave(" = 0"); return 0; @@ -1148,10 +1413,12 @@ static int afs_link(struct dentry *from, struct inode *dir, struct afs_fs_cursor fc; struct afs_vnode *dvnode, *vnode; struct key *key; + u64 data_version; int ret; vnode = AFS_FS_I(d_inode(from)); dvnode = AFS_FS_I(dir); + data_version = dvnode->status.data_version; _enter("{%x:%u},{%x:%u},{%pd}", vnode->fid.vid, vnode->fid.vnode, @@ -1178,7 +1445,7 @@ static int afs_link(struct dentry *from, struct inode *dir, while (afs_select_fileserver(&fc)) { fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; fc.cb_break_2 = vnode->cb_break + vnode->cb_s_break; - 
afs_fs_link(&fc, vnode, dentry->d_name.name); + afs_fs_link(&fc, vnode, dentry->d_name.name, data_version); } afs_vnode_commit_status(&fc, dvnode, fc.cb_break); @@ -1194,6 +1461,10 @@ static int afs_link(struct dentry *from, struct inode *dir, goto error_key; } + if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) + afs_edit_dir_add(dvnode, &dentry->d_name, &vnode->fid, + afs_edit_dir_for_link); + key_put(key); _leave(" = 0"); return 0; @@ -1217,6 +1488,7 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry, struct afs_vnode *dvnode = AFS_FS_I(dir); struct afs_fid newfid; struct key *key; + u64 data_version = dvnode->status.data_version; int ret; _enter("{%x:%u},{%pd},%s", @@ -1241,7 +1513,8 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry, if (afs_begin_vnode_operation(&fc, dvnode, key)) { while (afs_select_fileserver(&fc)) { fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; - afs_fs_symlink(&fc, dentry->d_name.name, content, + afs_fs_symlink(&fc, dentry->d_name.name, + content, data_version, &newfid, &newstatus); } @@ -1255,6 +1528,10 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry, goto error_key; } + if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) + afs_edit_dir_add(dvnode, &dentry->d_name, &newfid, + afs_edit_dir_for_symlink); + key_put(key); _leave(" = 0"); return 0; @@ -1277,6 +1554,8 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, struct afs_fs_cursor fc; struct afs_vnode *orig_dvnode, *new_dvnode, *vnode; struct key *key; + u64 orig_data_version, new_data_version; + bool new_negative = d_is_negative(new_dentry); int ret; if (flags) @@ -1285,6 +1564,8 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, vnode = AFS_FS_I(d_inode(old_dentry)); orig_dvnode = AFS_FS_I(old_dir); new_dvnode = AFS_FS_I(new_dir); + orig_data_version = orig_dvnode->status.data_version; + new_data_version = new_dvnode->status.data_version; _enter("{%x:%u},{%x:%u},{%x:%u},{%pd}", 
orig_dvnode->fid.vid, orig_dvnode->fid.vnode, @@ -1310,7 +1591,8 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, fc.cb_break = orig_dvnode->cb_break + orig_dvnode->cb_s_break; fc.cb_break_2 = new_dvnode->cb_break + new_dvnode->cb_s_break; afs_fs_rename(&fc, old_dentry->d_name.name, - new_dvnode, new_dentry->d_name.name); + new_dvnode, new_dentry->d_name.name, + orig_data_version, new_data_version); } afs_vnode_commit_status(&fc, orig_dvnode, fc.cb_break); @@ -1322,9 +1604,68 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, goto error_key; } + if (ret == 0) { + if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags)) + afs_edit_dir_remove(orig_dvnode, &old_dentry->d_name, + afs_edit_dir_for_rename); + + if (!new_negative && + test_bit(AFS_VNODE_DIR_VALID, &new_dvnode->flags)) + afs_edit_dir_remove(new_dvnode, &new_dentry->d_name, + afs_edit_dir_for_rename); + + if (test_bit(AFS_VNODE_DIR_VALID, &new_dvnode->flags)) + afs_edit_dir_add(new_dvnode, &new_dentry->d_name, + &vnode->fid, afs_edit_dir_for_rename); + } + error_key: key_put(key); error: _leave(" = %d", ret); return ret; } + +/* + * Release a directory page and clean up its private state if it's not busy + * - return true if the page can now be released, false if not + */ +static int afs_dir_releasepage(struct page *page, gfp_t gfp_flags) +{ + struct afs_vnode *dvnode = AFS_FS_I(page->mapping->host); + + _enter("{{%x:%u}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, page->index); + + set_page_private(page, 0); + ClearPagePrivate(page); + + /* The directory will need reloading. 
*/ + if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) + afs_stat_v(dvnode, n_relpg); + return 1; +} + +/* + * invalidate part or all of a page + * - release a page and clean up its private data if offset is 0 (indicating + * the entire page) + */ +static void afs_dir_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) +{ + struct afs_vnode *dvnode = AFS_FS_I(page->mapping->host); + + _enter("{%lu},%u,%u", page->index, offset, length); + + BUG_ON(!PageLocked(page)); + + /* The directory will need reloading. */ + if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) + afs_stat_v(dvnode, n_inval); + + /* we clean up only if the entire page is being invalidated */ + if (offset == 0 && length == PAGE_SIZE) { + set_page_private(page, 0); + ClearPagePrivate(page); + } +} diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c new file mode 100644 index 000000000000..8b400f5aead5 --- /dev/null +++ b/fs/afs/dir_edit.c @@ -0,0 +1,505 @@ +/* AFS filesystem directory editing + * + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. + * Written by David Howells ([email protected]) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/pagemap.h> +#include <linux/iversion.h> +#include "internal.h" +#include "xdr_fs.h" + +/* + * Find a number of contiguous clear bits in a directory block bitmask. + * + * There are 64 slots, which means we can load the entire bitmap into a + * variable. The first bit doesn't count as it corresponds to the block header + * slot. nr_slots is between 1 and 9. 
+ */ +static int afs_find_contig_bits(union afs_xdr_dir_block *block, unsigned int nr_slots) +{ + u64 bitmap; + u32 mask; + int bit, n; + + bitmap = (u64)block->hdr.bitmap[0] << 0 * 8; + bitmap |= (u64)block->hdr.bitmap[1] << 1 * 8; + bitmap |= (u64)block->hdr.bitmap[2] << 2 * 8; + bitmap |= (u64)block->hdr.bitmap[3] << 3 * 8; + bitmap |= (u64)block->hdr.bitmap[4] << 4 * 8; + bitmap |= (u64)block->hdr.bitmap[5] << 5 * 8; + bitmap |= (u64)block->hdr.bitmap[6] << 6 * 8; + bitmap |= (u64)block->hdr.bitmap[7] << 7 * 8; + bitmap >>= 1; /* The first entry is metadata */ + bit = 1; + mask = (1 << nr_slots) - 1; + + do { + if (sizeof(unsigned long) == 8) + n = ffz(bitmap); + else + n = ((u32)bitmap) != 0 ? + ffz((u32)bitmap) : + ffz((u32)(bitmap >> 32)) + 32; + bitmap >>= n; + bit += n; + + if ((bitmap & mask) == 0) { + if (bit > 64 - nr_slots) + return -1; + return bit; + } + + n = __ffs(bitmap); + bitmap >>= n; + bit += n; + } while (bitmap); + + return -1; +} + +/* + * Set a number of contiguous bits in the directory block bitmap. + */ +static void afs_set_contig_bits(union afs_xdr_dir_block *block, + int bit, unsigned int nr_slots) +{ + u64 mask, before, after; + + mask = (1 << nr_slots) - 1; + mask <<= bit; + + before = *(u64 *)block->hdr.bitmap; + + block->hdr.bitmap[0] |= (u8)(mask >> 0 * 8); + block->hdr.bitmap[1] |= (u8)(mask >> 1 * 8); + block->hdr.bitmap[2] |= (u8)(mask >> 2 * 8); + block->hdr.bitmap[3] |= (u8)(mask >> 3 * 8); + block->hdr.bitmap[4] |= (u8)(mask >> 4 * 8); + block->hdr.bitmap[5] |= (u8)(mask >> 5 * 8); + block->hdr.bitmap[6] |= (u8)(mask >> 6 * 8); + block->hdr.bitmap[7] |= (u8)(mask >> 7 * 8); + + after = *(u64 *)block->hdr.bitmap; +} + +/* + * Clear a number of contiguous bits in the directory block bitmap. 
+ */ +static void afs_clear_contig_bits(union afs_xdr_dir_block *block, + int bit, unsigned int nr_slots) +{ + u64 mask, before, after; + + mask = (1 << nr_slots) - 1; + mask <<= bit; + + before = *(u64 *)block->hdr.bitmap; + + block->hdr.bitmap[0] &= ~(u8)(mask >> 0 * 8); + block->hdr.bitmap[1] &= ~(u8)(mask >> 1 * 8); + block->hdr.bitmap[2] &= ~(u8)(mask >> 2 * 8); + block->hdr.bitmap[3] &= ~(u8)(mask >> 3 * 8); + block->hdr.bitmap[4] &= ~(u8)(mask >> 4 * 8); + block->hdr.bitmap[5] &= ~(u8)(mask >> 5 * 8); + block->hdr.bitmap[6] &= ~(u8)(mask >> 6 * 8); + block->hdr.bitmap[7] &= ~(u8)(mask >> 7 * 8); + + after = *(u64 *)block->hdr.bitmap; +} + +/* + * Scan a directory block looking for a dirent of the right name. + */ +static int afs_dir_scan_block(union afs_xdr_dir_block *block, struct qstr *name, + unsigned int blocknum) +{ + union afs_xdr_dirent *de; + u64 bitmap; + int d, len, n; + + _enter(""); + + bitmap = (u64)block->hdr.bitmap[0] << 0 * 8; + bitmap |= (u64)block->hdr.bitmap[1] << 1 * 8; + bitmap |= (u64)block->hdr.bitmap[2] << 2 * 8; + bitmap |= (u64)block->hdr.bitmap[3] << 3 * 8; + bitmap |= (u64)block->hdr.bitmap[4] << 4 * 8; + bitmap |= (u64)block->hdr.bitmap[5] << 5 * 8; + bitmap |= (u64)block->hdr.bitmap[6] << 6 * 8; + bitmap |= (u64)block->hdr.bitmap[7] << 7 * 8; + + for (d = (blocknum == 0 ? AFS_DIR_RESV_BLOCKS0 : AFS_DIR_RESV_BLOCKS); + d < AFS_DIR_SLOTS_PER_BLOCK; + d++) { + if (!((bitmap >> d) & 1)) + continue; + de = &block->dirents[d]; + if (de->u.valid != 1) + continue; + + /* The block was NUL-terminated by afs_dir_check_page(). */ + len = strlen(de->u.name); + if (len == name->len && + memcmp(de->u.name, name->name, name->len) == 0) + return d; + + n = round_up(12 + len + 1 + 4, AFS_DIR_DIRENT_SIZE); + n /= AFS_DIR_DIRENT_SIZE; + d += n - 1; + } + + return -1; +} + +/* + * Initialise a new directory block. Note that block 0 is special and contains + * some extra metadata. 
+ */ +static void afs_edit_init_block(union afs_xdr_dir_block *meta, + union afs_xdr_dir_block *block, int block_num) +{ + memset(block, 0, sizeof(*block)); + block->hdr.npages = htons(1); + block->hdr.magic = AFS_DIR_MAGIC; + block->hdr.bitmap[0] = 1; + + if (block_num == 0) { + block->hdr.bitmap[0] = 0xff; + block->hdr.bitmap[1] = 0x1f; + memset(block->meta.alloc_ctrs, + AFS_DIR_SLOTS_PER_BLOCK, + sizeof(block->meta.alloc_ctrs)); + meta->meta.alloc_ctrs[0] = + AFS_DIR_SLOTS_PER_BLOCK - AFS_DIR_RESV_BLOCKS0; + } + + if (block_num < AFS_DIR_BLOCKS_WITH_CTR) + meta->meta.alloc_ctrs[block_num] = + AFS_DIR_SLOTS_PER_BLOCK - AFS_DIR_RESV_BLOCKS; +} + +/* + * Edit a directory's file data to add a new directory entry. Doing this after + * create, mkdir, symlink, link or rename if the data version number is + * incremented by exactly one avoids the need to re-download the entire + * directory contents. + * + * The caller must hold the inode locked. + */ +void afs_edit_dir_add(struct afs_vnode *vnode, + struct qstr *name, struct afs_fid *new_fid, + enum afs_edit_dir_reason why) +{ + union afs_xdr_dir_block *meta, *block; + struct afs_xdr_dir_page *meta_page, *dir_page; + union afs_xdr_dirent *de; + struct page *page0, *page; + unsigned int need_slots, nr_blocks, b; + pgoff_t index; + loff_t i_size; + gfp_t gfp; + int slot; + + _enter(",,{%d,%s},", name->len, name->name); + + i_size = i_size_read(&vnode->vfs_inode); + if (i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS || + (i_size & (AFS_DIR_BLOCK_SIZE - 1))) { + clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + return; + } + + gfp = vnode->vfs_inode.i_mapping->gfp_mask; + page0 = find_or_create_page(vnode->vfs_inode.i_mapping, 0, gfp); + if (!page0) { + clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + _leave(" [fgp]"); + return; + } + + /* Work out how many slots we're going to need. 
*/ + need_slots = round_up(12 + name->len + 1 + 4, AFS_DIR_DIRENT_SIZE); + need_slots /= AFS_DIR_DIRENT_SIZE; + + meta_page = kmap(page0); + meta = &meta_page->blocks[0]; + if (i_size == 0) + goto new_directory; + nr_blocks = i_size / AFS_DIR_BLOCK_SIZE; + + /* Find a block that has sufficient slots available. Each VM page + * contains two or more directory blocks. + */ + for (b = 0; b < nr_blocks + 1; b++) { + /* If the directory extended into a new page, then we need to + * tack a new page on the end. + */ + index = b / AFS_DIR_BLOCKS_PER_PAGE; + if (index == 0) { + page = page0; + dir_page = meta_page; + } else { + if (nr_blocks >= AFS_DIR_MAX_BLOCKS) + goto error; + gfp = vnode->vfs_inode.i_mapping->gfp_mask; + page = find_or_create_page(vnode->vfs_inode.i_mapping, + index, gfp); + if (!page) + goto error; + if (!PagePrivate(page)) { + set_page_private(page, 1); + SetPagePrivate(page); + } + dir_page = kmap(page); + } + + /* Abandon the edit if we got a callback break. */ + if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) + goto invalidated; + + block = &dir_page->blocks[b % AFS_DIR_BLOCKS_PER_PAGE]; + + _debug("block %u: %2u %3u %u", + b, + (b < AFS_DIR_BLOCKS_WITH_CTR) ? meta->meta.alloc_ctrs[b] : 99, + ntohs(block->hdr.npages), + ntohs(block->hdr.magic)); + + /* Initialise the block if necessary. */ + if (b == nr_blocks) { + _debug("init %u", b); + afs_edit_init_block(meta, block, b); + i_size_write(&vnode->vfs_inode, (b + 1) * AFS_DIR_BLOCK_SIZE); + } + + /* Only lower dir pages have a counter in the header. */ + if (b >= AFS_DIR_BLOCKS_WITH_CTR || + meta->meta.alloc_ctrs[b] >= need_slots) { + /* We need to try and find one or more consecutive + * slots to hold the entry. 
+ */ + slot = afs_find_contig_bits(block, need_slots); + if (slot >= 0) { + _debug("slot %u", slot); + goto found_space; + } + } + + if (page != page0) { + unlock_page(page); + kunmap(page); + put_page(page); + } + } + + /* There are no spare slots of sufficient size, yet the operation + * succeeded. Download the directory again. + */ + trace_afs_edit_dir(vnode, why, afs_edit_dir_create_nospc, 0, 0, 0, 0, name->name); + clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + goto out_unmap; + +new_directory: + afs_edit_init_block(meta, meta, 0); + i_size = AFS_DIR_BLOCK_SIZE; + i_size_write(&vnode->vfs_inode, i_size); + slot = AFS_DIR_RESV_BLOCKS0; + page = page0; + block = meta; + nr_blocks = 1; + b = 0; + +found_space: + /* Set the dirent slot. */ + trace_afs_edit_dir(vnode, why, afs_edit_dir_create, b, slot, + new_fid->vnode, new_fid->unique, name->name); + de = &block->dirents[slot]; + de->u.valid = 1; + de->u.unused[0] = 0; + de->u.hash_next = 0; // TODO: Really need to maintain this + de->u.vnode = htonl(new_fid->vnode); + de->u.unique = htonl(new_fid->unique); + memcpy(de->u.name, name->name, name->len + 1); + de->u.name[name->len] = 0; + + /* Adjust the bitmap. */ + afs_set_contig_bits(block, slot, need_slots); + if (page != page0) { + unlock_page(page); + kunmap(page); + put_page(page); + } + + /* Adjust the allocation counter. 
*/ + if (b < AFS_DIR_BLOCKS_WITH_CTR) + meta->meta.alloc_ctrs[b] -= need_slots; + + inode_inc_iversion_raw(&vnode->vfs_inode); + afs_stat_v(vnode, n_dir_cr); + _debug("Insert %s in %u[%u]", name->name, b, slot); + +out_unmap: + unlock_page(page0); + kunmap(page0); + put_page(page0); + _leave(""); + return; + +invalidated: + trace_afs_edit_dir(vnode, why, afs_edit_dir_create_inval, 0, 0, 0, 0, name->name); + clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + if (page != page0) { + kunmap(page); + put_page(page); + } + goto out_unmap; + +error: + trace_afs_edit_dir(vnode, why, afs_edit_dir_create_error, 0, 0, 0, 0, name->name); + clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + goto out_unmap; +} + +/* + * Edit a directory's file data to remove a new directory entry. Doing this + * after unlink, rmdir or rename if the data version number is incremented by + * exactly one avoids the need to re-download the entire directory contents. + * + * The caller must hold the inode locked. + */ +void afs_edit_dir_remove(struct afs_vnode *vnode, + struct qstr *name, enum afs_edit_dir_reason why) +{ + struct afs_xdr_dir_page *meta_page, *dir_page; + union afs_xdr_dir_block *meta, *block; + union afs_xdr_dirent *de; + struct page *page0, *page; + unsigned int need_slots, nr_blocks, b; + pgoff_t index; + loff_t i_size; + int slot; + + _enter(",,{%d,%s},", name->len, name->name); + + i_size = i_size_read(&vnode->vfs_inode); + if (i_size < AFS_DIR_BLOCK_SIZE || + i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS || + (i_size & (AFS_DIR_BLOCK_SIZE - 1))) { + clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + return; + } + nr_blocks = i_size / AFS_DIR_BLOCK_SIZE; + + page0 = find_lock_page(vnode->vfs_inode.i_mapping, 0); + if (!page0) { + clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + _leave(" [fgp]"); + return; + } + + /* Work out how many slots we're going to discard. 
*/ + need_slots = round_up(12 + name->len + 1 + 4, AFS_DIR_DIRENT_SIZE); + need_slots /= AFS_DIR_DIRENT_SIZE; + + meta_page = kmap(page0); + meta = &meta_page->blocks[0]; + + /* Find a page that has sufficient slots available. Each VM page + * contains two or more directory blocks. + */ + for (b = 0; b < nr_blocks; b++) { + index = b / AFS_DIR_BLOCKS_PER_PAGE; + if (index != 0) { + page = find_lock_page(vnode->vfs_inode.i_mapping, index); + if (!page) + goto error; + dir_page = kmap(page); + } else { + page = page0; + dir_page = meta_page; + } + + /* Abandon the edit if we got a callback break. */ + if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) + goto invalidated; + + block = &dir_page->blocks[b % AFS_DIR_BLOCKS_PER_PAGE]; + + if (b > AFS_DIR_BLOCKS_WITH_CTR || + meta->meta.alloc_ctrs[b] <= AFS_DIR_SLOTS_PER_BLOCK - 1 - need_slots) { + slot = afs_dir_scan_block(block, name, b); + if (slot >= 0) + goto found_dirent; + } + + if (page != page0) { + unlock_page(page); + kunmap(page); + put_page(page); + } + } + + /* Didn't find the dirent to clobber. Download the directory again. */ + trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_noent, + 0, 0, 0, 0, name->name); + clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + goto out_unmap; + +found_dirent: + de = &block->dirents[slot]; + + trace_afs_edit_dir(vnode, why, afs_edit_dir_delete, b, slot, + ntohl(de->u.vnode), ntohl(de->u.unique), + name->name); + + memset(de, 0, sizeof(*de) * need_slots); + + /* Adjust the bitmap. */ + afs_clear_contig_bits(block, slot, need_slots); + if (page != page0) { + unlock_page(page); + kunmap(page); + put_page(page); + } + + /* Adjust the allocation counter. 
*/ + if (b < AFS_DIR_BLOCKS_WITH_CTR) + meta->meta.alloc_ctrs[b] += need_slots; + + inode_set_iversion_raw(&vnode->vfs_inode, vnode->status.data_version); + afs_stat_v(vnode, n_dir_rm); + _debug("Remove %s from %u[%u]", name->name, b, slot); + +out_unmap: + unlock_page(page0); + kunmap(page0); + put_page(page0); + _leave(""); + return; + +invalidated: + trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_inval, + 0, 0, 0, 0, name->name); + clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + if (page != page0) { + unlock_page(page); + kunmap(page); + put_page(page); + } + goto out_unmap; + +error: + trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_error, + 0, 0, 0, 0, name->name); + clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + goto out_unmap; +} diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c new file mode 100644 index 000000000000..983f3946ab57 --- /dev/null +++ b/fs/afs/dynroot.c @@ -0,0 +1,209 @@ +/* dir.c: AFS dynamic root handling + * + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. + * Written by David Howells ([email protected]) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/dns_resolver.h> +#include "internal.h" + +const struct file_operations afs_dynroot_file_operations = { + .open = dcache_dir_open, + .release = dcache_dir_close, + .iterate_shared = dcache_readdir, + .llseek = dcache_dir_lseek, +}; + +/* + * Probe to see if a cell may exist. This prevents positive dentries from + * being created unnecessarily. + */ +static int afs_probe_cell_name(struct dentry *dentry) +{ + struct afs_cell *cell; + const char *name = dentry->d_name.name; + size_t len = dentry->d_name.len; + int ret; + + /* Names prefixed with a dot are R/W mounts. 
*/ + if (name[0] == '.') { + if (len == 1) + return -EINVAL; + name++; + len--; + } + + cell = afs_lookup_cell_rcu(afs_d2net(dentry), name, len); + if (!IS_ERR(cell)) { + afs_put_cell(afs_d2net(dentry), cell); + return 0; + } + + ret = dns_query("afsdb", name, len, "ipv4", NULL, NULL); + if (ret == -ENODATA) + ret = -EDESTADDRREQ; + return ret; +} + +/* + * Try to auto mount the mountpoint with pseudo directory, if the autocell + * operation is setted. + */ +struct inode *afs_try_auto_mntpt(struct dentry *dentry, struct inode *dir) +{ + struct afs_vnode *vnode = AFS_FS_I(dir); + struct inode *inode; + int ret = -ENOENT; + + _enter("%p{%pd}, {%x:%u}", + dentry, dentry, vnode->fid.vid, vnode->fid.vnode); + + if (!test_bit(AFS_VNODE_AUTOCELL, &vnode->flags)) + goto out; + + ret = afs_probe_cell_name(dentry); + if (ret < 0) + goto out; + + inode = afs_iget_pseudo_dir(dir->i_sb, false); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto out; + } + + _leave("= %p", inode); + return inode; + +out: + _leave("= %d", ret); + return ERR_PTR(ret); +} + +/* + * Look up @cell in a dynroot directory. This is a substitution for the + * local cell name for the net namespace. 
+ */ +static struct dentry *afs_lookup_atcell(struct dentry *dentry) +{ + struct afs_cell *cell; + struct afs_net *net = afs_d2net(dentry); + struct dentry *ret; + unsigned int seq = 0; + char *name; + int len; + + if (!net->ws_cell) + return ERR_PTR(-ENOENT); + + ret = ERR_PTR(-ENOMEM); + name = kmalloc(AFS_MAXCELLNAME + 1, GFP_KERNEL); + if (!name) + goto out_p; + + rcu_read_lock(); + do { + read_seqbegin_or_lock(&net->cells_lock, &seq); + cell = rcu_dereference_raw(net->ws_cell); + if (cell) { + len = cell->name_len; + memcpy(name, cell->name, len + 1); + } + } while (need_seqretry(&net->cells_lock, seq)); + done_seqretry(&net->cells_lock, seq); + rcu_read_unlock(); + + ret = ERR_PTR(-ENOENT); + if (!cell) + goto out_n; + + ret = lookup_one_len(name, dentry->d_parent, len); + + /* We don't want to d_add() the @cell dentry here as we don't want to + * the cached dentry to hide changes to the local cell name. + */ + +out_n: + kfree(name); +out_p: + return ret; +} + +/* + * Look up an entry in a dynroot directory. 
+ */ +static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct afs_vnode *vnode; + struct inode *inode; + int ret; + + vnode = AFS_FS_I(dir); + + _enter("%pd", dentry); + + ASSERTCMP(d_inode(dentry), ==, NULL); + + if (dentry->d_name.len >= AFSNAMEMAX) { + _leave(" = -ENAMETOOLONG"); + return ERR_PTR(-ENAMETOOLONG); + } + + if (dentry->d_name.len == 5 && + memcmp(dentry->d_name.name, "@cell", 5) == 0) + return afs_lookup_atcell(dentry); + + inode = afs_try_auto_mntpt(dentry, dir); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + if (ret == -ENOENT) { + d_add(dentry, NULL); + _leave(" = NULL [negative]"); + return NULL; + } + _leave(" = %d [do]", ret); + return ERR_PTR(ret); + } + + d_add(dentry, inode); + _leave(" = 0 { ino=%lu v=%u }", + d_inode(dentry)->i_ino, d_inode(dentry)->i_generation); + return NULL; +} + +const struct inode_operations afs_dynroot_inode_operations = { + .lookup = afs_dynroot_lookup, +}; + +/* + * Dirs in the dynamic root don't need revalidation. + */ +static int afs_dynroot_d_revalidate(struct dentry *dentry, unsigned int flags) +{ + return 1; +} + +/* + * Allow the VFS to enquire as to whether a dentry should be unhashed (mustn't + * sleep) + * - called from dput() when d_count is going to 0. 
+ * - return 1 to request dentry be unhashed, 0 otherwise + */ +static int afs_dynroot_d_delete(const struct dentry *dentry) +{ + return d_really_is_positive(dentry); +} + +const struct dentry_operations afs_dynroot_dentry_operations = { + .d_revalidate = afs_dynroot_d_revalidate, + .d_delete = afs_dynroot_d_delete, + .d_release = afs_d_release, + .d_automount = afs_d_automount, +}; diff --git a/fs/afs/file.c b/fs/afs/file.c index 79e665a35fea..c24c08016dd9 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -30,7 +30,6 @@ static int afs_readpages(struct file *filp, struct address_space *mapping, const struct file_operations afs_file_operations = { .open = afs_open, - .flush = afs_flush, .release = afs_release, .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, @@ -146,6 +145,9 @@ int afs_open(struct inode *inode, struct file *file) if (ret < 0) goto error_af; } + + if (file->f_flags & O_TRUNC) + set_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); file->private_data = af; _leave(" = 0"); @@ -170,6 +172,9 @@ int afs_release(struct inode *inode, struct file *file) _enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode); + if ((file->f_mode & FMODE_WRITE)) + return vfs_fsync(file, 0); + file->private_data = NULL; if (af->wb) afs_put_wb_key(af->wb); @@ -187,10 +192,12 @@ void afs_put_read(struct afs_read *req) { int i; - if (atomic_dec_and_test(&req->usage)) { + if (refcount_dec_and_test(&req->usage)) { for (i = 0; i < req->nr_pages; i++) if (req->pages[i]) put_page(req->pages[i]); + if (req->pages != req->array) + kfree(req->pages); kfree(req); } } @@ -240,6 +247,12 @@ int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *de ret = afs_end_vnode_operation(&fc); } + if (ret == 0) { + afs_stat_v(vnode, n_fetches); + atomic_long_add(desc->actual_len, + &afs_v2net(vnode)->n_fetch_bytes); + } + _leave(" = %d", ret); return ret; } @@ -297,10 +310,11 @@ int afs_page_filler(void *data, struct page *page) * end of the file, the server will return 
a short read and the * unmarshalling code will clear the unfilled space. */ - atomic_set(&req->usage, 1); + refcount_set(&req->usage, 1); req->pos = (loff_t)page->index << PAGE_SHIFT; req->len = PAGE_SIZE; req->nr_pages = 1; + req->pages = req->array; req->pages[0] = page; get_page(page); @@ -309,10 +323,6 @@ int afs_page_filler(void *data, struct page *page) ret = afs_fetch_data(vnode, key, req); afs_put_read(req); - if (ret >= 0 && S_ISDIR(inode->i_mode) && - !afs_dir_check_page(inode, page)) - ret = -EIO; - if (ret < 0) { if (ret == -ENOENT) { _debug("got NOENT from server" @@ -447,10 +457,11 @@ static int afs_readpages_one(struct file *file, struct address_space *mapping, if (!req) return -ENOMEM; - atomic_set(&req->usage, 1); + refcount_set(&req->usage, 1); req->page_done = afs_readpages_page_done; req->pos = first->index; req->pos <<= PAGE_SHIFT; + req->pages = req->array; /* Transfer the pages to the request. We add them in until one fails * to add to the LRU and then we stop (as that'll make a hole in the diff --git a/fs/afs/flock.c b/fs/afs/flock.c index c40ba2fe3cbe..7a0e017070ec 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -613,7 +613,7 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl) posix_test_lock(file, fl); if (fl->fl_type == F_UNLCK) { /* no local locks; consult the server */ - ret = afs_fetch_status(vnode, key); + ret = afs_fetch_status(vnode, key, false); if (ret < 0) goto error; diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 88ec38c2d83c..efacdb7c1dee 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -16,6 +16,7 @@ #include <linux/iversion.h> #include "internal.h" #include "afs_fs.h" +#include "xdr_fs.h" static const struct afs_fid afs_zero_fid; @@ -44,109 +45,194 @@ static void xdr_decode_AFSFid(const __be32 **_bp, struct afs_fid *fid) } /* - * decode an AFSFetchStatus block + * Dump a bad file status record. 
*/ -static void xdr_decode_AFSFetchStatus(const __be32 **_bp, - struct afs_file_status *status, - struct afs_vnode *vnode, - afs_dataversion_t *store_version) +static void xdr_dump_bad(const __be32 *bp) { - afs_dataversion_t expected_version; - const __be32 *bp = *_bp; + __be32 x[4]; + int i; + + pr_notice("AFS XDR: Bad status record\n"); + for (i = 0; i < 5 * 4 * 4; i += 16) { + memcpy(x, bp, 16); + bp += 4; + pr_notice("%03x: %08x %08x %08x %08x\n", + i, ntohl(x[0]), ntohl(x[1]), ntohl(x[2]), ntohl(x[3])); + } + + memcpy(x, bp, 4); + pr_notice("0x50: %08x\n", ntohl(x[0])); +} + +/* + * Update the core inode struct from a returned status record. + */ +void afs_update_inode_from_status(struct afs_vnode *vnode, + struct afs_file_status *status, + const afs_dataversion_t *expected_version, + u8 flags) +{ + struct timespec t; umode_t mode; + + t.tv_sec = status->mtime_client; + t.tv_nsec = 0; + vnode->vfs_inode.i_ctime = t; + vnode->vfs_inode.i_mtime = t; + vnode->vfs_inode.i_atime = t; + + if (flags & (AFS_VNODE_META_CHANGED | AFS_VNODE_NOT_YET_SET)) { + vnode->vfs_inode.i_uid = make_kuid(&init_user_ns, status->owner); + vnode->vfs_inode.i_gid = make_kgid(&init_user_ns, status->group); + set_nlink(&vnode->vfs_inode, status->nlink); + + mode = vnode->vfs_inode.i_mode; + mode &= ~S_IALLUGO; + mode |= status->mode; + barrier(); + vnode->vfs_inode.i_mode = mode; + } + + if (!(flags & AFS_VNODE_NOT_YET_SET)) { + if (expected_version && + *expected_version != status->data_version) { + _debug("vnode modified %llx on {%x:%u} [exp %llx]", + (unsigned long long) status->data_version, + vnode->fid.vid, vnode->fid.vnode, + (unsigned long long) *expected_version); + vnode->invalid_before = status->data_version; + if (vnode->status.type == AFS_FTYPE_DIR) { + if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) + afs_stat_v(vnode, n_inval); + } else { + set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags); + } + } else if (vnode->status.type == AFS_FTYPE_DIR) { + /* Expected directory 
change is handled elsewhere so + * that we can locally edit the directory and save on a + * download. + */ + if (test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) + flags &= ~AFS_VNODE_DATA_CHANGED; + } + } + + if (flags & (AFS_VNODE_DATA_CHANGED | AFS_VNODE_NOT_YET_SET)) { + inode_set_iversion_raw(&vnode->vfs_inode, status->data_version); + i_size_write(&vnode->vfs_inode, status->size); + } +} + +/* + * decode an AFSFetchStatus block + */ +static int xdr_decode_AFSFetchStatus(struct afs_call *call, + const __be32 **_bp, + struct afs_file_status *status, + struct afs_vnode *vnode, + const afs_dataversion_t *expected_version, + struct afs_read *read_req) +{ + const struct afs_xdr_AFSFetchStatus *xdr = (const void *)*_bp; u64 data_version, size; - bool changed = false; - kuid_t owner; - kgid_t group; + u32 type, abort_code; + u8 flags = 0; + int ret; if (vnode) write_seqlock(&vnode->cb_lock); -#define EXTRACT(DST) \ - do { \ - u32 x = ntohl(*bp++); \ - if (DST != x) \ - changed |= true; \ - DST = x; \ - } while (0) - - status->if_version = ntohl(*bp++); - EXTRACT(status->type); - EXTRACT(status->nlink); - size = ntohl(*bp++); - data_version = ntohl(*bp++); - EXTRACT(status->author); - owner = make_kuid(&init_user_ns, ntohl(*bp++)); - changed |= !uid_eq(owner, status->owner); - status->owner = owner; - EXTRACT(status->caller_access); /* call ticket dependent */ - EXTRACT(status->anon_access); - EXTRACT(status->mode); - bp++; /* parent.vnode */ - bp++; /* parent.unique */ - bp++; /* seg size */ - status->mtime_client = ntohl(*bp++); - status->mtime_server = ntohl(*bp++); - group = make_kgid(&init_user_ns, ntohl(*bp++)); - changed |= !gid_eq(group, status->group); - status->group = group; - bp++; /* sync counter */ - data_version |= (u64) ntohl(*bp++) << 32; - EXTRACT(status->lock_count); - size |= (u64) ntohl(*bp++) << 32; - bp++; /* spare 4 */ - *_bp = bp; + if (xdr->if_version != htonl(AFS_FSTATUS_VERSION)) { + pr_warn("Unknown AFSFetchStatus version %u\n", 
ntohl(xdr->if_version)); + goto bad; + } - if (size != status->size) { - status->size = size; - changed |= true; + type = ntohl(xdr->type); + abort_code = ntohl(xdr->abort_code); + switch (type) { + case AFS_FTYPE_FILE: + case AFS_FTYPE_DIR: + case AFS_FTYPE_SYMLINK: + if (type != status->type && + vnode && + !test_bit(AFS_VNODE_UNSET, &vnode->flags)) { + pr_warning("Vnode %x:%x:%x changed type %u to %u\n", + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + status->type, type); + goto bad; + } + status->type = type; + break; + case AFS_FTYPE_INVALID: + if (abort_code != 0) { + status->abort_code = abort_code; + ret = 0; + goto out; + } + /* Fall through */ + default: + goto bad; } - status->mode &= S_IALLUGO; - _debug("vnode time %lx, %lx", - status->mtime_client, status->mtime_server); +#define EXTRACT_M(FIELD) \ + do { \ + u32 x = ntohl(xdr->FIELD); \ + if (status->FIELD != x) { \ + flags |= AFS_VNODE_META_CHANGED; \ + status->FIELD = x; \ + } \ + } while (0) - if (vnode) { - if (changed && !test_bit(AFS_VNODE_UNSET, &vnode->flags)) { - _debug("vnode changed"); - i_size_write(&vnode->vfs_inode, size); - vnode->vfs_inode.i_uid = status->owner; - vnode->vfs_inode.i_gid = status->group; - vnode->vfs_inode.i_generation = vnode->fid.unique; - set_nlink(&vnode->vfs_inode, status->nlink); - - mode = vnode->vfs_inode.i_mode; - mode &= ~S_IALLUGO; - mode |= status->mode; - barrier(); - vnode->vfs_inode.i_mode = mode; - } + EXTRACT_M(nlink); + EXTRACT_M(author); + EXTRACT_M(owner); + EXTRACT_M(caller_access); /* call ticket dependent */ + EXTRACT_M(anon_access); + EXTRACT_M(mode); + EXTRACT_M(group); + + status->mtime_client = ntohl(xdr->mtime_client); + status->mtime_server = ntohl(xdr->mtime_server); + status->lock_count = ntohl(xdr->lock_count); + + size = (u64)ntohl(xdr->size_lo); + size |= (u64)ntohl(xdr->size_hi) << 32; + status->size = size; + + data_version = (u64)ntohl(xdr->data_version_lo); + data_version |= (u64)ntohl(xdr->data_version_hi) << 32; + if 
(data_version != status->data_version) { + status->data_version = data_version; + flags |= AFS_VNODE_DATA_CHANGED; + } - vnode->vfs_inode.i_ctime.tv_sec = status->mtime_client; - vnode->vfs_inode.i_mtime = vnode->vfs_inode.i_ctime; - vnode->vfs_inode.i_atime = vnode->vfs_inode.i_ctime; - inode_set_iversion_raw(&vnode->vfs_inode, data_version); + if (read_req) { + read_req->data_version = data_version; + read_req->file_size = size; } - expected_version = status->data_version; - if (store_version) - expected_version = *store_version; + *_bp = (const void *)*_bp + sizeof(*xdr); - if (expected_version != data_version) { - status->data_version = data_version; - if (vnode && !test_bit(AFS_VNODE_UNSET, &vnode->flags)) { - _debug("vnode modified %llx on {%x:%u}", - (unsigned long long) data_version, - vnode->fid.vid, vnode->fid.vnode); - set_bit(AFS_VNODE_DIR_MODIFIED, &vnode->flags); - set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags); - } - } else if (store_version) { - status->data_version = data_version; + if (vnode) { + if (test_bit(AFS_VNODE_UNSET, &vnode->flags)) + flags |= AFS_VNODE_NOT_YET_SET; + afs_update_inode_from_status(vnode, status, expected_version, + flags); } + ret = 0; + +out: if (vnode) write_sequnlock(&vnode->cb_lock); + return ret; + +bad: + xdr_dump_bad(*_bp); + ret = afs_protocol_error(call, -EBADMSG); + goto out; } /* @@ -274,7 +360,7 @@ static void xdr_decode_AFSFetchVolumeStatus(const __be32 **_bp, /* * deliver reply data to an FS.FetchStatus */ -static int afs_deliver_fs_fetch_status(struct afs_call *call) +static int afs_deliver_fs_fetch_status_vnode(struct afs_call *call) { struct afs_vnode *vnode = call->reply[0]; const __be32 *bp; @@ -288,7 +374,9 @@ static int afs_deliver_fs_fetch_status(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); + if (xdr_decode_AFSFetchStatus(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL) < 
0) + return afs_protocol_error(call, -EBADMSG); xdr_decode_AFSCallBack(call, vnode, &bp); if (call->reply[1]) xdr_decode_AFSVolSync(&bp, call->reply[1]); @@ -300,17 +388,18 @@ static int afs_deliver_fs_fetch_status(struct afs_call *call) /* * FS.FetchStatus operation type */ -static const struct afs_call_type afs_RXFSFetchStatus = { - .name = "FS.FetchStatus", +static const struct afs_call_type afs_RXFSFetchStatus_vnode = { + .name = "FS.FetchStatus(vnode)", .op = afs_FS_FetchStatus, - .deliver = afs_deliver_fs_fetch_status, + .deliver = afs_deliver_fs_fetch_status_vnode, .destructor = afs_flat_call_destructor, }; /* * fetch the status information for a file */ -int afs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_volsync *volsync) +int afs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_volsync *volsync, + bool new_inode) { struct afs_vnode *vnode = fc->vnode; struct afs_call *call; @@ -320,7 +409,8 @@ int afs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_volsync *volsy _enter(",%x,{%x:%u},,", key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); - call = afs_alloc_flat_call(net, &afs_RXFSFetchStatus, 16, (21 + 3 + 6) * 4); + call = afs_alloc_flat_call(net, &afs_RXFSFetchStatus_vnode, + 16, (21 + 3 + 6) * 4); if (!call) { fc->ac.error = -ENOMEM; return -ENOMEM; @@ -329,6 +419,7 @@ int afs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_volsync *volsy call->key = fc->key; call->reply[0] = vnode; call->reply[1] = volsync; + call->expected_version = new_inode ? 
1 : vnode->status.data_version; /* marshall the parameters */ bp = call->request; @@ -464,7 +555,9 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) return ret; bp = call->buffer; - xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); + if (xdr_decode_AFSFetchStatus(call, &bp, &vnode->status, vnode, + &vnode->status.data_version, req) < 0) + return afs_protocol_error(call, -EBADMSG); xdr_decode_AFSCallBack(call, vnode, &bp); if (call->reply[1]) xdr_decode_AFSVolSync(&bp, call->reply[1]); @@ -534,6 +627,7 @@ static int afs_fs_fetch_data64(struct afs_fs_cursor *fc, struct afs_read *req) call->reply[0] = vnode; call->reply[1] = NULL; /* volsync */ call->reply[2] = req; + call->expected_version = vnode->status.data_version; /* marshall the parameters */ bp = call->request; @@ -546,7 +640,7 @@ static int afs_fs_fetch_data64(struct afs_fs_cursor *fc, struct afs_read *req) bp[6] = 0; bp[7] = htonl(lower_32_bits(req->len)); - atomic_inc(&req->usage); + refcount_inc(&req->usage); call->cb_break = fc->cb_break; afs_use_fs_server(call, fc->cbi); trace_afs_make_fs_call(call, &vnode->fid); @@ -578,6 +672,7 @@ int afs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_read *req) call->reply[0] = vnode; call->reply[1] = NULL; /* volsync */ call->reply[2] = req; + call->expected_version = vnode->status.data_version; /* marshall the parameters */ bp = call->request; @@ -588,7 +683,7 @@ int afs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_read *req) bp[4] = htonl(lower_32_bits(req->pos)); bp[5] = htonl(lower_32_bits(req->len)); - atomic_inc(&req->usage); + refcount_inc(&req->usage); call->cb_break = fc->cb_break; afs_use_fs_server(call, fc->cbi); trace_afs_make_fs_call(call, &vnode->fid); @@ -613,8 +708,10 @@ static int afs_deliver_fs_create_vnode(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; xdr_decode_AFSFid(&bp, call->reply[1]); - xdr_decode_AFSFetchStatus(&bp, call->reply[2], NULL, NULL); - 
xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); + if (xdr_decode_AFSFetchStatus(call, &bp, call->reply[2], NULL, NULL, NULL) < 0 || + xdr_decode_AFSFetchStatus(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL) < 0) + return afs_protocol_error(call, -EBADMSG); xdr_decode_AFSCallBack_raw(&bp, call->reply[3]); /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ @@ -645,6 +742,7 @@ static const struct afs_call_type afs_RXFSMakeDir = { int afs_fs_create(struct afs_fs_cursor *fc, const char *name, umode_t mode, + u64 current_data_version, struct afs_fid *newfid, struct afs_file_status *newstatus, struct afs_callback *newcb) @@ -672,6 +770,7 @@ int afs_fs_create(struct afs_fs_cursor *fc, call->reply[1] = newfid; call->reply[2] = newstatus; call->reply[3] = newcb; + call->expected_version = current_data_version + 1; /* marshall the parameters */ bp = call->request; @@ -715,7 +814,9 @@ static int afs_deliver_fs_remove(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); + if (xdr_decode_AFSFetchStatus(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL) < 0) + return afs_protocol_error(call, -EBADMSG); /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ _leave(" = 0 [done]"); @@ -742,7 +843,8 @@ static const struct afs_call_type afs_RXFSRemoveDir = { /* * remove a file or directory */ -int afs_fs_remove(struct afs_fs_cursor *fc, const char *name, bool isdir) +int afs_fs_remove(struct afs_fs_cursor *fc, const char *name, bool isdir, + u64 current_data_version) { struct afs_vnode *vnode = fc->vnode; struct afs_call *call; @@ -764,6 +866,7 @@ int afs_fs_remove(struct afs_fs_cursor *fc, const char *name, bool isdir) call->key = fc->key; call->reply[0] = vnode; + call->expected_version = current_data_version + 1; /* marshall the parameters */ bp = call->request; @@ -801,8 +904,10 @@ static int afs_deliver_fs_link(struct afs_call 
*call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); - xdr_decode_AFSFetchStatus(&bp, &dvnode->status, dvnode, NULL); + if (xdr_decode_AFSFetchStatus(call, &bp, &vnode->status, vnode, NULL, NULL) < 0 || + xdr_decode_AFSFetchStatus(call, &bp, &dvnode->status, dvnode, + &call->expected_version, NULL) < 0) + return afs_protocol_error(call, -EBADMSG); /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ _leave(" = 0 [done]"); @@ -823,7 +928,7 @@ static const struct afs_call_type afs_RXFSLink = { * make a hard link */ int afs_fs_link(struct afs_fs_cursor *fc, struct afs_vnode *vnode, - const char *name) + const char *name, u64 current_data_version) { struct afs_vnode *dvnode = fc->vnode; struct afs_call *call; @@ -844,6 +949,7 @@ int afs_fs_link(struct afs_fs_cursor *fc, struct afs_vnode *vnode, call->key = fc->key; call->reply[0] = dvnode; call->reply[1] = vnode; + call->expected_version = current_data_version + 1; /* marshall the parameters */ bp = call->request; @@ -885,8 +991,10 @@ static int afs_deliver_fs_symlink(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; xdr_decode_AFSFid(&bp, call->reply[1]); - xdr_decode_AFSFetchStatus(&bp, call->reply[2], NULL, NULL); - xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); + if (xdr_decode_AFSFetchStatus(call, &bp, call->reply[2], NULL, NULL, NULL) || + xdr_decode_AFSFetchStatus(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL) < 0) + return afs_protocol_error(call, -EBADMSG); /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ _leave(" = 0 [done]"); @@ -909,6 +1017,7 @@ static const struct afs_call_type afs_RXFSSymlink = { int afs_fs_symlink(struct afs_fs_cursor *fc, const char *name, const char *contents, + u64 current_data_version, struct afs_fid *newfid, struct afs_file_status *newstatus) { @@ -937,6 +1046,7 @@ int afs_fs_symlink(struct afs_fs_cursor *fc, 
call->reply[0] = vnode; call->reply[1] = newfid; call->reply[2] = newstatus; + call->expected_version = current_data_version + 1; /* marshall the parameters */ bp = call->request; @@ -987,10 +1097,13 @@ static int afs_deliver_fs_rename(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - xdr_decode_AFSFetchStatus(&bp, &orig_dvnode->status, orig_dvnode, NULL); - if (new_dvnode != orig_dvnode) - xdr_decode_AFSFetchStatus(&bp, &new_dvnode->status, new_dvnode, - NULL); + if (xdr_decode_AFSFetchStatus(call, &bp, &orig_dvnode->status, orig_dvnode, + &call->expected_version, NULL) < 0) + return afs_protocol_error(call, -EBADMSG); + if (new_dvnode != orig_dvnode && + xdr_decode_AFSFetchStatus(call, &bp, &new_dvnode->status, new_dvnode, + &call->expected_version_2, NULL) < 0) + return afs_protocol_error(call, -EBADMSG); /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ _leave(" = 0 [done]"); @@ -1013,7 +1126,9 @@ static const struct afs_call_type afs_RXFSRename = { int afs_fs_rename(struct afs_fs_cursor *fc, const char *orig_name, struct afs_vnode *new_dvnode, - const char *new_name) + const char *new_name, + u64 current_orig_data_version, + u64 current_new_data_version) { struct afs_vnode *orig_dvnode = fc->vnode; struct afs_call *call; @@ -1041,6 +1156,8 @@ int afs_fs_rename(struct afs_fs_cursor *fc, call->key = fc->key; call->reply[0] = orig_dvnode; call->reply[1] = new_dvnode; + call->expected_version = current_orig_data_version + 1; + call->expected_version_2 = current_new_data_version + 1; /* marshall the parameters */ bp = call->request; @@ -1089,8 +1206,9 @@ static int afs_deliver_fs_store_data(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, - &call->store_version); + if (xdr_decode_AFSFetchStatus(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL) < 0) + return afs_protocol_error(call, -EBADMSG); 
/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ afs_pages_written_back(vnode, call); @@ -1147,7 +1265,7 @@ static int afs_fs_store_data64(struct afs_fs_cursor *fc, call->first_offset = offset; call->last_to = to; call->send_pages = true; - call->store_version = vnode->status.data_version + 1; + call->expected_version = vnode->status.data_version + 1; /* marshall the parameters */ bp = call->request; @@ -1222,7 +1340,7 @@ int afs_fs_store_data(struct afs_fs_cursor *fc, struct address_space *mapping, call->first_offset = offset; call->last_to = to; call->send_pages = true; - call->store_version = vnode->status.data_version + 1; + call->expected_version = vnode->status.data_version + 1; /* marshall the parameters */ bp = call->request; @@ -1252,7 +1370,6 @@ int afs_fs_store_data(struct afs_fs_cursor *fc, struct address_space *mapping, */ static int afs_deliver_fs_store_status(struct afs_call *call) { - afs_dataversion_t *store_version; struct afs_vnode *vnode = call->reply[0]; const __be32 *bp; int ret; @@ -1264,12 +1381,10 @@ static int afs_deliver_fs_store_status(struct afs_call *call) return ret; /* unmarshall the reply once we've received all of it */ - store_version = NULL; - if (call->operation_ID == FSSTOREDATA) - store_version = &call->store_version; - bp = call->buffer; - xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, store_version); + if (xdr_decode_AFSFetchStatus(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL) < 0) + return afs_protocol_error(call, -EBADMSG); /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ _leave(" = 0 [done]"); @@ -1324,7 +1439,7 @@ static int afs_fs_setattr_size64(struct afs_fs_cursor *fc, struct iattr *attr) call->key = fc->key; call->reply[0] = vnode; - call->store_version = vnode->status.data_version + 1; + call->expected_version = vnode->status.data_version + 1; /* marshall the parameters */ bp = call->request; @@ -1373,7 +1488,7 @@ static int afs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr 
*attr) call->key = fc->key; call->reply[0] = vnode; - call->store_version = vnode->status.data_version + 1; + call->expected_version = vnode->status.data_version + 1; /* marshall the parameters */ bp = call->request; @@ -1418,6 +1533,7 @@ int afs_fs_setattr(struct afs_fs_cursor *fc, struct iattr *attr) call->key = fc->key; call->reply[0] = vnode; + call->expected_version = vnode->status.data_version; /* marshall the parameters */ bp = call->request; @@ -1471,7 +1587,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) call->count = ntohl(call->tmp); _debug("volname length: %u", call->count); if (call->count >= AFSNAMEMAX) - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); call->offset = 0; call->unmarshall++; @@ -1518,7 +1634,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) call->count = ntohl(call->tmp); _debug("offline msg length: %u", call->count); if (call->count >= AFSNAMEMAX) - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); call->offset = 0; call->unmarshall++; @@ -1565,7 +1681,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) call->count = ntohl(call->tmp); _debug("motd length: %u", call->count); if (call->count >= AFSNAMEMAX) - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); call->offset = 0; call->unmarshall++; @@ -1947,3 +2063,265 @@ int afs_fs_get_capabilities(struct afs_net *net, trace_afs_make_fs_call(call, NULL); return afs_make_call(ac, call, GFP_NOFS, false); } + +/* + * Deliver reply data to an FS.FetchStatus with no vnode. 
+ */ +static int afs_deliver_fs_fetch_status(struct afs_call *call) +{ + struct afs_file_status *status = call->reply[1]; + struct afs_callback *callback = call->reply[2]; + struct afs_volsync *volsync = call->reply[3]; + struct afs_vnode *vnode = call->reply[0]; + const __be32 *bp; + int ret; + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode); + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + xdr_decode_AFSFetchStatus(call, &bp, status, vnode, + &call->expected_version, NULL); + callback[call->count].version = ntohl(bp[0]); + callback[call->count].expiry = ntohl(bp[1]); + callback[call->count].type = ntohl(bp[2]); + if (vnode) + xdr_decode_AFSCallBack(call, vnode, &bp); + else + bp += 3; + if (volsync) + xdr_decode_AFSVolSync(&bp, volsync); + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.FetchStatus operation type + */ +static const struct afs_call_type afs_RXFSFetchStatus = { + .name = "FS.FetchStatus", + .op = afs_FS_FetchStatus, + .deliver = afs_deliver_fs_fetch_status, + .destructor = afs_flat_call_destructor, +}; + +/* + * Fetch the status information for a fid without needing a vnode handle. 
+ */ +int afs_fs_fetch_status(struct afs_fs_cursor *fc, + struct afs_net *net, + struct afs_fid *fid, + struct afs_file_status *status, + struct afs_callback *callback, + struct afs_volsync *volsync) +{ + struct afs_call *call; + __be32 *bp; + + _enter(",%x,{%x:%u},,", + key_serial(fc->key), fid->vid, fid->vnode); + + call = afs_alloc_flat_call(net, &afs_RXFSFetchStatus, 16, (21 + 3 + 6) * 4); + if (!call) { + fc->ac.error = -ENOMEM; + return -ENOMEM; + } + + call->key = fc->key; + call->reply[0] = NULL; /* vnode for fid[0] */ + call->reply[1] = status; + call->reply[2] = callback; + call->reply[3] = volsync; + call->expected_version = 1; /* vnode->status.data_version */ + + /* marshall the parameters */ + bp = call->request; + bp[0] = htonl(FSFETCHSTATUS); + bp[1] = htonl(fid->vid); + bp[2] = htonl(fid->vnode); + bp[3] = htonl(fid->unique); + + call->cb_break = fc->cb_break; + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} + +/* + * Deliver reply data to an FS.InlineBulkStatus call + */ +static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) +{ + struct afs_file_status *statuses; + struct afs_callback *callbacks; + struct afs_vnode *vnode = call->reply[0]; + const __be32 *bp; + u32 tmp; + int ret; + + _enter("{%u}", call->unmarshall); + + switch (call->unmarshall) { + case 0: + call->offset = 0; + call->unmarshall++; + + /* Extract the file status count and array in two steps */ + case 1: + _debug("extract status count"); + ret = afs_extract_data(call, &call->tmp, 4, true); + if (ret < 0) + return ret; + + tmp = ntohl(call->tmp); + _debug("status count: %u/%u", tmp, call->count2); + if (tmp != call->count2) + return afs_protocol_error(call, -EBADMSG); + + call->count = 0; + call->unmarshall++; + more_counts: + call->offset = 0; + + case 2: + _debug("extract status array %u", call->count); + ret = afs_extract_data(call, call->buffer, 21 * 4, true); + if (ret < 0) + 
return ret; + + bp = call->buffer; + statuses = call->reply[1]; + if (xdr_decode_AFSFetchStatus(call, &bp, &statuses[call->count], + call->count == 0 ? vnode : NULL, + NULL, NULL) < 0) + return afs_protocol_error(call, -EBADMSG); + + call->count++; + if (call->count < call->count2) + goto more_counts; + + call->count = 0; + call->unmarshall++; + call->offset = 0; + + /* Extract the callback count and array in two steps */ + case 3: + _debug("extract CB count"); + ret = afs_extract_data(call, &call->tmp, 4, true); + if (ret < 0) + return ret; + + tmp = ntohl(call->tmp); + _debug("CB count: %u", tmp); + if (tmp != call->count2) + return afs_protocol_error(call, -EBADMSG); + call->count = 0; + call->unmarshall++; + more_cbs: + call->offset = 0; + + case 4: + _debug("extract CB array"); + ret = afs_extract_data(call, call->buffer, 3 * 4, true); + if (ret < 0) + return ret; + + _debug("unmarshall CB array"); + bp = call->buffer; + callbacks = call->reply[2]; + callbacks[call->count].version = ntohl(bp[0]); + callbacks[call->count].expiry = ntohl(bp[1]); + callbacks[call->count].type = ntohl(bp[2]); + statuses = call->reply[1]; + if (call->count == 0 && vnode && statuses[0].abort_code == 0) + xdr_decode_AFSCallBack(call, vnode, &bp); + call->count++; + if (call->count < call->count2) + goto more_cbs; + + call->offset = 0; + call->unmarshall++; + + case 5: + ret = afs_extract_data(call, call->buffer, 6 * 4, false); + if (ret < 0) + return ret; + + bp = call->buffer; + if (call->reply[3]) + xdr_decode_AFSVolSync(&bp, call->reply[3]); + + call->offset = 0; + call->unmarshall++; + + case 6: + break; + } + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.InlineBulkStatus operation type + */ +static const struct afs_call_type afs_RXFSInlineBulkStatus = { + .name = "FS.InlineBulkStatus", + .op = afs_FS_InlineBulkStatus, + .deliver = afs_deliver_fs_inline_bulk_status, + .destructor = afs_flat_call_destructor, +}; + +/* + * Fetch the status information for up to 50 files + */ 
+int afs_fs_inline_bulk_status(struct afs_fs_cursor *fc, + struct afs_net *net, + struct afs_fid *fids, + struct afs_file_status *statuses, + struct afs_callback *callbacks, + unsigned int nr_fids, + struct afs_volsync *volsync) +{ + struct afs_call *call; + __be32 *bp; + int i; + + _enter(",%x,{%x:%u},%u", + key_serial(fc->key), fids[0].vid, fids[1].vnode, nr_fids); + + call = afs_alloc_flat_call(net, &afs_RXFSInlineBulkStatus, + (2 + nr_fids * 3) * 4, + 21 * 4); + if (!call) { + fc->ac.error = -ENOMEM; + return -ENOMEM; + } + + call->key = fc->key; + call->reply[0] = NULL; /* vnode for fid[0] */ + call->reply[1] = statuses; + call->reply[2] = callbacks; + call->reply[3] = volsync; + call->count2 = nr_fids; + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSINLINEBULKSTATUS); + *bp++ = htonl(nr_fids); + for (i = 0; i < nr_fids; i++) { + *bp++ = htonl(fids[i].vid); + *bp++ = htonl(fids[i].vnode); + *bp++ = htonl(fids[i].unique); + } + + call->cb_break = fc->cb_break; + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &fids[0]); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 65c5b1edd338..06194cfe9724 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -30,12 +30,11 @@ static const struct inode_operations afs_symlink_inode_operations = { }; /* - * map the AFS file status to the inode member variables + * Initialise an inode from the vnode status. 
*/ -static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) +static int afs_inode_init_from_status(struct afs_vnode *vnode, struct key *key) { struct inode *inode = AFS_VNODE_TO_I(vnode); - bool changed; _debug("FS: ft=%d lk=%d sz=%llu ver=%Lu mod=%hu", vnode->status.type, @@ -46,16 +45,21 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) read_seqlock_excl(&vnode->cb_lock); + afs_update_inode_from_status(vnode, &vnode->status, NULL, + AFS_VNODE_NOT_YET_SET); + switch (vnode->status.type) { case AFS_FTYPE_FILE: inode->i_mode = S_IFREG | vnode->status.mode; inode->i_op = &afs_file_inode_operations; inode->i_fop = &afs_file_operations; + inode->i_mapping->a_ops = &afs_fs_aops; break; case AFS_FTYPE_DIR: inode->i_mode = S_IFDIR | vnode->status.mode; inode->i_op = &afs_dir_inode_operations; inode->i_fop = &afs_dir_file_operations; + inode->i_mapping->a_ops = &afs_dir_aops; break; case AFS_FTYPE_SYMLINK: /* Symlinks with a mode of 0644 are actually mountpoints. 
*/ @@ -67,45 +71,31 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) inode->i_mode = S_IFDIR | 0555; inode->i_op = &afs_mntpt_inode_operations; inode->i_fop = &afs_mntpt_file_operations; + inode->i_mapping->a_ops = &afs_fs_aops; } else { inode->i_mode = S_IFLNK | vnode->status.mode; inode->i_op = &afs_symlink_inode_operations; + inode->i_mapping->a_ops = &afs_fs_aops; } inode_nohighmem(inode); break; default: printk("kAFS: AFS vnode with undefined type\n"); read_sequnlock_excl(&vnode->cb_lock); - return -EBADMSG; + return afs_protocol_error(NULL, -EBADMSG); } - changed = (vnode->status.size != inode->i_size); - - set_nlink(inode, vnode->status.nlink); - inode->i_uid = vnode->status.owner; - inode->i_gid = vnode->status.group; - inode->i_size = vnode->status.size; - inode->i_ctime.tv_sec = vnode->status.mtime_client; - inode->i_ctime.tv_nsec = 0; - inode->i_atime = inode->i_mtime = inode->i_ctime; inode->i_blocks = 0; - inode->i_generation = vnode->fid.unique; - inode_set_iversion_raw(inode, vnode->status.data_version); - inode->i_mapping->a_ops = &afs_fs_aops; + vnode->invalid_before = vnode->status.data_version; read_sequnlock_excl(&vnode->cb_lock); - -#ifdef CONFIG_AFS_FSCACHE - if (changed) - fscache_attr_changed(vnode->cache); -#endif return 0; } /* * Fetch file status from the volume. 
*/ -int afs_fetch_status(struct afs_vnode *vnode, struct key *key) +int afs_fetch_status(struct afs_vnode *vnode, struct key *key, bool new_inode) { struct afs_fs_cursor fc; int ret; @@ -119,7 +109,7 @@ int afs_fetch_status(struct afs_vnode *vnode, struct key *key) if (afs_begin_vnode_operation(&fc, vnode, key)) { while (afs_select_fileserver(&fc)) { fc.cb_break = vnode->cb_break + vnode->cb_s_break; - afs_fs_fetch_file_status(&fc, NULL); + afs_fs_fetch_file_status(&fc, NULL, new_inode); } afs_check_for_remote_deletion(&fc, fc.vnode); @@ -255,6 +245,11 @@ static void afs_get_inode_cache(struct afs_vnode *vnode) } __packed key; struct afs_vnode_cache_aux aux; + if (vnode->status.type == AFS_FTYPE_DIR) { + vnode->cache = NULL; + return; + } + key.vnode_id = vnode->fid.vnode; key.unique = vnode->fid.unique; key.vnode_id_ext[0] = 0; @@ -307,7 +302,7 @@ struct inode *afs_iget(struct super_block *sb, struct key *key, if (!status) { /* it's a remotely extant inode */ - ret = afs_fetch_status(vnode, key); + ret = afs_fetch_status(vnode, key, true); if (ret < 0) goto bad_inode; } else { @@ -331,15 +326,12 @@ struct inode *afs_iget(struct super_block *sb, struct key *key, vnode->cb_expires_at += ktime_get_real_seconds(); } - /* set up caching before mapping the status, as map-status reads the - * first page of symlinks to see if they're really mountpoints */ - inode->i_size = vnode->status.size; - afs_get_inode_cache(vnode); - - ret = afs_inode_map_status(vnode, key); + ret = afs_inode_init_from_status(vnode, key); if (ret < 0) goto bad_inode; + afs_get_inode_cache(vnode); + /* success */ clear_bit(AFS_VNODE_UNSET, &vnode->flags); inode->i_flags |= S_NOATIME; @@ -349,10 +341,6 @@ struct inode *afs_iget(struct super_block *sb, struct key *key, /* failure */ bad_inode: -#ifdef CONFIG_AFS_FSCACHE - fscache_relinquish_cookie(vnode->cache, NULL, ret == -ENOENT); - vnode->cache = NULL; -#endif iget_failed(inode); _leave(" = %d [bad]", ret); return ERR_PTR(ret); @@ -407,8 +395,11 
@@ int afs_validate(struct afs_vnode *vnode, struct key *key) if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { if (vnode->cb_s_break != vnode->cb_interest->server->cb_s_break) { vnode->cb_s_break = vnode->cb_interest->server->cb_s_break; - } else if (!test_bit(AFS_VNODE_DIR_MODIFIED, &vnode->flags) && - !test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags) && + } else if (vnode->status.type == AFS_FTYPE_DIR && + test_bit(AFS_VNODE_DIR_VALID, &vnode->flags) && + vnode->cb_expires_at - 10 > now) { + valid = true; + } else if (!test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags) && vnode->cb_expires_at - 10 > now) { valid = true; } @@ -432,7 +423,7 @@ int afs_validate(struct afs_vnode *vnode, struct key *key) * access */ if (!test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { _debug("not promised"); - ret = afs_fetch_status(vnode, key); + ret = afs_fetch_status(vnode, key, false); if (ret < 0) { if (ret == -ENOENT) { set_bit(AFS_VNODE_DELETED, &vnode->flags); @@ -453,8 +444,6 @@ int afs_validate(struct afs_vnode *vnode, struct key *key) * different */ if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) afs_zap_data(vnode); - - clear_bit(AFS_VNODE_DIR_MODIFIED, &vnode->flags); mutex_unlock(&vnode->validate_lock); valid: _leave(" = 0"); @@ -544,7 +533,7 @@ void afs_evict_inode(struct inode *inode) } #endif - afs_put_permits(vnode->permit_cache); + afs_put_permits(rcu_access_pointer(vnode->permit_cache)); _leave(""); } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index a6a1d75eee41..f8086ec95e24 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -122,7 +122,8 @@ struct afs_call { u32 operation_ID; /* operation ID for an incoming call */ u32 count; /* count for use in unmarshalling */ __be32 tmp; /* place to extract temporary data */ - afs_dataversion_t store_version; /* updated version expected from store */ + afs_dataversion_t expected_version; /* Updated version expected from store */ + afs_dataversion_t expected_version_2; /* 2nd updated version expected from 
store */ }; struct afs_call_type { @@ -173,11 +174,14 @@ struct afs_read { loff_t len; /* How much we're asking for */ loff_t actual_len; /* How much we're actually getting */ loff_t remain; /* Amount remaining */ - atomic_t usage; + loff_t file_size; /* File size returned by server */ + afs_dataversion_t data_version; /* Version number returned by server */ + refcount_t usage; unsigned int index; /* Which page we're reading into */ unsigned int nr_pages; void (*page_done)(struct afs_call *, struct afs_read *); - struct page *pages[]; + struct page **pages; + struct page *array[]; }; /* @@ -199,6 +203,18 @@ static inline struct afs_super_info *AFS_FS_S(struct super_block *sb) extern struct file_system_type afs_fs_type; /* + * Set of substitutes for @sys. + */ +struct afs_sysnames { +#define AFS_NR_SYSNAME 16 + char *subs[AFS_NR_SYSNAME]; + refcount_t usage; + unsigned short nr; + short error; + char blank[1]; +}; + +/* * AFS network namespace record. */ struct afs_net { @@ -245,9 +261,25 @@ struct afs_net { struct mutex lock_manager_mutex; /* Misc */ - struct proc_dir_entry *proc_afs; /* /proc/net/afs directory */ + struct proc_dir_entry *proc_afs; /* /proc/net/afs directory */ + struct afs_sysnames *sysnames; + rwlock_t sysnames_lock; + + /* Statistics counters */ + atomic_t n_lookup; /* Number of lookups done */ + atomic_t n_reval; /* Number of dentries needing revalidation */ + atomic_t n_inval; /* Number of invalidations by the server */ + atomic_t n_relpg; /* Number of invalidations by releasepage */ + atomic_t n_read_dir; /* Number of directory pages read */ + atomic_t n_dir_cr; /* Number of directory entry creation edits */ + atomic_t n_dir_rm; /* Number of directory entry removal edits */ + atomic_t n_stores; /* Number of store ops */ + atomic_long_t n_store_bytes; /* Number of bytes stored */ + atomic_long_t n_fetch_bytes; /* Number of bytes fetched */ + atomic_t n_fetches; /* Number of data fetch ops */ }; +extern const char afs_init_sysname[]; extern 
struct afs_net __afs_net;// Dummy AFS network namespace; TODO: replace with real netns enum afs_cell_state { @@ -363,6 +395,7 @@ struct afs_server { #define AFS_SERVER_FL_UPDATING 4 #define AFS_SERVER_FL_PROBED 5 /* The fileserver has been probed */ #define AFS_SERVER_FL_PROBING 6 /* Fileserver is being probed */ +#define AFS_SERVER_FL_NO_IBULK 7 /* Fileserver doesn't support FS.InlineBulkStatus */ atomic_t usage; u32 addr_version; /* Address list version */ @@ -455,10 +488,11 @@ struct afs_vnode { struct afs_volume *volume; /* volume on which vnode resides */ struct afs_fid fid; /* the file identifier for this inode */ struct afs_file_status status; /* AFS status info for this file */ + afs_dataversion_t invalid_before; /* Child dentries are invalid before this */ #ifdef CONFIG_AFS_FSCACHE struct fscache_cookie *cache; /* caching cookie */ #endif - struct afs_permits *permit_cache; /* cache of permits so far obtained */ + struct afs_permits __rcu *permit_cache; /* cache of permits so far obtained */ struct mutex io_lock; /* Lock for serialising I/O on this mutex */ struct mutex validate_lock; /* lock for validating this vnode */ spinlock_t wb_lock; /* lock for wb_keys */ @@ -466,12 +500,13 @@ struct afs_vnode { unsigned long flags; #define AFS_VNODE_CB_PROMISED 0 /* Set if vnode has a callback promise */ #define AFS_VNODE_UNSET 1 /* set if vnode attributes not yet set */ -#define AFS_VNODE_DIR_MODIFIED 2 /* set if dir vnode's data modified */ +#define AFS_VNODE_DIR_VALID 2 /* Set if dir contents are valid */ #define AFS_VNODE_ZAP_DATA 3 /* set if vnode's data should be invalidated */ #define AFS_VNODE_DELETED 4 /* set if vnode deleted on server */ #define AFS_VNODE_MOUNTPOINT 5 /* set if vnode is a mountpoint symlink */ #define AFS_VNODE_AUTOCELL 6 /* set if Vnode is an auto mount point */ #define AFS_VNODE_PSEUDODIR 7 /* set if Vnode is a pseudo directory */ +#define AFS_VNODE_NEW_CONTENT 8 /* Set if file has new content (create/trunc-0) */ struct list_head 
wb_keys; /* List of keys available for writeback */ struct list_head pending_locks; /* locks waiting to be granted */ @@ -611,7 +646,7 @@ extern struct fscache_cookie_def afs_vnode_cache_index_def; */ extern void afs_init_callback_state(struct afs_server *); extern void afs_break_callback(struct afs_vnode *); -extern void afs_break_callbacks(struct afs_server *, size_t,struct afs_callback[]); +extern void afs_break_callbacks(struct afs_server *, size_t, struct afs_callback_break*); extern int afs_register_server_cb_interest(struct afs_vnode *, struct afs_server_entry *); extern void afs_put_cb_interest(struct afs_net *, struct afs_cb_interest *); @@ -646,11 +681,26 @@ extern bool afs_cm_incoming_call(struct afs_call *); */ extern const struct file_operations afs_dir_file_operations; extern const struct inode_operations afs_dir_inode_operations; +extern const struct address_space_operations afs_dir_aops; +extern const struct dentry_operations afs_fs_dentry_operations; + +extern void afs_d_release(struct dentry *); + +/* + * dir_edit.c + */ +extern void afs_edit_dir_add(struct afs_vnode *, struct qstr *, struct afs_fid *, + enum afs_edit_dir_reason); +extern void afs_edit_dir_remove(struct afs_vnode *, struct qstr *, enum afs_edit_dir_reason); + +/* + * dynroot.c + */ extern const struct file_operations afs_dynroot_file_operations; extern const struct inode_operations afs_dynroot_inode_operations; -extern const struct dentry_operations afs_fs_dentry_operations; +extern const struct dentry_operations afs_dynroot_dentry_operations; -extern bool afs_dir_check_page(struct inode *, struct page *); +extern struct inode *afs_try_auto_mntpt(struct dentry *, struct inode *); /* * file.c @@ -680,17 +730,23 @@ extern int afs_flock(struct file *, int, struct file_lock *); /* * fsclient.c */ -extern int afs_fs_fetch_file_status(struct afs_fs_cursor *, struct afs_volsync *); +#define AFS_VNODE_NOT_YET_SET 0x01 +#define AFS_VNODE_META_CHANGED 0x02 +#define AFS_VNODE_DATA_CHANGED 
0x04 +extern void afs_update_inode_from_status(struct afs_vnode *, struct afs_file_status *, + const afs_dataversion_t *, u8); + +extern int afs_fs_fetch_file_status(struct afs_fs_cursor *, struct afs_volsync *, bool); extern int afs_fs_give_up_callbacks(struct afs_net *, struct afs_server *); extern int afs_fs_fetch_data(struct afs_fs_cursor *, struct afs_read *); -extern int afs_fs_create(struct afs_fs_cursor *, const char *, umode_t, +extern int afs_fs_create(struct afs_fs_cursor *, const char *, umode_t, u64, struct afs_fid *, struct afs_file_status *, struct afs_callback *); -extern int afs_fs_remove(struct afs_fs_cursor *, const char *, bool); -extern int afs_fs_link(struct afs_fs_cursor *, struct afs_vnode *, const char *); -extern int afs_fs_symlink(struct afs_fs_cursor *, const char *, const char *, +extern int afs_fs_remove(struct afs_fs_cursor *, const char *, bool, u64); +extern int afs_fs_link(struct afs_fs_cursor *, struct afs_vnode *, const char *, u64); +extern int afs_fs_symlink(struct afs_fs_cursor *, const char *, const char *, u64, struct afs_fid *, struct afs_file_status *); extern int afs_fs_rename(struct afs_fs_cursor *, const char *, - struct afs_vnode *, const char *); + struct afs_vnode *, const char *, u64, u64); extern int afs_fs_store_data(struct afs_fs_cursor *, struct address_space *, pgoff_t, pgoff_t, unsigned, unsigned); extern int afs_fs_setattr(struct afs_fs_cursor *, struct iattr *); @@ -702,11 +758,18 @@ extern int afs_fs_give_up_all_callbacks(struct afs_net *, struct afs_server *, struct afs_addr_cursor *, struct key *); extern int afs_fs_get_capabilities(struct afs_net *, struct afs_server *, struct afs_addr_cursor *, struct key *); +extern int afs_fs_inline_bulk_status(struct afs_fs_cursor *, struct afs_net *, + struct afs_fid *, struct afs_file_status *, + struct afs_callback *, unsigned int, + struct afs_volsync *); +extern int afs_fs_fetch_status(struct afs_fs_cursor *, struct afs_net *, + struct afs_fid *, struct 
afs_file_status *, + struct afs_callback *, struct afs_volsync *); /* * inode.c */ -extern int afs_fetch_status(struct afs_vnode *, struct key *); +extern int afs_fetch_status(struct afs_vnode *, struct key *, bool); extern int afs_iget5_test(struct inode *, void *); extern struct inode *afs_iget_pseudo_dir(struct super_block *, bool); extern struct inode *afs_iget(struct super_block *, struct key *, @@ -754,6 +817,13 @@ static inline void afs_put_net(struct afs_net *net) { } +static inline void __afs_stat(atomic_t *s) +{ + atomic_inc(s); +} + +#define afs_stat_v(vnode, n) __afs_stat(&afs_v2net(vnode)->n) + /* * misc.c */ @@ -781,6 +851,7 @@ extern int __net_init afs_proc_init(struct afs_net *); extern void __net_exit afs_proc_cleanup(struct afs_net *); extern int afs_proc_cell_setup(struct afs_net *, struct afs_cell *); extern void afs_proc_cell_remove(struct afs_net *, struct afs_cell *); +extern void afs_put_sysnames(struct afs_sysnames *); /* * rotate.c @@ -809,6 +880,7 @@ extern void afs_flat_call_destructor(struct afs_call *); extern void afs_send_empty_reply(struct afs_call *); extern void afs_send_simple_reply(struct afs_call *, const void *, size_t); extern int afs_extract_data(struct afs_call *, void *, size_t, bool); +extern int afs_protocol_error(struct afs_call *, int); static inline int afs_transfer_reply(struct afs_call *call) { @@ -955,7 +1027,6 @@ extern int afs_writepage(struct page *, struct writeback_control *); extern int afs_writepages(struct address_space *, struct writeback_control *); extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *); extern ssize_t afs_file_write(struct kiocb *, struct iov_iter *); -extern int afs_flush(struct file *, fl_owner_t); extern int afs_fsync(struct file *, loff_t, loff_t, int); extern int afs_page_mkwrite(struct vm_fault *); extern void afs_prune_wb_keys(struct afs_vnode *); diff --git a/fs/afs/main.c b/fs/afs/main.c index 15a02a05ff40..d7560168b3bf 100644 --- a/fs/afs/main.c +++ 
b/fs/afs/main.c @@ -34,11 +34,42 @@ MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list"); struct workqueue_struct *afs_wq; struct afs_net __afs_net; +#if defined(CONFIG_ALPHA) +const char afs_init_sysname[] = "alpha_linux26"; +#elif defined(CONFIG_X86_64) +const char afs_init_sysname[] = "amd64_linux26"; +#elif defined(CONFIG_ARM) +const char afs_init_sysname[] = "arm_linux26"; +#elif defined(CONFIG_ARM64) +const char afs_init_sysname[] = "aarch64_linux26"; +#elif defined(CONFIG_X86_32) +const char afs_init_sysname[] = "i386_linux26"; +#elif defined(CONFIG_IA64) +const char afs_init_sysname[] = "ia64_linux26"; +#elif defined(CONFIG_PPC64) +const char afs_init_sysname[] = "ppc64_linux26"; +#elif defined(CONFIG_PPC32) +const char afs_init_sysname[] = "ppc_linux26"; +#elif defined(CONFIG_S390) +#ifdef CONFIG_64BIT +const char afs_init_sysname[] = "s390x_linux26"; +#else +const char afs_init_sysname[] = "s390_linux26"; +#endif +#elif defined(CONFIG_SPARC64) +const char afs_init_sysname[] = "sparc64_linux26"; +#elif defined(CONFIG_SPARC32) +const char afs_init_sysname[] = "sparc_linux26"; +#else +const char afs_init_sysname[] = "unknown_linux26"; +#endif + /* * Initialise an AFS network namespace record. 
*/ static int __net_init afs_net_init(struct afs_net *net) { + struct afs_sysnames *sysnames; int ret; net->live = true; @@ -67,6 +98,16 @@ static int __net_init afs_net_init(struct afs_net *net) INIT_WORK(&net->fs_manager, afs_manage_servers); timer_setup(&net->fs_timer, afs_servers_timer, 0); + ret = -ENOMEM; + sysnames = kzalloc(sizeof(*sysnames), GFP_KERNEL); + if (!sysnames) + goto error_sysnames; + sysnames->subs[0] = (char *)&afs_init_sysname; + sysnames->nr = 1; + refcount_set(&sysnames->usage, 1); + net->sysnames = sysnames; + rwlock_init(&net->sysnames_lock); + /* Register the /proc stuff */ ret = afs_proc_init(net); if (ret < 0) @@ -92,6 +133,8 @@ error_cell_init: net->live = false; afs_proc_cleanup(net); error_proc: + afs_put_sysnames(net->sysnames); +error_sysnames: net->live = false; return ret; } @@ -106,6 +149,7 @@ static void __net_exit afs_net_exit(struct afs_net *net) afs_purge_servers(net); afs_close_socket(net); afs_proc_cleanup(net); + afs_put_sysnames(net->sysnames); } /* diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 4508dd54f789..839a22280606 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -126,6 +126,34 @@ static const struct file_operations afs_proc_servers_fops = { .release = seq_release, }; +static int afs_proc_sysname_open(struct inode *inode, struct file *file); +static int afs_proc_sysname_release(struct inode *inode, struct file *file); +static void *afs_proc_sysname_start(struct seq_file *p, loff_t *pos); +static void *afs_proc_sysname_next(struct seq_file *p, void *v, + loff_t *pos); +static void afs_proc_sysname_stop(struct seq_file *p, void *v); +static int afs_proc_sysname_show(struct seq_file *m, void *v); +static ssize_t afs_proc_sysname_write(struct file *file, + const char __user *buf, + size_t size, loff_t *_pos); + +static const struct seq_operations afs_proc_sysname_ops = { + .start = afs_proc_sysname_start, + .next = afs_proc_sysname_next, + .stop = afs_proc_sysname_stop, + .show = afs_proc_sysname_show, +}; + 
+static const struct file_operations afs_proc_sysname_fops = { + .open = afs_proc_sysname_open, + .read = seq_read, + .llseek = seq_lseek, + .release = afs_proc_sysname_release, + .write = afs_proc_sysname_write, +}; + +static const struct file_operations afs_proc_stats_fops; + /* * initialise the /proc/fs/afs/ directory */ @@ -139,7 +167,9 @@ int afs_proc_init(struct afs_net *net) if (!proc_create("cells", 0644, net->proc_afs, &afs_proc_cells_fops) || !proc_create("rootcell", 0644, net->proc_afs, &afs_proc_rootcell_fops) || - !proc_create("servers", 0644, net->proc_afs, &afs_proc_servers_fops)) + !proc_create("servers", 0644, net->proc_afs, &afs_proc_servers_fops) || + !proc_create("stats", 0644, net->proc_afs, &afs_proc_stats_fops) || + !proc_create("sysname", 0644, net->proc_afs, &afs_proc_sysname_fops)) goto error_tree; _leave(" = 0"); @@ -183,6 +213,7 @@ static int afs_proc_cells_open(struct inode *inode, struct file *file) * first item */ static void *afs_proc_cells_start(struct seq_file *m, loff_t *_pos) + __acquires(rcu) { struct afs_net *net = afs_seq2net(m); @@ -204,6 +235,7 @@ static void *afs_proc_cells_next(struct seq_file *m, void *v, loff_t *pos) * clean up after reading from the cells list */ static void afs_proc_cells_stop(struct seq_file *m, void *v) + __releases(rcu) { rcu_read_unlock(); } @@ -282,7 +314,8 @@ static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf, goto done; } - set_bit(AFS_CELL_FL_NO_GC, &cell->flags); + if (test_and_set_bit(AFS_CELL_FL_NO_GC, &cell->flags)) + afs_put_cell(net, cell); printk("kAFS: Added new cell '%s'\n", name); } else { goto inval; @@ -304,7 +337,40 @@ inval: static ssize_t afs_proc_rootcell_read(struct file *file, char __user *buf, size_t size, loff_t *_pos) { - return 0; + struct afs_cell *cell; + struct afs_net *net = afs_proc2net(file); + unsigned int seq = 0; + char name[AFS_MAXCELLNAME + 1]; + int len; + + if (*_pos > 0) + return 0; + if (!net->ws_cell) + return 0; + + 
rcu_read_lock(); + do { + read_seqbegin_or_lock(&net->cells_lock, &seq); + len = 0; + cell = rcu_dereference_raw(net->ws_cell); + if (cell) { + len = cell->name_len; + memcpy(name, cell->name, len); + } + } while (need_seqretry(&net->cells_lock, seq)); + done_seqretry(&net->cells_lock, seq); + rcu_read_unlock(); + + if (!len) + return 0; + + name[len++] = '\n'; + if (len > size) + len = size; + if (copy_to_user(buf, name, len) != 0) + return -EFAULT; + *_pos = 1; + return len; } /* @@ -327,6 +393,12 @@ static ssize_t afs_proc_rootcell_write(struct file *file, if (IS_ERR(kbuf)) return PTR_ERR(kbuf); + ret = -EINVAL; + if (kbuf[0] == '.') + goto out; + if (memchr(kbuf, '/', size)) + goto out; + /* trim to first NL */ s = memchr(kbuf, '\n', size); if (s) @@ -339,6 +411,7 @@ static ssize_t afs_proc_rootcell_write(struct file *file, if (ret >= 0) ret = size; /* consume everything, always */ +out: kfree(kbuf); _leave(" = %d", ret); return ret; @@ -413,6 +486,7 @@ static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file) * first item */ static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos) + __acquires(cell->proc_lock) { struct afs_cell *cell = m->private; @@ -438,6 +512,7 @@ static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v, * clean up after reading from the cells list */ static void afs_proc_cell_volumes_stop(struct seq_file *p, void *v) + __releases(cell->proc_lock) { struct afs_cell *cell = p->private; @@ -500,6 +575,7 @@ static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file) * first item */ static void *afs_proc_cell_vlservers_start(struct seq_file *m, loff_t *_pos) + __acquires(rcu) { struct afs_addr_list *alist; struct afs_cell *cell = m->private; @@ -544,6 +620,7 @@ static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v, * clean up after reading from the cells list */ static void afs_proc_cell_vlservers_stop(struct seq_file *p, void *v) + __releases(rcu) { 
rcu_read_unlock(); } @@ -580,6 +657,7 @@ static int afs_proc_servers_open(struct inode *inode, struct file *file) * first item. */ static void *afs_proc_servers_start(struct seq_file *m, loff_t *_pos) + __acquires(rcu) { struct afs_net *net = afs_seq2net(m); @@ -601,6 +679,7 @@ static void *afs_proc_servers_next(struct seq_file *m, void *v, loff_t *_pos) * clean up after reading from the cells list */ static void afs_proc_servers_stop(struct seq_file *p, void *v) + __releases(rcu) { rcu_read_unlock(); } @@ -626,3 +705,244 @@ static int afs_proc_servers_show(struct seq_file *m, void *v) &alist->addrs[alist->index].transport); return 0; } + +void afs_put_sysnames(struct afs_sysnames *sysnames) +{ + int i; + + if (sysnames && refcount_dec_and_test(&sysnames->usage)) { + for (i = 0; i < sysnames->nr; i++) + if (sysnames->subs[i] != afs_init_sysname && + sysnames->subs[i] != sysnames->blank) + kfree(sysnames->subs[i]); + } +} + +/* + * Handle opening of /proc/fs/afs/sysname. If it is opened for writing, we + * assume the caller wants to change the substitution list and we allocate a + * buffer to hold the list. + */ +static int afs_proc_sysname_open(struct inode *inode, struct file *file) +{ + struct afs_sysnames *sysnames; + struct seq_file *m; + int ret; + + ret = seq_open(file, &afs_proc_sysname_ops); + if (ret < 0) + return ret; + + if (file->f_mode & FMODE_WRITE) { + sysnames = kzalloc(sizeof(*sysnames), GFP_KERNEL); + if (!sysnames) { + seq_release(inode, file); + return -ENOMEM; + } + + refcount_set(&sysnames->usage, 1); + m = file->private_data; + m->private = sysnames; + } + + return 0; +} + +/* + * Handle writes to /proc/fs/afs/sysname to set the @sys substitution. 
+ */ +static ssize_t afs_proc_sysname_write(struct file *file, + const char __user *buf, + size_t size, loff_t *_pos) +{ + struct afs_sysnames *sysnames; + struct seq_file *m = file->private_data; + char *kbuf = NULL, *s, *p, *sub; + int ret, len; + + sysnames = m->private; + if (!sysnames) + return -EINVAL; + if (sysnames->error) + return sysnames->error; + + if (size >= PAGE_SIZE - 1) { + sysnames->error = -EINVAL; + return -EINVAL; + } + if (size == 0) + return 0; + + kbuf = memdup_user_nul(buf, size); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + inode_lock(file_inode(file)); + + p = kbuf; + while ((s = strsep(&p, " \t\n"))) { + len = strlen(s); + if (len == 0) + continue; + ret = -ENAMETOOLONG; + if (len >= AFSNAMEMAX) + goto error; + + if (len >= 4 && + s[len - 4] == '@' && + s[len - 3] == 's' && + s[len - 2] == 'y' && + s[len - 1] == 's') + /* Protect against recursion */ + goto invalid; + + if (s[0] == '.' && + (len < 2 || (len == 2 && s[1] == '.'))) + goto invalid; + + if (memchr(s, '/', len)) + goto invalid; + + ret = -EFBIG; + if (sysnames->nr >= AFS_NR_SYSNAME) + goto out; + + if (strcmp(s, afs_init_sysname) == 0) { + sub = (char *)afs_init_sysname; + } else { + ret = -ENOMEM; + sub = kmemdup(s, len + 1, GFP_KERNEL); + if (!sub) + goto out; + } + + sysnames->subs[sysnames->nr] = sub; + sysnames->nr++; + } + + ret = size; /* consume everything, always */ +out: + inode_unlock(file_inode(file)); + kfree(kbuf); + return ret; + +invalid: + ret = -EINVAL; +error: + sysnames->error = ret; + goto out; +} + +static int afs_proc_sysname_release(struct inode *inode, struct file *file) +{ + struct afs_sysnames *sysnames, *kill = NULL; + struct seq_file *m = file->private_data; + struct afs_net *net = afs_seq2net(m); + + sysnames = m->private; + if (sysnames) { + if (!sysnames->error) { + kill = sysnames; + if (sysnames->nr == 0) { + sysnames->subs[0] = sysnames->blank; + sysnames->nr++; + } + write_lock(&net->sysnames_lock); + kill = net->sysnames; + 
net->sysnames = sysnames; + write_unlock(&net->sysnames_lock); + } + afs_put_sysnames(kill); + } + + return seq_release(inode, file); +} + +static void *afs_proc_sysname_start(struct seq_file *m, loff_t *pos) + __acquires(&net->sysnames_lock) +{ + struct afs_net *net = afs_seq2net(m); + struct afs_sysnames *names = net->sysnames; + + read_lock(&net->sysnames_lock); + + if (*pos >= names->nr) + return NULL; + return (void *)(unsigned long)(*pos + 1); +} + +static void *afs_proc_sysname_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct afs_net *net = afs_seq2net(m); + struct afs_sysnames *names = net->sysnames; + + *pos += 1; + if (*pos >= names->nr) + return NULL; + return (void *)(unsigned long)(*pos + 1); +} + +static void afs_proc_sysname_stop(struct seq_file *m, void *v) + __releases(&net->sysnames_lock) +{ + struct afs_net *net = afs_seq2net(m); + + read_unlock(&net->sysnames_lock); +} + +static int afs_proc_sysname_show(struct seq_file *m, void *v) +{ + struct afs_net *net = afs_seq2net(m); + struct afs_sysnames *sysnames = net->sysnames; + unsigned int i = (unsigned long)v - 1; + + if (i < sysnames->nr) + seq_printf(m, "%s\n", sysnames->subs[i]); + return 0; +} + +/* + * Display general per-net namespace statistics + */ +static int afs_proc_stats_show(struct seq_file *m, void *v) +{ + struct afs_net *net = afs_seq2net(m); + + seq_puts(m, "kAFS statistics\n"); + + seq_printf(m, "dir-mgmt: look=%u reval=%u inval=%u relpg=%u\n", + atomic_read(&net->n_lookup), + atomic_read(&net->n_reval), + atomic_read(&net->n_inval), + atomic_read(&net->n_relpg)); + + seq_printf(m, "dir-data: rdpg=%u\n", + atomic_read(&net->n_read_dir)); + + seq_printf(m, "dir-edit: cr=%u rm=%u\n", + atomic_read(&net->n_dir_cr), + atomic_read(&net->n_dir_rm)); + + seq_printf(m, "file-rd : n=%u nb=%lu\n", + atomic_read(&net->n_fetches), + atomic_long_read(&net->n_fetch_bytes)); + seq_printf(m, "file-wr : n=%u nb=%lu\n", + atomic_read(&net->n_stores), + 
atomic_long_read(&net->n_store_bytes)); + return 0; +} + +/* + * Open "/proc/fs/afs/stats" to allow reading of the stat counters. + */ +static int afs_proc_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, afs_proc_stats_show, NULL); +} + +static const struct file_operations afs_proc_stats_fops = { + .open = afs_proc_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c index ad1328d85526..ac0feac9d746 100644 --- a/fs/afs/rotate.c +++ b/fs/afs/rotate.c @@ -21,7 +21,7 @@ /* * Initialise a filesystem server cursor for iterating over FS servers. */ -void afs_init_fs_cursor(struct afs_fs_cursor *fc, struct afs_vnode *vnode) +static void afs_init_fs_cursor(struct afs_fs_cursor *fc, struct afs_vnode *vnode) { memset(fc, 0, sizeof(*fc)); } diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index f7ae54b6a393..5c6263972ec9 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -926,3 +926,12 @@ int afs_extract_data(struct afs_call *call, void *buf, size_t count, afs_set_call_complete(call, ret, remote_abort); return ret; } + +/* + * Log protocol error production. + */ +noinline int afs_protocol_error(struct afs_call *call, int error) +{ + trace_afs_protocol_error(call, error, __builtin_return_address(0)); + return error; +} diff --git a/fs/afs/security.c b/fs/afs/security.c index b88b7d45fdaa..cea2fff313dc 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c @@ -178,18 +178,14 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key, } } - if (cb_break != (vnode->cb_break + vnode->cb_interest->server->cb_s_break)) { - rcu_read_unlock(); + if (cb_break != (vnode->cb_break + vnode->cb_interest->server->cb_s_break)) goto someone_else_changed_it; - } /* We need a ref on any permits list we want to copy as we'll have to * drop the lock to do memory allocation. 
*/ - if (permits && !refcount_inc_not_zero(&permits->usage)) { - rcu_read_unlock(); + if (permits && !refcount_inc_not_zero(&permits->usage)) goto someone_else_changed_it; - } rcu_read_unlock(); @@ -278,6 +274,7 @@ someone_else_changed_it: /* Someone else changed the cache under us - don't recheck at this * time. */ + rcu_read_unlock(); return; } @@ -296,8 +293,6 @@ int afs_check_permit(struct afs_vnode *vnode, struct key *key, _enter("{%x:%u},%x", vnode->fid.vid, vnode->fid.vnode, key_serial(key)); - permits = vnode->permit_cache; - /* check the permits to see if we've got one yet */ if (key == vnode->volume->cell->anonymous_key) { _debug("anon"); @@ -327,7 +322,7 @@ int afs_check_permit(struct afs_vnode *vnode, struct key *key, */ _debug("no valid permit"); - ret = afs_fetch_status(vnode, key); + ret = afs_fetch_status(vnode, key, false); if (ret < 0) { *_access = 0; _leave(" = %d", ret); diff --git a/fs/afs/server.c b/fs/afs/server.c index a43ef77dabae..e23be63998a8 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -59,7 +59,8 @@ struct afs_server *afs_find_server(struct afs_net *net, alist = rcu_dereference(server->addresses); for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) { b = &alist->addrs[i].transport.sin6; - diff = (u16)a->sin6_port - (u16)b->sin6_port; + diff = ((u16 __force)a->sin6_port - + (u16 __force)b->sin6_port); if (diff == 0) diff = memcmp(&a->sin6_addr, &b->sin6_addr, @@ -79,10 +80,11 @@ struct afs_server *afs_find_server(struct afs_net *net, alist = rcu_dereference(server->addresses); for (i = 0; i < alist->nr_ipv4; i++) { b = &alist->addrs[i].transport.sin6; - diff = (u16)a->sin6_port - (u16)b->sin6_port; + diff = ((u16 __force)a->sin6_port - + (u16 __force)b->sin6_port); if (diff == 0) - diff = ((u32)a->sin6_addr.s6_addr32[3] - - (u32)b->sin6_addr.s6_addr32[3]); + diff = ((u32 __force)a->sin6_addr.s6_addr32[3] - + (u32 __force)b->sin6_addr.s6_addr32[3]); if (diff == 0) goto found; if (diff < 0) { @@ -381,7 +383,7 @@ static void 
afs_server_rcu(struct rcu_head *rcu) { struct afs_server *server = container_of(rcu, struct afs_server, rcu); - afs_put_addrlist(server->addresses); + afs_put_addrlist(rcu_access_pointer(server->addresses)); kfree(server); } @@ -390,7 +392,7 @@ static void afs_server_rcu(struct rcu_head *rcu) */ static void afs_destroy_server(struct afs_net *net, struct afs_server *server) { - struct afs_addr_list *alist = server->addresses; + struct afs_addr_list *alist = rcu_access_pointer(server->addresses); struct afs_addr_cursor ac = { .alist = alist, .addr = &alist->addrs[0], diff --git a/fs/afs/super.c b/fs/afs/super.c index 3623c952b6ff..65081ec3c36e 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -154,7 +154,7 @@ static int afs_show_devname(struct seq_file *m, struct dentry *root) seq_puts(m, "none"); return 0; } - + switch (volume->type) { case AFSVL_RWVOL: break; @@ -269,7 +269,7 @@ static int afs_parse_device_name(struct afs_mount_params *params, int cellnamesz; _enter(",%s", name); - + if (!name) { printk(KERN_ERR "kAFS: no volume name specified\n"); return -EINVAL; @@ -418,7 +418,10 @@ static int afs_fill_super(struct super_block *sb, if (!sb->s_root) goto error; - sb->s_d_op = &afs_fs_dentry_operations; + if (params->dyn_root) + sb->s_d_op = &afs_dynroot_dentry_operations; + else + sb->s_d_op = &afs_fs_dentry_operations; _leave(" = 0"); return 0; @@ -676,7 +679,7 @@ static int afs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bfree = 0; return 0; } - + key = afs_request_key(vnode->volume->cell); if (IS_ERR(key)) return PTR_ERR(key); diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c index 5d8562f1ad4a..1ed7e2fd2f35 100644 --- a/fs/afs/vlclient.c +++ b/fs/afs/vlclient.c @@ -303,7 +303,7 @@ struct afs_addr_list *afs_vl_get_addrs_u(struct afs_net *net, r->uuid.clock_seq_hi_and_reserved = htonl(u->clock_seq_hi_and_reserved); r->uuid.clock_seq_low = htonl(u->clock_seq_low); for (i = 0; i < 6; i++) - r->uuid.node[i] = ntohl(u->node[i]); + r->uuid.node[i] = 
htonl(u->node[i]); trace_afs_make_vl_call(call); return (struct afs_addr_list *)afs_make_call(ac, call, GFP_KERNEL, false); @@ -450,7 +450,7 @@ again: call->count2 = ntohl(*bp); /* Type or next count */ if (call->count > YFS_MAXENDPOINTS) - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); alist = afs_alloc_addrlist(call->count, FS_SERVICE, AFS_FS_PORT); if (!alist) @@ -474,7 +474,7 @@ again: size = sizeof(__be32) * (1 + 4 + 1); break; default: - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); } size += sizeof(__be32); @@ -487,24 +487,24 @@ again: switch (call->count2) { case YFS_ENDPOINT_IPV4: if (ntohl(bp[0]) != sizeof(__be32) * 2) - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); afs_merge_fs_addr4(alist, bp[1], ntohl(bp[2])); bp += 3; break; case YFS_ENDPOINT_IPV6: if (ntohl(bp[0]) != sizeof(__be32) * 5) - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); afs_merge_fs_addr6(alist, bp + 1, ntohl(bp[5])); bp += 6; break; default: - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); } /* Got either the type of the next entry or the count of * volEndpoints if no more fsEndpoints. 
*/ - call->count2 = htonl(*bp++); + call->count2 = ntohl(*bp++); call->offset = 0; call->count--; @@ -517,7 +517,7 @@ again: if (!call->count) goto end; if (call->count > YFS_MAXENDPOINTS) - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); call->unmarshall = 3; @@ -531,7 +531,7 @@ again: return ret; bp = call->buffer; - call->count2 = htonl(*bp++); + call->count2 = ntohl(*bp++); call->offset = 0; call->unmarshall = 4; @@ -545,7 +545,7 @@ again: size = sizeof(__be32) * (1 + 4 + 1); break; default: - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); } if (call->count > 1) @@ -558,16 +558,16 @@ again: switch (call->count2) { case YFS_ENDPOINT_IPV4: if (ntohl(bp[0]) != sizeof(__be32) * 2) - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); bp += 3; break; case YFS_ENDPOINT_IPV6: if (ntohl(bp[0]) != sizeof(__be32) * 5) - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); bp += 6; break; default: - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); } /* Got either the type of the next entry or the count of @@ -576,7 +576,7 @@ again: call->offset = 0; call->count--; if (call->count > 0) { - call->count2 = htonl(*bp++); + call->count2 = ntohl(*bp++); goto again; } diff --git a/fs/afs/write.c b/fs/afs/write.c index 9370e2feb999..c164698dc304 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -42,10 +42,11 @@ static int afs_fill_page(struct afs_vnode *vnode, struct key *key, if (!req) return -ENOMEM; - atomic_set(&req->usage, 1); + refcount_set(&req->usage, 1); req->pos = pos; req->len = len; req->nr_pages = 1; + req->pages = req->array; req->pages[0] = page; get_page(page); @@ -124,7 +125,12 @@ try_again: page->index, priv); goto flush_conflicting_write; } - if (to < f || from > t) + /* If the file is being filled locally, allow inter-write + * spaces to be merged into writes. If it's not, only write + * back what the user gives us. 
+ */ + if (!test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags) && + (to < f || from > t)) goto flush_conflicting_write; if (from < f) f = from; @@ -355,6 +361,12 @@ found_key: } switch (ret) { + case 0: + afs_stat_v(vnode, n_stores); + atomic_long_add((last * PAGE_SIZE + to) - + (first * PAGE_SIZE + offset), + &afs_v2net(vnode)->n_store_bytes); + break; case -EACCES: case -EPERM: case -ENOKEY: @@ -412,7 +424,8 @@ static int afs_write_back_from_locked_page(struct address_space *mapping, trace_afs_page_dirty(vnode, tracepoint_string("WARN"), primary_page->index, priv); - if (start >= final_page || to < PAGE_SIZE) + if (start >= final_page || + (to < PAGE_SIZE && !test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags))) goto no_more; start++; @@ -433,9 +446,10 @@ static int afs_write_back_from_locked_page(struct address_space *mapping, } for (loop = 0; loop < n; loop++) { - if (to != PAGE_SIZE) - break; page = pages[loop]; + if (to != PAGE_SIZE && + !test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags)) + break; if (page->index > final_page) break; if (!trylock_page(page)) @@ -448,7 +462,8 @@ static int afs_write_back_from_locked_page(struct address_space *mapping, priv = page_private(page); f = priv & AFS_PRIV_MAX; t = priv >> AFS_PRIV_SHIFT; - if (f != 0) { + if (f != 0 && + !test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags)) { unlock_page(page); break; } @@ -570,10 +585,11 @@ static int afs_writepages_region(struct address_space *mapping, _debug("wback %lx", page->index); - /* at this point we hold neither mapping->tree_lock nor lock on - * the page itself: the page may be truncated or invalidated - * (changing page->mapping to NULL), or even swizzled back from - * swapper_space to tmpfs file mapping + /* + * at this point we hold neither the i_pages lock nor the + * page lock: the page may be truncated or invalidated + * (changing page->mapping to NULL), or even swizzled + * back from swapper_space to tmpfs file mapping */ ret = lock_page_killable(page); if (ret < 0) { @@ -734,20 
+750,6 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) } /* - * Flush out all outstanding writes on a file opened for writing when it is - * closed. - */ -int afs_flush(struct file *file, fl_owner_t id) -{ - _enter(""); - - if ((file->f_mode & FMODE_WRITE) == 0) - return 0; - - return vfs_fsync(file, 0); -} - -/* * notification that a previously read-only page is about to become writable * - if it returns an error, the caller will deliver a bus error signal */ diff --git a/fs/afs/xdr_fs.h b/fs/afs/xdr_fs.h new file mode 100644 index 000000000000..aa21f3068d52 --- /dev/null +++ b/fs/afs/xdr_fs.h @@ -0,0 +1,103 @@ +/* AFS fileserver XDR types + * + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. + * Written by David Howells ([email protected]) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. 
+ */ + +#ifndef XDR_FS_H +#define XDR_FS_H + +struct afs_xdr_AFSFetchStatus { + __be32 if_version; +#define AFS_FSTATUS_VERSION 1 + __be32 type; + __be32 nlink; + __be32 size_lo; + __be32 data_version_lo; + __be32 author; + __be32 owner; + __be32 caller_access; + __be32 anon_access; + __be32 mode; + __be32 parent_vnode; + __be32 parent_unique; + __be32 seg_size; + __be32 mtime_client; + __be32 mtime_server; + __be32 group; + __be32 sync_counter; + __be32 data_version_hi; + __be32 lock_count; + __be32 size_hi; + __be32 abort_code; +} __packed; + +#define AFS_DIR_HASHTBL_SIZE 128 +#define AFS_DIR_DIRENT_SIZE 32 +#define AFS_DIR_SLOTS_PER_BLOCK 64 +#define AFS_DIR_BLOCK_SIZE 2048 +#define AFS_DIR_BLOCKS_PER_PAGE (PAGE_SIZE / AFS_DIR_BLOCK_SIZE) +#define AFS_DIR_MAX_SLOTS 65536 +#define AFS_DIR_BLOCKS_WITH_CTR 128 +#define AFS_DIR_MAX_BLOCKS 1023 +#define AFS_DIR_RESV_BLOCKS 1 +#define AFS_DIR_RESV_BLOCKS0 13 + +/* + * Directory entry structure. + */ +union afs_xdr_dirent { + struct { + u8 valid; + u8 unused[1]; + __be16 hash_next; + __be32 vnode; + __be32 unique; + u8 name[16]; + u8 overflow[4]; /* if any char of the name (inc + * NUL) reaches here, consume + * the next dirent too */ + } u; + u8 extended_name[32]; +} __packed; + +/* + * Directory block header (one at the beginning of every 2048-byte block). + */ +struct afs_xdr_dir_hdr { + __be16 npages; + __be16 magic; +#define AFS_DIR_MAGIC htons(1234) + u8 reserved; + u8 bitmap[8]; + u8 pad[19]; +} __packed; + +/* + * Directory block layout + */ +union afs_xdr_dir_block { + struct afs_xdr_dir_hdr hdr; + + struct { + struct afs_xdr_dir_hdr hdr; + u8 alloc_ctrs[AFS_DIR_MAX_BLOCKS]; + __be16 hashtable[AFS_DIR_HASHTBL_SIZE]; + } meta; + + union afs_xdr_dirent dirents[AFS_DIR_SLOTS_PER_BLOCK]; +} __packed; + +/* + * Directory layout on a linux VM page. 
+ */ +struct afs_xdr_dir_page { + union afs_xdr_dir_block blocks[AFS_DIR_BLOCKS_PER_PAGE]; +}; + +#endif /* XDR_FS_H */ diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index a0c57c37fa21..be9c3dc048ab 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -19,9 +19,6 @@ */ static autofs_wqt_t autofs4_next_wait_queue = 1; -/* These are the signals we allow interrupting a pending mount */ -#define SHUTDOWN_SIGS (sigmask(SIGKILL) | sigmask(SIGINT) | sigmask(SIGQUIT)) - void autofs4_catatonic_mode(struct autofs_sb_info *sbi) { struct autofs_wait_queue *wq, *nwq; @@ -486,29 +483,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, * wq->name.name is NULL iff the lock is already released * or the mount has been made catatonic. */ - if (wq->name.name) { - /* Block all but "shutdown" signals while waiting */ - unsigned long shutdown_sigs_mask; - unsigned long irqflags; - sigset_t oldset; - - spin_lock_irqsave(&current->sighand->siglock, irqflags); - oldset = current->blocked; - shutdown_sigs_mask = SHUTDOWN_SIGS & ~oldset.sig[0]; - siginitsetinv(&current->blocked, shutdown_sigs_mask); - recalc_sigpending(); - spin_unlock_irqrestore(&current->sighand->siglock, irqflags); - - wait_event_interruptible(wq->queue, wq->name.name == NULL); - - spin_lock_irqsave(&current->sighand->siglock, irqflags); - current->blocked = oldset; - recalc_sigpending(); - spin_unlock_irqrestore(&current->sighand->siglock, irqflags); - } else { - pr_debug("skipped sleeping\n"); - } - + wait_event_killable(wq->queue, wq->name.name == NULL); status = wq->status; /* @@ -574,7 +549,7 @@ int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_tok kfree(wq->name.name); wq->name.name = NULL; /* Do not wait on this queue */ wq->status = status; - wake_up_interruptible(&wq->queue); + wake_up(&wq->queue); if (!--wq->wait_ctr) kfree(wq); mutex_unlock(&sbi->wq_mutex); diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index ce1824f47ba6..c3deb2e35f20 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -330,6
+330,7 @@ beyond_if: #ifdef __alpha__ regs->gp = ex.a_gpvalue; #endif + finalize_exec(bprm); start_thread(regs, ex.a_entry, current->mm->start_stack); return 0; } diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index bdb201230bae..41e04183e4ce 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -377,6 +377,11 @@ static unsigned long elf_map(struct file *filep, unsigned long addr, } else map_addr = vm_mmap(filep, addr, size, prot, type, off); + if ((type & MAP_FIXED_NOREPLACE) && BAD_ADDR(map_addr)) + pr_info("%d (%s): Uhuuh, elf segment at %p requested but the memory is mapped already\n", + task_pid_nr(current), current->comm, + (void *)addr); + return(map_addr); } @@ -575,7 +580,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, elf_prot |= PROT_EXEC; vaddr = eppnt->p_vaddr; if (interp_elf_ex->e_type == ET_EXEC || load_addr_set) - elf_type |= MAP_FIXED; + elf_type |= MAP_FIXED_NOREPLACE; else if (no_base && interp_elf_ex->e_type == ET_DYN) load_addr = -vaddr; @@ -890,7 +895,7 @@ static int load_elf_binary(struct linux_binprm *bprm) the correct location in memory. */ for(i = 0, elf_ppnt = elf_phdata; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { - int elf_prot = 0, elf_flags; + int elf_prot = 0, elf_flags, elf_fixed = MAP_FIXED_NOREPLACE; unsigned long k, vaddr; unsigned long total_size = 0; @@ -922,6 +927,13 @@ static int load_elf_binary(struct linux_binprm *bprm) */ } } + + /* + * Some binaries have overlapping elf segments and then + * we have to forcefully map over an existing mapping + * e.g. over this newly established brk mapping. + */ + elf_fixed = MAP_FIXED; } if (elf_ppnt->p_flags & PF_R) @@ -939,7 +951,7 @@ static int load_elf_binary(struct linux_binprm *bprm) * the ET_DYN load_addr calculations, proceed normally. 
*/ if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) { - elf_flags |= MAP_FIXED; + elf_flags |= elf_fixed; } else if (loc->elf_ex.e_type == ET_DYN) { /* * This logic is run once for the first LOAD Program @@ -975,7 +987,7 @@ static int load_elf_binary(struct linux_binprm *bprm) load_bias = ELF_ET_DYN_BASE; if (current->flags & PF_RANDOMIZE) load_bias += arch_mmap_rnd(); - elf_flags |= MAP_FIXED; + elf_flags |= elf_fixed; } else load_bias = 0; @@ -1155,6 +1167,7 @@ static int load_elf_binary(struct linux_binprm *bprm) ELF_PLAT_INIT(regs, reloc_func_desc); #endif + finalize_exec(bprm); start_thread(regs, elf_entry, bprm->p); retval = 0; out: @@ -1234,7 +1247,7 @@ static int load_elf_library(struct file *file) (eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr)), PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE, + MAP_FIXED_NOREPLACE | MAP_PRIVATE | MAP_DENYWRITE, (eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr))); if (error != ELF_PAGESTART(eppnt->p_vaddr)) diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 429326b6e2e7..d90993adeffa 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -463,6 +463,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) dynaddr); #endif + finalize_exec(bprm); /* everything is now ready... 
get the userspace context ready to roll */ entryaddr = interp_params.entry_addr ?: exec_params.entry_addr; start_thread(regs, entryaddr, current->mm->start_stack); diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 5d6b94475f27..82a48e830018 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -994,6 +994,7 @@ static int load_flat_binary(struct linux_binprm *bprm) FLAT_PLAT_INIT(regs); #endif + finalize_exec(bprm); pr_debug("start_thread(regs=0x%p, entry=0x%lx, start_stack=0x%lx)\n", regs, start_addr, current->mm->start_stack); start_thread(regs, start_addr, current->mm->start_stack); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 562c3e633403..578181cd96b5 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -458,7 +458,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, break; rcu_read_lock(); - page = radix_tree_lookup(&mapping->page_tree, pg_index); + page = radix_tree_lookup(&mapping->i_pages, pg_index); rcu_read_unlock(); if (page && !radix_tree_exceptional_entry(page)) { misses++; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 47a8fe9d22e8..cf87976e389d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3963,11 +3963,11 @@ retry: done_index = page->index; /* - * At this point we hold neither mapping->tree_lock nor - * lock on the page itself: the page may be truncated or - * invalidated (changing page->mapping to NULL), or even - * swizzled back from swapper_space to tmpfs file - * mapping + * At this point we hold neither the i_pages lock nor + * the page lock: the page may be truncated or + * invalidated (changing page->mapping to NULL), + * or even swizzled back from swapper_space to + * tmpfs file mapping */ if (!trylock_page(page)) { flush_write_bio(epd); @@ -5174,13 +5174,13 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb) WARN_ON(!PagePrivate(page)); clear_page_dirty_for_io(page); - spin_lock_irq(&page->mapping->tree_lock); + 
xa_lock_irq(&page->mapping->i_pages); if (!PageDirty(page)) { - radix_tree_tag_clear(&page->mapping->page_tree, + radix_tree_tag_clear(&page->mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); } - spin_unlock_irq(&page->mapping->tree_lock); + xa_unlock_irq(&page->mapping->i_pages); ClearPageError(page); unlock_page(page); } diff --git a/fs/buffer.c b/fs/buffer.c index ec5dd39071e6..249b83fafe48 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -185,10 +185,9 @@ EXPORT_SYMBOL(end_buffer_write_sync); * we get exclusion from try_to_free_buffers with the blockdev mapping's * private_lock. * - * Hack idea: for the blockdev mapping, i_bufferlist_lock contention + * Hack idea: for the blockdev mapping, private_lock contention * may be quite high. This code could TryLock the page, and if that - * succeeds, there is no need to take private_lock. (But if - * private_lock is contended then so is mapping->tree_lock). + * succeeds, there is no need to take private_lock. */ static struct buffer_head * __find_get_block_slow(struct block_device *bdev, sector_t block) @@ -495,35 +494,12 @@ repeat: return err; } -static void do_thaw_one(struct super_block *sb, void *unused) +void emergency_thaw_bdev(struct super_block *sb) { while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb)) printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev); } -static void do_thaw_all(struct work_struct *work) -{ - iterate_supers(do_thaw_one, NULL); - kfree(work); - printk(KERN_WARNING "Emergency Thaw complete\n"); -} - -/** - * emergency_thaw_all -- forcibly thaw every frozen filesystem - * - * Used for emergency unfreeze of all filesystems via SysRq - */ -void emergency_thaw_all(void) -{ - struct work_struct *work; - - work = kmalloc(sizeof(*work), GFP_ATOMIC); - if (work) { - INIT_WORK(work, do_thaw_all); - schedule_work(work); - } -} - /** * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers * @mapping: the mapping which wants those buffers written @@ -594,20 +570,21 @@ 
EXPORT_SYMBOL(mark_buffer_dirty_inode); * * The caller must hold lock_page_memcg(). */ -static void __set_page_dirty(struct page *page, struct address_space *mapping, +void __set_page_dirty(struct page *page, struct address_space *mapping, int warn) { unsigned long flags; - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !PageUptodate(page)); account_page_dirtied(page, mapping); - radix_tree_tag_set(&mapping->page_tree, + radix_tree_tag_set(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); } - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); } +EXPORT_SYMBOL_GPL(__set_page_dirty); /* * Add a page to the dirty page list. @@ -1095,7 +1072,7 @@ __getblk_slow(struct block_device *bdev, sector_t block, * inode list. * * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, - * mapping->tree_lock and mapping->host->i_lock. + * i_pages lock and mapping->host->i_lock. 
*/ void mark_buffer_dirty(struct buffer_head *bh) { diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 7cee97b93a61..4bcd4e838b47 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1987,11 +1987,10 @@ wdata_prepare_pages(struct cifs_writedata *wdata, unsigned int found_pages, for (i = 0; i < found_pages; i++) { page = wdata->pages[i]; /* - * At this point we hold neither mapping->tree_lock nor - * lock on the page itself: the page may be truncated or - * invalidated (changing page->mapping to NULL), or even - * swizzled back from swapper_space to tmpfs file - * mapping + * At this point we hold neither the i_pages lock nor the + * page lock: the page may be truncated or invalidated + * (changing page->mapping to NULL), or even swizzled + * back from swapper_space to tmpfs file mapping */ if (nr_pages == 0) @@ -158,11 +158,9 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mo } /* - * We do not necessarily hold the mapping->tree_lock when we call this - * function so it is possible that 'entry' is no longer a valid item in the - * radix tree. This is okay because all we really need to do is to find the - * correct waitqueue where tasks might be waiting for that old 'entry' and - * wake them. + * @entry may no longer be the entry at the index in the mapping. + * The important information it's conveying is whether the entry at + * this index used to be a PMD entry. */ static void dax_wake_mapping_entry_waiter(struct address_space *mapping, pgoff_t index, void *entry, bool wake_all) @@ -174,7 +172,7 @@ static void dax_wake_mapping_entry_waiter(struct address_space *mapping, /* * Checking for locked entry and prepare_to_wait_exclusive() happens - * under mapping->tree_lock, ditto for entry handling in our callers. + * under the i_pages lock, ditto for entry handling in our callers. * So at this point all tasks that could have seen our entry locked * must be in the waitqueue and the following check will see them. 
*/ @@ -183,41 +181,39 @@ static void dax_wake_mapping_entry_waiter(struct address_space *mapping, } /* - * Check whether the given slot is locked. The function must be called with - * mapping->tree_lock held + * Check whether the given slot is locked. Must be called with the i_pages + * lock held. */ static inline int slot_locked(struct address_space *mapping, void **slot) { unsigned long entry = (unsigned long) - radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); return entry & RADIX_DAX_ENTRY_LOCK; } /* - * Mark the given slot is locked. The function must be called with - * mapping->tree_lock held + * Mark the given slot as locked. Must be called with the i_pages lock held. */ static inline void *lock_slot(struct address_space *mapping, void **slot) { unsigned long entry = (unsigned long) - radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); entry |= RADIX_DAX_ENTRY_LOCK; - radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry); + radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry); return (void *)entry; } /* - * Mark the given slot is unlocked. The function must be called with - * mapping->tree_lock held + * Mark the given slot as unlocked. Must be called with the i_pages lock held. 
*/ static inline void *unlock_slot(struct address_space *mapping, void **slot) { unsigned long entry = (unsigned long) - radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK; - radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry); + radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry); return (void *)entry; } @@ -228,7 +224,7 @@ static inline void *unlock_slot(struct address_space *mapping, void **slot) * put_locked_mapping_entry() when he locked the entry and now wants to * unlock it. * - * The function must be called with mapping->tree_lock held. + * Must be called with the i_pages lock held. */ static void *get_unlocked_mapping_entry(struct address_space *mapping, pgoff_t index, void ***slotp) @@ -241,7 +237,7 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping, ewait.wait.func = wake_exceptional_entry_func; for (;;) { - entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, + entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot); if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) || @@ -254,10 +250,10 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping, wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key); prepare_to_wait_exclusive(wq, &ewait.wait, TASK_UNINTERRUPTIBLE); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); schedule(); finish_wait(wq, &ewait.wait); - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); } } @@ -266,15 +262,15 @@ static void dax_unlock_mapping_entry(struct address_space *mapping, { void *entry, **slot; - spin_lock_irq(&mapping->tree_lock); - entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); + xa_lock_irq(&mapping->i_pages); + entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot); if (WARN_ON_ONCE(!entry || 
!radix_tree_exceptional_entry(entry) || !slot_locked(mapping, slot))) { - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return; } unlock_slot(mapping, slot); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); dax_wake_mapping_entry_waiter(mapping, index, entry, false); } @@ -388,7 +384,7 @@ static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index, void *entry, **slot; restart: - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); entry = get_unlocked_mapping_entry(mapping, index, &slot); if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) { @@ -420,12 +416,12 @@ restart: if (pmd_downgrade) { /* * Make sure 'entry' remains valid while we drop - * mapping->tree_lock. + * the i_pages lock. */ entry = lock_slot(mapping, slot); } - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); /* * Besides huge zero pages the only other thing that gets * downgraded are empty entries which don't need to be @@ -442,27 +438,27 @@ restart: put_locked_mapping_entry(mapping, index); return ERR_PTR(err); } - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); if (!entry) { /* - * We needed to drop the page_tree lock while calling + * We needed to drop the i_pages lock while calling * radix_tree_preload() and we didn't have an entry to * lock. See if another thread inserted an entry at * our index during this time. 
*/ - entry = __radix_tree_lookup(&mapping->page_tree, index, + entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot); if (entry) { radix_tree_preload_end(); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); goto restart; } } if (pmd_downgrade) { dax_disassociate_entry(entry, mapping, false); - radix_tree_delete(&mapping->page_tree, index); + radix_tree_delete(&mapping->i_pages, index); mapping->nrexceptional--; dax_wake_mapping_entry_waiter(mapping, index, entry, true); @@ -470,11 +466,11 @@ restart: entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY); - err = __radix_tree_insert(&mapping->page_tree, index, + err = __radix_tree_insert(&mapping->i_pages, index, dax_radix_order(entry), entry); radix_tree_preload_end(); if (err) { - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); /* * Our insertion of a DAX entry failed, most likely * because we were inserting a PMD entry and it @@ -487,12 +483,12 @@ restart: } /* Good, we have inserted empty locked entry into the tree. 
*/ mapping->nrexceptional++; - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return entry; } entry = lock_slot(mapping, slot); out_unlock: - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return entry; } @@ -501,23 +497,23 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping, { int ret = 0; void *entry; - struct radix_tree_root *page_tree = &mapping->page_tree; + struct radix_tree_root *pages = &mapping->i_pages; - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(pages); entry = get_unlocked_mapping_entry(mapping, index, NULL); if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry))) goto out; if (!trunc && - (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) || - radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))) + (radix_tree_tag_get(pages, index, PAGECACHE_TAG_DIRTY) || + radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE))) goto out; dax_disassociate_entry(entry, mapping, trunc); - radix_tree_delete(page_tree, index); + radix_tree_delete(pages, index); mapping->nrexceptional--; ret = 1; out: put_unlocked_mapping_entry(mapping, index, entry); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(pages); return ret; } /* @@ -587,7 +583,7 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, void *entry, pfn_t pfn_t, unsigned long flags, bool dirty) { - struct radix_tree_root *page_tree = &mapping->page_tree; + struct radix_tree_root *pages = &mapping->i_pages; unsigned long pfn = pfn_t_to_pfn(pfn_t); pgoff_t index = vmf->pgoff; void *new_entry; @@ -604,7 +600,7 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, unmap_mapping_pages(mapping, vmf->pgoff, 1, false); } - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(pages); new_entry = dax_radix_locked_entry(pfn, flags); if (dax_entry_size(entry) != dax_entry_size(new_entry)) { dax_disassociate_entry(entry, mapping, false); @@ -624,17 +620,17 @@ static void 
*dax_insert_mapping_entry(struct address_space *mapping, void **slot; void *ret; - ret = __radix_tree_lookup(page_tree, index, &node, &slot); + ret = __radix_tree_lookup(pages, index, &node, &slot); WARN_ON_ONCE(ret != entry); - __radix_tree_replace(page_tree, node, slot, + __radix_tree_replace(pages, node, slot, new_entry, NULL); entry = new_entry; } if (dirty) - radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); + radix_tree_tag_set(pages, index, PAGECACHE_TAG_DIRTY); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(pages); return entry; } @@ -723,7 +719,7 @@ unlock_pte: static int dax_writeback_one(struct dax_device *dax_dev, struct address_space *mapping, pgoff_t index, void *entry) { - struct radix_tree_root *page_tree = &mapping->page_tree; + struct radix_tree_root *pages = &mapping->i_pages; void *entry2, **slot; unsigned long pfn; long ret = 0; @@ -736,7 +732,7 @@ static int dax_writeback_one(struct dax_device *dax_dev, if (WARN_ON(!radix_tree_exceptional_entry(entry))) return -EIO; - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(pages); entry2 = get_unlocked_mapping_entry(mapping, index, &slot); /* Entry got punched out / reallocated? */ if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2))) @@ -755,7 +751,7 @@ static int dax_writeback_one(struct dax_device *dax_dev, } /* Another fsync thread may have already written back this entry */ - if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) + if (!radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE)) goto put_unlocked; /* Lock the entry to serialize with page faults */ entry = lock_slot(mapping, slot); @@ -763,11 +759,11 @@ static int dax_writeback_one(struct dax_device *dax_dev, * We can clear the tag now but we have to be careful so that concurrent * dax_writeback_one() calls for the same index cannot finish before we * actually flush the caches. 
This is achieved as the calls will look - * at the entry only under tree_lock and once they do that they will - * see the entry locked and wait for it to unlock. + * at the entry only under the i_pages lock and once they do that + * they will see the entry locked and wait for it to unlock. */ - radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE); - spin_unlock_irq(&mapping->tree_lock); + radix_tree_tag_clear(pages, index, PAGECACHE_TAG_TOWRITE); + xa_unlock_irq(pages); /* * Even if dax_writeback_mapping_range() was given a wbc->range_start @@ -787,16 +783,16 @@ static int dax_writeback_one(struct dax_device *dax_dev, * the pfn mappings are writeprotected and fault waits for mapping * entry lock. */ - spin_lock_irq(&mapping->tree_lock); - radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY); - spin_unlock_irq(&mapping->tree_lock); + xa_lock_irq(pages); + radix_tree_tag_clear(pages, index, PAGECACHE_TAG_DIRTY); + xa_unlock_irq(pages); trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT); put_locked_mapping_entry(mapping, index); return ret; put_unlocked: put_unlocked_mapping_entry(mapping, index, entry2); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(pages); return ret; } @@ -1566,21 +1562,21 @@ static int dax_insert_pfn_mkwrite(struct vm_fault *vmf, pgoff_t index = vmf->pgoff; int vmf_ret, error; - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); entry = get_unlocked_mapping_entry(mapping, index, &slot); /* Did we race with someone splitting entry or so? 
*/ if (!entry || (pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) || (pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) { put_unlocked_mapping_entry(mapping, index, entry); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf, VM_FAULT_NOPAGE); return VM_FAULT_NOPAGE; } - radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); + radix_tree_tag_set(&mapping->i_pages, index, PAGECACHE_TAG_DIRTY); entry = lock_slot(mapping, slot); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); switch (pe_size) { case PE_SIZE_PTE: error = vm_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); diff --git a/fs/dcache.c b/fs/dcache.c index 593079176123..86d2de63461e 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -257,11 +257,25 @@ static void __d_free(struct rcu_head *head) kmem_cache_free(dentry_cache, dentry); } +static void __d_free_external_name(struct rcu_head *head) +{ + struct external_name *name = container_of(head, struct external_name, + u.head); + + mod_node_page_state(page_pgdat(virt_to_page(name)), + NR_INDIRECTLY_RECLAIMABLE_BYTES, + -ksize(name)); + + kfree(name); +} + static void __d_free_external(struct rcu_head *head) { struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); - kfree(external_name(dentry)); - kmem_cache_free(dentry_cache, dentry); + + __d_free_external_name(&external_name(dentry)->u.head); + + kmem_cache_free(dentry_cache, dentry); } static inline int dname_external(const struct dentry *dentry) @@ -291,7 +305,7 @@ void release_dentry_name_snapshot(struct name_snapshot *name) struct external_name *p; p = container_of(name->name, struct external_name, name[0]); if (unlikely(atomic_dec_and_test(&p->u.count))) - kfree_rcu(p, u.head); + call_rcu(&p->u.head, __d_free_external_name); } } EXPORT_SYMBOL(release_dentry_name_snapshot); @@ -1038,6 +1052,8 @@ static void shrink_dentry_list(struct list_head *list) while 
(!list_empty(list)) { struct dentry *dentry, *parent; + cond_resched(); + dentry = list_entry(list->prev, struct dentry, d_lru); spin_lock(&dentry->d_lock); rcu_read_lock(); @@ -1191,7 +1207,6 @@ void shrink_dcache_sb(struct super_block *sb) this_cpu_sub(nr_dentry_unused, freed); shrink_dentry_list(&dispose); - cond_resched(); } while (list_lru_count(&sb->s_dentry_lru) > 0); } EXPORT_SYMBOL(shrink_dcache_sb); @@ -1473,7 +1488,6 @@ void shrink_dcache_parent(struct dentry *parent) break; shrink_dentry_list(&data.dispose); - cond_resched(); } } EXPORT_SYMBOL(shrink_dcache_parent); @@ -1600,7 +1614,6 @@ void d_invalidate(struct dentry *dentry) detach_mounts(data.mountpoint); dput(data.mountpoint); } - cond_resched(); } } EXPORT_SYMBOL(d_invalidate); @@ -1617,6 +1630,7 @@ EXPORT_SYMBOL(d_invalidate); struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) { + struct external_name *ext = NULL; struct dentry *dentry; char *dname; int err; @@ -1637,14 +1651,14 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) dname = dentry->d_iname; } else if (name->len > DNAME_INLINE_LEN-1) { size_t size = offsetof(struct external_name, name[1]); - struct external_name *p = kmalloc(size + name->len, - GFP_KERNEL_ACCOUNT); - if (!p) { + + ext = kmalloc(size + name->len, GFP_KERNEL_ACCOUNT); + if (!ext) { kmem_cache_free(dentry_cache, dentry); return NULL; } - atomic_set(&p->u.count, 1); - dname = p->name; + atomic_set(&ext->u.count, 1); + dname = ext->name; } else { dname = dentry->d_iname; } @@ -1683,6 +1697,12 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) } } + if (unlikely(ext)) { + pg_data_t *pgdat = page_pgdat(virt_to_page(ext)); + mod_node_page_state(pgdat, NR_INDIRECTLY_RECLAIMABLE_BYTES, + ksize(ext)); + } + this_cpu_inc(nr_dentry); return dentry; @@ -2770,7 +2790,7 @@ static void copy_name(struct dentry *dentry, struct dentry *target) dentry->d_name.hash_len = target->d_name.hash_len; } if (old_name && 
likely(atomic_dec_and_test(&old_name->u.count))) - kfree_rcu(old_name, u.head); + call_rcu(&old_name->u.head, __d_free_external_name); } /* diff --git a/fs/exec.c b/fs/exec.c index a919a827d181..183059c427b9 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -257,7 +257,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, * to work from. */ limit = _STK_LIM / 4 * 3; - limit = min(limit, rlimit(RLIMIT_STACK) / 4); + limit = min(limit, bprm->rlim_stack.rlim_cur / 4); if (size > limit) goto fail; } @@ -411,6 +411,11 @@ static int bprm_mm_init(struct linux_binprm *bprm) if (!mm) goto err; + /* Save current stack limit for all calculations made during exec. */ + task_lock(current->group_leader); + bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK]; + task_unlock(current->group_leader); + err = __bprm_mm_init(bprm); if (err) goto err; @@ -697,7 +702,7 @@ int setup_arg_pages(struct linux_binprm *bprm, #ifdef CONFIG_STACK_GROWSUP /* Limit stack size */ - stack_base = rlimit_max(RLIMIT_STACK); + stack_base = bprm->rlim_stack.rlim_max; if (stack_base > STACK_SIZE_MAX) stack_base = STACK_SIZE_MAX; @@ -770,7 +775,7 @@ int setup_arg_pages(struct linux_binprm *bprm, * Align this down to a page boundary as expand_stack * will align it up. */ - rlim_stack = rlimit(RLIMIT_STACK) & PAGE_MASK; + rlim_stack = bprm->rlim_stack.rlim_cur & PAGE_MASK; #ifdef CONFIG_STACK_GROWSUP if (stack_size + stack_expand > rlim_stack) stack_base = vma->vm_start + rlim_stack; @@ -1341,11 +1346,11 @@ void setup_new_exec(struct linux_binprm * bprm) * RLIMIT_STACK, but after the point of no return to avoid * needing to clean up the change on failure. 
*/ - if (current->signal->rlim[RLIMIT_STACK].rlim_cur > _STK_LIM) - current->signal->rlim[RLIMIT_STACK].rlim_cur = _STK_LIM; + if (bprm->rlim_stack.rlim_cur > _STK_LIM) + bprm->rlim_stack.rlim_cur = _STK_LIM; } - arch_pick_mmap_layout(current->mm); + arch_pick_mmap_layout(current->mm, &bprm->rlim_stack); current->sas_ss_sp = current->sas_ss_size = 0; @@ -1378,6 +1383,16 @@ void setup_new_exec(struct linux_binprm * bprm) } EXPORT_SYMBOL(setup_new_exec); +/* Runs immediately before start_thread() takes over. */ +void finalize_exec(struct linux_binprm *bprm) +{ + /* Store any stack rlimit changes before starting thread. */ + task_lock(current->group_leader); + current->signal->rlim[RLIMIT_STACK] = bprm->rlim_stack; + task_unlock(current->group_leader); +} +EXPORT_SYMBOL(finalize_exec); + /* * Prepare credentials and lock ->cred_guard_mutex. * install_exec_creds() commits the new creds and drops the lock. diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index db50686f5096..02237d4d91f5 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2424,12 +2424,12 @@ void f2fs_set_page_dirty_nobuffers(struct page *page) SetPageDirty(page); spin_unlock(&mapping->private_lock); - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); WARN_ON_ONCE(!PageUptodate(page)); account_page_dirtied(page, mapping); - radix_tree_tag_set(&mapping->page_tree, + radix_tree_tag_set(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); unlock_page_memcg(page); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index fe661274ff10..8c9c2f31b253 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -732,10 +732,10 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, if (bit_pos == NR_DENTRY_IN_BLOCK && !truncate_hole(dir, page->index, page->index + 1)) { - spin_lock_irqsave(&mapping->tree_lock, 
flags); - radix_tree_tag_clear(&mapping->page_tree, page_index(page), + xa_lock_irqsave(&mapping->i_pages, flags); + radix_tree_tag_clear(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); clear_page_dirty_for_io(page); ClearPagePrivate(page); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index bfb7a4a3a929..9327411fd93b 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1015,7 +1015,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, unsigned int init_segno = segno; struct gc_inode_list gc_list = { .ilist = LIST_HEAD_INIT(gc_list.ilist), - .iroot = RADIX_TREE_INIT(GFP_NOFS), + .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; trace_f2fs_gc_begin(sbi->sb, sync, background, diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 3b77d6421218..265da200daa8 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -226,10 +226,10 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) kunmap_atomic(src_addr); set_page_dirty(dn.inode_page); - spin_lock_irqsave(&mapping->tree_lock, flags); - radix_tree_tag_clear(&mapping->page_tree, page_index(page), + xa_lock_irqsave(&mapping->i_pages, flags); + radix_tree_tag_clear(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); set_inode_flag(inode, FI_APPEND_WRITE); set_inode_flag(inode, FI_DATA_EXIST); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 9a99243054ba..f202398e20ea 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -91,11 +91,11 @@ static void clear_node_page_dirty(struct page *page) unsigned int long flags; if (PageDirty(page)) { - spin_lock_irqsave(&mapping->tree_lock, flags); - radix_tree_tag_clear(&mapping->page_tree, + xa_lock_irqsave(&mapping->i_pages, flags); + radix_tree_tag_clear(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); - 
spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); clear_page_dirty_for_io(page); dec_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES); @@ -1161,7 +1161,7 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) f2fs_bug_on(sbi, check_nid_range(sbi, nid)); rcu_read_lock(); - apage = radix_tree_lookup(&NODE_MAPPING(sbi)->page_tree, nid); + apage = radix_tree_lookup(&NODE_MAPPING(sbi)->i_pages, nid); rcu_read_unlock(); if (apage) return; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 1280f915079b..4b12ba70a895 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -347,9 +347,9 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) * By the time control reaches here, RCU grace period has passed * since I_WB_SWITCH assertion and all wb stat update transactions * between unlocked_inode_to_wb_begin/end() are guaranteed to be - * synchronizing against mapping->tree_lock. + * synchronizing against the i_pages lock. * - * Grabbing old_wb->list_lock, inode->i_lock and mapping->tree_lock + * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock * gives us exclusion against all wb related operations on @inode * including IO list manipulations and stat updates. */ @@ -361,7 +361,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING); } spin_lock(&inode->i_lock); - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); /* * Once I_FREEING is visible under i_lock, the eviction path owns @@ -373,22 +373,22 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) /* * Count and transfer stats. Note that PAGECACHE_TAG_DIRTY points * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to - * pages actually under underwriteback. + * pages actually under writeback. 
*/ - radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0, + radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, 0, PAGECACHE_TAG_DIRTY) { struct page *page = radix_tree_deref_slot_protected(slot, - &mapping->tree_lock); + &mapping->i_pages.xa_lock); if (likely(page) && PageDirty(page)) { dec_wb_stat(old_wb, WB_RECLAIMABLE); inc_wb_stat(new_wb, WB_RECLAIMABLE); } } - radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0, + radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, 0, PAGECACHE_TAG_WRITEBACK) { struct page *page = radix_tree_deref_slot_protected(slot, - &mapping->tree_lock); + &mapping->i_pages.xa_lock); if (likely(page)) { WARN_ON_ONCE(!PageWriteback(page)); dec_wb_stat(old_wb, WB_WRITEBACK); @@ -430,7 +430,7 @@ skip_switch: */ smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); spin_unlock(&inode->i_lock); spin_unlock(&new_wb->list_lock); spin_unlock(&old_wb->list_lock); @@ -506,8 +506,8 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) /* * In addition to synchronizing among switchers, I_WB_SWITCH tells - * the RCU protected stat update paths to grab the mapping's - * tree_lock so that stat transfer can synchronize against them. + * the RCU protected stat update paths to grab the i_page + * lock so that stat transfer can synchronize against them. * Let's continue after I_WB_SWITCH is guaranteed to be visible. 
*/ call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn); diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 7dc55b93a830..97137d7ec5ee 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -832,7 +832,7 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, /* Clear pointers back to the netfs */ cookie->netfs_data = NULL; cookie->def = NULL; - BUG_ON(cookie->stores.rnode); + BUG_ON(!radix_tree_empty(&cookie->stores)); if (cookie->parent) { ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0); diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 1085ca12e25c..20e0d0a4dc8c 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -973,7 +973,7 @@ static const struct fscache_state *_fscache_invalidate_object(struct fscache_obj * retire the object instead. */ if (!fscache_use_cookie(object)) { - ASSERT(object->cookie->stores.rnode == NULL); + ASSERT(radix_tree_empty(&object->cookie->stores)); set_bit(FSCACHE_OBJECT_RETIRED, &object->flags); _leave(" [no cookie]"); return transit_to(KILL_OBJECT); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 685c305cbeb6..278ed0869c3c 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1744,7 +1744,7 @@ do_grow_qunlock: * @newsize: the size to make the file * * The file size can grow, shrink, or stay the same size. This - * is called holding i_mutex and an exclusive glock on the inode + * is called holding i_rwsem and an exclusive glock on the inode * in question. 
* * Returns: errno diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 82fb5583445c..097bd3c0f270 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1923,28 +1923,37 @@ void gfs2_glock_exit(void) static void gfs2_glock_iter_next(struct gfs2_glock_iter *gi, loff_t n) { - if (n == 0) - gi->gl = rhashtable_walk_peek(&gi->hti); - else { - gi->gl = rhashtable_walk_next(&gi->hti); - n--; + struct gfs2_glock *gl = gi->gl; + + if (gl) { + if (n == 0) + return; + if (!lockref_put_not_zero(&gl->gl_lockref)) + gfs2_glock_queue_put(gl); } for (;;) { - if (IS_ERR_OR_NULL(gi->gl)) { - if (!gi->gl) - return; - if (PTR_ERR(gi->gl) != -EAGAIN) { - gi->gl = NULL; - return; + gl = rhashtable_walk_next(&gi->hti); + if (IS_ERR_OR_NULL(gl)) { + if (gl == ERR_PTR(-EAGAIN)) { + n = 1; + continue; } - n = 0; - } else if (gi->sdp == gi->gl->gl_name.ln_sbd && - !__lockref_is_dead(&gi->gl->gl_lockref)) { - if (!n--) - break; + gl = NULL; + break; + } + if (gl->gl_name.ln_sbd != gi->sdp) + continue; + if (n <= 1) { + if (!lockref_get_not_dead(&gl->gl_lockref)) + continue; + break; + } else { + if (__lockref_is_dead(&gl->gl_lockref)) + continue; + n--; } - gi->gl = rhashtable_walk_next(&gi->hti); } + gi->gl = gl; } static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) @@ -1988,7 +1997,6 @@ static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr) { struct gfs2_glock_iter *gi = seq->private; - gi->gl = NULL; rhashtable_walk_stop(&gi->hti); } @@ -2076,7 +2084,8 @@ static int gfs2_glocks_release(struct inode *inode, struct file *file) struct seq_file *seq = file->private_data; struct gfs2_glock_iter *gi = seq->private; - gi->gl = NULL; + if (gi->gl) + gfs2_glock_put(gi->gl); rhashtable_walk_exit(&gi->hti); return seq_release_private(inode, file); } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index e6a0a8a89ea7..3ba3f167641c 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -825,7 +825,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int 
undo) goto fail_rindex; } /* - * i_mutex on quota files is special. Since this inode is hidden system + * i_rwsem on quota files is special. Since this inode is hidden system * file, we are safe to define locking ourselves. */ lockdep_set_class(&sdp->sd_quota_inode->i_rwsem, diff --git a/fs/inode.c b/fs/inode.c index b153aeaa61ea..13ceb98c3bd3 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -348,8 +348,7 @@ EXPORT_SYMBOL(inc_nlink); static void __address_space_init_once(struct address_space *mapping) { - INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC | __GFP_ACCOUNT); - spin_lock_init(&mapping->tree_lock); + INIT_RADIX_TREE(&mapping->i_pages, GFP_ATOMIC | __GFP_ACCOUNT); init_rwsem(&mapping->i_mmap_rwsem); INIT_LIST_HEAD(&mapping->private_list); spin_lock_init(&mapping->private_lock); @@ -504,14 +503,14 @@ EXPORT_SYMBOL(__remove_inode_hash); void clear_inode(struct inode *inode) { /* - * We have to cycle tree_lock here because reclaim can be still in the + * We have to cycle the i_pages lock here because reclaim can be in the * process of removing the last page (in __delete_from_page_cache()) - * and we must not free mapping under it. + * and we must not free the mapping under it. 
*/ - spin_lock_irq(&inode->i_data.tree_lock); + xa_lock_irq(&inode->i_data.i_pages); BUG_ON(inode->i_data.nrpages); BUG_ON(inode->i_data.nrexceptional); - spin_unlock_irq(&inode->i_data.tree_lock); + xa_unlock_irq(&inode->i_data.i_pages); BUG_ON(!list_empty(&inode->i_data.private_list)); BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(inode->i_state & I_CLEAR); diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 123c069429a7..a813979b5be0 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -535,35 +535,10 @@ static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char return 0; } -#define CB_SUPPORTED_ATTR0 (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) -#define CB_SUPPORTED_ATTR1 (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY) -static __be32 encode_attr_bitmap(struct xdr_stream *xdr, const uint32_t *bitmap, __be32 **savep) +static __be32 encode_attr_bitmap(struct xdr_stream *xdr, const uint32_t *bitmap, size_t sz) { - __be32 bm[2]; - __be32 *p; - - bm[0] = htonl(bitmap[0] & CB_SUPPORTED_ATTR0); - bm[1] = htonl(bitmap[1] & CB_SUPPORTED_ATTR1); - if (bm[1] != 0) { - p = xdr_reserve_space(xdr, 16); - if (unlikely(p == NULL)) - return htonl(NFS4ERR_RESOURCE); - *p++ = htonl(2); - *p++ = bm[0]; - *p++ = bm[1]; - } else if (bm[0] != 0) { - p = xdr_reserve_space(xdr, 12); - if (unlikely(p == NULL)) - return htonl(NFS4ERR_RESOURCE); - *p++ = htonl(1); - *p++ = bm[0]; - } else { - p = xdr_reserve_space(xdr, 8); - if (unlikely(p == NULL)) - return htonl(NFS4ERR_RESOURCE); - *p++ = htonl(0); - } - *savep = p; + if (xdr_stream_encode_uint32_array(xdr, bitmap, sz) < 0) + return cpu_to_be32(NFS4ERR_RESOURCE); return 0; } @@ -656,9 +631,13 @@ static __be32 encode_getattr_res(struct svc_rqst *rqstp, struct xdr_stream *xdr, if (unlikely(status != 0)) goto out; - status = encode_attr_bitmap(xdr, res->bitmap, &savep); + status = encode_attr_bitmap(xdr, res->bitmap, ARRAY_SIZE(res->bitmap)); if (unlikely(status != 0)) goto out; + status = 
cpu_to_be32(NFS4ERR_RESOURCE); + savep = xdr_reserve_space(xdr, sizeof(*savep)); + if (unlikely(!savep)) + goto out; status = encode_attr_change(xdr, res->bitmap, res->change_attr); if (unlikely(status != 0)) goto out; diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index d8b47624fee2..1819d0d0ba4b 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -19,6 +19,7 @@ #include <linux/nfs_xdr.h> #include "nfs4_fs.h" +#include "nfs4session.h" #include "delegation.h" #include "internal.h" #include "nfs4trace.h" @@ -171,11 +172,15 @@ again: * nfs_inode_reclaim_delegation - process a delegation reclaim request * @inode: inode to process * @cred: credential to use for request - * @res: new delegation state from server + * @type: delegation type + * @stateid: delegation stateid + * @pagemod_limit: write delegation "space_limit" * */ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, - struct nfs_openres *res) + fmode_t type, + const nfs4_stateid *stateid, + unsigned long pagemod_limit) { struct nfs_delegation *delegation; struct rpc_cred *oldcred = NULL; @@ -185,9 +190,9 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, if (delegation != NULL) { spin_lock(&delegation->lock); if (delegation->inode != NULL) { - nfs4_stateid_copy(&delegation->stateid, &res->delegation); - delegation->type = res->delegation_type; - delegation->pagemod_limit = res->pagemod_limit; + nfs4_stateid_copy(&delegation->stateid, stateid); + delegation->type = type; + delegation->pagemod_limit = pagemod_limit; oldcred = delegation->cred; delegation->cred = get_rpccred(cred); clear_bit(NFS_DELEGATION_NEED_RECLAIM, @@ -195,14 +200,14 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, spin_unlock(&delegation->lock); rcu_read_unlock(); put_rpccred(oldcred); - trace_nfs4_reclaim_delegation(inode, res->delegation_type); + trace_nfs4_reclaim_delegation(inode, type); return; } /* We appear to have raced with a 
delegation return. */ spin_unlock(&delegation->lock); } rcu_read_unlock(); - nfs_inode_set_delegation(inode, cred, res); + nfs_inode_set_delegation(inode, cred, type, stateid, pagemod_limit); } static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync) @@ -329,11 +334,16 @@ nfs_update_inplace_delegation(struct nfs_delegation *delegation, * nfs_inode_set_delegation - set up a delegation on an inode * @inode: inode to which delegation applies * @cred: cred to use for subsequent delegation processing - * @res: new delegation state from server + * @type: delegation type + * @stateid: delegation stateid + * @pagemod_limit: write delegation "space_limit" * * Returns zero on success, or a negative errno value. */ -int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res) +int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, + fmode_t type, + const nfs4_stateid *stateid, + unsigned long pagemod_limit) { struct nfs_server *server = NFS_SERVER(inode); struct nfs_client *clp = server->nfs_client; @@ -345,9 +355,9 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct delegation = kmalloc(sizeof(*delegation), GFP_NOFS); if (delegation == NULL) return -ENOMEM; - nfs4_stateid_copy(&delegation->stateid, &res->delegation); - delegation->type = res->delegation_type; - delegation->pagemod_limit = res->pagemod_limit; + nfs4_stateid_copy(&delegation->stateid, stateid); + delegation->type = type; + delegation->pagemod_limit = pagemod_limit; delegation->change_attr = inode_peek_iversion_raw(inode); delegation->cred = get_rpccred(cred); delegation->inode = inode; @@ -392,7 +402,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct rcu_assign_pointer(nfsi->delegation, delegation); delegation = NULL; - trace_nfs4_set_delegation(inode, res->delegation_type); + trace_nfs4_set_delegation(inode, type); out: 
spin_unlock(&clp->cl_lock); @@ -547,6 +557,22 @@ int nfs4_inode_return_delegation(struct inode *inode) return err; } +/** + * nfs4_inode_make_writeable + * @inode: pointer to inode + * + * Make the inode writeable by returning the delegation if necessary + * + * Returns zero on success, or a negative errno value. + */ +int nfs4_inode_make_writeable(struct inode *inode) +{ + if (!nfs4_has_session(NFS_SERVER(inode)->nfs_client) || + !nfs4_check_delegation(inode, FMODE_WRITE)) + return nfs4_inode_return_delegation(inode); + return 0; +} + static void nfs_mark_return_if_closed_delegation(struct nfs_server *server, struct nfs_delegation *delegation) { diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 185a09f37a89..bb1ef8c37af4 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -36,8 +36,10 @@ enum { NFS_DELEGATION_TEST_EXPIRED, }; -int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); -void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); +int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, + fmode_t type, const nfs4_stateid *stateid, unsigned long pagemod_limit); +void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, + fmode_t type, const nfs4_stateid *stateid, unsigned long pagemod_limit); int nfs4_inode_return_delegation(struct inode *inode); int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid); void nfs_inode_return_delegation_noreclaim(struct inode *inode); @@ -70,6 +72,7 @@ int nfs4_check_delegation(struct inode *inode, fmode_t flags); bool nfs4_delegation_flush_on_close(const struct inode *inode); void nfs_inode_find_delegation_state_and_recover(struct inode *inode, const nfs4_stateid *stateid); +int nfs4_inode_make_writeable(struct inode *inode); #endif diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 2f3f86726f5b..73f8b43d988c 100644 --- a/fs/nfs/dir.c +++ 
b/fs/nfs/dir.c @@ -1272,7 +1272,9 @@ static void nfs_drop_nlink(struct inode *inode) /* drop the inode if we're reasonably sure this is the last link */ if (inode->i_nlink == 1) clear_nlink(inode); - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR; + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_CHANGE + | NFS_INO_INVALID_CTIME + | NFS_INO_INVALID_OTHER; spin_unlock(&inode->i_lock); } @@ -1798,12 +1800,11 @@ static int nfs_safe_remove(struct dentry *dentry) trace_nfs_remove_enter(dir, dentry); if (inode != NULL) { - NFS_PROTO(inode)->return_delegation(inode); - error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); + error = NFS_PROTO(dir)->remove(dir, dentry); if (error == 0) nfs_drop_nlink(inode); } else - error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); + error = NFS_PROTO(dir)->remove(dir, dentry); if (error == -ENOENT) nfs_dentry_handle_enoent(dentry); trace_nfs_remove_exit(dir, dentry, error); @@ -1932,8 +1933,6 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) old_dentry, dentry); trace_nfs_link_enter(inode, dir, dentry); - NFS_PROTO(inode)->return_delegation(inode); - d_drop(dentry); error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); if (error == 0) { @@ -2023,10 +2022,6 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, } } - NFS_PROTO(old_inode)->return_delegation(old_inode); - if (new_inode != NULL) - NFS_PROTO(new_inode)->return_delegation(new_inode); - task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL); if (IS_ERR(task)) { error = PTR_ERR(task); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index d17a90c4fa37..bd15d0b57626 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -195,7 +195,10 @@ bool nfs_check_cache_invalid(struct inode *inode, unsigned long flags) static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) { struct nfs_inode *nfsi = NFS_I(inode); + bool have_delegation = nfs_have_delegated_attributes(inode); + if (have_delegation) + 
flags &= ~(NFS_INO_INVALID_CHANGE|NFS_INO_REVAL_PAGECACHE); if (inode->i_mapping->nrpages == 0) flags &= ~NFS_INO_INVALID_DATA; nfsi->cache_validity |= flags; @@ -447,7 +450,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st inode->i_mode = fattr->mode; if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0 && nfs_server_capable(inode, NFS_CAP_MODE)) - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); /* Why so? Because we want revalidate for devices/FIFOs, and * that's precisely what we have in nfs_file_inode_operations. */ @@ -493,37 +496,35 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st if (fattr->valid & NFS_ATTR_FATTR_ATIME) inode->i_atime = fattr->atime; else if (nfs_server_capable(inode, NFS_CAP_ATIME)) - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME); if (fattr->valid & NFS_ATTR_FATTR_MTIME) inode->i_mtime = fattr->mtime; else if (nfs_server_capable(inode, NFS_CAP_MTIME)) - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME); if (fattr->valid & NFS_ATTR_FATTR_CTIME) inode->i_ctime = fattr->ctime; else if (nfs_server_capable(inode, NFS_CAP_CTIME)) - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_CTIME); if (fattr->valid & NFS_ATTR_FATTR_CHANGE) inode_set_iversion_raw(inode, fattr->change_attr); else - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR - | NFS_INO_REVAL_PAGECACHE); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE); if (fattr->valid & NFS_ATTR_FATTR_SIZE) inode->i_size = nfs_size_to_loff_t(fattr->size); else - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR - | NFS_INO_REVAL_PAGECACHE); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_SIZE); if (fattr->valid & NFS_ATTR_FATTR_NLINK) set_nlink(inode, fattr->nlink); else if (nfs_server_capable(inode, 
NFS_CAP_NLINK)) - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); if (fattr->valid & NFS_ATTR_FATTR_OWNER) inode->i_uid = fattr->uid; else if (nfs_server_capable(inode, NFS_CAP_OWNER)) - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); if (fattr->valid & NFS_ATTR_FATTR_GROUP) inode->i_gid = fattr->gid; else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP)) - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) inode->i_blocks = fattr->du.nfs2.blocks; if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { @@ -608,11 +609,6 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) goto out; } - /* - * Return any delegations if we're going to change ACLs - */ - if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) - NFS_PROTO(inode)->return_delegation(inode); error = NFS_PROTO(inode)->setattr(dentry, fattr, attr); if (error == 0) error = nfs_refresh_inode(inode, fattr); @@ -645,6 +641,7 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset) /* Optimisation */ if (offset == 0) NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_DATA; + NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE; spin_unlock(&inode->i_lock); truncate_pagecache(inode, offset); @@ -657,6 +654,7 @@ out: * nfs_setattr_update_inode - Update inode metadata after a setattr call. * @inode: pointer to struct inode * @attr: pointer to struct iattr + * @fattr: pointer to struct nfs_fattr * * Note: we do this in the *proc.c in order to ensure that * it works for things like exclusive creates too. 
@@ -669,6 +667,8 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, spin_lock(&inode->i_lock); NFS_I(inode)->attr_gencount = fattr->gencount; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE + | NFS_INO_INVALID_CTIME); if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) { if ((attr->ia_valid & ATTR_MODE) != 0) { int mode = attr->ia_mode & S_IALLUGO; @@ -683,13 +683,12 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, | NFS_INO_INVALID_ACL); } if ((attr->ia_valid & ATTR_SIZE) != 0) { + nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME); nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC); nfs_vmtruncate(inode, attr->ia_size); } if (fattr->valid) nfs_update_inode(inode, fattr); - else - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR; spin_unlock(&inode->i_lock); } EXPORT_SYMBOL_GPL(nfs_setattr_update_inode); @@ -1303,24 +1302,20 @@ static bool nfs_file_has_buffered_writers(struct nfs_inode *nfsi) return nfs_file_has_writers(nfsi) && nfs_file_io_is_buffered(nfsi); } -static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) +static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) { - unsigned long ret = 0; - if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) && (fattr->valid & NFS_ATTR_FATTR_CHANGE) && inode_eq_iversion_raw(inode, fattr->pre_change_attr)) { inode_set_iversion_raw(inode, fattr->change_attr); if (S_ISDIR(inode->i_mode)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); - ret |= NFS_INO_INVALID_ATTR; } /* If we have atomic WCC data, we may update some attributes */ if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME) && (fattr->valid & NFS_ATTR_FATTR_CTIME) && timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) { memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); - ret |= NFS_INO_INVALID_ATTR; } if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME) @@ -1329,17 +1324,13 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr 
memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); if (S_ISDIR(inode->i_mode)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); - ret |= NFS_INO_INVALID_ATTR; } if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) && (fattr->valid & NFS_ATTR_FATTR_SIZE) && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) && !nfs_have_writebacks(inode)) { i_size_write(inode, nfs_size_to_loff_t(fattr->size)); - ret |= NFS_INO_INVALID_ATTR; } - - return ret; } /** @@ -1369,33 +1360,41 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if (!nfs_file_has_buffered_writers(nfsi)) { /* Verify a few of the more important attributes */ if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && !inode_eq_iversion_raw(inode, fattr->change_attr)) - invalid |= NFS_INO_INVALID_ATTR | NFS_INO_REVAL_PAGECACHE; + invalid |= NFS_INO_INVALID_CHANGE + | NFS_INO_REVAL_PAGECACHE; if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime)) - invalid |= NFS_INO_INVALID_ATTR; + invalid |= NFS_INO_INVALID_MTIME; if ((fattr->valid & NFS_ATTR_FATTR_CTIME) && !timespec_equal(&inode->i_ctime, &fattr->ctime)) - invalid |= NFS_INO_INVALID_ATTR; + invalid |= NFS_INO_INVALID_CTIME; if (fattr->valid & NFS_ATTR_FATTR_SIZE) { cur_size = i_size_read(inode); new_isize = nfs_size_to_loff_t(fattr->size); if (cur_size != new_isize) - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + invalid |= NFS_INO_INVALID_SIZE + | NFS_INO_REVAL_PAGECACHE; } } /* Have any file permissions changed? 
*/ if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) - invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; + invalid |= NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_INVALID_OTHER; if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && !uid_eq(inode->i_uid, fattr->uid)) - invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; + invalid |= NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_INVALID_OTHER; if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && !gid_eq(inode->i_gid, fattr->gid)) - invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; + invalid |= NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_INVALID_OTHER; /* Has the link count changed? */ if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink) - invalid |= NFS_INO_INVALID_ATTR; + invalid |= NFS_INO_INVALID_OTHER; if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec_equal(&inode->i_atime, &fattr->atime)) invalid |= NFS_INO_INVALID_ATIME; @@ -1597,10 +1596,9 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) } EXPORT_SYMBOL_GPL(nfs_refresh_inode); -static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr) +static int nfs_post_op_update_inode_locked(struct inode *inode, + struct nfs_fattr *fattr, unsigned int invalid) { - unsigned long invalid = NFS_INO_INVALID_ATTR; - if (S_ISDIR(inode->i_mode)) invalid |= NFS_INO_INVALID_DATA; nfs_set_cache_invalid(inode, invalid); @@ -1629,7 +1627,9 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) spin_lock(&inode->i_lock); nfs_fattr_set_barrier(fattr); - status = nfs_post_op_update_inode_locked(inode, fattr); + status = nfs_post_op_update_inode_locked(inode, fattr, + NFS_INO_INVALID_CHANGE + | NFS_INO_INVALID_CTIME); spin_unlock(&inode->i_lock); return status; @@ -1681,7 +1681,10 @@ int 
nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fa fattr->valid |= NFS_ATTR_FATTR_PRESIZE; } out_noforce: - status = nfs_post_op_update_inode_locked(inode, fattr); + status = nfs_post_op_update_inode_locked(inode, fattr, + NFS_INO_INVALID_CHANGE + | NFS_INO_INVALID_CTIME + | NFS_INO_INVALID_MTIME); return status; } @@ -1789,7 +1792,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) | NFS_INO_REVAL_PAGECACHE); /* Do atomic weak cache consistency updates */ - invalid |= nfs_wcc_update_inode(inode, fattr); + nfs_wcc_update_inode(inode, fattr); if (pnfs_layoutcommit_outstanding(inode)) { nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_ATTR; @@ -1803,17 +1806,25 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) inode->i_sb->s_id, inode->i_ino); /* Could it be a race with writeback? */ if (!have_writers) { - invalid |= NFS_INO_INVALID_ATTR + invalid |= NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_DATA | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; + /* Force revalidate of all attributes */ + save_cache_validity |= NFS_INO_INVALID_CTIME + | NFS_INO_INVALID_MTIME + | NFS_INO_INVALID_SIZE + | NFS_INO_INVALID_OTHER; if (S_ISDIR(inode->i_mode)) nfs_force_lookup_revalidate(inode); } inode_set_iversion_raw(inode, fattr->change_attr); } } else { - nfsi->cache_validity |= save_cache_validity; + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_CHANGE + | NFS_INO_REVAL_PAGECACHE + | NFS_INO_REVAL_FORCED); cache_revalidated = false; } @@ -1821,7 +1832,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); } else if (server->caps & NFS_CAP_MTIME) { nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_ATTR + (NFS_INO_INVALID_MTIME | NFS_INO_REVAL_FORCED); cache_revalidated = false; } @@ -1830,7 +1841,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) 
memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); } else if (server->caps & NFS_CAP_CTIME) { nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_ATTR + (NFS_INO_INVALID_CTIME | NFS_INO_REVAL_FORCED); cache_revalidated = false; } @@ -1845,7 +1856,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (!nfs_have_writebacks(inode) || new_isize > cur_isize) { i_size_write(inode, new_isize); if (!have_writers) - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + invalid |= NFS_INO_INVALID_DATA; } dprintk("NFS: isize change on server for file %s/%ld " "(%Ld to %Ld)\n", @@ -1856,7 +1867,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) } } else { nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_ATTR + (NFS_INO_INVALID_SIZE | NFS_INO_REVAL_PAGECACHE | NFS_INO_REVAL_FORCED); cache_revalidated = false; @@ -1877,55 +1888,61 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) umode_t newmode = inode->i_mode & S_IFMT; newmode |= fattr->mode & S_IALLUGO; inode->i_mode = newmode; - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + invalid |= NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_INVALID_OTHER; } } else if (server->caps & NFS_CAP_MODE) { nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_ACCESS + (NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL + | NFS_INO_INVALID_OTHER | NFS_INO_REVAL_FORCED); cache_revalidated = false; } if (fattr->valid & NFS_ATTR_FATTR_OWNER) { if (!uid_eq(inode->i_uid, fattr->uid)) { - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + invalid |= NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_INVALID_OTHER; inode->i_uid = fattr->uid; } } else if (server->caps & NFS_CAP_OWNER) { nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_ACCESS + (NFS_INO_INVALID_ACCESS | 
NFS_INO_INVALID_ACL + | NFS_INO_INVALID_OTHER | NFS_INO_REVAL_FORCED); cache_revalidated = false; } if (fattr->valid & NFS_ATTR_FATTR_GROUP) { if (!gid_eq(inode->i_gid, fattr->gid)) { - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + invalid |= NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_INVALID_OTHER; inode->i_gid = fattr->gid; } } else if (server->caps & NFS_CAP_OWNER_GROUP) { nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_ACCESS + (NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL + | NFS_INO_INVALID_OTHER | NFS_INO_REVAL_FORCED); cache_revalidated = false; } if (fattr->valid & NFS_ATTR_FATTR_NLINK) { if (inode->i_nlink != fattr->nlink) { - invalid |= NFS_INO_INVALID_ATTR; + invalid |= NFS_INO_INVALID_OTHER; if (S_ISDIR(inode->i_mode)) invalid |= NFS_INO_INVALID_DATA; set_nlink(inode, fattr->nlink); } } else if (server->caps & NFS_CAP_NLINK) { nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_ATTR + (NFS_INO_INVALID_OTHER | NFS_INO_REVAL_FORCED); cache_revalidated = false; } @@ -1942,6 +1959,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* Update attrtimeo value if we're out of the unstable period */ if (invalid & NFS_INO_INVALID_ATTR) { + invalid &= ~NFS_INO_INVALID_ATTR; nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; @@ -1962,10 +1980,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfsi->attr_gencount = fattr->gencount; } - /* Don't declare attrcache up to date if there were no attrs! 
*/ - if (cache_revalidated) - invalid &= ~NFS_INO_INVALID_ATTR; - /* Don't invalidate the data if we were to blame */ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 7327930ad970..eadf1ab31d16 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -138,8 +138,11 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, msg.rpc_cred = nfs_file_cred(sattr->ia_file); nfs_fattr_init(fattr); status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); - if (status == 0) + if (status == 0) { + if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL) + nfs_zap_acl_cache(inode); nfs_setattr_update_inode(inode, sattr, fattr); + } dprintk("NFS reply setattr: %d\n", status); return status; } @@ -383,11 +386,11 @@ out: } static int -nfs3_proc_remove(struct inode *dir, const struct qstr *name) +nfs3_proc_remove(struct inode *dir, struct dentry *dentry) { struct nfs_removeargs arg = { .fh = NFS_FH(dir), - .name = *name, + .name = dentry->d_name, }; struct nfs_removeres res; struct rpc_message msg = { @@ -397,7 +400,7 @@ nfs3_proc_remove(struct inode *dir, const struct qstr *name) }; int status = -ENOMEM; - dprintk("NFS call remove %s\n", name->name); + dprintk("NFS call remove %pd2\n", dentry); res.dir_attr = nfs_alloc_fattr(); if (res.dir_attr == NULL) goto out; @@ -411,7 +414,7 @@ out: } static void -nfs3_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) +nfs3_proc_unlink_setup(struct rpc_message *msg, struct dentry *dentry) { msg->rpc_proc = &nfs3_procedures[NFS3PROC_REMOVE]; } @@ -433,7 +436,9 @@ nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir) } static void -nfs3_proc_rename_setup(struct rpc_message *msg, struct inode *dir) +nfs3_proc_rename_setup(struct rpc_message *msg, + struct dentry *old_dentry, + struct dentry *new_dentry) { msg->rpc_proc = &nfs3_procedures[NFS3PROC_RENAME]; } @@ -908,12 +913,6 @@ static int nfs3_have_delegation(struct inode 
*inode, fmode_t flags) return 0; } -static int nfs3_return_delegation(struct inode *inode) -{ - nfs_wb_all(inode); - return 0; -} - static const struct inode_operations nfs3_dir_inode_operations = { .create = nfs_create, .lookup = nfs_lookup, @@ -990,7 +989,6 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .clear_acl_cache = forget_all_cached_acls, .close_context = nfs_close_context, .have_delegation = nfs3_have_delegation, - .return_delegation = nfs3_return_delegation, .alloc_client = nfs_alloc_client, .init_client = nfs_init_client, .free_client = nfs_free_client, diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 6cd33bd5da87..09ee36dd8426 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -1997,6 +1997,7 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_entry old = *entry; __be32 *p; int error; + u64 new_cookie; p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) @@ -2019,8 +2020,7 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, if (unlikely(error)) return error; - entry->prev_cookie = entry->cookie; - error = decode_cookie3(xdr, &entry->cookie); + error = decode_cookie3(xdr, &new_cookie); if (unlikely(error)) return error; @@ -2054,6 +2054,9 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, zero_nfs_fh3(entry->fh); } + entry->prev_cookie = entry->cookie; + entry->cookie = new_cookie; + return 0; out_overflow: diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 47f3c273245e..b71757e85066 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1045,7 +1045,9 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo, struct nfs_inode *nfsi = NFS_I(dir); spin_lock(&dir->i_lock); - nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + nfsi->cache_validity |= NFS_INO_INVALID_CTIME + | NFS_INO_INVALID_MTIME + | NFS_INO_INVALID_DATA; if (cinfo->atomic && cinfo->before == inode_peek_iversion_raw(dir)) { 
nfsi->cache_validity &= ~NFS_INO_REVAL_PAGECACHE; nfsi->attrtimeo_timestamp = jiffies; @@ -1669,6 +1671,7 @@ static void nfs4_return_incompatible_delegation(struct inode *inode, fmode_t fmo { struct nfs_delegation *delegation; + fmode &= FMODE_READ|FMODE_WRITE; rcu_read_lock(); delegation = rcu_dereference(NFS_I(inode)->delegation); if (delegation == NULL || (delegation->type & fmode) == fmode) { @@ -1751,12 +1754,16 @@ nfs4_opendata_check_deleg(struct nfs4_opendata *data, struct nfs4_state *state) } if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0) nfs_inode_set_delegation(state->inode, - data->owner->so_cred, - &data->o_res); + data->owner->so_cred, + data->o_res.delegation_type, + &data->o_res.delegation, + data->o_res.pagemod_limit); else nfs_inode_reclaim_delegation(state->inode, - data->owner->so_cred, - &data->o_res); + data->owner->so_cred, + data->o_res.delegation_type, + &data->o_res.delegation, + data->o_res.pagemod_limit); } /* @@ -2743,27 +2750,40 @@ static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st * fields corresponding to attributes that were used to store the verifier. * Make sure we clobber those fields in the later setattr call */ -static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, +static unsigned nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct iattr *sattr, struct nfs4_label **label) { - const u32 *attrset = opendata->o_res.attrset; + const __u32 *bitmask = opendata->o_arg.server->exclcreat_bitmask; + __u32 attrset[3]; + unsigned ret; + unsigned i; - if ((attrset[1] & FATTR4_WORD1_TIME_ACCESS) && - !(sattr->ia_valid & ATTR_ATIME_SET)) - sattr->ia_valid |= ATTR_ATIME; + for (i = 0; i < ARRAY_SIZE(attrset); i++) { + attrset[i] = opendata->o_res.attrset[i]; + if (opendata->o_arg.createmode == NFS4_CREATE_EXCLUSIVE4_1) + attrset[i] &= ~bitmask[i]; + } + + ret = (opendata->o_arg.createmode == NFS4_CREATE_EXCLUSIVE) ? 
+ sattr->ia_valid : 0; - if ((attrset[1] & FATTR4_WORD1_TIME_MODIFY) && - !(sattr->ia_valid & ATTR_MTIME_SET)) - sattr->ia_valid |= ATTR_MTIME; + if ((attrset[1] & (FATTR4_WORD1_TIME_ACCESS|FATTR4_WORD1_TIME_ACCESS_SET))) { + if (sattr->ia_valid & ATTR_ATIME_SET) + ret |= ATTR_ATIME_SET; + else + ret |= ATTR_ATIME; + } - /* Except MODE, it seems harmless of setting twice. */ - if (opendata->o_arg.createmode != NFS4_CREATE_EXCLUSIVE && - (attrset[1] & FATTR4_WORD1_MODE || - attrset[2] & FATTR4_WORD2_MODE_UMASK)) - sattr->ia_valid &= ~ATTR_MODE; + if ((attrset[1] & (FATTR4_WORD1_TIME_MODIFY|FATTR4_WORD1_TIME_MODIFY_SET))) { + if (sattr->ia_valid & ATTR_MTIME_SET) + ret |= ATTR_MTIME_SET; + else + ret |= ATTR_MTIME; + } - if (attrset[2] & FATTR4_WORD2_SECURITY_LABEL) + if (!(attrset[2] & FATTR4_WORD2_SECURITY_LABEL)) *label = NULL; + return ret; } static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, @@ -2892,12 +2912,15 @@ static int _nfs4_do_open(struct inode *dir, if ((opendata->o_arg.open_flags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL) && (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) { - nfs4_exclusive_attrset(opendata, sattr, &label); + unsigned attrs = nfs4_exclusive_attrset(opendata, sattr, &label); /* * send create attributes which was not set by open * with an extra setattr. 
*/ - if (sattr->ia_valid & NFS4_VALID_ATTRS) { + if (attrs || label) { + unsigned ia_old = sattr->ia_valid; + + sattr->ia_valid = attrs; nfs_fattr_init(opendata->o_res.f_attr); status = nfs4_do_setattr(state->inode, cred, opendata->o_res.f_attr, sattr, @@ -2907,6 +2930,7 @@ static int _nfs4_do_open(struct inode *dir, opendata->o_res.f_attr); nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel); } + sattr->ia_valid = ia_old; } } if (opened && opendata->file_created) @@ -3874,6 +3898,10 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, if (IS_ERR(label)) return PTR_ERR(label); + /* Return any delegations if we're going to change ACLs */ + if ((sattr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) + nfs4_inode_make_writeable(inode); + status = nfs4_do_setattr(inode, cred, fattr, sattr, ctx, NULL, label); if (status == 0) { nfs_setattr_update_inode(inode, sattr, fattr); @@ -4048,7 +4076,6 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry struct nfs_server *server = NFS_SERVER(inode); struct nfs4_accessargs args = { .fh = NFS_FH(inode), - .bitmask = server->cache_consistency_bitmask, .access = entry->mask, }; struct nfs4_accessres res = { @@ -4062,14 +4089,18 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry }; int status = 0; - res.fattr = nfs_alloc_fattr(); - if (res.fattr == NULL) - return -ENOMEM; + if (!nfs_have_delegated_attributes(inode)) { + res.fattr = nfs_alloc_fattr(); + if (res.fattr == NULL) + return -ENOMEM; + args.bitmask = server->cache_consistency_bitmask; + } status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (!status) { nfs_access_set_mask(entry, res.access); - nfs_refresh_inode(inode, res.fattr); + if (res.fattr) + nfs_refresh_inode(inode, res.fattr); } nfs_free_fattr(res.fattr); return status; @@ -4199,10 +4230,32 @@ static int _nfs4_proc_remove(struct inode *dir, const struct qstr *name) return status; } -static 
int nfs4_proc_remove(struct inode *dir, const struct qstr *name) +static int nfs4_proc_remove(struct inode *dir, struct dentry *dentry) +{ + struct nfs4_exception exception = { }; + struct inode *inode = d_inode(dentry); + int err; + + if (inode) { + if (inode->i_nlink == 1) + nfs4_inode_return_delegation(inode); + else + nfs4_inode_make_writeable(inode); + } + do { + err = _nfs4_proc_remove(dir, &dentry->d_name); + trace_nfs4_remove(dir, &dentry->d_name, err); + err = nfs4_handle_exception(NFS_SERVER(dir), err, + &exception); + } while (exception.retry); + return err; +} + +static int nfs4_proc_rmdir(struct inode *dir, const struct qstr *name) { struct nfs4_exception exception = { }; int err; + do { err = _nfs4_proc_remove(dir, name); trace_nfs4_remove(dir, name, err); @@ -4212,17 +4265,20 @@ static int nfs4_proc_remove(struct inode *dir, const struct qstr *name) return err; } -static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) +static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct dentry *dentry) { - struct nfs_server *server = NFS_SERVER(dir); struct nfs_removeargs *args = msg->rpc_argp; struct nfs_removeres *res = msg->rpc_resp; + struct inode *inode = d_inode(dentry); - res->server = server; + res->server = NFS_SB(dentry->d_sb); msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE]; nfs4_init_sequence(&args->seq_args, &res->seq_res, 1); nfs_fattr_init(res->dir_attr); + + if (inode) + nfs4_inode_return_delegation(inode); } static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data) @@ -4248,14 +4304,21 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir) return 1; } -static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir) +static void nfs4_proc_rename_setup(struct rpc_message *msg, + struct dentry *old_dentry, + struct dentry *new_dentry) { - struct nfs_server *server = NFS_SERVER(dir); struct nfs_renameargs *arg = msg->rpc_argp; struct 
nfs_renameres *res = msg->rpc_resp; + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); + if (old_inode) + nfs4_inode_make_writeable(old_inode); + if (new_inode) + nfs4_inode_return_delegation(new_inode); msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME]; - res->server = server; + res->server = NFS_SB(old_dentry->d_sb); nfs4_init_sequence(&arg->seq_args, &res->seq_res, 1); } @@ -4317,6 +4380,8 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, const struct } arg.bitmask = nfs4_bitmask(server, res.label); + nfs4_inode_make_writeable(inode); + status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); if (!status) { update_changeattr(dir, &res.cinfo, res.fattr->time_start); @@ -5310,7 +5375,7 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl i = buf_to_pages_noslab(buf, buflen, arg.acl_pages); if (i < 0) return i; - nfs4_inode_return_delegation(inode); + nfs4_inode_make_writeable(inode); ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); /* @@ -5325,7 +5390,8 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl * so mark the attribute cache invalid. 
*/ spin_lock(&inode->i_lock); - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR; + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_CHANGE + | NFS_INO_INVALID_CTIME; spin_unlock(&inode->i_lock); nfs_access_zap_cache(inode); nfs_zap_acl_cache(inode); @@ -6621,22 +6687,24 @@ static int nfs4_wake_lock_waiter(wait_queue_entry_t *wait, unsigned int mode, int flags, void *key) { int ret; - struct cb_notify_lock_args *cbnl = key; struct nfs4_lock_waiter *waiter = wait->private; - struct nfs_lowner *lowner = &cbnl->cbnl_owner, - *wowner = waiter->owner; - /* Only wake if the callback was for the same owner */ - if (lowner->clientid != wowner->clientid || - lowner->id != wowner->id || - lowner->s_dev != wowner->s_dev) - return 0; + /* NULL key means to wake up everyone */ + if (key) { + struct cb_notify_lock_args *cbnl = key; + struct nfs_lowner *lowner = &cbnl->cbnl_owner, + *wowner = waiter->owner; - /* Make sure it's for the right inode */ - if (nfs_compare_fh(NFS_FH(waiter->inode), &cbnl->cbnl_fh)) - return 0; + /* Only wake if the callback was for the same owner. */ + if (lowner->id != wowner->id || lowner->s_dev != wowner->s_dev) + return 0; - waiter->notified = true; + /* Make sure it's for the right inode */ + if (nfs_compare_fh(NFS_FH(waiter->inode), &cbnl->cbnl_fh)) + return 0; + + waiter->notified = true; + } /* override "private" so we can use default_wake_function */ wait->private = waiter->task; @@ -6673,6 +6741,7 @@ nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) add_wait_queue(q, &wait); while(!signalled()) { + waiter.notified = false; status = nfs4_proc_setlk(state, cmd, request); if ((status != -EAGAIN) || IS_SETLK(cmd)) break; @@ -8414,6 +8483,8 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf { switch(task->tk_status) { case 0: + wake_up_all(&clp->cl_lock_waitq); + /* Fallthrough */ case -NFS4ERR_COMPLETE_ALREADY: case -NFS4ERR_WRONG_CRED: /* What to do here? 
*/ break; @@ -9593,7 +9664,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .link = nfs4_proc_link, .symlink = nfs4_proc_symlink, .mkdir = nfs4_proc_mkdir, - .rmdir = nfs4_proc_remove, + .rmdir = nfs4_proc_rmdir, .readdir = nfs4_proc_readdir, .mknod = nfs4_proc_mknod, .statfs = nfs4_proc_statfs, @@ -9614,7 +9685,6 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .close_context = nfs4_close_context, .open_context = nfs4_atomic_open, .have_delegation = nfs4_have_delegation, - .return_delegation = nfs4_inode_return_delegation, .alloc_client = nfs4_alloc_client, .init_client = nfs4_init_client, .free_client = nfs4_free_client, diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 91a4d4eeb235..c10a422efe6f 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -428,7 +428,6 @@ nfs4_insert_state_owner_locked(struct nfs4_state_owner *new) struct rb_node **p = &server->state_owners.rb_node, *parent = NULL; struct nfs4_state_owner *sp; - int err; while (*p != NULL) { parent = *p; @@ -445,9 +444,6 @@ nfs4_insert_state_owner_locked(struct nfs4_state_owner *new) return sp; } } - err = ida_get_new(&server->openowner_id, &new->so_seqid.owner_id); - if (err) - return ERR_PTR(err); rb_link_node(&new->so_server_node, parent, p); rb_insert_color(&new->so_server_node, &server->state_owners); return new; @@ -460,7 +456,6 @@ nfs4_remove_state_owner_locked(struct nfs4_state_owner *sp) if (!RB_EMPTY_NODE(&sp->so_server_node)) rb_erase(&sp->so_server_node, &server->state_owners); - ida_remove(&server->openowner_id, sp->so_seqid.owner_id); } static void @@ -495,6 +490,12 @@ nfs4_alloc_state_owner(struct nfs_server *server, sp = kzalloc(sizeof(*sp), gfp_flags); if (!sp) return NULL; + sp->so_seqid.owner_id = ida_simple_get(&server->openowner_id, 0, 0, + gfp_flags); + if (sp->so_seqid.owner_id < 0) { + kfree(sp); + return NULL; + } sp->so_server = server; sp->so_cred = get_rpccred(cred); spin_lock_init(&sp->so_lock); @@ -526,6 +527,7 @@ static void nfs4_free_state_owner(struct 
nfs4_state_owner *sp) { nfs4_destroy_seqid_counter(&sp->so_seqid); put_rpccred(sp->so_cred); + ida_simple_remove(&sp->so_server->openowner_id, sp->so_seqid.owner_id); kfree(sp); } @@ -576,13 +578,9 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, new = nfs4_alloc_state_owner(server, cred, gfp_flags); if (new == NULL) goto out; - do { - if (ida_pre_get(&server->openowner_id, gfp_flags) == 0) - break; - spin_lock(&clp->cl_lock); - sp = nfs4_insert_state_owner_locked(new); - spin_unlock(&clp->cl_lock); - } while (sp == ERR_PTR(-EAGAIN)); + spin_lock(&clp->cl_lock); + sp = nfs4_insert_state_owner_locked(new); + spin_unlock(&clp->cl_lock); if (sp != new) nfs4_free_state_owner(new); out: diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index b993ad282de2..9b7392032321 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -98,6 +98,7 @@ static int nfs4_stat_to_errno(int); ((3+NFS4_FHSIZE) >> 2)) #define nfs4_fattr_bitmap_maxsz 4 #define encode_getattr_maxsz (op_encode_hdr_maxsz + nfs4_fattr_bitmap_maxsz) +#define nfstime4_maxsz (3) #define nfs4_name_maxsz (1 + ((3 + NFS4_MAXNAMLEN) >> 2)) #define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2)) #define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) @@ -112,7 +113,8 @@ static int nfs4_stat_to_errno(int); #define decode_mdsthreshold_maxsz (1 + 1 + nfs4_fattr_bitmap_maxsz + 1 + 8) /* This is based on getfattr, which uses the most attributes: */ #define nfs4_fattr_value_maxsz (1 + (1 + 2 + 2 + 4 + 2 + 1 + 1 + 2 + 2 + \ - 3 + 3 + 3 + nfs4_owner_maxsz + \ + 3*nfstime4_maxsz + \ + nfs4_owner_maxsz + \ nfs4_group_maxsz + nfs4_label_maxsz + \ decode_mdsthreshold_maxsz)) #define nfs4_fattr_maxsz (nfs4_fattr_bitmap_maxsz + \ @@ -123,7 +125,8 @@ static int nfs4_stat_to_errno(int); nfs4_owner_maxsz + \ nfs4_group_maxsz + \ nfs4_label_maxsz + \ - 4 + 4) + 1 + nfstime4_maxsz + \ + 1 + nfstime4_maxsz) #define encode_savefh_maxsz (op_encode_hdr_maxsz) #define decode_savefh_maxsz (op_decode_hdr_maxsz) 
#define encode_restorefh_maxsz (op_encode_hdr_maxsz) @@ -957,6 +960,35 @@ static void encode_uint64(struct xdr_stream *xdr, u64 n) WARN_ON_ONCE(xdr_stream_encode_u64(xdr, n) < 0); } +static ssize_t xdr_encode_bitmap4(struct xdr_stream *xdr, + const __u32 *bitmap, size_t len) +{ + ssize_t ret; + + /* Trim empty words */ + while (len > 0 && bitmap[len-1] == 0) + len--; + ret = xdr_stream_encode_uint32_array(xdr, bitmap, len); + if (WARN_ON_ONCE(ret < 0)) + return ret; + return len; +} + +static size_t mask_bitmap4(const __u32 *bitmap, const __u32 *mask, + __u32 *res, size_t len) +{ + size_t i; + __u32 tmp; + + while (len > 0 && (bitmap[len-1] == 0 || mask[len-1] == 0)) + len--; + for (i = len; i-- > 0;) { + tmp = bitmap[i] & mask[i]; + res[i] = tmp; + } + return len; +} + static void encode_nfs4_seqid(struct xdr_stream *xdr, const struct nfs_seqid *seqid) { @@ -1011,6 +1043,14 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve encode_opaque_fixed(xdr, verf->data, NFS4_VERIFIER_SIZE); } +static __be32 * +xdr_encode_nfstime4(__be32 *p, const struct timespec *t) +{ + p = xdr_encode_hyper(p, (__s64)t->tv_sec); + *p++ = cpu_to_be32(t->tv_nsec); + return p; +} + static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs4_label *label, const umode_t *umask, @@ -1022,9 +1062,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, int owner_namelen = 0; int owner_grouplen = 0; __be32 *p; - unsigned i; uint32_t len = 0; - uint32_t bmval_len; uint32_t bmval[3] = { 0 }; /* @@ -1072,7 +1110,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, if (attrmask[1] & FATTR4_WORD1_TIME_ACCESS_SET) { if (iap->ia_valid & ATTR_ATIME_SET) { bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET; - len += 16; + len += 4 + (nfstime4_maxsz << 2); } else if (iap->ia_valid & ATTR_ATIME) { bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET; len += 4; @@ -1081,7 +1119,7 @@ static void encode_attrs(struct 
xdr_stream *xdr, const struct iattr *iap, if (attrmask[1] & FATTR4_WORD1_TIME_MODIFY_SET) { if (iap->ia_valid & ATTR_MTIME_SET) { bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET; - len += 16; + len += 4 + (nfstime4_maxsz << 2); } else if (iap->ia_valid & ATTR_MTIME) { bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET; len += 4; @@ -1093,19 +1131,8 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, bmval[2] |= FATTR4_WORD2_SECURITY_LABEL; } - if (bmval[2] != 0) - bmval_len = 3; - else if (bmval[1] != 0) - bmval_len = 2; - else - bmval_len = 1; - - p = reserve_space(xdr, 4 + (bmval_len << 2) + 4 + len); - - *p++ = cpu_to_be32(bmval_len); - for (i = 0; i < bmval_len; i++) - *p++ = cpu_to_be32(bmval[i]); - *p++ = cpu_to_be32(len); + xdr_encode_bitmap4(xdr, bmval, ARRAY_SIZE(bmval)); + xdr_stream_encode_opaque_inline(xdr, (void **)&p, len); if (bmval[0] & FATTR4_WORD0_SIZE) p = xdr_encode_hyper(p, iap->ia_size); @@ -1118,16 +1145,14 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) { if (iap->ia_valid & ATTR_ATIME_SET) { *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); - p = xdr_encode_hyper(p, (s64)iap->ia_atime.tv_sec); - *p++ = cpu_to_be32(iap->ia_atime.tv_nsec); + p = xdr_encode_nfstime4(p, &iap->ia_atime); } else *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); } if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) { if (iap->ia_valid & ATTR_MTIME_SET) { *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); - p = xdr_encode_hyper(p, (s64)iap->ia_mtime.tv_sec); - *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec); + p = xdr_encode_nfstime4(p, &iap->ia_mtime); } else *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); } @@ -1199,85 +1224,45 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg * create->server, create->server->attr_bitmask); } -static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr) -{ - __be32 *p; - - encode_op_hdr(xdr, OP_GETATTR, 
decode_getattr_maxsz, hdr); - p = reserve_space(xdr, 8); - *p++ = cpu_to_be32(1); - *p = cpu_to_be32(bitmap); -} - -static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1, struct compound_hdr *hdr) -{ - __be32 *p; - - encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr); - p = reserve_space(xdr, 12); - *p++ = cpu_to_be32(2); - *p++ = cpu_to_be32(bm0); - *p = cpu_to_be32(bm1); -} - -static void -encode_getattr_three(struct xdr_stream *xdr, - uint32_t bm0, uint32_t bm1, uint32_t bm2, - struct compound_hdr *hdr) +static void encode_getattr(struct xdr_stream *xdr, + const __u32 *bitmap, const __u32 *mask, size_t len, + struct compound_hdr *hdr) { - __be32 *p; + __u32 masked_bitmap[nfs4_fattr_bitmap_maxsz]; encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr); - if (bm2) { - p = reserve_space(xdr, 16); - *p++ = cpu_to_be32(3); - *p++ = cpu_to_be32(bm0); - *p++ = cpu_to_be32(bm1); - *p = cpu_to_be32(bm2); - } else if (bm1) { - p = reserve_space(xdr, 12); - *p++ = cpu_to_be32(2); - *p++ = cpu_to_be32(bm0); - *p = cpu_to_be32(bm1); - } else { - p = reserve_space(xdr, 8); - *p++ = cpu_to_be32(1); - *p = cpu_to_be32(bm0); + if (mask) { + if (WARN_ON_ONCE(len > ARRAY_SIZE(masked_bitmap))) + len = ARRAY_SIZE(masked_bitmap); + len = mask_bitmap4(bitmap, mask, masked_bitmap, len); + bitmap = masked_bitmap; } + xdr_encode_bitmap4(xdr, bitmap, len); } static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) { - encode_getattr_three(xdr, bitmask[0] & nfs4_fattr_bitmap[0], - bitmask[1] & nfs4_fattr_bitmap[1], - bitmask[2] & nfs4_fattr_bitmap[2], - hdr); + encode_getattr(xdr, nfs4_fattr_bitmap, bitmask, + ARRAY_SIZE(nfs4_fattr_bitmap), hdr); } static void encode_getfattr_open(struct xdr_stream *xdr, const u32 *bitmask, const u32 *open_bitmap, struct compound_hdr *hdr) { - encode_getattr_three(xdr, - bitmask[0] & open_bitmap[0], - bitmask[1] & open_bitmap[1], - bitmask[2] & open_bitmap[2], - hdr); + 
encode_getattr(xdr, open_bitmap, bitmask, 3, hdr); } static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) { - encode_getattr_three(xdr, - bitmask[0] & nfs4_fsinfo_bitmap[0], - bitmask[1] & nfs4_fsinfo_bitmap[1], - bitmask[2] & nfs4_fsinfo_bitmap[2], - hdr); + encode_getattr(xdr, nfs4_fsinfo_bitmap, bitmask, + ARRAY_SIZE(nfs4_fsinfo_bitmap), hdr); } static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) { - encode_getattr_two(xdr, bitmask[0] & nfs4_fs_locations_bitmap[0], - bitmask[1] & nfs4_fs_locations_bitmap[1], hdr); + encode_getattr(xdr, nfs4_fs_locations_bitmap, bitmask, + ARRAY_SIZE(nfs4_fs_locations_bitmap), hdr); } static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr) @@ -2116,7 +2101,8 @@ static void nfs4_xdr_enc_access(struct rpc_rqst *req, struct xdr_stream *xdr, encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); encode_access(xdr, args->access, &hdr); - encode_getfattr(xdr, args->bitmask, &hdr); + if (args->bitmask) + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); } @@ -2558,13 +2544,17 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr, struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; + const __u32 nfs4_acl_bitmap[1] = { + [0] = FATTR4_WORD0_ACL, + }; uint32_t replen; encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); replen = hdr.replen + op_decode_hdr_maxsz; - encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr); + encode_getattr(xdr, nfs4_acl_bitmap, NULL, + ARRAY_SIZE(nfs4_acl_bitmap), &hdr); xdr_inline_pages(&req->rq_rcv_buf, replen << 2, args->acl_pages, 0, args->acl_len); @@ -2643,8 +2633,8 @@ static void nfs4_xdr_enc_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr, encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); 
encode_putfh(xdr, args->fh, &hdr); - encode_getattr_one(xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0], - &hdr); + encode_getattr(xdr, nfs4_pathconf_bitmap, args->bitmask, + ARRAY_SIZE(nfs4_pathconf_bitmap), &hdr); encode_nops(&hdr); } @@ -2662,8 +2652,8 @@ static void nfs4_xdr_enc_statfs(struct rpc_rqst *req, struct xdr_stream *xdr, encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); - encode_getattr_two(xdr, args->bitmask[0] & nfs4_statfs_bitmap[0], - args->bitmask[1] & nfs4_statfs_bitmap[1], &hdr); + encode_getattr(xdr, nfs4_statfs_bitmap, args->bitmask, + ARRAY_SIZE(nfs4_statfs_bitmap), &hdr); encode_nops(&hdr); } @@ -2683,7 +2673,7 @@ static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req, encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fhandle, &hdr); - encode_getattr_three(xdr, bitmask[0], bitmask[1], bitmask[2], &hdr); + encode_getattr(xdr, bitmask, NULL, 3, &hdr); encode_nops(&hdr); } @@ -3217,34 +3207,27 @@ static int decode_ace(struct xdr_stream *xdr, void *ace) return -EIO; } -static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) +static ssize_t +decode_bitmap4(struct xdr_stream *xdr, uint32_t *bitmap, size_t sz) { - uint32_t bmlen; - __be32 *p; - - p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) - goto out_overflow; - bmlen = be32_to_cpup(p); + ssize_t ret; - bitmap[0] = bitmap[1] = bitmap[2] = 0; - p = xdr_inline_decode(xdr, (bmlen << 2)); - if (unlikely(!p)) - goto out_overflow; - if (bmlen > 0) { - bitmap[0] = be32_to_cpup(p++); - if (bmlen > 1) { - bitmap[1] = be32_to_cpup(p++); - if (bmlen > 2) - bitmap[2] = be32_to_cpup(p); - } - } - return 0; -out_overflow: + ret = xdr_stream_decode_uint32_array(xdr, bitmap, sz); + if (likely(ret >= 0)) + return ret; + if (ret == -EMSGSIZE) + return sz; print_overflow_msg(__func__, xdr); return -EIO; } +static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t 
*bitmap) +{ + ssize_t ret; + ret = decode_bitmap4(xdr, bitmap, 3); + return ret < 0 ? ret : 0; +} + static int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, unsigned int *savep) { __be32 *p; @@ -3980,7 +3963,7 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, bitmap[1] &= ~FATTR4_WORD1_OWNER; if (owner_name != NULL) { - len = decode_nfs4_string(xdr, owner_name, GFP_NOWAIT); + len = decode_nfs4_string(xdr, owner_name, GFP_NOIO); if (len <= 0) goto out; dprintk("%s: name=%s\n", __func__, owner_name->data); @@ -4015,7 +3998,7 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP; if (group_name != NULL) { - len = decode_nfs4_string(xdr, group_name, GFP_NOWAIT); + len = decode_nfs4_string(xdr, group_name, GFP_NOIO); if (len <= 0) goto out; dprintk("%s: name=%s\n", __func__, group_name->data); @@ -4155,19 +4138,25 @@ out_overflow: return -EIO; } +static __be32 * +xdr_decode_nfstime4(__be32 *p, struct timespec *t) +{ + __u64 sec; + + p = xdr_decode_hyper(p, &sec); + t-> tv_sec = (time_t)sec; + t->tv_nsec = be32_to_cpup(p++); + return p; +} + static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) { __be32 *p; - uint64_t sec; - uint32_t nsec; - p = xdr_inline_decode(xdr, 12); + p = xdr_inline_decode(xdr, nfstime4_maxsz << 2); if (unlikely(!p)) goto out_overflow; - p = xdr_decode_hyper(p, &sec); - nsec = be32_to_cpup(p); - time->tv_sec = (time_t)sec; - time->tv_nsec = (long)nsec; + xdr_decode_nfstime4(p, time); return 0; out_overflow: print_overflow_msg(__func__, xdr); @@ -5470,21 +5459,13 @@ decode_savefh(struct xdr_stream *xdr) static int decode_setattr(struct xdr_stream *xdr) { - __be32 *p; - uint32_t bmlen; int status; status = decode_op_hdr(xdr, OP_SETATTR); if (status) return status; - p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) - goto out_overflow; - bmlen = be32_to_cpup(p); - p = xdr_inline_decode(xdr, bmlen << 2); - if (likely(p)) + if 
(decode_bitmap4(xdr, NULL, 0) >= 0) return 0; -out_overflow: print_overflow_msg(__func__, xdr); return -EIO; } @@ -6255,7 +6236,8 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_access(xdr, &res->supported, &res->access); if (status != 0) goto out; - decode_getfattr(xdr, res->fattr, res->server); + if (res->fattr) + decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -7535,6 +7517,7 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, unsigned int savep; uint32_t bitmap[3] = {0}; uint32_t len; + uint64_t new_cookie; __be32 *p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; @@ -7551,8 +7534,7 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, p = xdr_inline_decode(xdr, 12); if (unlikely(!p)) goto out_overflow; - entry->prev_cookie = entry->cookie; - p = xdr_decode_hyper(p, &entry->cookie); + p = xdr_decode_hyper(p, &new_cookie); entry->len = be32_to_cpup(p); p = xdr_inline_decode(xdr, entry->len); @@ -7586,6 +7568,9 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, if (entry->fattr->valid & NFS_ATTR_FATTR_TYPE) entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); + entry->prev_cookie = entry->cookie; + entry->cookie = new_cookie; + return 0; out_overflow: diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index f7fd9192d4bc..4e93d6308733 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -300,11 +300,11 @@ out: } static int -nfs_proc_remove(struct inode *dir, const struct qstr *name) +nfs_proc_remove(struct inode *dir, struct dentry *dentry) { struct nfs_removeargs arg = { .fh = NFS_FH(dir), - .name = *name, + .name = dentry->d_name, }; struct rpc_message msg = { .rpc_proc = &nfs_procedures[NFSPROC_REMOVE], @@ -312,7 +312,7 @@ nfs_proc_remove(struct inode *dir, const struct qstr *name) }; int status; - dprintk("NFS call remove %s\n", name->name); + dprintk("NFS call remove %pd2\n",dentry); status = 
rpc_call_sync(NFS_CLIENT(dir), &msg, 0); nfs_mark_for_revalidate(dir); @@ -321,7 +321,7 @@ nfs_proc_remove(struct inode *dir, const struct qstr *name) } static void -nfs_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) +nfs_proc_unlink_setup(struct rpc_message *msg, struct dentry *dentry) { msg->rpc_proc = &nfs_procedures[NFSPROC_REMOVE]; } @@ -338,7 +338,9 @@ static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir) } static void -nfs_proc_rename_setup(struct rpc_message *msg, struct inode *dir) +nfs_proc_rename_setup(struct rpc_message *msg, + struct dentry *old_dentry, + struct dentry *new_dentry) { msg->rpc_proc = &nfs_procedures[NFSPROC_RENAME]; } @@ -671,12 +673,6 @@ static int nfs_have_delegation(struct inode *inode, fmode_t flags) return 0; } -static int nfs_return_delegation(struct inode *inode) -{ - nfs_wb_all(inode); - return 0; -} - static const struct inode_operations nfs_dir_inode_operations = { .create = nfs_create, .lookup = nfs_lookup, @@ -741,7 +737,6 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .lock_check_bounds = nfs_lock_check_bounds, .close_context = nfs_close_context, .have_delegation = nfs_have_delegation, - .return_delegation = nfs_return_delegation, .alloc_client = nfs_alloc_client, .init_client = nfs_init_client, .free_client = nfs_free_client, diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 630b4a3c1a93..bf54fc9ae135 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -105,7 +105,7 @@ static void nfs_do_call_unlink(struct nfs_unlinkdata *data) data->args.fh = NFS_FH(dir); nfs_fattr_init(data->res.dir_attr); - NFS_PROTO(dir)->unlink_setup(&msg, dir); + NFS_PROTO(dir)->unlink_setup(&msg, data->dentry); task_setup_data.rpc_client = NFS_CLIENT(dir); task = rpc_run_task(&task_setup_data); @@ -386,7 +386,7 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir, nfs_sb_active(old_dir->i_sb); - NFS_PROTO(data->old_dir)->rename_setup(&msg, old_dir); + NFS_PROTO(data->old_dir)->rename_setup(&msg, 
old_dentry, new_dentry); return rpc_run_task(&task_setup_data); } @@ -463,9 +463,6 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) fileid = NFS_FILEID(d_inode(dentry)); - /* Return delegation in anticipation of the rename */ - NFS_PROTO(d_inode(dentry))->return_delegation(d_inode(dentry)); - sdentry = NULL; do { int slen; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 6579f3b367bd..0193053bc139 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -231,6 +231,7 @@ static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int c if (i_size >= end) goto out; i_size_write(inode, end); + NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE; nfs_inc_stats(inode, NFSIOS_EXTENDWRITE); out: spin_unlock(&inode->i_lock); @@ -1562,8 +1563,11 @@ static int nfs_writeback_done(struct rpc_task *task, } /* Deal with the suid/sgid bit corner case */ - if (nfs_should_remove_suid(inode)) - nfs_mark_for_revalidate(inode); + if (nfs_should_remove_suid(inode)) { + spin_lock(&inode->i_lock); + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_OTHER; + spin_unlock(&inode->i_lock); + } return 0; } diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index c21e0b4454a6..dec98cab729d 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -193,9 +193,9 @@ retry: (unsigned long long)oldkey, (unsigned long long)newkey); - spin_lock_irq(&btnc->tree_lock); - err = radix_tree_insert(&btnc->page_tree, newkey, obh->b_page); - spin_unlock_irq(&btnc->tree_lock); + xa_lock_irq(&btnc->i_pages); + err = radix_tree_insert(&btnc->i_pages, newkey, obh->b_page); + xa_unlock_irq(&btnc->i_pages); /* * Note: page->index will not change to newkey until * nilfs_btnode_commit_change_key() will be called. 
@@ -251,11 +251,11 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc, (unsigned long long)newkey); mark_buffer_dirty(obh); - spin_lock_irq(&btnc->tree_lock); - radix_tree_delete(&btnc->page_tree, oldkey); - radix_tree_tag_set(&btnc->page_tree, newkey, + xa_lock_irq(&btnc->i_pages); + radix_tree_delete(&btnc->i_pages, oldkey); + radix_tree_tag_set(&btnc->i_pages, newkey, PAGECACHE_TAG_DIRTY); - spin_unlock_irq(&btnc->tree_lock); + xa_unlock_irq(&btnc->i_pages); opage->index = obh->b_blocknr = newkey; unlock_page(opage); @@ -283,9 +283,9 @@ void nilfs_btnode_abort_change_key(struct address_space *btnc, return; if (nbh == NULL) { /* blocksize == pagesize */ - spin_lock_irq(&btnc->tree_lock); - radix_tree_delete(&btnc->page_tree, newkey); - spin_unlock_irq(&btnc->tree_lock); + xa_lock_irq(&btnc->i_pages); + radix_tree_delete(&btnc->i_pages, newkey); + xa_unlock_irq(&btnc->i_pages); unlock_page(ctxt->bh->b_page); } else brelse(nbh); diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 68241512d7c1..4cb850a6f1c2 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -331,15 +331,15 @@ repeat: struct page *page2; /* move the page to the destination cache */ - spin_lock_irq(&smap->tree_lock); - page2 = radix_tree_delete(&smap->page_tree, offset); + xa_lock_irq(&smap->i_pages); + page2 = radix_tree_delete(&smap->i_pages, offset); WARN_ON(page2 != page); smap->nrpages--; - spin_unlock_irq(&smap->tree_lock); + xa_unlock_irq(&smap->i_pages); - spin_lock_irq(&dmap->tree_lock); - err = radix_tree_insert(&dmap->page_tree, offset, page); + xa_lock_irq(&dmap->i_pages); + err = radix_tree_insert(&dmap->i_pages, offset, page); if (unlikely(err < 0)) { WARN_ON(err == -EEXIST); page->mapping = NULL; @@ -348,11 +348,11 @@ repeat: page->mapping = dmap; dmap->nrpages++; if (PageDirty(page)) - radix_tree_tag_set(&dmap->page_tree, + radix_tree_tag_set(&dmap->i_pages, offset, PAGECACHE_TAG_DIRTY); } - spin_unlock_irq(&dmap->tree_lock); + xa_unlock_irq(&dmap->i_pages); } 
unlock_page(page); } @@ -474,15 +474,15 @@ int __nilfs_clear_page_dirty(struct page *page) struct address_space *mapping = page->mapping; if (mapping) { - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); if (test_bit(PG_dirty, &page->flags)) { - radix_tree_tag_clear(&mapping->page_tree, + radix_tree_tag_clear(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return clear_page_dirty_for_io(page); } - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return 0; } return TestClearPageDirty(page); diff --git a/fs/proc/array.c b/fs/proc/array.c index 598803576e4c..ae2c807fd719 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -141,25 +141,12 @@ static inline const char *get_task_state(struct task_struct *tsk) return task_state_array[task_state_index(tsk)]; } -static inline int get_task_umask(struct task_struct *tsk) -{ - struct fs_struct *fs; - int umask = -ENOENT; - - task_lock(tsk); - fs = tsk->fs; - if (fs) - umask = fs->umask; - task_unlock(tsk); - return umask; -} - static inline void task_state(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *p) { struct user_namespace *user_ns = seq_user_ns(m); struct group_info *group_info; - int g, umask; + int g, umask = -1; struct task_struct *tracer; const struct cred *cred; pid_t ppid, tpid = 0, tgid, ngid; @@ -177,17 +164,18 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, ngid = task_numa_group_id(p); cred = get_task_cred(p); - umask = get_task_umask(p); - if (umask >= 0) - seq_printf(m, "Umask:\t%#04o\n", umask); - task_lock(p); + if (p->fs) + umask = p->fs->umask; if (p->files) max_fds = files_fdtable(p->files)->max_fds; task_unlock(p); rcu_read_unlock(); - seq_printf(m, "State:\t%s", get_task_state(p)); + if (umask >= 0) + seq_printf(m, "Umask:\t%#04o\n", umask); + seq_puts(m, "State:\t"); + seq_puts(m, get_task_state(p)); 
seq_put_decimal_ull(m, "\nTgid:\t", tgid); seq_put_decimal_ull(m, "\nNgid:\t", ngid); @@ -313,8 +301,8 @@ static void render_cap_t(struct seq_file *m, const char *header, seq_puts(m, header); CAP_FOR_EACH_U32(__capi) { - seq_printf(m, "%08x", - a->cap[CAP_LAST_U32 - __capi]); + seq_put_hex_ll(m, NULL, + a->cap[CAP_LAST_U32 - __capi], 8); } seq_putc(m, '\n'); } @@ -368,7 +356,8 @@ static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm) { - seq_printf(m, "CoreDumping:\t%d\n", !!mm->core_state); + seq_put_decimal_ull(m, "CoreDumping:\t", !!mm->core_state); + seq_putc(m, '\n'); } int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, @@ -504,7 +493,11 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, /* convert nsec -> ticks */ start_time = nsec_to_clock_t(task->real_start_time); - seq_printf(m, "%d (%s) %c", pid_nr_ns(pid, ns), tcomm, state); + seq_put_decimal_ull(m, "", pid_nr_ns(pid, ns)); + seq_puts(m, " ("); + seq_puts(m, tcomm); + seq_puts(m, ") "); + seq_putc(m, state); seq_put_decimal_ll(m, " ", ppid); seq_put_decimal_ll(m, " ", pgid); seq_put_decimal_ll(m, " ", sid); diff --git a/fs/proc/base.c b/fs/proc/base.c index d53246863cfb..eafa39a3a88c 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -388,14 +388,17 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, unsigned long wchan; char symname[KSYM_NAME_LEN]; - wchan = get_wchan(task); + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) + goto print0; - if (wchan && ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS) - && !lookup_symbol_name(wchan, symname)) - seq_printf(m, "%s", symname); - else - seq_putc(m, '0'); + wchan = get_wchan(task); + if (wchan && !lookup_symbol_name(wchan, symname)) { + seq_puts(m, symname); + return 0; + } +print0: + seq_putc(m, '0'); return 0; } #endif /* CONFIG_KALLSYMS */ @@ -1910,6 +1913,8 @@ static int 
dname_to_vma_addr(struct dentry *dentry, unsigned long long sval, eval; unsigned int len; + if (str[0] == '0' && str[1] != '-') + return -EINVAL; len = _parse_integer(str, 16, &sval); if (len & KSTRTOX_OVERFLOW) return -EINVAL; @@ -1921,6 +1926,8 @@ static int dname_to_vma_addr(struct dentry *dentry, return -EINVAL; str++; + if (str[0] == '0' && str[1]) + return -EINVAL; len = _parse_integer(str, 16, &eval); if (len & KSTRTOX_OVERFLOW) return -EINVAL; @@ -2204,6 +2211,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) } } up_read(&mm->mmap_sem); + mmput(mm); for (i = 0; i < nr_files; i++) { char buf[4 * sizeof(long) + 2]; /* max: %lx-%lx\0 */ @@ -2221,7 +2229,6 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) } if (fa) flex_array_free(fa); - mmput(mm); out_put_task: put_task_struct(task); diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c index 403cbb12a6e9..8233e7af9389 100644 --- a/fs/proc/cmdline.c +++ b/fs/proc/cmdline.c @@ -6,7 +6,8 @@ static int cmdline_proc_show(struct seq_file *m, void *v) { - seq_printf(m, "%s\n", saved_command_line); + seq_puts(m, saved_command_line); + seq_putc(m, '\n'); return 0; } diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 5d709fa8f3a2..04c4804cbdef 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -8,6 +8,7 @@ * Copyright (C) 1997 Theodore Ts'o */ +#include <linux/cache.h> #include <linux/errno.h> #include <linux/time.h> #include <linux/proc_fs.h> @@ -28,6 +29,17 @@ static DEFINE_RWLOCK(proc_subdir_lock); +struct kmem_cache *proc_dir_entry_cache __ro_after_init; + +void pde_free(struct proc_dir_entry *pde) +{ + if (S_ISLNK(pde->mode)) + kfree(pde->data); + if (pde->name != pde->inline_name) + kfree(pde->name); + kmem_cache_free(proc_dir_entry_cache, pde); +} + static int proc_match(const char *name, struct proc_dir_entry *de, unsigned int len) { if (len < de->namelen) @@ -40,8 +52,8 @@ static int proc_match(const char *name, struct proc_dir_entry *de, unsigned int static 
struct proc_dir_entry *pde_subdir_first(struct proc_dir_entry *dir) { - return rb_entry_safe(rb_first_cached(&dir->subdir), - struct proc_dir_entry, subdir_node); + return rb_entry_safe(rb_first(&dir->subdir), struct proc_dir_entry, + subdir_node); } static struct proc_dir_entry *pde_subdir_next(struct proc_dir_entry *dir) @@ -54,7 +66,7 @@ static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir, const char *name, unsigned int len) { - struct rb_node *node = dir->subdir.rb_root.rb_node; + struct rb_node *node = dir->subdir.rb_node; while (node) { struct proc_dir_entry *de = rb_entry(node, @@ -75,9 +87,8 @@ static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir, static bool pde_subdir_insert(struct proc_dir_entry *dir, struct proc_dir_entry *de) { - struct rb_root_cached *root = &dir->subdir; - struct rb_node **new = &root->rb_root.rb_node, *parent = NULL; - bool leftmost = true; + struct rb_root *root = &dir->subdir; + struct rb_node **new = &root->rb_node, *parent = NULL; /* Figure out where to put new node */ while (*new) { @@ -89,16 +100,15 @@ static bool pde_subdir_insert(struct proc_dir_entry *dir, parent = *new; if (result < 0) new = &(*new)->rb_left; - else if (result > 0) { + else if (result > 0) new = &(*new)->rb_right; - leftmost = false; - } else + else return false; } /* Add new node and rebalance tree. */ rb_link_node(&de->subdir_node, parent, new); - rb_insert_color_cached(&de->subdir_node, root, leftmost); + rb_insert_color(&de->subdir_node, root); return true; } @@ -354,6 +364,14 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, WARN(1, "name len %u\n", qstr.len); return NULL; } + if (qstr.len == 1 && fn[0] == '.') { + WARN(1, "name '.'\n"); + return NULL; + } + if (qstr.len == 2 && fn[0] == '.' 
&& fn[1] == '.') { + WARN(1, "name '..'\n"); + return NULL; + } if (*parent == &proc_root && name_to_int(&qstr) != ~0U) { WARN(1, "create '/proc/%s' by hand\n", qstr.name); return NULL; @@ -363,16 +381,26 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, return NULL; } - ent = kzalloc(sizeof(struct proc_dir_entry) + qstr.len + 1, GFP_KERNEL); + ent = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL); if (!ent) goto out; + if (qstr.len + 1 <= sizeof(ent->inline_name)) { + ent->name = ent->inline_name; + } else { + ent->name = kmalloc(qstr.len + 1, GFP_KERNEL); + if (!ent->name) { + pde_free(ent); + return NULL; + } + } + memcpy(ent->name, fn, qstr.len + 1); ent->namelen = qstr.len; ent->mode = mode; ent->nlink = nlink; - ent->subdir = RB_ROOT_CACHED; - atomic_set(&ent->count, 1); + ent->subdir = RB_ROOT; + refcount_set(&ent->refcnt, 1); spin_lock_init(&ent->pde_unload_lock); INIT_LIST_HEAD(&ent->pde_openers); proc_set_user(ent, (*parent)->uid, (*parent)->gid); @@ -395,12 +423,11 @@ struct proc_dir_entry *proc_symlink(const char *name, strcpy((char*)ent->data,dest); ent->proc_iops = &proc_link_inode_operations; if (proc_register(parent, ent) < 0) { - kfree(ent->data); - kfree(ent); + pde_free(ent); ent = NULL; } } else { - kfree(ent); + pde_free(ent); ent = NULL; } } @@ -423,7 +450,7 @@ struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode, ent->proc_iops = &proc_dir_inode_operations; parent->nlink++; if (proc_register(parent, ent) < 0) { - kfree(ent); + pde_free(ent); parent->nlink--; ent = NULL; } @@ -458,7 +485,7 @@ struct proc_dir_entry *proc_create_mount_point(const char *name) ent->proc_iops = NULL; parent->nlink++; if (proc_register(parent, ent) < 0) { - kfree(ent); + pde_free(ent); parent->nlink--; ent = NULL; } @@ -495,7 +522,7 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, goto out_free; return pde; out_free: - kfree(pde); + pde_free(pde); out: return NULL; } @@ -522,19 +549,12 @@ 
void proc_set_user(struct proc_dir_entry *de, kuid_t uid, kgid_t gid) } EXPORT_SYMBOL(proc_set_user); -static void free_proc_entry(struct proc_dir_entry *de) -{ - proc_free_inum(de->low_ino); - - if (S_ISLNK(de->mode)) - kfree(de->data); - kfree(de); -} - void pde_put(struct proc_dir_entry *pde) { - if (atomic_dec_and_test(&pde->count)) - free_proc_entry(pde); + if (refcount_dec_and_test(&pde->refcnt)) { + proc_free_inum(pde->low_ino); + pde_free(pde); + } } /* @@ -555,7 +575,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) de = pde_subdir_find(parent, fn, len); if (de) - rb_erase_cached(&de->subdir_node, &parent->subdir); + rb_erase(&de->subdir_node, &parent->subdir); write_unlock(&proc_subdir_lock); if (!de) { WARN(1, "name '%s'\n", name); @@ -592,13 +612,13 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) write_unlock(&proc_subdir_lock); return -ENOENT; } - rb_erase_cached(&root->subdir_node, &parent->subdir); + rb_erase(&root->subdir_node, &parent->subdir); de = root; while (1) { next = pde_subdir_first(de); if (next) { - rb_erase_cached(&next->subdir_node, &de->subdir); + rb_erase(&next->subdir_node, &de->subdir); de = next; continue; } diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 6e8724958116..2cf3b74391ca 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -54,6 +54,7 @@ static void proc_evict_inode(struct inode *inode) } static struct kmem_cache *proc_inode_cachep __ro_after_init; +static struct kmem_cache *pde_opener_cache __ro_after_init; static struct inode *proc_alloc_inode(struct super_block *sb) { @@ -92,7 +93,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -void __init proc_init_inodecache(void) +void __init proc_init_kmemcache(void) { proc_inode_cachep = kmem_cache_create("proc_inode_cache", sizeof(struct proc_inode), @@ -100,6 +101,13 @@ void __init proc_init_inodecache(void) SLAB_MEM_SPREAD|SLAB_ACCOUNT| SLAB_PANIC), init_once); + pde_opener_cache = + 
kmem_cache_create("pde_opener", sizeof(struct pde_opener), 0, + SLAB_ACCOUNT|SLAB_PANIC, NULL); + proc_dir_entry_cache = kmem_cache_create_usercopy( + "proc_dir_entry", sizeof(struct proc_dir_entry), 0, SLAB_PANIC, + offsetof(struct proc_dir_entry, inline_name), + sizeof_field(struct proc_dir_entry, inline_name), NULL); } static int proc_show_options(struct seq_file *seq, struct dentry *root) @@ -138,7 +146,7 @@ static void unuse_pde(struct proc_dir_entry *pde) complete(pde->pde_unload_completion); } -/* pde is locked */ +/* pde is locked on entry, unlocked on exit */ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) { /* @@ -157,9 +165,10 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) pdeo->c = &c; spin_unlock(&pde->pde_unload_lock); wait_for_completion(&c); - spin_lock(&pde->pde_unload_lock); } else { struct file *file; + struct completion *c; + pdeo->closing = true; spin_unlock(&pde->pde_unload_lock); file = pdeo->file; @@ -167,9 +176,11 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) spin_lock(&pde->pde_unload_lock); /* After ->release. */ list_del(&pdeo->lh); - if (unlikely(pdeo->c)) - complete(pdeo->c); - kfree(pdeo); + c = pdeo->c; + spin_unlock(&pde->pde_unload_lock); + if (unlikely(c)) + complete(c); + kmem_cache_free(pde_opener_cache, pdeo); } } @@ -188,6 +199,7 @@ void proc_entry_rundown(struct proc_dir_entry *de) struct pde_opener *pdeo; pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh); close_pdeo(de, pdeo); + spin_lock(&de->pde_unload_lock); } spin_unlock(&de->pde_unload_lock); } @@ -338,31 +350,36 @@ static int proc_reg_open(struct inode *inode, struct file *file) * * Save every "struct file" with custom ->release hook. 
*/ - pdeo = kmalloc(sizeof(struct pde_opener), GFP_KERNEL); - if (!pdeo) - return -ENOMEM; - - if (!use_pde(pde)) { - kfree(pdeo); + if (!use_pde(pde)) return -ENOENT; - } - open = pde->proc_fops->open; + release = pde->proc_fops->release; + if (release) { + pdeo = kmem_cache_alloc(pde_opener_cache, GFP_KERNEL); + if (!pdeo) { + rv = -ENOMEM; + goto out_unuse; + } + } + open = pde->proc_fops->open; if (open) rv = open(inode, file); - if (rv == 0 && release) { - /* To know what to release. */ - pdeo->file = file; - pdeo->closing = false; - pdeo->c = NULL; - spin_lock(&pde->pde_unload_lock); - list_add(&pdeo->lh, &pde->pde_openers); - spin_unlock(&pde->pde_unload_lock); - } else - kfree(pdeo); + if (release) { + if (rv == 0) { + /* To know what to release. */ + pdeo->file = file; + pdeo->closing = false; + pdeo->c = NULL; + spin_lock(&pde->pde_unload_lock); + list_add(&pdeo->lh, &pde->pde_openers); + spin_unlock(&pde->pde_unload_lock); + } else + kmem_cache_free(pde_opener_cache, pdeo); + } +out_unuse: unuse_pde(pde); return rv; } @@ -375,7 +392,7 @@ static int proc_reg_release(struct inode *inode, struct file *file) list_for_each_entry(pdeo, &pde->pde_openers, lh) { if (pdeo->file == file) { close_pdeo(pde, pdeo); - break; + return 0; } } spin_unlock(&pde->pde_unload_lock); diff --git a/fs/proc/internal.h b/fs/proc/internal.h index d697c8ab0a14..0f1692e63cb6 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -11,6 +11,7 @@ #include <linux/proc_fs.h> #include <linux/proc_ns.h> +#include <linux/refcount.h> #include <linux/spinlock.h> #include <linux/atomic.h> #include <linux/binfmts.h> @@ -36,7 +37,7 @@ struct proc_dir_entry { * negative -> it's going away RSN */ atomic_t in_use; - atomic_t count; /* use count */ + refcount_t refcnt; struct list_head pde_openers; /* who did ->open, but not ->release */ /* protects ->pde_openers and all struct pde_opener instances */ spinlock_t pde_unload_lock; @@ -50,13 +51,22 @@ struct proc_dir_entry { kgid_t gid; loff_t 
size; struct proc_dir_entry *parent; - struct rb_root_cached subdir; + struct rb_root subdir; struct rb_node subdir_node; + char *name; umode_t mode; u8 namelen; - char name[]; +#ifdef CONFIG_64BIT +#define SIZEOF_PDE_INLINE_NAME (192-139) +#else +#define SIZEOF_PDE_INLINE_NAME (128-87) +#endif + char inline_name[SIZEOF_PDE_INLINE_NAME]; } __randomize_layout; +extern struct kmem_cache *proc_dir_entry_cache; +void pde_free(struct proc_dir_entry *pde); + union proc_op { int (*proc_get_link)(struct dentry *, struct path *); int (*proc_show)(struct seq_file *m, @@ -159,7 +169,7 @@ int proc_readdir_de(struct file *, struct dir_context *, struct proc_dir_entry * static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde) { - atomic_inc(&pde->count); + refcount_inc(&pde->refcnt); return pde; } extern void pde_put(struct proc_dir_entry *); @@ -177,12 +187,12 @@ struct pde_opener { struct list_head lh; bool closing; struct completion *c; -}; +} __randomize_layout; extern const struct inode_operations proc_link_inode_operations; extern const struct inode_operations proc_pid_link_inode_operations; -extern void proc_init_inodecache(void); +void proc_init_kmemcache(void); void set_proc_pid_nlink(void); extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); extern int proc_fill_super(struct super_block *, void *data, int flags); diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 6bb20f864259..65a72ab57471 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -26,20 +26,7 @@ void __attribute__((weak)) arch_report_meminfo(struct seq_file *m) static void show_val_kb(struct seq_file *m, const char *s, unsigned long num) { - char v[32]; - static const char blanks[7] = {' ', ' ', ' ', ' ',' ', ' ', ' '}; - int len; - - len = num_to_str(v, sizeof(v), num << (PAGE_SHIFT - 10)); - - seq_write(m, s, 16); - - if (len > 0) { - if (len < 8) - seq_write(m, blanks, 8 - len); - - seq_write(m, v, len); - } + seq_put_decimal_ull_width(m, s, num 
<< (PAGE_SHIFT - 10), 8); seq_write(m, " kB\n", 4); } diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 68c06ae7888c..1763f370489d 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -192,15 +192,16 @@ static __net_init int proc_net_ns_init(struct net *net) int err; err = -ENOMEM; - netd = kzalloc(sizeof(*netd) + 4, GFP_KERNEL); + netd = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL); if (!netd) goto out; - netd->subdir = RB_ROOT_CACHED; + netd->subdir = RB_ROOT; netd->data = net; netd->nlink = 2; netd->namelen = 3; netd->parent = &proc_root; + netd->name = netd->inline_name; memcpy(netd->name, "net", 4); uid = make_kuid(net->user_ns, 0); @@ -223,7 +224,7 @@ static __net_init int proc_net_ns_init(struct net *net) return 0; free_net: - kfree(netd); + pde_free(netd); out: return err; } @@ -231,7 +232,7 @@ out: static __net_exit void proc_net_ns_exit(struct net *net) { remove_proc_entry("stat", net->proc_net); - kfree(net->proc_net); + pde_free(net->proc_net); } static struct pernet_operations __net_initdata proc_net_ns_ops = { diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index c41ab261397d..8989936f2995 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -707,14 +707,14 @@ static bool proc_sys_link_fill_cache(struct file *file, struct ctl_table *table) { bool ret = true; + head = sysctl_head_grab(head); + if (IS_ERR(head)) + return false; - if (S_ISLNK(table->mode)) { - /* It is not an error if we can not follow the link ignore it */ - int err = sysctl_follow_link(&head, &table); - if (err) - goto out; - } + /* It is not an error if we can not follow the link ignore it */ + if (sysctl_follow_link(&head, &table)) + goto out; ret = proc_sys_fill_cache(file, ctx, head, table); out: @@ -1086,7 +1086,7 @@ static int sysctl_check_table_array(const char *path, struct ctl_table *table) if ((table->proc_handler == proc_douintvec) || (table->proc_handler == proc_douintvec_minmax)) { if (table->maxlen != sizeof(unsigned int)) - 
err |= sysctl_err(path, table, "array now allowed"); + err |= sysctl_err(path, table, "array not allowed"); } return err; diff --git a/fs/proc/root.c b/fs/proc/root.c index ede8e64974be..61b7340b357a 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -123,23 +123,13 @@ static struct file_system_type proc_fs_type = { void __init proc_root_init(void) { - int err; - - proc_init_inodecache(); + proc_init_kmemcache(); set_proc_pid_nlink(); - err = register_filesystem(&proc_fs_type); - if (err) - return; - proc_self_init(); proc_thread_self_init(); proc_symlink("mounts", NULL, "self/mounts"); proc_net_init(); - -#ifdef CONFIG_SYSVIPC - proc_mkdir("sysvipc", NULL); -#endif proc_mkdir("fs", NULL); proc_mkdir("driver", NULL); proc_create_mount_point("fs/nfsd"); /* somewhere for the nfsd filesystem to be mounted */ @@ -150,6 +140,8 @@ void __init proc_root_init(void) proc_tty_init(); proc_mkdir("bus", NULL); proc_sys_init(); + + register_filesystem(&proc_fs_type); } static int proc_root_getattr(const struct path *path, struct kstat *stat, @@ -207,12 +199,13 @@ struct proc_dir_entry proc_root = { .namelen = 5, .mode = S_IFDIR | S_IRUGO | S_IXUGO, .nlink = 2, - .count = ATOMIC_INIT(1), + .refcnt = REFCOUNT_INIT(1), .proc_iops = &proc_root_inode_operations, .proc_fops = &proc_root_operations, .parent = &proc_root, - .subdir = RB_ROOT_CACHED, - .name = "/proc", + .subdir = RB_ROOT, + .name = proc_root.inline_name, + .inline_name = "/proc", }; int pid_ns_prepare_proc(struct pid_namespace *ns) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ec6d2983a5cb..65ae54659833 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -24,6 +24,8 @@ #include <asm/tlbflush.h> #include "internal.h" +#define SEQ_PUT_DEC(str, val) \ + seq_put_decimal_ull_width(m, str, (val) << (PAGE_SHIFT-10), 8) void task_mem(struct seq_file *m, struct mm_struct *mm) { unsigned long text, lib, swap, anon, file, shmem; @@ -53,39 +55,28 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) lib 
= (mm->exec_vm << PAGE_SHIFT) - text; swap = get_mm_counter(mm, MM_SWAPENTS); - seq_printf(m, - "VmPeak:\t%8lu kB\n" - "VmSize:\t%8lu kB\n" - "VmLck:\t%8lu kB\n" - "VmPin:\t%8lu kB\n" - "VmHWM:\t%8lu kB\n" - "VmRSS:\t%8lu kB\n" - "RssAnon:\t%8lu kB\n" - "RssFile:\t%8lu kB\n" - "RssShmem:\t%8lu kB\n" - "VmData:\t%8lu kB\n" - "VmStk:\t%8lu kB\n" - "VmExe:\t%8lu kB\n" - "VmLib:\t%8lu kB\n" - "VmPTE:\t%8lu kB\n" - "VmSwap:\t%8lu kB\n", - hiwater_vm << (PAGE_SHIFT-10), - total_vm << (PAGE_SHIFT-10), - mm->locked_vm << (PAGE_SHIFT-10), - mm->pinned_vm << (PAGE_SHIFT-10), - hiwater_rss << (PAGE_SHIFT-10), - total_rss << (PAGE_SHIFT-10), - anon << (PAGE_SHIFT-10), - file << (PAGE_SHIFT-10), - shmem << (PAGE_SHIFT-10), - mm->data_vm << (PAGE_SHIFT-10), - mm->stack_vm << (PAGE_SHIFT-10), - text >> 10, - lib >> 10, - mm_pgtables_bytes(mm) >> 10, - swap << (PAGE_SHIFT-10)); + SEQ_PUT_DEC("VmPeak:\t", hiwater_vm); + SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm); + SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm); + SEQ_PUT_DEC(" kB\nVmPin:\t", mm->pinned_vm); + SEQ_PUT_DEC(" kB\nVmHWM:\t", hiwater_rss); + SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss); + SEQ_PUT_DEC(" kB\nRssAnon:\t", anon); + SEQ_PUT_DEC(" kB\nRssFile:\t", file); + SEQ_PUT_DEC(" kB\nRssShmem:\t", shmem); + SEQ_PUT_DEC(" kB\nVmData:\t", mm->data_vm); + SEQ_PUT_DEC(" kB\nVmStk:\t", mm->stack_vm); + seq_put_decimal_ull_width(m, + " kB\nVmExe:\t", text >> 10, 8); + seq_put_decimal_ull_width(m, + " kB\nVmLib:\t", lib >> 10, 8); + seq_put_decimal_ull_width(m, + " kB\nVmPTE:\t", mm_pgtables_bytes(mm) >> 10, 8); + SEQ_PUT_DEC(" kB\nVmSwap:\t", swap); + seq_puts(m, " kB\n"); hugetlb_report_usage(m, mm); } +#undef SEQ_PUT_DEC unsigned long task_vsize(struct mm_struct *mm) { @@ -287,15 +278,18 @@ static void show_vma_header_prefix(struct seq_file *m, dev_t dev, unsigned long ino) { seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); - seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ", - start, - end, - flags & VM_READ ? 
'r' : '-', - flags & VM_WRITE ? 'w' : '-', - flags & VM_EXEC ? 'x' : '-', - flags & VM_MAYSHARE ? 's' : 'p', - pgoff, - MAJOR(dev), MINOR(dev), ino); + seq_put_hex_ll(m, NULL, start, 8); + seq_put_hex_ll(m, "-", end, 8); + seq_putc(m, ' '); + seq_putc(m, flags & VM_READ ? 'r' : '-'); + seq_putc(m, flags & VM_WRITE ? 'w' : '-'); + seq_putc(m, flags & VM_EXEC ? 'x' : '-'); + seq_putc(m, flags & VM_MAYSHARE ? 's' : 'p'); + seq_put_hex_ll(m, " ", pgoff, 8); + seq_put_hex_ll(m, " ", MAJOR(dev), 2); + seq_put_hex_ll(m, ":", MINOR(dev), 2); + seq_put_decimal_ull(m, " ", ino); + seq_putc(m, ' '); } static void @@ -694,8 +688,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) if (!mnemonics[i][0]) continue; if (vma->vm_flags & (1UL << i)) { - seq_printf(m, "%c%c ", - mnemonics[i][0], mnemonics[i][1]); + seq_putc(m, mnemonics[i][0]); + seq_putc(m, mnemonics[i][1]); + seq_putc(m, ' '); } } seq_putc(m, '\n'); @@ -736,6 +731,8 @@ void __weak arch_show_smap(struct seq_file *m, struct vm_area_struct *vma) { } +#define SEQ_PUT_DEC(str, val) \ + seq_put_decimal_ull_width(m, str, (val) >> 10, 8) static int show_smap(struct seq_file *m, void *v, int is_pid) { struct proc_maps_private *priv = m->private; @@ -809,51 +806,34 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) ret = SEQ_SKIP; } - if (!rollup_mode) - seq_printf(m, - "Size: %8lu kB\n" - "KernelPageSize: %8lu kB\n" - "MMUPageSize: %8lu kB\n", - (vma->vm_end - vma->vm_start) >> 10, - vma_kernel_pagesize(vma) >> 10, - vma_mmu_pagesize(vma) >> 10); - - - if (!rollup_mode || last_vma) - seq_printf(m, - "Rss: %8lu kB\n" - "Pss: %8lu kB\n" - "Shared_Clean: %8lu kB\n" - "Shared_Dirty: %8lu kB\n" - "Private_Clean: %8lu kB\n" - "Private_Dirty: %8lu kB\n" - "Referenced: %8lu kB\n" - "Anonymous: %8lu kB\n" - "LazyFree: %8lu kB\n" - "AnonHugePages: %8lu kB\n" - "ShmemPmdMapped: %8lu kB\n" - "Shared_Hugetlb: %8lu kB\n" - "Private_Hugetlb: %7lu kB\n" - "Swap: %8lu kB\n" - "SwapPss: %8lu 
kB\n" - "Locked: %8lu kB\n", - mss->resident >> 10, - (unsigned long)(mss->pss >> (10 + PSS_SHIFT)), - mss->shared_clean >> 10, - mss->shared_dirty >> 10, - mss->private_clean >> 10, - mss->private_dirty >> 10, - mss->referenced >> 10, - mss->anonymous >> 10, - mss->lazyfree >> 10, - mss->anonymous_thp >> 10, - mss->shmem_thp >> 10, - mss->shared_hugetlb >> 10, - mss->private_hugetlb >> 10, - mss->swap >> 10, - (unsigned long)(mss->swap_pss >> (10 + PSS_SHIFT)), - (unsigned long)(mss->pss >> (10 + PSS_SHIFT))); + if (!rollup_mode) { + SEQ_PUT_DEC("Size: ", vma->vm_end - vma->vm_start); + SEQ_PUT_DEC(" kB\nKernelPageSize: ", vma_kernel_pagesize(vma)); + SEQ_PUT_DEC(" kB\nMMUPageSize: ", vma_mmu_pagesize(vma)); + seq_puts(m, " kB\n"); + } + if (!rollup_mode || last_vma) { + SEQ_PUT_DEC("Rss: ", mss->resident); + SEQ_PUT_DEC(" kB\nPss: ", mss->pss >> PSS_SHIFT); + SEQ_PUT_DEC(" kB\nShared_Clean: ", mss->shared_clean); + SEQ_PUT_DEC(" kB\nShared_Dirty: ", mss->shared_dirty); + SEQ_PUT_DEC(" kB\nPrivate_Clean: ", mss->private_clean); + SEQ_PUT_DEC(" kB\nPrivate_Dirty: ", mss->private_dirty); + SEQ_PUT_DEC(" kB\nReferenced: ", mss->referenced); + SEQ_PUT_DEC(" kB\nAnonymous: ", mss->anonymous); + SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree); + SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp); + SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp); + SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb); + seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ", + mss->private_hugetlb >> 10, 7); + SEQ_PUT_DEC(" kB\nSwap: ", mss->swap); + SEQ_PUT_DEC(" kB\nSwapPss: ", + mss->swap_pss >> PSS_SHIFT); + SEQ_PUT_DEC(" kB\nLocked: ", mss->pss >> PSS_SHIFT); + seq_puts(m, " kB\n"); + } if (!rollup_mode) { arch_show_smap(m, vma); show_smap_vma_flags(m, vma); @@ -861,6 +841,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) m_cache_vma(m, vma); return ret; } +#undef SEQ_PUT_DEC static int show_pid_smap(struct seq_file *m, void *v) { diff --git 
a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 70057359fbaf..23148c3ed675 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2643,7 +2643,7 @@ static int journal_init_dev(struct super_block *super, if (IS_ERR(journal->j_dev_bd)) { result = PTR_ERR(journal->j_dev_bd); journal->j_dev_bd = NULL; - reiserfs_warning(super, + reiserfs_warning(super, "sh-457", "journal_init_dev: Cannot open '%s': %i", jdev_name, result); return result; diff --git a/fs/seq_file.c b/fs/seq_file.c index eea09f6d8830..c6c27f1f9c98 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -6,6 +6,7 @@ * initial implementation -- AV, Oct 2001. */ +#include <linux/cache.h> #include <linux/fs.h> #include <linux/export.h> #include <linux/seq_file.h> @@ -19,6 +20,8 @@ #include <linux/uaccess.h> #include <asm/page.h> +static struct kmem_cache *seq_file_cache __ro_after_init; + static void seq_set_overflow(struct seq_file *m) { m->count = m->size; @@ -26,7 +29,7 @@ static void seq_set_overflow(struct seq_file *m) static void *seq_buf_alloc(unsigned long size) { - return kvmalloc(size, GFP_KERNEL); + return kvmalloc(size, GFP_KERNEL_ACCOUNT); } /** @@ -51,7 +54,7 @@ int seq_open(struct file *file, const struct seq_operations *op) WARN_ON(file->private_data); - p = kzalloc(sizeof(*p), GFP_KERNEL); + p = kmem_cache_zalloc(seq_file_cache, GFP_KERNEL); if (!p) return -ENOMEM; @@ -366,7 +369,7 @@ int seq_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; kvfree(m->buf); - kfree(m); + kmem_cache_free(seq_file_cache, m); return 0; } EXPORT_SYMBOL(seq_release); @@ -563,7 +566,7 @@ static void single_stop(struct seq_file *p, void *v) int single_open(struct file *file, int (*show)(struct seq_file *, void *), void *data) { - struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL); + struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL_ACCOUNT); int res = -ENOMEM; if (op) { @@ -625,7 +628,7 @@ void *__seq_open_private(struct file *f, const struct 
seq_operations *ops, void *private; struct seq_file *seq; - private = kzalloc(psize, GFP_KERNEL); + private = kzalloc(psize, GFP_KERNEL_ACCOUNT); if (private == NULL) goto out; @@ -673,29 +676,37 @@ void seq_puts(struct seq_file *m, const char *s) } EXPORT_SYMBOL(seq_puts); -/* +/** * A helper routine for putting decimal numbers without rich format of printf(). * only 'unsigned long long' is supported. - * This routine will put strlen(delimiter) + number into seq_file. + * @m: seq_file identifying the buffer to which data should be written + * @delimiter: a string which is printed before the number + * @num: the number + * @width: a minimum field width + * + * This routine will put strlen(delimiter) + number into seq_filed. * This routine is very quick when you show lots of numbers. * In usual cases, it will be better to use seq_printf(). It's easier to read. */ -void seq_put_decimal_ull(struct seq_file *m, const char *delimiter, - unsigned long long num) +void seq_put_decimal_ull_width(struct seq_file *m, const char *delimiter, + unsigned long long num, unsigned int width) { int len; if (m->count + 2 >= m->size) /* we'll write 2 bytes at least */ goto overflow; - len = strlen(delimiter); - if (m->count + len >= m->size) - goto overflow; + if (delimiter && delimiter[0]) { + if (delimiter[1] == 0) + seq_putc(m, delimiter[0]); + else + seq_puts(m, delimiter); + } - memcpy(m->buf + m->count, delimiter, len); - m->count += len; + if (!width) + width = 1; - if (m->count + 1 >= m->size) + if (m->count + width >= m->size) goto overflow; if (num < 10) { @@ -703,7 +714,7 @@ void seq_put_decimal_ull(struct seq_file *m, const char *delimiter, return; } - len = num_to_str(m->buf + m->count, m->size - m->count, num); + len = num_to_str(m->buf + m->count, m->size - m->count, num, width); if (!len) goto overflow; @@ -713,8 +724,60 @@ void seq_put_decimal_ull(struct seq_file *m, const char *delimiter, overflow: seq_set_overflow(m); } + +void seq_put_decimal_ull(struct seq_file *m, 
const char *delimiter, + unsigned long long num) +{ + return seq_put_decimal_ull_width(m, delimiter, num, 0); +} EXPORT_SYMBOL(seq_put_decimal_ull); +/** + * seq_put_hex_ll - put a number in hexadecimal notation + * @m: seq_file identifying the buffer to which data should be written + * @delimiter: a string which is printed before the number + * @v: the number + * @width: a minimum field width + * + * seq_put_hex_ll(m, "", v, 8) is equal to seq_printf(m, "%08llx", v) + * + * This routine is very quick when you show lots of numbers. + * In usual cases, it will be better to use seq_printf(). It's easier to read. + */ +void seq_put_hex_ll(struct seq_file *m, const char *delimiter, + unsigned long long v, unsigned int width) +{ + unsigned int len; + int i; + + if (delimiter && delimiter[0]) { + if (delimiter[1] == 0) + seq_putc(m, delimiter[0]); + else + seq_puts(m, delimiter); + } + + /* If x is 0, the result of __builtin_clzll is undefined */ + if (v == 0) + len = 1; + else + len = (sizeof(v) * 8 - __builtin_clzll(v) + 3) / 4; + + if (len < width) + len = width; + + if (m->count + len > m->size) { + seq_set_overflow(m); + return; + } + + for (i = len - 1; i >= 0; i--) { + m->buf[m->count + i] = hex_asc[0xf & v]; + v = v >> 4; + } + m->count += len; +} + void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num) { int len; @@ -722,12 +785,12 @@ void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num if (m->count + 3 >= m->size) /* we'll write 2 bytes at least */ goto overflow; - len = strlen(delimiter); - if (m->count + len >= m->size) - goto overflow; - - memcpy(m->buf + m->count, delimiter, len); - m->count += len; + if (delimiter && delimiter[0]) { + if (delimiter[1] == 0) + seq_putc(m, delimiter[0]); + else + seq_puts(m, delimiter); + } if (m->count + 2 >= m->size) goto overflow; @@ -742,7 +805,7 @@ void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num return; } - len = num_to_str(m->buf + 
m->count, m->size - m->count, num); + len = num_to_str(m->buf + m->count, m->size - m->count, num, 0); if (!len) goto overflow; @@ -782,8 +845,14 @@ EXPORT_SYMBOL(seq_write); void seq_pad(struct seq_file *m, char c) { int size = m->pad_until - m->count; - if (size > 0) - seq_printf(m, "%*s", size, ""); + if (size > 0) { + if (size + m->count > m->size) { + seq_set_overflow(m); + return; + } + memset(m->buf + m->count, ' ', size); + m->count += size; + } if (c) seq_putc(m, c); } @@ -1040,3 +1109,8 @@ seq_hlist_next_percpu(void *v, struct hlist_head __percpu *head, return NULL; } EXPORT_SYMBOL(seq_hlist_next_percpu); + +void __init seq_file_init(void) +{ + seq_file_cache = KMEM_CACHE(seq_file, SLAB_ACCOUNT|SLAB_PANIC); +} diff --git a/fs/super.c b/fs/super.c index 672538ca9831..5fa9a8d8d865 100644 --- a/fs/super.c +++ b/fs/super.c @@ -37,6 +37,7 @@ #include <linux/user_namespace.h> #include "internal.h" +static int thaw_super_locked(struct super_block *sb); static LIST_HEAD(super_blocks); static DEFINE_SPINLOCK(sb_lock); @@ -574,6 +575,28 @@ void drop_super_exclusive(struct super_block *sb) } EXPORT_SYMBOL(drop_super_exclusive); +static void __iterate_supers(void (*f)(struct super_block *)) +{ + struct super_block *sb, *p = NULL; + + spin_lock(&sb_lock); + list_for_each_entry(sb, &super_blocks, s_list) { + if (hlist_unhashed(&sb->s_instances)) + continue; + sb->s_count++; + spin_unlock(&sb_lock); + + f(sb); + + spin_lock(&sb_lock); + if (p) + __put_super(p); + p = sb; + } + if (p) + __put_super(p); + spin_unlock(&sb_lock); +} /** * iterate_supers - call function for all active superblocks * @f: function to call @@ -881,33 +904,22 @@ cancel_readonly: return retval; } -static void do_emergency_remount(struct work_struct *work) +static void do_emergency_remount_callback(struct super_block *sb) { - struct super_block *sb, *p = NULL; - - spin_lock(&sb_lock); - list_for_each_entry(sb, &super_blocks, s_list) { - if (hlist_unhashed(&sb->s_instances)) - continue; - 
sb->s_count++; - spin_unlock(&sb_lock); - down_write(&sb->s_umount); - if (sb->s_root && sb->s_bdev && (sb->s_flags & SB_BORN) && - !sb_rdonly(sb)) { - /* - * What lock protects sb->s_flags?? - */ - do_remount_sb(sb, SB_RDONLY, NULL, 1); - } - up_write(&sb->s_umount); - spin_lock(&sb_lock); - if (p) - __put_super(p); - p = sb; + down_write(&sb->s_umount); + if (sb->s_root && sb->s_bdev && (sb->s_flags & SB_BORN) && + !sb_rdonly(sb)) { + /* + * What lock protects sb->s_flags?? + */ + do_remount_sb(sb, SB_RDONLY, NULL, 1); } - if (p) - __put_super(p); - spin_unlock(&sb_lock); + up_write(&sb->s_umount); +} + +static void do_emergency_remount(struct work_struct *work) +{ + __iterate_supers(do_emergency_remount_callback); kfree(work); printk("Emergency Remount complete\n"); } @@ -923,6 +935,40 @@ void emergency_remount(void) } } +static void do_thaw_all_callback(struct super_block *sb) +{ + down_write(&sb->s_umount); + if (sb->s_root && sb->s_flags & MS_BORN) { + emergency_thaw_bdev(sb); + thaw_super_locked(sb); + } else { + up_write(&sb->s_umount); + } +} + +static void do_thaw_all(struct work_struct *work) +{ + __iterate_supers(do_thaw_all_callback); + kfree(work); + printk(KERN_WARNING "Emergency Thaw complete\n"); +} + +/** + * emergency_thaw_all -- forcibly thaw every frozen filesystem + * + * Used for emergency unfreeze of all filesystems via SysRq + */ +void emergency_thaw_all(void) +{ + struct work_struct *work; + + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (work) { + INIT_WORK(work, do_thaw_all); + schedule_work(work); + } +} + /* * Unnamed block devices are dummy devices used by virtual * filesystems which don't use real block-devices. -- jrs @@ -1492,11 +1538,10 @@ EXPORT_SYMBOL(freeze_super); * * Unlocks the filesystem and marks it writeable again after freeze_super(). 
*/ -int thaw_super(struct super_block *sb) +static int thaw_super_locked(struct super_block *sb) { int error; - down_write(&sb->s_umount); if (sb->s_writers.frozen != SB_FREEZE_COMPLETE) { up_write(&sb->s_umount); return -EINVAL; @@ -1527,4 +1572,10 @@ out: deactivate_locked_super(sb); return 0; } + +int thaw_super(struct super_block *sb) +{ + down_write(&sb->s_umount); + return thaw_super_locked(sb); +} EXPORT_SYMBOL(thaw_super); diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c index 2dcf3d473fec..9571616b5dda 100644 --- a/fs/ubifs/find.c +++ b/fs/ubifs/find.c @@ -632,7 +632,7 @@ static int scan_for_idx_cb(struct ubifs_info *c, */ static const struct ubifs_lprops *scan_for_leb_for_idx(struct ubifs_info *c) { - struct ubifs_lprops *lprops; + const struct ubifs_lprops *lprops; struct scan_data data; int err; diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c index 6c3a1abd0e22..f5a46844340c 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c @@ -244,7 +244,6 @@ static void remove_from_lpt_heap(struct ubifs_info *c, /** * lpt_heap_replace - replace lprops in a category heap. * @c: UBIFS file-system description object - * @old_lprops: LEB properties to replace * @new_lprops: LEB properties with which to replace * @cat: LEB category * @@ -254,7 +253,6 @@ static void remove_from_lpt_heap(struct ubifs_info *c, * lprops. This function does that. 
*/ static void lpt_heap_replace(struct ubifs_info *c, - struct ubifs_lprops *old_lprops, struct ubifs_lprops *new_lprops, int cat) { struct ubifs_lpt_heap *heap; @@ -362,7 +360,7 @@ void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops, case LPROPS_DIRTY: case LPROPS_DIRTY_IDX: case LPROPS_FREE: - lpt_heap_replace(c, old_lprops, new_lprops, cat); + lpt_heap_replace(c, new_lprops, cat); break; case LPROPS_UNCAT: case LPROPS_EMPTY: diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c index aab87340d3de..16f03d9929e5 100644 --- a/fs/ubifs/scan.c +++ b/fs/ubifs/scan.c @@ -175,7 +175,6 @@ struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum, void ubifs_end_scan(const struct ubifs_info *c, struct ubifs_scan_leb *sleb, int lnum, int offs) { - lnum = lnum; dbg_scan("stop scanning LEB %d at offset %d", lnum, offs); ubifs_assert(offs % c->min_io_size == 0); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index b16ef162344a..6c397a389105 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1737,8 +1737,11 @@ static void ubifs_remount_ro(struct ubifs_info *c) dbg_save_space_info(c); - for (i = 0; i < c->jhead_cnt; i++) - ubifs_wbuf_sync(&c->jheads[i].wbuf); + for (i = 0; i < c->jhead_cnt; i++) { + err = ubifs_wbuf_sync(&c->jheads[i].wbuf); + if (err) + ubifs_ro_mode(c, err); + } c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); @@ -1804,8 +1807,11 @@ static void ubifs_put_super(struct super_block *sb) int err; /* Synchronize write-buffers */ - for (i = 0; i < c->jhead_cnt; i++) - ubifs_wbuf_sync(&c->jheads[i].wbuf); + for (i = 0; i < c->jhead_cnt; i++) { + err = ubifs_wbuf_sync(&c->jheads[i].wbuf); + if (err) + ubifs_ro_mode(c, err); + } /* * We are being cleanly unmounted which means the diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 39387bdd225d..4bcc095fe44a 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -1947,7 +1947,7 
@@ void xfs_alloc_compute_maxlevels( xfs_mount_t *mp) /* file system mount structure */ { - mp->m_ag_maxlevels = xfs_btree_compute_maxlevels(mp, mp->m_alloc_mnr, + mp->m_ag_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr, (mp->m_sb.sb_agblocks + 1) / 2); } @@ -1959,7 +1959,6 @@ xfs_alloc_compute_maxlevels( */ xfs_extlen_t xfs_alloc_longest_free_extent( - struct xfs_mount *mp, struct xfs_perag *pag, xfs_extlen_t need, xfs_extlen_t reserved) @@ -2038,8 +2037,7 @@ xfs_alloc_space_available( /* do we have enough contiguous free space for the allocation? */ alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop; - longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free, - reservation); + longest = xfs_alloc_longest_free_extent(pag, min_free, reservation); if (longest < alloc_len) return false; diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index a311a2414a6b..cbf789ea5a4e 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -116,9 +116,8 @@ xfs_alloc_allow_busy_reuse(int datatype) unsigned int xfs_alloc_set_aside(struct xfs_mount *mp); unsigned int xfs_alloc_ag_max_usable(struct xfs_mount *mp); -xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp, - struct xfs_perag *pag, xfs_extlen_t need, - xfs_extlen_t reserved); +xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_perag *pag, + xfs_extlen_t need, xfs_extlen_t reserved); unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp, struct xfs_perag *pag); diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 3b03d886df66..6a7c2f03ea11 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -3225,7 +3225,7 @@ xfs_bmap_longest_free_extent( } } - longest = xfs_alloc_longest_free_extent(mp, pag, + longest = xfs_alloc_longest_free_extent(pag, xfs_alloc_min_freelist(mp, pag), xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE)); if (*blen < longest) @@ -5667,7 +5667,6 @@ xfs_bmap_collapse_extents( xfs_fileoff_t 
*next_fsb, xfs_fileoff_t offset_shift_fsb, bool *done, - xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock, struct xfs_defer_ops *dfops) { diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index f3be6416260b..2b766b37096d 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -228,7 +228,7 @@ void xfs_bmap_del_extent_cow(struct xfs_inode *ip, uint xfs_default_attroffset(struct xfs_inode *ip); int xfs_bmap_collapse_extents(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, - bool *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock, + bool *done, xfs_fsblock_t *firstblock, struct xfs_defer_ops *dfops); int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index edc0193358a5..ac7d66427e42 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -4531,7 +4531,6 @@ xfs_btree_sblock_verify( */ uint xfs_btree_compute_maxlevels( - struct xfs_mount *mp, uint *limits, unsigned long len) { @@ -4839,7 +4838,6 @@ xfs_btree_query_all( */ xfs_extlen_t xfs_btree_calc_size( - struct xfs_mount *mp, uint *limits, unsigned long long len) { diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 58e30c0975c3..9227159a751e 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -481,10 +481,8 @@ xfs_failaddr_t xfs_btree_lblock_v5hdr_verify(struct xfs_buf *bp, xfs_failaddr_t xfs_btree_lblock_verify(struct xfs_buf *bp, unsigned int max_recs); -uint xfs_btree_compute_maxlevels(struct xfs_mount *mp, uint *limits, - unsigned long len); -xfs_extlen_t xfs_btree_calc_size(struct xfs_mount *mp, uint *limits, - unsigned long long len); +uint xfs_btree_compute_maxlevels(uint *limits, unsigned long len); +xfs_extlen_t xfs_btree_calc_size(uint *limits, unsigned long long len); /* return codes */ #define 
XFS_BTREE_QUERY_RANGE_CONTINUE 0 /* keep iterating */ diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 0e2cf5f0be1f..de627fa19168 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2406,7 +2406,7 @@ xfs_ialloc_compute_maxlevels( uint inodes; inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG; - mp->m_in_maxlevels = xfs_btree_compute_maxlevels(mp, mp->m_inobt_mnr, + mp->m_in_maxlevels = xfs_btree_compute_maxlevels(mp->m_inobt_mnr, inodes); } diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index a2dd7f4a2719..367e9a0726e6 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -556,7 +556,7 @@ xfs_inobt_max_size( if (mp->m_inobt_mxr[0] == 0) return 0; - return xfs_btree_calc_size(mp, mp->m_inobt_mnr, + return xfs_btree_calc_size(mp->m_inobt_mnr, (uint64_t)mp->m_sb.sb_agblocks * mp->m_sb.sb_inopblock / XFS_INODES_PER_CHUNK); } diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index bee68c23d612..560e28473024 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -351,7 +351,6 @@ xfs_refcount_merge_center_extents( struct xfs_refcount_irec *center, struct xfs_refcount_irec *right, unsigned long long extlen, - xfs_agblock_t *agbno, xfs_extlen_t *aglen) { int error; @@ -471,7 +470,6 @@ xfs_refcount_merge_right_extent( struct xfs_btree_cur *cur, struct xfs_refcount_irec *right, struct xfs_refcount_irec *cright, - xfs_agblock_t *agbno, xfs_extlen_t *aglen) { int error; @@ -749,7 +747,7 @@ xfs_refcount_merge_extents( ulen < MAXREFCEXTLEN) { *shape_changed = true; return xfs_refcount_merge_center_extents(cur, &left, &cleft, - &right, ulen, agbno, aglen); + &right, ulen, aglen); } /* Try to merge left and cleft. 
*/ @@ -778,7 +776,7 @@ xfs_refcount_merge_extents( ulen < MAXREFCEXTLEN) { *shape_changed = true; return xfs_refcount_merge_right_extent(cur, &right, &cright, - agbno, aglen); + aglen); } return error; @@ -1356,9 +1354,7 @@ xfs_refcount_adjust_cow_extents( struct xfs_btree_cur *cur, xfs_agblock_t agbno, xfs_extlen_t aglen, - enum xfs_refc_adjust_op adj, - struct xfs_defer_ops *dfops, - struct xfs_owner_info *oinfo) + enum xfs_refc_adjust_op adj) { struct xfs_refcount_irec ext, tmp; int error; @@ -1437,8 +1433,7 @@ xfs_refcount_adjust_cow( struct xfs_btree_cur *cur, xfs_agblock_t agbno, xfs_extlen_t aglen, - enum xfs_refc_adjust_op adj, - struct xfs_defer_ops *dfops) + enum xfs_refc_adjust_op adj) { bool shape_changed; int error; @@ -1465,8 +1460,7 @@ xfs_refcount_adjust_cow( goto out_error; /* Now that we've taken care of the ends, adjust the middle extents */ - error = xfs_refcount_adjust_cow_extents(cur, agbno, aglen, adj, - dfops, NULL); + error = xfs_refcount_adjust_cow_extents(cur, agbno, aglen, adj); if (error) goto out_error; @@ -1493,7 +1487,7 @@ __xfs_refcount_cow_alloc( /* Add refcount btree reservation */ return xfs_refcount_adjust_cow(rcur, agbno, aglen, - XFS_REFCOUNT_ADJUST_COW_ALLOC, dfops); + XFS_REFCOUNT_ADJUST_COW_ALLOC); } /* @@ -1511,7 +1505,7 @@ __xfs_refcount_cow_free( /* Remove refcount btree reservation */ return xfs_refcount_adjust_cow(rcur, agbno, aglen, - XFS_REFCOUNT_ADJUST_COW_FREE, dfops); + XFS_REFCOUNT_ADJUST_COW_FREE); } /* Record a CoW staging extent in the refcount btree. */ @@ -1568,7 +1562,7 @@ struct xfs_refcount_recovery { /* Stuff an extent on the recovery list. 
*/ STATIC int xfs_refcount_recover_extent( - struct xfs_btree_cur *cur, + struct xfs_btree_cur *cur, union xfs_btree_rec *rec, void *priv) { diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 265fdcefcbae..375abfeb6267 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -373,7 +373,6 @@ xfs_refcountbt_init_cursor( */ int xfs_refcountbt_maxrecs( - struct xfs_mount *mp, int blocklen, bool leaf) { @@ -390,7 +389,7 @@ void xfs_refcountbt_compute_maxlevels( struct xfs_mount *mp) { - mp->m_refc_maxlevels = xfs_btree_compute_maxlevels(mp, + mp->m_refc_maxlevels = xfs_btree_compute_maxlevels( mp->m_refc_mnr, mp->m_sb.sb_agblocks); } @@ -400,7 +399,7 @@ xfs_refcountbt_calc_size( struct xfs_mount *mp, unsigned long long len) { - return xfs_btree_calc_size(mp, mp->m_refc_mnr, len); + return xfs_btree_calc_size(mp->m_refc_mnr, len); } /* diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h index 9db008b955b7..2bc4694ef146 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.h +++ b/fs/xfs/libxfs/xfs_refcount_btree.h @@ -60,8 +60,7 @@ struct xfs_mount; extern struct xfs_btree_cur *xfs_refcountbt_init_cursor(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *agbp, xfs_agnumber_t agno, struct xfs_defer_ops *dfops); -extern int xfs_refcountbt_maxrecs(struct xfs_mount *mp, int blocklen, - bool leaf); +extern int xfs_refcountbt_maxrecs(int blocklen, bool leaf); extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp); extern xfs_extlen_t xfs_refcountbt_calc_size(struct xfs_mount *mp, diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 79822cf6ebe3..fba8d2718017 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -376,7 +376,6 @@ xfs_rmap_free_check_owner( struct xfs_mount *mp, uint64_t ltoff, struct xfs_rmap_irec *rec, - xfs_fsblock_t bno, xfs_filblks_t len, uint64_t owner, uint64_t offset, @@ -519,7 +518,7 @@ 
xfs_rmap_unmap( bno + len, out_error); /* Check owner information. */ - error = xfs_rmap_free_check_owner(mp, ltoff, &ltrec, bno, len, owner, + error = xfs_rmap_free_check_owner(mp, ltoff, &ltrec, len, owner, offset, flags); if (error) goto out_error; diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 8b0d0de1cd11..d756e0b84abf 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -499,7 +499,6 @@ xfs_rmapbt_init_cursor( */ int xfs_rmapbt_maxrecs( - struct xfs_mount *mp, int blocklen, int leaf) { @@ -534,7 +533,7 @@ xfs_rmapbt_compute_maxlevels( if (xfs_sb_version_hasreflink(&mp->m_sb)) mp->m_rmap_maxlevels = XFS_BTREE_MAXLEVELS; else - mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(mp, + mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels( mp->m_rmap_mnr, mp->m_sb.sb_agblocks); } @@ -544,7 +543,7 @@ xfs_rmapbt_calc_size( struct xfs_mount *mp, unsigned long long len) { - return xfs_btree_calc_size(mp, mp->m_rmap_mnr, len); + return xfs_btree_calc_size(mp->m_rmap_mnr, len); } /* diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h index 19c08e933049..d68d96eed7ea 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.h +++ b/fs/xfs/libxfs/xfs_rmap_btree.h @@ -55,7 +55,7 @@ struct xfs_mount; struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *bp, xfs_agnumber_t agno); -int xfs_rmapbt_maxrecs(struct xfs_mount *mp, int blocklen, int leaf); +int xfs_rmapbt_maxrecs(int blocklen, int leaf); extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp); extern xfs_extlen_t xfs_rmapbt_calc_size(struct xfs_mount *mp, diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 53433cc024fd..d9b94bd5f689 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -756,15 +756,13 @@ xfs_sb_mount_common( mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2; mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2; - mp->m_rmap_mxr[0] = xfs_rmapbt_maxrecs(mp, 
sbp->sb_blocksize, 1); - mp->m_rmap_mxr[1] = xfs_rmapbt_maxrecs(mp, sbp->sb_blocksize, 0); + mp->m_rmap_mxr[0] = xfs_rmapbt_maxrecs(sbp->sb_blocksize, 1); + mp->m_rmap_mxr[1] = xfs_rmapbt_maxrecs(sbp->sb_blocksize, 0); mp->m_rmap_mnr[0] = mp->m_rmap_mxr[0] / 2; mp->m_rmap_mnr[1] = mp->m_rmap_mxr[1] / 2; - mp->m_refc_mxr[0] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, - true); - mp->m_refc_mxr[1] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, - false); + mp->m_refc_mxr[0] = xfs_refcountbt_maxrecs(sbp->sb_blocksize, true); + mp->m_refc_mxr[1] = xfs_refcountbt_maxrecs(sbp->sb_blocksize, false); mp->m_refc_mnr[0] = mp->m_refc_mxr[0] / 2; mp->m_refc_mnr[1] = mp->m_refc_mxr[1] / 2; diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 5f17641f040f..3bccdf73e141 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -734,8 +734,7 @@ xfs_calc_clear_agi_bucket_reservation( * the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot) */ STATIC uint -xfs_calc_qm_setqlim_reservation( - struct xfs_mount *mp) +xfs_calc_qm_setqlim_reservation(void) { return xfs_calc_buf_res(1, sizeof(struct xfs_disk_dquot)); } @@ -772,8 +771,7 @@ xfs_calc_qm_quotaoff_reservation( * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2 */ STATIC uint -xfs_calc_qm_quotaoff_end_reservation( - struct xfs_mount *mp) +xfs_calc_qm_quotaoff_end_reservation(void) { return sizeof(struct xfs_qoff_logitem) * 2; } @@ -877,14 +875,14 @@ xfs_trans_resv_calc( * The following transactions are logged in logical format with * a default log count. 
*/ - resp->tr_qm_setqlim.tr_logres = xfs_calc_qm_setqlim_reservation(mp); + resp->tr_qm_setqlim.tr_logres = xfs_calc_qm_setqlim_reservation(); resp->tr_qm_setqlim.tr_logcount = XFS_DEFAULT_LOG_COUNT; resp->tr_qm_quotaoff.tr_logres = xfs_calc_qm_quotaoff_reservation(mp); resp->tr_qm_quotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT; resp->tr_qm_equotaoff.tr_logres = - xfs_calc_qm_quotaoff_end_reservation(mp); + xfs_calc_qm_quotaoff_end_reservation(); resp->tr_qm_equotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT; resp->tr_sb.tr_logres = xfs_calc_sb_reservation(mp); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 436a1de3fcdf..0ab824f574ed 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1467,19 +1467,8 @@ xfs_vm_set_page_dirty( newly_dirty = !TestSetPageDirty(page); spin_unlock(&mapping->private_lock); - if (newly_dirty) { - /* sigh - __set_page_dirty() is static, so copy it here, too */ - unsigned long flags; - - spin_lock_irqsave(&mapping->tree_lock, flags); - if (page->mapping) { /* Race with truncate? */ - WARN_ON_ONCE(!PageUptodate(page)); - account_page_dirtied(page, mapping); - radix_tree_tag_set(&mapping->page_tree, - page_index(page), PAGECACHE_TAG_DIRTY); - } - spin_unlock_irqrestore(&mapping->tree_lock, flags); - } + if (newly_dirty) + __set_page_dirty(page, mapping, 1); unlock_page_memcg(page); if (newly_dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index e5fb008d75e8..2203465e63ea 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -53,6 +53,25 @@ xfs_bui_item_free( kmem_zone_free(xfs_bui_zone, buip); } +/* + * Freeing the BUI requires that we remove it from the AIL if it has already + * been placed there. However, the BUI may not yet have been placed in the AIL + * when called by xfs_bui_release() from BUD processing due to the ordering of + * committed vs unpin operations in bulk insert operations. 
Hence the reference + * count to ensure only the last caller frees the BUI. + */ +void +xfs_bui_release( + struct xfs_bui_log_item *buip) +{ + ASSERT(atomic_read(&buip->bui_refcount) > 0); + if (atomic_dec_and_test(&buip->bui_refcount)) { + xfs_trans_ail_remove(&buip->bui_item, SHUTDOWN_LOG_IO_ERROR); + xfs_bui_item_free(buip); + } +} + + STATIC void xfs_bui_item_size( struct xfs_log_item *lip, @@ -142,7 +161,7 @@ xfs_bui_item_unlock( struct xfs_log_item *lip) { if (lip->li_flags & XFS_LI_ABORTED) - xfs_bui_item_free(BUI_ITEM(lip)); + xfs_bui_release(BUI_ITEM(lip)); } /* @@ -206,24 +225,6 @@ xfs_bui_init( return buip; } -/* - * Freeing the BUI requires that we remove it from the AIL if it has already - * been placed there. However, the BUI may not yet have been placed in the AIL - * when called by xfs_bui_release() from BUD processing due to the ordering of - * committed vs unpin operations in bulk insert operations. Hence the reference - * count to ensure only the last caller frees the BUI. 
- */ -void -xfs_bui_release( - struct xfs_bui_log_item *buip) -{ - ASSERT(atomic_read(&buip->bui_refcount) > 0); - if (atomic_dec_and_test(&buip->bui_refcount)) { - xfs_trans_ail_remove(&buip->bui_item, SHUTDOWN_LOG_IO_ERROR); - xfs_bui_item_free(buip); - } -} - static inline struct xfs_bud_log_item *BUD_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_bud_log_item, bud_item); diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 05dee8fdd895..8cd8c412f52d 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1326,7 +1326,6 @@ xfs_collapse_file_space( int error; struct xfs_defer_ops dfops; xfs_fsblock_t first_block; - xfs_fileoff_t stop_fsb = XFS_B_TO_FSB(mp, VFS_I(ip)->i_size); xfs_fileoff_t next_fsb = XFS_B_TO_FSB(mp, offset + len); xfs_fileoff_t shift_fsb = XFS_B_TO_FSB(mp, len); uint resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); @@ -1361,7 +1360,7 @@ xfs_collapse_file_space( xfs_defer_init(&dfops, &first_block); error = xfs_bmap_collapse_extents(tp, ip, &next_fsb, shift_fsb, - &done, stop_fsb, &first_block, &dfops); + &done, &first_block, &dfops); if (error) goto out_bmap_cancel; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index ac669a10c62f..55661cbdb51b 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1754,7 +1754,6 @@ xfs_buftarg_shrink_count( void xfs_free_buftarg( - struct xfs_mount *mp, struct xfs_buftarg *btp) { unregister_shrinker(&btp->bt_shrinker); diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 2f4c91452861..edced162a674 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -388,7 +388,7 @@ xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset) */ extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *, struct block_device *, struct dax_device *); -extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); +extern void xfs_free_buftarg(struct xfs_buftarg *); extern void xfs_wait_buftarg(xfs_buftarg_t *); extern int xfs_setsize_buftarg(xfs_buftarg_t *, 
unsigned int); diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index b2cde5426182..7b68e6c9a474 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -50,19 +50,19 @@ xfs_trim_extents( pag = xfs_perag_get(mp, agno); - error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); - if (error || !agbp) - goto out_put_perag; - - cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT); - /* * Force out the log. This means any transactions that might have freed - * space before we took the AGF buffer lock are now on disk, and the + * space before we take the AGF buffer lock are now on disk, and the * volatile disk cache is flushed. */ xfs_log_force(mp, XFS_LOG_SYNC); + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + if (error || !agbp) + goto out_put_perag; + + cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT); + /* * Look up the longest btree in the AGF and start with it. */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 64da90655e95..b5b1e567b9f4 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -51,6 +51,24 @@ xfs_efi_item_free( } /* + * Freeing the efi requires that we remove it from the AIL if it has already + * been placed there. However, the EFI may not yet have been placed in the AIL + * when called by xfs_efi_release() from EFD processing due to the ordering of + * committed vs unpin operations in bulk insert operations. Hence the reference + * count to ensure only the last caller frees the EFI. + */ +void +xfs_efi_release( + struct xfs_efi_log_item *efip) +{ + ASSERT(atomic_read(&efip->efi_refcount) > 0); + if (atomic_dec_and_test(&efip->efi_refcount)) { + xfs_trans_ail_remove(&efip->efi_item, SHUTDOWN_LOG_IO_ERROR); + xfs_efi_item_free(efip); + } +} + +/* * This returns the number of iovecs needed to log the given efi item. * We only need 1 iovec for an efi item. It just logs the efi_log_format * structure. 
@@ -151,7 +169,7 @@ xfs_efi_item_unlock( struct xfs_log_item *lip) { if (lip->li_flags & XFS_LI_ABORTED) - xfs_efi_item_free(EFI_ITEM(lip)); + xfs_efi_release(EFI_ITEM(lip)); } /* @@ -279,24 +297,6 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) return -EFSCORRUPTED; } -/* - * Freeing the efi requires that we remove it from the AIL if it has already - * been placed there. However, the EFI may not yet have been placed in the AIL - * when called by xfs_efi_release() from EFD processing due to the ordering of - * committed vs unpin operations in bulk insert operations. Hence the reference - * count to ensure only the last caller frees the EFI. - */ -void -xfs_efi_release( - struct xfs_efi_log_item *efip) -{ - ASSERT(atomic_read(&efip->efi_refcount) > 0); - if (atomic_dec_and_test(&efip->efi_refcount)) { - xfs_trans_ail_remove(&efip->efi_item, SHUTDOWN_LOG_IO_ERROR); - xfs_efi_item_free(efip); - } -} - static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_efd_log_item, efd_item); diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 043ca3808ea2..3f8722e51dbe 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -34,7 +34,6 @@ struct xfs_fstrm_item { struct xfs_mru_cache_elem mru; - struct xfs_inode *ip; xfs_agnumber_t ag; /* AG in use for this directory */ }; @@ -122,14 +121,15 @@ xfs_filestream_put_ag( static void xfs_fstrm_free_func( + void *data, struct xfs_mru_cache_elem *mru) { + struct xfs_mount *mp = data; struct xfs_fstrm_item *item = container_of(mru, struct xfs_fstrm_item, mru); - xfs_filestream_put_ag(item->ip->i_mount, item->ag); - - trace_xfs_filestream_free(item->ip, item->ag); + xfs_filestream_put_ag(mp, item->ag); + trace_xfs_filestream_free(mp, mru->key, item->ag); kmem_free(item); } @@ -165,7 +165,7 @@ xfs_filestream_pick_ag( trylock = XFS_ALLOC_FLAG_TRYLOCK; for (nscan = 0; 1; nscan++) { - trace_xfs_filestream_scan(ip, ag); + 
trace_xfs_filestream_scan(mp, ip->i_ino, ag); pag = xfs_perag_get(mp, ag); @@ -198,7 +198,7 @@ xfs_filestream_pick_ag( goto next_ag; } - longest = xfs_alloc_longest_free_extent(mp, pag, + longest = xfs_alloc_longest_free_extent(pag, xfs_alloc_min_freelist(mp, pag), xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE)); if (((minlen && longest >= minlen) || @@ -265,7 +265,6 @@ next_ag: goto out_put_ag; item->ag = *agp; - item->ip = ip; err = xfs_mru_cache_insert(mp->m_filestream, ip->i_ino, &item->mru); if (err) { @@ -333,7 +332,7 @@ xfs_filestream_lookup_ag( ag = container_of(mru, struct xfs_fstrm_item, mru)->ag; xfs_mru_cache_done(mp->m_filestream); - trace_xfs_filestream_lookup(ip, ag); + trace_xfs_filestream_lookup(mp, ip->i_ino, ag); goto out; } @@ -399,7 +398,7 @@ xfs_filestream_new_ag( * Only free the item here so we skip over the old AG earlier. */ if (mru) - xfs_fstrm_free_func(mru); + xfs_fstrm_free_func(mp, mru); IRELE(pip); exit: @@ -426,8 +425,8 @@ xfs_filestream_mount( * timer tunable to within about 10 percent. This requires at least 10 * groups. */ - return xfs_mru_cache_create(&mp->m_filestream, xfs_fstrm_centisecs * 10, - 10, xfs_fstrm_free_func); + return xfs_mru_cache_create(&mp->m_filestream, mp, + xfs_fstrm_centisecs * 10, 10, xfs_fstrm_free_func); } void diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 3e3aab3888fa..2b70c8b4cee2 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -972,10 +972,8 @@ xfs_dir_ialloc( xfs_nlink_t nlink, dev_t rdev, prid_t prid, /* project id */ - xfs_inode_t **ipp, /* pointer to inode; it will be + xfs_inode_t **ipp) /* pointer to inode; it will be locked. */ - int *committed) - { xfs_trans_t *tp; xfs_inode_t *ip; @@ -1050,8 +1048,6 @@ xfs_dir_ialloc( } code = xfs_trans_roll(&tp); - if (committed != NULL) - *committed = 1; /* * Re-attach the quota info that we detached from prev trx. 
@@ -1088,9 +1084,6 @@ xfs_dir_ialloc( } ASSERT(!ialloc_context && ip); - } else { - if (committed != NULL) - *committed = 0; } *ipp = ip; @@ -1217,8 +1210,7 @@ xfs_create( * entry pointing to them, but a directory also the "." entry * pointing to itself. */ - error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, prid, &ip, - NULL); + error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, prid, &ip); if (error) goto out_trans_cancel; @@ -1309,7 +1301,6 @@ xfs_create( int xfs_create_tmpfile( struct xfs_inode *dp, - struct dentry *dentry, umode_t mode, struct xfs_inode **ipp) { @@ -1351,7 +1342,7 @@ xfs_create_tmpfile( if (error) goto out_trans_cancel; - error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, prid, &ip, NULL); + error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, prid, &ip); if (error) goto out_trans_cancel; @@ -1611,13 +1602,15 @@ xfs_itruncate_extents( goto out; } - /* Remove all pending CoW reservations. */ - error = xfs_reflink_cancel_cow_blocks(ip, &tp, first_unmap_block, - last_block, true); - if (error) - goto out; + if (whichfork == XFS_DATA_FORK) { + /* Remove all pending CoW reservations. 
*/ + error = xfs_reflink_cancel_cow_blocks(ip, &tp, + first_unmap_block, last_block, true); + if (error) + goto out; - xfs_itruncate_clear_reflink_flags(ip); + xfs_itruncate_clear_reflink_flags(ip); + } /* * Always re-log the inode so that our permanent transaction can keep @@ -2903,7 +2896,7 @@ xfs_rename_alloc_whiteout( struct xfs_inode *tmpfile; int error; - error = xfs_create_tmpfile(dp, NULL, S_IFCHR | WHITEOUT_MODE, &tmpfile); + error = xfs_create_tmpfile(dp, S_IFCHR | WHITEOUT_MODE, &tmpfile); if (error) return error; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 132d8aa2afc4..1eebc53df7d7 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -393,8 +393,8 @@ int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode **ipp, struct xfs_name *ci_name); int xfs_create(struct xfs_inode *dp, struct xfs_name *name, umode_t mode, dev_t rdev, struct xfs_inode **ipp); -int xfs_create_tmpfile(struct xfs_inode *dp, struct dentry *dentry, - umode_t mode, struct xfs_inode **ipp); +int xfs_create_tmpfile(struct xfs_inode *dp, umode_t mode, + struct xfs_inode **ipp); int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode *ip); int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, @@ -431,7 +431,7 @@ xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip); int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t, xfs_nlink_t, dev_t, prid_t, - struct xfs_inode **, int *); + struct xfs_inode **); /* from xfs_file.c */ enum xfs_prealloc_flags { diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 154725b1b813..a3ed3c811dfa 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -177,7 +177,7 @@ xfs_generic_create( if (!tmpfile) { error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); } else { - error = xfs_create_tmpfile(XFS_I(dir), dentry, mode, &ip); + error = xfs_create_tmpfile(XFS_I(dir), mode, &ip); } if (unlikely(error)) goto out_free_acl; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c 
index b9c9c848146b..2fcd9ed5d075 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -560,7 +560,6 @@ xfs_log_done( */ int xfs_log_notify( - struct xfs_mount *mp, struct xlog_in_core *iclog, xfs_log_callback_t *cb) { diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 7e2d62922a16..fa8ad31d587f 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -141,8 +141,7 @@ int xfs_log_mount_cancel(struct xfs_mount *); xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp); void xfs_log_space_wake(struct xfs_mount *mp); -int xfs_log_notify(struct xfs_mount *mp, - struct xlog_in_core *iclog, +int xfs_log_notify(struct xlog_in_core *iclog, struct xfs_log_callback *callback_entry); int xfs_log_release_iclog(struct xfs_mount *mp, struct xlog_in_core *iclog); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index cb376ac8a595..4668403b1741 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -848,7 +848,7 @@ restart: /* attach all the transactions w/ busy extents to iclog */ ctx->log_cb.cb_func = xlog_cil_committed; ctx->log_cb.cb_arg = ctx; - error = xfs_log_notify(log->l_mp, commit_iclog, &ctx->log_cb); + error = xfs_log_notify(commit_iclog, &ctx->log_cb); if (error) goto out_abort; diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index f8a674d7f092..70eea7ae2876 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -112,6 +112,7 @@ struct xfs_mru_cache { xfs_mru_cache_free_func_t free_func; /* Function pointer for freeing. */ struct delayed_work work; /* Workqueue data for reaping. 
*/ unsigned int queued; /* work has been queued */ + void *data; }; static struct workqueue_struct *xfs_mru_reap_wq; @@ -259,7 +260,7 @@ _xfs_mru_cache_clear_reap_list( list_for_each_entry_safe(elem, next, &tmp, list_node) { list_del_init(&elem->list_node); - mru->free_func(elem); + mru->free_func(mru->data, elem); } spin_lock(&mru->lock); @@ -326,6 +327,7 @@ xfs_mru_cache_uninit(void) int xfs_mru_cache_create( struct xfs_mru_cache **mrup, + void *data, unsigned int lifetime_ms, unsigned int grp_count, xfs_mru_cache_free_func_t free_func) @@ -369,7 +371,7 @@ xfs_mru_cache_create( mru->grp_time = grp_time; mru->free_func = free_func; - + mru->data = data; *mrup = mru; exit: @@ -492,7 +494,7 @@ xfs_mru_cache_delete( elem = xfs_mru_cache_remove(mru, key); if (elem) - mru->free_func(elem); + mru->free_func(mru->data, elem); } /* diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h index fb5245ba5ff7..b3f3fbdfcc47 100644 --- a/fs/xfs/xfs_mru_cache.h +++ b/fs/xfs/xfs_mru_cache.h @@ -26,13 +26,13 @@ struct xfs_mru_cache_elem { }; /* Function pointer type for callback to free a client's data pointer. 
*/ -typedef void (*xfs_mru_cache_free_func_t)(struct xfs_mru_cache_elem *elem); +typedef void (*xfs_mru_cache_free_func_t)(void *, struct xfs_mru_cache_elem *); int xfs_mru_cache_init(void); void xfs_mru_cache_uninit(void); -int xfs_mru_cache_create(struct xfs_mru_cache **mrup, unsigned int lifetime_ms, - unsigned int grp_count, - xfs_mru_cache_free_func_t free_func); +int xfs_mru_cache_create(struct xfs_mru_cache **mrup, void *data, + unsigned int lifetime_ms, unsigned int grp_count, + xfs_mru_cache_free_func_t free_func); void xfs_mru_cache_destroy(struct xfs_mru_cache *mru); int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key, struct xfs_mru_cache_elem *elem); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 5b848f4b637f..ec39ae274c78 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -748,7 +748,6 @@ xfs_qm_qino_alloc( { xfs_trans_t *tp; int error; - int committed; bool need_alloc = true; *ip = NULL; @@ -788,8 +787,7 @@ xfs_qm_qino_alloc( return error; if (need_alloc) { - error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, ip, - &committed); + error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, ip); if (error) { xfs_trans_cancel(tp); return error; diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 7a39f40645f7..15c9393dd7a7 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -52,6 +52,25 @@ xfs_cui_item_free( kmem_zone_free(xfs_cui_zone, cuip); } +/* + * Freeing the CUI requires that we remove it from the AIL if it has already + * been placed there. However, the CUI may not yet have been placed in the AIL + * when called by xfs_cui_release() from CUD processing due to the ordering of + * committed vs unpin operations in bulk insert operations. Hence the reference + * count to ensure only the last caller frees the CUI. 
+ */ +void +xfs_cui_release( + struct xfs_cui_log_item *cuip) +{ + ASSERT(atomic_read(&cuip->cui_refcount) > 0); + if (atomic_dec_and_test(&cuip->cui_refcount)) { + xfs_trans_ail_remove(&cuip->cui_item, SHUTDOWN_LOG_IO_ERROR); + xfs_cui_item_free(cuip); + } +} + + STATIC void xfs_cui_item_size( struct xfs_log_item *lip, @@ -141,7 +160,7 @@ xfs_cui_item_unlock( struct xfs_log_item *lip) { if (lip->li_flags & XFS_LI_ABORTED) - xfs_cui_item_free(CUI_ITEM(lip)); + xfs_cui_release(CUI_ITEM(lip)); } /* @@ -211,24 +230,6 @@ xfs_cui_init( return cuip; } -/* - * Freeing the CUI requires that we remove it from the AIL if it has already - * been placed there. However, the CUI may not yet have been placed in the AIL - * when called by xfs_cui_release() from CUD processing due to the ordering of - * committed vs unpin operations in bulk insert operations. Hence the reference - * count to ensure only the last caller frees the CUI. - */ -void -xfs_cui_release( - struct xfs_cui_log_item *cuip) -{ - ASSERT(atomic_read(&cuip->cui_refcount) > 0); - if (atomic_dec_and_test(&cuip->cui_refcount)) { - xfs_trans_ail_remove(&cuip->cui_item, SHUTDOWN_LOG_IO_ERROR); - xfs_cui_item_free(cuip); - } -} - static inline struct xfs_cud_log_item *CUD_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_cud_log_item, cud_item); diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 49d3124863a8..06a07846c9b3 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -52,6 +52,24 @@ xfs_rui_item_free( kmem_zone_free(xfs_rui_zone, ruip); } +/* + * Freeing the RUI requires that we remove it from the AIL if it has already + * been placed there. However, the RUI may not yet have been placed in the AIL + * when called by xfs_rui_release() from RUD processing due to the ordering of + * committed vs unpin operations in bulk insert operations. Hence the reference + * count to ensure only the last caller frees the RUI. 
+ */ +void +xfs_rui_release( + struct xfs_rui_log_item *ruip) +{ + ASSERT(atomic_read(&ruip->rui_refcount) > 0); + if (atomic_dec_and_test(&ruip->rui_refcount)) { + xfs_trans_ail_remove(&ruip->rui_item, SHUTDOWN_LOG_IO_ERROR); + xfs_rui_item_free(ruip); + } +} + STATIC void xfs_rui_item_size( struct xfs_log_item *lip, @@ -141,7 +159,7 @@ xfs_rui_item_unlock( struct xfs_log_item *lip) { if (lip->li_flags & XFS_LI_ABORTED) - xfs_rui_item_free(RUI_ITEM(lip)); + xfs_rui_release(RUI_ITEM(lip)); } /* @@ -233,24 +251,6 @@ xfs_rui_copy_format( return 0; } -/* - * Freeing the RUI requires that we remove it from the AIL if it has already - * been placed there. However, the RUI may not yet have been placed in the AIL - * when called by xfs_rui_release() from RUD processing due to the ordering of - * committed vs unpin operations in bulk insert operations. Hence the reference - * count to ensure only the last caller frees the RUI. - */ -void -xfs_rui_release( - struct xfs_rui_log_item *ruip) -{ - ASSERT(atomic_read(&ruip->rui_refcount) > 0); - if (atomic_dec_and_test(&ruip->rui_refcount)) { - xfs_trans_ail_remove(&ruip->rui_item, SHUTDOWN_LOG_IO_ERROR); - xfs_rui_item_free(ruip); - } -} - static inline struct xfs_rud_log_item *RUD_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_rud_log_item, rud_item); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 612c1d5348b3..d71424052917 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -722,7 +722,7 @@ xfs_close_devices( struct block_device *logdev = mp->m_logdev_targp->bt_bdev; struct dax_device *dax_logdev = mp->m_logdev_targp->bt_daxdev; - xfs_free_buftarg(mp, mp->m_logdev_targp); + xfs_free_buftarg(mp->m_logdev_targp); xfs_blkdev_put(logdev); fs_put_dax(dax_logdev); } @@ -730,11 +730,11 @@ xfs_close_devices( struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev; struct dax_device *dax_rtdev = mp->m_rtdev_targp->bt_daxdev; - xfs_free_buftarg(mp, mp->m_rtdev_targp); + 
xfs_free_buftarg(mp->m_rtdev_targp); xfs_blkdev_put(rtdev); fs_put_dax(dax_rtdev); } - xfs_free_buftarg(mp, mp->m_ddev_targp); + xfs_free_buftarg(mp->m_ddev_targp); fs_put_dax(dax_ddev); } @@ -808,9 +808,9 @@ xfs_open_devices( out_free_rtdev_targ: if (mp->m_rtdev_targp) - xfs_free_buftarg(mp, mp->m_rtdev_targp); + xfs_free_buftarg(mp->m_rtdev_targp); out_free_ddev_targ: - xfs_free_buftarg(mp, mp->m_ddev_targp); + xfs_free_buftarg(mp->m_ddev_targp); out_close_rtdev: xfs_blkdev_put(rtdev); fs_put_dax(dax_rtdev); @@ -1247,7 +1247,6 @@ xfs_quiesce_attr( STATIC int xfs_test_remount_options( struct super_block *sb, - struct xfs_mount *mp, char *options) { int error = 0; @@ -1278,7 +1277,7 @@ xfs_fs_remount( int error; /* First, check for complete junk; i.e. invalid options */ - error = xfs_test_remount_options(sb, mp, options); + error = xfs_test_remount_options(sb, options); if (error) return error; diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 2e9e793a8f9d..5b66ac12913c 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -264,7 +264,7 @@ xfs_symlink( * Allocate an inode for the symlink. 
*/ error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0, - prid, &ip, NULL); + prid, &ip); if (error) goto out_trans_cancel; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index a982c0b623d0..8955254b900e 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -506,8 +506,8 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release); DEFINE_BUF_ITEM_EVENT(xfs_trans_binval); DECLARE_EVENT_CLASS(xfs_filestream_class, - TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno), - TP_ARGS(ip, agno), + TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino, xfs_agnumber_t agno), + TP_ARGS(mp, ino, agno), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) @@ -515,10 +515,10 @@ DECLARE_EVENT_CLASS(xfs_filestream_class, __field(int, streams) ), TP_fast_assign( - __entry->dev = VFS_I(ip)->i_sb->s_dev; - __entry->ino = ip->i_ino; + __entry->dev = mp->m_super->s_dev; + __entry->ino = ino; __entry->agno = agno; - __entry->streams = xfs_filestream_peek_ag(ip->i_mount, agno); + __entry->streams = xfs_filestream_peek_ag(mp, agno); ), TP_printk("dev %d:%d ino 0x%llx agno %u streams %d", MAJOR(__entry->dev), MINOR(__entry->dev), @@ -528,8 +528,8 @@ DECLARE_EVENT_CLASS(xfs_filestream_class, ) #define DEFINE_FILESTREAM_EVENT(name) \ DEFINE_EVENT(xfs_filestream_class, name, \ - TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno), \ - TP_ARGS(ip, agno)) + TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino, xfs_agnumber_t agno), \ + TP_ARGS(mp, ino, agno)) DEFINE_FILESTREAM_EVENT(xfs_filestream_free); DEFINE_FILESTREAM_EVENT(xfs_filestream_lookup); DEFINE_FILESTREAM_EVENT(xfs_filestream_scan); diff --git a/include/acpi/processor.h b/include/acpi/processor.h index d591bb77f592..40a916efd7c0 100644 --- a/include/acpi/processor.h +++ b/include/acpi/processor.h @@ -254,6 +254,8 @@ int acpi_processor_pstate_control(void); /* note: this locks both the calling module and the processor module if a _PPC object exists, rmmod is disallowed then */ int acpi_processor_notify_smm(struct module 
*calling_module); +int acpi_processor_get_psd(acpi_handle handle, + struct acpi_psd_package *pdomain); /* parsing the _P* objects. */ extern int acpi_processor_get_performance_info(struct acpi_processor *pr); diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h index 04c4cc6fd820..66d1d45fa2e1 100644 --- a/include/asm-generic/io.h +++ b/include/asm-generic/io.h @@ -25,6 +25,50 @@ #define mmiowb() do {} while (0) #endif +#ifndef __io_br +#define __io_br() barrier() +#endif + +/* prevent prefetching of coherent DMA data ahead of a dma-complete */ +#ifndef __io_ar +#ifdef rmb +#define __io_ar() rmb() +#else +#define __io_ar() barrier() +#endif +#endif + +/* flush writes to coherent DMA data before possibly triggering a DMA read */ +#ifndef __io_bw +#ifdef wmb +#define __io_bw() wmb() +#else +#define __io_bw() barrier() +#endif +#endif + +/* serialize device access against a spin_unlock, usually handled there. */ +#ifndef __io_aw +#define __io_aw() barrier() +#endif + +#ifndef __io_pbw +#define __io_pbw() __io_bw() +#endif + +#ifndef __io_paw +#define __io_paw() __io_aw() +#endif + +#ifndef __io_pbr +#define __io_pbr() __io_br() +#endif + +#ifndef __io_par +#define __io_par() __io_ar() +#endif + + /* * __raw_{read,write}{b,w,l,q}() access memory in native endianness. 
* @@ -110,7 +154,12 @@ static inline void __raw_writeq(u64 value, volatile void __iomem *addr) #define readb readb static inline u8 readb(const volatile void __iomem *addr) { - return __raw_readb(addr); + u8 val; + + __io_br(); + val = __raw_readb(addr); + __io_ar(); + return val; } #endif @@ -118,7 +167,12 @@ static inline u8 readb(const volatile void __iomem *addr) #define readw readw static inline u16 readw(const volatile void __iomem *addr) { - return __le16_to_cpu(__raw_readw(addr)); + u16 val; + + __io_br(); + val = __le16_to_cpu(__raw_readw(addr)); + __io_ar(); + return val; } #endif @@ -126,7 +180,12 @@ static inline u16 readw(const volatile void __iomem *addr) #define readl readl static inline u32 readl(const volatile void __iomem *addr) { - return __le32_to_cpu(__raw_readl(addr)); + u32 val; + + __io_br(); + val = __le32_to_cpu(__raw_readl(addr)); + __io_ar(); + return val; } #endif @@ -135,7 +194,12 @@ static inline u32 readl(const volatile void __iomem *addr) #define readq readq static inline u64 readq(const volatile void __iomem *addr) { - return __le64_to_cpu(__raw_readq(addr)); + u64 val; + + __io_br(); + val = __le64_to_cpu(__raw_readq(addr)); + __io_ar(); + return val; } #endif #endif /* CONFIG_64BIT */ @@ -144,7 +208,9 @@ static inline u64 readq(const volatile void __iomem *addr) #define writeb writeb static inline void writeb(u8 value, volatile void __iomem *addr) { + __io_bw(); __raw_writeb(value, addr); + __io_aw(); } #endif @@ -152,7 +218,9 @@ static inline void writeb(u8 value, volatile void __iomem *addr) #define writew writew static inline void writew(u16 value, volatile void __iomem *addr) { + __io_bw(); __raw_writew(cpu_to_le16(value), addr); + __io_aw(); } #endif @@ -160,7 +228,9 @@ static inline void writew(u16 value, volatile void __iomem *addr) #define writel writel static inline void writel(u32 value, volatile void __iomem *addr) { + __io_bw(); __raw_writel(__cpu_to_le32(value), addr); + __io_aw(); } #endif @@ -169,7 +239,9 @@ static 
inline void writel(u32 value, volatile void __iomem *addr) #define writeq writeq static inline void writeq(u64 value, volatile void __iomem *addr) { + __io_bw(); __raw_writeq(__cpu_to_le64(value), addr); + __io_aw(); } #endif #endif /* CONFIG_64BIT */ @@ -180,35 +252,67 @@ static inline void writeq(u64 value, volatile void __iomem *addr) * accesses. */ #ifndef readb_relaxed -#define readb_relaxed readb +#define readb_relaxed readb_relaxed +static inline u8 readb_relaxed(const volatile void __iomem *addr) +{ + return __raw_readb(addr); +} #endif #ifndef readw_relaxed -#define readw_relaxed readw +#define readw_relaxed readw_relaxed +static inline u16 readw_relaxed(const volatile void __iomem *addr) +{ + return __le16_to_cpu(__raw_readw(addr)); +} #endif #ifndef readl_relaxed -#define readl_relaxed readl +#define readl_relaxed readl_relaxed +static inline u32 readl_relaxed(const volatile void __iomem *addr) +{ + return __le32_to_cpu(__raw_readl(addr)); +} #endif #if defined(readq) && !defined(readq_relaxed) -#define readq_relaxed readq +#define readq_relaxed readq_relaxed +static inline u64 readq_relaxed(const volatile void __iomem *addr) +{ + return __le64_to_cpu(__raw_readq(addr)); +} #endif #ifndef writeb_relaxed -#define writeb_relaxed writeb +#define writeb_relaxed writeb_relaxed +static inline void writeb_relaxed(u8 value, volatile void __iomem *addr) +{ + __raw_writeb(value, addr); +} #endif #ifndef writew_relaxed -#define writew_relaxed writew +#define writew_relaxed writew_relaxed +static inline void writew_relaxed(u16 value, volatile void __iomem *addr) +{ + __raw_writew(cpu_to_le16(value), addr); +} #endif #ifndef writel_relaxed -#define writel_relaxed writel +#define writel_relaxed writel_relaxed +static inline void writel_relaxed(u32 value, volatile void __iomem *addr) +{ + __raw_writel(__cpu_to_le32(value), addr); +} #endif #if defined(writeq) && !defined(writeq_relaxed) -#define writeq_relaxed writeq +#define writeq_relaxed writeq_relaxed +static 
inline void writeq_relaxed(u64 value, volatile void __iomem *addr) +{ + __raw_writeq(__cpu_to_le64(value), addr); +} #endif /* @@ -363,7 +467,12 @@ static inline void writesq(volatile void __iomem *addr, const void *buffer, #define inb inb static inline u8 inb(unsigned long addr) { - return readb(PCI_IOBASE + addr); + u8 val; + + __io_pbr(); + val = __raw_readb(PCI_IOBASE + addr); + __io_par(); + return val; } #endif @@ -371,7 +480,12 @@ static inline u8 inb(unsigned long addr) #define inw inw static inline u16 inw(unsigned long addr) { - return readw(PCI_IOBASE + addr); + u16 val; + + __io_pbr(); + val = __le16_to_cpu(__raw_readw(PCI_IOBASE + addr)); + __io_par(); + return val; } #endif @@ -379,7 +493,12 @@ static inline u16 inw(unsigned long addr) #define inl inl static inline u32 inl(unsigned long addr) { - return readl(PCI_IOBASE + addr); + u32 val; + + __io_pbr(); + val = __le32_to_cpu(__raw_readl(PCI_IOBASE + addr)); + __io_par(); + return val; } #endif @@ -387,7 +506,9 @@ static inline u32 inl(unsigned long addr) #define outb outb static inline void outb(u8 value, unsigned long addr) { - writeb(value, PCI_IOBASE + addr); + __io_pbw(); + __raw_writeb(value, PCI_IOBASE + addr); + __io_paw(); } #endif @@ -395,7 +516,9 @@ static inline void outb(u8 value, unsigned long addr) #define outw outw static inline void outw(u16 value, unsigned long addr) { - writew(value, PCI_IOBASE + addr); + __io_pbw(); + __raw_writew(cpu_to_le16(value), PCI_IOBASE + addr); + __io_paw(); } #endif @@ -403,7 +526,9 @@ static inline void outw(u16 value, unsigned long addr) #define outl outl static inline void outl(u32 value, unsigned long addr) { - writel(value, PCI_IOBASE + addr); + __io_pbw(); + __raw_writel(cpu_to_le32(value), PCI_IOBASE + addr); + __io_paw(); } #endif diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h index 2f7a29242b87..38cd77b39a64 100644 --- a/include/linux/acpi_iort.h +++ b/include/linux/acpi_iort.h @@ -26,7 +26,8 @@ #define IORT_IRQ_MASK(irq) 
(irq & 0xffffffffULL) #define IORT_IRQ_TRIGGER_MASK(irq) ((irq >> 32) & 0xffffffffULL) -int iort_register_domain_token(int trans_id, struct fwnode_handle *fw_node); +int iort_register_domain_token(int trans_id, phys_addr_t base, + struct fwnode_handle *fw_node); void iort_deregister_domain_token(int trans_id); struct fwnode_handle *iort_find_domain_token(int trans_id); #ifdef CONFIG_ACPI_IORT @@ -38,6 +39,7 @@ int iort_pmsi_get_dev_id(struct device *dev, u32 *dev_id); /* IOMMU interface */ void iort_dma_setup(struct device *dev, u64 *dma_addr, u64 *size); const struct iommu_ops *iort_iommu_configure(struct device *dev); +int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head); #else static inline void acpi_iort_init(void) { } static inline u32 iort_msi_map_rid(struct device *dev, u32 req_id) @@ -52,6 +54,9 @@ static inline void iort_dma_setup(struct device *dev, u64 *dma_addr, static inline const struct iommu_ops *iort_iommu_configure( struct device *dev) { return NULL; } +static inline +int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head) +{ return 0; } #endif #endif /* __ACPI_IORT_H__ */ diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 3e4ce54d84ab..09da0f124699 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -175,7 +175,7 @@ static inline int wb_congested(struct bdi_writeback *wb, int cong_bits) } long congestion_wait(int sync, long timeout); -long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout); +long wait_iff_congested(int sync, long timeout); static inline bool bdi_cap_synchronous_io(struct backing_dev_info *bdi) { @@ -329,7 +329,7 @@ static inline bool inode_to_wb_is_valid(struct inode *inode) * @inode: inode of interest * * Returns the wb @inode is currently associated with. 
The caller must be - * holding either @inode->i_lock, @inode->i_mapping->tree_lock, or the + * holding either @inode->i_lock, the i_pages lock, or the * associated wb's list_lock. */ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode) @@ -337,7 +337,7 @@ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode) #ifdef CONFIG_LOCKDEP WARN_ON_ONCE(debug_locks && (!lockdep_is_held(&inode->i_lock) && - !lockdep_is_held(&inode->i_mapping->tree_lock) && + !lockdep_is_held(&inode->i_mapping->i_pages.xa_lock) && !lockdep_is_held(&inode->i_wb->list_lock))); #endif return inode->i_wb; @@ -349,7 +349,7 @@ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode) * @lockedp: temp bool output param, to be passed to the end function * * The caller wants to access the wb associated with @inode but isn't - * holding inode->i_lock, mapping->tree_lock or wb->list_lock. This + * holding inode->i_lock, the i_pages lock or wb->list_lock. This * function determines the wb associated with @inode and ensures that the * association doesn't change until the transaction is finished with * unlocked_inode_to_wb_end(). @@ -370,11 +370,11 @@ unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) *lockedp = smp_load_acquire(&inode->i_state) & I_WB_SWITCH; if (unlikely(*lockedp)) - spin_lock_irq(&inode->i_mapping->tree_lock); + xa_lock_irq(&inode->i_mapping->i_pages); /* - * Protected by either !I_WB_SWITCH + rcu_read_lock() or tree_lock. - * inode_to_wb() will bark. Deref directly. + * Protected by either !I_WB_SWITCH + rcu_read_lock() or the i_pages + * lock. inode_to_wb() will bark. Deref directly. 
*/ return inode->i_wb; } @@ -387,7 +387,7 @@ unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked) { if (unlikely(locked)) - spin_unlock_irq(&inode->i_mapping->tree_lock); + xa_unlock_irq(&inode->i_mapping->i_pages); rcu_read_unlock(); } diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index b0abe21d6cc9..4955e0863b83 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -61,6 +61,8 @@ struct linux_binprm { unsigned interp_flags; unsigned interp_data; unsigned long loader, exec; + + struct rlimit rlim_stack; /* Saved RLIMIT_STACK used during exec. */ } __randomize_layout; #define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0 @@ -118,6 +120,7 @@ extern int __must_check remove_arg_zero(struct linux_binprm *); extern int search_binary_handler(struct linux_binprm *); extern int flush_old_exec(struct linux_binprm * bprm); extern void setup_new_exec(struct linux_binprm * bprm); +extern void finalize_exec(struct linux_binprm *bprm); extern void would_dump(struct linux_binprm *, struct file *); extern int suid_dumpable; diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index d3f264a5b04d..ceb96ecab96e 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -17,9 +17,6 @@ */ #define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) -#define randomized_struct_fields_start struct { -#define randomized_struct_fields_end }; - /* all clang versions usable with the kernel support KASAN ABI version 5 */ #define KASAN_ABI_VERSION 5 diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index e2c7f4369eff..b4bf73f5e38f 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -242,6 +242,9 @@ #if defined(RANDSTRUCT_PLUGIN) && !defined(__CHECKER__) #define __randomize_layout __attribute__((randomize_layout)) #define __no_randomize_layout 
__attribute__((no_randomize_layout)) +/* This anon struct can add padding, so only enable it under randstruct. */ +#define randomized_struct_fields_start struct { +#define randomized_struct_fields_end } __randomize_layout; #endif #endif /* GCC_VERSION >= 40500 */ @@ -256,15 +259,6 @@ */ #define __visible __attribute__((externally_visible)) -/* - * RANDSTRUCT_PLUGIN wants to use an anonymous struct, but it is only - * possible since GCC 4.6. To provide as much build testing coverage - * as possible, this is used for all GCC 4.6+ builds, and not just on - * RANDSTRUCT_PLUGIN builds. - */ -#define randomized_struct_fields_start struct { -#define randomized_struct_fields_end } __randomize_layout; - #endif /* GCC_VERSION >= 40600 */ diff --git a/include/linux/const.h b/include/linux/const.h new file mode 100644 index 000000000000..7b55a55f5911 --- /dev/null +++ b/include/linux/const.h @@ -0,0 +1,9 @@ +#ifndef _LINUX_CONST_H +#define _LINUX_CONST_H + +#include <uapi/linux/const.h> + +#define UL(x) (_UL(x)) +#define ULL(x) (_ULL(x)) + +#endif /* _LINUX_CONST_H */ diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 1fe49724da9e..87f48dd932eb 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -960,8 +960,6 @@ extern void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq, extern struct freq_attr cpufreq_freq_attr_scaling_available_freqs; extern struct freq_attr cpufreq_freq_attr_scaling_boost_freqs; extern struct freq_attr *cpufreq_generic_attr[]; -int cpufreq_table_validate_and_show(struct cpufreq_policy *policy, - struct cpufreq_frequency_table *table); int cpufreq_table_validate_and_sort(struct cpufreq_policy *policy); unsigned int cpufreq_generic_get(unsigned int cpu); diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index a806e94c482f..1eefabf1621f 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -135,7 +135,8 @@ extern bool cpuidle_not_available(struct cpuidle_driver *drv, struct 
cpuidle_device *dev); extern int cpuidle_select(struct cpuidle_driver *drv, - struct cpuidle_device *dev); + struct cpuidle_device *dev, + bool *stop_tick); extern int cpuidle_enter(struct cpuidle_driver *drv, struct cpuidle_device *dev, int index); extern void cpuidle_reflect(struct cpuidle_device *dev, int index); @@ -167,7 +168,7 @@ static inline bool cpuidle_not_available(struct cpuidle_driver *drv, struct cpuidle_device *dev) {return true; } static inline int cpuidle_select(struct cpuidle_driver *drv, - struct cpuidle_device *dev) + struct cpuidle_device *dev, bool *stop_tick) {return -ENODEV; } static inline int cpuidle_enter(struct cpuidle_driver *drv, struct cpuidle_device *dev, int index) @@ -250,7 +251,8 @@ struct cpuidle_governor { struct cpuidle_device *dev); int (*select) (struct cpuidle_driver *drv, - struct cpuidle_device *dev); + struct cpuidle_device *dev, + bool *stop_tick); void (*reflect) (struct cpuidle_device *dev, int index); }; diff --git a/include/linux/fs.h b/include/linux/fs.h index 2aa02cad94d4..760d8da1b6c7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -13,6 +13,7 @@ #include <linux/list_lru.h> #include <linux/llist.h> #include <linux/radix-tree.h> +#include <linux/xarray.h> #include <linux/rbtree.h> #include <linux/init.h> #include <linux/pid.h> @@ -390,12 +391,11 @@ int pagecache_write_end(struct file *, struct address_space *mapping, struct address_space { struct inode *host; /* owner: inode, block_device */ - struct radix_tree_root page_tree; /* radix tree of all pages */ - spinlock_t tree_lock; /* and lock protecting it */ + struct radix_tree_root i_pages; /* cached pages */ atomic_t i_mmap_writable;/* count VM_SHARED mappings */ struct rb_root_cached i_mmap; /* tree of private and shared mappings */ struct rw_semaphore i_mmap_rwsem; /* protect tree, count, list */ - /* Protected by tree_lock together with the radix tree */ + /* Protected by the i_pages lock */ unsigned long nrpages; /* number of total pages */ /* 
number of shadow or DAX exceptional entries */ unsigned long nrexceptional; @@ -1667,7 +1667,7 @@ typedef int (*filldir_t)(struct dir_context *, const char *, int, loff_t, u64, unsigned); struct dir_context { - const filldir_t actor; + filldir_t actor; loff_t pos; }; @@ -1989,7 +1989,7 @@ static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) * * I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to * synchronize competing switching instances and to tell - * wb stat updates to grab mapping->tree_lock. See + * wb stat updates to grab the i_pages lock. See * inode_switch_wb_work_fn() for details. * * I_OVL_INUSE Used by overlayfs to get exclusive ownership on upper @@ -2445,6 +2445,7 @@ extern int sync_blockdev(struct block_device *bdev); extern void kill_bdev(struct block_device *); extern struct super_block *freeze_bdev(struct block_device *); extern void emergency_thaw_all(void); +extern void emergency_thaw_bdev(struct super_block *sb); extern int thaw_bdev(struct block_device *bdev, struct super_block *sb); extern int fsync_bdev(struct block_device *); @@ -2470,6 +2471,11 @@ static inline int thaw_bdev(struct block_device *bdev, struct super_block *sb) return 0; } +static inline int emergency_thaw_bdev(struct super_block *sb) +{ + return 0; +} + static inline void iterate_bdevs(void (*f)(struct block_device *, void *), void *arg) { } diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 325017ad9311..39988924de3a 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -80,76 +80,145 @@ struct hmm; /* - * hmm_pfn_t - HMM uses its own pfn type to keep several flags per page + * hmm_pfn_flag_e - HMM flag enums * * Flags: - * HMM_PFN_VALID: pfn is valid - * HMM_PFN_READ: CPU page table has read permission set + * HMM_PFN_VALID: pfn is valid. It has, at least, read permission. 
* HMM_PFN_WRITE: CPU page table has write permission set + * HMM_PFN_DEVICE_PRIVATE: private device memory (ZONE_DEVICE) + * + * The driver provides a flags array. For example, if the driver's valid bit + * is bit 3, i.e. (entry & (1 << 3)) is true when an entry is valid, then the + * driver must provide an array in hmm_range.flags with + * hmm_range.flags[HMM_PFN_VALID] == 1 << 3. The same logic applies to all + * flags. This is the same idea as vm_page_prot in a vma, except that it is per device driver rather than per architecture. + */ +enum hmm_pfn_flag_e { + HMM_PFN_VALID = 0, + HMM_PFN_WRITE, + HMM_PFN_DEVICE_PRIVATE, + HMM_PFN_FLAG_MAX +}; + +/* + * hmm_pfn_value_e - HMM pfn special value + * + * Flags: * HMM_PFN_ERROR: corresponding CPU page table entry points to poisoned memory - * HMM_PFN_EMPTY: corresponding CPU page table entry is pte_none() + * HMM_PFN_NONE: corresponding CPU page table entry is pte_none() * HMM_PFN_SPECIAL: corresponding CPU page table entry is special; i.e., the * result of vm_insert_pfn() or vm_insert_page(). Therefore, it should not * be mirrored by a device, because the entry will never have HMM_PFN_VALID * set and the pfn value is undefined. - * HMM_PFN_DEVICE_UNADDRESSABLE: unaddressable device memory (ZONE_DEVICE) + * + * The driver provides the entry values for none, error and special entries. + * It may alias them (e.g. use the same value for error and special), but it + * should not alias none with error or special.
+ * + * HMM pfn value returned by hmm_vma_get_pfns() or hmm_vma_fault() will be: + * hmm_range.values[HMM_PFN_ERROR] if CPU page table entry is poisonous, + * hmm_range.values[HMM_PFN_NONE] if there is no CPU page table + * hmm_range.values[HMM_PFN_SPECIAL] if CPU page table entry is a special one */ -typedef unsigned long hmm_pfn_t; +enum hmm_pfn_value_e { + HMM_PFN_ERROR, + HMM_PFN_NONE, + HMM_PFN_SPECIAL, + HMM_PFN_VALUE_MAX +}; -#define HMM_PFN_VALID (1 << 0) -#define HMM_PFN_READ (1 << 1) -#define HMM_PFN_WRITE (1 << 2) -#define HMM_PFN_ERROR (1 << 3) -#define HMM_PFN_EMPTY (1 << 4) -#define HMM_PFN_SPECIAL (1 << 5) -#define HMM_PFN_DEVICE_UNADDRESSABLE (1 << 6) -#define HMM_PFN_SHIFT 7 +/* + * struct hmm_range - track invalidation lock on virtual address range + * + * @vma: the vm area struct for the range + * @list: all range lock are on a list + * @start: range virtual start address (inclusive) + * @end: range virtual end address (exclusive) + * @pfns: array of pfns (big enough for the range) + * @flags: pfn flags to match device driver page table + * @values: pfn value for some special case (none, special, error, ...) 
+ * @pfn_shift: pfn shift value (should be <= PAGE_SHIFT) + * @valid: pfns array did not change since it has been filled by an HMM function + */ +struct hmm_range { + struct vm_area_struct *vma; + struct list_head list; + unsigned long start; + unsigned long end; + uint64_t *pfns; + const uint64_t *flags; + const uint64_t *values; + uint8_t pfn_shift; + bool valid; +}; /* - * hmm_pfn_t_to_page() - return struct page pointed to by a valid hmm_pfn_t - * @pfn: hmm_pfn_t to convert to struct page - * Returns: struct page pointer if pfn is a valid hmm_pfn_t, NULL otherwise + * hmm_pfn_to_page() - return struct page pointed to by a valid HMM pfn + * @range: range used to decode HMM pfn value + * @pfn: HMM pfn value to get corresponding struct page from + * Returns: struct page pointer if pfn is a valid HMM pfn, NULL otherwise * - * If the hmm_pfn_t is valid (ie valid flag set) then return the struct page - * matching the pfn value stored in the hmm_pfn_t. Otherwise return NULL. + * If the HMM pfn is valid (ie valid flag set) then return the struct page + * matching the pfn value stored in the HMM pfn. Otherwise return NULL.
*/ -static inline struct page *hmm_pfn_t_to_page(hmm_pfn_t pfn) +static inline struct page *hmm_pfn_to_page(const struct hmm_range *range, + uint64_t pfn) { - if (!(pfn & HMM_PFN_VALID)) + if (pfn == range->values[HMM_PFN_NONE]) + return NULL; + if (pfn == range->values[HMM_PFN_ERROR]) return NULL; - return pfn_to_page(pfn >> HMM_PFN_SHIFT); + if (pfn == range->values[HMM_PFN_SPECIAL]) + return NULL; + if (!(pfn & range->flags[HMM_PFN_VALID])) + return NULL; + return pfn_to_page(pfn >> range->pfn_shift); } /* - * hmm_pfn_t_to_pfn() - return pfn value store in a hmm_pfn_t - * @pfn: hmm_pfn_t to extract pfn from - * Returns: pfn value if hmm_pfn_t is valid, -1UL otherwise + * hmm_pfn_to_pfn() - return pfn value stored in an HMM pfn + * @range: range used to decode HMM pfn value + * @pfn: HMM pfn value to extract pfn from + * Returns: pfn value if HMM pfn is valid, -1UL otherwise */ -static inline unsigned long hmm_pfn_t_to_pfn(hmm_pfn_t pfn) +static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range, + uint64_t pfn) { - if (!(pfn & HMM_PFN_VALID)) + if (pfn == range->values[HMM_PFN_NONE]) + return -1UL; + if (pfn == range->values[HMM_PFN_ERROR]) + return -1UL; + if (pfn == range->values[HMM_PFN_SPECIAL]) + return -1UL; + if (!(pfn & range->flags[HMM_PFN_VALID])) return -1UL; - return (pfn >> HMM_PFN_SHIFT); + return (pfn >> range->pfn_shift); } /* - * hmm_pfn_t_from_page() - create a valid hmm_pfn_t value from struct page - * @page: struct page pointer for which to create the hmm_pfn_t - * Returns: valid hmm_pfn_t for the page + * hmm_pfn_from_page() - create a valid HMM pfn value from struct page + * @range: range used to encode HMM pfn value + * @page: struct page pointer for which to create the HMM pfn + * Returns: valid HMM pfn for the page */ -static inline hmm_pfn_t hmm_pfn_t_from_page(struct page *page) +static inline uint64_t hmm_pfn_from_page(const struct hmm_range *range, + struct page *page) { - return (page_to_pfn(page) << HMM_PFN_SHIFT) |
HMM_PFN_VALID; + return (page_to_pfn(page) << range->pfn_shift) | + range->flags[HMM_PFN_VALID]; } /* - * hmm_pfn_t_from_pfn() - create a valid hmm_pfn_t value from pfn - * @pfn: pfn value for which to create the hmm_pfn_t - * Returns: valid hmm_pfn_t for the pfn + * hmm_pfn_from_pfn() - create a valid HMM pfn value from pfn + * @range: range used to encode HMM pfn value + * @pfn: pfn value for which to create the HMM pfn + * Returns: valid HMM pfn for the pfn */ -static inline hmm_pfn_t hmm_pfn_t_from_pfn(unsigned long pfn) +static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range, + unsigned long pfn) { - return (pfn << HMM_PFN_SHIFT) | HMM_PFN_VALID; + return (pfn << range->pfn_shift) | + range->flags[HMM_PFN_VALID]; } @@ -218,6 +287,16 @@ enum hmm_update_type { * @update: callback to update range on a device */ struct hmm_mirror_ops { + /* release() - release hmm_mirror + * + * @mirror: pointer to struct hmm_mirror + * + * This is called when the mm_struct is being released. + * The callback should make sure no references to the mirror occur + * after the callback returns.
+ */ + void (*release)(struct hmm_mirror *mirror); + /* sync_cpu_device_pagetables() - synchronize page tables * * @mirror: pointer to struct hmm_mirror @@ -262,23 +341,6 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror); /* - * struct hmm_range - track invalidation lock on virtual address range - * - * @list: all range lock are on a list - * @start: range virtual start address (inclusive) - * @end: range virtual end address (exclusive) - * @pfns: array of pfns (big enough for the range) - * @valid: pfns array did not change since it has been fill by an HMM function - */ -struct hmm_range { - struct list_head list; - unsigned long start; - unsigned long end; - hmm_pfn_t *pfns; - bool valid; -}; - -/* * To snapshot the CPU page table, call hmm_vma_get_pfns(), then take a device * driver lock that serializes device page table updates, then call * hmm_vma_range_done(), to check if the snapshot is still valid. The same @@ -291,17 +353,13 @@ struct hmm_range { * * IF YOU DO NOT FOLLOW THE ABOVE RULE THE SNAPSHOT CONTENT MIGHT BE INVALID ! */ -int hmm_vma_get_pfns(struct vm_area_struct *vma, - struct hmm_range *range, - unsigned long start, - unsigned long end, - hmm_pfn_t *pfns); -bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range); +int hmm_vma_get_pfns(struct hmm_range *range); +bool hmm_vma_range_done(struct hmm_range *range); /* * Fault memory on behalf of device driver. Unlike handle_mm_fault(), this will - * not migrate any device memory back to system memory. The hmm_pfn_t array will + * not migrate any device memory back to system memory. The HMM pfn array will * be updated with the fault result and current snapshot of the CPU page table * for the range. * @@ -310,22 +368,26 @@ bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range); * function returns -EAGAIN. * * Return value does not reflect if the fault was successful for every single - * address or not. 
Therefore, the caller must to inspect the hmm_pfn_t array to + * address or not. Therefore, the caller must to inspect the HMM pfn array to * determine fault status for each address. * * Trying to fault inside an invalid vma will result in -EINVAL. * * See the function description in mm/hmm.c for further documentation. */ -int hmm_vma_fault(struct vm_area_struct *vma, - struct hmm_range *range, - unsigned long start, - unsigned long end, - hmm_pfn_t *pfns, - bool write, - bool block); -#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ +int hmm_vma_fault(struct hmm_range *range, bool block); +/* Below are for HMM internal use only! Not to be used by device driver! */ +void hmm_mm_destroy(struct mm_struct *mm); + +static inline void hmm_mm_init(struct mm_struct *mm) +{ + mm->hmm = NULL; +} +#else /* IS_ENABLED(CONFIG_HMM_MIRROR) */ +static inline void hmm_mm_destroy(struct mm_struct *mm) {} +static inline void hmm_mm_init(struct mm_struct *mm) {} +#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) struct hmm_devmem; @@ -498,23 +560,9 @@ struct hmm_device { struct hmm_device *hmm_device_new(void *drvdata); void hmm_device_put(struct hmm_device *hmm_device); #endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ -#endif /* IS_ENABLED(CONFIG_HMM) */ - -/* Below are for HMM internal use only! Not to be used by device driver! 
*/ -#if IS_ENABLED(CONFIG_HMM_MIRROR) -void hmm_mm_destroy(struct mm_struct *mm); - -static inline void hmm_mm_init(struct mm_struct *mm) -{ - mm->hmm = NULL; -} -#else /* IS_ENABLED(CONFIG_HMM_MIRROR) */ -static inline void hmm_mm_destroy(struct mm_struct *mm) {} -static inline void hmm_mm_init(struct mm_struct *mm) {} -#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ - - #else /* IS_ENABLED(CONFIG_HMM) */ static inline void hmm_mm_destroy(struct mm_struct *mm) {} static inline void hmm_mm_init(struct mm_struct *mm) {} +#endif /* IS_ENABLED(CONFIG_HMM) */ + #endif /* LINUX_HMM_H */ diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 78f456fcd242..a2656c3ebe81 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -424,6 +424,7 @@ static inline ktime_t hrtimer_get_remaining(const struct hrtimer *timer) } extern u64 hrtimer_get_next_event(void); +extern u64 hrtimer_next_event_without(const struct hrtimer *exclude); extern bool hrtimer_active(const struct hrtimer *timer); diff --git a/include/linux/idr.h b/include/linux/idr.h index 7d6a6313f0ab..e856f4e0ab35 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -29,29 +29,31 @@ struct idr { #define IDR_FREE 0 /* Set the IDR flag and the IDR_FREE tag */ -#define IDR_RT_MARKER ((__force gfp_t)(3 << __GFP_BITS_SHIFT)) +#define IDR_RT_MARKER (ROOT_IS_IDR | (__force gfp_t) \ + (1 << (ROOT_TAG_SHIFT + IDR_FREE))) -#define IDR_INIT_BASE(base) { \ - .idr_rt = RADIX_TREE_INIT(IDR_RT_MARKER), \ +#define IDR_INIT_BASE(name, base) { \ + .idr_rt = RADIX_TREE_INIT(name, IDR_RT_MARKER), \ .idr_base = (base), \ .idr_next = 0, \ } /** * IDR_INIT() - Initialise an IDR. + * @name: Name of IDR. * * A freshly-initialised IDR contains no IDs. */ -#define IDR_INIT IDR_INIT_BASE(0) +#define IDR_INIT(name) IDR_INIT_BASE(name, 0) /** - * DEFINE_IDR() - Define a statically-allocated IDR - * @name: Name of IDR + * DEFINE_IDR() - Define a statically-allocated IDR. + * @name: Name of IDR. 
* * An IDR defined using this macro is ready for use with no additional * initialisation required. It contains no IDs. */ -#define DEFINE_IDR(name) struct idr name = IDR_INIT +#define DEFINE_IDR(name) struct idr name = IDR_INIT(name) /** * idr_get_cursor - Return the current position of the cyclic allocator @@ -218,10 +220,10 @@ struct ida { struct radix_tree_root ida_rt; }; -#define IDA_INIT { \ - .ida_rt = RADIX_TREE_INIT(IDR_RT_MARKER | GFP_NOWAIT), \ +#define IDA_INIT(name) { \ + .ida_rt = RADIX_TREE_INIT(name, IDR_RT_MARKER | GFP_NOWAIT), \ } -#define DEFINE_IDA(name) struct ida name = IDA_INIT +#define DEFINE_IDA(name) struct ida name = IDA_INIT(name) int ida_pre_get(struct ida *ida, gfp_t gfp_mask); int ida_get_new_above(struct ida *ida, int starting_id, int *p_id); diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 8dad3dd26eae..ef169d67df92 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -209,12 +209,12 @@ #define DMA_FECTL_IM (((u32)1) << 31) /* FSTS_REG */ -#define DMA_FSTS_PPF ((u32)2) -#define DMA_FSTS_PFO ((u32)1) -#define DMA_FSTS_IQE (1 << 4) -#define DMA_FSTS_ICE (1 << 5) -#define DMA_FSTS_ITE (1 << 6) -#define DMA_FSTS_PRO (1 << 7) +#define DMA_FSTS_PFO (1 << 0) /* Primary Fault Overflow */ +#define DMA_FSTS_PPF (1 << 1) /* Primary Pending Fault */ +#define DMA_FSTS_IQE (1 << 4) /* Invalidation Queue Error */ +#define DMA_FSTS_ICE (1 << 5) /* Invalidation Completion Error */ +#define DMA_FSTS_ITE (1 << 6) /* Invalidation Time-out Error */ +#define DMA_FSTS_PRO (1 << 7) /* Page Request Overflow */ #define dma_fsts_fault_record_index(s) (((s) >> 8) & 0xff) /* FRCD_REG, 32 bits access */ diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 41b8c5757859..19938ee6eb31 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -465,23 +465,23 @@ static inline int iommu_map(struct iommu_domain *domain, unsigned long iova, return -ENODEV; } -static inline int iommu_unmap(struct 
iommu_domain *domain, unsigned long iova, - size_t size) +static inline size_t iommu_unmap(struct iommu_domain *domain, + unsigned long iova, size_t size) { - return -ENODEV; + return 0; } -static inline int iommu_unmap_fast(struct iommu_domain *domain, unsigned long iova, - int gfp_order) +static inline size_t iommu_unmap_fast(struct iommu_domain *domain, + unsigned long iova, int gfp_order) { - return -ENODEV; + return 0; } static inline size_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova, struct scatterlist *sg, unsigned int nents, int prot) { - return -ENODEV; + return 0; } static inline void iommu_flush_tlb_all(struct iommu_domain *domain) diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 9385aa57497b..a27cf6652327 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -62,8 +62,11 @@ extern int register_refined_jiffies(long clock_tick_rate); /* TICK_NSEC is the time between ticks in nsec assuming SHIFTED_HZ */ #define TICK_NSEC ((NSEC_PER_SEC+HZ/2)/HZ) -/* TICK_USEC is the time between ticks in usec assuming fake USER_HZ */ -#define TICK_USEC ((1000000UL + USER_HZ/2) / USER_HZ) +/* TICK_USEC is the time between ticks in usec assuming SHIFTED_HZ */ +#define TICK_USEC ((USEC_PER_SEC + HZ/2) / HZ) + +/* USER_TICK_USEC is the time between ticks in usec assuming fake USER_HZ */ +#define USER_TICK_USEC ((1000000UL + USER_HZ/2) / USER_HZ) #ifndef __jiffy_arch_data #define __jiffy_arch_data diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 52b70894eaa5..6a1eb0b0aad9 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -439,7 +439,8 @@ extern long simple_strtol(const char *,char **,unsigned int); extern unsigned long long simple_strtoull(const char *,char **,unsigned int); extern long long simple_strtoll(const char *,char **,unsigned int); -extern int num_to_str(char *buf, int size, unsigned long long num); +extern int num_to_str(char *buf, int size, + unsigned long long num, unsigned 
int width); /* lib/printf utilities */ @@ -543,6 +544,7 @@ extern enum system_states { SYSTEM_RESTART, } system_state; +/* This cannot be an enum because some may be used in assembly source. */ #define TAINT_PROPRIETARY_MODULE 0 #define TAINT_FORCED_MODULE 1 #define TAINT_CPU_OUT_OF_SPEC 2 @@ -560,7 +562,8 @@ extern enum system_states { #define TAINT_SOFTLOCKUP 14 #define TAINT_LIVEPATCH 15 #define TAINT_AUX 16 -#define TAINT_FLAGS_COUNT 17 +#define TAINT_RANDSTRUCT 17 +#define TAINT_FLAGS_COUNT 18 struct taint_flag { char c_true; /* character printed when tainted */ diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h index e251533a5939..89fc8dc7bf38 100644 --- a/include/linux/kfifo.h +++ b/include/linux/kfifo.h @@ -41,11 +41,11 @@ */ /* - * Note about locking : There is no locking required until only * one reader - * and one writer is using the fifo and no kfifo_reset() will be * called - * kfifo_reset_out() can be safely used, until it will be only called + * Note about locking: There is no locking required until only one reader + * and one writer is using the fifo and no kfifo_reset() will be called. + * kfifo_reset_out() can be safely used, until it will be only called * in the reader thread. - * For multiple writer and one reader there is only a need to lock the writer. + * For multiple writer and one reader there is only a need to lock the writer. * And vice versa for only one writer and multiple reader there is only a need * to lock the reader. 
*/ diff --git a/include/linux/lockref.h b/include/linux/lockref.h index 2eac32095113..99f17cc8e163 100644 --- a/include/linux/lockref.h +++ b/include/linux/lockref.h @@ -37,6 +37,7 @@ struct lockref { extern void lockref_get(struct lockref *); extern int lockref_put_return(struct lockref *); extern int lockref_get_not_zero(struct lockref *); +extern int lockref_put_not_zero(struct lockref *); extern int lockref_get_or_lock(struct lockref *); extern int lockref_put_or_lock(struct lockref *); diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index c46016bb25eb..d99b71bc2c66 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -48,13 +48,12 @@ enum memcg_stat_item { MEMCG_NR_STAT, }; -/* Cgroup-specific events, on top of universal VM events */ -enum memcg_event_item { - MEMCG_LOW = NR_VM_EVENT_ITEMS, +enum memcg_memory_event { + MEMCG_LOW, MEMCG_HIGH, MEMCG_MAX, MEMCG_OOM, - MEMCG_NR_EVENTS, + MEMCG_NR_MEMORY_EVENTS, }; struct mem_cgroup_reclaim_cookie { @@ -88,7 +87,7 @@ enum mem_cgroup_events_target { struct mem_cgroup_stat_cpu { long count[MEMCG_NR_STAT]; - unsigned long events[MEMCG_NR_EVENTS]; + unsigned long events[NR_VM_EVENT_ITEMS]; unsigned long nr_page_events; unsigned long targets[MEM_CGROUP_NTARGETS]; }; @@ -120,6 +119,9 @@ struct mem_cgroup_per_node { unsigned long usage_in_excess;/* Set to the value by which */ /* the soft limit is exceeded*/ bool on_tree; + bool congested; /* memcg has many dirty pages */ + /* backed by a congested BDI */ + struct mem_cgroup *memcg; /* Back pointer, we cannot */ /* use container_of */ }; @@ -202,7 +204,8 @@ struct mem_cgroup { /* OOM-Killer disable */ int oom_kill_disable; - /* handle for "memory.events" */ + /* memory.events */ + atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; struct cgroup_file events_file; /* protect arrays of thresholds */ @@ -231,9 +234,10 @@ struct mem_cgroup { struct task_struct *move_lock_task; unsigned long move_lock_flags; + /* memory.stat */ 
struct mem_cgroup_stat_cpu __percpu *stat_cpu; atomic_long_t stat[MEMCG_NR_STAT]; - atomic_long_t events[MEMCG_NR_EVENTS]; + atomic_long_t events[NR_VM_EVENT_ITEMS]; unsigned long socket_pressure; @@ -645,9 +649,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned); -/* idx can be of type enum memcg_event_item or vm_event_item */ static inline void __count_memcg_events(struct mem_cgroup *memcg, - int idx, unsigned long count) + enum vm_event_item idx, + unsigned long count) { unsigned long x; @@ -663,7 +667,8 @@ static inline void __count_memcg_events(struct mem_cgroup *memcg, } static inline void count_memcg_events(struct mem_cgroup *memcg, - int idx, unsigned long count) + enum vm_event_item idx, + unsigned long count) { unsigned long flags; @@ -672,9 +677,8 @@ static inline void count_memcg_events(struct mem_cgroup *memcg, local_irq_restore(flags); } -/* idx can be of type enum memcg_event_item or vm_event_item */ static inline void count_memcg_page_event(struct page *page, - int idx) + enum vm_event_item idx) { if (page->mem_cgroup) count_memcg_events(page->mem_cgroup, idx, 1); @@ -698,10 +702,10 @@ static inline void count_memcg_event_mm(struct mm_struct *mm, rcu_read_unlock(); } -static inline void mem_cgroup_event(struct mem_cgroup *memcg, - enum memcg_event_item event) +static inline void memcg_memory_event(struct mem_cgroup *memcg, + enum memcg_memory_event event) { - count_memcg_events(memcg, event, 1); + atomic_long_inc(&memcg->memory_events[event]); cgroup_file_notify(&memcg->events_file); } @@ -721,8 +725,8 @@ static inline bool mem_cgroup_disabled(void) return true; } -static inline void mem_cgroup_event(struct mem_cgroup *memcg, - enum memcg_event_item event) +static inline void memcg_memory_event(struct mem_cgroup *memcg, + enum memcg_memory_event event) { } diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 2b0265265c28..e0e49b5b1ee1 100644 --- 
a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -216,9 +216,6 @@ void put_online_mems(void); void mem_hotplug_begin(void); void mem_hotplug_done(void); -extern void set_zone_contiguous(struct zone *zone); -extern void clear_zone_contiguous(struct zone *zone); - #else /* ! CONFIG_MEMORY_HOTPLUG */ #define pfn_to_online_page(pfn) \ ({ \ diff --git a/include/linux/migrate.h b/include/linux/migrate.h index ab45f8a0d288..f2b4abbca55e 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -7,8 +7,7 @@ #include <linux/migrate_mode.h> #include <linux/hugetlb.h> -typedef struct page *new_page_t(struct page *page, unsigned long private, - int **reason); +typedef struct page *new_page_t(struct page *page, unsigned long private); typedef void free_page_t(struct page *page, unsigned long private); /* @@ -43,9 +42,9 @@ static inline struct page *new_page_nodemask(struct page *page, return alloc_huge_page_nodemask(page_hstate(compound_head(page)), preferred_nid, nodemask); - if (thp_migration_supported() && PageTransHuge(page)) { - order = HPAGE_PMD_ORDER; + if (PageTransHuge(page)) { gfp_mask |= GFP_TRANSHUGE; + order = HPAGE_PMD_ORDER; } if (PageHighMem(page) || (zone_idx(page_zone(page)) == ZONE_MOVABLE)) diff --git a/include/linux/mm.h b/include/linux/mm.h index 3ad632366973..1ac1f06a4be6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -747,7 +747,7 @@ int finish_mkwrite_fault(struct vm_fault *vmf); * refcount. The each user mapping also has a reference to the page. * * The pagecache pages are stored in a per-mapping radix tree, which is - * rooted at mapping->page_tree, and indexed by offset. + * rooted at mapping->i_pages, and indexed by offset. * Where 2.4 and early 2.6 kernels kept dirty/clean pages in per-address_space * lists, we instead now tag pages as dirty/writeback in the radix tree. 
* @@ -1466,6 +1466,7 @@ extern int try_to_release_page(struct page * page, gfp_t gfp_mask); extern void do_invalidatepage(struct page *page, unsigned int offset, unsigned int length); +void __set_page_dirty(struct page *, struct address_space *, int warn); int __set_page_dirty_nobuffers(struct page *page); int __set_page_dirty_no_writeback(struct page *page); int redirty_page_for_writepage(struct writeback_control *wbc, @@ -2108,6 +2109,7 @@ extern void setup_per_cpu_pageset(void); extern void zone_pcp_update(struct zone *zone); extern void zone_pcp_reset(struct zone *zone); +extern void setup_zone_pageset(struct zone *zone); /* page_alloc.c */ extern int min_free_kbytes; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index f11ae29005f1..32699b2dc52a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -180,6 +180,7 @@ enum node_stat_item { NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */ NR_DIRTIED, /* page dirtyings since bootup */ NR_WRITTEN, /* page writings since bootup */ + NR_INDIRECTLY_RECLAIMABLE_BYTES, /* measured in bytes */ NR_VM_NODE_STAT_ITEMS }; @@ -884,7 +885,7 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); -extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1]; +extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES]; int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 38187c68063d..2f129bbfaae8 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -198,14 +198,24 @@ struct nfs_inode { /* * Cache validity bit flags */ -#define NFS_INO_INVALID_ATTR 0x0001 /* cached attrs are invalid */ -#define NFS_INO_INVALID_DATA 0x0002 /* cached data is invalid */ 
-#define NFS_INO_INVALID_ATIME 0x0004 /* cached atime is invalid */ -#define NFS_INO_INVALID_ACCESS 0x0008 /* cached access cred invalid */ -#define NFS_INO_INVALID_ACL 0x0010 /* cached acls are invalid */ -#define NFS_INO_REVAL_PAGECACHE 0x0020 /* must revalidate pagecache */ -#define NFS_INO_REVAL_FORCED 0x0040 /* force revalidation ignoring a delegation */ -#define NFS_INO_INVALID_LABEL 0x0080 /* cached label is invalid */ +#define NFS_INO_INVALID_DATA BIT(1) /* cached data is invalid */ +#define NFS_INO_INVALID_ATIME BIT(2) /* cached atime is invalid */ +#define NFS_INO_INVALID_ACCESS BIT(3) /* cached access cred invalid */ +#define NFS_INO_INVALID_ACL BIT(4) /* cached acls are invalid */ +#define NFS_INO_REVAL_PAGECACHE BIT(5) /* must revalidate pagecache */ +#define NFS_INO_REVAL_FORCED BIT(6) /* force revalidation ignoring a delegation */ +#define NFS_INO_INVALID_LABEL BIT(7) /* cached label is invalid */ +#define NFS_INO_INVALID_CHANGE BIT(8) /* cached change is invalid */ +#define NFS_INO_INVALID_CTIME BIT(9) /* cached ctime is invalid */ +#define NFS_INO_INVALID_MTIME BIT(10) /* cached mtime is invalid */ +#define NFS_INO_INVALID_SIZE BIT(11) /* cached size is invalid */ +#define NFS_INO_INVALID_OTHER BIT(12) /* other attrs are invalid */ + +#define NFS_INO_INVALID_ATTR (NFS_INO_INVALID_CHANGE \ + | NFS_INO_INVALID_CTIME \ + | NFS_INO_INVALID_MTIME \ + | NFS_INO_INVALID_SIZE \ + | NFS_INO_INVALID_OTHER) /* inode metadata is invalid */ /* * Bit offsets in flags field @@ -292,10 +302,11 @@ static inline void nfs_mark_for_revalidate(struct inode *inode) struct nfs_inode *nfsi = NFS_I(inode); spin_lock(&inode->i_lock); - nfsi->cache_validity |= NFS_INO_INVALID_ATTR | - NFS_INO_REVAL_PAGECACHE | - NFS_INO_INVALID_ACCESS | - NFS_INO_INVALID_ACL; + nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_INVALID_CHANGE + | NFS_INO_INVALID_CTIME; if (S_ISDIR(inode->i_mode)) nfsi->cache_validity |= 
NFS_INO_INVALID_DATA; spin_unlock(&inode->i_lock); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 6959968dc36a..34d28564ecf3 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1590,11 +1590,13 @@ struct nfs_rpc_ops { unsigned int); int (*create) (struct inode *, struct dentry *, struct iattr *, int); - int (*remove) (struct inode *, const struct qstr *); - void (*unlink_setup) (struct rpc_message *, struct inode *dir); + int (*remove) (struct inode *, struct dentry *); + void (*unlink_setup) (struct rpc_message *, struct dentry *); void (*unlink_rpc_prepare) (struct rpc_task *, struct nfs_unlinkdata *); int (*unlink_done) (struct rpc_task *, struct inode *); - void (*rename_setup) (struct rpc_message *msg, struct inode *dir); + void (*rename_setup) (struct rpc_message *msg, + struct dentry *old_dentry, + struct dentry *new_dentry); void (*rename_rpc_prepare)(struct rpc_task *task, struct nfs_renamedata *); int (*rename_done) (struct rpc_task *task, struct inode *old_dir, struct inode *new_dir); int (*link) (struct inode *, struct inode *, const struct qstr *); @@ -1633,7 +1635,6 @@ struct nfs_rpc_ops { struct iattr *iattr, int *); int (*have_delegation)(struct inode *, fmode_t); - int (*return_delegation)(struct inode *); struct nfs_client *(*alloc_client) (const struct nfs_client_initdata *); struct nfs_client *(*init_client) (struct nfs_client *, const struct nfs_client_initdata *); diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index cdad58bbfd8b..4ae347cbc36d 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -63,7 +63,6 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, bool skip_hwpoisoned_pages); -struct page *alloc_migrate_target(struct page *page, unsigned long private, - int **resultp); +struct page *alloc_migrate_target(struct page *page, unsigned long 
private); #endif diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 34ce3ebf97d5..b1bd2186e6d2 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -144,7 +144,7 @@ void release_pages(struct page **pages, int nr); * 3. check the page is still in pagecache (if no, goto 1) * * Remove-side that cares about stability of _refcount (eg. reclaim) has the - * following (with tree_lock held for write): + * following (with the i_pages lock held): * A. atomically check refcount is correct and set it to 0 (atomic_cmpxchg) * B. remove page from pagecache * C. free the page @@ -157,7 +157,7 @@ void release_pages(struct page **pages, int nr); * * It is possible that between 1 and 2, the page is removed then the exact same * page is inserted into the same position in pagecache. That's OK: the - * old find_get_page using tree_lock could equally have run before or after + * old find_get_page using a lock could equally have run before or after * such a re-insertion, depending on order that locks are granted. 
* * Lookups racing against pagecache insertion isn't a big problem: either 1 diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index fc55ff31eca7..34149e8b5f73 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -104,25 +104,29 @@ struct radix_tree_node { unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; }; -/* The top bits of gfp_mask are used to store the root tags and the IDR flag */ -#define ROOT_IS_IDR ((__force gfp_t)(1 << __GFP_BITS_SHIFT)) -#define ROOT_TAG_SHIFT (__GFP_BITS_SHIFT + 1) +/* The IDR tag is stored in the low bits of the GFP flags */ +#define ROOT_IS_IDR ((__force gfp_t)4) +/* The top bits of gfp_mask are used to store the root tags */ +#define ROOT_TAG_SHIFT (__GFP_BITS_SHIFT) struct radix_tree_root { + spinlock_t xa_lock; gfp_t gfp_mask; struct radix_tree_node __rcu *rnode; }; -#define RADIX_TREE_INIT(mask) { \ +#define RADIX_TREE_INIT(name, mask) { \ + .xa_lock = __SPIN_LOCK_UNLOCKED(name.xa_lock), \ .gfp_mask = (mask), \ .rnode = NULL, \ } #define RADIX_TREE(name, mask) \ - struct radix_tree_root name = RADIX_TREE_INIT(mask) + struct radix_tree_root name = RADIX_TREE_INIT(name, mask) #define INIT_RADIX_TREE(root, mask) \ do { \ + spin_lock_init(&(root)->xa_lock); \ (root)->gfp_mask = (mask); \ (root)->rnode = NULL; \ } while (0) diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 9806184bb3d5..2c570cd934af 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -104,7 +104,8 @@ static inline void mm_update_next_owner(struct mm_struct *mm) #endif /* CONFIG_MEMCG */ #ifdef CONFIG_MMU -extern void arch_pick_mmap_layout(struct mm_struct *mm); +extern void arch_pick_mmap_layout(struct mm_struct *mm, + struct rlimit *rlim_stack); extern unsigned long arch_get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); @@ -113,7 +114,8 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, 
unsigned long pgoff, unsigned long flags); #else -static inline void arch_pick_mmap_layout(struct mm_struct *mm) {} +static inline void arch_pick_mmap_layout(struct mm_struct *mm, + struct rlimit *rlim_stack) {} #endif static inline bool in_vfork(struct task_struct *tsk) diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index ab437dd2e3b9..a121982af0f5 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -118,9 +118,14 @@ __printf(2, 3) void seq_printf(struct seq_file *m, const char *fmt, ...); void seq_putc(struct seq_file *m, char c); void seq_puts(struct seq_file *m, const char *s); +void seq_put_decimal_ull_width(struct seq_file *m, const char *delimiter, + unsigned long long num, unsigned int width); void seq_put_decimal_ull(struct seq_file *m, const char *delimiter, unsigned long long num); void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num); +void seq_put_hex_ll(struct seq_file *m, const char *delimiter, + unsigned long long v, unsigned int width); + void seq_escape(struct seq_file *m, const char *s, const char *esc); void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type, @@ -235,4 +240,5 @@ extern struct hlist_node *seq_hlist_start_percpu(struct hlist_head __percpu *hea extern struct hlist_node *seq_hlist_next_percpu(void *v, struct hlist_head __percpu *head, int *cpu, loff_t *pos); +void seq_file_init(void); #endif diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index ed761f751ecb..9b11b6a0978c 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -217,5 +217,12 @@ void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *, struct rpc_xprt *); bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt, const struct sockaddr *sap); void rpc_cleanup_clids(void); + +static inline int rpc_reply_expected(struct rpc_task *task) +{ + return (task->tk_msg.rpc_proc != NULL) && + (task->tk_msg.rpc_proc->p_decode != NULL); +} + #endif /* 
__KERNEL__ */ #endif /* _LINUX_SUNRPC_CLNT_H */ diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index d950223c64b1..2bd68177a442 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -253,6 +253,12 @@ xdr_stream_remaining(const struct xdr_stream *xdr) return xdr->nwords << 2; } +ssize_t xdr_stream_decode_opaque(struct xdr_stream *xdr, void *ptr, + size_t size); +ssize_t xdr_stream_decode_opaque_dup(struct xdr_stream *xdr, void **ptr, + size_t maxlen, gfp_t gfp_flags); +ssize_t xdr_stream_decode_string(struct xdr_stream *xdr, char *str, + size_t size); ssize_t xdr_stream_decode_string_dup(struct xdr_stream *xdr, char **str, size_t maxlen, gfp_t gfp_flags); /** @@ -313,6 +319,31 @@ xdr_stream_encode_u64(struct xdr_stream *xdr, __u64 n) } /** + * xdr_stream_encode_opaque_inline - Encode opaque xdr data + * @xdr: pointer to xdr_stream + * @ptr: pointer to void pointer + * @len: size of object + * + * Return values: + * On success, returns length in bytes of XDR buffer consumed + * %-EMSGSIZE on XDR buffer overflow + */ +static inline ssize_t +xdr_stream_encode_opaque_inline(struct xdr_stream *xdr, void **ptr, size_t len) +{ + size_t count = sizeof(__u32) + xdr_align_size(len); + __be32 *p = xdr_reserve_space(xdr, count); + + if (unlikely(!p)) { + *ptr = NULL; + return -EMSGSIZE; + } + xdr_encode_opaque(p, NULL, len); + *ptr = ++p; + return count; +} + +/** * xdr_stream_encode_opaque_fixed - Encode fixed length opaque xdr data * @xdr: pointer to xdr_stream * @ptr: pointer to opaque data object @@ -356,6 +387,31 @@ xdr_stream_encode_opaque(struct xdr_stream *xdr, const void *ptr, size_t len) } /** + * xdr_stream_encode_uint32_array - Encode variable length array of integers + * @xdr: pointer to xdr_stream + * @array: array of integers + * @array_size: number of elements in @array + * + * Return values: + * On success, returns length in bytes of XDR buffer consumed + * %-EMSGSIZE on XDR buffer overflow + */ +static inline ssize_t 
+xdr_stream_encode_uint32_array(struct xdr_stream *xdr, + const __u32 *array, size_t array_size) +{ + ssize_t ret = (array_size+1) * sizeof(__u32); + __be32 *p = xdr_reserve_space(xdr, ret); + + if (unlikely(!p)) + return -EMSGSIZE; + *p++ = cpu_to_be32(array_size); + for (; array_size > 0; p++, array++, array_size--) + *p = cpu_to_be32p(array); + return ret; +} + +/** * xdr_stream_decode_u32 - Decode a 32-bit integer * @xdr: pointer to xdr_stream * @ptr: location to store integer @@ -432,6 +488,44 @@ xdr_stream_decode_opaque_inline(struct xdr_stream *xdr, void **ptr, size_t maxle } return len; } + +/** + * xdr_stream_decode_uint32_array - Decode variable length array of integers + * @xdr: pointer to xdr_stream + * @array: location to store the integer array or NULL + * @array_size: number of elements to store + * + * Return values: + * On success, returns number of elements stored in @array + * %-EBADMSG on XDR buffer overflow + * %-EMSGSIZE if the size of the array exceeds @array_size + */ +static inline ssize_t +xdr_stream_decode_uint32_array(struct xdr_stream *xdr, + __u32 *array, size_t array_size) +{ + __be32 *p; + __u32 len; + ssize_t retval; + + if (unlikely(xdr_stream_decode_u32(xdr, &len) < 0)) + return -EBADMSG; + p = xdr_inline_decode(xdr, len * sizeof(*p)); + if (unlikely(!p)) + return -EBADMSG; + if (array == NULL) + return len; + if (len <= array_size) { + if (len < array_size) + memset(array+len, 0, (array_size-len)*sizeof(*array)); + array_size = len; + retval = len; + } else + retval = -EMSGSIZE; + for (; array_size > 0; p++, array++, array_size--) + *array = be32_to_cpup(p); + return retval; +} #endif /* __KERNEL__ */ #endif /* _SUNRPC_XDR_H_ */ diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 7fad83881ce1..5fea0fb420df 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -197,7 +197,7 @@ struct rpc_xprt { struct list_head free; /* free slots */ unsigned int max_reqs; /* max number of slots */ 
unsigned int min_reqs; /* min number of slots */ - atomic_t num_reqs; /* total slots */ + unsigned int num_reqs; /* total slots */ unsigned long state; /* transport state */ unsigned char resvport : 1; /* use a reserved port */ atomic_t swapper; /* we're swapping over this @@ -373,6 +373,7 @@ void xprt_wait_for_buffer_space(struct rpc_task *task, rpc_action action); void xprt_write_space(struct rpc_xprt *xprt); void xprt_adjust_cwnd(struct rpc_xprt *xprt, struct rpc_task *task, int result); struct rpc_rqst * xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid); +void xprt_update_rtt(struct rpc_task *task); void xprt_complete_rqst(struct rpc_task *task, int copied); void xprt_pin_rqst(struct rpc_rqst *req); void xprt_unpin_rqst(struct rpc_rqst *req); diff --git a/include/linux/tick.h b/include/linux/tick.h index 7f8c9a127f5a..55388ab45fd4 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -115,27 +115,46 @@ enum tick_dep_bits { extern bool tick_nohz_enabled; extern bool tick_nohz_tick_stopped(void); extern bool tick_nohz_tick_stopped_cpu(int cpu); +extern void tick_nohz_idle_stop_tick(void); +extern void tick_nohz_idle_retain_tick(void); +extern void tick_nohz_idle_restart_tick(void); extern void tick_nohz_idle_enter(void); extern void tick_nohz_idle_exit(void); extern void tick_nohz_irq_exit(void); -extern ktime_t tick_nohz_get_sleep_length(void); +extern bool tick_nohz_idle_got_tick(void); +extern ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next); extern unsigned long tick_nohz_get_idle_calls(void); extern unsigned long tick_nohz_get_idle_calls_cpu(int cpu); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); + +static inline void tick_nohz_idle_stop_tick_protected(void) +{ + local_irq_disable(); + tick_nohz_idle_stop_tick(); + local_irq_enable(); +} + #else /* !CONFIG_NO_HZ_COMMON */ #define tick_nohz_enabled (0) static inline int tick_nohz_tick_stopped(void) { 
return 0; } static inline int tick_nohz_tick_stopped_cpu(int cpu) { return 0; } +static inline void tick_nohz_idle_stop_tick(void) { } +static inline void tick_nohz_idle_retain_tick(void) { } +static inline void tick_nohz_idle_restart_tick(void) { } static inline void tick_nohz_idle_enter(void) { } static inline void tick_nohz_idle_exit(void) { } +static inline bool tick_nohz_idle_got_tick(void) { return false; } -static inline ktime_t tick_nohz_get_sleep_length(void) +static inline ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) { - return NSEC_PER_SEC / HZ; + *delta_next = TICK_NSEC; + return *delta_next; } static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; } static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; } + +static inline void tick_nohz_idle_stop_tick_protected(void) { } #endif /* !CONFIG_NO_HZ_COMMON */ #ifdef CONFIG_NO_HZ_FULL diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 82c219dfd3bb..9737fbec7019 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -31,6 +31,7 @@ struct timespec64 get_monotonic_coarse64(void); extern void getrawmonotonic64(struct timespec64 *ts); extern void ktime_get_ts64(struct timespec64 *ts); extern time64_t ktime_get_seconds(void); +extern time64_t __ktime_get_real_seconds(void); extern time64_t ktime_get_real_seconds(void); extern void ktime_get_active_ts64(struct timespec64 *ts); diff --git a/include/linux/utsname.h b/include/linux/utsname.h index c8060c2ecd04..44429d9142ca 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -44,6 +44,8 @@ static inline void put_uts_ns(struct uts_namespace *ns) { kref_put(&ns->kref, free_uts_ns); } + +void uts_ns_init(void); #else static inline void get_uts_ns(struct uts_namespace *ns) { @@ -61,6 +63,10 @@ static inline struct uts_namespace *copy_utsname(unsigned long flags, return old_ns; } + +static inline void uts_ns_init(void) +{ +} #endif #ifdef CONFIG_PROC_SYSCTL 
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index a4c2317d8b9f..f25cef84b41d 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -20,6 +20,17 @@ extern int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos); #endif +struct reclaim_stat { + unsigned nr_dirty; + unsigned nr_unqueued_dirty; + unsigned nr_congested; + unsigned nr_writeback; + unsigned nr_immediate; + unsigned nr_activate; + unsigned nr_ref_keep; + unsigned nr_unmap_fail; +}; + #ifdef CONFIG_VM_EVENT_COUNTERS /* * Light weight per cpu counter implementation. diff --git a/include/linux/xarray.h b/include/linux/xarray.h new file mode 100644 index 000000000000..2dfc8006fe64 --- /dev/null +++ b/include/linux/xarray.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +#ifndef _LINUX_XARRAY_H +#define _LINUX_XARRAY_H +/* + * eXtensible Arrays + * Copyright (c) 2017 Microsoft Corporation + * Author: Matthew Wilcox <[email protected]> + */ + +#include <linux/spinlock.h> + +#define xa_trylock(xa) spin_trylock(&(xa)->xa_lock) +#define xa_lock(xa) spin_lock(&(xa)->xa_lock) +#define xa_unlock(xa) spin_unlock(&(xa)->xa_lock) +#define xa_lock_bh(xa) spin_lock_bh(&(xa)->xa_lock) +#define xa_unlock_bh(xa) spin_unlock_bh(&(xa)->xa_lock) +#define xa_lock_irq(xa) spin_lock_irq(&(xa)->xa_lock) +#define xa_unlock_irq(xa) spin_unlock_irq(&(xa)->xa_lock) +#define xa_lock_irqsave(xa, flags) \ + spin_lock_irqsave(&(xa)->xa_lock, flags) +#define xa_unlock_irqrestore(xa, flags) \ + spin_unlock_irqrestore(&(xa)->xa_lock, flags) + +#endif /* _LINUX_XARRAY_H */ diff --git a/include/net/slhc_vj.h b/include/net/slhc_vj.h index 8716d5942b65..8fcf8908a694 100644 --- a/include/net/slhc_vj.h +++ b/include/net/slhc_vj.h @@ -127,6 +127,7 @@ typedef __u32 int32; */ struct cstate { byte_t cs_this; /* connection id number (xmit) */ + bool initialized; /* true if initialized */ struct cstate *next; /* next in ring (xmit) */ struct iphdr 
cs_ip; /* ip/tcp hdr from most recent packet */ struct tcphdr cs_tcp; diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index 63815f66b274..f0820554caa9 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -49,6 +49,7 @@ enum afs_fs_operation { afs_FS_ExtendLock = 157, /* AFS Extend a file lock */ afs_FS_ReleaseLock = 158, /* AFS Release a file lock */ afs_FS_Lookup = 161, /* AFS lookup file in directory */ + afs_FS_InlineBulkStatus = 65536, /* AFS Fetch multiple file statuses with errors */ afs_FS_FetchData64 = 65537, /* AFS Fetch file data */ afs_FS_StoreData64 = 65538, /* AFS Store file data */ afs_FS_GiveUpAllCallBacks = 65539, /* AFS Give up all our callbacks on a server */ @@ -62,6 +63,27 @@ enum afs_vl_operation { afs_VL_GetCapabilities = 65537, /* AFS Get VL server capabilities */ }; +enum afs_edit_dir_op { + afs_edit_dir_create, + afs_edit_dir_create_error, + afs_edit_dir_create_inval, + afs_edit_dir_create_nospc, + afs_edit_dir_delete, + afs_edit_dir_delete_error, + afs_edit_dir_delete_inval, + afs_edit_dir_delete_noent, +}; + +enum afs_edit_dir_reason { + afs_edit_dir_for_create, + afs_edit_dir_for_link, + afs_edit_dir_for_mkdir, + afs_edit_dir_for_rename, + afs_edit_dir_for_rmdir, + afs_edit_dir_for_symlink, + afs_edit_dir_for_unlink, +}; + #endif /* end __AFS_DECLARE_TRACE_ENUMS_ONCE_ONLY */ /* @@ -93,6 +115,7 @@ enum afs_vl_operation { EM(afs_FS_ExtendLock, "FS.ExtendLock") \ EM(afs_FS_ReleaseLock, "FS.ReleaseLock") \ EM(afs_FS_Lookup, "FS.Lookup") \ + EM(afs_FS_InlineBulkStatus, "FS.InlineBulkStatus") \ EM(afs_FS_FetchData64, "FS.FetchData64") \ EM(afs_FS_StoreData64, "FS.StoreData64") \ EM(afs_FS_GiveUpAllCallBacks, "FS.GiveUpAllCallBacks") \ @@ -104,6 +127,25 @@ enum afs_vl_operation { EM(afs_YFSVL_GetEndpoints, "YFSVL.GetEndpoints") \ E_(afs_VL_GetCapabilities, "VL.GetCapabilities") +#define afs_edit_dir_ops \ + EM(afs_edit_dir_create, "create") \ + EM(afs_edit_dir_create_error, "c_fail") \ + 
EM(afs_edit_dir_create_inval, "c_invl") \ + EM(afs_edit_dir_create_nospc, "c_nspc") \ + EM(afs_edit_dir_delete, "delete") \ + EM(afs_edit_dir_delete_error, "d_err ") \ + EM(afs_edit_dir_delete_inval, "d_invl") \ + E_(afs_edit_dir_delete_noent, "d_nent") + +#define afs_edit_dir_reasons \ + EM(afs_edit_dir_for_create, "Create") \ + EM(afs_edit_dir_for_link, "Link ") \ + EM(afs_edit_dir_for_mkdir, "MkDir ") \ + EM(afs_edit_dir_for_rename, "Rename") \ + EM(afs_edit_dir_for_rmdir, "RmDir ") \ + EM(afs_edit_dir_for_symlink, "Symlnk") \ + E_(afs_edit_dir_for_unlink, "Unlink") + /* * Export enum symbols via userspace. @@ -116,6 +158,8 @@ enum afs_vl_operation { afs_call_traces; afs_fs_operations; afs_vl_operations; +afs_edit_dir_ops; +afs_edit_dir_reasons; /* * Now redefine the EM() and E_() macros to map the enums to the strings that @@ -462,6 +506,75 @@ TRACE_EVENT(afs_call_state, __entry->ret, __entry->abort) ); +TRACE_EVENT(afs_edit_dir, + TP_PROTO(struct afs_vnode *dvnode, + enum afs_edit_dir_reason why, + enum afs_edit_dir_op op, + unsigned int block, + unsigned int slot, + unsigned int f_vnode, + unsigned int f_unique, + const char *name), + + TP_ARGS(dvnode, why, op, block, slot, f_vnode, f_unique, name), + + TP_STRUCT__entry( + __field(unsigned int, vnode ) + __field(unsigned int, unique ) + __field(enum afs_edit_dir_reason, why ) + __field(enum afs_edit_dir_op, op ) + __field(unsigned int, block ) + __field(unsigned short, slot ) + __field(unsigned int, f_vnode ) + __field(unsigned int, f_unique ) + __array(char, name, 18 ) + ), + + TP_fast_assign( + int __len = strlen(name); + __len = min(__len, 17); + __entry->vnode = dvnode->fid.vnode; + __entry->unique = dvnode->fid.unique; + __entry->why = why; + __entry->op = op; + __entry->block = block; + __entry->slot = slot; + __entry->f_vnode = f_vnode; + __entry->f_unique = f_unique; + memcpy(__entry->name, name, __len); + __entry->name[__len] = 0; + ), + + TP_printk("d=%x:%x %s %s %u[%u] f=%x:%x %s", + 
__entry->vnode, __entry->unique, + __print_symbolic(__entry->why, afs_edit_dir_reasons), + __print_symbolic(__entry->op, afs_edit_dir_ops), + __entry->block, __entry->slot, + __entry->f_vnode, __entry->f_unique, + __entry->name) + ); + +TRACE_EVENT(afs_protocol_error, + TP_PROTO(struct afs_call *call, int error, const void *where), + + TP_ARGS(call, error, where), + + TP_STRUCT__entry( + __field(unsigned int, call ) + __field(int, error ) + __field(const void *, where ) + ), + + TP_fast_assign( + __entry->call = call ? call->debug_id : 0; + __entry->error = error; + __entry->where = where; + ), + + TP_printk("c=%08x r=%d sp=%pSR", + __entry->call, __entry->error, __entry->where) + ); + #endif /* _TRACE_AFS_H */ /* This part must be outside protection */ diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 922cb8968fb2..335d87242439 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -50,9 +50,9 @@ DEFINE_EVENT(rpc_task_status, rpc_bind_status, ); TRACE_EVENT(rpc_connect_status, - TP_PROTO(struct rpc_task *task, int status), + TP_PROTO(const struct rpc_task *task), - TP_ARGS(task, status), + TP_ARGS(task), TP_STRUCT__entry( __field(unsigned int, task_id) @@ -63,7 +63,7 @@ TRACE_EVENT(rpc_connect_status, TP_fast_assign( __entry->task_id = task->tk_pid; __entry->client_id = task->tk_client->cl_clid; - __entry->status = status; + __entry->status = task->tk_status; ), TP_printk("task:%u@%u status=%d", @@ -103,9 +103,9 @@ TRACE_EVENT(rpc_request, DECLARE_EVENT_CLASS(rpc_task_running, - TP_PROTO(const struct rpc_clnt *clnt, const struct rpc_task *task, const void *action), + TP_PROTO(const struct rpc_task *task, const void *action), - TP_ARGS(clnt, task, action), + TP_ARGS(task, action), TP_STRUCT__entry( __field(unsigned int, task_id) @@ -117,7 +117,8 @@ DECLARE_EVENT_CLASS(rpc_task_running, ), TP_fast_assign( - __entry->client_id = clnt ? clnt->cl_clid : -1; + __entry->client_id = task->tk_client ? 
+ task->tk_client->cl_clid : -1; __entry->task_id = task->tk_pid; __entry->action = action; __entry->runstate = task->tk_runstate; @@ -136,33 +137,33 @@ DECLARE_EVENT_CLASS(rpc_task_running, DEFINE_EVENT(rpc_task_running, rpc_task_begin, - TP_PROTO(const struct rpc_clnt *clnt, const struct rpc_task *task, const void *action), + TP_PROTO(const struct rpc_task *task, const void *action), - TP_ARGS(clnt, task, action) + TP_ARGS(task, action) ); DEFINE_EVENT(rpc_task_running, rpc_task_run_action, - TP_PROTO(const struct rpc_clnt *clnt, const struct rpc_task *task, const void *action), + TP_PROTO(const struct rpc_task *task, const void *action), - TP_ARGS(clnt, task, action) + TP_ARGS(task, action) ); DEFINE_EVENT(rpc_task_running, rpc_task_complete, - TP_PROTO(const struct rpc_clnt *clnt, const struct rpc_task *task, const void *action), + TP_PROTO(const struct rpc_task *task, const void *action), - TP_ARGS(clnt, task, action) + TP_ARGS(task, action) ); DECLARE_EVENT_CLASS(rpc_task_queued, - TP_PROTO(const struct rpc_clnt *clnt, const struct rpc_task *task, const struct rpc_wait_queue *q), + TP_PROTO(const struct rpc_task *task, const struct rpc_wait_queue *q), - TP_ARGS(clnt, task, q), + TP_ARGS(task, q), TP_STRUCT__entry( __field(unsigned int, task_id) @@ -175,7 +176,8 @@ DECLARE_EVENT_CLASS(rpc_task_queued, ), TP_fast_assign( - __entry->client_id = clnt ? clnt->cl_clid : -1; + __entry->client_id = task->tk_client ? 
+ task->tk_client->cl_clid : -1; __entry->task_id = task->tk_pid; __entry->timeout = task->tk_timeout; __entry->runstate = task->tk_runstate; @@ -196,18 +198,63 @@ DECLARE_EVENT_CLASS(rpc_task_queued, DEFINE_EVENT(rpc_task_queued, rpc_task_sleep, - TP_PROTO(const struct rpc_clnt *clnt, const struct rpc_task *task, const struct rpc_wait_queue *q), + TP_PROTO(const struct rpc_task *task, const struct rpc_wait_queue *q), - TP_ARGS(clnt, task, q) + TP_ARGS(task, q) ); DEFINE_EVENT(rpc_task_queued, rpc_task_wakeup, - TP_PROTO(const struct rpc_clnt *clnt, const struct rpc_task *task, const struct rpc_wait_queue *q), + TP_PROTO(const struct rpc_task *task, const struct rpc_wait_queue *q), + + TP_ARGS(task, q) + +); + +TRACE_EVENT(rpc_stats_latency, + + TP_PROTO( + const struct rpc_task *task, + ktime_t backlog, + ktime_t rtt, + ktime_t execute + ), - TP_ARGS(clnt, task, q) + TP_ARGS(task, backlog, rtt, execute), + TP_STRUCT__entry( + __field(u32, xid) + __field(int, version) + __string(progname, task->tk_client->cl_program->name) + __string(procname, rpc_proc_name(task)) + __field(unsigned long, backlog) + __field(unsigned long, rtt) + __field(unsigned long, execute) + __string(addr, + task->tk_xprt->address_strings[RPC_DISPLAY_ADDR]) + __string(port, + task->tk_xprt->address_strings[RPC_DISPLAY_PORT]) + ), + + TP_fast_assign( + __entry->xid = be32_to_cpu(task->tk_rqstp->rq_xid); + __entry->version = task->tk_client->cl_vers; + __assign_str(progname, task->tk_client->cl_program->name) + __assign_str(procname, rpc_proc_name(task)) + __entry->backlog = ktime_to_us(backlog); + __entry->rtt = ktime_to_us(rtt); + __entry->execute = ktime_to_us(execute); + __assign_str(addr, + task->tk_xprt->address_strings[RPC_DISPLAY_ADDR]); + __assign_str(port, + task->tk_xprt->address_strings[RPC_DISPLAY_PORT]); + ), + + TP_printk("peer=[%s]:%s xid=0x%08x %sv%d %s backlog=%lu rtt=%lu execute=%lu", + __get_str(addr), __get_str(port), __entry->xid, + __get_str(progname), __entry->version, 
__get_str(procname), + __entry->backlog, __entry->rtt, __entry->execute) ); /* @@ -406,6 +453,27 @@ DEFINE_EVENT(rpc_xprt_event, xprt_complete_rqst, TP_PROTO(struct rpc_xprt *xprt, __be32 xid, int status), TP_ARGS(xprt, xid, status)); +TRACE_EVENT(xprt_ping, + TP_PROTO(const struct rpc_xprt *xprt, int status), + + TP_ARGS(xprt, status), + + TP_STRUCT__entry( + __field(int, status) + __string(addr, xprt->address_strings[RPC_DISPLAY_ADDR]) + __string(port, xprt->address_strings[RPC_DISPLAY_PORT]) + ), + + TP_fast_assign( + __entry->status = status; + __assign_str(addr, xprt->address_strings[RPC_DISPLAY_ADDR]); + __assign_str(port, xprt->address_strings[RPC_DISPLAY_PORT]); + ), + + TP_printk("peer=[%s]:%s status=%d", + __get_str(addr), __get_str(port), __entry->status) +); + TRACE_EVENT(xs_tcp_data_ready, TP_PROTO(struct rpc_xprt *xprt, int err, unsigned int total), diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 6570c5b45ba1..a1cb91342231 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -346,15 +346,9 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive, TP_PROTO(int nid, unsigned long nr_scanned, unsigned long nr_reclaimed, - unsigned long nr_dirty, unsigned long nr_writeback, - unsigned long nr_congested, unsigned long nr_immediate, - unsigned long nr_activate, unsigned long nr_ref_keep, - unsigned long nr_unmap_fail, - int priority, int file), + struct reclaim_stat *stat, int priority, int file), - TP_ARGS(nid, nr_scanned, nr_reclaimed, nr_dirty, nr_writeback, - nr_congested, nr_immediate, nr_activate, nr_ref_keep, - nr_unmap_fail, priority, file), + TP_ARGS(nid, nr_scanned, nr_reclaimed, stat, priority, file), TP_STRUCT__entry( __field(int, nid) @@ -375,13 +369,13 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive, __entry->nid = nid; __entry->nr_scanned = nr_scanned; __entry->nr_reclaimed = nr_reclaimed; - __entry->nr_dirty = nr_dirty; - __entry->nr_writeback = nr_writeback; - __entry->nr_congested = 
nr_congested; - __entry->nr_immediate = nr_immediate; - __entry->nr_activate = nr_activate; - __entry->nr_ref_keep = nr_ref_keep; - __entry->nr_unmap_fail = nr_unmap_fail; + __entry->nr_dirty = stat->nr_dirty; + __entry->nr_writeback = stat->nr_writeback; + __entry->nr_congested = stat->nr_congested; + __entry->nr_immediate = stat->nr_immediate; + __entry->nr_activate = stat->nr_activate; + __entry->nr_ref_keep = stat->nr_ref_keep; + __entry->nr_unmap_fail = stat->nr_unmap_fail; __entry->priority = priority; __entry->reclaim_flags = trace_shrink_flags(file); ), diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index f8b134f5608f..e7ee32861d51 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -27,6 +27,9 @@ # define MAP_UNINITIALIZED 0x0 /* Don't support this flag */ #endif +/* 0x0100 - 0x80000 flags are defined in asm-generic/mman.h */ +#define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */ + /* * Flags for mlock */ diff --git a/include/uapi/linux/const.h b/include/uapi/linux/const.h index 92537757590a..5ed721ad5b19 100644 --- a/include/uapi/linux/const.h +++ b/include/uapi/linux/const.h @@ -1,8 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* const.h: Macros for dealing with constants. */ -#ifndef _LINUX_CONST_H -#define _LINUX_CONST_H +#ifndef _UAPI_LINUX_CONST_H +#define _UAPI_LINUX_CONST_H /* Some constant macros are used in both assembler and * C code. 
Therefore we cannot annotate them always with @@ -22,7 +22,10 @@ #define _AT(T,X) ((T)(X)) #endif -#define _BITUL(x) (_AC(1,UL) << (x)) -#define _BITULL(x) (_AC(1,ULL) << (x)) +#define _UL(x) (_AC(x, UL)) +#define _ULL(x) (_AC(x, ULL)) -#endif /* !(_LINUX_CONST_H) */ +#define _BITUL(x) (_UL(1) << (x)) +#define _BITULL(x) (_ULL(1) << (x)) + +#endif /* _UAPI_LINUX_CONST_H */ diff --git a/include/uapi/linux/msg.h b/include/uapi/linux/msg.h index 5d5ab81dc9be..e4a0d9a9a9e8 100644 --- a/include/uapi/linux/msg.h +++ b/include/uapi/linux/msg.h @@ -7,6 +7,7 @@ /* ipcs ctl commands */ #define MSG_STAT 11 #define MSG_INFO 12 +#define MSG_STAT_ANY 13 /* msgrcv options */ #define MSG_NOERROR 010000 /* no error if message is too big */ diff --git a/include/uapi/linux/sem.h b/include/uapi/linux/sem.h index 9c3e745b0656..39a1876f039e 100644 --- a/include/uapi/linux/sem.h +++ b/include/uapi/linux/sem.h @@ -19,6 +19,7 @@ /* ipcs ctl cmds */ #define SEM_STAT 18 #define SEM_INFO 19 +#define SEM_STAT_ANY 20 /* Obsolete, used only for backwards compatibility and libc5 compiles */ struct semid_ds { diff --git a/include/uapi/linux/shm.h b/include/uapi/linux/shm.h index 4de12a39b075..dde1344f047c 100644 --- a/include/uapi/linux/shm.h +++ b/include/uapi/linux/shm.h @@ -83,8 +83,9 @@ struct shmid_ds { #define SHM_UNLOCK 12 /* ipcs ctl commands */ -#define SHM_STAT 13 -#define SHM_INFO 14 +#define SHM_STAT 13 +#define SHM_INFO 14 +#define SHM_STAT_ANY 15 /* Obsolete, used only for backwards compatibility */ struct shminfo { diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h index 4e8b8304b793..40297a3181ed 100644 --- a/include/uapi/linux/virtio_balloon.h +++ b/include/uapi/linux/virtio_balloon.h @@ -53,7 +53,9 @@ struct virtio_balloon_config { #define VIRTIO_BALLOON_S_MEMTOT 5 /* Total amount of memory */ #define VIRTIO_BALLOON_S_AVAIL 6 /* Available memory as in /proc */ #define VIRTIO_BALLOON_S_CACHES 7 /* Disk caches */ -#define VIRTIO_BALLOON_S_NR 8 
+#define VIRTIO_BALLOON_S_HTLB_PGALLOC 8 /* Hugetlb page allocations */ +#define VIRTIO_BALLOON_S_HTLB_PGFAIL 9 /* Hugetlb page allocation failures */ +#define VIRTIO_BALLOON_S_NR 10 /* * Memory statistics structure. diff --git a/include/xen/interface/features.h b/include/xen/interface/features.h index 9b0eb574f0d1..6d1384abfbdf 100644 --- a/include/xen/interface/features.h +++ b/include/xen/interface/features.h @@ -42,6 +42,9 @@ /* x86: Does this Xen host support the MMU_PT_UPDATE_PRESERVE_AD hypercall? */ #define XENFEAT_mmu_pt_update_preserve_ad 5 +/* x86: Does this Xen host support the MMU_{CLEAR,COPY}_PAGE hypercall? */ +#define XENFEAT_highmem_assist 6 + /* * If set, GNTTABOP_map_grant_ref honors flags to be placed into guest kernel * available pte bits. @@ -60,6 +63,26 @@ /* operation as Dom0 is supported */ #define XENFEAT_dom0 11 +/* Xen also maps grant references at pfn = mfn. + * This feature flag is deprecated and should not be used. +#define XENFEAT_grant_map_identity 12 + */ + +/* Guest can use XENMEMF_vnode to specify virtual node for memory op. */ +#define XENFEAT_memory_op_vnode_supported 13 + +/* arm: Hypervisor supports ARM SMC calling convention. */ +#define XENFEAT_ARM_SMCCC_supported 14 + +/* + * x86/PVH: If set, ACPI RSDP can be placed at any address. Otherwise RSDP + * must be located in lower 1MB, as required by ACPI Specification for IA-PC + * systems. + * This feature flag is only consulted if XEN_ELFNOTE_GUEST_OS contains + * the "linux" string. + */ +#define XENFEAT_linux_rsdp_unrestricted 15 + #define XENFEAT_NR_SUBMAPS 1 #endif /* __XEN_PUBLIC_FEATURES_H__ */ diff --git a/init/do_mounts_rd.c b/init/do_mounts_rd.c index 12c159824c7b..035a5f0ab26b 100644 --- a/init/do_mounts_rd.c +++ b/init/do_mounts_rd.c @@ -255,7 +255,7 @@ int __init rd_load_image(char *from) nblocks, ((nblocks-1)/devblocks)+1, nblocks>devblocks ? 
"s" : ""); for (i = 0, disk = 1; i < nblocks; i++) { if (i && (i % devblocks == 0)) { - printk("done disk #%d.\n", disk++); + pr_cont("done disk #%d.\n", disk++); rotate = 0; if (ksys_close(in_fd)) { printk("Error closing the disk.\n"); @@ -278,7 +278,7 @@ int __init rd_load_image(char *from) } #endif } - printk("done.\n"); + pr_cont("done.\n"); successful_load: res = 1; diff --git a/init/main.c b/init/main.c index d499f4a80e0b..b795aa341a3a 100644 --- a/init/main.c +++ b/init/main.c @@ -51,6 +51,7 @@ #include <linux/taskstats_kern.h> #include <linux/delayacct.h> #include <linux/unistd.h> +#include <linux/utsname.h> #include <linux/rmap.h> #include <linux/mempolicy.h> #include <linux/key.h> @@ -706,6 +707,7 @@ asmlinkage __visible void __init start_kernel(void) cred_init(); fork_init(); proc_caches_init(); + uts_ns_init(); buffer_init(); key_init(); security_init(); @@ -713,6 +715,7 @@ asmlinkage __visible void __init start_kernel(void) vfs_caches_init(); pagecache_init(); signals_init(); + seq_file_init(); proc_root_init(); nsfs_init(); cpuset_init(); diff --git a/ipc/msg.c b/ipc/msg.c index 114a21189613..56fd1c73eedc 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -497,14 +497,14 @@ static int msgctl_stat(struct ipc_namespace *ns, int msqid, memset(p, 0, sizeof(*p)); rcu_read_lock(); - if (cmd == MSG_STAT) { + if (cmd == MSG_STAT || cmd == MSG_STAT_ANY) { msq = msq_obtain_object(ns, msqid); if (IS_ERR(msq)) { err = PTR_ERR(msq); goto out_unlock; } id = msq->q_perm.id; - } else { + } else { /* IPC_STAT */ msq = msq_obtain_object_check(ns, msqid); if (IS_ERR(msq)) { err = PTR_ERR(msq); @@ -512,9 +512,14 @@ static int msgctl_stat(struct ipc_namespace *ns, int msqid, } } - err = -EACCES; - if (ipcperms(ns, &msq->q_perm, S_IRUGO)) - goto out_unlock; + /* see comment for SHM_STAT_ANY */ + if (cmd == MSG_STAT_ANY) + audit_ipc_obj(&msq->q_perm); + else { + err = -EACCES; + if (ipcperms(ns, &msq->q_perm, S_IRUGO)) + goto out_unlock; + } err = 
security_msg_queue_msgctl(&msq->q_perm, cmd); if (err) @@ -572,6 +577,7 @@ long ksys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf) return err; } case MSG_STAT: /* msqid is an index rather than a msg queue id */ + case MSG_STAT_ANY: case IPC_STAT: err = msgctl_stat(ns, msqid, cmd, &msqid64); if (err < 0) @@ -690,6 +696,7 @@ long compat_ksys_msgctl(int msqid, int cmd, void __user *uptr) } case IPC_STAT: case MSG_STAT: + case MSG_STAT_ANY: err = msgctl_stat(ns, msqid, cmd, &msqid64); if (err < 0) return err; diff --git a/ipc/sem.c b/ipc/sem.c index 2994da8ccc7f..06be75d9217a 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -1220,14 +1220,14 @@ static int semctl_stat(struct ipc_namespace *ns, int semid, memset(semid64, 0, sizeof(*semid64)); rcu_read_lock(); - if (cmd == SEM_STAT) { + if (cmd == SEM_STAT || cmd == SEM_STAT_ANY) { sma = sem_obtain_object(ns, semid); if (IS_ERR(sma)) { err = PTR_ERR(sma); goto out_unlock; } id = sma->sem_perm.id; - } else { + } else { /* IPC_STAT */ sma = sem_obtain_object_check(ns, semid); if (IS_ERR(sma)) { err = PTR_ERR(sma); @@ -1235,9 +1235,14 @@ static int semctl_stat(struct ipc_namespace *ns, int semid, } } - err = -EACCES; - if (ipcperms(ns, &sma->sem_perm, S_IRUGO)) - goto out_unlock; + /* see comment for SHM_STAT_ANY */ + if (cmd == SEM_STAT_ANY) + audit_ipc_obj(&sma->sem_perm); + else { + err = -EACCES; + if (ipcperms(ns, &sma->sem_perm, S_IRUGO)) + goto out_unlock; + } err = security_sem_semctl(&sma->sem_perm, cmd); if (err) @@ -1626,6 +1631,7 @@ long ksys_semctl(int semid, int semnum, int cmd, unsigned long arg) return semctl_info(ns, semid, cmd, p); case IPC_STAT: case SEM_STAT: + case SEM_STAT_ANY: err = semctl_stat(ns, semid, cmd, &semid64); if (err < 0) return err; @@ -1732,6 +1738,7 @@ long compat_ksys_semctl(int semid, int semnum, int cmd, int arg) return semctl_info(ns, semid, cmd, p); case IPC_STAT: case SEM_STAT: + case SEM_STAT_ANY: err = semctl_stat(ns, semid, cmd, &semid64); if (err < 0) return err; diff --git 
a/ipc/shm.c b/ipc/shm.c index acefe44fefef..5639345dbec9 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -415,7 +415,7 @@ static int shm_split(struct vm_area_struct *vma, unsigned long addr) struct file *file = vma->vm_file; struct shm_file_data *sfd = shm_file_data(file); - if (sfd->vm_ops && sfd->vm_ops->split) + if (sfd->vm_ops->split) return sfd->vm_ops->split(vma, addr); return 0; @@ -947,14 +947,14 @@ static int shmctl_stat(struct ipc_namespace *ns, int shmid, memset(tbuf, 0, sizeof(*tbuf)); rcu_read_lock(); - if (cmd == SHM_STAT) { + if (cmd == SHM_STAT || cmd == SHM_STAT_ANY) { shp = shm_obtain_object(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); goto out_unlock; } id = shp->shm_perm.id; - } else { + } else { /* IPC_STAT */ shp = shm_obtain_object_check(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); @@ -962,9 +962,20 @@ static int shmctl_stat(struct ipc_namespace *ns, int shmid, } } - err = -EACCES; - if (ipcperms(ns, &shp->shm_perm, S_IRUGO)) - goto out_unlock; + /* + * Semantically SHM_STAT_ANY ought to be identical to + * that functionality provided by the /proc/sysvipc/ + * interface. As such, only audit these calls and + * do not do traditional S_IRUGO permission checks on + * the ipc object. 
+ */ + if (cmd == SHM_STAT_ANY) + audit_ipc_obj(&shp->shm_perm); + else { + err = -EACCES; + if (ipcperms(ns, &shp->shm_perm, S_IRUGO)) + goto out_unlock; + } err = security_shm_shmctl(&shp->shm_perm, cmd); if (err) @@ -1104,6 +1115,7 @@ long ksys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf) return err; } case SHM_STAT: + case SHM_STAT_ANY: case IPC_STAT: { err = shmctl_stat(ns, shmid, cmd, &sem64); if (err < 0) @@ -1282,6 +1294,7 @@ long compat_ksys_shmctl(int shmid, int cmd, void __user *uptr) return err; } case IPC_STAT: + case SHM_STAT_ANY: case SHM_STAT: err = shmctl_stat(ns, shmid, cmd, &sem64); if (err < 0) diff --git a/ipc/util.c b/ipc/util.c index 3783b7991cc7..4e81182fa0ac 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -89,6 +89,7 @@ static int __init ipc_init(void) { int err_sem, err_msg; + proc_mkdir("sysvipc", NULL); err_sem = sem_init(); WARN(err_sem, "ipc: sysv sem_init failed: %d\n", err_sem); err_msg = msg_init(); diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index 90ff129c88a2..62c301ad0773 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c @@ -242,11 +242,11 @@ static void kdb_printbp(kdb_bp_t *bp, int i) kdb_symbol_print(bp->bp_addr, NULL, KDB_SP_DEFAULT); if (bp->bp_enabled) - kdb_printf("\n is enabled"); + kdb_printf("\n is enabled "); else kdb_printf("\n is disabled"); - kdb_printf("\taddr at %016lx, hardtype=%d installed=%d\n", + kdb_printf(" addr at %016lx, hardtype=%d installed=%d\n", bp->bp_addr, bp->bp_type, bp->bp_installed); kdb_printf("\n"); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index dbb0781a0533..e405677ee08d 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1150,6 +1150,16 @@ void kdb_set_current_task(struct task_struct *p) kdb_current_regs = NULL; } +static void drop_newline(char *buf) +{ + size_t len = strlen(buf); + + if (len == 0) + return; + if (*(buf + len - 1) == '\n') + *(buf + len - 1) = '\0'; +} + /* * kdb_local 
- The main code for kdb. This routine is invoked on a * specific processor, it is not global. The main kdb() routine @@ -1327,6 +1337,7 @@ do_full_getstr: cmdptr = cmd_head; diag = kdb_parse(cmdbuf); if (diag == KDB_NOTFOUND) { + drop_newline(cmdbuf); kdb_printf("Unknown kdb command: '%s'\n", cmdbuf); diag = 0; } @@ -1566,6 +1577,7 @@ static int kdb_md(int argc, const char **argv) int symbolic = 0; int valid = 0; int phys = 0; + int raw = 0; kdbgetintenv("MDCOUNT", &mdcount); kdbgetintenv("RADIX", &radix); @@ -1575,9 +1587,10 @@ static int kdb_md(int argc, const char **argv) repeat = mdcount * 16 / bytesperword; if (strcmp(argv[0], "mdr") == 0) { - if (argc != 2) + if (argc == 2 || (argc == 0 && last_addr != 0)) + valid = raw = 1; + else return KDB_ARGCOUNT; - valid = 1; } else if (isdigit(argv[0][2])) { bytesperword = (int)(argv[0][2] - '0'); if (bytesperword == 0) { @@ -1613,7 +1626,10 @@ static int kdb_md(int argc, const char **argv) radix = last_radix; bytesperword = last_bytesperword; repeat = last_repeat; - mdcount = ((repeat * bytesperword) + 15) / 16; + if (raw) + mdcount = repeat; + else + mdcount = ((repeat * bytesperword) + 15) / 16; } if (argc) { @@ -1630,7 +1646,10 @@ static int kdb_md(int argc, const char **argv) diag = kdbgetularg(argv[nextarg], &val); if (!diag) { mdcount = (int) val; - repeat = mdcount * 16 / bytesperword; + if (raw) + repeat = mdcount; + else + repeat = mdcount * 16 / bytesperword; } } if (argc >= nextarg+1) { @@ -1640,8 +1659,15 @@ static int kdb_md(int argc, const char **argv) } } - if (strcmp(argv[0], "mdr") == 0) - return kdb_mdr(addr, mdcount); + if (strcmp(argv[0], "mdr") == 0) { + int ret; + last_addr = addr; + ret = kdb_mdr(addr, mdcount); + last_addr += mdcount; + last_repeat = mdcount; + last_bytesperword = bytesperword; // to make REPEAT happy + return ret; + } switch (radix) { case 10: @@ -2473,41 +2499,6 @@ static int kdb_kill(int argc, const char **argv) return 0; } -struct kdb_tm { - int tm_sec; /* seconds */ - int 
tm_min; /* minutes */ - int tm_hour; /* hours */ - int tm_mday; /* day of the month */ - int tm_mon; /* month */ - int tm_year; /* year */ -}; - -static void kdb_gmtime(struct timespec *tv, struct kdb_tm *tm) -{ - /* This will work from 1970-2099, 2100 is not a leap year */ - static int mon_day[] = { 31, 29, 31, 30, 31, 30, 31, - 31, 30, 31, 30, 31 }; - memset(tm, 0, sizeof(*tm)); - tm->tm_sec = tv->tv_sec % (24 * 60 * 60); - tm->tm_mday = tv->tv_sec / (24 * 60 * 60) + - (2 * 365 + 1); /* shift base from 1970 to 1968 */ - tm->tm_min = tm->tm_sec / 60 % 60; - tm->tm_hour = tm->tm_sec / 60 / 60; - tm->tm_sec = tm->tm_sec % 60; - tm->tm_year = 68 + 4*(tm->tm_mday / (4*365+1)); - tm->tm_mday %= (4*365+1); - mon_day[1] = 29; - while (tm->tm_mday >= mon_day[tm->tm_mon]) { - tm->tm_mday -= mon_day[tm->tm_mon]; - if (++tm->tm_mon == 12) { - tm->tm_mon = 0; - ++tm->tm_year; - mon_day[1] = 28; - } - } - ++tm->tm_mday; -} - /* * Most of this code has been lifted from kernel/timer.c::sys_sysinfo(). 
* I cannot call that code directly from kdb, it has an unconditional @@ -2515,10 +2506,10 @@ static void kdb_gmtime(struct timespec *tv, struct kdb_tm *tm) */ static void kdb_sysinfo(struct sysinfo *val) { - struct timespec uptime; - ktime_get_ts(&uptime); + u64 uptime = ktime_get_mono_fast_ns(); + memset(val, 0, sizeof(*val)); - val->uptime = uptime.tv_sec; + val->uptime = div_u64(uptime, NSEC_PER_SEC); val->loads[0] = avenrun[0]; val->loads[1] = avenrun[1]; val->loads[2] = avenrun[2]; @@ -2533,8 +2524,8 @@ static void kdb_sysinfo(struct sysinfo *val) */ static int kdb_summary(int argc, const char **argv) { - struct timespec now; - struct kdb_tm tm; + time64_t now; + struct tm tm; struct sysinfo val; if (argc) @@ -2548,9 +2539,9 @@ static int kdb_summary(int argc, const char **argv) kdb_printf("domainname %s\n", init_uts_ns.name.domainname); kdb_printf("ccversion %s\n", __stringify(CCVERSION)); - now = __current_kernel_time(); - kdb_gmtime(&now, &tm); - kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d " + now = __ktime_get_real_seconds(); + time64_to_tm(now, 0, &tm); + kdb_printf("date %04ld-%02d-%02d %02d:%02d:%02d " "tz_minuteswest %d\n", 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec, diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index d35cc2d3a4cc..990b3cc526c8 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -129,13 +129,13 @@ int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab) } if (i >= ARRAY_SIZE(kdb_name_table)) { debug_kfree(kdb_name_table[0]); - memcpy(kdb_name_table, kdb_name_table+1, + memmove(kdb_name_table, kdb_name_table+1, sizeof(kdb_name_table[0]) * (ARRAY_SIZE(kdb_name_table)-1)); } else { debug_kfree(knt1); knt1 = kdb_name_table[i]; - memcpy(kdb_name_table+i, kdb_name_table+i+1, + memmove(kdb_name_table+i, kdb_name_table+i+1, sizeof(kdb_name_table[0]) * (ARRAY_SIZE(kdb_name_table)-i-1)); } diff --git a/kernel/panic.c b/kernel/panic.c index 
6c3b08cd1139..42e487488554 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -34,7 +34,8 @@ #define PANIC_BLINK_SPD 18 int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; -static unsigned long tainted_mask; +static unsigned long tainted_mask = + IS_ENABLED(CONFIG_GCC_PLUGIN_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0; static int pause_on_oops; static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); @@ -308,52 +309,40 @@ EXPORT_SYMBOL(panic); * is being removed anyway. */ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { - { 'P', 'G', true }, /* TAINT_PROPRIETARY_MODULE */ - { 'F', ' ', true }, /* TAINT_FORCED_MODULE */ - { 'S', ' ', false }, /* TAINT_CPU_OUT_OF_SPEC */ - { 'R', ' ', false }, /* TAINT_FORCED_RMMOD */ - { 'M', ' ', false }, /* TAINT_MACHINE_CHECK */ - { 'B', ' ', false }, /* TAINT_BAD_PAGE */ - { 'U', ' ', false }, /* TAINT_USER */ - { 'D', ' ', false }, /* TAINT_DIE */ - { 'A', ' ', false }, /* TAINT_OVERRIDDEN_ACPI_TABLE */ - { 'W', ' ', false }, /* TAINT_WARN */ - { 'C', ' ', true }, /* TAINT_CRAP */ - { 'I', ' ', false }, /* TAINT_FIRMWARE_WORKAROUND */ - { 'O', ' ', true }, /* TAINT_OOT_MODULE */ - { 'E', ' ', true }, /* TAINT_UNSIGNED_MODULE */ - { 'L', ' ', false }, /* TAINT_SOFTLOCKUP */ - { 'K', ' ', true }, /* TAINT_LIVEPATCH */ - { 'X', ' ', true }, /* TAINT_AUX */ + [ TAINT_PROPRIETARY_MODULE ] = { 'P', 'G', true }, + [ TAINT_FORCED_MODULE ] = { 'F', ' ', true }, + [ TAINT_CPU_OUT_OF_SPEC ] = { 'S', ' ', false }, + [ TAINT_FORCED_RMMOD ] = { 'R', ' ', false }, + [ TAINT_MACHINE_CHECK ] = { 'M', ' ', false }, + [ TAINT_BAD_PAGE ] = { 'B', ' ', false }, + [ TAINT_USER ] = { 'U', ' ', false }, + [ TAINT_DIE ] = { 'D', ' ', false }, + [ TAINT_OVERRIDDEN_ACPI_TABLE ] = { 'A', ' ', false }, + [ TAINT_WARN ] = { 'W', ' ', false }, + [ TAINT_CRAP ] = { 'C', ' ', true }, + [ TAINT_FIRMWARE_WORKAROUND ] = { 'I', ' ', false }, + [ TAINT_OOT_MODULE ] = { 'O', ' ', true }, + [ TAINT_UNSIGNED_MODULE ] = { 'E', ' ', true }, + [ 
TAINT_SOFTLOCKUP ] = { 'L', ' ', false }, + [ TAINT_LIVEPATCH ] = { 'K', ' ', true }, + [ TAINT_AUX ] = { 'X', ' ', true }, + [ TAINT_RANDSTRUCT ] = { 'T', ' ', true }, }; /** - * print_tainted - return a string to represent the kernel taint state. + * print_tainted - return a string to represent the kernel taint state. * - * 'P' - Proprietary module has been loaded. - * 'F' - Module has been forcibly loaded. - * 'S' - SMP with CPUs not designed for SMP. - * 'R' - User forced a module unload. - * 'M' - System experienced a machine check exception. - * 'B' - System has hit bad_page. - * 'U' - Userspace-defined naughtiness. - * 'D' - Kernel has oopsed before - * 'A' - ACPI table overridden. - * 'W' - Taint on warning. - * 'C' - modules from drivers/staging are loaded. - * 'I' - Working around severe firmware bug. - * 'O' - Out-of-tree module has been loaded. - * 'E' - Unsigned module has been loaded. - * 'L' - A soft lockup has previously occurred. - * 'K' - Kernel has been live patched. - * 'X' - Auxiliary taint, for distros' use. + * For individual taint flag meanings, see Documentation/sysctl/kernel.txt * - * The string is overwritten by the next call to print_tainted(). + * The string is overwritten by the next call to print_tainted(), + * but is always NULL terminated. 
*/ const char *print_tainted(void) { static char buf[TAINT_FLAGS_COUNT + sizeof("Tainted: ")]; + BUILD_BUG_ON(ARRAY_SIZE(taint_flags) != TAINT_FLAGS_COUNT); + if (tainted_mask) { char *s; int i; diff --git a/kernel/params.c b/kernel/params.c index cc9108c2a1fd..ce89f757e6da 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -111,8 +111,8 @@ bool parameq(const char *a, const char *b) static void param_check_unsafe(const struct kernel_param *kp) { if (kp->flags & KERNEL_PARAM_FL_UNSAFE) { - pr_warn("Setting dangerous option %s - tainting kernel\n", - kp->name); + pr_notice("Setting dangerous option %s - tainting kernel\n", + kp->name); add_taint(TAINT_USER, LOCKDEP_STILL_OK); } } diff --git a/kernel/pid.c b/kernel/pid.c index ed6c343fe50d..157fe4b19971 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -70,7 +70,7 @@ int pid_max_max = PID_MAX_LIMIT; */ struct pid_namespace init_pid_ns = { .kref = KREF_INIT(2), - .idr = IDR_INIT, + .idr = IDR_INIT(init_pid_ns.idr), .pid_allocated = PIDNS_ADDING, .level = 0, .child_reaper = &init_task, diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 9d7503910ce2..fa39092b7aea 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -295,6 +295,7 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, * changed */ plist_del(node, &c->list); + /* fall through */ case PM_QOS_ADD_REQ: plist_node_init(node, new_value); plist_add(node, &c->list); @@ -367,6 +368,7 @@ bool pm_qos_update_flags(struct pm_qos_flags *pqf, break; case PM_QOS_UPDATE_REQ: pm_qos_flags_remove_req(pqf, req); + /* fall through */ case PM_QOS_ADD_REQ: req->flags = val; INIT_LIST_HEAD(&req->node); diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 2975f195e1c4..1a3e9bddd17b 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -141,13 +141,15 @@ static void cpuidle_idle_call(void) } /* - * Tell the RCU framework we are entering an idle section, - * so no more rcu read side critical sections and one more + * The 
RCU framework needs to be told that we are entering an idle + * section, so no more rcu read side critical sections and one more * step to the grace period */ - rcu_idle_enter(); if (cpuidle_not_available(drv, dev)) { + tick_nohz_idle_stop_tick(); + rcu_idle_enter(); + default_idle_call(); goto exit_idle; } @@ -164,20 +166,37 @@ static void cpuidle_idle_call(void) if (idle_should_enter_s2idle() || dev->use_deepest_state) { if (idle_should_enter_s2idle()) { + rcu_idle_enter(); + entered_state = cpuidle_enter_s2idle(drv, dev); if (entered_state > 0) { local_irq_enable(); goto exit_idle; } + + rcu_idle_exit(); } + tick_nohz_idle_stop_tick(); + rcu_idle_enter(); + next_state = cpuidle_find_deepest_state(drv, dev); call_cpuidle(drv, dev, next_state); } else { + bool stop_tick = true; + /* * Ask the cpuidle framework to choose a convenient idle state. */ - next_state = cpuidle_select(drv, dev); + next_state = cpuidle_select(drv, dev, &stop_tick); + + if (stop_tick) + tick_nohz_idle_stop_tick(); + else + tick_nohz_idle_retain_tick(); + + rcu_idle_enter(); + entered_state = call_cpuidle(drv, dev, next_state); /* * Give the governor an opportunity to reflect on the outcome @@ -222,6 +241,7 @@ static void do_idle(void) rmb(); if (cpu_is_offline(cpu)) { + tick_nohz_idle_stop_tick_protected(); cpuhp_report_idle_dead(); arch_cpu_idle_dead(); } @@ -235,10 +255,12 @@ static void do_idle(void) * broadcast device expired for us, we don't want to go deep * idle as we know that the IPI is going to arrive right away. 
*/ - if (cpu_idle_force_poll || tick_check_broadcast_expired()) + if (cpu_idle_force_poll || tick_check_broadcast_expired()) { + tick_nohz_idle_restart_tick(); cpu_idle_poll(); - else + } else { cpuidle_idle_call(); + } arch_cpu_idle_exit(); } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index bdf7090b106d..6a78cf70761d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1340,7 +1340,7 @@ static struct ctl_table vm_table[] = { { .procname = "dirtytime_expire_seconds", .data = &dirtytime_expire_interval, - .maxlen = sizeof(dirty_expire_interval), + .maxlen = sizeof(dirtytime_expire_interval), .mode = 0644, .proc_handler = dirtytime_interval_handler, .extra1 = &zero, @@ -2511,6 +2511,15 @@ static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, } #endif +/** + * struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure + * @min: pointer to minimum allowable value + * @max: pointer to maximum allowable value + * + * The do_proc_dointvec_minmax_conv_param structure provides the + * minimum and maximum values for doing range checking for those sysctl + * parameters that use the proc_dointvec_minmax() handler. + */ struct do_proc_dointvec_minmax_conv_param { int *min; int *max; @@ -2554,7 +2563,7 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, * This routine will ensure the values are within the range specified by * table->extra1 (min) and table->extra2 (max). * - * Returns 0 on success. + * Returns 0 on success or -EINVAL on write when the range check fails. 
*/ int proc_dointvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -2567,6 +2576,15 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, do_proc_dointvec_minmax_conv, &param); } +/** + * struct do_proc_douintvec_minmax_conv_param - proc_douintvec_minmax() range checking structure + * @min: pointer to minimum allowable value + * @max: pointer to maximum allowable value + * + * The do_proc_douintvec_minmax_conv_param structure provides the + * minimum and maximum values for doing range checking for those sysctl + * parameters that use the proc_douintvec_minmax() handler. + */ struct do_proc_douintvec_minmax_conv_param { unsigned int *min; unsigned int *max; @@ -2614,7 +2632,7 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, * check for UINT_MAX to avoid having to support wrap around uses from * userspace. * - * Returns 0 on success. + * Returns 0 on success or -ERANGE on write when the range check fails. */ int proc_douintvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 9b082ce86325..eda1210ce50f 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -480,6 +480,7 @@ __next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active) while ((base = __next_base((cpu_base), &(active)))) static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, + const struct hrtimer *exclude, unsigned int active, ktime_t expires_next) { @@ -492,9 +493,22 @@ static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, next = timerqueue_getnext(&base->active); timer = container_of(next, struct hrtimer, node); + if (timer == exclude) { + /* Get to the next timer in the queue. 
*/ + next = timerqueue_iterate_next(next); + if (!next) + continue; + + timer = container_of(next, struct hrtimer, node); + } expires = ktime_sub(hrtimer_get_expires(timer), base->offset); if (expires < expires_next) { expires_next = expires; + + /* Skip cpu_base update if a timer is being excluded. */ + if (exclude) + continue; + if (timer->is_soft) cpu_base->softirq_next_timer = timer; else @@ -538,7 +552,8 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_ if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) { active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; cpu_base->softirq_next_timer = NULL; - expires_next = __hrtimer_next_event_base(cpu_base, active, KTIME_MAX); + expires_next = __hrtimer_next_event_base(cpu_base, NULL, + active, KTIME_MAX); next_timer = cpu_base->softirq_next_timer; } @@ -546,7 +561,8 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_ if (active_mask & HRTIMER_ACTIVE_HARD) { active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; cpu_base->next_timer = next_timer; - expires_next = __hrtimer_next_event_base(cpu_base, active, expires_next); + expires_next = __hrtimer_next_event_base(cpu_base, NULL, active, + expires_next); } return expires_next; @@ -1190,6 +1206,39 @@ u64 hrtimer_get_next_event(void) return expires; } + +/** + * hrtimer_next_event_without - time until next expiry event w/o one timer + * @exclude: timer to exclude + * + * Returns the next expiry time over all timers except for the @exclude one or + * KTIME_MAX if none of them is pending. 
+ */ +u64 hrtimer_next_event_without(const struct hrtimer *exclude) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + u64 expires = KTIME_MAX; + unsigned long flags; + + raw_spin_lock_irqsave(&cpu_base->lock, flags); + + if (__hrtimer_hres_active(cpu_base)) { + unsigned int active; + + if (!cpu_base->softirq_activated) { + active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; + expires = __hrtimer_next_event_base(cpu_base, exclude, + active, KTIME_MAX); + } + active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; + expires = __hrtimer_next_event_base(cpu_base, exclude, active, + expires); + } + + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + + return expires; +} #endif static inline int hrtimer_clockid_to_base(clockid_t clock_id) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 8d70da1b9a0d..a09ded765f6c 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -31,7 +31,7 @@ /* USER_HZ period (usecs): */ -unsigned long tick_usec = TICK_USEC; +unsigned long tick_usec = USER_TICK_USEC; /* SHIFTED_HZ period (nsecs): */ unsigned long tick_nsec; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f3ab08caa2c3..646645e981f9 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -122,8 +122,7 @@ static ktime_t tick_init_jiffy_update(void) return period; } - -static void tick_sched_do_timer(ktime_t now) +static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) { int cpu = smp_processor_id(); @@ -143,6 +142,9 @@ static void tick_sched_do_timer(ktime_t now) /* Check, if the jiffies need an update */ if (tick_do_timer_cpu == cpu) tick_do_update_jiffies64(now); + + if (ts->inidle) + ts->got_idle_tick = 1; } static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) @@ -474,7 +476,9 @@ __setup("nohz=", setup_tick_nohz); bool tick_nohz_tick_stopped(void) { - return __this_cpu_read(tick_cpu_sched.tick_stopped); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + return 
ts->tick_stopped; } bool tick_nohz_tick_stopped_cpu(int cpu) @@ -537,14 +541,11 @@ static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) sched_clock_idle_wakeup_event(); } -static ktime_t tick_nohz_start_idle(struct tick_sched *ts) +static void tick_nohz_start_idle(struct tick_sched *ts) { - ktime_t now = ktime_get(); - - ts->idle_entrytime = now; + ts->idle_entrytime = ktime_get(); ts->idle_active = 1; sched_clock_idle_sleep_event(); - return now; } /** @@ -653,13 +654,10 @@ static inline bool local_timer_softirq_pending(void) return local_softirq_pending() & TIMER_SOFTIRQ; } -static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, - ktime_t now, int cpu) +static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) { - struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); u64 basemono, next_tick, next_tmr, next_rcu, delta, expires; unsigned long seq, basejiff; - ktime_t tick; /* Read jiffies and the time when jiffies were updated last */ do { @@ -668,6 +666,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, basejiff = jiffies; } while (read_seqretry(&jiffies_lock, seq)); ts->last_jiffies = basejiff; + ts->timer_expires_base = basemono; /* * Keep the periodic tick, when RCU, architecture or irq_work @@ -712,47 +711,63 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, * next period, so no point in stopping it either, bail. */ if (!ts->tick_stopped) { - tick = 0; + ts->timer_expires = 0; goto out; } } /* + * If this CPU is the one which had the do_timer() duty last, we limit + * the sleep time to the timekeeping max_deferment value. + * Otherwise we can sleep as long as we want. 
+ */ + delta = timekeeping_max_deferment(); + if (cpu != tick_do_timer_cpu && + (tick_do_timer_cpu != TICK_DO_TIMER_NONE || !ts->do_timer_last)) + delta = KTIME_MAX; + + /* Calculate the next expiry time */ + if (delta < (KTIME_MAX - basemono)) + expires = basemono + delta; + else + expires = KTIME_MAX; + + ts->timer_expires = min_t(u64, expires, next_tick); + +out: + return ts->timer_expires; +} + +static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) +{ + struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); + u64 basemono = ts->timer_expires_base; + u64 expires = ts->timer_expires; + ktime_t tick = expires; + + /* Make sure we won't be trying to stop it twice in a row. */ + ts->timer_expires_base = 0; + + /* * If this CPU is the one which updates jiffies, then give up * the assignment and let it be taken by the CPU which runs * the tick timer next, which might be this CPU as well. If we * don't drop this here the jiffies might be stale and * do_timer() never invoked. Keep track of the fact that it - * was the one which had the do_timer() duty last. If this CPU - * is the one which had the do_timer() duty last, we limit the - * sleep time to the timekeeping max_deferment value. - * Otherwise we can sleep as long as we want. + * was the one which had the do_timer() duty last. 
*/ - delta = timekeeping_max_deferment(); if (cpu == tick_do_timer_cpu) { tick_do_timer_cpu = TICK_DO_TIMER_NONE; ts->do_timer_last = 1; } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { - delta = KTIME_MAX; ts->do_timer_last = 0; - } else if (!ts->do_timer_last) { - delta = KTIME_MAX; } - /* Calculate the next expiry time */ - if (delta < (KTIME_MAX - basemono)) - expires = basemono + delta; - else - expires = KTIME_MAX; - - expires = min_t(u64, expires, next_tick); - tick = expires; - /* Skip reprogram of event if its not changed */ if (ts->tick_stopped && (expires == ts->next_tick)) { /* Sanity check: make sure clockevent is actually programmed */ if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) - goto out; + return; WARN_ON_ONCE(1); printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n", @@ -786,7 +801,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, if (unlikely(expires == KTIME_MAX)) { if (ts->nohz_mode == NOHZ_MODE_HIGHRES) hrtimer_cancel(&ts->sched_timer); - goto out; + return; } hrtimer_set_expires(&ts->sched_timer, tick); @@ -795,15 +810,23 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); else tick_program_event(tick, 1); -out: - /* - * Update the estimated sleep length until the next timer - * (not only the tick). 
- */ - ts->sleep_length = ktime_sub(dev->next_event, now); - return tick; } +static void tick_nohz_retain_tick(struct tick_sched *ts) +{ + ts->timer_expires_base = 0; +} + +#ifdef CONFIG_NO_HZ_FULL +static void tick_nohz_stop_sched_tick(struct tick_sched *ts, int cpu) +{ + if (tick_nohz_next_event(ts, cpu)) + tick_nohz_stop_tick(ts, cpu); + else + tick_nohz_retain_tick(ts); +} +#endif /* CONFIG_NO_HZ_FULL */ + static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { /* Update jiffies first */ @@ -839,7 +862,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts) return; if (can_stop_full_tick(cpu, ts)) - tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); + tick_nohz_stop_sched_tick(ts, cpu); else if (ts->tick_stopped) tick_nohz_restart_sched_tick(ts, ktime_get()); #endif @@ -865,10 +888,8 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) return false; } - if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) { - ts->sleep_length = NSEC_PER_SEC / HZ; + if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) return false; - } if (need_resched()) return false; @@ -903,42 +924,65 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) return true; } -static void __tick_nohz_idle_enter(struct tick_sched *ts) +static void __tick_nohz_idle_stop_tick(struct tick_sched *ts) { - ktime_t now, expires; + ktime_t expires; int cpu = smp_processor_id(); - now = tick_nohz_start_idle(ts); + /* + * If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the + * tick timer expiration time is known already. 
+ */ + if (ts->timer_expires_base) + expires = ts->timer_expires; + else if (can_stop_idle_tick(cpu, ts)) + expires = tick_nohz_next_event(ts, cpu); + else + return; + + ts->idle_calls++; - if (can_stop_idle_tick(cpu, ts)) { + if (expires > 0LL) { int was_stopped = ts->tick_stopped; - ts->idle_calls++; + tick_nohz_stop_tick(ts, cpu); - expires = tick_nohz_stop_sched_tick(ts, now, cpu); - if (expires > 0LL) { - ts->idle_sleeps++; - ts->idle_expires = expires; - } + ts->idle_sleeps++; + ts->idle_expires = expires; if (!was_stopped && ts->tick_stopped) { ts->idle_jiffies = ts->last_jiffies; nohz_balance_enter_idle(cpu); } + } else { + tick_nohz_retain_tick(ts); } } /** - * tick_nohz_idle_enter - stop the idle tick from the idle task + * tick_nohz_idle_stop_tick - stop the idle tick from the idle task * * When the next event is more than a tick into the future, stop the idle tick - * Called when we start the idle loop. - * - * The arch is responsible of calling: + */ +void tick_nohz_idle_stop_tick(void) +{ + __tick_nohz_idle_stop_tick(this_cpu_ptr(&tick_cpu_sched)); +} + +void tick_nohz_idle_retain_tick(void) +{ + tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched)); + /* + * Undo the effect of get_next_timer_interrupt() called from + * tick_nohz_next_event(). + */ + timer_clear_idle(); +} + +/** + * tick_nohz_idle_enter - prepare for entering idle on the current CPU * - * - rcu_idle_enter() after its last use of RCU before the CPU is put - * to sleep. - * - rcu_idle_exit() before the first use of RCU after the CPU is woken up. + * Called when we start the idle loop. 
*/ void tick_nohz_idle_enter(void) { @@ -949,8 +993,11 @@ void tick_nohz_idle_enter(void) local_irq_disable(); ts = this_cpu_ptr(&tick_cpu_sched); + + WARN_ON_ONCE(ts->timer_expires_base); + ts->inidle = 1; - __tick_nohz_idle_enter(ts); + tick_nohz_start_idle(ts); local_irq_enable(); } @@ -968,21 +1015,62 @@ void tick_nohz_irq_exit(void) struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (ts->inidle) - __tick_nohz_idle_enter(ts); + tick_nohz_start_idle(ts); else tick_nohz_full_update_tick(ts); } /** - * tick_nohz_get_sleep_length - return the length of the current sleep + * tick_nohz_idle_got_tick - Check whether or not the tick handler has run + */ +bool tick_nohz_idle_got_tick(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + if (ts->got_idle_tick) { + ts->got_idle_tick = 0; + return true; + } + return false; +} + +/** + * tick_nohz_get_sleep_length - return the expected length of the current sleep + * @delta_next: duration until the next event if the tick cannot be stopped * * Called from power state control code with interrupts disabled */ -ktime_t tick_nohz_get_sleep_length(void) +ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) { + struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + int cpu = smp_processor_id(); + /* + * The idle entry time is expected to be a sufficient approximation of + * the current time at this point. + */ + ktime_t now = ts->idle_entrytime; + ktime_t next_event; + + WARN_ON_ONCE(!ts->inidle); + + *delta_next = ktime_sub(dev->next_event, now); + + if (!can_stop_idle_tick(cpu, ts)) + return *delta_next; + + next_event = tick_nohz_next_event(ts, cpu); + if (!next_event) + return *delta_next; + + /* + * If the next highres timer to expire is earlier than next_event, the + * idle governor needs to know that. 
+ */ + next_event = min_t(u64, next_event, + hrtimer_next_event_without(&ts->sched_timer)); - return ts->sleep_length; + return ktime_sub(next_event, now); } /** @@ -1031,6 +1119,20 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts) #endif } +static void __tick_nohz_idle_restart_tick(struct tick_sched *ts, ktime_t now) +{ + tick_nohz_restart_sched_tick(ts, now); + tick_nohz_account_idle_ticks(ts); +} + +void tick_nohz_idle_restart_tick(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + if (ts->tick_stopped) + __tick_nohz_idle_restart_tick(ts, ktime_get()); +} + /** * tick_nohz_idle_exit - restart the idle tick from the idle task * @@ -1041,24 +1143,26 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts) void tick_nohz_idle_exit(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + bool idle_active, tick_stopped; ktime_t now; local_irq_disable(); WARN_ON_ONCE(!ts->inidle); + WARN_ON_ONCE(ts->timer_expires_base); ts->inidle = 0; + idle_active = ts->idle_active; + tick_stopped = ts->tick_stopped; - if (ts->idle_active || ts->tick_stopped) + if (idle_active || tick_stopped) now = ktime_get(); - if (ts->idle_active) + if (idle_active) tick_nohz_stop_idle(ts, now); - if (ts->tick_stopped) { - tick_nohz_restart_sched_tick(ts, now); - tick_nohz_account_idle_ticks(ts); - } + if (tick_stopped) + __tick_nohz_idle_restart_tick(ts, now); local_irq_enable(); } @@ -1074,7 +1178,7 @@ static void tick_nohz_handler(struct clock_event_device *dev) dev->next_event = KTIME_MAX; - tick_sched_do_timer(now); + tick_sched_do_timer(ts, now); tick_sched_handle(ts, regs); /* No need to reprogram if we are running tickless */ @@ -1169,7 +1273,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); - tick_sched_do_timer(now); + tick_sched_do_timer(ts, now); /* * Do not call, when we are not in irq context and have diff --git a/kernel/time/tick-sched.h 
b/kernel/time/tick-sched.h index 954b43dbf21c..6de959a854b2 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -38,31 +38,37 @@ enum tick_nohz_mode { * @idle_exittime: Time when the idle state was left * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding - * @sleep_length: Duration of the current idle sleep + * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped) + * @timer_expires_base: Base time clock monotonic for @timer_expires * @do_timer_lst: CPU was the last one doing do_timer before going idle + * @got_idle_tick: Tick timer function has run with @inidle set */ struct tick_sched { struct hrtimer sched_timer; unsigned long check_clocks; enum tick_nohz_mode nohz_mode; + + unsigned int inidle : 1; + unsigned int tick_stopped : 1; + unsigned int idle_active : 1; + unsigned int do_timer_last : 1; + unsigned int got_idle_tick : 1; + ktime_t last_tick; ktime_t next_tick; - int inidle; - int tick_stopped; unsigned long idle_jiffies; unsigned long idle_calls; unsigned long idle_sleeps; - int idle_active; ktime_t idle_entrytime; ktime_t idle_waketime; ktime_t idle_exittime; ktime_t idle_sleeptime; ktime_t iowait_sleeptime; - ktime_t sleep_length; unsigned long last_jiffies; + u64 timer_expires; + u64 timer_expires_base; u64 next_timer; ktime_t idle_expires; - int do_timer_last; atomic_t tick_dep_mask; }; diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index fdbeeb02dde9..cf5c0828ee31 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -31,6 +31,4 @@ static inline u64 clocksource_delta(u64 now, u64 last, u64 mask) } #endif -extern time64_t __ktime_get_real_seconds(void); - #endif /* _TIMEKEEPING_INTERNAL_H */ diff --git a/kernel/utsname.c b/kernel/utsname.c index 913fe4336d2b..dcd6be1996fe 100644 --- a/kernel/utsname.c +++ 
b/kernel/utsname.c @@ -19,6 +19,8 @@ #include <linux/proc_ns.h> #include <linux/sched/task.h> +static struct kmem_cache *uts_ns_cache __ro_after_init; + static struct ucounts *inc_uts_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_UTS_NAMESPACES); @@ -33,7 +35,7 @@ static struct uts_namespace *create_uts_ns(void) { struct uts_namespace *uts_ns; - uts_ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL); + uts_ns = kmem_cache_alloc(uts_ns_cache, GFP_KERNEL); if (uts_ns) kref_init(&uts_ns->kref); return uts_ns; @@ -42,7 +44,7 @@ static struct uts_namespace *create_uts_ns(void) /* * Clone a new ns copying an original utsname, setting refcount to 1 * @old_ns: namespace to clone - * Return ERR_PTR(-ENOMEM) on error (failure to kmalloc), new ns otherwise + * Return ERR_PTR(-ENOMEM) on error (failure to allocate), new ns otherwise */ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, struct uts_namespace *old_ns) @@ -75,7 +77,7 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, return ns; fail_free: - kfree(ns); + kmem_cache_free(uts_ns_cache, ns); fail_dec: dec_uts_namespaces(ucounts); fail: @@ -113,7 +115,7 @@ void free_uts_ns(struct kref *kref) dec_uts_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); - kfree(ns); + kmem_cache_free(uts_ns_cache, ns); } static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) @@ -169,3 +171,13 @@ const struct proc_ns_operations utsns_operations = { .install = utsns_install, .owner = utsns_owner, }; + +void __init uts_ns_init(void) +{ + uts_ns_cache = kmem_cache_create_usercopy( + "uts_namespace", sizeof(struct uts_namespace), 0, + SLAB_PANIC|SLAB_ACCOUNT, + offsetof(struct uts_namespace, name), + sizeof_field(struct uts_namespace, name), + NULL); +} diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 51c6bf0d93c6..c40c7b734cd1 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -800,6 +800,30 @@ config 
SOFTLOCKUP_DETECTOR chance to run. The current stack trace is displayed upon detection and the system will stay locked up. +config BOOTPARAM_SOFTLOCKUP_PANIC + bool "Panic (Reboot) On Soft Lockups" + depends on SOFTLOCKUP_DETECTOR + help + Say Y here to enable the kernel to panic on "soft lockups", + which are bugs that cause the kernel to loop in kernel + mode for more than 20 seconds (configurable using the watchdog_thresh + sysctl), without giving other tasks a chance to run. + + The panic can be used in combination with panic_timeout, + to cause the system to reboot automatically after a + lockup has been detected. This feature is useful for + high-availability systems that have uptime guarantees and + where a lockup must be resolved ASAP. + + Say N if unsure. + +config BOOTPARAM_SOFTLOCKUP_PANIC_VALUE + int + depends on SOFTLOCKUP_DETECTOR + range 0 1 + default 0 if !BOOTPARAM_SOFTLOCKUP_PANIC + default 1 if BOOTPARAM_SOFTLOCKUP_PANIC + config HARDLOCKUP_DETECTOR_PERF bool select SOFTLOCKUP_DETECTOR @@ -849,30 +873,6 @@ config BOOTPARAM_HARDLOCKUP_PANIC_VALUE default 0 if !BOOTPARAM_HARDLOCKUP_PANIC default 1 if BOOTPARAM_HARDLOCKUP_PANIC -config BOOTPARAM_SOFTLOCKUP_PANIC - bool "Panic (Reboot) On Soft Lockups" - depends on SOFTLOCKUP_DETECTOR - help - Say Y here to enable the kernel to panic on "soft lockups", - which are bugs that cause the kernel to loop in kernel - mode for more than 20 seconds (configurable using the watchdog_thresh - sysctl), without giving other tasks a chance to run. - - The panic can be used in combination with panic_timeout, - to cause the system to reboot automatically after a - lockup has been detected. This feature is useful for - high-availability systems that have uptime guarantees and - where a lockup must be resolved ASAP. - - Say N if unsure. 
- -config BOOTPARAM_SOFTLOCKUP_PANIC_VALUE - int - depends on SOFTLOCKUP_DETECTOR - range 0 1 - default 0 if !BOOTPARAM_SOFTLOCKUP_PANIC - default 1 if BOOTPARAM_SOFTLOCKUP_PANIC - config DETECT_HUNG_TASK bool "Detect Hung Tasks" depends on DEBUG_KERNEL diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan index a669c193b878..19d42ea75ec2 100644 --- a/lib/Kconfig.ubsan +++ b/lib/Kconfig.ubsan @@ -46,3 +46,10 @@ config UBSAN_NULL help This option enables detection of memory accesses via a null pointer. + +config TEST_UBSAN + tristate "Module for testing for undefined behavior detection" + depends on m && UBSAN + help + This is a test module for UBSAN. + It triggers various undefined behavior, and detect it. diff --git a/lib/Makefile b/lib/Makefile index 8fc0d3a9b34f..ce20696d5a92 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -53,6 +53,9 @@ obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o obj-$(CONFIG_TEST_SYSCTL) += test_sysctl.o obj-$(CONFIG_TEST_HASH) += test_hash.o test_siphash.o obj-$(CONFIG_TEST_KASAN) += test_kasan.o +CFLAGS_test_kasan.o += -fno-builtin +obj-$(CONFIG_TEST_UBSAN) += test_ubsan.o +UBSAN_SANITIZE_test_ubsan.o := y obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o obj-$(CONFIG_TEST_LIST_SORT) += test_list_sort.o obj-$(CONFIG_TEST_LKM) += test_module.o diff --git a/lib/list_debug.c b/lib/list_debug.c index a34db8d27667..5d5424b51b74 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -21,13 +21,13 @@ bool __list_add_valid(struct list_head *new, struct list_head *prev, struct list_head *next) { if (CHECK_DATA_CORRUPTION(next->prev != prev, - "list_add corruption. next->prev should be prev (%p), but was %p. (next=%p).\n", + "list_add corruption. next->prev should be prev (%px), but was %px. (next=%px).\n", prev, next->prev, next) || CHECK_DATA_CORRUPTION(prev->next != next, - "list_add corruption. prev->next should be next (%p), but was %p. (prev=%p).\n", + "list_add corruption. prev->next should be next (%px), but was %px. 
(prev=%px).\n", next, prev->next, prev) || CHECK_DATA_CORRUPTION(new == prev || new == next, - "list_add double add: new=%p, prev=%p, next=%p.\n", + "list_add double add: new=%px, prev=%px, next=%px.\n", new, prev, next)) return false; @@ -43,16 +43,16 @@ bool __list_del_entry_valid(struct list_head *entry) next = entry->next; if (CHECK_DATA_CORRUPTION(next == LIST_POISON1, - "list_del corruption, %p->next is LIST_POISON1 (%p)\n", + "list_del corruption, %px->next is LIST_POISON1 (%px)\n", entry, LIST_POISON1) || CHECK_DATA_CORRUPTION(prev == LIST_POISON2, - "list_del corruption, %p->prev is LIST_POISON2 (%p)\n", + "list_del corruption, %px->prev is LIST_POISON2 (%px)\n", entry, LIST_POISON2) || CHECK_DATA_CORRUPTION(prev->next != entry, - "list_del corruption. prev->next should be %p, but was %p\n", + "list_del corruption. prev->next should be %px, but was %px\n", entry, prev->next) || CHECK_DATA_CORRUPTION(next->prev != entry, - "list_del corruption. next->prev should be %p, but was %p\n", + "list_del corruption. 
next->prev should be %px, but was %px\n", entry, next->prev)) return false; diff --git a/lib/lockref.c b/lib/lockref.c index 47169ed7e964..3d468b53d4c9 100644 --- a/lib/lockref.c +++ b/lib/lockref.c @@ -81,6 +81,34 @@ int lockref_get_not_zero(struct lockref *lockref) EXPORT_SYMBOL(lockref_get_not_zero); /** + * lockref_put_not_zero - Decrements count unless count <= 1 before decrement + * @lockref: pointer to lockref structure + * Return: 1 if count updated successfully or 0 if count would become zero + */ +int lockref_put_not_zero(struct lockref *lockref) +{ + int retval; + + CMPXCHG_LOOP( + new.count--; + if (old.count <= 1) + return 0; + , + return 1; + ); + + spin_lock(&lockref->lock); + retval = 0; + if (lockref->count > 1) { + lockref->count--; + retval = 1; + } + spin_unlock(&lockref->lock); + return retval; +} +EXPORT_SYMBOL(lockref_put_not_zero); + +/** * lockref_get_or_lock - Increments count unless the count is 0 or dead * @lockref: pointer to lockref structure * Return: 1 if count updated successfully or 0 if count was zero diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 8e00138d593f..da9e10c827df 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -146,7 +146,7 @@ static unsigned int radix_tree_descend(const struct radix_tree_node *parent, static inline gfp_t root_gfp_mask(const struct radix_tree_root *root) { - return root->gfp_mask & __GFP_BITS_MASK; + return root->gfp_mask & (__GFP_BITS_MASK & ~GFP_ZONEMASK); } static inline void tag_set(struct radix_tree_node *node, unsigned int tag, @@ -2285,6 +2285,7 @@ void __init radix_tree_init(void) int ret; BUILD_BUG_ON(RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT > 32); + BUILD_BUG_ON(ROOT_IS_IDR & ~GFP_ZONEMASK); radix_tree_node_cachep = kmem_cache_create("radix_tree_node", sizeof(struct radix_tree_node), 0, SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 47aeb04c1997..de7cc540450f 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -719,7 +719,7 @@ 
swiotlb_alloc_buffer(struct device *dev, size_t size, dma_addr_t *dma_handle, goto out_warn; *dma_handle = __phys_to_dma(dev, phys_addr); - if (dma_coherent_ok(dev, *dma_handle, size)) + if (!dma_coherent_ok(dev, *dma_handle, size)) goto out_unmap; memset(phys_to_virt(phys_addr), 0, size); diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c index 413367cf569e..de16f7869fb1 100644 --- a/lib/test_bitmap.c +++ b/lib/test_bitmap.c @@ -296,15 +296,17 @@ static void __init test_bitmap_parselist(void) } } +#define EXP_BYTES (sizeof(exp) * 8) + static void __init test_bitmap_arr32(void) { - unsigned int nbits, next_bit, len = sizeof(exp) * 8; + unsigned int nbits, next_bit; u32 arr[sizeof(exp) / 4]; - DECLARE_BITMAP(bmap2, len); + DECLARE_BITMAP(bmap2, EXP_BYTES); memset(arr, 0xa5, sizeof(arr)); - for (nbits = 0; nbits < len; ++nbits) { + for (nbits = 0; nbits < EXP_BYTES; ++nbits) { bitmap_to_arr32(arr, exp, nbits); bitmap_from_arr32(bmap2, arr, nbits); expect_eq_bitmap(bmap2, exp, nbits); @@ -316,7 +318,7 @@ static void __init test_bitmap_arr32(void) " tail is not safely cleared: %d\n", nbits, next_bit); - if (nbits < len - 32) + if (nbits < EXP_BYTES - 32) expect_eq_uint(arr[DIV_ROUND_UP(nbits, 32)], 0xa5a5a5a5); } diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 98854a64b014..ec657105edbf 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -567,7 +567,15 @@ static noinline void __init kmem_cache_invalid_free(void) return; } + /* Trigger invalid free, the object doesn't get freed */ kmem_cache_free(cache, p + 1); + + /* + * Properly free the object to prevent the "Objects remaining in + * test_cache on __kmem_cache_shutdown" BUG failure. 
+ */ + kmem_cache_free(cache, p); + kmem_cache_destroy(cache); } diff --git a/lib/test_ubsan.c b/lib/test_ubsan.c new file mode 100644 index 000000000000..280f4979d00e --- /dev/null +++ b/lib/test_ubsan.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> + +typedef void(*test_ubsan_fp)(void); + +static void test_ubsan_add_overflow(void) +{ + volatile int val = INT_MAX; + + val += 2; +} + +static void test_ubsan_sub_overflow(void) +{ + volatile int val = INT_MIN; + volatile int val2 = 2; + + val -= val2; +} + +static void test_ubsan_mul_overflow(void) +{ + volatile int val = INT_MAX / 2; + + val *= 3; +} + +static void test_ubsan_negate_overflow(void) +{ + volatile int val = INT_MIN; + + val = -val; +} + +static void test_ubsan_divrem_overflow(void) +{ + volatile int val = 16; + volatile int val2 = 0; + + val /= val2; +} + +static void test_ubsan_vla_bound_not_positive(void) +{ + volatile int size = -1; + char buf[size]; + + (void)buf; +} + +static void test_ubsan_shift_out_of_bounds(void) +{ + volatile int val = -1; + int val2 = 10; + + val2 <<= val; +} + +static void test_ubsan_out_of_bounds(void) +{ + volatile int i = 4, j = 5; + volatile int arr[i]; + + arr[j] = i; +} + +static void test_ubsan_load_invalid_value(void) +{ + volatile char *dst, *src; + bool val, val2, *ptr; + char c = 4; + + dst = (char *)&val; + src = &c; + *dst = *src; + + ptr = &val2; + val2 = val; +} + +static void test_ubsan_null_ptr_deref(void) +{ + volatile int *ptr = NULL; + int val; + + val = *ptr; +} + +static void test_ubsan_misaligned_access(void) +{ + volatile char arr[5] __aligned(4) = {1, 2, 3, 4, 5}; + volatile int *ptr, val = 6; + + ptr = (int *)(arr + 1); + *ptr = val; +} + +static void test_ubsan_object_size_mismatch(void) +{ + /* "((aligned(8)))" helps this not into be misaligned for ptr-access. 
*/ + volatile int val __aligned(8) = 4; + volatile long long *ptr, val2; + + ptr = (long long *)&val; + val2 = *ptr; +} + +static const test_ubsan_fp test_ubsan_array[] = { + test_ubsan_add_overflow, + test_ubsan_sub_overflow, + test_ubsan_mul_overflow, + test_ubsan_negate_overflow, + test_ubsan_divrem_overflow, + test_ubsan_vla_bound_not_positive, + test_ubsan_shift_out_of_bounds, + test_ubsan_out_of_bounds, + test_ubsan_load_invalid_value, + //test_ubsan_null_ptr_deref, /* exclude it because there is a crash */ + test_ubsan_misaligned_access, + test_ubsan_object_size_mismatch, +}; + +static int __init test_ubsan_init(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(test_ubsan_array); i++) + test_ubsan_array[i](); + + (void)test_ubsan_null_ptr_deref; /* to avoid unsed-function warning */ + return 0; +} +module_init(test_ubsan_init); + +static void __exit test_ubsan_exit(void) +{ + /* do nothing */ +} +module_exit(test_ubsan_exit); + +MODULE_AUTHOR("Jinbum Park <[email protected]>"); +MODULE_LICENSE("GPL v2"); diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 89f8a4a4b770..30c0cb8cc9bc 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -336,7 +336,7 @@ char *put_dec(char *buf, unsigned long long n) * * If speed is not important, use snprintf(). It's easy to read the code. */ -int num_to_str(char *buf, int size, unsigned long long num) +int num_to_str(char *buf, int size, unsigned long long num, unsigned int width) { /* put_dec requires 2-byte alignment of the buffer. 
*/ char tmp[sizeof(num) * 3] __aligned(2); @@ -350,11 +350,21 @@ int num_to_str(char *buf, int size, unsigned long long num) len = put_dec(tmp, num) - tmp; } - if (len > size) + if (len > size || width > size) return 0; + + if (width > len) { + width = width - len; + for (idx = 0; idx < width; idx++) + buf[idx] = ' '; + } else { + width = 0; + } + for (idx = 0; idx < len; ++idx) - buf[idx] = tmp[len - idx - 1]; - return len; + buf[idx + width] = tmp[len - idx - 1]; + + return len + width; } #define SIGN 1 /* unsigned/signed, must be 1 */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 08b9aab631ab..023190c69dce 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -1020,23 +1020,18 @@ EXPORT_SYMBOL(congestion_wait); /** * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a pgdat to complete writes - * @pgdat: A pgdat to check if it is heavily congested * @sync: SYNC or ASYNC IO * @timeout: timeout in jiffies * - * In the event of a congested backing_dev (any backing_dev) and the given - * @pgdat has experienced recent congestion, this waits for up to @timeout - * jiffies for either a BDI to exit congestion of the given @sync queue - * or a write to complete. - * - * In the absence of pgdat congestion, cond_resched() is called to yield - * the processor if necessary but otherwise does not sleep. + * In the event of a congested backing_dev (any backing_dev) this waits + * for up to @timeout jiffies for either a BDI to exit congestion of the + * given @sync queue or a write to complete. * * The return value is 0 if the sleep is for the full timeout. Otherwise, * it is the number of jiffies that were still remaining when the function * returned. return_value == timeout implies the function did not sleep. 
*/ -long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout) +long wait_iff_congested(int sync, long timeout) { long ret; unsigned long start = jiffies; @@ -1044,12 +1039,10 @@ long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout) wait_queue_head_t *wqh = &congestion_wqh[sync]; /* - * If there is no congestion, or heavy congestion is not being - * encountered in the current pgdat, yield if necessary instead + * If there is no congestion, yield if necessary instead * of sleeping on the congestion queue */ - if (atomic_read(&nr_wb_congested[sync]) == 0 || - !test_bit(PGDAT_CONGESTED, &pgdat->flags)) { + if (atomic_read(&nr_wb_congested[sync]) == 0) { cond_resched(); /* In case we scheduled, work out time remaining */ @@ -39,6 +39,7 @@ #include <trace/events/cma.h> #include "cma.h" +#include "internal.h" struct cma cma_areas[MAX_CMA_AREAS]; unsigned cma_area_count; @@ -109,23 +110,25 @@ static int __init cma_activate_area(struct cma *cma) if (!cma->bitmap) return -ENOMEM; - WARN_ON_ONCE(!pfn_valid(pfn)); - zone = page_zone(pfn_to_page(pfn)); - do { unsigned j; base_pfn = pfn; + if (!pfn_valid(base_pfn)) + goto err; + + zone = page_zone(pfn_to_page(base_pfn)); for (j = pageblock_nr_pages; j; --j, pfn++) { - WARN_ON_ONCE(!pfn_valid(pfn)); + if (!pfn_valid(pfn)) + goto err; + /* - * alloc_contig_range requires the pfn range - * specified to be in the same zone. Make this - * simple by forcing the entire CMA resv range - * to be in the same zone. + * In init_cma_reserved_pageblock(), present_pages + * is adjusted with assumption that all pages in + * the pageblock come from a single zone. 
*/ if (page_zone(pfn_to_page(pfn)) != zone) - goto not_in_zone; + goto err; } init_cma_reserved_pageblock(pfn_to_page(base_pfn)); } while (--i); @@ -139,7 +142,7 @@ static int __init cma_activate_area(struct cma *cma) return 0; -not_in_zone: +err: pr_err("CMA area %s could not be activated\n", cma->name); kfree(cma->bitmap); cma->count = 0; @@ -149,6 +152,41 @@ not_in_zone: static int __init cma_init_reserved_areas(void) { int i; + struct zone *zone; + pg_data_t *pgdat; + + if (!cma_area_count) + return 0; + + for_each_online_pgdat(pgdat) { + unsigned long start_pfn = UINT_MAX, end_pfn = 0; + + zone = &pgdat->node_zones[ZONE_MOVABLE]; + + /* + * In this case, we cannot adjust the zone range + * since it is now maximum node span and we don't + * know original zone range. + */ + if (populated_zone(zone)) + continue; + + for (i = 0; i < cma_area_count; i++) { + if (pfn_to_nid(cma_areas[i].base_pfn) != + pgdat->node_id) + continue; + + start_pfn = min(start_pfn, cma_areas[i].base_pfn); + end_pfn = max(end_pfn, cma_areas[i].base_pfn + + cma_areas[i].count); + } + + if (!end_pfn) + continue; + + zone->zone_start_pfn = start_pfn; + zone->spanned_pages = end_pfn - start_pfn; + } for (i = 0; i < cma_area_count; i++) { int ret = cma_activate_area(&cma_areas[i]); @@ -157,9 +195,32 @@ static int __init cma_init_reserved_areas(void) return ret; } + /* + * Reserved pages for ZONE_MOVABLE are now activated and + * this would change ZONE_MOVABLE's managed page counter and + * the other zones' present counter. We need to re-calculate + * various zone information that depends on this initialization. 
+ */ + build_all_zonelists(NULL); + for_each_populated_zone(zone) { + if (zone_idx(zone) == ZONE_MOVABLE) { + zone_pcp_reset(zone); + setup_zone_pageset(zone); + } else + zone_pcp_update(zone); + + set_zone_contiguous(zone); + } + + /* + * We need to re-init per zone wmark by calling + * init_per_zone_wmark_min() but doesn't call here because it is + * registered on core_initcall and it will be called later than us. + */ + return 0; } -core_initcall(cma_init_reserved_areas); +pure_initcall(cma_init_reserved_areas); /** * cma_init_reserved_mem() - create custom contiguous area from reserved memory diff --git a/mm/compaction.c b/mm/compaction.c index 88d01a50a015..028b7210a669 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1166,8 +1166,7 @@ static void isolate_freepages(struct compact_control *cc) * from the isolated freelists in the block we are migrating to. */ static struct page *compaction_alloc(struct page *migratepage, - unsigned long data, - int **result) + unsigned long data) { struct compact_control *cc = (struct compact_control *)data; struct page *freepage; @@ -1451,14 +1450,12 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, * if compaction succeeds. * For costly orders, we require low watermark instead of min for * compaction to proceed to increase its chances. - * ALLOC_CMA is used, as pages in CMA pageblocks are considered - * suitable migration targets */ watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ? 
low_wmark_pages(zone) : min_wmark_pages(zone); watermark += compact_gap(order); if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx, - ALLOC_CMA, wmark_target)) + 0, wmark_target)) return COMPACT_SKIPPED; return COMPACT_CONTINUE; diff --git a/mm/filemap.c b/mm/filemap.c index 693f62212a59..ab77e19ab09c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -66,7 +66,7 @@ * ->i_mmap_rwsem (truncate_pagecache) * ->private_lock (__free_pte->__set_page_dirty_buffers) * ->swap_lock (exclusive_swap_page, others) - * ->mapping->tree_lock + * ->i_pages lock * * ->i_mutex * ->i_mmap_rwsem (truncate->unmap_mapping_range) @@ -74,7 +74,7 @@ * ->mmap_sem * ->i_mmap_rwsem * ->page_table_lock or pte_lock (various, mainly in memory.c) - * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) + * ->i_pages lock (arch-dependent flush_dcache_mmap_lock) * * ->mmap_sem * ->lock_page (access_process_vm) @@ -84,7 +84,7 @@ * * bdi->wb.list_lock * sb_lock (fs/fs-writeback.c) - * ->mapping->tree_lock (__sync_single_inode) + * ->i_pages lock (__sync_single_inode) * * ->i_mmap_rwsem * ->anon_vma.lock (vma_adjust) @@ -95,11 +95,11 @@ * ->page_table_lock or pte_lock * ->swap_lock (try_to_unmap_one) * ->private_lock (try_to_unmap_one) - * ->tree_lock (try_to_unmap_one) + * ->i_pages lock (try_to_unmap_one) * ->zone_lru_lock(zone) (follow_page->mark_page_accessed) * ->zone_lru_lock(zone) (check_pte_range->isolate_lru_page) * ->private_lock (page_remove_rmap->set_page_dirty) - * ->tree_lock (page_remove_rmap->set_page_dirty) + * ->i_pages lock (page_remove_rmap->set_page_dirty) * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) * ->inode->i_lock (page_remove_rmap->set_page_dirty) * ->memcg->move_lock (page_remove_rmap->lock_page_memcg) @@ -118,14 +118,15 @@ static int page_cache_tree_insert(struct address_space *mapping, void **slot; int error; - error = __radix_tree_create(&mapping->page_tree, page->index, 0, + error = __radix_tree_create(&mapping->i_pages, page->index, 0, &node, 
&slot); if (error) return error; if (*slot) { void *p; - p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + p = radix_tree_deref_slot_protected(slot, + &mapping->i_pages.xa_lock); if (!radix_tree_exceptional_entry(p)) return -EEXIST; @@ -133,7 +134,7 @@ static int page_cache_tree_insert(struct address_space *mapping, if (shadowp) *shadowp = p; } - __radix_tree_replace(&mapping->page_tree, node, slot, page, + __radix_tree_replace(&mapping->i_pages, node, slot, page, workingset_lookup_update(mapping)); mapping->nrpages++; return 0; @@ -155,13 +156,13 @@ static void page_cache_tree_delete(struct address_space *mapping, struct radix_tree_node *node; void **slot; - __radix_tree_lookup(&mapping->page_tree, page->index + i, + __radix_tree_lookup(&mapping->i_pages, page->index + i, &node, &slot); VM_BUG_ON_PAGE(!node && nr != 1, page); - radix_tree_clear_tags(&mapping->page_tree, node, slot); - __radix_tree_replace(&mapping->page_tree, node, slot, shadow, + radix_tree_clear_tags(&mapping->i_pages, node, slot); + __radix_tree_replace(&mapping->i_pages, node, slot, shadow, workingset_lookup_update(mapping)); } @@ -253,7 +254,7 @@ static void unaccount_page_cache_page(struct address_space *mapping, /* * Delete a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage - * is safe. The caller must hold the mapping's tree_lock. + * is safe. The caller must hold the i_pages lock. 
*/ void __delete_from_page_cache(struct page *page, void *shadow) { @@ -296,9 +297,9 @@ void delete_from_page_cache(struct page *page) unsigned long flags; BUG_ON(!PageLocked(page)); - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); __delete_from_page_cache(page, NULL); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); page_cache_free_page(mapping, page); } @@ -309,14 +310,14 @@ EXPORT_SYMBOL(delete_from_page_cache); * @mapping: the mapping to which pages belong * @pvec: pagevec with pages to delete * - * The function walks over mapping->page_tree and removes pages passed in @pvec - * from the radix tree. The function expects @pvec to be sorted by page index. - * It tolerates holes in @pvec (radix tree entries at those indices are not + * The function walks over mapping->i_pages and removes pages passed in @pvec + * from the mapping. The function expects @pvec to be sorted by page index. + * It tolerates holes in @pvec (mapping entries at those indices are not * modified). The function expects only THP head pages to be present in the - * @pvec and takes care to delete all corresponding tail pages from the radix - * tree as well. + * @pvec and takes care to delete all corresponding tail pages from the + * mapping as well. * - * The function expects mapping->tree_lock to be held. + * The function expects the i_pages lock to be held. 
*/ static void page_cache_tree_delete_batch(struct address_space *mapping, @@ -330,11 +331,11 @@ page_cache_tree_delete_batch(struct address_space *mapping, pgoff_t start; start = pvec->pages[0]->index; - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { if (i >= pagevec_count(pvec) && !tail_pages) break; page = radix_tree_deref_slot_protected(slot, - &mapping->tree_lock); + &mapping->i_pages.xa_lock); if (radix_tree_exceptional_entry(page)) continue; if (!tail_pages) { @@ -357,8 +358,8 @@ page_cache_tree_delete_batch(struct address_space *mapping, } else { tail_pages--; } - radix_tree_clear_tags(&mapping->page_tree, iter.node, slot); - __radix_tree_replace(&mapping->page_tree, iter.node, slot, NULL, + radix_tree_clear_tags(&mapping->i_pages, iter.node, slot); + __radix_tree_replace(&mapping->i_pages, iter.node, slot, NULL, workingset_lookup_update(mapping)); total_pages++; } @@ -374,14 +375,14 @@ void delete_from_page_cache_batch(struct address_space *mapping, if (!pagevec_count(pvec)) return; - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); for (i = 0; i < pagevec_count(pvec); i++) { trace_mm_filemap_delete_from_page_cache(pvec->pages[i]); unaccount_page_cache_page(mapping, pvec->pages[i]); } page_cache_tree_delete_batch(mapping, pvec); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); for (i = 0; i < pagevec_count(pvec); i++) page_cache_free_page(mapping, pvec->pages[i]); @@ -798,7 +799,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) new->mapping = mapping; new->index = offset; - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); __delete_from_page_cache(old, NULL); error = page_cache_tree_insert(mapping, new, NULL); BUG_ON(error); @@ -810,7 +811,7 @@ int replace_page_cache_page(struct page *old, struct 
page *new, gfp_t gfp_mask) __inc_node_page_state(new, NR_FILE_PAGES); if (PageSwapBacked(new)) __inc_node_page_state(new, NR_SHMEM); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); mem_cgroup_migrate(old, new); radix_tree_preload_end(); if (freepage) @@ -852,7 +853,7 @@ static int __add_to_page_cache_locked(struct page *page, page->mapping = mapping; page->index = offset; - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); error = page_cache_tree_insert(mapping, page, shadowp); radix_tree_preload_end(); if (unlikely(error)) @@ -861,7 +862,7 @@ static int __add_to_page_cache_locked(struct page *page, /* hugetlb pages do not participate in page cache accounting. */ if (!huge) __inc_node_page_state(page, NR_FILE_PAGES); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); if (!huge) mem_cgroup_commit_charge(page, memcg, false, false); trace_mm_filemap_add_to_page_cache(page); @@ -869,7 +870,7 @@ static int __add_to_page_cache_locked(struct page *page, err_insert: page->mapping = NULL; /* Leave page->index set: truncation relies upon it */ - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); if (!huge) mem_cgroup_cancel_charge(page, memcg, false); put_page(page); @@ -1353,7 +1354,7 @@ pgoff_t page_cache_next_hole(struct address_space *mapping, for (i = 0; i < max_scan; i++) { struct page *page; - page = radix_tree_lookup(&mapping->page_tree, index); + page = radix_tree_lookup(&mapping->i_pages, index); if (!page || radix_tree_exceptional_entry(page)) break; index++; @@ -1394,7 +1395,7 @@ pgoff_t page_cache_prev_hole(struct address_space *mapping, for (i = 0; i < max_scan; i++) { struct page *page; - page = radix_tree_lookup(&mapping->page_tree, index); + page = radix_tree_lookup(&mapping->i_pages, index); if (!page || radix_tree_exceptional_entry(page)) break; index--; @@ -1427,7 +1428,7 @@ struct page *find_get_entry(struct address_space *mapping, 
pgoff_t offset) rcu_read_lock(); repeat: page = NULL; - pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); + pagep = radix_tree_lookup_slot(&mapping->i_pages, offset); if (pagep) { page = radix_tree_deref_slot(pagep); if (unlikely(!page)) @@ -1633,7 +1634,7 @@ unsigned find_get_entries(struct address_space *mapping, return 0; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { struct page *head, *page; repeat: page = radix_tree_deref_slot(slot); @@ -1710,7 +1711,7 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, return 0; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, *start) { + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, *start) { struct page *head, *page; if (iter.index > end) @@ -1795,7 +1796,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, return 0; rcu_read_lock(); - radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) { + radix_tree_for_each_contig(slot, &mapping->i_pages, &iter, index) { struct page *head, *page; repeat: page = radix_tree_deref_slot(slot); @@ -1875,8 +1876,7 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, return 0; rcu_read_lock(); - radix_tree_for_each_tagged(slot, &mapping->page_tree, - &iter, *index, tag) { + radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, *index, tag) { struct page *head, *page; if (iter.index > end) @@ -1969,8 +1969,7 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, return 0; rcu_read_lock(); - radix_tree_for_each_tagged(slot, &mapping->page_tree, - &iter, start, tag) { + radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, tag) { struct page *head, *page; repeat: page = radix_tree_deref_slot(slot); @@ -2624,8 +2623,7 @@ void filemap_map_pages(struct vm_fault *vmf, struct page *head, *page; 
rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, - start_pgoff) { + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start_pgoff) { if (iter.index > end_pgoff) break; repeat: @@ -160,6 +160,32 @@ static void hmm_invalidate_range(struct hmm *hmm, up_read(&hmm->mirrors_sem); } +static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) +{ + struct hmm_mirror *mirror; + struct hmm *hmm = mm->hmm; + + down_write(&hmm->mirrors_sem); + mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror, + list); + while (mirror) { + list_del_init(&mirror->list); + if (mirror->ops->release) { + /* + * Drop mirrors_sem so callback can wait on any pending + * work that might itself trigger mmu_notifier callback + * and thus would deadlock with us. + */ + up_write(&hmm->mirrors_sem); + mirror->ops->release(mirror); + down_write(&hmm->mirrors_sem); + } + mirror = list_first_entry_or_null(&hmm->mirrors, + struct hmm_mirror, list); + } + up_write(&hmm->mirrors_sem); +} + static void hmm_invalidate_range_start(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, @@ -185,6 +211,7 @@ static void hmm_invalidate_range_end(struct mmu_notifier *mn, } static const struct mmu_notifier_ops hmm_mmu_notifier_ops = { + .release = hmm_release, .invalidate_range_start = hmm_invalidate_range_start, .invalidate_range_end = hmm_invalidate_range_end, }; @@ -206,13 +233,24 @@ int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm) if (!mm || !mirror || !mirror->ops) return -EINVAL; +again: mirror->hmm = hmm_register(mm); if (!mirror->hmm) return -ENOMEM; down_write(&mirror->hmm->mirrors_sem); - list_add(&mirror->list, &mirror->hmm->mirrors); - up_write(&mirror->hmm->mirrors_sem); + if (mirror->hmm->mm == NULL) { + /* + * A racing hmm_mirror_unregister() is about to destroy the hmm + * struct. Try again to allocate a new one. 
+ */ + up_write(&mirror->hmm->mirrors_sem); + mirror->hmm = NULL; + goto again; + } else { + list_add(&mirror->list, &mirror->hmm->mirrors); + up_write(&mirror->hmm->mirrors_sem); + } return 0; } @@ -227,11 +265,32 @@ EXPORT_SYMBOL(hmm_mirror_register); */ void hmm_mirror_unregister(struct hmm_mirror *mirror) { - struct hmm *hmm = mirror->hmm; + bool should_unregister = false; + struct mm_struct *mm; + struct hmm *hmm; + if (mirror->hmm == NULL) + return; + + hmm = mirror->hmm; down_write(&hmm->mirrors_sem); - list_del(&mirror->list); + list_del_init(&mirror->list); + should_unregister = list_empty(&hmm->mirrors); + mirror->hmm = NULL; + mm = hmm->mm; + hmm->mm = NULL; up_write(&hmm->mirrors_sem); + + if (!should_unregister || mm == NULL) + return; + + spin_lock(&mm->page_table_lock); + if (mm->hmm == hmm) + mm->hmm = NULL; + spin_unlock(&mm->page_table_lock); + + mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm); + kfree(hmm); } EXPORT_SYMBOL(hmm_mirror_unregister); @@ -240,110 +299,275 @@ struct hmm_vma_walk { unsigned long last; bool fault; bool block; - bool write; }; -static int hmm_vma_do_fault(struct mm_walk *walk, - unsigned long addr, - hmm_pfn_t *pfn) +static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, + bool write_fault, uint64_t *pfn) { unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE; struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; struct vm_area_struct *vma = walk->vma; int r; flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY; - flags |= hmm_vma_walk->write ? FAULT_FLAG_WRITE : 0; + flags |= write_fault ? 
FAULT_FLAG_WRITE : 0; r = handle_mm_fault(vma, addr, flags); if (r & VM_FAULT_RETRY) return -EBUSY; if (r & VM_FAULT_ERROR) { - *pfn = HMM_PFN_ERROR; + *pfn = range->values[HMM_PFN_ERROR]; return -EFAULT; } return -EAGAIN; } -static void hmm_pfns_special(hmm_pfn_t *pfns, - unsigned long addr, - unsigned long end) -{ - for (; addr < end; addr += PAGE_SIZE, pfns++) - *pfns = HMM_PFN_SPECIAL; -} - static int hmm_pfns_bad(unsigned long addr, unsigned long end, struct mm_walk *walk) { - struct hmm_range *range = walk->private; - hmm_pfn_t *pfns = range->pfns; + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + uint64_t *pfns = range->pfns; unsigned long i; i = (addr - range->start) >> PAGE_SHIFT; for (; addr < end; addr += PAGE_SIZE, i++) - pfns[i] = HMM_PFN_ERROR; + pfns[i] = range->values[HMM_PFN_ERROR]; return 0; } -static void hmm_pfns_clear(hmm_pfn_t *pfns, - unsigned long addr, - unsigned long end) -{ - for (; addr < end; addr += PAGE_SIZE, pfns++) - *pfns = 0; -} - -static int hmm_vma_walk_hole(unsigned long addr, - unsigned long end, - struct mm_walk *walk) +/* + * hmm_vma_walk_hole() - handle a range lacking valid pmd or pte(s) + * @start: range virtual start address (inclusive) + * @end: range virtual end address (exclusive) + * @fault: should we fault or not ? + * @write_fault: write fault ? + * @walk: mm_walk structure + * Returns: 0 on success, -EAGAIN after page fault, or page fault error + * + * This function will be called whenever pmd_none() or pte_none() returns true, + * or whenever there is no page directory covering the virtual address range. 
+ */ +static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end, + bool fault, bool write_fault, + struct mm_walk *walk) { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; - hmm_pfn_t *pfns = range->pfns; + uint64_t *pfns = range->pfns; unsigned long i; hmm_vma_walk->last = addr; i = (addr - range->start) >> PAGE_SHIFT; for (; addr < end; addr += PAGE_SIZE, i++) { - pfns[i] = HMM_PFN_EMPTY; - if (hmm_vma_walk->fault) { + pfns[i] = range->values[HMM_PFN_NONE]; + if (fault || write_fault) { int ret; - ret = hmm_vma_do_fault(walk, addr, &pfns[i]); + ret = hmm_vma_do_fault(walk, addr, write_fault, + &pfns[i]); if (ret != -EAGAIN) return ret; } } - return hmm_vma_walk->fault ? -EAGAIN : 0; + return (fault || write_fault) ? -EAGAIN : 0; } -static int hmm_vma_walk_clear(unsigned long addr, - unsigned long end, - struct mm_walk *walk) +static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, + uint64_t pfns, uint64_t cpu_flags, + bool *fault, bool *write_fault) { - struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; - hmm_pfn_t *pfns = range->pfns; + + *fault = *write_fault = false; + if (!hmm_vma_walk->fault) + return; + + /* We aren't asked to do anything ... */ + if (!(pfns & range->flags[HMM_PFN_VALID])) + return; + /* If this is device memory then only fault if explicitly requested */ + if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) { + /* Do we fault on device memory ? */ + if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) { + *write_fault = pfns & range->flags[HMM_PFN_WRITE]; + *fault = true; + } + return; + } + + /* If CPU page table is not valid then we need to fault */ + *fault = !(cpu_flags & range->flags[HMM_PFN_VALID]); + /* Need to write fault ? 
*/ + if ((pfns & range->flags[HMM_PFN_WRITE]) && + !(cpu_flags & range->flags[HMM_PFN_WRITE])) { + *write_fault = true; + *fault = true; + } +} + +static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk, + const uint64_t *pfns, unsigned long npages, + uint64_t cpu_flags, bool *fault, + bool *write_fault) +{ unsigned long i; - hmm_vma_walk->last = addr; + if (!hmm_vma_walk->fault) { + *fault = *write_fault = false; + return; + } + + for (i = 0; i < npages; ++i) { + hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags, + fault, write_fault); + if ((*fault) || (*write_fault)) + return; + } +} + +static int hmm_vma_walk_hole(unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + bool fault, write_fault; + unsigned long i, npages; + uint64_t *pfns; + i = (addr - range->start) >> PAGE_SHIFT; - for (; addr < end; addr += PAGE_SIZE, i++) { - pfns[i] = 0; - if (hmm_vma_walk->fault) { - int ret; + npages = (end - addr) >> PAGE_SHIFT; + pfns = &range->pfns[i]; + hmm_range_need_fault(hmm_vma_walk, pfns, npages, + 0, &fault, &write_fault); + return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); +} - ret = hmm_vma_do_fault(walk, addr, &pfns[i]); - if (ret != -EAGAIN) - return ret; +static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd) +{ + if (pmd_protnone(pmd)) + return 0; + return pmd_write(pmd) ? 
range->flags[HMM_PFN_VALID] | + range->flags[HMM_PFN_WRITE] : + range->flags[HMM_PFN_VALID]; +} + +static int hmm_vma_handle_pmd(struct mm_walk *walk, + unsigned long addr, + unsigned long end, + uint64_t *pfns, + pmd_t pmd) +{ + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + unsigned long pfn, npages, i; + bool fault, write_fault; + uint64_t cpu_flags; + + npages = (end - addr) >> PAGE_SHIFT; + cpu_flags = pmd_to_hmm_pfn_flags(range, pmd); + hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags, + &fault, &write_fault); + + if (pmd_protnone(pmd) || fault || write_fault) + return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); + + pfn = pmd_pfn(pmd) + pte_index(addr); + for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) + pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags; + hmm_vma_walk->last = end; + return 0; +} + +static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte) +{ + if (pte_none(pte) || !pte_present(pte)) + return 0; + return pte_write(pte) ? 
range->flags[HMM_PFN_VALID] | + range->flags[HMM_PFN_WRITE] : + range->flags[HMM_PFN_VALID]; +} + +static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, + unsigned long end, pmd_t *pmdp, pte_t *ptep, + uint64_t *pfn) +{ + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + struct vm_area_struct *vma = walk->vma; + bool fault, write_fault; + uint64_t cpu_flags; + pte_t pte = *ptep; + uint64_t orig_pfn = *pfn; + + *pfn = range->values[HMM_PFN_NONE]; + cpu_flags = pte_to_hmm_pfn_flags(range, pte); + hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, + &fault, &write_fault); + + if (pte_none(pte)) { + if (fault || write_fault) + goto fault; + return 0; + } + + if (!pte_present(pte)) { + swp_entry_t entry = pte_to_swp_entry(pte); + + if (!non_swap_entry(entry)) { + if (fault || write_fault) + goto fault; + return 0; } + + /* + * This is a special swap entry, ignore migration, use + * device and report anything else as error. + */ + if (is_device_private_entry(entry)) { + cpu_flags = range->flags[HMM_PFN_VALID] | + range->flags[HMM_PFN_DEVICE_PRIVATE]; + cpu_flags |= is_write_device_private_entry(entry) ? + range->flags[HMM_PFN_WRITE] : 0; + hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, + &fault, &write_fault); + if (fault || write_fault) + goto fault; + *pfn = hmm_pfn_from_pfn(range, swp_offset(entry)); + *pfn |= cpu_flags; + return 0; + } + + if (is_migration_entry(entry)) { + if (fault || write_fault) { + pte_unmap(ptep); + hmm_vma_walk->last = addr; + migration_entry_wait(vma->vm_mm, + pmdp, addr); + return -EAGAIN; + } + return 0; + } + + /* Report error for everything else */ + *pfn = range->values[HMM_PFN_ERROR]; + return -EFAULT; } - return hmm_vma_walk->fault ? 
-EAGAIN : 0; + if (fault || write_fault) + goto fault; + + *pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags; + return 0; + +fault: + pte_unmap(ptep); + /* Fault any virtual address we were asked to fault */ + return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); } static int hmm_vma_walk_pmd(pmd_t *pmdp, @@ -353,26 +577,20 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; - struct vm_area_struct *vma = walk->vma; - hmm_pfn_t *pfns = range->pfns; + uint64_t *pfns = range->pfns; unsigned long addr = start, i; - bool write_fault; - hmm_pfn_t flag; pte_t *ptep; i = (addr - range->start) >> PAGE_SHIFT; - flag = vma->vm_flags & VM_READ ? HMM_PFN_READ : 0; - write_fault = hmm_vma_walk->fault & hmm_vma_walk->write; again: if (pmd_none(*pmdp)) return hmm_vma_walk_hole(start, end, walk); - if (pmd_huge(*pmdp) && vma->vm_flags & VM_HUGETLB) + if (pmd_huge(*pmdp) && (range->vma->vm_flags & VM_HUGETLB)) return hmm_pfns_bad(start, end, walk); if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) { - unsigned long pfn; pmd_t pmd; /* @@ -388,17 +606,8 @@ again: barrier(); if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd)) goto again; - if (pmd_protnone(pmd)) - return hmm_vma_walk_clear(start, end, walk); - if (write_fault && !pmd_write(pmd)) - return hmm_vma_walk_clear(start, end, walk); - - pfn = pmd_pfn(pmd) + pte_index(addr); - flag |= pmd_write(pmd) ? 
HMM_PFN_WRITE : 0; - for (; addr < end; addr += PAGE_SIZE, i++, pfn++) - pfns[i] = hmm_pfn_t_from_pfn(pfn) | flag; - return 0; + return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd); } if (pmd_bad(*pmdp)) @@ -406,79 +615,43 @@ again: ptep = pte_offset_map(pmdp, addr); for (; addr < end; addr += PAGE_SIZE, ptep++, i++) { - pte_t pte = *ptep; - - pfns[i] = 0; + int r; - if (pte_none(pte)) { - pfns[i] = HMM_PFN_EMPTY; - if (hmm_vma_walk->fault) - goto fault; - continue; + r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]); + if (r) { + /* hmm_vma_handle_pte() did unmap pte directory */ + hmm_vma_walk->last = addr; + return r; } - - if (!pte_present(pte)) { - swp_entry_t entry = pte_to_swp_entry(pte); - - if (!non_swap_entry(entry)) { - if (hmm_vma_walk->fault) - goto fault; - continue; - } - - /* - * This is a special swap entry, ignore migration, use - * device and report anything else as error. - */ - if (is_device_private_entry(entry)) { - pfns[i] = hmm_pfn_t_from_pfn(swp_offset(entry)); - if (is_write_device_private_entry(entry)) { - pfns[i] |= HMM_PFN_WRITE; - } else if (write_fault) - goto fault; - pfns[i] |= HMM_PFN_DEVICE_UNADDRESSABLE; - pfns[i] |= flag; - } else if (is_migration_entry(entry)) { - if (hmm_vma_walk->fault) { - pte_unmap(ptep); - hmm_vma_walk->last = addr; - migration_entry_wait(vma->vm_mm, - pmdp, addr); - return -EAGAIN; - } - continue; - } else { - /* Report error for everything else */ - pfns[i] = HMM_PFN_ERROR; - } - continue; - } - - if (write_fault && !pte_write(pte)) - goto fault; - - pfns[i] = hmm_pfn_t_from_pfn(pte_pfn(pte)) | flag; - pfns[i] |= pte_write(pte) ? 
HMM_PFN_WRITE : 0; - continue; - -fault: - pte_unmap(ptep); - /* Fault all pages in range */ - return hmm_vma_walk_clear(start, end, walk); } pte_unmap(ptep - 1); + hmm_vma_walk->last = addr; return 0; } +static void hmm_pfns_clear(struct hmm_range *range, + uint64_t *pfns, + unsigned long addr, + unsigned long end) +{ + for (; addr < end; addr += PAGE_SIZE, pfns++) + *pfns = range->values[HMM_PFN_NONE]; +} + +static void hmm_pfns_special(struct hmm_range *range) +{ + unsigned long addr = range->start, i = 0; + + for (; addr < range->end; addr += PAGE_SIZE, i++) + range->pfns[i] = range->values[HMM_PFN_SPECIAL]; +} + /* * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses - * @vma: virtual memory area containing the virtual address range - * @range: used to track snapshot validity - * @start: range virtual start address (inclusive) - * @end: range virtual end address (exclusive) - * @entries: array of hmm_pfn_t: provided by the caller, filled in by function - * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, 0 success + * @range: range being snapshotted + * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid + * vma permission, 0 success * * This snapshots the CPU page table for a range of virtual addresses. Snapshot * validity is tracked by range struct. See hmm_vma_range_done() for further @@ -491,26 +664,17 @@ fault: * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS * MEMORY CORRUPTION ! YOU HAVE BEEN WARNED ! 
*/ -int hmm_vma_get_pfns(struct vm_area_struct *vma, - struct hmm_range *range, - unsigned long start, - unsigned long end, - hmm_pfn_t *pfns) +int hmm_vma_get_pfns(struct hmm_range *range) { + struct vm_area_struct *vma = range->vma; struct hmm_vma_walk hmm_vma_walk; struct mm_walk mm_walk; struct hmm *hmm; - /* FIXME support hugetlb fs */ - if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { - hmm_pfns_special(pfns, start, end); - return -EINVAL; - } - /* Sanity check, this really should not happen ! */ - if (start < vma->vm_start || start >= vma->vm_end) + if (range->start < vma->vm_start || range->start >= vma->vm_end) return -EINVAL; - if (end < vma->vm_start || end > vma->vm_end) + if (range->end < vma->vm_start || range->end > vma->vm_end) return -EINVAL; hmm = hmm_register(vma->vm_mm); @@ -520,10 +684,24 @@ int hmm_vma_get_pfns(struct vm_area_struct *vma, if (!hmm->mmu_notifier.ops) return -EINVAL; + /* FIXME support hugetlb fs */ + if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { + hmm_pfns_special(range); + return -EINVAL; + } + + if (!(vma->vm_flags & VM_READ)) { + /* + * If vma does not allow read access, then assume that it does + * not allow write access, either. Architectures that allow + * write without read access are not supported by HMM, because + * operations such as atomic access would not work. 
+ */ + hmm_pfns_clear(range, range->pfns, range->start, range->end); + return -EPERM; + } + /* Initialize range to track CPU page table update */ - range->start = start; - range->pfns = pfns; - range->end = end; spin_lock(&hmm->lock); range->valid = true; list_add_rcu(&range->list, &hmm->ranges); @@ -541,14 +719,13 @@ int hmm_vma_get_pfns(struct vm_area_struct *vma, mm_walk.pmd_entry = hmm_vma_walk_pmd; mm_walk.pte_hole = hmm_vma_walk_hole; - walk_page_range(start, end, &mm_walk); + walk_page_range(range->start, range->end, &mm_walk); return 0; } EXPORT_SYMBOL(hmm_vma_get_pfns); /* * hmm_vma_range_done() - stop tracking change to CPU page table over a range - * @vma: virtual memory area containing the virtual address range * @range: range being tracked * Returns: false if range data has been invalidated, true otherwise * @@ -568,10 +745,10 @@ EXPORT_SYMBOL(hmm_vma_get_pfns); * * There are two ways to use this : * again: - * hmm_vma_get_pfns(vma, range, start, end, pfns); or hmm_vma_fault(...); + * hmm_vma_get_pfns(range); or hmm_vma_fault(...); * trans = device_build_page_table_update_transaction(pfns); * device_page_table_lock(); - * if (!hmm_vma_range_done(vma, range)) { + * if (!hmm_vma_range_done(range)) { * device_page_table_unlock(); * goto again; * } @@ -579,13 +756,13 @@ EXPORT_SYMBOL(hmm_vma_get_pfns); * device_page_table_unlock(); * * Or: - * hmm_vma_get_pfns(vma, range, start, end, pfns); or hmm_vma_fault(...); + * hmm_vma_get_pfns(range); or hmm_vma_fault(...); * device_page_table_lock(); - * hmm_vma_range_done(vma, range); - * device_update_page_table(pfns); + * hmm_vma_range_done(range); + * device_update_page_table(range->pfns); * device_page_table_unlock(); */ -bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range) +bool hmm_vma_range_done(struct hmm_range *range) { unsigned long npages = (range->end - range->start) >> PAGE_SHIFT; struct hmm *hmm; @@ -595,7 +772,7 @@ bool hmm_vma_range_done(struct vm_area_struct *vma, struct 
hmm_range *range) return false; } - hmm = hmm_register(vma->vm_mm); + hmm = hmm_register(range->vma->vm_mm); if (!hmm) { memset(range->pfns, 0, sizeof(*range->pfns) * npages); return false; @@ -611,36 +788,34 @@ EXPORT_SYMBOL(hmm_vma_range_done); /* * hmm_vma_fault() - try to fault some address in a virtual address range - * @vma: virtual memory area containing the virtual address range - * @range: use to track pfns array content validity - * @start: fault range virtual start address (inclusive) - * @end: fault range virtual end address (exclusive) - * @pfns: array of hmm_pfn_t, only entry with fault flag set will be faulted - * @write: is it a write fault + * @range: range being faulted * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) * Returns: 0 success, error otherwise (-EAGAIN means mmap_sem have been drop) * * This is similar to a regular CPU page fault except that it will not trigger * any memory migration if the memory being faulted is not accessible by CPUs. * - * On error, for one virtual address in the range, the function will set the - * hmm_pfn_t error flag for the corresponding pfn entry. + * On error, for one virtual address in the range, the function will mark the + * corresponding HMM pfn entry with an error flag. * * Expected use pattern: * retry: * down_read(&mm->mmap_sem); * // Find vma and address device wants to fault, initialize hmm_pfn_t * // array accordingly - * ret = hmm_vma_fault(vma, start, end, pfns, allow_retry); + * ret = hmm_vma_fault(range, block); * switch (ret) { * case -EAGAIN: - * hmm_vma_range_done(vma, range); + * hmm_vma_range_done(range); * // You might want to rate limit or yield to play nicely, you may * // also commit any valid pfn in the array assuming that you are * // getting true from hmm_vma_range_monitor_end() * goto retry; * case 0: * break; + * case -ENOMEM: + * case -EINVAL: + * case -EPERM: * default: * // Handle error ! 
* up_read(&mm->mmap_sem) @@ -648,7 +823,7 @@ EXPORT_SYMBOL(hmm_vma_range_done); * } * // Take device driver lock that serialize device page table update * driver_lock_device_page_table_update(); - * hmm_vma_range_done(vma, range); + * hmm_vma_range_done(range); * // Commit pfns we got from hmm_vma_fault() * driver_unlock_device_page_table_update(); * up_read(&mm->mmap_sem) @@ -658,51 +833,54 @@ EXPORT_SYMBOL(hmm_vma_range_done); * * YOU HAVE BEEN WARNED ! */ -int hmm_vma_fault(struct vm_area_struct *vma, - struct hmm_range *range, - unsigned long start, - unsigned long end, - hmm_pfn_t *pfns, - bool write, - bool block) +int hmm_vma_fault(struct hmm_range *range, bool block) { + struct vm_area_struct *vma = range->vma; + unsigned long start = range->start; struct hmm_vma_walk hmm_vma_walk; struct mm_walk mm_walk; struct hmm *hmm; int ret; /* Sanity check, this really should not happen ! */ - if (start < vma->vm_start || start >= vma->vm_end) + if (range->start < vma->vm_start || range->start >= vma->vm_end) return -EINVAL; - if (end < vma->vm_start || end > vma->vm_end) + if (range->end < vma->vm_start || range->end > vma->vm_end) return -EINVAL; hmm = hmm_register(vma->vm_mm); if (!hmm) { - hmm_pfns_clear(pfns, start, end); + hmm_pfns_clear(range, range->pfns, range->start, range->end); return -ENOMEM; } /* Caller must have registered a mirror using hmm_mirror_register() */ if (!hmm->mmu_notifier.ops) return -EINVAL; + /* FIXME support hugetlb fs */ + if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { + hmm_pfns_special(range); + return -EINVAL; + } + + if (!(vma->vm_flags & VM_READ)) { + /* + * If vma does not allow read access, then assume that it does + * not allow write access, either. Architectures that allow + * write without read access are not supported by HMM, because + * operations such as atomic access would not work. 
+ */ + hmm_pfns_clear(range, range->pfns, range->start, range->end); + return -EPERM; + } + /* Initialize range to track CPU page table update */ - range->start = start; - range->pfns = pfns; - range->end = end; spin_lock(&hmm->lock); range->valid = true; list_add_rcu(&range->list, &hmm->ranges); spin_unlock(&hmm->lock); - /* FIXME support hugetlb fs */ - if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { - hmm_pfns_special(pfns, start, end); - return 0; - } - hmm_vma_walk.fault = true; - hmm_vma_walk.write = write; hmm_vma_walk.block = block; hmm_vma_walk.range = range; mm_walk.private = &hmm_vma_walk; @@ -717,7 +895,7 @@ int hmm_vma_fault(struct vm_area_struct *vma, mm_walk.pte_hole = hmm_vma_walk_hole; do { - ret = walk_page_range(start, end, &mm_walk); + ret = walk_page_range(start, range->end, &mm_walk); start = hmm_vma_walk.last; } while (ret == -EAGAIN); @@ -725,8 +903,9 @@ int hmm_vma_fault(struct vm_area_struct *vma, unsigned long i; i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; - hmm_pfns_clear(&pfns[i], hmm_vma_walk.last, end); - hmm_vma_range_done(vma, range); + hmm_pfns_clear(range, &range->pfns[i], hmm_vma_walk.last, + range->end); + hmm_vma_range_done(range); } return ret; } @@ -845,13 +1024,6 @@ static void hmm_devmem_release(struct device *dev, void *data) hmm_devmem_radix_release(resource); } -static struct hmm_devmem *hmm_devmem_find(resource_size_t phys) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - - return radix_tree_lookup(&hmm_devmem_radix, phys >> PA_SECTION_SHIFT); -} - static int hmm_devmem_pages_create(struct hmm_devmem *devmem) { resource_size_t key, align_start, align_size, align_end; @@ -892,9 +1064,8 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem) for (key = align_start; key <= align_end; key += PA_SECTION_SIZE) { struct hmm_devmem *dup; - rcu_read_lock(); - dup = hmm_devmem_find(key); - rcu_read_unlock(); + dup = radix_tree_lookup(&hmm_devmem_radix, + key >> PA_SECTION_SHIFT); if (dup) { 
dev_err(device, "%s: collides with mapping for %s\n", __func__, dev_name(dup->device)); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f0ae8d1d4329..14ed6ee5e02f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -555,8 +555,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, VM_BUG_ON_PAGE(!PageCompound(page), page); - if (mem_cgroup_try_charge(page, vma->vm_mm, gfp | __GFP_NORETRY, &memcg, - true)) { + if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -1317,7 +1316,7 @@ alloc: } if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm, - huge_gfp | __GFP_NORETRY, &memcg, true))) { + huge_gfp, &memcg, true))) { put_page(new_page); split_huge_pmd(vma, vmf->pmd, vmf->address); if (page) @@ -2402,6 +2401,12 @@ static void __split_huge_page_tail(struct page *head, int tail, page_tail->index = head->index + tail; page_cpupid_xchg_last(page_tail, page_cpupid_last(head)); + + /* + * always add to the tail because some iterators expect new + * pages to show after the currently processed elements - e.g. + * migrate_pages + */ lru_add_page_tail(head, page_tail, lruvec, list); } @@ -2445,7 +2450,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, } else { /* Additional pin to radix tree */ page_ref_add(head, 2); - spin_unlock(&head->mapping->tree_lock); + xa_unlock(&head->mapping->i_pages); } spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); @@ -2653,15 +2658,15 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) if (mapping) { void **pslot; - spin_lock(&mapping->tree_lock); - pslot = radix_tree_lookup_slot(&mapping->page_tree, + xa_lock(&mapping->i_pages); + pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(head)); /* * Check if the head page is present in radix tree. * We assume all tail are present too, if head is there. 
*/ if (radix_tree_deref_slot_protected(pslot, - &mapping->tree_lock) != head) + &mapping->i_pages.xa_lock) != head) goto fail; } @@ -2695,7 +2700,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) } spin_unlock(&pgdata->split_queue_lock); fail: if (mapping) - spin_unlock(&mapping->tree_lock); + xa_unlock(&mapping->i_pages); spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); unfreeze_page(head); ret = -EBUSY; diff --git a/mm/internal.h b/mm/internal.h index e6bd35182dae..62d8c34e63d5 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -168,6 +168,9 @@ extern void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); extern int user_min_free_kbytes; +extern void set_zone_contiguous(struct zone *zone); +extern void clear_zone_contiguous(struct zone *zone); + #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* @@ -495,7 +498,6 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, #define ALLOC_HARDER 0x10 /* try to alloc harder */ #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ -#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ enum ttu_flags; struct tlbflush_unmap_batch; @@ -538,4 +540,5 @@ static inline bool is_migrate_highatomic_page(struct page *page) } void setup_zone_pageset(struct zone *zone); +extern struct page *alloc_new_node_page(struct page *page, unsigned long node); #endif /* __MM_INTERNAL_H */ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index e42568284e06..d7b2a4bf8671 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -965,9 +965,7 @@ static void collapse_huge_page(struct mm_struct *mm, goto out_nolock; } - /* Do not oom kill for khugepaged charges */ - if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp | __GFP_NORETRY, - &memcg, true))) { + if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { result = SCAN_CGROUP_CHARGE_FAIL; goto out_nolock; } @@ -1326,9 +1324,7 @@ static void 
collapse_shmem(struct mm_struct *mm, goto out; } - /* Do not oom kill for khugepaged charges */ - if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp | __GFP_NORETRY, - &memcg, true))) { + if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { result = SCAN_CGROUP_CHARGE_FAIL; goto out; } @@ -1348,8 +1344,8 @@ static void collapse_shmem(struct mm_struct *mm, */ index = start; - spin_lock_irq(&mapping->tree_lock); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + xa_lock_irq(&mapping->i_pages); + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { int n = min(iter.index, end) - index; /* @@ -1362,7 +1358,7 @@ static void collapse_shmem(struct mm_struct *mm, } nr_none += n; for (; index < min(iter.index, end); index++) { - radix_tree_insert(&mapping->page_tree, index, + radix_tree_insert(&mapping->i_pages, index, new_page + (index % HPAGE_PMD_NR)); } @@ -1371,16 +1367,16 @@ static void collapse_shmem(struct mm_struct *mm, break; page = radix_tree_deref_slot_protected(slot, - &mapping->tree_lock); + &mapping->i_pages.xa_lock); if (radix_tree_exceptional_entry(page) || !PageUptodate(page)) { - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); /* swap in or instantiate fallocated page */ if (shmem_getpage(mapping->host, index, &page, SGP_NOHUGE)) { result = SCAN_FAIL; goto tree_unlocked; } - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); } else if (trylock_page(page)) { get_page(page); } else { @@ -1389,7 +1385,7 @@ static void collapse_shmem(struct mm_struct *mm, } /* - * The page must be locked, so we can drop the tree_lock + * The page must be locked, so we can drop the i_pages lock * without racing with truncate. 
*/ VM_BUG_ON_PAGE(!PageLocked(page), page); @@ -1400,7 +1396,7 @@ static void collapse_shmem(struct mm_struct *mm, result = SCAN_TRUNCATED; goto out_unlock; } - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); if (isolate_lru_page(page)) { result = SCAN_DEL_PAGE_LRU; @@ -1410,11 +1406,11 @@ static void collapse_shmem(struct mm_struct *mm, if (page_mapped(page)) unmap_mapping_pages(mapping, index, 1, false); - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); - slot = radix_tree_lookup_slot(&mapping->page_tree, index); + slot = radix_tree_lookup_slot(&mapping->i_pages, index); VM_BUG_ON_PAGE(page != radix_tree_deref_slot_protected(slot, - &mapping->tree_lock), page); + &mapping->i_pages.xa_lock), page); VM_BUG_ON_PAGE(page_mapped(page), page); /* @@ -1435,14 +1431,14 @@ static void collapse_shmem(struct mm_struct *mm, list_add_tail(&page->lru, &pagelist); /* Finally, replace with the new page. */ - radix_tree_replace_slot(&mapping->page_tree, slot, + radix_tree_replace_slot(&mapping->i_pages, slot, new_page + (index % HPAGE_PMD_NR)); slot = radix_tree_iter_resume(slot, &iter); index++; continue; out_lru: - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); putback_lru_page(page); out_isolate_failed: unlock_page(page); @@ -1468,14 +1464,14 @@ out_unlock: } for (; index < end; index++) { - radix_tree_insert(&mapping->page_tree, index, + radix_tree_insert(&mapping->i_pages, index, new_page + (index % HPAGE_PMD_NR)); } nr_none += n; } tree_locked: - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); tree_unlocked: if (result == SCAN_SUCCEED) { @@ -1524,9 +1520,8 @@ tree_unlocked: } else { /* Something went wrong: rollback changes to the radix-tree */ shmem_uncharge(mapping->host, nr_none); - spin_lock_irq(&mapping->tree_lock); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, - start) { + xa_lock_irq(&mapping->i_pages); + radix_tree_for_each_slot(slot, 
&mapping->i_pages, &iter, start) { if (iter.index >= end) break; page = list_first_entry_or_null(&pagelist, @@ -1536,8 +1531,7 @@ tree_unlocked: break; nr_none--; /* Put holes back where they were */ - radix_tree_delete(&mapping->page_tree, - iter.index); + radix_tree_delete(&mapping->i_pages, iter.index); continue; } @@ -1546,16 +1540,15 @@ tree_unlocked: /* Unfreeze the page. */ list_del(&page->lru); page_ref_unfreeze(page, 2); - radix_tree_replace_slot(&mapping->page_tree, - slot, page); + radix_tree_replace_slot(&mapping->i_pages, slot, page); slot = radix_tree_iter_resume(slot, &iter); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); putback_lru_page(page); unlock_page(page); - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); } VM_BUG_ON(nr_none); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); /* Unfreeze new_page, caller would take care about freeing it */ page_ref_unfreeze(new_page, 1); @@ -1583,7 +1576,7 @@ static void khugepaged_scan_shmem(struct mm_struct *mm, swap = 0; memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { if (iter.index >= start + HPAGE_PMD_NR) break; @@ -1883,8 +1876,16 @@ static void set_recommended_min_free_kbytes(void) int nr_zones = 0; unsigned long recommended_min; - for_each_populated_zone(zone) + for_each_populated_zone(zone) { + /* + * We don't need to worry about fragmentation of + * ZONE_MOVABLE since it only has movable pages. 
+ */ + if (zone_idx(zone) > gfp_zone(GFP_USER)) + continue; + nr_zones++; + } /* Ensure 2 pageblocks are free to assist fragmentation avoidance */ recommended_min = pageblock_nr_pages * nr_zones * 2; @@ -1131,6 +1131,13 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, } else { newpte = pte_mkspecial(pfn_pte(page_to_pfn(kpage), vma->vm_page_prot)); + /* + * We're replacing an anonymous page with a zero page, which is + * not anonymous. We need to do proper accounting otherwise we + * will get wrong values in /proc, and a BUG message in dmesg + * when tearing down the mm. + */ + dec_mm_counter(mm, MM_ANONPAGES); } flush_cache_page(vma, addr, pte_pfn(*ptep)); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9ec024b862ac..e074f7c637aa 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1485,7 +1485,7 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) { - if (!current->memcg_may_oom) + if (!current->memcg_may_oom || order > PAGE_ALLOC_COSTLY_ORDER) return; /* * We are in the middle of the charge context here, so we @@ -1839,7 +1839,7 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu) } } - for (i = 0; i < MEMCG_NR_EVENTS; i++) { + for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { long x; x = this_cpu_xchg(memcg->stat_cpu->events[i], 0); @@ -1858,7 +1858,7 @@ static void reclaim_high(struct mem_cgroup *memcg, do { if (page_counter_read(&memcg->memory) <= memcg->high) continue; - mem_cgroup_event(memcg, MEMCG_HIGH); + memcg_memory_event(memcg, MEMCG_HIGH); try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); } while ((memcg = parent_mem_cgroup(memcg))); } @@ -1949,7 +1949,7 @@ retry: if (!gfpflags_allow_blocking(gfp_mask)) goto nomem; - mem_cgroup_event(mem_over_limit, MEMCG_MAX); + memcg_memory_event(mem_over_limit, MEMCG_MAX); nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, gfp_mask, may_swap); @@ -1992,7 +1992,7 @@ retry: if 
(fatal_signal_pending(current)) goto force; - mem_cgroup_event(mem_over_limit, MEMCG_OOM); + memcg_memory_event(mem_over_limit, MEMCG_OOM); mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages * PAGE_SIZE)); @@ -2688,10 +2688,10 @@ static void tree_events(struct mem_cgroup *memcg, unsigned long *events) struct mem_cgroup *iter; int i; - memset(events, 0, sizeof(*events) * MEMCG_NR_EVENTS); + memset(events, 0, sizeof(*events) * NR_VM_EVENT_ITEMS); for_each_mem_cgroup_tree(iter, memcg) { - for (i = 0; i < MEMCG_NR_EVENTS; i++) + for (i = 0; i < NR_VM_EVENT_ITEMS; i++) events[i] += memcg_sum_events(iter, i); } } @@ -4108,6 +4108,9 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) { struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; + if (!pn) + return; + free_percpu(pn->lruvec_stat_cpu); kfree(pn); } @@ -5178,7 +5181,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, continue; } - mem_cgroup_event(memcg, MEMCG_OOM); + memcg_memory_event(memcg, MEMCG_OOM); if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0)) break; } @@ -5191,10 +5194,14 @@ static int memory_events_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - seq_printf(m, "low %lu\n", memcg_sum_events(memcg, MEMCG_LOW)); - seq_printf(m, "high %lu\n", memcg_sum_events(memcg, MEMCG_HIGH)); - seq_printf(m, "max %lu\n", memcg_sum_events(memcg, MEMCG_MAX)); - seq_printf(m, "oom %lu\n", memcg_sum_events(memcg, MEMCG_OOM)); + seq_printf(m, "low %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_LOW])); + seq_printf(m, "high %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_HIGH])); + seq_printf(m, "max %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_MAX])); + seq_printf(m, "oom %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_OOM])); seq_printf(m, "oom_kill %lu\n", memcg_sum_events(memcg, OOM_KILL)); return 0; @@ -5204,7 +5211,7 @@ static int memory_stat_show(struct seq_file *m, void *v) { 
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); unsigned long stat[MEMCG_NR_STAT]; - unsigned long events[MEMCG_NR_EVENTS]; + unsigned long events[NR_VM_EVENT_ITEMS]; int i; /* @@ -5967,9 +5974,9 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) /* * Interrupts should be disabled here because the caller holds the - * mapping->tree_lock lock which is taken with interrupts-off. It is + * i_pages lock which is taken with interrupts-off. It is * important here to have the interrupts disabled because it is the - * only synchronisation we have for udpating the per-CPU variables. + * only synchronisation we have for updating the per-CPU variables. */ VM_BUG_ON(!irqs_disabled()); mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page), diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 2d4bf647cf01..9d142b9b86dc 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1487,7 +1487,7 @@ int unpoison_memory(unsigned long pfn) } EXPORT_SYMBOL(unpoison_memory); -static struct page *new_page(struct page *p, unsigned long private, int **x) +static struct page *new_page(struct page *p, unsigned long private) { int nid = page_to_nid(p); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index cc6dfa5832ca..f74826cdceea 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1329,8 +1329,7 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end) return 0; } -static struct page *new_node_page(struct page *page, unsigned long private, - int **result) +static struct page *new_node_page(struct page *page, unsigned long private) { int nid = page_to_nid(page); nodemask_t nmask = node_states[N_MEMORY]; @@ -1373,7 +1372,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (isolate_huge_page(page, &source)) move_pages -= 1 << compound_order(head); continue; - } else if (thp_migration_supported() && PageTransHuge(page)) + } else if (PageTransHuge(page)) pfn = 
page_to_pfn(compound_head(page)) + hpage_nr_pages(page) - 1; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 01cbb7078d6c..9ac49ef17b4e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -446,15 +446,6 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, __split_huge_pmd(walk->vma, pmd, addr, false, NULL); goto out; } - if (!thp_migration_supported()) { - get_page(page); - spin_unlock(ptl); - lock_page(page); - ret = split_huge_page(page); - unlock_page(page); - put_page(page); - goto out; - } if (!queue_pages_required(page, qp)) { ret = 1; goto unlock; @@ -495,7 +486,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, if (pmd_trans_unstable(pmd)) return 0; -retry: + pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) { if (!pte_present(*pte)) @@ -511,22 +502,6 @@ retry: continue; if (!queue_pages_required(page, qp)) continue; - if (PageTransCompound(page) && !thp_migration_supported()) { - get_page(page); - pte_unmap_unlock(pte, ptl); - lock_page(page); - ret = split_huge_page(page); - unlock_page(page); - put_page(page); - /* Failed to split -- skip. 
*/ - if (ret) { - pte = pte_offset_map_lock(walk->mm, pmd, - addr, &ptl); - continue; - } - goto retry; - } - migrate_page_add(page, qp->pagelist, flags); } pte_unmap_unlock(pte - 1, ptl); @@ -942,12 +917,13 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, } } -static struct page *new_node_page(struct page *page, unsigned long node, int **x) +/* page allocation callback for NUMA node migration */ +struct page *alloc_new_node_page(struct page *page, unsigned long node) { if (PageHuge(page)) return alloc_huge_page_node(page_hstate(compound_head(page)), node); - else if (thp_migration_supported() && PageTransHuge(page)) { + else if (PageTransHuge(page)) { struct page *thp; thp = alloc_pages_node(node, @@ -986,7 +962,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, flags | MPOL_MF_DISCONTIG_OK, &pagelist); if (!list_empty(&pagelist)) { - err = migrate_pages(&pagelist, new_node_page, NULL, dest, + err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest, MIGRATE_SYNC, MR_SYSCALL); if (err) putback_movable_pages(&pagelist); @@ -1107,7 +1083,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, * list of pages handed to migrate_pages()--which is how we get here-- * is in virtual address order. 
*/ -static struct page *new_page(struct page *page, unsigned long start, int **x) +static struct page *new_page(struct page *page, unsigned long start) { struct vm_area_struct *vma; unsigned long uninitialized_var(address); @@ -1123,7 +1099,7 @@ static struct page *new_page(struct page *page, unsigned long start, int **x) if (PageHuge(page)) { return alloc_huge_page_vma(page_hstate(compound_head(page)), vma, address); - } else if (thp_migration_supported() && PageTransHuge(page)) { + } else if (PageTransHuge(page)) { struct page *thp; thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address, @@ -1152,7 +1128,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, return -ENOSYS; } -static struct page *new_page(struct page *page, unsigned long start, int **x) +static struct page *new_page(struct page *page, unsigned long start) { return NULL; } diff --git a/mm/migrate.c b/mm/migrate.c index 003886606a22..f65dd69e1fd1 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -467,20 +467,21 @@ int migrate_page_move_mapping(struct address_space *mapping, oldzone = page_zone(page); newzone = page_zone(newpage); - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); - pslot = radix_tree_lookup_slot(&mapping->page_tree, + pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(page)); expected_count += 1 + page_has_private(page); if (page_count(page) != expected_count || - radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) { - spin_unlock_irq(&mapping->tree_lock); + radix_tree_deref_slot_protected(pslot, + &mapping->i_pages.xa_lock) != page) { + xa_unlock_irq(&mapping->i_pages); return -EAGAIN; } if (!page_ref_freeze(page, expected_count)) { - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return -EAGAIN; } @@ -494,7 +495,7 @@ int migrate_page_move_mapping(struct address_space *mapping, if (mode == MIGRATE_ASYNC && head && !buffer_migrate_lock_buffers(head, mode)) { page_ref_unfreeze(page, 
expected_count); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return -EAGAIN; } @@ -522,7 +523,7 @@ int migrate_page_move_mapping(struct address_space *mapping, SetPageDirty(newpage); } - radix_tree_replace_slot(&mapping->page_tree, pslot, newpage); + radix_tree_replace_slot(&mapping->i_pages, pslot, newpage); /* * Drop cache reference from old page by unfreezing @@ -531,7 +532,7 @@ int migrate_page_move_mapping(struct address_space *mapping, */ page_ref_unfreeze(page, expected_count - 1); - spin_unlock(&mapping->tree_lock); + xa_unlock(&mapping->i_pages); /* Leave irq disabled to prevent preemption while updating stats */ /* @@ -574,20 +575,19 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, int expected_count; void **pslot; - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); - pslot = radix_tree_lookup_slot(&mapping->page_tree, - page_index(page)); + pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(page)); expected_count = 2 + page_has_private(page); if (page_count(page) != expected_count || - radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) { - spin_unlock_irq(&mapping->tree_lock); + radix_tree_deref_slot_protected(pslot, &mapping->i_pages.xa_lock) != page) { + xa_unlock_irq(&mapping->i_pages); return -EAGAIN; } if (!page_ref_freeze(page, expected_count)) { - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return -EAGAIN; } @@ -596,11 +596,11 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, get_page(newpage); - radix_tree_replace_slot(&mapping->page_tree, pslot, newpage); + radix_tree_replace_slot(&mapping->i_pages, pslot, newpage); page_ref_unfreeze(page, expected_count - 1); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return MIGRATEPAGE_SUCCESS; } @@ -1137,10 +1137,12 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, enum migrate_reason reason) { int rc = 
MIGRATEPAGE_SUCCESS; - int *result = NULL; struct page *newpage; - newpage = get_new_page(page, private, &result); + if (!thp_migration_supported() && PageTransHuge(page)) + return -ENOMEM; + + newpage = get_new_page(page, private); if (!newpage) return -ENOMEM; @@ -1161,14 +1163,6 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, goto out; } - if (unlikely(PageTransHuge(page) && !PageTransHuge(newpage))) { - lock_page(page); - rc = split_huge_page(page); - unlock_page(page); - if (rc) - goto out; - } - rc = __unmap_and_move(page, newpage, force, mode); if (rc == MIGRATEPAGE_SUCCESS) set_page_owner_migrate_reason(newpage, reason); @@ -1231,12 +1225,6 @@ put_new: put_page(newpage); } - if (result) { - if (rc) - *result = rc; - else - *result = page_to_nid(newpage); - } return rc; } @@ -1264,7 +1252,6 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, enum migrate_mode mode, int reason) { int rc = -EAGAIN; - int *result = NULL; int page_was_mapped = 0; struct page *new_hpage; struct anon_vma *anon_vma = NULL; @@ -1281,7 +1268,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, return -ENOSYS; } - new_hpage = get_new_page(hpage, private, &result); + new_hpage = get_new_page(hpage, private); if (!new_hpage) return -ENOMEM; @@ -1345,12 +1332,6 @@ out: else putback_active_hugepage(new_hpage); - if (result) { - if (rc) - *result = rc; - else - *result = page_to_nid(new_hpage); - } return rc; } @@ -1395,6 +1376,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, retry = 0; list_for_each_entry_safe(page, page2, from, lru) { +retry: cond_resched(); if (PageHuge(page)) @@ -1408,6 +1390,26 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, switch(rc) { case -ENOMEM: + /* + * THP migration might be unsupported or the + * allocation could've failed so we should + * retry on the same page with the THP split + * to base pages. 
+ * + * Head page is retried immediately and tail + * pages are added to the tail of the list so + * we encounter them after the rest of the list + * is processed. + */ + if (PageTransHuge(page)) { + lock_page(page); + rc = split_huge_page_to_list(page, from); + unlock_page(page); + if (!rc) { + list_safe_reset_next(page, page2, lru); + goto retry; + } + } nr_failed++; goto out; case -EAGAIN: @@ -1444,141 +1446,101 @@ out: } #ifdef CONFIG_NUMA -/* - * Move a list of individual pages - */ -struct page_to_node { - unsigned long addr; - struct page *page; - int node; - int status; -}; -static struct page *new_page_node(struct page *p, unsigned long private, - int **result) +static int store_status(int __user *status, int start, int value, int nr) { - struct page_to_node *pm = (struct page_to_node *)private; - - while (pm->node != MAX_NUMNODES && pm->page != p) - pm++; + while (nr-- > 0) { + if (put_user(value, status + start)) + return -EFAULT; + start++; + } - if (pm->node == MAX_NUMNODES) - return NULL; + return 0; +} - *result = &pm->status; +static int do_move_pages_to_node(struct mm_struct *mm, + struct list_head *pagelist, int node) +{ + int err; - if (PageHuge(p)) - return alloc_huge_page_node(page_hstate(compound_head(p)), - pm->node); - else if (thp_migration_supported() && PageTransHuge(p)) { - struct page *thp; + if (list_empty(pagelist)) + return 0; - thp = alloc_pages_node(pm->node, - (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_RECLAIM, - HPAGE_PMD_ORDER); - if (!thp) - return NULL; - prep_transhuge_page(thp); - return thp; - } else - return __alloc_pages_node(pm->node, - GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0); + err = migrate_pages(pagelist, alloc_new_node_page, NULL, node, + MIGRATE_SYNC, MR_SYSCALL); + if (err) + putback_movable_pages(pagelist); + return err; } /* - * Move a set of pages as indicated in the pm array. The addr - * field must be set to the virtual address of the page to be moved - * and the node number must contain a valid target node. 
- * The pm array ends with node = MAX_NUMNODES. + * Resolves the given address to a struct page, isolates it from the LRU and + * puts it to the given pagelist. + * Returns -errno if the page cannot be found/isolated or 0 when it has been + * queued or the page doesn't need to be migrated because it is already on + * the target node */ -static int do_move_page_to_node_array(struct mm_struct *mm, - struct page_to_node *pm, - int migrate_all) +static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, + int node, struct list_head *pagelist, bool migrate_all) { + struct vm_area_struct *vma; + struct page *page; + unsigned int follflags; int err; - struct page_to_node *pp; - LIST_HEAD(pagelist); down_read(&mm->mmap_sem); + err = -EFAULT; + vma = find_vma(mm, addr); + if (!vma || addr < vma->vm_start || !vma_migratable(vma)) + goto out; - /* - * Build a list of pages to migrate - */ - for (pp = pm; pp->node != MAX_NUMNODES; pp++) { - struct vm_area_struct *vma; - struct page *page; - struct page *head; - unsigned int follflags; - - err = -EFAULT; - vma = find_vma(mm, pp->addr); - if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma)) - goto set_status; - - /* FOLL_DUMP to ignore special (like zero) pages */ - follflags = FOLL_GET | FOLL_DUMP; - if (!thp_migration_supported()) - follflags |= FOLL_SPLIT; - page = follow_page(vma, pp->addr, follflags); + /* FOLL_DUMP to ignore special (like zero) pages */ + follflags = FOLL_GET | FOLL_DUMP; + page = follow_page(vma, addr, follflags); - err = PTR_ERR(page); - if (IS_ERR(page)) - goto set_status; + err = PTR_ERR(page); + if (IS_ERR(page)) + goto out; - err = -ENOENT; - if (!page) - goto set_status; + err = -ENOENT; + if (!page) + goto out; - err = page_to_nid(page); + err = 0; + if (page_to_nid(page) == node) + goto out_putpage; - if (err == pp->node) - /* - * Node already in the right place - */ - goto put_and_set; + err = -EACCES; + if (page_mapcount(page) > 1 && !migrate_all) + goto out_putpage; - 
err = -EACCES; - if (page_mapcount(page) > 1 && - !migrate_all) - goto put_and_set; - - if (PageHuge(page)) { - if (PageHead(page)) { - isolate_huge_page(page, &pagelist); - err = 0; - pp->page = page; - } - goto put_and_set; + if (PageHuge(page)) { + if (PageHead(page)) { + isolate_huge_page(page, pagelist); + err = 0; } + } else { + struct page *head; - pp->page = compound_head(page); head = compound_head(page); err = isolate_lru_page(head); - if (!err) { - list_add_tail(&head->lru, &pagelist); - mod_node_page_state(page_pgdat(head), - NR_ISOLATED_ANON + page_is_file_cache(head), - hpage_nr_pages(head)); - } -put_and_set: - /* - * Either remove the duplicate refcount from - * isolate_lru_page() or drop the page ref if it was - * not isolated. - */ - put_page(page); -set_status: - pp->status = err; - } - - err = 0; - if (!list_empty(&pagelist)) { - err = migrate_pages(&pagelist, new_page_node, NULL, - (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL); if (err) - putback_movable_pages(&pagelist); - } + goto out_putpage; + err = 0; + list_add_tail(&head->lru, pagelist); + mod_node_page_state(page_pgdat(head), + NR_ISOLATED_ANON + page_is_file_cache(head), + hpage_nr_pages(head)); + } +out_putpage: + /* + * Either remove the duplicate refcount from + * isolate_lru_page() or drop the page ref if it was + * not isolated. 
+ */ + put_page(page); +out: up_read(&mm->mmap_sem); return err; } @@ -1593,79 +1555,79 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, const int __user *nodes, int __user *status, int flags) { - struct page_to_node *pm; - unsigned long chunk_nr_pages; - unsigned long chunk_start; - int err; - - err = -ENOMEM; - pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); - if (!pm) - goto out; + int current_node = NUMA_NO_NODE; + LIST_HEAD(pagelist); + int start, i; + int err = 0, err1; migrate_prep(); - /* - * Store a chunk of page_to_node array in a page, - * but keep the last one as a marker - */ - chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1; - - for (chunk_start = 0; - chunk_start < nr_pages; - chunk_start += chunk_nr_pages) { - int j; + for (i = start = 0; i < nr_pages; i++) { + const void __user *p; + unsigned long addr; + int node; - if (chunk_start + chunk_nr_pages > nr_pages) - chunk_nr_pages = nr_pages - chunk_start; - - /* fill the chunk pm with addrs and nodes from user-space */ - for (j = 0; j < chunk_nr_pages; j++) { - const void __user *p; - int node; - - err = -EFAULT; - if (get_user(p, pages + j + chunk_start)) - goto out_pm; - pm[j].addr = (unsigned long) p; - - if (get_user(node, nodes + j + chunk_start)) - goto out_pm; - - err = -ENODEV; - if (node < 0 || node >= MAX_NUMNODES) - goto out_pm; - - if (!node_state(node, N_MEMORY)) - goto out_pm; - - err = -EACCES; - if (!node_isset(node, task_nodes)) - goto out_pm; + err = -EFAULT; + if (get_user(p, pages + i)) + goto out_flush; + if (get_user(node, nodes + i)) + goto out_flush; + addr = (unsigned long)p; + + err = -ENODEV; + if (node < 0 || node >= MAX_NUMNODES) + goto out_flush; + if (!node_state(node, N_MEMORY)) + goto out_flush; - pm[j].node = node; + err = -EACCES; + if (!node_isset(node, task_nodes)) + goto out_flush; + + if (current_node == NUMA_NO_NODE) { + current_node = node; + start = i; + } else if (node != current_node) { + err = 
do_move_pages_to_node(mm, &pagelist, current_node); + if (err) + goto out; + err = store_status(status, start, current_node, i - start); + if (err) + goto out; + start = i; + current_node = node; } - /* End marker for this chunk */ - pm[chunk_nr_pages].node = MAX_NUMNODES; - - /* Migrate this chunk */ - err = do_move_page_to_node_array(mm, pm, - flags & MPOL_MF_MOVE_ALL); - if (err < 0) - goto out_pm; + /* + * Errors in the page lookup or isolation are not fatal and we simply + * report them via status + */ + err = add_page_for_migration(mm, addr, current_node, + &pagelist, flags & MPOL_MF_MOVE_ALL); + if (!err) + continue; - /* Return status information */ - for (j = 0; j < chunk_nr_pages; j++) - if (put_user(pm[j].status, status + j + chunk_start)) { - err = -EFAULT; - goto out_pm; - } - } - err = 0; + err = store_status(status, i, err, 1); + if (err) + goto out_flush; -out_pm: - free_page((unsigned long)pm); + err = do_move_pages_to_node(mm, &pagelist, current_node); + if (err) + goto out; + if (i > start) { + err = store_status(status, start, current_node, i - start); + if (err) + goto out; + } + current_node = NUMA_NO_NODE; + } +out_flush: + /* Make sure we do not overwrite the existing error */ + err1 = do_move_pages_to_node(mm, &pagelist, current_node); + if (!err1) + err1 = store_status(status, start, current_node, i - start); + if (!err) + err = err1; out: return err; } @@ -1866,8 +1828,7 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat, } static struct page *alloc_misplaced_dst_page(struct page *page, - unsigned long data, - int **result) + unsigned long data) { int nid = (int) data; struct page *newpage; @@ -1987,6 +1948,13 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, goto out; /* + * Also do not migrate dirty pages as not all filesystems can move + * dirty pages in MIGRATE_ASYNC mode which is a waste of cycles. 
+ */ + if (page_is_file_cache(page) && PageDirty(page)) + goto out; + + /* * Rate-limit the amount of data that is being migrated to a node. * Optimal placement is no good if the memory bus is saturated and * all the time is being spent migrating! @@ -2339,7 +2307,8 @@ again: ptep_get_and_clear(mm, addr, ptep); /* Setup special migration page table entry */ - entry = make_migration_entry(page, pte_write(pte)); + entry = make_migration_entry(page, mpfn & + MIGRATE_PFN_WRITE); swp_pte = swp_entry_to_pte(entry); if (pte_soft_dirty(pte)) swp_pte = pte_swp_mksoft_dirty(swp_pte); diff --git a/mm/mmap.c b/mm/mmap.c index f2154fc2548b..188f195883b9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1342,6 +1342,10 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (!(file && path_noexec(&file->f_path))) prot |= PROT_EXEC; + /* force arch specific MAP_FIXED handling in get_unmapped_area */ + if (flags & MAP_FIXED_NOREPLACE) + flags |= MAP_FIXED; + if (!(flags & MAP_FIXED)) addr = round_hint_to_min(addr); @@ -1365,6 +1369,13 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (offset_in_page(addr)) return addr; + if (flags & MAP_FIXED_NOREPLACE) { + struct vm_area_struct *vma = find_vma(mm, addr); + + if (vma && vma->vm_start <= addr) + return -EEXIST; + } + if (prot == PROT_EXEC) { pkey = execute_only_pkey(mm); if (pkey < 0) diff --git a/mm/mprotect.c b/mm/mprotect.c index c1d6af7455da..625608bc8962 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -27,6 +27,7 @@ #include <linux/pkeys.h> #include <linux/ksm.h> #include <linux/uaccess.h> +#include <linux/mm_inline.h> #include <asm/pgtable.h> #include <asm/cacheflush.h> #include <asm/mmu_context.h> @@ -89,6 +90,14 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, page_mapcount(page) != 1) continue; + /* + * While migration can move some dirty pages, + * it cannot move them all from MIGRATE_ASYNC + * context. 
+ */ + if (page_is_file_cache(page) && PageDirty(page)) + continue; + /* Avoid TLB flush if possible */ if (pte_protnone(oldpte)) continue; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 586f31261c83..5c1a3279e63f 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2099,7 +2099,8 @@ void __init page_writeback_init(void) * so that it can tag pages faster than a dirtying process can create them). */ /* - * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency. + * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce the i_pages lock + * latency. */ void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end) @@ -2109,22 +2110,22 @@ void tag_pages_for_writeback(struct address_space *mapping, struct radix_tree_iter iter; void **slot; - spin_lock_irq(&mapping->tree_lock); - radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, start, + xa_lock_irq(&mapping->i_pages); + radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, PAGECACHE_TAG_DIRTY) { if (iter.index > end) break; - radix_tree_iter_tag_set(&mapping->page_tree, &iter, + radix_tree_iter_tag_set(&mapping->i_pages, &iter, PAGECACHE_TAG_TOWRITE); tagged++; if ((tagged % WRITEBACK_TAG_BATCH) != 0) continue; slot = radix_tree_iter_resume(slot, &iter); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); cond_resched(); - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); } - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); } EXPORT_SYMBOL(tag_pages_for_writeback); @@ -2467,13 +2468,13 @@ int __set_page_dirty_nobuffers(struct page *page) return 1; } - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); BUG_ON(page_mapping(page) != mapping); WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); account_page_dirtied(page, mapping); - radix_tree_tag_set(&mapping->page_tree, page_index(page), + 
radix_tree_tag_set(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); unlock_page_memcg(page); if (mapping->host) { @@ -2718,11 +2719,10 @@ int test_clear_page_writeback(struct page *page) struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); ret = TestClearPageWriteback(page); if (ret) { - radix_tree_tag_clear(&mapping->page_tree, - page_index(page), + radix_tree_tag_clear(&mapping->i_pages, page_index(page), PAGECACHE_TAG_WRITEBACK); if (bdi_cap_account_writeback(bdi)) { struct bdi_writeback *wb = inode_to_wb(inode); @@ -2736,7 +2736,7 @@ int test_clear_page_writeback(struct page *page) PAGECACHE_TAG_WRITEBACK)) sb_clear_inode_writeback(mapping->host); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); } else { ret = TestClearPageWriteback(page); } @@ -2766,7 +2766,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); ret = TestSetPageWriteback(page); if (!ret) { bool on_wblist; @@ -2774,8 +2774,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) on_wblist = mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK); - radix_tree_tag_set(&mapping->page_tree, - page_index(page), + radix_tree_tag_set(&mapping->i_pages, page_index(page), PAGECACHE_TAG_WRITEBACK); if (bdi_cap_account_writeback(bdi)) inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK); @@ -2789,14 +2788,12 @@ int __test_set_page_writeback(struct page *page, bool keep_write) sb_mark_inode_writeback(mapping->host); } if (!PageDirty(page)) - radix_tree_tag_clear(&mapping->page_tree, - page_index(page), + radix_tree_tag_clear(&mapping->i_pages, 
page_index(page), PAGECACHE_TAG_DIRTY); if (!keep_write) - radix_tree_tag_clear(&mapping->page_tree, - page_index(page), + radix_tree_tag_clear(&mapping->i_pages, page_index(page), PAGECACHE_TAG_TOWRITE); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); } else { ret = TestSetPageWriteback(page); } @@ -2816,7 +2813,7 @@ EXPORT_SYMBOL(__test_set_page_writeback); */ int mapping_tagged(struct address_space *mapping, int tag) { - return radix_tree_tagged(&mapping->page_tree, tag); + return radix_tree_tagged(&mapping->i_pages, tag); } EXPORT_SYMBOL(mapping_tagged); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0b97b8ece4a9..905db9d7962f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -46,7 +46,6 @@ #include <linux/stop_machine.h> #include <linux/sort.h> #include <linux/pfn.h> -#include <xen/xen.h> #include <linux/backing-dev.h> #include <linux/fault-inject.h> #include <linux/page-isolation.h> @@ -205,17 +204,18 @@ static void __free_pages_ok(struct page *page, unsigned int order); * TBD: should special case ZONE_DMA32 machines here - in those we normally * don't need any ZONE_NORMAL reservation */ -int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { +int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = { #ifdef CONFIG_ZONE_DMA - 256, + [ZONE_DMA] = 256, #endif #ifdef CONFIG_ZONE_DMA32 - 256, + [ZONE_DMA32] = 256, #endif + [ZONE_NORMAL] = 32, #ifdef CONFIG_HIGHMEM - 32, + [ZONE_HIGHMEM] = 0, #endif - 32, + [ZONE_MOVABLE] = 0, }; EXPORT_SYMBOL(totalram_pages); @@ -316,9 +316,6 @@ static inline bool update_defer_init(pg_data_t *pgdat, /* Always populate low zones for address-constrained allocations */ if (zone_end < pgdat_end_pfn(pgdat)) return true; - /* Xen PV domains need page structures early */ - if (xen_pv_domain()) - return true; (*nr_initialised)++; if ((*nr_initialised > pgdat->static_init_pgcnt) && (pfn & (PAGES_PER_SECTION - 1)) == 0) { @@ -1746,16 +1743,38 @@ void __init page_alloc_init_late(void) } 
#ifdef CONFIG_CMA +static void __init adjust_present_page_count(struct page *page, long count) +{ + struct zone *zone = page_zone(page); + + /* We don't need to hold a lock since it is boot-up process */ + zone->present_pages += count; +} + /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ void __init init_cma_reserved_pageblock(struct page *page) { unsigned i = pageblock_nr_pages; + unsigned long pfn = page_to_pfn(page); struct page *p = page; + int nid = page_to_nid(page); + + /* + * ZONE_MOVABLE will steal present pages from other zones by + * changing page links so page_zone() is changed. Before that, + * we need to adjust previous zone's page count first. + */ + adjust_present_page_count(page, -pageblock_nr_pages); do { __ClearPageReserved(p); set_page_count(p, 0); - } while (++p, --i); + + /* Steal pages from other zones */ + set_page_links(p, ZONE_MOVABLE, nid, pfn); + } while (++p, ++pfn, --i); + + adjust_present_page_count(page, pageblock_nr_pages); set_pageblock_migratetype(page, MIGRATE_CMA); @@ -2870,7 +2889,7 @@ int __isolate_free_page(struct page *page, unsigned int order) * exists. */ watermark = min_wmark_pages(zone) + (1UL << order); - if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) + if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) return 0; __mod_zone_freepage_state(zone, -(1UL << order), mt); @@ -3146,12 +3165,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, } -#ifdef CONFIG_CMA - /* If allocation can't use CMA areas don't use free CMA pages */ - if (!(alloc_flags & ALLOC_CMA)) - free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES); -#endif - /* * Check watermarks for an order-0 allocation request. 
If these * are not met, then a high-order request also cannot go ahead @@ -3178,10 +3191,8 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, } #ifdef CONFIG_CMA - if ((alloc_flags & ALLOC_CMA) && - !list_empty(&area->free_list[MIGRATE_CMA])) { + if (!list_empty(&area->free_list[MIGRATE_CMA])) return true; - } #endif if (alloc_harder && !list_empty(&area->free_list[MIGRATE_HIGHATOMIC])) @@ -3201,13 +3212,6 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx, unsigned int alloc_flags) { long free_pages = zone_page_state(z, NR_FREE_PAGES); - long cma_pages = 0; - -#ifdef CONFIG_CMA - /* If allocation can't use CMA areas don't use free CMA pages */ - if (!(alloc_flags & ALLOC_CMA)) - cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES); -#endif /* * Fast check for order-0 only. If this fails then the reserves @@ -3216,7 +3220,7 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, * the caller is !atomic then it'll uselessly search the free * list. That corner case is then slower but it is harmless. 
*/ - if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx]) + if (!order && free_pages > mark + z->lowmem_reserve[classzone_idx]) return true; return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, @@ -3852,10 +3856,6 @@ gfp_to_alloc_flags(gfp_t gfp_mask) } else if (unlikely(rt_task(current)) && !in_interrupt()) alloc_flags |= ALLOC_HARDER; -#ifdef CONFIG_CMA - if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) - alloc_flags |= ALLOC_CMA; -#endif return alloc_flags; } @@ -4322,9 +4322,6 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, if (should_fail_alloc_page(gfp_mask, order)) return false; - if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE) - *alloc_flags |= ALLOC_CMA; - return true; } @@ -4734,6 +4731,13 @@ long si_mem_available(void) min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low); + /* + * Part of the kernel memory, which can be released under memory + * pressure. + */ + available += global_node_page_state(NR_INDIRECTLY_RECLAIMABLE_BYTES) >> + PAGE_SHIFT; + if (available < 0) available = 0; return available; @@ -6200,6 +6204,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) { enum zone_type j; int nid = pgdat->node_id; + unsigned long node_end_pfn = 0; pgdat_resize_init(pgdat); #ifdef CONFIG_NUMA_BALANCING @@ -6227,9 +6232,13 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, freesize, memmap_pages; unsigned long zone_start_pfn = zone->zone_start_pfn; + unsigned long movable_size = 0; size = zone->spanned_pages; realsize = freesize = zone->present_pages; + if (zone_end_pfn(zone) > node_end_pfn) + node_end_pfn = zone_end_pfn(zone); + /* * Adjust freesize so that it accounts for how much memory @@ -6278,12 +6287,30 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) zone_seqlock_init(zone); 
zone_pcp_init(zone); - if (!size) + /* + * The size of the CMA area is unknown now so we need to + * prepare the memory for the usemap at maximum. + */ + if (IS_ENABLED(CONFIG_CMA) && j == ZONE_MOVABLE && + pgdat->node_spanned_pages) { + movable_size = node_end_pfn - pgdat->node_start_pfn; + } + + if (!size && !movable_size) continue; set_pageblock_order(); - setup_usemap(pgdat, zone, zone_start_pfn, size); - init_currently_empty_zone(zone, zone_start_pfn, size); + if (movable_size) { + zone->zone_start_pfn = pgdat->node_start_pfn; + zone->spanned_pages = movable_size; + setup_usemap(pgdat, zone, + pgdat->node_start_pfn, movable_size); + init_currently_empty_zone(zone, + pgdat->node_start_pfn, movable_size); + } else { + setup_usemap(pgdat, zone, zone_start_pfn, size); + init_currently_empty_zone(zone, zone_start_pfn, size); + } memmap_init(size, nid, j, zone_start_pfn); } } @@ -7125,13 +7152,15 @@ static void setup_per_zone_lowmem_reserve(void) struct zone *lower_zone; idx--; - - if (sysctl_lowmem_reserve_ratio[idx] < 1) - sysctl_lowmem_reserve_ratio[idx] = 1; - lower_zone = pgdat->node_zones + idx; - lower_zone->lowmem_reserve[j] = managed_pages / - sysctl_lowmem_reserve_ratio[idx]; + + if (sysctl_lowmem_reserve_ratio[idx] < 1) { + sysctl_lowmem_reserve_ratio[idx] = 0; + lower_zone->lowmem_reserve[j] = 0; + } else { + lower_zone->lowmem_reserve[j] = + managed_pages / sysctl_lowmem_reserve_ratio[idx]; + } managed_pages += lower_zone->managed_pages; } } @@ -7922,7 +7951,7 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages) } #endif -#ifdef CONFIG_MEMORY_HOTPLUG +#if defined CONFIG_MEMORY_HOTPLUG || defined CONFIG_CMA /* * The zone indicated has a new number of managed_pages; batch sizes and percpu * page high values need to be recalulated. 
diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 61dee77bb211..43e085608846 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -309,8 +309,7 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, return pfn < end_pfn ? -EBUSY : 0; } -struct page *alloc_migrate_target(struct page *page, unsigned long private, - int **resultp) +struct page *alloc_migrate_target(struct page *page, unsigned long private) { return new_page_nodemask(page, numa_node_id(), &node_states[N_MEMORY]); } diff --git a/mm/readahead.c b/mm/readahead.c index 4d57b4644f98..539bbb6c1fad 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -175,7 +175,7 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, break; rcu_read_lock(); - page = radix_tree_lookup(&mapping->page_tree, page_offset); + page = radix_tree_lookup(&mapping->i_pages, page_offset); rcu_read_unlock(); if (page && !radix_tree_exceptional_entry(page)) continue; diff --git a/mm/rmap.c b/mm/rmap.c index 9122787c4947..f0dd4e4565bc 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -32,11 +32,11 @@ * mmlist_lock (in mmput, drain_mmlist and others) * mapping->private_lock (in __set_page_dirty_buffers) * mem_cgroup_{begin,end}_page_stat (memcg->move_lock) - * mapping->tree_lock (widely used) + * i_pages lock (widely used) * inode->i_lock (in set_page_dirty's __mark_inode_dirty) * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) * sb_lock (within inode_lock in fs/fs-writeback.c) - * mapping->tree_lock (widely used, in set_page_dirty, + * i_pages lock (widely used, in set_page_dirty, * in arch-dependent flush_dcache_mmap_lock, * within bdi.wb->list_lock in __sync_single_inode) * diff --git a/mm/shmem.c b/mm/shmem.c index 4424fc0c33aa..9d6c7e595415 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -332,12 +332,12 @@ static int shmem_radix_tree_replace(struct address_space *mapping, VM_BUG_ON(!expected); VM_BUG_ON(!replacement); - item = __radix_tree_lookup(&mapping->page_tree, 
index, &node, &pslot); + item = __radix_tree_lookup(&mapping->i_pages, index, &node, &pslot); if (!item) return -ENOENT; if (item != expected) return -ENOENT; - __radix_tree_replace(&mapping->page_tree, node, pslot, + __radix_tree_replace(&mapping->i_pages, node, pslot, replacement, NULL); return 0; } @@ -355,7 +355,7 @@ static bool shmem_confirm_swap(struct address_space *mapping, void *item; rcu_read_lock(); - item = radix_tree_lookup(&mapping->page_tree, index); + item = radix_tree_lookup(&mapping->i_pages, index); rcu_read_unlock(); return item == swp_to_radix_entry(swap); } @@ -590,14 +590,14 @@ static int shmem_add_to_page_cache(struct page *page, page->mapping = mapping; page->index = index; - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); if (PageTransHuge(page)) { void __rcu **results; pgoff_t idx; int i; error = 0; - if (radix_tree_gang_lookup_slot(&mapping->page_tree, + if (radix_tree_gang_lookup_slot(&mapping->i_pages, &results, &idx, index, 1) && idx < index + HPAGE_PMD_NR) { error = -EEXIST; @@ -605,14 +605,14 @@ static int shmem_add_to_page_cache(struct page *page, if (!error) { for (i = 0; i < HPAGE_PMD_NR; i++) { - error = radix_tree_insert(&mapping->page_tree, + error = radix_tree_insert(&mapping->i_pages, index + i, page + i); VM_BUG_ON(error); } count_vm_event(THP_FILE_ALLOC); } } else if (!expected) { - error = radix_tree_insert(&mapping->page_tree, index, page); + error = radix_tree_insert(&mapping->i_pages, index, page); } else { error = shmem_radix_tree_replace(mapping, index, expected, page); @@ -624,10 +624,10 @@ static int shmem_add_to_page_cache(struct page *page, __inc_node_page_state(page, NR_SHMEM_THPS); __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); __mod_node_page_state(page_pgdat(page), NR_SHMEM, nr); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); } else { page->mapping = NULL; - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); 
page_ref_sub(page, nr); } return error; @@ -643,13 +643,13 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap) VM_BUG_ON_PAGE(PageCompound(page), page); - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); error = shmem_radix_tree_replace(mapping, page->index, page, radswap); page->mapping = NULL; mapping->nrpages--; __dec_node_page_state(page, NR_FILE_PAGES); __dec_node_page_state(page, NR_SHMEM); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); put_page(page); BUG_ON(error); } @@ -662,9 +662,9 @@ static int shmem_free_swap(struct address_space *mapping, { void *old; - spin_lock_irq(&mapping->tree_lock); - old = radix_tree_delete_item(&mapping->page_tree, index, radswap); - spin_unlock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); + old = radix_tree_delete_item(&mapping->i_pages, index, radswap); + xa_unlock_irq(&mapping->i_pages); if (old != radswap) return -ENOENT; free_swap_and_cache(radix_to_swp_entry(radswap)); @@ -675,7 +675,7 @@ static int shmem_free_swap(struct address_space *mapping, * Determine (in bytes) how many of the shmem object's pages mapped by the * given offsets are swapped out. * - * This is safe to call without i_mutex or mapping->tree_lock thanks to RCU, + * This is safe to call without i_mutex or the i_pages lock thanks to RCU, * as long as the inode doesn't go away and racy results are not a problem. */ unsigned long shmem_partial_swap_usage(struct address_space *mapping, @@ -688,7 +688,7 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { if (iter.index >= end) break; @@ -717,7 +717,7 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, * Determine (in bytes) how many of the shmem object's pages mapped by the * given vma is swapped out. 
* - * This is safe to call without i_mutex or mapping->tree_lock thanks to RCU, + * This is safe to call without i_mutex or the i_pages lock thanks to RCU, * as long as the inode doesn't go away and racy results are not a problem. */ unsigned long shmem_swap_usage(struct vm_area_struct *vma) @@ -1132,7 +1132,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, int error = 0; radswap = swp_to_radix_entry(swap); - index = find_swap_entry(&mapping->page_tree, radswap); + index = find_swap_entry(&mapping->i_pages, radswap); if (index == -1) return -EAGAIN; /* tell shmem_unuse we found nothing */ @@ -1448,7 +1448,7 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp, hindex = round_down(index, HPAGE_PMD_NR); rcu_read_lock(); - if (radix_tree_gang_lookup_slot(&mapping->page_tree, &results, &idx, + if (radix_tree_gang_lookup_slot(&mapping->i_pages, &results, &idx, hindex, 1) && idx < hindex + HPAGE_PMD_NR) { rcu_read_unlock(); return NULL; @@ -1561,14 +1561,14 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, * Our caller will very soon move newpage out of swapcache, but it's * a nice clean interface for us to replace oldpage by newpage there. 
*/ - spin_lock_irq(&swap_mapping->tree_lock); + xa_lock_irq(&swap_mapping->i_pages); error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, newpage); if (!error) { __inc_node_page_state(newpage, NR_FILE_PAGES); __dec_node_page_state(oldpage, NR_FILE_PAGES); } - spin_unlock_irq(&swap_mapping->tree_lock); + xa_unlock_irq(&swap_mapping->i_pages); if (unlikely(error)) { /* @@ -2634,7 +2634,7 @@ static void shmem_tag_pins(struct address_space *mapping) start = 0; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { page = radix_tree_deref_slot(slot); if (!page || radix_tree_exception(page)) { if (radix_tree_deref_retry(page)) { @@ -2642,10 +2642,10 @@ static void shmem_tag_pins(struct address_space *mapping) continue; } } else if (page_count(page) - page_mapcount(page) > 1) { - spin_lock_irq(&mapping->tree_lock); - radix_tree_tag_set(&mapping->page_tree, iter.index, + xa_lock_irq(&mapping->i_pages); + radix_tree_tag_set(&mapping->i_pages, iter.index, SHMEM_TAG_PINNED); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); } if (need_resched()) { @@ -2677,7 +2677,7 @@ static int shmem_wait_for_pins(struct address_space *mapping) error = 0; for (scan = 0; scan <= LAST_SCAN; scan++) { - if (!radix_tree_tagged(&mapping->page_tree, SHMEM_TAG_PINNED)) + if (!radix_tree_tagged(&mapping->i_pages, SHMEM_TAG_PINNED)) break; if (!scan) @@ -2687,7 +2687,7 @@ static int shmem_wait_for_pins(struct address_space *mapping) start = 0; rcu_read_lock(); - radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, + radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, SHMEM_TAG_PINNED) { page = radix_tree_deref_slot(slot); @@ -2713,10 +2713,10 @@ static int shmem_wait_for_pins(struct address_space *mapping) error = -EBUSY; } - spin_lock_irq(&mapping->tree_lock); - radix_tree_tag_clear(&mapping->page_tree, + xa_lock_irq(&mapping->i_pages); + 
radix_tree_tag_clear(&mapping->i_pages, iter.index, SHMEM_TAG_PINNED); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); continue_resched: if (need_resched()) { slot = radix_tree_iter_resume(slot, &iter); diff --git a/mm/slub.c b/mm/slub.c index 4fb037c98782..44aa7847324a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1363,10 +1363,8 @@ static __always_inline void kfree_hook(void *x) kasan_kfree_large(x, _RET_IP_); } -static __always_inline void *slab_free_hook(struct kmem_cache *s, void *x) +static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x) { - void *freeptr; - kmemleak_free_recursive(x, s->flags); /* @@ -1386,17 +1384,12 @@ static __always_inline void *slab_free_hook(struct kmem_cache *s, void *x) if (!(s->flags & SLAB_DEBUG_OBJECTS)) debug_check_no_obj_freed(x, s->object_size); - freeptr = get_freepointer(s, x); - /* - * kasan_slab_free() may put x into memory quarantine, delaying its - * reuse. In this case the object's freelist pointer is changed. - */ - kasan_slab_free(s, x, _RET_IP_); - return freeptr; + /* KASAN might put x into memory quarantine, delaying its reuse */ + return kasan_slab_free(s, x, _RET_IP_); } -static inline void slab_free_freelist_hook(struct kmem_cache *s, - void *head, void *tail) +static inline bool slab_free_freelist_hook(struct kmem_cache *s, + void **head, void **tail) { /* * Compiler cannot detect this function can be removed if slab_free_hook() @@ -1407,13 +1400,33 @@ static inline void slab_free_freelist_hook(struct kmem_cache *s, defined(CONFIG_DEBUG_OBJECTS_FREE) || \ defined(CONFIG_KASAN) - void *object = head; - void *tail_obj = tail ? : head; - void *freeptr; + void *object; + void *next = *head; + void *old_tail = *tail ? 
*tail : *head; + + /* Head and tail of the reconstructed freelist */ + *head = NULL; + *tail = NULL; do { - freeptr = slab_free_hook(s, object); - } while ((object != tail_obj) && (object = freeptr)); + object = next; + next = get_freepointer(s, object); + /* If object's reuse doesn't have to be delayed */ + if (!slab_free_hook(s, object)) { + /* Move object to the new freelist */ + set_freepointer(s, object, *head); + *head = object; + if (!*tail) + *tail = object; + } + } while (object != old_tail); + + if (*head == *tail) + *tail = NULL; + + return *head != NULL; +#else + return true; #endif } @@ -2968,14 +2981,12 @@ static __always_inline void slab_free(struct kmem_cache *s, struct page *page, void *head, void *tail, int cnt, unsigned long addr) { - slab_free_freelist_hook(s, head, tail); /* - * slab_free_freelist_hook() could have put the items into quarantine. - * If so, no need to free them. + * With KASAN enabled slab_free_freelist_hook modifies the freelist + * to remove objects, whose reuse must be delayed. 
*/ - if (s->flags & SLAB_KASAN && !(s->flags & SLAB_TYPESAFE_BY_RCU)) - return; - do_slab_free(s, page, head, tail, cnt, addr); + if (slab_free_freelist_hook(s, &head, &tail)) + do_slab_free(s, page, head, tail, cnt, addr); } #ifdef CONFIG_KASAN diff --git a/mm/swap_state.c b/mm/swap_state.c index f233dccd3b1b..07f9aa2340c3 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -124,10 +124,10 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry) SetPageSwapCache(page); address_space = swap_address_space(entry); - spin_lock_irq(&address_space->tree_lock); + xa_lock_irq(&address_space->i_pages); for (i = 0; i < nr; i++) { set_page_private(page + i, entry.val + i); - error = radix_tree_insert(&address_space->page_tree, + error = radix_tree_insert(&address_space->i_pages, idx + i, page + i); if (unlikely(error)) break; @@ -145,13 +145,13 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry) VM_BUG_ON(error == -EEXIST); set_page_private(page + i, 0UL); while (i--) { - radix_tree_delete(&address_space->page_tree, idx + i); + radix_tree_delete(&address_space->i_pages, idx + i); set_page_private(page + i, 0UL); } ClearPageSwapCache(page); page_ref_sub(page, nr); } - spin_unlock_irq(&address_space->tree_lock); + xa_unlock_irq(&address_space->i_pages); return error; } @@ -188,7 +188,7 @@ void __delete_from_swap_cache(struct page *page) address_space = swap_address_space(entry); idx = swp_offset(entry); for (i = 0; i < nr; i++) { - radix_tree_delete(&address_space->page_tree, idx + i); + radix_tree_delete(&address_space->i_pages, idx + i); set_page_private(page + i, 0); } ClearPageSwapCache(page); @@ -272,9 +272,9 @@ void delete_from_swap_cache(struct page *page) entry.val = page_private(page); address_space = swap_address_space(entry); - spin_lock_irq(&address_space->tree_lock); + xa_lock_irq(&address_space->i_pages); __delete_from_swap_cache(page); - spin_unlock_irq(&address_space->tree_lock); + xa_unlock_irq(&address_space->i_pages); 
put_swap_page(page, entry); page_ref_sub(page, hpage_nr_pages(page)); @@ -628,12 +628,11 @@ int init_swap_address_space(unsigned int type, unsigned long nr_pages) return -ENOMEM; for (i = 0; i < nr; i++) { space = spaces + i; - INIT_RADIX_TREE(&space->page_tree, GFP_ATOMIC|__GFP_NOWARN); + INIT_RADIX_TREE(&space->i_pages, GFP_ATOMIC|__GFP_NOWARN); atomic_set(&space->i_mmap_writable, 0); space->a_ops = &swap_aops; /* swap cache doesn't use writeback related tags */ mapping_set_no_writeback_tags(space); - spin_lock_init(&space->tree_lock); } nr_swapper_spaces[type] = nr; rcu_assign_pointer(swapper_spaces[type], spaces); diff --git a/mm/swapfile.c b/mm/swapfile.c index c7a33717d079..cc2cf04d9018 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -85,7 +85,7 @@ PLIST_HEAD(swap_active_head); * is held and the locking order requires swap_lock to be taken * before any swap_info_struct->lock. */ -struct plist_head *swap_avail_heads; +static struct plist_head *swap_avail_heads; static DEFINE_SPINLOCK(swap_avail_lock); struct swap_info_struct *swap_info[MAX_SWAPFILES]; @@ -2961,6 +2961,10 @@ static unsigned long read_swap_header(struct swap_info_struct *p, maxpages = swp_offset(pte_to_swp_entry( swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; last_page = swap_header->info.last_page; + if (!last_page) { + pr_warn("Empty swap-file\n"); + return 0; + } if (last_page > maxpages) { pr_warn("Truncating oversized swap area, only using %luk out of %luk\n", maxpages << (PAGE_SHIFT - 10), diff --git a/mm/truncate.c b/mm/truncate.c index c34e2fd4f583..1d2fb2dca96f 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -36,11 +36,11 @@ static inline void __clear_shadow_entry(struct address_space *mapping, struct radix_tree_node *node; void **slot; - if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot)) + if (!__radix_tree_lookup(&mapping->i_pages, index, &node, &slot)) return; if (*slot != entry) return; - __radix_tree_replace(&mapping->page_tree, node, slot, NULL, + 
__radix_tree_replace(&mapping->i_pages, node, slot, NULL, workingset_update_node); mapping->nrexceptional--; } @@ -48,9 +48,9 @@ static inline void __clear_shadow_entry(struct address_space *mapping, static void clear_shadow_entry(struct address_space *mapping, pgoff_t index, void *entry) { - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); __clear_shadow_entry(mapping, index, entry); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); } /* @@ -79,7 +79,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, dax = dax_mapping(mapping); lock = !dax && indices[j] < end; if (lock) - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); for (i = j; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; @@ -102,7 +102,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, } if (lock) - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); pvec->nr = j; } @@ -518,8 +518,8 @@ void truncate_inode_pages_final(struct address_space *mapping) * modification that does not see AS_EXITING is * completed before starting the final truncate. 
*/ - spin_lock_irq(&mapping->tree_lock); - spin_unlock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); + xa_unlock_irq(&mapping->i_pages); truncate_inode_pages(mapping, 0); } @@ -627,13 +627,13 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return 0; - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); if (PageDirty(page)) goto failed; BUG_ON(page_has_private(page)); __delete_from_page_cache(page, NULL); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); if (mapping->a_ops->freepage) mapping->a_ops->freepage(page); @@ -641,7 +641,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) put_page(page); /* pagecache ref */ return 1; failed: - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); return 0; } diff --git a/mm/util.c b/mm/util.c index 029fc2f3b395..1fc4fa7576f7 100644 --- a/mm/util.c +++ b/mm/util.c @@ -287,7 +287,7 @@ int vma_is_stack_for_current(struct vm_area_struct *vma) } #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) -void arch_pick_mmap_layout(struct mm_struct *mm) +void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = arch_get_unmapped_area; @@ -668,6 +668,13 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) free += global_node_page_state(NR_SLAB_RECLAIMABLE); /* + * Part of the kernel memory, which can be released + * under memory pressure. + */ + free += global_node_page_state( + NR_INDIRECTLY_RECLAIMABLE_BYTES) >> PAGE_SHIFT; + + /* * Leave reserved pages. The pages are not for anonymous pages. 
*/ if (free <= totalreserve_pages) diff --git a/mm/vmscan.c b/mm/vmscan.c index 4390a8d5be41..8b920ce3ae02 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -116,6 +116,16 @@ struct scan_control { /* Number of pages freed so far during a call to shrink_zones() */ unsigned long nr_reclaimed; + + struct { + unsigned int dirty; + unsigned int unqueued_dirty; + unsigned int congested; + unsigned int writeback; + unsigned int immediate; + unsigned int file_taken; + unsigned int taken; + } nr; }; #ifdef ARCH_HAS_PREFETCH @@ -190,6 +200,29 @@ static bool sane_reclaim(struct scan_control *sc) #endif return false; } + +static void set_memcg_congestion(pg_data_t *pgdat, + struct mem_cgroup *memcg, + bool congested) +{ + struct mem_cgroup_per_node *mn; + + if (!memcg) + return; + + mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id); + WRITE_ONCE(mn->congested, congested); +} + +static bool memcg_congested(pg_data_t *pgdat, + struct mem_cgroup *memcg) +{ + struct mem_cgroup_per_node *mn; + + mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id); + return READ_ONCE(mn->congested); + +} #else static bool global_reclaim(struct scan_control *sc) { @@ -200,6 +233,18 @@ static bool sane_reclaim(struct scan_control *sc) { return true; } + +static inline void set_memcg_congestion(struct pglist_data *pgdat, + struct mem_cgroup *memcg, bool congested) +{ +} + +static inline bool memcg_congested(struct pglist_data *pgdat, + struct mem_cgroup *memcg) +{ + return false; + +} #endif /* @@ -648,7 +693,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); /* * The non racy check for a busy page. * @@ -672,7 +717,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, * load is not satisfied before that of page->_refcount. 
* * Note that if SetPageDirty is always performed via set_page_dirty, - * and thus under tree_lock, then this ordering is not required. + * and thus under the i_pages lock, then this ordering is not required. */ if (unlikely(PageTransHuge(page)) && PageSwapCache(page)) refcount = 1 + HPAGE_PMD_NR; @@ -690,7 +735,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, swp_entry_t swap = { .val = page_private(page) }; mem_cgroup_swapout(page, swap); __delete_from_swap_cache(page); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); put_swap_page(page, swap); } else { void (*freepage)(struct page *); @@ -711,13 +756,13 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, * only page cache pages found in these are zero pages * covering holes, and because we don't want to mix DAX * exceptional entries and shadow exceptional entries in the - * same page_tree. + * same address_space. */ if (reclaimed && page_is_file_cache(page) && !mapping_exiting(mapping) && !dax_mapping(mapping)) shadow = workingset_eviction(mapping, page); __delete_from_page_cache(page, shadow); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); if (freepage != NULL) freepage(page); @@ -726,7 +771,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, return 1; cannot_free: - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); return 0; } @@ -857,17 +902,6 @@ static void page_check_dirty_writeback(struct page *page, mapping->a_ops->is_dirty_writeback(page, dirty, writeback); } -struct reclaim_stat { - unsigned nr_dirty; - unsigned nr_unqueued_dirty; - unsigned nr_congested; - unsigned nr_writeback; - unsigned nr_immediate; - unsigned nr_activate; - unsigned nr_ref_keep; - unsigned nr_unmap_fail; -}; - /* * shrink_page_list() returns the number of reclaimed pages */ @@ 
-926,7 +960,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); /* - * The number of dirty pages determines if a zone is marked + * The number of dirty pages determines if a node is marked * reclaim_congested which affects wait_iff_congested. kswapd * will stall and start writing pages if the tail of the LRU * is all dirty unqueued pages. @@ -1755,23 +1789,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, free_unref_page_list(&page_list); /* - * If reclaim is isolating dirty pages under writeback, it implies - * that the long-lived page allocation rate is exceeding the page - * laundering rate. Either the global limits are not being effective - * at throttling processes due to the page distribution throughout - * zones or there is heavy usage of a slow backing device. The - * only option is to throttle from reclaim context which is not ideal - * as there is no guarantee the dirtying process is throttled in the - * same way balance_dirty_pages() manages. - * - * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number - * of pages under pages flagged for immediate reclaim and stall if any - * are encountered in the nr_immediate check below. - */ - if (stat.nr_writeback && stat.nr_writeback == nr_taken) - set_bit(PGDAT_WRITEBACK, &pgdat->flags); - - /* * If dirty pages are scanned that are not queued for IO, it * implies that flushers are not doing their job. This can * happen when memory pressure pushes dirty pages to the end of @@ -1785,48 +1802,17 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, if (stat.nr_unqueued_dirty == nr_taken) wakeup_flusher_threads(WB_REASON_VMSCAN); - /* - * Legacy memcg will stall in page writeback so avoid forcibly - * stalling here. - */ - if (sane_reclaim(sc)) { - /* - * Tag a zone as congested if all the dirty pages scanned were - * backed by a congested BDI and wait_iff_congested will stall. 
- */ - if (stat.nr_dirty && stat.nr_dirty == stat.nr_congested) - set_bit(PGDAT_CONGESTED, &pgdat->flags); - - /* Allow kswapd to start writing pages during reclaim. */ - if (stat.nr_unqueued_dirty == nr_taken) - set_bit(PGDAT_DIRTY, &pgdat->flags); - - /* - * If kswapd scans pages marked marked for immediate - * reclaim and under writeback (nr_immediate), it implies - * that pages are cycling through the LRU faster than - * they are written so also forcibly stall. - */ - if (stat.nr_immediate && current_may_throttle()) - congestion_wait(BLK_RW_ASYNC, HZ/10); - } - - /* - * Stall direct reclaim for IO completions if underlying BDIs or zone - * is congested. Allow kswapd to continue until it starts encountering - * unqueued dirty pages or cycling through the LRU too quickly. - */ - if (!sc->hibernation_mode && !current_is_kswapd() && - current_may_throttle()) - wait_iff_congested(pgdat, BLK_RW_ASYNC, HZ/10); + sc->nr.dirty += stat.nr_dirty; + sc->nr.congested += stat.nr_congested; + sc->nr.unqueued_dirty += stat.nr_unqueued_dirty; + sc->nr.writeback += stat.nr_writeback; + sc->nr.immediate += stat.nr_immediate; + sc->nr.taken += nr_taken; + if (file) + sc->nr.file_taken += nr_taken; trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, - nr_scanned, nr_reclaimed, - stat.nr_dirty, stat.nr_writeback, - stat.nr_congested, stat.nr_immediate, - stat.nr_activate, stat.nr_ref_keep, - stat.nr_unmap_fail, - sc->priority, file); + nr_scanned, nr_reclaimed, &stat, sc->priority, file); return nr_reclaimed; } @@ -2507,6 +2493,12 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, return true; } +static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg) +{ + return test_bit(PGDAT_CONGESTED, &pgdat->flags) || + (memcg && memcg_congested(pgdat, memcg)); +} + static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) { struct reclaim_state *reclaim_state = current->reclaim_state; @@ -2522,6 +2514,8 @@ static bool shrink_node(pg_data_t 
*pgdat, struct scan_control *sc) unsigned long node_lru_pages = 0; struct mem_cgroup *memcg; + memset(&sc->nr, 0, sizeof(sc->nr)); + nr_reclaimed = sc->nr_reclaimed; nr_scanned = sc->nr_scanned; @@ -2536,7 +2530,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) sc->memcg_low_skipped = 1; continue; } - mem_cgroup_event(memcg, MEMCG_LOW); + memcg_memory_event(memcg, MEMCG_LOW); } reclaimed = sc->nr_reclaimed; @@ -2587,6 +2581,67 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) if (sc->nr_reclaimed - nr_reclaimed) reclaimable = true; + if (current_is_kswapd()) { + /* + * If reclaim is isolating dirty pages under writeback, + * it implies that the long-lived page allocation rate + * is exceeding the page laundering rate. Either the + * global limits are not being effective at throttling + * processes due to the page distribution throughout + * zones or there is heavy usage of a slow backing + * device. The only option is to throttle from reclaim + * context which is not ideal as there is no guarantee + * the dirtying process is throttled in the same way + * balance_dirty_pages() manages. + * + * Once a node is flagged PGDAT_WRITEBACK, kswapd will + * count the number of pages under pages flagged for + * immediate reclaim and stall if any are encountered + * in the nr_immediate check below. + */ + if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken) + set_bit(PGDAT_WRITEBACK, &pgdat->flags); + + /* + * Tag a node as congested if all the dirty pages + * scanned were backed by a congested BDI and + * wait_iff_congested will stall. 
+ */ + if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested) + set_bit(PGDAT_CONGESTED, &pgdat->flags); + + /* Allow kswapd to start writing pages during reclaim. */ + if (sc->nr.unqueued_dirty == sc->nr.file_taken) + set_bit(PGDAT_DIRTY, &pgdat->flags); + + /* + * If kswapd scans pages marked for immediate + * reclaim and under writeback (nr_immediate), it + * implies that pages are cycling through the LRU + * faster than they are written so also forcibly stall. + */ + if (sc->nr.immediate) + congestion_wait(BLK_RW_ASYNC, HZ/10); + } + + /* + * Legacy memcg will stall in page writeback so avoid forcibly + * stalling in wait_iff_congested(). + */ + if (!global_reclaim(sc) && sane_reclaim(sc) && + sc->nr.dirty && sc->nr.dirty == sc->nr.congested) + set_memcg_congestion(pgdat, root, true); + + /* + * Stall direct reclaim for IO completions if underlying BDIs + * and node is congested. Allow kswapd to continue until it + * starts encountering unqueued dirty pages or cycling through + * the LRU too quickly. + */ + if (!sc->hibernation_mode && !current_is_kswapd() && + current_may_throttle() && pgdat_memcg_congested(pgdat, root)) + wait_iff_congested(BLK_RW_ASYNC, HZ/10); + } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, sc->nr_scanned - nr_scanned, sc)); @@ -2802,6 +2857,7 @@ retry: continue; last_pgdat = zone->zone_pgdat; snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat); + set_memcg_congestion(last_pgdat, sc->target_mem_cgroup, false); } delayacct_freepages_end(); @@ -3808,7 +3864,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) { /* - * Free memory by calling shrink zone with increasing + * Free memory by calling shrink node with increasing * priorities until we have enough memory freed.
*/ do { diff --git a/mm/vmstat.c b/mm/vmstat.c index 33581be705f0..536332e988b8 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1161,6 +1161,7 @@ const char * const vmstat_text[] = { "nr_vmscan_immediate_reclaim", "nr_dirtied", "nr_written", + "nr_indirectly_reclaimable", /* enum writeback_stat_item counters */ "nr_dirty_threshold", diff --git a/mm/workingset.c b/mm/workingset.c index b7d616a3bbbe..40ee02c83978 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -202,7 +202,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, * @mapping: address space the page was backing * @page: the page being evicted * - * Returns a shadow entry to be stored in @mapping->page_tree in place + * Returns a shadow entry to be stored in @mapping->i_pages in place * of the evicted @page so that a later refault can be detected. */ void *workingset_eviction(struct address_space *mapping, struct page *page) @@ -348,7 +348,7 @@ void workingset_update_node(struct radix_tree_node *node) * * Avoid acquiring the list_lru lock when the nodes are * already where they should be. The list_empty() test is safe - * as node->private_list is protected by &mapping->tree_lock. + * as node->private_list is protected by the i_pages lock. */ if (node->count && node->count == node->exceptional) { if (list_empty(&node->private_list)) @@ -366,7 +366,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, unsigned long nodes; unsigned long cache; - /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ + /* list_lru lock nests inside the IRQ-safe i_pages lock */ local_irq_disable(); nodes = list_lru_shrink_count(&shadow_nodes, sc); local_irq_enable(); @@ -419,21 +419,21 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, /* * Page cache insertions and deletions synchroneously maintain - * the shadow node LRU under the mapping->tree_lock and the + * the shadow node LRU under the i_pages lock and the * lru_lock. 
Because the page cache tree is emptied before * the inode can be destroyed, holding the lru_lock pins any * address_space that has radix tree nodes on the LRU. * - * We can then safely transition to the mapping->tree_lock to + * We can then safely transition to the i_pages lock to * pin only the address_space of the particular node we want * to reclaim, take the node off-LRU, and drop the lru_lock. */ node = container_of(item, struct radix_tree_node, private_list); - mapping = container_of(node->root, struct address_space, page_tree); + mapping = container_of(node->root, struct address_space, i_pages); /* Coming from the list, invert the lock order */ - if (!spin_trylock(&mapping->tree_lock)) { + if (!xa_trylock(&mapping->i_pages)) { spin_unlock(lru_lock); ret = LRU_RETRY; goto out; @@ -468,11 +468,11 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, if (WARN_ON_ONCE(node->exceptional)) goto out_invalid; inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); - __radix_tree_delete_node(&mapping->page_tree, node, + __radix_tree_delete_node(&mapping->i_pages, node, workingset_lookup_update(mapping)); out_invalid: - spin_unlock(&mapping->tree_lock); + xa_unlock(&mapping->i_pages); ret = LRU_REMOVED_RETRY; out: local_irq_enable(); @@ -487,7 +487,7 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker, { unsigned long ret; - /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ + /* list_lru lock nests inside the IRQ-safe i_pages lock */ local_irq_disable(); ret = list_lru_shrink_walk(&shadow_nodes, sc, shadow_lru_isolate, NULL); local_irq_enable(); @@ -503,7 +503,7 @@ static struct shrinker workingset_shadow_shrinker = { /* * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe - * mapping->tree_lock. + * i_pages lock. 
*/ static struct lock_class_key shadow_nodes_key; diff --git a/mm/z3fold.c b/mm/z3fold.c index f579ad4a8100..c0bca6153b95 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -467,6 +467,8 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, spin_lock_init(&pool->lock); spin_lock_init(&pool->stale_lock); pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2); + if (!pool->unbuddied) + goto out_pool; for_each_possible_cpu(cpu) { struct list_head *unbuddied = per_cpu_ptr(pool->unbuddied, cpu); @@ -479,7 +481,7 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, pool->name = name; pool->compact_wq = create_singlethread_workqueue(pool->name); if (!pool->compact_wq) - goto out; + goto out_unbuddied; pool->release_wq = create_singlethread_workqueue(pool->name); if (!pool->release_wq) goto out_wq; @@ -489,8 +491,11 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, out_wq: destroy_workqueue(pool->compact_wq); -out: +out_unbuddied: + free_percpu(pool->unbuddied); +out_pool: kfree(pool); +out: return NULL; } @@ -533,7 +538,7 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, struct z3fold_header *zhdr = NULL; struct page *page = NULL; enum buddy bud; - bool can_sleep = (gfp & __GFP_RECLAIM) == __GFP_RECLAIM; + bool can_sleep = gfpflags_allow_blocking(gfp); if (!size || (gfp & __GFP_HIGHMEM)) return -EINVAL; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index a8772a978224..9c169bb2444d 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -781,8 +781,14 @@ static void ipgre_link_update(struct net_device *dev, bool set_mtu) tunnel->encap.type == TUNNEL_ENCAP_NONE) { dev->features |= NETIF_F_GSO_SOFTWARE; dev->hw_features |= NETIF_F_GSO_SOFTWARE; + } else { + dev->features &= ~NETIF_F_GSO_SOFTWARE; + dev->hw_features &= ~NETIF_F_GSO_SOFTWARE; } dev->features |= NETIF_F_LLTX; + } else { + dev->hw_features &= ~NETIF_F_GSO_SOFTWARE; + dev->features &= 
~(NETIF_F_LLTX | NETIF_F_GSO_SOFTWARE); } } diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 14b67dfacc4b..0fbd3ee26165 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -335,26 +335,6 @@ err_tlock: } EXPORT_SYMBOL_GPL(l2tp_session_register); -/* Lookup a tunnel by id - */ -struct l2tp_tunnel *l2tp_tunnel_find(const struct net *net, u32 tunnel_id) -{ - struct l2tp_tunnel *tunnel; - struct l2tp_net *pn = l2tp_pernet(net); - - rcu_read_lock_bh(); - list_for_each_entry_rcu(tunnel, &pn->l2tp_tunnel_list, list) { - if (tunnel->tunnel_id == tunnel_id) { - rcu_read_unlock_bh(); - return tunnel; - } - } - rcu_read_unlock_bh(); - - return NULL; -} -EXPORT_SYMBOL_GPL(l2tp_tunnel_find); - struct l2tp_tunnel *l2tp_tunnel_find_nth(const struct net *net, int nth) { struct l2tp_net *pn = l2tp_pernet(net); @@ -1436,74 +1416,11 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 { struct l2tp_tunnel *tunnel = NULL; int err; - struct socket *sock = NULL; - struct sock *sk = NULL; - struct l2tp_net *pn; enum l2tp_encap_type encap = L2TP_ENCAPTYPE_UDP; - /* Get the tunnel socket from the fd, which was opened by - * the userspace L2TP daemon. If not specified, create a - * kernel socket. 
- */ - if (fd < 0) { - err = l2tp_tunnel_sock_create(net, tunnel_id, peer_tunnel_id, - cfg, &sock); - if (err < 0) - goto err; - } else { - sock = sockfd_lookup(fd, &err); - if (!sock) { - pr_err("tunl %u: sockfd_lookup(fd=%d) returned %d\n", - tunnel_id, fd, err); - err = -EBADF; - goto err; - } - - /* Reject namespace mismatches */ - if (!net_eq(sock_net(sock->sk), net)) { - pr_err("tunl %u: netns mismatch\n", tunnel_id); - err = -EINVAL; - goto err; - } - } - - sk = sock->sk; - if (cfg != NULL) encap = cfg->encap; - /* Quick sanity checks */ - err = -EPROTONOSUPPORT; - if (sk->sk_type != SOCK_DGRAM) { - pr_debug("tunl %hu: fd %d wrong socket type\n", - tunnel_id, fd); - goto err; - } - switch (encap) { - case L2TP_ENCAPTYPE_UDP: - if (sk->sk_protocol != IPPROTO_UDP) { - pr_err("tunl %hu: fd %d wrong protocol, got %d, expected %d\n", - tunnel_id, fd, sk->sk_protocol, IPPROTO_UDP); - goto err; - } - break; - case L2TP_ENCAPTYPE_IP: - if (sk->sk_protocol != IPPROTO_L2TP) { - pr_err("tunl %hu: fd %d wrong protocol, got %d, expected %d\n", - tunnel_id, fd, sk->sk_protocol, IPPROTO_L2TP); - goto err; - } - break; - } - - /* Check if this socket has already been prepped */ - tunnel = l2tp_tunnel(sk); - if (tunnel != NULL) { - /* This socket has already been prepped */ - err = -EBUSY; - goto err; - } - tunnel = kzalloc(sizeof(struct l2tp_tunnel), GFP_KERNEL); if (tunnel == NULL) { err = -ENOMEM; @@ -1520,72 +1437,126 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 rwlock_init(&tunnel->hlist_lock); tunnel->acpt_newsess = true; - /* The net we belong to */ - tunnel->l2tp_net = net; - pn = l2tp_pernet(net); - if (cfg != NULL) tunnel->debug = cfg->debug; - /* Mark socket as an encapsulation socket. 
See net/ipv4/udp.c */ tunnel->encap = encap; - if (encap == L2TP_ENCAPTYPE_UDP) { - struct udp_tunnel_sock_cfg udp_cfg = { }; - - udp_cfg.sk_user_data = tunnel; - udp_cfg.encap_type = UDP_ENCAP_L2TPINUDP; - udp_cfg.encap_rcv = l2tp_udp_encap_recv; - udp_cfg.encap_destroy = l2tp_udp_encap_destroy; - - setup_udp_tunnel_sock(net, sock, &udp_cfg); - } else { - sk->sk_user_data = tunnel; - } - /* Bump the reference count. The tunnel context is deleted - * only when this drops to zero. A reference is also held on - * the tunnel socket to ensure that it is not released while - * the tunnel is extant. Must be done before sk_destruct is - * set. - */ refcount_set(&tunnel->ref_count, 1); - sock_hold(sk); - tunnel->sock = sk; tunnel->fd = fd; - /* Hook on the tunnel socket destructor so that we can cleanup - * if the tunnel socket goes away. - */ - tunnel->old_sk_destruct = sk->sk_destruct; - sk->sk_destruct = &l2tp_tunnel_destruct; - lockdep_set_class_and_name(&sk->sk_lock.slock, &l2tp_socket_class, "l2tp_sock"); - - sk->sk_allocation = GFP_ATOMIC; - /* Init delete workqueue struct */ INIT_WORK(&tunnel->del_work, l2tp_tunnel_del_work); - /* Add tunnel to our list */ INIT_LIST_HEAD(&tunnel->list); - spin_lock_bh(&pn->l2tp_tunnel_list_lock); - list_add_rcu(&tunnel->list, &pn->l2tp_tunnel_list); - spin_unlock_bh(&pn->l2tp_tunnel_list_lock); err = 0; err: if (tunnelp) *tunnelp = tunnel; - /* If tunnel's socket was created by the kernel, it doesn't - * have a file. 
- */ - if (sock && sock->file) - sockfd_put(sock); - return err; } EXPORT_SYMBOL_GPL(l2tp_tunnel_create); +static int l2tp_validate_socket(const struct sock *sk, const struct net *net, + enum l2tp_encap_type encap) +{ + if (!net_eq(sock_net(sk), net)) + return -EINVAL; + + if (sk->sk_type != SOCK_DGRAM) + return -EPROTONOSUPPORT; + + if ((encap == L2TP_ENCAPTYPE_UDP && sk->sk_protocol != IPPROTO_UDP) || + (encap == L2TP_ENCAPTYPE_IP && sk->sk_protocol != IPPROTO_L2TP)) + return -EPROTONOSUPPORT; + + if (sk->sk_user_data) + return -EBUSY; + + return 0; +} + +int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, + struct l2tp_tunnel_cfg *cfg) +{ + struct l2tp_tunnel *tunnel_walk; + struct l2tp_net *pn; + struct socket *sock; + struct sock *sk; + int ret; + + if (tunnel->fd < 0) { + ret = l2tp_tunnel_sock_create(net, tunnel->tunnel_id, + tunnel->peer_tunnel_id, cfg, + &sock); + if (ret < 0) + goto err; + } else { + sock = sockfd_lookup(tunnel->fd, &ret); + if (!sock) + goto err; + + ret = l2tp_validate_socket(sock->sk, net, tunnel->encap); + if (ret < 0) + goto err_sock; + } + + sk = sock->sk; + + sock_hold(sk); + tunnel->sock = sk; + tunnel->l2tp_net = net; + + pn = l2tp_pernet(net); + + spin_lock_bh(&pn->l2tp_tunnel_list_lock); + list_for_each_entry(tunnel_walk, &pn->l2tp_tunnel_list, list) { + if (tunnel_walk->tunnel_id == tunnel->tunnel_id) { + spin_unlock_bh(&pn->l2tp_tunnel_list_lock); + + ret = -EEXIST; + goto err_sock; + } + } + list_add_rcu(&tunnel->list, &pn->l2tp_tunnel_list); + spin_unlock_bh(&pn->l2tp_tunnel_list_lock); + + if (tunnel->encap == L2TP_ENCAPTYPE_UDP) { + struct udp_tunnel_sock_cfg udp_cfg = { + .sk_user_data = tunnel, + .encap_type = UDP_ENCAP_L2TPINUDP, + .encap_rcv = l2tp_udp_encap_recv, + .encap_destroy = l2tp_udp_encap_destroy, + }; + + setup_udp_tunnel_sock(net, sock, &udp_cfg); + } else { + sk->sk_user_data = tunnel; + } + + tunnel->old_sk_destruct = sk->sk_destruct; + sk->sk_destruct = &l2tp_tunnel_destruct; + 
lockdep_set_class_and_name(&sk->sk_lock.slock, &l2tp_socket_class, + "l2tp_sock"); + sk->sk_allocation = GFP_ATOMIC; + + if (tunnel->fd >= 0) + sockfd_put(sock); + + return 0; + +err_sock: + if (tunnel->fd < 0) + sock_release(sock); + else + sockfd_put(sock); +err: + return ret; +} +EXPORT_SYMBOL_GPL(l2tp_tunnel_register); + /* This function is used by the netlink TUNNEL_DELETE command. */ void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel) diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index 2718d0b284d0..ba33cbec71eb 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -220,12 +220,14 @@ struct l2tp_session *l2tp_session_get(const struct net *net, struct l2tp_session *l2tp_session_get_nth(struct l2tp_tunnel *tunnel, int nth); struct l2tp_session *l2tp_session_get_by_ifname(const struct net *net, const char *ifname); -struct l2tp_tunnel *l2tp_tunnel_find(const struct net *net, u32 tunnel_id); struct l2tp_tunnel *l2tp_tunnel_find_nth(const struct net *net, int nth); int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, struct l2tp_tunnel **tunnelp); +int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, + struct l2tp_tunnel_cfg *cfg); + void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel); void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel); struct l2tp_session *l2tp_session_create(int priv_size, diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index e7ea9c4b89ff..b05dbd9ffcb2 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -236,12 +236,6 @@ static int l2tp_nl_cmd_tunnel_create(struct sk_buff *skb, struct genl_info *info if (info->attrs[L2TP_ATTR_DEBUG]) cfg.debug = nla_get_u32(info->attrs[L2TP_ATTR_DEBUG]); - tunnel = l2tp_tunnel_find(net, tunnel_id); - if (tunnel != NULL) { - ret = -EEXIST; - goto out; - } - ret = -EINVAL; switch (cfg.encap) { case L2TP_ENCAPTYPE_UDP: @@ -251,9 +245,19 @@ static int 
l2tp_nl_cmd_tunnel_create(struct sk_buff *skb, struct genl_info *info break; } - if (ret >= 0) - ret = l2tp_tunnel_notify(&l2tp_nl_family, info, - tunnel, L2TP_CMD_TUNNEL_CREATE); + if (ret < 0) + goto out; + + l2tp_tunnel_inc_refcount(tunnel); + ret = l2tp_tunnel_register(tunnel, net, &cfg); + if (ret < 0) { + kfree(tunnel); + goto out; + } + ret = l2tp_tunnel_notify(&l2tp_nl_family, info, tunnel, + L2TP_CMD_TUNNEL_CREATE); + l2tp_tunnel_dec_refcount(tunnel); + out: return ret; } diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index d6deca11da19..896bbca9bdaa 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -698,6 +698,15 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, error = l2tp_tunnel_create(sock_net(sk), fd, ver, tunnel_id, peer_tunnel_id, &tcfg, &tunnel); if (error < 0) goto end; + + l2tp_tunnel_inc_refcount(tunnel); + error = l2tp_tunnel_register(tunnel, sock_net(sk), + &tcfg); + if (error < 0) { + kfree(tunnel); + goto end; + } + drop_tunnel = true; } } else { /* Error if we can't find the tunnel */ diff --git a/net/rds/send.c b/net/rds/send.c index acad04243b41..94c7f74909be 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -1017,10 +1017,15 @@ static int rds_send_mprds_hash(struct rds_sock *rs, struct rds_connection *conn) if (conn->c_npaths == 0 && hash != 0) { rds_send_ping(conn, 0); - if (conn->c_npaths == 0) { - wait_event_interruptible(conn->c_hs_waitq, - (conn->c_npaths != 0)); - } + /* The underlying connection is not up yet. Need to wait + * until it is up to be sure that the non-zero c_path can be + * used. 
But if we are interrupted, we have to use the zero + * c_path in case the connection ends up being non-MP capable. + */ + if (conn->c_npaths == 0) + if (wait_event_interruptible(conn->c_hs_waitq, + conn->c_npaths != 0)) + hash = 0; if (conn->c_npaths == 1) hash = 0; } diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 806395687bb6..c2266f387213 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1887,7 +1887,7 @@ call_connect_status(struct rpc_task *task) dprint_status(task); - trace_rpc_connect_status(task, status); + trace_rpc_connect_status(task); task->tk_status = 0; switch (status) { case -ECONNREFUSED: @@ -2014,6 +2014,9 @@ call_transmit_status(struct rpc_task *task) case -EPERM: if (RPC_IS_SOFTCONN(task)) { xprt_end_transmit(task); + if (!task->tk_msg.rpc_proc->p_proc) + trace_xprt_ping(task->tk_xprt, + task->tk_status); rpc_exit(task, task->tk_status); break; } @@ -2112,6 +2115,9 @@ call_status(struct rpc_task *task) struct rpc_rqst *req = task->tk_rqstp; int status; + if (!task->tk_msg.rpc_proc->p_proc) + trace_xprt_ping(task->tk_xprt, task->tk_status); + if (req->rq_reply_bytes_recvd > 0 && !req->rq_bytes_sent) task->tk_status = req->rq_reply_bytes_recvd; diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index d9db2eab3a8d..3fe5d60ab0e2 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -276,7 +276,7 @@ static void rpc_set_active(struct rpc_task *task) { rpc_task_set_debuginfo(task); set_bit(RPC_TASK_ACTIVE, &task->tk_runstate); - trace_rpc_task_begin(task->tk_client, task, NULL); + trace_rpc_task_begin(task, NULL); } /* @@ -291,7 +291,7 @@ static int rpc_complete_task(struct rpc_task *task) unsigned long flags; int ret; - trace_rpc_task_complete(task->tk_client, task, NULL); + trace_rpc_task_complete(task, NULL); spin_lock_irqsave(&wq->lock, flags); clear_bit(RPC_TASK_ACTIVE, &task->tk_runstate); @@ -358,7 +358,7 @@ static void __rpc_sleep_on_priority(struct rpc_wait_queue *q, dprintk("RPC: %5u sleep_on(queue \"%s\" time %lu)\n", 
task->tk_pid, rpc_qname(q), jiffies); - trace_rpc_task_sleep(task->tk_client, task, q); + trace_rpc_task_sleep(task, q); __rpc_add_wait_queue(q, task, queue_priority); @@ -428,7 +428,7 @@ static void __rpc_do_wake_up_task_on_wq(struct workqueue_struct *wq, return; } - trace_rpc_task_wakeup(task->tk_client, task, queue); + trace_rpc_task_wakeup(task, queue); __rpc_remove_wait_queue(queue, task); @@ -780,7 +780,7 @@ static void __rpc_execute(struct rpc_task *task) } if (!do_action) break; - trace_rpc_task_run_action(task->tk_client, task, do_action); + trace_rpc_task_run_action(task, do_action); do_action(task); /* diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c index 1e671333c3d5..f68aa46c9dd7 100644 --- a/net/sunrpc/stats.c +++ b/net/sunrpc/stats.c @@ -24,6 +24,8 @@ #include <linux/sunrpc/metrics.h> #include <linux/rcupdate.h> +#include <trace/events/sunrpc.h> + #include "netns.h" #define RPCDBG_FACILITY RPCDBG_MISC @@ -148,7 +150,7 @@ void rpc_count_iostats_metrics(const struct rpc_task *task, struct rpc_iostats *op_metrics) { struct rpc_rqst *req = task->tk_rqstp; - ktime_t delta, now; + ktime_t backlog, execute, now; if (!op_metrics || !req) return; @@ -164,16 +166,20 @@ void rpc_count_iostats_metrics(const struct rpc_task *task, op_metrics->om_bytes_sent += req->rq_xmit_bytes_sent; op_metrics->om_bytes_recv += req->rq_reply_bytes_recvd; + backlog = 0; if (ktime_to_ns(req->rq_xtime)) { - delta = ktime_sub(req->rq_xtime, task->tk_start); - op_metrics->om_queue = ktime_add(op_metrics->om_queue, delta); + backlog = ktime_sub(req->rq_xtime, task->tk_start); + op_metrics->om_queue = ktime_add(op_metrics->om_queue, backlog); } + op_metrics->om_rtt = ktime_add(op_metrics->om_rtt, req->rq_rtt); - delta = ktime_sub(now, task->tk_start); - op_metrics->om_execute = ktime_add(op_metrics->om_execute, delta); + execute = ktime_sub(now, task->tk_start); + op_metrics->om_execute = ktime_add(op_metrics->om_execute, execute); spin_unlock(&op_metrics->om_lock); + + 
trace_rpc_stats_latency(req->rq_task, backlog, req->rq_rtt, execute); } EXPORT_SYMBOL_GPL(rpc_count_iostats_metrics); diff --git a/net/sunrpc/sunrpc.h b/net/sunrpc/sunrpc.h index f2b7cb540e61..09a0315ea77b 100644 --- a/net/sunrpc/sunrpc.h +++ b/net/sunrpc/sunrpc.h @@ -37,12 +37,6 @@ struct rpc_buffer { char data[]; }; -static inline int rpc_reply_expected(struct rpc_task *task) -{ - return (task->tk_msg.rpc_proc != NULL) && - (task->tk_msg.rpc_proc->p_decode != NULL); -} - static inline int sock_is_loopback(struct sock *sk) { struct dst_entry *dst; diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index e34f4ee7f2b6..30afbd236656 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -1519,6 +1519,88 @@ out: EXPORT_SYMBOL_GPL(xdr_process_buf); /** + * xdr_stream_decode_opaque - Decode variable length opaque + * @xdr: pointer to xdr_stream + * @ptr: location to store opaque data + * @size: size of storage buffer @ptr + * + * Return values: + * On success, returns size of object stored in *@ptr + * %-EBADMSG on XDR buffer overflow + * %-EMSGSIZE on overflow of storage buffer @ptr + */ +ssize_t xdr_stream_decode_opaque(struct xdr_stream *xdr, void *ptr, size_t size) +{ + ssize_t ret; + void *p; + + ret = xdr_stream_decode_opaque_inline(xdr, &p, size); + if (ret <= 0) + return ret; + memcpy(ptr, p, ret); + return ret; +} +EXPORT_SYMBOL_GPL(xdr_stream_decode_opaque); + +/** + * xdr_stream_decode_opaque_dup - Decode and duplicate variable length opaque + * @xdr: pointer to xdr_stream + * @ptr: location to store pointer to opaque data + * @maxlen: maximum acceptable object size + * @gfp_flags: GFP mask to use + * + * Return values: + * On success, returns size of object stored in *@ptr + * %-EBADMSG on XDR buffer overflow + * %-EMSGSIZE if the size of the object would exceed @maxlen + * %-ENOMEM on memory allocation failure + */ +ssize_t xdr_stream_decode_opaque_dup(struct xdr_stream *xdr, void **ptr, + size_t maxlen, gfp_t gfp_flags) +{ + ssize_t ret; + void *p; + + ret 
= xdr_stream_decode_opaque_inline(xdr, &p, maxlen); + if (ret > 0) { + *ptr = kmemdup(p, ret, gfp_flags); + if (*ptr != NULL) + return ret; + ret = -ENOMEM; + } + *ptr = NULL; + return ret; +} +EXPORT_SYMBOL_GPL(xdr_stream_decode_opaque_dup); + +/** + * xdr_stream_decode_string - Decode variable length string + * @xdr: pointer to xdr_stream + * @str: location to store string + * @size: size of storage buffer @str + * + * Return values: + * On success, returns length of NUL-terminated string stored in *@str + * %-EBADMSG on XDR buffer overflow + * %-EMSGSIZE on overflow of storage buffer @str + */ +ssize_t xdr_stream_decode_string(struct xdr_stream *xdr, char *str, size_t size) +{ + ssize_t ret; + void *p; + + ret = xdr_stream_decode_opaque_inline(xdr, &p, size); + if (ret > 0) { + memcpy(str, p, ret); + str[ret] = '\0'; + return strlen(str); + } + *str = '\0'; + return ret; +} +EXPORT_SYMBOL_GPL(xdr_stream_decode_string); + +/** * xdr_stream_decode_string_dup - Decode and duplicate variable length string * @xdr: pointer to xdr_stream * @str: location to store pointer to string diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 8f0ad4f268da..70f005044f06 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -826,6 +826,7 @@ static void xprt_connect_status(struct rpc_task *task) * @xprt: transport on which the original request was transmitted * @xid: RPC XID of incoming reply * + * Caller holds xprt->recv_lock. 
*/ struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid) { @@ -834,6 +835,7 @@ struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid) list_for_each_entry(entry, &xprt->recv, rq_list) if (entry->rq_xid == xid) { trace_xprt_lookup_rqst(xprt, xid, 0); + entry->rq_rtt = ktime_sub(ktime_get(), entry->rq_xtime); return entry; } @@ -889,7 +891,13 @@ __must_hold(&req->rq_xprt->recv_lock) } } -static void xprt_update_rtt(struct rpc_task *task) +/** + * xprt_update_rtt - Update RPC RTT statistics + * @task: RPC request that recently completed + * + * Caller holds xprt->recv_lock. + */ +void xprt_update_rtt(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_rtt *rtt = task->tk_client->cl_rtt; @@ -902,13 +910,14 @@ static void xprt_update_rtt(struct rpc_task *task) rpc_set_timeo(rtt, timer, req->rq_ntrans - 1); } } +EXPORT_SYMBOL_GPL(xprt_update_rtt); /** * xprt_complete_rqst - called when reply processing is complete * @task: RPC request that recently completed * @copied: actual number of bytes received from the transport * - * Caller holds transport lock. + * Caller holds xprt->recv_lock. 
*/ void xprt_complete_rqst(struct rpc_task *task, int copied) { @@ -920,9 +929,6 @@ void xprt_complete_rqst(struct rpc_task *task, int copied) trace_xprt_complete_rqst(xprt, req->rq_xid, copied); xprt->stat.recvs++; - req->rq_rtt = ktime_sub(ktime_get(), req->rq_xtime); - if (xprt->ops->timer != NULL) - xprt_update_rtt(task); list_del_init(&req->rq_list); req->rq_private_buf.len = copied; @@ -1003,7 +1009,7 @@ void xprt_transmit(struct rpc_task *task) struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; unsigned int connect_cookie; - int status, numreqs; + int status; dprintk("RPC: %5u xprt_transmit(%u)\n", task->tk_pid, req->rq_slen); @@ -1027,7 +1033,6 @@ void xprt_transmit(struct rpc_task *task) return; connect_cookie = xprt->connect_cookie; - req->rq_xtime = ktime_get(); status = xprt->ops->send_request(task); trace_xprt_transmit(xprt, req->rq_xid, status); if (status != 0) { @@ -1042,9 +1047,6 @@ void xprt_transmit(struct rpc_task *task) xprt->ops->set_retrans_timeout(task); - numreqs = atomic_read(&xprt->num_reqs); - if (numreqs > xprt->stat.max_slots) - xprt->stat.max_slots = numreqs; xprt->stat.sends++; xprt->stat.req_u += xprt->stat.sends - xprt->stat.recvs; xprt->stat.bklog_u += xprt->backlog.qlen; @@ -1106,14 +1108,15 @@ static struct rpc_rqst *xprt_dynamic_alloc_slot(struct rpc_xprt *xprt) { struct rpc_rqst *req = ERR_PTR(-EAGAIN); - if (!atomic_add_unless(&xprt->num_reqs, 1, xprt->max_reqs)) + if (xprt->num_reqs >= xprt->max_reqs) goto out; + ++xprt->num_reqs; spin_unlock(&xprt->reserve_lock); req = kzalloc(sizeof(struct rpc_rqst), GFP_NOFS); spin_lock(&xprt->reserve_lock); if (req != NULL) goto out; - atomic_dec(&xprt->num_reqs); + --xprt->num_reqs; req = ERR_PTR(-ENOMEM); out: return req; @@ -1121,7 +1124,8 @@ out: static bool xprt_dynamic_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req) { - if (atomic_add_unless(&xprt->num_reqs, -1, xprt->min_reqs)) { + if (xprt->num_reqs > xprt->min_reqs) { + --xprt->num_reqs; 
kfree(req); return true; } @@ -1157,6 +1161,8 @@ void xprt_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) spin_unlock(&xprt->reserve_lock); return; out_init_req: + xprt->stat.max_slots = max_t(unsigned int, xprt->stat.max_slots, + xprt->num_reqs); task->tk_status = 0; task->tk_rqstp = req; xprt_request_init(task, xprt); @@ -1224,7 +1230,7 @@ struct rpc_xprt *xprt_alloc(struct net *net, size_t size, else xprt->max_reqs = num_prealloc; xprt->min_reqs = num_prealloc; - atomic_set(&xprt->num_reqs, num_prealloc); + xprt->num_reqs = num_prealloc; return xprt; diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index ed1a4a3065ee..47ebac949769 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -44,13 +44,6 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt, if (IS_ERR(req)) return PTR_ERR(req); - rb = rpcrdma_alloc_regbuf(RPCRDMA_HDRBUF_SIZE, - DMA_TO_DEVICE, GFP_KERNEL); - if (IS_ERR(rb)) - goto out_fail; - req->rl_rdmabuf = rb; - xdr_buf_init(&req->rl_hdrbuf, rb->rg_base, rdmab_length(rb)); - size = r_xprt->rx_data.inline_rsize; rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, GFP_KERNEL); if (IS_ERR(rb)) diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index d5f95bb39300..5cc68a824f45 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -191,7 +191,7 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, mr = rpcrdma_mr_get(r_xprt); if (!mr) - return ERR_PTR(-ENOBUFS); + return ERR_PTR(-EAGAIN); pageoff = offset_in_page(seg1->mr_offset); seg1->mr_offset -= pageoff; /* start of page */ @@ -251,6 +251,16 @@ out_maperr: return ERR_PTR(-EIO); } +/* Post Send WR containing the RPC Call message. 
+ */ +static int +fmr_op_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req) +{ + struct ib_send_wr *bad_wr; + + return ib_post_send(ia->ri_id->qp, &req->rl_sendctx->sc_wr, &bad_wr); +} + /* Invalidate all memory regions that were registered for "req". * * Sleeps until it is safe for the host CPU to access the @@ -305,6 +315,7 @@ out_reset: const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { .ro_map = fmr_op_map, + .ro_send = fmr_op_send, .ro_unmap_sync = fmr_op_unmap_sync, .ro_recover_mr = fmr_op_recover_mr, .ro_open = fmr_op_open, diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 90f688f19783..c5743a0960be 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -357,8 +357,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, struct rpcrdma_mr *mr; struct ib_mr *ibmr; struct ib_reg_wr *reg_wr; - struct ib_send_wr *bad_wr; - int rc, i, n; + int i, n; u8 key; mr = NULL; @@ -367,7 +366,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, rpcrdma_mr_defer_recovery(mr); mr = rpcrdma_mr_get(r_xprt); if (!mr) - return ERR_PTR(-ENOBUFS); + return ERR_PTR(-EAGAIN); } while (mr->frwr.fr_state != FRWR_IS_INVALID); frwr = &mr->frwr; frwr->fr_state = FRWR_IS_VALID; @@ -407,22 +406,12 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, ib_update_fast_reg_key(ibmr, ++key); reg_wr = &frwr->fr_regwr; - reg_wr->wr.next = NULL; - reg_wr->wr.opcode = IB_WR_REG_MR; - frwr->fr_cqe.done = frwr_wc_fastreg; - reg_wr->wr.wr_cqe = &frwr->fr_cqe; - reg_wr->wr.num_sge = 0; - reg_wr->wr.send_flags = 0; reg_wr->mr = ibmr; reg_wr->key = ibmr->rkey; reg_wr->access = writing ? 
IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : IB_ACCESS_REMOTE_READ; - rc = ib_post_send(ia->ri_id->qp, ®_wr->wr, &bad_wr); - if (rc) - goto out_senderr; - mr->mr_handle = ibmr->rkey; mr->mr_length = ibmr->length; mr->mr_offset = ibmr->iova; @@ -442,11 +431,40 @@ out_mapmr_err: frwr->fr_mr, n, mr->mr_nents); rpcrdma_mr_defer_recovery(mr); return ERR_PTR(-EIO); +} -out_senderr: - pr_err("rpcrdma: FRWR registration ib_post_send returned %i\n", rc); - rpcrdma_mr_defer_recovery(mr); - return ERR_PTR(-ENOTCONN); +/* Post Send WR containing the RPC Call message. + * + * For FRMR, chain any FastReg WRs to the Send WR. Only a + * single ib_post_send call is needed to register memory + * and then post the Send WR. + */ +static int +frwr_op_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req) +{ + struct ib_send_wr *post_wr, *bad_wr; + struct rpcrdma_mr *mr; + + post_wr = &req->rl_sendctx->sc_wr; + list_for_each_entry(mr, &req->rl_registered, mr_list) { + struct rpcrdma_frwr *frwr; + + frwr = &mr->frwr; + + frwr->fr_cqe.done = frwr_wc_fastreg; + frwr->fr_regwr.wr.next = post_wr; + frwr->fr_regwr.wr.wr_cqe = &frwr->fr_cqe; + frwr->fr_regwr.wr.num_sge = 0; + frwr->fr_regwr.wr.opcode = IB_WR_REG_MR; + frwr->fr_regwr.wr.send_flags = 0; + + post_wr = &frwr->fr_regwr.wr; + } + + /* If ib_post_send fails, the next ->send_request for + * @req will queue these MWs for recovery. 
+ */ + return ib_post_send(ia->ri_id->qp, post_wr, &bad_wr); } /* Handle a remotely invalidated mr on the @mrs list @@ -561,6 +579,7 @@ reset_mrs: const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { .ro_map = frwr_op_map, + .ro_send = frwr_op_send, .ro_reminv = frwr_op_reminv, .ro_unmap_sync = frwr_op_unmap_sync, .ro_recover_mr = frwr_op_recover_mr, diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index f0855a959a27..e8adad33d0bb 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -365,7 +365,7 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, false, &mr); if (IS_ERR(seg)) - return PTR_ERR(seg); + goto out_maperr; rpcrdma_mr_push(mr, &req->rl_registered); if (encode_read_segment(xdr, mr, pos) < 0) @@ -377,6 +377,11 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, } while (nsegs); return 0; + +out_maperr: + if (PTR_ERR(seg) == -EAGAIN) + xprt_wait_for_buffer_space(rqst->rq_task, NULL); + return PTR_ERR(seg); } /* Register and XDR encode the Write list. Supports encoding a list @@ -423,7 +428,7 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true, &mr); if (IS_ERR(seg)) - return PTR_ERR(seg); + goto out_maperr; rpcrdma_mr_push(mr, &req->rl_registered); if (encode_rdma_segment(xdr, mr) < 0) @@ -440,6 +445,11 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, *segcount = cpu_to_be32(nchunks); return 0; + +out_maperr: + if (PTR_ERR(seg) == -EAGAIN) + xprt_wait_for_buffer_space(rqst->rq_task, NULL); + return PTR_ERR(seg); } /* Register and XDR encode the Reply chunk. 
Supports encoding an array @@ -481,7 +491,7 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true, &mr); if (IS_ERR(seg)) - return PTR_ERR(seg); + goto out_maperr; rpcrdma_mr_push(mr, &req->rl_registered); if (encode_rdma_segment(xdr, mr) < 0) @@ -498,6 +508,11 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, *segcount = cpu_to_be32(nchunks); return 0; + +out_maperr: + if (PTR_ERR(seg) == -EAGAIN) + xprt_wait_for_buffer_space(rqst->rq_task, NULL); + return PTR_ERR(seg); } /** @@ -724,8 +739,8 @@ rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, * Returns: * %0 if the RPC was sent successfully, * %-ENOTCONN if the connection was lost, - * %-EAGAIN if not enough pages are available for on-demand reply buffer, - * %-ENOBUFS if no MRs are available to register chunks, + * %-EAGAIN if the caller should call again with the same arguments, + * %-ENOBUFS if the caller should call again after a delay, * %-EMSGSIZE if the transport header is too small, * %-EIO if a permanent problem occurred while marshaling. 
*/ @@ -868,10 +883,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) return 0; out_err: - if (ret != -ENOBUFS) { - pr_err("rpcrdma: header marshaling failed (%d)\n", ret); - r_xprt->rx_stats.failed_marshal_count++; - } + r_xprt->rx_stats.failed_marshal_count++; return ret; } @@ -1366,7 +1378,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) trace_xprtrdma_reply(rqst->rq_task, rep, req, credits); - queue_work_on(req->rl_cpu, rpcrdma_receive_wq, &rep->rr_work); + queue_work(rpcrdma_receive_wq, &rep->rr_work); return; out_badstatus: diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 4b1ecfe979cf..cc1aad325496 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -52,7 +52,6 @@ #include <linux/slab.h> #include <linux/seq_file.h> #include <linux/sunrpc/addr.h> -#include <linux/smp.h> #include "xprt_rdma.h" @@ -237,8 +236,6 @@ rpcrdma_connect_worker(struct work_struct *work) struct rpc_xprt *xprt = &r_xprt->rx_xprt; spin_lock_bh(&xprt->transport_lock); - if (++xprt->connect_cookie == 0) /* maintain a reserved value */ - ++xprt->connect_cookie; if (ep->rep_connected > 0) { if (!xprt_test_and_set_connected(xprt)) xprt_wake_pending_tasks(xprt, 0); @@ -540,29 +537,6 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) } } -/* Allocate a fixed-size buffer in which to construct and send the - * RPC-over-RDMA header for this request. 
- */ -static bool -rpcrdma_get_rdmabuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, - gfp_t flags) -{ - size_t size = RPCRDMA_HDRBUF_SIZE; - struct rpcrdma_regbuf *rb; - - if (req->rl_rdmabuf) - return true; - - rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, flags); - if (IS_ERR(rb)) - return false; - - r_xprt->rx_stats.hardway_register_count += size; - req->rl_rdmabuf = rb; - xdr_buf_init(&req->rl_hdrbuf, rb->rg_base, rdmab_length(rb)); - return true; -} - static bool rpcrdma_get_sendbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, size_t size, gfp_t flags) @@ -644,15 +618,11 @@ xprt_rdma_allocate(struct rpc_task *task) if (RPC_IS_SWAPPER(task)) flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN; - if (!rpcrdma_get_rdmabuf(r_xprt, req, flags)) - goto out_fail; if (!rpcrdma_get_sendbuf(r_xprt, req, rqst->rq_callsize, flags)) goto out_fail; if (!rpcrdma_get_recvbuf(r_xprt, req, rqst->rq_rcvsize, flags)) goto out_fail; - req->rl_cpu = smp_processor_id(); - req->rl_connect_cookie = 0; /* our reserved value */ rpcrdma_set_xprtdata(rqst, req); rqst->rq_buffer = req->rl_sendbuf->rg_base; rqst->rq_rbuffer = req->rl_recvbuf->rg_base; @@ -694,7 +664,8 @@ xprt_rdma_free(struct rpc_task *task) * Returns: * %0 if the RPC message has been sent * %-ENOTCONN if the caller should reconnect and call again - * %-ENOBUFS if the caller should call again later + * %-EAGAIN if the caller should call again + * %-ENOBUFS if the caller should call again after a delay * %-EIO if a permanent error occurred and the request was not * sent. Do not try to send this message again. 
*/ @@ -723,9 +694,9 @@ xprt_rdma_send_request(struct rpc_task *task) rpcrdma_recv_buffer_get(req); /* Must suppress retransmit to maintain credits */ - if (req->rl_connect_cookie == xprt->connect_cookie) + if (rqst->rq_connect_cookie == xprt->connect_cookie) goto drop_connection; - req->rl_connect_cookie = xprt->connect_cookie; + rqst->rq_xtime = ktime_get(); __set_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags); if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) @@ -733,6 +704,12 @@ xprt_rdma_send_request(struct rpc_task *task) rqst->rq_xmit_bytes_sent += rqst->rq_snd_buf.len; rqst->rq_bytes_sent = 0; + + /* An RPC with no reply will throw off credit accounting, + * so drop the connection to reset the credit grant. + */ + if (!rpc_reply_expected(task)) + goto drop_connection; return 0; failed_marshal: diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index e6f84a6434a0..fe5eaca2d197 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -250,11 +250,11 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) wait_for_completion(&ia->ri_remove_done); ia->ri_id = NULL; - ia->ri_pd = NULL; ia->ri_device = NULL; /* Return 1 to ensure the core destroys the id. */ return 1; case RDMA_CM_EVENT_ESTABLISHED: + ++xprt->rx_xprt.connect_cookie; connstate = 1; rpcrdma_update_connect_private(xprt, &event->param.conn); goto connected; @@ -273,6 +273,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) connstate = -EAGAIN; goto connected; case RDMA_CM_EVENT_DISCONNECTED: + ++xprt->rx_xprt.connect_cookie; connstate = -ECONNABORTED; connected: xprt->rx_buf.rb_credits = 1; @@ -445,7 +446,9 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia) ia->ri_id->qp = NULL; } ib_free_cq(ep->rep_attr.recv_cq); + ep->rep_attr.recv_cq = NULL; ib_free_cq(ep->rep_attr.send_cq); + ep->rep_attr.send_cq = NULL; /* The ULP is responsible for ensuring all DMA * mappings and MRs are gone. 
@@ -458,6 +461,8 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia) rpcrdma_dma_unmap_regbuf(req->rl_recvbuf); } rpcrdma_mrs_destroy(buf); + ib_dealloc_pd(ia->ri_pd); + ia->ri_pd = NULL; /* Allow waiters to continue */ complete(&ia->ri_remove_done); @@ -589,11 +594,8 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, /* Client offers RDMA Read but does not initiate */ ep->rep_remote_cma.initiator_depth = 0; - if (ia->ri_device->attrs.max_qp_rd_atom > 32) /* arbitrary but <= 255 */ - ep->rep_remote_cma.responder_resources = 32; - else - ep->rep_remote_cma.responder_resources = - ia->ri_device->attrs.max_qp_rd_atom; + ep->rep_remote_cma.responder_resources = + min_t(int, U8_MAX, ia->ri_device->attrs.max_qp_rd_atom); /* Limit transport retries so client can detect server * GID changes quickly. RPC layer handles re-establishing @@ -628,14 +630,16 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { cancel_delayed_work_sync(&ep->rep_connect_worker); - if (ia->ri_id->qp) { + if (ia->ri_id && ia->ri_id->qp) { rpcrdma_ep_disconnect(ep, ia); rdma_destroy_qp(ia->ri_id); ia->ri_id->qp = NULL; } - ib_free_cq(ep->rep_attr.recv_cq); - ib_free_cq(ep->rep_attr.send_cq); + if (ep->rep_attr.recv_cq) + ib_free_cq(ep->rep_attr.recv_cq); + if (ep->rep_attr.send_cq) + ib_free_cq(ep->rep_attr.send_cq); } /* Re-establish a connection after a device removal event. 
@@ -1024,7 +1028,7 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) LIST_HEAD(free); LIST_HEAD(all); - for (count = 0; count < 32; count++) { + for (count = 0; count < 3; count++) { struct rpcrdma_mr *mr; int rc; @@ -1049,8 +1053,9 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) list_splice(&all, &buf->rb_all); r_xprt->rx_stats.mrs_allocated += count; spin_unlock(&buf->rb_mrlock); - trace_xprtrdma_createmrs(r_xprt, count); + + xprt_write_space(&r_xprt->rx_xprt); } static void @@ -1068,17 +1073,27 @@ struct rpcrdma_req * rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buffer = &r_xprt->rx_buf; + struct rpcrdma_regbuf *rb; struct rpcrdma_req *req; req = kzalloc(sizeof(*req), GFP_KERNEL); if (req == NULL) return ERR_PTR(-ENOMEM); + rb = rpcrdma_alloc_regbuf(RPCRDMA_HDRBUF_SIZE, + DMA_TO_DEVICE, GFP_KERNEL); + if (IS_ERR(rb)) { + kfree(req); + return ERR_PTR(-ENOMEM); + } + req->rl_rdmabuf = rb; + xdr_buf_init(&req->rl_hdrbuf, rb->rg_base, rdmab_length(rb)); + req->rl_buffer = buffer; + INIT_LIST_HEAD(&req->rl_registered); + spin_lock(&buffer->rb_reqslock); list_add(&req->rl_all, &buffer->rb_allreqs); spin_unlock(&buffer->rb_reqslock); - req->rl_buffer = &r_xprt->rx_buf; - INIT_LIST_HEAD(&req->rl_registered); return req; } @@ -1535,7 +1550,6 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, struct rpcrdma_req *req) { struct ib_send_wr *send_wr = &req->rl_sendctx->sc_wr; - struct ib_send_wr *send_wr_fail; int rc; if (req->rl_reply) { @@ -1554,7 +1568,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, --ep->rep_send_count; } - rc = ib_post_send(ia->ri_id->qp, send_wr, &send_wr_fail); + rc = ia->ri_ops->ro_send(ia, req); trace_xprtrdma_post_send(req, rc); if (rc) return -ENOTCONN; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 69883a960a3f..3d3b423fa9c1 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -334,8 +334,6 @@ enum { struct rpcrdma_buffer; struct rpcrdma_req { struct 
list_head rl_list; - int rl_cpu; - unsigned int rl_connect_cookie; struct rpcrdma_buffer *rl_buffer; struct rpcrdma_rep *rl_reply; struct xdr_stream rl_stream; @@ -474,6 +472,8 @@ struct rpcrdma_memreg_ops { (*ro_map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool, struct rpcrdma_mr **); + int (*ro_send)(struct rpcrdma_ia *ia, + struct rpcrdma_req *req); void (*ro_reminv)(struct rpcrdma_rep *rep, struct list_head *mrs); void (*ro_unmap_sync)(struct rpcrdma_xprt *, diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 956e29c1438d..c8902f11efdd 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -527,6 +527,7 @@ static int xs_local_send_request(struct rpc_task *task) xs_pktdump("packet data:", req->rq_svec->iov_base, req->rq_svec->iov_len); + req->rq_xtime = ktime_get(); status = xs_sendpages(transport->sock, NULL, 0, xdr, req->rq_bytes_sent, true, &sent); dprintk("RPC: %s(%u) = %d\n", @@ -589,6 +590,7 @@ static int xs_udp_send_request(struct rpc_task *task) if (!xprt_bound(xprt)) return -ENOTCONN; + req->rq_xtime = ktime_get(); status = xs_sendpages(transport->sock, xs_addr(xprt), xprt->addrlen, xdr, req->rq_bytes_sent, true, &sent); @@ -678,6 +680,7 @@ static int xs_tcp_send_request(struct rpc_task *task) /* Continue transmitting the packet/record. We must be careful * to cope with writespace callbacks arriving _after_ we have * called sendmsg(). 
*/ + req->rq_xtime = ktime_get(); while (1) { sent = 0; status = xs_sendpages(transport->sock, NULL, 0, xdr, @@ -1060,6 +1063,7 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt, if (!rovr) goto out_unlock; xprt_pin_rqst(rovr); + xprt_update_rtt(rovr->rq_task); spin_unlock(&xprt->recv_lock); task = rovr->rq_task; diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 764ffd1bb1c5..e16d6713f236 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -791,7 +791,8 @@ our $FuncArg = qr{$Typecast{0,1}($LvalOrFunc|$Constant|$String)}; our $declaration_macros = qr{(?x: (?:$Storage\s+)?(?:[A-Z_][A-Z0-9]*_){0,2}(?:DEFINE|DECLARE)(?:_[A-Z0-9]+){1,6}\s*\(| (?:$Storage\s+)?[HLP]?LIST_HEAD\s*\(| - (?:$Storage\s+)?${Type}\s+uninitialized_var\s*\( + (?:$Storage\s+)?${Type}\s+uninitialized_var\s*\(| + (?:SKCIPHER_REQUEST|SHASH_DESC|AHASH_REQUEST)_ON_STACK\s*\( )}; sub deparenthesize { @@ -1075,7 +1076,7 @@ sub parse_email { } elsif ($formatted_email =~ /(\S+\@\S+)(.*)$/) { $address = $1; $comment = $2 if defined $2; - $formatted_email =~ s/$address.*$//; + $formatted_email =~ s/\Q$address\E.*$//; $name = $formatted_email; $name = trim($name); $name =~ s/^\"|\"$//g; @@ -1217,7 +1218,7 @@ sub sanitise_line { for ($off = 1; $off < length($line); $off++) { $c = substr($line, $off, 1); - # Comments we are wacking completly including the begin + # Comments we are whacking completely including the begin # and end, all to $;. 
if ($sanitise_quote eq '' && substr($line, $off, 2) eq '/*') { $sanitise_quote = '*/'; @@ -1297,6 +1298,7 @@ sub sanitise_line { sub get_quoted_string { my ($line, $rawline) = @_; + return "" if (!defined($line) || !defined($rawline)); return "" if ($line !~ m/($String)/g); return substr($rawline, $-[0], $+[0] - $-[0]); } @@ -1644,6 +1646,28 @@ sub raw_line { return $line; } +sub get_stat_real { + my ($linenr, $lc) = @_; + + my $stat_real = raw_line($linenr, 0); + for (my $count = $linenr + 1; $count <= $lc; $count++) { + $stat_real = $stat_real . "\n" . raw_line($count, 0); + } + + return $stat_real; +} + +sub get_stat_here { + my ($linenr, $cnt, $here) = @_; + + my $herectx = $here . "\n"; + for (my $n = 0; $n < $cnt; $n++) { + $herectx .= raw_line($linenr, $n) . "\n"; + } + + return $herectx; +} + sub cat_vet { my ($vet) = @_; my ($res, $coded); @@ -2257,6 +2281,8 @@ sub process { my $camelcase_file_seeded = 0; + my $checklicenseline = 1; + sanitise_line_reset(); my $line; foreach my $rawline (@rawlines) { @@ -2448,6 +2474,7 @@ sub process { } else { $check = $check_orig; } + $checklicenseline = 1; next; } @@ -2911,6 +2938,30 @@ sub process { } } +# check for using SPDX license tag at beginning of files + if ($realline == $checklicenseline) { + if ($rawline =~ /^[ \+]\s*\#\!\s*\//) { + $checklicenseline = 2; + } elsif ($rawline =~ /^\+/) { + my $comment = ""; + if ($realfile =~ /\.(h|s|S)$/) { + $comment = '/*'; + } elsif ($realfile =~ /\.(c|dts|dtsi)$/) { + $comment = '//'; + } elsif (($checklicenseline == 2) || $realfile =~ /\.(sh|pl|py|awk|tc)$/) { + $comment = '#'; + } elsif ($realfile =~ /\.rst$/) { + $comment = '..'; + } + + if ($comment !~ /^$/ && + $rawline !~ /^\+\Q$comment\E SPDX-License-Identifier: /) { + WARN("SPDX_LICENSE_TAG", + "Missing or malformed SPDX-License-Identifier tag in line $checklicenseline\n" . 
$herecurr); + } + } + } + # check we are in a valid source file if not then ignore this hunk next if ($realfile !~ /\.(h|c|s|S|sh|dtsi|dts)$/); @@ -3011,6 +3062,12 @@ sub process { } } +# check for assignments on the start of a line + if ($sline =~ /^\+\s+($Assignment)[^=]/) { + CHK("ASSIGNMENT_CONTINUATIONS", + "Assignment operator '$1' should be on the previous line\n" . $hereprev); + } + # check for && or || at the start of a line if ($rawline =~ /^\+\s*(&&|\|\|)/) { CHK("LOGICAL_CONTINUATIONS", @@ -4032,7 +4089,7 @@ sub process { my ($where, $prefix) = ($-[1], $1); if ($prefix !~ /$Type\s+$/ && ($where != 0 || $prefix !~ /^.\s+$/) && - $prefix !~ /[{,]\s+$/) { + $prefix !~ /[{,:]\s+$/) { if (ERROR("BRACKET_SPACE", "space prohibited before open square bracket '['\n" . $herecurr) && $fix) { @@ -4928,12 +4985,8 @@ sub process { #print "REST<$rest> dstat<$dstat> ctx<$ctx>\n"; $ctx =~ s/\n*$//; - my $herectx = $here . "\n"; my $stmt_cnt = statement_rawlines($ctx); - - for (my $n = 0; $n < $stmt_cnt; $n++) { - $herectx .= raw_line($linenr, $n) . "\n"; - } + my $herectx = get_stat_here($linenr, $stmt_cnt, $here); if ($dstat ne '' && $dstat !~ /^(?:$Ident|-?$Constant),$/ && # 10, // foo(), @@ -5005,12 +5058,9 @@ sub process { # check for macros with flow control, but without ## concatenation # ## concatenation is commonly a macro that defines a function so ignore those if ($has_flow_statement && !$has_arg_concat) { - my $herectx = $here . "\n"; my $cnt = statement_rawlines($ctx); + my $herectx = get_stat_here($linenr, $cnt, $here); - for (my $n = 0; $n < $cnt; $n++) { - $herectx .= raw_line($linenr, $n) . "\n"; - } WARN("MACRO_WITH_FLOW_CONTROL", "Macros with flow control statements should be avoided\n" . "$herectx"); } @@ -5050,11 +5100,7 @@ sub process { $ctx =~ s/\n*$//; my $cnt = statement_rawlines($ctx); - my $herectx = $here . "\n"; - - for (my $n = 0; $n < $cnt; $n++) { - $herectx .= raw_line($linenr, $n) . 
"\n"; - } + my $herectx = get_stat_here($linenr, $cnt, $here); if (($stmts =~ tr/;/;/) == 1 && $stmts !~ /^\s*(if|while|for|switch)\b/) { @@ -5068,11 +5114,7 @@ sub process { } elsif ($dstat =~ /^\+\s*#\s*define\s+$Ident.*;\s*$/) { $ctx =~ s/\n*$//; my $cnt = statement_rawlines($ctx); - my $herectx = $here . "\n"; - - for (my $n = 0; $n < $cnt; $n++) { - $herectx .= raw_line($linenr, $n) . "\n"; - } + my $herectx = get_stat_here($linenr, $cnt, $here); WARN("TRAILING_SEMICOLON", "macros should not use a trailing semicolon\n" . "$herectx"); @@ -5195,12 +5237,8 @@ sub process { } } if ($level == 0 && $block =~ /^\s*\{/ && !$allowed) { - my $herectx = $here . "\n"; my $cnt = statement_rawlines($block); - - for (my $n = 0; $n < $cnt; $n++) { - $herectx .= raw_line($linenr, $n) . "\n"; - } + my $herectx = get_stat_here($linenr, $cnt, $here); WARN("BRACES", "braces {} are not necessary for single statement blocks\n" . $herectx); @@ -5776,36 +5814,50 @@ sub process { } } - # check for vsprintf extension %p<foo> misuses +# check for vsprintf extension %p<foo> misuses if ($^V && $^V ge 5.10.0 && defined $stat && $stat =~ /^\+(?![^\{]*\{\s*).*\b(\w+)\s*\(.*$String\s*,/s && $1 !~ /^_*volatile_*$/) { - my $bad_extension = ""; + my $specifier; + my $extension; + my $bad_specifier = ""; + my $stat_real; + my $lc = $stat =~ tr@\n@@; $lc = $lc + $linenr; for (my $count = $linenr; $count <= $lc; $count++) { my $fmt = get_quoted_string($lines[$count - 1], raw_line($count, 0)); $fmt =~ s/%%//g; - if ($fmt =~ /(\%[\*\d\.]*p(?![\WSsBKRraEhMmIiUDdgVCbGNOx]).)/) { - $bad_extension = $1; - last; - } - } - if ($bad_extension ne "") { - my $stat_real = raw_line($linenr, 0); - my $ext_type = "Invalid"; - my $use = ""; - for (my $count = $linenr + 1; $count <= $lc; $count++) { - $stat_real = $stat_real . "\n" . 
raw_line($count, 0); + + while ($fmt =~ /(\%[\*\d\.]*p(\w))/g) { + $specifier = $1; + $extension = $2; + if ($extension !~ /[SsBKRraEhMmIiUDdgVCbGNOx]/) { + $bad_specifier = $specifier; + last; + } + if ($extension eq "x" && !defined($stat_real)) { + if (!defined($stat_real)) { + $stat_real = get_stat_real($linenr, $lc); + } + WARN("VSPRINTF_SPECIFIER_PX", + "Using vsprintf specifier '\%px' potentially exposes the kernel memory layout, if you don't really need the address please consider using '\%p'.\n" . "$here\n$stat_real\n"); + } } - if ($bad_extension =~ /p[Ff]/) { - $ext_type = "Deprecated"; - $use = " - use %pS instead"; - $use =~ s/pS/ps/ if ($bad_extension =~ /pf/); + if ($bad_specifier ne "") { + my $stat_real = get_stat_real($linenr, $lc); + my $ext_type = "Invalid"; + my $use = ""; + if ($bad_specifier =~ /p[Ff]/) { + $ext_type = "Deprecated"; + $use = " - use %pS instead"; + $use =~ s/pS/ps/ if ($bad_specifier =~ /pf/); + } + + WARN("VSPRINTF_POINTER_EXTENSION", + "$ext_type vsprintf pointer extension '$bad_specifier'$use\n" . "$here\n$stat_real\n"); } - WARN("VSPRINTF_POINTER_EXTENSION", - "$ext_type vsprintf pointer extension '$bad_extension'$use\n" . "$here\n$stat_real\n"); } } @@ -5918,10 +5970,7 @@ sub process { $stat !~ /(?:$Compare)\s*\bsscanf\s*$balanced_parens/)) { my $lc = $stat =~ tr@\n@@; $lc = $lc + $linenr; - my $stat_real = raw_line($linenr, 0); - for (my $count = $linenr + 1; $count <= $lc; $count++) { - $stat_real = $stat_real . "\n" . raw_line($count, 0); - } + my $stat_real = get_stat_real($linenr, $lc); WARN("NAKED_SSCANF", "unchecked sscanf return value\n" . "$here\n$stat_real\n"); } @@ -5932,10 +5981,7 @@ sub process { $line =~ /\bsscanf\b/) { my $lc = $stat =~ tr@\n@@; $lc = $lc + $linenr; - my $stat_real = raw_line($linenr, 0); - for (my $count = $linenr + 1; $count <= $lc; $count++) { - $stat_real = $stat_real . "\n" . 
raw_line($count, 0); - } + my $stat_real = get_stat_real($linenr, $lc); if ($stat_real =~ /\bsscanf\b\s*\(\s*$FuncArg\s*,\s*("[^"]+")/) { my $format = $6; my $count = $format =~ tr@%@%@; @@ -6065,12 +6111,9 @@ sub process { } if ($r1 !~ /^sizeof\b/ && $r2 =~ /^sizeof\s*\S/ && !($r1 =~ /^$Constant$/ || $r1 =~ /^[A-Z_][A-Z0-9_]*$/)) { - my $ctx = ''; - my $herectx = $here . "\n"; my $cnt = statement_rawlines($stat); - for (my $n = 0; $n < $cnt; $n++) { - $herectx .= raw_line($linenr, $n) . "\n"; - } + my $herectx = get_stat_here($linenr, $cnt, $here); + if (WARN("ALLOC_WITH_MULTIPLY", "Prefer $newfunc over $oldfunc with multiply\n" . $herectx) && $cnt == 1 && @@ -6153,12 +6196,9 @@ sub process { if ($^V && $^V ge 5.10.0 && defined $stat && $stat =~ /^\+[$;\s]*(?:case[$;\s]+\w+[$;\s]*:[$;\s]*|)*[$;\s]*\bdefault[$;\s]*:[$;\s]*;/g) { - my $ctx = ''; - my $herectx = $here . "\n"; my $cnt = statement_rawlines($stat); - for (my $n = 0; $n < $cnt; $n++) { - $herectx .= raw_line($linenr, $n) . "\n"; - } + my $herectx = get_stat_here($linenr, $cnt, $here); + WARN("DEFAULT_NO_BREAK", "switch default: should use break\n" . $herectx); } @@ -6211,6 +6251,12 @@ sub process { } } +# check for bool bitfields + if ($sline =~ /^.\s+bool\s*$Ident\s*:\s*\d+\s*;/) { + WARN("BOOL_BITFIELD", + "Avoid using bool as bitfield. Prefer bool bitfields as unsigned int or u<8|16|32>\n" . $herecurr); + } + # check for semaphores initialized locked if ($line =~ /^.\s*sema_init.+,\W?0\W?\)/) { WARN("CONSIDER_COMPLETION", @@ -6369,10 +6415,7 @@ sub process { my $lc = $stat =~ tr@\n@@; $lc = $lc + $linenr; - my $stat_real = raw_line($linenr, 0); - for (my $count = $linenr + 1; $count <= $lc; $count++) { - $stat_real = $stat_real . "\n" . 
raw_line($count, 0); - } + my $stat_real = get_stat_real($linenr, $lc); my $skip_args = ""; if ($arg_pos > 1) { @@ -6398,7 +6441,7 @@ sub process { } # check for uses of S_<PERMS> that could be octal for readability - if ($line =~ /\b($multi_mode_perms_string_search)\b/) { + while ($line =~ m{\b($multi_mode_perms_string_search)\b}g) { my $oval = $1; my $octal = perms_to_octal($oval); if (WARN("SYMBOLIC_PERMS", diff --git a/scripts/dtc/include-prefixes/cris b/scripts/dtc/include-prefixes/cris deleted file mode 120000 index 736d998ba506..000000000000 --- a/scripts/dtc/include-prefixes/cris +++ /dev/null @@ -1 +0,0 @@ -../../../arch/cris/boot/dts
\ No newline at end of file diff --git a/scripts/dtc/include-prefixes/metag b/scripts/dtc/include-prefixes/metag deleted file mode 120000 index 87a3c847db8f..000000000000 --- a/scripts/dtc/include-prefixes/metag +++ /dev/null @@ -1 +0,0 @@ -../../../arch/metag/boot/dts
\ No newline at end of file diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 1eeb70e439d7..4cafe6a19167 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -6006,6 +6006,7 @@ static int selinux_msg_queue_msgctl(struct kern_ipc_perm *msq, int cmd) SECCLASS_SYSTEM, SYSTEM__IPC_INFO, NULL); case IPC_STAT: case MSG_STAT: + case MSG_STAT_ANY: perms = MSGQ__GETATTR | MSGQ__ASSOCIATE; break; case IPC_SET: @@ -6157,6 +6158,7 @@ static int selinux_shm_shmctl(struct kern_ipc_perm *shp, int cmd) SECCLASS_SYSTEM, SYSTEM__IPC_INFO, NULL); case IPC_STAT: case SHM_STAT: + case SHM_STAT_ANY: perms = SHM__GETATTR | SHM__ASSOCIATE; break; case IPC_SET: @@ -6272,6 +6274,7 @@ static int selinux_sem_semctl(struct kern_ipc_perm *sma, int cmd) break; case IPC_STAT: case SEM_STAT: + case SEM_STAT_ANY: perms = SEM__GETATTR | SEM__ASSOCIATE; break; default: diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 73549007bf9e..0b414836bebd 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -3046,6 +3046,7 @@ static int smack_shm_shmctl(struct kern_ipc_perm *isp, int cmd) switch (cmd) { case IPC_STAT: case SHM_STAT: + case SHM_STAT_ANY: may = MAY_READ; break; case IPC_SET: @@ -3139,6 +3140,7 @@ static int smack_sem_semctl(struct kern_ipc_perm *isp, int cmd) case GETALL: case IPC_STAT: case SEM_STAT: + case SEM_STAT_ANY: may = MAY_READ; break; case SETVAL: @@ -3228,6 +3230,7 @@ static int smack_msg_queue_msgctl(struct kern_ipc_perm *isp, int cmd) switch (cmd) { case IPC_STAT: case MSG_STAT: + case MSG_STAT_ANY: may = MAY_READ; break; case IPC_SET: diff --git a/tools/include/linux/spinlock.h b/tools/include/linux/spinlock.h index 4ed569fcb139..b21b586b9854 100644 --- a/tools/include/linux/spinlock.h +++ b/tools/include/linux/spinlock.h @@ -7,6 +7,7 @@ #define spinlock_t pthread_mutex_t #define DEFINE_SPINLOCK(x) pthread_mutex_t x = PTHREAD_MUTEX_INITIALIZER; +#define __SPIN_LOCK_UNLOCKED(x) 
(pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER #define spin_lock_irqsave(x, f) (void)f, pthread_mutex_lock(x) #define spin_unlock_irqrestore(x, f) (void)f, pthread_mutex_unlock(x) diff --git a/tools/testing/ktest/config-bisect.pl b/tools/testing/ktest/config-bisect.pl new file mode 100755 index 000000000000..b28feea7c363 --- /dev/null +++ b/tools/testing/ktest/config-bisect.pl @@ -0,0 +1,770 @@ +#!/usr/bin/perl -w +# +# Copyright 2015 - Steven Rostedt, Red Hat Inc. +# Copyright 2017 - Steven Rostedt, VMware, Inc. +# +# Licensed under the terms of the GNU GPL License version 2 +# + +# usage: +# config-bisect.pl [options] good-config bad-config [good|bad] +# + +# Compares a good config to a bad config, then takes half of the diffs +# and produces a config that is somewhere between the good config and +# the bad config. That is, the resulting config will start with the +# good config and will try to make half of the differences between +# the good and bad configs match the bad config. It only tries, because +# dependencies between the two configs may prevent it from changing +# exactly half of the configs that are different between the two config +# files. + +# Here's a normal way to use it: +# +# $ cd /path/to/linux/kernel +# $ config-bisect.pl /path/to/good/config /path/to/bad/config + +# This will now pull in the good config (blowing away .config in that directory +# so do not make that be one of the good or bad configs), and then +# build the config with "make oldconfig" to make sure it matches the +# current kernel. It will then store the configs in that result for +# the good config. It does the same for the bad config as well. +# The algorithm will run, merging half of the differences between +# the two configs and building them with "make oldconfig" to make sure +# the result changes (dependencies may reset changes the tool had made). 
+# It then copies the result of its good config to /path/to/good/config.tmp +# and the bad config to /path/to/bad/config.tmp (just appends ".tmp" to the +# files passed in). And the ".config" that you should test will be in +# directory + +# After the first run, determine if the result is good or bad then +# run the same command appending the result + +# For good results: +# $ config-bisect.pl /path/to/good/config /path/to/bad/config good + +# For bad results: +# $ config-bisect.pl /path/to/good/config /path/to/bad/config bad + +# Do not change the good-config or bad-config, config-bisect.pl will +# copy the good-config to a temp file with the same name as good-config +# but with a ".tmp" after it. It will do the same with the bad-config. + +# If "good" or "bad" is not stated at the end, it will copy the good and +# bad configs to the .tmp versions. If a .tmp version already exists, it will +# warn before writing over them (-r will not warn, and just write over them). +# If the last config is labeled "good", then it will copy it to the good .tmp +# version. If the last config is labeled "bad", it will copy it to the bad +# .tmp version. It will continue this until it can not merge the two any more +# without the result being equal to either the good or bad .tmp configs. 
my $start = 0;
my $val = "";

# Directory the script was started from; relative paths are resolved
# against it (see expand_path).
my $pwd = `pwd`;
chomp $pwd;
my $tree = $pwd;
my $build;

my $output_config;
my $reset_bisect;

# Print the command-line help text and exit with a failure status.
sub usage {
    print <<"EOF";

usage: config-bisect.pl [-l linux-tree][-b build-dir] good-config bad-config [good|bad]
  -l [optional] define location of linux-tree (default is current directory)
  -b [optional] define location to build (O=build-dir) (default is linux-tree)
  good-config the config that is considered good
  bad-config the config that does not work
  "good" add this if the last run produced a good config
  "bad" add this if the last run produced a bad config
  If "good" or "bad" is not specified, then it is the start of a new bisect

  Note, each run will create copy of good and bad configs with ".tmp" appended.

EOF

    exit(-1);
}

# Print all arguments to stdout.
sub doprint {
    print @_;
}

# Report a fatal error on stdout, then die with the same message.
sub dodie {
    doprint("CRITICAL FAILURE... ", @_, "\n");

    die @_, "\n";
}

# Return $file unchanged when it is absolute, otherwise prefix it with
# the directory the script was started from.
sub expand_path {
    my ($file) = @_;

    if ($file =~ m,^/,) {
        return $file;
    }
    return "$pwd/$file";
}

# Ask a yes/no question on the terminal until a valid answer is given.
#
# $cancel - when true, 'c' (cancel) is also accepted and is the default
#           for an empty answer; otherwise the default is 'n'.
# $prompt - text shown before the choice hint.
#
# Returns 1 for yes and 0 for no; a cancel answer exits the script.
sub read_prompt {
    my ($cancel, $prompt) = @_;

    my $ans;

    for (;;) {
        if ($cancel) {
            print "$prompt [y/n/C] ";
        } else {
            print "$prompt [y/N] ";
        }
        $ans = <STDIN>;
        # EOF on stdin yields undef; treat it like an empty line so the
        # default answer is taken without "uninitialized" warnings.
        $ans = "" if (!defined($ans));
        chomp $ans;
        if ($ans =~ /^\s*$/) {
            if ($cancel) {
                $ans = "c";
            } else {
                $ans = "n";
            }
        }
        last if ($ans =~ /^y$/i || $ans =~ /^n$/i);
        if ($cancel) {
            last if ($ans =~ /^c$/i);
            print "Please answer either 'y', 'n' or 'c'.\n";
        } else {
            print "Please answer either 'y' or 'n'.\n";
        }
    }
    if ($ans =~ /^c$/i) {
        exit;
    }
    if ($ans !~ /^y$/i) {
        return 0;
    }
    return 1;
}

# Yes/no prompt, defaulting to "no".
sub read_yn {
    my ($prompt) = @_;

    return read_prompt(0, $prompt);
}

# Yes/no/cancel prompt, defaulting to "cancel".
sub read_ync {
    my ($prompt) = @_;

    return read_prompt(1, $prompt);
}

# Run $command through the shell with stderr merged into stdout.
#
# $command  - shell command line to execute.
# $redirect - optional file name; when given, the command's output is
#             written to it (otherwise the output is discarded).
#
# Prints the command, its elapsed time and SUCCESS/FAILED.
# Returns true when the command's wait status is zero.
sub run_command {
    my ($command, $redirect) = @_;

    my $start_time = time;

    doprint("$command ... ");

    # Three-argument pipe open on a lexical handle; the single command
    # string is still interpreted by the shell, so "2>&1" keeps working.
    my $pid = open(my $cmd_fh, '-|', "$command 2>&1") or
        dodie("unable to exec $command");

    my $redirect_fh;
    if (defined($redirect)) {
        open($redirect_fh, '>', $redirect) or
            dodie("failed to write to redirect $redirect");
    }

    # Drain the pipe so the child never blocks on a full pipe buffer.
    while (my $line = <$cmd_fh>) {
        print {$redirect_fh} $line if (defined($redirect_fh));
    }

    # Collect the exit status before close() can overwrite $?.
    waitpid($pid, 0);
    my $failed = $?;

    close($cmd_fh);
    close($redirect_fh) if (defined($redirect_fh));

    my $delta = time - $start_time;

    if ($delta == 1) {
        doprint("[1 second] ");
    } else {
        doprint("[$delta seconds] ");
    }

    if ($failed) {
        doprint("FAILED!\n");
    } else {
        doprint("SUCCESS\n");
    }

    return !$failed;
}

###### CONFIG BISECT ######

# config_ignore holds the configs that were set (or unset) for
# a good config and we will ignore these configs for the rest
# of a config bisect. These configs stay as they were.
my %config_ignore;

# config_set holds what all configs were set as.
my %config_set;

# config_off holds the set of configs that the bad config had disabled.
# We need to record them and set them in the .config when running
# olddefconfig, because olddefconfig keeps the defaults.
my %config_off;

# config_off_tmp holds a set of configs to turn off for now
my @config_off_tmp;

# config_list is the set of configs that are being tested
my %config_list;
my %null_config;

my %dependency;

# Make command prefix used by make_oldconfig; it is not assigned in this
# part of the script (presumably set during option handling - confirm).
my $make;

# Normalise the current .config against the kernel tree, falling back to
# older make targets when the newer ones are unavailable. Dies when all
# three variants fail.
sub make_oldconfig {

    if (!run_command("$make olddefconfig")) {
        # Perhaps olddefconfig doesn't exist in this version of the kernel
        # try oldnoconfig
        doprint("olddefconfig failed, trying make oldnoconfig\n");
        if (!run_command("$make oldnoconfig")) {
            doprint("oldnoconfig failed, trying yes '' | make oldconfig\n");
            # try a yes '' | oldconfig
            run_command("yes '' | $make oldconfig") or
                dodie("failed make config oldconfig");
        }
    }
}

# Read the kernel config file $config into the hash referenced by $hash.
# Keys are CONFIG_* names; values are the complete line ("CONFIG_X=..."
# or "# CONFIG_X is not set"). Other lines are ignored. Dies when the
# file cannot be opened.
sub assign_configs {
    my ($hash, $config) = @_;

    doprint("Reading configs from $config\n");

    # Three-argument open: the file name is never parsed for mode
    # characters or surrounding whitespace.
    open(my $in, '<', $config)
        or dodie("Failed to read $config");

    while (my $line = <$in>) {
        chomp $line;
        if ($line =~ /^((CONFIG\S*)=.*)/) {
            $hash->{$2} = $1;
        } elsif ($line =~ /^(# (CONFIG\S*) is not set)/) {
            $hash->{$2} = $1;
        }
    }

    close($in);
}

# Load $config into %config_ignore.
sub process_config_ignore {
    my ($config) = @_;

    assign_configs(\%config_ignore, $config);
}

# Return the direct dependencies of $config recorded in %dependency,
# followed by the dependencies of each of those, recursively. A config
# reachable along several paths appears once per path.
#
# $on_path is internal: the set of configs currently being expanded. A
# dependency that is already being expanded is listed but not expanded
# again, so a cyclic %dependency cannot recurse without end.
sub get_dependencies {
    my ($config, $on_path) = @_;

    my $arr = $dependency{$config};
    if (!defined($arr)) {
        return ();
    }

    $on_path = {} if (!defined($on_path));
    $on_path->{$config} = 1;

    my @deps = @{$arr};

    foreach my $dep (@{$arr}) {
        next if ($on_path->{$dep});
        doprint("ADD DEP $dep\n");
        push @deps, get_dependencies($dep, $on_path);
    }

    delete $on_path->{$config};

    return @deps;
}

# Write every value of the config hash referenced by $pc to $file, one
# per line. Lines are emitted in sorted key order so that the same set
# of configs always produces the same file. Dies when the file cannot
# be opened or fully written.
sub save_config {
    my ($pc, $file) = @_;

    doprint("Saving configs into $file\n");

    open(my $out, '>', $file) or dodie("Can not write to $file");

    foreach my $config (sort keys %{$pc}) {
        print {$out} "$pc->{$config}\n";
    }

    # Buffered write errors (e.g. a full disk) only surface at close.
    close($out) or dodie("Can not write to $file");
}

# Write the configs in $pc to the output config and let the kernel's
# config machinery fill in the rest. $name is only used in the log line.
sub create_config {
    my ($name, $pc) = @_;

    doprint("Creating old config from $name configs\n");

    save_config($pc, $output_config);

    make_oldconfig();
}

# compare two config hashes, and return configs with different vals.
# It returns B's config values, but you can use A to see what A was.
+sub diff_config_vals { + my ($pa, $pb) = @_; + + # crappy Perl way to pass in hashes. + my %a = %{$pa}; + my %b = %{$pb}; + + my %ret; + + foreach my $item (keys %a) { + if (defined($b{$item}) && $b{$item} ne $a{$item}) { + $ret{$item} = $b{$item}; + } + } + + return %ret; +} + +# compare two config hashes and return the configs in B but not A +sub diff_configs { + my ($pa, $pb) = @_; + + my %ret; + + # crappy Perl way to pass in hashes. + my %a = %{$pa}; + my %b = %{$pb}; + + foreach my $item (keys %b) { + if (!defined($a{$item})) { + $ret{$item} = $b{$item}; + } + } + + return %ret; +} + +# return if two configs are equal or not +# 0 is equal +1 b has something a does not +# +1 if a and b have a different item. +# -1 if a has something b does not +sub compare_configs { + my ($pa, $pb) = @_; + + my %ret; + + # crappy Perl way to pass in hashes. + my %a = %{$pa}; + my %b = %{$pb}; + + foreach my $item (keys %b) { + if (!defined($a{$item})) { + return 1; + } + if ($a{$item} ne $b{$item}) { + return 1; + } + } + + foreach my $item (keys %a) { + if (!defined($b{$item})) { + return -1; + } + } + + return 0; +} + +sub process_failed { + my ($config) = @_; + + doprint "\n\n***************************************\n"; + doprint "Found bad config: $config\n"; + doprint "***************************************\n\n"; +} + +sub process_new_config { + my ($tc, $nc, $gc, $bc) = @_; + + my %tmp_config = %{$tc}; + my %good_configs = %{$gc}; + my %bad_configs = %{$bc}; + + my %new_configs; + + my $runtest = 1; + my $ret; + + create_config "tmp_configs", \%tmp_config; + assign_configs \%new_configs, $output_config; + + $ret = compare_configs \%new_configs, \%bad_configs; + if (!$ret) { + doprint "New config equals bad config, try next test\n"; + $runtest = 0; + } + + if ($runtest) { + $ret = compare_configs \%new_configs, \%good_configs; + if (!$ret) { + doprint "New config equals good config, try next test\n"; + $runtest = 0; + } + } + + %{$nc} = %new_configs; + + return $runtest; 
+} + +sub convert_config { + my ($config) = @_; + + if ($config =~ /^# (.*) is not set/) { + $config = "$1=n"; + } + + $config =~ s/^CONFIG_//; + return $config; +} + +sub print_config { + my ($sym, $config) = @_; + + $config = convert_config $config; + doprint "$sym$config\n"; +} + +sub print_config_compare { + my ($good_config, $bad_config) = @_; + + $good_config = convert_config $good_config; + $bad_config = convert_config $bad_config; + + my $good_value = $good_config; + my $bad_value = $bad_config; + $good_value =~ s/(.*)=//; + my $config = $1; + + $bad_value =~ s/.*=//; + + doprint " $config $good_value -> $bad_value\n"; +} + +# Pass in: +# $phalf: half of the configs names you want to add +# $oconfigs: The orginial configs to start with +# $sconfigs: The source to update $oconfigs with (from $phalf) +# $which: The name of which half that is updating (top / bottom) +# $type: The name of the source type (good / bad) +sub make_half { + my ($phalf, $oconfigs, $sconfigs, $which, $type) = @_; + + my @half = @{$phalf}; + my %orig_configs = %{$oconfigs}; + my %source_configs = %{$sconfigs}; + + my %tmp_config = %orig_configs; + + doprint "Settings bisect with $which half of $type configs:\n"; + foreach my $item (@half) { + doprint "Updating $item to $source_configs{$item}\n"; + $tmp_config{$item} = $source_configs{$item}; + } + + return %tmp_config; +} + +sub run_config_bisect { + my ($pgood, $pbad) = @_; + + my %good_configs = %{$pgood}; + my %bad_configs = %{$pbad}; + + my %diff_configs = diff_config_vals \%good_configs, \%bad_configs; + my %b_configs = diff_configs \%good_configs, \%bad_configs; + my %g_configs = diff_configs \%bad_configs, \%good_configs; + + # diff_arr is what is in both good and bad but are different (y->n) + my @diff_arr = keys %diff_configs; + my $len_diff = $#diff_arr + 1; + + # b_arr is what is in bad but not in good (has depends) + my @b_arr = keys %b_configs; + my $len_b = $#b_arr + 1; + + # g_arr is what is in good but not in bad + my 
@g_arr = keys %g_configs; + my $len_g = $#g_arr + 1; + + my $runtest = 0; + my %new_configs; + my $ret; + + # Look at the configs that are different between good and bad. + # This does not include those that depend on other configs + # (configs depending on other configs that are not set would + # not show up even as a "# CONFIG_FOO is not set" + + + doprint "# of configs to check: $len_diff\n"; + doprint "# of configs showing only in good: $len_g\n"; + doprint "# of configs showing only in bad: $len_b\n"; + + if ($len_diff > 0) { + # Now test for different values + + doprint "Configs left to check:\n"; + doprint " Good Config\t\t\tBad Config\n"; + doprint " -----------\t\t\t----------\n"; + foreach my $item (@diff_arr) { + doprint " $good_configs{$item}\t$bad_configs{$item}\n"; + } + + my $half = int($#diff_arr / 2); + my @tophalf = @diff_arr[0 .. $half]; + + doprint "Set tmp config to be good config with some bad config values\n"; + + my %tmp_config = make_half \@tophalf, \%good_configs, + \%bad_configs, "top", "bad"; + + $runtest = process_new_config \%tmp_config, \%new_configs, + \%good_configs, \%bad_configs; + + if (!$runtest) { + doprint "Set tmp config to be bad config with some good config values\n"; + + my %tmp_config = make_half \@tophalf, \%bad_configs, + \%good_configs, "top", "good"; + + $runtest = process_new_config \%tmp_config, \%new_configs, + \%good_configs, \%bad_configs; + } + } + + if (!$runtest && $len_diff > 0) { + # do the same thing, but this time with bottom half + + my $half = int($#diff_arr / 2); + my @bottomhalf = @diff_arr[$half+1 .. 
$#diff_arr]; + + doprint "Set tmp config to be good config with some bad config values\n"; + + my %tmp_config = make_half \@bottomhalf, \%good_configs, + \%bad_configs, "bottom", "bad"; + + $runtest = process_new_config \%tmp_config, \%new_configs, + \%good_configs, \%bad_configs; + + if (!$runtest) { + doprint "Set tmp config to be bad config with some good config values\n"; + + my %tmp_config = make_half \@bottomhalf, \%bad_configs, + \%good_configs, "bottom", "good"; + + $runtest = process_new_config \%tmp_config, \%new_configs, + \%good_configs, \%bad_configs; + } + } + + if ($runtest) { + make_oldconfig; + doprint "READY TO TEST .config IN $build\n"; + return 0; + } + + doprint "\n%%%%%%%% FAILED TO FIND SINGLE BAD CONFIG %%%%%%%%\n"; + doprint "Hmm, can't make any more changes without making good == bad?\n"; + doprint "Difference between good (+) and bad (-)\n"; + + foreach my $item (keys %bad_configs) { + if (!defined($good_configs{$item})) { + print_config "-", $bad_configs{$item}; + } + } + + foreach my $item (keys %good_configs) { + next if (!defined($bad_configs{$item})); + if ($good_configs{$item} ne $bad_configs{$item}) { + print_config_compare $good_configs{$item}, $bad_configs{$item}; + } + } + + foreach my $item (keys %good_configs) { + if (!defined($bad_configs{$item})) { + print_config "+", $good_configs{$item}; + } + } + return -1; +} + +sub config_bisect { + my ($good_config, $bad_config) = @_; + my $ret; + + my %good_configs; + my %bad_configs; + my %tmp_configs; + + doprint "Run good configs through make oldconfig\n"; + assign_configs \%tmp_configs, $good_config; + create_config "$good_config", \%tmp_configs; + assign_configs \%good_configs, $output_config; + + doprint "Run bad configs through make oldconfig\n"; + assign_configs \%tmp_configs, $bad_config; + create_config "$bad_config", \%tmp_configs; + assign_configs \%bad_configs, $output_config; + + save_config \%good_configs, $good_config; + save_config \%bad_configs, $bad_config; + + 
return run_config_bisect \%good_configs, \%bad_configs; +} + +while ($#ARGV >= 0) { + if ($ARGV[0] !~ m/^-/) { + last; + } + my $opt = shift @ARGV; + + if ($opt eq "-b") { + $val = shift @ARGV; + if (!defined($val)) { + die "-b requires value\n"; + } + $build = $val; + } + + elsif ($opt eq "-l") { + $val = shift @ARGV; + if (!defined($val)) { + die "-l requires value\n"; + } + $tree = $val; + } + + elsif ($opt eq "-r") { + $reset_bisect = 1; + } + + elsif ($opt eq "-h") { + usage; + } + + else { + die "Unknow option $opt\n"; + } +} + +$build = $tree if (!defined($build)); + +$tree = expand_path $tree; +$build = expand_path $build; + +if ( ! -d $tree ) { + die "$tree not a directory\n"; +} + +if ( ! -d $build ) { + die "$build not a directory\n"; +} + +usage if $#ARGV < 1; + +if ($#ARGV == 1) { + $start = 1; +} elsif ($#ARGV == 2) { + $val = $ARGV[2]; + if ($val ne "good" && $val ne "bad") { + die "Unknown command '$val', bust be either \"good\" or \"bad\"\n"; + } +} else { + usage; +} + +my $good_start = expand_path $ARGV[0]; +my $bad_start = expand_path $ARGV[1]; + +my $good = "$good_start.tmp"; +my $bad = "$bad_start.tmp"; + +$make = "make"; + +if ($build ne $tree) { + $make = "make O=$build" +} + +$output_config = "$build/.config"; + +if ($start) { + if ( ! -f $good_start ) { + die "$good_start not found\n"; + } + if ( ! -f $bad_start ) { + die "$bad_start not found\n"; + } + if ( -f $good || -f $bad ) { + my $p = ""; + + if ( -f $good ) { + $p = "$good exists\n"; + } + + if ( -f $bad ) { + $p = "$p$bad exists\n"; + } + + if (!defined($reset_bisect)) { + if (!read_yn "${p}Overwrite and start new bisect anyway?") { + exit (-1); + } + } + } + run_command "cp $good_start $good" or die "failed to copy to $good\n"; + run_command "cp $bad_start $bad" or die "faield to copy to $bad\n"; +} else { + if ( ! -f $good ) { + die "Can not find file $good\n"; + } + if ( ! 
-f $bad ) { + die "Can not find file $bad\n"; + } + if ($val eq "good") { + run_command "cp $output_config $good" or die "failed to copy $config to $good\n"; + } elsif ($val eq "bad") { + run_command "cp $output_config $bad" or die "failed to copy $config to $bad\n"; + } +} + +chdir $tree || die "can't change directory to $tree"; + +my $ret = config_bisect $good, $bad; + +if (!$ret) { + exit(0); +} + +if ($ret > 0) { + doprint "Cleaning temp files\n"; + run_command "rm $good"; + run_command "rm $bad"; + exit(1); +} else { + doprint "See good and bad configs for details:\n"; + doprint "good: $good\n"; + doprint "bad: $bad\n"; + doprint "%%%%%%%% FAILED TO FIND SINGLE BAD CONFIG %%%%%%%%\n"; +} +exit(2); diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl index 8809f244bb7c..87af8a68ab25 100755 --- a/tools/testing/ktest/ktest.pl +++ b/tools/testing/ktest/ktest.pl @@ -10,6 +10,7 @@ use Fcntl qw(F_GETFL F_SETFL O_NONBLOCK); use File::Path qw(mkpath); use File::Copy qw(cp); use FileHandle; +use FindBin; my $VERSION = "0.2"; @@ -22,6 +23,11 @@ my %evals; #default opts my %default = ( + "MAILER" => "sendmail", # default mailer + "EMAIL_ON_ERROR" => 1, + "EMAIL_WHEN_FINISHED" => 1, + "EMAIL_WHEN_CANCELED" => 0, + "EMAIL_WHEN_STARTED" => 0, "NUM_TESTS" => 1, "TEST_TYPE" => "build", "BUILD_TYPE" => "randconfig", @@ -59,6 +65,7 @@ my %default = ( "GRUB_REBOOT" => "grub2-reboot", "SYSLINUX" => "extlinux", "SYSLINUX_PATH" => "/boot/extlinux", + "CONNECT_TIMEOUT" => 25, # required, and we will ask users if they don't have them but we keep the default # value something that is common. 
@@ -163,6 +170,8 @@ my $store_failures; my $store_successes; my $test_name; my $timeout; +my $connect_timeout; +my $config_bisect_exec; my $booted_timeout; my $detect_triplefault; my $console; @@ -204,6 +213,20 @@ my $install_time; my $reboot_time; my $test_time; +my $pwd; +my $dirname = $FindBin::Bin; + +my $mailto; +my $mailer; +my $mail_path; +my $mail_command; +my $email_on_error; +my $email_when_finished; +my $email_when_started; +my $email_when_canceled; + +my $script_start_time = localtime(); + # set when a test is something other that just building or install # which would require more options. my $buildonly = 1; @@ -229,6 +252,14 @@ my $no_reboot = 1; my $reboot_success = 0; my %option_map = ( + "MAILTO" => \$mailto, + "MAILER" => \$mailer, + "MAIL_PATH" => \$mail_path, + "MAIL_COMMAND" => \$mail_command, + "EMAIL_ON_ERROR" => \$email_on_error, + "EMAIL_WHEN_FINISHED" => \$email_when_finished, + "EMAIL_WHEN_STARTED" => \$email_when_started, + "EMAIL_WHEN_CANCELED" => \$email_when_canceled, "MACHINE" => \$machine, "SSH_USER" => \$ssh_user, "TMP_DIR" => \$tmpdir, @@ -296,6 +327,8 @@ my %option_map = ( "STORE_SUCCESSES" => \$store_successes, "TEST_NAME" => \$test_name, "TIMEOUT" => \$timeout, + "CONNECT_TIMEOUT" => \$connect_timeout, + "CONFIG_BISECT_EXEC" => \$config_bisect_exec, "BOOTED_TIMEOUT" => \$booted_timeout, "CONSOLE" => \$console, "CLOSE_CONSOLE_SIGNAL" => \$close_console_signal, @@ -337,6 +370,7 @@ my %used_options; # default variables that can be used chomp ($variable{"PWD"} = `pwd`); +$pwd = $variable{"PWD"}; $config_help{"MACHINE"} = << "EOF" The machine hostname that you will test. 
@@ -718,22 +752,14 @@ sub set_value { my $prvalue = process_variables($rvalue); - if ($buildonly && $lvalue =~ /^TEST_TYPE(\[.*\])?$/ && $prvalue ne "build") { + if ($lvalue =~ /^(TEST|BISECT|CONFIG_BISECT)_TYPE(\[.*\])?$/ && + $prvalue !~ /^(config_|)bisect$/ && + $prvalue !~ /^build$/ && + $buildonly) { + # Note if a test is something other than build, then we # will need other mandatory options. if ($prvalue ne "install") { - # for bisect, we need to check BISECT_TYPE - if ($prvalue ne "bisect") { - $buildonly = 0; - } - } else { - # install still limits some mandatory options. - $buildonly = 2; - } - } - - if ($buildonly && $lvalue =~ /^BISECT_TYPE(\[.*\])?$/ && $prvalue ne "build") { - if ($prvalue ne "install") { $buildonly = 0; } else { # install still limits some mandatory options. @@ -1140,7 +1166,8 @@ sub __read_config { sub get_test_case { print "What test case would you like to run?\n"; print " (build, install or boot)\n"; - print " Other tests are available but require editing the config file\n"; + print " Other tests are available but require editing ktest.conf\n"; + print " (see tools/testing/ktest/sample.conf)\n"; my $ans = <STDIN>; chomp $ans; $default{"TEST_TYPE"} = $ans; @@ -1328,8 +1355,8 @@ sub reboot { my ($time) = @_; my $powercycle = 0; - # test if the machine can be connected to within 5 seconds - my $stat = run_ssh("echo check machine status", 5); + # test if the machine can be connected to within a few seconds + my $stat = run_ssh("echo check machine status", $connect_timeout); if (!$stat) { doprint("power cycle\n"); $powercycle = 1; @@ -1404,10 +1431,18 @@ sub do_not_reboot { return $test_type eq "build" || $no_reboot || ($test_type eq "patchcheck" && $opt{"PATCHCHECK_TYPE[$i]"} eq "build") || - ($test_type eq "bisect" && $opt{"BISECT_TYPE[$i]"} eq "build"); + ($test_type eq "bisect" && $opt{"BISECT_TYPE[$i]"} eq "build") || + ($test_type eq "config_bisect" && $opt{"CONFIG_BISECT_TYPE[$i]"} eq "build"); } +my $in_die = 0; + sub dodie { + 
+ # avoid recusion + return if ($in_die); + $in_die = 1; + doprint "CRITICAL FAILURE... ", @_, "\n"; my $i = $iteration; @@ -1426,6 +1461,11 @@ sub dodie { print " See $opt{LOG_FILE} for more info.\n"; } + if ($email_on_error) { + send_email("KTEST: critical failure for your [$test_type] test", + "Your test started at $script_start_time has failed with:\n@_\n"); + } + if ($monitor_cnt) { # restore terminal settings system("stty $stty_orig"); @@ -1477,7 +1517,7 @@ sub exec_console { close($pts); exec $console or - die "Can't open console $console"; + dodie "Can't open console $console"; } sub open_console { @@ -1515,6 +1555,9 @@ sub close_console { doprint "kill child process $pid\n"; kill $close_console_signal, $pid; + doprint "wait for child process $pid to exit\n"; + waitpid($pid, 0); + print "closing!\n"; close($fp); @@ -1625,7 +1668,7 @@ sub save_logs { if (!-d $dir) { mkpath($dir) or - die "can't create $dir"; + dodie "can't create $dir"; } my %files = ( @@ -1638,7 +1681,7 @@ sub save_logs { while (my ($name, $source) = each(%files)) { if (-f "$source") { cp "$source", "$dir/$name" or - die "failed to copy $source"; + dodie "failed to copy $source"; } } @@ -1692,6 +1735,7 @@ sub run_command { my $end_time; my $dolog = 0; my $dord = 0; + my $dostdout = 0; my $pid; $command =~ s/\$SSH_USER/$ssh_user/g; @@ -1710,9 +1754,15 @@ sub run_command { } if (defined($redirect)) { - open (RD, ">$redirect") or - dodie "failed to write to redirect $redirect"; - $dord = 1; + if ($redirect eq 1) { + $dostdout = 1; + # Have the output of the command on its own line + doprint "\n"; + } else { + open (RD, ">$redirect") or + dodie "failed to write to redirect $redirect"; + $dord = 1; + } } my $hit_timeout = 0; @@ -1734,6 +1784,7 @@ sub run_command { } print LOG $line if ($dolog); print RD $line if ($dord); + print $line if ($dostdout); } waitpid($pid, 0); @@ -1812,7 +1863,7 @@ sub get_grub2_index { $ssh_grub =~ s,\$SSH_COMMAND,cat $grub_file,g; open(IN, "$ssh_grub |") - or die 
"unable to get $grub_file"; + or dodie "unable to get $grub_file"; my $found = 0; @@ -1821,13 +1872,13 @@ sub get_grub2_index { $grub_number++; $found = 1; last; - } elsif (/^menuentry\s/) { + } elsif (/^menuentry\s|^submenu\s/) { $grub_number++; } } close(IN); - die "Could not find '$grub_menu' in $grub_file on $machine" + dodie "Could not find '$grub_menu' in $grub_file on $machine" if (!$found); doprint "$grub_number\n"; $last_grub_menu = $grub_menu; @@ -1855,7 +1906,7 @@ sub get_grub_index { $ssh_grub =~ s,\$SSH_COMMAND,cat /boot/grub/menu.lst,g; open(IN, "$ssh_grub |") - or die "unable to get menu.lst"; + or dodie "unable to get menu.lst"; my $found = 0; @@ -1870,7 +1921,7 @@ sub get_grub_index { } close(IN); - die "Could not find '$grub_menu' in /boot/grub/menu on $machine" + dodie "Could not find '$grub_menu' in /boot/grub/menu on $machine" if (!$found); doprint "$grub_number\n"; $last_grub_menu = $grub_menu; @@ -1983,7 +2034,7 @@ sub monitor { my $full_line = ""; open(DMESG, "> $dmesg") or - die "unable to write to $dmesg"; + dodie "unable to write to $dmesg"; reboot_to; @@ -2862,7 +2913,7 @@ sub run_bisect { sub update_bisect_replay { my $tmp_log = "$tmpdir/ktest_bisect_log"; run_command "git bisect log > $tmp_log" or - die "can't create bisect log"; + dodie "can't create bisect log"; return $tmp_log; } @@ -2871,9 +2922,9 @@ sub bisect { my $result; - die "BISECT_GOOD[$i] not defined\n" if (!defined($bisect_good)); - die "BISECT_BAD[$i] not defined\n" if (!defined($bisect_bad)); - die "BISECT_TYPE[$i] not defined\n" if (!defined($bisect_type)); + dodie "BISECT_GOOD[$i] not defined\n" if (!defined($bisect_good)); + dodie "BISECT_BAD[$i] not defined\n" if (!defined($bisect_bad)); + dodie "BISECT_TYPE[$i] not defined\n" if (!defined($bisect_type)); my $good = $bisect_good; my $bad = $bisect_bad; @@ -2936,7 +2987,7 @@ sub bisect { if ($check ne "good") { doprint "TESTING BISECT BAD [$bad]\n"; run_command "git checkout $bad" or - die "Failed to checkout $bad"; 
+ dodie "Failed to checkout $bad"; $result = run_bisect $type; @@ -2948,7 +2999,7 @@ sub bisect { if ($check ne "bad") { doprint "TESTING BISECT GOOD [$good]\n"; run_command "git checkout $good" or - die "Failed to checkout $good"; + dodie "Failed to checkout $good"; $result = run_bisect $type; @@ -2959,7 +3010,7 @@ sub bisect { # checkout where we started run_command "git checkout $head" or - die "Failed to checkout $head"; + dodie "Failed to checkout $head"; } run_command "git bisect start$start_files" or @@ -3092,76 +3143,6 @@ sub create_config { make_oldconfig; } -# compare two config hashes, and return configs with different vals. -# It returns B's config values, but you can use A to see what A was. -sub diff_config_vals { - my ($pa, $pb) = @_; - - # crappy Perl way to pass in hashes. - my %a = %{$pa}; - my %b = %{$pb}; - - my %ret; - - foreach my $item (keys %a) { - if (defined($b{$item}) && $b{$item} ne $a{$item}) { - $ret{$item} = $b{$item}; - } - } - - return %ret; -} - -# compare two config hashes and return the configs in B but not A -sub diff_configs { - my ($pa, $pb) = @_; - - my %ret; - - # crappy Perl way to pass in hashes. - my %a = %{$pa}; - my %b = %{$pb}; - - foreach my $item (keys %b) { - if (!defined($a{$item})) { - $ret{$item} = $b{$item}; - } - } - - return %ret; -} - -# return if two configs are equal or not -# 0 is equal +1 b has something a does not -# +1 if a and b have a different item. -# -1 if a has something b does not -sub compare_configs { - my ($pa, $pb) = @_; - - my %ret; - - # crappy Perl way to pass in hashes. 
- my %a = %{$pa}; - my %b = %{$pb}; - - foreach my $item (keys %b) { - if (!defined($a{$item})) { - return 1; - } - if ($a{$item} ne $b{$item}) { - return 1; - } - } - - foreach my $item (keys %a) { - if (!defined($b{$item})) { - return -1; - } - } - - return 0; -} - sub run_config_bisect_test { my ($type) = @_; @@ -3174,166 +3155,57 @@ sub run_config_bisect_test { return $ret; } -sub process_failed { - my ($config) = @_; +sub config_bisect_end { + my ($good, $bad) = @_; + my $diffexec = "diff -u"; + if (-f "$builddir/scripts/diffconfig") { + $diffexec = "$builddir/scripts/diffconfig"; + } doprint "\n\n***************************************\n"; - doprint "Found bad config: $config\n"; + doprint "No more config bisecting possible.\n"; + run_command "$diffexec $good $bad", 1; doprint "***************************************\n\n"; } -# used for config bisecting -my $good_config; -my $bad_config; - -sub process_new_config { - my ($tc, $nc, $gc, $bc) = @_; - - my %tmp_config = %{$tc}; - my %good_configs = %{$gc}; - my %bad_configs = %{$bc}; - - my %new_configs; - - my $runtest = 1; - my $ret; - - create_config "tmp_configs", \%tmp_config; - assign_configs \%new_configs, $output_config; - - $ret = compare_configs \%new_configs, \%bad_configs; - if (!$ret) { - doprint "New config equals bad config, try next test\n"; - $runtest = 0; - } - - if ($runtest) { - $ret = compare_configs \%new_configs, \%good_configs; - if (!$ret) { - doprint "New config equals good config, try next test\n"; - $runtest = 0; - } - } - - %{$nc} = %new_configs; - - return $runtest; -} - sub run_config_bisect { - my ($pgood, $pbad) = @_; - - my $type = $config_bisect_type; - - my %good_configs = %{$pgood}; - my %bad_configs = %{$pbad}; - - my %diff_configs = diff_config_vals \%good_configs, \%bad_configs; - my %b_configs = diff_configs \%good_configs, \%bad_configs; - my %g_configs = diff_configs \%bad_configs, \%good_configs; - - my @diff_arr = keys %diff_configs; - my $len_diff = $#diff_arr + 1; - 
- my @b_arr = keys %b_configs; - my $len_b = $#b_arr + 1; - - my @g_arr = keys %g_configs; - my $len_g = $#g_arr + 1; - - my $runtest = 1; - my %new_configs; + my ($good, $bad, $last_result) = @_; + my $reset = ""; + my $cmd; my $ret; - # First, lets get it down to a single subset. - # Is the problem with a difference in values? - # Is the problem with a missing config? - # Is the problem with a config that breaks things? - - # Enable all of one set and see if we get a new bad - # or good config. - - # first set the good config to the bad values. - - doprint "d=$len_diff g=$len_g b=$len_b\n"; - - # first lets enable things in bad config that are enabled in good config - - if ($len_diff > 0) { - if ($len_b > 0 || $len_g > 0) { - my %tmp_config = %bad_configs; - - doprint "Set tmp config to be bad config with good config values\n"; - foreach my $item (@diff_arr) { - $tmp_config{$item} = $good_configs{$item}; - } - - $runtest = process_new_config \%tmp_config, \%new_configs, - \%good_configs, \%bad_configs; - } + if (!length($last_result)) { + $reset = "-r"; } + run_command "$config_bisect_exec $reset -b $outputdir $good $bad $last_result", 1; - if (!$runtest && $len_diff > 0) { - - if ($len_diff == 1) { - process_failed $diff_arr[0]; - return 1; - } - my %tmp_config = %bad_configs; - - my $half = int($#diff_arr / 2); - my @tophalf = @diff_arr[0 .. $half]; - - doprint "Settings bisect with top half:\n"; - doprint "Set tmp config to be bad config with some good config values\n"; - foreach my $item (@tophalf) { - $tmp_config{$item} = $good_configs{$item}; - } - - $runtest = process_new_config \%tmp_config, \%new_configs, - \%good_configs, \%bad_configs; - - if (!$runtest) { - my %tmp_config = %bad_configs; - - doprint "Try bottom half\n"; - - my @bottomhalf = @diff_arr[$half+1 .. 
$#diff_arr]; - - foreach my $item (@bottomhalf) { - $tmp_config{$item} = $good_configs{$item}; - } - - $runtest = process_new_config \%tmp_config, \%new_configs, - \%good_configs, \%bad_configs; - } + # config-bisect returns: + # 0 if there is more to bisect + # 1 for finding a good config + # 2 if it can not find any more configs + # -1 (255) on error + if ($run_command_status) { + return $run_command_status; } - if ($runtest) { - $ret = run_config_bisect_test $type; - if ($ret) { - doprint "NEW GOOD CONFIG\n"; - %good_configs = %new_configs; - run_command "mv $good_config ${good_config}.last"; - save_config \%good_configs, $good_config; - %{$pgood} = %good_configs; - } else { - doprint "NEW BAD CONFIG\n"; - %bad_configs = %new_configs; - run_command "mv $bad_config ${bad_config}.last"; - save_config \%bad_configs, $bad_config; - %{$pbad} = %bad_configs; - } - return 0; + $ret = run_config_bisect_test $config_bisect_type; + if ($ret) { + doprint "NEW GOOD CONFIG\n"; + # Return 3 for good config + return 3; + } else { + doprint "NEW BAD CONFIG\n"; + # Return 4 for bad config + return 4; } - - fail "Hmm, need to do a mix match?\n"; - return -1; } sub config_bisect { my ($i) = @_; + my $good_config; + my $bad_config; + my $type = $config_bisect_type; my $ret; @@ -3353,6 +3225,24 @@ sub config_bisect { $good_config = $output_config; } + if (!defined($config_bisect_exec)) { + # First check the location that ktest.pl ran + my @locations = ( "$pwd/config-bisect.pl", + "$dirname/config-bisect.pl", + "$builddir/tools/testing/ktest/config-bisect.pl", + undef ); + foreach my $loc (@locations) { + doprint "loc = $loc\n"; + $config_bisect_exec = $loc; + last if (defined($config_bisect_exec && -x $config_bisect_exec)); + } + if (!defined($config_bisect_exec)) { + fail "Could not find an executable config-bisect.pl\n", + " Set CONFIG_BISECT_EXEC to point to config-bisect.pl"; + return 1; + } + } + # we don't want min configs to cause issues here. 
doprint "Disabling 'MIN_CONFIG' for this test\n"; undef $minconfig; @@ -3361,21 +3251,31 @@ sub config_bisect { my %bad_configs; my %tmp_configs; + if (-f "$tmpdir/good_config.tmp" || -f "$tmpdir/bad_config.tmp") { + if (read_yn "Interrupted config-bisect. Continue (n - will start new)?") { + if (-f "$tmpdir/good_config.tmp") { + $good_config = "$tmpdir/good_config.tmp"; + } else { + $good_config = "$tmpdir/good_config"; + } + if (-f "$tmpdir/bad_config.tmp") { + $bad_config = "$tmpdir/bad_config.tmp"; + } else { + $bad_config = "$tmpdir/bad_config"; + } + } + } doprint "Run good configs through make oldconfig\n"; assign_configs \%tmp_configs, $good_config; create_config "$good_config", \%tmp_configs; - assign_configs \%good_configs, $output_config; + $good_config = "$tmpdir/good_config"; + system("cp $output_config $good_config") == 0 or dodie "cp good config"; doprint "Run bad configs through make oldconfig\n"; assign_configs \%tmp_configs, $bad_config; create_config "$bad_config", \%tmp_configs; - assign_configs \%bad_configs, $output_config; - - $good_config = "$tmpdir/good_config"; $bad_config = "$tmpdir/bad_config"; - - save_config \%good_configs, $good_config; - save_config \%bad_configs, $bad_config; + system("cp $output_config $bad_config") == 0 or dodie "cp bad config"; if (defined($config_bisect_check) && $config_bisect_check ne "0") { if ($config_bisect_check ne "good") { @@ -3398,10 +3298,21 @@ sub config_bisect { } } + my $last_run = ""; + do { - $ret = run_config_bisect \%good_configs, \%bad_configs; + $ret = run_config_bisect $good_config, $bad_config, $last_run; + if ($ret == 3) { + $last_run = "good"; + } elsif ($ret == 4) { + $last_run = "bad"; + } print_times; - } while (!$ret); + } while ($ret == 3 || $ret == 4); + + if ($ret == 2) { + config_bisect_end "$good_config.tmp", "$bad_config.tmp"; + } return $ret if ($ret < 0); @@ -3416,9 +3327,9 @@ sub patchcheck_reboot { sub patchcheck { my ($i) = @_; - die "PATCHCHECK_START[$i] not defined\n" + 
dodie "PATCHCHECK_START[$i] not defined\n" if (!defined($patchcheck_start)); - die "PATCHCHECK_TYPE[$i] not defined\n" + dodie "PATCHCHECK_TYPE[$i] not defined\n" if (!defined($patchcheck_type)); my $start = $patchcheck_start; @@ -3432,7 +3343,7 @@ sub patchcheck { if (defined($patchcheck_end)) { $end = $patchcheck_end; } elsif ($cherry) { - die "PATCHCHECK_END must be defined with PATCHCHECK_CHERRY\n"; + dodie "PATCHCHECK_END must be defined with PATCHCHECK_CHERRY\n"; } # Get the true sha1's since we can use things like HEAD~3 @@ -3496,7 +3407,7 @@ sub patchcheck { doprint "\nProcessing commit \"$item\"\n\n"; run_command "git checkout $sha1" or - die "Failed to checkout $sha1"; + dodie "Failed to checkout $sha1"; # only clean on the first and last patch if ($item eq $list[0] || @@ -3587,7 +3498,7 @@ sub read_kconfig { } open(KIN, "$kconfig") - or die "Can't open $kconfig"; + or dodie "Can't open $kconfig"; while (<KIN>) { chomp; @@ -3746,7 +3657,7 @@ sub get_depends { $dep =~ s/^[^$valid]*[$valid]+//; } else { - die "this should never happen"; + dodie "this should never happen"; } } @@ -4007,7 +3918,7 @@ sub make_min_config { # update new ignore configs if (defined($ignore_config)) { open (OUT, ">$temp_config") - or die "Can't write to $temp_config"; + or dodie "Can't write to $temp_config"; foreach my $config (keys %save_configs) { print OUT "$save_configs{$config}\n"; } @@ -4035,7 +3946,7 @@ sub make_min_config { # Save off all the current mandatory configs open (OUT, ">$temp_config") - or die "Can't write to $temp_config"; + or dodie "Can't write to $temp_config"; foreach my $config (keys %keep_configs) { print OUT "$keep_configs{$config}\n"; } @@ -4222,6 +4133,74 @@ sub set_test_option { return eval_option($name, $option, $i); } +sub find_mailer { + my ($mailer) = @_; + + my @paths = split /:/, $ENV{PATH}; + + # sendmail is usually in /usr/sbin + $paths[$#paths + 1] = "/usr/sbin"; + + foreach my $path (@paths) { + if (-x "$path/$mailer") { + return $path; + } 
+ } + + return undef; +} + +sub do_send_mail { + my ($subject, $message) = @_; + + if (!defined($mail_path)) { + # find the mailer + $mail_path = find_mailer $mailer; + if (!defined($mail_path)) { + die "\nCan not find $mailer in PATH\n"; + } + } + + if (!defined($mail_command)) { + if ($mailer eq "mail" || $mailer eq "mailx") { + $mail_command = "\$MAIL_PATH/\$MAILER -s \'\$SUBJECT\' \$MAILTO <<< \'\$MESSAGE\'"; + } elsif ($mailer eq "sendmail" ) { + $mail_command = "echo \'Subject: \$SUBJECT\n\n\$MESSAGE\' | \$MAIL_PATH/\$MAILER -t \$MAILTO"; + } else { + die "\nYour mailer: $mailer is not supported.\n"; + } + } + + $mail_command =~ s/\$MAILER/$mailer/g; + $mail_command =~ s/\$MAIL_PATH/$mail_path/g; + $mail_command =~ s/\$MAILTO/$mailto/g; + $mail_command =~ s/\$SUBJECT/$subject/g; + $mail_command =~ s/\$MESSAGE/$message/g; + + run_command $mail_command; +} + +sub send_email { + + if (defined($mailto)) { + if (!defined($mailer)) { + doprint "No email sent: email or mailer not specified in config.\n"; + return; + } + do_send_mail @_; + } +} + +sub cancel_test { + if ($email_when_canceled) { + send_email("KTEST: Your [$test_type] test was cancelled", + "Your test started at $script_start_time was cancelled: sig int"); + } + die "\nCaught Sig Int, test interrupted: $!\n" +} + +$SIG{INT} = qw(cancel_test); + # First we need to do is the builds for (my $i = 1; $i <= $opt{"NUM_TESTS"}; $i++) { @@ -4245,11 +4224,11 @@ for (my $i = 1; $i <= $opt{"NUM_TESTS"}; $i++) { $outputdir = set_test_option("OUTPUT_DIR", $i); $builddir = set_test_option("BUILD_DIR", $i); - chdir $builddir || die "can't change directory to $builddir"; + chdir $builddir || dodie "can't change directory to $builddir"; if (!-d $outputdir) { mkpath($outputdir) or - die "can't create $outputdir"; + dodie "can't create $outputdir"; } $make = "$makecmd O=$outputdir"; @@ -4262,9 +4241,15 @@ for (my $i = 1; $i <= $opt{"NUM_TESTS"}; $i++) { $start_minconfig_defined = 1; # The first test may override the 
PRE_KTEST option - if (defined($pre_ktest) && $i == 1) { - doprint "\n"; - run_command $pre_ktest; + if ($i == 1) { + if (defined($pre_ktest)) { + doprint "\n"; + run_command $pre_ktest; + } + if ($email_when_started) { + send_email("KTEST: Your [$test_type] test was started", + "Your test was started on $script_start_time"); + } } # Any test can override the POST_KTEST option @@ -4280,7 +4265,7 @@ for (my $i = 1; $i <= $opt{"NUM_TESTS"}; $i++) { if (!-d $tmpdir) { mkpath($tmpdir) or - die "can't create $tmpdir"; + dodie "can't create $tmpdir"; } $ENV{"SSH_USER"} = $ssh_user; @@ -4353,7 +4338,7 @@ for (my $i = 1; $i <= $opt{"NUM_TESTS"}; $i++) { if (defined($checkout)) { run_command "git checkout $checkout" or - die "failed to checkout $checkout"; + dodie "failed to checkout $checkout"; } $no_reboot = 0; @@ -4428,4 +4413,8 @@ if ($opt{"POWEROFF_ON_SUCCESS"}) { doprint "\n $successes of $opt{NUM_TESTS} tests were successful\n\n"; +if ($email_when_finished) { + send_email("KTEST: Your [$test_type] test has finished!", + "$successes of $opt{NUM_TESTS} tests started at $script_start_time were successful!"); +} exit 0; diff --git a/tools/testing/ktest/sample.conf b/tools/testing/ktest/sample.conf index 6c58cd8bbbae..6ca6ca0ce695 100644 --- a/tools/testing/ktest/sample.conf +++ b/tools/testing/ktest/sample.conf @@ -1,6 +1,11 @@ # # Config file for ktest.pl # +# Place your customized version of this, in the working directory that +# ktest.pl is run from. By default, ktest.pl will look for a file +# called "ktest.conf", but you can name it anything you like and specify +# the name of your config file as the first argument of ktest.pl. +# # Note, all paths must be absolute # @@ -396,6 +401,44 @@ #### Optional Config Options (all have defaults) #### +# Email options for receiving notifications. Users must setup +# the specified mailer prior to using this feature. 
+# +# (default undefined) +#MAILTO = +# +# Supported mailers: sendmail, mail, mailx +# (default sendmail) +#MAILER = sendmail +# +# The executable to run +# (default: for sendmail "/usr/sbin/sendmail", otherwise equals ${MAILER}) +#MAIL_EXEC = /usr/sbin/sendmail +# +# The command used to send mail, which uses the above options +# can be modified. By default if the mailer is "sendmail" then +# MAIL_COMMAND = echo \'Subject: $SUBJECT\n\n$MESSAGE\' | $MAIL_PATH/$MAILER -t $MAILTO +# For mail or mailx: +# MAIL_COMMAND = $MAIL_PATH/$MAILER -s \'$SUBJECT\' $MAILTO <<< \'$MESSAGE\' +# ktest.pl will do the substitution for MAIL_PATH, MAILER, MAILTO at the time +# it sends the mail if "$FOO" format is used. If "${FOO}" format is used, +# then the substitutions will occur at the time the config file is read. +# But note, MAIL_PATH and MAILER require being set by the config file if +# ${MAIL_PATH} or ${MAILER} are used, but not if $MAIL_PATH or $MAILER are. +#MAIL_COMMAND = echo \'Subject: $SUBJECT\n\n$MESSAGE\' | $MAIL_PATH/$MAILER -t $MAILTO +# +# Errors are defined as those that would terminate the script +# (default 1) +#EMAIL_ON_ERROR = 1 +# (default 1) +#EMAIL_WHEN_FINISHED = 1 +# (default 0) +#EMAIL_WHEN_STARTED = 1 +# +# Users can cancel the test with Ctrl-C +# (default 0) +#EMAIL_WHEN_CANCELED = 1 + # Start a test setup. If you leave this off, all options # will be default and the test will run once. # This is a label and not really an option (it takes no value). @@ -725,6 +768,13 @@ # (default 120) #TIMEOUT = 120 +# The timeout in seconds when to test if the box can be rebooted +# or not. Before issuing the reboot command, a ssh connection +# is attempted to see if the target machine is still active. +# If the target does not connect within this timeout, a power cycle +# is issued instead of a reboot. +# CONNECT_TIMEOUT = 25 + # In between tests, a reboot of the box may occur, and this # is the time to wait for the console after it stops producing # output.
Some machines may not produce a large lag on reboot @@ -1167,6 +1217,16 @@ # Set it to "good" to test only the good config and set it # to "bad" to only test the bad config. # +# CONFIG_BISECT_EXEC (optional) +# The config bisect is a separate program that comes with ktest.pl. +# By default, it will look for: +# `pwd`/config-bisect.pl # the location ktest.pl was executed from. +# If it does not find it there, it will look for: +# `dirname <ktest.pl>`/config-bisect.pl # The directory that holds ktest.pl +# If it does not find it there, it will look for: +# ${BUILD_DIR}/tools/testing/ktest/config-bisect.pl +# Setting CONFIG_BISECT_EXEC will override where it looks. +# # Example: # TEST_START # TEST_TYPE = config_bisect diff --git a/tools/testing/radix-tree/linux/gfp.h b/tools/testing/radix-tree/linux/gfp.h index e3201ccf54c3..32159c08a52e 100644 --- a/tools/testing/radix-tree/linux/gfp.h +++ b/tools/testing/radix-tree/linux/gfp.h @@ -19,6 +19,7 @@ #define __GFP_RECLAIM (__GFP_DIRECT_RECLAIM|__GFP_KSWAPD_RECLAIM) +#define GFP_ZONEMASK 0x0fu #define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM) #define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) #define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM) diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 2fc410bc4f33..32aafa92074c 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -25,6 +25,7 @@ TARGETS += mqueue TARGETS += net TARGETS += nsfs TARGETS += powerpc +TARGETS += proc TARGETS += pstore TARGETS += ptrace TARGETS += seccomp diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore new file mode 100644 index 000000000000..6c16f77c722c --- /dev/null +++ b/tools/testing/selftests/proc/.gitignore @@ -0,0 +1,8 @@ +/proc-loadavg-001 +/proc-self-map-files-001 +/proc-self-map-files-002 +/proc-self-syscall +/proc-self-wchan +/proc-uptime-001 +/proc-uptime-002 +/read diff --git
a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile new file mode 100644 index 000000000000..dbb87e56264c --- /dev/null +++ b/tools/testing/selftests/proc/Makefile @@ -0,0 +1,13 @@ +CFLAGS += -Wall -O2 + +TEST_GEN_PROGS := +TEST_GEN_PROGS += proc-loadavg-001 +TEST_GEN_PROGS += proc-self-map-files-001 +TEST_GEN_PROGS += proc-self-map-files-002 +TEST_GEN_PROGS += proc-self-syscall +TEST_GEN_PROGS += proc-self-wchan +TEST_GEN_PROGS += proc-uptime-001 +TEST_GEN_PROGS += proc-uptime-002 +TEST_GEN_PROGS += read + +include ../lib.mk diff --git a/tools/testing/selftests/proc/config b/tools/testing/selftests/proc/config new file mode 100644 index 000000000000..68fbd2b35884 --- /dev/null +++ b/tools/testing/selftests/proc/config @@ -0,0 +1 @@ +CONFIG_PROC_FS=y diff --git a/tools/testing/selftests/proc/proc-loadavg-001.c b/tools/testing/selftests/proc/proc-loadavg-001.c new file mode 100644 index 000000000000..e38ad6d94d4b --- /dev/null +++ b/tools/testing/selftests/proc/proc-loadavg-001.c @@ -0,0 +1,83 @@ +/* + * Copyright _ 2018 Alexey Dobriyan <[email protected]> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +/* Test that /proc/loadavg correctly reports last pid in pid namespace. 
*/ +#define _GNU_SOURCE +#include <errno.h> +#include <sched.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> +#include <sys/wait.h> + +int main(void) +{ + pid_t pid; + int wstatus; + + if (unshare(CLONE_NEWPID) == -1) { + if (errno == ENOSYS || errno == EPERM) + return 2; + return 1; + } + + pid = fork(); + if (pid == -1) + return 1; + if (pid == 0) { + char buf[128], *p; + int fd; + ssize_t rv; + + fd = open("/proc/loadavg" , O_RDONLY); + if (fd == -1) + return 1; + rv = read(fd, buf, sizeof(buf)); + if (rv < 3) + return 1; + p = buf + rv; + + /* pid 1 */ + if (!(p[-3] == ' ' && p[-2] == '1' && p[-1] == '\n')) + return 1; + + pid = fork(); + if (pid == -1) + return 1; + if (pid == 0) + return 0; + if (waitpid(pid, NULL, 0) == -1) + return 1; + + lseek(fd, 0, SEEK_SET); + rv = read(fd, buf, sizeof(buf)); + if (rv < 3) + return 1; + p = buf + rv; + + /* pid 2 */ + if (!(p[-3] == ' ' && p[-2] == '2' && p[-1] == '\n')) + return 1; + + return 0; + } + + if (waitpid(pid, &wstatus, 0) == -1) + return 1; + if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == 0) + return 0; + return 1; +} diff --git a/tools/testing/selftests/proc/proc-self-map-files-001.c b/tools/testing/selftests/proc/proc-self-map-files-001.c new file mode 100644 index 000000000000..af1d0a6af810 --- /dev/null +++ b/tools/testing/selftests/proc/proc-self-map-files-001.c @@ -0,0 +1,82 @@ +/* + * Copyright _ 2018 Alexey Dobriyan <[email protected]> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +/* Test readlink /proc/self/map_files/... */ +#include <errno.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <stdio.h> +#include <unistd.h> +#include <sys/mman.h> +#include <stdlib.h> + +static void pass(const char *fmt, unsigned long a, unsigned long b) +{ + char name[64]; + char buf[64]; + + snprintf(name, sizeof(name), fmt, a, b); + if (readlink(name, buf, sizeof(buf)) == -1) + exit(1); +} + +static void fail(const char *fmt, unsigned long a, unsigned long b) +{ + char name[64]; + char buf[64]; + + snprintf(name, sizeof(name), fmt, a, b); + if (readlink(name, buf, sizeof(buf)) == -1 && errno == ENOENT) + return; + exit(1); +} + +int main(void) +{ + const unsigned int PAGE_SIZE = sysconf(_SC_PAGESIZE); + void *p; + int fd; + unsigned long a, b; + + fd = open("/dev/zero", O_RDONLY); + if (fd == -1) + return 1; + + p = mmap(NULL, PAGE_SIZE, PROT_NONE, MAP_PRIVATE|MAP_FILE, fd, 0); + if (p == MAP_FAILED) + return 1; + + a = (unsigned long)p; + b = (unsigned long)p + PAGE_SIZE; + + pass("/proc/self/map_files/%lx-%lx", a, b); + fail("/proc/self/map_files/ %lx-%lx", a, b); + fail("/proc/self/map_files/%lx -%lx", a, b); + fail("/proc/self/map_files/%lx- %lx", a, b); + fail("/proc/self/map_files/%lx-%lx ", a, b); + fail("/proc/self/map_files/0%lx-%lx", a, b); + fail("/proc/self/map_files/%lx-0%lx", a, b); + if (sizeof(long) == 4) { + fail("/proc/self/map_files/100000000%lx-%lx", a, b); + fail("/proc/self/map_files/%lx-100000000%lx", a, b); + } else if (sizeof(long) == 8) { + fail("/proc/self/map_files/10000000000000000%lx-%lx", a, b); + fail("/proc/self/map_files/%lx-10000000000000000%lx", a, b); + } 
else + return 1; + + return 0; +} diff --git a/tools/testing/selftests/proc/proc-self-map-files-002.c b/tools/testing/selftests/proc/proc-self-map-files-002.c new file mode 100644 index 000000000000..aebf4be56111 --- /dev/null +++ b/tools/testing/selftests/proc/proc-self-map-files-002.c @@ -0,0 +1,85 @@ +/* + * Copyright _ 2018 Alexey Dobriyan <[email protected]> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +/* Test readlink /proc/self/map_files/... with address 0. 
*/ +#include <errno.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <stdio.h> +#include <unistd.h> +#include <sys/mman.h> +#include <stdlib.h> + +static void pass(const char *fmt, unsigned long a, unsigned long b) +{ + char name[64]; + char buf[64]; + + snprintf(name, sizeof(name), fmt, a, b); + if (readlink(name, buf, sizeof(buf)) == -1) + exit(1); +} + +static void fail(const char *fmt, unsigned long a, unsigned long b) +{ + char name[64]; + char buf[64]; + + snprintf(name, sizeof(name), fmt, a, b); + if (readlink(name, buf, sizeof(buf)) == -1 && errno == ENOENT) + return; + exit(1); +} + +int main(void) +{ + const unsigned int PAGE_SIZE = sysconf(_SC_PAGESIZE); + void *p; + int fd; + unsigned long a, b; + + fd = open("/dev/zero", O_RDONLY); + if (fd == -1) + return 1; + + p = mmap(NULL, PAGE_SIZE, PROT_NONE, MAP_PRIVATE|MAP_FILE|MAP_FIXED, fd, 0); + if (p == MAP_FAILED) { + if (errno == EPERM) + return 2; + return 1; + } + + a = (unsigned long)p; + b = (unsigned long)p + PAGE_SIZE; + + pass("/proc/self/map_files/%lx-%lx", a, b); + fail("/proc/self/map_files/ %lx-%lx", a, b); + fail("/proc/self/map_files/%lx -%lx", a, b); + fail("/proc/self/map_files/%lx- %lx", a, b); + fail("/proc/self/map_files/%lx-%lx ", a, b); + fail("/proc/self/map_files/0%lx-%lx", a, b); + fail("/proc/self/map_files/%lx-0%lx", a, b); + if (sizeof(long) == 4) { + fail("/proc/self/map_files/100000000%lx-%lx", a, b); + fail("/proc/self/map_files/%lx-100000000%lx", a, b); + } else if (sizeof(long) == 8) { + fail("/proc/self/map_files/10000000000000000%lx-%lx", a, b); + fail("/proc/self/map_files/%lx-10000000000000000%lx", a, b); + } else + return 1; + + return 0; +} diff --git a/tools/testing/selftests/proc/proc-self-syscall.c b/tools/testing/selftests/proc/proc-self-syscall.c new file mode 100644 index 000000000000..05eb6f91f1e9 --- /dev/null +++ b/tools/testing/selftests/proc/proc-self-syscall.c @@ -0,0 +1,45 @@ +#define _GNU_SOURCE +#include <unistd.h> 
+#include <sys/syscall.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <errno.h> +#include <unistd.h> +#include <string.h> +#include <stdio.h> + +static inline ssize_t sys_read(int fd, void *buf, size_t len) +{ + return syscall(SYS_read, fd, buf, len); +} + +int main(void) +{ + char buf1[64]; + char buf2[64]; + int fd; + ssize_t rv; + + fd = open("/proc/self/syscall", O_RDONLY); + if (fd == -1) { + if (errno == ENOENT) + return 2; + return 1; + } + + /* Do direct system call as libc can wrap anything. */ + snprintf(buf1, sizeof(buf1), "%ld 0x%lx 0x%lx 0x%lx", + (long)SYS_read, (long)fd, (long)buf2, (long)sizeof(buf2)); + + memset(buf2, 0, sizeof(buf2)); + rv = sys_read(fd, buf2, sizeof(buf2)); + if (rv < 0) + return 1; + if (rv < strlen(buf1)) + return 1; + if (strncmp(buf1, buf2, strlen(buf1)) != 0) + return 1; + + return 0; +} diff --git a/tools/testing/selftests/proc/proc-self-wchan.c b/tools/testing/selftests/proc/proc-self-wchan.c new file mode 100644 index 000000000000..b8d8728a6869 --- /dev/null +++ b/tools/testing/selftests/proc/proc-self-wchan.c @@ -0,0 +1,25 @@ +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <errno.h> +#include <unistd.h> + +int main(void) +{ + char buf[64]; + int fd; + + fd = open("/proc/self/wchan", O_RDONLY); + if (fd == -1) { + if (errno == ENOENT) + return 2; + return 1; + } + + buf[0] = '\0'; + if (read(fd, buf, sizeof(buf)) != 1) + return 1; + if (buf[0] != '0') + return 1; + return 0; +} diff --git a/tools/testing/selftests/proc/proc-uptime-001.c b/tools/testing/selftests/proc/proc-uptime-001.c new file mode 100644 index 000000000000..303f26092306 --- /dev/null +++ b/tools/testing/selftests/proc/proc-uptime-001.c @@ -0,0 +1,45 @@ +/* + * Copyright _ 2018 Alexey Dobriyan <[email protected]> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this 
permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +// Test that values in /proc/uptime increment monotonically. +#undef NDEBUG +#include <assert.h> +#include <stdint.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> + +#include "proc-uptime.h" + +int main(void) +{ + uint64_t start, u0, u1, i0, i1; + int fd; + + fd = open("/proc/uptime", O_RDONLY); + assert(fd >= 0); + + proc_uptime(fd, &u0, &i0); + start = u0; + do { + proc_uptime(fd, &u1, &i1); + assert(u1 >= u0); + assert(i1 >= i0); + u0 = u1; + i0 = i1; + } while (u1 - start < 100); + + return 0; +} diff --git a/tools/testing/selftests/proc/proc-uptime-002.c b/tools/testing/selftests/proc/proc-uptime-002.c new file mode 100644 index 000000000000..0cb79e1f1674 --- /dev/null +++ b/tools/testing/selftests/proc/proc-uptime-002.c @@ -0,0 +1,79 @@ +/* + * Copyright _ 2018 Alexey Dobriyan <[email protected]> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +// Test that values in /proc/uptime increment monotonically +// while shifting across CPUs. +#define _GNU_SOURCE +#undef NDEBUG +#include <assert.h> +#include <unistd.h> +#include <sys/syscall.h> +#include <stdlib.h> +#include <string.h> + +#include <stdint.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> + +#include "proc-uptime.h" + +static inline int sys_sched_getaffinity(pid_t pid, unsigned int len, unsigned long *m) +{ + return syscall(SYS_sched_getaffinity, pid, len, m); +} + +static inline int sys_sched_setaffinity(pid_t pid, unsigned int len, unsigned long *m) +{ + return syscall(SYS_sched_setaffinity, pid, len, m); +} + +int main(void) +{ + unsigned int len; + unsigned long *m; + unsigned int cpu; + uint64_t u0, u1, i0, i1; + int fd; + + /* find out "nr_cpu_ids" */ + m = NULL; + len = 0; + do { + len += sizeof(unsigned long); + free(m); + m = malloc(len); + } while (sys_sched_getaffinity(0, len, m) == -EINVAL); + + fd = open("/proc/uptime", O_RDONLY); + assert(fd >= 0); + + proc_uptime(fd, &u0, &i0); + for (cpu = 0; cpu < len * 8; cpu++) { + memset(m, 0, len); + m[cpu / (8 * sizeof(unsigned long))] |= 1UL << (cpu % (8 * sizeof(unsigned long))); + + /* CPU might not exist, ignore error */ + sys_sched_setaffinity(0, len, m); + + proc_uptime(fd, &u1, &i1); + assert(u1 >= u0); + assert(i1 >= i0); + u0 = u1; + i0 = i1; + } + + return 0; +} diff --git a/tools/testing/selftests/proc/proc-uptime.h b/tools/testing/selftests/proc/proc-uptime.h new file mode 100644 index 000000000000..d584419f50a7 --- /dev/null +++ b/tools/testing/selftests/proc/proc-uptime.h @@ -0,0 +1,74 @@ +/* + * Copyright _ 2018 Alexey 
Dobriyan <[email protected]> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#undef NDEBUG +#include <assert.h> +#include <errno.h> +#include <string.h> +#include <stdlib.h> +#include <unistd.h> + +static unsigned long long xstrtoull(const char *p, char **end) +{ + if (*p == '0') { + *end = (char *)p + 1; + return 0; + } else if ('1' <= *p && *p <= '9') { + unsigned long long val; + + errno = 0; + val = strtoull(p, end, 10); + assert(errno == 0); + return val; + } else + assert(0); +} + +static void proc_uptime(int fd, uint64_t *uptime, uint64_t *idle) +{ + uint64_t val1, val2; + char buf[64], *p; + ssize_t rv; + + /* save "p < end" checks */ + memset(buf, 0, sizeof(buf)); + rv = pread(fd, buf, sizeof(buf), 0); + assert(0 <= rv && rv <= sizeof(buf)); + buf[sizeof(buf) - 1] = '\0'; + + p = buf; + + val1 = xstrtoull(p, &p); + assert(p[0] == '.'); + assert('0' <= p[1] && p[1] <= '9'); + assert('0' <= p[2] && p[2] <= '9'); + assert(p[3] == ' '); + + val2 = (p[1] - '0') * 10 + p[2] - '0'; + *uptime = val1 * 100 + val2; + + p += 4; + + val1 = xstrtoull(p, &p); + assert(p[0] == '.'); + assert('0' <= p[1] && p[1] <= '9'); + assert('0' <= p[2] && p[2] <= '9'); + assert(p[3] == '\n'); + + val2 = (p[1] - '0') * 10 + p[2] - '0'; + *idle = val1 * 100 + val2; + + assert(p + 4 == buf + rv); +} 
diff --git a/tools/testing/selftests/proc/read.c b/tools/testing/selftests/proc/read.c new file mode 100644 index 000000000000..12e397f78592 --- /dev/null +++ b/tools/testing/selftests/proc/read.c @@ -0,0 +1,147 @@ +/* + * Copyright _ 2018 Alexey Dobriyan <[email protected]> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ +// Test +// 1) read of every file in /proc +// 2) readlink of every symlink in /proc +// 3) recursively (1) + (2) for every directory in /proc +// 4) write to /proc/*/clear_refs and /proc/*/task/*/clear_refs +// 5) write to /proc/sysrq-trigger +#undef NDEBUG +#include <assert.h> +#include <errno.h> +#include <sys/types.h> +#include <dirent.h> +#include <stdbool.h> +#include <stdlib.h> +#include <string.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> + +static inline bool streq(const char *s1, const char *s2) +{ + return strcmp(s1, s2) == 0; +} + +static struct dirent *xreaddir(DIR *d) +{ + struct dirent *de; + + errno = 0; + de = readdir(d); + if (!de && errno != 0) { + exit(1); + } + return de; +} + +static void f_reg(DIR *d, const char *filename) +{ + char buf[4096]; + int fd; + ssize_t rv; + + /* read from /proc/kmsg can block */ + fd = openat(dirfd(d), filename, O_RDONLY|O_NONBLOCK); + if (fd == -1) + return; + rv = read(fd, buf, sizeof(buf)); + assert((0 <= rv && rv <= sizeof(buf)) || rv == -1); + close(fd); +} + +static void f_reg_write(DIR *d, const char *filename, const char *buf, size_t len) +{ + int fd; + ssize_t rv; + + fd = openat(dirfd(d), filename, O_WRONLY); + if (fd == -1) + return; + rv = write(fd, buf, len); + assert((0 <= rv && rv <= len) || rv == -1); + close(fd); +} + +static void f_lnk(DIR *d, const char *filename) +{ + char buf[4096]; + ssize_t rv; + + rv = readlinkat(dirfd(d), filename, buf, sizeof(buf)); + assert((0 <= rv && rv <= sizeof(buf)) || rv == -1); +} + +static void f(DIR *d, unsigned int level) +{ + struct dirent *de; + + de = xreaddir(d); + assert(de->d_type == DT_DIR); + assert(streq(de->d_name, ".")); + + de = xreaddir(d); + assert(de->d_type == DT_DIR); + assert(streq(de->d_name, "..")); + + while ((de = xreaddir(d))) { + assert(!streq(de->d_name, ".")); + assert(!streq(de->d_name, "..")); + + switch (de->d_type) { + DIR *dd; + int fd; + + case DT_REG: + if (level == 0 && streq(de->d_name, 
"sysrq-trigger")) { + f_reg_write(d, de->d_name, "h", 1); + } else if (level == 1 && streq(de->d_name, "clear_refs")) { + f_reg_write(d, de->d_name, "1", 1); + } else if (level == 3 && streq(de->d_name, "clear_refs")) { + f_reg_write(d, de->d_name, "1", 1); + } else { + f_reg(d, de->d_name); + } + break; + case DT_DIR: + fd = openat(dirfd(d), de->d_name, O_DIRECTORY|O_RDONLY); + if (fd == -1) + continue; + dd = fdopendir(fd); + if (!dd) + continue; + f(dd, level + 1); + closedir(dd); + break; + case DT_LNK: + f_lnk(d, de->d_name); + break; + default: + assert(0); + } + } +} + +int main(void) +{ + DIR *d; + + d = opendir("/proc"); + if (!d) + return 2; + f(d, 0); + return 0; +} |